def _slice_padded(self, _bounds):
    pads = (max(-_bounds[0], 0), max(-_bounds[1], 0),
            max(_bounds[2] - self.shape[2], 0),
            max(_bounds[3] - self.shape[1], 0))
    bounds = (max(_bounds[0], 0),
              max(_bounds[1], 0),
              max(min(_bounds[2], self.shape[2]), 0),
              max(min(_bounds[3], self.shape[1]), 0))
    result = self[:, bounds[1]:bounds[3], bounds[0]:bounds[2]]
    if pads[0] > 0:
        dims = (result.shape[0], result.shape[1], pads[0])
        result = da.concatenate([da.zeros(dims, chunks=dims, dtype=result.dtype),
                                 result], axis=2)
    if pads[2] > 0:
        dims = (result.shape[0], result.shape[1], pads[2])
        result = da.concatenate([result,
                                 da.zeros(dims, chunks=dims, dtype=result.dtype)], axis=2)
    if pads[1] > 0:
        dims = (result.shape[0], pads[1], result.shape[2])
        result = da.concatenate([da.zeros(dims, chunks=dims, dtype=result.dtype),
                                 result], axis=1)
    if pads[3] > 0:
        dims = (result.shape[0], pads[3], result.shape[2])
        result = da.concatenate([result,
                                 da.zeros(dims, chunks=dims, dtype=result.dtype)], axis=1)
    return (result, _bounds[0], _bounds[1])
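# The method above zero-fills any part of the requested window that falls
# outside the array by concatenating zero blocks onto the sliced region.
# A minimal standalone sketch of that pattern (the array and the 2-column
# left pad here are invented for illustration):
import dask.array as da

img = da.ones((1, 4, 4), chunks=(1, 4, 4))  # hypothetical 1-band 4x4 raster
pad = da.zeros((1, 4, 2), chunks=(1, 4, 2), dtype=img.dtype)
padded = da.concatenate([pad, img], axis=2)  # zero-fill out-of-bounds columns
assert padded.shape == (1, 4, 6)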
def test_mixed_concatenate(func):
    x = da.random.random((2, 3, 4), chunks=(1, 2, 2))

    y = da.random.random((2, 3, 4), chunks=(1, 2, 2))
    y[y < 0.4] = 0
    yy = da.ma.masked_equal(y, 0)

    d = da.concatenate([x, y], axis=0)
    s = da.concatenate([x, yy], axis=0)

    dd = func(d)
    ss = func(s)
    assert_eq(dd, ss)
def test_mixed_concatenate(func):
    x = da.random.random((2, 3, 4), chunks=(1, 2, 2))

    y = da.random.random((2, 3, 4), chunks=(1, 2, 2))
    y[y < 0.8] = 0
    yy = y.map_blocks(sparse.COO.from_numpy)

    d = da.concatenate([x, y], axis=0)
    s = da.concatenate([x, yy], axis=0)

    dd = func(d)
    ss = func(s)
    assert_eq(dd, ss)
def est_sh_part(varr, max_sh, npart, local):
    if varr.shape[0] <= 1:
        return varr.squeeze(), np.array([[0, 0]])
    idx_spt = np.array_split(np.arange(varr.shape[0]), npart)
    fm_ls, sh_ls = [], []
    for idx in idx_spt:
        if len(idx) > 0:
            fm, sh = est_sh_part(varr[idx, :, :], max_sh, npart, local)
            fm_ls.append(fm)
            sh_ls.append(sh)
    mid = int(len(sh_ls) / 2)
    sh_add_ls = [np.array([0, 0])] * len(sh_ls)
    for i, fm in enumerate(fm_ls):
        if i < mid:
            temp = fm_ls[i + 1]
            sh_idx = np.arange(i + 1)
        elif i > mid:
            temp = fm_ls[i - 1]
            sh_idx = np.arange(i, len(sh_ls))
        else:
            continue
        sh_add = darr.from_delayed(
            delayed(match_temp)(fm, temp, max_sh, local), (2,), float
        )
        for j in sh_idx:
            sh_ls[j] = sh_ls[j] + sh_add.reshape((1, -1))
            sh_add_ls[j] = sh_add_ls[j] + sh_add
    for i, (fm, sh) in enumerate(zip(fm_ls, sh_add_ls)):
        fm_ls[i] = darr.nan_to_num(
            darr.from_delayed(delayed(shift_perframe)(fm, sh), fm.shape, fm.dtype)
        )
    sh_ret = darr.concatenate(sh_ls)
    fm_ret = darr.stack(fm_ls)
    return fm_ret.max(axis=0), sh_ret
def euclidean(XA, XB):
    """Returns the distance between points using Euclidean distance (2-norm)
    as the distance metric between the points.

    Find the Euclidean distances between four 2-D coordinates:
    >>> coords = [(35.0456, -85.2672),
    ...           (35.1174, -89.9711),
    ...           (35.9728, -83.9422),
    ...           (36.1667, -86.7833)]
    >>> euclidean(coords, coords)
    array([[ 0.    ,  4.7044,  1.6172,  1.8856],
           [ 4.7044,  0.    ,  6.0893,  3.3561],
           [ 1.6172,  6.0893,  0.    ,  2.8477],
           [ 1.8856,  3.3561,  2.8477,  0.    ]])
    """
    mA = (XA.shape)[0]
    mB = (XB.shape)[0]
    distances = []
    for i in xrange(0, mA):
        dm = np.zeros(shape=(1, mB), dtype=np.double)
        for j in xrange(0, mB):
            XA_XB = XA[i, :] - XB[j, :]
            dm[0, j] = da.sqrt(da.dot(XA_XB, XA_XB))
        distances.append(
            da.from_array(dm, chunks=(mA + mB) / multiprocessing.cpu_count()))
    return da.concatenate(distances, axis=0)
def f_oneway(*args):
    # args = [np.asarray(arg, dtype=float) for arg in args]
    # ANOVA on N groups, each in its own array
    num_groups = len(args)
    alldata = da.concatenate(args)
    bign = len(alldata)

    # Determine the mean of the data, and subtract that from all inputs to a
    # variance (via sum_of_sq / sq_of_sum) calculation. Variance is invariant
    # to a shift in location, and centering all data around zero vastly
    # improves numerical stability.
    offset = alldata.mean()
    alldata -= offset

    sstot = _sum_of_squares(alldata) - (_square_of_sums(alldata) / float(bign))
    ssbn = 0
    for a in args:
        ssbn += _square_of_sums(a - offset) / float(len(a))

    # Naming: variables ending in bn/b are for "between treatments", wn/w are
    # for "within treatments"
    ssbn -= (_square_of_sums(alldata) / float(bign))
    sswn = sstot - ssbn
    dfbn = num_groups - 1
    dfwn = bign - num_groups
    msb = ssbn / float(dfbn)
    msw = sswn / float(dfwn)
    f = msb / msw
    prob = _fdtrc(dfbn, dfwn, f)  # equivalent to stats.f.sf

    return delayed(F_onewayResult, nout=2)(f, prob)
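# `f_oneway` leans on helpers that are not shown in this snippet
# (`_sum_of_squares`, `_square_of_sums`, `_fdtrc`, `F_onewayResult`).
# Assuming the first two mirror their scipy.stats namesakes, minimal
# dask-friendly sketches would look like this:
import dask.array as da

def _sum_of_squares(a, axis=0):
    # sum of a**2 along an axis, as in scipy.stats._sum_of_squares
    return da.sum(a * a, axis=axis)

def _square_of_sums(a, axis=0):
    # square of the sum along an axis, as in scipy.stats._square_of_sums
    s = da.sum(a, axis=axis)
    return s * s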
def get_lazy_well() -> da.Array:
    lazy_rows = []
    # For level 0, return whole image for each tile
    for row in range(row_count):
        lazy_row: List[da.Array] = []
        for col in range(column_count):
            tile_name = f"{row},{col}"
            LOGGER.debug(f"creating lazy_reader. row:{row} col:{col}")
            lazy_tile = da.from_delayed(
                lazy_reader(tile_name),
                shape=self.img_shape,
                dtype=self.numpy_type,
            )
            lazy_row.append(lazy_tile)
        lazy_rows.append(da.concatenate(lazy_row, axis=4))
    return da.concatenate(lazy_rows, axis=3)
def get_array(self, key):
    """Get all data from file for the given BUFR key."""
    with open(self.filename, "rb") as fh:
        msgCount = 0
        while True:
            bufr = ec.codes_bufr_new_from_file(fh)
            if bufr is None:
                break
            ec.codes_set(bufr, 'unpack', 1)
            # if it is the first message, initialise our final array
            if (msgCount == 0):
                arr = da.from_array(ec.codes_get_array(bufr, key, float),
                                    chunks=CHUNK_SIZE)
            else:
                tmpArr = da.from_array(ec.codes_get_array(bufr, key, float),
                                       chunks=CHUNK_SIZE)
                arr = da.concatenate((arr, tmpArr))
            msgCount = msgCount + 1
            ec.codes_release(bufr)
    if arr.size == 1:
        arr = arr[0]
    return arr
def test_bag_array_conversion():
    import dask.bag as db

    b = db.range(10, npartitions=1)
    x, = b.map_partitions(np.asarray).to_delayed()
    x, = [da.from_delayed(a, shape=(10,), dtype=int) for a in [x]]
    z = da.concatenate([x])
    assert_eq(z, np.arange(10), check_graph=False)
def test_cupy_sparse_concatenate(axis):
    pytest.importorskip("cupyx")

    rs = da.random.RandomState(RandomState=cupy.random.RandomState)
    meta = cupyx.scipy.sparse.csr_matrix((0, 0))

    xs = []
    ys = []
    for i in range(2):
        x = rs.random((1000, 10), chunks=(100, 10))
        x[x < 0.9] = 0
        xs.append(x)
        ys.append(x.map_blocks(cupyx.scipy.sparse.csr_matrix, meta=meta))

    z = da.concatenate(ys, axis=axis)
    z = z.compute()

    if axis == 0:
        sp_concatenate = cupyx.scipy.sparse.vstack
    elif axis == 1:
        sp_concatenate = cupyx.scipy.sparse.hstack
    z_expected = sp_concatenate(
        [cupyx.scipy.sparse.csr_matrix(e.compute()) for e in xs])

    assert (z.toarray() == z_expected.toarray()).all()
def transform(self, raw_X):
    msg = "'X' should be a 1-dimensional array with length 'num_samples'."

    if not dask.is_dask_collection(raw_X):
        return self._hasher(**self.get_params()).transform(raw_X)

    if isinstance(raw_X, db.Bag):
        bag2 = raw_X.map_partitions(self._transformer)
        objs = bag2.to_delayed()
        arrs = [
            da.from_delayed(obj, (np.nan, self.n_features), self.dtype)
            for obj in objs
        ]
        result = da.concatenate(arrs, axis=0)
    elif isinstance(raw_X, dd.Series):
        result = raw_X.map_partitions(self._transformer)
    elif isinstance(raw_X, da.Array):
        # dask.Array
        chunks = ((np.nan,) * raw_X.numblocks[0], (self.n_features,))
        if raw_X.ndim == 1:
            result = raw_X.map_blocks(self._transformer, dtype="f8",
                                      chunks=chunks, new_axis=1)
        else:
            raise ValueError(msg)
    else:
        raise ValueError(msg)

    meta = scipy.sparse.eye(0, format="csr")
    result._meta = meta
    return result
def compute_center_of_geometry(traj):
    """Daskified version of mdtraj.geometry.compute_center_of_geometry

    This mimics py:method:`mdtraj.compute_center_of_geometry()` but returns
    the answer as a py:class:`dask.array` object

    Parameters
    ----------
    traj : :py:class:`dask_traj.Trajectory`
        The trajectory to compute the angles for.

    Returns
    -------
    com : dask.array, shape(n_frames, 3)
        Dask array with the delayed calculated Coordinates of center of
        geometry for each frame.
    """
    xyz = traj.xyz
    length = len(xyz)
    lazy_results = []
    current_frame = 0
    for frames in xyz.chunks[0]:
        chunk_size = (frames, 3)
        next_frame = current_frame + frames
        lazy_results.append(
            wrap_da(
                f=_compute_center_of_geometry_chunk,
                chunk_size=chunk_size,
                xyz=xyz[current_frame:next_frame],
            ))
        current_frame = next_frame
    max_result = da.concatenate(lazy_results)
    results = max_result[:length]
    return results
def _project(self, X_dask):
    """Compute hidden layer output with Dask functionality."""
    H_list = []
    for hl, W in zip(self.hidden_layers_, self.W_):
        if hl.hidden_layer_ == HiddenLayerType.PAIRWISE:
            H0 = X_dask.map_blocks(pairwise_distances, W,
                                   dtype=X_dask.dtype,
                                   chunks=(X_dask.chunks[0], (W.shape[0],)),
                                   metric=hl.pairwise_metric)
        else:
            XW_dask = da.dot(X_dask, W.transpose())
            if hl.ufunc_ is dummy:
                H0 = XW_dask
            elif hl.ufunc_ is np.tanh:
                H0 = da.tanh(XW_dask)
            else:
                H0 = XW_dask.map_blocks(hl.ufunc_)
        H_list.append(H0)

    if self.include_original_features:
        H_list.append(X_dask)
    H_list.append(da.ones((X_dask.shape[0], 1)))

    H_dask = da.concatenate(H_list, axis=1).rechunk(self.bsize_)
    return H_dask
def transform(self, X):
    """Transform a sequence of documents to a document-term matrix.

    Transformation is done in parallel, and correctly handles dask
    collections.

    Parameters
    ----------
    X : dask.Bag of raw text documents, length = n_samples
        Samples. Each sample must be a text document (either bytes or
        unicode strings, file name or file object depending on the
        constructor argument) which will be tokenized and hashed.

    Returns
    -------
    X : dask.array.Array, shape = (n_samples, self.n_features)
        Document-term matrix. Each block of the array is a scipy sparse
        matrix.

    Notes
    -----
    The returned dask Array is composed of scipy sparse matrices. If you
    need to compute on the result immediately, you may need to convert the
    individual blocks to ndarrays or pydata/sparse matrices.

    >>> import sparse
    >>> X.map_blocks(sparse.COO.from_scipy_sparse)  # doctest: +SKIP

    See the :doc:`examples/text-vectorization` for more.
    """
    transformer = super(HashingVectorizer, self).transform
    msg = "'X' should be a 1-dimensional array with length 'num_samples'."

    if not dask.is_dask_collection(X):
        return transformer(X)

    if isinstance(X, db.Bag):
        bag2 = X.map_partitions(transformer)
        objs = bag2.to_delayed()
        arrs = [
            da.from_delayed(obj, (np.nan, self.n_features), self.dtype)
            for obj in objs
        ]
        result = da.concatenate(arrs, axis=0)
    elif isinstance(X, dd.Series):
        result = X.map_partitions(transformer)
    elif isinstance(X, da.Array):
        # dask.Array
        chunks = ((np.nan,) * X.numblocks[0], (self.n_features,))
        if X.ndim == 1:
            result = X.map_blocks(
                transformer, dtype="f8", chunks=chunks, new_axis=1
            )
        else:
            raise ValueError(msg)
    else:
        raise ValueError(msg)

    return result
def test_taql_where(ms, index_cols):
    # three cases are tested here, corresponding to the
    # if-elif-else ladder in xds_from_table

    # No group_cols case
    xds = xds_from_table(ms,
                         taql_where="FIELD_ID >= 0 AND FIELD_ID < 2",
                         columns=["FIELD_ID"])

    assert len(xds) == 1
    assert_array_equal(xds[0].FIELD_ID.data, [0, 0, 0, 1, 1, 1, 1])

    # Group columns case
    xds = xds_from_table(ms,
                         taql_where="FIELD_ID >= 0 AND FIELD_ID < 2",
                         group_cols=["DATA_DESC_ID", "SCAN_NUMBER"],
                         columns=["FIELD_ID"])

    assert len(xds) == 2

    # Check group id's
    assert xds[0].DATA_DESC_ID == 0 and xds[0].SCAN_NUMBER == 0
    assert xds[1].DATA_DESC_ID == 0 and xds[1].SCAN_NUMBER == 1

    # Check field id's in each group
    fields = da.concatenate([ds.FIELD_ID.data for ds in xds])
    assert_array_equal(fields, [0, 0, 1, 1, 0, 1, 1])

    # Group columns case
    xds = xds_from_table(ms,
                         taql_where="FIELD_ID >= 0 AND FIELD_ID < 2",
                         group_cols=["DATA_DESC_ID", "FIELD_ID"],
                         columns=["FIELD_ID"])

    assert len(xds) == 2

    # Check group id's, no DATA_DESC_ID == 1 because it only
    # contains FIELD_ID == 2
    assert xds[0].DATA_DESC_ID == 0 and xds[0].FIELD_ID == 0
    assert xds[1].DATA_DESC_ID == 0 and xds[1].FIELD_ID == 1

    # Group on each row
    xds = xds_from_table(ms,
                         taql_where="FIELD_ID >= 0 AND FIELD_ID < 2",
                         group_cols=["__row__"],
                         columns=["FIELD_ID"])

    assert len(xds) == 7

    fields = da.concatenate([ds.FIELD_ID.data for ds in xds])
    assert_array_equal(fields, [0, 0, 0, 1, 1, 1, 1])
def __init__(self, parent_ds):
    """
    The plotter is constructed from a parent data set. From the parent's
    `XC` and `YC` variables, it constructs its own coordinates XC and YC,
    and an internal xarray Dataset object based on these coordinates.
    """
    if not isinstance(parent_ds, xr.Dataset):
        raise TypeError(
            'LLC_plotter must be constructed from an xarray dataset')
    self.parent = parent_ds
    XC = concatenate([parent_ds.XC[i, :, :].data
                      for i in range(parent_ds.XC.shape[0])])
    YC = concatenate([parent_ds.YC[i, :, :].data
                      for i in range(parent_ds.YC.shape[0])])
    XG = concatenate([parent_ds.XG[i, :, :].data
                      for i in range(parent_ds.XG.shape[0])])
    YG = concatenate([parent_ds.YG[i, :, :].data
                      for i in range(parent_ds.YG.shape[0])])
    # Important assumption - this certainly *should* be the case for any
    # sane data set.
    assert XC.shape == YC.shape and XG.shape == YG.shape
    assert XC.shape == XG.shape
    jdim, idim = XC.shape
    i = xr.DataArray(np.arange(idim), coords=[('i', np.arange(idim))])
    j = xr.DataArray(np.arange(jdim), coords=[('j', np.arange(jdim))])
    i_g = xr.DataArray(np.arange(idim), coords=[('i_g', np.arange(idim))])
    j_g = xr.DataArray(np.arange(jdim), coords=[('j_g', np.arange(jdim))])
    XC = xr.DataArray(XC, coords=[('j', j), ('i', i)])
    YC = xr.DataArray(YC, coords=[('j', j), ('i', i)])
    XG = xr.DataArray(XG, coords=[('j_g', j_g), ('i_g', i_g)])
    YG = xr.DataArray(YG, coords=[('j_g', j_g), ('i_g', i_g)])
    self.ds = xr.Dataset(
        coords={
            'i': i,
            'j': j,
            'i_g': i_g,
            'j_g': j_g,
            'XC': XC,
            'XG': XG,
            'YC': YC,
            'YG': YG
        })
def _split(self, test_start, test_stop, n_samples, chunks, seeds):
    train_objs = []
    test_objs = []
    train_sizes = []
    test_sizes = []
    offset = 0
    for chunk, seed in zip(chunks, seeds):
        start, stop = offset, offset + chunk

        test_id_start = max(test_start, start)
        test_id_stop = min(test_stop, stop)
        if test_id_start < test_id_stop:
            test_objs.append(
                dask.delayed(_generate_offset_idx)(chunk, test_id_start,
                                                   test_id_stop, offset, seed))
            test_sizes.append(test_id_stop - test_id_start)

        train_id_stop = min(test_id_start, stop)
        if train_id_stop > start:
            train_objs.append(
                dask.delayed(_generate_offset_idx)(chunk, start,
                                                   train_id_stop, offset, seed))
            train_sizes.append(train_id_stop - start)

        train_id_start = max(test_id_stop, start)
        if train_id_start < stop:
            train_objs.append(
                dask.delayed(_generate_offset_idx)(chunk, train_id_start,
                                                   stop, offset, seed))
            train_sizes.append(stop - train_id_start)

        offset = stop

    train_idx = da.concatenate([
        da.from_delayed(obj, (train_size,), np.dtype("int"))
        for obj, train_size in zip(train_objs, train_sizes)
    ])
    test_idx = da.concatenate([
        da.from_delayed(obj, (test_size,), np.dtype("int"))
        for obj, test_size in zip(test_objs, test_sizes)
    ])
    return train_idx, test_idx
def ConcatenateSources(*sources, **kwargs):
    """
    Concatenate CatalogSource objects together, optionally including only
    certain columns in the returned source.

    .. note::
        The returned catalog object carries the meta-data from only
        the first catalog supplied to this function (in the ``attrs`` dict).

    Parameters
    ----------
    *sources : subclass of :class:`~nbodykit.base.catalog.CatalogSource`
        the catalog source objects to concatenate together
    columns : str, list of str, optional
        the columns to include in the concatenated catalog

    Returns
    -------
    CatalogSource :
        the concatenated catalog source object

    Examples
    --------
    >>> from nbodykit.lab import *
    >>> source1 = UniformCatalog(nbar=100, BoxSize=1.0)
    >>> source2 = UniformCatalog(nbar=100, BoxSize=1.0)
    >>> print(source1.csize, source2.csize)
    >>> combined = transform.ConcatenateSources(source1, source2, columns=['Position', 'Velocity'])
    >>> print(combined.csize)
    """
    from nbodykit.base.catalog import CatalogSource

    columns = kwargs.get('columns', None)
    if isinstance(columns, string_types):
        columns = [columns]

    # concatenate all columns, if none provided
    if columns is None or columns == []:
        columns = sources[0].columns

    # check comms
    if not all(src.comm == sources[0].comm for src in sources):
        raise ValueError("cannot concatenate sources: comm mismatch")

    # check all columns are there
    for source in sources:
        if not all(col in source for col in columns):
            raise ValueError(("cannot concatenate sources: columns are missing "
                              "from some sources"))

    # the total size
    size = numpy.sum([src.size for src in sources], dtype='intp')

    data = {}
    for col in columns:
        data[col] = da.concatenate([src[col] for src in sources], axis=0)

    toret = CatalogSource._from_columns(size, sources[0].comm, **data)
    toret.attrs.update(sources[0].attrs)
    return toret
def ConcatenateSources(*sources, **kwargs):
    """
    Concatenate CatalogSource objects together, optionally including only
    certain columns in the returned source.

    .. note::
        The returned catalog object carries the meta-data from only
        the first catalog supplied to this function (in the ``attrs`` dict).

    Parameters
    ----------
    *sources : subclass of :class:`~nbodykit.base.catalog.CatalogSource`
        the catalog source objects to concatenate together
    columns : str, list of str, optional
        the columns to include in the concatenated catalog

    Returns
    -------
    CatalogSource :
        the concatenated catalog source object

    Examples
    --------
    >>> from nbodykit.lab import *
    >>> source1 = UniformCatalog(nbar=100, BoxSize=1.0)
    >>> source2 = UniformCatalog(nbar=100, BoxSize=1.0)
    >>> print(source1.csize, source2.csize)
    >>> combined = transform.ConcatenateSources(source1, source2, columns=['Position', 'Velocity'])
    >>> print(combined.csize)
    """
    from nbodykit.base.catalog import CatalogSource

    columns = kwargs.get('columns', None)
    if isinstance(columns, string_types):
        columns = [columns]

    # concatenate all columns, if none provided
    if columns is None or columns == []:
        columns = sources[0].columns

    # check comms
    if not all(src.comm == sources[0].comm for src in sources):
        raise ValueError("cannot concatenate sources: comm mismatch")

    # check all columns are there
    for source in sources:
        if not all(col in source for col in columns):
            raise ValueError(("cannot concatenate sources: columns are missing "
                              "from some sources"))

    # the total size
    size = sum(src.size for src in sources)

    data = {}
    for col in columns:
        data[col] = da.concatenate([src[col] for src in sources], axis=0)

    toret = CatalogSource._from_columns(size, sources[0].comm, **data)
    toret.attrs.update(sources[0].attrs)
    return toret
def _expand_tiepoint_array_1km(self, arr, lines, cols):
    arr = da.repeat(arr, lines, axis=1)
    arr = da.concatenate(
        (arr[:, :lines // 2, :], arr, arr[:, -(lines // 2):, :]), axis=1)
    arr = da.repeat(arr.reshape((-1, self.cscan_full_width - 1)), cols, axis=1)
    return da.hstack((arr, arr[:, -cols:]))
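# Shape-wise, the method above expands a tie-point grid to full resolution by
# repeating rows and then padding half-window copies of the edge rows before
# the column expansion. A toy sketch of that repeat-then-pad-edges step, with
# invented sizes:
import dask.array as da

arr = da.arange(2 * 2 * 3).reshape((2, 2, 3)).rechunk((1, 2, 3))
lines = 4
arr2 = da.repeat(arr, lines, axis=1)  # (2, 8, 3)
arr2 = da.concatenate(
    (arr2[:, :lines // 2, :], arr2, arr2[:, -(lines // 2):, :]), axis=1)
assert arr2.shape == (2, 12, 3)  # 2 + 8 + 2 rows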
def _rmatvec(self, x):
    y = []
    for iop, oper in enumerate(self.ops):
        y.append(
            oper.rmatvec(x[self.nnops[iop]:self.nnops[iop + 1]]).squeeze())
    y = da.concatenate(y)
    y = y.rechunk(self.chunks[0])
    return y
def from_iterator(  # type: ignore[override]
    cls,
    name: str,
    iterator: Iterable[Tuple[str, str]],
    batch_size: int = 64,
    overwrite: bool = False,
) -> LabeledDataset:
    #
    # An alternative implementation can use itertool.tee + threading/async
    # https://stackoverflow.com/questions/50039223/how-to-execute-two-aggregate-functions-like-sum-concurrently-feeding-them-f
    # https://github.com/omnilib/aioitertools
    #
    # (Label, Doc)
    dataset = cls(name, overwrite=overwrite)
    data_path = common.PROJDIR / name / (name + ".raw.zarr.zip")
    label_path = common.PROJDIR / name / (name + ".label.raw.zarr.zip")
    labels, docs = unzip(iterator)

    if data_path.is_file() and (not overwrite):
        raise RuntimeError("File already exists")
    else:
        dataset.data["raw"] = Raw.from_dask_array(
            data_path,
            da.concatenate([
                da.from_array(np.array(chunk))
                for chunk in chunked(docs, batch_size)
            ]),
            overwrite=overwrite,
        )

    if label_path.is_file() and (not overwrite):
        raise RuntimeError("File already exists")
    else:
        dataset.labels["raw"] = Raw.from_dask_array(
            label_path,
            da.concatenate([
                da.from_array(np.array(chunk))
                for chunk in chunked(labels, batch_size)
            ]),
            overwrite=overwrite,
        )

    dataset.save()
    return dataset
def svs2dask_array(svs_file, tile_size=1000, overlap=0, remove_last=True,
                   allow_unknown_chunksizes=False, transpose=False):
    """Convert SVS, TIF or TIFF to dask array.

    Parameters
    ----------
    svs_file : str
        Image file.
    tile_size : int
        Size of chunk to be read in.
    overlap : int
        Do not modify, overlap between neighboring tiles.
    remove_last : bool
        Remove last tile because it has a custom size.
    allow_unknown_chunksizes : bool
        Allow different chunk sizes, more flexible, but slowdown.

    Returns
    -------
    arr : dask.array.Array
        A Dask Array representing the contents of the image file.

    >>> arr = svs2dask_array(svs_file, tile_size=1000, overlap=0,
    ...                      remove_last=True, allow_unknown_chunksizes=False)
    >>> arr2 = arr.compute()
    >>> arr3 = to_pil(cv2.resize(arr2, dsize=(1440, 700),
    ...                          interpolation=cv2.INTER_CUBIC))
    >>> arr3.save(test_image_name)
    """
    # https://github.com/jlevy44/PathFlowAI/blob/master/pathflowai/utils.py
    img = openslide.open_slide(svs_file)
    if type(img) is openslide.OpenSlide:
        gen = deepzoom.DeepZoomGenerator(
            img, tile_size=tile_size, overlap=overlap, limit_bounds=True)
        max_level = len(gen.level_dimensions) - 1
        n_tiles_x, n_tiles_y = gen.level_tiles[max_level]

        @dask.delayed(pure=True)
        def get_tile(level, column, row):
            tile = gen.get_tile(level, (column, row))
            return np.array(tile).transpose((1, 0, 2))

        sample_tile_shape = get_tile(max_level, 0, 0).shape.compute()
        rows = range(n_tiles_y - (0 if not remove_last else 1))
        cols = range(n_tiles_x - (0 if not remove_last else 1))

        arr = da.concatenate(
            [da.concatenate(
                [da.from_delayed(get_tile(max_level, col, row),
                                 sample_tile_shape, np.uint8)
                 for row in rows],
                allow_unknown_chunksizes=allow_unknown_chunksizes, axis=1)
             for col in cols],
            allow_unknown_chunksizes=allow_unknown_chunksizes)

        if transpose:
            arr = arr.transpose([1, 0, 2])

        return arr
    else:  # img is instance of openslide.ImageSlide
        return dask_image.imread.imread(svs_file)
def make_da(delayed_list, length):
    sample = delayed_list[0].compute()
    arrays = [
        da.from_delayed(item, dtype=sample.dtype, shape=sample.shape)
        for item in delayed_list
    ]
    result = da.concatenate(arrays, axis=0)[:length]
    return result
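# A quick usage sketch of `make_da`; the delayed loader below is a stand-in
# for whatever produces the items in `delayed_list` (all chunks must share
# one shape and dtype, since only the first is computed for metadata):
import numpy as np
import dask
import dask.array as da

@dask.delayed
def load_chunk(i):
    return np.arange(4) + 4 * i  # pretend this reads a chunk from disk

arr = make_da([load_chunk(i) for i in range(3)], length=10)
assert arr.compute().tolist() == list(range(10))  # 12 values trimmed to 10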
def as_known(X, lengths):
    blocks = X.to_delayed().flatten()
    P = X.shape[1]
    arrays = [
        da.from_delayed(x, dtype=X.dtype, shape=(length, P))
        for x, length in zip(blocks, lengths)
    ]
    return da.concatenate(arrays, axis=0)
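# `as_known` rebuilds an array whose row chunks are unknown (NaN) into one
# with explicit chunk lengths. A usage sketch, assuming a dask DataFrame as
# the typical source of unknown-length chunks:
import pandas as pd
import dask.dataframe as dd

ddf = dd.from_pandas(pd.DataFrame({"a": range(6), "b": range(6)}),
                     npartitions=2)
X = ddf.to_dask_array()           # row chunks are (nan, nan)
X2 = as_known(X, lengths=(3, 3))  # rebuild with explicit row counts
assert X2.chunks == ((3, 3), (2,))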
def compute(self, data, cache_id=None, rows_per_scan=None, chunks=None,
            fill_value=None, weight_count=10000, weight_min=0.01,
            weight_distance_max=1.0, weight_delta_max=1.0,
            weight_sum_min=-1.0, maximum_weight_mode=None, **kwargs):
    """Resample the data according to the precomputed X/Y coordinates."""
    # not used in this step
    kwargs.pop("persist", None)
    data_in, xr_obj = self._get_input_tuples(data)
    rows_per_scan = self._get_rows_per_scan(rows_per_scan)
    data_in = tuple(self._convert_to_dask(data_in, rows_per_scan))
    out_chunks = normalize_chunks(chunks or 'auto',
                                  shape=self.target_geo_def.shape,
                                  dtype=data.dtype)
    fornav_kwargs = kwargs.copy()
    maximum_weight_mode = self._handle_mwm(data, maximum_weight_mode)
    fornav_kwargs.update(dict(
        weight_count=weight_count,
        weight_min=weight_min,
        weight_distance_max=weight_distance_max,
        weight_delta_max=weight_delta_max,
        weight_sum_min=weight_sum_min,
        maximum_weight_mode=maximum_weight_mode,
        rows_per_scan=rows_per_scan,
    ))

    # determine a fill value if they didn't tell us what they have as a
    # fill value in the numpy arrays
    if fill_value is None:
        fill_value = self._get_default_fill(data_in[0])

    data_out = []
    for data_subarr in data_in:
        res = self._run_fornav_single(data_subarr, out_chunks,
                                      self.target_geo_def, fill_value,
                                      **fornav_kwargs)
        data_out.append(res)
    if data.ndim == 2:
        out = data_out[0]
    else:
        out = da.concatenate([arr[None, ...] for arr in data_out], axis=0)

    if xr_obj is not None:
        dims = [d for d in xr_obj.dims if d not in ('y', 'x')] + ['y', 'x']
        out = xr.DataArray(out, attrs=xr_obj.attrs.copy(), dims=dims)
        out = update_resampled_coords(xr_obj, out, self.target_geo_def)
    if isinstance(data, np.ndarray):
        return out.compute()
    return out
def work(self):
    import dask.array as da
    import numpy as np
    import h5py
    from luigi.file import atomic_file

    fs = [h5py.File(f.path, mode='r') for f in self.input()]

    # Verify all H5s have the same structure
    datasets, groups, samples = ([[] for x in fs], [[] for x in fs],
                                 [[] for x in fs])
    for i, f in enumerate(fs):
        f.visititems(lambda n, o: datasets[i].append(n)
                     if isinstance(o, h5py.Dataset) else groups[i].append(n))
        samples[i] = f['samples'][:]
    if not all([set(datasets[0]) == set(x) for x in datasets]) and np.all(
            samples == samples[0], axis=0):
        raise Exception(
            "All HDF5 files must have the same groups/datasets/samples!")
    datasets, groups, samples = datasets[0], groups[0], samples[0]

    # Drop Samples dataset and handle separately
    datasets = [x for x in datasets if x != 'samples']
    combined = {
        d: da.concatenate([da.from_array(f[d], chunks=100000) for f in fs])
        for d in datasets
    }
    shapes = [(np.sum([f.get(d).shape for f in fs], axis=0)[0],
               *fs[0].get(d).shape[1:]) for d in datasets]
    dtypes = [fs[0].get(d).dtype for d in datasets]

    # Handles Samples dataset
    datasets.append('samples')
    combined.update({'samples': da.from_array(fs[0]['samples'], chunks=1)})
    shapes.append(samples.shape)
    dtypes.append(samples.dtype)

    af = atomic_file(self.output().path)
    fout = h5py.File(af.tmp_path, 'w')

    # Set up group structure
    for g in groups:
        fout.create_group(g)

    # Create the datasets
    out_datasets = {}
    for p, dtype, shape in zip(datasets, dtypes, shapes):
        g, d = os.path.split(p)
        out_datasets[p] = (fout[g] if g else fout).create_dataset(
            d, shape=shape, dtype=dtype, chunks=True, compression='gzip')

    for k in combined.keys():
        s = da.store(combined[k], out_datasets[k], compute=False)
        s.compute(num_workers=self.n_cpu)
        print("Done " + k)
    af.move_to_final_destination()
def missing_spectrum(  # pylint: disable=too-many-locals
    df: DataArray, bins: int
) -> Dict[str, da.Array]:
    """Calculate a missing spectrum for each column."""
    nrows, ncols = df.shape
    data = df.nulls

    if nrows > 1:
        num_bins = min(bins, nrows - 1)
        bin_size = nrows // num_bins
        chunk_size = min(1024 * 1024 * 128,
                         nrows * ncols)  # max 1024 x 1024 x 128 Bytes bool values
        nbins_per_chunk = max(chunk_size // (bin_size * data.shape[1]), 1)
        chunk_size = nbins_per_chunk * bin_size
        data = data.rechunk((chunk_size, None))
        sep = nrows // chunk_size * chunk_size
    else:
        # avoid division or modulo by zero
        bin_size = 1
        nbins_per_chunk = 1
        chunk_size = 1
        data = data.rechunk((chunk_size, None))
        sep = 1

    spectrum_missing_percs = data[:sep].map_blocks(
        missing_perc_blockwise(bin_size),
        chunks=(nbins_per_chunk, *data.chunksize[1:]),
        dtype=float,
    )

    # calculation for the last chunk
    if sep != nrows:
        spectrum_missing_percs_remain = data[sep:].map_blocks(
            missing_perc_blockwise(bin_size),
            chunks=(int(np.ceil((nrows - sep) / bin_size)), *data.shape[1:]),
            dtype=float,
        )
        spectrum_missing_percs = da.concatenate(
            [spectrum_missing_percs, spectrum_missing_percs_remain], axis=0)

    num_bins = spectrum_missing_percs.shape[0]

    locs0 = da.arange(num_bins) * bin_size
    locs1 = da.minimum(locs0 + bin_size, nrows)
    locs_middle = locs0 + bin_size / 2

    return {
        "column": da.repeat(da.from_array(df.columns.values, (1,)), num_bins),
        "location": da.tile(locs_middle, ncols),
        "missing_rate":
            spectrum_missing_percs.T.ravel().rechunk(locs_middle.shape[0]),
        "loc_start": da.tile(locs0, ncols),
        "loc_end": da.tile(locs1, ncols),
    }
def as_stitched_array(self, channel_index=0, channel_name=None, t_index=0,
                      verbose=True):
    if channel_name is not None:
        channel_index = self._channel_name_to_index(channel_name)
    z_list = []
    for z in self.z_indices:
        # this doesn't work with explore acquisitions and would need to be
        # updated
        rows, cols = self.get_num_rows_and_cols()
        empty_tile = np.zeros((self.image_height, self.image_width),
                              self.pixel_type)
        row_list = []
        for row in range(rows):
            if verbose:
                print('stitching row {} of {}'.format(row + 1, rows))
            col_list = []
            for col in range(cols):
                pos_index_array = np.nonzero(
                    np.logical_and(self.row_col_array[:, 0] == row,
                                   self.row_col_array[:, 1] == col))[0]
                pos_index = (None if pos_index_array.size == 0
                             else pos_index_array[0])
                if pos_index is not None and self.has_image(
                        channel_index=channel_index, z_index=z,
                        t_index=t_index, pos_index=pos_index):
                    img = self.read_image(channel_index=channel_index,
                                          z_index=z, t_index=t_index,
                                          pos_index=pos_index,
                                          memmapped=True)
                else:
                    img = empty_tile
                # crop to center of tile
                col_list.append(
                    img[self.overlap[0] // 2:-self.overlap[0] // 2,
                        self.overlap[1] // 2:-self.overlap[1] // 2])
            stitched_col = da.concatenate(col_list, axis=1)
            row_list.append(stitched_col)
        stitched = da.concatenate(row_list, axis=0)
        z_list.append(stitched)
    return da.stack(z_list)
def _create_ranking_data(n_samples=100, output='array', chunk_size=50, **kwargs):
    X, y, g = make_ranking(n_samples=n_samples, random_state=42, **kwargs)
    rnd = np.random.RandomState(42)
    w = rnd.rand(X.shape[0]) * 0.01
    g_rle = np.array([len(list(grp)) for _, grp in groupby(g)])

    if output == 'dataframe':
        # add target, weight, and group to DataFrame so that partitions abide
        # by group boundaries.
        X_df = pd.DataFrame(X, columns=[f'feature_{i}' for i in range(X.shape[1])])
        X = X_df.copy()
        X_df = X_df.assign(y=y, g=g, w=w)

        # set_index ensures partitions are based on group id.
        # See https://stackoverflow.com/questions/49532824/dask-dataframe-split-partitions-based-on-a-column-or-function.
        X_df.set_index('g', inplace=True)
        dX = dd.from_pandas(X_df, chunksize=chunk_size)

        # separate target, weight from features.
        dy = dX['y']
        dw = dX['w']
        dX = dX.drop(columns=['y', 'w'])
        dg = dX.index.to_series()

        # encode group identifiers into run-length encoding, the format
        # LightGBMRanker is expecting so that within each partition,
        # sum(g) = n_samples.
        dg = dg.map_partitions(
            lambda p: p.groupby('g', sort=False).apply(lambda z: z.shape[0]))
    elif output == 'array':
        # ranking arrays: one chunk per group. Each chunk must include all
        # columns.
        p = X.shape[1]
        dX, dy, dw, dg = [], [], [], []
        for g_idx, rhs in enumerate(np.cumsum(g_rle)):
            lhs = rhs - g_rle[g_idx]
            dX.append(da.from_array(X[lhs:rhs, :], chunks=(rhs - lhs, p)))
            dy.append(da.from_array(y[lhs:rhs]))
            dw.append(da.from_array(w[lhs:rhs]))
            dg.append(da.from_array(np.array([g_rle[g_idx]])))

        dX = da.concatenate(dX, axis=0)
        dy = da.concatenate(dy, axis=0)
        dw = da.concatenate(dw, axis=0)
        dg = da.concatenate(dg, axis=0)
    else:
        raise ValueError('Ranking data creation only supported for Dask arrays and dataframes')

    return X, y, w, g_rle, dX, dy, dw, dg
def _convert_C_to_F_order(client, X, chunksizes, n_features, dtype):
    X_ddh = DistributedDataHandler.create(data=X, client=client)
    X_converted = [
        client.submit(cp.array, X_part, copy=False, order='F', workers=[w])
        for idx, (w, X_part) in enumerate(X_ddh.gpu_futures)
    ]
    X_dela = _create_delayed(X_converted, dtype, chunksizes, n_features)
    return da.concatenate(X_dela, axis=0)
def pad_chunks(darray, chunklen):
    '''make sure chunks are the right shape'''
    padlen = chunklen - np.mod(darray.shape[0], chunklen)
    if padlen == 0:
        return darray
    else:
        pad = da.zeros((padlen,), dtype=np.complex64)
        padded = da.concatenate([darray, pad], axis=0)
        return padded
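# Sanity check for the padding helper above (lengths chosen arbitrarily):
import numpy as np
import dask.array as da

x = da.arange(10, chunks=4).astype(np.complex64)
padded = pad_chunks(x, 4)  # 10 % 4 == 2, so 2 zeros are appended
assert padded.shape == (12,)
assert (padded[-2:].compute() == 0).all()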
def parallel_request_OA(self) -> da.array:
    """
    Requests elevation data from OpenAltimetry API in parallel.
    Currently supports OA_Products ['ATL06','ATL07','ATL08','ATL10','ATL12','ATL13']

    For ATL03 Photon Data, OA only supports single date request according to:
    https://openaltimetry.org/data/swagger-ui/#/Public/getATL08DataByDate,
    with geospatial limitation of 1 degree lat/lon. Visualization of ATL03
    data is not implemented within this module at this time.

    Returns
    -------
    OA_data_da : dask.Array
        A dask array containing the ICESat-2 data.
    """
    print("Generating urls")

    # generate parameter lists for OA requesting
    OA_para_list = self.generate_OA_parameters()

    url_number = len(OA_para_list)

    if url_number > 200:
        answer = user_check(
            "Too many API requests, this may take a long time, "
            "do you still want to continue: please enter yes/no\n")

        if answer == "yes":
            pass
        else:
            return

    print("Sending request to OpenAltimetry, please wait...")

    # Parallel processing
    requested_OA_data = []

    with concurrent.futures.ThreadPoolExecutor(
            max_workers=len(OA_para_list)) as executor:
        parallel_OA_data = {
            executor.submit(self.request_OA_data, para): para
            for para in OA_para_list
        }

        for future in tqdm(
                iterable=concurrent.futures.as_completed(parallel_OA_data),
                total=len(parallel_OA_data),
        ):
            r = future.result()
            if r is not None:
                requested_OA_data.append(r)

    if not requested_OA_data:
        return
    else:
        OA_data_da = da.concatenate(requested_OA_data, axis=0)
        return OA_data_da
def _stage_1(G: Array, X: Array, Y: Array,
             alphas: Optional[NDArray[np.float_]] = None) -> Array:
    """Stage 1 - WGR Base Regression

    This stage will predict outcomes separately for each alpha parameter and
    variant block. This "compresses" the variant dimension into a smaller
    space that is much more amenable to efficient blockwise regressions in
    stage 2. Another interpretation for this operation is that all sample
    blocks are treated as folds in a K-fold CV fit within one single variant
    block. Predictions for any one combination of variant and sample block
    then correspond to a regression model fit all across sample blocks for
    that range of variants except for a single sample block. In other words,
    the predictions are out of sample which enables training of a stage 2
    regressor based on these predictions, a technique commonly referred to
    as stacking.

    For more details, see the level 0 regression model described in step 1
    of [Mbatchou et al. 2020](https://www.biorxiv.org/content/10.1101/2020.06.19.162354v2).
    """
    assert G.ndim == 2
    assert X.ndim == 2
    assert Y.ndim == 2
    # Check that chunking across samples is the same for all arrays
    assert G.shape[0] == X.shape[0] == Y.shape[0]
    assert G.numblocks[0] == X.numblocks[0] == Y.numblocks[0]
    assert G.chunks[0] == X.chunks[0] == Y.chunks[0]
    assert X.numblocks[1] == Y.numblocks[1] == 1
    if alphas is None:
        alphas = get_alphas(G.shape[1], like=G)
    # Extract shape statistics
    n_sample = G.shape[0]
    n_outcome = Y.shape[1]
    n_alpha = alphas.size
    n_sample_block = G.numblocks[0]
    n_variant_block = G.numblocks[1]
    sample_chunks = Y.chunks[0]

    YP = []
    for i in range(n_variant_block):
        # Extract all sample blocks for one variant block
        GB = G.blocks[:, i]
        # Prepend covariates and chunk along first dim only
        XGB = da.concatenate((X, GB), axis=1)
        XGB = XGB.rechunk(chunks=(None, -1))
        # Fit and predict folds for each parameter and outcome
        YPB = _ridge_regression_cv(XGB, Y, alphas, n_zero_reg=X.shape[1])[-1]
        assert_block_shape(YPB, 1, n_sample_block, 1)
        assert_chunk_shape(YPB, n_alpha, sample_chunks[0], n_outcome)
        assert_array_shape(YPB, n_alpha, n_sample, n_outcome)
        YP.append(YPB)
    # Stack as (n_variant_block, n_alpha, n_sample, n_outcome)
    YP = da.stack(YP, axis=0)
    assert_block_shape(YP, n_variant_block, 1, n_sample_block, 1)
    assert_chunk_shape(YP, 1, n_alpha, sample_chunks[0], n_outcome)
    assert_array_shape(YP, n_variant_block, n_alpha, n_sample, n_outcome)
    return YP
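# The core per-block move above prepends the covariates to one variant block
# and collapses the column chunks, so each sample block becomes a single
# task for the ridge CV. A shape-level sketch of that pattern with invented
# sizes:
import dask.array as da

X = da.ones((100, 3), chunks=(25, 3))     # covariates, 4 sample blocks
GB = da.ones((100, 50), chunks=(25, 50))  # one variant block
XGB = da.concatenate((X, GB), axis=1)     # still two column blocks
XGB = XGB.rechunk(chunks=(None, -1))      # one column block per sample block
assert XGB.numblocks == (4, 1)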
def test_gh3937():
    # test for github issue #3937
    x = da.from_array([1, 2, 3.], (2,))
    x = da.concatenate((x, [x[-1]]))
    y = x.rechunk((2,))
    # This will produce Integral type indices that are not ints (np.int64),
    # failing the optimizer
    y = da.coarsen(np.sum, y, {0: 2})
    # How to trigger the optimizer explicitly?
    y.compute()
def test_mixed_output_type():
    y = da.random.random((10, 10), chunks=(5, 5))
    y[y < 0.4] = 0
    y = da.ma.masked_equal(y, 0)
    x = da.zeros((10, 1), chunks=(5, 1))

    z = da.concatenate([x, y], axis=1)
    assert z.shape == (10, 11)
    zz = z.compute()
    assert isinstance(zz, np.ma.masked_array)
def interleaved_concat(arrays, indices, axis=0):
    """Concatenate each array along the given axis, but also assign each
    array element into the location given by indices. This operation is used
    for groupby.transform.
    """
    if has_dask and isinstance(arrays[0], da.Array):
        if not _interleaved_indices_required(indices):
            return da.concatenate(arrays, axis)
        else:
            return _interleaved_concat_slow(arrays, indices, axis)
    else:
        return _interleaved_concat_numpy(arrays, indices, axis)
def coarsen_destagger_dask(x, blocks, stagger=None, mode='wrap'):
    """
    Examples
    --------
    >>> x = da.arange(6, chunks=6)
    >>> xc = coarsen_destagger_dask(x, {0: 2}, stagger=0)
    >>> xc.compute()
    array([ 1. ,  3. ,  3.5])
    >>> x = da.from_array(x, chunks=x.shape)
    >>> xc = coarsen_destagger_dask(x, {0: 2}, stagger=0)
    >>> xc.compute()
    array([ 1. ,  3. ,  3.5])
    """
    output_numpy = False
    try:
        x._keys
    except AttributeError:
        output_numpy = True
        x = da.from_array(x, x.shape)

    xcoarse = coarsen_centered_np(x, blocks)
    # TODO refactor this code to another function
    if stagger is not None:
        blk = {key: val for key, val in blocks.items() if key != stagger}
        left_inds = np.arange(0, x.shape[stagger], blocks[stagger])
        left = da.coarsen(np.sum, da.take(x, left_inds, stagger), blk)
        n = left.shape[stagger]

        # handle boundary conditions
        if mode == 'wrap':
            bc = da.take(left, [0], axis=stagger)
        elif mode == 'clip':
            bc = da.take(left, [-1], axis=stagger)
        else:
            raise ValueError(f"Unknown boundary `mode` given: {mode}")

        right = da.take(left, np.arange(1, n), axis=stagger)
        right = da.concatenate((right, bc), axis=stagger)
        xcoarse = xcoarse + (right - left) / 2

    n = np.prod(list(blocks.values()))
    ans = xcoarse / n

    if output_numpy:
        return ans.compute()
    else:
        return ans
def _create_global_data(self):
    self._result_data = self._scheme.prepared_data()
    for index, problem in self._global_problem.items():
        if isinstance(problem, list):
            data = [da.from_array(
                        self._result_data[p.dataset_descriptor.label]
                        .data.sel({self._scheme.model.global_dimension: p.index})
                        .values,
                        chunks='auto')
                    for p in problem]
            self._global_data[index] = da.concatenate(data).persist()
        else:
            data = self._result_data[problem.dataset_descriptor.label].data
            data = data.sel(
                {self._scheme.model.global_dimension: problem.index}).values
            self._global_data[index] = ds.delayed(data).persist()
def _build_data(self):
    """
    Generate the data payload for the new concatenated cube.

    Returns:
        The concatenated :class:`iris.cube.Cube` data payload.

    """
    skeletons = self._skeletons
    data = [skeleton.data for skeleton in skeletons]
    data = da.concatenate(data, self.axis)
    return data
def test_mixed_output_type():
    y = da.random.random((10, 10), chunks=(5, 5))
    y[y < 0.8] = 0
    y = y.map_blocks(sparse.COO.from_numpy)
    x = da.zeros((10, 1), chunks=(5, 1))

    z = da.concatenate([x, y], axis=1)
    assert z.shape == (10, 11)
    zz = z.compute()
    assert isinstance(zz, sparse.COO)
    assert zz.nnz == y.compute().nnz
def _reshape_llc_data(data, jdim):
    """Fix the weird problem with llc data array order."""
    # Can we do this without copying any data?
    # If not, we need to go upstream and implement this at the MDS level
    # Or can we fudge it with dask?
    # this is all very specific to the llc file output
    # would be nice to generalize more, but how?
    nside = data.shape[jdim] // LLC_NUM_FACES
    # how the LLC data is laid out along the j dimension
    strides = ((0, 3), (3, 6), (6, 7), (7, 10), (10, 13))
    # whether to reshape each face
    reshape = (False, False, False, True, True)
    # this will slice the data into 5 facets
    slices = [jdim * (slice(None),) + (slice(nside * st[0], nside * st[1]),)
              for st in strides]
    facet_arrays = [data[sl] for sl in slices]
    face_arrays = []
    for ar, rs, st in zip(facet_arrays, reshape, strides):
        nfaces_in_facet = st[1] - st[0]
        shape = list(ar.shape)
        if rs:
            # we assume the other horizontal dimension is immediately after
            # jdim
            shape[jdim] = ar.shape[jdim + 1]
            shape[jdim + 1] = ar.shape[jdim]
        # insert a length-1 dimension along which to concatenate
        shape.insert(jdim, 1)
        # modify the array shape in place, no copies allowed
        ar.shape = shape
        # now ar is properly shaped, but we still need to slice it into faces
        face_slice_dim = jdim + 1 + rs
        for n in range(nfaces_in_facet):
            face_slice = (face_slice_dim * (slice(None),) +
                          (slice(nside * n, nside * (n + 1)),))
            data_face = ar[face_slice]
            face_arrays.append(data_face)

    # We can't concatenate using numpy (hcat etc.) because it makes a copy,
    # presumably loading the memmaps into memory.
    # Using dask gets around this.
    # But what if we want different chunks, or already chunked the data
    # upstream? Doesn't seem like this is ideal
    # TODO: Refactor handling of dask arrays and chunking
    #return np.concatenate(face_arrays, axis=jdim)
    # the dask version doesn't work because of this:
    # https://github.com/dask/dask/issues/1645
    face_arrays_dask = [da.from_array(fa, chunks=fa.shape)
                        for fa in face_arrays]
    concat = da.concatenate(face_arrays_dask, axis=jdim)
    return concat
def __init__(self, fasulist, check=None, hdfdir=None):
    if not isinstance(fasulist, list):
        fasulist = [fasulist, ]
    if isinstance(fasulist[0], str):
        self.hflist = fasulist
    else:
        b = db.from_sequence([f._process(hdfdir=hdfdir) for f in fasulist])
        self.hflist = b.compute()
    self.hlist = [h5py.File(h, 'r+') for h in self.hflist]
    if check == "names":
        nref = self.hlist[0]['n']
        for i in range(1, len(self.hlist)):
            if not (nref[:] == self.hlist[i]['n'][:]).all():
                raise ValueError('Fasus with mismatched atom names')
    elif check == "masses":
        mref = self.hlist[0]['m']
        for i in range(1, len(self.hlist)):
            if not (mref[:] == self.hlist[i]['m'][:]).all():
                raise ValueError('Fasus with mismatched atom masses')
    xs = [da.from_array(h['x'], chunks=CHUNKS) for h in self.hlist]
    self.x = da.concatenate(xs)
    if 'b' in self.hlist[0]:
        xb = [da.from_array(h['b'], chunks=CHUNKS) for h in self.hlist]
        self.box = da.concatenate(xb)
    else:
        self.box = None
    self.fasulist = fasulist
    self.masses = self.hlist[0]['m']
    self.shape = self.shape()
    self.top = self.fasulist[0].top
def get_valid_images(directory, width=224, height=224, channels=3):
    '''
    Function to build needed arrays for training or validating the neural
    network using out of core processing. If labels are passed, get a list
    of training image files, their labels
    '''
    validationList, _ = get_list_of_validation_files(directory)  # Pass directory containing validation images
    print('There are ', len(validationList), ' files in the validation list.')
    print('Breaking the list into chunks to handle size of request.')
    chunkedList = get_chunks(validationList, 8000)
    print('The length of the chunkedList is: ', len(chunkedList))
    if channels == 3:
        for i in range(len(chunkedList)):
            print('i =', i)
            validation_sublist = chunkedList[i][:]
            X = create_holding_array(validation_sublist, width=width,
                                     height=height, channels=channels)  # Create empty array
            print('Shape of the holding array is: ', X.shape)
            print('Resizing 3-channel images for validation...')
            count = 0  # Set counter for empty array
            filenames = []
            for validFile in validation_sublist:
                filenames.append(os.path.basename(validFile))
                img = misc.imread(validFile)  # Read the image
                img = misc.imresize(img, size=(width, height, channels))  # Resize image with color channel = 3
                X[count] = img  # Store resized image in empty array
                count += 1  # Advance index counter
            print('Shape of X is: ', X.shape)
            print('Transposing X...')
            X1 = np.transpose(X, (2, 0, 1))
            print('Transposed shape for X is: ', X1.shape)
            if i == 0:
                print('Creating a dask array for images...')
                X_array = da.from_array(X1, chunks=4000)
            else:
                print('Concatenating the dask arrays...')
                X_array = da.concatenate(
                    [X_array, da.from_array(X1, chunks=4000)])
            del X, X1
        return X_array, filenames
    else:  # If number of channels != 1 or != 3
        print('Could not create dataset and resize training images...')
def concat(da_groups, axis=0) -> 'GroupManager':
    if axis == 0:
        all_groups = [da_group.groups for da_group in da_groups]
        da_group_dict = GroupManager()
        intersection_groups = set(all_groups[0])
        for group in all_groups[1:]:
            intersection_groups = intersection_groups.intersection(set(group))
        if len(intersection_groups) > 0:
            # to maintain connexions order
            groups = [group for group in all_groups[0]
                      if group in intersection_groups]
            for group in groups:
                da_arrays = [da_group.conn[group] for da_group in da_groups]
                da_array_c = da.concatenate(da_arrays, axis=axis)
                da_group_dict[group] = da_array_c
            return da_group_dict
        else:
            return sum(da_groups)
    else:
        raise NotImplementedError
def _from_p(self, mode):
    """Convert the image from P or PA to RGB or RGBA."""
    self._check_modes(("P", "PA"))

    if not self.palette:
        raise RuntimeError("Can't convert palettized image, missing palette.")
    pal = np.array(self.palette)
    pal = da.from_array(pal, chunks=pal.shape)

    if pal.shape[1] == 4:
        # colormap's alpha overrides data alpha
        mode = "RGBA"
        alpha = None
    elif self.mode.endswith("A"):
        # add a new/fake 'bands' dimension to the end
        alpha = self.data.sel(bands="A").data[..., None]
        mode = mode + "A" if not mode.endswith("A") else mode
    else:
        alpha = None

    flat_indexes = self.data.sel(bands='P').data.ravel().astype('int64')
    dim_sizes = ((key, val) for key, val in self.data.sizes.items()
                 if key != 'bands')
    dims, new_shape = zip(*dim_sizes)
    dims = dims + ('bands',)
    new_shape = new_shape + (pal.shape[1],)
    new_data = pal[flat_indexes].reshape(new_shape)
    coords = dict(self.data.coords)
    coords["bands"] = list(mode)

    if alpha is not None:
        new_arr = da.concatenate((new_data, alpha), axis=-1)
        data = xr.DataArray(new_arr, coords=coords, attrs=self.data.attrs,
                            dims=dims)
    else:
        data = xr.DataArray(new_data, coords=coords, attrs=self.data.attrs,
                            dims=dims)
    return data
def palettize(self, colormap):
    """Palettize the current image using `colormap`.

    .. note:: Works only on "L" or "LA" images.

    """
    if self.mode not in ("L", "LA"):
        raise ValueError("Image should be grayscale to colorize")

    l_data = self.data.sel(bands=['L'])
    new_data = l_data.data.map_blocks(self._palettize, colormap,
                                      dtype=l_data.dtype)
    self.palette = tuple(colormap.colors)

    if self.mode == "L":
        mode = "P"
    else:
        mode = "PA"
        new_data = da.concatenate([new_data, self.data.sel(bands=['A'])],
                                  axis=0)

    self.data.data = new_data
    self.data.coords['bands'] = list(mode)
def colorize(self, colormap):
    """Colorize the current image using `colormap`.

    .. note:: Works only on "L" or "LA" images.

    """
    if self.mode not in ("L", "LA"):
        raise ValueError("Image should be grayscale to colorize")

    if self.mode == "LA":
        alpha = self.data.sel(bands=['A'])
    else:
        alpha = None

    l_data = self.data.sel(bands=['L'])
    new_data = l_data.data.map_blocks(
        self._colorize, colormap,
        chunks=(colormap.colors.shape[1],) + l_data.data.chunks[1:],
        dtype=np.float64)

    if colormap.colors.shape[1] == 4:
        mode = "RGBA"
    elif alpha is not None:
        new_data = da.concatenate([new_data, alpha.data], axis=0)
        mode = "RGBA"
    else:
        mode = "RGB"

    # copy the coordinates so we don't affect the original
    coords = dict(self.data.coords)
    coords['bands'] = list(mode)
    attrs = self.data.attrs
    dims = self.data.dims
    self.data = xr.DataArray(new_data, coords=coords, attrs=attrs, dims=dims)
def reset(self):
    '''
    Removes any alignment from the trajectories
    '''
    xs = [da.from_array(h['x'], chunks=CHUNKS) for h in self.hlist]
    self.x = da.concatenate(xs)
def stack(signal_list, axis=None, new_axis_name='stack_element', lazy=None,
          **kwargs):
    """Concatenate the signals in the list over a given axis or a new axis.

    The title is set to that of the first signal in the list.

    Parameters
    ----------
    signal_list : list of BaseSignal instances
    axis : {None, int, str}
        If None, the signals are stacked over a new axis. The data must
        have the same dimensions. Otherwise the signals are stacked over the
        axis given by its integer index or its name. The data must have the
        same shape, except in the dimension corresponding to `axis`.
    new_axis_name : string
        The name of the new axis when `axis` is None.
        If an axis with this name already exists it automatically appends
        '-i', where `i` are integers, until it finds a name that is not
        yet in use.
    lazy : {bool, None}
        Returns a LazySignal if True. If None, only returns a lazy result
        if at least one is lazy.

    Returns
    -------
    signal : BaseSignal instance (or subclass, determined by the objects in
        signal list)

    Examples
    --------
    >>> data = np.arange(20)
    >>> s = hs.stack([hs.signals.Signal1D(data[:10]),
    ...               hs.signals.Signal1D(data[10:])])
    >>> s
    <Signal1D, title: Stack of , dimensions: (2, 10)>
    >>> s.data
    array([[ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9],
           [10, 11, 12, 13, 14, 15, 16, 17, 18, 19]])

    """
    from itertools import zip_longest
    from hyperspy.signals import BaseSignal
    import dask.array as da
    from numbers import Number
    # TODO: remove next time
    deprecated = ['mmap', 'mmap_dir']
    warn_str = "'{}' argument is deprecated, please use 'lazy' instead"
    for k in deprecated:
        if k in kwargs:
            lazy = True
            warnings.warn(warn_str.format(k), VisibleDeprecationWarning)

    axis_input = copy.deepcopy(axis)
    signal_list = list(signal_list)
    # Get the real signal with the most axes to get metadata/class/etc
    # first = sorted(filter(lambda _s: isinstance(_s, BaseSignal), signal_list),
    #                key=lambda _s: _s.data.ndim)[-1]
    first = next(filter(lambda _s: isinstance(_s, BaseSignal), signal_list))

    # Cast numbers as signals. Will broadcast later
    for i, _s in enumerate(signal_list):
        if isinstance(_s, BaseSignal):
            pass
        elif isinstance(_s, Number):
            sig = BaseSignal(_s)
            signal_list[i] = sig
        else:
            raise ValueError("{} type cannot be stacked.".format(type(_s)))

    if lazy is None:
        lazy = any(_s._lazy for _s in signal_list)
    if not isinstance(lazy, bool):
        raise ValueError("'lazy' argument has to be None, True or False")

    # Cast all as lazy if required
    for i, _s in enumerate(signal_list):
        if not _s._lazy:
            signal_list[i] = _s.as_lazy()

    if len(signal_list) > 1:
        newlist = broadcast_signals(*signal_list, ignore_axis=axis_input)
        if axis is not None:
            step_sizes = [s.axes_manager[axis].size for s in newlist]
            axis = newlist[0].axes_manager[axis]
        datalist = [s.data for s in newlist]
        newdata = da.stack(datalist, axis=0) if axis is None else \
            da.concatenate(datalist, axis=axis.index_in_array)
        if axis_input is None:
            signal = first.__class__(newdata)
            signal._lazy = True
            signal._assign_subclass()
            signal.axes_manager._axes[1:] = copy.deepcopy(
                newlist[0].axes_manager._axes)
            axis_name = new_axis_name
            axis_names = [axis_.name for axis_ in
                          signal.axes_manager._axes[1:]]
            j = 1
            while axis_name in axis_names:
                axis_name = new_axis_name + "_%i" % j
                j += 1
            eaxis = signal.axes_manager._axes[0]
            eaxis.name = axis_name
            eaxis.navigate = True  # This triggers _update_parameters
            signal.metadata = copy.deepcopy(first.metadata)
            # Get the title from 1st object
            signal.metadata.General.title = (
                "Stack of " + first.metadata.General.title)
            signal.original_metadata = DictionaryTreeBrowser({})
        else:
            signal = newlist[0]._deepcopy_with_new_data(newdata)
            signal._lazy = True
            signal._assign_subclass()
            signal.get_dimensions_from_data()
        signal.original_metadata.add_node('stack_elements')

        for i, obj in enumerate(signal_list):
            signal.original_metadata.stack_elements.add_node('element%i' % i)
            node = signal.original_metadata.stack_elements['element%i' % i]
            node.original_metadata = \
                obj.original_metadata.as_dictionary()
            node.metadata = \
                obj.metadata.as_dictionary()

        if axis_input is None:
            axis_input = signal.axes_manager[-1 + 1j].index_in_axes_manager
            step_sizes = 1

        signal.metadata._HyperSpy.set_item('Stacking_history.axis',
                                           axis_input)
        signal.metadata._HyperSpy.set_item('Stacking_history.step_sizes',
                                           step_sizes)

        if np.all([
                s.metadata.has_item('Signal.Noise_properties.variance')
                for s in signal_list
        ]):
            variance = stack([
                s.metadata.Signal.Noise_properties.variance
                for s in signal_list
            ], axis)
            signal.metadata.set_item('Signal.Noise_properties.variance',
                                     variance)
    else:
        signal = signal_list[0]

    # Leave as lazy or compute
    if lazy:
        signal = signal.as_lazy()
    else:
        signal.compute(False)

    return signal
def DAFT(x, axis=-1, chunksize=2**26):
    """Disk-Array Fourier Transform

    This function enables Fourier transforms of a very large series, where
    the entire series will not fit in memory. The standard radix-2
    Cooley–Tukey algorithm is used to split the series up into smaller
    pieces until a given piece can be done entirely in memory. This smaller
    result is then stored as a `dask.array`, and combined with other similar
    results out of memory, using dask.

    Parameters
    ----------
    x : array_like
        Input array, can be complex.
    axis : int, optional
        Axis over which to compute the FFT. If not given, the last axis is
        used.
    chunksize : int, optional
        Chunksize to use when splitting up the input array. Default is
        2**24, which is about 64MB -- a reasonable target that reduces
        memory usage.

    Returns
    -------
    X_da : dask Array object
        The Fourier transform is not yet computed; you must call
        `X_da.compute()` on the result to perform the computation.

    Example
    -------
    >>> import numpy as np
    >>> from chest import Chest  # For more flexible caching
    >>> cache = Chest(available_memory=(4 * 1024**3))  # Use 4GB at most
    >>> N = 2**26
    >>> chunksize = N//(2**2)
    >>> np.random.seed(1234)
    >>> x = np.random.random(N) + 1j*np.random.random(N)
    >>> X_dask = DAFT(x, chunksize=chunksize)
    >>> %tic
    >>> X_DAFT = X_dask.compute(cache=cache)
    >>> %toc
    >>> %tic
    >>> X_np = np.fft.fft(x)
    >>> %toc
    >>> np.allclose(X_DAFT, X_np)
    """
    import numpy as np
    import dask.array as da
    if axis < 0:
        axis = x.ndim + axis
    N = x.shape[axis]
    chunks = tuple(1 if ax != axis else chunksize
                   for ax, dim in enumerate(x.shape))
    if isinstance(x, da.Array):
        x_da = x.rechunk(chunks=chunks)
    else:
        x_da = da.from_array(x, chunks=chunks)
    W = np.exp(-2j * np.pi * np.arange(N) / N)
    # print(x.shape, axis, x_da.chunks, x_da.chunks[axis]); sys.stdout.flush()
    slice_even = tuple(slice(None) if ax != axis else slice(None, None, 2)
                       for ax in range(x_da.ndim))
    slice_odd = tuple(slice(None) if ax != axis else slice(1, None, 2)
                      for ax in range(x_da.ndim))
    if len(x_da.chunks[axis]) != 1:
        # TODO: Fix the following lines to be correct when x is multi-dimensional
        FFT_even = DAFT(x_da[slice_even], axis, chunksize=chunksize)
        FFT_odd = DAFT(x_da[slice_odd], axis, chunksize=chunksize)
    else:
        # TODO: Fix the following lines to be correct when x is multi-dimensional
        FFT_even = da.fft.fft(x_da[slice_even], n=None, axis=axis)
        FFT_odd = da.fft.fft(x_da[slice_odd], n=None, axis=axis)
    # TODO: Fix the following line to broadcast W correctly when x is multi-dimensional
    return da.concatenate([FFT_even + W[:N//2] * FFT_odd,
                           FFT_even + W[N//2:] * FFT_odd], axis=axis)
def valid_images_to_hdf5(directory, width=224, height=224, channels=3):
    '''
    Function to build needed arrays for training or validating the neural
    network using out of core processing. If labels are passed, get a list
    of training image files, their labels
    '''
    validationList, _ = get_list_of_validation_files(directory)  # Pass directory containing validation images
    print('Creating the hdf5 file...')
    len_array = len(validationList)
    with h5py.File('validation_files.h5', 'w') as hf:
        dset = hf.create_dataset('validation_array',
                                 (len_array, channels, width, height),
                                 chunks=True)
        img_names = hf.create_dataset('image_names', (len_array,),
                                      chunks=True, dtype='S40')
    with h5py.File('validation_files.h5', 'r+') as hf:
        x = hf['validation_array']
        X = da.from_array(x, chunks=1000)
        image_names = list(hf['image_names'])
    print('There are ', len(validationList), ' files in the validation list.')
    print('Breaking the validation list into chunks of 10,000...')
    chunkedList = get_chunks(validationList, 10000)  # Break the list of files in to chunks of 10000
    if channels == 3:
        for i, chunk in enumerate(chunkedList):
            # print(chunk)
            count = i + len(chunk[i][:]) * i  # Set counter for empty array
            # valid_sublist = chunk[i][:]
            print('Create empty list to store image names..')
            filenames = []
            print('Creating an empty array to store images...')
            X = create_holding_array(chunk, width=width, height=height,
                                     channels=channels)  # Create empty array
            for j, validFile in enumerate(chunk):
                print('Reading file #: ', j)
                filenames.append(os.path.basename(validFile))
                # print(chunk)
                # input('')
                img = misc.imread(validFile)  # Read the image
                img = misc.imresize(img, size=(width, height, channels))  # Resize image with color channel = 3
                X[j] = img  # Store resized image in empty array
            asciiList = []
            asciiList = [n.encode("ascii", "ignore") for n in filenames]
            X1 = np.transpose(X, (0, 3, 1, 2))
            del X, filenames
            print(X1.shape)
            X_da = da.from_array(X1, chunks=1000)
            print('Opening validation_files.h5...')
            with h5py.File('validation_files.h5', 'r+') as hf:
                print('Putting validation_array in x...')
                x = hf['validation_array']
                print('Putting validation_array in dask array...')
                dset = da.from_array(x, chunks=1000)
                print('Concatenating the two dask arrays...')
                X2 = da.concatenate([dset, X_da], axis=0)
                print('Storing the dask array in the hdf5 file...')
                da.store(X2, x)
                print('Put image_names dset into a list...')
                image_names = list(hf['image_names'])
                print('Extend the list with additional image names...')
                image_names.extend(asciiList)
        print('Done.')
        return filenames
    else:  # If number of channels != 1 or != 3
        print('Could not create dataset and resize training images...')
def read_PD0_bytes_ensembles(PD0_BYTES, return_pd0=False, headerid='\x7f\x7f',
                             format='sentinel', use_dask=True, chunks=1e4,
                             verbose=True, print_every=1000):
    """
    Finds the hex positions in the bytearray that identify the header of
    each ensemble. Then reads each ensemble into a dictionary and
    accumulates them in a list.
    """
    chunks = int(chunks)
    if format == 'workhorse':
        parsepd0 = parse_pd0_bytearray
    elif format == 'sentinel':
        parsepd0 = parse_sentinelVpd0_bytearray
    else:
        print('Unknown *.pd0 format')

    # Split segments of the byte array per ensemble.
    ensbytes = PD0_BYTES.split(headerid)
    ensbytes = [headerid + ens for ens in ensbytes]  # Prepend header id back.
    ensbytes = ensbytes[1:]  # First entry is empty, cap it off.
    nens = len(ensbytes)
    nensm = nens - 1
    fbad_ens = []
    BAD_ENS = []
    # embed()

    # Get timestamps for all ensembles.
    # Note that these timestamps indicate the Janus' (i.e., beams 1-4) pings,
    # which will not necessarily be the same as the vertical beam's timestamp.
    t = np.empty(nens, dtype=object)

    if use_dask:
        DATA = darr.from_array(np.array([], dtype=object, ndmin=1),
                               chunks=chunks)
        ntotalchunks = nens // chunks
        rem_ens = nens % chunks
        has_tail = rem_ens > 0
        if has_tail:
            ntotalchunks += 1  # Last chunk takes remaining ensembles.
        DATAbuffskel = np.empty(chunks, dtype=object)
        DATAbuff = DATAbuffskel.copy()
        daNaN = darr.from_array(np.array(np.nan, ndmin=1), chunks=1)
        cont_inchnk = 0
    else:
        DATA = np.empty(nens, dtype=object)

    nChecksumError, nReadChecksumError, nReadHeaderError = 0, 0, 0
    cont = 0
    cont_inchnk = 0
    for ensb in ensbytes:
        try:
            if use_dask:
                dd = delayed(parsepd0)(ensb)
            else:
                dd = parsepd0(ensb)
            # embed()
            t[cont] = dd['timestamp']
        except (ChecksumError, ReadChecksumError, ReadHeaderError) as E:
            t[cont] = np.nan
            fbad_ens.append(cont)  # Store index of bad ensemble.
            # BAD_ENS.append(ens)  # Store bytes of the bad ensemble.

            # Which type of error was it?
            if isinstance(E, ChecksumError):
                nChecksumError += 1
            elif isinstance(E, ReadChecksumError):
                nReadChecksumError += 1
            elif isinstance(E, ReadHeaderError):
                nReadHeaderError += 1

            if use_dask:
                if cont_inchnk == chunks:
                    DATA = darr.concatenate((DATA, daNaN.copy()))
                    DATAbuff = DATAbuffskel.copy()
                    cont_inchnk = 0
                else:
                    DATAbuff[cont_inchnk] = np.nan
                    cont_inchnk += 1
                    if has_tail and cont == nensm:  # Save the last chunk.
                        DATA = darr.concatenate((DATA, daNaN.copy()))
            else:
                DATA[cont] = np.nan
            cont += 1
            continue

        if use_dask:
            if cont_inchnk == chunks:
                DATA = darr.concatenate(
                    (DATA, darr.from_array(DATAbuff, chunks=chunks)))
                DATAbuff = DATAbuffskel.copy()
                cont_inchnk = 0
                # embed()
            else:
                DATAbuff[cont_inchnk] = dd
                cont_inchnk += 1
                if has_tail and cont == nensm:  # Save the last chunk.
                    DATA = darr.concatenate(
                        (DATA, darr.from_array(DATAbuff, chunks=chunks)))
        else:
            DATA[cont] = dd
        cont += 1
        if verbose and not cont % print_every:
            print("Ensemble %d" % cont)

    errortype_count = dict(bad_checksum=nChecksumError,
                           read_checksum=nReadChecksumError,
                           read_header=nReadHeaderError)

    # Extract ensemble-independent fields (store in xr.Dataset attributes).
    # fixed_attrs = _pick_misc(DATA)  # FIXME
    fixed_attrs = []
    # embed()

    if return_pd0:
        ret = (DATA, t, fixed_attrs, BAD_ENS, fbad_ens, errortype_count,
               PD0_BYTES)
    else:
        ret = (DATA, t, fixed_attrs, BAD_ENS, fbad_ens, errortype_count)

    return ret
def rolling_window(a, axis, window, center, fill_value):
    """Dask's equivalence to np.utils.rolling_window"""
    orig_shape = a.shape
    # inputs for ghost
    if axis < 0:
        axis = a.ndim + axis
    depth = {d: 0 for d in range(a.ndim)}
    depth[axis] = int(window / 2)
    # For evenly sized window, we need to crop the first point of each block.
    offset = 1 if window % 2 == 0 else 0

    if depth[axis] > min(a.chunks[axis]):
        raise ValueError(
            "For window size %d, every chunk should be larger than %d, "
            "but the smallest chunk size is %d. Rechunk your array\n"
            "with a larger chunk size or a chunk size that\n"
            "more evenly divides the shape of your array." %
            (window, depth[axis], min(a.chunks[axis])))

    # Although dask.ghost pads values to boundaries of the array,
    # the size of the generated array is smaller than what we want
    # if center == False.
    if center:
        start = int(window / 2)  # 10 -> 5,  9 -> 4
        end = window - 1 - start
    else:
        start, end = window - 1, 0
    pad_size = max(start, end) + offset - depth[axis]
    drop_size = 0
    # pad_size becomes more than 0 when the ghosted array is smaller than
    # needed. In this case, we need to enlarge the original array by padding
    # before ghosting.
    if pad_size > 0:
        if pad_size < depth[axis]:
            # Ghosting requires each chunk larger than depth. If pad_size is
            # smaller than the depth, we enlarge this and truncate it later.
            drop_size = depth[axis] - pad_size
            pad_size = depth[axis]
        shape = list(a.shape)
        shape[axis] = pad_size
        chunks = list(a.chunks)
        chunks[axis] = (pad_size,)
        fill_array = da.full(shape, fill_value, dtype=a.dtype, chunks=chunks)
        a = da.concatenate([fill_array, a], axis=axis)

    boundary = {d: fill_value for d in range(a.ndim)}

    # create ghosted arrays
    ag = da.ghost.ghost(a, depth=depth, boundary=boundary)

    # apply rolling func
    def func(x, window, axis=-1):
        x = np.asarray(x)
        rolling = nputils._rolling_window(x, window, axis)
        return rolling[(slice(None),) * axis + (slice(offset, None),)]

    chunks = list(a.chunks)
    chunks.append(window)
    out = ag.map_blocks(func, dtype=a.dtype, new_axis=a.ndim, chunks=chunks,
                        window=window, axis=axis)

    # crop boundary.
    index = ((slice(None),) * axis +
             (slice(drop_size, drop_size + orig_shape[axis]),))
    return out[index]
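# The padding branch above is where `da.concatenate` comes in; in isolation
# it looks like this (sizes invented; `da.ghost` itself belongs to an older
# dask API and is not reproduced here):
import numpy as np
import dask.array as da

a = da.arange(6, chunks=3).astype(float)
pad_size = 2
fill_array = da.full((pad_size,), np.nan, dtype=float, chunks=(pad_size,))
padded = da.concatenate([fill_array, a], axis=0)
print(padded.compute())  # [nan nan  0.  1.  2.  3.  4.  5.]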