def test_async(c, s, a, b):
    x = create_test_data()
    assert not dask.is_dask_collection(x)
    y = x.chunk({'dim2': 4}) + 10
    assert dask.is_dask_collection(y)
    assert dask.is_dask_collection(y.var1)
    assert dask.is_dask_collection(y.var2)

    z = y.persist()
    assert str(z)

    assert dask.is_dask_collection(z)
    assert dask.is_dask_collection(z.var1)
    assert dask.is_dask_collection(z.var2)
    assert len(y.__dask_graph__()) > len(z.__dask_graph__())

    assert not futures_of(y)
    assert futures_of(z)

    future = c.compute(z)
    w = yield future
    assert not dask.is_dask_collection(w)
    assert_allclose(x + 10, w)

    assert s.task_state
def test_get_individual_entries_of_matrices_simulation_example():
    """
    Tests that the function returns a dask task and that the computed task
    returns the expected tuple while using the simulation.
    """
    task = get_individual_entries_of_matrices(
        lambda_2=2,
        lambda_1_1=0.1,
        lambda_1_2=0.5,
        mu_1=2,
        mu_2=1,
        num_of_servers_1=2,
        num_of_servers_2=2,
        threshold_1=3,
        threshold_2=5,
        system_capacity_1=4,
        system_capacity_2=6,
        buffer_capacity_1=2,
        buffer_capacity_2=2,
        target=2,
        alpha=0.5,
        use_simulation=True,
        runtime=300,
        num_of_trials=3,
        warm_up_time=5,
        seed_num_1=0,
        seed_num_2=0,
    )
    assert da.is_dask_collection(task)
    values = da.compute(task)
    assert np.allclose(
        values,
        ((3, 5, 0.7613063676529543, -0.0006520260736895711, -0.027937014444158834),),
    )
def merge(
    self,
    other,
    on=None,
    how="left",
    left_index=False,
    right_index=False,
    suffixes=("_x", "_y"),
):
    """Merging two dataframes on the column(s) indicated in *on*."""
    if (
        left_index
        or right_index
        or not dask.is_dask_collection(other)
        or (self.npartitions == 1 and how in ("inner", "right"))
        or (other.npartitions == 1 and how in ("inner", "left"))
    ):
        return dd.merge(
            self,
            other,
            how=how,
            suffixes=suffixes,
            left_index=left_index,
            right_index=right_index,
        )

    if not on and not left_index and not right_index:
        on = [c for c in self.columns if c in other.columns]
        if not on:
            left_index = right_index = True

    return join_impl.join_frames(
        left=self,
        right=other,
        on=on,
        how=how,
        lsuffix=suffixes[0],
        rsuffix=suffixes[1],
    )
def gap_fill(x: xr.DataArray, fallback: xr.DataArray, nodata=None, attrs=None):
    """Fill missing values in `x` with values from `fallback`.

    x, fallback are expected to be xarray.DataArray with identical shape and dtype.

        out[pix] = x[pix] if x[pix] != x.nodata else fallback[pix]
    """
    if nodata is None:
        nodata = getattr(x, "nodata", None)

    if nodata is None:
        nodata = default_nodata(x.dtype)
    else:
        nodata = x.dtype.type(nodata)

    if attrs is None:
        attrs = x.attrs.copy()

    if dask.is_dask_collection(x):
        data = da.map_blocks(
            _gap_fill_np,
            x.data,
            fallback.data,
            nodata,
            name=randomize("gap_fill"),
            dtype=x.dtype,
        )
    else:
        data = _gap_fill_np(x.data, fallback.data, nodata)

    return xr.DataArray(
        data,
        attrs=attrs,
        dims=x.dims,
        coords=x.coords,
        name=x.name,
    )
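# A minimal numpy sketch of the fill rule described in the gap_fill docstring above
# (out[pix] = x[pix] if x[pix] != nodata else fallback[pix]). `_gap_fill_np` is assumed
# to implement something equivalent; the name below is illustrative only.
import numpy as np

def gap_fill_np_sketch(x, fallback, nodata):
    # NaN nodata needs isnan() because NaN != NaN element-wise
    if isinstance(nodata, float) and np.isnan(nodata):
        missing = np.isnan(x)
    else:
        missing = x == nodata
    return np.where(missing, fallback, x)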
def convert_dask_collection(dc):
    """
    Convert dask collection object into mars.core.Object via remote API

    Parameters
    ----------
    dc: dask collection
        Dask collection object to be converted.

    Returns
    -------
    Object
        Mars Object.
    """
    if not is_dask_collection(dc):
        raise TypeError(
            f"'{type(dc).__name__}' object is not a valid dask collection")

    dc.__dask_graph__().validate()
    dsk = optimize(dc)[0].__dask_graph__()

    first_key = next(iter(dsk.keys()))
    if isinstance(first_key, str):
        key = [first_key]
    elif isinstance(first_key, tuple):
        key = sorted(
            [i for i in dsk.keys() if i[0] == first_key[0]], key=lambda x: x[1])
    else:
        raise ValueError(
            f"Dask collection object seems to be broken, "
            f"with unexpected key type: '{type(first_key).__name__}'")
    res = reduce(mars_dask_get(dsk, [key]))
    if isinstance(dc, Bag):
        return spawn(lambda x: list(x[0][0]), args=(res,))
    else:
        return res
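# Hedged usage sketch for convert_dask_collection: the dask side below is concrete,
# while the mars execution idiom is an assumption about the surrounding mars API and
# is therefore left commented out.
import dask.array as da

lazy_sum = da.ones((100, 100), chunks=(10, 10)).sum()
# mars_obj = convert_dask_collection(lazy_sum)   # mars.core.Object wrapping the same graph
# value = mars_obj.execute().fetch()             # assumed mars idiom for materialising the result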
def xr_geomedian_tmad(ds, axis='time', where=None, **kw): """ :param ds: xr.Dataset|xr.DataArray|numpy array Other parameters: **kwargs -- passed on to pcm.gnmpcm maxiters : int 1000 eps : float 0.0001 num_threads: int| None None """ import hdstats def gm_tmad(arr, **kw): """ arr: a high dimensional numpy array where the last dimension will be reduced. returns: a numpy array with one less dimension than input. """ gm = hdstats.nangeomedian_pcm(arr, **kw) nt = kw.pop('num_threads', None) emad = hdstats.emad_pcm(arr, gm, num_threads=nt)[:,:, np.newaxis] smad = hdstats.smad_pcm(arr, gm, num_threads=nt)[:,:, np.newaxis] bcmad = hdstats.bcmad_pcm(arr, gm, num_threads=nt)[:,:, np.newaxis] return np.concatenate([gm, emad, smad, bcmad], axis=-1) def norm_input(ds, axis): if isinstance(ds, xr.DataArray): xx = ds if len(xx.dims) != 4: raise ValueError("Expect 4 dimensions on input: y,x,band,time") if axis is not None and xx.dims[3] != axis: raise ValueError(f"Can only reduce last dimension, expect: y,x,band,{axis}") return None, xx, xx.data elif isinstance(ds, xr.Dataset): xx = reshape_for_geomedian(ds, axis) return ds, xx, xx.data else: # assume numpy or similar xx_data = ds if xx_data.ndim != 4: raise ValueError("Expect 4 dimensions on input: y,x,band,time") return None, None, xx_data kw.setdefault('nocheck', False) kw.setdefault('num_threads', 1) kw.setdefault('eps', 1e-6) ds, xx, xx_data = norm_input(ds, axis) is_dask = dask.is_dask_collection(xx_data) if where is not None: if is_dask: raise NotImplementedError("Dask version doesn't support output masking currently") if where.shape != xx_data.shape[:2]: raise ValueError("Shape for `where` parameter doesn't match") set_nan = ~where else: set_nan = None if is_dask: if xx_data.shape[-2:] != xx_data.chunksize[-2:]: xx_data = xx_data.rechunk(xx_data.chunksize[:2] + (-1, -1)) data = da.map_blocks(lambda x: gm_tmad(x, **kw), xx_data, name=randomize('geomedian'), dtype=xx_data.dtype, chunks=xx_data.chunks[:-2] + (xx_data.chunks[-2][0]+3,), drop_axis=3) else: data = gm_tmad(xx_data, **kw) if set_nan is not None: data[set_nan, :] = np.nan if xx is None: return data dims = xx.dims[:-1] cc = {k: xx.coords[k] for k in dims} cc[dims[-1]] = np.hstack([xx.coords[dims[-1]].values,['edev', 'sdev', 'bcdev']]) xx_out = xr.DataArray(data, dims=dims, coords=cc) if ds is None: xx_out.attrs.update(xx.attrs) return xx_out ds_out = xx_out.to_dataset(dim='band') for b in ds.data_vars.keys(): src, dst = ds[b], ds_out[b] dst.attrs.update(src.attrs) return assign_crs(ds_out, crs=ds.geobox.crs)
def geomedian_with_mads(
    src: Union[xr.Dataset, xr.DataArray],
    compute_mads: bool = True,
    compute_count: bool = True,
    out_chunks: Optional[Tuple[int, int, int]] = None,
    reshape_strategy: str = "mem",
    scale: float = 1.0,
    offset: float = 0.0,
    eps: Optional[float] = None,
    maxiters: int = 1000,
    num_threads: int = 1,
    **kw,
) -> xr.Dataset:
    """
    Compute Geomedian on a Dask backed Dataset.

    NOTE: The default configuration of this code assumes that the entire input can be
    loaded into RAM on the Dask worker. It also assumes that there is only one worker
    in the cluster, or that the entire task will get scheduled on one single worker
    only. See the ``reshape_strategy`` parameter.

    :param src: xr.Dataset or a single array in YXBT order, bands can be either
                float or integer with `nodata` values to indicate gaps in data.

    :param compute_mads: Whether to compute smad, emad, bcmad statistics

    :param compute_count: Whether to compute the count statistic (number of
                          contributing observations per output pixel)

    :param out_chunks: Advanced option, allows rechunking the output internally,
                       order is ``(ny, nx, nband)``

    :param reshape_strategy: One of ``mem`` (default) or ``yxbt``. This is only applicable
                             when supplying a Dataset object. It controls how the Dataset is
                             reshaped into the DataArray format expected by the Geomedian
                             code. If you have enough RAM and use a single-worker Dask
                             cluster, then use ``mem``; it should be the most efficient. If
                             there is not enough RAM to load the entire input you can try
                             ``yxbt`` mode, but you might still run out of RAM anyway. If
                             using a multi-worker Dask cluster you have to use the ``yxbt``
                             strategy.

    :param scale, offset: Only used when the input contains integer values; the actual
                          Geomedian will run on scaled values ``scale*X+offset``. This only
                          affects the internal computation, the final result is scaled back
                          to the original value range.

    :param eps: Termination criteria passed on to the geomedian algorithm

    :param maxiters: Maximum number of iterations done per output pixel

    :param num_threads: Configure internal concurrency of the Geomedian computation.
                        Default is 1 as we assume that Dask will run a bunch of those
                        concurrently.

    :param work_chunks: Default is ``(100, 100)``, only applicable when input is a Dataset.
""" if not dask.is_dask_collection(src): raise ValueError("This method only works on Dask inputs") if isinstance(src, xr.DataArray): yxbt = src else: # TODO: better automatic defaults for work_chunks ny, nx = kw.get("work_chunks", (100, 100)) if reshape_strategy == "mem": yxbt = yxbt_sink(src, (ny, nx, -1, -1)) elif reshape_strategy == "yxbt": yxbt = reshape_yxbt(src, yx_chunks=(ny, nx)) else: raise ValueError( f"Reshape strategy '{reshape_strategy}' not understood use one of: mem or yxbt" ) ny, nx, nb, nt = yxbt.shape nodata = yxbt.attrs.get("nodata", None) assert yxbt.chunks is not None if yxbt.data.numblocks[2:4] != (1, 1): raise ValueError( "There should be one dask block along time and band dimension") n_extras = (3 if compute_mads else 0) + (1 if compute_count else 0) chunks = (*yxbt.chunks[:2], (nb + n_extras, )) is_float = yxbt.dtype.kind == "f" if eps is None: eps = 1e-4 if is_float else 0.1 * scale op = functools.partial( _gm_mads_compute_f32, compute_mads=compute_mads, compute_count=compute_count, nodata=nodata, scale=scale, offset=offset, eps=eps, maxiters=maxiters, num_threads=num_threads, ) _gm = da.map_blocks(op, yxbt.data, dtype="float32", drop_axis=3, chunks=chunks, name="geomedian") if out_chunks is not None: _gm = _gm.rechunk(out_chunks) gm_data = _gm[:, :, :nb] if not is_float: gm_data = da.map_blocks( lambda x: from_float_np( x, yxbt.dtype, nodata, scale=1 / scale, offset=offset / scale), gm_data, dtype=yxbt.dtype, ) dims = yxbt.dims[:3] coords = {k: yxbt.coords[k] for k in dims} result = xr.DataArray(data=gm_data, dims=dims, coords=coords, attrs=yxbt.attrs).to_dataset("band") for dv in result.data_vars.values(): dv.attrs.update(yxbt.attrs) next_stat = nb if compute_mads: smad = _gm[:, :, next_stat + 0] emad = _gm[:, :, next_stat + 1] bcmad = _gm[:, :, next_stat + 2] next_stat += 3 if not is_float: emad = emad * (1 / scale) result["smad"] = xr.DataArray(data=smad, dims=dims[:2], coords=result.coords) result["emad"] = xr.DataArray(data=emad, dims=dims[:2], coords=result.coords) result["bcmad"] = xr.DataArray(data=bcmad, dims=dims[:2], coords=result.coords) if compute_count: count = _gm[:, :, next_stat].astype("uint16") next_stat += 1 result["count"] = xr.DataArray(data=count, dims=dims[:2], coords=result.coords) return result
def to_raster(data, filename, readxsize=None, readysize=None, use_dask_store=False, separate=False, out_block_type='zarr', keep_blocks=False, verbose=0, overwrite=False, gdal_cache=512, scheduler='mpool', n_jobs=1, n_workers=None, n_threads=None, n_chunks=None, overviews=False, resampling='nearest', use_client=False, address=None, total_memory=48, **kwargs): """ Writes a ``dask`` array to a raster file Args: data (DataArray): The ``xarray.DataArray`` to write. filename (str): The output file name to write to. readxsize (Optional[int]): The size of column chunks to read. If not given, ``readxsize`` defaults to Dask chunk size. readysize (Optional[int]): The size of row chunks to read. If not given, ``readysize`` defaults to Dask chunk size. separate (Optional[bool]): Whether to write blocks as separate files. Otherwise, write to a single file. use_dask_store (Optional[bool]): Whether to use ``dask.array.store`` to save with Dask task graphs. out_block_type (Optional[str]): The output block type. Choices are ['gtiff', 'zarr']. Only used if ``separate`` = ``True``. keep_blocks (Optional[bool]): Whether to keep the blocks stored on disk. Only used if ``separate`` = ``True``. verbose (Optional[int]): The verbosity level. overwrite (Optional[bool]): Whether to overwrite an existing file. gdal_cache (Optional[int]): The ``GDAL`` cache size (in MB). scheduler (Optional[str]): The ``concurrent.futures`` scheduler to use. Choices are ['processes', 'threads', 'mpool']. mpool: process pool of workers using ``multiprocessing.Pool`` processes: process pool of workers using ``concurrent.futures`` threads: thread pool of workers using ``concurrent.futures`` n_jobs (Optional[int]): The total number of parallel jobs. n_workers (Optional[int]): The number of processes. n_threads (Optional[int]): The number of threads. n_chunks (Optional[int]): The chunk size of windows. If not given, equal to ``n_workers`` x 50. overviews (Optional[bool or list]): Whether to build overview layers. resampling (Optional[str]): The resampling method for overviews when ``overviews`` is ``True`` or a ``list``. Choices are ['average', 'bilinear', 'cubic', 'cubic_spline', 'gauss', 'lanczos', 'max', 'med', 'min', 'mode', 'nearest']. use_client (Optional[bool]): Whether to use a ``dask`` client. address (Optional[str]): A cluster address to pass to client. Only used when ``use_client`` = ``True``. total_memory (Optional[int]): The total memory (in GB) required when ``use_client`` = ``True``. kwargs (Optional[dict]): Additional keyword arguments to pass to ``rasterio.write``. 
Returns: ``dask.delayed`` object Examples: >>> import geowombat as gw >>> >>> # Use 8 parallel workers >>> with gw.open('input.tif') as ds: >>> gw.to_raster(ds, 'output.tif', n_jobs=8) >>> >>> # Use 4 process workers and 2 thread workers >>> with gw.open('input.tif') as ds: >>> gw.to_raster(ds, 'output.tif', n_workers=4, n_threads=2) >>> >>> # Control the window chunks passed to concurrent.futures >>> with gw.open('input.tif') as ds: >>> gw.to_raster(ds, 'output.tif', n_workers=4, n_threads=2, n_chunks=16) >>> >>> # Compress the output and build overviews >>> with gw.open('input.tif') as ds: >>> gw.to_raster(ds, 'output.tif', n_jobs=8, overviews=True, compress='lzw') """ if MKL_LIB: __ = MKL_LIB.MKL_Set_Num_Threads(n_threads) pfile = Path(filename) if scheduler.lower() == 'mpool': pool_executor = multi.Pool else: pool_executor = concurrent.futures.ProcessPoolExecutor if scheduler.lower( ) == 'processes' else concurrent.futures.ThreadPoolExecutor if overwrite: if pfile.is_file(): pfile.unlink() if pfile.is_file(): logger.warning(' The output file already exists.') return if not is_dask_collection(data.data): logger.exception(' The data should be a dask array.') if use_client: if address: cluster_object = _cluster_dummy else: cluster_object = LocalCluster client_object = Client else: cluster_object = _cluster_dummy client_object = _client_dummy if isinstance(n_workers, int) and isinstance(n_threads, int): n_jobs = n_workers * n_threads else: n_workers = n_jobs n_threads = 1 mem_per_core = int(total_memory / n_workers) if not isinstance(n_chunks, int): n_chunks = n_workers * 50 if not isinstance(readxsize, int): readxsize = data.gw.col_chunks if not isinstance(readysize, int): readysize = data.gw.row_chunks chunksize = (data.gw.row_chunks, data.gw.col_chunks) # Force tiled outputs with no file sharing kwargs['sharing'] = False if data.gw.tiled: kwargs['tiled'] = True if 'compress' in kwargs: # Store the compression type because # it is removed in concurrent writing compress = True compress_type = kwargs['compress'] del kwargs['compress'] elif isinstance(data.gw.compress, str) and data.gw.compress.lower() in ['lzw', 'deflate']: compress = True compress_type = data.gw.compress else: compress = False if 'nodata' not in kwargs: if isinstance(data.gw.nodata, int) or isinstance( data.gw.nodata, float): kwargs['nodata'] = data.gw.nodata if 'blockxsize' not in kwargs: kwargs['blockxsize'] = data.gw.col_chunks if 'blockysize' not in kwargs: kwargs['blockysize'] = data.gw.row_chunks if 'bigtiff' not in kwargs: kwargs['bigtiff'] = data.gw.bigtiff if 'driver' not in kwargs: kwargs['driver'] = data.gw.driver if 'count' not in kwargs: kwargs['count'] = data.gw.nbands if 'width' not in kwargs: kwargs['width'] = data.gw.ncols if 'height' not in kwargs: kwargs['height'] = data.gw.nrows if separate: d_name = pfile.parent sub_dir = d_name.joinpath('sub_tmp_') zarr_file = sub_dir.joinpath('data.zarr').as_posix() sub_dir.mkdir(parents=True, exist_ok=True) root = zarr.open(zarr_file, mode='w') else: root = None if verbose > 0: logger.info(' Creating the file ...\n') with rio.open(filename, mode='w', **kwargs) as rio_dst: pass if verbose > 0: logger.info(' Writing data to file ...\n') with rio.Env(GDAL_CACHEMAX=gdal_cache): if not use_dask_store: windows = get_window_offsets(data.gw.nrows, data.gw.ncols, readysize, readxsize, return_as='list') n_windows = len(windows) # Iterate over the windows in chunks for wchunk in range(0, n_windows, n_chunks): window_slice = windows[wchunk:wchunk + n_chunks] n_windows_slice = 
len(window_slice) if verbose > 0: logger.info(' Windows {:,d}--{:,d} of {:,d} ...'.format( wchunk + 1, wchunk + n_windows_slice, n_windows)) if len(data.shape) == 2: data_gen = ((data[w.row_off:w.row_off + w.height, w.col_off:w.col_off + w.width], filename, w, n_threads, separate, chunksize, root) for w in window_slice) elif len(data.shape) == 3: data_gen = ((data[:, w.row_off:w.row_off + w.height, w.col_off:w.col_off + w.width], filename, w, n_threads, separate, chunksize, root) for w in window_slice) else: data_gen = ((data[:, :, w.row_off:w.row_off + w.height, w.col_off:w.col_off + w.width], filename, w, n_threads, separate, chunksize, root) for w in window_slice) with pool_executor(n_workers) as executor: if scheduler == 'mpool': for zarr_file in tqdm(executor.imap_unordered( _write_xarray, data_gen), total=n_windows_slice): pass else: for zarr_file in tqdm(executor.map( _write_xarray, data_gen), total=n_windows_slice): pass # if overviews: # # if not isinstance(overviews, list): # overviews = [2, 4, 8, 16] # # if resampling not in ['average', 'bilinear', 'cubic', 'cubic_spline', # 'gauss', 'lanczos', 'max', 'med', 'min', 'mode', 'nearest']: # # logger.warning(" The resampling method is not supported by rasterio. Setting to 'nearest'") # # resampling = 'nearest' # # if verbose > 0: # logger.info(' Building pyramid overviews ...') # # rio_dst.build_overviews(overviews, getattr(Resampling, resampling)) # rio_dst.update_tags(ns='overviews', resampling=resampling) else: with cluster_object( n_workers=n_workers, threads_per_worker=n_threads, scheduler_port=0, processes=False, memory_limit='{:d}GB'.format(mem_per_core)) as cluster: cluster_address = address if address else cluster with client_object(address=cluster_address) as client: with WriteDaskArray(filename, overwrite=overwrite, separate=separate, out_block_type=out_block_type, keep_blocks=keep_blocks, gdal_cache=gdal_cache, **kwargs) as dst: # Store the data and return a lazy evaluator res = da.store(da.squeeze(data.data), dst, lock=False, compute=False) if verbose > 0: logger.info(' Writing data to file ...') # Send the data to file # # *Note that the progress bar will # not work with a client. 
if use_client: res.compute(num_workers=n_jobs) else: with ProgressBar(): res.compute(num_workers=n_jobs) if verbose > 0: logger.info(' Finished writing data to file.') out_block_type = dst.out_block_type keep_blocks = dst.keep_blocks zarr_file = dst.zarr_file sub_dir = dst.sub_dir if compress: if verbose > 0: logger.info(' Compressing output file ...') if separate: group_keys = list(root.group_keys()) n_groups = len(group_keys) if out_block_type.lower() == 'zarr': # root = zarr.open(zarr_file, mode='r') open_file = zarr_file else: outfiles = sorted( fnmatch.filter(os.listdir(sub_dir), '*.tif')) outfiles = [os.path.join(sub_dir, fn) for fn in outfiles] # data_gen = ((fn, None, 'gtiff') for fn in outfiles) kwargs['compress'] = compress_type n_windows = len(group_keys) # Compress into one file with rio.open(filename, mode='w', **kwargs) as dst_: # Iterate over the windows in chunks for wchunk in range(0, n_groups, n_chunks): group_keys_slice = group_keys[wchunk:wchunk + n_chunks] n_windows_slice = len(group_keys_slice) if verbose > 0: logger.info( ' Windows {:,d}--{:,d} of {:,d} ...'.format( wchunk + 1, wchunk + n_windows_slice, n_windows)) ################################################ data_gen = ((open_file, group, 'zarr') for group in group_keys_slice) # for f in tqdm(executor.map(_compressor, data_gen), total=n_windows_slice): # pass # # futures = [executor.submit(_compress_dummy, iter_[0], iter_[1], None) for iter_ in data_gen] # # for f in tqdm(concurrent.futures.as_completed(futures), total=n_windows_slice): # # out_window, out_block = f.result() # # dst_.write(np.squeeze(out_block), # window=out_window, # indexes=out_indexes_) ################################################ # data_gen = ((root, group, 'zarr') for group in group_keys_slice) # for f, g, t in tqdm(data_gen, total=n_windows_slice): # # out_window, out_indexes, out_block = _block_read_func(f, g, t) # executor.map(_block_write_func, data_gen) with concurrent.futures.ProcessPoolExecutor( max_workers=n_workers) as executor: # Submit all of the tasks as futures futures = [ executor.submit(_block_read_func, f, g, t) for f, g, t in data_gen ] for f in tqdm( concurrent.futures.as_completed(futures), total=n_windows_slice): out_window, out_indexes, out_block = f.result() dst_.write(out_block, window=out_window, indexes=out_indexes) futures = None if not keep_blocks: shutil.rmtree(sub_dir) else: p = Path(filename) d_name = p.parent f_base, f_ext = os.path.splitext(p.name) ld = string.ascii_letters + string.digits rstr = ''.join(random.choice(ld) for i in range(0, 9)) temp_file = d_name.joinpath('{}_temp_{}{}'.format( f_base, rstr, f_ext)) compress_raster(filename, temp_file.as_posix(), n_jobs=n_jobs, gdal_cache=gdal_cache, compress=compress_type) temp_file.rename(filename) if verbose > 0: logger.info(' Finished compressing') if verbose > 0: logger.info('\nFinished writing the data.')
def _num_samples(X):
    result = sk_validation._num_samples(X)
    if dask.is_dask_collection(result):
        # dask dataframe
        result = result.compute()
    return result
def fit(self, X, y=None):
    if not dask.is_dask_collection(X):
        raise TypeError(_TYPE_MSG.format(type(X)))

    self._fit(X)
    return self
def _train( client, params, data, labels, dmatrix_kwargs={}, evals_result=None, **kwargs ): """ Asynchronous version of train See Also -------- train """ # Break apart Dask.array/dataframe into chunks/parts data_parts = data.to_delayed() label_parts = labels.to_delayed() if isinstance(data_parts, np.ndarray): assert data_parts.shape[1] == 1 data_parts = data_parts.flatten().tolist() if isinstance(label_parts, np.ndarray): assert label_parts.ndim == 1 or label_parts.shape[1] == 1 label_parts = label_parts.flatten().tolist() # Arrange parts into pairs. This enforces co-locality parts = list(map(delayed, zip(data_parts, label_parts))) parts = client.compute(parts) # Start computation in the background yield wait(parts) for part in parts: if part.status == "error": yield part # trigger error locally if kwargs.get("eval_set"): if any( is_dask_collection(e) for evals in kwargs.get("eval_set") for e in evals ): raise TypeError( "Evaluation set must not contain dask collections." ) # Because XGBoost-python doesn't yet allow iterative training, we need to # find the locations of all chunks and map them to particular Dask workers key_to_part_dict = dict([(part.key, part) for part in parts]) who_has = yield client.scheduler.who_has( keys=[part.key for part in parts] ) worker_map = defaultdict(list) for key, workers in who_has.items(): worker_map[first(workers)].append(key_to_part_dict[key]) ncores = yield client.scheduler.ncores() # Number of cores per worker # Start the XGBoost tracker on the Dask scheduler env = yield client._run_on_scheduler(start_tracker, None, len(worker_map)) # Tell each worker to train on the chunks/parts that it has locally futures = [ client.submit( train_part, env, assoc(params, "nthread", ncores[worker]), list_of_parts, workers=worker, dmatrix_kwargs=dmatrix_kwargs, **kwargs ) for worker, list_of_parts in worker_map.items() ] # Get the results, only one will be non-None results = yield client._gather(futures) result, _evals_result = [v for v in results if v.count(None) != len(v)][0] if evals_result is not None: evals_result.update(_evals_result) num_class = params.get("num_class") if num_class: result.set_attr(num_class=str(num_class)) raise gen.Return(result)
def __dask_layers__(self):
    return sum(
        [
            v.__dask_layers__()
            for v in self._data_vars.values()
            if dask.is_dask_collection(v)
        ],
        (),
    )
def _load_into_memory(res):
    """Compute if res is lazy data."""
    if dask.is_dask_collection(res):
        res = res.compute()
    return res
def test_df_inverse_transform(self):
    mask = ["3", "4"]
    a = dpp.MinMaxScaler(columns=mask)
    result = a.inverse_transform(a.fit_transform(df2))
    assert dask.is_dask_collection(result)
    assert_eq_df(result, df2)
def xr_reproject_array(
    src: xr.DataArray,
    geobox: GeoBox,
    resampling: str = "nearest",
    chunks: Optional[Tuple[int, int]] = None,
    dst_nodata: Optional[NodataType] = None,
) -> xr.DataArray:
    """
    Reproject DataArray to a given GeoBox

    :param src       : Input src[(time,) y, x (, band)]
    :param geobox    : GeoBox of the destination
    :param resampling: Resampling strategy as a string: nearest, bilinear, average, mode ...
    :param chunks    : In Y,X dimensions only, default is to use input chunk size
    :param dst_nodata: nodata marker for dst image (default is to use src.nodata)
    """
    src_nodata = getattr(src, "nodata", None)
    if dst_nodata is None:
        dst_nodata = src_nodata

    src_geobox = src.geobox
    assert src_geobox is not None

    yx_dims = spatial_dims(src)
    axis = tuple(src.dims).index(yx_dims[0])

    src_dims = tuple(src.dims)
    dst_dims = src_dims[:axis] + geobox.dims + src_dims[axis + 2:]

    coords = geobox.xr_coords(with_crs=True)

    # copy non-spatial coords from src to dst
    src_non_spatial_dims = src_dims[:axis] + src_dims[axis + 2:]
    for dim in src_non_spatial_dims:
        if dim not in coords:
            coords[dim] = src.coords[dim]

    attrs = {}
    if dst_nodata is not None:
        attrs["nodata"] = dst_nodata

    if is_dask_collection(src):
        data = dask_reproject(
            src.data,
            src_geobox,
            geobox,
            resampling=resampling,
            chunks=chunks,
            src_nodata=src_nodata,
            dst_nodata=dst_nodata,
            axis=axis,
        )
    else:
        data = _reproject_block_impl(
            src.data,
            src_geobox,
            geobox,
            resampling=resampling,
            src_nodata=src_nodata,
            dst_nodata=dst_nodata,
            axis=axis,
        )

    return xr.DataArray(data, name=src.name, coords=coords, dims=dst_dims, attrs=attrs)
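# Hedged usage sketch for xr_reproject_array: the GeoBox construction below assumes
# datacube-style GeoBox helpers (e.g. zoom_out) are available alongside this function,
# which is not shown in the snippet itself.
# dst_geobox = src.geobox.zoom_out(2)             # a coarser grid covering the same extent
# dst = xr_reproject_array(src, dst_geobox, resampling="average", chunks=(512, 512))
# dst = dst.compute()                             # only needed when `src` was dask-backed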
def __dask_keys__(self):
    return [
        v.__dask_keys__()
        for v in self._data_vars.values()
        if dask.is_dask_collection(v)
    ]
def xr_phenology( da, stats=[ "SOS", "POS", "EOS", "Trough", "vSOS", "vPOS", "vEOS", "LOS", "AOS", "ROG", "ROS", ], method_sos="median", method_eos="median", complete='fast_complete', smoothing=None, show_progress=True, ): """ Obtain land surface phenology metrics from an xarray.DataArray containing a timeseries of a vegetation index like NDVI. last modified June 2020 Parameters ---------- da : xarray.DataArray DataArray should contain a 2D or 3D time series of a vegetation index like NDVI, EVI stats : list list of phenological statistics to return. Regardless of the metrics returned, all statistics are calculated due to inter-dependencies between metrics. Options include: SOS = DOY of start of season POS = DOY of peak of season EOS = DOY of end of season vSOS = Value at start of season vPOS = Value at peak of season vEOS = Value at end of season Trough = Minimum value of season LOS = Length of season (DOY) AOS = Amplitude of season (in value units) ROG = Rate of greening ROS = Rate of senescence method_sos : str If 'first' then vSOS is estimated as the first positive slope on the greening side of the curve. If 'median', then vSOS is estimated as the median value of the postive slopes on the greening side of the curve. method_eos : str If 'last' then vEOS is estimated as the last negative slope on the senescing side of the curve. If 'median', then vEOS is estimated as the 'median' value of the negative slopes on the senescing side of the curve. complete : str If 'fast_complete', the timeseries will be completed (gap filled) using fast_completion(), if 'linear', time series with be completed using da.interpolate_na(method='linear') smoothing : str If 'wiener', the timeseries will be smoothed using the scipy.signal.wiener filter with a window size of 3. If 'rolling_mean', then timeseries is smoothed using a rolling mean with a window size of 3. If set to 'linear', will be smoothed using da.resample(time='1W').interpolate('linear') Outputs ------- xarray.Dataset containing variables for the selected phenology statistics """ # Check inputs before running calculations if dask.is_dask_collection(da): if version.parse(xr.__version__) < version.parse('0.16.0'): raise TypeError( "Dask arrays are not currently supported by this function, " + "run da.compute() before passing dataArray.") stats_dtype = { "SOS": np.int16, "POS": np.int16, "EOS": np.int16, "Trough": np.float32, "vSOS": np.float32, "vPOS": np.float32, "vEOS": np.float32, "LOS": np.int16, "AOS": np.float32, "ROG": np.float32, "ROS": np.float32, } da_template = da.isel(time=0).drop('time') template = xr.Dataset({ var_name: da_template.astype(var_dtype) for var_name, var_dtype in stats_dtype.items() if var_name in stats }) da_all_time = da.chunk({'time': -1}) lazy_phenology = da_all_time.map_blocks(xr_phenology, kwargs=dict( stats=stats, method_sos=method_sos, method_eos=method_eos, complete=complete, smoothing=smoothing, ), template=xr.Dataset(template)) try: crs = da.geobox.crs lazy_phenology = assign_crs(lazy_phenology, str(crs)) except: pass return lazy_phenology if method_sos not in ("median", "first"): raise ValueError("method_sos should be either 'median' or 'first'") if method_eos not in ("median", "last"): raise ValueError("method_eos should be either 'median' or 'last'") # If stats supplied is not a list, convert to list. 
stats = stats if isinstance(stats, list) else [stats] #try to grab the crs info try: crs = da.geobox.crs except: pass # complete timeseries if complete is not None: if complete == 'fast_complete': if len(da.shape) == 1: print( "fast_complete does not operate on 1D timeseries, using 'linear' instead" ) da = da.interpolate_na(dim='time', method='linear') else: print("Completing using fast_complete...") da = fast_completion(da) if complete == 'linear': print("Completing using linear interp...") da = da.interpolate_na(dim='time', method='linear') if smoothing is not None: if smoothing == "wiener": if len(da.shape) == 1: print( "wiener method does not operate on 1D timeseries, using 'rolling_mean' instead" ) da = da.rolling(time=3, min_periods=1).mean() else: print(" Smoothing with wiener filter...") da = smooth(da) if smoothing == "rolling_mean": print(" Smoothing with rolling mean...") da = da.rolling(time=3, min_periods=1).mean() if smoothing == 'linear': print(" Smoothing using linear interpolation...") da = da.resample(time='1W').interpolate('linear') # remove any remaining all-NaN pixels mask = da.isnull().all("time") da = da.where(~mask, other=0) # calculate the statistics print(" Phenology...") vpos = _vpos(da) pos = _pos(da) trough = _trough(da) aos = _aos(vpos, trough) vsos = _vsos(da, pos, method_sos=method_sos) sos = _sos(vsos) veos = _veos(da, pos, method_eos=method_eos) eos = _eos(veos) los = _los(da, eos, sos) rog = _rog(vpos, vsos, pos, sos) ros = _ros(veos, vpos, eos, pos) # Dictionary containing the statistics stats_dict = { "SOS": sos.astype(np.int16), "EOS": eos.astype(np.int16), "vSOS": vsos.astype(np.float32), "vPOS": vpos.astype(np.float32), "Trough": trough.astype(np.float32), "POS": pos.astype(np.int16), "vEOS": veos.astype(np.float32), "LOS": los.astype(np.int16), "AOS": aos.astype(np.float32), "ROG": rog.astype(np.float32), "ROS": ros.astype(np.float32), } # intialise dataset with first statistic ds = stats_dict[stats[0]].to_dataset(name=stats[0]) # add the other stats to the dataset for stat in stats[1:]: print(" " + stat) stats_keep = stats_dict.get(stat) ds[stat] = stats_dict[stat] try: ds = assign_crs(ds, str(crs)) except: pass return ds.drop('time')
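# Hedged usage sketch for xr_phenology: `ndvi` is a hypothetical (time, y, x) DataArray
# of a vegetation index, as described in the docstring above.
# pheno = xr_phenology(
#     ndvi,
#     stats=["SOS", "POS", "EOS", "LOS"],
#     method_sos="median",
#     method_eos="median",
#     complete="linear",
#     smoothing="rolling_mean",
# )
# pheno.SOS.plot()                                # DOY of start of season per pixel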
def return_inits_and_verif_dates( forecast: xr.Dataset, verif: xr.Dataset, alignment: str, reference: Optional[Union[str, List[str]]] = None, hist: Optional[xr.Dataset] = None, ) -> returnType: """Return initializations and verification dates per a given alignment strategy. Args: forecast (``xarray`` object): Prediction ensemble with ``init`` dim renamed to ``time`` and containing ``lead`` dim. verif (``xarray`` object): Verification data with ``time`` dim. alignment (str): Strategy for initialization-verification alignment. * 'same_inits': Use a common set of initializations that verify across all leads. This ensures that there is no bias in the result due to the state of the system for the given initializations. * 'same_verifs': Use a common verification window across all leads. This ensures that there is no bias in the result due to the observational period being verified against. * 'maximize': Use all available initializations at each lead that verify against the observations provided. This changes both the set of initializations and the verification window used at each lead. Return: inits (dict): Keys are the lead time integer, values are an ``xr.DataArray`` of initialization dates. verif_dates (dict): Keys are the lead time integer, values are an ``xr.CFTimeIndex`` of verification dates. """ if isinstance(reference, str): reference = [reference] elif reference is None: reference = [] is_in_list(alignment, VALID_ALIGNMENTS, "alignment") units = forecast["lead"].attrs["units"] leads = forecast["lead"].values # `init` renamed to `time` in compute functions. all_inits = forecast["time"] all_verifs = verif["time"] # If aligning reference='uninitialized', need to account for potential differences # in its temporal coverage. Note that the reference='uninitialized' only aligns # verification dates and doesn't need to care about inits. if hist is not None: all_verifs = np.sort(list(set(all_verifs.data) & set(hist["time"].data))) all_verifs = xr.DataArray(all_verifs, dims=["time"], coords=[all_verifs]) # Construct list of `n` offset over all leads. n, freq = get_multiple_lead_cftime_shift_args(units, leads) if "valid_time" not in forecast.coords: # old: create init_lead_matrix init_lead_matrix = _construct_init_lead_matrix(forecast, n, freq, leads) else: # new: use valid_time(init, lead) init_lead_matrix = forecast["valid_time"].drop_vars("valid_time").rename(None) if dask.is_dask_collection(init_lead_matrix): init_lead_matrix = init_lead_matrix.compute() # A union between `inits` and observations in the verification data is required # for persistence, since the persistence forecast is based off a common set of # initializations. if "persistence" in reference: union_with_verifs = _isin(all_inits, all_verifs) init_lead_matrix = init_lead_matrix.where(union_with_verifs, drop=True) valid_inits = init_lead_matrix["time"] if "same_init" in alignment: return _same_inits_alignment( init_lead_matrix, valid_inits, all_verifs, leads, n, freq ) elif "same_verif" in alignment: return _same_verifs_alignment( init_lead_matrix, valid_inits, all_verifs, leads, n, freq ) elif alignment == "maximize": return _maximize_alignment(init_lead_matrix, all_verifs, leads) else: raise ValueError
def temporal_statistics(da, stats): """ Obtain generic temporal statistics using the hdstats temporal library: https://github.com/daleroberts/hdstats/blob/master/hdstats/ts.pyx last modified June 2020 Parameters ---------- da : xarray.DataArray DataArray should contain a 3D time series. stats : list list of temporal statistics to calculate. Options include: 'discordance' = 'f_std' = std of discrete fourier transform coefficients, returns three layers: f_std_n1, f_std_n2, f_std_n3 'f_mean' = mean of discrete fourier transform coefficients, returns three layers: f_mean_n1, f_mean_n2, f_mean_n3 'f_median' = median of discrete fourier transform coefficients, returns three layers: f_median_n1, f_median_n2, f_median_n3 'mean_change' = mean of discrete difference along time dimension 'median_change' = median of discrete difference along time dimension 'abs_change' = mean of absolute discrete difference along time dimension 'complexity' = 'central_diff' = 'num_peaks' : The number of peaks in the timeseries, defined with a local window of size 10. NOTE: This statistic is very slow Outputs ------- xarray.Dataset containing variables for the selected temporal statistics """ # if dask arrays then map the blocks if dask.is_dask_collection(da): if version.parse(xr.__version__) < version.parse("0.16.0"): raise TypeError( "Dask arrays are only supported by this function if using, " + "xarray v0.16, run da.compute() before passing dataArray.") # create a template that matches the final datasets dims & vars arr = da.isel(time=0).drop("time") # deal with the case where fourier is first in the list if stats[0] in ("f_std", "f_median", "f_mean"): template = xr.zeros_like(arr).to_dataset(name=stats[0] + "_n1") template[stats[0] + "_n2"] = xr.zeros_like(arr) template[stats[0] + "_n3"] = xr.zeros_like(arr) for stat in stats[1:]: if stat in ("f_std", "f_median", "f_mean"): template[stat + "_n1"] = xr.zeros_like(arr) template[stat + "_n2"] = xr.zeros_like(arr) template[stat + "_n3"] = xr.zeros_like(arr) else: template[stat] = xr.zeros_like(arr) else: template = xr.zeros_like(arr).to_dataset(name=stats[0]) for stat in stats: if stat in ("f_std", "f_median", "f_mean"): template[stat + "_n1"] = xr.zeros_like(arr) template[stat + "_n2"] = xr.zeros_like(arr) template[stat + "_n3"] = xr.zeros_like(arr) else: template[stat] = xr.zeros_like(arr) try: template = template.drop('spatial_ref') except: pass # ensure the time chunk is set to -1 da_all_time = da.chunk({"time": -1}) # apply function across chunks lazy_ds = da_all_time.map_blocks(temporal_statistics, kwargs={"stats": stats}, template=template) try: crs = da.geobox.crs lazy_ds = assign_crs(lazy_ds, str(crs)) except: pass return lazy_ds # If stats supplied is not a list, convert to list. 
stats = stats if isinstance(stats, list) else [stats] # grab all the attributes of the xarray x, y, time, attrs = da.x, da.y, da.time, da.attrs # deal with any all-NaN pixels by filling with 0's mask = da.isnull().all("time") da = da.where(~mask, other=0) # complete timeseries print("Completing...") da = fast_completion(da) # ensure dim order is correct for functions da = da.transpose("y", "x", "time").values stats_dict = { "discordance": lambda da: hdstats.discordance(da, n=10), "f_std": lambda da: hdstats.fourier_std(da, n=3, step=5), "f_mean": lambda da: hdstats.fourier_mean(da, n=3, step=5), "f_median": lambda da: hdstats.fourier_median(da, n=3, step=5), "mean_change": lambda da: hdstats.mean_change(da), "median_change": lambda da: hdstats.median_change(da), "abs_change": lambda da: hdstats.mean_abs_change(da), "complexity": lambda da: hdstats.complexity(da), "central_diff": lambda da: hdstats.mean_central_diff(da), "num_peaks": lambda da: hdstats.number_peaks(da, 10), } print(" Statistics:") # if one of the fourier functions is first (or only) # stat in the list then we need to deal with this if stats[0] in ("f_std", "f_median", "f_mean"): print(" " + stats[0]) stat_func = stats_dict.get(str(stats[0])) zz = stat_func(da) n1 = zz[:, :, 0] n2 = zz[:, :, 1] n3 = zz[:, :, 2] # intialise dataset with first statistic ds = xr.DataArray(n1, attrs=attrs, coords={ "x": x, "y": y }, dims=["y", "x"]).to_dataset(name=stats[0] + "_n1") # add other datasets for i, j in zip([n2, n3], ["n2", "n3"]): ds[stats[0] + "_" + j] = xr.DataArray(i, attrs=attrs, coords={ "x": x, "y": y }, dims=["y", "x"]) else: # simpler if first function isn't fourier transform first_func = stats_dict.get(str(stats[0])) print(" " + stats[0]) ds = first_func(da) # convert back to xarray dataset ds = xr.DataArray(ds, attrs=attrs, coords={ "x": x, "y": y }, dims=["y", "x"]).to_dataset(name=stats[0]) # loop through the other functions for stat in stats[1:]: print(" " + stat) # handle the fourier transform examples if stat in ("f_std", "f_median", "f_mean"): stat_func = stats_dict.get(str(stat)) zz = stat_func(da) n1 = zz[:, :, 0] n2 = zz[:, :, 1] n3 = zz[:, :, 2] for i, j in zip([n1, n2, n3], ["n1", "n2", "n3"]): ds[stat + "_" + j] = xr.DataArray(i, attrs=attrs, coords={ "x": x, "y": y }, dims=["y", "x"]) else: # Select a stats function from the dictionary # and add to the dataset stat_func = stats_dict.get(str(stat)) ds[stat] = xr.DataArray(stat_func(da), attrs=attrs, coords={ "x": x, "y": y }, dims=["y", "x"]) # try to add back the geobox try: crs = da.geobox.crs ds = assign_crs(ds, str(crs)) except: pass return ds
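# Hedged usage sketch for temporal_statistics: `ndvi` again stands in for a (time, y, x)
# DataArray; each fourier statistic expands into three *_n1/_n2/_n3 layers as described
# in the docstring, e.g.
# ts = temporal_statistics(ndvi, ["f_mean", "mean_change", "abs_change"])
# list(ts.data_vars)
# # ['f_mean_n1', 'f_mean_n2', 'f_mean_n3', 'mean_change', 'abs_change']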
def test_inverse_transform(self):
    a = dpp.StandardScaler()
    result = a.inverse_transform(a.fit_transform(X))
    assert dask.is_dask_collection(result)
    assert_eq_ar(result, X)
def map_blocks( func: Callable[..., T_DSorDA], obj: Union[DataArray, Dataset], args: Sequence[Any] = (), kwargs: Mapping[str, Any] = None, ) -> T_DSorDA: """Apply a function to each chunk of a DataArray or Dataset. This function is experimental and its signature may change. Parameters ---------- func: callable User-provided function that accepts a DataArray or Dataset as its first parameter. The function will receive a subset of 'obj' (see below), corresponding to one chunk along each chunked dimension. ``func`` will be executed as ``func(obj_subset, *args, **kwargs)``. The function will be first run on mocked-up data, that looks like 'obj' but has sizes 0, to determine properties of the returned object such as dtype, variable names, new dimensions and new indexes (if any). This function must return either a single DataArray or a single Dataset. This function cannot change size of existing dimensions, or add new chunked dimensions. obj: DataArray, Dataset Passed to the function as its first argument, one dask chunk at a time. args: Sequence Passed verbatim to func after unpacking, after the sliced obj. xarray objects, if any, will not be split by chunks. Passing dask collections is not allowed. kwargs: Mapping Passed verbatim to func after unpacking. xarray objects, if any, will not be split by chunks. Passing dask collections is not allowed. Returns ------- A single DataArray or Dataset with dask backend, reassembled from the outputs of the function. Notes ----- This function is designed for when one needs to manipulate a whole xarray object within each chunk. In the more common case where one can work on numpy arrays, it is recommended to use apply_ufunc. If none of the variables in obj is backed by dask, calling this function is equivalent to calling ``func(obj, *args, **kwargs)``. See Also -------- dask.array.map_blocks, xarray.apply_ufunc, xarray.Dataset.map_blocks, xarray.DataArray.map_blocks Examples -------- Calculate an anomaly from climatology using ``.groupby()``. Using ``xr.map_blocks()`` allows for parallel operations with knowledge of ``xarray``, its indices, and its methods like ``.groupby()``. >>> def calculate_anomaly(da, groupby_type="time.month"): ... # Necessary workaround to xarray's check with zero dimensions ... # https://github.com/pydata/xarray/issues/3575 ... if sum(da.shape) == 0: ... return da ... gb = da.groupby(groupby_type) ... clim = gb.mean(dim="time") ... return gb - clim >>> time = xr.cftime_range("1990-01", "1992-01", freq="M") >>> np.random.seed(123) >>> array = xr.DataArray( ... np.random.rand(len(time)), dims="time", coords=[time] ... ).chunk() >>> xr.map_blocks(calculate_anomaly, array).compute() <xarray.DataArray (time: 24)> array([ 0.12894847, 0.11323072, -0.0855964 , -0.09334032, 0.26848862, 0.12382735, 0.22460641, 0.07650108, -0.07673453, -0.22865714, -0.19063865, 0.0590131 , -0.12894847, -0.11323072, 0.0855964 , 0.09334032, -0.26848862, -0.12382735, -0.22460641, -0.07650108, 0.07673453, 0.22865714, 0.19063865, -0.0590131 ]) Coordinates: * time (time) object 1990-01-31 00:00:00 ... 1991-12-31 00:00:00 Note that one must explicitly use ``args=[]`` and ``kwargs={}`` to pass arguments to the function being applied in ``xr.map_blocks()``: >>> xr.map_blocks( ... calculate_anomaly, array, kwargs={"groupby_type": "time.year"}, ... 
) <xarray.DataArray (time: 24)> array([ 0.15361741, -0.25671244, -0.31600032, 0.008463 , 0.1766172 , -0.11974531, 0.43791243, 0.14197797, -0.06191987, -0.15073425, -0.19967375, 0.18619794, -0.05100474, -0.42989909, -0.09153273, 0.24841842, -0.30708526, -0.31412523, 0.04197439, 0.0422506 , 0.14482397, 0.35985481, 0.23487834, 0.12144652]) Coordinates: * time (time) object 1990-01-31 00:00:00 ... 1991-12-31 00:00:00 """ def _wrapper(func, obj, to_array, args, kwargs): if to_array: obj = dataset_to_dataarray(obj) result = func(obj, *args, **kwargs) for name, index in result.indexes.items(): if name in obj.indexes: if len(index) != len(obj.indexes[name]): raise ValueError( "Length of the %r dimension has changed. This is not allowed." % name ) return make_dict(result) if not isinstance(args, Sequence): raise TypeError("args must be a sequence (for example, a list or tuple).") if kwargs is None: kwargs = {} elif not isinstance(kwargs, Mapping): raise TypeError("kwargs must be a mapping (for example, a dict)") for value in list(args) + list(kwargs.values()): if dask.is_dask_collection(value): raise TypeError( "Cannot pass dask collections in args or kwargs yet. Please compute or " "load values before passing to map_blocks." ) if not dask.is_dask_collection(obj): return func(obj, *args, **kwargs) if isinstance(obj, DataArray): # only using _to_temp_dataset would break # func = lambda x: x.to_dataset() # since that relies on preserving name. if obj.name is None: dataset = obj._to_temp_dataset() else: dataset = obj.to_dataset() input_is_array = True else: dataset = obj input_is_array = False input_chunks = dataset.chunks template: Union[DataArray, Dataset] = infer_template(func, obj, *args, **kwargs) if isinstance(template, DataArray): result_is_array = True template_name = template.name template = template._to_temp_dataset() elif isinstance(template, Dataset): result_is_array = False else: raise TypeError( f"func output must be DataArray or Dataset; got {type(template)}" ) template_indexes = set(template.indexes) dataset_indexes = set(dataset.indexes) preserved_indexes = template_indexes & dataset_indexes new_indexes = template_indexes - dataset_indexes indexes = {dim: dataset.indexes[dim] for dim in preserved_indexes} indexes.update({k: template.indexes[k] for k in new_indexes}) # We're building a new HighLevelGraph hlg. We'll have one new layer # for each variable in the dataset, which is the result of the # func applied to the values. graph: Dict[Any, Any] = {} new_layers: DefaultDict[str, Dict[Any, Any]] = collections.defaultdict(dict) gname = "{}-{}".format( dask.utils.funcname(func), dask.base.tokenize(dataset, args, kwargs) ) # map dims to list of chunk indexes ichunk = {dim: range(len(chunks_v)) for dim, chunks_v in input_chunks.items()} # mapping from chunk index to slice bounds chunk_index_bounds = { dim: np.cumsum((0,) + chunks_v) for dim, chunks_v in input_chunks.items() } # iterate over all possible chunk combinations for v in itertools.product(*ichunk.values()): chunk_index_dict = dict(zip(dataset.dims, v)) # this will become [[name1, variable1], # [name2, variable2], # ...] 
# which is passed to dict and then to Dataset data_vars = [] coords = [] for name, variable in dataset.variables.items(): # make a task that creates tuple of (dims, chunk) if dask.is_dask_collection(variable.data): # recursively index into dask_keys nested list to get chunk chunk = variable.__dask_keys__() for dim in variable.dims: chunk = chunk[chunk_index_dict[dim]] chunk_variable_task = (f"{gname}-{chunk[0]}",) + v graph[chunk_variable_task] = ( tuple, [variable.dims, chunk, variable.attrs], ) else: # non-dask array with possibly chunked dimensions # index into variable appropriately subsetter = {} for dim in variable.dims: if dim in chunk_index_dict: which_chunk = chunk_index_dict[dim] subsetter[dim] = slice( chunk_index_bounds[dim][which_chunk], chunk_index_bounds[dim][which_chunk + 1], ) subset = variable.isel(subsetter) chunk_variable_task = ( "{}-{}".format(gname, dask.base.tokenize(subset)), ) + v graph[chunk_variable_task] = ( tuple, [subset.dims, subset, subset.attrs], ) # this task creates dict mapping variable name to above tuple if name in dataset._coord_names: coords.append([name, chunk_variable_task]) else: data_vars.append([name, chunk_variable_task]) from_wrapper = (gname,) + v graph[from_wrapper] = ( _wrapper, func, (Dataset, (dict, data_vars), (dict, coords), dataset.attrs), input_is_array, args, kwargs, ) # mapping from variable name to dask graph key var_key_map: Dict[Hashable, str] = {} for name, variable in template.variables.items(): if name in indexes: continue gname_l = f"{gname}-{name}" var_key_map[name] = gname_l key: Tuple[Any, ...] = (gname_l,) for dim in variable.dims: if dim in chunk_index_dict: key += (chunk_index_dict[dim],) else: # unchunked dimensions in the input have one chunk in the result key += (0,) # We're adding multiple new layers to the graph: # The first new layer is the result of the computation on # the array. # Then we add one layer per variable, which extracts the # result for that variable, and depends on just the first new # layer. new_layers[gname_l][key] = (operator.getitem, from_wrapper, name) hlg = HighLevelGraph.from_collections(gname, graph, dependencies=[dataset]) for gname_l, layer in new_layers.items(): # This adds in the getitems for each variable in the dataset. hlg.dependencies[gname_l] = {gname} hlg.layers[gname_l] = layer result = Dataset(coords=indexes, attrs=template.attrs) for name, gname_l in var_key_map.items(): dims = template[name].dims var_chunks = [] for dim in dims: if dim in input_chunks: var_chunks.append(input_chunks[dim]) elif dim in indexes: var_chunks.append((len(indexes[dim]),)) elif dim in template.dims: # new unindexed dimension var_chunks.append((template.sizes[dim],)) data = dask.array.Array( hlg, name=gname_l, chunks=var_chunks, dtype=template[name].dtype ) result[name] = (dims, data, template[name].attrs) result = result.set_coords(template._coord_names) if result_is_array: da = dataset_to_dataarray(result) da.name = template_name return da # type: ignore return result # type: ignore
def fit(self, X, y=None):
    if not dask.is_dask_collection(X):
        raise TypeError(_TYPE_MSG.format(type(X)))

    self._fit(X)
    self.n_features_in_ = X.shape[1]
    return self
def transform(self, X):
    """Transform a sequence of documents to a document-term matrix.

    Transformation is done in parallel, and correctly handles dask
    collections.

    Parameters
    ----------
    X : dask.Bag of raw text documents, length = n_samples
        Samples. Each sample must be a text document (either bytes or
        unicode strings, file name or file object depending on the
        constructor argument) which will be tokenized and hashed.

    Returns
    -------
    X : dask.array.Array, shape = (n_samples, self.n_features)
        Document-term matrix. Each block of the array is a scipy sparse
        matrix.

    Notes
    -----
    The returned dask Array is composed of scipy sparse matrices. If you
    need to compute on the result immediately, you may need to convert
    the individual blocks to ndarrays or pydata/sparse matrices.

    >>> import sparse
    >>> X.map_blocks(sparse.COO.from_scipy_sparse, dtype=X.dtype)  # doctest: +SKIP

    See the :doc:`examples/text-vectorization` for more.
    """
    transformer = super(HashingVectorizer, self).transform

    msg = "'X' should be a 1-dimensional array with length 'num_samples'."

    if not dask.is_dask_collection(X):
        return transformer(X)

    if isinstance(X, db.Bag):
        bag2 = X.map_partitions(transformer)
        objs = bag2.to_delayed()
        arrs = [
            da.from_delayed(obj, (np.nan, self.n_features), self.dtype)
            for obj in objs
        ]
        result = da.concatenate(arrs, axis=0)
    elif isinstance(X, dd.Series):
        result = X.map_partitions(transformer)
    elif isinstance(X, da.Array):
        # dask.Array
        chunks = ((np.nan,) * X.numblocks[0], (self.n_features,))
        if X.ndim == 1:
            result = X.map_blocks(
                transformer, dtype="f8", chunks=chunks, new_axis=1)
        else:
            raise ValueError(msg)
    else:
        raise ValueError(msg)

    return result
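# Hedged usage sketch assuming the method above belongs to a dask-ml style
# HashingVectorizer: hash a bag of documents into a lazy document-term matrix whose
# blocks are scipy sparse matrices. The vectorizer construction is an assumption and
# is left commented out.
import dask.bag as db

docs = db.from_sequence(
    ["the quick brown fox", "jumped over the lazy dog"], npartitions=2)
# vect = HashingVectorizer()
# X = vect.transform(docs)                        # dask array of sparse blocks, unknown row counts
# X.map_blocks(lambda b: b.toarray(), dtype=float).compute()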
def dispatch(self, hyper_model, X, y, X_val, y_val, max_trails, dataset_id, trail_store, **fit_kwargs): assert not any(dask.is_dask_collection(i) for i in (X, y, X_val, y_val)), \ f'{self.__class__.__name__} does not support to run trail with dask collection.' experiment = time.strftime('%Y%m%d%H%M%S') experiment_model_root = f'{_model_root}/{experiment}' os.makedirs(experiment_model_root, exist_ok=True) queue_size = int(config('search_queue', '1')) worker_count = int(config('search_executors', '3')) retry_limit = int(config('search_retry', '1000')) failed_counter = Counter() success_counter = Counter() def on_trail_start(trail_item): trail_item.start_at = time.time() if logger.is_info_enabled(): msg = f'Start trail {trail_item.trail_no}, space_id={trail_item.space_id}' \ + f',model_file={trail_item.model_file}' logger.info(msg) for callback in hyper_model.callbacks: # callback.on_build_estimator(hyper_model, space_sample, estimator, trail_no) #fixme callback.on_trail_begin(hyper_model, trail_item.space_sample, trail_item.trail_no) def on_trail_done(trail_item): trail_item.done_at = time.time() if trail_item.reward != 0 and not math.isnan( trail_item.reward): # success improved = hyper_model.history.append(trail_item) for callback in hyper_model.callbacks: callback.on_trail_end(hyper_model, trail_item.space_sample, trail_item.trail_no, trail_item.reward, improved, trail_item.elapsed) success_counter() else: for callback in hyper_model.callbacks: callback.on_trail_error(hyper_model, trail_item.space_sample, trail_item.trail_no) failed_counter() if logger.is_info_enabled(): elapsed = '%.3f' % (trail_item.done_at - trail_item.start_at) msg = f'Trail {trail_item.trail_no} done with reward={trail_item.reward}, ' \ f'elapsed {elapsed} seconds\n' \ f'----------------------------------------------------------------\n' \ f'space signatures: \n{hyper_model.history.get_space_signatures()}\n' \ f'----------------------------------------------------------------' logger.info(msg) if trail_store is not None: trail_store.put(dataset_id, trail_item) pool = DaskExecutorPool(worker_count, queue_size, on_trail_start, on_trail_done, hyper_model._run_trial, X, y, X_val, y_val, fit_kwargs) pool.start() trail_no = 1 retry_counter = 0 while trail_no <= max_trails and pool.running: if pool.qsize >= queue_size: time.sleep(0.1) continue space_sample = hyper_model.searcher.sample() if hyper_model.history.is_existed(space_sample): if retry_counter >= retry_limit: logger.info( f'Unable to take valid sample and exceed the retry limit 1000.' 
) break trail = hyper_model.history.get_trail(space_sample) for callback in hyper_model.callbacks: callback.on_skip_trail(hyper_model, space_sample, trail_no, 'trail_existed', trail.reward, False, trail.elapsed) retry_counter += 1 continue try: if trail_store is not None: trail = trail_store.get(dataset_id, space_sample) if trail is not None: reward = trail.reward elapsed = trail.elapsed trail = Trail(space_sample, trail_no, reward, elapsed) improved = hyper_model.history.append(trail) hyper_model.searcher.update_result( space_sample, reward) for callback in hyper_model.callbacks: callback.on_skip_trail(hyper_model, space_sample, trail_no, 'hit_trail_store', reward, improved, elapsed) trail_no += 1 continue model_file = '%s/%05d_%s.pkl' % ( experiment_model_root, trail_no, space_sample.space_id) item = DaskTrailItem(space_sample, trail_no, model_file=model_file) pool.push(item) if logger.is_info_enabled(): logger.info( f'Found trail {trail_no}, queue size: {pool.qsize}') except EarlyStoppingError: pool.stop() break except KeyboardInterrupt: pool.stop() pool.interrupted = True print('KeyboardInterrupt') break except Exception as e: import traceback msg = f'{">" * 20} Search trail {trail_no} failed! {"<" * 20}\n' \ + f'{e.__class__.__name__}: {e}\n' \ + traceback.format_exc() \ + '*' * 50 logger.error(msg) finally: trail_no += 1 retry_counter = 0 # wait trails if pool.running: logger.info('Search done, wait trail tasks.') pool.push(None) # mark end pool.join() if logger.is_info_enabled(): logger.info( f'Search and all trails done, {success_counter.value} success, ' f'{failed_counter.value} failed.') return trail_no
def test_gap_fill():
    a = np.zeros((5,), dtype="uint8")
    b = np.empty_like(a)
    b[:] = 33

    a[0] = 11
    ab = _gap_fill_np(a, b, 0)
    assert ab.dtype == a.dtype
    assert ab.tolist() == [11, 33, 33, 33, 33]

    xa = xr.DataArray(
        a,
        name="test_a",
        dims=("t",),
        attrs={"p1": 1, "nodata": 0},
        coords=dict(t=np.arange(a.shape[0])),
    )
    xb = xa + 0
    xb.data[:] = b
    xab = gap_fill(xa, xb)
    assert xab.name == xa.name
    assert xab.attrs == xa.attrs
    assert xab.data.tolist() == [11, 33, 33, 33, 33]

    xa.attrs["nodata"] = 11
    assert gap_fill(xa, xb).data.tolist() == [33, 0, 0, 0, 0]

    a = np.zeros((5,), dtype="float32")
    a[1:] = np.nan
    b = np.empty_like(a)
    b[:] = 33

    ab = _gap_fill_np(a, b, np.nan)
    assert ab.dtype == a.dtype
    assert ab.tolist() == [0, 33, 33, 33, 33]

    xa = xr.DataArray(
        a,
        name="test_a",
        dims=("t",),
        attrs={"p1": 1},
        coords=dict(t=np.arange(a.shape[0])),
    )
    xb = xa + 0
    xb.data[:] = b
    xab = gap_fill(xa, xb)
    assert xab.name == xa.name
    assert xab.attrs == xa.attrs
    assert xab.data.tolist() == [0, 33, 33, 33, 33]

    xa = xr.DataArray(
        da.from_array(a),
        name="test_a",
        dims=("t",),
        attrs={"p1": 1},
        coords=dict(t=np.arange(a.shape[0])),
    )
    xb = xr.DataArray(
        da.from_array(b),
        name="test_a",
        dims=("t",),
        attrs={"p1": 1},
        coords=dict(t=np.arange(b.shape[0])),
    )

    assert dask.is_dask_collection(xa)
    assert dask.is_dask_collection(xb)

    xab = gap_fill(xa, xb)
    assert dask.is_dask_collection(xab)
    assert xab.name == xa.name
    assert xab.attrs == xa.attrs
    assert xab.compute().values.tolist() == [0, 33, 33, 33, 33]
def int_geomedian(ds, scale=1, offset=0, wk_rows=-1, as_array=False, **kw):
    """Compute a geomedian of an integer ``xr.Dataset`` (possibly dask) with dims
    ``(time, y, x)`` for each band; the time dimension is removed on output.

    :param ds: Dataset with int data variables
    :param scale: Normalize data for running computation (output is scaled
                  back to original values)
    :param offset: ``(x*scale + offset)``
    :param wk_rows: reduce memory requirements by processing that many rows
                    of a chunk at a time
    :param as_array: If set to True return DataArray with band dimension
                     instead of Dataset
    :param kw: Passed on to hdstats (eps=1e-4, num_threads=1, maxiters=10_000,
               nocheck=True)
    """
    band_names = [dv.name for dv in ds.data_vars.values()]
    xx, *_ = ds.data_vars.values()
    nodata = getattr(xx, "nodata", None)

    is_dask = dask.is_dask_collection(xx)
    if is_dask:
        if xx.data.chunksize[0] != xx.shape[0]:
            ds = ds.chunk(chunks={xx.dims[0]: -1})
            xx, *_ = ds.data_vars.values()

    nt, ny, nx = xx.shape
    bands = [dv.data for dv in ds.data_vars.values()]
    band = bands[0]
    nb = len(bands)
    dtype = band.dtype

    kw.setdefault("nocheck", True)
    kw.setdefault("num_threads", 1)
    kw.setdefault("eps", 1e-4)
    kw.setdefault("maxiters", 10_000)

    if is_dask:
        chunks = ((nb,), *xx.chunks[1:])

        data = da.map_blocks(
            int_geomedian_np,
            *bands,
            nodata=nodata,
            scale=scale,
            offset=offset,
            wk_rows=wk_rows,
            **kw,
            name=randomize("geomedian"),
            dtype=dtype,
            chunks=chunks,
            drop_axis=[0],  # time is dropped
            new_axis=[0],   # band is added on the left
        )
    else:
        data = int_geomedian_np(*bands,
                                nodata=nodata,
                                scale=scale,
                                offset=offset,
                                wk_rows=wk_rows,
                                **kw)

    dims = ("band", *xx.dims[1:])
    cc = {k: xx.coords[k] for k in dims[1:]}
    cc["band"] = band_names

    da_out = xr.DataArray(data, dims=dims, coords=cc)

    if as_array:
        if nodata is not None:
            da_out.attrs["nodata"] = nodata
        return da_out

    ds_out = da_out.to_dataset(dim="band")
    ds_out.attrs.update(ds.attrs)
    for b in ds.data_vars.keys():
        src, dst = ds[b], ds_out[b]
        dst.attrs.update(src.attrs)

    return ds_out
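# A hedged usage sketch for int_geomedian. The band names, sizes, chunking
# and scale below are made up for illustration; computing the result also
# requires the optional hdstats dependency to be installed.
import numpy as np
import xarray as xr
import dask.array as da

nt, ny, nx = 6, 200, 200
pix = da.zeros((nt, ny, nx), chunks=(1, 100, 100), dtype="int16")
ds_example = xr.Dataset(
    {b: (("time", "y", "x"), pix) for b in ("red", "green", "blue")},
    coords={"time": np.arange(nt), "y": np.arange(ny), "x": np.arange(nx)},
)

gm = int_geomedian(ds_example, scale=1 / 10_000)  # lazy Dataset, one 2D image per band
# gm.compute()  # would run int_geomedian_np (hdstats) once per spatial chunk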
def assert_chunk(actual, chunk_bool): """check that actual is chunked when chunk_bool==True.""" if chunk_bool: assert is_dask_collection(actual) else: assert not is_dask_collection(actual)
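# Illustrative use of assert_chunk in a parametrized test: the same assertion
# covers both the eager and the dask-backed code path. The test below is
# hypothetical and not part of the original suite.
import numpy as np
import pytest
import xarray as xr


@pytest.mark.parametrize("chunk", [True, False])
def test_mean_preserves_laziness(chunk):
    arr = xr.DataArray(np.arange(10.0), dims="x")
    if chunk:
        arr = arr.chunk({"x": 5})
    # reductions stay lazy exactly when the input was dask-backed
    assert_chunk(arr.mean(), chunk)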
def xr_geomedian(ds, axis="time", where=None, **kw):
    """
    :param ds: xr.Dataset | xr.DataArray | numpy array

    Other parameters:
      **kwargs -- passed on to pcm.gnmpcm
         maxiters   : int      1000
         eps        : float    0.0001
         num_threads: int|None None
    """
    from hdstats import nangeomedian_pcm

    def norm_input(ds, axis):
        if isinstance(ds, xr.DataArray):
            xx = ds
            if len(xx.dims) != 4:
                raise ValueError("Expect 4 dimensions on input: y,x,band,time")
            if axis is not None and xx.dims[3] != axis:
                raise ValueError(f"Can only reduce last dimension, expect: y,x,band,{axis}")
            return None, xx, xx.data
        elif isinstance(ds, xr.Dataset):
            xx = reshape_for_geomedian(ds, axis)
            return ds, xx, xx.data
        else:  # assume numpy or similar
            xx_data = ds
            if xx_data.ndim != 4:
                raise ValueError("Expect 4 dimensions on input: y,x,band,time")
            return None, None, xx_data

    kw.setdefault("nocheck", True)
    kw.setdefault("num_threads", 1)
    kw.setdefault("eps", 1e-6)

    ds, xx, xx_data = norm_input(ds, axis)
    is_dask = dask.is_dask_collection(xx_data)

    if where is not None:
        if is_dask:
            raise NotImplementedError("Dask version doesn't support output masking currently")

        if where.shape != xx_data.shape[:2]:
            raise ValueError("Shape for `where` parameter doesn't match")
        set_nan = ~where
    else:
        set_nan = None

    if is_dask:
        if xx_data.shape[-2:] != xx_data.chunksize[-2:]:
            xx_data = xx_data.rechunk(xx_data.chunksize[:2] + (-1, -1))

        data = da.map_blocks(
            lambda x: nangeomedian_pcm(x, **kw),
            xx_data,
            name=randomize("geomedian"),
            dtype=xx_data.dtype,
            drop_axis=3,
        )
    else:
        data = nangeomedian_pcm(xx_data, **kw)

    if set_nan is not None:
        data[set_nan, :] = np.nan

    if xx is None:
        return data

    dims = xx.dims[:-1]
    cc = {k: xx.coords[k] for k in dims}
    xx_out = xr.DataArray(data, dims=dims, coords=cc)

    if ds is None:
        xx_out.attrs.update(xx.attrs)
        return xx_out

    ds_out = xx_out.to_dataset(dim="band")
    for b in ds.data_vars.keys():
        src, dst = ds[b], ds_out[b]
        dst.attrs.update(src.attrs)

    return ds_out
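# A hedged usage sketch for xr_geomedian on a DataArray already in
# (y, x, band, time) order, as norm_input expects. Sizes, names and the
# all-True mask are illustrative; hdstats must be installed to actually run.
import numpy as np
import xarray as xr

ny, nx, nb, nt = 50, 50, 3, 4
cube = xr.DataArray(
    np.random.rand(ny, nx, nb, nt).astype("float32"),
    dims=("y", "x", "band", "time"),
    coords={
        "y": np.arange(ny),
        "x": np.arange(nx),
        "band": ["red", "green", "blue"],
        "time": np.arange(nt),
    },
)

gm = xr_geomedian(cube, axis="time")  # -> DataArray (y, x, band)
# `where` masks output pixels (numpy inputs only); an all-True mask keeps everything
gm_masked = xr_geomedian(cube, axis="time", where=np.ones((ny, nx), dtype=bool))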
def reshape_yxbt( xx: xr.Dataset, name: str = "reshape_yxbt", yx_chunks: Union[int, Tuple[int, int]] = -1, ) -> xr.DataArray: """ Reshape Dask-backed ``xr.Dataset[Time,Y,X]`` into ``xr.DataArray[Y,X,Band,Time]``. On the output DataArray there is exactly one chunk along both Time and Band dimensions. :param xx: Dataset with 3 dimensional bands, dimension order (time, y, x) :param name: Dask name of the output operation :param yx_chunks: If supplied subdivide YX chunks of input into smaller sections, note that this can only make yx chunks smaller not bigger. Every output chunk depends on one input chunk only, so output chunks might not be regular, for example if input chunk sizes are 10, and yx_chunks=3, you'll get chunks sized 3,3,3,1,3,3,3,1... (example only, never use chunks that small) .. note: Chunks along first dimension ought to be of size 1 exactly (default for time dimension when using dc.load). """ if isinstance(yx_chunks, int): yx_chunks = (yx_chunks, yx_chunks) if not is_dask_collection(xx): raise ValueError("Currently this code works only on Dask inputs") if not all(dv.data.numblocks[0] == dv.data.shape[0] for dv in xx.data_vars.values()): raise ValueError( "All input bands should have chunk=1 for the first dimension") name0 = name name = randomize(name) blocks, _ = _get_chunks_for_all_bands(xx) b0, *_ = xx.data_vars.values() attrs = dict(b0.attrs) nb = len(xx.data_vars.values()) nt, ny, nx = b0.shape deps = [dv.data for dv in xx.data_vars.values()] shape = (ny, nx, nb, nt) dtype = b0.dtype dims = b0.dims[1:] + ("band", b0.dims[0]) maxy, maxx = yx_chunks ychunks, xchunks = b0.data.chunks[1:3] _yy = list(_split_chunks(ychunks, maxy)) _xx = list(_split_chunks(xchunks, maxx)) ychunks = tuple(roi.stop - roi.start for _, _, roi in _yy) xchunks = tuple(roi.stop - roi.start for _, _, roi in _xx) chunks = [ychunks, xchunks, (nb, ), (nt, )] dsk = {} for iy, iy_src, y_roi in _yy: for ix, ix_src, x_roi in _xx: crop_yx = (y_roi, x_roi) _blocks = blocks[:, :, iy_src, ix_src].tolist() dsk[(name, iy, ix, 0, 0)] = ( functools.partial(_reshape_yxbt_impl, crop_yx=crop_yx), _blocks, ) dsk = HighLevelGraph.from_collections(name, dsk, dependencies=deps) data = da.Array(dsk, name, chunks=chunks, dtype=dtype, shape=shape) coords: Dict[Hashable, Any] = {k: c for k, c in xx.coords.items()} coords["band"] = list(xx.data_vars) return xr.DataArray(data=data, dims=dims, coords=coords, name=name0, attrs=attrs)
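# A hedged sketch of reshape_yxbt on a dask-backed Dataset: each band is
# (time, y, x) with the time dimension chunked one slice per chunk, as the
# function requires. Band names, sizes and yx_chunks are illustrative.
import numpy as np
import xarray as xr
import dask.array as da

nt, ny, nx = 4, 120, 120
pix = da.zeros((nt, ny, nx), chunks=(1, 60, 60), dtype="uint16")
ds_example = xr.Dataset(
    {b: (("time", "y", "x"), pix) for b in ("red", "nir")},
    coords={"time": np.arange(nt), "y": np.arange(ny), "x": np.arange(nx)},
)

yxbt = reshape_yxbt(ds_example, yx_chunks=32)    # lazy DataArray (y, x, band, time)
assert yxbt.data.numblocks[2:] == (1, 1)         # single chunk along band and time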
def map_blocks( func: Callable[..., T_DSorDA], obj: Union[DataArray, Dataset], args: Sequence[Any] = (), kwargs: Mapping[str, Any] = None, template: Union[DataArray, Dataset] = None, ) -> T_DSorDA: """Apply a function to each block of a DataArray or Dataset. .. warning:: This function is experimental and its signature may change. Parameters ---------- func : callable User-provided function that accepts a DataArray or Dataset as its first parameter ``obj``. The function will receive a subset or 'block' of ``obj`` (see below), corresponding to one chunk along each chunked dimension. ``func`` will be executed as ``func(subset_obj, *subset_args, **kwargs)``. This function must return either a single DataArray or a single Dataset. This function cannot add a new chunked dimension. obj : DataArray, Dataset Passed to the function as its first argument, one block at a time. args : sequence Passed to func after unpacking and subsetting any xarray objects by blocks. xarray objects in args must be aligned with obj, otherwise an error is raised. kwargs : mapping Passed verbatim to func after unpacking. xarray objects, if any, will not be subset to blocks. Passing dask collections in kwargs is not allowed. template : DataArray or Dataset, optional xarray object representing the final result after compute is called. If not provided, the function will be first run on mocked-up data, that looks like ``obj`` but has sizes 0, to determine properties of the returned object such as dtype, variable names, attributes, new dimensions and new indexes (if any). ``template`` must be provided if the function changes the size of existing dimensions. When provided, ``attrs`` on variables in `template` are copied over to the result. Any ``attrs`` set by ``func`` will be ignored. Returns ------- A single DataArray or Dataset with dask backend, reassembled from the outputs of the function. Notes ----- This function is designed for when ``func`` needs to manipulate a whole xarray object subset to each block. In the more common case where ``func`` can work on numpy arrays, it is recommended to use ``apply_ufunc``. If none of the variables in ``obj`` is backed by dask arrays, calling this function is equivalent to calling ``func(obj, *args, **kwargs)``. See Also -------- dask.array.map_blocks, xarray.apply_ufunc, xarray.Dataset.map_blocks xarray.DataArray.map_blocks Examples -------- Calculate an anomaly from climatology using ``.groupby()``. Using ``xr.map_blocks()`` allows for parallel operations with knowledge of ``xarray``, its indices, and its methods like ``.groupby()``. >>> def calculate_anomaly(da, groupby_type="time.month"): ... gb = da.groupby(groupby_type) ... clim = gb.mean(dim="time") ... return gb - clim ... >>> time = xr.cftime_range("1990-01", "1992-01", freq="M") >>> month = xr.DataArray(time.month, coords={"time": time}, dims=["time"]) >>> np.random.seed(123) >>> array = xr.DataArray( ... np.random.rand(len(time)), ... dims=["time"], ... coords={"time": time, "month": month}, ... ).chunk() >>> array.map_blocks(calculate_anomaly, template=array).compute() <xarray.DataArray (time: 24)> array([ 0.12894847, 0.11323072, -0.0855964 , -0.09334032, 0.26848862, 0.12382735, 0.22460641, 0.07650108, -0.07673453, -0.22865714, -0.19063865, 0.0590131 , -0.12894847, -0.11323072, 0.0855964 , 0.09334032, -0.26848862, -0.12382735, -0.22460641, -0.07650108, 0.07673453, 0.22865714, 0.19063865, -0.0590131 ]) Coordinates: * time (time) object 1990-01-31 00:00:00 ... 
1991-12-31 00:00:00 month (time) int64 1 2 3 4 5 6 7 8 9 10 11 12 1 2 3 4 5 6 7 8 9 10 11 12 Note that one must explicitly use ``args=[]`` and ``kwargs={}`` to pass arguments to the function being applied in ``xr.map_blocks()``: >>> array.map_blocks( ... calculate_anomaly, ... kwargs={"groupby_type": "time.year"}, ... template=array, ... ) # doctest: +ELLIPSIS <xarray.DataArray (time: 24)> dask.array<calculate_anomaly-...-<this, shape=(24,), dtype=float64, chunksize=(24,), chunktype=numpy.ndarray> Coordinates: * time (time) object 1990-01-31 00:00:00 ... 1991-12-31 00:00:00 month (time) int64 dask.array<chunksize=(24,), meta=np.ndarray> """ def _wrapper( func: Callable, args: List, kwargs: dict, arg_is_array: Iterable[bool], expected: dict, ): """ Wrapper function that receives datasets in args; converts to dataarrays when necessary; passes these to the user function `func` and checks returned objects for expected shapes/sizes/etc. """ converted_args = [ dataset_to_dataarray(arg) if is_array else arg for is_array, arg in zip(arg_is_array, args) ] result = func(*converted_args, **kwargs) # check all dims are present missing_dimensions = set(expected["shapes"]) - set(result.sizes) if missing_dimensions: raise ValueError( f"Dimensions {missing_dimensions} missing on returned object.") # check that index lengths and values are as expected for name, index in result.indexes.items(): if name in expected["shapes"]: if len(index) != expected["shapes"][name]: raise ValueError( f"Received dimension {name!r} of length {len(index)}. Expected length {expected['shapes'][name]}." ) if name in expected["indexes"]: expected_index = expected["indexes"][name] if not index.equals(expected_index): raise ValueError( f"Expected index {name!r} to be {expected_index!r}. Received {index!r} instead." ) # check that all expected variables were returned check_result_variables(result, expected, "coords") if isinstance(result, Dataset): check_result_variables(result, expected, "data_vars") return make_dict(result) if template is not None and not isinstance(template, (DataArray, Dataset)): raise TypeError( f"template must be a DataArray or Dataset. Received {type(template).__name__} instead." ) if not isinstance(args, Sequence): raise TypeError( "args must be a sequence (for example, a list or tuple).") if kwargs is None: kwargs = {} elif not isinstance(kwargs, Mapping): raise TypeError("kwargs must be a mapping (for example, a dict)") for value in kwargs.values(): if dask.is_dask_collection(value): raise TypeError( "Cannot pass dask collections in kwargs yet. Please compute or " "load values before passing to map_blocks.") if not dask.is_dask_collection(obj): return func(obj, *args, **kwargs) all_args = [obj] + list(args) is_xarray = [isinstance(arg, (Dataset, DataArray)) for arg in all_args] is_array = [isinstance(arg, DataArray) for arg in all_args] # there should be a better way to group this. partition? xarray_indices, xarray_objs = unzip( (index, arg) for index, arg in enumerate(all_args) if is_xarray[index]) others = [(index, arg) for index, arg in enumerate(all_args) if not is_xarray[index]] # all xarray objects must be aligned. This is consistent with apply_ufunc. 
aligned = align(*xarray_objs, join="exact") xarray_objs = tuple( dataarray_to_dataset(arg) if is_da else arg for is_da, arg in zip(is_array, aligned)) _, npargs = unzip( sorted(list(zip(xarray_indices, xarray_objs)) + others, key=lambda x: x[0])) # check that chunk sizes are compatible input_chunks = dict(npargs[0].chunks) input_indexes = dict(npargs[0].indexes) for arg in xarray_objs[1:]: assert_chunks_compatible(npargs[0], arg) input_chunks.update(arg.chunks) input_indexes.update(arg.indexes) if template is None: # infer template by providing zero-shaped arrays template = infer_template(func, aligned[0], *args, **kwargs) template_indexes = set(template.indexes) preserved_indexes = template_indexes & set(input_indexes) new_indexes = template_indexes - set(input_indexes) indexes = {dim: input_indexes[dim] for dim in preserved_indexes} indexes.update({k: template.indexes[k] for k in new_indexes}) output_chunks = { dim: input_chunks[dim] for dim in template.dims if dim in input_chunks } else: # template xarray object has been provided with proper sizes and chunk shapes indexes = dict(template.indexes) if isinstance(template, DataArray): output_chunks = dict(zip(template.dims, template.chunks)) # type: ignore else: output_chunks = dict(template.chunks) for dim in output_chunks: if dim in input_chunks and len(input_chunks[dim]) != len( output_chunks[dim]): raise ValueError( "map_blocks requires that one block of the input maps to one block of output. " f"Expected number of output chunks along dimension {dim!r} to be {len(input_chunks[dim])}. " f"Received {len(output_chunks[dim])} instead. Please provide template if not provided, or " "fix the provided template.") if isinstance(template, DataArray): result_is_array = True template_name = template.name template = template._to_temp_dataset() elif isinstance(template, Dataset): result_is_array = False else: raise TypeError( f"func output must be DataArray or Dataset; got {type(template)}") # We're building a new HighLevelGraph hlg. We'll have one new layer # for each variable in the dataset, which is the result of the # func applied to the values. graph: Dict[Any, Any] = {} new_layers: DefaultDict[str, Dict[Any, Any]] = collections.defaultdict(dict) gname = "{}-{}".format(dask.utils.funcname(func), dask.base.tokenize(npargs[0], args, kwargs)) # map dims to list of chunk indexes ichunk = { dim: range(len(chunks_v)) for dim, chunks_v in input_chunks.items() } # mapping from chunk index to slice bounds input_chunk_bounds = { dim: np.cumsum((0, ) + chunks_v) for dim, chunks_v in input_chunks.items() } output_chunk_bounds = { dim: np.cumsum((0, ) + chunks_v) for dim, chunks_v in output_chunks.items() } def subset_dataset_to_block(graph: dict, gname: str, dataset: Dataset, input_chunk_bounds, chunk_index): """ Creates a task that subsets an xarray dataset to a block determined by chunk_index. Block extents are determined by input_chunk_bounds. Also subtasks that subset the constituent variables of a dataset. """ # this will become [[name1, variable1], # [name2, variable2], # ...] 
# which is passed to dict and then to Dataset data_vars = [] coords = [] chunk_tuple = tuple(chunk_index.values()) for name, variable in dataset.variables.items(): # make a task that creates tuple of (dims, chunk) if dask.is_dask_collection(variable.data): # recursively index into dask_keys nested list to get chunk chunk = variable.__dask_keys__() for dim in variable.dims: chunk = chunk[chunk_index[dim]] chunk_variable_task = ( f"{gname}-{name}-{chunk[0]}", ) + chunk_tuple graph[chunk_variable_task] = ( tuple, [variable.dims, chunk, variable.attrs], ) else: # non-dask array possibly with dimensions chunked on other variables # index into variable appropriately subsetter = { dim: _get_chunk_slicer(dim, chunk_index, input_chunk_bounds) for dim in variable.dims } subset = variable.isel(subsetter) chunk_variable_task = ("{}-{}".format( gname, dask.base.tokenize(subset)), ) + chunk_tuple graph[chunk_variable_task] = ( tuple, [subset.dims, subset, subset.attrs], ) # this task creates dict mapping variable name to above tuple if name in dataset._coord_names: coords.append([name, chunk_variable_task]) else: data_vars.append([name, chunk_variable_task]) return (Dataset, (dict, data_vars), (dict, coords), dataset.attrs) # iterate over all possible chunk combinations for chunk_tuple in itertools.product(*ichunk.values()): # mapping from dimension name to chunk index chunk_index = dict(zip(ichunk.keys(), chunk_tuple)) blocked_args = [ subset_dataset_to_block(graph, gname, arg, input_chunk_bounds, chunk_index) if isxr else arg for isxr, arg in zip(is_xarray, npargs) ] # expected["shapes", "coords", "data_vars", "indexes"] are used to # raise nice error messages in _wrapper expected = {} # input chunk 0 along a dimension maps to output chunk 0 along the same dimension # even if length of dimension is changed by the applied function expected["shapes"] = { k: output_chunks[k][v] for k, v in chunk_index.items() if k in output_chunks } expected["data_vars"] = set(template.data_vars.keys()) # type: ignore expected["coords"] = set(template.coords.keys()) # type: ignore expected["indexes"] = { dim: indexes[dim][_get_chunk_slicer(dim, chunk_index, output_chunk_bounds)] for dim in indexes } from_wrapper = (gname, ) + chunk_tuple graph[from_wrapper] = (_wrapper, func, blocked_args, kwargs, is_array, expected) # mapping from variable name to dask graph key var_key_map: Dict[Hashable, str] = {} for name, variable in template.variables.items(): if name in indexes: continue gname_l = f"{gname}-{name}" var_key_map[name] = gname_l key: Tuple[Any, ...] = (gname_l, ) for dim in variable.dims: if dim in chunk_index: key += (chunk_index[dim], ) else: # unchunked dimensions in the input have one chunk in the result # output can have new dimensions with exactly one chunk key += (0, ) # We're adding multiple new layers to the graph: # The first new layer is the result of the computation on # the array. # Then we add one layer per variable, which extracts the # result for that variable, and depends on just the first new # layer. new_layers[gname_l][key] = (operator.getitem, from_wrapper, name) hlg = HighLevelGraph.from_collections( gname, graph, dependencies=[arg for arg in npargs if dask.is_dask_collection(arg)], ) for gname_l, layer in new_layers.items(): # This adds in the getitems for each variable in the dataset. 
hlg.dependencies[gname_l] = {gname} hlg.layers[gname_l] = layer result = Dataset(coords=indexes, attrs=template.attrs) for index in result.indexes: result[index].attrs = template[index].attrs result[index].encoding = template[index].encoding for name, gname_l in var_key_map.items(): dims = template[name].dims var_chunks = [] for dim in dims: if dim in output_chunks: var_chunks.append(output_chunks[dim]) elif dim in indexes: var_chunks.append((len(indexes[dim]), )) elif dim in template.dims: # new unindexed dimension var_chunks.append((template.sizes[dim], )) data = dask.array.Array(hlg, name=gname_l, chunks=var_chunks, dtype=template[name].dtype) result[name] = (dims, data, template[name].attrs) result[name].encoding = template[name].encoding result = result.set_coords(template._coord_names) if result_is_array: da = dataset_to_dataarray(result) da.name = template_name return da # type: ignore return result # type: ignore
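# A hedged example of the `template` argument described in the docstring
# above: when the applied function changes the size of an existing dimension
# (here it keeps every other time step), a template with matching sizes and
# chunks must be supplied. The data, coordinates and chunk sizes are
# illustrative only; `map_blocks` is the function defined above (the same
# code path as xr.map_blocks / DataArray.map_blocks).
import numpy as np
import xarray as xr


def every_other(x):
    # halves the length of the time dimension within each block
    return x.isel(time=slice(None, None, 2))


arr = xr.DataArray(
    np.arange(24.0),
    dims=["time"],
    coords={"time": np.arange(24)},
).chunk({"time": 12})

# template: same selection applied eagerly, rechunked so that one input block
# maps to exactly one output block (2 blocks in, 2 blocks out)
template = arr.isel(time=slice(None, None, 2)).chunk({"time": 6})
result = map_blocks(every_other, arr, template=template)
result.compute()  # values 0, 2, 4, ..., 22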
def test_transform_dtypes(self, array): result = dpp.LabelEncoder().fit_transform(array) assert result.dtype == np.intp if dask.is_dask_collection(array): assert result.dtype == result.compute().dtype
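# Hypothetical sketch of a fixture that could drive the test above: the same
# categorical values supplied either as a NumPy array or as a dask array, so
# the dtype assertion exercises both the eager and the lazy path. Names and
# data are made up; the real suite's fixture may differ.
import numpy as np
import dask.array as da
import pytest


@pytest.fixture(params=["numpy", "dask"])
def array(request):
    values = np.array(["a", "b", "a", "c"], dtype=object)
    if request.param == "dask":
        return da.from_array(values, chunks=2)
    return values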