def test_blockwise_non_blockwise_output():
    x = da.ones(10, chunks=(5,))
    y = ((x + 1) + 2) + 3
    w = y.sum()
    z = ((y * 2) * 3) * 4

    z_top_before = tuple(z.dask.dicts[z.name].indices)
    (zz,) = dask.optimize(z)
    z_top_after = tuple(z.dask.dicts[z.name].indices)
    assert z_top_before == z_top_after, "z_top mutated"

    dsk = optimize_blockwise(z.dask, keys=list(dask.core.flatten(z.__dask_keys__())))
    assert isinstance(dsk, HighLevelGraph)
    assert (
        len([layer for layer in dsk.dicts.values() if isinstance(layer, Blockwise)])
        == 1
    )

    dsk = optimize_blockwise(
        HighLevelGraph.merge(w.dask, z.dask),
        keys=list(dask.core.flatten([w.__dask_keys__(), z.__dask_keys__()])),
    )
    assert isinstance(dsk, HighLevelGraph)
    assert (
        len([layer for layer in z.dask.dicts.values() if isinstance(layer, Blockwise)])
        >= 1
    )
def collections_to_dsk(collections, optimize_graph=True, optimizations=(), **kwargs):
    """
    Convert many collections into a single dask graph, after optimization
    """
    from dask.highlevelgraph import HighLevelGraph

    optimizations = tuple(optimizations) + tuple(config.get("optimizations", ()))

    if optimize_graph:
        groups = groupby(optimization_function, collections)

        graphs = []
        for opt, val in groups.items():
            dsk, keys = _extract_graph_and_keys(val)
            dsk = opt(dsk, keys, **kwargs)

            for opt_inner in optimizations:
                dsk = opt_inner(dsk, keys, **kwargs)

            graphs.append(dsk)

        # Merge all graphs
        if any(isinstance(graph, HighLevelGraph) for graph in graphs):
            dsk = HighLevelGraph.merge(*graphs)
        else:
            dsk = merge(*map(ensure_dict, graphs))
    else:
        dsk, _ = _extract_graph_and_keys(collections)

    return dsk
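# A minimal usage sketch (not from the original source): collections_to_dsk is
# importable from dask.base and merges the optimized graphs of several
# collections into one graph, which is what dask.compute relies on internally.
import dask.array as da
from dask.base import collections_to_dsk

a = da.ones(10, chunks=5)
b = (a + 1).sum()
dsk = collections_to_dsk([a, b], optimize_graph=True)  # one merged graph for both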
def __dask_graph__(self):
    graphs = {k: v.__dask_graph__() for k, v in self.data_vars.items()}
    # Excise anything that is not a dask collection
    graphs = {k: v for k, v in graphs.items() if v is not None}
    if len(graphs) > 0:
        return HighLevelGraph.merge(*graphs.values())
    return None
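# A minimal sketch (an assumed example, not from the original source) of the
# merge that __dask_graph__ performs over its data_vars: HighLevelGraph.merge
# returns the union of the input graphs' layers and dependencies.
import dask.array as da
from dask.highlevelgraph import HighLevelGraph

u = da.ones(4, chunks=2)
v = da.zeros(4, chunks=2)
merged = HighLevelGraph.merge(u.__dask_graph__(), v.__dask_graph__())
assert set(merged.layers) == (
    set(u.__dask_graph__().layers) | set(v.__dask_graph__().layers)
)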
def compute_as_if_collection(cls, dsk, keys, scheduler=None, get=None, **kwargs):
    """Compute a graph as if it were of type cls.

    Allows for applying the same optimizations and default scheduler.
    """
    from dask.highlevelgraph import HighLevelGraph

    schedule = get_scheduler(scheduler=scheduler, cls=cls, get=get)
    dsk2 = optimization_function(cls)(dsk, keys, **kwargs)
    # see https://github.com/dask/dask/issues/8991.
    # This merge should be removed once the underlying issue is fixed.
    dsk2 = HighLevelGraph.merge(dsk2)
    return schedule(dsk2, keys, **kwargs)
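# A hedged sketch of calling compute_as_if_collection (importable from
# dask.base) on a hand-built graph, borrowing dask.array's optimizer and
# default scheduler. The graph contents here are purely illustrative.
import dask.array as da
from dask.base import compute_as_if_collection

dsk = {("x", 0): 1, ("y", 0): (sum, [("x", 0), 10])}
result = compute_as_if_collection(da.Array, dsk, [("y", 0)])  # -> [11]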
def rearrange_by_column_disk(df, column, npartitions=None, compute=False):
    """Shuffle using local disk

    See Also
    --------
    rearrange_by_column_tasks:
        Same function, but using tasks rather than partd
        Has a more informative docstring
    """
    if npartitions is None:
        npartitions = df.npartitions

    token = tokenize(df, column, npartitions)
    always_new_token = uuid.uuid1().hex

    p = ("zpartd-" + always_new_token,)
    dsk1 = {p: (maybe_buffered_partd(),)}

    # Partition data on disk
    name = "shuffle-partition-" + always_new_token
    dsk2 = {
        (name, i): (shuffle_group_3, key, column, npartitions, p)
        for i, key in enumerate(df.__dask_keys__())
    }

    dependencies = []
    if compute:
        graph = HighLevelGraph.merge(df.dask, dsk1, dsk2)
        graph = HighLevelGraph.from_collections(name, graph, dependencies=[df])
        keys = [p, sorted(dsk2)]
        pp, values = compute_as_if_collection(DataFrame, graph, keys)
        dsk1 = {p: pp}
        dsk2 = dict(zip(sorted(dsk2), values))
    else:
        dependencies.append(df)

    # Barrier
    barrier_token = "barrier-" + always_new_token
    dsk3 = {barrier_token: (barrier, list(dsk2))}

    # Collect groups
    name = "shuffle-collect-" + token
    dsk4 = {
        (name, i): (collect, p, i, df._meta, barrier_token)
        for i in range(npartitions)
    }

    divisions = (None,) * (npartitions + 1)

    layer = toolz.merge(dsk1, dsk2, dsk3, dsk4)
    graph = HighLevelGraph.from_collections(name, layer, dependencies=dependencies)
    return new_dd_object(graph, name, df._meta, divisions)
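# A hedged usage sketch for the function above (behavior assumed from this
# snippet, not verified against a specific dask version): shuffle a small
# dask DataFrame by a column via local disk. Requires the optional ``partd``
# dependency that backs the disk shuffle.
import pandas as pd
import dask.dataframe as dd

pdf = pd.DataFrame({"x": [3, 1, 2, 0], "y": list("abcd")})
ddf = dd.from_pandas(pdf, npartitions=2)
shuffled = rearrange_by_column_disk(ddf, "x", npartitions=2)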
def _extract_graph_and_keys(vals):
    """Given a list of dask vals, return a single graph and a list of keys such
    that ``get(dsk, keys)`` is equivalent to ``[v.compute() for v in vals]``."""
    from dask.highlevelgraph import HighLevelGraph

    graphs, keys = [], []
    for v in vals:
        graphs.append(v.__dask_graph__())
        keys.append(v.__dask_keys__())

    if any(isinstance(graph, HighLevelGraph) for graph in graphs):
        graph = HighLevelGraph.merge(*graphs)
    else:
        graph = merge(*map(ensure_dict, graphs))

    return graph, keys
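# An illustrative sketch (it uses a private helper, so treat the details as
# assumptions): keys mirrors each value's __dask_keys__() structure, so a
# single scheduler get() over (graph, keys) computes every collection at once.
import dask.array as da
from dask.base import _extract_graph_and_keys
from dask.local import get_sync

vals = [da.ones(4, chunks=2).sum(), da.zeros(6, chunks=3).sum()]
graph, keys = _extract_graph_and_keys(vals)
results = get_sync(graph, keys)  # nested to match each __dask_keys__()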
def cascaded_compute(callback, arrays, batch_size=None, optimize=True):
    """Dask helper function for iterating over computed dask arrays.

    Args:
        callback (callable): Called with a single numpy array computed from
            the provided dask arrays.
        arrays (list, tuple): Dask arrays to pass to callback.
        batch_size (int): Group computation in to this many arrays at a time.
        optimize (bool): Whether to try to optimize the dask graphs of the
            provided arrays.

    Yields:
        One `dask.Delayed` object per batch; computing it calls ``callback``
        on every array in that batch, in order.

    """
    def _callback_wrapper(arr, previous_call, cb=callback):
        del previous_call  # used only for task ordering
        return cb(arr)

    array_batches = []
    if not batch_size:
        array_batches.append(arrays)
    else:
        arr_gens = iter(arrays)
        array_batches = (arrs for arrs in zip_longest(*([arr_gens] * batch_size)))

    for batch_arrs in array_batches:
        # zip_longest pads the final batch with None
        batch_arrs = [x for x in batch_arrs if x is not None]
        if optimize:
            # optimize Dask graph over all objects
            dsk = da.Array.__dask_optimize__(
                # combine all Dask Array graphs
                HighLevelGraph.merge(*[e.__dask_graph__() for e in batch_arrs]),
                # get Dask Array keys in result
                list(dask.core.flatten([e.__dask_keys__() for e in batch_arrs])),
            )
            # rebuild Dask Arrays
            batch_arrs = [da.Array(dsk, e.name, e.chunks, e.dtype) for e in batch_arrs]

        current_write = None
        for dask_arr in batch_arrs:
            current_write = dask.delayed(_callback_wrapper)(dask_arr, current_write)
        yield current_write
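# A hypothetical usage sketch for the generator above: one Delayed is yielded
# per batch, and computing it runs the callback over that batch's arrays in
# order. ``save_chunk`` is an assumed placeholder callback.
import dask
import dask.array as da

def save_chunk(arr):
    print(arr.sum())

arrays = [da.ones((4, 4), chunks=2) for _ in range(4)]
writes = list(cascaded_compute(save_chunk, arrays, batch_size=2))
dask.compute(*writes)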
def _checkpoint_one(collection, split_every) -> Delayed:
    tok = tokenize(collection)
    name = "checkpoint-" + tok

    keys_iter = flatten(collection.__dask_keys__())
    try:
        next(keys_iter)
        next(keys_iter)
    except StopIteration:
        # Collection has 0 or 1 keys; no need for a map step
        layer = {name: (chunks.checkpoint, collection.__dask_keys__())}
        dsk = HighLevelGraph.from_collections(name, layer, dependencies=(collection,))
        return Delayed(name, dsk)

    # Collection has 2+ keys; apply a two-step map->reduce algorithm so that we
    # transfer over the network and store in RAM only a handful of None's instead of
    # the full computed collection's contents
    dsks = []
    map_names = set()
    map_keys = []

    for prev_name in get_collection_names(collection):
        map_name = "checkpoint_map-" + tokenize(prev_name, tok)
        map_names.add(map_name)
        map_layer = _build_map_layer(chunks.checkpoint, prev_name, map_name, collection)
        map_keys += list(map_layer.get_output_keys())
        dsks.append(
            HighLevelGraph.from_collections(
                map_name, map_layer, dependencies=(collection,)
            )
        )

    # recursive aggregation
    reduce_layer: dict = {}
    while split_every and len(map_keys) > split_every:
        k = (name, len(reduce_layer))
        reduce_layer[k] = (chunks.checkpoint, map_keys[:split_every])
        map_keys = map_keys[split_every:] + [k]
    reduce_layer[name] = (chunks.checkpoint, map_keys)

    dsks.append(HighLevelGraph({name: reduce_layer}, dependencies={name: map_names}))
    dsk = HighLevelGraph.merge(*dsks)

    return Delayed(name, dsk)
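# A hedged sketch of the public entry point that wraps _checkpoint_one:
# dask.graph_manipulation.checkpoint returns a Delayed that resolves to None
# once every chunk of the input collections has materialized.
import dask.array as da
from dask.graph_manipulation import checkpoint

x = da.ones(100, chunks=10)
done = checkpoint(x, split_every=4)  # reduce at most 4 map keys at a time
done.compute()  # None; all 10 chunks of x have been computed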
def block_one(coll):
    tok = tokenize(coll, blocker)
    dsks = []
    rename = {}
    for prev_name in get_collection_names(coll):
        new_name = "wait_on-" + tokenize(prev_name, tok)
        rename[prev_name] = new_name
        layer = _build_map_layer(
            chunks.bind, prev_name, new_name, coll, dependencies=(blocker,)
        )
        dsks.append(
            HighLevelGraph.from_collections(
                new_name, layer, dependencies=(coll, blocker)
            )
        )
    dsk = HighLevelGraph.merge(*dsks)
    rebuild, args = coll.__dask_postpersist__()
    return rebuild(dsk, *args, rename=rename)
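# block_one closes over ``blocker``; it is the inner helper of
# dask.graph_manipulation.wait_on. A hedged sketch of that public API: each
# returned collection only starts computing once every chunk of all inputs
# has been computed.
import dask.array as da
from dask.graph_manipulation import wait_on

x = da.ones(10, chunks=5)
y = da.zeros(10, chunks=5)
x2, y2 = wait_on(x, y)  # same values, cross-blocked graphs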
def warp(self, dem=None, proj="EPSG:4326", **kwargs):
    """Delayed warp across an entire AOI or Image

    Creates a new dask image by deferring calls to the warp_geometry on chunks

    Args:
        dem (ndarray): optional. A DEM for warping to specific elevation planes
        proj (str): optional. An EPSG proj string to project the image data into ("EPSG:32612")

    Returns:
        daskarray: a warped image as deferred image array
    """
    try:
        img_md = self.rda.metadata["image"]
        x_size = img_md["tileXSize"]
        y_size = img_md["tileYSize"]
    except (AttributeError, KeyError):
        x_size = kwargs.get("chunk_size", 256)
        y_size = kwargs.get("chunk_size", 256)

    # Create an affine transform to convert between real-world and pixels
    if self.proj is None:
        from_proj = "EPSG:4326"
    else:
        from_proj = self.proj

    try:
        # NOTE: this only works on images that have rda rpcs metadata
        center = wkt.loads(self.rda.metadata["image"]["imageBoundsWGS84"]).centroid
        g = box(*center.buffer(self.rda.metadata["rpcs"]["gsd"] / 2).bounds)
        tfm = partial(pyproj.transform, pyproj.Proj(init="EPSG:4326"), pyproj.Proj(init=proj))
        gsd = kwargs.get("gsd", ops.transform(tfm, g).area ** 0.5)
        current_bounds = wkt.loads(self.rda.metadata["image"]["imageBoundsWGS84"]).bounds
    except (AttributeError, KeyError, TypeError):
        tfm = partial(pyproj.transform, pyproj.Proj(init=self.proj), pyproj.Proj(init=proj))
        gsd = kwargs.get("gsd", (ops.transform(tfm, shape(self)).area / (self.shape[1] * self.shape[2])) ** 0.5)
        current_bounds = self.bounds

    tfm = partial(pyproj.transform, pyproj.Proj(init=from_proj), pyproj.Proj(init=proj))
    itfm = partial(pyproj.transform, pyproj.Proj(init=proj), pyproj.Proj(init=from_proj))
    output_bounds = ops.transform(tfm, box(*current_bounds)).bounds
    gtf = Affine.from_gdal(output_bounds[0], gsd, 0.0, output_bounds[3], 0.0, -1 * gsd)

    ll = ~gtf * (output_bounds[:2])
    ur = ~gtf * (output_bounds[2:])
    x_chunks = int((ur[0] - ll[0]) / x_size) + 1
    y_chunks = int((ll[1] - ur[1]) / y_size) + 1
    num_bands = self.shape[0]

    try:
        dtype = img_md["dataType"]
    except (KeyError, NameError):
        # img_md may be undefined if the tile metadata lookup above failed
        dtype = 'uint8'

    daskmeta = {
        "dask": {},
        "chunks": (num_bands, y_size, x_size),
        "dtype": dtype,
        "name": "warp-{}".format(self.name),
        "shape": (num_bands, y_chunks * y_size, x_chunks * x_size)
    }

    def px_to_geom(xmin, ymin):
        xmax = int(xmin + x_size)
        ymax = int(ymin + y_size)
        bounds = list((gtf * (xmin, ymax)) + (gtf * (xmax, ymin)))
        return box(*bounds)

    full_bounds = box(*output_bounds)

    dasks = []
    if isinstance(dem, GeoDaskImage):
        if dem.proj != proj:
            dem = dem.warp(proj=proj, dem=dem)
        dasks.append(dem.dask)

    for y in range(y_chunks):
        for x in range(x_chunks):
            xmin = x * x_size
            ymin = y * y_size
            geometry = px_to_geom(xmin, ymin)
            daskmeta["dask"][(daskmeta["name"], 0, y, x)] = (self._warp, geometry, gsd, dem, proj, dtype, 5)
    daskmeta["dask"], _ = optimization.cull(HighLevelGraph.merge(daskmeta["dask"], *dasks), list(daskmeta["dask"].keys()))

    gi = mapping(full_bounds)
    gt = AffineTransform(gtf, proj)
    image = GeoDaskImage(daskmeta, __geo_interface__=gi, __geo_transform__=gt)
    return image[box(*output_bounds)]
def fit(model, x, y, compute=True, shuffle_blocks=True, random_state=None, **kwargs):
    """Fit scikit learn model against dask arrays

    Model must support the ``partial_fit`` interface for online or batch
    learning.

    Ideally your rows are independent and identically distributed. By default,
    this function will step through chunks of the arrays in random order.

    Parameters
    ----------
    model: sklearn model
        Any model supporting partial_fit interface
    x: dask Array
        Two dimensional array, likely tall and skinny
    y: dask Array
        One dimensional array with same chunks as x's rows
    compute : bool
        Whether to compute this result
    shuffle_blocks : bool
        Whether to shuffle the blocks with ``random_state`` or not
    random_state : int or numpy.random.RandomState
        Random state to use when shuffling blocks
    kwargs:
        options to pass to partial_fit

    Examples
    --------
    >>> import dask.array as da
    >>> X = da.random.random((10, 3), chunks=(5, 3))
    >>> y = da.random.randint(0, 2, 10, chunks=(5,))
    >>> from sklearn.linear_model import SGDClassifier
    >>> sgd = SGDClassifier()
    >>> sgd = da.learn.fit(sgd, X, y, classes=[1, 0])
    >>> sgd  # doctest: +SKIP
    SGDClassifier(alpha=0.0001, class_weight=None, epsilon=0.1, eta0=0.0,
            fit_intercept=True, l1_ratio=0.15, learning_rate='optimal',
            loss='hinge', n_iter=5, n_jobs=1, penalty='l2', power_t=0.5,
            random_state=None, shuffle=False, verbose=0, warm_start=False)

    This passes all of X and y through the classifier sequentially.  We can
    use the classifier as normal on in-memory data

    >>> import numpy as np
    >>> sgd.predict(np.random.random((4, 3)))  # doctest: +SKIP
    array([1, 0, 0, 1])

    Or predict on a larger dataset

    >>> z = da.random.random((400, 3), chunks=(100, 3))
    >>> da.learn.predict(sgd, z)  # doctest: +SKIP
    dask.array<x_11, shape=(400,), chunks=((100, 100, 100, 100),), dtype=int64>
    """
    if not hasattr(x, "chunks") and hasattr(x, "to_dask_array"):
        x = x.to_dask_array()

    assert x.ndim == 2
    if y is not None:
        if not hasattr(y, "chunks") and hasattr(y, "to_dask_array"):
            y = y.to_dask_array()
        assert y.ndim == 1
        assert x.chunks[0] == y.chunks[0]
    assert hasattr(model, "partial_fit")

    if len(x.chunks[1]) > 1:
        x = x.rechunk(chunks=(x.chunks[0], sum(x.chunks[1])))

    nblocks = len(x.chunks[0])
    order = list(range(nblocks))
    if shuffle_blocks:
        rng = sklearn.utils.check_random_state(random_state)
        rng.shuffle(order)

    name = "fit-" + dask.base.tokenize(model, x, y, kwargs, order)

    # Chain partial_fit calls: block i depends on the model state produced by
    # block i - 1, so the blocks are fit strictly in sequence
    dsk = {(name, -1): model}
    dsk.update(
        {
            (name, i): (
                _partial_fit,
                (name, i - 1),
                (x.name, order[i], 0),
                (getattr(y, "name", ""), order[i]),
                kwargs,
            )
            for i in range(nblocks)
        }
    )

    graphs = {x.name: x.__dask_graph__(), name: dsk}
    if hasattr(y, "__dask_graph__"):
        graphs[y.name] = y.__dask_graph__()

    try:
        from dask.highlevelgraph import HighLevelGraph

        new_dsk = HighLevelGraph.merge(*graphs.values())
    except ImportError:
        from dask import sharedict

        new_dsk = sharedict.merge(*graphs.values())

    value = Delayed((name, nblocks - 1), new_dsk)

    if compute:
        return value.compute()
    else:
        return value