def persistent_collections_to_dsk(collections, key=None, serializers=None,
                                  cache=None, *args, **kwargs):
    """Wrapper around dask.base.collections_to_dsk.

    *args and **kwargs are passed to collections_to_dsk.
    """
    dsk = collections_to_dsk(collections, *args, **kwargs)

    if key is not None:
        dsk, _ = cull(dsk, key)

    if serializers is not None:
        # load from the on-disk cache instead of computing
        dsk_serialized = get_relevant_keys_from_on_disk_cache(dsk, serializers)
        dsk.update(dsk_serialized)

    if cache is not None:
        # use the in-memory cache instead of loading
        dsk_cached = get_relevant_keys_from_memory_cache(dsk, cache)
        dsk.update(dsk_cached)

    # cull again after tasks have been replaced by loads or cached values
    if key is not None:
        dsk, _ = cull(dsk, key)

    return dsk
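# A minimal sketch of the cull / replace / re-cull pattern above, using
# dask.optimization.cull; `inc` and the literal "cache hit" are stand-ins,
# not part of the original code.
from dask.optimization import cull

def inc(x):
    return x + 1

dsk = {'a': 1, 'b': (inc, 'a'), 'c': (inc, 'b'), 'unused': (inc, 'a')}
dsk, _ = cull(dsk, 'c')          # first cull drops 'unused'
dsk['b'] = 2                     # pretend 'b' was found in a cache
dsk, _ = cull(dsk, 'c')          # second cull drops 'a', now unreachable
assert dsk == {'b': 2, 'c': (inc, 'b')}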
def optimize(dsk, keys, **kwargs):
    flatkeys = list(flatten(keys)) if isinstance(keys, list) else [keys]
    dsk, dependencies = cull(dsk, flatkeys)
    dsk, dependencies = fuse(dsk, keys, dependencies=dependencies,
                             ave_width=_globals.get('fuse_ave_width', 1))
    dsk, _ = cull(dsk, keys)
    return dsk
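# For context, a hedged sketch of what the cull -> fuse -> cull pipeline does
# to a small graph, using dask.optimization directly; `inc` is a stand-in.
from dask.optimization import cull, fuse

def inc(x):
    return x + 1

dsk = {'a': 1, 'b': (inc, 'a'), 'c': (inc, 'b'), 'z': (inc, 'a')}
culled, deps = cull(dsk, ['c'])                       # drops 'z'
fused, deps = fuse(culled, ['c'], dependencies=deps, ave_width=1)
assert 'z' not in fused          # the linear chain under 'c' is now one task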
def __getitem__(self, geometry):
    if isinstance(geometry, BaseGeometry) or getattr(
            geometry, "__geo_interface__", None) is not None:
        image = GeoImage.__getitem__(self, geometry)
        image._ipe_op = self._ipe_op
        return image
    else:
        result = super(IpeImage, self).__getitem__(geometry)
        dsk, _ = optimize.cull(result.dask, result.__dask_keys__())
        image = super(IpeImage, self.__class__).__new__(
            self.__class__, dsk, result.name, result.chunks,
            result.dtype, result.shape)

        if (all([isinstance(e, slice) for e in geometry])
                and len(geometry) == len(self.shape)):
            xmin, ymin, xmax, ymax = (geometry[2].start, geometry[1].start,
                                      geometry[2].stop, geometry[1].stop)
            xmin = 0 if xmin is None else xmin
            ymin = 0 if ymin is None else ymin
            xmax = self.shape[2] if xmax is None else xmax
            ymax = self.shape[1] if ymax is None else ymax

            g = ops.transform(self.__geo_transform__.fwd,
                              box(xmin, ymin, xmax, ymax))
            image.__geo_interface__ = mapping(g)
            image.__geo_transform__ = self.__geo_transform__ + (xmin, ymin)
        else:
            image.__geo_interface__ = self.__geo_interface__
            image.__geo_transform__ = self.__geo_transform__

        image._ipe_op = self._ipe_op
        return image
def fuse_dag(dag, copy=False):
    if copy:
        dag = deepcopy(dag)
    dag = dict(dag)
    dsk, dep = cull(dag, list(dag.keys()))
    dsk, dep = fuse(dsk, ave_width=1, rename_keys=False)
    return dsk
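# Hedged usage sketch of fuse_dag above: with rename_keys=False the fused
# task keeps the name of its terminal key, so outputs stay addressable.
# `inc` is a stand-in task function, not part of the original code.
def inc(x):
    return x + 1

dag = {'a': 1, 'b': (inc, 'a'), 'c': (inc, 'b')}
fused = fuse_dag(dag, copy=True)
assert 'c' in fused              # 'a' and 'b' were folded into 'c''s task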
def test_fuse_getitem():
    def load(*args):
        pass
    dsk = {'x': (load, 'store', 'part', ['a', 'b']),
           'y': (getitem, 'x', 'a')}
    dsk2 = fuse_getitem(dsk, load, 3)
    dsk2 = cull(dsk2, 'y')
    assert dsk2 == {'y': (load, 'store', 'part', 'a')}
def _cull(dsk, keys=None):
    if not keys:
        return dsk
    elif "token" not in dsk:
        return dsk
    if "token" not in keys:
        keys.append("token")
    dsk1, _ = optimize.cull(dsk, keys)
    return dsk1
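# The guard above keeps a sentinel "token" entry alive through culling.
# A hedged sketch of that behavior, assuming dask.optimization is the
# `optimize` module used above; `inc` is a stand-in.
from dask import optimization as optimize

def inc(x):
    return x + 1

dsk = {'token': 'abc123', 'a': 1, 'b': (inc, 'a')}
culled = _cull(dict(dsk), keys=['b'])
assert 'token' in culled         # kept even though 'b' never uses it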
def test_fuse_selections():
    def load(*args):
        pass
    dsk = {'x': (load, 'store', 'part', ['a', 'b']),
           'y': (getitem, 'x', 'a')}
    merge = lambda t1, t2: (load, t2[1], t2[2], t1[2])
    dsk2 = fuse_selections(dsk, getitem, load, merge)
    dsk2 = cull(dsk2, 'y')
    assert dsk2 == {'y': (load, 'store', 'part', 'a')}
def test_inline_cull_dependencies():
    d = {'a': 1,
         'b': 'a',
         'c': 'b',
         'd': ['a', 'b', 'c'],
         'e': (add, (len, 'd'), 'a')}
    d2, dependencies = cull(d, ['d', 'e'])
    inline(d2, {'b'}, dependencies=dependencies)
def test_cull():
    # 'out' depends on 'x' and 'y', but not 'z'
    d = {'x': 1, 'y': (inc, 'x'), 'z': (inc, 'x'), 'out': (add, 'y', 10)}
    culled = cull(d, 'out')
    assert culled == {'x': 1, 'y': (inc, 'x'), 'out': (add, 'y', 10)}
    assert cull(d, 'out') == cull(d, ['out'])
    assert cull(d, ['out', 'z']) == d
    assert cull(d, [['out'], ['z']]) == cull(d, ['out', 'z'])
    assert raises(KeyError, lambda: cull(d, 'badkey'))
def get(self, dsk, keys, optimize_graph=True, docker='lensa/dask.mesos',
        params={}, threaded=True, **kwargs):
    # should be a `mode` option instead: threaded / mesos / hybrid
    """Compute dask graph

    Parameters
    ----------
    dsk: dict
    keys: object, or nested lists of objects
    optimize_graph: bool
    docker: string, default docker image for computations on mesos
    params: dict, mesos options per dask key
    threaded: bool, offload tasks without mesos parameters to threads

    Examples
    --------
    >>> from operator import add  # doctest: +SKIP
    >>> e = MesosExecutor('127.0.0.1:8787')  # doctest: +SKIP
    >>> e.get({'x': (add, 1, 2)}, 'x')  # doctest: +SKIP
    3

    See Also
    --------
    Executor.compute: Compute asynchronous collections
    """
    # Optimize Dask
    dsk2, dependencies = cull(dsk, keys)
    if optimize_graph:
        dsk3, dependencies = fuse(dsk2, keys, dependencies)
    else:
        dsk3 = dsk2

    def apply_async(execute_task, args):
        key = args[0]
        if threaded and key not in params:
            logging.info('Task `{}` is calculating in threads'.format(key))
            return self.threadpool.submit(execute_task, *args)

        options = params.get(key, {})
        options['id'] = key
        if 'docker' not in options:
            options['docker'] = docker

        logging.info('Task `{}` is calculating on mesos'.format(key))
        return self.submit(execute_task, args, **options)

    # Run
    queue = Queue(self.zk, str(uuid4()))
    result = get_async(apply_async, 1e4, dsk3, keys, queue=queue, **kwargs)
    return result
def test_cull():
    # 'out' depends on 'x' and 'y', but not 'z'
    d = {'x': 1, 'y': (inc, 'x'), 'z': (inc, 'x'), 'out': (add, 'y', 10)}
    culled, dependencies = cull(d, 'out')
    assert culled == {'x': 1, 'y': (inc, 'x'), 'out': (add, 'y', 10)}
    assert dependencies == {'x': [], 'y': ['x'], 'out': ['y']}

    assert cull(d, 'out') == cull(d, ['out'])
    assert cull(d, ['out', 'z'])[0] == d
    assert cull(d, [['out'], ['z']]) == cull(d, ['out', 'z'])
    pytest.raises(KeyError, lambda: cull(d, 'badkey'))
def __dask_optimize__(cls, dsk, keys):
    dsk1, _ = optimize.cull(dsk, keys)
    dsk2 = {}
    coll = []
    for key, val in dsk1.items():
        if isinstance(key, tuple) and key[0].startswith('image'):
            name, z, x, y = key
            dfn, url, token, chunk = val
            dsk2[key] = (operator.getitem, "load_urls", (z, x, y))
            coll.append([url, token, (z, x, y)])
        else:
            dsk2[key] = val
    dsk2['load_urls'] = (cls.__fetch__, coll)
    return dsk2
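# The rewrite above batches every tile fetch into a single `load_urls` task
# and points each image key at it via operator.getitem. A hedged sketch of
# the resulting graph shape (keys and bytes are stand-ins):
import operator
from dask import get

dsk2 = {
    'load_urls': {(0, 1, 2): b'tile-bytes'},   # stand-in for __fetch__'s result
    ('image-abc', 0, 1, 2): (operator.getitem, 'load_urls', (0, 1, 2)),
}
assert get(dsk2, ('image-abc', 0, 1, 2)) == b'tile-bytes'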
def get(dsk, keys, optimizations=[], num_workers=None,
        docker='lensa/dask.mesos',
        zk=os.getenv('ZOOKEEPER_HOST', '127.0.0.1:2181'),
        mesos=os.getenv('MESOS_MASTER', '127.0.0.1:5050'),
        **kwargs):
    """Mesos get function appropriate for Bags

    Parameters
    ----------
    dsk: dict
        dask graph
    keys: object or list
        Desired results from graph
    optimizations: list of functions
        optimizations to perform on graph before execution
    num_workers: int
        Number of worker processes (defaults to number of cores)
    docker: string
        Default docker image name to run the dask in
    zk: string
        Zookeeper host and port the distributed Queue should connect to
    mesos: string
        Mesos Master hostname and port the Satyr framework should connect to
    """
    pool, kazoo = _globals['pool'], _globals['kazoo']

    if pool is None:
        pool = Pool(name='dask-pool', master=mesos, processes=num_workers)
        pool.start()
        cleanup_pool = True
    else:
        cleanup_pool = False

    if kazoo is None:
        kazoo = KazooClient(hosts=zk)
        kazoo.start()
        cleanup_kazoo = True
    else:
        cleanup_kazoo = False

    # Optimize Dask
    dsk2, dependencies = cull(dsk, keys)
    dsk3, dependencies = fuse(dsk2, keys, dependencies)
    dsk4 = pipe(dsk3, *optimizations)

    def apply_async(execute_task, args):
        key = args[0]
        func = args[1][0]
        params = func.params if isinstance(func, SatyrPack) else {}
        params['id'] = key
        if 'docker' not in params:
            params['docker'] = docker
        return pool.apply_async(execute_task, args, **params)

    try:
        # Run
        queue = Queue(kazoo, str(uuid4()))
        result = get_async(apply_async, 1e4, dsk4, keys, queue=queue,
                           **kwargs)
    finally:
        if cleanup_kazoo:
            kazoo.stop()
        if cleanup_pool:
            pool.stop()

    return result
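# Hedged usage sketch of the Bag-oriented get above, assuming a reachable
# Mesos master and Zookeeper at the default addresses:
from operator import add

result = get({'x': (add, 1, 2)}, 'x')
assert result == 3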
def is_valid(self, layer='_output'):
    return 'data_0' in cull(self.graph, layer)[0]
def warp(self, dem=None, proj="EPSG:4326", **kwargs):
    """Delayed warp across an entire AOI or Image.

    Creates a new dask image by deferring calls to warp_geometry on chunks.

    kwargs:
        dem (ndarray): optional. A DEM for warping to specific elevation planes
        proj (str): optional. An EPSG proj string to project the image data into ("EPSG:32612")

    Returns:
        image (dask): a warped image as a deferred image array (a dask)
    """
    try:
        img_md = self.ipe.metadata["image"]
        x_size = img_md["tileXSize"]
        y_size = img_md["tileYSize"]
    except (AttributeError, KeyError):
        x_size = kwargs.get("chunk_size", 256)
        y_size = kwargs.get("chunk_size", 256)

    # Create an affine transform to convert between real-world and pixels
    if self.proj is None:
        from_proj = "EPSG:4326"
    else:
        from_proj = self.proj

    try:
        # NOTE: this only works on images that have IPE rpcs metadata
        center = wkt.loads(
            self.ipe.metadata["image"]["imageBoundsWGS84"]).centroid
        g = box(*(center.buffer(self.ipe.metadata["rpcs"]["gsd"] / 2).bounds))
        tfm = partial(pyproj.transform, pyproj.Proj(init="EPSG:4326"),
                      pyproj.Proj(init=proj))
        gsd = kwargs.get("gsd", ops.transform(tfm, g).area ** 0.5)
        current_bounds = wkt.loads(
            self.ipe.metadata["image"]["imageBoundsWGS84"]).bounds
    except (AttributeError, KeyError, TypeError):
        tfm = partial(pyproj.transform, pyproj.Proj(init=self.proj),
                      pyproj.Proj(init=proj))
        gsd = kwargs.get("gsd", (ops.transform(tfm, shape(self)).area /
                                 (self.shape[1] * self.shape[2])) ** 0.5)
        current_bounds = self.bounds

    tfm = partial(pyproj.transform, pyproj.Proj(init=from_proj),
                  pyproj.Proj(init=proj))
    itfm = partial(pyproj.transform, pyproj.Proj(init=proj),
                   pyproj.Proj(init=from_proj))
    output_bounds = ops.transform(tfm, box(*current_bounds)).bounds
    gtf = Affine.from_gdal(output_bounds[0], gsd, 0.0,
                           output_bounds[3], 0.0, -1 * gsd)

    ll = ~gtf * (output_bounds[:2])
    ur = ~gtf * (output_bounds[2:])
    x_chunks = int((ur[0] - ll[0]) / x_size) + 1
    y_chunks = int((ll[1] - ur[1]) / y_size) + 1
    num_bands = self.shape[0]

    try:
        dtype = IPE_TO_DTYPE[img_md["dataType"]]
    except (NameError, KeyError):
        dtype = 'uint8'

    daskmeta = {
        "dask": {},
        "chunks": (num_bands, y_size, x_size),
        "dtype": dtype,
        "name": "warp-{}".format(self.name),
        "shape": (num_bands, y_chunks * y_size, x_chunks * x_size)
    }

    def px_to_geom(xmin, ymin):
        xmax = int(xmin + x_size)
        ymax = int(ymin + y_size)
        bounds = list((gtf * (xmin, ymax)) + (gtf * (xmax, ymin)))
        return box(*bounds)

    full_bounds = box(*output_bounds)

    dasks = []
    if isinstance(dem, GeoImage):
        if dem.proj != proj:
            dem = dem.warp(proj=proj, dem=dem)
        dasks.append(dem.dask)

    for y in xrange(y_chunks):
        for x in xrange(x_chunks):
            xmin = x * x_size
            ymin = y * y_size
            geometry = px_to_geom(xmin, ymin)
            daskmeta["dask"][(daskmeta["name"], 0, y, x)] = (
                self._warp, geometry, gsd, dem, proj, dtype, 5)

    daskmeta["dask"], _ = optimize.cull(
        sharedict.merge(daskmeta["dask"], *dasks),
        list(daskmeta["dask"].keys()))

    result = GeoDaskWrapper(daskmeta, self)
    result.__geo_interface__ = mapping(full_bounds)
    result.__geo_transform__ = AffineTransform(gtf, proj)
    return GeoImage.__getitem__(result, box(*output_bounds))
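# Hedged usage sketch, assuming an IpeImage-like object `img` exposing the
# warp method above:
warped = img.warp(proj="EPSG:32612", gsd=10)   # reproject to UTM 12N, 10 m GSD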
def is_valid(self, layer=512):
    return '_graph_input' in cull(self.graph, layer)[0]
def optimize(dsk, keys, **kwargs):
    dsk, _ = cull(dsk, keys)
    return dsk
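# When fusion is undesirable, a cull-only optimize like the one above simply
# prunes tasks the requested keys never reach. A minimal check, assuming
# dask.optimization.cull is the `cull` in scope; `inc` is a stand-in.
from dask.optimization import cull

def inc(x):
    return x + 1

dsk = {'a': 1, 'b': (inc, 'a'), 'z': (inc, 'a')}
assert optimize(dsk, ['b']) == {'a': 1, 'b': (inc, 'a')}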