示例#1
0
def persistent_collections_to_dsk(collections,
                                  key=None,
                                  serializers=None,
                                  cache=None,
                                  *args,
                                  **kwargs):
    """
    Wrapper around dask.base.collections_to_dsk.

    Builds the graph for `collections`, then optionally overlays entries
    obtained from an on-disk cache (`serializers`) and from an in-memory
    cache (`cache`). The in-memory overlay is applied last, so it takes
    precedence over the on-disk one for keys present in both.

    When `key` is given, the graph is culled to `key` both before and
    after the overlays are applied.

    *args and **kwargs are passed to collections_to_dsk. Because `*args`
    follows the three optional parameters, extra positional arguments can
    only be supplied after `key`, `serializers` and `cache`.
    """
    graph = collections_to_dsk(collections, *args, **kwargs)
    restrict_to_key = key is not None

    if restrict_to_key:
        graph, _ = cull(graph, key)

    # Replace computations by loads from the on-disk cache.
    if serializers is not None:
        graph.update(get_relevant_keys_from_on_disk_cache(graph, serializers))

    # Replace loads/computations by values already held in memory.
    if cache is not None:
        graph.update(get_relevant_keys_from_memory_cache(graph, cache))

    # Cull again: tasks that were replaced by loads or values may have
    # left some of their former dependencies unused.
    if restrict_to_key:
        graph, _ = cull(graph, key)

    return graph
示例#2
0
文件: core.py 项目: sklam/dask_gdf
def optimize(dsk, keys, **kwargs):
    """Cull `dsk` to `keys`, fuse the result, then cull once more.

    `keys` may be a single key or a (possibly nested) list of keys; the
    first cull receives a flattened list. The fuse width is read from
    ``_globals['fuse_ave_width']`` and defaults to 1. Extra keyword
    arguments are accepted but not used.
    """
    if isinstance(keys, list):
        flat_keys = list(flatten(keys))
    else:
        flat_keys = [keys]

    culled, deps = cull(dsk, flat_keys)
    fused, _ = fuse(culled,
                    keys,
                    dependencies=deps,
                    ave_width=_globals.get('fuse_ave_width', 1))
    final, _ = cull(fused, keys)
    return final
示例#3
0
    def __getitem__(self, geometry):
        """Index the image by a geometry or by array-style indices.

        If `geometry` is a ``BaseGeometry`` or exposes a non-None
        ``__geo_interface__``, the lookup is delegated to
        ``GeoImage.__getitem__``. Otherwise the next ``__getitem__`` after
        ``IpeImage`` in the MRO handles the index, the resulting graph is
        culled to the keys of that result, and a new instance of this
        object's class is built from it with geo metadata attached.

        In both cases ``_ipe_op`` is copied from this image onto the result.
        """
        if isinstance(geometry, BaseGeometry) or getattr(
                geometry, "__geo_interface__", None) is not None:
            # Geometry-like index: let GeoImage do the spatial lookup.
            image = GeoImage.__getitem__(self, geometry)
            image._ipe_op = self._ipe_op
            return image
        else:
            result = super(IpeImage, self).__getitem__(geometry)
            # Keep only the tasks needed to produce the indexed result.
            dsk, _ = optimize.cull(result.dask, result.__dask_keys__())
            # Build a new instance of this object's class directly from the
            # culled graph and the result's name/chunks/dtype/shape.
            image = super(IpeImage,
                          self.__class__).__new__(self.__class__, dsk,
                                                  result.name, result.chunks,
                                                  result.dtype, result.shape)

            # Pixel bounds are only derived when every index is a slice and
            # all axes are indexed. Axis 2 is read as x and axis 1 as y.
            # NOTE(review): a non-iterable index (e.g. a bare int or a single
            # slice) would raise TypeError in the comprehension below —
            # confirm callers always pass a tuple here.
            if all([isinstance(e, slice)
                    for e in geometry]) and len(geometry) == len(self.shape):
                xmin, ymin, xmax, ymax = geometry[2].start, geometry[
                    1].start, geometry[2].stop, geometry[1].stop
                # Open-ended slices default to the full extent of the axis.
                xmin = 0 if xmin is None else xmin
                ymin = 0 if ymin is None else ymin
                xmax = self.shape[2] if xmax is None else xmax
                ymax = self.shape[1] if ymax is None else ymax

                # Map the pixel box through the forward geo transform to get
                # the geometry of the selected window.
                g = ops.transform(self.__geo_transform__.fwd,
                                  box(xmin, ymin, xmax, ymax))
                image.__geo_interface__ = mapping(g)
                # presumably `+` offsets the transform by the window origin —
                # TODO confirm against the transform class
                image.__geo_transform__ = self.__geo_transform__ + (xmin, ymin)
            else:
                # No window could be derived: reuse this image's geo metadata.
                image.__geo_interface__ = self.__geo_interface__
                image.__geo_transform__ = self.__geo_transform__
            image._ipe_op = self._ipe_op
            return image
示例#4
0
def fuse_dag(dag, copy=False):
    """Return a culled and fused plain-dict version of `dag`.

    The graph is culled against all of its own keys and then fused with
    ``ave_width=1`` and ``rename_keys=False``. When `copy` is true the
    input is deep-copied first; in either case a new top-level dict is
    built before processing.
    """
    source = deepcopy(dag) if copy else dag
    graph = dict(source)
    culled, _ = cull(graph, graph.keys())
    fused, _ = fuse(culled, ave_width=1, rename_keys=False)
    return fused
示例#5
0
def test_fuse_getitem():
    """fuse_getitem folds a getitem on a load task into the load itself."""
    def load(*args):
        pass

    graph = {
        'x': (load, 'store', 'part', ['a', 'b']),
        'y': (getitem, 'x', 'a'),
    }
    expected = {'y': (load, 'store', 'part', 'a')}

    fused = fuse_getitem(graph, load, 3)
    culled = cull(fused, 'y')
    assert culled == expected
示例#6
0
def test_fuse_getitem():
    """After fusing and culling to 'y', only the merged load task remains."""
    def load(*args):
        pass

    load_task = (load, 'store', 'part', ['a', 'b'])
    select_task = (getitem, 'x', 'a')
    graph = {'x': load_task, 'y': select_task}

    result = cull(fuse_getitem(graph, load, 3), 'y')
    assert result == {'y': (load, 'store', 'part', 'a')}
示例#7
0
 def _cull(dsk, keys=None):
     """Cull `dsk` to `keys`, always keeping the "token" task.

     Returns `dsk` unchanged when `keys` is empty/None or when the graph
     has no "token" entry. Otherwise "token" is added to the requested
     keys (if not already there) and the graph culled by
     ``optimize.cull`` is returned.

     The caller's `keys` list is never modified.
     """
     if not keys or "token" not in dsk:
         return dsk
     if "token" not in keys:
         # Build a new list rather than appending in place, so the
         # caller's list does not silently gain a "token" entry.
         keys = keys + ["token"]
     dsk1, _ = optimize.cull(dsk, keys)
     return dsk1
示例#8
0
def test_fuse_selections():
    """fuse_selections merges a getitem selection into the load task."""
    def load(*args):
        pass

    def merge(t1, t2):
        # Rebuild the load task using the selection taken from the getitem.
        return (load, t2[1], t2[2], t1[2])

    graph = {
        'x': (load, 'store', 'part', ['a', 'b']),
        'y': (getitem, 'x', 'a'),
    }
    fused = fuse_selections(graph, getitem, load, merge)
    culled = cull(fused, 'y')
    assert culled == {'y': (load, 'store', 'part', 'a')}
示例#9
0
def test_inline_cull_dependencies():
    """inline accepts the dependencies mapping produced by cull."""
    graph = {
        'a': 1,
        'b': 'a',
        'c': 'b',
        'd': ['a', 'b', 'c'],
        'e': (add, (len, 'd'), 'a'),
    }

    culled, deps = cull(graph, ['d', 'e'])
    # Passes as long as inline does not raise.
    inline(culled, {'b'}, dependencies=deps)
示例#10
0
def test_fuse_selections():
    """A getitem on a load task is fused into one load of the selected part."""
    def load(*args):
        pass

    def merge(t1, t2):
        # t1 is the getitem task, t2 the load task it selects from.
        return (load, t2[1], t2[2], t1[2])

    load_task = (load, 'store', 'part', ['a', 'b'])
    graph = {'x': load_task, 'y': (getitem, 'x', 'a')}
    expected = {'y': (load, 'store', 'part', 'a')}

    result = cull(fuse_selections(graph, getitem, load, merge), 'y')
    assert result == expected
示例#11
0
def test_cull():
    """cull keeps only the tasks the requested keys depend on."""
    # 'out' needs 'x' and 'y'; 'z' is unrelated to it.
    graph = {'x': 1, 'y': (inc, 'x'), 'z': (inc, 'x'), 'out': (add, 'y', 10)}
    without_z = {'x': 1, 'y': (inc, 'x'), 'out': (add, 'y', 10)}

    result = cull(graph, 'out')
    assert result == without_z

    # A bare key, a flat list and a nested list are all accepted.
    assert cull(graph, 'out') == cull(graph, ['out'])
    assert cull(graph, ['out', 'z']) == graph
    assert cull(graph, [['out'], ['z']]) == cull(graph, ['out', 'z'])

    # Unknown keys are an error.
    assert raises(KeyError, lambda: cull(graph, 'badkey'))
示例#12
0
def test_cull():
    """Culling to 'out' drops 'z'; key spellings are interchangeable."""
    graph = {'x': 1, 'y': (inc, 'x'), 'z': (inc, 'x'), 'out': (add, 'y', 10)}

    only_out = cull(graph, 'out')
    assert only_out == {'x': 1, 'y': (inc, 'x'), 'out': (add, 'y', 10)}

    # Single key vs. one-element list.
    assert cull(graph, 'out') == cull(graph, ['out'])
    # Asking for every leaf returns the whole graph.
    assert cull(graph, ['out', 'z']) == graph
    # Nested key lists are flattened.
    assert cull(graph, [['out'], ['z']]) == cull(graph, ['out', 'z'])
    # A missing key raises KeyError.
    assert raises(KeyError, lambda: cull(graph, 'badkey'))
def test_inline_cull_dependencies():
    """The dependencies returned by cull can be handed straight to inline."""
    graph = {'a': 1, 'b': 'a', 'c': 'b'}
    graph['d'] = ['a', 'b', 'c']
    graph['e'] = (add, (len, 'd'), 'a')

    culled, deps = cull(graph, ['d', 'e'])
    # No assertion: the call must simply complete without raising.
    inline(culled, {'b'}, dependencies=deps)
示例#14
0
    def get(self,
            dsk,
            keys,
            optimize_graph=True,
            docker='lensa/dask.mesos',
            params={},
            threaded=True,
            **kwargs):  # should be mode instead: threaded / mesos / hybrid
        """ Compute dask graph

        The graph is culled to `keys` (and fused when `optimize_graph` is
        true) and then run through ``get_async``. Each task is dispatched
        either to ``self.threadpool`` or to Mesos via ``self.submit``.

        Parameters
        ----------
        dsk: dict
        keys: object, or nested lists of objects
        optimize_graph: bool
            fuse the culled graph before running it
        docker: string, default docker image for computations on mesos
        params: dict, mesos options per dask key
            Only read, never modified: per-key option dicts are copied
            before 'id' and 'docker' are filled in.
        threaded: bool, offload task without mesos parameters to threads
        **kwargs
            forwarded to ``get_async``

        Returns
        -------
        Whatever ``get_async`` returns for `keys`.

        Examples
        --------
        >>> from operator import add  # doctest: +SKIP
        >>> e = MesosExecutor('127.0.0.1:8787')  # doctest: +SKIP
        >>> e.get({'x': (add, 1, 2)}, 'x')  # doctest: +SKIP
        3

        See Also
        --------
        Executor.compute: Compute asynchronous collections
        """
        # Optimize Dask
        dsk2, dependencies = cull(dsk, keys)
        if optimize_graph:
            dsk3, dependencies = fuse(dsk2, keys, dependencies)
        else:
            dsk3 = dsk2

        def apply_async(execute_task, args):
            key = args[0]

            # Tasks without explicit mesos options run locally in threads.
            if threaded and key not in params:
                logging.info('Task `{}` is calculating in threads'.format(key))
                return self.threadpool.submit(execute_task, *args)

            # Copy the per-key options so that filling in 'id' and 'docker'
            # does not write into the dict the caller passed in.
            options = dict(params.get(key, {}))
            options['id'] = key
            if 'docker' not in options:
                options['docker'] = docker
            logging.info('Task `{}` is calculating on mesos'.format(key))
            return self.submit(execute_task, args, **options)

        # Run
        queue = Queue(self.zk, str(uuid4()))
        result = get_async(apply_async, 1e4, dsk3, keys, queue=queue, **kwargs)

        return result
def test_cull():
    """cull returns the reduced graph together with its dependency map."""
    # 'out' needs 'x' and 'y'; 'z' is unrelated to it.
    graph = {'x': 1, 'y': (inc, 'x'), 'z': (inc, 'x'), 'out': (add, 'y', 10)}
    expected_graph = {'x': 1, 'y': (inc, 'x'), 'out': (add, 'y', 10)}
    expected_deps = {'x': [], 'y': ['x'], 'out': ['y']}

    culled, dependencies = cull(graph, 'out')
    assert culled == expected_graph
    assert dependencies == expected_deps

    # A bare key, a flat list and a nested list are all accepted.
    assert cull(graph, 'out') == cull(graph, ['out'])
    assert cull(graph, ['out', 'z'])[0] == graph
    assert cull(graph, [['out'], ['z']]) == cull(graph, ['out', 'z'])

    # Unknown keys are an error.
    pytest.raises(KeyError, lambda: cull(graph, 'badkey'))
示例#16
0
def test_cull():
    """Culling to 'out' drops 'z' and reports per-key dependencies."""
    graph = {'x': 1, 'y': (inc, 'x'), 'z': (inc, 'x'), 'out': (add, 'y', 10)}

    culled, dependencies = cull(graph, 'out')
    assert culled == {'x': 1, 'y': (inc, 'x'), 'out': (add, 'y', 10)}
    assert dependencies == {'x': [], 'y': ['x'], 'out': ['y']}

    # Single key vs. one-element list.
    assert cull(graph, 'out') == cull(graph, ['out'])
    # Asking for every leaf returns the whole graph.
    assert cull(graph, ['out', 'z'])[0] == graph
    # Nested key lists are flattened.
    assert cull(graph, [['out'], ['z']]) == cull(graph, ['out', 'z'])
    # A missing key raises KeyError.
    pytest.raises(KeyError, lambda: cull(graph, 'badkey'))
示例#17
0
 def __dask_optimize__(cls, dsk, keys):
     """Cull the graph and batch all image tasks into one 'load_urls' task.

     Every task whose key is a tuple starting with a string that begins
     with 'image' is replaced by a ``getitem`` on the shared 'load_urls'
     result, indexed by the key's last three elements. The url and token of
     each replaced task are collected and handed to ``cls.__fetch__`` in
     the single 'load_urls' task. All other tasks are kept as they are.
     """
     culled, _ = optimize.cull(dsk, keys)
     optimized = {}
     requests = []
     for task_key, task in culled.items():
         is_image_task = (isinstance(task_key, tuple)
                          and task_key[0].startswith('image'))
         if not is_image_task:
             optimized[task_key] = task
             continue

         # Image tasks are expected to be 4-tuples keyed by 4-tuples.
         _name, z, x, y = task_key
         _dfn, url, token, _chunk = task
         index = (z, x, y)
         optimized[task_key] = (operator.getitem, "load_urls", index)
         requests.append([url, token, index])
     optimized['load_urls'] = (cls.__fetch__, requests)
     return optimized
示例#18
0
def get(dsk, keys, optimizations=[], num_workers=None,
        docker='lensa/dask.mesos',
        zk=os.getenv('ZOOKEEPER_HOST', '127.0.0.1:2181'),
        mesos=os.getenv('MESOS_MASTER', '127.0.0.1:5050'),
        **kwargs):
    """Mesos get function appropriate for Bags

    The graph is culled to `keys`, fused, passed through each function in
    `optimizations`, and the resulting graph is executed with
    ``get_async``. A pool and a Zookeeper client are taken from
    ``_globals`` when set there; otherwise they are created here and
    stopped again before returning, including when an error is raised.

    Parameters
    ----------

    dsk: dict
        dask graph
    keys: object or list
        Desired results from graph
    optimizations: list of functions
        optimizations to perform on graph before execution
        (the default list is only read, never modified)
    num_workers: int
        Number of worker processes (defaults to number of cores)
    docker: string
        Default docker image name to run the dask in
    zk: string
        Zookeeper host and port the distributed Queue should connect to
    mesos: string
        Mesos Master hostname and port the Satyr framework should connect to
    **kwargs
        forwarded to ``get_async``

    Returns
    -------
    Whatever ``get_async`` returns for `keys`.
    """
    pool, kazoo = _globals['pool'], _globals['kazoo']

    # Only resources created in this call are torn down in this call.
    cleanup_pool = pool is None
    cleanup_kazoo = False

    if cleanup_pool:
        pool = Pool(name='dask-pool', master=mesos, processes=num_workers)
        pool.start()

    def apply_async(execute_task, args):
        key = args[0]
        func = args[1][0]
        # Copy the task's params so that adding 'id' and 'docker' does not
        # write into the shared SatyrPack object.
        params = dict(func.params) if isinstance(func, SatyrPack) else {}

        params['id'] = key
        if 'docker' not in params:
            params['docker'] = docker

        return pool.apply_async(execute_task, args, **params)

    try:
        # Everything after the pool is started runs inside the try, so a
        # failure while connecting or optimizing still stops the pool.
        if kazoo is None:
            kazoo = KazooClient(hosts=zk)
            kazoo.start()
            cleanup_kazoo = True

        # Optimize Dask
        dsk2, dependencies = cull(dsk, keys)
        dsk3, dependencies = fuse(dsk2, keys, dependencies)
        dsk4 = pipe(dsk3, *optimizations)

        # Run the fully optimized graph, so `optimizations` take effect.
        queue = Queue(kazoo, str(uuid4()))
        result = get_async(apply_async, 1e4, dsk4, keys, queue=queue, **kwargs)
    finally:
        if cleanup_kazoo:
            kazoo.stop()
        if cleanup_pool:
            pool.stop()

    return result
示例#19
0
 def is_valid(self, layer='_output'):
     """Return True if 'data_0' is still in the graph after culling to `layer`."""
     cull_result = cull(self.graph, layer)
     reachable = cull_result[0]
     return 'data_0' in reachable
示例#20
0
    def warp(self, dem=None, proj="EPSG:4326", **kwargs):
        """
        Delayed warp across an entire AOI or Image
        creates a new dask image by deferring calls to the warp_geometry on chunks

        One task calling ``self._warp`` is registered per output chunk; nothing
        is computed here.

        kwargs:
            dem: optional. A DEM for warping to specific elevation planes. When it
                is a GeoImage in a different projection it is warped to `proj`
                first and its graph is merged into the result.
            proj (str): optional. An EPSG proj string to project the image data into ("EPSG:32612")
            chunk_size (int): optional. Tile edge length used when no IPE tile
                size metadata is available (default 256)
            gsd (float): optional. Output pixel size in units of `proj`;
                estimated from the image when omitted

        Returns:
            image (dask): a warped image as deferred image array (a dask)
        """
        # Tile size comes from the IPE image metadata when present, otherwise
        # from the `chunk_size` kwarg.
        try:
            img_md = self.ipe.metadata["image"]
            x_size = img_md["tileXSize"]
            y_size = img_md["tileYSize"]
        except (AttributeError, KeyError):
            x_size = kwargs.get("chunk_size", 256)
            y_size = kwargs.get("chunk_size", 256)

        # Create an affine transform to convert between real-world and pixels
        if self.proj is None:
            from_proj = "EPSG:4326"
        else:
            from_proj = self.proj

        # Estimate the output ground sample distance (unless given via the
        # `gsd` kwarg) and find the current bounds of the image.
        try:
            # NOTE: this only works on images that have IPE rpcs metadata
            center = wkt.loads(
                self.ipe.metadata["image"]["imageBoundsWGS84"]).centroid
            g = box(*(center.buffer(self.ipe.metadata["rpcs"]["gsd"] /
                                    2).bounds))
            # print "Input GSD (deg):", self.ipe.metadata["rpcs"]["gsd"]
            tfm = partial(pyproj.transform, pyproj.Proj(init="EPSG:4326"),
                          pyproj.Proj(init=proj))
            gsd = kwargs.get("gsd", ops.transform(tfm, g).area**0.5)
            current_bounds = wkt.loads(
                self.ipe.metadata["image"]["imageBoundsWGS84"]).bounds
        except (AttributeError, KeyError, TypeError):
            # Fallback: square root of the projected footprint area divided by
            # the pixel count (shape[1] * shape[2]).
            tfm = partial(pyproj.transform, pyproj.Proj(init=self.proj),
                          pyproj.Proj(init=proj))
            gsd = kwargs.get("gsd", (ops.transform(tfm, shape(self)).area /
                                     (self.shape[1] * self.shape[2]))**0.5)
            current_bounds = self.bounds

        # `tfm` is rebound here; the versions above were only used for the
        # gsd estimate.
        tfm = partial(pyproj.transform, pyproj.Proj(init=from_proj),
                      pyproj.Proj(init=proj))
        # NOTE(review): `itfm` is not used anywhere in this method.
        itfm = partial(pyproj.transform, pyproj.Proj(init=proj),
                       pyproj.Proj(init=from_proj))
        output_bounds = ops.transform(tfm, box(*current_bounds)).bounds
        # Geotransform anchored at (output_bounds[0], output_bounds[3]) with
        # pixel size gsd in x and -gsd in y.
        gtf = Affine.from_gdal(output_bounds[0], gsd, 0.0, output_bounds[3],
                               0.0, -1 * gsd)

        # Pixel coordinates of the two bounds corners, used to count how many
        # tiles are needed in each direction.
        ll = ~gtf * (output_bounds[:2])
        ur = ~gtf * (output_bounds[2:])
        x_chunks = int((ur[0] - ll[0]) / x_size) + 1
        y_chunks = int((ll[1] - ur[1]) / y_size) + 1

        num_bands = self.shape[0]

        # NOTE(review): bare except — when the metadata lookup at the top
        # failed, `img_md` is unbound and the resulting NameError is also
        # swallowed here, giving 'uint8'. Narrow with care.
        try:
            dtype = IPE_TO_DTYPE[img_md["dataType"]]
        except:
            dtype = 'uint8'

        daskmeta = {
            "dask": {},
            "chunks": (num_bands, y_size, x_size),
            "dtype": dtype,
            "name": "warp-{}".format(self.name),
            "shape": (num_bands, y_chunks * y_size, x_chunks * x_size)
        }

        def px_to_geom(xmin, ymin):
            # Geometry of the tile whose pixel origin is (xmin, ymin).
            xmax = int(xmin + x_size)
            ymax = int(ymin + y_size)
            bounds = list((gtf * (xmin, ymax)) + (gtf * (xmax, ymin)))
            return box(*bounds)

        full_bounds = box(*output_bounds)

        dasks = []
        if isinstance(dem, GeoImage):
            # NOTE(review): the DEM is passed as its own `dem` argument when
            # it is re-warped — confirm this is intended.
            if dem.proj != proj:
                dem = dem.warp(proj=proj, dem=dem)
            dasks.append(dem.dask)

        # NOTE(review): `xrange` is Python 2 only unless the file aliases it.
        for y in xrange(y_chunks):
            for x in xrange(x_chunks):
                xmin = x * x_size
                ymin = y * y_size
                geometry = px_to_geom(xmin, ymin)
                # One deferred self._warp call per chunk; the meaning of the
                # trailing 5 is not visible from this method.
                daskmeta["dask"][(daskmeta["name"], 0, y,
                                  x)] = (self._warp, geometry, gsd, dem, proj,
                                         dtype, 5)
        # Merge in the DEM graph (if any) and keep only what the warp tasks
        # need.
        daskmeta["dask"], _ = optimize.cull(
            sharedict.merge(daskmeta["dask"], *dasks),
            list(daskmeta["dask"].keys()))

        result = GeoDaskWrapper(daskmeta, self)
        result.__geo_interface__ = mapping(full_bounds)
        result.__geo_transform__ = AffineTransform(gtf, proj)
        # Index the result by the output bounds box via GeoImage.__getitem__.
        return GeoImage.__getitem__(result, box(*output_bounds))
示例#21
0
 def is_valid(self, layer=512):
     """Return True if '_graph_input' remains after culling the graph to `layer`."""
     cull_result = cull(self.graph, layer)
     reachable = cull_result[0]
     return '_graph_input' in reachable
示例#22
0
文件: core.py 项目: kayibal/sparsity
def optimize(dsk, keys, **kwargs):
    """Return `dsk` culled down to the tasks required by `keys`.

    Extra keyword arguments are accepted but not used.
    """
    culled, _dependencies = cull(dsk, keys)
    return culled