Example #1
File: unique.py  Project: zhzheng92/limix
def _dask_unique(x, return_index=True):
    from dask.array.core import Array
    from dask import sharedict
    from numpy import cumsum, concatenate, unique
    from numpy.testing import assert_

    assert_(return_index)

    name = "unique-" + x.name

    def _unique(x):
        return unique(x, return_index=return_index)

    # Run numpy's unique on each chunk and materialize the per-chunk results.
    dsk = dict(((name, i), (_unique, key)) for i, key in enumerate(x._keys()))
    parts = Array._get(sharedict.merge((name, dsk), x.dask), list(dsk.keys()))

    arrs = [a[0] for a in parts]

    # Per-chunk indices are local, so shift each by its chunk's global offset.
    chunks = x.chunks[0]
    offset = cumsum((0, ) + chunks)[:-1]

    idxs = [parts[i][1] + offset[i] for i in range(len(parts))]

    arr = concatenate(arrs)
    idx = concatenate(idxs)

    # A final global unique merges the concatenated per-chunk results.
    u, i = unique(arr, return_index=True)
    return u, idx[i]
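
A minimal usage sketch, assuming a dask release old enough to still ship `dask.sharedict` and the private `Array._get`/`_keys` hooks this helper relies on; note that `_dask_unique` returns plain numpy arrays, not a lazy dask array:

import numpy as np
import dask.array as da

x = da.from_array(np.array([3, 1, 3, 2, 1, 2]), chunks=3)
values, first_idx = _dask_unique(x)
print(values)     # [1 2 3]
print(first_idx)  # [1 3 0] -- index of each value's first occurrence in x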
Example #2
def elemwise(op, *args, **kwargs):
    # See also da.core.elemwise. Note: dask seems to be able to convert Python
    # and numpy objects in this function, thus supporting operations between
    # dask objects and others. This would be useful for us as well.
    # Mismatched chunking is not supported for now.
    n_chunk = None
    slice_dim = None
    for arg in args:
        if isinstance(arg, DatasetCollection):
            if n_chunk is not None:
                assert n_chunk == arg.n_chunk
                assert slice_dim == arg.slice_dim
            else:
                n_chunk = arg.n_chunk
                slice_dim = arg.slice_dim
    out = '{}-{}'.format(funcname(op), tokenize(op, *args))
    out_ind = (0, )
    # Handling only 1D chunking here, so everything is (0,).
    arginds = list(
        (a, (0, ) if isinstance(a, DatasetCollection) else None) for a in args)
    numblocks = {a.name: a.numblocks for a, ind in arginds if ind is not None}
    argindsstr = list(
        concat([(a if ind is None else a.name, ind) for a, ind in arginds]))
    dsk = top(op, out, out_ind, *argindsstr, numblocks=numblocks, **kwargs)
    dsks = [a.dask for a, ind in arginds if ind is not None]
    return DatasetCollection(sharedict.merge((out, dsk), *dsks), out, n_chunk,
                             slice_dim)
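
`DatasetCollection` is this project's own 1-D chunked collection, so the function above is not runnable standalone; its public counterpart, `da.core.elemwise`, builds the same kind of blocked graph and illustrates the call pattern:

import numpy as np
import dask.array as da

a = da.from_array(np.arange(6), chunks=3)
b = da.from_array(np.arange(6, 12), chunks=3)
c = da.core.elemwise(np.add, a, b)  # one task per aligned chunk pair
print(c.compute())                  # [ 6  8 10 12 14 16]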
Example #3
    def _wrap(self, func, *args, **kwargs):
        """ Wrap numpy random function to produce dask.array random function

        extra_chunks should be a chunks tuple to append to the end of chunks
        """
        size = kwargs.pop('size', None)
        chunks = kwargs.pop('chunks')
        extra_chunks = kwargs.pop('extra_chunks', ())

        if size is not None and not isinstance(size, (tuple, list)):
            size = (size, )

        args_shapes = {
            ar.shape
            for ar in args if isinstance(ar, (Array, np.ndarray))
        }
        # Include the shapes of array-valued keyword arguments as well.
        args_shapes |= {
            ar.shape
            for ar in kwargs.values() if isinstance(ar, (Array, np.ndarray))
        }

        shapes = list(args_shapes)
        if size is not None:
            shapes += [size]
        # broadcast to the final size(shape)
        size = broadcast_shapes(*shapes)
        chunks = normalize_chunks(chunks, size)
        slices = slices_from_chunks(chunks)

        def _broadcast_any(ar, shape, chunks):
            if isinstance(ar, Array):
                return broadcast_to(ar, shape).rechunk(chunks)
            if isinstance(ar, np.ndarray):
                return np.ascontiguousarray(np.broadcast_to(ar, shape))

        # Broadcast all arguments, get tiny versions as well
        # Start adding the relevant bits to the graph
        dsk = {}
        dsks = []
        lookup = {}
        small_args = []
        for i, ar in enumerate(args):
            if isinstance(ar, (np.ndarray, Array)):
                res = _broadcast_any(ar, size, chunks)
                if isinstance(res, Array):
                    dsks.append(res.dask)
                    lookup[i] = res.name
                elif isinstance(res, np.ndarray):
                    name = 'array-{}'.format(tokenize(res))
                    lookup[i] = name
                    dsk[name] = res
                small_args.append(ar[tuple(0 for _ in ar.shape)])
            else:
                small_args.append(ar)

        small_kwargs = {}
        for key, ar in kwargs.items():
            if isinstance(ar, (np.ndarray, Array)):
                res = _broadcast_any(ar, size, chunks)
                if isinstance(res, Array):
                    dsks.append(res.dask)
                    lookup[key] = res.name
                elif isinstance(res, np.ndarray):
                    name = 'array-{}'.format(tokenize(res))
                    lookup[key] = name
                    dsk[name] = res
                small_kwargs[key] = ar[tuple(0 for _ in ar.shape)]
            else:
                small_kwargs[key] = ar

        # Get dtype
        small_kwargs['size'] = (0, )
        dtype = func(xoroshiro128plus.RandomState(), *small_args,
                     **small_kwargs).dtype

        sizes = list(product(*chunks))
        state_data = random_state_data(len(sizes), self._numpy_state)
        token = tokenize(state_data, size, chunks, args, kwargs)
        name = 'da.random.{0}-{1}'.format(func.__name__, token)

        keys = product([name],
                       *([range(len(bd))
                          for bd in chunks] + [[0]] * len(extra_chunks)))
        blocks = product(*[range(len(bd)) for bd in chunks])
        vals = []
        # Use csize for the per-block shape to avoid shadowing `size` above.
        for state, csize, slc, block in zip(state_data, sizes, slices, blocks):
            arg = []
            for i, ar in enumerate(args):
                if i not in lookup:
                    arg.append(ar)
                else:
                    if isinstance(ar, Array):
                        arg.append((lookup[i], ) + block)
                    else:  # np.ndarray
                        arg.append((getitem, lookup[i], slc))
            kwrg = {}
            for k, ar in kwargs.items():
                if k not in lookup:
                    kwrg[k] = ar
                else:
                    if isinstance(ar, Array):
                        kwrg[k] = (lookup[k], ) + block
                    else:  # np.ndarray
                        kwrg[k] = (getitem, lookup[k], slc)
            vals.append((_apply_random, func.__name__, state, csize, arg, kwrg))
        dsk.update(dict(zip(keys, vals)))
        dsk = sharedict.merge((name, dsk), *dsks)
        return Array(dsk, name, chunks + extra_chunks, dtype=dtype)
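
A hedged sketch of how a public method would typically delegate to `_wrap` (the pattern `dask.array.random.RandomState` uses); the wrapper class below is an assumption, with only the `xoroshiro128plus` backend taken from the code above:

class RandomState(object):
    # Hypothetical wrapper: __init__ is assumed to set self._numpy_state,
    # and _wrap is the method defined above.
    def normal(self, loc=0.0, scale=1.0, size=None, chunks=None):
        # loc/scale may be scalars, numpy arrays, or dask Arrays; _wrap
        # broadcasts them against `size` and builds the blocked graph.
        return self._wrap(xoroshiro128plus.RandomState.normal,
                          loc, scale, size=size, chunks=chunks)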
Example #4
        def choice(self, a, size=None, replace=True, p=None, chunks=None):
            dsks = []
            # Normalize and validate `a`
            if isinstance(a, Integral):
                # On windows the output dtype differs if p is provided or
                # absent, see https://github.com/numpy/numpy/issues/9867
                dummy_p = np.array([1]) if p is not None else p
                dtype = np.random.choice(1, size=(), p=dummy_p).dtype
                len_a = a
                if a <= 0:
                    raise ValueError("a must be greater than 0")
            else:
                a = asarray(a).rechunk(a.shape)
                dtype = a.dtype
                if a.ndim != 1:
                    raise ValueError("a must be one dimensional")
                len_a = len(a)
                dsks.append(a.dask)
                a = a.__dask_keys__()[0]

            # Normalize and validate `p`
            if p is not None:
                if not isinstance(p, Array):
                    # If p is not a dask array, first check the sum is close
                    # to 1 before converting.
                    p = np.asarray(p)
                    if not np.isclose(p.sum(), 1, rtol=1e-7, atol=0):
                        raise ValueError("probabilities do not sum to 1")
                    p = asarray(p)
                else:
                    p = p.rechunk(p.shape)

                if p.ndim != 1:
                    raise ValueError("p must be one dimensional")
                if len(p) != len_a:
                    raise ValueError("a and p must have the same size")

                dsks.append(p.dask)
                p = p.__dask_keys__()[0]

            if size is None:
                size = ()
            elif not isinstance(size, (tuple, list)):
                size = (size, )

            chunks = normalize_chunks(chunks, size)
            sizes = list(product(*chunks))
            state_data = random_state_data(len(sizes), self._numpy_state)

            name = 'da.random.choice-%s' % tokenize(state_data, size, chunks,
                                                    a, replace, p)
            keys = product([name], *(range(len(bd)) for bd in chunks))
            dsk = {
                k: (_choice, state, a, size, replace, p)
                for k, state, size in zip(keys, state_data, sizes)
            }

            return Array(sharedict.merge((name, dsk), *dsks),
                         name,
                         chunks,
                         dtype=dtype)
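
The public counterpart of this method is `dask.array.random.choice`, which accepts the same arguments; a runnable sketch of the call pattern:

import dask.array as da

draws = da.random.choice(5, size=(4, 4), replace=True,
                         p=[0.1, 0.2, 0.3, 0.2, 0.2], chunks=2)
print(draws.compute())  # a 4x4 array of draws from {0, ..., 4}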
Example #5
    def warp(self, dem=None, proj="EPSG:4326", **kwargs):
        """Delayed warp across an entire AOI or Image

        Creates a new dask image by deferring calls to warp_geometry on chunks

        Args:
            dem (ndarray): optional. A DEM for warping to specific elevation planes
            proj (str): optional. An EPSG projection string to project the image data into (e.g. "EPSG:32612")

        Returns:
            daskarray: the warped image as a deferred image array
        """
        try:
            img_md = self.rda.metadata["image"]
            x_size = img_md["tileXSize"]
            y_size = img_md["tileYSize"]
        except (AttributeError, KeyError):
            x_size = kwargs.get("chunk_size", 256)
            y_size = kwargs.get("chunk_size", 256)

        # Create an affine transform to convert between real-world and pixels
        if self.proj is None:
            from_proj = "EPSG:4326"
        else:
            from_proj = self.proj

        try:
            # NOTE: this only works on images that have rda rpcs metadata
            center = wkt.loads(
                self.rda.metadata["image"]["imageBoundsWGS84"]).centroid
            g = box(*(center.buffer(self.rda.metadata["rpcs"]["gsd"] /
                                    2).bounds))
            tfm = partial(pyproj.transform, pyproj.Proj(init="EPSG:4326"),
                          pyproj.Proj(init=proj))
            gsd = kwargs.get("gsd", ops.transform(tfm, g).area**0.5)
            current_bounds = wkt.loads(
                self.rda.metadata["image"]["imageBoundsWGS84"]).bounds
        except (AttributeError, KeyError, TypeError):
            tfm = partial(pyproj.transform, pyproj.Proj(init=self.proj),
                          pyproj.Proj(init=proj))
            gsd = kwargs.get("gsd", (ops.transform(tfm, shape(self)).area /
                                     (self.shape[1] * self.shape[2]))**0.5)
            current_bounds = self.bounds

        tfm = partial(pyproj.transform, pyproj.Proj(init=from_proj),
                      pyproj.Proj(init=proj))
        itfm = partial(pyproj.transform, pyproj.Proj(init=proj),
                       pyproj.Proj(init=from_proj))
        output_bounds = ops.transform(tfm, box(*current_bounds)).bounds
        gtf = Affine.from_gdal(output_bounds[0], gsd, 0.0, output_bounds[3],
                               0.0, -1 * gsd)

        ll = ~gtf * (output_bounds[:2])
        ur = ~gtf * (output_bounds[2:])
        x_chunks = int((ur[0] - ll[0]) / x_size) + 1
        y_chunks = int((ll[1] - ur[1]) / y_size) + 1

        num_bands = self.shape[0]

        try:
            dtype = RDA_TO_DTYPE[img_md["dataType"]]
        except (NameError, KeyError):
            # img_md is undefined when the metadata lookup above failed.
            dtype = 'uint8'

        daskmeta = {
            "dask": {},
            "chunks": (num_bands, y_size, x_size),
            "dtype": dtype,
            "name": "warp-{}".format(self.name),
            "shape": (num_bands, y_chunks * y_size, x_chunks * x_size)
        }

        def px_to_geom(xmin, ymin):
            xmax = int(xmin + x_size)
            ymax = int(ymin + y_size)
            bounds = list((gtf * (xmin, ymax)) + (gtf * (xmax, ymin)))
            return box(*bounds)

        full_bounds = box(*output_bounds)

        dasks = []
        if isinstance(dem, GeoDaskImage):
            if dem.proj != proj:
                dem = dem.warp(proj=proj, dem=dem)
            dasks.append(dem.dask)

        for y in range(y_chunks):
            for x in range(x_chunks):
                xmin = x * x_size
                ymin = y * y_size
                geometry = px_to_geom(xmin, ymin)
                daskmeta["dask"][(daskmeta["name"], 0, y,
                                  x)] = (self._warp, geometry, gsd, dem, proj,
                                         dtype, 5)
        daskmeta["dask"], _ = optimization.cull(
            sharedict.merge(daskmeta["dask"], *dasks),
            list(daskmeta["dask"].keys()))

        gi = mapping(full_bounds)
        gt = AffineTransform(gtf, proj)
        image = GeoDaskImage(daskmeta,
                             __geo_interface__=gi,
                             __geo_transform__=gt)
        return image[box(*output_bounds)]
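
A hedged usage sketch in the gbdxtools style this method belongs to; the catalog ID below is hypothetical:

from gbdxtools import CatalogImage

img = CatalogImage('1040010038952900')       # hypothetical catalog ID
utm = img.warp(proj="EPSG:32612", gsd=15.0)  # builds the graph; nothing is computed yet
tile = utm[:, :512, :512].compute()          # per-chunk warping runs on compute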
Example #6
def fit(model,
        x,
        y,
        compute=True,
        shuffle_blocks=True,
        random_state=None,
        **kwargs):
    """ Fit scikit learn model against dask arrays

    Model must support the ``partial_fit`` interface for online or batch
    learning.

    Ideally your rows are independent and identically distributed. By default,
    this function will step through chunks of the arrays in random order.

    Parameters
    ----------
    model: sklearn model
        Any model supporting partial_fit interface
    x: dask Array
        Two dimensional array, likely tall and skinny
    y: dask Array
        One dimensional array with same chunks as x's rows
    compute : bool
        Whether to compute this result
    shuffle_blocks : bool
        Whether to shuffle the blocks with ``random_state`` or not
    random_state : int or numpy.random.RandomState
        Random state to use when shuffling blocks
    kwargs:
        options to pass to partial_fit

    Examples
    --------
    >>> import dask.array as da
    >>> X = da.random.random((10, 3), chunks=(5, 3))
    >>> y = da.random.randint(0, 2, 10, chunks=(5,))

    >>> from sklearn.linear_model import SGDClassifier
    >>> sgd = SGDClassifier()

    >>> sgd = da.learn.fit(sgd, X, y, classes=[1, 0])
    >>> sgd  # doctest: +SKIP
    SGDClassifier(alpha=0.0001, class_weight=None, epsilon=0.1, eta0=0.0,
           fit_intercept=True, l1_ratio=0.15, learning_rate='optimal',
           loss='hinge', n_iter=5, n_jobs=1, penalty='l2', power_t=0.5,
           random_state=None, shuffle=False, verbose=0, warm_start=False)

    This passes all of X and y through the classifier sequentially.  We can use
    the classifier as normal on in-memory data:

    >>> import numpy as np
    >>> sgd.predict(np.random.random((4, 3)))  # doctest: +SKIP
    array([1, 0, 0, 1])

    Or predict on a larger dataset:

    >>> z = da.random.random((400, 3), chunks=(100, 3))
    >>> da.learn.predict(sgd, z)  # doctest: +SKIP
    dask.array<x_11, shape=(400,), chunks=((100, 100, 100, 100),), dtype=int64>
    """
    if not hasattr(x, "chunks") and hasattr(x, "to_dask_array"):
        x = x.to_dask_array()
    assert x.ndim == 2
    if y is not None:
        if not hasattr(y, "chunks") and hasattr(y, "to_dask_array"):
            y = y.to_dask_array()

        assert y.ndim == 1
        assert x.chunks[0] == y.chunks[0]

    assert hasattr(model, "partial_fit")
    if len(x.chunks[1]) > 1:
        x = x.rechunk(chunks=(x.chunks[0], sum(x.chunks[1])))

    nblocks = len(x.chunks[0])
    order = list(range(nblocks))
    if shuffle_blocks:
        rng = sklearn.utils.check_random_state(random_state)
        rng.shuffle(order)

    name = "fit-" + dask.base.tokenize(model, x, y, kwargs, order)
    dsk = {(name, -1): model}
    dsk.update({(name, i): (
        _partial_fit,
        (name, i - 1),
        (x.name, order[i], 0),
        (getattr(y, "name", ""), order[i]),
        kwargs,
    )
                for i in range(nblocks)})

    graphs = {x.name: x.__dask_graph__(), name: dsk}
    if hasattr(y, "__dask_graph__"):
        graphs[y.name] = y.__dask_graph__()

    try:
        from dask.highlevelgraph import HighLevelGraph

        new_dsk = HighLevelGraph.merge(*graphs.values())
    except ImportError:
        from dask import sharedict

        new_dsk = sharedict.merge(*graphs.values())

    value = Delayed((name, nblocks - 1), new_dsk)

    if compute:
        return value.compute()
    else:
        return value
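
The graph chains calls to a `_partial_fit` helper that is not shown here; given how the tasks are wired (task `(name, i)` consumes the model produced by `(name, i - 1)`), a minimal sketch of what it must do:

def _partial_fit(model, x, y, kwargs=None):
    # Update the model fitted on all previous blocks with one more
    # (x, y) block, then pass it along the chain.
    kwargs = kwargs or {}
    model.partial_fit(x, y, **kwargs)
    return model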
Example #7
File: meta.py  Project: Geoyi/gbdxtools
    def warp(self, dem=None, proj="EPSG:4326", **kwargs):
        """
          Delayed warp across an entire AOI or Image
          creates a new dask image by deferring calls to the warp_geometry on chunks
        """
        try:
            img_md = self.ipe.metadata["image"]
            x_size = img_md["tileXSize"]
            y_size = img_md["tileYSize"]
        except (AttributeError, KeyError):
            x_size = kwargs.get("chunk_size", 256)
            y_size = kwargs.get("chunk_size", 256)

        # Create an affine transform to convert between real-world and pixels
        if self.proj is None:
            from_proj = "EPSG:4326"
        else:
            from_proj = self.proj

        try:
            # NOTE: this only works on images that have IPE rpcs metadata
            center = wkt.loads(
                self.ipe.metadata["image"]["imageBoundsWGS84"]).centroid
            g = box(*(center.buffer(self.ipe.metadata["rpcs"]["gsd"] /
                                    2).bounds))
            # print "Input GSD (deg):", self.ipe.metadata["rpcs"]["gsd"]
            tfm = partial(pyproj.transform, pyproj.Proj(init="EPSG:4326"),
                          pyproj.Proj(init=proj))
            gsd = kwargs.get("gsd", ops.transform(tfm, g).area**0.5)
            current_bounds = wkt.loads(
                self.ipe.metadata["image"]["imageBoundsWGS84"]).bounds
        except (AttributeError, KeyError, TypeError):
            tfm = partial(pyproj.transform, pyproj.Proj(init=self.proj),
                          pyproj.Proj(init=proj))
            gsd = kwargs.get("gsd", (ops.transform(tfm, shape(self)).area /
                                     (self.shape[1] * self.shape[2]))**0.5)
            current_bounds = self.bounds

        tfm = partial(pyproj.transform, pyproj.Proj(init=from_proj),
                      pyproj.Proj(init=proj))
        itfm = partial(pyproj.transform, pyproj.Proj(init=proj),
                       pyproj.Proj(init=from_proj))
        output_bounds = ops.transform(tfm, box(*current_bounds)).bounds
        gtf = Affine.from_gdal(output_bounds[0], gsd, 0.0, output_bounds[3],
                               0.0, -1 * gsd)

        ll = ~gtf * (output_bounds[:2])
        ur = ~gtf * (output_bounds[2:])
        x_chunks = int((ur[0] - ll[0]) / x_size) + 1
        y_chunks = int((ll[1] - ur[1]) / y_size) + 1

        daskmeta = {
            "dask": {},
            "chunks": (img_md["numBands"], y_size, x_size),
            "dtype": IPE_TO_DTYPE[img_md["dataType"]],
            "name": "warp-{}".format(self.ipe_id),
            "shape": (img_md["numBands"], y_chunks * y_size, x_chunks * x_size)
        }

        def px_to_geom(xmin, ymin):
            xmax = int(xmin + x_size)
            ymax = int(ymin + y_size)
            bounds = list((gtf * (xmin, ymax)) + (gtf * (xmax, ymin)))
            return box(*bounds)

        full_bounds = box(*output_bounds)

        dasks = []
        if isinstance(dem, GeoImage):
            if dem.proj != proj:
                dem = dem.warp(proj=proj)
            dasks.append(dem.dask)

        for y in range(y_chunks):
            for x in range(x_chunks):
                xmin = x * x_size
                ymin = y * y_size
                geometry = px_to_geom(xmin, ymin)
                full_bounds = box(*full_bounds.union(geometry).bounds)
                daskmeta["dask"][(daskmeta["name"], 0, y,
                                  x)] = (self._warp, geometry, gsd, dem, proj,
                                         5)
        daskmeta["dask"], _ = optimize.cull(
            sharedict.merge(daskmeta["dask"], *dasks),
            list(daskmeta["dask"].keys()))

        result = GeoDaskWrapper(daskmeta, self)
        result.__geo_interface__ = mapping(full_bounds)
        result.__geo_transform__ = AffineTransform(gtf, proj)
        return GeoImage.__getitem__(result, box(*output_bounds))
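
Both warp variants prune unreachable tasks with `cull` before wrapping the graph; the helper is public and can be tried on any plain dict graph (newer dask exposes it as `dask.optimization.cull`, older releases as `dask.optimize.cull`):

from dask.optimization import cull

dsk = {'a': 1,
       'b': (str, 'a'),                 # depends on key 'a'
       'unused': (str, 'never-asked')}  # unreachable from the requested keys
pruned, dependencies = cull(dsk, keys=['b'])
print(sorted(pruned))  # ['a', 'b'] -- the unreachable task was dropped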