Exemplo n.º 1
0
def paths_to_tiled_image(paths,
                         context=None,
                         tile_size=(256, 256),
                         padding=(0, 0),
                         backend=backends[-1],
                         skip_chunk=False,
                         **kwargs):
    """
    Build a tiled ND image from a collection of image paths.

    :param paths: List[str] / RDD[str] image paths as a list or an RDD
    :param context: SparkContext used to build the RDD when `paths` is a list
    :param tile_size: size of the tiles to cut
    :param padding: padding to apply around each tile
    :param backend: the LazyImageBackend used to read image data (defaults to
        the last registered backend, GDAL if available)
    :param skip_chunk: developer-only flag that defers the sub-chunking step
    :param kwargs: extra arguments forwarded when creating the initial RDD
    :return: a ChunkedArray of the image tiles (use .unchunk to get a plain RDD)
    """
    if isinstance(paths, RDD):
        path_rdd = paths
    else:
        path_rdd = context.parallelize(paths, **kwargs)

    def _open_lazy(file_path):
        # Wrap a path in a disk-mapped lazy image using the chosen backend.
        return DiskMappedLazyImage(file_path, backend)

    indexed_rdd = path_rdd.zipWithIndex().map(
        lambda pair:
        (pair[1], _open_lazy(pair[0])))  # type: RDD[(int, DiskMappedLazyImage)]
    # Open one sample image to discover the per-image shape and dtype.
    sample_img = _open_lazy(path_rdd.first())
    chunked = ChunkedArray(indexed_rdd,
                           shape=(path_rdd.count(),) + sample_img.shape,
                           split=1,
                           dtype=sample_img[0, 0].dtype)
    if skip_chunk:
        return chunked
    return chunked._chunk(size=tile_size, axis=None, padding=padding)
Exemplo n.º 2
0
    def chunk(self, size="150", axis=None):
        """
        Chunks records of a distributed array.

        Chunking breaks arrays into subarrays, using a specified
        number of chunks along each value dimension. Can alternatively
        specify an average chunk size (in megabytes) and the number of
        chunks will be computed automatically.

        Parameters
        ----------
        size : tuple, int, or str, optional, default = "150"
            A string giving the size in megabytes, or a tuple with the number
            of chunks along each dimension.

        axis : int or tuple, optional, default = None
            One or more axes to chunk the array along; if None,
            all axes will be used.

        Returns
        -------
        ChunkedArray
        """
        # isinstance (rather than a type() identity check) is the idiomatic
        # type test and also accepts str subclasses.
        if not isinstance(size, str):
            size = tupleize(size)
        axis = tupleize(axis)

        # Local import kept as in the original (presumably avoids a circular
        # import at module load time — confirm against bolt package layout).
        from bolt.spark.chunk import ChunkedArray

        chnk = ChunkedArray(rdd=self._rdd, shape=self._shape,
                            split=self._split, dtype=self._dtype)
        return chnk.chunk(size, axis)
Exemplo n.º 3
0
    def chunk(self, size="150", axis=None):
        """
        Chunks records of a distributed array.

        Chunking breaks arrays into subarrays, using a specified
        size of chunks along each value dimension. Can alternatively
        specify an average chunk byte size (in megabytes) and the size of
        chunks (as ints) will be computed automatically.

        Parameters
        ----------
        size : tuple, int, or str, optional, default = "150"
            A string giving the size in megabytes, or a tuple with the size
            of chunks along each dimension.

        axis : int or tuple, optional, default = None
            One or more axes to chunk the array along; if None,
            all axes will be used.

        Returns
        -------
        ChunkedArray
        """
        # isinstance (rather than a type() identity check) is the idiomatic
        # type test and also accepts str subclasses.
        if not isinstance(size, str):
            size = tupleize(size)
        axis = tupleize(axis)

        # Local import kept as in the original (presumably avoids a circular
        # import at module load time — confirm against bolt package layout).
        from bolt.spark.chunk import ChunkedArray

        chnk = ChunkedArray(rdd=self._rdd,
                            shape=self._shape,
                            split=self._split,
                            dtype=self._dtype)
        return chnk.chunk(size, axis)
Exemplo n.º 4
0
    def swap(self, kaxes, vaxes, size="150"):
        """
        Swap axes from keys to values.

        This is the core operation underlying shape manipulation
        on the Spark bolt array. It exchanges an arbitrary set of axes
        between the keys and the values. If either is None, will only
        move axes in one direction (from keys to values, or values to keys).
        Keys moved to values will be placed immediately after the split;
        values moved to keys will be placed immediately before the split.

        Parameters
        ----------
        kaxes : tuple
            Axes from keys to move to values

        vaxes : tuple
            Axes from values to move to keys

        size : tuple or int, optional, default = "150"
            Can either provide a string giving the size in megabytes,
            or a tuple with the number of chunks along each
            value dimension being moved

        Returns
        -------
        BoltArraySpark

        Raises
        ------
        ValueError
            If the swap would move every key axis away while adding none,
            leaving all data under a single key.
        """
        kaxes = asarray(tupleize(kaxes), 'int')
        vaxes = asarray(tupleize(vaxes), 'int')
        # isinstance (rather than a type() identity check) is the idiomatic
        # type test and also accepts str subclasses.
        if not isinstance(size, str):
            size = tupleize(size)

        if len(kaxes) == self.keys.ndim and len(vaxes) == 0:
            raise ValueError('Cannot perform a swap that would '
                             'end up with all data on a single key')

        # Nothing to move in either direction: no-op.
        if len(kaxes) == 0 and len(vaxes) == 0:
            return self

        # Scalar values cannot be chunked, so temporarily promote them to
        # 1-d arrays; the extra dimension is squeezed back out below.
        if self.values.ndim == 0:
            rdd = self._rdd.mapValues(lambda v: array(v, ndmin=1))
            shape = self._shape + (1,)
        else:
            rdd = self._rdd
            shape = self._shape

        from bolt.spark.chunk import ChunkedArray

        c = ChunkedArray(rdd, shape=shape, split=self._split, dtype=self._dtype)

        chunks = c.chunk(size, axis=vaxes)
        barray = chunks.move(kaxes, vaxes)

        # Undo the temporary promotion of scalar values.
        if self.values.ndim == 0:
            barray._rdd = barray._rdd.mapValues(lambda v: v.squeeze())
            barray._shape = barray._shape[:-1]

        return barray
Exemplo n.º 5
0
    def swap(self, kaxes, vaxes, size="150"):
        """
        Swap axes from keys to values.

        This is the core operation underlying shape manipulation
        on the Spark bolt array. It exchanges an arbitrary set of axes
        between the keys and the values. If either is None, will only
        move axes in one direction (from keys to values, or values to keys).
        Keys moved to values will be placed immediately after the split;
        values moved to keys will be placed immediately before the split.

        Parameters
        ----------
        kaxes : tuple
            Axes from keys to move to values

        vaxes : tuple
            Axes from values to move to keys

        size : tuple or int, optional, default = "150"
            Can either provide a string giving the size in kilobytes,
            or a tuple with the number of chunks along each
            value dimension being moved

        Returns
        -------
        BoltArraySpark

        Raises
        ------
        ValueError
            If the swap would move every key axis away while adding none,
            leaving all data under a single key.
        """
        kaxes = asarray(tupleize(kaxes), 'int')
        vaxes = asarray(tupleize(vaxes), 'int')
        # isinstance (rather than a type() identity check) is the idiomatic
        # type test and also accepts str subclasses.
        if not isinstance(size, str):
            size = tupleize(size)

        if len(kaxes) == self.keys.ndim and len(vaxes) == 0:
            raise ValueError('Cannot perform a swap that would '
                             'end up with all data on a single key')

        # Nothing to move in either direction: no-op.
        if len(kaxes) == 0 and len(vaxes) == 0:
            return self

        from bolt.spark.chunk import ChunkedArray

        c = ChunkedArray(self._rdd,
                         shape=self._shape,
                         split=self._split,
                         dtype=self._dtype)

        chunks = c._chunk(size, axis=vaxes)
        # Value axes shift right by the number of incoming key axes, so
        # offset them before moving values back to keys.
        swapped = chunks.keys_to_values(kaxes).values_to_keys(
            [v + len(kaxes) for v in vaxes])
        barray = swapped.unchunk()

        return barray
Exemplo n.º 6
0
def single_chunky_image(in_ds, context, tile_size=(256, 256), padding=(0, 0)):
    """
    Wrap a single image as a one-record ChunkedArray of tiles.

    :param in_ds: the image data; indexable with a `.shape` tuple and an
        element `.dtype` (e.g. a numpy ndarray)
    :param context: SparkContext used to create the single-record RDD
    :param tile_size: the size of tiles to cut
    :param padding: the padding to use around each tile
    :return: the tiled ChunkedArray
    """
    in_rdd = context.parallelize([((0, ), in_ds)])  # type: RDD[(tuple, ndarray)]
    # BUG FIX: use .shape (a tuple) instead of .size — on a numpy ndarray
    # .size is a plain int, so concatenating it to a tuple raises TypeError.
    # This matches how paths_to_tiled_image builds the same shape argument.
    return ChunkedArray(
        in_rdd,
        shape=(in_rdd.count(), ) + in_ds.shape,
        split=1,
        dtype=in_ds[0, 0].dtype)._chunk(size=tile_size,
                                        axis=None,
                                        padding=padding)
Exemplo n.º 7
0
    def chunk(self, size="150", axis=None, padding=None):
        """
        Chunks records of a distributed array.

        Chunking breaks arrays into subarrays, using a specified
        size of chunks along each value dimension. Can alternatively
        specify an average chunk byte size (in kilobytes) and the size of
        chunks (as ints) will be computed automatically.

        Parameters
        ----------
        size : tuple, int, or str, optional, default = "150"
            A string giving the size in kilobytes, or a tuple with the size
            of chunks along each dimension.

        axis : int or tuple, optional, default = None
            One or more axes to chunk the array along; if None,
            all axes will be used.

        padding : tuple or int, default = None
            Number of elements per dimension that will overlap with the adjacent chunk.
            If a tuple, specifies padding along each chunked dimension; if an int, the same
            padding will be applied to all chunked dimensions.

        Returns
        -------
        ChunkedArray
        """
        # isinstance (rather than a type() identity check) is the idiomatic
        # type test and also accepts str subclasses.
        if not isinstance(size, str):
            size = tupleize(size)
        axis = tupleize(axis)
        padding = tupleize(padding)

        # Local import kept as in the original (presumably avoids a circular
        # import at module load time — confirm against bolt package layout).
        from bolt.spark.chunk import ChunkedArray

        chnk = ChunkedArray(rdd=self._rdd,
                            shape=self._shape,
                            split=self._split,
                            dtype=self._dtype)
        return chnk._chunk(size, axis, padding)
Exemplo n.º 8
0
Arquivo: array.py Projeto: gdtm86/bolt
    def chunk(self, size="150", axis=None, padding=None):
        """
        Chunks records of a distributed array.

        Chunking breaks arrays into subarrays, using a specified
        size of chunks along each value dimension. Can alternatively
        specify an average chunk byte size (in kilobytes) and the size of
        chunks (as ints) will be computed automatically.

        Parameters
        ----------
        size : tuple, int, or str, optional, default = "150"
            A string giving the size in kilobytes, or a tuple with the size
            of chunks along each dimension.

        axis : int or tuple, optional, default = None
            One or more axes to chunk the array along; if None,
            all axes will be used.

        padding : tuple or int, default = None
            Number of elements per dimension that will overlap with the adjacent chunk.
            If a tuple, specifies padding along each chunked dimension; if an int, the same
            padding will be applied to all chunked dimensions.

        Returns
        -------
        ChunkedArray
        """
        # isinstance (rather than a type() identity check) is the idiomatic
        # type test and also accepts str subclasses.
        if not isinstance(size, str):
            size = tupleize(size)
        axis = tupleize(axis)
        padding = tupleize(padding)

        # Local import kept as in the original (presumably avoids a circular
        # import at module load time — confirm against bolt package layout).
        from bolt.spark.chunk import ChunkedArray

        chnk = ChunkedArray(rdd=self._rdd, shape=self._shape,
                            split=self._split, dtype=self._dtype)
        return chnk._chunk(size, axis, padding)