Exemplo n.º 1
0
    def chunk(self, size="150", axis=None):
        """
        Chunks records of a distributed array.

        Chunking breaks arrays into subarrays, using a specified
        number of chunks along each value dimension. Can alternatively
        specify an average chunk size (in megabytes) and the number of
        chunks will be computed automatically.

        Parameters
        ----------
        size : tuple, int, or str, optional, default = "150"
            A string giving the size in megabytes, or a tuple with the number
            of chunks along each dimension.

        axis : int or tuple, optional, default = None
            One or more axes to chunk the array along; if None,
            will use all axes.

        Returns
        -------
        ChunkedArray
        """
        # A string is a size specification and is forwarded untouched; any
        # other value is normalized with tupleize. isinstance (instead of an
        # exact type comparison) keeps str subclasses on the string path.
        if not isinstance(size, str):
            size = tupleize(size)
        axis = tupleize(axis)

        # Function-scope import, kept where the original had it
        # (presumably to avoid a circular import -- TODO confirm).
        from bolt.spark.chunk import ChunkedArray

        # Wrap the current rdd and metadata, then let ChunkedArray do the
        # actual chunking.
        chunked = ChunkedArray(rdd=self._rdd, shape=self._shape, split=self._split, dtype=self._dtype)
        return chunked.chunk(size, axis)
Exemplo n.º 2
0
    def chunk(self, size="150", axis=None):
        """
        Chunks records of a distributed array.

        Chunking breaks arrays into subarrays, using a specified
        size of chunks along each value dimension. Can alternatively
        specify an average chunk byte size (in megabytes) and the size of
        chunks (as ints) will be computed automatically.

        Parameters
        ----------
        size : tuple, int, or str, optional, default = "150"
            A string giving the size in megabytes, or a tuple with the size
            of chunks along each dimension.

        axis : int or tuple, optional, default = None
            One or more axes to chunk the array along; if None,
            will use all axes.

        Returns
        -------
        ChunkedArray
        """
        # A string is a size specification and is forwarded untouched; any
        # other value is normalized with tupleize. isinstance (instead of an
        # exact type comparison) keeps str subclasses on the string path.
        if not isinstance(size, str):
            size = tupleize(size)
        axis = tupleize(axis)

        # Function-scope import, kept where the original had it
        # (presumably to avoid a circular import -- TODO confirm).
        from bolt.spark.chunk import ChunkedArray

        # Wrap the current rdd and metadata, then let ChunkedArray do the
        # actual chunking.
        chunked = ChunkedArray(rdd=self._rdd,
                               shape=self._shape,
                               split=self._split,
                               dtype=self._dtype)
        return chunked.chunk(size, axis)
Exemplo n.º 3
0
    def swap(self, kaxes, vaxes, size="150"):
        """
        Swap axes from keys to values.

        This is the core operation underlying shape manipulation
        on the Spark bolt array. It exchanges an arbitrary set of axes
        between the keys and the values. If either is None, will only
        move axes in one direction (from keys to values, or values to keys).
        Keys moved to values will be placed immediately after the split;
        values moved to keys will be placed immediately before the split.

        Parameters
        ----------
        kaxes : tuple
            Axes from keys to move to values

        vaxes : tuple
            Axes from values to move to keys

        size : tuple, int, or str, optional, default = "150"
            Can either provide a string giving the size in megabytes,
            or a tuple with the number of chunks along each
            value dimension being moved

        Returns
        -------
        BoltArraySpark
            The array itself is returned unchanged when both kaxes and
            vaxes are empty.

        Raises
        ------
        ValueError
            If every key axis is moved to the values and no value axis
            is moved to the keys.
        """
        kaxes = asarray(tupleize(kaxes), 'int')
        vaxes = asarray(tupleize(vaxes), 'int')
        # A string is a size specification and is forwarded untouched; any
        # other value is normalized with tupleize. isinstance (instead of an
        # exact type comparison) keeps str subclasses on the string path.
        if not isinstance(size, str):
            size = tupleize(size)

        if len(kaxes) == self.keys.ndim and len(vaxes) == 0:
            raise ValueError('Cannot perform a swap that would '
                             'end up with all data on a single key')

        # Nothing to move in either direction.
        if len(kaxes) == 0 and len(vaxes) == 0:
            return self

        # Zero-dimensional values are temporarily promoted to 1-d (with a
        # matching trailing 1 in the shape) and restored after the move.
        scalar_values = self.values.ndim == 0
        if scalar_values:
            rdd = self._rdd.mapValues(lambda v: array(v, ndmin=1))
            shape = self._shape + (1,)
        else:
            rdd = self._rdd
            shape = self._shape

        # Function-scope import, kept where the original had it
        # (presumably to avoid a circular import -- TODO confirm).
        from bolt.spark.chunk import ChunkedArray

        chunked = ChunkedArray(rdd, shape=shape, split=self._split, dtype=self._dtype)

        # Chunk along the value axes being moved, then exchange the axes.
        chunks = chunked.chunk(size, axis=vaxes)
        barray = chunks.move(kaxes, vaxes)

        if scalar_values:
            # Undo the temporary promotion: squeeze each value and drop the
            # trailing shape entry that was appended above.
            barray._rdd = barray._rdd.mapValues(lambda v: v.squeeze())
            barray._shape = barray._shape[:-1]

        return barray
Exemplo n.º 4
0
    def swap(self, kaxes, vaxes, size="150"):
        """
        Swap axes from keys to values.

        This is the core operation underlying shape manipulation
        on the Spark bolt array. It exchanges an arbitrary set of axes
        between the keys and the values. If either is None, will only
        move axes in one direction (from keys to values, or values to keys).
        Keys moved to values will be placed immediately after the split;
        values moved to keys will be placed immediately before the split.

        Parameters
        ----------
        kaxes : tuple
            Axes from keys to move to values

        vaxes : tuple
            Axes from values to move to keys

        size : tuple, int, or str, optional, default = "150"
            Can either provide a string giving the size in megabytes,
            or a tuple with the number of chunks along each
            value dimension being moved

        Returns
        -------
        BoltArraySpark
            The array itself is returned unchanged when both kaxes and
            vaxes are empty.

        Raises
        ------
        ValueError
            If every key axis is moved to the values and no value axis
            is moved to the keys.
        """
        kaxes = asarray(tupleize(kaxes), 'int')
        vaxes = asarray(tupleize(vaxes), 'int')
        # A string is a size specification and is forwarded untouched; any
        # other value is normalized with tupleize. isinstance (instead of an
        # exact type comparison) keeps str subclasses on the string path.
        if not isinstance(size, str):
            size = tupleize(size)

        if len(kaxes) == self.keys.ndim and len(vaxes) == 0:
            raise ValueError('Cannot perform a swap that would '
                             'end up with all data on a single key')

        # Nothing to move in either direction.
        if len(kaxes) == 0 and len(vaxes) == 0:
            return self

        # Zero-dimensional values are temporarily promoted to 1-d (with a
        # matching trailing 1 in the shape) and restored after the move.
        scalar_values = self.values.ndim == 0
        if scalar_values:
            rdd = self._rdd.mapValues(lambda v: array(v, ndmin=1))
            shape = self._shape + (1, )
        else:
            rdd = self._rdd
            shape = self._shape

        # Function-scope import, kept where the original had it
        # (presumably to avoid a circular import -- TODO confirm).
        from bolt.spark.chunk import ChunkedArray

        chunked = ChunkedArray(rdd,
                               shape=shape,
                               split=self._split,
                               dtype=self._dtype)

        # Chunk along the value axes being moved, then exchange the axes.
        chunks = chunked.chunk(size, axis=vaxes)
        barray = chunks.move(kaxes, vaxes)

        if scalar_values:
            # Undo the temporary promotion: squeeze each value and drop the
            # trailing shape entry that was appended above.
            barray._rdd = barray._rdd.mapValues(lambda v: v.squeeze())
            barray._shape = barray._shape[:-1]

        return barray