示例#1
0
    def chunk(self, size="150", axis=None):
        """
        Chunks records of a distributed array.

        Chunking breaks arrays into subarrays, using a specified
        size of chunks along each value dimension. Can alternatively
        specify an average chunk byte size (in megabytes) and the size of
        chunks (as ints) will be computed automatically.

        Parameters
        ----------
        size : tuple, int, or str, optional, default = "150"
            A string giving the size in megabytes, or a tuple with the size
            of chunks along each dimension.

        axis : int or tuple, optional, default = None
            One or more axes to chunk the array along; if None,
            all axes will be used.

        Returns
        -------
        ChunkedArray
        """
        # A string is a byte-size specification and is forwarded untouched;
        # anything else is normalized to a tuple. isinstance is used instead
        # of an exact type comparison so that str subclasses are also
        # recognized as size specifications.
        if not isinstance(size, str):
            size = tupleize(size)
        axis = tupleize(axis)

        # imported inside the method, as in the rest of this file
        from bolt.spark.chunk import ChunkedArray

        chunked = ChunkedArray(rdd=self._rdd,
                               shape=self._shape,
                               split=self._split,
                               dtype=self._dtype)
        return chunked.chunk(size, axis)
示例#2
0
文件: array.py 项目: andrewosh/bolt
    def chunk(self, size="150", axis=None):
        """
        Chunks records of a distributed array.

        Chunking breaks arrays into subarrays, using a specified
        number of chunks along each value dimension. Can alternatively
        specify an average chunk size (in megabytes) and the number of
        chunks will be computed automatically.

        Parameters
        ----------
        size : tuple, int, or str, optional, default = "150"
            A string giving the size in megabytes, or a tuple with the number
            of chunks along each dimension.

        axis : int or tuple, optional, default = None
            One or more axes to chunk the array along; if None,
            all axes will be used.

        Returns
        -------
        ChunkedArray
        """
        # Strings are size specifications and pass through unchanged; every
        # other input becomes a tuple. isinstance (not an exact type
        # comparison) keeps str subclasses on the string path as well.
        if not isinstance(size, str):
            size = tupleize(size)
        axis = tupleize(axis)

        # imported inside the method, as in the rest of this file
        from bolt.spark.chunk import ChunkedArray

        chunked = ChunkedArray(
            rdd=self._rdd,
            shape=self._shape,
            split=self._split,
            dtype=self._dtype,
        )
        return chunked.chunk(size, axis)
示例#3
0
    def swap(self, key_axes, value_axes, size=150):
        """
        Swap axes between the keys and the values.

        Parameters
        ----------
        key_axes : tuple or int
            Key axes taking part in the swap

        value_axes : tuple or int
            Value axes taking part in the swap

        size : optional, default = 150
            Forwarded unchanged to the Swapper

        Returns
        -------
        The array itself when no axes are given, otherwise a new
        array built with self._constructor

        Raises
        ------
        ValueError
            If all key axes are requested and no value axes are given
        """
        key_axes = tupleize(key_axes)
        value_axes = tupleize(value_axes)

        if len(key_axes) == self.keys.ndim and len(value_axes) == 0:
            raise ValueError('Cannot perform a swap that would '
                             'end up with all data on a single key')

        if len(key_axes) == 0 and len(value_axes) == 0:
            return self

        # zero-dimensional values are promoted to length-1 arrays for the swap
        scalar_values = self.values.ndim == 0
        if scalar_values:
            records = self._rdd.mapValues(lambda x: array(x, ndmin=1))
            value_shape = (1,)
        else:
            records = self._rdd
            value_shape = self.values.shape

        from bolt.spark.swap import Swapper, Dims

        key_dims = Dims(shape=self.keys.shape, axes=key_axes)
        value_dims = Dims(shape=value_shape, axes=value_axes)
        swapper = Swapper(key_dims, value_dims, self.dtype, size)

        records = swapper.extract(swapper.chunk(records))

        new_shape = swapper.getshape()
        new_split = self.split - len(key_axes) + len(value_axes)

        if scalar_values:
            # undo the promotion applied before the swap
            records = records.mapValues(lambda x: x.squeeze())
            new_shape = new_shape[:-1]

        return self._constructor(records, shape=tuple(new_shape), split=new_split)
示例#4
0
    def swap(self, key_axes, value_axes, size=150):
        """
        Exchange a set of key axes with a set of value axes.

        Parameters
        ----------
        key_axes : tuple or int
            Key axes taking part in the swap

        value_axes : tuple or int
            Value axes taking part in the swap

        size : optional, default = 150
            Handed to the Swapper as is

        Returns
        -------
        self if both axis sets are empty, otherwise the result
        of self._constructor

        Raises
        ------
        ValueError
            If all key axes are requested and no value axes are given
        """
        key_axes = tupleize(key_axes)
        value_axes = tupleize(value_axes)

        if len(key_axes) == self.keys.ndim and len(value_axes) == 0:
            raise ValueError('Cannot perform a swap that would '
                             'end up with all data on a single key')

        if len(key_axes) == 0 and len(value_axes) == 0:
            return self

        # values without dimensions get a temporary axis of length 1
        promoted = self.values.ndim == 0
        if promoted:
            source = self._rdd.mapValues(lambda x: array(x, ndmin=1))
            value_shape = (1,)
        else:
            source = self._rdd
            value_shape = self.values.shape

        from bolt.spark.swap import Swapper, Dims

        swapper = Swapper(Dims(shape=self.keys.shape, axes=key_axes),
                          Dims(shape=value_shape, axes=value_axes),
                          self.dtype,
                          size)

        chunked = swapper.chunk(source)
        swapped = swapper.extract(chunked)

        out_shape = swapper.getshape()
        out_split = self.split - len(key_axes) + len(value_axes)

        if promoted:
            # drop the temporary axis again
            swapped = swapped.mapValues(lambda x: x.squeeze())
            out_shape = out_shape[:-1]

        return self._constructor(swapped, shape=tuple(out_shape), split=out_split)
示例#5
0
文件: array.py 项目: andrewosh/bolt
    def swap(self, kaxes, vaxes, size="150"):
        """
        Swap axes from keys to values.

        This is the core operation underlying shape manipulation
        on the Spark bolt array. It exchanges an arbitrary set of axes
        between the keys and the values. If either is None, will only
        move axes in one direction (from keys to values, or values to keys).
        Keys moved to values will be placed immediately after the split;
        values moved to keys will be placed immediately before the split.

        Parameters
        ----------
        kaxes : tuple
            Axes from keys to move to values

        vaxes : tuple
            Axes from values to move to keys

        size : tuple, int, or str, optional, default = "150"
            Can either provide a string giving the size in megabytes,
            or a tuple with the number of chunks along each
            value dimension being moved

        Returns
        -------
        BoltArraySpark

        Raises
        ------
        ValueError
            If all key axes are requested and no value axes are given
        """
        kaxes = asarray(tupleize(kaxes), 'int')
        vaxes = asarray(tupleize(vaxes), 'int')
        # strings are size specifications and pass through unchanged;
        # isinstance also keeps str subclasses on the string path
        if not isinstance(size, str):
            size = tupleize(size)

        if len(kaxes) == self.keys.ndim and len(vaxes) == 0:
            raise ValueError('Cannot perform a swap that would '
                             'end up with all data on a single key')

        if len(kaxes) == 0 and len(vaxes) == 0:
            return self

        # zero-dimensional values are promoted to length-1 arrays so that
        # they can be chunked; the extra axis is removed again at the end
        if self.values.ndim == 0:
            rdd = self._rdd.mapValues(lambda v: array(v, ndmin=1))
            shape = self._shape + (1,)
        else:
            rdd = self._rdd
            shape = self._shape

        from bolt.spark.chunk import ChunkedArray

        c = ChunkedArray(rdd, shape=shape, split=self._split, dtype=self._dtype)

        chunks = c.chunk(size, axis=vaxes)
        barray = chunks.move(kaxes, vaxes)

        if self.values.ndim == 0:
            # undo the promotion applied before chunking
            barray._rdd = barray._rdd.mapValues(lambda v: v.squeeze())
            barray._shape = barray._shape[:-1]

        return barray
示例#6
0
    def swap(self, kaxes, vaxes, size="150"):
        """
        Swap axes from keys to values.

        This is the core operation underlying shape manipulation
        on the Spark bolt array. It exchanges an arbitrary set of axes
        between the keys and the values. If either is None, will only
        move axes in one direction (from keys to values, or values to keys).
        Keys moved to values will be placed immediately after the split;
        values moved to keys will be placed immediately before the split.

        Parameters
        ----------
        kaxes : tuple
            Axes from keys to move to values

        vaxes : tuple
            Axes from values to move to keys

        size : tuple, int, or str, optional, default = "150"
            Can either provide a string giving the size in kilobytes,
            or a tuple with the number of chunks along each
            value dimension being moved

        Returns
        -------
        BoltArraySpark

        Raises
        ------
        ValueError
            If all key axes are requested and no value axes are given
        """
        kaxes = asarray(tupleize(kaxes), 'int')
        vaxes = asarray(tupleize(vaxes), 'int')
        # strings are size specifications and pass through unchanged;
        # isinstance also keeps str subclasses on the string path
        if not isinstance(size, str):
            size = tupleize(size)

        if len(kaxes) == self.keys.ndim and len(vaxes) == 0:
            raise ValueError('Cannot perform a swap that would '
                             'end up with all data on a single key')

        if len(kaxes) == 0 and len(vaxes) == 0:
            return self

        from bolt.spark.chunk import ChunkedArray

        c = ChunkedArray(self._rdd,
                         shape=self._shape,
                         split=self._split,
                         dtype=self._dtype)

        chunks = c._chunk(size, axis=vaxes)
        # the value axes are offset by len(kaxes) in the second step,
        # after keys_to_values has been applied for the key axes
        swapped = chunks.keys_to_values(kaxes).values_to_keys(
            [v + len(kaxes) for v in vaxes])
        barray = swapped.unchunk()

        return barray
示例#7
0
    def filter(self, func, axis=(0, ), sort=False):
        """
        Filter array along an axis.

        Keeps only the records whose value makes func evaluate to true.
        The array is first aligned so that the requested axes are in
        the keys, which may incur a swap. Surviving records are re-keyed
        with a single running index.

        Parameters
        ----------
        func : function
            Function to apply to each value, should return boolean

        axis : tuple or int, optional, default=(0,)
            Axis or multiple axes to filter along.

        sort: bool, optional, default=False
            Whether or not to sort by key before reindexing

        Returns
        -------
        BoltArraySpark
        """
        axis = tupleize(axis)
        aligned = self._align(axis)

        def keep(record):
            # records are (key, value) pairs; only the value is tested
            return func(record[1])

        kept = aligned._rdd.filter(keep)
        values = kept.sortByKey().values() if sort else kept.values()

        # the number of survivors is needed to build the new shape;
        # count explicitly when zip_with_index reports a falsy count
        count, indexed = zip_with_index(values)
        if not count:
            count = indexed.count()
        reindexed = indexed.map(lambda kv: (tupleize(kv[1]), kv[0]))

        # the filtered axes collapse into one leading dimension of length count
        trailing = list(aligned.shape[len(axis):])
        shape = tuple([count] + trailing) if count != 0 else (0, )

        result = self._constructor(reindexed, shape=shape, split=aligned.split)
        return result.__finalize__(aligned)
示例#8
0
文件: array.py 项目: anirband/bolt
    def map(self, func, axis=(0,), value_shape=None):
        """
        Apply a function across an axis.

        Array will be aligned so that the desired set of axes
        are in the keys, which may incur a swap.

        Parameters
        ----------
        func : function
            Function of a single array to apply

        axis : tuple or int, optional, default=(0,)
            Axis or multiple axes to apply function along.

        value_shape : tuple, optional, default=None
            Known shape of values resulting from operation. When omitted,
            it is inferred by applying func to a random array and, if that
            fails, to the first record.

        Returns
        -------
        BoltArraySpark
        """
        axis = tupleize(axis)
        aligned = self._align(axis)

        if value_shape is None:
            try:
                # probe func with random data shaped like a single value
                probe = random.randn(*aligned.values.shape).astype(self.dtype)
                value_shape = func(probe).shape
            except Exception:
                # fall back to evaluating func on the first actual record
                first = aligned._rdd.first()
                if first:
                    value_shape = func(first[1]).shape

        key_shape = tuple(aligned._shape[i] for i in range(len(axis)))
        shape = key_shape + tupleize(value_shape)

        def check(v):
            # reshaping will fail if the elements aren't uniformly shaped
            uniform = len(v.shape) == 0 or v.shape == tupleize(value_shape)
            if not uniform:
                raise Exception("Map operation did not produce values of uniform shape.")
            return v

        mapped = aligned._rdd.mapValues(func).mapValues(check)

        result = self._constructor(mapped, shape=shape, split=aligned.split)
        return result.__finalize__(aligned)
示例#9
0
文件: array.py 项目: gdtm86/bolt
    def filter(self, func, axis=(0,), sort=False):
        """
        Filter array along an axis.

        Records whose value makes func evaluate to true are kept.
        The array is aligned first so that the requested axes are in
        the keys, which may incur a swap, and the surviving records
        are re-keyed with a single running index.

        Parameters
        ----------
        func : function
            Function to apply to each value, should return boolean

        axis : tuple or int, optional, default=(0,)
            Axis or multiple axes to filter along.

        sort: bool, optional, default=False
            Whether or not to sort by key before reindexing

        Returns
        -------
        BoltArraySpark
        """
        axis = tupleize(axis)
        aligned = self._align(axis)

        def value_passes(record):
            # records are (key, value) pairs; only the value is tested
            return func(record[1])

        survivors = aligned._rdd.filter(value_passes)
        if sort:
            survivors = survivors.sortByKey()
        values = survivors.values()

        # the count is needed for the new shape; fall back to an explicit
        # count when zip_with_index reports a falsy one
        count, indexed = zip_with_index(values)
        if not count:
            count = indexed.count()
        reindexed = indexed.map(lambda kv: (tupleize(kv[1]), kv[0]))

        # the filtered axes collapse into one leading dimension of length count
        trailing = list(aligned.shape[len(axis):])
        if count == 0:
            shape = (0,)
        else:
            shape = tuple([count] + trailing)

        built = self._constructor(reindexed, shape=shape, split=aligned.split)
        return built.__finalize__(aligned)
示例#10
0
    def swap(self, kaxes, vaxes, size="150"):
        """
        Swap axes from keys to values.

        This is the core operation underlying shape manipulation
        on the Spark bolt array. It exchanges an arbitrary set of axes
        between the keys and the values. If either is None, will only
        move axes in one direction (from keys to values, or values to keys).
        Keys moved to values will be placed immediately after the split;
        values moved to keys will be placed immediately before the split.

        Parameters
        ----------
        kaxes : tuple
            Axes from keys to move to values

        vaxes : tuple
            Axes from values to move to keys

        size : tuple, int, or str, optional, default = "150"
            Can either provide a string giving the size in kilobytes,
            or a tuple with the number of chunks along each
            value dimension being moved

        Returns
        -------
        BoltArraySpark

        Raises
        ------
        ValueError
            If all key axes are requested and no value axes are given
        """
        kaxes = asarray(tupleize(kaxes), 'int')
        vaxes = asarray(tupleize(vaxes), 'int')
        # strings are size specifications and pass through unchanged;
        # isinstance also keeps str subclasses on the string path
        if not isinstance(size, str):
            size = tupleize(size)

        if len(kaxes) == self.keys.ndim and len(vaxes) == 0:
            raise ValueError('Cannot perform a swap that would '
                             'end up with all data on a single key')

        if len(kaxes) == 0 and len(vaxes) == 0:
            return self

        # chunking is delegated to self.chunk, so no ChunkedArray import
        # is needed in this method
        chunks = self.chunk(size)

        # the value axes are offset by len(kaxes) in the second step,
        # after keys_to_values has been applied for the key axes
        swapped = chunks.keys_to_values(kaxes).values_to_keys(
            [v + len(kaxes) for v in vaxes])
        barray = swapped.unchunk()

        return barray
示例#11
0
    def __getitem__(self, index):
        """
        Get an item from the array through indexing.

        Supports basic indexing with slices and ints, or advanced
        indexing with lists, sets or ndarrays. Mixing basic and advanced
        indexing across axes is not supported, except that ints are
        promoted to single-element lists when any advanced index is present.

        Parameters
        ----------
        index : slice, int, list, set, ndarray, or a tuple of these
            One index specification per axis; missing trailing axes
            are filled with full slices

        Returns
        -------
        A scalar taken from the collected array when every axis is indexed
        by an int, otherwise the indexed array with the int-indexed
        axes squeezed out

        Raises
        ------
        ValueError
            If more indices than dimensions are given, or an index has
            an unsupported type

        NotImplementedError
            If basic and advanced indices are mixed across axes
        """
        index = tupleize(index)

        if len(index) > self.ndim:
            raise ValueError("Too many indices for array")

        if not all(isinstance(i, (slice, int, list, set, ndarray)) for i in index):
            raise ValueError("Each index must either be a slice, int, list, set, or ndarray")

        # fill unspecified axes with full slices
        if len(index) < self.ndim:
            index += tuple(slice(0, None, None) for _ in range(self.ndim - len(index)))

        # convert ints to lists if not all ints and slices
        if not all(isinstance(i, (int, slice)) for i in index):
            index = tuple([i] if isinstance(i, int) else i for i in index)

        # select basic or advanced indexing
        if all(isinstance(i, (slice, int)) for i in index):
            rdd, shape, split = self._getbasic(index)
        elif all(isinstance(i, (set, list, ndarray)) for i in index):
            rdd, shape, split = self._getadvanced(index)
        else:
            raise NotImplementedError("Cannot mix basic indexing (slices and ints) with "
                                      "advanced indexing (lists and ndarrays) across axes")

        result = self._constructor(rdd, shape=shape, split=split).__finalize__(self)

        # squeeze out int dimensions (and squeeze to singletons if all ints)
        if all(isinstance(i, int) for i in index):
            return result.squeeze().toarray()[()]

        # squeeze takes axis positions, so collect the positions of the int
        # indices rather than the index values themselves
        tosqueeze = tuple(pos for pos, i in enumerate(index) if isinstance(i, int))
        return result.squeeze(tosqueeze)
示例#12
0
    def reduce(self, func, axis=(0,)):
        """
        Reduce an array along an axis.

        Applies a function of two arguments
        cumulatively to all arrays along an axis.

        Parameters
        ----------
        func : function
            Function of two arrays that returns a single array

        axis : tuple or int, optional, default=(0,)
            Axis or multiple axes to reduce along.

        Returns
        -------
        BoltArrayLocal, or a scalar when the reduction yields a
        non-array or an array of shape (1,)
        """
        from bolt.local.array import BoltArrayLocal
        from numpy import ndarray

        aligned = self._align(tupleize(axis))
        reduced = aligned._rdd.values().reduce(func)

        if isinstance(reduced, ndarray):
            if reduced.shape == (1,):
                # ndarrays with single values in them should be converted into scalars
                return reduced[0]
            return BoltArrayLocal(reduced)

        # the result of a reduce can also be a scalar
        return reduced
示例#13
0
    def reduce(self, func, axis=(0, )):
        """
        Reduce an array along an axis.

        Applies a function of two arguments
        cumulatively to all arrays along an axis.

        Parameters
        ----------
        func : function
            Function of two arrays that returns a single array

        axis : tuple or int, optional, default=(0,)
            Axis or multiple axes to reduce along.

        Returns
        -------
        BoltArrayLocal, or a scalar when the reduction yields a
        non-array or an array of shape (1,)
        """
        from bolt.local.array import BoltArrayLocal
        from numpy import ndarray

        axis = tupleize(axis)
        values = self._align(axis)._rdd.values()
        outcome = values.reduce(func)

        # the result of a reduce can also be a scalar
        is_array = isinstance(outcome, ndarray)
        if not is_array:
            return outcome

        # ndarrays with single values in them should be converted into scalars
        return outcome[0] if outcome.shape == (1, ) else BoltArrayLocal(outcome)
示例#14
0
    def sample(self, n=100, seed=None):
        """
        Extract random sample of records.

        Parameters
        ----------
        n : int, optional, default = 100
            The number of data points to sample.

        seed : int, optional, default = None
            Random seed; drawn at random when None. It is only passed on
            in spark mode (to takeSample).

        Raises
        ------
        ValueError
            If n is smaller than 1
        """
        if n < 1:
            raise ValueError("Number of samples must be larger than 0, got '%g'" % n)

        if seed is None:
            seed = random.randint(0, 2 ** 32)

        if self.mode == 'spark':
            # sample without replacement
            drawn = self.values.tordd().values().takeSample(False, n, seed)
            result = asarray(drawn)
        else:
            basedims = [self.shape[d] for d in self.baseaxes]
            # n uniform draws scaled to the number of base records; each is
            # truncated to a linear index and unravelled over the base dims
            scaled = random.rand(n) * prod(basedims)
            inds = [unravel_index(int(k), basedims) for k in scaled]
            full = (slice(None, None),)
            result = asarray([self.values[tupleize(i) + full] for i in inds])

        return self._constructor(result, index=self.index)
示例#15
0
    def __getitem__(self, item):
        """
        Index the values and, when present, the labels.

        Int indices are converted with slicify so that no dimensions
        are dropped. The result is built with self._constructor and
        finalized from self without propagating 'index' and 'labels';
        labels are re-indexed separately over the base axes.

        Parameters
        ----------
        item : int, slice, list, ndarray, or tuple of these
            Index specification
        """
        # handle values -- convert ints to slices so no dimensions are dropped
        if isinstance(item, int):
            item = tuple([slicify(item, self.shape[0])])
        if isinstance(item, tuple):
            item = tuple([slicify(i, n) if isinstance(i, int) else i for i, n in zip(item, self.shape[:len(item)])])
        if isinstance(item, (list, ndarray)):
            item = (item,)
        new = self._values.__getitem__(item)
        result = self._constructor(new).__finalize__(self, noprop=('index', 'labels'))

        # handle labels
        if self.labels is not None:
            if isinstance(item, int):
                label_item = ([item],)
            elif isinstance(item, (list, ndarray, slice)):
                label_item = (item, )
            elif isinstance(item, tuple):
                # labels only span the base axes
                label_item = item[:len(self.baseaxes)]
            newlabels = self.labels
            for (i, s) in enumerate(label_item):
                if isinstance(s, slice):
                    # apply the slice on axis i only; the index must be a
                    # tuple, because newer NumPy releases no longer treat
                    # a list of slices as a multidimensional index
                    selector = tuple(s if j == i else slice(None) for j in range(len(label_item)))
                    newlabels = newlabels[selector]
                else:
                    newlabels = newlabels.take(tupleize(s), i)
            result.labels = newlabels

        return result
示例#16
0
    def map(self, func, axis=(0, )):
        """
        Apply a function across an axis.

        Array will be aligned so that the desired set of axes
        are in the keys, which may require a transpose/reshape.

        Parameters
        ----------
        func : function
            Function of a single array to apply

        axis : tuple or int, optional, default=(0,)
            Axis or multiple axes to apply function along.

        Returns
        -------
        BoltArrayLocal
        """
        axes = sorted(tupleize(axis))
        key_shape = [self.shape[ax] for ax in axes]
        aligned = self._align(axes, key_shape=key_shape)

        results = asarray(list(map(func, aligned)))

        # restore the key dimensions in front of the shape of a mapped element
        value_shape = list(results[0].shape)
        restored = results.reshape(*(key_shape + value_shape))

        return self._constructor(restored)
示例#17
0
    def filter(self, func, axis=(0, )):
        """
        Filter array along an axis.

        Applies a function which should evaluate to boolean,
        along a single axis or multiple axes. Array will be
        aligned so that the desired set of axes are in the
        keys, which may require a transpose/reshape.

        Parameters
        ----------
        func : function
            Function to apply, should return boolean

        axis : tuple or int, optional, default=(0,)
            Axis or multiple axes to filter along.

        Returns
        -------
        BoltArrayLocal
        """
        aligned = self._align(sorted(tupleize(axis)))
        kept = list(filter(func, aligned))
        return self._constructor(asarray(kept))
示例#18
0
文件: base.py 项目: gdtm86/thunder
    def __getitem__(self, item):
        """
        Index the values and, when present, the labels.

        Int indices are converted with slicify so that no dimensions
        are dropped. The result is built with self._constructor and
        finalized from self without propagating 'index' and 'labels';
        labels are re-indexed separately over the base axes.

        Parameters
        ----------
        item : int, slice, list, ndarray, or tuple of these
            Index specification
        """
        # handle values -- convert ints to slices so no dimensions are dropped
        if isinstance(item, int):
            item = tuple([slicify(item, self.shape[0])])
        if isinstance(item, tuple):
            item = tuple([slicify(i, n) if isinstance(i, int) else i for i, n in zip(item, self.shape[:len(item)])])
        if isinstance(item, (list, ndarray)):
            item = (item,)
        new = self._values.__getitem__(item)
        result = self._constructor(new).__finalize__(self, noprop=('index', 'labels'))

        # handle labels
        if self.labels is not None:
            if isinstance(item, int):
                label_item = ([item],)
            elif isinstance(item, (list, ndarray, slice)):
                label_item = (item, )
            elif isinstance(item, tuple):
                # labels only span the base axes
                label_item = item[:len(self.baseaxes)]
            newlabels = self.labels
            for (i, s) in enumerate(label_item):
                if isinstance(s, slice):
                    # slice axis i and keep every other axis whole; a tuple
                    # is required here, since newer NumPy releases reject a
                    # list of slices as a multidimensional index
                    selector = tuple(s if j == i else slice(None) for j in range(len(label_item)))
                    newlabels = newlabels[selector]
                else:
                    newlabels = newlabels.take(tupleize(s), i)
            result.labels = newlabels

        return result
示例#19
0
    def sample(self, n=100, seed=None):
        """
        Extract random sample of records.

        Parameters
        ----------
        n : int, optional, default = 100
            The number of data points to sample.

        seed : int, optional, default = None
            Random seed; drawn at random when None. It is only passed on
            in spark mode (to takeSample).

        Raises
        ------
        ValueError
            If n is smaller than 1
        """
        if n < 1:
            raise ValueError(
                "Number of samples must be larger than 0, got '%g'" % n)

        if seed is None:
            seed = random.randint(0, 2**32)

        if self.mode == 'spark':
            # sample without replacement
            records = self.values.tordd().values()
            result = asarray(records.takeSample(False, n, seed))
        else:
            basedims = [self.shape[d] for d in self.baseaxes]
            # n uniform draws scaled to the number of base records; each is
            # truncated to a linear index and unravelled over the base dims
            draws = random.rand(n) * prod(basedims)
            inds = [unravel_index(int(k), basedims) for k in draws]
            picked = []
            for i in inds:
                picked.append(self.values[tupleize(i) + (slice(None, None), )])
            result = asarray(picked)

        return self._constructor(result, index=self.index)
示例#20
0
文件: base.py 项目: lnicalo/thunder
    def __getitem__(self, item):
        """
        Index the values and, when present, the labels.

        Int indices are converted to length-1 slices so that no
        dimensions are dropped. The result is built with
        self._constructor and finalized from self without propagating
        'index' and 'labels'; labels are re-indexed separately.

        Parameters
        ----------
        item : int, slice, list, ndarray, or tuple of these
            Index specification
        """
        def as_slice(i):
            # for i == -1 the stop i + 1 would be 0 and give an empty
            # slice, so the stop is left open to select the last element
            return slice(i, i + 1 if i != -1 else None, None)

        # handle values
        if isinstance(item, int):
            item = as_slice(item)
        if isinstance(item, tuple):
            item = tuple([as_slice(i) if isinstance(i, int) else i for i in item])
        if isinstance(item, (list, ndarray)):
            item = (item,)
        new = self._values.__getitem__(item)
        result = self._constructor(new).__finalize__(self, noprop=('index', 'labels'))

        # handle labels
        if self.labels is not None:
            if isinstance(item, int):
                label_item = ([item],)
            elif isinstance(item, (list, ndarray, slice)):
                label_item = (item, )
            elif isinstance(item, tuple):
                # labels only span the base axes
                label_item = item[:len(self.baseaxes)]
            newlabels = self.labels
            for (i, s) in enumerate(label_item):
                if isinstance(s, slice):
                    # slice axis i and keep every other axis whole; a tuple
                    # is required here, since newer NumPy releases reject a
                    # list of slices as a multidimensional index
                    selector = tuple(s if j == i else slice(None) for j in range(len(label_item)))
                    newlabels = newlabels[selector]
                else:
                    newlabels = newlabels.take(tupleize(s), i)
            result.labels = newlabels

        return result
示例#21
0
    def reduce(self, func, axis=0):
        """
        Reduce an array along an axis.

        When func is a ufunc, its own reduce is applied over all
        requested axes at once; otherwise the array is aligned and
        func is folded over it with reduce.

        Parameters
        ----------
        func : function
            Function of two arguments, or a ufunc

        axis : tuple or int, optional, default=0
            Axis or multiple axes to reduce along.

        Raises
        ------
        ValueError
            If the shape of the result is not the original shape with
            the reduced axes removed
        """
        axes = sorted(tupleize(axis))

        if isinstance(func, ufunc):
            # a ufunc can reduce over multiple axes by itself
            inshape(self.shape, axes)  # presumably validates axes against the shape -- confirm
            reduced = func.reduce(self, axis=tuple(axes))
        else:
            reduced = reduce(func, self._align(axes))

        result = self._constructor(reduced)

        # ensure that the shape of the reduced array is valid
        expected = tuple(dim for i, dim in enumerate(self.shape) if i not in axes)
        if result.shape != expected:
            raise ValueError(
                "reduce did not yield a BoltArray with valid dimensions")

        return result
示例#22
0
文件: array.py 项目: andrewosh/bolt
    def map(self, func, axis=(0,)):
        """
        Apply a function across an axis.

        Array will be aligned so that the desired set of axes
        are in the keys, which may require a transpose/reshape.

        Parameters
        ----------
        func : function
            Function of a single array to apply

        axis : tuple or int, optional, default=(0,)
            Axis or multiple axes to apply function along.

        Returns
        -------
        BoltArrayLocal
        """
        axes = sorted(tupleize(axis))
        key_shape = [self.shape[ax] for ax in axes]
        aligned = self._align(axes, key_shape=key_shape)

        outputs = [func(element) for element in aligned]
        stacked = asarray(outputs)

        # put the key dimensions back in front of the shape of one mapped element
        target_shape = key_shape + list(stacked[0].shape)
        return self._constructor(stacked.reshape(*target_shape))
示例#23
0
文件: array.py 项目: andrewosh/bolt
    def filter(self, func, axis=(0,)):
        """
        Filter array along an axis.

        Applies a function which should evaluate to boolean,
        along a single axis or multiple axes. Array will be
        aligned so that the desired set of axes are in the
        keys, which may require a transpose/reshape.

        Parameters
        ----------
        func : function
            Function to apply, should return boolean

        axis : tuple or int, optional, default=(0,)
            Axis or multiple axes to filter along.

        Returns
        -------
        BoltArrayLocal
        """
        axes = sorted(tupleize(axis))
        aligned = self._align(axes)

        survivors = filter(func, aligned)
        collected = asarray(list(survivors))

        return self._constructor(collected)
示例#24
0
文件: base.py 项目: gdtm86/thunder
    def map(self, func, value_shape=None, dtype=None, with_keys=False):
        """
        Apply an array -> array function across the base axes.

        The function is always applied along self.baseaxes. In local
        mode the array will be aligned so that those axes are in the
        keys, which may require a transpose/reshape.

        Parameters
        ----------
        func : function
            Function of a single array to apply. If with_keys=True,
            function should be of a (tuple, array) pair.

        value_shape : tuple, optional, default=None
            Known shape of values resulting from operation. Only
            valid in spark mode.

        dtype: numpy.dtype, optional, default=None
            Known dtype of values resulting from operation. Only
            valid in spark mode.

        with_keys : bool, optional, default=False
            Include keys as an argument to the function
        """
        axis = self.baseaxes

        if self.mode == 'local':
            axes = sorted(tupleize(axis))
            key_shape = [self.shape[ax] for ax in axes]
            reshaped = self._align(axes, key_shape=key_shape)

            if with_keys:
                # pair every aligned element with its multi-dimensional key
                keys = zip(*unravel_index(range(prod(key_shape)), key_shape))
                mapped = asarray(list(map(func, zip(keys, reshaped))))
            else:
                mapped = asarray(list(map(func, reshaped)))

            # fall back to a single-element shape when the first result
            # cannot be inspected
            try:
                elem_shape = mapped[0].shape
            except Exception:
                elem_shape = (1,)

            # scalar results get a trailing dimension of length 1
            expand = list(elem_shape)
            expand = [1] if len(expand) == 0 else expand

            # invert the previous reshape operation, using the shape of the map result
            linearized_shape_inv = key_shape + expand
            reordered = mapped.reshape(*linearized_shape_inv)

            # noprop must be a tuple of names: ('index') is just the string 'index'
            return self._constructor(reordered, mode=self.mode).__finalize__(self, noprop=('index',))

        if self.mode == 'spark':
            # make sure every result is at least one-dimensional
            expand = lambda x: array(func(x), ndmin=1)
            mapped = self.values.map(expand, axis, value_shape, dtype, with_keys)
            return self._constructor(mapped, mode=self.mode).__finalize__(self, noprop=('index',))
示例#25
0
    def filter(self, func, axis=0):
        """
        Filter array along an axis.

        The array is aligned on the sorted axes and only the elements
        for which func evaluates to true are kept.

        Parameters
        ----------
        func : function
            Function to apply, should return boolean

        axis : tuple or int, optional, default=0
            Axis or multiple axes to filter along.
        """
        aligned = self._align(sorted(tupleize(axis)))
        kept = list(filter(func, aligned))
        return self._constructor(asarray(kept))
示例#26
0
    def filter(self, func, axis=0):
        """
        Filter array along an axis.

        Aligns the array on the sorted axes, keeps the elements for
        which func evaluates to true and wraps them in a new array.

        Parameters
        ----------
        func : function
            Function to apply, should return boolean

        axis : tuple or int, optional, default=0
            Axis or multiple axes to filter along.
        """
        axes = sorted(tupleize(axis))
        aligned = self._align(axes)

        survivors = filter(func, aligned)
        collected = asarray(list(survivors))

        return self._constructor(collected)
示例#27
0
    def __getitem__(self, index):
        """
        Get an item from the array through indexing.

        Supports basic indexing with slices and ints, or advanced
        indexing with lists, sets, or ndarrays of integers.
        Mixing basic and advanced indexing across axes is not
        currently supported.

        Parameters
        ----------
        index : tuple of slices, ints, list, sets, or ndarrays
            One or more index specifications. Axes left unspecified
            are filled with full slices.

        Returns
        -------
        BoltSparkArray
            When every axis is indexed by an int, the result is instead
            squeezed, converted with ``toarray`` and indexed with ``()``.

        Raises
        ------
        ValueError
            If more indices than dimensions are given, or an index has
            an unsupported type.
        NotImplementedError
            If slices are mixed with lists, sets, or ndarrays.
        """
        index = tupleize(index)

        if len(index) > self.ndim:
            raise ValueError("Too many indices for array")

        if not all(
                isinstance(i, (slice, int, list, set, ndarray)) for i in index):
            raise ValueError(
                "Each index must either be a slice, int, list, set, or ndarray"
            )

        # fill unspecified axes with full slices
        if len(index) < self.ndim:
            index += tuple(
                slice(0, None, None) for _ in range(self.ndim - len(index)))

        # convert ints to lists if not all ints and slices
        if not all(isinstance(i, (int, slice)) for i in index):
            index = tuple([i] if isinstance(i, int) else i for i in index)

        # select basic or advanced indexing
        if all(isinstance(i, (slice, int)) for i in index):
            rdd, shape, split = self._getbasic(index)
        elif all(isinstance(i, (set, list, ndarray)) for i in index):
            rdd, shape, split = self._getadvanced(index)
        else:
            raise NotImplementedError(
                "Cannot mix basic indexing (slices and ints) with "
                "advanced indexing (lists and ndarrays) across axes")

        result = self._constructor(rdd, shape=shape,
                                   split=split).__finalize__(self)

        # squeeze out int dimensions (and squeeze to singletons if all ints)
        if all(isinstance(i, int) for i in index):
            return result.squeeze().toarray()[()]

        # squeeze expects axis positions, so record where the ints sit
        # in the index rather than the int values themselves
        tosqueeze = tuple(
            pos for pos, i in enumerate(index) if isinstance(i, int))
        return result.squeeze(tosqueeze)
示例#28
0
    def _map(self, func, axis=(0,), value_shape=None, dtype=None, with_keys=False):
        """
        Apply an array -> array function across an axis.

        Array will be aligned so that the desired set of axes
        are in the keys, which may require a transpose/reshape.

        Parameters
        ----------
        func : function
            Function of a single array to apply. If with_keys=True,
            function should be of a (tuple, array) pair.

        axis : tuple or int, optional, default=(0,)
            Axis or multiple axes to apply function along.

        value_shape : tuple, optional, default=None
            Known shape of values resulting from operation. Only
            valid in spark mode.

        dtype: numpy.dtype, optional, default=None
            Known dtype of values resulting from operation. Only
            valid in spark mode.

        with_keys : bool, optional, default=False
            Include keys as an argument to the function

        Returns
        -------
        Object built by ``self._constructor`` in the same mode, with
        metadata propagated by ``__finalize__`` except for ``index``.
        Nothing is returned when ``self.mode`` is neither ``'local'``
        nor ``'spark'``.
        """
        if self.mode == 'local':
            axes = sorted(tupleize(axis))
            key_shape = [self.shape[ax] for ax in axes]
            reshaped = self._align(axes, key_shape=key_shape)

            if with_keys:
                # pair every record with its multi-dimensional key
                keys = zip(*unravel_index(range(prod(key_shape)), key_shape))
                mapped = asarray(list(map(func, zip(keys, reshaped))))
            else:
                mapped = asarray(list(map(func, reshaped)))

            # fall back to a single trailing dimension when the first
            # result cannot report a shape (e.g. nothing was mapped)
            try:
                elem_shape = mapped[0].shape
            except Exception:
                elem_shape = (1,)

            # scalar results have an empty shape; give them one dimension
            expand = list(elem_shape) or [1]

            # invert the previous reshape operation, using the shape of the map result
            linearized_shape_inv = key_shape + expand
            reordered = mapped.reshape(*linearized_shape_inv)

            # noprop must be a tuple of names, matching the spark branch below
            return self._constructor(reordered, mode=self.mode).__finalize__(self, noprop=('index',))

        if self.mode == 'spark':
            # force every result to be at least one-dimensional
            expand = lambda x: array(func(x), ndmin=1)
            mapped = self.values.map(expand, axis, value_shape, dtype, with_keys)
            return self._constructor(mapped, mode=self.mode).__finalize__(self, noprop=('index',))
示例#29
0
文件: array.py 项目: anirband/bolt
    def filter(self, func, axis=(0,)):
        """
        Filter array along an axis.

        Applies a boolean-valued function to every value after the
        array has been aligned so that the requested axis is in the
        keys (which may incur a swap), then re-keys the surviving
        values with consecutive indices.

        Parameters
        ----------
        func : function
            Function to apply, should return boolean

        axis : tuple or int, optional, default=(0,)
            Axis to filter along. Exactly one axis is accepted.

        Returns
        -------
        BoltArraySpark

        Raises
        ------
        NotImplementedError
            If more or fewer than one axis is given.
        """
        axis = tupleize(axis)
        if len(axis) != 1:
            raise NotImplementedError("Filtering over multiple axes will not be "
                                      "supported until SparseBoltArray is implemented.")

        aligned = self._align(axis)
        survivors = aligned._rdd.values().filter(func)

        # index the survivors so that fresh, linear keys can be assigned
        count, indexed = zip_with_index(survivors)
        if not count:
            count = indexed.count()

        def rekey(pair):
            # records arrive as (value, position); emit (key tuple, value)
            return tupleize(pair[1]), pair[0]

        reindexed = indexed.map(rekey)

        # only one axis is filtered, so every dimension after the first survives
        if count == 0:
            shape = (0,)
        else:
            shape = (count,) + tuple(aligned.shape[1:])

        rebuilt = self._constructor(reindexed, shape=shape, split=aligned.split)
        return rebuilt.__finalize__(aligned)
示例#30
0
    def chunk(self, size="150", axis=None, padding=None):
        """
        Chunks records of a distributed array.

        Chunking breaks arrays into subarrays, using a specified
        size of chunks along each value dimension. Can alternatively
        specify an average chunk byte size (in kilobytes) and the size of
        chunks (as ints) will be computed automatically.

        Parameters
        ----------
        size : tuple, int, or str, optional, default = "150"
            A string giving the size in kilobytes, or a tuple with the size
            of chunks along each dimension. Strings are passed through
            untouched; anything else is normalized with ``tupleize``.

        axis : int or tuple, optional, default = None
            One or more axes to chunk the array along; if None,
            all axes are used.

        padding : tuple or int, default = None
            Number of elements per dimension that will overlap with the
            adjacent chunk. If a tuple, specifies padding along each chunked
            dimension; if an int, the same padding is applied to all
            chunked dimensions.

        Returns
        -------
        ChunkedArray
        """
        # a str size is forwarded as-is; everything else becomes a tuple
        size = size if type(size) is str else tupleize(size)
        axis, padding = tupleize(axis), tupleize(padding)

        from bolt.spark.chunk import ChunkedArray

        unchunked = ChunkedArray(
            rdd=self._rdd,
            shape=self._shape,
            split=self._split,
            dtype=self._dtype)
        return unchunked._chunk(size, axis, padding)
示例#31
0
    def _stat(self, axis=None, func=None, name=None, keepdims=False):
        """
        Compute a statistic over an axis.

        Can provide either a function (for use in a reduce)
        or a name (for use by a stat counter).

        Parameters
        ----------
        axis : tuple or int, optional, default=None
            Axis to compute statistic over, if None
            will compute over all axes

        func : function, optional, default=None
            Function for reduce, see BoltArraySpark.reduce

        name : str
            A named statistic, see StatCounter

        keepdims : boolean, optional, default=False
            Keep axis remaining after operation with size 1.

        Raises
        ------
        ValueError
            If both or neither of ``func`` and ``name`` are given.
        """
        if axis is None:
            axis = list(range(len(self.shape)))
        axis = tupleize(axis)

        if func and not name:
            return self.reduce(func, axis, keepdims)

        if name and not func:
            from bolt.local.array import BoltArrayLocal

            swapped = self._align(axis)

            def reducer(left, right):
                return left.combine(right)

            # one counter per partition, merged pairwise into a single counter
            counter = swapped._rdd.values()\
                             .mapPartitions(lambda i: [StatCounter(values=i, stats=name)])\
                             .treeReduce(reducer, depth=3)

            arr = getattr(counter, name)

            if keepdims:
                # re-insert axes in ascending order so that every position
                # already exists in the array when it is requested
                for i in sorted(axis):
                    arr = expand_dims(arr, axis=i)

            return BoltArrayLocal(arr).toscalar()

        raise ValueError(
            'Must specify either a function or a statistic name.')
示例#32
0
文件: array.py 项目: andrewosh/bolt
    def __getitem__(self, index):
        """
        Get an item from the array through indexing.

        Supports basic indexing with slices and ints, or advanced
        indexing with lists, sets, or ndarrays of integers.
        Mixing basic and advanced indexing across axes is not
        currently supported.

        Parameters
        ----------
        index : tuple of slices, ints, list, sets, or ndarrays
            One or more index specifications. Axes left unspecified
            are filled with full slices.

        Returns
        -------
        BoltSparkArray
            When every axis is indexed by an int, the result is instead
            squeezed, converted with ``toarray`` and indexed with ``()``.

        Raises
        ------
        ValueError
            If more indices than dimensions are given, or an index has
            an unsupported type.
        NotImplementedError
            If slices are mixed with lists, sets, or ndarrays.
        """
        index = tupleize(index)

        if len(index) > self.ndim:
            raise ValueError("Too many indices for array")

        if not all(isinstance(i, (slice, int, list, set, ndarray)) for i in index):
            raise ValueError("Each index must either be a slice, int, list, set, or ndarray")

        # fill unspecified axes with full slices
        if len(index) < self.ndim:
            index += tuple(slice(0, None, None) for _ in range(self.ndim - len(index)))

        # convert ints to lists if not all ints and slices
        if not all(isinstance(i, (int, slice)) for i in index):
            index = tuple([i] if isinstance(i, int) else i for i in index)

        # select basic or advanced indexing
        if all(isinstance(i, (slice, int)) for i in index):
            rdd, shape, split = self._getbasic(index)
        elif all(isinstance(i, (set, list, ndarray)) for i in index):
            rdd, shape, split = self._getadvanced(index)
        else:
            raise NotImplementedError("Cannot mix basic indexing (slices and ints) with "
                                      "advanced indexing (lists and ndarrays) across axes")

        result = self._constructor(rdd, shape=shape, split=split).__finalize__(self)

        # squeeze out int dimensions (and squeeze to singletons if all ints)
        if all(isinstance(i, int) for i in index):
            return result.squeeze().toarray()[()]

        # squeeze expects axis positions, so record where the ints sit
        # in the index rather than the int values themselves
        tosqueeze = tuple(pos for pos, i in enumerate(index) if isinstance(i, int))
        return result.squeeze(tosqueeze)
示例#33
0
    def _stat(self, axis=None, func=None, name=None, keepdims=False):
        """
        Compute a statistic over an axis.

        Can provide either a function (for use in a reduce)
        or a name (for use by a stat counter).

        Parameters
        ----------
        axis : tuple or int, optional, default=None
            Axis to compute statistic over, if None
            will compute over all axes

        func : function, optional, default=None
            Function for reduce, see BoltArraySpark.reduce

        name : str
            A named statistic, see StatCounter

        keepdims : boolean, optional, default=False
            Keep axis remaining after operation with size 1.

        Raises
        ------
        ValueError
            If both or neither of ``func`` and ``name`` are given.
        """
        if axis is None:
            axis = list(range(len(self.shape)))
        axis = tupleize(axis)

        if func and not name:
            return self.reduce(func, axis, keepdims)

        if name and not func:
            from bolt.local.array import BoltArrayLocal

            swapped = self._align(axis)

            def reducer(left, right):
                return left.combine(right)

            # one counter per partition, merged pairwise into a single counter
            counter = swapped._rdd.values()\
                             .mapPartitions(lambda i: [StatCounter(values=i, stats=name)])\
                             .reduce(reducer)

            arr = getattr(counter, name)

            if keepdims:
                # re-insert axes in ascending order so that every position
                # already exists in the array when it is requested
                for i in sorted(axis):
                    arr = expand_dims(arr, axis=i)

            return BoltArrayLocal(arr).toscalar()

        raise ValueError('Must specify either a function or a statistic name.')
示例#34
0
文件: array.py 项目: gdtm86/bolt
    def chunk(self, size="150", axis=None, padding=None):
        """
        Chunks records of a distributed array.

        Chunking breaks arrays into subarrays, using a specified
        size of chunks along each value dimension. Can alternatively
        specify an average chunk byte size (in kilobytes) and the size of
        chunks (as ints) will be computed automatically.

        Parameters
        ----------
        size : tuple, int, or str, optional, default = "150"
            A string giving the size in kilobytes, or a tuple with the size
            of chunks along each dimension. Only non-string values are
            normalized with ``tupleize``.

        axis : int or tuple, optional, default = None
            One or more axes to chunk the array along; if None,
            all axes are used.

        padding : tuple or int, default = None
            Number of elements per dimension that will overlap with the
            adjacent chunk. If a tuple, specifies padding along each chunked
            dimension; if an int, the same padding is applied to all
            chunked dimensions.

        Returns
        -------
        ChunkedArray
        """
        if type(size) is not str:
            size = tupleize(size)
        chunk_axes = tupleize(axis)
        overlap = tupleize(padding)

        from bolt.spark.chunk import ChunkedArray

        # wrap the current records, then delegate the actual chunking
        source = dict(rdd=self._rdd, shape=self._shape, split=self._split, dtype=self._dtype)
        return ChunkedArray(**source)._chunk(size, chunk_axes, overlap)
示例#35
0
    def map(self, func, axis=0):
        """
        Apply a function to every entry along one or more axes.

        The array is aligned on the sorted axes, ``func`` is applied to
        each entry of the aligned array, and the results are reshaped so
        that the mapped axes lead and the shape of the first result
        trails.

        Parameters
        ----------
        func : function
            Function applied to each entry of the aligned array.

        axis : tuple or int, optional, default=0
            Axis or multiple axes to apply the function along.

        Returns
        -------
        Array built by ``self._constructor`` from the reshaped results.
        """
        axes = sorted(tupleize(axis))
        key_shape = [self.shape[ax] for ax in axes]
        aligned = self._align(axes, key_shape=key_shape)

        results = asarray(list(map(func, aligned)))

        # undo the alignment reshape: key dimensions first, then the
        # shape reported by the first mapped result
        restored_shape = key_shape + list(results[0].shape)
        return self._constructor(results.reshape(*restored_shape))
示例#36
0
    def map(self, func, axis=0):
        """
        Map a function across an axis or set of axes.

        Parameters
        ----------
        func : function
            Function applied to each entry obtained after aligning the
            array on the requested axes.

        axis : tuple or int, optional, default=0
            Axis or multiple axes to map over; sorted before use.

        Returns
        -------
        Array built by ``self._constructor``, shaped as the sizes of the
        mapped axes followed by the shape of the first mapped result.
        """
        ordered_axes = sorted(tupleize(axis))
        leading = [self.shape[dim] for dim in ordered_axes]
        entries = self._align(ordered_axes, key_shape=leading)

        outputs = asarray(list(map(func, entries)))
        trailing = list(outputs[0].shape)

        # leading key dimensions plus per-result dimensions
        reordered = outputs.reshape(*(leading + trailing))
        return self._constructor(reordered)
示例#37
0
文件: array.py 项目: andrewosh/bolt
    def _reshapebasic(self, shape):
        """
        Check whether a reshape splits into independent key and value reshapes.

        Looks for a position in the new shape such that the dimensions
        before it have the same total size as the current keys and the
        dimensions from it onward have the same total size as the
        current values.

        Returns
        -------
        int
            The first such position, or -1 if there is none.
        """
        target = tupleize(shape)
        key_size = prod(self.keys.shape)
        value_size = prod(self.values.shape)

        for split in range(len(target)):
            keys_match = prod(target[:split]) == key_size
            if keys_match and prod(target[split:]) == value_size:
                return split

        return -1
示例#38
0
    def _reshapebasic(self, shape):
        """
        Find the key/value boundary for a reshape, if one exists.

        The requested reshape can be broken into independent reshapes on
        the keys and on the values when some prefix of the new shape has
        the total size of the current keys and the matching suffix has
        the total size of the current values. Returns the index in the
        new shape separating keys from values, otherwise returns -1.
        """
        new = tupleize(shape)
        wanted = (prod(self.keys.shape), prod(self.values.shape))

        # candidate boundaries, in increasing order
        boundaries = (
            i for i in range(len(new))
            if (prod(new[:i]), prod(new[i:])) == wanted
        )
        return next(boundaries, -1)
示例#39
0
    def filter(self, func, axis=(0, )):
        """
        Filter array along an axis.

        Applies a function which should evaluate to boolean along a
        single axis. The array is first aligned on that axis, the values
        are filtered, and the survivors are re-keyed with consecutive
        indices.

        Parameters
        ----------
        func : function
            Function to apply, should return boolean

        axis : tuple or int, optional, default=(0,)
            Axis to filter along. Exactly one axis is accepted.

        Returns
        -------
        BoltSparkArray

        Raises
        ------
        NotImplementedError
            If more or fewer than one axis is given.
        """
        axis = tupleize(axis)
        if len(axis) != 1:
            raise NotImplementedError(
                "Filtering over multiple axes will not be "
                "supported until SparseBoltArray is implemented.")

        swapped = self._align(axis)
        rdd = swapped._rdd.values().filter(func)

        # count the resulting array in order to reindex (linearize) the keys
        count, zipped = zip_with_index(rdd)
        if not count:
            count = zipped.count()
        # wrap each new integer index so that keys remain tuples
        reindexed = zipped.map(lambda kv: (tupleize(kv[1]), kv[0]))

        # the array has been re-aligned, so the caller's axis number no
        # longer identifies a position in swapped.shape; with a single
        # filtered axis, everything after the first dimension remains
        remaining = list(swapped.shape[1:])
        if count != 0:
            shape = tuple([count] + remaining)
        else:
            shape = (0, )

        return self._constructor(reindexed, shape=shape,
                                 split=swapped.split).__finalize__(swapped)
示例#40
0
    def _stat(self, axis=None, func=None, name=None):
        """
        Compute a statistic over an axis.

        Exactly one of ``func`` (used in a reduce) or ``name``
        (used by a stat counter) must be provided.

        Parameters
        ----------
        axis : tuple or int, optional, default=None
            Axis to compute statistic over, if None
            will compute over all axes

        func : function, optional, default=None
            Function for reduce, see BoltSparkArray.reduce

        name : str
            A named statistic, see StatCounter

        Raises
        ------
        ValueError
            If both or neither of ``func`` and ``name`` are given.
        """
        if axis is None:
            axis = list(range(len(self.shape)))
        axis = tupleize(axis)

        if func and not name:
            return self.reduce(func, axis)

        if not (name and not func):
            raise ValueError(
                'Must specify either a function or a statistic name.')

        from bolt.local.array import BoltArrayLocal

        aligned = self._align(axis)

        def per_partition(records):
            # summarize one partition with a single counter
            return [StatCounter(values=records, stats=name)]

        def merge(left, right):
            return left.combine(right)

        counter = aligned._rdd.values().mapPartitions(per_partition).reduce(merge)
        return BoltArrayLocal(getattr(counter, name)).toscalar()
示例#41
0
    def reduce(self, func, axis=0):
        """
        Reduce an array along an axis.

        Applies an associative/commutative function of two arguments
        cumulatively to all arrays along an axis. For functions that are
        not ufuncs the array is first aligned so that the desired set of
        axes lead, which may require a transpose/reshape.

        Parameters
        ----------
        func : function
            Function of two arrays that returns a single array

        axis : tuple or int, optional, default=0
            Axis or multiple axes to reduce along.

        Returns
        -------
        BoltArrayLocal

        Raises
        ------
        ValueError
            If the reduced array does not have the shape left after
            removing the reduced axes.
        """
        axes = sorted(tupleize(axis))

        if not isinstance(func, ufunc):
            reduced = reduce(func, self._align(axes))
        else:
            # a ufunc can reduce over several axes in one call
            inshape(self.shape, axes)
            reduced = func.reduce(self, axis=tuple(axes))

        result = self._constructor(reduced)

        # the surviving dimensions are exactly the ones not reduced over
        surviving = tuple(
            size for position, size in enumerate(self.shape)
            if position not in axes)
        if result.shape != surviving:
            raise ValueError(
                "reduce did not yield a BoltArray with valid dimensions")

        return result
示例#42
0
    def __getitem__(self, index):
        """
        Get an item from the array through indexing.

        Supports basic indexing with slices and ints, or advanced
        indexing with lists, sets, or ndarrays. Mixing basic and
        advanced indexing across axes is not supported.

        Parameters
        ----------
        index : tuple of slices, ints, list, sets, or ndarrays
            One or more index specifications. Axes left unspecified
            are filled with full slices.

        Returns
        -------
        Array built by ``self._constructor``. When every axis is indexed
        by an int, the result is instead squeezed, converted with
        ``toarray`` and indexed with ``()``.

        Raises
        ------
        ValueError
            If more indices than dimensions are given, or an index has
            an unsupported type.
        NotImplementedError
            If slices are mixed with lists, sets, or ndarrays.
        """
        index = tupleize(index)

        if len(index) > self.ndim:
            raise ValueError("Too many indices for array")

        if not all(
                isinstance(i, (slice, int, list, set, ndarray)) for i in index):
            raise ValueError(
                "Each index must either be a slice, int, list, set, or ndarray"
            )

        # fill unspecified axes with full slices
        if len(index) < self.ndim:
            index += tuple(
                slice(0, None, None) for _ in range(self.ndim - len(index)))

        # convert ints to lists if not all ints and slices
        if not all(isinstance(i, (int, slice)) for i in index):
            index = tuple([i] if isinstance(i, int) else i for i in index)

        # select basic or advanced indexing
        if all(isinstance(i, (slice, int)) for i in index):
            rdd, shape, split = self._getbasic(index)
        elif all(isinstance(i, (set, list, ndarray)) for i in index):
            rdd, shape, split = self._getadvanced(index)
        else:
            raise NotImplementedError(
                "Cannot mix basic indexing (slices and ints) with "
                "advanced indexing (lists and ndarrays) across axes")

        result = self._constructor(rdd, shape=shape,
                                   split=split).__finalize__(self)

        # squeeze out int dimensions (and squeeze to singletons if all ints)
        if all(isinstance(i, int) for i in index):
            return result.squeeze().toarray()[()]

        # squeeze expects axis positions, so record where the ints sit
        # in the index rather than the int values themselves
        tosqueeze = tuple(
            pos for pos, i in enumerate(index) if isinstance(i, int))
        return result.squeeze(tosqueeze)
示例#43
0
    def filter(self, func, axis=(0,)):
        """
        Filter array along an axis.

        Applies a function which should evaluate to boolean along a
        single axis. The array is first aligned on that axis, the values
        are filtered, and the survivors are re-keyed with consecutive
        indices.

        Parameters
        ----------
        func : function
            Function to apply, should return boolean

        axis : tuple or int, optional, default=(0,)
            Axis to filter along. Exactly one axis is accepted.

        Returns
        -------
        BoltSparkArray

        Raises
        ------
        NotImplementedError
            If more or fewer than one axis is given.
        """
        axis = tupleize(axis)
        if len(axis) != 1:
            raise NotImplementedError("Filtering over multiple axes will not be "
                                      "supported until SparseBoltArray is implemented.")

        swapped = self._align(axis)
        rdd = swapped._rdd.values().filter(func)

        # count the resulting array in order to reindex (linearize) the keys
        count, zipped = zip_with_index(rdd)
        if not count:
            count = zipped.count()
        # wrap each new integer index so that keys remain tuples
        reindexed = zipped.map(lambda kv: (tupleize(kv[1]), kv[0]))

        # the array has been re-aligned, so the caller's axis number no
        # longer identifies a position in swapped.shape; with a single
        # filtered axis, everything after the first dimension remains
        remaining = list(swapped.shape[1:])
        if count != 0:
            shape = tuple([count] + remaining)
        else:
            shape = (0,)

        return self._constructor(reindexed, shape=shape, split=swapped.split).__finalize__(swapped)
示例#44
0
    def reduce(self, func, axis=(0,), keepdims=False):
        """
        Reduce an array along an axis.

        Applies a commutative/associative function of two
        arguments cumulatively to all arrays along an axis.
        Array will be aligned so that the desired set of axes
        are in the keys, which may incur a swap.

        Parameters
        ----------
        func : function
            Function of two arrays that returns a single array

        axis : tuple or int, optional, default=(0,)
            Axis or multiple axes to reduce along.

        keepdims : boolean, optional, default=False
            Re-insert each reduced axis with size 1 after the reduction.

        Returns
        -------
        BoltArrayLocal
            A bare value is returned instead when the reduction does not
            produce an ndarray, or produces one of shape ``(1,)``.
        """
        from bolt.local.array import BoltArrayLocal
        from numpy import ndarray

        axis = tupleize(axis)
        swapped = self._align(axis)
        arr = swapped._rdd.values().reduce(func)

        if keepdims:
            # re-insert axes in ascending order so that every position
            # already exists in the array when it is requested
            for i in sorted(axis):
                arr = expand_dims(arr, axis=i)

        if not isinstance(arr, ndarray):
            # the result of a reduce can also be a scalar
            return arr
        elif arr.shape == (1,):
            # ndarrays with single values in them should be converted into scalars
            return arr[0]

        return BoltArrayLocal(arr)
示例#45
0
    def reduce(self, func, axis=(0, ), keepdims=False):
        """
        Reduce an array along an axis.

        Applies a commutative/associative function of two
        arguments cumulatively to all arrays along an axis.
        Array will be aligned so that the desired set of axes
        are in the keys, which may incur a swap. The values are
        combined with ``treeReduce`` at depth 3.

        Parameters
        ----------
        func : function
            Function of two arrays that returns a single array

        axis : tuple or int, optional, default=(0,)
            Axis or multiple axes to reduce along.

        keepdims : boolean, optional, default=False
            Re-insert each reduced axis with size 1 after the reduction.

        Returns
        -------
        BoltArrayLocal
            A bare value is returned instead when the reduction does not
            produce an ndarray, or produces one of shape ``(1,)``.
        """
        from bolt.local.array import BoltArrayLocal
        from numpy import ndarray

        axis = tupleize(axis)
        swapped = self._align(axis)
        arr = swapped._rdd.values().treeReduce(func, depth=3)

        if keepdims:
            # re-insert axes in ascending order so that every position
            # already exists in the array when it is requested
            for i in sorted(axis):
                arr = expand_dims(arr, axis=i)

        if not isinstance(arr, ndarray):
            # the result of a reduce can also be a scalar
            return arr
        elif arr.shape == (1, ):
            # ndarrays with single values in them should be converted into scalars
            return arr[0]

        return BoltArrayLocal(arr)
示例#46
0
    def reduce(self, func, axis=0):
        """
        Reduce the array along one or more axes.

        A ufunc is reduced directly over the sorted axes; any other
        function is folded pairwise over the entries of the array
        aligned on those axes.

        Parameters
        ----------
        func : function
            Function of two arrays that returns a single array.

        axis : tuple or int, optional, default=0
            Axis or multiple axes to reduce along.

        Returns
        -------
        Array built by ``self._constructor`` from the reduced data.

        Raises
        ------
        ValueError
            If the result's shape is not the original shape with the
            reduced axes removed.
        """
        axes = sorted(tupleize(axis))

        if isinstance(func, ufunc):
            # ufuncs handle multi-axis reduction themselves
            inshape(self.shape, axes)
            folded = func.reduce(self, axis=tuple(axes))
        else:
            folded = reduce(func, self._align(axes))

        outcome = self._constructor(folded)

        # validate against the dimensions that were not reduced
        kept_dims = tuple(self.shape[i] for i in range(len(self.shape)) if i not in axes)
        if outcome.shape == kept_dims:
            return outcome

        raise ValueError("reduce did not yield a BoltArray with valid dimensions")
示例#47
0
    def chunk(self, size, axis=None):
        """
        Split values of distributed array into chunks.

        Transforms an underlying pair RDD of (key, value) into
        records of the form: (key, chunk id), (chunked value).
        Here, chunk id is a tuple identifying the chunk and
        chunked value is a subset of the data from each original value,
        that has been divided along the specified dimensions.

        Parameters
        ----------
        size : str or tuple or int
            Chunk specification handed to ``self.getplan``. If str, the
            average size (in MB) of the chunks in all value dimensions;
            if int or tuple, an explicit per-dimension specification.

        axis : tuple, optional, default=None
            One or more axes to estimate chunks for, if provided any
            other axes will use one chunk.

        Raises
        ------
        ValueError
            If any entry of the plan exceeds the matching value dimension.
        """
        axis = tupleize(axis)
        plan = self.getplan(size, axis)

        too_big = [wanted > available for wanted, available in zip(plan, self.vshape)]
        if any(too_big):
            raise ValueError("Chunk sizes %s cannot exceed value dimensions %s along any axis"
                             % (tuple(plan), tuple(self.vshape)))

        # number the slices of each dimension, then pair every combination
        # of them into (chunk id tuple, slice tuple)
        numbered = [list(enumerate(dim_slices))
                    for dim_slices in self.getslices(plan, self.vshape)]
        scheme = [list(zip(*combo)) for combo in list(product(*numbered))]

        def _chunk(record):
            key, value = record[0], record[1]
            for (chunk_id, selection) in scheme:
                yield (key, chunk_id), value[selection]

        chunked = self._rdd.flatMap(_chunk)
        return self._constructor(chunked, plan=plan).__finalize__(self)
示例#48
0
文件: array.py 项目: andrewosh/bolt
    def reduce(self, func, axis=0):
        """
        Reduce an array along an axis.

        Applies an associative/commutative function of two arguments
        cumulatively to all arrays along an axis. Unless the function is
        a ufunc, the array is aligned first so that the desired set of
        axes lead, which may require a transpose/reshape.

        Parameters
        ----------
        func : function
            Function of two arrays that returns a single array

        axis : tuple or int, optional, default=0
            Axis or multiple axes to reduce along.

        Returns
        -------
        BoltArrayLocal

        Raises
        ------
        ValueError
            If the reduced array's shape differs from the original shape
            with the reduced axes removed.
        """
        axes = sorted(tupleize(axis))
        is_ufunc = isinstance(func, ufunc)

        if is_ufunc:
            # if the function is a ufunc, it can reduce over multiple axes by itself
            inshape(self.shape, axes)
            reduced = func.reduce(self, axis=tuple(axes))
        else:
            reshaped = self._align(axes)
            reduced = reduce(func, reshaped)

        new_array = self._constructor(reduced)

        # ensure that the shape of the reduced array is valid
        expected_shape = tuple(
            extent for dim, extent in enumerate(self.shape) if dim not in axes)
        if new_array.shape != expected_shape:
            raise ValueError("reduce did not yield a BoltArray with valid dimensions")

        return new_array
示例#49
0
    def __getitem__(self, item):
        """
        Index the values, and the labels when present.

        Integer indices are converted to one-element slices before the
        values are indexed, and a bare list or ndarray is wrapped in a
        1-tuple. The result is built with ``self._constructor`` and
        finalized without propagating ``index`` or ``labels``; when
        ``self.labels`` is set, the labels are indexed along the leading
        ``len(self.baseaxes)`` positions and attached to the result.

        Parameters
        ----------
        item : int, slice, list, ndarray, or tuple of these
            Index specification.
        """
        def as_slice(i):
            # a one-element slice; the stop must be open for -1 because
            # slice(-1, 0) selects nothing
            return slice(i, i + 1 if i != -1 else None, None)

        # handle values
        if isinstance(item, int):
            item = as_slice(item)
        if isinstance(item, tuple):
            item = tuple([
                as_slice(i) if isinstance(i, int) else i
                for i in item
            ])
        if isinstance(item, (list, ndarray)):
            item = (item, )
        new = self._values.__getitem__(item)
        result = self._constructor(new).__finalize__(self,
                                                     noprop=('index',
                                                             'labels'))

        # handle labels
        if self.labels is not None:
            # ints are slices and lists/ndarrays are 1-tuples by now
            if isinstance(item, tuple):
                label_item = item[:len(self.baseaxes)]
            else:
                label_item = (item, )
            newlabels = self.labels
            for (i, s) in enumerate(label_item):
                if isinstance(s, slice):
                    # index with a tuple of slices: a list of slices is not
                    # accepted as a multidimensional index by current NumPy
                    newlabels = newlabels[tuple(
                        s if j == i else slice(None)
                        for j in range(len(label_item))
                    )]
                else:
                    newlabels = newlabels.take(tupleize(s), i)
            result.labels = newlabels

        return result
示例#50
0
 def check(v):
     """Return ``v`` unchanged unless it has a non-empty shape differing from ``value_shape``."""
     # values with an empty shape are never rejected
     mismatched = len(v.shape) > 0 and v.shape != tupleize(value_shape)
     if mismatched:
         raise Exception("Map operation did not produce values of uniform shape.")
     return v
示例#51
0
文件: array.py 项目: gdtm86/bolt
    def map(self, func, axis=(0,), value_shape=None, dtype=None, with_keys=False):
        """
        Apply a function across an axis.

        Array will be aligned so that the desired set of axes
        are in the keys, which may incur a swap. When ``value_shape``
        or ``dtype`` is not supplied, ``func`` is first evaluated on a
        random array shaped like the values (falling back to the first
        record if that raises) to infer the missing information.

        Parameters
        ----------
        func : function
            Function of a single array to apply. If with_keys=True,
            function should be of a (tuple, array) pair.

        axis : tuple or int, optional, default=(0,)
            Axis or multiple axes to apply function along.

        value_shape : tuple, optional, default=None
            Known shape of values resulting from operation

        dtype: numpy.dtype, optional, default=None
            Known dtype of values resulting from operation

        with_keys : bool, optional, default=False
            Include keys as an argument to the function

        Returns
        -------
        BoltArraySpark
        """
        axis = tupleize(axis)
        swapped = self._align(axis)

        def probe(values):
            # call func the way it will be called on the records,
            # using (0,) as a stand-in key when keys are requested
            if with_keys:
                return func(((0,), values))
            return func(values)

        if value_shape is None or dtype is None:
            # try to compute the size of each mapped element by applying func to a random array
            try:
                sample = random.randn(*swapped.values.shape).astype(self.dtype)
                mapped = probe(sample)
            except Exception:
                first = swapped._rdd.first()
                if first:
                    # eval func on the first element
                    mapped = probe(first[1])
            if value_shape is None:
                value_shape = mapped.shape
            if dtype is None:
                dtype = mapped.dtype

        key_dims = tuple(swapped._shape[ax] for ax in range(len(axis)))
        shape = key_dims + tupleize(value_shape)

        if with_keys:
            rdd = swapped._rdd.map(lambda kv: (kv[0], func(kv)))
        else:
            rdd = swapped._rdd.mapValues(func)

        # reshaping will fail if the elements aren't uniformly shaped
        def check(v):
            if len(v.shape) > 0 and v.shape != tupleize(value_shape):
                raise Exception("Map operation did not produce values of uniform shape.")
            return v

        rdd = rdd.mapValues(check)

        built = self._constructor(rdd, shape=shape, dtype=dtype, split=swapped.split)
        return built.__finalize__(swapped)
示例#52
0
def test_tupleize():
    """Check that tupleize maps lists, tuples, and scalars onto tuples."""
    # (input, expected) pairs, evaluated in order
    cases = [
        ([1, 2, 3], (1, 2, 3)),
        ((1, 2, 3), (1, 2, 3)),
        ((1,), (1,)),
        (1, (1,)),
    ]
    for given, expected in cases:
        assert tupleize(given) == expected
示例#53
0
def test_tupleize():
    """Lists, tuples, and bare scalars should all come back as tuples."""
    triple = (1, 2, 3)
    single = (1, )

    # sequences keep their elements and order
    assert tupleize(list(triple)) == triple
    assert tupleize(triple) == triple
    # a one-element tuple compares equal to itself after conversion
    assert tupleize(single) == single
    # a bare scalar is wrapped in a one-element tuple
    assert tupleize(1) == single
示例#54
0
 def check(v):
     """
     Return ``v`` unchanged once its shape has been validated.

     Values with an empty shape are always accepted; any other value
     must have a shape equal to ``tupleize(value_shape)``, where
     ``value_shape`` comes from the enclosing scope.
     """
     if len(v.shape) == 0 or v.shape == tupleize(value_shape):
         return v
     raise Exception(
         "Map operation did not produce values of uniform shape.")
示例#55
0
    def map(self,
            func,
            axis=(0, ),
            value_shape=None,
            dtype=None,
            with_keys=False):
        """
        Apply a function across an axis.

        Array will be aligned so that the desired set of axes
        are in the keys, which may incur a swap.

        When ``value_shape`` or ``dtype`` is not given, the missing one is
        inferred by calling ``func`` once on a random array of shape
        ``swapped.values.shape`` and, if that call raises, on the value of
        the first record of the aligned RDD.

        Parameters
        ----------
        func : function
            Function of a single array to apply. If with_keys=True,
            function should be of a (tuple, array) pair.

        axis : tuple or int, optional, default=(0,)
            Axis or multiple axes to apply function along.

        value_shape : tuple, optional, default=None
            Known shape of values resulting from operation

        dtype: numpy.dtype, optional, default=None
            Known dtype of values resulting from operation

        with_keys : bool, optional, default=False
            Include keys as an argument to the function

        Returns
        -------
        BoltArraySpark

        Raises
        ------
        ValueError
            If the shape or dtype must be inferred but the random probe
            failed and no first record is available to probe instead.
        """
        axis = tupleize(axis)
        swapped = self._align(axis)

        if with_keys:
            # Probe with an all-zero key holding one entry per key axis, so
            # that its length matches the keys of the aligned records rather
            # than always being a 1-tuple.
            probe_key = (0, ) * len(axis)
            test_func = lambda x: func((probe_key, x))
        else:
            test_func = func

        if value_shape is None or dtype is None:
            # try to compute the size of each mapped element by applying func to a random array
            try:
                mapped = test_func(
                    random.randn(*swapped.values.shape).astype(self.dtype))
            except Exception:
                # Deliberately broad: func may legitimately reject random
                # data, in which case it is evaluated on the first element.
                first = swapped._rdd.first()
                if not first:
                    # Without this guard `mapped` would be unbound below.
                    raise ValueError(
                        "Cannot infer value_shape/dtype: func failed on a "
                        "random array and there is no first record to "
                        "evaluate it on.")
                mapped = test_func(first[1])
            if value_shape is None:
                value_shape = mapped.shape
            if dtype is None:
                dtype = mapped.dtype

        # Computed once here instead of once per record inside check().
        expected_shape = tupleize(value_shape)

        # The leading len(axis) dimensions are the key axes; the mapped
        # value shape follows them.
        shape = tuple(swapped._shape[ax]
                      for ax in range(len(axis))) + expected_shape

        if with_keys:
            rdd = swapped._rdd.map(lambda kv: (kv[0], func(kv)))
        else:
            rdd = swapped._rdd.mapValues(func)

        # reshaping will fail if the elements aren't uniformly shaped
        def check(v):
            if len(v.shape) > 0 and v.shape != expected_shape:
                raise Exception(
                    "Map operation did not produce values of uniform shape.")
            return v

        rdd = rdd.mapValues(check)

        return self._constructor(rdd,
                                 shape=shape,
                                 dtype=dtype,
                                 split=swapped.split).__finalize__(swapped)