def chunk(self, size="150", axis=None):
    """
    Chunk the records of a distributed array.

    Chunking breaks arrays into subarrays, using a specified size of
    chunks along each value dimension. Alternatively, an average chunk
    byte size (in megabytes) can be given, and the size of chunks
    (as ints) will be computed automatically.

    Parameters
    ----------
    size : tuple, int, or str, optional, default = "150"
        A string giving the size in megabytes, or a tuple with the size
        of chunks along each dimension.

    axis : int or tuple, optional, default = None
        One or more axes to chunk the array along; if None, all axes
        are used.

    Returns
    -------
    ChunkedArray
    """
    # a string is forwarded untouched; anything else is normalised to a tuple
    chunk_size = size if type(size) is str else tupleize(size)
    chunk_axes = tupleize(axis)

    from bolt.spark.chunk import ChunkedArray

    chunked = ChunkedArray(
        rdd=self._rdd,
        shape=self._shape,
        split=self._split,
        dtype=self._dtype,
    )
    return chunked.chunk(chunk_size, chunk_axes)
def chunk(self, size="150", axis=None):
    """
    Chunk the records of a distributed array.

    Chunking breaks arrays into subarrays, using a specified number of
    chunks along each value dimension. Alternatively, an average chunk
    size (in megabytes) can be given, and the number of chunks will be
    computed automatically.

    Parameters
    ----------
    size : tuple, int, or str, optional, default = "150"
        A string giving the size in megabytes, or a tuple with the
        number of chunks along each dimension.

    axis : int or tuple, optional, default = None
        One or more axes to chunk the array along; if None, all axes
        are used.

    Returns
    -------
    ChunkedArray
    """
    # only non-string sizes are turned into tuples
    if type(size) is str:
        chunk_spec = size
    else:
        chunk_spec = tupleize(size)
    along = tupleize(axis)

    from bolt.spark.chunk import ChunkedArray

    wrapper = ChunkedArray(rdd=self._rdd, shape=self._shape,
                           split=self._split, dtype=self._dtype)
    return wrapper.chunk(chunk_spec, along)
def swap(self, key_axes, value_axes, size=150):
    """
    Swap axes between the keys and the values.

    Parameters
    ----------
    key_axes : tuple
        Key axes to exchange; handed to ``Dims`` with the key shape.

    value_axes : tuple
        Value axes to exchange; handed to ``Dims`` with the value shape.

    size : optional, default = 150
        Forwarded unchanged to ``Swapper`` (presumably a chunk size --
        confirm against ``bolt.spark.swap``).

    Returns
    -------
    A new array built with ``self._constructor``, or ``self`` itself
    when neither set of axes is given.

    Raises
    ------
    ValueError
        If all key axes are requested and no value axes are given.
    """
    key_axes = tupleize(key_axes)
    value_axes = tupleize(value_axes)
    n_keys, n_values = len(key_axes), len(value_axes)

    if n_values == 0:
        if n_keys == self.keys.ndim:
            raise ValueError('Cannot perform a swap that would '
                             'end up with all data on a single key')
        if n_keys == 0:
            return self

    # zero-dimensional values are promoted to 1-d for the swap
    scalar_values = self.values.ndim == 0
    if scalar_values:
        rdd = self._rdd.mapValues(lambda v: array(v, ndmin=1))
        value_shape = (1,)
    else:
        rdd = self._rdd
        value_shape = self.values.shape

    from bolt.spark.swap import Swapper, Dims

    key_dims = Dims(shape=self.keys.shape, axes=key_axes)
    value_dims = Dims(shape=value_shape, axes=value_axes)
    swapper = Swapper(key_dims, value_dims, self.dtype, size)

    rdd = swapper.extract(swapper.chunk(rdd))
    shape = swapper.getshape()
    split = self.split - n_keys + n_values

    # undo the temporary promotion of scalar values
    if scalar_values:
        rdd = rdd.mapValues(lambda v: v.squeeze())
        shape = shape[:-1]

    return self._constructor(rdd, shape=tuple(shape), split=split)
def swap(self, key_axes, value_axes, size=150):
    """
    Exchange axes between the keys and the values.

    Parameters
    ----------
    key_axes : tuple
        Key axes taking part in the swap (passed to ``Dims``).

    value_axes : tuple
        Value axes taking part in the swap (passed to ``Dims``).

    size : optional, default = 150
        Handed as-is to ``Swapper``; presumably a chunk size, verify
        against ``bolt.spark.swap``.

    Returns
    -------
    The swapped array from ``self._constructor``; ``self`` is returned
    unchanged when both axis sets are empty.

    Raises
    ------
    ValueError
        If every key axis is requested while no value axis is given.
    """
    key_axes, value_axes = tupleize(key_axes), tupleize(value_axes)
    moved_out = len(key_axes)
    moved_in = len(value_axes)

    if moved_out == self.keys.ndim and moved_in == 0:
        raise ValueError('Cannot perform a swap that would '
                         'end up with all data on a single key')
    if not (moved_out or moved_in):
        return self

    promoted = self.values.ndim == 0
    if not promoted:
        rdd, value_shape = self._rdd, self.values.shape
    else:
        # give scalar values one dimension so they can be chunked
        rdd = self._rdd.mapValues(lambda v: array(v, ndmin=1))
        value_shape = (1,)

    from bolt.spark.swap import Swapper, Dims

    s = Swapper(
        Dims(shape=self.keys.shape, axes=key_axes),
        Dims(shape=value_shape, axes=value_axes),
        self.dtype,
        size,
    )
    chunks = s.chunk(rdd)
    rdd = s.extract(chunks)
    shape = s.getshape()
    split = self.split - moved_out + moved_in

    if promoted:
        # drop the dimension added above
        rdd = rdd.mapValues(lambda v: v.squeeze())
        shape = shape[:-1]

    return self._constructor(rdd, shape=tuple(shape), split=split)
def swap(self, kaxes, vaxes, size="150"):
    """
    Swap axes from keys to values.

    This is the core operation underlying shape manipulation on the
    Spark bolt array. It exchanges an arbitrary set of axes between the
    keys and the values. If either is None, axes are only moved in one
    direction (from keys to values, or values to keys). Keys moved to
    values will be placed immediately after the split; values moved to
    keys will be placed immediately before the split.

    Parameters
    ----------
    kaxes : tuple
        Axes from keys to move to values

    vaxes : tuple
        Axes from values to move to keys

    size : tuple or int, optional, default = "150"
        Either a string giving the size in megabytes, or a tuple with
        the number of chunks along each value dimension being moved

    Returns
    -------
    BoltArraySpark

    Raises
    ------
    ValueError
        If all key axes are moved and no value axes replace them.
    """
    kaxes = asarray(tupleize(kaxes), 'int')
    vaxes = asarray(tupleize(vaxes), 'int')
    if type(size) is not str:
        size = tupleize(size)

    n_from_keys, n_from_values = len(kaxes), len(vaxes)
    if n_from_values == 0:
        if n_from_keys == self.keys.ndim:
            raise ValueError('Cannot perform a swap that would '
                             'end up with all data on a single key')
        if n_from_keys == 0:
            return self

    # scalar values get a temporary trailing dimension for chunking
    scalar_values = self.values.ndim == 0
    if scalar_values:
        rdd = self._rdd.mapValues(lambda v: array(v, ndmin=1))
        shape = self._shape + (1,)
    else:
        rdd, shape = self._rdd, self._shape

    from bolt.spark.chunk import ChunkedArray

    chunked = ChunkedArray(rdd, shape=shape, split=self._split, dtype=self._dtype)
    barray = chunked.chunk(size, axis=vaxes).move(kaxes, vaxes)

    if scalar_values:
        # remove the temporary dimension again
        barray._rdd = barray._rdd.mapValues(lambda v: v.squeeze())
        barray._shape = barray._shape[:-1]

    return barray
def swap(self, kaxes, vaxes, size="150"):
    """
    Swap axes from keys to values.

    This is the core operation underlying shape manipulation on the
    Spark bolt array. It exchanges an arbitrary set of axes between the
    keys and the values. If either is None, axes are only moved in one
    direction (from keys to values, or values to keys). Keys moved to
    values will be placed immediately after the split; values moved to
    keys will be placed immediately before the split.

    Parameters
    ----------
    kaxes : tuple
        Axes from keys to move to values

    vaxes : tuple
        Axes from values to move to keys

    size : tuple or int, optional, default = "150"
        Either a string giving the size in kilobytes, or a tuple with
        the number of chunks along each value dimension being moved

    Returns
    -------
    BoltArraySpark

    Raises
    ------
    ValueError
        If all key axes are moved and no value axes replace them.
    """
    kaxes = asarray(tupleize(kaxes), 'int')
    vaxes = asarray(tupleize(vaxes), 'int')
    if type(size) is not str:
        size = tupleize(size)

    n_from_keys = len(kaxes)
    n_from_values = len(vaxes)
    if n_from_values == 0:
        if n_from_keys == self.keys.ndim:
            raise ValueError('Cannot perform a swap that would '
                             'end up with all data on a single key')
        if n_from_keys == 0:
            return self

    from bolt.spark.chunk import ChunkedArray

    chunked = ChunkedArray(self._rdd, shape=self._shape,
                           split=self._split, dtype=self._dtype)
    pieces = chunked._chunk(size, axis=vaxes)

    # the value axes are offset by the number of key axes moved first
    shifted_vaxes = [v + len(kaxes) for v in vaxes]
    moved = pieces.keys_to_values(kaxes).values_to_keys(shifted_vaxes)
    return moved.unchunk()
def filter(self, func, axis=(0, ), sort=False):
    """
    Filter array along an axis.

    Applies a function which should evaluate to boolean, along a single
    axis or multiple axes. Array will be aligned so that the desired set
    of axes are in the keys, which may incur a swap.

    Parameters
    ----------
    func : function
        Function to apply, should return boolean

    axis : tuple or int, optional, default=(0,)
        Axis or multiple axes to filter along.

    sort: bool, optional, default=False
        Whether or not to sort by key before reindexing

    Returns
    -------
    BoltArraySpark
    """
    axis = tupleize(axis)
    swapped = self._align(axis)

    # the predicate is evaluated on the value of each (key, value) record
    kept = swapped._rdd.filter(lambda record: func(record[1]))
    values = (kept.sortByKey() if sort else kept).values()

    # count the surviving records so the keys can be relinearized
    count, zipped = zip_with_index(values)
    if not count:
        count = zipped.count()
    reindexed = zipped.map(lambda kv: (tupleize(kv[1]), kv[0]))

    # the filtered axes collapse into one leading dimension of length `count`
    if count != 0:
        shape = (count,) + tuple(swapped.shape[len(axis):])
    else:
        shape = (0, )

    filtered = self._constructor(reindexed, shape=shape, split=swapped.split)
    return filtered.__finalize__(swapped)
def map(self, func, axis=(0,), value_shape=None):
    """
    Apply a function across an axis.

    Array will be aligned so that the desired set of axes are in the
    keys, which may incur a swap.

    Parameters
    ----------
    func : function
        Function of a single array to apply

    axis : tuple or int, optional, default=(0,)
        Axis or multiple axes to apply function along.

    value_shape : tuple, optional, default=None
        Known shape of values resulting from operation. When omitted it
        is inferred by applying ``func`` to a random array and, if that
        raises, to the first record.

    Returns
    -------
    BoltArraySpark
    """
    axis = tupleize(axis)
    swapped = self._align(axis)

    if value_shape is None:
        try:
            # probe func with a random array shaped like one value
            probe = random.randn(*swapped.values.shape).astype(self.dtype)
            value_shape = func(probe).shape
        except Exception:
            # fall back to evaluating func on the first record's value
            first = swapped._rdd.first()
            if first:
                value_shape = func(first[1]).shape

    key_dims = tuple(swapped._shape[ax] for ax in range(len(axis)))
    shape = key_dims + tupleize(value_shape)

    # reshaping will fail if the elements aren't uniformly shaped
    def check(v):
        if len(v.shape) > 0 and v.shape != tupleize(value_shape):
            raise Exception("Map operation did not produce values of uniform shape.")
        return v

    rdd = swapped._rdd.mapValues(func).mapValues(check)

    mapped = self._constructor(rdd, shape=shape, split=swapped.split)
    return mapped.__finalize__(swapped)
def filter(self, func, axis=(0,), sort=False):
    """
    Filter array along an axis.

    Applies a function which should evaluate to boolean, along a single
    axis or multiple axes. Array will be aligned so that the desired set
    of axes are in the keys, which may incur a swap.

    Parameters
    ----------
    func : function
        Function to apply, should return boolean

    axis : tuple or int, optional, default=(0,)
        Axis or multiple axes to filter along.

    sort: bool, optional, default=False
        Whether or not to sort by key before reindexing

    Returns
    -------
    BoltArraySpark
    """
    axis = tupleize(axis)
    aligned = self._align(axis)

    def keep(record):
        # records are (key, value) pairs; only the value is tested
        return func(record[1])

    surviving = aligned._rdd.filter(keep)
    if sort:
        surviving = surviving.sortByKey()
    values = surviving.values()

    # reindex (linearize) the keys, counting records if not already known
    count, indexed = zip_with_index(values)
    if not count:
        count = indexed.count()
    rekeyed = indexed.map(lambda kv: (tupleize(kv[1]), kv[0]))

    trailing = list(aligned.shape[len(axis):])
    shape = tuple([count] + trailing) if count != 0 else (0,)

    return self._constructor(rekeyed, shape=shape, split=aligned.split).__finalize__(aligned)
def swap(self, kaxes, vaxes, size="150"):
    """
    Swap axes from keys to values.

    This is the core operation underlying shape manipulation on the
    Spark bolt array. It exchanges an arbitrary set of axes between the
    keys and the values. If either is None, axes are only moved in one
    direction (from keys to values, or values to keys). Keys moved to
    values will be placed immediately after the split; values moved to
    keys will be placed immediately before the split.

    Parameters
    ----------
    kaxes : tuple
        Axes from keys to move to values

    vaxes : tuple
        Axes from values to move to keys

    size : tuple or int, optional, default = "150"
        Either a string giving the size in kilobytes, or a tuple with
        the number of chunks along each value dimension being moved

    Returns
    -------
    BoltArraySpark

    Raises
    ------
    ValueError
        If all key axes are moved and no value axes replace them.
    """
    kaxes = asarray(tupleize(kaxes), 'int')
    vaxes = asarray(tupleize(vaxes), 'int')
    if type(size) is not str:
        size = tupleize(size)

    n_from_keys = len(kaxes)
    n_from_values = len(vaxes)
    if n_from_values == 0:
        if n_from_keys == self.keys.ndim:
            raise ValueError('Cannot perform a swap that would '
                             'end up with all data on a single key')
        if n_from_keys == 0:
            return self

    # NOTE(review): this import is not referenced below; kept as in the original
    from bolt.spark.chunk import ChunkedArray

    pieces = self.chunk(size)

    # value axes are shifted by the number of key axes moved ahead of them
    shifted_vaxes = [v+len(kaxes) for v in vaxes]
    moved = pieces.keys_to_values(kaxes).values_to_keys(shifted_vaxes)
    return moved.unchunk()
def __getitem__(self, index):
    """
    Get an item from the array through indexing.

    Supports basic indexing with slices and ints, or advanced indexing
    with lists, sets or ndarrays. Mixing basic and advanced indexing
    across axes is not supported.

    Parameters
    ----------
    index : tuple of slices, ints, lists, sets, or ndarrays
        One or more index specifications

    Returns
    -------
    The indexed array with integer-indexed dimensions squeezed out, or
    the value of ``toarray()[()]`` when every axis is indexed by an int.

    Raises
    ------
    ValueError
        If there are more indices than dimensions, or an index has an
        unsupported type.
    NotImplementedError
        If basic and advanced indexing are mixed across axes.
    """
    index = tupleize(index)

    if len(index) > self.ndim:
        raise ValueError("Too many indices for array")

    if not all(isinstance(i, (slice, int, list, set, ndarray)) for i in index):
        raise ValueError("Each index must either be a slice, int, list, set, or ndarray")

    # fill unspecified axes with full slices
    if len(index) < self.ndim:
        index += tuple(slice(0, None, None) for _ in range(self.ndim - len(index)))

    # convert ints to lists if not all ints and slices
    if not all(isinstance(i, (int, slice)) for i in index):
        index = tuple([i] if isinstance(i, int) else i for i in index)

    # select basic or advanced indexing
    if all(isinstance(i, (slice, int)) for i in index):
        rdd, shape, split = self._getbasic(index)
    elif all(isinstance(i, (set, list, ndarray)) for i in index):
        rdd, shape, split = self._getadvanced(index)
    else:
        raise NotImplementedError("Cannot mix basic indexing (slices and ints) with "
                                  "advanced indexing (lists and ndarrays) across axes")

    result = self._constructor(rdd, shape=shape, split=split).__finalize__(self)

    # squeeze out int dimensions (and squeeze to singletons if all ints)
    if all(isinstance(i, int) for i in index):
        return result.squeeze().toarray()[()]

    # squeeze expects axis positions, so record where the ints sit in the
    # index rather than the integer values themselves
    tosqueeze = tuple(pos for pos, i in enumerate(index) if isinstance(i, int))
    return result.squeeze(tosqueeze)
def reduce(self, func, axis=(0,)):
    """
    Reduce an array along an axis.

    Applies a function of two arguments cumulatively to all arrays
    along an axis.

    Parameters
    ----------
    func : function
        Function of two arrays that returns a single array

    axis : tuple or int, optional, default=(0,)
        Axis or multiple axes to reduce along.

    Returns
    -------
    A ``BoltArrayLocal`` wrapping the reduced array, or a scalar when
    the reduction yields a non-array or an array of shape ``(1,)``.
    """
    from bolt.local.array import BoltArrayLocal
    from numpy import ndarray

    aligned = self._align(tupleize(axis))
    reduced = aligned._rdd.values().reduce(func)

    if isinstance(reduced, ndarray):
        if reduced.shape == (1,):
            # ndarrays holding a single value are unwrapped into scalars
            return reduced[0]
        return BoltArrayLocal(reduced)

    # the result of a reduce can also be a scalar
    return reduced
def reduce(self, func, axis=(0, )):
    """
    Reduce an array along an axis.

    Applies a function of two arguments cumulatively to all arrays
    along an axis.

    Parameters
    ----------
    func : function
        Function of two arrays that returns a single array

    axis : tuple or int, optional, default=(0,)
        Axis or multiple axes to reduce along.

    Returns
    -------
    A scalar when the reduction produces a non-array or a ``(1,)``
    array; otherwise a ``BoltArrayLocal`` wrapping the result.
    """
    from bolt.local.array import BoltArrayLocal
    from numpy import ndarray

    axis = tupleize(axis)
    values = self._align(axis)._rdd.values()
    outcome = values.reduce(func)

    # scalars pass straight through
    if not isinstance(outcome, ndarray):
        return outcome

    # a single-element array is converted into its scalar
    is_singleton = outcome.shape == (1, )
    return outcome[0] if is_singleton else BoltArrayLocal(outcome)
def sample(self, n=100, seed=None):
    """
    Extract random sample of records.

    Parameters
    ----------
    n : int, optional, default = 100
        The number of data points to sample.

    seed : int, optional, default = None
        Random seed. Used by both the spark and the local code path, so
        the same seed yields the same sample.

    Returns
    -------
    A new object built with ``self._constructor`` from the sampled
    records, carrying over ``self.index``.

    Raises
    ------
    ValueError
        If ``n`` is smaller than 1.
    """
    if n < 1:
        raise ValueError("Number of samples must be larger than 0, got '%g'" % n)

    if seed is None:
        seed = random.randint(0, 2 ** 32)

    if self.mode == 'spark':
        result = asarray(self.values.tordd().values().takeSample(False, n, seed))
    else:
        basedims = [self.shape[d] for d in self.baseaxes]
        # draw from a generator seeded with `seed`; previously the seed was
        # ignored here and local samples could not be reproduced
        draws = random.RandomState(seed).rand(n) * prod(basedims)
        inds = [unravel_index(int(k), basedims) for k in draws]
        result = asarray([self.values[tupleize(i) + (slice(None, None),)] for i in inds])

    return self._constructor(result, index=self.index)
def __getitem__(self, item): # handle values -- convert ints to slices so no dimensions are dropped if isinstance(item, int): item = tuple([slicify(item, self.shape[0])]) if isinstance(item, tuple): item = tuple([slicify(i, n) if isinstance(i, int) else i for i, n in zip(item, self.shape[:len(item)])]) if isinstance(item, (list, ndarray)): item = (item,) new = self._values.__getitem__(item) result = self._constructor(new).__finalize__(self, noprop=('index', 'labels')) # handle labels if self.labels is not None: if isinstance(item, int): label_item = ([item],) elif isinstance(item, (list, ndarray, slice)): label_item = (item, ) elif isinstance(item, tuple): label_item = item[:len(self.baseaxes)] newlabels = self.labels for (i, s) in enumerate(label_item): if isinstance(s, slice): newlabels = newlabels[[s if j==i else slice(None) for j in range(len(label_item))]] else: newlabels = newlabels.take(tupleize(s), i) result.labels = newlabels return result
def map(self, func, axis=(0, )):
    """
    Apply a function across an axis.

    Array will be aligned so that the desired set of axes are in the
    keys, which may require a transpose/reshape.

    Parameters
    ----------
    func : function
        Function of a single array to apply

    axis : tuple or int, optional, default=(0,)
        Axis or multiple axes to apply function along.

    Returns
    -------
    BoltArrayLocal
    """
    axes = sorted(tupleize(axis))
    key_shape = [self.shape[ax] for ax in axes]
    aligned = self._align(axes, key_shape=key_shape)

    results = asarray([func(item) for item in aligned])

    # undo the alignment: key dimensions first, then the shape of one result
    target_shape = key_shape + list(results[0].shape)
    return self._constructor(results.reshape(*target_shape))
def filter(self, func, axis=(0, )):
    """
    Filter array along an axis.

    Applies a function which should evaluate to boolean, along a single
    axis or multiple axes. Array will be aligned so that the desired set
    of axes are in the keys, which may require a transpose/reshape.

    Parameters
    ----------
    func : function
        Function to apply, should return boolean

    axis : tuple or int, optional, default=(0,)
        Axis or multiple axes to filter along.

    Returns
    -------
    BoltArrayLocal
    """
    axes = sorted(tupleize(axis))
    aligned = self._align(axes)

    # keep only the entries for which func is truthy
    kept = [item for item in aligned if func(item)]
    return self._constructor(asarray(kept))
def sample(self, n=100, seed=None):
    """
    Extract random sample of records.

    Parameters
    ----------
    n : int, optional, default = 100
        The number of data points to sample.

    seed : int, optional, default = None
        Random seed. Honoured in both spark and local mode, so repeated
        calls with the same seed return the same sample.

    Returns
    -------
    A new object built with ``self._constructor`` from the sampled
    records, carrying over ``self.index``.

    Raises
    ------
    ValueError
        If ``n`` is smaller than 1.
    """
    if n < 1:
        raise ValueError(
            "Number of samples must be larger than 0, got '%g'" % n)

    if seed is None:
        seed = random.randint(0, 2**32)

    if self.mode == 'spark':
        result = asarray(self.values.tordd().values().takeSample(
            False, n, seed))
    else:
        basedims = [self.shape[d] for d in self.baseaxes]
        # use a generator seeded with `seed`; the seed used to be ignored on
        # this path, which made local sampling irreproducible
        rng = random.RandomState(seed)
        inds = [
            unravel_index(int(k), basedims)
            for k in rng.rand(n) * prod(basedims)
        ]
        result = asarray([
            self.values[tupleize(i) + (slice(None, None), )] for i in inds
        ])

    return self._constructor(result, index=self.index)
def __getitem__(self, item):
    """
    Index the values and, when present, the matching labels.

    Integer indices are converted to length-one slices so that no
    dimensions are dropped from the values.

    Parameters
    ----------
    item : int, slice, list, ndarray, or tuple of these
        Index specification.

    Returns
    -------
    A new object from ``self._constructor`` holding the indexed values,
    finalized from ``self`` without propagating ``index`` or ``labels``;
    ``labels`` is then set to the correspondingly indexed labels if
    ``self.labels`` is not None.
    """
    def as_slice(i):
        # i + 1 is 0 for i == -1, which would give the empty slice(-1, 0);
        # an open end selects the last element instead
        return slice(i, i + 1 if i != -1 else None, None)

    # handle values
    if isinstance(item, int):
        item = as_slice(item)
    if isinstance(item, tuple):
        item = tuple([as_slice(i) if isinstance(i, int) else i for i in item])
    if isinstance(item, (list, ndarray)):
        item = (item,)
    new = self._values.__getitem__(item)
    result = self._constructor(new).__finalize__(self, noprop=('index', 'labels'))

    # handle labels
    if self.labels is not None:
        # item can no longer be an int here: ints were converted above
        if isinstance(item, (list, ndarray, slice)):
            label_item = (item,)
        elif isinstance(item, tuple):
            # only the leading (base) axes carry labels
            label_item = item[:len(self.baseaxes)]
        newlabels = self.labels
        for (i, s) in enumerate(label_item):
            if isinstance(s, slice):
                # a multi-dimensional index must be a tuple; NumPy treats a
                # list of slices as an (invalid) array index
                selector = tuple(s if j == i else slice(None)
                                 for j in range(len(label_item)))
                newlabels = newlabels[selector]
            else:
                newlabels = newlabels.take(tupleize(s), i)
        result.labels = newlabels

    return result
def reduce(self, func, axis=0):
    """
    Reduce the array along one or more axes.

    Parameters
    ----------
    func : function or ufunc
        Function of two arrays that returns a single array. A ufunc is
        reduced directly through its own ``reduce`` method.

    axis : tuple or int, optional, default=0
        Axis or multiple axes to reduce along.

    Returns
    -------
    The reduced array, built with ``self._constructor``.

    Raises
    ------
    ValueError
        If the reduced array does not have the shape of the original
        with the reduced axes removed.
    """
    axes = sorted(tupleize(axis))

    if isinstance(func, ufunc):
        # a ufunc can reduce over several axes by itself
        inshape(self.shape, axes)
        reduced = func.reduce(self, axis=tuple(axes))
    else:
        reduced = reduce(func, self._align(axes))

    new_array = self._constructor(reduced)

    # the result must keep exactly the axes that were not reduced
    expected_shape = tuple(
        dim for position, dim in enumerate(self.shape) if position not in axes)
    if new_array.shape != expected_shape:
        raise ValueError(
            "reduce did not yield a BoltArray with valid dimensions")

    return new_array
def map(self, func, axis=(0,)):
    """
    Apply a function across an axis.

    Array will be aligned so that the desired set of axes are in the
    keys, which may require a transpose/reshape.

    Parameters
    ----------
    func : function
        Function of a single array to apply

    axis : tuple or int, optional, default=(0,)
        Axis or multiple axes to apply function along.

    Returns
    -------
    BoltArrayLocal
    """
    axes = sorted(tupleize(axis))
    key_shape = [self.shape[dim] for dim in axes]
    reshaped = self._align(axes, key_shape=key_shape)

    outputs = [func(entry) for entry in reshaped]
    mapped = asarray(outputs)

    # invert the alignment using the shape of a single mapped element
    elem_shape = list(mapped[0].shape)
    restored = mapped.reshape(*(key_shape + elem_shape))
    return self._constructor(restored)
def filter(self, func, axis=(0,)):
    """
    Filter array along an axis.

    Applies a function which should evaluate to boolean, along a single
    axis or multiple axes. Array will be aligned so that the desired set
    of axes are in the keys, which may require a transpose/reshape.

    Parameters
    ----------
    func : function
        Function to apply, should return boolean

    axis : tuple or int, optional, default=(0,)
        Axis or multiple axes to filter along.

    Returns
    -------
    BoltArrayLocal
    """
    reshaped = self._align(sorted(tupleize(axis)))
    passing = [entry for entry in reshaped if func(entry)]
    filtered = asarray(passing)
    return self._constructor(filtered)
def map(self, func, value_shape=None, dtype=None, with_keys=False):
    """
    Apply an array -> array function across the base axes.

    The function is always applied along ``self.baseaxes``. In local
    mode the array is aligned so that those axes act as keys, which may
    require a transpose/reshape.

    Parameters
    ----------
    func : function
        Function of a single array to apply. If with_keys=True, function
        should be of a (tuple, array) pair.

    value_shape : tuple, optional, default=None
        Known shape of values resulting from operation. Only valid in
        spark mode.

    dtype: numpy.dtype, optional, default=None
        Known dtype of values resulting from operation. Only valid in
        spark mode.

    with_keys : bool, optional, default=False
        Include keys as an argument to the function

    Returns
    -------
    A new object built with ``self._constructor`` in the same mode,
    finalized from ``self`` without propagating ``index``.
    """
    axis = self.baseaxes

    if self.mode == 'local':
        axes = sorted(tupleize(axis))
        key_shape = [self.shape[ax] for ax in axes]
        reshaped = self._align(axes, key_shape=key_shape)

        if with_keys:
            keys = zip(*unravel_index(range(prod(key_shape)), key_shape))
            mapped = asarray(list(map(func, zip(keys, reshaped))))
        else:
            mapped = asarray(list(map(func, reshaped)))

        try:
            elem_shape = mapped[0].shape
        except Exception:
            # no usable first element: treat each record as a single value
            elem_shape = (1,)

        # scalar results still get one value dimension
        expand = list(elem_shape)
        expand = [1] if len(expand) == 0 else expand

        # invert the previous reshape operation, using the shape of the map result
        linearized_shape_inv = key_shape + expand
        reordered = mapped.reshape(*linearized_shape_inv)

        # noprop must be a tuple of names; ('index') is just the string 'index'
        return self._constructor(reordered, mode=self.mode).__finalize__(self, noprop=('index',))

    if self.mode == 'spark':
        # guarantee that every mapped value is at least one-dimensional
        expand = lambda x: array(func(x), ndmin=1)
        mapped = self.values.map(expand, axis, value_shape, dtype, with_keys)
        return self._constructor(mapped, mode=self.mode).__finalize__(self, noprop=('index',))
def filter(self, func, axis=0):
    """
    Filter the array along one or more axes.

    Parameters
    ----------
    func : function
        Predicate applied to each entry of the aligned array; entries
        for which it is truthy are kept.

    axis : tuple or int, optional, default=0
        Axis or multiple axes to filter along.

    Returns
    -------
    The kept entries, built with ``self._constructor``.
    """
    axes = sorted(tupleize(axis))
    aligned = self._align(axes)
    kept = [entry for entry in aligned if func(entry)]
    return self._constructor(asarray(kept))
def __getitem__(self, index):
    """
    Get an item from the array through indexing.

    Supports basic indexing with slices and ints, or advanced indexing
    with lists or ndarrays of integers. Mixing basic and advanced
    indexing across axes is not currently supported.

    Parameters
    ----------
    index : tuple of slices, ints, list, sets, or ndarrays
        One or more index specifications

    Returns
    -------
    BoltSparkArray
        With integer-indexed dimensions squeezed out. When every axis is
        indexed by an int, the value of ``toarray()[()]`` is returned
        instead.

    Raises
    ------
    ValueError
        If there are more indices than dimensions, or an index has an
        unsupported type.
    NotImplementedError
        If basic and advanced indexing are mixed across axes.
    """
    index = tupleize(index)

    if len(index) > self.ndim:
        raise ValueError("Too many indices for array")

    if not all(
            isinstance(i, (slice, int, list, set, ndarray)) for i in index):
        raise ValueError(
            "Each index must either be a slice, int, list, set, or ndarray"
        )

    # fill unspecified axes with full slices
    if len(index) < self.ndim:
        index += tuple(
            slice(0, None, None) for _ in range(self.ndim - len(index)))

    # convert ints to lists if not all ints and slices
    if not all(isinstance(i, (int, slice)) for i in index):
        index = tuple([i] if isinstance(i, int) else i for i in index)

    # select basic or advanced indexing
    if all(isinstance(i, (slice, int)) for i in index):
        rdd, shape, split = self._getbasic(index)
    elif all(isinstance(i, (set, list, ndarray)) for i in index):
        rdd, shape, split = self._getadvanced(index)
    else:
        raise NotImplementedError(
            "Cannot mix basic indexing (slices and ints) with "
            "advanced indexing (lists and ndarrays) across axes")

    result = self._constructor(rdd, shape=shape,
                               split=split).__finalize__(self)

    # squeeze out int dimensions (and squeeze to singletons if all ints)
    if all(isinstance(i, int) for i in index):
        return result.squeeze().toarray()[()]

    # squeeze expects axis positions, so record where the ints sit in the
    # index rather than the integer values themselves
    tosqueeze = tuple(
        pos for pos, i in enumerate(index) if isinstance(i, int))
    return result.squeeze(tosqueeze)
def _map(self, func, axis=(0,), value_shape=None, dtype=None, with_keys=False):
    """
    Apply an array -> array function across an axis.

    Array will be aligned so that the desired set of axes are in the
    keys, which may require a transpose/reshape.

    Parameters
    ----------
    func : function
        Function of a single array to apply. If with_keys=True, function
        should be of a (tuple, array) pair.

    axis : tuple or int, optional, default=(0,)
        Axis or multiple axes to apply function along.

    value_shape : tuple, optional, default=None
        Known shape of values resulting from operation. Only valid in
        spark mode.

    dtype: numpy.dtype, optional, default=None
        Known dtype of values resulting from operation. Only valid in
        spark mode.

    with_keys : bool, optional, default=False
        Include keys as an argument to the function

    Returns
    -------
    A new object built with ``self._constructor`` in the same mode,
    finalized from ``self`` without propagating ``index``.
    """
    if self.mode == 'local':
        axes = sorted(tupleize(axis))
        key_shape = [self.shape[ax] for ax in axes]
        reshaped = self._align(axes, key_shape=key_shape)

        if with_keys:
            keys = zip(*unravel_index(range(prod(key_shape)), key_shape))
            mapped = asarray(list(map(func, zip(keys, reshaped))))
        else:
            mapped = asarray(list(map(func, reshaped)))

        try:
            elem_shape = mapped[0].shape
        except Exception:
            # no usable first element: treat each record as a single value
            elem_shape = (1,)

        # scalar results still get one value dimension
        expand = list(elem_shape)
        expand = [1] if len(expand) == 0 else expand

        # invert the previous reshape operation, using the shape of the map result
        linearized_shape_inv = key_shape + expand
        reordered = mapped.reshape(*linearized_shape_inv)

        # noprop must be a tuple of names; ('index') is just the string 'index'
        return self._constructor(reordered, mode=self.mode).__finalize__(self, noprop=('index',))

    if self.mode == 'spark':
        # guarantee that every mapped value is at least one-dimensional
        expand = lambda x: array(func(x), ndmin=1)
        mapped = self.values.map(expand, axis, value_shape, dtype, with_keys)
        return self._constructor(mapped, mode=self.mode).__finalize__(self, noprop=('index',))
def filter(self, func, axis=(0,)):
    """
    Filter array along an axis.

    Applies a function which should evaluate to boolean along a single
    axis. Array will be aligned so that the desired axis is in the keys,
    which may incur a swap.

    Parameters
    ----------
    func : function
        Function to apply, should return boolean

    axis : tuple or int, optional, default=(0,)
        Axis to filter along; exactly one axis must be given.

    Returns
    -------
    BoltArraySpark

    Raises
    ------
    NotImplementedError
        If more or fewer than one axis is given.
    """
    axis = tupleize(axis)
    if len(axis) != 1:
        raise NotImplementedError("Filtering over multiple axes will not be "
                                  "supported until SparseBoltArray is implemented.")

    aligned = self._align(axis)
    kept = aligned._rdd.values().filter(func)

    # count the surviving records in order to reindex (linearize) the keys
    count, indexed = zip_with_index(kept)
    if not count:
        count = indexed.count()
    rekeyed = indexed.map(lambda kv: (tupleize(kv[1]), kv[0]))

    # with a single filtered axis, everything after it keeps its shape
    if count != 0:
        shape = (count,) + tuple(aligned.shape[1:])
    else:
        shape = (0,)

    filtered = self._constructor(rekeyed, shape=shape, split=aligned.split)
    return filtered.__finalize__(aligned)
def chunk(self, size="150", axis=None, padding=None):
    """
    Chunk the records of a distributed array.

    Chunking breaks arrays into subarrays, using a specified size of
    chunks along each value dimension. Alternatively, an average chunk
    byte size (in kilobytes) can be given, and the size of chunks
    (as ints) will be computed automatically.

    Parameters
    ----------
    size : tuple, int, or str, optional, default = "150"
        A string giving the size in kilobytes, or a tuple with the size
        of chunks along each dimension.

    axis : int or tuple, optional, default = None
        One or more axes to chunk the array along; if None, all axes
        are used.

    padding: tuple or int, default = None
        Number of elements per dimension that will overlap with the
        adjacent chunk. If a tuple, specifies padding along each chunked
        dimension; if an int, the same padding is applied to all chunked
        dimensions.

    Returns
    -------
    ChunkedArray
    """
    # a string size is forwarded untouched; anything else becomes a tuple
    chunk_size = size if type(size) is str else tupleize(size)
    chunk_axes = tupleize(axis)
    overlap = tupleize(padding)

    from bolt.spark.chunk import ChunkedArray

    chunked = ChunkedArray(
        rdd=self._rdd,
        shape=self._shape,
        split=self._split,
        dtype=self._dtype,
    )
    return chunked._chunk(chunk_size, chunk_axes, overlap)
def _stat(self, axis=None, func=None, name=None, keepdims=False):
    """
    Compute a statistic over an axis.

    Can provide either a function (for use in a reduce) or a name (for
    use by a stat counter).

    Parameters
    ----------
    axis : tuple or int, optional, default=None
        Axis to compute statistic over, if None will compute over all
        axes

    func : function, optional, default=None
        Function for reduce, see BoltArraySpark.reduce

    name : str
        A named statistic, see StatCounter

    keepdims : boolean, optional, default=False
        Keep axis remaining after operation with size 1.

    Raises
    ------
    ValueError
        If both or neither of ``func`` and ``name`` are given.
    """
    if axis is None:
        axis = list(range(len(self.shape)))
    axis = tupleize(axis)

    if func and not name:
        return self.reduce(func, axis, keepdims)

    if name and not func:
        from bolt.local.array import BoltArrayLocal

        swapped = self._align(axis)

        def reducer(left, right):
            return left.combine(right)

        # one counter per partition, merged with a tree reduction
        counter = swapped._rdd.values()\
            .mapPartitions(lambda i: [StatCounter(values=i, stats=name)])\
            .treeReduce(reducer, depth=3)

        arr = getattr(counter, name)

        if keepdims:
            # re-insert the reduced axes in ascending order so each position
            # is valid for (and lands correctly in) the growing array
            for i in sorted(axis):
                arr = expand_dims(arr, axis=i)

        return BoltArrayLocal(arr).toscalar()

    else:
        raise ValueError(
            'Must specify either a function or a statistic name.')
def __getitem__(self, index):
    """
    Get an item from the array through indexing.

    Supports basic indexing with slices and ints, or advanced indexing
    with lists or ndarrays of integers. Mixing basic and advanced
    indexing across axes is not currently supported.

    Parameters
    ----------
    index : tuple of slices, ints, list, sets, or ndarrays
        One or more index specifications

    Returns
    -------
    BoltSparkArray
        With integer-indexed dimensions squeezed out. When every axis is
        indexed by an int, the value of ``toarray()[()]`` is returned
        instead.

    Raises
    ------
    ValueError
        If there are more indices than dimensions, or an index has an
        unsupported type.
    NotImplementedError
        If basic and advanced indexing are mixed across axes.
    """
    index = tupleize(index)
    ndim = self.ndim

    if len(index) > ndim:
        raise ValueError("Too many indices for array")

    if not all(isinstance(i, (slice, int, list, set, ndarray)) for i in index):
        raise ValueError("Each index must either be a slice, int, list, set, or ndarray")

    # fill unspecified axes with full slices
    missing = ndim - len(index)
    if missing > 0:
        index += tuple(slice(0, None, None) for _ in range(missing))

    # convert ints to lists if not all ints and slices
    if not all(isinstance(i, (int, slice)) for i in index):
        index = tuple([i] if isinstance(i, int) else i for i in index)

    # select basic or advanced indexing
    if all(isinstance(i, (slice, int)) for i in index):
        rdd, shape, split = self._getbasic(index)
    elif all(isinstance(i, (set, list, ndarray)) for i in index):
        rdd, shape, split = self._getadvanced(index)
    else:
        raise NotImplementedError("Cannot mix basic indexing (slices and ints) with "
                                  "advanced indexing (lists and ndarrays) across axes")

    result = self._constructor(rdd, shape=shape, split=split).__finalize__(self)

    # positions of the axes that were indexed with a plain int
    int_axes = tuple(pos for pos, i in enumerate(index) if isinstance(i, int))

    # squeeze to a singleton if all ints
    if len(int_axes) == len(index):
        return result.squeeze().toarray()[()]

    # squeeze expects axis positions, not the integer index values
    return result.squeeze(int_axes)
def _stat(self, axis=None, func=None, name=None, keepdims=False):
    """
    Compute a statistic over an axis.

    Can provide either a function (for use in a reduce) or a name (for
    use by a stat counter).

    Parameters
    ----------
    axis : tuple or int, optional, default=None
        Axis to compute statistic over, if None will compute over all
        axes

    func : function, optional, default=None
        Function for reduce, see BoltArraySpark.reduce

    name : str
        A named statistic, see StatCounter

    keepdims : boolean, optional, default=False
        Keep axis remaining after operation with size 1.

    Raises
    ------
    ValueError
        If both or neither of ``func`` and ``name`` are given.
    """
    if axis is None:
        axis = list(range(len(self.shape)))
    axis = tupleize(axis)

    if func and not name:
        return self.reduce(func, axis, keepdims)

    if name and not func:
        from bolt.local.array import BoltArrayLocal

        swapped = self._align(axis)

        def reducer(left, right):
            return left.combine(right)

        # one counter per partition, merged pairwise
        counter = swapped._rdd.values()\
            .mapPartitions(lambda i: [StatCounter(values=i, stats=name)])\
            .reduce(reducer)

        arr = getattr(counter, name)

        if keepdims:
            # re-insert the reduced axes in ascending order so each position
            # is valid for (and lands correctly in) the growing array
            for i in sorted(axis):
                arr = expand_dims(arr, axis=i)

        return BoltArrayLocal(arr).toscalar()

    else:
        raise ValueError('Must specify either a function or a statistic name.')
def map(self, func, axis=0):
    """
    Apply a function to each element obtained by iterating over an axis.

    The array is first aligned on the requested axes, ``func`` is applied
    to every item produced by iterating over the aligned array, and the
    stacked results are reshaped so that the iterated axes lead the
    output shape, followed by the shape of a single mapped result.

    Parameters
    ----------
    func : function
        Function of a single array to apply

    axis : tuple or int, optional, default=0
        Axis or multiple axes to apply the function along

    Returns
    -------
    A new array built with ``self._constructor``
    """
    key_axes = sorted(tupleize(axis))
    key_shape = [self.shape[ax] for ax in key_axes]

    aligned = self._align(key_axes, key_shape=key_shape)
    results = asarray([func(item) for item in aligned])

    # undo the alignment: iterated axes first, then one result's own shape
    value_shape = list(results[0].shape)
    restored = results.reshape(*(key_shape + value_shape))

    return self._constructor(restored)
def _reshapebasic(self, shape):
    """
    Look for a split point that lets a reshape act on keys and values separately.

    A position ``i`` in the requested shape qualifies when the product of
    the dimensions before it equals the current number of key elements
    and the product of the dimensions from it onward equals the current
    number of value elements.

    Returns
    -------
    int
        The first qualifying position, or -1 if there is none.
    """
    target = tupleize(shape)
    key_size = prod(self.keys.shape)
    value_size = prod(self.values.shape)

    candidates = (
        i for i in range(len(target))
        if prod(target[:i]) == key_size and prod(target[i:]) == value_size
    )
    return next(candidates, -1)
def filter(self, func, axis=(0, )):
    """
    Filter array along an axis.

    Applies a function which should evaluate to boolean,
    along a single axis or multiple axes.

    Parameters
    ----------
    func : function
        Function to apply, should return boolean

    axis : tuple or int, optional, default=(0,)
        Axis or multiple axes to filter along.

    Returns
    -------
    BoltSparkArray

    Raises
    ------
    NotImplementedError
        If more than one axis is given.
    """
    axis = tupleize(axis)
    if len(axis) != 1:
        raise NotImplementedError(
            "Filtering over multiple axes will not be "
            "supported until SparseBoltArray is implemented.")

    swapped = self._align(axis)
    rdd = swapped._rdd.values().filter(func)

    # count the resulting array in order to reindex (linearize) the keys
    count, zipped = zip_with_index(rdd)
    if not count:
        count = zipped.count()
    reindexed = zipped.map(lambda kv: (kv[1], kv[0]))

    # after alignment the filtered axis leads the swapped shape, so the
    # untouched dimensions are everything after it; testing positions
    # against the original axis numbers is only correct for axis 0
    remaining = list(swapped.shape[len(axis):])
    if count != 0:
        shape = tuple([count] + remaining)
    else:
        shape = (0, )

    return self._constructor(reindexed, shape=shape,
                             split=swapped.split).__finalize__(swapped)
def _stat(self, axis=None, func=None, name=None):
    """
    Compute a statistic over one or more axes.

    Exactly one of ``func`` (used in a reduce) or ``name`` (looked up on a
    merged stat counter) must be supplied.

    Parameters
    ----------
    axis : tuple or int, optional, default=None
        Axis to compute statistic over, if None
        will compute over all axes

    func : function, optional, default=None
        Function for reduce, see BoltSparkArray.reduce

    name : str
        A named statistic, see StatCounter

    Raises
    ------
    ValueError
        If neither or both of ``func`` and ``name`` are given.
    """
    if axis is None:
        axis = list(range(len(self.shape)))
    axis = tupleize(axis)

    # function given on its own: delegate to reduce
    if func and not name:
        return self.reduce(func, axis)

    # anything other than "name on its own" is a usage error
    if not name or func:
        raise ValueError(
            'Must specify either a function or a statistic name.')

    from bolt.local.array import BoltArrayLocal

    aligned = self._align(axis)

    # one StatCounter per partition, then merge them pairwise
    partials = aligned._rdd.values().mapPartitions(
        lambda part: [StatCounter(values=part, stats=name)])
    merged = partials.reduce(lambda left, right: left.combine(right))

    return BoltArrayLocal(getattr(merged, name)).toscalar()
def reduce(self, func, axis=0):
    """
    Reduce an array along an axis.

    Cumulatively applies an associative/commutative function of two
    arguments to all arrays along the given axes. Non-ufunc functions
    are folded over the array after it has been aligned on those axes;
    ufuncs are reduced directly through their own ``reduce`` method.

    Parameters
    ----------
    func : function
        Function of two arrays that returns a single array

    axis : tuple or int, optional, default=0
        Axis or multiple axes to reduce along.

    Returns
    -------
    BoltArrayLocal

    Raises
    ------
    ValueError
        If the result does not have the shape of the non-reduced axes.
    """
    axes = sorted(tupleize(axis))

    if isinstance(func, ufunc):
        # a ufunc can reduce over several axes in one call
        inshape(self.shape, axes)
        reduced = func.reduce(self, axis=tuple(axes))
    else:
        reduced = reduce(func, self._align(axes))

    result = self._constructor(reduced)

    # the surviving dimensions are exactly those that were not reduced
    expected = tuple(dim for pos, dim in enumerate(self.shape) if pos not in axes)
    if result.shape != expected:
        raise ValueError(
            "reduce did not yield a BoltArray with valid dimensions")

    return result
def __getitem__(self, index):
    """
    Get an item from the array through indexing.

    Accepts slices and ints (basic indexing) or lists, sets and ndarrays
    (advanced indexing); the two kinds cannot be mixed across axes.

    Parameters
    ----------
    index : tuple of slices, ints, list, sets, or ndarrays
        One or more index specifications

    Returns
    -------
    A new array built with ``self._constructor``, or the single selected
    element when every axis is indexed with an int.

    Raises
    ------
    ValueError
        If more indices than dimensions are given, or an index has an
        unsupported type.
    NotImplementedError
        If basic and advanced indexing are mixed across axes.
    """
    index = tupleize(index)

    if len(index) > self.ndim:
        raise ValueError("Too many indices for array")

    if not all(
        [isinstance(i, (slice, int, list, set, ndarray)) for i in index]):
        raise ValueError(
            "Each index must either be a slice, int, list, set, or ndarray"
        )

    # fill unspecified axes with full slices
    if len(index) < self.ndim:
        index += tuple(
            [slice(0, None, None) for _ in range(self.ndim - len(index))])

    # convert ints to lists if not all ints and slices
    if not all([isinstance(i, (int, slice)) for i in index]):
        index = tuple([[i] if isinstance(i, int) else i for i in index])

    # select basic or advanced indexing
    if all([isinstance(i, (slice, int)) for i in index]):
        rdd, shape, split = self._getbasic(index)
    elif all([isinstance(i, (set, list, ndarray)) for i in index]):
        rdd, shape, split = self._getadvanced(index)
    else:
        raise NotImplementedError(
            "Cannot mix basic indexing (slices and ints) with "
            "advanced indexing (lists and ndarrays) across axes")

    result = self._constructor(rdd, shape=shape,
                               split=split).__finalize__(self)

    # squeeze out int dimensions (and squeeze to singletons if all ints)
    if all([isinstance(i, int) for i in index]):
        return result.squeeze().toarray()[()]
    else:
        # squeeze takes axis positions, so collect where the ints sit in the
        # index rather than the int values themselves
        tosqueeze = tuple(
            [pos for pos, i in enumerate(index) if isinstance(i, int)])
        return result.squeeze(tosqueeze)
def filter(self, func, axis=(0,)):
    """
    Filter array along an axis.

    Applies a function which should evaluate to boolean,
    along a single axis or multiple axes.

    Parameters
    ----------
    func : function
        Function to apply, should return boolean

    axis : tuple or int, optional, default=(0,)
        Axis or multiple axes to filter along.

    Returns
    -------
    BoltSparkArray

    Raises
    ------
    NotImplementedError
        If more than one axis is given.
    """
    axis = tupleize(axis)
    if len(axis) != 1:
        raise NotImplementedError("Filtering over multiple axes will not be "
                                  "supported until SparseBoltArray is implemented.")

    swapped = self._align(axis)
    rdd = swapped._rdd.values().filter(func)

    # count the resulting array in order to reindex (linearize) the keys
    count, zipped = zip_with_index(rdd)
    if not count:
        count = zipped.count()
    reindexed = zipped.map(lambda kv: (kv[1], kv[0]))

    # after alignment the filtered axis leads the swapped shape, so the
    # untouched dimensions are everything after it; testing positions
    # against the original axis numbers is only correct for axis 0
    remaining = list(swapped.shape[len(axis):])
    if count != 0:
        shape = tuple([count] + remaining)
    else:
        shape = (0,)

    return self._constructor(reindexed, shape=shape, split=swapped.split).__finalize__(swapped)
def reduce(self, func, axis=(0,), keepdims=False):
    """
    Reduce an array along an axis.

    Applies a commutative/associative function of two
    arguments cumulatively to all arrays along an axis.
    Array will be aligned so that the desired set of axes
    are in the keys, which may incur a swap.

    Parameters
    ----------
    func : function
        Function of two arrays that returns a single array

    axis : tuple or int, optional, default=(0,)
        Axis or multiple axes to reduce along.

    keepdims : boolean, optional, default=False
        Re-insert each reduced axis into the result with size 1.

    Returns
    -------
    BoltArrayLocal, or a scalar when the reduction yields a non-array
    value or an array of shape ``(1,)``
    """
    from bolt.local.array import BoltArrayLocal
    from numpy import ndarray

    axis = tupleize(axis)
    swapped = self._align(axis)
    arr = swapped._rdd.values().reduce(func)

    if keepdims:
        # insert the singleton axes in ascending order so that each one
        # lands at its original position even if axes were passed unsorted
        for i in sorted(axis):
            arr = expand_dims(arr, axis=i)

    if not isinstance(arr, ndarray):
        # the result of a reduce can also be a scalar
        return arr
    elif arr.shape == (1,):
        # ndarrays with single values in them should be converted into scalars
        return arr[0]

    return BoltArrayLocal(arr)
def reduce(self, func, axis=(0, ), keepdims=False):
    """
    Reduce an array along an axis.

    Applies a commutative/associative function of two
    arguments cumulatively to all arrays along an axis.
    Array will be aligned so that the desired set of axes
    are in the keys, which may incur a swap. The values are
    combined with a tree reduce of depth 3.

    Parameters
    ----------
    func : function
        Function of two arrays that returns a single array

    axis : tuple or int, optional, default=(0,)
        Axis or multiple axes to reduce along.

    keepdims : boolean, optional, default=False
        Re-insert each reduced axis into the result with size 1.

    Returns
    -------
    BoltArrayLocal, or a scalar when the reduction yields a non-array
    value or an array of shape ``(1,)``
    """
    from bolt.local.array import BoltArrayLocal
    from numpy import ndarray

    axis = tupleize(axis)
    swapped = self._align(axis)
    arr = swapped._rdd.values().treeReduce(func, depth=3)

    if keepdims:
        # insert the singleton axes in ascending order so that each one
        # lands at its original position even if axes were passed unsorted
        for i in sorted(axis):
            arr = expand_dims(arr, axis=i)

    if not isinstance(arr, ndarray):
        # the result of a reduce can also be a scalar
        return arr
    elif arr.shape == (1, ):
        # ndarrays with single values in them should be converted into scalars
        return arr[0]

    return BoltArrayLocal(arr)
def reduce(self, func, axis=0):
    """
    Reduce the array along one or more axes with a two-argument function.

    Parameters
    ----------
    func : function
        Function of two arrays that returns a single array. A ufunc is
        reduced through its own ``reduce`` method; any other callable is
        folded over the array after aligning it on the requested axes.

    axis : tuple or int, optional, default=0
        Axis or multiple axes to reduce along.

    Returns
    -------
    A new array built with ``self._constructor``

    Raises
    ------
    ValueError
        If the result does not have the shape of the non-reduced axes.
    """
    axes = sorted(tupleize(axis))

    if not isinstance(func, ufunc):
        aligned = self._align(axes)
        reduced = reduce(func, aligned)
    else:
        # ufuncs handle multi-axis reduction themselves
        inshape(self.shape, axes)
        reduced = func.reduce(self, axis=tuple(axes))

    out = self._constructor(reduced)

    # only the axes that were not reduced should survive
    kept = [size for pos, size in enumerate(self.shape) if pos not in axes]
    if out.shape != tuple(kept):
        raise ValueError("reduce did not yield a BoltArray with valid dimensions")

    return out
def chunk(self, size, axis=None):
    """
    Split values of distributed array into chunks.

    Each (key, value) record of the underlying RDD is expanded into
    several records of the form ((key, chunk id), chunked value), where
    chunk id is a tuple identifying the chunk and chunked value is the
    part of the original value selected by that chunk's slices.

    Parameters
    ----------
    size : str or tuple or int
        Chunking specification handed to ``getplan`` together with
        ``axis``. If str, the average size (in MB) of the chunks; if int
        or tuple, an explicit per-dimension specification.

    axis : tuple, optional, default=None
        One or more axes to estimate chunks for, if provided any
        other axes will use one chunk.

    Raises
    ------
    ValueError
        If any entry of the plan exceeds the matching value dimension.
    """
    axis = tupleize(axis)
    plan = self.getplan(size, axis)

    too_big = [planned > dim for planned, dim in zip(plan, self.vshape)]
    if any(too_big):
        raise ValueError("Chunk sizes %s cannot exceed value dimensions %s along any axis"
                         % (tuple(plan), tuple(self.vshape)))

    slices = self.getslices(plan, self.vshape)

    # every combination of per-axis (position, slice) pairs, regrouped into
    # a (chunk id, slices) pair per chunk
    numbered = [list(enumerate(s)) for s in slices]
    scheme = [list(zip(*combo)) for combo in product(*numbered)]

    def _chunk(record):
        key, value = record[0], record[1]
        for chunk_id, selection in scheme:
            yield (key, chunk_id), value[selection]

    chunked = self._rdd.flatMap(_chunk)
    return self._constructor(chunked, plan=plan).__finalize__(self)
def reduce(self, func, axis=0):
    """
    Reduce an array along an axis.

    Folds an associative/commutative two-argument function over all
    arrays along the given axes. For ordinary callables the array is
    first aligned on those axes; ufuncs are reduced via ``func.reduce``.

    Parameters
    ----------
    func : function
        Function of two arrays that returns a single array

    axis : tuple or int, optional, default=0
        Axis or multiple axes to reduce along.

    Returns
    -------
    BoltArrayLocal

    Raises
    ------
    ValueError
        If the result does not have the shape of the non-reduced axes.
    """
    axes = sorted(tupleize(axis))
    multi_axis_capable = isinstance(func, ufunc)

    if multi_axis_capable:
        # a ufunc reduces over all requested axes in a single call
        inshape(self.shape, axes)
        reduced = func.reduce(self, axis=tuple(axes))
    else:
        reduced = reduce(func, self._align(axes))

    wrapped = self._constructor(reduced)

    # validate against the dimensions that were not reduced away
    surviving = tuple(
        self.shape[pos] for pos in range(len(self.shape)) if pos not in axes
    )
    if wrapped.shape != surviving:
        raise ValueError("reduce did not yield a BoltArray with valid dimensions")

    return wrapped
def __getitem__(self, item):
    """
    Index the values, and the labels if present, returning a new object.

    Int indices are turned into length-one slices before the values are
    indexed, so indexing with an int does not drop a dimension. When
    ``labels`` is set, the leading ``len(self.baseaxes)`` entries of the
    index are applied to the labels as well.

    Parameters
    ----------
    item : int, slice, list, ndarray, or tuple of these
        Index specification

    Returns
    -------
    A new object built with ``self._constructor``
    """
    # handle values -- ints become length-one slices; `i + 1 or None` keeps
    # -1 selecting the last element instead of the empty slice(-1, 0)
    if isinstance(item, int):
        item = slice(item, item + 1 or None, None)
    if isinstance(item, tuple):
        item = tuple([
            slice(i, i + 1 or None, None) if isinstance(i, int) else i
            for i in item
        ])
    if isinstance(item, (list, ndarray)):
        item = (item, )

    new = self._values.__getitem__(item)
    result = self._constructor(new).__finalize__(self,
                                                 noprop=('index', 'labels'))

    # handle labels
    if self.labels is not None:
        # after the normalization above, ints, lists and ndarrays have all
        # been rewritten, so item is a tuple or a single-axis index
        if isinstance(item, tuple):
            label_item = item[:len(self.baseaxes)]
        else:
            label_item = (item, )

        newlabels = self.labels
        for (i, s) in enumerate(label_item):
            if isinstance(s, slice):
                # must be a tuple: NumPy no longer treats a list of slices
                # as a multidimensional index
                # (assumes labels is a NumPy array -- TODO confirm)
                selector = tuple(
                    s if j == i else slice(None)
                    for j in range(len(label_item)))
                newlabels = newlabels[selector]
            else:
                newlabels = newlabels.take(tupleize(s), i)
        result.labels = newlabels

    return result
def check(v):
    """
    Return ``v`` unchanged after verifying its shape.

    Values with an empty shape are always accepted; any other value must
    have a shape equal to ``tupleize(value_shape)``, where ``value_shape``
    comes from the enclosing scope.

    Raises
    ------
    Exception
        If a non-scalar value has a different shape.
    """
    # zero-dimensional results are exempt from the uniformity check
    if len(v.shape) == 0:
        return v

    if v.shape != tupleize(value_shape):
        raise Exception("Map operation did not produce values of uniform shape.")

    return v
def map(self, func, axis=(0,), value_shape=None, dtype=None, with_keys=False):
    """
    Apply a function across an axis.

    Array will be aligned so that the desired set of axes
    are in the keys, which may incur a swap.

    When ``value_shape`` or ``dtype`` is not supplied, it is inferred by
    applying the function to a random array shaped like the aligned
    values, falling back to the first record if that attempt raises.

    Parameters
    ----------
    func : function
        Function of a single array to apply. If with_keys=True,
        function should be of a (tuple, array) pair.

    axis : tuple or int, optional, default=(0,)
        Axis or multiple axes to apply function along.

    value_shape : tuple, optional, default=None
        Known shape of values resulting from operation

    dtype: numpy.dtype, optional, default=None
        Known dtype of values resulting from operation

    with_keys : bool, optional, default=False
        Include keys as an argument to the function

    Returns
    -------
    BoltArraySpark
    """
    axis = tupleize(axis)
    swapped = self._align(axis)

    if with_keys:
        # probing needs a (key, value) pair, so supply a placeholder key
        def probe(x):
            return func(((0,), x))
    else:
        probe = func

    if value_shape is None or dtype is None:
        # infer the output by probing with random data of the right shape
        try:
            sample = random.randn(*swapped.values.shape).astype(self.dtype)
            mapped = probe(sample)
        except Exception:
            first = swapped._rdd.first()
            if first:
                # fall back to evaluating on the first real record
                mapped = probe(first[1])
        if value_shape is None:
            value_shape = mapped.shape
        if dtype is None:
            dtype = mapped.dtype

    key_shape = tuple([swapped._shape[ax] for ax in range(len(axis))])
    shape = key_shape + tupleize(value_shape)

    if with_keys:
        rdd = swapped._rdd.map(lambda kv: (kv[0], func(kv)))
    else:
        rdd = swapped._rdd.mapValues(func)

    # reshaping will fail if the elements aren't uniformly shaped
    def check(v):
        if len(v.shape) > 0 and v.shape != tupleize(value_shape):
            raise Exception("Map operation did not produce values of uniform shape.")
        return v

    rdd = rdd.mapValues(check)

    return self._constructor(rdd, shape=shape, dtype=dtype,
                             split=swapped.split).__finalize__(swapped)
def test_tupleize():
    """Check tupleize on a list, on tuples, and on a bare int."""
    cases = [
        ([1, 2, 3], (1, 2, 3)),
        ((1, 2, 3), (1, 2, 3)),
        ((1,), (1,)),
        (1, (1,)),
    ]
    for given, expected in cases:
        assert tupleize(given) == expected
def test_tupleize():
    """Check that lists, tuples and a bare int all come back as tuples."""
    triple = (1, 2, 3)
    single = (1, )

    # sequences are converted (or passed through) element for element
    assert tupleize(list(triple)) == triple
    assert tupleize(triple) == triple
    assert tupleize(single) == single

    # a bare int is wrapped into a one-element tuple
    assert tupleize(1) == single
def check(v):
    """
    Validate the shape of a mapped value and pass it through.

    A value whose shape is non-empty must match ``tupleize(value_shape)``
    (``value_shape`` is taken from the enclosing scope); values with an
    empty shape are returned without comparison.

    Raises
    ------
    Exception
        If a non-scalar value has a different shape.
    """
    has_dims = len(v.shape) > 0
    if has_dims:
        if v.shape != tupleize(value_shape):
            raise Exception(
                "Map operation did not produce values of uniform shape.")
    return v
def map(self,
        func,
        axis=(0, ),
        value_shape=None,
        dtype=None,
        with_keys=False):
    """
    Apply a function across an axis.

    The array is aligned so that the requested axes are in the keys
    (which may incur a swap) and the function is applied to each record.
    A missing ``value_shape`` or ``dtype`` is inferred by calling the
    function on a random array shaped like the aligned values, or on the
    first record if that call raises.

    Parameters
    ----------
    func : function
        Function of a single array to apply. If with_keys=True,
        function should be of a (tuple, array) pair.

    axis : tuple or int, optional, default=(0,)
        Axis or multiple axes to apply function along.

    value_shape : tuple, optional, default=None
        Known shape of values resulting from operation

    dtype: numpy.dtype, optional, default=None
        Known dtype of values resulting from operation

    with_keys : bool, optional, default=False
        Include keys as an argument to the function

    Returns
    -------
    BoltArraySpark
    """
    axis = tupleize(axis)
    aligned = self._align(axis)

    # when keys are part of the call, trial runs use a placeholder key
    trial = (lambda x: func(((0, ), x))) if with_keys else func

    if value_shape is None or dtype is None:
        try:
            # first attempt: random data with the aligned value shape
            trial_input = random.randn(*aligned.values.shape)
            mapped = trial(trial_input.astype(self.dtype))
        except Exception:
            head = aligned._rdd.first()
            if head:
                # second attempt: the first actual record's value
                mapped = trial(head[1])

        if value_shape is None:
            value_shape = mapped.shape
        if dtype is None:
            dtype = mapped.dtype

    leading = [aligned._shape[ax] for ax in range(len(axis))]
    shape = tuple(leading) + tupleize(value_shape)

    if with_keys:
        rdd = aligned._rdd.map(lambda kv: (kv[0], func(kv)))
    else:
        rdd = aligned._rdd.mapValues(func)

    # reshaping will fail if the elements aren't uniformly shaped
    def check(v):
        if len(v.shape) > 0 and v.shape != tupleize(value_shape):
            raise Exception(
                "Map operation did not produce values of uniform shape.")
        return v

    rdd = rdd.mapValues(check)

    built = self._constructor(rdd,
                              shape=shape,
                              dtype=dtype,
                              split=aligned.split)
    return built.__finalize__(aligned)