def map(self, func):
    """
    Apply a function on each subarray.

    Before the distributed map is set up, ``func`` is evaluated on two
    small test arrays built from the first value in the RDD. Comparing the
    two outputs tells us how ``func`` affects the shape.

    Parameters
    ----------
    func : function
        This is applied to each value in the intermediate RDD.
        It must return an ndarray.

    Returns
    -------
    StackedArray

    Raises
    ------
    RuntimeError
        If ``func`` raises while being evaluated on a test array.
    ValueError
        If ``func`` does not return an ndarray, or if its effect on the
        shape cannot be inferred from the two test evaluations.
    """
    value_shape = self.shape[self.split:]
    first = self._rdd.values().first()

    # build two probes that differ only in the length of their first axis
    if first.shape == value_shape:
        a = asarray([first])
        b = asarray([first, first])
    else:
        a = first
        b = concatenate((first, first))

    try:
        out_a = func(a)
        out_b = func(b)
    except Exception as e:
        raise RuntimeError("Error evaluating function on test array, got error:\n %s" % e)

    both_arrays = isinstance(out_a, ndarray) and isinstance(out_b, ndarray)
    if not both_arrays:
        raise ValueError("Function must return ndarray")

    def on_value(kv):
        # keep the key, transform the value
        return (kv[0], func(kv[1]))

    if out_a.shape == out_b.shape:
        # different shapes map to the same new shape
        if self._rekeyed is True:
            # we've already rekeyed, so the existing keys are reused
            rdd = self._rdd.map(on_value)
            shape = (self.shape[0], ) + out_a.shape
        else:
            # do the rekeying: each record is keyed by its index
            count, indexed = zip_with_index(self._rdd.values())
            rdd = indexed.map(lambda kv: ((kv[1], ), func(kv[0])))
            shape = (count, ) + out_a.shape
        split, rekeyed = 1, True

    elif out_a.shape[0] == a.shape[0] and out_b.shape[0] == b.shape[0]:
        # different shapes stay different (along the first dimension)
        rdd = self._rdd.map(on_value)
        shape = self.shape[0:self.split] + out_a.shape[1:]
        split, rekeyed = self.split, self._rekeyed

    else:
        raise ValueError("Cannot infer effect of function on shape")

    result = self._constructor(rdd, rekeyed=rekeyed, shape=shape, split=split)
    return result.__finalize__(self)
def map(self, func):
    """
    Apply a function on each subarray.

    The effect of ``func`` on the shape is inferred up front by calling it
    on two test arrays derived from the first value in the RDD; only then
    is the map over all records defined.

    Parameters
    ----------
    func : function
        This is applied to each value in the intermediate RDD.
        It must return an ndarray.

    Returns
    -------
    StackedArray

    Raises
    ------
    RuntimeError
        If evaluating ``func`` on a test array raises.
    ValueError
        If ``func`` does not return an ndarray, or if the shape of the
        result cannot be inferred.
    """
    sample = self._rdd.values().first()

    # the second probe is the first one doubled along its leading axis
    if sample.shape == self.shape[self.split:]:
        probe_one, probe_two = asarray([sample]), asarray([sample, sample])
    else:
        probe_one, probe_two = sample, concatenate((sample, sample))

    try:
        res_one = func(probe_one)
        res_two = func(probe_two)
    except Exception as e:
        raise RuntimeError("Error evaluating function on test array, got error:\n %s" % e)

    if not isinstance(res_one, ndarray) or not isinstance(res_two, ndarray):
        raise ValueError("Function must return ndarray")

    same_output_shape = res_one.shape == res_two.shape

    if same_output_shape and self._rekeyed is True:
        # output shape is fixed and we've already rekeyed: keep the keys
        rdd = self._rdd.map(lambda kv: (kv[0], func(kv[1])))
        shape = (self.shape[0],) + res_one.shape
        split = 1
        rekeyed = True

    elif same_output_shape:
        # output shape is fixed: do the rekeying using each record's index
        count, indexed = zip_with_index(self._rdd.values())
        rdd = indexed.map(lambda kv: ((kv[1],), func(kv[0])))
        shape = (count,) + res_one.shape
        split = 1
        rekeyed = True

    elif (res_one.shape[0] == probe_one.shape[0]
          and res_two.shape[0] == probe_two.shape[0]):
        # different shapes stay different (along the first dimension)
        rdd = self._rdd.map(lambda kv: (kv[0], func(kv[1])))
        shape = self.shape[0:self.split] + res_one.shape[1:]
        split = self.split
        rekeyed = self._rekeyed

    else:
        raise ValueError("Cannot infer effect of function on shape")

    return self._constructor(rdd, rekeyed=rekeyed, shape=shape, split=split).__finalize__(self)
def filter(self, func, axis=(0, ), sort=False):
    """
    Filter array along an axis.

    Applies a function which should evaluate to boolean,
    along a single axis or multiple axes. Array will be
    aligned so that the desired set of axes are in the keys,
    which may incur a swap. Records that pass the filter are
    re-keyed with a single linear index, so the result always
    has exactly one key axis.

    Parameters
    ----------
    func : function
        Function to apply, should return boolean

    axis : tuple or int, optional, default=(0,)
        Axis or multiple axes to filter along.

    sort: bool, optional, default=False
        Whether or not to sort by key before reindexing

    Returns
    -------
    BoltArraySpark
    """
    axis = tupleize(axis)
    swapped = self._align(axis)

    def keep(record):
        # the predicate only sees the value of each (key, value) record
        return func(record[1])

    filtered = swapped._rdd.filter(keep)
    if sort:
        filtered = filtered.sortByKey()
    values = filtered.values()

    # count the resulting array in order to reindex (linearize) the keys
    count, zipped = zip_with_index(values)
    if not count:
        # no usable count came back; ask the RDD directly
        count = zipped.count()
    reindexed = zipped.map(lambda kv: (tupleize(kv[1]), kv[0]))

    # the filtered axes collapse into one linear axis; whatever follows them
    # in the aligned shape is carried over unchanged
    if count != 0:
        shape = (count, ) + tuple(swapped.shape[len(axis):])
    else:
        shape = (0, )

    # The new keys are single linear indices, so there is one key axis no
    # matter how many axes were filtered over. Passing swapped.split here
    # would overstate the number of key axes for multi-axis filters.
    return self._constructor(reindexed, shape=shape, split=1).__finalize__(swapped)
def filter(self, func, axis=(0,), sort=False):
    """
    Filter array along an axis.

    Applies a function which should evaluate to boolean,
    along a single axis or multiple axes. Array will be
    aligned so that the desired set of axes are in the keys,
    which may incur a swap. The surviving records are then
    reindexed with one linear key, so the returned array has
    a single key axis followed by the untouched value axes.

    Parameters
    ----------
    func : function
        Function to apply, should return boolean

    axis : tuple or int, optional, default=(0,)
        Axis or multiple axes to filter along.

    sort: bool, optional, default=False
        Whether or not to sort by key before reindexing

    Returns
    -------
    BoltArraySpark
    """
    axis = tupleize(axis)
    swapped = self._align(axis)

    # apply the predicate to the value of each (key, value) record
    passed = swapped._rdd.filter(lambda record: func(record[1]))
    rdd = passed.sortByKey().values() if sort else passed.values()

    # count the resulting array in order to reindex (linearize) the keys
    count, zipped = zip_with_index(rdd)
    if not count:
        count = zipped.count()
    reindexed = zipped.map(lambda kv: (tupleize(kv[1]), kv[0]))

    # all filtered axes are replaced by one axis of length `count`; the axes
    # after them in the aligned shape are kept as they are
    remaining = tuple(swapped.shape[len(axis):])
    shape = ((count,) + remaining) if count != 0 else (0,)

    # Each record is now keyed by one linear index, i.e. exactly one key
    # axis. Reusing swapped.split would be wrong when several axes were
    # filtered, because it would count key axes that no longer exist.
    return self._constructor(reindexed, shape=shape, split=1).__finalize__(swapped)
def filter(self, func, axis=(0, )):
    """
    Filter array along an axis.

    Applies a function which should evaluate to boolean to every value
    along one axis. Array will be aligned so that the requested axis is
    in the keys, which may incur a swap; the values that pass are then
    reindexed from scratch.

    Parameters
    ----------
    func : function
        Function to apply, should return boolean

    axis : tuple or int, optional, default=(0,)
        Axis to filter along. Only a single axis is currently accepted.

    Returns
    -------
    BoltArraySpark

    Raises
    ------
    NotImplementedError
        If more than one axis is given.
    """
    axis = tupleize(axis)
    if len(axis) != 1:
        raise NotImplementedError("Filtering over multiple axes will not be "
                                  "supported until SparseBoltArray is implemented.")

    aligned = self._align(axis)
    kept = aligned._rdd.values().filter(func)

    # count the resulting array in order to reindex (linearize) the keys
    count, zipped = zip_with_index(kept)
    count = count or zipped.count()
    reindexed = zipped.map(lambda kv: (kv[1], kv[0]))

    # since we can only filter over one axis, everything after the first
    # axis of the aligned array is left as it was
    if count != 0:
        shape = (count, ) + tuple(aligned.shape[1:])
    else:
        shape = (0, )

    result = self._constructor(reindexed, shape=shape, split=aligned.split)
    return result.__finalize__(aligned)
def filter(self, func, axis=(0, )):
    """
    Filter array along an axis.

    Applies a function which should evaluate to boolean,
    along a single axis. The array is first aligned on the
    requested axis (which may incur a swap), the values are
    filtered, and the survivors are reindexed.

    Parameters
    ----------
    func : function
        Function to apply, should return boolean

    axis : tuple or int, optional, default=(0,)
        Axis to filter along. Only a single axis is currently accepted.

    Returns
    -------
    BoltArraySpark

    Raises
    ------
    NotImplementedError
        If more than one axis is given.
    """
    axis = tupleize(axis)
    if len(axis) != 1:
        raise NotImplementedError(
            "Filtering over multiple axes will not be "
            "supported until SparseBoltArray is implemented.")

    swapped = self._align(axis)
    rdd = swapped._rdd.values().filter(func)

    # count the resulting array in order to reindex (linearize) the keys
    count, zipped = zip_with_index(rdd)
    if not count:
        count = zipped.count()
    reindexed = zipped.map(lambda kv: (kv[1], kv[0]))

    # `axis` indexes the array as it was before alignment, so it must not be
    # used to pick dimensions out of `swapped.shape`. With a single filtered
    # axis, the axes left over are everything after the first aligned one.
    remaining = list(swapped.shape[1:])

    if count != 0:
        shape = tuple([count] + remaining)
    else:
        shape = (0, )

    return self._constructor(reindexed, shape=shape, split=swapped.split).__finalize__(swapped)
def filter(self, func, axis=(0,)):
    """
    Filter array along an axis.

    Evaluates a boolean function on every value along one axis and keeps
    the values for which it holds. Array will be aligned so that the
    requested axis is in the keys, which may incur a swap.

    Parameters
    ----------
    func : function
        Function to apply, should return boolean

    axis : tuple or int, optional, default=(0,)
        Axis to filter along. Only a single axis is currently accepted.

    Returns
    -------
    BoltArraySpark

    Raises
    ------
    NotImplementedError
        If more than one axis is given.
    """
    axis = tupleize(axis)
    single_axis = len(axis) == 1
    if not single_axis:
        raise NotImplementedError("Filtering over multiple axes will not be "
                                  "supported until SparseBoltArray is implemented.")

    swapped = self._align(axis)
    survivors = swapped._rdd.values().filter(func)

    # count the resulting array in order to reindex (linearize) the keys
    count, zipped = zip_with_index(survivors)
    if not count:
        count = zipped.count()

    def index_as_key(kv):
        # records arrive as (value, index); flip them to (index, value)
        return (kv[1], kv[0])

    reindexed = zipped.map(index_as_key)

    # since we can only filter over one axis, the trailing axes of the
    # aligned array are always what remains
    trailing = tuple(swapped.shape[1:])
    shape = ((count,) + trailing) if count != 0 else (0,)

    return self._constructor(reindexed, shape=shape, split=swapped.split).__finalize__(swapped)
def filter(self, func, axis=(0,)):
    """
    Filter array along an axis.

    Applies a function which should evaluate to boolean,
    along a single axis. The array is aligned on that axis
    first (which may incur a swap); values that pass are
    then reindexed.

    Parameters
    ----------
    func : function
        Function to apply, should return boolean

    axis : tuple or int, optional, default=(0,)
        Axis to filter along. Only a single axis is currently accepted.

    Returns
    -------
    BoltArraySpark

    Raises
    ------
    NotImplementedError
        If more than one axis is given.
    """
    axis = tupleize(axis)
    if len(axis) != 1:
        raise NotImplementedError("Filtering over multiple axes will not be "
                                  "supported until SparseBoltArray is implemented.")

    swapped = self._align(axis)
    rdd = swapped._rdd.values().filter(func)

    # count the resulting array in order to reindex (linearize) the keys
    count, zipped = zip_with_index(rdd)
    if not count:
        count = zipped.count()
    reindexed = zipped.map(lambda kv: (kv[1], kv[0]))

    # The entries of `axis` refer to positions in the array before it was
    # aligned, so testing `dim not in axis` against `swapped.shape` drops
    # the wrong dimension whenever a swap happened. Only one axis is
    # filtered, so the remaining shape is everything after the first
    # aligned axis.
    remaining = list(swapped.shape[1:])

    if count != 0:
        shape = tuple([count] + remaining)
    else:
        shape = (0,)

    return self._constructor(reindexed, shape=shape, split=swapped.split).__finalize__(swapped)