def generate_groups(data, group_index, ngroups, axis=0, factory=lambda x: x):
    """
    Yield ``(group number, chunk of data)`` pairs, one pair per group.

    Parameters
    ----------
    data : BlockManager, Series or DataFrame
        Object to split into groups.
    group_index : ndarray
        Group ids (must support ``astype`` and ``take``); presumably one
        id per element of ``data`` along ``axis`` -- confirm with caller.
        Cast to int32 before being handed to the ``lib`` routines.
    ngroups : int
        Number of groups, forwarded to ``lib.groupsort_indexer`` and
        ``lib.generate_slices``.
    axis : int, default 0
        Axis of ``data`` that is reordered and sliced.
    factory : callable, default identity
        Applied to every slice, but only when ``data`` is a BlockManager.

    Returns
    -------
    generator
        Yields ``(i, chunk)`` where ``i`` is the position of the
        ``(start, end)`` pair returned by ``lib.generate_slices``.

    Raises
    ------
    TypeError
        If ``data`` is not a BlockManager, Series or DataFrame. Raised when
        the generator is first advanced.
    """
    # Cast once up front; ``take`` keeps the dtype, so the reordered ids can
    # be passed to ``lib.generate_slices`` without a second conversion.
    group_index = group_index.astype("i4")
    indexer = lib.groupsort_indexer(group_index, ngroups)[0]
    group_index = group_index.take(indexer)

    # Reorder ``data`` with the same indexer that reorders the group ids.
    if isinstance(data, BlockManager):
        # this is sort of wasteful but...
        sorted_axis = data.axes[axis].take(indexer)
        sorted_data = data.reindex_axis(sorted_axis, axis=axis)
    elif isinstance(data, Series):
        sorted_axis = data.index.take(indexer)
        sorted_data = data.reindex(sorted_axis)
    elif isinstance(data, DataFrame):
        sorted_data = data.take(indexer, axis=axis)
    else:
        # Previously this fell through and failed later with an
        # UnboundLocalError on ``sorted_data``.
        raise TypeError('data must be a BlockManager, Series or DataFrame, '
                        'got %s' % type(data).__name__)

    # Pick the slicing strategy that matches the reordered container.
    if isinstance(sorted_data, DataFrame):
        def _get_slice(slob):
            if axis == 0:
                return sorted_data[slob]
            else:
                return sorted_data.ix[:, slob]
    elif isinstance(sorted_data, BlockManager):
        def _get_slice(slob):
            return factory(sorted_data.get_slice(slob, axis=axis))
    elif isinstance(sorted_data, Series):
        def _get_slice(slob):
            return sorted_data._get_values(slob)
    else:  # pragma: no cover
        def _get_slice(slob):
            return sorted_data[slob]

    starts, ends = lib.generate_slices(group_index, ngroups)

    for i, (start, end) in enumerate(zip(starts, ends)):
        # Since I'm now compressing the group ids, it's now not "possible" to
        # produce empty slices because such groups would not be observed in the
        # data
        assert start < end
        yield i, _get_slice(slice(start, end))
def generate_groups(data, label_list, shape, axis=0, factory=lambda x: x):
    """
    Yield ``(group number, chunk of data)`` pairs, one pair per group slot.

    Parameters
    ----------
    data : BlockManager, Series or DataFrame
        Object to split into groups.
    label_list : sequence of ndarray
        Label arrays combined into a single group id by
        ``get_group_index``. A value of ``-1`` in any of the arrays marks
        that position as missing; its group id is set to ``-1``.
    shape : sequence of int
        Forwarded to ``get_group_index``; its product is used as the number
        of groups for the ``lib`` routines.
    axis : int, default 0
        Axis of ``data`` that is reordered and sliced.
    factory : callable, default identity
        Applied to every slice, but only when ``data`` is a BlockManager.

    Returns
    -------
    generator
        Yields ``(i, chunk)`` for every ``(start, end)`` pair returned by
        ``lib.generate_slices``; ``chunk`` is ``None`` when the slice is
        empty (``start == end``).

    Raises
    ------
    TypeError
        If ``data`` is not a BlockManager, Series or DataFrame. Raised when
        the generator is first advanced.
    """
    group_index = get_group_index(label_list, shape)

    # A position is missing as soon as any one of its labels is -1.
    na_mask = np.zeros(len(label_list[0]), dtype=bool)
    for arr in label_list:
        na_mask |= arr == -1
    group_index[na_mask] = -1

    # Compute the group count and the int32 cast once instead of twice;
    # ``take`` keeps the dtype of the reordered ids.
    ngroups = np.prod(shape)
    group_index = group_index.astype('i4')
    indexer = lib.groupsort_indexer(group_index, ngroups)[0]
    group_index = group_index.take(indexer)

    # Reorder ``data`` with the same indexer that reorders the group ids.
    if isinstance(data, BlockManager):
        # this is sort of wasteful but...
        sorted_axis = data.axes[axis].take(indexer)
        sorted_data = data.reindex_axis(sorted_axis, axis=axis)
    elif isinstance(data, Series):
        sorted_axis = data.index.take(indexer)
        sorted_data = data.reindex(sorted_axis)
    elif isinstance(data, DataFrame):
        sorted_data = data.take(indexer, axis=axis)
    else:
        # Previously this fell through and failed later with an
        # UnboundLocalError on ``sorted_data``.
        raise TypeError('data must be a BlockManager, Series or DataFrame, '
                        'got %s' % type(data).__name__)

    # Pick the slicing strategy that matches the reordered container.
    if isinstance(sorted_data, DataFrame):
        def _get_slice(slob):
            if axis == 0:
                return sorted_data[slob]
            else:
                return sorted_data.ix[:, slob]
    elif isinstance(sorted_data, BlockManager):
        def _get_slice(slob):
            return factory(sorted_data.get_slice(slob, axis=axis))
    elif isinstance(sorted_data, Series):
        def _get_slice(slob):
            return sorted_data._get_values(slob)
    else:  # pragma: no cover
        def _get_slice(slob):
            return sorted_data[slob]

    starts, ends = lib.generate_slices(group_index, ngroups)

    for i, (start, end) in enumerate(zip(starts, ends)):
        if start == end:
            # Group slot with no observations.
            yield i, None
        else:
            yield i, _get_slice(slice(start, end))
def generate_groups(data, group_index, ngroups, axis=0, factory=lambda x: x):
    """
    Yield ``(group number, chunk of data)`` pairs, one pair per group.

    Parameters
    ----------
    data : BlockManager, Series or DataFrame
        Object to split into groups.
    group_index : ndarray
        Group ids; presumably one id per element of ``data`` along
        ``axis`` -- confirm with caller. Passed through
        ``com._ensure_int32`` before being handed to the ``lib`` routines.
    ngroups : int
        Number of groups, forwarded to ``lib.groupsort_indexer`` and
        ``lib.generate_slices``.
    axis : int, default 0
        Axis of ``data`` that is reordered and sliced.
    factory : callable, default identity
        Applied to every slice, but only when ``data`` is a BlockManager.

    Returns
    -------
    generator
        Yields ``(i, chunk)`` where ``i`` is the position of the
        ``(start, end)`` pair returned by ``lib.generate_slices``.

    Raises
    ------
    TypeError
        If ``data`` is not a BlockManager, Series or DataFrame. Raised when
        the generator is first advanced.
    """
    group_index = com._ensure_int32(group_index)

    indexer = lib.groupsort_indexer(group_index, ngroups)[0]
    group_index = group_index.take(indexer)

    # Reorder ``data`` with the same indexer that reorders the group ids.
    if isinstance(data, BlockManager):
        # this is sort of wasteful but...
        sorted_axis = data.axes[axis].take(indexer)
        sorted_data = data.reindex_axis(sorted_axis, axis=axis)
    elif isinstance(data, Series):
        sorted_axis = data.index.take(indexer)
        sorted_data = data.reindex(sorted_axis)
    elif isinstance(data, DataFrame):
        sorted_data = data.take(indexer, axis=axis)
    else:
        # Previously this fell through and failed later with an
        # UnboundLocalError on ``sorted_data``.
        raise TypeError('data must be a BlockManager, Series or DataFrame, '
                        'got %s' % type(data).__name__)

    # Pick the slicing strategy that matches the reordered container.
    if isinstance(sorted_data, DataFrame):
        def _get_slice(slob):
            if axis == 0:
                return sorted_data[slob]
            else:
                return sorted_data.ix[:, slob]
    elif isinstance(sorted_data, BlockManager):
        def _get_slice(slob):
            return factory(sorted_data.get_slice(slob, axis=axis))
    elif isinstance(sorted_data, Series):
        def _get_slice(slob):
            return sorted_data._get_values(slob)
    else:  # pragma: no cover
        def _get_slice(slob):
            return sorted_data[slob]

    starts, ends = lib.generate_slices(group_index, ngroups)

    for i, (start, end) in enumerate(zip(starts, ends)):
        # Since I'm now compressing the group ids, it's now not "possible" to
        # produce empty slices because such groups would not be observed in the
        # data
        assert start < end
        yield i, _get_slice(slice(start, end))