def _get_group_keys(self):
    """
    Build dense int64 group keys for the left and right join sides.

    Returns
    -------
    left_group_key : ndarray
        Factorized group labels for the left side.
    right_group_key : ndarray
        Factorized group labels for the right side.
    max_groups : long
        Number of observed groups after the final factorization.
    """
    # Keys come either from the (possibly hierarchical) index or from
    # the precomputed join-key arrays.
    if self.left_index:
        if isinstance(self.left.index, MultiIndex):
            left_keys = [lev.values.take(lab)
                         for lev, lab in zip(self.left.index.levels,
                                             self.left.index.labels)]
        else:
            left_keys = [self.left.index.values]
    else:
        left_keys = self.left_join_keys

    if self.right_index:
        if isinstance(self.right.index, MultiIndex):
            right_keys = [lev.values.take(lab)
                          for lev, lab in zip(self.right.index.levels,
                                              self.right.index.labels)]
        else:
            right_keys = [self.right.index.values]
    else:
        right_keys = self.right_join_keys

    # Explicit raise instead of a bare assert, which is stripped under -O.
    if len(left_keys) != len(right_keys):
        raise AssertionError('left_keys and right_keys must be '
                             'the same length')

    left_labels = []
    right_labels = []
    group_sizes = []
    for lk, rk in zip(left_keys, right_keys):
        llab, rlab, count = _factorize_objects(lk, rk, sort=self.sort)
        left_labels.append(llab)
        right_labels.append(rlab)
        group_sizes.append(count)

    left_group_key = get_group_index(left_labels, group_sizes)
    right_group_key = get_group_index(right_labels, group_sizes)

    # Use Python longs so the product cannot overflow a fixed-width int
    # (long(1) instead of the non-portable 1L literal).
    max_groups = long(1)
    for x in group_sizes:
        max_groups *= long(x)

    if max_groups > 2 ** 63:  # pragma: no cover
        raise Exception('Combinatorial explosion! (boom)')

    left_group_key, right_group_key, max_groups = \
        _factorize_int64(left_group_key, right_group_key, sort=self.sort)
    return left_group_key, right_group_key, max_groups
def _get_group_keys(self):
    """
    Build dense int64 group keys for the left and right join sides.

    Returns
    -------
    left_group_key : ndarray
        Factorized group labels for the left side.
    right_group_key : ndarray
        Factorized group labels for the right side.
    max_groups : long
        Number of observed groups after the final factorization.
    """
    # Keys come either from the (possibly hierarchical) index or from
    # the precomputed join-key arrays.
    if self.left_index:
        if isinstance(self.left.index, MultiIndex):
            left_keys = [lev.values.take(lab)
                         for lev, lab in zip(self.left.index.levels,
                                             self.left.index.labels)]
        else:
            left_keys = [self.left.index.values]
    else:
        left_keys = self.left_join_keys

    if self.right_index:
        if isinstance(self.right.index, MultiIndex):
            right_keys = [lev.values.take(lab)
                          for lev, lab in zip(self.right.index.levels,
                                              self.right.index.labels)]
        else:
            right_keys = [self.right.index.values]
    else:
        right_keys = self.right_join_keys

    # Explicit raise instead of a bare assert, which is stripped under -O.
    if len(left_keys) != len(right_keys):
        raise AssertionError('left_keys and right_keys must be '
                             'the same length')

    left_labels = []
    right_labels = []
    group_sizes = []
    for lk, rk in zip(left_keys, right_keys):
        llab, rlab, count = _factorize_objects(lk, rk, sort=self.sort)
        left_labels.append(llab)
        right_labels.append(rlab)
        group_sizes.append(count)

    left_group_key = get_group_index(left_labels, group_sizes)
    right_group_key = get_group_index(right_labels, group_sizes)

    # Use Python longs so the product cannot overflow a fixed-width int
    # (long(1) instead of the non-portable 1L literal).
    max_groups = long(1)
    for x in group_sizes:
        max_groups *= long(x)

    if max_groups > 2 ** 63:  # pragma: no cover
        raise Exception('Combinatorial explosion! (boom)')

    left_group_key, right_group_key, max_groups = \
        _factorize_int64(left_group_key, right_group_key, sort=self.sort)
    return left_group_key, right_group_key, max_groups
def _make_selectors(self): new_levels = self.new_index_levels # make the mask group_index = get_group_index(self.sorted_labels[:-1], [len(x) for x in new_levels]) group_index = _ensure_platform_int(group_index) group_mask = np.zeros(self.full_shape[0], dtype=bool) group_mask.put(group_index, True) stride = self.index.levshape[self.level] selector = self.sorted_labels[-1] + stride * group_index mask = np.zeros(np.prod(self.full_shape), dtype=bool) mask.put(selector, True) # compress labels unique_groups = np.arange(self.full_shape[0])[group_mask] compressor = group_index.searchsorted(unique_groups) if mask.sum() < len(self.index): raise ReshapeError("Index contains duplicate entries, " "cannot reshape") self.group_mask = group_mask self.group_index = group_index self.mask = mask self.unique_groups = unique_groups self.compressor = compressor
def _make_selectors(self): new_levels = self.new_index_levels # make the mask group_index = get_group_index(self.sorted_labels[:-1], [len(x) for x in new_levels]) comp_index, obs_ids = _compress_group_index(group_index) ngroups = len(obs_ids) comp_index = _ensure_platform_int(comp_index) stride = self.index.levshape[self.level] self.full_shape = ngroups, stride selector = self.sorted_labels[-1] + stride * comp_index mask = np.zeros(np.prod(self.full_shape), dtype=bool) mask.put(selector, True) if mask.sum() < len(self.index): raise ReshapeError('Index contains duplicate entries, ' 'cannot reshape') self.group_index = comp_index self.mask = mask self.unique_groups = obs_ids self.compressor = comp_index.searchsorted(np.arange(ngroups))
def _make_selectors(self): new_levels = self.new_index_levels # make the mask group_index = get_group_index(self.sorted_labels, [len(x) for x in new_levels]) group_mask = np.zeros(self.full_shape[0], dtype=bool) group_mask.put(group_index, True) stride = self.index.levshape[self.level] selector = self.sorted_labels[-1] + stride * group_index mask = np.zeros(np.prod(self.full_shape), dtype=bool) mask.put(selector, True) # compress labels unique_groups = np.arange(self.full_shape[0])[group_mask] compressor = group_index.searchsorted(unique_groups) if mask.sum() < len(self.index): raise ReshapeError('Index contains duplicate entries, ' 'cannot reshape') self.group_mask = group_mask self.group_index = group_index self.mask = mask self.unique_groups = unique_groups self.compressor = compressor
def _get_multiindex_indexer(join_keys, index, sort=False):
    """Compute left/right row indexers for joining key arrays against
    the levels of a MultiIndex (left-outer join)."""
    shape = []
    labels = []
    # Factorize each join key against the matching index level, keeping
    # the key-side codes and the per-level group count.
    for level, key in zip(index.levels, join_keys):
        llab, rlab, count = _factorize_objects(level, key, sort=False)
        labels.append(rlab)
        shape.append(count)

    left_group_key = get_group_index(labels, shape)
    right_group_key = get_group_index(index.labels, shape)

    left_group_key, right_group_key, max_groups = \
        _factorize_int64(left_group_key, right_group_key, sort=False)

    # NOTE(review): astype("i4") truncates the int64 group keys to 32
    # bits; with very many groups this could overflow -- confirm that
    # lib.left_outer_join really requires int32 input here.
    left_indexer, right_indexer = lib.left_outer_join(
        left_group_key.astype("i4"),
        right_group_key.astype("i4"),
        max_groups, sort=False)

    return left_indexer, right_indexer
def _get_multiindex_indexer(join_keys, index, sort=False):
    """Compute left/right row indexers for joining key arrays against
    the levels of a MultiIndex (left-outer join)."""
    labels = []
    shape = []
    # Factorize each join key against its index level; keep the key-side
    # codes and the per-level group count.
    for lev, key in zip(index.levels, join_keys):
        lev_codes, key_codes, nobs = _factorize_keys(lev, key, sort=False)
        labels.append(key_codes)
        shape.append(nobs)

    lkey = get_group_index(labels, shape)
    rkey = get_group_index(index.labels, shape)

    # Collapse both flat keys to a shared dense id space.
    lkey, rkey, count = _factorize_keys(lkey, rkey, sort=False)

    left_indexer, right_indexer = algos.left_outer_join(
        com._ensure_int64(lkey),
        com._ensure_int64(rkey),
        count,
        sort=False)
    return left_indexer, right_indexer
def _get_join_indexers(left_keys, right_keys, sort=False, how='inner'):
    """
    Compute row indexers for joining two lists of key arrays.

    Parameters
    ----------
    left_keys : list of ndarray
        Key arrays for the left side, one per join key.
    right_keys : list of ndarray
        Key arrays for the right side; must match left_keys in length.
    sort : bool, default False
        Whether to sort the factorized group keys.
    how : {'inner', 'outer', 'left', 'right'}, default 'inner'
        Join type; selects the join function from _join_functions.

    Returns
    -------
    Whatever the selected join function returns (left/right indexers).

    Raises
    ------
    AssertionError
        If the two key lists differ in length.
    """
    if len(left_keys) != len(right_keys):
        raise AssertionError('left_key and right_keys must be the same length')

    left_labels = []
    right_labels = []
    group_sizes = []
    for lk, rk in zip(left_keys, right_keys):
        llab, rlab, count = _factorize_keys(lk, rk, sort=sort)
        left_labels.append(llab)
        right_labels.append(rlab)
        group_sizes.append(count)

    # Python longs cannot overflow, so the product is exact.
    max_groups = long(1)
    for x in group_sizes:
        max_groups *= long(x)

    if max_groups > 2**63:  # pragma: no cover
        # Flat int64 key would overflow: factorize tuples of labels instead.
        left_group_key, right_group_key, max_groups = \
            _factorize_keys(lib.fast_zip(left_labels),
                            lib.fast_zip(right_labels))
    else:
        left_group_key = get_group_index(left_labels, group_sizes)
        right_group_key = get_group_index(right_labels, group_sizes)

        left_group_key, right_group_key, max_groups = \
            _factorize_keys(left_group_key, right_group_key, sort=sort)

    # preserve left frame order if how == 'left' and sort == False
    kwargs = {'sort': sort} if how == 'left' else {}
    join_func = _join_functions[how]
    return join_func(left_group_key, right_group_key, max_groups, **kwargs)
def _get_join_indexers(left_keys, right_keys, sort=False, how='inner'):
    """
    Compute row indexers for joining two lists of key arrays.

    Parameters
    ----------
    left_keys, right_keys : list of ndarray
        One key array per join key; the lists must be the same length.
    sort : bool, default False
    how : {'inner', 'outer', 'left', 'right'}, default 'inner'

    Returns
    -------
    The result of the join function selected by `how`.
    """
    if len(left_keys) != len(right_keys):
        raise AssertionError('left_key and right_keys must be the same length')

    left_labels = []
    right_labels = []
    group_sizes = []
    for lkey, rkey in zip(left_keys, right_keys):
        lids, rids, ngroups = _factorize_keys(lkey, rkey, sort=sort)
        left_labels.append(lids)
        right_labels.append(rids)
        group_sizes.append(ngroups)

    # Python longs cannot overflow, so this product is exact.
    max_groups = long(1)
    for size in group_sizes:
        max_groups *= long(size)

    if max_groups > 2 ** 63:  # pragma: no cover
        # A flat int64 key would overflow: factorize label tuples instead.
        zipped_left = lib.fast_zip(left_labels)
        zipped_right = lib.fast_zip(right_labels)
        left_group_key, right_group_key, max_groups = \
            _factorize_keys(zipped_left, zipped_right)
    else:
        left_group_key = get_group_index(left_labels, group_sizes)
        right_group_key = get_group_index(right_labels, group_sizes)
        left_group_key, right_group_key, max_groups = \
            _factorize_keys(left_group_key, right_group_key, sort=sort)

    # preserve left frame order if how == 'left' and sort == False
    if how == 'left':
        kwargs = {'sort': sort}
    else:
        kwargs = {}
    join_func = _join_functions[how]
    return join_func(left_group_key, right_group_key, max_groups, **kwargs)
def _get_join_indexers(left_keys, right_keys, sort=False, how='inner'):
    """
    Compute row indexers for joining two lists of key arrays.

    Parameters
    ----------
    left_keys, right_keys : list of ndarray
        One key array per join key; the lists must be the same length.
    sort : bool, default False
    how : {'inner', 'outer', 'left', 'right'}, default 'inner'

    Returns
    -------
    The result of the join function selected by `how`.

    Raises
    ------
    AssertionError
        If the two key lists differ in length.
    """
    # Raise with an explicit message instead of a bare AssertionError().
    if len(left_keys) != len(right_keys):
        raise AssertionError('left_keys and right_keys must be '
                             'the same length')

    left_labels = []
    right_labels = []
    group_sizes = []
    for lk, rk in zip(left_keys, right_keys):
        llab, rlab, count = _factorize_keys(lk, rk, sort=sort)
        left_labels.append(llab)
        right_labels.append(rlab)
        group_sizes.append(count)

    # long(1) instead of the non-portable 1L literal; Python longs
    # cannot overflow, so the product is exact.
    max_groups = long(1)
    for x in group_sizes:
        max_groups *= long(x)

    if max_groups > 2 ** 63:  # pragma: no cover
        # Flat int64 key would overflow: factorize tuples of labels instead.
        left_group_key, right_group_key, max_groups = \
            _factorize_keys(lib.fast_zip(left_labels),
                            lib.fast_zip(right_labels))
    else:
        left_group_key = get_group_index(left_labels, group_sizes)
        right_group_key = get_group_index(right_labels, group_sizes)

        left_group_key, right_group_key, max_groups = \
            _factorize_keys(left_group_key, right_group_key, sort=sort)

    join_func = _join_functions[how]
    return join_func(left_group_key, right_group_key, max_groups)
def _get_join_indexers(left_keys, right_keys, sort=False, how='inner'):
    """
    Compute row indexers for joining two lists of key arrays.

    Parameters
    ----------
    left_keys, right_keys : list of ndarray
        One key array per join key; the lists must be the same length.
    sort : bool, default False
    how : {'inner', 'outer', 'left', 'right'}, default 'inner'

    Returns
    -------
    The result of the join function selected by `how`.

    Raises
    ------
    AssertionError
        If the two key lists differ in length.
    """
    # Raise with an explicit message instead of a bare AssertionError().
    if len(left_keys) != len(right_keys):
        raise AssertionError('left_keys and right_keys must be '
                             'the same length')

    left_labels = []
    right_labels = []
    group_sizes = []
    for lk, rk in zip(left_keys, right_keys):
        llab, rlab, count = _factorize_keys(lk, rk, sort=sort)
        left_labels.append(llab)
        right_labels.append(rlab)
        group_sizes.append(count)

    # Python longs cannot overflow, so the product is exact.
    max_groups = long(1)
    for x in group_sizes:
        max_groups *= long(x)

    if max_groups > 2**63:  # pragma: no cover
        # Flat int64 key would overflow: factorize tuples of labels instead.
        left_group_key, right_group_key, max_groups = \
            _factorize_keys(lib.fast_zip(left_labels),
                            lib.fast_zip(right_labels))
    else:
        left_group_key = get_group_index(left_labels, group_sizes)
        right_group_key = get_group_index(right_labels, group_sizes)

        left_group_key, right_group_key, max_groups = \
            _factorize_keys(left_group_key, right_group_key, sort=sort)

    join_func = _join_functions[how]
    return join_func(left_group_key, right_group_key, max_groups)
def _get_multiindex_indexer(join_keys, index, sort=False):
    """Compute left/right row indexers for joining key arrays against
    the levels of a MultiIndex (left-outer join)."""
    shape = []
    labels = []
    # Factorize each join key against its index level; record the
    # key-side codes and the per-level group count.
    for lev, key_arr in zip(index.levels, join_keys):
        lev_codes, key_codes, nobs = _factorize_keys(lev, key_arr,
                                                     sort=False)
        labels.append(key_codes)
        shape.append(nobs)

    lkey = get_group_index(labels, shape)
    rkey = get_group_index(index.labels, shape)

    # Collapse both flat keys to a shared dense id space.
    lkey, rkey, count = _factorize_keys(lkey, rkey, sort=False)

    left_indexer, right_indexer = algos.left_outer_join(
        com._ensure_int64(lkey), com._ensure_int64(rkey),
        count, sort=False)
    return left_indexer, right_indexer
def _get_multiindex_indexer(join_keys, index, sort=False):
    """Compute left/right row indexers for joining key arrays against
    the levels of a MultiIndex (left-outer join)."""
    shape = []
    labels = []
    # Factorize each join key against the matching index level, keeping
    # the key-side codes and the per-level group count.
    for level, key in zip(index.levels, join_keys):
        llab, rlab, count = _factorize_objects(level, key, sort=False)
        labels.append(rlab)
        shape.append(count)

    left_group_key = get_group_index(labels, shape)
    right_group_key = get_group_index(index.labels, shape)

    left_group_key, right_group_key, max_groups = \
        _factorize_int64(left_group_key, right_group_key, sort=False)

    # NOTE(review): astype('i4') truncates the int64 group keys to 32
    # bits; with very many groups this could overflow -- confirm that
    # lib.left_outer_join really requires int32 input here.
    left_indexer, right_indexer = \
        lib.left_outer_join(left_group_key.astype('i4'),
                            right_group_key.astype('i4'),
                            max_groups, sort=False)

    return left_indexer, right_indexer
def _get_group_keys(self):
    """
    Build dense int64 group keys for the left and right join sides.

    Returns
    -------
    left_group_key : ndarray
        Factorized group labels for the left side.
    right_group_key : ndarray
        Factorized group labels for the right side.
    max_groups : long
        Number of observed groups after the final factorization.
    """
    left_keys = self.left_join_keys
    right_keys = self.right_join_keys

    # Explicit raise instead of a bare assert, which is stripped under -O.
    if len(left_keys) != len(right_keys):
        raise AssertionError('left_keys and right_keys must be '
                             'the same length')

    left_labels = []
    right_labels = []
    group_sizes = []
    for lk, rk in zip(left_keys, right_keys):
        llab, rlab, count = _factorize_keys(lk, rk, sort=self.sort)
        left_labels.append(llab)
        right_labels.append(rlab)
        group_sizes.append(count)

    left_group_key = get_group_index(left_labels, group_sizes)
    right_group_key = get_group_index(right_labels, group_sizes)

    # long(1) instead of the non-portable 1L literal; Python longs
    # cannot overflow, so the product is exact.
    max_groups = long(1)
    for x in group_sizes:
        max_groups *= long(x)

    if max_groups > 2**63:  # pragma: no cover
        raise Exception('Combinatorial explosion! (boom)')

    left_group_key, right_group_key, max_groups = \
        _factorize_keys(left_group_key, right_group_key, sort=self.sort)
    return left_group_key, right_group_key, max_groups
def _get_group_keys(self):
    """
    Build dense int64 group keys for the left and right join sides.

    Returns
    -------
    left_group_key : ndarray
        Factorized group labels for the left side.
    right_group_key : ndarray
        Factorized group labels for the right side.
    max_groups : long
        Number of observed groups after the final factorization.
    """
    left_keys = self.left_join_keys
    right_keys = self.right_join_keys

    # Explicit raise instead of a bare assert, which is stripped under -O.
    if len(left_keys) != len(right_keys):
        raise AssertionError('left_keys and right_keys must be '
                             'the same length')

    left_labels = []
    right_labels = []
    group_sizes = []
    for lk, rk in zip(left_keys, right_keys):
        llab, rlab, count = _factorize_objects(lk, rk, sort=self.sort)
        left_labels.append(llab)
        right_labels.append(rlab)
        group_sizes.append(count)

    left_group_key = get_group_index(left_labels, group_sizes)
    right_group_key = get_group_index(right_labels, group_sizes)

    # long(1) instead of the non-portable 1L literal; Python longs
    # cannot overflow, so the product is exact.
    max_groups = long(1)
    for x in group_sizes:
        max_groups *= long(x)

    if max_groups > 2**63:  # pragma: no cover
        raise Exception('Combinatorial explosion! (boom)')

    left_group_key, right_group_key, max_groups = \
        _factorize_int64(left_group_key, right_group_key, sort=self.sort)
    return left_group_key, right_group_key, max_groups
def _get_join_indexers(left_keys, right_keys, sort=False, how='inner'):
    """
    Compute row indexers for joining two lists of key arrays.

    Parameters
    ----------
    left_keys, right_keys : list of ndarray
        One key array per join key; the lists must be the same length.
    sort : bool, default False
    how : {'inner', 'outer', 'left', 'right'}, default 'inner'

    Returns
    -------
    The result of the join function selected by `how`.

    Raises
    ------
    AssertionError
        If the two key lists differ in length.
    MergeError
        If the combined key space would exceed 2**63 groups.
    """
    # Explicit raise instead of a bare assert, which is stripped under -O.
    if len(left_keys) != len(right_keys):
        raise AssertionError('left_keys and right_keys must be '
                             'the same length')

    left_labels = []
    right_labels = []
    group_sizes = []
    for lk, rk in zip(left_keys, right_keys):
        llab, rlab, count = _factorize_keys(lk, rk, sort=sort)
        left_labels.append(llab)
        right_labels.append(rlab)
        group_sizes.append(count)

    left_group_key = get_group_index(left_labels, group_sizes)
    right_group_key = get_group_index(right_labels, group_sizes)

    # long(1) instead of the non-portable 1L literal; Python longs
    # cannot overflow, so the product is exact.
    max_groups = long(1)
    for x in group_sizes:
        max_groups *= long(x)

    if max_groups > 2 ** 63:  # pragma: no cover
        raise MergeError('Combinatorial explosion! (boom)')

    left_group_key, right_group_key, max_groups = \
        _factorize_keys(left_group_key, right_group_key, sort=sort)

    join_func = _join_functions[how]
    return join_func(left_group_key, right_group_key, max_groups)
def _get_join_indexers(left_keys, right_keys, sort=False, how='inner'):
    """
    Compute row indexers for joining two lists of key arrays.

    Parameters
    ----------
    left_keys, right_keys : list of ndarray
        One key array per join key; the lists must be the same length.
    sort : bool, default False
    how : {'inner', 'outer', 'left', 'right'}, default 'inner'

    Returns
    -------
    The result of the join function selected by `how`.

    Raises
    ------
    AssertionError
        If the two key lists differ in length.
    MergeError
        If the combined key space would exceed 2**63 groups.
    """
    # Explicit raise instead of a bare assert, which is stripped under -O.
    if len(left_keys) != len(right_keys):
        raise AssertionError('left_keys and right_keys must be '
                             'the same length')

    left_labels = []
    right_labels = []
    group_sizes = []
    for lk, rk in zip(left_keys, right_keys):
        llab, rlab, count = _factorize_keys(lk, rk, sort=sort)
        left_labels.append(llab)
        right_labels.append(rlab)
        group_sizes.append(count)

    left_group_key = get_group_index(left_labels, group_sizes)
    right_group_key = get_group_index(right_labels, group_sizes)

    # long(1) instead of the non-portable 1L literal; Python longs
    # cannot overflow, so the product is exact.
    max_groups = long(1)
    for x in group_sizes:
        max_groups *= long(x)

    if max_groups > 2**63:  # pragma: no cover
        raise MergeError('Combinatorial explosion! (boom)')

    left_group_key, right_group_key, max_groups = \
        _factorize_keys(left_group_key, right_group_key, sort=sort)

    join_func = _join_functions[how]
    return join_func(left_group_key, right_group_key, max_groups)
def _make_sorted_values_labels(self):
    """Sort the values and labels so that the level being unstacked
    varies fastest (it is moved to the end of the label list)."""
    lev_num = self.level
    all_labels = self.index.labels
    all_levels = self.index.levels

    # Move the unstacked level to the end so it becomes the innermost key.
    to_sort = (all_labels[:lev_num] + all_labels[lev_num + 1:]
               + [all_labels[lev_num]])
    reordered_levels = (all_levels[:lev_num] + all_levels[lev_num + 1:]
                        + [all_levels[lev_num]])
    sizes = [len(lev) for lev in reordered_levels]

    # Compress the flat group index to dense ids over observed groups.
    comp_index, obs_ids = _compress_group_index(
        get_group_index(to_sort, sizes))

    sorter = algos.groupsort_indexer(comp_index, len(obs_ids))[0]
    sorter = _ensure_platform_int(sorter)

    self.sorted_values = com.take_2d(self.values, sorter, axis=0)
    self.sorted_labels = [lab.take(sorter) for lab in to_sort]
def _make_sorted_values_labels(self):
    """Sort the values and labels so that the level being unstacked
    varies fastest (it is moved to the end of the label list)."""
    v = self.level
    labs = self.index.labels
    levs = self.index.levels

    # Move the unstacked level to the end so it becomes the innermost key.
    to_sort = labs[:v] + labs[v + 1:] + [labs[v]]
    sizes = [len(x) for x in levs[:v] + levs[v + 1:] + [levs[v]]]

    # Compress the flat group index to dense ids over observed groups,
    # then compute a stable group-sort permutation.
    group_index = get_group_index(to_sort, sizes)
    comp_index, obs_ids = _compress_group_index(group_index)
    ngroups = len(obs_ids)

    indexer = lib.groupsort_indexer(comp_index, ngroups)[0]
    indexer = _ensure_platform_int(indexer)

    self.sorted_values = com.take_2d(self.values, indexer, axis=0)
    self.sorted_labels = [l.take(indexer) for l in to_sort]
def _make_sorted_values_labels(self):
    """Sort the values and labels so that the level being unstacked
    varies fastest (it is moved to the end of the label list)."""
    v = self.level
    labs = self.index.labels
    levs = self.index.levels

    # Move the unstacked level to the end so it becomes the innermost key.
    to_sort = labs[:v] + labs[v + 1:] + [labs[v]]
    sizes = [len(x) for x in levs[:v] + levs[v + 1:] + [levs[v]]]

    group_index = get_group_index(to_sort, sizes)
    max_groups = np.prod(sizes)
    # Only pay for compressing the group index when the potential group
    # space is large; for small spaces the raw index is used directly.
    if max_groups > 1000000:
        comp_index, obs_ids = _compress_group_index(group_index)
        ngroups = len(obs_ids)
    else:
        comp_index, ngroups = group_index, max_groups

    indexer = lib.groupsort_indexer(comp_index, ngroups)[0]
    indexer = _ensure_platform_int(indexer)

    self.sorted_values = self.values.take(indexer, axis=0)
    self.sorted_labels = [l.take(indexer) for l in to_sort]
def get_compressed_ids(labels, sizes):
    """
    Map a list of label arrays to a single dense (compressed) id array.

    Parameters
    ----------
    labels : list of ndarray
        Integer label arrays, one per level.
    sizes : list of int
        Number of possible values for each level.

    Returns
    -------
    comp_index : ndarray
        Dense ids, one per row.
    obs_ids : ndarray
        Observed flat group ids.
    """
    if com._long_prod(sizes) < 2 ** 63:
        # no overflow: a single flat int64 group index is sufficient
        group_index = get_group_index(labels, sizes)
        return _compress_group_index(group_index)

    # Overflow: repeatedly fold the leading levels into one compressed
    # id until the remaining product fits in int64, then recurse.
    # (Removed a dead `mask` computation that was never used.)
    while com._long_prod(sizes) >= 2 ** 63:
        # Largest prefix of levels whose product still overflows.
        i = len(sizes)
        while com._long_prod(sizes[:i]) >= 2 ** 63:
            i -= 1

        rem_index, rem_ids = get_compressed_ids(labels[:i], sizes[:i])
        sizes = [len(rem_ids)] + sizes[i:]
        labels = [rem_index] + labels[i:]

    return get_compressed_ids(labels, sizes)
def get_compressed_ids(labels, sizes):
    """
    Map a list of label arrays to a single dense (compressed) id array.

    Parameters
    ----------
    labels : list of ndarray
        Integer label arrays, one per level.
    sizes : list of int
        Number of possible values for each level.

    Returns
    -------
    comp_index : ndarray
        Dense ids, one per row.
    obs_ids : ndarray
        Observed flat group ids.
    """
    if com._long_prod(sizes) < 2**63:
        # no overflow: a single flat int64 group index is sufficient
        group_index = get_group_index(labels, sizes)
        return _compress_group_index(group_index)

    # Overflow: repeatedly fold the leading levels into one compressed
    # id until the remaining product fits in int64, then recurse.
    # (Removed a dead `mask` computation that was never used.)
    while com._long_prod(sizes) >= 2**63:
        # Largest prefix of levels whose product still overflows.
        i = len(sizes)
        while com._long_prod(sizes[:i]) >= 2**63:
            i -= 1

        rem_index, rem_ids = get_compressed_ids(labels[:i], sizes[:i])
        sizes = [len(rem_ids)] + sizes[i:]
        labels = [rem_index] + labels[i:]

    return get_compressed_ids(labels, sizes)
def _unstack_multiple(data, clocs):
    """Unstack several index levels (clocs) at once by collapsing them
    into a single placeholder level, unstacking it, and rebuilding the
    resulting hierarchical columns."""
    if len(clocs) == 0:
        return data

    # NOTE: This doesn't deal with hierarchical columns yet

    index = data.index

    # Split the index levels into the ones being unstacked (c*) and the
    # ones remaining in the row index (r*).
    clocs = [index._get_level_number(i) for i in clocs]
    rlocs = [i for i in range(index.nlevels) if i not in clocs]

    clevels = [index.levels[i] for i in clocs]
    clabels = [index.labels[i] for i in clocs]
    cnames = [index.names[i] for i in clocs]
    rlevels = [index.levels[i] for i in rlocs]
    rlabels = [index.labels[i] for i in rlocs]
    rnames = [index.names[i] for i in rlocs]

    # Collapse the unstacked levels into one dense id level so a single
    # unstack('__placeholder__') moves them all to the columns.
    shape = [len(x) for x in clevels]
    group_index = get_group_index(clabels, shape)

    comp_ids, obs_ids = _compress_group_index(group_index, sort=False)
    recons_labels = decons_group_index(obs_ids, shape)

    dummy_index = MultiIndex(levels=rlevels + [obs_ids],
                             labels=rlabels + [comp_ids],
                             names=rnames + ['__placeholder__'])

    if isinstance(data, Series):
        dummy = Series(data.values, index=dummy_index)
        unstacked = dummy.unstack('__placeholder__')
        new_levels = clevels
        new_names = cnames
        new_labels = recons_labels
    else:
        if isinstance(data.columns, MultiIndex):
            # Hierarchical columns: unstack one requested level at a
            # time, renumbering the remaining level positions.
            # NOTE(review): the comprehension's `val` shadows the outer
            # `val`, and `i` is the loop counter -- confirm the
            # renumbering condition is intended.
            result = data
            for i in range(len(clocs)):
                val = clocs[i]
                result = result.unstack(val)
                clocs = [val if i > val else val - 1 for val in clocs]

            return result

        dummy = DataFrame(data.values, index=dummy_index,
                          columns=data.columns)

        unstacked = dummy.unstack('__placeholder__')
        if isinstance(unstacked, Series):
            unstcols = unstacked.index
        else:
            unstcols = unstacked.columns

        # Expand the placeholder column level back into the original
        # unstacked levels.
        new_levels = [unstcols.levels[0]] + clevels
        new_names = [data.columns.name] + cnames

        new_labels = [unstcols.labels[0]]
        for rec in recons_labels:
            new_labels.append(rec.take(unstcols.labels[-1]))

    new_columns = MultiIndex(levels=new_levels, labels=new_labels,
                             names=new_names)

    if isinstance(unstacked, Series):
        unstacked.index = new_columns
    else:
        unstacked.columns = new_columns

    return unstacked
def get_compressed_ids(labels, sizes):
    """Collapse a list of label arrays into dense compressed group ids.

    Builds a flat group index over the given level sizes (sorted, with
    nulls participating) and compresses it to observed groups only.
    """
    from pandas.core.groupby import get_group_index

    group_index = get_group_index(labels, sizes, sort=True, xnull=False)
    return _compress_group_index(group_index, sort=True)
def testit(label_list, shape):
    """Round-trip check: decons_group_index must invert get_group_index."""
    group_index = get_group_index(label_list, shape, sort=True, xnull=True)
    reconstructed = decons_group_index(group_index, shape)
    for original, recovered in zip(label_list, reconstructed):
        assert np.array_equal(original, recovered)
# Ad-hoc verification snippet: drive lib.Grouper by hand and compare the
# result against DataFrame.groupby([...]).std().
# NOTE(review): relies on `df` and `np` being defined earlier in the file.
import pandas._tseries as lib

f = np.std

grouped = df.groupby(['A', 'B'])

label_list = [ping.labels for ping in grouped.groupings]
shape = [len(ping.ids) for ping in grouped.groupings]

from pandas.core.groupby import get_group_index
group_index = get_group_index(label_list, shape).astype('i4')

ngroups = np.prod(shape)

# NOTE(review): sibling code takes [0] of groupsort_indexer's result;
# here the raw return value is used directly -- confirm intended.
indexer = lib.groupsort_indexer(group_index, ngroups)

# NOTE(review): `values` is computed but never used below, and `f` is
# immediately reassigned -- leftover experimentation?
values = df['C'].values.take(indexer)
group_index = group_index.take(indexer)

f = lambda x: x.std(ddof=1)

grouper = lib.Grouper(df['C'], np.ndarray.std, group_index, ngroups)
result = grouper.get_result()

expected = grouped.std()