def test_groupsort_indexer(): a = np.random.randint(0, 1000, 100).astype(np.int64) b = np.random.randint(0, 1000, 100).astype(np.int64) result = algos.groupsort_indexer(a, 1000)[0] # need to use a stable sort expected = np.argsort(a, kind='mergesort') assert(np.array_equal(result, expected)) # compare with lexsort key = a * 1000 + b result = algos.groupsort_indexer(key, 1000000)[0] expected = np.lexsort((b, a)) assert(np.array_equal(result, expected))
def test_groupsort_indexer(): a = np.random.randint(0, 1000, 100).astype(np.int64) b = np.random.randint(0, 1000, 100).astype(np.int64) result = algos.groupsort_indexer(a, 1000)[0] # need to use a stable sort expected = np.argsort(a, kind='mergesort') assert (np.array_equal(result, expected)) # compare with lexsort key = a * 1000 + b result = algos.groupsort_indexer(key, 1000000)[0] expected = np.lexsort((b, a)) assert (np.array_equal(result, expected))
def get_group_index_sorter(group_index, ngroups): """ _algos.groupsort_indexer implements `counting sort` and it is at least O(ngroups), where ngroups = prod(shape) shape = map(len, keys) that is, linear in the number of combinations (cartesian product) of unique values of groupby keys. This can be huge when doing multi-key groupby. np.argsort(kind='mergesort') is O(count x log(count)) where count is the length of the data-frame; Both algorithms are `stable` sort and that is necessary for correctness of groupby operations. e.g. consider: df.groupby(key)[col].transform('first') """ count = len(group_index) alpha = 0.0 # taking complexities literally; there may be beta = 1.0 # some room for fine-tuning these parameters do_groupsort = (count > 0 and ((alpha + beta * ngroups) < (count * np.log(count)))) if do_groupsort: sorter, _ = _algos.groupsort_indexer(_ensure_int64(group_index), ngroups) return _ensure_platform_int(sorter) else: return group_index.argsort(kind='mergesort')
def _make_sorted_values_labels(self): v = self.level labs = list(self.index.labels) levs = list(self.index.levels) to_sort = labs[:v] + labs[v + 1:] + [labs[v]] sizes = [len(x) for x in levs[:v] + levs[v + 1:] + [levs[v]]] comp_index, obs_ids = get_compressed_ids(to_sort, sizes) ngroups = len(obs_ids) indexer = _algos.groupsort_indexer(comp_index, ngroups)[0] indexer = _ensure_platform_int(indexer) self.sorted_values = algos.take_nd(self.values, indexer, axis=0) self.sorted_labels = [l.take(indexer) for l in to_sort]
def _make_sorted_values_labels(self): v = self.level labs = list(self.index.labels) levs = list(self.index.levels) to_sort = labs[:v] + labs[v + 1 :] + [labs[v]] sizes = [len(x) for x in levs[:v] + levs[v + 1 :] + [levs[v]]] comp_index, obs_ids = get_compressed_ids(to_sort, sizes) ngroups = len(obs_ids) indexer = _algos.groupsort_indexer(comp_index, ngroups)[0] indexer = _ensure_platform_int(indexer) self.sorted_values = algos.take_nd(self.values, indexer, axis=0) self.sorted_labels = [l.take(indexer) for l in to_sort]
def _make_sorted_values_labels(self): v = self.level labs = self.index.labels levs = self.index.levels to_sort = labs[:v] + labs[v + 1:] + [labs[v]] sizes = [len(x) for x in levs[:v] + levs[v + 1:] + [levs[v]]] group_index = get_group_index(to_sort, sizes) comp_index, obs_ids = _compress_group_index(group_index) ngroups = len(obs_ids) indexer = algos.groupsort_indexer(comp_index, ngroups)[0] indexer = _ensure_platform_int(indexer) self.sorted_values = com.take_2d(self.values, indexer, axis=0) self.sorted_labels = [l.take(indexer) for l in to_sort]