def _make_labels(self):
    """Factorize ``self.grouper`` and cache the result on the instance.

    Sets ``self._labels``, ``self._group_index`` and ``self._counts``.
    When ``self.sort`` is true and at least one count was produced, the
    unique keys are put in sorted order and the labels and counts are
    remapped to that order.

    Raises
    ------
    Exception
        If ``self._was_factor`` is true.
    """
    if self._was_factor:  # pragma: no cover
        raise Exception('Should not call this method grouping by level')

    keys = self.grouper
    if keys.dtype != np.object_:
        # The factorizer is fed object-dtype values.
        keys = keys.astype('O')

    # khash
    factorizer = lib.Factorizer(len(keys))
    codes, group_sizes = factorizer.factorize(keys, sort=False)
    group_keys = Index(factorizer.uniques, name=self.name)

    if self.sort and len(group_sizes) > 0:
        order = group_keys.argsort()
        n_groups = len(order)

        # Inverse permutation: position in the unsorted uniques ->
        # position in the sorted uniques.
        old_to_new = np.empty(n_groups, dtype=np.int32)
        old_to_new.put(order, np.arange(n_groups))

        # Negative labels would index from the end in ``take``; remember
        # where they are and reset them to -1 after the remap.
        negative = codes < 0
        codes = old_to_new.take(codes)
        codes[negative] = -1

        group_keys = group_keys.take(order)
        group_sizes = group_sizes.take(order)

    self._labels = codes
    self._group_index = group_keys
    self._counts = group_sizes
def khash_unique(values, expected_K, size_hint=False, sort=False,
                 memory=False):
    """Compute the unique values of ``values`` with ``lib.Factorizer``.

    Parameters
    ----------
    values : sequence
        Values passed to ``Factorizer.unique``.
    expected_K : int
        Number of unique values expected; checked with an assertion on
        the non-memory path.
    size_hint : bool, default False
        If true, size the factorizer with ``len(values)``; otherwise use
        a fixed initial size of 100.
    sort : bool, default False
        If true, sort the uniques in place before checking their count.
    memory : bool, default False
        If true, measure the change in ``proc.get_memory_info().rss``
        around the computation and return that instead of the uniques.

    Returns
    -------
    The RSS delta when ``memory`` is true, otherwise the unique values.
    """
    if memory:
        # Collect garbage first so the baseline is not inflated by
        # objects that are already dead.
        gc.collect()
        before_mem = proc.get_memory_info().rss

    rizer = lib.Factorizer(len(values) if size_hint else 100)
    result = rizer.unique(values)

    if memory:
        return proc.get_memory_info().rss - before_mem

    if sort:
        result.sort()
    # Sanity check kept as an assertion so the raised type is unchanged.
    assert len(result) == expected_K
    return result
def _factorize_objects(left_index, right_index, sort=True):
    """Factorize two indexes with one shared ``lib.Factorizer``.

    Both indexes are cast to object dtype and factorized by the same
    factorizer, left first and then right.

    Parameters
    ----------
    left_index, right_index
        Objects supporting ``len`` and ``astype('O')``.
    sort : bool, default True
        If true, pass the labels through ``_sort_labels`` together with
        the factorizer's uniques.

    Returns
    -------
    tuple
        ``(left_labels, right_labels, count)`` where ``count`` is the
        value of ``get_count()`` taken after both factorizations.
    """
    size_hint = max(len(left_index), len(right_index))
    factorizer = lib.Factorizer(size_hint)

    # Order matters: the factorizer is stateful across the two calls.
    left_labels, _ = factorizer.factorize(left_index.astype('O'))
    right_labels, _ = factorizer.factorize(right_index.astype('O'))

    n_keys = factorizer.get_count()

    if sort:
        left_labels, right_labels = _sort_labels(
            factorizer.uniques, left_labels, right_labels
        )
        # TODO: na handling

    return left_labels, right_labels, n_keys