def unique1d(values):
    """
    Hash table-based unique
    """
    if np.issubdtype(values.dtype, np.floating):
        table = htable.Float64HashTable(len(values))
        uniques = np.array(table.unique(_ensure_float64(values)),
                           dtype=np.float64)
    elif np.issubdtype(values.dtype, np.datetime64):
        table = htable.Int64HashTable(len(values))
        uniques = table.unique(_ensure_int64(values))
        uniques = uniques.view('M8[ns]')
    elif np.issubdtype(values.dtype, np.timedelta64):
        table = htable.Int64HashTable(len(values))
        uniques = table.unique(_ensure_int64(values))
        uniques = uniques.view('m8[ns]')
    elif np.issubdtype(values.dtype, np.signedinteger):
        table = htable.Int64HashTable(len(values))
        uniques = table.unique(_ensure_int64(values))
    elif np.issubdtype(values.dtype, np.unsignedinteger):
        table = htable.UInt64HashTable(len(values))
        uniques = table.unique(_ensure_uint64(values))
    else:
        # it's cheaper to use a String Hash Table than Object
        if lib.infer_dtype(values) in ['string']:
            table = htable.StringHashTable(len(values))
        else:
            table = htable.PyObjectHashTable(len(values))
        uniques = table.unique(_ensure_object(values))
    return uniques
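
# --- Usage sketch (not part of the original source): the same hash-table
# unique, exercised directly on the pandas hash table for int64 input.
# Assumes a pandas installation exposing pandas._libs.hashtable.
import numpy as np
from pandas._libs import hashtable as htable

vals = np.array([3, 1, 3, 2, 1], dtype=np.int64)
table = htable.Int64HashTable(len(vals))
print(table.unique(vals))  # [3 1 2] -- uniques in first-seen order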
def test_get_labels_groupby_for_Int64(writable):
    table = ht.Int64HashTable()
    vals = np.array([1, 2, -1, 2, 1, -1], dtype=np.int64)
    vals.flags.writeable = writable

    # -1 acts as the NA sentinel: it is passed through in the labels
    # but excluded from the uniques
    arr, unique = table.get_labels_groupby(vals)
    expected_arr = np.array([0, 1, -1, 1, 0, -1], dtype=np.intp)
    expected_unique = np.array([1, 2], dtype=np.int64)
    tm.assert_numpy_array_equal(arr, expected_arr)
    tm.assert_numpy_array_equal(unique, expected_unique)
# __init__ of a key-mapper helper (the rest of the class, including
# _populate_tables, is not shown): one Int64HashTable per grouping level
# maps compressed group ids back to that level's labels.
def __init__(self, comp_ids, ngroups, levels, labels):
    self.levels = levels
    self.labels = labels
    self.comp_ids = comp_ids.astype(np.int64)

    self.k = len(labels)
    self.tables = [hashtable.Int64HashTable(ngroups)
                   for _ in range(self.k)]
    self._populate_tables()
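
# --- Hypothetical illustration (not in the original source): the per-level
# tables above act as int64 -> int64 maps, filled via table.map(keys, values)
# and queried via table.get_item(key). This mirrors the API used by
# get_flattened_list below, assuming a pandas version where Int64HashTable
# still exposes .map().
import numpy as np
from pandas._libs import hashtable

table = hashtable.Int64HashTable(4)
table.map(np.array([0, 1, 2], dtype=np.int64),
          np.array([10, 20, 30], dtype=np.int64))
print(table.get_item(1))  # 20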
def get_flattened_list(
    comp_ids: np.ndarray,
    ngroups: int,
    levels: Iterable[Index],
    labels: Iterable[np.ndarray],
) -> List[Tuple]:
    """Map compressed group id -> key tuple."""
    comp_ids = comp_ids.astype(np.int64, copy=False)
    arrays: DefaultDict[int, List[int]] = defaultdict(list)
    for labs, level in zip(labels, levels):
        table = hashtable.Int64HashTable(ngroups)
        table.map(comp_ids, labs.astype(np.int64, copy=False))
        for i in range(ngroups):
            arrays[i].append(level[table.get_item(i)])
    return [tuple(array) for array in arrays.values()]
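
# --- Worked pure-Python illustration (hypothetical, not pandas internals):
# for each level, map compressed group id -> label code, then assemble the
# key tuple of every group. Mirrors the logic of get_flattened_list above.
from collections import defaultdict

comp_ids = [0, 1, 0, 2]                      # compressed group id per row
levels = [["a", "b", "c"], [10, 20, 30]]     # unique values per level
labels = [[0, 1, 0, 2], [2, 0, 2, 1]]        # per-level label codes per row

arrays = defaultdict(list)
for labs, level in zip(labels, levels):
    mapping = dict(zip(comp_ids, labs))      # comp_id -> label code
    for i in range(max(comp_ids) + 1):       # ngroups == 3 here
        arrays[i].append(level[mapping[i]])
print([tuple(a) for a in arrays.values()])   # [('a', 30), ('b', 10), ('c', 20)]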
def compress_group_index(group_index, sort: bool = True):
    """
    Group_index is offsets into the cartesian product of all possible labels.
    This space can be huge, so this function compresses it by computing
    offsets (comp_ids) into the list of unique labels (obs_group_ids).
    """
    size_hint = len(group_index)
    table = hashtable.Int64HashTable(size_hint)

    group_index = ensure_int64(group_index)

    # note, group labels come out ascending (i.e., 1, 2, 3, etc.)
    comp_ids, obs_group_ids = table.get_labels_groupby(group_index)

    if sort and len(obs_group_ids) > 0:
        obs_group_ids, comp_ids = _reorder_by_uniques(obs_group_ids, comp_ids)

    return ensure_int64(comp_ids), ensure_int64(obs_group_ids)
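
# --- Conceptual NumPy analogue (not the pandas implementation): compress a
# sparse cartesian-product index into dense ids plus the observed offsets,
# matching the sort=True behaviour of compress_group_index above.
import numpy as np

group_index = np.array([7, 2, 7, 0, 2], dtype=np.int64)
obs_group_ids, comp_ids = np.unique(group_index, return_inverse=True)
print(comp_ids)        # [2 1 2 0 1] -- dense id per row
print(obs_group_ids)   # [0 2 7]     -- observed offsets, ascending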