def test_merge_category():
    """Check the keys and values produced by ``merge_category``.

    The expected keys list the first category's keys followed by the keys
    that only occur in the second one; the expected values cover both
    inputs back to back.
    """
    left_strs = nvstrings.to_device(
        ["eee", "aaa", "eee", "ddd", "ccc", "ccc", "ccc", "eee", "aaa"])
    right_strs = nvstrings.to_device(
        ["ggg", "fff", "hhh", "aaa", "fff", "fff", "ggg", "hhh", "bbb"])
    left = nvcategory.from_strings(left_strs)
    right = nvcategory.from_strings(right_strs)

    merged = left.merge_category(right)

    want_keys = ["aaa", "ccc", "ddd", "eee", "bbb", "fff", "ggg", "hhh"]
    want_values = [3, 0, 3, 2, 1, 1, 1, 3, 0,
                   6, 5, 7, 0, 5, 5, 6, 7, 4]
    assert_eq(merged.keys(), want_keys)
    assert_eq(merged.values(), want_values)
def test_merge_and_remap():
    """Check the keys and values produced by ``merge_and_remap``.

    The expected keys are in alphabetical order across both inputs, and
    the expected values index into that combined key list.
    """
    left_strs = nvstrings.to_device(
        ["eee", "aaa", "eee", "ddd", "ccc", "ccc", "ccc", "eee", "aaa"])
    right_strs = nvstrings.to_device(
        ["ggg", "fff", "hhh", "aaa", "fff", "fff", "ggg", "hhh", "bbb"])
    left = nvcategory.from_strings(left_strs)
    right = nvcategory.from_strings(right_strs)

    merged = left.merge_and_remap(right)

    want_keys = ["aaa", "bbb", "ccc", "ddd", "eee", "fff", "ggg", "hhh"]
    want_values = [4, 0, 4, 3, 2, 2, 2, 4, 0,
                   6, 5, 7, 0, 5, 5, 6, 7, 1]
    assert_eq(merged.keys(), want_keys)
    assert_eq(merged.values(), want_values)
def test_gather_index_exception(func):
    """Check that ``func(cat, indexes)`` raises for ``[0, 2, 0, 4]``.

    ``func`` is supplied by the caller (presumably a parametrize decorator
    outside this view -- confirm) and is invoked with the category and the
    index list.
    """
    source = nvstrings.to_device(
        ["eee", "aaa", "eee", "ddd", "ccc", "ccc", "ccc", "eee", "aaa"])
    category = nvcategory.from_strings(source)
    # NOTE(review): 4 is presumably out of range for the four distinct
    # strings above -- confirm against the gather implementations.
    bad_indexes = [0, 2, 0, 4]

    with pytest.raises(Exception):
        func(category, bad_indexes)
def unique(self, method="sort"):
    """Return the distinct strings of this column as a ``StringColumn``.

    The strings in ``self.data`` are turned into an nvcategory and its
    keys are wrapped in a new ``StringColumn``.  ``method`` is accepted
    but not used by this implementation.
    """
    import nvcategory as nvc

    category = nvc.from_strings(self.data)
    distinct = category.keys()
    return StringColumn(distinct)
def transform(self, y: cudf.Series) -> cudf.Series:
    """Encode ``y`` against the keys held by this fitted encoder.

    The input is passed through ``_enforce_str``, categorised, and
    re-keyed with ``self._cats.keys()``; the resulting values are
    returned as a ``cudf.Series`` with every ``-1`` replaced by ``0``.
    """
    self._check_is_fitted()
    y = _enforce_str(y)

    input_cats = nvcategory.from_strings(y.data)
    rekeyed = input_cats.set_keys(self._cats.keys())
    codes = cudf.Series(rekeyed.values())

    # NOTE(review): -1 presumably marks strings missing from the fitted
    # keys; they end up sharing code 0 with the first key -- confirm this
    # is intended.
    return codes.replace(-1, 0)
def test_value_for_index():
    """Check that position 7 (an ``"eee"`` entry) has category value 3."""
    source = nvstrings.to_device(
        ["eee", "aaa", "eee", "ddd", "ccc", "ccc", "ccc", "eee", "aaa"])
    category = nvcategory.from_strings(source)

    assert category.value_for_index(7) == 3
def test_indexes_for_key():
    """Check that ``"ccc"`` is reported at positions 4, 5 and 6."""
    source = nvstrings.to_device(
        ["eee", "aaa", "eee", "ddd", "ccc", "ccc", "ccc", "eee", "aaa"])
    category = nvcategory.from_strings(source)

    positions = category.indexes_for_key('ccc')

    assert_eq(positions, [4, 5, 6])
def test_value():
    """Check that the key ``"ccc"`` maps to category value 1."""
    source = nvstrings.to_device(
        ["eee", "aaa", "eee", "ddd", "ccc", "ccc", "ccc", "eee", "aaa"])
    category = nvcategory.from_strings(source)

    assert category.value('ccc') == 1
def fit_transform(self, y: cudf.Series) -> cudf.Series:
    """
    Fit the encoder on an input and return its encoding in one step

    This is functionally equivalent to (but faster than)
    `LabelEncoder().fit(y).transform(y)`
    """
    self._dtype = y.dtype

    # Make sure we are working with an nvstrings-backed series
    y = _enforce_str(y)

    # Building the category is the bottleneck, even though all of the
    # work happens on the device
    nvs = _get_nvstring_from_series(y)
    has_strings = nvs is not None
    self._cats = nvcategory.from_strings(nvs) if has_strings else {}
    self._fitted = True

    # Device buffer with one int32 slot per input element
    codes = rmm.device_array(len(y), dtype=np.int32)
    if has_strings:
        # The category writes its values straight into the buffer
        self._cats.values(devptr=codes.device_ctypes_pointer.value)

    return cudf.Series(codes)
def test_gather_strings():
    """Check the strings returned by ``gather_strings([0, 2, 0])``."""
    source = nvstrings.to_device(
        ["eee", "aaa", "eee", "ddd", "ccc", "ccc", "ccc", "eee", "aaa"])
    category = nvcategory.from_strings(source)

    gathered = category.gather_strings([0, 2, 0])

    assert_eq(gathered, ['aaa', 'ddd', 'aaa'])
def transform(self, y: cudf.Series) -> cudf.Series:
    """
    Encode an input using the categories learned during fitting.

    Meant for inputs that are small compared to the dataset; to fit and
    encode a whole dataset at once, prefer `fit_transform`.

    Parameters
    ----------
    y : cudf.Series
        Keys to encode. Every value should be one of the categories
        that were passed to `fit`

    Returns
    -------
    encoded : cudf.Series
        The ordinal encoding of the input series

    Raises
    ------
    KeyError
        if the input contains a category that was not seen in `fit`
    """
    self._check_is_fitted()
    y = _enforce_str(y)

    input_cats = nvcategory.from_strings(_get_nvstring_from_series(y))
    codes = input_cats.set_keys(self._cats.keys()).values()
    encoded = cudf.Series(codes)

    # A -1 code is treated as a key that the fitted categories lack
    has_unseen = encoded.isin([-1]).any()
    if has_unseen:
        raise KeyError("Attempted to encode unseen key")

    return encoded
def fit(self, y: cudf.Series) -> "LabelEncoder":
    """
    Learn the categories of a series and store them on the encoder

    Parameters
    ----------
    y : cudf.Series
        Series holding the categories to encode. Its elements do not
        need to be unique

    Returns
    -------
    self : LabelEncoder
        The fitted encoder itself, so that calls can be chained
    """
    self._dtype = y.dtype

    nvs = _get_nvstring_from_series(_enforce_str(y))
    if nvs is None:
        # No strings could be obtained from the series
        self._cats = {}
    else:
        self._cats = nvcategory.from_strings(nvs)

    self._fitted = True
    return self
def test_values():
    """Check the per-element category values of a small string set."""
    source = nvstrings.to_device(
        ["eee", "aaa", "eee", "ddd", "ccc", "ccc", "ccc", "eee", "aaa"])
    category = nvcategory.from_strings(source)

    codes = category.values()

    assert_eq(codes, [3, 0, 3, 2, 1, 1, 1, 3, 0])
def test_remove_unused_keys():
    """Check that ``remove_unused_keys`` leaves only ``b`` and ``c``.

    The category is first re-keyed with ``b, c, e, d`` via ``set_keys``.
    """
    source = nvstrings.to_device(["a", "b", "b", "f", "c", "f"])
    new_keys = nvstrings.to_device(["b", "c", "e", "d"])
    category = nvcategory.from_strings(source)

    rekeyed = category.set_keys(new_keys)
    pruned = rekeyed.remove_unused_keys()

    assert_eq(pruned.keys(), ['b', 'c'])
def fit(self, y: cudf.Series) -> "LabelEncoder":
    """Build the encoder's categories from ``y`` and return the encoder.

    The dtype of ``y`` is recorded, ``y`` is passed through
    ``_enforce_str`` and the resulting ``.data`` is categorised into
    ``self._cats``.
    """
    self._dtype = y.dtype

    strings = _enforce_str(y).data
    self._cats = nvcategory.from_strings(strings)
    self._fitted = True

    return self
def test_gather_and_remap():
    """Check keys and values after ``gather_and_remap``.

    The expected result keeps only the keys ``b, c, f`` and numbers the
    values against that reduced key list.
    """
    source = nvstrings.to_device(["a", "b", "b", "f", "c", "f"])
    category = nvcategory.from_strings(source)

    gathered = category.gather_and_remap([1, 3, 2, 3, 1, 2])

    want_keys = ['b', 'c', 'f']
    want_values = [0, 2, 1, 2, 0, 1]
    assert_eq(gathered.keys(), want_keys)
    assert_eq(gathered.values(), want_values)
def test_gather():
    """Check keys and values after ``gather``.

    The expected keys are the full original set and the expected values
    equal the gathered indexes.
    """
    source = nvstrings.to_device(["a", "b", "b", "f", "c", "f"])
    category = nvcategory.from_strings(source)

    gathered = category.gather([1, 3, 2, 3, 1, 2])

    want_keys = ["a", "b", "c", "f"]
    want_values = [1, 3, 2, 3, 1, 2]
    assert_eq(gathered.keys(), want_keys)
    assert_eq(gathered.values(), want_values)
def test_add_strings():
    """Check ``add_strings`` when a category is given its own strings.

    The expected keys are unchanged and the expected values are the
    original values repeated twice.
    """
    source = nvstrings.to_device(
        ["eee", "aaa", "eee", "ddd", "ccc", "ccc", "ccc", "eee", "aaa"])
    category = nvcategory.from_strings(source)

    extended = category.add_strings(source)

    want_keys = ['aaa', 'ccc', 'ddd', 'eee']
    want_values = [3, 0, 3, 2, 1, 1, 1, 3, 0,
                   3, 0, 3, 2, 1, 1, 1, 3, 0]
    assert_eq(extended.keys(), want_keys)
    assert_eq(extended.values(), want_values)
def setups(self, to: TabularGPU):
    """Collect category keys and ``CategoryMap`` objects for ``to``.

    For every name in ``to.all_cat_names`` the matching column is run
    through ``_to_str`` and categorised; the keys are stored in
    ``self.lbls``.  ``self.classes`` then maps each name to a
    ``CategoryMap`` built from the host copy of those keys, with
    ``add_na`` enabled only for names listed in ``to.cat_names``.
    """
    labels = {}
    for name in to.all_cat_names:
        column = _to_str(to.iloc[:, name])
        labels[name] = nvcategory.from_strings(column.data).keys()
    self.lbls = labels

    classes = {}
    for name, keys in self.lbls.items():
        host_keys = _remove_none(keys.to_host())
        classes[name] = CategoryMap(host_keys, add_na=(name in to.cat_names))
    self.classes = classes
def test_from_strings():
    """Check ``from_strings`` when it is given two string sets at once.

    The expected keys are alphabetical across both inputs and the
    expected values cover the first input followed by the second.
    """
    first = nvstrings.to_device(
        ["eee", "aaa", "eee", "ddd", "ccc", "ccc", "ccc", "eee", "aaa"])
    second = nvstrings.to_device(
        ["ggg", "fff", "hhh", "aaa", "fff", "fff", "ggg", "hhh", "bbb"])

    category = nvcategory.from_strings(first, second)

    want_keys = ["aaa", "bbb", "ccc", "ddd", "eee", "fff", "ggg", "hhh"]
    want_values = [4, 0, 4, 3, 2, 2, 2, 4, 0,
                   6, 5, 7, 0, 5, 5, 6, 7, 1]
    assert_eq(category.keys(), want_keys)
    assert_eq(category.values(), want_values)
def test_remove_strings():
    """Check keys and values after removing ``ccc``, ``aaa`` and ``bbb``.

    Only ``ddd`` and ``eee`` are expected to remain as keys, with four
    values left.
    """
    source = nvstrings.to_device(
        ["eee", "aaa", "eee", "ddd", "ccc", "ccc", "ccc", "eee", "aaa"])
    category = nvcategory.from_strings(source)
    to_remove = nvstrings.to_device(["ccc", "aaa", "bbb"])

    remaining = category.remove_strings(to_remove)

    want_keys = ['ddd', 'eee']
    want_values = [1, 1, 0, 1]
    assert_eq(remaining.keys(), want_keys)
    assert_eq(remaining.values(), want_values)
def fit_transform(self, y: cudf.Series) -> cudf.Series:
    """
    Fit the encoder on an input and return its encoding in one step

    Parameters
    ----------
    y : cudf.Series
        Series containing the categories to be encoded

    Returns
    -------
    encoded : cudf.Series
        One int32 category value per element of the input
    """
    self._dtype = y.dtype

    # Convert y to nvstrings series, if it isn't one
    y = _enforce_str(y)

    # Bottleneck is here, despite everything being done on the device
    self._cats = nvcategory.from_strings(y.data)
    self._fitted = True

    # cp.array(n) would wrap the integer n in a 0-d array; allocate an
    # n-element buffer so there is one slot per input string.
    arr = cp.empty(y.data.size(), dtype=np.int32)

    # CuPy exposes the raw device address as ``ndarray.data.ptr``
    # (``device_ctypes_pointer`` only exists on Numba device arrays).
    self._cats.values(devptr=arr.data.ptr)

    return cudf.Series(arr)
def test_set_keys():
    """Check that ``set_keys`` yields the keys ``b, c, d, e``."""
    source = nvstrings.to_device(["a", "b", "b", "f", "c", "f"])
    new_keys = nvstrings.to_device(["b", "c", "e", "d"])
    category = nvcategory.from_strings(source)

    rekeyed = category.set_keys(new_keys)

    assert_eq(rekeyed.keys(), ['b', 'c', 'd', 'e'])
def test_size():
    """Check that a category reports the same size as its source strings."""
    source = nvstrings.to_device(
        ["eee", "aaa", "eee", "ddd", "ccc", "ccc", "ccc", "eee", "aaa"])
    category = nvcategory.from_strings(source)

    source_size = source.size()
    assert source_size == category.size()
def test_remove_keys():
    """Check that removing ``b`` and ``d`` leaves the keys ``a, c, f``."""
    source = nvstrings.to_device(["a", "b", "b", "f", "c", "f"])
    unwanted = nvstrings.to_device(["b", "d"])
    category = nvcategory.from_strings(source)

    trimmed = category.remove_keys(unwanted)

    assert_eq(trimmed.keys(), ['a', 'c', 'f'])
def test_to_strings():
    """Check that ``to_strings`` gives back the strings the category came from."""
    source = nvstrings.to_device(
        ["eee", "aaa", "eee", "ddd", "ccc", "ccc", "ccc", "eee", "aaa"])
    category = nvcategory.from_strings(source)

    round_tripped = category.to_strings()

    assert_eq(round_tripped, source)
# import nvstrings, nvcategory # create strs = nvstrings.to_device(["eee","aaa","eee","ddd","ccc","ccc","ccc","eee","aaa"]) print(strs.size(),strs) cat = nvcategory.from_strings(strs) print(cat.size(),cat) print(".values():",cat.values()) print(".value_for_index(7)",cat.value_for_index(7)) print(".value(ccc):",cat.value('ccc')) print(".indexes_for_key(ccc):",cat.indexes_for_key('ccc')) print(".to_strings():",cat.to_strings()) # add print("-------------------------") print("add strings:") strs = nvstrings.to_device(["ggg","fff","hhh","aaa","fff","fff","ggg","hhh","bbb"]) print(strs.size(),strs) cat = cat.add_strings(strs) print(cat.size(),cat.keys()) print(".values():",cat.values()) print(".value_for_index(7)",cat.value_for_index(7)) print(".value(aaa):",cat.value('aaa')) print(".indexes_for_key(aaa):",cat.indexes_for_key('aaa')) print(".to_strings():",cat.to_strings()) print(".gather_strings([0,2,0]):",cat.gather_strings([0,2,0])) # remove
def test_keys_size():
    """Check that six strings with four distinct values give 4 keys."""
    source = nvstrings.to_device(["a", "b", "b", "f", "c", "f"])
    category = nvcategory.from_strings(source)

    assert category.keys_size() == 4
def nvcategory(self):
    """Return the nvcategory for ``self.data``, building it on first use.

    The result is stored in ``self._nvcategory`` and reused on later calls.
    """
    if self._nvcategory is not None:
        return self._nvcategory

    import nvcategory as nvc

    self._nvcategory = nvc.from_strings(self.data)
    return self._nvcategory
def test_keys():
    """Check that the keys of a category are ``a, b, c, f``."""
    source = nvstrings.to_device(["a", "b", "b", "f", "c", "f"])
    category = nvcategory.from_strings(source)

    keys = category.keys()

    assert_eq(keys, ['a', 'b', 'c', 'f'])