def nvstrings(self):
    """Return the nvstrings object for this column, creating it on first use.

    The result is cached on ``self._nvstrings``; later calls return the
    cached object unchanged. An empty column yields an empty nvstrings
    instance; otherwise the object is built with
    ``nvstrings.from_offsets`` from the data pointers of the two child
    columns, the row count, the optional mask pointer and the null count.
    """
    if self._nvstrings is not None:
        return self._nvstrings

    # Only hand a validity mask to nvstrings when the column is nullable.
    validity_ptr = self.mask_ptr if self.nullable else None

    if self.size == 0:
        built = nvstrings.to_device([])
    else:
        built = nvstrings.from_offsets(
            self.children[1].data_ptr,
            self.children[0].data_ptr,
            self.size,
            validity_ptr,
            ncount=self.null_count,
            bdevmem=True,
        )

    self._nvstrings = built
    return self._nvstrings
def test_unique_tokens():
    """nvtext.unique_tokens with the default and with a custom delimiter."""
    strs = nvstrings.to_device(
        ["this is my favorite book", "Your Favorite book is different", None, ""]
    )

    # default space delimiter
    tokens = nvtext.unique_tokens(strs)
    assert set(tokens.to_host()) == {
        "Favorite",
        "Your",
        "book",
        "different",
        "favorite",
        "is",
        "my",
        "this",
    }

    # custom delimiter
    tokens = nvtext.unique_tokens(strs, delimiter="my")
    assert set(tokens.to_host()) == {
        " favorite book",
        "Your Favorite book is different",
        "this is ",
    }
def test_tokens_counts():
    """nvtext.tokens_counts returns the same counts on host and on device."""
    strs = nvstrings.to_device(
        ["apples are green", "apples are a fruit", None, ""]
    )
    query_strings = nvtext.unique_tokens(strs)
    expected = [
        [0, 1, 1, 0, 1],
        [1, 1, 1, 1, 0],
        [0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0],
    ]

    # host results
    host_counts = nvtext.tokens_counts(strs, query_strings)
    assert host_counts == expected

    # device results: one int32 per (string, token) pair, written in place
    device_counts = rmm.device_array(
        (strs.size(), query_strings.size()), dtype=np.int32
    )
    nvtext.tokens_counts(
        strs, query_strings, devptr=device_counts.device_ctypes_pointer.value
    )
    assert np.array_equal(device_counts.copy_to_host(), expected)
def test_token_count():
    """nvtext.token_count with default, single, multiple and empty delimiters."""
    strs = nvstrings.to_device(
        [
            "the quick brown fox jumped over the lazy brown dog",
            "the sable siamésé cat jumped under the brown sofa",
            None,
            "",
            "test_str\x01test_str\x02test_str\x03test_str\x04test_str\x05",
        ]
    )
    default_counts = [10, 9, 0, 0, 5]

    # default space delimiter
    assert nvtext.token_count(strs) == default_counts

    # custom delimiter
    assert nvtext.token_count(strs, delimiter="o") == [6, 3, 0, 0, 1]

    # test device pointer
    device_out = rmm.device_array(strs.size(), dtype=np.int32)
    nvtext.token_count(strs, devptr=device_out.device_ctypes_pointer.value)
    assert np.array_equal(device_out.copy_to_host(), default_counts)

    # test multi char delimiter
    vowel_counts = nvtext.token_count(
        strs, delimiter=["a", "e", "i", "o", "u"]
    )
    assert vowel_counts == [14, 15, 0, 0, 6]

    # test empty list of delimiter
    assert nvtext.token_count(strs, delimiter=[]) == default_counts

    # test device pointer
    device_out_multi = rmm.device_array(strs.size(), dtype=np.int32)
    nvtext.token_count(
        strs,
        delimiter=["a", "e", "i", "o"],
        devptr=device_out_multi.device_ctypes_pointer.value,
    )
    assert np.array_equal(device_out_multi.copy_to_host(), [12, 13, 0, 0, 6])
def test_scatter_count():
    """nvtext.scatter_count repeats each string by its paired count."""
    dstrings = nvstrings.to_device(["Dickens", "Einstein", "Christie"])

    # regular
    repeated = nvtext.scatter_count(dstrings, [1, 2, 3])
    assert repeated.to_host() == (
        ["Dickens"] + ["Einstein"] * 2 + ["Christie"] * 3
    )

    # with nulls
    repeated = nvtext.scatter_count(dstrings, [2, 0, None])
    assert repeated.to_host() == ["Dickens"] * 2
def test_token_count():
    """nvtext.token_count with default and custom delimiters, host and device."""
    strs = nvstrings.to_device(
        [
            "the quick brown fox jumped over the lazy brown dog",
            "the sable siamésé cat jumped under the brown sofa",
            None,
            "",
        ]
    )
    default_counts = [10, 9, 0, 0]

    # default space delimiter
    assert nvtext.token_count(strs) == default_counts

    # custom delimiter
    assert nvtext.token_count(strs, delimiter="o") == [6, 3, 0, 0]

    # test device pointer
    device_out = rmm.device_array(strs.size(), dtype=np.int32)
    nvtext.token_count(strs, devptr=device_out.device_ctypes_pointer.value)
    assert np.array_equal(device_out.copy_to_host(), default_counts)
def test_stod():
    """nvstrings.stod converts each string to the paired double value."""
    # (input string, expected double) pairs, in device order
    cases = [
        ("1234", 1234.0),
        ("5678", 5678.0),
        ("90", 90.0),
        (None, None),
        ("-876", -876.0),
        ("543.2", 543.2),
        ("-0.12", -0.12),
        ("2.553", 2.553),
        ("-.002", -0.002),
        ("", 0.0),
        ("de", 0.0),
        ("abc123", 0.0),
        ("123abc", 123.0),
        ("456e", 456.0),
        ("-1.78e+5", -178000.0),
        ("-122.33644782", -122.33644781999999),
    ]
    s = nvstrings.to_device([text for text, _ in cases])
    expected = [value for _, value in cases]

    assert_eq(s.stod(), expected)
def test_consonants():
    """nvtext.is_consonant at fixed positions, plus is_vowel with per-string indices."""
    strs = nvstrings.to_device(
        ["toys", "syzygy", "buddy", "county", "counties", "private", "", None]
    )

    # explicit y character and vowel set
    result = nvtext.is_consonant(strs, 2, y_char="y", vowels="aeiou")
    assert result == [True, True, True, False, False, False, False, False]

    # defaults, positive index
    result = nvtext.is_consonant(strs, 5)
    assert result == [False, False, False, False, False, True, False, False]

    # defaults, negative index
    result = nvtext.is_consonant(strs, -2)
    assert result == [True, True, True, True, False, True, False, False]

    # one index per string, passed as a device pointer (this call is is_vowel)
    per_string_indices = np.array([1, 2, 3, 4, -5, 6, 7, 8], dtype=np.int32)
    device_indices = rmm.to_device(per_string_indices)
    result = nvtext.is_vowel(
        strs, device_indices.device_ctypes_pointer.value, True
    )
    assert result == [True, False, False, False, False, True, False, False]
def column_empty(row_count, dtype, masked, categories=None):
    """Allocate a new column with ``row_count`` rows of the given dtype.

    Parameters
    ----------
    row_count : int
        Number of rows to allocate.
    dtype : anything accepted by ``pandas.api.types.pandas_dtype``
        Requested element type.
    masked : bool
        When true, a mask is created with ``cudautils.make_mask`` and
        zero-filled; otherwise the column has no mask.
    categories : optional
        When given (or when ``dtype`` is categorical) the column is built
        with dtype ``'category'``.

    Returns
    -------
    The result of ``build_column(data, dtype, mask, categories)``.
    """
    dtype = pd.api.types.pandas_dtype(dtype)

    mask = None
    if masked:
        mask = cudautils.make_mask(row_count)
        cudautils.fill_value(mask, 0)

    is_categorical = (
        categories is not None or pd.api.types.is_categorical_dtype(dtype)
    )

    if not is_categorical and dtype.kind in 'OU':
        # String columns are backed by nvstrings rather than a Buffer.
        if row_count == 0:
            data = nvstrings.to_device([])
        else:
            # Convert an uninitialised float64 buffer to strings; the raw
            # (unwrapped) mask is passed as the nulls argument.
            mem = rmm.device_array((row_count,), dtype='float64')
            data = nvstrings.dtos(mem, len(mem), nulls=mask, bdevmem=True)
    else:
        # Categorical and all remaining dtypes share the Buffer-backed path.
        mem = rmm.device_array((row_count,), dtype=dtype)
        data = Buffer(mem)
        if is_categorical:
            dtype = 'category'

    if mask is not None:
        mask = Buffer(mask)

    from cudf.dataframe.columnops import build_column

    return build_column(data, dtype, mask, categories)
def test_hash():
    """nvstrings.hash yields the paired hash value for each string."""
    # (input string, expected hash) pairs, in device order
    cases = [
        ("1234", 1762063109),
        ("5678", 3008518326),
        ("90", 3419725934),
        (None, None),
        ("-876", 1225421472),
        ("543.2", 2952354928),
        ("-0.12", 2093756495),
        (".55", 1292375090),
        ("-.002", 2098378342),
        ("", 1257683291),
        ("de", 3758453927),
        ("abc123", 213530502),
        ("123abc", 2957649541),
        ("456e", 4248160425),
        ("-1.78e+5", 2735531987),
    ]
    s = nvstrings.to_device([text for text, _ in cases])
    expected = [value for _, value in cases]

    assert_eq(s.hash(), expected)
def test_stof():
    """nvstrings.stof converts each string to the paired float value."""
    # (input string, expected float) pairs, in device order
    cases = [
        ("1234", 1234.0),
        ("5678", 5678.0),
        ("90", 90.0),
        (None, None),
        ("-876", -876.0),
        ("543.2", 543.2000122070312),
        ("-0.12", -0.11999999731779099),
        (".55", 0.550000011920929),
        ("-.002", -0.0020000000949949026),
        ("", 0.0),
        ("de", 0.0),
        ("abc123", 0.0),
        ("123abc", 123.0),
        ("456e", 456.0),
        ("-1.78e+5", -178000.0),
    ]
    s = nvstrings.to_device([text for text, _ in cases])
    expected = [value for _, value in cases]

    assert_eq(s.stof(), expected)
def test_stol():
    """nvstrings.stol converts each string to the paired integer value."""
    # (input string, expected integer) pairs, in device order
    cases = [
        ("1234", 1234),
        ("5678", 5678),
        ("90", 90),
        (None, None),
        ("-876", -876),
        ("543.2", 543),
        ("-0.12", 0),
        ("2.55", 2),
        ("-.002", 0),
        ("", 0),
        ("de", 0),
        ("abc123", 0),
        ("123abc", 123),
        ("456e", 456),
        ("-1.78e+5", -1),
    ]
    s = nvstrings.to_device([text for text, _ in cases])
    expected = [value for _, value in cases]

    assert_eq(s.stol(), expected)
def scalar_broadcast_to(scalar, shape, dtype):
    """Broadcast ``scalar`` to ``shape`` with the given ``dtype``.

    ``scalar`` is first passed through ``to_cudf_compatible_scalar``; a
    non-tuple ``shape`` is wrapped into a 1-tuple.

    For the object dtype the result is a one-element ``StringColumn``
    indexed by an all-zero int32 gather map of length ``shape[0]``. For any
    other dtype the result is an rmm device array of ``shape``, filled with
    the scalar unless it is empty.
    """
    from cudf.utils.cudautils import fill_value
    from cudf.utils.dtypes import to_cudf_compatible_scalar

    scalar = to_cudf_compatible_scalar(scalar, dtype=dtype)
    if not isinstance(shape, tuple):
        shape = (shape, )

    if np.dtype(dtype) != np.dtype("object"):
        filled = rmm.device_array(shape, dtype=dtype)
        # Nothing to fill when the array has no elements.
        if filled.size != 0:
            fill_value(filled, scalar)
        return filled

    import nvstrings
    from cudf.core.column import StringColumn
    from cudf.utils.cudautils import zeros

    # Every output row gathers index 0 of the single-element string column.
    row_indices = zeros(shape[0], dtype="int32")
    single_string = StringColumn(nvstrings.to_device([scalar]))
    return single_string[row_indices]
def test_contains(pattern):
    """nvstrings.contains agrees with pandas ``str.contains`` for ``pattern``."""
    samples = [
        "5",
        "hej",
        "\t \n",
        "12345",
        "\\",
        "d",
        "c:\\Tools",
        "+27",
        "1c2",
        "1C2",
        "0:00:0",
        "0:0:00",
        "00:0:0",
        "00:00:0",
        "00:0:00",
        "0:00:00",
        "00:00:00",
        "Hello world !",
        "Hello world! ",
        "Hello worldcup !",
        "0123456789",
        "1C2",
        "Xaa",
        "abcdefghxxx",
        "ABCDEFGH",
        "abcdefgh",
        "abc def",
        "abc\ndef",
        "aa\r\nbb\r\ncc\r\n\r\n",
        "abcabc",
    ]
    host_series = pd.Series(samples)
    device_strings = nvstrings.to_device(samples)

    got = device_strings.contains(pattern)
    expected = host_series.str.contains(pattern).values

    assert_eq(got, expected)
def tokenize(strs, delimiter=None):
    """
    Split every string into tokens and return all tokens in one instance.

    The returned nvstrings instance holds the tokens in the order they
    were found.

    Parameters
    ----------
    strs : nvstrings
        The strings for this operation
    delimiter : str or nvstrings or list of strs
        The string(s) used to locate the split points of each string.
        Default is whitespace.

    Returns
    -------
    nvstrings
        The tokens, or None when ``delimiter`` is none of the supported
        types.

    Examples
    --------
    >>> import nvstrings, nvtext
    >>> s = nvstrings.to_device(["hello world",
    ...                          "goodbye world",
    ...                          "hello goodbye"])
    >>> t = nvtext.tokenize(s)
    >>> print(t)
    ["hello","world","goodbye","world","hello","goodbye"]

    """
    handle = None
    if delimiter is None or isinstance(delimiter, str):
        # Single (or default) delimiter.
        handle = pyniNVText.n_tokenize(strs, delimiter)
    else:
        # A list of delimiters is moved to the device first, then treated
        # like a caller-supplied nvstrings instance.
        if isinstance(delimiter, list):
            delimiter = nvs.to_device(delimiter)
        if isinstance(delimiter, nvs.nvstrings):
            handle = pyniNVText.n_tokenize_multi(strs, delimiter)

    if handle is None:
        return None
    return nvs.nvstrings(handle)
#
import nvstrings
#
from librmm_cffi import librmm as rmm
from librmm_cffi import librmm_config as rmm_cfg

# Enable the rmm pool allocator before any device strings are created.
rmm_cfg.use_pool_allocator = True
rmm.initialize()

# Demo: print sort() and order() results for two sets of strings.
for host_strings in (
    ["abc","defghi",None,"jkl","mno","pqr","stu","dog and cat","accénted",""],
    ["d","cc","bbb","aaaa"],
):
    strs = nvstrings.to_device(host_strings)
    print(strs)
    # sort() first, then order(), each with the same four argument sets.
    for method_name in ("sort", "order"):
        method = getattr(strs, method_name)
        print(f".{method_name}(1):", method(1))
        print(f".{method_name}(2):", method(2))
        print(f".{method_name}(2,desc):", method(2, False))
        print(f".{method_name}(3):", method(3))
def test_check_device_memory():
    """device_memory() reports the expected byte count for one-string instances."""
    for host_string, expected_bytes in (("a" * 7, 24), ("ab" * 7, 32)):
        reported = nvstrings.to_device([host_string]).device_memory()
        assert_eq(reported, expected_bytes)
def test_free():
    """Smoke test: nvstrings.free accepts a device strings instance."""
    # TODO: Check that GPU memory has been freed.
    device_strings = nvstrings.to_device(["a", "b", "c", "d"])
    nvstrings.free(device_strings)
import nvstrings strs = nvstrings.to_device( ['', None, 'a b', ' a b ', ' aa bb ', ' a bbb c', ' aa b ccc ']) print("split_record():") for s in strs.split_record(): print(" ", s) print("split_record(n=1):") for s in strs.split_record(n=1): print(" ", s) print("split_record(n=2):") for s in strs.split_record(n=2): print(" ", s) print("rsplit_record():") for s in strs.rsplit_record(): print(" ", s) print("rsplit_record(n=1):") for s in strs.rsplit_record(n=1): print(" ", s) print("rsplit_record(n=2):") for s in strs.rsplit_record(n=2): print(" ", s) print("split():") for s in strs.split():
def test_character_tokenize():
    """nvtext.character_tokenize emits one token per character, in order."""
    strs = nvstrings.to_device(
        [
            "the quick fox jumped over the lazy dog",
            "the siamésé cat jumped under the sofa",
            None,
            "",
        ]
    )

    # Characters of the first string, one row per word (with its trailing space).
    first_string_chars = [
        "t", "h", "e", " ",
        "q", "u", "i", "c", "k", " ",
        "f", "o", "x", " ",
        "j", "u", "m", "p", "e", "d", " ",
        "o", "v", "e", "r", " ",
        "t", "h", "e", " ",
        "l", "a", "z", "y", " ",
        "d", "o", "g",
    ]
    # Characters of the second string; the null and empty rows add nothing.
    second_string_chars = [
        "t", "h", "e", " ",
        "s", "i", "a", "m", "é", "s", "é", " ",
        "c", "a", "t", " ",
        "j", "u", "m", "p", "e", "d", " ",
        "u", "n", "d", "e", "r", " ",
        "t", "h", "e", " ",
        "s", "o", "f", "a",
    ]

    tokens = nvtext.character_tokenize(strs)
    assert tokens.to_host() == first_string_chars + second_string_chars
def test_gather():
    """gather() with integer indices returns the strings in index order."""
    source = nvstrings.to_device(["abc", "defghi", None, "cat"])
    picked = source.gather([1, 3, 2])
    assert picked.to_host() == ["defghi", "cat", None]
#
import nvstrings
#
from librmm_cffi import librmm as rmm
from librmm_cffi import librmm_config as rmm_cfg

# Enable the rmm pool allocator before any device strings are created.
rmm_cfg.use_pool_allocator = True
rmm.initialize()

# Demo: print wrap() output for a few widths.
strs = nvstrings.to_device(
    [
        "quick brown fox jumped over lazy brown dog",
        None,
        "hello there, accéntéd world",
        "",
    ]
)
print(strs)
for width in (10, 20, 50):
    print(f".wrap({width}):", strs.wrap(width))

# Drop the reference to the device strings.
strs = None
def test_gather_bool():
    """gather() with a boolean list keeps only the strings flagged True."""
    source = nvstrings.to_device(["abc", "defghi", None, "cat"])
    picked = source.gather([True, False, False, True])
    assert picked.to_host() == ["abc", "cat"]
def test_rjust():
    """rjust(4) left-pads three-character strings and keeps nulls as None."""
    source = nvstrings.to_device(["abc", "Def", None, "jLl"])
    padded = source.rjust(4)
    assert_eq(padded, [" abc", " Def", None, " jLl"])
import pandas as pd
import nvstrings
import time


def _now():
    """Read the raw monotonic clock used for every timing below."""
    return time.clock_gettime(time.CLOCK_MONOTONIC_RAW)


# Benchmark: nvstrings split() versus pandas str.split() on one CSV column.
df = pd.read_csv('/data/7584-rows.csv', sep=',')
df.columns  # bare expression: only displays a value in an interactive session
values = df["address"].values
values  # bare expression: only displays a value in an interactive session

dstrs = nvstrings.to_device(values.tolist())
hstrs = pd.Series(values.tolist())

print("precision = %0.9f seconds" % time.clock_getres(time.CLOCK_MONOTONIC_RAW))
print(str(dstrs.size()), "strings")

# Time the device split.
start = _now()
d = dstrs.split(' ')
device_elapsed = _now() - start
print("nvstrings.split() = %05f" % device_elapsed)

# Time the pandas split.
start = _now()
h = hstrs.str.split(' ')
host_elapsed = _now() - start
print(" pandas.split() = %05f" % host_elapsed)

print("speedup = %0.5fx" % (host_elapsed / device_elapsed))

# clear output
d = None
h = None
import nvstrings
import string

# Demo 1: translate() with explicit pair lists and with str.maketrans tables.
strs = nvstrings.to_device(["hello","there","world","accéntéd",None,""])
print(strs)
for label, table in (
    (".translate():", []),
    (".translate([[e,a]]):", [['e','a']]),
    (".translate([[e,é]]):", [['e','é']]),
    (".translate([[é,e],[o,None]]):", [['é','e'],['o',None]]),
    (".translate(maketrans(e,a):", str.maketrans('e','a')),
    (".translate(maketrans(elh,ELH):", str.maketrans('elh','ELH')),
):
    print(label, strs.translate(table))

print()

# Demo 2: remove punctuation, then replace it with spaces.
strs = nvstrings.to_device(
    [
        "This, of course, is only an example!",
        "And; will have @all the #punctuation that $money can buy.",
        "The %percent & the *star along with the (parenthesis) with dashes-and-under_lines.",
        "Equations: 3+3=6; 3/4 < 1 and > 0",
    ]
)
print(strs)
drop_punctuation = str.maketrans('', '', string.punctuation)
print(".translate(punctuation=None):\n", strs.translate(drop_punctuation))
blank_punctuation = str.maketrans(
    string.punctuation, ' ' * len(string.punctuation)
)
print(".translate(punctuation=' '):\n", strs.translate(blank_punctuation))
import time

import nvstrings
import pandas as pd  # required by pd.Series below; the original never imported it

# Benchmark: nvstrings contains() versus pandas str.contains() on the same
# regex, using strings loaded from column 7 of a CSV file.
dstrs_in = nvstrings.from_csv('../tweets.csv', 7)
vlist = dstrs_in.to_host()

# Double the host list eight times (256x the original rows) so the
# benchmark has a larger workload.
for _ in range(8):
    vlist.extend(vlist)

dstrs = nvstrings.to_device(vlist)
hstrs = pd.Series(vlist)

print("precision = %0.9f seconds" % time.clock_getres(time.CLOCK_MONOTONIC_RAW))
print("strings =", dstrs.size())

# Time the device regex search.
st = time.clock_gettime(time.CLOCK_MONOTONIC_RAW)
d = dstrs.contains('@.+@')
et1 = (time.clock_gettime(time.CLOCK_MONOTONIC_RAW) - st)
print("nvstrings.contains('@.+@') = %05f" % et1)

# Time the pandas regex search.
st = time.clock_gettime(time.CLOCK_MONOTONIC_RAW)
h = hstrs.str.contains('@.+@')
et2 = (time.clock_gettime(time.CLOCK_MONOTONIC_RAW) - st)
print("pandas.contains('@.+@') = %05f" % et2)
def test_remove_strings():
    """remove_strings() drops the strings at the given positions."""
    source = nvstrings.to_device(["abc", "defghi", None, "cat"])
    remaining = source.remove_strings([0, 2])
    assert remaining.to_host() == ["defghi", "cat"]
from librmm_cffi import librmm as rmm
from librmm_cffi import librmm_config as rmm_cfg

# setup rmm to use memory pool
rmm_cfg.use_pool_allocator = True
# set to 2GiB. Default is 1/2 total GPU memory
rmm_cfg.initial_pool_size = 2 << 30
# default is false
rmm_cfg.use_managed_memory = False
rmm_cfg.enable_logging = True
rmm.initialize()

# nvstrings is imported only after rmm has been configured and initialized.
import nvstrings

#
strs = nvstrings.to_device(
    ["Hello", "there", "world", None, "1234", "-123.4", "accénted", ""]
)
print(strs)

# case
for case_method in ("lower", "upper", "swapcase", "capitalize", "title"):
    print(f".{case_method}():", getattr(strs, case_method)())

# combine
others = ["1", "2", "3", "4", "5", "6", "é", None]
print(".cat([1,2,3,4,5,6,é,nil]:", strs.cat(others))
print(".join(:):", strs.join(sep=':'))

# compare
print(".compare(there):", strs.compare("there"))
def test_add_strings():
    """add_strings() appends the second instance's strings after the first's."""
    first = nvstrings.to_device(["dog and cat", None, "accénted", ""])
    second = nvstrings.to_device(["aaa", None, "", "bbb"])

    combined = first.add_strings(second)

    assert combined.to_host() == [
        "dog and cat",
        None,
        "accénted",
        "",
        "aaa",
        None,
        "",
        "bbb",
    ]