예제 #1
0
파일: string.py 프로젝트: trevorsm7/cudf
 def nvstrings(self):
     """Return this column's nvstrings object, building it on first access.

     The object is cached in ``self._nvstrings``; once set, later calls
     return the cached value without rebuilding it.
     """
     if self._nvstrings is not None:
         return self._nvstrings

     # A mask pointer is only handed over when the column is nullable.
     mask_ptr = self.mask_ptr if self.nullable else None

     if self.size == 0:
         self._nvstrings = nvstrings.to_device([])
     else:
         self._nvstrings = nvstrings.from_offsets(
             self.children[1].data_ptr,
             self.children[0].data_ptr,
             self.size,
             mask_ptr,
             ncount=self.null_count,
             bdevmem=True,
         )
     return self._nvstrings
예제 #2
0
def test_unique_tokens():
    """Check nvtext.unique_tokens with the default and a custom delimiter."""
    strs = nvstrings.to_device([
        "this is my favorite book",
        "Your Favorite book is different",
        None,
        "",
    ])

    # No delimiter argument: use the library default.
    tokens = nvtext.unique_tokens(strs)
    assert set(tokens.to_host()) == {
        'Favorite', 'Your', 'book', 'different',
        'favorite', 'is', 'my', 'this',
    }

    # Split on the substring 'my' instead.
    tokens = nvtext.unique_tokens(strs, delimiter='my')
    assert set(tokens.to_host()) == {
        ' favorite book', 'Your Favorite book is different', 'this is ',
    }
예제 #3
0
def test_tokens_counts():
    """Check nvtext.tokens_counts for host-list and device-array output."""
    strs = nvstrings.to_device(
        ["apples are green", "apples are a fruit", None, ""])
    tokens = nvtext.unique_tokens(strs)

    # One row per input string, one column per entry in `tokens`.
    expected = [
        [0, 1, 1, 0, 1],
        [1, 1, 1, 1, 0],
        [0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0],
    ]

    # Result returned directly to the host.
    assert nvtext.tokens_counts(strs, tokens) == expected

    # Result written into a caller-provided device array.
    shape = (strs.size(), tokens.size())
    counts_darray = rmm.device_array(shape, dtype=np.int32)
    nvtext.tokens_counts(
        strs, tokens, devptr=counts_darray.device_ctypes_pointer.value)
    assert np.array_equal(counts_darray.copy_to_host(), expected)
예제 #4
0
파일: test_text.py 프로젝트: zivzone/cudf
def test_token_count():
    """Check nvtext.token_count across delimiter forms and output targets."""
    strs = nvstrings.to_device([
        "the quick brown fox jumped over the lazy brown dog",
        "the sable siamésé cat jumped under the brown sofa",
        None,
        "",
        "test_str\x01test_str\x02test_str\x03test_str\x04test_str\x05",
    ])
    # Counts expected whenever no explicit delimiter is in effect.
    default_counts = [10, 9, 0, 0, 5]

    # Default delimiter, host result.
    assert nvtext.token_count(strs) == default_counts

    # Single custom delimiter.
    assert nvtext.token_count(strs, delimiter="o") == [6, 3, 0, 0, 1]

    # Default delimiter, result written through a device pointer.
    counts_darray = rmm.device_array(strs.size(), dtype=np.int32)
    nvtext.token_count(strs, devptr=counts_darray.device_ctypes_pointer.value)
    assert np.array_equal(counts_darray.copy_to_host(), default_counts)

    # Several delimiters supplied as a list.
    vowels = ["a", "e", "i", "o", "u"]
    assert nvtext.token_count(strs, delimiter=vowels) == [14, 15, 0, 0, 6]

    # An empty delimiter list gives the same counts as the default.
    assert nvtext.token_count(strs, delimiter=[]) == default_counts

    # Delimiter list combined with a device pointer.
    multi_darray = rmm.device_array(strs.size(), dtype=np.int32)
    nvtext.token_count(
        strs,
        delimiter=["a", "e", "i", "o"],
        devptr=multi_darray.device_ctypes_pointer.value,
    )
    assert np.array_equal(multi_darray.copy_to_host(), [12, 13, 0, 0, 6])
예제 #5
0
def test_scatter_count():
    """Check nvtext.scatter_count with plain counts and with 0/None counts."""
    dstrings = nvstrings.to_device(["Dickens", "Einstein", "Christie"])

    # Each string appears as many times as its matching count.
    repeated = nvtext.scatter_count(dstrings, [1, 2, 3])
    assert repeated.to_host() == (
        ["Dickens"] + ["Einstein"] * 2 + ["Christie"] * 3
    )

    # Strings paired with a count of 0 or None do not appear in the output.
    repeated = nvtext.scatter_count(dstrings, [2, 0, None])
    assert repeated.to_host() == ["Dickens"] * 2
예제 #6
0
def test_token_count():
    """Check nvtext.token_count for default/custom delimiters and devptr."""
    strs = nvstrings.to_device([
        "the quick brown fox jumped over the lazy brown dog",
        "the sable siamésé cat jumped under the brown sofa",
        None,
        "",
    ])
    default_counts = [10, 9, 0, 0]

    # Default delimiter, host result.
    assert nvtext.token_count(strs) == default_counts

    # Custom delimiter.
    assert nvtext.token_count(strs, delimiter='o') == [6, 3, 0, 0]

    # Default delimiter, result written through a device pointer.
    counts_darray = rmm.device_array(strs.size(), dtype=np.int32)
    nvtext.token_count(strs, devptr=counts_darray.device_ctypes_pointer.value)
    assert np.array_equal(counts_darray.copy_to_host(), default_counts)
예제 #7
0
def test_stod():
    """Check nvstrings.stod against a table of (input, expected) pairs."""
    cases = [
        ("1234", 1234.0),
        ("5678", 5678.0),
        ("90", 90.0),
        (None, None),
        ("-876", -876.0),
        ("543.2", 543.2),
        ("-0.12", -0.12),
        ("2.553", 2.553),
        ("-.002", -0.002),
        ("", 0.0),
        ("de", 0.0),
        ("abc123", 0.0),
        ("123abc", 123.0),
        ("456e", 456.0),
        ("-1.78e+5", -178000.0),
        ("-122.33644782", -122.33644781999999),
    ]
    inputs = [text for text, _ in cases]
    expected = [value for _, value in cases]

    got = nvstrings.to_device(inputs).stod()
    assert_eq(got, expected)
예제 #8
0
def test_consonants():
    """Check nvtext.is_consonant / is_vowel with scalar and device indices."""
    strs = nvstrings.to_device(
        ["toys", "syzygy", "buddy", "county", "counties", "private", "", None])

    # Fixed index with explicit y_char / vowels arguments.
    assert nvtext.is_consonant(strs, 2, y_char="y", vowels="aeiou") == [
        True, True, True, False, False, False, False, False,
    ]

    # Fixed index, default character sets.
    assert nvtext.is_consonant(strs, 5) == [
        False, False, False, False, False, True, False, False,
    ]

    # Negative index.
    assert nvtext.is_consonant(strs, -2) == [
        True, True, True, True, False, True, False, False,
    ]

    # One index per string, passed as a device pointer.
    per_string = np.array([1, 2, 3, 4, -5, 6, 7, 8], dtype=np.int32)
    d_indices = rmm.to_device(per_string)
    flags = nvtext.is_vowel(strs, d_indices.device_ctypes_pointer.value, True)
    assert flags == [True, False, False, False, False, True, False, False]
예제 #9
0
파일: columnops.py 프로젝트: yutiansut/cudf
def column_empty(row_count, dtype, masked, categories=None):
    """Build a new column holding ``row_count`` rows of ``dtype``.

    Parameters
    ----------
    row_count : int
        Number of rows to allocate.
    dtype : dtype-like
        Anything accepted by ``pd.api.types.pandas_dtype``.
    masked : bool
        When true, a mask is created with ``cudautils.make_mask`` and
        filled with 0.
    categories : optional
        Passed through to ``build_column``; a non-None value selects the
        categorical path.

    Returns
    -------
    The result of ``build_column(data, dtype, mask, categories)``.
    """
    dtype = pd.api.types.pandas_dtype(dtype)

    mask = None
    if masked:
        mask = cudautils.make_mask(row_count)
        cudautils.fill_value(mask, 0)

    # The categorical test must come first: it takes priority over the
    # string check below.
    is_categorical = (
        categories is not None
        or pd.api.types.is_categorical_dtype(dtype)
    )

    if not is_categorical and dtype.kind in 'OU':
        # String column: backed by an nvstrings object rather than a Buffer.
        if row_count == 0:
            data = nvstrings.to_device([])
        else:
            scratch = rmm.device_array((row_count,), dtype='float64')
            data = nvstrings.dtos(scratch,
                                  len(scratch),
                                  nulls=mask,
                                  bdevmem=True)
    else:
        data = Buffer(rmm.device_array((row_count,), dtype=dtype))
        if is_categorical:
            dtype = 'category'

    if mask is not None:
        mask = Buffer(mask)

    from cudf.dataframe.columnops import build_column
    return build_column(data, dtype, mask, categories)
예제 #10
0
def test_hash():
    """Check nvstrings.hash against a table of (input, expected) pairs."""
    cases = [
        ("1234", 1762063109),
        ("5678", 3008518326),
        ("90", 3419725934),
        (None, None),
        ("-876", 1225421472),
        ("543.2", 2952354928),
        ("-0.12", 2093756495),
        (".55", 1292375090),
        ("-.002", 2098378342),
        ("", 1257683291),
        ("de", 3758453927),
        ("abc123", 213530502),
        ("123abc", 2957649541),
        ("456e", 4248160425),
        ("-1.78e+5", 2735531987),
    ]
    inputs, expected = (list(column) for column in zip(*cases))

    got = nvstrings.to_device(inputs).hash()
    assert_eq(got, expected)
예제 #11
0
def test_stof():
    """Check nvstrings.stof against a table of (input, expected) pairs."""
    cases = [
        ("1234", 1234.0),
        ("5678", 5678.0),
        ("90", 90.0),
        (None, None),
        ("-876", -876.0),
        ("543.2", 543.2000122070312),
        ("-0.12", -0.11999999731779099),
        (".55", 0.550000011920929),
        ("-.002", -0.0020000000949949026),
        ("", 0.0),
        ("de", 0.0),
        ("abc123", 0.0),
        ("123abc", 123.0),
        ("456e", 456.0),
        ("-1.78e+5", -178000.0),
    ]
    inputs = [text for text, _ in cases]
    expected = [value for _, value in cases]

    got = nvstrings.to_device(inputs).stof()
    assert_eq(got, expected)
예제 #12
0
def test_stol():
    """Check nvstrings.stol against a table of (input, expected) pairs."""
    cases = [
        ("1234", 1234),
        ("5678", 5678),
        ("90", 90),
        (None, None),
        ("-876", -876),
        ("543.2", 543),
        ("-0.12", 0),
        ("2.55", 2),
        ("-.002", 0),
        ("", 0),
        ("de", 0),
        ("abc123", 0),
        ("123abc", 123),
        ("456e", 456),
        ("-1.78e+5", -1),
    ]
    inputs, expected = (list(column) for column in zip(*cases))

    got = nvstrings.to_device(inputs).stol()
    assert_eq(got, expected)
예제 #13
0
def scalar_broadcast_to(scalar, shape, dtype):
    """Repeat ``scalar`` to fill ``shape``.

    ``scalar`` is first normalised with ``to_cudf_compatible_scalar`` and a
    non-tuple ``shape`` is wrapped into a 1-tuple.

    * For the object dtype the result is a one-element ``StringColumn``
      indexed by an all-zero int32 gather map of length ``shape[0]``.
    * For every other dtype the result is an ``rmm.device_array`` of the
      given shape, filled with ``scalar`` unless it has no elements.
    """
    from cudf.utils.cudautils import fill_value
    from cudf.utils.dtypes import to_cudf_compatible_scalar

    scalar = to_cudf_compatible_scalar(scalar, dtype=dtype)
    shape = shape if isinstance(shape, tuple) else (shape, )

    if np.dtype(dtype) != np.dtype("object"):
        filled = rmm.device_array(shape, dtype=dtype)
        if filled.size != 0:
            fill_value(filled, scalar)
        return filled

    # String path: imports are kept local to this branch.
    import nvstrings
    from cudf.core.column import StringColumn
    from cudf.utils.cudautils import zeros

    # Every output row gathers from index 0 of the single-element column.
    repeat_first = zeros(shape[0], dtype="int32")
    single = StringColumn(nvstrings.to_device([scalar]))
    return single[repeat_first]
예제 #14
0
파일: test_regex.py 프로젝트: zivzone/cudf
def test_contains(pattern):
    """Compare nvstrings.contains with pandas' str.contains for `pattern`."""
    samples = [
        "5", "hej", "\t \n", "12345", "\\", "d", "c:\\Tools", "+27",
        "1c2", "1C2",
        "0:00:0", "0:0:00", "00:0:0", "00:00:0",
        "00:0:00", "0:00:00", "00:00:00",
        "Hello world !", "Hello world!   ", "Hello worldcup  !",
        "0123456789", "1C2", "Xaa", "abcdefghxxx", "ABCDEFGH", "abcdefgh",
        "abc def", "abc\ndef", "aa\r\nbb\r\ncc\r\n\r\n", "abcabc",
    ]

    # pandas provides the reference answer for the same inputs.
    expected = pd.Series(samples).str.contains(pattern).values
    got = nvstrings.to_device(samples).contains(pattern)
    assert_eq(got, expected)
예제 #15
0
def tokenize(strs, delimiter=None):
    """
    Split every string into tokens and return them as one nvstrings
    instance, in the order the tokens were found.

    Parameters
    ----------
    strs : nvstrings
        The strings for this operation
    delimiter : str or nvstrings or list of strs
        The string(s) used to locate the split points of each string.
        Default is whitespace.

    Returns
    -------
    nvstrings with the tokens, or None when ``delimiter`` is not one of
    the supported types.

    Examples
    --------
    >>> import nvstrings, nvtext
    >>> s = nvstrings.to_device(["hello world",
    ...                          "goodbye world",
    ...                          "hello goodbye"])
    >>> t = nvtext.tokenize(s)
    >>> print(t)
    ["hello","world","goodbye","world","hello","goodbye"]

    """
    if delimiter is None or isinstance(delimiter, str):
        # Single (or default) delimiter.
        handle = pyniNVText.n_tokenize(strs, delimiter)
    else:
        handle = None
        # A Python list is moved to the device first, then treated like any
        # other nvstrings delimiter set.
        if isinstance(delimiter, list):
            delimiter = nvs.to_device(delimiter)
        if isinstance(delimiter, nvs.nvstrings):
            handle = pyniNVText.n_tokenize_multi(strs, delimiter)

    if handle is None:
        return None
    return nvs.nvstrings(handle)
예제 #16
0
#
import nvstrings
#
from librmm_cffi import librmm as rmm
from librmm_cffi import librmm_config as rmm_cfg

# Turn on RMM's pool allocator before initializing it.
rmm_cfg.use_pool_allocator = True
rmm.initialize()

# Run the same sort()/order() calls over two sample sets and print each
# result next to the call that produced it.
for sample in (
    ["abc", "defghi", None, "jkl", "mno", "pqr", "stu", "dog and cat",
     "accénted", ""],
    ["d", "cc", "bbb", "aaaa"],
):
    strs = nvstrings.to_device(sample)
    print(strs)
    print(".sort(1):", strs.sort(1))
    print(".sort(2):", strs.sort(2))
    print(".sort(2,desc):", strs.sort(2, False))
    print(".sort(3):", strs.sort(3))

    print(".order(1):", strs.order(1))
    print(".order(2):", strs.order(2))
    print(".order(2,desc):", strs.order(2, False))
    print(".order(3):", strs.order(3))
예제 #17
0
def test_check_device_memory():
    """Check device_memory() for two single-string instances."""
    for text, expected_bytes in (('a' * 7, 24), ('ab' * 7, 32)):
        assert_eq(nvstrings.to_device([text]).device_memory(), expected_bytes)
예제 #18
0
def test_free():
    """Smoke test: nvstrings.free accepts a freshly created instance."""
    # TODO: Check that GPU memory has been freed.
    dstrs = nvstrings.to_device(list("abcd"))
    nvstrings.free(dstrs)
예제 #19
0
import nvstrings

strs = nvstrings.to_device([
    '',
    None,
    'a b',
    ' a b ',
    '  aa  bb  ',
    ' a  bbb   c',
    ' aa b  ccc  ',
])

# Each entry: the heading to print, the split method to call, and the
# keyword arguments for that call.
for heading, splitter, options in (
    ("split_record():", strs.split_record, {}),
    ("split_record(n=1):", strs.split_record, {"n": 1}),
    ("split_record(n=2):", strs.split_record, {"n": 2}),
    ("rsplit_record():", strs.rsplit_record, {}),
    ("rsplit_record(n=1):", strs.rsplit_record, {"n": 1}),
    ("rsplit_record(n=2):", strs.rsplit_record, {"n": 2}),
):
    print(heading)
    for s in splitter(**options):
        print(" ", s)

print("split():")
for s in strs.split():
예제 #20
0
def test_character_tokenize():
    """Check that character_tokenize yields every character of each string.

    The expected output is the characters of both sentences in order; the
    None and empty entries add nothing.
    """
    plain = "the quick fox jumped over the lazy dog"
    strs = nvstrings.to_device(
        [
            plain,
            "the siamésé cat jumped under the sofa",
            None,
            "",
        ]
    )

    # The accented characters are spelled out as single tokens so the
    # expectation does not depend on how the literal above is encoded.
    expected = (
        list(plain)
        + list("the siam")
        + ["é", "s", "é"]
        + list(" cat jumped under the sofa")
    )

    outcome = nvtext.character_tokenize(strs)
    assert outcome.to_host() == expected
예제 #21
0
def test_gather():
    """Check that gather returns the strings at the given positions, in order."""
    source = nvstrings.to_device(["abc", "defghi", None, "cat"])
    picked = source.gather([1, 3, 2])
    assert picked.to_host() == ['defghi', 'cat', None]
예제 #22
0
#
import nvstrings
#
from librmm_cffi import librmm as rmm
from librmm_cffi import librmm_config as rmm_cfg

# Turn on RMM's pool allocator before initializing it.
rmm_cfg.use_pool_allocator = True
rmm.initialize()

strs = nvstrings.to_device([
    "quick brown fox jumped over lazy brown dog",
    None,
    "hello there, accéntéd world",
    "",
])
print(strs)

# Print the wrap() result for each width.
for label, width in (
    (".wrap(10):", 10),
    (".wrap(20):", 20),
    (".wrap(50):", 50),
):
    print(label, strs.wrap(width))

# Drop the reference to the device strings.
strs = None
예제 #23
0
def test_gather_bool():
    """Check gather with a boolean list: only the True positions are kept."""
    source = nvstrings.to_device(["abc", "defghi", None, "cat"])
    keep = [True, False, False, True]
    assert source.gather(keep).to_host() == ['abc', 'cat']
예제 #24
0
def test_rjust():
    """Check rjust(4): 3-character strings gain one leading space, None stays None."""
    source = nvstrings.to_device(["abc", "Def", None, "jLl"])
    padded = source.rjust(4)
    assert_eq(padded, [' abc', ' Def', None, ' jLl'])
예제 #25
0
import pandas as pd
import nvstrings
import time

# Load the CSV and pull out the column both libraries will split.
df = pd.read_csv('/data/7584-rows.csv', sep=',')
df.columns

values = df["address"].values
values

# Same host list feeds both the device strings and the pandas Series.
rows = values.tolist()
dstrs = nvstrings.to_device(rows)
hstrs = pd.Series(rows)

clock = time.CLOCK_MONOTONIC_RAW
print("precision = %0.9f seconds" % time.clock_getres(clock))
print(str(dstrs.size()), "strings")

# Time the nvstrings split.
started = time.clock_gettime(clock)
d = dstrs.split(' ')
nv_elapsed = time.clock_gettime(clock) - started
print("nvstrings.split() = %05f" % nv_elapsed)

# Time the pandas split on the same data.
started = time.clock_gettime(clock)
h = hstrs.str.split(' ')
pd_elapsed = time.clock_gettime(clock) - started
print("     pandas.split() = %05f" % pd_elapsed)
print("speedup = %0.5fx" % (pd_elapsed / nv_elapsed))

# Release both results.
d = None
h = None
예제 #26
0
import string

import nvstrings

strs = nvstrings.to_device(
    ["hello", "there", "world", "accéntéd", None, ""])
print(strs)

# translate() with explicit [from, to] pair lists.
print(".translate():", strs.translate([]))
print(".translate([[e,a]]):", strs.translate([['e', 'a']]))
print(".translate([[e,é]]):", strs.translate([['e', 'é']]))
print(".translate([[é,e],[o,None]]):",
      strs.translate([['é', 'e'], ['o', None]]))

# translate() with tables built by str.maketrans.
e_to_a = str.maketrans('e', 'a')
upper_elh = str.maketrans('elh', 'ELH')
print(".translate(maketrans(e,a):", strs.translate(e_to_a))
print(".translate(maketrans(elh,ELH):", strs.translate(upper_elh))

print()
strs = nvstrings.to_device([
    "This, of course, is only an example!",
    "And; will have @all the #punctuation that $money can buy.",
    "The %percent & the *star along with the (parenthesis) with dashes-and-under_lines.",
    "Equations: 3+3=6; 3/4 < 1 and > 0",
])
print(strs)

# Punctuation handling: first delete it, then replace it with spaces.
drop_punct = str.maketrans('', '', string.punctuation)
blank_punct = str.maketrans(string.punctuation,
                            ' ' * len(string.punctuation))
print(".translate(punctuation=None):\n", strs.translate(drop_punct))
print(".translate(punctuation=' '):\n", strs.translate(blank_punct))
예제 #27
0
import time

import nvstrings
# pandas was used below (pd.Series) without being imported, which raised
# NameError before any timing could run.
import pandas as pd

# Read the strings from the CSV file.
# NOTE(review): the second argument (7) is presumably a column index —
# confirm against the nvstrings.from_csv documentation.
dstrs_in = nvstrings.from_csv('../tweets.csv', 7)
vlist = dstrs_in.to_host()

# Double the list eight times (256x the original length) to get a larger
# workload for the timing comparison.
for _ in range(8):
    vlist.extend(vlist)

dstrs = nvstrings.to_device(vlist)
hstrs = pd.Series(vlist)

print("precision = %0.9f seconds" %
      time.clock_getres(time.CLOCK_MONOTONIC_RAW))
print("strings =", dstrs.size())

# Time the regex search in nvstrings.
st = time.clock_gettime(time.CLOCK_MONOTONIC_RAW)
d = dstrs.contains('@.+@')
et1 = (time.clock_gettime(time.CLOCK_MONOTONIC_RAW) - st)
print("nvstrings.contains('@.+@') = %05f" % et1)

# Time the same search in pandas.
st = time.clock_gettime(time.CLOCK_MONOTONIC_RAW)
h = hstrs.str.contains('@.+@')
et2 = (time.clock_gettime(time.CLOCK_MONOTONIC_RAW) - st)
print("pandas.contains('@.+@') = %05f" % et2)
예제 #28
0
def test_remove_strings():
    """Check that remove_strings drops the entries at the given positions."""
    source = nvstrings.to_device(["abc", "defghi", None, "cat"])
    remaining = source.remove_strings([0, 2])
    assert remaining.to_host() == ['defghi', 'cat']
예제 #29
0
from librmm_cffi import librmm as rmm
from librmm_cffi import librmm_config as rmm_cfg

# Configure RMM to use a memory pool, then initialize it.
rmm_cfg.use_pool_allocator = True
rmm_cfg.initial_pool_size = 2 << 30  # 2 GiB
rmm_cfg.use_managed_memory = False
rmm_cfg.enable_logging = True
rmm.initialize()

# Imported after rmm.initialize(), as in the original script.
import nvstrings

strs = nvstrings.to_device([
    "Hello",
    "there",
    "world",
    None,
    "1234",
    "-123.4",
    "accénted",
    "",
])
print(strs)

# Case conversions: each takes no arguments, so drive them from a table.
for label, convert in (
    (".lower():", strs.lower),
    (".upper():", strs.upper),
    (".swapcase():", strs.swapcase),
    (".capitalize():", strs.capitalize),
    (".title():", strs.title),
):
    print(label, convert())

# Combining strings.
suffixes = ["1", "2", "3", "4", "5", "6", "é", None]
print(".cat([1,2,3,4,5,6,é,nil]:", strs.cat(suffixes))
print(".join(:):", strs.join(sep=':'))

# Comparing against a single string.
print(".compare(there):", strs.compare("there"))
예제 #30
0
def test_add_strings():
    """Check that add_strings appends the second instance's strings to the first."""
    first = ["dog and cat", None, "accénted", ""]
    second = ["aaa", None, "", "bbb"]

    combined = nvstrings.to_device(first).add_strings(
        nvstrings.to_device(second))
    assert combined.to_host() == first + second