示例#1
0
def test_from_csv():
    """Check the first five strings read from column 7 of the tweets CSV."""
    # Fixture first: the five leading entries the reader is expected to return.
    expected = [
        "@Bill_Porter nice to know that your site is back :-)",
        "@sudhamshu after trying out various tools to take notes and I found that paper is the best to take notes and to maintain todo lists.",
        "@neetashankar Yeah, I got the connection. I am getting 20 mbps for a 15 mbps connection. Customer service is also good.",
        '@Bill_Porter All posts from your website http://t.co/NUWn5HUFsK seems to have been deleted. I am getting a ""Not Found"" page even in homepage',
        'Today is ""bring your kids"" day at office and the entire office is taken over by cute little creatures ;)',
    ]

    loaded = nvstrings.from_csv("../../data/tweets.csv", 7)

    # Only the leading five entries are compared.
    assert_eq(loaded[:5], expected)
示例#2
0
import pandas as pd
import nvstrings
import time

# Benchmark script: times nvstrings.slice() against pandas Series.str.slice()
# on the same strings, for three increasing values of the lines= argument.
#
# pandas is imported above because the loop builds a pd.Series from the host
# copy of the strings; without that import the script fails with NameError.

# Alternative data source, kept for reference: read the text through pandas.
#df = pd.read_csv('/home/jovyan/reviews-1m.csv', sep=',')
#values = df["text"].values
#vlist = values.tolist()

# Resolution of the clock used for every measurement below.
print("precision = %0.9f seconds" % time.clock_getres(time.CLOCK_MONOTONIC_RAW))

for i in range(3):
    # 1M, 2M, then 3M is passed as lines= (presumably a cap on the number of
    # lines read from the file -- confirm against the nvstrings docs).
    lines = (i+1) * 1000000
    #vlist.extend(vlist)
    #print("strings:",len(vlist))
    #
    #dstrs = nvstrings.to_device(vlist)
    dstrs = nvstrings.from_csv("/home/jovyan/reviews.txt",0,lines=lines)
    # Host copy of the same strings, wrapped in a Series for the pandas side.
    vlist = dstrs.to_host()
    print("strings = ",len(vlist))
    hstrs = pd.Series(vlist)
    #
    # Time nvstrings.slice(3, 103).
    st = time.clock_gettime(time.CLOCK_MONOTONIC_RAW)
    d = dstrs.slice(3,103)
    et1 = (time.clock_gettime(time.CLOCK_MONOTONIC_RAW) - st)
    print("nvstrings.slice() = %05f" % et1)
    #
    # Time the equivalent pandas call with the same bounds.
    st = time.clock_gettime(time.CLOCK_MONOTONIC_RAW)
    h = hstrs.str.slice(3,103)
    et2 = (time.clock_gettime(time.CLOCK_MONOTONIC_RAW) - st)
    print("   pandas.slice() = %05f" % et2)
    # Ratio of the pandas time to the nvstrings time.
    print("speedup = %0.1fx" % (et2/et1) )
    #
示例#3
0
# NOTE(review): `strs` and `cat` are created earlier in the script, outside
# this excerpt.
# Print the strings passed to remove_strings(), rebind `cat` to the result,
# then print the resulting size and keys.
print(strs.size(),strs)
cat = cat.remove_strings(strs)
print(cat.size(),cat.keys())

# Print the results of the accessor/lookup methods on the updated category.
print(".values():",cat.values())
print(".value_for_index(7)",cat.value_for_index(7))
print(".value(fff):",cat.value('fff'))
print(".indexes_for_key(fff):",cat.indexes_for_key('fff'))
print(".to_strings():",cat.to_strings())

# multiple strings in one call
# Two separate nvstrings objects (nine strings each) are handed to a single
# from_strings() call.
print("-------------------------")
strs1 = nvstrings.to_device(["eee","aaa","eee","ddd","ccc","ccc","ccc","eee","aaa"])
strs2 = nvstrings.to_device(["ggg","fff","hhh","aaa","fff","fff","ggg","hhh","bbb"])
print(".from_strings(strs1,strs2)")
cat = nvcategory.from_strings(strs1,strs2)
print(cat.size(),cat)

# Same style of lookups on the combined category, plus gather_strings() with
# an explicit index list.
print(".values():",cat.values())
print(".value(ccc):",cat.value('ccc'))
print(".indexes_for_key(ccc):",cat.indexes_for_key('ccc'))
print(".gather_strings([0,2,0,3,1]):",cat.gather_strings([0,2,0,3,1]))

# Build a category from column 16 of a CSV file.
# Presumably the distinct values found in that column are:
# Masonry, Reinforced Concrete, Reinforced Masonry, Steel Frame, Wood
# -- confirm against the data file.
print("-------------------------")
print("36634-rows.csv:")
strs = nvstrings.from_csv("../../data/36634-rows.csv",16)
cat = nvcategory.from_strings(strs)
print(cat.size(),cat.keys())
print("len(.values()):",len(cat.values()))
print(".value(Wood):",cat.value('Wood'))
示例#4
0
import pandas as pd
import nvstrings
import time

# Benchmark script: times nvstrings.contains() against pandas
# Series.str.contains() for the same pattern on the same strings.

# Read column 7 of the tweets CSV and pull the strings back as a host list.
dstrs_in = nvstrings.from_csv('../tweets.csv', 7)
vlist = dstrs_in.to_host()

# Double the list eight times, giving 256x the original number of strings.
for _ in range(8):
    vlist.extend(vlist)

# The same strings feed both sides: an nvstrings object and a pandas Series.
dstrs = nvstrings.to_device(vlist)
hstrs = pd.Series(vlist)

# Resolution of the clock used for the measurements, then the string count.
print("precision = %0.9f seconds" %
      time.clock_getres(time.CLOCK_MONOTONIC_RAW))
print("strings =", dstrs.size())
#
# Time the nvstrings call.
st = time.clock_gettime(time.CLOCK_MONOTONIC_RAW)
d = dstrs.contains('@.+@')
et1 = (time.clock_gettime(time.CLOCK_MONOTONIC_RAW) - st)
print("nvstrings.contains('@.+@') = %05f" % et1)

# Time the pandas call with the same pattern.
st = time.clock_gettime(time.CLOCK_MONOTONIC_RAW)
h = hstrs.str.contains('@.+@')
et2 = (time.clock_gettime(time.CLOCK_MONOTONIC_RAW) - st)
示例#5
0
import pandas as pd
import nvstrings
import time

# Configure rmm with a pool allocator before any strings are loaded.
from librmm_cffi import librmm as rmm
from librmm_cffi import librmm_config as rmm_cfg
rmm_cfg.use_pool_allocator = True
rmm_cfg.initial_pool_size = 8 << 30  # 8GB
rmm.initialize()

# Column 7 of the tweets CSV, copied back to the host.
strs = nvstrings.from_csv('/data/tweets.csv', 7).to_host()

# Base list: three copies of the host strings, then doubled seven times
# (3 * 2**7 = 384 copies in total).
vlist1 = []
for _ in range(3):
    vlist1.extend(strs)
for _ in range(7):
    vlist1.extend(vlist1)

# Resolution of the clock used for timing.
print("precision = %0.9f seconds" % time.clock_getres(time.CLOCK_MONOTONIC_RAW))

# One list of results per series; vlist starts empty.
stats = {name: [] for name in ('strings', 'pandas', 'nvstrings')}
vlist = []
for i in range(20):
示例#6
0
import pandas as pd
import nvstrings
import time

# Column 7 of the tweets CSV, copied back to the host.
strs = nvstrings.from_csv('/home/jovyan/tweets.csv', 7).to_host()

# Base list: three copies of the host strings, then doubled seven times
# (3 * 2**7 = 384 copies in total).
vlist1 = []
for _ in range(3):
    vlist1.extend(strs)
for _ in range(7):
    vlist1.extend(vlist1)

# Resolution of the clock used for timing.
print("precision = %0.9f seconds" % time.clock_getres(time.CLOCK_MONOTONIC_RAW))

# One list of results per series; vlist is the working list the loop grows.
stats = {name: [] for name in ('strings', 'pandas', 'nvstrings')}
vlist = []
for i in range(50):
    # Grow the working list by one more copy of the base list and record
    # its new length.
    vlist += vlist1
    stats['strings'].append(len(vlist))

    # Rebuild both representations from the grown list.
    dstrs = nvstrings.to_device(vlist)
    hstrs = pd.Series(vlist)
示例#7
0
import nvstrings

# Read column 7 of the tweets CSV.
strs = nvstrings.from_csv('../../data/tweets.csv', 7)

# Call slice(1, 15) and print whatever it returns.
sliced = strs.slice(1, 15)
print("slice(1,15):", sliced)
示例#8
0
import nvstrings, nvcategory
import numpy as np
from numba import cuda
import time

# Report the resolution of the clock used for the timings below.
print("precision = %0.9f seconds" %
      time.clock_getres(time.CLOCK_MONOTONIC_RAW))

# Value passed as lines= to from_csv.
lines = 1000000
# column 5 = style (e.g. American Pale Ale, Vienna Lager, etc)
dstrs = nvstrings.from_csv("/home/dwendt/data/reviews/beers-1m.csv",
                           5,
                           lines=lines)
#input("press enter")
#
# slist grows by one reference to the same dstrs object per iteration.
# stats1 collects the elapsed time of each from_strings_list() call.
# NOTE(review): stats2 is not used in the visible part of the script.
slist = []
stats1 = []
stats2 = []

for i in range(50):
    # NOTE(review): idx is only referenced by the commented-out print below
    # in the visible part of the loop.
    idx = i + 1
    #print(idx,'million')
    #
    slist.append(dstrs)
    #
    # Time building one category from the whole list of nvstrings objects.
    st = time.clock_gettime(time.CLOCK_MONOTONIC_RAW)
    cat = nvcategory.from_strings_list(slist)
    et = (time.clock_gettime(time.CLOCK_MONOTONIC_RAW) - st)
    print(cat.keys_size(), cat.size())
    print("  from_strings_list = %05f" % et)
    stats1.append(et)
示例#9
0
#
import nvstrings

#
# Read column 1 of the sample CSV.
strs = nvstrings.from_csv("../../data/7584-rows.csv", 1)

# Call split_column with a single-space delimiter and a second argument of 2,
# then print the element at index 1 of the result.
cols = strs.split_column(" ", 2)
second = cols[1]
print(second)
示例#10
0
import nvstrings
import numpy as np
from numba import cuda
import time

# Report the resolution of the clock (presumably used for timing later in the
# script, outside this excerpt).
print("precision = %0.9f seconds" % time.clock_getres(time.CLOCK_MONOTONIC_RAW))

# Value passed as lines= to from_csv; column 0 of the file is read.
lines = 1000000
dstrs = nvstrings.from_csv("/home/dwendt/data/reviews/reviews.txt",0,lines=lines)
#
# there are 14 of these:
rwords = ["fruit","vintage","zest","foam","sweet","juic","malt","wheat","citrus","pine","crisp","dark","golden","bitter"]
# there are 133 of these:
# NOTE(review): looks like a stop-word list; it is not used in the visible
# part of the script.
swords =['i','me','my','myself','we','our','ours','ourselves','you','your','yours','yourself',
            'yourselves','he','him','his','himself','she','her','hers','herself','it','its','itself',
            'they','them','their','theirs','themselves','what','which','who','whom','this','that',
            'these','those','am','is','are','was','were','be','been','being','have','has','had',
            'having','do','does','did','doing','a','an','the','and','but','if','or','because','as',
            'until','while','of','at','by','for','with','about','against','between','into','through',
            'during','before','after','above','below','to','from','up','down','in','out','on','off',
            'over','under','again','further','then','once','here','there','when','where','why','how',
            'all','any','both','each','few','more','most','other','some','such','no','nor','not',
            'only','own','same','so','than','too','very','s','t','can','will','just','don','should',
            'now','uses','use','using','used','one','also']

# NOTE(review): stats is not filled in the visible part of the loop.
stats = []
for i in range(len(rwords)):
    #
    # Growing prefix of rwords: the first word, then the first two, and so
    # on up to all of them.
    words = rwords[0:i+1]
    print(words)
示例#11
0
import pandas as pd
import nvstrings
import time

# Benchmark script: times nvstrings.contains() against pandas
# Series.str.contains() for the same pattern on the same strings.

# Read column 7 of the tweets CSV and pull the strings back as a host list.
dstrs_in = nvstrings.from_csv('/data/tweets.csv', 7)
vlist = dstrs_in.to_host()

# Double the list eight times, giving 256x the original number of strings.
for _ in range(8):
    vlist.extend(vlist)

# The same strings feed both sides: an nvstrings object and a pandas Series.
dstrs = nvstrings.to_device(vlist)
hstrs = pd.Series(vlist)

# Resolution of the clock used for the measurements, then the string count.
print("precision = %0.9f seconds" %
      time.clock_getres(time.CLOCK_MONOTONIC_RAW))
print("strings =", dstrs.size())
#
# Time the nvstrings call.
st = time.clock_gettime(time.CLOCK_MONOTONIC_RAW)
d = dstrs.contains('@.+@')
et1 = (time.clock_gettime(time.CLOCK_MONOTONIC_RAW) - st)
print("nvstrings.contains('@.+@') = %05f" % et1)

# Time the pandas call with the same pattern.
st = time.clock_gettime(time.CLOCK_MONOTONIC_RAW)
h = hstrs.str.contains('@.+@')
et2 = (time.clock_gettime(time.CLOCK_MONOTONIC_RAW) - st)