Пример #1
0
def check_correctness(seed):
    """Assert that substring search finds a delimiter present in every string.

    Two random string arrays are generated from the same ``seed`` and joined
    with the delimiter ``'1 string 1'``, so each resulting string contains
    that delimiter with random strings before and after. The search is then
    checked as a plain literal, as a regex literal, and as a regex pattern.

    Args:
        seed: seed forwarded to both random string generators.

    Raises:
        AssertionError: if any of the three searches misses a string.
    """
    size = 10**4
    delimiter = '1 string 1'

    prefix = ak.random_strings_uniform(minlen=1, maxlen=8, size=size, seed=seed)
    suffix = ak.random_strings_uniform(minlen=1, maxlen=8, size=size, seed=seed)
    haystack = prefix.stick(suffix, delimiter=delimiter)

    # (pattern, keyword options) for each flavour of search being verified.
    queries = (
        (delimiter, {}),
        (delimiter, {'regex': True}),
        ('\\d string \\d', {'regex': True}),
    )
    for pattern, options in queries:
        assert haystack.contains(pattern, **options).all()
Пример #2
0
def time_substring_search(N, trials, seed):
    """Time ``contains`` on arkouda strings for three kinds of query.

    Builds ``N`` strings that each embed ``'1 string 1'`` between two random
    pieces, then repeatedly times a plain substring search, a regex search
    for the same literal, and a regex search using a digit pattern. Average
    times and rates are printed; nothing is returned.

    Args:
        N: number of strings to generate.
        trials: number of timed repetitions; the averages divide by it.
        seed: seed forwarded to both random string generators.

    Raises:
        AssertionError: if the last result of any search is not all-true.
    """
    print(">>> arkouda substring search")
    cfg = ak.get_config()
    print("numLocales = {}, N = {:,}".format(cfg["numLocales"], N))

    prefix = ak.random_strings_uniform(minlen=1, maxlen=8, size=N, seed=seed)
    suffix = ak.random_strings_uniform(minlen=1, maxlen=8, size=N, seed=seed)

    # Every generated string holds the delimiter, so each search should hit all rows.
    haystack = prefix.stick(suffix, delimiter='1 string 1')
    nbytes = haystack.nbytes * haystack.entry.itemsize

    literal_secs, regex_literal_secs, regex_pattern_secs = [], [], []
    for _ in range(trials):
        t0 = time.time()
        literal_hits = haystack.contains('1 string 1')
        t1 = time.time()
        literal_secs.append(t1 - t0)

        t0 = time.time()
        regex_literal_hits = haystack.contains('1 string 1', regex=True)
        t1 = time.time()
        regex_literal_secs.append(t1 - t0)

        t0 = time.time()
        regex_pattern_hits = haystack.contains('\\d string \\d', regex=True)
        t1 = time.time()
        regex_pattern_secs.append(t1 - t0)

    avg_literal = sum(literal_secs) / trials
    avg_regex_literal = sum(regex_literal_secs) / trials
    avg_regex_pattern = sum(regex_pattern_secs) / trials

    # Only the results of the final trial are checked.
    assert literal_hits.all()
    assert regex_literal_hits.all()
    assert regex_pattern_hits.all()

    time_reports = (
        ("non-regex with literal substring Average time = {:.4f} sec", avg_literal),
        ("regex with literal substring Average time = {:.4f} sec", avg_regex_literal),
        ("regex with pattern Average time = {:.4f} sec", avg_regex_pattern),
    )
    for template, avg in time_reports:
        print(template.format(avg))

    rate_reports = (
        ("non-regex with literal substring Average rate = {:.4f} GiB/sec", avg_literal),
        ("regex with literal substring Average rate = {:.4f} GiB/sec", avg_regex_literal),
        ("regex with pattern Average rate = {:.4f} GiB/sec", avg_regex_pattern),
    )
    for template, avg in rate_reports:
        print(template.format(nbytes / 2**30 / avg))
Пример #3
0
def time_ak_gather(isize, vsize, trials, dtype, random):
    """Time the gather ``v[i]`` with verbose diagnostics and print the rate.

    The index vector ``i`` is always random. The value vector ``v`` is random
    strings for ``dtype == 'str'``; otherwise it is random int64/float64 data
    when ``random`` is true, or an array of ones of ``dtype`` when it is not.

    Args:
        isize: number of indices per locale.
        vsize: number of values per locale.
        trials: number of timed repetitions; the average divides by it.
        dtype: ``'int64'``, ``'float64'`` or ``'str'`` (any dtype accepted by
            ``ak.ones`` when ``random`` is false).
        random: whether numeric values are randomly generated.

    Raises:
        ValueError: if ``random`` is true and ``dtype`` is not one of the
            three supported names (previously this surfaced as an unbound
            local error on ``v``).
    """
    print(">>> arkouda gather")
    cfg = ak.get_config()
    Ni = isize * cfg["numLocales"]
    Nv = vsize * cfg["numLocales"]
    print("numLocales = {}, num_indices = {:,} ; num_values = {:,}".format(cfg["numLocales"], Ni, Nv))
    # Index vector is always random
    i = ak.randint(0, Nv, Ni)
    is_str = dtype == 'str'
    if is_str:
        # The string case uses random strings whether or not `random` is set.
        v = ak.random_strings_uniform(1, 16, Nv)
    elif not random:
        v = ak.ones(Nv, dtype=dtype)
    elif dtype == 'int64':
        v = ak.randint(0, 2**32, Nv)
    elif dtype == 'float64':
        v = ak.randint(0, 1, Nv, dtype=ak.float64)
    else:
        raise ValueError(
            "unsupported dtype for random values: {!r}".format(dtype))

    # Diagnostics. `offsets` and `bytes` are only read in the string case,
    # mirroring the rate computation below, so numeric runs no longer fail here.
    print("v={}".format(v))
    if is_str:
        print("v.offsets={}".format(v.offsets))
    print("v.nbytes={}".format(v.nbytes))
    print("v[1]={}".format(v[1]))
    print("In Gather size={}".format(v.size))
    print("In Gather nbytes={}".format(v.nbytes))
    print("In Gather ndim={}".format(v.ndim))
    print("In Gather shape={}".format(v.shape))
    if is_str:
        print("In Gather offsets name ={}".format(v.offsets.name))
        print("In Gather offsets size={}".format(v.offsets.size))
        print("In Gather bytes name ={}".format(v.bytes.name))
        print("In Gather bytes size={}".format(v.bytes.size))

    timings = []
    for _ in range(trials):
        print("In Gather loop i={}".format(i))
        # NOTE(review): this diagnostic performs an extra, untimed gather
        # before each timed one.
        print("In Gather v[i]={}".format(v[i]))
        start = time.time()
        c = v[i]
        end = time.time()
        print("In Gather loop c={}".format(c))
        timings.append(end - start)
    tavg = sum(timings) / trials

    print("Average time = {:.4f} sec".format(tavg))
    if is_str:
        offsets_transferred = 3 * c.offsets.size * c.offsets.itemsize
        bytes_transferred = (c.offsets.size * c.offsets.itemsize) + (2 * c.bytes.size)
        bytes_per_sec = (offsets_transferred + bytes_transferred) / tavg
    else:
        bytes_per_sec = (c.size * c.itemsize * 3) / tavg
    print("Average rate = {:.2f} GiB/sec".format(bytes_per_sec/2**30))
Пример #4
0
def time_ak_sa(vsize, trials, dtype):
    """Time ``ak.suffix_array`` on random strings and print time and rate.

    Generates ``vsize * numLocales`` random strings, builds their suffix
    arrays once to print the first ``vsize`` strings with their suffix
    arrays, then times ``trials`` further builds.

    Args:
        vsize: number of strings per locale; also how many strings are echoed.
        trials: number of timed repetitions; the average divides by it.
        dtype: selects the rate formula only (``'str'`` uses the
            offsets/bytes formula); the input is always random strings.
    """
    print(">>> arkouda suffix array")
    cfg = ak.get_config()
    Nv = vsize * cfg["numLocales"]
    print("numLocales = {},  num of strings  = {:,}".format(
        cfg["numLocales"], Nv))
    v = ak.random_strings_uniform(1, 16, Nv)
    c = ak.suffix_array(v)
    print("size of suffix array={}".format(c.bytes.size))
    for k in range(vsize):
        print("the {} th random tring ={}".format(k, v[k]))
        print("the {} th suffix array ={}".format(k, c[k]))
        print("")
    timings = []
    for _ in range(trials):
        start = time.time()
        ak.suffix_array(v)
        end = time.time()
        timings.append(end - start)
    tavg = sum(timings) / trials

    print("Average time = {:.4f} sec".format(tavg))
    if dtype == 'str':
        offsets_transferred = 3 * c.offsets.size * c.offsets.itemsize
        bytes_transferred = (c.offsets.size *
                             c.offsets.itemsize) + (2 * c.bytes.size)
        bytes_per_sec = (offsets_transferred + bytes_transferred) / tavg
    else:
        bytes_per_sec = (c.size * c.itemsize * 3) / tavg
    # The rate was computed but never reported; print it like the other
    # benchmarks do.
    print("Average rate = {:.2f} GiB/sec".format(bytes_per_sec / 2**30))
Пример #5
0
def time_ak_argsort(N_per_locale, trials, dtype, seed):
    """Time ``ak.argsort`` on a random array and print average time and rate.

    Args:
        N_per_locale: elements per locale; total size is this times
            ``numLocales`` from the server config.
        trials: number of timed repetitions; the average divides by it.
        dtype: ``'int64'``, ``'float64'`` or ``'str'``.
        seed: seed forwarded to the random generator.

    Raises:
        AssertionError: for numeric dtypes, if the permuted data is not sorted.
    """
    print(">>> arkouda {} argsort".format(dtype))
    cfg = ak.get_config()
    N = N_per_locale * cfg["numLocales"]
    print("numLocales = {}, N = {:,}".format(cfg["numLocales"], N))

    if dtype == 'int64':
        data = ak.randint(0, 2**32, N, seed=seed)
        nbytes = data.size * data.itemsize
    elif dtype == 'float64':
        data = ak.randint(0, 1, N, dtype=ak.float64, seed=seed)
        nbytes = data.size * data.itemsize
    elif dtype == 'str':
        data = ak.random_strings_uniform(1, 16, N, seed=seed)
        nbytes = data.nbytes * data.entry.itemsize

    elapsed = []
    for _ in range(trials):
        t0 = time.time()
        perm = ak.argsort(data)
        t1 = time.time()
        elapsed.append(t1 - t0)
    tavg = sum(elapsed) / trials

    # Sortedness is only verified for the numeric dtypes.
    if dtype in ('int64', 'float64'):
        assert ak.is_sorted(data[perm])
    print("Average time = {:.4f} sec".format(tavg))
    print("Average rate = {:.4f} GiB/sec".format((nbytes / tavg) / 2**30))
Пример #6
0
def time_ak_coargsort(N_per_locale, trials, dtype, seed):
    """Time ``ak.coargsort`` for 1, 2, 8 and 16 arrays and print the results.

    For each array count the total element budget ``N`` is split evenly
    across the arrays, each generated with its own seed (``seed + k``) when a
    seed is given.

    Args:
        N_per_locale: elements per locale; ``N`` is this times ``numLocales``.
        trials: number of timed repetitions per array count.
        dtype: ``'int64'``, ``'float64'`` or ``'str'``.
        seed: base seed, or ``None`` for unseeded generation.

    Raises:
        AssertionError: for numeric dtypes, if the first array permuted by
            the result is not sorted.
    """
    print(">>> arkouda {} coargsort".format(dtype))
    cfg = ak.get_config()
    N = N_per_locale * cfg["numLocales"]
    print("numLocales = {}, N = {:,}".format(cfg["numLocales"], N))

    verify_sorted = dtype in ('int64', 'float64')
    for numArrays in (1, 2, 8, 16):
        per_array = N // numArrays
        if seed is None:
            seeds = [None] * numArrays
        else:
            seeds = [seed + offset for offset in range(numArrays)]

        if dtype == 'int64':
            arrs = [ak.randint(0, 2**32, per_array, seed=s) for s in seeds]
            nbytes = sum(arr.size * arr.itemsize for arr in arrs)
        elif dtype == 'float64':
            arrs = [ak.randint(0, 1, per_array, dtype=ak.float64, seed=s) for s in seeds]
            nbytes = sum(arr.size * arr.itemsize for arr in arrs)
        elif dtype == 'str':
            arrs = [ak.random_strings_uniform(1, 8, per_array, seed=s) for s in seeds]
            nbytes = sum(arr.bytes.size * arr.bytes.itemsize for arr in arrs)

        elapsed = []
        for _ in range(trials):
            t0 = time.time()
            perm = ak.coargsort(arrs)
            t1 = time.time()
            elapsed.append(t1 - t0)
        tavg = sum(elapsed) / trials

        # The leading array is permuted for every dtype; only numeric
        # results are checked for sortedness.
        leading_sorted = arrs[0][perm]
        if verify_sorted:
            assert ak.is_sorted(leading_sorted)
        print("{}-array Average time = {:.4f} sec".format(numArrays, tavg))
        rate = nbytes / tavg
        print("{}-array Average rate = {:.4f} GiB/sec".format(numArrays, rate/2**30))
Пример #7
0
def check_correctness(dtype, seed):
    """Run ``ak.argsort`` on 10**4 random values and verify numeric results.

    Args:
        dtype: ``'int64'``, ``'float64'`` or ``'str'``.
        seed: seed forwarded to the random generator.

    Raises:
        AssertionError: for numeric dtypes, if the permuted data is not sorted.
    """
    size = 10**4
    if dtype == 'int64':
        values = ak.randint(0, 2**32, size, seed=seed)
    elif dtype == 'float64':
        values = ak.randint(0, 1, size, dtype=ak.float64, seed=seed)
    elif dtype == 'str':
        values = ak.random_strings_uniform(1, 16, size, seed=seed)

    order = ak.argsort(values)
    # Strings are sorted but not checked; only numeric output is asserted.
    if dtype not in ('int64', 'float64'):
        return
    assert ak.is_sorted(values[order])
Пример #8
0
def time_ak_in1d(size, trials):
    """Time ``ak.in1d`` on random strings against two sizes of second array.

    The first array has ``size * numLocales`` strings; the second is
    regenerated for each regime with ``MEDIUM`` and then ``LARGE`` strings.
    Average time and rate are printed per regime.

    Args:
        size: number of strings per locale in the first array.
        trials: number of timed repetitions per regime.
    """
    print(">>> arkouda string in1d")
    cfg = ak.get_config()
    N = size * cfg["numLocales"]
    haystack = ak.random_strings_uniform(1, MAXSTRLEN, N)

    regimes = (('Medium', MEDIUM), ('Large', LARGE))
    for regime, bsize in regimes:
        header = "{} regime: numLocales = {}  a.size = {:,}  b.size = {:,}"
        print(header.format(regime, cfg["numLocales"], N, bsize))
        needles = ak.random_strings_uniform(1, MAXSTRLEN, bsize)

        elapsed = []
        for _ in range(trials):
            t0 = time.time()
            ak.in1d(haystack, needles)
            t1 = time.time()
            elapsed.append(t1 - t0)
        tavg = sum(elapsed) / trials
        print("{} average time = {:.4f} sec".format(regime, tavg))

        # 8 bytes are counted per element in addition to each array's nbytes.
        total_bytes = (haystack.size * 8 + haystack.nbytes
                       + needles.size * 8 + needles.nbytes)
        rate = total_bytes / tavg
        print("{} average rate = {:.2f} GiB/sec".format(regime, rate / 2**30))
Пример #9
0
def check_correctness(vsize, strlen, trials, dtype):
    """Compare ``ak.suffix_array`` with the local ``suffixArray`` reference.

    Args:
        vsize: number of random strings to generate and check.
        strlen: upper length bound passed to the random string generator.
        trials: unused here.
        dtype: unused here.

    Raises:
        AssertionError: if any suffix array differs from the reference.
    """
    strings = ak.random_strings_uniform(1, strlen, vsize)
    server_arrays = ak.suffix_array(strings)
    for idx in range(vsize):
        expected = suffixArray(strings[idx])
        actual = server_arrays[idx]
        assert (expected == actual)
Пример #10
0
def generate_arrays(N, numArrays, dtype, seed):
    """Create ``numArrays`` random arrays and report their combined byte size.

    Integer arrays are produced when ``dtype`` is ``'int64'``, or at even
    positions when ``dtype`` is ``'mixed'``; every other case produces random
    strings. A non-``None`` seed is incremented after each array.

    Args:
        N: total element budget, split evenly across the arrays.
        numArrays: how many arrays to create.
        dtype: ``'int64'``, ``'mixed'``, or anything else for strings.
        seed: starting seed, or ``None``.

    Returns:
        ``(arrays, totalbytes)``, where ``arrays`` is the single array itself
        when ``numArrays == 1`` and a list otherwise.
    """
    generated = []
    byte_count = 0
    for position in range(numArrays):
        make_ints = dtype == 'int64' or (dtype == 'mixed' and position % 2 == 0)
        if make_ints:
            arr = ak.randint(0, 2**32, N//numArrays, seed=seed)
            arr_bytes = arr.size * arr.itemsize
        else:
            arr = ak.random_strings_uniform(1, 16, N//numArrays, seed=seed)
            arr_bytes = arr.nbytes * arr.entry.itemsize
        generated.append(arr)
        byte_count += arr_bytes
        if seed is not None:
            seed += 1
    result = generated[0] if numArrays == 1 else generated
    return result, byte_count
Пример #11
0
def check_correctness(dtype, seed):
    """Run ``ak.coargsort`` with a zero-filled companion array in both orders.

    A random array ``a`` and an all-zero array ``z`` are co-sorted as
    ``[a, z]`` and then ``[z, a]``; for numeric dtypes ``a`` permuted by each
    result must be sorted.

    Args:
        dtype: ``'int64'``, ``'float64'`` or ``'str'``.
        seed: seed forwarded to the random generator.

    Raises:
        AssertionError: for numeric dtypes, if either result is not sorted.
    """
    size = 10**4
    numeric = dtype in ('int64', 'float64')

    if dtype == 'int64':
        a = ak.randint(0, 2**32, size, seed=seed)
    elif dtype == 'float64':
        a = ak.randint(0, 1, size, dtype=ak.float64, seed=seed)
    elif dtype == 'str':
        a = ak.random_strings_uniform(1, 16, size, seed=seed)

    if numeric:
        z = ak.zeros(size, dtype=dtype)
    elif dtype == 'str':
        z = ak.cast(ak.zeros(size), 'str')

    for key_order in ([a, z], [z, a]):
        perm = ak.coargsort(key_order)
        if numeric:
            assert ak.is_sorted(a[perm])
Пример #12
0
def time_ak_gather(isize, vsize, trials, dtype, random, seed):
    """Time the gather ``v[i]`` and print average time and rate.

    The index vector is always random. Values are random when ``random`` is
    true or a seed is given; otherwise they are ``arange`` cast to strings
    for ``'str'`` or an array of ones for other dtypes.

    Args:
        isize: number of indices per locale.
        vsize: number of values per locale.
        trials: number of timed repetitions; the average divides by it.
        dtype: ``'int64'``, ``'float64'``, ``'bool'`` or ``'str'``.
        random: whether to generate random values.
        seed: seed for the indices; ``seed + 1`` is used for the values.
    """
    print(">>> arkouda {} gather".format(dtype))
    cfg = ak.get_config()
    Ni = isize * cfg["numLocales"]
    Nv = vsize * cfg["numLocales"]
    print("numLocales = {}, num_indices = {:,} ; num_values = {:,}".format(cfg["numLocales"], Ni, Nv))

    # Index vector is always random
    indices = ak.randint(0, Nv, Ni, seed=seed)
    # Values get a different seed from the indices when one was supplied.
    value_seed = None if seed is None else seed + 1

    if random or value_seed is not None:
        if dtype == 'int64':
            values = ak.randint(0, 2**32, Nv, seed=value_seed)
        elif dtype == 'float64':
            values = ak.randint(0, 1, Nv, dtype=ak.float64, seed=value_seed)
        elif dtype == 'bool':
            values = ak.randint(0, 1, Nv, dtype=ak.bool, seed=value_seed)
        elif dtype == 'str':
            values = ak.random_strings_uniform(1, 16, Nv, seed=value_seed)
    elif dtype == 'str':
        values = ak.cast(ak.arange(Nv), 'str')
    else:
        values = ak.ones(Nv, dtype=dtype)

    elapsed = []
    for _ in range(trials):
        t0 = time.time()
        gathered = values[indices]
        t1 = time.time()
        elapsed.append(t1 - t0)
    tavg = sum(elapsed) / trials

    print("Average time = {:.4f} sec".format(tavg))
    if dtype == 'str':
        offsets_nbytes = gathered.offsets.size * gathered.offsets.itemsize
        offsets_transferred = 3 * offsets_nbytes
        bytes_transferred = offsets_nbytes + (2 * gathered.bytes.size)
        bytes_per_sec = (offsets_transferred + bytes_transferred) / tavg
    else:
        bytes_per_sec = (gathered.size * gathered.itemsize * 3) / tavg
    print("Average rate = {:.2f} GiB/sec".format(bytes_per_sec/2**30))
Пример #13
0
def time_ak_sa(vsize, strlen, trials, dtype):
    """Time ``ak.suffix_array`` on random strings and print time and rate.

    Only ``dtype == 'str'`` is supported. Any other value prints
    "Wrong data type" and returns before any work is done; previously the
    function printed the message and then failed on an unbound variable.

    Args:
        vsize: number of strings per locale; also how many strings are echoed.
        strlen: upper length bound passed to the random string generator.
        trials: number of timed repetitions; the average divides by it.
        dtype: must be ``'str'``.

    Returns:
        None.
    """
    # Validate first so an unsupported dtype fails fast.
    if dtype != 'str':
        print("Wrong data type")
        return

    print(">>> arkouda suffix array")
    cfg = ak.get_config()
    Nv = vsize * cfg["numLocales"]
    print("numLocales = {},  num of strings  = {:,}".format(
        cfg["numLocales"], Nv))

    v = ak.random_strings_uniform(1, strlen, Nv)
    c = ak.suffix_array(v)
    print("All the random strings are as follows")
    for k in range(vsize):
        print("the {} th random tring ={}".format(k, v[k]))
        print("the {} th suffix array ={}".format(k, c[k]))
        print("")
    timings = []
    for _ in range(trials):
        start = time.time()
        c = ak.suffix_array(v)
        end = time.time()
        timings.append(end - start)
    tavg = sum(timings) / trials

    print("Average time = {:.4f} sec".format(tavg))
    # Same value as before: the other terms were multiplied by zero.
    bytes_transferred = c.bytes.size * c.offsets.itemsize
    bytes_per_sec = bytes_transferred / tavg
    print("Average rate = {:.2f} GiB/sec".format(bytes_per_sec / 2**30))
Пример #14
0

errors = False
if __name__ == '__main__':
    if len(sys.argv) > 1:
        ak.connect(server=sys.argv[1], port=sys.argv[2])
    else:
        ak.connect()

    # with open(__file__, 'r') as f:
    #     base_words = np.array(f.read().split())
    # test_strings = np.random.choice(base_words, N, replace=True)
    # strings = ak.array(test_strings)

    base_words1 = ak.random_strings_uniform(0,
                                            10,
                                            UNIQUE,
                                            characters='printable')
    base_words2 = ak.random_strings_lognormal(2,
                                              0.25,
                                              UNIQUE,
                                              characters='printable')
    base_words = ak.concatenate((base_words1, base_words2))
    np_base_words = np.hstack(
        (base_words1.to_ndarray(), base_words2.to_ndarray()))
    assert (compare_strings(base_words.to_ndarray(), np_base_words))
    choices = ak.randint(0, base_words.size, N)
    strings = base_words[choices]
    test_strings = strings.to_ndarray()
    cat = ak.Categorical(strings)
    print("strings =", strings)
    print("categorical =", cat)