def time_ak_write_read(N_per_locale, trials, dtype, path, seed):
    """Time Parquet writes and reads of a random integer array.

    Every trial saves the array with ``save_parquet``, reads it back with
    ``ak.read_parquet(path + '*')`` and then removes the files matching
    ``path + '_LOCALE*'``.  Average times and GiB/sec rates are printed.

    Parameters
    ----------
    N_per_locale : int
        Elements per locale; multiplied by the server's ``numLocales``.
    trials : int
        Number of write/read rounds.  Must be non-zero, the averages divide
        by it.
    dtype : str
        Only used in the banner; the data always comes from
        ``ak.randint(0, 2**32, N)``.
    path : str
        File prefix passed to ``save_parquet``.
    seed : int or None
        Seed forwarded to ``ak.randint``.
    """
    print(">>> arkouda {} write/read".format(dtype))
    cfg = ak.get_config()
    N = N_per_locale * cfg["numLocales"]
    print("numLocales = {}, N = {:,}".format(cfg["numLocales"], N))
    # Forward the caller's seed; it used to be accepted and silently ignored.
    a = ak.randint(0, 2**32, N, seed=seed)

    writetimes = []
    readtimes = []
    for _ in range(trials):
        start = time.time()
        a.save_parquet(path)
        end = time.time()
        writetimes.append(end - start)

        start = time.time()
        # The binding is kept on purpose so the statement being timed is
        # unchanged from the previous revision.
        b = ak.read_parquet(path + '*')
        end = time.time()
        readtimes.append(end - start)

        # Clean up this round's files before the next write.
        for f in glob(path + '_LOCALE*'):
            os.remove(f)

    avgwrite = sum(writetimes) / trials
    avgread = sum(readtimes) / trials

    print("write Average time = {:.4f} sec".format(avgwrite))
    print("read Average time = {:.4f} sec".format(avgread))

    nb = a.size * a.itemsize
    print("write Average rate = {:.2f} GiB/sec".format(nb/2**30/avgwrite))
    print("read Average rate = {:.2f} GiB/sec".format(nb/2**30/avgread))
def time_ak_scatter(isize, vsize, trials, dtype, random):
    """Time the indexed assignment ``target[indices] = values``.

    ``isize`` and ``vsize`` are per-locale counts and are scaled by the
    server's ``numLocales``.  The index vector is always random; the values
    are random only when ``random`` is true (int64 / float64), otherwise
    they are all ones.  Prints the average time and a GiB/sec rate based on
    three times the byte size of the index vector.
    """
    print(">>> arkouda scatter")
    num_locales = ak.get_config()["numLocales"]
    Ni = isize * num_locales
    Nv = vsize * num_locales
    print("numLocales = {}, num_indices = {:,} ; num_values = {:,}".format(
        num_locales, Ni, Nv))

    # The indices are random regardless of the ``random`` flag.
    indices = ak.randint(0, Nv, Ni)
    target = ak.zeros(Nv, dtype=dtype)

    if not random:
        values = ak.ones(Ni, dtype=dtype)
    elif dtype == 'int64':
        values = ak.randint(0, 2**32, Ni)
    elif dtype == 'float64':
        values = ak.randint(0, 1, Ni, dtype=ak.float64)

    samples = []
    for _ in range(trials):
        t0 = time.time()
        target[indices] = values
        t1 = time.time()
        samples.append(t1 - t0)
    tavg = sum(samples) / trials

    print("Average time = {:.4f} sec".format(tavg))
    moved = indices.size * indices.itemsize * 3
    bytes_per_sec = moved / tavg
    print("Average rate = {:.2f} GiB/sec".format(bytes_per_sec / 2**30))
def time_ak_reduce(N_per_locale, trials, dtype, random):
    """Time every reduction named in the module-level ``OPS`` on one array.

    Each name in ``OPS`` is looked up as a method of the array and called
    without arguments, ``trials`` times.  For every operation the last
    result, the average time and a GiB/sec rate are printed.
    """
    print(">>> arkouda reduce")
    num_locales = ak.get_config()["numLocales"]
    N = N_per_locale * num_locales
    print("numLocales = {}, N = {:,}".format(num_locales, N))

    if random:
        if dtype == 'int64':
            a = ak.randint(0, 2**32, N)
        elif dtype == 'float64':
            a = ak.randint(0, 1, N, dtype=ak.float64)
    else:
        a = ak.arange(0, N, 1)
        if dtype == 'float64':
            a = 1.0 * a

    timings = {op: [] for op in OPS}
    results = {}
    for _ in range(trials):
        for op in timings:
            reducer = getattr(a, op)
            t0 = time.time()
            value = reducer()
            t1 = time.time()
            timings[op].append(t1 - t0)
            results[op] = value

    # Average and report one operation at a time.
    for op, samples in timings.items():
        t = sum(samples) / trials
        print("{} = {}".format(op, results[op]))
        print(" Average time = {:.4f} sec".format(t))
        bytes_per_sec = (a.size * a.itemsize) / t
        print(" Average rate = {:.2f} GiB/sec".format(bytes_per_sec / 2**30))
def time_ak_argsort(N_per_locale, trials, dtype, scale_by_locales):
    """Time ``ak.argsort`` on a random int64 or float64 array.

    When ``scale_by_locales`` is true the problem size is
    ``N_per_locale * numLocales``, otherwise it is ``N_per_locale`` itself.
    After timing, the permutation from the last trial is checked with
    ``ak.is_sorted`` and the average time and rate are printed.
    """
    print(">>> arkouda argsort")
    num_locales = ak.get_config()["numLocales"]
    N = N_per_locale * num_locales if scale_by_locales else N_per_locale
    print("numLocales = {}, N = {:,}".format(num_locales, N))

    if dtype == 'int64':
        a = ak.randint(0, 2**32, N)
    elif dtype == 'float64':
        a = ak.randint(0, 1, N, dtype=ak.float64)

    samples = []
    for _ in range(trials):
        t0 = time.time()
        perm = ak.argsort(a)
        t1 = time.time()
        samples.append(t1 - t0)
    tavg = sum(samples) / trials

    # Sanity check on the permutation produced by the final trial.
    assert ak.is_sorted(a[perm])

    print("Average time = {:.4f} sec".format(tavg))
    bytes_per_sec = (a.size * a.itemsize) / tavg
    print("Average rate = {:.4f} GiB/sec".format(bytes_per_sec/2**30))
def time_ak_coargsort(N_per_locale, trials, dtype):
    """Time ``ak.coargsort`` on 1, 2, 8 and 16 equally sized random arrays.

    The total element count ``N_per_locale * numLocales`` is split evenly
    (integer division) across the arrays of each configuration.  After the
    trials, the first array permuted by the last result is checked with
    ``ak.is_sorted``; the average time and rate are printed per
    configuration.
    """
    print(">>> arkouda coargsort")
    num_locales = ak.get_config()["numLocales"]
    N = N_per_locale * num_locales
    print("numLocales = {}, N = {:,}".format(num_locales, N))

    for numArrays in (1, 2, 8, 16):
        if dtype == 'int64':
            arrs = [ak.randint(0, 2**32, N // numArrays)
                    for _ in range(numArrays)]
        elif dtype == 'float64':
            arrs = [ak.randint(0, 1, N // numArrays, dtype=ak.float64)
                    for _ in range(numArrays)]

        samples = []
        for _ in range(trials):
            t0 = time.time()
            perm = ak.coargsort(arrs)
            t1 = time.time()
            samples.append(t1 - t0)
        tavg = sum(samples) / trials

        # Only the leading key has to come out ordered.
        a = arrs[0][perm]
        assert ak.is_sorted(a)

        print("{}-array Average time = {:.4f} sec".format(numArrays, tavg))
        total_bytes = sum(arr.size * arr.itemsize for arr in arrs)
        bytes_per_sec = total_bytes / tavg
        print("{}-array Average rate = {:.4f} GiB/sec".format(
            numArrays, bytes_per_sec / 2**30))
def time_ak_in1d(size, trials):
    """Time ``ak.in1d`` for int64 data in the 'Medium' and 'Large' regimes.

    The first operand is ``ak.arange(N) % LARGE``; the second is
    ``ak.arange(bsize)`` with ``bsize`` taken from the module-level
    ``MEDIUM`` and ``LARGE``.  Every trial's result is checked against the
    number of misses computed from those definitions.
    """
    print(">>> arkouda int64 in1d")
    num_locales = ak.get_config()["numLocales"]
    N = size * num_locales
    a = ak.arange(N) % LARGE

    for regime, bsize in (('Medium', MEDIUM), ('Large', LARGE)):
        print(
            "{} regime: numLocales = {} a.size = {:,} b.size = {:,}".format(
                regime, num_locales, N, bsize))
        b = ak.arange(bsize)

        # Misses contributed by the complete cycles of length LARGE, plus
        # those in the trailing partial cycle.
        full_cycles, remainder = divmod(a.size, LARGE)
        expected_misses = (LARGE - bsize) * full_cycles + max(
            0, remainder - bsize)

        samples = []
        for _ in range(trials):
            t0 = time.time()
            c = ak.in1d(a, b)
            t1 = time.time()
            samples.append(t1 - t0)
            assert (c.size - c.sum()) == expected_misses, "Incorrect result"
        tavg = sum(samples) / trials

        print("{} average time = {:.4f} sec".format(regime, tavg))
        bytes_per_sec = (a.size * a.itemsize + b.size * b.itemsize) / tavg
        print("{} average rate = {:.2f} GiB/sec".format(
            regime, bytes_per_sec / 2**30))
def time_ak_array_transfer(N, trials, dtype, random, seed):
    """Time ``to_ndarray`` and ``ak.array`` round trips for one array.

    ``ak.client.maxTransferBytes`` is set to the array's byte size before
    the trials and is left at that value afterwards.  ``gc.collect()`` runs
    at the end of every trial, outside the timed sections.  ``N`` is used
    as given (not scaled by the locale count); ``random`` is unused.
    """
    print(">>> arkouda {} array creation".format(dtype))
    num_locales = ak.get_config()["numLocales"]
    print("numLocales = {}, N = {:,}".format(num_locales, N))

    a = ak.randint(0, 2**32, N, dtype=dtype, seed=seed)
    nb = a.size * a.itemsize
    ak.client.maxTransferBytes = nb

    to_ndarray_times = []
    to_pdarray_times = []
    for _ in range(trials):
        t0 = time.time()
        npa = a.to_ndarray()
        t1 = time.time()
        to_ndarray_times.append(t1 - t0)

        t0 = time.time()
        aka = ak.array(npa)
        t1 = time.time()
        to_pdarray_times.append(t1 - t0)

        gc.collect()

    avgnd = sum(to_ndarray_times) / trials
    avgpd = sum(to_pdarray_times) / trials
    gib = nb / 2**30

    print("to_ndarray Average time = {:.4f} sec".format(avgnd))
    print("ak.array Average time = {:.4f} sec".format(avgpd))
    print("to_ndarray Average rate = {:.4f} GiB/sec".format(gib / avgnd))
    print("ak.array Average rate = {:.4f} GiB/sec".format(gib / avgpd))
def time_ak_setops(N_per_locale, trials, dtype, seed):
    """Time every set operation named in the module-level ``OPS``.

    Each name in ``OPS`` is looked up on ``ak`` and called with two random
    int64 arrays of ``N_per_locale * numLocales`` elements.  The average
    time and a GiB/sec rate (based on the combined size of both inputs) are
    printed per operation.

    Parameters
    ----------
    N_per_locale : int
        Elements per locale for each operand.
    trials : int
        Number of timed rounds; must be non-zero.
    dtype : str
        Only ``'int64'`` creates the operands.
    seed : int or None
        Seed for the first operand.  The second operand uses ``seed + 1``
        when a seed is given.
    """
    print(">>> arkouda {} setops".format(dtype))
    cfg = ak.get_config()
    N = N_per_locale * cfg["numLocales"]
    print("numLocales = {}, N = {:,}".format(cfg["numLocales"], N))

    if dtype == 'int64':
        a = ak.randint(0, 2**32, N, seed=seed)
        # Reusing the same seed with identical arguments would presumably
        # reproduce ``a`` exactly, so every operation would be measured on
        # a set and its own copy.  Offset the seed for the second operand.
        b_seed = None if seed is None else seed + 1
        b = ak.randint(0, 2**32, N, seed=b_seed)

    timings = {op: [] for op in OPS}
    # NOTE(review): the latest result per operation is retained here as in
    # the previous revision; presumably this keeps the release of a result
    # out of the timed region -- confirm before removing.
    results = {}
    for _ in range(trials):
        for op in timings:
            fxn = getattr(ak, op)
            start = time.time()
            r = fxn(a, b)
            end = time.time()
            timings[op].append(end - start)
            results[op] = r
    tavg = {op: sum(t) / trials for op, t in timings.items()}

    for op, t in tavg.items():
        print(" {} Average time = {:.4f} sec".format(op, t))
        bytes_per_sec = (a.size * a.itemsize * 2) / t
        print(" {} Average rate = {:.2f} GiB/sec".format(
            op, bytes_per_sec / 2**30))
def time_ak_write(N_per_locale, numfiles, trials, dtype, path, seed, parquet):
    """Time writing one random array to ``numfiles`` files per trial.

    File ``j`` is written to ``f"{path}{j:04}"`` with ``save_parquet`` when
    ``parquet`` is true and with ``save`` otherwise.  Every single write is
    timed; the reported average is the summed write time divided by
    ``trials`` and the rate is based on ``numfiles`` copies of the array.
    """
    print(">>> arkouda {} write".format(dtype))
    num_locales = ak.get_config()["numLocales"]
    N = N_per_locale * num_locales
    print("numLocales = {}, N = {:,}, filesPerLoc = {}".format(
        num_locales, N, numfiles))

    if dtype == 'int64':
        a = ak.randint(0, 2**32, N, seed=seed)
    elif dtype == 'float64':
        a = ak.randint(0, 1, N, dtype=ak.float64, seed=seed)

    writetimes = []
    for _ in range(trials):
        for j in range(numfiles):
            t0 = time.time()
            if parquet:
                a.save_parquet(f"{path}{j:04}")
            else:
                a.save(f"{path}{j:04}")
            t1 = time.time()
            writetimes.append(t1 - t0)
    avgwrite = sum(writetimes) / trials

    print("write Average time = {:.4f} sec".format(avgwrite))

    nb = a.size * a.itemsize * numfiles
    print("write Average rate = {:.2f} GiB/sec".format(nb / 2**30 / avgwrite))
def time_ak_scan(N_per_locale, trials, dtype, random, seed):
    """Time every scan named in the module-level ``OPS`` on one array.

    Each name in ``OPS`` is looked up on ``ak`` and applied to the array.
    For every operation the last element of the last result, the average
    time and a GiB/sec rate (twice the input's byte size) are printed.

    Parameters
    ----------
    N_per_locale : int
        Elements per locale; scaled by the server's ``numLocales``.
    trials : int
        Number of timed rounds; must be non-zero.
    dtype : str
        ``'int64'`` or ``'float64'``.
    random : bool
        Use random input.  Random input is also used whenever ``seed`` is
        not ``None``.
    seed : int or None
        Seed forwarded to the random generators.
    """
    print(">>> arkouda {} scan".format(dtype))
    cfg = ak.get_config()
    N = N_per_locale * cfg["numLocales"]
    print("numLocales = {}, N = {:,}".format(cfg["numLocales"], N))

    # Test the ``seed`` argument itself.  The previous revision read the
    # module-level ``args.seed``, which only exists when the file is run as
    # a script.
    if random or seed is not None:
        if dtype == 'int64':
            a = ak.randint(1, N, N, seed=seed)
        elif dtype == 'float64':
            a = ak.uniform(N, seed=seed) + 0.5
    else:
        a = ak.arange(1, N, 1)
        if dtype == 'float64':
            a = 1.0 * a

    timings = {op: [] for op in OPS}
    final_values = {}
    for _ in range(trials):
        for op in timings:
            fxn = getattr(ak, op)
            start = time.time()
            r = fxn(a)
            end = time.time()
            timings[op].append(end - start)
            # Remember the last element of the scan for the report.
            final_values[op] = r[r.size-1]
    tavg = {op: sum(t) / trials for op, t in timings.items()}

    for op, t in tavg.items():
        print("{}, final value = {}".format(op, final_values[op]))
        print(" {} Average time = {:.4f} sec".format(op, t))
        bytes_per_sec = (a.size * a.itemsize * 2) / t
        print(" {} Average rate = {:.2f} GiB/sec".format(op, bytes_per_sec/2**30))
def time_ak_stream(N_per_locale, trials, alpha, dtype, random):
    """Time the expression ``a + b * alpha`` on two equally sized arrays.

    The operands are random (int64 / float64) when ``random`` is true and
    all ones otherwise.  Prints the average time and a GiB/sec rate based
    on three times the byte size of the result.
    """
    print(">>> arkouda stream")
    num_locales = ak.get_config()["numLocales"]
    N = N_per_locale * num_locales
    print("numLocales = {}, N = {:,}".format(num_locales, N))

    if not random:
        a = ak.ones(N, dtype=dtype)
        b = ak.ones(N, dtype=dtype)
    elif dtype == 'int64':
        a = ak.randint(0, 2**32, N)
        b = ak.randint(0, 2**32, N)
    elif dtype == 'float64':
        a = ak.randint(0, 1, N, dtype=ak.float64)
        b = ak.randint(0, 1, N, dtype=ak.float64)

    samples = []
    for _ in range(trials):
        t0 = time.time()
        c = a + b * alpha
        t1 = time.time()
        samples.append(t1 - t0)
    tavg = sum(samples) / trials

    print("Average time = {:.4f} sec".format(tavg))
    bytes_per_sec = (c.size * c.itemsize * 3) / tavg
    print("Average rate = {:.2f} GiB/sec".format(bytes_per_sec / 2**30))
def time_ak_argsort(N_per_locale, trials, dtype, seed):
    """Time ``ak.argsort`` on random int64, float64 or string data.

    The byte count for the rate is ``size * itemsize`` for numeric input
    and ``nbytes * entry.itemsize`` for strings.  The sortedness check on
    the last permutation is only performed for the numeric dtypes.
    """
    print(">>> arkouda {} argsort".format(dtype))
    num_locales = ak.get_config()["numLocales"]
    N = N_per_locale * num_locales
    print("numLocales = {}, N = {:,}".format(num_locales, N))

    if dtype == 'int64':
        a = ak.randint(0, 2**32, N, seed=seed)
        nbytes = a.size * a.itemsize
    elif dtype == 'float64':
        a = ak.randint(0, 1, N, dtype=ak.float64, seed=seed)
        nbytes = a.size * a.itemsize
    elif dtype == 'str':
        a = ak.random_strings_uniform(1, 16, N, seed=seed)
        nbytes = a.nbytes * a.entry.itemsize

    samples = []
    for _ in range(trials):
        t0 = time.time()
        perm = ak.argsort(a)
        t1 = time.time()
        samples.append(t1 - t0)
    tavg = sum(samples) / trials

    numeric = dtype in ('int64', 'float64')
    if numeric:
        assert ak.is_sorted(a[perm])

    print("Average time = {:.4f} sec".format(tavg))
    bytes_per_sec = nbytes / tavg
    print("Average rate = {:.4f} GiB/sec".format(bytes_per_sec / 2**30))
def time_ak_sa(vsize, trials, dtype):
    """Time ``ak.suffix_array`` on random strings of length 1 to 16.

    One suffix array is built up front and, for the first ``vsize``
    entries, each input string and its suffix array are printed.  The
    construction is then repeated ``trials`` times under the timer and the
    average time and a GiB/sec rate are printed.

    Parameters
    ----------
    vsize : int
        Strings per locale; scaled by the server's ``numLocales``.
    trials : int
        Number of timed rounds; must be non-zero.
    dtype : str
        Selects the byte-count formula: ``'str'`` uses the offsets and
        bytes of the result, anything else uses ``size * itemsize * 3``.
    """
    print(">>> arkouda suffix array")
    cfg = ak.get_config()
    Nv = vsize * cfg["numLocales"]
    print("numLocales = {}, num of strings = {:,}".format(
        cfg["numLocales"], Nv))

    v = ak.random_strings_uniform(1, 16, Nv)
    c = ak.suffix_array(v)
    print("size of suffix array={}".format(c.bytes.size))

    # Show each of the first ``vsize`` inputs next to its suffix array.
    for k in range(vsize):
        print("the {} th random tring ={}".format(k, v[k]))
        print("the {} th suffix array ={}".format(k, c[k]))
        print("")

    timings = []
    for _ in range(trials):
        start = time.time()
        ak.suffix_array(v)
        end = time.time()
        timings.append(end - start)
    tavg = sum(timings) / trials

    print("Average time = {:.4f} sec".format(tavg))
    if dtype == 'str':
        offsets_transferred = 3 * c.offsets.size * c.offsets.itemsize
        bytes_transferred = (c.offsets.size * c.offsets.itemsize) + (2 * c.bytes.size)
        bytes_per_sec = (offsets_transferred + bytes_transferred) / tavg
    else:
        bytes_per_sec = (c.size * c.itemsize * 3) / tavg
    # The rate used to be computed and then dropped; report it.
    print("Average rate = {:.2f} GiB/sec".format(bytes_per_sec / 2**30))
def time_ak_coargsort(N_per_locale, trials, dtype, seed):
    """Time ``ak.coargsort`` on 1, 2, 8 and 16 seeded random arrays.

    With a seed, array ``k`` of a configuration is generated with
    ``seed + k``; without one every array is unseeded.  The sortedness
    check on the first permuted array is only made for numeric dtypes.
    """
    print(">>> arkouda {} coargsort".format(dtype))
    num_locales = ak.get_config()["numLocales"]
    N = N_per_locale * num_locales
    print("numLocales = {}, N = {:,}".format(num_locales, N))

    for numArrays in (1, 2, 8, 16):
        if seed is None:
            seeds = [None] * numArrays
        else:
            seeds = [seed + k for k in range(numArrays)]

        if dtype == 'int64':
            arrs = [ak.randint(0, 2**32, N//numArrays, seed=s) for s in seeds]
            nbytes = sum(arr.size * arr.itemsize for arr in arrs)
        elif dtype == 'float64':
            arrs = [ak.randint(0, 1, N//numArrays, dtype=ak.float64, seed=s)
                    for s in seeds]
            nbytes = sum(arr.size * arr.itemsize for arr in arrs)
        elif dtype == 'str':
            arrs = [ak.random_strings_uniform(1, 8, N//numArrays, seed=s)
                    for s in seeds]
            nbytes = sum(arr.bytes.size * arr.bytes.itemsize for arr in arrs)

        samples = []
        for _ in range(trials):
            t0 = time.time()
            perm = ak.coargsort(arrs)
            t1 = time.time()
            samples.append(t1 - t0)
        tavg = sum(samples) / trials

        a = arrs[0][perm]
        if dtype in ('int64', 'float64'):
            assert ak.is_sorted(a)

        print("{}-array Average time = {:.4f} sec".format(numArrays, tavg))
        bytes_per_sec = nbytes / tavg
        print("{}-array Average rate = {:.4f} GiB/sec".format(numArrays, bytes_per_sec/2**30))
def check_coargsort(N_per_locale):
    """Run the coargsort checks at a size scaled by the locale count.

    Calls ``check_int``, ``check_float``, ``check_int_float`` and
    ``check_large`` in that order, each with
    ``N_per_locale * numLocales``.
    """
    print(">>> arkouda coargsort")
    num_locales = ak.get_config()["numLocales"]
    N = N_per_locale * num_locales
    print("numLocales = {}, N = {:,}".format(num_locales, N))

    check_int(N)
    check_float(N)
    check_int_float(N)
    check_large(N)
def time_substring_search(N, trials, seed):
    """Time ``Strings.contains`` with a literal, a regex literal and a regex.

    The haystack is built by joining two random string arrays with the
    delimiter ``'1 string 1'``, so every entry contains that text.  Each
    trial times the three search variants back to back; afterwards all
    three results are asserted to be entirely true and the average times
    and GiB/sec rates are printed.
    """
    print(">>> arkouda substring search")
    num_locales = ak.get_config()["numLocales"]
    print("numLocales = {}, N = {:,}".format(num_locales, N))

    prefix = ak.random_strings_uniform(minlen=1, maxlen=8, size=N, seed=seed)
    suffix = ak.random_strings_uniform(minlen=1, maxlen=8, size=N, seed=seed)
    # Every entry is "<random>1 string 1<random>".
    test_substring = prefix.stick(suffix, delimiter='1 string 1')
    nbytes = test_substring.nbytes * test_substring.entry.itemsize

    non_regex_times = []
    regex_literal_times = []
    regex_pattern_times = []
    for _ in range(trials):
        t0 = time.time()
        non_regex = test_substring.contains('1 string 1')
        t1 = time.time()
        non_regex_times.append(t1 - t0)

        t0 = time.time()
        regex_literal = test_substring.contains('1 string 1', regex=True)
        t1 = time.time()
        regex_literal_times.append(t1 - t0)

        t0 = time.time()
        regex_pattern = test_substring.contains('\\d string \\d', regex=True)
        t1 = time.time()
        regex_pattern_times.append(t1 - t0)

    avg_non_regex = sum(non_regex_times) / trials
    avg_regex_literal = sum(regex_literal_times) / trials
    avg_regex_pattern = sum(regex_pattern_times) / trials

    assert non_regex.all()
    assert regex_literal.all()
    assert regex_pattern.all()

    gib = nbytes / 2**30
    print("non-regex with literal substring Average time = {:.4f} sec".format(
        avg_non_regex))
    print("regex with literal substring Average time = {:.4f} sec".format(
        avg_regex_literal))
    print("regex with pattern Average time = {:.4f} sec".format(
        avg_regex_pattern))

    print("non-regex with literal substring Average rate = {:.4f} GiB/sec".format(
        gib / avg_non_regex))
    print("regex with literal substring Average rate = {:.4f} GiB/sec".format(
        gib / avg_regex_literal))
    print("regex with pattern Average rate = {:.4f} GiB/sec".format(
        gib / avg_regex_pattern))
def time_ak_gather(isize, vsize, trials, dtype, random):
    """Time the gather ``v[i]`` while printing verbose diagnostics.

    The index vector is always random.  String values are always random
    strings; numeric values are random when ``random`` is true and all
    ones otherwise.  Before and during the trials the values, the indices
    and the results are printed.

    NOTE(review): the diagnostics read ``offsets`` and ``bytes`` from the
    value array unconditionally; presumably that only works for string
    input -- confirm before calling with a numeric dtype.
    """
    print(">>> arkouda gather")
    num_locales = ak.get_config()["numLocales"]
    Ni = isize * num_locales
    Nv = vsize * num_locales
    print("numLocales = {}, num_indices = {:,} ; num_values = {:,}".format(num_locales, Ni, Nv))

    # The indices are random regardless of the ``random`` flag.
    i = ak.randint(0, Nv, Ni)

    if dtype == 'str':
        v = ak.random_strings_uniform(1, 16, Nv)
    elif not random:
        v = ak.ones(Nv, dtype=dtype)
    elif dtype == 'int64':
        v = ak.randint(0, 2**32, Nv)
    elif dtype == 'float64':
        v = ak.randint(0, 1, Nv, dtype=ak.float64)

    # Diagnostic dump of the value array.
    print("v={}".format(v))
    print("v.offsets={}".format(v.offsets))
    print("v.nbytes={}".format(v.nbytes))
    print("v[1]={}".format(v[1]))
    print("In Gather size={}".format(v.size))
    print("In Gather nbytes={}".format(v.nbytes))
    print("In Gather ndim={}".format(v.ndim))
    print("In Gather shape={}".format(v.shape))
    print("In Gather offsets name ={}".format(v.offsets.name))
    print("In Gather offsets size={}".format(v.offsets.size))
    print("In Gather bytes name ={}".format(v.bytes.name))
    print("In Gather bytes size={}".format(v.bytes.size))

    samples = []
    for _ in range(trials):
        print("In Gather loop i={}".format(i))
        # This extra gather is only for display and is not timed.
        print("In Gather v[i]={}".format(v[i]))
        t0 = time.time()
        c = v[i]
        t1 = time.time()
        print("In Gather loop c={}".format(c))
        samples.append(t1 - t0)
    tavg = sum(samples) / trials

    print("Average time = {:.4f} sec".format(tavg))
    if dtype == 'str':
        offsets_transferred = 3 * c.offsets.size * c.offsets.itemsize
        bytes_transferred = (c.offsets.size * c.offsets.itemsize) + (2 * c.bytes.size)
        bytes_per_sec = (offsets_transferred + bytes_transferred) / tavg
    else:
        bytes_per_sec = (c.size * c.itemsize * 3) / tavg
    print("Average rate = {:.2f} GiB/sec".format(bytes_per_sec/2**30))
def time_ak_groupby(N_per_locale, trials, dtype, seed):
    """Time ``ak.GroupBy`` construction on 1, 2, 8 and 16 key arrays.

    The keys and their total byte count come from the file's
    ``generate_arrays(N, numArrays, dtype, seed)`` helper.  The average
    construction time and a GiB/sec rate are printed per configuration.
    """
    print(">>> arkouda groupby")
    num_locales = ak.get_config()["numLocales"]
    N = N_per_locale * num_locales
    print("numLocales = {}, N = {:,}".format(num_locales, N))

    for numArrays in (1, 2, 8, 16):
        arrays, totalbytes = generate_arrays(N, numArrays, dtype, seed)

        samples = []
        for _ in range(trials):
            t0 = time.time()
            g = ak.GroupBy(arrays)
            t1 = time.time()
            samples.append(t1 - t0)
        tavg = sum(samples) / trials

        print("{}-array Average time = {:.4f} sec".format(numArrays, tavg))
        bytes_per_sec = totalbytes / tavg
        print("{}-array Average rate = {:.4f} GiB/sec".format(
            numArrays, bytes_per_sec / 2**30))
def time_flatten(N, trials):
    """Time ``Strings.flatten`` with literal, regex-literal and regex delimiters.

    The input joins the string forms of three interleaved ranges
    (``arange(k, N*3, 3)`` for k = 0, 1, 2) with ``'_'``.  After the
    trials, each flattened result is asserted equal to the string form of
    ``arange(N*3)`` and the average times and GiB/sec rates are printed.
    """
    print(">>> arkouda flatten")
    num_locales = ak.get_config()["numLocales"]
    print("numLocales = {}, N = {:,}".format(num_locales, N))

    first, second, third = [
        ak.cast(ak.arange(offset, N*3, 3), 'str') for offset in range(3)
    ]
    joined_pair = first.stick(second, delimiter='_')
    thickrange = joined_pair.stick(third, delimiter='_')
    nbytes = thickrange.nbytes * thickrange.entry.itemsize

    non_regex_times = []
    regex_literal_times = []
    regex_pattern_times = []
    for _ in range(trials):
        t0 = time.time()
        non_regex = thickrange.flatten('_')
        t1 = time.time()
        non_regex_times.append(t1 - t0)

        t0 = time.time()
        regex_literal = thickrange.flatten('_', regex=True)
        t1 = time.time()
        regex_literal_times.append(t1 - t0)

        t0 = time.time()
        regex_pattern = thickrange.flatten('_+', regex=True)
        t1 = time.time()
        regex_pattern_times.append(t1 - t0)

    avg_non_regex = sum(non_regex_times) / trials
    avg_regex_literal = sum(regex_literal_times) / trials
    avg_regex_pattern = sum(regex_pattern_times) / trials

    # All three variants must reproduce the plain range.
    answer = ak.cast(ak.arange(N*3), 'str')
    assert (non_regex == answer).all()
    assert (regex_literal == answer).all()
    assert (regex_pattern == answer).all()

    print("non-regex flatten with literal delimiter Average time = {:.4f} sec".format(avg_non_regex))
    print("regex flatten with literal delimiter Average time = {:.4f} sec".format(avg_regex_literal))
    print("regex flatten with pattern delimiter Average time = {:.4f} sec".format(avg_regex_pattern))

    print("non-regex flatten with literal delimiter Average rate = {:.4f} GiB/sec".format(nbytes/2**30/avg_non_regex))
    print("regex flatten with literal delimiter Average rate = {:.4f} GiB/sec".format(nbytes/2**30/avg_regex_literal))
    print("regex flatten with pattern delimiter Average rate = {:.4f} GiB/sec".format(nbytes/2**30/avg_regex_pattern))
def time_ak_array_create(N_per_locale, trials, dtype, random, seed):
    """Time array creation for every operation in the module-level ``OPS``.

    Each creation goes through the file's
    ``create_ak_array(N, op, dtype, seed)`` helper.  The rate for every
    operation is computed from the size of the array created last.
    ``random`` is unused.
    """
    print(">>> arkouda {} array creation".format(dtype))
    num_locales = ak.get_config()["numLocales"]
    N = N_per_locale * num_locales
    print("numLocales = {}, N = {:,}".format(num_locales, N))

    timings = {op: [] for op in OPS}
    for _ in range(trials):
        for op in timings:
            t0 = time.time()
            a = create_ak_array(N, op, dtype, seed)
            t1 = time.time()
            timings[op].append(t1 - t0)

    # Average and report one operation at a time.
    for op, samples in timings.items():
        t = sum(samples) / trials
        print(" {} Average time = {:.4f} sec".format(op, t))
        bytes_per_sec = (a.size * a.itemsize) / t
        print(" {} Average rate = {:.2f} GiB/sec".format(op, bytes_per_sec/2**30))
def time_ak_gather(isize, vsize, trials, dtype, random, seed):
    """Time the gather ``v[i]`` for int64, float64, bool or string values.

    The index vector is always random and uses ``seed``; the values use
    ``seed + 1`` when a seed is given.  Values are random when ``random``
    is true or a seed was supplied; otherwise they are all ones, or the
    string form of ``arange(Nv)`` for ``'str'``.
    """
    print(">>> arkouda {} gather".format(dtype))
    num_locales = ak.get_config()["numLocales"]
    Ni = isize * num_locales
    Nv = vsize * num_locales
    print("numLocales = {}, num_indices = {:,} ; num_values = {:,}".format(num_locales, Ni, Nv))

    # The indices are random regardless of the ``random`` flag.
    i = ak.randint(0, Nv, Ni, seed=seed)

    # Values get a different seed than the indices.
    value_seed = None if seed is None else seed + 1
    use_random = random or value_seed is not None

    if use_random:
        if dtype == 'int64':
            v = ak.randint(0, 2**32, Nv, seed=value_seed)
        elif dtype == 'float64':
            v = ak.randint(0, 1, Nv, dtype=ak.float64, seed=value_seed)
        elif dtype == 'bool':
            v = ak.randint(0, 1, Nv, dtype=ak.bool, seed=value_seed)
        elif dtype == 'str':
            v = ak.random_strings_uniform(1, 16, Nv, seed=value_seed)
    elif dtype == 'str':
        v = ak.cast(ak.arange(Nv), 'str')
    else:
        v = ak.ones(Nv, dtype=dtype)

    samples = []
    for _ in range(trials):
        t0 = time.time()
        c = v[i]
        t1 = time.time()
        samples.append(t1 - t0)
    tavg = sum(samples) / trials

    print("Average time = {:.4f} sec".format(tavg))
    if dtype == 'str':
        offsets_transferred = 3 * c.offsets.size * c.offsets.itemsize
        bytes_transferred = (c.offsets.size * c.offsets.itemsize) + (2 * c.bytes.size)
        bytes_per_sec = (offsets_transferred + bytes_transferred) / tavg
    else:
        bytes_per_sec = (c.size * c.itemsize * 3) / tavg
    print("Average rate = {:.2f} GiB/sec".format(bytes_per_sec/2**30))
def time_ak_read(N_per_locale, numfiles, trials, dtype, path, seed, parquet):
    """Time reading every file matching ``path + '*'``.

    ``ak.read_parquet`` is used when ``parquet`` is true, ``ak.read_all``
    otherwise.  The rate is based on the size of the array returned by the
    last read.  ``seed`` is unused; ``dtype`` and ``numfiles`` only appear
    in the printed header.
    """
    print(">>> arkouda {} read".format(dtype))
    num_locales = ak.get_config()["numLocales"]
    N = N_per_locale * num_locales
    print("numLocales = {}, N = {:,}, filesPerLoc = {}".format(
        num_locales, N, numfiles))

    # Placeholder so the size lookup below works even without any trial.
    a = ak.array([])

    readtimes = []
    for _ in range(trials):
        t0 = time.time()
        if parquet:
            a = ak.read_parquet(path + '*')
        else:
            a = ak.read_all(path + '*')
        t1 = time.time()
        readtimes.append(t1 - t0)
    avgread = sum(readtimes) / trials

    print("read Average time = {:.4f} sec".format(avgread))

    nb = a.size * a.itemsize
    print("read Average rate = {:.2f} GiB/sec".format(nb / 2**30 / avgread))
def time_ak_write_read(N_per_locale, trials, dtype, path, seed):
    """Time ``save`` / ``ak.load`` round trips of one random array.

    After each round the files matching ``path + '_LOCALE*'`` are removed.
    Minimum, maximum and average times are printed, followed by the
    matching rates (the minimum rate comes from the maximum time and vice
    versa).
    """
    print(">>> arkouda write/read")
    num_locales = ak.get_config()["numLocales"]
    N = N_per_locale * num_locales
    print("numLocales = {}, N = {:,}".format(num_locales, N))

    if dtype == 'int64':
        a = ak.randint(0, 2**32, N, seed=seed)
    elif dtype == 'float64':
        a = ak.randint(0, 1, N, dtype=ak.float64, seed=seed)

    writetimes = []
    readtimes = []
    for _ in range(trials):
        t0 = time.time()
        a.save(path)
        t1 = time.time()
        writetimes.append(t1 - t0)

        t0 = time.time()
        b = ak.load(path)
        t1 = time.time()
        readtimes.append(t1 - t0)

        # Clean up this round's files before the next write.
        for leftover in glob(path + '_LOCALE*'):
            os.remove(leftover)

    avgwrite = sum(writetimes) / trials
    avgread = sum(readtimes) / trials

    print("Write times: min = {:.4f} sec, max = {:.4f} sec, avg = {:.4f} sec".format(
        min(writetimes), max(writetimes), avgwrite))
    print("Read times : min = {:.4f} sec, max = {:.4f} sec, avg = {:.4f} sec".format(
        min(readtimes), max(readtimes), avgread))

    nb = a.size * a.itemsize
    gib = nb / 2**30
    print(
        "Write rates: min = {:.4f} GiB/sec, max = {:.4f} GiB/sec, avg = {:.4f} GiB/sec".format(
            gib / max(writetimes), gib / min(writetimes), gib / avgwrite))
    print(
        "Read rates : min = {:.4f} GiB/sec, max = {:.4f} GiB/sec, avg = {:.4f} GiB/sec".format(
            gib / max(readtimes), gib / min(readtimes), gib / avgread))
def time_ak_in1d(size, trials):
    """Time ``ak.in1d`` for random strings in the 'Medium' and 'Large' regimes.

    Both operands are random strings of length 1 to the module-level
    ``MAXSTRLEN``; the second operand's size comes from ``MEDIUM`` and
    ``LARGE``.  The rate counts 8 bytes per entry plus ``nbytes`` for each
    operand.
    """
    print(">>> arkouda string in1d")
    num_locales = ak.get_config()["numLocales"]
    N = size * num_locales
    a = ak.random_strings_uniform(1, MAXSTRLEN, N)

    for regime, bsize in (('Medium', MEDIUM), ('Large', LARGE)):
        print(
            "{} regime: numLocales = {} a.size = {:,} b.size = {:,}".format(
                regime, num_locales, N, bsize))
        b = ak.random_strings_uniform(1, MAXSTRLEN, bsize)

        samples = []
        for _ in range(trials):
            t0 = time.time()
            c = ak.in1d(a, b)
            t1 = time.time()
            samples.append(t1 - t0)
        tavg = sum(samples) / trials

        print("{} average time = {:.4f} sec".format(regime, tavg))
        bytes_per_sec = (a.size * 8 + a.nbytes + b.size * 8 + b.nbytes) / tavg
        print("{} average rate = {:.2f} GiB/sec".format(
            regime, bytes_per_sec / 2**30))
def time_ak_sa(vsize, strlen, trials, dtype):
    """Time ``ak.suffix_array`` on random strings of length 1 to ``strlen``.

    One suffix array is built up front and, for the first ``vsize``
    entries, each input string and its suffix array are printed.  The
    construction is then repeated ``trials`` times under the timer and the
    average time and a GiB/sec rate are printed.

    Parameters
    ----------
    vsize : int
        Strings per locale; scaled by the server's ``numLocales``.
    strlen : int
        Maximum length passed to ``ak.random_strings_uniform``.
    trials : int
        Number of timed rounds; must be non-zero.
    dtype : str
        Must be ``'str'``.  For any other value "Wrong data type" is
        printed and the function returns without running the benchmark.
    """
    print(">>> arkouda suffix array")
    cfg = ak.get_config()
    Nv = vsize * cfg["numLocales"]
    print("numLocales = {}, num of strings = {:,}".format(
        cfg["numLocales"], Nv))

    if dtype != 'str':
        # There is no input to build for other dtypes.  The previous
        # revision printed this message and then failed with a NameError
        # on the undefined value array.
        print("Wrong data type")
        return

    v = ak.random_strings_uniform(1, strlen, Nv)
    c = ak.suffix_array(v)

    print("All the random strings are as follows")
    for k in range(vsize):
        print("the {} th random tring ={}".format(k, v[k]))
        print("the {} th suffix array ={}".format(k, c[k]))
        print("")

    timings = []
    for _ in range(trials):
        start = time.time()
        c = ak.suffix_array(v)
        end = time.time()
        timings.append(end - start)
    tavg = sum(timings) / trials

    print("Average time = {:.4f} sec".format(tavg))
    # The zero factors are kept from the previous revision: only
    # ``c.bytes.size * c.offsets.itemsize`` contributes to the byte count.
    offsets_transferred = 0 * c.offsets.size * c.offsets.itemsize
    bytes_transferred = (c.bytes.size * c.offsets.itemsize) + (0 * c.bytes.size)
    bytes_per_sec = (offsets_transferred + bytes_transferred) / tavg
    print("Average rate = {:.2f} GiB/sec".format(bytes_per_sec / 2**30))
def time_ak_aggregate(N_per_locale, trials, seed):
    """Time ``GroupBy.aggregate`` for every entry of ``ak.GroupBy.Reductions``.

    The keys and the integer / boolean value arrays come from the file's
    ``generate_arrays(N, seed)`` helper.  Operations listed in the
    module-level ``BOOLOPS`` run on the boolean values, all others on the
    integer values.  The GroupBy is built once with ``assume_sorted=True``.
    """
    print(">>> arkouda aggregate")
    num_locales = ak.get_config()["numLocales"]
    N = N_per_locale * num_locales
    print("numLocales = {}, N = {:,}".format(num_locales, N))

    keys, intvals, boolvals = generate_arrays(N, seed)
    g = ak.GroupBy(keys, assume_sorted=True)

    for op in ak.GroupBy.Reductions:
        v = boolvals if op in BOOLOPS else intvals
        totalbytes = v.size * v.itemsize

        samples = []
        for _ in range(trials):
            t0 = time.time()
            res = g.aggregate(v, op)[1]
            t1 = time.time()
            samples.append(t1 - t0)
        tavg = sum(samples) / trials

        print("Aggregate {} Average time = {:.4f} sec".format(op, tavg))
        bytes_per_sec = totalbytes / tavg
        print("Aggregate {} Average rate = {:.4f} GiB/sec".format(
            op, bytes_per_sec / 2**30))
def report_mem(pre=''):
    """Print the server's memory use in TB and as a share of total memory.

    The share is the value of ``ak.get_mem_used()`` divided by
    ``numLocales * physicalMemory`` from the server configuration.

    Parameters
    ----------
    pre : str
        Text placed in front of the report line.
    """
    cfg = ak.get_config()
    # Query the server once so the TB figure and the percentage describe
    # the same sample; two separate queries could return different values.
    mem_used = ak.get_mem_used()
    used = mem_used / (cfg['numLocales'] * cfg['physicalMemory'])
    print(f"{pre} mem use: {mem_used/(1024**4): .2f} TB ({used:.1%})")
# Parse the command line and connect to the arkouda server, passing only
# the connection arguments that were actually supplied.
args = parser.parse_args()

ak.set_defaults()
ak.verbose = False
if args.server is None:
    if args.port is None:
        ak.connect()
    else:
        ak.connect(port=args.port)
else:
    if args.port is None:
        ak.connect(server=args.server)
    else:
        ak.connect(server=args.server, port=args.port)

print(ak.get_config())

# NOTE(review): with no input files only the usage line is printed; the
# script then carries on to the reads below -- confirm this is intended.
if len(args.hdffiles) == 0:
    print("usage: {} [--server server] [--port port] hdffiles ".format(sys.argv[0]))

# Datasets to load; each one becomes an entry of the dict below.
fields = ['srcIP', 'dstIP', 'srcPort', 'dstPort', 'start']

# Each dataset is read from the full list of files in a single call.
nfDF = {column: ak.read_hdf(column, args.hdffiles) for column in fields}

# Show the loaded columns.  For the IP columns the Python type is printed
# instead of a dtype.
print(nfDF['start'],nfDF['start'].dtype)
print(nfDF['srcIP'],type(nfDF['srcIP']))
print(nfDF['dstIP'],type(nfDF['dstIP']))