def run() -> None:
    values: Tuple[int, int, int, int, int, bool] = parse_args()
    N: int = values[0]
    M: int = values[1]
    nrepeat: int = 100
    print(f"Total size S = {N * M} N = {N} M = {M}")

    p = pk.RangePolicy(pk.get_default_space(), 0, N)
    w = Workload(N, M)
    pk.parallel_for(p, w.y_init)
    pk.parallel_for(pk.RangePolicy(pk.get_default_space(), 0, M), w.x_init)
    pk.parallel_for(p, w.matrix_init)

    timer = pk.Timer()
    for i in range(nrepeat):
        result = pk.parallel_reduce(p, w.yAx)
    timer_result = timer.seconds()

    print(f"Computed result for {N} x {M} is {result}")
    solution = N * M

    if result != solution:
        pk.printf("Error: result (%lf) != solution (%lf)\n", result, solution)

    print(f"N({N}) M({M}) nrepeat({nrepeat}) problem(MB) time({timer_result}) bandwidth(GB/s)")
def run() -> None:
    values: Tuple[int, int, int, int, int, bool] = parse_args()
    N: int = values[0]
    M: int = values[1]
    nrepeat: int = 1
    print(f"Total size S = {N * M} N = {N} M = {M}")

    y = pk.View([N], pk.double)
    x = pk.View([M], pk.double)
    A = pk.View([N * M], pk.double)

    p = pk.RangePolicy(pk.get_default_space(), 0, N)
    pk.parallel_for(p, y_init, y=y)
    # y_init is reused to fill x: it just writes into whatever view is bound to y
    pk.parallel_for(pk.RangePolicy(pk.get_default_space(), 0, M), y_init, y=x)
    pk.parallel_for(p, matrix_init, M=M, A=A)

    timer = pk.Timer()
    for i in range(nrepeat):
        result = pk.parallel_reduce(p, yAx, M=M, y=y, x=x, A=A)
    timer_result = timer.seconds()

    print(f"Computed result for {N} x {M} is {result}")
    solution = N * M

    if result != solution:
        pk.printf("Error: result (%lf) != solution (%lf)\n", result, solution)

    print(f"N({N}) M({M}) nrepeat({nrepeat}) problem(MB) time({timer_result}) bandwidth(GB/s)")
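# For context: the runner above references standalone workunits (y_init,
# matrix_init, yAx) defined elsewhere in the example. Below is a minimal
# sketch of what they might look like, inferred from the calls above; the
# bodies are assumptions, not the repository's exact definitions.
import pykokkos as pk

@pk.workunit
def y_init(i: int, y: pk.View1D[pk.double]):
    # assumed: fill with ones so the inner product y^T(Ax) comes out to N * M
    y[i] = 1

@pk.workunit
def matrix_init(j: int, M: int, A: pk.View1D[pk.double]):
    # assumed: row-major fill of the flattened N x M matrix
    for i in range(M):
        A[j * M + i] = 1

@pk.workunit
def yAx(j: int, acc: pk.Acc[pk.double], M: int,
        y: pk.View1D[pk.double], x: pk.View1D[pk.double], A: pk.View1D[pk.double]):
    # one row's partial dot product, accumulated into the reduction result
    temp2: float = 0
    for i in range(M):
        temp2 += A[j * M + i] * x[i]
    acc += y[j] * temp2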
def run() -> None:
    random.seed(1010101)

    indices = 8192
    data = 33554432
    repeats = 10
    space = pk.ExecutionSpace.OpenMP

    parser = argparse.ArgumentParser()
    parser.add_argument("--indices", type=int)
    parser.add_argument("--data", type=int)
    parser.add_argument("--repeats", type=int)
    parser.add_argument("--atomics", action="store_true")
    parser.add_argument("--execution_space", type=str)
    args = parser.parse_args()
    if args.indices:
        indices = args.indices
    if args.data:
        data = args.data
    if args.repeats:
        repeats = args.repeats
    use_atomics = args.atomics
    if args.execution_space:
        space = pk.ExecutionSpace(args.execution_space)

    pk.set_default_space(space)

    w = Benchmark(indices, data, repeats, use_atomics)
    range_indices = pk.RangePolicy(pk.get_default_space(), 0, indices)
    range_data = pk.RangePolicy(pk.get_default_space(), 0, data)

    print("Reports fastest timing per kernel")
    print("Creating Views...")
    print("Memory Sizes:")
    print(f"- Elements: {data} ({1e-6*data*8} MB)")
    print(f"- Indices: {indices} ({1e-6*indices*8} MB)")
    print(f"- Atomics: {'yes' if use_atomics else 'no'}")
    print(f"Benchmark kernels will be performed for {repeats} iterations")

    print("Initializing Views...")
    pk.parallel_for(range_data, w.init_data)
    pk.parallel_for(range_indices, w.init_indices)

    print("Starting benchmarking...")
    timer = pk.Timer()
    for i in range(repeats):
        # draw a fresh set of random indices each repeat; use a distinct
        # loop variable so it does not shadow the outer repeat counter
        for j in range(indices):
            w.indices[j] = random.randrange(data)

        if use_atomics:
            pk.parallel_for(range_indices, w.run_gups_atomic)
        else:
            pk.parallel_for(range_indices, w.run_gups)

    gupsTime = timer.seconds()
    print(f"GUP/s Random: {1e-9 * repeats * indices / gupsTime}")
    print(w.data)
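# The Benchmark functor's kernels are not shown in this excerpt. Below is a
# minimal sketch of the two update workunits, assuming a GUPS-style random
# read-modify-write; the additive update, view layout, and the
# pk.atomic_fetch_add call are assumptions based on this runner, not the
# repository's definitions.
import pykokkos as pk

@pk.functor
class Benchmark:
    def __init__(self, indices: int, data: int, repeats: int, use_atomics: bool):
        self.indices: pk.View1D[pk.int64] = pk.View([indices], pk.int64)
        self.data: pk.View1D[pk.int64] = pk.View([data], pk.int64)
        self.repeats: int = repeats
        self.use_atomics: bool = use_atomics

    @pk.workunit
    def init_data(self, i: int):
        self.data[i] = 0

    @pk.workunit
    def init_indices(self, i: int):
        self.indices[i] = 0

    @pk.workunit
    def run_gups(self, i: int):
        # plain update: concurrent writes to the same element may race
        self.data[self.indices[i]] += 1

    @pk.workunit
    def run_gups_atomic(self, i: int):
        # same update made race-free with an atomic read-modify-write
        pk.atomic_fetch_add(self.data, [self.indices[i]], 1)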
def run() -> None:
    values: Tuple[int, int, int, int, int, bool] = parse_args()
    N: int = values[0]
    M: int = values[1]
    fill: bool = values[-1]
    nrepeat: int = 100
    print(f"Total size S = {N * M} N = {N} M = {M}")

    pk.set_default_space(pk.ExecutionSpace.Cuda)

    y: pk.View1D = pk.View([N], pk.double)
    x: pk.View1D = pk.View([M], pk.double)
    A: pk.View2D = pk.View([N, M], pk.double)

    p = pk.RangePolicy(pk.get_default_space(), 0, N)
    pk.parallel_for(p, y_init, y=y)
    pk.parallel_for(pk.RangePolicy(pk.get_default_space(), 0, M), y_init, y=x)
    pk.parallel_for(p, matrix_init, M=M, A=A)

    # if fill:
    #     y.fill(1)
    #     x.fill(1)
    #     A.fill(1)
    # else:
    #     for i in range(N):
    #         y[i] = 1
    #     for i in range(M):
    #         x[i] = 1
    #     for j in range(N):
    #         for i in range(M):
    #             A[j][i] = 1

    timer = pk.Timer()
    for i in range(nrepeat):
        result = pk.parallel_reduce(p, yAx, M=M, y=y, x=x, A=A)
    timer_result = timer.seconds()

    print(f"Computed result for {N} x {M} is {result}")
    solution: float = N * M

    if result != solution:
        pk.printf("Error: result (%lf) != solution (%lf)\n", result, solution)

    print(
        f"N({N}) M({M}) nrepeat({nrepeat}) problem(MB) time({timer_result}) bandwidth(GB/s)"
    )
def setUp(self):
    self.threads: int = 50
    self.value: int = 7
    self.functor = Add1DTestScanFunctor(self.threads, self.value)
    self.range_policy = pk.RangePolicy(pk.ExecutionSpace.Default, 0, self.threads)
def test_dynamic2D(self):
    expected_result: int = self.i_4 * self.i_1 * self.i_2
    result: int = pk.parallel_reduce(
        pk.RangePolicy(pk.ExecutionSpace.Default, 0, self.i_2),
        self.functor.dynamic2D)
    self.assertEqual(expected_result, result)
def run() -> None:
    parser = argparse.ArgumentParser()
    parser.add_argument('iterations', type=int)
    parser.add_argument('length', type=int)
    parser.add_argument('offset', nargs='?', type=int, default=0)
    args = parser.parse_args()
    iterations = args.iterations
    length = args.length
    offset = args.offset
    scalar = 3

    if iterations < 1:
        sys.exit("ERROR: iterations must be >= 1")
    if length <= 0:
        sys.exit("ERROR: vector length must be positive")
    # emulate the C++ example
    if offset < 0:
        sys.exit("ERROR: offset must be nonnegative")

    print("Number of iterations = ", iterations)
    print("Vector length = ", length)
    print("Offset = ", offset)

    p = pk.RangePolicy(pk.ExecutionSpace.OpenMP, 0, length)
    w = Workload(iterations, length, offset, scalar)
    pk.parallel_for(p, w.init_views)
    # pk.fence()

    timer = pk.Timer()
    for i in range(iterations):
        pk.parallel_for(p, w.nstream)
    # pk.fence()
    nstream_time = timer.seconds()

    # verify correctness
    ar: float = 0
    br: float = 2
    cr: float = 2
    for i in range(iterations):
        ar += br + scalar * cr
    ar *= length

    asum = pk.parallel_reduce(p, w.res_reduce)
    # pk.fence()

    epsilon: float = 1.0e-8
    if (abs(ar - asum) / asum > epsilon):
        print("ERROR: Failed Validation on output array")
    else:
        avgtime: float = nstream_time / iterations
        nbytes: float = 4.0 * length * 4
        print("Solution validates")
        print("Rate (MB/s): %.2f" % (1.e-6 * nbytes / avgtime))
        print("Avg time (ms): %f" % (avgtime / 1.e-3))
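# Why the reference value works: assuming the standard PRK nstream update
# a[i] += b[i] + scalar * c[i], with b and c held at 2 and scalar = 3, every
# element grows by 2 + 3 * 2 = 8 per iteration, so the array sum after
# `iterations` sweeps is 8 * iterations * length. A quick host-side check of
# that arithmetic (values here are illustrative only):
iterations, length, scalar = 10, 1000, 3
ar, br, cr = 0.0, 2.0, 2.0
for _ in range(iterations):
    ar += br + scalar * cr
ar *= length
assert ar == 8.0 * iterations * length  # 80000.0 for these sizes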
def setUp(self):
    self.threads: int = 50
    self.i_1: int = 7
    self.i_2: int = 2
    self.b_1: bool = False
    self.b_2: bool = True
    self.functor = ASTTestReduceFunctor(self.threads, self.i_1, self.i_2, self.b_1, self.b_2)
    self.range_policy = pk.RangePolicy(pk.ExecutionSpace.Default, 0, self.threads)
def setUp(self):
    self.threads: int = 10
    self.i_1: int = 10
    self.i_2: int = 15
    self.i_3: int = 20
    self.i_4: int = 10
    self.functor = ViewsTestFunctor(self.threads, self.i_1, self.i_2, self.i_3, self.i_4)
    self.range_policy = pk.RangePolicy(pk.ExecutionSpace.Default, 0, self.threads)
def setUp(self):
    self.threads: int = 1
    self.i_1: int = 5
    self.i_2: int = 2
    self.f_1: float = 7.0
    self.f_2: float = 3.0
    self.functor = AtomicsTestFunctor(self.threads, self.i_1, self.i_2, self.f_1, self.f_2)
    self.range_policy = pk.RangePolicy(pk.ExecutionSpace.Default, 0, self.threads)
def setUp(self):
    self.threads: int = 50
    self.i_1: int = 5
    self.i_2: int = 1
    self.f_1: float = 5.5
    self.b_1: bool = True
    self.functor = ClasstypesTestFunctor(self.threads, self.i_1, self.i_2, self.f_1, self.b_1)
    self.range_policy = pk.RangePolicy(pk.ExecutionSpace.Default, 0, self.threads)
def setUp(self):
    self.threads: int = 50
    self.i_1: int = 7
    self.i_2: int = 2
    self.f_1: float = 5.5
    self.f_2: float = 1.3
    self.b_1: bool = True
    self.functor = KokkosFunctionsTestReduceFunctor(
        self.threads, self.i_1, self.i_2, self.f_1, self.f_2, self.b_1)
    self.range_policy = pk.RangePolicy(pk.ExecutionSpace.Default, 0, self.threads)
space = pk.ExecutionSpace.OpenMP
if args.execution_space:
    space = pk.ExecutionSpace(args.execution_space)
pk.set_default_space(space)

N = args.N
K = args.K
D = args.D
R = args.R
U = args.U
F = args.F
scalar_size = 8

policy = pk.RangePolicy(pk.get_default_space(), 0, N)
w = Benchmark_double_8(N, K, D, R, F)

timer = pk.Timer()
for r in range(R):
    pk.parallel_for(policy, w.benchmark)
    pk.fence()
seconds = timer.seconds()

num_bytes = 1.0 * N * K * R * (2 * scalar_size + 4) + N * R * scalar_size
flops = 1.0 * N * K * R * (F * 2 * U + 2 * (U - 1))
gather_ops = 1.0 * N * K * R * 2
print(
    f"SNKDRUF: {scalar_size/4} {N} {K} {D} {R} {U} {F} Time: {seconds} " +
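# Plugging sample sizes into the traffic and flop models above, as a worked
# check (the parameter values here are illustrative, not benchmark defaults):
N, K, R, U, F = 1024, 512, 10, 8, 2
scalar_size = 8  # bytes per double

# each of the N*K*R inner passes moves two scalars plus a 4-byte index,
# and each of the N*R rows writes one scalar result
num_bytes = 1.0 * N * K * R * (2 * scalar_size + 4) + N * R * scalar_size
flops = 1.0 * N * K * R * (F * 2 * U + 2 * (U - 1))
print(f"{num_bytes * 1e-9:.3f} GB moved, {flops * 1e-9:.3f} GFLOP")
# -> 0.105 GB moved, 0.241 GFLOP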
import pykokkos as pk


@pk.functor
class Workload:
    def __init__(self, N: int):
        self.A: pk.View1D[pk.int32] = pk.View([N], pk.int32)

    @pk.workunit
    def init(self, i: int):
        self.A[i] = i

    @pk.workunit
    def scan(self, i: int, acc: pk.Acc[pk.double], last_pass: bool):
        acc += self.A[i]
        if last_pass:
            self.A[i] = acc


if __name__ == "__main__":
    N = 10
    w = Workload(N)
    p = pk.RangePolicy(pk.ExecutionSpace.OpenMP, 0, N)
    pk.parallel_for(p, w.init)

    timer = pk.Timer()
    result = pk.parallel_scan(p, w.scan)
    timer_result = timer.seconds()
    print(f"{w.A} total={result} time({timer_result})")
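# Hand check: init fills A with 0..9 and scan is an inclusive prefix sum, so
# A becomes [0, 1, 3, 6, 10, 15, 21, 28, 36, 45]. Assuming parallel_scan
# returns the final accumulator value (as the print suggests), total=45.
expected = [sum(range(i + 1)) for i in range(10)]
assert expected == [0, 1, 3, 6, 10, 15, 21, 28, 36, 45]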
parser.add_argument("-n", "--numtimes", type=int, help="Run the test NUM times (NUM >= 2)") parser.add_argument("-space", "--execution_space", type=str) args = parser.parse_args() if args.arraysize: array_size = args.arraysize if args.numtimes: num_times = args.numtimes if args.execution_space: space = pk.ExecutionSpace(space) a: pk.View1D[pk.double] = pk.View([array_size], pk.double) b: pk.View1D[pk.double] = pk.View([array_size], pk.double) c: pk.View1D[pk.double] = pk.View([array_size], pk.double) p = pk.RangePolicy(space, 0, array_size) pk.parallel_for(p, init_arrays, a=a, b=b, c=c, initA=startA, initB=startB, initC=startC) timer = pk.Timer() timings = [[] for i in range(5)] for i in range(num_times): pk.parallel_for(p, copy, a=a, c=c) timings[0].append(timer.seconds()) timer.reset() pk.parallel_for(p, mul, b=b, scalar=scalar, c=c) timings[1].append(timer.seconds()) timer.reset() pk.parallel_for(p, add, a=a, b=b, c=c) timings[2].append(timer.seconds())
def create_lattice(self, comm: Comm) -> None:
    s: System = copy.deepcopy(self.system)

    if self.lattice_style == LatticeType.LATTICE_SC.value:
        self.system.domain_x = self.lattice_constant * self.lattice_nx
        self.system.domain_y = self.lattice_constant * self.lattice_ny
        self.system.domain_z = self.lattice_constant * self.lattice_nz

        comm.create_domain_decomposition()
        s = copy.deepcopy(self.system)

        ix_start: int = math.floor(s.sub_domain_lo_x / s.domain_x * self.lattice_nx - 0.5)
        iy_start: int = math.floor(s.sub_domain_lo_y / s.domain_y * self.lattice_ny - 0.5)
        iz_start: int = math.floor(s.sub_domain_lo_z / s.domain_z * self.lattice_nz - 0.5)
        ix_end: int = math.floor(s.sub_domain_hi_x / s.domain_x * self.lattice_nx + 0.5)
        iy_end: int = math.floor(s.sub_domain_hi_y / s.domain_y * self.lattice_ny + 0.5)
        iz_end: int = math.floor(s.sub_domain_hi_z / s.domain_z * self.lattice_nz + 0.5)

        # First pass: count the lattice points inside this sub-domain
        n: int = 0
        for iz in range(iz_start, iz_end + 1):
            ztmp: float = self.lattice_constant * (iz + self.lattice_offset_z)
            for iy in range(iy_start, iy_end + 1):
                ytmp: float = self.lattice_constant * (iy + self.lattice_offset_y)
                for ix in range(ix_start, ix_end + 1):
                    xtmp: float = self.lattice_constant * (ix + self.lattice_offset_x)
                    if (xtmp >= s.sub_domain_lo_x and ytmp >= s.sub_domain_lo_y
                            and ztmp >= s.sub_domain_lo_z and xtmp < s.sub_domain_hi_x
                            and ytmp < s.sub_domain_hi_y and ztmp < s.sub_domain_hi_z):
                        n += 1

        self.system.N_local = n
        self.system.N = n
        self.system.grow(n)
        s = copy.deepcopy(self.system)

        # Second pass: n is intentionally not reset, so grow() reserves
        # roughly twice the local count (cf. the FCC branch's n *= 2)
        for iz in range(iz_start, iz_end + 1):
            ztmp: float = self.lattice_constant * (iz + self.lattice_offset_z)
            for iy in range(iy_start, iy_end + 1):
                ytmp: float = self.lattice_constant * (iy + self.lattice_offset_y)
                for ix in range(ix_start, ix_end + 1):
                    xtmp: float = self.lattice_constant * (ix + self.lattice_offset_x)
                    if (xtmp >= s.sub_domain_lo_x and ytmp >= s.sub_domain_lo_y
                            and ztmp >= s.sub_domain_lo_z and xtmp < s.sub_domain_hi_x
                            and ytmp < s.sub_domain_hi_y and ztmp < s.sub_domain_hi_z):
                        n += 1

        self.system.grow(n)
        s = copy.deepcopy(self.system)

        # Third pass: place the atoms, guarded by the same bounds check as
        # the counting passes so the fill matches the counted capacity
        n = 0
        for iz in range(iz_start, iz_end + 1):
            ztmp: float = self.lattice_constant * (iz + self.lattice_offset_z)
            for iy in range(iy_start, iy_end + 1):
                ytmp: float = self.lattice_constant * (iy + self.lattice_offset_y)
                for ix in range(ix_start, ix_end + 1):
                    xtmp: float = self.lattice_constant * (ix + self.lattice_offset_x)
                    if (xtmp >= s.sub_domain_lo_x and ytmp >= s.sub_domain_lo_y
                            and ztmp >= s.sub_domain_lo_z and xtmp < s.sub_domain_hi_x
                            and ytmp < s.sub_domain_hi_y and ztmp < s.sub_domain_hi_z):
                        s.x[n][0] = xtmp
                        s.x[n][1] = ytmp
                        s.x[n][2] = ztmp
                        s.type[n] = random.randint(0, s.ntypes - 1)
                        s.id[n] = n + 1
                        n += 1

        comm.reduce_int(self.system.N, 1)

        N_local_offset: int = n
        comm.scan_int(N_local_offset, 1)
        for i in range(n):
            s.id[i] += N_local_offset - n

        if self.system.do_print:
            print(f"Atoms: {self.system.N} {self.system.N_local}")

    if self.lattice_style == LatticeType.LATTICE_FCC.value:
        self.system.domain_x = self.lattice_constant * self.lattice_nx
        self.system.domain_y = self.lattice_constant * self.lattice_ny
        self.system.domain_z = self.lattice_constant * self.lattice_nz

        comm.create_domain_decomposition()
        s = copy.deepcopy(self.system)

        basis: List[List[float]] = [[0.0, 0.0, 0.0], [0.5, 0.5, 0.0],
                                    [0.5, 0.0, 0.5], [0.0, 0.5, 0.5]]
        basis_view = pk.View([4, 3], pk.double)
        for i in range(4):
            basis_view[i][0] = basis[i][0]
            basis_view[i][1] = basis[i][1]
            basis_view[i][2] = basis[i][2]

        for i in range(4):
            basis_view[i][0] += self.lattice_offset_x
            basis_view[i][1] += self.lattice_offset_y
            basis_view[i][2] += self.lattice_offset_z

        print(f"{s.sub_domain_lo_x} {s.domain_x} {self.lattice_nx} - 0.5")
        ix_start: int = math.floor(s.sub_domain_lo_x / s.domain_x * self.lattice_nx - 0.5)
        iy_start: int = math.floor(s.sub_domain_lo_y / s.domain_y * self.lattice_ny - 0.5)
        iz_start: int = math.floor(s.sub_domain_lo_z / s.domain_z * self.lattice_nz - 0.5)
        ix_end: int = math.floor(s.sub_domain_hi_x / s.domain_x * self.lattice_nx + 0.5)
        iy_end: int = math.floor(s.sub_domain_hi_y / s.domain_y * self.lattice_ny + 0.5)
        iz_end: int = math.floor(s.sub_domain_hi_z / s.domain_z * self.lattice_nz + 0.5)

        init_s = init_system(s, ix_start, ix_end, iy_start, iy_end, iz_start, iz_end,
                             self.lattice_constant, basis_view)
        n: int = pk.parallel_reduce(
            "init_s", pk.RangePolicy(pk.Default, iz_start + 1, iz_end + 1), init_s.get_n)
        # n: int = calculate_n(ix_start, ix_end, iy_start, iy_end, iz_start, iz_end,
        #                      self.lattice_constant, np.array(basis),
        #                      s.sub_domain_lo_x, s.sub_domain_lo_y, s.sub_domain_lo_z,
        #                      s.sub_domain_hi_x, s.sub_domain_hi_y, s.sub_domain_hi_z)

        self.system.N_local = n
        self.system.N = n
        # Instead of calling get_n twice, multiply by 2 (unlike the C++ version)
        n *= 2
        self.system.grow(n)
        s = self.system

        n: int = init_x(ix_start, ix_end, iy_start, iy_end, iz_start, iz_end,
                        self.lattice_constant, basis_view.data,
                        s.sub_domain_lo_x, s.sub_domain_lo_y, s.sub_domain_lo_z,
                        s.sub_domain_hi_x, s.sub_domain_hi_y, s.sub_domain_hi_z,
                        s.x.data, s.type.data, s.id.data, s.ntypes)

        N_local_offset: int = n
        comm.scan_int(N_local_offset, 1)
        to_add: int = N_local_offset - n
        s.id.data += to_add

        comm.reduce_int(self.system.N, 1)
        if self.system.do_print:
            print(f"Atoms: {self.system.N} {self.system.N_local}")

    s = self.system
    total_mass: float = 0.0
    total_momentum_x: float = 0.0
    total_momentum_y: float = 0.0
    total_momentum_z: float = 0.0

    # Mix the temperature seed into a 32-bit hash, byte by byte;
    # bytes >= 128 are reinterpreted as signed (C-style char) values
    ibase: int = self.temperature_seed
    ibase &= 0xffffffff
    ibase_bin = list(ibase.to_bytes(4, "little"))
    for i in range(4):
        if ibase_bin[i] & (1 << 7):
            ibase_bin[i] -= 1 << 8

    hash_uint: int = 0
    for i in ibase_bin:
        hash_uint += i
        hash_uint &= 0xFFFFFFFF
        hash_uint += hash_uint << 10
        hash_uint &= 0xFFFFFFFF
        hash_uint ^= hash_uint >> 6
        hash_uint &= 0xFFFFFFFF

    x_bytes = np.reshape(np.frombuffer(s.x.data.tobytes(), dtype=np.byte),
                         (s.x.shape[0], s.x.shape[1], 8)).astype(int)
    total_mass, total_momentum_x, total_momentum_y, total_momentum_z = init_v(
        hash_uint, x_bytes, s.v.data, s.mass.data, s.type.data, self.system.N_local)

    s.q.fill(0.0)

    comm.reduce_float(total_momentum_x, 1)
    comm.reduce_float(total_momentum_y, 1)
    comm.reduce_float(total_momentum_z, 1)
    comm.reduce_float(total_mass, 1)

    # Subtract the net center-of-mass velocity, then rescale to the target temperature
    system_vx: float = total_momentum_x / total_mass
    system_vy: float = total_momentum_y / total_mass
    system_vz: float = total_momentum_z / total_mass
    system_v = np.array([system_vx, system_vy, system_vz])
    s.v.data -= system_v

    temp = Temperature(comm)
    T: float = temp.compute(self.system)
    T_init_scale: float = math.sqrt(self.temperature_target / T)
    s.v.data *= T_init_scale
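# The seed-mixing loop in create_lattice appears to be the per-byte step of
# the Jenkins one-at-a-time hash (add, add-shift, xor-shift). A compact
# standalone equivalent, assuming a non-negative 32-bit seed (illustrative
# sketch, not code from this repository):
def jenkins_mix(seed: int) -> int:
    h = 0
    for byte in (seed & 0xFFFFFFFF).to_bytes(4, "little"):
        b = byte - 256 if byte & 0x80 else byte  # reinterpret as signed char
        h = (h + b) & 0xFFFFFFFF
        h = (h + (h << 10)) & 0xFFFFFFFF
        h = (h ^ (h >> 6)) & 0xFFFFFFFF
    return h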
indices = args.indices
if args.data:
    data = args.data
if args.repeats:
    repeats = args.repeats
use_atomics = args.atomics
if args.execution_space:
    space = pk.ExecutionSpace(args.execution_space)

pk.set_default_space(space)

indices_view: pk.View1D[pk.int64] = pk.View([indices], pk.int64)
data_view: pk.View1D[pk.int64] = pk.View([data], pk.int64)
datum: pk.int64 = -1

range_indices = pk.RangePolicy(pk.get_default_space(), 0, indices)
range_data = pk.RangePolicy(pk.get_default_space(), 0, data)

print("Reports fastest timing per kernel")
print("Creating Views...")
print("Memory Sizes:")
print(f"- Elements: {data} ({1e-6*data*8} MB)")
print(f"- Indices: {indices} ({1e-6*indices*8} MB)")
print(f"- Atomics: {'yes' if use_atomics else 'no'}")
print(f"Benchmark kernels will be performed for {repeats} iterations")

print("Initializing Views...")
pk.parallel_for(range_data, init_data, data=data_view)
pk.parallel_for(range_indices, init_indices, indices=indices_view)

print("Starting benchmarking...")