def run() -> None:
    """Time repeated ``yAx`` reductions on a ``Workload`` and report the outcome.

    Reads the problem sizes from ``parse_args()``, builds a ``Workload`` and a
    ``pk.TeamPolicy``, performs the reduction ``nrepeat`` times while a
    ``pk.Timer`` is running, then prints the last result, an error line when
    that result differs from ``N * M * E``, and a summary line containing the
    elapsed seconds.
    """
    values: Tuple[int, int, int, int, int, bool] = parse_args()
    # Only positions 0, 1, 3 and the last entry of the parsed tuple are used.
    N, M, E, fill = values[0], values[1], values[3], values[-1]
    nrepeat: int = 1000
    print(f"Total size S = {N * M} N = {N} M = {M} E = {E}")

    workload = Workload(N, M, E, fill)
    policy = pk.TeamPolicy(E, "auto", 32, pk.get_default_space())

    # Everything between the timer's construction and seconds() is timed.
    timer = pk.Timer()
    for _ in range(nrepeat):
        result = pk.parallel_reduce(policy, workload.yAx)
    elapsed = timer.seconds()

    print(f"Computed result for {N} x {M} x {E} is {result}")

    solution: float = N * M * E
    if result != solution:
        pk.printf("Error: result (%lf) != solution (%lf)\n", result, solution)

    print(f"N({N}) M({M}) E({E}) nrepeat({nrepeat}) problem(MB) time({elapsed}) bandwidth(GB/s)")
def test_outer_for(self):
    # Reference value: self.value accumulated self.M times, starting from 0.
    expected_result: float = 0
    for _ in range(self.M):
        expected_result += self.value

    policy = pk.TeamPolicy(self.N, pk.AUTO, space=self.execution_space)
    pk.parallel_for(policy, self.functor.outer_for)

    # Entries 0 .. self.N - 1 of for_view must each equal the reference value.
    for idx in range(self.N):
        self.assertEqual(expected_result, self.functor.for_view[idx])
def run(self):
    """Run the ``yAx`` reduction ``self.nrepeat`` times and record the timing.

    The value returned by the last reduction is stored in ``self.result`` and
    the timer reading taken after the loop is stored in ``self.timer_result``.
    """
    timer = pk.Timer()
    for _ in range(self.nrepeat):
        # A fresh policy is built on every repetition, inside the timed region.
        policy = pk.TeamPolicy(self.E, "auto", 32)
        self.result = pk.parallel_reduce("team_vector_loop", policy, self.yAx)
    self.timer_result = timer.seconds()
def run(self):
    """Run the ``yAx`` reduction ``self.nrepeat`` times and record the timing.

    The value returned by the last reduction is stored in ``self.result`` and
    the timer reading taken after the loop is stored in ``self.timer_result``.
    """
    timer = pk.Timer()
    for _ in range(self.nrepeat):
        # A fresh policy is built on every repetition, inside the timed region.
        policy = pk.TeamPolicy(self.N, "auto")
        self.result = pk.parallel_reduce("team_policy", policy, self.yAx)
    self.timer_result = timer.seconds()
def test_yAx_plus1(self):
    # Host-side reference: for each row j, accumulate A[j][i] * x[i] over i,
    # then add (y[j] + 1) times that row total to the running result.
    expected_result: float = 0
    for row in range(self.N):
        row_total: float = 0
        for col in range(self.M):
            row_total += self.A[row][col] * self.x[col]
        expected_result += (self.y[row] + 1) * row_total

    policy = pk.TeamPolicy(self.N, pk.AUTO, space=self.execution_space)
    reduced = pk.parallel_reduce(policy, self.functor.yAx_plus1)
    self.assertEqual(expected_result, reduced)
def run() -> None:
    """Time repeated ``yAx`` reductions over freshly built views and report.

    Reads the problem sizes from ``parse_args()``, allocates the views ``y``
    ([E, N]), ``x`` ([E, M]) and ``A`` ([E, N, M]) and sets every entry to 1,
    either through ``fill`` or element by element. The reduction is then run
    ``nrepeat`` times while a ``pk.Timer`` is running, after which the last
    result, an error line when it differs from ``N * M * E``, and a summary
    line with the elapsed seconds are printed.
    """
    values: Tuple[int, int, int, int, int, bool] = parse_args()
    # Only positions 0, 1, 3 and the last entry of the parsed tuple are used.
    N, M, E, fill = values[0], values[1], values[3], values[-1]
    nrepeat: int = 1000
    print(f"Total size S = {N * M} N = {N} M = {M} E = {E}")

    right = pk.Layout.LayoutRight
    y: pk.View2D = pk.View([E, N], pk.double, layout=right)
    x: pk.View2D = pk.View([E, M], pk.double, layout=right)
    A: pk.View3D = pk.View([E, N, M], pk.double, layout=right)

    if not fill:
        # Element-wise initialisation: every entry of y, x and A becomes 1.
        for batch in range(E):
            for row in range(N):
                y[batch][row] = 1
            for col in range(M):
                x[batch][col] = 1
            for row in range(N):
                for col in range(M):
                    A[batch][row][col] = 1
    else:
        for view in (y, x, A):
            view.fill(1)

    policy = pk.TeamPolicy(E, "auto", 32, pk.get_default_space())

    # Everything between the timer's construction and seconds() is timed.
    timer = pk.Timer()
    for _ in range(nrepeat):
        result = pk.parallel_reduce(policy, yAx, N=N, M=M, y=y, x=x, A=A)
    elapsed = timer.seconds()

    print(f"Computed result for {N} x {M} x {E} is {result}")

    solution: float = N * M * E
    if result != solution:
        pk.printf("Error: result (%lf) != solution (%lf)\n", result, solution)

    print(f"N({N}) M({M}) E({E}) nrepeat({nrepeat}) problem(MB) time({elapsed}) bandwidth(GB/s)")
def test_yAx_vector(self):
    # Host-side reference, accumulated in the same nesting order as the data:
    # per e, per row j, sum A_vector[e][j][i] * x_vector[e][i] over i, weight
    # that by y_vector[e][j], and add the per-e total to the running result.
    expected_result: float = 0
    for batch in range(self.E):
        batch_total: float = 0
        for row in range(self.N):
            row_total: float = 0
            for col in range(self.M):
                row_total += self.A_vector[batch][row][col] * self.x_vector[batch][col]
            batch_total += self.y_vector[batch][row] * row_total
        expected_result += batch_total

    policy = pk.TeamPolicy(self.E, pk.AUTO, 32, self.execution_space)
    reduced = pk.parallel_reduce(policy, self.functor.yAx_vector)
    self.assertEqual(expected_result, reduced)
print("S must be 0 (shared scratch memory not supported)") exit(1) space = pk.ExecutionSpace.OpenMP if args.execution_space: space = pk.ExecutionSpace(args.execution_space) N = args.N K = args.K R = args.R U = args.U F = args.F T = args.T S = args.S scalar_size = 8 pk.set_default_space(space) r = pk.TeamPolicy(N, T, space=pk.get_default_space()) w = Benchmark_double_8(N, K, R, args.D, F, T, S) timer = pk.Timer() pk.parallel_for(r, w.benchmark) seconds = timer.seconds() num_bytes = 1.0 * N * K * R * 3 * scalar_size flops = 1.0 * N * K * R * (F * 2 * U + 2 * (U - 1)) print(f"NKRUFTS: {N} {K} {R} {U} {F} {T} {S} Time: {seconds} " + f"Bandwidth: {1.0 * num_bytes / seconds / (1024**3)} GiB/s GFlop/s: {1e-9 * flops / seconds}") print(w.C)
def run(self):
    """Launch ``self.benchmark`` once and store the elapsed time in ``self.seconds``."""
    timer = pk.Timer()
    policy = pk.TeamPolicy(self.N, self.T)
    pk.parallel_for("bytes_and_flops", policy, self.benchmark)
    # The timer is read only after pk.fence() has returned.
    pk.fence()
    self.seconds = timer.seconds()
def run(self) -> None:
    """Dispatch either ``self.pfor`` or ``self.preduce`` over ``self.nbins``.

    When ``self.parallel_for`` is truthy, ``self.pfor`` is launched with
    ``pk.parallel_for``. Otherwise ``self.preduce`` is launched with
    ``pk.parallel_reduce`` and its return value is stored in ``self.PE``.
    Both paths use a ``pk.TeamPolicy(self.nbins, 1, 8)``.
    """
    if not self.parallel_for:
        reduce_policy = pk.TeamPolicy(self.nbins, 1, 8)
        self.PE = pk.parallel_reduce(reduce_policy, self.preduce)
        return

    for_policy = pk.TeamPolicy(self.nbins, 1, 8)
    pk.parallel_for(for_policy, self.pfor)