def test_subviews_sum(self):
    """Reduction over subviews should match the scaled full-view sum."""
    expected_result: int = self.threads * (self.i_1 * 2)
    # Run the full-view reduction first purely for its side effect of
    # initializing the underlying views; its return value is not checked.
    pk.parallel_reduce(self.range_policy, self.functor.views)
    result: int = pk.parallel_reduce(self.range_policy, self.functor.subviews)
    self.assertEqual(expected_result, result)
def run(self):
    """Time `nrepeat` reductions of the subview-based yAx kernel."""
    timer = pk.Timer()
    for _ in range(self.nrepeat):
        self.result = pk.parallel_reduce("subview", self.N, self.yAx)
    self.timer_result = timer.seconds()
# Per-row team reduction: accumulates y[e][j] * (row j of A[e] dot x[e]).
# NOTE(review): `e`, `team_member`, and `self` are free variables captured
# from the enclosing workunit scope — this closure is translated by
# pykokkos, so its structure is kept exactly as written.
def team_reduce(j: int, team_acc: pk.Acc[pk.double]):
    # Innermost (vector-level) dot-product contribution for column i.
    def vector_reduce(i: int, vector_acc: pk.Acc[pk.double]):
        vector_acc += self.A_vector[e][j][i] * self.x_vector[e][i]
    # Reduce the row dot product across the thread's vector lanes.
    tempM: float = pk.parallel_reduce(pk.ThreadVectorRange(team_member, self.M), vector_reduce)
    team_acc += self.y_vector[e][j] * tempM
def test_dynamic2D(self):
    """Reduce over a dynamically sized 2D workload and check the total."""
    expected_result: int = self.i_4 * self.i_1 * self.i_2
    policy = pk.RangePolicy(pk.ExecutionSpace.Default, 0, self.i_2)
    result: int = pk.parallel_reduce(policy, self.functor.dynamic2D)
    self.assertEqual(expected_result, result)
def run() -> None:
    """Driver: build y, x, A, run nrepeat y^T*A*x reductions, report timing."""
    values: Tuple[int, int, int, int, int, bool] = parse_args()
    N: int = values[0]
    M: int = values[1]
    nrepeat: int = 1
    print(f"Total size S = {N * M} N = {N} M = {M}")

    y = pk.View([N], pk.double)
    x = pk.View([M], pk.double)
    A = pk.View([N * M], pk.double)

    p = pk.RangePolicy(pk.get_default_space(), 0, N)
    pk.parallel_for(p, y_init, y=y)
    # NOTE(review): x is filled via the y_init workunit (bound with y=x) —
    # presumably both fill with the same constant; confirm against the kernel.
    pk.parallel_for(pk.RangePolicy(pk.get_default_space(), 0, M), y_init, y=x)
    pk.parallel_for(p, matrix_init, M=M, A=A)

    timer = pk.Timer()
    for _ in range(nrepeat):
        result = pk.parallel_reduce(p, yAx, M=M, y=y, x=x, A=A)
    timer_result = timer.seconds()

    print(f"Computed result for {N} x {M} is {result}")
    solution = N * M
    if result != solution:
        pk.printf("Error: result (%lf) != solution (%lf)\n", result, solution)
    print(f"N({N}) M({M}) nrepeat({nrepeat}) problem(MB) time({timer_result}) bandwidth(GB/s)")
def run() -> None:
    """Driver for the team-policy y^T*A*x benchmark over E batched instances."""
    values: Tuple[int, int, int, int, int, bool] = parse_args()
    N: int = values[0]
    M: int = values[1]
    E: int = values[3]
    fill: bool = values[-1]
    nrepeat: int = 1000
    print(f"Total size S = {N * M} N = {N} M = {M} E = {E}")

    workload = Workload(N, M, E, fill)
    # One team per batch entry; team size chosen automatically, 32 vector lanes.
    policy = pk.TeamPolicy(E, "auto", 32, pk.get_default_space())

    timer = pk.Timer()
    for _ in range(nrepeat):
        result = pk.parallel_reduce(policy, workload.yAx)
    timer_result = timer.seconds()

    print(f"Computed result for {N} x {M} x {E} is {result}")
    solution: float = N * M * E
    if result != solution:
        pk.printf("Error: result (%lf) != solution (%lf)\n", result, solution)
    print(
        f"N({N}) M({M}) E({E}) nrepeat({nrepeat}) problem(MB) time({timer_result}) bandwidth(GB/s)"
    )
def test_dep_two(self):
    """Workunit result should equal DepTwo's sum scaled by thread count."""
    inner = DepOne(self.i_1, self.f_1, self.b_1)
    outer = DepTwo(inner)
    expected_result: float = outer.sum() * self.threads
    result: float = pk.parallel_reduce(self.range_policy, self.functor.dep_two_work)
    self.assertEqual(expected_result, result)
def run() -> None:
    """nstream benchmark driver: parse args, run the kernel, validate, report rate.

    Reads iterations/length/offset from the command line, runs the nstream
    triad `iterations` times, then checks the reduced sum against the
    analytically expected value.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('iterations', type=int)
    parser.add_argument('length', type=int)
    parser.add_argument('offset', nargs='?', type=int, default=0)
    args = parser.parse_args()
    iterations = args.iterations
    length = args.length
    offset = args.offset
    scalar = 3

    if iterations < 1:
        sys.exit("ERROR: iterations must be >= 1")
    if length <= 0:
        sys.exit("ERROR: vector length must be positive")
    # BUG FIX: this guard previously re-tested `length` instead of `offset`,
    # so a negative offset was never rejected despite the error message.
    if offset < 0:
        sys.exit("ERROR: offset must be nonnegative")

    print("Number of iterations = ", iterations)
    print("Vector length = ", length)
    print("Offset = ", offset)

    p = pk.RangePolicy(pk.ExecutionSpace.OpenMP, 0, length)
    w = Workload(iterations, length, offset, scalar)
    pk.parallel_for(p, w.init_views)

    timer = pk.Timer()
    for i in range(iterations):
        pk.parallel_for(p, w.nstream)
    nstream_time = timer.seconds()

    # Host-side mirror of the triad: a starts at 0, b=2, c=2, and every
    # iteration adds b + scalar*c to each element of a.
    ar: float = 0
    br: float = 2
    cr: float = 2
    for i in range(iterations):
        ar += br + scalar * cr
    ar *= length

    asum = pk.parallel_reduce(p, w.res_reduce)

    epsilon: float = 1.0e-8
    if abs(ar - asum) / asum > epsilon:
        # FIX: corrected "Valication" typo in the error message.
        print("ERROR: Failed Validation on output array")
    else:
        avgtime: float = nstream_time / iterations
        # 4 arrays touched per iteration, 4 bytes per element (as in the
        # original example's bandwidth accounting).
        nbytes: float = 4.0 * length * 4
        print("Solution validates")
        print("Rate (MB/s): %.2f" % (1.e-6 * nbytes / avgtime))
        print("Avg time (ms): %f" % (avgtime / 1.e-3))
def run() -> None:
    """Driver: initialize the workload's views, then time nrepeat yAx reductions."""
    values: Tuple[int, int, int, int, int, bool] = parse_args()
    N: int = values[0]
    M: int = values[1]
    nrepeat: int = 100
    print(f"Total size S = {N * M} N = {N} M = {M}")

    policy = pk.RangePolicy(pk.get_default_space(), 0, N)
    workload = Workload(N, M)
    pk.parallel_for(policy, workload.y_init)
    pk.parallel_for(pk.RangePolicy(pk.get_default_space(), 0, M), workload.x_init)
    pk.parallel_for(policy, workload.matrix_init)

    timer = pk.Timer()
    for _ in range(nrepeat):
        result = pk.parallel_reduce(policy, workload.yAx)
    timer_result = timer.seconds()

    print(f"Computed result for {N} x {M} is {result}")
    solution = N * M
    if result != solution:
        pk.printf("Error: result (%lf) != solution (%lf)\n", result, solution)
    print(f"N({N}) M({M}) nrepeat({nrepeat}) problem(MB) time({timer_result}) bandwidth(GB/s)")
def test_dep_one_return(self):
    """Workunit returning through DepOne should match the host-side sum."""
    helper = DepOne(self.i_1, self.f_1, self.b_1)
    expected_result: float = helper.sum() * self.threads
    result: float = pk.parallel_reduce(self.range_policy, self.functor.dep_one_return)
    self.assertEqual(expected_result, result)
# Per-row team reduction: accumulates y[e][j] * (row j of A[e] dot x[e]).
# NOTE(review): `e`, `team_member`, `M`, `A`, `x`, `y` are free variables
# captured from the enclosing workunit scope — the closure is translated by
# pykokkos, so its structure is kept exactly as written.
def team_reduce(j: int, team_acc: pk.Acc[float]):
    # Innermost (vector-level) dot-product contribution for column i.
    def vector_reduce(i: int, vector_acc: pk.Acc[float]):
        vector_acc += A[e][j][i] * x[e][i]
    # Reduce the row dot product across the thread's vector lanes.
    tempM: float = pk.parallel_reduce(
        pk.ThreadVectorRange(team_member, M), vector_reduce)
    team_acc += y[e][j] * tempM
def test_for_step_stmt(self):
    """Stepped-range loop in the workunit matches a host-side mirror loop."""
    expected_result: int = 0
    for _ in range(self.i_2, self.i_1, self.i_2):
        expected_result += self.threads * self.i_2
    result: int = pk.parallel_reduce(self.range_policy, self.functor.for_step_stmt)
    self.assertEqual(expected_result, result)
def run(self) -> None:
    """Dispatch the Lennard-Jones force computation.

    Chooses between the half- and full-neighbor-list workunits, and between
    a plain parallel_for and an energy-accumulating parallel_reduce.
    """
    if self.parallel_for:
        kernel = self.halfneigh_for if self.half_neigh else self.fullneigh_for
        pk.parallel_for("ForceLJNeigh::compute", self.N_local, kernel)
    else:
        kernel = self.halfneigh_reduce if self.half_neigh else self.fullneigh_reduce
        self.energy = pk.parallel_reduce(
            "ForceLJNeigh::compute_energy", self.N_local, kernel)
def test_gt_op(self):
    """`>` in the workunit should select the larger of the two values."""
    larger = self.value_1 if self.value_1 > self.value_2 else self.value_2
    expected_result: int = self.threads * larger
    result: int = pk.parallel_reduce(self.range_policy, self.functor.gt_op)
    self.assertEqual(expected_result, result)
def run(self):
    """Time `nrepeat` team-policy reductions of yAx."""
    timer = pk.Timer()
    for _ in range(self.nrepeat):
        policy = pk.TeamPolicy(self.N, "auto")
        self.result = pk.parallel_reduce("team_policy", policy, self.yAx)
    self.timer_result = timer.seconds()
def test_bool_op(self):
    """Boolean negation in the workunit mirrors host-side `not` logic."""
    chosen = self.i_1 if not self.b_1 else self.i_2
    expected_result: int = self.threads * chosen
    result: int = pk.parallel_reduce(self.range_policy, self.functor.bool_op)
    self.assertEqual(expected_result, result)
def test_continue(self):
    """`continue` in the workunit loop must not skip the accumulation."""
    # Host-side mirror: each of i_1 iterations contributes threads * i_2.
    # (The original mirror loop ended with a no-op `continue`, dropped here.)
    expected_result: int = 0
    for _ in range(self.i_1):
        expected_result += self.threads * self.i_2
    result: int = pk.parallel_reduce(self.range_policy, self.functor.continue_stmt)
    self.assertEqual(expected_result, result)
def test_if_else_stmt(self):
    """if/else in the workunit should match the host-side branch."""
    branch_value = self.i_1 if self.b_1 else self.i_2
    expected_result: int = self.threads * branch_value
    result: int = pk.parallel_reduce(self.range_policy, self.functor.if_else_stmt)
    self.assertEqual(expected_result, result)
def run(self):
    """Initialize the matrix, then time `nrepeat` yAx reductions.

    NOTE(review): the timer is started before matrix_init, so initialization
    is included in the measured time — confirm this is intended.
    """
    timer = pk.Timer()
    pk.parallel_for(self.N, self.matrix_init)
    for _ in range(self.nrepeat):
        self.result = pk.parallel_reduce("04", self.N, self.yAx)
    self.timer_result = timer.seconds()
def test_dep_two_mutate(self):
    """Mutating DepOne after wrapping must be reflected in DepTwo's sum."""
    base = DepOne(self.i_1, self.f_1, self.b_1)
    wrapper = DepTwo(base)
    base.i = self.i_2  # mutate after construction; wrapper holds a reference
    expected_result: float = wrapper.sum() * self.threads
    result: float = pk.parallel_reduce(self.range_policy, self.functor.dep_two_mutate)
    self.assertEqual(expected_result, result)
def test_bool_sum(self):
    """Bool-conditioned accumulation in the workunit matches the host branch."""
    expected_result: int = self.threads * (self.i_1 if self.b_1 else self.i_2)
    result: int = pk.parallel_reduce(self.range_policy, self.functor.add_bool)
    self.assertEqual(expected_result, result)
def outer_for(self, team_member: pk.TeamMember) -> None:
    # Team-level workunit: each league (team) handles one output index j.
    j: int = team_member.league_rank()

    # Each thread's contribution is simply self.value.
    def inner_reduce(i: int, acc: pk.Acc[pk.double]):
        acc += self.value
    # NOTE(review): the nested reduce runs only under the rank-0 guard here,
    # unlike the sibling yAx workunit where all ranks participate — confirm
    # this asymmetry is intentional.
    if team_member.team_rank() == 0:
        temp: float = pk.parallel_reduce(pk.TeamThreadRange(team_member, self.M), inner_reduce)
        self.for_view[j] = temp
def test_compare(self):
    """Comparison in the workunit selects the larger of i_1 / i_2."""
    winner = self.i_1 if self.i_1 > self.i_2 else self.i_2
    expected_result: int = self.threads * winner
    result: int = pk.parallel_reduce(self.range_policy, self.functor.compare)
    self.assertEqual(expected_result, result)
def run(self):
    """Time `nrepeat` team-vector-loop reductions over E batched instances."""
    timer = pk.Timer()
    for _ in range(self.nrepeat):
        policy = pk.TeamPolicy(self.E, "auto", 32)
        self.result = pk.parallel_reduce("team_vector_loop", policy, self.yAx)
    self.timer_result = timer.seconds()
def yAx(self, team_member: pk.TeamMember, acc: pk.Acc[pk.double]) -> None:
    # Team-level workunit computing one term of y^T * A * x: league j owns
    # row j; the team reduces the row dot product, then rank 0 accumulates.
    j: int = team_member.league_rank()

    # Per-thread contribution to row j's dot product with x.
    def inner_reduce(i: int, inner_acc: pk.Acc[pk.double]):
        inner_acc += self.A[j][i] * self.x[i]
    # All team threads participate in the row reduction.
    temp2: float = pk.parallel_reduce(pk.TeamThreadRange(team_member, self.M), inner_reduce)
    # Only one rank per team adds to the global accumulator to avoid
    # counting the row result once per thread.
    if team_member.team_rank() == 0:
        acc += self.y[j] * temp2
def test_while_stmt(self):
    """while-loop in the workunit matches a host-side counting loop."""
    expected_result: int = 0
    counter: int = 0
    while counter < self.i_1:
        expected_result += self.threads * self.i_2
        counter += 1
    result: int = pk.parallel_reduce(self.range_policy, self.functor.while_stmt)
    self.assertEqual(expected_result, result)
def test_yAx_plus1(self):
    """(y+1)^T * A * x via a team-policy reduction matches the host loops."""
    # Host-side reference computation.
    expected_result: float = 0
    for j in range(self.N):
        temp2: float = 0
        for i in range(self.M):
            temp2 += self.A[j][i] * self.x[i]
        expected_result += (self.y[j] + 1) * temp2
    # FIX: the result of this float reduction was annotated `int`; it is
    # compared against the float expected_result, so annotate it as float.
    result: float = pk.parallel_reduce(pk.TeamPolicy(self.N, pk.AUTO, space=self.execution_space), self.functor.yAx_plus1)
    self.assertEqual(expected_result, result)
def run(self):
    """Initialize y, x, A, then time `nrepeat` MDRange-based yAx reductions."""
    # FIX: the first two parallel_for calls used bare `N` and `M`, which are
    # not defined in this method's scope (the MDRangePolicy below uses the
    # instance attributes); use self.N / self.M consistently.
    # NOTE(review): if module-level globals N/M existed this previously
    # "worked" by accident — confirm no caller relied on that.
    pk.parallel_for(self.N, self.init_y)
    pk.parallel_for(self.M, self.init_x)
    pk.parallel_for(pk.MDRangePolicy([0, 0], [self.N, self.M]), self.init_A)
    timer = pk.Timer()
    for i in range(self.nrepeat):
        self.result = pk.parallel_reduce("mdrange", self.N, self.yAx)
    self.timer_result = timer.seconds()
def run(self):
    """Initialize y, x and A, then time `nrepeat` yAx reductions."""
    pk.parallel_for(self.N, self.y_init)
    pk.parallel_for(self.M, self.x_init)
    pk.parallel_for(self.N, self.matrix_init)
    timer = pk.Timer()
    for _ in range(self.nrepeat):
        self.result = pk.parallel_reduce("01", self.N, self.yAx)
    self.timer_result = timer.seconds()
# Team-level workunit computing one term of y^T * A * x: league j owns row j.
def yAx(team_member: pk.TeamMember, acc: pk.Acc[float], M: int, y: pk.View1D[pk.double], x: pk.View1D[pk.double], A: pk.View2D[pk.double]):
    j: int = team_member.league_rank()

    # Per-thread contribution to row j's dot product with x.
    def inner_reduce(i: int, inner_acc: pk.Acc[float]):
        inner_acc += A[j][i] * x[i]
    # All team threads participate in reducing the row dot product.
    temp2: float = pk.parallel_reduce(pk.TeamThreadRange(team_member, M), inner_reduce)
    # Only rank 0 accumulates, so the row result is counted exactly once.
    if team_member.team_rank() == 0:
        acc += y[j] * temp2