def yAx_plus1(self, team_member: pk.TeamMember, acc: pk.Acc[pk.double]) -> None:
    # Team-level workunit. The team's league rank selects a row; the team
    # reduces sum_i A[row][i] * x[i] over M entries, adds 1 to the first N
    # entries of yprime[row], and then one thread adds
    # yprime[row][row] * (that sum) to the outer accumulator `acc`.
    row: int = team_member.league_rank()

    def bump_yprime(col: int):
        # Increment a single entry of this team's yprime row.
        self.yprime[row][col] += 1

    def row_dot_x(col: int, partial: pk.Acc[pk.double]):
        # One term of the product between row `row` of A and x.
        partial += self.A[row][col] * self.x[col]

    # Call order is unchanged: the reduction over M runs first, then the
    # parallel_for over N.
    row_dot: float = pk.parallel_reduce(
        pk.TeamThreadRange(team_member, self.M), row_dot_x)
    pk.parallel_for(pk.TeamThreadRange(team_member, self.N), bump_yprime)

    # Only team rank 0 contributes, so each team adds its term once.
    if team_member.team_rank() == 0:
        acc += self.yprime[row][row] * row_dot
def benchmark(self, team: pk.TeamMember):
    # Team-level workunit. For each of R repetitions the team runs a
    # parallel_for over K indices. Every index reads A[league][k][0] and
    # B[league][k][0], derives eight values from the A entry, applies
    # `v += b * v` to each of them F times, and stores their sum in
    # C[league][k][0].
    league: int = team.league_rank()

    for rep in range(self.R):
        def flops_kernel(k: int):
            # Multiplier and seed value for this (league, k) slot.
            rate: pk.double = self.B[league][k][0]
            v1: pk.double = self.A[league][k][0]

            # Seven further values chained off the seed with fixed factors.
            v2: pk.double = v1 * 1.3
            v3: pk.double = v2 * 1.1
            v4: pk.double = v3 * 1.1
            v5: pk.double = v4 * 1.3
            v6: pk.double = v5 * 1.1
            v7: pk.double = v6 * 1.1
            v8: pk.double = v7 * 1.1

            # F rounds of eight independent multiply-adds.
            for step in range(self.F):
                v1 += rate * v1
                v2 += rate * v2
                v3 += rate * v3
                v4 += rate * v4
                v5 += rate * v5
                v6 += rate * v6
                v7 += rate * v7
                v8 += rate * v8

            # Summation order is kept so the stored value is unchanged.
            self.C[league][k][0] = v1 + v2 + v3 + v4 + v5 + v6 + v7 + v8

        pk.parallel_for(pk.TeamThreadRange(team, self.K), flops_kernel)
def benchmark(
        team: pk.TeamMember,
        A: pk.View3D[pk.double],
        B: pk.View3D[pk.double],
        C: pk.View3D[pk.double],
        R: int,
        F: int,
        K: int):
    # Free-function team workunit. Repeats R times a team-level parallel_for
    # over K indices; each index reads A[league][k][0] and B[league][k][0],
    # derives eight values from the A entry, applies `v += b * v` to each of
    # them F times, and writes their sum to C[league][k][0].
    league: int = team.league_rank()

    for rep in range(R):
        def flops_kernel(k: int):
            # Multiplier and seed value for this (league, k) slot.
            rate: pk.double = B[league][k][0]
            v1: pk.double = A[league][k][0]

            # Seven further values chained off the seed with fixed factors.
            v2: pk.double = v1 * 1.3
            v3: pk.double = v2 * 1.1
            v4: pk.double = v3 * 1.1
            v5: pk.double = v4 * 1.3
            v6: pk.double = v5 * 1.1
            v7: pk.double = v6 * 1.1
            v8: pk.double = v7 * 1.1

            # F rounds of eight independent multiply-adds.
            for step in range(F):
                v1 += rate * v1
                v2 += rate * v2
                v3 += rate * v3
                v4 += rate * v4
                v5 += rate * v5
                v6 += rate * v6
                v7 += rate * v7
                v8 += rate * v8

            # Summation order is kept so the stored value is unchanged.
            C[league][k][0] = v1 + v2 + v3 + v4 + v5 + v6 + v7 + v8

        pk.parallel_for(pk.TeamThreadRange(team, K), flops_kernel)
def outer_for(self, team_member: pk.TeamMember) -> None:
    # Team-level workunit: the team reduces `self.value` over M iterations
    # and stores the total in for_view[j], where j is the team's league rank.
    j: int = team_member.league_rank()

    def inner_reduce(i: int, acc: pk.Acc[pk.double]):
        # Every iteration contributes the same scalar.
        acc += self.value

    # The nested TeamThreadRange reduction is a team-wide operation in
    # Kokkos, so every thread of the team must reach it. It previously sat
    # inside the rank-0 guard, which leaves the other threads out of the
    # reduction whenever the team has more than one thread.
    temp: float = pk.parallel_reduce(
        pk.TeamThreadRange(team_member, self.M), inner_reduce)

    # Only the store is restricted to a single thread per team.
    if team_member.team_rank() == 0:
        self.for_view[j] = temp
def yAx(self, team_member: pk.TeamMember, acc: pk.Acc[pk.double]) -> None:
    # Team-level workunit: the team's league rank selects a row; the team
    # reduces sum_i A[row][i] * x[i] over M entries and one thread adds
    # y[row] * (that sum) to the outer accumulator `acc`.
    row: int = team_member.league_rank()

    def row_dot_x(col: int, partial: pk.Acc[pk.double]):
        # One term of the product between row `row` of A and x.
        partial += self.A[row][col] * self.x[col]

    row_dot: float = pk.parallel_reduce(
        pk.TeamThreadRange(team_member, self.M), row_dot_x)

    # Only team rank 0 contributes, so each team adds its term once.
    if team_member.team_rank() == 0:
        acc += self.y[row] * row_dot
def yAx(
        team_member: pk.TeamMember,
        acc: pk.Acc[float],
        M: int,
        y: pk.View1D[pk.double],
        x: pk.View1D[pk.double],
        A: pk.View2D[pk.double]):
    # Free-function team workunit: the team's league rank selects a row; the
    # team reduces sum_i A[row][i] * x[i] over M entries and one thread adds
    # y[row] * (that sum) to the outer accumulator `acc`.
    row: int = team_member.league_rank()

    def row_dot_x(col: int, partial: pk.Acc[float]):
        # One term of the product between row `row` of A and x.
        partial += A[row][col] * x[col]

    row_dot: float = pk.parallel_reduce(
        pk.TeamThreadRange(team_member, M), row_dot_x)

    # Only team rank 0 contributes, so each team adds its term once.
    if team_member.team_rank() == 0:
        acc += y[row] * row_dot
def yAx_vector(self, team_member: pk.TeamMember, acc: pk.Acc[pk.double]) -> None:
    # Three-level workunit. The league rank selects an element `elem`. For
    # that element the team reduces, over N rows,
    #     y_vector[elem][row] * sum_col A_vector[elem][row][col] * x_vector[elem][col]
    # with the inner sum over M columns done as a ThreadVectorRange
    # reduction. The team total is then added to `acc` through pk.single.
    elem: int = team_member.league_rank()

    def row_term(row: int, row_acc: pk.Acc[pk.double]):
        def col_term(col: int, col_acc: pk.Acc[pk.double]):
            # One product term of row `row` against x_vector[elem].
            col_acc += self.A_vector[elem][row][col] * self.x_vector[elem][col]

        row_dot: float = pk.parallel_reduce(
            pk.ThreadVectorRange(team_member, self.M), col_term)
        row_acc += self.y_vector[elem][row] * row_dot

    team_total: float = pk.parallel_reduce(
        pk.TeamThreadRange(team_member, self.N), row_term)

    def add_team_total():
        # `acc` is the enclosing workunit's accumulator parameter.
        nonlocal acc
        acc += team_total

    # PerTeam single: the closure is issued once for the whole team.
    pk.single(pk.PerTeam(team_member), add_team_total)
def yAx(
        team_member: pk.TeamMember,
        acc: pk.Acc[float],
        N: int,
        M: int,
        y: pk.View2D[pk.double],
        x: pk.View2D[pk.double],
        A: pk.View3D[pk.double]):
    # Free-function three-level workunit. The league rank selects an element
    # `elem`. For that element the team reduces, over N rows,
    #     y[elem][row] * sum_col A[elem][row][col] * x[elem][col]
    # with the inner sum over M columns done as a ThreadVectorRange
    # reduction. The team total is then added to `acc` through pk.single.
    elem: int = team_member.league_rank()

    def row_term(row: int, row_acc: pk.Acc[float]):
        def col_term(col: int, col_acc: pk.Acc[float]):
            # One product term of row `row` against x[elem].
            col_acc += A[elem][row][col] * x[elem][col]

        row_dot: float = pk.parallel_reduce(
            pk.ThreadVectorRange(team_member, M), col_term)
        row_acc += y[elem][row] * row_dot

    team_total: float = pk.parallel_reduce(
        pk.TeamThreadRange(team_member, N), row_term)

    def add_team_total():
        # `acc` is the enclosing workunit's accumulator parameter.
        nonlocal acc
        acc += team_total

    # PerTeam single: the closure is issued once for the whole team.
    pk.single(pk.PerTeam(team_member), add_team_total)
def preduce(self, team: pk.TeamMember, PE_bi: pk.Acc[pk.double]) -> None:
    # Team-level reduction workunit: one team per bin (bx, by, bz). Each
    # entry i of the team's bin is paired with every entry j of the bin and
    # its adjacent bins; pairs with rsq below cutsq[type_i][type_j]
    # contribute a pair term built from lj1/lj2, and the bin total is added
    # to the outer accumulator PE_bi.
    #
    # Changes relative to the previous version:
    #   * the z neighbour range is bounded by nbinz (it tested nbinx);
    #   * the cutoff shift uses r2invc (it reused r2inv, which made the
    #     shift cancel the pair term and left r2invc unused);
    #   * the team total is actually added to PE_bi (it was discarded).

    # Decode the flat league rank into 3D bin coordinates.
    bx: int = team.league_rank() // (self.nbiny * self.nbinz)
    by: int = (team.league_rank() // self.nbinz) % self.nbiny
    bz: int = team.league_rank() % self.nbinz

    # When set, the pair term evaluated at the cutoff is subtracted.
    shift_flag: bool = True

    i_offset: int = self.bin_offsets[bx][by][bz]

    def team_thread_reduce(bi: int, PE_i: pk.Acc[pk.double]):
        i: int = self.permute_vector[i_offset + bi]
        # Entries at or beyond N_local are skipped.
        if i >= self.N_local:
            return

        x_i: float = self.x[i][0]
        y_i: float = self.x[i][1]
        z_i: float = self.x[i][2]
        type_i: int = self.type[i]

        # Neighbour bin ranges [start, stop) clamped to the grid per axis.
        bx_j_start: int = bx
        if bx > 0:
            bx_j_start = bx - 1
        bx_j_stop: int = bx + 1
        if bx + 1 < self.nbinx:
            bx_j_stop = bx + 2

        by_j_start: int = by
        if by > 0:
            by_j_start = by - 1
        by_j_stop: int = by + 1
        if by + 1 < self.nbiny:
            by_j_stop = by + 2

        bz_j_start: int = bz
        if bz > 0:
            bz_j_start = bz - 1
        bz_j_stop: int = bz + 1
        # Fixed: the z axis must be clamped against nbinz, not nbinx.
        if bz + 1 < self.nbinz:
            bz_j_stop = bz + 2

        for bx_j in range(bx_j_start, bx_j_stop):
            for by_j in range(by_j_start, by_j_stop):
                for bz_j in range(bz_j_start, bz_j_stop):
                    j_offset: int = self.bin_offsets[bx_j][by_j][bz_j]

                    def thread_vector_reduce(bj: int, PE_ibj: pk.Acc[pk.double]):
                        j: int = self.permute_vector[j_offset + bj]

                        dx: float = x_i - self.x[j][0]
                        dy: float = y_i - self.x[j][1]
                        dz: float = z_i - self.x[j][2]

                        type_j: int = self.type[j]
                        rsq: float = (dx * dx) + (dy * dy) + (dz * dz)

                        # Inside the cutoff, and never pair an entry with itself.
                        if rsq < self.cutsq[type_i][type_j] and i != j:
                            r2inv: float = 1.0 / rsq
                            r6inv: float = r2inv * r2inv * r2inv
                            PE_ibj += 0.5 * r6inv * (
                                0.5 * self.lj1[type_i][type_j] * r6inv
                                - self.lj2[type_i][type_j]) / 6.0

                            if shift_flag:
                                # Same expression evaluated at the cutoff
                                # distance. Fixed: r6invc is built from
                                # r2invc rather than r2inv.
                                r2invc: float = 1.0 / self.cutsq[type_i][type_j]
                                r6invc: float = r2invc * r2invc * r2invc
                                PE_ibj -= 0.5 * r6invc * (
                                    0.5 * self.lj1[type_i][type_j] * r6invc
                                    - self.lj2[type_i][type_j]) / 6.0

                    thread_vector_count: int = self.bin_count[bx_j][by_j][bz_j]
                    PE_ibj: float = pk.parallel_reduce(
                        pk.ThreadVectorRange(team, thread_vector_count),
                        thread_vector_reduce)
                    PE_i += PE_ibj

    team_thread_count: int = self.bin_count[bx][by][bz]
    PE_i: float = pk.parallel_reduce(
        pk.TeamThreadRange(team, team_thread_count), team_thread_reduce)

    # Fixed: fold the team total into the workunit accumulator. A single
    # thread per team contributes so the bin is counted once.
    if team.team_rank() == 0:
        PE_bi += PE_i
def pfor(self, team: pk.TeamMember) -> None:
    # Team-level workunit: one team per bin (bx, by, bz). For each entry i
    # of the team's bin, sums dx/dy/dz * fpair over every entry j of the bin
    # and its adjacent bins with rsq below cutsq[type_i][type_j], then
    # writes the three totals to f[i][0..2].
    #
    # Changes relative to the previous version:
    #   * the z neighbour range is bounded by nbinz (it tested nbinx);
    #   * the unused local f_i_tmp was removed.

    # Decode the flat league rank into 3D bin coordinates.
    bx: int = team.league_rank() // (self.nbiny * self.nbinz)
    by: int = (team.league_rank() // self.nbinz) % self.nbiny
    bz: int = team.league_rank() % self.nbinz

    i_offset: int = self.bin_offsets[bx][by][bz]

    def team_thread_for(bi: int):
        i: int = self.permute_vector[i_offset + bi]
        # Entries at or beyond N_local are skipped.
        if i >= self.N_local:
            return

        x_i: float = self.x[i][0]
        y_i: float = self.x[i][1]
        z_i: float = self.x[i][2]
        type_i: int = self.type[i]

        # Running x/y/z totals for entry i.
        f_i: t_scalar3 = t_scalar3()

        # Neighbour bin ranges [start, stop) clamped to the grid per axis.
        bx_j_start: int = bx
        if bx > 0:
            bx_j_start = bx - 1
        bx_j_stop: int = bx + 1
        if bx + 1 < self.nbinx:
            bx_j_stop = bx + 2

        by_j_start: int = by
        if by > 0:
            by_j_start = by - 1
        by_j_stop: int = by + 1
        if by + 1 < self.nbiny:
            by_j_stop = by + 2

        bz_j_start: int = bz
        if bz > 0:
            bz_j_start = bz - 1
        bz_j_stop: int = bz + 1
        # Fixed: the z axis must be clamped against nbinz, not nbinx.
        if bz + 1 < self.nbinz:
            bz_j_stop = bz + 2

        for bx_j in range(bx_j_start, bx_j_stop):
            for by_j in range(by_j_start, by_j_stop):
                for bz_j in range(bz_j_start, bz_j_stop):
                    j_offset: int = self.bin_offsets[bx_j][by_j][bz_j]

                    # The three reductions below differ only in which
                    # displacement component scales fpair.
                    def thread_vector_reduce_x(bj: int, lf_i: pk.Acc[pk.double]):
                        j: int = self.permute_vector[j_offset + bj]

                        dx: float = x_i - self.x[j][0]
                        dy: float = y_i - self.x[j][1]
                        dz: float = z_i - self.x[j][2]

                        type_j: int = self.type[j]
                        rsq: float = (dx * dx) + (dy * dy) + (dz * dz)

                        if rsq < self.cutsq[type_i][type_j] and i != j:
                            r2inv: float = 1.0 / rsq
                            r6inv: float = r2inv * r2inv * r2inv
                            fpair: float = (
                                r6inv * (self.lj1[type_i][type_j] * r6inv
                                         - self.lj2[type_i][type_j])) * r2inv
                            lf_i += dx * fpair

                    def thread_vector_reduce_y(bj: int, lf_i: pk.Acc[pk.double]):
                        j: int = self.permute_vector[j_offset + bj]

                        dx: float = x_i - self.x[j][0]
                        dy: float = y_i - self.x[j][1]
                        dz: float = z_i - self.x[j][2]

                        type_j: int = self.type[j]
                        rsq: float = (dx * dx) + (dy * dy) + (dz * dz)

                        if rsq < self.cutsq[type_i][type_j] and i != j:
                            r2inv: float = 1.0 / rsq
                            r6inv: float = r2inv * r2inv * r2inv
                            fpair: float = (
                                r6inv * (self.lj1[type_i][type_j] * r6inv
                                         - self.lj2[type_i][type_j])) * r2inv
                            lf_i += dy * fpair

                    def thread_vector_reduce_z(bj: int, lf_i: pk.Acc[pk.double]):
                        j: int = self.permute_vector[j_offset + bj]

                        dx: float = x_i - self.x[j][0]
                        dy: float = y_i - self.x[j][1]
                        dz: float = z_i - self.x[j][2]

                        type_j: int = self.type[j]
                        rsq: float = (dx * dx) + (dy * dy) + (dz * dz)

                        if rsq < self.cutsq[type_i][type_j] and i != j:
                            r2inv: float = 1.0 / rsq
                            r6inv: float = r2inv * r2inv * r2inv
                            fpair: float = (
                                r6inv * (self.lj1[type_i][type_j] * r6inv
                                         - self.lj2[type_i][type_j])) * r2inv
                            lf_i += dz * fpair

                    thread_vector_count: int = self.bin_count[bx_j][by_j][bz_j]
                    f_i_tmp_x: float = pk.parallel_reduce(
                        pk.ThreadVectorRange(team, thread_vector_count),
                        thread_vector_reduce_x)
                    f_i_tmp_y: float = pk.parallel_reduce(
                        pk.ThreadVectorRange(team, thread_vector_count),
                        thread_vector_reduce_y)
                    f_i_tmp_z: float = pk.parallel_reduce(
                        pk.ThreadVectorRange(team, thread_vector_count),
                        thread_vector_reduce_z)

                    f_i.x += f_i_tmp_x
                    f_i.y += f_i_tmp_y
                    f_i.z += f_i_tmp_z

        # Store the totals once all neighbour bins have been visited.
        self.f[i][0] = f_i.x
        self.f[i][1] = f_i.y
        self.f[i][2] = f_i.z

    team_thread_count: int = self.bin_count[bx][by][bz]
    pk.parallel_for(
        pk.TeamThreadRange(team, team_thread_count), team_thread_for)