Example #1
    def test_atomic_xor(self):
        expected_result: int = self.i_1 ^ self.i_2

        pk.parallel_for(self.range_policy, self.functor.atomic_xor)
        result: int = self.functor.view1D_xor[0]

        self.assertEqual(expected_result, result)
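The `atomic_xor` workunit under test is not shown. A minimal sketch of what it could look like inside the functor, assuming `view1D_xor[0]` is pre-initialized to `i_1` and that `pk.atomic_fetch_xor` mirrors Kokkos' `atomic_fetch_xor(view, index, operand)`:

    @pk.workunit
    def atomic_xor(self, tid: int):
        # XOR the operand into slot 0; with a single-thread policy this
        # leaves view1D_xor[0] == i_1 ^ i_2, matching the assertion above
        pk.atomic_fetch_xor(self.view1D_xor, [0], self.i_2)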
Example #2
def run() -> None:
    parser = argparse.ArgumentParser()
    parser.add_argument('iterations', type=int)
    parser.add_argument('length', type=int)
    parser.add_argument('offset', nargs='?', type=int, default=0)
    args = parser.parse_args()
    iterations = args.iterations
    length = args.length
    offset = args.offset
    scalar = 3

    if iterations < 1:
        sys.exit("ERROR: iterations must be >= 1")

    if length <= 0:
        sys.exit("ERROR: vector length must be positive")

    # emulate cpp example
    if offset < 0:
        sys.exit("ERROR: offset must be nonnegative")

    print("Number of iterations = ", iterations)
    print("Vector length        = ", length)
    print("Offset               = ", offset)

    p = pk.RangePolicy(pk.ExecutionSpace.OpenMP, 0, length)
    w = Workload(iterations, length, offset, scalar)

    pk.parallel_for(p, w.init_views)
    # pk.fence()

    timer = pk.Timer()

    for i in range(iterations):
        pk.parallel_for(p, w.nstream)

    # pk.fence()
    nstream_time = timer.seconds()

    # verify correctness
    ar: float = 0
    br: float = 2
    cr: float = 2
    for i in range(iterations):
        ar += br + scalar * cr

    ar *= length

    asum = pk.parallel_reduce(p, w.res_reduce)
    # pk.fence()

    epsilon: float = 1.0e-8
    if (abs(ar - asum) / asum > epsilon):
        print("ERROR: Failed Validation on output array")
    else:
        avgtime: float = nstream_time / iterations
        nbytes: float = 4.0 * length * 4
        print("Solution validates")
        print("Rate (MB/s): %.2f" % (1.e-6 * nbytes / avgtime))
        print("Avg time (ms): %f" % (avgtime / 1.e-3))
Example #3
    def test_atomic_min(self):
        expected_result: float = min(self.f_1, self.f_2)

        pk.parallel_for(self.range_policy, self.functor.atomic_min)
        result: float = self.functor.view1D_min[0]

        self.assertEqual(expected_result, result)
Example #4
    def test_atomic_sub(self):
        expected_result: float = self.f_1 - self.f_2

        pk.parallel_for(self.range_policy, self.functor.atomic_sub)
        result: float = self.functor.view1D_sub[0]

        self.assertEqual(expected_result, result)
Example #5
def benchmark(team: pk.TeamMember, A: pk.View3D[pk.double],
              B: pk.View3D[pk.double], C: pk.View3D[pk.double], R: int, F: int,
              K: int):

    n: int = team.league_rank()
    for r in range(R):

        def team_for(i: int):
            a1: pk.double = A[n][i][0]
            b: pk.double = B[n][i][0]
            a2: pk.double = a1 * 1.3
            a3: pk.double = a2 * 1.1
            a4: pk.double = a3 * 1.1
            a5: pk.double = a4 * 1.3
            a6: pk.double = a5 * 1.1
            a7: pk.double = a6 * 1.1
            a8: pk.double = a7 * 1.1

            for f in range(F):
                a1 += b * a1
                a2 += b * a2
                a3 += b * a3
                a4 += b * a4
                a5 += b * a5
                a6 += b * a6
                a7 += b * a7
                a8 += b * a8

            C[n][i][0] = a1 + a2 + a3 + a4 + a5 + a6 + a7 + a8

        pk.parallel_for(pk.TeamThreadRange(team, K), team_for)
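Since this is a standalone workunit taking a `pk.TeamMember`, it would be launched with a team policy, with its views and scalars bound as keyword arguments. A hedged usage sketch, assuming the league size N matches the first view extent:

    # one team per league rank n; A, B, C shaped [N][K][1]
    pk.parallel_for(pk.TeamPolicy(N, pk.AUTO), benchmark,
                    A=A, B=B, C=C, R=R, F=F, K=K)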
Example #6
    def run(self):
        timer = pk.Timer()
        for r in range(self.R):
            pk.parallel_for("gather", self.N, self.benchmark)
            pk.fence()

        self.seconds = timer.seconds()
Example #7
    def benchmark(self, team: pk.TeamMember):
        n: int = team.league_rank()
        for r in range(self.R):
            def team_for(i: int):
                a1: pk.double = self.A[n][i][0] 
                b: pk.double = self.B[n][i][0]
                a2: pk.double = a1 * 1.3
                a3: pk.double = a2 * 1.1
                a4: pk.double = a3 * 1.1
                a5: pk.double = a4 * 1.3
                a6: pk.double = a5 * 1.1
                a7: pk.double = a6 * 1.1
                a8: pk.double = a7 * 1.1

                for f in range(self.F):
                    a1 += b * a1
                    a2 += b * a2
                    a3 += b * a3
                    a4 += b * a4
                    a5 += b * a5
                    a6 += b * a6
                    a7 += b * a7
                    a8 += b * a8

                self.C[n][i][0] = a1 + a2 + a3 + a4 + a5 + a6 + a7 + a8


            pk.parallel_for(pk.TeamThreadRange(team, self.K), team_for)
Example #8
    def run(self) -> None:
        nbin: List[int] = [self.nbinx, self.nbiny, self.nbinz]
        min_values: List[float] = [self.minx, self.miny, self.minz]
        max_values: List[float] = [self.maxx, self.maxy, self.maxz]

        x_sub = self.x[self.range_min:self.range_max, :]
        binop = pk.BinOp3D(x_sub, nbin, min_values, max_values)
        sorter = pk.BinSort(x_sub, binop)
        sorter.create_permute_vector()
        self.permute_vector = sorter.get_permute_vector()

        self.bin_count_1d = sorter.get_bin_count()
        self.bin_offsets_1d = sorter.get_bin_offsets()

        pk.parallel_for("Binning::AssignOffsets",
                        self.nbinx * self.nbiny * self.nbinz,
                        self.assign_offsets)

        if self.sort:
            sorter.sort(x_sub)
            v_sub = self.v[self.range_min:self.range_max, :]
            sorter.sort(v_sub)
            f_sub = self.f[self.range_min:self.range_max, :]
            sorter.sort(f_sub)
            sorter.sort(self.type)
            sorter.sort(self.id)
            sorter.sort(self.q)
Example #9
    def test_v1d(self):
        pk.parallel_for(self.range_policy, self.functor.v1d)

        for i in range(self.i_1):
            expected_result: int = self.i_4 + i
            self.assertEqual(expected_result, self.functor.view1D[i])
            self.assertEqual(expected_result, self.functor.myView1D[i])
Example #10
    def run(self):
        pk.parallel_for(self.N, lambda i: i, self.A)

        timer = pk.Timer()

        self.result = pk.parallel_scan(self.N, self.scan)

        self.timer_result = timer.seconds()
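The `scan` workunit is not shown. PyKokkos scan workunits receive a `last_pass` flag alongside the accumulator; a minimal inclusive prefix-sum sketch, assuming that is what `self.scan` computes over `self.A`:

    @pk.workunit
    def scan(self, i: int, acc: pk.Acc[pk.double], last_pass: bool):
        acc += self.A[i]
        if last_pass:
            # write the inclusive prefix sum back on the final pass
            self.A[i] = acc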
Example #11
    def test_v2d(self):
        pk.parallel_for(self.range_policy, self.functor.v2d)

        for i in range(self.i_1):
            for j in range(self.i_2):
                expected_result: int = self.i_4 + i + j
                self.assertEqual(expected_result, self.functor.view2D[i][j])
                self.assertEqual(expected_result, self.functor.myView2D[i][j])
Example #12
    def run(self):
        timer = pk.Timer()

        pk.parallel_for(self.N, self.matrix_init)

        for i in range(self.nrepeat):
            self.result = pk.parallel_reduce("04", self.N, self.yAx)

        self.timer_result = timer.seconds()
Example #13
    def test_outer_for(self):
        expected_result: float = 0
        for i in range(self.M):
            expected_result += self.value

        pk.parallel_for(pk.TeamPolicy(self.N, pk.AUTO, space=self.execution_space), self.functor.outer_for)
        for i in range(self.N):
            result: int = self.functor.for_view[i]
            self.assertEqual(expected_result, result)
Example #14
    def test_subscript(self):
        expected_result = self.i_1
        pk.parallel_for(self.range_policy, self.functor.subscript)

        for i in range(self.threads):
            self.assertEqual(expected_result, self.functor.view1D[i])
            for j in range(self.threads):
                self.assertEqual(expected_result, self.functor.view2D[i][j])
                for k in range(self.threads):
                    self.assertEqual(expected_result,
                                     self.functor.view3D[i][j][k])
Example #15
    def run(self) -> None:
        x: List[int] = [self.x_0, 2, 3]
        pk.parallel_for(self.total_threads, self.work)
        bin_op = pk.BinOp1D(self.view, (self.total_threads // 2),
                            self.total_threads, self.total_threads * 2 - 1)
        bin_sort = pk.BinSort(self.view, bin_op)
        bin_sort.create_permute_vector()
        self.permute_vector = bin_sort.get_permute_vector()
        self.bin_offsets = bin_sort.get_bin_offsets()
        self.bin_count = bin_sort.get_bin_count()

        bin_sort.sort(self.view)
Example #16
    def yAx_plus1(self, team_member: pk.TeamMember, acc: pk.Acc[pk.double]) -> None:
        j: int = team_member.league_rank()

        def inner_reduce(i: int, inner_acc: pk.Acc[pk.double]):
            inner_acc += self.A[j][i] * self.x[i]

        def inner_for(i: int):
            self.yprime[j][i] += 1

        temp2: float = pk.parallel_reduce(pk.TeamThreadRange(team_member, self.M), inner_reduce)
        pk.parallel_for(pk.TeamThreadRange(team_member, self.N), inner_for)

        if team_member.team_rank() == 0:
            acc += self.yprime[j][j] * temp2
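A nested workunit like this is launched with a team policy whose league size spans the outer index; a hedged usage sketch:

    # one team per row j; w is the enclosing workload/functor instance
    result = pk.parallel_reduce(pk.TeamPolicy(N, pk.AUTO), w.yAx_plus1)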
Example #17
    def run(self):
        t: int = tile_size
        r: int = radius

        pk.parallel_for(pk.MDRangePolicy([0, 0], [n, n], [t, t]), self.init)
        pk.fence()

        timer = pk.Timer()

        for i in range(iterations):
            if (i == 1):
                pk.fence()

            if r == 1:
                # star1 stencil
                pk.parallel_for(
                    "stencil", pk.MDRangePolicy([r, r], [n - r, n - r],
                                                [t, t]), self.star1)
            elif r == 2:
                # star2 stencil
                pk.parallel_for(
                    "stencil", pk.MDRangePolicy([r, r], [n - r, n - r],
                                                [t, t]), self.star2)
            else:
                # star3 stencil
                pk.parallel_for(
                    "stencil", pk.MDRangePolicy([r, r], [n - r, n - r],
                                                [t, t]), self.star3)

            pk.parallel_for(pk.MDRangePolicy([0, 0], [n, n], [t, t]),
                            self.increment)

        pk.fence()
        self.stencil_time = timer.seconds()

        active_points: int = (n - 2 * r) * (n - 2 * r)

        # verify correctness
        self.norm = pk.parallel_reduce(
            pk.MDRangePolicy([r, r], [n - r, n - r], [t, t]), self.norm_reduce)
        pk.fence()
        self.norm /= active_points

        epsilon: float = 1.0e-8
        reference_norm: float = 2 * iterations
        if (abs(self.norm - reference_norm) > epsilon):
            pk.printf("ERROR: L1 norm != Reference norm err=%.2f\n",
                      abs(self.norm - reference_norm))
        else:
            pk.printf("Solution validates\n")
Example #18
def run() -> None:
    random.seed(1010101)

    indices = 8192
    data = 33554432
    repeats = 10
    space = pk.ExecutionSpace.OpenMP

    parser = argparse.ArgumentParser()
    parser.add_argument("--indices", type=int)
    parser.add_argument("--data", type=int)
    parser.add_argument("--repeats", type=int)
    parser.add_argument("--atomics", action="store_true")
    parser.add_argument("--execution_space", type=str)
    args = parser.parse_args()
    if args.indices:
        indices = args.indices
    if args.data:
        data = args.data
    if args.repeats:
        repeats = args.repeats
    use_atomics = args.atomics
    if args.execution_space:
        space = pk.ExecutionSpace(args.execution_space)

    pk.set_default_space(space)

    w = Benchmark(indices, data, repeats, use_atomics)
    range_indices = pk.RangePolicy(pk.get_default_space(), 0, indices)
    range_data = pk.RangePolicy(pk.get_default_space(), 0, data)

    print("Reports fastest timing per kernel")
    print("Creating Views...")
    print("Memory Sizes:")
    print(f"- Elements: {data} ({1e-6*data*8} MB)")
    print(f"- Indices: {indices} ({1e-6*indices*8} MB)")
    print(f"- Atomics: {'yes' if use_atomics else 'no'}")
    print(f"Benchmark kernels will be performed for {repeats} iterations")

    print("Initializing Views...")
    pk.parallel_for(range_data, w.init_data)
    pk.parallel_for(range_indices, w.init_indices)

    print("Starting benchmarking...")

    timer = pk.Timer()
    for i in range(repeats):
        # use a distinct loop variable so the repeat counter is not shadowed
        for j in range(indices):
            w.indices[j] = random.randrange(data)

        if use_atomics:
            pk.parallel_for(range_indices, w.run_gups_atomic)
        else:
            pk.parallel_for(range_indices, w.run_gups)

    gupsTime = timer.seconds()
    print(f"GUP/s Random: {1e-9 * repeats * indices / gupsTime}")
    print(w.data)
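The two kernels differ only in whether the table update is atomic; a hedged sketch of `run_gups_atomic`, assuming `data` and `indices` views and the classic GUPS bitwise-XOR update (datum value assumed):

    @pk.workunit
    def run_gups_atomic(self, i: int):
        # atomically XOR a constant into a randomly indexed slot of data
        pk.atomic_fetch_xor(self.data, [self.indices[i]], -1)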
Example #19
    def run(self) -> None:
        if self.parallel_for:
            if self.half_neigh:
                pk.parallel_for("ForceLJNeigh::compute", self.N_local,
                                self.halfneigh_for)
            else:
                pk.parallel_for("ForceLJNeigh::compute", self.N_local,
                                self.fullneigh_for)
        else:
            if self.half_neigh:
                self.energy = pk.parallel_reduce(
                    "ForceLJNeigh::compute_energy", self.N_local,
                    self.halfneigh_reduce)
            else:
                self.energy = pk.parallel_reduce(
                    "ForceLJNeigh::compute_energy", self.N_local,
                    self.fullneigh_reduce)
Example #20
    def run(self) -> None:
        if self.workunit_id == 0:
            pk.parallel_for("CommSerial::exchange_self", self.N_local,
                            self.tag_exchange_self)
        elif self.workunit_id == 1:
            pk.parallel_for("CommSerial::halo_exchange_self", self.nparticles,
                            self.tag_halo_self)
        elif self.workunit_id == 2:
            pk.parallel_for("CommSerial::halo_update_self",
                            self.update_threads, self.tag_halo_update_self)
        elif self.workunit_id == 3:
            pk.parallel_for("CommSerial::halo_force_self", self.force_threads,
                            self.tag_halo_force_self)
Example #21
    def test_real(self):
        pk.set_default_precision(pk.int32)
        view: pk.View1d = pk.View([self.threads])

        self.assertTrue(view.dtype is pk.DataType.int32)
        self.assertTrue(
            pk.View._get_dtype_name(str(type(view.array))) == "int32")

        f = RealViewTestFunctor(view)
        w = RealViewTestWorkload(self.threads, view)
        pk.parallel_for(self.threads, f.pfor)
        pk.execute(pk.ExecutionSpace.Default, w)

        view.set_precision(pk.float)

        self.assertTrue(view.dtype is pk.DataType.float)
        self.assertTrue(
            pk.View._get_dtype_name(str(type(view.array))) == "float32")
        pk.parallel_for(self.threads, f.pfor)
        pk.execute(pk.ExecutionSpace.Default, w)
Example #22
def run() -> None:
    values: Tuple[int, int, int, int, int, bool] = parse_args()
    N: int = values[0]
    M: int = values[1]
    nrepeat: int = 1 
    print(f"Total size S = {N * M} N = {N} M = {M}")

    y = pk.View([N], pk.double)
    x = pk.View([M], pk.double)
    A = pk.View([N * M], pk.double)

    p = pk.RangePolicy(pk.get_default_space(), 0, N)
    pk.parallel_for(p, y_init, y=y)
    pk.parallel_for(pk.RangePolicy(pk.get_default_space(), 0, M), y_init, y=x)
    pk.parallel_for(p, matrix_init, M=M, A=A)

    timer = pk.Timer()

    for i in range(nrepeat):
        result = pk.parallel_reduce(p, yAx, M=M, y=y, x=x, A=A)

    timer_result = timer.seconds()

    print(f"Computed result for {N} x {M} is {result}")
    solution = N * M

    if result != solution:
        pk.printf("Error: result (%lf) != solution (%lf)\n", result, solution)

    print(f"N({N}) M({M}) nrepeat({nrepeat}) problem(MB) time({timer_result}) bandwidth(GB/s)")
Example #23
def run() -> None:
    values: Tuple[int, int, int, int, int, bool] = parse_args()
    N: int = values[0]
    M: int = values[1]
    nrepeat: int = 100
    print(f"Total size S = {N * M} N = {N} M = {M}")

    p = pk.RangePolicy(pk.get_default_space(), 0, N)
    w = Workload(N, M)
    pk.parallel_for(p, w.y_init)
    pk.parallel_for(pk.RangePolicy(pk.get_default_space(), 0, M), w.x_init)
    pk.parallel_for(p, w.matrix_init)

    timer = pk.Timer()

    for i in range(nrepeat):
        result = pk.parallel_reduce(p, w.yAx)

    timer_result = timer.seconds()

    print(f"Computed result for {N} x {M} is {result}")
    solution = N * M

    if result != solution:
        pk.printf("Error: result (%lf) != solution (%lf)\n", result, solution)

    print(f"N({N}) M({M}) nrepeat({nrepeat}) problem(MB) time({timer_result}) bandwidth(GB/s)")
Example #24
    def run(self):
        pk.parallel_for(self.array_size, self.init_arrays)

        timer = pk.Timer()
        for i in range(self.num_times):
            pk.parallel_for("babel_stream", self.array_size, self.copy)
            pk.fence()
            # self.runtimes[0][i] = timer.seconds()
            # timer.reset()

            pk.parallel_for("babel_stream", self.array_size, self.mul)
            pk.fence()
            # self.runtimes[1][i] = timer.seconds()
            # timer.reset()

            pk.parallel_for("babel_stream", self.array_size, self.add)
            pk.fence()
            pk.parallel_for("babel_stream", self.array_size, self.triad)
            pk.fence()
            self.sum = pk.parallel_reduce("babel_stream", self.array_size,
                                          self.dot)

        self.runtime = timer.seconds()
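The five BabelStream kernels are elementwise one-liners; a sketch of `triad` and `dot` under assumed view names a, b, c and a scalar field:

    @pk.workunit
    def triad(self, i: int):
        # a = b + scalar * c
        self.a[i] = self.b[i] + self.scalar * self.c[i]

    @pk.workunit
    def dot(self, i: int, acc: pk.Acc[pk.double]):
        acc += self.a[i] * self.b[i]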
Example #25
    def run(self):
        pk.parallel_for(self.length, self.init)
        # pk.parallel_for(self.length, lambda i: 0, self.A)
        # pk.parallel_for(self.length, lambda i: 2, self.B)
        # pk.parallel_for(self.length, lambda i: 2, self.C)
        pk.fence()

        timer = pk.Timer()

        for i in range(self.iterations):
            pk.parallel_for("nstream", self.length, self.nstream)

        pk.fence()
        self.nstream_time = timer.seconds()

        # verify correctness
        ar: float = 0
        br: float = 2
        cr: float = 2
        for i in range(self.iterations):
            ar += br + self.scalar * cr

        ar *= self.length

        self.asum = pk.parallel_reduce(self.length,
                                       lambda i, acc: acc + abs(self.A[i]))
        pk.fence()

        epsilon: float = 1.0e-8
        if (abs(ar - self.asum) / self.asum > epsilon):
            pk.printf("ERROR: Failed Validation on output array\n")
        else:
            avgtime: float = self.nstream_time / self.iterations
            nbytes: float = 4.0 * self.length * 4
            pk.printf("Solution validates\n")
            pk.printf("Rate (MB/s): %.2f\n", 1.e-6 * nbytes / avgtime)
            pk.printf("Avg time (ms): %f\n", avgtime / 1.e-3)
Example #26
    def run(self):
        pk.parallel_for(self.N, self.init_y)
        pk.parallel_for(self.M, self.init_x)
        pk.parallel_for(pk.MDRangePolicy([0, 0], [self.N, self.M]),
                        self.init_A)

        timer = pk.Timer()

        for i in range(self.nrepeat):
            self.result = pk.parallel_reduce("mdrange", self.N, self.yAx)

        self.timer_result = timer.seconds()
Example #27
    def run(self):
        pk.parallel_for(self.N, self.y_init)
        # pk.parallel_for(self.N, lambda i : self.y[i] = 1)
        pk.parallel_for(self.M, self.x_init)
        # pk.parallel_for(self.M, lambda i : self.x[i] = 1)
        pk.parallel_for(self.N, self.matrix_init)

        timer = pk.Timer()

        for i in range(self.nrepeat):
            self.result = pk.parallel_reduce("01", self.N, self.yAx)

        self.timer_result = timer.seconds()
Example #28
def run() -> None:
    values: Tuple[int, int, int, int, int, bool] = parse_args()
    N: int = values[0]
    M: int = values[1]
    fill: bool = values[-1]
    nrepeat: int = 100
    print(f"Total size S = {N * M} N = {N} M = {M}")

    pk.set_default_space(pk.ExecutionSpace.Cuda)

    y: pk.View1D = pk.View([N], pk.double)
    x: pk.View1D = pk.View([M], pk.double)
    A: pk.View2D = pk.View([N, M], pk.double)

    p = pk.RangePolicy(pk.get_default_space(), 0, N)
    pk.parallel_for(p, y_init, y=y)
    pk.parallel_for(pk.RangePolicy(pk.get_default_space(), 0, M), y_init, y=x)
    pk.parallel_for(p, matrix_init, M=M, A=A)

    # if fill:
    #     y.fill(1)
    #     x.fill(1)
    #     A.fill(1)
    # else:
    #     for i in range(N):
    #         y[i] = 1

    #     for i in range(M):
    #         x[i] = 1

    #     for j in range(N):
    #         for i in range(M):
    #             A[j][i] = 1

    timer = pk.Timer()

    for i in range(nrepeat):
        result = pk.parallel_reduce(p, yAx, M=M, y=y, x=x, A=A)

    timer_result = timer.seconds()

    print(f"Computed result for {N} x {M} is {result}")
    solution: float = N * M

    if result != solution:
        pk.printf("Error: result (%lf) != solution (%lf)\n", result, solution)

    print(
        f"N({N}) M({M}) nrepeat({nrepeat}) problem(MB) time({timer_result}) bandwidth(GB/s)"
    )
Example #29
    def run(self):
        printf("Initializing Views...\n")
        pk.parallel_for(self.dataCount, self.init_data)
        pk.parallel_for(self.indicesCount, self.init_indices)

        printf("Starting benchmarking...\n")
        pk.fence()

        timer = pk.Timer()
        for i in range(self.repeats):
            # FIXME: randomize indices
            # for i in range(self.indicesCount):
            #     self.indices[i] = random.randrange(self.dataCount)

            if self.use_atomics:
                pk.parallel_for("gups", self.indicesCount,
                                self.run_gups_atomic)
            else:
                pk.parallel_for("gups", self.indicesCount, self.run_gups)

            pk.fence()

        self.gupsTime = timer.seconds()
Example #30
    def run(self):
        pk.parallel_for(
            pk.MDRangePolicy([0, 0], [self.order, self.order],
                             [self.tile_size, self.tile_size]), self.init)
        pk.fence()

        timer = pk.Timer()

        for i in range(self.iterations):
            if self.permute:
                pk.parallel_for(
                    "transpose",
                    pk.MDRangePolicy([0, 0], [self.order, self.order],
                                     [self.tile_size, self.tile_size],
                                     rank=pk.Rank(2, pk.Iterate.Left,
                                                  pk.Iterate.Right)),
                    self.tranpose)
            else:
                pk.parallel_for(
                    "transpose",
                    pk.MDRangePolicy([0, 0], [self.order, self.order],
                                     [self.tile_size, self.tile_size],
                                     rank=pk.Rank(2, pk.Iterate.Right,
                                                  pk.Iterate.Left)),
                    self.tranpose)

        self.transpose_time = timer.seconds()

        self.abserr = pk.parallel_reduce(
            pk.MDRangePolicy([0, 0], [self.order, self.order],
                             [self.tile_size, self.tile_size]),
            self.abserr_reduce)

        pk.printf("%f\n", self.abserr)
        epsilon: float = 1.0e-8
        if (self.abserr > epsilon):
            pk.printf(
                "ERROR: aggregated squared error exceeds threshold %.2f\n",
                self.abserr)
        else:
            pk.printf("Solution validates %2.f\n", self.abserr)