Exemplo n.º 1
0
def run() -> None:
    """Initialise a ``Workload``, time repeated ``yAx`` reductions and report.

    The first two entries returned by ``parse_args()`` are used as the sizes
    N and M.  The reduction is launched 100 times over ``[0, N)``; only the
    value of the last launch is kept and compared against ``N * M``.
    """
    cli_values = parse_args()
    n_len, m_len = cli_values[0], cli_values[1]
    repeat_count = 100
    print(f"Total size S = {n_len * m_len} N = {n_len} M = {m_len}")

    n_policy = pk.RangePolicy(pk.get_default_space(), 0, n_len)
    workload = Workload(n_len, m_len)

    # y and the matrix are initialised over [0, N), x over [0, M).
    pk.parallel_for(n_policy, workload.y_init)
    m_policy = pk.RangePolicy(pk.get_default_space(), 0, m_len)
    pk.parallel_for(m_policy, workload.x_init)
    pk.parallel_for(n_policy, workload.matrix_init)

    # The timer is created immediately before the loop and read right after.
    stopwatch = pk.Timer()
    for _ in range(repeat_count):
        result = pk.parallel_reduce(n_policy, workload.yAx)
    elapsed = stopwatch.seconds()

    print(f"Computed result for {n_len} x {m_len} is {result}")

    expected = n_len * m_len
    if result != expected:
        pk.printf("Error: result (%lf) != solution (%lf)\n", result, expected)

    print(f"N({n_len}) M({m_len}) nrepeat({repeat_count}) problem(MB) time({elapsed}) bandwidth(GB/s)")
Exemplo n.º 2
0
def run() -> None:
    """Allocate the views, run the ``yAx`` reduction once and report.

    N and M are the first two entries returned by ``parse_args()``.  The
    matrix is stored as a flat view of ``N * M`` doubles.  Both vectors are
    initialised with the ``y_init`` workunit (``x`` is passed under the
    ``y`` keyword).
    """
    parsed = parse_args()
    n_len, m_len = parsed[0], parsed[1]
    launches = 1
    print(f"Total size S = {n_len * m_len} N = {n_len} M = {m_len}")

    y_view = pk.View([n_len], pk.double)
    x_view = pk.View([m_len], pk.double)
    a_view = pk.View([n_len * m_len], pk.double)

    over_n = pk.RangePolicy(pk.get_default_space(), 0, n_len)
    pk.parallel_for(over_n, y_init, y=y_view)
    over_m = pk.RangePolicy(pk.get_default_space(), 0, m_len)
    pk.parallel_for(over_m, y_init, y=x_view)
    pk.parallel_for(over_n, matrix_init, M=m_len, A=a_view)

    clock = pk.Timer()
    for _ in range(launches):
        result = pk.parallel_reduce(
            over_n, yAx, M=m_len, y=y_view, x=x_view, A=a_view)
    seconds_taken = clock.seconds()

    print(f"Computed result for {n_len} x {m_len} is {result}")

    # The reduction is checked against N * M.
    reference = n_len * m_len
    if result != reference:
        pk.printf("Error: result (%lf) != solution (%lf)\n", result, reference)

    print(f"N({n_len}) M({m_len}) nrepeat({launches}) problem(MB) time({seconds_taken}) bandwidth(GB/s)")
Exemplo n.º 3
0
def run() -> None:
    """Parse the command line, build a ``Benchmark`` and time its GUPS kernel.

    Options ``--indices``, ``--data`` and ``--repeats`` override the built-in
    sizes (8192, 33554432 and 10) when given a non-zero value; ``--atomics``
    selects the atomic kernel and ``--execution_space`` replaces the OpenMP
    default.  The index view is refilled with random positions on the host
    before every kernel launch, inside the timed region.
    """
    random.seed(1010101)

    cli = argparse.ArgumentParser()
    cli.add_argument("--indices", type=int)
    cli.add_argument("--data", type=int)
    cli.add_argument("--repeats", type=int)
    cli.add_argument("--atomics", action="store_true")
    cli.add_argument("--execution_space", type=str)
    args = cli.parse_args()

    # A missing (None) or zero option falls back to the built-in value.
    indices = args.indices or 8192
    data = args.data or 33554432
    repeats = args.repeats or 10
    use_atomics = args.atomics
    if args.execution_space:
        space = pk.ExecutionSpace(args.execution_space)
    else:
        space = pk.ExecutionSpace.OpenMP

    pk.set_default_space(space)

    bench = Benchmark(indices, data, repeats, use_atomics)
    range_indices = pk.RangePolicy(pk.get_default_space(), 0, indices)
    range_data = pk.RangePolicy(pk.get_default_space(), 0, data)

    atomics_label = 'yes' if use_atomics else 'no'
    banner = (
        "Reports fastest timing per kernel",
        "Creating Views...",
        "Memory Sizes:",
        f"- Elements: {data} ({1e-6*data*8} MB)",
        f"- Indices: {indices} ({1e-6*indices*8} MB)",
        f"- Atomics: {atomics_label}",
        f"Benchmark kernels will be performed for {repeats} iterations",
    )
    for line in banner:
        print(line)

    print("Initializing Views...")
    pk.parallel_for(range_data, bench.init_data)
    pk.parallel_for(range_indices, bench.init_indices)

    print("Starting benchmarking...")

    timer = pk.Timer()
    for _ in range(repeats):
        # Host-side refill of the index view with random positions in
        # [0, data).
        for slot in range(indices):
            bench.indices[slot] = random.randrange(data)

        pk.parallel_for(
            range_indices,
            bench.run_gups_atomic if use_atomics else bench.run_gups)

    gups_seconds = timer.seconds()
    print(f"GUP/s Random: {1e-9 * repeats * indices / gups_seconds}")
    print(bench.data)
Exemplo n.º 4
0
def run() -> None:
    """Run the ``yAx`` reduction 100 times on the Cuda space and report.

    N and M are the first two entries returned by ``parse_args()``; the
    matrix is a 2D ``[N, M]`` view of doubles.  Both vectors are initialised
    with the ``y_init`` workunit (``x`` is passed under the ``y`` keyword).
    """
    parsed = parse_args()
    n_len, m_len = parsed[0], parsed[1]
    # The last parsed value used to select a host-side initialisation path
    # (View.fill versus element-wise loops writing 1).  That path is
    # disabled, so the flag is read but currently unused.
    fill = parsed[-1]
    repeat_count = 100
    print(f"Total size S = {n_len * m_len} N = {n_len} M = {m_len}")

    pk.set_default_space(pk.ExecutionSpace.Cuda)

    y_view = pk.View([n_len], pk.double)
    x_view = pk.View([m_len], pk.double)
    a_view = pk.View([n_len, m_len], pk.double)

    n_policy = pk.RangePolicy(pk.get_default_space(), 0, n_len)
    pk.parallel_for(n_policy, y_init, y=y_view)
    m_policy = pk.RangePolicy(pk.get_default_space(), 0, m_len)
    pk.parallel_for(m_policy, y_init, y=x_view)
    pk.parallel_for(n_policy, matrix_init, M=m_len, A=a_view)

    # Keyword arguments handed to every reduction launch.
    reduce_kwargs = dict(M=m_len, y=y_view, x=x_view, A=a_view)

    stopwatch = pk.Timer()
    for _ in range(repeat_count):
        result = pk.parallel_reduce(n_policy, yAx, **reduce_kwargs)
    elapsed = stopwatch.seconds()

    print(f"Computed result for {n_len} x {m_len} is {result}")

    expected = n_len * m_len
    if result != expected:
        pk.printf("Error: result (%lf) != solution (%lf)\n", result, expected)

    summary = f"N({n_len}) M({m_len}) nrepeat({repeat_count}) problem(MB) time({elapsed}) bandwidth(GB/s)"
    print(summary)
Exemplo n.º 5
0
    def setUp(self):
        """Build the scan functor and a range policy over ``[0, threads)``."""
        # 50 work items; 7 is the value handed to the functor.
        self.threads, self.value = 50, 7

        self.functor = Add1DTestScanFunctor(self.threads, self.value)
        default_space = pk.ExecutionSpace.Default
        self.range_policy = pk.RangePolicy(default_space, 0, self.threads)
Exemplo n.º 6
0
    def test_dynamic2D(self):
        """Reducing ``dynamic2D`` over ``[0, i_2)`` must give ``i_4 * i_1 * i_2``."""
        wanted = self.i_4 * self.i_1 * self.i_2

        policy = pk.RangePolicy(pk.ExecutionSpace.Default, 0, self.i_2)
        reduced = pk.parallel_reduce(policy, self.functor.dynamic2D)

        self.assertEqual(wanted, reduced)
Exemplo n.º 7
0
def run() -> None:
    """Run the nstream benchmark and validate the reduced result.

    Command line: ``iterations length [offset]`` (``offset`` defaults to 0).

    The ``nstream`` workunit is launched ``iterations`` times over
    ``[0, length)``.  The value returned by the ``res_reduce`` reduction is
    then compared against a reference computed on the host, and the transfer
    rate and average time per iteration are printed when the two agree.

    Raises:
        SystemExit: with an error message if ``iterations < 1``,
            ``length <= 0`` or ``offset < 0``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('iterations', type=int)
    parser.add_argument('length', type=int)
    parser.add_argument('offset', nargs='?', type=int, default=0)
    args = parser.parse_args()
    iterations = args.iterations
    length = args.length
    offset = args.offset
    scalar = 3

    if iterations < 1:
        sys.exit("ERROR: iterations must be >= 1")

    if length <= 0:
        sys.exit("ERROR: vector length must be positive")

    # This check used to repeat ``length <= 0`` (emulating the cpp example),
    # which made it unreachable and let negative offsets through.  Validate
    # the offset, as the message says.
    if offset < 0:
        sys.exit("ERROR: offset must be nonnegative")

    print("Number of iterations = ", iterations)
    print("Vector length        = ", length)
    print("Offset               = ", offset)

    p = pk.RangePolicy(pk.ExecutionSpace.OpenMP, 0, length)
    w = Workload(iterations, length, offset, scalar)

    pk.parallel_for(p, w.init_views)
    # pk.fence()

    timer = pk.Timer()

    for _ in range(iterations):
        pk.parallel_for(p, w.nstream)

    # pk.fence()
    nstream_time = timer.seconds()

    # verify correctness: replay the per-element update on the host, then
    # scale by the vector length to get the expected reduction value.
    ar: float = 0
    br: float = 2
    cr: float = 2
    for _ in range(iterations):
        ar += br + scalar * cr

    ar *= length

    asum = pk.parallel_reduce(p, w.res_reduce)
    # pk.fence()

    epsilon: float = 1.0e-8
    # The reference value is strictly positive here, so a zero sum is always
    # a validation failure; testing it first also avoids dividing by zero.
    if asum == 0 or abs(ar - asum) / asum > epsilon:
        print("ERROR: Failed Valication on output array")
    else:
        avgtime: float = nstream_time / iterations
        # NOTE(review): the trailing factor 4 is presumably the element size
        # in bytes; confirm it against the dtype of the views in Workload.
        nbytes: float = 4.0 * length * 4
        print("Solution validates")
        print("Rate (MB/s): %.2f" % (1.e-6 * nbytes / avgtime))
        print("Avg time (ms): %f" % (avgtime / 1.e-3))
Exemplo n.º 8
0
    def setUp(self):
        """Build the reduce functor and a range policy over ``[0, threads)``."""
        self.threads = 50
        self.i_1, self.i_2 = 7, 2
        self.b_1, self.b_2 = False, True

        ctor_args = (self.threads, self.i_1, self.i_2, self.b_1, self.b_2)
        self.functor = ASTTestReduceFunctor(*ctor_args)
        self.range_policy = pk.RangePolicy(
            pk.ExecutionSpace.Default, 0, self.threads)
Exemplo n.º 9
0
    def setUp(self):
        """Build the views functor and a range policy over ``[0, threads)``."""
        self.threads = 10
        # Four integer parameters passed on to the functor.
        self.i_1, self.i_2, self.i_3, self.i_4 = 10, 15, 20, 10

        self.functor = ViewsTestFunctor(
            self.threads, self.i_1, self.i_2, self.i_3, self.i_4)
        space = pk.ExecutionSpace.Default
        self.range_policy = pk.RangePolicy(space, 0, self.threads)
Exemplo n.º 10
0
    def setUp(self):
        """Build the atomics functor and a single-item range policy."""
        self.threads = 1
        self.i_1, self.i_2 = 5, 2
        self.f_1, self.f_2 = 7.0, 3.0

        functor_args = (self.threads, self.i_1, self.i_2, self.f_1, self.f_2)
        self.functor = AtomicsTestFunctor(*functor_args)
        self.range_policy = pk.RangePolicy(
            pk.ExecutionSpace.Default, 0, self.threads)
Exemplo n.º 11
0
    def setUp(self):
        """Build the classtypes functor and a range policy over ``[0, threads)``."""
        self.threads = 50
        self.i_1, self.i_2 = 5, 1
        self.f_1 = 5.5
        self.b_1 = True

        self.functor = ClasstypesTestFunctor(
            self.threads, self.i_1, self.i_2, self.f_1, self.b_1)
        default_space = pk.ExecutionSpace.Default
        self.range_policy = pk.RangePolicy(default_space, 0, self.threads)
Exemplo n.º 12
0
    def setUp(self):
        """Build the Kokkos-functions functor and a range policy over ``[0, threads)``."""
        self.threads = 50
        self.i_1, self.i_2 = 7, 2
        self.f_1, self.f_2 = 5.5, 1.3
        self.b_1 = True

        functor_args = (self.threads, self.i_1, self.i_2,
                        self.f_1, self.f_2, self.b_1)
        self.functor = KokkosFunctionsTestReduceFunctor(*functor_args)
        self.range_policy = pk.RangePolicy(
            pk.ExecutionSpace.Default, 0, self.threads)
Exemplo n.º 13
0
    space = pk.ExecutionSpace.OpenMP
    if args.execution_space:
        space = pk.ExecutionSpace(args.execution_space)

    pk.set_default_space(space)

    N = args.N
    K = args.K
    D = args.D
    R = args.R
    U = args.U
    F = args.F
    scalar_size = 8

    policy = pk.RangePolicy(pk.get_default_space(), 0, N)
    w = Benchmark_double_8(N, K, D, R, F)

    timer = pk.Timer()
    for r in range(R):
        pk.parallel_for(policy, w.benchmark)
        pk.fence()

    seconds = timer.seconds()

    num_bytes = 1.0 * N * K * R * (2 * scalar_size + 4) + N * R * scalar_size
    flops = 1.0 * N * K * R * (F * 2 * U + 2 * (U - 1))
    gather_ops = 1.0 * N * K * R * 2
    seconds = seconds
    print(
        f"SNKDRUF: {scalar_size/4} {N} {K} {D} {R} {U} {F} Time: {seconds} " +
Exemplo n.º 14
0
import pykokkos as pk


# Functor that owns a 1D int32 view of length N.  Its workunits first fill
# the view with the element indices and then scan it in place.
@pk.functor
class Workload:
    def __init__(self, N: int):
        # A holds N int32 elements and is indexed by work item in the
        # workunits below.
        self.A: pk.View1D[pk.int32] = pk.View([N], pk.int32)

    # Initialisation workunit: element i receives its own index.
    @pk.workunit
    def init(self, i: int):
        self.A[i] = i

    # Scan workunit: `acc` is the running sum and `last_pass` tells the
    # workunit when the accumulated value should be written back.
    @pk.workunit
    def scan(self, i: int, acc: pk.Acc[pk.double], last_pass: bool):
        acc += self.A[i]
        # The write happens after A[i] has been added, so the stored value
        # includes element i itself.
        if last_pass:
            # NOTE(review): acc is declared as pk.double while A holds
            # pk.int32, so this store narrows the value; confirm that the
            # accumulator type is intended.
            self.A[i] = acc


def _scan_demo() -> None:
    """Fill a 10-element ``Workload``, scan it and print view, total and time."""
    length = 10
    workload = Workload(length)
    policy = pk.RangePolicy(pk.ExecutionSpace.OpenMP, 0, length)
    pk.parallel_for(policy, workload.init)

    # Only the scan launch sits between timer creation and the reading.
    stopwatch = pk.Timer()
    total = pk.parallel_scan(policy, workload.scan)
    elapsed = stopwatch.seconds()

    print(f"{workload.A} total={total} time({elapsed})")


if __name__ == "__main__":
    _scan_demo()
Exemplo n.º 15
0
    parser.add_argument("-n", "--numtimes", type=int, help="Run the test NUM times (NUM >= 2)")
    parser.add_argument("-space", "--execution_space", type=str)
    args = parser.parse_args()

    # Values given on the command line replace the current settings
    # (presumably defaults assigned earlier in this function).
    if args.arraysize:
        array_size = args.arraysize
    if args.numtimes:
        num_times = args.numtimes
    if args.execution_space:
        # Build the space from the parsed option.  Converting the existing
        # `space` value instead silently ignored what the user passed.
        space = pk.ExecutionSpace(args.execution_space)

    a: pk.View1D[pk.double] = pk.View([array_size], pk.double)
    b: pk.View1D[pk.double] = pk.View([array_size], pk.double)
    c: pk.View1D[pk.double] = pk.View([array_size], pk.double)

    p = pk.RangePolicy(space, 0, array_size)
    pk.parallel_for(p, init_arrays, a=a, b=b, c=c, initA=startA, initB=startB, initC=startC)

    timer = pk.Timer()
    timings = [[] for i in range(5)]
    for i in range(num_times):
        pk.parallel_for(p, copy, a=a, c=c)
        timings[0].append(timer.seconds())
        timer.reset()

        pk.parallel_for(p, mul, b=b, scalar=scalar, c=c)
        timings[1].append(timer.seconds())
        timer.reset()

        pk.parallel_for(p, add, a=a, b=b, c=c)
        timings[2].append(timer.seconds())
Exemplo n.º 16
0
    def create_lattice(self, comm: Comm) -> None:
        """Fill ``self.system`` with lattice atoms and initial velocities.

        Depending on ``self.lattice_style`` the atoms are laid out by the
        ``LATTICE_SC`` branch (one site per lattice cell) or by the
        ``LATTICE_FCC`` branch (four basis sites per cell).  Both branches
        set the domain extents from the lattice constant and the cell
        counts, ask ``comm`` for a domain decomposition, count the local
        atoms and grow the system storage.

        Afterwards velocities are produced by ``init_v``, the mass-weighted
        mean velocity is subtracted, and all velocities are rescaled by
        ``sqrt(temperature_target / T)`` where ``T`` is the value returned
        by ``Temperature.compute``.

        Args:
            comm: Communication object; used here for the domain
                decomposition and for the ``reduce_*`` / ``scan_int`` calls.
        """
        s: System = copy.deepcopy(self.system)

        if self.lattice_style == LatticeType.LATTICE_SC.value:
            # Global domain extent = lattice constant * number of cells.
            self.system.domain_x = self.lattice_constant * self.lattice_nx
            self.system.domain_y = self.lattice_constant * self.lattice_ny
            self.system.domain_z = self.lattice_constant * self.lattice_nz

            comm.create_domain_decomposition()
            # Refresh the working copy after the decomposition call.
            s = copy.deepcopy(self.system)

            # Cell index range covering this sub-domain, widened by half a
            # cell on each side before flooring.
            ix_start: int = math.floor(s.sub_domain_lo_x / s.domain_x *
                                       self.lattice_nx - 0.5)
            iy_start: int = math.floor(s.sub_domain_lo_y / s.domain_y *
                                       self.lattice_ny - 0.5)
            iz_start: int = math.floor(s.sub_domain_lo_z / s.domain_z *
                                       self.lattice_nz - 0.5)

            ix_end: int = math.floor(s.sub_domain_hi_x / s.domain_x *
                                     self.lattice_nx + 0.5)
            iy_end: int = math.floor(s.sub_domain_hi_y / s.domain_y *
                                     self.lattice_ny + 0.5)
            iz_end: int = math.floor(s.sub_domain_hi_z / s.domain_z *
                                     self.lattice_nz + 0.5)

            n: int = 0

            # Counting pass: count the lattice sites whose coordinates lie
            # inside this sub-domain (lo inclusive, hi exclusive).
            for iz in range(iz_start, iz_end + 1):
                ztmp: float = (self.lattice_constant *
                               (iz + self.lattice_offset_z))

                for iy in range(iy_start, iy_end + 1):
                    ytmp: float = (self.lattice_constant *
                                   (iy + self.lattice_offset_y))

                    for ix in range(ix_start, ix_end + 1):
                        xtmp: float = (self.lattice_constant *
                                       (ix + self.lattice_offset_x))

                        if (xtmp >= s.sub_domain_lo_x
                                and ytmp >= s.sub_domain_lo_y
                                and ztmp >= s.sub_domain_lo_z
                                and xtmp < s.sub_domain_hi_x
                                and ytmp < s.sub_domain_hi_y
                                and ztmp < s.sub_domain_hi_z):
                            n += 1

            self.system.N_local = n
            self.system.N = n
            self.system.grow(n)

            s = copy.deepcopy(self.system)

            # NOTE(review): this pass repeats the counting loop without
            # resetting n, so n holds the sum of both passes when grow() is
            # called again below. Confirm the larger allocation is intended.
            for iz in range(iz_start, iz_end + 1):
                ztmp: float = (self.lattice_constant *
                               (iz + self.lattice_offset_z))

                for iy in range(iy_start, iy_end + 1):
                    ytmp: float = (self.lattice_constant *
                                   (iy + self.lattice_offset_y))

                    for ix in range(ix_start, ix_end + 1):
                        xtmp: float = (self.lattice_constant *
                                       (ix + self.lattice_offset_x))

                        if (xtmp >= s.sub_domain_lo_x
                                and ytmp >= s.sub_domain_lo_y
                                and ztmp >= s.sub_domain_lo_z
                                and xtmp < s.sub_domain_hi_x
                                and ytmp < s.sub_domain_hi_y
                                and ztmp < s.sub_domain_hi_z):
                            n += 1

            self.system.grow(n)
            s = copy.deepcopy(self.system)
            n = 0

            # Fill pass: store position, a random type in [0, ntypes - 1]
            # and a 1-based id for every (ix, iy, iz) in the index ranges.
            # NOTE(review): unlike the counting passes there is no
            # sub-domain test here, and `s` is a deep copy of self.system.
            # Confirm the number of writes fits the grown storage and that
            # the writes reach the live system.
            for iz in range(iz_start, iz_end + 1):
                ztmp: float = (self.lattice_constant *
                               (iz + self.lattice_offset_z))

                for iy in range(iy_start, iy_end + 1):
                    ytmp: float = (self.lattice_constant *
                                   (iy + self.lattice_offset_y))

                    for ix in range(ix_start, ix_end + 1):
                        xtmp: float = (self.lattice_constant *
                                       (ix + self.lattice_offset_x))

                        s.x[n][0] = xtmp
                        s.x[n][1] = ytmp
                        s.x[n][2] = ztmp
                        s.type[n] = random.randint(0, s.ntypes - 1)
                        s.id[n] = n + 1
                        n += 1

            # NOTE(review): the return values of the comm.* calls are not
            # used and the callee cannot rebind these ints, so N and
            # N_local_offset are unchanged by these calls as written.
            # Confirm how results are meant to be returned.
            comm.reduce_int(self.system.N, 1)

            N_local_offset: int = n
            comm.scan_int(N_local_offset, 1)
            # Shift the local ids by (N_local_offset - n).
            for i in range(n):
                s.id[i] += N_local_offset - n

            if self.system.do_print:
                print(f"Atoms: {self.system.N} {self.system.N_local}")

        if self.lattice_style == LatticeType.LATTICE_FCC.value:
            # Global domain extent = lattice constant * number of cells.
            self.system.domain_x = self.lattice_constant * self.lattice_nx
            self.system.domain_y = self.lattice_constant * self.lattice_ny
            self.system.domain_z = self.lattice_constant * self.lattice_nz

            comm.create_domain_decomposition()
            s = copy.deepcopy(self.system)

            # Four basis sites per cell, in units of the lattice constant.
            basis: List[List[float]] = [[0.0, 0.0, 0.0], [0.5, 0.5, 0.0],
                                        [0.5, 0.0, 0.5], [0.0, 0.5, 0.5]]
            # Copy the basis into a 4x3 double view.
            basis_view = pk.View([4, 3], pk.double)
            for i in range(4):
                basis_view[i][0] = basis[i][0]
                basis_view[i][1] = basis[i][1]
                basis_view[i][2] = basis[i][2]

            # Shift every basis site by the configured lattice offset.
            for i in range(4):
                basis_view[i][0] += self.lattice_offset_x
                basis_view[i][1] += self.lattice_offset_y
                basis_view[i][2] += self.lattice_offset_z

            # NOTE(review): unconditional print that is not guarded by
            # do_print; looks like leftover debugging output.
            print(f"{s.sub_domain_lo_x} {s.domain_x} {self.lattice_nx} - 0.5")
            # Cell index range covering this sub-domain (same formula as in
            # the SC branch).
            ix_start: int = math.floor(s.sub_domain_lo_x / s.domain_x *
                                       self.lattice_nx - 0.5)
            iy_start: int = math.floor(s.sub_domain_lo_y / s.domain_y *
                                       self.lattice_ny - 0.5)
            iz_start: int = math.floor(s.sub_domain_lo_z / s.domain_z *
                                       self.lattice_nz - 0.5)

            ix_end: int = math.floor(s.sub_domain_hi_x / s.domain_x *
                                     self.lattice_nx + 0.5)
            iy_end: int = math.floor(s.sub_domain_hi_y / s.domain_y *
                                     self.lattice_ny + 0.5)
            iz_end: int = math.floor(s.sub_domain_hi_z / s.domain_z *
                                     self.lattice_nz + 0.5)

            # The local atom count comes from a parallel reduction of
            # init_system.get_n (defined elsewhere).
            # NOTE(review): the policy starts at iz_start + 1, whereas the
            # SC loops start at iz_start; confirm against get_n.
            init_s = init_system(s, ix_start, ix_end, iy_start, iy_end,
                                 iz_start, iz_end, self.lattice_constant,
                                 basis_view)
            n: int = pk.parallel_reduce(
                "init_s", pk.RangePolicy(pk.Default, iz_start + 1, iz_end + 1),
                init_s.get_n)

            # n: int = calculate_n(ix_start, ix_end, iy_start, iy_end, iz_start, iz_end,
            #                      self.lattice_constant, np.array(basis),
            #                      s.sub_domain_lo_x, s.sub_domain_lo_y, s.sub_domain_lo_z,
            #                      s.sub_domain_hi_x, s.sub_domain_hi_y, s.sub_domain_hi_z)

            self.system.N_local = n
            self.system.N = n

            # Instead of calling it get_n twice, multiply by 2 (unlike c++ version)
            n *= 2
            self.system.grow(n)
            # From here on `s` is the live system, not a copy.
            s = self.system

            # init_x (defined elsewhere) receives the raw arrays of x, type
            # and id; its return value replaces n.
            n: int = init_x(ix_start, ix_end, iy_start, iy_end, iz_start,
                            iz_end, self.lattice_constant, basis_view.data,
                            s.sub_domain_lo_x, s.sub_domain_lo_y,
                            s.sub_domain_lo_z, s.sub_domain_hi_x,
                            s.sub_domain_hi_y, s.sub_domain_hi_z, s.x.data,
                            s.type.data, s.id.data, s.ntypes)

            # NOTE(review): as in the SC branch, the results of scan_int /
            # reduce_int are not captured, so to_add is 0 as written.
            N_local_offset: int = n
            comm.scan_int(N_local_offset, 1)
            to_add: int = N_local_offset - n
            s.id.data += to_add

            comm.reduce_int(self.system.N, 1)

            if self.system.do_print:
                print(f"Atoms: {self.system.N} {self.system.N_local}")

        # Velocity initialisation works on the live system.
        s = self.system
        total_mass: float = 0.0
        total_momentum_x: float = 0.0
        total_momentum_y: float = 0.0
        total_momentum_z: float = 0.0

        # Split the 32-bit seed into four little-endian bytes and map each
        # byte with its top bit set to the negative range (b - 256).
        ibase: int = self.temperature_seed
        ibase &= 0xffffffff
        ibase_bin = list(ibase.to_bytes(4, "little"))
        for i in range(4):
            if (ibase_bin[i] & (1 << 7)):
                ibase_bin[i] -= 1 << 8

        # NOTE(review): the block below recomputes ibase_bin exactly as the
        # block above does; it looks like a redundant duplicate.
        ibase &= 0xffffffff
        ibase_bin = list(ibase.to_bytes(4, "little"))
        for i in range(4):
            if (ibase_bin[i] & (1 << 7)):
                ibase_bin[i] -= 1 << 8

        # Mix the four byte values into one value; every step is masked to
        # 32 bits.
        hash_uint: int = 0
        for i in ibase_bin:
            hash_uint += i
            hash_uint &= 0xFFFFFFFF
            hash_uint += hash_uint << 10
            hash_uint &= 0xFFFFFFFF
            hash_uint ^= hash_uint >> 6
            hash_uint &= 0xFFFFFFFF

        # Reinterpret the raw bytes of x as np.byte values, 8 per element
        # (the reshape assumes 8-byte elements), and widen them to int.
        x_bytes = np.reshape(np.frombuffer(s.x.data.tobytes(), dtype=np.byte),
                             (s.x.shape[0], s.x.shape[1], 8)).astype(int)
        # init_v (defined elsewhere) is handed the velocity, mass and type
        # arrays and returns the total mass and the three momentum sums.
        total_mass, total_momentum_x, total_momentum_y, total_momentum_z = init_v(
            hash_uint, x_bytes, s.v.data, s.mass.data, s.type.data,
            self.system.N_local)

        s.q.fill(0.0)

        # NOTE(review): the return values of reduce_float are discarded, so
        # the local totals are used unchanged below.
        comm.reduce_float(total_momentum_x, 1)
        comm.reduce_float(total_momentum_y, 1)
        comm.reduce_float(total_momentum_z, 1)
        comm.reduce_float(total_mass, 1)

        # Mean velocity = total momentum / total mass; subtract it from every
        # velocity row.
        system_vx: float = total_momentum_x / total_mass
        system_vy: float = total_momentum_y / total_mass
        system_vz: float = total_momentum_z / total_mass

        system_v = np.array([system_vx, system_vy, system_vz])
        s.v.data -= system_v

        temp = Temperature(comm)
        T: float = temp.compute(self.system)

        # Rescale the velocities by sqrt(temperature_target / T).
        T_init_scale: float = math.sqrt(self.temperature_target / T)

        s.v.data *= T_init_scale
Exemplo n.º 17
0
        indices = args.indices
    if args.data:
        data = args.data
    if args.repeats:
        repeats = args.repeats
    use_atomics = args.atomics
    if args.execution_space:
        space = pk.ExecutionSpace(args.execution_space)

    pk.set_default_space(space)

    indices_view: pk.View1D[pk.int64] = pk.View([indices], pk.int64)
    data_view: pk.View1D[pk.int64] = pk.View([data], pk.int64)
    datum: pk.int64 = -1

    range_indices = pk.RangePolicy(pk.get_default_space(), 0, indices)
    range_data = pk.RangePolicy(pk.get_default_space(), 0, data)

    print("Reports fastest timing per kernel")
    print("Creating Views...")
    print("Memory Sizes:")
    print(f"- Elements: {data} ({1e-6*data*8} MB)")
    print(f"- Indices: {indices} ({1e-6*indices*8} MB)")
    print(f"- Atomics: {'yes' if use_atomics else 'no'}")
    print(f"Benchmark kernels will be performed for {repeats} iterations")

    print("Initializing Views...")
    pk.parallel_for(range_data, init_data, data=data_view)
    pk.parallel_for(range_indices, init_indices, indices=indices_view)

    print("Starting benchmarking...")