示例#1
0
def run() -> None:
    """Run the flat-matrix variant of the `yAx` example and report the outcome.

    N and M come from the first two entries returned by `parse_args()`.
    Three 1-D views are allocated -- `y` (N entries), `x` (M entries) and `A`
    (N * M entries) -- and initialised with the `y_init` / `matrix_init`
    kernels.  The `yAx` reduction is then launched `nrepeat` times under a
    timer, and the last result is compared against N * M.
    """
    parsed: Tuple[int, int, int, int, int, bool] = parse_args()
    N: int = parsed[0]
    M: int = parsed[1]
    nrepeat: int = 1
    print(f"Total size S = {N * M} N = {N} M = {M}")

    # A is held as a single flat view of N * M entries.
    y = pk.View([N], pk.double)
    x = pk.View([M], pk.double)
    A = pk.View([N * M], pk.double)

    over_rows = pk.RangePolicy(pk.get_default_space(), 0, N)
    over_cols = pk.RangePolicy(pk.get_default_space(), 0, M)

    # `y_init` doubles as the initialiser for x by binding x to its `y` argument.
    pk.parallel_for(over_rows, y_init, y=y)
    pk.parallel_for(over_cols, y_init, y=x)
    pk.parallel_for(over_rows, matrix_init, M=M, A=A)

    timer = pk.Timer()
    for _ in range(nrepeat):
        result = pk.parallel_reduce(over_rows, yAx, M=M, y=y, x=x, A=A)
    elapsed = timer.seconds()

    print(f"Computed result for {N} x {M} is {result}")

    expected = N * M
    if result != expected:
        pk.printf("Error: result (%lf) != solution (%lf)\n", result, expected)

    # Only the labels for problem size and bandwidth are printed; no values
    # are computed for them in this function.
    print(f"N({N}) M({M}) nrepeat({nrepeat}) problem(MB) time({elapsed}) bandwidth(GB/s)")
示例#2
0
文件: 01.py 项目: kokkos/pykokkos
def run() -> None:
    """Run the `Workload`-based `yAx` example 100 times and report the outcome.

    N and M come from the first two entries returned by `parse_args()`.
    A `Workload(N, M)` object supplies the initialisation kernels
    (`y_init`, `x_init`, `matrix_init`) and the `yAx` reduction.  The
    reduction is launched `nrepeat` times under a timer and the last result
    is compared against N * M.
    """
    parsed: Tuple[int, int, int, int, int, bool] = parse_args()
    N: int = parsed[0]
    M: int = parsed[1]
    nrepeat: int = 100
    print(f"Total size S = {N * M} N = {N} M = {M}")

    over_rows = pk.RangePolicy(pk.get_default_space(), 0, N)
    workload = Workload(N, M)

    # y and the matrix are initialised over [0, N); x over [0, M).
    pk.parallel_for(over_rows, workload.y_init)
    over_cols = pk.RangePolicy(pk.get_default_space(), 0, M)
    pk.parallel_for(over_cols, workload.x_init)
    pk.parallel_for(over_rows, workload.matrix_init)

    timer = pk.Timer()
    for _ in range(nrepeat):
        result = pk.parallel_reduce(over_rows, workload.yAx)
    elapsed = timer.seconds()

    print(f"Computed result for {N} x {M} is {result}")

    expected = N * M
    if result != expected:
        pk.printf("Error: result (%lf) != solution (%lf)\n", result, expected)

    # Only the labels for problem size and bandwidth are printed; no values
    # are computed for them in this function.
    print(f"N({N}) M({M}) nrepeat({nrepeat}) problem(MB) time({elapsed}) bandwidth(GB/s)")
示例#3
0
文件: gups.py 项目: kokkos/pykokkos
def run() -> None:
    """Parse the GUPS options, run the random-update kernel and print its rate.

    Options `--indices`, `--data` and `--repeats` override the built-in
    defaults (8192, 33554432 and 10); `--atomics` selects the atomic kernel
    and `--execution_space` replaces the OpenMP default.  After initialising
    the views held by a `Benchmark` object, the update kernel is launched
    `repeats` times, with the index view refilled from Python's `random`
    module before every launch.
    """
    random.seed(1010101)

    parser = argparse.ArgumentParser()
    parser.add_argument("--indices", type=int)
    parser.add_argument("--data", type=int)
    parser.add_argument("--repeats", type=int)
    parser.add_argument("--atomics", action="store_true")
    parser.add_argument("--execution_space", type=str)
    args = parser.parse_args()

    # An option that is absent (None) or zero falls back to the default.
    indices = args.indices or 8192
    data = args.data or 33554432
    repeats = args.repeats or 10
    use_atomics = args.atomics
    if args.execution_space:
        space = pk.ExecutionSpace(args.execution_space)
    else:
        space = pk.ExecutionSpace.OpenMP

    pk.set_default_space(space)

    w = Benchmark(indices, data, repeats, use_atomics)
    range_indices = pk.RangePolicy(pk.get_default_space(), 0, indices)
    range_data = pk.RangePolicy(pk.get_default_space(), 0, data)

    atomics_label = 'yes' if use_atomics else 'no'
    print("Reports fastest timing per kernel")
    print("Creating Views...")
    print("Memory Sizes:")
    print(f"- Elements: {data} ({1e-6*data*8} MB)")
    print(f"- Indices: {indices} ({1e-6*indices*8} MB)")
    print(f"- Atomics: {atomics_label}")
    print(f"Benchmark kernels will be performed for {repeats} iterations")

    print("Initializing Views...")
    pk.parallel_for(range_data, w.init_data)
    pk.parallel_for(range_indices, w.init_indices)

    print("Starting benchmarking...")

    # The timed region covers both the host-side index refresh and the
    # kernel launches.
    timer = pk.Timer()
    for _ in range(repeats):
        for slot in range(indices):
            w.indices[slot] = random.randrange(data)

        kernel = w.run_gups_atomic if use_atomics else w.run_gups
        pk.parallel_for(range_indices, kernel)

    gupsTime = timer.seconds()
    print(f"GUP/s Random: {1e-9 * repeats * indices / gupsTime}")
    print(w.data)
示例#4
0
文件: 04.py 项目: kokkos/pykokkos
def run() -> None:
    """Run the 2-D-matrix `yAx` example on the Cuda execution space.

    N and M come from the first two entries returned by `parse_args()`.
    The default execution space is set to Cuda, views `y` (N), `x` (M) and
    `A` (N x M) are allocated and initialised with the `y_init` /
    `matrix_init` kernels, and the `yAx` reduction is launched `nrepeat`
    times under a timer.  The last result is compared against N * M.
    """
    parsed: Tuple[int, int, int, int, int, bool] = parse_args()
    N: int = parsed[0]
    M: int = parsed[1]
    # Read from the parsed arguments but not used in this variant: the views
    # are always initialised by the kernels below rather than filled from the
    # host.
    fill: bool = parsed[-1]
    nrepeat: int = 100
    print(f"Total size S = {N * M} N = {N} M = {M}")

    pk.set_default_space(pk.ExecutionSpace.Cuda)

    y: pk.View1D = pk.View([N], pk.double)
    x: pk.View1D = pk.View([M], pk.double)
    A: pk.View2D = pk.View([N, M], pk.double)

    over_rows = pk.RangePolicy(pk.get_default_space(), 0, N)
    over_cols = pk.RangePolicy(pk.get_default_space(), 0, M)

    # `y_init` doubles as the initialiser for x by binding x to its `y` argument.
    pk.parallel_for(over_rows, y_init, y=y)
    pk.parallel_for(over_cols, y_init, y=x)
    pk.parallel_for(over_rows, matrix_init, M=M, A=A)

    timer = pk.Timer()
    for _ in range(nrepeat):
        result = pk.parallel_reduce(over_rows, yAx, M=M, y=y, x=x, A=A)
    elapsed = timer.seconds()

    print(f"Computed result for {N} x {M} is {result}")

    expected: float = N * M
    if result != expected:
        pk.printf("Error: result (%lf) != solution (%lf)\n", result, expected)

    # Only the labels for problem size and bandwidth are printed; no values
    # are computed for them in this function.
    summary = f"N({N}) M({M}) nrepeat({nrepeat}) problem(MB) time({elapsed}) bandwidth(GB/s)"
    print(summary)
示例#5
0
def run() -> None:
    """Run the team-policy `yAx` example via a `Workload` object and report it.

    N, M, E and the fill flag are taken from `parse_args()` (entries 0, 1, 3
    and the last one).  `Workload(N, M, E, fill)` provides the `yAx`
    reduction, which is launched `nrepeat` times under a timer with a
    `pk.TeamPolicy`.  The last result is compared against N * M * E.
    """
    parsed: Tuple[int, int, int, int, int, bool] = parse_args()
    N: int = parsed[0]
    M: int = parsed[1]
    E: int = parsed[3]
    fill: bool = parsed[-1]
    nrepeat: int = 1000
    print(f"Total size S = {N * M} N = {N} M = {M} E = {E}")

    workload = Workload(N, M, E, fill)
    # Presumably league size E, automatic team size and vector length 32 --
    # confirm against the pk.TeamPolicy signature.
    team_policy = pk.TeamPolicy(E, "auto", 32, pk.get_default_space())

    timer = pk.Timer()
    for _ in range(nrepeat):
        result = pk.parallel_reduce(team_policy, workload.yAx)
    elapsed = timer.seconds()

    print(f"Computed result for {N} x {M} x {E} is {result}")

    expected: float = N * M * E
    if result != expected:
        pk.printf("Error: result (%lf) != solution (%lf)\n", result, expected)

    # Only the labels for problem size and bandwidth are printed; no values
    # are computed for them in this function.
    summary = f"N({N}) M({M}) E({E}) nrepeat({nrepeat}) problem(MB) time({elapsed}) bandwidth(GB/s)"
    print(summary)
示例#6
0
def run() -> None:
    """Run the team-policy `yAx` example on explicitly allocated views.

    N, M, E and the fill flag are taken from `parse_args()` (entries 0, 1, 3
    and the last one).  Views `y` (E x N), `x` (E x M) and `A` (E x N x M)
    are created with `LayoutRight` and set to 1 everywhere, either through
    `View.fill` or element by element on the host.  The `yAx` reduction is
    launched `nrepeat` times under a timer and the last result is compared
    against N * M * E.
    """
    parsed: Tuple[int, int, int, int, int, bool] = parse_args()
    N: int = parsed[0]
    M: int = parsed[1]
    E: int = parsed[3]
    fill: bool = parsed[-1]
    nrepeat: int = 1000
    print(f"Total size S = {N * M} N = {N} M = {M} E = {E}")

    row_major = pk.Layout.LayoutRight
    y: pk.View2D = pk.View([E, N], pk.double, layout=row_major)
    x: pk.View2D = pk.View([E, M], pk.double, layout=row_major)
    A: pk.View3D = pk.View([E, N, M], pk.double, layout=row_major)

    if not fill:
        # Element-wise host initialisation, one outer slice `e` at a time.
        for e in range(E):
            for n in range(N):
                y[e][n] = 1

            for m in range(M):
                x[e][m] = 1

            for n in range(N):
                for m in range(M):
                    A[e][n][m] = 1
    else:
        y.fill(1)
        x.fill(1)
        A.fill(1)

    # Presumably league size E, automatic team size and vector length 32 --
    # confirm against the pk.TeamPolicy signature.
    team_policy = pk.TeamPolicy(E, "auto", 32, pk.get_default_space())

    timer = pk.Timer()
    for _ in range(nrepeat):
        result = pk.parallel_reduce(team_policy, yAx, N=N, M=M, y=y, x=x, A=A)
    elapsed = timer.seconds()

    print(f"Computed result for {N} x {M} x {E} is {result}")

    expected: float = N * M * E
    if result != expected:
        pk.printf("Error: result (%lf) != solution (%lf)\n", result, expected)

    # Only the labels for problem size and bandwidth are printed; no values
    # are computed for them in this function.
    print(f"N({N}) M({M}) E({E}) nrepeat({nrepeat}) problem(MB) time({elapsed}) bandwidth(GB/s)")
示例#7
0
    # Use OpenMP unless an execution space was named on the command line.
    space = pk.ExecutionSpace.OpenMP
    if args.execution_space:
        space = pk.ExecutionSpace(args.execution_space)

    pk.set_default_space(space)

    # Copy the parsed benchmark parameters into short local names.
    N = args.N
    K = args.K
    D = args.D
    R = args.R
    U = args.U
    F = args.F
    # Per-scalar byte count used in the traffic estimate below
    # (presumably sizeof(double) -- confirm against the kernel's view type).
    scalar_size = 8

    policy = pk.RangePolicy(pk.get_default_space(), 0, N)
    # U is not passed to the benchmark object; here it only enters the flop
    # estimate below.
    w = Benchmark_double_8(N, K, D, R, F)

    # Time R launches of the benchmark kernel, with a pk.fence() after every
    # launch.
    timer = pk.Timer()
    for r in range(R):
        pk.parallel_for(policy, w.benchmark)
        pk.fence()

    seconds = timer.seconds()

    # Analytic totals over all R launches, expressed in terms of N, K, R, U, F.
    num_bytes = 1.0 * N * K * R * (2 * scalar_size + 4) + N * R * scalar_size
    flops = 1.0 * N * K * R * (F * 2 * U + 2 * (U - 1))
    gather_ops = 1.0 * N * K * R * 2
    # Self-assignment; has no effect.
    seconds = seconds
    print(
        f"SNKDRUF: {scalar_size/4} {N} {K} {D} {R} {U} {F} Time: {seconds} " +
示例#8
0
            team_acc += self.y[e][j] * tempM

        tempN: float = pk.parallel_reduce(
            pk.TeamThreadRange(team_member, self.N), team_reduce)

        def single_closure():
            nonlocal acc
            acc += tempN

        pk.single(pk.PerTeam(team_member), single_closure)


if __name__ == "__main__":
    # Script entry point: unpack the parsed command line, select the default
    # execution space and hand a Workload to pk.execute.
    values: Tuple[int, int, int, int, int, bool] = parse_args()
    N: int = values[0]
    M: int = values[1]
    E: int = values[3]
    nrepeat: int = values[4]
    fill: bool = values[-1]

    # The second-to-last entry names the execution space; an empty string
    # selects OpenMP.
    space: str = values[-2]
    space = pk.ExecutionSpace(space) if space != "" else pk.ExecutionSpace.OpenMP
    pk.set_default_space(space)

    print(f"Total size S = {N * M} N = {N} M = {M} E = {E}")
    pk.execute(
        pk.get_default_space(),
        Workload(N, M, E, nrepeat, fill),
    )
示例#9
0
文件: gups.py 项目: kokkos/pykokkos
        indices = args.indices
    # Each option replaces the current value only when its parsed value is
    # truthy; otherwise the earlier value is kept.
    if args.data:
        data = args.data
    if args.repeats:
        repeats = args.repeats
    use_atomics = args.atomics
    if args.execution_space:
        space = pk.ExecutionSpace(args.execution_space)

    pk.set_default_space(space)

    # 64-bit integer views: one entry per index and one per data element.
    indices_view: pk.View1D[pk.int64] = pk.View([indices], pk.int64)
    data_view: pk.View1D[pk.int64] = pk.View([data], pk.int64)
    # Not referenced again in the visible part of this snippet.
    datum: pk.int64 = -1

    range_indices = pk.RangePolicy(pk.get_default_space(), 0, indices)
    range_data = pk.RangePolicy(pk.get_default_space(), 0, data)

    # Report the configuration; sizes are printed as count * 8 bytes in MB.
    print("Reports fastest timing per kernel")
    print("Creating Views...")
    print("Memory Sizes:")
    print(f"- Elements: {data} ({1e-6*data*8} MB)")
    print(f"- Indices: {indices} ({1e-6*indices*8} MB)")
    print(f"- Atomics: {'yes' if use_atomics else 'no'}")
    print(f"Benchmark kernels will be performed for {repeats} iterations")

    # Initialise both views with standalone kernels, passing the views as
    # keyword arguments.
    print("Initializing Views...")
    pk.parallel_for(range_data, init_data, data=data_view)
    pk.parallel_for(range_indices, init_indices, indices=indices_view)

    print("Starting benchmarking...")
示例#10
0
    @pk.workunit
    def init_A(self, j: int, i: int):
        # Workunit body: set entry (j, i) of self.A to 1.
        self.A[j][i] = 1

    @pk.workunit
    def yAx(self, j: int, acc: pk.Acc[float]):
        # Reduction workunit for index j: adds
        # y[j] * sum_i(A[j][i] * x[i]) to the accumulator `acc`.
        # temp2 holds the inner sum over i in [0, self.M).
        temp2: float = 0
        for i in range(self.M):
            temp2 += self.A[j][i] * self.x[i]

        # Contribute this index's term to the reduction.
        acc += self.y[j] * temp2


if __name__ == "__main__":
    # Script entry point: unpack the parsed command line, select the default
    # execution space and hand a Workload to pk.execute.
    values: Tuple[int, int, int, int, int, bool] = parse_args()
    N: int = values[0]
    M: int = values[1]
    nrepeat: int = values[4]

    # The second-to-last entry names the execution space; an empty string
    # selects OpenMP.
    space: str = values[-2]
    space = pk.ExecutionSpace(space) if space != "" else pk.ExecutionSpace.OpenMP
    pk.set_default_space(space)

    print(f"Total size S = {N * M} N = {N} M = {M}")
    pk.execute(
        pk.get_default_space(),
        Workload(N, M, nrepeat),
    )
示例#11
0
        print("S must be 0 (shared scratch memory not supported)")
        exit(1) 

    # Use OpenMP unless an execution space was named on the command line.
    space = pk.ExecutionSpace.OpenMP
    if args.execution_space:
        space = pk.ExecutionSpace(args.execution_space)

    # Copy the parsed benchmark parameters into short local names.
    N = args.N
    K = args.K
    R = args.R
    U = args.U
    F = args.F
    T = args.T
    S = args.S
    # Per-scalar byte count used in the traffic estimate below
    # (presumably sizeof(double) -- confirm against the kernel's view type).
    scalar_size = 8

    pk.set_default_space(space)

    # Team policy built from N and T on the default space.
    r = pk.TeamPolicy(N, T, space=pk.get_default_space())
    # args.D is passed straight through; U only enters the flop estimate below.
    w = Benchmark_double_8(N, K, R, args.D, F, T, S)

    # Time a single launch of the benchmark kernel.
    # NOTE(review): there is no pk.fence() between the launch and the timer
    # read -- confirm that parallel_for has completed by the time
    # timer.seconds() is called.
    timer = pk.Timer()
    pk.parallel_for(r, w.benchmark)
    seconds = timer.seconds()

    # Analytic totals expressed in terms of N, K, R, U and F; bandwidth is
    # reported in GiB/s (bytes / seconds / 1024**3).
    num_bytes = 1.0 * N * K * R * 3 * scalar_size
    flops = 1.0 * N * K * R * (F * 2 * U + 2 * (U - 1))
    print(f"NKRUFTS: {N} {K} {R} {U} {F} {T} {S} Time: {seconds} " +
            f"Bandwidth: {1.0 * num_bytes / seconds / (1024**3)} GiB/s GFlop/s: {1e-9 * flops / seconds}")
    print(w.C)
示例#12
0
        type=int,
        help="shared memory per team (used to control occupancy on GPUs)")
    parser.add_argument("-space", "--execution_space", type=str)
    args = parser.parse_args()

    # Reject unsupported configurations: each failed check prints a message
    # and exits with status 1.
    if args.P != 2:
        print("only support P=2")
        exit(1)
    if args.U != 8:
        print("only support U=8")
        exit(1)
    if args.D not in [1, 2, 4, 8, 16, 32]:
        print("D must be one of 1, 2, 4, 8, 16, 32")
        exit(1)
    if args.S != 0:
        print("S must be 0 (shared scratch memory not supported)")
        exit(1)

    # Use OpenMP unless an execution space was named on the command line.
    space = pk.ExecutionSpace.OpenMP
    if args.execution_space:
        space = pk.ExecutionSpace(args.execution_space)

    pk.set_default_space(space)

    # The command-line N is an exponent: the parsed namespace is overwritten
    # in place with 2**N before the benchmark object is built.
    args.N = 2**args.N

    pk.execute(
        pk.get_default_space(),
        Benchmark_double_8(args.N, args.K, args.R, args.D, args.F, args.T,
                           args.S))
示例#13
0
文件: gather.py 项目: kokkos/pykokkos
    # Seven required positional integers plus an optional execution space.
    parser = argparse.ArgumentParser()
    parser.add_argument("S", type=int, help="Scalar Type Size (1==float, 2==double, 4==complex<double>)")
    parser.add_argument("N", type=int, help="Number of Entities")
    parser.add_argument("K", type=int, help="Number of things to gather per entity")
    parser.add_argument("D", type=int, help="Max distance of gathered things of an entity")
    parser.add_argument("R", type=int, help="how often to loop through the K dimension with each team")
    parser.add_argument("U", type=int, help="how many independent flops to do per load")
    parser.add_argument("F", type=int, help="how many times to repeat the U unrolled operations before reading next element")
    parser.add_argument("-space", "--execution_space", type=str)
    args = parser.parse_args()

    # Reject unsupported configurations: each failed check prints a message
    # and exits with status 1.
    if args.S != 2:
        print("only support S=2")
        exit(1)
    if args.U != 8:
        print("only support U=8")
        exit(1)
    # N is used as an exponent here and below: the check compares 2**N
    # with D, not N itself.
    if 2 ** args.N < args.D:
        print("N must be larger or equal to D")
        exit(1)

    # Use OpenMP unless an execution space was named on the command line.
    space = pk.ExecutionSpace.OpenMP
    if args.execution_space:
        space = pk.ExecutionSpace(args.execution_space)

    pk.set_default_space(space)

    # The benchmark object receives 2**N as its first argument.
    n = 2 ** args.N

    # U is validated above but not passed to the benchmark object.
    pk.execute(pk.get_default_space(), Benchmark_double_8(n, args.K, args.D, args.R, args.F))
示例#14
0
    # Fixed seed for Python's `random` module.
    random.seed(1010101)

    # Defaults used when the corresponding option is not supplied.
    indices = 8192
    data = 33554432
    repeats = 10
    space = pk.ExecutionSpace.OpenMP

    parser = argparse.ArgumentParser()
    parser.add_argument("--indices", type=int)
    parser.add_argument("--data", type=int)
    parser.add_argument("--repeats", type=int)
    parser.add_argument("--atomics", action="store_true")
    parser.add_argument("-space", "--execution_space", type=str)
    args = parser.parse_args()
    # A supplied --indices / --data value is treated as a power-of-two
    # exponent (the count becomes 2**value), whereas the defaults above are
    # used as literal counts.  Options that are absent or zero leave the
    # default in place because of the truthiness test.
    if args.indices:
        indices = args.indices
        indices = 2**indices
    if args.data:
        data = args.data
        data = 2**data
    if args.repeats:
        repeats = args.repeats
    use_atomics = args.atomics
    if args.execution_space:
        space = pk.ExecutionSpace(args.execution_space)

    pk.set_default_space(space)

    # Build the benchmark object and run it on the default space.
    pk.execute(pk.get_default_space(),
               Benchmark(indices, data, repeats, use_atomics))