示例#1
0
    def compute_energy(self, system: System, binning: Binning, neighbor: Neighbor) -> float:
        """Bind inputs, execute this workload with ``parallel_for`` off, return ``self.PE``.

        ``system`` supplies the particle data and ``binning`` the bin data that
        are copied onto ``self`` before execution. ``neighbor`` is accepted but
        not read by this method.
        """
        # Bin grid: halo width, per-axis bin counts and their product.
        self.nhalo = binning.nhalo
        self.nbinx, self.nbiny, self.nbinz = (
            binning.nbinx, binning.nbiny, binning.nbinz)
        self.nbins: int = self.nbinx * self.nbiny * self.nbinz

        # Bin bookkeeping taken from the binning object.
        self.permute_vector = binning.permute_vector
        self.bin_offsets = binning.binoffsets
        self.bin_count = binning.bincount

        # Per-particle data taken from the system.
        self.N_local = system.N_local
        self.type = system.type
        self.id = system.id
        self.x = system.x

        self.parallel_for = False
        pk.execute(self, dependencies=[t_scalar3])

        # Rebind x, type and f to fresh t_x()/t_type()/t_f() results.
        self.x = t_x()
        self.type = t_type()
        self.f = t_f()

        return self.PE
示例#2
0
    def exchange(self) -> None:
        """Shallow-copy the system, bind views and execute work unit 0."""
        system = self.system
        self.s = copy.copy(system)
        self.N_local = system.N_local

        self.bind_views()
        # workunit_id is set right before launching; 0 is the id used by exchange().
        self.workunit_id = 0
        pk.execute(pk.ExecutionSpace.Default, self)
示例#3
0
    def test_add_squares(self):
        """Check that the workload's ``sum`` equals ``value * value * threads``."""
        expected: int = self.value * self.value * self.threads

        reducer = Add1DSquareTestReduce(self.threads, self.value)
        pk.execute(pk.ExecutionSpace.Default, reducer)

        self.assertEqual(expected, reducer.sum)
示例#4
0
    def compute(self, system: System, binning: Binning, neighbor: Neighbor, fill: bool) -> None:
        """Bind inputs, zero the force data, and execute this workload once.

        ``fill`` selects how ``self.f`` is zeroed: through its ``fill`` method
        when true, element by element otherwise. ``neighbor`` is accepted but
        not read by this method.
        """
        # Per-particle data taken from the system.
        self.N_local = system.N_local
        self.type = system.type
        self.id = system.id
        self.f = system.f
        self.x = system.x

        self.step = self.step_i

        # Bin bookkeeping and grid dimensions taken from the binning object.
        self.permute_vector = binning.permute_vector
        self.bin_offsets = binning.binoffsets
        self.bin_count = binning.bincount
        self.nhalo = binning.nhalo
        self.nbinx, self.nbiny, self.nbinz = (
            binning.nbinx, binning.nbiny, binning.nbinz)

        if not fill:
            # Manual zeroing over the extents exposed as f.x and f.y.
            for row in range(self.f.x):
                for col in range(self.f.y):
                    self.f[row][col] = 0.0
        else:
            self.f.fill(0)

        self.nbins: int = self.nbinx * self.nbiny * self.nbinz

        self.parallel_for = True
        pk.execute(self)

        self.step_i += 1
        # Rebind x, type and f to fresh t_x()/t_type()/t_f() results.
        self.x = t_x()
        self.type = t_type()
        self.f = t_f()
示例#5
0
def run():
    """Execute ``Workload(10)`` in the default space and print four of its attributes."""
    workload = Workload(10)
    pk.execute(pk.ExecutionSpace.Default, workload)
    # Printed in this fixed order, one attribute per line.
    for attr in ("view", "permute_vector", "bin_offsets", "bin_count"):
        print(getattr(workload, attr))
示例#6
0
    def final_integrate(self) -> None:
        """Execute a ``FinalIntegrateFunctor`` built from the system, then bump ``step``."""
        system = self.system
        functor = FinalIntegrateFunctor(
            system.v, system.f, system.type, system.mass,
            self.dtf, self.dtv, system.id, self.step,
            system.x, system.N_local)

        pk.execute(pk.ExecutionSpace.Default, functor)
        self.step += 1
示例#7
0
    def create_binning(self, dx_in: float, dy_in: float, dz_in: float,
                       halo_depth: int, do_local: bool, do_ghost: bool,
                       sort: bool) -> None:
        """Set up the bin grid for the selected particles and execute this workload.

        Nothing happens when both ``do_local`` and ``do_ghost`` are false.
        Otherwise the particle index range, the per-axis bin counts (requested
        bin sizes ``dx_in``/``dy_in``/``dz_in`` plus ``halo_depth`` extra bins on
        each side), the padded domain bounds and the bin views are stored on
        ``self`` before the workload is executed.
        """
        if not (do_local or do_ghost):
            return

        system = self.system
        self.nhalo = halo_depth

        # Index range: starts at 0 when locals are included, otherwise at
        # N_local; ends after the ghosts only when they are included.
        first: int = 0 if do_local else system.N_local
        if do_ghost:
            last: int = int(system.N_local + system.N_ghost)
        else:
            last = int(system.N_local)
        self.range_min = first
        self.range_max = last

        # Bins per axis inside the sub-domain; a count of zero becomes one.
        self.nbinx = int(system.sub_domain_x / dx_in) or 1
        self.nbiny = int(system.sub_domain_y / dy_in) or 1
        self.nbinz = int(system.sub_domain_z / dz_in) or 1

        # Actual bin widths after rounding the counts.
        dx: float = system.sub_domain_x / self.nbinx
        dy: float = system.sub_domain_y / self.nbiny
        dz: float = system.sub_domain_z / self.nbinz

        # Add halo_depth bins on both sides of every axis.
        extra_bins = 2 * halo_depth
        self.nbinx += extra_bins
        self.nbiny += extra_bins
        self.nbinz += extra_bins

        # The same eps, derived from dx, widens the bounds on all three axes.
        eps: float = dx / 1000
        pad_x = dx * halo_depth
        pad_y = dy * halo_depth
        pad_z = dz * halo_depth
        self.minx = -pad_x - eps + system.sub_domain_lo_x
        self.maxx = pad_x + eps + system.sub_domain_hi_x
        self.miny = -pad_y - eps + system.sub_domain_lo_y
        self.maxy = pad_y + eps + system.sub_domain_hi_y
        self.minz = -pad_z - eps + system.sub_domain_lo_z
        self.maxz = pad_z + eps + system.sub_domain_hi_z

        # Bind the system's per-particle data.
        self.x = system.x
        self.v = system.v
        self.f = system.f
        self.type = system.type
        self.id = system.id
        self.q = system.q

        # One int32 entry per bin for the counts and for the offsets.
        bin_shape = (self.nbinx, self.nbiny, self.nbinz)
        self.bincount: pk.View3D = self.t_bincount(*bin_shape, pk.int32)
        self.binoffsets: pk.View3D = self.t_binoffsets(*bin_shape, pk.int32)

        self.sort = sort
        self.permute_vector.resize(0, last - first)
        pk.execute(pk.ExecutionSpace.Default, self)
示例#8
0
    def update_halo(self) -> None:
        """Execute work unit 2 once per phase (0..5), accumulating ``N_ghost``."""
        self.N_ghost = 0
        self.s = copy.copy(self.system)
        # The loop variable lives on self so the launched work unit can read it.
        for self.phase in range(6):
            phase = self.phase
            self.pack_indicies = self.pack_indicies_all[phase, :]

            self.bind_views()
            self.workunit_id = 2
            self.update_threads = self.num_ghost[phase]
            pk.execute(pk.ExecutionSpace.Default, self)
            # Re-read after the launch, exactly as the count is consumed here.
            self.N_ghost += self.num_ghost[self.phase]
示例#9
0
    def test_add_for(self):
        """Every element of the workload's view must equal initial + added (5 + 7)."""
        start: int = 5
        delta: int = 7
        target: int = start + delta

        adder = Add1DTestFor(self.threads, start, delta)
        pk.execute(pk.ExecutionSpace.Default, adder)

        for idx in range(self.threads):
            self.assertEqual(adder.view[idx], target)
示例#10
0
    def compute(self, system: System) -> float:
        """Execute this workload on ``system`` and return ``self.T`` scaled.

        The scale is ``mvv2e / (dof * boltz)`` with ``dof = 3 * N - 3``.
        """
        self.v, self.mass, self.type = system.v, system.mass, system.type
        self.N_local = system.N_local

        pk.execute(pk.ExecutionSpace.Default, self)

        degrees_of_freedom: int = 3 * system.N - 3
        scale: float = system.mvv2e / (1.0 * degrees_of_freedom * system.boltz)

        self.comm.reduce_float(self.T, 1)

        return self.T * scale
示例#11
0
    def update_force(self) -> None:
        """Rebuild ``ghost_offsets`` and execute work unit 3 for phases 5 down to 0."""
        self.s = copy.copy(self.system)

        # Offsets are a running sum: N_local, then each previous phase's ghosts.
        self.ghost_offsets[0] = self.s.N_local
        for self.phase in range(1, 6):
            previous = self.phase - 1
            self.ghost_offsets[self.phase] = (
                self.ghost_offsets[previous] + self.num_ghost[previous])

        # Phases are visited in reverse order.
        for self.phase in reversed(range(6)):
            self.pack_indicies = self.pack_indicies_all[self.phase, :]

            self.bind_views()
            self.workunit_id = 3
            self.force_threads = self.num_ghost[self.phase]
            pk.execute(pk.ExecutionSpace.Default, self)
示例#12
0
    def compute(self, system: System) -> float:
        """Execute this workload on ``system`` and return ``self.KE * 0.5 * mvv2e``."""
        self.v, self.mass, self.type = system.v, system.mass, system.type
        self.N_local = system.N_local

        pk.execute(pk.ExecutionSpace.Default, self)

        # Rebind the members to fresh t_v/t_mass/t_type results after the run.
        self.v = t_v(0, 3)
        self.mass = t_mass(0)
        self.type = t_type(0)

        half_mvv2e: float = 0.5 * system.mvv2e

        self.comm.reduce_float(self.KE, 1)
        return self.KE * half_mvv2e
示例#13
0
    def compute(self, system: System, binning: Binning,
                neighbor: Neighbor) -> None:
        """Bind the neighbor list and system data, execute once, advance ``step``.

        ``binning`` is accepted but not read by this method.
        """
        neighbors: NeighList2D = neighbor.get_neigh_list()
        self.neighs_view: pk.View2D = neighbors.neighs
        self.num_neighs_view: pk.View1D = neighbors.num_neighs

        # Per-particle data; f and f_a are both bound to system.f.
        self.id = system.id
        self.type = system.type
        self.f = system.f
        self.f_a = system.f
        self.x = system.x
        self.N_local = system.N_local

        self.parallel_for = True
        pk.execute(pk.ExecutionSpace.Default, self)

        self.step += 1
示例#14
0
    def exchange_halo(self) -> None:
        """Run work unit 1 for each of the six phases and record the ghost counts.

        For every phase the work unit is executed once; if the resulting count
        does not fit the system's storage or the pack-index storage, that
        storage is enlarged and the work unit is executed a second time. The
        per-phase count is stored in ``num_ghost`` and the total is written to
        ``self.system.N_ghost``.
        """
        self.N_local = self.system.N_local
        self.N_ghost = 0

        self.s = copy.copy(self.system)
        # The loop variable lives on self so the launched work unit can read it.
        for self.phase in range(6):
            self.pack_indicies = self.pack_indicies_all[self.phase, :]
            count: int = 0
            self.pack_count[0] = 0

            # On odd phases the ghosts counted in the preceding phase are
            # excluded from the particle range handed to the work unit.
            sub: int = 0
            if self.phase % 2 == 1:
                sub = self.num_ghost[self.phase - 1]

            self.nparticles = self.N_local + self.N_ghost - sub
            self.bind_views()
            self.workunit_id = 1
            pk.execute(pk.ExecutionSpace.Default, self)

            # Value left in pack_count[0] by the launch above.
            count = self.pack_count[0]

            redo: bool = False

            # Not enough room in the system for locals + ghosts + new entries:
            # grow the system and refresh the shallow copy.
            if self.N_local + self.N_ghost + count > self.s.x.extent(0):
                self.system.grow(self.N_local + int(self.N_ghost) + int(count))
                self.s = copy.copy(self.system)
                redo = True

            # Not enough room in this phase's index row: resize to 6 rows with
            # about 10% headroom over count, then re-slice the row.
            if count > self.pack_indicies.extent(0):
                self.pack_indicies_all.resize(0, 6)
                self.pack_indicies_all.resize(1, int(count * 1.1))
                self.pack_indicies = self.pack_indicies_all[self.phase, :]
                redo = True

            if redo:
                # Second launch with the enlarged storage.
                # NOTE(review): count is not re-read from pack_count after this
                # launch; presumably the second pass yields the same count - confirm.
                self.pack_count[0] = 0
                self.workunit_id = 1
                self.bind_views()
                pk.execute(pk.ExecutionSpace.Default, self)

            self.num_ghost[self.phase] = count

            self.N_ghost += count

        self.system.N_ghost = self.N_ghost
示例#15
0
    def test_real(self):
        """Check a default-precision view before and after ``set_precision``.

        The view is created while the default precision is ``pk.int32`` and
        must report int32; after ``view.set_precision(pk.float)`` it must
        report float/float32. The functor and the workload are run against the
        view in both states.
        """
        # NOTE(review): the default precision is changed globally and not
        # restored here - confirm other tests do not depend on it.
        pk.set_default_precision(pk.int32)
        view: pk.View1D = pk.View([self.threads])

        # assertIs/assertEqual report both operands on failure, unlike
        # assertTrue on a pre-evaluated boolean.
        self.assertIs(view.dtype, pk.DataType.int32)
        self.assertEqual(
            pk.View._get_dtype_name(str(type(view.array))), "int32")

        f = RealViewTestFunctor(view)
        w = RealViewTestWorkload(self.threads, view)
        pk.parallel_for(self.threads, f.pfor)
        pk.execute(pk.ExecutionSpace.Default, w)

        view.set_precision(pk.float)

        self.assertIs(view.dtype, pk.DataType.float)
        self.assertEqual(
            pk.View._get_dtype_name(str(type(view.array))), "float32")
        pk.parallel_for(self.threads, f.pfor)
        pk.execute(pk.ExecutionSpace.Default, w)
示例#16
0
        self.mat: pk.View2D[pk.int32] = pk.View([r, c], pk.int32)

        for i in range(r):
            self.mat[i] = list(range(c * i, c * (i + 1)))

        for row in self.mat:
            print(row)
        print(f"Initialized {r}x{c} array")

    # Marked @pk.main: launches sum_row over self.r indices, then reduces
    # final_sum over the same range and stores the result in self.total.
    @pk.main
    def run(self):
        pk.parallel_for(self.r, self.sum_row)
        self.total = pk.parallel_reduce(self.r, self.final_sum)

    # Marked @pk.callback: prints the total stored by run().
    @pk.callback
    def results(self):
        print("Total =", self.total)

    # Work unit for row i: adds columns 1..c-1 into mat[i][0], so column 0
    # ends up holding the sum of the whole row.
    @pk.workunit
    def sum_row(self, i: int):
        for j in range(1, self.c):
            self.mat[i][0] += self.mat[i][j]

    # Reduction work unit: contributes mat[i][0] to the accumulator.
    @pk.workunit
    def final_sum(self, i: int, accumulator: pk.Acc[pk.double]):
        accumulator += self.mat[i][0]


if __name__ == "__main__":
    pk.execute(pk.ExecutionSpace.OpenMP, MatrixSum(5, 10))
示例#17
0
        type=int,
        help="shared memory per team (used to control occupancy on GPUs)")
    parser.add_argument("-space", "--execution_space", type=str)
    args = parser.parse_args()

    if args.P != 2:
        print("only support P=2")
        exit(1)
    if args.U != 8:
        print("only support U=8")
        exit(1)
    if args.D not in [1, 2, 4, 8, 16, 32]:
        print("D must be one of 1, 2, 4, 8, 16, 32")
        exit(1)
    if args.S != 0:
        print("S must be 0 (shared scratch memory not supported)")
        exit(1)

    space = pk.ExecutionSpace.OpenMP
    if args.execution_space:
        space = pk.ExecutionSpace(args.execution_space)

    pk.set_default_space(space)

    args.N = 2**args.N

    pk.execute(
        pk.get_default_space(),
        Benchmark_double_8(args.N, args.K, args.R, args.D, args.F, args.T,
                           args.S))
示例#18
0
import pykokkos as pk


# Workload that fills an n x 3 int32 view and reduces one value per row.
@pk.workload
class SimpleSpaces:
    def __init__(self, n):
        # N is the reduction range; sum receives the reduction result in run().
        self.N: int = n
        self.sum: int = 0
        self.a: pk.View2D[pk.int32] = pk.View([n, 3], pk.int32)
        # Row i holds i*n, i*n + 1 and i*n + 2.
        for i in range(n):
            for j in range(3):
                self.a[i][j] = i * n + j

    @pk.main
    def run(self):
        # Reduce over self.N indices and keep the result on the instance.
        self.sum = pk.parallel_reduce(self.N, self.reduction)

    @pk.callback
    def use_results(self):
        print(self.sum)

    @pk.workunit
    def reduction(self, i: int, acc: pk.Acc[pk.double]):
        # Contributes a[i][0] - a[i][1] + a[i][2]; with the values written in
        # __init__ that is i*n + 1.
        acc += self.a[i][0] - self.a[i][1] + self.a[i][2]


if __name__ == "__main__":
    pk.execute(pk.ExecutionSpace.OpenMP, SimpleSpaces(10))
示例#19
0
import pykokkos as pk


# Workload whose main launches a parallel_for that prints one line per index.
@pk.workload
class HelloWorld:
    def __init__(self, n):
        # N is the iteration count passed to parallel_for in run().
        self.N: int = n

    @pk.main
    def run(self):
        # The work unit is an inline lambda that calls pk.printf with the index.
        pk.parallel_for(self.N, lambda i: pk.printf("Hello from i = %i\n", i))


if __name__ == "__main__":
    pk.execute(pk.ExecutionSpace.OpenMP, HelloWorld(10))
示例#20
0
    parser.add_argument('length', type=int)
    parser.add_argument('offset', nargs='?', type=int, default=0)
    parser.add_argument("-space", "--execution_space", type=str)

    args = parser.parse_args()
    iterations = args.iterations
    length = args.length
    offset = args.offset

    if iterations < 1:
        sys.exit("ERROR: iterations must be >= 1")

    if length <= 0:
        sys.exit("ERROR: vector length must be positive")

    # Reject negative offsets. This check used to re-test ``length <= 0``,
    # which the check above already handles, so it could never fire and a
    # negative offset was silently accepted.
    if offset < 0:
        sys.exit("ERROR: offset must be nonnegative")

    if args.execution_space:
        space = pk.ExecutionSpace(args.execution_space)
        pk.set_default_space(space)

    # pk.enable_uvm()

    length = 2**length
    print("Number of iterations = ", iterations)
    print("Vector length        = ", length)
    print("Offset               = ", offset)
    pk.execute(pk.ExecutionSpace.Default, main(iterations, length, offset))
示例#21
0
    permute = args.permute

    if iterations < 1:
        sys.exit("ERROR: iterations must be >= 1")

    if order <= 0:
        sys.exit("ERROR: Matrix Order must be greater than 0")
    elif order > 46340:
        sys.exit("ERROR: matrix dimension too large - overflow risk")

    # a negative tile size means no tiling of the local transpose
    if (tile_size <= 0):
        tile_size = order

    if permute != 0 and permute != 1:
        sys.exit("ERROR: permute must be 0 (no) or 1 (yes)")

    if args.execution_space:
        space = pk.ExecutionSpace(args.execution_space)
        pk.set_default_space(space)

    # pk.enable_uvm()

    order = 2**order
    print("Number of iterations = ", iterations)
    print("Matrix order         = ", order)
    print("Tile size            = ", tile_size)
    print("Permute loops        = ", "yes" if permute else "no")
    pk.execute(pk.ExecutionSpace.Default,
               main(iterations, order, tile_size, permute))
示例#22
0
            self.data[i] = random.randint(0, n)

    # Marked @pk.main: launches findprimes over self.N indices.
    @pk.main
    def run(self):
        pk.parallel_for(self.N, self.findprimes)

    # Marked @pk.callback: prints the first count[0] entries of self.result on
    # one comma-separated line, followed by a summary line.
    @pk.callback
    def results(self):
        for i in range(int(self.count[0])):
            print(int(self.result[i]), end=", ")
        print("\nFound", int(self.count[0]), "prime numbers in", self.N,
              "random numbers")

    # Work unit for index i: tests data[i] by trial division with odd divisors
    # 3, 5, ... below sqrt(number) + 1, and appends it to self.result when no
    # divisor is found.
    # NOTE(review): the parity test rejects 2, and for number == 1 the loop
    # never runs, so 1 is reported - confirm whether that is intended.
    @pk.workunit
    def findprimes(self, i: int):
        number: int = self.data[i]
        upper_bound: int = math.sqrt(number) + 1
        is_prime: bool = not (number % 2 == 0)
        k: int = 3
        idx: int = 0
        while k < upper_bound and is_prime:
            is_prime = not (number % k == 0)
            k += 2
        if is_prime:
            # Claim an output slot atomically. A plain read-modify-write of
            # count[0] lets two concurrent iterations obtain the same slot and
            # lose a result. atomic_fetch_add returns the value held before
            # the increment, which is the slot index to write.
            idx = pk.atomic_fetch_add(self.count, [0], 1)
            self.result[idx] = number


if __name__ == "__main__":
    pk.execute(pk.ExecutionSpace.OpenMP, SimpleAtomics(100))
示例#23
0
文件: gather.py 项目: kokkos/pykokkos
    parser = argparse.ArgumentParser()
    parser.add_argument("S", type=int, help="Scalar Type Size (1==float, 2==double, 4==complex<double>)")
    parser.add_argument("N", type=int, help="Number of Entities")
    parser.add_argument("K", type=int, help="Number of things to gather per entity")
    parser.add_argument("D", type=int, help="Max distance of gathered things of an entity")
    parser.add_argument("R", type=int, help="how often to loop through the K dimension with each team")
    parser.add_argument("U", type=int, help="how many independent flops to do per load")
    parser.add_argument("F", type=int, help="how many times to repeat the U unrolled operations before reading next element")
    parser.add_argument("-space", "--execution_space", type=str)
    args = parser.parse_args()

    if args.S != 2:
        print("only support S=2")
        exit(1)
    if args.U != 8:
        print("only support U=8")
        exit(1)
    if 2 ** args.N < args.D:
        print("N must be larger or equal to D")
        exit(1)

    space = pk.ExecutionSpace.OpenMP
    if args.execution_space:
        space = pk.ExecutionSpace(args.execution_space)
    
    pk.set_default_space(space)

    n = 2 ** args.N

    pk.execute(pk.get_default_space(), Benchmark_double_8(n, args.K, args.D, args.R, args.F))
示例#24
0
    elif n > 46340:
        sys.exit("ERROR: grid dimension too large - overflow risk")

    # default tile size for tiling of local transpose
    tile_size = 32
    # if tile_size <= 0:
    #     tile_size = n
    # if tile_size > n:
    #     tile_size = n

    # stencil pattern
    star = False if (stencil == "grid") else True

    if (radius < 1) or (2 * radius + 1 > n):
        sys.exit("ERROR: Stencil radius negative or too large")

    if args.execution_space:
        space = pk.ExecutionSpace(args.execution_space)
        pk.set_default_space(space)

    # pk.enable_uvm()

    n = 2**n
    print("Number of iterations = ", iterations)
    print("Grid size            = ", n)
    print("Tile size            = ", tile_size)
    print("Type of stencil      = ", "star" if star else "grid")
    print("Radius of stencil    = ", radius)
    pk.execute(pk.ExecutionSpace.Default,
               main(iterations, n, tile_size, star, radius))
示例#25
0
import pykokkos as pk


# Workload that reduces i * i over the indices handed to parallel_reduce.
@pk.workload
class SquareSum:
    def __init__(self, n):
        # N is the reduction range; total receives the result in run().
        self.N: int = n
        self.total: int = 0

    @pk.main
    def run(self):
        self.total = pk.parallel_reduce(self.N, self.squaresum)

    @pk.callback
    def results(self):
        print("Sum:", self.total)

    @pk.workunit
    def squaresum(self, i: int, acc: pk.Acc[pk.double]):
        # Each index contributes its own square to the accumulator.
        acc += i * i


if __name__ == "__main__":
    pk.execute(pk.ExecutionSpace.OpenMP, SquareSum(10))
示例#26
0

# Workload that sums an int32 view filled with random integers.
@pk.workload
class RandomSum:
    def __init__(self, n):
        # N is the view length and reduction range; total receives the result.
        self.N: int = n
        self.total: int = 0
        self.a: pk.View1D[pk.int32] = pk.View([n], pk.int32)

        # random.randint(0, 10) includes both end points.
        for i in range(self.N):
            self.a[i] = random.randint(0, 10)

        print("Initialized view:", self.a)

    @pk.main
    def run(self):
        self.total = pk.parallel_reduce(self.N, self.my_reduction)

    @pk.callback
    def results(self):
        print("Sum:", self.total)

    @pk.workunit
    def my_reduction(self, i: int, accumulator: pk.Acc[pk.int32]):
        # Each index contributes its view element to the accumulator.
        accumulator += self.a[i]


if __name__ == "__main__":
    n = 10
    pk.execute(pk.ExecutionSpace.OpenMP, RandomSum(n))
示例#27
0
            team_acc += self.y[e][j] * tempM

        tempN: float = pk.parallel_reduce(
            pk.TeamThreadRange(team_member, self.N), team_reduce)

        def single_closure():
            nonlocal acc
            acc += tempN

        pk.single(pk.PerTeam(team_member), single_closure)


if __name__ == "__main__":
    # parse_args() is defined elsewhere in this file; only the positions read
    # below are known from here.
    # NOTE(review): the annotation lists six elements ending in bool, but
    # values[-2] is used as the execution-space string while values[4] is
    # nrepeat. In a six-element tuple those would be the same element, so the
    # tuple presumably has at least seven entries - confirm against parse_args.
    values: Tuple[int, int, int, int, int, bool] = parse_args()
    N: int = values[0]
    M: int = values[1]
    # values[2] is not used here.
    E: int = values[3]
    nrepeat: int = values[4]
    fill: bool = values[-1]

    # An empty string selects OpenMP; anything else is passed to
    # pk.ExecutionSpace. After this point ``space`` holds the enum member,
    # not the string its annotation declares.
    space: str = values[-2]
    if space == "":
        space = pk.ExecutionSpace.OpenMP
    else:
        space = pk.ExecutionSpace(space)

    pk.set_default_space(space)

    print(f"Total size S = {N * M} N = {N} M = {M} E = {E}")
    pk.execute(pk.get_default_space(), Workload(N, M, E, nrepeat, fill))
示例#28
0
    def __init__(self, N: int):
        # N is both the iteration count used in run() and the length of A.
        self.N: int = N
        self.A: pk.View1D[pk.int32] = pk.View([N], pk.int32)

        # Both are overwritten by run(): the value returned by parallel_scan
        # and the timer reading taken after it.
        self.result: int = 0
        self.timer_result: float = 0

    # Marked @pk.main: initialises A, then runs and times the scan.
    @pk.main
    def run(self):
        # NOTE(review): presumably the lambda's return value for index i is
        # stored into the view passed as third argument, i.e. A[i] = i -
        # confirm against the pykokkos parallel_for documentation.
        pk.parallel_for(self.N, lambda i: i, self.A)

        # The timer is created after the initialisation, so only the scan is
        # covered by timer_result.
        timer = pk.Timer()

        self.result = pk.parallel_scan(self.N, self.scan)

        self.timer_result = timer.seconds()

    # Marked @pk.callback: prints the view, the scan result and the timing.
    @pk.callback
    def results(self):
        print(f"{self.A} total={self.result} time({self.timer_result})")

    # Scan work unit: adds A[i] to the accumulator and, when last_pass is set,
    # writes the accumulator (which already includes A[i]) back to A[i].
    @pk.workunit
    def scan(self, i: int, acc: pk.Acc[pk.double], last_pass: bool):
        acc += self.A[i]
        if last_pass:
            self.A[i] = acc


if __name__ == "__main__":
    pk.execute(pk.ExecutionSpace.OpenMP, Workload(10))
示例#29
0
# Workload that applies a math-library expression to every element of a view.
@pk.workload
class Math:
    def __init__(self, n):
        # N is the view length and the parallel_for range.
        self.N: int = n
        self.a: pk.View1D[pk.int32] = pk.View([n], pk.int32)

        # NOTE(review): sqrt(tau) is a float (about 2.5066) stored into an
        # int32 view; presumably it is truncated on assignment - confirm.
        for i in range(self.N):
            self.a[i] = math.sqrt(math.tau)

        print("Initialized view:", self.a)

    @pk.main
    def run(self):
        pk.parallel_for(self.N, self.my_calculation)

    @pk.callback
    def results(self):
        print("Results: ", self.a)

    @pk.workunit
    def my_calculation(self, i: int):
        pk.printf("Running index %d\n", i)
        # Reads the neighbouring element (wrapping from the last index to 0)
        # while other iterations may be updating it.
        self.a[i] += (math.cos(self.a[i]) + 2**i -
                      math.pi / math.fabs(self.a[(i + 1) % self.N]))


if __name__ == "__main__":
    n = 10
    pk.execute(pk.ExecutionSpace.OpenMP, Math(n))
示例#30
0
import pykokkos as pk


# Workload that starts from a view of 2s and adds 1 to every element.
@pk.workload
class AddOne:
    def __init__(self, n):
        # N is the view length and the parallel_for range.
        self.N: int = n
        self.a: pk.View1D[pk.int32] = pk.View([n], pk.int32)

        for i in range(self.N):
            self.a[i] = 2
        print(f"Initialized view: [{self.a[0]}, ... repeats {n-1} times]")

    @pk.main
    def run(self):
        y: int = 1
        # NOTE(review): presumably the lambda's return value for index i is
        # stored into the view passed as third argument, i.e.
        # a[i] = a[i] + y - confirm against the pykokkos documentation.
        pk.parallel_for(self.N, lambda i: self.a[i] + y, self.a)

    @pk.callback
    def results(self):
        # Use the size stored on the instance. The module-level name ``n``
        # only exists when this file is run as a script, so relying on it
        # raises NameError when the class is imported and used elsewhere.
        print(f"Results: [{self.a[0]}, ... repeats {self.N - 1} times]")


if __name__ == "__main__":
    n = 100 * 1000
    pk.execute(pk.ExecutionSpace.OpenMP, AddOne(n))