예제 #1
0
def update_send_counts(
        host_b_se_indices,
        device_b_scan,
        host_send_counts):
    """
    Call the ``cudaCopySendCounts`` routine of the CUDA MPI library.

    The ``ctypes_data`` handle of each argument is forwarded to the library
    in the order the arguments are given, and the value the library returns
    is handed to ``cuda_runtime.cuda_err_check``.

    :arg host_b_se_indices: object exposing ``ctypes_data`` (first library
        argument).
    :arg device_b_scan: object exposing ``ctypes_data`` (second library
        argument).
    :arg host_send_counts: object exposing ``ctypes_data`` (third library
        argument).
    """
    buffers = (host_b_se_indices, device_b_scan, host_send_counts)
    copy_send_counts = cuda_mpi.LIB_CUDA_MPI['cudaCopySendCounts']

    status = copy_send_counts(*[buf.ctypes_data for buf in buffers])
    cuda_runtime.cuda_err_check(status)
예제 #2
0
    def apply(self, n_to_fill, empty_slots, replacement_slots):
        """
        Launch the library held in ``self._lib`` over ``n_to_fill`` entries.

        The call receives, in order: the 1D launch arguments produced by
        ``cuda_runtime.kernel_launch_args_1d`` (1024 threads), ``n_to_fill``
        as a C int, the ``ctypes_data`` of the two slot arrays, and then the
        ``ctypes_data`` of every attribute of ``self._state`` named in
        ``self._names``. The returned value is passed to
        ``cuda_runtime.cuda_err_check``.

        :arg n_to_fill: number of entries the launch is sized for.
        :arg empty_slots: object exposing ``ctypes_data``.
        :arg replacement_slots: object exposing ``ctypes_data``.
        """
        launch_args = cuda_runtime.kernel_launch_args_1d(n_to_fill,
                                                         threads=1024)
        slot_args = [ctypes.c_int(n_to_fill),
                     empty_slots.ctypes_data,
                     replacement_slots.ctypes_data]

        # one trailing pointer per named attribute of the state
        dat_args = [getattr(self._state, name).ctypes_data
                    for name in self._names]

        status = self._lib(*(list(launch_args) + slot_args + dat_args))
        cuda_runtime.cuda_err_check(status)
예제 #3
0
    def apply(self):
        """
        Flag particles, scan the flags and locate the empty slots.

        The per-particle flag array is resized to ``npart_local + 1`` and
        zeroed, ``self._specific_method()`` is run, and the flag array is
        then replaced in place by its exclusive scan. The scan entry at index
        ``npart_local`` gives the number of particles leaving, from which the
        new local particle count follows. If any particle leaves,
        ``cudaFindEmptySlots`` is launched to fill ``self._empty_slots``.

        :return: tuple ``(per_particle_flag, empty_slots, n_to_fill, new_n)``
            where the first entry holds the exclusive sum of the flags.
        """
        int_bytes = ctypes.sizeof(ctypes.c_int)

        def _read_scan_entry(index):
            # copy the single int at position ``index`` of the flag array
            # into a host-side c_int ('cudaMemcpyDeviceToHost')
            host_int = ctypes.c_int()
            src_ptr = ppmd.host.pointer_offset(
                self._per_particle_flag.ctypes_data, index * int_bytes)
            cuda_runtime.cuda_mem_cpy(ctypes.byref(host_int), src_ptr,
                                      ctypes.c_size_t(int_bytes),
                                      'cudaMemcpyDeviceToHost')
            return host_int.value

        npart = self.state.npart_local

        # one extra element so the scan total lands at index npart
        self._per_particle_flag.resize(npart + 1)
        self._per_particle_flag.npart_local = npart
        self._per_particle_flag.zero()

        self._specific_method()

        # exclusive scan on the array of flags
        cuda_runtime.LIB_CUDA_MISC['cudaExclusiveScanInt'](
            self._per_particle_flag.ctypes_data, ctypes.c_int(npart + 1))

        # number leaving is the last of the npart + 1 entries (index npart)
        n_leaving = _read_scan_entry(npart)

        # compute new npart_local
        new_n = npart - n_leaving

        # the empty slots before the new end need filling; their count is
        # the scan value at the new end
        n_to_fill = _read_scan_entry(new_n)

        # if there are empty slots
        if n_leaving > 0:
            self._empty_slots.resize(n_to_fill)
            self._empty_slots.zero()

            find_args = list(
                cuda_runtime.kernel_launch_args_1d(new_n, threads=1024))
            find_args += [self._per_particle_flag.ctypes_data,
                          ctypes.c_int(new_n),
                          self._empty_slots.ctypes_data]

            status = cuda_mpi.LIB_CUDA_MPI['cudaFindEmptySlots'](*find_args)
            cuda_runtime.cuda_err_check(status)

        # the first returned array actually is an exclusive sum of the flags
        return self._per_particle_flag, self._empty_slots, n_to_fill, new_n
예제 #4
0
    def apply(self, per_particle_flag, n_to_fill, new_n, search_n):
        """
        Launch ``cudaFindNewSlots`` to populate the replacement slots.

        :arg per_particle_flag: object exposing ``ctypes_data``; passed as the
            first kernel argument.
        :arg n_to_fill: number of replacement slots required. When this is
            zero nothing is launched.
        :arg new_n: passed to the kernel as a C int.
        :arg search_n: sizes the 1D launch and is passed to the kernel as a
            C int.
        :return: ``self._replacement_slots`` resized to ``n_to_fill``, or
            ``None`` when ``n_to_fill`` is zero.
        """
        # nothing to replace, skip the resize and the launch entirely
        if n_to_fill == 0:
            return None

        slots = self._replacement_slots
        slots.resize(n_to_fill)
        slots.zero()

        launch_args = list(
            cuda_runtime.kernel_launch_args_1d(search_n, threads=1024))
        launch_args.extend((per_particle_flag.ctypes_data,
                            ctypes.c_int(new_n),
                            ctypes.c_int(search_n),
                            slots.ctypes_data))

        status = cuda_mpi.LIB_CUDA_MPI['cudaFindNewSlots'](*launch_args)
        cuda_runtime.cuda_err_check(status)

        return self._replacement_slots
예제 #5
0
    def prepare_halo_sort(self, max_halo_layers=None):
        """
        Ensure the occupancy matrix has at least ``max_halo_layers`` columns.

        When ``max_halo_layers`` exceeds ``self._n_layers`` a new 2D int32
        buffer with the same number of rows and ``max_halo_layers`` columns
        is allocated, the existing matrix is copied into it with the
        ``copy_matrix_cols`` library routine, the old matrix is freed and
        ``self.matrix`` / ``self._n_layers`` are updated. Otherwise nothing
        is changed.

        :arg max_halo_layers: required number of layers (columns); must not
            be ``None``.
        """
        assert max_halo_layers is not None, "no size passed"

        # already large enough: no resize needed
        if not max_halo_layers > self._n_layers:
            return

        print("resizing occupancy matrix, you should not see this message")

        old_matrix = self.matrix
        grown_matrix = cuda_base.device_buffer_2d(nrow=old_matrix.nrow,
                                                  ncol=max_halo_layers,
                                                  dtype=ctypes.c_int32)

        # copy the old columns across before releasing the old storage
        status = self._p1_lib['copy_matrix_cols'](
            ctypes.c_int32(old_matrix.ncol),
            ctypes.c_int32(grown_matrix.ncol),
            ctypes.c_int32(grown_matrix.nrow),
            old_matrix.ctypes_data,
            grown_matrix.ctypes_data)
        cuda_runtime.cuda_err_check(status)

        old_matrix.free()
        self.matrix = grown_matrix
        self._n_layers = max_halo_layers
예제 #6
0
def update_cell_occ_matrix(
    length,
    max_count,
    occ_matrix_stride,
    n_local,
    d_halo_indices,
    d_ccc,
    d_halo_scan,
    d_occ_matrix
    ):
    """
    Call the ``cudaHaloFillOccupancyMatrix`` routine of the CUDA MPI library.

    The four scalar arguments are converted to ``ctypes.c_int32`` and the
    ``ctypes_data`` handles of the four array arguments follow them, all in
    signature order. The value the library returns is handed to
    ``cuda_runtime.cuda_err_check``.

    :arg length: passed as int32.
    :arg max_count: passed as int32.
    :arg occ_matrix_stride: passed as int32.
    :arg n_local: passed as int32.
    :arg d_halo_indices: object exposing ``ctypes_data``.
    :arg d_ccc: object exposing ``ctypes_data``.
    :arg d_halo_scan: object exposing ``ctypes_data``.
    :arg d_occ_matrix: object exposing ``ctypes_data``.
    """
    scalar_args = [ctypes.c_int32(value) for value in
                   (length, max_count, occ_matrix_stride, n_local)]
    pointer_args = [array.ctypes_data for array in
                    (d_halo_indices, d_ccc, d_halo_scan, d_occ_matrix)]

    fill_occ_matrix = cuda_mpi.LIB_CUDA_MPI['cudaHaloFillOccupancyMatrix']
    status = fill_occ_matrix(*(scalar_args + pointer_args))
    cuda_runtime.cuda_err_check(status)
예제 #7
0
    def apply(self):
        """
        Enforce the boundary conditions on the held state.

        Two code paths are taken depending on the size of the domain
        communicator:

        * One process: a ``ParticleLoop`` built from
          ``cudaOneProcPBCSource.cu`` is executed over the position dat with
          the three domain extents as static arguments; afterwards any
          positive value in ``self._flag[0]`` is reduced to 1.
        * Several processes: the ``cudaNProcPBC`` library finds the escaping
          particles (stage one), arranges them in a 26-row matrix by
          direction (stage two), and the state is asked to move them to the
          neighbours and to filter on the domain boundary.

        Libraries and work arrays are created on first use and cached on
        ``self``.

        NOTE(review): ``self.timer_apply`` is started here but no matching
        ``pause()`` appears in the visible body, and nothing is returned -
        confirm whether the method continues beyond this point.
        """

        comm = self.state.domain.comm

        self.timer_apply.start()

        if comm.Get_size() == 1:
            """
            BC code for one proc. porbably removable when restricting to large
             parallel systems.
            """

            self.timer_lib_overhead.start()

            # build the particle loop only once and cache it
            if self._one_process_pbc_lib is None:
                with open(
                        str(cuda_config.LIB_DIR) + '/cudaOneProcPBCSource.cu',
                        'r') as fh:
                    _one_proc_pbc_code = fh.read()

                # E0, E1, E2 are declared as double static arguments; the
                # values are supplied at execute time below
                _one_proc_pbc_kernel = kernel.Kernel('_one_proc_pbc_kernel',
                                                     _one_proc_pbc_code,
                                                     None,
                                                     static_args={
                                                         'E0': ctypes.c_double,
                                                         'E1': ctypes.c_double,
                                                         'E2': ctypes.c_double
                                                     })

                self._one_process_pbc_lib = cuda_loop.ParticleLoop(
                    _one_proc_pbc_kernel, {
                        'P': self.state.get_position_dat()(access.RW),
                        'BCFLAG': self._flag(access.INC_ZERO)
                    })

            self.timer_lib_overhead.pause()

            # domain extent; its first three components become E0..E2
            _E = self.state.domain.extent

            self.timer_move.start()
            self._one_process_pbc_lib.execute(
                n=self.state.get_position_dat().npart_local,
                static_args={
                    'E0': ctypes.c_double(_E[0]),
                    'E1': ctypes.c_double(_E[1]),
                    'E2': ctypes.c_double(_E[2])
                })

            # collapse any positive flag value to exactly 1
            res = self._flag[0]
            if res > 0:
                self._flag[0] = 1

            self.timer_move.pause()

        ############ ----- MULTIPROC -------
        else:

            # build the multi-process library only once and cache it
            if self._escape_guard_lib is None:
                # build lib
                self._escape_guard_lib = \
                    cuda_build.build_static_libs('cudaNProcPBC')

            # --- init escape count ----
            # single int32 counter, reset to zero on every call
            if self._escape_count is None:
                self._escape_count = cuda_base.Array(ncomp=1,
                                                     dtype=ctypes.c_int32)
            self._escape_count[0] = 0

            # --- init escape dir count ----
            # 26 int32 counters, reset on every call
            # (presumably one per neighbouring direction in 3D - confirm)
            if self._escape_dir_count is None:
                self._escape_dir_count = cuda_base.Array(ncomp=26,
                                                         dtype=ctypes.c_int32)
            self._escape_dir_count[:] = 0

            # --- init escape list ----
            # sized at three int32 entries per local particle
            nl3 = self.state.get_position_dat().npart_local * 3

            # allocate on first use, otherwise only grow when too small
            if self._escape_list is None:
                self._escape_list = cuda_base.Array(ncomp=nl3,
                                                    dtype=ctypes.c_int32)
            elif self._escape_list.ncomp < nl3:
                self._escape_list.realloc(nl3)

            # --- find escapees ---

            nl = self.state.get_position_dat().npart_local

            # stage one is skipped entirely when there are no local particles
            if nl > 0:
                cuda_runtime.cuda_err_check(
                    self._escape_guard_lib['cudaNProcPBCStageOne'](
                        ctypes.c_int32(nl),
                        self.state.domain.boundary.ctypes_data,
                        self.state.get_position_dat().ctypes_data,
                        self.state.domain.get_shift().ctypes_data,
                        self._escape_count.ctypes_data,
                        self._escape_dir_count.ctypes_data,
                        self._escape_list.ctypes_data))

            # column count for the escape matrix: largest per-direction
            # count plus one
            dir_max = np.max(self._escape_dir_count[:]) + 1

            # allocate on first use, otherwise only grow when too narrow
            if self._escape_matrix is None:
                self._escape_matrix = cuda_base.Matrix(nrow=26,
                                                       ncol=dir_max,
                                                       dtype=ctypes.c_int32)

            elif self._escape_matrix.ncol < dir_max:
                self._escape_matrix.realloc(nrow=26, ncol=dir_max)

            # --- Populate escape matrix (essentially sort by direction)

            # stage two only runs when at least one particle escaped
            escape_count = self._escape_count[0]
            if (nl > 0) and (escape_count > 0):
                cuda_runtime.cuda_err_check(
                    self._escape_guard_lib['cudaNProcPBCStageTwo'](
                        ctypes.c_int32(escape_count),
                        ctypes.c_int32(self._escape_matrix.ncol),
                        self._escape_list.ctypes_data,
                        self._escape_matrix.ctypes_data))

            # hand the per-direction matrix and counts to the state, then
            # let the state filter on the domain boundary
            self.state.move_to_neighbour(directions_matrix=self._escape_matrix,
                                         dir_counts=self._escape_dir_count)

            self.state.filter_on_domain_boundary()