def update_send_counts(host_b_se_indices, device_b_scan, host_send_counts):
    """
    Invoke the ``cudaCopySendCounts`` routine of the CUDA MPI library.

    The ``ctypes_data`` handle of each argument is forwarded, in the order
    given, to the library routine and the value it returns is handed to
    ``cuda_runtime.cuda_err_check``.

    :arg host_b_se_indices: object exposing ``ctypes_data`` (first argument).
    :arg device_b_scan: object exposing ``ctypes_data`` (second argument).
    :arg host_send_counts: object exposing ``ctypes_data`` (third argument).
    """
    copy_send_counts = cuda_mpi.LIB_CUDA_MPI['cudaCopySendCounts']
    status = copy_send_counts(
        host_b_se_indices.ctypes_data,
        device_b_scan.ctypes_data,
        host_send_counts.ctypes_data,
    )
    cuda_runtime.cuda_err_check(status)
def apply(self, n_to_fill, empty_slots, replacement_slots):
    """
    Launch the library function held in ``self._lib`` over ``n_to_fill``
    entries.

    The argument list passed to the library is, in order: the 1D launch
    arguments produced by ``cuda_runtime.kernel_launch_args_1d`` (requested
    with ``threads=1024``), ``n_to_fill`` as a C int, the ``ctypes_data`` of
    ``empty_slots`` and ``replacement_slots``, and finally the
    ``ctypes_data`` of every attribute of ``self._state`` named in
    ``self._names``. The returned status goes through ``cuda_err_check``.

    :arg n_to_fill: number of entries to launch over.
    :arg empty_slots: object exposing ``ctypes_data``.
    :arg replacement_slots: object exposing ``ctypes_data``.
    """
    # NOTE(review): presumably the library moves data from the replacement
    # slots into the empty slots for each named dat -- confirm against the
    # generated library source.
    launch_args = list(
        cuda_runtime.kernel_launch_args_1d(n_to_fill, threads=1024))
    launch_args += [
        ctypes.c_int(n_to_fill),
        empty_slots.ctypes_data,
        replacement_slots.ctypes_data,
    ]
    launch_args.extend(
        getattr(self._state, name).ctypes_data for name in self._names)

    cuda_runtime.cuda_err_check(self._lib(*launch_args))
def apply(self):
    """
    Flag particles, scan the flags and locate the empty slots to refill.

    Steps, in order:

    1. Resize the per-particle flag array to ``npart_local + 1`` entries,
       zero it and call ``self._specific_method()`` (defined elsewhere).
    2. Run ``cudaExclusiveScanInt`` over all ``npart_local + 1`` entries.
    3. Read back entry ``npart_local`` (the number of leaving particles) and
       derive the new local particle count.
    4. Read back the entry at the new particle count (the number of empty
       slots below the new end).
    5. If any particle leaves, resize and zero ``self._empty_slots`` and run
       ``cudaFindEmptySlots``.

    :returns: tuple ``(per_particle_flag, empty_slots, n_to_fill, new_n)``.
        The first element holds the exclusive sum of the flags, not the raw
        flags.
    """
    int_bytes = ctypes.sizeof(ctypes.c_int)

    def _read_scan_entry(index):
        # Copy the single C int at position ``index`` of the flag array back
        # to the host ('cudaMemcpyDeviceToHost') and return it as a Python int.
        host_int = ctypes.c_int()
        device_ptr = ppmd.host.pointer_offset(
            self._per_particle_flag.ctypes_data, index * int_bytes)
        cuda_runtime.cuda_mem_cpy(
            ctypes.byref(host_int),
            device_ptr,
            ctypes.c_size_t(int_bytes),
            'cudaMemcpyDeviceToHost')
        return host_int.value

    npart = self.state.npart_local

    # One extra trailing entry: after the scan it holds the number leaving.
    self._per_particle_flag.resize(npart + 1)
    self._per_particle_flag.npart_local = npart
    self._per_particle_flag.zero()

    self._specific_method()

    # Exclusive scan on the array of flags.
    # NOTE(review): unlike the cudaFindEmptySlots call below, the value
    # returned here is not passed to cuda_err_check -- confirm whether this
    # routine returns an error code that should be checked.
    cuda_runtime.LIB_CUDA_MISC['cudaExclusiveScanInt'](
        self._per_particle_flag.ctypes_data, ctypes.c_int(npart + 1))

    # The number leaving is in the (npart + 1)-th element, i.e. index npart.
    n_leaving = _read_scan_entry(npart)

    # New local particle count once the leaving particles are removed.
    new_n = npart - n_leaving

    # The empty slots before the new end need filling; the scan value at
    # index new_n is the number of such slots in [0, new_n - 1].
    n_to_fill = _read_scan_entry(new_n)

    if n_leaving > 0:
        self._empty_slots.resize(n_to_fill)
        self._empty_slots.zero()

        find_args = list(
            cuda_runtime.kernel_launch_args_1d(new_n, threads=1024))
        find_args += [
            self._per_particle_flag.ctypes_data,
            ctypes.c_int(new_n),
            self._empty_slots.ctypes_data,
        ]
        cuda_runtime.cuda_err_check(
            cuda_mpi.LIB_CUDA_MPI['cudaFindEmptySlots'](*find_args))

    # The first returned array actually is an exclusive sum of the flags.
    return self._per_particle_flag, self._empty_slots, n_to_fill, new_n
def apply(self, per_particle_flag, n_to_fill, new_n, search_n):
    """
    Run ``cudaFindNewSlots`` to fill ``self._replacement_slots``.

    When ``n_to_fill`` is zero nothing is launched and ``None`` is returned.
    Otherwise ``self._replacement_slots`` is resized to ``n_to_fill``
    entries and zeroed, the library routine is launched with 1D launch
    arguments computed for ``search_n`` (``threads=1024``), and the
    replacement slot array is returned.

    :arg per_particle_flag: object exposing ``ctypes_data``.
    :arg n_to_fill: number of replacement slots required.
    :arg new_n: passed to the library as a C int.
    :arg search_n: size of the launch; also passed to the library as a C int.
    :returns: ``self._replacement_slots`` or ``None`` if ``n_to_fill == 0``.
    """
    if n_to_fill == 0:
        return None

    self._replacement_slots.resize(n_to_fill)
    self._replacement_slots.zero()

    launch_args = list(
        cuda_runtime.kernel_launch_args_1d(search_n, threads=1024))
    launch_args.extend((
        per_particle_flag.ctypes_data,
        ctypes.c_int(new_n),
        ctypes.c_int(search_n),
        self._replacement_slots.ctypes_data,
    ))

    find_new_slots = cuda_mpi.LIB_CUDA_MPI['cudaFindNewSlots']
    cuda_runtime.cuda_err_check(find_new_slots(*launch_args))

    return self._replacement_slots
def prepare_halo_sort(self, max_halo_layers=None):
    """
    Ensure the occupancy matrix has at least ``max_halo_layers`` columns.

    If ``max_halo_layers`` exceeds ``self._n_layers`` a new 2D device buffer
    with the same number of rows and ``max_halo_layers`` columns is created,
    the existing matrix is copied into it with the ``copy_matrix_cols``
    library routine, the old matrix is freed and ``self.matrix`` /
    ``self._n_layers`` are updated. Otherwise nothing is changed.

    :arg max_halo_layers: required number of columns; must not be ``None``.
    :raises AssertionError: if ``max_halo_layers`` is ``None``.
    """
    if max_halo_layers is None:
        # Raised explicitly instead of via ``assert`` so the check is not
        # stripped when Python runs with -O; the exception type and message
        # are the same as before, so existing callers are unaffected.
        raise AssertionError("no size passed")

    # Is a resize needed?
    if max_halo_layers <= self._n_layers:
        return

    print("resizing occupancy matrix, you should not see this message")

    new_matrix = cuda_base.device_buffer_2d(nrow=self.matrix.nrow,
                                            ncol=max_halo_layers,
                                            dtype=ctypes.c_int32)

    # Copy the old columns into the wider matrix before releasing the old
    # buffer.
    cuda_runtime.cuda_err_check(self._p1_lib['copy_matrix_cols'](
        ctypes.c_int32(self.matrix.ncol),
        ctypes.c_int32(new_matrix.ncol),
        ctypes.c_int32(new_matrix.nrow),
        self.matrix.ctypes_data,
        new_matrix.ctypes_data))

    self.matrix.free()
    self.matrix = new_matrix
    self._n_layers = max_halo_layers
def update_cell_occ_matrix(
        length,
        max_count,
        occ_matrix_stride,
        n_local,
        d_halo_indices,
        d_ccc,
        d_halo_scan,
        d_occ_matrix
):
    """
    Invoke the ``cudaHaloFillOccupancyMatrix`` routine of the CUDA MPI
    library.

    The four integer arguments are converted to ``ctypes.c_int32`` and the
    four array arguments contribute their ``ctypes_data`` handle; all eight
    are passed in signature order. The returned status is checked with
    ``cuda_runtime.cuda_err_check``.

    :arg length: integer, passed as ``c_int32``.
    :arg max_count: integer, passed as ``c_int32``.
    :arg occ_matrix_stride: integer, passed as ``c_int32``.
    :arg n_local: integer, passed as ``c_int32``.
    :arg d_halo_indices: object exposing ``ctypes_data``.
    :arg d_ccc: object exposing ``ctypes_data``.
    :arg d_halo_scan: object exposing ``ctypes_data``.
    :arg d_occ_matrix: object exposing ``ctypes_data``.
    """
    fill_occupancy = cuda_mpi.LIB_CUDA_MPI['cudaHaloFillOccupancyMatrix']

    int_args = [
        ctypes.c_int32(value)
        for value in (length, max_count, occ_matrix_stride, n_local)
    ]
    pointer_args = [
        array.ctypes_data
        for array in (d_halo_indices, d_ccc, d_halo_scan, d_occ_matrix)
    ]

    status = fill_occupancy(*(int_args + pointer_args))
    cuda_runtime.cuda_err_check(status)
def apply(self):
    """
    Enforce the boundary conditions on the held state.

    Two code paths are selected on the size of the domain communicator:

    * One process: a particle loop built from the source file
      ``cudaOneProcPBCSource.cu`` is executed over all local particles with
      the three domain extents passed as static arguments ``E0``-``E2``;
      afterwards ``self._flag[0]`` is set to 1 if it is positive.
    * Several processes: the ``cudaNProcPBC`` library is used in two stages
      (``cudaNProcPBCStageOne`` then ``cudaNProcPBCStageTwo``) to fill
      ``self._escape_matrix`` / ``self._escape_dir_count``, which are then
      handed to ``self.state.move_to_neighbour`` before
      ``self.state.filter_on_domain_boundary`` is called.

    Libraries and work arrays are created on first use and kept on ``self``.
    """
    comm = self.state.domain.comm
    # NOTE(review): timer_apply is started here but no matching pause is
    # visible in this view of the method -- confirm it is paused afterwards.
    self.timer_apply.start()
    if comm.Get_size() == 1:
        """ BC code for one proc. porbably removable when restricting to large parallel systems. """
        self.timer_lib_overhead.start()
        # Build the single-process particle loop once and cache it.
        if self._one_process_pbc_lib is None:
            with open( str(cuda_config.LIB_DIR) + '/cudaOneProcPBCSource.cu', 'r') as fh:
                _one_proc_pbc_code = fh.read()
            # E0, E1, E2 are declared as double static arguments; their
            # values are supplied at execute time below.
            _one_proc_pbc_kernel = kernel.Kernel('_one_proc_pbc_kernel', _one_proc_pbc_code, None, static_args={ 'E0': ctypes.c_double, 'E1': ctypes.c_double, 'E2': ctypes.c_double })
            self._one_process_pbc_lib = cuda_loop.ParticleLoop( _one_proc_pbc_kernel, { 'P': self.state.get_position_dat()(access.RW), 'BCFLAG': self._flag(access.INC_ZERO) })
        self.timer_lib_overhead.pause()
        _E = self.state.domain.extent
        self.timer_move.start()
        self._one_process_pbc_lib.execute( n=self.state.get_position_dat().npart_local, static_args={ 'E0': ctypes.c_double(_E[0]), 'E1': ctypes.c_double(_E[1]), 'E2': ctypes.c_double(_E[2]) })
        # Collapse any positive flag value to exactly 1.
        res = self._flag[0]
        if res > 0:
            self._flag[0] = 1
        self.timer_move.pause()
    ############ ----- MULTIPROC -------
    else:
        if self._escape_guard_lib is None:
            # build lib
            self._escape_guard_lib = \
                cuda_build.build_static_libs('cudaNProcPBC')
        # --- init escape count ----
        # Single int32 counter, reset to zero on every call.
        if self._escape_count is None:
            self._escape_count = cuda_base.Array(ncomp=1, dtype=ctypes.c_int32)
        self._escape_count[0] = 0
        # --- init escape dir count ----
        # 26 int32 counters (one per direction), reset to zero on every call.
        if self._escape_dir_count is None:
            self._escape_dir_count = cuda_base.Array(ncomp=26, dtype=ctypes.c_int32)
        self._escape_dir_count[:] = 0
        # --- init escape list ----
        # Capacity of three int32 entries per local particle; only grown,
        # never shrunk.
        nl3 = self.state.get_position_dat().npart_local * 3
        if self._escape_list is None:
            self._escape_list = cuda_base.Array(ncomp=nl3, dtype=ctypes.c_int32)
        elif self._escape_list.ncomp < nl3:
            self._escape_list.realloc(nl3)
        # --- find escapees ---
        # Stage one is skipped entirely when there are no local particles.
        nl = self.state.get_position_dat().npart_local
        if nl > 0:
            cuda_runtime.cuda_err_check(
                self._escape_guard_lib['cudaNProcPBCStageOne']( ctypes.c_int32(nl), self.state.domain.boundary.ctypes_data, self.state.get_position_dat().ctypes_data, self.state.domain.get_shift().ctypes_data, self._escape_count.ctypes_data, self._escape_dir_count.ctypes_data, self._escape_list.ctypes_data))
        # Column count of the escape matrix: largest per-direction count
        # plus one. The matrix is created on first use and only ever grown.
        dir_max = np.max(self._escape_dir_count[:]) + 1
        if self._escape_matrix is None:
            self._escape_matrix = cuda_base.Matrix(nrow=26, ncol=dir_max, dtype=ctypes.c_int32)
        elif self._escape_matrix.ncol < dir_max:
            self._escape_matrix.realloc(nrow=26, ncol=dir_max)
        # --- Populate escape matrix (essentially sort by direction)
        escape_count = self._escape_count[0]
        if (nl > 0) and (escape_count > 0):
            cuda_runtime.cuda_err_check(
                self._escape_guard_lib['cudaNProcPBCStageTwo']( ctypes.c_int32(escape_count), ctypes.c_int32(self._escape_matrix.ncol), self._escape_list.ctypes_data, self._escape_matrix.ctypes_data))
        # Called regardless of whether this process found any escapees.
        self.state.move_to_neighbour(directions_matrix=self._escape_matrix, dir_counts=self._escape_dir_count)
        self.state.filter_on_domain_boundary()