def copy_to_device(self):
    """
    Upload the host matrix into the device matrix.

    The device matrix is first reallocated to the host matrix's ``nrow`` by
    ``ncol``, then the host buffer is copied host-to-device. Afterwards the
    host data is filled with zeros.
    """
    host = self._h_matrix
    device = self._d_matrix

    # The device allocation must match the host shape before the copy.
    device.realloc(host.nrow, host.ncol)

    destination = device.ctypes_data
    source = host.ctypes_data
    copy_size = ctypes.c_size_t(host.size)
    cuda_runtime.cuda_mem_cpy(
        destination,
        source,
        copy_size,
        'cudaMemcpyHostToDevice'
    )

    # The host-side values are cleared once they have been sent.
    host.data.fill(0)
def copy_from_device(self):
    """
    Download the device matrix into the host matrix.

    The host matrix is reallocated to the device matrix's ``nrow`` by
    ``ncol`` and the device buffer is then copied device-to-host. The copy
    length is taken from the host matrix's ``size`` after reallocation.
    """
    host = self._h_matrix
    device = self._d_matrix

    # Resize the host side first so that host.size describes the transfer.
    host.realloc(device.nrow, device.ncol)

    destination = host.ctypes_data
    source = device.ctypes_data
    copy_size = ctypes.c_size_t(host.size)
    cuda_runtime.cuda_mem_cpy(
        destination,
        source,
        copy_size,
        'cudaMemcpyDeviceToHost'
    )
def apply(self):
    """
    Build the per-particle flags, scan them and locate the empty slots.

    The flag array is resized to ``npart_local + 1`` entries, zeroed, and
    then ``self._specific_method()`` is invoked (presumably it sets the
    flags; its definition is not visible here). An exclusive scan is
    applied to the flag array in place, after which two entries of the scan
    are read back from the device:

    * entry ``npart_local`` gives the number of flagged particles;
    * entry ``new_n`` (the reduced particle count) gives the number of
      slots below the new end that need filling.

    If any particle was flagged, the empty-slot array is resized, zeroed and
    populated by the ``cudaFindEmptySlots`` kernel.

    :return: Tuple of (flag array holding the exclusive scan, empty slots
        array, number of slots to fill, new local particle count).
    """
    int_bytes = ctypes.sizeof(ctypes.c_int)

    def _read_scan_entry(index):
        # Copy one int, at position ``index`` of the flag array, to host.
        entry_ptr = ppmd.host.pointer_offset(
            self._per_particle_flag.ctypes_data, index * int_bytes)
        host_value = ctypes.c_int()
        cuda_runtime.cuda_mem_cpy(
            ctypes.byref(host_value),
            entry_ptr,
            ctypes.c_size_t(int_bytes),
            'cudaMemcpyDeviceToHost'
        )
        return host_value.value

    n_local = self.state.npart_local

    # One extra entry so the scan's final element holds the flag total.
    self._per_particle_flag.resize(n_local + 1)
    self._per_particle_flag.npart_local = n_local
    self._per_particle_flag.zero()

    self._specific_method()

    # In-place exclusive scan over the flag array.
    scan_int = cuda_runtime.LIB_CUDA_MISC['cudaExclusiveScanInt']
    scan_int(self._per_particle_flag.ctypes_data, ctypes.c_int(n_local + 1))

    # The last entry of the scan is the number of flagged particles.
    n_flagged = _read_scan_entry(n_local)

    # Particle count once the flagged particles are removed.
    new_n = n_local - n_flagged

    # Scan value at the new end: number of slots in [0, new_n - 1] to fill.
    n_to_fill = _read_scan_entry(new_n)

    if n_flagged > 0:
        # There are empty slots: find their indices on the device.
        self._empty_slots.resize(n_to_fill)
        self._empty_slots.zero()

        launch_args = list(
            cuda_runtime.kernel_launch_args_1d(new_n, threads=1024))
        launch_args += [
            self._per_particle_flag.ctypes_data,
            ctypes.c_int(new_n),
            self._empty_slots.ctypes_data,
        ]
        cuda_runtime.cuda_err_check(
            cuda_mpi.LIB_CUDA_MPI['cudaFindEmptySlots'](*launch_args))

    # Note: the first returned array now holds the exclusive sum of the
    # flags rather than the raw flags.
    return self._per_particle_flag, self._empty_slots, n_to_fill, new_n
def cell_contents_count_scan(self):
    """
    Get the exclusive scan of the cell contents counts array.

    The internal scan buffer is reallocated (zeroed) for
    ``self._domain.cell_count`` entries, the cell contents counts are copied
    into it device-to-device, and the exclusive scan is performed in place
    on that buffer.

    :return: The internal scan buffer containing the exclusive scan.
    """
    scan = self._ccc_scan
    scan.realloc_zeros(self._domain.cell_count)

    counts = self.cell_contents_count
    destination = scan.ctypes_data
    source = counts.ctypes_data
    copy_size = ctypes.c_size_t(
        self._domain.cell_count * ctypes.sizeof(counts.dtype))
    cuda_runtime.cuda_mem_cpy(
        destination,
        source,
        copy_size,
        'cudaMemcpyDeviceToDevice'
    )

    # Scan the copy in place; the original counts are left untouched.
    exclusive_scan_int = cuda_runtime.LIB_CUDA_MISC['cudaExclusiveScanInt']
    exclusive_scan_int(scan.ctypes_data, ctypes.c_int(scan.ncomp))

    return scan
def copy_h2d_exclusive_scan(in_array, out_array):
    """
    Copy an Array and compute an exclusive scan on the copy.

    Resizes ``out_array`` to the length of ``in_array`` plus 1 (only when it
    does not already have that length), copies the ``in_array`` contents
    into it with a host-to-device copy and then runs the exclusive scan over
    all ``in_array.ncomp + 1`` entries of ``out_array``.

    :param in_array: Array to copy from; must be exactly a
        ``cuda_base.Array``.
    :param out_array: Array receiving the copy and scanned in place; must be
        exactly a ``cuda_base.Array``.
    :raises AssertionError: If either argument is not a ``cuda_base.Array``.
    """
    assert type(in_array) is cuda_base.Array, "in_array as incorrect type"
    assert type(out_array) is cuda_base.Array, "out_array as incorrect type"

    scan_length = in_array.ncomp + 1
    if out_array.ncomp != scan_length:
        out_array.realloc(scan_length)

    # Pass the byte count as an explicit size_t, as the other cuda_mem_cpy
    # callers do: a bare Python int handed to a ctypes function without
    # argtypes is treated as a C int and would be masked for large copies.
    copy_size = ctypes.c_size_t(
        in_array.ncomp * ctypes.sizeof(in_array.dtype))

    cuda_runtime.cuda_mem_cpy(
        d_ptr=out_array.ctypes_data,
        s_ptr=in_array.ctypes_data,
        size=copy_size,
        cpy_type="cudaMemcpyHostToDevice")

    cuda_runtime.cuda_exclusive_scan(out_array, scan_length)
def _halo_update_groups(self):
    """
    Refresh the device copies of the halo manager's cell group arrays.

    The boundary cell groups and the halo cell groups are each fetched from
    the halo manager as a pair of arrays. Every array is copied
    host-to-device into its matching member array, which is reallocated to
    the source's ``ncomp`` first. The position shifts are then copied over
    and the cell array version is recorded in ``_halo_device_version``.
    """
    manager = self._halo_manager
    int_bytes = ctypes.sizeof(ctypes.c_int)

    def _upload(device_array, host_array):
        # Resize the device array to the source length, then copy the ints.
        device_array.realloc(host_array.ncomp)
        cuda_runtime.cuda_mem_cpy(
            d_ptr=device_array.ctypes_data,
            s_ptr=host_array.ctypes_data,
            size=ctypes.c_size_t(host_array.ncomp * int_bytes),
            cpy_type="cudaMemcpyHostToDevice")

    # Boundary cells: entry 0 holds cell indices, entry 1 the group indices.
    boundary_groups = manager.get_boundary_cell_groups()
    _upload(self._halo_b_cell_indices, boundary_groups[0])
    _upload(self._halo_b_groups_se_indices, boundary_groups[1])

    # Halo cells: same layout as the boundary groups.
    halo_groups = manager.get_halo_cell_groups()
    _upload(self._halo_h_cell_indices, halo_groups[0])
    _upload(self._halo_h_groups_se_indices, halo_groups[1])

    self._halo_position_shifts[:] = manager.get_position_shifts()[:]

    # Remember which cell array version the device data corresponds to.
    self._halo_device_version = self.domain.cell_array.version
def copy_from_device(self):
    """
    Download the device array into the host array.

    The host array is reallocated to the device array's ``ncomp`` and the
    device buffer is then copied device-to-host. The copy length is taken
    from the host array's ``size`` after reallocation.
    """
    host = self._h_array
    device = self._d_array

    # Resize the host side first so that host.size describes the transfer.
    host.realloc(device.ncomp)

    destination = host.ctypes_data
    source = device.ctypes_data
    copy_size = ctypes.c_size_t(host.size)
    cuda_runtime.cuda_mem_cpy(
        destination,
        source,
        copy_size,
        'cudaMemcpyDeviceToHost'
    )