Пример #1
0
    def copy_to_device(self):
        """
        Upload the host matrix into the device matrix.

        The device matrix is reallocated to the host matrix's ``nrow`` and
        ``ncol``, the host buffer is copied across using the host matrix's
        ``size`` as the transfer size, and finally the host data is filled
        with zeros.
        """
        host = self._h_matrix
        device = self._d_matrix

        device.realloc(host.nrow, host.ncol)

        # Pointers are fetched only after the realloc, as in the original
        # ordering of operations.
        dst = device.ctypes_data
        src = host.ctypes_data
        transfer_size = ctypes.c_size_t(host.size)
        cuda_runtime.cuda_mem_cpy(dst, src, transfer_size,
                                  'cudaMemcpyHostToDevice')

        # The host copy is cleared once the upload call has returned.
        host.data.fill(0)
Пример #2
0
    def copy_from_device(self):
        """
        Download the device matrix into the host matrix.

        The host matrix is reallocated to the device matrix's ``nrow`` and
        ``ncol`` before the copy; the host matrix's ``size`` (read after the
        realloc) is used as the transfer size.
        """
        host = self._h_matrix
        device = self._d_matrix

        host.realloc(device.nrow, device.ncol)

        dst = host.ctypes_data
        src = device.ctypes_data
        transfer_size = ctypes.c_size_t(host.size)
        cuda_runtime.cuda_mem_cpy(dst, src, transfer_size,
                                  'cudaMemcpyDeviceToHost')
Пример #3
0
    def apply(self):
        """
        Flag particles, scan the flags and locate the slots that need filling.

        The per-particle flag array is resized to ``npart_local + 1``, zeroed
        and handed to ``self._specific_method()`` (defined elsewhere;
        presumably it sets the flag of each leaving particle). An in-place
        exclusive scan is then run over the flags, after which:

        * the scan value at index ``npart_local`` is read back as the number
          of particles leaving,
        * ``new_n`` is ``npart_local`` minus that number,
        * the scan value at index ``new_n`` is read back as the number of
          empty slots below the new end.

        When at least one particle leaves, ``self._empty_slots`` is resized
        and zeroed and the ``cudaFindEmptySlots`` kernel is launched.

        :return: Tuple ``(flags, empty_slots, n_to_fill, new_n)``. Note that
            ``flags`` holds the exclusive scan of the flags, not the raw
            flags.
        """
        npart = self.state.npart_local
        scan_len = npart + 1

        self._per_particle_flag.resize(scan_len)
        self._per_particle_flag.npart_local = npart
        self._per_particle_flag.zero()

        self._specific_method()

        # In-place exclusive scan over all npart + 1 flag entries.
        exclusive_scan = cuda_runtime.LIB_CUDA_MISC['cudaExclusiveScanInt']
        exclusive_scan(self._per_particle_flag.ctypes_data,
                       ctypes.c_int(scan_len))

        int_bytes = ctypes.sizeof(ctypes.c_int)

        def _scan_value_at(index):
            # Copy the single int at ``index`` of the scanned flag array
            # back to the host and return it as a Python int.
            src = ppmd.host.pointer_offset(
                self._per_particle_flag.ctypes_data, index * int_bytes)
            host_int = ctypes.c_int()
            cuda_runtime.cuda_mem_cpy(ctypes.byref(host_int), src,
                                      ctypes.c_size_t(int_bytes),
                                      'cudaMemcpyDeviceToHost')
            return host_int.value

        # The last entry (index npart) of the scan is the total number of
        # flagged, i.e. leaving, particles.
        n_leaving = _scan_value_at(npart)
        new_n = npart - n_leaving

        # The scan value at new_n is the number of flagged entries below the
        # new end: these are the empty slots that need filling.
        n_to_fill = _scan_value_at(new_n)

        if n_leaving > 0:
            self._empty_slots.resize(n_to_fill)
            self._empty_slots.zero()

            launch_args = list(
                cuda_runtime.kernel_launch_args_1d(new_n, threads=1024))
            launch_args.extend((
                self._per_particle_flag.ctypes_data,
                ctypes.c_int(new_n),
                self._empty_slots.ctypes_data,
            ))

            find_empty_slots = cuda_mpi.LIB_CUDA_MPI['cudaFindEmptySlots']
            cuda_runtime.cuda_err_check(find_empty_slots(*launch_args))

        return self._per_particle_flag, self._empty_slots, n_to_fill, new_n
Пример #4
0
    def cell_contents_count_scan(self):
        """
        Compute the exclusive scan of the cell contents count array.

        ``self._ccc_scan`` is reallocated (zero filled) to the domain's cell
        count, the cell contents counts are copied into it device-to-device,
        and the scan is then performed in place on the copy.

        :return: ``self._ccc_scan`` holding the scanned counts.
        """
        scan = self._ccc_scan
        scan.realloc_zeros(self._domain.cell_count)

        dst = scan.ctypes_data
        src = self.cell_contents_count.ctypes_data
        n_bytes = self._domain.cell_count * \
            ctypes.sizeof(self.cell_contents_count.dtype)
        cuda_runtime.cuda_mem_cpy(dst, src, ctypes.c_size_t(n_bytes),
                                  'cudaMemcpyDeviceToDevice')

        # The scan covers every component of the scan array.
        exclusive_scan = cuda_runtime.LIB_CUDA_MISC['cudaExclusiveScanInt']
        exclusive_scan(scan.ctypes_data, ctypes.c_int(scan.ncomp))

        return scan
Пример #5
0
def copy_h2d_exclusive_scan(in_array, out_array):
    """
    Copy an Array and run an exclusive scan on the copy.

    ``out_array`` is reallocated to ``in_array.ncomp + 1`` components unless
    it already has exactly that length. Only the ``in_array.ncomp``
    components of ``in_array`` are copied (with copy type
    ``cudaMemcpyHostToDevice``); the scan then runs over all
    ``in_array.ncomp + 1`` components of ``out_array``.

    :param in_array: Source array; must be exactly a ``cuda_base.Array``.
    :param out_array: Destination array, scanned in place; must be exactly a
        ``cuda_base.Array``.
    """

    assert type(in_array) is cuda_base.Array, "in_array as incorrect type"
    assert type(out_array) is cuda_base.Array, "out_array as incorrect type"

    scan_len = in_array.ncomp + 1
    if out_array.ncomp != scan_len:
        out_array.realloc(scan_len)

    n_bytes = in_array.ncomp * ctypes.sizeof(in_array.dtype)
    cuda_runtime.cuda_mem_cpy(
        d_ptr=out_array.ctypes_data,
        s_ptr=in_array.ctypes_data,
        size=n_bytes,
        cpy_type="cudaMemcpyHostToDevice",
    )

    cuda_runtime.cuda_exclusive_scan(out_array, scan_len)
Пример #6
0
    def _halo_update_groups(self):
        """
        Refresh the device-side copies of the halo exchange index arrays.

        The boundary cell groups and the halo cell groups are fetched from
        the halo manager; for each, element ``0`` and element ``1`` are
        uploaded into the matching ``_halo_*_cell_indices`` and
        ``_halo_*_groups_se_indices`` arrays. The position shifts are then
        copied and the cell array version is recorded in
        ``self._halo_device_version``.
        """
        int_bytes = ctypes.sizeof(ctypes.c_int)

        def _upload(device_array, host_array):
            # Resize the destination to the source's component count and
            # copy that many C ints host-to-device.
            n_items = host_array.ncomp
            device_array.realloc(n_items)
            cuda_runtime.cuda_mem_cpy(
                d_ptr=device_array.ctypes_data,
                s_ptr=host_array.ctypes_data,
                size=ctypes.c_size_t(n_items * int_bytes),
                cpy_type="cudaMemcpyHostToDevice")

        manager = self._halo_manager

        boundary_groups = manager.get_boundary_cell_groups()
        _upload(self._halo_b_cell_indices, boundary_groups[0])
        _upload(self._halo_b_groups_se_indices, boundary_groups[1])

        halo_groups = manager.get_halo_cell_groups()
        _upload(self._halo_h_cell_indices, halo_groups[0])
        _upload(self._halo_h_groups_se_indices, halo_groups[1])

        self._halo_position_shifts[:] = manager.get_position_shifts()[:]

        # Record which cell array version the device-side data matches.
        self._halo_device_version = self.domain.cell_array.version
Пример #7
0
 def copy_from_device(self):
     """
     Download the device array into the host array.

     The host array is reallocated to the device array's ``ncomp`` before
     the copy; the host array's ``size`` (read after the realloc) is used
     as the transfer size.
     """
     host = self._h_array
     device = self._d_array

     host.realloc(device.ncomp)

     dst = host.ctypes_data
     src = device.ctypes_data
     transfer_size = ctypes.c_size_t(host.size)
     cuda_runtime.cuda_mem_cpy(dst, src, transfer_size,
                               'cudaMemcpyDeviceToHost')