Example #1
    def __init__(self, occ_matrix=OCCUPANCY_MATRIX, cutoff=None):

        self._occ_matrix = occ_matrix
        assert cutoff is not None, "cuda_cell::NeighbourListLayerSplit error: no cutoff passed."
        self._rc = cutoff

        self.max_neigbours_per_particle = None
        self.version_id_1 = 0
        self.version_id_2 = 0

        self.list1 = cuda_base.Matrix(nrow=1, ncol=1, dtype=ctypes.c_int)
        self.list2 = cuda_base.Matrix(nrow=1, ncol=1, dtype=ctypes.c_int)

        with open(
                str(ppmd.cuda.cuda_config.LIB_DIR) +
                '/cudaNeighbourListSplitSource.cu', 'r') as fh:
            _code = fh.read()
        with open(
                str(ppmd.cuda.cuda_config.LIB_DIR) +
                '/cudaNeighbourListSplitSource.h', 'r') as fh:
            _header = fh.read()
        _name1 = 'NeighbourList'
        _name2 = 'NeighbourList2'
        _lib = cuda_build.simple_lib_creator(_header, _code, _name1)
        self._lib1 = _lib[_name1]
        self._lib2 = _lib[_name2]
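
Example #9 below shows how this constructor is reached in practice. A minimal sketch, assuming a CUDA-enabled ppmd build, the module layout these examples import from, and an already-initialised particle group `g` (both `g` and the cutoff value are illustrative):

# Sketch only: `g` is a hypothetical particle group that already owns
# a cell-to-particle map; 3.0 is an illustrative cutoff.
from ppmd.cuda import cuda_cell

nlist = cuda_cell.NeighbourListLayerSplit(
    occ_matrix=g.get_cell_to_particle_map(),  # cell occupancy matrix
    cutoff=3.0,                               # must not be None
)
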
Example #2
    def __init__(self, kernel, dat_dict, n=None, types_map=None):

        self._types_map = types_map
        self._kernel = kernel
        self._dat_dict = access.DatArgStore(self._get_allowed_types(),
                                            dat_dict)

        # use NVCC as the default compiler
        self._cc = cuda_build.NVCC

        self.loop_timer = ppmd.modules.code_timer.LoopTimer()

        self._components = {
            'LIB_PAIR_INDEX_0': '_i',
            'LIB_NAME': str(self._kernel.name) + '_wrapper'
        }

        self._group = None

        for pd in self._dat_dict.items():

            if issubclass(type(pd[1][0]), cuda_data.ParticleDat):
                if pd[1][0].group is not None:
                    self._group = pd[1][0].group
                    break

        # start code generation
        self._generate()

        # Create library
        self._lib = cuda_build.simple_lib_creator(
            self._generate_header_source(), self._components['LIB_SRC'],
            self._components['LIB_NAME'])
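
The `pd[1][0]` indexing in the group-discovery loop implies that `DatArgStore.items()` yields `(symbol, (dat, access_mode))` pairs. Under that assumption, an equivalent but more explicit form of the loop would be:

# Sketch of the loop above with explicit unpacking; assumes items()
# yields (symbol, (dat, access_mode)) pairs, as pd[1][0] suggests.
for symbol, (dat, mode) in self._dat_dict.items():
    if issubclass(type(dat), cuda_data.ParticleDat) and dat.group is not None:
        self._group = dat.group
        break
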
Example #3
    def _build(self):
        """
        Build the library to create the cell occupancy matrix.
        :return:
        """
        assert self._setup is not False, "Run CellOccupancyMatrix.setup() first."

        with open(
                str(ppmd.cuda.cuda_config.LIB_DIR) +
                '/cudaCellOccupancyMatrixSource.cu', 'r') as fh:
            _code = fh.read()
        with open(
                str(ppmd.cuda.cuda_config.LIB_DIR) +
                '/cudaCellOccupancyMatrixSource.h', 'r') as fh:
            _header = fh.read()
        _name = 'CellOccupancyMatrix'

        self._p1_lib = cuda_build.simple_lib_creator(_header, _code, _name)

        self._init = True
Example #4
def _build_exchange_lib(dat):
    with open(
            str(ppmd.cuda.cuda_config.LIB_DIR) + '/cudaHaloExchangeSource.cu',
            'r') as fh:
        code = fh.read()
    with open(
            str(ppmd.cuda.cuda_config.LIB_DIR) + '/cudaHaloExchangeSource.h',
            'r') as fh:
        hcode = fh.read()
    assert code is not None, "Failed to read CUDA MPI packing code source"

    d = dict()
    d['DTYPE'] = host.ctypes_map[dat.dtype]
    d['NCOMP'] = dat.ncomp
    d['MPI_DTYPE'] = host.mpi_type_map[dat.dtype]

    code = code % d
    hcode = hcode % d

    return cuda_build.simple_lib_creator(
        hcode, code, 'ParticleDat_exchange')['cudaHaloExchangePD']
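
The `code % d` and `hcode % d` lines use Python %-formatting to specialise the CUDA sources per dtype, which is why any literal `%` in those sources must be written as `%%` (as the kernels in Examples #7 and #10 do). A runnable, CUDA-free illustration; the one-line template is invented, only the keys mirror the real ones:

# Pure-Python demonstration of the templating step.
template = 'typedef %(DTYPE)s value_t;  // %(NCOMP)s components, %(MPI_DTYPE)s'
d = {'DTYPE': 'double', 'NCOMP': 3, 'MPI_DTYPE': 'MPI_DOUBLE'}
print(template % d)
# typedef double value_t;  // 3 components, MPI_DOUBLE
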
Example #5
def _build_norm_linf_lib(dtype):
    """
    Build the L-infinity norm lib for a ParticleDat.
    """

    with open(
            str(ppmd.cuda.cuda_config.LIB_DIR) + '/cudaLInfNormSource.cu',
            'r') as fh:
        code = fh.read()
    with open(
            str(ppmd.cuda.cuda_config.LIB_DIR) + '/cudaLInfNormSource.h',
            'r') as fh:
        hcode = fh.read()
    assert code is not None, "Failed to read CUDA L-infinity norm code source"

    d = dict()

    d['TYPENAME'] = host.ctypes_map[dtype]

    code = code % d
    hcode = hcode % d

    return cuda_build.simple_lib_creator(
        hcode, code, 'ParticleDat_Linf_Norm')['cudaLInfNorm']
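
Because `simple_lib_creator` triggers a compile, a caller would normally build at most one norm library per dtype. A minimal caching sketch; the cache dict and wrapper function are assumptions for illustration, not part of ppmd:

import ctypes

_norm_linf_cache = {}  # hypothetical per-dtype cache

def get_norm_linf_lib(dtype=ctypes.c_double):
    # Build once per dtype, then reuse the compiled entry point.
    if dtype not in _norm_linf_cache:
        _norm_linf_cache[dtype] = _build_norm_linf_lib(dtype)
    return _norm_linf_cache[dtype]
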
Example #6
    def __init__(self, domain, cell_width, positions):

        self.domain = domain
        boundary = domain.boundary

        assert cell_width > 0, "bad cell width"
        assert boundary[1] > boundary[0], "nonsensical boundary"
        assert boundary[3] > boundary[2], "nonsensical boundary"
        assert boundary[5] > boundary[4], "nonsensical boundary"

        self.positions = positions

        self.cell_array = host.Array(ncomp=3, dtype=ctypes.c_int)
        self.cell_sizes = host.Array(ncomp=3, dtype=ctypes.c_double)

        # cell counts and sizes considering only the interior domain
        cell_array = [0, 0, 0]
        cell_array[0] = int(float(boundary[1] - boundary[0]) / cell_width)
        cell_array[1] = int(float(boundary[3] - boundary[2]) / cell_width)
        cell_array[2] = int(float(boundary[5] - boundary[4]) / cell_width)

        cell_sizes = [0, 0, 0]
        cell_sizes[0] = float(boundary[1] - boundary[0]) / cell_array[0]
        cell_sizes[1] = float(boundary[3] - boundary[2]) / cell_array[1]
        cell_sizes[2] = float(boundary[5] - boundary[4]) / cell_array[2]
        self.cell_sizes[:] = cell_sizes[:]

        padx = int(math.ceil(
            self.domain.cell_edge_lengths[0] / cell_sizes[0])) + 1
        pady = int(math.ceil(
            self.domain.cell_edge_lengths[1] / cell_sizes[1])) + 1
        padz = int(math.ceil(
            self.domain.cell_edge_lengths[2] / cell_sizes[2])) + 1

        rpadx = padx * cell_sizes[0]
        rpady = pady * cell_sizes[1]
        rpadz = padz * cell_sizes[2]

        #print "CA", cell_array[:], "CS", self.cell_sizes[:], "CES", self.domain.cell_edge_lengths[:]

        self.cell_array[0] = cell_array[0] + 2 * padx
        self.cell_array[1] = cell_array[1] + 2 * pady
        self.cell_array[2] = cell_array[2] + 2 * padz

        #print "CA2", self.cell_array[:]

        self.boundary = host.Array(ncomp=6, dtype=ctypes.c_double)
        self.boundary[0] = boundary[0] - rpadx
        self.boundary[1] = boundary[1] + rpadx
        self.boundary[2] = boundary[2] - rpady
        self.boundary[3] = boundary[3] + rpady
        self.boundary[4] = boundary[4] - rpadz
        self.boundary[5] = boundary[5] + rpadz

        self.cell_count = cell_array[0] * cell_array[1] * cell_array[2]
        self.particle_layers = cuda_base.Array(ncomp=1, dtype=ctypes.c_int)
        self.cell_reverse_lookup = cuda_base.Array(ncomp=1, dtype=ctypes.c_int)
        self.cell_contents_count = cuda_base.Array(ncomp=self.cell_count,
                                                   dtype=ctypes.c_int)
        self.matrix = cuda_base.Matrix(nrow=self.cell_count,
                                       ncol=1,
                                       dtype=ctypes.c_int)
        self.num_layers = 0

        with open(
                str(ppmd.cuda.cuda_config.LIB_DIR) +
                '/cudaSubCellOccupancyMatrixSource.cu', 'r') as fh:
            _code = fh.read()
        with open(
                str(ppmd.cuda.cuda_config.LIB_DIR) +
                '/cudaSubCellOccupancyMatrixSource.h', 'r') as fh:
            _header = fh.read()
        _name = 'SubCellOccupancyMatrix'
        lib = cuda_build.simple_lib_creator(_header, _code, _name)
        self._sort_lib = lib['LayerSort']
        self._fill_lib = lib['PopMatrix']

        self.version_id = 0
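
Because `cell_array` is computed with integer truncation, each interior cell ends up at least `cell_width` wide; the padding then extends the halo by a whole number of such cells plus one. A runnable, ppmd-free rerun of the interior arithmetic with illustrative numbers:

# Worked example of the interior cell arithmetic above (values invented).
boundary = [0.0, 10.0, 0.0, 10.0, 0.0, 10.0]  # xlo, xhi, ylo, yhi, zlo, zhi
cell_width = 3.0
cell_array = [int((boundary[2*i + 1] - boundary[2*i]) / cell_width) for i in range(3)]
cell_sizes = [(boundary[2*i + 1] - boundary[2*i]) / n for i, n in enumerate(cell_array)]
print(cell_array)  # [3, 3, 3]
print(cell_sizes)  # [3.333..., 3.333..., 3.333...] -- each >= cell_width
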
Example #7
    def _build_1p_halo_lib(self):

        _name = '_1p_halo_lib'

        _hargs = '''const int blocksize[3],
                    const int threadsize[3],
                    int * h_n_total,
                    const int h_n,
                    const int h_npc,
                    const cuda_Array<int> d_b,
                    const cuda_Array<int> d_h,
                    const cuda_Array<int> d_bhc_map,
                    cuda_Array<int> d_ccc,
                    cuda_Matrix<int> d_occ_matrix,
                    cuda_ParticleDat<%(TYPE)s> d_dat
                   ''' % {
            'TYPE': host.ctypes_map[self.idtype]
        }

        _dargs = '''const cuda_Array<int> d_b,
                    const cuda_Array<int> d_h,
                    const cuda_Array<int> d_bhc_map,
                    cuda_Array<int> d_ccc,
                    cuda_Matrix<int> d_occ_matrix,
                    cuda_ParticleDat<%(TYPE)s> d_dat
                    ''' % {
            'TYPE': host.ctypes_map[self.idtype]
        }

        _d_call_args = '''d_b, d_h, d_bhc_map, d_ccc, d_occ_matrix, d_dat'''

        if type(self) == PositionDat:
            _hargs += ''', const cuda_Array<double> d_shifts'''
            _dargs += ''', const cuda_Array<double> d_shifts'''

            _d_call_args += ''', d_shifts'''

            # self._position_shifts = self.group._halo_manager.get_position_shifts()
            _shift_code = ''' + d_shifts.ptr[d_bhc_map.ptr[_cx]*3 + _comp]'''
            _occ_code = '''
            d_occ_matrix.ptr[ d_npc * d_h.ptr[_cx] + _pi ] = hpx;

            // if particle layer is zero write the cell contents count.
            if (_pi == 0){
                d_ccc.ptr[d_h.ptr[_cx]] = d_ccc.ptr[d_b.ptr[_cx]];
            }

            '''
        else:
            _shift_code = ''''''
            _occ_code = ''''''

        _header = '''
            #include <cuda_generic.h>
            extern "C" int %(NAME)s(%(HARGS)s);
        ''' % {
            'NAME': _name,
            'HARGS': _hargs
        }

        _src = '''

        __constant__ int d_n_total;
        __constant__ int d_n;
        __constant__ int d_npc;

        __global__ void d_1p_halo_copy_shift(%(DARGS)s){

            //particle index
            const int idx = (threadIdx.x + blockIdx.x*blockDim.x)/%(NCOMP)s;


            if (idx < d_n){
                //component corresponding to thread.
                const int _comp = (threadIdx.x + blockIdx.x*blockDim.x) %% %(NCOMP)s;

                const int _cx = idx/d_npc;
                const int _bc = d_b.ptr[_cx]; // some boundary cell
                const int _pi = idx %% d_npc; // particle layer


                if (_pi < d_ccc.ptr[_bc]){ //Do we need this thread to do anything?

                    // local index of particle
                    const int px = d_occ_matrix.ptr[_bc*d_npc + _pi];

                    //halo index of particle
                    const int hpx = d_n_total + idx;

                    d_dat.ptr[hpx* %(NCOMP)s + _comp] = d_dat.ptr[px * %(NCOMP)s + _comp] %(SHIFT_CODE)s ;

                    //printf("hpx %%d, px %%d, _cx %%d, _bc %%d halo %%d \\n", hpx, px, _cx, _bc, d_bhc_map.ptr[_cx]);

                    %(OCC_CODE)s

                    //printf("shift %%f, halo %%d, _comp %%d \\n", d_shifts.ptr[d_bhc_map.ptr[_cx]*3 + _comp], d_bhc_map.ptr[_cx], _comp);

                    }


            }
            return;


        }

        int %(NAME)s(%(HARGS)s){
            checkCudaErrors(cudaMemcpyToSymbol(d_n_total, h_n_total, sizeof(int)));
            checkCudaErrors(cudaMemcpyToSymbol(d_n, &h_n, sizeof(h_n)));
            checkCudaErrors(cudaMemcpyToSymbol(d_npc, &h_npc, sizeof(h_npc)));

            dim3 bs; bs.x = blocksize[0]; bs.y = blocksize[1]; bs.z = blocksize[2];
            dim3 ts; ts.x = threadsize[0]; ts.y = threadsize[1]; ts.z = threadsize[2];

            d_1p_halo_copy_shift<<<bs,ts>>>(%(D_C_ARGS)s);
            checkCudaErrors(cudaDeviceSynchronize());
            getLastCudaError("1proc halo lib Execution failed. \\n");

            return 0;
        }
        ''' % {
            'NAME': _name,
            'HARGS': _hargs,
            'DARGS': _dargs,
            'D_C_ARGS': _d_call_args,
            'NCOMP': self.ncomp,
            'SHIFT_CODE': _shift_code,
            'OCC_CODE': _occ_code
        }

        self._1p_halo_lib = cuda_build.simple_lib_creator(
            _header, _src, _name)[_name]
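
Every `%%` in `_src` above exists because the whole string is passed through %-formatting at the end of the method: an unescaped `%` would be parsed as a placeholder. A one-line runnable check:

# Demonstrates why the CUDA modulus operator is written %% above.
line = 'const int _pi = idx %% d_npc; // kernel for %(NAME)s'
print(line % {'NAME': '_1p_halo_lib'})
# const int _pi = idx % d_npc; // kernel for _1p_halo_lib
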
Example #8
    def __init__(self, kernel=None, dat_dict=None, shell_cutoff=None, sub_divide=None):

        self._dat_dict = access.DatArgStore(
            self._get_allowed_types(), dat_dict)
        self._cc = cuda_build.NVCC

        self._kernel = kernel
        self.shell_cutoff = shell_cutoff


        if sub_divide is None:
            rs_default = 5.
        else:
            rs_default = sub_divide

        self.sub_divide_size = rs_default

        #print "ACTUAL SUB CELL WIDTH", self.sub_divide_size

        self.loop_timer = ppmd.modules.code_timer.LoopTimer()
        self.wrapper_timer = opt.SynchronizedTimer(runtime.TIMER)


        self._components = {'LIB_PAIR_INDEX_0': '_i',
                            'LIB_PAIR_INDEX_1': '_j',
                            'LIB_NAME': str(self._kernel.name) + '_wrapper'}
        self._gather_size_limit = 4
        self._generate()


        self._lib = cuda_build.simple_lib_creator(
            self._generate_header_source(),
            self._components['LIB_SRC'],
            self._kernel.name,
        )[self._components['LIB_NAME']]

        self._group = None

        for pd in self._dat_dict.items():
            if issubclass(type(pd[1][0]), cuda_data.PositionDat):
                self._group = pd[1][0].group
                break

        assert self._group is not None, "No cell to particle map found"


        new_decomp_flag = self._group.domain.cell_decompose(
            self.shell_cutoff
        )

        if new_decomp_flag:
            self._group.get_cell_to_particle_map().create()

        self._key = (self.shell_cutoff,
                     self._group.domain,
                     self._group.get_position_dat())

        _nd = PairLoopCellByCell._cell_lists
        if self._key not in _nd or new_decomp_flag:
            _nd[self._key] = cuda_cell.SubCellOccupancyMatrix(
                domain=self._group.domain,
                cell_width=self.sub_divide_size,
                positions=self._group.get_position_dat(),
            )
        self.cell_list = _nd[self._key]

        self._cell_list_count = 0
        self._invocations = 0

        # get the offset list
        oslist = cell.convert_offset_tuples(
            cell.radius_cell_decompose(shell_cutoff, self.cell_list.cell_sizes),
            self.cell_list.cell_array,
            remove_zero=True
        )

        self.offset_list = cuda_base.Array(ncomp=len(oslist), dtype=ctypes.c_int)
        self.offset_list[:] = oslist[:]
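
The class-level `_cell_lists` dict memoises one `SubCellOccupancyMatrix` per `(cutoff, domain, positions)` key, rebuilding only after a new domain decomposition. The same pattern in isolation; the names below are stand-ins, not ppmd API:

# Memoisation pattern used above, reduced to its essentials.
_cache = {}

def get_or_build(key, rebuild, factory):
    # `factory` stands in for the SubCellOccupancyMatrix constructor.
    if key not in _cache or rebuild:
        _cache[key] = factory()
    return _cache[key]
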
Example #9
    def __init__(self, kernel=None, dat_dict=None, shell_cutoff=None):

        self._dat_dict = access.DatArgStore(
            self._get_allowed_types(), dat_dict)

        self._cc = cuda_build.NVCC


        self._kernel = kernel
        # Disabled: coerce shell_cutoff to a logic.Distance.
        # if type(shell_cutoff) is not logic.Distance:
        #     shell_cutoff = logic.Distance(shell_cutoff)
        self.shell_cutoff = shell_cutoff

        self.loop_timer = ppmd.modules.code_timer.LoopTimer()
        self.wrapper_timer = opt.SynchronizedTimer(runtime.TIMER)


        self._components = {'LIB_PAIR_INDEX_0': '_i',
                            'LIB_PAIR_INDEX_1': '_j',
                            'LIB_NAME': str(self._kernel.name) + '_wrapper'}
        self._gather_size_limit = 4
        self._generate()


        self._lib = cuda_build.simple_lib_creator(
            self._generate_header_source(),
            self._components['LIB_SRC'],
            self._kernel.name,
        )[self._components['LIB_NAME']]

        self._group = None

        for pd in self._dat_dict.items():
            if issubclass(type(pd[1][0]), cuda_data.PositionDat):
                self._group = pd[1][0].group
                break

        assert self._group is not None, "No cell to particle map found"


        new_decomp_flag = self._group.domain.cell_decompose(
            self.shell_cutoff
        )

        if new_decomp_flag:
            self._group.get_cell_to_particle_map().create()

        self._key = (self.shell_cutoff,
                     self._group.domain,
                     self._group.get_position_dat())

        _nd = PairLoopNeighbourListNSSplit._neighbour_list_dict_PNLNS_split
        if self._key not in _nd or new_decomp_flag:
            _nd[self._key] = cuda_cell.NeighbourListLayerSplit(
                occ_matrix=self._group.get_cell_to_particle_map(),
                cutoff=self.shell_cutoff
            )

        self.neighbour_list = _nd[self._key]

        self._neighbourlist_count = 0
        self._invocations = 0
Example #10
    def __init__(self, state, particle_dat_names):
        self._state = state
        self._names = particle_dat_names

        call_template = '''
        bs; bs.x = %(NCOMP)s * blocksize[0]; bs.y = blocksize[1]; bs.z = blocksize[2];
        ts; ts.x = threadsize[0]; ts.y = threadsize[1]; ts.z = threadsize[2];
        compression_kernel<%(DTYPE)s><<<bs,ts>>>(%(PTR_NAME)s, %(NCOMP)s);
        '''

        extra_params = ''
        kernel_calls = '''
        '''

        for ix in self._names:
            dat = getattr(self._state, ix)
            sdtype = ppmd.host.ctypes_map[dat.dtype]

            extra_params += ', ' + sdtype + '* ' + ix
            kernel_calls += call_template % {
                'DTYPE': sdtype,
                'PTR_NAME': ix,
                'NCOMP': dat.ncomp
            }

        name = 'compression_lib'

        header_code = '''
        #include "cuda_generic.h"

        __constant__ int d_n_empty;
        __constant__ int* d_e_slots;
        __constant__ int* d_r_slots;

        template <typename T>
        __global__ void compression_kernel(T* __restrict__ d_ptr,
                                           const int ncomp){

            const int ix = threadIdx.x + blockIdx.x*blockDim.x;
            if (ix < d_n_empty * ncomp){

                const int sx = ix / ncomp;
                const int comp = ix %% ncomp;
                const int eslot = d_e_slots[sx];
                const int rslot = d_r_slots[sx];

                d_ptr[eslot*ncomp + comp] = d_ptr[rslot*ncomp + comp];

            }
            return;
        }

        extern "C" int compression_lib(const int blocksize[3],
                                       const int threadsize[3],
                                       const int h_n_empty,
                                       const int* d_e_slots_p,
                                       const int* d_r_slots_p
                                       %(EXTRA_PARAMS)s
                                       );

        ''' % {
            'EXTRA_PARAMS': extra_params
        }

        src_code = '''

        int compression_lib(const int blocksize[3],
                            const int threadsize[3],
                            const int h_n_empty,
                            const int* d_e_slots_p,
                            const int* d_r_slots_p
                            %(EXTRA_PARAMS)s
                            ){

            dim3 bs;
            dim3 ts;
            checkCudaErrors(cudaMemcpyToSymbol(d_n_empty, &h_n_empty, sizeof(int)));
            checkCudaErrors(cudaMemcpyToSymbol(d_e_slots, &d_e_slots_p, sizeof(int*)));
            checkCudaErrors(cudaMemcpyToSymbol(d_r_slots, &d_r_slots_p, sizeof(int*)));


            %(KERNEL_CALLS)s


            return (int) cudaDeviceSynchronize();

        }
        ''' % {
            'KERNEL_CALLS': kernel_calls,
            'EXTRA_PARAMS': extra_params
        }

        self._lib = cuda_build.simple_lib_creator(header_code, src_code,
                                                  name)[name]
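
Each pass through the name loop appends one expansion of `call_template`, so the generated launcher contains one kernel launch per registered dat; the bare `bs;` and `ts;` statements are C++ no-ops, since the `dim3` objects are declared once in `src_code`. What a single expansion produces, runnable without CUDA (the dat name and dtype are illustrative):

# One expansion of call_template from above; values are invented.
call_template = '''
bs; bs.x = %(NCOMP)s * blocksize[0]; bs.y = blocksize[1]; bs.z = blocksize[2];
ts; ts.x = threadsize[0]; ts.y = threadsize[1]; ts.z = threadsize[2];
compression_kernel<%(DTYPE)s><<<bs,ts>>>(%(PTR_NAME)s, %(NCOMP)s);
'''
print(call_template % {'DTYPE': 'double', 'PTR_NAME': 'positions', 'NCOMP': 3})
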