def initParallelAlgorithms():
    global bitonicSort_
    with open("ParallelAlgorithms/bitonicSort.cu") as fin:
        mod = SourceModule(fin.read())
    bitonicSort_ = mod.get_function("bitonicSort")

    global finishCSM_
    global getSumSquares_
    with open("ParallelAlgorithms/CSMHelper.cu") as fin:
        mod = SourceModule(fin.read())
    finishCSM_ = mod.get_function("finishCSM")
    getSumSquares_ = mod.get_function("getSumSquares")

    # Run each of the algorithms on dummy data so that they're pre-compiled
    # 1) Bitonic sort
    X = np.random.randn(16, 16)
    N = np.int32(16)
    NPow2 = N
    NThreads = int(N) // 2  # block dimensions must be Python ints
    XG = gpuarray.to_gpu(X)
    bitonicSort_(XG, N, NPow2, block=(NThreads, 1, 1),
                 grid=(X.shape[0], 1), shared=int(4 * NPow2))

    linalg.init()
    # 2) Other primitive operations
    XDotX = linalg.dot(XG, XG)
    XPlusX = skcuda.misc.add(XG, XG)
    XSqr = skcuda.misc.multiply(XG, XG)
    XSqr = skcuda.misc.sum(XSqr, 1)
    XPlusCol = skcuda.misc.add_matvec(XG, XSqr, 0)
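# A minimal self-contained sketch (hypothetical kernel and names) of the same
# warm-up idea used above: compile a kernel and launch it once on throwaway
# data so the first real call does not pay the JIT cost. Assumes
# pycuda.autoinit for context setup.
import numpy as np
import pycuda.autoinit
import pycuda.gpuarray as gpuarray
from pycuda.compiler import SourceModule

_warmup_mod = SourceModule("""
__global__ void scale(float *x, float a, int n)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) x[i] *= a;
}
""")
scale_ = _warmup_mod.get_function("scale")

# one dummy launch; later calls reuse the already-compiled module
dummy = gpuarray.to_gpu(np.random.randn(256).astype(np.float32))
scale_(dummy, np.float32(2.0), np.int32(256), block=(256, 1, 1), grid=(1, 1))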
def prepare_kernel(s):
    mod = SourceModule("""
    __global__ void update_src(int idx, int tstep, float *f) {
        f[idx] += sin(0.1*tstep);
    }

    __global__ void update(int nx, int ny, float *c, float *f, float *g) {
        int tx = threadIdx.x;
        int idx = blockIdx.x*blockDim.x + tx;
        extern __shared__ float gs[];

        gs[tx+1] = g[idx];
        int i = idx/ny, j = idx%ny;
        if(j>0 && j<ny-1) {
            if(tx==0) gs[tx] = g[idx-1];
            if(tx==blockDim.x-1) gs[tx+2] = g[idx+1];
        }
        __syncthreads();

        if(i>0 && j>0 && i<nx-1 && j<ny-1) {
            f[idx] = c[idx]*(g[idx+ny]+g[idx-ny]+gs[tx+2]+gs[tx]-4*gs[tx+1])
                     + 2*gs[tx+1] - f[idx];
        }
    }
    """)
    s.update_src = mod.get_function("update_src")
    s.update = mod.get_function("update")

    Db, s.Dg = (256, 1, 1), (s.nx * s.ny // 256 + 1, 1)  # grid size must be an int
    s.nnx, s.nny = sc.int32(s.nx), sc.int32(s.ny)
    s.update_src.prepare("iiP", block=(1, 1, 1))
    s.update.prepare("iiPPP", block=Db, shared=(256 + 2) * 4)
def get_tkernel(slen, window):
    if window < 32:
        raise ValueError("GPU threshold kernel does not support a window smaller than 32 samples")
    elif window <= 4096:
        nt = 128
    elif window <= 16384:
        nt = 256
    elif window <= 32768:
        nt = 512
    else:
        nt = 1024

    nb = int(numpy.ceil(slen / float(window)))
    if nb > 1024:
        raise ValueError("More than 1024 blocks not supported yet")

    try:
        return tfn_cache[(nt, nb)], nt, nb
    except KeyError:
        mod = SourceModule(tkernel1.render(chunk=nt))
        mod2 = SourceModule(tkernel2.render(blocks=nb))
        fn = mod.get_function("threshold_and_cluster")
        fn.prepare("PPPif")
        fn2 = mod2.get_function("threshold_and_cluster2")
        fn2.prepare("PPfi")
        tfn_cache[(nt, nb)] = (fn, fn2)
        return tfn_cache[(nt, nb)], nt, nb
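# A minimal sketch of the caching idiom above: compile a size-specialized
# kernel once per configuration and reuse it on later calls. The kernel body
# and cache name are illustrative, not from the source.
from pycuda.compiler import SourceModule

_fill_cache = {}

def get_fill_kernel(chunk):
    try:
        return _fill_cache[chunk]
    except KeyError:
        mod = SourceModule("""
        #define CHUNK %d
        __global__ void fill(float *x) { x[blockIdx.x * CHUNK + threadIdx.x] = 1.0f; }
        """ % chunk)
        fn = mod.get_function("fill")
        fn.prepare("P")
        _fill_cache[chunk] = fn
        return fn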
def gradient_gpu(y_gpu, mode='valid'):
    shape = np.array(y_gpu.shape).astype(np.uint32)
    dtype = y_gpu.dtype
    block_size = (16, 16, 1)
    grid_size = (int(np.ceil(float(shape[1]) / block_size[0])),
                 int(np.ceil(float(shape[0]) / block_size[1])))
    shared_size = int((1 + block_size[0]) * (1 + block_size[1]) * dtype.itemsize)

    preproc = _generate_preproc(dtype, shape)
    mod = SourceModule(preproc + kernel_code, keep=True)

    # use a local name distinct from this function's own to avoid shadowing
    if mode == 'valid':
        gradient_fun = mod.get_function("gradient_valid")
        gradx_gpu = cua.empty((y_gpu.shape[0], y_gpu.shape[1] - 1), y_gpu.dtype)
        grady_gpu = cua.empty((y_gpu.shape[0] - 1, y_gpu.shape[1]), y_gpu.dtype)
    elif mode == 'same':
        gradient_fun = mod.get_function("gradient_same")
        gradx_gpu = cua.empty((y_gpu.shape[0], y_gpu.shape[1]), y_gpu.dtype)
        grady_gpu = cua.empty((y_gpu.shape[0], y_gpu.shape[1]), y_gpu.dtype)
    else:
        raise ValueError("mode must be 'valid' or 'same'")

    gradient_fun(gradx_gpu.gpudata, grady_gpu.gpudata, y_gpu.gpudata,
                 block=block_size, grid=grid_size, shared=shared_size)

    return (gradx_gpu, grady_gpu)
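# The ceil-division block/grid arithmetic above recurs in many of these
# helpers; a small hypothetical helper makes the computation explicit.
import numpy as np

def grid_for(shape, block_size=(16, 16, 1)):
    """Return a 2D grid covering a (rows, cols) array with the given block."""
    return (int(np.ceil(float(shape[1]) / block_size[0])),
            int(np.ceil(float(shape[0]) / block_size[1])))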
def get_kernel_functions(**kwargs):
    reset_mod = SourceModule(reset_template.render(kwargs))
    add_mod = SourceModule(add_template.render(kwargs))
    step_mod = SourceModule(step_template.render(kwargs))

    reset_func = reset_mod.get_function('reset_force')
    add_func = add_mod.get_function('add_force')
    update_func = step_mod.get_function('update')

    return (reset_func, add_func, update_func)
def prepare_update(self):
    from pycuda.compiler import SourceModule
    import os

    src_path = os.path.dirname(os.path.abspath(__file__))
    with open(os.path.join(src_path, 'core.cu')) as f:
        kernels = f.read()
    mod = SourceModule(kernels)
    # mod = cuda.module_from_file('core.cubin')
    self.update_pre = mod.get_function('update_pre')
    self.update_ul = mod.get_function('update_ul')
def make_thunk(self, node, storage_map, _, _2): mod = SourceModule(open("binary_kernels.cu").read()) concatenate_rows_kernel = mod.get_function("concatenate_rows_kernel") concatenate_cols_kernel = mod.get_function("concatenate_cols_kernel") xnor_kernel = mod.get_function("xnor_gemm") inputs = [storage_map[v] for v in node.inputs] outputs = [storage_map[v] for v in node.outputs] # THIS IS PROBABLY THE PART YOU ARE INTERESTED IN def thunk(): # inputs A = inputs[0][0] B = inputs[1][0] # dimensions m = A.shape[0] n = A.shape[1] k = B.shape[1] assert n == B.shape[0] # Otherwise GEMM is impossible assert n%(32*16) == 0 # Concatenation and block size # output output_shape = (m, k) C = outputs[0] # only allocate if there is no previous allocation of the right size. if C[0] is None or C[0].shape != output_shape: C[0] = cuda.CudaNdarray.zeros(output_shape) # Concatenating the rows of A Ac = drv.mem_alloc(m*n*4/32) block_size = 64 block = (block_size,1,1) grid = (m*n/(block_size*32)+1,1) concatenate_rows_kernel(A,Ac, np.intc(m*n/32), block= block, grid=grid) # Concatenating the columns of B Bc = drv.mem_alloc(n*k*4/32) block_size = 64 block = (block_size,1,1) grid = (k/block_size+1,1) concatenate_cols_kernel(B,Bc, np.intc(n), np.intc(k), block= block, grid=grid) # Launching xnor_kernel block_size = 16 block = (block_size,block_size,1) grid = (k / block_size + 1, m / block_size + 1) # better too many blocks than too little xnor_kernel(Ac,Bc,C[0], np.intc(m), np.intc(n/32.), np.intc(k), block= block, grid=grid) thunk.inputs = inputs thunk.outputs = outputs thunk.lazy = False return thunk
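# A NumPy reference (a sketch, not the kernel's exact layout) for what
# concatenate_rows_kernel does above: packing each run of 32 float entries
# into one uint32 sign bitmask, so the XNOR GEMM can compare 32 weights per
# integer operation. The sign convention and bit order are assumptions.
import numpy as np

def pack_rows(A):
    """Pack a (m, n) float matrix into a (m, n//32) uint32 sign-bit matrix."""
    m, n = A.shape
    assert n % 32 == 0
    bits = (A.reshape(m, n // 32, 32) > 0).astype(np.uint32)
    shifts = np.arange(31, -1, -1, dtype=np.uint32)  # assumed MSB-first order
    return (bits << shifts).sum(axis=2, dtype=np.uint32)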
def get_K_min(self):
    """
    Return the kinetic energy minimum
    """
    # fill array with values of the kinetic energy
    fill_compiled = SourceModule(
        self.fill_V_K.format(cuda_consts=self.cuda_consts, K=self.K, V=self.V)
    )
    fill_compiled.get_function("fill_K")(self.weighted, **self.wigner_mapper_params)

    return gpuarray.min(self.weighted).get()
def __init__(self, **kwargs):
    """
    In addition to kwargs of WignerMoyalCUDA1D.__init__ this constructor accepts:

    kT (optional) - the temperature for the Gibbs state [rho = exp(-H/kT)]
    dbeta (optional) - inverse temperature increments for the split-operator propagation
    t_initial (optional) - if the Hamiltonian is time dependent, then the Gibbs state
        will be calculated for the Hamiltonian at t_initial (default value of zero).
    """
    if 't_initial' not in kwargs:
        kwargs.update(t_initial=0.)
        print("Warning: Initial time (t_initial) was not specified, so the default value t_initial = 0 was used.")

    try:
        self.kT = kwargs['kT']
        # remove kT from kwargs so that it does not enter into self.cuda_consts
        del kwargs['kT']
    except KeyError:
        self.kT = 0.

    try:
        self.dbeta = kwargs['dbeta']
        # remove dbeta from kwargs so that it does not enter into self.cuda_consts
        del kwargs['dbeta']
    except KeyError:
        # if dbeta is not defined, just choose some value
        self.dbeta = 0.01

    if 'dt' not in kwargs:
        # Save the inverse temperature increment as dt
        kwargs.update(dt=self.dbeta)

    # Initialize parent class
    WignerMoyalCUDA1D.__init__(self, **kwargs)

    # Save the minimums of the potential (V) and kinetic (K) energy
    self.cuda_consts += " const double V_min = %.15e;\n" % self.get_V_min()
    self.cuda_consts += " const double K_min = %.15e;\n" % self.get_K_min()

    print("\n================================ Compiling Bloch expK and expV ================================\n")
    bloch_expK_expV_compiled = SourceModule(
        self.bloch_expK_expV_cuda_source.format(
            cuda_consts=self.cuda_consts,
            K=self.K,
            V=self.V,
            abs_boundary_lambda_p=self.abs_boundary_lambda_p,
            abs_boundary_x_theta=self.abs_boundary_x_theta
        )
    )

    self.bloch_expK_bulk = bloch_expK_expV_compiled.get_function("bloch_expK_bulk")
    self.bloch_expK_boundary = bloch_expK_expV_compiled.get_function("bloch_expK_boundary")
    self.bloch_expV_bulk = bloch_expK_expV_compiled.get_function("bloch_expV_bulk")
    self.bloch_expV_boundary = bloch_expK_expV_compiled.get_function("bloch_expV_boundary")
def make_GPU_gradient(mesh, context):
    '''Prepare to compute gradient on the GPU w.r.t. the given mesh.
    Return gradient function.
    '''
    mx = int(getattr(mesh, 'nx', 1))
    my = int(getattr(mesh, 'ny', 1))
    mz = int(getattr(mesh, 'nz', 1))

    dxInv = np.array(1./getattr(mesh, 'dx', 1), dtype=np.float64)
    dyInv = np.array(1./getattr(mesh, 'dy', 1), dtype=np.float64)
    dzInv = np.array(1./getattr(mesh, 'dz', 1), dtype=np.float64)

    sizeof_double = 8
    with open(where + 'gradient2.cu') as fdlib:
        source = fdlib.read()
    module = SourceModule(source)

    mx_ptr = module.get_global("mx")[0]
    my_ptr = module.get_global("my")[0]
    mz_ptr = module.get_global("mz")[0]
    cuda.memcpy_htod(mx_ptr, np.array(mx, dtype=np.int32))
    cuda.memcpy_htod(my_ptr, np.array(my, dtype=np.int32))
    cuda.memcpy_htod(mz_ptr, np.array(mz, dtype=np.int32))

    dxInv_ptr = module.get_global("dxInv")[0]
    dyInv_ptr = module.get_global("dyInv")[0]
    dzInv_ptr = module.get_global("dzInv")[0]
    cuda.memcpy_htod(dxInv_ptr, dxInv)
    cuda.memcpy_htod(dyInv_ptr, dyInv)
    cuda.memcpy_htod(dzInv_ptr, dzInv)

    deriv_x = module.get_function("gradient_x")
    deriv_y = module.get_function("gradient_y")
    deriv_z = module.get_function("gradient_z")

    block, grid = mesh.get_domain_decomposition(DeviceData().max_threads)

    d_deriv_x = gpuarray.empty(shape=(1, mesh.n_nodes), dtype=np.float64)
    d_deriv_y = gpuarray.empty_like(d_deriv_x)
    d_deriv_z = gpuarray.empty_like(d_deriv_x)

    def _gradient(scalar_values):
        '''Calculate three-dimensional gradient for GPUArray scalar_values.'''
        deriv_x(scalar_values, d_deriv_x, block=block, grid=grid)
        deriv_y(scalar_values, d_deriv_y, block=block, grid=grid)
        deriv_z(scalar_values, d_deriv_z, block=block, grid=grid)
        context.synchronize()
        return (d_deriv_x, d_deriv_y, d_deriv_z)[:mesh.dimension]

    return _gradient
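# A minimal self-contained sketch of the get_global/memcpy_htod pattern used
# above to set __constant__ device variables before any launch; the kernel
# and variable names here are illustrative.
import numpy as np
import pycuda.autoinit
import pycuda.driver as cuda
from pycuda.compiler import SourceModule

const_mod = SourceModule("""
__constant__ float scale_factor;
__global__ void apply_scale(float *x) { x[threadIdx.x] *= scale_factor; }
""")
ptr, _nbytes = const_mod.get_global("scale_factor")  # (device ptr, size in bytes)
cuda.memcpy_htod(ptr, np.array(0.5, dtype=np.float32))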
def __init__(self, shape, scale, layer, precision=np.float32):
    self.precision = precision
    self.shape = shape
    self.scale = scale

    init_var = invgamma.rvs(1.0, scale=1.0, size=(1, layer.weights.shape[1])).astype(precision)
    self.sW = gpuarray.to_gpu(init_var)
    init_var = invgamma.rvs(1.0, scale=1.0, size=(1, 1)).astype(precision)
    self.sB = gpuarray.to_gpu(init_var)

    kernels = SourceModule(open(path + '/kernels.cu', "r").read())
    self.add_prior_w_kernel = kernels.get_function("add_gaussian_unit_grad")
    self.add_prior_b_kernel = kernels.get_function("add_bias_grad")
    self.scale_momentum_kernel = kernels.get_function("scale_momentum_normal_unit")
def get_K_min(self):
    """
    Return the kinetic energy minimum
    """
    # allocate memory
    k_p_p_prime = gpuarray.zeros((self.P.size, self.P.size), np.float64)

    # fill array with values of the kinetic energy
    fill_compiled = SourceModule(
        self.fill_V_K.format(cuda_consts=self.cuda_consts, K=self.K, V=self.V)
    )
    fill_compiled.get_function("fill_K")(k_p_p_prime, **self.rho_mapper_params)

    return gpuarray.min(k_p_p_prime).get()
def init_rng(seed):
    global _dropout_kernel, _saltpepper_kernel, _rng_state, _rng_threads, _rng_blocks
    from pycuda.characterize import sizeof
    ds = sizeof("curandState", "#include <curand_kernel.h>")
    _rng_state = drv.mem_alloc(_rng_threads * _rng_blocks * ds)
    src = SourceModule('''
    #include <curand_kernel.h>

    extern "C" {
    __global__ void setup_rng(curandState* rng_state, const unsigned seed)
    {
        const unsigned tid = blockIdx.x*blockDim.x+threadIdx.x;
        curand_init(seed, tid, 0, &rng_state[tid]);
    }

    __global__ void dropout_eltw(float* x, const unsigned size,
                                 float dropout_rate, curandState* rng_state)
    {
        const unsigned tid = blockIdx.x*blockDim.x+threadIdx.x;
        const unsigned num_threads = gridDim.x*blockDim.x;
        curandState localState = rng_state[tid];
        for (unsigned i = tid; i < size; i += num_threads)
            x[i] = (curand_uniform(&localState) < dropout_rate) ? 0.0 : x[i];
        rng_state[tid] = localState;
    }

    __global__ void saltpepper_eltw(float* x, const unsigned size,
                                    float dropout_rate, curandState* rng_state)
    {
        const unsigned tid = blockIdx.x*blockDim.x+threadIdx.x;
        const unsigned num_threads = gridDim.x*blockDim.x;
        curandState localState = rng_state[tid];
        /* braces are required here: both updates belong inside the loop;
           without them the second statement sat outside the loop and
           referenced `i` out of scope */
        for (unsigned i = tid; i < size; i += num_threads) {
            x[i] = (curand_uniform(&localState) < dropout_rate) ? 0.0 : x[i];
            x[i] = (curand_uniform(&localState) < dropout_rate) ? 1.0 : x[i];
        }
        rng_state[tid] = localState;
    }
    }
    ''', no_extern_c=True)
    setup_rng = src.get_function("setup_rng")
    setup_rng.prepare("Pi")
    setup_rng.prepared_call((_rng_threads, 1, 1), (_rng_blocks, 1, 1),
                            _rng_state, np.uint32(seed))
    _dropout_kernel = src.get_function("dropout_eltw")
    _dropout_kernel.prepare("PifP")
    _saltpepper_kernel = src.get_function("saltpepper_eltw")
    _saltpepper_kernel.prepare("PifP")
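# For reference, PyCUDA's prepare/prepared_call protocol used above:
# prepare() fixes the argument signature once ("P" = pointer, "i" = int,
# "f" = float) and prepared_call(grid, block, *args) launches without
# re-marshalling types on every call. A sketch, assuming init_rng has run
# and x_gpu is a hypothetical float32 GPUArray:
grid, block = (_rng_blocks, 1, 1), (_rng_threads, 1, 1)
_dropout_kernel.prepared_call(grid, block,
                              x_gpu.gpudata, np.uint32(x_gpu.size),
                              np.float32(0.5), _rng_state)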
def get_K_min(self):
    """
    Return the kinetic energy minimum in the lambda p space
    """
    # allocate memory
    k_p_lambda = gpuarray.zeros((self.P.size, self.Lambda.size), np.float64)

    # fill array with values of the kinetic energy
    fill_compiled = SourceModule(
        self.fill_V_K.format(cuda_consts=self.cuda_consts, K=self.K, V=self.V)
    )
    fill_compiled.get_function("fill_K_bulk")(k_p_lambda, **self.K_bulk_mapper_params)
    fill_compiled.get_function("fill_K_boundary")(k_p_lambda, **self.K_boundary_mapper_params)

    return gpuarray.min(k_p_lambda).get()
def get_V_min(self):
    """
    Return the potential energy minimum in the x theta space
    """
    # allocate memory
    v_theta_x = gpuarray.zeros((self.Theta.size, self.X.size), np.float64)

    # fill array with values of the potential energy
    fill_compiled = SourceModule(
        self.fill_V_K.format(cuda_consts=self.cuda_consts, K=self.K, V=self.V)
    )
    fill_compiled.get_function("fill_V_bulk")(v_theta_x, **self.V_bulk_mapper_params)
    fill_compiled.get_function("fill_V_boundary")(v_theta_x, **self.V_boundary_mapper_params)

    return gpuarray.min(v_theta_x).get()
class CudaCalculator(object): def __init__(self, eta, beta, L, no_angles, no_pulses, order=5): self.mod_K = SourceModule(RADON_KERNEL.format(order, no_angles, no_pulses)) self.K_gpu = self.mod_K.get_function("K_l") self.mod_reduction = SourceModule(REDUCTION_KERNEL) self.reduction_gpu = self.mod_reduction.get_function("reduction") self.eta = eta self.gamma = gamma(eta) self.beta = beta self.L = L self.h = calc_h(L, beta, eta) drv.memcpy_htod(self.mod_K.get_global("rsq4pi")[0], scipy.array([1./sqrt(4.*pi)], dtype=scipy.float32)) drv.memcpy_htod(self.mod_K.get_global("sqeta")[0], scipy.array([sqrt(self.eta)], dtype=scipy.float32)) drv.memcpy_htod(self.mod_K.get_global("h")[0], scipy.array([self.h], dtype=scipy.float32)) drv.memcpy_htod(self.mod_K.get_global("four_pi_gamma")[0], scipy.array([4.*pi*self.gamma], dtype=scipy.float32)) y = sqrt(self.gamma)/self.h drv.memcpy_htod(self.mod_K.get_global("y")[0], scipy.array([y], dtype=scipy.float32)) n = scipy.arange(1, order+1, dtype=scipy.float32) n2 = n**2 ex = exp(-n2/4.) pre_s2 = ex*cosh(n*y) pre_s3 = ex*n*sinh(n*y) drv.memcpy_htod(self.mod_K.get_global("n2")[0], n2) drv.memcpy_htod(self.mod_K.get_global("pre_s1")[0], ex) drv.memcpy_htod(self.mod_K.get_global("pre_s2")[0], pre_s2) drv.memcpy_htod(self.mod_K.get_global("pre_s3")[0], pre_s3) def K(self, Q, P, angles, quadratures): drv.memcpy_htod(self.mod_K.get_global("cos_phi")[0], cos(angles).astype(scipy.float32)) drv.memcpy_htod(self.mod_K.get_global("sin_phi")[0], sin(angles).astype(scipy.float32)) Nx = Q.shape[0] Ny = int(floor(quadratures.size / 1024.)) K = scipy.empty((Nx,), dtype=scipy.float32) Kb = drv.mem_alloc(4*Ny*Nx) Q_gpu = drv.to_device(Q) P_gpu = drv.to_device(P) self.K_gpu(drv.In(quadratures), Q_gpu, P_gpu, Kb, block=(1, 1024, 1), grid=(Nx, Ny), shared=1024*4) self.reduction_gpu(Kb, drv.Out(K), block=(1, Ny, 1), grid=(Nx, 1), shared=Ny*4) return K/self.L def reconstruct_wigner(self, angles_quadratures, Nq, Np): angles, quadratures = angles_quadratures q_mean, p_mean, s_max = estimate_position_from_quadratures(self.eta, angles, quadratures) q, p, Q, P = build_mesh(q_mean, p_mean, s_max, Nq, Np) W = self.K(Q.ravel(), P.ravel(), angles, quadratures) return q_mean, p_mean, Q, P, W.reshape(Q.shape)
def make_thunk(self, node, storage_map, _, _2): inputs = [storage_map[v] for v in node.inputs] outputs = [storage_map[v] for v in node.outputs] mod = SourceModule(""" __global__ void cyclic_roll_grad(float * input, float * output, int batch_size, int num_features) { int x = blockIdx.x*blockDim.x + threadIdx.x; // feature dim, fastest varying index! int y = blockIdx.y*blockDim.y + threadIdx.y; // batch dim int height = 4 * batch_size; int width = 4 * num_features; float val = 0; if (x < num_features && y < height) { for (int i = 0; i < 4; i++) { int y_in = (y + batch_size * (4 - i)) % height; int x_in = x + num_features * i; val += input[y_in * width + x_in]; } output[y * num_features + x] = val; } }""") kernel = mod.get_function("cyclic_roll_grad") def thunk(): in_shape = inputs[0][0].shape rows, cols = in_shape assert rows % 4 == 0 assert cols % 4 == 0 out_shape = (rows, cols // 4) batch_size = rows // 4 num_features = cols // 4 out = outputs[0] # only allocate if there is no previous allocation of the right size. if out[0] is None or out[0].shape != out_shape: out[0] = cuda.CudaNdarray.zeros(out_shape) x_block = 16 y_block = 16 block = (x_block, y_block, 1) x_grid = int(np.ceil(float(out_shape[1]) / x_block)) y_grid = int(np.ceil(float(out_shape[0]) / y_block)) grid = (x_grid, y_grid, 1) kernel(inputs[0][0], out[0], np.intc(batch_size), np.intc(num_features), block=block, grid=grid) thunk.inputs = inputs thunk.outputs = outputs thunk.lazy = False return thunk
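# A NumPy reference (a sketch, for checking the kernel above) of the same
# reduction: each output element sums the four cyclically-rolled feature
# blocks, out[y, x] = sum_i in[(y + batch*(4-i)) % (4*batch), x + features*i].
import numpy as np

def cyclic_roll_grad_ref(inp, batch_size, num_features):
    height = 4 * batch_size
    out = np.zeros((height, num_features), dtype=inp.dtype)
    for i in range(4):
        rows = (np.arange(height) + batch_size * (4 - i)) % height
        out += inp[rows, i * num_features:(i + 1) * num_features]
    return out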
def trans(self):
    transposes = """
    __global__ void transpose16(float2 *idata, float2 *odata,
                                int width, int height, int num)
    {
        __shared__ float blockx[16][16+1];
        __shared__ float blocky[16][16+1];

        int xIndex = blockIdx.x * 16 + threadIdx.x;
        int yIndex = blockIdx.y * 16 + threadIdx.y;
        int index = xIndex + yIndex*width;

        for (int i = 0; i < 16; i += 8)
        {
            blockx[threadIdx.y+i][threadIdx.x] = idata[index + i*width].x;
            blocky[threadIdx.y+i][threadIdx.x] = idata[index + i*width].y;
        }
        __syncthreads();
        for (int i = 0; i < 16; i += 8)
        {
            odata[index+i*height].x = blockx[threadIdx.x][threadIdx.y+i];
            odata[index+i*height].y = blocky[threadIdx.x][threadIdx.y+i];
        }
    }
    """
    t = SourceModule(transposes)
    self.t16 = t.get_function("transpose16")
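# A sketch of launching the tiled transpose above: 16x8 threads per block,
# since the i += 8 loop makes each thread move two elements of the 16x16
# tile. Assumes width and height are multiples of 16, as the kernel has no
# bounds checks.
import numpy as np

def transpose16(self, idata_gpu, odata_gpu, width, height, num=1):
    self.t16(idata_gpu, odata_gpu,
             np.int32(width), np.int32(height), np.int32(num),
             block=(16, 8, 1), grid=(width // 16, height // 16))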
def calculate_circuit_graph_vertex_data_device(d_D, d_C, length): logger = logging.getLogger('eulercuda.pyeulertour.calculate_circuit_graph_vertex_data_device') logger.info("started.") mod = SourceModule(""" __global__ void calculateCircuitGraphVertexData( unsigned int * D,unsigned int * C,unsigned int ecount){ unsigned int tid=(blockDim.x*blockDim.y * gridDim.x*blockIdx.y) + (blockDim.x*blockDim.y*blockIdx.x)+(blockDim.x*threadIdx.y)+threadIdx.x; if( tid <ecount) { unsigned int c=D[tid]; atomicExch(C+c,1); } } """) calculate_circuit_graph_vertex_data = mod.get_function('calculateCircuitGraphVertexData') block_dim, grid_dim = getOptimalLaunchConfiguration(length, 512) logger.info('block_dim = %s, grid_dim = %s' % (block_dim, grid_dim)) np_d_D = gpuarray.to_gpu(d_D) np_d_C = gpuarray.to_gpu(d_C) calculate_circuit_graph_vertex_data( np_d_D, np_d_C, np.uintc(length), block=block_dim, grid=grid_dim ) np_d_D.get(d_D) np_d_C.get(d_C) # devdata = pycuda.tools.DeviceData() # orec = pycuda.tools.OccupancyRecord(devdata, block_dim[0] * grid_dim[1]) # logger.info("Occupancy = %s" % (orec.occupancy * 100)) logger.info("Finished. Leaving.") return d_D, d_C
def _extract_projection_func(self, state_var): template = """ __global__ void extract_projection(%(type)s* all_V, %(type)s* projection_V, int* all_index, int* projection_index, int N) { int tid = threadIdx.x + blockIdx.x * blockDim.x; int total_threads = blockDim.x * gridDim.x; int a_ind, p_ind; for(int i = tid; i < N; i += total_threads) { a_ind = all_index[i]; p_ind = projection_index[i]; projection_V[p_ind] = all_V[a_ind]; } } """ mod = SourceModule(template % {"type": dtype_to_ctype(state_var.dtype)}, options=self.compile_options) func = mod.get_function("extract_projection") func.prepare("PPPPi") # [np.intp, np.intp, np.intp, np.intp, np.int32]) return func
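# The %(type)s substitution above is plain Python string templating; a
# minimal sketch compiling one kernel body for two C types (the names here
# are illustrative, not from the source).
from pycuda.compiler import SourceModule

scale_template = """
__global__ void scale(%(type)s *x, %(type)s a) { x[threadIdx.x] *= a; }
"""
scale_mods = {
    ctype: SourceModule(scale_template % {"type": ctype})
    for ctype in ("float", "double")
}
scale_float = scale_mods["float"].get_function("scale")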
def _get_shuffle_kernel(dtype):
    code = _shuffle_kernel % _ew_types[dtype]
    module = SourceModule(code)
    kernel = module.get_function("dimShuffle")
    kernel.prepare("PPIIIIIIIIIIIIII")
    return kernel
class _ProgramWrapper: def __init__(self, env, source, sync_calls, double=False, prelude="", **kwds): # program and kernels are tied to queue, which is not exactly logical, # but works for our purposes and makes code simpler (because program uses # single queue for all calculations anyway) self._env = env self._compile(source, double=double, prelude=prelude, **kwds) self._sync_calls = sync_calls def _compile(self, source, double=False, prelude="", manual_extern_c=False, **kwds): """ Adds helper functions and defines to given source, renders it, compiles and saves OpenCL program object. """ kernel_src = Template(source).render(**kwds) src = _header.render(cuda=True, double=double, kernels=kernel_src, prelude=prelude, manual_extern_c=manual_extern_c) try: self._program = SourceModule(src, no_extern_c=True, options=['-use_fast_math']) except: for i, l in enumerate(src.split('\n')): print i + 1, ": ", l raise def __getattr__(self, name): return _KernelWrapper(self._env, self._program.get_function(name), self._sync_calls)
def _get_transpose_kernel(dtype):
    code = _transpose_kernel % _ew_types[dtype]
    module = SourceModule(code)
    kernel = module.get_function("transpose")
    kernel.prepare("PPII")
    return kernel
def make_node(self, *inputs): _inputs = [gpu_contiguous(as_cuda_ndarray_variable(i)) for i in inputs] if self.nin > 0 and len(_inputs) != self.nin: raise TypeError('Wrong argument count', (self.nin, len(_inputs))) for i in _inputs[1:]: if i.type.ndim != inputs[0].type.ndim: raise TypeError('different ranks among inputs') if any([any(i.type.broadcastable) for i in inputs]): raise Exception("pycuda don't support broadcasted dimensions") assert len(inputs)==2#TODO remove otype = CudaNdarrayType(broadcastable=[False]*_inputs[0].type.ndim) assert self.nout == 1 fct_name = "pycuda_elemwise_%s"%str(self.scalar_op) out_node = Apply(self, _inputs, [otype() for o in xrange(self.nout)]) in_name = ["i"+str(id) for id in range(len(inputs))] out_name = ["o"+str(id) for id in range(self.nout)] c_code = self.scalar_op.c_code(out_node, "some_name", tuple([n+"[i]"for n in in_name]), tuple(n+"[i]"for n in out_name), {}) c_code_param = ", ".join([var.type.dtype_specs()[1]+" *"+name for var,name in zip(inputs,in_name) + zip(out_node.outputs,out_name)]+["int size"]) mod = SourceModule(""" #include<Python.h> #include <numpy/arrayobject.h> __global__ void %s(%s) { int i = (blockIdx.x+blockIdx.y*gridDim.x)*(blockDim.x*blockDim.y); i += threadIdx.x + threadIdx.y*blockDim.x; if(i<size){ %s } } """%(fct_name,c_code_param,c_code)) self.pycuda_fct = mod.get_function(fct_name) return out_node
def edgetaper_gpu(y_gpu, sf, win='barthann'):
    shape = np.array(y_gpu.shape).astype(np.uint32)
    dtype = y_gpu.dtype
    block_size = (16, 16, 1)
    grid_size = (int(np.ceil(float(shape[1]) / block_size[0])),
                 int(np.ceil(float(shape[0]) / block_size[1])))

    # Ensure that sf is odd
    sf = sf + (1 - np.mod(sf, 2))

    wx = scipy.signal.get_window(win, sf[1])
    wy = scipy.signal.get_window(win, sf[0])
    maxw = wx.max() * wy.max()
    hsf = np.floor(sf / 2).astype(np.int32)  # integer half-sizes, needed for slicing
    wx = (wx[0:hsf[1]] / maxw).astype(dtype)
    wy = (wy[0:hsf[0]] / maxw).astype(dtype)

    preproc = _generate_preproc(dtype, shape)
    preproc += '#define wx_size %d\n' % wx.size
    preproc += '#define wy_size %d\n' % wy.size
    mod = SourceModule(preproc + edgetaper_code, keep=True)

    # use a local name distinct from this function's own to avoid shadowing
    edgetaper_fun = mod.get_function("edgetaper")
    wx_gpu, wx_size = mod.get_global('wx')
    wy_gpu, wy_size = mod.get_global('wy')
    cu.memcpy_htod(wx_gpu, wx)
    cu.memcpy_htod(wy_gpu, wy)
    edgetaper_fun(y_gpu, np.int32(hsf[1]), np.int32(hsf[0]),
                  block=block_size, grid=grid_size)
def _init(self, pn, qn, wf_qn): super()._init(pn, qn, wf_qn) self._wf_q_grid_gpu = gpuarray.to_gpu(N.ascontiguousarray(self._wf_q_grid)) self._wf_gpu = gpuarray.to_gpu(N.ascontiguousarray(self._wf)) mod = SourceModule(""" __global__ void transform(double *ps, double *qs, double *wf_q_grid, double *wf, double *out_real, double *out_imag) {{ int idx_x = threadIdx.x + blockIdx.x * blockDim.x; int idx_y = threadIdx.y + blockIdx.y * blockDim.y; int idx = idx_x + idx_y * {qn}; double qdiff, prefactor, s, c; if (idx_x >= {qn} || idx_y >= {pn}) return; for (int j = 0; j < {wf_qn}; j++) {{ qdiff = wf_q_grid[j] - qs[idx]; prefactor = exp({g} * qdiff * qdiff) * wf[j]; sincos({h} * ps[idx] * qdiff, &s, &c); out_real[idx] += prefactor * c; out_imag[idx] += prefactor * s; }} }} """.format(g=-0.5*self._gamma, h=1./HBAR, pn=pn, qn=qn, wf_qn=wf_qn)) self._kernel = mod.get_function('transform') self._kernel.prepare('PPPPPP') self._gpu_grid, self._gpu_block = carve_array(qn, pn)
def gamma_uncompress_gpu(y_gpu):
    """Do a gamma decompression on the image.

    This function does an in-place gamma decompression on the input image.

    Args:
      y_gpu: Input image on GPU.

    Returns:
      Nothing, but modifies y_gpu in place.
    """
    shape = np.array(y_gpu.shape).astype(np.uint32)
    dtype = y_gpu.dtype
    if len(shape) == 3:
        dim = int(shape[2])
    else:
        dim = 1
    block_size = (16, 16, dim)
    grid_size = (int(np.ceil(float(shape[1]) / block_size[0])),
                 int(np.ceil(float(shape[0]) / block_size[1])))

    preproc = _generate_preproc(dtype, shape)
    mod = SourceModule(preproc + kernel_code, keep=True)

    gamma_uncompress_fun = mod.get_function("gamma_uncompress")
    gamma_uncompress_fun(y_gpu.gpudata, block=block_size, grid=grid_size)
def shock_filter_gpu(y_gpu, iter=3, dt=0.1, h=1):
    """Evolve image according to a "shock filter" process.

    This function does an in-place shock filtering on the input image.

    Args:
      y_gpu: Input image on GPU.
      iter: Number of time steps.
      dt: Duration of one time step.
      h: Size of grid steps.

    Returns:
      Nothing, but modifies y_gpu in place.
    """
    stream = cu.Stream()

    shape = np.array(y_gpu.shape).astype(np.uint32)
    dtype = y_gpu.dtype
    block_size = (16, 16, 1)
    grid_size = (int(np.ceil(float(shape[1]) / block_size[0])),
                 int(np.ceil(float(shape[0]) / block_size[1])))
    shared_size = int((2 + block_size[0]) * (2 + block_size[1]) * dtype.itemsize)

    preproc = _generate_preproc(dtype, shape)
    mod = SourceModule(preproc + kernel_code, keep=True)

    shock_gpu = mod.get_function("shock")
    for i in range(iter):
        shock_gpu(y_gpu.gpudata, np.float32(dt), np.float32(h),
                  block=block_size, grid=grid_size, stream=stream,
                  shared=shared_size)
    stream.synchronize()
class CUDAModule(object): """ Interfaces with PyCUDA Parameters ---------- kernel_dict : """ def __init__(self, kernel_dict): self.kernel_dict = kernel_dict self.support_code = _get_support_code() self.all_code = self._get_full_source() try: self.pycuda_module = SourceModule(self.all_code) except Exception: f = open('foo.cu', 'w') print >> f, self.all_code f.close() raise def _get_full_source(self): formatted_kernels = [kern.get_code() for kern in self.kernel_dict.values()] return '\n'.join([self.support_code] + formatted_kernels) def get_function(self, name): return self.pycuda_module.get_function('k_%s' % name)
def bfilter_gpu(y_gpu, w=5, sigma_d=3, sigma_r=0.1):
    """Two dimensional bilateral filtering.

    This function implements 2-D bilateral filtering using the method
    outlined in:

      C. Tomasi and R. Manduchi. Bilateral Filtering for Gray and Color
      Images. In Proceedings of the IEEE International Conference on
      Computer Vision, 1998.

    This operation is in place.

    Args:
      y_gpu: Input image on GPU.
      w: Half-size of the filter.
      sigma_d: Spatial domain deviation.
      sigma_r: Intensity domain deviation.

    Returns:
      Nothing, but modifies y_gpu in place.
    """
    shape = np.array(y_gpu.shape).astype(np.uint32)
    dtype = y_gpu.dtype
    block_size = (16, 16, 1)
    grid_size = (int(np.ceil(float(shape[1]) / block_size[0])),
                 int(np.ceil(float(shape[0]) / block_size[1])))
    shared_size = int((2*w + block_size[0]) * (2*w + block_size[1]) * dtype.itemsize)

    preproc = _generate_preproc(dtype, shape)
    mod = SourceModule(preproc + kernel_code, keep=True)

    # use a local name distinct from this function's own to avoid shadowing
    bfilter_fun = mod.get_function("bfilter")
    bfilter_fun(y_gpu.gpudata, np.int32(w), np.float32(sigma_d),
                np.float32(sigma_r),
                block=block_size, grid=grid_size, shared=shared_size)
from pycuda.compiler import SourceModule args = 'float *fr00' body = 'f = 0.543*fr00[idx];\n\t__syncthreads();\n' for i in range(nloop): if (i > 0): args += ', float *fr%.2d' % i body += '\tf += %1.3f*fr%.2d[idx];\n\t__syncthreads();\n' % ( np.random.ranf(), i) kernels += kernel_template.replace('NAME', 'func%.2d' % i).replace( 'ARGS', args).replace('BODY', body) print kernels mod = SourceModule(kernels) kern_list = [] for i in range(nloop): kern_list.append(mod.get_function("func%.2d" % i)) # measure kernel execution time start = cuda.Event() stop = cuda.Event() exec_time = np.zeros(nloop, dtype=np.float64) for k in range(10): for i in range(nloop): fr_gpus = 'fr_gpu_list[0]' for j in range(1, i + 1): fr_gpus += ', fr_gpu_list[%d]' % j cmd = 'kern_list[%d](fw_gpu, %s, block=(512,1,1), grid=(nx/512,1))' % ( i, fr_gpus) #print cmd start.record()
import pycuda.driver as cuda import pycuda.autoinit from pycuda.compiler import SourceModule import numpy as np a = np.random.randn(4, 4) a = a.astype(np.float32) a_gpu = cuda.mem_alloc(a.nbytes) cuda.memcpy_htod(a_gpu, a) mod = SourceModule(""" __global__ void doublify(float *a) { int idx = threadIdx.x + threadIdx.y * 4; a[idx] *= 2; } """) func = mod.get_function("doublify") func(a_gpu, block=(4, 4, 1)) a_doubled = np.empty_like(a) cuda.memcpy_dtoh(a_doubled, a_gpu) print a_doubled print a
for(int binsidx = binpfx[bin_idx]; binsidx < binpfx[bin_idx] + bincounters[bin_idx]; binsidx++){{ int otherr = (int)(bins[binsidx][0]); // location in field int otherc = (int)(bins[binsidx][1]); int viewr = otherr - myr + view_range; int viewc = otherc - myc + view_range; // If if statement passes, then other agent is in view range. if(0 <= viewr && viewr < {view_size} && 0 <= viewc && viewc < {view_size}){{ states[viewr * {view_size} + viewc] = 1.0; }} }} }} }} // Render -1 for walls for(int viewr = 0; viewr < {view_size}; viewr++){{ for(int viewc = 0; viewc < {view_size}; viewc++){{ int fieldr = myr + viewr - view_range; int fieldc = myc + viewc - view_range; if(fieldr < 0 || fieldc < 0 || fieldr >= {rows} || fieldc >= {cols}){{ int stateidx = statesize * i + viewr * {view_size} + viewc; states[stateidx] = -1; }} }} }} }} }}""" print(kernel) mod = SourceModule(kernel) step = mod.get_function("tensor")
def _get_bprop_upsampling(clss, compute_capability): code = r""" #define FLT_MAX 3.402823466E+38F %(common)s __global__ void spool_bprop_upsampling( const %(type)s* I, %(type)s* O, unsigned char* A, float alpha, float beta, int flags, int N, int W, int H, int D, int C, int WN, int HWN, int DHWN, int P, int Q, int magic_P, int shift_P, int QN, int PQN, int MPQN, int pad_c, int pad_d, int pad_h, int pad_w, int str_c, int str_d, int str_h, int str_w, int S, int RS, int RST, int JRST, int magic_S, int shift_S, int magic_RS, int shift_RS, int magic_RST, int shift_RST, int supP, int supQ, int shlP, int maskP, int shrP, int shlQ, int maskQ, int shrQ, int maskN, int shrN %(stats_args)s ) { extern __shared__ int lut[]; int tid = threadIdx.x; int q = blockIdx.x; int mp = blockIdx.y; int k = blockIdx.z; int m = mp * magic_P; m >>= shift_P; int p = mp - m*supP; // zigzag q back and forth to improve L2 cache perf if (p & 1) q = supQ - q - 1; // Superblock P and Q p = (p << shlP) + ((tid & maskP) >> shrP); q = (q << shlQ) + ((tid & maskQ) >> shrQ); int n = tid & maskN; int sb = tid >> shrN; int offset = k*MPQN + m*PQN + p*QN + mad16(q, N, n); I += n; O += offset; A += offset; float O_val = beta != 0.0f && p < P && q < Q && n < N ? %(cvt)s(__ldg(O)) : 0.0f; if (tid < 32) { int kj = k * str_c - pad_c; int mt = m * str_d - pad_d; int pr = p * str_h - pad_h; int qs = q * str_w - pad_w; int inc = min(maskN + 1, 32); int jrst = n; while (jrst < JRST) { int j = div16(jrst, magic_RST, shift_RST); int rst = mod16(jrst, j, RST); int t = div16(rst, magic_RS, shift_RS); int rs = mod16(rst, t, RS); int r = div16(rs, magic_S, shift_S); int s = mod16(rs, r, S); int x = qs + s; int y = pr + r; int z = mt + t; int c = kj + j; bool bounds_x = x >= 0 && x < W; bool bounds_y = y >= 0 && y < H; bool bounds_z = z >= 0 && z < D; bool bounds_c = c >= 0 && c < C; bool in_bounds = bounds_x && bounds_y && bounds_z && bounds_c; int sliceI = c*DHWN + z*HWN + y*WN + x*N; int lut_offset = mad16(sb, JRST, jrst); lut[lut_offset] = in_bounds ? sliceI : -1; jrst += inc; } } __syncthreads(); int intermediate_max = 0; if (p < P && q < Q && n < N) { int jrst = 0; int argmax = 0; float max = -FLT_MAX; while (jrst < JRST) { int lut_offset = mad16(sb, JRST, jrst); //int slice0 = lut[lut_offset + 0]; //int slice1 = lut[lut_offset + 1]; //int slice2 = lut[lut_offset + 2]; //int slice3 = lut[lut_offset + 3]; int slice = lut[lut_offset + *A - jrst]; // val needs to stay in fp32 or can't be se to FLT_MAX //float val0 = jrst + 0 < JRST && slice0 >= 0 ? %(cvt)s(__ldg(I + slice0)) : -FLT_MAX; //float val1 = jrst + 1 < JRST && slice1 >= 0 ? %(cvt)s(__ldg(I + slice1)) : -FLT_MAX; //float val2 = jrst + 2 < JRST && slice2 >= 0 ? %(cvt)s(__ldg(I + slice2)) : -FLT_MAX; //float val3 = jrst + 3 < JRST && slice3 >= 0 ? 
%(cvt)s(__ldg(I + slice3)) : -FLT_MAX; //if (*A == jrst + 0) { // max = val0; //} //if (*A == jrst + 1) { // max = val1; //} //if (*A == jrst + 2) { // max = val2; //} //if (*A == jrst + 3) { // max = val3; //} max = %(cvt)s(__ldg(I + slice)); jrst += 4; } // convert back to fp to write out %(type)s temp_out = %(cvt_out)s( %(mul_by_scale)s (max*alpha + O_val*beta)); if (!(flags & 1)) { *O = temp_out; } intermediate_max = max_abs(0, temp_out); // compute abs } intermediate_max += 0; %(atomic_max)s } """ template_vals = prepare_template_vals(clss, compute_capability) code = code % template_vals module = SourceModule(code) kernel = module.get_function("spool_bprop_upsampling") sig = "3P 2f 44I" + ("Pf" if (clss[0] == "x") else "") kernel.prepare(sig) return kernel
#define INDEX(a, b, yshape) (a)*(yshape) + (b) __global__ void matrixAddition(float *A,float *B, float alpha, float beta, int ydim) { unsigned int idx = threadIdx.x+(blockIdx.x*(blockDim.x*blockDim.y)); unsigned int a = idx/ydim; unsigned int b = idx%ydim; A[INDEX(a, b, ydim)] = alpha*A[INDEX(a, b, ydim)]+beta*B[INDEX(a, b, ydim)]; } """ ) matrixAddition = mod.get_function("matrixAddition") def matAdd(A, B, alpha, beta): forme1 = A.shape forme2 = B.shape if (forme1 != forme2): sys.exit('matrix dimensions differ') aSize = forme1[0] * forme1[1] xdim = np.int32(forme1[0]) ydim = np.int32(forme1[1]) A = np.reshape(A, aSize, order='F').astype(np.float32) B = np.reshape(B, aSize, order='F').astype(np.float32) alpha = np.float32(alpha)
// result must start off as a zero array __global__ void BatchMatColDotKernel(const int p, const int q, const float *W, const float *W_tilde, const int *batch_i, const int *batch_j, float *result) { int batch_index = blockIdx.y * blockDim.y + threadIdx.y; if (batch_index >= p) return; int col = blockIdx.x * blockDim.x + threadIdx.x; if (col >= q) return; int row_W = batch_i[batch_index]; int row_W_tilde = batch_j[batch_index]; atomicAdd(&result[batch_index], W[row_W * q + col] * W_tilde[row_W_tilde * q + col]); } """) batchMatSubtractInplace = mod.get_function("BatchMatSubtractInplaceKernel") batchVecSubtractInplace = mod.get_function("BatchVecSubtractInplaceKernel") batchMatVecRowMult = mod.get_function("BatchMatVecRowMultKernel") batchCopyVectorKernel = mod.get_function("BatchCopyVectorKernel") batchMatColDot = mod.get_function("BatchMatColDotKernel") def assertResultsClose(gpu_result, actual_result): assert np.allclose(gpu_result, actual_result), "GPU Result:\n" + repr(gpu_result) \ + "\nActual Result:\n" + repr(actual_result) + "\n" def testBatchCopyVectorKernel(): batch_size = 128 num_elems = 1000
def CudaNegative(inPath, outPath): totalT0 = time.clock() im = Image.open(inPath) px = numpy.array(im) px = px.astype(numpy.float32) getAndConvertT1 = time.clock() allocT0 = time.clock() d_px = cuda.mem_alloc(px.nbytes) cuda.memcpy_htod(d_px, px) allocT1 = time.clock() #Kernel declaration kernelT0 = time.clock() #Kernel grid and block size BLOCK_SIZE = 1024 block = (1024, 1, 1) checkSize = numpy.int32(im.size[0] * im.size[1]) grid = (int(im.size[0] * im.size[1] / BLOCK_SIZE) + 1, 1, 1) #Kernel text kernel = """ __global__ void ng( float *inIm, int check ){ int idx = (threadIdx.x ) + blockDim.x * blockIdx.x ; if(idx *3 < check*3) { inIm[idx*3]= 255-inIm[idx*3]; inIm[idx*3+1]= 255-inIm[idx*3+1]; inIm[idx*3+2]= 255-inIm[idx*3+2]; } } """ #Compile and get kernel function mod = SourceModule(kernel) func = mod.get_function("ng") func(d_px, checkSize, block=block, grid=grid) kernelT1 = time.clock() #Get back data from gpu backDataT0 = time.clock() ngPx = numpy.empty_like(px) cuda.memcpy_dtoh(ngPx, d_px) ngPx = (numpy.uint8(ngPx)) backDataT1 = time.clock() #Save image storeImageT0 = time.clock() pil_im = Image.fromarray(ngPx, mode="RGB") pil_im.save(outPath) totalT1 = time.clock() getAndConvertTime = getAndConvertT1 - totalT0 allocTime = allocT1 - allocT0 kernelTime = kernelT1 - kernelT0 backDataTime = backDataT1 - backDataT0 storeImageTime = totalT1 - storeImageT0 totalTime = totalT1 - totalT0 print "Negative image" print "Image size: ", im.size print "Time taken to get and convert image data: ", getAndConvertTime print "Time taken to allocate memory on the GPU: ", allocTime print "Kernel execution time: ", kernelTime print "Time taken to get image data from GPU and convert it: ", backDataTime print "Time taken to save the image: ", storeImageTime print "Total execution time: ", totalTime print
from gpu.gpuSimulation import useCachedKernels fluxCode = open( os.path.join(os.path.dirname(__file__), './fluxCalculations.cu'), 'r') try: # Put the kernel code into a SourceModule if useCachedKernels: fluxModule = SourceModule(fluxCode.read()) else: fluxModule = SourceModule(fluxCode.read(), cache_dir=False) fluxCode.close() # Create reference to the specific functions in the SourceModule FluxSolverFn = fluxModule.get_function("FluxSolver") BuildRFn = fluxModule.get_function("buildRValues") # Create callable functions def FluxSolver(FluxesGPU, UIntPtsGPU, BottomIntPtsGPU, propSpeedsGPU, m, n, blockDims, gridDims): FluxSolverFn(FluxesGPU, UIntPtsGPU, BottomIntPtsGPU, propSpeedsGPU, np.int32(m), np.int32(n), block=(blockDims[0], blockDims[1], 1), grid=(gridDims[0], gridDims[1]))
mod = SourceModule(""" surface<void, 2> surf; __global__ void kernel(int width, int height) { // Calculate surface coordinates unsigned int x = blockIdx.x * blockDim.x + threadIdx.x; unsigned int y = blockIdx.y * blockDim.y + threadIdx.y; if (x < 400 && y < 400) { float data = x / 400.f; // Write to output surface surf2Dwrite(data, surf, x*4, y); } } """) kernel_function = mod.get_function('kernel') surface_ref = mod.get_surfref('surf') # surface_ref.set_array(Density.ping_array,0) surface_ref.set_array(Density.ping_array) def Program(fragment): program = gloo.Program("vertex_passthrough.vert", fragment, count=4) program['Position'] = [(-1,-1), (-1,+1), (+1,-1), (+1,+1)] return program Density = Slab(GridWidth, GridHeight, 1, gl.GL_LINEAR) prog_visualize = Program("visualize.frag")
def _diameter_kernel():
    """Returns the CUDA kernel to estimate 3D diameter of structures in a
    3D local window.
    """
    diameter_kernel_src = """
    texture<float, cudaTextureType3D, cudaReadModeElementType> tex_data;

    __global__ void diameter3d (unsigned int width,
                                unsigned int height,
                                unsigned int depth,
                                const int n_points,
                                const float norm_factor,
                                const int max_iters,
                                const int n_scan_angles,
                                const int *X,
                                const int *Y,
                                const int *Z,
                                const float *scan_angl_arr,
                                const float *azth_data,
                                const float *lat_data,
                                float *radius_arr)
    {
        unsigned long blockId, idx;
        blockId = blockIdx.x + blockIdx.y * gridDim.x;
        idx = blockId * (blockDim.x * blockDim.y) +
              (threadIdx.y * blockDim.x) + threadIdx.x;

        if (idx >= n_points) {  // '>' would let the thread idx == n_points read past the arrays
            return;
        }

        float _x, _y, _z;
        _x = (float)X[idx] + 0.5;
        _y = (float)Y[idx] + 0.5;
        _z = (float)Z[idx] + 0.5;

        // -----------------------------------------
        // Find the diameter
        float azth = azth_data[idx];
        azth += M_PI_2;
        float lat = lat_data[idx];

        float cy = cosf(lat), sy = sinf(lat);
        float cz = cosf(azth), sz = sinf(azth);
        //float cz = -sinf(azth), sz = cosf(azth); // taking into account azth + pi/2

        //vector along fiber
        //float dx = -sy * sz;
        //float dy = cy * sz;
        //float dz = cy;

        // indices fixed: the original read uvec[3], one element past the end
        float uvec[3] = {0.0, 0.0, 1.0};
        float fiber_vector_x[3] = {0.0,
                                   uvec[1]*cy + uvec[2]*sy,
                                   -uvec[1]*sy + uvec[2]*cy};
        float fiber_vector_z[3] = {fiber_vector_x[0]*cz - fiber_vector_x[1]*sz,
                                   fiber_vector_x[0]*sz + fiber_vector_x[1]*cz,
                                   fiber_vector_x[2]};
        float dx = fiber_vector_z[0], dy = fiber_vector_z[1], dz = fiber_vector_z[2];

        //scan vector perpendicular to a fiber vector (rotation X -> Z)
        float scan_vec[3] = {0, 1, 0}; // unit vector perpendicular to default (0,0,1) direction
        float rot_scan_vec_x[3] = {0.0,
                                   scan_vec[1]*cy + scan_vec[2]*sy,
                                   -scan_vec[1]*sy + scan_vec[2]*cy};
        float rot_scan_vec_z[3] = {rot_scan_vec_x[0]*cz - rot_scan_vec_x[1]*sz,
                                   rot_scan_vec_x[0]*sz + rot_scan_vec_x[1]*cz,
                                   rot_scan_vec_x[2]};

        float out_radius = 0;
        for (int scan_angl_idx = 0; scan_angl_idx < n_scan_angles; scan_angl_idx++) {
            float theta = scan_angl_arr[scan_angl_idx];
            float ct = cosf(theta), st = sinf(theta);

            float x = rot_scan_vec_z[0], y = rot_scan_vec_z[1], z = rot_scan_vec_z[2];
            float u = dx, v = dy, w = dz;

            //rotation of point (x,y,z) around axis (u,v,w)
            float scan_vec_coords[3] = {u*(u*x + v*y + w*z)*(1.0f - ct) + x*ct + (-w*y + v*z)*st,
                                        v*(u*x + v*y + w*z)*(1.0f - ct) + y*ct + (w*x - u*z)*st,
                                        w*(u*x + v*y + w*z)*(1.0f - ct) + z*ct + (-v*x + u*y)*st};

            float nc[3] = {_x, _y, _z};
            // initialized so a scan that never leaves the structure adds zero
            // radius instead of reading an uninitialized p
            float p[3] = {_x, _y, _z};

            for (int i = 0; i < max_iters; i++) {
                nc[0] += scan_vec_coords[0];
                nc[1] += scan_vec_coords[1];
                nc[2] += scan_vec_coords[2];

                if (tex3D(tex_data, nc[0], nc[1], nc[2]) == 0) {
                    p[0] = nc[0];
                    p[1] = nc[1];
                    p[2] = nc[2];
                    break;
                }
            }

            out_radius += norm3df(p[0] - _x, p[1] - _y, p[2] - _z);
        }

        radius_arr[idx] = out_radius * norm_factor;
    }
    """
    dm_program = SourceModule(diameter_kernel_src)
    diameter3d = dm_program.get_function("diameter3d")
    return dm_program, diameter3d
ey_gpu = cuda.to_device(f) ez_gpu = cuda.to_device(f) hx_gpu = cuda.to_device(f) hy_gpu = cuda.to_device(f) hz_gpu = cuda.to_device(f) cex_gpu = cuda.to_device(set_c(cf, (None, -1, -1))) cey_gpu = cuda.to_device(set_c(cf, (-1, None, -1))) cez_gpu = cuda.to_device(set_c(cf, (-1, -1, None))) # prepare kernels from pycuda.compiler import SourceModule mod = SourceModule( kernels.replace('TPB', str(tpb)).replace('nyz', str(ny * nz)).replace( 'nx', str(nx)).replace('ny', str(ny)).replace('nz', str(nz))) update_h = mod.get_function("update_h") update_e = mod.get_function("update_e") update_src = mod.get_function("update_src") thx = mod.get_texref("thx") thy = mod.get_texref("thy") thz = mod.get_texref("thz") tcex = mod.get_texref("tcex") tcey = mod.get_texref("tcey") tcez = mod.get_texref("tcez") thx.set_address(hx_gpu, f.nbytes) thy.set_address(hy_gpu, f.nbytes) thz.set_address(hz_gpu, f.nbytes) tcex.set_address(cex_gpu, cf.nbytes) tcey.set_address(cey_gpu, cf.nbytes) tcez.set_address(cez_gpu, cf.nbytes)
def _build_kernel_float32(): code = """ __device__ float weight(float B, float A, float div1) { float tmp = B - A; return exp( - tmp * tmp * div1 ); } __global__ void Funktion(int axis, int segment, float *raw, int *slices, float *a, int xsh, int ysh, int zsh, int *indices, int sorw, float *Beta, int nbrw) { int col_g = blockIdx.x * blockDim.x + threadIdx.x; int row_g = blockIdx.y * blockDim.y + threadIdx.y; int slc_g = blockIdx.z; int xsh_g, ysh_g, plane, row, column; if (axis == 0) { plane = indices[slc_g]; row = row_g; column = col_g; xsh_g = xsh; ysh_g = ysh; } else if (axis == 1) { row = indices[slc_g]; plane = row_g; column = col_g; xsh_g = xsh; ysh_g = zsh; } else if (axis == 2) { column = indices[slc_g]; plane = row_g; row = col_g; xsh_g = ysh; ysh_g = zsh; } int flat = xsh * ysh; int flat_g = xsh_g * ysh_g; unsigned int index = slc_g * flat_g + row_g * xsh_g + col_g; if (index<gridDim.z*flat_g && plane>0 && plane<zsh-1 && row>0 && row<ysh-1 && column>0 && column<xsh-1) { if (slices[index]==segment) { /* Adaptive random walks */ int found = 0; if ((col_g + row_g) % 4 == 0) { found = 1; } else { for (int y = -100; y < 101; y++) { for (int x = -100; x < 101; x++) { if (row_g+y > 0 && col_g+x > 0 && row_g+y < ysh_g-1 && col_g+x < xsh_g-1) { unsigned int tmp = slc_g * flat_g + (row_g + y) * xsh_g + (col_g + x); if (slices[tmp] != segment && slices[tmp] != -1) { found = 1; } } } } } if (found == 1) { float rand; float W0,W1,W2,W3,W4,W5; int n,o,p; /* Initialize MRG32k3a */ float norm = 2.328306549295728e-10; float m1 = 4294967087.0; float m2 = 4294944443.0; float a12 = 1403580.0; float a13n = 810728.0; float a21 = 527612.0; float a23n = 1370589.0; long k1; float p1, p2; float s10 = index, s11 = index, s12 = index, s20 = index, s21 = index, s22 = index; /* Compute standard deviation */ unsigned int position = plane*flat + row*xsh + column; float B = raw[position]; float var = Beta[index]; float div1 = 1 / (2 * var); int k = plane; int l = row; int m = column; int step = 0; int n_rw = 0; /* Compute random walks */ while (n_rw < nbrw) { /* Compute weights */ W0 = weight(B, raw[position + flat], div1); W1 = weight(B, raw[position - flat], div1); W2 = weight(B, raw[position + xsh], div1); W3 = weight(B, raw[position - xsh], div1); W4 = weight(B, raw[position + 1], div1); W5 = weight(B, raw[position - 1], div1); W1 += W0; W2 += W1; W3 += W2; W4 += W3; W5 += W4; /* Compute random numbers with MRG32k3a */ /* Component 1 */ p1 = a12 * s11 - a13n * s10; k1 = p1 / m1; p1 -= k1 * m1; if (p1 < 0.0){ p1 += m1;} s10 = s11; s11 = s12; s12 = p1; /* Component 2 */ p2 = a21 * s22 - a23n * s20; k1 = p2 / m2; p2 -= k1 * m2; if (p2 < 0.0){ p2 += m2;} s20 = s21; s21 = s22; s22 = p2; /* Combination */ if (p1 <= p2) { rand = W5 * ((p1 - p2 + m1) * norm); } else { rand = W5 * ((p1 - p2) * norm); } /* Determine new direction of random walk */ if (rand<W0 || rand==0){n=1; o=0; p=0;} else if (rand>=W0 && rand<W1){n=-1; o=0; p=0;} else if (rand>=W1 && rand<W2){n=0; o=1; p=0;} else if (rand>=W2 && rand<W3){n=0; o=-1; p=0;} else if (rand>=W3 && rand<W4){n=0; o=0; p=1;} else if (rand>=W4 && rand<=W5){n=0; o=0; p=-1;} /* Move in new direction */ if (k+n>0 && k+n<zsh-1 && l+o>0 && l+o<ysh-1 && m+p>0 && m+p<xsh-1) { k += n; l += o; m += p; position = k*flat + l*xsh + m; atomicAdd(&a[position], 1); } step += 1; if (step==sorw) { k = plane; l = row; m = column; position = k*flat + l*xsh + m; n_rw += 1; step = 0; } } } } } } """ mod = SourceModule(code) kernel = mod.get_function("Funktion") return kernel
mcopy = cuda.Memcpy3D() mcopy.width_in_bytes = mcopy.src_pitch = f.strides[1] mcopy.src_height = mcopy.height = ny mcopy.depth = nz memcopy(mcopy, set_c(f,(None,-1,-1)), cex_gpu) memcopy(mcopy, set_c(f,(-1,None,-1)), cey_gpu) memcopy(mcopy, set_c(f,(-1,-1,None)), cez_gpu) memcopy(mcopy, set_c(f,(None,0,0)), chx_gpu) memcopy(mcopy, set_c(f,(0,None,0)), chy_gpu) memcopy(mcopy, set_c(f,(0,0,None)), chz_gpu) # prepare kernels from pycuda.compiler import SourceModule mod = SourceModule(kernels) update_e = mod.get_function("update_e") update_h = mod.get_function("update_h") update_src = mod.get_function("update_src") tex = mod.get_texref("tex") tey = mod.get_texref("tey") tez = mod.get_texref("tez") thx = mod.get_texref("thx") thy = mod.get_texref("thy") thz = mod.get_texref("thz") tcex = mod.get_texref("tcex") tcey = mod.get_texref("tcey") tcez = mod.get_texref("tcez") tchx = mod.get_texref("tchx") tchy = mod.get_texref("tchy") tchz = mod.get_texref("tchz")
def CudaBrightness(inPath, outPath): totalT0 = time.clock() im = Image.open(inPath) px = numpy.array(im) px = px.astype(numpy.float32) getAndConvertT1 = time.clock() allocT0 = time.clock() d_px = cuda.mem_alloc(px.nbytes) cuda.memcpy_htod(d_px, px) allocT1 = time.clock() #Kernel declaration kernelT0 = time.clock() #Kernel grid and block size BLOCK_SIZE = 1024 block = (1024, 1, 1) checkSize = numpy.int32(im.size[0] * im.size[1]) grid = (int(im.size[0] * im.size[1] / BLOCK_SIZE) + 1, 1, 1) #Kernel text kernel = """ __global__ void br( float *inIm, int check, int brightness ){ int idx = (threadIdx.x ) + blockDim.x * blockIdx.x ; if(idx *3 < check*3) { if(inIm[idx*3]+brightness > 255) inIm[idx*3] = 255; else inIm[idx*3]= inIm[idx*3]+brightness; if(inIm[idx*3+1]+brightness > 255) inIm[idx*3+1] = 255; else inIm[idx*3+1]= inIm[idx*3+1]+brightness; if(inIm[idx*3+2]+brightness > 255) inIm[idx*3+2] = 255; else inIm[idx*3+2]= inIm[idx*3+2]+brightness; } } """ brightness = int( raw_input("Enter the level of brightness (-255 to 255): ")) print if brightness > 255: brightness = 255 if brightness < -255: brightness = -255 brightness = numpy.int32(brightness) #Compile and get kernel function mod = SourceModule(kernel) func = mod.get_function("br") func(d_px, checkSize, brightness, block=block, grid=grid) kernelT1 = time.clock() #Get back data from gpu backDataT0 = time.clock() brPx = numpy.empty_like(px) cuda.memcpy_dtoh(brPx, d_px) brPx = (numpy.uint8(brPx)) backDataT1 = time.clock() #Save image storeImageT0 = time.clock() pil_im = Image.fromarray(brPx, mode="RGB") pil_im.save(outPath) totalT1 = time.clock() getAndConvertTime = getAndConvertT1 - totalT0 allocTime = allocT1 - allocT0 kernelTime = kernelT1 - kernelT0 backDataTime = backDataT1 - backDataT0 storeImageTime = totalT1 - storeImageT0 totalTime = totalT1 - totalT0 print "Brightness filter" print "Image size : ", im.size print "Time taken to get and convert image data: ", getAndConvertTime print "Time taken to allocate memory on the GPU: ", allocTime print "Kernel execution time: ", kernelTime print "Time taken to get image data from GPU and convert it: ", backDataTime print "Time taken to save the image: ", storeImageTime print "Total execution time: ", totalTime print
import pycuda.gpuarray as gpuarray import pycuda.driver as drv import pycuda.autoinit from pycuda.compiler import SourceModule import numpy # Kernel function mod = SourceModule(""" __global__ void square(float *d_a) { int idx = threadIdx.x + threadIdx.y*5; d_a[idx] = d_a[idx]*d_a[idx]; } """) square = mod.get_function("square") # ---------------Using mem_alloc--------------- # start = drv.Event() end = drv.Event() h_a = numpy.random.randint(1, 5, (5, 5)) h_a = h_a.astype(numpy.float32) h_b = h_a.copy() start.record() d_a = drv.mem_alloc(h_a.size * h_a.dtype.itemsize) drv.memcpy_htod(d_a, h_a) # Calling kernel square(d_a, block=(5, 5, 1), grid=(1, 1), shared=0) h_result = numpy.empty_like(h_a) drv.memcpy_dtoh(h_result, d_a) end.record()
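# Completing the timing pattern above: after end.record(), synchronize on
# the event and read the elapsed time in milliseconds with Event.time_till.
end.synchronize()
print("mem_alloc path took %.3f ms" % start.time_till(end))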
def CudaColor(inPath, outPath): totalT0 = time.clock() im = Image.open(inPath) px = numpy.array(im) px = px.astype(numpy.float32) getAndConvertT1 = time.clock() allocT0 = time.clock() d_px = cuda.mem_alloc(px.nbytes) cuda.memcpy_htod(d_px, px) allocT1 = time.clock() #Kernel declaration kernelT0 = time.clock() #Kernel grid and block size BLOCK_SIZE = 1024 block = (1024, 1, 1) checkSize = numpy.int32(im.size[0] * im.size[1]) grid = (int(im.size[0] * im.size[1] / BLOCK_SIZE) + 1, 1, 1) #Kernel text kernel = """ __global__ void co( float *inIm, int check, int color){ int idx = (threadIdx.x ) + blockDim.x * blockIdx.x ; if(idx*3 < check*3) { if(color == 0) { inIm[idx*3+1] = inIm[idx*3+1]-255; inIm[idx*3+2] = inIm[idx*3+2]-255; } else if(color == 1) { inIm[idx*3] = inIm[idx*3]-255; inIm[idx*3+2] = inIm[idx*3+2]-255; } else if(color == 2) { inIm[idx*3] = inIm[idx*3]-255; inIm[idx*3+1] = inIm[idx*3+1]-255; } if(inIm[idx*3] < 0) inIm[idx*3] = 0; if(inIm[idx*3] > 255) inIm[idx*3] = 255; if(inIm[idx*3+1] < 0) inIm[idx*3+1] = 0; if(inIm[idx*3+1] > 255) inIm[idx*3+1] = 255; if(inIm[idx*3+2] < 0) inIm[idx*3+2] = 0; if(inIm[idx*3+2] > 255) inIm[idx*3+2] = 255; } } """ color = int( raw_input("Enter the color of the filter (0-Red;1-Green;2-Blue): ")) print color = numpy.int32(color) #Compile and get kernel function mod = SourceModule(kernel) func = mod.get_function("co") func(d_px, checkSize, color, block=block, grid=grid) kernelT1 = time.clock() #Get back data from gpu backDataT0 = time.clock() coPx = numpy.empty_like(px) cuda.memcpy_dtoh(coPx, d_px) coPx = (numpy.uint8(coPx)) backDataT1 = time.clock() #Save image storeImageT0 = time.clock() pil_im = Image.fromarray(coPx, mode="RGB") pil_im.save(outPath) totalT1 = time.clock() getAndConvertTime = getAndConvertT1 - totalT0 allocTime = allocT1 - allocT0 kernelTime = kernelT1 - kernelT0 backDataTime = backDataT1 - backDataT0 storeImageTime = totalT1 - storeImageT0 totalTime = totalT1 - totalT0 print "Color Filter" print "Image size : ", im.size print "Time taken to get and convert image data: ", getAndConvertTime print "Time taken to allocate memory on the GPU: ", allocTime print "Kernel execution time: ", kernelTime print "Time taken to get image data from GPU and convert it: ", backDataTime print "Time taken to save the image: ", storeImageTime print "Total execution time: ", totalTime print
z_re2 = z_re*z_re; z_im = 2 * z_re * z_im + c_im; z_re = z_re2 - z_im2 + c_re; } } if(z_im2 + z_re2 > 1000){ b[gid] = log(2 * cabs(z_re, z_im)*log(cabs(z_re, z_im))/cabs(dz_re, dz_im)); } else { b[gid] = 0; } } """ mod = SourceModule(KERNELS) exterior_distance_gpu = mod.get_function('exterior_distance') def exterior_distances(im, re): b = np.zeros_like(im) exterior_distance_gpu(cuda.In(im), cuda.In(re), cuda.InOut(b), grid=(b.size // 1024, 1, 1), block=(1024, 1, 1)) return b def process_and_store_image(image, re, im, zoom): image = cv2.GaussianBlur(image, (3, 3), cv2.BORDER_DEFAULT) image = cv2.resize(image, (16 * 256, 16 * 256),
} } } """ try: Context.get_device() except: import pycuda.autoinit mod = SourceModule(krnl, no_extern_c=True) _gpu_expm = mod.get_function("expm") def gpu_expm(As, Ts_vectorized, p=12): N = len(As) if Ts_vectorized.ndim != 2 or Ts_vectorized.shape[1] != 6: raise ValueError(Ts_vectorized.shape) threadsPerBlock = 512 nBlocks = int(np.ceil(float(N) / float(threadsPerBlock))) _gpu_expm(As.gpu, Ts_vectorized.gpu, np.int32(N), np.int32(p), grid=(nBlocks, 1, 1),
{ meshU[row*n*3 + col*3] = meshU[row*n*3 + 2*3]; meshU[row*n*3 + col*3 + 1] = meshU[m*n*3 + 2*3 + 1]; meshU[row*n*3 + col*3 + 2] = meshU[m*n*3 + 2*3 + 2]; } else if (col > n-3 && col < n && row < m) { meshU[row*n*3 + col*3] = meshU[row*n*3 + (n-3)*3]; meshU[row*n*3 + col*3 + 1] = meshU[m*n*3 + (n-3)*3 + 1]; meshU[row*n*3 + col*3 + 2] = meshU[m*n*3 + (n-3)*3 + 2]; } } """) wallBoundaries = boundaryConditionsModule.get_function("applyWallBoundaries") openBoundaries = boundaryConditionsModule.get_function("applyOpenBoundaries") def applyWallBoundaries(meshUGPU, m, n, blockDims, gridDims): wallBoundaries(meshUGPU, np.int32(m), np.int32(n), block=(blockDims[0], blockDims[1], 1), grid=(gridDims[0], gridDims[1])) def applyWallBoundariesTimed(meshUGPU, m, n, blockDims, gridDims): return wallBoundaries(meshUGPU, np.int32(m), np.int32(n), block=(blockDims[0], blockDims[1], 1), grid=(gridDims[0], gridDims[1]), time_kernel=True) def applyOpenBoundaries(meshUGPU, m, n, blockDims, gridDims):
def build_sparse_transition_model_at_T(T, T_gpu, vel_data_gpu, params, bDimx, params_gpu, xs_gpu, ys_gpu, ac_angles, results, sumR_sa, save_file_for_each_a=False): gsize = int(params[0]) num_actions = int(params[1]) nrzns = int(params[2]) all_u_mat_gpu, all_v_mat_gpu, all_ui_mat_gpu, all_vi_mat_gpu, all_Yi_gpu = vel_data_gpu results_gpu_list = [] sumR_sa_gpu_list = [] for i in range(num_actions): results_gpu_list.append(cuda.mem_alloc(results.nbytes)) sumR_sa_gpu_list.append(cuda.mem_alloc(sumR_sa.nbytes)) for i in range(num_actions): cuda.memcpy_htod(results_gpu_list[i], results) cuda.memcpy_htod(sumR_sa_gpu_list[i], sumR_sa) print("alloted mem in inner func") # let one thread access a state centre. access coresponding velocities, run all actions # TODO: dt may not be int for a genral purpose code mod = SourceModule(""" __device__ int32_t get_thread_idx() // assigns idx to thread with which it accesses the flattened 3d vxrzns matrix // for a given T and a given action. // runs for both 2d and 3d grid // TODO: may have to change this considering cache locality { // here i, j, k refer to a general matrix M[i][j][k] int32_t i = threadIdx.x; int32_t j = blockIdx.y; int32_t k = blockIdx.x; int32_t idx = k + (j*gridDim.x) + (i*gridDim.x*gridDim.y)+ blockIdx.z*blockDim.x*gridDim.x*gridDim.y; return idx; } __device__ int32_t state1D_from_thread(int32_t T) { // j ~ blockIdx.x // i ~ blockIdx.y // The above three consitute a spatial state index from i and j of grid // last term is for including time index as well. return (blockIdx.x + (blockIdx.y*gridDim.x) + (T*gridDim.x*gridDim.y) ); } __device__ int32_t state1D_from_ij(int32_t* posid, int32_t T) { // posid = {i , j} // state id = j + i*dim(i) + T*dim(i)*dim(j) return (posid[1] + posid[0]*gridDim.x + (T*gridDim.x*gridDim.y) ) ; } __device__ bool is_edge_state(int32_t i, int32_t j) { // n = gsize -1 that is the last index of the domain assuming square domain int32_t n = gridDim.x - 1; if (i == 0 || i == n || j == 0 || j == n ) { return true; } else return false; } __device__ bool is_terminal(int32_t i, int32_t j, float* params) { int32_t i_term = params[8]; // terminal state indices int32_t j_term = params[9]; if(i == i_term && j == j_term) { return true; } else return false; } __device__ bool my_isnan(int s) { // By IEEE 754 rule, NaN is not equal to NaN return s != s; } __device__ void get_xypos_from_ij(int32_t i, int32_t j, float* xs, float* ys, float* x, float* y) { *x = xs[j]; *y = ys[gridDim.x - 1 - i]; return; } __device__ float get_angle_in_0_2pi(float theta) { float f_pi = 3.141592; if (theta < 0) { return theta + (2*f_pi); } else { return theta; } } __device__ float calculate_reward_const_dt(float* xs, float* ys, int32_t i_old, int32_t j_old, float xold, float yold, int32_t* newposids, float* params, float vnet_x, float vnet_y ) { // xold and yold are centre of old state (i_old, j_old) float dt = params[4]; float r1, r2, theta1, theta2, theta, h; float dt_new; float xnew, ynew; if (newposids[0] == i_old && newposids[1] == j_old) { dt_new = dt; } else { get_xypos_from_ij(newposids[0], newposids[1], xs, ys, &xnew, &ynew); //get centre of new states h = sqrtf((xnew - xold)*(xnew - xold) + (ynew - yold)*(ynew - yold)); r1 = h/(sqrtf((vnet_x*vnet_x) + (vnet_y*vnet_y))); theta1 = get_angle_in_0_2pi(atan2f(vnet_y, vnet_x)); theta2 = get_angle_in_0_2pi(atan2f(ynew - yold, xnew - xold)); theta = fabsf(theta1 -theta2); r2 = fabsf(sinf(theta)); dt_new = r1 + r2; if (threadIdx.x == 0 && blockIdx.z == 0 && blockIdx.x == 1 && blockIdx.y == 1) { 
                params[24] = r1;
                params[25] = r2;
            }
        }
        return -dt_new;
    }

    __device__ void move(float ac_angle, float vx, float vy, float* xs, float* ys,
                         int32_t* posids, float* params, float* r)
    {
        int32_t n = params[0] - 1;   // gsize - 1
        // int32_t num_actions = params[1];
        // int32_t nrzns = params[2];
        float F = params[3];
        float dt = params[4];
        float r_outbound = params[5];
        float r_terminal = params[6];
        float Dj = fabsf(xs[1] - xs[0]);
        float Di = fabsf(ys[1] - ys[0]);
        float r_step = 0;
        *r = 0;
        int32_t i0 = posids[0];
        int32_t j0 = posids[1];
        float vnetx = F*cosf(ac_angle) + vx;
        float vnety = F*sinf(ac_angle) + vy;
        float x, y;
        get_xypos_from_ij(i0, j0, xs, ys, &x, &y);   // x, y hold the centre of state (i0, j0)
        float xnew = x + (vnetx * dt);
        float ynew = y + (vnety * dt);

        // Debug checks. TODO: remove checks once verified.
        if (threadIdx.x == 0 && blockIdx.z == 0 && blockIdx.x == 1 && blockIdx.y == 1)
        {
            params[12] = x;
            params[13] = y;
            params[14] = vnetx;
            params[15] = vnety;
            params[16] = xnew;
            params[17] = ynew;
            params[18] = ac_angle;
        }

        // Clamp to the domain and penalize leaving it.
        if (xnew > xs[n])
        {
            xnew = xs[n];
            *r += r_outbound;
        }
        else if (xnew < xs[0])
        {
            xnew = xs[0];
            *r += r_outbound;
        }
        if (ynew > ys[n])
        {
            ynew = ys[n];
            *r += r_outbound;
        }
        else if (ynew < ys[0])
        {
            ynew = ys[0];
            *r += r_outbound;
        }

        // TODO (done): checked logic w.r.t. remainderf; remquof had issues.
        int32_t xind, yind;
        //float remx = remquof((xnew - xs[0]), Dj, &xind);
        //float remy = remquof(-(ynew - ys[n]), Di, &yind);
        float remx = remainderf((xnew - xs[0]), Dj);
        float remy = remainderf(-(ynew - ys[n]), Di);
        xind = ((xnew - xs[0]) - remx)/Dj;
        yind = (-(ynew - ys[n]) - remy)/Di;
        if ((remx >= 0.5 * Dj) && (remy >= 0.5 * Di))
        {
            xind += 1;
            yind += 1;
        }
        else if ((remx >= 0.5 * Dj && remy < 0.5 * Di))
        {
            xind += 1;
        }
        else if ((remx < 0.5 * Dj && remy >= 0.5 * Di))
        {
            yind += 1;
        }

        if (!(my_isnan(xind) || my_isnan(yind)))
        {
            posids[0] = yind;
            posids[1] = xind;
            if (is_edge_state(posids[0], posids[1]))
            {
                *r += r_outbound;
            }
            if (threadIdx.x == 0 && blockIdx.z == 0 && blockIdx.x == 1 && blockIdx.y == 1)
            {
                params[26] = 9999;
            }
        }

        r_step = calculate_reward_const_dt(xs, ys, i0, j0, x, y, posids, params, vnetx, vnety);
        //TODO: change back to normal when needed
        //r_step = -dt;
        *r += r_step;   //TODO: numerical check remaining
        if (is_terminal(posids[0], posids[1], params))
        {
            *r += r_terminal;
        }

        if (threadIdx.x == 0 && blockIdx.z == 0 && blockIdx.x == 1 && blockIdx.y == 1)
        {
            params[19] = xnew;
            params[20] = ynew;
            params[21] = yind;
            params[22] = xind;
            params[23] = *r;
        }
    }

    __device__ void extract_velocity(float* vx, float* vy, int32_t T,
                                     float* all_u_mat, float* all_v_mat,
                                     float* all_ui_mat, float* all_vi_mat,
                                     float* all_Yi, float* params)
    {
        int32_t nrzns = params[2];
        int32_t nmodes = params[7];
        int32_t sp_uvi, str_uvi, sp_Yi, str_Yi;   // start points and strides for accessing all_ui_mat, all_vi_mat and all_Yi
        float sum_x = 0;
        float sum_y = 0;
        float vx_mean, vy_mean;
        // Thread index; also used to access the resultant vxrzns[nrzns, gsize, gsize].
        int32_t idx = get_thread_idx();
        // rzn index, identifying which of the 5k realizations this is; used to access all_Yi.
        int32_t rzn_id = (blockIdx.z * blockDim.x) + threadIdx.x;
        // mean_id is the index used to access the flattened all_u_mat[t,i,j].
        int32_t mean_id = state1D_from_thread(T);
        // To access all_ui_mat and all_vi_mat:
        str_uvi = gridDim.x * gridDim.y;
        sp_uvi = (T * nmodes * str_uvi) + (gridDim.x * blockIdx.y) + (blockIdx.x);
        // To access all_Yi:
        sp_Yi = (T * nrzns * nmodes) + (rzn_id * nmodes);

        vx_mean = all_u_mat[mean_id];
        for(int i = 0; i < nmodes; i++)
        {
            sum_x += all_ui_mat[sp_uvi + (i*str_uvi)]*all_Yi[sp_Yi + i];
        }
        vy_mean = all_v_mat[mean_id];
        for(int i = 0; i < nmodes; i++)
        {
            sum_y += all_vi_mat[sp_uvi + (i*str_uvi)]*all_Yi[sp_Yi + i];
        }

        *vx = vx_mean + sum_x;
        *vy = vy_mean + sum_y;
        return;
    }

    // test: changed from float* to float ac_angle
    __global__ void transition_calc(float* T_arr, float* all_u_mat, float* all_v_mat,
                                    float* all_ui_mat, float* all_vi_mat, float* all_Yi,
                                    float ac_angle, float* xs, float* ys, float* params,
                                    float* sumR_sa, float* results)
        // results directions - 1: along S2; 2: along S1; 3: along columns towards count
    {
        int32_t gsize = params[0];        // grid size along one direction, ASSUMING a square grid
        int32_t num_actions = params[1];
        int32_t nrzns = params[2];
        float F = params[3];
        float dt = params[4];
        float r_outbound = params[5];
        float r_terminal = params[6];
        int32_t nmodes = params[7];
        int32_t i_term = params[8];       // terminal state indices
        int32_t j_term = params[9];
        int32_t nT = params[10];
        int32_t is_stationary = params[11];
        int32_t T = (int32_t)T_arr[0];
        int32_t idx = get_thread_idx();

        float vx, vy;
        if(idx < gridDim.x*gridDim.y*nrzns)
        {
            int32_t posids[2] = {blockIdx.y, blockIdx.x};   // i and j of S1; after move() these hold i and j of S2
            int32_t sp_id;                                  // space id: S1 % (gsize*gsize)
            float r;                                        // immediate reward
            extract_velocity(&vx, &vy, T, all_u_mat, all_v_mat, all_ui_mat, all_vi_mat, all_Yi, params);
            move(ac_angle, vx, vy, xs, ys, posids, params, &r);
            int32_t S1, S2;
            if (is_stationary == 1)
            {
                T = 0;
                S1 = state1D_from_thread(T);     // initial state number for this thread
                S2 = state1D_from_ij(posids, T); // successor state number from posids
            }
            else
            {
                S1 = state1D_from_thread(T);       // initial state number for this thread
                S2 = state1D_from_ij(posids, T+1); // successor state number from posids at timestep T+1
            }
            // NOTE: sp_id must be set on both branches; the original only set it
            // in the non-stationary case, leaving it uninitialized otherwise.
            sp_id = S1%(gsize*gsize);
            // Accumulate into sumR_sa;
            // this array will later be divided by nrzns to get the average.
            atomicAdd(&sumR_sa[sp_id], r);   // TODO: try a reduction if this is slow overall
            results[idx] = S2;
            __syncthreads();
            /* if (threadIdx.x == 0 && blockIdx.z == 0)
            {
                sumR_sa[S1] = sumR_sa[S1]/nrzns;
                // TODO: rename to R_sa, since we are no longer storing the sum
            } */
        } // if ends
        return;
    }
    """)

    # sumR_sa2 = np.empty_like(sumR_sa, dtype=np.float32)
    # cuda.memcpy_dtoh(sumR_sa2, sumR_sa_gpu)
    # print("sumR_sa", sumR_sa)
    # print("sumR_sa", sumR_sa2[0:10001])
    # T = np.array(T64, dtype=np.float32)
    params2 = np.empty_like(params).astype(np.float32)
    func = mod.get_function("transition_calc")
    for i in range(num_actions):
        print('T', T, " call kernel for action: ", i)
        func(T_gpu, all_u_mat_gpu, all_v_mat_gpu, all_ui_mat_gpu, all_vi_mat_gpu,
             all_Yi_gpu, ac_angles[i], xs_gpu, ys_gpu, params_gpu,
             sumR_sa_gpu_list[i], results_gpu_list[i],
             block=(bDimx, 1, 1), grid=(gsize, gsize, (nrzns // bDimx) + 1))
        if i == 0:
            cuda.memcpy_dtoh(params2, params_gpu)
            print("params check:")
            print('\nangle= ', params2[18],
                  '\nx =', params2[12],
                  '\ny =', params2[13],
                  '\nvnetx = ', params2[14],
                  '\nvnety =', params2[15],
                  '\nxnew =', params2[16],
                  '\nynew =', params2[17],
                  '\nxnewupd =', params2[19],
                  '\nynewupd =', params2[20],
                  '\nyind i=', params2[21],
                  '\nxind j=', params2[22],
                  '\nr- =', params2[23],
                  '\nr1+ =', params2[24],
                  '\nr2+ =', params2[25],
                  '\nenter_isnan =', params2[26])

    results2_list = []
    sum_Rsa2_list = []
    for i in range(num_actions):
        results2_list.append(np.empty_like(results))
        sum_Rsa2_list.append(np.empty_like(sumR_sa))
    # SYNCHRONISATION - pycuda does it implicitly.
    for i in range(num_actions):
        cuda.memcpy_dtoh(results2_list[i], results_gpu_list[i])
        cuda.memcpy_dtoh(sum_Rsa2_list[i], sumR_sa_gpu_list[i])
        print("memcpy_dtoh for action: ", i)
    for i in range(num_actions):
        sum_Rsa2_list[i] = sum_Rsa2_list[i] / nrzns

    # print("sumR_sa2\n", sumR_sa2, "\n\n")
    # print("results_a0\n", results2_list[0].T[50::int(gsize**2)])
    print("OK REACHED END OF cuda relevant CODE\n")

    # Make a list of inputs, one element per action, and run get_COO_ in parallel
    # for each action. If save_file_for_each_a is True, each file is named appropriately.
    if save_file_for_each_a:
        f1 = 'COO_Highway2D_T' + str(T) + '_a'
        f3 = '_of_' + str(num_actions) + 'A.npy'
        inputs = [(results2_list[i], nrzns, T, f1 + str(i) + f3) for i in range(num_actions)]
    else:
        inputs = [(results2_list[i], nrzns, T, None) for i in range(num_actions)]
    # coo_list_a is a list of COO matrices, one per action, for the given timestep.
    with Pool(num_actions) as p:
        coo_list_a = p.starmap(get_COO_, inputs)
    # print("coo print\n", coo.T[4880:4900, :])
    print("\n\n")
    # print("time taken by cuda compute and transfer\n", (t2 - t1) / 60)
    # print("time taken for post processing to coo on cpu\n", (t3 - t2) / 60)
    return coo_list_a, sum_Rsa2_list
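# For context, a minimal sketch of what a get_COO_-style helper might do with
# the kernel output: count (S1, S2) pairs across realizations and normalize by
# nrzns to get transition probabilities. The real helper is defined elsewhere;
# the layout assumption below (realization-major, gsize*gsize states per
# realization, no padding) is inferred from get_thread_idx() and may not match
# it exactly.
import numpy as np

def get_COO_sketch(results, nrzns, T, save_path=None):
    ns = results.size // nrzns                     # states per realization (assumes no padding)
    s2 = results.reshape(nrzns, ns).astype(np.int64).ravel()
    s1 = np.tile(np.arange(ns), nrzns)             # source state for every entry
    pairs, counts = np.unique(np.stack([s1, s2]), axis=1, return_counts=True)
    coo = np.vstack([pairs, counts / float(nrzns)])  # rows: S1, S2, probability
    if save_path is not None:
        np.save(save_path, coo)
    return coo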
x2[i] = 0.5*mu2eps*(s2[i+1]+s2[i-1]) + (1-mu2eps)*s2[i]; } else if (i == 0){ x1[i] = mu1eps*s1[i+1] + (1-mu1eps)*s1[i]; x2[i] = mu2eps*s2[i+1] + (1-mu2eps)*s2[i]; } else{ x1[i] = mu1eps*s1[i-1] + (1-mu1eps)*s1[i]; x2[i] = mu2eps*s2[i-1] + (1-mu2eps)*s2[i]; } picture[i] = x1[i]/(x1[i] + x2[i]); } """) fitness = mod.get_function("fitness") diffuse_display = mod.get_function("diffuse_display") #parameters side = 50 length = side * side pop = 100 mu1list = numpy.linspace(9, 11, 15) mu2 = 10 eps = .01 b, c, d = 0, 0, 1
        smemPos += smemStride;
        gmemPos += gmemStride;
    }
}
'''
template = string.Template(template)
code = template.substitute(KERNEL_RADIUS=KERNEL_RADIUS,
                           KERNEL_W=KERNEL_W,
                           COLUMN_TILE_H=COLUMN_TILE_H,
                           COLUMN_TILE_W=COLUMN_TILE_W,
                           ROW_TILE_W=ROW_TILE_W,
                           KERNEL_RADIUS_ALIGNED=KERNEL_RADIUS_ALIGNED)
module = SourceModule(code)
convolutionRowGPU = module.get_function('convolutionRowGPU')
convolutionColumnGPU = module.get_function('convolutionColumnGPU')
d_Kernel_rows = module.get_global('d_Kernel_rows')[0]
d_Kernel_columns = module.get_global('d_Kernel_columns')[0]

# Helper functions for computing alignment.
def iDivUp(a, b):
    # Round a / b up to the nearest integer (// so it also behaves on Python 3).
    a = numpy.int32(a)
    b = numpy.int32(b)
    return (a // b + 1) if (a % b != 0) else (a // b)

def iDivDown(a, b):
    # Round a / b down to the nearest integer.
    a = numpy.int32(a)
    b = numpy.int32(b)
    return a // b
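# module.get_global() returns a (device_pointer, size_in_bytes) pair, so the
# filter taps still need to be uploaded before launching the kernels. A minimal
# sketch, assuming pycuda.driver is imported as cuda as elsewhere in this file;
# the Gaussian-style float32 taps are illustrative, any KERNEL_W coefficients work.
h_kernel = numpy.exp(-numpy.linspace(-2, 2, KERNEL_W) ** 2).astype(numpy.float32)
h_kernel /= h_kernel.sum()
cuda.memcpy_htod(d_Kernel_rows, h_kernel)      # fill __device__ d_Kernel_rows
cuda.memcpy_htod(d_Kernel_columns, h_kernel)   # fill __device__ d_Kernel_columns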
def _get_lut_bprop_kernel(dtype, deterministic=False): """ Builds the bprop kernel for lookup table layers based on templated code. If the deterministic version is requested, an index buffer must be passed as an argument. This index buffer re-orders items in the input tensor so that word_ids are sorted. This is required since we need to be sure that each thread only updates weights for one word id. Arguments: dtype (np.dtype): The data which the kernel will operate on. deterministic (boolean): Builds the deterministic kernel when this is set to True. """ if not deterministic: code = r""" __global__ void lut_bprop( int* inputs, %(type)s* dW, %(type)s* errors, const int nin, const int embedding_dim, const int vocab_size, const int pad_idx) { const int tid = threadIdx.x; const int bid = blockIdx.x; int word_id = inputs[bid]; int error_row = bid * embedding_dim; int output_row = word_id * embedding_dim; if(word_id != pad_idx) { for(int i = tid; i < embedding_dim; i += blockDim.x) { atomicAdd(&dW[output_row + i], errors[error_row + i]); } } } """ code = code % {"type": _ew_types[dtype]["type"]} module = SourceModule(code, options=["--use_fast_math"]) kernel = module.get_function("lut_bprop") kernel.prepare("PPPIIIi") else: code = r""" __global__ void lut_bprop( int* inputs, int* index_buffer, %(type)s* dW, %(type)s* errors, const int nin, const int embedding_dim, const int vocab_size, const int pad_idx) { const int tid = threadIdx.x; const int bid = blockIdx.x; int index_position = bid; int index = index_buffer[index_position]; int word_id = inputs[index]; if((bid == 0 || word_id != inputs[index_buffer[bid - 1]]) && word_id != pad_idx) { int output_row = word_id * embedding_dim; do { int error_row = index * embedding_dim; for(int i = tid; i < embedding_dim; i += blockDim.x) { dW[output_row + i] += errors[error_row + i]; } index_position++; if(index_position == gridDim.x) { break; } index = index_buffer[index_position]; } while(inputs[index] == word_id); } } """ code = code % {"type": _ew_types[dtype]["type"]} module = SourceModule(code, options=["--use_fast_math"]) kernel = module.get_function("lut_bprop") kernel.prepare("PPPPIIIi") kernel.name = "lut_bprop" return kernel
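# A hypothetical launch of the non-deterministic kernel built above, using the
# prepared-call interface that matches kernel.prepare("PPPIIIi"). The 32-thread
# block size and the device allocations are assumptions; one block is launched
# per input token, and the threads sweep embedding_dim cooperatively.
import numpy as np
kernel = _get_lut_bprop_kernel(np.dtype(np.float32))
kernel.prepared_call(
    (nin, 1), (32, 1, 1),              # grid: one block per input word
    inputs_dev, dW_dev, errors_dev,    # device pointers allocated elsewhere
    nin, embedding_dim, vocab_size, pad_idx)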
__syncthreads(); if (tid >= iter ) { sum_buf[tid] = sum_buf[tid] + sum_buf[tid - iter]; } iter *= 2; } __syncthreads(); out[tid] = sum_buf[tid]; __syncthreads(); } """) naive_gpu = naive_ker.get_function("naive_prefix") if __name__ == '__main__': testvec = np.random.randn(1024).astype(np.float64) testvec_gpu = gpuarray.to_gpu(testvec) outvec_gpu = gpuarray.empty_like(testvec_gpu) naive_gpu(testvec_gpu, outvec_gpu, block=(1024, 1, 1), grid=(1, 1, 1)) total_sum = sum(testvec) total_sum_gpu = outvec_gpu[-1].get() print "Does our kernel work correctly? : {}".format( np.allclose(total_sum_gpu, total_sum))
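# The kernel computes an inclusive prefix sum, so a stronger check than the
# final element is the full elementwise comparison against NumPy:
assert np.allclose(outvec_gpu.get(), np.cumsum(testvec))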
def counting_vowels_in_text(text):
    """Returns the number of vowels found in the text."""
    mod = SourceModule("""
    __global__ void count_vowels(char *text, int *results, int text_size,
                                 int chunk_size)
    {
        int index = blockDim.x * blockIdx.x + threadIdx.x;
        int start = index * chunk_size;
        int end = (index + 1) * chunk_size;
        end = min(end, text_size);
        if (end < start) {
            return;
        }
        // NumPy str arrays are stored as UCS-4, i.e. 4 bytes per character,
        // so scale the character range into byte offsets.
        start = start * 4;
        end = end * 4;
        // NOTE: the loop bound is now exclusive (<); the original <= read one
        // byte past this thread's chunk.
        for (int i = start; i < end; i++) {
            if (text[i] == 'a' || text[i] == 'A' || text[i] == 'e' || text[i] == 'E' ||
                text[i] == 'i' || text[i] == 'I' || text[i] == 'o' || text[i] == 'O' ||
                text[i] == 'u' || text[i] == 'U' || text[i] == 'y' || text[i] == 'Y') {
                results[i] = 1;
            }
        }
    }
    """)
    cuda.start_profiler()
    max_text_size_in_mb = 100
    text_chunks = []
    text_chunks_count = math.ceil(len(text) / (max_text_size_in_mb * (1024**2)))
    while len(text) > (max_text_size_in_mb * 1024**2):
        text_chunk = text[:math.ceil(len(text) / text_chunks_count)]
        text = text[math.ceil(len(text) / text_chunks_count):]
        text_chunks.append(text_chunk)
    text_chunks.append(text)
    cumulative_results = 0
    for text_chunk in text_chunks:
        device_text = gpuarray.to_gpu(numpy.array([text_chunk], dtype=str))
        device_results = gpuarray.zeros(len(text_chunk) * 4, dtype=numpy.int32)
        chunk_size = 1000
        threads_per_block = 512
        blocks_per_grid = int(math.ceil(len(text_chunk) / (chunk_size * threads_per_block)))
        device_text_size = numpy.int32(len(text_chunk))
        function = mod.get_function("count_vowels")
        # NOTE: the original kernel declared threads_per_block and
        # blocks_per_grid parameters that were never used and never passed,
        # which made the call mismatch the signature; they are dropped here.
        function(device_text, device_results, device_text_size,
                 numpy.int32(chunk_size),
                 block=(threads_per_block, 1, 1), grid=(blocks_per_grid, 1, 1))
        host_results = device_results.get()
        cumulative_results += numpy.count_nonzero(host_results == 1)
    cuda.stop_profiler()
    return cumulative_results
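# Why the factor of 4 above: NumPy stores str arrays as UCS-4, i.e. 4 bytes per
# code point, so only every fourth byte of device_text holds an ASCII character:
chunk = numpy.array(["Hello"], dtype=str)
print(chunk.dtype, chunk.dtype.itemsize)   # <U5 20 -> 4 bytes per character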
def _get_sorting_kernel(kernel_id, block_size):
    """
    Builds the kernels used for sorting inputs. There are several kernels here,
    corresponding to the steps in the algorithm. The algorithm works by
    determining the sorted position for each input item. This is done with a
    bucket sort, where each word_id is a bucket. The first step determines the
    size of each bucket (the number of occurrences of each word_id). Next, a
    prefix sum is computed over the list of bucket sizes to find where each
    bucket will be placed in the output buffer. Finally, each thread places its
    index into the correct sorted position based on the bucket start index
    (computed from the prefix sum) and that thread's offset into the bucket
    (which is taken from the output of the atomic add done in the first step).

    Arguments:
        kernel_id (Integer): Which step to build the kernel for [0, 4]
        block_size (Integer): Number of threads per block for the prefix sum
            kernels.
    """
    code = r"""
#define THREADS %(threads)s
#define STORE_BLOCKSUM %(store_blocksum)s

__global__ void sort_inputs0(int* inputs, int* index_buffer, int* offset_buffer,
                             int* word_counts, const int vocab_size,
                             const int input_length)
{
    const int tid = threadIdx.x + (blockDim.x * blockIdx.x);
    int word_id;

    if(tid < input_length)
    {
        word_id = inputs[tid];
        offset_buffer[tid] = atomicAdd(&word_counts[word_id], 1);
    }
}

__device__ void scan(int* buffer, int* blocksum, int global_length)
{
    const int tid = (threadIdx.x << 1) + 1;
    const int gid = ((threadIdx.x + (blockIdx.x * blockDim.x)) << 1) + 1;

    __shared__ int local_counts[THREADS * 2];

    local_counts[tid] = buffer[gid];
    local_counts[tid - 1] = buffer[gid - 1];

    #pragma unroll
    for(int skip = 1; skip <= THREADS; skip <<= 1)
    {
        int mask = (skip << 1) - 1;
        if((tid & mask) == mask)
        {
            local_counts[tid] += local_counts[tid - skip];
        }
        __syncthreads();
    }

    if(tid == (THREADS * 2 - 1))
    {
#if STORE_BLOCKSUM
        blocksum[blockIdx.x] = local_counts[tid];
#endif
        local_counts[tid] = 0;
    }

    #pragma unroll
    for(int skip = THREADS; skip > 0; skip >>= 1)
    {
        int mask = (skip << 1) - 1;
        if((tid & mask) == mask)
        {
            int temp = local_counts[tid - skip];
            local_counts[tid - skip] = local_counts[tid];
            local_counts[tid] += temp;
        }
        __syncthreads();
    }

    if(gid < global_length)
    {
        buffer[gid] = local_counts[tid];
        buffer[gid - 1] = local_counts[tid - 1];
    }
}

__global__ void sort_inputs1(int* inputs, int* index_buffer, int* offset_buffer,
                             int* word_counts, const int vocab_size,
                             const int input_length)
{
    scan(word_counts, word_counts + vocab_size, vocab_size);
}

__global__ void sort_inputs2(int* inputs, int* index_buffer, int* offset_buffer,
                             int* word_counts, const int vocab_size,
                             const int input_length)
{
    scan(word_counts + vocab_size, 0, blockDim.x);
}

__global__ void sort_inputs3(int* inputs, int* index_buffer, int* offset_buffer,
                             int* word_counts, const int vocab_size,
                             const int input_length)
{
    const int gid = (threadIdx.x + (blockIdx.x * blockDim.x)) << 1;

    if(gid < vocab_size)
    {
        word_counts[gid] += word_counts[vocab_size + blockIdx.x];
        word_counts[gid + 1] += word_counts[vocab_size + blockIdx.x];
    }
}

__global__ void sort_inputs4(int* inputs, int* index_buffer, int* offset_buffer,
                             int* word_counts, const int vocab_size,
                             const int input_length)
{
    const int tid = threadIdx.x + (blockDim.x * blockIdx.x);
    int word_id;

    if(tid < input_length)
    {
        word_id = inputs[tid];
        int sorted_position = word_counts[word_id] + offset_buffer[tid];
        index_buffer[sorted_position] = tid;
    }
}
"""
    code = code % {
        "threads": block_size,
        "store_blocksum": (1 if kernel_id == 1 else 0)
    }
    module = SourceModule(code, options=["--use_fast_math"])
    function_name = "sort_inputs" + str(kernel_id)
    kernel = module.get_function(function_name)
    kernel.prepare("PPPPII")
    kernel.name = "sort_inputs"
    return kernel
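# A sketch of how the five steps might be driven end to end. The real caller is
# not shown in this file, so the buffer names, the 512-thread block size, and
# the grid shapes below are all assumptions and may need adjustment.
def sort_word_ids(inputs_dev, index_dev, offset_dev, counts_dev,
                  vocab_size, input_length, block_size=512):
    args = (inputs_dev, index_dev, offset_dev, counts_dev, vocab_size, input_length)
    kernels = [_get_sorting_kernel(i, block_size) for i in range(5)]
    grid_items = (input_length + block_size - 1) // block_size
    scan_blocks = (vocab_size + 2 * block_size - 1) // (2 * block_size)
    kernels[0].prepared_call((grid_items, 1), (block_size, 1, 1), *args)   # bucket counts
    kernels[1].prepared_call((scan_blocks, 1), (block_size, 1, 1), *args)  # per-block prefix sum
    kernels[2].prepared_call((1, 1), (scan_blocks, 1, 1), *args)           # scan of block sums
    kernels[3].prepared_call((scan_blocks, 1), (block_size, 1, 1), *args)  # add block offsets back
    kernels[4].prepared_call((grid_items, 1), (block_size, 1, 1), *args)   # scatter sorted indices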
__global__ void vec_ker(int *ints, double *doubles) { int4 f1, f2; f1 = *reinterpret_cast<int4*>(ints); f2 = *reinterpret_cast<int4*>(&ints[4]); printf("First int4: %d, %d, %d, %d\\n", f1.x, f1.y, f1.z, f1.w); printf("Second int4: %d, %d, %d, %d\\n", f2.x, f2.y, f2.z, f2.w); double2 d1, d2; d1 = *reinterpret_cast<double2*>(doubles); d2 = *reinterpret_cast<double2*>(&doubles[2]); printf("First double2: %f, %f\\n", d1.x, d1.y); printf("Second double2: %f, %f\\n", d2.x, d2.y); }''' vec_mod = SourceModule(VecCode) vec_ker = vec_mod.get_function('vec_ker') ints = gpuarray.to_gpu(np.int32([1,2,3,4,5,6,7,8])) doubles = gpuarray.to_gpu(np.double([1.11,2.22,3.33,4.44])) print 'Vectorized Memory Test:' vec_ker(ints, doubles, grid=(1,1,1), block=(1,1,1))
N = 1024   # must match the #define below; the definition was missing from this fragment
M = 1024
L = 1024
THREADS_PER_BLOCK = 32

mod = SourceModule("""
#define N 1024
#define M 1024
#define L 1024

// Computes C = A*B for row-major N x M and M x L matrices; one thread per output element.
__global__ void multiplicacionMatrices(float *a, float *b, float *c)
{
    int i = threadIdx.x + blockIdx.x*blockDim.x;   // row of C
    int j = threadIdx.y + blockIdx.y*blockDim.y;   // column of C
    c[j+i*L] = 0;
    for(int k = 0; k < M; k++){
        c[j+i*L] += a[k+i*M] * b[j+k*L];
    }
}
""")

a = np.ones(N * M, dtype=np.float32)
b = np.ones(M * L, dtype=np.float32)
c = np.zeros(N * L, dtype=np.float32)

multiplicacion_matrices = mod.get_function("multiplicacionMatrices")
multiplicacion_matrices(cuda.In(a), cuda.In(b), cuda.Out(c),
                        block=(THREADS_PER_BLOCK, THREADS_PER_BLOCK, 1),
                        grid=(N//THREADS_PER_BLOCK, L//THREADS_PER_BLOCK, 1))
print(c.reshape((N, L)))
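# Sanity check: with all-ones inputs, every entry of C = A*B equals M.
assert np.allclose(c.reshape((N, L)), np.full((N, L), M, dtype=np.float32))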
s = cuda.Event()
e = cuda.Event()
s.record()

code = """
__global__ void add_one(int n, int start, float *x)
{
    int index = start + blockIdx.x * blockDim.x + threadIdx.x;
    int stride = blockDim.x * gridDim.x;
    for (int i = index; i < n; i += stride)
        x[i] += 1.0;
}
"""
mod = SourceModule(code)
add_one = mod.get_function("add_one")

N = np.int32(1e8)
nStreams = 2
streamSize = np.int32(N // nStreams)   # integer division; N is divisible by nStreams
x = np.ones(N, dtype=np.float32)
x_gpu = gpuarray.empty(N, dtype=np.float32)
#cuda.memcpy_htod(x_gpu, x)
stream = []
for i in range(nStreams):
    stream.append(cuda.Stream())
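# The fragment stops after creating the streams. The usual continuation of this
# pattern (a sketch, not the original code) copies and launches one chunk per
# stream so transfers can overlap with kernels; x would need to be page-locked
# (cuda.pagelocked_empty) for the copies to be truly asynchronous.
for i in range(nStreams):
    offset = int(i * streamSize)
    cuda.memcpy_htod_async(int(x_gpu.gpudata) + offset * 4,     # 4 bytes per float32
                           x[offset:offset + int(streamSize)],
                           stream=stream[i])
    add_one(np.int32(offset + streamSize), np.int32(offset), x_gpu,
            block=(256, 1, 1), grid=(64, 1, 1), stream=stream[i])
e.record()
e.synchronize()
print("elapsed: %.3f ms" % s.time_till(e))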
class TSDFVolume(object):

    def __init__(self, vol_bnds, voxel_size):
        # Define voxel volume parameters.
        self._vol_bnds = vol_bnds          # 3x2, rows: (x, y, z), columns: (min, max) in world coordinates in meters
        self._voxel_size = voxel_size      # in meters (determines volume discretization and resolution)
        self._trunc_margin = self._voxel_size * 5  # truncation on SDF

        # Adjust volume bounds.
        self._vol_dim = np.ceil((self._vol_bnds[:, 1] - self._vol_bnds[:, 0]) /
                                self._voxel_size).copy(order='C').astype(int)  # ensure C-order contiguous
        self._vol_bnds[:, 1] = self._vol_bnds[:, 0] + self._vol_dim * self._voxel_size
        self._vol_origin = self._vol_bnds[:, 0].copy(order='C').astype(np.float32)  # ensure C-order contiguous
        print("Voxel volume size: {:d} x {:d} x {:d}".format(
            self._vol_dim[0], self._vol_dim[1], self._vol_dim[2]))

        # Initialize pointers to voxel volume in CPU memory.
        self._tsdf_vol_cpu = np.ones(self._vol_dim).astype(np.float32)
        # For computing the cumulative moving average of observations per voxel.
        self._weight_vol_cpu = np.zeros(self._vol_dim).astype(np.float32)
        self._color_vol_cpu = np.zeros(self._vol_dim).astype(np.float32)

        # Copy voxel volumes to GPU.
        if TSDF_GPU_MODE:
            self._tsdf_vol_gpu = cuda.mem_alloc(self._tsdf_vol_cpu.nbytes)
            cuda.memcpy_htod(self._tsdf_vol_gpu, self._tsdf_vol_cpu)
            self._weight_vol_gpu = cuda.mem_alloc(self._weight_vol_cpu.nbytes)
            cuda.memcpy_htod(self._weight_vol_gpu, self._weight_vol_cpu)
            self._color_vol_gpu = cuda.mem_alloc(self._color_vol_cpu.nbytes)
            cuda.memcpy_htod(self._color_vol_gpu, self._color_vol_cpu)

            # Cuda kernel function (C++)
            self._cuda_src_mod = SourceModule("""
              __global__ void integrate(float * tsdf_vol,
                                        float * weight_vol,
                                        float * color_vol,
                                        float * vol_dim,
                                        float * vol_origin,
                                        float * cam_intr,
                                        float * cam_pose,
                                        float * other_params,
                                        float * color_im,
                                        float * depth_im) {
                // Get voxel index.
                int gpu_loop_idx = (int) other_params[0];
                int max_threads_per_block = blockDim.x;
                int block_idx = blockIdx.z * gridDim.y * gridDim.x + blockIdx.y * gridDim.x + blockIdx.x;
                int voxel_idx = gpu_loop_idx * gridDim.x * gridDim.y * gridDim.z * max_threads_per_block +
                                block_idx * max_threads_per_block + threadIdx.x;

                int vol_dim_x = (int) vol_dim[0];
                int vol_dim_y = (int) vol_dim[1];
                int vol_dim_z = (int) vol_dim[2];

                // NOTE: >= (the original used >, letting one out-of-range index through).
                if (voxel_idx >= vol_dim_x * vol_dim_y * vol_dim_z)
                    return;

                // Get voxel grid coordinates.
                float voxel_x = floorf(((float)voxel_idx) / ((float)(vol_dim_y * vol_dim_z)));
                float voxel_y = floorf(((float)(voxel_idx - ((int)voxel_x) * vol_dim_y * vol_dim_z)) / ((float)vol_dim_z));
                float voxel_z = (float)(voxel_idx - ((int)voxel_x) * vol_dim_y * vol_dim_z - ((int)voxel_y) * vol_dim_z);

                // Voxel grid coordinates to world coordinates.
                float voxel_size = other_params[1];
                float pt_x = vol_origin[0] + voxel_x * voxel_size;
                float pt_y = vol_origin[1] + voxel_y * voxel_size;
                float pt_z = vol_origin[2] + voxel_z * voxel_size;

                // World coordinates to camera coordinates.
                float tmp_pt_x = pt_x - cam_pose[0*4+3];
                float tmp_pt_y = pt_y - cam_pose[1*4+3];
                float tmp_pt_z = pt_z - cam_pose[2*4+3];
                float cam_pt_x = cam_pose[0*4+0] * tmp_pt_x + cam_pose[1*4+0] * tmp_pt_y + cam_pose[2*4+0] * tmp_pt_z;
                float cam_pt_y = cam_pose[0*4+1] * tmp_pt_x + cam_pose[1*4+1] * tmp_pt_y + cam_pose[2*4+1] * tmp_pt_z;
                float cam_pt_z = cam_pose[0*4+2] * tmp_pt_x + cam_pose[1*4+2] * tmp_pt_y + cam_pose[2*4+2] * tmp_pt_z;

                // Camera coordinates to image pixels.
                int pixel_x = (int) roundf(cam_intr[0*3+0] * (cam_pt_x / cam_pt_z) + cam_intr[0*3+2]);
                int pixel_y = (int) roundf(cam_intr[1*3+1] * (cam_pt_y / cam_pt_z) + cam_intr[1*3+2]);

                // Skip if outside view frustum.
                int im_h = (int) other_params[2];
                int im_w = (int) other_params[3];
                if (pixel_x < 0 || pixel_x >= im_w || pixel_y < 0 || pixel_y >= im_h || cam_pt_z < 0)
                    return;

                // Skip invalid depth.
                float depth_value = depth_im[pixel_y*im_w+pixel_x];
                if (depth_value == 0)
                    return;

                // Integrate TSDF.
                float trunc_margin = other_params[4];
                float depth_diff = depth_value - cam_pt_z;
                if (depth_diff < -trunc_margin)
                    return;
                float dist = fmin(1.0f, depth_diff / trunc_margin);
                float w_old = weight_vol[voxel_idx];
                float obs_weight = other_params[5];
                float w_new = w_old + obs_weight;
                weight_vol[voxel_idx] = w_new;
                tsdf_vol[voxel_idx] = (tsdf_vol[voxel_idx] * w_old + dist) / w_new;

                // Integrate color.
                float old_color = color_vol[voxel_idx];
                float old_b = floorf(old_color / (256 * 256));
                float old_g = floorf((old_color - old_b * 256 * 256) / 256);
                float old_r = old_color - old_b * 256 * 256 - old_g * 256;
                float new_color = color_im[pixel_y*im_w+pixel_x];
                float new_b = floorf(new_color / (256 * 256));
                float new_g = floorf((new_color - new_b * 256 * 256) / 256);
                float new_r = new_color - new_b * 256 * 256 - new_g * 256;
                new_b = fmin(roundf((old_b*w_old + new_b) / w_new), 255.0f);
                new_g = fmin(roundf((old_g*w_old + new_g) / w_new), 255.0f);
                new_r = fmin(roundf((old_r*w_old + new_r) / w_new), 255.0f);
                color_vol[voxel_idx] = new_b * 256 * 256 + new_g * 256 + new_r;
              }""")

            self._cuda_integrate = self._cuda_src_mod.get_function("integrate")

            # Determine block/grid size on GPU.
            gpu_dev = cuda.Device(0)
            self._max_gpu_threads_per_block = gpu_dev.MAX_THREADS_PER_BLOCK
            n_blocks = int(np.ceil(float(np.prod(self._vol_dim)) /
                                   float(self._max_gpu_threads_per_block)))
            grid_dim_x = min(gpu_dev.MAX_GRID_DIM_X, int(np.floor(np.cbrt(n_blocks))))
            grid_dim_y = min(gpu_dev.MAX_GRID_DIM_Y, int(np.floor(np.sqrt(n_blocks / grid_dim_x))))
            grid_dim_z = min(gpu_dev.MAX_GRID_DIM_Z,
                             int(np.ceil(float(n_blocks) / float(grid_dim_x * grid_dim_y))))
            self._max_gpu_grid_dim = np.array([grid_dim_x, grid_dim_y, grid_dim_z]).astype(int)
            self._n_gpu_loops = int(np.ceil(float(np.prod(self._vol_dim)) /
                                            float(np.prod(self._max_gpu_grid_dim) *
                                                  self._max_gpu_threads_per_block)))

    def integrate(self, color_im, depth_im, cam_intr, cam_pose, obs_weight=1.):
        im_h = depth_im.shape[0]
        im_w = depth_im.shape[1]

        # Fold RGB color image into a single channel image.
        color_im = color_im.astype(np.float32)
        color_im = np.floor(color_im[:, :, 2] * 256 * 256 +
                            color_im[:, :, 1] * 256 +
                            color_im[:, :, 0])

        # GPU mode: integrate voxel volume (calls CUDA kernel).
        if TSDF_GPU_MODE:
            for gpu_loop_idx in range(self._n_gpu_loops):
                self._cuda_integrate(
                    self._tsdf_vol_gpu,
                    self._weight_vol_gpu,
                    self._color_vol_gpu,
                    cuda.InOut(self._vol_dim.astype(np.float32)),
                    cuda.InOut(self._vol_origin.astype(np.float32)),
                    cuda.InOut(cam_intr.reshape(-1).astype(np.float32)),
                    cuda.InOut(cam_pose.reshape(-1).astype(np.float32)),
                    cuda.InOut(np.asarray([gpu_loop_idx, self._voxel_size, im_h, im_w,
                                           self._trunc_margin, obs_weight], np.float32)),
                    cuda.InOut(color_im.reshape(-1).astype(np.float32)),
                    cuda.InOut(depth_im.reshape(-1).astype(np.float32)),
                    block=(self._max_gpu_threads_per_block, 1, 1),
                    grid=(int(self._max_gpu_grid_dim[0]),
                          int(self._max_gpu_grid_dim[1]),
                          int(self._max_gpu_grid_dim[2])))

        # CPU mode: integrate voxel volume (vectorized implementation).
        else:
            # Get voxel grid coordinates.
xv, yv, zv = np.meshgrid(range(self._vol_dim[0]), range(self._vol_dim[1]), range(self._vol_dim[2]), indexing='ij') vox_coords = np.concatenate( (xv.reshape(1, -1), yv.reshape(1, -1), zv.reshape(1, -1)), axis=0).astype(int) # Voxel coordinates to world coordinates. world_pts = self._vol_origin.reshape( -1, 1) + vox_coords.astype(float) * self._voxel_size # World coordinates to camera coordinates. world2cam = np.linalg.inv(cam_pose) cam_pts = np.dot(world2cam[:3, :3], world_pts) + np.tile( world2cam[:3, 3].reshape(3, 1), (1, world_pts.shape[1])) # Camera coordinates to image pixels. pix_x = np.round(cam_intr[0, 0] * (cam_pts[0, :] / cam_pts[2, :]) + cam_intr[0, 2]).astype(int) pix_y = np.round(cam_intr[1, 1] * (cam_pts[1, :] / cam_pts[2, :]) + cam_intr[1, 2]).astype(int) # Skip if outside view frustum. valid_pix = np.logical_and( pix_x >= 0, np.logical_and( pix_x < im_w, np.logical_and( pix_y >= 0, np.logical_and(pix_y < im_h, cam_pts[2, :] > 0)))) depth_val = np.zeros(pix_x.shape) depth_val[valid_pix] = depth_im[pix_y[valid_pix], pix_x[valid_pix]] # Integrate TSDF. depth_diff = depth_val - cam_pts[2, :] valid_pts = np.logical_and(depth_val > 0, depth_diff >= -self._trunc_margin) dist = np.minimum(1., np.divide(depth_diff, self._trunc_margin)) w_old = self._weight_vol_cpu[vox_coords[0, valid_pts], vox_coords[1, valid_pts], vox_coords[2, valid_pts]] w_new = w_old + obs_weight self._weight_vol_cpu[vox_coords[0, valid_pts], vox_coords[1, valid_pts], vox_coords[2, valid_pts]] = w_new tsdf_vals = self._tsdf_vol_cpu[vox_coords[0, valid_pts], vox_coords[1, valid_pts], vox_coords[2, valid_pts]] self._tsdf_vol_cpu[vox_coords[0, valid_pts], vox_coords[1, valid_pts], vox_coords[2, valid_pts]] = np.divide( np.multiply(tsdf_vals, w_old) + dist[valid_pts], w_new) # Integrate color. old_color = self._color_vol_cpu[vox_coords[0, valid_pts], vox_coords[1, valid_pts], vox_coords[2, valid_pts]] old_b = np.floor(old_color / (256. * 256.)) old_g = np.floor((old_color - old_b * 256. * 256.) / 256.) old_r = old_color - old_b * 256. * 256. - old_g * 256. new_color = color_im[pix_y[valid_pts], pix_x[valid_pts]] new_b = np.floor(new_color / (256. * 256.)) new_g = np.floor((new_color - new_b * 256. * 256.) / 256.) new_r = new_color - new_b * 256. * 256. - new_g * 256. new_b = np.minimum( np.round(np.divide(np.multiply(old_b, w_old) + new_b, w_new)), 255.) new_g = np.minimum( np.round(np.divide(np.multiply(old_g, w_old) + new_g, w_new)), 255.) new_r = np.minimum( np.round(np.divide(np.multiply(old_r, w_old) + new_r, w_new)), 255.) self._color_vol_cpu[vox_coords[0, valid_pts], vox_coords[ 1, valid_pts], vox_coords[ 2, valid_pts]] = new_b * 256. * 256. + new_g * 256. + new_r # Copy voxel volume to CPU. def get_volume(self): if TSDF_GPU_MODE: cuda.memcpy_dtoh(self._tsdf_vol_cpu, self._tsdf_vol_gpu) cuda.memcpy_dtoh(self._color_vol_cpu, self._color_vol_gpu) return self._tsdf_vol_cpu, self._color_vol_cpu # Get mesh of voxel volume via marching cubes. def get_mesh(self): tsdf_vol, color_vol = self.get_volume() # Marching cubes. verts, faces, norms, _ = measure.marching_cubes_lewiner(tsdf_vol, level=0) verts_ind = np.round(verts).astype(int) verts = verts * self._voxel_size + self._vol_origin # voxel grid coordinates to world coordinates # Get vertex colors. 
rgb_vals = color_vol[verts_ind[:, 0], verts_ind[:, 1], verts_ind[:, 2]] colors_b = np.floor(rgb_vals / (256 * 256)) colors_g = np.floor((rgb_vals - colors_b * 256 * 256) / 256) colors_r = rgb_vals - colors_b * 256 * 256 - colors_g * 256 colors = np.floor(np.asarray([colors_r, colors_g, colors_b])).T colors = colors.astype(np.uint8) return verts, faces, norms, colors
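# End-to-end usage sketch for TSDFVolume; the bounds, voxel size, intrinsics,
# and frame source below are placeholders, not values from the original code.
vol_bnds = np.array([[-1.5, 1.5], [-1.5, 1.5], [0.0, 2.0]])   # metres
tsdf = TSDFVolume(vol_bnds, voxel_size=0.01)
for color_im, depth_im, cam_pose in frames:    # frames and cam_intr supplied elsewhere
    tsdf.integrate(color_im, depth_im, cam_intr, cam_pose, obs_weight=1.)
verts, faces, norms, colors = tsdf.get_mesh()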