def initParallelAlgorithms():
    global bitonicSort_
    with open("ParallelAlgorithms/bitonicSort.cu") as fin:
        mod = SourceModule(fin.read())
    bitonicSort_ = mod.get_function("bitonicSort")

    global finishCSM_
    global getSumSquares_
    with open("ParallelAlgorithms/CSMHelper.cu") as fin:
        mod = SourceModule(fin.read())
    finishCSM_ = mod.get_function("finishCSM")
    getSumSquares_ = mod.get_function("getSumSquares")

    # Run each of the algorithms on dummy data so that they're pre-compiled
    # 1) Bitonic sort
    X = np.random.randn(16, 16)
    N = np.int32(16)
    NPow2 = N
    NThreads = int(N) // 2  # block dimensions must be Python ints
    XG = gpuarray.to_gpu(X)
    bitonicSort_(XG, N, NPow2, block=(NThreads, 1, 1),
                 grid=(X.shape[0], 1), shared=int(4 * NPow2))

    linalg.init()
    # 2) Other primitive operations
    XDotX = linalg.dot(XG, XG)
    XPlusX = skcuda.misc.add(XG, XG)
    XSqr = skcuda.misc.multiply(XG, XG)
    XSqr = skcuda.misc.sum(XSqr, 1)
    XPlusCol = skcuda.misc.add_matvec(XG, XSqr, 0)
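# A minimal self-contained sketch (hypothetical kernel and names) of the same
# warm-up idea used above: compile a kernel and launch it once on throwaway
# data so the first real call does not pay the JIT cost. Assumes
# pycuda.autoinit for context setup.
import numpy as np
import pycuda.autoinit
import pycuda.gpuarray as gpuarray
from pycuda.compiler import SourceModule

_warmup_mod = SourceModule("""
__global__ void scale(float *x, float a, int n)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) x[i] *= a;
}
""")
scale_ = _warmup_mod.get_function("scale")

# one dummy launch; later calls reuse the already-compiled module
dummy = gpuarray.to_gpu(np.random.randn(256).astype(np.float32))
scale_(dummy, np.float32(2.0), np.int32(256), block=(256, 1, 1), grid=(1, 1))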
def prepare_kernel(s):
    mod = SourceModule("""
    __global__ void update_src(int idx, int tstep, float *f) {
        f[idx] += sin(0.1*tstep);
    }

    __global__ void update(int nx, int ny, float *c, float *f, float *g) {
        int tx = threadIdx.x;
        int idx = blockIdx.x*blockDim.x + tx;
        extern __shared__ float gs[];

        gs[tx+1] = g[idx];
        int i = idx/ny, j = idx%ny;
        if(j>0 && j<ny-1) {
            if(tx==0) gs[tx] = g[idx-1];
            if(tx==blockDim.x-1) gs[tx+2] = g[idx+1];
        }
        __syncthreads();

        if(i>0 && j>0 && i<nx-1 && j<ny-1) {
            f[idx] = c[idx]*(g[idx+ny]+g[idx-ny]+gs[tx+2]+gs[tx]-4*gs[tx+1])
                     + 2*gs[tx+1] - f[idx];
        }
    }
    """)
    s.update_src = mod.get_function("update_src")
    s.update = mod.get_function("update")

    Db, s.Dg = (256, 1, 1), (s.nx * s.ny // 256 + 1, 1)  # grid size must be an int
    s.nnx, s.nny = sc.int32(s.nx), sc.int32(s.ny)
    s.update_src.prepare("iiP", block=(1, 1, 1))
    s.update.prepare("iiPPP", block=Db, shared=(256 + 2) * 4)
def get_tkernel(slen, window):
    if window < 32:
        raise ValueError("GPU threshold kernel does not support a window smaller than 32 samples")
    elif window <= 4096:
        nt = 128
    elif window <= 16384:
        nt = 256
    elif window <= 32768:
        nt = 512
    else:
        nt = 1024

    nb = int(numpy.ceil(slen / float(window)))
    if nb > 1024:
        raise ValueError("More than 1024 blocks not supported yet")

    try:
        return tfn_cache[(nt, nb)], nt, nb
    except KeyError:
        mod = SourceModule(tkernel1.render(chunk=nt))
        mod2 = SourceModule(tkernel2.render(blocks=nb))
        fn = mod.get_function("threshold_and_cluster")
        fn.prepare("PPPif")
        fn2 = mod2.get_function("threshold_and_cluster2")
        fn2.prepare("PPfi")
        tfn_cache[(nt, nb)] = (fn, fn2)
        return tfn_cache[(nt, nb)], nt, nb
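# A minimal sketch of the caching idiom above: compile a size-specialized
# kernel once per configuration and reuse it on later calls. The kernel body
# and cache name are illustrative, not from the source.
from pycuda.compiler import SourceModule

_fill_cache = {}

def get_fill_kernel(chunk):
    try:
        return _fill_cache[chunk]
    except KeyError:
        mod = SourceModule("""
        #define CHUNK %d
        __global__ void fill(float *x) { x[blockIdx.x * CHUNK + threadIdx.x] = 1.0f; }
        """ % chunk)
        fn = mod.get_function("fill")
        fn.prepare("P")
        _fill_cache[chunk] = fn
        return fn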
def gradient_gpu(y_gpu, mode='valid'):
    shape = np.array(y_gpu.shape).astype(np.uint32)
    dtype = y_gpu.dtype
    block_size = (16, 16, 1)
    grid_size = (int(np.ceil(float(shape[1]) / block_size[0])),
                 int(np.ceil(float(shape[0]) / block_size[1])))
    shared_size = int((1 + block_size[0]) * (1 + block_size[1]) * dtype.itemsize)

    preproc = _generate_preproc(dtype, shape)
    mod = SourceModule(preproc + kernel_code, keep=True)

    # use a local name distinct from this function's own to avoid shadowing
    if mode == 'valid':
        gradient_fun = mod.get_function("gradient_valid")
        gradx_gpu = cua.empty((y_gpu.shape[0], y_gpu.shape[1] - 1), y_gpu.dtype)
        grady_gpu = cua.empty((y_gpu.shape[0] - 1, y_gpu.shape[1]), y_gpu.dtype)
    elif mode == 'same':
        gradient_fun = mod.get_function("gradient_same")
        gradx_gpu = cua.empty((y_gpu.shape[0], y_gpu.shape[1]), y_gpu.dtype)
        grady_gpu = cua.empty((y_gpu.shape[0], y_gpu.shape[1]), y_gpu.dtype)
    else:
        raise ValueError("mode must be 'valid' or 'same'")

    gradient_fun(gradx_gpu.gpudata, grady_gpu.gpudata, y_gpu.gpudata,
                 block=block_size, grid=grid_size, shared=shared_size)

    return (gradx_gpu, grady_gpu)
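# The ceil-division block/grid arithmetic above recurs in many of these
# helpers; a small hypothetical helper makes the computation explicit.
import numpy as np

def grid_for(shape, block_size=(16, 16, 1)):
    """Return a 2D grid covering a (rows, cols) array with the given block."""
    return (int(np.ceil(float(shape[1]) / block_size[0])),
            int(np.ceil(float(shape[0]) / block_size[1])))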
def get_kernel_functions(**kwargs):
    reset_mod = SourceModule(reset_template.render(kwargs))
    add_mod = SourceModule(add_template.render(kwargs))
    step_mod = SourceModule(step_template.render(kwargs))

    reset_func = reset_mod.get_function('reset_force')
    add_func = add_mod.get_function('add_force')
    update_func = step_mod.get_function('update')

    return (reset_func, add_func, update_func)
def prepare_update(self):
    from pycuda.compiler import SourceModule
    import os

    src_path = os.path.dirname(os.path.abspath(__file__))
    with open(os.path.join(src_path, 'core.cu')) as f:
        kernels = f.read()
    mod = SourceModule(kernels)
    # mod = cuda.module_from_file('core.cubin')
    self.update_pre = mod.get_function('update_pre')
    self.update_ul = mod.get_function('update_ul')
def make_thunk(self, node, storage_map, _, _2): mod = SourceModule(open("binary_kernels.cu").read()) concatenate_rows_kernel = mod.get_function("concatenate_rows_kernel") concatenate_cols_kernel = mod.get_function("concatenate_cols_kernel") xnor_kernel = mod.get_function("xnor_gemm") inputs = [storage_map[v] for v in node.inputs] outputs = [storage_map[v] for v in node.outputs] # THIS IS PROBABLY THE PART YOU ARE INTERESTED IN def thunk(): # inputs A = inputs[0][0] B = inputs[1][0] # dimensions m = A.shape[0] n = A.shape[1] k = B.shape[1] assert n == B.shape[0] # Otherwise GEMM is impossible assert n%(32*16) == 0 # Concatenation and block size # output output_shape = (m, k) C = outputs[0] # only allocate if there is no previous allocation of the right size. if C[0] is None or C[0].shape != output_shape: C[0] = cuda.CudaNdarray.zeros(output_shape) # Concatenating the rows of A Ac = drv.mem_alloc(m*n*4/32) block_size = 64 block = (block_size,1,1) grid = (m*n/(block_size*32)+1,1) concatenate_rows_kernel(A,Ac, np.intc(m*n/32), block= block, grid=grid) # Concatenating the columns of B Bc = drv.mem_alloc(n*k*4/32) block_size = 64 block = (block_size,1,1) grid = (k/block_size+1,1) concatenate_cols_kernel(B,Bc, np.intc(n), np.intc(k), block= block, grid=grid) # Launching xnor_kernel block_size = 16 block = (block_size,block_size,1) grid = (k / block_size + 1, m / block_size + 1) # better too many blocks than too little xnor_kernel(Ac,Bc,C[0], np.intc(m), np.intc(n/32.), np.intc(k), block= block, grid=grid) thunk.inputs = inputs thunk.outputs = outputs thunk.lazy = False return thunk
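# A NumPy reference (a sketch, not the kernel's exact layout) for what
# concatenate_rows_kernel does above: packing each run of 32 float entries
# into one uint32 sign bitmask, so the XNOR GEMM can compare 32 weights per
# integer operation. The sign convention and bit order are assumptions.
import numpy as np

def pack_rows(A):
    """Pack a (m, n) float matrix into a (m, n//32) uint32 sign-bit matrix."""
    m, n = A.shape
    assert n % 32 == 0
    bits = (A.reshape(m, n // 32, 32) > 0).astype(np.uint32)
    shifts = np.arange(31, -1, -1, dtype=np.uint32)  # assumed MSB-first order
    return (bits << shifts).sum(axis=2, dtype=np.uint32)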
def get_K_min(self):
    """
    Return the kinetic energy minimum
    """
    # fill array with values of the kinetic energy
    fill_compiled = SourceModule(
        self.fill_V_K.format(cuda_consts=self.cuda_consts, K=self.K, V=self.V)
    )
    fill_compiled.get_function("fill_K")(self.weighted, **self.wigner_mapper_params)

    return gpuarray.min(self.weighted).get()
def __init__(self, **kwargs):
    """
    In addition to kwargs of WignerMoyalCUDA1D.__init__ this constructor accepts:

    kT (optional) - the temperature for the Gibbs state [rho = exp(-H/kT)]
    dbeta (optional) - inverse temperature increments for the split-operator propagation
    t_initial (optional) - if the Hamiltonian is time dependent, then the Gibbs state
        will be calculated for the Hamiltonian at t_initial (default value of zero).
    """
    if 't_initial' not in kwargs:
        kwargs.update(t_initial=0.)
        print("Warning: Initial time (t_initial) was not specified, so the default value t_initial = 0 was used.")

    try:
        self.kT = kwargs['kT']
        # remove kT from kwargs so that it does not enter into self.cuda_consts
        del kwargs['kT']
    except KeyError:
        self.kT = 0.

    try:
        self.dbeta = kwargs['dbeta']
        # remove dbeta from kwargs so that it does not enter into self.cuda_consts
        del kwargs['dbeta']
    except KeyError:
        # if dbeta is not defined, just choose some value
        self.dbeta = 0.01

    if 'dt' not in kwargs:
        # Save the inverse temperature increment as dt
        kwargs.update(dt=self.dbeta)

    # Initialize parent class
    WignerMoyalCUDA1D.__init__(self, **kwargs)

    # Save the minimums of the potential (V) and kinetic (K) energy
    self.cuda_consts += " const double V_min = %.15e;\n" % self.get_V_min()
    self.cuda_consts += " const double K_min = %.15e;\n" % self.get_K_min()

    print("\n================================ Compiling Bloch expK and expV ================================\n")
    bloch_expK_expV_compiled = SourceModule(
        self.bloch_expK_expV_cuda_source.format(
            cuda_consts=self.cuda_consts,
            K=self.K,
            V=self.V,
            abs_boundary_lambda_p=self.abs_boundary_lambda_p,
            abs_boundary_x_theta=self.abs_boundary_x_theta
        )
    )

    self.bloch_expK_bulk = bloch_expK_expV_compiled.get_function("bloch_expK_bulk")
    self.bloch_expK_boundary = bloch_expK_expV_compiled.get_function("bloch_expK_boundary")
    self.bloch_expV_bulk = bloch_expK_expV_compiled.get_function("bloch_expV_bulk")
    self.bloch_expV_boundary = bloch_expK_expV_compiled.get_function("bloch_expV_boundary")
def make_GPU_gradient(mesh, context):
    '''Prepare to compute gradient on the GPU w.r.t. the given mesh.
    Return gradient function.
    '''
    mx = int(getattr(mesh, 'nx', 1))
    my = int(getattr(mesh, 'ny', 1))
    mz = int(getattr(mesh, 'nz', 1))

    dxInv = np.array(1./getattr(mesh, 'dx', 1), dtype=np.float64)
    dyInv = np.array(1./getattr(mesh, 'dy', 1), dtype=np.float64)
    dzInv = np.array(1./getattr(mesh, 'dz', 1), dtype=np.float64)

    sizeof_double = 8
    with open(where + 'gradient2.cu') as fdlib:
        source = fdlib.read()
    module = SourceModule(source)

    mx_ptr = module.get_global("mx")[0]
    my_ptr = module.get_global("my")[0]
    mz_ptr = module.get_global("mz")[0]
    cuda.memcpy_htod(mx_ptr, np.array(mx, dtype=np.int32))
    cuda.memcpy_htod(my_ptr, np.array(my, dtype=np.int32))
    cuda.memcpy_htod(mz_ptr, np.array(mz, dtype=np.int32))

    dxInv_ptr = module.get_global("dxInv")[0]
    dyInv_ptr = module.get_global("dyInv")[0]
    dzInv_ptr = module.get_global("dzInv")[0]
    cuda.memcpy_htod(dxInv_ptr, dxInv)
    cuda.memcpy_htod(dyInv_ptr, dyInv)
    cuda.memcpy_htod(dzInv_ptr, dzInv)

    deriv_x = module.get_function("gradient_x")
    deriv_y = module.get_function("gradient_y")
    deriv_z = module.get_function("gradient_z")

    block, grid = mesh.get_domain_decomposition(DeviceData().max_threads)

    d_deriv_x = gpuarray.empty(shape=(1, mesh.n_nodes), dtype=np.float64)
    d_deriv_y = gpuarray.empty_like(d_deriv_x)
    d_deriv_z = gpuarray.empty_like(d_deriv_x)

    def _gradient(scalar_values):
        '''Calculate three-dimensional gradient for GPUArray scalar_values.'''
        deriv_x(scalar_values, d_deriv_x, block=block, grid=grid)
        deriv_y(scalar_values, d_deriv_y, block=block, grid=grid)
        deriv_z(scalar_values, d_deriv_z, block=block, grid=grid)
        context.synchronize()
        return (d_deriv_x, d_deriv_y, d_deriv_z)[:mesh.dimension]

    return _gradient
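# A minimal self-contained sketch of the get_global/memcpy_htod pattern used
# above to set __constant__ device variables before any launch; the kernel
# and variable names here are illustrative.
import numpy as np
import pycuda.autoinit
import pycuda.driver as cuda
from pycuda.compiler import SourceModule

const_mod = SourceModule("""
__constant__ float scale_factor;
__global__ void apply_scale(float *x) { x[threadIdx.x] *= scale_factor; }
""")
ptr, _nbytes = const_mod.get_global("scale_factor")  # (device ptr, size in bytes)
cuda.memcpy_htod(ptr, np.array(0.5, dtype=np.float32))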
def __init__(self, shape, scale, layer, precision=np.float32):
    self.precision = precision
    self.shape = shape
    self.scale = scale

    init_var = invgamma.rvs(1.0, scale=1.0, size=(1, layer.weights.shape[1])).astype(precision)
    self.sW = gpuarray.to_gpu(init_var)
    init_var = invgamma.rvs(1.0, scale=1.0, size=(1, 1)).astype(precision)
    self.sB = gpuarray.to_gpu(init_var)

    kernels = SourceModule(open(path + '/kernels.cu', "r").read())
    self.add_prior_w_kernel = kernels.get_function("add_gaussian_unit_grad")
    self.add_prior_b_kernel = kernels.get_function("add_bias_grad")
    self.scale_momentum_kernel = kernels.get_function("scale_momentum_normal_unit")
def get_K_min(self):
    """
    Return the kinetic energy minimum
    """
    # allocate memory
    k_p_p_prime = gpuarray.zeros((self.P.size, self.P.size), np.float64)

    # fill array with values of the kinetic energy
    fill_compiled = SourceModule(
        self.fill_V_K.format(cuda_consts=self.cuda_consts, K=self.K, V=self.V)
    )
    fill_compiled.get_function("fill_K")(k_p_p_prime, **self.rho_mapper_params)

    return gpuarray.min(k_p_p_prime).get()
def init_rng(seed):
    global _dropout_kernel, _saltpepper_kernel, _rng_state, _rng_threads, _rng_blocks
    from pycuda.characterize import sizeof
    ds = sizeof("curandState", "#include <curand_kernel.h>")
    _rng_state = drv.mem_alloc(_rng_threads * _rng_blocks * ds)
    src = SourceModule('''
    #include <curand_kernel.h>

    extern "C" {
    __global__ void setup_rng(curandState* rng_state, const unsigned seed)
    {
        const unsigned tid = blockIdx.x*blockDim.x+threadIdx.x;
        curand_init(seed, tid, 0, &rng_state[tid]);
    }

    __global__ void dropout_eltw(float* x, const unsigned size,
                                 float dropout_rate, curandState* rng_state)
    {
        const unsigned tid = blockIdx.x*blockDim.x+threadIdx.x;
        const unsigned num_threads = gridDim.x*blockDim.x;
        curandState localState = rng_state[tid];
        for (unsigned i = tid; i < size; i += num_threads)
            x[i] = (curand_uniform(&localState) < dropout_rate) ? 0.0 : x[i];
        rng_state[tid] = localState;
    }

    __global__ void saltpepper_eltw(float* x, const unsigned size,
                                    float dropout_rate, curandState* rng_state)
    {
        const unsigned tid = blockIdx.x*blockDim.x+threadIdx.x;
        const unsigned num_threads = gridDim.x*blockDim.x;
        curandState localState = rng_state[tid];
        /* braces are required here: both updates belong inside the loop;
           without them the second statement sat outside the loop and
           referenced `i` out of scope */
        for (unsigned i = tid; i < size; i += num_threads) {
            x[i] = (curand_uniform(&localState) < dropout_rate) ? 0.0 : x[i];
            x[i] = (curand_uniform(&localState) < dropout_rate) ? 1.0 : x[i];
        }
        rng_state[tid] = localState;
    }
    }
    ''', no_extern_c=True)
    setup_rng = src.get_function("setup_rng")
    setup_rng.prepare("Pi")
    setup_rng.prepared_call((_rng_threads, 1, 1), (_rng_blocks, 1, 1),
                            _rng_state, np.uint32(seed))
    _dropout_kernel = src.get_function("dropout_eltw")
    _dropout_kernel.prepare("PifP")
    _saltpepper_kernel = src.get_function("saltpepper_eltw")
    _saltpepper_kernel.prepare("PifP")
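# For reference, PyCUDA's prepare/prepared_call protocol used above:
# prepare() fixes the argument signature once ("P" = pointer, "i" = int,
# "f" = float) and prepared_call(grid, block, *args) launches without
# re-marshalling types on every call. A sketch, assuming init_rng has run
# and x_gpu is a hypothetical float32 GPUArray:
grid, block = (_rng_blocks, 1, 1), (_rng_threads, 1, 1)
_dropout_kernel.prepared_call(grid, block,
                              x_gpu.gpudata, np.uint32(x_gpu.size),
                              np.float32(0.5), _rng_state)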
def get_K_min(self):
    """
    Return the kinetic energy minimum in the lambda p space
    """
    # allocate memory
    k_p_lambda = gpuarray.zeros((self.P.size, self.Lambda.size), np.float64)

    # fill array with values of the kinetic energy
    fill_compiled = SourceModule(
        self.fill_V_K.format(cuda_consts=self.cuda_consts, K=self.K, V=self.V)
    )
    fill_compiled.get_function("fill_K_bulk")(k_p_lambda, **self.K_bulk_mapper_params)
    fill_compiled.get_function("fill_K_boundary")(k_p_lambda, **self.K_boundary_mapper_params)

    return gpuarray.min(k_p_lambda).get()
def get_V_min(self):
    """
    Return the potential energy minimum in the x theta space
    """
    # allocate memory
    v_theta_x = gpuarray.zeros((self.Theta.size, self.X.size), np.float64)

    # fill array with values of the potential energy
    fill_compiled = SourceModule(
        self.fill_V_K.format(cuda_consts=self.cuda_consts, K=self.K, V=self.V)
    )
    fill_compiled.get_function("fill_V_bulk")(v_theta_x, **self.V_bulk_mapper_params)
    fill_compiled.get_function("fill_V_boundary")(v_theta_x, **self.V_boundary_mapper_params)

    return gpuarray.min(v_theta_x).get()
class CudaCalculator(object): def __init__(self, eta, beta, L, no_angles, no_pulses, order=5): self.mod_K = SourceModule(RADON_KERNEL.format(order, no_angles, no_pulses)) self.K_gpu = self.mod_K.get_function("K_l") self.mod_reduction = SourceModule(REDUCTION_KERNEL) self.reduction_gpu = self.mod_reduction.get_function("reduction") self.eta = eta self.gamma = gamma(eta) self.beta = beta self.L = L self.h = calc_h(L, beta, eta) drv.memcpy_htod(self.mod_K.get_global("rsq4pi")[0], scipy.array([1./sqrt(4.*pi)], dtype=scipy.float32)) drv.memcpy_htod(self.mod_K.get_global("sqeta")[0], scipy.array([sqrt(self.eta)], dtype=scipy.float32)) drv.memcpy_htod(self.mod_K.get_global("h")[0], scipy.array([self.h], dtype=scipy.float32)) drv.memcpy_htod(self.mod_K.get_global("four_pi_gamma")[0], scipy.array([4.*pi*self.gamma], dtype=scipy.float32)) y = sqrt(self.gamma)/self.h drv.memcpy_htod(self.mod_K.get_global("y")[0], scipy.array([y], dtype=scipy.float32)) n = scipy.arange(1, order+1, dtype=scipy.float32) n2 = n**2 ex = exp(-n2/4.) pre_s2 = ex*cosh(n*y) pre_s3 = ex*n*sinh(n*y) drv.memcpy_htod(self.mod_K.get_global("n2")[0], n2) drv.memcpy_htod(self.mod_K.get_global("pre_s1")[0], ex) drv.memcpy_htod(self.mod_K.get_global("pre_s2")[0], pre_s2) drv.memcpy_htod(self.mod_K.get_global("pre_s3")[0], pre_s3) def K(self, Q, P, angles, quadratures): drv.memcpy_htod(self.mod_K.get_global("cos_phi")[0], cos(angles).astype(scipy.float32)) drv.memcpy_htod(self.mod_K.get_global("sin_phi")[0], sin(angles).astype(scipy.float32)) Nx = Q.shape[0] Ny = int(floor(quadratures.size / 1024.)) K = scipy.empty((Nx,), dtype=scipy.float32) Kb = drv.mem_alloc(4*Ny*Nx) Q_gpu = drv.to_device(Q) P_gpu = drv.to_device(P) self.K_gpu(drv.In(quadratures), Q_gpu, P_gpu, Kb, block=(1, 1024, 1), grid=(Nx, Ny), shared=1024*4) self.reduction_gpu(Kb, drv.Out(K), block=(1, Ny, 1), grid=(Nx, 1), shared=Ny*4) return K/self.L def reconstruct_wigner(self, angles_quadratures, Nq, Np): angles, quadratures = angles_quadratures q_mean, p_mean, s_max = estimate_position_from_quadratures(self.eta, angles, quadratures) q, p, Q, P = build_mesh(q_mean, p_mean, s_max, Nq, Np) W = self.K(Q.ravel(), P.ravel(), angles, quadratures) return q_mean, p_mean, Q, P, W.reshape(Q.shape)
def make_thunk(self, node, storage_map, _, _2): inputs = [storage_map[v] for v in node.inputs] outputs = [storage_map[v] for v in node.outputs] mod = SourceModule(""" __global__ void cyclic_roll_grad(float * input, float * output, int batch_size, int num_features) { int x = blockIdx.x*blockDim.x + threadIdx.x; // feature dim, fastest varying index! int y = blockIdx.y*blockDim.y + threadIdx.y; // batch dim int height = 4 * batch_size; int width = 4 * num_features; float val = 0; if (x < num_features && y < height) { for (int i = 0; i < 4; i++) { int y_in = (y + batch_size * (4 - i)) % height; int x_in = x + num_features * i; val += input[y_in * width + x_in]; } output[y * num_features + x] = val; } }""") kernel = mod.get_function("cyclic_roll_grad") def thunk(): in_shape = inputs[0][0].shape rows, cols = in_shape assert rows % 4 == 0 assert cols % 4 == 0 out_shape = (rows, cols // 4) batch_size = rows // 4 num_features = cols // 4 out = outputs[0] # only allocate if there is no previous allocation of the right size. if out[0] is None or out[0].shape != out_shape: out[0] = cuda.CudaNdarray.zeros(out_shape) x_block = 16 y_block = 16 block = (x_block, y_block, 1) x_grid = int(np.ceil(float(out_shape[1]) / x_block)) y_grid = int(np.ceil(float(out_shape[0]) / y_block)) grid = (x_grid, y_grid, 1) kernel(inputs[0][0], out[0], np.intc(batch_size), np.intc(num_features), block=block, grid=grid) thunk.inputs = inputs thunk.outputs = outputs thunk.lazy = False return thunk
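# A NumPy reference (a sketch, for checking the kernel above) of the same
# reduction: each output element sums the four cyclically-rolled feature
# blocks, out[y, x] = sum_i in[(y + batch*(4-i)) % (4*batch), x + features*i].
import numpy as np

def cyclic_roll_grad_ref(inp, batch_size, num_features):
    height = 4 * batch_size
    out = np.zeros((height, num_features), dtype=inp.dtype)
    for i in range(4):
        rows = (np.arange(height) + batch_size * (4 - i)) % height
        out += inp[rows, i * num_features:(i + 1) * num_features]
    return out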
def trans(self):
    transposes = """
    __global__ void transpose16(float2 *idata, float2 *odata,
                                int width, int height, int num)
    {
        __shared__ float blockx[16][16+1];
        __shared__ float blocky[16][16+1];

        int xIndex = blockIdx.x * 16 + threadIdx.x;
        int yIndex = blockIdx.y * 16 + threadIdx.y;
        int index = xIndex + yIndex*width;

        for (int i = 0; i < 16; i += 8)
        {
            blockx[threadIdx.y+i][threadIdx.x] = idata[index + i*width].x;
            blocky[threadIdx.y+i][threadIdx.x] = idata[index + i*width].y;
        }
        __syncthreads();
        for (int i = 0; i < 16; i += 8)
        {
            odata[index+i*height].x = blockx[threadIdx.x][threadIdx.y+i];
            odata[index+i*height].y = blocky[threadIdx.x][threadIdx.y+i];
        }
    }
    """
    t = SourceModule(transposes)
    self.t16 = t.get_function("transpose16")
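# A sketch of launching the tiled transpose above: 16x8 threads per block,
# since the i += 8 loop makes each thread move two elements of the 16x16
# tile. Assumes width and height are multiples of 16, as the kernel has no
# bounds checks.
import numpy as np

def transpose16(self, idata_gpu, odata_gpu, width, height, num=1):
    self.t16(idata_gpu, odata_gpu,
             np.int32(width), np.int32(height), np.int32(num),
             block=(16, 8, 1), grid=(width // 16, height // 16))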
def calculate_circuit_graph_vertex_data_device(d_D, d_C, length): logger = logging.getLogger('eulercuda.pyeulertour.calculate_circuit_graph_vertex_data_device') logger.info("started.") mod = SourceModule(""" __global__ void calculateCircuitGraphVertexData( unsigned int * D,unsigned int * C,unsigned int ecount){ unsigned int tid=(blockDim.x*blockDim.y * gridDim.x*blockIdx.y) + (blockDim.x*blockDim.y*blockIdx.x)+(blockDim.x*threadIdx.y)+threadIdx.x; if( tid <ecount) { unsigned int c=D[tid]; atomicExch(C+c,1); } } """) calculate_circuit_graph_vertex_data = mod.get_function('calculateCircuitGraphVertexData') block_dim, grid_dim = getOptimalLaunchConfiguration(length, 512) logger.info('block_dim = %s, grid_dim = %s' % (block_dim, grid_dim)) np_d_D = gpuarray.to_gpu(d_D) np_d_C = gpuarray.to_gpu(d_C) calculate_circuit_graph_vertex_data( np_d_D, np_d_C, np.uintc(length), block=block_dim, grid=grid_dim ) np_d_D.get(d_D) np_d_C.get(d_C) # devdata = pycuda.tools.DeviceData() # orec = pycuda.tools.OccupancyRecord(devdata, block_dim[0] * grid_dim[1]) # logger.info("Occupancy = %s" % (orec.occupancy * 100)) logger.info("Finished. Leaving.") return d_D, d_C
def _extract_projection_func(self, state_var): template = """ __global__ void extract_projection(%(type)s* all_V, %(type)s* projection_V, int* all_index, int* projection_index, int N) { int tid = threadIdx.x + blockIdx.x * blockDim.x; int total_threads = blockDim.x * gridDim.x; int a_ind, p_ind; for(int i = tid; i < N; i += total_threads) { a_ind = all_index[i]; p_ind = projection_index[i]; projection_V[p_ind] = all_V[a_ind]; } } """ mod = SourceModule(template % {"type": dtype_to_ctype(state_var.dtype)}, options=self.compile_options) func = mod.get_function("extract_projection") func.prepare("PPPPi") # [np.intp, np.intp, np.intp, np.intp, np.int32]) return func
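# The %(type)s substitution above is plain Python string templating; a
# minimal sketch compiling one kernel body for two C types (the names here
# are illustrative, not from the source).
from pycuda.compiler import SourceModule

scale_template = """
__global__ void scale(%(type)s *x, %(type)s a) { x[threadIdx.x] *= a; }
"""
scale_mods = {
    ctype: SourceModule(scale_template % {"type": ctype})
    for ctype in ("float", "double")
}
scale_float = scale_mods["float"].get_function("scale")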
def _get_shuffle_kernel(dtype):
    code = _shuffle_kernel % _ew_types[dtype]
    module = SourceModule(code)
    kernel = module.get_function("dimShuffle")
    kernel.prepare("PPIIIIIIIIIIIIII")
    return kernel
class _ProgramWrapper: def __init__(self, env, source, sync_calls, double=False, prelude="", **kwds): # program and kernels are tied to queue, which is not exactly logical, # but works for our purposes and makes code simpler (because program uses # single queue for all calculations anyway) self._env = env self._compile(source, double=double, prelude=prelude, **kwds) self._sync_calls = sync_calls def _compile(self, source, double=False, prelude="", manual_extern_c=False, **kwds): """ Adds helper functions and defines to given source, renders it, compiles and saves OpenCL program object. """ kernel_src = Template(source).render(**kwds) src = _header.render(cuda=True, double=double, kernels=kernel_src, prelude=prelude, manual_extern_c=manual_extern_c) try: self._program = SourceModule(src, no_extern_c=True, options=['-use_fast_math']) except: for i, l in enumerate(src.split('\n')): print i + 1, ": ", l raise def __getattr__(self, name): return _KernelWrapper(self._env, self._program.get_function(name), self._sync_calls)
def _get_transpose_kernel(dtype):
    code = _transpose_kernel % _ew_types[dtype]
    module = SourceModule(code)
    kernel = module.get_function("transpose")
    kernel.prepare("PPII")
    return kernel
def make_node(self, *inputs): _inputs = [gpu_contiguous(as_cuda_ndarray_variable(i)) for i in inputs] if self.nin > 0 and len(_inputs) != self.nin: raise TypeError('Wrong argument count', (self.nin, len(_inputs))) for i in _inputs[1:]: if i.type.ndim != inputs[0].type.ndim: raise TypeError('different ranks among inputs') if any([any(i.type.broadcastable) for i in inputs]): raise Exception("pycuda don't support broadcasted dimensions") assert len(inputs)==2#TODO remove otype = CudaNdarrayType(broadcastable=[False]*_inputs[0].type.ndim) assert self.nout == 1 fct_name = "pycuda_elemwise_%s"%str(self.scalar_op) out_node = Apply(self, _inputs, [otype() for o in xrange(self.nout)]) in_name = ["i"+str(id) for id in range(len(inputs))] out_name = ["o"+str(id) for id in range(self.nout)] c_code = self.scalar_op.c_code(out_node, "some_name", tuple([n+"[i]"for n in in_name]), tuple(n+"[i]"for n in out_name), {}) c_code_param = ", ".join([var.type.dtype_specs()[1]+" *"+name for var,name in zip(inputs,in_name) + zip(out_node.outputs,out_name)]+["int size"]) mod = SourceModule(""" #include<Python.h> #include <numpy/arrayobject.h> __global__ void %s(%s) { int i = (blockIdx.x+blockIdx.y*gridDim.x)*(blockDim.x*blockDim.y); i += threadIdx.x + threadIdx.y*blockDim.x; if(i<size){ %s } } """%(fct_name,c_code_param,c_code)) self.pycuda_fct = mod.get_function(fct_name) return out_node
def edgetaper_gpu(y_gpu, sf, win='barthann'):
    shape = np.array(y_gpu.shape).astype(np.uint32)
    dtype = y_gpu.dtype
    block_size = (16, 16, 1)
    grid_size = (int(np.ceil(float(shape[1]) / block_size[0])),
                 int(np.ceil(float(shape[0]) / block_size[1])))

    # Ensure that sf is odd
    sf = sf + (1 - np.mod(sf, 2))

    wx = scipy.signal.get_window(win, sf[1])
    wy = scipy.signal.get_window(win, sf[0])
    maxw = wx.max() * wy.max()
    hsf = np.floor(sf / 2).astype(np.int32)  # integer half-sizes, needed for slicing
    wx = (wx[0:hsf[1]] / maxw).astype(dtype)
    wy = (wy[0:hsf[0]] / maxw).astype(dtype)

    preproc = _generate_preproc(dtype, shape)
    preproc += '#define wx_size %d\n' % wx.size
    preproc += '#define wy_size %d\n' % wy.size
    mod = SourceModule(preproc + edgetaper_code, keep=True)

    # use a local name distinct from this function's own to avoid shadowing
    edgetaper_fun = mod.get_function("edgetaper")
    wx_gpu, wx_size = mod.get_global('wx')
    wy_gpu, wy_size = mod.get_global('wy')
    cu.memcpy_htod(wx_gpu, wx)
    cu.memcpy_htod(wy_gpu, wy)
    edgetaper_fun(y_gpu, np.int32(hsf[1]), np.int32(hsf[0]),
                  block=block_size, grid=grid_size)
def _init(self, pn, qn, wf_qn): super()._init(pn, qn, wf_qn) self._wf_q_grid_gpu = gpuarray.to_gpu(N.ascontiguousarray(self._wf_q_grid)) self._wf_gpu = gpuarray.to_gpu(N.ascontiguousarray(self._wf)) mod = SourceModule(""" __global__ void transform(double *ps, double *qs, double *wf_q_grid, double *wf, double *out_real, double *out_imag) {{ int idx_x = threadIdx.x + blockIdx.x * blockDim.x; int idx_y = threadIdx.y + blockIdx.y * blockDim.y; int idx = idx_x + idx_y * {qn}; double qdiff, prefactor, s, c; if (idx_x >= {qn} || idx_y >= {pn}) return; for (int j = 0; j < {wf_qn}; j++) {{ qdiff = wf_q_grid[j] - qs[idx]; prefactor = exp({g} * qdiff * qdiff) * wf[j]; sincos({h} * ps[idx] * qdiff, &s, &c); out_real[idx] += prefactor * c; out_imag[idx] += prefactor * s; }} }} """.format(g=-0.5*self._gamma, h=1./HBAR, pn=pn, qn=qn, wf_qn=wf_qn)) self._kernel = mod.get_function('transform') self._kernel.prepare('PPPPPP') self._gpu_grid, self._gpu_block = carve_array(qn, pn)
def gamma_uncompress_gpu(y_gpu):
    """Do a gamma decompression on the image.

    This function does an in-place gamma decompression on the input image.

    Args:
      y_gpu: Input image on GPU.

    Returns:
      Nothing, but modifies y_gpu in place.
    """
    shape = np.array(y_gpu.shape).astype(np.uint32)
    dtype = y_gpu.dtype
    if len(shape) == 3:
        dim = int(shape[2])
    else:
        dim = 1
    block_size = (16, 16, dim)
    grid_size = (int(np.ceil(float(shape[1]) / block_size[0])),
                 int(np.ceil(float(shape[0]) / block_size[1])))

    preproc = _generate_preproc(dtype, shape)
    mod = SourceModule(preproc + kernel_code, keep=True)

    gamma_uncompress_fun = mod.get_function("gamma_uncompress")
    gamma_uncompress_fun(y_gpu.gpudata, block=block_size, grid=grid_size)
def shock_filter_gpu(y_gpu, iter=3, dt=0.1, h=1):
    """Evolve image according to a "shock filter" process.

    This function does an in-place shock filtering on the input image.

    Args:
      y_gpu: Input image on GPU.
      iter: Number of time steps.
      dt: Duration of one time step.
      h: Size of grid steps.

    Returns:
      Nothing, but modifies y_gpu in place.
    """
    stream = cu.Stream()

    shape = np.array(y_gpu.shape).astype(np.uint32)
    dtype = y_gpu.dtype
    block_size = (16, 16, 1)
    grid_size = (int(np.ceil(float(shape[1]) / block_size[0])),
                 int(np.ceil(float(shape[0]) / block_size[1])))
    shared_size = int((2 + block_size[0]) * (2 + block_size[1]) * dtype.itemsize)

    preproc = _generate_preproc(dtype, shape)
    mod = SourceModule(preproc + kernel_code, keep=True)

    shock_gpu = mod.get_function("shock")
    for i in range(iter):
        shock_gpu(y_gpu.gpudata, np.float32(dt), np.float32(h),
                  block=block_size, grid=grid_size, stream=stream,
                  shared=shared_size)
    stream.synchronize()
class CUDAModule(object): """ Interfaces with PyCUDA Parameters ---------- kernel_dict : """ def __init__(self, kernel_dict): self.kernel_dict = kernel_dict self.support_code = _get_support_code() self.all_code = self._get_full_source() try: self.pycuda_module = SourceModule(self.all_code) except Exception: f = open('foo.cu', 'w') print >> f, self.all_code f.close() raise def _get_full_source(self): formatted_kernels = [kern.get_code() for kern in self.kernel_dict.values()] return '\n'.join([self.support_code] + formatted_kernels) def get_function(self, name): return self.pycuda_module.get_function('k_%s' % name)
def bfilter_gpu(y_gpu, w=5, sigma_d=3, sigma_r=0.1):
    """Two dimensional bilateral filtering.

    This function implements 2-D bilateral filtering using the method
    outlined in:

      C. Tomasi and R. Manduchi. Bilateral Filtering for Gray and Color
      Images. In Proceedings of the IEEE International Conference on
      Computer Vision, 1998.

    This operation is in place.

    Args:
      y_gpu: Input image on GPU.
      w: Half-size of the filter.
      sigma_d: Spatial domain deviation.
      sigma_r: Intensity domain deviation.

    Returns:
      Nothing, but modifies y_gpu in place.
    """
    shape = np.array(y_gpu.shape).astype(np.uint32)
    dtype = y_gpu.dtype
    block_size = (16, 16, 1)
    grid_size = (int(np.ceil(float(shape[1]) / block_size[0])),
                 int(np.ceil(float(shape[0]) / block_size[1])))
    shared_size = int((2*w + block_size[0]) * (2*w + block_size[1]) * dtype.itemsize)

    preproc = _generate_preproc(dtype, shape)
    mod = SourceModule(preproc + kernel_code, keep=True)

    # use a local name distinct from this function's own to avoid shadowing
    bfilter_fun = mod.get_function("bfilter")
    bfilter_fun(y_gpu.gpudata, np.int32(w), np.float32(sigma_d),
                np.float32(sigma_r),
                block=block_size, grid=grid_size, shared=shared_size)
from pycuda.compiler import SourceModule args = 'float *fr00' body = 'f = 0.543*fr00[idx];\n\t__syncthreads();\n' for i in range(nloop): if (i > 0): args += ', float *fr%.2d' % i body += '\tf += %1.3f*fr%.2d[idx];\n\t__syncthreads();\n' % ( np.random.ranf(), i) kernels += kernel_template.replace('NAME', 'func%.2d' % i).replace( 'ARGS', args).replace('BODY', body) print kernels mod = SourceModule(kernels) kern_list = [] for i in range(nloop): kern_list.append(mod.get_function("func%.2d" % i)) # measure kernel execution time start = cuda.Event() stop = cuda.Event() exec_time = np.zeros(nloop, dtype=np.float64) for k in range(10): for i in range(nloop): fr_gpus = 'fr_gpu_list[0]' for j in range(1, i + 1): fr_gpus += ', fr_gpu_list[%d]' % j cmd = 'kern_list[%d](fw_gpu, %s, block=(512,1,1), grid=(nx/512,1))' % ( i, fr_gpus) #print cmd start.record()
import pycuda.driver as cuda import pycuda.autoinit from pycuda.compiler import SourceModule import numpy as np a = np.random.randn(4, 4) a = a.astype(np.float32) a_gpu = cuda.mem_alloc(a.nbytes) cuda.memcpy_htod(a_gpu, a) mod = SourceModule(""" __global__ void doublify(float *a) { int idx = threadIdx.x + threadIdx.y * 4; a[idx] *= 2; } """) func = mod.get_function("doublify") func(a_gpu, block=(4, 4, 1)) a_doubled = np.empty_like(a) cuda.memcpy_dtoh(a_doubled, a_gpu) print a_doubled print a
for(int binsidx = binpfx[bin_idx]; binsidx < binpfx[bin_idx] + bincounters[bin_idx]; binsidx++){{ int otherr = (int)(bins[binsidx][0]); // location in field int otherc = (int)(bins[binsidx][1]); int viewr = otherr - myr + view_range; int viewc = otherc - myc + view_range; // If if statement passes, then other agent is in view range. if(0 <= viewr && viewr < {view_size} && 0 <= viewc && viewc < {view_size}){{ states[viewr * {view_size} + viewc] = 1.0; }} }} }} }} // Render -1 for walls for(int viewr = 0; viewr < {view_size}; viewr++){{ for(int viewc = 0; viewc < {view_size}; viewc++){{ int fieldr = myr + viewr - view_range; int fieldc = myc + viewc - view_range; if(fieldr < 0 || fieldc < 0 || fieldr >= {rows} || fieldc >= {cols}){{ int stateidx = statesize * i + viewr * {view_size} + viewc; states[stateidx] = -1; }} }} }} }} }}""" print(kernel) mod = SourceModule(kernel) step = mod.get_function("tensor")
def _get_bprop_upsampling(clss, compute_capability): code = r""" #define FLT_MAX 3.402823466E+38F %(common)s __global__ void spool_bprop_upsampling( const %(type)s* I, %(type)s* O, unsigned char* A, float alpha, float beta, int flags, int N, int W, int H, int D, int C, int WN, int HWN, int DHWN, int P, int Q, int magic_P, int shift_P, int QN, int PQN, int MPQN, int pad_c, int pad_d, int pad_h, int pad_w, int str_c, int str_d, int str_h, int str_w, int S, int RS, int RST, int JRST, int magic_S, int shift_S, int magic_RS, int shift_RS, int magic_RST, int shift_RST, int supP, int supQ, int shlP, int maskP, int shrP, int shlQ, int maskQ, int shrQ, int maskN, int shrN %(stats_args)s ) { extern __shared__ int lut[]; int tid = threadIdx.x; int q = blockIdx.x; int mp = blockIdx.y; int k = blockIdx.z; int m = mp * magic_P; m >>= shift_P; int p = mp - m*supP; // zigzag q back and forth to improve L2 cache perf if (p & 1) q = supQ - q - 1; // Superblock P and Q p = (p << shlP) + ((tid & maskP) >> shrP); q = (q << shlQ) + ((tid & maskQ) >> shrQ); int n = tid & maskN; int sb = tid >> shrN; int offset = k*MPQN + m*PQN + p*QN + mad16(q, N, n); I += n; O += offset; A += offset; float O_val = beta != 0.0f && p < P && q < Q && n < N ? %(cvt)s(__ldg(O)) : 0.0f; if (tid < 32) { int kj = k * str_c - pad_c; int mt = m * str_d - pad_d; int pr = p * str_h - pad_h; int qs = q * str_w - pad_w; int inc = min(maskN + 1, 32); int jrst = n; while (jrst < JRST) { int j = div16(jrst, magic_RST, shift_RST); int rst = mod16(jrst, j, RST); int t = div16(rst, magic_RS, shift_RS); int rs = mod16(rst, t, RS); int r = div16(rs, magic_S, shift_S); int s = mod16(rs, r, S); int x = qs + s; int y = pr + r; int z = mt + t; int c = kj + j; bool bounds_x = x >= 0 && x < W; bool bounds_y = y >= 0 && y < H; bool bounds_z = z >= 0 && z < D; bool bounds_c = c >= 0 && c < C; bool in_bounds = bounds_x && bounds_y && bounds_z && bounds_c; int sliceI = c*DHWN + z*HWN + y*WN + x*N; int lut_offset = mad16(sb, JRST, jrst); lut[lut_offset] = in_bounds ? sliceI : -1; jrst += inc; } } __syncthreads(); int intermediate_max = 0; if (p < P && q < Q && n < N) { int jrst = 0; int argmax = 0; float max = -FLT_MAX; while (jrst < JRST) { int lut_offset = mad16(sb, JRST, jrst); //int slice0 = lut[lut_offset + 0]; //int slice1 = lut[lut_offset + 1]; //int slice2 = lut[lut_offset + 2]; //int slice3 = lut[lut_offset + 3]; int slice = lut[lut_offset + *A - jrst]; // val needs to stay in fp32 or can't be se to FLT_MAX //float val0 = jrst + 0 < JRST && slice0 >= 0 ? %(cvt)s(__ldg(I + slice0)) : -FLT_MAX; //float val1 = jrst + 1 < JRST && slice1 >= 0 ? %(cvt)s(__ldg(I + slice1)) : -FLT_MAX; //float val2 = jrst + 2 < JRST && slice2 >= 0 ? %(cvt)s(__ldg(I + slice2)) : -FLT_MAX; //float val3 = jrst + 3 < JRST && slice3 >= 0 ? 
%(cvt)s(__ldg(I + slice3)) : -FLT_MAX; //if (*A == jrst + 0) { // max = val0; //} //if (*A == jrst + 1) { // max = val1; //} //if (*A == jrst + 2) { // max = val2; //} //if (*A == jrst + 3) { // max = val3; //} max = %(cvt)s(__ldg(I + slice)); jrst += 4; } // convert back to fp to write out %(type)s temp_out = %(cvt_out)s( %(mul_by_scale)s (max*alpha + O_val*beta)); if (!(flags & 1)) { *O = temp_out; } intermediate_max = max_abs(0, temp_out); // compute abs } intermediate_max += 0; %(atomic_max)s } """ template_vals = prepare_template_vals(clss, compute_capability) code = code % template_vals module = SourceModule(code) kernel = module.get_function("spool_bprop_upsampling") sig = "3P 2f 44I" + ("Pf" if (clss[0] == "x") else "") kernel.prepare(sig) return kernel
#define INDEX(a, b, yshape) (a)*(yshape) + (b) __global__ void matrixAddition(float *A,float *B, float alpha, float beta, int ydim) { unsigned int idx = threadIdx.x+(blockIdx.x*(blockDim.x*blockDim.y)); unsigned int a = idx/ydim; unsigned int b = idx%ydim; A[INDEX(a, b, ydim)] = alpha*A[INDEX(a, b, ydim)]+beta*B[INDEX(a, b, ydim)]; } """ ) matrixAddition = mod.get_function("matrixAddition") def matAdd(A, B, alpha, beta): forme1 = A.shape forme2 = B.shape if (forme1 != forme2): sys.exit('matrix dimensions differ') aSize = forme1[0] * forme1[1] xdim = np.int32(forme1[0]) ydim = np.int32(forme1[1]) A = np.reshape(A, aSize, order='F').astype(np.float32) B = np.reshape(B, aSize, order='F').astype(np.float32) alpha = np.float32(alpha)
// result must start off as a zero array __global__ void BatchMatColDotKernel(const int p, const int q, const float *W, const float *W_tilde, const int *batch_i, const int *batch_j, float *result) { int batch_index = blockIdx.y * blockDim.y + threadIdx.y; if (batch_index >= p) return; int col = blockIdx.x * blockDim.x + threadIdx.x; if (col >= q) return; int row_W = batch_i[batch_index]; int row_W_tilde = batch_j[batch_index]; atomicAdd(&result[batch_index], W[row_W * q + col] * W_tilde[row_W_tilde * q + col]); } """) batchMatSubtractInplace = mod.get_function("BatchMatSubtractInplaceKernel") batchVecSubtractInplace = mod.get_function("BatchVecSubtractInplaceKernel") batchMatVecRowMult = mod.get_function("BatchMatVecRowMultKernel") batchCopyVectorKernel = mod.get_function("BatchCopyVectorKernel") batchMatColDot = mod.get_function("BatchMatColDotKernel") def assertResultsClose(gpu_result, actual_result): assert np.allclose(gpu_result, actual_result), "GPU Result:\n" + repr(gpu_result) \ + "\nActual Result:\n" + repr(actual_result) + "\n" def testBatchCopyVectorKernel(): batch_size = 128 num_elems = 1000
def CudaNegative(inPath, outPath): totalT0 = time.clock() im = Image.open(inPath) px = numpy.array(im) px = px.astype(numpy.float32) getAndConvertT1 = time.clock() allocT0 = time.clock() d_px = cuda.mem_alloc(px.nbytes) cuda.memcpy_htod(d_px, px) allocT1 = time.clock() #Kernel declaration kernelT0 = time.clock() #Kernel grid and block size BLOCK_SIZE = 1024 block = (1024, 1, 1) checkSize = numpy.int32(im.size[0] * im.size[1]) grid = (int(im.size[0] * im.size[1] / BLOCK_SIZE) + 1, 1, 1) #Kernel text kernel = """ __global__ void ng( float *inIm, int check ){ int idx = (threadIdx.x ) + blockDim.x * blockIdx.x ; if(idx *3 < check*3) { inIm[idx*3]= 255-inIm[idx*3]; inIm[idx*3+1]= 255-inIm[idx*3+1]; inIm[idx*3+2]= 255-inIm[idx*3+2]; } } """ #Compile and get kernel function mod = SourceModule(kernel) func = mod.get_function("ng") func(d_px, checkSize, block=block, grid=grid) kernelT1 = time.clock() #Get back data from gpu backDataT0 = time.clock() ngPx = numpy.empty_like(px) cuda.memcpy_dtoh(ngPx, d_px) ngPx = (numpy.uint8(ngPx)) backDataT1 = time.clock() #Save image storeImageT0 = time.clock() pil_im = Image.fromarray(ngPx, mode="RGB") pil_im.save(outPath) totalT1 = time.clock() getAndConvertTime = getAndConvertT1 - totalT0 allocTime = allocT1 - allocT0 kernelTime = kernelT1 - kernelT0 backDataTime = backDataT1 - backDataT0 storeImageTime = totalT1 - storeImageT0 totalTime = totalT1 - totalT0 print "Negative image" print "Image size: ", im.size print "Time taken to get and convert image data: ", getAndConvertTime print "Time taken to allocate memory on the GPU: ", allocTime print "Kernel execution time: ", kernelTime print "Time taken to get image data from GPU and convert it: ", backDataTime print "Time taken to save the image: ", storeImageTime print "Total execution time: ", totalTime print
from gpu.gpuSimulation import useCachedKernels fluxCode = open( os.path.join(os.path.dirname(__file__), './fluxCalculations.cu'), 'r') try: # Put the kernel code into a SourceModule if useCachedKernels: fluxModule = SourceModule(fluxCode.read()) else: fluxModule = SourceModule(fluxCode.read(), cache_dir=False) fluxCode.close() # Create reference to the specific functions in the SourceModule FluxSolverFn = fluxModule.get_function("FluxSolver") BuildRFn = fluxModule.get_function("buildRValues") # Create callable functions def FluxSolver(FluxesGPU, UIntPtsGPU, BottomIntPtsGPU, propSpeedsGPU, m, n, blockDims, gridDims): FluxSolverFn(FluxesGPU, UIntPtsGPU, BottomIntPtsGPU, propSpeedsGPU, np.int32(m), np.int32(n), block=(blockDims[0], blockDims[1], 1), grid=(gridDims[0], gridDims[1]))
mod = SourceModule(""" surface<void, 2> surf; __global__ void kernel(int width, int height) { // Calculate surface coordinates unsigned int x = blockIdx.x * blockDim.x + threadIdx.x; unsigned int y = blockIdx.y * blockDim.y + threadIdx.y; if (x < 400 && y < 400) { float data = x / 400.f; // Write to output surface surf2Dwrite(data, surf, x*4, y); } } """) kernel_function = mod.get_function('kernel') surface_ref = mod.get_surfref('surf') # surface_ref.set_array(Density.ping_array,0) surface_ref.set_array(Density.ping_array) def Program(fragment): program = gloo.Program("vertex_passthrough.vert", fragment, count=4) program['Position'] = [(-1,-1), (-1,+1), (+1,-1), (+1,+1)] return program Density = Slab(GridWidth, GridHeight, 1, gl.GL_LINEAR) prog_visualize = Program("visualize.frag")
def _diameter_kernel():
    """Returns the CUDA kernel to estimate 3D diameter of structures in a
    3D local window.
    """
    diameter_kernel_src = """
    texture<float, cudaTextureType3D, cudaReadModeElementType> tex_data;

    __global__ void diameter3d (unsigned int width,
                                unsigned int height,
                                unsigned int depth,
                                const int n_points,
                                const float norm_factor,
                                const int max_iters,
                                const int n_scan_angles,
                                const int *X,
                                const int *Y,
                                const int *Z,
                                const float *scan_angl_arr,
                                const float *azth_data,
                                const float *lat_data,
                                float *radius_arr)
    {
        unsigned long blockId, idx;
        blockId = blockIdx.x + blockIdx.y * gridDim.x;
        idx = blockId * (blockDim.x * blockDim.y) +
              (threadIdx.y * blockDim.x) + threadIdx.x;

        if (idx >= n_points) {  // '>' would let the thread idx == n_points read past the arrays
            return;
        }

        float _x, _y, _z;
        _x = (float)X[idx] + 0.5;
        _y = (float)Y[idx] + 0.5;
        _z = (float)Z[idx] + 0.5;

        // -----------------------------------------
        // Find the diameter
        float azth = azth_data[idx];
        azth += M_PI_2;
        float lat = lat_data[idx];

        float cy = cosf(lat), sy = sinf(lat);
        float cz = cosf(azth), sz = sinf(azth);
        //float cz = -sinf(azth), sz = cosf(azth); // taking into account azth + pi/2

        //vector along fiber
        //float dx = -sy * sz;
        //float dy = cy * sz;
        //float dz = cy;

        // indices fixed: the original read uvec[3], one element past the end
        float uvec[3] = {0.0, 0.0, 1.0};
        float fiber_vector_x[3] = {0.0,
                                   uvec[1]*cy + uvec[2]*sy,
                                   -uvec[1]*sy + uvec[2]*cy};
        float fiber_vector_z[3] = {fiber_vector_x[0]*cz - fiber_vector_x[1]*sz,
                                   fiber_vector_x[0]*sz + fiber_vector_x[1]*cz,
                                   fiber_vector_x[2]};
        float dx = fiber_vector_z[0], dy = fiber_vector_z[1], dz = fiber_vector_z[2];

        //scan vector perpendicular to a fiber vector (rotation X -> Z)
        float scan_vec[3] = {0, 1, 0}; // unit vector perpendicular to default (0,0,1) direction
        float rot_scan_vec_x[3] = {0.0,
                                   scan_vec[1]*cy + scan_vec[2]*sy,
                                   -scan_vec[1]*sy + scan_vec[2]*cy};
        float rot_scan_vec_z[3] = {rot_scan_vec_x[0]*cz - rot_scan_vec_x[1]*sz,
                                   rot_scan_vec_x[0]*sz + rot_scan_vec_x[1]*cz,
                                   rot_scan_vec_x[2]};

        float out_radius = 0;
        for (int scan_angl_idx = 0; scan_angl_idx < n_scan_angles; scan_angl_idx++) {
            float theta = scan_angl_arr[scan_angl_idx];
            float ct = cosf(theta), st = sinf(theta);

            float x = rot_scan_vec_z[0], y = rot_scan_vec_z[1], z = rot_scan_vec_z[2];
            float u = dx, v = dy, w = dz;

            //rotation of point (x,y,z) around axis (u,v,w)
            float scan_vec_coords[3] = {u*(u*x + v*y + w*z)*(1.0f - ct) + x*ct + (-w*y + v*z)*st,
                                        v*(u*x + v*y + w*z)*(1.0f - ct) + y*ct + (w*x - u*z)*st,
                                        w*(u*x + v*y + w*z)*(1.0f - ct) + z*ct + (-v*x + u*y)*st};

            float nc[3] = {_x, _y, _z};
            // initialized so a scan that never leaves the structure adds zero
            // radius instead of reading an uninitialized p
            float p[3] = {_x, _y, _z};

            for (int i = 0; i < max_iters; i++) {
                nc[0] += scan_vec_coords[0];
                nc[1] += scan_vec_coords[1];
                nc[2] += scan_vec_coords[2];

                if (tex3D(tex_data, nc[0], nc[1], nc[2]) == 0) {
                    p[0] = nc[0];
                    p[1] = nc[1];
                    p[2] = nc[2];
                    break;
                }
            }

            out_radius += norm3df(p[0] - _x, p[1] - _y, p[2] - _z);
        }

        radius_arr[idx] = out_radius * norm_factor;
    }
    """
    dm_program = SourceModule(diameter_kernel_src)
    diameter3d = dm_program.get_function("diameter3d")
    return dm_program, diameter3d
ey_gpu = cuda.to_device(f) ez_gpu = cuda.to_device(f) hx_gpu = cuda.to_device(f) hy_gpu = cuda.to_device(f) hz_gpu = cuda.to_device(f) cex_gpu = cuda.to_device(set_c(cf, (None, -1, -1))) cey_gpu = cuda.to_device(set_c(cf, (-1, None, -1))) cez_gpu = cuda.to_device(set_c(cf, (-1, -1, None))) # prepare kernels from pycuda.compiler import SourceModule mod = SourceModule( kernels.replace('TPB', str(tpb)).replace('nyz', str(ny * nz)).replace( 'nx', str(nx)).replace('ny', str(ny)).replace('nz', str(nz))) update_h = mod.get_function("update_h") update_e = mod.get_function("update_e") update_src = mod.get_function("update_src") thx = mod.get_texref("thx") thy = mod.get_texref("thy") thz = mod.get_texref("thz") tcex = mod.get_texref("tcex") tcey = mod.get_texref("tcey") tcez = mod.get_texref("tcez") thx.set_address(hx_gpu, f.nbytes) thy.set_address(hy_gpu, f.nbytes) thz.set_address(hz_gpu, f.nbytes) tcex.set_address(cex_gpu, cf.nbytes) tcey.set_address(cey_gpu, cf.nbytes) tcez.set_address(cez_gpu, cf.nbytes)
def _build_kernel_float32(): code = """ __device__ float weight(float B, float A, float div1) { float tmp = B - A; return exp( - tmp * tmp * div1 ); } __global__ void Funktion(int axis, int segment, float *raw, int *slices, float *a, int xsh, int ysh, int zsh, int *indices, int sorw, float *Beta, int nbrw) { int col_g = blockIdx.x * blockDim.x + threadIdx.x; int row_g = blockIdx.y * blockDim.y + threadIdx.y; int slc_g = blockIdx.z; int xsh_g, ysh_g, plane, row, column; if (axis == 0) { plane = indices[slc_g]; row = row_g; column = col_g; xsh_g = xsh; ysh_g = ysh; } else if (axis == 1) { row = indices[slc_g]; plane = row_g; column = col_g; xsh_g = xsh; ysh_g = zsh; } else if (axis == 2) { column = indices[slc_g]; plane = row_g; row = col_g; xsh_g = ysh; ysh_g = zsh; } int flat = xsh * ysh; int flat_g = xsh_g * ysh_g; unsigned int index = slc_g * flat_g + row_g * xsh_g + col_g; if (index<gridDim.z*flat_g && plane>0 && plane<zsh-1 && row>0 && row<ysh-1 && column>0 && column<xsh-1) { if (slices[index]==segment) { /* Adaptive random walks */ int found = 0; if ((col_g + row_g) % 4 == 0) { found = 1; } else { for (int y = -100; y < 101; y++) { for (int x = -100; x < 101; x++) { if (row_g+y > 0 && col_g+x > 0 && row_g+y < ysh_g-1 && col_g+x < xsh_g-1) { unsigned int tmp = slc_g * flat_g + (row_g + y) * xsh_g + (col_g + x); if (slices[tmp] != segment && slices[tmp] != -1) { found = 1; } } } } } if (found == 1) { float rand; float W0,W1,W2,W3,W4,W5; int n,o,p; /* Initialize MRG32k3a */ float norm = 2.328306549295728e-10; float m1 = 4294967087.0; float m2 = 4294944443.0; float a12 = 1403580.0; float a13n = 810728.0; float a21 = 527612.0; float a23n = 1370589.0; long k1; float p1, p2; float s10 = index, s11 = index, s12 = index, s20 = index, s21 = index, s22 = index; /* Compute standard deviation */ unsigned int position = plane*flat + row*xsh + column; float B = raw[position]; float var = Beta[index]; float div1 = 1 / (2 * var); int k = plane; int l = row; int m = column; int step = 0; int n_rw = 0; /* Compute random walks */ while (n_rw < nbrw) { /* Compute weights */ W0 = weight(B, raw[position + flat], div1); W1 = weight(B, raw[position - flat], div1); W2 = weight(B, raw[position + xsh], div1); W3 = weight(B, raw[position - xsh], div1); W4 = weight(B, raw[position + 1], div1); W5 = weight(B, raw[position - 1], div1); W1 += W0; W2 += W1; W3 += W2; W4 += W3; W5 += W4; /* Compute random numbers with MRG32k3a */ /* Component 1 */ p1 = a12 * s11 - a13n * s10; k1 = p1 / m1; p1 -= k1 * m1; if (p1 < 0.0){ p1 += m1;} s10 = s11; s11 = s12; s12 = p1; /* Component 2 */ p2 = a21 * s22 - a23n * s20; k1 = p2 / m2; p2 -= k1 * m2; if (p2 < 0.0){ p2 += m2;} s20 = s21; s21 = s22; s22 = p2; /* Combination */ if (p1 <= p2) { rand = W5 * ((p1 - p2 + m1) * norm); } else { rand = W5 * ((p1 - p2) * norm); } /* Determine new direction of random walk */ if (rand<W0 || rand==0){n=1; o=0; p=0;} else if (rand>=W0 && rand<W1){n=-1; o=0; p=0;} else if (rand>=W1 && rand<W2){n=0; o=1; p=0;} else if (rand>=W2 && rand<W3){n=0; o=-1; p=0;} else if (rand>=W3 && rand<W4){n=0; o=0; p=1;} else if (rand>=W4 && rand<=W5){n=0; o=0; p=-1;} /* Move in new direction */ if (k+n>0 && k+n<zsh-1 && l+o>0 && l+o<ysh-1 && m+p>0 && m+p<xsh-1) { k += n; l += o; m += p; position = k*flat + l*xsh + m; atomicAdd(&a[position], 1); } step += 1; if (step==sorw) { k = plane; l = row; m = column; position = k*flat + l*xsh + m; n_rw += 1; step = 0; } } } } } } """ mod = SourceModule(code) kernel = mod.get_function("Funktion") return kernel
mcopy = cuda.Memcpy3D() mcopy.width_in_bytes = mcopy.src_pitch = f.strides[1] mcopy.src_height = mcopy.height = ny mcopy.depth = nz memcopy(mcopy, set_c(f,(None,-1,-1)), cex_gpu) memcopy(mcopy, set_c(f,(-1,None,-1)), cey_gpu) memcopy(mcopy, set_c(f,(-1,-1,None)), cez_gpu) memcopy(mcopy, set_c(f,(None,0,0)), chx_gpu) memcopy(mcopy, set_c(f,(0,None,0)), chy_gpu) memcopy(mcopy, set_c(f,(0,0,None)), chz_gpu) # prepare kernels from pycuda.compiler import SourceModule mod = SourceModule(kernels) update_e = mod.get_function("update_e") update_h = mod.get_function("update_h") update_src = mod.get_function("update_src") tex = mod.get_texref("tex") tey = mod.get_texref("tey") tez = mod.get_texref("tez") thx = mod.get_texref("thx") thy = mod.get_texref("thy") thz = mod.get_texref("thz") tcex = mod.get_texref("tcex") tcey = mod.get_texref("tcey") tcez = mod.get_texref("tcez") tchx = mod.get_texref("tchx") tchy = mod.get_texref("tchy") tchz = mod.get_texref("tchz")
def CudaBrightness(inPath, outPath): totalT0 = time.clock() im = Image.open(inPath) px = numpy.array(im) px = px.astype(numpy.float32) getAndConvertT1 = time.clock() allocT0 = time.clock() d_px = cuda.mem_alloc(px.nbytes) cuda.memcpy_htod(d_px, px) allocT1 = time.clock() #Kernel declaration kernelT0 = time.clock() #Kernel grid and block size BLOCK_SIZE = 1024 block = (1024, 1, 1) checkSize = numpy.int32(im.size[0] * im.size[1]) grid = (int(im.size[0] * im.size[1] / BLOCK_SIZE) + 1, 1, 1) #Kernel text kernel = """ __global__ void br( float *inIm, int check, int brightness ){ int idx = (threadIdx.x ) + blockDim.x * blockIdx.x ; if(idx *3 < check*3) { if(inIm[idx*3]+brightness > 255) inIm[idx*3] = 255; else inIm[idx*3]= inIm[idx*3]+brightness; if(inIm[idx*3+1]+brightness > 255) inIm[idx*3+1] = 255; else inIm[idx*3+1]= inIm[idx*3+1]+brightness; if(inIm[idx*3+2]+brightness > 255) inIm[idx*3+2] = 255; else inIm[idx*3+2]= inIm[idx*3+2]+brightness; } } """ brightness = int( raw_input("Enter the level of brightness (-255 to 255): ")) print if brightness > 255: brightness = 255 if brightness < -255: brightness = -255 brightness = numpy.int32(brightness) #Compile and get kernel function mod = SourceModule(kernel) func = mod.get_function("br") func(d_px, checkSize, brightness, block=block, grid=grid) kernelT1 = time.clock() #Get back data from gpu backDataT0 = time.clock() brPx = numpy.empty_like(px) cuda.memcpy_dtoh(brPx, d_px) brPx = (numpy.uint8(brPx)) backDataT1 = time.clock() #Save image storeImageT0 = time.clock() pil_im = Image.fromarray(brPx, mode="RGB") pil_im.save(outPath) totalT1 = time.clock() getAndConvertTime = getAndConvertT1 - totalT0 allocTime = allocT1 - allocT0 kernelTime = kernelT1 - kernelT0 backDataTime = backDataT1 - backDataT0 storeImageTime = totalT1 - storeImageT0 totalTime = totalT1 - totalT0 print "Brightness filter" print "Image size : ", im.size print "Time taken to get and convert image data: ", getAndConvertTime print "Time taken to allocate memory on the GPU: ", allocTime print "Kernel execution time: ", kernelTime print "Time taken to get image data from GPU and convert it: ", backDataTime print "Time taken to save the image: ", storeImageTime print "Total execution time: ", totalTime print
import pycuda.gpuarray as gpuarray import pycuda.driver as drv import pycuda.autoinit from pycuda.compiler import SourceModule import numpy # Kernel function mod = SourceModule(""" __global__ void square(float *d_a) { int idx = threadIdx.x + threadIdx.y*5; d_a[idx] = d_a[idx]*d_a[idx]; } """) square = mod.get_function("square") # ---------------Using mem_alloc--------------- # start = drv.Event() end = drv.Event() h_a = numpy.random.randint(1, 5, (5, 5)) h_a = h_a.astype(numpy.float32) h_b = h_a.copy() start.record() d_a = drv.mem_alloc(h_a.size * h_a.dtype.itemsize) drv.memcpy_htod(d_a, h_a) # Calling kernel square(d_a, block=(5, 5, 1), grid=(1, 1), shared=0) h_result = numpy.empty_like(h_a) drv.memcpy_dtoh(h_result, d_a) end.record()
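# Completing the timing pattern above: after end.record(), synchronize on
# the event and read the elapsed time in milliseconds with Event.time_till.
end.synchronize()
print("mem_alloc path took %.3f ms" % start.time_till(end))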
def CudaColor(inPath, outPath): totalT0 = time.clock() im = Image.open(inPath) px = numpy.array(im) px = px.astype(numpy.float32) getAndConvertT1 = time.clock() allocT0 = time.clock() d_px = cuda.mem_alloc(px.nbytes) cuda.memcpy_htod(d_px, px) allocT1 = time.clock() #Kernel declaration kernelT0 = time.clock() #Kernel grid and block size BLOCK_SIZE = 1024 block = (1024, 1, 1) checkSize = numpy.int32(im.size[0] * im.size[1]) grid = (int(im.size[0] * im.size[1] / BLOCK_SIZE) + 1, 1, 1) #Kernel text kernel = """ __global__ void co( float *inIm, int check, int color){ int idx = (threadIdx.x ) + blockDim.x * blockIdx.x ; if(idx*3 < check*3) { if(color == 0) { inIm[idx*3+1] = inIm[idx*3+1]-255; inIm[idx*3+2] = inIm[idx*3+2]-255; } else if(color == 1) { inIm[idx*3] = inIm[idx*3]-255; inIm[idx*3+2] = inIm[idx*3+2]-255; } else if(color == 2) { inIm[idx*3] = inIm[idx*3]-255; inIm[idx*3+1] = inIm[idx*3+1]-255; } if(inIm[idx*3] < 0) inIm[idx*3] = 0; if(inIm[idx*3] > 255) inIm[idx*3] = 255; if(inIm[idx*3+1] < 0) inIm[idx*3+1] = 0; if(inIm[idx*3+1] > 255) inIm[idx*3+1] = 255; if(inIm[idx*3+2] < 0) inIm[idx*3+2] = 0; if(inIm[idx*3+2] > 255) inIm[idx*3+2] = 255; } } """ color = int( raw_input("Enter the color of the filter (0-Red;1-Green;2-Blue): ")) print color = numpy.int32(color) #Compile and get kernel function mod = SourceModule(kernel) func = mod.get_function("co") func(d_px, checkSize, color, block=block, grid=grid) kernelT1 = time.clock() #Get back data from gpu backDataT0 = time.clock() coPx = numpy.empty_like(px) cuda.memcpy_dtoh(coPx, d_px) coPx = (numpy.uint8(coPx)) backDataT1 = time.clock() #Save image storeImageT0 = time.clock() pil_im = Image.fromarray(coPx, mode="RGB") pil_im.save(outPath) totalT1 = time.clock() getAndConvertTime = getAndConvertT1 - totalT0 allocTime = allocT1 - allocT0 kernelTime = kernelT1 - kernelT0 backDataTime = backDataT1 - backDataT0 storeImageTime = totalT1 - storeImageT0 totalTime = totalT1 - totalT0 print "Color Filter" print "Image size : ", im.size print "Time taken to get and convert image data: ", getAndConvertTime print "Time taken to allocate memory on the GPU: ", allocTime print "Kernel execution time: ", kernelTime print "Time taken to get image data from GPU and convert it: ", backDataTime print "Time taken to save the image: ", storeImageTime print "Total execution time: ", totalTime print
z_re2 = z_re*z_re; z_im = 2 * z_re * z_im + c_im; z_re = z_re2 - z_im2 + c_re; } } if(z_im2 + z_re2 > 1000){ b[gid] = log(2 * cabs(z_re, z_im)*log(cabs(z_re, z_im))/cabs(dz_re, dz_im)); } else { b[gid] = 0; } } """ mod = SourceModule(KERNELS) exterior_distance_gpu = mod.get_function('exterior_distance') def exterior_distances(im, re): b = np.zeros_like(im) exterior_distance_gpu(cuda.In(im), cuda.In(re), cuda.InOut(b), grid=(b.size // 1024, 1, 1), block=(1024, 1, 1)) return b def process_and_store_image(image, re, im, zoom): image = cv2.GaussianBlur(image, (3, 3), cv2.BORDER_DEFAULT) image = cv2.resize(image, (16 * 256, 16 * 256),
} } } """ try: Context.get_device() except: import pycuda.autoinit mod = SourceModule(krnl, no_extern_c=True) _gpu_expm = mod.get_function("expm") def gpu_expm(As, Ts_vectorized, p=12): N = len(As) if Ts_vectorized.ndim != 2 or Ts_vectorized.shape[1] != 6: raise ValueError(Ts_vectorized.shape) threadsPerBlock = 512 nBlocks = int(np.ceil(float(N) / float(threadsPerBlock))) _gpu_expm(As.gpu, Ts_vectorized.gpu, np.int32(N), np.int32(p), grid=(nBlocks, 1, 1),
{ meshU[row*n*3 + col*3] = meshU[row*n*3 + 2*3]; meshU[row*n*3 + col*3 + 1] = meshU[m*n*3 + 2*3 + 1]; meshU[row*n*3 + col*3 + 2] = meshU[m*n*3 + 2*3 + 2]; } else if (col > n-3 && col < n && row < m) { meshU[row*n*3 + col*3] = meshU[row*n*3 + (n-3)*3]; meshU[row*n*3 + col*3 + 1] = meshU[m*n*3 + (n-3)*3 + 1]; meshU[row*n*3 + col*3 + 2] = meshU[m*n*3 + (n-3)*3 + 2]; } } """) wallBoundaries = boundaryConditionsModule.get_function("applyWallBoundaries") openBoundaries = boundaryConditionsModule.get_function("applyOpenBoundaries") def applyWallBoundaries(meshUGPU, m, n, blockDims, gridDims): wallBoundaries(meshUGPU, np.int32(m), np.int32(n), block=(blockDims[0], blockDims[1], 1), grid=(gridDims[0], gridDims[1])) def applyWallBoundariesTimed(meshUGPU, m, n, blockDims, gridDims): return wallBoundaries(meshUGPU, np.int32(m), np.int32(n), block=(blockDims[0], blockDims[1], 1), grid=(gridDims[0], gridDims[1]), time_kernel=True) def applyOpenBoundaries(meshUGPU, m, n, blockDims, gridDims):
def build_sparse_transition_model_at_T(T, T_gpu, vel_data_gpu, params, bDimx, params_gpu, xs_gpu, ys_gpu, ac_angles, results, sumR_sa, save_file_for_each_a=False): gsize = int(params[0]) num_actions = int(params[1]) nrzns = int(params[2]) all_u_mat_gpu, all_v_mat_gpu, all_ui_mat_gpu, all_vi_mat_gpu, all_Yi_gpu = vel_data_gpu results_gpu_list = [] sumR_sa_gpu_list = [] for i in range(num_actions): results_gpu_list.append(cuda.mem_alloc(results.nbytes)) sumR_sa_gpu_list.append(cuda.mem_alloc(sumR_sa.nbytes)) for i in range(num_actions): cuda.memcpy_htod(results_gpu_list[i], results) cuda.memcpy_htod(sumR_sa_gpu_list[i], sumR_sa) print("alloted mem in inner func") # let one thread access a state centre. access coresponding velocities, run all actions # TODO: dt may not be int for a genral purpose code mod = SourceModule(""" __device__ int32_t get_thread_idx() // assigns idx to thread with which it accesses the flattened 3d vxrzns matrix // for a given T and a given action. // runs for both 2d and 3d grid // TODO: may have to change this considering cache locality { // here i, j, k refer to a general matrix M[i][j][k] int32_t i = threadIdx.x; int32_t j = blockIdx.y; int32_t k = blockIdx.x; int32_t idx = k + (j*gridDim.x) + (i*gridDim.x*gridDim.y)+ blockIdx.z*blockDim.x*gridDim.x*gridDim.y; return idx; } __device__ int32_t state1D_from_thread(int32_t T) { // j ~ blockIdx.x // i ~ blockIdx.y // The above three consitute a spatial state index from i and j of grid // last term is for including time index as well. return (blockIdx.x + (blockIdx.y*gridDim.x) + (T*gridDim.x*gridDim.y) ); } __device__ int32_t state1D_from_ij(int32_t* posid, int32_t T) { // posid = {i , j} // state id = j + i*dim(i) + T*dim(i)*dim(j) return (posid[1] + posid[0]*gridDim.x + (T*gridDim.x*gridDim.y) ) ; } __device__ bool is_edge_state(int32_t i, int32_t j) { // n = gsize -1 that is the last index of the domain assuming square domain int32_t n = gridDim.x - 1; if (i == 0 || i == n || j == 0 || j == n ) { return true; } else return false; } __device__ bool is_terminal(int32_t i, int32_t j, float* params) { int32_t i_term = params[8]; // terminal state indices int32_t j_term = params[9]; if(i == i_term && j == j_term) { return true; } else return false; } __device__ bool my_isnan(int s) { // By IEEE 754 rule, NaN is not equal to NaN return s != s; } __device__ void get_xypos_from_ij(int32_t i, int32_t j, float* xs, float* ys, float* x, float* y) { *x = xs[j]; *y = ys[gridDim.x - 1 - i]; return; } __device__ float get_angle_in_0_2pi(float theta) { float f_pi = 3.141592; if (theta < 0) { return theta + (2*f_pi); } else { return theta; } } __device__ float calculate_reward_const_dt(float* xs, float* ys, int32_t i_old, int32_t j_old, float xold, float yold, int32_t* newposids, float* params, float vnet_x, float vnet_y ) { // xold and yold are centre of old state (i_old, j_old) float dt = params[4]; float r1, r2, theta1, theta2, theta, h; float dt_new; float xnew, ynew; if (newposids[0] == i_old && newposids[1] == j_old) { dt_new = dt; } else { get_xypos_from_ij(newposids[0], newposids[1], xs, ys, &xnew, &ynew); //get centre of new states h = sqrtf((xnew - xold)*(xnew - xold) + (ynew - yold)*(ynew - yold)); r1 = h/(sqrtf((vnet_x*vnet_x) + (vnet_y*vnet_y))); theta1 = get_angle_in_0_2pi(atan2f(vnet_y, vnet_x)); theta2 = get_angle_in_0_2pi(atan2f(ynew - yold, xnew - xold)); theta = fabsf(theta1 -theta2); r2 = fabsf(sinf(theta)); dt_new = r1 + r2; if (threadIdx.x == 0 && blockIdx.z == 0 && blockIdx.x == 1 && blockIdx.y == 1) { 
                params[24] = r1;
                params[25] = r2;
            }
        }
        return -dt_new;
    }

    __device__ void move(float ac_angle, float vx, float vy, float* xs, float* ys,
                         int32_t* posids, float* params, float* r)
    {
        int32_t n = params[0] - 1;   // gsize - 1
        // int32_t num_actions = params[1];
        // int32_t nrzns = params[2];
        float F = params[3];
        float dt = params[4];
        float r_outbound = params[5];
        float r_terminal = params[6];
        float Dj = fabsf(xs[1] - xs[0]);
        float Di = fabsf(ys[1] - ys[0]);
        float r_step = 0;
        *r = 0;
        int32_t i0 = posids[0];
        int32_t j0 = posids[1];
        float vnetx = F*cosf(ac_angle) + vx;
        float vnety = F*sinf(ac_angle) + vy;
        float x, y;
        get_xypos_from_ij(i0, j0, xs, ys, &x, &y);   // x, y hold the centre of state (i0, j0)
        float xnew = x + (vnetx * dt);
        float ynew = y + (vnety * dt);

        // Debug checks. TODO: remove checks once verified.
        if (threadIdx.x == 0 && blockIdx.z == 0 && blockIdx.x == 1 && blockIdx.y == 1)
        {
            params[12] = x;
            params[13] = y;
            params[14] = vnetx;
            params[15] = vnety;
            params[16] = xnew;
            params[17] = ynew;
            params[18] = ac_angle;
        }

        // Clamp to the domain and penalize leaving it.
        if (xnew > xs[n])
        {
            xnew = xs[n];
            *r += r_outbound;
        }
        else if (xnew < xs[0])
        {
            xnew = xs[0];
            *r += r_outbound;
        }
        if (ynew > ys[n])
        {
            ynew = ys[n];
            *r += r_outbound;
        }
        else if (ynew < ys[0])
        {
            ynew = ys[0];
            *r += r_outbound;
        }

        // TODO (done): checked logic w.r.t. remainderf; remquof had issues.
        int32_t xind, yind;
        //float remx = remquof((xnew - xs[0]), Dj, &xind);
        //float remy = remquof(-(ynew - ys[n]), Di, &yind);
        float remx = remainderf((xnew - xs[0]), Dj);
        float remy = remainderf(-(ynew - ys[n]), Di);
        xind = ((xnew - xs[0]) - remx)/Dj;
        yind = (-(ynew - ys[n]) - remy)/Di;
        if ((remx >= 0.5 * Dj) && (remy >= 0.5 * Di))
        {
            xind += 1;
            yind += 1;
        }
        else if ((remx >= 0.5 * Dj && remy < 0.5 * Di))
        {
            xind += 1;
        }
        else if ((remx < 0.5 * Dj && remy >= 0.5 * Di))
        {
            yind += 1;
        }

        if (!(my_isnan(xind) || my_isnan(yind)))
        {
            posids[0] = yind;
            posids[1] = xind;
            if (is_edge_state(posids[0], posids[1]))
            {
                *r += r_outbound;
            }
            if (threadIdx.x == 0 && blockIdx.z == 0 && blockIdx.x == 1 && blockIdx.y == 1)
            {
                params[26] = 9999;
            }
        }

        r_step = calculate_reward_const_dt(xs, ys, i0, j0, x, y, posids, params, vnetx, vnety);
        //TODO: change back to normal when needed
        //r_step = -dt;
        *r += r_step;   //TODO: numerical check remaining
        if (is_terminal(posids[0], posids[1], params))
        {
            *r += r_terminal;
        }

        if (threadIdx.x == 0 && blockIdx.z == 0 && blockIdx.x == 1 && blockIdx.y == 1)
        {
            params[19] = xnew;
            params[20] = ynew;
            params[21] = yind;
            params[22] = xind;
            params[23] = *r;
        }
    }

    __device__ void extract_velocity(float* vx, float* vy, int32_t T,
                                     float* all_u_mat, float* all_v_mat,
                                     float* all_ui_mat, float* all_vi_mat,
                                     float* all_Yi, float* params)
    {
        int32_t nrzns = params[2];
        int32_t nmodes = params[7];
        int32_t sp_uvi, str_uvi, sp_Yi, str_Yi;   // start points and strides for accessing all_ui_mat, all_vi_mat and all_Yi
        float sum_x = 0;
        float sum_y = 0;
        float vx_mean, vy_mean;
        // Thread index; also used to access the resultant vxrzns[nrzns, gsize, gsize].
        int32_t idx = get_thread_idx();
        // rzn index, identifying which of the 5k realizations this is; used to access all_Yi.
        int32_t rzn_id = (blockIdx.z * blockDim.x) + threadIdx.x;
        // mean_id is the index used to access the flattened all_u_mat[t,i,j].
        int32_t mean_id = state1D_from_thread(T);
        // To access all_ui_mat and all_vi_mat:
        str_uvi = gridDim.x * gridDim.y;
        sp_uvi = (T * nmodes * str_uvi) + (gridDim.x * blockIdx.y) + (blockIdx.x);
        // To access all_Yi:
        sp_Yi = (T * nrzns * nmodes) + (rzn_id * nmodes);

        vx_mean = all_u_mat[mean_id];
        for(int i = 0; i < nmodes; i++)
        {
            sum_x += all_ui_mat[sp_uvi + (i*str_uvi)]*all_Yi[sp_Yi + i];
        }
        vy_mean = all_v_mat[mean_id];
        for(int i = 0; i < nmodes; i++)
        {
            sum_y += all_vi_mat[sp_uvi + (i*str_uvi)]*all_Yi[sp_Yi + i];
        }

        *vx = vx_mean + sum_x;
        *vy = vy_mean + sum_y;
        return;
    }

    // test: changed from float* to float ac_angle
    __global__ void transition_calc(float* T_arr, float* all_u_mat, float* all_v_mat,
                                    float* all_ui_mat, float* all_vi_mat, float* all_Yi,
                                    float ac_angle, float* xs, float* ys, float* params,
                                    float* sumR_sa, float* results)
        // results directions - 1: along S2; 2: along S1; 3: along columns towards count
    {
        int32_t gsize = params[0];        // grid size along one direction, ASSUMING a square grid
        int32_t num_actions = params[1];
        int32_t nrzns = params[2];
        float F = params[3];
        float dt = params[4];
        float r_outbound = params[5];
        float r_terminal = params[6];
        int32_t nmodes = params[7];
        int32_t i_term = params[8];       // terminal state indices
        int32_t j_term = params[9];
        int32_t nT = params[10];
        int32_t is_stationary = params[11];
        int32_t T = (int32_t)T_arr[0];
        int32_t idx = get_thread_idx();

        float vx, vy;
        if(idx < gridDim.x*gridDim.y*nrzns)
        {
            int32_t posids[2] = {blockIdx.y, blockIdx.x};   // i and j of S1; after move() these hold i and j of S2
            int32_t sp_id;                                  // space id: S1 % (gsize*gsize)
            float r;                                        // immediate reward
            extract_velocity(&vx, &vy, T, all_u_mat, all_v_mat, all_ui_mat, all_vi_mat, all_Yi, params);
            move(ac_angle, vx, vy, xs, ys, posids, params, &r);
            int32_t S1, S2;
            if (is_stationary == 1)
            {
                T = 0;
                S1 = state1D_from_thread(T);     // initial state number for this thread
                S2 = state1D_from_ij(posids, T); // successor state number from posids
            }
            else
            {
                S1 = state1D_from_thread(T);       // initial state number for this thread
                S2 = state1D_from_ij(posids, T+1); // successor state number from posids at timestep T+1
            }
            // NOTE: sp_id must be set on both branches; the original only set it
            // in the non-stationary case, leaving it uninitialized otherwise.
            sp_id = S1%(gsize*gsize);
            // Accumulate into sumR_sa;
            // this array will later be divided by nrzns to get the average.
            atomicAdd(&sumR_sa[sp_id], r);   // TODO: try a reduction if this is slow overall
            results[idx] = S2;
            __syncthreads();
            /* if (threadIdx.x == 0 && blockIdx.z == 0)
            {
                sumR_sa[S1] = sumR_sa[S1]/nrzns;
                // TODO: rename to R_sa, since we are no longer storing the sum
            } */
        } // if ends
        return;
    }
    """)

    # sumR_sa2 = np.empty_like(sumR_sa, dtype=np.float32)
    # cuda.memcpy_dtoh(sumR_sa2, sumR_sa_gpu)
    # print("sumR_sa", sumR_sa)
    # print("sumR_sa", sumR_sa2[0:10001])
    # T = np.array(T64, dtype=np.float32)
    params2 = np.empty_like(params).astype(np.float32)
    func = mod.get_function("transition_calc")
    for i in range(num_actions):
        print('T', T, " call kernel for action: ", i)
        func(T_gpu, all_u_mat_gpu, all_v_mat_gpu, all_ui_mat_gpu, all_vi_mat_gpu,
             all_Yi_gpu, ac_angles[i], xs_gpu, ys_gpu, params_gpu,
             sumR_sa_gpu_list[i], results_gpu_list[i],
             block=(bDimx, 1, 1), grid=(gsize, gsize, (nrzns // bDimx) + 1))
        if i == 0:
            cuda.memcpy_dtoh(params2, params_gpu)
            print("params check:")
            print('\nangle= ', params2[18],
                  '\nx =', params2[12],
                  '\ny =', params2[13],
                  '\nvnetx = ', params2[14],
                  '\nvnety =', params2[15],
                  '\nxnew =', params2[16],
                  '\nynew =', params2[17],
                  '\nxnewupd =', params2[19],
                  '\nynewupd =', params2[20],
                  '\nyind i=', params2[21],
                  '\nxind j=', params2[22],
                  '\nr- =', params2[23],
                  '\nr1+ =', params2[24],
                  '\nr2+ =', params2[25],
                  '\nenter_isnan =', params2[26])

    results2_list = []
    sum_Rsa2_list = []
    for i in range(num_actions):
        results2_list.append(np.empty_like(results))
        sum_Rsa2_list.append(np.empty_like(sumR_sa))
    # SYNCHRONISATION - pycuda does it implicitly.
    for i in range(num_actions):
        cuda.memcpy_dtoh(results2_list[i], results_gpu_list[i])
        cuda.memcpy_dtoh(sum_Rsa2_list[i], sumR_sa_gpu_list[i])
        print("memcpy_dtoh for action: ", i)
    for i in range(num_actions):
        sum_Rsa2_list[i] = sum_Rsa2_list[i] / nrzns

    # print("sumR_sa2\n", sumR_sa2, "\n\n")
    # print("results_a0\n", results2_list[0].T[50::int(gsize**2)])
    print("OK REACHED END OF cuda relevant CODE\n")

    # Make a list of inputs, one element per action, and run get_COO_ in parallel
    # for each action. If save_file_for_each_a is True, each file is named appropriately.
    if save_file_for_each_a:
        f1 = 'COO_Highway2D_T' + str(T) + '_a'
        f3 = '_of_' + str(num_actions) + 'A.npy'
        inputs = [(results2_list[i], nrzns, T, f1 + str(i) + f3) for i in range(num_actions)]
    else:
        inputs = [(results2_list[i], nrzns, T, None) for i in range(num_actions)]
    # coo_list_a is a list of COO matrices, one per action, for the given timestep.
    with Pool(num_actions) as p:
        coo_list_a = p.starmap(get_COO_, inputs)
    # print("coo print\n", coo.T[4880:4900, :])
    print("\n\n")
    # print("time taken by cuda compute and transfer\n", (t2 - t1) / 60)
    # print("time taken for post processing to coo on cpu\n", (t3 - t2) / 60)
    return coo_list_a, sum_Rsa2_list
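# For context, a minimal sketch of what a get_COO_-style helper might do with
# the kernel output: count (S1, S2) pairs across realizations and normalize by
# nrzns to get transition probabilities. The real helper is defined elsewhere;
# the layout assumption below (realization-major, gsize*gsize states per
# realization, no padding) is inferred from get_thread_idx() and may not match
# it exactly.
import numpy as np

def get_COO_sketch(results, nrzns, T, save_path=None):
    ns = results.size // nrzns                     # states per realization (assumes no padding)
    s2 = results.reshape(nrzns, ns).astype(np.int64).ravel()
    s1 = np.tile(np.arange(ns), nrzns)             # source state for every entry
    pairs, counts = np.unique(np.stack([s1, s2]), axis=1, return_counts=True)
    coo = np.vstack([pairs, counts / float(nrzns)])  # rows: S1, S2, probability
    if save_path is not None:
        np.save(save_path, coo)
    return coo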
x2[i] = 0.5*mu2eps*(s2[i+1]+s2[i-1]) + (1-mu2eps)*s2[i]; } else if (i == 0){ x1[i] = mu1eps*s1[i+1] + (1-mu1eps)*s1[i]; x2[i] = mu2eps*s2[i+1] + (1-mu2eps)*s2[i]; } else{ x1[i] = mu1eps*s1[i-1] + (1-mu1eps)*s1[i]; x2[i] = mu2eps*s2[i-1] + (1-mu2eps)*s2[i]; } picture[i] = x1[i]/(x1[i] + x2[i]); } """) fitness = mod.get_function("fitness") diffuse_display = mod.get_function("diffuse_display") #parameters side = 50 length = side * side pop = 100 mu1list = numpy.linspace(9, 11, 15) mu2 = 10 eps = .01 b, c, d = 0, 0, 1
        smemPos += smemStride;
        gmemPos += gmemStride;
    }
}
'''
template = string.Template(template)
code = template.substitute(KERNEL_RADIUS=KERNEL_RADIUS,
                           KERNEL_W=KERNEL_W,
                           COLUMN_TILE_H=COLUMN_TILE_H,
                           COLUMN_TILE_W=COLUMN_TILE_W,
                           ROW_TILE_W=ROW_TILE_W,
                           KERNEL_RADIUS_ALIGNED=KERNEL_RADIUS_ALIGNED)
module = SourceModule(code)
convolutionRowGPU = module.get_function('convolutionRowGPU')
convolutionColumnGPU = module.get_function('convolutionColumnGPU')
d_Kernel_rows = module.get_global('d_Kernel_rows')[0]
d_Kernel_columns = module.get_global('d_Kernel_columns')[0]

# Helper functions for computing alignment.
def iDivUp(a, b):
    # Round a / b up to the nearest integer (// so it also behaves on Python 3).
    a = numpy.int32(a)
    b = numpy.int32(b)
    return (a // b + 1) if (a % b != 0) else (a // b)

def iDivDown(a, b):
    # Round a / b down to the nearest integer.
    a = numpy.int32(a)
    b = numpy.int32(b)
    return a // b
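# module.get_global() returns a (device_pointer, size_in_bytes) pair, so the
# filter taps still need to be uploaded before launching the kernels. A minimal
# sketch, assuming pycuda.driver is imported as cuda as elsewhere in this file;
# the Gaussian-style float32 taps are illustrative, any KERNEL_W coefficients work.
h_kernel = numpy.exp(-numpy.linspace(-2, 2, KERNEL_W) ** 2).astype(numpy.float32)
h_kernel /= h_kernel.sum()
cuda.memcpy_htod(d_Kernel_rows, h_kernel)      # fill __device__ d_Kernel_rows
cuda.memcpy_htod(d_Kernel_columns, h_kernel)   # fill __device__ d_Kernel_columns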
def _get_lut_bprop_kernel(dtype, deterministic=False): """ Builds the bprop kernel for lookup table layers based on templated code. If the deterministic version is requested, an index buffer must be passed as an argument. This index buffer re-orders items in the input tensor so that word_ids are sorted. This is required since we need to be sure that each thread only updates weights for one word id. Arguments: dtype (np.dtype): The data which the kernel will operate on. deterministic (boolean): Builds the deterministic kernel when this is set to True. """ if not deterministic: code = r""" __global__ void lut_bprop( int* inputs, %(type)s* dW, %(type)s* errors, const int nin, const int embedding_dim, const int vocab_size, const int pad_idx) { const int tid = threadIdx.x; const int bid = blockIdx.x; int word_id = inputs[bid]; int error_row = bid * embedding_dim; int output_row = word_id * embedding_dim; if(word_id != pad_idx) { for(int i = tid; i < embedding_dim; i += blockDim.x) { atomicAdd(&dW[output_row + i], errors[error_row + i]); } } } """ code = code % {"type": _ew_types[dtype]["type"]} module = SourceModule(code, options=["--use_fast_math"]) kernel = module.get_function("lut_bprop") kernel.prepare("PPPIIIi") else: code = r""" __global__ void lut_bprop( int* inputs, int* index_buffer, %(type)s* dW, %(type)s* errors, const int nin, const int embedding_dim, const int vocab_size, const int pad_idx) { const int tid = threadIdx.x; const int bid = blockIdx.x; int index_position = bid; int index = index_buffer[index_position]; int word_id = inputs[index]; if((bid == 0 || word_id != inputs[index_buffer[bid - 1]]) && word_id != pad_idx) { int output_row = word_id * embedding_dim; do { int error_row = index * embedding_dim; for(int i = tid; i < embedding_dim; i += blockDim.x) { dW[output_row + i] += errors[error_row + i]; } index_position++; if(index_position == gridDim.x) { break; } index = index_buffer[index_position]; } while(inputs[index] == word_id); } } """ code = code % {"type": _ew_types[dtype]["type"]} module = SourceModule(code, options=["--use_fast_math"]) kernel = module.get_function("lut_bprop") kernel.prepare("PPPPIIIi") kernel.name = "lut_bprop" return kernel
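# A hypothetical launch of the non-deterministic kernel built above, using the
# prepared-call interface that matches kernel.prepare("PPPIIIi"). The 32-thread
# block size and the device allocations are assumptions; one block is launched
# per input token, and the threads sweep embedding_dim cooperatively.
import numpy as np
kernel = _get_lut_bprop_kernel(np.dtype(np.float32))
kernel.prepared_call(
    (nin, 1), (32, 1, 1),              # grid: one block per input word
    inputs_dev, dW_dev, errors_dev,    # device pointers allocated elsewhere
    nin, embedding_dim, vocab_size, pad_idx)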
__syncthreads(); if (tid >= iter ) { sum_buf[tid] = sum_buf[tid] + sum_buf[tid - iter]; } iter *= 2; } __syncthreads(); out[tid] = sum_buf[tid]; __syncthreads(); } """) naive_gpu = naive_ker.get_function("naive_prefix") if __name__ == '__main__': testvec = np.random.randn(1024).astype(np.float64) testvec_gpu = gpuarray.to_gpu(testvec) outvec_gpu = gpuarray.empty_like(testvec_gpu) naive_gpu(testvec_gpu, outvec_gpu, block=(1024, 1, 1), grid=(1, 1, 1)) total_sum = sum(testvec) total_sum_gpu = outvec_gpu[-1].get() print "Does our kernel work correctly? : {}".format( np.allclose(total_sum_gpu, total_sum))
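# The kernel computes an inclusive prefix sum, so a stronger check than the
# final element is the full elementwise comparison against NumPy:
assert np.allclose(outvec_gpu.get(), np.cumsum(testvec))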
def counting_vowels_in_text(text):
    """Returns the number of vowels found in the text."""
    mod = SourceModule("""
    __global__ void count_vowels(char *text, int *results, int text_size,
                                 int chunk_size)
    {
        int index = blockDim.x * blockIdx.x + threadIdx.x;
        int start = index * chunk_size;
        int end = (index + 1) * chunk_size;
        end = min(end, text_size);
        if (end < start) {
            return;
        }
        // NumPy str arrays are stored as UCS-4, i.e. 4 bytes per character,
        // so scale the character range into byte offsets.
        start = start * 4;
        end = end * 4;
        // NOTE: the loop bound is now exclusive (<); the original <= read one
        // byte past this thread's chunk.
        for (int i = start; i < end; i++) {
            if (text[i] == 'a' || text[i] == 'A' || text[i] == 'e' || text[i] == 'E' ||
                text[i] == 'i' || text[i] == 'I' || text[i] == 'o' || text[i] == 'O' ||
                text[i] == 'u' || text[i] == 'U' || text[i] == 'y' || text[i] == 'Y') {
                results[i] = 1;
            }
        }
    }
    """)
    cuda.start_profiler()
    max_text_size_in_mb = 100
    text_chunks = []
    text_chunks_count = math.ceil(len(text) / (max_text_size_in_mb * (1024**2)))
    while len(text) > (max_text_size_in_mb * 1024**2):
        text_chunk = text[:math.ceil(len(text) / text_chunks_count)]
        text = text[math.ceil(len(text) / text_chunks_count):]
        text_chunks.append(text_chunk)
    text_chunks.append(text)
    cumulative_results = 0
    for text_chunk in text_chunks:
        device_text = gpuarray.to_gpu(numpy.array([text_chunk], dtype=str))
        device_results = gpuarray.zeros(len(text_chunk) * 4, dtype=numpy.int32)
        chunk_size = 1000
        threads_per_block = 512
        blocks_per_grid = int(math.ceil(len(text_chunk) / (chunk_size * threads_per_block)))
        device_text_size = numpy.int32(len(text_chunk))
        function = mod.get_function("count_vowels")
        # NOTE: the original kernel declared threads_per_block and
        # blocks_per_grid parameters that were never used and never passed,
        # which made the call mismatch the signature; they are dropped here.
        function(device_text, device_results, device_text_size,
                 numpy.int32(chunk_size),
                 block=(threads_per_block, 1, 1), grid=(blocks_per_grid, 1, 1))
        host_results = device_results.get()
        cumulative_results += numpy.count_nonzero(host_results == 1)
    cuda.stop_profiler()
    return cumulative_results
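# Why the factor of 4 above: NumPy stores str arrays as UCS-4, i.e. 4 bytes per
# code point, so only every fourth byte of device_text holds an ASCII character:
chunk = numpy.array(["Hello"], dtype=str)
print(chunk.dtype, chunk.dtype.itemsize)   # <U5 20 -> 4 bytes per character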
def _get_sorting_kernel(kernel_id, block_size):
    """
    Builds the kernels used for sorting inputs. There are several kernels here,
    corresponding to the steps in the algorithm. The algorithm works by
    determining the sorted position for each input item. This is done with a
    bucket sort, where each word_id is a bucket. The first step determines the
    size of each bucket (the number of occurrences of each word_id). Next, a
    prefix sum is computed over the list of bucket sizes to find where each
    bucket will be placed in the output buffer. Finally, each thread places its
    index into the correct sorted position based on the bucket start index
    (computed from the prefix sum) and that thread's offset into the bucket
    (which is taken from the output of the atomic add done in the first step).

    Arguments:
        kernel_id (Integer): Which step to build the kernel for [0, 4]
        block_size (Integer): Number of threads per block for the prefix sum
            kernels.
    """
    code = r"""
#define THREADS %(threads)s
#define STORE_BLOCKSUM %(store_blocksum)s

__global__ void sort_inputs0(int* inputs, int* index_buffer, int* offset_buffer,
                             int* word_counts, const int vocab_size,
                             const int input_length)
{
    const int tid = threadIdx.x + (blockDim.x * blockIdx.x);
    int word_id;

    if(tid < input_length)
    {
        word_id = inputs[tid];
        offset_buffer[tid] = atomicAdd(&word_counts[word_id], 1);
    }
}

__device__ void scan(int* buffer, int* blocksum, int global_length)
{
    const int tid = (threadIdx.x << 1) + 1;
    const int gid = ((threadIdx.x + (blockIdx.x * blockDim.x)) << 1) + 1;

    __shared__ int local_counts[THREADS * 2];

    local_counts[tid] = buffer[gid];
    local_counts[tid - 1] = buffer[gid - 1];

    #pragma unroll
    for(int skip = 1; skip <= THREADS; skip <<= 1)
    {
        int mask = (skip << 1) - 1;
        if((tid & mask) == mask)
        {
            local_counts[tid] += local_counts[tid - skip];
        }
        __syncthreads();
    }

    if(tid == (THREADS * 2 - 1))
    {
#if STORE_BLOCKSUM
        blocksum[blockIdx.x] = local_counts[tid];
#endif
        local_counts[tid] = 0;
    }

    #pragma unroll
    for(int skip = THREADS; skip > 0; skip >>= 1)
    {
        int mask = (skip << 1) - 1;
        if((tid & mask) == mask)
        {
            int temp = local_counts[tid - skip];
            local_counts[tid - skip] = local_counts[tid];
            local_counts[tid] += temp;
        }
        __syncthreads();
    }

    if(gid < global_length)
    {
        buffer[gid] = local_counts[tid];
        buffer[gid - 1] = local_counts[tid - 1];
    }
}

__global__ void sort_inputs1(int* inputs, int* index_buffer, int* offset_buffer,
                             int* word_counts, const int vocab_size,
                             const int input_length)
{
    scan(word_counts, word_counts + vocab_size, vocab_size);
}

__global__ void sort_inputs2(int* inputs, int* index_buffer, int* offset_buffer,
                             int* word_counts, const int vocab_size,
                             const int input_length)
{
    scan(word_counts + vocab_size, 0, blockDim.x);
}

__global__ void sort_inputs3(int* inputs, int* index_buffer, int* offset_buffer,
                             int* word_counts, const int vocab_size,
                             const int input_length)
{
    const int gid = (threadIdx.x + (blockIdx.x * blockDim.x)) << 1;

    if(gid < vocab_size)
    {
        word_counts[gid] += word_counts[vocab_size + blockIdx.x];
        word_counts[gid + 1] += word_counts[vocab_size + blockIdx.x];
    }
}

__global__ void sort_inputs4(int* inputs, int* index_buffer, int* offset_buffer,
                             int* word_counts, const int vocab_size,
                             const int input_length)
{
    const int tid = threadIdx.x + (blockDim.x * blockIdx.x);
    int word_id;

    if(tid < input_length)
    {
        word_id = inputs[tid];
        int sorted_position = word_counts[word_id] + offset_buffer[tid];
        index_buffer[sorted_position] = tid;
    }
}
"""
    code = code % {
        "threads": block_size,
        "store_blocksum": (1 if kernel_id == 1 else 0)
    }
    module = SourceModule(code, options=["--use_fast_math"])
    function_name = "sort_inputs" + str(kernel_id)
    kernel = module.get_function(function_name)
    kernel.prepare("PPPPII")
    kernel.name = "sort_inputs"
    return kernel
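# A sketch of how the five steps might be driven end to end. The real caller is
# not shown in this file, so the buffer names, the 512-thread block size, and
# the grid shapes below are all assumptions and may need adjustment.
def sort_word_ids(inputs_dev, index_dev, offset_dev, counts_dev,
                  vocab_size, input_length, block_size=512):
    args = (inputs_dev, index_dev, offset_dev, counts_dev, vocab_size, input_length)
    kernels = [_get_sorting_kernel(i, block_size) for i in range(5)]
    grid_items = (input_length + block_size - 1) // block_size
    scan_blocks = (vocab_size + 2 * block_size - 1) // (2 * block_size)
    kernels[0].prepared_call((grid_items, 1), (block_size, 1, 1), *args)   # bucket counts
    kernels[1].prepared_call((scan_blocks, 1), (block_size, 1, 1), *args)  # per-block prefix sum
    kernels[2].prepared_call((1, 1), (scan_blocks, 1, 1), *args)           # scan of block sums
    kernels[3].prepared_call((scan_blocks, 1), (block_size, 1, 1), *args)  # add block offsets back
    kernels[4].prepared_call((grid_items, 1), (block_size, 1, 1), *args)   # scatter sorted indices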
__global__ void vec_ker(int *ints, double *doubles) { int4 f1, f2; f1 = *reinterpret_cast<int4*>(ints); f2 = *reinterpret_cast<int4*>(&ints[4]); printf("First int4: %d, %d, %d, %d\\n", f1.x, f1.y, f1.z, f1.w); printf("Second int4: %d, %d, %d, %d\\n", f2.x, f2.y, f2.z, f2.w); double2 d1, d2; d1 = *reinterpret_cast<double2*>(doubles); d2 = *reinterpret_cast<double2*>(&doubles[2]); printf("First double2: %f, %f\\n", d1.x, d1.y); printf("Second double2: %f, %f\\n", d2.x, d2.y); }''' vec_mod = SourceModule(VecCode) vec_ker = vec_mod.get_function('vec_ker') ints = gpuarray.to_gpu(np.int32([1,2,3,4,5,6,7,8])) doubles = gpuarray.to_gpu(np.double([1.11,2.22,3.33,4.44])) print 'Vectorized Memory Test:' vec_ker(ints, doubles, grid=(1,1,1), block=(1,1,1))
N = 1024   # must match the #define below; the definition was missing from this fragment
M = 1024
L = 1024
THREADS_PER_BLOCK = 32

mod = SourceModule("""
#define N 1024
#define M 1024
#define L 1024

// Computes C = A*B for row-major N x M and M x L matrices; one thread per output element.
__global__ void multiplicacionMatrices(float *a, float *b, float *c)
{
    int i = threadIdx.x + blockIdx.x*blockDim.x;   // row of C
    int j = threadIdx.y + blockIdx.y*blockDim.y;   // column of C
    c[j+i*L] = 0;
    for(int k = 0; k < M; k++){
        c[j+i*L] += a[k+i*M] * b[j+k*L];
    }
}
""")

a = np.ones(N * M, dtype=np.float32)
b = np.ones(M * L, dtype=np.float32)
c = np.zeros(N * L, dtype=np.float32)

multiplicacion_matrices = mod.get_function("multiplicacionMatrices")
multiplicacion_matrices(cuda.In(a), cuda.In(b), cuda.Out(c),
                        block=(THREADS_PER_BLOCK, THREADS_PER_BLOCK, 1),
                        grid=(N//THREADS_PER_BLOCK, L//THREADS_PER_BLOCK, 1))
print(c.reshape((N, L)))
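# Sanity check: with all-ones inputs, every entry of C = A*B equals M.
assert np.allclose(c.reshape((N, L)), np.full((N, L), M, dtype=np.float32))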
s = cuda.Event()
e = cuda.Event()
s.record()

code = """
__global__ void add_one(int n, int start, float *x)
{
    int index = start + blockIdx.x * blockDim.x + threadIdx.x;
    int stride = blockDim.x * gridDim.x;
    for (int i = index; i < n; i += stride)
        x[i] += 1.0;
}
"""
mod = SourceModule(code)
add_one = mod.get_function("add_one")

N = np.int32(1e8)
nStreams = 2
streamSize = np.int32(N // nStreams)   # integer division; N is divisible by nStreams
x = np.ones(N, dtype=np.float32)
x_gpu = gpuarray.empty(N, dtype=np.float32)
#cuda.memcpy_htod(x_gpu, x)
stream = []
for i in range(nStreams):
    stream.append(cuda.Stream())
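# The fragment stops after creating the streams. The usual continuation of this
# pattern (a sketch, not the original code) copies and launches one chunk per
# stream so transfers can overlap with kernels; x would need to be page-locked
# (cuda.pagelocked_empty) for the copies to be truly asynchronous.
for i in range(nStreams):
    offset = int(i * streamSize)
    cuda.memcpy_htod_async(int(x_gpu.gpudata) + offset * 4,     # 4 bytes per float32
                           x[offset:offset + int(streamSize)],
                           stream=stream[i])
    add_one(np.int32(offset + streamSize), np.int32(offset), x_gpu,
            block=(256, 1, 1), grid=(64, 1, 1), stream=stream[i])
e.record()
e.synchronize()
print("elapsed: %.3f ms" % s.time_till(e))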
class TSDFVolume(object):

    def __init__(self, vol_bnds, voxel_size):
        # Define voxel volume parameters.
        self._vol_bnds = vol_bnds          # 3x2, rows: (x, y, z), columns: (min, max) in world coordinates in meters
        self._voxel_size = voxel_size      # in meters (determines volume discretization and resolution)
        self._trunc_margin = self._voxel_size * 5  # truncation on SDF

        # Adjust volume bounds.
        self._vol_dim = np.ceil((self._vol_bnds[:, 1] - self._vol_bnds[:, 0]) /
                                self._voxel_size).copy(order='C').astype(int)  # ensure C-order contiguous
        self._vol_bnds[:, 1] = self._vol_bnds[:, 0] + self._vol_dim * self._voxel_size
        self._vol_origin = self._vol_bnds[:, 0].copy(order='C').astype(np.float32)  # ensure C-order contiguous
        print("Voxel volume size: {:d} x {:d} x {:d}".format(
            self._vol_dim[0], self._vol_dim[1], self._vol_dim[2]))

        # Initialize pointers to voxel volume in CPU memory.
        self._tsdf_vol_cpu = np.ones(self._vol_dim).astype(np.float32)
        # For computing the cumulative moving average of observations per voxel.
        self._weight_vol_cpu = np.zeros(self._vol_dim).astype(np.float32)
        self._color_vol_cpu = np.zeros(self._vol_dim).astype(np.float32)

        # Copy voxel volumes to GPU.
        if TSDF_GPU_MODE:
            self._tsdf_vol_gpu = cuda.mem_alloc(self._tsdf_vol_cpu.nbytes)
            cuda.memcpy_htod(self._tsdf_vol_gpu, self._tsdf_vol_cpu)
            self._weight_vol_gpu = cuda.mem_alloc(self._weight_vol_cpu.nbytes)
            cuda.memcpy_htod(self._weight_vol_gpu, self._weight_vol_cpu)
            self._color_vol_gpu = cuda.mem_alloc(self._color_vol_cpu.nbytes)
            cuda.memcpy_htod(self._color_vol_gpu, self._color_vol_cpu)

            # Cuda kernel function (C++)
            self._cuda_src_mod = SourceModule("""
              __global__ void integrate(float * tsdf_vol,
                                        float * weight_vol,
                                        float * color_vol,
                                        float * vol_dim,
                                        float * vol_origin,
                                        float * cam_intr,
                                        float * cam_pose,
                                        float * other_params,
                                        float * color_im,
                                        float * depth_im) {
                // Get voxel index.
                int gpu_loop_idx = (int) other_params[0];
                int max_threads_per_block = blockDim.x;
                int block_idx = blockIdx.z * gridDim.y * gridDim.x + blockIdx.y * gridDim.x + blockIdx.x;
                int voxel_idx = gpu_loop_idx * gridDim.x * gridDim.y * gridDim.z * max_threads_per_block +
                                block_idx * max_threads_per_block + threadIdx.x;

                int vol_dim_x = (int) vol_dim[0];
                int vol_dim_y = (int) vol_dim[1];
                int vol_dim_z = (int) vol_dim[2];

                // NOTE: >= (the original used >, letting one out-of-range index through).
                if (voxel_idx >= vol_dim_x * vol_dim_y * vol_dim_z)
                    return;

                // Get voxel grid coordinates.
                float voxel_x = floorf(((float)voxel_idx) / ((float)(vol_dim_y * vol_dim_z)));
                float voxel_y = floorf(((float)(voxel_idx - ((int)voxel_x) * vol_dim_y * vol_dim_z)) / ((float)vol_dim_z));
                float voxel_z = (float)(voxel_idx - ((int)voxel_x) * vol_dim_y * vol_dim_z - ((int)voxel_y) * vol_dim_z);

                // Voxel grid coordinates to world coordinates.
                float voxel_size = other_params[1];
                float pt_x = vol_origin[0] + voxel_x * voxel_size;
                float pt_y = vol_origin[1] + voxel_y * voxel_size;
                float pt_z = vol_origin[2] + voxel_z * voxel_size;

                // World coordinates to camera coordinates.
                float tmp_pt_x = pt_x - cam_pose[0*4+3];
                float tmp_pt_y = pt_y - cam_pose[1*4+3];
                float tmp_pt_z = pt_z - cam_pose[2*4+3];
                float cam_pt_x = cam_pose[0*4+0] * tmp_pt_x + cam_pose[1*4+0] * tmp_pt_y + cam_pose[2*4+0] * tmp_pt_z;
                float cam_pt_y = cam_pose[0*4+1] * tmp_pt_x + cam_pose[1*4+1] * tmp_pt_y + cam_pose[2*4+1] * tmp_pt_z;
                float cam_pt_z = cam_pose[0*4+2] * tmp_pt_x + cam_pose[1*4+2] * tmp_pt_y + cam_pose[2*4+2] * tmp_pt_z;

                // Camera coordinates to image pixels.
                int pixel_x = (int) roundf(cam_intr[0*3+0] * (cam_pt_x / cam_pt_z) + cam_intr[0*3+2]);
                int pixel_y = (int) roundf(cam_intr[1*3+1] * (cam_pt_y / cam_pt_z) + cam_intr[1*3+2]);

                // Skip if outside view frustum.
                int im_h = (int) other_params[2];
                int im_w = (int) other_params[3];
                if (pixel_x < 0 || pixel_x >= im_w || pixel_y < 0 || pixel_y >= im_h || cam_pt_z < 0)
                    return;

                // Skip invalid depth.
                float depth_value = depth_im[pixel_y*im_w+pixel_x];
                if (depth_value == 0)
                    return;

                // Integrate TSDF.
                float trunc_margin = other_params[4];
                float depth_diff = depth_value - cam_pt_z;
                if (depth_diff < -trunc_margin)
                    return;
                float dist = fmin(1.0f, depth_diff / trunc_margin);
                float w_old = weight_vol[voxel_idx];
                float obs_weight = other_params[5];
                float w_new = w_old + obs_weight;
                weight_vol[voxel_idx] = w_new;
                tsdf_vol[voxel_idx] = (tsdf_vol[voxel_idx] * w_old + dist) / w_new;

                // Integrate color.
                float old_color = color_vol[voxel_idx];
                float old_b = floorf(old_color / (256 * 256));
                float old_g = floorf((old_color - old_b * 256 * 256) / 256);
                float old_r = old_color - old_b * 256 * 256 - old_g * 256;
                float new_color = color_im[pixel_y*im_w+pixel_x];
                float new_b = floorf(new_color / (256 * 256));
                float new_g = floorf((new_color - new_b * 256 * 256) / 256);
                float new_r = new_color - new_b * 256 * 256 - new_g * 256;
                new_b = fmin(roundf((old_b*w_old + new_b) / w_new), 255.0f);
                new_g = fmin(roundf((old_g*w_old + new_g) / w_new), 255.0f);
                new_r = fmin(roundf((old_r*w_old + new_r) / w_new), 255.0f);
                color_vol[voxel_idx] = new_b * 256 * 256 + new_g * 256 + new_r;
              }""")

            self._cuda_integrate = self._cuda_src_mod.get_function("integrate")

            # Determine block/grid size on GPU.
            gpu_dev = cuda.Device(0)
            self._max_gpu_threads_per_block = gpu_dev.MAX_THREADS_PER_BLOCK
            n_blocks = int(np.ceil(float(np.prod(self._vol_dim)) /
                                   float(self._max_gpu_threads_per_block)))
            grid_dim_x = min(gpu_dev.MAX_GRID_DIM_X, int(np.floor(np.cbrt(n_blocks))))
            grid_dim_y = min(gpu_dev.MAX_GRID_DIM_Y, int(np.floor(np.sqrt(n_blocks / grid_dim_x))))
            grid_dim_z = min(gpu_dev.MAX_GRID_DIM_Z,
                             int(np.ceil(float(n_blocks) / float(grid_dim_x * grid_dim_y))))
            self._max_gpu_grid_dim = np.array([grid_dim_x, grid_dim_y, grid_dim_z]).astype(int)
            self._n_gpu_loops = int(np.ceil(float(np.prod(self._vol_dim)) /
                                            float(np.prod(self._max_gpu_grid_dim) *
                                                  self._max_gpu_threads_per_block)))

    def integrate(self, color_im, depth_im, cam_intr, cam_pose, obs_weight=1.):
        im_h = depth_im.shape[0]
        im_w = depth_im.shape[1]

        # Fold RGB color image into a single channel image.
        color_im = color_im.astype(np.float32)
        color_im = np.floor(color_im[:, :, 2] * 256 * 256 +
                            color_im[:, :, 1] * 256 +
                            color_im[:, :, 0])

        # GPU mode: integrate voxel volume (calls CUDA kernel).
        if TSDF_GPU_MODE:
            for gpu_loop_idx in range(self._n_gpu_loops):
                self._cuda_integrate(
                    self._tsdf_vol_gpu,
                    self._weight_vol_gpu,
                    self._color_vol_gpu,
                    cuda.InOut(self._vol_dim.astype(np.float32)),
                    cuda.InOut(self._vol_origin.astype(np.float32)),
                    cuda.InOut(cam_intr.reshape(-1).astype(np.float32)),
                    cuda.InOut(cam_pose.reshape(-1).astype(np.float32)),
                    cuda.InOut(np.asarray([gpu_loop_idx, self._voxel_size, im_h, im_w,
                                           self._trunc_margin, obs_weight], np.float32)),
                    cuda.InOut(color_im.reshape(-1).astype(np.float32)),
                    cuda.InOut(depth_im.reshape(-1).astype(np.float32)),
                    block=(self._max_gpu_threads_per_block, 1, 1),
                    grid=(int(self._max_gpu_grid_dim[0]),
                          int(self._max_gpu_grid_dim[1]),
                          int(self._max_gpu_grid_dim[2])))

        # CPU mode: integrate voxel volume (vectorized implementation).
        else:
            # Get voxel grid coordinates.
xv, yv, zv = np.meshgrid(range(self._vol_dim[0]), range(self._vol_dim[1]), range(self._vol_dim[2]), indexing='ij') vox_coords = np.concatenate( (xv.reshape(1, -1), yv.reshape(1, -1), zv.reshape(1, -1)), axis=0).astype(int) # Voxel coordinates to world coordinates. world_pts = self._vol_origin.reshape( -1, 1) + vox_coords.astype(float) * self._voxel_size # World coordinates to camera coordinates. world2cam = np.linalg.inv(cam_pose) cam_pts = np.dot(world2cam[:3, :3], world_pts) + np.tile( world2cam[:3, 3].reshape(3, 1), (1, world_pts.shape[1])) # Camera coordinates to image pixels. pix_x = np.round(cam_intr[0, 0] * (cam_pts[0, :] / cam_pts[2, :]) + cam_intr[0, 2]).astype(int) pix_y = np.round(cam_intr[1, 1] * (cam_pts[1, :] / cam_pts[2, :]) + cam_intr[1, 2]).astype(int) # Skip if outside view frustum. valid_pix = np.logical_and( pix_x >= 0, np.logical_and( pix_x < im_w, np.logical_and( pix_y >= 0, np.logical_and(pix_y < im_h, cam_pts[2, :] > 0)))) depth_val = np.zeros(pix_x.shape) depth_val[valid_pix] = depth_im[pix_y[valid_pix], pix_x[valid_pix]] # Integrate TSDF. depth_diff = depth_val - cam_pts[2, :] valid_pts = np.logical_and(depth_val > 0, depth_diff >= -self._trunc_margin) dist = np.minimum(1., np.divide(depth_diff, self._trunc_margin)) w_old = self._weight_vol_cpu[vox_coords[0, valid_pts], vox_coords[1, valid_pts], vox_coords[2, valid_pts]] w_new = w_old + obs_weight self._weight_vol_cpu[vox_coords[0, valid_pts], vox_coords[1, valid_pts], vox_coords[2, valid_pts]] = w_new tsdf_vals = self._tsdf_vol_cpu[vox_coords[0, valid_pts], vox_coords[1, valid_pts], vox_coords[2, valid_pts]] self._tsdf_vol_cpu[vox_coords[0, valid_pts], vox_coords[1, valid_pts], vox_coords[2, valid_pts]] = np.divide( np.multiply(tsdf_vals, w_old) + dist[valid_pts], w_new) # Integrate color. old_color = self._color_vol_cpu[vox_coords[0, valid_pts], vox_coords[1, valid_pts], vox_coords[2, valid_pts]] old_b = np.floor(old_color / (256. * 256.)) old_g = np.floor((old_color - old_b * 256. * 256.) / 256.) old_r = old_color - old_b * 256. * 256. - old_g * 256. new_color = color_im[pix_y[valid_pts], pix_x[valid_pts]] new_b = np.floor(new_color / (256. * 256.)) new_g = np.floor((new_color - new_b * 256. * 256.) / 256.) new_r = new_color - new_b * 256. * 256. - new_g * 256. new_b = np.minimum( np.round(np.divide(np.multiply(old_b, w_old) + new_b, w_new)), 255.) new_g = np.minimum( np.round(np.divide(np.multiply(old_g, w_old) + new_g, w_new)), 255.) new_r = np.minimum( np.round(np.divide(np.multiply(old_r, w_old) + new_r, w_new)), 255.) self._color_vol_cpu[vox_coords[0, valid_pts], vox_coords[ 1, valid_pts], vox_coords[ 2, valid_pts]] = new_b * 256. * 256. + new_g * 256. + new_r # Copy voxel volume to CPU. def get_volume(self): if TSDF_GPU_MODE: cuda.memcpy_dtoh(self._tsdf_vol_cpu, self._tsdf_vol_gpu) cuda.memcpy_dtoh(self._color_vol_cpu, self._color_vol_gpu) return self._tsdf_vol_cpu, self._color_vol_cpu # Get mesh of voxel volume via marching cubes. def get_mesh(self): tsdf_vol, color_vol = self.get_volume() # Marching cubes. verts, faces, norms, _ = measure.marching_cubes_lewiner(tsdf_vol, level=0) verts_ind = np.round(verts).astype(int) verts = verts * self._voxel_size + self._vol_origin # voxel grid coordinates to world coordinates # Get vertex colors. 
rgb_vals = color_vol[verts_ind[:, 0], verts_ind[:, 1], verts_ind[:, 2]] colors_b = np.floor(rgb_vals / (256 * 256)) colors_g = np.floor((rgb_vals - colors_b * 256 * 256) / 256) colors_r = rgb_vals - colors_b * 256 * 256 - colors_g * 256 colors = np.floor(np.asarray([colors_r, colors_g, colors_b])).T colors = colors.astype(np.uint8) return verts, faces, norms, colors
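# End-to-end usage sketch for TSDFVolume; the bounds, voxel size, intrinsics,
# and frame source below are placeholders, not values from the original code.
vol_bnds = np.array([[-1.5, 1.5], [-1.5, 1.5], [0.0, 2.0]])   # metres
tsdf = TSDFVolume(vol_bnds, voxel_size=0.01)
for color_im, depth_im, cam_pose in frames:    # frames and cam_intr supplied elsewhere
    tsdf.integrate(color_im, depth_im, cam_intr, cam_pose, obs_weight=1.)
verts, faces, norms, colors = tsdf.get_mesh()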