def main():
    if len(sys.argv) < 3:
        print "INVALID USAGE: Missing arguments"
    elif len(sys.argv) == 3:
        # Gather information for the GPU.  (Renamed `file` -> `lines` so the
        # builtin is not shadowed, and close the handle with a context manager.)
        with open(sys.argv[1]) as input_file:
            lines = input_file.readlines()
        for i in range(len(lines)):
            lines[i] = lines[i].replace("\n", "").split(" ")
        trajectories = numpy.array([int(lines[0][0])]).astype(numpy.float32)
        num_stops = numpy.array([int(lines[0][1])]).astype(numpy.float32)
        stops = numpy.array(lines[1:]).astype(numpy.float32)
        dest = numpy.zeros_like(stops)
        # Create a callable Python function from the compiled kernel.
        distance = kernel.get_function("distance")
        distance(driver.In(trajectories), driver.In(num_stops),
                 driver.In(stops), driver.Out(dest),
                 block=(1, 1, 1))
        with open(sys.argv[2], "w") as output_file:
            for line in dest.tolist():
                for element in line:
                    output_file.write(str(int(element)) + " ")
                output_file.write("\n")
def test_posteriors4():
    n_trajs = 1
    n_observations = 10
    fwdlattice = np.random.rand(n_observations, 4).astype(np.float32)
    bwdlattice = np.random.rand(n_observations, 4).astype(np.float32)
    posteriors = np.zeros_like(fwdlattice)
    mod.get_function('posteriors4')(
        drv.In(fwdlattice), drv.In(bwdlattice), np.int32(n_trajs),
        drv.In(np.array([n_observations], dtype=np.int32)),
        drv.In(np.array([0], dtype=np.int32)),
        drv.Out(posteriors),
        block=(32, 1, 1), grid=(1, 1))
    print 'cuda'
    print posteriors
    print 'reference'
    gamma = fwdlattice + bwdlattice
    print np.exp(gamma.T - logsumexp(gamma, axis=1)).T
def main():
    multiply_them = mod.get_function("multiply_them")

    a = numpy.random.randn(400).astype(numpy.float32)
    b = numpy.random.randn(400).astype(numpy.float32)
    dest = numpy.zeros_like(a)
    multiply_them(drv.Out(dest), drv.In(a), drv.In(b),
                  block=(400, 1, 1), grid=(1, 1))
    c = dest - a * b
    print(c)

    # Get the kernel code and specify the matrix size.
    kcpKernel = XcpKernel % {'XiSize': XiSize}
    # Compile the kernel code.
    aoMod = SourceModule(kcpKernel)
    # Get the kernel function from the compiled module.
    aoMultiplyKernel = aoMod.get_function("MMultiplyKernel")
    # Define the input and result matrices.
    kfX = numpy.random.random_sample((XiSize, XiSize)).astype(numpy.float32)
    kfY = numpy.random.random_sample((XiSize, XiSize)).astype(numpy.float32)
    kfR = numpy.zeros_like(kfX)
    kfR2 = numpy.matmul(kfX, kfY)  # CPU reference result
    # Execute the kernel.  Note: XiSize * XiSize * 4 threads per block must
    # stay within the 1024-threads-per-block limit, i.e. XiSize <= 16.
    aoMultiplyKernel(drv.Out(kfR), drv.In(kfX), drv.In(kfY),
                     block=(int(XiSize), int(XiSize), 4), grid=(1, 1))
    print(kfX)
    print(kfY)
    print(kfR)
    print(kfR2)
def image_filter(image, filter, nums_thread=100):
    global ft_mod
    global g_filter
    if ft_mod is None:
        ft_mod = SourceModule(filter_source)
        g_filter = ft_mod.get_function("g_filter")
    image = np.array(image).astype(np.float64)
    filter = np.array(filter).astype(np.float64)
    if len(image.shape) != 4:
        image = image.reshape([1] + list(image.shape))
    if len(filter.shape) != 4:
        filter = filter.reshape([1] + list(filter.shape))
    parameters, outshape = build_parameters(image, filter, nums_thread)
    # print("outshape:", outshape, "from shape", image.shape)
    outputs = np.zeros(outshape, dtype=np.float64)
    g_filter(drv.In(image), drv.In(filter), drv.In(parameters),
             drv.Out(outputs),
             block=(nums_thread, 1, 1), grid=(1, 1))
    return outputs
def predict(self, x):
    # np.float32(x, order='C') is not a valid call; build a C-contiguous
    # float32 array explicitly instead.
    x = np.asarray(x, dtype=np.float32, order='C')
    y_pred = []
    distance = mod.get_function('distance')
    train_row, train_col = np.int32(self.x.shape)
    test_row, test_col = np.int32(x.shape)
    grid_x = int(math.ceil(train_col / self.thread_size))
    grid_y = int(math.ceil(test_row / self.thread_size))
    # print(grid_x, grid_y)
    dis_arr = np.zeros((test_row, train_col), dtype=np.float32)
    distance(
        cuda.In(x), cuda.In(self.x), cuda.Out(dis_arr),
        test_row, test_col, train_row, train_col, test_row, train_col,
        block=(self.thread_size, self.thread_size, 1),
        grid=(grid_x, grid_y)
    )
    # print(dis_arr)
    for i in range(len(dis_arr)):
        sorted_index = np.argsort(dis_arr[i])
        top_k_index = sorted_index[:self.k]
        y_pred.append(self._vote(ys=self.y[top_k_index]))
    return np.array(y_pred)
def _draw(self, pts, colors):
    if not pts:
        return
    imsize = self.imsize
    dt0 = time()

    ind_count = zeros(self.imsize2, npint)
    colors = row_stack(colors).astype(npfloat)
    inds = concatenate(pts).astype(npint)

    _inds = cuda.mem_alloc(inds.nbytes)
    cuda.memcpy_htod(_inds, inds)

    aggn = inds.shape[0]
    self.cuda_agg(npint(aggn), npint(imsize), _inds,
                  cuda.InOut(ind_count),
                  block=(THREADS, 1, 1),
                  grid=(int(aggn // THREADS) + 1, 1))

    ind_count_map = _build_ind_count(ind_count)
    _ind_count_map = cuda.mem_alloc(ind_count_map.nbytes)
    cuda.memcpy_htod(_ind_count_map, ind_count_map)

    sort_colors = zeros((aggn, 4), npfloat)
    _sort_colors = cuda.mem_alloc(sort_colors.nbytes)
    cuda.memcpy_htod(_sort_colors, sort_colors)

    self.cuda_agg_bin(npint(aggn), _ind_count_map, cuda.In(colors), _inds,
                      _sort_colors,
                      block=(THREADS, 1, 1),
                      grid=(int(aggn // THREADS) + 1, 1))

    dotn, _ = ind_count_map.shape
    self.cuda_dot(npint(dotn), self._img, _ind_count_map, _sort_colors,
                  block=(THREADS, 1, 1),
                  grid=(int(dotn // THREADS) + 1, 1))

    if self.verbose is not None:
        print('-- drew dots: {:d}. time: {:0.4f}'.format(
            colors.shape[0], time() - dt0))
    self._updated = True
def calc_next_world_gpu(world, next_world, height, width):
    mod = SourceModule("""
    __global__ void life_game_gpu(const int* __restrict__ world, int *next_world,
                                  const int mat_size_y, const int mat_size_x)
    {
        int mat_x = threadIdx.x + blockIdx.x * blockDim.x;
        int mat_y = threadIdx.y + blockIdx.y * blockDim.y;
        if (mat_x >= mat_size_x) { return; }
        if (mat_y >= mat_size_y) { return; }

        int current_value = world[mat_y * mat_size_x + mat_x];
        int next_value = current_value;
        int num_live = 0;

        // Add the matrix size before taking the modulo: in C the result of %
        // keeps the sign of the dividend, so (mat_y - 1) % mat_size_y is
        // negative at the top edge and would index out of bounds.
        int ym1 = (mat_y - 1 + mat_size_y) % mat_size_y;
        int yp1 = (mat_y + 1) % mat_size_y;
        int xm1 = (mat_x - 1 + mat_size_x) % mat_size_x;
        int xp1 = (mat_x + 1) % mat_size_x;

        num_live += world[ym1 * mat_size_x + xm1];
        num_live += world[ym1 * mat_size_x + mat_x];
        num_live += world[ym1 * mat_size_x + xp1];
        num_live += world[mat_y * mat_size_x + xm1];
        num_live += world[mat_y * mat_size_x + xp1];
        num_live += world[yp1 * mat_size_x + xm1];
        num_live += world[yp1 * mat_size_x + mat_x];
        num_live += world[yp1 * mat_size_x + xp1];

        if (current_value == 0 && num_live == 3)
            next_value = 1;
        else if (current_value == 1 && num_live >= 2 && num_live <= 3)
            next_value = 1;
        else
            next_value = 0;

        next_world[mat_y * mat_size_x + mat_x] = next_value;
    }
    """)
    life_game_gpu = mod.get_function("life_game_gpu")
    block = (BLOCKSIZE, BLOCKSIZE, 1)
    grid = ((width + block[0] - 1) // block[0],
            (height + block[1] - 1) // block[1])
    # print("Grid = ({0}, {1}), Block = ({2}, {3})".format(grid[0], grid[1], block[0], block[1]))
    # start = cuda.Event()
    # end = cuda.Event()
    # start.record()
    life_game_gpu(cuda.In(world), cuda.Out(next_world),
                  numpy.int32(height), numpy.int32(width),
                  block=block, grid=grid)
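# A minimal host-side driver for calc_next_world_gpu above (a sketch, not part
# of the original: BLOCKSIZE, the grid dimensions, and the int32 dtype are
# assumptions derived from the kernel signature and the block/grid computation).
import numpy
import pycuda.autoinit  # noqa: F401  (creates a CUDA context)
import pycuda.driver as cuda

BLOCKSIZE = 16  # assumed; any value with BLOCKSIZE * BLOCKSIZE <= 1024 works

def demo_life_step():
    height, width = 64, 64
    # Random initial board; cuda.In/Out expect C-contiguous int32 arrays.
    world = (numpy.random.rand(height, width) < 0.3).astype(numpy.int32)
    next_world = numpy.zeros_like(world)
    calc_next_world_gpu(world, next_world, height, width)
    return next_world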
def test():
    mask = cv2.imread('cur_mask.jpg')
    mask = mask[:, :, 0]
    # mask_g = mask[:, :, 1]
    # mask_b = mask[:, :, 2]
    x = []
    y = []
    prev_time = time.time()
    for i in range(len(mask)):
        for j in range(len(mask[0])):
            if mask[i][j] == 2:
                print('x = %d, y = %d' % (j, i))
                y.append(i)
                x.append(j)
    logger.info('process frame time:' + str(time.time() - prev_time))
    prev_time = time.time()
    x1 = min(x)
    y1 = min(y)
    x2 = max(x)
    y2 = max(y)
    logger.info('post process frame time:' + str(time.time() - prev_time))
    print(x1, y1, x2, y2)
    # w = numpy.int32(len(mask[0]))
    # h = numpy.int32(len(mask))
    w = numpy.int64(len(mask[0]))
    h = numpy.int64(len(mask))
    mask_np = numpy.array(mask)
    mask_np = mask_np.reshape(-1).astype(numpy.float64)
    N = len(mask_np)
    # print(N)
    # print(w)
    # print(h)
    # numpy.float was removed in NumPy 1.24; use the explicit float64 dtype.
    a = numpy.zeros(N, dtype=numpy.float64)
    b = numpy.zeros(N, dtype=numpy.float64)
    nThreads = 1024
    nBlocks = (N + nThreads - 1) // nThreads
    print("nBlocks:%d\n" % nBlocks)
    prev_time = time.time()
    func(drv.In(mask_np), drv.InOut(a), drv.InOut(b), w, h,
         block=(nThreads, 1, 1), grid=(nBlocks, 1))
    logger.info('gpu process frame time:' + str(time.time() - prev_time))
    print(max(a))
    print(max(b))
    print(a)
def simulate_positions(module, Nobs, N, bounds, radius, d, dN, pa, ps,
                       seed=666, Nthreads=64):
    Nphotons = Nobs * N
    print "Total Threads: %s" % Nphotons
    assert(Nphotons <= 1.1e8)
    d = np.uint32(d)
    dN = np.uint32(dN)
    radius = np.float32(radius)
    Ndoms = np.uint32(pow((d / dN) * 2 + 1, 3) - 1)

    t1 = time.time()
    rng_states = get_rng_states(module, Nphotons, seed=seed)
    t2 = time.time()
    d_list = get_doms(module, Ndoms, radius, d, dN)
    t3 = time.time()

    x = np.random.uniform(bounds[0][0], bounds[0][1], N)
    y = np.random.uniform(bounds[1][0], bounds[1][1], N)
    z = np.random.uniform(bounds[2][0], bounds[2][1], N)
    # print x
    # print y
    # print z
    pInit = np.concatenate([x, y, z]).astype(np.float32)
    t4 = time.time()
    # print "t2-t1: ", t2-t1
    # print "t3-t2: ", t3-t2
    # print "t4-t3: ", t4-t3

    start = time.time()
    datahits = np.zeros(Ndoms * N, dtype=np.int32)
    datahitsNum = -np.ones(Nobs * N, dtype=np.int32)
    datatimes = np.zeros(Nphotons, dtype=np.float32)
    datapositions = np.zeros(Nphotons * 3, dtype=np.float32)

    simulate = module.get_function('simulate_positions')
    simulate(np.uint64(Nphotons), np.uint64(Nobs), rng_states, d_list,
             cuda.InOut(datahits), cuda.InOut(datatimes),
             cuda.InOut(datahitsNum), cuda.InOut(datapositions),
             cuda.In(pInit), np.float32(pa), np.float32(ps), np.uint32(Ndoms),
             block=(Nthreads, 1, 1), grid=(Nphotons // Nthreads + 1, 1))
    print "end-start", time.time() - start
    # print "sumHits: ", sum(datahits)

    datahits = np.reshape(np.array(datahits, dtype=float), (N, Ndoms))
    datahitsNum = np.reshape(np.array(datahitsNum, dtype=float), (N, Nobs))
    datatimes = np.reshape(np.array(datatimes, dtype=float), (N, Nobs))
    pInit = np.reshape(pInit, (3, N)).T
    return datahits, datahitsNum, datatimes, pInit
def score_df(self, thread_count):
    """Scores each collision: each person killed contributes 2 points and
    each person injured 1 point; the kernel divides that sum by a nominal
    8 points per accident and scales by 5, giving a severity score of 0-5."""
    mod = SourceModule("""
    __global__ void score_function(float *dest, float *killed, float *injured)
    {
        const int i = (blockIdx.x * blockDim.x) + threadIdx.x;
        dest[i] = (((killed[i] * 2.0) + injured[i]) / 8.0) * 5.0;
    }
    """)
    df = self.df[[
        'SCORE', 'LATITUDE', 'LONGITUDE',
        'NUMBER OF PERSONS KILLED', 'NUMBER OF PERSONS INJURED'
    ]].values.astype(np.float32)

    # Calculate kernel params
    n = len(df[:, 0])
    output = np.zeros_like(df[:, 0])
    thread_size = thread_count
    core_size = self.get_core_size(thread_count, n)

    # Run kernel.  Column slices of a 2-D array are not contiguous, and
    # pycuda's host-to-device copy requires a contiguous buffer.
    score_function = mod.get_function("score_function")
    score_function(cuda.Out(output),
                   cuda.In(np.ascontiguousarray(df[:, 3])),
                   cuda.In(np.ascontiguousarray(df[:, 4])),
                   block=(thread_size, 1, 1), grid=(core_size, 1))
    df[:, 0] = output

    # Only return score with lat/long
    return df[:, [0, 1, 2]]
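# A quick CPU cross-check of the severity formula used by score_function
# (a sketch; score_cpu is a hypothetical helper that mirrors the kernel's
# /8.0 normalization, which is what the kernel actually computes).
def score_cpu(killed, injured):
    """CPU reference for the kernel: (2*killed + injured) / 8 * 5."""
    return ((killed * 2.0) + injured) / 8.0 * 5.0

# e.g. one person killed and two injured -> (2 + 2) / 8 * 5 = 2.5
assert score_cpu(1.0, 2.0) == 2.5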
def gpuFunc(iterator):
    iterator = iter(iterator)
    cpu_data = np.asarray(list(iterator), dtype=np.float32)
    datasize = len(cpu_data)
    # /256 for the block size; ceil rounds up so no threads are left dangling.
    gridNum = int(np.ceil(datasize / 256.0))
    centroids = np.empty(datasize, gpuarray.vec.float2)

    cuda.init()
    dev = cuda.Device(dev_id)
    contx = dev.make_context()

    # The GPU kernel below takes centroid IDs and 1-D data points in the
    # form of float2 (x, y): x is the centroid ID, y the point coordinate.
    try:
        mod = SourceModule(cudakernel)
        func = mod.get_function("assignToCentroid")
        func(cuda.In(cpu_data), cuda.In(kPoints), np.int32(datasize),
             np.int32(len(kPoints)), cuda.Out(centroids),
             block=(16, 16, 1), grid=(gridNum, 1), shared=0)
        closest = [(val[0], (np.asarray(val[1]), 1)) for val in centroids]
    except Exception as err:
        raise Exception("Error {} in node {}".format(
            err, socket.gethostname()))

    contx.pop()
    del cpu_data
    del datasize
    del centroids
    del contx
    return iter(closest)
def gpuinv3x3(inp, n):
    # internal constants, not to be modified
    hpat = (0x07584, 0x08172, 0x04251, 0x08365, 0x06280, 0x05032, 0x06473,
            0x07061, 0x03140)
    # Convert parameters into numpy arrays.
    # *** change float32/float64 on the next line to match float or double
    inpd = np.array(inp, dtype=np.float64)
    hpatd = np.array(hpat, dtype=np.uint32)
    # *** change float32/float64 on the next line to match float or double
    output = np.empty((n * 9), dtype=np.float64)
    # Get the kernel function.
    matinv3x3 = kernel_3x3.get_function("inv3x3")
    # Define block and grid: 288 threads per block (presumably 9 matrix
    # elements x 32 matrices); integer division keeps the grid dims ints.
    blockDim = (288, 1, 1)  # do not change
    gridDim = (n // 32 + 1, 1, 1)
    # Launch the kernel.
    matinv3x3(cuda.In(inpd), cuda.Out(output), np.uint64(n), cuda.In(hpatd),
              block=blockDim, grid=gridDim)
    return output
def cal_cv(self, inputs):
    """Calculate the constraint values for each individual."""
    rows, cols = inputs.shape
    # prepare data
    k_layouts = np.float32(inputs).flatten()
    k_cvs = np.float32(np.zeros(rows))
    k_sx = np.float32(self.sx)
    k_sy = np.float32(self.sy)
    # pick out the function
    func = self.kernels.get_function("cal_cv_turb")
    func(drv.In(k_layouts), drv.Out(k_cvs), drv.In(k_sx), drv.In(k_sy),
         grid=(int(rows // 10 + 1), 1, 1), block=(10, 1, 1))
    return k_cvs
def magnetic_field_at(magnet: Magnet, point: P3) -> P3:
    """Magnetic field produced by `magnet` at `point`.

    This method transfers data to and from the device on every call,
    so it is slower than the CPU implementation.
    """
    GPU_ACCELERETE.__compile()
    if isinstance(magnet, CCT):
        # Convert `point` to the local coordinate system, as a numpy vector.
        p = magnet.local_coordinate_system.point_to_local_coordinate(
            point).to_numpy_ndarry3_float32()
        length = int(magnet.dispersed_path3.shape[0])
        winding = magnet.dispersed_path3.flatten().astype(numpy.float32)
        ret = numpy.zeros((3,), dtype=numpy.float32)
        GPU_ACCELERETE.CUDA_MAGNETIC_FIELD_AT_CCT(
            drv.In(winding), drv.In(p),
            drv.In(numpy.array([length]).astype(numpy.int32)),
            drv.Out(ret),
            block=(512, 1, 1), grid=(256, 1),
        )
        print(p)
        return P3.from_numpy_ndarry(ret * magnet.current * 1e-7)
def gpu():
    oc = numpy.empty_like(ia, dtype=ia.dtype)
    blocks_per_grid = (num + threads_per_block - 1) // threads_per_block
    mod = SourceModule("""
    __global__ void vectorAdd(const float *A, const float *B, float *C,
                              int numElements)
    {
        int i = blockDim.x * blockIdx.x + threadIdx.x;
        if (i < numElements)
        {
            C[i] = sinf(A[i]) + sinf(B[i]);
        }
    }
    """)
    vec_add = mod.get_function("vectorAdd")
    vec_add(cuda.In(ia), cuda.In(ib), cuda.Out(oc), numpy.int32(num),
            block=(threads_per_block, 1, 1),
            grid=(blocks_per_grid, 1, 1), shared=0)
    return oc
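# A sketch of the module-level globals gpu() relies on (the names ia, ib, num,
# and threads_per_block come from the function above; the sizes and values
# here are assumptions, chosen to match the kernel's float32 element type).
import numpy
import pycuda.autoinit  # noqa: F401  (creates a CUDA context)
import pycuda.driver as cuda
from pycuda.compiler import SourceModule

num = 1 << 20
threads_per_block = 256
ia = numpy.random.rand(num).astype(numpy.float32)
ib = numpy.random.rand(num).astype(numpy.float32)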
def compute_new_pendulum_states_amount_of_chaos_adaptive_step_size_method(
        self, currentStates, amountOfChaos, timeAlreadyExecuted,
        maxTimeToExecute, startFromDefaultState):
    logger.info('Computing new pendulum states with ' + str(self.algorithm.name) + ' method')
    logger.info('Using the "amount of chaos" kernel')
    logger.info('time step: ' + str(self.timeStep) + ' seconds')
    logger.info('error tolerance: ' + str(self.errorTolerance))
    logger.info('amount of time already computed: ' + str(timeAlreadyExecuted) + ' seconds')
    logger.info('max time to simulate: ' + str(maxTimeToExecute) + ' seconds')
    logger.info('amount of time to simulate: ' + str(maxTimeToExecute - timeAlreadyExecuted) + ' seconds')

    # Compute the double pendulum fractal image.
    logger.info('Running pendulum simulation kernel...')
    kernelStart = time.time()
    self.computeDoublePendulumFractalWithAmountOfChaosMethod(
        self.npFloatType(self.point1Mass), self.npFloatType(self.point2Mass),
        self.npFloatType(self.pendulum1Length), self.npFloatType(self.pendulum2Length),
        self.npFloatType(self.gravity),
        self.npFloatType(self.angle1Min), self.npFloatType(self.angle1Max),
        self.npFloatType(self.angle2Min), self.npFloatType(self.angle2Max),
        cuda.InOut(currentStates),
        cuda.In(amountOfChaos),
        np.int32(startFromDefaultState),
        self.npFloatType(timeAlreadyExecuted),
        np.int32(self.numberOfAnglesToTestX), np.int32(self.numberOfAnglesToTestY),
        self.npFloatType(self.timeStep),
        self.npFloatType(self.errorTolerance),
        self.npFloatType(maxTimeToExecute),
        # block=(1, 1, 1), grid=(1, 1))
        # block=(2, 2, 1), grid=(1, 1))
        # block=(4, 4, 1), grid=(4, 4))
        # block=(8, 8, 1), grid=(8, 8))
        block=(16, 16, 1), grid=(16, 16))
        # block=(32, 32, 1), grid=(32, 32))

    # Print the time it took to run the kernel.
    timeToExecuteLastKernel = time.time() - kernelStart
    logger.info('Completed pendulum simulation kernel in ' + str(timeToExecuteLastKernel) + ' seconds')
def test_pycuda_only():
    """Run a pycuda-only example to test that pycuda works."""
    from pycuda.compiler import SourceModule
    mod = SourceModule("""
    __global__ void multiply_them(float *dest, float *a, float *b)
    {
        const int i = threadIdx.x;
        dest[i] = a[i] * b[i];
    }
    """)
    multiply_them = mod.get_function("multiply_them")

    # Test with pycuda In/Out wrappers around numpy.ndarray.
    a = np.random.randn(100).astype(np.float32)
    b = np.random.randn(100).astype(np.float32)
    dest = np.zeros_like(a)
    # The block size must match the array length: the kernel indexes by
    # threadIdx.x only, so launching 400 threads for 100 elements would
    # write past the end of the buffers.
    multiply_them(drv.Out(dest), drv.In(a), drv.In(b),
                  block=(100, 1, 1), grid=(1, 1))
    assert (dest == a * b).all()
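# A variant sketch (not from the original test) that derives the block size
# from the array length instead of hard-coding it, avoiding the out-of-bounds
# launch fixed above; valid for arrays of up to 1024 elements.
def multiply_them_sized(multiply_them, a, b):
    dest = np.zeros_like(a)
    multiply_them(drv.Out(dest), drv.In(a), drv.In(b),
                  block=(a.shape[0], 1, 1), grid=(1, 1))
    return dest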
def run(path):
    # im = Image.open(path)
    # im.show()
    img1 = np.array(Image.open(path).convert('RGB'))  # open the image and convert it to a numeric matrix
    # print(type(img1))
    img2 = np.ones_like(img1) * 255

    # Call the GPU module through CUDA.
    operatepic = mod.get_function('operatepic')
    test = mod.get_function('test')

    print(img1.shape)
    h, w, c = img1.shape
    k = 21  # convolution kernel size
    b = Count * 10  # number of blocks
    t = 128  # threads per block
    info = [h, w, k, b, t]
    info = np.int32(info)
    print(info)

    start = time.time()
    # Choose block/grid sizes to suit your GPU; exceeding its limits raises an error.
    for i in range(10):
        operatepic(drv.In(info), drv.In(img1), drv.Out(img2),
                   block=(t, 1, 1), grid=(b, 1))
    stop = time.time()
    print((stop - start) / 10)
def avg_vote_gpu(nnf, img, patch_size):
    # Read the kernel source with a context manager so the file is closed.
    with open(os.path.join(package_directory, "GeneralizedPatchMatch.cu")) as f:
        mod = SourceModule(f.read(), no_extern_c=True)
    avg_vote = mod.get_function("avg_vote")
    output = np.zeros([img.shape[-1], *nnf.shape[:2]], dtype=np.float32)
    threads = 20
    avg_vote(drv.In(xy_to_int(nnf)),
             drv.In(np.ascontiguousarray(img.transpose(2, 0, 1))),
             drv.InOut(output),
             np.int32(img.shape[-1]),
             np.int32(nnf.shape[0]), np.int32(nnf.shape[1]),
             np.int32(img.shape[0]), np.int32(img.shape[1]),
             np.int32(patch_size),
             block=(threads, threads, 1),
             grid=(get_blocks_for_dim(nnf.shape[1], threads),
                   get_blocks_for_dim(nnf.shape[0], threads)))
    return np.ascontiguousarray(output.transpose(1, 2, 0))
def remove_empty_anchor(view, anchors, limit):
    # input:
    #   anchors: (N, 4), 4 -> (y1, x1, y2, x2) (x > y)
    #   view: (W, H, C)
    mod = cuda.module_from_buffer(module_buff)
    func = mod.get_function('_Z12remove_emptyPfPiS_S0_S0_')
    anchors_shape = np.array(anchors.shape).astype(np.int32)
    view_shape = np.array(view.shape).astype(np.int32)
    index = np.zeros((anchors.shape[0], view_shape[2])).astype(np.float32)
    func(
        cuda.InOut(index),
        cuda.In(anchors),
        cuda.In(view),
        cuda.In(anchors_shape),
        cuda.In(view_shape),
        # a thread <-> a value at a specific 2-D position (channels are summed)
        block=(int(view_shape[2]), 1, 1),
        # a grid <-> an anchor and a line (x); 50 must exceed the anchor width
        grid=(int(anchors_shape[0]), 50, 1)
    )
    index = np.sum(index, axis=1)
    return np.where(index > limit)[0]
def interpolate(self, x):
    r = R.from_euler('xyz', x, degrees=True)
    for i, pos in enumerate(self.sensor_pos[:self.number_of_sensors]):
        self.input[i] = r.apply(pos)
    self.interpol(np.int32(self.number_of_sensors), drv.In(self.input),
                  drv.Out(self.output), texrefs=[self.texref],
                  block=self.bdim, grid=self.gdim)
    output_rot = np.zeros((self.number_of_sensors, 3))
    for i in range(self.number_of_sensors):
        output_rot[i] = r.inv().apply(self.output[i])
    return self.input, self.output, output_rot
def speed(n_samples, n_states, n_features, W1, W2):
    module = SourceModule(tpl.render(W1=W1, W2=W2))
    func = module.get_function('log_diag_mvn_likelihood')

    sequences = np.zeros((n_samples, n_features), dtype=np.float32)
    means = np.zeros((n_states, n_features), dtype=np.float32)
    variances = np.ones((n_states, n_features), dtype=np.float32)
    logvariances = np.ones((n_states, n_features), dtype=np.float32)  # note: unused; the call passes np.log(variances)
    loglikelihoods = np.zeros((n_samples, n_states), dtype=np.float32)
    N_THREADS = 4096

    start = cuda.Event()
    end = cuda.Event()
    start.record()
    func(cuda.In(sequences), cuda.In(means), cuda.In(variances),
         cuda.In(np.log(variances)),
         np.int32(n_samples), np.int32(n_states), np.int32(n_features),
         cuda.InOut(loglikelihoods),
         block=(W1 * W2, 1, 1),
         grid=(N_THREADS // (W1 * W2), 1))  # integer division keeps grid dims ints
    end.record()
    end.synchronize()

    return {
        'n_samples': n_samples,
        'n_states': n_states,
        'n_features': n_features,
        'W1': W1,
        'W2': W2,
        'time': np.around(start.time_till(end), decimals=3),
    }
def calc_next_world_gpu(world, next_world):
    height, width = world.shape
    # Define the CUDA kernel.
    mod = SourceModule("""
    __global__ void get_next_world(int *world, int *nextWorld, int height, int width)
    {
        int x = threadIdx.x + blockIdx.x * blockDim.x;
        int y = threadIdx.y + blockIdx.y * blockDim.y;
        const int index = y * width + x;
        int current_value;
        int next_value;

        if (x >= width) { return; }
        if (y >= height) { return; }
        current_value = world[index];

        // Add height/width before the modulo: C's % keeps the sign of the
        // dividend, so (y - 1) % height is negative at the top edge and
        // would index out of bounds.
        int ym1 = (y - 1 + height) % height;
        int yp1 = (y + 1) % height;
        int xm1 = (x - 1 + width) % width;
        int xp1 = (x + 1) % width;

        int numlive = 0;
        numlive += world[ym1 * width + xm1];
        numlive += world[ym1 * width + x];
        numlive += world[ym1 * width + xp1];
        numlive += world[y * width + xm1];
        numlive += world[y * width + xp1];
        numlive += world[yp1 * width + xm1];
        numlive += world[yp1 * width + x];
        numlive += world[yp1 * width + xp1];

        if (current_value == 0 && numlive == 3) {
            next_value = 1;
        } else if (current_value == 1 && numlive >= 2 && numlive <= 3) {
            next_value = 1;
        } else {
            next_value = 0;
        }
        nextWorld[index] = next_value;
    }
    """)
    set_next_cell_value_GPU = mod.get_function("get_next_world")
    block = (BLOCKSIZE, BLOCKSIZE, 1)
    grid = ((width + block[0] - 1) // block[0],
            (height + block[1] - 1) // block[1])
    set_next_cell_value_GPU(cuda.In(world), cuda.Out(next_world),
                            numpy.int32(height), numpy.int32(width),
                            block=block, grid=grid)
def simulate_grid(module, Nobs, N, oversampling, datahits, datatimes, radius,
                  d, dN, pa, ps, seed=666, Nthreads=64):
    Nruns = N * N
    Nphotons = Nobs * Nruns * oversampling
    Nobs *= oversampling
    print "Total Threads: %s" % Nphotons
    assert(Nphotons <= 1.1e8)
    d = np.uint32(d)
    dN = np.uint32(dN)
    radius = np.float32(radius)
    Ndoms = np.uint32(pow((d / dN) * 2 + 1, 3) - 1)

    t1 = time.time()
    rng_states = get_rng_states(module, Nphotons, seed=seed)
    t2 = time.time()
    d_list = get_doms(module, Ndoms, radius, d, dN)
    t3 = time.time()

    x = np.linspace(-20, 20, N)
    y = np.linspace(-20, 20, N)
    X, Y = np.meshgrid(x, y)
    Z = np.zeros(N * N)
    pInit = np.concatenate([X.flatten(), Y.flatten(), Z]).astype(np.float32)
    t4 = time.time()
    print "t2-t1: ", t2 - t1
    print "t3-t2: ", t3 - t2
    print "t4-t3: ", t4 - t3

    start = time.time()
    # Note: the datahits/datatimes arguments are shadowed by these fresh buffers.
    datahits = np.zeros(Ndoms * Nruns, dtype=np.int32)
    datatimesbinned = np.zeros(Ndoms * Nruns, dtype=np.float32)
    datatimes = np.zeros(Nphotons, dtype=np.float32)
    datapositions = np.zeros(Nphotons * 3, dtype=np.float32)

    simulate = module.get_function('simulate_grid')
    simulate(np.uint64(Nphotons), np.uint64(Nobs), rng_states, d_list,
             cuda.InOut(datahits), cuda.InOut(datatimes),
             cuda.InOut(datatimesbinned), cuda.InOut(datapositions),
             cuda.In(pInit), np.float32(pa), np.float32(ps), np.uint32(Ndoms),
             block=(Nthreads, 1, 1), grid=(Nphotons // Nthreads + 1, 1))
    print "end-start", time.time() - start
    print "sumHits: ", sum(datahits)
    print

    datahits = np.reshape(np.array(datahits, dtype=float), (Nruns, Ndoms)) / oversampling
    datatimesbinned = np.reshape(np.array(datatimesbinned, dtype=float), (Nruns, Ndoms)) / oversampling
    return datahits, np.array(datatimes, dtype=float), datatimesbinned, datapositions
def onestepIteration(dist, timestep, maxit):
    """
    Iterates the image on a 2-D grid through an Euler anisotropic diffusion
    operator with time step `timestep`, `maxit` times.
    """
    image = 1 * dist
    forme = image.shape
    if np.size(forme) > 2:
        sys.exit('Only works on gray images')

    aSize = forme[0] * forme[1]
    xdim = np.int32(forme[0])
    ydim = np.int32(forme[1])

    # Copy the closest interior row/column onto each border (a zero-flux
    # boundary condition).
    image[0, :] = image[1, :]
    image[xdim - 1, :] = image[xdim - 2, :]
    image[:, ydim - 1] = image[:, ydim - 2]
    image[:, 0] = image[:, 1]

    # Reshape the image matrix to 1-D.
    image = image.reshape(aSize, order='C').astype(np.float32)
    final = np.zeros(aSize).astype(np.float32)

    # Block size: B := dim1*dim2*dim3 = 1024
    # Grid size: dim1*dim2*dim3 = ceiling(aSize / B)
    blockX = 1024
    multiplier = aSize / float(1024)
    if multiplier > int(multiplier):
        gridX = int(multiplier + 1)
    else:
        gridX = int(multiplier)

    for k in range(0, maxit):
        diffIteration(drv.In(image), drv.Out(final), ydim, xdim,
                      np.float32(timestep),
                      block=(blockX, 1, 1), grid=(gridX, 1, 1))
        final = final.reshape(forme, order='C')
        final[0, :] = final[1, :]
        final[xdim - 1, :] = final[xdim - 2, :]
        final[:, ydim - 1] = final[:, ydim - 2]
        final[:, 0] = final[:, 1]
        image = final.reshape(aSize, order='C').astype(np.float32)

    return final.reshape(forme, order='C')
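# The grid-size computation above is a ceiling division; an equivalent, more
# idiomatic sketch of the same arithmetic (grid_1d is a hypothetical helper,
# not part of the original):
def grid_1d(n_elements, block_x=1024):
    """Smallest grid_x such that block_x * grid_x >= n_elements."""
    return (n_elements + block_x - 1) // block_x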
def hitungcuda(a5):
    # cuda.init()
    # device = cuda.Device(0)
    # ctx = device.make_context()
    from pycuda.compiler import SourceModule
    mod = SourceModule("""
    __global__ void conv5(float *r5r, float *r5i, float *a5, float *f5r, float *f5i)
    {
        const int i = blockDim.x * blockIdx.x + threadIdx.x;
        const int j = blockDim.y * blockIdx.y + threadIdx.y;
        int Idx = i + j * blockDim.x * gridDim.x;
        r5r[Idx] = a5[Idx] * f5r[Idx];
        r5i[Idx] = a5[Idx] * f5i[Idx];
    }
    """)
    # ctx.pop()
    conv5 = mod.get_function("conv5")
    conv5(cuda.Out(r5r), cuda.Out(r5i), cuda.In(a5), cuda.In(f5r),
          cuda.In(f5i),
          block=(68, 4, 1), grid=(5, 5))
def run_gpu(self, list_one, list_two, dimension, window):
    """Run the CUDA GPU computation."""
    sys.stdout.flush()
    list_one_float = np.array(list_one).astype(np.float32)
    list_two_float = np.array(list_two).astype(np.float32)
    dest = np.zeros_like(list_one_float)

    xdim = self.MAX_THREADS * self.capability[0]
    ydim = 1
    zdim = 1
    array_len = np.asarray(np.int32(len(dest)))
    blocks_per_grid = int(math.ceil(len(dest) / float(xdim)))

    self.cuda_exec_func(
        drv.Out(dest),
        drv.In(list_one_float),
        drv.In(list_two_float),
        drv.In(array_len),
        block=(xdim, ydim, zdim),
        grid=(blocks_per_grid, 1)
    )
    return dest.tolist()
def cuda_hamming_dist(self, vec_a, vec_b):
    # dest = numpy.zeros_like(vec_b)
    dest = numpy.array(vec_b)
    length = numpy.array([vec_b.shape[0]]).astype(numpy.uint64)
    # for d in dest:
    #     print d
    custom_grid = (int(math.ceil(float(length[0]) / (50 * 256))), 1)
    print "custom grid: ", custom_grid
    self.hamming_dist(drv.In(vec_a), drv.InOut(dest), drv.In(length),
                      block=self.block, grid=custom_grid)
    print dest
    # for d in dest:
    #     print d
    print dest.shape
    return dest
def compute_hist(values, bins):
    hist = np.zeros(bins).astype(np.int32)
    hist_func = mod.get_function("hist")
    block = (128, 1, 1)
    grid = ((len(values) + block[0] - 1) // block[0], 1, 1)
    hist_func(cuda.In(values), np.int32(len(values)),
              cuda.InOut(hist), np.int32(bins),
              grid=grid, block=block)
    return hist
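# The "hist" kernel itself is not shown in this snippet. A plausible minimal
# implementation is sketched below (an assumption, not the original source):
# each thread clamps its value to a bin index and uses atomicAdd so that
# concurrent increments of the same bin do not race. It assumes `values` is
# float32 and already scaled to bin indices.
import pycuda.autoinit  # noqa: F401  (a context is needed to compile)
from pycuda.compiler import SourceModule

hist_mod_sketch = SourceModule("""
__global__ void hist(const float *values, int n, int *out, int bins)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i >= n) return;
    int b = (int) values[i];
    if (b < 0) b = 0;
    if (b >= bins) b = bins - 1;
    atomicAdd(&out[b], 1);
}
""")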
def __call__(self, Qdrift, Tdrift, dT, Pdrift, dP, **kwds):
    depos = numpy.vstack((
        Qdrift.cpu().numpy(),
        Tdrift.cpu().numpy(),
        dT.cpu().numpy(),
        Pdrift.cpu().numpy(),
        dP.cpu().numpy())).T.copy(order='C')
    ndepos = depos.shape[0]

    tbeg = self.tb.bin_trunc(Tdrift - self.nsigma * dT).cpu().numpy()
    tend = self.tb.bin_trunc(Tdrift + self.nsigma * dT).cpu().numpy()
    pbeg = self.pb.bin_trunc(Pdrift - self.nsigma * dP).cpu().numpy()
    pend = self.pb.bin_trunc(Pdrift + self.nsigma * dP).cpu().numpy()
    tnum = tend - tbeg + 1
    pnum = pend - pbeg + 1

    Nticks = numpy.zeros(ndepos) + self.shape[1]
    offsets = numpy.vstack((tbeg, pbeg, Nticks))
    offsets = offsets.T.copy(order='C')

    out = numpy.zeros(self.shape, dtype=numpy.float32)
    t0 = time.time()  # start timing here; t0 was never defined in the original
    for idepo in range(ndepos):
        depo = depos[idepo]
        offset = offsets[idepo]
        block = (int(tnum[idepo]), int(pnum[idepo]), 1)
        # print(idepo, block, offset)
        print(idepo, depo)
        self.meth(drv.InOut(out), drv.In(self.bindesc), drv.In(offset),
                  drv.In(depo), block=block)
    t1 = time.time()
    print(t1 - t0)
    return out