def forward1d(sig, fourier_pts, eps=None):
    """Evaluate a 1-D non-uniform DFT of ``sig`` at ``fourier_pts`` on the GPU.

    For every frequency ``w`` in ``fourier_pts`` the kernel accumulates
    ``sum_k sig[k] * exp(-1j * g[k] * w)`` with ``g[k] = k - sig.size // 2``.

    Args:
        sig: Array of samples; converted to ``complex64``.
        fourier_pts: Array of frequencies; converted to ``complex64``.
        eps: Accepted for interface compatibility; not used by this function.

    Returns:
        Tuple ``(res, 0)`` where ``res`` is a ``complex64`` array holding one
        value per entry of ``fourier_pts``.
    """
    sig = sig.astype(np.complex64)
    fourier_pts = fourier_pts.astype(np.complex64)
    n_sig = sig.size
    n_pts = fourier_pts.size

    # One output per frequency (not per input sample).
    res = np.zeros(fourier_pts.shape, dtype=np.complex64)
    if n_sig == 0 or n_pts == 0:
        # Nothing to sum / nothing to evaluate; a zero-block launch is invalid.
        return res, 0

    # Centred integer sample positions, built once on the host instead of
    # being rewritten by every GPU thread.
    grid = (np.arange(n_sig) - n_sig // 2).astype(np.complex64)

    mod = SourceModule("""
    #include <pycuda-complex.hpp>

    __global__ void dft1(pycuda::complex<float> *signal,
                         pycuda::complex<float> *fourier_pts,
                         pycuda::complex<float> *grid,
                         int n_sig, int n_pts,
                         pycuda::complex<float> *res)
    {
        // One thread per output frequency; surplus threads exit early.
        int idx = threadIdx.x + blockDim.x * blockIdx.x;
        if (idx >= n_pts) {
            return;
        }

        pycuda::complex<float> j(0, -1);
        pycuda::complex<float> tmp(0, 0);
        pycuda::complex<float> w = fourier_pts[idx];
        for (int i = 0; i < n_sig; i++) {
            tmp += exp(j * grid[i] * w) * signal[i];
        }
        res[idx] = tmp;
    }
    """)

    threads = 256
    bdim = (threads, 1, 1)
    # Integer ceil-division: launch dimensions must be Python ints.
    gridm = ((n_pts + threads - 1) // threads, 1)

    func = mod.get_function("dft1")
    func(cuda.In(sig), cuda.In(fourier_pts), cuda.In(grid),
         np.int32(n_sig), np.int32(n_pts), cuda.Out(res),
         block=bdim, grid=gridm)
    return res, 0
def gpu_curand_init(seed):
    """(Re)create the module-level RNG state and initialise it with ``seed``.

    Any existing state is released through ``gpu_curand_deinit()`` first.
    A fresh ``np.intp`` array with ``_NUM_THREADS`` entries is stored in
    ``_rng_state`` and handed to ``_gpu_curand_init`` as an output buffer,
    using a single block of ``_NUM_THREADS`` threads.
    """
    global _rng_state
    global _NUM_THREADS

    if _rng_state is not None:
        gpu_curand_deinit()

    seed32 = np.int32(seed)
    thread_count = _NUM_THREADS
    _rng_state = np.zeros(thread_count, np.intp)
    _gpu_curand_init(
        seed32,
        cuda.Out(_rng_state),
        block=(thread_count, 1, 1),
        grid=(1, 1),
    )
def run(self, x):
    """Run ``self.prg`` on ``x`` and return the first ``self.n_row`` results.

    ``x`` is uploaded to the device, the kernel is launched with one block
    per slice (``self.slice_count`` blocks of ``self.slice_height`` threads)
    and writes into a float32 buffer of ``slice_height * slice_count``
    entries, which is then trimmed to ``self.n_row`` elements.
    """
    x_on_device = drv.to_device(x)

    height = self.slice_height
    slices = self.slice_count
    padded_y = np.empty(height * slices, dtype=np.float32)

    launch = {"block": (height, 1, 1), "grid": (slices, 1)}
    self.prg(
        self.dev_sell_sliceptr,
        self.dev_sell_slicecol,
        self.dev_sell_colidx,
        self.dev_sell_val,
        x_on_device,
        np.int32(height),
        drv.Out(padded_y),
        **launch
    )
    return padded_y[:self.n_row]
def kernel(self):
    """Drain ``self.work_queue`` and launch ``cuda_texture_func`` per entry.

    Unpacks ``self.born_args`` into the density, the coordinate vectors
    (x, y, z, Qx, Qy, Qz) and the result container, uploads the vectors to
    the GPU, and then processes queued ``qxi`` indices until the queue is
    empty.

    NOTE(review): this block uses Python 2 syntax (print statements and an
    unparenthesised tuple in the list comprehensions).
    NOTE(review): ``texref``, ``cuda_texture_func`` and ``ctemp`` are not
    defined in this function -- presumably module-level names; confirm.
    """
    density, x, y, z, Qx, Qy, Qz, result = self.born_args
    # Lengths of each coordinate vector, as 32-bit ints for the kernel call.
    nx, ny, nz, nqx, nqy, nqz = [
        numpy.int32(len(v)) for v in x, y, z, Qx, Qy, Qz
    ]
    # Device copies of every input vector.
    cx, cy, cz, cQx, cQy, cQz, cdensity = [
        gpuarray.to_gpu(v) for v in x, y, z, Qx, Qy, Qz, density
    ]
    # Device buffer sized for one result frame; only referenced by the
    # disabled code path below and by the final ``del``.
    cframe = cuda.mem_alloc(result[0].nbytes)
    # NOTE(review): ``nx`` is the scalar length computed above, not a
    # matrix -- verify that this is the intended texture source.
    cuda.matrix_to_texref(nx, texref, order="C")
    texref.set_filter_mode(cuda.filter_mode.LINEAR)
    # Only used by the disabled ``cuda_partition(n)`` call below.
    n = int(1 * nqy * nqz)
    print 'fn in kernel'
    while True:
        # Non-blocking get: an empty queue ends the loop.
        try:
            qxi = numpy.int32(self.work_queue.get(block=False))
        except Queue.Empty:
            break
        print "%d of %d on %d\n" % (qxi, nqx, self.gpu),
        cuda_texture_func(nx, ny, nz, nqx, nqy, nqz, cdensity, cx, cy, cz,
                          cQx, cQy, cQz, qxi, cuda.Out(result),
                          texrefs=[texref])
        # Disabled alternative path, kept for reference:
        #self.cudaBorn(nx,ny,nz,nqx,nqy,nqz,
        #cdensity,cx,cy,cz,cQx,cQy,cQz,qxi,cframe,
        #**cuda_partition(n))
        ## Delay fetching result until the kernel is complete
        #cuda_sync()
        ## Fetch result back to the CPU
        #cuda.memcpy_dtoh(result[qxi], cframe)
        print "%d %s\n" % (qxi, ctemp.get())
    # Drop the references to the device allocations.
    del cx, cy, cz, cQx, cQy, cQz, cdensity, cframe
def calc_next_world_gpu(world, next_world):
    """Advance a Game of Life board by one generation on the GPU.

    The board is treated as a torus: neighbours of edge cells wrap around to
    the opposite edge.

    Args:
        world: 2-D array holding the current generation (0 = dead, 1 = alive).
            The kernel reads it as C ``int``, so it presumably has to be a
            C-contiguous ``int32`` array -- confirm at the call sites.
        next_world: Array of the same shape and dtype; overwritten in place
            with the next generation.
    """
    height, width = world.shape

    # Define the CUDA kernel.
    mod = SourceModule("""
    __global__ void get_next_world(int *world, int *nextWorld, int height, int width)
    {
        int x = threadIdx.x + blockIdx.x * blockDim.x;
        int y = threadIdx.y + blockIdx.y * blockDim.y;
        if (x >= width || y >= height) {
            return;
        }
        const int index = y * width + x;

        // Wrap-around neighbour coordinates.  The dimension is added before
        // taking the remainder because in C (-1) % n evaluates to -1, which
        // would index in front of the array for cells on the first row/column.
        const int left  = (x + width - 1) % width;
        const int right = (x + 1) % width;
        const int up    = (y + height - 1) % height;
        const int down  = (y + 1) % height;

        int numlive = 0;
        numlive += world[up   * width + left];
        numlive += world[up   * width + x];
        numlive += world[up   * width + right];
        numlive += world[y    * width + left];
        numlive += world[y    * width + right];
        numlive += world[down * width + left];
        numlive += world[down * width + x];
        numlive += world[down * width + right];

        const int current_value = world[index];
        int next_value;
        if (current_value == 0 && numlive == 3) {
            next_value = 1;
        } else if (current_value == 1 && numlive >= 2 && numlive <= 3) {
            next_value = 1;
        } else {
            next_value = 0;
        }
        nextWorld[index] = next_value;
    }
    """)
    set_next_cell_value_GPU = mod.get_function("get_next_world")

    block = (BLOCKSIZE, BLOCKSIZE, 1)
    # Enough blocks to cover the board; surplus threads exit in the kernel.
    grid = ((width + block[0] - 1) // block[0],
            (height + block[1] - 1) // block[1])

    set_next_cell_value_GPU(cuda.In(world), cuda.Out(next_world),
                            numpy.int32(height), numpy.int32(width),
                            block=block, grid=grid)
def _replicate_border(grid2d, rows, cols):
    """Copy the neighbouring interior row/column onto each edge, in place."""
    grid2d[0, :] = grid2d[1, :]
    grid2d[rows - 1, :] = grid2d[rows - 2, :]
    grid2d[:, cols - 1] = grid2d[:, cols - 2]
    grid2d[:, 0] = grid2d[:, 1]


def onestepIteration(dist, timestep, maxit):
    """
    iterates the function image on a 2d grid through an euler anisotropic
    diffusion operator with timestep=timestep maxit number of times

    Args:
        dist: 2-D (gray-level) image array; it is copied, not modified.
        timestep: Euler time step, passed to the kernel as ``float32``.
        maxit: Number of diffusion iterations.  With ``maxit <= 0`` the
            border-replicated input is returned unchanged (as ``float32``).

    Returns:
        ``float32`` array with the same shape as ``dist``.

    Exits through ``sys.exit`` when ``dist`` has more than two dimensions.
    """
    image = 1 * dist  # work on a copy so the caller's array is untouched
    forme = image.shape
    if np.size(forme) > 2:
        sys.exit('Only works on gray images')

    aSize = forme[0] * forme[1]
    xdim = np.int32(forme[0])
    ydim = np.int32(forme[1])

    _replicate_border(image, xdim, ydim)

    # The kernel works on flat, row-major float32 buffers.
    current = image.reshape(aSize, order='C').astype(np.float32)
    scratch = np.zeros(aSize, dtype=np.float32)

    # One thread per pixel: 1024-thread blocks, ceil(aSize / 1024) of them.
    blockX = 1024
    gridX = (aSize + blockX - 1) // blockX

    for _ in range(maxit):
        diffIteration(drv.In(current), drv.Out(scratch), ydim, xdim,
                      np.float32(timestep),
                      block=(blockX, 1, 1), grid=(gridX, 1, 1))
        # Re-impose the border condition on the fresh result (the reshape is
        # a view, so the edits land in ``scratch``).
        _replicate_border(scratch.reshape(forme, order='C'), xdim, ydim)
        # The result becomes the next input; the old input is reused as the
        # next output buffer.
        current, scratch = scratch, current

    return current.reshape(forme, order='C')
def hitungcuda(a5):
    """Multiply ``f5r`` and ``f5i`` element-wise by ``a5`` on the GPU.

    The products are written into ``r5r`` and ``r5i``.  Those four arrays
    are not parameters; they are looked up outside this function.

    The launch shape is fixed at 5x5 blocks of 68x4 threads and the kernel
    has no bounds check.
    NOTE(review): that covers 340 * 20 = 6800 elements, so the arrays
    presumably have exactly that size -- confirm.
    """
    from pycuda.compiler import SourceModule

    kernel_source = """
    __global__ void conv5(float *r5r, float *r5i, float *a5, float *f5r, float *f5i)
    {
        const int i = blockDim.x * blockIdx.x + threadIdx.x;
        const int j = blockDim.y * blockIdx.y + threadIdx.y;
        int Idx = i + j * blockDim.x * gridDim.x;
        r5r[Idx] = a5[Idx] * f5r[Idx];
        r5i[Idx] = a5[Idx] * f5i[Idx];
    }
    """
    conv5 = SourceModule(kernel_source).get_function("conv5")

    launch_shape = {"block": (68, 4, 1), "grid": (5, 5)}
    conv5(
        cuda.Out(r5r),
        cuda.Out(r5i),
        cuda.In(a5),
        cuda.In(f5r),
        cuda.In(f5i),
        **launch_shape
    )
def matmul(self, M, N, timed=False):
    """
    Compute M*N with the compiled kernel and return the product.

    M and N are assumed to be square matrices of identical dimension whose
    dtype is compatible with the kernel.  With ``timed=True`` the return
    value is ``(P, milliseconds)``, where the second item is the GPU time
    measured between two CUDA events placed around the launch; otherwise
    only ``P`` is returned.
    """
    n = np.shape(M)[0]
    P = np.empty((n, n), dtype=self.dtype)  # output, filled by the kernel

    tile = self.tile_width
    tiles_per_side = int(np.ceil(n / float(tile)))
    block_dim = (tile, tile, 1)
    grid_dim = (tiles_per_side, tiles_per_side, 1)

    def launch():
        self.kernelfunc(cuda.In(M), cuda.In(N), cuda.Out(P), np.int32(n),
                        block=block_dim, grid=grid_dim)

    if not timed:
        launch()
        return P

    start = cuda.Event()
    end = cuda.Event()
    start.record()
    launch()
    end.record()
    end.synchronize()
    return P, start.time_till(end)
def calculate_rates(lat: np.ndarray, lon: np.ndarray, pop: np.ndarray,
                    parameters: dict) -> tuple:
    """Compute pairwise distance and rate matrices on GPU device 0.

    A CUDA context is created for the call and is always popped again, even
    if building the kernel, reading the parameters or the launch fails.

    Args:
        lat: Per-location array passed to the kernel as input.
        lon: Per-location array passed to the kernel as input.
        pop: Per-location array passed to the kernel as input.
            NOTE(review): the element type the kernel expects for these
            three arrays is not visible here -- confirm.
        parameters: Optional overrides ``"p0"``..``"p3"``; the defaults are
            1.0, 1.0, 1.0 and -2.0.

    Returns:
        Tuple ``(distances, rates)`` of ``float32`` arrays with shape
        ``(len(lat), len(lat))``.
    """
    cuda.init()
    context = cuda.Device(0).make_context()
    try:
        kernel_fn = get_kernel()

        p0 = float(parameters.get("p0", 1.0))
        p1 = float(parameters.get("p1", 1.0))
        p2 = float(parameters.get("p2", 1.0))
        p3 = float(parameters.get("p3", -2.0))

        count = len(lat)
        distances = np.zeros((count, count), dtype=np.float32)
        rates = np.zeros((count, count), dtype=np.float32)

        # Square launch: enough 32x32 blocks to cover the count x count grid.
        BLOCK_DIM = 32
        GRID_DIM = (count + BLOCK_DIM - 1) // BLOCK_DIM

        kernel_fn(np.uint32(count), cuda.In(lat), cuda.In(lon), cuda.In(pop),
                  np.float32(p0), np.float32(p1), np.float32(p2),
                  np.float32(p3), cuda.Out(distances), cuda.Out(rates),
                  block=(BLOCK_DIM, BLOCK_DIM, 1),
                  grid=(GRID_DIM, GRID_DIM))
    finally:
        context.pop()
    return distances, rates
def loopboxes_overlaps(vertex, gt_vertex, sample_range, ignore_union):
    """Compute the overlap matrix between two sets of boxes on the GPU.

    Only the first 8 columns of each input are used.

    Args:
        vertex: ``(K, >=8)`` array.
        gt_vertex: ``(N, >=8)`` array.
        sample_range: Passed to ``cuda_overlap`` as ``float32``.
        ignore_union: Passed to ``cuda_overlap`` as ``int32``.

    Returns:
        ``(K, N)`` ``float64`` array of overlaps.  When ``K`` or ``N`` is
        zero the (empty) result is returned without launching the kernel.

    Raises:
        AssertionError: If either input has fewer than 8 columns.
    """
    K, dim = vertex.shape
    N, dim2 = gt_vertex.shape
    assert dim >= 8
    assert dim2 >= 8

    if K == 0 or N == 0:
        # A launch with a zero grid dimension is invalid; nothing to compute.
        return np.zeros((K, N), dtype=np.float64)

    vertex = vertex[:, :8]
    gt_vertex = gt_vertex[:, :8]
    overlaps = np.zeros((K, N), dtype=np.float32)

    with cuda_context():
        cuda_overlap(cuda.In(vertex.astype(np.float32)),
                     cuda.In(gt_vertex.astype(np.float32)),
                     cuda.Out(overlaps),
                     np.int32(K), np.int32(N), np.int32(8),
                     np.float32(sample_range), np.int32(ignore_union),
                     block=(THREAD_NUM, 1, 1),
                     # Floor division keeps the grid dimension an int.
                     grid=(K // THREAD_NUM + 1, N, 1))
    # np.float64 is what the removed ``np.float`` alias used to mean.
    return overlaps.astype(np.float64)
def test_vector_types(self):
    """A ``float3`` passed by value is stored into all 400 output slots."""
    kernel_source = """
    __global__ void set_them(float3 *dest, float3 x)
    {
        const int i = threadIdx.x;
        dest[i] = x;
    }
    """
    set_them = SourceModule(kernel_source).get_function("set_them")

    value = gpuarray.vec.make_float3(1, 2, 3)
    received = np.empty((400), gpuarray.vec.float3)

    # One thread per output element.
    set_them(drv.Out(received), value, block=(400, 1, 1))

    assert (received == value).all()
def gpuAdd(array1, array2):
    """Add two arrays element-wise on the GPU.

    Both inputs are converted to C-ordered ``float32`` copies.  The kernel
    uses one thread per element with a bounds check, so any number of
    elements is handled.

    Args:
        array1: First operand; its shape determines the result shape.
        array2: Second operand; must hold the same number of elements.

    Returns:
        ``float32`` array shaped like ``array1`` holding
        ``array1 + array2``.

    Raises:
        ValueError: If the operands differ in element count.
    """
    array1 = array1.astype(np.float32, order="C")
    array2 = array2.astype(np.float32, order="C")
    if array1.size != array2.size:
        raise ValueError(
            "gpuAdd needs operands with the same number of elements, got "
            "%d and %d" % (array1.size, array2.size))

    dest = np.zeros_like(array1)
    count = array1.size
    if count == 0:
        # Nothing to add; a zero-block launch is invalid.
        return dest

    mod = SourceModule("""
    __global__ void add_them(float *dest, const float *a, const float *b, int n)
    {
        const int i = blockIdx.x * blockDim.x + threadIdx.x;
        if (i < n) {
            dest[i] = a[i] + b[i];
        }
    }
    """)
    add_them = mod.get_function("add_them")

    threads = 256
    blocks = (count + threads - 1) // threads
    add_them(drv.Out(dest), drv.In(array1), drv.In(array2), np.int32(count),
             block=(threads, 1, 1), grid=(blocks, 1))
    return dest
def euclidean_dists(a, bs, n):
    """Run ``parallel_euclidean_dist`` for point ``a`` against ``bs``.

    Only the first two entries of ``a`` are sent to the device.  The kernel
    is launched as one block of ``n`` threads and fills a ``float64`` output
    array of length ``n``, which is returned.

    NOTE(review): with a single block, ``n`` is limited by the device's
    maximum threads per block.
    """
    # Zero-initialised destination buffer.
    dest = np.zeros(n, dtype=np.float64)

    # a[0] and a[1], copied into a C-ordered array of their own; this avoids
    # an error about ndarray contiguity when handing the slice to the driver.
    a_point = a[0:2].copy(order='C')

    launch = dict(block=(n, 1, 1), grid=(1, 1))
    parallel_euclidean_dist(drv.Out(dest), drv.In(a_point), drv.In(bs),
                            **launch)
    return dest
def get_next_state_gpu(state, next_state):
    """Compute the next Game of Life generation on the GPU.

    The board wraps around at its edges (torus topology).

    Args:
        state: 2-D array with the current generation (0 = dead, non-zero =
            alive).  The kernel reads it as C ``int``, so it presumably has
            to be a C-contiguous ``int32`` array -- confirm at the call
            sites.
        next_state: Array of the same shape and dtype; overwritten in place.
    """
    height, width = state.shape

    mod = SourceModule("""
    __global__ void get_next_state(int *state, int *nextState, int height, int width)
    {
        unsigned int ix = threadIdx.x + blockIdx.x * blockDim.x;
        unsigned int iy = threadIdx.y + blockIdx.y * blockDim.y;
        if (ix >= width || iy >= height) {
            return;
        }
        unsigned int idx = iy * width + ix;

        // Wrap-around neighbour coordinates.  The dimension is added before
        // subtracting: with unsigned arithmetic 0 - 1 wraps to 2^32 - 1,
        // and (2^32 - 1) % n equals n - 1 only when n is a power of two.
        unsigned int left  = (ix + width - 1) % width;
        unsigned int right = (ix + 1) % width;
        unsigned int up    = (iy + height - 1) % height;
        unsigned int down  = (iy + 1) % height;

        int sum = 0;
        sum += state[up   * width + left];
        sum += state[up   * width + ix];
        sum += state[up   * width + right];
        sum += state[iy   * width + left];
        sum += state[iy   * width + right];
        sum += state[down * width + left];
        sum += state[down * width + ix];
        sum += state[down * width + right];

        int val = state[idx];
        int nextVal;
        if (val == 0 && sum == 3) {
            nextVal = 1;
        } else if (val != 0 && (sum >= 2 && sum <= 3)) {
            nextVal = 1;
        } else {
            nextVal = 0;
        }
        nextState[idx] = nextVal;
    }
    """)
    kernel_func = mod.get_function("get_next_state")

    blk_dim = (32, 32, 1)
    # Enough blocks to cover the board; surplus threads exit in the kernel.
    grid_dim = ((width + blk_dim[0] - 1) // blk_dim[0],
                (height + blk_dim[1] - 1) // blk_dim[1],
                1)

    kernel_func(drv.In(state), drv.Out(next_state),
                np.int32(height), np.int32(width),
                block=blk_dim, grid=grid_dim)
def lerpImage(imageA, imageB, mu, outfile):
    """Linearly interpolate two RGB images on the GPU and save the result.

    Every channel of the output is ``a + mu * (b - a)``.

    Args:
        imageA: Path (or file object) of the first image.
        imageB: Path (or file object) of the second image.
        mu: Interpolation weight; 0 gives ``imageA``, 1 gives ``imageB``.
        outfile: Destination the blended image is written to.

    Raises:
        ValueError: If the images are not 3-channel arrays of equal shape
            (the kernel addresses exactly three floats per pixel).
    """
    # Read the pixels as float32.
    pxA = np.array(Image.open(imageA)).astype(np.float32)
    pxB = np.array(Image.open(imageB)).astype(np.float32)

    if pxA.ndim != 3 or pxA.shape[2] != 3:
        raise ValueError("lerpImage expects 3-channel (RGB) images")
    if pxA.shape != pxB.shape:
        raise ValueError("lerpImage expects two images of identical size")

    # One thread per pixel; each thread blends the pixel's three channels.
    kernel = """
    __global__ void lerpImage(float *lerped, float *a, float *b, float mu, int check)
    {
        int i = (threadIdx.x) + blockDim.x * blockIdx.x;
        if (i < check) {
            lerped[i*3]   = a[i*3]   + mu * (b[i*3]   - a[i*3]);
            lerped[i*3+1] = a[i*3+1] + mu * (b[i*3+1] - a[i*3+1]);
            lerped[i*3+2] = a[i*3+2] + mu * (b[i*3+2] - a[i*3+2]);
        }
    }
    """

    # Define block and grid: 1024-thread blocks covering all pixels.
    dim = pxA.shape[0] * pxA.shape[1]
    checkSize = np.int32(dim)
    BLOCK_SIZE = 1024
    block = (BLOCK_SIZE, 1, 1)
    grid = (dim // BLOCK_SIZE + 1, 1, 1)

    lerpedPx = np.zeros_like(pxA)

    # Compile and launch.
    mod = SourceModule(kernel)
    func = mod.get_function("lerpImage")
    func(cuda.Out(lerpedPx), cuda.In(pxA), cuda.In(pxB), np.float32(mu),
         checkSize, block=block, grid=grid)

    # Convert back to 8-bit integers and save.
    lerpedPx = np.uint8(lerpedPx)
    imOut = Image.fromarray(lerpedPx, mode="RGB")
    imOut.save(outfile)
    print("Wrote to image file %s" % outfile)
def _read_cuda_source(directory, filename):
    """Return the text of ``filename`` inside ``directory``, closing the file."""
    with open(os.path.join(directory, filename), 'r') as handle:
        return handle.read()


def compute(volume, offset):
    """Build and run the ``compute`` kernel over a volume.

    ``cuda/kernel.cu`` (next to this module) is used as a template: its
    ``%DEFINES%`` placeholder receives ``#define`` lines derived from
    ``volume``/``offset`` and ``%CUFILES%`` receives the contents of
    ``vectors.cu`` followed by ``complex2.cu``.

    Args:
        volume: ``(width, height, depth)`` of the volume.
        offset: ``(x, y, z)`` offset; each component is negated in the
            generated defines.

    Returns:
        Flat ``float32`` array with ``width * height * depth`` entries.

    NOTE(review): the grid is ``width // 32`` by ``height // 32`` blocks, so
    a width or height that is not a multiple of 32 leaves a strip without
    threads -- confirm inputs are always multiples of 32.
    """
    bsize = (32, 32, 1)
    gsize = (int(volume[0] / bsize[0]),
             int(volume[1] / bsize[1]),
             int(volume[2]))

    define_values = [
        ('SCALE', 1),
        ('WIDTH', volume[0]),
        ('HEIGHT', volume[1]),
        ('bwidth', bsize[0]),
        ('bheight', bsize[1]),
        ('offx', -offset[0]),
        ('offy', -offset[1]),
        ('offz', -offset[2]),
        ('DEPTH', volume[2]),
        ('bdepth', 1),  # not used
    ]
    DEFINES = ''.join('\n#define ' + name + ' ' + str(value)
                      for name, value in define_values) + '\n'

    # os.path.join keeps the path relative when __file__ has no directory
    # part, instead of turning it into the absolute path '/cuda/'.
    cuda_dir = os.path.join(os.path.dirname(__file__), 'cuda')
    kernel_buf = _read_cuda_source(cuda_dir, 'kernel.cu')
    complex2_buf = _read_cuda_source(cuda_dir, 'complex2.cu')
    vectors_buf = _read_cuda_source(cuda_dir, 'vectors.cu')

    # Splice the helper sources and the defines into the kernel template.
    cu_buffer = vectors_buf + '\n' + complex2_buf
    kernel_buf = kernel_buf.replace('%DEFINES%', DEFINES).replace(
        '%CUFILES%', cu_buffer)

    mod = SourceModule(kernel_buf, "nvcc",
                       include_dirs=["/usr/local/cuda/include"],
                       no_extern_c=True)
    compute_kernel = mod.get_function("compute")

    # Output array.
    dest = np.zeros(volume[0] * volume[1] * volume[2]).astype(np.float32)
    compute_kernel(drv.Out(dest), block=bsize, grid=gsize)
    context.synchronize()
    return dest
def nearest_neighbor(dataset):
    """Run the ``parallel_nn`` kernel on a pair of point arrays.

    Args:
        dataset: Pair ``(a, b)`` of arrays.  The row with index 2 is dropped
            from both before the data is sent to the device.
            NOTE(review): presumably both are ``(3, n)`` arrays whose third
            row is a z coordinate -- confirm.

    Returns:
        ``int32`` array with one entry per column of ``b``, as filled in by
        the kernel.
    """
    a, b = dataset

    # Elements per row of ``b``.  Floor division keeps this an int so it can
    # be used as an array length and as a block dimension.
    n = int(b.size // len(b))

    as_xy = np.delete(a, 2, 0)
    bs_xy = np.delete(b, 2, 0)
    dest = np.zeros((n, ), dtype=np.int32)

    # Single block with one thread per point.
    parallel_nn(drv.Out(dest), drv.In(as_xy), drv.In(bs_xy), np.int32(n),
                block=(n, 1, 1), grid=(1, 1))
    return dest
def cuda_add():
    """Run the ``multiply_them`` kernel on two random 10-element vectors.

    ``a`` is printed before the launch, handed to the kernel as an in/out
    buffer, and returned afterwards; ``dest`` receives the kernel's output
    but is not returned.
    """
    kernel = mod.get_function("multiply_them")

    a = numpy.random.randn(10).astype(numpy.float32)
    b = numpy.random.randn(10).astype(numpy.float32)
    print(a)

    dest = numpy.zeros_like(a)
    c = 2
    launch = {"block": (10, 1, 1), "grid": (1, 1)}
    kernel(cuda.Out(dest), cuda.InOut(a), cuda.In(b), numpy.float64(c),
           **launch)
    return a
def overlap_coord_transfer(gt_boxes):
    """Convert boxes to vertex form via the ``cuda_transfer`` kernel.

    Args:
        gt_boxes: ``(K, n)`` array of boxes, or ``None``.

    Returns:
        * an empty array when ``gt_boxes`` is ``None``;
        * ``gt_boxes`` itself when it has no rows;
        * the first 8 columns of ``gt_boxes`` when it already has at least
          8 columns;
        * otherwise a ``(K, 9)`` ``float64`` array produced by the kernel
          (initialised to -1 before the launch).
    """
    if gt_boxes is None:
        return np.array([])
    if len(gt_boxes) <= 0:
        return gt_boxes

    K, n = gt_boxes.shape
    if n >= 8:
        return gt_boxes[:, :8]

    vertex = np.ones((K, 9), dtype=np.float32) * (-1.0)
    with cuda_context():
        cuda_transfer(cuda.In(gt_boxes.astype(np.float32)),
                      cuda.Out(vertex),
                      np.int32(n), np.int32(K),
                      block=(THREAD_NUM, 1, 1),
                      # Floor division keeps the grid dimension an int.
                      grid=(K // THREAD_NUM + 1, 1))
    # np.float64 is what the removed ``np.float`` alias used to mean.
    return vertex.astype(np.float64)
def h_of_p(px, py, E_k_n, U_k, mu, q, g, theta, debug=0):
    """Evaluate ``h_of_p_GPU`` over the ``N_kx`` x ``N_ky`` grid.

    The wave-vector grids ``reduced_kx`` / ``reduced_ky`` are scaled by
    ``q`` before being passed to the kernel.  The scalar arguments are sent
    as doubles and the grid sizes and ``debug`` as 32-bit ints.

    Returns:
        ``(N_kx, N_ky)`` array filled in by the kernel.
    """
    scaled_kx = q * reduced_kx
    scaled_ky = q * reduced_ky
    h_p = np.zeros((N_kx, N_ky))

    # 16x16 threads per block, with one extra block along each axis.
    threads = (16, 16, 1)
    blocks = (int(N_kx / 16 + 1), int(N_ky / 16 + 1))

    array_inputs = [drv.In(arr) for arr in
                    (px, py, scaled_kx, scaled_ky, E_k_n, U_k)]
    scalar_inputs = [np.double(mu), np.double(q), np.double(g),
                     np.double(theta),
                     np.int32(N_kx), np.int32(N_ky), np.int32(debug)]

    h_of_p_GPU(drv.Out(h_p), *(array_inputs + scalar_inputs),
               block=threads, grid=blocks)
    return h_p
def computeCorrespondence(self):
    """
    Compute the point correspondence from the result PointCloud to dst.

    Calls the parent implementation, launches the CUDA correspondence
    kernel, and then reduces ``self.distances_gpu`` with a GPU sum.

    :return: total distance and a PointCloud built from the matched points
    """
    super(ICPParallel, self).computeCorrespondence()

    # One xyz row per source point, filled in by the kernel.
    matched = np.zeros([self.src.num, 3], dtype=np.float32)
    self.computeCorrespondenceCuda(
        cuda.In(self.result.points),
        cuda.Out(matched),
        self.distances_gpu,
        block=(self.numCore, 1, 1),
    )

    total_distance = gpuarray.sum(self.distances_gpu).get()
    return total_distance, PointCloud(matched)
def hsv2rgb(img):
    """Convert an image with the ``hsv_rgb`` kernel.

    Args:
        img: Image array; ``img.shape[0]`` is used as the height and
            ``img.shape[1]`` as the width.  It is uploaded as ``float32``.

    Returns:
        ``uint8`` array with the same shape as ``img``, filled by the kernel.
    """
    hsv2rgb_func = mod.get_function("hsv_rgb")

    height = img.shape[0]
    width = img.shape[1]

    block = (16, 16, 1)
    # Integer ceil-division: launch dimensions must be ints, and this does
    # not depend on which ``ceil`` / division semantics are in scope.
    grid = ((width + block[0] - 1) // block[0],
            (height + block[1] - 1) // block[1],
            1)

    result = np.empty_like(img, dtype=np.uint8)
    hsv2rgb_func(cuda.In(img.astype(np.float32)), cuda.Out(result),
                 np.int32(width), np.int32(height),
                 grid=grid, block=block)
    return result
def _process_total_test(self):
    """Use PyCuda to sum a random vector and compare against NumPy.

    ``BLOCK_SIZE * 16 + 10`` random ``float32`` values are reduced by
    ``self.cuda_total`` into one partial sum per block.  Only the first
    ``ceil(nBlocks / 2)`` partial sums are added up on the host.
    NOTE(review): presumably each block reduces ``2 * BLOCK_SIZE``
    elements, which would explain using half of the partial sums -- confirm
    against the kernel.

    Returns:
        Tuple ``(sum_cpu, sum_gpu)``.
    """
    nElements = int(BLOCK_SIZE * 16 + 10)
    # Floor division keeps the block count an int (needed for the array
    # shape, the grid dimension and the slice bound below).
    nBlocks = nElements // BLOCK_SIZE + 1
    grid_dimensions = (nBlocks, 1, 1)

    a = np.random.randn(nElements).astype(np.float32)
    sum_cpu = np.sum(a)

    partialsum_gpu = np.zeros((nBlocks, 1), dtype=np.float32)
    self.cuda_total(cuda_driver.In(a), cuda_driver.Out(partialsum_gpu),
                    np.uint32(nElements),
                    grid=grid_dimensions, block=(BLOCK_SIZE, 1, 1))
    cuda_driver.Context.synchronize()

    # Sum the result from the GPU.
    print(nBlocks)
    print(partialsum_gpu)
    used_blocks = (nBlocks + 1) // 2  # == ceil(nBlocks / 2)
    sum_gpu = np.sum(partialsum_gpu[0:used_blocks])
    return sum_cpu, sum_gpu
def gpufunc(xdr_data):
    """Copy a stream of numbers through the GPU and return them as an iterator.

    The values are collected into a ``float32`` array, copied element-wise
    by a CUDA kernel inside a freshly created context on device 0, and
    handed back as an iterator of 0-d NumPy arrays.  The context is popped
    even if compiling or launching the kernel fails.

    Args:
        xdr_data: Iterable of numeric values.

    Returns:
        Iterator over the copied values; empty if ``xdr_data`` is empty (no
        GPU work is done in that case).
    """
    xdr_data = iter(xdr_data)
    inp = np.asarray(list(xdr_data), dtype=np.float32)
    N = len(inp)
    out = np.zeros(N, dtype=np.float32)
    print(inp, out)
    print("B")

    if N == 0:
        # A launch with zero blocks is invalid; there is nothing to copy.
        return iter([])

    # GPU run
    nThreads = 256 * 4
    nBlocks = (N + nThreads - 1) // nThreads

    drv.init()
    contx = drv.Device(0).make_context()
    try:
        # The element count is declared as ``int`` so that it matches the
        # 4-byte np.int32 passed from the host.
        mod = SourceModule("""
        __global__ void func(float *a, float *b, int N)
        {
            const int i = blockIdx.x * blockDim.x + threadIdx.x;
            if (i >= N) {
                return;
            }
            a[i] = b[i];
        }
        """)
        func = mod.get_function("func")

        start = timer()
        func(drv.Out(out), drv.In(inp), np.int32(N),
             block=(nThreads, 1, 1), grid=(nBlocks, 1))
        out1 = [np.asarray(x) for x in out]
    finally:
        contx.pop()

    run_time = timer() - start
    print("gpu run time %f seconds " % run_time)
    return iter(out1)
def zoom_on_square(eclick, erelease):
    """eclick and erelease are the press and release events.

    Re-centres the view on the selected rectangle, shrinks ``side`` to the
    mean of the rectangle's two side lengths, recomputes ``M`` with the
    ``mandel`` kernel and redraws the image and title.
    """
    global n, side, x0, y0, my_obj, M, power

    xs = (eclick.xdata, erelease.xdata)
    ys = (eclick.ydata, erelease.ydata)
    x1, y1 = min(xs), min(ys)
    x2, y2 = max(xs), max(ys)

    print(" The button you used were: %s %s" %
          (eclick.button, erelease.button))
    for corner_x, corner_y in ((x1, y1), (x2, y2)):
        print(' Nx=%d, Ny=%d, x0=%f, y0=%f' % (corner_x, corner_y, x0, y0))

    def to_plane(pixel, centre):
        # Map a pixel coordinate to the plane, using the current view.
        return centre + side * (pixel - n / 2.) / n

    x_1, y_1 = to_plane(x1, x0), to_plane(y1, y0)
    x_2, y_2 = to_plane(x2, x0), to_plane(y2, y0)

    x0 = (x_2 + x_1) / 2.
    y0 = (y_2 + y_1) / 2.
    # Average of the 2 rectangle sides
    side = side * (x2 - x1 + y2 - y1) / n / 2

    mandel(np.float64(x0), np.float64(y0), np.float64(side),
           np.int32(loops), np.int32(power), drv.Out(M),
           block=(n_block, n_block, 1), grid=(n_grid, n_grid, 1))

    my_obj = plt.imshow(
        M,
        origin='lower',
        cmap=cmaps[i_cmap],
        aspect='equal',
    )
    my_obj.set_data(M)

    marker = Rectangle(
        (1 - .1, 1 - .1),
        0.2,
        0.2,
        alpha=1,
        facecolor='none',
        fill=None,
    )
    ax.add_patch(marker)

    title = 'Side=%.2e, x=%.2e, y=%.2e, %s, Loops=%d' % (
        side, x0, y0, cmaps[i_cmap], loops)
    ax.set_title(title)
    plt.draw()
def generate_hat(num_samples):
    """Sample a Mexican-hat wavelet with ``num_samples`` points on the GPU.

    The wavelet uses sigma = 1 and is sampled from -4*sigma in steps of
    ``8*sigma / num_samples``, one thread per sample in a single block.

    Returns:
        ``float32`` array of length ``num_samples``.
    """
    # Notes carried over from the original author:
    #   * the math suggests 16 samples is the width of the QRS complex;
    #   * measuring the QRS complex for 9004 gives 16 samples;
    #   * the measured correlated peak is 7 samples after the QRS start;
    #   * Mexican hats seem to hold a nonzero value between -4 and 4 with
    #     sigma=1.
    sigma = 1.0
    upper = 4 * sigma
    lower = -upper

    hat = numpy.zeros(num_samples).astype(numpy.float32)
    step = (upper - lower) / num_samples

    mexican_hat(
        cuda.Out(hat),
        numpy.float32(sigma),
        numpy.float32(lower),
        numpy.float32(step),
        grid=(1, 1),
        block=(num_samples, 1, 1),
    )
    return hat
def test_simple_kernel(self):
    """Element-wise product from the kernel matches NumPy exactly."""
    kernel_source = """
    __global__ void multiply_them(float *dest, float *a, float *b)
    {
        const int i = threadIdx.x;
        dest[i] = a[i] * b[i];
    }
    """
    multiply_them = SourceModule(kernel_source).get_function("multiply_them")

    lhs = np.random.randn(400).astype(np.float32)
    rhs = np.random.randn(400).astype(np.float32)
    product = np.zeros_like(lhs)

    # One thread per element, in a single block.
    multiply_them(drv.Out(product), drv.In(lhs), drv.In(rhs),
                  block=(400, 1, 1))

    assert la.norm(product - lhs * rhs) == 0
def cuda_interpolate(self, channel, m, size_result):
    """Resample ``channel`` at the coordinates in ``m`` using a 2-D texture.

    Args:
        channel: 2-D image channel; bound to a texture as ``float32`` with
            linear filtering.
        m: Array whose rows 0 and 1 hold the 1-based sampling coordinates;
            1 is subtracted from both before they are sent to the kernel.
        size_result: ``(cols, rows)`` of the output.

    Returns:
        ``(rows, cols)`` ``uint8`` array of interpolated values.
    """
    cols = size_result[0]
    rows = size_result[1]

    kernel_code = """
    texture<float, 2> tex;

    __global__ void interpolation(float *dest, float *m0, float *m1)
    {
        int idx = threadIdx.x + blockDim.x * blockIdx.x;
        int idy = threadIdx.y + blockDim.y * blockIdx.y;

        if (( idx < %(NCOLS)s ) && ( idy < %(NDIM)s )) {
            dest[%(NDIM)s * idx + idy] = tex2D(tex, m0[%(NDIM)s * idy + idx], m1[%(NDIM)s * idy + idx]);
        }
    }
    """
    kernel_code = kernel_code % {'NCOLS': cols, 'NDIM': rows}
    mod = SourceModule(kernel_code)

    interpolation = mod.get_function("interpolation")
    texref = mod.get_texref("tex")

    channel = channel.astype("float32")
    drv.matrix_to_texref(channel, texref, order="F")
    texref.set_filter_mode(drv.filter_mode.LINEAR)

    bdim = (16, 16, 1)
    # The grid is counted in blocks, not threads: ceil(cols / 16) by
    # ceil(rows / 16) blocks already cover every output element.  Threads
    # beyond the image are rejected by the kernel's bounds check.
    gdim = ((cols + bdim[0] - 1) // bdim[0],
            (rows + bdim[1] - 1) // bdim[1])

    dest = np.zeros((rows, cols)).astype("float32")
    m0 = (m[0, :] - 1).astype("float32")
    m1 = (m[1, :] - 1).astype("float32")

    interpolation(drv.Out(dest), drv.In(m0), drv.In(m1),
                  block=bdim, grid=gdim, texrefs=[texref])
    return dest.astype("uint8")
def minimizeFunc(self, x):
    """Return the summed field error for the rotation given by ``x``.

    ``x`` holds xyz Euler angles in degrees.  The selected sensor positions
    are rotated and written to ``self.input``, ``self.interpol`` fills
    ``self.output`` for them, and the error is the sum of the Euclidean
    norms of ``output[i] - rotated(b_target[i])``.

    Returns:
        One-element list containing the total error.
    """
    rotation = R.from_euler('xyz', x, degrees=True)
    count = len(self.selection)

    # Rotated sensor positions are the sample points for the kernel.
    for slot, sensor_index in enumerate(self.selection):
        self.input[slot] = rotation.apply(self.sensor_pos[sensor_index])

    self.interpol(np.int32(count),
                  drv.In(self.input),
                  drv.Out(self.output),
                  texrefs=[self.texref],
                  block=self.bdim,
                  grid=self.gdim)

    b_error = sum(
        np.linalg.norm(self.output[i] - rotation.apply(self.b_target[i]))
        for i in range(count)
    )
    return [b_error]
def Max(input_data, input_N):
    """Repeatedly apply ``reduction_max`` until the target size is reached.

    Every pass launches ``reduction_max`` with 2x2 thread blocks on an
    ``N`` x ``N`` input and receives an ``N//2`` x ``N//2`` ``float32``
    output.  The output is fed back in until its side length equals
    ``data_shape_size``, and that array is returned.
    """
    block_N = 2
    current = input_data
    size = input_N

    while True:
        grid_N = int(size / block_N)
        output = np.zeros((size // 2, size // 2), np.float32)
        reduction_max(drv.In(current), np.int32(size), drv.Out(output),
                      block=(block_N, block_N, 1), grid=(grid_N, grid_N))

        if output.shape[0] == data_shape_size:
            return output

        # Feed this pass's result into the next one.
        current = output
        size = output.shape[0]