def bp(grad, W, gW, gb, h, gradInput=None):
    """Backpropagate through a linear layer, accumulating gradients in place.

    Accumulates the weight gradient (h^T @ grad) into ``gW`` and the bias
    gradient into ``gb``; when ``gradInput`` is supplied, also writes the
    gradient w.r.t. the layer input (grad @ W^T) into it.

    Removed a leftover debug print of the grad/h shapes so this matches the
    sibling ``accum_bp`` helper.
    """
    # Scratch device buffer for the weight-gradient product, same shape as gW.
    z = cuda.device_array_like(gW)
    mmprod(h, grad, z, transa='T')   # z = h^T @ grad
    mmadd(gW, z, gW)                 # gW += z
    # NOTE(review): assumes mmadd handles the bias accumulation for grad's
    # shape (row reduction or broadcast) — confirm mmadd semantics.
    mmadd(gb, grad, gb)
    if gradInput is not None:
        mmprod(grad, W, gradInput, transb='T')  # gradInput = grad @ W^T
def mcopy(a):
    """Return a device-side copy of the 2-D device array ``a``.

    Launches the ``d_mcopy`` kernel with up to 32x32 threads per block and a
    grid that ceil-covers the array.
    """
    blockDim = (min(32, a.shape[0]), min(32, a.shape[1]))
    # BUG FIX: use integer (floor) division — CUDA grid dimensions must be
    # ints, and plain `/` yields a float under Python 3 / future division.
    gridDim = ((a.shape[0] + blockDim[0] - 1) // blockDim[0],
               (a.shape[1] + blockDim[1] - 1) // blockDim[1])
    b = cuda.device_array_like(a)
    d_mcopy[gridDim, blockDim](a, b)
    return b
def msum(a):
    """Reduce the 2-D device array ``a`` via repeated ``d_msum`` launches.

    Performs a first pass into a scratch buffer ``db``, then keeps reducing
    in place until the grid collapses to a single block column.  Returns the
    device buffer holding the (partially reduced) result.
    """
    blockDim = (min(32, a.shape[0]), min(32, a.shape[1]))
    # BUG FIX: integer (floor) division for grid dimensions — `/` produces a
    # float under Python 3 / future division, which is not a valid grid dim.
    gridDim = ((a.shape[0] + blockDim[0] - 1) // blockDim[0],
               (a.shape[1] + blockDim[1] - 1) // blockDim[1])
    db = cuda.device_array_like(a)
    print(blockDim, gridDim)
    d_msum[gridDim, blockDim](a, db)
    # Each pass shrinks the second grid dimension; stop once one column of
    # blocks remains.
    while gridDim[1] > 1:
        blockDim = (min(32, a.shape[0]), min(32, gridDim[1]))
        gridDim = ((a.shape[0] + blockDim[0] - 1) // blockDim[0],
                   (gridDim[1] + blockDim[1] - 1) // blockDim[1])
        print(blockDim, gridDim)
        d_msum[gridDim, blockDim](db, db)
    return db
def block_increment(start, n):
    """Compute one (n, n) float32 result block on the GPU and return it on the host.

    Draws two standard-normal device vectors, divides them element-wise with
    ``cuda_div``, then feeds the ratios plus an (n*n) uniform draw into
    ``block_kernel`` — all queued on a single stream.
    """
    cuda.select_device(0)
    stream = cuda.stream()
    blockdim = 256
    griddim = n // 256 + 1

    out_host = np.zeros((n, n), dtype=np.float32)

    # Two standard-normal device vectors feeding the element-wise division.
    numer_dev = curand.normal(0, 1, n, dtype=np.float32, device=True)
    denom_dev = curand.normal(0, 1, n, dtype=np.float32, device=True)

    ratio_host = np.zeros(n, dtype=np.float32)
    ratio_dev = cuda.device_array_like(ratio_host)
    cuda_div[griddim, blockdim, stream](numer_dev, denom_dev, ratio_dev, n)
    # ratio_dev stays resident on the device for the next kernel; the normal
    # draws are presumably deallocated when Python's GC drops them — no
    # host-side access to them happens past this point.

    unif_dev = curand.uniform((n * n), dtype=np.float32, device=True)
    out_dev = cuda.device_array_like(out_host, stream)
    block_kernel[griddim, blockdim, stream](start, n, ratio_dev, unif_dev, out_dev)
    out_dev.copy_to_host(out_host, stream)
    stream.synchronize()
    return out_host
def generate_omegas(self):
    """Run the omega kernel, copy the results into ``self.host_omegas``,
    and return the device-side output buffer."""
    out_dev = cuda.device_array_like(self.host_omegas)
    # Launch over the instance's configured grid/block dimensions.
    self.omega[self.grid_dim, self.threads_per_block](
        self.device_values, out_dev, self.size)
    out_dev.copy_to_host(self.host_omegas)
    return out_dev
def compute_block(self):
    """Compute one (size, size) result block on the GPU and return it as a host array.

    Draws one uniform variate per output cell, runs ``cu_one_block`` over the
    proposals/omegas on a dedicated stream, and synchronously copies the
    results back.
    """
    # BUG FIX: the uniform draw previously used the module-level ``N`` while
    # every other dimension in this method uses ``self.size``; one variate is
    # needed per cell of the (size, size) result consumed by cu_one_block.
    device_uniforms = curand.uniform(size=self.size * self.size, device=True)
    host_results = zeros((self.size, self.size))
    stream = cuda.stream()
    device_proposals = cuda.to_device(self.host_proposals, stream=stream)
    device_omegas = cuda.to_device(self.host_omegas, stream=stream)
    device_results = cuda.device_array_like(host_results, stream=stream)
    cu_one_block[self.grid_dim, self.threads_per_block, stream](
        self.start, device_proposals, device_omegas, device_uniforms,
        device_results, self.size, self.size)
    device_results.copy_to_host(host_results, stream=stream)
    stream.synchronize()
    return host_results
def generate_proposals(self):
    """Fill ``self.host_values`` with element-wise ratios of two
    standard-normal draws and return the device-side output buffer."""
    # Two independent standard-normal device vectors.
    numer_dev = curand.normal(0, 1, self.size, device=True)
    denom_dev = curand.normal(0, 1, self.size, device=True)
    out_dev = cuda.device_array_like(self.host_values)
    # External division kernel: out_dev[i] = numer_dev[i] / denom_dev[i].
    self.division[self.grid_dim, self.threads_per_block](
        numer_dev, denom_dev, out_dev, self.size)
    out_dev.copy_to_host(self.host_values)
    return out_dev
def mc_cuda(paths, dt, interest, volatility):
    """Fill ``paths[:, 1:]`` column by column with GPU-stepped Monte Carlo paths.

    ``paths[:, 0]`` holds the initial prices; each iteration draws fresh
    gaussian noise with cuRAND and advances every path one time step via the
    ``step_cuda`` kernel, queued asynchronously on one stream.
    """
    n = paths.shape[0]
    blksz = cuda.get_current_device().MAX_THREADS_PER_BLOCK
    # Ceil-divide so the grid covers all n paths.
    gridsz = int(math.ceil(float(n) / blksz))
    # instantiate a CUDA stream for queueing async CUDA cmds
    stream = cuda.stream()
    # instantiate a cuRAND PRNG
    prng = curand.PRNG(curand.PRNG.MRG32K3A)
    # Allocate device side array (reused for every column's noise draw).
    d_normdist = cuda.device_array(n, dtype=np.double, stream=stream)
    # Step-equation constants hoisted out of the loop.
    c0 = interest - 0.5 * volatility ** 2
    c1 = volatility * math.sqrt(dt)
    # configure the kernel
    # similar to CUDA-C: step_cuda<<<gridsz, blksz, 0, stream>>>
    step_cfg = step_cuda[gridsz, blksz, stream]
    # transfer the initial prices
    d_last = cuda.to_device(paths[:, 0], stream=stream)
    for j in range(1, paths.shape[1]):
        # call cuRAND to populate d_normdist with gaussian noises
        prng.normal(d_normdist, mean=0, sigma=1)
        # setup memory for new prices
        # device_array_like is like empty_like for GPU
        d_paths = cuda.device_array_like(paths[:, j], stream=stream)
        # invoke step kernel asynchronously
        step_cfg(d_last, d_paths, dt, c0, c1, d_normdist)
        # transfer memory back to the host
        d_paths.copy_to_host(paths[:, j], stream=stream)
        d_last = d_paths
    # wait for all GPU work to complete
    stream.synchronize()
def task2(): a = numpy.float32(2.) # Force value to be float32 x = numpy.arange(NELEM, dtype='float32') y = numpy.arange(NELEM, dtype='float32') ### Task2 ### # a) Complete the memory transfer for x -> dx, y -> dy # b) Allocate device memory for dout # c) Transfer for out <- dout dx = cuda.to_device(x) dy = cuda.to_device(y) dout = cuda.device_array_like(x) griddim = NUM_BLOCKS blockdim = NUM_THREADS saxpy[griddim, blockdim](a, dx, dy, dout) out = dout.copy_to_host() print "out =", out if numpy.allclose(a * x + y, out): print "Correct result" else: print "Incorrect result"
from numbapro import cuda, float32, void import numpy import time @cuda.jit(void(float32[:], float32[:], float32[:])) def sumarrays(a, b, c): i = cuda.grid(1) # Short for cuda.threadIdx.x + cuda.blockIdx.x * cuda.blockDim.x if i < c.size: c[i] = a[i] + b[i] n = 16*1024*1024 a = numpy.arange(n, dtype='float32') b = a*2 start_time = time.time() da = cuda.to_device(a) db = cuda.to_device(b) dc = cuda.device_array_like(a) size_block = 1024 size_grid = int((n-1)/size_block + 1) sumarrays[size_grid, size_block](da, db, dc) c = dc.copy_to_host() print "Time elapsed: ", time.time() - start_time, "s"
# Initialise centroids, then pinned (page-locked) host buffers for the
# label/distance results so the device->host copies are fast.
generator.centroids = generator._init_centroids(data)
generator.labels = cuda.pinned_array(shape=generator.N, dtype=np.int32)
generator._dists = cuda.pinned_array(shape=generator.N, dtype=np.float32)

# Derive kernel launch dimensions from the data size.
generator._compute_cuda_dims(data)
gridDim = generator._gridDim
blockDim = generator._blockDim
print "grid: ", gridDim
print "block: ", blockDim

# Host -> device transfers plus uninitialised device output buffers.
dData = cuda.to_device(data)
dCentroids = cuda.to_device(generator.centroids)
dLabels = cuda.device_array_like(generator.labels)
dDists = cuda.device_array_like(generator._dists)

# Time the labelling kernel with CUDA events.
startE = cuda.event()
endE = cuda.event()
startE.record()
_cu_label_kernel_dists[gridDim, blockDim](dData, dCentroids, dLabels, dDists)
endE.record()
endE.synchronize()
print cuda.event_elapsed_time(startE, endE)

# Time the device -> host copies of the results (note: endE is recorded but
# not synchronized here, presumably read later — confirm downstream).
startE.record()
dDists.copy_to_host(ary=generator._dists)
labels = dLabels.copy_to_host(ary=generator.labels)
endE.record()
from timeit import default_timer as timer
import numpy as np
from numbapro import vectorize, float32, cuda

# 10M-element float32 source and an uninitialised destination of the same shape.
src = np.arange(10 ** 7, dtype=np.float32)
dst = np.empty_like(src)


@vectorize([float32(float32)], target='gpu')
def copy_kernel(src):
    # Identity ufunc — exists purely to drive a device-side copy.
    return src


# Regular memory transfer
ts = timer()
d_src = cuda.to_device(src)
d_dst = cuda.device_array_like(dst)
copy_kernel(d_src, out=d_dst)
d_dst.copy_to_host(dst)
te = timer()
print 'regular', te - ts

# Drop the device buffers before the pinned-memory run, then verify the copy.
del d_src, d_dst
assert np.allclose(dst, src)

# Pinned (pagelocked) memory transfer
with cuda.pinned(src, dst):
# 16x16 threads per block; enough blocks to ceil-cover the (n_theta, n_phi) grid.
nThreads = (16, 16)
nBlocks = (ceil(n_theta / nThreads[0]), ceil(n_phi / nThreads[1]))
print(nBlocks)

date = "2015_6_22_15_33_43"
xml_path = "D:\\image_software\\results\\GMEMtracking3D_" + date + "\\XML_finalResult_lht_bckgRm\\GMEMfinalResult_frame????.xml"

# CUDA call of the image
s = timer()
d_theta = cuda.to_device(all_theta)
d_phi = cuda.to_device(all_phi)
d_image = cuda.to_device(image)
d_r = cuda.to_device(r)
d_center = cuda.to_device(center)
# Output buffers — presumably the per-ray maximum value and the radius at
# which it occurs (inferred from names; confirm against calcRay_CUDA).
d_max_val = cuda.device_array_like(all_max_val)
d_r_max = cuda.device_array_like(r_of_maxval)
calcRay_CUDA[nBlocks, nThreads](d_theta, d_phi, d_image, d_r, d_center, d_max_val, d_r_max)
d_max_val.copy_to_host(all_max_val)
d_r_max.copy_to_host(r_of_maxval)
e = timer()
print(e - s)

n_time = 10
pos = readXML(xml_path, n_time)

# Calculate the points coordinates
pos_arr = np.asarray(pos[0][0:3])
n_cells = pos_arr.shape[1]
angles = np.zeros((3, n_cells))
for cell in range(n_cells):
def main(): device = cuda.get_current_device() maxtpb = device.MAX_THREADS_PER_BLOCK warpsize = device.WARP_SIZE # benchmark loop vary_warpsize = [] baseline = [] ilpx2 = [] ilpx4 = [] ilpx8 = [] # For OSX 10.8 where the GPU is used for graphic as well, # increasing the following to 10 * 2 ** 20 seems to be necessary to # produce consistent result. approx_data_size = 1.5 * 2**20 for multiplier in range(1, maxtpb // warpsize + 1): blksz = warpsize * multiplier gridsz = ceil_to_nearest(float(approx_data_size) / blksz, 8) print 'kernel config [%d, %d]' % (gridsz, blksz) N = blksz * gridsz A = np.arange(N, dtype=np.float32) B = np.arange(N, dtype=np.float32) print 'data size %dMB' % (N / 2.**20 * A.dtype.itemsize) dA = cuda.to_device(A) dB = cuda.to_device(B) assert float(N) / blksz == gridsz, (float(N) / blksz, gridsz) vary_warpsize.append(blksz) dC = cuda.device_array_like(A) basetime = time_this(vec_add, gridsz, blksz, (dA, dB, dC)) expected_result = dC.copy_to_host() if basetime > 0: baseline.append(N / basetime) dC = cuda.device_array_like(A) x2time = time_this(vec_add_ilp_x2, gridsz // 2, blksz, (dA, dB, dC)) assert np.allclose(expected_result, dC.copy_to_host()) if x2time > 0: ilpx2.append(N / x2time) dC = cuda.device_array_like(A) x4time = time_this(vec_add_ilp_x4, gridsz // 4, blksz, (dA, dB, dC)) assert np.allclose(expected_result, dC.copy_to_host()) if x4time > 0: ilpx4.append(N / x4time) dC = cuda.device_array_like(A) x8time = time_this(vec_add_ilp_x8, gridsz // 8, blksz, (dA, dB, dC)) assert np.allclose(expected_result, dC.copy_to_host()) if x8time > 0: ilpx8.append(N / x8time) pylab.plot(vary_warpsize[:len(baseline)], baseline, label='baseline') pylab.plot(vary_warpsize[:len(ilpx2)], ilpx2, label='ILP2') pylab.plot(vary_warpsize[:len(ilpx4)], ilpx4, label='ILP4') pylab.plot(vary_warpsize[:len(ilpx8)], ilpx8, label='ILP8') pylab.legend(loc=4) pylab.title(cuda.get_current_device().name) pylab.xlabel('block size') pylab.ylabel('float per second') pylab.show()
# One float32 element per thread across the whole launch.
aryA = np.arange(BLOCKSIZE * BLOCKCOUNT, dtype=np.float32)
print 'data size: %.1fMB' % (aryA.size * aryA.dtype.itemsize / (2**20))

# CUDA events bracketing the whole transfer+kernel sequence and the kernel
# launch alone; wall-clock timers taken alongside for comparison.
evt_total_begin = cuda.event()
evt_total_end = cuda.event()
evt_kernel_begin = cuda.event()
evt_kernel_end = cuda.event()

t_total_begin = timer()
evt_total_begin.record()

# explicitly transfer memory host -> device, plus an uninitialised output buffer
d_aryA = cuda.to_device(aryA)
d_aryB = cuda.device_array_like(aryA)

evt_kernel_begin.record()
t_kernel_begin = timer()
cu_copy_array[BLOCKCOUNT, BLOCKSIZE](d_aryB, d_aryA)
t_kernel_end = timer()
evt_kernel_end.record()

# Copy the result back, then wait for all queued GPU work to finish.
aryB = d_aryB.copy_to_host()
evt_total_end.record()
evt_total_end.synchronize()
t_total_end = timer()
def accum_bp(grad, gW, gb, h):
    """Accumulate layer gradients in place: gW += h^T @ grad, gb += grad."""
    # Scratch device buffer for the matrix product, same shape as gW.
    scratch = cuda.device_array_like(gW)
    mmprod(h, grad, scratch, transa='T')  # scratch = h^T @ grad
    mmadd(gW, scratch, gW)                # gW += scratch
    mmadd(gb, grad, gb)                   # gb += grad
def main(): device = cuda.get_current_device() maxtpb = device.MAX_THREADS_PER_BLOCK warpsize = device.WARP_SIZE # benchmark loop vary_warpsize = [] baseline = [] ilpx2 = [] ilpx4 = [] ilpx8 = [] # For OSX 10.8 where the GPU is used for graphic as well, # increasing the following to 10 * 2 ** 20 seems to be necessary to # produce consistent result. approx_data_size = 1.5 * 2**20 for multiplier in range(1, maxtpb // warpsize + 1): blksz = warpsize * multiplier gridsz = ceil_to_nearest(float(approx_data_size) / blksz, 8) print 'kernel config [%d, %d]' % (gridsz, blksz) N = blksz * gridsz A = np.arange(N, dtype=np.float32) B = np.arange(N, dtype=np.float32) print 'data size %dMB' % (N / 2.**20 * A.dtype.itemsize) dA = cuda.to_device(A) dB = cuda.to_device(B) assert float(N) / blksz == gridsz, (float(N) / blksz, gridsz) vary_warpsize.append(blksz) dC = cuda.device_array_like(A) basetime = time_this(vec_add, gridsz, blksz, (dA, dB, dC)) expected_result = dC.copy_to_host() if basetime > 0: baseline.append(N / basetime) dC = cuda.device_array_like(A) x2time = time_this(vec_add_ilp_x2, gridsz//2, blksz, (dA, dB, dC)) assert np.allclose(expected_result, dC.copy_to_host()) if x2time > 0: ilpx2.append(N / x2time) dC = cuda.device_array_like(A) x4time = time_this(vec_add_ilp_x4, gridsz//4, blksz, (dA, dB, dC)) assert np.allclose(expected_result, dC.copy_to_host()) if x4time > 0: ilpx4.append(N / x4time) dC = cuda.device_array_like(A) x8time = time_this(vec_add_ilp_x8, gridsz//8, blksz, (dA, dB, dC)) assert np.allclose(expected_result, dC.copy_to_host()) if x8time > 0: ilpx8.append(N / x8time) pylab.plot(vary_warpsize[:len(baseline)], baseline, label='baseline') pylab.plot(vary_warpsize[:len(ilpx2)], ilpx2, label='ILP2') pylab.plot(vary_warpsize[:len(ilpx4)], ilpx4, label='ILP4') pylab.plot(vary_warpsize[:len(ilpx8)], ilpx8, label='ILP8') pylab.legend(loc=4) pylab.title(cuda.get_current_device().name) pylab.xlabel('block size') pylab.ylabel('float per second') pylab.show()
from numbapro import vectorize, float32, cuda

# 10M-element float32 source and an uninitialised destination of the same shape.
src = np.arange(10**7, dtype=np.float32)
dst = np.empty_like(src)


@vectorize([float32(float32)], target='gpu')
def copy_kernel(src):
    # Identity ufunc — exists purely to drive a device-side copy.
    return src


# Regular memory transfer
ts = timer()
d_src = cuda.to_device(src)
d_dst = cuda.device_array_like(dst)
copy_kernel(d_src, out=d_dst)
d_dst.copy_to_host(dst)
te = timer()
print 'regular', te - ts

# Drop the device buffers before the pinned-memory run, then verify the copy.
del d_src, d_dst
assert np.allclose(dst, src)

# Pinned (pagelocked) memory transfer
with cuda.pinned(src, dst):