def start(A_matrix): A = A_matrix.flatten() L = np.zeros_like(A) U = np.zeros_like(A) rows = len(A_matrix) columns = len(A_matrix) tpb = 32 n = rows * columns with cuda.pinned(A, L, U): stream = cuda.stream() gpu_A = cuda.to_device(A, stream=stream) gpu_L = cuda.to_device(L, stream=stream) gpu_U = cuda.to_device(U, stream=stream) bpg = n + (tpb - 1) // tpb crout[(bpg, bpg), (tpb, tpb)](gpu_A, gpu_L, gpu_U, rows) gpu_L.copy_to_host(L, stream) gpu_U.copy_to_host(U, stream) L = L.reshape(rows, columns) U = U.reshape(rows, columns) print(L) print(U) print(np.matmul(L, U))
def test_generate(self): from pyculib.rand.binding import (Generator, CURAND_RNG_QUASI_SOBOL64, CURAND_RNG_QUASI_DEFAULT) N = 10 stream = cuda.stream() ary32 = np.zeros(N, dtype=np.uint32) devary32 = cuda.to_device(ary32, stream=stream) rndgen = Generator(CURAND_RNG_QUASI_DEFAULT) rndgen.set_stream(stream) rndgen.set_offset(123) rndgen.set_quasi_random_generator_dimensions(1) rndgen.generate(devary32, N) devary32.copy_to_host(ary32, stream=stream) stream.synchronize() self.assertTrue(any(ary32 != 0)) ary64 = np.zeros(N, dtype=np.uint64) devary64 = cuda.to_device(ary64, stream=stream) rndgen = Generator(CURAND_RNG_QUASI_SOBOL64) rndgen.set_stream(stream) rndgen.set_offset(123) rndgen.set_quasi_random_generator_dimensions(1) rndgen.generate(devary64, N) devary64.copy_to_host(ary64, stream=stream) stream.synchronize() self.assertTrue(any(ary64 != 0))
def move(self, init, stream=None): if len(init[0].shape) == 2: self.threadim = (16, 16) _f = self._f2 elif len(init[0].shape) == 3: self.threadim = (8, 8, 4) _f = self._f3 BPG = np.array(init[0].shape) / np.array(self.threadim) self.gridim = tuple(BPG.astype(np.int)) if stream is None: self.stream = cuda.stream() else: self.stream = stream if not cuda.devicearray.is_cuda_ndarray(init[0]): self.dA = cuda.to_device(init[0], stream=self.stream) self.dB = cuda.to_device(init[1], stream=self.stream) self.dC = cuda.to_device(init[2], stream=self.stream) self.stream.synchronize() else: self.dA = init[0] self.dB = init[1] self.dC = init[2] _f[self.gridim, self.threadim, self.stream](self.dA, self.dB, self.dC) self.stream.synchronize()
def test_11(): from timeit import default_timer as time n = 7 * 32 m = n // CMP_RATIO if n % CMP_RATIO != 0: m += 1 thrd = 0.5 A = np.array(np.random.random((n, n)), dtype=np.float32) - thrd * 2 R = np.empty_like(A) # for residual C = np.array(np.zeros((m, n)), dtype=np.uint32) stream = cuda.stream() dA = cuda.to_device(A, stream) dR = cuda.to_device(R, stream) dC = cuda.to_device(C, stream) tm = time() cu_fn_matrix_quantize_e(dA, dC, dR, thrd) print('quantize: ', time() - tm) B = np.empty_like(A) # for dequantize dB = cuda.to_device(B, stream) tm = time() cu_fn_matrix_dequantize_e(dB, dC, dR, thrd) print('dequantize: ', time() - tm)
def test_create_stream(self): stream = cuda.stream() states = cuda.random.create_xoroshiro128p_states(10, seed=1, stream=stream) s = states.copy_to_host() self.assertEqual(len(np.unique(s)), 10)
def test_func(self): A = np.array(np.random.random((n, n)), dtype=np.float32) B = np.array(np.random.random((n, n)), dtype=np.float32) C = np.empty_like(A) print("N = %d x %d" % (n, n)) s = time() stream = cuda.stream() with stream.auto_synchronize(): dA = cuda.to_device(A, stream) dB = cuda.to_device(B, stream) dC = cuda.to_device(C, stream) cu_square_matrix_mul[(bpg, bpg), (tpb, tpb), stream](dA, dB, dC) dC.copy_to_host(C, stream) e = time() tcuda = e - s # Host compute Amat = np.matrix(A) Bmat = np.matrix(B) s = time() Cans = Amat * Bmat e = time() tcpu = e - s print('cpu: %f' % tcpu) print('cuda: %f' % tcuda) print('cuda speedup: %.2fx' % (tcpu / tcuda)) # Check result self.assertTrue(np.allclose(C, Cans))
def compute_region_PSNR(filename, in_dir): n = np.array([16]) # Max length of interpolated pixels stream = cuda.stream() # Read the original image original = cv2.imread(join(in_dir, filename)) original = cv2.split(cv2.cvtColor(original, cv2.COLOR_BGR2YCR_CB))[0] original = original.astype(np.float64) dim = np.array([original.shape[1], original.shape[0]]) template = np.zeros_like(original) template = np.repeat(template[:, :, np.newaxis], n, axis=2) # Create images in GPU d_original = cuda.to_device(np.ascontiguousarray(original), stream=stream) d_n = cuda.to_device(np.ascontiguousarray(n), stream=stream) d_dim = cuda.to_device(np.ascontiguousarray(dim), stream=stream) # Intepolate image d_interpolated = cuda.to_device(np.ascontiguousarray(template), stream=stream) interpolate_image[512, 512](d_original, d_n, d_dim, d_interpolated) # Compute MSE of a pixel in a 3x3 region d_mse_region = cuda.to_device(np.ascontiguousarray(template), stream=stream) region_MSE[512, 512](d_original, d_interpolated, d_n, d_dim, d_mse_region) region_PSNR[512, 512](d_mse_region, d_n, d_dim) psnr_region = d_mse_region.copy_to_host() return psnr_region
def test_gufunc_stream(self): gufunc = _get_matmulcore_gufunc(max_blocksize=512) #cuda.driver.flush_pending_free() matrix_ct = 1001 # an odd number to test thread/block division in CUDA A = np.arange(matrix_ct * 2 * 4, dtype=np.float32).reshape(matrix_ct, 2, 4) B = np.arange(matrix_ct * 4 * 5, dtype=np.float32).reshape(matrix_ct, 4, 5) stream = cuda.stream() dA = cuda.to_device(A, stream) dB = cuda.to_device(B, stream) dC = cuda.device_array(shape=(1001, 2, 5), dtype=A.dtype, stream=stream) dC = gufunc(dA, dB, out=dC, stream=stream) C = dC.copy_to_host(stream=stream) stream.synchronize() Gold = ut.matrix_multiply(A, B) self.assertTrue(np.allclose(C, Gold))
def GPUWrapper(data_out, device_id, photons_req_per_device, max_photons_per_device, muA, muS, g, source_type, source_param1, source_param2, detector_params, max_N, max_distance_from_det, target_type, target_mask, target_gridsize, z_target, z_bounded, z_range, ret_cols, absorb_threshold, absorb_chance): # TODO: These numbers can be optimized based on the device / architecture / number of photons threads_per_block = 256 blocks = 64 photons_per_thread = int( np.ceil(float(photons_req_per_device) / (threads_per_block * blocks))) max_photons_per_thread = int( np.ceil(float(max_photons_per_device) / (threads_per_block * blocks))) cuda.select_device(device_id) stream = cuda.stream() # use stream to trigger async memory transfer # Keeping this piece of code here for now -potentially we need this in the future # with compiler_lock: # lock the compiler # prepare function for this thread # the jitted CUDA kernel is loaded into the current context # TODO: ideally we should call cuda.jit(signature)(propPhotonGPU), where # signature is the call to the function. So far I couldn't figure out what is the signature of the # rng_states, closest I got to was: array(Record([('s0', '<u8'), ('s1', '<u8')]), 1d, A) # But I couldn't get it to work yet. # MC_cuda_kernel = cuda.jit(propPhotonGPU) data = np.ndarray(shape=(threads_per_block * blocks, photons_per_thread, 12), dtype=np.float32) photon_counters = np.ndarray(shape=(threads_per_block * blocks, 5), dtype=np.int) data_out_device = cuda.device_array_like(data, stream=stream) photon_counters_device = cuda.device_array_like(photon_counters, stream=stream) # Used to initialize the threads random states. rng_states = create_xoroshiro128p_states( threads_per_block * blocks, seed=(np.random.randint(sys.maxsize) - 128) + device_id, stream=stream) # Actual kernel call propPhotonGPU[blocks, threads_per_block]( rng_states, data_out_device, photon_counters_device, photons_per_thread, max_photons_per_thread, muA, muS, g, source_type, source_param1, source_param2, detector_params, max_N, max_distance_from_det, target_type, target_mask, target_gridsize, z_target, z_bounded, z_range, absorb_threshold, absorb_chance) # Copy data back data_out_device.copy_to_host(data, stream=stream) photon_counters_device.copy_to_host(photon_counters, stream=stream) stream.synchronize() data = data.reshape(data.shape[0] * data.shape[1], data.shape[2]) data = data[:, ret_cols] data_out[device_id][0] = data photon_counters_aggr = np.squeeze(np.sum(photon_counters, axis=0)) data_out[device_id][1] = photon_counters_aggr
def setup(self): self.stream = cuda.stream() self.small_data = np.zeros(512, dtype=np.float64) self.large_data = np.zeros(512 * 1024, dtype=np.float64) self.d_small_data = cuda.to_device(self.small_data, self.stream) self.d_large_data = cuda.to_device(self.large_data, self.stream) self.stream.synchronize()
async def test_multiple_async_done_multiple_streams(self): streams = [cuda.stream() for _ in range(4)] done_aws = [stream.async_done() for stream in streams] done = await asyncio.gather(*done_aws) # Ensure we got the four original streams in done self.assertSetEqual(set(done), set(streams))
def laplace_3d_cuda(d, n): L = d.shape[0] M = d.shape[1] N = d.shape[2] blockdim = (8, 8, 8) griddim = (L // blockdim[0], M // blockdim[1], N // blockdim[2]) #print(griddim) stream = cuda.stream() dd = cuda.to_device(d, stream) dn = cuda.to_device(n, stream) #%timeit -n 32 -r 16 d0td1_cuda_kernel[griddim, blockdim](dd, dn) for i in range(0, 100): laplace_3d_cuda_opt_kernel[griddim, blockdim, stream](dd, dn, L, M, N) evtstart = cuda.event(timing=True) evtend = cuda.event(timing=True) evtstart.record(stream) for i in range(100): laplace_3d_cuda_opt_kernel[griddim, blockdim, stream](dd, dn, L, M, N) evtend.record(stream) evtend.synchronize() print(cuda.event_elapsed_time(evtstart, evtend) / 100.) dd.to_host()
def __init__(self, configModule): self.imagesRGB = [] self.imagesTH = [] self.imagesRGBCuda = [] self.imagesTHCuda = [] self._imagesRGBPath = None self._imagesTHPath = None self.myHeatMapLookup = self.createHeatMapLookUp() self.CudaEnabled = cudaImport self.configurations = configModule.Configuration self.outArr = np.empty(shape=(int(self.configurations.rows) * int(self.configurations.cols) * int(self.configurations.dims)), dtype=np.uint8) self.refreshConfigModule = False if self.CudaEnabled: self.streamCuda = cuda.stream() self.myHeatMapLookupCuda = cuda.to_device(self.myHeatMapLookup, self.streamCuda) try: self.imagesRGBPath = self.configurations.datasetDemoNormalPath self.imagesTHPath = self.configurations.datasetDemoThermalPath except BaseException as e: print( "Dataset Yükleme hata oluştu. Konfigürasyon Dosyasını kontrol edin.{}", str(e))
def start(self, A_matrix, b_vector): """Launches parallel Gauss Jordan elimination for a SLAE and returns its answer. @param A_matrix Coefficient matrix of a SLAE. @param b_vector Linearly independent vector of a SLAE. @return float64[:] """ if 0 in A_matrix.diagonal(): return None b = b_vector.reshape(len(b_vector), 1) A = np.hstack((A_matrix, b)) A = A.flatten() n = len(b) with cuda.pinned(A): stream = cuda.stream() gpu_A = cuda.to_device(A, stream=stream) bpg = 1 for i in range(0, n): self.gauss_jordan[(bpg, bpg), (tpb, tpb)](gpu_A, n, i) self.normalize[(bpg, bpg), (tpb, tpb)](gpu_A, n) gpu_A.copy_to_host(A, stream) x = A.reshape(n, (n + 1))[:, n] if True in np.isnan(x) or True in np.isinf(x): return None else: return x
def test(ty): print("Test %s" % ty) data = np.array(np.random.random(1e+6 + 1), dtype=ty) ts = time() stream = cuda.stream() device_data = cuda.to_device(data, stream) dresult = cuda_ufunc(device_data, device_data, stream=stream) result = dresult.copy_to_host() stream.synchronize() tnumba = time() - ts ts = time() gold = np_ufunc(data, data) tnumpy = time() - ts print("Numpy time: %fs" % tnumpy) print("Numba time: %fs" % tnumba) if tnumba < tnumpy: print("Numba is FASTER by %fx" % (tnumpy / tnumba)) else: print("Numba is SLOWER by %fx" % (tnumba / tnumpy)) self.assertTrue(np.allclose(gold, result), (gold, result))
async def test_cancelled_future(self): stream = cuda.stream() done1, done2 = stream.async_done(), stream.async_done() done1.cancel() await done2 self.assertTrue(done1.cancelled()) self.assertTrue(done2.done())
def dist_matrix(X, p): """Calculate pairwise distance matrix using minkowski distance and returns in longform Args: X (np.ndarray): matrix with rows samples and columns features p (float): exponent for Minkowski distance Returns: [np.ndarray]: matrix of pairwise distances in longform """ rows = X.shape[0] block_dim = (16, 16) grid_dim = (int(rows / block_dim[0] + 1), int(rows / block_dim[1] + 1)) stream = cuda.stream() X = cuda.to_device(np.asarray(X, dtype=np_type), stream=stream) out2 = cuda.device_array(rows * (rows - 1) // 2, dtype=np_type) if p == 2: # speed up performance by calling special function when p == 2. #tick = time.perf_counter() euclidean_pairs_gpu[grid_dim, block_dim](X, out2) #print('euc pairs gpu', time.perf_counter() - tick) else: distance_matrix_gpu[grid_dim, block_dim](X, p, out2) out = out2.copy_to_host(stream=stream) return out
def __init__(self, x_shape, n_samples, logger: logging.Logger = None): if logger is None: logger = logging.Logger('AutoMemoryBuilder') logger.addHandler(logging.NullHandler()) self._logger = logger.getChild('AutoMemoryBuilder') self._logger.debug('Constructor') if len(x_shape) == 1: self._x_size = x_shape[0] else: assert len(x_shape) == 2 self._x_size = x_shape[0] * x_shape[1] assert self._x_size >= n_samples self._n_samples = n_samples self._xmem_shape = (n_samples, self._x_size) self._i = 0 self._logger.info('Allocating pinned array: {0} bytes'.format( n_samples * self._x_size)) self._pinned_mem = cuda.pinned_array(self._xmem_shape) self._stream = cuda.stream()
def recall(self, x, how): if not isinstance(x, np.ndarray): x = np.array(x) assert np.size(x) == self._x_size orig_shape = x.shape x = x.reshape((self._x_size, 1)) out = np.empty_like(x) if how == 'm': mem = self._dev_mem_m op = mnn.mat_morph_mul_min_plus elif how == 'w': mem = self._dev_mem_w op = mnn.mat_morph_mul_max_plus else: raise ValueError('Expected how=\'m\' or how=\'w\'') stream = cuda.stream() dev_x = cuda.to_device(x, stream=stream) out_x = cuda.device_array_like(out, stream=stream) op(mem, x, out_x, stream=stream) out_x.copy_to_host(out, stream=stream) stream.synchronize() return out.T.reshape(orig_shape)
def test_div_up(self): # wrapper kernel for device function that is tested @cuda.jit def _kernel(x, y): x_pos = cuda.grid(1) if x_pos < x.shape[0] and x_pos < y.shape[0]: x[x_pos] = rnnt_helper.div_up(x[x_pos], y[x_pos]) x = np.full([8], fill_value=10) # np.random.rand(8192) y = np.full([8], fill_value=2) # np.random.rand(8192) stream = cuda.stream() x_c = cuda.to_device(x, stream=stream) y_c = cuda.to_device(y, stream=stream) # call kernel threads_per_block = global_constants.threads_per_block() blocks_per_grid = (x.shape[0] + threads_per_block - 1) // threads_per_block _kernel[blocks_per_grid, threads_per_block, stream](x_c, y_c) # sync kernel stream.synchronize() x_new = x_c.copy_to_host(stream=stream) del x_c, y_c for i in range(len(x_new)): assert x_new[i] == ((10 + 2 - 1) // 2)
def reduce(self, arg, stream=0): assert len(list(self.functions.keys())[0]) == 2, "must be a binary " \ "ufunc" assert arg.ndim == 1, "must use 1d array" n = arg.shape[0] gpu_mems = [] if n == 0: raise TypeError("Reduction on an empty array.") elif n == 1: # nothing to do return arg[0] # always use a stream stream = stream or cuda.stream() with stream.auto_synchronize(): # transfer memory to device if necessary if devicearray.is_cuda_ndarray(arg): mem = arg else: mem = cuda.to_device(arg, stream) # do reduction out = self.__reduce(mem, gpu_mems, stream) # use a small buffer to store the result element buf = np.array((1,), dtype=arg.dtype) out.copy_to_host(buf, stream=stream) return buf[0]
def test_log_sum_exp_neg_inf(self): # wrapper kernel for device function that is tested @cuda.jit def _kernel(x, y): x_pos = cuda.grid(1) if x_pos < x.shape[0] and x_pos < y.shape[0]: x[x_pos] = rnnt_helper.log_sum_exp(x[x_pos], y[x_pos]) x = np.asarray([global_constants.FP32_NEG_INF] * 8) y = np.ones([len(x)]) stream = cuda.stream() x_c = cuda.to_device(x, stream=stream) y_c = cuda.to_device(y, stream=stream) # call kernel threads_per_block = global_constants.threads_per_block() blocks_per_grid = (x.shape[0] + threads_per_block - 1) // threads_per_block _kernel[blocks_per_grid, threads_per_block, stream](x_c, y_c) # sync kernel stream.synchronize() x_new = x_c.copy_to_host(stream=stream) del x_c, y_c assert np.allclose(x_new, np.ones_like(x_new), atol=1e-5)
def test_log_sum_exp(self): # wrapper kernel for device function that is tested @cuda.jit def _kernel(x, y): x_pos = cuda.grid(1) if x_pos < x.shape[0] and x_pos < y.shape[0]: x[x_pos] = rnnt_helper.log_sum_exp(x[x_pos], y[x_pos]) x = np.zeros([8]) # np.random.rand(8192) y = np.ones([8]) # np.random.rand(8192) stream = cuda.stream() x_c = cuda.to_device(x, stream=stream) y_c = cuda.to_device(y, stream=stream) # call kernel threads_per_block = global_constants.threads_per_block() blocks_per_grid = (x.shape[0] + threads_per_block - 1) // threads_per_block _kernel[blocks_per_grid, threads_per_block, stream](x_c, y_c) # sync kernel stream.synchronize() x_new = x_c.copy_to_host(stream=stream) del x_c, y_c assert (x_new.sum() - 10.506093500145782) <= 1e-5
def test_exponential(self): # wrapper kernel for device function that is tested @cuda.jit def _kernel(x): x_pos = cuda.grid(1) if x_pos < x.shape[0]: x[x_pos] = rnnt_helper.exponential(x[x_pos]) x = np.random.rand(8) stream = cuda.stream() x_c = cuda.to_device(x, stream=stream) # call kernel threads_per_block = global_constants.threads_per_block() blocks_per_grid = (x.shape[0] + threads_per_block - 1) // threads_per_block _kernel[blocks_per_grid, threads_per_block, stream](x_c) # sync kernel stream.synchronize() x_new = x_c.copy_to_host(stream=stream) del x_c y = np.exp(x) for i in range(len(x_new)): assert (x_new[i] - y[i]) < 1e-4
def test_func(self): np.random.seed(42) A = np.array(np.random.random((n, n)), dtype=np.float32) B = np.array(np.random.random((n, n)), dtype=np.float32) C = np.empty_like(A) s = time() stream = cuda.stream() with stream.auto_synchronize(): dA = cuda.to_device(A, stream) dB = cuda.to_device(B, stream) dC = cuda.to_device(C, stream) cu_square_matrix_mul[(bpg, bpg), (tpb, tpb), stream](dA, dB, dC) dC.copy_to_host(C, stream) e = time() tcuda = e - s # Host compute s = time() Cans = np.dot(A, B) e = time() tcpu = e - s # Check result np.testing.assert_allclose(C, Cans, rtol=1e-5)
def setup(self): self.stream = cuda.stream() self.f32 = np.zeros(self.n, dtype=np.float32) self.d_f32 = cuda.to_device(self.f32, self.stream) self.f64 = np.zeros(self.n, dtype=np.float64) self.d_f64 = cuda.to_device(self.f64, self.stream) self.stream.synchronize()
def test(ty): print("Test %s" % ty) data = np.array(np.random.random(1e6 + 1), dtype=ty) ts = time() stream = cuda.stream() device_data = cuda.to_device(data, stream) dresult = cuda_ufunc(device_data, device_data, stream=stream) result = dresult.copy_to_host() stream.synchronize() tnumba = time() - ts ts = time() gold = np_ufunc(data, data) tnumpy = time() - ts print("Numpy time: %fs" % tnumpy) print("Numba time: %fs" % tnumba) if tnumba < tnumpy: print("Numba is FASTER by %fx" % (tnumpy / tnumba)) else: print("Numba is SLOWER by %fx" % (tnumba / tnumpy)) self.assertTrue(np.allclose(gold, result), (gold, result))
def test_gufunc_stream(self): #cuda.driver.flush_pending_free() matrix_ct = 1001 # an odd number to test thread/block division in CUDA A = np.arange(matrix_ct * 2 * 4, dtype=np.float32).reshape(matrix_ct, 2, 4) B = np.arange(matrix_ct * 4 * 5, dtype=np.float32).reshape(matrix_ct, 4, 5) ts = time() stream = cuda.stream() dA = cuda.to_device(A, stream) dB = cuda.to_device(B, stream) dC = cuda.device_array(shape=(1001, 2, 5), dtype=A.dtype, stream=stream) dC = gufunc(dA, dB, out=dC, stream=stream) C = dC.copy_to_host(stream=stream) stream.synchronize() tcuda = time() - ts ts = time() Gold = ut.matrix_multiply(A, B) tcpu = time() - ts stream_speedups.append(tcpu / tcuda) self.assertTrue(np.allclose(C, Gold))
def test_laplace_small(self): NN = 256 NM = 256 A = np.zeros((NN, NM), dtype=np.float64) Anew = np.zeros((NN, NM), dtype=np.float64) n = NN m = NM iter_max = 1000 tol = 1.0e-6 error = 1.0 for j in range(n): A[j, 0] = 1.0 Anew[j, 0] = 1.0 print("Jacobi relaxation Calculation: %d x %d mesh" % (n, m)) timer = time.time() iter = 0 blockdim = (tpb, tpb) griddim = (NN // blockdim[0], NM // blockdim[1]) error_grid = np.zeros(griddim) stream = cuda.stream() dA = cuda.to_device(A, stream) # to device and don't come back dAnew = cuda.to_device(Anew, stream) # to device and don't come back derror_grid = cuda.to_device(error_grid, stream) while error > tol and iter < iter_max: self.assertTrue(error_grid.dtype == np.float64) jocabi_relax_core[griddim, blockdim, stream](dA, dAnew, derror_grid) derror_grid.copy_to_host(error_grid, stream=stream) # error_grid is available on host stream.synchronize() error = np.abs(error_grid).max() # swap dA and dAnew tmp = dA dA = dAnew dAnew = tmp if iter % 100 == 0: print("%5d, %0.6f (elapsed: %f s)" % (iter, error, time.time() - timer)) iter += 1 runtime = time.time() - timer print(" total: %f s" % runtime)
def monte_carlo_pricer(paths, dt, interest, volatility): n = paths.shape[0] mm = MM(shape=n, dtype=np.double, prealloc=5) blksz = cuda.get_current_device().MAX_THREADS_PER_BLOCK gridsz = int(math.ceil(float(n) / blksz)) stream = cuda.stream() prng = PRNG(PRNG.MRG32K3A, stream=stream) # Allocate device side array d_normdist = cuda.device_array(n, dtype=np.double, stream=stream) c0 = interest - 0.5 * volatility ** 2 c1 = volatility * math.sqrt(dt) d_last = cuda.to_device(paths[:, 0], to=mm.get()) for j in range(1, paths.shape[1]): prng.normal(d_normdist, mean=0, sigma=1) d_paths = cuda.to_device(paths[:, j], stream=stream, to=mm.get()) step(d_last, dt, c0, c1, d_normdist, out=d_paths, stream=stream) d_paths.copy_to_host(paths[:, j], stream=stream) mm.free(d_last) d_last = d_paths stream.synchronize()
def box2d_rotate_iou(boxes2d, gt_boxes2d, device_id=0): # Inputs: # boxes2d: (N1, 5) x,y,w,l,r # gt_boxes2d: (N2, 5) x,y,w,l,r # Outputs: # iou: (N1, N2) boxes2d = boxes2d.astype(np.float32) gt_boxes2d = gt_boxes2d.astype(np.float32) N1 = boxes2d.shape[0] N2 = gt_boxes2d.shape[0] iou = np.zeros((N1, N2), dtype=np.float32) if N1 == 0 or N2 == 0: return iou threadsPerBlock = 8 * 8 cuda.select_device(device_id) blockspergrid = (DIVUP(N1, threadsPerBlock), DIVUP(N2, threadsPerBlock)) stream = cuda.stream() with stream.auto_synchronize(): boxes_dev = cuda.to_device(boxes2d.reshape([-1]), stream) query_boxes_dev = cuda.to_device(gt_boxes2d.reshape([-1]), stream) iou_dev = cuda.to_device(iou.reshape([-1]), stream) rotate_iou_kernel[blockspergrid, threadsPerBlock, stream](N1, N2, boxes_dev, query_boxes_dev, iou_dev) iou_dev.copy_to_host(iou.reshape([-1]), stream=stream) return iou.astype(boxes2d.dtype)
def nms_gpu(dets, nms_overlap_thresh, device_id=0): """nms in gpu. Args: dets ([type]): [description] nms_overlap_thresh ([type]): [description] device_id ([type], optional): Defaults to 0. [description] Returns: [type]: [description] """ boxes_num = dets.shape[0] keep_out = np.zeros([boxes_num], dtype=np.int32) scores = dets[:, 4] order = scores.argsort()[::-1].astype(np.int32) boxes_host = dets[order, :] threadsPerBlock = 8 * 8 col_blocks = div_up(boxes_num, threadsPerBlock) cuda.select_device(device_id) mask_host = np.zeros((boxes_num * col_blocks, ), dtype=np.uint64) blockspergrid = (div_up(boxes_num, threadsPerBlock), div_up(boxes_num, threadsPerBlock)) stream = cuda.stream() with stream.auto_synchronize(): boxes_dev = cuda.to_device(boxes_host.reshape([-1]), stream) mask_dev = cuda.to_device(mask_host, stream) nms_kernel[blockspergrid, threadsPerBlock, stream]( boxes_num, nms_overlap_thresh, boxes_dev, mask_dev) mask_dev.copy_to_host(mask_host, stream=stream) # stream.synchronize() num_out = nms_postprocess(keep_out, mask_host, boxes_num) keep = keep_out[:num_out] return list(order[keep])
def main(): NN = 512 NM = 512 A = np.zeros((NN, NM), dtype=np.float64) Anew = np.zeros((NN, NM), dtype=np.float64) n = NN m = NM iter_max = 1000 tol = 1.0e-6 error = 1.0 for j in range(n): A[j, 0] = 1.0 Anew[j, 0] = 1.0 print("Jacobi relaxation Calculation: %d x %d mesh" % (n, m)) timer = time.time() iter = 0 blockdim = (32, 32) griddim = (NN // blockdim[0], NM // blockdim[1]) error_grid = np.zeros_like(A) stream = cuda.stream() dA = cuda.to_device(A, stream) # to device and don't come back dAnew = cuda.to_device(Anew, stream) # to device and don't come back derror_grid = cuda.to_device(error_grid, stream) while error > tol and iter < iter_max: assert error_grid.dtype == np.float64 jacobi_relax_core[griddim, blockdim, stream](dA, dAnew, derror_grid) derror_grid.to_host(stream) # error_grid is available on host stream.synchronize() error = np.abs(error_grid).max() # swap dA and dAnew tmp = dA dA = dAnew dAnew = tmp if iter % 100 == 0: print("%5d, %0.6f (elapsed: %f s)" % (iter, error, time.time() - timer)) iter += 1 runtime = time.time() - timer print(" total: %f s" % runtime)
def test_func(self): @cuda.jit(argtypes=[float32[:, ::1], float32[:, ::1], float32[:, ::1]]) def cu_square_matrix_mul(A, B, C): sA = cuda.shared.array(shape=SM_SIZE, dtype=float32) sB = cuda.shared.array(shape=(tpb, tpb), dtype=float32) tx = cuda.threadIdx.x ty = cuda.threadIdx.y bx = cuda.blockIdx.x by = cuda.blockIdx.y bw = cuda.blockDim.x bh = cuda.blockDim.y x = tx + bx * bw y = ty + by * bh acc = float32(0) # forces all the math to be f32 for i in range(bpg): if x < n and y < n: sA[ty, tx] = A[y, tx + i * tpb] sB[ty, tx] = B[ty + i * tpb, x] cuda.syncthreads() if x < n and y < n: for j in range(tpb): acc += sA[ty, j] * sB[j, tx] cuda.syncthreads() if x < n and y < n: C[y, x] = acc np.random.seed(42) A = np.array(np.random.random((n, n)), dtype=np.float32) B = np.array(np.random.random((n, n)), dtype=np.float32) C = np.empty_like(A) s = time() stream = cuda.stream() with stream.auto_synchronize(): dA = cuda.to_device(A, stream) dB = cuda.to_device(B, stream) dC = cuda.to_device(C, stream) cu_square_matrix_mul[(bpg, bpg), (tpb, tpb), stream](dA, dB, dC) dC.copy_to_host(C, stream) e = time() tcuda = e - s # Host compute s = time() Cans = np.dot(A, B) e = time() tcpu = e - s # Check result np.testing.assert_allclose(C, Cans, rtol=1e-5)
async def async_cuda_fn(value_in: float) -> float: stream = cuda.stream() h_src, h_dst = cuda.pinned_array(8), cuda.pinned_array(8) h_src[:] = value_in d_ary = cuda.to_device(h_src, stream=stream) d_ary.copy_to_host(h_dst, stream=stream) await stream.async_done() return h_dst.mean()
def setup(self): self.stream = cuda.stream() self.d_callResult = cuda.to_device(callResultGold, self.stream) self.d_putResult = cuda.to_device(putResultGold, self.stream) self.d_stockPrice = cuda.to_device(stockPrice, self.stream) self.d_optionStrike = cuda.to_device(optionStrike, self.stream) self.d_optionYears = cuda.to_device(optionYears, self.stream) self.stream.synchronize()
def test_add_callback(self): def callback(stream, status, event): event.set() stream = cuda.stream() callback_event = threading.Event() stream.add_callback(callback, callback_event) self.assertTrue(callback_event.wait(1.0))
def monte_carlo_pricer(paths, dt, interest, volatility): n = paths.shape[0] num_streams = 2 part_width = int(math.ceil(float(n) / num_streams)) partitions = [(0, part_width)] for i in range(1, num_streams): begin, end = partitions[i - 1] begin, end = end, min(end + (end - begin), n) partitions.append((begin, end)) partlens = [end - begin for begin, end in partitions] mm = MM(shape=part_width, dtype=np.double, prealloc=10 * num_streams) device = cuda.get_current_device() blksz = device.MAX_THREADS_PER_BLOCK gridszlist = [int(math.ceil(float(partlen) / blksz)) for partlen in partlens] strmlist = [cuda.stream() for _ in range(num_streams)] prnglist = [PRNG(PRNG.MRG32K3A, stream=strm) for strm in strmlist] # Allocate device side array d_normlist = [cuda.device_array(partlen, dtype=np.double, stream=strm) for partlen, strm in zip(partlens, strmlist)] c0 = interest - 0.5 * volatility ** 2 c1 = volatility * math.sqrt(dt) # Configure the kernel # Similar to CUDA-C: cu_monte_carlo_pricer<<<gridsz, blksz, 0, stream>>> steplist = [cu_step[gridsz, blksz, strm] for gridsz, strm in zip(gridszlist, strmlist)] d_lastlist = [cuda.to_device(paths[s:e, 0], to=mm.get(stream=strm)) for (s, e), strm in zip(partitions, strmlist)] for j in range(1, paths.shape[1]): for prng, d_norm in zip(prnglist, d_normlist): prng.normal(d_norm, mean=0, sigma=1) d_pathslist = [cuda.to_device(paths[s:e, j], stream=strm, to=mm.get(stream=strm)) for (s, e), strm in zip(partitions, strmlist)] for step, args in zip(steplist, zip(d_lastlist, d_pathslist, d_normlist)): d_last, d_paths, d_norm = args step(d_last, d_paths, dt, c0, c1, d_norm) for d_paths, strm, (s, e) in zip(d_pathslist, strmlist, partitions): d_paths.copy_to_host(paths[s:e, j], stream=strm) mm.free(d_last, stream=strm) d_lastlist = d_pathslist for strm in strmlist: strm.synchronize()
def newthread(): cuda.select_device(0) stream = cuda.stream() A = np.arange(100) dA = cuda.to_device(A, stream=stream) stream.synchronize() del dA del stream cuda.close()
def test_stream_bind(self): stream = cuda.stream() with stream.auto_synchronize(): arr = cuda.device_array( (3, 3), dtype=np.float64, stream=stream) self.assertEqual(arr.bind(stream).stream, stream) self.assertEqual(arr.stream, stream)
def getGraphFromEdges_gpu(dest, weight, fe, od, edges, n_edges = None, MAX_TPB = 512, stream = None): """ All input (except MAX_TPB and stream) are device arrays. edges : array with the IDs of the edges that will be part of the new graph n_edges : array of 1 element with the number of valid edges in the edges array; if n_edges < size of edges, the last elements of the edges array are not considered """ # check if number of valid edges was received if n_edges is None: edges_size = edges.size n_edges = cuda.to_device(np.array([edges_size], dtype = np.int32)) else: edges_size = int(n_edges.getitem(0)) # check if a stream was received, if not create one if stream is None: myStream = cuda.stream() else: myStream = stream new_n_edges = edges_size * 2 # allocate memory for new graph ndest = cuda.device_array(new_n_edges, dtype = dest.dtype, stream = myStream) nweight = cuda.device_array(new_n_edges, dtype = weight.dtype, stream = myStream) nfe = cuda.device_array_like(fe, stream = myStream) nod = cuda.device_array_like(od, stream = myStream) # fill new outdegree with zeros vertexGrid = compute_cuda_grid_dim(nod.size, MAX_TPB) memSet[vertexGrid, MAX_TPB, myStream](nod, 0) # count all edges of new array and who they belong to edgeGrid = compute_cuda_grid_dim(edges_size, MAX_TPB) countEdges[edgeGrid, MAX_TPB, myStream](edges, n_edges, dest, fe, od, nod) # get new first_edge array from new outdegree nfe.copy_to_device(nod, stream=myStream) ex_prefix_sum_gpu(nfe, MAX_TPB = MAX_TPB, stream = myStream) # copy new first_edge to top_edge to serve as pointer in adding edges top_edge = cuda.device_array_like(nfe, stream = myStream) top_edge.copy_to_device(nfe, stream = myStream) addEdges[edgeGrid, MAX_TPB, myStream](edges, n_edges, dest, weight, fe, od, top_edge, ndest, nweight) del top_edge #del dest, weight, fe, od return ndest, nweight, nfe, nod
def test_laplace_small(self): if config.ENABLE_CUDASIM: NN, NM = 4, 4 iter_max = 20 else: NN, NM = 256, 256 iter_max = 1000 A = np.zeros((NN, NM), dtype=np.float64) Anew = np.zeros((NN, NM), dtype=np.float64) n = NN m = NM tol = 1.0e-6 error = 1.0 for j in range(n): A[j, 0] = 1.0 Anew[j, 0] = 1.0 timer = time.time() iter = 0 blockdim = (tpb, tpb) griddim = (NN // blockdim[0], NM // blockdim[1]) error_grid = np.zeros(griddim) stream = cuda.stream() dA = cuda.to_device(A, stream) # to device and don't come back dAnew = cuda.to_device(Anew, stream) # to device and don't come back derror_grid = cuda.to_device(error_grid, stream) while error > tol and iter < iter_max: self.assertTrue(error_grid.dtype == np.float64) jocabi_relax_core[griddim, blockdim, stream](dA, dAnew, derror_grid) derror_grid.copy_to_host(error_grid, stream=stream) # error_grid is available on host stream.synchronize() error = np.abs(error_grid).max() # swap dA and dAnew tmp = dA dA = dAnew dAnew = tmp iter += 1 runtime = time.time() - timer
def _run_copies(self, A): A0 = np.copy(A) stream = cuda.stream() ptr = cuda.to_device(A, copy=False, stream=stream) ptr.copy_to_device(A, stream=stream) ptr.copy_to_host(A, stream=stream) stream.synchronize() self.assertTrue(np.allclose(A, A0))
def __init__(self, positions, weights): self.calculate_forces = cuda.jit( argtypes=(float32[:,:], float32[:], float32[:,:]) )(calculate_forces) self.accelerations = np.zeros_like(positions) self.n_bodies = len(weights) self.stream = cuda.stream() self.d_pos = cuda.to_device(positions, self.stream) self.d_wei = cuda.to_device(weights, self.stream) self.d_acc = cuda.to_device(self.accelerations, self.stream) self.stream.synchronize()
def newthread(exception_queue): try: cuda.select_device(0) stream = cuda.stream() A = np.arange(100) dA = cuda.to_device(A, stream=stream) stream.synchronize() del dA del stream cuda.close() except Exception as e: exception_queue.put(e)
def test(ty): data = np.array(np.random.random(self.N), dtype=ty) stream = cuda.stream() device_data = cuda.to_device(data, stream) dresult = cuda_ufunc(device_data, device_data, stream=stream) result = dresult.copy_to_host() stream.synchronize() gold = np_ufunc(data, data) self.assertTrue(np.allclose(gold, result), (gold, result))
def reduce_test2(self, n): @vectorize(sig, target=target) def vector_add(a, b): return a + b cuda_ufunc = vector_add x = np.arange(n, dtype=np.int32) gold = np.add.reduce(x) stream = cuda.stream() dx = cuda.to_device(x, stream) result = cuda_ufunc.reduce(dx, stream=stream) self.assertEqual(result, gold)
def test_event_elapsed_stream(self): N = 32 stream = cuda.stream() dary = cuda.device_array(N, dtype=np.double) evtstart = cuda.event() evtend = cuda.event() evtstart.record(stream=stream) cuda.to_device(np.arange(N), to=dary, stream=stream) evtend.record(stream=stream) evtend.wait(stream=stream) evtend.synchronize() print(evtstart.elapsed_time(evtend))
def _prepare(self, arr, stream): if arr.ndim != 1: raise TypeError("only support 1D array") from numba import cuda # If no stream is specified, allocate one if stream == 0: stream = cuda.stream() # Make sure `arr` in on the device darr, conv = cuda.devicearray.auto_device(arr, stream=stream) return darr, stream, conv
def test_blackscholes(self): OPT_N = 400 iterations = 2 stockPrice = randfloat(np.random.random(OPT_N), 5.0, 30.0) optionStrike = randfloat(np.random.random(OPT_N), 1.0, 100.0) optionYears = randfloat(np.random.random(OPT_N), 0.25, 10.0) callResultNumpy = np.zeros(OPT_N) putResultNumpy = -np.ones(OPT_N) callResultNumbapro = np.zeros(OPT_N) putResultNumbapro = -np.ones(OPT_N) # numpy for i in range(iterations): black_scholes(callResultNumpy, putResultNumpy, stockPrice, optionStrike, optionYears, RISKFREE, VOLATILITY) # numbapro time0 = time.time() blockdim = 512, 1 griddim = int(math.ceil(float(OPT_N) / blockdim[0])), 1 stream = cuda.stream() d_callResult = cuda.to_device(callResultNumbapro, stream) d_putResult = cuda.to_device(putResultNumbapro, stream) d_stockPrice = cuda.to_device(stockPrice, stream) d_optionStrike = cuda.to_device(optionStrike, stream) d_optionYears = cuda.to_device(optionYears, stream) time1 = time.time() for i in range(iterations): black_scholes_cuda[griddim, blockdim, stream]( d_callResult, d_putResult, d_stockPrice, d_optionStrike, d_optionYears, RISKFREE, VOLATILITY) d_callResult.copy_to_host(callResultNumbapro, stream) d_putResult.copy_to_host(putResultNumbapro, stream) stream.synchronize() dt = (time1 - time0) print("numbapro.cuda time: %f msec" % ((1000 * dt) / iterations)) delta = np.abs(callResultNumpy - callResultNumbapro) L1norm = delta.sum() / np.abs(callResultNumpy).sum() max_abs_err = delta.max() print('L1norm', L1norm) print('Max absolute error', max_abs_err) self.assertTrue(L1norm < 1e-13) self.assertTrue(max_abs_err < 1e-13)
def _template(self, name, A): A0 = np.copy(A) s = timer() stream = cuda.stream() ptr = cuda.to_device(A, copy=False, stream=stream) ptr.copy_to_device(A, stream=stream) ptr.copy_to_host(A, stream=stream) stream.synchronize() e = timer() self.assertTrue(np.allclose(A, A0)) elapsed = e - s return elapsed
def test_gufunc_stream(self): @guvectorize([void(float32[:, :], float32[:, :], float32[:, :])], '(m,n),(n,p)->(m,p)', target='cuda') def matmulcore(A, B, C): m, n = A.shape n, p = B.shape for i in range(m): for j in range(p): C[i, j] = 0 for k in range(n): C[i, j] += A[i, k] * B[k, j] gufunc = matmulcore gufunc.max_blocksize = 512 #cuda.driver.flush_pending_free() matrix_ct = 1001 # an odd number to test thread/block division in CUDA A = np.arange(matrix_ct * 2 * 4, dtype=np.float32).reshape(matrix_ct, 2, 4) B = np.arange(matrix_ct * 4 * 5, dtype=np.float32).reshape(matrix_ct, 4, 5) ts = time() stream = cuda.stream() dA = cuda.to_device(A, stream) dB = cuda.to_device(B, stream) dC = cuda.device_array(shape=(1001, 2, 5), dtype=A.dtype, stream=stream) dC = gufunc(dA, dB, out=dC, stream=stream) C = dC.copy_to_host(stream=stream) stream.synchronize() tcuda = time() - ts ts = time() Gold = ut.matrix_multiply(A, B) tcpu = time() - ts stream_speedups.append(tcpu / tcuda) self.assertTrue(np.allclose(C, Gold))
def test_device_auto_jit_2(self): @cuda.jit(device=True) def inner(arg): return arg + 1 @cuda.jit def outer(argin, argout): argout[0] = inner(argin[0]) + inner(2) a = np.zeros(1) b = np.zeros(1) stream = cuda.stream() d_a = cuda.to_device(a, stream) d_b = cuda.to_device(b, stream) outer[1, 1, stream](d_a, d_b) d_b.copy_to_host(b, stream) self.assertEqual(b[0], (a[0] + 1) + (2 + 1))
def device_partial_inplace(self, darr, size=None, init=0, stream=0): """Partially reduce a device array inplace as much as possible in an efficient manner. Does not automatically transfer host array. :param darr: Used to input and output. :type darr: device array :param size: Number of element in ``arr``. If None, the entire array is used. :type size: int or None :param init: Initial value for the reduction :type init: dtype of darr :param stream: All CUDA operations are performed on this stream if it is given. Otherwise, a new stream is created. :type stream: cuda stream :returns: int -- Number of elements in ``darr`` that contains the reduction result. """ if stream == 0: from numba import cuda stream = cuda.stream() ret = self._partial_inplace_driver(darr, size, init, stream) stream.synchronize() else: ret = self._partial_inplace_driver(darr, size, init, stream) return ret
def connected_comps_gpu(dest_in, weight_in, firstEdge_in, outDegree_in, MAX_TPB = 512, stream = None): if stream is None: myStream = cuda.stream() else: myStream = stream dest = cuda.to_device(dest_in, stream = myStream) weight = cuda.to_device(weight_in, stream = myStream) firstEdge = cuda.to_device(firstEdge_in, stream = myStream) outDegree = cuda.to_device(outDegree_in, stream = myStream) n_vertices = firstEdge.size n_edges = dest.size n_components = n_vertices # still need edge_id for conflict resolution in find_minedge edge_id = cuda.to_device(np.arange(n_edges, dtype = dest.dtype), stream = myStream) #labels = np.empty(n_vertices, dtype = dest.dtype) first_iter = True # initialize with name top_edge so we can recycle an array between iterations top_edge = cuda.device_array(n_components, dtype = dest.dtype, stream = myStream) labels = cuda.device_array(n_components, dtype = dest.dtype, stream = myStream) converged = cuda.device_array(1, dtype = np.int8, stream = myStream) gridDimLabels = compute_cuda_grid_dim(n_components, MAX_TPB) gridDim = compute_cuda_grid_dim(n_components, MAX_TPB) final_converged = False while(not final_converged): vertex_minedge = top_edge findMinEdge_CUDA[gridDim, MAX_TPB, myStream](weight, firstEdge, outDegree, vertex_minedge, dest) removeMirroredEdges_CUDA[gridDim, MAX_TPB, myStream](dest, vertex_minedge) colors = cuda.device_array(shape = n_components, dtype = np.int32, stream = myStream) initializeColors_CUDA[gridDim, MAX_TPB, myStream](dest, vertex_minedge, colors) # propagate colors until convergence propagateConverged = False while(not propagateConverged): propagateColors_CUDA[gridDim, MAX_TPB, myStream](colors, converged) converged_num = converged.getitem(0, stream = myStream) propagateConverged = True if converged_num == 1 else False # first we build the flags in the new_vertex array new_vertex = vertex_minedge # reuse the vertex_minedge array as the new new_vertex buildFlag_CUDA[gridDim, MAX_TPB, myStream](colors, new_vertex) # new_n_vertices is the number of vertices of the new contracted graph new_n_vertices = ex_prefix_sum_gpu(new_vertex, MAX_TPB = MAX_TPB, stream = myStream).getitem(0, stream = myStream) new_n_vertices = int(new_n_vertices) if first_iter: # first iteration defines labels as the initial colors and updates labels.copy_to_device(colors, stream = myStream) first_iter = False # other iterations update the labels with the new colors update_labels_single_pass_cuda[gridDimLabels, MAX_TPB, myStream](labels, colors, new_vertex) if new_n_vertices == 1: final_converged = True del new_vertex break newGridDim = compute_cuda_grid_dim(n_components, MAX_TPB) # count number of edges for new supervertices and write in new outDegree newOutDegree = cuda.device_array(shape = new_n_vertices, dtype = np.int32, stream = myStream) memSet[newGridDim, MAX_TPB, myStream](newOutDegree, 0) # zero the newOutDegree array countNewEdges_CUDA[gridDim, MAX_TPB, myStream](colors, firstEdge, outDegree, dest, new_vertex, newOutDegree) # new first edge array for contracted graph newFirstEdge = cuda.device_array_like(newOutDegree, stream = myStream) # copy newOutDegree to newFirstEdge newFirstEdge.copy_to_device(newOutDegree, stream = myStream) new_n_edges = ex_prefix_sum_gpu(newFirstEdge, MAX_TPB = MAX_TPB, stream = myStream) new_n_edges = new_n_edges.getitem(0, stream = myStream) new_n_edges = int(new_n_edges) # if no edges remain, then MST has converged if new_n_edges == 0: final_converged = True del newOutDegree, newFirstEdge, new_vertex break # create arrays for new edges new_dest = cuda.device_array(new_n_edges, dtype = np.int32, stream = myStream) new_edge_id = cuda.device_array(new_n_edges, dtype = np.int32, stream = myStream) new_weight = cuda.device_array(new_n_edges, dtype = weight.dtype, stream = myStream) top_edge = cuda.device_array_like(newFirstEdge, stream = myStream) top_edge.copy_to_device(newFirstEdge, stream = myStream) # assign and insert new edges assignInsert_CUDA[gridDim, MAX_TPB, myStream](edge_id, dest, weight, firstEdge, outDegree, colors, new_vertex, new_dest, new_edge_id, new_weight, top_edge) # delete old graph del new_vertex, edge_id, dest, weight, firstEdge, outDegree, colors # write new graph n_components = newFirstEdge.size edge_id = new_edge_id dest = new_dest weight = new_weight firstEdge = newFirstEdge outDegree = newOutDegree gridDim = newGridDim returnLabels = labels.copy_to_host() del dest, weight, edge_id, firstEdge, outDegree, converged, labels return returnLabels
def test_blackscholes(self): OPT_N = 400 iterations = 2 stockPrice = randfloat(np.random.random(OPT_N), 5.0, 30.0) optionStrike = randfloat(np.random.random(OPT_N), 1.0, 100.0) optionYears = randfloat(np.random.random(OPT_N), 0.25, 10.0) callResultNumpy = np.zeros(OPT_N) putResultNumpy = -np.ones(OPT_N) callResultNumbapro = np.zeros(OPT_N) putResultNumbapro = -np.ones(OPT_N) # numpy for i in range(iterations): black_scholes(callResultNumpy, putResultNumpy, stockPrice, optionStrike, optionYears, RISKFREE, VOLATILITY) @cuda.jit(argtypes=(double,), restype=double, device=True, inline=True) def cnd_cuda(d): K = 1.0 / (1.0 + 0.2316419 * math.fabs(d)) ret_val = (RSQRT2PI * math.exp(-0.5 * d * d) * (K * (A1 + K * (A2 + K * (A3 + K * (A4 + K * A5)))))) if d > 0: ret_val = 1.0 - ret_val return ret_val @cuda.jit(argtypes=(double[:], double[:], double[:], double[:], double[:], double, double)) def black_scholes_cuda(callResult, putResult, S, X, T, R, V): i = cuda.threadIdx.x + cuda.blockIdx.x * cuda.blockDim.x if i >= S.shape[0]: return sqrtT = math.sqrt(T[i]) d1 = (math.log(S[i] / X[i]) + (R + 0.5 * V * V) * T[i]) / (V * sqrtT) d2 = d1 - V * sqrtT cndd1 = cnd_cuda(d1) cndd2 = cnd_cuda(d2) expRT = math.exp((-1. * R) * T[i]) callResult[i] = (S[i] * cndd1 - X[i] * expRT * cndd2) putResult[i] = (X[i] * expRT * (1.0 - cndd2) - S[i] * (1.0 - cndd1)) # numbapro time0 = time.time() blockdim = 512, 1 griddim = int(math.ceil(float(OPT_N) / blockdim[0])), 1 stream = cuda.stream() d_callResult = cuda.to_device(callResultNumbapro, stream) d_putResult = cuda.to_device(putResultNumbapro, stream) d_stockPrice = cuda.to_device(stockPrice, stream) d_optionStrike = cuda.to_device(optionStrike, stream) d_optionYears = cuda.to_device(optionYears, stream) time1 = time.time() for i in range(iterations): black_scholes_cuda[griddim, blockdim, stream]( d_callResult, d_putResult, d_stockPrice, d_optionStrike, d_optionYears, RISKFREE, VOLATILITY) d_callResult.copy_to_host(callResultNumbapro, stream) d_putResult.copy_to_host(putResultNumbapro, stream) stream.synchronize() dt = (time1 - time0) delta = np.abs(callResultNumpy - callResultNumbapro) L1norm = delta.sum() / np.abs(callResultNumpy).sum() max_abs_err = delta.max() self.assertTrue(L1norm < 1e-13) self.assertTrue(max_abs_err < 1e-13)