# NumbaPro-era imports assumed throughout these snippets (module paths may
# vary by release):
#   import math, time
#   import numpy as np
#   from numbapro import cuda
#   from numbapro.cudalib import curand
#   from numbapro.cudalib.sorting import RadixSort, segmented_sort

def monte_carlo_pricer(paths, dt, interest, volatility):
    n = paths.shape[0]
    mm = MM(shape=n, dtype=np.double, prealloc=5)

    blksz = cuda.get_current_device().MAX_THREADS_PER_BLOCK
    gridsz = int(math.ceil(float(n) / blksz))

    stream = cuda.stream()
    prng = curand.PRNG(curand.PRNG.MRG32K3A, stream=stream)

    # Allocate device side array
    d_normdist = cuda.device_array(n, dtype=np.double, stream=stream)

    c0 = interest - 0.5 * volatility**2
    c1 = volatility * math.sqrt(dt)

    d_last = cuda.to_device(paths[:, 0], to=mm.get())
    for j in range(1, paths.shape[1]):
        prng.normal(d_normdist, mean=0, sigma=1)
        d_paths = cuda.to_device(paths[:, j], stream=stream, to=mm.get())
        step(d_last, dt, c0, c1, d_normdist, out=d_paths, stream=stream)
        d_paths.copy_to_host(paths[:, j], stream=stream)
        mm.free(d_last)
        d_last = d_paths
    stream.synchronize()
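# Both this pricer and the multi-stream variant at the end of the file lean
# on a device-array pool called MM that is not defined in this file. The
# class below is only a sketch consistent with the calls MM(shape, dtype,
# prealloc), mm.get(stream=...) and mm.free(gpumem, stream=...): it uses
# CUDA events so a buffer is handed out again only after the work that last
# touched it has finished. The original implementation may differ.
from collections import deque

class MM(object):
    """Pool of reusable device arrays (a sketch, not the original)."""

    def __init__(self, shape, dtype, prealloc):
        self.freelist = deque()
        self.events = {}
        for _ in range(prealloc):
            gpumem = cuda.device_array(shape=shape, dtype=dtype)
            self.freelist.append(gpumem)
            self.events[gpumem] = cuda.event(timing=False)

    def get(self, stream=0):
        assert self.freelist, 'pool exhausted; increase prealloc'
        gpumem = self.freelist.popleft()
        evnt = self.events[gpumem]
        if not evnt.query():            # pending work on this buffer?
            evnt.wait(stream=stream)    # make the stream wait for it
        return gpumem

    def free(self, gpumem, stream=0):
        # Record an event so get() knows when reuse becomes safe.
        self.events[gpumem].record(stream=stream)
        self.freelist.append(gpumem)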
def monte_carlo_pricer(paths, dt, interest, volatility):
    n = paths.shape[0]
    blksz = 512
    gridsz = int(math.ceil(float(n) / blksz))

    stream = cuda.stream()
    prng = curand.PRNG(curand.PRNG.MRG32K3A, stream=stream)
    qrng = curand.QRNG(curand.QRNG.SOBOL32, stream=stream)

    d_normdist = cuda.device_array(n, dtype=np.double, stream=stream)
    d_seed = cuda.device_array(n, dtype=np.uint32, stream=stream)

    prng.normal(d_normdist, 0, 1)
    qrng.generate(d_seed)

    d_paths = cuda.to_device(paths, stream=stream)

    c0 = interest - 0.5 * volatility**2
    c1 = volatility * math.sqrt(dt)

    griddim = gridsz, 1
    blockdim = blksz, 1, 1
    # Launch configuration mirrors CUDA-C:
    #   cu_monte_carlo_pricer<<<griddim, blockdim, 0, stream>>>(...)
    cu_monte_carlo_pricer[griddim, blockdim, stream](d_paths, dt, c0, c1,
                                                     d_normdist, d_seed)

    d_paths.to_host(stream)
    stream.synchronize()
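# The device kernel cu_monte_carlo_pricer is defined elsewhere; only its
# launch is visible above. The sketch below shows one plausible shape for
# it under stated assumptions: each thread owns one path, consumes its
# cuRAND normal for the first step, then advances a crude per-thread LCG
# plus Box-Muller transform seeded from the Sobol draws for the remaining
# steps. The real kernel's PRNG is almost certainly different.
import math
from numba import cuda

@cuda.jit
def cu_monte_carlo_pricer(paths, dt, c0, c1, normdist, seed):
    i = cuda.grid(1)
    if i >= paths.shape[0]:
        return
    s = seed[i]
    noise = normdist[i]                  # first draw comes from cuRAND
    for j in range(1, paths.shape[1]):
        paths[i, j] = paths[i, j - 1] * math.exp(c0 * dt + c1 * noise)
        # Placeholder generator for subsequent draws (assumption).
        s = (1103515245 * s + 12345) & 0xFFFFFFFF
        u1 = (s + 1.0) / 4294967296.0
        s = (1103515245 * s + 12345) & 0xFFFFFFFF
        u2 = (s + 1.0) / 4294967296.0
        noise = math.sqrt(-2.0 * math.log(u1)) * math.cos(2.0 * math.pi * u2)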
def get_cuda_randoms(x, y):
    # rand serves as a container for the random numbers;
    # cuRAND only fills 1-dimensional arrays.
    rand = np.empty((x * y), np.float64)
    # The rndtype argument selects the random number algorithm.
    prng = curand.PRNG(rndtype=curand.PRNG.XORWOW)
    prng.normal(rand, 0, 1)  # fill the container
    # To be "fair", reshape rand to 2 dimensions.
    rand = rand.reshape(x, y)
    return rand
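# Example use (shape values are illustrative): draw a 4x3 block of
# standard normals on the GPU; the sample mean approaches 0 as the block
# grows.
if __name__ == '__main__':
    rands = get_cuda_randoms(4, 3)
    print(rands.shape)   # (4, 3)
    print(rands.mean())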
def spca_simpler(Vd, epsilon=0.1, d=3, k=10):
    p = Vd.shape[0]
    numSamples = int(math.ceil((4. / epsilon)**d))
    print(numSamples)

    # Actual algorithm
    opt_x = np.zeros((p, 1))
    opt_v = -np.inf

    # Prepare CUDA
    prng = curand.PRNG()
    custr = cuda.stream()

    # Generate all random samples up front (replaces per-iteration
    # np.random.randn(d, 1) draws).
    C = np.empty((d, numSamples), dtype=float_dtype)
    prng.normal(C.ravel(), mean=0, sigma=1)

    sorter = RadixSort(maxcount=Vd.shape[0], dtype=Vd.dtype, stream=custr,
                       descending=True)

    for i in range(1, numSamples + 1):
        c = C[:, i - 1:i]
        c = c / np.linalg.norm(c)
        a = Vd.dot(c)

        # GPU k-selection replaces the NumPy reference:
        #   I = np.argsort(a, axis=0)
        #   val = np.linalg.norm(a[I[-k:]])  # index backwards for k largest
        # The descending sorter reorders a in place, so a[:k] holds the k
        # largest entries and I their original indices.
        I = sorter.argselect(k, a[:, 0])
        val = np.linalg.norm(a[:k])

        if val > opt_v:
            opt_v = val
            opt_x = np.zeros((p, 1), dtype=float_dtype)
            opt_x[I] = a[:k] / val

    return opt_x
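# For checking spca_simpler on small inputs, a pure-NumPy equivalent of the
# selection step (this is the argsort path that the RadixSort call
# replaces, per the reference comment above):
def k_largest_norm(a, k):
    I = np.argsort(a, axis=0)[-k:, 0]   # indices of the k largest entries
    return I, np.linalg.norm(a[I])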
def monte_carlo_pricer(paths, dt, interest, volatility):
    n = paths.shape[0]

    blksz = cuda.get_current_device().MAX_THREADS_PER_BLOCK
    gridsz = int(math.ceil(float(n) / blksz))

    # Instantiate cuRAND PRNG
    prng = curand.PRNG(curand.PRNG.MRG32K3A)

    # Allocate device side array
    d_normdist = cuda.device_array(n, dtype=np.double)

    c0 = interest - 0.5 * volatility**2
    c1 = volatility * math.sqrt(dt)

    # Simulation loop
    d_last = cuda.to_device(paths[:, 0])
    for j in range(1, paths.shape[1]):
        prng.normal(d_normdist, mean=0, sigma=1)
        d_paths = cuda.to_device(paths[:, j])
        step(d_last, dt, c0, c1, d_normdist, out=d_paths)
        d_paths.copy_to_host(paths[:, j])
        d_last = d_paths
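# Both serial pricer variants call a `step` ufunc defined elsewhere. Given
# c0 = r - sigma**2 / 2 and c1 = sigma * sqrt(dt), the natural update is
# the exact geometric-Brownian-motion step, so a sketch of the kernel looks
# like this (the original's decorator and signature may differ; NumbaPro
# used target='gpu' where modern Numba uses target='cuda'):
import math
from numba import vectorize

@vectorize(['float64(float64, float64, float64, float64, float64)'],
           target='cuda')
def step(last, dt, c0, c1, noise):
    # S_{t+dt} = S_t * exp((r - sigma^2/2) * dt + sigma * sqrt(dt) * Z)
    return last * math.exp(c0 * dt + c1 * noise)

# Hypothetical driver (constants are illustrative):
#   paths = np.zeros((1000000, 100), dtype=np.double, order='F')
#   paths[:, 0] = 20.0  # initial spot price
#   monte_carlo_pricer(paths, dt=1.0 / 100, interest=0.02, volatility=0.3)
#   print(paths[:, -1].mean())  # Monte Carlo estimate of E[S_T]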
def __get_cuda_randoms(self):
    # Fill the preallocated container with standard normals via cuRAND.
    prng = curand.PRNG(rndtype=curand.PRNG.XORWOW)
    prng.normal(self.container, 0, 1)
def main(*args):
    OPT_N = 4000000
    iterations = 10
    if len(args) >= 2:
        iterations = int(args[0])

    blockdim = 1024, 1
    griddim = int(math.ceil(float(OPT_N) / blockdim[0])), 1

    # Use cuRAND to generate random numbers directly on the GPU
    # to avoid memory transfers.
    prng = curand.PRNG(rndtype=curand.PRNG.XORWOW)

    time0 = time.time()

    # malloc
    d_stockPrice = cuda.device_array(shape=(OPT_N), dtype=np.float32)
    d_optionStrike = cuda.device_array(shape=(OPT_N), dtype=np.float32)
    d_optionYears = cuda.device_array(shape=(OPT_N), dtype=np.float32)

    # Base distribution
    prng.uniform(d_stockPrice)
    prng.uniform(d_optionStrike)
    prng.uniform(d_optionYears)

    stream = cuda.stream()

    cfg_distribute = c_distribute[griddim, blockdim, stream]

    cfg_distribute(d_stockPrice, 5.0, 30.0)
    cfg_distribute(d_optionStrike, 1.0, 100.0)
    cfg_distribute(d_optionYears, 0.25, 10.)

    stream.synchronize()

    callResultNumbapro = np.zeros(OPT_N)
    putResultNumbapro = -np.ones(OPT_N)

    d_callResult = cuda.to_device(callResultNumbapro, stream)
    d_putResult = cuda.to_device(putResultNumbapro, stream)

    time1 = time.time()

    # Preconfigure the kernel as it's called multiple times in a loop.
    cfg_black_scholes_cuda = black_scholes_cuda[griddim, blockdim, stream]
    for i in range(iterations):
        cfg_black_scholes_cuda(d_callResult, d_putResult, d_stockPrice,
                               d_optionStrike, d_optionYears, RISKFREE,
                               VOLATILITY)
        d_callResult.to_host(stream)
        d_putResult.to_host(stream)
        stream.synchronize()

    time2 = time.time()
    dt = (time1 - time0) * 10 + (time2 - time1)

    print("numbapro.cuda time: %f msec" % ((1000 * dt) / iterations))
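# c_distribute is defined elsewhere; the calls above evidently rescale
# cuRAND's uniform [0, 1) draws into [low, high]. A sketch consistent with
# the call sites cfg_distribute(d_array, low, high):
from numba import cuda

@cuda.jit
def c_distribute(arr, low, high):
    i = cuda.grid(1)
    if i < arr.shape[0]:
        # Affine map from the unit interval onto [low, high].
        arr[i] = low + arr[i] * (high - low)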
def get_cuda_randoms(x, y):
    rand = np.empty((x * y), np.float64)
    prng = curand.PRNG(rndtype=curand.PRNG.XORWOW)
    prng.normal(rand, 0, 1)  # filling the container
    rand = rand.reshape((x, y))
    return rand
def spca_full(Vd, epsilon=0.1, d=3, k=10):
    p = Vd.shape[0]
    initNumSamples = int(math.ceil((4. / epsilon)**d))
    print(initNumSamples)
    maxSize = 6400

    # Actual algorithm
    opt_x = np.zeros((p, 1), dtype=float_dtype)
    opt_v = -np.inf

    # Send Vd to the GPU once; every batch reuses it.
    dVd = cuda.to_device(Vd)

    remaining = initNumSamples

    custr = cuda.stream()
    prng = curand.PRNG(stream=custr)

    # Process the random samples in batches of at most maxSize.
    while remaining:
        numSamples = min(remaining, maxSize)
        remaining -= numSamples

        # Prepare storage for vector A
        dA = cuda.device_array(shape=(Vd.shape[0], numSamples), order='F',
                               dtype=Vd.dtype)
        dI = cuda.device_array(shape=(Vd.shape[0], numSamples),
                               dtype=np.uint32, order='F')
        daInorm = cuda.device_array(shape=numSamples, dtype=Vd.dtype)
        dC = cuda.device_array(shape=(d, numSamples), order='F',
                               dtype=Vd.dtype)

        # Generate all random samples up front and normalize them on the
        # device.
        prng.normal(dC.reshape(dC.size), mean=0, sigma=1)
        norm_random_nums[calc_ncta1d(dC.shape[1], 512), 512, custr](dC, d)

        # Replaces: a = Vd.dot(c)
        # XXX: Vd.shape[0] must be within compute capability requirement
        # Note: this kernel scales easily because the number of samples is
        # used as the ncta.
        batch_matmul[numSamples, 512, custr](dVd, dC, dA)

        # Replaces: I = np.argsort(a, axis=0)
        # Note: the k-selection dominates the running time.
        nn = Vd.shape[0]
        segments = (np.arange(numSamples - 1, dtype=np.int32) + 1) * nn
        blksz = 32
        init_indices[(divup(dI.shape[0], blksz), divup(dI.shape[1], blksz)),
                     (blksz, blksz), custr](dI)
        segmented_sort(dA, dI, segments, stream=custr)

        # Replaces: val = np.linalg.norm(a[I[-k:]])
        # After the ascending segmented sort, the last k rows of each
        # column hold its k largest entries.
        dA = dA.bind(custr)[-k:]
        dI = dI.bind(custr)[-k:]
        batch_norm[calc_ncta1d(numSamples, 512), 512, custr](dA, daInorm, k)

        aInorm = daInorm.copy_to_host(stream=custr)
        custr.synchronize()

        for i in range(numSamples):
            val = aInorm[i]
            if val > opt_v:
                opt_v = val
                opt_x.fill(0)
                # Only copy what we need
                Ik = dI[:, i].copy_to_host()
                aIk = dA[:, i].copy_to_host().reshape(k, 1)
                opt_x[Ik] = (aIk / val)

        # Free allocations
        del dA, dI, daInorm, dC

    return opt_x
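# The launch helpers used by spca_full are not shown; both appear to be
# ceiling divisions that size a grid from an element count. A sketch that
# matches the call sites divup(n, blksz) and calc_ncta1d(size, 512):
def divup(a, b):
    # Number of blocks of size b needed to cover a elements.
    return (a + b - 1) // b

def calc_ncta1d(size, blksz):
    # CTAs (thread blocks) for a 1-D launch over `size` elements.
    return divup(size, blksz)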
def monte_carlo_pricer(paths, dt, interest, volatility):
    n = paths.shape[0]
    num_streams = 2

    part_width = int(math.ceil(float(n) / num_streams))
    partitions = [(0, part_width)]
    for i in range(1, num_streams):
        begin, end = partitions[i - 1]
        begin, end = end, min(end + (end - begin), n)
        partitions.append((begin, end))
    partlens = [end - begin for begin, end in partitions]

    mm = MM(shape=part_width, dtype=np.double, prealloc=10 * num_streams)

    device = cuda.get_current_device()
    blksz = device.MAX_THREADS_PER_BLOCK
    gridszlist = [int(math.ceil(float(partlen) / blksz))
                  for partlen in partlens]

    strmlist = [cuda.stream() for _ in range(num_streams)]

    prnglist = [curand.PRNG(curand.PRNG.MRG32K3A, stream=strm)
                for strm in strmlist]

    # Allocate device side arrays
    d_normlist = [cuda.device_array(partlen, dtype=np.double, stream=strm)
                  for partlen, strm in zip(partlens, strmlist)]

    c0 = interest - 0.5 * volatility**2
    c1 = volatility * math.sqrt(dt)

    # Configure the kernels
    # Similar to CUDA-C: cu_monte_carlo_pricer<<<gridsz, blksz, 0, stream>>>
    steplist = [cu_step[gridsz, blksz, strm]
                for gridsz, strm in zip(gridszlist, strmlist)]

    d_lastlist = [cuda.to_device(paths[s:e, 0], to=mm.get(stream=strm))
                  for (s, e), strm in zip(partitions, strmlist)]

    for j in range(1, paths.shape[1]):
        for prng, d_norm in zip(prnglist, d_normlist):
            prng.normal(d_norm, mean=0, sigma=1)

        d_pathslist = [cuda.to_device(paths[s:e, j], stream=strm,
                                      to=mm.get(stream=strm))
                       for (s, e), strm in zip(partitions, strmlist)]

        for step, args in zip(steplist,
                              zip(d_lastlist, d_pathslist, d_normlist)):
            d_last, d_paths, d_norm = args
            step(d_last, d_paths, dt, c0, c1, d_norm)

        # Copy results back and release each partition's previous buffer
        # for reuse by the pool.
        for d_last, d_paths, strm, (s, e) in zip(d_lastlist, d_pathslist,
                                                 strmlist, partitions):
            d_paths.copy_to_host(paths[s:e, j], stream=strm)
            mm.free(d_last, stream=strm)

        d_lastlist = d_pathslist

    for strm in strmlist:
        strm.synchronize()
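# A sketch of the cu_step kernel configured above: one thread advances one
# path element by a single time step, using the same geometric-Brownian-
# motion update as the `step` ufunc (the original's decorator and signature
# may differ):
import math
from numba import cuda

@cuda.jit
def cu_step(last, paths, dt, c0, c1, normdist):
    i = cuda.grid(1)
    if i < paths.shape[0]:
        paths[i] = last[i] * math.exp(c0 * dt + c1 * normdist[i])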