def main():
    """Benchmark the shared vs. non-shared multiply kernels and check that
    all runs produce identical output.

    The first iteration of each inner loop is excluded from the timings
    because it includes JIT compilation.
    """
    inp = np.arange(1000000, dtype=np.int32)
    factor = 4
    # Timing-enabled events (positional True == timing=True).
    start, end = cuda.event(True), cuda.event(True)
    reses = []
    # "not shared" is benchmarked twice, presumably to expose run-to-run
    # variance — TODO confirm intent.
    for (name, f) in [
        ("not shared", mult_by_x_not_shared),
        ("shared", mult_by_x_shared),
        ("not shared", mult_by_x_not_shared),
    ]:
        times = []
        for i in range(100):
            d_out = cuda.device_array_like(inp)
            start.record()
            # `blocks` and `threadsPerBlock` are module-level launch
            # parameters defined elsewhere in the file.
            f[blocks, threadsPerBlock](cuda.to_device(inp), d_out,
                                       cuda.to_device(np.array([factor])))
            end.record()
            end.synchronize()
            out = d_out.copy_to_host()
            # Compilation...
            if i != 0:
                times.append(cuda.event_elapsed_time(start, end))
        print(
            f"{name}: {np.mean(times):.2f} +/- {np.std(times) / np.sqrt(len(times)):.3f} (max: {np.max(times):.2f})"
        )
        reses.append(out)
    # All three runs must agree element-wise.
    assert np.all([reses[0] == reses_i for reses_i in reses])
def laplace_3d_cuda(d, n):
    """Warm up, then time the 3-D Laplace kernel over 100 launches and print
    the average time per launch in milliseconds.

    Args:
        d: 3-D input array (copied to device, copied back at the end).
        n: 3-D companion array (copied to device only).
    """
    L = d.shape[0]
    M = d.shape[1]
    N = d.shape[2]
    blockdim = (8, 8, 8)
    # NOTE(review): integer division assumes each extent is a multiple of
    # 8; trailing elements are silently dropped otherwise — confirm upstream.
    griddim = (L // blockdim[0], M // blockdim[1], N // blockdim[2])
    #print(griddim)
    stream = cuda.stream()
    dd = cuda.to_device(d, stream)
    dn = cuda.to_device(n, stream)
    #%timeit -n 32 -r 16 d0td1_cuda_kernel[griddim, blockdim](dd, dn)
    # Warm-up launches (the first one includes JIT compilation).
    for i in range(0, 100):
        laplace_3d_cuda_opt_kernel[griddim, blockdim, stream](dd, dn, L, M, N)
    evtstart = cuda.event(timing=True)
    evtend = cuda.event(timing=True)
    evtstart.record(stream)
    for i in range(100):
        laplace_3d_cuda_opt_kernel[griddim, blockdim, stream](dd, dn, L, M, N)
    evtend.record(stream)
    evtend.synchronize()
    # Average elapsed time per launch (ms).
    print(cuda.event_elapsed_time(evtstart, evtend) / 100.)
    # Copy the (mutated) device array back into the host array `d`.
    dd.to_host()
def __init__(self, label='', gpu=0):
    """Start a GPU timer on the selected device.

    Args:
        label: Optional label identifying this timer.
        gpu: Index of the CUDA device to select before timing.
    """
    self.label = label
    self.gpu = gpu
    # Fix: select the device *before* creating the events so that the
    # events belong to the context of the requested GPU; the original
    # created them first and selected the device afterwards.
    cuda.select_device(self.gpu)
    self.start = cuda.event()
    self.end = cuda.event()
    # Fix: the original line ended with a stray trailing comma, which
    # wrapped the record() result in a throwaway 1-tuple.
    self.start.record()
def main(image1, image2):
    """Sum two images on the GPU across several streams and save the result.

    Each of the X chunks gets its own stream and start/end event pair so
    the kernel launches can overlap and be timed individually.

    Args:
        image1, image2: PIL images of identical shape (assumed — TODO
            confirm; mismatched shapes would break the final reshape).
    """
    streams = []
    start_events = []
    end_events = []
    data1_gpu = []
    data2_gpu = []
    gpu_out = []
    out = []
    data_image1 = np.array(image1)
    data_image2 = np.array(image2)
    print(data_image1.shape, data_image2.shape)
    shpape_A = data_image1.shape
    # Flatten to 1-D arrays (original comment: "prevod na 1 rozmerne pole").
    data_image1 = data_image1.ravel()
    data_image2 = data_image2.ravel()
    input1 = np.split(data_image1, X)
    # Fix: the second operand must come from image2; the original split
    # data_image1 twice, so the kernel summed image1 with itself and
    # image2 was never used.
    input2 = np.split(data_image2, X)
    for _ in range(len(input1)):
        streams.append(cuda.stream())
        start_events.append(cuda.event())
        end_events.append(cuda.event())
    for i in range(len(input1)):
        data1_gpu.append(cuda.to_device(input1[i], stream=streams[i]))
        data2_gpu.append(cuda.to_device(input2[i], stream=streams[i]))
    t_start = perf_counter()
    for i in range(len(input1)):
        start_events[i].record(streams[i])
        # Kernel sums chunk i in place into data2_gpu[i].
        sumImages[1, 32, streams[i]](data1_gpu[i], data2_gpu[i])
    # NOTE: t_end measures only how long it took to *queue* the launches.
    t_end = perf_counter()
    for i in range(len(input1)):
        end_events[i].record(streams[i])
        gpu_out.append(data2_gpu[i].copy_to_host(stream=streams[i]))
    for i in range(len(gpu_out)):
        out = np.concatenate((out, gpu_out[i]))
    kernel_times = []
    for k in range(len(input1)):
        # Fix: make sure the end event has completed before querying the
        # elapsed time; the async copies above do not synchronize.
        end_events[k].synchronize()
        kernel_times.append(
            cuda.event_elapsed_time(start_events[k], end_events[k]))
    out = out.reshape(shpape_A)
    out = out.astype('uint8')
    out = Image.fromarray(out)
    out.save("out_stream.png")
    print(f'Total time: {t_end - t_start}')
    print(f'Mean kernel duration (milliseconds): {np.mean(kernel_times)}')
    print(f'Mean kernel standard deviation \
(milliseconds): {np.std(kernel_times)}')
def test_event_elapsed(self):
    """Time a small host-to-device copy with events and print the result."""
    n = 32
    device_buf = cuda.device_array(n, dtype=np.double)
    begin, finish = cuda.event(), cuda.event()
    begin.record()
    cuda.to_device(np.arange(n), to=device_buf)
    finish.record()
    # Queue the wait on the default stream, then block the host until the
    # event has actually completed.
    finish.wait()
    finish.synchronize()
    print(begin.elapsed_time(finish))
def test_event_elapsed_stream(self):
    """Record events around an async copy on a stream and query the elapsed
    time, exercising the stream-aware event code path."""
    count = 32
    work_stream = cuda.stream()
    dev_buf = cuda.device_array(count, dtype=np.double)
    first = cuda.event()
    last = cuda.event()
    first.record(stream=work_stream)
    host_src = np.arange(count, dtype=np.double)
    cuda.to_device(host_src, to=dev_buf, stream=work_stream)
    last.record(stream=work_stream)
    # Queue a wait on the stream, then block the host on the event.
    last.wait(stream=work_stream)
    last.synchronize()
    # Exercise the code path
    first.elapsed_time(last)
def test_event_elapsed_stream(self):
    """Record events around an async copy on a stream and query the elapsed
    time, exercising the stream-aware event code path."""
    N = 32
    stream = cuda.stream()
    dary = cuda.device_array(N, dtype=np.double)
    evtstart = cuda.event()
    evtend = cuda.event()
    evtstart.record(stream=stream)
    # Fix: create the host array with the same dtype as `dary` (matching
    # the sibling test); the default np.arange dtype is an integer type
    # and does not match the float64 device buffer.
    cuda.to_device(np.arange(N, dtype=np.double), to=dary, stream=stream)
    evtend.record(stream=stream)
    evtend.wait(stream=stream)
    evtend.synchronize()
    # Exercise the code path
    evtstart.elapsed_time(evtend)
def __init__(self, shape, dtype, prealloc):
    """Pre-allocate a pool of device buffers with one readiness event each.

    Args:
        shape: Shape of every pooled device array.
        dtype: Element dtype of every pooled device array.
        prealloc: How many buffers to allocate up front.
    """
    self.device = cuda.get_current_device()
    self.freelist = deque()
    self.events = {}
    for _ in range(prealloc):
        buf = cuda.device_array(shape=shape, dtype=dtype)
        self.freelist.append(buf)
        # timing=False: these events are only for ordering/readiness,
        # which makes them cheaper than timing events.
        self.events[buf] = cuda.event(timing=False)
def main():
    """Benchmark both copy strategies, after a warm-up pass for JIT."""
    # Warm-up: trigger compilation outside the timed region.
    lots_of_copies()
    few_copies()
    begin = cuda.event(timing=True)
    finish = cuda.event(timing=True)
    runs = 200
    for fn in (lots_of_copies, few_copies):
        samples = []
        for _ in range(runs):
            begin.record()
            fn()
            finish.record()
            finish.synchronize()
            samples.append(cuda.event_elapsed_time(begin, finish))
        # Mean and standard error of the per-run timings (ms).
        print(fn.__name__, np.mean(samples), np.std(samples) / np.sqrt(runs))
def _set_up_profiling(self):
    """Create the CUDA events and timing lists used for profiling, for the
    memory-management mode currently selected in self._cuda_mem.

    Idempotent: existing profiling dicts are left untouched.
    """
    if self._cuda_mem == 'manual':
        # Manual memory management: transfers and the kernel are each
        # bracketed by their own ev1/ev2 event pair.
        if not hasattr(self, 'man_prof'):
            event_keys = ('data_ev1', 'data_ev2', 'labels_ev1',
                          'labels_ev2', 'dists_ev1', 'dists_ev2',
                          'centroids_ev1', 'centroids_ev2', 'kernel_ev1',
                          'kernel_ev2')
            timing_keys = ('data_timings', 'labels_timings',
                           'dists_timings', 'kernel_timings',
                           'centroids_timings')
            profile = {key: cuda.event() for key in event_keys}
            profile.update({key: list() for key in timing_keys})
            self.man_prof = profile
    # Automatic memory management: only the kernel itself is profiled.
    elif self._cuda_mem == 'auto':
        if not hasattr(self, 'auto_prof'):
            self.auto_prof = {key: cuda.event()
                              for key in ('kernel_ev1', 'kernel_ev2')}
            self.auto_prof['kernel_timings'] = list()
def last_block_test():
    # Compare an exclusive prefix-sum on the GPU (single block) against the
    # CPU reference implementation, printing both timings.
    MAX_TPB = 512
    n = 1024
    a = np.arange(n).astype(np.int32)
    reference = np.empty_like(a)
    start = timer()
    # CPU reference: exclusive prefix sum into `reference`.
    scan.exprefixsumNumba(a, reference, init = 0)
    end = timer()
    auxidx = -1
    elb = a.size
    # Round the element count up to the next power of two.
    p2elb = np.int(np.ceil(np.log2(elb)))
    telb = 2 ** p2elb
    # Python 2: integer division — threads per block is half the elements.
    tlb = telb / 2
    startIdx = 0
    # Dynamic shared memory: one slot per (padded) element.
    sm_size = telb * a.itemsize
    aux = np.empty(1,dtype=np.int8)
    trash = cuda.device_array(1)
    e1, e2 = cuda.event(), cuda.event()
    e1.record()
    scan.last_scan[1, tlb, 0, sm_size](a, aux, -1, elb, startIdx)
    e2.record()
    # NOTE(review): e2 is not synchronized before event_elapsed_time —
    # confirm the launch has finished by this point.
    print "CPU took: ", (end - start) * 1000, " ms"
    print "Kernel took: ", cuda.event_elapsed_time(e1,e2), " ms"
    print (a == reference).all()
def test_last_block():
    # Compare an exclusive prefix-sum on the GPU (single block) against the
    # CPU reference implementation, printing both timings.
    MAX_TPB = 512
    n = 1024
    a = np.arange(n).astype(np.int32)
    reference = np.empty_like(a)
    start = timer()
    # CPU reference: exclusive prefix sum into `reference`.
    MyScan.exprefixsumNumba(a, reference, init=0)
    end = timer()
    auxidx = -1
    elb = a.size
    # Round the element count up to the next power of two.
    p2elb = np.int(np.ceil(np.log2(elb)))
    telb = 2 ** p2elb
    # Python 2: integer division — threads per block is half the elements.
    tlb = telb / 2
    startIdx = 0
    # Dynamic shared memory: one slot per (padded) element.
    sm_size = telb * a.itemsize
    aux = np.empty(1, dtype=np.int8)
    trash = cuda.device_array(1)
    e1, e2 = cuda.event(), cuda.event()
    e1.record()
    MyScan.last_scan[1, tlb, 0, sm_size](a, aux, -1, elb, startIdx)
    e2.record()
    # NOTE(review): e2 is not synchronized before event_elapsed_time —
    # confirm the launch has finished by this point.
    print "CPU took: ", (end - start) * 1000, " ms"
    print "Kernel took: ", cuda.event_elapsed_time(e1, e2), " ms"
    print (a == reference).all()
import numpy as np


@cuda.jit('void(float32[:], float32[:])')
def cu_copy_array(dst, src):
    # One thread copies one element; the launch grid must cover the array.
    i = cuda.grid(1)
    dst[i] = src[i]


BLOCKCOUNT = 25000
BLOCKSIZE = 256
aryA = np.arange(BLOCKSIZE * BLOCKCOUNT, dtype=np.float32)
print('data size: %.1fMB' % (aryA.size * aryA.dtype.itemsize / (2**20)))
# Event pairs for whole-run and kernel-only GPU timing; timer() gives the
# matching wall-clock measurements.
evt_total_begin = cuda.event()
evt_total_end = cuda.event()
evt_kernel_begin = cuda.event()
evt_kernel_end = cuda.event()
t_total_begin = timer()
evt_total_begin.record()
# explicitly transfer memory
d_aryA = cuda.to_device(aryA)
d_aryB = cuda.device_array_like(aryA)
evt_kernel_begin.record()
# (snippet truncated here — the kernel launch and end-event records
# presumably follow; not visible in this chunk)
t_kernel_begin = timer()
        # (fragment: interior of a pair-counting CUDA kernel — the
        # enclosing def and loop over j are outside this chunk)
        qz = z2[j]
        qw = w2[j]
        dx = px - qx
        dy = py - qy
        dz = pz - qz
        # Weight product accumulated into the histogram bin.
        wprod = pw * qw
        dsq = dx * dx + dy * dy + dz * dz
        k = nbins - 1
        # Walk bins from the largest radius downward while the squared
        # distance still fits; atomics because many threads share `result`.
        while dsq <= rbins_squared[k]:
            cuda.atomic.add(result, k - 1, wprod)
            k -= 1
            if k <= 0:
                break


start = cuda.event()
end = cuda.event()
timing_nb = 0
timing_nb_wall = 0
# Stage both point sets (and the bin edges) on the device as float32.
d_x1 = cuda.to_device(x1.astype(np.float32))
d_y1 = cuda.to_device(y1.astype(np.float32))
d_z1 = cuda.to_device(z1.astype(np.float32))
d_w1 = cuda.to_device(w1.astype(np.float32))
d_x2 = cuda.to_device(x2.astype(np.float32))
d_y2 = cuda.to_device(y2.astype(np.float32))
d_z2 = cuda.to_device(z2.astype(np.float32))
d_w2 = cuda.to_device(w2.astype(np.float32))
d_rbins_squared = cuda.to_device(DEFAULT_RBINS_SQUARED.astype(np.float32))
def intensityCalculations(GratingSeparation, WaveNumber, sourcePoints, obsPoints, sourceAmp, sourcePhase):
    """This function is used as an abstraction layer for the CUDA kernel.
    This function does the type casting and is able to return values.

    Args:
        GratingSeparation (float): Constant passed in and used as distance (on the x-plane) between source and observation points
        WaveNumber (float): Constant defined in global variables. I think it has to do with wave length; not sure why its named wavenumber
        sourcePoints (f4[:]): Position of source points as an array of float32
        obsPoints (f4[:]): Position of observation points as an array of float32
        sourceAmp (f4[:]): Amplitudes from each source point as an array of float32
        sourcePhase (c8[:]): Phase of each source point as an array of complex64

    Returns:
        return intensities, amplituteds, phases;
        intensities (f4[:]): Array of intensities for each observation point as an array of float32
        amplitudes (f4[:]): Array of amplitudes for each observation point as an array of float32
        phases (c8[:]): Array of phases for each observation point as an array of complex64

    Changelog:
    TODO:
    Author: Alec Buchanan - 3/2018
    """
    # Specify the number of CUDA threads
    arraySize = len(obsPoints)
    threadsperblock = 32
    # Ceiling division so every observation point gets a thread.
    blockspergrid = (arraySize + (threadsperblock - 1)) // threadsperblock
    # initialize output variables
    out_i = [0.0] * arraySize  # intensity
    out_a = [0.0] * arraySize  # amplitude
    out_p = [0.0] * arraySize  # phase
    # Cast datatypes so the kernel does not complain
    GratingSeparation = float(GratingSeparation)
    WaveNumber = float(WaveNumber)
    sourcePoints = np.array(sourcePoints, dtype='f4')  # 32-bit float array, 4 bytes
    obsPoints = np.array(obsPoints, dtype='f4')  # 32-bit float array, 4 bytes
    sourcePhase = np.array(sourcePhase, dtype='c8')  # 64-bit complex array, 8 bytes
    out_p = np.array(out_p, dtype='c8')  # 64-bit complex array, 8 bytes
    out_a = np.array(out_a, dtype='f4')  # 32-bit float array, 4 bytes
    out_i = np.array(out_i, dtype='f4')  # 32-bit float array, 4 bytes
    # Events bracketing the whole call, each transfer direction, and the
    # kernel itself, so the timings printed below can be separated.
    evt_total_begin = cuda.event()
    evt_total_end = cuda.event()
    evt_mem_begin = cuda.event()
    evt_mem_end = cuda.event()
    evt_mem2_begin = cuda.event()
    evt_mem2_end = cuda.event()
    evt_kernel_begin = cuda.event()
    evt_kernel_end = cuda.event()
    evt_total_begin.record()
    evt_mem_begin.record()
    d_sourcePoints = cuda.to_device(sourcePoints)
    d_obsPoints = cuda.to_device(obsPoints)
    d_sourceAmp = cuda.to_device(sourceAmp)
    d_sourcePhase = cuda.to_device(sourcePhase)
    d_out_i = cuda.to_device(out_i)
    d_out_a = cuda.to_device(out_a)
    d_out_p = cuda.to_device(out_p)
    evt_mem_end.record()
    evt_mem_end.synchronize()
    evt_kernel_begin.record()
    # call CUDA kernel
    intensityKernel[blockspergrid, threadsperblock](GratingSeparation, WaveNumber, d_sourcePoints, d_obsPoints, d_sourceAmp, d_sourcePhase, d_out_p, d_out_a, d_out_i)
    evt_kernel_end.record()
    evt_kernel_end.synchronize()
    evt_mem2_begin.record()
    out_i = d_out_i.copy_to_host()
    out_a = d_out_a.copy_to_host()
    out_p = d_out_p.copy_to_host()
    evt_mem2_end.record()
    evt_mem2_end.synchronize()
    evt_total_end.record()
    evt_total_end.synchronize()
    print('total time: %fms' % evt_total_begin.elapsed_time(evt_total_end))
    print('mem-to-gpu time: %fms' % evt_mem_begin.elapsed_time(evt_mem_end))
    print('kernel time: %fms' % evt_kernel_begin.elapsed_time(evt_kernel_end))
    print('mem-from-gpu time: %fms' % evt_mem2_begin.elapsed_time(evt_mem2_end))
    # Remove Imaginary parts
    out_i = np.array(out_i, dtype='f4')
    out_a = np.array(out_a, dtype='f4')
    return out_i, out_a, out_p
            # (fragment: tail of a centroid-reseeding helper — the
            # enclosing def and loop are outside this chunk)
            new_centroids[i] = data[furtherDistsArgs[j]]
            j += 1
    return new_centroids


if __name__ == '__main__':
    n = 10000
    d = 200
    k = 50
    data = np.random.random((n, d)).astype(np.float32)
    centroids = np.random.random((k, d)).astype(np.float32)
    kt_start, kt_end = cuda.event(), cuda.event()
    # grid config
    tpb = 256
    # Ceiling division so every sample gets a thread.
    bpg = np.int(np.ceil(np.float(n) / tpb))
    # compile kernel — first launch below triggers JIT compilation
    dData = cuda.to_device(data)
    dCentroids = cuda.to_device(centroids)
    dLabels = cuda.device_array(n, dtype=np.int32)
    dDists = cuda.device_array(n, dtype=np.float32)
    _cu_label_kernel_dists[bpg, tpb](dData, dCentroids, dLabels, dDists)
    ## data column major
    # GPU data
    # (snippet truncated here — the column-major benchmark presumably
    # continues past this chunk)
    data_t = data.T
def mst_cluster_coassoc():
    # Load a co-association graph in CSR form, run Boruvka MST on CPU and
    # GPU, and check both produce the same MST edge set.
    t1,t2 = Timer(), Timer()
    #foldername = "/home/courses/aac2015/diogoaos/QCThesis/datasets/gaussmix1e4/"
    foldername = home + "QCThesis/datasets/gaussmix1e4/"
    print "Loading datasets"
    t1.tic()
    # dest = np.genfromtxt(foldername + "prot_dest.csr", dtype = np.int32, delimiter=",")
    # weight = np.genfromtxt(foldername + "prot_weight.csr", dtype = np.float32, delimiter=",")
    # fe = np.genfromtxt(foldername + "prot_fe.csr", dtype = np.int32, delimiter=",")
    dest = np.genfromtxt(foldername + "full_dest.csr", dtype = np.int32, delimiter=",")
    weight = np.genfromtxt(foldername + "full_weight.csr", dtype = np.float32, delimiter=",")
    fe = np.genfromtxt(foldername + "full_fe.csr", dtype = np.int32, delimiter=",")
    t1.tac()
    print "loading elapsed time : ", t1.elapsed
    # Drop the trailing sentinel entry and derive per-vertex out-degrees.
    fe = fe[:-1]
    od = np.empty_like(fe)
    outdegree_from_firstedge(fe, od, dest.size)
    # fix weights to dissimilarity
    weight = 100 - weight
    print "# edges : ", dest.size
    print "# vertices : ", fe.size
    print "edges/vertices ratio : ", dest.size * 1.0 / fe.size
    # --- sequential (CPU) Boruvka ---
    t1.tic()
    mst, n_edges = boruvka_minho_seq(dest, weight, fe, od)
    t1.tac()
    print "seq: time elapsed : ", t1.elapsed
    print "seq: mst size :", mst.size
    print "seq: n_edges : ", n_edges
    if n_edges < mst.size:
        mst = mst[:n_edges]
    mst.sort()
    # --- GPU Boruvka: time the host->device graph transfer separately ---
    ev1,ev2 = cuda.event(), cuda.event()
    ev1.record()
    d_dest = cuda.to_device(dest)
    d_weight = cuda.to_device(weight)
    d_fe = cuda.to_device(fe)
    d_od = cuda.to_device(od)
    ev2.record()
    send_graph_time = cuda.event_elapsed_time(ev1,ev2)
    t2.tic()
    mst2, n_edges2 = boruvka_minho_gpu(d_dest, d_weight, d_fe, d_od, MAX_TPB=512, returnDevAry = True)
    t2.tac()
    # Reuse the same event pair to time the device->host result transfer.
    ev1.record()
    mst2 = mst2.copy_to_host()
    n_edges2 = n_edges2.getitem(0)
    ev2.record()
    recv_mst_time = cuda.event_elapsed_time(ev1,ev2)
    print "gpu: send graph time : ", send_graph_time
    print "gpu: time elapsed : ", t2.elapsed
    print "gpu: rcv mst time : ", recv_mst_time
    print "gpu: mst size :", mst2.size
    # NOTE(review): label says "seq" but this prints the GPU edge count —
    # likely a copy/paste slip in the label string.
    print "seq: n_edges : ", n_edges2
    if n_edges2 < mst2.size:
        mst2 = mst2[:n_edges2]
    mst2.sort()
    # Compare sorted edge lists only when the edge counts agree.
    if n_edges == n_edges2:
        mst_is_equal = (mst == mst2).all()
    else:
        mst_is_equal = False
    print "mst gpu == seq : ", mst_is_equal
@cuda.jit
def kernel(array):
    thd = cuda.grid(1)
    # Each thread handles array.size // blockDim.x elements, one per
    # outer iteration, offset by its thread index.
    num_iters = array.size // cuda.blockDim.x
    for j in range(num_iters):
        i = j * cuda.blockDim.x + thd
        # Busy work: the multiply/divide pair cancels, leaving the value
        # unchanged — this kernel exists only to take time.
        for k in range(50):
            array[i] *= 2.0
            array[i] /= 2.0


data = np.random.randn(array_len).astype('float32')
data_gpu = cuda.to_device(data)
start_event = cuda.event()
end_event = cuda.event()
start_event.record()
kernel[1, 64](data_gpu)
end_event.record()
# waits until `end_event` is marked as completed
# end_event.synchronize()
# Without the synchronize above, these queries may report False while the
# kernel is still running.
print('Has the kernel started yet? {}'.format(start_event.query()))
print('Has the kernel ended yet? {}'.format(end_event.query()))
# computes how long the kernel launch took
# print('Kernel execution time in milliseconds: %f ' %
#       cuda.event_elapsed_time(start_event, end_event))
def _cu_label(self, data, centroids):
    """Assign each sample in `data` to its nearest centroid on the GPU.

    Returns an int32 array of labels (one per sample) and updates
    self._dists with the distance of each sample to its centroid.
    Profiling events from self.man_prof / self.auto_prof bracket each
    transfer and the kernel launch.
    """
    #WARNING: data is being transposed when sending to GPU
    # NOTE(review): these six events are created but never used below —
    # the profiling events actually used come from self.man_prof.
    data_ev1, data_ev2 = cuda.event(), cuda.event()
    labels_ev1, labels_ev2 = cuda.event(), cuda.event()
    dists_ev1, dists_ev2 = cuda.event(), cuda.event()
    N, D = data.shape
    K, cD = centroids.shape
    if self._cuda_mem not in ('manual','auto'):
        raise Exception("cuda_mem = \'manual\' or \'auto\'")
    # Lazily compute the launch configuration from the data shape.
    if self._gridDim is None or self._blockDim is None:
        self._compute_cuda_dims(data)
    labels = np.empty(N, dtype=np.int32)
    if self._cuda_mem == 'manual':
        # copy dataset and centroids, allocate memory
        ## cuda persistent handles
        # avoids redundant data transfer
        # if dataset has not been sent to device, send it and save handle
        if self._cudaDataHandle is None:
            # Transpose to column-major layout for the kernel.
            dataT = np.ascontiguousarray(data.T)
            self.man_prof['data_ev1'].record()
            dData = cuda.to_device(dataT)
            self.man_prof['data_ev2'].record()
            self.man_prof['data_ev2'].synchronize()
            time_ms = cuda.event_elapsed_time(self.man_prof['data_ev1'],
                                              self.man_prof['data_ev2'])
            self.man_prof['data_timings'].append(time_ms)
            self._cudaDataHandle = dData
        # otherwise just use handle
        else:
            dData = self._cudaDataHandle
        # avoids creating labels array in device more than once
        if self._cuda_labels_handle is None:
            dLabels = cuda.device_array_like(labels)
            self._cuda_labels_handle = dLabels
        else:
            dLabels = self._cuda_labels_handle
        # avoids creating dists array in device more than once
        if self._cuda_dists_handle is None:
            dDists = cuda.device_array_like(self._dists)
            self._cuda_dists_handle = dDists
        else:
            dDists = self._cuda_dists_handle
        # copy centroids to device
        self.man_prof['centroids_ev1'].record()
        dCentroids = cuda.to_device(centroids)
        self.man_prof['centroids_ev2'].record()
        # launch kernel
        self.man_prof['kernel_ev1'].record()
        _cu_label_kernel_dists[self._gridDim,
                               self._blockDim](dData, dCentroids,
                                               dLabels, dDists)
        self.man_prof['kernel_ev2'].record()
        # cuda.synchronize()
        # self.man_prof['kernel_ev2'].synchronize()
        # copy labels from device to host
        self.man_prof['labels_ev1'].record()
        dLabels.copy_to_host(ary=labels)
        self.man_prof['labels_ev2'].record()
        # copy distance to centroids from device to host
        self.man_prof['dists_ev1'].record()
        dists = dDists.copy_to_host()
        self.man_prof['dists_ev2'].record()
        self._dists = dists
        # synchronize host with gpu before computing times
        # (dists_ev2 is the last event recorded, so waiting on it means
        # every earlier event pair has also completed)
        self.man_prof['dists_ev2'].synchronize()
        # store timings
        time_ms = cuda.event_elapsed_time(self.man_prof['centroids_ev1'],
                                          self.man_prof['centroids_ev2'])
        self.man_prof['centroids_timings'].append(time_ms)
        time_ms = cuda.event_elapsed_time(self.man_prof['kernel_ev1'],
                                          self.man_prof['kernel_ev2'])
        self.man_prof['kernel_timings'].append(time_ms)
        time_ms = cuda.event_elapsed_time(self.man_prof['labels_ev1'],
                                          self.man_prof['labels_ev2'])
        self.man_prof['labels_timings'].append(time_ms)
        time_ms = cuda.event_elapsed_time(self.man_prof['dists_ev1'],
                                          self.man_prof['dists_ev2'])
        self.man_prof['dists_timings'].append(time_ms)
    elif self._cuda_mem == 'auto':
        # Auto mode: let Numba manage transfers; host arrays are passed
        # straight to the kernel.
        self.auto_prof['kernel_ev1'].record()
        _cu_label_kernel_dists[self._gridDim,self._blockDim](data,
                                                             centroids,
                                                             labels,
                                                             self._dists)
        self.auto_prof['kernel_ev2'].record()
        # NOTE(review): no synchronize before querying elapsed time here —
        # confirm the implicit transfers have completed by this point.
        time_ms = cuda.event_elapsed_time(self.auto_prof['kernel_ev1'],
                                          self.auto_prof['kernel_ev2'])
        self.auto_prof['kernel_timings'].append(time_ms)
    else:
        # Unreachable: the membership check above already raised.
        raise ValueError("CUDA memory management type may either \
be \'manual\' or \'auto\'.")
    return labels
def generate_batch(self, end=None, verbose=False, fused=False,
                   nested_cva_at=None, nested_im_at=None,
                   indicator_in_cva=False, alpha=None, im_window=None):
    """Queue a full Monte-Carlo batch on the CUDA stream and report timings.

    Diffuses the risk factors coarse step by coarse step, prices the
    portfolio at each step, optionally runs nested-CVA / nested-IM
    re-simulations at selected steps, and copies results back to host in
    chunks of ``cDtoH_freq`` steps.

    Args:
        end: Number of coarse steps to simulate; defaults to
            ``self.num_coarse_steps``.
        verbose: Print a message once all work has been queued.
        fused: Use the fused diffuse-and-price kernel (one launch per
            ``cDtoH_freq`` steps) instead of separate diffusion and
            pricing launches every step.
        nested_cva_at: Coarse-step indices at which to run the nested-CVA
            kernel, or None.
        nested_im_at: Coarse-step indices at which to run the nested-IM
            kernel, or None.
        indicator_in_cva: Forwarded to the nested-CVA kernel.
        alpha: Forwarded to the nested-IM kernel (presumably the IM
            quantile level — confirm against cuda_nested_im).
        im_window: Forwarded to the nested-IM kernel.
    """
    if end is None:
        end = self.num_coarse_steps
    t = 0.
    self._reset()
    self.cuda_generate_exp1(self.d_exp_1, self.d_rng_states)
    self.stream.synchronize()
    # Price at t=0 (last positional flag True — initial pricing mode).
    self.cuda_compute_mtm(
        0, t, self.d_X, self.d_mtm_by_cpty, self.d_cash_flows_by_cpty,
        self.d_vanillas_on_fx_f32, self.d_vanillas_on_fx_i32,
        self.d_vanillas_on_fx_b8, self.d_irs_f32, self.d_irs_i32,
        self.d_zcs_f32, self.d_zcs_i32, self.dt,
        self.max_coarse_per_reset, self.cDtoH_freq, True)
    self.stream.synchronize()
    self.d_mtm_by_cpty[0].copy_to_host(ary=self.mtm_by_cpty[0],
                                       stream=self.stream)
    self.d_cash_flows_by_cpty[0].copy_to_host(
        ary=self.cash_flows_by_cpty[0], stream=self.stream)
    # One begin/end event pair per coarse step for each kernel family, so
    # per-launch timings can be averaged after the queue drains.
    _cuda_bulk_diffuse_event_begin = [cuda.event() for i in range(end)]
    _cuda_bulk_diffuse_event_end = [cuda.event() for i in range(end)]
    _cuda_compute_mtm_event_begin = [cuda.event() for i in range(end)]
    _cuda_compute_mtm_event_end = [cuda.event() for i in range(end)]
    _cuda_nested_cva_event_begin = [cuda.event() for i in range(end)]
    _cuda_nested_cva_event_end = [cuda.event() for i in range(end)]
    _cuda_nested_im_event_begin = [cuda.event() for i in range(end)]
    _cuda_nested_im_event_end = [cuda.event() for i in range(end)]
    for coarse_idx in range(1, end + 1):
        t += self.dT
        # Position of this step inside the current device-side window
        # (device arrays hold cDtoH_freq steps plus the carry-over slot 0).
        idx_in_dev_arr = (coarse_idx - 1) % self.cDtoH_freq + 1
        if not fused:
            _cuda_bulk_diffuse_event_begin[coarse_idx -
                                           1].record(stream=self.stream)
            self.cuda_bulk_diffuse(
                idx_in_dev_arr, t, self.d_X, self.d_def_indicators,
                self.d_dom_rate_integral, self.d_spread_integrals,
                self.d_irs_f32, self.d_irs_i32, self.d_exp_1,
                self.d_rng_states, self.dt, self.max_coarse_per_reset)
            _cuda_bulk_diffuse_event_end[coarse_idx -
                                         1].record(stream=self.stream)
            _cuda_compute_mtm_event_begin[coarse_idx -
                                          1].record(stream=self.stream)
            self.cuda_compute_mtm(
                idx_in_dev_arr, t, self.d_X, self.d_mtm_by_cpty,
                self.d_cash_flows_by_cpty, self.d_vanillas_on_fx_f32,
                self.d_vanillas_on_fx_i32, self.d_vanillas_on_fx_b8,
                self.d_irs_f32, self.d_irs_i32, self.d_zcs_f32,
                self.d_zcs_i32, self.dt, self.max_coarse_per_reset,
                self.cDtoH_freq, False)
            _cuda_compute_mtm_event_end[coarse_idx -
                                        1].record(stream=self.stream)
        else:
            _cuda_bulk_diffuse_event_begin[coarse_idx -
                                           1].record(stream=self.stream)
            # Fused path: one launch covers a whole cDtoH_freq window, so
            # only fire it when entering a new window.
            if idx_in_dev_arr == 1:
                self.cuda_diffuse_and_price(
                    1, self.cDtoH_freq, t, self.d_X,
                    self.d_dom_rate_integral, self.d_spread_integrals,
                    self.d_mtm_by_cpty, self.d_cash_flows_by_cpty,
                    self.d_irs_f32, self.d_irs_i32,
                    self.d_vanillas_on_fx_f32, self.d_vanillas_on_fx_i32,
                    self.d_vanillas_on_fx_b8, self.d_rng_states, self.dt,
                    self.max_coarse_per_reset, self.cDtoH_freq)
                self.cuda_oversimulate_defs(1, self.cDtoH_freq,
                                            self.d_def_indicators,
                                            self.d_spread_integrals,
                                            self.d_exp_1)
            _cuda_bulk_diffuse_event_end[coarse_idx -
                                         1].record(stream=self.stream)
        if nested_cva_at is not None:
            _cuda_nested_cva_event_begin[coarse_idx -
                                         1].record(stream=self.stream)
            if coarse_idx in nested_cva_at:
                self.cuda_nested_cva(
                    idx_in_dev_arr, self.num_coarse_steps - coarse_idx, t,
                    self.d_X, self.d_def_indicators,
                    self.d_dom_rate_integral, self.d_spread_integrals,
                    self.d_mtm_by_cpty, self.d_cash_flows_by_cpty,
                    self.d_irs_f32, self.d_irs_i32,
                    self.d_vanillas_on_fx_f32, self.d_vanillas_on_fx_i32,
                    self.d_vanillas_on_fx_b8, self.d_exp_1,
                    self.d_rng_states, self.dt, self.cDtoH_freq,
                    indicator_in_cva, self.d_nested_cva,
                    self.d_nested_cva_sq)
                self.d_nested_cva.copy_to_host(
                    ary=self.nested_cva[coarse_idx], stream=self.stream)
                self.d_nested_cva_sq.copy_to_host(
                    ary=self.nested_cva_sq[coarse_idx], stream=self.stream)
            _cuda_nested_cva_event_end[coarse_idx -
                                       1].record(stream=self.stream)
        if nested_im_at is not None:
            _cuda_nested_im_event_begin[coarse_idx -
                                        1].record(stream=self.stream)
            if coarse_idx in nested_im_at:
                # Adam iterations for the IM quantile estimate; step size
                # decays polynomially with the iteration count.
                for adam_iter in range(self.num_adam_iters):
                    adam_init = adam_iter == 0
                    step_size = self.lam * (adam_iter + 1)**(-self.gamma)
                    self.cuda_nested_im(
                        alpha, adam_init, step_size, idx_in_dev_arr,
                        im_window, t, self.d_X,
                        self.d_mtm_by_cpty[idx_in_dev_arr], self.d_irs_f32,
                        self.d_irs_i32, self.d_vanillas_on_fx_f32,
                        self.d_vanillas_on_fx_i32, self.d_vanillas_on_fx_b8,
                        self.d_rng_states, self.dt,
                        self.d_nested_im_by_cpty,
                        self.d_nested_im_std_by_cpty, self.d_nested_im_m,
                        self.d_nested_im_v, self.adam_b1, self.adam_b2,
                        adam_iter)
                self.d_nested_im_by_cpty.copy_to_host(
                    ary=self.nested_im_by_cpty[coarse_idx],
                    stream=self.stream)
            _cuda_nested_im_event_end[coarse_idx -
                                      1].record(stream=self.stream)
        if coarse_idx % self.cDtoH_freq == 0:
            # Window boundary: flush the device window back to host...
            self.d_X[self.max_coarse_per_reset:].copy_to_host(
                ary=self.X[coarse_idx - self.cDtoH_freq + 1:coarse_idx + 1],
                stream=self.stream)
            self.d_spread_integrals[1:].copy_to_host(
                ary=self.spread_integrals[coarse_idx - self.cDtoH_freq +
                                          1:coarse_idx + 1],
                stream=self.stream)
            self.d_dom_rate_integral[1:].copy_to_host(
                ary=self.dom_rate_integral[coarse_idx - self.cDtoH_freq +
                                           1:coarse_idx + 1],
                stream=self.stream)
            self.d_def_indicators[1:].copy_to_host(
                ary=self.def_indicators[coarse_idx - self.cDtoH_freq +
                                        1:coarse_idx + 1],
                stream=self.stream)
            self.d_mtm_by_cpty[1:].copy_to_host(
                ary=self.mtm_by_cpty[coarse_idx - self.cDtoH_freq +
                                     1:coarse_idx + 1],
                stream=self.stream)
            self.d_cash_flows_by_cpty[1:].copy_to_host(
                ary=self.cash_flows_by_cpty[coarse_idx - self.cDtoH_freq +
                                            1:coarse_idx + 1],
                stream=self.stream)
            # ...then roll the last slots into slot 0 as the carry-over
            # state for the next window.
            self.d_X[:self.max_coarse_per_reset].copy_to_device(
                self.d_X[-self.max_coarse_per_reset:], stream=self.stream)
            self.d_spread_integrals[0].copy_to_device(
                self.d_spread_integrals[self.cDtoH_freq],
                stream=self.stream)
            self.d_dom_rate_integral[0].copy_to_device(
                self.d_dom_rate_integral[self.cDtoH_freq],
                stream=self.stream)
            self.d_def_indicators[0].copy_to_device(
                self.d_def_indicators[self.cDtoH_freq], stream=self.stream)
    # Flush the final, partial window if `end` is not a multiple of
    # cDtoH_freq.
    if end % self.cDtoH_freq != 0:
        start_idx = (end // self.cDtoH_freq) * self.cDtoH_freq + 1
        length = end % self.cDtoH_freq
        self.d_X[self.max_coarse_per_reset:self.max_coarse_per_reset +
                 length].copy_to_host(ary=self.X[start_idx:start_idx +
                                                 length],
                                      stream=self.stream)
        self.d_spread_integrals[1:length + 1].copy_to_host(
            ary=self.spread_integrals[start_idx:start_idx + length],
            stream=self.stream)
        self.d_dom_rate_integral[1:length + 1].copy_to_host(
            ary=self.dom_rate_integral[start_idx:start_idx + length],
            stream=self.stream)
        self.d_def_indicators[1:length + 1].copy_to_host(
            ary=self.def_indicators[start_idx:start_idx + length],
            stream=self.stream)
        self.d_mtm_by_cpty[1:length + 1].copy_to_host(
            ary=self.mtm_by_cpty[start_idx:start_idx + length],
            stream=self.stream)
        self.d_cash_flows_by_cpty[1:length + 1].copy_to_host(
            ary=self.cash_flows_by_cpty[start_idx:start_idx + length],
            stream=self.stream)
    if verbose:
        print('Everything was successfully queued!')
    # Drain the queue: wait on every end event, then the stream itself.
    for evt_cuda_bulk_diffuse_event, evt_cuda_compute_mtm_event, \
            evt_cuda_nested_cva_event, evt_cuda_nested_im_event in zip(
                _cuda_bulk_diffuse_event_end, _cuda_compute_mtm_event_end,
                _cuda_nested_cva_event_end, _cuda_nested_im_event_end):
        evt_cuda_bulk_diffuse_event.synchronize()
        evt_cuda_compute_mtm_event.synchronize()
        evt_cuda_nested_cva_event.synchronize()
        evt_cuda_nested_im_event.synchronize()
    self.stream.synchronize()
    if not fused:
        print('cuda_bulk_diffuse average elapsed time per launch: {0} ms'.
              format(
                  round(
                      sum(
                          cuda.event_elapsed_time(evt_begin, evt_end)
                          for evt_begin, evt_end in zip(
                              _cuda_bulk_diffuse_event_begin,
                              _cuda_bulk_diffuse_event_end)) / end, 3)))
        print('compute_mtm average elapsed time per launch: {0} ms'.format(
            round(
                sum(
                    cuda.event_elapsed_time(evt_begin, evt_end)
                    for evt_begin, evt_end in zip(
                        _cuda_compute_mtm_event_begin,
                        _cuda_compute_mtm_event_end)) / end, 3)))
    else:
        print('cuda_diffuse_and_price elapsed time: {0} ms'.format(
            round(
                sum(
                    cuda.event_elapsed_time(evt_begin, evt_end)
                    for evt_begin, evt_end in zip(
                        _cuda_bulk_diffuse_event_begin,
                        _cuda_bulk_diffuse_event_end)), 3)))
    if nested_cva_at is not None:
        print('cuda_nested_cva average elapsed time per launch: {0} ms'.
              format(
                  round(
                      sum(
                          cuda.event_elapsed_time(evt_begin, evt_end)
                          for evt_begin, evt_end in zip(
                              _cuda_nested_cva_event_begin,
                              _cuda_nested_cva_event_end)) /
                      len(nested_cva_at), 3)))
    if nested_im_at is not None:
        print('cuda_nested_im average elapsed time per launch: {0} ms'.
              format(
                  round(
                      sum(
                          cuda.event_elapsed_time(evt_begin, evt_end)
                          for evt_begin, evt_end in zip(
                              _cuda_nested_im_event_begin,
                              _cuda_nested_im_event_end)) /
                      len(nested_im_at), 3)))
    # TODO: port this to CUDA
    # Discount cash flows, accumulate them over time, then re-capitalize.
    self.cash_pos_by_cpty = ne.evaluate(
        'c*exp(-r)',
        local_dict={
            'c': self.cash_flows_by_cpty,
            'r': self.dom_rate_integral[:, None, :]
        })
    np.cumsum(self.cash_pos_by_cpty, axis=0, out=self.cash_pos_by_cpty)
    self.cash_pos_by_cpty *= np.exp(self.dom_rate_integral[:, None, :])
data_A = []
data_B = []
streams = []
start_events = []
end_events = []
num_arrays = 100
A_gpu = []
B_gpu = []
C_gpu = []
C_out = []

# Host code
# NOTE(review): indentation was lost in this snippet; the per-array data
# initialization is reconstructed inside this loop because the copy loop
# below indexes data_A[i]/data_B[i] for all i — confirm against original.
for _ in range(num_arrays):
    streams.append(cuda.stream())
    start_events.append(cuda.event())
    end_events.append(cuda.event())
    # Initialize the data arrays
    A = numpy.full((24, 12), 3, numpy.float64)  # matrix containing all 3's
    B = numpy.full((12, 22), 4, numpy.float64)  # matrix containing all 4's
    data_A.append(A)
    data_B.append(B)

t_start = perf_counter()
for i in range(num_arrays):
    # Copy the arrays to GPU
    A_gpu.append(cuda.to_device(data_A[i], stream=streams[i]))
    B_gpu.append(cuda.to_device(data_B[i], stream=streams[i]))
    # Allocate memory on the device for the result
    # (snippet truncated here — the result allocation and kernel launch
    # presumably follow; not visible in this chunk)
def __init__(self, time_offset=0, cuda_stream=0):
    """Set up a pair of timing-enabled CUDA events.

    Args:
        time_offset: Stored offset (presumably added to reported times by
            the caller — confirm usage).
        cuda_stream: Stream handle to record on; 0 means the default stream.
    """
    self._time_off = time_offset
    self._cuda_stream = cuda_stream
    # timing=True is required for the events to support elapsed-time queries.
    self._t_start = cuda.event(timing=True)
    self._t_end = cuda.event(timing=True)