def eps_r_noop_strm_dev(x, A1, A2, out, tmp, tmp2, ones, zeros, streams, handle):
    """Accumulate ``A2[s]^H * (x * A1[s])`` over all ``s`` into ``out``.

    The products are written in cuBLAS's column-major convention; for
    C-ordered (row-major) buffers the same calls correspond to
    ``A1[s] . x . A2[s]^H`` -- NOTE(review): confirm the storage order used
    by the callers.

    One pair of GEMMs is queued per index ``s`` on ``streams[s]``, with the
    GEMM scalars read through device pointers (``ones[s]`` as alpha,
    ``zeros[s]`` as beta; presumably holding 1 and 0 -- verify against the
    caller).  All streams are then synchronised and the per-``s`` results in
    ``tmp2`` are summed into ``out`` on stream 0.

    Args:
        x: Device array supplying the left factor of the first GEMM.
        A1, A2: Sequences of device arrays; ``A1[0].shape[0]`` sets ``D``.
        out: Device array that is zeroed and then receives the sum.
        tmp, tmp2: Per-``s`` scratch device arrays (overwritten).
        ones, zeros: Per-``s`` device-resident scalars for alpha / beta.
        streams: Stream objects; needs at least ``len(A1)`` entries.
        handle: cuBLAS handle.  On exit it is in host pointer mode and bound
            to stream 0, also when an exception propagates.

    Returns:
        ``out``.
    """
    # Values of cublasPointerMode_t.
    pointer_mode_host = 0
    pointer_mode_device = 1

    D = A1[0].shape[0]
    Dm1 = D  # both dimensions are taken to be equal here
    n_terms = len(A1)

    cb._libcublas.cublasSetPointerMode_v2(handle, pointer_mode_device)
    try:
        for s in range(n_terms):
            cb.cublasSetStream(handle, streams[s].handle)
            # tmp[s] = x * A1[s]
            cublasZgemm_dev(handle, 'N', 'N', D, Dm1, D,
                            ones[s].gpudata, x.gpudata, D,
                            A1[s].gpudata, D,
                            zeros[s].gpudata, tmp[s].gpudata, D)
            # tmp2[s] = A2[s]^H * tmp[s]
            cublasZgemm_dev(handle, 'C', 'N', Dm1, Dm1, D,
                            ones[s].gpudata, A2[s].gpudata, D,
                            tmp[s].gpudata, D,
                            zeros[s].gpudata, tmp2[s].gpudata, Dm1)

        for strm in streams:
            strm.synchronize()
    finally:
        # Always hand the handle back in host pointer mode on stream 0;
        # otherwise a failure above would leave later host-scalar calls on
        # this handle reading their scalars as device pointers.
        cb._libcublas.cublasSetPointerMode_v2(handle, pointer_mode_host)
        cb.cublasSetStream(handle, 0)

    # Reduce the per-stream partial results, treating each buffer as a flat
    # vector of Dm1 * Dm1 elements.
    out.fill(0)
    for s in range(n_terms):
        cb.cublasZaxpy(handle, Dm1 * Dm1, 1., tmp2[s].gpudata, 1, out.gpudata, 1)

    return out
def thunk():
    """Compute the matrix product of the two inputs on the GPU.

    Operates entirely on names from the enclosing scope: it reads
    ``inputs[0][0]`` and ``inputs[1][0]``, converts both with
    ``to_complex_gpuarray``, multiplies them with ``linalg.dot`` on the
    cuBLAS handle ``handle[0]`` and stores the converted result in
    ``outputs[0][0]``.

    According to the original inline notes the inputs carry a trailing
    real/imag axis of length 2, i.e. ``(a, b, 2)`` and ``(b, c, 2)`` giving
    an ``(a, c, 2)`` result -- NOTE(review): not checked here, confirm
    against the conversion helpers.
    """
    lhs = to_complex_gpuarray(inputs[0][0])
    rhs = to_complex_gpuarray(inputs[1][0])

    # Multistream experiment: bind the handle to the next stream of the pool
    # and advance the shared round-robin counter for the following call.
    stream_obj = stream_pool[current_stream[0]]
    cublas.cublasSetStream(handle[0], stream_obj.handle)
    current_stream[0] = (current_stream[0] + 1) % num_streams

    product = linalg.dot(lhs, rhs, handle=handle[0])
    outputs[0][0] = to_complex_cudandarray(product)
def eps_l_noop_strm_dev(x, A1, A2, out, tmp, tmp2, ones, zeros, streams, handle):
    """Accumulate ``A2[s] * (x * A1[s]^H)`` over all ``s`` into ``out``.

    The products are written in cuBLAS's column-major convention; for
    C-ordered (row-major) buffers the same calls correspond to
    ``A1[s]^H . x . A2[s]`` -- NOTE(review): confirm the storage order used
    by the callers.

    One pair of GEMMs is queued per index ``s`` on ``streams[s]``, with the
    GEMM scalars read through device pointers (``ones[s]`` as alpha,
    ``zeros[s]`` as beta; presumably holding 1 and 0 -- verify against the
    caller).  All streams are then synchronised and the per-``s`` results in
    ``tmp2`` are summed into ``out`` on stream 0.

    Args:
        x: Device array supplying the left factor of the first GEMM.
        A1, A2: Sequences of device arrays; ``A1[0].shape[0]`` sets ``D``.
        out: Device array that is zeroed and then receives the sum.
        tmp, tmp2: Per-``s`` scratch device arrays (overwritten).
        ones, zeros: Per-``s`` device-resident scalars for alpha / beta.
        streams: Stream objects; needs at least ``len(A1)`` entries.
        handle: cuBLAS handle.  On exit it is in host pointer mode and bound
            to stream 0, also when an exception propagates.

    Returns:
        ``out``.
    """
    # Values of cublasPointerMode_t.
    pointer_mode_host = 0
    pointer_mode_device = 1

    D = A1[0].shape[0]
    n_terms = len(A1)

    cb._libcublas.cublasSetPointerMode_v2(handle, pointer_mode_device)
    try:
        # range() rather than xrange() so the function also runs on Python 3.
        for s in range(n_terms):
            cb.cublasSetStream(handle, streams[s].handle)
            # tmp[s] = x * A1[s]^H
            cublasZgemm_dev(
                handle, "N", "C", D, D, D,
                ones[s].gpudata, x.gpudata, D,
                A1[s].gpudata, D,
                zeros[s].gpudata, tmp[s].gpudata, D,
            )
            # tmp2[s] = A2[s] * tmp[s]
            cublasZgemm_dev(
                handle, "N", "N", D, D, D,
                ones[s].gpudata, A2[s].gpudata, D,
                tmp[s].gpudata, D,
                zeros[s].gpudata, tmp2[s].gpudata, D,
            )

        for strm in streams:
            strm.synchronize()
    finally:
        # Always hand the handle back in host pointer mode on stream 0;
        # otherwise a failure above would leave later host-scalar calls on
        # this handle reading their scalars as device pointers.
        cb._libcublas.cublasSetPointerMode_v2(handle, pointer_mode_host)
        cb.cublasSetStream(handle, 0)

    # Reduce the per-stream partial results, treating each buffer as a flat
    # vector of D * D elements.
    out.fill(0)
    for s in range(n_terms):
        cb.cublasZaxpy(handle, D * D, 1.0, tmp2[s].gpudata, 1, out.gpudata, 1)

    return out
def eps_l_noop_strm(x, A1, A2, out, tmp, tmp2, streams, handle):
    """Accumulate ``A2[s] * (x * A1[s]^H)`` over all ``s`` into ``out``.

    The products are written in cuBLAS's column-major convention; for
    C-ordered (row-major) buffers the same calls correspond to
    ``A1[s]^H . x . A2[s]`` -- NOTE(review): confirm the storage order used
    by the callers.

    One pair of GEMMs is queued per index ``s`` on ``streams[s]``; all
    streams are then synchronised and the per-``s`` results in ``tmp2`` are
    summed into ``out`` on stream 0.

    Args:
        x: Device array supplying the left factor of the first GEMM.
        A1, A2: Sequences of device arrays; ``A1[0].shape[0]`` sets ``D``.
        out: Device array that is zeroed and then receives the sum.
        tmp, tmp2: Per-``s`` scratch device arrays (overwritten).
        streams: Stream objects; needs at least ``len(A1)`` entries.
        handle: cuBLAS handle; it is bound to stream 0 on normal return.

    Returns:
        ``out``.
    """
    D = A1[0].shape[0]
    n_terms = len(A1)

    # range() rather than xrange() so the function also runs on Python 3.
    for s in range(n_terms):
        cb.cublasSetStream(handle, streams[s].handle)
        # tmp[s] = x * A1[s]^H
        cb.cublasZgemm(handle, "N", "C", D, D, D,
                       1.0, x.gpudata, D, A1[s].gpudata, D,
                       0.0, tmp[s].gpudata, D)
        # tmp2[s] = A2[s] * tmp[s]
        cb.cublasZgemm(handle, "N", "N", D, D, D,
                       1.0, A2[s].gpudata, D, tmp[s].gpudata, D,
                       0.0, tmp2[s].gpudata, D)

    for strm in streams:
        strm.synchronize()

    cb.cublasSetStream(handle, 0)

    # Reduce the per-stream partial results, treating each buffer as a flat
    # vector of D * D elements.
    out.fill(0)
    for s in range(n_terms):
        cb.cublasZaxpy(handle, D * D, 1.0, tmp2[s].gpudata, 1, out.gpudata, 1)

    return out
def eps_l_noop_strm(x, A1, A2, out, tmp, tmp2, streams, handle):
    """Sum ``A2[s] * (x * A1[s]^H)`` over ``s`` using one stream per term.

    The formula is stated in cuBLAS's column-major convention; with C-ordered
    buffers it reads ``A1[s]^H . x . A2[s]`` -- NOTE(review): confirm which
    layout the callers use.

    Args:
        x: Device array, left operand of the first GEMM of every term.
        A1, A2: Sequences of device arrays, one pair per term.
        out: Device array; zeroed here, then filled with the sum.
        tmp, tmp2: Per-term scratch device arrays (overwritten).
        streams: Stream objects, indexed in step with ``A1``.
        handle: cuBLAS handle; rebound to stream 0 before the reduction.

    Returns:
        ``out``.
    """
    dim = A1[0].shape[0]
    n_elems = dim * dim
    x_ptr = x.gpudata

    # Queue both GEMMs of every term on that term's own stream.
    for idx, a1 in enumerate(A1):
        scratch_ptr = tmp[idx].gpudata
        cb.cublasSetStream(handle, streams[idx].handle)
        # scratch = x * a1^H
        cb.cublasZgemm(handle, 'N', 'C', dim, dim, dim,
                       1., x_ptr, dim, a1.gpudata, dim,
                       0., scratch_ptr, dim)
        # tmp2[idx] = A2[idx] * scratch
        cb.cublasZgemm(handle, 'N', 'N', dim, dim, dim,
                       1., A2[idx].gpudata, dim, scratch_ptr, dim,
                       0., tmp2[idx].gpudata, dim)

    # Wait until every stream has finished before touching the results.
    for stream in streams:
        stream.synchronize()

    cb.cublasSetStream(handle, 0)

    # Add the partial results together as flat vectors.
    out.fill(0)
    out_ptr = out.gpudata
    for idx in range(len(A1)):
        cb.cublasZaxpy(handle, n_elems, 1., tmp2[idx].gpudata, 1, out_ptr, 1)

    return out
def eps_r_noop_strm(x, A1, A2, out, tmp, tmp2, streams, handle):
    """Sum ``A2[s]^H * (x * A1[s])`` over ``s`` using one stream per term.

    The formula is stated in cuBLAS's column-major convention; with C-ordered
    buffers it reads ``A1[s] . x . A2[s]^H`` -- NOTE(review): confirm which
    layout the callers use.

    Args:
        x: Device array, left operand of the first GEMM of every term.
        A1, A2: Sequences of device arrays, one pair per term.
        out: Device array; zeroed here, then filled with the sum.
        tmp, tmp2: Per-term scratch device arrays (overwritten).
        streams: Stream objects, indexed in step with ``A1``.
        handle: cuBLAS handle; rebound to stream 0 before the reduction.

    Returns:
        ``out``.
    """
    rows = A1[0].shape[0]
    cols = rows  # the two dimensions are taken to be equal here
    n_elems = cols * cols
    x_ptr = x.gpudata

    # Queue both GEMMs of every term on that term's own stream.
    for idx, a1 in enumerate(A1):
        scratch_ptr = tmp[idx].gpudata
        cb.cublasSetStream(handle, streams[idx].handle)
        # scratch = x * a1
        cb.cublasZgemm(handle, 'N', 'N', rows, cols, rows,
                       1., x_ptr, rows, a1.gpudata, rows,
                       0., scratch_ptr, rows)
        # tmp2[idx] = A2[idx]^H * scratch
        cb.cublasZgemm(handle, 'C', 'N', cols, cols, rows,
                       1., A2[idx].gpudata, rows, scratch_ptr, rows,
                       0., tmp2[idx].gpudata, cols)

    # Wait until every stream has finished before touching the results.
    for stream in streams:
        stream.synchronize()

    cb.cublasSetStream(handle, 0)

    # Add the partial results together as flat vectors.
    out.fill(0)
    out_ptr = out.gpudata
    for idx in range(len(A1)):
        cb.cublasZaxpy(handle, n_elems, 1., tmp2[idx].gpudata, 1, out_ptr, 1)

    return out
def calc_Bs(N, A, l, l_s, l_si, r, r_s, r_si, C, K, Vsh):
    """Compute the per-site ``B`` arrays on the GPU and return them on the host.

    All inputs are uploaded to the device.  For every site ``n`` in
    ``1..N`` with ``Vsh[n]`` not ``None`` the function calls ``calc_x_G``
    (defined elsewhere) to obtain ``x`` and then forms, for each ``s``,

        ``B[n][s] = l_si[n - 1] . x . Vsh[n][s]^H . r_si[n]``

    cuBLAS work is spread over a small pool of streams, each stream serving
    a run of consecutive sites.

    Args:
        N: Number of sites; sites are indexed ``1..N``.
        A: Per-site arrays (or ``None``); ``A[n][s]`` is uploaded per ``s``
            and ``A[n]`` fixes the shape/dtype of ``B[n]``.
        l, l_s, l_si: Per-site arrays; an entry is uploaded only where
            ``l[n]`` is not ``None``.  (The names suggest ``l``, its square
            root and inverse square root -- TODO confirm.)
        r, r_s, r_si: Same as above, gated on ``r[n]``.
        C: Per-site arrays indexed as ``C[n][s, t]`` (or ``None``).
        K: Per-site arrays (or ``None``).
        Vsh: Per-site arrays indexed as ``Vsh[n][s]`` (or ``None``); must be
            indexable up to ``N``.

    Returns:
        A list ``B`` of length ``N + 1`` with ``B[0] = None``; ``B[n]`` is a
        host array shaped like ``A[n]`` or ``None`` where ``Vsh[n]`` is
        ``None``.
    """

    def upload_where_present(gate, values):
        # Dense device copy of values[n] wherever gate[n] is set.  A trailing
        # None is added so that index len(gate) (site n + 1 for n == N) is
        # valid.  TODO: Support special types...
        uploaded = [
            None if g is None else garr.to_gpu(sp.asarray(values[n]))
            for n, g in enumerate(gate)
        ]
        uploaded.append(None)
        return uploaded

    GA = [None if An is None else [garr.to_gpu(Ans) for Ans in An] for An in A]
    GA.append(None)

    Gl = upload_where_present(l, l)
    Gl_s = upload_where_present(l, l_s)
    Gl_si = upload_where_present(l, l_si)

    Gr = upload_where_present(r, r)
    Gr_s = upload_where_present(r, r_s)
    Gr_si = upload_where_present(r, r_si)

    GK = upload_where_present(K, K)

    GVsh = [
        None if Vn is None else [garr.to_gpu(Vn[s]) for s in range(Vn.shape[0])]
        for Vn in Vsh
    ]

    GC = [
        None if Cn is None else [
            [garr.to_gpu(Cn[s, t]) for t in range(Cn.shape[1])]
            for s in range(Cn.shape[0])
        ]
        for Cn in C
    ]
    GC.append(None)

    # GCts[n][t][s] == GC[n][s][t].  zip() takes its bounds from the data, so
    # this also holds when the two leading dimensions of C[n] differ.
    GCts = [None if GCn is None else [list(col) for col in zip(*GCn)]
            for GCn in GC]

    hdl = cb.cublasCreate()
    try:
        num_strms = 10
        curr_stream = cb.cublasGetStream(hdl)
        sites_per_strm = max(N // num_strms, 1)

        # Round up: the last, possibly shorter, run of sites needs a stream
        # of its own when N is not a multiple of sites_per_strm.
        strm_count = (N + sites_per_strm - 1) // sites_per_strm
        strms = [cd.Stream() for _ in range(strm_count)]

        GB = [None]
        for n in range(1, N + 1):
            strm_idx, offset = divmod(n - 1, sites_per_strm)
            if offset == 0:
                cb.cublasSetStream(hdl, strms[strm_idx].handle)

            if Vsh[n] is None:
                GB.append(None)
                continue

            Glm2 = Gl[n - 2] if n > 1 else None

            Gx = calc_x_G(GK[n + 1], GC[n], GCts[n - 1], Gr[n + 1], Glm2,
                          GA[n - 1], GA[n], GA[n + 1],
                          Gl_s[n - 1], Gl_si[n - 1], Gr_s[n], Gr_si[n],
                          GVsh[n], handle=hdl)

            # This product does not depend on s, so form it once per site.
            Glsi_x = cla.dot(Gl_si[n - 1], Gx, handle=hdl)

            GBn = []
            for s in range(A[n].shape[0]):
                GBns = cla.dot(Glsi_x, GVsh[n][s], transb='C', handle=hdl)
                GBns = cla.dot(GBns, Gr_si[n], handle=hdl)
                GBn.append(GBns)
            GB.append(GBn)

        cb.cublasSetStream(hdl, curr_stream)
    finally:
        # Release the handle even if one of the device calls above failed.
        cb.cublasDestroy(hdl)

    # Copy the results back into host arrays shaped like the matching A[n].
    B = [None]
    for n in range(1, N + 1):
        if GB[n] is None:
            B.append(None)
            continue
        Bn = sp.empty_like(A[n])
        for s in range(A[n].shape[0]):
            Bn[s] = GB[n][s].get()
        B.append(Bn)

    return B
def calc_Bs(N, A, l, l_s, l_si, r, r_s, r_si, C, K, Vsh):
    """Compute the per-site ``B`` arrays on the GPU and return them on the host.

    All inputs are uploaded to the device.  For every site ``n`` in
    ``1..N`` with ``Vsh[n]`` not ``None`` the function calls ``calc_x_G``
    (defined elsewhere) to obtain ``x`` and then forms, for each ``s``,

        ``B[n][s] = l_si[n - 1] . x . Vsh[n][s]^H . r_si[n]``

    cuBLAS work is spread over a small pool of streams, each stream serving
    a run of consecutive sites.

    Args:
        N: Number of sites; sites are indexed ``1..N``.
        A: Per-site arrays (or ``None``); ``A[n][s]`` is uploaded per ``s``
            and ``A[n]`` fixes the shape/dtype of ``B[n]``.
        l, l_s, l_si: Per-site arrays; an entry is uploaded only where
            ``l[n]`` is not ``None``.  (The names suggest ``l``, its square
            root and inverse square root -- TODO confirm.)
        r, r_s, r_si: Same as above, gated on ``r[n]``.
        C: Per-site arrays indexed as ``C[n][s, t]`` (or ``None``).
        K: Per-site arrays (or ``None``).
        Vsh: Per-site arrays indexed as ``Vsh[n][s]`` (or ``None``); must be
            indexable up to ``N``.

    Returns:
        A list ``B`` of length ``N + 1`` with ``B[0] = None``; ``B[n]`` is a
        host array shaped like ``A[n]`` or ``None`` where ``Vsh[n]`` is
        ``None``.
    """

    def upload_where_present(gate, values):
        # Dense device copy of values[n] wherever gate[n] is set.  A trailing
        # None is added so that index len(gate) (site n + 1 for n == N) is
        # valid.  TODO: Support special types...
        uploaded = [
            None if g is None else garr.to_gpu(sp.asarray(values[n]))
            for n, g in enumerate(gate)
        ]
        uploaded.append(None)
        return uploaded

    GA = [None if An is None else [garr.to_gpu(Ans) for Ans in An] for An in A]
    GA.append(None)

    Gl = upload_where_present(l, l)
    Gl_s = upload_where_present(l, l_s)
    Gl_si = upload_where_present(l, l_si)

    Gr = upload_where_present(r, r)
    Gr_s = upload_where_present(r, r_s)
    Gr_si = upload_where_present(r, r_si)

    GK = upload_where_present(K, K)

    GVsh = [
        None if Vn is None else [garr.to_gpu(Vn[s]) for s in range(Vn.shape[0])]
        for Vn in Vsh
    ]

    GC = [
        None if Cn is None else [
            [garr.to_gpu(Cn[s, t]) for t in range(Cn.shape[1])]
            for s in range(Cn.shape[0])
        ]
        for Cn in C
    ]
    GC.append(None)

    # GCts[n][t][s] == GC[n][s][t].  zip() takes its bounds from the data, so
    # this also holds when the two leading dimensions of C[n] differ.
    GCts = [None if GCn is None else [list(col) for col in zip(*GCn)]
            for GCn in GC]

    hdl = cb.cublasCreate()
    try:
        num_strms = 10
        curr_stream = cb.cublasGetStream(hdl)
        sites_per_strm = max(N // num_strms, 1)

        # Round up: the last, possibly shorter, run of sites needs a stream
        # of its own when N is not a multiple of sites_per_strm.
        strm_count = (N + sites_per_strm - 1) // sites_per_strm
        strms = [cd.Stream() for _ in range(strm_count)]

        GB = [None]
        for n in range(1, N + 1):
            strm_idx, offset = divmod(n - 1, sites_per_strm)
            if offset == 0:
                cb.cublasSetStream(hdl, strms[strm_idx].handle)

            if Vsh[n] is None:
                GB.append(None)
                continue

            Glm2 = Gl[n - 2] if n > 1 else None

            Gx = calc_x_G(GK[n + 1], GC[n], GCts[n - 1], Gr[n + 1], Glm2,
                          GA[n - 1], GA[n], GA[n + 1],
                          Gl_s[n - 1], Gl_si[n - 1], Gr_s[n], Gr_si[n],
                          GVsh[n], handle=hdl)

            # This product does not depend on s, so form it once per site.
            Glsi_x = cla.dot(Gl_si[n - 1], Gx, handle=hdl)

            GBn = []
            for s in range(A[n].shape[0]):
                GBns = cla.dot(Glsi_x, GVsh[n][s], transb='C', handle=hdl)
                GBns = cla.dot(GBns, Gr_si[n], handle=hdl)
                GBn.append(GBns)
            GB.append(GBn)

        cb.cublasSetStream(hdl, curr_stream)
    finally:
        # Release the handle even if one of the device calls above failed.
        cb.cublasDestroy(hdl)

    # Copy the results back into host arrays shaped like the matching A[n].
    B = [None]
    for n in range(1, N + 1):
        if GB[n] is None:
            B.append(None)
            continue
        Bn = sp.empty_like(A[n])
        for s in range(A[n].shape[0]):
            Bn[s] = GB[n][s].get()
        B.append(Bn)

    return B