def eps_l_noop_strm(x, A1, A2, out, tmp, tmp2, streams, handle):
    """Sum ``A2[s] * x * A1[s]^H`` over ``s`` into ``out``, one stream per ``s``.

    The products are written in cuBLAS's column-major convention.
    NOTE(review): if the device buffers are row-major (C-ordered), the same
    calls correspond to ``A1[s]^H * x * A2[s]`` -- confirm against callers.

    Parameters
    ----------
    x : device array exposing ``.gpudata``
        Assumed to be a D x D double-complex matrix -- TODO confirm.
    A1, A2 : sequences of device arrays
        Indexed by ``s``; ``A1[0].shape[0]`` defines D. All entries are
        treated as D x D.
    out : device array
        Zeroed with ``out.fill(0)`` and then used as the accumulator.
    tmp, tmp2 : sequences of device arrays
        Per-``s`` scratch buffers; both are overwritten.
    streams : sequence of stream objects
        ``streams[s].handle`` is bound to the cuBLAS handle for index ``s``,
        so at least ``len(A1)`` entries are required.
    handle : cuBLAS handle
        Left bound to stream 0 on normal return.

    Returns
    -------
    ``out``, the same object that was passed in.
    """
    D = A1[0].shape[0]

    # ``range`` rather than the Python 2-only ``xrange``: the latter raises
    # NameError on Python 3, and ``range`` works on both.
    for s in range(len(A1)):
        # Issue both products for this index on its own stream.
        cb.cublasSetStream(handle, streams[s].handle)
        # tmp[s] = x * A1[s]^H
        cb.cublasZgemm(handle, "N", "C", D, D, D, 1.0,
                       x.gpudata, D, A1[s].gpudata, D,
                       0.0, tmp[s].gpudata, D)
        # tmp2[s] = A2[s] * tmp[s]
        cb.cublasZgemm(handle, "N", "N", D, D, D, 1.0,
                       A2[s].gpudata, D, tmp[s].gpudata, D,
                       0.0, tmp2[s].gpudata, D)

    # Wait for every stream (all of ``streams``, not only the first len(A1))
    # before the partial results are read back for the reduction.
    for s in streams:
        s.synchronize()

    # The reduction is issued on stream 0.
    cb.cublasSetStream(handle, 0)
    out.fill(0)
    for s in range(len(A1)):
        # out += tmp2[s], with both buffers treated as flat D*D vectors.
        cb.cublasZaxpy(handle, D * D, 1.0, tmp2[s].gpudata, 1, out.gpudata, 1)

    return out
def eps_l_noop_strm(x, A1, A2, out, tmp, tmp2, streams, handle):
    """Compute ``out = sum_k A2[k] * x * A1[k]^H`` using one stream per term.

    The formula is stated in cuBLAS's column-major convention.
    NOTE(review): for row-major (C-ordered) device buffers the same calls
    amount to ``sum_k A1[k]^H * x * A2[k]`` -- confirm against callers.

    Args:
        x: Device array exposing ``.gpudata``; presumably a square
            double-complex matrix of side ``A1[0].shape[0]`` -- TODO confirm.
        A1: Sequence of device arrays; its length sets the number of terms.
        A2: Sequence of device arrays, indexed in step with ``A1``.
        out: Device array that is zero-filled and then accumulated into.
        tmp: Sequence of scratch device arrays (overwritten).
        tmp2: Sequence of scratch device arrays (overwritten).
        streams: Sequence of stream objects; entry ``k`` is used for term
            ``k`` and every entry is synchronized before the reduction.
        handle: cuBLAS handle; bound to stream 0 when the function returns.

    Returns:
        The ``out`` array that was passed in.
    """
    dim = A1[0].shape[0]
    n_terms = len(A1)
    one = 1.
    zero = 0.

    for k in range(n_terms):
        cb.cublasSetStream(handle, streams[k].handle)
        # First product: tmp[k] = x * A1[k]^H
        cb.cublasZgemm(
            handle, 'N', 'C', dim, dim, dim, one,
            x.gpudata, dim,
            A1[k].gpudata, dim,
            zero, tmp[k].gpudata, dim,
        )
        # Second product: tmp2[k] = A2[k] * tmp[k]
        cb.cublasZgemm(
            handle, 'N', 'N', dim, dim, dim, one,
            A2[k].gpudata, dim,
            tmp[k].gpudata, dim,
            zero, tmp2[k].gpudata, dim,
        )

    # Block until all per-term work has finished.
    for stream in streams:
        stream.synchronize()

    # Reduce the per-term results on stream 0.
    cb.cublasSetStream(handle, 0)
    out.fill(0)
    n_elems = dim * dim
    for k in range(n_terms):
        cb.cublasZaxpy(handle, n_elems, one, tmp2[k].gpudata, 1, out.gpudata, 1)

    return out
def eps_r_noop_strm(x, A1, A2, out, tmp, tmp2, streams, handle):
    """Compute ``out = sum_k A2[k]^H * x * A1[k]`` using one stream per term.

    The formula is stated in cuBLAS's column-major convention.
    NOTE(review): for row-major (C-ordered) device buffers the same calls
    amount to ``sum_k A1[k] * x * A2[k]^H`` -- confirm against callers.

    Args:
        x: Device array exposing ``.gpudata``; presumably a square
            double-complex matrix of side ``A1[0].shape[0]`` -- TODO confirm.
        A1: Sequence of device arrays; its length sets the number of terms.
        A2: Sequence of device arrays, indexed in step with ``A1``.
        out: Device array that is zero-filled and then accumulated into.
        tmp: Sequence of scratch device arrays (overwritten).
        tmp2: Sequence of scratch device arrays (overwritten).
        streams: Sequence of stream objects; entry ``k`` is used for term
            ``k`` and every entry is synchronized before the reduction.
        handle: cuBLAS handle; bound to stream 0 when the function returns.

    Returns:
        The ``out`` array that was passed in.
    """
    # Every matrix dimension and leading dimension below is this one size.
    dim = A1[0].shape[0]
    n_terms = len(A1)
    one = 1.
    zero = 0.

    for k in range(n_terms):
        cb.cublasSetStream(handle, streams[k].handle)
        # First product: tmp[k] = x * A1[k]
        cb.cublasZgemm(
            handle, 'N', 'N', dim, dim, dim, one,
            x.gpudata, dim,
            A1[k].gpudata, dim,
            zero, tmp[k].gpudata, dim,
        )
        # Second product: tmp2[k] = A2[k]^H * tmp[k]
        cb.cublasZgemm(
            handle, 'C', 'N', dim, dim, dim, one,
            A2[k].gpudata, dim,
            tmp[k].gpudata, dim,
            zero, tmp2[k].gpudata, dim,
        )

    # Block until all per-term work has finished.
    for stream in streams:
        stream.synchronize()

    # Reduce the per-term results on stream 0.
    cb.cublasSetStream(handle, 0)
    out.fill(0)
    n_elems = dim * dim
    for k in range(n_terms):
        cb.cublasZaxpy(handle, n_elems, one, tmp2[k].gpudata, 1, out.gpudata, 1)

    return out