def eps_r_noop_strm_dev(x, A1, A2, out, tmp, tmp2, ones, zeros, streams, handle):
    """Sum per-term double-GEMM products into ``out``, one CUDA stream per term.

    For every index ``s`` in ``range(len(A1))`` two GEMMs are queued on
    ``streams[s]``.  In cuBLAS operand terms (presumably ``cublasZgemm_dev``
    follows the ``cublasZgemm`` argument order -- confirm) they compute::

        tmp[s]  = x * A1[s]
        tmp2[s] = A2[s]^H * tmp[s]

    Once every stream has been synchronized, ``out`` is zeroed and each
    ``tmp2[s]`` is added to it with ``cublasZaxpy``.

    The GEMM scalars are passed as device pointers (``ones[s]`` as alpha,
    ``zeros[s]`` as beta), so the handle is switched to pointer mode 1
    (device pointer mode in cuBLAS) while the GEMMs are queued.  The pointer
    mode and the stream binding are restored even if queuing or
    synchronizing fails, so the handle is never left in a state that breaks
    later host-scalar calls.

    Args:
        x: Device array used as the left operand of the first GEMM.
        A1, A2: Indexable collections of device arrays, one per term.
        out: Device array that receives the accumulated sum (overwritten).
        tmp, tmp2: Per-term scratch device arrays (overwritten).
        ones, zeros: Per-term device scalars used as alpha / beta
            (presumably holding 1 and 0 -- confirm against the caller).
        streams: Per-term stream objects exposing ``.handle`` and
            ``.synchronize()``.
        handle: cuBLAS handle.

    Returns:
        ``out``, after accumulation.

    All matrices are treated as square ``D x D`` with
    ``D = A1[0].shape[0]`` (every dimension and leading dimension passed to
    the GEMMs is ``D``).
    """
    D = A1[0].shape[0]
    n_terms = len(A1)

    cb._libcublas.cublasSetPointerMode_v2(handle, 1)
    try:
        for s in range(n_terms):
            cb.cublasSetStream(handle, streams[s].handle)
            cublasZgemm_dev(
                handle, 'N', 'N', D, D, D,
                ones[s].gpudata, x.gpudata, D,
                A1[s].gpudata, D,
                zeros[s].gpudata, tmp[s].gpudata, D,
            )
            cublasZgemm_dev(
                handle, 'C', 'N', D, D, D,
                ones[s].gpudata, A2[s].gpudata, D,
                tmp[s].gpudata, D,
                zeros[s].gpudata, tmp2[s].gpudata, D,
            )

        # All per-term results must be complete before they are summed.
        for stream in streams:
            stream.synchronize()
    finally:
        # Back to host-scalar mode and the default stream, also on error.
        cb._libcublas.cublasSetPointerMode_v2(handle, 0)
        cb.cublasSetStream(handle, 0)

    out.fill(0)
    for s in range(n_terms):
        cb.cublasZaxpy(handle, D * D, 1., tmp2[s].gpudata, 1, out.gpudata, 1)

    return out
def test_cublasZaxpy(self):
    """Check ``cublasZaxpy`` against the NumPy expression ``alpha * x + y``.

    Random complex128 data is used: one scalar and two vectors of length 5.
    """
    # NOTE(review): this call passes no cuBLAS handle as first argument --
    # confirm the binding under test uses the handle-less signature.
    scale = np.complex128(np.random.rand() + 1j * np.random.rand())

    x_host = (np.random.rand(5) + 1j * np.random.rand(5)).astype(np.complex128)
    x_dev = gpuarray.to_gpu(x_host)

    y_host = (np.random.rand(5) + 1j * np.random.rand(5)).astype(np.complex128)
    y_dev = gpuarray.to_gpu(y_host)

    # In-place update of the device copy of y.
    cublas.cublasZaxpy(x_dev.size, scale, x_dev.gpudata, 1, y_dev.gpudata, 1)

    expected = scale * x_host + y_host
    assert np.allclose(y_dev.get(), expected)
def test_cublasZaxpy(self):
    """Check ``cublasZaxpy`` against the NumPy expression ``alpha * x + y``.

    Random complex128 data is used: one scalar and two vectors of length 5.
    The call goes through ``self.cublas_handle``.
    """
    scale = np.complex128(np.random.rand() + 1j * np.random.rand())

    x_host = (np.random.rand(5) + 1j * np.random.rand(5)).astype(np.complex128)
    x_dev = gpuarray.to_gpu(x_host)

    y_host = (np.random.rand(5) + 1j * np.random.rand(5)).astype(np.complex128)
    y_dev = gpuarray.to_gpu(y_host)

    # In-place update of the device copy of y.
    cublas.cublasZaxpy(
        self.cublas_handle,
        x_dev.size,
        scale,
        x_dev.gpudata, 1,
        y_dev.gpudata, 1,
    )

    expected = scale * x_host + y_host
    assert np.allclose(y_dev.get(), expected)
def eps_l_noop_strm_dev(x, A1, A2, out, tmp, tmp2, ones, zeros, streams, handle):
    """Sum per-term double-GEMM products into ``out``, one CUDA stream per term.

    For every index ``s`` in ``range(len(A1))`` two GEMMs are queued on
    ``streams[s]``.  In cuBLAS operand terms (presumably ``cublasZgemm_dev``
    follows the ``cublasZgemm`` argument order -- confirm) they compute::

        tmp[s]  = x * A1[s]^H
        tmp2[s] = A2[s] * tmp[s]

    Once every stream has been synchronized, ``out`` is zeroed and each
    ``tmp2[s]`` is added to it with ``cublasZaxpy``.

    The GEMM scalars are passed as device pointers (``ones[s]`` as alpha,
    ``zeros[s]`` as beta), so the handle is switched to pointer mode 1
    (device pointer mode in cuBLAS) while the GEMMs are queued.  The pointer
    mode and the stream binding are restored even if queuing or
    synchronizing fails, so the handle is never left in a state that breaks
    later host-scalar calls.

    Args:
        x: Device array used as the left operand of the first GEMM.
        A1, A2: Indexable collections of device arrays, one per term.
        out: Device array that receives the accumulated sum (overwritten).
        tmp, tmp2: Per-term scratch device arrays (overwritten).
        ones, zeros: Per-term device scalars used as alpha / beta
            (presumably holding 1 and 0 -- confirm against the caller).
        streams: Per-term stream objects exposing ``.handle`` and
            ``.synchronize()``.
        handle: cuBLAS handle.

    Returns:
        ``out``, after accumulation.

    All matrices are treated as square ``D x D`` with
    ``D = A1[0].shape[0]``.
    """
    D = A1[0].shape[0]
    n_terms = len(A1)

    cb._libcublas.cublasSetPointerMode_v2(handle, 1)
    try:
        # ``range`` (not ``xrange``) so the function also runs on Python 3.
        for s in range(n_terms):
            cb.cublasSetStream(handle, streams[s].handle)
            cublasZgemm_dev(
                handle, "N", "C", D, D, D,
                ones[s].gpudata, x.gpudata, D,
                A1[s].gpudata, D,
                zeros[s].gpudata, tmp[s].gpudata, D,
            )
            cublasZgemm_dev(
                handle, "N", "N", D, D, D,
                ones[s].gpudata, A2[s].gpudata, D,
                tmp[s].gpudata, D,
                zeros[s].gpudata, tmp2[s].gpudata, D,
            )

        # All per-term results must be complete before they are summed.
        for stream in streams:
            stream.synchronize()
    finally:
        # Back to host-scalar mode and the default stream, also on error.
        cb._libcublas.cublasSetPointerMode_v2(handle, 0)
        cb.cublasSetStream(handle, 0)

    out.fill(0)
    for s in range(n_terms):
        cb.cublasZaxpy(handle, D * D, 1.0, tmp2[s].gpudata, 1, out.gpudata, 1)

    return out
def eps_l_noop_batch(x_ptrs, A1_ptrs, A2_ptrs, out, tmp_ptrs, tmp2_ptrs, tmp2, handle):
    """Batched variant: two ``cublasZgemmBatched`` calls, then a sum into ``out``.

    In cuBLAS operand terms, for each batch entry ``s``::

        tmp[s]  = x[s] * A1[s]^H
        tmp2[s] = A2[s] * tmp[s]

    Afterwards ``out`` is zeroed and every ``tmp2[s]`` is added to it with
    ``cublasZaxpy``.

    Args:
        x_ptrs, A1_ptrs, A2_ptrs, tmp_ptrs, tmp2_ptrs: Device arrays whose
            ``gpudata`` is passed as the pointer-array arguments of the
            batched GEMMs (presumably ``tmp2_ptrs`` points at the buffers in
            ``tmp2`` -- confirm against the caller).
        out: Device array receiving the accumulated sum (overwritten); its
            first dimension defines the matrix size ``D``.
        tmp2: Sequence of device arrays; its length is the batch count.
        handle: cuBLAS handle.

    Returns:
        ``out``, after accumulation.
    """
    dim = out.shape[0]
    batch_count = len(tmp2)
    alpha, beta = 1., 0.

    cb.cublasZgemmBatched(
        handle, 'N', 'C', dim, dim, dim,
        alpha, x_ptrs.gpudata, dim,
        A1_ptrs.gpudata, dim,
        beta, tmp_ptrs.gpudata, dim,
        batch_count,
    )
    cb.cublasZgemmBatched(
        handle, 'N', 'N', dim, dim, dim,
        alpha, A2_ptrs.gpudata, dim,
        tmp_ptrs.gpudata, dim,
        beta, tmp2_ptrs.gpudata, dim,
        batch_count,
    )

    out.fill(0)
    n_elems = dim * dim
    for idx in range(batch_count):
        cb.cublasZaxpy(handle, n_elems, alpha, tmp2[idx].gpudata, 1, out.gpudata, 1)

    return out
def matvec(self, v):
    """Apply the operator to ``v`` on the GPU and return a flat host array.

    ``v`` is reshaped to ``(D, D)`` and uploaded into ``self.xG``.  It is
    then pushed through a chain of ``eps_*`` kernels (batched or streamed,
    depending on ``self.use_batch``), ping-ponging between two buffer sets.
    The chained result ``E`` (minus a projection term when ``self.pseudo``
    is set) is folded back into ``self.xG`` as ``xG += phase * E`` via
    ``cublasZaxpy``, where ``phase`` is ``-exp(-1j * p)`` on the left branch
    and ``-exp(+1j * p)`` on the right branch.

    Returns:
        ``self.xG`` downloaded to the host and flattened with ``ravel()``.
    """
    side = self.D
    x = v.reshape((side, side))

    # Upload x and seed the first input buffer with a device-to-device copy.
    self.xG.set(x)
    cd.memcpy_dtod(self.out2.gpudata, self.xG.gpudata, self.xG.nbytes)

    # Each pair is [device array, companion passed to the batched kernels].
    # ``src`` feeds the next kernel, ``dst`` receives its result.
    dst = [self.out, self.out_p]
    src = [self.out2, self.out2_p]

    if self.left:
        # Multiplying from the left, but x is a col. vector, so use mat_dagger
        for k in range(len(self.A1G)):
            if self.use_batch:
                eps_l_noop_batch(
                    src[1], self.A1G_p[k], self.A2G_p[k], dst[0],
                    self.tmp_p, self.tmp2_p, self.tmp2, self.hdl,
                )
            else:
                eps_l_noop_strm_dev(
                    src[0], self.A1G[k], self.A2G[k], dst[0],
                    self.tmp, self.tmp2, self.ones, self.zeros,
                    self.streams, self.hdl,
                )
            # The freshly written buffer becomes the next input.
            src, dst = dst, src

        term = src[0]
        if self.pseudo:
            term = term - self.lG * m.adot(self.r, x)
        phase = -sp.exp(-1.j * self.p)
    else:
        # The right-hand chain walks the sites in reverse order.
        for k in reversed(range(len(self.A2G))):
            if self.use_batch:
                eps_r_noop_batch(
                    src[1], self.A1G_p[k], self.A2G_p[k], dst[0],
                    self.tmp_p, self.tmp2_p, self.tmp2, self.hdl,
                )
            else:
                eps_r_noop_strm_dev(
                    src[0], self.A1G[k], self.A2G[k], dst[0],
                    self.tmp, self.tmp2, self.ones, self.zeros,
                    self.streams, self.hdl,
                )
            src, dst = dst, src

        term = src[0]
        if self.pseudo:
            term = term - self.rG * m.adot(self.l, x)
        phase = -sp.exp(1.j * self.p)

    # xG <- xG + phase * term, in place on the device.
    cb.cublasZaxpy(self.hdl, side ** 2, phase, term.gpudata, 1, self.xG.gpudata, 1)

    return self.xG.get().ravel()
def eps_l_noop_strm(x, A1, A2, out, tmp, tmp2, streams, handle):
    """Sum per-term double-GEMM products into ``out``, one CUDA stream per term.

    For every index ``s`` in ``range(len(A1))`` two ``cublasZgemm`` calls
    are queued on ``streams[s]``.  In cuBLAS operand terms they compute::

        tmp[s]  = x * A1[s]^H
        tmp2[s] = A2[s] * tmp[s]

    After all streams are synchronized the handle is returned to the default
    stream, ``out`` is zeroed, and every ``tmp2[s]`` is added to it with
    ``cublasZaxpy``.

    Args:
        x: Device array used as the left operand of the first GEMM.
        A1, A2: Indexable collections of device arrays, one per term.
        out: Device array that receives the accumulated sum (overwritten).
        tmp, tmp2: Per-term scratch device arrays (overwritten).
        streams: Per-term stream objects exposing ``.handle`` and
            ``.synchronize()``.
        handle: cuBLAS handle.

    Returns:
        ``out``, after accumulation.

    All matrices are treated as square ``D x D`` with
    ``D = A1[0].shape[0]``.
    """
    D = A1[0].shape[0]
    n_terms = len(A1)

    # ``range`` (not ``xrange``) so the function also runs on Python 3.
    for s in range(n_terms):
        cb.cublasSetStream(handle, streams[s].handle)
        cb.cublasZgemm(
            handle, "N", "C", D, D, D,
            1.0, x.gpudata, D,
            A1[s].gpudata, D,
            0.0, tmp[s].gpudata, D,
        )
        cb.cublasZgemm(
            handle, "N", "N", D, D, D,
            1.0, A2[s].gpudata, D,
            tmp[s].gpudata, D,
            0.0, tmp2[s].gpudata, D,
        )

    # All per-term results must be complete before they are summed.
    for stream in streams:
        stream.synchronize()

    cb.cublasSetStream(handle, 0)

    out.fill(0)
    for s in range(n_terms):
        cb.cublasZaxpy(handle, D * D, 1.0, tmp2[s].gpudata, 1, out.gpudata, 1)

    return out
def eps_r_noop_batch(x_ptrs, A1_ptrs, A2_ptrs, out, tmp_ptrs, tmp2_ptrs, tmp2, handle):
    """Batched variant: two ``cublasZgemmBatched`` calls, then a sum into ``out``.

    In cuBLAS operand terms, for each batch entry ``s``::

        tmp[s]  = x[s] * A1[s]
        tmp2[s] = A2[s]^H * tmp[s]

    Afterwards ``out`` is zeroed and every ``tmp2[s]`` is added to it with
    ``cublasZaxpy``.

    Args:
        x_ptrs, A1_ptrs, A2_ptrs, tmp_ptrs, tmp2_ptrs: Device arrays whose
            ``gpudata`` is passed as the pointer-array arguments of the
            batched GEMMs (presumably ``tmp2_ptrs`` points at the buffers in
            ``tmp2`` -- confirm against the caller).
        out: Device array receiving the accumulated sum (overwritten); its
            first dimension defines the matrix size ``D``.
        tmp2: Sequence of device arrays; its length is the batch count.
        handle: cuBLAS handle.

    Returns:
        ``out``, after accumulation.
    """
    D = out.shape[0]
    d = len(tmp2)

    cb.cublasZgemmBatched(
        handle, "N", "N", D, D, D,
        1.0, x_ptrs.gpudata, D,
        A1_ptrs.gpudata, D,
        0.0, tmp_ptrs.gpudata, D,
        d,
    )
    cb.cublasZgemmBatched(
        handle, "C", "N", D, D, D,
        1.0, A2_ptrs.gpudata, D,
        tmp_ptrs.gpudata, D,
        0.0, tmp2_ptrs.gpudata, D,
        d,
    )

    out.fill(0)
    # ``range`` (not ``xrange``) so the function also runs on Python 3.
    for s in range(d):
        cb.cublasZaxpy(handle, D * D, 1.0, tmp2[s].gpudata, 1, out.gpudata, 1)

    return out
def eps_l_noop_strm(x, A1, A2, out, tmp, tmp2, streams, handle):
    """Sum per-term double-GEMM products into ``out``, one CUDA stream per term.

    For each term index two ``cublasZgemm`` calls are queued on that term's
    stream.  In cuBLAS operand terms they compute::

        tmp[i]  = x * A1[i]^H
        tmp2[i] = A2[i] * tmp[i]

    After every stream has been synchronized the handle is put back on the
    default stream, ``out`` is zeroed, and all ``tmp2[i]`` are added to it.

    Args:
        x: Device array used as the left operand of the first GEMM.
        A1, A2: Indexable collections of device arrays, one per term.
        out: Device array that receives the accumulated sum (overwritten).
        tmp, tmp2: Per-term scratch device arrays (overwritten).
        streams: Per-term stream objects exposing ``.handle`` and
            ``.synchronize()``.
        handle: cuBLAS handle.

    Returns:
        ``out``, after accumulation.
    """
    dim = A1[0].shape[0]
    n_terms = len(A1)
    alpha, beta = 1., 0.

    for i in range(n_terms):
        cb.cublasSetStream(handle, streams[i].handle)
        cb.cublasZgemm(
            handle, 'N', 'C', dim, dim, dim,
            alpha, x.gpudata, dim,
            A1[i].gpudata, dim,
            beta, tmp[i].gpudata, dim,
        )
        cb.cublasZgemm(
            handle, 'N', 'N', dim, dim, dim,
            alpha, A2[i].gpudata, dim,
            tmp[i].gpudata, dim,
            beta, tmp2[i].gpudata, dim,
        )

    # Wait for every queued product before summing.
    for stream in streams:
        stream.synchronize()

    cb.cublasSetStream(handle, 0)

    out.fill(0)
    n_elems = dim * dim
    for i in range(n_terms):
        cb.cublasZaxpy(handle, n_elems, alpha, tmp2[i].gpudata, 1, out.gpudata, 1)

    return out
def eps_r_noop_strm(x, A1, A2, out, tmp, tmp2, streams, handle):
    """Sum per-term double-GEMM products into ``out``, one CUDA stream per term.

    For each term index two ``cublasZgemm`` calls are queued on that term's
    stream.  In cuBLAS operand terms they compute::

        tmp[i]  = x * A1[i]
        tmp2[i] = A2[i]^H * tmp[i]

    After every stream has been synchronized the handle is put back on the
    default stream, ``out`` is zeroed, and all ``tmp2[i]`` are added to it.

    Args:
        x: Device array used as the left operand of the first GEMM.
        A1, A2: Indexable collections of device arrays, one per term.
        out: Device array that receives the accumulated sum (overwritten).
        tmp, tmp2: Per-term scratch device arrays (overwritten).
        streams: Per-term stream objects exposing ``.handle`` and
            ``.synchronize()``.
        handle: cuBLAS handle.

    Returns:
        ``out``, after accumulation.
    """
    dim = A1[0].shape[0]
    n_terms = len(A1)
    alpha, beta = 1., 0.

    for i in range(n_terms):
        cb.cublasSetStream(handle, streams[i].handle)
        cb.cublasZgemm(
            handle, 'N', 'N', dim, dim, dim,
            alpha, x.gpudata, dim,
            A1[i].gpudata, dim,
            beta, tmp[i].gpudata, dim,
        )
        cb.cublasZgemm(
            handle, 'C', 'N', dim, dim, dim,
            alpha, A2[i].gpudata, dim,
            tmp[i].gpudata, dim,
            beta, tmp2[i].gpudata, dim,
        )

    # Wait for every queued product before summing.
    for stream in streams:
        stream.synchronize()

    cb.cublasSetStream(handle, 0)

    out.fill(0)
    n_elems = dim * dim
    for i in range(n_terms):
        cb.cublasZaxpy(handle, n_elems, alpha, tmp2[i].gpudata, 1, out.gpudata, 1)

    return out