def test_cublasSgemmBatched(self):
    # Batched GEMM test: multiply l independent (m x k) @ (k x n) matrix
    # pairs in a single cuBLAS call and compare against a NumPy reference.
    l, m, k, n = 11, 7, 5, 3
    A = np.random.rand(l, m, k).astype(np.float32)
    B = np.random.rand(l, k, n).astype(np.float32)
    # Reference result computed on the host: C_res[b] = A[b] @ B[b].
    C_res = np.einsum('nij,njk->nik', A, B)
    a_gpu = gpuarray.to_gpu(A)
    b_gpu = gpuarray.to_gpu(B)
    c_gpu = gpuarray.empty((l, m, n), np.float32)
    alpha = np.float32(1.0)
    beta = np.float32(0.0)
    # bptrs presumably builds the device array of per-batch matrix
    # pointers that the *Batched cuBLAS API expects — confirm against
    # its definition elsewhere in this file.
    a_arr = bptrs(a_gpu)
    b_arr = bptrs(b_gpu)
    c_arr = bptrs(c_gpu)
    # cuBLAS is column-major while the gpuarrays are row-major.  Passing
    # B as the first operand with swapped m/n computes B^T @ A^T in
    # column-major terms, i.e. (A @ B)^T column-major — which is exactly
    # A @ B laid out row-major, so c_gpu comes back with shape (l, m, n).
    cublas.cublasSgemmBatched(self.cublas_handle, 'n', 'n',
                              n, m, k, alpha,
                              b_arr.gpudata, n,
                              a_arr.gpudata, k,
                              beta,
                              c_arr.gpudata, n, l)
    assert np.allclose(C_res, c_gpu.get())
def matmul(i, w, o):
    """Batched matrix multiply o = i @ op(w) via cuBLAS, in place.

    Parameters
    ----------
    i : input batch; shape (l, m, k) — batch of left operands.
    w : weight batch; right operand.  Whether it is applied transposed
        is selected by the module-level ``TRANSPOSE_WEIGHT`` flag.
    o : output batch; shape (l, m, n).  Overwritten with the result.

    Scaling uses the module-level ``ALPHA``/``BETA`` constants.
    """
    l, m, n = o.shape
    k = i.shape[2]
    # The original two branches were identical except for the transa
    # flag, so fold the flag into a single call (DRY).
    transa = 'T' if TRANSPOSE_WEIGHT else 'N'
    # NOTE(review): lda for w is n in both branches; a 'T' operand
    # normally needs lda == k, so confirm this matches w's actual
    # storage layout when TRANSPOSE_WEIGHT is set.
    #
    # cuBLAS is column-major, the arrays row-major: passing w first
    # with swapped m/n computes (i @ w)^T in column-major terms, which
    # is i @ w in row-major memory.
    cublas.cublasSgemmBatched(cublas_handle(), transa, 'N',
                              n, m, k, ALPHA,
                              w.bptrs, n,
                              i.bptrs, k,
                              BETA,
                              o.bptrs, n, l)