Example #1
    def test_cublasSgemmBatched(self):
        # Batch of l matrix products: (m x k) @ (k x n) -> (m x n).
        l, m, k, n = 11, 7, 5, 3
        A = np.random.rand(l, m, k).astype(np.float32)
        B = np.random.rand(l, k, n).astype(np.float32)

        # Reference result computed on the CPU.
        C_res = np.einsum('nij,njk->nik', A, B)

        a_gpu = gpuarray.to_gpu(A)
        b_gpu = gpuarray.to_gpu(B)
        c_gpu = gpuarray.empty((l, m, n), np.float32)

        alpha = np.float32(1.0)
        beta = np.float32(0.0)

        # Arrays of device pointers, one pointer per matrix in the batch.
        a_arr = bptrs(a_gpu)
        b_arr = bptrs(b_gpu)
        c_arr = bptrs(c_gpu)

        # cuBLAS expects column-major storage while the NumPy arrays are
        # row-major.  Computing C^T = B^T * A^T (i.e. passing B first and
        # swapping the m/n dimensions) leaves C in row-major layout without
        # any explicit transposition.
        cublas.cublasSgemmBatched(self.cublas_handle, 'n','n',
                                  n, m, k, alpha,
                                  b_arr.gpudata, n,
                                  a_arr.gpudata, k,
                                  beta, c_arr.gpudata, n, l)

        assert np.allclose(C_res, c_gpu.get())
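
The bptrs helper used above is not shown in the snippet; cublasSgemmBatched takes an array of device pointers (one per matrix in the batch), and bptrs builds that array from a contiguous (batch, rows, cols) gpuarray. A minimal sketch of such a helper, assuming contiguous pycuda batches (the imports and exact implementation are illustrative, not part of the original test):

import ctypes

import numpy as np
import pycuda.autoinit  # noqa: F401 -- creates a CUDA context on import
import pycuda.gpuarray as gpuarray
import skcuda.cublas as cublas

def bptrs(a):
    # For a contiguous batch, the pointer to matrix i is the base pointer
    # plus i times the byte size of one matrix (a.strides[0]).
    return gpuarray.arange(a.ptr,
                           a.ptr + a.shape[0] * a.strides[0],
                           a.strides[0],
                           dtype=ctypes.c_void_p)

In a standalone script, self.cublas_handle would be replaced by a handle obtained from cublas.cublasCreate() and released afterwards with cublas.cublasDestroy().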
Example #2
def matmul(i, w, o):
    # Batched product o = i @ w: i is (l, m, k), o is (l, m, n), and w holds
    # the per-batch weight matrices.
    l, m, n = o.shape
    k = i.shape[2]
    # As in the test above, the operands are passed to cuBLAS in swapped
    # order (weights first, with n and m exchanged) so that the row-major
    # result lands in o without an explicit transpose.
    if TRANSPOSE_WEIGHT:
        # Weights are stored transposed, so cuBLAS transposes them on the fly.
        cublas.cublasSgemmBatched(cublas_handle(), 'T', 'N', n, m, k,
                                  ALPHA, w.bptrs, n, i.bptrs, k, BETA,
                                  o.bptrs, n, l)
    else:
        cublas.cublasSgemmBatched(cublas_handle(), 'N', 'N', n, m, k,
                                  ALPHA, w.bptrs, n, i.bptrs, k, BETA,
                                  o.bptrs, n, l)
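
Both examples rely on the same identity: cuBLAS works in column-major order, so a row-major product C = A·B is requested as C^T = B^T·A^T, which is why the B/weight pointer array is passed first and the m and n arguments are exchanged. A short NumPy-only sketch of that identity, using the same shapes as Example #1 (illustrative, not part of either snippet):

import numpy as np

l, m, k, n = 11, 7, 5, 3
A = np.random.rand(l, m, k).astype(np.float32)
B = np.random.rand(l, k, n).astype(np.float32)

# Row-major batched product C = A @ B ...
C = np.einsum('nij,njk->nik', A, B)

# ... equals the transpose of B^T @ A^T, which is the call shape handed to
# cublasSgemmBatched in both examples.
C_T = np.einsum('nij,njk->nik', B.transpose(0, 2, 1), A.transpose(0, 2, 1))

assert np.allclose(C, C_T.transpose(0, 2, 1))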