def test_cublasSgemmBatched(self):
    l, m, k, n = 11, 7, 5, 3
    A = np.random.rand(l, m, k).astype(np.float32)
    B = np.random.rand(l, k, n).astype(np.float32)
    C_res = np.einsum('nij,njk->nik', A, B)
    a_gpu = gpuarray.to_gpu(A)
    b_gpu = gpuarray.to_gpu(B)
    c_gpu = gpuarray.empty((l, m, n), np.float32)
    alpha = np.float32(1.0)
    beta = np.float32(0.0)
    a_arr = bptrs(a_gpu)
    b_arr = bptrs(b_gpu)
    c_arr = bptrs(c_gpu)
    cublas.cublasSgemmBatched(self.cublas_handle, 'n', 'n',
                              n, m, k, alpha,
                              b_arr.gpudata, n,
                              a_arr.gpudata, k,
                              beta, c_arr.gpudata, n, l)
    assert np.allclose(C_res, c_gpu.get())
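The examples in this section build GPU arrays of per-matrix device pointers with a bptrs helper that is not shown. A minimal sketch of such a helper, assuming PyCUDA gpuarrays whose first axis is the batch dimension:

import ctypes
import pycuda.autoinit
import pycuda.gpuarray as gpuarray

def bptrs(a):
    # One device pointer per matrix in the batch: the base pointer plus
    # i * stride of the leading (batch) axis, stored on the GPU as void*.
    return gpuarray.arange(a.ptr, a.ptr + a.shape[0] * a.strides[0],
                           a.strides[0], dtype=ctypes.c_void_p)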
def solve_gpu(As, Bs):
    batch_size, num_factors = As.shape

    if allocated_shape[0] == As.shape:  # reuse previous allocations
        As_gpu, Bs_gpu, P_gpu, info_gpu, Cs_gpu, Rs_gpu, A_arr, B_arr, C_arr, R_arr = allocations[0]
        As_gpu.set(As)
        Bs_gpu.set(Bs)
    else:  # allocate
        # transfer As and Bs to GPU
        As_gpu = pycuda.gpuarray.to_gpu(As.astype('float32'))
        Bs_gpu = pycuda.gpuarray.to_gpu(Bs.astype('float32'))

        # allocate arrays
        P_gpu = pycuda.gpuarray.empty((batch_size, num_factors), np.int32)
        info_gpu = pycuda.gpuarray.zeros(batch_size, np.int32)
        Cs_gpu = pycuda.gpuarray.empty_like(Bs_gpu)  # inverted Bs.
        Rs_gpu = pycuda.gpuarray.empty_like(As_gpu)  # final output, As * inverted Bs.

        # get pointer arrays
        A_arr = bptrs(As_gpu)
        B_arr = bptrs(Bs_gpu)
        C_arr = bptrs(Cs_gpu)
        R_arr = bptrs(Rs_gpu)

        allocated_shape[0] = As.shape
        allocations[0] = As_gpu, Bs_gpu, P_gpu, info_gpu, Cs_gpu, Rs_gpu, A_arr, B_arr, C_arr, R_arr

    handle = scikits.cuda.misc._global_cublas_handle

    # perform LU factorization
    cublas.cublasSgetrfBatched(handle, num_factors, B_arr.gpudata, num_factors,
                               P_gpu.gpudata, info_gpu.gpudata, batch_size)
    # the LU factorization is now in Bs_gpu!

    # use the factorization to perform inversion
    cublas.cublasSgetriBatched(handle, num_factors, B_arr.gpudata, num_factors,
                               P_gpu.gpudata, C_arr.gpudata, num_factors,
                               info_gpu.gpudata, batch_size)
    # the inverted matrices are now in Cs_gpu!

    # compute dot products dot(A, C) = dot(A, Binv). Note that the As are actually vectors!
    transb = 'n'
    transa = 'n'
    N, k, m = Cs_gpu.shape
    N2, l = As_gpu.shape
    n = 1
    # As_gpu is a batch of vectors, not matrices, but we treat it as a batch of
    # matrices with leading dimension 1. Kind of tricky, but it seems to work.
    # The same goes for the output array Rs_gpu.
    lda = max(1, m)
    ldb = max(1, k)
    ldc = max(1, m)
    alpha = np.float32(1.0)
    beta = np.float32(0.0)

    cublas.cublasSgemmBatched(handle, transb, transa, m, n, k, alpha,
                              C_arr.gpudata, lda, A_arr.gpudata, ldb,
                              beta, R_arr.gpudata, ldc, N)
    # the resulting batch of vectors is now in Rs_gpu.

    return Rs_gpu.get()
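For reference, here is a plain NumPy sketch of what solve_gpu is intended to compute per the comments above, i.e. one dot(A, inv(B)) per batch item; the name solve_cpu is made up for illustration:

import numpy as np

def solve_cpu(As, Bs):
    # As: (batch_size, num_factors), Bs: (batch_size, num_factors, num_factors)
    # returns Rs with Rs[i] = dot(As[i], inv(Bs[i]))
    return np.stack([np.dot(a, np.linalg.inv(b)) for a, b in zip(As, Bs)])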
def compute_sgemm_batched(cols, kernels, biases, handle, m, k, n):
    # takes gpu arrays of pointers to pointers
    batchsize = len(cols)
    alpha = np.float32(1.0)
    beta = np.float32(1.0)
    flop = 2 * m * n * k * batchsize
    cublas.cublasSgemmBatched(handle, 'n', 'n', n, m, k, alpha,
                              cols.ptr, n, kernels.ptr, k, beta,
                              biases.ptr, n, batchsize)
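Because cuBLAS expects column-major storage while the gpuarrays are row-major, the operand order is swapped in the call above. Assuming (hypothetically) row-major batches of shape (batchsize, m, k) for the kernels, (batchsize, k, n) for the cols, and (batchsize, m, n) for the biases, the call is roughly equivalent to this host-side accumulation, where beta = 1.0 means the product is added onto the existing biases:

import numpy as np

# hypothetical shapes for illustration only
batchsize, m, k, n = 4, 8, 16, 32
kernels_host = np.random.rand(batchsize, m, k).astype(np.float32)
cols_host = np.random.rand(batchsize, k, n).astype(np.float32)
biases_host = np.zeros((batchsize, m, n), dtype=np.float32)

# per batch item: biases += kernels @ cols
biases_host += np.einsum('bmk,bkn->bmn', kernels_host, cols_host)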
def gpu_dot_batched(bx_gpu, by_gpu, bc_gpu, transa='N', transb='N', handle=None):
    """
    Uses cublasSgemmBatched to compute a batch of dot products in parallel.
    """
    if handle is None:
        handle = scikits.cuda.misc._global_cublas_handle

    assert len(bx_gpu.shape) == 3
    assert len(by_gpu.shape) == 3
    assert len(bc_gpu.shape) == 3
    assert bx_gpu.dtype == np.float32
    assert by_gpu.dtype == np.float32
    assert bc_gpu.dtype == np.float32

    # get the shapes of the arguments
    bx_shape = bx_gpu.shape
    by_shape = by_gpu.shape

    # perform matrix multiplication for 2D arrays:
    alpha = np.float32(1.0)
    beta = np.float32(0.0)

    transa = transa.lower()
    transb = transb.lower()

    if transb in ['t', 'c']:
        N, m, k = by_shape
    elif transb in ['n']:
        N, k, m = by_shape
    else:
        raise ValueError('invalid value for transb')

    if transa in ['t', 'c']:
        N2, l, n = bx_shape
    elif transa in ['n']:
        N2, n, l = bx_shape
    else:
        raise ValueError('invalid value for transa')

    if l != k:
        raise ValueError('objects are not aligned')

    if N != N2:
        raise ValueError('batch sizes are not the same')

    if transb == 'n':
        lda = max(1, m)
    else:
        lda = max(1, k)

    if transa == 'n':
        ldb = max(1, k)
    else:
        ldb = max(1, n)

    ldc = max(1, m)

    # construct pointer arrays needed for cublasSgemmBatched
    bx_arr = bptrs(bx_gpu)
    by_arr = bptrs(by_gpu)
    bc_arr = bptrs(bc_gpu)

    cublas.cublasSgemmBatched(handle, transb, transa, m, n, k, alpha,
                              by_arr.gpudata, lda, bx_arr.gpudata, ldb,
                              beta, bc_arr.gpudata, ldc, N)
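A possible usage sketch, assuming bptrs is defined as in the helper above and that scikits.cuda.misc.init() has been called to create the global cuBLAS handle; the shapes are arbitrary:

import numpy as np
import pycuda.autoinit
import pycuda.gpuarray as gpuarray
import scikits.cuda.misc

scikits.cuda.misc.init()  # sets up the global cublas handle used by default

bx = np.random.rand(16, 4, 6).astype(np.float32)
by = np.random.rand(16, 6, 5).astype(np.float32)
bx_gpu = gpuarray.to_gpu(bx)
by_gpu = gpuarray.to_gpu(by)
bc_gpu = gpuarray.empty((16, 4, 5), np.float32)

gpu_dot_batched(bx_gpu, by_gpu, bc_gpu)
assert np.allclose(np.einsum('nij,njk->nik', bx, by), bc_gpu.get(), atol=1e-4)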
def compute_sgemm_batched(cols, kernels, biases, m, k, n, stream, handle):
    batchsize = len(cols)
    alpha = np.float32(1.0)
    beta = np.float32(1.0)
    flop = 2 * m * n * k
    cublas.cublasSgemmBatched(handle, 'n', 'n', n, m, k, alpha,
                              cols.ptr, n, kernels.ptr, k, beta,
                              biases.ptr, n, batchsize)