def forward(self, matrix1, matrix2):
    with torch.cuda.device_of(matrix1):
        dim1, dim2 = matrix1.size()
        dim2, dim3 = matrix2.size()  # inner dimensions must agree
        output = matrix1.new(dim1, dim3)

        handle = torch.cuda.current_blas_handle()
        stream = torch.cuda.current_stream()
        cublas.cublasSetStream(handle, stream)

        # cuBLAS is column-major while PyTorch tensors are row-major.
        # Computing B^T * A^T in column-major order yields (A * B)^T, which is
        # exactly A * B when the buffer is read back as a row-major tensor,
        # so the operands are passed in swapped order with 'n', 'n'.
        if isinstance(matrix1, torch.cuda.FloatTensor):
            cublas.cublasSgemm(handle, 'n', 'n', dim3, dim1, dim2, 1,
                               matrix2.data_ptr(), dim3,
                               matrix1.data_ptr(), dim2, 0,
                               output.data_ptr(), dim3)
        elif isinstance(matrix1, torch.cuda.DoubleTensor):
            cublas.cublasDgemm(handle, 'n', 'n', dim3, dim1, dim2, 1,
                               matrix2.data_ptr(), dim3,
                               matrix1.data_ptr(), dim2, 0,
                               output.data_ptr(), dim3)
    self.save_for_backward(matrix1, matrix2)
    return output
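The forward above saves both operands, so a matching backward for the same old-style torch.autograd.Function API has to return grad_matrix1 = grad_output @ matrix2^T and grad_matrix2 = matrix1^T @ grad_output. The following is a minimal sketch, not part of the original code: it covers only the float path and derives its transpose flags and leading dimensions from the same row-major/column-major swap trick used in the forward.

def backward(self, grad_output):
    matrix1, matrix2 = self.saved_tensors
    with torch.cuda.device_of(matrix1):
        dim1, dim2 = matrix1.size()
        _, dim3 = matrix2.size()
        grad_matrix1 = matrix1.new(dim1, dim2)
        grad_matrix2 = matrix2.new(dim2, dim3)

        handle = torch.cuda.current_blas_handle()

        # grad_matrix1 = grad_output @ matrix2^T
        # column-major: grad1^T = matrix2^T(cm) @ grad_output(cm)
        cublas.cublasSgemm(handle, 't', 'n', dim2, dim1, dim3, 1,
                           matrix2.data_ptr(), dim3,
                           grad_output.data_ptr(), dim3, 0,
                           grad_matrix1.data_ptr(), dim2)

        # grad_matrix2 = matrix1^T @ grad_output
        # column-major: grad2^T = grad_output(cm) @ matrix1^T(cm)
        cublas.cublasSgemm(handle, 'n', 't', dim3, dim2, dim1, 1,
                           grad_output.data_ptr(), dim3,
                           matrix1.data_ptr(), dim2, 0,
                           grad_matrix2.data_ptr(), dim3)
    return grad_matrix1, grad_matrix2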
def mult_BLAS():
    alpha = np.float64(1.0)  # no prefactor
    beta = np.float64(0.0)   # the C matrix is not accumulated into, so beta = 0.0
    # m, k, n = ud.basis_size, ud.basis_size, ud.basis_size**2
    handle = cublas.cublasCreate()  # create the handle once and reuse it; creating
                                    # a new one per GEMM call is slow and leaks handles
    t0 = time.perf_counter()        # time.clock() was removed in Python 3.8
    for a in range(100):
        cublas.cublasDgemm(handle=handle,
                           transa='n', transb='n',
                           m=ud.i, n=ud.j_k, k=ud.i_prime,
                           lda=ud.i, ldb=ud.i_prime, ldc=ud.i,
                           alpha=alpha, beta=beta,
                           A=T_gpu.gpudata, B=v_x_gpu.gpudata, C=U_x_gpu.gpudata)
        cublas.cublasDgemm(handle=handle,
                           transa='n', transb='n',
                           m=ud.i, n=ud.j_k, k=ud.i_prime,
                           lda=ud.i, ldb=ud.i_prime, ldc=ud.i,
                           alpha=alpha, beta=beta,
                           A=T_gpu.gpudata, B=v_y_gpu.gpudata, C=U_y_gpu.gpudata)
        cublas.cublasDgemm(handle=handle,
                           transa='n', transb='n',
                           m=ud.i, n=ud.j_k, k=ud.i_prime,
                           lda=ud.i, ldb=ud.i_prime,
                           ldc=ud.i,  # was ud.i_prime; ldc must be >= m = ud.i, as in the other calls
                           alpha=alpha, beta=beta,
                           A=T_gpu.gpudata, B=v_z_gpu.gpudata, C=U_z_gpu.gpudata)
        '''cublas.cublasDgemm(handle=handle,
                           transa='n', transb='n',
                           m=ud.i, n=ud.j_k, k=ud.i_prime,
                           lda=ud.i, ldb=ud.i_prime, ldc=ud.i,
                           alpha=alpha, beta=beta,
                           A=pot_gpu.gpudata, B=v_x_gpu.gpudata, C=potential_gpu.gpudata)'''
    # cuBLAS calls are asynchronous: synchronize before reading the timer
    # (assumes import pycuda.autoinit at module level)
    pycuda.autoinit.context.synchronize()
    cublas.cublasDestroy(handle)
    print(time.perf_counter() - t0, "mult_BLAS timer")
    return
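A sanity check on one of the products, assuming hypothetical host arrays T and v_x of shapes (ud.i, ud.i_prime) and (ud.i_prime, ud.j_k) were uploaded in Fortran order, which is the layout the lda/ldb/ldc arguments above imply:

# hypothetical host-side counterparts of T_gpu / v_x_gpu, Fortran-ordered
U_x_ref = T @ v_x
# the GEMM wrote a column-major buffer; reinterpret it before comparing
U_x_host = U_x_gpu.get().ravel().reshape(U_x_ref.shape, order='F')
assert np.allclose(U_x_host, U_x_ref)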
def diag_gpu(A, v1):
    # handle
    current_handle = cublas.cublasCreate()
    m = A.shape[0]
    Q = np.zeros((m, m), dtype=np.float64)
    # Q[0, :] = 0.0  # implied
    Q[1, :] = v1.copy()
    beta = np.zeros(m, dtype=np.float64)
    alpha = np.zeros(m, dtype=np.float64)

    # move data onto the GPU
    A_gpu = gpuarray.to_gpu(A)
    Q_gpu = gpuarray.to_gpu(Q)
    beta_gpu = gpuarray.to_gpu(beta)
    alpha_gpu = gpuarray.to_gpu(alpha)
    w = gpuarray.zeros(m, dtype=np.float64)

    # we define three kernels for the simple element-wise arithmetic
    w_scale = ElementwiseKernel(
        arguments="double *w, double *alpha, double *beta, double *Q1, double *Q2, int loop_index",
        operation="w[i] = w[i] - (alpha[loop_index] * Q1[i]) - (beta[loop_index] * Q2[i])",
        name="element_wise_w_building")
    # using -= to do in-place subtraction gives an incorrect answer
    norm_krnl = ReductionKernel(np.float64, neutral="0.0",
                                reduce_expr="a+b", map_expr="x[i]*x[i]",
                                arguments="double *x")
    ediv = ElementwiseKernel(
        arguments="double *a, double *b, double *c, int loop_index",
        operation="a[i] = b[i] / c[loop_index+1]",
        name="element_wise_division")  # the kernel name must not contain spaces

    for i in range(1, m - 1):
        # w = A @ Q[i, :]
        cublas.cublasDgemv(handle=current_handle, trans='T',
                           m=m, n=m,  # Hermitian matrix
                           alpha=1.0, beta=0.0,
                           A=A_gpu.gpudata, lda=m,
                           x=Q_gpu[i, :].gpudata, incx=1,
                           y=w.gpudata, incy=1)
        # alpha[i] = w . Q[i, :]  (a 1x1 GEMM used as a dot product)
        cublas.cublasDgemm(handle=current_handle, transa='n', transb='n',
                           m=1, n=1, k=m,
                           lda=1, ldb=m, ldc=1,
                           alpha=1.0, beta=0.0,
                           A=w.gpudata, B=Q_gpu[i, :].gpudata,
                           C=alpha_gpu[i].gpudata)
        # w = w - alpha[i] * Q[i, :] - beta[i] * Q[i-1, :]
        w_scale(w, alpha_gpu, beta_gpu, Q_gpu[i, :], Q_gpu[i - 1, :], i)
        beta_gpu[i + 1] = cumath.sqrt(norm_krnl(w))
        # Q[i+1, :] = w / beta[i+1]
        ediv(Q_gpu[i + 1, :], w, beta_gpu, i)
    # end of loop

    # last two steps, outside the loop
    cublas.cublasDgemv(handle=current_handle, trans='T',
                       m=m, n=m,  # Hermitian matrix
                       alpha=1.0, beta=0.0,
                       A=A_gpu.gpudata, lda=m,
                       x=Q_gpu[-1, :].gpudata, incx=1,
                       y=w.gpudata, incy=1)
    cublas.cublasDgemm(handle=current_handle, transa='n', transb='n',
                       m=1, n=1, k=m,
                       lda=1, ldb=m, ldc=1,
                       alpha=1.0, beta=0.0,
                       A=w.gpudata, B=Q_gpu[-1, :].gpudata,
                       C=alpha_gpu[-1].gpudata)
    cublas.cublasDestroy(current_handle)

    # retrieve the alphas and betas
    alpha_cpu = alpha_gpu.get()
    beta_cpu = beta_gpu.get()
    print("GPU: ", alpha_cpu, beta_cpu, sep="\n\n")
    # make tridiagonal matrix out of alpha and beta (see the sketch below)
    # Tri = np.zeros(matrix_size)
    return
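The commented-out final step, building the tridiagonal matrix from the recovered coefficients, could look like the following sketch. It assumes the indexing convention used above, where alpha_cpu[0] and beta_cpu[:2] are unused padding, so the usable diagonal is alpha_cpu[1:] and the off-diagonal is beta_cpu[2:]; scipy.linalg.eigh_tridiagonal then yields the Ritz values without forming the dense matrix.

import numpy as np
from scipy.linalg import eigh_tridiagonal

diag = alpha_cpu[1:]    # m-1 diagonal entries
offdiag = beta_cpu[2:]  # m-2 off-diagonal entries
eigvals, eigvecs = eigh_tridiagonal(diag, offdiag)

# or, building the dense tridiagonal matrix explicitly:
Tri = np.diag(diag) + np.diag(offdiag, 1) + np.diag(offdiag, -1)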
# allocate space on the GPU for the results
U_x_gpu = gpuarray.zeros((basis_size, basis_size * basis_size), np.float64)  # an empty matrix of the right size
U_y_gpu = gpuarray.zeros((basis_size, basis_size * basis_size), np.float64)  # an empty matrix of the right size
U_z_gpu = gpuarray.zeros((basis_size, basis_size * basis_size), np.float64)  # an empty matrix of the right size
# m, k, n = ud.basis_size, ud.basis_size, ud.basis_size**2

for basis in range(BASIS_SIZE):
    prepare_gpu(basis)  # set it up

i, j_k, i_prime = basis_size, basis_size * basis_size, basis_size

handle = cublas.cublasCreate()      # one handle, reused across iterations
initial_time = time.perf_counter()  # time.clock() was removed in Python 3.8
for num_iter in range(ITERATIONS):
    cublas.cublasDgemm(handle=handle,
                       transa='n', transb='n',
                       m=i, n=j_k, k=i_prime,
                       lda=i, ldb=i_prime, ldc=i,
                       alpha=ud.alpha, beta=ud.beta,
                       A=T_gpu.gpudata, B=v_x_gpu.gpudata, C=U_x_gpu.gpudata)
    cublas.cublasDgemm(handle=handle,
                       transa='n', transb='n',
                       m=i, n=j_k, k=i_prime,
                       lda=i, ldb=i_prime, ldc=i,
                       alpha=ud.alpha, beta=ud.beta,
                       A=T_gpu.gpudata, B=v_y_gpu.gpudata, C=U_y_gpu.gpudata)
    cublas.cublasDgemm(handle=handle,
                       transa='n', transb='n',
                       m=i, n=j_k, k=i_prime,
                       lda=i, ldb=i_prime, ldc=i,
                       # the original snippet breaks off after ldc; by symmetry with
                       # the x and y calls the remaining arguments are presumably:
                       alpha=ud.alpha, beta=ud.beta,
                       A=T_gpu.gpudata, B=v_z_gpu.gpudata, C=U_z_gpu.gpudata)
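The three GEMMs above realize the rank-3 contraction U[i, j, k] = sum over i' of T[i, i'] * v[i', j, k] by flattening the (j, k) index pair into a single column index of size basis_size**2. A self-contained NumPy sketch of that equivalence (n, T, and v_x here are hypothetical stand-ins, not the arrays from the snippet):

import numpy as np

n = 4  # stands in for basis_size
T = np.random.rand(n, n)
v_x = np.random.rand(n, n, n)

# the contraction written explicitly ...
U_ein = np.einsum('ip,pjk->ijk', T, v_x)
# ... and as the single matrix-matrix product the GEMM performs
U_mat = (T @ v_x.reshape(n, n * n)).reshape(n, n, n)
assert np.allclose(U_ein, U_mat)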
import pycuda.autoinit
import pycuda.gpuarray as gpuarray
import numpy as np
import skcuda.cublas as cublas

# Fortran (column-major) order matches what cuBLAS expects
A = np.array(([1, 2, 3], [4, 5, 6]), order='F').astype(np.float64)
B = np.array(([7, 8, 1, 5], [9, 10, 0, 9], [11, 12, 5, 5]), order='F').astype(np.float64)

A_gpu = gpuarray.to_gpu(A)
B_gpu = gpuarray.to_gpu(B)
m, k = A_gpu.shape
k, n = B_gpu.shape
C_gpu = gpuarray.empty((m, n), np.float64)

alpha = np.float64(1.0)
beta = np.float64(0.0)

cublas_handle = cublas.cublasCreate()
cublas.cublasDgemm(cublas_handle, 'n', 'n', m, n, k,
                   alpha, A_gpu.gpudata, m,
                   B_gpu.gpudata, k,
                   beta, C_gpu.gpudata, m)
cublas.cublasDestroy(cublas_handle)

# reinterpret the column-major result buffer before reading it back
C_gpu = C_gpu.reshape(C_gpu.shape, order='F')
print(np.dot(A, B))
print(C_gpu)
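For completeness, the same product works with ordinary C-ordered (row-major) NumPy arrays: instead of converting to Fortran order, one can use the operand-swap trick from the forward() snippet at the top, computing B^T * A^T in column-major order, which is exactly the row-major A @ B. A minimal sketch under that assumption:

import pycuda.autoinit
import pycuda.gpuarray as gpuarray
import numpy as np
import skcuda.cublas as cublas

A = np.random.rand(2, 3)  # default C (row-major) order
B = np.random.rand(3, 4)
m, k = A.shape
_, n = B.shape

A_gpu = gpuarray.to_gpu(A)
B_gpu = gpuarray.to_gpu(B)
C_gpu = gpuarray.empty((m, n), np.float64)

handle = cublas.cublasCreate()
# swapped operands: column-major B^T (n x k) times A^T (k x m) gives C^T,
# whose buffer read row-major is C = A @ B
cublas.cublasDgemm(handle, 'n', 'n', n, m, k,
                   np.float64(1.0), B_gpu.gpudata, n,
                   A_gpu.gpudata, k,
                   np.float64(0.0), C_gpu.gpudata, n)
cublas.cublasDestroy(handle)

assert np.allclose(C_gpu.get(), A @ B)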