# NOTE: import paths are an assumption -- they follow the old python-cuda
# bindings, which expose the legacy CUBLAS/runtime API used below; the
# exact module layout may differ in your installation.
from cuda import cuda, cublas                 # namespaced calls (gpu_sdot)
from cuda.cublas import *                     # bare cublas* calls below
from cuda.cuda import cudaThreadSynchronize   # bare sync call below
from cuda.sugar.memory import Linear          # device-array wrapper

def gpu_sdot(a, b):
    """Single-precision dot product on the GPU.

    Expects a as a 1 x n row vector and b as an n x 1 column vector.
    """
    assert a.size == b.size
    assert a.shape[0] == b.shape[1]
    # init cublas
    cublas.cublasInit()
    # allocate device vectors and copy data from host
    d_X = Linear(a.shape).from_numpy(a)
    d_Y = Linear(b.shape).from_numpy(b)
    # execute cublasSdot and sync threads
    gpu_result = cublas.cublasSdot(a.shape[1], d_X.ref, 1, d_Y.ref, 1)
    cuda.cudaThreadSynchronize()
    # shutdown
    cublas.cublasShutdown()
    return gpu_result
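A minimal usage sketch, assuming the imports above resolve and a CUDA device is available. The shapes follow the function's own assertions: a 1 x n row vector dotted with an n x 1 column vector; the numpy call is only a CPU cross-check.

import numpy as np

x = np.random.rand(1, 4).astype(np.float32)   # 1 x n row vector
y = np.random.rand(4, 1).astype(np.float32)   # n x 1 column vector
print(gpu_sdot(x, y))        # dot product computed by CUBLAS
print(np.dot(x, y)[0, 0])    # CPU reference value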
def gpu_saxpy(a, b, alpha):
    # init cublas lib
    cublasInit()
    # allocate device vectors from host
    d_X = Linear(a.shape).from_numpy(a)
    d_Y = Linear(b.shape).from_numpy(b)
    # execute cublasSaxpy and sync threads
    cublasSaxpy(a.shape[1], alpha, d_X.ref, 1, d_Y.ref, 1)
    cudaThreadSynchronize()
    return d_Y.to_numpy()
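A usage sketch for gpu_saxpy, which computes alpha * x + y on the GPU. The 1 x n row-vector shape is an assumption inferred from the function reading the length from a.shape[1]:

import numpy as np

x = np.random.rand(1, 8).astype(np.float32)
y = np.random.rand(1, 8).astype(np.float32)
result = gpu_saxpy(x, y, 2.0)              # 2.0 * x + y on the GPU
print(np.allclose(result, 2.0 * x + y))    # CPU cross-check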
def gpu_sgemm(a, b, alpha=1):
    """Single-precision matrix multiplication on the GPU.

    Expects two two-dimensional numpy arrays such that
    a.shape[1] == b.shape[0]. Optionally specify alpha for
    scalar multiplication.
    """
    # init cublas
    cublasInit()
    assert a.shape[1] == b.shape[0]
    c_shape = (a.shape[0], b.shape[1])
    # allocate device matrices from host (column-major for CUBLAS)
    dA = Linear(a.shape, order='F').from_numpy(a)
    dB = Linear(b.shape, order='F').from_numpy(b)
    dC = Linear(c_shape, order='F')
    # transpose a/b? 't' = yes, 'n' = no
    transa = 'n'
    transb = 'n'
    # compute C = alpha * A * B with CUBLAS (beta = 0)
    cublasSgemm(transa, transb, a.shape[0], b.shape[1], a.shape[1],
                alpha, dA.ref, a.shape[0], dB.ref, b.shape[0],
                0, dC.ref, a.shape[0])
    cudaThreadSynchronize()
    # shutdown
    cublasShutdown()
    return dC.to_numpy()
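A usage sketch for gpu_sgemm with a CPU cross-check; the tolerance is loose because the GPU result is computed in single precision:

import numpy as np

a = np.random.rand(4, 3).astype(np.float32)
b = np.random.rand(3, 5).astype(np.float32)
c = gpu_sgemm(a, b)                              # 4 x 5 product on the GPU
print(np.allclose(c, np.dot(a, b), atol=1e-4))   # CPU reference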