def cublas_dot(A, B, C, repeat=1):
    # Strides are in bytes; divide by 4 for float32 element counts.
    lda = max(A.strides) // 4
    ldb = max(B.strides) // 4
    ldc = max(C.strides) // 4

    opA = 't' if A.is_trans else 'n'
    opB = 't' if B.is_trans else 'n'
    op  = opB + opA

    m = A.shape[0]
    n = B.shape[1]
    k = A.shape[1]

    start.record()
    # Swap A and B to map from C order to Fortran order.
    for r in range(repeat):
        cublas.cublasSgemm(handle, opB, opA, n, m, k, 1.0,
                           B.gpudata, ldb, A.gpudata, lda,
                           0.0, C.gpudata, ldc)
    end.record()
    end.synchronize()

    msecs  = end.time_since(start) / repeat
    gflops = (m * n * k * 2.0) / (msecs * 1000000.0)
    print("%7.3f msecs %4.0f gflops (%s_%s : %d,%d,%d)" %
          (msecs, gflops, "cublas", op, m, n, k))
    return gflops
def cublas_dot(A, B, C, alpha=1.0, beta=0.0, repeat=1):
    # Strides are in bytes; divide by 4 for float32 element counts.
    lda = max(A.strides) // 4
    ldb = max(B.strides) // 4
    ldc = max(C.strides) // 4

    opA = 't' if A.is_trans else 'n'
    opB = 't' if B.is_trans else 'n'
    op  = opB + opA

    m = A.shape[0]
    n = B.shape[1]
    k = A.shape[1]

    start.record()
    # Swap A and B to map from C order to Fortran order.
    for r in range(repeat):
        cublas.cublasSgemm(handle, opB, opA, n, m, k, alpha,
                           B.gpudata, ldb, A.gpudata, lda,
                           beta, C.gpudata, ldc)
    end.record()
    end.synchronize()

    msecs  = end.time_since(start) / repeat
    gflops = (m * n * k * 2.0) / (msecs * 1000000.0)
    print("%7.3f msecs %4.0f gflops (%s_%s : %d,%d,%d)" %
          (msecs, gflops, "cublas", op, m, n, k))
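# --- Added sketch (not from the original snippets): a minimal standalone
# check of the swap trick the two functions above rely on, plus the
# module-level globals (`handle`, `start`, `end`) they assume exist.
# A C-order array reinterpreted as column-major is its transpose, so
# computing B^T * A^T in Fortran order yields A.dot(B) back in C order.
import numpy as np
import pycuda.autoinit
import pycuda.driver as cu
import pycuda.gpuarray as gpuarray
from skcuda import cublas

handle = cublas.cublasCreate()
start, end = cu.Event(), cu.Event()

m, k, n = 64, 128, 32
A = np.random.rand(m, k).astype(np.float32)  # C (row-major) order
B = np.random.rand(k, n).astype(np.float32)
A_gpu, B_gpu = gpuarray.to_gpu(A), gpuarray.to_gpu(B)
C_gpu = gpuarray.empty((m, n), np.float32)

# Pass B first: cuBLAS sees B^T (n x k, ld n) times A^T (k x m, ld k)
# and writes C^T (n x m, ld n), which is exactly C in row-major order.
cublas.cublasSgemm(handle, 'n', 'n', n, m, k, 1.0,
                   B_gpu.gpudata, n, A_gpu.gpudata, k,
                   0.0, C_gpu.gpudata, n)
assert np.allclose(C_gpu.get(), A.dot(B), atol=1e-3)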
def cublas_dot(op, A, B, C, repeat=1, warmup=False):
    # Arrays are assumed column-major here: the leading dimension is shape[0].
    lda = A.shape[0]
    ldb = B.shape[0]
    ldc = C.shape[0]

    m = C.shape[0]
    n = C.shape[1]
    k = A.shape[1] if op[0] == 'n' else A.shape[0]

    if warmup:
        # Untimed pass: absorbs one-time setup costs and lets GPU clocks ramp.
        for r in range(repeat):
            cublas.cublasSgemm(handle, op[0], op[1], m, n, k, 1.0,
                               A.gpudata, lda, B.gpudata, ldb,
                               0.0, C.gpudata, ldc)

    start.record()
    for r in range(repeat):
        cublas.cublasSgemm(handle, op[0], op[1], m, n, k, 1.0,
                           A.gpudata, lda, B.gpudata, ldb,
                           0.0, C.gpudata, ldc)
    end.record()
    end.synchronize()

    msecs  = end.time_since(start) / repeat
    gflops = (m * n * k * 2.0) / (msecs * 1000000.0)
    print("%7.3f msecs %4.0f gflops (%s: %d,%d,%d)" % (msecs, gflops, op, m, n, k))
    return msecs
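# --- Added sketch (hypothetical driver, not from the source): exercising the
# warmup variant above. The untimed first pass hides one-time module-load
# cost and clock ramp-up, so only steady-state iterations are reported.
# Reuses the `handle`, `start`, `end` globals set up in the earlier sketch;
# shapes here are only valid for op='nn'.
import numpy as np
import pycuda.gpuarray as gpuarray

m, n, k = 1024, 1024, 1024
# Fortran order matches the lda = shape[0] convention this variant uses.
A = gpuarray.to_gpu(np.asfortranarray(np.random.rand(m, k).astype(np.float32)))
B = gpuarray.to_gpu(np.asfortranarray(np.random.rand(k, n).astype(np.float32)))
C = gpuarray.to_gpu(np.asfortranarray(np.zeros((m, n), dtype=np.float32)))

cublas_dot('nn', A, B, C, repeat=10, warmup=True)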
def compute_sgemm(col, kernel, bias, stream, handle):
    alpha = np.float32(1.0)
    beta  = np.float32(1.0)

    m = np.int32(kernel.shape[0])
    k = np.int32(kernel.shape[1])
    n = np.int32(col.shape[1])
    flop = 2 * m * n * k

    # cublas.cublasSetStream(handle, stream.handle)
    cublas.cublasSgemm(handle, 'n', 'n', n, m, k, alpha,
                       col.ptr, n, kernel.ptr, k,
                       beta, bias.ptr, n)
def compute_sgemm(col, kernel, bias, handle):
    alpha = np.float32(1.0)
    beta  = np.float32(1.0)

    # (m x k) x (k x n)
    m = np.int32(kernel.shape[0])
    k = np.int32(kernel.shape[1])
    n = np.int32(col.shape[1])

    # 2k - 1 operations per entry in the C matrix; C is m*n, so m*n*(2k - 1)
    # for the matrix multiply, plus another m*n additions to perform AB + C.
    # Lower bound, ignoring the alpha and beta multiplications.
    flop = 2 * m * n * k

    cublas.cublasSgemm(handle, 'n', 'n', n, m, k, alpha,
                       col.ptr, n, kernel.ptr, k,
                       beta, bias.ptr, n)
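# --- Added usage sketch (shapes and names assumed, not from the source): in
# an im2col convolution, `kernel` is the filter bank flattened to
# (num_filters, C*KH*KW), `col` holds the unrolled input patches
# (C*KH*KW, out_positions), and `bias` starts as the per-filter bias
# broadcast across columns; beta = 1.0 makes sgemm accumulate the product
# on top of it, so `bias` doubles as the output buffer.
import numpy as np
import pycuda.autoinit
import pycuda.gpuarray as gpuarray
from skcuda import cublas

handle = cublas.cublasCreate()
num_filters, patch, positions = 16, 27, 900   # e.g. 3x3x3 filters, 30x30 output
kernel = gpuarray.to_gpu(np.random.rand(num_filters, patch).astype(np.float32))
col = gpuarray.to_gpu(np.random.rand(patch, positions).astype(np.float32))
b = np.random.rand(num_filters, 1).astype(np.float32)
bias = gpuarray.to_gpu(np.tile(b, (1, positions)))  # broadcast bias; also output

compute_sgemm(col, kernel, bias, handle)
assert np.allclose(bias.get(), kernel.get().dot(col.get()) + b, atol=1e-3)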
def compute_sgemm(weights, values, biases, handle, m, k, n):
    alpha = np.float32(1.0)
    beta  = np.float32(1.0)

    # To do C = A*B + C we actually do C_t = B_t*A_t + C_t and then transpose;
    # the transposing is all done implicitly in the copy to and from the GPU,
    # so we just note that we do BA, not AB.
    flop  = float(2 * m * n * k)
    gflop = flop / 10**9

    # Uncomment the event block below for precise sgemm timing (the matching
    # end-of-timing half is sketched after this function):
    # start = cu.Event()
    # end = cu.Event()
    # start.record()

    # We want biases = weights*values + biases, which has dimensions
    # (m*n) = (m*k)*(k*n) + (m*n), but instead use transposes as noted above.
    cublas.cublasSgemm(handle, 'n', 'n', n, m, k, alpha,
                       values.ptr, n, weights.ptr, k,
                       beta, biases.ptr, n)
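# --- Added sketch reconstructing the timing the commented block above
# alludes to; the pattern mirrors the event timing in the benchmarks at the
# top of this listing. `weights`, `values`, `biases` are assumed to be
# GPUArrays with the (m*k), (k*n), (m*n) shapes described in the comments.
import pycuda.driver as cu

start, end = cu.Event(), cu.Event()
start.record()
compute_sgemm(weights, values, biases, handle, m, k, n)
end.record()
end.synchronize()
msecs = end.time_since(start)
gflops = (2.0 * m * n * k / 1e9) / (msecs / 1e3)
print("%7.3f msecs %4.0f gflops" % (msecs, gflops))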
def x_outer_y_add_O(self, alpha, x, y, beta, O, result=None):
    '''
    Calculate alpha*(x outer y) + beta*O and store the result in O.

    cublasSgemm(handle, transa, transb, m, n, k,
                alpha, A, lda, B, ldb, beta, C, ldc)

    x_outer_y_add_O(float alpha, float* x, float* y, float beta,
                    float* O, float* result, uint O_col, uint O_row)
    '''
    if result is None:
        # Rank-1 update via sgemm with k = 1: O <- alpha*x*y^T + beta*O.
        cublas.cublasSgemm(self.handle, 'T', 'N',
                           x.size, y.size, 1,
                           alpha, x.gpudata, 1, y.gpudata, 1,
                           beta, O.gpudata, x.size)
    else:
        O_col = O.shape[0]
        O_row = O.shape[1]
        self.x_outer_y_add_O_kernel(np.float32(alpha), x.gpudata, y.gpudata,
                                    np.float32(beta), O.gpudata,
                                    result.gpudata, np.uint32(O_col), np.uint32(O_row),
                                    block=(32, 32, 1),
                                    grid=(int(O_row / 32) + 1, int(O_col / 32) + 1))
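# --- Added standalone check (layout assumed, not from the source) of the
# sgemm branch above: alpha*(x outer y) + beta*O is a rank-1 update, i.e.
# an sgemm with k = 1; O is stored column-major, so ldc = x.size.
import numpy as np
import pycuda.autoinit
import pycuda.gpuarray as gpuarray
from skcuda import cublas

handle = cublas.cublasCreate()
x = np.random.rand(8).astype(np.float32)
y = np.random.rand(5).astype(np.float32)
O = np.asfortranarray(np.random.rand(8, 5).astype(np.float32))
x_gpu, y_gpu, O_gpu = gpuarray.to_gpu(x), gpuarray.to_gpu(y), gpuarray.to_gpu(O)

cublas.cublasSgemm(handle, 'T', 'N', x.size, y.size, 1,
                   2.0, x_gpu.gpudata, 1, y_gpu.gpudata, 1,
                   0.5, O_gpu.gpudata, x.size)
assert np.allclose(O_gpu.get(), 2.0 * np.outer(x, y) + 0.5 * O, atol=1e-5)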
def sgemm(*args):
    cublas.cublasSgemm(handle, *args)