예제 #1
0
def cublas_dot(A, B, C, repeat=1):
    """Benchmark C = A * B with cuBLAS sgemm and return achieved gflops.

    Relies on module-level `cublas`, `handle`, and the CUDA events
    `start`/`end` being set up elsewhere in this module.

    Arguments:
        A, B, C: GPU matrices exposing .strides, .shape, .is_trans, .gpudata.
        repeat:  number of timed sgemm calls to average over.

    Returns:
        Achieved gflops (float).
    """
    # Leading dimensions in elements: strides are in bytes, sgemm is
    # single precision, so divide by 4.
    lda = max(A.strides) // 4
    ldb = max(B.strides) // 4
    ldc = max(C.strides) // 4

    opA = 't' if A.is_trans else 'n'
    opB = 't' if B.is_trans else 'n'
    op  = opB + opA

    m = A.shape[0]
    n = B.shape[1]
    k = A.shape[1]

    start.record()

    # Swap A and B to map from C (row-major) order to Fortran (column-major).
    for _ in range(repeat):
        cublas.cublasSgemm(handle, opB, opA, n, m, k, 1.0, B.gpudata, ldb,
                           A.gpudata, lda, 0.0, C.gpudata, ldc)

    end.record()
    end.synchronize()
    msecs = end.time_since(start) / repeat
    gflops = (m * n * k * 2.0) / (msecs * 1000000.0)
    # Parenthesized print works identically under Python 2 and 3
    # (consistent with the other examples on this page).
    print("%7.3f msecs %4.0f gflops (%s_%s   : %d,%d,%d)" %
          (msecs, gflops, "cublas", op, m, n, k))

    return gflops
예제 #2
0
파일: cublas.py 프로젝트: zky001/nervanagpu
def cublas_dot(A, B, C, alpha=1.0, beta=0.0, repeat=1):
    """Benchmark C = alpha * A * B + beta * C with cuBLAS sgemm.

    Relies on module-level `cublas`, `handle`, and the CUDA events
    `start`/`end` being set up elsewhere in this module.

    Arguments:
        A, B, C:     GPU matrices exposing .strides, .shape, .is_trans, .gpudata.
        alpha, beta: sgemm scaling factors.
        repeat:      number of timed sgemm calls to average over.

    Returns:
        Achieved gflops (float); previously returned None — added for
        consistency with the non-alpha/beta variant of this function.
    """
    # Leading dimensions in elements: strides are in bytes, sgemm is
    # single precision, so divide by 4.
    lda = max(A.strides) // 4
    ldb = max(B.strides) // 4
    ldc = max(C.strides) // 4

    opA = 't' if A.is_trans else 'n'
    opB = 't' if B.is_trans else 'n'
    op = opB + opA

    m = A.shape[0]
    n = B.shape[1]
    k = A.shape[1]

    start.record()

    # Swap A and B to map from C (row-major) order to Fortran (column-major).
    for _ in range(repeat):
        cublas.cublasSgemm(handle, opB, opA, n, m, k, alpha, B.gpudata, ldb,
                           A.gpudata, lda, beta, C.gpudata, ldc)

    end.record()
    end.synchronize()
    msecs = end.time_since(start) / repeat
    gflops = (m * n * k * 2.0) / (msecs * 1000000.0)
    # Parenthesized print works identically under Python 2 and 3.
    print("%7.3f msecs %4.0f gflops (%s_%s   : %d,%d,%d)" % (
        msecs, gflops, "cublas", op, m, n, k))

    return gflops
예제 #3
0
def cublas_dot(op, A, B, C, repeat=1, warmup=False):
    """Benchmark cuBLAS sgemm for C = op[0](A) * op[1](B) and return msecs.

    Relies on module-level `cublas`, `handle`, and the CUDA events
    `start`/`end` being set up elsewhere in this module.

    Arguments:
        op:      two-char transpose spec, e.g. 'nn', 'tn' ('n' = no transpose).
        A, B, C: GPU matrices exposing .shape and .gpudata.
        repeat:  number of timed sgemm calls to average over.
        warmup:  if True, run the same calls untimed first to exclude
                 one-time launch/JIT costs from the measurement.

    Returns:
        Average milliseconds per sgemm call (float).
    """
    lda = A.shape[0]
    ldb = B.shape[0]
    ldc = C.shape[0]

    m = C.shape[0]
    n = C.shape[1]
    # Inner dimension depends on whether A is transposed.
    k = A.shape[1] if op[0] == 'n' else A.shape[0]

    if warmup:
        for _ in range(repeat):
            cublas.cublasSgemm(handle, op[0], op[1], m, n, k, 1.0, A.gpudata,
                               lda, B.gpudata, ldb, 0.0, C.gpudata, ldc)

    start.record()

    # Swap A and B to map from C order to Fortran
    for _ in range(repeat):
        cublas.cublasSgemm(handle, op[0], op[1], m, n, k, 1.0, A.gpudata,
                           lda, B.gpudata, ldb, 0.0, C.gpudata, ldc)

    end.record()
    end.synchronize()
    msecs = end.time_since(start) / repeat
    gflops = (m * n * k * 2.0) / (msecs * 1000000.0)
    # Parenthesized print works identically under Python 2 and 3
    # (consistent with the variant of this function later on this page).
    print("%7.3f msecs %4.0f gflops (%s: %d,%d,%d)" %
          (msecs, gflops, op, m, n, k))

    return msecs
예제 #4
0
def cublas_dot(op, A, B, C, repeat=1, warmup=False):
    """Time cuBLAS sgemm for C = op[0](A) * op[1](B).

    `op` is a two-char transpose spec ('n' = no transpose). Uses the
    module-level `cublas`, `handle`, and CUDA events `start`/`end`.
    Returns the average milliseconds per call; optionally runs an
    untimed warmup pass first.
    """
    lda = A.shape[0]
    ldb = B.shape[0]
    ldc = C.shape[0]

    m, n = C.shape[0], C.shape[1]
    # Inner dimension comes from A's columns unless A is transposed.
    k = A.shape[0] if op[0] != 'n' else A.shape[1]

    def _gemm():
        cublas.cublasSgemm(handle, op[0], op[1], m, n, k, 1.0,
                           A.gpudata, lda, B.gpudata, ldb, 0.0,
                           C.gpudata, ldc)

    if warmup:
        for _ in range(repeat):
            _gemm()

    start.record()
    # Swap A and B to map from C order to Fortran
    for _ in range(repeat):
        _gemm()
    end.record()
    end.synchronize()

    msecs = end.time_since(start) / repeat
    gflops = (m * n * k * 2.0) / (msecs * 1000000.0)
    print("%7.3f msecs %4.0f gflops (%s: %d,%d,%d)" % (msecs,gflops,op,m,n,k))

    return msecs
예제 #5
0
def compute_sgemm(col, kernel, bias, stream, handle):
    """Accumulate kernel * col into bias via cuBLAS sgemm (beta = 1).

    Row-major (C-order) data viewed by column-major cuBLAS: issuing
    bias^T = col^T * kernel^T stores kernel*col + bias into bias.

    NOTE(review): `stream` is accepted but unused — the cublasSetStream
    call is commented out, so the sgemm runs on the handle's current
    stream. Parameter kept for interface compatibility.
    """
    alpha = np.float32(1.0)
    beta = np.float32(1.0)

    m = np.int32(kernel.shape[0])
    k = np.int32(kernel.shape[1])
    n = np.int32(col.shape[1])

    # cublas.cublasSetStream(handle, stream.handle)
    cublas.cublasSgemm(handle, 'n', 'n', n, m, k, alpha, col.ptr, n,
                       kernel.ptr, k, beta, bias.ptr, n)
예제 #6
0
def compute_sgemm(col, kernel, bias, handle):
    """Accumulate kernel * col into bias via cuBLAS sgemm (beta = 1).

    Shapes: (m x k) * (k x n) accumulated into the (m x n) bias.
    Row-major (C-order) data viewed by column-major cuBLAS: issuing
    bias^T = col^T * kernel^T stores kernel*col + bias into bias.

    Cost note: 2k-1 operations per entry of the m*n product plus m*n
    additions for AB+C, so ~2*m*n*k flops as a lower bound (ignoring the
    alpha/beta scalings).
    """
    alpha = np.float32(1.0)
    beta = np.float32(1.0)

    m = np.int32(kernel.shape[0])
    k = np.int32(kernel.shape[1])
    n = np.int32(col.shape[1])

    cublas.cublasSgemm(handle, 'n', 'n', n, m, k, alpha, col.ptr, n,
                       kernel.ptr, k, beta, bias.ptr, n)
예제 #7
0
def compute_sgemm(weights, values, biases, handle, m, k, n):
    """Accumulate weights * values into biases via cuBLAS sgemm (beta = 1).

    To compute C = A*B + C on row-major data with column-major cuBLAS we
    actually issue C^T = B^T * A^T; the transposes are implicit in the
    copy to/from the GPU, so we simply call sgemm with the operands
    swapped (BA rather than AB).
    """
    alpha = np.float32(1.0)
    beta = np.float32(1.0)

    # Flop estimate; gflop feeds the optional precise-timing code when
    # the commented blocks are enabled.
    flop = float(2 * m * n * k)
    gflop = flop / 10 ** 9

    # Uncomment the two blocks below for precise sgemm timing
    """
    start = cu.Event()
    end = cu.Event()
    start.record()
    """
    # biases = weights*values + biases: (m*n) = (m*k)*(k*n) + (m*n),
    # issued as the transposed product per the note above.
    cublas.cublasSgemm(handle, 'n', 'n', n, m, k, alpha, values.ptr, n,
                       weights.ptr, k, beta, biases.ptr, n)
    """
 def x_outer_y_add_O(self, alpha, x, y, beta, O, result = None):
     '''
     calc alpha*(x outer y) + beta*O store in O
     cublasSgemm(handle, transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc)

     x_outer_y_add_O(float alpha, float* x, float* y, float beta, float* O,
             float* result, uint O_col, uint O_row)

     When `result` is given, the custom elementwise kernel is launched
     instead, writing into `result` rather than accumulating into O.
     '''
     if result is None:
         # Rank-1 update via sgemm: a k=1 product of x^T and y accumulates
         # the outer product into O.
         cublas.cublasSgemm(self.handle, 'T', 'N',
                            x.size, y.size, 1,
                            alpha, x.gpudata, 1, y.gpudata, 1,
                            beta, O.gpudata, x.size)
     else:
         O_col = O.shape[0]
         O_row = O.shape[1]
         # BUG FIX: was `y, gpudata` (comma typo passing an undefined name
         # `gpudata`); the kernel needs y's device pointer, `y.gpudata`.
         self.x_outer_y_add_O_kernel(np.float32(alpha), x.gpudata, y.gpudata,
                                     np.float32(beta), O.gpudata,
                                     result.gpudata, np.uint32(O_col),
                                     np.uint32(O_row),
                                     block=(32, 32, 1),
                                     grid=(int(O_row / 32) + 1,
                                           int(O_col / 32) + 1))
예제 #9
0
 def sgemm(*args):
   # Thin pass-through: forward all sgemm arguments to cuBLAS using the
   # `handle` from the enclosing scope (presumably module- or class-level;
   # not visible here — confirm against the surrounding code).
   cublas.cublasSgemm(handle, *args)
예제 #10
0
 def sgemm(*args):
     # Thin pass-through: forward all sgemm arguments to cuBLAS using the
     # `handle` from the enclosing scope (presumably module- or class-level;
     # not visible here — confirm against the surrounding code).
     cublas.cublasSgemm(handle, *args)