예제 #1
0
def runTest(vlength = 128,loops = 1):
    """Benchmark single-precision AXPY: cuBLAS on the device vs. ``cpuSAXPY``.

    Two host vectors of ``vlength * vlength`` floats are filled by
    ``vectorInit``, copied to the device, and updated ``loops`` times with
    ``cublasSaxpy`` (alpha = 0.5).  The same update is then applied on the
    host with ``cpuSAXPY`` and the device result is compared against it
    through ``checkError``.  Timings, rates and the error figures are
    printed; nothing is returned.

    vlength -- side length; each vector holds vlength * vlength elements.
    loops   -- number of SAXPY passes inside each timed region.
    """
    count = vlength*vlength
    alpha = c_float(.5)
    elem_size = sizeof(c_float)

    cublasInit()

    # Host side: X, Y, and a separate buffer for the result read back
    # from the device.
    HostVector = c_float*count
    host_x = HostVector()
    host_y = HostVector()
    gpu_result = HostVector()
    for host_vec in (host_x, host_y):
        vectorInit(host_vec)

    # Device side: allocate both buffers, then upload X and Y.
    dev_x = c_void_p()
    dev_y = c_void_p()
    for dev_ptr in (dev_x, dev_y):
        cublasAlloc(count, elem_size, byref(dev_ptr))

    for host_vec, dev_ptr in ((host_x, dev_x), (host_y, dev_y)):
        cublasSetVector(count, elem_size, host_vec, 1, dev_ptr, 1)

    # SAXPY is one multiply and one add per element; 1e-9 scales to giga.
    gflop = (2.e-9*count)*float(loops)

    started = time()
    for _ in range(loops):
        cublasSaxpy(count, alpha, dev_x, 1, dev_y, 1)
    # Synchronise before reading the clock so the interval covers the
    # device work and not only the launches.
    cudaThreadSynchronize()
    gpu_secs = time()-started

    print("Processing time: %.3g sec" % gpu_secs)
    print("Gigaflops GPU: %.2f (%d)" % (gflop/gpu_secs, count))

    started = time()
    for _ in range(loops):
        cpuSAXPY(alpha, host_x, host_y)
    cpu_secs = time()-started

    print("\nProcessing time: %.3g sec" % cpu_secs)
    print("Gigaflops CPU: %.2f" % (gflop/cpu_secs))
    print("GPU vs. CPU  : %.2f" % (cpu_secs/gpu_secs))

    # Fetch the device Y and compare it with the host-computed Y.
    cublasGetVector(count, elem_size, dev_y, 1, gpu_result, 1)
    avg_err, max_err = checkError(host_y, gpu_result)
    print("\nAvg and max rel error = %.2e %.2e" % (avg_err, max_err))

    for dev_ptr in (dev_x, dev_y):
        cublasFree(dev_ptr)

    cublasShutdown()
예제 #2
0
def main(device, vlength=128, loops=1):
    """Time the ``gpuSAXPY`` kernel against ``cpuSAXPY`` and print one row.

    The kernel is looked up in ``device.functions``, configured through the
    ``cuParamSet*`` calls and launched ``loops`` times.  The device Y vector
    is copied back, the same update is run on the host, and a line with
    ``vlength``, the CPU rate and the GPU rate is printed.  When the global
    ``checkErrorFlag`` is true the host and device results are compared with
    ``checkError`` and the error figures are printed too.

    device  -- object exposing a ``functions`` mapping containing "gpuSAXPY".
    vlength -- number of elements in each vector.
    loops   -- number of launches / host passes inside each timed region.
    """
    count = vlength  ## Vector length
    alpha = c_float(.5)
    kernel = device.functions["gpuSAXPY"]

    HostVector = c_float * count
    host_x = HostVector()
    host_y = HostVector()
    gpu_result = HostVector()

    # Only X is initialised; Y keeps the zero fill ctypes arrays start with.
    fixedInit(host_x)

    dev_x = getMemory(host_x)
    dev_y = getMemory(host_y)

    # Kernel argument block: alpha at offset 0, then X, Y and the element
    # count at offsets 4, 8 and 12, for a total size of 16.
    cuFuncSetBlockShape(kernel, BLOCK_SIZE, 1, 1)
    cuParamSetf(kernel, 0, alpha)
    for offset, value in ((4, dev_x), (8, dev_y), (12, count)):
        cuParamSeti(kernel, offset, value)
    cuParamSetSize(kernel, 16)

    # SAXPY is one multiply and one add per element; 1e-9 scales to giga.
    gflop = (2.e-9 * count) * float(loops)

    # Synchronise on both sides of the timed region so it brackets exactly
    # the launches issued in the loop.
    cuCtxSynchronize()
    started = time()
    for _ in range(loops):
        cuLaunchGrid(kernel, GRID_SIZE, 1)
    cuCtxSynchronize()
    gpu_secs = time() - started

    cuMemcpyDtoH(gpu_result, dev_y, count * S4)
    cuCtxSynchronize()

    for dev_ptr in (dev_x, dev_y):
        cuMemFree(dev_ptr)

    started = time()
    for _ in range(loops):
        cpuSAXPY(alpha, host_x, host_y)
    cpu_secs = time() - started

    cpu_rate = gflop / cpu_secs
    gpu_rate = gflop / gpu_secs
    print("%10d%6.2f%6.2f" % (vlength, cpu_rate, gpu_rate))

    if checkErrorFlag:
        avg_err, max_err = checkError(host_y, gpu_result)
        print("Avg and max rel error = %.2e %.2e" % (avg_err, max_err))
예제 #3
0
def main(device, vlength=128, loops=1):
    """Run the ``gpuSAXPY`` kernel and ``cpuSAXPY`` and report their rates.

    Prints a single formatted row: ``vlength``, then the CPU figure, then
    the GPU figure (both computed as 2e-9 * vlength * loops divided by the
    measured seconds).  If the global ``checkErrorFlag`` is set, the values
    returned by ``checkError`` for the host and device results are printed
    as well.

    device  -- object whose ``functions`` mapping provides "gpuSAXPY".
    vlength -- element count of the X and Y vectors.
    loops   -- repetitions of the kernel launch and of the host pass.
    """

    def _launch_all():
        # Issue every launch, then wait so the caller's stop time includes
        # the device work.
        for _ in range(loops):
            cuLaunchGrid(saxpy_kernel, GRID_SIZE, 1)
        cuCtxSynchronize()

    def _host_all():
        for _ in range(loops):
            cpuSAXPY(scale, x_host, y_host)

    def _seconds(work):
        # Wall-clock duration of a single call to ``work``.
        began = time()
        work()
        return time() - began

    scale = c_float(0.5)
    length = vlength  ## Vector length
    saxpy_kernel = device.functions["gpuSAXPY"]

    x_host = (c_float * length)()
    y_host = (c_float * length)()
    y_from_gpu = (c_float * length)()

    fixedInit(x_host)

    x_dev = getMemory(x_host)
    y_dev = getMemory(y_host)

    # Block shape and the 16-byte argument block (alpha, X, Y, length).
    cuFuncSetBlockShape(saxpy_kernel, BLOCK_SIZE, 1, 1)
    cuParamSetf(saxpy_kernel, 0, scale)
    cuParamSeti(saxpy_kernel, 4, x_dev)
    cuParamSeti(saxpy_kernel, 8, y_dev)
    cuParamSeti(saxpy_kernel, 12, length)
    cuParamSetSize(saxpy_kernel, 16)

    # Synchronise first so the GPU interval starts from an idle context.
    cuCtxSynchronize()
    gpu_time = _seconds(_launch_all)

    work_gflop = (2.0e-9 * length) * float(loops)
    cuMemcpyDtoH(y_from_gpu, y_dev, length * S4)
    cuCtxSynchronize()

    cuMemFree(x_dev)
    cuMemFree(y_dev)

    cpu_time = _seconds(_host_all)
    print("%10d%6.2f%6.2f" % (vlength, work_gflop / cpu_time, work_gflop / gpu_time))

    if not checkErrorFlag:
        return
    err, mxe = checkError(y_host, y_from_gpu)
    print("Avg and max rel error = %.2e %.2e" % (err, mxe))
예제 #4
0
def main(vlength = 128,loops = 1):
    """Time ``gpuSAXPY`` (CUDA runtime API) against ``cpuSAXPY``; print a row.

    The kernel is configured with ``cudaConfigureCall`` and invoked
    ``loops`` times, the device Y vector is copied back, and the same update
    is then run on the host.  One line is printed with ``vlength``, the CPU
    rate and the GPU rate; when the global ``checkErrorFlag`` is true the
    values returned by ``checkError`` are printed as well.

    vlength -- number of elements in each vector.
    loops   -- number of kernel invocations / host passes that are timed.
    """

    alfa = c_float(.5)
    n2 = vlength ## Vector length

    # Only X is initialised; Y keeps the zero fill ctypes arrays start with.
    h_X = (c_float*n2)()
    h_Y = (c_float*n2)()

    fixedInit(h_X)

    d_X = getMemory(h_X)
    d_Y = getMemory(h_Y)

    blockDim  = dim3(BLOCK_SIZE,1,1)
    gridDim   = dim3(GRID_SIZE,1,1)

    # Synchronise BEFORE starting the clock, so device work still pending
    # from setup is not counted as kernel time.
    cudaThreadSynchronize()
    t0 = time()
    for i in range(loops):
        cudaConfigureCall(gridDim,blockDim,0,0)
        gpuSAXPY(alfa,d_X,d_Y,n2)
    # Wait for the launches to finish so the stop time covers the device
    # work and not just the launch calls.
    cudaThreadSynchronize()
    t0 = time()-t0

    # SAXPY is one multiply and one add per element; 1e-9 scales to giga.
    flops = (2.e-9*n2)*float(loops)

    # Receive buffer for the device result, allocated once here.
    g_Y = (c_float*n2)()
    cudaMemcpy(g_Y,d_Y,S4*n2,cudaMemcpyDeviceToHost)
    cudaThreadSynchronize()

    cudaFree(d_X)
    cudaFree(d_Y)

    cudaThreadExit()
    t1 = time()
    for i in range(loops):
        cpuSAXPY(alfa,h_X,h_Y)
    t1 = time()-t1
    print("%10d%6.2f%6.2f" % (vlength,flops/t1,flops/t0))

    if checkErrorFlag:
        err,mxe = checkError(h_Y,g_Y)
        print("Avg and max rel error = %.2e %.2e" % (err,mxe))
예제 #5
0
def main(vlength = 128,loops = 1):
    """Benchmark the ``gpuSAXPY`` kernel against the host ``cpuSAXPY``.

    Launches the kernel ``loops`` times through the CUDA runtime API
    (``cudaConfigureCall`` followed by the kernel call), copies the device Y
    vector back, repeats the update on the host, and prints ``vlength``
    followed by the CPU and GPU rates.  If the global ``checkErrorFlag`` is
    set, the pair returned by ``checkError`` is printed too.

    vlength -- element count of the X and Y vectors.
    loops   -- how many times the update is repeated in each timed region.
    """

    alfa = c_float(.5)
    n2 = vlength ## Vector length

    h_X = (c_float*n2)()
    h_Y = (c_float*n2)()   # never passed to fixedInit: keeps ctypes' zero fill

    fixedInit(h_X)

    d_X = getMemory(h_X)
    d_Y = getMemory(h_Y)

    blockDim  = dim3(BLOCK_SIZE,1,1)
    gridDim   = dim3(GRID_SIZE,1,1)

    # The barrier must come before the first clock read; otherwise the time
    # spent waiting for earlier device work is charged to the kernel.
    cudaThreadSynchronize()
    t0 = time()
    for i in range(loops):
        cudaConfigureCall(gridDim,blockDim,0,0)
        gpuSAXPY(alfa,d_X,d_Y,n2)
    # Closing barrier: stop the clock only after the device has finished.
    cudaThreadSynchronize()
    t0 = time()-t0

    # Two floating-point operations per element per pass, scaled to giga.
    flops = (2.e-9*n2)*float(loops)

    # Single allocation of the buffer that receives the device result.
    g_Y = (c_float*n2)()
    cudaMemcpy(g_Y,d_Y,S4*n2,cudaMemcpyDeviceToHost)
    cudaThreadSynchronize()

    cudaFree(d_X)
    cudaFree(d_Y)

    cudaThreadExit()
    t1 = time()
    for i in range(loops):
        cpuSAXPY(alfa,h_X,h_Y)
    t1 = time()-t1
    print("%10d%6.2f%6.2f" % (vlength,flops/t1,flops/t0))

    if checkErrorFlag:
        err,mxe = checkError(h_Y,g_Y)
        print("Avg and max rel error = %.2e %.2e" % (err,mxe))