示例#1
0
def main(vlength = 128,loops = 1):
    """Black-Scholes benchmark (CUDA runtime API): price `vlength` options
    with the gpuBLSC kernel and with the cpuBLSC reference, then print the
    throughput of both and (optionally) their relative errors.

    vlength -- number of options (vector length)
    loops   -- repetition count for timing
    """

    n2 = vlength ## Vector length

    # Host inputs: spot (S), strike (X), time-to-expiry (T); outputs:
    # call (C) and put (P) prices.  ctypes arrays are zero-initialized.
    h_S = (c_float*n2)()
    h_X = (c_float*n2)()
    h_T = (c_float*n2)()
    h_C = (c_float*n2)()
    h_P = (c_float*n2)()

    # Random market data; R = riskless rate, V = volatility.
    randInit(h_S,5.,30.)
    randInit(h_X,1.,100.)
    randInit(h_T,.25,10.)
    R,V = .03,.3

    # Device copies of all five arrays.
    d_S = getMemory(h_S)
    d_X = getMemory(h_X)
    d_T = getMemory(h_T)
    d_C = getMemory(h_C)
    d_P = getMemory(h_P)

    blockDim  = dim3(BLOCK_SIZE,1,1)
    gridDim   = dim3(GRID_SIZE,1,1)

    # GPU timing: drain pending work, then launch the kernel `loops` times
    # and synchronize before stopping the clock.
    cudaThreadSynchronize()
    t0 = time()
    for i in range(loops):
        cudaConfigureCall(gridDim,blockDim,0,0)
        gpuBLSC(d_C,d_P,d_S,d_X,d_T,R,V,n2)
    cudaThreadSynchronize()
    t0 = time()-t0

    # NOTE(review): the 2.e-6 scale suggests Mflop-based units in the
    # printed columns -- confirm against the caller's table header.
    flops = (2.e-6*n2)*float(loops)
    g_C = (c_float*n2)()
    g_P = (c_float*n2)()
    cudaMemcpy(g_C,d_C,S4*n2,cudaMemcpyDeviceToHost)
    cudaMemcpy(g_P,d_P,S4*n2,cudaMemcpyDeviceToHost)
    cudaThreadSynchronize()

    cudaFree(d_S)
    cudaFree(d_X)
    cudaFree(d_T)
    cudaFree(d_C)
    cudaFree(d_P)

    cudaThreadExit()
    # CPU reference timing over the same number of iterations.
    t1 = time()
    for i in range(loops):
        cpuBLSC(h_C,h_P,h_S,h_X,h_T,R,V,n2)
    t1 = time()-t1
    print "%10d%10.2f%10.2f" % (vlength,flops/t1,flops/t0)

    if checkErrorFlag:
        err,mxe = checkError(h_C,g_C)
        print "Avg rel error (call) = %.2e" % (err,)
        err,mxe = checkError(h_P,g_P)
        print "Avg rel error (put)  = %.2e" % (err,)
示例#2
0
def main(vlength = 128,loops = 1):

    n2 = vlength ## Vector length

    h_S = (c_float*n2)()
    h_X = (c_float*n2)()
    h_T = (c_float*n2)()
    h_C = (c_float*n2)()
    h_P = (c_float*n2)()


    randInit(h_S,5.,30.)
    randInit(h_X,1.,100.)
    randInit(h_T,.25,10.)
    R,V = .03,.3

    d_S = getMemory(h_S)
    d_X = getMemory(h_X)
    d_T = getMemory(h_T)
    d_C = getMemory(h_C)
    d_P = getMemory(h_P)

    blockDim  = dim3(BLOCK_SIZE,1,1)
    gridDim   = dim3(GRID_SIZE,1,1)

    cudaThreadSynchronize()
    t0 = time()
    for i in range(loops):
        cudaConfigureCall(gridDim,blockDim,0,0)
        gpuBLSC(d_C,d_P,d_S,d_X,d_T,R,V,n2)
    cudaThreadSynchronize()
    t0 = time()-t0

    flops = (2.e-6*n2)*float(loops)
    g_C = (c_float*n2)()
    g_P = (c_float*n2)()
    cudaMemcpy(g_C,d_C,S4*n2,cudaMemcpyDeviceToHost)
    cudaMemcpy(g_P,d_P,S4*n2,cudaMemcpyDeviceToHost)
    cudaThreadSynchronize()

    cudaFree(d_S)
    cudaFree(d_X)
    cudaFree(d_T)
    cudaFree(d_C)
    cudaFree(d_P)

    cudaThreadExit()
    t1 = time()
    for i in range(loops):
        cpuBLSC(h_C,h_P,h_S,h_X,h_T,R,V,n2)
    t1 = time()-t1
    print "%10d%10.2f%10.2f" % (vlength,flops/t1,flops/t0)

    if checkErrorFlag:
        err,mxe = checkError(h_C,g_C)
        print "Avg rel error (call) = %.2e" % (err,)
        err,mxe = checkError(h_P,g_P)
        print "Avg rel error (put)  = %.2e" % (err,)
示例#3
0
def main(vlength = 128,loops = 1):

    n2 = vlength ## Vector length

    h_X = (c_float*n2)()
    h_Y = (c_float*n2)()
    h_Z = (c_float*n2)()

    vectorInit(h_X)

    d_X = getMemory(h_X)
    d_Y = getMemory(h_Y)
    d_Z = getMemory(h_Z)

    blockDim  = dim3(BLOCK_SIZE,1,1)
    gridDim   = dim3(GRID_SIZE,1,1)

    t0 = time()
    cudaThreadSynchronize()
    for i in range(loops):
        cudaConfigureCall(gridDim,blockDim,0,0)
        gpuTRIG(d_Y,d_Z,d_X,n2)
    cudaThreadSynchronize()
    t0 = time()-t0

    flops = (2.e-9*n2)*float(loops)
    g_Y = (c_float*n2)()
    cudaMemcpy(g_Y,d_Y,S4*n2,cudaMemcpyDeviceToHost)
    cudaThreadSynchronize()

    flops = (8.e-9*n2)*float(loops)
    g_Y = (c_float*n2)()
    g_Z = (c_float*n2)()
    cudaMemcpy(g_Y,d_Y,S4*n2,cudaMemcpyDeviceToHost)
    cudaMemcpy(g_Z,d_Z,S4*n2,cudaMemcpyDeviceToHost)
    cudaThreadSynchronize()

    cudaFree(d_X)
    cudaFree(d_Y)
    cudaFree(d_Z)

    cudaThreadExit()
    t1 = time()
    for i in range(loops):
        cpuTRIG(h_Y,h_Z,h_X)
    t1 = time()-t1
    print "%10d%6.2f%6.2f GFlops" % (vlength,flops/t1,flops/t0)

    if checkErrorFlag:
        err,mxe = checkError(h_Y,g_Y,n2)
        print "Avg and max rel error (cos) = %.2e %.2e" % (err,mxe)
        err,mxe = checkError(h_Z,g_Z,n2)
        print "Avg and max rel error (sin) = %.2e %.2e" % (err,mxe)
示例#4
0
def main(vlength=128, loops=1):

    n2 = vlength  ## Vector length

    h_X = (c_float * n2)()
    h_Y = (c_float * n2)()
    h_Z = (c_float * n2)()

    vectorInit(h_X)

    d_X = getMemory(h_X)
    d_Y = getMemory(h_Y)
    d_Z = getMemory(h_Z)

    blockDim = dim3(BLOCK_SIZE, 1, 1)
    gridDim = dim3(GRID_SIZE, 1, 1)

    t0 = time()
    cudaThreadSynchronize()
    for i in range(loops):
        cudaConfigureCall(gridDim, blockDim, 0, 0)
        gpuTRIG(d_Y, d_Z, d_X, n2)
    cudaThreadSynchronize()
    t0 = time() - t0

    flops = (2.e-9 * n2) * float(loops)
    g_Y = (c_float * n2)()
    cudaMemcpy(g_Y, d_Y, S4 * n2, cudaMemcpyDeviceToHost)
    cudaThreadSynchronize()

    flops = (8.e-9 * n2) * float(loops)
    g_Y = (c_float * n2)()
    g_Z = (c_float * n2)()
    cudaMemcpy(g_Y, d_Y, S4 * n2, cudaMemcpyDeviceToHost)
    cudaMemcpy(g_Z, d_Z, S4 * n2, cudaMemcpyDeviceToHost)
    cudaThreadSynchronize()

    cudaFree(d_X)
    cudaFree(d_Y)
    cudaFree(d_Z)

    cudaThreadExit()
    t1 = time()
    for i in range(loops):
        cpuTRIG(h_Y, h_Z, h_X)
    t1 = time() - t1
    print "%10d%6.2f%6.2f GFlops" % (vlength, flops / t1, flops / t0)

    if checkErrorFlag:
        err, mxe = checkError(h_Y, g_Y, n2)
        print "Avg and max rel error (cos) = %.2e %.2e" % (err, mxe)
        err, mxe = checkError(h_Z, g_Z, n2)
        print "Avg and max rel error (sin) = %.2e %.2e" % (err, mxe)
示例#5
0
def main(device, vlength=128, loops=1):
    """Benchmark the gpuTRIG kernel via the CUDA driver API against cpuTRIG
    (Y receives cos, Z receives sin -- see the labels in the error report).

    device  -- wrapper object exposing loaded kernels in .functions
    vlength -- vector length
    loops   -- repetition count for timing
    """

    n2 = vlength  ## Vector length
    gpuTRIG = device.functions["gpuTRIG"]

    h_X = (c_float * n2)()
    h_Y = (c_float * n2)()
    h_Z = (c_float * n2)()

    vectorInit(h_X)

    d_X = getMemory(h_X)
    d_Y = getMemory(h_Y)
    d_Z = getMemory(h_Z)

    # Bind kernel arguments at explicit byte offsets: Y @0, Z @4, X @8,
    # n @12.  NOTE(review): 4-byte pointer slots assume 32-bit device
    # pointers -- confirm for the target platform.
    cuFuncSetBlockShape(gpuTRIG, BLOCK_SIZE, 1, 1)
    cuParamSeti(gpuTRIG, 0, d_Y)
    cuParamSeti(gpuTRIG, 4, d_Z)
    cuParamSeti(gpuTRIG, 8, d_X)
    cuParamSeti(gpuTRIG, 12, n2)
    cuParamSetSize(gpuTRIG, 16)

    # Timed GPU section.
    cuCtxSynchronize()
    t0 = time()
    for i in range(loops):
        cuLaunchGrid(gpuTRIG, GRID_SIZE, 1)
    cuCtxSynchronize()
    t0 = time() - t0

    # 8 flops per element per iteration, reported in GFlops.
    flops = (8.e-9 * n2) * float(loops)
    g_Y = (c_float * n2)()
    g_Z = (c_float * n2)()
    cuMemcpyDtoH(g_Y, d_Y, S4 * n2)
    cuMemcpyDtoH(g_Z, d_Z, S4 * n2)
    cuCtxSynchronize()

    cuMemFree(d_X)
    cuMemFree(d_Y)
    cuMemFree(d_Z)

    # CPU reference timing.
    t1 = time()
    for i in range(loops):
        cpuTRIG(h_Y, h_Z, h_X)
    t1 = time() - t1
    print "%10d%6.2f%6.2f GFlops" % (vlength, flops / t1, flops / t0)

    if checkErrorFlag:
        err, mxe = checkError(h_Y, g_Y, n2)
        print "Avg and max rel error (cos) = %.2e %.2e" % (err, mxe)
        err, mxe = checkError(h_Z, g_Z, n2)
        print "Avg and max rel error (sin) = %.2e %.2e" % (err, mxe)
示例#6
0
def main(device,vlength = 128,loops = 1):

    n2 = vlength ## Vector length
    gpuTRIG = device.functions["gpuTRIG"]

    h_X = (c_float*n2)()
    h_Y = (c_float*n2)()
    h_Z = (c_float*n2)()

    vectorInit(h_X)

    d_X = getMemory(h_X)
    d_Y = getMemory(h_Y)
    d_Z = getMemory(h_Z)

    cuFuncSetBlockShape(gpuTRIG,BLOCK_SIZE,1,1)
    cuParamSeti(gpuTRIG,0,d_Y)
    cuParamSeti(gpuTRIG,4,d_Z)
    cuParamSeti(gpuTRIG,8,d_X)
    cuParamSeti(gpuTRIG,12,n2)
    cuParamSetSize(gpuTRIG,16)

    cuCtxSynchronize()
    t0 = time()
    for i in range(loops):
        cuLaunchGrid(gpuTRIG,GRID_SIZE,1)
    cuCtxSynchronize()
    t0 = time()-t0

    flops = (8.e-9*n2)*float(loops)
    g_Y = (c_float*n2)()
    g_Z = (c_float*n2)()
    cuMemcpyDtoH(g_Y,d_Y,S4*n2)
    cuMemcpyDtoH(g_Z,d_Z,S4*n2)
    cuCtxSynchronize()

    cuMemFree(d_X)
    cuMemFree(d_Y)
    cuMemFree(d_Z)

    t1 = time()
    for i in range(loops):
        cpuTRIG(h_Y,h_Z,h_X)
    t1 = time()-t1
    print "%10d%6.2f%6.2f GFlops" % (vlength,flops/t1,flops/t0)

    if checkErrorFlag:
        err,mxe = checkError(h_Y,g_Y,n2)
        print "Avg and max rel error (cos) = %.2e %.2e" % (err,mxe)
        err,mxe = checkError(h_Z,g_Z,n2)
        print "Avg and max rel error (sin) = %.2e %.2e" % (err,mxe)
示例#7
0
def main(device,vlength = 128,loops = 1,m1 = 1):
    """Benchmark a generated polynomial kernel gpuPOLY<d> of degree
    d = (1 << (m1-1)) * psize via the CUDA driver API against cpuPOLY<d>.

    device  -- wrapper object exposing compiled kernels in .functions
    vlength -- vector length
    loops   -- repetition count for timing
    m1      -- power-of-two multiplier selecting the polynomial size
    """
    # NOTE(review): `l` is not defined locally -- it must be a module-level
    # global set by the caller; confirm, else this raises NameError.
    # `alfa` below appears unused in this function.
    print "%5d %5d %5d" % (l,loops,m1),

    alfa = c_float(.5)
    n2 = vlength ## Vector length

    mp = 1 << (m1-1)
    print "%5d" % (mp*psize),
    # Kernels are generated per polynomial size and looked up by name.
    fcn = "gpuPOLY%d"%(mp*psize)
    gpuPOLY = device.functions[fcn]
    h_X = (c_float*n2)()
    h_Y = (c_float*n2)()
    g_Y = (c_float*n2)()

    vectorInit(h_X)

    d_X = getMemory(h_X)
    d_Y = getMemory(h_Y)

    # Argument layout (4-byte slots): ptr X @0, ptr Y @4, int n @8.
    cuFuncSetBlockShape(gpuPOLY,BLOCK_SIZE,1,1)
    cuParamSeti(gpuPOLY,0,d_X)
    cuParamSeti(gpuPOLY,4,d_Y)
    cuParamSeti(gpuPOLY,8,n2)
    cuParamSetSize(gpuPOLY,12)

    # One extra (warm-up) launch before the timed loop.
    cuCtxSynchronize()
    cuLaunchGrid(gpuPOLY,GRID_SIZE,1)
    t0 = time()
    for i in range(loops):
        cuLaunchGrid(gpuPOLY,GRID_SIZE,1)
    cuCtxSynchronize()
    t0 = time()-t0

    # 2 flops per polynomial term, in GFlops.
    flops = (2.e-9*m1*n2*(psize-1))*float(loops)
    cuMemcpyDtoH(g_Y,d_Y,n2*S4)
    cuCtxSynchronize()

    cuMemFree(d_X)
    cuMemFree(d_Y)

    # Matching CPU reference, selected by the same generated-name scheme.
    cpuPOLY = eval("cpuPOLY%d" % (mp*psize))
    t1 = time()
    for i in range(loops):
        cpuPOLY(h_X,h_Y)
    t1 = time()-t1
    print "%10d%6.2f%6.2f" % (vlength,flops/t1,flops/t0)

    if checkErrorFlag:
        err,mxe = checkError(h_Y,g_Y)
        print "Avg and max rel error = %.2e %.2e" % (err,mxe)
示例#8
0
def main(vlength = 128,loops = 1,m1 = 1):
    """Benchmark a generated polynomial kernel gpuPOLY<d>, with
    d = (1 << (m1-1)) * psize, via the CUDA runtime API against cpuPOLY<d>.

    vlength -- vector length
    loops   -- repetition count for timing
    m1      -- power-of-two multiplier selecting the polynomial size
    """
    # NOTE(review): `l` is not defined locally -- presumably a module-level
    # global set by the caller; confirm, else this raises NameError.
    # `alfa` below appears unused in this function.
    print "%5d %5d %5d" % (l,loops,m1),

    alfa = c_float(.5)
    n2 = vlength ## Vector length

    mp = 1 << (m1-1)
    print "%5d" % (mp*psize),
    # Kernel wrappers are generated per polynomial size; resolve by name.
    gpuPOLY = eval("gpuPOLY%d"%(mp*psize))
    h_X = (c_float*n2)()
    h_Y = (c_float*n2)()
    g_Y = (c_float*n2)()

    vectorInit(h_X)

    d_X = getMemory(h_X)
    d_Y = getMemory(h_Y)

    blockDim  = dim3(BLOCK_SIZE,1,1)
    gridDim   = dim3(GRID_SIZE,1,1)

    # NOTE(review): the timer starts *before* the synchronize here, so any
    # pending GPU work is billed to t0 -- other blocks in this file sync
    # first; confirm which behavior is intended.
    t0 = time()
    cudaThreadSynchronize()
    for i in range(loops):
        cudaConfigureCall(gridDim,blockDim,0,0)
        gpuPOLY(d_X,d_Y,n2)
    cudaThreadSynchronize()
    t0 = time()-t0

    # 2 flops per polynomial term, in GFlops.
    flops = (2.e-9*m1*n2*(psize-1))*float(loops)
    cudaMemcpy(g_Y,d_Y,S4*n2,cudaMemcpyDeviceToHost)
    cudaThreadSynchronize()

    cudaFree(d_X)
    cudaFree(d_Y)

    cudaThreadExit()
    # Matching CPU reference, selected by the same generated-name scheme.
    cpuPOLY = eval("cpuPOLY%d" % (mp*psize))
    t1 = time()
    for i in range(loops):
        cpuPOLY(h_X,h_Y)
    t1 = time()-t1
    print "%10d%6.2f%6.2f" % (vlength,flops/t1,flops/t0)

    if checkErrorFlag:
        err,mxe = checkError(h_Y,g_Y)
        print "Avg and max rel error = %.2e %.2e" % (err,mxe)
示例#9
0
def runTest(vlength = 128,loops = 1):
    """SAXPY benchmark through CUBLAS: y = alfa*x + y on vectors of
    vlength**2 elements, timed on the GPU (cublasSaxpy) and on the CPU
    (cpuSAXPY), with a final relative-error comparison.

    vlength -- square root of the vector length
    loops   -- repetition count for timing
    """
    n2 = vlength*vlength
    alfa = c_float(.5)

    cublasInit()

    # h_X/h_Y: CPU operands; g_Y receives the GPU result for comparison.
    h_X = (c_float*n2)()
    h_Y = (c_float*n2)()
    g_Y = (c_float*n2)()
    vectorInit(h_X)
    vectorInit(h_Y)

    # Device buffers allocated and filled through CUBLAS helpers.
    d_X = c_void_p()
    d_Y = c_void_p()
    cublasAlloc(n2, sizeof(c_float), byref(d_X))
    cublasAlloc(n2, sizeof(c_float), byref(d_Y))

    cublasSetVector(n2, sizeof(c_float), h_X, 1, d_X, 1)
    cublasSetVector(n2, sizeof(c_float), h_Y, 1, d_Y, 1)

    # 2 flops per element (multiply + add), in GFlops.
    flops = (2.e-9*n2)*float(loops)
    t0 = time()
    for i in range(loops):
        cublasSaxpy(n2, alfa, d_X, 1, d_Y, 1)
    cudaThreadSynchronize()
    t0 = time()-t0

    print "Processing time: %.3g sec" % t0
    print "Gigaflops GPU: %.2f (%d)" % (flops/t0,n2)

    # CPU reference timing (updates h_Y in place each iteration).
    t1 = time()
    for i in range(loops):
        cpuSAXPY(alfa,h_X,h_Y)
    t1 = time()-t1
    
    print "\nProcessing time: %.3g sec" % t1
    print "Gigaflops CPU: %.2f" % (flops/t1)
    print "GPU vs. CPU  : %.2f" % (t1/t0)

    cublasGetVector(n2, sizeof(c_float), d_Y, 1, g_Y, 1)
    err,mxe = checkError(h_Y,g_Y)
    print "\nAvg and max rel error = %.2e %.2e" % (err,mxe)

    cublasFree(d_X)
    cublasFree(d_Y)

    cublasShutdown()
示例#10
0
def main(vlength = 128,loops = 1,m1 = 1):
    print "%5d %5d %5d" % (l,loops,m1),

    alfa = c_float(.5)
    n2 = vlength ## Vector length

    mp = 1 << (m1-1)
    print "%5d" % (mp*psize),
    gpuPOLY = eval("gpuPOLY%d"%(mp*psize))
    h_X = (c_float*n2)()
    h_Y = (c_float*n2)()
    g_Y = (c_float*n2)()

    vectorInit(h_X)

    d_X = getMemory(h_X)
    d_Y = getMemory(h_Y)

    blockDim  = dim3(BLOCK_SIZE,1,1)
    gridDim   = dim3(GRID_SIZE,1,1)

    t0 = time()
    cudaThreadSynchronize()
    for i in range(loops):
        cudaConfigureCall(gridDim,blockDim,0,0)
        gpuPOLY(d_X,d_Y,n2)
    cudaThreadSynchronize()
    t0 = time()-t0

    flops = (2.e-9*m1*n2*(psize-1))*float(loops)
    cudaMemcpy(g_Y,d_Y,S4*n2,cudaMemcpyDeviceToHost)
    cudaThreadSynchronize()

    cudaFree(d_X)
    cudaFree(d_Y)

    cudaThreadExit()
    cpuPOLY = eval("cpuPOLY%d" % (mp*psize))
    t1 = time()
    for i in range(loops):
        cpuPOLY(h_X,h_Y)
    t1 = time()-t1
    print "%10d%6.2f%6.2f" % (vlength,flops/t1,flops/t0)

    if checkErrorFlag:
        err,mxe = checkError(h_Y,g_Y)
        print "Avg and max rel error = %.2e %.2e" % (err,mxe)
示例#11
0
def main(device, vlength=128, loops=1):
    """SAXPY benchmark via the CUDA driver API: y += alfa*x with the
    gpuSAXPY kernel, timed against the cpuSAXPY reference.

    device  -- wrapper object exposing loaded kernels in .functions
    vlength -- vector length
    loops   -- repetition count for timing
    """

    alfa = c_float(.5)
    n2 = vlength  ## Vector length
    gpuSAXPY = device.functions["gpuSAXPY"]

    h_X = (c_float * n2)()
    h_Y = (c_float * n2)()
    g_Y = (c_float * n2)()

    fixedInit(h_X)

    d_X = getMemory(h_X)
    d_Y = getMemory(h_Y)

    # Argument layout (4-byte slots): float alfa @0, ptr X @4, ptr Y @8,
    # int n @12.
    cuFuncSetBlockShape(gpuSAXPY, BLOCK_SIZE, 1, 1)
    cuParamSetf(gpuSAXPY, 0, alfa)
    cuParamSeti(gpuSAXPY, 4, d_X)
    cuParamSeti(gpuSAXPY, 8, d_Y)
    cuParamSeti(gpuSAXPY, 12, n2)
    cuParamSetSize(gpuSAXPY, 16)

    # Timed GPU section.
    cuCtxSynchronize()
    t0 = time()
    for i in range(loops):
        cuLaunchGrid(gpuSAXPY, GRID_SIZE, 1)
    cuCtxSynchronize()
    t0 = time() - t0

    # 2 flops per element (multiply-add), in GFlops.
    flops = (2.e-9 * n2) * float(loops)
    cuMemcpyDtoH(g_Y, d_Y, n2 * S4)
    cuCtxSynchronize()

    cuMemFree(d_X)
    cuMemFree(d_Y)

    # CPU reference timing.
    t1 = time()
    for i in range(loops):
        cpuSAXPY(alfa, h_X, h_Y)
    t1 = time() - t1
    print "%10d%6.2f%6.2f" % (vlength, flops / t1, flops / t0)

    if checkErrorFlag:
        err, mxe = checkError(h_Y, g_Y)
        print "Avg and max rel error = %.2e %.2e" % (err, mxe)
示例#12
0
def main(device, vlength=128, loops=1):

    alfa = c_float(0.5)
    n2 = vlength  ## Vector length
    gpuSAXPY = device.functions["gpuSAXPY"]

    h_X = (c_float * n2)()
    h_Y = (c_float * n2)()
    g_Y = (c_float * n2)()

    fixedInit(h_X)

    d_X = getMemory(h_X)
    d_Y = getMemory(h_Y)

    cuFuncSetBlockShape(gpuSAXPY, BLOCK_SIZE, 1, 1)
    cuParamSetf(gpuSAXPY, 0, alfa)
    cuParamSeti(gpuSAXPY, 4, d_X)
    cuParamSeti(gpuSAXPY, 8, d_Y)
    cuParamSeti(gpuSAXPY, 12, n2)
    cuParamSetSize(gpuSAXPY, 16)

    cuCtxSynchronize()
    t0 = time()
    for i in range(loops):
        cuLaunchGrid(gpuSAXPY, GRID_SIZE, 1)
    cuCtxSynchronize()
    t0 = time() - t0

    flops = (2.0e-9 * n2) * float(loops)
    cuMemcpyDtoH(g_Y, d_Y, n2 * S4)
    cuCtxSynchronize()

    cuMemFree(d_X)
    cuMemFree(d_Y)

    t1 = time()
    for i in range(loops):
        cpuSAXPY(alfa, h_X, h_Y)
    t1 = time() - t1
    print "%10d%6.2f%6.2f" % (vlength, flops / t1, flops / t0)

    if checkErrorFlag:
        err, mxe = checkError(h_Y, g_Y)
        print "Avg and max rel error = %.2e %.2e" % (err, mxe)
示例#13
0
def main(vlength = 128,loops = 1):

    alfa = c_float(.5)
    n2 = vlength ## Vector length

    h_X = (c_float*n2)()
    h_Y = (c_float*n2)()
    g_Y = (c_float*n2)()

    fixedInit(h_X)

    d_X = getMemory(h_X)
    d_Y = getMemory(h_Y)

    blockDim  = dim3(BLOCK_SIZE,1,1)
    gridDim   = dim3(GRID_SIZE,1,1)

    t0 = time()
    cudaThreadSynchronize()
    for i in range(loops):
        cudaConfigureCall(gridDim,blockDim,0,0)
        gpuSAXPY(alfa,d_X,d_Y,n2)
    cudaThreadSynchronize()
    t0 = time()-t0

    flops = (2.e-9*n2)*float(loops)
    g_Y = (c_float*n2)()
    cudaMemcpy(g_Y,d_Y,S4*n2,cudaMemcpyDeviceToHost)
    cudaThreadSynchronize()

    cudaFree(d_X)
    cudaFree(d_Y)

    cudaThreadExit()
    t1 = time()
    for i in range(loops):
        cpuSAXPY(alfa,h_X,h_Y)
    t1 = time()-t1
    print "%10d%6.2f%6.2f" % (vlength,flops/t1,flops/t0)

    if checkErrorFlag:
        err,mxe = checkError(h_Y,g_Y)
        print "Avg and max rel error = %.2e %.2e" % (err,mxe)
示例#14
0
def main(vlength = 128,loops = 1):

    alfa = c_float(.5)
    n2 = vlength ## Vector length

    h_X = (c_float*n2)()
    h_Y = (c_float*n2)()
    g_Y = (c_float*n2)()

    fixedInit(h_X)

    d_X = getMemory(h_X)
    d_Y = getMemory(h_Y)

    blockDim  = dim3(BLOCK_SIZE,1,1)
    gridDim   = dim3(GRID_SIZE,1,1)

    t0 = time()
    cudaThreadSynchronize()
    for i in range(loops):
        cudaConfigureCall(gridDim,blockDim,0,0)
        gpuSAXPY(alfa,d_X,d_Y,n2)
    cudaThreadSynchronize()
    t0 = time()-t0

    flops = (2.e-9*n2)*float(loops)
    g_Y = (c_float*n2)()
    cudaMemcpy(g_Y,d_Y,S4*n2,cudaMemcpyDeviceToHost)
    cudaThreadSynchronize()

    cudaFree(d_X)
    cudaFree(d_Y)

    cudaThreadExit()
    t1 = time()
    for i in range(loops):
        cpuSAXPY(alfa,h_X,h_Y)
    t1 = time()-t1
    print "%10d%6.2f%6.2f" % (vlength,flops/t1,flops/t0)

    if checkErrorFlag:
        err,mxe = checkError(h_Y,g_Y)
        print "Avg and max rel error = %.2e %.2e" % (err,mxe)
示例#15
0
def main(device,vlength = 128,loops = 1):
    """Driver-API sin/cos benchmark using the texture-fetch kernel variant
    (TRIGTex) by default; prints timing, internal sin/cos consistency
    checks, and a CPU-vs-GPU comparison.

    device  -- wrapper exposing .functions (kernels) and .modules
    vlength -- vector length
    loops   -- repetition count for timing
    """
    print "+-----------------------+"
    print "|   Simple  TRIG Test   |"
    print "| using CUDA driver API |"
    print "+-----------------------+"
    print "params: %2d %5dK %3d\n" % (log2n,vlength >> 10,loops),

    n2 = vlength ## Vector length

    # TRIGTex is about 1.5x faster than TRIG
#    name = "TRIG"
    name = "TRIGTex"

    TRIG = device.functions[name]
    mod0 = device.modules[0]

    sizeV = S4*n2
    h_Arg = (c_float*n2)()
    h_Cos = (c_float*n2)()
    h_Sin = (c_float*n2)()

    vectorInit(h_Arg)

    # Device buffers: argument copied from the host; outputs allocated by
    # element count.
    d_Arg = getMemory(h_Arg)
    d_Cos = getMemory(n2)
    d_Sin = getMemory(n2)

    # Bind the argument buffer to the module's "Arg" texture reference
    # (only consumed by the TRIGTex variant below).
    tex = devMemToTex(mod0,"Arg",d_Arg,sizeV)

    # The two variants take different argument lists: the plain kernel
    # receives the argument pointer explicitly, the texture one does not.
    cuFuncSetBlockShape(TRIG,BLOCK_SIZE,1,1)
    cuParamSeti(TRIG,0,d_Cos)
    cuParamSeti(TRIG,4,d_Sin)
    if name != "TRIGTex":
        cuParamSeti(TRIG,8,d_Arg)
        cuParamSeti(TRIG,12,n2)
        cuParamSetSize(TRIG,16)
    else:
        cuParamSetTexRef(TRIG,CU_PARAM_TR_DEFAULT,tex)
        cuParamSeti(TRIG,8,n2)
        cuParamSetSize(TRIG,12)
    cuCtxSynchronize()

    # Timed GPU section.
    t0 = time()
    for i in range(loops):
        cuLaunchGrid(TRIG,GRID_SIZE,1)
    cuCtxSynchronize()
    t0 = time()-t0

    g_Cos = (c_float*n2)()
    g_Sin = (c_float*n2)()
    cuMemcpyDtoH(g_Cos,d_Cos,sizeV)
    cuMemcpyDtoH(g_Sin,d_Sin,sizeV)
    cuCtxSynchronize()

    cuMemFree(d_Arg)
    cuMemFree(d_Cos)
    cuMemFree(d_Sin)

    # CPU reference timing.
    t1 = time()
    for i in range(loops):
        cpuTRIG(h_Cos,h_Sin,h_Arg)
    t1 = time()-t1

    # NOTE(review): 2.e-6 combined with millisecond times below produces the
    # printed "GFlops" column -- confirm the intended flops-per-element.
    flopsg = (2.e-6*n2)*float(loops)
    flopsc = flopsg

    # Convert both timings to milliseconds for the report.
    t0 *= 1.e3;
    t1 *= 1.e3;
    print "\n       time[msec]    GFlops\n"
    print "GPU: %12.1f%10.2f" % (t0,flopsg/t0)
    print "CPU: %12.1f%10.2f" % (t1,flopsc/t1)
    print "     %12.1f" % (t1/t0)

    # x rescales errors; 2^23 matches the float mantissa width --
    # NOTE(review): presumably expressing errors in ulp-like units; confirm.
    x = float(1 << 23)
    e,m = checkTrig(g_Cos,g_Sin)
    print "\n",name, "internal check GPU"
    print "%8.1e %8.1e" % (e,m)
    print "%8.1f %8.1f" % (e*x,m*x)

    e,m = checkTrig(h_Cos,h_Sin)
    print "\n",name, "internal check CPU"
    print "%8.1e %8.1e" % (e,m)
    print "%8.1f %8.1f" % (e*x,m*x)

    print "\n","check between CPU and GPU"
    err,mxe = checkError(h_Cos,g_Cos)
    print "Avg and max abs error (cos) = %8.1e %8.1e" % (err,mxe)
    print "                              %8.1f %8.1f" % (err*x,mxe*x)
    err,mxe = checkError(h_Sin,g_Sin)
    print "Avg and max abs error (sin) = %8.1e %8.1e" % (err,mxe)
    print "                              %8.1f %8.1f" % (err*x,mxe*x)
示例#16
0
def main(check=False, doComplex=False, dims=(128, )):
    """FFT round-trip benchmark via CUFFT (CUDA runtime API).

    Runs a forward+inverse transform over an array of shape `dims`
    (real<->complex by default, complex<->complex when `doComplex`),
    times it, and optionally compares against a CPU FFT when `check`.
    """
    print "+------------------------+"
    print "| Fast Fourier Transform |"
    print "| using CUDA runtime API |"
    print "+------------------------+\n"
    dims = tuple(dims)
    ndim = len(dims)
    v = ("", "NX = %d", "NX = %d NY = %d", "NX = %d NY = %d NZ = %d")
    # SC: total element count; SR: padded real-domain size -- the last axis
    # is stored as 2*(n/2+1) floats (CUFFT R2C layout).
    SC = reduce(lambda x, y: x * y, dims)
    SR = reduce(lambda x, y: x * y, dims[:-1], 1)
    SR *= 2 * (dims[-1] / 2 + 1)

    print v[ndim] % dims
    print "< doComplex: %s >\n" % doComplex

    # rz: normalization for the unnormalized inverse transform;
    # flops: two transforms at 5*N*log2(N) each, in Gflop.
    rz = 1. / float(SC)
    flops = 2. * 5. * SC * log(SC) / log(2.) * 1.e-9
    if doComplex:
        SC *= 2
    S4 = sizeof(c_float)

    # sz: memory footprint in MB.  NOTE(review): not used later in this
    # function -- possibly left over from a reporting line.
    if doComplex:
        sz = S4 * (SC + SC) / (1024 * 1024)
    else:
        sz = S4 * (SC + SR) / (1024 * 1024)

    h_A = (c_float * SC)()
    g_A = (c_float * SC)()
    arrayInit(h_A)

    d_A = getMemory(h_A)
    allocate = True

    # Pre-allocate the output buffer.  `allocate` is always True here; the
    # else-path further below is an alternate in-call allocation scheme
    # that is currently disabled.
    if doComplex:
        d_B = getMemory(SC)
    elif allocate:
        d_B = getMemory(SR)

    if doComplex:
        plan = gf.makePlan(dims, CUFFT_C2C)
    else:
        plan1 = gf.makePlan(dims, CUFFT_R2C)
        plan2 = gf.makePlan(dims, CUFFT_C2R)

    t0 = time()
    x0 = ReadTimestampCounter()
    cudaThreadSynchronize()

    # Forward then inverse transform: a full round trip back into d_A.
    if doComplex:
        d_B = gf.ccfft(plan, d_A, None, d_B)
        d_A = gf.icfft(plan, d_B, None, d_A)
    else:
        if allocate:
            d_B = gf.rcfft(plan1, d_A, None, d_B)
            d_A = gf.crfft(plan2, d_B, None, d_A)
        else:
            d_B = gf.rcfft(plan1, d_A, SR)
            cuMemFree(d_A)
            d_A = gf.crfft(plan2, d_B, SR)

    cudaThreadSynchronize()
    t0 = time() - t0
    x1 = ReadTimestampCounter()
    # Cycle count -> microseconds; fc assumes a 2.8 GHz timestamp counter.
    fc = 1.e-3 / 2.8
    print "RDTSC: %.0f µs" % ((x1 - x0) * fc)

    cudaMemcpy(g_A, d_A, S4 * SC, cudaMemcpyDeviceToHost)

    cudaFree(d_A)
    cudaFree(d_B)

    if doComplex:
        cufftDestroy(plan)
    else:
        cufftDestroy(plan1)
        cufftDestroy(plan2)

    cudaThreadExit()
    # Undo the factor-SC scaling picked up by the round trip.
    scale(g_A, rz)

    print "\nProcessing time: %.3g sec" % t0
    print "Gigaflops GPU  : %.2f" % (flops / t0)
    gflops = (flops / t0, )

    print "\nError CPU initial vs GPU"
    err, mxe = checkError(h_A, g_A)
    stats = err, mxe
    print "Avg and max rel error = %.2e %.2e\n" % (err, mxe)

    if check:
        # Same round trip on the CPU (xf) for timing and accuracy baselines.
        t1 = time()
        if doComplex:
            h_B = xf.ccfft(h_A, dims)
            h_B = xf.icfft(h_B, dims)
        else:
            h_B = xf.rcfft(h_A, dims)
            h_B = xf.crfft(h_B, dims)
        t1 = time() - t1
        print "Processing time: %.3g sec" % t1
        print "Gigaflops CPU  : %.2f" % (flops / t1)
        print "Speedup GPU/CPU: %.2f" % (t1 / t0)

        print "\nError CPU final vs CPU initial"
        err, mxe = checkError(h_B, h_A)
        print "Avg and max rel error = %.2e %.2e" % (err, mxe)

        print "\nError CPU final vs GPU"
        err, mxe = checkError(h_B, g_A)
        print "Avg and max rel error = %.2e %.2e" % (err, mxe)
    # Machine-readable summary line: dims, GPU GFlops, CPU/GPU speedup
    # (-1 when the CPU check was skipped), and the initial-vs-GPU errors.
    f = (-1., )
    if check:
        f = (t1 / t0, )
    fmt = "\n## " + " ".join(len(dims) * ["%3d"]) + " : %.1f %.1f: %.2e %.2e"
    print fmt % (dims + gflops + f + stats)
示例#17
0
def main(N = 1024,L = 100):
    """SGEMM benchmark through CUBLAS: C = A*B with A (M x K), B (K x N),
    repeated L times, versus the CPU sgemm reference.

    N -- base dimension: M = N, K = N/2, and N itself is doubled below
    L -- repetition count
    """
    M = N
    K = N >> 1
    N = N << 1
    flops = (2.e-9*M*N)*float(K*L)
    print "M = %d, N = %d, K = %d, L = %d; GFlops = %.1f\n" % (M,N,K,L,flops)
    na,nb,nc,alfa,beta = M*K,K*N,M*N,1.,0.

    # First timing interval measures CUBLAS setup + host->device transfer.
    t0 = time()
    cublasInit()

    h_A = (c_float*na)()
    h_B = (c_float*nb)()
    h_C = (c_float*nc)()
    g_C = (c_float*nc)()

    arrayInit(h_A,na)
    arrayInit(h_B,nb)
    arrayInit(h_C,nc)

    d_A = c_void_p()
    d_B = c_void_p()
    d_C = c_void_p()

    cublasAlloc(na, sizeof(c_float), byref(d_A))
    cublasAlloc(nb, sizeof(c_float), byref(d_B))
    cublasAlloc(nc, sizeof(c_float), byref(d_C))

    cublasSetVector(na, sizeof(c_float), h_A, 1, d_A, 1)
    cublasSetVector(nb, sizeof(c_float), h_B, 1, d_B, 1)
    cublasSetVector(nc, sizeof(c_float), h_C, 1, d_C, 1)
    tt = t0 = time()-t0
    print "Overhead CUBLAS: %.3f sec\n" % t0

    # Timed GPU section: L column-major, no-transpose SGEMM calls.
    t0 = time()
    for i in range(L):
        cublasSgemm('n', 'n', M, N, K, alfa, d_A, M, d_B, K, beta, d_C, M)
    cudaThreadSynchronize()
    t0 = time()-t0
    tt += t0

    print "Processing time: %.3g (%.3g) sec" % (t0,tt)
    print "Gigaflops GPU: %.2f (%.2f)" % (flops/t0,flops/tt)

    # CPU reference timing (overwrites h_C each iteration).
    t1 = time()
    for i in range(L):
        sgemm(h_C,h_A,h_B,M,N,K)
    t1 = time()-t1
    print "\nProcessing time: %.3g sec" % t1
    print "Gigaflops CPU  : %.2f" % (flops/t1)
    print "Speedup GPU/CPU: %.2f" % (t1/t0)

    cublasGetVector(nc, sizeof(c_float), d_C, 1, g_C, 1)
    err,mxe = checkError(h_C,g_C,nc)
    print "\nAvg and max rel error = %.2e %.2e" % (err,mxe)

    cublasFree(d_A)
    cublasFree(d_B)
    cublasFree(d_C)

    cublasShutdown()
示例#18
0
def main(N=1024, L=100):
    """SGEMM benchmark through the CUDA driver API: C = A*B with
    A (M x K), B (K x N), repeated L times, versus the CPU sgemm.

    N -- base dimension: M = N, K = N/2, and N itself is doubled below
    L -- repetition count
    """
    M = N
    K = N >> 1
    N = N << 1
    flops = (2.e-9 * M * N) * float(K * L)
    print "M = %d, N = %d, K = %d, L = %d; GFlops = %.1f\n" % (M, N, K, L,
                                                               flops)
    # NOTE(review): na/nb/nc/alfa/beta are unused in this driver-API
    # variant (kept in step with the CUBLAS version of this benchmark).
    na, nb, nc, alfa, beta = M * K, K * N, M * N, 1., 0.

    # First timing interval: context/module setup, allocation, transfer
    # and argument binding overhead.
    t0 = time()
    device = cu_CUDA()
    device.getSourceModule("gpuFunctions.cubin")
    gpuSGEMM = device.getFunction("gpuSGEMM")

    sizeA = M * K
    sizeB = K * N
    sizeC = M * N

    h_A = (c_float * sizeA)()
    h_B = (c_float * sizeB)()

    arrayInit(h_A)
    arrayInit(h_B)

    d_A = getMemory(h_A)
    d_B = getMemory(h_B)
    d_C = getMemory(sizeC)

    # Shared memory holds two BLOCK_SIZE x BLOCK_SIZE tiles.
    cuFuncSetBlockShape(gpuSGEMM, BLOCK_SIZE, BLOCK_SIZE, 1)
    cuFuncSetSharedSize(gpuSGEMM, 2 * BLOCK_SIZE * BLOCK_SIZE * S4)
    cuParamSeti(gpuSGEMM, 0, d_C)
    cuParamSeti(gpuSGEMM, 4, d_A)
    cuParamSeti(gpuSGEMM, 8, d_B)
    cuParamSeti(gpuSGEMM, 12, K)
    cuParamSeti(gpuSGEMM, 16, N)
    cuParamSetSize(gpuSGEMM, 20)
    tt = t0 = time() - t0
    print "Overhead driver API: %.3f sec\n" % t0

    # Timed GPU section.
    t0 = time()
    cuCtxSynchronize()
    for i in range(L):
        cuLaunchGrid(gpuSGEMM, N / BLOCK_SIZE, M / BLOCK_SIZE)
    cuCtxSynchronize()
    t0 = time() - t0
    tt += t0

    h_C = (c_float * sizeC)()
    cuMemcpyDtoH(h_C, d_C, S4 * sizeC)
    cuCtxSynchronize()

    cuMemFree(d_A)
    cuMemFree(d_B)
    cuMemFree(d_C)
    cuCtxDetach(device.context)

    print "Processing time: %.3g (%.3g) sec" % (t0, tt)
    print "Gigaflops GPU: %.2f (%.2f)" % (flops / t0, flops / tt)

    ref = (c_float * sizeC)()

    # CPU reference timing.
    t1 = time()
    for i in range(L):
        sgemm(ref, h_A, h_B, M, N, K)
    t1 = time() - t1
    print "\nProcessing time: %.3g sec" % t1
    print "Gigaflops CPU  : %.2f" % (flops / t1)
    print "Speedup GPU/CPU: %.2f" % (t1 / t0)

    err, mxe = checkError(ref, h_C)
    print "\nAvg and max rel error = %.2e %.2e" % (err, mxe)
示例#19
0
def main(N = 1024,L = 100):
    M = N
    K = N >> 1
    N = N << 1
    flops = (2.e-9*M*N)*float(K*L)
    print "M = %d, N = %d, K = %d, L = %d; GFlops = %.1f\n" % (M,N,K,L,flops)
    na,nb,nc,alfa,beta = M*K,K*N,M*N,1.,0.

    t0 = time()
    device = cu_CUDA()
    device.getSourceModule("gpuFunctions.cubin")
    gpuSGEMM = device.getFunction("gpuSGEMM")

    sizeA = M*K
    sizeB = K*N
    sizeC = M*N

    h_A = (c_float*sizeA)()
    h_B = (c_float*sizeB)()

    arrayInit(h_A)
    arrayInit(h_B)

    d_A = getMemory(h_A)
    d_B = getMemory(h_B)
    d_C = getMemory(sizeC)

    cuFuncSetBlockShape(gpuSGEMM,BLOCK_SIZE,BLOCK_SIZE,1)
    cuFuncSetSharedSize(gpuSGEMM,2*BLOCK_SIZE*BLOCK_SIZE*S4)
    cuParamSeti(gpuSGEMM,0,d_C)
    cuParamSeti(gpuSGEMM,4,d_A)
    cuParamSeti(gpuSGEMM,8,d_B)
    cuParamSeti(gpuSGEMM,12,K)
    cuParamSeti(gpuSGEMM,16,N)
    cuParamSetSize(gpuSGEMM,20)
    tt = t0 = time()-t0
    print "Overhead driver API: %.3f sec\n" % t0

    t0 = time()
    cuCtxSynchronize()
    for i in range(L):
        cuLaunchGrid(gpuSGEMM,N/BLOCK_SIZE,M/BLOCK_SIZE)
    cuCtxSynchronize()
    t0 = time()-t0
    tt += t0

    h_C = (c_float*sizeC)()
    cuMemcpyDtoH(h_C,d_C,S4*sizeC)
    cuCtxSynchronize()

    cuMemFree(d_A)
    cuMemFree(d_B)
    cuMemFree(d_C)
    cuCtxDetach(device.context)

    print "Processing time: %.3g (%.3g) sec" % (t0,tt)
    print "Gigaflops GPU: %.2f (%.2f)" % (flops/t0,flops/tt)

    ref = (c_float*sizeC)()

    t1 = time()
    for i in range(L):
        sgemm(ref,h_A,h_B,M,N,K)
    t1 = time()-t1
    print "\nProcessing time: %.3g sec" % t1
    print "Gigaflops CPU  : %.2f" % (flops/t1)
    print "Speedup GPU/CPU: %.2f" % (t1/t0)

    err,mxe = checkError(ref,h_C)
    print "\nAvg and max rel error = %.2e %.2e" % (err,mxe)
示例#20
0
def main(device,vlength = 128,loops = 1):
    """Black-Scholes benchmark via the CUDA driver API: price `vlength`
    options with gpuBLSC and compare timing/accuracy against cpuBLSC.

    device  -- wrapper object exposing loaded kernels in .functions
    vlength -- number of options (vector length)
    loops   -- repetition count for timing
    """

    n2 = vlength ## Vector length

    gpuBLSC = device.functions["gpuBLSC"]

    # Host inputs: spot (S), strike (X), expiry (T); outputs: call (C),
    # put (P).
    h_S = (c_float*n2)()
    h_X = (c_float*n2)()
    h_T = (c_float*n2)()
    h_C = (c_float*n2)()
    h_P = (c_float*n2)()

    # Random market data; R = riskless rate, V = volatility.
    randInit(h_S,5.,30.)
    randInit(h_X,1.,100.)
    randInit(h_T,.25,10.)
    R,V = .03,.3

    d_S = getMemory(h_S)
    d_X = getMemory(h_X)
    d_T = getMemory(h_T)
    d_C = getMemory(h_C)
    d_P = getMemory(h_P)

    # Argument layout (4-byte slots): C,P,S,X,T pointers, float R,
    # float V, int n.
    cuFuncSetBlockShape(gpuBLSC,BLOCK_SIZE,1,1)
    cuParamSeti(gpuBLSC, 0,d_C)
    cuParamSeti(gpuBLSC, 4,d_P)
    cuParamSeti(gpuBLSC, 8,d_S)
    cuParamSeti(gpuBLSC,12,d_X)
    cuParamSeti(gpuBLSC,16,d_T)
    cuParamSetf(gpuBLSC,20,R)
    cuParamSetf(gpuBLSC,24,V)
    cuParamSeti(gpuBLSC,28,n2)
    cuParamSetSize(gpuBLSC,32)

    # Timed GPU section.
    cuCtxSynchronize()
    t0 = time()
    for i in range(loops):
        cuLaunchGrid(gpuBLSC,GRID_SIZE,1)
    cuCtxSynchronize()
    t0 = time()-t0

    # NOTE(review): the 2.e-6 scale suggests Mflop-based units in the
    # printed columns -- confirm against the caller's table header.
    flops = (2.e-6*n2)*float(loops)
    g_C = (c_float*n2)()
    g_P = (c_float*n2)()
    cuMemcpyDtoH(g_C,d_C,n2*S4)
    cuMemcpyDtoH(g_P,d_P,n2*S4)
    cuCtxSynchronize()

    cuMemFree(d_S)
    cuMemFree(d_X)
    cuMemFree(d_T)
    cuMemFree(d_C)
    cuMemFree(d_P)

    # CPU reference timing.
    t1 = time()
    for i in range(loops):
        cpuBLSC(h_C,h_P,h_S,h_X,h_T,R,V,n2)
    t1 = time()-t1
    print "%10d%10.2f%10.2f" % (vlength,flops/t1,flops/t0)

    if checkErrorFlag:
        err,mxe = checkError(h_C,g_C)
        print "Avg rel error (call) = %.2e" % (err,)
        err,mxe = checkError(h_P,g_P)
        print "Avg rel error (put)  = %.2e" % (err,)
示例#21
0
# --- FFT timing epilogue (script fragment: kr, r, dims, sz, size, doComplex
# and the fftw_start/wall_start timestamps are defined earlier in the
# script, outside this view) ---
# xr averages the accumulated time over the kr round trips (two transforms
# each, hence the 0.5).
xr = float(.5 )/float(kr)

# Select the complex-to-complex or real-to-complex transform pair.
if doComplex:
    text = "complex"
    rcfftx = xfft.ccfft
    crfftx = xfft.icfft
else:
    text = "   real"
    rcfftx = xfft.rcfft
    crfftx = xfft.crfft
# Run kr forward/inverse round trips; z holds the final reconstruction.
for k in range(0,kr):
    c = rcfftx(r,dims)
    z = crfftx(c,dims)

fftw_end = time.clock()
wall_end = time.time()

# dif: CPU time, wif: wall-clock time for the whole loop.
dif = fftw_end - fftw_start
wif = wall_end - wall_start
print "\nfft elapsed real time     : %8.3f seconds" % wif
print "%d-D %s-to-complex fft: %8.3f seconds" % (len(dims),text,dif*xr)

# 5*N*log2(N) flops per transform, two transforms per round trip.
flops = 2.*5.e-9*log(size)*size*kr/log(2.)
print "Performance               : %8.3f GFlops" % (flops/wif)
dif = dif * xr * sz
print "%d-D %s-to-complex fft: %8.3f µs/point\n" % (len(dims),text,dif)

# NOTE(review): rz is computed but never used in this fragment.
rz = 1./size
# Round-trip accuracy: compare the input r with the reconstruction z.
err,mxe = checkError(r,z)
print "avg and max error         : %8.1e %8.1e" %  (err,mxe)
示例#22
0
def main(N = 1024,L = 100):
    M = N
    K = N >> 1
    N = N << 1
    flops = (2.e-9*M*N)*float(K*L)
    print "M = %d, N = %d, K = %d, L = %d; GFlops = %.1f\n" % (M,N,K,L,flops)
    na,nb,nc,alfa,beta = M*K,K*N,M*N,1.,0.

    t0 = time()

    sizeA = M*K
    sizeB = K*N
    sizeC = M*N

    h_A = (c_float*sizeA)()
    h_B = (c_float*sizeB)()

    arrayInit(h_A)
    arrayInit(h_B)

    d_A = getMemory(h_A)
    d_B = getMemory(h_B)
    d_C = getMemory(sizeC)

    blockDim  = dim3(BLOCK_SIZE,BLOCK_SIZE,1)
    gridDim   = dim3(N/BLOCK_SIZE,M/BLOCK_SIZE,1)
    sharedMem = S4*2*BLOCK_SIZE*BLOCK_SIZE
    tt = t0 = time()-t0
    print "Overhead runtime API: %.3f sec\n" % t0

    t0 = time()
    cudaThreadSynchronize()
    for i in range(L):
        cudaConfigureCall(gridDim,blockDim,sharedMem,0)
        gpuSGEMM(d_C,d_A,d_B,K,N)
    cudaThreadSynchronize()
    t0 = time()-t0
    tt += t0

    h_C = (c_float*sizeC)()
    cudaMemcpy(h_C,d_C,S4*sizeC,cudaMemcpyDeviceToHost)

    cudaThreadSynchronize()

    cudaFree(d_A)
    cudaFree(d_B)
    cudaFree(d_C)

    cudaThreadExit()
    print "Processing time: %.3g (%.3g) sec" % (t0,tt)
    print "Gigaflops GPU: %.2f (%.2f)" % (flops/t0,flops/tt)

    ref = (c_float*sizeC)()

    t1 = time()
    for i in range(L):
        sgemm(ref,h_A,h_B,M,N,K)
    t1 = time()-t1
    print "\nProcessing time: %.3g sec" % t1
    print "Gigaflops CPU  : %.2f" % (flops/t1)
    print "Speedup GPU/CPU: %.2f" % (t1/t0)

    err,mxe = checkError(ref,h_C)
    print "\nAvg and max rel error = %.2e %.2e" % (err,mxe)
示例#23
0
def main(N=1024, L=100):
    # Benchmark single-precision matrix multiply (SGEMM) on GPU vs CPU:
    # C = A * B with A (M x K) and B (K x N), where M = N_in, K = N_in/2,
    # N = 2*N_in.  The GPU kernel is launched L times and timed against
    # L calls of the CPU reference sgemm; results compared via checkError.
    M = N
    K = N >> 1
    N = N << 1
    # 2*M*N*K flops per SGEMM, L iterations, scaled by 1e-9 to GFlops.
    flops = (2.e-9 * M * N) * float(K * L)
    print "M = %d, N = %d, K = %d, L = %d; GFlops = %.1f\n" % (M, N, K, L,
                                                               flops)
    # NOTE(review): na/nb/nc/alfa/beta are never used below.
    na, nb, nc, alfa, beta = M * K, K * N, M * N, 1., 0.

    # Time runtime-API / allocation overhead separately from the kernel.
    t0 = time()

    sizeA = M * K
    sizeB = K * N
    sizeC = M * N

    h_A = (c_float * sizeA)()
    h_B = (c_float * sizeB)()

    arrayInit(h_A)
    arrayInit(h_B)

    # Inputs are copied to the device; the output buffer is only allocated.
    d_A = getMemory(h_A)
    d_B = getMemory(h_B)
    d_C = getMemory(sizeC)

    blockDim = dim3(BLOCK_SIZE, BLOCK_SIZE, 1)
    gridDim = dim3(N / BLOCK_SIZE, M / BLOCK_SIZE, 1)
    # Two BLOCK_SIZE x BLOCK_SIZE float tiles of shared memory per block.
    sharedMem = S4 * 2 * BLOCK_SIZE * BLOCK_SIZE
    tt = t0 = time() - t0
    print "Overhead runtime API: %.3f sec\n" % t0

    t0 = time()
    cudaThreadSynchronize()
    for i in range(L):
        # cudaConfigureCall must directly precede each kernel launch.
        cudaConfigureCall(gridDim, blockDim, sharedMem, 0)
        gpuSGEMM(d_C, d_A, d_B, K, N)
    cudaThreadSynchronize()
    t0 = time() - t0
    tt += t0

    h_C = (c_float * sizeC)()
    cudaMemcpy(h_C, d_C, S4 * sizeC, cudaMemcpyDeviceToHost)

    cudaThreadSynchronize()

    cudaFree(d_A)
    cudaFree(d_B)
    cudaFree(d_C)

    cudaThreadExit()
    print "Processing time: %.3g (%.3g) sec" % (t0, tt)
    print "Gigaflops GPU: %.2f (%.2f)" % (flops / t0, flops / tt)

    # CPU reference run for validation and speedup comparison.
    ref = (c_float * sizeC)()

    t1 = time()
    for i in range(L):
        sgemm(ref, h_A, h_B, M, N, K)
    t1 = time() - t1
    print "\nProcessing time: %.3g sec" % t1
    print "Gigaflops CPU  : %.2f" % (flops / t1)
    print "Speedup GPU/CPU: %.2f" % (t1 / t0)

    err, mxe = checkError(ref, h_C)
    print "\nAvg and max rel error = %.2e %.2e" % (err, mxe)
示例#24
0
def main(check=False,doComplex=False,dims=(128,)):
    # Benchmark a forward + inverse FFT round trip on the GPU via CUFFT.
    #
    # check:     if True, also run a CPU FFT round trip (xf module) and
    #            compare against both the original and the GPU result.
    # doComplex: True -> complex-to-complex transforms (C2C),
    #            False -> real-to-complex / complex-to-real (R2C + C2R).
    # dims:      transform dimensions, 1 to 3 entries.
    print "+------------------------+"
    print "| Fast Fourier Transform |"
    print "| using CUDA runtime API |"
    print "+------------------------+\n"
    dims = tuple(dims)
    ndim = len(dims)
    # Header format strings indexed by dimensionality (1-3).
    v = ("","NX = %d","NX = %d NY = %d","NX = %d NY = %d NZ = %d")
    # SC: total number of points; SR: padded real-transform size, the last
    # dimension padded to 2*(n/2+1) floats as CUFFT R2C layout requires.
    SC = reduce(lambda x,y:x*y,dims)
    SR = reduce(lambda x,y:x*y,dims[:-1],1)
    SR *= 2*(dims[-1]/2+1)

    print v[ndim] % dims
    print "< doComplex: %s >\n" % doComplex

    # Normalization factor for the un-normalized inverse transform.
    rz = 1./float(SC)
    # 5*N*log2(N) flops per transform, forward + inverse, in GFlops.
    flops = 2.*5.*SC*log(SC)/log(2.)*1.e-9
    if doComplex:
        # Complex data: two floats per point.
        SC *= 2
    # NOTE(review): local S4 shadows any module-level S4 used elsewhere.
    S4 = sizeof(c_float)

    # NOTE(review): sz (buffer size in MiB) is computed but never used here.
    if doComplex:
        sz = S4*(SC+SC)/(1024*1024)
    else:
        sz = S4*(SC+SR)/(1024*1024)

    h_A = (c_float*SC)()
    g_A = (c_float*SC)()
    arrayInit(h_A)

    d_A = getMemory(h_A)
    # NOTE(review): allocate is hard-coded True, so the non-allocating
    # rcfft/crfft path below (and the elif guard) is effectively dead.
    allocate = True

    if doComplex:
        d_B = getMemory(SC)
    elif allocate:
        d_B = getMemory(SR)

    if doComplex:
        plan = gf.makePlan(dims,CUFFT_C2C)
    else:
        # Real round trip needs two plans: R2C forward, C2R inverse.
        plan1 = gf.makePlan(dims,CUFFT_R2C)
        plan2 = gf.makePlan(dims,CUFFT_C2R)

    t0 = time()
    x0 = ReadTimestampCounter()
    cudaThreadSynchronize()

    # Forward transform into d_B, inverse back into d_A.
    if doComplex:
        d_B = gf.ccfft(plan,d_A,None,d_B)
        d_A = gf.icfft(plan,d_B,None,d_A)
    else:
        if allocate:
            d_B = gf.rcfft(plan1,d_A,None,d_B)
            d_A = gf.crfft(plan2,d_B,None,d_A)
        else:
            d_B = gf.rcfft(plan1,d_A,SR)
            cuMemFree(d_A)
            d_A = gf.crfft(plan2,d_B,SR)

    cudaThreadSynchronize()
    t0 = time()-t0
    x1 = ReadTimestampCounter()
    # Cycles -> µs; fc assumes a 2.8 GHz clock -- TODO confirm for this host.
    fc = 1.e-3/2.8
    print "RDTSC: %.0f µs" % ((x1-x0)*fc)

    cudaMemcpy(g_A,d_A,S4*SC,cudaMemcpyDeviceToHost)

    cudaFree(d_A)
    cudaFree(d_B)

    if doComplex:
        cufftDestroy(plan)
    else:
        cufftDestroy(plan1)
        cufftDestroy(plan2)

    cudaThreadExit()
    # Undo the inverse transform's implicit scaling by SC.
    scale(g_A,rz)

    print "\nProcessing time: %.3g sec" % t0
    print "Gigaflops GPU  : %.2f" % (flops/t0)
    gflops = (flops/t0,)

    print "\nError CPU initial vs GPU"
    err,mxe = checkError(h_A,g_A)
    stats = err,mxe
    print "Avg and max rel error = %.2e %.2e\n" % (err,mxe)

    if check:
        # Optional CPU reference round trip for validation and speedup.
        t1 = time()
        if doComplex:
            h_B = xf.ccfft(h_A,dims)
            h_B = xf.icfft(h_B,dims)
        else:
            h_B = xf.rcfft(h_A,dims)
            h_B = xf.crfft(h_B,dims)
        t1 = time()-t1
        print "Processing time: %.3g sec" % t1
        print "Gigaflops CPU  : %.2f" % (flops/t1)
        print "Speedup GPU/CPU: %.2f" % (t1/t0)

        print "\nError CPU final vs CPU initial"
        err,mxe = checkError(h_B,h_A)
        print "Avg and max rel error = %.2e %.2e" % (err,mxe)

        print "\nError CPU final vs GPU"
        err,mxe = checkError(h_B,g_A)
        print "Avg and max rel error = %.2e %.2e" % (err,mxe)
    # Summary line: speedup is -1 when no CPU check was run.
    f = (-1.,)
    if check:
        f = (t1/t0,)
    fmt = "\n## "+" ".join(len(dims)*["%3d"])+" : %.1f %.1f: %.2e %.2e"
    print fmt % (dims+gflops+f+stats)
示例#25
0
# CPU FFT timing loop (kr, doComplex, dims, r, size, sz and the
# fftw_start/wall_start timestamps are defined earlier, outside this view).
# xr averages over kr iterations and halves, since each iteration does two
# transforms (forward + inverse).
xr = float(.5) / float(kr)

# Select the transform pair and label for the report.
if doComplex:
    text = "complex"
    rcfftx = xfft.ccfft
    crfftx = xfft.icfft
else:
    text = "   real"
    rcfftx = xfft.rcfft
    crfftx = xfft.crfft
# Round-trip r -> c -> z, kr times.
for k in range(0, kr):
    c = rcfftx(r, dims)
    z = crfftx(c, dims)

fftw_end = time.clock()
wall_end = time.time()

# dif: CPU time, wif: wall-clock time for the whole loop.
dif = fftw_end - fftw_start
wif = wall_end - wall_start
print "\nfft elapsed real time     : %8.3f seconds" % wif
print "%d-D %s-to-complex fft: %8.3f seconds" % (len(dims), text, dif * xr)

# 5*N*log2(N) flops per transform, 2 per iteration, kr iterations; GFlops.
flops = 2. * 5.e-9 * log(size) * size * kr / log(2.)
print "Performance               : %8.3f GFlops" % (flops / wif)
# Rescale CPU time to a per-point figure; sz is presumably the µs scale
# factor per point -- defined earlier, TODO confirm.
dif = dif * xr * sz
print "%d-D %s-to-complex fft: %8.3f µs/point\n" % (len(dims), text, dif)

# NOTE(review): rz is computed but not used in this span.
rz = 1. / size
# Compare the round-tripped data z against the original input r.
err, mxe = checkError(r, z)
print "avg and max error         : %8.1e %8.1e" % (err, mxe)
示例#26
0
def main(device, vlength=128, loops=1):
    # Benchmark sin/cos evaluation on the GPU (CUDA driver API) vs CPU.
    #
    # device:  wrapper object exposing loaded kernels via .functions and
    #          the loaded modules via .modules (defined outside this view).
    # vlength: vector length (number of arguments to evaluate).
    # loops:   number of timed kernel launches / CPU passes.
    print "+-----------------------+"
    print "|   Simple  TRIG Test   |"
    print "| using CUDA driver API |"
    print "+-----------------------+"
    # NOTE(review): log2n is a module-level global, not defined here.
    print "params: %2d %5dK %3d\n" % (log2n, vlength >> 10, loops),

    n2 = vlength  ## Vector length

    # TRIGTex is about 1.5x faster than TRIG
    #    name = "TRIG"
    name = "TRIGTex"

    TRIG = device.functions[name]
    mod0 = device.modules[0]

    sizeV = S4 * n2
    h_Arg = (c_float * n2)()
    h_Cos = (c_float * n2)()
    h_Sin = (c_float * n2)()

    vectorInit(h_Arg)

    # Argument vector copied to device; result buffers only allocated.
    d_Arg = getMemory(h_Arg)
    d_Cos = getMemory(n2)
    d_Sin = getMemory(n2)

    # Bind the argument buffer to the module's "Arg" texture reference
    # (used only by the TRIGTex variant).
    tex = devMemToTex(mod0, "Arg", d_Arg, sizeV)

    cuFuncSetBlockShape(TRIG, BLOCK_SIZE, 1, 1)
    # Driver-API parameter passing: byte offsets 0,4,8,12 imply 4-byte
    # parameters (pointers passed via cuParamSeti) -- offsets are
    # order-sensitive, do not reorder these calls.
    cuParamSeti(TRIG, 0, d_Cos)
    cuParamSeti(TRIG, 4, d_Sin)
    if name != "TRIGTex":
        # Plain TRIG kernel reads its argument vector as a pointer param.
        cuParamSeti(TRIG, 8, d_Arg)
        cuParamSeti(TRIG, 12, n2)
        cuParamSetSize(TRIG, 16)
    else:
        # TRIGTex reads the arguments through the bound texture instead.
        cuParamSetTexRef(TRIG, CU_PARAM_TR_DEFAULT, tex)
        cuParamSeti(TRIG, 8, n2)
        cuParamSetSize(TRIG, 12)
    cuCtxSynchronize()

    t0 = time()
    for i in range(loops):
        cuLaunchGrid(TRIG, GRID_SIZE, 1)
    cuCtxSynchronize()
    t0 = time() - t0

    g_Cos = (c_float * n2)()
    g_Sin = (c_float * n2)()
    cuMemcpyDtoH(g_Cos, d_Cos, sizeV)
    cuMemcpyDtoH(g_Sin, d_Sin, sizeV)
    cuCtxSynchronize()

    cuMemFree(d_Arg)
    cuMemFree(d_Cos)
    cuMemFree(d_Sin)

    # CPU reference pass for comparison.
    t1 = time()
    for i in range(loops):
        cpuTRIG(h_Cos, h_Sin, h_Arg)
    t1 = time() - t1

    # One sin + one cos counted as 2 flops per element, in Gflop (time is
    # converted to msec below, so flops/t stays in GFlops).
    flopsg = (2.e-6 * n2) * float(loops)
    flopsc = flopsg

    t0 *= 1.e3
    t1 *= 1.e3
    print "\n       time[msec]    GFlops\n"
    print "GPU: %12.1f%10.2f" % (t0, flopsg / t0)
    print "CPU: %12.1f%10.2f" % (t1, flopsc / t1)
    print "     %12.1f" % (t1 / t0)

    # Scale errors to last-bit units of a 24-bit float mantissa (2^23).
    x = float(1 << 23)
    e, m = checkTrig(g_Cos, g_Sin)
    print "\n", name, "internal check GPU"
    print "%8.1e %8.1e" % (e, m)
    print "%8.1f %8.1f" % (e * x, m * x)

    e, m = checkTrig(h_Cos, h_Sin)
    print "\n", name, "internal check CPU"
    print "%8.1e %8.1e" % (e, m)
    print "%8.1f %8.1f" % (e * x, m * x)

    print "\n", "check between CPU and GPU"
    err, mxe = checkError(h_Cos, g_Cos)
    print "Avg and max abs error (cos) = %8.1e %8.1e" % (err, mxe)
    print "                              %8.1f %8.1f" % (err * x, mxe * x)
    err, mxe = checkError(h_Sin, g_Sin)
    print "Avg and max abs error (sin) = %8.1e %8.1e" % (err, mxe)
    print "                              %8.1f %8.1f" % (err * x, mxe * x)