示例#1
0
def main(vlength=128, loops=1):

    n2 = vlength  ## Vector length

    h_X = (c_float * n2)()
    h_Y = (c_float * n2)()
    h_Z = (c_float * n2)()

    vectorInit(h_X)

    d_X = getMemory(h_X)
    d_Y = getMemory(h_Y)
    d_Z = getMemory(h_Z)

    blockDim = dim3(BLOCK_SIZE, 1, 1)
    gridDim = dim3(GRID_SIZE, 1, 1)

    t0 = time()
    cudaThreadSynchronize()
    for i in range(loops):
        cudaConfigureCall(gridDim, blockDim, 0, 0)
        gpuTRIG(d_Y, d_Z, d_X, n2)
    cudaThreadSynchronize()
    t0 = time() - t0

    flops = (2.e-9 * n2) * float(loops)
    g_Y = (c_float * n2)()
    cudaMemcpy(g_Y, d_Y, S4 * n2, cudaMemcpyDeviceToHost)
    cudaThreadSynchronize()

    flops = (8.e-9 * n2) * float(loops)
    g_Y = (c_float * n2)()
    g_Z = (c_float * n2)()
    cudaMemcpy(g_Y, d_Y, S4 * n2, cudaMemcpyDeviceToHost)
    cudaMemcpy(g_Z, d_Z, S4 * n2, cudaMemcpyDeviceToHost)
    cudaThreadSynchronize()

    cudaFree(d_X)
    cudaFree(d_Y)
    cudaFree(d_Z)

    cudaThreadExit()
    t1 = time()
    for i in range(loops):
        cpuTRIG(h_Y, h_Z, h_X)
    t1 = time() - t1
    print "%10d%6.2f%6.2f GFlops" % (vlength, flops / t1, flops / t0)

    if checkErrorFlag:
        err, mxe = checkError(h_Y, g_Y, n2)
        print "Avg and max rel error (cos) = %.2e %.2e" % (err, mxe)
        err, mxe = checkError(h_Z, g_Z, n2)
        print "Avg and max rel error (sin) = %.2e %.2e" % (err, mxe)
示例#2
0
def main(vlength = 128,loops = 1):

    n2 = vlength ## Vector length

    h_X = (c_float*n2)()
    h_Y = (c_float*n2)()
    h_Z = (c_float*n2)()

    vectorInit(h_X)

    d_X = getMemory(h_X)
    d_Y = getMemory(h_Y)
    d_Z = getMemory(h_Z)

    blockDim  = dim3(BLOCK_SIZE,1,1)
    gridDim   = dim3(GRID_SIZE,1,1)

    t0 = time()
    cudaThreadSynchronize()
    for i in range(loops):
        cudaConfigureCall(gridDim,blockDim,0,0)
        gpuTRIG(d_Y,d_Z,d_X,n2)
    cudaThreadSynchronize()
    t0 = time()-t0

    flops = (2.e-9*n2)*float(loops)
    g_Y = (c_float*n2)()
    cudaMemcpy(g_Y,d_Y,S4*n2,cudaMemcpyDeviceToHost)
    cudaThreadSynchronize()

    flops = (8.e-9*n2)*float(loops)
    g_Y = (c_float*n2)()
    g_Z = (c_float*n2)()
    cudaMemcpy(g_Y,d_Y,S4*n2,cudaMemcpyDeviceToHost)
    cudaMemcpy(g_Z,d_Z,S4*n2,cudaMemcpyDeviceToHost)
    cudaThreadSynchronize()

    cudaFree(d_X)
    cudaFree(d_Y)
    cudaFree(d_Z)

    cudaThreadExit()
    t1 = time()
    for i in range(loops):
        cpuTRIG(h_Y,h_Z,h_X)
    t1 = time()-t1
    print "%10d%6.2f%6.2f GFlops" % (vlength,flops/t1,flops/t0)

    if checkErrorFlag:
        err,mxe = checkError(h_Y,g_Y,n2)
        print "Avg and max rel error (cos) = %.2e %.2e" % (err,mxe)
        err,mxe = checkError(h_Z,g_Z,n2)
        print "Avg and max rel error (sin) = %.2e %.2e" % (err,mxe)
示例#3
0
def main(device,vlength = 128,loops = 1):

    n2 = vlength ## Vector length
    gpuTRIG = device.functions["gpuTRIG"]

    h_X = (c_float*n2)()
    h_Y = (c_float*n2)()
    h_Z = (c_float*n2)()

    vectorInit(h_X)

    d_X = getMemory(h_X)
    d_Y = getMemory(h_Y)
    d_Z = getMemory(h_Z)

    cuFuncSetBlockShape(gpuTRIG,BLOCK_SIZE,1,1)
    cuParamSeti(gpuTRIG,0,d_Y)
    cuParamSeti(gpuTRIG,4,d_Z)
    cuParamSeti(gpuTRIG,8,d_X)
    cuParamSeti(gpuTRIG,12,n2)
    cuParamSetSize(gpuTRIG,16)

    cuCtxSynchronize()
    t0 = time()
    for i in range(loops):
        cuLaunchGrid(gpuTRIG,GRID_SIZE,1)
    cuCtxSynchronize()
    t0 = time()-t0

    flops = (8.e-9*n2)*float(loops)
    g_Y = (c_float*n2)()
    g_Z = (c_float*n2)()
    cuMemcpyDtoH(g_Y,d_Y,S4*n2)
    cuMemcpyDtoH(g_Z,d_Z,S4*n2)
    cuCtxSynchronize()

    cuMemFree(d_X)
    cuMemFree(d_Y)
    cuMemFree(d_Z)

    t1 = time()
    for i in range(loops):
        cpuTRIG(h_Y,h_Z,h_X)
    t1 = time()-t1
    print "%10d%6.2f%6.2f GFlops" % (vlength,flops/t1,flops/t0)

    if checkErrorFlag:
        err,mxe = checkError(h_Y,g_Y,n2)
        print "Avg and max rel error (cos) = %.2e %.2e" % (err,mxe)
        err,mxe = checkError(h_Z,g_Z,n2)
        print "Avg and max rel error (sin) = %.2e %.2e" % (err,mxe)
示例#4
0
def main(device, vlength=128, loops=1):

    n2 = vlength  ## Vector length
    gpuTRIG = device.functions["gpuTRIG"]

    h_X = (c_float * n2)()
    h_Y = (c_float * n2)()
    h_Z = (c_float * n2)()

    vectorInit(h_X)

    d_X = getMemory(h_X)
    d_Y = getMemory(h_Y)
    d_Z = getMemory(h_Z)

    cuFuncSetBlockShape(gpuTRIG, BLOCK_SIZE, 1, 1)
    cuParamSeti(gpuTRIG, 0, d_Y)
    cuParamSeti(gpuTRIG, 4, d_Z)
    cuParamSeti(gpuTRIG, 8, d_X)
    cuParamSeti(gpuTRIG, 12, n2)
    cuParamSetSize(gpuTRIG, 16)

    cuCtxSynchronize()
    t0 = time()
    for i in range(loops):
        cuLaunchGrid(gpuTRIG, GRID_SIZE, 1)
    cuCtxSynchronize()
    t0 = time() - t0

    flops = (8.e-9 * n2) * float(loops)
    g_Y = (c_float * n2)()
    g_Z = (c_float * n2)()
    cuMemcpyDtoH(g_Y, d_Y, S4 * n2)
    cuMemcpyDtoH(g_Z, d_Z, S4 * n2)
    cuCtxSynchronize()

    cuMemFree(d_X)
    cuMemFree(d_Y)
    cuMemFree(d_Z)

    t1 = time()
    for i in range(loops):
        cpuTRIG(h_Y, h_Z, h_X)
    t1 = time() - t1
    print "%10d%6.2f%6.2f GFlops" % (vlength, flops / t1, flops / t0)

    if checkErrorFlag:
        err, mxe = checkError(h_Y, g_Y, n2)
        print "Avg and max rel error (cos) = %.2e %.2e" % (err, mxe)
        err, mxe = checkError(h_Z, g_Z, n2)
        print "Avg and max rel error (sin) = %.2e %.2e" % (err, mxe)
示例#5
0
def main(device,vlength = 128,loops = 1,m1 = 1):
    print "%5d %5d %5d" % (l,loops,m1),

    alfa = c_float(.5)
    n2 = vlength ## Vector length

    mp = 1 << (m1-1)
    print "%5d" % (mp*psize),
    fcn = "gpuPOLY%d"%(mp*psize)
    gpuPOLY = device.functions[fcn]
    h_X = (c_float*n2)()
    h_Y = (c_float*n2)()
    g_Y = (c_float*n2)()

    vectorInit(h_X)

    d_X = getMemory(h_X)
    d_Y = getMemory(h_Y)

    cuFuncSetBlockShape(gpuPOLY,BLOCK_SIZE,1,1)
    cuParamSeti(gpuPOLY,0,d_X)
    cuParamSeti(gpuPOLY,4,d_Y)
    cuParamSeti(gpuPOLY,8,n2)
    cuParamSetSize(gpuPOLY,12)

    cuCtxSynchronize()
    cuLaunchGrid(gpuPOLY,GRID_SIZE,1)
    t0 = time()
    for i in range(loops):
        cuLaunchGrid(gpuPOLY,GRID_SIZE,1)
    cuCtxSynchronize()
    t0 = time()-t0

    flops = (2.e-9*m1*n2*(psize-1))*float(loops)
    cuMemcpyDtoH(g_Y,d_Y,n2*S4)
    cuCtxSynchronize()

    cuMemFree(d_X)
    cuMemFree(d_Y)

    cpuPOLY = eval("cpuPOLY%d" % (mp*psize))
    t1 = time()
    for i in range(loops):
        cpuPOLY(h_X,h_Y)
    t1 = time()-t1
    print "%10d%6.2f%6.2f" % (vlength,flops/t1,flops/t0)

    if checkErrorFlag:
        err,mxe = checkError(h_Y,g_Y)
        print "Avg and max rel error = %.2e %.2e" % (err,mxe)
示例#6
0
def runTest(vlength = 128,loops = 1):
    n2 = vlength*vlength
    alfa = c_float(.5)

    cublasInit()

    h_X = (c_float*n2)()
    h_Y = (c_float*n2)()
    g_Y = (c_float*n2)()
    vectorInit(h_X)
    vectorInit(h_Y)

    d_X = c_void_p()
    d_Y = c_void_p()
    cublasAlloc(n2, sizeof(c_float), byref(d_X))
    cublasAlloc(n2, sizeof(c_float), byref(d_Y))
 
    cublasSetVector(n2, sizeof(c_float), h_X, 1, d_X, 1)
    cublasSetVector(n2, sizeof(c_float), h_Y, 1, d_Y, 1)

    flops = (2.e-9*n2)*float(loops)
    t0 = time()
    for i in range(loops):
        cublasSaxpy(n2, alfa, d_X, 1, d_Y, 1)
    cudaThreadSynchronize()
    t0 = time()-t0

    print "Processing time: %.3g sec" % t0
    print "Gigaflops GPU: %.2f (%d)" % (flops/t0,n2)

    t1 = time()
    for i in range(loops):
        cpuSAXPY(alfa,h_X,h_Y)
    t1 = time()-t1
    
    print "\nProcessing time: %.3g sec" % t1
    print "Gigaflops CPU: %.2f" % (flops/t1)
    print "GPU vs. CPU  : %.2f" % (t1/t0)

    cublasGetVector(n2, sizeof(c_float), d_Y, 1, g_Y, 1)
    err,mxe = checkError(h_Y,g_Y)
    print "\nAvg and max rel error = %.2e %.2e" % (err,mxe)

    cublasFree(d_X)
    cublasFree(d_Y)

    cublasShutdown()
示例#7
0
def main(vlength = 128,loops = 1,m1 = 1):
    print "%5d %5d %5d" % (l,loops,m1),

    alfa = c_float(.5)
    n2 = vlength ## Vector length

    mp = 1 << (m1-1)
    print "%5d" % (mp*psize),
    gpuPOLY = eval("gpuPOLY%d"%(mp*psize))
    h_X = (c_float*n2)()
    h_Y = (c_float*n2)()
    g_Y = (c_float*n2)()

    vectorInit(h_X)

    d_X = getMemory(h_X)
    d_Y = getMemory(h_Y)

    blockDim  = dim3(BLOCK_SIZE,1,1)
    gridDim   = dim3(GRID_SIZE,1,1)

    t0 = time()
    cudaThreadSynchronize()
    for i in range(loops):
        cudaConfigureCall(gridDim,blockDim,0,0)
        gpuPOLY(d_X,d_Y,n2)
    cudaThreadSynchronize()
    t0 = time()-t0

    flops = (2.e-9*m1*n2*(psize-1))*float(loops)
    cudaMemcpy(g_Y,d_Y,S4*n2,cudaMemcpyDeviceToHost)
    cudaThreadSynchronize()

    cudaFree(d_X)
    cudaFree(d_Y)

    cudaThreadExit()
    cpuPOLY = eval("cpuPOLY%d" % (mp*psize))
    t1 = time()
    for i in range(loops):
        cpuPOLY(h_X,h_Y)
    t1 = time()-t1
    print "%10d%6.2f%6.2f" % (vlength,flops/t1,flops/t0)

    if checkErrorFlag:
        err,mxe = checkError(h_Y,g_Y)
        print "Avg and max rel error = %.2e %.2e" % (err,mxe)
示例#8
0
def main(vlength = 128,loops = 1,m1 = 1):
    print "%5d %5d %5d" % (l,loops,m1),

    alfa = c_float(.5)
    n2 = vlength ## Vector length

    mp = 1 << (m1-1)
    print "%5d" % (mp*psize),
    gpuPOLY = eval("gpuPOLY%d"%(mp*psize))
    h_X = (c_float*n2)()
    h_Y = (c_float*n2)()
    g_Y = (c_float*n2)()

    vectorInit(h_X)

    d_X = getMemory(h_X)
    d_Y = getMemory(h_Y)

    blockDim  = dim3(BLOCK_SIZE,1,1)
    gridDim   = dim3(GRID_SIZE,1,1)

    t0 = time()
    cudaThreadSynchronize()
    for i in range(loops):
        cudaConfigureCall(gridDim,blockDim,0,0)
        gpuPOLY(d_X,d_Y,n2)
    cudaThreadSynchronize()
    t0 = time()-t0

    flops = (2.e-9*m1*n2*(psize-1))*float(loops)
    cudaMemcpy(g_Y,d_Y,S4*n2,cudaMemcpyDeviceToHost)
    cudaThreadSynchronize()

    cudaFree(d_X)
    cudaFree(d_Y)

    cudaThreadExit()
    cpuPOLY = eval("cpuPOLY%d" % (mp*psize))
    t1 = time()
    for i in range(loops):
        cpuPOLY(h_X,h_Y)
    t1 = time()-t1
    print "%10d%6.2f%6.2f" % (vlength,flops/t1,flops/t0)

    if checkErrorFlag:
        err,mxe = checkError(h_Y,g_Y)
        print "Avg and max rel error = %.2e %.2e" % (err,mxe)
示例#9
0
def runTest(vlength = 128,loops = 1):
    n2 = vlength*vlength
    alfa = c_float(.5)

    cublasInit()

    h_X = (c_float*n2)()
    h_Y = (c_float*n2)()
    vectorInit(h_X)
    vectorInit(h_Y)

    d_X = c_void_p()
    d_Y = c_void_p()
    cublasAlloc(n2, sizeof(c_float), byref(d_X))
    cublasAlloc(n2, sizeof(c_float), byref(d_Y))
 
    cublasSetVector(n2, sizeof(c_float), h_X, 1, d_X, 1)
    cublasSetVector(n2, sizeof(c_float), h_Y, 1, d_Y, 1)

    flops = (2.e-9*n2)*float(loops)
    s0 = 0.
    t0 = time()
    for i in range(loops):
        s0 += cublasSdot(n2, d_X, 1, d_Y, 1)
    cudaThreadSynchronize()
    t0 = time()-t0

    print "Processing time: %.3g sec" % t0
    print "Gigaflops GPU: %.2f (%d)" % (flops/t0,n2)

    s1 = 0.
    t1 = time()
    for i in range(loops):
        s1 += cpuSDOT(h_X,h_Y)
    t1 = time()-t1
    print "\nProcessing time: %.3g sec" % t1
    print "Gigaflops CPU: %.2f" % (flops/t1)
    print "GPU vs. CPU  : %.2f" % (t1/t0)

    sx = max(1.e-7,max(abs(s0),abs(s1)))
    err = abs(s1-s0)/sx
    print "\nError = %.2e" % err

    cublasFree(d_X)
    cublasFree(d_Y)

    cublasShutdown()
示例#10
0
def main(device,vlength = 128,loops = 1):
    print "+-----------------------+"
    print "|   Simple  TRIG Test   |"
    print "| using CUDA driver API |"
    print "+-----------------------+"
    print "params: %2d %5dK %3d\n" % (log2n,vlength >> 10,loops),

    n2 = vlength ## Vector length

    # TRIGTex is about 1.5x faster than TRIG
#    name = "TRIG"
    name = "TRIGTex"

    TRIG = device.functions[name]
    mod0 = device.modules[0]

    sizeV = S4*n2
    h_Arg = (c_float*n2)()
    h_Cos = (c_float*n2)()
    h_Sin = (c_float*n2)()

    vectorInit(h_Arg)

    d_Arg = getMemory(h_Arg)
    d_Cos = getMemory(n2)
    d_Sin = getMemory(n2)

    tex = devMemToTex(mod0,"Arg",d_Arg,sizeV)

    cuFuncSetBlockShape(TRIG,BLOCK_SIZE,1,1)
    cuParamSeti(TRIG,0,d_Cos)
    cuParamSeti(TRIG,4,d_Sin)
    if name != "TRIGTex":
        cuParamSeti(TRIG,8,d_Arg)
        cuParamSeti(TRIG,12,n2)
        cuParamSetSize(TRIG,16)
    else:
        cuParamSetTexRef(TRIG,CU_PARAM_TR_DEFAULT,tex)
        cuParamSeti(TRIG,8,n2)
        cuParamSetSize(TRIG,12)
    cuCtxSynchronize()

    t0 = time()
    for i in range(loops):
        cuLaunchGrid(TRIG,GRID_SIZE,1)
    cuCtxSynchronize()
    t0 = time()-t0

    g_Cos = (c_float*n2)()
    g_Sin = (c_float*n2)()
    cuMemcpyDtoH(g_Cos,d_Cos,sizeV)
    cuMemcpyDtoH(g_Sin,d_Sin,sizeV)
    cuCtxSynchronize()

    cuMemFree(d_Arg)
    cuMemFree(d_Cos)
    cuMemFree(d_Sin)

    t1 = time()
    for i in range(loops):
        cpuTRIG(h_Cos,h_Sin,h_Arg)
    t1 = time()-t1

    flopsg = (2.e-6*n2)*float(loops)
    flopsc = flopsg

    t0 *= 1.e3;
    t1 *= 1.e3;
    print "\n       time[msec]    GFlops\n"
    print "GPU: %12.1f%10.2f" % (t0,flopsg/t0)
    print "CPU: %12.1f%10.2f" % (t1,flopsc/t1)
    print "     %12.1f" % (t1/t0)

    x = float(1 << 23)
    e,m = checkTrig(g_Cos,g_Sin)
    print "\n",name, "internal check GPU"
    print "%8.1e %8.1e" % (e,m)
    print "%8.1f %8.1f" % (e*x,m*x)

    e,m = checkTrig(h_Cos,h_Sin)
    print "\n",name, "internal check CPU"
    print "%8.1e %8.1e" % (e,m)
    print "%8.1f %8.1f" % (e*x,m*x)

    print "\n","check between CPU and GPU"
    err,mxe = checkError(h_Cos,g_Cos)
    print "Avg and max abs error (cos) = %8.1e %8.1e" % (err,mxe)
    print "                              %8.1f %8.1f" % (err*x,mxe*x)
    err,mxe = checkError(h_Sin,g_Sin)
    print "Avg and max abs error (sin) = %8.1e %8.1e" % (err,mxe)
    print "                              %8.1f %8.1f" % (err*x,mxe*x)
示例#11
0
def main(device, vlength=128, loops=1):
    print "+-----------------------+"
    print "|   Simple  TRIG Test   |"
    print "| using CUDA driver API |"
    print "+-----------------------+"
    print "params: %2d %5dK %3d\n" % (log2n, vlength >> 10, loops),

    n2 = vlength  ## Vector length

    # TRIGTex is about 1.5x faster than TRIG
    #    name = "TRIG"
    name = "TRIGTex"

    TRIG = device.functions[name]
    mod0 = device.modules[0]

    sizeV = S4 * n2
    h_Arg = (c_float * n2)()
    h_Cos = (c_float * n2)()
    h_Sin = (c_float * n2)()

    vectorInit(h_Arg)

    d_Arg = getMemory(h_Arg)
    d_Cos = getMemory(n2)
    d_Sin = getMemory(n2)

    tex = devMemToTex(mod0, "Arg", d_Arg, sizeV)

    cuFuncSetBlockShape(TRIG, BLOCK_SIZE, 1, 1)
    cuParamSeti(TRIG, 0, d_Cos)
    cuParamSeti(TRIG, 4, d_Sin)
    if name != "TRIGTex":
        cuParamSeti(TRIG, 8, d_Arg)
        cuParamSeti(TRIG, 12, n2)
        cuParamSetSize(TRIG, 16)
    else:
        cuParamSetTexRef(TRIG, CU_PARAM_TR_DEFAULT, tex)
        cuParamSeti(TRIG, 8, n2)
        cuParamSetSize(TRIG, 12)
    cuCtxSynchronize()

    t0 = time()
    for i in range(loops):
        cuLaunchGrid(TRIG, GRID_SIZE, 1)
    cuCtxSynchronize()
    t0 = time() - t0

    g_Cos = (c_float * n2)()
    g_Sin = (c_float * n2)()
    cuMemcpyDtoH(g_Cos, d_Cos, sizeV)
    cuMemcpyDtoH(g_Sin, d_Sin, sizeV)
    cuCtxSynchronize()

    cuMemFree(d_Arg)
    cuMemFree(d_Cos)
    cuMemFree(d_Sin)

    t1 = time()
    for i in range(loops):
        cpuTRIG(h_Cos, h_Sin, h_Arg)
    t1 = time() - t1

    flopsg = (2.e-6 * n2) * float(loops)
    flopsc = flopsg

    t0 *= 1.e3
    t1 *= 1.e3
    print "\n       time[msec]    GFlops\n"
    print "GPU: %12.1f%10.2f" % (t0, flopsg / t0)
    print "CPU: %12.1f%10.2f" % (t1, flopsc / t1)
    print "     %12.1f" % (t1 / t0)

    x = float(1 << 23)
    e, m = checkTrig(g_Cos, g_Sin)
    print "\n", name, "internal check GPU"
    print "%8.1e %8.1e" % (e, m)
    print "%8.1f %8.1f" % (e * x, m * x)

    e, m = checkTrig(h_Cos, h_Sin)
    print "\n", name, "internal check CPU"
    print "%8.1e %8.1e" % (e, m)
    print "%8.1f %8.1f" % (e * x, m * x)

    print "\n", "check between CPU and GPU"
    err, mxe = checkError(h_Cos, g_Cos)
    print "Avg and max abs error (cos) = %8.1e %8.1e" % (err, mxe)
    print "                              %8.1f %8.1f" % (err * x, mxe * x)
    err, mxe = checkError(h_Sin, g_Sin)
    print "Avg and max abs error (sin) = %8.1e %8.1e" % (err, mxe)
    print "                              %8.1f %8.1f" % (err * x, mxe * x)