def main(vlength = 128,loops = 1): n2 = vlength ## Vector length h_S = (c_float*n2)() h_X = (c_float*n2)() h_T = (c_float*n2)() h_C = (c_float*n2)() h_P = (c_float*n2)() randInit(h_S,5.,30.) randInit(h_X,1.,100.) randInit(h_T,.25,10.) R,V = .03,.3 d_S = getMemory(h_S) d_X = getMemory(h_X) d_T = getMemory(h_T) d_C = getMemory(h_C) d_P = getMemory(h_P) blockDim = dim3(BLOCK_SIZE,1,1) gridDim = dim3(GRID_SIZE,1,1) cudaThreadSynchronize() t0 = time() for i in range(loops): cudaConfigureCall(gridDim,blockDim,0,0) gpuBLSC(d_C,d_P,d_S,d_X,d_T,R,V,n2) cudaThreadSynchronize() t0 = time()-t0 flops = (2.e-6*n2)*float(loops) g_C = (c_float*n2)() g_P = (c_float*n2)() cudaMemcpy(g_C,d_C,S4*n2,cudaMemcpyDeviceToHost) cudaMemcpy(g_P,d_P,S4*n2,cudaMemcpyDeviceToHost) cudaThreadSynchronize() cudaFree(d_S) cudaFree(d_X) cudaFree(d_T) cudaFree(d_C) cudaFree(d_P) cudaThreadExit() t1 = time() for i in range(loops): cpuBLSC(h_C,h_P,h_S,h_X,h_T,R,V,n2) t1 = time()-t1 print "%10d%10.2f%10.2f" % (vlength,flops/t1,flops/t0) if checkErrorFlag: err,mxe = checkError(h_C,g_C) print "Avg rel error (call) = %.2e" % (err,) err,mxe = checkError(h_P,g_P) print "Avg rel error (put) = %.2e" % (err,)
def main(vlength = 128,loops = 1): n2 = vlength ## Vector length h_X = (c_float*n2)() h_Y = (c_float*n2)() h_Z = (c_float*n2)() vectorInit(h_X) d_X = getMemory(h_X) d_Y = getMemory(h_Y) d_Z = getMemory(h_Z) blockDim = dim3(BLOCK_SIZE,1,1) gridDim = dim3(GRID_SIZE,1,1) t0 = time() cudaThreadSynchronize() for i in range(loops): cudaConfigureCall(gridDim,blockDim,0,0) gpuTRIG(d_Y,d_Z,d_X,n2) cudaThreadSynchronize() t0 = time()-t0 flops = (2.e-9*n2)*float(loops) g_Y = (c_float*n2)() cudaMemcpy(g_Y,d_Y,S4*n2,cudaMemcpyDeviceToHost) cudaThreadSynchronize() flops = (8.e-9*n2)*float(loops) g_Y = (c_float*n2)() g_Z = (c_float*n2)() cudaMemcpy(g_Y,d_Y,S4*n2,cudaMemcpyDeviceToHost) cudaMemcpy(g_Z,d_Z,S4*n2,cudaMemcpyDeviceToHost) cudaThreadSynchronize() cudaFree(d_X) cudaFree(d_Y) cudaFree(d_Z) cudaThreadExit() t1 = time() for i in range(loops): cpuTRIG(h_Y,h_Z,h_X) t1 = time()-t1 print "%10d%6.2f%6.2f GFlops" % (vlength,flops/t1,flops/t0) if checkErrorFlag: err,mxe = checkError(h_Y,g_Y,n2) print "Avg and max rel error (cos) = %.2e %.2e" % (err,mxe) err,mxe = checkError(h_Z,g_Z,n2) print "Avg and max rel error (sin) = %.2e %.2e" % (err,mxe)
def main(vlength=128, loops=1): n2 = vlength ## Vector length h_X = (c_float * n2)() h_Y = (c_float * n2)() h_Z = (c_float * n2)() vectorInit(h_X) d_X = getMemory(h_X) d_Y = getMemory(h_Y) d_Z = getMemory(h_Z) blockDim = dim3(BLOCK_SIZE, 1, 1) gridDim = dim3(GRID_SIZE, 1, 1) t0 = time() cudaThreadSynchronize() for i in range(loops): cudaConfigureCall(gridDim, blockDim, 0, 0) gpuTRIG(d_Y, d_Z, d_X, n2) cudaThreadSynchronize() t0 = time() - t0 flops = (2.e-9 * n2) * float(loops) g_Y = (c_float * n2)() cudaMemcpy(g_Y, d_Y, S4 * n2, cudaMemcpyDeviceToHost) cudaThreadSynchronize() flops = (8.e-9 * n2) * float(loops) g_Y = (c_float * n2)() g_Z = (c_float * n2)() cudaMemcpy(g_Y, d_Y, S4 * n2, cudaMemcpyDeviceToHost) cudaMemcpy(g_Z, d_Z, S4 * n2, cudaMemcpyDeviceToHost) cudaThreadSynchronize() cudaFree(d_X) cudaFree(d_Y) cudaFree(d_Z) cudaThreadExit() t1 = time() for i in range(loops): cpuTRIG(h_Y, h_Z, h_X) t1 = time() - t1 print "%10d%6.2f%6.2f GFlops" % (vlength, flops / t1, flops / t0) if checkErrorFlag: err, mxe = checkError(h_Y, g_Y, n2) print "Avg and max rel error (cos) = %.2e %.2e" % (err, mxe) err, mxe = checkError(h_Z, g_Z, n2) print "Avg and max rel error (sin) = %.2e %.2e" % (err, mxe)
def main(device, vlength=128, loops=1): n2 = vlength ## Vector length gpuTRIG = device.functions["gpuTRIG"] h_X = (c_float * n2)() h_Y = (c_float * n2)() h_Z = (c_float * n2)() vectorInit(h_X) d_X = getMemory(h_X) d_Y = getMemory(h_Y) d_Z = getMemory(h_Z) cuFuncSetBlockShape(gpuTRIG, BLOCK_SIZE, 1, 1) cuParamSeti(gpuTRIG, 0, d_Y) cuParamSeti(gpuTRIG, 4, d_Z) cuParamSeti(gpuTRIG, 8, d_X) cuParamSeti(gpuTRIG, 12, n2) cuParamSetSize(gpuTRIG, 16) cuCtxSynchronize() t0 = time() for i in range(loops): cuLaunchGrid(gpuTRIG, GRID_SIZE, 1) cuCtxSynchronize() t0 = time() - t0 flops = (8.e-9 * n2) * float(loops) g_Y = (c_float * n2)() g_Z = (c_float * n2)() cuMemcpyDtoH(g_Y, d_Y, S4 * n2) cuMemcpyDtoH(g_Z, d_Z, S4 * n2) cuCtxSynchronize() cuMemFree(d_X) cuMemFree(d_Y) cuMemFree(d_Z) t1 = time() for i in range(loops): cpuTRIG(h_Y, h_Z, h_X) t1 = time() - t1 print "%10d%6.2f%6.2f GFlops" % (vlength, flops / t1, flops / t0) if checkErrorFlag: err, mxe = checkError(h_Y, g_Y, n2) print "Avg and max rel error (cos) = %.2e %.2e" % (err, mxe) err, mxe = checkError(h_Z, g_Z, n2) print "Avg and max rel error (sin) = %.2e %.2e" % (err, mxe)
def main(device,vlength = 128,loops = 1): n2 = vlength ## Vector length gpuTRIG = device.functions["gpuTRIG"] h_X = (c_float*n2)() h_Y = (c_float*n2)() h_Z = (c_float*n2)() vectorInit(h_X) d_X = getMemory(h_X) d_Y = getMemory(h_Y) d_Z = getMemory(h_Z) cuFuncSetBlockShape(gpuTRIG,BLOCK_SIZE,1,1) cuParamSeti(gpuTRIG,0,d_Y) cuParamSeti(gpuTRIG,4,d_Z) cuParamSeti(gpuTRIG,8,d_X) cuParamSeti(gpuTRIG,12,n2) cuParamSetSize(gpuTRIG,16) cuCtxSynchronize() t0 = time() for i in range(loops): cuLaunchGrid(gpuTRIG,GRID_SIZE,1) cuCtxSynchronize() t0 = time()-t0 flops = (8.e-9*n2)*float(loops) g_Y = (c_float*n2)() g_Z = (c_float*n2)() cuMemcpyDtoH(g_Y,d_Y,S4*n2) cuMemcpyDtoH(g_Z,d_Z,S4*n2) cuCtxSynchronize() cuMemFree(d_X) cuMemFree(d_Y) cuMemFree(d_Z) t1 = time() for i in range(loops): cpuTRIG(h_Y,h_Z,h_X) t1 = time()-t1 print "%10d%6.2f%6.2f GFlops" % (vlength,flops/t1,flops/t0) if checkErrorFlag: err,mxe = checkError(h_Y,g_Y,n2) print "Avg and max rel error (cos) = %.2e %.2e" % (err,mxe) err,mxe = checkError(h_Z,g_Z,n2) print "Avg and max rel error (sin) = %.2e %.2e" % (err,mxe)
def main(device,vlength = 128,loops = 1,m1 = 1):
    # Polynomial-evaluation benchmark via the CUDA driver API.
    # NOTE(review): `l` is not defined locally -- presumably a module-level
    # global (log2 of the length?); confirm it exists where this runs.
    print "%5d %5d %5d" % (l,loops,m1),
    alfa = c_float(.5)  # NOTE(review): unused in this function
    n2 = vlength ## Vector length
    mp = 1 << (m1-1)  # degree multiplier selecting the kernel variant
    print "%5d" % (mp*psize),
    # Pick the GPU kernel whose name encodes the polynomial size.
    fcn = "gpuPOLY%d"%(mp*psize)
    gpuPOLY = device.functions[fcn]
    h_X = (c_float*n2)()
    h_Y = (c_float*n2)()
    g_Y = (c_float*n2)()
    vectorInit(h_X)
    d_X = getMemory(h_X)
    d_Y = getMemory(h_Y)
    # Kernel parameters: input, output, count.
    cuFuncSetBlockShape(gpuPOLY,BLOCK_SIZE,1,1)
    cuParamSeti(gpuPOLY,0,d_X)
    cuParamSeti(gpuPOLY,4,d_Y)
    cuParamSeti(gpuPOLY,8,n2)
    cuParamSetSize(gpuPOLY,12)
    cuCtxSynchronize()
    cuLaunchGrid(gpuPOLY,GRID_SIZE,1)  # warm-up launch, excluded from timing
    t0 = time()
    for i in range(loops):
        cuLaunchGrid(gpuPOLY,GRID_SIZE,1)
    cuCtxSynchronize()
    t0 = time()-t0
    flops = (2.e-9*m1*n2*(psize-1))*float(loops)
    cuMemcpyDtoH(g_Y,d_Y,n2*S4)
    cuCtxSynchronize()
    cuMemFree(d_X)
    cuMemFree(d_Y)
    # Resolve the matching CPU reference implementation by name.
    cpuPOLY = eval("cpuPOLY%d" % (mp*psize))
    t1 = time()
    for i in range(loops):
        cpuPOLY(h_X,h_Y)
    t1 = time()-t1
    print "%10d%6.2f%6.2f" % (vlength,flops/t1,flops/t0)
    if checkErrorFlag:
        err,mxe = checkError(h_Y,g_Y)
        print "Avg and max rel error = %.2e %.2e" % (err,mxe)
def main(vlength = 128,loops = 1,m1 = 1):
    # Polynomial-evaluation benchmark via the CUDA runtime API.
    # NOTE(review): `l` is not defined locally -- presumably a module-level
    # global; confirm it exists where this runs.
    print "%5d %5d %5d" % (l,loops,m1),
    alfa = c_float(.5)  # NOTE(review): unused in this function
    n2 = vlength ## Vector length
    mp = 1 << (m1-1)  # degree multiplier selecting the kernel variant
    print "%5d" % (mp*psize),
    # Resolve the GPU kernel whose name encodes the polynomial size.
    gpuPOLY = eval("gpuPOLY%d"%(mp*psize))
    h_X = (c_float*n2)()
    h_Y = (c_float*n2)()
    g_Y = (c_float*n2)()
    vectorInit(h_X)
    d_X = getMemory(h_X)
    d_Y = getMemory(h_Y)
    blockDim = dim3(BLOCK_SIZE,1,1)
    gridDim = dim3(GRID_SIZE,1,1)
    t0 = time()
    cudaThreadSynchronize()
    for i in range(loops):
        cudaConfigureCall(gridDim,blockDim,0,0)
        gpuPOLY(d_X,d_Y,n2)
    cudaThreadSynchronize()
    t0 = time()-t0
    flops = (2.e-9*m1*n2*(psize-1))*float(loops)
    cudaMemcpy(g_Y,d_Y,S4*n2,cudaMemcpyDeviceToHost)
    cudaThreadSynchronize()
    cudaFree(d_X)
    cudaFree(d_Y)
    cudaThreadExit()
    # Resolve the matching CPU reference implementation by name.
    cpuPOLY = eval("cpuPOLY%d" % (mp*psize))
    t1 = time()
    for i in range(loops):
        cpuPOLY(h_X,h_Y)
    t1 = time()-t1
    print "%10d%6.2f%6.2f" % (vlength,flops/t1,flops/t0)
    if checkErrorFlag:
        err,mxe = checkError(h_Y,g_Y)
        print "Avg and max rel error = %.2e %.2e" % (err,mxe)
def runTest(vlength = 128,loops = 1): n2 = vlength*vlength alfa = c_float(.5) cublasInit() h_X = (c_float*n2)() h_Y = (c_float*n2)() g_Y = (c_float*n2)() vectorInit(h_X) vectorInit(h_Y) d_X = c_void_p() d_Y = c_void_p() cublasAlloc(n2, sizeof(c_float), byref(d_X)) cublasAlloc(n2, sizeof(c_float), byref(d_Y)) cublasSetVector(n2, sizeof(c_float), h_X, 1, d_X, 1) cublasSetVector(n2, sizeof(c_float), h_Y, 1, d_Y, 1) flops = (2.e-9*n2)*float(loops) t0 = time() for i in range(loops): cublasSaxpy(n2, alfa, d_X, 1, d_Y, 1) cudaThreadSynchronize() t0 = time()-t0 print "Processing time: %.3g sec" % t0 print "Gigaflops GPU: %.2f (%d)" % (flops/t0,n2) t1 = time() for i in range(loops): cpuSAXPY(alfa,h_X,h_Y) t1 = time()-t1 print "\nProcessing time: %.3g sec" % t1 print "Gigaflops CPU: %.2f" % (flops/t1) print "GPU vs. CPU : %.2f" % (t1/t0) cublasGetVector(n2, sizeof(c_float), d_Y, 1, g_Y, 1) err,mxe = checkError(h_Y,g_Y) print "\nAvg and max rel error = %.2e %.2e" % (err,mxe) cublasFree(d_X) cublasFree(d_Y) cublasShutdown()
def main(device, vlength=128, loops=1): alfa = c_float(.5) n2 = vlength ## Vector length gpuSAXPY = device.functions["gpuSAXPY"] h_X = (c_float * n2)() h_Y = (c_float * n2)() g_Y = (c_float * n2)() fixedInit(h_X) d_X = getMemory(h_X) d_Y = getMemory(h_Y) cuFuncSetBlockShape(gpuSAXPY, BLOCK_SIZE, 1, 1) cuParamSetf(gpuSAXPY, 0, alfa) cuParamSeti(gpuSAXPY, 4, d_X) cuParamSeti(gpuSAXPY, 8, d_Y) cuParamSeti(gpuSAXPY, 12, n2) cuParamSetSize(gpuSAXPY, 16) cuCtxSynchronize() t0 = time() for i in range(loops): cuLaunchGrid(gpuSAXPY, GRID_SIZE, 1) cuCtxSynchronize() t0 = time() - t0 flops = (2.e-9 * n2) * float(loops) cuMemcpyDtoH(g_Y, d_Y, n2 * S4) cuCtxSynchronize() cuMemFree(d_X) cuMemFree(d_Y) t1 = time() for i in range(loops): cpuSAXPY(alfa, h_X, h_Y) t1 = time() - t1 print "%10d%6.2f%6.2f" % (vlength, flops / t1, flops / t0) if checkErrorFlag: err, mxe = checkError(h_Y, g_Y) print "Avg and max rel error = %.2e %.2e" % (err, mxe)
def main(device, vlength=128, loops=1): alfa = c_float(0.5) n2 = vlength ## Vector length gpuSAXPY = device.functions["gpuSAXPY"] h_X = (c_float * n2)() h_Y = (c_float * n2)() g_Y = (c_float * n2)() fixedInit(h_X) d_X = getMemory(h_X) d_Y = getMemory(h_Y) cuFuncSetBlockShape(gpuSAXPY, BLOCK_SIZE, 1, 1) cuParamSetf(gpuSAXPY, 0, alfa) cuParamSeti(gpuSAXPY, 4, d_X) cuParamSeti(gpuSAXPY, 8, d_Y) cuParamSeti(gpuSAXPY, 12, n2) cuParamSetSize(gpuSAXPY, 16) cuCtxSynchronize() t0 = time() for i in range(loops): cuLaunchGrid(gpuSAXPY, GRID_SIZE, 1) cuCtxSynchronize() t0 = time() - t0 flops = (2.0e-9 * n2) * float(loops) cuMemcpyDtoH(g_Y, d_Y, n2 * S4) cuCtxSynchronize() cuMemFree(d_X) cuMemFree(d_Y) t1 = time() for i in range(loops): cpuSAXPY(alfa, h_X, h_Y) t1 = time() - t1 print "%10d%6.2f%6.2f" % (vlength, flops / t1, flops / t0) if checkErrorFlag: err, mxe = checkError(h_Y, g_Y) print "Avg and max rel error = %.2e %.2e" % (err, mxe)
def main(vlength = 128,loops = 1): alfa = c_float(.5) n2 = vlength ## Vector length h_X = (c_float*n2)() h_Y = (c_float*n2)() g_Y = (c_float*n2)() fixedInit(h_X) d_X = getMemory(h_X) d_Y = getMemory(h_Y) blockDim = dim3(BLOCK_SIZE,1,1) gridDim = dim3(GRID_SIZE,1,1) t0 = time() cudaThreadSynchronize() for i in range(loops): cudaConfigureCall(gridDim,blockDim,0,0) gpuSAXPY(alfa,d_X,d_Y,n2) cudaThreadSynchronize() t0 = time()-t0 flops = (2.e-9*n2)*float(loops) g_Y = (c_float*n2)() cudaMemcpy(g_Y,d_Y,S4*n2,cudaMemcpyDeviceToHost) cudaThreadSynchronize() cudaFree(d_X) cudaFree(d_Y) cudaThreadExit() t1 = time() for i in range(loops): cpuSAXPY(alfa,h_X,h_Y) t1 = time()-t1 print "%10d%6.2f%6.2f" % (vlength,flops/t1,flops/t0) if checkErrorFlag: err,mxe = checkError(h_Y,g_Y) print "Avg and max rel error = %.2e %.2e" % (err,mxe)
def main(device,vlength = 128,loops = 1):
    # Texture-based cos/sin benchmark via the CUDA driver API; prints timing,
    # throughput, and several internal and CPU-vs-GPU accuracy checks.
    print "+-----------------------+"
    print "| Simple TRIG Test |"
    print "| using CUDA driver API |"
    print "+-----------------------+"
    # NOTE(review): log2n is not defined locally -- presumably module-level.
    print "params: %2d %5dK %3d\n" % (log2n,vlength >> 10,loops),
    n2 = vlength ## Vector length
    # TRIGTex is about 1.5x faster than TRIG
    # name = "TRIG"
    name = "TRIGTex"
    TRIG = device.functions[name]
    mod0 = device.modules[0]
    sizeV = S4*n2
    h_Arg = (c_float*n2)()
    h_Cos = (c_float*n2)()
    h_Sin = (c_float*n2)()
    vectorInit(h_Arg)
    d_Arg = getMemory(h_Arg)
    d_Cos = getMemory(n2)
    d_Sin = getMemory(n2)
    # Bind the device argument buffer to the module's "Arg" texture reference.
    tex = devMemToTex(mod0,"Arg",d_Arg,sizeV)
    cuFuncSetBlockShape(TRIG,BLOCK_SIZE,1,1)
    cuParamSeti(TRIG,0,d_Cos)
    cuParamSeti(TRIG,4,d_Sin)
    if name != "TRIGTex":
        # Plain kernel takes its input as a pointer parameter.
        cuParamSeti(TRIG,8,d_Arg)
        cuParamSeti(TRIG,12,n2)
        cuParamSetSize(TRIG,16)
    else:
        # Texture kernel reads the input through the bound texture instead.
        cuParamSetTexRef(TRIG,CU_PARAM_TR_DEFAULT,tex)
        cuParamSeti(TRIG,8,n2)
        cuParamSetSize(TRIG,12)
    cuCtxSynchronize()
    t0 = time()
    for i in range(loops):
        cuLaunchGrid(TRIG,GRID_SIZE,1)
    cuCtxSynchronize()
    t0 = time()-t0
    g_Cos = (c_float*n2)()
    g_Sin = (c_float*n2)()
    cuMemcpyDtoH(g_Cos,d_Cos,sizeV)
    cuMemcpyDtoH(g_Sin,d_Sin,sizeV)
    cuCtxSynchronize()
    cuMemFree(d_Arg)
    cuMemFree(d_Cos)
    cuMemFree(d_Sin)
    # CPU reference run.
    t1 = time()
    for i in range(loops):
        cpuTRIG(h_Cos,h_Sin,h_Arg)
    t1 = time()-t1
    flopsg = (2.e-6*n2)*float(loops)
    flopsc = flopsg
    # Convert both timings from seconds to milliseconds for the report.
    t0 *= 1.e3; t1 *= 1.e3;
    print "\n time[msec] GFlops\n"
    print "GPU: %12.1f%10.2f" % (t0,flopsg/t0)
    print "CPU: %12.1f%10.2f" % (t1,flopsc/t1)
    print " %12.1f" % (t1/t0)
    # Errors are also reported scaled by 2^23 -- presumably units of
    # single-precision ulps; TODO confirm.
    x = float(1 << 23)
    e,m = checkTrig(g_Cos,g_Sin)
    print "\n",name, "internal check GPU"
    print "%8.1e %8.1e" % (e,m)
    print "%8.1f %8.1f" % (e*x,m*x)
    e,m = checkTrig(h_Cos,h_Sin)
    print "\n",name, "internal check CPU"
    print "%8.1e %8.1e" % (e,m)
    print "%8.1f %8.1f" % (e*x,m*x)
    print "\n","check between CPU and GPU"
    err,mxe = checkError(h_Cos,g_Cos)
    print "Avg and max abs error (cos) = %8.1e %8.1e" % (err,mxe)
    print " %8.1f %8.1f" % (err*x,mxe*x)
    err,mxe = checkError(h_Sin,g_Sin)
    print "Avg and max abs error (sin) = %8.1e %8.1e" % (err,mxe)
    print " %8.1f %8.1f" % (err*x,mxe*x)
def main(check=False, doComplex=False, dims=(128, )):
    # FFT round-trip benchmark (forward + inverse) via CUFFT through the
    # CUDA runtime API, optionally checked against a CPU FFT (`xf`).
    print "+------------------------+"
    print "| Fast Fourier Transform |"
    print "| using CUDA runtime API |"
    print "+------------------------+\n"
    dims = tuple(dims)
    ndim = len(dims)
    v = ("", "NX = %d", "NX = %d NY = %d", "NX = %d NY = %d NZ = %d")
    # SC: total complex-domain element count; SR: padded real-domain count
    # (last axis padded to 2*(n/2+1) as required by real-to-complex FFTs).
    SC = reduce(lambda x, y: x * y, dims)
    SR = reduce(lambda x, y: x * y, dims[:-1], 1)
    SR *= 2 * (dims[-1] / 2 + 1)
    print v[ndim] % dims
    print "< doComplex: %s >\n" % doComplex
    rz = 1. / float(SC)  # normalization factor for the round trip
    # 5*N*log2(N) flop estimate, doubled for forward + inverse.
    flops = 2. * 5. * SC * log(SC) / log(2.) * 1.e-9
    if doComplex:
        SC *= 2  # interleaved real/imag floats
    S4 = sizeof(c_float)
    if doComplex:
        sz = S4 * (SC + SC) / (1024 * 1024)  # total footprint in MiB
    else:
        sz = S4 * (SC + SR) / (1024 * 1024)
    h_A = (c_float * SC)()
    g_A = (c_float * SC)()
    arrayInit(h_A)
    d_A = getMemory(h_A)
    # `allocate` selects the pre-allocated-output code path below.
    allocate = True
    if doComplex:
        d_B = getMemory(SC)
    elif allocate:
        d_B = getMemory(SR)
    if doComplex:
        plan = gf.makePlan(dims, CUFFT_C2C)
    else:
        plan1 = gf.makePlan(dims, CUFFT_R2C)
        plan2 = gf.makePlan(dims, CUFFT_C2R)
    t0 = time()
    x0 = ReadTimestampCounter()
    cudaThreadSynchronize()
    # Forward transform into d_B, inverse back into d_A.
    if doComplex:
        d_B = gf.ccfft(plan, d_A, None, d_B)
        d_A = gf.icfft(plan, d_B, None, d_A)
    else:
        if allocate:
            d_B = gf.rcfft(plan1, d_A, None, d_B)
            d_A = gf.crfft(plan2, d_B, None, d_A)
        else:
            # Alternate path: let the wrappers allocate, freeing d_A between.
            d_B = gf.rcfft(plan1, d_A, SR)
            cuMemFree(d_A)
            d_A = gf.crfft(plan2, d_B, SR)
    cudaThreadSynchronize()
    t0 = time() - t0
    x1 = ReadTimestampCounter()
    # Convert TSC ticks to microseconds assuming a 2.8 GHz clock --
    # NOTE(review): hardcoded frequency; adjust for other machines.
    fc = 1.e-3 / 2.8
    print "RDTSC: %.0f µs" % ((x1 - x0) * fc)
    cudaMemcpy(g_A, d_A, S4 * SC, cudaMemcpyDeviceToHost)
    cudaFree(d_A)
    cudaFree(d_B)
    if doComplex:
        cufftDestroy(plan)
    else:
        cufftDestroy(plan1)
        cufftDestroy(plan2)
    cudaThreadExit()
    # Undo the size scaling introduced by the unnormalized round trip.
    scale(g_A, rz)
    print "\nProcessing time: %.3g sec" % t0
    print "Gigaflops GPU : %.2f" % (flops / t0)
    gflops = (flops / t0, )
    print "\nError CPU initial vs GPU"
    err, mxe = checkError(h_A, g_A)
    stats = err, mxe
    print "Avg and max rel error = %.2e %.2e\n" % (err, mxe)
    if check:
        # Optional CPU round trip for cross-validation.
        t1 = time()
        if doComplex:
            h_B = xf.ccfft(h_A, dims)
            h_B = xf.icfft(h_B, dims)
        else:
            h_B = xf.rcfft(h_A, dims)
            h_B = xf.crfft(h_B, dims)
        t1 = time() - t1
        print "Processing time: %.3g sec" % t1
        print "Gigaflops CPU : %.2f" % (flops / t1)
        print "Speedup GPU/CPU: %.2f" % (t1 / t0)
        print "\nError CPU final vs CPU initial"
        err, mxe = checkError(h_B, h_A)
        print "Avg and max rel error = %.2e %.2e" % (err, mxe)
        print "\nError CPU final vs GPU"
        err, mxe = checkError(h_B, g_A)
        print "Avg and max rel error = %.2e %.2e" % (err, mxe)
    # Machine-readable summary line (-1. marks "speedup not measured").
    f = (-1., )
    if check:
        f = (t1 / t0, )
    fmt = "\n## " + " ".join(len(dims) * ["%3d"]) + " : %.1f %.1f: %.2e %.2e"
    print fmt % (dims + gflops + f + stats)
def main(N = 1024,L = 100): M = N K = N >> 1 N = N << 1 flops = (2.e-9*M*N)*float(K*L) print "M = %d, N = %d, K = %d, L = %d; GFlops = %.1f\n" % (M,N,K,L,flops) na,nb,nc,alfa,beta = M*K,K*N,M*N,1.,0. t0 = time() cublasInit() h_A = (c_float*na)() h_B = (c_float*nb)() h_C = (c_float*nc)() g_C = (c_float*nc)() arrayInit(h_A,na) arrayInit(h_B,nb) arrayInit(h_C,nc) d_A = c_void_p() d_B = c_void_p() d_C = c_void_p() cublasAlloc(na, sizeof(c_float), byref(d_A)) cublasAlloc(nb, sizeof(c_float), byref(d_B)) cublasAlloc(nc, sizeof(c_float), byref(d_C)) cublasSetVector(na, sizeof(c_float), h_A, 1, d_A, 1) cublasSetVector(nb, sizeof(c_float), h_B, 1, d_B, 1) cublasSetVector(nc, sizeof(c_float), h_C, 1, d_C, 1) tt = t0 = time()-t0 print "Overhead CUBLAS: %.3f sec\n" % t0 t0 = time() for i in range(L): cublasSgemm('n', 'n', M, N, K, alfa, d_A, M, d_B, K, beta, d_C, M) cudaThreadSynchronize() t0 = time()-t0 tt += t0 print "Processing time: %.3g (%.3g) sec" % (t0,tt) print "Gigaflops GPU: %.2f (%.2f)" % (flops/t0,flops/tt) t1 = time() for i in range(L): sgemm(h_C,h_A,h_B,M,N,K) t1 = time()-t1 print "\nProcessing time: %.3g sec" % t1 print "Gigaflops CPU : %.2f" % (flops/t1) print "Speedup GPU/CPU: %.2f" % (t1/t0) cublasGetVector(nc, sizeof(c_float), d_C, 1, g_C, 1) err,mxe = checkError(h_C,g_C,nc) print "\nAvg and max rel error = %.2e %.2e" % (err,mxe) cublasFree(d_A) cublasFree(d_B) cublasFree(d_C) cublasShutdown()
def main(N=1024, L=100):
    # SGEMM benchmark via the CUDA driver API, checked against the CPU sgemm.
    M = N
    K = N >> 1
    N = N << 1  # rectangular problem: (M x K) * (K x N)
    flops = (2.e-9 * M * N) * float(K * L)
    print "M = %d, N = %d, K = %d, L = %d; GFlops = %.1f\n" % (M, N, K, L, flops)
    # NOTE(review): na/nb/nc/alfa/beta are unused in this variant.
    na, nb, nc, alfa, beta = M * K, K * N, M * N, 1., 0.
    t0 = time()
    device = cu_CUDA()
    device.getSourceModule("gpuFunctions.cubin")
    gpuSGEMM = device.getFunction("gpuSGEMM")
    sizeA = M * K
    sizeB = K * N
    sizeC = M * N
    h_A = (c_float * sizeA)()
    h_B = (c_float * sizeB)()
    arrayInit(h_A)
    arrayInit(h_B)
    d_A = getMemory(h_A)
    d_B = getMemory(h_B)
    d_C = getMemory(sizeC)
    # Shared-memory size covers two BLOCK_SIZE x BLOCK_SIZE float tiles --
    # presumably one tile each for A and B; confirm against the kernel.
    cuFuncSetBlockShape(gpuSGEMM, BLOCK_SIZE, BLOCK_SIZE, 1)
    cuFuncSetSharedSize(gpuSGEMM, 2 * BLOCK_SIZE * BLOCK_SIZE * S4)
    cuParamSeti(gpuSGEMM, 0, d_C)
    cuParamSeti(gpuSGEMM, 4, d_A)
    cuParamSeti(gpuSGEMM, 8, d_B)
    cuParamSeti(gpuSGEMM, 12, K)
    cuParamSeti(gpuSGEMM, 16, N)
    cuParamSetSize(gpuSGEMM, 20)
    tt = t0 = time() - t0  # setup overhead; tt accumulates total time
    print "Overhead driver API: %.3f sec\n" % t0
    t0 = time()
    cuCtxSynchronize()
    for i in range(L):
        cuLaunchGrid(gpuSGEMM, N / BLOCK_SIZE, M / BLOCK_SIZE)
    cuCtxSynchronize()
    t0 = time() - t0
    tt += t0
    h_C = (c_float * sizeC)()
    cuMemcpyDtoH(h_C, d_C, S4 * sizeC)
    cuCtxSynchronize()
    cuMemFree(d_A)
    cuMemFree(d_B)
    cuMemFree(d_C)
    cuCtxDetach(device.context)
    print "Processing time: %.3g (%.3g) sec" % (t0, tt)
    print "Gigaflops GPU: %.2f (%.2f)" % (flops / t0, flops / tt)
    # CPU reference run.
    ref = (c_float * sizeC)()
    t1 = time()
    for i in range(L):
        sgemm(ref, h_A, h_B, M, N, K)
    t1 = time() - t1
    print "\nProcessing time: %.3g sec" % t1
    print "Gigaflops CPU : %.2f" % (flops / t1)
    print "Speedup GPU/CPU: %.2f" % (t1 / t0)
    err, mxe = checkError(ref, h_C)
    print "\nAvg and max rel error = %.2e %.2e" % (err, mxe)
def main(N = 1024,L = 100): M = N K = N >> 1 N = N << 1 flops = (2.e-9*M*N)*float(K*L) print "M = %d, N = %d, K = %d, L = %d; GFlops = %.1f\n" % (M,N,K,L,flops) na,nb,nc,alfa,beta = M*K,K*N,M*N,1.,0. t0 = time() device = cu_CUDA() device.getSourceModule("gpuFunctions.cubin") gpuSGEMM = device.getFunction("gpuSGEMM") sizeA = M*K sizeB = K*N sizeC = M*N h_A = (c_float*sizeA)() h_B = (c_float*sizeB)() arrayInit(h_A) arrayInit(h_B) d_A = getMemory(h_A) d_B = getMemory(h_B) d_C = getMemory(sizeC) cuFuncSetBlockShape(gpuSGEMM,BLOCK_SIZE,BLOCK_SIZE,1) cuFuncSetSharedSize(gpuSGEMM,2*BLOCK_SIZE*BLOCK_SIZE*S4) cuParamSeti(gpuSGEMM,0,d_C) cuParamSeti(gpuSGEMM,4,d_A) cuParamSeti(gpuSGEMM,8,d_B) cuParamSeti(gpuSGEMM,12,K) cuParamSeti(gpuSGEMM,16,N) cuParamSetSize(gpuSGEMM,20) tt = t0 = time()-t0 print "Overhead driver API: %.3f sec\n" % t0 t0 = time() cuCtxSynchronize() for i in range(L): cuLaunchGrid(gpuSGEMM,N/BLOCK_SIZE,M/BLOCK_SIZE) cuCtxSynchronize() t0 = time()-t0 tt += t0 h_C = (c_float*sizeC)() cuMemcpyDtoH(h_C,d_C,S4*sizeC) cuCtxSynchronize() cuMemFree(d_A) cuMemFree(d_B) cuMemFree(d_C) cuCtxDetach(device.context) print "Processing time: %.3g (%.3g) sec" % (t0,tt) print "Gigaflops GPU: %.2f (%.2f)" % (flops/t0,flops/tt) ref = (c_float*sizeC)() t1 = time() for i in range(L): sgemm(ref,h_A,h_B,M,N,K) t1 = time()-t1 print "\nProcessing time: %.3g sec" % t1 print "Gigaflops CPU : %.2f" % (flops/t1) print "Speedup GPU/CPU: %.2f" % (t1/t0) err,mxe = checkError(ref,h_C) print "\nAvg and max rel error = %.2e %.2e" % (err,mxe)
def main(device,vlength = 128,loops = 1): n2 = vlength ## Vector length gpuBLSC = device.functions["gpuBLSC"] h_S = (c_float*n2)() h_X = (c_float*n2)() h_T = (c_float*n2)() h_C = (c_float*n2)() h_P = (c_float*n2)() randInit(h_S,5.,30.) randInit(h_X,1.,100.) randInit(h_T,.25,10.) R,V = .03,.3 d_S = getMemory(h_S) d_X = getMemory(h_X) d_T = getMemory(h_T) d_C = getMemory(h_C) d_P = getMemory(h_P) cuFuncSetBlockShape(gpuBLSC,BLOCK_SIZE,1,1) cuParamSeti(gpuBLSC, 0,d_C) cuParamSeti(gpuBLSC, 4,d_P) cuParamSeti(gpuBLSC, 8,d_S) cuParamSeti(gpuBLSC,12,d_X) cuParamSeti(gpuBLSC,16,d_T) cuParamSetf(gpuBLSC,20,R) cuParamSetf(gpuBLSC,24,V) cuParamSeti(gpuBLSC,28,n2) cuParamSetSize(gpuBLSC,32) cuCtxSynchronize() t0 = time() for i in range(loops): cuLaunchGrid(gpuBLSC,GRID_SIZE,1) cuCtxSynchronize() t0 = time()-t0 flops = (2.e-6*n2)*float(loops) g_C = (c_float*n2)() g_P = (c_float*n2)() cuMemcpyDtoH(g_C,d_C,n2*S4) cuMemcpyDtoH(g_P,d_P,n2*S4) cuCtxSynchronize() cuMemFree(d_S) cuMemFree(d_X) cuMemFree(d_T) cuMemFree(d_C) cuMemFree(d_P) t1 = time() for i in range(loops): cpuBLSC(h_C,h_P,h_S,h_X,h_T,R,V,n2) t1 = time()-t1 print "%10d%10.2f%10.2f" % (vlength,flops/t1,flops/t0) if checkErrorFlag: err,mxe = checkError(h_C,g_C) print "Avg rel error (call) = %.2e" % (err,) err,mxe = checkError(h_P,g_P) print "Avg rel error (put) = %.2e" % (err,)
# NOTE(review): fragment -- the tail of an FFT timing routine whose `def`
# line (and the definitions of r, dims, kr, sz, size, doComplex, xfft,
# fftw_start, wall_start) lies outside this view; confirm against the
# full source before relying on the surrounding context.
xr = float(.5 )/float(kr)  # per-transform scale: round trip = 2 FFTs
if doComplex:
    text = "complex"
    rcfftx = xfft.ccfft
    crfftx = xfft.icfft
else:
    text = " real"
    rcfftx = xfft.rcfft
    crfftx = xfft.crfft
# Repeat forward + inverse transforms kr times.
for k in range(0,kr):
    c = rcfftx(r,dims)
    z = crfftx(c,dims)
fftw_end = time.clock()
wall_end = time.time()
dif = fftw_end - fftw_start  # CPU time
wif = wall_end - wall_start  # wall-clock time
print "\nfft elapsed real time : %8.3f seconds" % wif
print "%d-D %s-to-complex fft: %8.3f seconds" % (len(dims),text,dif*xr)
# 5*N*log2(N) flop estimate per transform, two transforms per iteration.
flops = 2.*5.e-9*log(size)*size*kr/log(2.)
print "Performance : %8.3f GFlops" % (flops/wif)
dif = dif * xr * sz
print "%d-D %s-to-complex fft: %8.3f µs/point\n" % (len(dims),text,dif)
rz = 1./size
# Compare the round-tripped data against the original input.
err,mxe = checkError(r,z)
print "avg and max error : %8.1e %8.1e" % (err,mxe)
def main(N = 1024,L = 100): M = N K = N >> 1 N = N << 1 flops = (2.e-9*M*N)*float(K*L) print "M = %d, N = %d, K = %d, L = %d; GFlops = %.1f\n" % (M,N,K,L,flops) na,nb,nc,alfa,beta = M*K,K*N,M*N,1.,0. t0 = time() sizeA = M*K sizeB = K*N sizeC = M*N h_A = (c_float*sizeA)() h_B = (c_float*sizeB)() arrayInit(h_A) arrayInit(h_B) d_A = getMemory(h_A) d_B = getMemory(h_B) d_C = getMemory(sizeC) blockDim = dim3(BLOCK_SIZE,BLOCK_SIZE,1) gridDim = dim3(N/BLOCK_SIZE,M/BLOCK_SIZE,1) sharedMem = S4*2*BLOCK_SIZE*BLOCK_SIZE tt = t0 = time()-t0 print "Overhead runtime API: %.3f sec\n" % t0 t0 = time() cudaThreadSynchronize() for i in range(L): cudaConfigureCall(gridDim,blockDim,sharedMem,0) gpuSGEMM(d_C,d_A,d_B,K,N) cudaThreadSynchronize() t0 = time()-t0 tt += t0 h_C = (c_float*sizeC)() cudaMemcpy(h_C,d_C,S4*sizeC,cudaMemcpyDeviceToHost) cudaThreadSynchronize() cudaFree(d_A) cudaFree(d_B) cudaFree(d_C) cudaThreadExit() print "Processing time: %.3g (%.3g) sec" % (t0,tt) print "Gigaflops GPU: %.2f (%.2f)" % (flops/t0,flops/tt) ref = (c_float*sizeC)() t1 = time() for i in range(L): sgemm(ref,h_A,h_B,M,N,K) t1 = time()-t1 print "\nProcessing time: %.3g sec" % t1 print "Gigaflops CPU : %.2f" % (flops/t1) print "Speedup GPU/CPU: %.2f" % (t1/t0) err,mxe = checkError(ref,h_C) print "\nAvg and max rel error = %.2e %.2e" % (err,mxe)
def main(N=1024, L=100):
    # SGEMM benchmark via the CUDA runtime API, checked against the CPU sgemm.
    M = N
    K = N >> 1
    N = N << 1  # rectangular problem: (M x K) * (K x N)
    flops = (2.e-9 * M * N) * float(K * L)
    print "M = %d, N = %d, K = %d, L = %d; GFlops = %.1f\n" % (M, N, K, L, flops)
    # NOTE(review): na/nb/nc/alfa/beta are unused in this variant.
    na, nb, nc, alfa, beta = M * K, K * N, M * N, 1., 0.
    t0 = time()
    sizeA = M * K
    sizeB = K * N
    sizeC = M * N
    h_A = (c_float * sizeA)()
    h_B = (c_float * sizeB)()
    arrayInit(h_A)
    arrayInit(h_B)
    d_A = getMemory(h_A)
    d_B = getMemory(h_B)
    d_C = getMemory(sizeC)
    blockDim = dim3(BLOCK_SIZE, BLOCK_SIZE, 1)
    gridDim = dim3(N / BLOCK_SIZE, M / BLOCK_SIZE, 1)
    # Dynamic shared memory for two BLOCK_SIZE x BLOCK_SIZE float tiles.
    sharedMem = S4 * 2 * BLOCK_SIZE * BLOCK_SIZE
    tt = t0 = time() - t0  # setup overhead; tt accumulates total time
    print "Overhead runtime API: %.3f sec\n" % t0
    t0 = time()
    cudaThreadSynchronize()
    for i in range(L):
        cudaConfigureCall(gridDim, blockDim, sharedMem, 0)
        gpuSGEMM(d_C, d_A, d_B, K, N)
    cudaThreadSynchronize()
    t0 = time() - t0
    tt += t0
    h_C = (c_float * sizeC)()
    cudaMemcpy(h_C, d_C, S4 * sizeC, cudaMemcpyDeviceToHost)
    cudaThreadSynchronize()
    cudaFree(d_A)
    cudaFree(d_B)
    cudaFree(d_C)
    cudaThreadExit()
    print "Processing time: %.3g (%.3g) sec" % (t0, tt)
    print "Gigaflops GPU: %.2f (%.2f)" % (flops / t0, flops / tt)
    # CPU reference run.
    ref = (c_float * sizeC)()
    t1 = time()
    for i in range(L):
        sgemm(ref, h_A, h_B, M, N, K)
    t1 = time() - t1
    print "\nProcessing time: %.3g sec" % t1
    print "Gigaflops CPU : %.2f" % (flops / t1)
    print "Speedup GPU/CPU: %.2f" % (t1 / t0)
    err, mxe = checkError(ref, h_C)
    print "\nAvg and max rel error = %.2e %.2e" % (err, mxe)
def main(check=False,doComplex=False,dims=(128,)):
    # FFT round-trip benchmark (forward + inverse) via CUFFT through the
    # CUDA runtime API, optionally checked against a CPU FFT (`xf`).
    print "+------------------------+"
    print "| Fast Fourier Transform |"
    print "| using CUDA runtime API |"
    print "+------------------------+\n"
    dims = tuple(dims)
    ndim = len(dims)
    v = ("","NX = %d","NX = %d NY = %d","NX = %d NY = %d NZ = %d")
    # SC: total complex-domain element count; SR: padded real-domain count
    # (last axis padded to 2*(n/2+1) as required by real-to-complex FFTs).
    SC = reduce(lambda x,y:x*y,dims)
    SR = reduce(lambda x,y:x*y,dims[:-1],1)
    SR *= 2*(dims[-1]/2+1)
    print v[ndim] % dims
    print "< doComplex: %s >\n" % doComplex
    rz = 1./float(SC)  # normalization factor for the round trip
    # 5*N*log2(N) flop estimate, doubled for forward + inverse.
    flops = 2.*5.*SC*log(SC)/log(2.)*1.e-9
    if doComplex:
        SC *= 2  # interleaved real/imag floats
    S4 = sizeof(c_float)
    if doComplex:
        sz = S4*(SC+SC)/(1024*1024)  # total footprint in MiB
    else:
        sz = S4*(SC+SR)/(1024*1024)
    h_A = (c_float*SC)()
    g_A = (c_float*SC)()
    arrayInit(h_A)
    d_A = getMemory(h_A)
    # `allocate` selects the pre-allocated-output code path below.
    allocate = True
    if doComplex:
        d_B = getMemory(SC)
    elif allocate:
        d_B = getMemory(SR)
    if doComplex:
        plan = gf.makePlan(dims,CUFFT_C2C)
    else:
        plan1 = gf.makePlan(dims,CUFFT_R2C)
        plan2 = gf.makePlan(dims,CUFFT_C2R)
    t0 = time()
    x0 = ReadTimestampCounter()
    cudaThreadSynchronize()
    # Forward transform into d_B, inverse back into d_A.
    if doComplex:
        d_B = gf.ccfft(plan,d_A,None,d_B)
        d_A = gf.icfft(plan,d_B,None,d_A)
    else:
        if allocate:
            d_B = gf.rcfft(plan1,d_A,None,d_B)
            d_A = gf.crfft(plan2,d_B,None,d_A)
        else:
            # Alternate path: let the wrappers allocate, freeing d_A between.
            d_B = gf.rcfft(plan1,d_A,SR)
            cuMemFree(d_A)
            d_A = gf.crfft(plan2,d_B,SR)
    cudaThreadSynchronize()
    t0 = time()-t0
    x1 = ReadTimestampCounter()
    # Convert TSC ticks to microseconds assuming a 2.8 GHz clock --
    # NOTE(review): hardcoded frequency; adjust for other machines.
    fc = 1.e-3/2.8
    print "RDTSC: %.0f µs" % ((x1-x0)*fc)
    cudaMemcpy(g_A,d_A,S4*SC,cudaMemcpyDeviceToHost)
    cudaFree(d_A)
    cudaFree(d_B)
    if doComplex:
        cufftDestroy(plan)
    else:
        cufftDestroy(plan1)
        cufftDestroy(plan2)
    cudaThreadExit()
    # Undo the size scaling introduced by the unnormalized round trip.
    scale(g_A,rz)
    print "\nProcessing time: %.3g sec" % t0
    print "Gigaflops GPU : %.2f" % (flops/t0)
    gflops = (flops/t0,)
    print "\nError CPU initial vs GPU"
    err,mxe = checkError(h_A,g_A)
    stats = err,mxe
    print "Avg and max rel error = %.2e %.2e\n" % (err,mxe)
    if check:
        # Optional CPU round trip for cross-validation.
        t1 = time()
        if doComplex:
            h_B = xf.ccfft(h_A,dims)
            h_B = xf.icfft(h_B,dims)
        else:
            h_B = xf.rcfft(h_A,dims)
            h_B = xf.crfft(h_B,dims)
        t1 = time()-t1
        print "Processing time: %.3g sec" % t1
        print "Gigaflops CPU : %.2f" % (flops/t1)
        print "Speedup GPU/CPU: %.2f" % (t1/t0)
        print "\nError CPU final vs CPU initial"
        err,mxe = checkError(h_B,h_A)
        print "Avg and max rel error = %.2e %.2e" % (err,mxe)
        print "\nError CPU final vs GPU"
        err,mxe = checkError(h_B,g_A)
        print "Avg and max rel error = %.2e %.2e" % (err,mxe)
    # Machine-readable summary line (-1. marks "speedup not measured").
    f = (-1.,)
    if check:
        f = (t1/t0,)
    fmt = "\n## "+" ".join(len(dims)*["%3d"])+" : %.1f %.1f: %.2e %.2e"
    print fmt % (dims+gflops+f+stats)
# NOTE(review): fragment -- the tail of an FFT timing routine whose `def`
# line (and the definitions of r, dims, kr, sz, size, doComplex, xfft,
# fftw_start, wall_start) lies outside this view; confirm against the
# full source before relying on the surrounding context.
xr = float(.5) / float(kr)  # per-transform scale: round trip = 2 FFTs
if doComplex:
    text = "complex"
    rcfftx = xfft.ccfft
    crfftx = xfft.icfft
else:
    text = " real"
    rcfftx = xfft.rcfft
    crfftx = xfft.crfft
# Repeat forward + inverse transforms kr times.
for k in range(0, kr):
    c = rcfftx(r, dims)
    z = crfftx(c, dims)
fftw_end = time.clock()
wall_end = time.time()
dif = fftw_end - fftw_start  # CPU time
wif = wall_end - wall_start  # wall-clock time
print "\nfft elapsed real time : %8.3f seconds" % wif
print "%d-D %s-to-complex fft: %8.3f seconds" % (len(dims), text, dif * xr)
# 5*N*log2(N) flop estimate per transform, two transforms per iteration.
flops = 2. * 5.e-9 * log(size) * size * kr / log(2.)
print "Performance : %8.3f GFlops" % (flops / wif)
dif = dif * xr * sz
print "%d-D %s-to-complex fft: %8.3f µs/point\n" % (len(dims), text, dif)
rz = 1. / size
# Compare the round-tripped data against the original input.
err, mxe = checkError(r, z)
print "avg and max error : %8.1e %8.1e" % (err, mxe)
def main(device, vlength=128, loops=1):
    # Texture-based cos/sin benchmark via the CUDA driver API; prints timing,
    # throughput, and several internal and CPU-vs-GPU accuracy checks.
    print "+-----------------------+"
    print "| Simple TRIG Test |"
    print "| using CUDA driver API |"
    print "+-----------------------+"
    # NOTE(review): log2n is not defined locally -- presumably module-level.
    print "params: %2d %5dK %3d\n" % (log2n, vlength >> 10, loops),
    n2 = vlength  ## Vector length
    # TRIGTex is about 1.5x faster than TRIG
    # name = "TRIG"
    name = "TRIGTex"
    TRIG = device.functions[name]
    mod0 = device.modules[0]
    sizeV = S4 * n2
    h_Arg = (c_float * n2)()
    h_Cos = (c_float * n2)()
    h_Sin = (c_float * n2)()
    vectorInit(h_Arg)
    d_Arg = getMemory(h_Arg)
    d_Cos = getMemory(n2)
    d_Sin = getMemory(n2)
    # Bind the device argument buffer to the module's "Arg" texture reference.
    tex = devMemToTex(mod0, "Arg", d_Arg, sizeV)
    cuFuncSetBlockShape(TRIG, BLOCK_SIZE, 1, 1)
    cuParamSeti(TRIG, 0, d_Cos)
    cuParamSeti(TRIG, 4, d_Sin)
    if name != "TRIGTex":
        # Plain kernel takes its input as a pointer parameter.
        cuParamSeti(TRIG, 8, d_Arg)
        cuParamSeti(TRIG, 12, n2)
        cuParamSetSize(TRIG, 16)
    else:
        # Texture kernel reads the input through the bound texture instead.
        cuParamSetTexRef(TRIG, CU_PARAM_TR_DEFAULT, tex)
        cuParamSeti(TRIG, 8, n2)
        cuParamSetSize(TRIG, 12)
    cuCtxSynchronize()
    t0 = time()
    for i in range(loops):
        cuLaunchGrid(TRIG, GRID_SIZE, 1)
    cuCtxSynchronize()
    t0 = time() - t0
    g_Cos = (c_float * n2)()
    g_Sin = (c_float * n2)()
    cuMemcpyDtoH(g_Cos, d_Cos, sizeV)
    cuMemcpyDtoH(g_Sin, d_Sin, sizeV)
    cuCtxSynchronize()
    cuMemFree(d_Arg)
    cuMemFree(d_Cos)
    cuMemFree(d_Sin)
    # CPU reference run.
    t1 = time()
    for i in range(loops):
        cpuTRIG(h_Cos, h_Sin, h_Arg)
    t1 = time() - t1
    flopsg = (2.e-6 * n2) * float(loops)
    flopsc = flopsg
    # Convert both timings from seconds to milliseconds for the report.
    t0 *= 1.e3
    t1 *= 1.e3
    print "\n time[msec] GFlops\n"
    print "GPU: %12.1f%10.2f" % (t0, flopsg / t0)
    print "CPU: %12.1f%10.2f" % (t1, flopsc / t1)
    print " %12.1f" % (t1 / t0)
    # Errors are also reported scaled by 2^23 -- presumably units of
    # single-precision ulps; TODO confirm.
    x = float(1 << 23)
    e, m = checkTrig(g_Cos, g_Sin)
    print "\n", name, "internal check GPU"
    print "%8.1e %8.1e" % (e, m)
    print "%8.1f %8.1f" % (e * x, m * x)
    e, m = checkTrig(h_Cos, h_Sin)
    print "\n", name, "internal check CPU"
    print "%8.1e %8.1e" % (e, m)
    print "%8.1f %8.1f" % (e * x, m * x)
    print "\n", "check between CPU and GPU"
    err, mxe = checkError(h_Cos, g_Cos)
    print "Avg and max abs error (cos) = %8.1e %8.1e" % (err, mxe)
    print " %8.1f %8.1f" % (err * x, mxe * x)
    err, mxe = checkError(h_Sin, g_Sin)
    print "Avg and max abs error (sin) = %8.1e %8.1e" % (err, mxe)
    print " %8.1f %8.1f" % (err * x, mxe * x)