def main(vlength = 128,loops = 1):
    """Time gpuBLSC against cpuBLSC on random inputs and print both rates.

    Runtime-API variant: the kernel is launched through cudaConfigureCall
    followed by a direct call to gpuBLSC.

    Parameters:
        vlength -- number of float elements in every input/output vector.
        loops   -- how many times the GPU launch and the CPU routine are
                   each repeated inside their timed sections.

    Output:
        One line "%10d%10.2f%10.2f" with vlength, the CPU rate and the GPU
        rate, where rate = (2e-6 * vlength * loops) / elapsed seconds.
        A rate is printed as inf when the timer did not advance.
        When the module-level checkErrorFlag is true, the average relative
        error reported by checkError is printed for the call and put
        results.

    Device buffers are always released (and cudaThreadExit is always
    called), even when allocation, launch or copy-back raises.
    """
    n2 = vlength ## Vector length

    # Host vectors: S, X, T are inputs; C and P receive the CPU results.
    h_S = (c_float*n2)()
    h_X = (c_float*n2)()
    h_T = (c_float*n2)()
    h_C = (c_float*n2)()
    h_P = (c_float*n2)()

    # presumably fills each buffer with random values in [low, high] --
    # verify against randInit
    randInit(h_S,5.,30.)
    randInit(h_X,1.,100.)
    randInit(h_T,.25,10.)
    R,V = .03,.3

    device_buffers = []
    try:
        # presumably getMemory allocates device memory and uploads the host
        # buffer -- verify against its definition. Buffers are collected one
        # by one so that a failure half-way still frees the earlier ones.
        for host_buffer in (h_S,h_X,h_T,h_C,h_P):
            device_buffers.append(getMemory(host_buffer))
        d_S,d_X,d_T,d_C,d_P = device_buffers

        blockDim = dim3(BLOCK_SIZE,1,1)
        gridDim = dim3(GRID_SIZE,1,1)

        # Synchronise before starting the clock and again once every launch
        # has been issued, so the interval covers all GPU work.
        cudaThreadSynchronize()
        t0 = time()
        for i in range(loops):
            cudaConfigureCall(gridDim,blockDim,0,0)
            gpuBLSC(d_C,d_P,d_S,d_X,d_T,R,V,n2)
        cudaThreadSynchronize()
        t0 = time()-t0

        flops = (2.e-6*n2)*float(loops)

        # Fetch the GPU results into separate host buffers for comparison.
        g_C = (c_float*n2)()
        g_P = (c_float*n2)()
        cudaMemcpy(g_C,d_C,S4*n2,cudaMemcpyDeviceToHost)
        cudaMemcpy(g_P,d_P,S4*n2,cudaMemcpyDeviceToHost)
        cudaThreadSynchronize()
    finally:
        for device_buffer in device_buffers:
            cudaFree(device_buffer)
        cudaThreadExit()

    # CPU reference run, timed over the same number of repetitions.
    t1 = time()
    for i in range(loops):
        cpuBLSC(h_C,h_P,h_S,h_X,h_T,R,V,n2)
    t1 = time()-t1

    # A coarse clock can report a zero interval for short runs; report an
    # unmeasurable rate as inf instead of raising ZeroDivisionError.
    unmeasurable = float("inf")
    if t1 > 0:
        cpu_rate = flops/t1
    else:
        cpu_rate = unmeasurable
    if t0 > 0:
        gpu_rate = flops/t0
    else:
        gpu_rate = unmeasurable

    # Single-argument parenthesised print: same output with the print
    # statement and with the print function.
    print("%10d%10.2f%10.2f" % (vlength,cpu_rate,gpu_rate))

    if checkErrorFlag:
        err,mxe = checkError(h_C,g_C)
        print("Avg rel error (call) = %.2e" % (err,))
        err,mxe = checkError(h_P,g_P)
        print("Avg rel error (put)  = %.2e" % (err,))
def main(device,vlength = 128,loops = 1):
    """Time gpuBLSC against cpuBLSC on random inputs and print both rates.

    Driver-API variant: the kernel handle is taken from
    device.functions["gpuBLSC"], its arguments are set through
    cuParamSeti/cuParamSetf and it is started with cuLaunchGrid.

    Parameters:
        device  -- object whose `functions` mapping holds the "gpuBLSC"
                   kernel handle.
        vlength -- number of float elements in every input/output vector.
        loops   -- how many times the GPU launch and the CPU routine are
                   each repeated inside their timed sections.

    Output:
        One line "%10d%10.2f%10.2f" with vlength, the CPU rate and the GPU
        rate, where rate = (2e-6 * vlength * loops) / elapsed seconds.
        A rate is printed as inf when the timer did not advance.
        When the module-level checkErrorFlag is true, the average relative
        error reported by checkError is printed for the call and put
        results.

    Device buffers are always released, even when allocation, parameter
    setup, launch or copy-back raises.
    """
    n2 = vlength ## Vector length

    gpuBLSC = device.functions["gpuBLSC"]

    # Host vectors: S, X, T are inputs; C and P receive the CPU results.
    h_S = (c_float*n2)()
    h_X = (c_float*n2)()
    h_T = (c_float*n2)()
    h_C = (c_float*n2)()
    h_P = (c_float*n2)()

    # presumably fills each buffer with random values in [low, high] --
    # verify against randInit
    randInit(h_S,5.,30.)
    randInit(h_X,1.,100.)
    randInit(h_T,.25,10.)
    R,V = .03,.3

    device_buffers = []
    try:
        # presumably getMemory allocates device memory and uploads the host
        # buffer -- verify against its definition. Buffers are collected one
        # by one so that a failure half-way still frees the earlier ones.
        for host_buffer in (h_S,h_X,h_T,h_C,h_P):
            device_buffers.append(getMemory(host_buffer))
        d_S,d_X,d_T,d_C,d_P = device_buffers

        cuFuncSetBlockShape(gpuBLSC,BLOCK_SIZE,1,1)

        # Kernel arguments in declaration order, one 4-byte slot each
        # (offsets 0, 4, ..., 28; total size 32).
        # NOTE(review): a 4-byte slot per pointer presumes 4-byte device
        # pointers -- confirm for the targeted platform.
        kernel_args = (
            (cuParamSeti,d_C),
            (cuParamSeti,d_P),
            (cuParamSeti,d_S),
            (cuParamSeti,d_X),
            (cuParamSeti,d_T),
            (cuParamSetf,R),
            (cuParamSetf,V),
            (cuParamSeti,n2),
        )
        offset = 0
        for set_param,value in kernel_args:
            set_param(gpuBLSC,offset,value)
            offset += 4
        cuParamSetSize(gpuBLSC,offset)

        # Synchronise before starting the clock and again once every launch
        # has been issued, so the interval covers all GPU work.
        cuCtxSynchronize()
        t0 = time()
        for i in range(loops):
            cuLaunchGrid(gpuBLSC,GRID_SIZE,1)
        cuCtxSynchronize()
        t0 = time()-t0

        flops = (2.e-6*n2)*float(loops)

        # Fetch the GPU results into separate host buffers for comparison.
        g_C = (c_float*n2)()
        g_P = (c_float*n2)()
        cuMemcpyDtoH(g_C,d_C,n2*S4)
        cuMemcpyDtoH(g_P,d_P,n2*S4)
        cuCtxSynchronize()
    finally:
        for device_buffer in device_buffers:
            cuMemFree(device_buffer)

    # CPU reference run, timed over the same number of repetitions.
    t1 = time()
    for i in range(loops):
        cpuBLSC(h_C,h_P,h_S,h_X,h_T,R,V,n2)
    t1 = time()-t1

    # A coarse clock can report a zero interval for short runs; report an
    # unmeasurable rate as inf instead of raising ZeroDivisionError.
    unmeasurable = float("inf")
    if t1 > 0:
        cpu_rate = flops/t1
    else:
        cpu_rate = unmeasurable
    if t0 > 0:
        gpu_rate = flops/t0
    else:
        gpu_rate = unmeasurable

    # Single-argument parenthesised print: same output with the print
    # statement and with the print function.
    print("%10d%10.2f%10.2f" % (vlength,cpu_rate,gpu_rate))

    if checkErrorFlag:
        err,mxe = checkError(h_C,g_C)
        print("Avg rel error (call) = %.2e" % (err,))
        err,mxe = checkError(h_P,g_P)
        print("Avg rel error (put)  = %.2e" % (err,))