def test_against_fft_2d_mgpu(self):
    from pyculib.fft.binding import Plan, CUFFT_R2C
    rank = 64
    rowsize = 64
    N = rank * rowsize
    x = np.arange(N, dtype=np.float32)
    # Pack consecutive real samples into the real/imag parts of a complex64
    # buffer with halfZ entries per row; the multi-GPU in-place R2C plan
    # reads its real input from this padded complex storage.
    halfZ = rowsize // 2 + 1
    xh = np.zeros(rank * halfZ, dtype=np.complex64)
    for j in range(rank):
        for i in range(halfZ - 1):
            ii = j * rowsize + 2 * i
            r = x[ii]
            imag = x[ii + 1] if ii + 1 < N else 0
            xh[j * halfZ + i] = complex(r, imag)
        xh[j * halfZ + halfZ - 1] = 0
    x = x.reshape(rank, rowsize)
    xh = xh.reshape(rank, halfZ)
    xf = np.fft.fft2(x)

    plan = Plan.many([rank, rowsize], CUFFT_R2C, 1, 2)
    d_x_gpu = plan.to_device(xh)
    xf_gpu = np.zeros(shape=(rank, halfZ), dtype=np.complex64)
    plan.forward(d_x_gpu, d_x_gpu)  # in-place transform
    d_x_gpu.copy_to_host(xf_gpu)
    self.assertTrue(np.allclose(xf[:, 0:halfZ], xf_gpu, atol=1e-6))

def test_against_fft_1d_mgpu(self):
    # Disabled: returns immediately without exercising the multi-GPU 1D path.
    return True
    from pyculib.fft.binding import Plan, CUFFT_R2C
    N = 32
    x = np.arange(N, dtype=np.float32)
    # Pack consecutive real samples into complex64 storage (halfZ entries),
    # as the multi-GPU in-place R2C plan expects.
    halfZ = N // 2 + 1
    xh = np.zeros(halfZ, dtype=np.complex64)
    for i in range(halfZ - 1):
        r = x[2 * i]
        imag = x[2 * i + 1] if 2 * i + 1 < N else 0
        xh[i] = complex(r, imag)
    xh[halfZ - 1] = 0
    xf = np.fft.fft(x)

    plan = Plan.many([N], CUFFT_R2C, 1, 2)
    d_x_gpu = plan.to_device(xh)
    xf_gpu = np.zeros(halfZ, dtype=np.complex64)
    plan.forward(d_x_gpu, d_x_gpu)  # in-place transform
    d_x_gpu.copy_to_host(xf_gpu)
    self.assertTrue(np.allclose(xf[0:halfZ], xf_gpu, atol=1e-6))

def test_against_fft_1d(self):
    from pyculib.fft.binding import Plan, CUFFT_R2C
    N = 128
    x = np.arange(N, dtype=np.float32)
    xf = np.fft.fft(x)
    d_x_gpu = cuda.to_device(x)
    xf_gpu = np.zeros(N // 2 + 1, np.complex64)
    d_xf_gpu = cuda.to_device(xf_gpu)
    plan = Plan.many(x.shape, CUFFT_R2C)
    plan.forward(d_x_gpu, d_xf_gpu)
    d_xf_gpu.copy_to_host(xf_gpu)
    self.assertTrue(np.allclose(xf[0:N // 2 + 1], xf_gpu, atol=1e-6))

def test_against_fft_2d(self):
    from pyculib.fft.binding import Plan, CUFFT_R2C
    rank = 2
    rowsize = 128
    N = rowsize * rank
    x = np.arange(N, dtype=np.float32).reshape(rank, rowsize)
    xf = np.fft.fft2(x)
    d_x_gpu = cuda.to_device(x)
    xf_gpu = np.zeros(shape=(rank, rowsize // 2 + 1), dtype=np.complex64)
    d_xf_gpu = cuda.to_device(xf_gpu)
    plan = Plan.many(x.shape, CUFFT_R2C)
    plan.forward(d_x_gpu, d_xf_gpu)
    d_xf_gpu.copy_to_host(xf_gpu)
    self.assertTrue(np.allclose(xf[:, 0:rowsize // 2 + 1], xf_gpu, atol=1e-6))

def test_against_fft_3d(self):
    from pyculib.fft.binding import Plan, CUFFT_R2C
    depth = 2
    colsize = 2
    rowsize = 64
    N = depth * colsize * rowsize
    x = np.arange(N, dtype=np.float32).reshape(depth, colsize, rowsize)
    xf = np.fft.fftn(x)
    halfZ = rowsize // 2 + 1
    plan = Plan.many(x.shape, CUFFT_R2C)
    d_x_gpu = plan.to_device(x)
    xf_gpu = np.zeros(shape=(depth, colsize, halfZ), dtype=np.complex64)
    d_xf_gpu = plan.to_device(xf_gpu)
    plan.forward(d_x_gpu, d_xf_gpu)
    d_xf_gpu.copy_to_host(xf_gpu)
    self.assertTrue(np.allclose(xf[:, :, 0:halfZ], xf_gpu, atol=1e-6))

def test_against_fft_3d_mgpu(self):
    from pyculib.fft.binding import Plan, CUFFT_R2C
    depth = 32
    colsize = 32
    rowsize = 32
    N = depth * colsize * rowsize
    x = np.arange(N, dtype=np.float32)
    # Pack consecutive real samples into complex64 storage with halfZ entries
    # per row, as the multi-GPU in-place R2C plan expects.
    halfZ = rowsize // 2 + 1
    xh = np.zeros(depth * colsize * halfZ, dtype=np.complex64)
    for k in range(depth):
        for j in range(colsize):
            for i in range(halfZ - 1):
                ii = k * colsize * rowsize + j * rowsize + 2 * i
                r = x[ii]
                imag = x[ii + 1] if ii + 1 < N else 0
                xh[k * colsize * halfZ + j * halfZ + i] = complex(r, imag)
            xh[k * colsize * halfZ + j * halfZ + halfZ - 1] = 0
    x = x.reshape(depth, colsize, rowsize)
    xh = xh.reshape(depth, colsize, halfZ)
    xf = np.fft.fftn(x)

    plan = Plan.many([depth, colsize, rowsize], CUFFT_R2C, 1, 2)
    d_x_gpu = plan.to_device(xh)
    xf_gpu = np.zeros(shape=(depth, colsize, halfZ), dtype=np.complex64)
    plan.forward(d_x_gpu, d_x_gpu)  # in-place transform
    d_x_gpu.copy_to_host(xf_gpu)
    self.assertTrue(np.allclose(xf[:, :, 0:halfZ], xf_gpu, atol=1e-6))

# Benchmark fragment (incomplete in this excerpt): sets up and times repeated
# in-place C2C transforms on a single GPU. The commented-out lines are
# alternative workloads (second device, streams, cuBLAS dot) tried while
# profiling.
# f = fft.FFTPlan(img_shape, np.complex64, np.complex64, 1, 0, fft.FFTPlan.MODE_FFTW_PADDING)
from pyculib.fft.binding import Plan, CUFFT_C2C
from pyculib import blas as cublas

n = (128 * 10) ** 2
data1 = np.arange(n, dtype=np.complex64).reshape(2, n // 2)
data = np.arange(n, dtype=np.complex64)
orig = data.copy()
d_data = cuda.to_device(data)
# s0 = cuda.stream()
# cuda.select_device(1)
# d_data1 = cuda.to_device(data)
# s1 = cuda.stream()
# fftplan = Plan.one(CUFFT_C2C, *data.shape)
fftplan1 = Plan.many(data.shape, CUFFT_C2C, 1500)
b = cublas.Blas()
rounds = 10000

start = time.perf_counter()  # time.clock() was removed in Python 3.8
for x in range(rounds):
    # The single-GPU forward/inverse pair is the timed workload; the
    # multi-device variants below were left commented out.
    # fft.fft_inplace(img)
    # cuda.select_device(0)
    fftplan1.forward(d_data, d_data)
    fftplan1.inverse(d_data, d_data)
    # cuda.select_device(1)
    # fftplan1.forward(d_data1, d_data1)
    # fftplan.inverse(d_data, d_data)
    # d_data = cuda.to_device(data)
    # cublas.dot(d_data, d_data)