def task1(cufft, d_image_complex, d_response_complex):
    """Apply an FFT-based convolution to ``d_image_complex`` in place.

    Pseudocode of the pipeline::

        freq_image = fft(image)
        freq_resp  = fft(response)
        freq_out   = freq_image * freq_resp
        output     = ifft(freq_out)

    Both arrays are overwritten by their forward transforms; the elementwise
    complex product (computed by ``vmult``) and the inverse transform are
    then written back into ``d_image_complex``.

    Args:
        cufft: object providing ``fft_inplace(ary)`` and ``ifft_inplace(ary)``.
        d_image_complex: complex image array; holds the filtered result on
            return.
        d_response_complex: complex filter response; expected to have the
            same length as ``d_image_complex``. Left holding its forward
            transform.

    Returns:
        None. The result is delivered through ``d_image_complex``.

    NOTE(review): the pipeline in ``main`` divides by the number of elements
    after the inverse transform, which suggests ``ifft_inplace`` does not
    normalise its output -- confirm before relying on the raw values.
    """
    # Forward-transform the image first, then the response.
    for d_ary in (d_image_complex, d_response_complex):
        cufft.fft_inplace(d_ary)

    # Multiply in the frequency domain; ``out`` selects the destination array.
    vmult(d_image_complex, d_response_complex, out=d_image_complex)

    # Back to the spatial domain: the filter is now applied to the image.
    cufft.ifft_inplace(d_image_complex)
def main():
    """Run the same 5x5 filter on the CPU and on the GPU and plot both.

    The image is read from the path in ``sys.argv[1]`` when one is given,
    otherwise a built-in sample image is used. The CPU result comes from
    ``fftconvolve``; the GPU result comes from the in-place cuFFT pipeline in
    ``task1``. Timings for both are printed and the two results are shown
    side by side.

    NOTE(review): ``main`` is defined a second time later in this file; that
    later definition replaces this one when the module is loaded.
    """
    # Build Filter
    laplacian_pts = '''
    -4 -1 0 -1 -4
    -1 2 3 2 -1
    0 3 4 3 0
    -1 2 3 2 -1
    -4 -1 0 -1 -4
    '''.split()
    laplacian = np.array(laplacian_pts, dtype=np.float32).reshape(5, 5)

    # Build Image
    # Only the argv lookup is guarded, so an IndexError raised while loading
    # a user-supplied file is reported instead of silently falling back to
    # the sample image.
    try:
        filename = sys.argv[1]
    except IndexError:
        image = misc.lena().astype(np.float32)
    else:
        image = ndimage.imread(filename, flatten=True).astype(np.float32)

    print("Image size: %s" % (image.shape, ))

    # Zero-pad the filter to the image size (filter sits in the top-left
    # corner) so both arrays have the same shape for the FFT product.
    response = np.zeros_like(image)
    response[:5, :5] = laplacian

    # CPU
    ts = timer()
    cvimage_cpu = fftconvolve(image, laplacian, mode='same')
    te = timer()
    print('CPU: %.2fs' % (te - ts))

    # GPU
    # The launch configuration is only reported here; it is not passed to
    # any call in this function.
    threadperblock = 32, 8
    blockpergrid = best_grid_size(tuple(reversed(image.shape)), threadperblock)
    print('kernel config: %s x %s' % (blockpergrid, threadperblock))

    # Trigger initialization of the cuFFT system.
    # This takes significant time for small datasets, so it is done before
    # the timer starts to keep it out of the measurement.
    cufft.FFTPlan(shape=image.shape, itype=np.complex64, otype=np.complex64)

    # Start GPU timer
    ts = timer()
    image_complex = image.astype(np.complex64)
    response_complex = response.astype(np.complex64)

    d_image_complex = cuda.to_device(image_complex)
    d_response_complex = cuda.to_device(response_complex)

    # fft(image), fft(response), multiply, ifft -- all in place on the
    # image buffer. Reuses task1 rather than repeating the same four calls.
    task1(cufft, d_image_complex, d_response_complex)

    # Scale by the number of elements after the inverse transform and keep
    # only the real part.
    cvimage_gpu = d_image_complex.copy_to_host().real / np.prod(image.shape)

    te = timer()
    print('GPU: %.2fs' % (te - ts))

    # Plot the results
    plt.subplot(1, 2, 1)
    plt.title('CPU')
    plt.imshow(cvimage_cpu, cmap=plt.cm.gray)
    plt.axis('off')

    plt.subplot(1, 2, 2)
    plt.title('GPU')
    plt.imshow(cvimage_gpu, cmap=plt.cm.gray)
    plt.axis('off')

    plt.show()
def main():
    """Run the same 5x5 filter on the CPU and on the GPU and plot both.

    The image is read from the path in ``sys.argv[1]`` when one is given,
    otherwise a built-in sample image is used. The CPU result comes from
    ``fftconvolve``; the GPU result comes from the in-place cuFFT pipeline in
    ``task1``. Timings for both are printed and the two results are shown
    side by side.

    NOTE(review): an earlier definition of ``main`` exists in this file; this
    later definition is the one that stays bound to the name. Consider
    removing the duplicate.
    """
    # Build Filter
    laplacian_pts = '''
    -4 -1 0 -1 -4
    -1 2 3 2 -1
    0 3 4 3 0
    -1 2 3 2 -1
    -4 -1 0 -1 -4
    '''.split()
    laplacian = np.array(laplacian_pts, dtype=np.float32).reshape(5, 5)

    # Build Image
    # Only the argv lookup is guarded, so an IndexError raised while loading
    # a user-supplied file is reported instead of silently falling back to
    # the sample image.
    try:
        filename = sys.argv[1]
    except IndexError:
        image = misc.lena().astype(np.float32)
    else:
        image = ndimage.imread(filename, flatten=True).astype(np.float32)

    print("Image size: %s" % (image.shape,))

    # Zero-pad the filter to the image size (filter sits in the top-left
    # corner) so both arrays have the same shape for the FFT product.
    response = np.zeros_like(image)
    response[:5, :5] = laplacian

    # CPU
    ts = timer()
    cvimage_cpu = fftconvolve(image, laplacian, mode='same')
    te = timer()
    print('CPU: %.2fs' % (te - ts))

    # GPU
    # The launch configuration is only reported here; it is not passed to
    # any call in this function.
    threadperblock = 32, 8
    blockpergrid = best_grid_size(tuple(reversed(image.shape)), threadperblock)
    print('kernel config: %s x %s' % (blockpergrid, threadperblock))

    # Trigger initialization of the cuFFT system.
    # This takes significant time for small datasets, so it is done before
    # the timer starts to keep it out of the measurement.
    cufft.FFTPlan(shape=image.shape, itype=np.complex64, otype=np.complex64)

    # Start GPU timer
    ts = timer()
    image_complex = image.astype(np.complex64)
    response_complex = response.astype(np.complex64)

    d_image_complex = cuda.to_device(image_complex)
    d_response_complex = cuda.to_device(response_complex)

    # fft(image), fft(response), multiply, ifft -- all in place on the
    # image buffer. Reuses task1 rather than repeating the same four calls.
    task1(cufft, d_image_complex, d_response_complex)

    # Scale by the number of elements after the inverse transform and keep
    # only the real part.
    cvimage_gpu = d_image_complex.copy_to_host().real / np.prod(image.shape)

    te = timer()
    print('GPU: %.2fs' % (te - ts))

    # Plot the results
    plt.subplot(1, 2, 1)
    plt.title('CPU')
    plt.imshow(cvimage_cpu, cmap=plt.cm.gray)
    plt.axis('off')

    plt.subplot(1, 2, 2)
    plt.title('GPU')
    plt.imshow(cvimage_gpu, cmap=plt.cm.gray)
    plt.axis('off')

    plt.show()