def check_performance(thr_and_double, shape_and_axes): thr, double = thr_and_double dtype = numpy.complex128 if double else numpy.complex64 dtype = dtypes.normalize_type(dtype) shape, axes = shape_and_axes data = numpy.arange(product(shape)).reshape(shape).astype(dtype) shift = FFTShift(data, axes=axes) shiftc = shift.compile(thr) data_dev = thr.to_device(data) res_dev = thr.empty_like(data) attempts = 10 times = [] for i in range(attempts): t1 = time.time() shiftc(res_dev, data_dev) thr.synchronize() times.append(time.time() - t1) res_ref = numpy.fft.fftshift(data, axes=axes) assert diff_is_negligible(res_dev.get(), res_ref) return min(times), product(shape) * dtype.itemsize
def createNormalisationKernel(thread, shape): footprint = thread.array(shape, dtype=numpy.complex) fftshift = FFTShift(footprint) norm = norm_const(footprint, 2) fftshift.parameter.output.connect(norm, norm.input, output_prime=norm.output) normalise = fftshift.compile(thread) return normalise
def check_errors(thr, shape_and_axes, inverse=False): dtype = numpy.int32 shape, axes = shape_and_axes data = numpy.arange(product(shape)).reshape(shape).astype(dtype) shift = FFTShift(data, axes=axes) shiftc = shift.compile(thr) ref_func = numpy.fft.ifftshift if inverse else numpy.fft.fftshift data_dev = thr.to_device(data) shiftc(data_dev, data_dev, inverse) res_ref = ref_func(data, axes=axes) assert diff_is_negligible(data_dev.get(), res_ref)
def check_errors(thr, shape_and_axes): dtype = numpy.int32 shape, axes = shape_and_axes data = numpy.arange(product(shape)).reshape(shape).astype(dtype) shift = FFTShift(data, axes=axes) shiftc = shift.compile(thr) #print(shiftc._kernel_calls[0]._kernel._program.source) #print(shiftc._kernel_calls[0]._kernel.global_size) data_dev = thr.to_device(data) shiftc(data_dev, data_dev) res_ref = numpy.fft.fftshift(data, axes=axes) #print(data) #print(res_ref) #print(data_dev.get()) assert diff_is_negligible(data_dev.get(), res_ref)
def test_trivial(some_thr): """ Checks that even if the axes set is trivial (product of lengths == 1), the transformations are still attached and executed. """ dtype = numpy.complex64 shape = (128, 1, 1, 128) axes = (1, 2) param = 4 data = get_test_array(shape, dtype) data_dev = some_thr.to_device(data) res_dev = some_thr.empty_like(data_dev) shift = FFTShift(data, axes=axes) scale = mul_param(data_dev, numpy.int32) shift.parameter.input.connect(scale, scale.output, input_prime=scale.input, param=scale.param) shiftc = shift.compile(some_thr) shiftc(res_dev, data_dev, param) assert diff_is_negligible(res_dev.get(), data * param)
def kspaceepanechnikov_filter_CL2(ksp, sigma): sz = ksp.shape dtype = np.complex64 ftype = np.float32 clear_first_arg_caches() fsiz = (5, 5, 5) print(np.ceil(sigma[0]) + 2, np.ceil(sigma[1]) + 2, np.ceil(sigma[2]) + 2) print sigma fsiz = (np.ceil(sigma) + 2).astype(int) for i in xrange(0, fsiz.size): if not fsiz[i] & 0x1: fsiz[i] += 1 # Create image-domain Epanechikov kernel Kepa = epanechnikov_kernel(fsiz, sigma) # Place kernel at centre of ksp-sized matrix Kfilter = np.zeros(np.array(sz), dtype=np.complex64) szmin = np.floor( np.array(sz) / 2.0 - np.floor(np.array(Kepa.shape) / 2.0) - 1) szmax = np.floor(szmin + np.array(Kepa.shape)) print "Epa filter size ", sz, " image filter ", Kepa.shape, " szmin ", szmin, " szmax ", szmax Kfilter[szmin[0]:szmax[0], szmin[1]:szmax[1], szmin[2]:szmax[2]] = Kepa Kfilter[szmin[0]:szmax[0], szmin[1]:szmax[1], szmin[2]:szmax[2]].imag = Kepa # Create fourier-domain Epanechnikov filter api = any_api() thr = api.Thread.create() data_dev = thr.to_device(Kfilter) rfft = FFT(data_dev) crfft = rfft.compile(thr) fftshift = FFTShift(data_dev) cfftshift = fftshift.compile(thr) crfft(data_dev, data_dev) thr.synchronize() cfftshift(data_dev, data_dev) Fepanechnikov = np.abs(data_dev.get()) # / np.prod(np.array(ksp.shape)) #result2 = result2[::-1,::-1,::-1] thr.synchronize() #result = np.zeros(np.array(siz), dtype=np.complex64) #result.real = np.abs(result2) / np.sqrt(2) #result.imag = np.abs(result2) / np.sqrt(2) del data_dev, rfft, crfft, fftshift, cfftshift # Multiply Epanechnikov filter to real and imag ksp data program = thr.compile(""" KERNEL void multiply_them( GLOBAL_MEM ${ctype} *dest, GLOBAL_MEM ${ctype} *a, GLOBAL_MEM ${ftype} *f) { const SIZE_T i = get_local_id(0); dest[i].x = a[i].x * f[i]; dest[i].y = a[i].y * f[i]; }""", render_kwds=dict(ctype=dtypes.ctype(dtype), ftype=dtypes.ctype(ftype))) data_dev = thr.to_device(ksp) filter_dev = thr.to_device(Fepanechnikov) multiply_them = program.multiply_them multiply_them(data_dev, data_dev, filter_dev, global_size=512 * 512 * 512) thr.synchronize() del filter_dev, program #api = cluda.ocl_api() #api = any_api() #thr = api.Thread.create() # Filter # data_dev = thr.to_device(ksp) # ifft = FFT(data_dev) FACTOR = 1.0 # Recon # thr.synchronize() #data_dev = thr.to_device(ksp) ifft = FFT(data_dev) cifft = ifft.compile(thr) fftshiftobj = FFTShift(data_dev) cfftshift = fftshiftobj.compile(thr) cifft(data_dev, data_dev, inverse=0) thr.synchronize() cfftshift(data_dev, data_dev) thr.synchronize() result2 = data_dev.get() / np.prod(np.array(ksp.shape)) result2 = result2[::-1, ::-1, ::-1] thr.release() return result2
def fourierepanechnikov(siz, sigma): """ Epanechnikov kernel in Fourier domain is A.(1-|x|^2) => (3/2*w^3)(sin(w) - w*cos(w)/2) """ # (uu, vv, ww) = fouriercoords(siz) # uu = uu + np.spacing(1) # vv = vv + np.spacing(1) # ww = ww + np.spacing(1) # if not hasattr(sigma, "__len__"): # #if type(sigma) is float or type(sigma) is numpy.float64: # return ((3.0*sigma/16.0)/(np.pi*(uu + vv + # ww)/(sigma))**3)*(np.sin(2*np.pi*(uu + vv + ww)/(sigma)) - np.pi*(uu # + vv + ww)/(sigma)*np.cos(2*np.pi*(uu + vv + ww)/(sigma))/2) # else: # return ((3.0/16.0)/(np.pi*((uu**3)/sigma[0]**4 + (vv**3)/sigma[1]**4 + # (ww**3)/sigma[2]**4)))*(np.sin(2*np.pi*(uu/sigma[0] + vv/sigma[1] + # ww/sigma[2])) - np.pi*(uu/sigma[0] + vv/sigma[1] + # ww/sigma[2])*np.cos(2*np.pi*(uu/sigma[0] + vv/sigma[1] + ww/sigma[2]))) def is_odd(num): return num & 0x1 from cplxfilter import epanechnikov_kernel if not hasattr(sigma, "__len__"): Kepa = epanechnikov_kernel( (np.ceil(sigma) + 1, np.ceil(sigma) + 1, np.ceil(sigma) + 1), sigma) else: print( np.ceil(sigma[0]) + 2, np.ceil(sigma[1]) + 2, np.ceil(sigma[2]) + 2) print sigma fsiz = (np.ceil(sigma) + 2).astype(int) for i in xrange(0, fsiz.size): if is_odd(fsiz[i]): fsiz[i] += 1 Kepa = epanechnikov_kernel( (np.ceil(sigma[0]) + 2, np.ceil(sigma[1]) + 2, np.ceil(sigma[2]) + 2), sigma) Kfilter = np.zeros(np.array(siz), dtype=np.complex64) szmin = np.floor( np.array(siz) / 2.0 - np.floor(np.array(Kepa.shape) / 2.0) - 1) szmax = np.floor(szmin + np.array(Kepa.shape)) print "Epa filter size ", siz, " image filter ", Kepa.shape, " szmin ", szmin, " szmax ", szmax Kfilter[szmin[0]:szmax[0], szmin[1]:szmax[1], szmin[2]:szmax[2]] = Kepa Kfilter[szmin[0]:szmax[0], szmin[1]:szmax[1], szmin[2]:szmax[2]].imag = Kepa # return np.abs(fftshift(clfftn(Kfilter))) api = any_api() thr = api.Thread.create() data_dev = thr.to_device(Kfilter) fft = FFT(data_dev) cfft = fft.compile(thr) fftshift = FFTShift(data_dev) cfftshift = fftshift.compile(thr) cfft(data_dev, data_dev) thr.synchronize() cfftshift(data_dev, data_dev) thr.synchronize() result2 = data_dev.get() # / np.prod(np.array(ksp.shape)) #result2 = result2[::-1,::-1,::-1] thr.release() result = np.zeros(np.array(siz), dtype=np.complex64) result.real = np.abs(result2) / np.sqrt(2) result.imag = np.abs(result2) / np.sqrt(2) return result
def kspacegaussian_filter_CL2(ksp, sigma): """ Kspace gaussian filter and recon using GPU OpenCL 1. GPU intialisation 2. push KSP complex matrix to GPU 3. declare FFT program 4. declare Complex Gaussian GPU filter program 5. Execute Gaussian GPU program 6. GPU sync 7. Execute FFT Recon 8. Execute FFTshift 9. Retrieve reconstruced complex image from GPU 10. Reorganise image to standard (mimic numpy format) """ sz = ksp.shape dtype = np.complex64 ftype = np.float32 ultype = np.uint64 #api = cluda.ocl_api() api = any_api() thr = api.Thread.create() data_dev = thr.to_device(ksp) ifft = FFT(data_dev) FACTOR = 1.0 program = thr.compile(""" KERNEL void gauss_kernel( GLOBAL_MEM ${ctype} *dest, GLOBAL_MEM ${ctype} *src) { const ulong x = get_global_id(0); const SIZE_T dim1= %d; const SIZE_T dim2= %d; const SIZE_T dim3= %d; ${ftype} sigma[3]; sigma[0]=%f;sigma[1]=%f;sigma[2]=%f; ${ftype} factor = %f; const double TWOPISQ = 19.739208802178716; //6.283185307179586; //2*3.141592; const ${ftype} SQRT2PI = 2.5066282746; const double CUBEDSQRT2PI = 15.749609945722419; const ulong idx = x; ${ftype} i = (${ftype})((x / dim3) / dim2); i = (i - (${ftype})floor((${ftype})(dim1)/2.0f))/(${ftype})(dim1); ${ftype} j = (${ftype})(x / dim3); if((SIZE_T)j > dim2) {j=(${ftype})fmod(j, (${ftype})dim2);}; j = (j - (${ftype})floor((${ftype})(dim2)/2.0f))/(${ftype})(dim2); // Account for large global index (stored as ulong) before performing modulus double pre_k=fmod((double)(x), (double)dim3); ${ftype} k = (${ftype}) pre_k; k = (k - (${ftype})floor((${ftype})(dim3)/2.0f))/(${ftype})(dim3); ${ftype} weight = exp(-TWOPISQ*((i*i)*sigma[0]*sigma[0] + (j*j)*sigma[1]*sigma[1] + (k*k)*sigma[2]*sigma[2])); // ${ftype} weight = expm1(-TWOPISQ*((i*i)*sigma[0]*sigma[0] + (j*j)*sigma[1]*sigma[1] + (k*k)*sigma[2]*sigma[2]))+1; // ${ftype} weight= ${exp}(-TWOPISQ*((i*i)*sigma[0]*sigma[0] + (j*j)*sigma[1]*sigma[1] + (k*k)*sigma[2]*sigma[2])); dest[idx].x = src[idx].x * weight; dest[idx].y = src[idx].y * weight; } """ % (sz[0], sz[1], sz[2], sigma[0], sigma[1], sigma[2], FACTOR), render_kwds=dict(ctype=dtypes.ctype(dtype), ftype=dtypes.ctype(ftype), exp=functions.exp(ftype)), fast_math=True) gauss_kernel = program.gauss_kernel #data_dev = thr.empty_like(ksp_dev) gauss_kernel(data_dev, data_dev, global_size=sz[0] * sz[1] * sz[2]) thr.synchronize() # Recon #data_dev = thr.to_device(ksp) ifftobj = FFT(data_dev) cifft = ifftobj.compile(thr) fftshiftobj = FFTShift(data_dev) cfftshift = fftshiftobj.compile(thr) cifft(data_dev, data_dev, inverse=0) thr.synchronize() cfftshift(data_dev, data_dev) thr.synchronize() result2 = data_dev.get() / np.prod(np.array(ksp.shape)) result2 = result2[::-1, ::-1, ::-1] thr.release() return result2
thr.synchronize() toc() result = np.fft.fftshift(data_dev.get() / N**3) result = result[::-1, ::-1, ::-1] result = np.roll(np.roll(np.roll(result, 1, axis=2), 1, axis=1), 1, axis=0) print "Reikna IFFT time and first three results:" print "%s sec, %s" % (toc(), str(np.abs(result[:3, 0, 0]))) thr.release() del ifft, cifft, data_dev, thr thr = api.Thread.create() tic() data_dev = thr.to_device(ksp) ifft = FFT(data_dev) cifft = ifft.compile(thr) fftshiftobj = FFTShift(data_dev) cfftshift = fftshiftobj.compile(thr) cifft(data_dev, data_dev, inverse=0) thr.synchronize() toc() cfftshift(data_dev, data_dev) thr.synchronize() result2 = data_dev.get() / N**3 result2 = result2[::-1, ::-1, ::-1] #result = np.roll(np.roll(np.roll(result,1,axis=2),1,axis=1),1,axis=0) print "Reikna IFFT time and first three results:" print "%s sec, %s" % (toc(), str(np.abs(result2[:3, 0, 0]))) thr.release() del ifft, cifft, data_dev, fftshiftobj, cfftshift del thr, api
def run_test(thr, shape, dtype, axes=None): data = numpy.random.normal(size=shape).astype(dtype) fft = FFT(data, axes=axes) fftc = fft.compile(thr) shift = FFTShift(data, axes=axes) shiftc = shift.compile(thr) # FFT + shift as two separate computations data_dev = thr.to_device(data) t_start = time.time() fftc(data_dev, data_dev) thr.synchronize() t_gpu_fft = time.time() - t_start t_start = time.time() shiftc(data_dev, data_dev) thr.synchronize() t_gpu_shift = time.time() - t_start data_dev = thr.to_device(data) t_start = time.time() fftc(data_dev, data_dev) shiftc(data_dev, data_dev) thr.synchronize() t_gpu_separate = time.time() - t_start data_gpu = data_dev.get() # FFT + shift as a computation with a transformation data_dev = thr.to_device(data) # a separate output array to avoid unsafety of the shift transformation res_dev = thr.empty_like(data_dev) shift_tr = fftshift(data, axes=axes) fft2 = fft.parameter.output.connect(shift_tr, shift_tr.input, new_output=shift_tr.output) fft2c = fft2.compile(thr) t_start = time.time() fft2c(res_dev, data_dev) thr.synchronize() t_gpu_combined = time.time() - t_start # Reference calculation with numpy t_start = time.time() numpy.fft.fftn(data, axes=axes) t_cpu_fft = time.time() - t_start t_start = time.time() numpy.fft.fftshift(data, axes=axes) t_cpu_shift = time.time() - t_start t_start = time.time() data_ref = numpy.fft.fftn(data, axes=axes) data_ref = numpy.fft.fftshift(data_ref, axes=axes) t_cpu_all = time.time() - t_start data_gpu2 = res_dev.get() # Checking that the results are correct # (note: this will require relaxing the tolerances # if complex64 is used instead of complex128) assert numpy.allclose(data_ref, data_gpu) assert numpy.allclose(data_ref, data_gpu2) return dict( t_gpu_fft=t_gpu_fft, t_gpu_shift=t_gpu_shift, t_gpu_separate=t_gpu_separate, t_gpu_combined=t_gpu_combined, t_cpu_fft=t_cpu_fft, t_cpu_shift=t_cpu_shift, t_cpu_all=t_cpu_all)
def run_test(thr, shape, dtype, axes=None): data = numpy.random.normal(size=shape).astype(dtype) fft = FFT(data, axes=axes) fftc = fft.compile(thr) shift = FFTShift(data, axes=axes) shiftc = shift.compile(thr) # FFT + shift as two separate computations data_dev = thr.to_device(data) t_start = time.time() fftc(data_dev, data_dev) thr.synchronize() t_gpu_fft = time.time() - t_start t_start = time.time() shiftc(data_dev, data_dev) thr.synchronize() t_gpu_shift = time.time() - t_start data_dev = thr.to_device(data) t_start = time.time() fftc(data_dev, data_dev) shiftc(data_dev, data_dev) thr.synchronize() t_gpu_separate = time.time() - t_start data_gpu = data_dev.get() # FFT + shift as a computation with a transformation data_dev = thr.to_device(data) # a separate output array to avoid unsafety of the shift transformation res_dev = thr.empty_like(data_dev) shift_tr = fftshift(data, axes=axes) fft2 = fft.parameter.output.connect(shift_tr, shift_tr.input, new_output=shift_tr.output) fft2c = fft2.compile(thr) t_start = time.time() fft2c(res_dev, data_dev) thr.synchronize() t_gpu_combined = time.time() - t_start # Reference calculation with numpy t_start = time.time() numpy.fft.fftn(data, axes=axes) t_cpu_fft = time.time() - t_start t_start = time.time() numpy.fft.fftshift(data, axes=axes) t_cpu_shift = time.time() - t_start t_start = time.time() data_ref = numpy.fft.fftn(data, axes=axes) data_ref = numpy.fft.fftshift(data_ref, axes=axes) t_cpu_all = time.time() - t_start data_gpu2 = res_dev.get() # Checking that the results are correct # (note: this will require relaxing the tolerances # if complex64 is used instead of complex128) assert numpy.allclose(data_ref, data_gpu) assert numpy.allclose(data_ref, data_gpu2) return dict(t_gpu_fft=t_gpu_fft, t_gpu_shift=t_gpu_shift, t_gpu_separate=t_gpu_separate, t_gpu_combined=t_gpu_combined, t_cpu_fft=t_cpu_fft, t_cpu_shift=t_cpu_shift, t_cpu_all=t_cpu_all)
def kspacegaussian_filter_CL2(ksp, sigma): sz = ksp.shape dtype = np.complex64 ftype = np.float32 #api = cluda.ocl_api() api = cuda_api() thr = api.Thread.create() data_dev = thr.to_device(ksp) ifft = FFT(data_dev) FACTOR = 1.0 program = thr.compile(""" KERNEL void gauss_kernel( GLOBAL_MEM ${ctype} *dest, GLOBAL_MEM ${ctype} *src) { const ulong x = get_global_id(0); const SIZE_T dim1= %d; const SIZE_T dim2= %d; const SIZE_T dim3= %d; ${ftype} sigma[3]; sigma[0]=%f;sigma[1]=%f;sigma[2]=%f; ${ftype} factor = %f; const double TWOPISQ = 19.739208802178716; //6.283185307179586; //2*3.141592; // const ${ftype} SQRT2PI = 2.5066282746; // const double CUBEDSQRT2PI = 15.749609945722419; const ulong idx = x; ${ftype} i = (${ftype})((x / dim3) / dim2); i = (i - (${ftype})floor((${ftype})(dim1)/2.0))/(${ftype})(dim1); ${ftype} j = (${ftype})(x / dim3); if((SIZE_T)j > dim2) {j=(${ftype})fmod(j, (${ftype})dim2);}; j = (j - (${ftype})floor((${ftype})(dim2)/2.0f))/(${ftype})(dim2); //Account for large global index (stored as ulong) before performing modulus double pre_k=fmod((double)(x) , (double) dim3); ${ftype} k = (${ftype}) pre_k; k = (k - (${ftype})floor((${ftype})(dim3)/2.0f))/(${ftype})(dim3); ${ftype} weight = exp(-TWOPISQ*((i*i)*sigma[0]*sigma[0] + (j*j)*sigma[1]*sigma[1] + (k*k)*sigma[2]*sigma[2])); //${ftype} weight = expm1(-TWOPISQ*((i*i)*sigma[0]*sigma[0] + (j*j)*sigma[1]*sigma[1] + (k*k)*sigma[2]*sigma[2]))+1; //${ftype} weight= ${exp}(-TWOPISQ*((i*i)*sigma[0]*sigma[0] + (j*j)*sigma[1]*sigma[1] + (k*k)*sigma[2]*sigma[2])); dest[idx].x = src[idx].x * weight * factor; dest[idx].y = src[idx].y * weight * factor; } """ % (sz[0], sz[1], sz[2], sigma[0], sigma[1], sigma[2], FACTOR), render_kwds=dict(ctype=dtypes.ctype(dtype), ftype=dtypes.ctype(ftype), exp=functions.exp(ftype)), fast_math=True) gauss_kernel = program.gauss_kernel #data_dev = thr.empty_like(ksp_dev) gauss_kernel(data_dev, data_dev, global_size=(sz[0], sz[1], sz[2])) thr.synchronize() #data_dev = thr.to_device(ksp) ifft = FFT(data_dev) cifft = ifft.compile(thr) fftshift = FFTShift(data_dev) cfftshift = fftshift.compile(thr) cifft(data_dev, data_dev, inverse=0) thr.synchronize() cfftshift(data_dev, data_dev) thr.synchronize() result2 = data_dev.get() / np.prod(np.array(ksp.shape)) result2 = result2[::-1, ::-1, ::-1] thr.release() return result2