def __initialize_gpu(self): try: import reikna.cluda as cluda from reikna.fft import FFT # dtype = dtype#numpy.complex64 data = numpy.zeros( self.st['Kd'],dtype=numpy.complex64) # data2 = numpy.empty_like(data) api = cluda.ocl_api() self.thr = api.Thread.create(async=True) self.data_dev = self.thr.to_device(data) # self.data_rec = self.thr.to_device(data2) axes=range(0,numpy.size(self.st['Kd'])) myfft= FFT( data, axes=axes) self.myfft = myfft.compile(self.thr,fast_math=True) self.gpu_flag=1 print('create gpu fft?',self.gpu_flag) print('line 642') W= self.st['w'][...,0] print('line 645') self.W = numpy.reshape(W, self.st['Kd'],order='C') print('line 647') # self.thr2 = api.Thread.create() print('line 649') self.W_dev = self.thr.to_device(self.W.astype(dtype)) self.gpu_flag=1 print('line 652') except: self.gpu_flag=0 print('get error, using cpu')
def __initialize_gpu(self): try: import reikna.cluda as cluda from reikna.fft import FFT # dtype = dtype#numpy.complex64 data = numpy.zeros(self.st['Kd'], dtype=numpy.complex64) # data2 = numpy.empty_like(data) api = cluda.ocl_api() self.thr = api.Thread.create(async=True) self.data_dev = self.thr.to_device(data) # self.data_rec = self.thr.to_device(data2) axes = range(0, numpy.size(self.st['Kd'])) myfft = FFT(data, axes=axes) self.myfft = myfft.compile(self.thr, fast_math=True) self.gpu_flag = 1 print('create gpu fft?', self.gpu_flag) print('line 642') W = self.st['w'][..., 0] print('line 645') self.W = numpy.reshape(W, self.st['Kd'], order='C') print('line 647') # self.thr2 = api.Thread.create() print('line 649') self.W_dev = self.thr.to_device(self.W.astype(dtype)) self.gpu_flag = 1 print('line 652') except: self.gpu_flag = 0 print('get error, using cpu')
def check_performance(thr_and_double, shape_and_axes, fast_math): thr, double = thr_and_double shape, axes = shape_and_axes dtype = numpy.complex128 if double else numpy.complex64 data = get_test_array(shape, dtype) data_dev = thr.to_device(data) res_dev = thr.empty_like(data_dev) fft = FFT(data_dev, axes=axes) fftc = fft.compile(thr, fast_math=fast_math) attempts = 10 t1 = time.time() for i in range(attempts): fftc(res_dev, data_dev) thr.synchronize() t2 = time.time() dev_time = (t2 - t1) / attempts fwd_ref = numpy.fft.fftn(data, axes=axes).astype(dtype) assert diff_is_negligible(res_dev.get(), fwd_ref) return dev_time, product(shape) * sum([numpy.log2(shape[a]) for a in axes]) * 5
def _fft_2(data, inverse=False, queue=None, block=True): """Execute FFT on *data*, which is first converted to a pyopencl array and retyped to complex. """ if not queue: queue = cfg.OPENCL.queue thread = ocl_api().Thread(queue) data = g_util.get_array(data, queue=queue) if data.dtype != cfg.PRECISION.np_cplx: data = data.astype(cfg.PRECISION.np_cplx) if queue not in cfg.OPENCL.fft_plans: cfg.OPENCL.fft_plans[queue] = {} if data.shape not in cfg.OPENCL.fft_plans[queue]: LOG.debug("Creating FFT Plan for {} and shape {}".format(queue, data.shape)) _fft = FFT(data, axes=(0, 1)) cfg.OPENCL.fft_plans[queue][data.shape] = _fft.compile(thread, fast_math=False) plan = cfg.OPENCL.fft_plans[queue][data.shape] LOG.debug("fft_2, shape: %s, inverse: %s", data.shape, inverse) # plan.execute(data.data, inverse=inverse, wait_for_finish=block) plan(data, data, inverse=inverse) if block: thread.synchronize() return data
def createComplexFFTKernel(thread, shape): scaling = numpy.sqrt(shape[-2] * shape[-1]) footprint = thread.array(shape, dtype=numpy.complex128) fft = FFT(footprint) div = div_const(footprint, scaling) fft.parameter.output.connect(div, div.input, output_prime=div.output) return fft.compile(thread)
def get_fftc(self, arr): self._initialize() shape = arr.shape if shape in self._fftc: return self._fftc[shape] fft = FFT(self._thr.array(shape, np.complex64)) fftc = fft.compile(self._thr) self._fftc[shape] = fftc return fftc
def __init__(self, diffs, coords, mask, probe, sample, sample_support, pmod_int = False): """Initialise the Ptychography module with the data in 'inputDir' Naming convention: coords_100x2.raw list of y, x coordinates in np.float64 pixel units diffs_322x256x512.raw 322 (256,512) diffraction patterns in np.float64 The zero pixel must be at [0, 0] and there must be an equal no. of postive and negative frequencies mask_256x512.raw (optional) mask for the diffraction data np.float64 probeInit_256x512 (optional) Initial estimate for the probe np.complex128 sampleInit_1024x2048 (optional) initial estimate for the sample np.complex128 also sets the field of view If not present then initialise with random numbers """ # # Get the shape shape = diffs[0].shape # # Store these values self.exits = makeExits(sample, probe, coords) # # This will save time later self.diffAmps = bg.quadshift(np.sqrt(diffs)) self.shape = shape self.shape_sample = sample.shape self.coords = coords self.mask = bg.quadshift(mask) self.probe = probe self.sample = sample self.alpha_div = 1.0e-10 self.error_mod = [] self.error_sup = [] self.error_conv = [] self.probe_sum = None self.sample_sum = None self.diffNorm = np.sum(self.mask * (self.diffAmps)**2) self.pmod_int = pmod_int self.sample_support = sample_support # # create a gpu thread api = cluda.cuda_api() self.thr = api.Thread.create() # # send the diffraction amplitudes, the exit waves and the mask to the gpu self.diffAmps_gpu = self.thr.to_device(self.diffAmps) * np.sqrt(float(self.diffAmps.shape[1]) * float(self.diffAmps.shape[2])) self.exits_gpu = self.thr.to_device(self.exits) mask2 = np.zeros_like(diffs, dtype=np.complex128) mask2[:] = self.mask.astype(np.complex128) self.mask_gpu = self.thr.to_device(mask2) # # compile the fft routine fft = FFT(self.diffAmps_gpu.astype(np.complex128), axes=(1,2)) self.fftc = fft.compile(self.thr, fast_math=True)
def process(hdr_fiename, filename): api = cluda.cuda_api() thr = api.Thread.create() X = thr.array((10, 32768 * 2), dtype=numpy.complex128) iq_data = TCAPData(filename, hdr_fiename) file_counter = int(iq_data.filename_wo_ext[-3:]) fs = 312500 file_length_in_sec = 15625 * 32768 / fs time_passed_upto_now = (file_counter - 1) * file_length_in_sec # extract hour min sec hr, placeholder = divmod(time_passed_upto_now, 3600) mnt, sec = divmod(placeholder, 60) total_time = '{}h-{}m-{}s'.format(int(hr), int(mnt), int(sec)) title = 'Time: {}:{}:{}'.format(int(hr), int(mnt), int(sec)) zz = np.array([]) for j in range(1, 780 * 2 * 10 + 1, 2 * 10): data = np.array([]) # read 2*10 i.e. 20 blocks for i in range(j, j + 2 * 10): data = np.append(data, iq_data.read_block(i)) data = np.reshape(data, (10, 32768 * 2)) x = thr.to_device(data) fft = FFT(x, axes=(1, )) fftc = fft.compile(thr) fftc(X, x, 0) data_fft = X #data_fft = np.fft.fft(data, axis=1) data_fft = np.average(data_fft, axis=0) data_fft = np.abs(np.fft.fftshift(data_fft)) zz = np.append(zz, data_fft) zz = np.reshape(zz, (780, 32768 * 2)) data_fft_freqs = np.fft.fftshift(np.fft.fftfreq(32768 * 2, d=1 / fs)) # in Hz xx, yy = np.meshgrid(data_fft_freqs, np.arange(780)) yy = yy * 2.10 # in seconds plt_filename = '{}_{}'.format(iq_data.filename_wo_ext, total_time) print('Printing into file: ' + plt_filename) plot_spectrogram(xx, yy, zz, dbm=False, cmap=cm.jet, filename=plt_filename, dpi=500, title=title)
def initialize_gpu(self): try: import reikna.cluda as cluda from reikna.fft import FFT data = numpy.zeros( self.st['Kd'],dtype=dtype) print('get_platform') api = cluda.ocl_api() print('api=',api== cluda.cuda_api()) self.gpu_api = 'opencl' self.thr = api.Thread.create(async=True) print('line 630') self.data_dev = self.thr.to_device(data) axes=range(0,numpy.size(self.st['Kd'])) print('line 635') myfft= FFT( data, axes=axes) print('line 640') self.myfft = myfft.compile(self.thr,fast_math=True) print('line 640') self.gpu_flag=1 print('create gpu fft?',self.gpu_flag) print('line 642')# self.data_rec = self.thr.to_device(data2) W= self.st['w'][...,0] print('line 645') self.W = numpy.reshape(W, self.st['Kd'],order='C') print('line 647') # self.thr2 = api.Thread.create() print('line 649') self.W_dev = self.thr.to_device(self.W.astype(dtype)) self.W2_dev = self.thr.to_device(self.W.astype(dtype)) self.tmp_dev = self.thr.to_device(self.W.astype(dtype)) # device memory # self.tmp2_dev = self.thr.to_device(1.0/self.W.astype(dtype)) # device memory self.gpu_flag=1 # if self.debug > 0: print('line 652') except: self.gpu_flag=0 # if self.debug > 0: print('get error, using cpu')
def reikna_fft(a, inverse=False): ''' Get the FFT to calculate the FFT of an array, keeping the compiled source in a cache. ''' global FFT_CACHE # Compile the FFT cf = FFT_CACHE.get(a.shape, None) if cf is None: f = FFT(a) cf = f.compile(THREAD) FFT_CACHE[a.shape] = cf # Calculate the value output = get_array_cache(types.cpu_complex).get_array(len(a)) cf(output, a, inverse=inverse) return output
def rfft(self, a, nthreads=ncpu): a = self.check_array(a, RTYPES, RTYPE) if SCIK and self.is_gpu_memory_enough(a): shape = [s for s in a.shape] shape[-1] = shape[-1]//2 + 1 dtype = G_RTYPES[a.dtype.type] func = fft.fft af = self._fft_scik(a, func, shape, dtype) elif REIK and self.is_gpu_memory_enough(a): thr = self.api.Thread(self.dev) plan = FFT(Type(complex_for(a.dtype), a.shape)) # combines two real-valued inputs into a complex-valued input of the same shape cc = combine_complex(plan.parameter.input) # supplies a constant output bc = broadcast_const(cc.imag, 0) plan.parameter.input.connect(cc, cc.output, real_input=cc.real, imag_input=cc.imag) plan.parameter.imag_input.connect(bc, bc.output) fftc = plan.compile(thr, fast_math=True) a_dev = thr.to_device(a) a_out_dev = thr.empty_like(plan.parameter.output) fftc(a_out_dev, a_dev) af = a_out_dev.get() af = N.fft.fftshift(af) elif FFTW: func = pyfftw.builders.rfftn af = self._fftw(a, func, nthreads) else: af = N.fft.rfftn(a) return af
def test_trivial(some_thr): """ Checks that even if the FFT is trivial (problem size == 1), the transformations are still attached and executed. """ dtype = numpy.complex64 shape = (128, 1, 1, 128) axes = (1, 2) param = 4 data = get_test_array(shape, dtype) data_dev = some_thr.to_device(data) res_dev = some_thr.empty_like(data_dev) fft = FFT(data_dev, axes=axes) scale = mul_param(data_dev, numpy.int32) fft.parameter.input.connect(scale, scale.output, input_prime=scale.input, param=scale.param) fftc = fft.compile(some_thr) fftc(res_dev, data_dev, param) assert diff_is_negligible(res_dev.get(), data * param)
def __init__(self, nx, ny): shapeX = [ny, nx] shapeK = [ny, nx] self.shapeX = shapeX self.arrayK = np.empty(shapeK, dtype=self.type_complex) # Pick the first available GPGPU API and make a Thread on it. api = any_api() # api = cuda_api() # api = ocl_api() dev = api.get_platforms()[0].get_devices() self.thr = api.Thread.create(dev) fft = FFT(self.arrayK, axes=(0, 1)) scale = mul_param(self.arrayK, np.float) fft.parameter.input.connect(scale, scale.output, input_prime=scale.input, param=scale.param) self.fftplan = fft.compile(self.thr, fast_math=True) self.coef_norm = nx * ny
def check_errors(thr, shape_and_axes): dtype = numpy.complex64 shape, axes = shape_and_axes data = get_test_array(shape, dtype) fft = FFT(data, axes=axes) fftc = fft.compile(thr) # forward transform # Testing inplace transformation, because if this works, # then the out of place one will surely work too. data_dev = thr.to_device(data) fftc(data_dev, data_dev) fwd_ref = numpy.fft.fftn(data, axes=axes).astype(dtype) assert diff_is_negligible(data_dev.get(), fwd_ref) # inverse transform data_dev = thr.to_device(data) fftc(data_dev, data_dev, inverse=True) inv_ref = numpy.fft.ifftn(data, axes=axes).astype(dtype) assert diff_is_negligible(data_dev.get(), inv_ref)
Parameter('output', Annotation(Type(dtype, size), 'o')), Parameter('input', Annotation(Type(dtype, size), 'i')) ], ''' ${input.ctype} val = ${input.load_same}; ${output.store_same}(${polar_unit}(atan2(val.y, val.x))); ''', render_kwds=dict(polar_unit=functions.polar_unit(dtype=np.float32 if single else np.double)) ) return unimod unimod = unimod_gen(size) ffts = FFT(thr.array(size, dtype=np.complex64)) ffts.parameter.output.connect(unimod, unimod.input, uni=unimod.output) ffts_unimod = ffts.compile(thr) x = np.arange(size, dtype=np.complex64) x = thr.to_device(x) X = thr.array((size,), dtype=np.complex64) ffts_unimod(X, x) print(X) unimod = unimod_gen(size, single=False) fftd = FFT(thr.array(size, dtype=np.complex128)) fftd.parameter.output.connect(unimod, unimod.input, uni=unimod.output) fftd_unimod = fftd.compile(thr) x = np.arange(size, dtype=np.complex128) x = thr.to_device(x) X = thr.array((size,), dtype=np.complex128)
def create(thr, size, dtype=np.complex128, axes=None, compile_=True): fft = FFT(thr.array(size, dtype=dtype), axes) if compile_: fft = fft.compile(thr) return fft
def test_fft(in_data_np): atol = 1e-4 if in_data_np.dtype == Types.cfloat else 1e-8 import numpy as np in_data_cl = to_device(in_data_np) fft_cl = Fft(in_data_cl, emulate=False) # zero padding data for numpy axis = 1 N = in_data_np.shape[axis] if not np.log2(N).is_integer(): # if not power of 2, pad accordingly N = 2**int(np.log2(N) + 1) in_data_np_power_of_two = np.zeros((in_data_np.shape[0], N), in_data_np.dtype) in_data_np_power_of_two[:, :in_data_np.shape[axis]] = in_data_np def measure(call): attempts = 3 ts = [] for i in range(attempts): t1 = time.time() call() t2 = time.time() ts.append(t2 - t1) return min(ts) # import pyfftw # t_fftw = measure(lambda: pyfftw.interfaces.numpy_fft.fft(in_data_np_power_of_two, axis=-1)) t_np = measure(lambda: np.fft.fft(in_data_np_power_of_two, axis=-1)) fft_in_data_np = np.fft.fft(in_data_np_power_of_two, axis=-1) def fft_call(): fft_in_data_cl = fft_cl() fft_in_data_cl.queue.finish() t_cl = measure(fft_call) fft_in_data_cl = fft_cl() if in_data_np.size < 1024: # Test against emulation (commented since it is slower) use_existing_file_for_emulation(False) fft_cl_py = Fft(in_data_cl, emulate=True) fft_in_data_cl_py = fft_cl_py() a = fft_in_data_cl_py.get().view(Types.cdouble) b = fft_in_data_cl.get().view(Types.cdouble) c = fft_in_data_np.view(Types.cdouble) assert np.allclose(a, b) assert np.allclose(c, b) assert np.allclose(c, a) # import matplotlib.pyplot as plt # plt.plot(fft_in_data_np.flatten()) # plt.plot(fft_in_data_cl_emulation.get().flatten()) # plt.show() assert np.allclose(fft_in_data_np, fft_in_data_cl.get(), atol=atol) # benchmark using reikna if False: # change to true to run against reikna's fft. Note: Reikna takes quite some optimization time before run from reikna.cluda import any_api from reikna.fft import FFT import numpy api = any_api() thr = api.Thread.create() data = in_data_np dtype = data.dtype axes = (1, ) fft = FFT(data, axes=axes) fftc = fft.compile(thr) data_dev = thr.to_device(data) res_dev = thr.empty_like(data_dev) ts = [] for i in range(attempts): t1 = time.time() fftc(res_dev, data_dev) thr.synchronize() t2 = time.time() ts.append(t2 - t1) fwd_ref = numpy.fft.fftn(data, axes=axes).astype(dtype) tnp = time.time() fwd_ref = numpy.fft.fftn(data, axes=axes).astype(dtype) tnp = time.time() - tnp # numpy.fft.fftn(data[:, :, 0], axes=(1,)) treikna_min = min(ts) assert np.allclose(fft_in_data_np, res_dev.get())
Parameter('input', Annotation(Type(dtype, size), 'i')) ], ''' ${input.ctype} val = ${input.load_same}; ${output.store_same}(${polar_unit}(atan2(val.y, val.x))); ''', render_kwds=dict(polar_unit=functions.polar_unit( dtype=np.float32 if single else np.double))) return unimod unimod = unimod_gen(size) ffts = FFT(thr.array(size, dtype=np.complex64)) ffts.parameter.output.connect(unimod, unimod.input, uni=unimod.output) ffts_unimod = ffts.compile(thr) x = np.arange(size, dtype=np.complex64) x = thr.to_device(x) X = thr.array((size, ), dtype=np.complex64) ffts_unimod(X, x) print(X) unimod = unimod_gen(size, single=False) fftd = FFT(thr.array(size, dtype=np.complex128)) fftd.parameter.output.connect(unimod, unimod.input, uni=unimod.output) fftd_unimod = fftd.compile(thr) x = np.arange(size, dtype=np.complex128) x = thr.to_device(x) X = thr.array((size, ), dtype=np.complex128) fftd_unimod(X, x)
def run_test(thr, shape, dtype, axes=None): data = numpy.random.normal(size=shape).astype(dtype) fft = FFT(data, axes=axes) fftc = fft.compile(thr) shift = FFTShift(data, axes=axes) shiftc = shift.compile(thr) # FFT + shift as two separate computations data_dev = thr.to_device(data) t_start = time.time() fftc(data_dev, data_dev) thr.synchronize() t_gpu_fft = time.time() - t_start t_start = time.time() shiftc(data_dev, data_dev) thr.synchronize() t_gpu_shift = time.time() - t_start data_dev = thr.to_device(data) t_start = time.time() fftc(data_dev, data_dev) shiftc(data_dev, data_dev) thr.synchronize() t_gpu_separate = time.time() - t_start data_gpu = data_dev.get() # FFT + shift as a computation with a transformation data_dev = thr.to_device(data) # a separate output array to avoid unsafety of the shift transformation res_dev = thr.empty_like(data_dev) shift_tr = fftshift(data, axes=axes) fft2 = fft.parameter.output.connect(shift_tr, shift_tr.input, new_output=shift_tr.output) fft2c = fft2.compile(thr) t_start = time.time() fft2c(res_dev, data_dev) thr.synchronize() t_gpu_combined = time.time() - t_start # Reference calculation with numpy t_start = time.time() numpy.fft.fftn(data, axes=axes) t_cpu_fft = time.time() - t_start t_start = time.time() numpy.fft.fftshift(data, axes=axes) t_cpu_shift = time.time() - t_start t_start = time.time() data_ref = numpy.fft.fftn(data, axes=axes) data_ref = numpy.fft.fftshift(data_ref, axes=axes) t_cpu_all = time.time() - t_start data_gpu2 = res_dev.get() # Checking that the results are correct # (note: this will require relaxing the tolerances # if complex64 is used instead of complex128) assert numpy.allclose(data_ref, data_gpu) assert numpy.allclose(data_ref, data_gpu2) return dict(t_gpu_fft=t_gpu_fft, t_gpu_shift=t_gpu_shift, t_gpu_separate=t_gpu_separate, t_gpu_combined=t_gpu_combined, t_cpu_fft=t_cpu_fft, t_cpu_shift=t_cpu_shift, t_cpu_all=t_cpu_all)
def kspaceepanechnikov_filter_CL2(ksp, sigma): sz = ksp.shape dtype = np.complex64 ftype = np.float32 clear_first_arg_caches() fsiz = (5, 5, 5) print(np.ceil(sigma[0]) + 2, np.ceil(sigma[1]) + 2, np.ceil(sigma[2]) + 2) print sigma fsiz = (np.ceil(sigma) + 2).astype(int) for i in xrange(0, fsiz.size): if not fsiz[i] & 0x1: fsiz[i] += 1 # Create image-domain Epanechikov kernel Kepa = epanechnikov_kernel(fsiz, sigma) # Place kernel at centre of ksp-sized matrix Kfilter = np.zeros(np.array(sz), dtype=np.complex64) szmin = np.floor( np.array(sz) / 2.0 - np.floor(np.array(Kepa.shape) / 2.0) - 1) szmax = np.floor(szmin + np.array(Kepa.shape)) print "Epa filter size ", sz, " image filter ", Kepa.shape, " szmin ", szmin, " szmax ", szmax Kfilter[szmin[0]:szmax[0], szmin[1]:szmax[1], szmin[2]:szmax[2]] = Kepa Kfilter[szmin[0]:szmax[0], szmin[1]:szmax[1], szmin[2]:szmax[2]].imag = Kepa # Create fourier-domain Epanechnikov filter api = any_api() thr = api.Thread.create() data_dev = thr.to_device(Kfilter) rfft = FFT(data_dev) crfft = rfft.compile(thr) fftshift = FFTShift(data_dev) cfftshift = fftshift.compile(thr) crfft(data_dev, data_dev) thr.synchronize() cfftshift(data_dev, data_dev) Fepanechnikov = np.abs(data_dev.get()) # / np.prod(np.array(ksp.shape)) #result2 = result2[::-1,::-1,::-1] thr.synchronize() #result = np.zeros(np.array(siz), dtype=np.complex64) #result.real = np.abs(result2) / np.sqrt(2) #result.imag = np.abs(result2) / np.sqrt(2) del data_dev, rfft, crfft, fftshift, cfftshift # Multiply Epanechnikov filter to real and imag ksp data program = thr.compile(""" KERNEL void multiply_them( GLOBAL_MEM ${ctype} *dest, GLOBAL_MEM ${ctype} *a, GLOBAL_MEM ${ftype} *f) { const SIZE_T i = get_local_id(0); dest[i].x = a[i].x * f[i]; dest[i].y = a[i].y * f[i]; }""", render_kwds=dict(ctype=dtypes.ctype(dtype), ftype=dtypes.ctype(ftype))) data_dev = thr.to_device(ksp) filter_dev = thr.to_device(Fepanechnikov) multiply_them = program.multiply_them multiply_them(data_dev, data_dev, filter_dev, global_size=512 * 512 * 512) thr.synchronize() del filter_dev, program #api = cluda.ocl_api() #api = any_api() #thr = api.Thread.create() # Filter # data_dev = thr.to_device(ksp) # ifft = FFT(data_dev) FACTOR = 1.0 # Recon # thr.synchronize() #data_dev = thr.to_device(ksp) ifft = FFT(data_dev) cifft = ifft.compile(thr) fftshiftobj = FFTShift(data_dev) cfftshift = fftshiftobj.compile(thr) cifft(data_dev, data_dev, inverse=0) thr.synchronize() cfftshift(data_dev, data_dev) thr.synchronize() result2 = data_dev.get() / np.prod(np.array(ksp.shape)) result2 = result2[::-1, ::-1, ::-1] thr.release() return result2
def fourierepanechnikov(siz, sigma): """ Epanechnikov kernel in Fourier domain is A.(1-|x|^2) => (3/2*w^3)(sin(w) - w*cos(w)/2) """ # (uu, vv, ww) = fouriercoords(siz) # uu = uu + np.spacing(1) # vv = vv + np.spacing(1) # ww = ww + np.spacing(1) # if not hasattr(sigma, "__len__"): # #if type(sigma) is float or type(sigma) is numpy.float64: # return ((3.0*sigma/16.0)/(np.pi*(uu + vv + # ww)/(sigma))**3)*(np.sin(2*np.pi*(uu + vv + ww)/(sigma)) - np.pi*(uu # + vv + ww)/(sigma)*np.cos(2*np.pi*(uu + vv + ww)/(sigma))/2) # else: # return ((3.0/16.0)/(np.pi*((uu**3)/sigma[0]**4 + (vv**3)/sigma[1]**4 + # (ww**3)/sigma[2]**4)))*(np.sin(2*np.pi*(uu/sigma[0] + vv/sigma[1] + # ww/sigma[2])) - np.pi*(uu/sigma[0] + vv/sigma[1] + # ww/sigma[2])*np.cos(2*np.pi*(uu/sigma[0] + vv/sigma[1] + ww/sigma[2]))) def is_odd(num): return num & 0x1 from cplxfilter import epanechnikov_kernel if not hasattr(sigma, "__len__"): Kepa = epanechnikov_kernel( (np.ceil(sigma) + 1, np.ceil(sigma) + 1, np.ceil(sigma) + 1), sigma) else: print( np.ceil(sigma[0]) + 2, np.ceil(sigma[1]) + 2, np.ceil(sigma[2]) + 2) print sigma fsiz = (np.ceil(sigma) + 2).astype(int) for i in xrange(0, fsiz.size): if is_odd(fsiz[i]): fsiz[i] += 1 Kepa = epanechnikov_kernel( (np.ceil(sigma[0]) + 2, np.ceil(sigma[1]) + 2, np.ceil(sigma[2]) + 2), sigma) Kfilter = np.zeros(np.array(siz), dtype=np.complex64) szmin = np.floor( np.array(siz) / 2.0 - np.floor(np.array(Kepa.shape) / 2.0) - 1) szmax = np.floor(szmin + np.array(Kepa.shape)) print "Epa filter size ", siz, " image filter ", Kepa.shape, " szmin ", szmin, " szmax ", szmax Kfilter[szmin[0]:szmax[0], szmin[1]:szmax[1], szmin[2]:szmax[2]] = Kepa Kfilter[szmin[0]:szmax[0], szmin[1]:szmax[1], szmin[2]:szmax[2]].imag = Kepa # return np.abs(fftshift(clfftn(Kfilter))) api = any_api() thr = api.Thread.create() data_dev = thr.to_device(Kfilter) fft = FFT(data_dev) cfft = fft.compile(thr) fftshift = FFTShift(data_dev) cfftshift = fftshift.compile(thr) cfft(data_dev, data_dev) thr.synchronize() cfftshift(data_dev, data_dev) thr.synchronize() result2 = data_dev.get() # / np.prod(np.array(ksp.shape)) #result2 = result2[::-1,::-1,::-1] thr.release() result = np.zeros(np.array(siz), dtype=np.complex64) result.real = np.abs(result2) / np.sqrt(2) result.imag = np.abs(result2) / np.sqrt(2) return result
def kspacegaussian_filter_CL2(ksp, sigma): """ Kspace gaussian filter and recon using GPU OpenCL 1. GPU intialisation 2. push KSP complex matrix to GPU 3. declare FFT program 4. declare Complex Gaussian GPU filter program 5. Execute Gaussian GPU program 6. GPU sync 7. Execute FFT Recon 8. Execute FFTshift 9. Retrieve reconstruced complex image from GPU 10. Reorganise image to standard (mimic numpy format) """ sz = ksp.shape dtype = np.complex64 ftype = np.float32 ultype = np.uint64 #api = cluda.ocl_api() api = any_api() thr = api.Thread.create() data_dev = thr.to_device(ksp) ifft = FFT(data_dev) FACTOR = 1.0 program = thr.compile(""" KERNEL void gauss_kernel( GLOBAL_MEM ${ctype} *dest, GLOBAL_MEM ${ctype} *src) { const ulong x = get_global_id(0); const SIZE_T dim1= %d; const SIZE_T dim2= %d; const SIZE_T dim3= %d; ${ftype} sigma[3]; sigma[0]=%f;sigma[1]=%f;sigma[2]=%f; ${ftype} factor = %f; const double TWOPISQ = 19.739208802178716; //6.283185307179586; //2*3.141592; const ${ftype} SQRT2PI = 2.5066282746; const double CUBEDSQRT2PI = 15.749609945722419; const ulong idx = x; ${ftype} i = (${ftype})((x / dim3) / dim2); i = (i - (${ftype})floor((${ftype})(dim1)/2.0f))/(${ftype})(dim1); ${ftype} j = (${ftype})(x / dim3); if((SIZE_T)j > dim2) {j=(${ftype})fmod(j, (${ftype})dim2);}; j = (j - (${ftype})floor((${ftype})(dim2)/2.0f))/(${ftype})(dim2); // Account for large global index (stored as ulong) before performing modulus double pre_k=fmod((double)(x), (double)dim3); ${ftype} k = (${ftype}) pre_k; k = (k - (${ftype})floor((${ftype})(dim3)/2.0f))/(${ftype})(dim3); ${ftype} weight = exp(-TWOPISQ*((i*i)*sigma[0]*sigma[0] + (j*j)*sigma[1]*sigma[1] + (k*k)*sigma[2]*sigma[2])); // ${ftype} weight = expm1(-TWOPISQ*((i*i)*sigma[0]*sigma[0] + (j*j)*sigma[1]*sigma[1] + (k*k)*sigma[2]*sigma[2]))+1; // ${ftype} weight= ${exp}(-TWOPISQ*((i*i)*sigma[0]*sigma[0] + (j*j)*sigma[1]*sigma[1] + (k*k)*sigma[2]*sigma[2])); dest[idx].x = src[idx].x * weight; dest[idx].y = src[idx].y * weight; } """ % (sz[0], sz[1], sz[2], sigma[0], sigma[1], sigma[2], FACTOR), render_kwds=dict(ctype=dtypes.ctype(dtype), ftype=dtypes.ctype(ftype), exp=functions.exp(ftype)), fast_math=True) gauss_kernel = program.gauss_kernel #data_dev = thr.empty_like(ksp_dev) gauss_kernel(data_dev, data_dev, global_size=sz[0] * sz[1] * sz[2]) thr.synchronize() # Recon #data_dev = thr.to_device(ksp) ifftobj = FFT(data_dev) cifft = ifftobj.compile(thr) fftshiftobj = FFTShift(data_dev) cfftshift = fftshiftobj.compile(thr) cifft(data_dev, data_dev, inverse=0) thr.synchronize() cfftshift(data_dev, data_dev) thr.synchronize() result2 = data_dev.get() / np.prod(np.array(ksp.shape)) result2 = result2[::-1, ::-1, ::-1] thr.release() return result2
image_filtered = simpleifft(procpar, dims, hdr, kspgauss2, args) toc() from reikna.cluda import dtypes, any_api from reikna.fft import FFT from reikna.core import Annotation, Type, Transformation, Parameter # create two timers so we can speed-test each approach api = any_api() thr = api.Thread.create() N = 512 tic() data_dev = thr.to_device(ksp) ifft = FFT(data_dev) cifft = ifft.compile(thr) cifft(data_dev, data_dev, inverse=0) thr.synchronize() toc() result = np.fft.fftshift(data_dev.get() / N**3) result = result[::-1, ::-1, ::-1] result = np.roll(np.roll(np.roll(result, 1, axis=2), 1, axis=1), 1, axis=0) print "Reikna IFFT time and first three results:" print "%s sec, %s" % (toc(), str(np.abs(result[:3, 0, 0]))) thr.release() del ifft, cifft, data_dev, thr thr = api.Thread.create() tic() data_dev = thr.to_device(ksp) ifft = FFT(data_dev)
print 'PyFFT error: ', np.sum(abs(cpuSol.real - solPyFFT.real)), np.sum( abs(cpuSol.imag - solPyFFT.imag)) print 'Extra memory use:', imem - getFreeMemory(show=False), 'MB \n' #print np.sum(cpuSol.real) imem = getFreeMemory(show=False) setZero(aux_gpu, block=block3d, grid=grid3d) setZero(aux2_gpu, block=block3d, grid=grid3d) myplan1 = plan2(aux_gpu.shape, aux_gpu.dtype, aux_gpu.dtype) gpuMesureTime(solBySci, ntimes=100) solSci = aux2_gpu.get() / float(cpuSol.size) print 'SciKits error: ', np.sum(abs(cpuSol.real - solSci.real)), np.sum( abs(cpuSol.imag - solSci.imag)) print 'Extra memory use:', imem - getFreeMemory(show=False), 'MB \n' imem = getFreeMemory(show=False) setZero(aux_gpu, block=block3d, grid=grid3d) setZero(aux2_gpu, block=block3d, grid=grid3d) api = cuda_api() thr = api.Thread(ctx) fftPlan3 = FFT(func_gpu) reikFFT = fftPlan3.compile(thr) gpuMesureTime(solByReik, ntimes=100) solReik = aux2_gpu.get() print 'Reikna error: ', np.sum(abs(cpuSol.real - solReik.real)), np.sum( abs(cpuSol.imag - solReik.imag)) print 'Extra memory use:', imem - getFreeMemory(show=False), 'MB \n' #print np.sum(cpuSol.real),np.sum(abs(cpuSol.real)) ctx.detach()
def build_object(self, arr): f = FFT(arr) return f.compile(self._thread)
def kspacegaussian_filter_CL(ksp, sigma): sz = ksp.shape dtype = np.complex64 ftype = np.float32 #api = cluda.ocl_api() api = any_api() thr = api.Thread.create() data_dev = thr.to_device(ksp) ifft = FFT(data_dev) FACTOR = 1.0 program = thr.compile(""" KERNEL void gauss_kernel( GLOBAL_MEM ${ctype} *dest, GLOBAL_MEM ${ctype} *src) { const ${ultype} x = (${ultype})get_global_id(0); const SIZE_T dim1= %d; const SIZE_T dim2= %d; const SIZE_T dim3= %d; ${ftype} sigma[3]; sigma[0]=%f;sigma[1]=%f;sigma[2]=%f; ${ftype} factor = %f; const double TWOPISQ = 19.739208802178716; //6.283185307179586; //2*3.141592; const ${ftype} SQRT2PI = 2.5066282746; const double CUBEDSQRT2PI = 15.749609945722419; const ${ultype} idx = x; ${ftype} i = (${ftype})((x / dim3) / dim2); i = (i - (${ftype})floor((${ftype})(dim1)/2.0f))/(${ftype})(dim1); ${ftype} j = (${ftype})(x / dim3); if((SIZE_T)j > dim2) {j=(${ftype})fmod(j, (${ftype})dim2);}; j = (j - (${ftype})floor((${ftype})(dim2)/2.0f))/(${ftype})(dim2); // Account for large global index (stored as ulong) before performing modulus double pre_k=fmod((double)(x), (double)dim3); ${ftype} k = (${ftype}) pre_k; k = (k - (${ftype})floor((${ftype})(dim3)/2.0f))/(${ftype})(dim3); ${ftype} weight = exp(-TWOPISQ*((i*i)*sigma[0]*sigma[0] + (j*j)*sigma[1]*sigma[1] + (k*k)*sigma[2]*sigma[2])); // ${ftype} weight = expm1(-TWOPISQ*((i*i)*sigma[0]*sigma[0] + (j*j)*sigma[1]*sigma[1] + (k*k)*sigma[2]*sigma[2]))+1; // ${ftype} weight= ${exp}(-TWOPISQ*((i*i)*sigma[0]*sigma[0] + (j*j)*sigma[1]*sigma[1] + (k*k)*sigma[2]*sigma[2])); dest[idx].x = src[idx].x * weight; dest[idx].y = src[idx].y * weight; } """ % (sz[0], sz[1], sz[2], sigma[0], sigma[1], sigma[2], FACTOR), render_kwds=dict(ctype=dtypes.ctype(dtype), ftype=dtypes.ctype(ftype), ultype=dtypes.ctype(np.uint64), exp=functions.exp(ftype)), fast_math=True) gauss_kernel = program.gauss_kernel #data_dev = thr.empty_like(ksp_dev) gauss_kernel(data_dev, data_dev, global_size=sz[0] * sz[1] * sz[2]) thr.synchronize() ## #api = any_api() #thr = api.Thread.create() #data_dev = thr.to_device(ksp_out) ifft = FFT(data_dev) cifft = ifft.compile(thr) cifft(data_dev, data_dev, inverse=0) result = np.fft.fftshift(data_dev.get() / sz[0] * sz[1] * sz[2]) result = result[::-1, ::-1, ::-1] result = np.roll(np.roll(np.roll(result, 1, axis=2), 1, axis=1), 1, axis=0) return result # ,ksp_out
def run_test(thr, shape, dtype, axes=None): data = numpy.random.normal(size=shape).astype(dtype) fft = FFT(data, axes=axes) fftc = fft.compile(thr) shift = FFTShift(data, axes=axes) shiftc = shift.compile(thr) # FFT + shift as two separate computations data_dev = thr.to_device(data) t_start = time.time() fftc(data_dev, data_dev) thr.synchronize() t_gpu_fft = time.time() - t_start t_start = time.time() shiftc(data_dev, data_dev) thr.synchronize() t_gpu_shift = time.time() - t_start data_dev = thr.to_device(data) t_start = time.time() fftc(data_dev, data_dev) shiftc(data_dev, data_dev) thr.synchronize() t_gpu_separate = time.time() - t_start data_gpu = data_dev.get() # FFT + shift as a computation with a transformation data_dev = thr.to_device(data) # a separate output array to avoid unsafety of the shift transformation res_dev = thr.empty_like(data_dev) shift_tr = fftshift(data, axes=axes) fft2 = fft.parameter.output.connect(shift_tr, shift_tr.input, new_output=shift_tr.output) fft2c = fft2.compile(thr) t_start = time.time() fft2c(res_dev, data_dev) thr.synchronize() t_gpu_combined = time.time() - t_start # Reference calculation with numpy t_start = time.time() numpy.fft.fftn(data, axes=axes) t_cpu_fft = time.time() - t_start t_start = time.time() numpy.fft.fftshift(data, axes=axes) t_cpu_shift = time.time() - t_start t_start = time.time() data_ref = numpy.fft.fftn(data, axes=axes) data_ref = numpy.fft.fftshift(data_ref, axes=axes) t_cpu_all = time.time() - t_start data_gpu2 = res_dev.get() # Checking that the results are correct # (note: this will require relaxing the tolerances # if complex64 is used instead of complex128) assert numpy.allclose(data_ref, data_gpu) assert numpy.allclose(data_ref, data_gpu2) return dict( t_gpu_fft=t_gpu_fft, t_gpu_shift=t_gpu_shift, t_gpu_separate=t_gpu_separate, t_gpu_combined=t_gpu_combined, t_cpu_fft=t_cpu_fft, t_cpu_shift=t_cpu_shift, t_cpu_all=t_cpu_all)
[Parameter('output', Annotation(Type(complex_dtype, arr.shape), 'o')), Parameter('input', Annotation(arr, 'i'))], """ ${output.store_same}( COMPLEX_CTR(${output.ctype})( ${input.load_same}, 0)); """) arr = numpy.random.normal(size=3000).astype(numpy.float32) trf = get_complex_trf(arr) # Create the FFT computation and attach the transformation above to its input. fft = FFT(trf.output) # (A shortcut: using the array type saved in the transformation) fft.parameter.input.connect(trf, trf.output, new_input=trf.input) cfft = fft.compile(thr) # Run the computation arr_dev = thr.to_device(arr) res_dev = thr.array(arr.shape, numpy.complex64) cfft(res_dev, arr_dev) result = res_dev.get() reference = numpy.fft.fft(arr) assert numpy.linalg.norm(result - reference) / numpy.linalg.norm(reference) < 1e-6
if __name__ == '__main__': api = any_api() thr = api.Thread.create() dtype = numpy.complex128 shape = (1024, 16, 16, 16) axes = (1, 2, 3) data = numpy.random.normal(size=shape) + 1j * numpy.random.normal(size=shape) data = data.astype(dtype) fft = FFT(data, axes=axes) fftc = fft.compile(thr) fft2 = FFTWithTranspose(data, axes=axes) fft2c = fft2.compile(thr) data_dev = thr.to_device(data) res_dev = thr.empty_like(data_dev) for comp, tag in [(fftc, "original FFT"), (fft2c, "transposition-based FFT")]: attempts = 10 ts = [] for i in range(attempts): t1 = time.time() comp(res_dev, data_dev) thr.synchronize() t2 = time.time()