def thunk():
    """Run the batched complex-to-complex inverse FFT for this node.

    Reads the input from ``inputs[0][0]``, writes the result into
    ``outputs[0][0]`` (re-using the existing buffer when its shape already
    matches) and finally flags the node's first output as computed.
    """
    in_shape = inputs[0][0].shape
    # Complex-to-complex transform: output shape equals input shape.
    out_shape = in_shape

    storage = outputs[0]
    # Allocate only when there is no previous buffer of the right size.
    needs_alloc = storage[0] is None or storage[0].shape != out_shape
    if needs_alloc:
        storage[0] = CudaNdarray.zeros(out_shape)

    # Both buffers are float32 arrays with an extra trailing dimension;
    # scikits.cuda interprets them as complex64 arrays instead.
    src_gpu = to_gpuarray(inputs[0][0])
    dst_gpu = to_gpuarray(storage[0])

    # (Re)build the plan only if none exists yet or the input shape changed.
    plan_is_stale = plan[0] is None or plan_input_shape[0] != in_shape
    if plan_is_stale:
        plan_input_shape[0] = in_shape
        plan[0] = fft.Plan(
            out_shape[1:-1],
            np.complex64,
            np.complex64,
            batch=out_shape[0],
        )

    fft.ifft(src_gpu, dst_gpu, plan[0])
    compute_map[node.outputs[0]][0] = True
def fft_multiply_repeated(h_fft, x, cuda_dict=None):
    """Do FFT multiplication by a filter function (possibly using CUDA)

    Parameters
    ----------
    h_fft : 1-d array or gpuarray
        The filtering array to apply.
    x : 1-d array
        The array to filter.
    cuda_dict : dict | None
        Dictionary constructed using setup_cuda_multiply_repeated().
        ``None`` (the default) is equivalent to ``dict(use_cuda=False)``,
        i.e. the CPU code path.

    Returns
    -------
    x : 1-d array
        Filtered version of x.
    """
    # A None sentinel replaces the former shared mutable default dict.
    if cuda_dict is None:
        cuda_dict = dict(use_cuda=False)

    if not cuda_dict["use_cuda"]:
        # do the fourier-domain operations
        x = np.real(ifft(h_fft * fft(x), overwrite_x=True)).ravel()
    else:
        # do the fourier-domain operations, results in second param
        cuda_dict["x"].set(x.astype(np.float64))
        cudafft.fft(cuda_dict["x"], cuda_dict["x_fft"], cuda_dict["fft_plan"])
        cuda_multiply_inplace_c128(h_fft, cuda_dict["x_fft"])
        # If we wanted to do it locally instead of using our own kernel:
        # cuda_seg_fft.set(cuda_seg_fft.get() * h_fft)
        # The trailing False is passed positionally (presumably the scale
        # flag of the library's ifft -- confirm against its signature).
        cudafft.ifft(cuda_dict["x_fft"], cuda_dict["x"],
                     cuda_dict["ifft_plan"], False)
        # Bring the result back to the host in the caller's dtype.
        x = np.array(cuda_dict["x"].get(), dtype=x.dtype, subok=True,
                     copy=False)
    return x
def thunk():
    """Run the batched complex-to-real inverse FFT for this node.

    Reads the half-spectrum from ``inputs[0][0]`` and writes the real
    signal into ``outputs[0][0]``, re-using the existing output buffer when
    its shape already matches.
    """
    in_shape = inputs[0][0].shape

    # Build the output shape: drop the trailing length-2 real/imag axis and
    # restore the full signal length on the (new) last axis.
    out_shape = tuple(in_shape[:-2]) + ((in_shape[-2] - 1) * 2,)

    storage = outputs[0]
    # Allocate only when there is no previous buffer of the right size.
    needs_alloc = storage[0] is None or storage[0].shape != out_shape
    if needs_alloc:
        storage[0] = CudaNdarray.zeros(out_shape)

    # The input is a float32 array with an extra trailing dimension;
    # scikits.cuda interprets it as a complex64 array instead.
    src_gpu = to_gpuarray(inputs[0][0])
    dst_gpu = to_gpuarray(storage[0])

    # (Re)build the plan only if none exists yet or the input shape changed.
    plan_is_stale = plan[0] is None or plan_input_shape[0] != in_shape
    if plan_is_stale:
        plan_input_shape[0] = in_shape
        plan[0] = fft.Plan(
            out_shape[1:],
            np.complex64,
            np.float32,
            batch=out_shape[0],
        )

    fft.ifft(src_gpu, dst_gpu, plan[0])
def fft_multiply_repeated(h_fft, x, cuda_dict=None):
    """Do FFT multiplication by a filter function (possibly using CUDA)

    Parameters
    ----------
    h_fft : 1-d array or gpuarray
        The filtering array to apply.
    x : 1-d array
        The array to filter.
    cuda_dict : dict | None
        Dictionary constructed using setup_cuda_multiply_repeated().
        ``None`` (the default) is equivalent to ``dict(use_cuda=False)``,
        i.e. the CPU code path.

    Returns
    -------
    x : 1-d array
        Filtered version of x.
    """
    # A None sentinel replaces the former shared mutable default dict.
    if cuda_dict is None:
        cuda_dict = dict(use_cuda=False)

    if not cuda_dict['use_cuda']:
        # do the fourier-domain operations
        x = np.real(ifft(h_fft * fft(x), overwrite_x=True)).ravel()
    else:
        # do the fourier-domain operations, results in second param
        cuda_dict['x'].set(x.astype(np.float64))
        cudafft.fft(cuda_dict['x'], cuda_dict['x_fft'], cuda_dict['fft_plan'])
        cuda_multiply_inplace_c128(h_fft, cuda_dict['x_fft'])
        # If we wanted to do it locally instead of using our own kernel:
        # cuda_seg_fft.set(cuda_seg_fft.get() * h_fft)
        # The trailing False is passed positionally (presumably the scale
        # flag of the library's ifft -- confirm against its signature).
        cudafft.ifft(cuda_dict['x_fft'], cuda_dict['x'],
                     cuda_dict['ifft_plan'], False)
        # Bring the result back to the host in the caller's dtype.
        x = np.array(cuda_dict['x'].get(), dtype=x.dtype, subok=True,
                     copy=False)
    return x
def gpu_c2r_ifft(in1, is_gpuarray=False, store_on_gpu=False):
    """
    Take the complex-to-real IFFT of a 2D array on the GPU, using the
    scikits implementation of the FFT.

    INPUTS:
    in1             (no default):       The array on which the IFFT is to be performed.
    is_gpuarray     (default=False):    Boolean specifier for whether or not input is on the gpu.
    store_on_gpu    (default=False):    Boolean specifier for whether the result is to be left on the gpu or not.

    OUTPUTS:
    gpu_out1                            The gpu array containing the result.
    OR
    gpu_out1.get()                      The result from the gpu array.
    """

    # Upload as complex64 unless the caller already handed us a gpu array.
    gpu_in1 = in1 if is_gpuarray else gpuarray.to_gpu_async(in1.astype(np.complex64))

    # The real output has 2*(k-1) samples along axis 1, where k is the
    # length of the input along that axis.
    output_size = np.array(in1.shape)
    output_size[1] = 2*(output_size[1]-1)
    rows, cols = output_size[0], output_size[1]

    gpu_out1 = gpuarray.empty([rows, cols], np.float32)
    gpu_plan = Plan(output_size, np.complex64, np.float32)
    ifft(gpu_in1, gpu_out1, gpu_plan)
    # Post-process with the project's scale_fft helper (presumably the
    # normalisation of the inverse transform -- confirm).
    scale_fft(gpu_out1)

    if not store_on_gpu:
        return gpu_out1.get()
    return gpu_out1
def test_ifft_complex64_to_float32_1d(self):
    """Round-trip a 1-D float32 signal through a complex64 -> float32 ifft."""
    signal = np.asarray(np.random.rand(self.N), np.float32)
    spectrum = np.asarray(np.fft.rfftn(signal), np.complex64)

    spectrum_gpu = gpuarray.to_gpu(spectrum)
    result_gpu = gpuarray.empty(self.N, np.float32)

    c2r_plan = fft.Plan(signal.shape, np.complex64, np.float32)
    # Final positional argument True: ask for the scaled inverse.
    fft.ifft(spectrum_gpu, result_gpu, c2r_plan, True)

    recovered = result_gpu.get()
    assert np.allclose(signal, recovered, atol=atol_float32)
def test_ifft_complex128_to_float64(self):
    """Round-trip a 1-D float64 signal through a complex128 -> float64 ifft.

    Only the first ``N // 2 + 1`` coefficients of the full spectrum are
    uploaded, i.e. the half-spectrum a complex-to-real transform consumes.
    """
    x = np.asarray(np.random.rand(self.N), np.float64)
    xf = np.asarray(np.fft.fft(x), np.complex128)
    # Floor division keeps the slice bound an integer under true division
    # (Python 3 / ``from __future__ import division``).
    xf_gpu = gpuarray.to_gpu(xf[0:self.N // 2 + 1])
    x_gpu = gpuarray.empty(self.N, np.float64)
    plan = fft.Plan(x.shape, np.complex128, np.float64)
    fft.ifft(xf_gpu, x_gpu, plan, True)
    assert np.allclose(x, x_gpu.get(), atol=atol_float64)
def test_ifft_complex128_to_float64(self):
    """Round-trip a 1-D float64 signal through a complex128 -> float64 ifft.

    Only the first ``N // 2 + 1`` coefficients of the full spectrum are
    uploaded, i.e. the half-spectrum a complex-to-real transform consumes.
    """
    x = np.asarray(np.random.rand(self.N), np.float64)
    xf = np.asarray(np.fft.fft(x), np.complex128)
    # Floor division keeps the slice bound an integer under true division
    # (Python 3 / ``from __future__ import division``).
    xf_gpu = gpuarray.to_gpu(xf[0:self.N // 2 + 1])
    x_gpu = gpuarray.empty(self.N, np.float64)
    plan = fft.Plan(x.shape, np.complex128, np.float64)
    fft.ifft(xf_gpu, x_gpu, plan, True)
    assert np.allclose(x, x_gpu.get(), atol=atol_float64)
def irfft2(self, i, o=None, cache=True):
    """Batched 2-D complex-to-real inverse FFT over the last two axes.

    Parameters
    ----------
    i : array
        Input half-spectrum; the last two axes are transformed and all
        leading axes are treated as the batch.
    o : array, optional
        Output buffer. When None, one of shape ``leading + real_shape`` and
        type ``self.rtype`` is allocated from ``self.context``.
    cache : bool
        Forwarded to ``self.get_plan``.

    Returns
    -------
    o : array
        The output buffer holding the scaled inverse transform.
    """
    shape = i.shape[:-2]
    cshape = i.shape[-2:]
    # Real output width is 2*(k-1) for a half-spectrum of width k.
    rshape = (cshape[0], (cshape[1] - 1) * 2)
    # ``np.int`` was only an alias of the builtin and has been removed from
    # NumPy; the builtin ``int`` is the equivalent dtype.
    batch = np.prod(shape, dtype=int)
    plan = self.get_plan(cache, rshape, self.ctype, self.rtype, batch)
    if o is None:
        o = self.context.empty(shape + rshape, self.rtype)
    cu_fft.ifft(i, o, plan, scale=True)
    return o
def irfft2(self, i, o=None, cache=True):
    """Batched 2-D complex-to-real inverse FFT over the last two axes.

    Parameters
    ----------
    i : array
        Input half-spectrum; the last two axes are transformed and all
        leading axes are treated as the batch.
    o : array, optional
        Output buffer. When None, one of shape ``leading + real_shape`` and
        type ``self.rtype`` is allocated from ``self.context``.
    cache : bool
        Forwarded to ``self.get_plan``.

    Returns
    -------
    o : array
        The output buffer holding the scaled inverse transform.
    """
    shape = i.shape[:-2]
    cshape = i.shape[-2:]
    # Real output width is 2*(k-1) for a half-spectrum of width k.
    rshape = (cshape[0], (cshape[1] - 1) * 2)
    # ``np.int`` was only an alias of the builtin and has been removed from
    # NumPy; the builtin ``int`` is the equivalent dtype.
    batch = np.prod(shape, dtype=int)
    plan = self.get_plan(cache, rshape, self.ctype, self.rtype, batch)
    if o is None:
        o = self.context.empty(shape + rshape, self.rtype)
    cu_fft.ifft(i, o, plan, scale=True)
    return o
def test_ifft_complex64_to_float32_2d(self):
    """Round-trip a 2-D float32 signal through a complex64 -> float32 ifft."""
    signal = np.asarray(np.random.rand(self.N, self.M), np.float32)
    spectrum = np.asarray(np.fft.rfftn(signal), np.complex64)

    # The original note states that rfftn returns a Fortran-ordered array,
    # so the spectrum is made C-contiguous before upload via to_gpu.
    spectrum_c = np.ascontiguousarray(spectrum)
    spectrum_gpu = gpuarray.to_gpu(spectrum_c)
    result_gpu = gpuarray.empty((self.N, self.M), np.float32)

    c2r_plan = fft.Plan(signal.shape, np.complex64, np.float32)
    fft.ifft(spectrum_gpu, result_gpu, c2r_plan, True)

    recovered = result_gpu.get()
    assert np.allclose(signal, recovered, atol=atol_float32)
def test_batch_ifft_complex128_to_float64_1d(self):
    """Round-trip B independent 1-D float64 signals through a batched ifft."""
    signals = np.asarray(np.random.rand(self.B, self.N), np.float64)
    spectra = np.asarray(np.fft.rfft(signals, axis=1), np.complex128)

    # Force a C-ordered array before handing it to gpuarray.to_gpu (the
    # original note attributes this to the FFT result's memory order).
    spectra_c = np.ascontiguousarray(spectra)
    spectra_gpu = gpuarray.to_gpu(spectra_c)
    result_gpu = gpuarray.empty((self.B, self.N), np.float64)

    # One plan of length N, executed B times (batch).
    batched_plan = fft.Plan(signals.shape[1], np.complex128, np.float64,
                            batch=self.B)
    fft.ifft(spectra_gpu, result_gpu, batched_plan, True)

    recovered = result_gpu.get()
    assert np.allclose(signals, recovered, atol=atol_float64)
def test_batch_ifft_complex128_to_float64_2d(self):
    """Round-trip B independent 2-D float64 signals through a batched ifft."""
    signals = np.asarray(np.random.rand(self.B, self.N, self.M), np.float64)
    spectra = np.asarray(np.fft.rfftn(signals, axes=(1,2)), np.complex128)

    # Force a C-ordered array before handing it to gpuarray.to_gpu (the
    # original note attributes this to rfftn's output memory order).
    spectra_c = np.ascontiguousarray(spectra)
    spectra_gpu = gpuarray.to_gpu(spectra_c)
    result_gpu = gpuarray.empty((self.B, self.N, self.M), np.float64)

    # One N x M plan, executed B times (batch).
    batched_plan = fft.Plan([self.N, self.M], np.complex128, np.float64,
                            batch=self.B)
    fft.ifft(spectra_gpu, result_gpu, batched_plan, True)

    recovered = result_gpu.get()
    assert np.allclose(signals, recovered, atol=atol_float64)
def convol(self, data1, data2):
    """Combine two arrays in Fourier space on the GPU and return the real part.

    Both inputs are uploaded as complex128 into the per-shape device buffers
    cached on the class, transformed in place, combined with
    ``self.multconj`` (per the inline note, the equivalent of
    ``data1_gpu *= data2_gpu.conj()``), and inverse-transformed with scaling.

    Parameters
    ----------
    data1, data2 : arrays
        Host arrays exposing ``astype``; they are written into the device
        buffers registered for ``self.shape``.

    Returns
    -------
    The ``.real`` part of the inverse-transformed result, on the host.
    """
    self.init()
    self.ctx.push()
    # The context must be popped even when an upload or a transform raises,
    # otherwise it stays pushed on the context stack.
    try:
        plan = self.__class__.plans[self.shape]
        data1_gpu = self.__class__.data1_gpus[self.shape]
        data2_gpu = self.__class__.data2_gpus[self.shape]
        data1_gpu.set(data1.astype(numpy.complex128))
        cu_fft.fft(data1_gpu, data1_gpu, plan)
        data2_gpu.set(data2.astype(numpy.complex128))
        cu_fft.fft(data2_gpu, data2_gpu, plan)
        # data1_gpu *= data2_gpu.conj()
        self.multconj(data1_gpu, data2_gpu)
        cu_fft.ifft(data1_gpu, data1_gpu, plan, True)
        # self.ctx.synchronize()
        res = data1_gpu.get().real
    finally:
        self.ctx.pop()
    return res
def correlate(self, data1, data2):
    """Combine two arrays in Fourier space on the GPU and return the real part.

    The whole GPU section runs while holding the class-level ``sem``. Both
    inputs are uploaded as complex128 into the per-shape device buffers
    cached on the class, transformed in place, combined with
    ``self.multconj`` (per the inline note, the equivalent of
    ``data1_gpu *= data2_gpu.conj()``), and inverse-transformed with scaling.

    Parameters
    ----------
    data1, data2 : arrays
        Host arrays exposing ``astype``; they are written into the device
        buffers registered for ``self.shape``.

    Returns
    -------
    The ``.real`` part of the inverse-transformed result, on the host.
    """
    self.init()
    with self.__class__.sem:
        self.ctx.push()
        # The context must be popped even when an upload or a transform
        # raises, otherwise it stays pushed on the context stack.
        try:
            plan = self.__class__.plans[self.shape]
            data1_gpu = self.__class__.data1_gpus[self.shape]
            data2_gpu = self.__class__.data2_gpus[self.shape]
            data1_gpu.set(data1.astype(numpy.complex128))
            cu_fft.fft(data1_gpu, data1_gpu, plan)
            data2_gpu.set(data2.astype(numpy.complex128))
            cu_fft.fft(data2_gpu, data2_gpu, plan)
            # data1_gpu *= data2_gpu.conj()
            self.multconj(data1_gpu, data2_gpu)
            cu_fft.ifft(data1_gpu, data1_gpu, plan, True)
            # self.ctx.synchronize()
            res = data1_gpu.get().real
        finally:
            self.ctx.pop()
    return res
def cufft(data, shape=None, inverse=False):
    """Run an in-place complex64 FFT (or inverse FFT) of ``data`` on the GPU.

    Parameters
    ----------
    data : array
        Host array to transform; it is converted to complex64 for upload.
    shape : optional
        When truthy, ``data`` is first passed through ``pad2(data, shape)``.
    inverse : bool
        Call ``cu_fft.ifft`` instead of ``cu_fft.fft``. No scale argument is
        given, so normalisation is whatever the library defaults to.

    Returns
    -------
    The transformed array, copied back to the host.
    """
    if shape:
        data = pad2(data, shape)

    # Plans are cached per (possibly padded) array shape.
    plan = CUFFT_PLANS.get(data.shape)
    if plan is None:
        plan = cu_fft.Plan(data.shape, np.complex64, np.complex64)
        CUFFT_PLANS[data.shape] = plan

    # ``np.cast`` was removed in NumPy 2.0; asarray + astype is the
    # equivalent conversion and works on old and new NumPy alike.
    gpu_data = gpuarray.to_gpu(np.asarray(data).astype(np.complex64))

    transform = cu_fft.ifft if inverse else cu_fft.fft
    transform(gpu_data, gpu_data, plan)
    return gpu_data.get()
def resample_sdbe_to_r2dbe_zpfft(Xs):
    """
    Resample SWARM spectrum product in time-domain at R2DBE rate using
    zero-padding and a radix-2 iFFT algorithm.

    Each spectrum window is zero-padded on the device, inverse transformed,
    decimated starting at a per-window offset, rescaled, and copied into the
    output time series.

    Arguments:
    ----------
    Xs -- MxN numpy array in which the zeroth dimension is increasing
    snapshot index, and the first dimension is the positive frequency
    half of the spectrum.

    Returns:
    --------
    xs -- The time-domain signal sampled at the R2DBE rate.
    next_start_vec -- Start indices for each FFT window.
    """

    # timestep sizes for SWARM and R2DBE rates
    dt_s = 1.0 / SWARM_RATE
    dt_r = 1.0 / R2DBE_RATE

    # we need to oversample by factor 64 and then undersample by factor 39
    simple_r = 64  # 4096
    simple_s = 39  # 2496
    fft_window_oversample = 2 * SWARM_CHANNELS * simple_r  # 2* due to real FFT
    # length of the half-spectrum consumed by the complex-to-real iFFT;
    # floor division keeps it an integer under true division
    half_spectrum_len = fft_window_oversample // 2 + 1

    # oversample timestep size
    dt_f = dt_s / simple_r

    # the timespan of one SWARM FFT window
    T_s = dt_s * SWARM_SAMPLES_PER_WINDOW

    # time vector over one SWARM FFT window at the oversampled step size
    t_f = arange(0, T_s, dt_f)

    # offset in oversampled time series that corresponds to one dt_r step
    # from the last R2DBE rate sample in the previous window
    next_start = 0

    # total number of time series samples
    N_x = int(ceil(Xs.shape[0] * SWARM_SAMPLES_PER_WINDOW * dt_s / dt_r))
    # and initialize the output
    xs = zeros(N_x, dtype=float32)
    next_start_vec = zeros(Xs.shape[0])
    # index in output where samples from next window are stored
    start_output = 0

    # cuFFT plan for complex to real DFT
    plan = cu_fft.Plan(fft_window_oversample, complex64, float32)
    # padding kernel
    fill_padded = mod.get_function("fill_padded")
    # downsampling kernel
    downsample = mod.get_function("downsample")
    # FFT scaling kernel
    scale = ElementwiseKernel(
        "float *a",
        "a[i] = {0} * a[i]".format(1. / fft_window_oversample),
        "scale")

    # max size of resampled chunk from a single window
    xs_chunk_size_max = int32(ceil((1. * fft_window_oversample) / simple_s))

    # create memory on device for cuFFT
    xf_d = gpuarray.empty(fft_window_oversample, dtype=float32)
    xp_d = gpuarray.zeros(half_spectrum_len, dtype=complex64)
    y_d = gpuarray.empty(xs_chunk_size_max, dtype=float32)

    for ii in range(Xs.shape[0]):

        # move window to device
        x_d = gpuarray.to_gpu(Xs[ii, :])

        # threads per block
        # number of blocks (keep the array as zeros to save time)
        TPB = 1024
        nB = int(ceil(1. * Xs.shape[1] / TPB))

        # pad with zeros to oversample by 64
        fill_padded(int32(1), xp_d, int32(half_spectrum_len),
                    x_d, int32(Xs.shape[1]),
                    block=(TPB, 1, 1), grid=(nB, 1))

        # iFFT (unscaled; the decimated samples are rescaled below)
        cu_fft.ifft(xp_d, xf_d, plan, scale=False)

        xs_chunk_size = int32(
            ceil((1. * fft_window_oversample - next_start) / simple_s))

        # threads per block
        TPB = 64
        # number of blocks
        nB = int(ceil(1. * xs_chunk_size / TPB))

        # undersample by simple_s to the correct rate, starting at the
        # correct offset in this window
        downsample(xf_d, int32(fft_window_oversample),
                   y_d, xs_chunk_size,
                   int32(next_start), int32(simple_s),
                   block=(TPB, 1, 1), grid=(nB, 1))

        # rescale from ifft using ElementwiseKernel
        scale(y_d)

        # pull data back onto host
        xs_chunk = y_d.get()

        # fill output numpy array
        stop_output = start_output + xs_chunk_size
        xs[start_output:stop_output] = xs_chunk[:xs_chunk_size]
        # update the starting index in the output array
        start_output = stop_output

        # mark the time of the last used sample relative to the start
        # of this window (same stride as the downsample kernel)
        time_window_start_to_last_used_sample = t_f[next_start::simple_s][-1]
        # calculate the remaining time in this window
        time_remaining_in_window = T_s - time_window_start_to_last_used_sample
        # convert to the equivalent number of oversample timesteps; the
        # int() keeps next_start usable as an index where round() returns
        # a float
        num_dt_f_steps_short = int(round(time_remaining_in_window / dt_f))

        next_start_vec[ii] = next_start
        if num_dt_f_steps_short == 0:
            next_start = 0
        else:
            next_start = simple_s - num_dt_f_steps_short

    return xs, next_start_vec
def sample_defrost_gpu(lat, func, gamma, m2_eff):
    """Calculates a sample of random values in the lattice

    lat = Lattice
    func = Cuda kernel that fills the real-space work array from the
           tabulated kernel
    gamma = -0.25 or +0.25
    m2_eff = effective mass

    The kernel table is built on the host with an fftw3 plan; the lattice
    transforms are done on the GPU with CuFFT. The host result is returned
    in ``lat.prec_real`` precision, multiplied by ``1 / lat.VL``.
    """
    import scikits.cuda.fft as fft
    import fftw3

    # Various constants:
    mpl = lat.mpl
    n = lat.n
    nn = lat.nn
    os = 16
    nos = n * pow(os, 2)
    dk = lat.dk
    dx = lat.dx
    dkos = dk / (2. * os)
    dxos = dx / os
    kcut = nn * dk / 2.0
    norm = 0.5 / (math.sqrt(2 * pi * dk**3.) * mpl) * (dkos / dxos)

    # The fftw3 plan is bound to ``ker`` (in place), so the array must be
    # filled element by element rather than rebound to a new array.
    ker = np.empty(nos, dtype=lat.prec_real)
    fft1 = fftw3.Plan(ker, ker, direction='forward', flags=['measure'],
                      realtypes=['realodd 10'])

    for k in range(nos):
        kk = (k + 0.5) * dkos
        ker[k] = kk * (kk**2. + m2_eff)**gamma * math.exp(-(kk / kcut)**2.)
    fft1.execute()
    fftw3.destroy_plan(fft1)

    for k in range(nos):
        ker[k] = norm * ker[k] / (k + 1)

    # Floor division keeps the array dimension an integer under true
    # division.
    # NOTE(review): the half-length axis is placed first here; confirm this
    # matches the layout the real-to-complex transform writes.
    Fk_gpu = gpuarray.zeros((n // 2 + 1, n, n), dtype=lat.prec_complex)

    ker_gpu = gpuarray.to_gpu(ker)
    tmp_gpu = gpuarray.zeros((n, n, n), dtype=lat.prec_real)

    plan = fft.Plan(tmp_gpu.shape, lat.prec_real, lat.prec_complex)
    plan2 = fft.Plan(tmp_gpu.shape, lat.prec_complex, lat.prec_real)

    func(tmp_gpu, ker_gpu, np.uint32(nn), np.float64(os),
         np.uint32(lat.dimx), np.uint32(lat.dimy), np.uint32(lat.dimz),
         block=lat.cuda_block_1, grid=lat.cuda_grid)

    fft.fft(tmp_gpu, Fk_gpu, plan)

    if lat.test == True:
        # Single-argument parenthesised form prints identically on
        # Python 2 and Python 3.
        print('Testing mode on! Set testQ to False to disable this.\n')
        # Fixed seed so the test-mode random factors are reproducible.
        np.random.seed(1)

    rr1 = (np.random.normal(size=Fk_gpu.shape) +
           np.random.normal(size=Fk_gpu.shape) * 1j)

    # Multiply the spectrum by the complex Gaussian factors on the host.
    Fk = Fk_gpu.get()
    Fk *= rr1
    Fk_gpu = gpuarray.to_gpu(Fk)

    fft.ifft(Fk_gpu, tmp_gpu, plan2)

    res = (tmp_gpu.get()).astype(lat.prec_real)
    res *= 1. / lat.VL

    return res
def resample_sdbe_to_r2dbe_fft_interp(Xs, interp_kind="nearest"):
    """
    Resample SWARM spectrum product in time-domain at R2DBE rate using
    iFFT and then interpolation in the time-domain.

    Arguments:
    ----------
    Xs -- MxN numpy array in which the zeroth dimension is increasing
    snapshot index, and the first dimension is the positive frequency
    half of the spectrum.
    interp_kind -- Kind of interpolation, either 'nearest' or 'linear'.

    Returns:
    --------
    xs -- The time-domain signal sampled at the R2DBE rate.

    Raises:
    -------
    ValueError -- if interp_kind is neither 'nearest' nor 'linear'.
    """

    # Reject unsupported kinds before touching the device: otherwise no
    # kernel runs and the uninitialised output buffer would be returned.
    if interp_kind not in ('nearest', 'linear'):
        raise ValueError("unsupported interp_kind: %r" % (interp_kind,))

    # timestep sizes for SWARM and R2DBE rates
    dt_s = 1.0 / SWARM_RATE
    dt_r = 1.0 / R2DBE_RATE

    # cuFFT plan for complex to real DFT, one transform per snapshot
    plan = cu_fft.Plan(SWARM_SAMPLES_PER_WINDOW, complex64, float32,
                       Xs.shape[0])

    # load complex spectrum to device
    x_d = gpuarray.to_gpu(Xs)
    xp_d = gpuarray.empty((Xs.shape[0], Xs.shape[1] + 1), dtype=complex64)

    # pad nyquist with zeros
    block = (32, 32, 1)
    grid = (int(ceil(1. * (Xs.shape[1] + 1) / block[1])),
            int(ceil(1. * Xs.shape[0] / block[0])))
    fill_padded = mod.get_function("fill_padded")
    fill_padded(int32(Xs.shape[0]), xp_d, int32(Xs.shape[1] + 1),
                x_d, int32(Xs.shape[1]),
                block=block, grid=grid)

    # allocate memory for time series
    xf_d = gpuarray.empty((Xs.shape[0], SWARM_SAMPLES_PER_WINDOW), float32)

    # calculate time series, include scaling
    cu_fft.ifft(xp_d, xf_d, plan, scale=True)

    # and interpolate
    xs_size = int(floor(
        Xs.shape[0] * SWARM_SAMPLES_PER_WINDOW * dt_s / dt_r)) - 1
    TPB = 64                              # threads per block
    nB = int(ceil(1. * xs_size / TPB))    # number of blocks
    xs_d = gpuarray.empty(xs_size, float32)  # decimated time-series

    if interp_kind == 'nearest':
        # the kernel is looked up under the same name as the kind
        nearest_interp = mod.get_function(interp_kind)
        # call kernel
        nearest_interp(xf_d, xs_d, int32(xs_size), float64(dt_r / dt_s),
                       block=(TPB, 1, 1), grid=(nB, 1))
    elif interp_kind == 'linear':
        # compile kernel
        linear_interp = mod.get_function("copy_texture_kernel")
        # get texture reference
        a_texref = mod.get_texref("a_tex")
        a_texref.set_filter_mode(drv.filter_mode.LINEAR)  # linear
        # a_texref.set_filter_mode(drv.filter_mode.POINT)  # nearest-neighbor
        # move time series to texture reference
        # following http://lists.tiker.net/pipermail/pycuda/2009-November/001916.html
        descr = drv.ArrayDescriptor()
        descr.format = drv.array_format.FLOAT
        descr.height = Xs.shape[0]
        descr.width = SWARM_SAMPLES_PER_WINDOW
        descr.num_channels = 1
        # row pitch in bytes: 4 bytes per float32 sample
        a_texref.set_address_2d(xf_d.gpudata, descr,
                                SWARM_SAMPLES_PER_WINDOW * 4)
        # set up linear interpolation over texture
        linear_interp(xs_d, int32(xs_size), float64(dt_r / dt_s),
                      int32(SWARM_SAMPLES_PER_WINDOW),
                      texrefs=[a_texref], block=(TPB, 1, 1), grid=(nB, 1))

    return xs_d.get()
def fft_resample(x, W, new_len, npad, to_remove,
                 cuda_dict=None):
    """Do FFT resampling with a filter function (possibly using CUDA)

    Parameters
    ----------
    x : 1-d array
        The array to resample.
    W : 1-d array or gpuarray
        The filtering function to apply.
    new_len : int
        The size of the output array (before removing padding).
    npad : int
        Amount of padding to apply before resampling.
    to_remove : int
        Number of samples to remove after resampling.
    cuda_dict : dict | None
        Dictionary constructed using setup_cuda_multiply_repeated().
        ``None`` (the default) is equivalent to ``dict(use_cuda=False)``,
        i.e. the CPU code path.

    Returns
    -------
    y : 1-d array
        Filtered and resampled version of x, with ``to_remove`` samples
        dropped from each end when ``to_remove > 0``.
    """
    # A None sentinel replaces the former shared mutable default dict.
    if cuda_dict is None:
        cuda_dict = dict(use_cuda=False)

    # add some padding at beginning and end to make this work a little cleaner
    x = _smart_pad(x, npad)
    old_len = len(x)
    shorter = new_len < old_len
    if not cuda_dict['use_cuda']:
        # Copy the lowest N frequency bins (positive half, then negative
        # half) into a spectrum of the new length.
        N = int(min(new_len, old_len))
        sl_1 = slice((N + 1) // 2)
        y_fft = np.zeros(new_len, np.complex128)
        x_fft = fft(x).ravel() * W
        y_fft[sl_1] = x_fft[sl_1]
        sl_2 = slice(-(N - 1) // 2, None)
        y_fft[sl_2] = x_fft[sl_2]
        y = np.real(ifft(y_fft, overwrite_x=True)).ravel()
    else:
        # Zero-extend x on the host when upsampling, then upload.
        cuda_dict['x'].set(
            np.concatenate((x, np.zeros(max(new_len - old_len, 0), x.dtype))))
        # do the fourier-domain operations, results put in second param
        cudafft.fft(cuda_dict['x'], cuda_dict['x_fft'], cuda_dict['fft_plan'])
        cuda_multiply_inplace_c128(W, cuda_dict['x_fft'])
        # This is not straightforward, but because x_fft and y_fft share
        # the same data (and only one half of the full DFT is stored), we
        # don't have to transfer the slice like we do in scipy. All we
        # need to worry about is the Nyquist component, either halving it
        # or taking just the real component...
        use_len = new_len if shorter else old_len
        func = cuda_real_c128 if shorter else cuda_halve_c128
        if use_len % 2 == 0:
            # use_len is even here, so the Nyquist bin is exactly use_len / 2
            nyq = int(use_len // 2)
            func(cuda_dict['x_fft'], slice=slice(nyq, nyq + 1))
        cudafft.ifft(cuda_dict['x_fft'], cuda_dict['x'],
                     cuda_dict['ifft_plan'], scale=False)
        y = cuda_dict['x'].get()[:new_len if shorter else None]

    # now let's trim it back to the correct size (if there was padding)
    if to_remove > 0:
        keep = np.ones((new_len), dtype='bool')
        keep[:to_remove] = False
        keep[-to_remove:] = False
        y = np.compress(keep, y)

    return y
def sample_defrost_gpu(lat, func, gamma, m2_eff):
    """Calculates a sample of random values in the lattice

    lat = Lattice
    func = Cuda kernel that fills the real-space work array from the
           tabulated kernel
    gamma = -0.25 or +0.25
    m2_eff = effective mass

    The kernel table is built on the host with an fftw3 plan; the lattice
    transforms are done on the GPU with CuFFT. The host result is returned
    in ``lat.prec_real`` precision, multiplied by ``1 / lat.VL``.
    """
    import scikits.cuda.fft as fft
    import fftw3

    # Various constants:
    mpl = lat.mpl
    n = lat.n
    nn = lat.nn
    os = 16
    nos = n * pow(os, 2)
    dk = lat.dk
    dx = lat.dx
    dkos = dk / (2. * os)
    dxos = dx / os
    kcut = nn * dk / 2.0
    norm = 0.5 / (math.sqrt(2 * pi * dk**3.) * mpl) * (dkos / dxos)

    # The fftw3 plan is bound to ``ker`` (in place), so the array must be
    # filled element by element rather than rebound to a new array.
    ker = np.empty(nos, dtype=lat.prec_real)
    fft1 = fftw3.Plan(ker, ker, direction='forward', flags=['measure'],
                      realtypes=['realodd 10'])

    for k in range(nos):
        kk = (k + 0.5) * dkos
        ker[k] = kk * (kk**2. + m2_eff)**gamma * math.exp(-(kk / kcut)**2.)
    fft1.execute()
    fftw3.destroy_plan(fft1)

    for k in range(nos):
        ker[k] = norm * ker[k] / (k + 1)

    # Floor division keeps the array dimension an integer under true
    # division.
    # NOTE(review): the half-length axis is placed first here; confirm this
    # matches the layout the real-to-complex transform writes.
    Fk_gpu = gpuarray.zeros((n // 2 + 1, n, n), dtype=lat.prec_complex)

    ker_gpu = gpuarray.to_gpu(ker)
    tmp_gpu = gpuarray.zeros((n, n, n), dtype=lat.prec_real)

    plan = fft.Plan(tmp_gpu.shape, lat.prec_real, lat.prec_complex)
    plan2 = fft.Plan(tmp_gpu.shape, lat.prec_complex, lat.prec_real)

    func(tmp_gpu, ker_gpu, np.uint32(nn), np.float64(os),
         np.uint32(lat.dimx), np.uint32(lat.dimy), np.uint32(lat.dimz),
         block=lat.cuda_block_1, grid=lat.cuda_grid)

    fft.fft(tmp_gpu, Fk_gpu, plan)

    if lat.test == True:
        # Single-argument parenthesised form prints identically on
        # Python 2 and Python 3.
        print('Testing mode on! Set testQ to False to disable this.\n')
        # Fixed seed so the test-mode random factors are reproducible.
        np.random.seed(1)

    rr1 = (np.random.normal(size=Fk_gpu.shape) +
           np.random.normal(size=Fk_gpu.shape) * 1j)

    # Multiply the spectrum by the complex Gaussian factors on the host.
    Fk = Fk_gpu.get()
    Fk *= rr1
    Fk_gpu = gpuarray.to_gpu(Fk)

    fft.ifft(Fk_gpu, tmp_gpu, plan2)

    res = (tmp_gpu.get()).astype(lat.prec_real)
    res *= 1. / lat.VL

    return res
# Script fragment: iterate N rounds of Fourier-space / real-space constraint
# projection on the GPU, timing each round. Names such as n, m, k, sq, mag,
# indexp, N, sobject, sobm, plan, beta, ctx, time0, serr and nerr are
# expected to be defined before this fragment.
ii = 0
tmpimg = numpy.zeros((n, m, k), dtype=numpy.float32)
ln = sq + 5
mags = mag[indexp].sum()
del indexp
s = 3
N2 = int(N * 0.7)
N3 = int(N * 0.7)
# Upload the starting object and keep a device-side copy of it in gpu_last.
gpu_data.set(sobject.astype(numpy.complex64))
pycuda.driver.memcpy_dtod(gpu_last.gpudata, gpu_data.gpudata, gpu_data.nbytes)
gpu_intensity.set(mag)
gpu_mask.set(sobm)
#print real_space.nbytes
for i in range(N):
    t0 = time()
    # Forward transform, apply the Fourier-space constraint, transform back
    # (scaled), then apply the real-space constraint.
    cu_fft.fft(gpu_data, gpu_data, plan)
    constrains_fourier(gpu_data, gpu_intensity)
    cu_fft.ifft(gpu_data, gpu_data, plan, True)
    constrains_real(gpu_data, gpu_last, gpu_mask, beta)
    # Remember this iterate for the next round.
    pycuda.driver.memcpy_dtod(gpu_last.gpudata, gpu_data.gpudata, gpu_data.nbytes)
    # t1 is taken before the explicit synchronize and t2 after it, so the
    # two printed durations differ by the time spent waiting in synchronize().
    t1 = time()
    ctx.synchronize()
    t2 = time()
    print("With CUDA, the full loop took %.3fs but after sync %.3fs" % (t1 - t0, t2 - t0))
del tmpimg
# Total elapsed time since time0 and the resulting iterations per second.
print "it took", time() - time0, N / (time() - time0)
print "smallest error", serr, "number", nerr
# Self-test 1: batched real-to-complex FFT followed by the scaled
# complex-to-real inverse, compared against NumPy's round trip.
# The single-argument print(...) form behaves identically on Python 2 and 3.
print('Testing fft/ifft..')
N = 4096 * 16
batch_size = 16

x = np.asarray(np.random.rand(batch_size, N), np.float32)
xf = np.fft.fft(x)
y = np.real(np.fft.ifft(xf))

x_gpu = gpuarray.to_gpu(x)
# Half-spectrum length; floor division keeps the dimension an integer under
# true division.
xf_gpu = gpuarray.empty((batch_size, N // 2 + 1), np.complex64)
plan_forward = cu_fft.Plan(N, np.float32, np.complex64, batch_size)
cu_fft.fft(x_gpu, xf_gpu, plan_forward)

y_gpu = gpuarray.empty_like(x_gpu)
plan_inverse = cu_fft.Plan(N, np.complex64, np.float32, batch_size)
cu_fft.ifft(xf_gpu, y_gpu, plan_inverse, True)

# '%s %s' reproduces the output of the former two-item print statement.
print('%s %s' % ('Success status: ', np.allclose(y, y_gpu.get(), atol=1e-6)))

# Self-test 2: in-place complex-to-complex FFT and scaled inverse; the
# buffer should end up equal to the original input.
print('Testing in-place fft..')
x = np.asarray(np.random.rand(batch_size, N) +
               1j * np.random.rand(batch_size, N), np.complex64)
x_gpu = gpuarray.to_gpu(x)

plan = cu_fft.Plan(N, np.complex64, np.complex64, batch_size)
cu_fft.fft(x_gpu, x_gpu, plan)

cu_fft.ifft(x_gpu, x_gpu, plan, True)

print('%s %s' % ('Success status: ', np.allclose(x, x_gpu.get(), atol=1e-6)))
Ny, block=blocksize, grid=gridsize) Sb_kernel(FFTiB_d, FIB_d, FFToB_d, d_d, np.float32(beta), Nx, Ny, block=blocksize, grid=gridsize) # inverse FFT to compute S + 1 in each color channel fft_s = time.time() cu_fft.ifft(FFTiR_d, FFToR_d, plan, scale=True) cu_fft.ifft(FFTiG_d, FFToG_d, plan, scale=True) cu_fft.ifft(FFTiB_d, FFToB_d, plan, scale=True) fft_e = time.time() step_2_fft += fft_e - fft_s # merge real components of 3 complex color channels merge_r_kernel(S_d, FFToR_d, FFToG_d, FFToB_d, Nx, Ny, block=blocksize, grid=gridsize)
def resample_sdbe_to_r2dbe_zpfft(Xs):
    """
    Resample SWARM spectrum product in time-domain at R2DBE rate using
    zero-padding and a radix-2 iFFT algorithm.

    Each spectrum window is zero-padded on the device, inverse transformed,
    decimated starting at a per-window offset, rescaled, and copied into the
    output time series.

    Arguments:
    ----------
    Xs -- MxN numpy array in which the zeroth dimension is increasing
    snapshot index, and the first dimension is the positive frequency
    half of the spectrum.

    Returns:
    --------
    xs -- The time-domain signal sampled at the R2DBE rate.
    next_start_vec -- Start indices for each FFT window.
    """

    # timestep sizes for SWARM and R2DBE rates
    dt_s = 1.0/SWARM_RATE
    dt_r = 1.0/R2DBE_RATE

    # we need to oversample by factor 64 and then undersample by factor 39
    simple_r = 64  # 4096
    simple_s = 39  # 2496
    fft_window_oversample = 2*SWARM_CHANNELS*simple_r  # 2* due to real FFT
    # length of the half-spectrum consumed by the complex-to-real iFFT;
    # floor division keeps it an integer under true division
    half_spectrum_len = fft_window_oversample//2 + 1

    # oversample timestep size
    dt_f = dt_s/simple_r

    # the timespan of one SWARM FFT window
    T_s = dt_s*SWARM_SAMPLES_PER_WINDOW

    # time vector over one SWARM FFT window at the oversampled step size
    t_f = arange(0, T_s, dt_f)

    # offset in oversampled time series that corresponds to one dt_r step
    # from the last R2DBE rate sample in the previous window
    next_start = 0

    # total number of time series samples
    N_x = int(ceil(Xs.shape[0]*SWARM_SAMPLES_PER_WINDOW*dt_s/dt_r))
    # and initialize the output
    xs = zeros(N_x, dtype=float32)
    next_start_vec = zeros(Xs.shape[0])
    # index in output where samples from next window are stored
    start_output = 0

    # cuFFT plan for complex to real DFT
    plan = cu_fft.Plan(fft_window_oversample, complex64, float32)
    # padding kernel
    fill_padded = mod.get_function("fill_padded")
    # downsampling kernel
    downsample = mod.get_function("downsample")
    # FFT scaling kernel
    scale = ElementwiseKernel(
        "float *a",
        "a[i] = {0} * a[i]".format(1./fft_window_oversample),
        "scale")

    # max size of resampled chunk from a single window
    xs_chunk_size_max = int32(ceil((1. * fft_window_oversample)/simple_s))

    # create memory on device for cuFFT
    xf_d = gpuarray.empty(fft_window_oversample, dtype=float32)
    xp_d = gpuarray.zeros(half_spectrum_len, dtype=complex64)
    y_d = gpuarray.empty(xs_chunk_size_max, dtype=float32)

    for ii in range(Xs.shape[0]):

        # move window to device
        x_d = gpuarray.to_gpu(Xs[ii, :])

        # threads per block
        # number of blocks (keep the array as zeros to save time)
        TPB = 1024
        nB = int(ceil(1. * Xs.shape[1] / TPB))

        # pad with zeros to oversample by 64
        fill_padded(int32(1), xp_d, int32(half_spectrum_len),
                    x_d, int32(Xs.shape[1]),
                    block=(TPB, 1, 1), grid=(nB, 1))

        # iFFT (unscaled; the decimated samples are rescaled below)
        cu_fft.ifft(xp_d, xf_d, plan, scale=False)

        xs_chunk_size = int32(
            ceil((1. * fft_window_oversample - next_start)/simple_s))

        # threads per block
        TPB = 64
        # number of blocks
        nB = int(ceil(1. * xs_chunk_size / TPB))

        # undersample by simple_s to the correct rate, starting at the
        # correct offset in this window
        downsample(xf_d, int32(fft_window_oversample),
                   y_d, xs_chunk_size,
                   int32(next_start), int32(simple_s),
                   block=(TPB, 1, 1), grid=(nB, 1))

        # rescale from ifft using ElementwiseKernel
        scale(y_d)

        # pull data back onto host
        xs_chunk = y_d.get()

        # fill output numpy array
        stop_output = start_output+xs_chunk_size
        xs[start_output:stop_output] = xs_chunk[:xs_chunk_size]
        # update the starting index in the output array
        start_output = stop_output

        # mark the time of the last used sample relative to the start
        # of this window (same stride as the downsample kernel)
        time_window_start_to_last_used_sample = t_f[next_start::simple_s][-1]
        # calculate the remaining time in this window
        time_remaining_in_window = T_s-time_window_start_to_last_used_sample
        # convert to the equivalent number of oversample timesteps; the
        # int() keeps next_start usable as an index where round() returns
        # a float
        num_dt_f_steps_short = int(round(time_remaining_in_window/dt_f))

        next_start_vec[ii] = next_start
        if num_dt_f_steps_short == 0:
            next_start = 0
        else:
            next_start = simple_s - num_dt_f_steps_short

    return xs, next_start_vec
def ifft(invec, outvec, prec, itype, otype):
    """Inverse-transform ``invec`` into ``outvec`` with a cuFFT plan.

    The plan is obtained from ``_get_inv_plan`` using the two vectors'
    dtypes and the output length. ``prec``, ``itype`` and ``otype`` are
    accepted but not used in this function.
    """
    in_dtype = invec.dtype
    out_dtype = outvec.dtype
    n_out = len(outvec)
    inverse_plan = _get_inv_plan(in_dtype, out_dtype, n_out)
    # Operate on the underlying ``.data`` buffers of both vectors.
    cu_fft.ifft(invec.data, outvec.data, inverse_plan)
def resample_sdbe_to_r2dbe_fft_interp(Xs, interp_kind="nearest"):
    """
    Resample SWARM spectrum product in time-domain at R2DBE rate using
    iFFT and then interpolation in the time-domain.

    Arguments:
    ----------
    Xs -- MxN numpy array in which the zeroth dimension is increasing
    snapshot index, and the first dimension is the positive frequency
    half of the spectrum.
    interp_kind -- Kind of interpolation, either 'nearest' or 'linear'.

    Returns:
    --------
    xs -- The time-domain signal sampled at the R2DBE rate.

    Raises:
    -------
    ValueError -- if interp_kind is neither 'nearest' nor 'linear'.
    """

    # Reject unsupported kinds before touching the device: otherwise no
    # kernel runs and the uninitialised output buffer would be returned.
    if interp_kind not in ('nearest', 'linear'):
        raise ValueError("unsupported interp_kind: %r" % (interp_kind,))

    # timestep sizes for SWARM and R2DBE rates
    dt_s = 1.0/SWARM_RATE
    dt_r = 1.0/R2DBE_RATE

    # cuFFT plan for complex to real DFT, one transform per snapshot
    plan = cu_fft.Plan(SWARM_SAMPLES_PER_WINDOW, complex64, float32,
                       Xs.shape[0])

    # load complex spectrum to device
    x_d = gpuarray.to_gpu(Xs)
    xp_d = gpuarray.empty((Xs.shape[0], Xs.shape[1]+1), dtype=complex64)

    # pad nyquist with zeros
    block = (32, 32, 1)
    grid = (int(ceil(1. * (Xs.shape[1]+1) / block[1])),
            int(ceil(1. * Xs.shape[0] / block[0])))
    fill_padded = mod.get_function("fill_padded")
    fill_padded(int32(Xs.shape[0]), xp_d, int32(Xs.shape[1]+1),
                x_d, int32(Xs.shape[1]),
                block=block, grid=grid)

    # allocate memory for time series
    xf_d = gpuarray.empty((Xs.shape[0], SWARM_SAMPLES_PER_WINDOW), float32)

    # calculate time series, include scaling
    cu_fft.ifft(xp_d, xf_d, plan, scale=True)

    # and interpolate
    xs_size = int(floor(Xs.shape[0]*SWARM_SAMPLES_PER_WINDOW*dt_s/dt_r)) - 1
    TPB = 64                              # threads per block
    nB = int(ceil(1. * xs_size / TPB))    # number of blocks
    xs_d = gpuarray.empty(xs_size, float32)  # decimated time-series

    if interp_kind == 'nearest':
        # the kernel is looked up under the same name as the kind
        nearest_interp = mod.get_function(interp_kind)
        # call kernel
        nearest_interp(xf_d, xs_d, int32(xs_size), float64(dt_r/dt_s),
                       block=(TPB, 1, 1), grid=(nB, 1))
    elif interp_kind == 'linear':
        # compile kernel
        linear_interp = mod.get_function("copy_texture_kernel")
        # get texture reference
        a_texref = mod.get_texref("a_tex")
        a_texref.set_filter_mode(drv.filter_mode.LINEAR)  # linear
        # a_texref.set_filter_mode(drv.filter_mode.POINT)  # nearest-neighbor
        # move time series to texture reference
        # following http://lists.tiker.net/pipermail/pycuda/2009-November/001916.html
        descr = drv.ArrayDescriptor()
        descr.format = drv.array_format.FLOAT
        descr.height = Xs.shape[0]
        descr.width = SWARM_SAMPLES_PER_WINDOW
        descr.num_channels = 1
        # row pitch in bytes: 4 bytes per float32 sample
        a_texref.set_address_2d(xf_d.gpudata, descr,
                                SWARM_SAMPLES_PER_WINDOW*4)
        # set up linear interpolation over texture
        linear_interp(xs_d, int32(xs_size), float64(dt_r/dt_s),
                      int32(SWARM_SAMPLES_PER_WINDOW),
                      texrefs=[a_texref], block=(TPB, 1, 1), grid=(nB, 1))

    return xs_d.get()
# Self-test 1: batched real-to-complex FFT followed by the scaled
# complex-to-real inverse, compared against NumPy's round trip.
# The single-argument print(...) form behaves identically on Python 2 and 3.
print('Testing fft/ifft..')
N = 4096*16
batch_size = 16

x = np.asarray(np.random.rand(batch_size, N), np.float32)
xf = np.fft.fft(x)
y = np.real(np.fft.ifft(xf))

x_gpu = gpuarray.to_gpu(x)
# Half-spectrum length; floor division keeps the dimension an integer under
# true division.
xf_gpu = gpuarray.empty((batch_size, N//2+1), np.complex64)
plan_forward = cu_fft.Plan(N, np.float32, np.complex64, batch_size)
cu_fft.fft(x_gpu, xf_gpu, plan_forward)

y_gpu = gpuarray.empty_like(x_gpu)
plan_inverse = cu_fft.Plan(N, np.complex64, np.float32, batch_size)
cu_fft.ifft(xf_gpu, y_gpu, plan_inverse, True)

# '%s %s' reproduces the output of the former two-item print statement.
print('%s %s' % ('Success status: ', np.allclose(y, y_gpu.get(), atol=1e-6)))

# Self-test 2: in-place complex-to-complex FFT and scaled inverse; the
# buffer should end up equal to the original input.
print('Testing in-place fft..')
x = np.asarray(np.random.rand(batch_size, N) +
               1j*np.random.rand(batch_size, N), np.complex64)
x_gpu = gpuarray.to_gpu(x)

plan = cu_fft.Plan(N, np.complex64, np.complex64, batch_size)
cu_fft.fft(x_gpu, x_gpu, plan)

cu_fft.ifft(x_gpu, x_gpu, plan, True)

print('%s %s' % ('Success status: ', np.allclose(x, x_gpu.get(), atol=1e-6)))
def ifft(invec, outvec, prec, itype, otype):
    """Inverse-transform ``invec`` into ``outvec`` with a cuFFT plan.

    The plan is obtained from ``_get_inv_plan`` using the two vectors'
    dtypes and the output length. ``prec``, ``itype`` and ``otype`` are
    accepted but not used in this function.
    """
    in_dtype = invec.dtype
    out_dtype = outvec.dtype
    n_out = len(outvec)
    inverse_plan = _get_inv_plan(in_dtype, out_dtype, n_out)
    # Operate on the underlying ``.data`` buffers of both vectors.
    cu_fft.ifft(invec.data, outvec.data, inverse_plan)
def fft_resample(x, W, new_len, npad, to_remove, cuda_dict=None):
    """Do FFT resampling with a filter function (possibly using CUDA)

    Parameters
    ----------
    x : 1-d array
        The array to resample.
    W : 1-d array or gpuarray
        The filtering function to apply.
    new_len : int
        The size of the output array (before removing padding).
    npad : int
        Amount of padding to apply before resampling.
    to_remove : int
        Number of samples to remove after resampling.
    cuda_dict : dict | None
        Dictionary constructed using setup_cuda_multiply_repeated().
        ``None`` (the default) is equivalent to ``dict(use_cuda=False)``,
        i.e. the CPU code path.

    Returns
    -------
    y : 1-d array
        Filtered and resampled version of x, with ``to_remove`` samples
        dropped from each end when ``to_remove > 0``.
    """
    # A None sentinel replaces the former shared mutable default dict.
    if cuda_dict is None:
        cuda_dict = dict(use_cuda=False)

    # add some padding at beginning and end to make this work a little cleaner
    x = _smart_pad(x, npad)
    old_len = len(x)
    shorter = new_len < old_len
    if not cuda_dict["use_cuda"]:
        # Copy the lowest N frequency bins (positive half, then negative
        # half) into a spectrum of the new length.
        N = int(min(new_len, old_len))
        sl_1 = slice((N + 1) // 2)
        y_fft = np.zeros(new_len, np.complex128)
        x_fft = fft(x).ravel() * W
        y_fft[sl_1] = x_fft[sl_1]
        sl_2 = slice(-(N - 1) // 2, None)
        y_fft[sl_2] = x_fft[sl_2]
        y = np.real(ifft(y_fft, overwrite_x=True)).ravel()
    else:
        # Zero-extend x on the host when upsampling, then upload.
        cuda_dict["x"].set(
            np.concatenate((x, np.zeros(max(new_len - old_len, 0), x.dtype))))
        # do the fourier-domain operations, results put in second param
        cudafft.fft(cuda_dict["x"], cuda_dict["x_fft"], cuda_dict["fft_plan"])
        cuda_multiply_inplace_c128(W, cuda_dict["x_fft"])
        # This is not straightforward, but because x_fft and y_fft share
        # the same data (and only one half of the full DFT is stored), we
        # don't have to transfer the slice like we do in scipy. All we
        # need to worry about is the Nyquist component, either halving it
        # or taking just the real component...
        use_len = new_len if shorter else old_len
        func = cuda_real_c128 if shorter else cuda_halve_c128
        if use_len % 2 == 0:
            # use_len is even here, so the Nyquist bin is exactly use_len / 2
            nyq = int(use_len // 2)
            func(cuda_dict["x_fft"], slice=slice(nyq, nyq + 1))
        cudafft.ifft(cuda_dict["x_fft"], cuda_dict["x"],
                     cuda_dict["ifft_plan"], scale=False)
        y = cuda_dict["x"].get()[: new_len if shorter else None]

    # now let's trim it back to the correct size (if there was padding)
    if to_remove > 0:
        keep = np.ones((new_len), dtype="bool")
        keep[:to_remove] = False
        keep[-to_remove:] = False
        y = np.compress(keep, y)

    return y