def thunk():
    """Run a batched real-to-complex forward FFT on the node's input.

    The result is written into ``outputs[0][0]``; the output buffer and
    the FFT plan are reused between calls while the input shape stays
    the same.
    """
    source = inputs[0][0]
    in_shape = source.shape

    # The DFT of a real signal is symmetric, so only the first
    # ``n // 2 + 1`` coefficients of the last axis are stored. A trailing
    # axis of length 2 holds the real/imag parts.
    out_shape = tuple(in_shape[:-1]) + (in_shape[-1] // 2 + 1, 2)

    out_cell = outputs[0]
    # Reuse the previous allocation when it already has the right size.
    needs_alloc = out_cell[0] is None or out_cell[0].shape != out_shape
    if needs_alloc:
        out_cell[0] = CudaNdarray.zeros(out_shape)

    gpu_in = to_gpuarray(source)
    # The output array is not retyped to complex64: per the original
    # author's note, scikits.cuda.fft treats the buffer as complex64
    # regardless.
    gpu_out = to_gpuarray(out_cell[0])

    # (Re)build the plan only when the input shape changed.
    plan_is_stale = plan[0] is None or plan_input_shape[0] != in_shape
    if plan_is_stale:
        plan_input_shape[0] = in_shape
        plan[0] = fft.Plan(in_shape[1:], np.float32, np.complex64,
                           batch=in_shape[0])

    fft.fft(gpu_in, gpu_out, plan[0])
def init_cuda(ignore_config=False):
    """Initialize CUDA functionality

    This function attempts to load the necessary interfaces
    (hardware connectivity) to run CUDA-based filtering. This
    function should only need to be run once per session.

    If the config var (set via mne.set_config or in ENV)
    MNE_USE_CUDA == 'true', this function will be executed when
    the first CUDA setup is performed. If this variable is not
    set, this function can be manually executed.

    Parameters
    ----------
    ignore_config : bool
        If True, skip the MNE_USE_CUDA config check and always attempt
        the initialization.
    """
    global _cuda_capable, _multiply_inplace_c128, _halve_c128, _real_c128
    if _cuda_capable:
        return
    if not ignore_config and (get_config('MNE_USE_CUDA', 'false').lower()
                              != 'true'):
        logger.info('CUDA not enabled in config, skipping initialization')
        return
    # Triage possible errors for informative messaging
    _cuda_capable = False
    try:
        from pycuda import gpuarray, driver  # noqa
        from pycuda.elementwise import ElementwiseKernel
    except ImportError:
        logger.warning('module pycuda not found, CUDA not enabled')
        return
    try:
        # Initialize CUDA; happens with importing autoinit
        import pycuda.autoinit  # noqa
    except ImportError:
        logger.warning('pycuda.autoinit could not be imported, likely '
                       'a hardware error, CUDA not enabled')
        return
    # Make sure scikits.cuda is installed
    try:
        from scikits.cuda import fft as cudafft
    except ImportError:
        logger.warning('module scikits.cuda not found, CUDA not '
                       'enabled')
        return

    # let's construct our own CUDA multiply in-place function
    _multiply_inplace_c128 = ElementwiseKernel(
        'pycuda::complex<double> *a, pycuda::complex<double> *b',
        'b[i] *= a[i]', 'multiply_inplace')
    _halve_c128 = ElementwiseKernel('pycuda::complex<double> *a',
                                    'a[i] /= 2.0', 'halve_value')
    _real_c128 = ElementwiseKernel('pycuda::complex<double> *a',
                                   'a[i] = real(a[i])', 'real_value')

    # Make sure we can use 64-bit FFTs
    try:
        cudafft.Plan(16, np.float64, np.complex128)  # will get auto-GC'ed
    except Exception:
        # ``Exception`` rather than a bare ``except`` so that
        # KeyboardInterrupt / SystemExit are not swallowed here.
        logger.warning('Device does not support 64-bit FFTs, '
                       'CUDA not enabled')
        return
    _cuda_capable = True
    # Figure out limit for CUDA FFT calculations
    logger.info('Enabling CUDA with %s available memory' % get_cuda_memory())
def thunk():
    """Run a batched complex-to-complex forward FFT on the node's input.

    The result is written into ``outputs[0][0]``; the output buffer and
    the FFT plan are reused between calls while the input shape stays
    the same.
    """
    source = inputs[0][0]
    in_shape = source.shape
    # The output has exactly the input's shape.
    out_shape = tuple(in_shape)

    out_cell = outputs[0]
    # Reuse the previous allocation when it already has the right size.
    needs_alloc = out_cell[0] is None or out_cell[0].shape != out_shape
    if needs_alloc:
        out_cell[0] = CudaNdarray.zeros(out_shape)

    gpu_in = to_gpuarray(source)
    # The output array is not retyped to complex64: per the original
    # author's note, scikits.cuda.fft treats the buffer as complex64
    # regardless.
    gpu_out = to_gpuarray(out_cell[0])

    # (Re)build the plan only when the input shape changed.
    plan_is_stale = plan[0] is None or plan_input_shape[0] != in_shape
    if plan_is_stale:
        plan_input_shape[0] = in_shape
        # The transform shape excludes the leading batch axis and the
        # trailing complex axis.
        plan[0] = fft.Plan(shape=in_shape[1:-1],
                           in_dtype=np.complex64,
                           out_dtype=np.complex64,
                           batch=in_shape[0])

    fft.fft(gpu_in, gpu_out, plan[0])
def init(self):
    """Lazily create the shared CUDA context and per-shape GPU resources.

    On first use this stores ``pycuda.autoinit.context`` on the class.
    For each new ``self.shape`` it then creates, under the class-level
    ``initsem`` guard, the ``multconj`` elementwise kernel, two
    complex128 work buffers and a complex128 -> complex128 FFT plan, all
    stored on the class and keyed by shape.

    The context pushed for the allocations is always popped again, even
    when an allocation or the plan creation raises.
    """
    cls = self.__class__

    if self.ctx is None:
        with cls.initsem:
            # Re-check under the guard: another caller may have finished
            # the initialization while we were waiting.
            if self.ctx is None:
                import pycuda
                if "autoinit" not in dir(pycuda):
                    import pycuda.autoinit
                cls.ctx = pycuda.autoinit.context

    if self.shape not in self.plans:
        with cls.initsem:
            if self.shape not in self.plans:
                self.ctx.push()
                try:
                    if not cls.multconj:
                        # Import here so the name is bound even when the
                        # context branch above was skipped.
                        import pycuda.elementwise
                        cls.multconj = pycuda.elementwise.ElementwiseKernel(
                            "pycuda::complex<double> *a, pycuda::complex<double> *b",
                            "a[i]*=conj(b[i])")
                    if self.shape not in cls.data1_gpus:
                        cls.data1_gpus[self.shape] = gpuarray.empty(
                            self.shape, numpy.complex128)
                    if self.shape not in cls.data2_gpus:
                        cls.data2_gpus[self.shape] = gpuarray.empty(
                            self.shape, numpy.complex128)
                    if self.shape not in cls.plans:
                        cls.plans[self.shape] = cu_fft.Plan(
                            self.shape, numpy.complex128, numpy.complex128)
                    self.ctx.synchronize()
                finally:
                    # Never leave the context pushed on this thread.
                    self.ctx.pop()
def thunk():
    """Run a batched complex-to-complex forward FFT and mark it computed.

    The result is written into ``outputs[0][0]``; the output buffer and
    the FFT plan are reused between calls while the input shape stays
    the same. Finally the node's first output is flagged in
    ``compute_map``.
    """
    source = inputs[0][0]
    in_shape = source.shape

    out_cell = outputs[0]
    # The output has the input's shape; reuse the previous allocation
    # when it already has the right size.
    needs_alloc = out_cell[0] is None or out_cell[0].shape != in_shape
    if needs_alloc:
        out_cell[0] = CudaNdarray.zeros(in_shape)

    gpu_in = to_gpuarray(source)
    # The output array is not retyped to complex64: per the original
    # author's note, scikits.cuda.fft treats the buffer as complex64
    # regardless.
    gpu_out = to_gpuarray(out_cell[0])

    # (Re)build the plan only when the input shape changed.
    plan_is_stale = plan[0] is None or plan_input_shape[0] != in_shape
    if plan_is_stale:
        plan_input_shape[0] = in_shape
        plan[0] = fft.Plan(in_shape[1:-1], np.complex64, np.complex64,
                           batch=in_shape[0])

    fft.fft(gpu_in, gpu_out, plan[0])
    compute_map[node.outputs[0]][0] = True
def thunk():
    """Run a batched complex-to-real inverse FFT on the node's input.

    The result is written into ``outputs[0][0]``; the output buffer and
    the FFT plan are reused between calls while the input shape stays
    the same.
    """
    source = inputs[0][0]
    in_shape = source.shape

    # Drop the trailing length-2 real/imag axis, then restore the full
    # signal length on what is now the last axis.
    out_shape = tuple(in_shape[:-2]) + ((in_shape[-2] - 1) * 2,)

    out_cell = outputs[0]
    # Reuse the previous allocation when it already has the right size.
    needs_alloc = out_cell[0] is None or out_cell[0].shape != out_shape
    if needs_alloc:
        out_cell[0] = CudaNdarray.zeros(out_shape)

    # The input is a float32 array with an extra dimension; per the
    # original author's note scikits.cuda interprets it as complex64.
    gpu_in = to_gpuarray(source)
    gpu_out = to_gpuarray(out_cell[0])

    # (Re)build the plan only when the input shape changed. The plan
    # itself is sized from the output shape.
    plan_is_stale = plan[0] is None or plan_input_shape[0] != in_shape
    if plan_is_stale:
        plan_input_shape[0] = in_shape
        plan[0] = fft.Plan(out_shape[1:], np.complex64, np.float32,
                           batch=out_shape[0])

    fft.ifft(gpu_in, gpu_out, plan[0])
def thunk():
    """Run a batched complex-to-complex inverse FFT and mark it computed.

    The result is written into ``outputs[0][0]``; the output buffer and
    the FFT plan are reused between calls while the input shape stays
    the same. Finally the node's first output is flagged in
    ``compute_map``.
    """
    source = inputs[0][0]
    in_shape = source.shape

    out_cell = outputs[0]
    # The output has the input's shape; reuse the previous allocation
    # when it already has the right size.
    needs_alloc = out_cell[0] is None or out_cell[0].shape != in_shape
    if needs_alloc:
        out_cell[0] = CudaNdarray.zeros(in_shape)

    # The input is a float32 array with an extra dimension; per the
    # original author's note scikits.cuda interprets it as complex64.
    gpu_in = to_gpuarray(source)
    gpu_out = to_gpuarray(out_cell[0])

    # (Re)build the plan only when the input shape changed.
    plan_is_stale = plan[0] is None or plan_input_shape[0] != in_shape
    if plan_is_stale:
        plan_input_shape[0] = in_shape
        plan[0] = fft.Plan(in_shape[1:-1], np.complex64, np.complex64,
                           batch=in_shape[0])

    fft.ifft(gpu_in, gpu_out, plan[0])
    compute_map[node.outputs[0]][0] = True
def get_plan(self, cache, *args):
    """Return an FFT plan for ``args``, consulting ``self.plan_cache``.

    A plan already stored under ``args`` is returned as is. Otherwise a
    new ``cu_fft.Plan(*args)`` is built and, when ``cache`` is truthy,
    remembered under ``args`` for later calls.
    """
    if args in self.plan_cache:
        return self.plan_cache[args]

    fresh_plan = cu_fft.Plan(*args)
    if cache:
        self.plan_cache[args] = fresh_plan
    return fresh_plan
def test_batch_fft_float64_to_complex128_1d(self):
    """Batched 1-D float64 -> complex128 FFT matches ``np.fft.rfft``."""
    x = np.asarray(np.random.rand(self.B, self.N), np.float64)
    xf = np.fft.rfft(x, axis=1)
    x_gpu = gpuarray.to_gpu(x)
    # Floor division keeps the shape an integer under true division too.
    xf_gpu = gpuarray.empty((self.B, self.N // 2 + 1), np.complex128)
    plan = fft.Plan(x.shape[1], np.float64, np.complex128, batch=self.B)
    fft.fft(x_gpu, xf_gpu, plan)
    assert np.allclose(xf, xf_gpu.get(), atol=atol_float64)
def test_fft_float64_to_complex128_2d(self):
    """2-D float64 -> complex128 FFT matches ``np.fft.rfftn``."""
    x = np.asarray(np.random.rand(self.N, self.M), np.float64)
    xf = np.fft.rfftn(x)
    x_gpu = gpuarray.to_gpu(x)
    # Floor division keeps the shape an integer under true division too.
    xf_gpu = gpuarray.empty((self.N, self.M // 2 + 1), np.complex128)
    plan = fft.Plan(x.shape, np.float64, np.complex128)
    fft.fft(x_gpu, xf_gpu, plan)
    assert np.allclose(xf, xf_gpu.get(), atol=atol_float64)
def test_fft_float32_to_complex64_1d(self):
    """1-D float32 -> complex64 FFT matches ``np.fft.rfftn``."""
    x = np.asarray(np.random.rand(self.N), np.float32)
    xf = np.fft.rfftn(x)
    x_gpu = gpuarray.to_gpu(x)
    # Floor division keeps the length an integer under true division too.
    xf_gpu = gpuarray.empty(self.N // 2 + 1, np.complex64)
    plan = fft.Plan(x.shape, np.float32, np.complex64)
    fft.fft(x_gpu, xf_gpu, plan)
    assert np.allclose(xf, xf_gpu.get(), atol=atol_float32)
def test_multiple_streams(self):
    """Two FFTs with plans bound to different streams both match NumPy."""
    x = np.asarray(np.random.rand(self.N), np.float32)
    xf = np.fft.rfftn(x)
    y = np.asarray(np.random.rand(self.N), np.float32)
    yf = np.fft.rfftn(y)
    x_gpu = gpuarray.to_gpu(x)
    y_gpu = gpuarray.to_gpu(y)
    # Floor division keeps the lengths integers under true division too.
    xf_gpu = gpuarray.empty(self.N // 2 + 1, np.complex64)
    yf_gpu = gpuarray.empty(self.N // 2 + 1, np.complex64)
    stream0 = drv.Stream()
    stream1 = drv.Stream()
    plan1 = fft.Plan(x.shape, np.float32, np.complex64, stream=stream0)
    plan2 = fft.Plan(y.shape, np.float32, np.complex64, stream=stream1)
    fft.fft(x_gpu, xf_gpu, plan1)
    fft.fft(y_gpu, yf_gpu, plan2)
    assert np.allclose(xf, xf_gpu.get(), atol=atol_float32)
    assert np.allclose(yf, yf_gpu.get(), atol=atol_float32)
def test_ifft_complex128_to_float64(self):
    """Scaled complex128 -> float64 inverse FFT recovers the signal."""
    x = np.asarray(np.random.rand(self.N), np.float64)
    xf = np.asarray(np.fft.fft(x), np.complex128)
    # Only the first N // 2 + 1 coefficients are uploaded. Floor division
    # keeps the slice bound an integer under true division too.
    xf_gpu = gpuarray.to_gpu(xf[0:self.N // 2 + 1])
    x_gpu = gpuarray.empty(self.N, np.float64)
    plan = fft.Plan(x.shape, np.complex128, np.float64)
    fft.ifft(xf_gpu, x_gpu, plan, True)
    assert np.allclose(x, x_gpu.get(), atol=atol_float64)
def test_ifft_complex64_to_float32_1d(self):
    """Scaled 1-D complex64 -> float32 inverse FFT recovers the signal."""
    signal = np.asarray(np.random.rand(self.N), np.float32)
    spectrum = np.asarray(np.fft.rfftn(signal), np.complex64)

    spectrum_gpu = gpuarray.to_gpu(spectrum)
    signal_gpu = gpuarray.empty(self.N, np.float32)

    c2r_plan = fft.Plan(signal.shape, np.complex64, np.float32)
    # The trailing True asks ifft to scale its result.
    fft.ifft(spectrum_gpu, signal_gpu, c2r_plan, True)

    recovered = signal_gpu.get()
    assert np.allclose(signal, recovered, atol=atol_float32)
def _get_fwd_plan(itype, otype, inlen):
    """Return the forward plan for ``(itype, otype, inlen)``.

    Plans are memoised in the module-level ``_forward_plans`` mapping; a
    missing entry is created as a 1-D ``cu_fft.Plan`` of length ``inlen``
    and stored before being returned.
    """
    cache_key = (itype, otype, inlen)
    try:
        return _forward_plans[cache_key]
    except KeyError:
        new_plan = cu_fft.Plan((inlen, ), itype, otype)
        _forward_plans[cache_key] = new_plan
        return new_plan
def _get_inv_plan(itype, otype, outlen):
    """Return the inverse plan for ``(itype, otype, outlen)``.

    Plans are memoised in the module-level ``_reverse_plans`` mapping; a
    missing entry is created as a 1-D ``cu_fft.Plan`` of length
    ``outlen`` and stored before being returned.
    """
    cache_key = (itype, otype, outlen)
    try:
        return _reverse_plans[cache_key]
    except KeyError:
        new_plan = cu_fft.Plan((outlen, ), itype, otype)
        _reverse_plans[cache_key] = new_plan
        return new_plan
def test_batch_fft_float32_to_complex64_2d(self):
    """Batched 2-D float32 -> complex64 FFT matches ``np.fft.rfftn``."""
    x = np.asarray(np.random.rand(self.B, self.N, self.M), np.float32)
    xf = np.fft.rfftn(x, axes=(1, 2))
    x_gpu = gpuarray.to_gpu(x)
    # Floor division keeps the shape an integer under true division too.
    xf_gpu = gpuarray.empty((self.B, self.N, self.M // 2 + 1), np.complex64)
    plan = fft.Plan([self.N, self.M], np.float32, np.complex64,
                    batch=self.B)
    fft.fft(x_gpu, xf_gpu, plan)
    assert np.allclose(xf, xf_gpu.get(), atol=atol_float32)
def test_batch_ifft_complex128_to_float64_1d(self):
    """Scaled batched 1-D complex128 -> float64 inverse FFT round-trips."""
    signal = np.asarray(np.random.rand(self.B, self.N), np.float64)
    spectrum = np.asarray(np.fft.rfft(signal, axis=1), np.complex128)

    # The spectrum is made C-contiguous before it is handed to
    # gpuarray.to_gpu.
    spectrum_gpu = gpuarray.to_gpu(np.ascontiguousarray(spectrum))
    signal_gpu = gpuarray.empty((self.B, self.N), np.float64)

    c2r_plan = fft.Plan(signal.shape[1], np.complex128, np.float64,
                        batch=self.B)
    # The trailing True asks ifft to scale its result.
    fft.ifft(spectrum_gpu, signal_gpu, c2r_plan, True)

    recovered = signal_gpu.get()
    assert np.allclose(signal, recovered, atol=atol_float64)
def test_ifft_complex64_to_float32_2d(self):
    """Scaled 2-D complex64 -> float32 inverse FFT round-trips."""
    signal = np.asarray(np.random.rand(self.N, self.M), np.float32)
    spectrum = np.asarray(np.fft.rfftn(signal), np.complex64)

    # The spectrum is made C-contiguous before it is handed to
    # gpuarray.to_gpu.
    spectrum_gpu = gpuarray.to_gpu(np.ascontiguousarray(spectrum))
    signal_gpu = gpuarray.empty((self.N, self.M), np.float32)

    c2r_plan = fft.Plan(signal.shape, np.complex64, np.float32)
    # The trailing True asks ifft to scale its result.
    fft.ifft(spectrum_gpu, signal_gpu, c2r_plan, True)

    recovered = signal_gpu.get()
    assert np.allclose(signal, recovered, atol=atol_float32)
# Start time start_time = time.time() # Validate image format N, M, D = image.shape assert D == 3, "Error: input must be 3-channel RGB image" print "Processing %d x %d RGB image" % (M, N) ### Compile and initialize CUDA kernels and FFT plans mtf_kernel = cuda_compile(mtf_kernel_source, "mtf_kernel") hv_kernel = cuda_compile(hv_kernel_source, "hv_kernel") Sa_kernel = cuda_compile(Sa_kernel_source, "Sa_kernel") Sb_kernel = cuda_compile(Sb_kernel_source, "Sb_kernel") d_kernel = cuda_compile(d_kernel_source, "d_kernel") merge_r_kernel = cuda_compile(merge_r_kernel_source, "merge_r_kernel") plan = cu_fft.Plan((N, M), np.complex64, np.complex64) ### CUDA kernel settings Nx, Ny = np.int32(M), np.int32(N) x_tpb = 32 y_tpb = 16 x_blocks = int(np.ceil(Nx * 1.0 / x_tpb)) y_blocks = int(np.ceil(Ny * 1.0 / y_tpb)) blocksize = (x_tpb, y_tpb, 1) gridsize = (x_blocks, y_blocks) # Initialize S with I and normalize RGB values S = np.float32(image) / 256 ### Allocate memory on GPU
settings = dict([]) # setup filename settings['vfile'] = example settings['imsize'] = np.int32(ISIZE) # number of image pixels # 1 degree viewfield, 1*3.1415926535/180*3600 = settings['cell'] = np.float32(3600. / ISIZE) # pixel size in arcseconds (rad ? degree?) settings['briggs'] = np.float32(1e7) # weight parameter ## make cuFFT plan #improvable# imsize = settings['imsize'] # nx - 2 imsize, it means 2048 when imsize=1024 nx = np.int32(2 * imsize) # create fft plan nx*nx plan = fft.Plan((np.int(nx), np.int(nx)), np.complex64, np.complex64) ## Create the PSF & dirty image # dpsf - PSF, gpu_im ( dirty image) # dpsf is computed by CPU, gpu_im is in the GPU dpsf, gpu_im = cuda_gridvis(settings, plan) gpu_dpsf = gpu.to_gpu(dpsf) if PLOTME: dirty = np.roll(np.fliplr(gpu_im.get()), 1, axis=1) ## Clean the PSF cpsf = serial_clean_beam(dpsf, imsize / 50.) gpu_cpsf = gpu.to_gpu(cpsf) if PLOTME:
def resample_sdbe_to_r2dbe_zpfft(Xs):
    """
    Resample SWARM spectrum product in time-domain at R2DBE rate using
    zero-padding and a radix-2 iFFT algorithm.

    Arguments:
    ----------
    Xs -- MxN numpy array in which the zeroth dimension is increasing
    snapshot index, and the first dimension is the positive frequency
    half of the spectrum.

    Returns:
    --------
    xs -- The time-domain signal sampled at the R2DBE rate.
    next_start_vec -- Start indices for each FFT window.
    """
    # timestep sizes for SWARM and R2DBE rates
    dt_s = 1.0 / SWARM_RATE
    dt_r = 1.0 / R2DBE_RATE

    # we need to oversample by factor 64 and then undersample by factor 39
    simple_r = 64  # 4096
    simple_s = 39  # 2496
    fft_window_oversample = 2 * SWARM_CHANNELS * simple_r  # 2* due to real FFT
    # number of complex bins fed to the complex-to-real transform; floor
    # division keeps this an integer under true division as well
    n_padded_bins = fft_window_oversample // 2 + 1

    # oversample timestep size
    dt_f = dt_s / simple_r

    # the timespan of one SWARM FFT window
    T_s = dt_s * SWARM_SAMPLES_PER_WINDOW

    # time vector over one SWARM FFT window in oversampled steps
    t_f = arange(0, T_s, dt_f)

    # offset in oversampled time series that corresponds to one dt_r step
    # from the last R2DBE rate sample in the previous window
    next_start = 0

    # total number of time series samples
    N_x = int(ceil(Xs.shape[0] * SWARM_SAMPLES_PER_WINDOW * dt_s / dt_r))
    # and initialize the output
    xs = zeros(N_x, dtype=float32)
    next_start_vec = zeros(Xs.shape[0])
    # index in output where samples from next window are stored
    start_output = 0

    # cuFFT plan for complex to real DFT
    plan = cu_fft.Plan(fft_window_oversample, complex64, float32)
    # padding kernel
    fill_padded = mod.get_function("fill_padded")
    # downsampling kernel
    downsample = mod.get_function("downsample")
    # FFT scaling kernel
    scale = ElementwiseKernel(
        "float *a",
        "a[i] = {0} * a[i]".format(1. / fft_window_oversample),
        "scale")

    # max size of resampled chunk from a single window
    xs_chunk_size_max = int32(ceil((1. * fft_window_oversample) / simple_s))

    # create memory on device for cuFFT
    xf_d = gpuarray.empty(fft_window_oversample, dtype=float32)
    # allocated as zeros once; only the leading bins are rewritten per window
    xp_d = gpuarray.zeros(n_padded_bins, dtype=complex64)
    y_d = gpuarray.empty(xs_chunk_size_max, dtype=float32)

    for ii in range(Xs.shape[0]):
        # move window to device
        x_d = gpuarray.to_gpu(Xs[ii, :])

        # threads per block
        # number of blocks (keep the array as zeros to save time)
        TPB = 1024
        nB = int(ceil(1. * Xs.shape[1] / TPB))

        # pad with zeros to oversample by 64
        fill_padded(int32(1), xp_d, int32(n_padded_bins),
                    x_d, int32(Xs.shape[1]),
                    block=(TPB, 1, 1), grid=(nB, 1))

        # iFFT
        cu_fft.ifft(xp_d, xf_d, plan, scale=False)

        xs_chunk_size = int32(
            ceil((1. * fft_window_oversample - next_start) / simple_s))

        # threads per block
        TPB = 64
        # number of blocks
        nB = ceil(1. * xs_chunk_size / TPB).astype(int)

        ## undersample by 39 to correct rate, and start at the correct
        ## offset in this window
        downsample(xf_d, int32(fft_window_oversample),
                   y_d, xs_chunk_size, int32(next_start), int32(simple_s),
                   block=(TPB, 1, 1), grid=(nB, 1))

        # rescale from ifft using ElementwiseKernel
        scale(y_d)

        # pull data back onto host
        xs_chunk = y_d.get()

        # fill output numpy array
        stop_output = start_output + xs_chunk_size
        xs[start_output:stop_output] = xs_chunk[:xs_chunk_size]
        # update the starting index in the output array
        start_output = stop_output

        # mark the time of the last used sample relative to the start
        # of this window (stride is the undersampling factor)
        time_window_start_to_last_used_sample = t_f[next_start::simple_s][-1]
        # calculate the remaining time in this window
        time_remaining_in_window = T_s - time_window_start_to_last_used_sample
        # convert to the equivalent number of oversample timesteps; force
        # an int because the value feeds a slice start on the next pass
        num_dt_f_steps_short = int(round(time_remaining_in_window / dt_f))

        next_start_vec[ii] = next_start
        if num_dt_f_steps_short == 0:
            next_start = 0
        else:
            next_start = simple_s - num_dt_f_steps_short

    return xs, next_start_vec
def resample_sdbe_to_r2dbe_fft_interp(Xs, interp_kind="nearest"):
    """
    Resample SWARM spectrum product in time-domain at R2DBE rate using
    iFFT and then interpolation in the time-domain.

    Arguments:
    ----------
    Xs -- MxN numpy array in which the zeroth dimension is increasing
    snapshot index, and the first dimension is the positive frequency
    half of the spectrum.
    interp_kind -- Kind of interpolation, either 'nearest' or 'linear'.

    Returns:
    --------
    xs -- The time-domain signal sampled at the R2DBE rate.

    Raises:
    -------
    ValueError -- if interp_kind is not one of the supported kinds.
    """
    # Reject unknown kinds up front: otherwise no kernel would run and the
    # uninitialised output buffer would be returned as if it were data.
    if interp_kind not in ('nearest', 'linear'):
        raise ValueError("interp_kind must be 'nearest' or 'linear', got %r"
                         % (interp_kind,))

    # timestep sizes for SWARM and R2DBE rates
    dt_s = 1.0 / SWARM_RATE
    dt_r = 1.0 / R2DBE_RATE

    # cuFFT plan for complex to real DFT
    plan = cu_fft.Plan(SWARM_SAMPLES_PER_WINDOW, complex64, float32,
                       Xs.shape[0])

    # load complex spectrum to device
    x_d = gpuarray.to_gpu(Xs)
    xp_d = gpuarray.empty((Xs.shape[0], Xs.shape[1] + 1), dtype=complex64)

    # pad nyquist with zeros
    block = (32, 32, 1)
    grid = (int(ceil(1. * (Xs.shape[1] + 1) / block[1])),
            int(ceil(1. * Xs.shape[0] / block[0])))
    fill_padded = mod.get_function("fill_padded")
    fill_padded(int32(Xs.shape[0]), xp_d, int32(Xs.shape[1] + 1),
                x_d, int32(Xs.shape[1]),
                block=block, grid=grid)

    # allocate memory for time series
    xf_d = gpuarray.empty((Xs.shape[0], SWARM_SAMPLES_PER_WINDOW), float32)

    # calculate time series, include scaling
    cu_fft.ifft(xp_d, xf_d, plan, scale=True)

    # and interpolate
    xs_size = int(floor(
        Xs.shape[0] * SWARM_SAMPLES_PER_WINDOW * dt_s / dt_r)) - 1
    TPB = 64  # threads per block
    nB = int(ceil(1. * xs_size / TPB))  # number of blocks
    xs_d = gpuarray.empty(xs_size, float32)  # decimated time-series

    if interp_kind == 'nearest':
        # the kernel is looked up under the same name as the kind
        nearest_interp = mod.get_function(interp_kind)
        # call kernel
        nearest_interp(xf_d, xs_d, int32(xs_size), float64(dt_r / dt_s),
                       block=(TPB, 1, 1), grid=(nB, 1))
    else:  # 'linear'
        # compile kernel
        linear_interp = mod.get_function("copy_texture_kernel")
        # get texture reference
        a_texref = mod.get_texref("a_tex")
        a_texref.set_filter_mode(drv.filter_mode.LINEAR)  # linear
        # move time series to texture reference
        # following http://lists.tiker.net/pipermail/pycuda/2009-November/001916.html
        descr = drv.ArrayDescriptor()
        descr.format = drv.array_format.FLOAT
        descr.height = Xs.shape[0]
        descr.width = SWARM_SAMPLES_PER_WINDOW
        descr.num_channels = 1
        # row pitch in bytes: 4 bytes per float32 sample
        a_texref.set_address_2d(xf_d.gpudata, descr,
                                SWARM_SAMPLES_PER_WINDOW * 4)
        # set up linear interpolation over texture
        linear_interp(xs_d, int32(xs_size), float64(dt_r / dt_s),
                      int32(SWARM_SAMPLES_PER_WINDOW),
                      texrefs=[a_texref], block=(TPB, 1, 1), grid=(nB, 1))

    return xs_d.get()
def setup_cuda_fft_multiply_repeated(n_jobs, h_fft):
    """Set up repeated CUDA FFT multiplication with a given filter

    Parameters
    ----------
    n_jobs : int | str
        If n_jobs == 'cuda', the function will attempt to set up for CUDA
        FFT multiplication.
    h_fft : array
        The filtering function that will be used repeatedly.
        If n_jobs='cuda', this function will be shortened (since CUDA
        assumes FFTs of real signals are half the length of the signal)
        and turned into a gpuarray.

    Returns
    -------
    n_jobs : int
        Sets n_jobs = 1 if n_jobs == 'cuda' was passed in, otherwise
        original n_jobs is passed.
    cuda_dict : dict
        Dictionary with the following CUDA-related variables:
            use_cuda : bool
                Whether CUDA should be used.
            fft_plan : instance of FFTPlan
                FFT plan to use in calculating the FFT.
            ifft_plan : instance of FFTPlan
                FFT plan to use in calculating the IFFT.
            x_fft : instance of gpuarray
                Empty allocated GPU space for storing the result of the
                frequency-domain multiplication.
            x : instance of gpuarray
                Empty allocated GPU space for the data to filter.
    h_fft : array | instance of gpuarray
        This will either be a gpuarray (if CUDA enabled) or np.ndarray.
        If CUDA is enabled, h_fft will be modified appropriately for use
        with filter.fft_multiply(). If the CUDA setup fails, the input
        h_fft is returned unchanged.

    Notes
    -----
    This function is designed to be used with fft_multiply_repeated().
    """
    cuda_dict = dict(use_cuda=False, fft_plan=None, ifft_plan=None,
                     x_fft=None, x=None)
    n_fft = len(h_fft)
    cuda_fft_len = int((n_fft - (n_fft % 2)) // 2 + 1)
    if n_jobs == 'cuda':
        n_jobs = 1
        if cuda_capable:
            # set up all arrays necessary for CUDA
            # try setting up for float64
            try:
                # do the IFFT normalization now so we don't have to later.
                # Build into a temporary: h_fft must stay untouched if any
                # of the allocations below fails and we fall back to CPU.
                h_fft_gpu = gpuarray.to_gpu(
                    h_fft[:cuda_fft_len].astype(np.complex128) / len(h_fft))
                cuda_dict.update(
                    use_cuda=True,
                    fft_plan=cudafft.Plan(n_fft, np.float64, np.complex128),
                    ifft_plan=cudafft.Plan(n_fft, np.complex128, np.float64),
                    x_fft=gpuarray.empty(cuda_fft_len, np.complex128),
                    x=gpuarray.empty(int(n_fft), np.float64))
            except Exception:
                logger.info('CUDA not used, could not instantiate memory '
                            '(arrays may be too large), falling back to '
                            'n_jobs=1')
            else:
                h_fft = h_fft_gpu
                logger.info('Using CUDA for FFT FIR filtering')
        else:
            logger.info('CUDA not used, CUDA has not been initialized, '
                        'falling back to n_jobs=1')
    return n_jobs, cuda_dict, h_fft
def init_cuda():
    """Initialize CUDA functionality

    This function attempts to load the necessary interfaces
    (hardware connectivity) to run CUDA-based filtering. This
    function should only need to be run once per session.

    If the config var (set via mne.set_config or in ENV)
    MNE_USE_CUDA == 'true', this function will be executed when
    importing mne. If this variable is not set, this function can
    be manually executed.
    """
    global cuda_capable
    global cuda_multiply_inplace_c128
    global cuda_halve_c128
    global cuda_real_c128
    if cuda_capable is True:
        logger.info('CUDA previously enabled, currently %s available memory'
                    % sizeof_fmt(mem_get_info()[0]))
        return
    # Triage possible errors for informative messaging
    cuda_capable = False
    try:
        import pycuda.gpuarray
        import pycuda.driver
    except ImportError:
        logger.warning('module pycuda not found, CUDA not enabled')
        return
    try:
        # Initialize CUDA; happens with importing autoinit
        import pycuda.autoinit  # noqa, analysis:ignore
    except ImportError:
        logger.warning('pycuda.autoinit could not be imported, likely '
                       'a hardware error, CUDA not enabled')
        return
    # Make sure scikits.cuda is installed
    try:
        from scikits.cuda import fft as cudafft
    except ImportError:
        logger.warning('module scikits.cuda not found, CUDA not '
                       'enabled')
        return

    # Make our multiply inplace kernel
    from pycuda.elementwise import ElementwiseKernel

    # let's construct our own CUDA multiply in-place function
    cuda_multiply_inplace_c128 = ElementwiseKernel(
        'pycuda::complex<double> *a, pycuda::complex<double> *b',
        'b[i] *= a[i]', 'multiply_inplace')
    cuda_halve_c128 = ElementwiseKernel('pycuda::complex<double> *a',
                                        'a[i] /= 2.0', 'halve_value')
    cuda_real_c128 = ElementwiseKernel('pycuda::complex<double> *a',
                                       'a[i] = real(a[i])', 'real_value')

    # Make sure we can use 64-bit FFTs
    try:
        cudafft.Plan(16, np.float64, np.complex128)  # will get auto-GC'ed
    except Exception:
        # ``Exception`` rather than a bare ``except`` so that
        # KeyboardInterrupt / SystemExit are not swallowed here.
        logger.warning('Device does not support 64-bit FFTs, '
                       'CUDA not enabled')
        return
    cuda_capable = True
    # Figure out limit for CUDA FFT calculations
    logger.info('Enabling CUDA with %s available memory'
                % sizeof_fmt(mem_get_info()[0]))
def setup_cuda_fft_resample(n_jobs, W, new_len):
    """Set up CUDA FFT resampling

    Parameters
    ----------
    n_jobs : int | str
        If n_jobs == 'cuda', the function will attempt to set up for CUDA
        FFT resampling.
    W : array
        The filtering function to be used during resampling.
        If n_jobs='cuda', this function will be shortened (since CUDA
        assumes FFTs of real signals are half the length of the signal)
        and turned into a gpuarray.
    new_len : int
        The size of the array following resampling.

    Returns
    -------
    n_jobs : int
        Sets n_jobs = 1 if n_jobs == 'cuda' was passed in, otherwise
        original n_jobs is passed.
    cuda_dict : dict
        Dictionary with the following CUDA-related variables:
            use_cuda : bool
                Whether CUDA should be used.
            fft_plan : instance of FFTPlan
                FFT plan to use in calculating the FFT.
            ifft_plan : instance of FFTPlan
                FFT plan to use in calculating the IFFT.
            x_fft : instance of gpuarray
                Empty allocated GPU space for storing the result of the
                frequency-domain multiplication.
            x : instance of gpuarray
                Empty allocated GPU space for the data to resample.
    W : array | instance of gpuarray
        This will either be a gpuarray (if CUDA enabled) or np.ndarray.
        If CUDA is enabled, W will be modified appropriately for use
        with filter.fft_multiply(). If the CUDA setup fails, the input
        W is returned unchanged.

    Notes
    -----
    This function is designed to be used with fft_resample().
    """
    cuda_dict = dict(use_cuda=False, fft_plan=None, ifft_plan=None,
                     x_fft=None, x=None, y_fft=None, y=None)
    n_fft_x, n_fft_y = len(W), new_len
    cuda_fft_len_x = int((n_fft_x - (n_fft_x % 2)) // 2 + 1)
    cuda_fft_len_y = int((n_fft_y - (n_fft_y % 2)) // 2 + 1)
    if n_jobs == 'cuda':
        n_jobs = 1
        if cuda_capable:
            # try setting up for float64
            try:
                # do the IFFT normalization now so we don't have to later.
                # Build into a temporary: W must stay untouched if any of
                # the allocations below fails and we fall back to CPU.
                W_gpu = gpuarray.to_gpu(
                    W[:cuda_fft_len_x].astype(np.complex128) / n_fft_y)
                cuda_dict.update(
                    use_cuda=True,
                    fft_plan=cudafft.Plan(n_fft_x, np.float64,
                                          np.complex128),
                    ifft_plan=cudafft.Plan(n_fft_y, np.complex128,
                                           np.float64),
                    x_fft=gpuarray.zeros(max(cuda_fft_len_x,
                                             cuda_fft_len_y),
                                         np.complex128),
                    x=gpuarray.empty(max(int(n_fft_x), int(n_fft_y)),
                                     np.float64))
            except Exception:
                logger.info('CUDA not used, could not instantiate memory '
                            '(arrays may be too large), falling back to '
                            'n_jobs=1')
            else:
                W = W_gpu
                logger.info('Using CUDA for FFT resampling')
        else:
            logger.info('CUDA not used, CUDA has not been initialized, '
                        'falling back to n_jobs=1')
    return n_jobs, cuda_dict, W
import pycuda.gpuarray as gpuarray
import numpy as np
import scikits.cuda.fft as cu_fft

# Round trip: batched real -> complex FFT followed by the scaled inverse,
# compared against the NumPy reference.
print('Testing fft/ifft..')
N = 4096 * 16
batch_size = 16

x = np.asarray(np.random.rand(batch_size, N), np.float32)
xf = np.fft.fft(x)
y = np.real(np.fft.ifft(xf))

x_gpu = gpuarray.to_gpu(x)
# Floor division keeps the shape an integer under true division too.
xf_gpu = gpuarray.empty((batch_size, N // 2 + 1), np.complex64)
plan_forward = cu_fft.Plan(N, np.float32, np.complex64, batch_size)
cu_fft.fft(x_gpu, xf_gpu, plan_forward)

y_gpu = gpuarray.empty_like(x_gpu)
plan_inverse = cu_fft.Plan(N, np.complex64, np.float32, batch_size)
cu_fft.ifft(xf_gpu, y_gpu, plan_inverse, True)

# Single-argument print() emits the same text on Python 2 and 3.
print('Success status:  %s' % np.allclose(y, y_gpu.get(), atol=1e-6))

# In-place transform: the same GPU array is passed as input and output.
print('Testing in-place fft..')
x = np.asarray(np.random.rand(batch_size, N) +
               1j * np.random.rand(batch_size, N), np.complex64)
x_gpu = gpuarray.to_gpu(x)

plan = cu_fft.Plan(N, np.complex64, np.complex64, batch_size)
cu_fft.fft(x_gpu, x_gpu, plan)
def init_cuda():
    """Initialize CUDA functionality

    This function attempts to load the necessary interfaces
    (hardware connectivity) to run CUDA-based filtering. This
    function should only need to be run once per session.

    If the config var (set via mne.set_config or in ENV)
    MNE_USE_CUDA == 'true', this function will be executed when
    importing mne. If this variable is not set, this function can
    be manually executed.

    Raises
    ------
    RuntimeError
        If the pycuda elementwise kernels cannot be constructed.
    """
    global cuda_capable
    global cuda_multiply_inplace_complex128
    global cuda_halve_value_complex128
    global cuda_real_value_complex128
    global requires_cuda
    if cuda_capable is True:
        logger.info('CUDA previously enabled, currently %s available memory'
                    % sizeof_fmt(mem_get_info()[0]))
        return
    # Triage possible errors for informative messaging
    cuda_capable = False
    try:
        import pycuda.gpuarray
        import pycuda.driver
    except ImportError:
        logger.warning('module pycuda not found, CUDA not enabled')
    else:
        try:
            # Initialize CUDA; happens with importing autoinit
            import pycuda.autoinit
        except ImportError:
            logger.warning('pycuda.autoinit could not be imported, likely '
                           'a hardware error, CUDA not enabled')
        else:
            # Make our multiply inplace kernel
            try:
                from pycuda.elementwise import ElementwiseKernel
                # let's construct our own CUDA multiply in-place function
                dtype = 'pycuda::complex<double>'
                cuda_multiply_inplace_complex128 = \
                    ElementwiseKernel(dtype + ' *a, ' + dtype + ' *b',
                                      'b[i] *= a[i]', 'multiply_inplace')
                cuda_halve_value_complex128 = \
                    ElementwiseKernel(dtype + ' *a', 'a[i] /= 2.0',
                                      'halve_value')
                cuda_real_value_complex128 = \
                    ElementwiseKernel(dtype + ' *a', 'a[i] = real(a[i])',
                                      'real_value')
            except Exception:
                # This should never happen. ``Exception`` rather than a
                # bare ``except`` so KeyboardInterrupt / SystemExit are
                # not converted into a RuntimeError.
                raise RuntimeError('pycuda ElementwiseKernel could not be '
                                   'constructed, please report this issue '
                                   'to mne-python developers with your '
                                   'system information and pycuda version')
            else:
                # Make sure scikits.cuda is installed
                try:
                    from scikits.cuda import fft as cudafft
                except ImportError:
                    logger.warning('module scikits.cuda not found, CUDA not '
                                   'enabled')
                else:
                    # Make sure we can use 64-bit FFTs
                    try:
                        fft_plan = cudafft.Plan(16, np.float64,
                                                np.complex128)
                        del fft_plan
                    except Exception:
                        logger.warning('Device does not support 64-bit '
                                       'FFTs, CUDA not enabled')
                    else:
                        cuda_capable = True
                        # Figure out limit for CUDA FFT calculations
                        logger.info('Enabling CUDA with %s available memory'
                                    % sizeof_fmt(mem_get_info()[0]))
    # NOTE(review): placed at function level so the decorator is refreshed
    # on every non-early-return path; the flattened source does not show
    # the original indentation - confirm against the file.
    requires_cuda = np.testing.dec.skipif(not cuda_capable,
                                          'CUDA not initialized')
def sample_defrost_gpu(lat, func, gamma, m2_eff):
    """Calculates a sample of random values in the lattice

    lat = Lattice
    func = name of Cuda kernel
    gamma = -0.25 or +0.25
    m2_eff = effective mass

    A 1-D kernel is built and transformed on the host with fftw3, then
    `func` is launched to fill a real (n, n, n) GPU array from it. That
    array is transformed with scikits.cuda.fft, multiplied on the host by
    complex Gaussian random numbers and transformed back. The result is
    returned as a host array of dtype lat.prec_real scaled by 1 / lat.VL.
    """
    import scikits.cuda.fft as fft
    import fftw3

    # Various constants:
    mpl = lat.mpl
    n = lat.n
    nn = lat.nn
    os = 16
    nos = n * pow(os, 2)

    dk = lat.dk
    dx = lat.dx
    dkos = dk / (2. * os)
    dxos = dx / os
    kcut = nn * dk / 2.0
    norm = 0.5 / (math.sqrt(2 * pi * dk**3.) * mpl) * (dkos / dxos)

    ker = np.empty(nos, dtype=lat.prec_real)
    # In-place real transform of the kernel buffer on the host.
    fft1 = fftw3.Plan(ker, ker, direction='forward', flags=['measure'],
                      realtypes=['realodd 10'])

    for k in range(nos):
        kk = (k + 0.5) * dkos
        ker[k] = kk * (kk**2. + m2_eff)**gamma * math.exp(-(kk / kcut)**2.)

    fft1.execute()
    fftw3.destroy_plan(fft1)

    for k in range(nos):
        ker[k] = norm * ker[k] / (k + 1)

    # Floor division keeps the shape an integer under true division too.
    Fk_gpu = gpuarray.zeros((n // 2 + 1, n, n), dtype=lat.prec_complex)

    ker_gpu = gpuarray.to_gpu(ker)
    tmp_gpu = gpuarray.zeros((n, n, n), dtype=lat.prec_real)

    plan = fft.Plan(tmp_gpu.shape, lat.prec_real, lat.prec_complex)
    plan2 = fft.Plan(tmp_gpu.shape, lat.prec_complex, lat.prec_real)

    func(tmp_gpu, ker_gpu, np.uint32(nn), np.float64(os),
         np.uint32(lat.dimx), np.uint32(lat.dimy), np.uint32(lat.dimz),
         block=lat.cuda_block_1, grid=lat.cuda_grid)

    fft.fft(tmp_gpu, Fk_gpu, plan)

    if lat.test == True:
        # Single-argument print() emits the same text on Python 2 and 3.
        print('Testing mode on! Set testQ to False to disable this.\n')
        # Fixed seed so test runs are reproducible.
        np.random.seed(1)

    rr1 = (np.random.normal(size=Fk_gpu.shape) +
           np.random.normal(size=Fk_gpu.shape) * 1j)

    # The random multiplication is done on the host, then re-uploaded.
    Fk = Fk_gpu.get()
    Fk *= rr1
    Fk_gpu = gpuarray.to_gpu(Fk)

    fft.ifft(Fk_gpu, tmp_gpu, plan2)

    res = (tmp_gpu.get()).astype(lat.prec_real)
    res *= 1. / lat.VL

    return res
def _plot_path(outdir, filename):
    """Return where plot `filename` is saved, creating `outdir` if needed.

    With outdir None the bare filename is returned, i.e. the plot goes to
    the current working directory.
    """
    if outdir is None:
        return filename
    # Drop one trailing slash so the join below does not double it.
    prefix = outdir[:-1] if outdir[-1:] == '/' else outdir
    if not os.path.exists(prefix):
        os.makedirs(prefix)
    return prefix + '/' + filename


def main(infile, outdir, ISIZE, PLOT_ME):
    """Grid the visibilities in `infile`, run CLEAN and optionally plot.

    Parameters
    ----------
    infile : str
        File name passed to pyfits.open(); its base name (without
        extension) also prefixes the image plot file names.
    outdir : str or None
        Directory for the PNG plots (created if missing); None writes
        them to the current working directory.
    ISIZE : int
        Number of image pixels; must be at least 256.
    PLOT_ME : bool
        When true, the beam, the dirty image and the cleaned image are
        written as PNG files.

    Raises
    ------
    ValueError
        If ISIZE is below 256, for which no clean-beam width is defined.
    """
    # Load settings for each example
    settings = {
        'vfile': infile,
        'imsize': np.int32(ISIZE),          # number of image pixels
        # 1 degree viewfield split over ISIZE pixels; pixel size
        # presumably in arcseconds -- TODO confirm the unit.
        'cell': np.float32(3600. / ISIZE),
        'briggs': np.float32(1e7),          # weight parameter
    }
    imsize = settings['imsize']

    # Width handed to serial_clean_beam. It depends only on the image
    # size, so it is decided (and validated) before any GPU work starts.
    if imsize >= 1024:
        beam_width = imsize / 50.
    elif imsize >= 512:
        beam_width = imsize / 25.
    elif imsize >= 256:
        beam_width = imsize / 12.
    else:
        raise ValueError('ISIZE must be at least 256, got %d' % imsize)

    # cuFFT plan on a grid twice the image size (2048 when imsize=1024).
    # Builtin int replaces np.int, which newer NumPy releases removed.
    nx = np.int32(2 * imsize)
    plan = fft.Plan((int(nx), int(nx)), np.complex64, np.complex64)

    f = pyfits.open(settings['vfile'])
    try:
        for chan in range(4, 5):
            ## Create the PSF & dirty image
            # dpsf (the PSF) is a host array, gpu_im (dirty image) is on
            # the GPU.
            dpsf, gpu_im = cuda_gridvis(f, settings, plan, chan)
            gpu_dpsf = gpu.to_gpu(dpsf)
            if PLOT_ME:
                # Copied to the host before CLEAN runs on gpu_im.
                dirty = np.roll(np.fliplr(gpu_im.get()), 1, axis=1)

            ## Clean the PSF
            cpsf = serial_clean_beam(dpsf, beam_width)
            gpu_cpsf = gpu.to_gpu(cpsf)

            if PLOT_ME:
                print("Plotting dirty and cleaned beam")
                fig, axs = plt.subplots()
                plt.subplots_adjust(wspace=0)
                axs.imshow(dpsf, vmin=np.percentile(dpsf, 0),
                           vmax=np.percentile(dpsf, 99), cmap=cm.gray)
                plt.savefig(_plot_path(outdir,
                                       'test_cleanbeam_%d.png' % chan))
                plt.close()

            ## Run CLEAN (only gpu_clean is used below)
            gpu_dirty, gpu_pmodel, gpu_clean = cuda_hogbom(
                gpu_im, gpu_dpsf, gpu_cpsf, thresh=0.2, gain=0.1)

            if PLOT_ME:
                prefix, ext = os.path.splitext(os.path.basename(infile))
                # One colour range per channel, shared by both images.
                vra = [np.percentile(dirty, 1), np.percentile(dirty, 99)]

                print("Plotting dirty image and dirty image after "
                      "iterative source removal")
                fig, axs = plt.subplots()
                plt.subplots_adjust(wspace=0)
                axs.imshow(dirty, vmin=vra[0], vmax=vra[1], cmap=cm.jet,
                           origin='lower')
                axs.set_title('Original dirty image')
                plt.savefig(_plot_path(
                    outdir, prefix + '_dirty_final_%d.png' % chan))
                plt.close()

                print("Plotting dirty image and final clean image")
                fig, axs = plt.subplots(figsize=(6.1, 6))
                plt.subplots_adjust(wspace=0)
                clean = np.roll(np.fliplr(gpu_clean.get()), 1, axis=1)
                axs.imshow(clean, vmin=vra[0], vmax=vra[1], cmap=cm.hot,
                           origin='lower')
                axs.set_title('Final cleaned image')
                plt.savefig(_plot_path(
                    outdir, prefix + '_clean_final_%d.png' % chan))
                plt.close()
    finally:
        # Release the FITS file handle even if gridding or CLEAN fails.
        f.close()