def __init__(self, volume, template, mask, wedge, stdV, gpu=True):
    """Upload the inputs to the GPU and allocate all volume-sized work buffers.

    Args:
        volume: Host array; uploaded with ``gu.to_gpu``. Its shape and dtype
            size every padded buffer and both FFT plans.
        template: Wrapped in ``Volume``.
        mask: Wrapped in ``Volume``; its shape is recorded as ``sOrg``.
        wedge: Host array uploaded with ``gu.to_gpu``.
        stdV: Host array uploaded with ``gu.to_gpu``.
        gpu: Accepted for interface compatibility; not read here.
    """
    padded_shape = volume.shape
    real_dtype = volume.dtype

    self.volume = gu.to_gpu(volume)

    def _volume_sized(dtype):
        # Every work buffer shares the (padded) shape of the search volume.
        return gu.zeros_like(self.volume, dtype=dtype)

    # Template and mask, each with a zeroed buffer of the volume's size.
    # NOTE: templatePadded is only allocated here; nothing in this
    # constructor writes the template into it.
    self.template = Volume(template)
    self.templatePadded = _volume_sized(np.float32)
    self.mask = Volume(mask)
    self.maskPadded = _volume_sized(np.float32)

    self.sOrg = mask.shape
    self.sPad = padded_shape
    print(self.sPad, self.sOrg)

    # Zero-angle rotate call with maskPadded as the third argument.
    rotate(self.mask, [0, 0, 0], self.maskPadded, self.sPad, self.sOrg)

    print(padded_shape, stdV.shape, wedge.shape)
    self.wedge = gu.to_gpu(wedge)
    self.stdV = gu.to_gpu(stdV)

    # Forward (volume dtype -> complex64) and inverse (complex64 -> volume dtype) plans.
    self.fwd_plan = Plan(padded_shape, real_dtype, np.complex64)
    self.inv_plan = Plan(padded_shape, np.complex64, real_dtype)

    self.volume_fft = _volume_sized(np.complex64)
    self.template_fft = _volume_sized(np.complex64)
    self.ccc_map = _volume_sized(np.float32)
    self.norm_volume = np.prod(padded_shape)

    # Score and angle maps start at the sentinel value -1000.
    self.scores = gu.ones_like(self.volume, dtype=np.float32) * -1000
    self.angles = gu.ones_like(self.volume, dtype=np.float32) * -1000

    self.p = sum(self.mask.d_data)
def scikit_gpu_fft_pipeline(filename):
    """Time a GPU FFT/IFFT round-trip benchmark over the samples in *filename*.

    The file is read as raw ``complex64`` samples in chunks of
    ``GULP_SIZE * GULP_FRAME_FFT`` values.  Each chunk is uploaded to the GPU
    and pushed through ``NUMBER_FFT`` forward/inverse transform pairs, always
    writing into freshly allocated output buffers.

    Args:
        filename: Path of the binary sample file.

    Returns:
        The difference between the ``timer()`` readings taken after and
        before the whole pipeline.
    """
    start = timer()
    chunk_count = (((32768 * 1024 * SIZE_MULTIPLIER // GULP_SIZE)
                    // COMPLEX_MULTIPLIER) // GULP_FRAME_FFT)
    # The file holds raw complex64 samples, so it must be opened in binary
    # mode; text mode can decode/translate the bytes behind np.fromfile.
    with open(filename, 'rb') as file_obj:
        for _ in range(chunk_count):
            data = np.fromfile(
                file_obj, dtype=np.complex64,
                count=GULP_SIZE * GULP_FRAME_FFT,
            ).reshape((GULP_FRAME_FFT, GULP_SIZE))
            g_data = gpuarray.to_gpu(data)

            # Plans are rebuilt for every chunk on purpose: their creation is
            # part of the measured pipeline cost.
            plan = Plan(data.shape[1], np.complex64, np.complex64,
                        batch=GULP_FRAME_FFT)
            plan_inverse = Plan(data.shape[1], np.complex64, np.complex64,
                                batch=GULP_FRAME_FFT)

            tmp1 = gpuarray.empty(data.shape, dtype=np.complex64)
            tmp2 = gpuarray.empty(data.shape, dtype=np.complex64)
            fft(g_data, tmp1, plan)
            ifft(tmp1, tmp2, plan_inverse)

            for _ in range(NUMBER_FFT - 1):
                # Can't do FFT in place for fairness (emulating full pipeline)
                tmp1 = gpuarray.empty(data.shape, dtype=np.complex64)
                fft(tmp2, tmp1, plan)
                tmp2 = gpuarray.empty(data.shape, dtype=np.complex64)
                ifft(tmp1, tmp2, plan_inverse)
    end = timer()
    return end - start
def __init__(self, volume, template, mask, gpu):
    """Upload volume and mask to the GPU and allocate FFT plans and buffers.

    Args:
        volume: Host array; uploaded with ``gu.to_gpu``. Its shape and dtype
            define both FFT plans.
        template: Wrapped in ``Volume``.
        mask: Host array uploaded with ``gu.to_gpu``.
        gpu: Stored unchanged on ``self.gpu``.
    """
    vol_shape = volume.shape
    vol_dtype = volume.dtype

    self.gpu = gpu
    self.volume = gu.to_gpu(volume)
    self.template = Volume(template)
    self.mask = gu.to_gpu(mask)

    # Forward plan: volume dtype -> complex64; inverse plan: the reverse.
    self.fwd_plan = Plan(vol_shape, vol_dtype, np.complex64)
    self.inv_plan = Plan(vol_shape, np.complex64, vol_dtype)

    # Spectrum buffers: one shaped like the volume, one like the template data.
    self.volume_fft = gu.zeros_like(self.volume, dtype=np.complex64)
    self.template_fft = gu.zeros_like(self.template.d_data, dtype=np.complex64)

    self.ccc_map = gu.zeros_like(self.volume, dtype=np.float32)
    self.norm_volume = np.prod(vol_shape)

    # Volume-sized result maps, zero-initialised.
    self.scores = gu.zeros_like(self.volume, dtype=np.float32)
    self.angles = gu.zeros_like(self.volume, dtype=np.float32)
def ifft2c2c_cuda(x, axes=(0, 1)):
    """Run a complex-to-complex inverse FFT on the GPU over the leading axes.

    Args:
        x: Array-like input; converted to a ``complex64`` NumPy array.
        axes: Only its length is used. The transform always covers the first
            ``len(axes)`` dimensions of ``x``; any remaining dimensions are
            handled as a batch.

    Returns:
        The transformed data copied back to the host, divided by the product
        of the transformed dimensions.
    """
    rank = len(axes)
    x = np.array(x).astype(np.complex64)
    transform_shape = x.shape[0:rank]

    x_gpu = gpuarray.to_gpu(x)
    xf_gpu = gpuarray.empty(x.shape, np.complex64)

    if x.ndim > rank:
        # Trailing dimensions form the batch.  The arguments after the dtypes
        # are forwarded positionally, exactly as before (presumably batch,
        # stream, mode, then embed/stride/dist for input and output; confirm
        # against the installed Plan signature).
        batch = np.prod(x.shape[rank:])
        in_embed = np.array(transform_shape).astype(np.int32)
        out_embed = np.array(transform_shape).astype(np.int32)
        plan = Plan(
            transform_shape, np.complex64, np.complex64, batch, None, 1,
            in_embed, np.prod(x.shape[rank:]), 1,
            out_embed, np.prod(x.shape[rank:]), 1,
        )
    else:
        plan = Plan(transform_shape, np.complex64, np.complex64)

    ifft(x_gpu, xf_gpu, plan)
    return xf_gpu.get() / np.prod(transform_shape)
def __init__(self, volume, template, gpu):
    """Precompute the volume's FFT on the GPU and allocate the work buffers.

    The uploaded copy of ``volume`` is only a temporary: it is transformed
    into ``self.volume_fft``, used as the size reference for the other
    buffers, and then released before the inverse plan is created.

    Args:
        volume: Host array whose shape and dtype define both FFT plans.
        template: Wrapped in ``Volume``.
        gpu: Stored unchanged on ``self.gpu``.
    """
    self.gpu = gpu
    vol_shape, vol_dtype = volume.shape, volume.dtype

    device_volume = gu.to_gpu(volume)

    # Forward transform of the volume is computed once, up front.
    self.fwd_plan = Plan(vol_shape, vol_dtype, np.complex64)
    self.volume_fft = gu.zeros_like(device_volume, dtype=np.complex64)
    fft(device_volume, self.volume_fft, self.fwd_plan)

    self.template_fft = gu.zeros_like(device_volume, dtype=np.complex64)
    self.ccc_map = gu.zeros_like(device_volume, dtype=np.float32)
    # NOTE(review): relies on `gu.prod` accepting a shape tuple; confirm.
    self.norm_volume = gu.prod(device_volume.shape)
    self.padded_volume = gu.zeros_like(device_volume, dtype=np.float32)

    # Drop the temporary upload before building the inverse plan.
    del device_volume

    self.inv_plan = Plan(vol_shape, np.complex64, vol_dtype)
    self.template = Volume(template)
def get_rfft_plans(shape, double_precision=False):
    """
    Loads or computes fft plans for ffts performed on the GPU.

    Plans are cached in the module-level ``fft_plans`` / ``fft_inv_plans``
    dictionaries and built only on a cache miss.

    Args:
        shape: Transform shape; passed unchanged to ``Plan``.
        double_precision: Use float64/complex128 instead of float32/complex64.

    Returns:
        Tuple ``(forward_plan, inverse_plan)``: real-to-complex and
        complex-to-real plans for ``shape``.
    """
    real_type = np.float64 if double_precision else np.float32
    cplx_type = np.complex128 if double_precision else np.complex64

    # NOTE(review): the cache key uses only shape[0] and shape[1], so shapes
    # that differ in later dimensions would share an entry; confirm callers
    # only pass 2-D shapes.
    lab = '%s x %s real2complex' % (shape[0], shape[1])
    if double_precision:
        lab += ' (double)'

    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    if lab not in fft_plans:
        print("lens_GPU : building and caching fft plan %s" % lab)
        fft_plans[lab] = Plan(shape, real_type, cplx_type)
    if lab not in fft_inv_plans:
        print("lens_GPU : building and caching ifft plan %s" % lab)
        fft_inv_plans[lab] = Plan(shape, cplx_type, real_type)
    return fft_plans[lab], fft_inv_plans[lab]
def test3():
    """Check a 3-D complex64 GPU FFT against ``np.fft.fftn`` and print the result."""
    N = 128
    cube = (N, N, N)

    # Host-side reference on random complex64 data.
    host_in = np.asarray(np.random.rand(*cube), np.complex64)
    reference = np.fft.fftn(host_in, s=None, axes=(0, 1, 2))

    # Same transform on the device, written into a separate output buffer.
    dev_in = gpuarray.to_gpu(host_in)
    dev_out = gpuarray.empty(cube, np.complex64)
    fft(dev_in, dev_out, Plan(host_in.shape, np.complex64, np.complex64))

    matches = np.allclose(reference[:N, :N, :N], dev_out.get(), atol=1e-2)
    print(matches)
def test1():
    """Check a 1-D complex64 GPU FFT against ``np.fft.fft`` and print the result."""
    N = 128

    # Host-side reference on random complex64 data.
    host_in = np.asarray(np.random.rand(N), np.complex64)
    reference = np.fft.fft(host_in)

    # Same transform on the device, written into a separate output buffer.
    dev_in = gpuarray.to_gpu(host_in)
    dev_out = gpuarray.empty(N, np.complex64)
    fft(dev_in, dev_out, Plan(host_in.shape, np.complex64, np.complex64))

    matches = np.allclose(reference[:N], dev_out.get(), atol=1e-3)
    print(matches)
def compute_inverse_plan(self):
    """Create the inverse FFT plan and store it on ``self.plan_inverse``.

    The plan maps ``self.dtype_out`` back to ``self.dtype`` and reuses the
    batch size and stream held on the instance.
    """
    # The plan is sized from cufft_shape, not shape_out.
    plan_shape = self.cufft_shape
    source_dtype = self.dtype_out
    target_dtype = self.dtype

    # cufft extensible plan API is only supported after 0.5.1
    # (commit 65288d28ca0b93e1234133f8d460dc6becb65121)
    # but there is still no official 0.5.2, so `auto_allocate=True`
    # is intentionally not passed.
    self.plan_inverse = Plan(
        plan_shape,
        source_dtype,
        target_dtype,
        batch=self.cufft_batch_size,
        stream=self.cufft_stream,
    )
def cu_lpf(stimulus, dt, freq):
    """
    CUDA implementation of low-pass-filter.

    The input is transformed with a real-to-complex FFT, every frequency bin
    from the cut-off upwards is zeroed directly in device memory, and the
    spectrum is transformed back.

    stimulus: ndarray
        The input to be filtered. A ``gpuarray.GPUArray`` is used in place;
        anything else is uploaded first.
    dt: float
        The sampling interval of the input.
    freq: float
        The cut-off frequency of the low pass filter. A cut-off beyond the
        highest bin leaves the spectrum untouched; a negative cut-off zeroes
        every bin.

    Returns the filtered signal copied back to the host.
    """
    num = len(stimulus)
    num_fft = num // 2 + 1  # number of bins produced by the real-to-complex FFT
    idtype = stimulus.dtype
    odtype = np.complex128 if idtype == np.float64 else np.complex64

    if isinstance(stimulus, gpuarray.GPUArray):
        d_stimulus = stimulus
    else:
        d_stimulus = gpuarray.to_gpu(stimulus)

    plan = Plan(stimulus.shape, idtype, odtype)
    d_fstimulus = gpuarray.empty(num_fft, odtype)
    fft(d_stimulus, d_fstimulus, plan)

    df = 1.0 / dt / num  # frequency resolution of one bin
    # Clamp the cut-off bin into [0, num_fft]: an unclamped index would make
    # the memset start outside the spectrum buffer or use a negative count.
    idx = max(0, min(int(freq // df), num_fft))
    if idx < num_fft:
        itemsize = d_fstimulus.dtype.itemsize
        unit = itemsize // 4  # 32-bit words per complex element
        offset = int(d_fstimulus.gpudata) + itemsize * idx
        cuda.memset_d32(offset, 0, unit * (num_fft - idx))

    plan = Plan(stimulus.shape, odtype, idtype)
    d_lpf_stimulus = gpuarray.empty(num, idtype)
    # Fourth argument kept as False (presumably the `scale` flag, i.e. the
    # inverse transform is left unnormalised; confirm).
    ifft(d_fstimulus, d_lpf_stimulus, plan, False)
    return d_lpf_stimulus.get()
import skcuda.cublas as cublas
import skcuda

# Benchmark script: split an N-point real signal across several CUDA streams,
# run one real-to-complex FFT per stream and report the elapsed time between
# two events.  print(...) calls take a single argument so the output is the
# same on Python 2 and Python 3.
s = cuda.Event()
e = cuda.Event()
s.record()

nStreams = 8
stream = [cuda.Stream() for i in range(nStreams)]
N = 8192
print(skcuda.misc.get_current_device())

# Integer division: each stream handles N // nStreams samples
# (`/` would produce a float size on Python 3).
x = [np.asarray(np.random.rand(N // nStreams), np.float32) for i in range(nStreams)]
#x_pin = cuda.register_host_memory(x)
#xf = np.fft.fft(x)

x_gpu = [gpuarray.to_gpu_async(x[i], stream=stream[i]) for i in range(nStreams)]
# A real-to-complex FFT of n samples yields n // 2 + 1 complex bins.
xf_gpu = [gpuarray.empty((N // nStreams) // 2 + 1, np.complex64) for i in range(nStreams)]
plan = [Plan(x[0].shape, np.float32, np.complex64, stream=stream[i]) for i in range(nStreams)]
print(skcuda.misc.get_current_device())

for i in range(nStreams):
    fft(x_gpu[i], xf_gpu[i], plan[i])
print(skcuda.misc.get_current_device())

x_pin = [xf_gpu[i].get_async(stream=stream[i]) for i in range(nStreams)]
#print np.allclose(xf[0:N/2 + 1], xf_gpu.get(), atol=1e-6)

e.record()
e.synchronize()
print("%s ms" % s.time_till(e))