Example #1
 def test_batch_fft_float64_to_complex128_1d(self):
     x = np.asarray(np.random.rand(self.B, self.N), np.float64)
     xf = np.fft.rfft(x, axis=1)
     x_gpu = gpuarray.to_gpu(x)
     xf_gpu = gpuarray.empty((self.B, self.N//2+1), np.complex128)
     plan = fft.Plan(x.shape[1], np.float64, np.complex128, batch=self.B)
     fft.fft(x_gpu, xf_gpu, plan)
     assert np.allclose(xf, xf_gpu.get(), atol=atol_float64)
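The tests in this set assume a test-class fixture that defines self.B, self.N, self.M and the atol_float32/atol_float64 tolerances. A minimal standalone sketch of the same batched 1D real-to-complex transform, with illustrative sizes and tolerance chosen here rather than taken from the test suite:

import numpy as np
import pycuda.autoinit
import pycuda.gpuarray as gpuarray
import skcuda.fft as fft

B, N = 16, 128                                   # illustrative batch count and transform length
x = np.random.rand(B, N).astype(np.float64)
xf = np.fft.rfft(x, axis=1)                      # CPU reference

x_gpu = gpuarray.to_gpu(x)
xf_gpu = gpuarray.empty((B, N//2 + 1), np.complex128)
plan = fft.Plan(N, np.float64, np.complex128, batch=B)
fft.fft(x_gpu, xf_gpu, plan)
print(np.allclose(xf, xf_gpu.get(), atol=1e-9))  # tolerance picked loosely for float64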
Example #2
 def test_fft_float32_to_complex64_2d(self):
     x = np.asarray(np.random.rand(self.N, self.M), np.float32)
     xf = np.fft.rfftn(x)
     x_gpu = gpuarray.to_gpu(x)
     xf_gpu = gpuarray.empty((self.N, self.M//2+1), np.complex64)
     plan = fft.Plan(x.shape, np.float32, np.complex64)
     fft.fft(x_gpu, xf_gpu, plan)
     assert np.allclose(xf, xf_gpu.get(), atol=atol_float32)
Example #3
 def test_batch_fft_float64_to_complex128_2d(self):
     x = np.asarray(np.random.rand(self.B, self.N, self.M), np.float64)
     xf = np.fft.rfftn(x, axes=(1,2))
     x_gpu = gpuarray.to_gpu(x)
     xf_gpu = gpuarray.empty((self.B, self.N, self.M//2+1), np.complex128)
     plan = fft.Plan([self.N, self.M], np.float64, np.complex128, batch=self.B)
     fft.fft(x_gpu, xf_gpu, plan)
     assert np.allclose(xf, xf_gpu.get(), atol=atol_float64)
Example #4
 def test_fft_float64_to_complex128_1d(self):
     x = np.asarray(np.random.rand(self.N), np.float64)
     xf = np.fft.rfftn(x)
     x_gpu = gpuarray.to_gpu(x)
     xf_gpu = gpuarray.empty(self.N//2+1, np.complex128)
     plan = fft.Plan(x.shape, np.float64, np.complex128)
     fft.fft(x_gpu, xf_gpu, plan)
     assert np.allclose(xf, xf_gpu.get(), atol=atol_float64)
Example #5
 def test_batch_fft_float32_to_complex64_2d(self):
     x = np.asarray(np.random.rand(self.B, self.N, self.M), np.float32)
     xf = np.fft.rfftn(x, axes=(1,2))
     x_gpu = gpuarray.to_gpu(x)
     xf_gpu = gpuarray.empty((self.B, self.N, self.M//2+1), np.complex64)
     plan = fft.Plan([self.N, self.M], np.float32, np.complex64, batch=self.B)
     fft.fft(x_gpu, xf_gpu, plan)
     assert np.allclose(xf, xf_gpu.get(), atol=atol_float32)
Example #6
 def test_batch_fft_float32_to_complex64_1d(self):
     x = np.asarray(np.random.rand(self.B, self.N), np.float32)
     xf = np.fft.rfft(x, axis=1)
     x_gpu = gpuarray.to_gpu(x)
     xf_gpu = gpuarray.empty((self.B, self.N//2+1), np.complex64)
     plan = fft.Plan(x.shape[1], np.float32, np.complex64, batch=self.B)
     fft.fft(x_gpu, xf_gpu, plan)
     assert np.allclose(xf, xf_gpu.get(), atol=atol_float32)
Example #7
def test1():
    N = 128
    x = np.asarray(np.random.rand(N), np.complex64)
    xf = np.fft.fft(x)
    x_gpu = gpuarray.to_gpu(x)
    xf_gpu = gpuarray.empty(N, np.complex64)
    plan = Plan(x.shape, np.complex64, np.complex64)
    fft(x_gpu, xf_gpu, plan)
    print(np.allclose(xf[0:N], xf_gpu.get(), atol=1e-3))
Example #8
def test3():
    N = 128
    x = np.asarray(np.random.rand(N, N, N), np.complex64)
    xf = np.fft.fftn(x, s=None, axes=(0, 1, 2))
    x_gpu = gpuarray.to_gpu(x)
    xf_gpu = gpuarray.empty((N, N, N), np.complex64)
    plan = Plan(x.shape, np.complex64, np.complex64)
    fft(x_gpu, xf_gpu, plan)
    print(np.allclose(xf[0:N, 0:N, 0:N], xf_gpu.get(), atol=1e-2))
Example #9
 def _solve_kernel_slow(self):
     ''' Slow version, use when save_memory is True: Stores only 1 slice
     of the fgreentr function and loops over all slices
     '''
     cu_fft.fft(self.tmpspace, self.tmpspace, plan=self.plan_forward)
     for i in xrange(self.mesh.nz):
         self.tmpspace[i,:,:] = self.tmpspace[i,:,:] * self.fgreentr
     cu_fft.ifft(self.tmpspace, self.tmpspace,
                 plan=self.plan_backward)
Example #10
 def test_work_area(self):
     x = np.asarray(np.random.rand(self.N), np.float32)
     xf = np.fft.rfftn(x)
     x_gpu = gpuarray.to_gpu(x)
     xf_gpu = gpuarray.empty(self.N // 2 + 1, np.complex64)
     plan = fft.Plan(x.shape, np.float32, np.complex64, auto_allocate=False)
     work_area = gpuarray.empty((plan.worksize,), np.uint8)
     plan.set_work_area(work_area)
     fft.fft(x_gpu, xf_gpu, plan)
     assert np.allclose(xf, xf_gpu.get(), atol=atol_float32)
Example #11
 def test_work_area(self):
     x = np.asarray(np.random.rand(self.N), np.float32)
     xf = np.fft.rfftn(x)
     x_gpu = gpuarray.to_gpu(x)
     xf_gpu = gpuarray.empty(self.N // 2 + 1, np.complex64)
     plan = fft.Plan(x.shape, np.float32, np.complex64, auto_allocate=False)
     work_area = gpuarray.empty((plan.worksize, ), np.uint8)
     plan.set_work_area(work_area)
     fft.fft(x_gpu, xf_gpu, plan)
     assert np.allclose(xf, xf_gpu.get(), atol=atol_float32)
Example #12
def inplaceFractShift(img, dx, dy, PhaseShiftFunc, bInverse=False):
    if dx == 0 and dy == 0:
        return
    global plan
    global FT

    Cache(img.shape)
    cu_fft.fft(img, FT, plan)
    PhaseShiftFunc(FT, kxx, kyy, np.float32(dx), np.float32(dy))
    cu_fft.ifft(FT, img, plan, True)
Example #13
def process_video_cuda(data):
    global cs, cs_first
    #	fft_overlap(data, FiltV_GPU)

    if cs_first == True:
        prepare_video_filters(SysParams)
        prepare_video_cuda()
        cs_first = False

    fdata = np.float32(data)

    gpudata = gpuarray.to_gpu(fdata)

    # first fft->ifft cycle applies pre-decoding filtering (low pass filters, CAV/CLV emphasis)
    # and very importantly, performs the Hilbert transform
    fft.fft(gpudata, cs['fft1_out'], cs['plan1'])

    if Inner:
        cs['fft1_out'] *= cs['filt_video_inner']
    else:
        cs['fft1_out'] *= cs['filt_video']

    fft.ifft(cs['fft1_out'], cs['filtered1'], cs['plan1i'], True)

    cs['doanglediff'](cs['fm_demod'],
                      cs['filtered1'],
                      block=(1024, 1, 1),
                      grid=(blocklenk, 1))

    # post-processing:  output low-pass filtering and deemphasis
    fft.fft(cs['fm_demod'], cs['fft2_out'], cs['plan2'])
    cs['fft2_out'] *= cs['filt_post']
    fft.ifft(cs['fft2_out'], cs['postlpf'], cs['plan2i'], True)

    cs['doclamp16'](cs['clipped_gpu'],
                    cs['postlpf'],
                    np.float32(-SysParams['output_minfreq']),
                    np.float32(SysParams['output_scale']),
                    block=(1024, 1, 1),
                    grid=(blocklenk, 1))

    output_16 = cs['clipped_gpu'].get()

    chop = 512
    return output_16[chop:len(output_16) - chop]

    # graph for debug

    #	plt.plot(cs['postlpf'].get()[5000:7500])
    plt.plot(output_16[5000:7000])
    #	plt.plot(range(0, len(output_16)), output_16)
    #	plt.plot(range(0, len(doutput)), doutput)
    #	plt.plot(range(0, len(output_prefilt)), output_prefilt)
    plt.show()
    exit()
Example #14
def process_video_cuda(data):
    global cs, cs_first
    # 	fft_overlap(data, FiltV_GPU)

    if cs_first == True:
        prepare_video_filters()
        prepare_video_cuda()
        cs_first = False

    fdata = np.float32(data)

    gpudata = gpuarray.to_gpu(fdata)

    # first fft->ifft cycle applies pre-decoding filtering (low pass filters, CAV/CLV emphasis)
    # and very importantly, performs the Hilbert transform
    fft.fft(gpudata, cs["fft1_out"], cs["plan1"])

    if Inner:
        cs["fft1_out"] *= cs["filt_video_inner"]
    else:
        cs["fft1_out"] *= cs["filt_video"]

    fft.ifft(cs["fft1_out"], cs["filtered1"], cs["plan1i"], True)

    cs["doanglediff"](cs["fm_demod"], cs["filtered1"], block=(1024, 1, 1), grid=(blocklenk, 1))

    # post-processing:  output low-pass filtering and deemphasis
    fft.fft(cs["fm_demod"], cs["fft2_out"], cs["plan2"])
    cs["fft2_out"] *= cs["filt_post"]
    fft.ifft(cs["fft2_out"], cs["postlpf"], cs["plan2i"], True)

    cs["doclamp16"](
        cs["clipped_gpu"],
        cs["postlpf"],
        np.float32(-SP["output_minfreq"]),
        np.float32(SP["output_scale"]),
        block=(1024, 1, 1),
        grid=(blocklenk, 1),
    )

    output_16 = cs["clipped_gpu"].get()

    chop = 512
    return output_16[chop : len(output_16) - chop]

    # graph for debug
    # 	output = (sps.lfilter(f_deemp_b, f_deemp_a, output)[128:len(output)]) / deemp_corr

    # 	plt.plot(cs['postlpf'].get()[5000:7500])
    plt.plot(output_16[5000:7000])
    # 	plt.plot(range(0, len(output_16)), output_16)
    # 	plt.plot(range(0, len(doutput)), doutput)
    # 	plt.plot(range(0, len(output_prefilt)), output_prefilt)
    plt.show()
    exit()
Example #15
    def propagate_eager(self, wavelength, wavefront):
        """
        'Not-Too-Good' version of the propagation on the GPU (lots of Memory issues...)
        Remove in the future
        :param wavelength:
        :param wavefront:
        :return:
        """

        N = self.N_PIX
        # free, total = cuda.mem_get_info()
        free, total = cuda.mem_get_info()
        print("Free: %.2f percent" % (free / total * 100))

        # Pupil Plane -> Image Slicer
        complex_pupil = self.pupil_masks[wavelength] * np.exp(
            1j * 2 * np.pi * self.pupil_masks[wavelength] / wavelength)
        complex_pupil_gpu = gpuarray.to_gpu(
            np.asarray(complex_pupil, np.complex64))
        plan = cu_fft.Plan(complex_pupil_gpu.shape, np.complex64, np.complex64)
        cu_fft.fft(complex_pupil_gpu, complex_pupil_gpu, plan, scale=True)

        # Add N_slices copies to be Masked
        complex_slicer_cpu = complex_pupil_gpu.get()
        complex_pupil_gpu.gpudata.free()

        free, total = cuda.mem_get_info()
        print("*Free: %.2f percent" % (free / total * 100))

        complex_slicer_cpu = np.stack([complex_slicer_cpu] * self.N_slices)
        complex_slicer_gpu = gpuarray.to_gpu(complex_slicer_cpu)
        slicer_masks_gpu = gpuarray.to_gpu(self.slicer_masks_fftshift)
        clinalg.multiply(slicer_masks_gpu, complex_slicer_gpu, overwrite=True)
        slicer_masks_gpu.gpudata.free()
        free, total = cuda.mem_get_info()
        print("**Free: %.2f percent" % (free / total * 100))

        # Slicer -> Pupil Mirror
        plan = cu_fft.Plan((N, N), np.complex64, np.complex64, self.N_slices)
        cu_fft.ifft(complex_slicer_gpu, complex_slicer_gpu, plan, scale=True)
        mirror_mask_gpu = gpuarray.to_gpu(self.pupil_mirror_masks_fft)
        clinalg.multiply(mirror_mask_gpu, complex_slicer_gpu, overwrite=True)

        # Pupil Mirror -> Slits
        cu_fft.fft(complex_slicer_gpu, complex_slicer_gpu, plan)
        slits = complex_slicer_gpu.get()
        complex_slicer_gpu.gpudata.free()
        mirror_mask_gpu.gpudata.free()
        slit = fftshift(np.sum((np.abs(slits))**2, axis=0))

        free, total = cuda.mem_get_info()
        print("***Free: %.2f percent" % (free / total * 100))

        return slit
Example #16
def cross_correlate(plan, normalize=True):
    #norm_template = sum(plan.mask.d_data)

    fft(plan.volume, plan.volume_fft, plan.fwd_plan)
    fft(plan.templatePadded, plan.template_fft, plan.fwd_plan)

    conj(plan.template_fft, overwrite=True)
    volume_fft = plan.volume_fft * plan.template_fft
    ifft(volume_fft, plan.ccc_map, plan.inv_plan, scale=True)

    plan.ccc_map /= np.float32(plan.p.get()) * plan.stdV
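cross_correlate implements circular cross-correlation in the Fourier domain: transform both inputs, conjugate the template spectrum, multiply, and inverse-transform. A NumPy-only spot check of that identity, independent of the GPU plumbing and of the normalisation applied above:

import numpy as np

volume = np.random.rand(8, 8, 8)
template = np.random.rand(8, 8, 8)

# FFT-based circular cross-correlation: cc[k] = sum_n volume[n] * template[n - k]
cc = np.fft.ifftn(np.fft.fftn(volume) * np.conj(np.fft.fftn(template))).real

# Direct evaluation of the same quantity for one arbitrary shift
shift = (1, 2, 3)
direct = np.sum(volume * np.roll(template, shift=shift, axis=(0, 1, 2)))
print(np.allclose(cc[shift], direct))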
Example #17
def FractShift(src, dest, dx, dy, PhaseShiftFunc):
    if dx == 0 and dy == 0:
        return
    global plan
    global FT

    Cache(src.shape)

    cu_fft.fft(src, FT, plan)
    PhaseShiftFunc(FT, dx, dy)
    cu_fft.ifft(FT, dest, plan, True)
Example #18
    def __init__(self, mesh, context=None):
        '''
        Args:
            mesh The mesh on which the solver will operate. The dimensionality
                 is deduced from mesh.dimension
        '''
        # create the mesh grid and compute the greens function on it
        self.mesh = mesh
        self._context = context
        mesh_shape = self.mesh.shape # nz, ny, (nx)
        mesh_shape2 = [2*n for n in mesh_shape] # 2*nz, 2*ny, (2*nx)
        mesh_distances = list(reversed(self.mesh.distances)) #dz, dy, dx
        self.fgreentr = gpuarray.empty(mesh_shape2,
                        dtype=np.complex128)
        self.tmpspace = gpuarray.zeros_like(self.fgreentr)
        sizeof_complex = np.dtype(np.complex128).itemsize

        # dimensionality function dispatch
        dim = self.mesh.dimension
        self._fgreen = getattr(self, '_fgreen' + str(dim) + 'd')
        self._mirror = getattr(self, '_mirror' + str(dim) + 'd')
        copy_fn = {'3d' : get_Memcpy3D_d2d, '2d': get_Memcpy2D_d2d}
        memcpy_nd = copy_fn[str(dim) + 'd']
        dim_args = self.mesh.shape
        self._cpyrho2tmp = memcpy_nd(
            src=None, dst=self.tmpspace, # None because src(rho) not yet known
            src_pitch=self.mesh.nx*sizeof_complex,
            dst_pitch=2*self.mesh.nx*sizeof_complex,
            dim_args=dim_args,
            itemsize=np.dtype(np.complex128).itemsize,
            src_height=self.mesh.ny,
            dst_height=2*self.mesh.ny)
        self._cpytmp2rho = memcpy_nd(
            src=self.tmpspace, dst=None, # None because dst(rho) not yet known
            src_pitch=2*self.mesh.nx*sizeof_complex,
            dst_pitch=self.mesh.nx*sizeof_complex,
            dim_args=dim_args,
            itemsize=np.dtype(np.complex128).itemsize,
            src_height=2*self.mesh.ny,
            dst_height=self.mesh.ny)
        mesh_arr = [-mesh_distances[i]/2 + np.arange(mesh_shape[i]+1)
                                            * mesh_distances[i]
                    for i in xrange(self.mesh.dimension)
                   ]
        # mesh_arr is [mz, my, mx]
        mesh_grids = np.meshgrid(*mesh_arr, indexing='ij')
        fgreen = self._fgreen(*mesh_grids)
        fgreen = self._mirror(fgreen)
        self.plan_forward = cu_fft.Plan(self.tmpspace.shape, in_dtype=np.complex128,
                                        out_dtype=np.complex128)
        self.plan_backward = cu_fft.Plan(self.tmpspace.shape, in_dtype=np.complex128,
                                         out_dtype=np.complex128)
        cu_fft.fft(gpuarray.to_gpu(fgreen), self.fgreentr, plan=self.plan_forward)
Example #19
 def rfft(a, nthreads=0):
     if is_memory_enough(a):
         arg = gpuarray.to_gpu(a)
         shape = [s for s in a.shape]
         shape[-1] = shape[-1]//2 + 1
         ctype = G_RTYPES[a.dtype.type]
         afg = gpuarray.empty(shape, ctype)
         plan = fft.Plan(a.shape, a.dtype.type, ctype)
         print(shape, a.dtype.type, ctype)
         fft.fft(arg, afg, plan)
         return afg.get()
     else:
         return _rfft(a)
Example #20
    def filter(self):
        import pycuda.gpuarray as gpuarray
        import skcuda.fft as cu_fft
        import skcuda.linalg as linalg
        import pycuda.driver as cuda
        from pycuda.tools import make_default_context
        cuda.init()
        context = make_default_context()
        device = context.get_device()
        signal = self.series[0]
        window = self.series[1]
        linalg.init()
        nfft = determine_size(len(signal) + len(window) - 1)
        # Move data to GPU
        sig_zero_pad = np.zeros(nfft, dtype=self.precision['float'])
        win_zero_pad = np.zeros(nfft, dtype=self.precision['float'])
        sig_gpu = gpuarray.zeros(sig_zero_pad.shape,
                                 dtype=self.precision['float'])
        win_gpu = gpuarray.zeros(win_zero_pad.shape,
                                 dtype=self.precision['float'])
        sig_zero_pad[0:len(signal)] = signal
        win_zero_pad[0:len(window)] = window
        sig_gpu.set(sig_zero_pad)
        win_gpu.set(win_zero_pad)

        # Plan forwards
        sig_fft_gpu = gpuarray.zeros(nfft, dtype=self.precision['complex'])
        win_fft_gpu = gpuarray.zeros(nfft, dtype=self.precision['complex'])
        sig_plan_forward = cu_fft.Plan(sig_fft_gpu.shape,
                                       self.precision['float'],
                                       self.precision['complex'])
        win_plan_forward = cu_fft.Plan(win_fft_gpu.shape,
                                       self.precision['float'],
                                       self.precision['complex'])
        cu_fft.fft(sig_gpu, sig_fft_gpu, sig_plan_forward)
        cu_fft.fft(win_gpu, win_fft_gpu, win_plan_forward)

        # Convolve
        out_fft = linalg.multiply(sig_fft_gpu, win_fft_gpu, overwrite=True)
        linalg.scale(2.0, out_fft)

        # Plan inverse
        out_gpu = gpuarray.zeros_like(out_fft)
        plan_inverse = cu_fft.Plan(out_fft.shape, self.precision['complex'],
                                   self.precision['complex'])
        cu_fft.ifft(out_fft, out_gpu, plan_inverse, True)
        out_np = np.zeros(len(out_gpu), self.precision['complex'])
        out_gpu.get(out_np)
        context.pop()
        return out_np
Example #21
def RunCorrection(neib,ROI,DifPad,rspace,kspace,exitWave,buffer_exitWave,finalObj,offsetx,offsety,objsizex,roisizex,CopyFromROI,ExitwaveAndBuffer,ApplyDifPad,cufftplan,aperture,fcachevector):
	Fs = []		
	for jpos in range(-neib,neib+1):
		for ipos in range(-neib,neib+1):
			CopyFromROI(rspace, finalObj, np.int32(offsety+jpos), np.int32(offsetx+ipos), roisizex, objsizex)
			
			ExitwaveAndBuffer(exitWave, buffer_exitWave, aperture, rspace) # Compute exitwaves
			cu_fft.fft(exitWave,kspace,cufftplan) # kspace = wave at detector
			ApplyDifPad(kspace,DifPad,fcachevector) # replace amplitudes.
			cu_fft.ifft(kspace,exitWave,cufftplan,True)	# new exitwave
				
			errori = np.sum(((exitWave-buffer_exitWave).__abs__()**2).get())
			Fs.append(errori+0)
	return GetMin(Fs,neib)
Example #22
File: fft.py Project: Sayam753/Theano-PyMC
        def thunk():
            input_shape = inputs[0][0].shape
            s = inputs[1][0]

            # Since padding is not supported, assert s matches input shape.
            assert (input_shape[1:] == s).all()

            # construct output shape
            output_shape = [input_shape[0]] + list(s)
            # DFT of real input is symmetric, no need to store
            # redundant coefficients
            output_shape[-1] = output_shape[-1] // 2 + 1
            # extra dimension with length 2 for real/imag
            output_shape += [2]
            output_shape = tuple(output_shape)

            z = outputs[0]

            # only allocate if there is no previous allocation of the
            # right size.
            if z[0] is None or z[0].shape != output_shape:
                z[0] = pygpu.zeros(output_shape,
                                   context=inputs[0][0].context,
                                   dtype="float32")

            input_pycuda = inputs[0][0]
            # I thought we'd need to change the type on output_pycuda
            # so it is complex64, but as it turns out skcuda.fft
            # doesn't really care either way and treats the array as
            # if it is complex64 anyway.
            output_pycuda = z[0]

            with input_pycuda.context:
                # only initialise plan if necessary
                if plan[0] is None or plan_input_shape[0] != input_shape:
                    plan_input_shape[0] = input_shape
                    plan[0] = fft.Plan(s,
                                       np.float32,
                                       np.complex64,
                                       batch=input_shape[0])

                # Sync GPU variables before computation
                input_pycuda.sync()
                output_pycuda.sync()

                fft.fft(input_pycuda, output_pycuda, plan[0])

                # Sync results to ensure output contains completed computation
                pycuda.driver.Context.synchronize()
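To make the shape logic of the thunk concrete (numbers below are made up): a batch of 4 real 32x32 inputs produces an output of shape (4, 32, 17, 2), because only 32//2 + 1 = 17 coefficients along the last transformed axis are non-redundant and a trailing axis of length 2 holds the real and imaginary parts:

input_shape = (4, 32, 32)                      # hypothetical batch of real 2D inputs
s = list(input_shape[1:])                      # transform sizes, as in the thunk above

output_shape = [input_shape[0]] + s
output_shape[-1] = output_shape[-1] // 2 + 1   # keep only non-redundant coefficients
output_shape += [2]                            # trailing axis for real/imag parts
print(tuple(output_shape))                     # -> (4, 32, 17, 2)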
Example #23
File: correlation.py Project: xmzzaa/PyTom
 def __init__(self, volume, template, gpu):
     self.gpu = gpu
     volume_gpu = gu.to_gpu(volume)
     self.fwd_plan = Plan(volume.shape, volume.dtype, np.complex64)
     self.volume_fft = gu.zeros_like(volume_gpu, dtype=np.complex64)
     fft(volume_gpu, self.volume_fft, self.fwd_plan)
     self.template_fft = gu.zeros_like(volume_gpu, dtype=np.complex64)
     self.ccc_map = gu.zeros_like(volume_gpu, dtype=np.float32)
     self.norm_volume = gu.prod(volume_gpu.shape)
     #self.scores = gu.zeros_like(volume_gpu, dtype=np.float32)
     #self.angles = gu.zeros_like(volume_gpu, dtype=np.float32)
     self.padded_volume = gu.zeros_like(volume_gpu, dtype=np.float32)
     del volume_gpu
     self.inv_plan = Plan(volume.shape, np.complex64, volume.dtype)
     self.template = Volume(template)
Example #24
def fft_2d(x, N, M, batch_size):
    # print('Testing in-place fft..')

    # for i in range(batch_size):
    #     x[i, :, :] = np.asarray(np.random.rand(N, M), np.complex64)
    x_gpu = gpuarray.to_gpu(x)
    # start = timer()

    plan = cu_fft.Plan((N, M), np.complex128, np.complex128, batch_size)

    cu_fft.fft(x_gpu, x_gpu, plan)

    # timeit2=timer()-start
    x_gpu1 = x_gpu.get()
    # print ('take time:',timeit2)
    return x_gpu1
Example #25
def fft2c2c_cuda(x, axes=(0, 1)):
    rank = len(axes)
    x = np.array(x).astype(np.complex64)
    x_gpu = gpuarray.to_gpu(x)
    xf_gpu = gpuarray.empty(x.shape, np.complex64)
    if len(x.shape) > rank:
        batch = np.prod(x.shape[rank:len(x.shape)])
        plan  = Plan(x.shape[0:rank], np.complex64, np.complex64, batch, None, 1, \
        np.array(x.shape[0:rank]).astype(np.int32), np.prod(x.shape[rank:len(x.shape)]), 1, \
        np.array(x.shape[0:rank]).astype(np.int32), np.prod(x.shape[rank:len(x.shape)]), 1 )
    else:
        batch = 1
        plan = Plan(x.shape[0:rank], np.complex64, np.complex64)
    fft(x_gpu, xf_gpu, plan)
    xf = xf_gpu.get()
    return xf
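A hedged usage sketch for fft2c2c_cuda: the extra Plan arguments above describe the input/output strides so that the first two axes of a C-contiguous stack are transformed, with the trailing axes acting as the batch. Assuming the module-level imports the snippet relies on (pycuda.autoinit, pycuda.gpuarray, and Plan/fft from skcuda.fft) are in place, it could be exercised like this; the NumPy comparison is only an illustrative check:

import numpy as np

# assumes fft2c2c_cuda and its imports (pycuda.autoinit, pycuda.gpuarray,
# and Plan/fft from skcuda.fft) are available in this module
x = np.random.rand(64, 64, 8) + 1j * np.random.rand(64, 64, 8)
xf = fft2c2c_cuda(x, axes=(0, 1))
print(np.allclose(xf, np.fft.fft2(x, axes=(0, 1)), atol=1e-2))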
Example #26
def fft2_gpu(x, fftshift=False):
    """
    R2C FFT
    This function produce an output that is compatible with numpy.fft.fft2.
    The input x is a 2D numpy array 
    """
    #converting the input array to single precision float
    if x.dtype != "float64":
        x = x.astype(np.float64)

    #get the shape of the initial numpy array
    n1, n2 = x.shape

    # from numpy array to GPUarray
    xgpu = gpuarray.to_gpu(x)

    #initialize output GPUarray
    # For real to complex transformations, the fft function computes
    # N/2+1 non-redundant coefficients of a length-N input signal
    ysize = n2 // 2 + 1
    y = gpuarray.empty((n1, ysize), np.complex128)

    #forward FFT
    plan_forward = cu_fft.Plan((n1, n2), np.float64, np.complex128)

    cu_fft.fft(xgpu, y, plan_forward)

    left = y.get()

    # To make the output array compatible with the numpy output we stack the
    # computed half-spectrum with its mirrored and conjugated counterpart.
    # Even and odd sized inputs need slightly different slicing.
    if n2 // 2 == n2 / 2:
        # even
        right = np.conj(np.roll(np.fliplr(np.flipud(left))[:, 1:-1], 1, axis=0))
    else:
        # odd
        right = np.conj(np.roll(np.fliplr(np.flipud(left))[:, :-1], 1, axis=0))
    print(right.shape)
    print(left.shape)
    # get a numpy array back compatible with np.fft
    if fftshift is False:
        yout = np.hstack((left, right))
    else:
        yout = np.fft.fftshift(np.hstack((left, right)))

    return yout
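The flip/roll/stack logic above relies on the Hermitian symmetry of a real input's spectrum, F[k1, k2] = conj(F[-k1 mod n1, -k2 mod n2]), which is why the mirrored half must also be conjugated. A small NumPy-only check of that identity, independent of the GPU code:

import numpy as np

a = np.random.rand(6, 8)                 # arbitrary real 2D input
F = np.fft.fft2(a)
n1, n2 = a.shape
k1, k2 = 2, 5                            # arbitrary frequency pair
print(np.allclose(F[k1, k2], np.conj(F[(-k1) % n1, (-k2) % n2])))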
Example #27
 def test_multiple_streams(self):
     x = np.asarray(np.random.rand(self.N), np.float32)
     xf = np.fft.rfftn(x)
     y = np.asarray(np.random.rand(self.N), np.float32)
     yf = np.fft.rfftn(y)
     x_gpu = gpuarray.to_gpu(x)
     y_gpu = gpuarray.to_gpu(y)
     xf_gpu = gpuarray.empty(self.N//2+1, np.complex64)
     yf_gpu = gpuarray.empty(self.N//2+1, np.complex64)
     stream0 = drv.Stream()
     stream1 = drv.Stream()
     plan1 = fft.Plan(x.shape, np.float32, np.complex64, stream=stream0)
     plan2 = fft.Plan(y.shape, np.float32, np.complex64, stream=stream1)
     fft.fft(x_gpu, xf_gpu, plan1)
     fft.fft(y_gpu, yf_gpu, plan2)
     assert np.allclose(xf, xf_gpu.get(), atol=atol_float32)
     assert np.allclose(yf, yf_gpu.get(), atol=atol_float32)
Example #28
 def test_multiple_streams(self):
     x = np.asarray(np.random.rand(self.N), np.float32)
     xf = np.fft.rfftn(x)
     y = np.asarray(np.random.rand(self.N), np.float32)
     yf = np.fft.rfftn(y)
     x_gpu = gpuarray.to_gpu(x)
     y_gpu = gpuarray.to_gpu(y)
     xf_gpu = gpuarray.empty(self.N // 2 + 1, np.complex64)
     yf_gpu = gpuarray.empty(self.N // 2 + 1, np.complex64)
     stream0 = drv.Stream()
     stream1 = drv.Stream()
     plan1 = fft.Plan(x.shape, np.float32, np.complex64, stream=stream0)
     plan2 = fft.Plan(y.shape, np.float32, np.complex64, stream=stream1)
     fft.fft(x_gpu, xf_gpu, plan1)
     fft.fft(y_gpu, yf_gpu, plan2)
     assert np.allclose(xf, xf_gpu.get(), atol=atol_float32)
     assert np.allclose(yf, yf_gpu.get(), atol=atol_float32)
Example #29
def filter_fft_cuda(signal: np.array, window: np.array, prec: dict):
    """
    Computes the low_pass filter using the numpy pycuda method.
    Also auto-inits the pycuda library
    :param signal: The input series
    :param window: The input window
    :param prec: The precision entry
    :return: The filtered signal
    """
    import pycuda.autoinit  # Here because it initialises a new cuda environment every trial.
    import pycuda.gpuarray as gpuarray
    import skcuda.fft as cu_fft
    import skcuda.linalg as linalg
    linalg.init()
    nfft = determine_size(len(signal) + len(window) - 1)
    # Move data to GPU
    sig_zero_pad = np.zeros(nfft, dtype=prec['float'])
    win_zero_pad = np.zeros(nfft, dtype=prec['float'])
    sig_gpu = gpuarray.zeros(sig_zero_pad.shape, dtype=prec['float'])
    win_gpu = gpuarray.zeros(win_zero_pad.shape, dtype=prec['float'])
    sig_zero_pad[0:len(signal)] = signal
    win_zero_pad[0:len(window)] = window
    sig_gpu.set(sig_zero_pad)
    win_gpu.set(win_zero_pad)

    # Plan forwards
    sig_fft_gpu = gpuarray.zeros(nfft, dtype=prec['complex'])
    win_fft_gpu = gpuarray.zeros(nfft, dtype=prec['complex'])
    sig_plan_forward = cu_fft.Plan(sig_fft_gpu.shape, prec['float'],
                                   prec['complex'])
    win_plan_forward = cu_fft.Plan(win_fft_gpu.shape, prec['float'],
                                   prec['complex'])
    cu_fft.fft(sig_gpu, sig_fft_gpu, sig_plan_forward)
    cu_fft.fft(win_gpu, win_fft_gpu, win_plan_forward)

    # Convolve
    out_fft = linalg.multiply(sig_fft_gpu, win_fft_gpu, overwrite=True)
    linalg.scale(2.0, out_fft)

    # Plan inverse
    out_gpu = gpuarray.zeros_like(out_fft)
    plan_inverse = cu_fft.Plan(out_fft.shape, prec['complex'], prec['complex'])
    cu_fft.ifft(out_fft, out_gpu, plan_inverse, True)
    out_np = np.zeros(len(out_gpu), prec['complex'])
    out_gpu.get(out_np)
    return out_np
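filter_fft_cuda is FFT-based convolution: both inputs are zero-padded to cover the full convolution length, multiplied in the frequency domain, and inverse-transformed (the GPU version pads further to whatever determine_size returns and applies an extra factor of 2 via linalg.scale). A NumPy-only sketch of the underlying identity, without those extra steps:

import numpy as np

signal = np.random.rand(1000)
window = np.random.rand(31)              # e.g. FIR filter taps
nfft = len(signal) + len(window) - 1     # full linear-convolution length

sig_pad = np.zeros(nfft); sig_pad[:len(signal)] = signal
win_pad = np.zeros(nfft); win_pad[:len(window)] = window

# Pointwise multiplication of the padded spectra equals linear convolution.
out = np.fft.ifft(np.fft.fft(sig_pad) * np.fft.fft(win_pad)).real
print(np.allclose(out, np.convolve(signal, window)))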
Example #30
 def setup_mesh(self, mesh):
     '''Create the meshgrid, compute and store integrated Green's
     function from mesh distances.
     Only accepts meshes with same shape as self.mesh .
     '''
     assert (mesh.shape == self.mesh.shape)
     self.mesh = mesh
     mesh_arr = [
         -mesh.distances[i]/2 +
         np.arange(mesh.shape_r[i] + 1.) * mesh.distances[i]
         for i in range(mesh.dimension)[::-1]
        ]
     # mesh_arr is [mz, my, mx]
     mesh_grids = np.meshgrid(*mesh_arr, indexing='ij')
     fgreen = self._fgreen(*mesh_grids)
     fgreen = self._mirror(fgreen)
     cu_fft.fft(gpuarray.to_gpu(fgreen), self.fgreentr,
                plan=self.plan_forward)
Example #31
def fft2_gpu(x, fftshift=False):
    #code taken verbatim from https://www.idtools.com.au/gpu-accelerated-fft-compatible-numpy/
    ''' This function produces an output that is
    compatible with numpy.fft.fft2.
    The input x is a 2D numpy array'''

    # Convert the input array to single precision float
    if x.dtype != 'float32':
        x = x.astype('float32')

    # Get the shape of the initial numpy array
    n1, n2 = x.shape

    # From numpy array to GPUarray
    xgpu = gpuarray.to_gpu(x)

    # Initialise output GPUarray
    # For real to complex transformations, the fft function computes
    # N/2+1 non-redundant coefficients of a length-N input signal.
    y = gpuarray.empty((n1, n2 // 2 + 1), np.complex64)

    # Forward FFT
    plan_forward = cu_fft.Plan((n1, n2), np.float32, np.complex64)
    cu_fft.fft(xgpu, y, plan_forward)

    left = y.get()

    # To make the output array compatible with the numpy output
    # we need to stack horizontally the y.get() array and its flipped,
    # conjugated version. We must take care of handling even or odd sized
    # arrays to get the correct size of the final array.
    if n2 // 2 == n2 / 2:
        right = np.conj(np.roll(np.fliplr(np.flipud(left))[:, 1:-1], 1, axis=0))
    else:
        right = np.conj(np.roll(np.fliplr(np.flipud(left))[:, :-1], 1, axis=0))

    # Get a numpy array back compatible with np.fft
    if fftshift is False:
        yout = np.hstack((left, right))
    else:
        yout = np.fft.fftshift(np.hstack((left, right)))

    return yout.astype('complex128')
Example #32
    def Simulate(self, period, width):

        period = np.float64(period)
        width = np.float64(width)

        for i in range(len(self.DatFiles)):
            self.DatFiles[i].gpu_pulsar_signal = self.DatFiles[i].gpu_time - 0*period
            self.MakeSignal(self.DatFiles[i].gpu_pulsar_signal, period, ((period*width)**2),
                            grid=(self.DatFiles[i].Tblocks, 1), block=(self.DatFiles[i].block_size, 1, 1))

            s = self.DatFiles[i].gpu_pulsar_signal.get()
            np.savetxt("realsig.dat", zip(np.arange(0, 10000), s[:10000]))

            fft.fft(self.DatFiles[i].gpu_pulsar_signal, self.DatFiles[i].gpu_pulsar_fft, self.DatFiles[i].Plan)
            ranPhases = np.random.uniform(0, 1, len(self.DatFiles[i].gpu_pulsar_fft))
            CompRan = np.cos(2*np.pi*ranPhases) + 1j*np.sin(2*np.pi*ranPhases)
            CompRan[0] = 1 + 0j
            OComp = self.DatFiles[i].gpu_pulsar_fft.get()
            NComp = OComp*CompRan
            s = np.fft.irfft(NComp)
            np.savetxt("ransig.dat", zip(np.arange(0, 10000), s[:10000]))
Example #33
def fft2_gpu_c2c(x, fftshift=True):
    """
    C2C FFT
    This function produce an output that is compatible with numpy.fft.fft2.
    The input x is a 2D numpy array 
    """
    if x.dtype != np.complex128:
        x = x.astype(np.complex128)
    #get the shape of the initial numpy array
    n1, n2 = x.shape
    xgpu = gpuarray.to_gpu(x)
    #Initialise empty output GPUarray
    y = gpuarray.empty((n1, n2), np.complex128)
    #FFT
    plan_forward = cu_fft.Plan((n1, n2), np.complex128, np.complex128)
    cu_fft.fft(xgpu, y, plan_forward)

    #Must divide by the total number of pixels in the image to get the normalization right
    yout = y.get() / n1 / n2
    if fftshift:
        yout = np.fft.fftshift(yout)
    return yout
Example #34
 def poisson_solve(self, rho):
     ''' Solve the poisson equation with the given charge distribution
     Args:
         rho: Charge distribution (same dimensions as mesh)
     Returns:
         Phi (same dimensions as rho)
     '''
     rho = rho.astype(np.complex128)
     self._cpyrho2tmp.set_src_device(rho.gpudata)
     self._cpytmp2rho.set_dst_device(rho.gpudata)
     # set to 0 since it might be filled with the old potential
     self.tmpspace.fill(0)
     self._cpyrho2tmp()
     cu_fft.fft(self.tmpspace, self.tmpspace, plan=self.plan_forward)
     cu_fft.ifft(self.tmpspace * self.fgreentr, self.tmpspace,
                 plan=self.plan_backward)
     # store the result in the rho gpuarray to save space
     self._cpytmp2rho()
     # scale (cuFFT is unscaled)
     phi = rho.real/(2**self.mesh.dimension * self.mesh.n_nodes)
     phi *= self.mesh.volume_elem/(2**(self.mesh.dimension-1)*np.pi*epsilon_0)
     return phi
Example #35
File: cuutils.py Project: chungheng/neural
def cu_lpf(stimulus, dt, freq):
    """
    CUDA implementation of low-pass-filter.

    stimulus: ndarray
        The input to be filtered.
    dt: float
        The sampling interval of the input.
    freq: float
        The cut-off frequency of the low pass filter.
    """
    num = len(stimulus)
    num_fft = int(num / 2 + 1)
    idtype = stimulus.dtype
    odtype = np.complex128 if idtype == np.float64 else np.complex64

    if not isinstance(stimulus, gpuarray.GPUArray):
        d_stimulus = gpuarray.to_gpu(stimulus)
    else:
        d_stimulus = stimulus

    plan = Plan(stimulus.shape, idtype, odtype)
    d_fstimulus = gpuarray.empty(num_fft, odtype)
    fft(d_stimulus, d_fstimulus, plan)

    df = 1.0 / dt / num
    idx = int(freq // df)

    unit = int(d_fstimulus.dtype.itemsize / 4)
    offset = int(d_fstimulus.gpudata) + d_fstimulus.dtype.itemsize * idx

    cuda.memset_d32(offset, 0, unit * (num_fft - idx))

    plan = Plan(stimulus.shape, odtype, idtype)
    d_lpf_stimulus = gpuarray.empty(num, idtype)
    ifft(d_fstimulus, d_lpf_stimulus, plan, False)

    return d_lpf_stimulus.get()
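The core of cu_lpf is a brick-wall low-pass filter: forward FFT, zero every bin at or above the cut-off frequency, inverse FFT (note that the ifft above is called with scale=False, so the GPU result is left unnormalised). A CPU reference using NumPy's real FFT, with an illustrative name and normalised output:

import numpy as np

def lpf_numpy(stimulus, dt, freq):
    """Zero all FFT bins at or above `freq`, then transform back (normalised)."""
    num = len(stimulus)
    fstimulus = np.fft.rfft(stimulus)
    df = 1.0 / dt / num                  # frequency resolution, as in cu_lpf
    idx = int(freq // df)
    fstimulus[idx:] = 0                  # brick-wall cut-off
    return np.fft.irfft(fstimulus, n=num)

filtered = lpf_numpy(np.random.rand(1024), dt=1e-4, freq=100.0)
print(filtered.shape)                    # (1024,)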
Example #36
def funcfftw(F, *args, **kwargs):
    """funcfftw(F, *args, **kwargs) -> numpy.2darray
    apply 2D Fourier transform

    Parameters
    ----------
    F      : numpy.2darray
    args   : options
    kwargs : options
    """
    if found_pyfftw is True and kwargs.get('fft_type') == 'fftw':
        pyfftw.forget_wisdom()
        func = pyfftw.builders.fft2(F,
                                    overwrite_input=True,
                                    planner_effort='FFTW_ESTIMATE',
                                    threads=CPU_COUNT)
        return func()
    elif found_cufft is True and kwargs.get('fft_type') == 'cufft':
        x_gpu = gpuarray.to_gpu(F.astype(np.complex64))
        xf_gpu = gpuarray.empty(F.shape, np.complex64)
        cu_fft.fft(x_gpu, xf_gpu, args[0])
        return xf_gpu.get()
    else:
        return fft2(F)
Example #37
    def __init__(self, mesh, context=None, save_memory=True):
        '''
        Args:
            mesh The mesh on which the solver will operate. The dimensionality
                 is deduced from mesh.dimension
            save_memory: Decide whether to store all slices of the transformed
                 greens function (more memory but faster) or save 1 slice only
                 (saves memory but slower, default)
        '''
        # create the mesh grid and compute the greens function on it
        if (mesh.dimension != 3):
            print('Error: Use a 3d mesh for the 2.5d algorithm! Abort.')
            return None
        self.is_25D = True

        self.mesh = mesh
        self._context = context
        mesh_shape = self.mesh.shape # nz, ny, (nx)
        nz, ny, nx = mesh_shape
        mesh_shape2 = [2*n for n in mesh_shape] # 2*nz, 2*ny, (2*nx)
        mesh_distances = list(reversed(self.mesh.distances)) #dz, dy, dx
        if save_memory:
            self.fgreentr = gpuarray.empty((2*ny, 2*nx),
                            dtype=np.complex128)
            self._solve_kernel = self._solve_kernel_slow
        else:
            self.fgreentr = gpuarray.empty((nz, 2*ny, 2*nx),
                            dtype=np.complex128)
            self._solve_kernel = self._solve_kernel_fast
        self.tmpspace = gpuarray.zeros((nz, 2*ny, 2*nx), dtype=np.complex128)
        sizeof_complex = np.dtype(np.complex128).itemsize

        # dimensionality function dispatch
        self._fgreen = getattr(self, '_fgreen25d')
        self._mirror = getattr(self, '_mirror2d')
        #copy_fn = {'3d' : get_Memcpy3D_d2d, '2d': get_Memcpy2D_d2d}
        memcpy_nd = get_Memcpy3D_d2d
        #memcpy_nd = copy_fn[str(dim) + 'd']
        dim_args = self.mesh.shape
        self._cpyrho2tmp = memcpy_nd(
            src=None, dst=self.tmpspace, # None because src(rho) not yet known
            src_pitch=self.mesh.nx*sizeof_complex,
            dst_pitch=2*self.mesh.nx*sizeof_complex,
            dim_args=dim_args,
            itemsize=np.dtype(np.complex128).itemsize,
            src_height=self.mesh.ny,
            dst_height=2*self.mesh.ny)
        self._cpytmp2rho = memcpy_nd(
            src=self.tmpspace, dst=None, # None because dst(rho) not yet known
            src_pitch=2*self.mesh.nx*sizeof_complex,
            dst_pitch=self.mesh.nx*sizeof_complex,
            dim_args=dim_args,
            itemsize=np.dtype(np.complex128).itemsize,
            src_height=2*self.mesh.ny,
            dst_height=self.mesh.ny)

        mesh_arr = [-mesh_distances[i]/2 + np.arange(mesh.shape[i]+1)
                                            * mesh_distances[i]
                    for i in [1,2]
                   ]
        # mesh_arr is [mz, my, mx]
        mesh_grids = np.meshgrid(*mesh_arr, indexing='ij') #choose my, mx
        fgreen2 = self._fgreen(*mesh_grids)
        fgreen2 = self._mirror(fgreen2)
        fgreen = np.empty(shape=(mesh.nz, 2*mesh.ny, 2*mesh.nx),
           dtype=np.complex128)
        for nn in xrange(mesh.nz):
           fgreen[nn,:,:] = fgreen2
        # tiling in the 3rd dimension leads to a MemoryError; it uses a huge amount of memory!
        #fgreen = np.tile(fgreen, (mesh.nz, 2*mesh.ny, 2*mesh.nx))

        self.plan_forward = cu_fft.Plan([2*self.mesh.ny, 2*self.mesh.nx],
            in_dtype=np.complex128, out_dtype=np.complex128, batch=self.mesh.nz)
        self.plan_backward = cu_fft.Plan([2*self.mesh.ny, 2*self.mesh.nx],
            in_dtype=np.complex128, out_dtype=np.complex128, batch=self.mesh.nz)
        if save_memory:
            plan_2d = cu_fft.Plan([2*self.mesh.ny, 2*self.mesh.nx],
                in_dtype=np.complex128, out_dtype=np.complex128)
            cu_fft.fft(gpuarray.to_gpu(fgreen2), self.fgreentr, plan=plan_2d)
        else:
            cu_fft.fft(gpuarray.to_gpu(fgreen), self.fgreentr,
                plan=self.plan_forward)
Example #38
import pycuda.autoinit
import pycuda.gpuarray as gpuarray
import numpy as np

import skcuda.fft as cu_fft

print('Testing fft/ifft..')
N = 1024
M = N//2

x = np.asarray(np.random.rand(N, M), np.float32)
xf = np.fft.fft2(x)
y = np.real(np.fft.ifft2(xf))

x_gpu = gpuarray.to_gpu(x)
xf_gpu = gpuarray.empty((x.shape[0], x.shape[1]//2+1), np.complex64)
plan_forward = cu_fft.Plan(x_gpu.shape, np.float32, np.complex64)
cu_fft.fft(x_gpu, xf_gpu, plan_forward)

y_gpu = gpuarray.empty_like(x_gpu)
plan_inverse = cu_fft.Plan(x_gpu.shape, np.complex64, np.float32)
cu_fft.ifft(xf_gpu, y_gpu, plan_inverse, True)

print('Success status: %r' % np.allclose(y, y_gpu.get(), atol=1e-6))

print('Testing in-place fft..')
x = np.asarray(np.random.rand(N, M) + 1j * np.random.rand(N, M), np.complex64)
x_gpu = gpuarray.to_gpu(x)

plan = cu_fft.Plan(x_gpu.shape, np.complex64, np.complex64)
cu_fft.fft(x_gpu, x_gpu, plan)

cu_fft.ifft(x_gpu, x_gpu, plan, True)
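The in-place fft followed by a scaled ifft should reproduce x up to floating-point error; a closing check in the same style as the earlier one (not part of the snippet above) would be:

print('Success status: %r' % np.allclose(x, x_gpu.get(), atol=1e-6))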
Example #39
def process_audio_cuda(data):
    global cs, csa, csa_first

    if csa_first == True:
        prepare_audio_filters()
        prepare_audio_cuda()
        csa_first = False

    fdata = np.float32(data)
    gpudata = gpuarray.to_gpu(fdata)

    fft.fft(gpudata, cs["fft1_out"], cs["plan1"])

    cs["left_fft1"] = (cs["fft1_out"] * cs["filt_audio_left"])[
        0 : (ablocklen // 2) + 1
    ]  # [0:blocklen])[0:(ablocklen//2)+1]
    cs["right_fft1"] = (cs["fft1_out"] * cs["filt_audio_right"])[0 : (ablocklen // 2) + 1]

    fft.ifft(cs["left_fft1"], cs["fm_left"], cs["plan1i"], True)
    fft.ifft(cs["right_fft1"], cs["fm_right"], cs["plan1i"], True)

    cs["doanglediff_mac"](
        cs["left_clipped"],
        cs["fm_left"],
        np.float32((afreq_hz / 1.0 / np.pi)),
        np.float32(-SysParams["audio_lfreq"]),
        block=(1024, 1, 1),
        grid=(ablocklenk, 1),
    )
    cs["doanglediff_mac"](
        cs["right_clipped"],
        cs["fm_right"],
        np.float32((afreq_hz / 1.0 / np.pi)),
        np.float32(-SysParams["audio_rfreq"]),
        block=(1024, 1, 1),
        grid=(ablocklenk, 1),
    )

    fft.fft(cs["left_clipped"], cs["left_fft2"], cs["plan2"])
    fft.fft(cs["right_clipped"], cs["right_fft2"], cs["plan2"])

    cs["left_fft2"] *= cs["filt_audiolpf"]
    cs["right_fft2"] *= cs["filt_audiolpf"]

    fft.ifft(cs["left_fft2"], cs["left_out"], cs["plan2i"], True)
    fft.ifft(cs["right_fft2"], cs["right_out"], cs["plan2i"], True)

    aclip = 256

    outlen = ablocklen

    cs["doaudioscale"](
        cs["scaledout"],
        cs["left_out"],
        cs["right_out"],
        np.float32(20),
        np.float32(0),
        block=(32, 1, 1),
        grid=(outlen // 32, 1),
    )

    output = cs["scaledout"].get()[aclip:-aclip]

    return output, len(output) * 80 / 2

    plt.plot(cs["scaledout"].get())

    # 	plt.plot(cs['right_clipped'].get()[768:-768])
    # 	plt.plot(cs['right_out'].get()[768:-768] + 100000)
    plt.show()
    exit()
Example #40
 def _solve_kernel_fast(self):
     '''Fast kernel, use when save_memory is False
     '''
     cu_fft.fft(self.tmpspace, self.tmpspace, plan=self.plan_forward)
     cu_fft.ifft(self.tmpspace * self.fgreentr, self.tmpspace,
                 plan=self.plan_backward)
Example #41
File: cufft.py Project: RorySmith/pycbc
 def execute(self):
     cu_fft.fft(self.invec, self.outvec, self.plan)
Example #42
File: GPUTest.py Project: danielct/Honours
modSquared = mod.get_function("modSquared")
psiNonlinear = mod2.get_function("test")
modSquared.prepare(["P", "P", "I"])
psiNonlinear.prepare("FFFPPPI")
block = (16, 16, 1)
grid = (64, 64)

for n in np.arange(N_RUNS):
    start = time.time()

    for step in xrange(N_TIMESTEPS):
        # print step
        # Implementing split-step method
        # Update wavefunction and reservoir, record density
        cu_fft.fft(psi_gpu, psi_gpu, plan_forward)
        psi_gpu *= kineticFactorHalf_gpu
        cu_fft.ifft(psi_gpu, psi_gpu, plan_inverse, scale=True)

        # currentDensity_gpu = abs(psi_gpu) ** 2
        # currentDensity_gpu = psi_gpu.real **2 + psi_gpu.imag ** 2
        currentDensity_gpu = (psi_gpu * psi_gpu.conj()).real
        # modSquared.prepared_call(grid, block, psi_gpu.gpudata,
        #                          currentDensity_gpu.gpudata, 1024)
        # n_gpu *= cumath.exp(-gammaRdt_gpu + Rdt_gpu * currentDensity_gpu)
        n_gpu *= cumath.exp(misc.add(- gammaRdt_gpu,
                                     - misc.multiply(Rdt_gpu, currentDensity_gpu)))
        n_gpu += Pdt_gpu
        psi_gpu *= cumath.exp(
            misc.add(
                misc.add(misc.multiply(expFactorPolFirst_gpu, n_gpu),
Example #43
File: cufft.py Project: RorySmith/pycbc
def fft(invec, outvec, prec, itype, otype):
    cuplan = _get_fwd_plan(invec.dtype, outvec.dtype, len(invec))
    cu_fft.fft(invec.data, outvec.data, cuplan)
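_get_fwd_plan is not shown in this example; it presumably caches one forward plan per (input dtype, output dtype, length) combination. A purely illustrative sketch of such a cache, not PyCBC's actual implementation:

import numpy as np
import skcuda.fft as cu_fft

_forward_plans = {}   # hypothetical cache: (in dtype, out dtype, length) -> Plan

def _get_fwd_plan_sketch(idtype, odtype, inlen):
    """Return a cached forward Plan, creating it on first use."""
    key = (np.dtype(idtype), np.dtype(odtype), inlen)
    if key not in _forward_plans:
        _forward_plans[key] = cu_fft.Plan(inlen, np.dtype(idtype).type,
                                          np.dtype(odtype).type)
    return _forward_plans[key]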