Example #1
        def thunk():
            input_shape = inputs[0][0].shape
            output_shape = input_shape

            z = outputs[0]

            # only allocate if there is no previous allocation of the
            # right size.
            if z[0] is None or z[0].shape != output_shape:
                z[0] = CudaNdarray.zeros(output_shape)

            input_pycuda = to_gpuarray(inputs[0][0])
            # input_pycuda is a float32 array with an extra dimension,
            # but will be interpreted by scikits.cuda as a complex64
            # array instead.
            output_pycuda = to_gpuarray(z[0])

            # only initialise plan if necessary
            if plan[0] is None or plan_input_shape[0] != input_shape:
                plan_input_shape[0] = input_shape
                plan[0] = fft.Plan(output_shape[1:-1], np.complex64, np.complex64,
                                   batch=output_shape[0])

            fft.ifft(input_pycuda, output_pycuda, plan[0])
            compute_map[node.outputs[0]][0] = True
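
The comment in the thunk above leans on a memory-layout fact: a float32 array whose last axis has length 2 (real, imaginary) occupies exactly the same bytes as a complex64 array without that axis. A minimal CPU-only sketch of that reinterpretation in NumPy, with made-up shapes:

import numpy as np

# (batch, n, 2) float32: the trailing axis holds (real, imag) pairs
x = np.random.rand(4, 8, 2).astype(np.float32)

# view the same bytes as complex64; the trailing axis collapses
x_c = x.view(np.complex64)[..., 0]

assert x_c.shape == (4, 8)
assert np.allclose(x_c.real, x[..., 0])
assert np.allclose(x_c.imag, x[..., 1])
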
Example #2
def fft_multiply_repeated(h_fft, x, cuda_dict=dict(use_cuda=False)):
    """Do FFT multiplication by a filter function (possibly using CUDA)

    Parameters
    ----------
    h_fft : 1-d array or gpuarray
        The filtering array to apply.
    x : 1-d array
        The array to filter.
    cuda_dict : dict
        Dictionary constructed using setup_cuda_multiply_repeated().

    Returns
    -------
    x : 1-d array
        Filtered version of x.
    """
    if not cuda_dict["use_cuda"]:
        # do the fourier-domain operations
        x = np.real(ifft(h_fft * fft(x), overwrite_x=True)).ravel()
    else:
        # do the fourier-domain operations, results in second param
        cuda_dict["x"].set(x.astype(np.float64))
        cudafft.fft(cuda_dict["x"], cuda_dict["x_fft"], cuda_dict["fft_plan"])
        cuda_multiply_inplace_c128(h_fft, cuda_dict["x_fft"])
        # If we wanted to do it locally instead of using our own kernel:
        # cuda_seg_fft.set(cuda_seg_fft.get() * h_fft)
        cudafft.ifft(cuda_dict["x_fft"], cuda_dict["x"], cuda_dict["ifft_plan"], False)
        x = np.array(cuda_dict["x"].get(), dtype=x.dtype, subok=True, copy=False)
    return x
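
The CPU branch above is plain frequency-domain filtering: multiplying spectra corresponds to circular convolution in the time domain. A short sketch of that identity (no CUDA required), using scipy.fftpack like the function itself:

import numpy as np
from scipy.fftpack import fft, ifft

rng = np.random.RandomState(0)
x = rng.randn(64)
h = rng.randn(64)
h_fft = fft(h)

# the CPU path of fft_multiply_repeated
y = np.real(ifft(h_fft * fft(x))).ravel()

# the same result as an explicit circular convolution
y_ref = np.array([np.dot(h, np.roll(x[::-1], k + 1)) for k in range(64)])
assert np.allclose(y, y_ref)
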
Example #3
        def thunk():
            input_shape = inputs[0][0].shape

            # construct output shape
            # chop off the extra length-2 dimension for real/imag
            output_shape = list(input_shape[:-1])
            # restore full signal length
            output_shape[-1] = (output_shape[-1] - 1) * 2
            output_shape = tuple(output_shape)

            z = outputs[0]

            # only allocate if there is no previous allocation of the
            # right size.
            if z[0] is None or z[0].shape != output_shape:
                z[0] = CudaNdarray.zeros(output_shape)

            input_pycuda = to_gpuarray(inputs[0][0])
            # input_pycuda is a float32 array with an extra dimension,
            # but will be interpreted by scikits.cuda as a complex64
            # array instead.
            output_pycuda = to_gpuarray(z[0])

            # only initialise plan if necessary
            if plan[0] is None or plan_input_shape[0] != input_shape:
                plan_input_shape[0] = input_shape
                plan[0] = fft.Plan(output_shape[1:],
                                   np.complex64,
                                   np.float32,
                                   batch=output_shape[0])

            fft.ifft(input_pycuda, output_pycuda, plan[0])
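
The shape arithmetic above inverts the real-FFT packing rule: a real signal of even length n keeps n // 2 + 1 complex bins, so the full length is (bins - 1) * 2. Verifying the rule in NumPy:

import numpy as np

n = 16                                # even-length real signal
x = np.random.rand(n).astype(np.float32)

xf = np.fft.rfft(x)                   # half spectrum
assert xf.shape[0] == n // 2 + 1

recovered_n = (xf.shape[0] - 1) * 2   # the thunk's output_shape rule
assert recovered_n == n
assert np.allclose(np.fft.irfft(xf, n=recovered_n), x, atol=1e-5)
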
Example #4
def gpu_c2r_ifft(in1, is_gpuarray=False, store_on_gpu=False):
    """
    This function uses the scikits.cuda FFT implementation to take the complex-to-real IFFT on the GPU.

    INPUTS:
    in1             (no default):       The array on which the IFFT is to be performed.
    is_gpuarray     (default=False):    Boolean specifier for whether or not input is on the gpu.
    store_on_gpu    (default=False):    Boolean specifier for whether the result is to be left on the gpu or not.

    OUTPUTS:
    gpu_out1                            The gpu array containing the result.
    OR
    gpu_out1.get()                      The result from the gpu array.
    """

    if is_gpuarray:
        gpu_in1 = in1
    else:
        gpu_in1 = gpuarray.to_gpu_async(in1.astype(np.complex64))

    output_size = np.array(in1.shape)
    output_size[1] = 2*(output_size[1]-1)

    gpu_out1 = gpuarray.empty([output_size[0],output_size[1]], np.float32)
    gpu_plan = Plan(output_size, np.complex64, np.float32)
    ifft(gpu_in1, gpu_out1, gpu_plan)
    scale_fft(gpu_out1)

    if store_on_gpu:
        return gpu_out1
    else:
        return gpu_out1.get()
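
The scale_fft(gpu_out1) call is needed because cuFFT's inverse transform is unnormalized: a complex-to-real round trip multiplies the data by the transform length. Assuming scale_fft divides by that length, the convention can be reproduced on the CPU with NumPy's norm="forward" option (NumPy 1.20+):

import numpy as np

n = 8
x = np.random.rand(n)

xf = np.fft.rfft(x)                    # default "backward" norm
y = np.fft.irfft(xf, norm="forward")   # unnormalized inverse, like cuFFT
assert np.allclose(y, n * x)
assert np.allclose(y / n, x)           # the rescaling scale_fft performs
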
Example #5
 def test_ifft_complex64_to_float32_1d(self):
     x = np.asarray(np.random.rand(self.N), np.float32)
     xf = np.asarray(np.fft.rfftn(x), np.complex64)
     xf_gpu = gpuarray.to_gpu(xf)
     x_gpu = gpuarray.empty(self.N, np.float32)
     plan = fft.Plan(x.shape, np.complex64, np.float32)
     fft.ifft(xf_gpu, x_gpu, plan, True)
     assert np.allclose(x, x_gpu.get(), atol=atol_float32)
Example #6
 def test_ifft_complex128_to_float64(self):
     x = np.asarray(np.random.rand(self.N), np.float64)
     xf = np.asarray(np.fft.fft(x), np.complex128)
     xf_gpu = gpuarray.to_gpu(xf[0:self.N // 2 + 1])
     x_gpu = gpuarray.empty(self.N, np.float64)
     plan = fft.Plan(x.shape, np.complex128, np.float64)
     fft.ifft(xf_gpu, x_gpu, plan, True)
     assert np.allclose(x, x_gpu.get(), atol=atol_float64)
Example #7
 def irfft2(self, i, o=None, cache=True):
     shape = i.shape[:-2]
     cshape = i.shape[-2:]
     rshape = (cshape[0], (cshape[1] - 1) * 2)
     batch = np.prod(shape, dtype=int)
     plan = self.get_plan(cache, rshape, self.ctype, self.rtype, batch)
     if o is None:
         o = self.context.empty(shape + rshape, self.rtype)
     cu_fft.ifft(i, o, plan, scale=True)
     return o
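
The rshape arithmetic in irfft2() above mirrors the 2-D real-FFT packing: only the last axis is halved, to cshape[1] = rshape[1] // 2 + 1 bins. A NumPy check of the same bookkeeping on a batch of 2-D fields:

import numpy as np

x = np.random.rand(4, 6, 8)              # batch of 4 real 2-D fields
xf = np.fft.rfftn(x, axes=(-2, -1))      # halved along the last axis only
assert xf.shape == (4, 6, 8 // 2 + 1)

cshape = xf.shape[-2:]
rshape = (cshape[0], (cshape[1] - 1) * 2)   # irfft2's rule
assert rshape == (6, 8)
assert np.allclose(np.fft.irfftn(xf, s=rshape, axes=(-2, -1)), x)
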
Example #8
    def test_ifft_complex64_to_float32_2d(self):

        # Note that since rfftn returns a Fortran-ordered array, it
        # needs to be reformatted as a C-ordered array before being
        # passed to gpuarray.to_gpu:
        x = np.asarray(np.random.rand(self.N, self.M), np.float32)
        xf = np.asarray(np.fft.rfftn(x), np.complex64)
        xf_gpu = gpuarray.to_gpu(np.ascontiguousarray(xf))
        x_gpu = gpuarray.empty((self.N, self.M), np.float32)
        plan = fft.Plan(x.shape, np.complex64, np.float32)
        fft.ifft(xf_gpu, x_gpu, plan, True)
        assert np.allclose(x, x_gpu.get(), atol=atol_float32)
Example #9
    def test_batch_ifft_complex128_to_float64_1d(self):

        # Note that since rfftn returns a Fortran-ordered array, it
        # needs to be reformatted as a C-ordered array before being
        # passed to gpuarray.to_gpu:
        x = np.asarray(np.random.rand(self.B, self.N), np.float64)
        xf = np.asarray(np.fft.rfft(x, axis=1), np.complex128)
        xf_gpu = gpuarray.to_gpu(np.ascontiguousarray(xf))
        x_gpu = gpuarray.empty((self.B, self.N), np.float64)
        plan = fft.Plan(x.shape[1], np.complex128, np.float64, batch=self.B)
        fft.ifft(xf_gpu, x_gpu, plan, True)
        assert np.allclose(x, x_gpu.get(), atol=atol_float64)
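
Passing batch=self.B with a scalar transform size makes cuFFT run one 1-D transform per row. The NumPy equivalent is an axis-wise transform, which is how the reference data above is built:

import numpy as np

B, N = 4, 32
x = np.random.rand(B, N)

xf = np.fft.rfft(x, axis=1)          # one 1-D transform per row
assert xf.shape == (B, N // 2 + 1)
assert np.allclose(np.fft.irfft(xf, n=N, axis=1), x)
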
Example #10
    def test_batch_ifft_complex128_to_float64_2d(self):

        # Note that since rfftn returns a Fortran-ordered array, it
        # needs to be reformatted as a C-ordered array before being
        # passed to gpuarray.to_gpu:
        x = np.asarray(np.random.rand(self.B, self.N, self.M), np.float64)
        xf = np.asarray(np.fft.rfftn(x, axes=(1,2)), np.complex128)
        xf_gpu = gpuarray.to_gpu(np.ascontiguousarray(xf))
        x_gpu = gpuarray.empty((self.B, self.N, self.M), np.float64)
        plan = fft.Plan([self.N, self.M], np.complex128, np.float64, batch=self.B)
        fft.ifft(xf_gpu, x_gpu, plan, True)
        assert np.allclose(x, x_gpu.get(), atol=atol_float64)
Example #11
    def convol(self, data1, data2):
        self.init()
        self.ctx.push()
        plan = self.__class__.plans[self.shape]
        data1_gpu = self.__class__.data1_gpus[self.shape]
        data2_gpu = self.__class__.data2_gpus[self.shape]
        data1_gpu.set(data1.astype(numpy.complex128))
        cu_fft.fft(data1_gpu, data1_gpu, plan)
        data2_gpu.set(data2.astype(numpy.complex128))
        cu_fft.fft(data2_gpu, data2_gpu, plan)
        # data1_gpu *= data2_gpu.conj()
        self.multconj(data1_gpu, data2_gpu)
        cu_fft.ifft(data1_gpu, data1_gpu, plan, True)
        # self.ctx.synchronize()
        res = data1_gpu.get().real
        self.ctx.pop()
        return res
Example #12
 def correlate(self, data1, data2):
     self.init()
     with self.__class__.sem:
         self.ctx.push()
         plan = self.__class__.plans[self.shape]
         data1_gpu = self.__class__.data1_gpus[self.shape]
         data2_gpu = self.__class__.data2_gpus[self.shape]
         data1_gpu.set(data1.astype(numpy.complex128))
         cu_fft.fft(data1_gpu, data1_gpu, plan)
         data2_gpu.set(data2.astype(numpy.complex128))
         cu_fft.fft(data2_gpu, data2_gpu, plan)
         # data1_gpu *= data2_gpu.conj()
         self.multconj(data1_gpu, data2_gpu)
         cu_fft.ifft(data1_gpu, data1_gpu, plan, True)
         # self.ctx.synchronize()
         res = data1_gpu.get().real
         self.ctx.pop()
     return res
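
The multconj step computes fft(data1) * conj(fft(data2)), so the inverse transform yields the circular cross-correlation of the two inputs. A NumPy sketch of the identity the method relies on:

import numpy as np

rng = np.random.RandomState(0)
a = rng.randn(16)
b = rng.randn(16)

# ifft(fft(a) * conj(fft(b))) is circular cross-correlation
c = np.fft.ifft(np.fft.fft(a) * np.conj(np.fft.fft(b))).real

c_ref = np.array([np.dot(a, np.roll(b, k)) for k in range(16)])
assert np.allclose(c, c_ref)
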
Example #13
def cufft(data, shape=None, inverse=False):

    if shape:
        data = pad2(data, shape)

    # reuse a cached plan for this shape; creating cuFFT plans is expensive
    plan = CUFFT_PLANS.get(data.shape)
    if not plan:
        plan = cu_fft.Plan(data.shape, np.complex64, np.complex64)
        CUFFT_PLANS[data.shape] = plan

    gpu_data = gpuarray.to_gpu(data.astype(np.complex64))
    if inverse:
        cu_fft.ifft(gpu_data, gpu_data, plan)
    else:
        cu_fft.fft(gpu_data, gpu_data, plan)
    return gpu_data.get()
Example #14
def resample_sdbe_to_r2dbe_zpfft(Xs):
    """
	Resample SWARM spectrum product in time-domain at R2DBE rate using
	zero-padding and a radix-2 iFFT algorithm.
	
	Arguments:
	----------
	Xs -- MxN numpy array in which the zeroth dimension is increasing
	snapshot index, and the first dimension is the positive frequency
	half of the spectrum.
	
	Returns:
	--------
	xs -- The time-domain signal sampled at the R2DBE rate.
	next_start_vec -- Start indecies for each FFT window.

	"""

    # timestep sizes for SWARM and R2DBE rates
    dt_s = 1.0 / SWARM_RATE
    dt_r = 1.0 / R2DBE_RATE

    # we need to oversample by factor 64 and then undersample by factor 39
    simple_r = 64  # 4096
    simple_s = 39  # 2496
    fft_window_oversample = 2 * SWARM_CHANNELS * simple_r  # 2* due to real FFT

    # oversample timestep size
    dt_f = dt_s / simple_r

    # the timespan of one SWARM FFT window
    T_s = dt_s * SWARM_SAMPLES_PER_WINDOW

    # what are these...?
    x_t2_0 = None
    x_t2_1 = None

    # time vectors over one SWARM FFT window in different step sizes
    t_r = arange(0, T_s, dt_r)
    t_s = arange(0, T_s, dt_s)
    t_f = arange(0, T_s, dt_f)

    # offset in oversampled time series that corresponds to one dt_r step
    # from the last R2DBE rate sample in the previous window
    next_start = 0

    # some time offsets...?
    offset_in_window_offset_s = list()
    offset_global_s = list()

    # total number of time series samples
    N_x = int(ceil(Xs.shape[0] * SWARM_SAMPLES_PER_WINDOW * dt_s / dt_r))
    # and initialize the output
    xs = zeros(N_x, dtype=float32)
    #fine_sample_index = zeros(N_x)
    next_start_vec = zeros(Xs.shape[0])
    # index in output where samples from next window are stored
    start_output = 0

    # cuFFT plan for complex to real DFT
    plan = cu_fft.Plan(fft_window_oversample, complex64, float32)

    # padding kernel
    fill_padded = mod.get_function("fill_padded")

    # downsampling kernel
    downsample = mod.get_function("downsample")

    # FFT scaling kernel
    scale = ElementwiseKernel(
        "float *a", "a[i] = {0} * a[i]".format(1. / fft_window_oversample),
        "scale")

    # max size of resampled chunk from a single window
    xs_chunk_size_max = int32(ceil((1. * fft_window_oversample) / simple_s))

    # create memory on device for cuFFT
    xf_d = gpuarray.empty(fft_window_oversample, dtype=float32)
    xp_d = gpuarray.zeros(fft_window_oversample // 2 + 1, dtype=complex64)
    y_d = gpuarray.empty(xs_chunk_size_max, dtype=float32)

    for ii in range(Xs.shape[0]):

        # move window to device
        x_d = gpuarray.to_gpu(Xs[ii, :])

        # threads per block
        # number of blocks (keep the array as zeros to save time)
        TPB = 1024
        nB = int(ceil(1. * Xs.shape[1] / TPB))
        # pad with zeros to oversample by 64
        fill_padded(int32(1), xp_d, int32(fft_window_oversample // 2 + 1),
                    x_d, int32(Xs.shape[1]),
                    block=(TPB, 1, 1), grid=(nB, 1))

        # iFFT
        cu_fft.ifft(xp_d, xf_d, plan, scale=False)

        xs_chunk_size = int32(
            ceil((1. * fft_window_oversample - next_start) / simple_s))
        # threads per block
        TPB = 64
        # number of blocks
        nB = ceil(1. * xs_chunk_size / TPB).astype(int)
        ## undersample by 39 to correct rate, and start at the correct
        ## offset in this window
        downsample(xf_d, int32(fft_window_oversample),
                   y_d, xs_chunk_size,
                   int32(next_start), int32(simple_s),
                   block=(TPB, 1, 1), grid=(nB, 1))

        # rescale from ifft using ElementwiseKernel
        scale(y_d)

        # pull data back onto host
        xs_chunk = y_d.get()

        # fill output numpy array
        stop_output = start_output + xs_chunk_size
        xs[start_output:stop_output] = xs_chunk[:xs_chunk_size]
        # update the starting index in the output array
        start_output = stop_output

        # mark the time of the last used sample relative to the start
        # of this window
        time_window_start_to_last_used_sample = t_f[next_start::39][-1]
        # calculate the remaining time in this window
        time_remaining_in_window = T_s - time_window_start_to_last_used_sample
        # convert to the equivalent number of oversample timesteps
        num_dt_f_steps_short = round(time_remaining_in_window / dt_f)
        next_start_vec[ii] = next_start
        if (num_dt_f_steps_short == 0):
            next_start = 0
        else:
            next_start = simple_s - num_dt_f_steps_short
    return xs, next_start_vec
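
The resampler's core idea: oversample by zero-padding the half spectrum before the inverse real FFT, then stride through the oversampled series. A small CPU sketch with toy factors (up=4, down=3 standing in for simple_r=64, simple_s=39); note the original Nyquist bin becomes an interior bin and must be halved, the same concern spelled out in fft_resample() (Example #17):

import numpy as np

n = 32                       # samples per window (even)
up, down = 4, 3              # stand-ins for simple_r=64, simple_s=39

x = np.random.rand(n)
X = np.fft.rfft(x)

Xp = np.zeros(n * up // 2 + 1, dtype=complex)
Xp[:X.shape[0]] = X
Xp[n // 2] *= 0.5            # halve the now-interior Nyquist bin

x_over = np.fft.irfft(Xp, n=n * up) * up   # undo the 1/(n*up) scaling
assert np.allclose(x_over[::up], x)        # passes through the originals

y = x_over[::down]           # stride down to the target rate
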
Example #15
def sample_defrost_gpu(lat, func, gamma, m2_eff):
    """Calculates a sample of random values in the lattice

    lat = Lattice
    func = name of Cuda kernel
    n = size of cubic lattice
    gamma = -0.25 or +0.25
    m2_eff = effective mass

    This uses cuFFT for the lattice transforms (FFTW is only used for the
    1-d kernel transform).
    """
    import scikits.cuda.fft as fft
    import fftw3

    "Various constants:"
    mpl = lat.mpl
    n = lat.n
    nn = lat.nn
    os = 16
    nos = n * pow(os, 2)
    dk = lat.dk
    dx = lat.dx
    dkos = dk / (2. * os)
    dxos = dx / os
    kcut = nn * dk / 2.0
    norm = 0.5 / (math.sqrt(2 * pi * dk**3.) * mpl) * (dkos / dxos)

    ker = np.empty(nos, dtype=lat.prec_real)
    fft1 = fftw3.Plan(ker,
                      ker,
                      direction='forward',
                      flags=['measure'],
                      realtypes=['realodd 10'])

    for k in range(nos):
        kk = (k + 0.5) * dkos
        ker[k] = kk * (kk**2. + m2_eff)**gamma * math.exp(-(kk / kcut)**2.)
    fft1.execute()
    fftw3.destroy_plan(fft1)

    for k in range(nos):
        ker[k] = norm * ker[k] / (k + 1)

    Fk_gpu = gpuarray.zeros((n // 2 + 1, n, n), dtype=lat.prec_complex)

    ker_gpu = gpuarray.to_gpu(ker)
    tmp_gpu = gpuarray.zeros((n, n, n), dtype=lat.prec_real)

    plan = fft.Plan(tmp_gpu.shape, lat.prec_real, lat.prec_complex)
    plan2 = fft.Plan(tmp_gpu.shape, lat.prec_complex, lat.prec_real)

    func(tmp_gpu,
         ker_gpu,
         np.uint32(nn),
         np.float64(os),
         np.uint32(lat.dimx),
         np.uint32(lat.dimy),
         np.uint32(lat.dimz),
         block=lat.cuda_block_1,
         grid=lat.cuda_grid)

    fft.fft(tmp_gpu, Fk_gpu, plan)

    if lat.test:
        print('Testing mode on! Set testQ to False to disable this.\n')
        np.random.seed(1)

    rr1 = (np.random.normal(size=Fk_gpu.shape) +
           np.random.normal(size=Fk_gpu.shape) * 1j)

    Fk = Fk_gpu.get()
    Fk *= rr1
    Fk_gpu = gpuarray.to_gpu(Fk)

    fft.ifft(Fk_gpu, tmp_gpu, plan2)
    res = (tmp_gpu.get()).astype(lat.prec_real)

    res *= 1. / lat.VL

    return res
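
The tail of the function is a standard spectral-synthesis pattern: shape a spectrum, multiply by unit-variance complex Gaussian noise, and inverse-transform to get a Gaussian random field. A 1-D toy version of the Fk *= rr1 step, with a made-up spectral shape:

import numpy as np

n = 16
rng = np.random.RandomState(1)

ker = np.exp(-np.arange(n // 2 + 1) / 4.0)   # toy spectral shape
noise = (rng.normal(size=n // 2 + 1) +
         1j * rng.normal(size=n // 2 + 1))

field = np.fft.irfft(ker * noise, n=n)       # real-valued random field
assert field.shape == (n,)
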
Example #16
def resample_sdbe_to_r2dbe_fft_interp(Xs, interp_kind="nearest"):
    """
	Resample SWARM spectrum product in time-domain at R2DBE rate using
	iFFT and then interpolation in the time-domain.
	
	Arguments:
	----------
	Xs -- MxN numpy array in which the zeroth dimension is increasing
	snapshot index, and the first dimension is the positive frequency
	half of the spectrum.
	interp_kind -- Kind of interpolation.
	
	Returns:
	--------
	xs -- The time-domain signal sampled at the R2DBE rate.
	"""
    # timestep sizes for SWARM and R2DBE rates
    dt_s = 1.0 / SWARM_RATE
    dt_r = 1.0 / R2DBE_RATE

    # cuFFT plan for complex to real DFT
    plan = cu_fft.Plan(SWARM_SAMPLES_PER_WINDOW, complex64, float32,
                       Xs.shape[0])

    # load complex spectrum to device
    x_d = gpuarray.to_gpu(Xs)
    xp_d = gpuarray.empty((Xs.shape[0], Xs.shape[1] + 1), dtype=complex64)

    # pad nyquist with zeros
    block = (32, 32, 1)
    grid = (int(ceil(1. * (Xs.shape[1] + 1) / block[1])),
            int(ceil(1. * Xs.shape[0] / block[0])))
    fill_padded = mod.get_function("fill_padded")
    fill_padded(int32(Xs.shape[0]), xp_d, int32(Xs.shape[1] + 1),
                x_d, int32(Xs.shape[1]),
                block=block, grid=grid)

    # allocate memory for time series
    xf_d = gpuarray.empty((Xs.shape[0], SWARM_SAMPLES_PER_WINDOW), float32)

    # calculate time series, include scaling
    cu_fft.ifft(xp_d, xf_d, plan, scale=True)

    # and interpolate
    xs_size = int(floor(
        Xs.shape[0] * SWARM_SAMPLES_PER_WINDOW * dt_s / dt_r)) - 1
    TPB = 64  # threads per block
    nB = int(ceil(1. * xs_size / TPB))  # number of blocks
    xs_d = gpuarray.empty(xs_size, float32)  # decimated time-series
    if interp_kind == 'nearest':
        # compile kernel
        nearest_interp = mod.get_function(interp_kind)
        # call kernel
        nearest_interp(xf_d,
                       xs_d,
                       int32(xs_size),
                       float64(dt_r / dt_s),
                       block=(TPB, 1, 1),
                       grid=(nB, 1))
    elif interp_kind == 'linear':
        # compile kernel
        linear_interp = mod.get_function("copy_texture_kernel")
        # get texture reference
        a_texref = mod.get_texref("a_tex")
        a_texref.set_filter_mode(drv.filter_mode.LINEAR)  # linear
        #a_texref.set_filter_mode(drv.filter_mode.POINT)	# nearest-neighbor
        # move time series to texture reference
        # following http://lists.tiker.net/pipermail/pycuda/2009-November/001916.html
        descr = drv.ArrayDescriptor()
        descr.format = drv.array_format.FLOAT
        descr.height = Xs.shape[0]
        descr.width = SWARM_SAMPLES_PER_WINDOW
        descr.num_channels = 1
        a_texref.set_address_2d(xf_d.gpudata, descr,
                                SWARM_SAMPLES_PER_WINDOW * 4)
        # set up linear interpolation over texture
        linear_interp(xs_d, int32(xs_size), float64(dt_r / dt_s),
                      int32(SWARM_SAMPLES_PER_WINDOW),
                      texrefs=[a_texref], block=(TPB, 1, 1), grid=(nB, 1))

    return xs_d.get()
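
After the inverse FFT the series still sits at the SWARM rate; the nearest kernel simply indexes it at output-rate sample times. A CPU sketch of that nearest-neighbour decimation (the rates here are placeholders, and xs_size is computed as in the function above):

import numpy as np

fs_in, fs_out = 4576e6, 4096e6        # placeholder SWARM/R2DBE rates
x = np.random.rand(1024)              # time series at the input rate

xs_size = int(np.floor(len(x) * fs_out / fs_in)) - 1
idx = np.round(np.arange(xs_size) * (fs_in / fs_out)).astype(int)
y = x[idx]                            # nearest-neighbour resample
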
Example #17
def fft_resample(x,
                 W,
                 new_len,
                 npad,
                 to_remove,
                 cuda_dict=dict(use_cuda=False)):
    """Do FFT resampling with a filter function (possibly using CUDA)

    Parameters
    ----------
    x : 1-d array
        The array to resample.
    W : 1-d array or gpuarray
        The filtering function to apply.
    new_len : int
        The size of the output array (before removing padding).
    npad : int
        Amount of padding to apply before resampling.
    to_remove : int
        Number of samples to remove after resampling.
    cuda_dict : dict
        Dictionary constructed using setup_cuda_multiply_repeated().

    Returns
    -------
    x : 1-d array
        Filtered version of x.
    """
    # add some padding at beginning and end to make this work a little cleaner
    x = _smart_pad(x, npad)
    old_len = len(x)
    shorter = new_len < old_len
    if not cuda_dict['use_cuda']:
        N = int(min(new_len, old_len))
        sl_1 = slice((N + 1) // 2)
        y_fft = np.zeros(new_len, np.complex128)
        x_fft = fft(x).ravel() * W
        y_fft[sl_1] = x_fft[sl_1]
        sl_2 = slice(-(N - 1) // 2, None)
        y_fft[sl_2] = x_fft[sl_2]
        y = np.real(ifft(y_fft, overwrite_x=True)).ravel()
    else:
        cuda_dict['x'].set(
            np.concatenate((x, np.zeros(max(new_len - old_len, 0), x.dtype))))
        # do the fourier-domain operations, results put in second param
        cudafft.fft(cuda_dict['x'], cuda_dict['x_fft'], cuda_dict['fft_plan'])
        cuda_multiply_inplace_c128(W, cuda_dict['x_fft'])
        # This is not straightforward, but because x_fft and y_fft share
        # the same data (and only one half of the full DFT is stored), we
        # don't have to transfer the slice like we do in scipy. All we
        # need to worry about is the Nyquist component, either halving it
        # or taking just the real component...
        use_len = new_len if shorter else old_len
        func = cuda_real_c128 if shorter else cuda_halve_c128
        if use_len % 2 == 0:
            nyq = int((use_len - (use_len % 2)) // 2)
            func(cuda_dict['x_fft'], slice=slice(nyq, nyq + 1))
        cudafft.ifft(cuda_dict['x_fft'],
                     cuda_dict['x'],
                     cuda_dict['ifft_plan'],
                     scale=False)
        y = cuda_dict['x'].get()[:new_len if shorter else None]

    # now let's trim it back to the correct size (if there was padding)
    if to_remove > 0:
        keep = np.ones((new_len), dtype='bool')
        keep[:to_remove] = False
        keep[-to_remove:] = False
        y = np.compress(keep, y)

    return y
Example #18
ii = 0
tmpimg = numpy.zeros((n, m, k), dtype=numpy.float32)

ln = sq + 5
mags = mag[indexp].sum()
del indexp
s = 3
N2 = int(N * 0.7)
N3 = int(N * 0.7)

gpu_data.set(sobject.astype(numpy.complex64))
pycuda.driver.memcpy_dtod(gpu_last.gpudata, gpu_data.gpudata, gpu_data.nbytes)
gpu_intensity.set(mag)
gpu_mask.set(sobm)
#print real_space.nbytes
for i in range(N):
    t0 = time()
    cu_fft.fft(gpu_data, gpu_data, plan)
    constrains_fourier(gpu_data, gpu_intensity)
    cu_fft.ifft(gpu_data, gpu_data, plan, True)
    constrains_real(gpu_data, gpu_last, gpu_mask, beta)
    pycuda.driver.memcpy_dtod(gpu_last.gpudata, gpu_data.gpudata, gpu_data.nbytes)
    t1 = time()
    ctx.synchronize()
    t2 = time()
    print("With CUDA, the full loop took %.3fs but after sync %.3fs" % (t1 - t0, t2 - t0))

del tmpimg
print "it took", time() - time0, N / (time() - time0)
print "smallest error", serr, "number", nerr
Example #19
print('Testing fft/ifft..')
N = 4096 * 16
batch_size = 16

x = np.asarray(np.random.rand(batch_size, N), np.float32)
xf = np.fft.fft(x)
y = np.real(np.fft.ifft(xf))

x_gpu = gpuarray.to_gpu(x)
xf_gpu = gpuarray.empty((batch_size, N // 2 + 1), np.complex64)
plan_forward = cu_fft.Plan(N, np.float32, np.complex64, batch_size)
cu_fft.fft(x_gpu, xf_gpu, plan_forward)

y_gpu = gpuarray.empty_like(x_gpu)
plan_inverse = cu_fft.Plan(N, np.complex64, np.float32, batch_size)
cu_fft.ifft(xf_gpu, y_gpu, plan_inverse, True)

print('Success status: ', np.allclose(y, y_gpu.get(), atol=1e-6))

print('Testing in-place fft..')
x = np.asarray(np.random.rand(batch_size, N) +
               1j * np.random.rand(batch_size, N), np.complex64)
x_gpu = gpuarray.to_gpu(x)

plan = cu_fft.Plan(N, np.complex64, np.complex64, batch_size)
cu_fft.fft(x_gpu, x_gpu, plan)

cu_fft.ifft(x_gpu, x_gpu, plan, True)

print('Success status: ', np.allclose(x, x_gpu.get(), atol=1e-6))
Example #20
        Sb_kernel(FFTiB_d,
                  FIB_d,
                  FFToB_d,
                  d_d,
                  np.float32(beta),
                  Nx,
                  Ny,
                  block=blocksize,
                  grid=gridsize)

        # inverse FFT to compute S + 1 in each color channel
        fft_s = time.time()
        cu_fft.ifft(FFTiR_d, FFToR_d, plan, scale=True)
        cu_fft.ifft(FFTiG_d, FFToG_d, plan, scale=True)
        cu_fft.ifft(FFTiB_d, FFToB_d, plan, scale=True)
        fft_e = time.time()
        step_2_fft += fft_e - fft_s

        # merge real components of 3 complex color channels
        merge_r_kernel(S_d,
                       FFToR_d,
                       FFToG_d,
                       FFToB_d,
                       Nx,
                       Ny,
                       block=blocksize,
                       grid=gridsize)
Example #21
def ifft(invec, outvec, prec, itype, otype):
    cuplan = _get_inv_plan(invec.dtype, outvec.dtype, len(outvec))
    cu_fft.ifft(invec.data, outvec.data, cuplan)