def scenario_inplace_padded_C2R(batch, tic, toc):
    '''
    # Scenario: batched in-place C2R iFFT
    # batch x 16400 complex64 --> batch x 32800 float32 in the same buffer,
    # channels padded up to a multiple of 16; only the first 2*BENG_CHANNELS_
    # output samples per row are kept.
    '''
    n = array([2 * BENG_CHANNELS_], int32)
    inembed = array([16 * (BENG_CHANNELS // 16 + 1)], int32)
    onembed = array([2 * inembed[0]], int32)
    plan = cufft.cufftPlanMany(1, n.ctypes.data,
                               inembed.ctypes.data, 1, inembed[0],
                               onembed.ctypes.data, 1, onembed[0],
                               cufft.CUFFT_C2R, batch)
    data_shape = (batch, inembed[0])
    cpu_data = standard_normal(data_shape) + 1j * standard_normal(data_shape)
    cpu_data = cpu_data.astype(complex64)
    gpu_data = cuda.mem_alloc(8 * batch * inembed[0])  # complex64
    cuda.memcpy_htod(gpu_data, cpu_data)
    tic.record()
    cufft.cufftExecC2R(plan, int(gpu_data), int(gpu_data))
    toc.record()
    toc.synchronize()
    cpu_result = np.empty(batch * onembed[0], dtype=np.float32)
    cuda.memcpy_dtoh(cpu_result, gpu_data)
    cpu_result = cpu_result.reshape(
        (batch, onembed[0]))[:, :2 * BENG_CHANNELS_] / (2 * BENG_CHANNELS_)
    result = irfft(cpu_data[:, :BENG_CHANNELS], axis=-1)
    # destroy plan (the other scenarios do this; omitting it leaks the plan)
    cufft.cufftDestroy(plan)
    print 'Batched in-place scenario'
    print 'test passed:', np.allclose(cpu_result, result)
    print 'GPU time:', tic.time_till(toc), ' ms = ', \
        tic.time_till(toc) / (batch * 0.5 * 13.128e-3), ' x real (both SB)'
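# ---------------------------------------------------------------------------
# Hedged sketch of the module context assumed by every snippet in this file.
# The imports and constants below are reconstructed from the docstrings
# (16385 channels in, 32768 samples out, a 4096/2496 rate ratio); values
# marked "assumed" are not given in this excerpt. kernel_source, referenced
# by the resampling functions, is a CUDA source string defined elsewhere.
# ---------------------------------------------------------------------------
import numpy as np
from numpy import (array, int32, float32, float64, complex64, empty, zeros,
                   hstack, unique, arange, ceil, floor, median, allclose)
from numpy.random import standard_normal, seed
from numpy.fft import irfft, rfft
from scipy.interpolate import interp1d
import pycuda.autoinit
import pycuda.driver as cuda
from pycuda.compiler import SourceModule
import scikits.cuda.cufft as cufft

BENG_CHANNELS_ = 2**14               # 16384 B-engine channels
BENG_CHANNELS = BENG_CHANNELS_ + 1   # 16385 positive-frequency bins
SWARM_RATE = 2496e6                  # SWARM sample rate [Hz]
R2DBE_RATE = 4096e6                  # R2DBE sample rate [Hz]
BENG_SNAPSHOTS = 128                 # snapshots per B-engine frame (assumed)
BENG_BUFFER_IN_COUNTS = 4            # buffered B-engine frames (assumed)
# One snapshot spans 2*BENG_CHANNELS_/SWARM_RATE ~= 13.128 us, hence the
# 13.128e-3 ms figure in the timing printouts. Each benchmark scenario takes
# a pair of CUDA events for timing, e.g.:
#   tic, toc = cuda.Event(), cuda.Event()
#   scenario_inplace_padded_C2R(39, tic, toc)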
def __fft_linear_interp(self):
    '''
    Resample using linear interpolation.
    '''
    self.logger.debug('Resampling using linear interpolation')
    threads_per_block = 512
    blocks_per_grid = int(
        ceil(1. * self.num_r2dbe_samples / threads_per_block))
    # allocate device memory
    self.__gpu_time_series_1 = cuda.mem_alloc(
        4 * self.num_r2dbe_samples / 2)  # 2048 MHz clock
    self.__gpu_time_series_0 = cuda.mem_alloc(
        4 * self.num_r2dbe_samples / 2)  # 2048 MHz clock
    gpu_r2dbe_spec = cuda.mem_alloc(
        8 * (4096 / 2 + 1) * self.__bandlimit_batch)  # memory peanuts
    for (phased_sum_in, phased_sum_out) in zip(
            (self.__gpu_beng_0, self.__gpu_beng_1),
            (self.__gpu_time_series_0, self.__gpu_time_series_1)):
        # Turn SWARM snapshots into time series (in-place C2R)
        cufft.cufftExecC2R(self.__plan_A, int(phased_sum_in),
                           int(phased_sum_in))
        # resample to the R2DBE rate by linear interpolation
        gpu_resamp = cuda.mem_alloc(4 * self.num_r2dbe_samples)  # 25% of device memory
        self.__linear_interp(phased_sum_in,
                             int32(self.num_swarm_samples),
                             gpu_resamp,
                             int32(self.num_r2dbe_samples),
                             float64(SWARM_RATE / R2DBE_RATE),
                             float32(1.),  # or float32(1./(2*BENG_CHANNELS_))
                             block=(threads_per_block, 1, 1),
                             grid=(blocks_per_grid, 1))
        phased_sum_in.free()
        # loop through the resampled time series in chunks of
        # 4096 * __bandlimit_batch samples
        for ib in range(self.num_r2dbe_samples / 4096 /
                        self.__bandlimit_batch):
            # compute spectrum with 4096 MHz sample clock
            cufft.cufftExecR2C(
                self.__plan_B,
                int(gpu_resamp) + int(4 * ib * 4096 * self.__bandlimit_batch),
                int(gpu_r2dbe_spec))
            # invert to time series with a bandwidth of 1024 MHz, masking out
            # the first 150 MHz and the last (1024-150) MHz
            # (pointer offsets are in bytes)
            cufft.cufftExecC2R(
                self.__plan_C,
                int(gpu_r2dbe_spec) + int(8 * 150),
                int(phased_sum_out) + int(4 * ib * 2048 *
                                          self.__bandlimit_batch))
        gpu_resamp.free()
    gpu_r2dbe_spec.free()
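# Hedged sketch: the interpolation kernel bound to self.__linear_interp is
# not part of this excerpt. From the launch above (input pointer, input
# length, output pointer, output length, float64 rate ratio, float32 scale)
# it is presumably shaped like the following; the name and body are
# assumptions, not the actual kernel_source. (Note that fft_interp below
# launches its 'linear'/'nearest' kernels with a five-argument signature.)
linear_interp_sketch = SourceModule("""
__global__ void linear(float *in, int n_in, float *out, int n_out,
                       double ratio, float scale)
{
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n_out) {
    double t = i * ratio;          /* fractional sample index into input */
    int j = (int)t;
    float frac = (float)(t - j);
    if (j + 1 < n_in)
      out[i] = scale * ((1.0f - frac) * in[j] + frac * in[j + 1]);
  }
}
""").get_function('linear')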
def scenario_contiguous_channels_wpadding(batch, tic, toc):
    '''
    # Scenario: batched IFFT of batch snapshots
    # batch x 16385 complex64 --> batch x 32768 float32
    # complex input padded so the channel dimension has 16400 elements.
    '''
    n = array([2 * BENG_CHANNELS_], int32)
    beng_channels_padded = 16400
    # create batched FFT plan configuration
    inembed = array([beng_channels_padded], int32)
    onembed = array([2 * BENG_CHANNELS_], int32)
    idist = int32(beng_channels_padded)  # distance between batch inputs
    plan = cufft.cufftPlanMany(int32(1), n.ctypes.data,
                               inembed.ctypes.data, int32(1), idist,
                               onembed.ctypes.data, int32(1),
                               int32(2 * BENG_CHANNELS_),
                               cufft.CUFFT_C2R, int32(batch))
    # construct arrays
    gpu_in = cuda.mem_alloc(8 * batch * beng_channels_padded)   # complex64
    gpu_out = cuda.mem_alloc(4 * batch * 2 * BENG_CHANNELS_)    # float32
    cpu_in = standard_normal(batch * beng_channels_padded) + \
        1j * standard_normal(batch * beng_channels_padded)
    cpu_in = cpu_in.astype(complex64)
    cuda.memcpy_htod(gpu_in, cpu_in)
    # execute plan
    tic.record()
    cufft.cufftExecC2R(plan, int(gpu_in), int(gpu_out))
    toc.record()
    toc.synchronize()
    # read out result
    cpu_out = empty(batch * 2 * BENG_CHANNELS_, float32)
    cuda.memcpy_dtoh(cpu_out, gpu_out)
    cpu_out.resize((batch, 2 * BENG_CHANNELS_))
    # execute on CPU
    cpu = irfft(cpu_in.reshape(
        (batch, beng_channels_padded))[:, :BENG_CHANNELS], axis=-1)
    # destroy plan
    cufft.cufftDestroy(plan)
    # test
    print '\nContiguous Channel w/ Padding Scenario:'
    print '1-D %d-element C2R iFFT in batch of %d.' % (n[0], batch)
    print 'test results:', 'pass' if allclose(
        cpu, cpu_out / (2 * BENG_CHANNELS_)) else 'fail'
    print 'real time:', batch * 13.128e-3, ' ms'
    print 'GPU time:', tic.time_till(toc), ' ms = ', \
        tic.time_till(toc) / (batch * 0.5 * 13.128e-3), ' x real (both SB)'
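# Note on the pad size: 16400 is BENG_CHANNELS (16385) rounded up to the next
# multiple of 16, matching the 16 * (BENG_CHANNELS // 16 + 1) = 16 * 1025
# expression used in scenario_inplace_padded_C2R, presumably so each row of
# complex64 channels (16 * 8 = 128 bytes per group) stays 128-byte aligned.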
def scenario_contiguous_channels(batch, tic, toc):
    '''
    # Scenario: batched IFFT of batch snapshots
    # batch x 16385 complex64 --> batch x 32768 float32
    # no padding.
    '''
    n = array([2 * BENG_CHANNELS_], int32)
    seed(12740)
    # create batched FFT plan configuration
    inembed = array([BENG_CHANNELS], int32)
    onembed = array([2 * BENG_CHANNELS_], int32)
    plan = cufft.cufftPlanMany(1, n.ctypes.data,
                               inembed.ctypes.data, 1, BENG_CHANNELS,
                               onembed.ctypes.data, 1, 2 * BENG_CHANNELS_,
                               cufft.CUFFT_C2R, batch)
    # construct arrays
    gpu_in = cuda.mem_alloc(8 * batch * BENG_CHANNELS)        # complex64
    gpu_out = cuda.mem_alloc(4 * batch * 2 * BENG_CHANNELS_)  # float32
    cpu_in = standard_normal(batch * BENG_CHANNELS) + \
        1j * standard_normal(batch * BENG_CHANNELS)
    cpu_in = cpu_in.astype(complex64)
    cuda.memcpy_htod(gpu_in, cpu_in)
    # execute plan
    tic.record()
    cufft.cufftExecC2R(plan, int(gpu_in), int(gpu_out))
    toc.record()
    toc.synchronize()
    # read out result
    cpu_out = empty(batch * 2 * BENG_CHANNELS_, float32)
    cuda.memcpy_dtoh(cpu_out, gpu_out)
    cpu_out.resize((batch, 2 * BENG_CHANNELS_))
    # execute on CPU
    cpu = irfft(cpu_in.reshape((batch, BENG_CHANNELS)), axis=-1)
    # destroy plan
    cufft.cufftDestroy(plan)
    # test
    print '\nContiguous Channel Scenario:'
    print '1-D %d-element C2R iFFT in batch of %d.' % (n[0], batch)
    print 'test results:', 'pass' if allclose(
        cpu, cpu_out / (2 * BENG_CHANNELS_)) else 'fail'
    print 'real time:', batch * 13.128e-3, ' ms'
    print 'GPU time:', tic.time_till(toc), ' ms = ', \
        tic.time_till(toc) / (batch * 0.5 * 13.128e-3), ' x real (both SB)'
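# Hedged note on the scikits.cuda cufftPlanMany wrapper used throughout this
# file: its positional arguments map directly onto the CUFFT C API as
#
#   cufftPlanMany(rank, n, inembed, istride, idist,
#                 onembed, ostride, odist, type, batch)
#
# so every plan here uses unit strides (contiguous channels) with idist/odist
# equal to the per-snapshot input/output lengths, and the n/inembed/onembed
# arrays are passed as raw pointers via .ctypes.data.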
def scenario_contiguous_channels_oversampled64(batch, tic, toc):
    '''
    Scenario: batched IFFT of 2*2**14*64 channels
    '''
    fft_window_oversample = 64 * 2 * 2**14
    n = array([fft_window_oversample], int32)
    # create batched FFT plan configuration
    inembed = array([fft_window_oversample / 2 + 1], int32)
    onembed = array([fft_window_oversample], int32)
    plan = cufft.cufftPlanMany(1, n.ctypes.data,
                               inembed.ctypes.data, 1,
                               fft_window_oversample / 2 + 1,
                               onembed.ctypes.data, 1, fft_window_oversample,
                               cufft.CUFFT_C2R, batch)
    # construct arrays
    gpu_in = cuda.mem_alloc(8 * batch *
                            (fft_window_oversample / 2 + 1))    # complex64
    gpu_out = cuda.mem_alloc(4 * batch * fft_window_oversample)  # float32
    data_shape = (batch, fft_window_oversample / 2 + 1)
    cpu_in = standard_normal(data_shape) + 1j * standard_normal(data_shape)
    cpu_in = cpu_in.astype(complex64)
    cuda.memcpy_htod(gpu_in, cpu_in)
    # execute plan
    tic.record()
    cufft.cufftExecC2R(plan, int(gpu_in), int(gpu_out))
    toc.record()
    toc.synchronize()
    # read out result
    cpu_out = empty((batch, fft_window_oversample), float32)
    cuda.memcpy_dtoh(cpu_out, gpu_out)
    # execute on CPU
    cpu = irfft(cpu_in, axis=-1)
    # destroy plan
    cufft.cufftDestroy(plan)
    # test
    print '\nOversampling by x64 Scenario with batches:'
    print '1-D %d-element C2R iFFT in batch of %d.' % (n[0], batch)
    print 'test results:', 'pass' if allclose(
        cpu, cpu_out / fft_window_oversample) else 'fail'
    print 'real time:', batch * 13.128e-3, ' ms'
    print 'GPU time:', tic.time_till(toc), ' ms = ', \
        tic.time_till(toc) / (batch * 0.5 * 13.128e-3), ' x real (both SB)'
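# Size check for the x64 oversampled transform: 64 * 2 * 2**14 = 2097152
# points per batch element, so each element carries 8 * (2**20 + 1) bytes of
# complex64 input and 4 * 2**21 bytes of float32 output (roughly 8 MiB each),
# which bounds the usable batch size on a given device.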
def __fft_resample(self):
    '''
    Resample using FFTs. Requires that the use_fft_resample flag is True.
    Individual phased sums are resampled at a 2048 MHz clock.
    '''
    self.logger.debug('Resampling using FFTs')
    # device memory allocation
    self.__gpu_time_series_0 = cuda.mem_alloc(
        4 * self.num_r2dbe_samples / 2)  # 2048 MHz clock
    self.__gpu_time_series_1 = cuda.mem_alloc(
        4 * self.num_r2dbe_samples / 2)  # 2048 MHz clock
    gpu_swarm = cuda.mem_alloc(4 * self.num_swarm_samples)
    # loop over phased sums
    for (phased_sum_in, phased_sum_out) in zip(
            (self.__gpu_beng_0, self.__gpu_beng_1),
            (self.__gpu_time_series_0, self.__gpu_time_series_1)):
        # Turn SWARM snapshots into time series
        cufft.cufftExecC2R(self.__plan_A, int(phased_sum_in), int(gpu_swarm))
        phased_sum_in.free()
        gpu_tmp = cuda.mem_alloc(8 * int(39 * BENG_CHANNELS_ + 1) *
                                 self.__bandlimit_batch)
        for ib in range((BENG_BUFFER_IN_COUNTS - 1) * BENG_SNAPSHOTS / 39 /
                        self.__bandlimit_batch):
            # Turn concatenated SWARM time series into a single spectrum
            cufft.cufftExecR2C(
                self.__plan_B,
                int(gpu_swarm) + int(4 * 39 * 2 * BENG_CHANNELS_ *
                                     self.__bandlimit_batch * ib),
                int(gpu_tmp))
            # Turn padded SWARM spectrum into time series with R2DBE
            # sampling rate, skipping the first 150 MHz of the band
            # (pointer offsets are in bytes)
            cufft.cufftExecC2R(
                self.__plan_C,
                int(gpu_tmp) + int(8 * 150 * 512),
                int(phased_sum_out) + int(4 * 32 * 2 * BENG_CHANNELS_ * ib *
                                          self.__bandlimit_batch))
        gpu_tmp.free()
    gpu_swarm.free()
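# Worked numbers for the byte offsets in __fft_resample: plan_B transforms
# 39 * 2 * BENG_CHANNELS_ = 1277952 SWARM-rate samples per batch element,
# spanning 1277952 / 2496e6 s, so one spectral bin is
# 2496e6 / 1277952 = 1953.125 Hz wide. The offset 8 * 150 * 512 bytes
# therefore skips 150 * 512 = 76800 complex64 bins, exactly 150 MHz. Each
# C2R output chunk holds 2048e6 * 1277952 / 2496e6 = 32 * 2 * BENG_CHANNELS_
# samples per batch element at the 2048 MHz clock, matching the
# 4 * 32 * 2 * BENG_CHANNELS_ output stride. In __fft_linear_interp the
# 4096-point R2C at 4096 MHz gives 1 MHz bins, so its offset of 8 * 150
# bytes skips the same 150 MHz.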
def scenario_contiguous_batched39_resample(num_snapshots, tic, toc):
    '''
    # Scenario: Fourier resample of num_snapshots snapshots
    # A iFFT: [num_snapshots, 16385] complex64 -->
    #         [num_snapshots/39, 39*32768] float32
    # B FFT:  [num_snapshots/39, 39*32768] float32 -->
    #         [num_snapshots/39, 39*32768*(4096/2496)/2 + 1] complex64
    #         (zero-padded)
    # C iFFT: [num_snapshots/39, 39*32768*(4096/2496)/2 + 1] complex64 -->
    #         [num_snapshots/39, 39*32768*4096/2496] float32
    #
    # C(B(A(gpu_1))) = C(B(gpu_2)) = C(gpu_1) = gpu_2
    # num_snapshots must be a multiple of 39.
    # A executed using batch = num_snapshots
    # B&C executed using batch = num_snapshots / 39
    '''
    print '\nContiguous channel Fourier resampling scenario in batches of 39:'
    assert num_snapshots % 39 == 0, \
        'error: num_snapshots must be an integer multiple of 39'
    # construct arrays
    batch = num_snapshots / 39
    print 'batch: %d' % batch
    # per-batch lengths after resampling
    resample_len = int(39 * 2 * BENG_CHANNELS_ * R2DBE_RATE / SWARM_RATE)
    spectrum_len = resample_len / 2 + 1
    gpu_1 = cuda.mem_alloc(8 * batch * spectrum_len)
    gpu_2 = cuda.mem_alloc(4 * batch * resample_len)
    cpu_in = standard_normal(num_snapshots * BENG_CHANNELS) + \
        1j * standard_normal(num_snapshots * BENG_CHANNELS)
    cpu_in = cpu_in.astype(complex64)
    # create FFT plans
    n_A = array([2 * BENG_CHANNELS_], int32)
    inembed_A = array([BENG_CHANNELS], int32)
    onembed_A = array([2 * BENG_CHANNELS_], int32)
    plan_A = cufft.cufftPlanMany(1, n_A.ctypes.data,
                                 inembed_A.ctypes.data, 1, BENG_CHANNELS,
                                 onembed_A.ctypes.data, 1, 2 * BENG_CHANNELS_,
                                 cufft.CUFFT_C2R, num_snapshots)
    n_B = array([39 * 2 * BENG_CHANNELS_], int32)
    inembed_B = array([39 * 2 * BENG_CHANNELS_], int32)
    onembed_B = array([spectrum_len], int32)
    plan_B = cufft.cufftPlanMany(1, n_B.ctypes.data,
                                 inembed_B.ctypes.data, 1,
                                 39 * 2 * BENG_CHANNELS_,
                                 onembed_B.ctypes.data, 1,
                                 int32(spectrum_len),
                                 cufft.CUFFT_R2C, batch)
    n_C = array([resample_len], int32)
    inembed_C = array([spectrum_len], int32)
    onembed_C = array([resample_len], int32)
    plan_C = cufft.cufftPlanMany(1, n_C.ctypes.data,
                                 inembed_C.ctypes.data, 1,
                                 int32(spectrum_len),
                                 onembed_C.ctypes.data, 1,
                                 int32(resample_len),
                                 cufft.CUFFT_C2R, batch)
    # fetch kernel that zeroes out gpu_1
    kernel_module = SourceModule(kernel_source)
    zero_out = kernel_module.get_function('zero_out')
    # sanity check that zero_out clears the whole spectrum buffer
    zero_out(gpu_1, int32(batch * spectrum_len),
             block=(1024, 1, 1),
             grid=(int(ceil(batch * spectrum_len / 1024.)), 1))
    cpu_out = empty(batch * spectrum_len, complex64)
    cuda.memcpy_dtoh(cpu_out, gpu_1)
    assert len(unique(cpu_out)) == 1, 'problem with zero_out'
    # move data to device
    cuda.memcpy_htod(gpu_1, cpu_in)
    tic.record()
    # Turn SWARM snapshots into time series
    cufft.cufftExecC2R(plan_A, int(gpu_1), int(gpu_2))
    # zero out gpu_1 so the unfilled spectral tail acts as zero-padding
    zero_out(gpu_1, int32(batch * spectrum_len),
             block=(1024, 1, 1),
             grid=(int(ceil(batch * spectrum_len / 1024.)), 1))
    # Turn concatenated SWARM time series into single spectrum (zero-padded)
    cufft.cufftExecR2C(plan_B, int(gpu_2), int(gpu_1))
    # Turn padded SWARM spectrum into time series at the R2DBE sampling rate
    cufft.cufftExecC2R(plan_C, int(gpu_1), int(gpu_2))
    toc.record()
    toc.synchronize()
    # check on CPU
    cpu_A = irfft(cpu_in.reshape(num_snapshots, BENG_CHANNELS),
                  axis=-1).astype(float32)
    cpu_B = rfft(cpu_A.reshape(batch, 39 * 2 * BENG_CHANNELS_),
                 axis=-1).astype(complex64)
    cpu_C = irfft(hstack([cpu_B,
                          zeros((batch,
                                 spectrum_len - (39 * BENG_CHANNELS_ + 1)),
                                complex64)]),
                  axis=-1)
    cpu_out = empty(batch * resample_len, float32)
    cuda.memcpy_dtoh(cpu_out, gpu_2)
    print 'test results: ', 'pass' if allclose(
        cpu_C.flatten(),
        cpu_out / (cpu_C.shape[-1] * 2 * BENG_CHANNELS_)) else 'fail'
    print 'max residual: ', max(abs(cpu_C.flatten() -
                                    cpu_out / (cpu_C.shape[-1] *
                                               2 * BENG_CHANNELS_)))
    print 'GPU time:', tic.time_till(toc), ' ms = ', \
        tic.time_till(toc) / (num_snapshots * 0.5 * 13.128e-3), \
        ' x real (both SB)'
    # destroy plans
    cufft.cufftDestroy(plan_A)
    cufft.cufftDestroy(plan_B)
    cufft.cufftDestroy(plan_C)
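# Why batches of 39: R2DBE_RATE / SWARM_RATE = 4096/2496 = 64/39, so a single
# 32768-sample snapshot does not resample to an integer number of samples,
# but 39 snapshots do:
#   39 * 2 * BENG_CHANNELS_ * (64/39) = 64 * 32768 = 2097152.
# Grouping 39 SWARM FFT windows therefore lets plans B and C use exact
# integer transform lengths.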
def fft_batched(gpu_1, gpu_2, num_snapshots, snapshots_per_batch=39,
                cpu_check=True):
    '''
    gpu_1: pointer to an Mx16385 array on the GPU device, where the zeroth
           dimension is increasing snapshot index and the first dimension is
           the positive-frequency half of the spectrum. This array will be
           destroyed. Must have byte size:
           int(8*batch_size*(snapshots_per_batch*BENG_CHANNELS_*R2DBE_RATE/SWARM_RATE+1))
    gpu_2: pointer to result
    snapshots_per_batch: number of snapshots grouped for resampling
           (num_snapshots % snapshots_per_batch == 0)
    '''
    tic = cuda.Event()
    toc = cuda.Event()
    batch_size = num_snapshots / snapshots_per_batch
    print 'batch size: %d' % batch_size
    # derived per-batch lengths
    spectrum_len = int(snapshots_per_batch * BENG_CHANNELS_ *
                       R2DBE_RATE / SWARM_RATE + 1)
    resample_len = int(snapshots_per_batch * 2 * BENG_CHANNELS_ *
                       R2DBE_RATE / SWARM_RATE)
    # create FFT plans
    n_A = array([2 * BENG_CHANNELS_], int32)
    inembed_A = array([BENG_CHANNELS], int32)
    onembed_A = array([2 * BENG_CHANNELS_], int32)
    plan_A = cufft.cufftPlanMany(1, n_A.ctypes.data,
                                 inembed_A.ctypes.data, 1, BENG_CHANNELS,
                                 onembed_A.ctypes.data, 1, 2 * BENG_CHANNELS_,
                                 cufft.CUFFT_C2R, num_snapshots)
    n_B = array([snapshots_per_batch * 2 * BENG_CHANNELS_], int32)
    inembed_B = array([snapshots_per_batch * 2 * BENG_CHANNELS_], int32)
    onembed_B = array([spectrum_len], int32)
    plan_B = cufft.cufftPlanMany(1, n_B.ctypes.data,
                                 inembed_B.ctypes.data, 1,
                                 snapshots_per_batch * 2 * BENG_CHANNELS_,
                                 onembed_B.ctypes.data, 1,
                                 int32(spectrum_len),
                                 cufft.CUFFT_R2C, batch_size)
    n_C = array([resample_len], int32)
    inembed_C = array([spectrum_len], int32)
    onembed_C = array([resample_len], int32)
    plan_C = cufft.cufftPlanMany(1, n_C.ctypes.data,
                                 inembed_C.ctypes.data, 1,
                                 int32(spectrum_len),
                                 onembed_C.ctypes.data, 1,
                                 int32(resample_len),
                                 cufft.CUFFT_C2R, batch_size)
    # fetch kernel that zeroes out an array
    kernel_module = SourceModule(kernel_source)
    zero_out = kernel_module.get_function('zero_out')
    tic.record()
    # Turn SWARM snapshots into time series
    cufft.cufftExecC2R(plan_A, int(gpu_1), int(gpu_2))
    # zero out gpu_1 so the unfilled spectral tail acts as zero-padding
    zero_out(gpu_1, int32(batch_size * spectrum_len),
             block=(1024, 1, 1),
             grid=(int(ceil(batch_size * spectrum_len / 1024.)), 1))
    # Turn concatenated SWARM time series into single spectrum (zero-padded)
    cufft.cufftExecR2C(plan_B, int(gpu_2), int(gpu_1))
    # Turn padded SWARM spectrum into time series at the R2DBE sampling rate
    cufft.cufftExecC2R(plan_C, int(gpu_1), int(gpu_2))
    toc.record()
    toc.synchronize()
    # check on CPU
    if cpu_check:
        # NOTE: assumes the host-side input cpu_in is visible at module scope
        cpu_A = irfft(cpu_in.reshape(num_snapshots, BENG_CHANNELS),
                      axis=-1).astype(float32)
        cpu_B = rfft(cpu_A.reshape(batch_size,
                                   snapshots_per_batch * 2 * BENG_CHANNELS_),
                     axis=-1).astype(complex64)
        cpu_C = irfft(hstack([cpu_B,
                              zeros((batch_size,
                                     spectrum_len -
                                     (snapshots_per_batch *
                                      BENG_CHANNELS_ + 1)),
                                    complex64)]),
                      axis=-1)
        cpu_out = empty(batch_size * resample_len, float32)
        cuda.memcpy_dtoh(cpu_out, gpu_2)
        print 'test results: ', 'pass' if allclose(
            cpu_C.flatten(),
            cpu_out / (cpu_C.shape[-1] * 2 * BENG_CHANNELS_)) else 'fail'
        print 'max residual: ', max(abs(cpu_C.flatten() -
                                        cpu_out / (cpu_C.shape[-1] *
                                                   2 * BENG_CHANNELS_)))
    print 'GPU time:', tic.time_till(toc), ' ms = ', \
        tic.time_till(toc) / (num_snapshots * 0.5 * 13.128e-3), \
        ' x real (both SB)'
    # destroy plans
    cufft.cufftDestroy(plan_A)
    cufft.cufftDestroy(plan_B)
    cufft.cufftDestroy(plan_C)
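# Hedged sketch: kernel_source is defined outside this excerpt. Given how
# zero_out is launched (device pointer plus a complex-element count, 1024
# threads per block) it presumably just clears both components of each
# complex64 element; the body below is an assumption, not the actual kernel.
zero_out_sketch = SourceModule("""
__global__ void zero_out(float2 *a, int n)
{
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) {
    a[i].x = 0.0f;
    a[i].y = 0.0f;
  }
}
""").get_function('zero_out')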
def fft_interp(gpu_1, gpu_2, num_snapshots, interp_kind='nearest',
               cpu_check=True):
    '''
    Batched FFT to time series and then interpolation to resample.
    No filter applied yet...
    '''
    tic = cuda.Event()
    toc = cuda.Event()
    batch_size = num_snapshots
    print 'batch size: %d' % batch_size
    # create batched FFT plan configuration
    n = array([2 * BENG_CHANNELS_], int32)
    inembed = array([BENG_CHANNELS], int32)
    onembed = array([2 * BENG_CHANNELS_], int32)
    plan = cufft.cufftPlanMany(1, n.ctypes.data,
                               inembed.ctypes.data, 1, BENG_CHANNELS,
                               onembed.ctypes.data, 1, 2 * BENG_CHANNELS_,
                               cufft.CUFFT_C2R, batch_size)
    # fetch kernel that resamples
    kernel_module = SourceModule(kernel_source)
    interp_1d = kernel_module.get_function(interp_kind)
    # execute plan
    cufft.cufftExecC2R(plan, int(gpu_1), int(gpu_2))
    # interpolate; the 'linear' and 'nearest' kernels share the same launch
    # signature, so a single call covers both
    tic.record()
    xs_size = int(floor(batch_size * 2 * BENG_CHANNELS_ *
                        R2DBE_RATE / SWARM_RATE)) - 1
    TPB = 512                           # threads per block
    nB = int(ceil(1. * xs_size / TPB))  # number of blocks
    interp_1d(gpu_2, gpu_1, int32(xs_size),
              float64(SWARM_RATE / R2DBE_RATE),
              float32(1. / (2 * BENG_CHANNELS_)),
              block=(TPB, 1, 1), grid=(nB, 1))
    toc.record()
    toc.synchronize()
    print 'GPU time:', tic.time_till(toc), ' ms = ', \
        tic.time_till(toc) / (num_snapshots * 0.5 * 13.128e-3), \
        ' x real (both SB)'
    # destroy plan
    cufft.cufftDestroy(plan)
    # check on CPU
    if cpu_check:
        # NOTE: assumes the host-side input cpu_in is visible at module scope
        # timestep sizes for SWARM and R2DBE rates
        dt_s = 1.0 / SWARM_RATE
        dt_r = 1.0 / R2DBE_RATE
        # the timespan of one SWARM FFT window
        T_s = dt_s * 2 * BENG_CHANNELS_
        # the timespan of all SWARM data
        T_s_all = T_s * batch_size
        # get time-domain signal
        xs_swarm_rate = irfft(cpu_in, n=2 * BENG_CHANNELS_, axis=1).flatten()
        # and calculate sample points
        t_swarm_rate = arange(0, T_s_all, dt_s)
        print t_swarm_rate[0], t_swarm_rate[-1]
        # calculate resample points (subtract one dt_s from the end to
        # avoid extrapolation)
        t_r2dbe_rate = arange(0, T_s_all - dt_s, dt_r)
        # and interpolate
        x_interp = interp1d(t_swarm_rate, xs_swarm_rate, kind=interp_kind)
        cpu_A = x_interp(t_r2dbe_rate)
        cpu_out = np.empty_like(cpu_A, dtype=float32)
        cuda.memcpy_dtoh(cpu_out, gpu_1)
        print 'median residual: ', median(abs(cpu_A - cpu_out))
        if interp_kind == 'nearest':
            # mask out samples where scipy and the GPU kernel may round
            # half-integer sample points differently
            cpu_A[::32] = 0
            cpu_out[::32] = 0
        print 'test results: ', 'pass' if allclose(cpu_A, cpu_out) else 'fail'
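# Hedged usage sketch for the two resampling entry points above; buffer sizes
# follow the fft_batched docstring, and cpu_in is staged at module scope
# because the cpu_check branches read it from there. The snapshot count is an
# arbitrary example value, and kernel_source must be defined for this to run.
if __name__ == '__main__':
    num_snapshots = 39 * 4
    batch_size = num_snapshots / 39
    cpu_in = (standard_normal(num_snapshots * BENG_CHANNELS) +
              1j * standard_normal(num_snapshots * BENG_CHANNELS))
    cpu_in = cpu_in.astype(complex64)
    spectrum_len = int(39 * BENG_CHANNELS_ * R2DBE_RATE / SWARM_RATE + 1)
    resample_len = int(39 * 2 * BENG_CHANNELS_ * R2DBE_RATE / SWARM_RATE)
    gpu_1 = cuda.mem_alloc(8 * batch_size * spectrum_len)
    gpu_2 = cuda.mem_alloc(4 * batch_size * resample_len)
    cuda.memcpy_htod(gpu_1, cpu_in)
    fft_batched(gpu_1, gpu_2, num_snapshots)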
if DEBUG:
    print 'DEBUG::loading cpu_beng_spectra_1'
    gpumeminfo(cuda)
    cpu_beng_spectra_1 = empty(
        ((BENG_BUFFER_IN_COUNTS - 1) * BENG_SNAPSHOTS, BENG_CHANNELS),
        dtype=complex64)
    cuda.memcpy_dtoh(cpu_beng_spectra_1, gpu_beng_1)

# allocate memory for time series
gpu_r2dbe = cuda.mem_alloc(4 * num_r2dbe_samples / 2)
if not in_place:
    gpu_swarm = cuda.mem_alloc(4 * num_swarm_samples)

for SB in (gpu_beng_0, gpu_beng_1):
    # for SB in (gpu_beng_1,):
    # Turn SWARM snapshots into time series
    if in_place:
        cufft.cufftExecC2R(plan_A, int(SB), int(SB))
    else:
        cufft.cufftExecC2R(plan_A, int(SB), int(gpu_swarm))
        SB.free()
    if DEBUG:
        print 'DEBUG::loading cpu_beng_timeseries_1'
        gpumeminfo(cuda)
        if in_place:
            cpu_beng_timeseries_1 = empty(
                ((BENG_BUFFER_IN_COUNTS - 1) * BENG_SNAPSHOTS,
                 2 * BENG_CHANNELS),
                dtype=float32)
            cuda.memcpy_dtoh(cpu_beng_timeseries_1, SB)
            cpu_beng_timeseries_1 = \
                cpu_beng_timeseries_1[:, :2 * BENG_CHANNELS_]
        else:
            cpu_beng_timeseries_1 = empty(
                ((BENG_BUFFER_IN_COUNTS - 1) * BENG_SNAPSHOTS,
                 2 * BENG_CHANNELS_),
                dtype=float32)
            cuda.memcpy_dtoh(cpu_beng_timeseries_1, gpu_swarm)

    # loop over chunks of 39 SWARM snapshots