def scenario_inplace_padded_C2R(batch, tic, toc):
    '''
    # Scenario: batched in-place C2R iFFT
    # batch x 16400 complex64 --> batch x 32800 float32 in the same buffer,
    # channels padded up to a multiple of 16; only the first 2*BENG_CHANNELS_
    # output samples per row are kept.
    '''
    n = array([2 * BENG_CHANNELS_], int32)
    inembed = array([16 * (BENG_CHANNELS // 16 + 1)], int32)
    onembed = array([2 * inembed[0]], int32)
    plan = cufft.cufftPlanMany(1, n.ctypes.data,
                               inembed.ctypes.data, 1, inembed[0],
                               onembed.ctypes.data, 1, onembed[0],
                               cufft.CUFFT_C2R, batch)
    data_shape = (batch, inembed[0])
    cpu_data = standard_normal(data_shape) + 1j * standard_normal(data_shape)
    cpu_data = cpu_data.astype(complex64)
    gpu_data = cuda.mem_alloc(8 * batch * inembed[0])  # complex64
    cuda.memcpy_htod(gpu_data, cpu_data)
    tic.record()
    cufft.cufftExecC2R(plan, int(gpu_data), int(gpu_data))
    toc.record()
    toc.synchronize()
    cpu_result = np.empty(batch * onembed[0], dtype=np.float32)
    cuda.memcpy_dtoh(cpu_result, gpu_data)
    cpu_result = cpu_result.reshape(
        (batch, onembed[0]))[:, :2 * BENG_CHANNELS_] / (2 * BENG_CHANNELS_)
    result = irfft(cpu_data[:, :BENG_CHANNELS], axis=-1)
    # destroy plan (the other scenarios do this; omitting it leaks the plan)
    cufft.cufftDestroy(plan)
    print 'Batched in-place scenario'
    print 'test passed:', np.allclose(cpu_result, result)
    print 'GPU time:', tic.time_till(toc), ' ms = ', \
        tic.time_till(toc) / (batch * 0.5 * 13.128e-3), ' x real (both SB)'
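# ---------------------------------------------------------------------------
# Hedged sketch of the module context assumed by every snippet in this file.
# The imports and constants below are reconstructed from the docstrings
# (16385 channels in, 32768 samples out, a 4096/2496 rate ratio); values
# marked "assumed" are not given in this excerpt. kernel_source, referenced
# by the resampling functions, is a CUDA source string defined elsewhere.
# ---------------------------------------------------------------------------
import numpy as np
from numpy import (array, int32, float32, float64, complex64, empty, zeros,
                   hstack, unique, arange, ceil, floor, median, allclose)
from numpy.random import standard_normal, seed
from numpy.fft import irfft, rfft
from scipy.interpolate import interp1d
import pycuda.autoinit
import pycuda.driver as cuda
from pycuda.compiler import SourceModule
import scikits.cuda.cufft as cufft

BENG_CHANNELS_ = 2**14               # 16384 B-engine channels
BENG_CHANNELS = BENG_CHANNELS_ + 1   # 16385 positive-frequency bins
SWARM_RATE = 2496e6                  # SWARM sample rate [Hz]
R2DBE_RATE = 4096e6                  # R2DBE sample rate [Hz]
BENG_SNAPSHOTS = 128                 # snapshots per B-engine frame (assumed)
BENG_BUFFER_IN_COUNTS = 4            # buffered B-engine frames (assumed)
# One snapshot spans 2*BENG_CHANNELS_/SWARM_RATE ~= 13.128 us, hence the
# 13.128e-3 ms figure in the timing printouts. Each benchmark scenario takes
# a pair of CUDA events for timing, e.g.:
#   tic, toc = cuda.Event(), cuda.Event()
#   scenario_inplace_padded_C2R(39, tic, toc)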
def __fft_linear_interp(self):
    '''
    Resample using linear interpolation.
    '''
    self.logger.debug('Resampling using linear interpolation')
    threads_per_block = 512
    blocks_per_grid = int(
        ceil(1. * self.num_r2dbe_samples / threads_per_block))
    # allocate device memory
    self.__gpu_time_series_1 = cuda.mem_alloc(
        4 * self.num_r2dbe_samples / 2)  # 2048 MHz clock
    self.__gpu_time_series_0 = cuda.mem_alloc(
        4 * self.num_r2dbe_samples / 2)  # 2048 MHz clock
    gpu_r2dbe_spec = cuda.mem_alloc(
        8 * (4096 / 2 + 1) * self.__bandlimit_batch)  # memory peanuts
    for (phased_sum_in, phased_sum_out) in zip(
            (self.__gpu_beng_0, self.__gpu_beng_1),
            (self.__gpu_time_series_0, self.__gpu_time_series_1)):
        # Turn SWARM snapshots into time series (in-place C2R)
        cufft.cufftExecC2R(self.__plan_A, int(phased_sum_in),
                           int(phased_sum_in))
        # resample to the R2DBE rate by linear interpolation
        gpu_resamp = cuda.mem_alloc(4 * self.num_r2dbe_samples)  # 25% of device memory
        self.__linear_interp(phased_sum_in,
                             int32(self.num_swarm_samples),
                             gpu_resamp,
                             int32(self.num_r2dbe_samples),
                             float64(SWARM_RATE / R2DBE_RATE),
                             float32(1.),  # or float32(1./(2*BENG_CHANNELS_))
                             block=(threads_per_block, 1, 1),
                             grid=(blocks_per_grid, 1))
        phased_sum_in.free()
        # loop through the resampled time series in chunks of
        # 4096 * __bandlimit_batch samples
        for ib in range(self.num_r2dbe_samples / 4096 /
                        self.__bandlimit_batch):
            # compute spectrum with 4096 MHz sample clock
            cufft.cufftExecR2C(
                self.__plan_B,
                int(gpu_resamp) + int(4 * ib * 4096 * self.__bandlimit_batch),
                int(gpu_r2dbe_spec))
            # invert to time series with a bandwidth of 1024 MHz, masking out
            # the first 150 MHz and the last (1024-150) MHz
            # (pointer offsets are in bytes)
            cufft.cufftExecC2R(
                self.__plan_C,
                int(gpu_r2dbe_spec) + int(8 * 150),
                int(phased_sum_out) + int(4 * ib * 2048 *
                                          self.__bandlimit_batch))
        gpu_resamp.free()
    gpu_r2dbe_spec.free()
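# Hedged sketch: the interpolation kernel bound to self.__linear_interp is
# not part of this excerpt. From the launch above (input pointer, input
# length, output pointer, output length, float64 rate ratio, float32 scale)
# it is presumably shaped like the following; the name and body are
# assumptions, not the actual kernel_source. (Note that fft_interp below
# launches its 'linear'/'nearest' kernels with a five-argument signature.)
linear_interp_sketch = SourceModule("""
__global__ void linear(float *in, int n_in, float *out, int n_out,
                       double ratio, float scale)
{
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n_out) {
    double t = i * ratio;          /* fractional sample index into input */
    int j = (int)t;
    float frac = (float)(t - j);
    if (j + 1 < n_in)
      out[i] = scale * ((1.0f - frac) * in[j] + frac * in[j + 1]);
  }
}
""").get_function('linear')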
def scenario_contiguous_channels_wpadding(batch, tic, toc):
    '''
    # Scenario: batched IFFT of batch snapshots
    # batch x 16385 complex64 --> batch x 32768 float32
    # complex input padded so the channel dimension has 16400 elements.
    '''
    n = array([2 * BENG_CHANNELS_], int32)
    beng_channels_padded = 16400
    # create batched FFT plan configuration
    inembed = array([beng_channels_padded], int32)
    onembed = array([2 * BENG_CHANNELS_], int32)
    idist = int32(beng_channels_padded)  # distance between batch inputs
    plan = cufft.cufftPlanMany(int32(1), n.ctypes.data,
                               inembed.ctypes.data, int32(1), idist,
                               onembed.ctypes.data, int32(1),
                               int32(2 * BENG_CHANNELS_),
                               cufft.CUFFT_C2R, int32(batch))
    # construct arrays
    gpu_in = cuda.mem_alloc(8 * batch * beng_channels_padded)   # complex64
    gpu_out = cuda.mem_alloc(4 * batch * 2 * BENG_CHANNELS_)    # float32
    cpu_in = standard_normal(batch * beng_channels_padded) + \
        1j * standard_normal(batch * beng_channels_padded)
    cpu_in = cpu_in.astype(complex64)
    cuda.memcpy_htod(gpu_in, cpu_in)
    # execute plan
    tic.record()
    cufft.cufftExecC2R(plan, int(gpu_in), int(gpu_out))
    toc.record()
    toc.synchronize()
    # read out result
    cpu_out = empty(batch * 2 * BENG_CHANNELS_, float32)
    cuda.memcpy_dtoh(cpu_out, gpu_out)
    cpu_out.resize((batch, 2 * BENG_CHANNELS_))
    # execute on CPU
    cpu = irfft(cpu_in.reshape(
        (batch, beng_channels_padded))[:, :BENG_CHANNELS], axis=-1)
    # destroy plan
    cufft.cufftDestroy(plan)
    # test
    print '\nContiguous Channel w/ Padding Scenario:'
    print '1-D %d-element C2R iFFT in batch of %d.' % (n[0], batch)
    print 'test results:', 'pass' if allclose(
        cpu, cpu_out / (2 * BENG_CHANNELS_)) else 'fail'
    print 'real time:', batch * 13.128e-3, ' ms'
    print 'GPU time:', tic.time_till(toc), ' ms = ', \
        tic.time_till(toc) / (batch * 0.5 * 13.128e-3), ' x real (both SB)'
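# Note on the pad size: 16400 is BENG_CHANNELS (16385) rounded up to the next
# multiple of 16, matching the 16 * (BENG_CHANNELS // 16 + 1) = 16 * 1025
# expression used in scenario_inplace_padded_C2R, presumably so each row of
# complex64 channels (16 * 8 = 128 bytes per group) stays 128-byte aligned.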
def scenario_contiguous_channels(batch, tic, toc):
    '''
    # Scenario: batched IFFT of batch snapshots
    # batch x 16385 complex64 --> batch x 32768 float32
    # no padding.
    '''
    n = array([2 * BENG_CHANNELS_], int32)
    seed(12740)
    # create batched FFT plan configuration
    inembed = array([BENG_CHANNELS], int32)
    onembed = array([2 * BENG_CHANNELS_], int32)
    plan = cufft.cufftPlanMany(1, n.ctypes.data,
                               inembed.ctypes.data, 1, BENG_CHANNELS,
                               onembed.ctypes.data, 1, 2 * BENG_CHANNELS_,
                               cufft.CUFFT_C2R, batch)
    # construct arrays
    gpu_in = cuda.mem_alloc(8 * batch * BENG_CHANNELS)        # complex64
    gpu_out = cuda.mem_alloc(4 * batch * 2 * BENG_CHANNELS_)  # float32
    cpu_in = standard_normal(batch * BENG_CHANNELS) + \
        1j * standard_normal(batch * BENG_CHANNELS)
    cpu_in = cpu_in.astype(complex64)
    cuda.memcpy_htod(gpu_in, cpu_in)
    # execute plan
    tic.record()
    cufft.cufftExecC2R(plan, int(gpu_in), int(gpu_out))
    toc.record()
    toc.synchronize()
    # read out result
    cpu_out = empty(batch * 2 * BENG_CHANNELS_, float32)
    cuda.memcpy_dtoh(cpu_out, gpu_out)
    cpu_out.resize((batch, 2 * BENG_CHANNELS_))
    # execute on CPU
    cpu = irfft(cpu_in.reshape((batch, BENG_CHANNELS)), axis=-1)
    # destroy plan
    cufft.cufftDestroy(plan)
    # test
    print '\nContiguous Channel Scenario:'
    print '1-D %d-element C2R iFFT in batch of %d.' % (n[0], batch)
    print 'test results:', 'pass' if allclose(
        cpu, cpu_out / (2 * BENG_CHANNELS_)) else 'fail'
    print 'real time:', batch * 13.128e-3, ' ms'
    print 'GPU time:', tic.time_till(toc), ' ms = ', \
        tic.time_till(toc) / (batch * 0.5 * 13.128e-3), ' x real (both SB)'
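# Hedged note on the scikits.cuda cufftPlanMany wrapper used throughout this
# file: its positional arguments map directly onto the CUFFT C API as
#
#   cufftPlanMany(rank, n, inembed, istride, idist,
#                 onembed, ostride, odist, type, batch)
#
# so every plan here uses unit strides (contiguous channels) with idist/odist
# equal to the per-snapshot input/output lengths, and the n/inembed/onembed
# arrays are passed as raw pointers via .ctypes.data.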
def scenario_contiguous_channels_oversampled64(batch, tic, toc):
    '''
    Scenario: batched IFFT of 2*2**14*64 channels
    '''
    fft_window_oversample = 64 * 2 * 2**14
    n = array([fft_window_oversample], int32)
    # create batched FFT plan configuration
    inembed = array([fft_window_oversample / 2 + 1], int32)
    onembed = array([fft_window_oversample], int32)
    plan = cufft.cufftPlanMany(1, n.ctypes.data,
                               inembed.ctypes.data, 1,
                               fft_window_oversample / 2 + 1,
                               onembed.ctypes.data, 1, fft_window_oversample,
                               cufft.CUFFT_C2R, batch)
    # construct arrays
    gpu_in = cuda.mem_alloc(8 * batch *
                            (fft_window_oversample / 2 + 1))    # complex64
    gpu_out = cuda.mem_alloc(4 * batch * fft_window_oversample)  # float32
    data_shape = (batch, fft_window_oversample / 2 + 1)
    cpu_in = standard_normal(data_shape) + 1j * standard_normal(data_shape)
    cpu_in = cpu_in.astype(complex64)
    cuda.memcpy_htod(gpu_in, cpu_in)
    # execute plan
    tic.record()
    cufft.cufftExecC2R(plan, int(gpu_in), int(gpu_out))
    toc.record()
    toc.synchronize()
    # read out result
    cpu_out = empty((batch, fft_window_oversample), float32)
    cuda.memcpy_dtoh(cpu_out, gpu_out)
    # execute on CPU
    cpu = irfft(cpu_in, axis=-1)
    # destroy plan
    cufft.cufftDestroy(plan)
    # test
    print '\nOversampling by x64 Scenario with batches:'
    print '1-D %d-element C2R iFFT in batch of %d.' % (n[0], batch)
    print 'test results:', 'pass' if allclose(
        cpu, cpu_out / fft_window_oversample) else 'fail'
    print 'real time:', batch * 13.128e-3, ' ms'
    print 'GPU time:', tic.time_till(toc), ' ms = ', \
        tic.time_till(toc) / (batch * 0.5 * 13.128e-3), ' x real (both SB)'
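# Size check for the x64 oversampled transform: 64 * 2 * 2**14 = 2097152
# points per batch element, so each element carries 8 * (2**20 + 1) bytes of
# complex64 input and 4 * 2**21 bytes of float32 output (roughly 8 MiB each),
# which bounds the usable batch size on a given device.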
def __fft_resample(self):
    '''
    Resample using FFTs. Requires that the use_fft_resample flag is True.
    Individual phased sums are resampled at a 2048 MHz clock.
    '''
    self.logger.debug('Resampling using FFTs')
    # device memory allocation
    self.__gpu_time_series_0 = cuda.mem_alloc(
        4 * self.num_r2dbe_samples / 2)  # 2048 MHz clock
    self.__gpu_time_series_1 = cuda.mem_alloc(
        4 * self.num_r2dbe_samples / 2)  # 2048 MHz clock
    gpu_swarm = cuda.mem_alloc(4 * self.num_swarm_samples)
    # loop over phased sums
    for (phased_sum_in, phased_sum_out) in zip(
            (self.__gpu_beng_0, self.__gpu_beng_1),
            (self.__gpu_time_series_0, self.__gpu_time_series_1)):
        # Turn SWARM snapshots into time series
        cufft.cufftExecC2R(self.__plan_A, int(phased_sum_in), int(gpu_swarm))
        phased_sum_in.free()
        gpu_tmp = cuda.mem_alloc(8 * int(39 * BENG_CHANNELS_ + 1) *
                                 self.__bandlimit_batch)
        for ib in range((BENG_BUFFER_IN_COUNTS - 1) * BENG_SNAPSHOTS / 39 /
                        self.__bandlimit_batch):
            # Turn concatenated SWARM time series into a single spectrum
            cufft.cufftExecR2C(
                self.__plan_B,
                int(gpu_swarm) + int(4 * 39 * 2 * BENG_CHANNELS_ *
                                     self.__bandlimit_batch * ib),
                int(gpu_tmp))
            # Turn padded SWARM spectrum into time series with R2DBE
            # sampling rate, skipping the first 150 MHz of the band
            # (pointer offsets are in bytes)
            cufft.cufftExecC2R(
                self.__plan_C,
                int(gpu_tmp) + int(8 * 150 * 512),
                int(phased_sum_out) + int(4 * 32 * 2 * BENG_CHANNELS_ * ib *
                                          self.__bandlimit_batch))
        gpu_tmp.free()
    gpu_swarm.free()
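# Worked numbers for the byte offsets in __fft_resample: plan_B transforms
# 39 * 2 * BENG_CHANNELS_ = 1277952 SWARM-rate samples per batch element,
# spanning 1277952 / 2496e6 s, so one spectral bin is
# 2496e6 / 1277952 = 1953.125 Hz wide. The offset 8 * 150 * 512 bytes
# therefore skips 150 * 512 = 76800 complex64 bins, exactly 150 MHz. Each
# C2R output chunk holds 2048e6 * 1277952 / 2496e6 = 32 * 2 * BENG_CHANNELS_
# samples per batch element at the 2048 MHz clock, matching the
# 4 * 32 * 2 * BENG_CHANNELS_ output stride. In __fft_linear_interp the
# 4096-point R2C at 4096 MHz gives 1 MHz bins, so its offset of 8 * 150
# bytes skips the same 150 MHz.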
def scenario_contiguous_batched39_resample(num_snapshots, tic, toc):
    '''
    # Scenario: Fourier resample of num_snapshots snapshots
    # A iFFT: [num_snapshots, 16385] complex64 -->
    #         [num_snapshots/39, 39*32768] float32
    # B FFT:  [num_snapshots/39, 39*32768] float32 -->
    #         [num_snapshots/39, 39*32768*(4096/2496)/2 + 1] complex64
    #         (zero-padded)
    # C iFFT: [num_snapshots/39, 39*32768*(4096/2496)/2 + 1] complex64 -->
    #         [num_snapshots/39, 39*32768*4096/2496] float32
    #
    # C(B(A(gpu_1))) = C(B(gpu_2)) = C(gpu_1) = gpu_2
    # num_snapshots must be a multiple of 39.
    # A executed using batch = num_snapshots
    # B&C executed using batch = num_snapshots / 39
    '''
    print '\nContiguous channel Fourier resampling scenario in batches of 39:'
    assert num_snapshots % 39 == 0, \
        'error: num_snapshots must be an integer multiple of 39'
    # construct arrays
    batch = num_snapshots / 39
    print 'batch: %d' % batch
    # per-batch lengths after resampling
    resample_len = int(39 * 2 * BENG_CHANNELS_ * R2DBE_RATE / SWARM_RATE)
    spectrum_len = resample_len / 2 + 1
    gpu_1 = cuda.mem_alloc(8 * batch * spectrum_len)
    gpu_2 = cuda.mem_alloc(4 * batch * resample_len)
    cpu_in = standard_normal(num_snapshots * BENG_CHANNELS) + \
        1j * standard_normal(num_snapshots * BENG_CHANNELS)
    cpu_in = cpu_in.astype(complex64)
    # create FFT plans
    n_A = array([2 * BENG_CHANNELS_], int32)
    inembed_A = array([BENG_CHANNELS], int32)
    onembed_A = array([2 * BENG_CHANNELS_], int32)
    plan_A = cufft.cufftPlanMany(1, n_A.ctypes.data,
                                 inembed_A.ctypes.data, 1, BENG_CHANNELS,
                                 onembed_A.ctypes.data, 1, 2 * BENG_CHANNELS_,
                                 cufft.CUFFT_C2R, num_snapshots)
    n_B = array([39 * 2 * BENG_CHANNELS_], int32)
    inembed_B = array([39 * 2 * BENG_CHANNELS_], int32)
    onembed_B = array([spectrum_len], int32)
    plan_B = cufft.cufftPlanMany(1, n_B.ctypes.data,
                                 inembed_B.ctypes.data, 1,
                                 39 * 2 * BENG_CHANNELS_,
                                 onembed_B.ctypes.data, 1,
                                 int32(spectrum_len),
                                 cufft.CUFFT_R2C, batch)
    n_C = array([resample_len], int32)
    inembed_C = array([spectrum_len], int32)
    onembed_C = array([resample_len], int32)
    plan_C = cufft.cufftPlanMany(1, n_C.ctypes.data,
                                 inembed_C.ctypes.data, 1,
                                 int32(spectrum_len),
                                 onembed_C.ctypes.data, 1,
                                 int32(resample_len),
                                 cufft.CUFFT_C2R, batch)
    # fetch kernel that zeroes out gpu_1
    kernel_module = SourceModule(kernel_source)
    zero_out = kernel_module.get_function('zero_out')
    # sanity check that zero_out clears the whole spectrum buffer
    zero_out(gpu_1, int32(batch * spectrum_len),
             block=(1024, 1, 1),
             grid=(int(ceil(batch * spectrum_len / 1024.)), 1))
    cpu_out = empty(batch * spectrum_len, complex64)
    cuda.memcpy_dtoh(cpu_out, gpu_1)
    assert len(unique(cpu_out)) == 1, 'problem with zero_out'
    # move data to device
    cuda.memcpy_htod(gpu_1, cpu_in)
    tic.record()
    # Turn SWARM snapshots into time series
    cufft.cufftExecC2R(plan_A, int(gpu_1), int(gpu_2))
    # zero out gpu_1 so the unfilled spectral tail acts as zero-padding
    zero_out(gpu_1, int32(batch * spectrum_len),
             block=(1024, 1, 1),
             grid=(int(ceil(batch * spectrum_len / 1024.)), 1))
    # Turn concatenated SWARM time series into single spectrum (zero-padded)
    cufft.cufftExecR2C(plan_B, int(gpu_2), int(gpu_1))
    # Turn padded SWARM spectrum into time series at the R2DBE sampling rate
    cufft.cufftExecC2R(plan_C, int(gpu_1), int(gpu_2))
    toc.record()
    toc.synchronize()
    # check on CPU
    cpu_A = irfft(cpu_in.reshape(num_snapshots, BENG_CHANNELS),
                  axis=-1).astype(float32)
    cpu_B = rfft(cpu_A.reshape(batch, 39 * 2 * BENG_CHANNELS_),
                 axis=-1).astype(complex64)
    cpu_C = irfft(hstack([cpu_B,
                          zeros((batch,
                                 spectrum_len - (39 * BENG_CHANNELS_ + 1)),
                                complex64)]),
                  axis=-1)
    cpu_out = empty(batch * resample_len, float32)
    cuda.memcpy_dtoh(cpu_out, gpu_2)
    print 'test results: ', 'pass' if allclose(
        cpu_C.flatten(),
        cpu_out / (cpu_C.shape[-1] * 2 * BENG_CHANNELS_)) else 'fail'
    print 'max residual: ', max(abs(cpu_C.flatten() -
                                    cpu_out / (cpu_C.shape[-1] *
                                               2 * BENG_CHANNELS_)))
    print 'GPU time:', tic.time_till(toc), ' ms = ', \
        tic.time_till(toc) / (num_snapshots * 0.5 * 13.128e-3), \
        ' x real (both SB)'
    # destroy plans
    cufft.cufftDestroy(plan_A)
    cufft.cufftDestroy(plan_B)
    cufft.cufftDestroy(plan_C)
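# Why batches of 39: R2DBE_RATE / SWARM_RATE = 4096/2496 = 64/39, so a single
# 32768-sample snapshot does not resample to an integer number of samples,
# but 39 snapshots do:
#   39 * 2 * BENG_CHANNELS_ * (64/39) = 64 * 32768 = 2097152.
# Grouping 39 SWARM FFT windows therefore lets plans B and C use exact
# integer transform lengths.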
def fft_batched(gpu_1, gpu_2, num_snapshots, snapshots_per_batch=39,
                cpu_check=True):
    '''
    gpu_1: pointer to an Mx16385 array on the GPU device, where the zeroth
           dimension is increasing snapshot index and the first dimension is
           the positive-frequency half of the spectrum. This array will be
           destroyed. Must have byte size:
           int(8*batch_size*(snapshots_per_batch*BENG_CHANNELS_*R2DBE_RATE/SWARM_RATE+1))
    gpu_2: pointer to result
    snapshots_per_batch: number of snapshots grouped for resampling
           (num_snapshots % snapshots_per_batch == 0)
    '''
    tic = cuda.Event()
    toc = cuda.Event()
    batch_size = num_snapshots / snapshots_per_batch
    print 'batch size: %d' % batch_size
    # derived per-batch lengths
    spectrum_len = int(snapshots_per_batch * BENG_CHANNELS_ *
                       R2DBE_RATE / SWARM_RATE + 1)
    resample_len = int(snapshots_per_batch * 2 * BENG_CHANNELS_ *
                       R2DBE_RATE / SWARM_RATE)
    # create FFT plans
    n_A = array([2 * BENG_CHANNELS_], int32)
    inembed_A = array([BENG_CHANNELS], int32)
    onembed_A = array([2 * BENG_CHANNELS_], int32)
    plan_A = cufft.cufftPlanMany(1, n_A.ctypes.data,
                                 inembed_A.ctypes.data, 1, BENG_CHANNELS,
                                 onembed_A.ctypes.data, 1, 2 * BENG_CHANNELS_,
                                 cufft.CUFFT_C2R, num_snapshots)
    n_B = array([snapshots_per_batch * 2 * BENG_CHANNELS_], int32)
    inembed_B = array([snapshots_per_batch * 2 * BENG_CHANNELS_], int32)
    onembed_B = array([spectrum_len], int32)
    plan_B = cufft.cufftPlanMany(1, n_B.ctypes.data,
                                 inembed_B.ctypes.data, 1,
                                 snapshots_per_batch * 2 * BENG_CHANNELS_,
                                 onembed_B.ctypes.data, 1,
                                 int32(spectrum_len),
                                 cufft.CUFFT_R2C, batch_size)
    n_C = array([resample_len], int32)
    inembed_C = array([spectrum_len], int32)
    onembed_C = array([resample_len], int32)
    plan_C = cufft.cufftPlanMany(1, n_C.ctypes.data,
                                 inembed_C.ctypes.data, 1,
                                 int32(spectrum_len),
                                 onembed_C.ctypes.data, 1,
                                 int32(resample_len),
                                 cufft.CUFFT_C2R, batch_size)
    # fetch kernel that zeroes out an array
    kernel_module = SourceModule(kernel_source)
    zero_out = kernel_module.get_function('zero_out')
    tic.record()
    # Turn SWARM snapshots into time series
    cufft.cufftExecC2R(plan_A, int(gpu_1), int(gpu_2))
    # zero out gpu_1 so the unfilled spectral tail acts as zero-padding
    zero_out(gpu_1, int32(batch_size * spectrum_len),
             block=(1024, 1, 1),
             grid=(int(ceil(batch_size * spectrum_len / 1024.)), 1))
    # Turn concatenated SWARM time series into single spectrum (zero-padded)
    cufft.cufftExecR2C(plan_B, int(gpu_2), int(gpu_1))
    # Turn padded SWARM spectrum into time series at the R2DBE sampling rate
    cufft.cufftExecC2R(plan_C, int(gpu_1), int(gpu_2))
    toc.record()
    toc.synchronize()
    # check on CPU
    if cpu_check:
        # NOTE: assumes the host-side input cpu_in is visible at module scope
        cpu_A = irfft(cpu_in.reshape(num_snapshots, BENG_CHANNELS),
                      axis=-1).astype(float32)
        cpu_B = rfft(cpu_A.reshape(batch_size,
                                   snapshots_per_batch * 2 * BENG_CHANNELS_),
                     axis=-1).astype(complex64)
        cpu_C = irfft(hstack([cpu_B,
                              zeros((batch_size,
                                     spectrum_len -
                                     (snapshots_per_batch *
                                      BENG_CHANNELS_ + 1)),
                                    complex64)]),
                      axis=-1)
        cpu_out = empty(batch_size * resample_len, float32)
        cuda.memcpy_dtoh(cpu_out, gpu_2)
        print 'test results: ', 'pass' if allclose(
            cpu_C.flatten(),
            cpu_out / (cpu_C.shape[-1] * 2 * BENG_CHANNELS_)) else 'fail'
        print 'max residual: ', max(abs(cpu_C.flatten() -
                                        cpu_out / (cpu_C.shape[-1] *
                                                   2 * BENG_CHANNELS_)))
    print 'GPU time:', tic.time_till(toc), ' ms = ', \
        tic.time_till(toc) / (num_snapshots * 0.5 * 13.128e-3), \
        ' x real (both SB)'
    # destroy plans
    cufft.cufftDestroy(plan_A)
    cufft.cufftDestroy(plan_B)
    cufft.cufftDestroy(plan_C)
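# Hedged sketch: kernel_source is defined outside this excerpt. Given how
# zero_out is launched (device pointer plus a complex-element count, 1024
# threads per block) it presumably just clears both components of each
# complex64 element; the body below is an assumption, not the actual kernel.
zero_out_sketch = SourceModule("""
__global__ void zero_out(float2 *a, int n)
{
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) {
    a[i].x = 0.0f;
    a[i].y = 0.0f;
  }
}
""").get_function('zero_out')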
def fft_interp(gpu_1, gpu_2, num_snapshots, interp_kind='nearest',
               cpu_check=True):
    '''
    Batched FFT to time series and then interpolation to resample.
    No filter applied yet...
    '''
    tic = cuda.Event()
    toc = cuda.Event()
    batch_size = num_snapshots
    print 'batch size: %d' % batch_size
    # create batched FFT plan configuration
    n = array([2 * BENG_CHANNELS_], int32)
    inembed = array([BENG_CHANNELS], int32)
    onembed = array([2 * BENG_CHANNELS_], int32)
    plan = cufft.cufftPlanMany(1, n.ctypes.data,
                               inembed.ctypes.data, 1, BENG_CHANNELS,
                               onembed.ctypes.data, 1, 2 * BENG_CHANNELS_,
                               cufft.CUFFT_C2R, batch_size)
    # fetch kernel that resamples
    kernel_module = SourceModule(kernel_source)
    interp_1d = kernel_module.get_function(interp_kind)
    # execute plan
    cufft.cufftExecC2R(plan, int(gpu_1), int(gpu_2))
    # interpolate; the 'linear' and 'nearest' kernels share the same launch
    # signature, so a single call covers both
    tic.record()
    xs_size = int(floor(batch_size * 2 * BENG_CHANNELS_ *
                        R2DBE_RATE / SWARM_RATE)) - 1
    TPB = 512                           # threads per block
    nB = int(ceil(1. * xs_size / TPB))  # number of blocks
    interp_1d(gpu_2, gpu_1, int32(xs_size),
              float64(SWARM_RATE / R2DBE_RATE),
              float32(1. / (2 * BENG_CHANNELS_)),
              block=(TPB, 1, 1), grid=(nB, 1))
    toc.record()
    toc.synchronize()
    print 'GPU time:', tic.time_till(toc), ' ms = ', \
        tic.time_till(toc) / (num_snapshots * 0.5 * 13.128e-3), \
        ' x real (both SB)'
    # destroy plan
    cufft.cufftDestroy(plan)
    # check on CPU
    if cpu_check:
        # NOTE: assumes the host-side input cpu_in is visible at module scope
        # timestep sizes for SWARM and R2DBE rates
        dt_s = 1.0 / SWARM_RATE
        dt_r = 1.0 / R2DBE_RATE
        # the timespan of one SWARM FFT window
        T_s = dt_s * 2 * BENG_CHANNELS_
        # the timespan of all SWARM data
        T_s_all = T_s * batch_size
        # get time-domain signal
        xs_swarm_rate = irfft(cpu_in, n=2 * BENG_CHANNELS_, axis=1).flatten()
        # and calculate sample points
        t_swarm_rate = arange(0, T_s_all, dt_s)
        print t_swarm_rate[0], t_swarm_rate[-1]
        # calculate resample points (subtract one dt_s from the end to
        # avoid extrapolation)
        t_r2dbe_rate = arange(0, T_s_all - dt_s, dt_r)
        # and interpolate
        x_interp = interp1d(t_swarm_rate, xs_swarm_rate, kind=interp_kind)
        cpu_A = x_interp(t_r2dbe_rate)
        cpu_out = np.empty_like(cpu_A, dtype=float32)
        cuda.memcpy_dtoh(cpu_out, gpu_1)
        print 'median residual: ', median(abs(cpu_A - cpu_out))
        if interp_kind == 'nearest':
            # mask out samples where scipy and the GPU kernel may round
            # half-integer sample points differently
            cpu_A[::32] = 0
            cpu_out[::32] = 0
        print 'test results: ', 'pass' if allclose(cpu_A, cpu_out) else 'fail'
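# Hedged usage sketch for the two resampling entry points above; buffer sizes
# follow the fft_batched docstring, and cpu_in is staged at module scope
# because the cpu_check branches read it from there. The snapshot count is an
# arbitrary example value, and kernel_source must be defined for this to run.
if __name__ == '__main__':
    num_snapshots = 39 * 4
    batch_size = num_snapshots / 39
    cpu_in = (standard_normal(num_snapshots * BENG_CHANNELS) +
              1j * standard_normal(num_snapshots * BENG_CHANNELS))
    cpu_in = cpu_in.astype(complex64)
    spectrum_len = int(39 * BENG_CHANNELS_ * R2DBE_RATE / SWARM_RATE + 1)
    resample_len = int(39 * 2 * BENG_CHANNELS_ * R2DBE_RATE / SWARM_RATE)
    gpu_1 = cuda.mem_alloc(8 * batch_size * spectrum_len)
    gpu_2 = cuda.mem_alloc(4 * batch_size * resample_len)
    cuda.memcpy_htod(gpu_1, cpu_in)
    fft_batched(gpu_1, gpu_2, num_snapshots)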
if DEBUG:
    print 'DEBUG::loading cpu_beng_spectra_1'
    gpumeminfo(cuda)
    cpu_beng_spectra_1 = empty(
        ((BENG_BUFFER_IN_COUNTS - 1) * BENG_SNAPSHOTS, BENG_CHANNELS),
        dtype=complex64)
    cuda.memcpy_dtoh(cpu_beng_spectra_1, gpu_beng_1)

# allocate memory for time series
gpu_r2dbe = cuda.mem_alloc(4 * num_r2dbe_samples / 2)
if not in_place:
    gpu_swarm = cuda.mem_alloc(4 * num_swarm_samples)

for SB in (gpu_beng_0, gpu_beng_1):
    # for SB in (gpu_beng_1,):
    # Turn SWARM snapshots into time series
    if in_place:
        cufft.cufftExecC2R(plan_A, int(SB), int(SB))
    else:
        cufft.cufftExecC2R(plan_A, int(SB), int(gpu_swarm))
        SB.free()
    if DEBUG:
        print 'DEBUG::loading cpu_beng_timeseries_1'
        gpumeminfo(cuda)
        if in_place:
            cpu_beng_timeseries_1 = empty(
                ((BENG_BUFFER_IN_COUNTS - 1) * BENG_SNAPSHOTS,
                 2 * BENG_CHANNELS),
                dtype=float32)
            cuda.memcpy_dtoh(cpu_beng_timeseries_1, SB)
            cpu_beng_timeseries_1 = \
                cpu_beng_timeseries_1[:, :2 * BENG_CHANNELS_]
        else:
            cpu_beng_timeseries_1 = empty(
                ((BENG_BUFFER_IN_COUNTS - 1) * BENG_SNAPSHOTS,
                 2 * BENG_CHANNELS_),
                dtype=float32)
            cuda.memcpy_dtoh(cpu_beng_timeseries_1, gpu_swarm)

    # loop over chunks of 39 SWARM snapshots