def gpu_getmax(map):
    """
    Use pycuda to get the maximum absolute deviation of the residual map,
    with the correct sign.
    """
    imax = gpu.max(cumath.fabs(map)).get()
    if gpu.max(map).get() != imax:
        imax *= -1
    return np.float32(imax)
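A minimal usage sketch for gpu_getmax, assuming numpy and PyCUDA are imported under the aliases the snippet uses (np, gpu, cumath); the residual values below are made up for illustration.

import numpy as np
import pycuda.autoinit  # creates a CUDA context
import pycuda.gpuarray as gpu
import pycuda.cumath as cumath

# Hypothetical residual map whose largest absolute value is negative.
d_res = gpu.to_gpu(np.array([0.5, -2.0, 1.25], dtype=np.float32))
print(gpu_getmax(d_res))  # -2.0: largest deviation, with its sign preserved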
def test_fabs(self):
    """tests if the fabs function works"""
    a = simplearray.array(test_sample).fill_arange() * -1
    b = cumath.fabs(a)

    for i in range(test_sample):
        self.assert_(a[i] + b[i] == 0)
        self.assert_(b[i] >= 0)
def calculateTimestep(meshPropSpeedsGPU, cellDim):
    maxPropSpeed = gpuarray.max(cumath.fabs(meshPropSpeedsGPU)).get()
    return cellDim / (4.0 * maxPropSpeed)
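A small usage sketch for calculateTimestep, assuming the gpuarray/cumath aliases used in the snippet; the propagation-speed field and the cell size of 0.1 are invented for illustration.

import numpy as np
import pycuda.autoinit  # creates a CUDA context
import pycuda.gpuarray as gpuarray
import pycuda.cumath as cumath

# Hypothetical per-cell propagation speeds (signed); the timestep is limited
# by the largest speed magnitude anywhere on the mesh.
speeds = gpuarray.to_gpu(np.random.uniform(-3.0, 3.0, 1024).astype(np.float32))
print("timestep = %f" % calculateTimestep(speeds, 0.1))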
def abs_t(self, a, out):
    cumath.fabs(a, out=out)
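This wrapper writes into a caller-supplied buffer via cumath.fabs(..., out=...). A short sketch of that pattern outside the (unshown) class, with made-up input values:

import numpy as np
import pycuda.autoinit  # creates a CUDA context
import pycuda.gpuarray as gpuarray
import pycuda.cumath as cumath

a = gpuarray.to_gpu(np.array([-1.5, 2.0, -0.25], dtype=np.float32))
out = gpuarray.empty_like(a)

# Reusing a preallocated output buffer avoids allocating a new array per call.
cumath.fabs(a, out=out)
print(out.get())  # [1.5  2.   0.25]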
def cuda_gridvis(settings, plan):
    """
    Grid the visibilities parallelized by pixel.

    References:
      - Chapter 10 in "Interferometry and Synthesis in Radio Astronomy"
        by Thompson, Moran, & Swenson
      - Daniel Briggs' PhD Thesis: http://www.aoc.nrao.edu/dissertations/dbriggs/
    """
    print "Gridding the visibilities"
    t_start = time.time()

    # unpack parameters
    vfile = settings['vfile']
    briggs = settings['briggs']
    imsize = settings['imsize']
    cell = settings['cell']
    nx = np.int32(2 * imsize)
    noff = np.int32((nx - imsize) / 2)

    ## constants
    arc2rad = np.float32(np.pi / 180 / 3600.)
    du = np.float32(1. / (arc2rad * cell * nx))

    ## grab data
    f = pyfits.open(settings['vfile'])

    ## quickly figure out what data is not flagged
    freq = np.float32(f[0].header['CRVAL4'])
    good = np.where(f[0].data.data[:, 0, 0, 0, 0, 0, 0] != 0)
    h_u = np.float32(freq * f[0].data.par('uu')[good])
    h_v = np.float32(freq * f[0].data.par('vv')[good])
    gcount = np.int32(np.size(h_u))

    ## assume data is unpolarized
    h_re = np.float32(0.5 * (f[0].data.data[good, 0, 0, 0, 0, 0, 0] + f[0].data.data[good, 0, 0, 0, 0, 1, 0]))
    h_im = np.float32(0.5 * (f[0].data.data[good, 0, 0, 0, 0, 0, 1] + f[0].data.data[good, 0, 0, 0, 0, 1, 1]))

    ## make GPU arrays
    h_grd = np.zeros((nx, nx), dtype=np.complex64)
    h_cnt = np.zeros((nx, nx), dtype=np.int32)
    d_u = gpu.to_gpu(h_u)
    d_v = gpu.to_gpu(h_v)
    d_re = gpu.to_gpu(h_re)
    d_im = gpu.to_gpu(h_im)
    d_cnt = gpu.zeros((np.int(nx), np.int(nx)), np.int32)
    d_grd = gpu.zeros((np.int(nx), np.int(nx)), np.complex64)
    d_ngrd = gpu.zeros_like(d_grd)
    d_bm = gpu.zeros_like(d_grd)
    d_nbm = gpu.zeros_like(d_grd)
    d_fim = gpu.zeros((np.int(imsize), np.int(imsize)), np.float32)

    ## define kernel parameters
    blocksize2D = (8, 16, 1)
    gridsize2D = (np.int(np.ceil(1. * nx / blocksize2D[0])),
                  np.int(np.ceil(1. * nx / blocksize2D[1])))
    blocksizeF2D = (16, 16, 1)
    gridsizeF2D = (np.int(np.ceil(1. * imsize / blocksizeF2D[0])),
                   np.int(np.ceil(1. * imsize / blocksizeF2D[1])))
    blocksize1D = (256, 1, 1)
    gridsize1D = (np.int(np.ceil(1. * gcount / blocksize1D[0])), 1)

    # ------------------------
    # make gridding kernels
    # ------------------------
    ## make spheroidal convolution kernel (don't mess with these!)
    width = 6.
    ngcf = 24.
    h_cgf = gcf(ngcf, width)
    ## make grid correction
    h_corr = corrfun(nx, width)
    d_cgf = module.get_global('cgf')[0]
    d_corr = gpu.to_gpu(h_corr)
    cu.memcpy_htod(d_cgf, h_cgf)

    # ------------------------
    # grid it up
    # ------------------------
    d_umax = gpu.max(cumath.fabs(d_u))
    d_vmax = gpu.max(cumath.fabs(d_v))
    umax = np.int32(np.ceil(d_umax.get() / du))
    vmax = np.int32(np.ceil(d_vmax.get() / du))

    ## grid ($$)
    #  This should be improvable via:
    #    - shared memory solution? I tried...
    #    - better coalesced memory access? I tried...
    #    - reorganizing and indexing UV data beforehand?
    #      (i.e. http://www.nvidia.com/docs/IO/47905/ECE757_Project_Report_Gregerson.pdf)
    #    - storing V(u,v) in texture memory?
    gridVis_wBM_kernel(d_grd, d_bm, d_cnt, d_u, d_v, d_re, d_im, nx, du, gcount, umax, vmax,
                       block=blocksize2D, grid=gridsize2D)

    ## apply weights
    wgtGrid_kernel(d_bm, d_cnt, briggs, nx, block=blocksize2D, grid=gridsize2D)
    hfac = np.int32(1)
    dblGrid_kernel(d_bm, nx, hfac, block=blocksize2D, grid=gridsize2D)
    shiftGrid_kernel(d_bm, d_nbm, nx, block=blocksize2D, grid=gridsize2D)

    ## normalize
    wgtGrid_kernel(d_grd, d_cnt, briggs, nx, block=blocksize2D, grid=gridsize2D)
    ## Reflect grid about v axis
    hfac = np.int32(-1)
    dblGrid_kernel(d_grd, nx, hfac, block=blocksize2D, grid=gridsize2D)
    ## Shift both
    shiftGrid_kernel(d_grd, d_ngrd, nx, block=blocksize2D, grid=gridsize2D)

    # ------------------------
    # Make the beam
    # ------------------------
    ## Transform to image plane
    fft.fft(d_nbm, d_bm, plan)
    ## Shift
    shiftGrid_kernel(d_bm, d_nbm, nx, block=blocksize2D, grid=gridsize2D)
    ## Correct for C
    corrGrid_kernel(d_nbm, d_corr, nx, block=blocksize2D, grid=gridsize2D)
    # Trim
    trimIm_kernel(d_nbm, d_fim, noff, nx, imsize, block=blocksizeF2D, grid=gridsizeF2D)
    ## Normalize
    d_bmax = gpu.max(d_fim)
    bmax = d_bmax.get()
    bmax = np.float32(1. / bmax)
    nrmBeam_kernel(d_fim, bmax, imsize, block=blocksizeF2D, grid=gridsizeF2D)
    ## Pull onto CPU
    dpsf = d_fim.get()

    # ------------------------
    # Make the map
    # ------------------------
    ## Transform to image plane
    fft.fft(d_ngrd, d_grd, plan)
    ## Shift
    shiftGrid_kernel(d_grd, d_ngrd, nx, block=blocksize2D, grid=gridsize2D)
    ## Correct for C
    corrGrid_kernel(d_ngrd, d_corr, nx, block=blocksize2D, grid=gridsize2D)
    ## Trim
    trimIm_kernel(d_ngrd, d_fim, noff, nx, imsize, block=blocksizeF2D, grid=gridsizeF2D)
    ## Normalize (Jy/beam)
    nrmGrid_kernel(d_fim, bmax, imsize, block=blocksizeF2D, grid=gridsizeF2D)

    ## Finish timers
    t_end = time.time()
    t_full = t_end - t_start
    print "Gridding execution time %0.5f" % t_full + ' s'
    print "\t%0.5f" % (t_full / gcount) + ' s per visibility'

    ## Return dirty psf (CPU) and dirty image (GPU)
    return dpsf, d_fim
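The umax/vmax computation above is a single max-abs reduction over the baseline coordinates. A tiny stand-alone sketch of that step, with an invented cell size du and u values:

import numpy as np
import pycuda.autoinit  # creates a CUDA context
import pycuda.gpuarray as gpu
import pycuda.cumath as cumath

du = np.float32(50.0)  # hypothetical uv-cell size
d_u = gpu.to_gpu(np.array([-812.0, 317.5, 604.2], dtype=np.float32))

# Largest |u| in units of grid cells, exactly as done before the gridding kernel.
umax = np.int32(np.ceil(gpu.max(cumath.fabs(d_u)).get() / du))
print(umax)  # 17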
N = 100000

# --- Create random vectors on the CPU
h_a = np.random.randn(1, N)
h_b = np.random.randn(1, N)

# --- Set CPU arrays as single precision
h_a = h_a.astype(np.float32)
h_b = h_b.astype(np.float32)
h_c = np.empty_like(h_a)

d_a = gpuarray.to_gpu(h_a)
d_b = gpuarray.to_gpu(h_b)

start.record()
d_c = (cumath.sqrt(cumath.fabs(d_a)) + cumath.exp(d_b))
end.record()
end.synchronize()
secs = start.time_till(end) * 1e-3
print("Processing time = %fs" % (secs))

h_c = d_c.get()
if np.all(abs(h_c - (np.sqrt(np.abs(h_a)) + np.exp(h_b))) < 1e-5):
    print("Test passed!")
else:
    print("Error!")

# --- Flush context printf buffer
cuda.Context.synchronize()
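The timing snippet above references start/end events and module aliases that the excerpt omits. A minimal setup, assuming the standard PyCUDA modules, might be:

import numpy as np
import pycuda.autoinit  # creates a CUDA context
import pycuda.driver as cuda
import pycuda.gpuarray as gpuarray
import pycuda.cumath as cumath

# CUDA events used by the start.record()/end.record() timing calls above.
start = cuda.Event()
end = cuda.Event()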
def cuda_gridvis(sub_array, f, settings, plan, chan):
    """
    Grid the visibilities parallelized by pixel.

    References:
      - Chapter 10 in "Interferometry and Synthesis in Radio Astronomy"
        by Thompson, Moran, & Swenson
      - Daniel Briggs' PhD Thesis: http://www.aoc.nrao.edu/dissertations/dbriggs/
    """
    print "Gridding the visibilities"
    t_start = time.time()

    if sub_array == 1:
        Antennas = 40
    else:
        Antennas = 60

    # unpack parameters
    vfile = settings['vfile']
    briggs = settings['briggs']
    imsize = settings['imsize']
    cell = settings['cell']
    nx = np.int32(2 * imsize)
    noff = np.int32((nx - imsize) / 2)

    ## constants
    arc2rad = np.float32(np.pi / 180. / 3600.)
    du = np.float32(1. / (arc2rad * cell * nx))

    # determine the file type (uvfits or fitsidi)
    h_u = np.ndarray(shape=(Antennas * (Antennas - 1) // 2, 1), dtype='float64')
    h_v = np.ndarray(shape=(Antennas * (Antennas - 1) // 2, 1), dtype='float64')
    h_re = np.ndarray(shape=(Antennas * (Antennas - 1) // 2, 1), dtype='float32')
    h_im = np.ndarray(shape=(Antennas * (Antennas - 1) // 2, 1), dtype='float32')

    # Get visibility data and values of UVW
    if settings['vfile'].find('.uvfits') != -1:
        freq = 3.45E11  # np.float32(f[0].header['CRVAL4'])
        light_speed = 299792458.
        good = np.where(f[0].data.data[:, 0, 0, chan, 0, 0] != 0)
        h_u = np.float32(light_speed * f[0].data.par('uu')[good])
        print "h_u", h_u.shape
        h_v = np.float32(light_speed * f[0].data.par('vv')[good])
        gcount = np.int32(np.size(h_u))
        ## assume data is unpolarized
        h_re = np.float32(f[0].data.data[good, 0, 0, chan, 0, 0])
        h_im = np.float32(f[0].data.data[good, 0, 0, chan, 0, 1])

    freq = 1702500000.
    light_speed = 299792458.  # Speed of light

    ## assume data is unpolarized
    # print chan
    print 'GCOUNT', gcount

    # h_ : host, d_ : device
    h_grd = np.zeros((nx, nx), dtype=np.complex64)
    h_cnt = np.zeros((nx, nx), dtype=np.int32)
    d_u = gpu.to_gpu(np.array(h_u, dtype='float32'))
    d_v = gpu.to_gpu(np.array(h_v, dtype='float32'))
    d_re = gpu.to_gpu(np.array(h_re, dtype='float32'))
    d_im = gpu.to_gpu(np.array(h_im, dtype='float32'))
    d_cnt = gpu.zeros((np.int(nx), np.int(nx)), np.int32)
    d_grd = gpu.zeros((np.int(nx), np.int(nx)), np.complex64)
    d_ngrd = gpu.zeros_like(d_grd)
    d_bm = gpu.zeros_like(d_grd)
    d_nbm = gpu.zeros_like(d_grd)
    d_fim = gpu.zeros((np.int(imsize), np.int(imsize)), np.float32)

    ## define kernel parameters
    if imsize == 1024:
        blocksize2D = (8, 16, 1)
        gridsize2D = (np.int(np.ceil(1. * nx / blocksize2D[0])),
                      np.int(np.ceil(1. * nx / blocksize2D[1])))
        blocksizeF2D = (16, 16, 1)
        gridsizeF2D = (np.int(np.ceil(1. * imsize / blocksizeF2D[0])),
                       np.int(np.ceil(1. * imsize / blocksizeF2D[1])))
        blocksize1D = (256, 1, 1)
    else:
        blocksize2D = (16, 32, 1)
        gridsize2D = (np.int(np.ceil(1. * nx / blocksize2D[0])),
                      np.int(np.ceil(1. * nx / blocksize2D[1])))
        blocksizeF2D = (32, 32, 1)
        gridsizeF2D = (np.int(np.ceil(1. * imsize / blocksizeF2D[0])),
                       np.int(np.ceil(1. * imsize / blocksizeF2D[1])))
        blocksize1D = (512, 1, 1)
    gridsize1D = (np.int(np.ceil(1. * gcount / blocksize1D[0])), 1)

    # ------------------------
    # make gridding kernels
    # ------------------------
    ## make spheroidal convolution kernel (don't mess with these!)
    width = 6.
    ngcf = 24.
    h_cgf = gcf(ngcf, width)
    ## make grid correction
    h_corr = corrfun(nx, width)
    d_cgf = module.get_global('cgf')[0]
    d_corr = gpu.to_gpu(h_corr)
    cu.memcpy_htod(d_cgf, h_cgf)

    # ------------------------
    # grid it up
    # ------------------------
    d_umax = gpu.max(cumath.fabs(d_u))
    d_vmax = gpu.max(cumath.fabs(d_v))
    umax = np.int32(np.ceil(d_umax.get() / du))
    vmax = np.int32(np.ceil(d_vmax.get() / du))

    ## grid ($$)
    #  This should be improvable via:
    #    - shared memory solution? I tried...
    #    - better coalesced memory access? I tried...
    #    - reorganizing and indexing UV data beforehand?
    #      (i.e. http://www.nvidia.com/docs/IO/47905/ECE757_Project_Report_Gregerson.pdf)
    #    - storing V(u,v) in texture memory?
    # Each pixel in the uv plane goes through the data and checks whether the pixel
    # is included in the convolution. This kernel also calculates the point spread
    # function and the local sampling from the data (for applying the weights later).
    gridVis_wBM_kernel(d_grd, d_bm, d_cnt, d_u, d_v, d_re, d_im, nx, du, gcount, umax, vmax,
                       block=blocksize2D, grid=gridsize2D)

    ## apply weights
    wgtGrid_kernel(d_bm, d_cnt, briggs, nx, block=blocksize2D, grid=gridsize2D)
    hfac = np.int32(1)
    dblGrid_kernel(d_bm, nx, hfac, block=blocksize2D, grid=gridsize2D)
    shiftGrid_kernel(d_bm, d_nbm, nx, block=blocksize2D, grid=gridsize2D)

    ## normalize
    wgtGrid_kernel(d_grd, d_cnt, briggs, nx, block=blocksize2D, grid=gridsize2D)
    ## Reflect grid about v axis
    hfac = np.int32(-1)
    dblGrid_kernel(d_grd, nx, hfac, block=blocksize2D, grid=gridsize2D)
    ## Shift both
    shiftGrid_kernel(d_grd, d_ngrd, nx, block=blocksize2D, grid=gridsize2D)

    # ------------------------
    # Make the beam
    # ------------------------
    ## Transform to image plane
    fft.fft(d_nbm, d_bm, plan)
    ## Shift
    shiftGrid_kernel(d_bm, d_nbm, nx, block=blocksize2D, grid=gridsize2D)
    ## Correct for C
    corrGrid_kernel(d_nbm, d_corr, nx, block=blocksize2D, grid=gridsize2D)
    # Trim
    trimIm_kernel(d_nbm, d_fim, noff, nx, imsize, block=blocksizeF2D, grid=gridsizeF2D)
    ## Normalize
    d_bmax = gpu.max(d_fim)
    bmax = d_bmax.get()
    bmax = np.float32(1. / bmax)
    nrmBeam_kernel(d_fim, bmax, imsize, block=blocksizeF2D, grid=gridsizeF2D)
    ## Pull onto CPU
    dpsf = d_fim.get()

    # ------------------------
    # Make the map
    # ------------------------
    ## Transform to image plane
    fft.fft(d_ngrd, d_grd, plan)
    ## Shift
    shiftGrid_kernel(d_grd, d_ngrd, nx, block=blocksize2D, grid=gridsize2D)
    ## Correct for C
    corrGrid_kernel(d_ngrd, d_corr, nx, block=blocksize2D, grid=gridsize2D)
    ## Trim
    trimIm_kernel(d_ngrd, d_fim, noff, nx, imsize, block=blocksizeF2D, grid=gridsizeF2D)
    ## Normalize (Jy/beam)
    nrmGrid_kernel(d_fim, bmax, imsize, block=blocksizeF2D, grid=gridsizeF2D)

    ## Finish timers
    t_end = time.time()
    t_full = t_end - t_start
    print "Gridding execution time %0.5f" % t_full + ' s'
    print "\t%0.5f" % (t_full / gcount) + ' s per visibility'

    ## Return dirty psf (CPU) and dirty image (GPU)
    return dpsf, d_fim
def kernel(a):
    from pycuda.cumath import fabs
    return my_max(fabs(a))
def calculateTimestep(PropSpeedsGPU, cellDim):
    maxPropSpeed = gpuarray.max(cumath.fabs(PropSpeedsGPU)).get()
    return cellDim / (4.0 * maxPropSpeed), maxPropSpeed
H = 240
cap.set(cv.CAP_PROP_FRAME_HEIGHT, H)
cap.set(cv.CAP_PROP_FRAME_WIDTH, W)

# Grab a first frame as the reference for frame differencing.
ret, frame = cap.read()
gray_a = cv.cvtColor(frame, cv.COLOR_RGB2GRAY)

img_ori_gpu = gpuarray.to_gpu(gray_a.astype(np.float32))
img_buf_gpu = gpuarray.empty_like(img_ori_gpu)
img_sub = gpuarray.ones_like(img_ori_gpu)
img_sub = 25 * img_sub  # per-pixel change threshold
img_bgm = gpuarray.zeros_like(img_sub)

while True:
    ret, frame = cap.read()
    gray_buff = cv.cvtColor(frame, cv.COLOR_BGR2GRAY)
    img_res_gpu = gpuarray.to_gpu(gray_buff.astype(np.float32))

    # Absolute difference to the previous frame, minus the threshold.
    # (cmath here is presumably pycuda.cumath imported under that alias;
    # the standard-library cmath cannot operate on GPUArrays.)
    img_buf_gpu = cmath.fabs(img_ori_gpu - img_res_gpu)
    img_buf_gpu = img_buf_gpu - img_sub

    img_ori_gpu = img_res_gpu.copy()
    # Pixels that changed by more than the threshold are blanked to 0.
    img_res_gpu = gpuarray.if_positive(img_buf_gpu, img_bgm, img_res_gpu)

    gray_buff = img_res_gpu.get()
    gray_buff = gray_buff.astype(np.uint8)
    frame = cv.cvtColor(gray_buff, cv.COLOR_GRAY2RGB)
    cv.imshow("Moving Detecting!", frame)
    if cv.waitKey(1) & 0xFF == ord('q'):
        break

cap.release()
cv.destroyAllWindows()
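The masking step in the loop above is just fabs-difference, threshold, if_positive. A tiny synthetic version of that pattern, with made-up pixel values:

import numpy as np
import pycuda.autoinit  # creates a CUDA context
import pycuda.gpuarray as gpuarray
import pycuda.cumath as cumath

prev = gpuarray.to_gpu(np.array([10., 100., 40.], dtype=np.float32))
curr = gpuarray.to_gpu(np.array([12., 180., 41.], dtype=np.float32))

# |prev - curr| - threshold: positive only where a pixel changed "enough".
diff = cumath.fabs(prev - curr) - 25.0
zeros = gpuarray.zeros_like(curr)

# Changed pixels are blanked to 0, unchanged pixels keep their value.
masked = gpuarray.if_positive(diff, zeros, curr)
print(masked.get())  # [12.  0. 41.]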
def cuda_gridvis(csrh_sun, csrh_satellite, settings, plan, chan):
    """
    Grid the visibilities parallelized by pixel.

    References:
      - Chapter 10 in "Interferometry and Synthesis in Radio Astronomy"
        by Thompson, Moran, & Swenson
      - Daniel Briggs' PhD Thesis: http://www.aoc.nrao.edu/dissertations/dbriggs/
    """
    print "Gridding the visibilities"
    t_start = time.time()

    # f = pyfits.open(settings['vfile'])

    # unpack parameters
    vfile = settings['vfile']
    briggs = settings['briggs']
    imsize = settings['imsize']
    cell = settings['cell']
    nx = np.int32(2 * imsize)
    noff = np.int32((nx - imsize) / 2)

    ## constants
    arc2rad = np.float32(np.pi / 180. / 3600.)
    du = np.float32(1. / (arc2rad * cell * nx))

    ## grab data
    # f = pyfits.open(settings['vfile'])
    Data = np.ndarray(shape=(44, 44, 16), dtype=complex)
    UVW = np.ndarray(shape=(780, 1), dtype='float64')
    Data, UVW = visibility(csrh_sun, csrh_satellite, chan)
    print "UVW*****\n", UVW

    # determine the file type (uvfits or fitsidi)
    h_uu = np.ndarray(shape=(780), dtype='float64')
    h_vv = np.ndarray(shape=(780), dtype='float64')
    h_rere = np.ndarray(shape=(780), dtype='float32')
    h_imim = np.ndarray(shape=(780), dtype='float32')

    freq = 1702500000.
    light_speed = 299792458.  # Speed of light

    ## quickly figure out what data is not flagged
    # np.float32(f[7].header['CRVAL3'])
    # good = np.where(f[0].data.data[:,0,0,0,0,0,0] != 0)
    # h_u = np.float32(freq*f[0].data.par('uu')[good])
    # h_v = np.float32(freq*f[0].data.par('vv')[good])

    blen = 0
    for antenna1 in range(0, 39):
        for antenna2 in range(antenna1 + 1, 40):
            h_rere[blen] = Data[antenna1][antenna2][chan].real
            h_imim[blen] = Data[antenna1][antenna2][chan].imag
            h_uu[blen] = freq * UVW[blen][0]
            h_vv[blen] = freq * UVW[blen][1]
            blen += 1
    print "h_u", h_uu

    # h_u = np.float32(h_u.ravel())
    # h_v = np.float32(h_v.ravel())
    gcount = np.int32(np.size(h_uu))
    # gcount = len(gcount.ravel())
    # h_re = np.float32(h_re.ravel())
    # h_im = np.float32(h_im.ravel())
    # freq = 3.45E11  # np.float32(f[0].header['CRVAL4'])

    blen = 0
    bl_order = np.ndarray(shape=(780, 2), dtype=int)
    good = []
    for border1 in range(0, 39):
        for border2 in range(border1 + 1, 40):
            bl_order[blen][0] = border1
            bl_order[blen][1] = border2
            blen = blen + 1

    blen = 0
    h_u = []
    h_v = []
    h_re = []
    h_im = []
    Flag_Ant = [0, 4, 8, 10, 11, 12, 13, 15, 16, 17, 18, 19, 20, 22,
                23, 24, 25, 26, 28, 29, 37, 38, 39]
    for blen in range(0, 780):
        if (bl_order[blen][0] not in Flag_Ant) and (bl_order[blen][1] not in Flag_Ant):
            good.append(blen)
            h_u.append(h_uu[blen])
            h_v.append(h_vv[blen])
            h_re.append(h_rere[blen])
            h_im.append(h_imim[blen])
    # print "Good:", good

    gcount = np.int32(np.size(h_u))
    ## assume data is unpolarized
    # print chan
    print 'GCOUNT', gcount
    # print "H_U", h_u
    # print "H_V", h_v
    # print h_re
    # print h_im

    # h_ : host, d_ : device
    h_grd = np.zeros((nx, nx), dtype=np.complex64)
    h_cnt = np.zeros((nx, nx), dtype=np.int32)
    d_u = gpu.to_gpu(np.array(h_uu, dtype='float32'))
    d_v = gpu.to_gpu(np.array(h_vv, dtype='float32'))
    d_re = gpu.to_gpu(np.array(h_rere, dtype='float32'))
    d_im = gpu.to_gpu(np.array(h_imim, dtype='float32'))
    d_cnt = gpu.zeros((np.int(nx), np.int(nx)), np.int32)
    d_grd = gpu.zeros((np.int(nx), np.int(nx)), np.complex64)
    d_ngrd = gpu.zeros_like(d_grd)
    d_bm = gpu.zeros_like(d_grd)
    d_nbm = gpu.zeros_like(d_grd)
    d_fim = gpu.zeros((np.int(imsize), np.int(imsize)), np.float32)

    ## define kernel parameters
    if imsize == 1024:
        blocksize2D = (8, 16, 1)
        gridsize2D = (np.int(np.ceil(1. * nx / blocksize2D[0])),
                      np.int(np.ceil(1. * nx / blocksize2D[1])))
        blocksizeF2D = (16, 16, 1)
        gridsizeF2D = (np.int(np.ceil(1. * imsize / blocksizeF2D[0])),
                       np.int(np.ceil(1. * imsize / blocksizeF2D[1])))
        blocksize1D = (256, 1, 1)
    else:
        blocksize2D = (16, 32, 1)
        gridsize2D = (np.int(np.ceil(1. * nx / blocksize2D[0])),
                      np.int(np.ceil(1. * nx / blocksize2D[1])))
        blocksizeF2D = (32, 32, 1)
        gridsizeF2D = (np.int(np.ceil(1. * imsize / blocksizeF2D[0])),
                       np.int(np.ceil(1. * imsize / blocksizeF2D[1])))
        blocksize1D = (512, 1, 1)
    gridsize1D = (np.int(np.ceil(1. * gcount / blocksize1D[0])), 1)

    # ------------------------
    # make gridding kernels
    # ------------------------
    ## make spheroidal convolution kernel (don't mess with these!)
    width = 6.
    ngcf = 24.
    h_cgf = gcf(ngcf, width)
    ## make grid correction
    h_corr = corrfun(nx, width)
    d_cgf = module.get_global('cgf')[0]
    d_corr = gpu.to_gpu(h_corr)
    cu.memcpy_htod(d_cgf, h_cgf)

    # ------------------------
    # grid it up
    # ------------------------
    d_umax = gpu.max(cumath.fabs(d_u))
    d_vmax = gpu.max(cumath.fabs(d_v))
    umax = np.int32(np.ceil(d_umax.get() / du))
    vmax = np.int32(np.ceil(d_vmax.get() / du))

    ## grid ($$)
    #  This should be improvable via:
    #    - shared memory solution? I tried...
    #    - better coalesced memory access? I tried...
    #    - reorganizing and indexing UV data beforehand?
    #      (i.e. http://www.nvidia.com/docs/IO/47905/ECE757_Project_Report_Gregerson.pdf)
    #    - storing V(u,v) in texture memory?
    gridVis_wBM_kernel(d_grd, d_bm, d_cnt, d_u, d_v, d_re, d_im, nx, du, gcount, umax, vmax,
                       block=blocksize2D, grid=gridsize2D)

    ## apply weights
    wgtGrid_kernel(d_bm, d_cnt, briggs, nx, block=blocksize2D, grid=gridsize2D)
    hfac = np.int32(1)
    dblGrid_kernel(d_bm, nx, hfac, block=blocksize2D, grid=gridsize2D)
    shiftGrid_kernel(d_bm, d_nbm, nx, block=blocksize2D, grid=gridsize2D)

    ## normalize
    wgtGrid_kernel(d_grd, d_cnt, briggs, nx, block=blocksize2D, grid=gridsize2D)
    ## Reflect grid about v axis
    hfac = np.int32(-1)
    dblGrid_kernel(d_grd, nx, hfac, block=blocksize2D, grid=gridsize2D)
    ## Shift both
    shiftGrid_kernel(d_grd, d_ngrd, nx, block=blocksize2D, grid=gridsize2D)

    # ------------------------
    # Make the beam
    # ------------------------
    ## Transform to image plane
    fft.fft(d_nbm, d_bm, plan)
    ## Shift
    shiftGrid_kernel(d_bm, d_nbm, nx, block=blocksize2D, grid=gridsize2D)
    ## Correct for C
    corrGrid_kernel(d_nbm, d_corr, nx, block=blocksize2D, grid=gridsize2D)
    # Trim
    trimIm_kernel(d_nbm, d_fim, noff, nx, imsize, block=blocksizeF2D, grid=gridsizeF2D)
    ## Normalize
    d_bmax = gpu.max(d_fim)
    bmax = d_bmax.get()
    bmax = np.float32(1. / bmax)
    nrmBeam_kernel(d_fim, bmax, imsize, block=blocksizeF2D, grid=gridsizeF2D)
    ## Pull onto CPU
    dpsf = d_fim.get()

    # ------------------------
    # Make the map
    # ------------------------
    ## Transform to image plane
    fft.fft(d_ngrd, d_grd, plan)
    ## Shift
    shiftGrid_kernel(d_grd, d_ngrd, nx, block=blocksize2D, grid=gridsize2D)
    ## Correct for C
    corrGrid_kernel(d_ngrd, d_corr, nx, block=blocksize2D, grid=gridsize2D)
    ## Trim
    trimIm_kernel(d_ngrd, d_fim, noff, nx, imsize, block=blocksizeF2D, grid=gridsizeF2D)
    ## Normalize (Jy/beam)
    nrmGrid_kernel(d_fim, bmax, imsize, block=blocksizeF2D, grid=gridsizeF2D)

    ## Finish timers
    t_end = time.time()
    t_full = t_end - t_start
    print "Gridding execution time %0.5f" % t_full + ' s'
    print "\t%0.5f" % (t_full / gcount) + ' s per visibility'

    ## Return dirty psf (CPU) and dirty image (GPU)
    return dpsf, d_fim
def cuda_gridvis(settings, plan):
    """
    Grid the visibilities parallelized by pixel.

    References:
      - Chapter 10 in "Interferometry and Synthesis in Radio Astronomy"
        by Thompson, Moran, & Swenson
      - Daniel Briggs' PhD Thesis: http://www.aoc.nrao.edu/dissertations/dbriggs/
    """
    print "Gridding the visibilities"
    t_start = time.time()

    # unpack parameters
    vfile = settings['vfile']
    briggs = settings['briggs']
    imsize = settings['imsize']
    cell = settings['cell']
    nx = np.int32(2 * imsize)
    noff = np.int32((nx - imsize) / 2)

    ## constants
    arc2rad = np.float32(np.pi / 180 / 3600.)
    du = np.float32(1. / (arc2rad * cell * nx))

    ## grab data
    f = pyfits.open(settings['vfile'])

    # determine the file type (uvfits or fitsidi)
    if settings['vfile'].find('.fitsidi') != -1:
        ## quickly figure out what data is not flagged
        freq = 3.45E11  # np.float32(f[7].header['CRVAL3'])
        # good = np.where(f[0].data.data[:,0,0,0,0,0,0] != 0)
        # h_u = np.float32(freq*f[0].data.par('uu')[good])
        # h_v = np.float32(freq*f[0].data.par('vv')[good])
        light_speed = 299792458.  # Speed of light
        h_u = np.ndarray(shape=(780, 1), dtype='float64')
        h_v = np.ndarray(shape=(780, 1), dtype='float64')
        h_re = np.ndarray(shape=(780, 1), dtype='float32')
        h_im = np.ndarray(shape=(780, 1), dtype='float32')
        h_u = np.float64(light_speed * f[0].data[:].UU)
        h_v = np.float64(light_speed * f[0].data[:].VV)
        for bl in range(0, 780):
            # gcount += np.int32(np.size(h_u[bl]))
            ## assume data is unpolarized
            # h_re = np.float32(0.5*(f[0].data.data[good,0,0,0,0,0,0]+f[0].data.data[good,0,0,0,0,1,0]))
            # h_im = np.float32(0.5*(f[0].data.data[good,0,0,0,0,0,1]+f[0].data.data[good,0,0,0,0,1,1]))
            h_re[bl] = np.float32(f[0].data[:].data[bl][0][0][0][0][0][0])
            h_im[bl] = np.float32(f[0].data[:].data[bl][0][0][0][0][0][1])
        ## make GPU arrays
        h_u = np.float32(h_u.ravel())
        h_v = np.float32(h_v.ravel())
        gcount = np.int32(np.size(h_u))
        # gcount = len(gcount.ravel())
        h_re = np.float32(h_re.ravel())
        h_im = np.float32(h_im.ravel())
        print len(h_re), len(h_im)
    elif settings['vfile'].find('.uvfits') != -1:
        freq = 3.45E11  # np.float32(f[0].header['CRVAL4'])
        light_speed = 299792458.
        good = np.where(f[0].data.data[:, 0, 0, 0, 0, 0] != 0)
        h_u = np.float32(light_speed * f[0].data.par('uu')[good])
        h_v = np.float32(light_speed * f[0].data.par('vv')[good])
        gcount = np.int32(np.size(h_u))
        ## assume data is unpolarized
        h_re = np.float32(f[0].data.data[good, 0, 0, 0, 0, 0])
        h_im = np.float32(f[0].data.data[good, 0, 0, 0, 0, 1])
    print h_u

    # h_ : host, d_ : device
    h_grd = np.zeros((nx, nx), dtype=np.complex64)
    h_cnt = np.zeros((nx, nx), dtype=np.int32)
    d_u = gpu.to_gpu(h_u)
    d_v = gpu.to_gpu(h_v)
    d_re = gpu.to_gpu(h_re)
    d_im = gpu.to_gpu(h_im)
    d_cnt = gpu.zeros((np.int(nx), np.int(nx)), np.int32)
    d_grd = gpu.zeros((np.int(nx), np.int(nx)), np.complex64)
    d_ngrd = gpu.zeros_like(d_grd)
    d_bm = gpu.zeros_like(d_grd)
    d_nbm = gpu.zeros_like(d_grd)
    d_fim = gpu.zeros((np.int(imsize), np.int(imsize)), np.float32)

    ## define kernel parameters
    blocksize2D = (8, 16, 1)
    gridsize2D = (np.int(np.ceil(1. * nx / blocksize2D[0])),
                  np.int(np.ceil(1. * nx / blocksize2D[1])))
    blocksizeF2D = (16, 16, 1)
    gridsizeF2D = (np.int(np.ceil(1. * imsize / blocksizeF2D[0])),
                   np.int(np.ceil(1. * imsize / blocksizeF2D[1])))
    blocksize1D = (256, 1, 1)
    gridsize1D = (np.int(np.ceil(1. * gcount / blocksize1D[0])), 1)

    # ------------------------
    # make gridding kernels
    # ------------------------
    ## make spheroidal convolution kernel (don't mess with these!)
    width = 6.
    ngcf = 24.
    h_cgf = gcf(ngcf, width)
    ## make grid correction
    h_corr = corrfun(nx, width)
    d_cgf = module.get_global('cgf')[0]
    d_corr = gpu.to_gpu(h_corr)
    cu.memcpy_htod(d_cgf, h_cgf)

    # ------------------------
    # grid it up
    # ------------------------
    d_umax = gpu.max(cumath.fabs(d_u))
    d_vmax = gpu.max(cumath.fabs(d_v))
    umax = np.int32(np.ceil(d_umax.get() / du))
    vmax = np.int32(np.ceil(d_vmax.get() / du))

    ## grid ($$)
    #  This should be improvable via:
    #    - shared memory solution? I tried...
    #    - better coalesced memory access? I tried...
    #    - reorganizing and indexing UV data beforehand?
    #      (i.e. http://www.nvidia.com/docs/IO/47905/ECE757_Project_Report_Gregerson.pdf)
    #    - storing V(u,v) in texture memory?
    gridVis_wBM_kernel(d_grd, d_bm, d_cnt, d_u, d_v, d_re, d_im, nx, du, gcount, umax, vmax,
                       block=blocksize2D, grid=gridsize2D)

    ## apply weights
    wgtGrid_kernel(d_bm, d_cnt, briggs, nx, block=blocksize2D, grid=gridsize2D)
    hfac = np.int32(1)
    dblGrid_kernel(d_bm, nx, hfac, block=blocksize2D, grid=gridsize2D)
    shiftGrid_kernel(d_bm, d_nbm, nx, block=blocksize2D, grid=gridsize2D)

    ## normalize
    wgtGrid_kernel(d_grd, d_cnt, briggs, nx, block=blocksize2D, grid=gridsize2D)
    ## Reflect grid about v axis
    hfac = np.int32(-1)
    dblGrid_kernel(d_grd, nx, hfac, block=blocksize2D, grid=gridsize2D)
    ## Shift both
    shiftGrid_kernel(d_grd, d_ngrd, nx, block=blocksize2D, grid=gridsize2D)

    # ------------------------
    # Make the beam
    # ------------------------
    ## Transform to image plane
    fft.fft(d_nbm, d_bm, plan)
    ## Shift
    shiftGrid_kernel(d_bm, d_nbm, nx, block=blocksize2D, grid=gridsize2D)
    ## Correct for C
    corrGrid_kernel(d_nbm, d_corr, nx, block=blocksize2D, grid=gridsize2D)
    # Trim
    trimIm_kernel(d_nbm, d_fim, noff, nx, imsize, block=blocksizeF2D, grid=gridsizeF2D)
    ## Normalize
    d_bmax = gpu.max(d_fim)
    bmax = d_bmax.get()
    bmax = np.float32(1. / bmax)
    nrmBeam_kernel(d_fim, bmax, imsize, block=blocksizeF2D, grid=gridsizeF2D)
    ## Pull onto CPU
    dpsf = d_fim.get()

    # ------------------------
    # Make the map
    # ------------------------
    ## Transform to image plane
    fft.fft(d_ngrd, d_grd, plan)
    ## Shift
    shiftGrid_kernel(d_grd, d_ngrd, nx, block=blocksize2D, grid=gridsize2D)
    ## Correct for C
    corrGrid_kernel(d_ngrd, d_corr, nx, block=blocksize2D, grid=gridsize2D)
    ## Trim
    trimIm_kernel(d_ngrd, d_fim, noff, nx, imsize, block=blocksizeF2D, grid=gridsizeF2D)
    ## Normalize (Jy/beam)
    nrmGrid_kernel(d_fim, bmax, imsize, block=blocksizeF2D, grid=gridsizeF2D)

    ## Finish timers
    t_end = time.time()
    t_full = t_end - t_start
    print "Gridding execution time %0.5f" % t_full + ' s'
    print "\t%0.5f" % (t_full / gcount) + ' s per visibility'

    ## Return dirty psf (CPU) and dirty image (GPU)
    return dpsf, d_fim
def cuda_gridvis(self, plan, x_offset, y_offset):
    """
    Grid the visibilities parallelized by pixel.

    References:
      - Chapter 10 in "Interferometry and Synthesis in Radio Astronomy"
        by Thompson, Moran, & Swenson
      - Daniel Briggs' PhD Thesis: http://www.aoc.nrao.edu/dissertations/dbriggs/

    If the size of the image is 1024x1024, the plan should be at least
    1024*1.414 (about 25 degrees' rotation), and to satisfy the requirements
    of CLEAN, the dirty image should be 1024*2.828.
    """
    logger.debug("Gridding the visibilities")
    t_start = time.time()

    nx = np.int32(2 * self.imsize)
    noff = np.int32((nx - self.imsize) / 2)
    arc2rad = np.float32(np.pi / 180. / 3600.)
    du = np.float32(1. / (arc2rad * self.cell)) / (self.imsize * 2.)
    logger.debug("1 Pixel DU = %f" % du)

    h_uu = np.float32(self.h_uu.ravel())
    h_vv = np.float32(self.h_vv.ravel())
    h_rere = np.float32(self.h_rere.ravel())
    h_imim = np.float32(self.h_imim.ravel())

    blen = 0
    bl_order = np.ndarray(shape=(self.baseline_number, 2), dtype=int)
    good = []

    if self.baseline_number == 780:  # MUSER-I
        antennas = 40
    else:
        antennas = 60
    # print antennas

    for border1 in range(0, antennas - 1):
        for border2 in range(border1 + 1, antennas):
            bl_order[blen][0] = border1
            bl_order[blen][1] = border2
            blen = blen + 1

    h_u = []
    h_v = []
    h_re = []
    h_im = []
    for blen in range(0, self.baseline_number):
        if (bl_order[blen][0] not in self.Flag_Ant) and (bl_order[blen][1] not in self.Flag_Ant):
            good.append(blen)
            h_u.append(h_uu[blen])
            h_v.append(h_vv[blen])
            h_re.append(h_rere[blen])
            h_im.append(h_imim[blen])

    gcount = np.int32(np.size(h_u))

    # h_ : host, d_ : device
    # h_grd = np.zeros((nx, nx), dtype=np.complex64)
    # h_cnt = np.zeros((nx, nx), dtype=np.int32)
    d_u = gpu.to_gpu(np.array(h_u, dtype='float32'))
    d_v = gpu.to_gpu(np.array(h_v, dtype='float32'))
    d_re = gpu.to_gpu(np.array(h_re, dtype='float32'))
    d_im = gpu.to_gpu(np.array(h_im, dtype='float32'))
    d_cnt = gpu.zeros((np.int(nx), np.int(nx)), np.int32)
    d_grd = gpu.zeros((np.int(nx), np.int(nx)), np.complex64)
    d_ngrd = gpu.zeros_like(d_grd)
    d_bm = gpu.zeros_like(d_grd)
    d_nbm = gpu.zeros_like(d_grd)
    d_cbm = gpu.zeros_like(d_grd)
    d_fbm = gpu.zeros((np.int(nx), np.int(nx)), np.float32)
    d_fim = gpu.zeros((np.int(self.imsize), np.int(self.imsize)), np.float32)
    d_dim = gpu.zeros((np.int(self.imsize), np.int(self.imsize)), np.float32)
    d_sun_disk = gpu.zeros_like(d_grd)
    d_fdisk = gpu.zeros((np.int(self.imsize), np.int(self.imsize)), np.float32)

    ## define kernel parameters
    self.calc_gpu_thread(nx, self.imsize, gcount)

    width = 6.
    ngcf = 24.
    h_cgf = self.gcf(ngcf, width)
    ## make grid correction
    h_corr = self.corrfun(nx, width)
    d_cgf = self.module.get_global('cgf')[0]
    d_corr = gpu.to_gpu(h_corr)
    cu.memcpy_htod(d_cgf, h_cgf)

    # ------------------------
    # grid it up
    # ------------------------
    d_umax = gpu.max(cumath.fabs(d_u))
    d_vmax = gpu.max(cumath.fabs(d_v))
    umax = np.int32(np.ceil(d_umax.get() / du))
    vmax = np.int32(np.ceil(d_vmax.get() / du))

    self.gridVis_wBM_kernel(d_grd, d_bm, d_cbm, d_cnt, d_u, d_v, d_re, d_im,
                            np.int32(nx), np.float32(du), np.int32(gcount),
                            np.int32(umax), np.int32(vmax),
                            np.int32(1 if self.correct_p_angle else 0),
                            block=self.blocksize_2D, grid=self.gridsize_2D)

    ## apply weights
    self.wgtGrid_kernel(d_bm, d_cnt, self.briggs, nx, 0,
                        block=self.blocksize_2D, grid=self.gridsize_2D)
    hfac = np.int32(1)
    self.dblGrid_kernel(d_bm, nx, hfac, block=self.blocksize_2D, grid=self.gridsize_2D)
    self.dblGrid_kernel(d_cbm, nx, hfac, block=self.blocksize_2D, grid=self.gridsize_2D)
    self.shiftGrid_kernel(d_bm, d_nbm, nx, block=self.blocksize_2D, grid=self.gridsize_2D)
    self.shiftGrid_kernel(d_cbm, d_bm, nx, block=self.blocksize_2D, grid=self.gridsize_2D)

    ## normalize
    self.wgtGrid_kernel(d_grd, d_cnt, self.briggs, nx, 0,
                        block=self.blocksize_2D, grid=self.gridsize_2D)
    ## Reflect grid about v axis
    hfac = np.int32(-1)
    self.dblGrid_kernel(d_grd, nx, hfac, block=self.blocksize_2D, grid=self.gridsize_2D)
    ## Shift both
    self.shiftGrid_kernel(d_grd, d_ngrd, nx, block=self.blocksize_2D, grid=self.gridsize_2D)

    fft.fft(d_ngrd, d_grd, plan)
    ## Shift
    self.shiftGrid_kernel(d_grd, d_ngrd, nx, block=self.blocksize_2D, grid=self.gridsize_2D)
    ## Correct for C
    self.corrGrid_kernel(d_ngrd, d_corr, nx, block=self.blocksize_2D, grid=self.gridsize_2D)
    ## Trim
    self.trimIm_kernel(d_ngrd, d_dim, nx, self.imsize, block=self.blocksize_F2D, grid=self.gridsize_F2D)
    self.copyIm_kernel(d_ngrd, d_fbm, nx, block=self.blocksize_2D, grid=self.gridsize_2D)
    ## Normalize (Jy/beam)
    # self.nrmGrid_kernel(d_dim, bmax1, self.imsize, block=self.blocksize_F2D, grid=self.gridsize_F2D)
    # self.nrmGrid_kernel(d_fbm, bmax2, nx, block=self.blocksize_2D, grid=self.gridsize_2D)

    ## Finish timers
    t_end = time.time()
    t_full = t_end - t_start
    logger.debug("Gridding execution time %0.5f" % t_full + ' s')
    logger.debug("\t%0.5f" % (t_full / gcount) + ' s per visibility')

    # ----------------------
    ## Return dirty psf (CPU), dirty image (GPU) and sun disk
    return d_dim
def fabs(self):
    return CUDAArray(cumath.fabs(self.arr))
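A self-contained sketch of the kind of wrapper class this method belongs to; the method body and the attribute name arr come from the snippet, while the constructor and the to_numpy helper are assumptions added for illustration.

import numpy as np
import pycuda.autoinit  # creates a CUDA context
import pycuda.gpuarray as gpuarray
import pycuda.cumath as cumath


class CUDAArray(object):
    """Hypothetical thin wrapper around a PyCUDA GPUArray."""

    def __init__(self, arr):
        # Accept either an existing GPUArray or anything numpy can convert.
        if isinstance(arr, gpuarray.GPUArray):
            self.arr = arr
        else:
            self.arr = gpuarray.to_gpu(np.asarray(arr, dtype=np.float32))

    def fabs(self):
        # Elementwise absolute value, computed on the GPU.
        return CUDAArray(cumath.fabs(self.arr))

    def to_numpy(self):
        return self.arr.get()


print(CUDAArray([-1.0, 2.5, -3.0]).fabs().to_numpy())  # [1.  2.5 3. ]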