# NOTE(review): the line breaks and indentation of this fragment were lost, so
# the extent of each `if rank == 0:` body cannot be determined from here; the
# statements are left exactly as found.
# Visible steps: allocate deltagw1/deltagw2 (complex, (N, N, N/2+1)) and
# deltax1/deltax2, forward-FFT deltax into deltak, Scatter deltak from root 0
# into recvdata_k1, multiply by W = wk(k * Kf) and by 1j * Kf times the
# mpi_fn[rank] (axis 0) or fn (axis 1) frequency to form deltak1/deltak2, then
# Gather both onto root 0 as deltagw1/deltagw2.
# k[0, 0, 0] is set to 10**-4 / Kf before wk() is evaluated and reset to 0 at
# the end, and W[0, 0, 0] is forced to 1 on rank 0 -- presumably to keep wk()
# away from k == 0; TODO confirm.
# `N / 2 + 1` relies on Python 2 integer division.
if rank == 0: t0 = time.time() ##################################Wdeltag######################################## print '=' * 80 print 'starting cal wdengx wdengy' deltagw1 = np.empty((N, N, N / 2 + 1), dtype=np.complex128) deltagw2 = np.empty((N, N, N / 2 + 1), dtype=np.complex128) deltax1 = np.empty_like(deltax, dtype=np.float64) deltax2 = np.empty_like(deltax, dtype=np.float64) deltak = np.empty((N, N, N / 2 + 1), dtype=np.complex128) fft = fftw.Plan(inarray=deltax, outarray=deltak, direction='forward', nthreads=nthreads) fftw.execute(fft) fftw.destroy_plan(fft) k[0, 0, 0] = 10**-4 / Kf comm.Scatter(deltak, recvdata_k1, root=0) #deltak smoothed log W = wk(k * Kf) if rank == 0: W[0, 0, 0] = 1 deltak1 = recvdata_k1 * W * 1j * Kf * (mpi_fn[rank][:, None, None] + np.zeros_like(fn)[None, :, None] + np.zeros_like(fnc)[None, None, :]) deltak2 = recvdata_k1 * W * 1j * Kf * ( np.zeros_like(mpi_fn[rank])[:, None, None] + fn[None, :, None] + np.zeros_like(fnc)[None, None, :]) comm.Gather(deltak1, deltagw1, root=0) comm.Gather(deltak2, deltagw2, root=0) if rank == 0: k[0, 0, 0] = 0
# NOTE(review): the line breaks and indentation of this fragment were lost, and
# it ends on a dangling `if rank==0:` whose body is outside this view; the
# statements are left exactly as found.
# Visible steps: load the input field with Tide.LoadData(Input) into deltax and
# rescale it by N**3 / sum; forward-FFT deltax into deltak; Scatter deltak from
# root 0 into recvdata_k1 and broadcast `sum`; multiply by
# exp(-0.5 * Kf**2 * k**2 * Sigma**2) and divide by the sinc window window_k;
# apply the Wiener factor Ph / (Ph + L**3 / sum); Gather the filtered modes into
# smooth_k and Pk_halo = |recvdata_k1 / window_k|**2 * L**3 / N**6 into Pk0 on
# root 0.
# `sum` shadows the builtin of the same name, and `N/2+1` relies on Python 2
# integer division.
Pk0=np.empty((N,N,N/2+1),dtype=np.float64) deltax=np.linspace(0,N,N**3).reshape(N,N,N) change=np.array(Tide.LoadData(Input),dtype=np.float64) deltax[:]=change[:] deltax=np.array(deltax,dtype=np.float64) del change sum=deltax.sum() deltax*=(N**3/sum) #for halo, the data is n/nbar. ###################################smooth####################################### print '='*80 print 'smoothing...' t0=time.time() deltak=np.empty((N,N,N/2+1),dtype=np.complex128) fft=fftw.Plan(inarray=deltax,outarray=deltak,direction='forward',nthreads=nthreads) fftw.execute(fft) fftw.destroy_plan(fft) smooth_k=np.empty((N,N,N/2+1),dtype=np.complex128) k=(mpi_fn[rank][:,None,None]**2.+fn[None,:,None]**2.+fnc[None,None,:]**2)**(1./2.) window_k= np.sinc(1./N*mpi_fn[rank][:,None,None])*np.sinc(1./N*fn[None,:,None])*np.sinc(1./N*fnc[None,None,:]) comm.Scatter(deltak,recvdata_k1,root=0) #deltak sum=comm.bcast(sum,root=0) #deltak senddata_k1=recvdata_k1*np.exp(-0.5*Kf*Kf*k*k*Sigma**2)/window_k #smooth_k Ph=L**3/N**6*np.abs(senddata_k1)**2 Wiener=Ph/(Ph+(L**3)/sum) #wiener filter senddata_k1*=Wiener Pk_halo=np.abs(recvdata_k1/window_k)**2 Pk_halo*=(L**3/N**6) Pk_halo=np.array(Pk_halo,dtype=np.float64) comm.Gather(senddata_k1,smooth_k,root=0) comm.Gather(Pk_halo,Pk0,root=0) if rank==0:
def sample_defrost_cpu(lat, func, gamma, m2_eff):
    """Draw one random field sample on the lattice, doing the 3-D FFTs on the host.

    Taken from the Defrost program.  A 1-D kernel is tabulated on an
    oversampled grid and transformed in place with an fftw3 real-odd
    ('realodd 10') plan; the CUDA kernel ``func`` is then launched with that
    table and a zeroed (n, n, n) device buffer; finally the buffer is
    multiplied by complex Gaussian noise in Fourier space using ``numpy.fft``.

    Parameters
    ----------
    lat : lattice object
        The attributes read here are ``mpl``, ``n``, ``nn``, ``dk``, ``dx``,
        ``dimx``, ``dimy``, ``dimz``, ``cuda_block_1``, ``cuda_grid`` and
        ``test``.
    func : callable
        CUDA kernel, launched as ``func(tmp_gpu, ker_gpu, nn, os, dimx, dimy,
        dimz, block=lat.cuda_block_1, grid=lat.cuda_grid)``.
    gamma : float
        Exponent applied to ``kk**2 + m2_eff`` (documented upstream as -0.25
        or +0.25).
    m2_eff : float
        Effective mass term added to ``kk**2``.

    Returns
    -------
    numpy.ndarray
        float64 array of shape (n, n, n).
    """
    import fftw3

    # Various constants:
    mpl = lat.mpl
    n = lat.n
    nn = lat.nn
    oversampling = 16
    nos = n * oversampling ** 2
    dk = lat.dk
    dx = lat.dx
    dkos = dk / (2. * oversampling)
    dxos = dx / oversampling
    kcut = nn * dk / 2.0
    norm = 0.5 / (math.sqrt(2 * pi * dk**3.) * mpl) * (dkos / dxos)

    # Create the plan before filling the array: planning with the 'measure'
    # flag is allowed to overwrite the buffers it is bound to.
    ker = np.empty(nos, dtype=np.float64)
    ker_plan = fftw3.Plan(ker, ker, direction='forward', flags=['measure'],
                          realtypes=['realodd 10'])
    for k in xrange(nos):
        kk = (k + 0.5) * dkos
        ker[k] = (kk * (kk**2. + m2_eff)**gamma) * math.exp(-(kk / kcut)**2.)
    ker_plan.execute()
    fftw3.destroy_plan(ker_plan)
    for k in xrange(nos):
        ker[k] = norm * ker[k] / (k + 1)

    ker_gpu = gpuarray.to_gpu(ker)
    tmp_gpu = gpuarray.zeros((n, n, n), dtype=np.float64)
    func(tmp_gpu, ker_gpu, np.uint32(nn), np.float64(oversampling),
         np.uint32(lat.dimx), np.uint32(lat.dimy), np.uint32(lat.dimz),
         block=lat.cuda_block_1, grid=lat.cuda_grid)
    tmp = tmp_gpu.get()

    # The device buffers are not needed once the result is on the host.
    ker_gpu.gpudata.free()
    tmp_gpu.gpudata.free()

    Fk = np.fft.rfftn(tmp)

    if lat.test == True:
        print('Testing mode on! Set testQ to False to disable this.\n')
        np.random.seed(1)

    rr1 = (np.random.normal(size=Fk.shape)
           + np.random.normal(size=Fk.shape) * 1j)
    Fk *= rr1

    # Pass the target shape explicitly: without ``s`` irfftn assumes an even
    # last axis and would return (n, n, n - 1) for odd n.
    return np.fft.irfftn(Fk, s=tmp.shape)
def sample_defrost_cpu2(lat, func, gamma, m2_eff):
    """Draw one random field sample on the lattice, doing every FFT with fftw3.

    A 1-D kernel is tabulated on an oversampled grid and transformed in place
    with an fftw3 real-odd ('realodd 10') plan; the CUDA kernel ``func`` is
    then launched with that table and a zeroed (n, n, n) device buffer; finally
    the buffer is multiplied by complex Gaussian noise in Fourier space using
    fftw3 real<->complex plans and scaled by ``1 / lat.VL``.

    Parameters
    ----------
    lat : lattice object
        The attributes read here are ``mpl``, ``n``, ``nn``, ``dk``, ``dx``,
        ``dimx``, ``dimy``, ``dimz``, ``cuda_block_1``, ``cuda_grid``,
        ``test``, ``prec_real``, ``prec_complex`` and ``VL``.
    func : callable
        CUDA kernel, launched as ``func(tmp_gpu, ker_gpu, nn, os, dimx, dimy,
        dimz, block=lat.cuda_block_1, grid=lat.cuda_grid)``.
    gamma : float
        Exponent applied to ``kk**2 + m2_eff`` (documented upstream as -0.25
        or +0.25).
    m2_eff : float
        Effective mass term added to ``kk**2``.

    Returns
    -------
    numpy.ndarray
        Array of shape (n, n, n) and dtype ``lat.prec_real``.
    """
    import fftw3

    # Various constants:
    mpl = lat.mpl
    n = lat.n
    nn = lat.nn
    oversampling = 16
    nos = n * oversampling ** 2
    dk = lat.dk
    dx = lat.dx
    dkos = dk / (2. * oversampling)
    dxos = dx / oversampling
    kcut = nn * dk / 2.0
    norm = 0.5 / (math.sqrt(2 * pi * dk**3.) * mpl) * (dkos / dxos)

    # Create the plan before filling the array: planning with the 'measure'
    # flag is allowed to overwrite the buffers it is bound to.
    ker = np.empty(nos, dtype=lat.prec_real)
    ker_plan = fftw3.Plan(ker, ker, direction='forward', flags=['measure'],
                          realtypes=['realodd 10'])
    for k in xrange(nos):
        kk = (k + 0.5) * dkos
        ker[k] = kk * (kk**2. + m2_eff)**gamma * math.exp(-(kk / kcut)**2.)
    ker_plan.execute()
    fftw3.destroy_plan(ker_plan)
    for k in xrange(nos):
        ker[k] = norm * ker[k] / (k + 1)

    tmp = np.zeros((n, n, n), dtype=lat.prec_real)
    # Floor division keeps the half-spectrum length an integer even where
    # ``/`` is true division.
    Fk = np.zeros((n, n, n // 2 + 1), dtype=lat.prec_complex)

    ker_gpu = gpuarray.to_gpu(ker)
    tmp_gpu = gpuarray.zeros((n, n, n), dtype=lat.prec_real)

    forward_plan = fftw3.Plan(tmp, Fk, direction='forward', flags=['measure'])
    # A complex-to-real FFTW transform is always the backward transform, so
    # label the plan accordingly.
    backward_plan = fftw3.Plan(Fk, tmp, direction='backward',
                               flags=['measure'])

    func(tmp_gpu, ker_gpu, np.uint32(nn), np.float64(oversampling),
         np.uint32(lat.dimx), np.uint32(lat.dimy), np.uint32(lat.dimz),
         block=lat.cuda_block_1, grid=lat.cuda_grid)

    # Assign in place instead of accumulating: ``tmp`` must remain the buffer
    # the plans are bound to, and 'measure' planning may have left arbitrary
    # values in it.
    tmp[...] = tmp_gpu.get()

    forward_plan.execute()
    fftw3.destroy_plan(forward_plan)

    if lat.test == True:
        print('Testing mode on! Set testQ to False to disable this.\n')
        np.random.seed(1)

    rr1 = (np.random.normal(size=Fk.shape)
           + np.random.normal(size=Fk.shape) * 1j)
    Fk *= rr1

    backward_plan.execute()
    fftw3.destroy_plan(backward_plan)

    # NOTE(review): presumably lat.VL is the number of lattice points, undoing
    # FFTW's unnormalised round trip -- confirm against the lattice class.
    tmp *= 1. / lat.VL
    return tmp
def sample_defrost_gpu(lat, func, gamma, m2_eff):
    """Draw one random field sample on the lattice, doing the 3-D FFTs with CuFFT.

    A 1-D kernel is tabulated on an oversampled grid and transformed in place
    with an fftw3 real-odd ('realodd 10') plan; the CUDA kernel ``func`` is
    then launched with that table and a zeroed (n, n, n) device buffer; finally
    the buffer is multiplied by complex Gaussian noise in Fourier space using
    ``scikits.cuda.fft`` and scaled by ``1 / lat.VL``.  The noise itself is
    generated and applied on the host.

    Parameters
    ----------
    lat : lattice object
        The attributes read here are ``mpl``, ``n``, ``nn``, ``dk``, ``dx``,
        ``dimx``, ``dimy``, ``dimz``, ``cuda_block_1``, ``cuda_grid``,
        ``test``, ``prec_real``, ``prec_complex`` and ``VL``.
    func : callable
        CUDA kernel, launched as ``func(tmp_gpu, ker_gpu, nn, os, dimx, dimy,
        dimz, block=lat.cuda_block_1, grid=lat.cuda_grid)``.
    gamma : float
        Exponent applied to ``kk**2 + m2_eff`` (documented upstream as -0.25
        or +0.25).
    m2_eff : float
        Effective mass term added to ``kk**2``.

    Returns
    -------
    numpy.ndarray
        Host array of shape (n, n, n) and dtype ``lat.prec_real``.
    """
    import scikits.cuda.fft as fft
    import fftw3

    # Various constants:
    mpl = lat.mpl
    n = lat.n
    nn = lat.nn
    oversampling = 16
    nos = n * oversampling ** 2
    dk = lat.dk
    dx = lat.dx
    dkos = dk / (2. * oversampling)
    dxos = dx / oversampling
    kcut = nn * dk / 2.0
    norm = 0.5 / (math.sqrt(2 * pi * dk**3.) * mpl) * (dkos / dxos)

    # Create the plan before filling the array: planning with the 'measure'
    # flag is allowed to overwrite the buffers it is bound to.
    ker = np.empty(nos, dtype=lat.prec_real)
    ker_plan = fftw3.Plan(ker, ker, direction='forward', flags=['measure'],
                          realtypes=['realodd 10'])
    for k in xrange(nos):
        kk = (k + 0.5) * dkos
        ker[k] = kk * (kk**2. + m2_eff)**gamma * math.exp(-(kk / kcut)**2.)
    ker_plan.execute()
    fftw3.destroy_plan(ker_plan)
    for k in xrange(nos):
        ker[k] = norm * ker[k] / (k + 1)

    # A real-to-complex transform of an (n, n, n) array halves the last axis.
    # The element count is the same as the previous (n/2+1, n, n) declaration,
    # so the device memory and the random sequence applied to it are unchanged.
    Fk_gpu = gpuarray.zeros((n, n, n // 2 + 1), dtype=lat.prec_complex)
    ker_gpu = gpuarray.to_gpu(ker)
    tmp_gpu = gpuarray.zeros((n, n, n), dtype=lat.prec_real)

    plan = fft.Plan(tmp_gpu.shape, lat.prec_real, lat.prec_complex)
    plan2 = fft.Plan(tmp_gpu.shape, lat.prec_complex, lat.prec_real)

    func(tmp_gpu, ker_gpu, np.uint32(nn), np.float64(oversampling),
         np.uint32(lat.dimx), np.uint32(lat.dimy), np.uint32(lat.dimz),
         block=lat.cuda_block_1, grid=lat.cuda_grid)

    fft.fft(tmp_gpu, Fk_gpu, plan)

    if lat.test == True:
        print('Testing mode on! Set testQ to False to disable this.\n')
        np.random.seed(1)

    rr1 = (np.random.normal(size=Fk_gpu.shape)
           + np.random.normal(size=Fk_gpu.shape) * 1j)

    # Apply the noise on the host, then copy back into the existing device
    # buffer rather than allocating a second one.
    Fk = Fk_gpu.get()
    Fk *= rr1
    Fk_gpu.set(Fk)

    fft.ifft(Fk_gpu, tmp_gpu, plan2)

    # tmp_gpu already has dtype lat.prec_real, so no extra cast/copy is needed.
    res = tmp_gpu.get()
    # NOTE(review): presumably lat.VL is the number of lattice points, undoing
    # CuFFT's unnormalised round trip -- confirm against the lattice class.
    res *= 1. / lat.VL
    return res