Example #1
def evolve_linear(z, deltax):
    """
	Input type IN must be numpy or 21cmfast
	"""

    fgrowth = pb.fgrowth(z, COSMO['omega_M_0'])  #normalized to 1 at z=0
    #primordial_fgrowth = pb.fgrowth(INITIAL_REDSHIFT, cosmo['omega_M_0']) #normalized to 1 at z=0

    updated = deltax * fgrowth

    np.save(
        parent_folder +
        "/Boxes/updated_smoothed_deltax_z{0:.2f}_{1:d}_{2:.0f}Mpc".format(
            z, HII_DIM, BOX_LEN), updated)

    if False:  #velocity information may not be useful for linear field
        plan = Plan(HII_shape, dtype=np.complex64)
        deltak_d = gpuarray.to_gpu(deltax).astype(np.complex64)
        vbox_d = gpuarray.zeros_like(deltak_d)
        plan.execute(deltak_d)  #transform the density box to k-space
        dDdt_D = np.float32(dDdtoverD(z))
        for num, mode in enumerate(['x', 'y', 'z']):
            velocity_kernel(deltak_d,
                            vbox_d,
                            dDdt_D,
                            DIM,
                            np.int32(num),
                            block=block_size,
                            grid=grid_size)
            # NOTE: smallvbox_d would still need the filter, inverse-FFT and
            # subsample steps used in evolve_zeldovich to be defined here.
            np.save(
                parent_folder +
                "/Boxes/updated_v{0}overddot_{1:d}_{2:.0f}Mpc".format(
                    mode, HII_DIM, BOX_LEN), smallvbox_d.get())

    return
Example #2
def _get_plan(itype, otype, inlen):
    try:
        theplan = _plans[(itype, otype, inlen)]
    except KeyError:
        theplan = Plan(inlen, dtype=itype, normalize=False, fast_math=True)
        _plans.update({(itype, otype, inlen): theplan})

    return theplan
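A minimal usage sketch of the plan cache above, assuming the module-level `_plans` dict it relies on and an active CUDA context; the array length and contents are arbitrary:

import numpy as np
import pycuda.autoinit  # creates a CUDA context for pyfft
import pycuda.gpuarray as gpuarray

_plans = {}  # cache consulted by _get_plan

x = np.random.rand(1024).astype(np.complex64)
x_d = gpuarray.to_gpu(x)
plan = _get_plan(x.dtype, x.dtype, len(x))  # first call builds and caches the plan
plan.execute(x_d)                           # in-place forward FFT on the GPU
assert _get_plan(x.dtype, x.dtype, len(x)) is plan  # later lookups hit the cache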
Example #3
def conv(delta_d, filt_d, shape, fil):
	"""FFT-based convolution of delta_d with filt_d on the GPU."""
	plan = Plan(shape, dtype=np.complex64)
	plan.execute(delta_d)  #forward transform of the density box
	if fil == 'rspace':
		plan.execute(filt_d)  #filter given in real space: transform it too
	smoothed_d = delta_d * filt_d.conj()  #multiply by the conjugate filter spectrum
	plan.execute(smoothed_d, inverse=True)  #back to real space
	return smoothed_d.real
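For reference, a NumPy sketch of the operation conv performs: multiply the box's spectrum by the conjugate of the filter's spectrum and transform back (when fil == 'rspace' the filter is assumed to arrive in real space and gets its own forward transform). Depending on the plan's normalization, the GPU result may differ from this by a constant factor.

import numpy as np

def conv_cpu(delta, filt, fil='kspace'):
    # CPU reference for conv(): k-space multiplication by the conjugate filter.
    delta_k = np.fft.fftn(delta)
    filt_k = np.fft.fftn(filt) if fil == 'rspace' else filt
    return np.fft.ifftn(delta_k * np.conj(filt_k)).real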
Example #4
def gpu_fft(data, inverse=False):
    global plan, ctx, stream  ##cuda
    if not plan:
        print 'building plan', data.shape
        plan = Plan(data.shape, stream=stream, wait_for_finish=True)

    result = gpuarray.zeros_like(data)

    plan.execute(data, data_out=result, inverse=inverse)

    return result
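gpu_fft relies on module-level plan, ctx, and stream; a hedged sketch of how that state might be initialized and the helper used, assuming these names live in the same module as gpu_fft:

import numpy as np
import pycuda.driver as cuda
import pycuda.gpuarray as gpuarray
from pycuda.tools import make_default_context
from pyfft.cuda import Plan

cuda.init()
ctx = make_default_context()
stream = cuda.Stream()
plan = None  # gpu_fft builds the Plan lazily on first use

data = gpuarray.to_gpu(np.ones((256, 256), dtype=np.complex64))
spec = gpu_fft(data)                 # forward 2D FFT
back = gpu_fft(spec, inverse=True)   # inverse transform of the spectrum
ctx.pop()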
Example #5
def prep(image, psf):
    datadim1 = image.shape[0]
    datadim2 = image.shape[1]
    if datadim1 != datadim2:
        ddim = max(datadim1, datadim2)
        s = numpy.binary_repr(ddim - 1)
        s = s[:-1] + '0'  # Guarantee that padding is used
    else:
        ddim = datadim1
        s = numpy.binary_repr(ddim - 1)
    if s.find('0') > 0:
        size = 2**len(s)
        boxd = numpy.zeros((size, size))
        r = size - datadim1
        r1 = r2 = r // 2
        if r % 2 == 1:
            r1 = r // 2 + 1
        c = size - datadim2
        c1 = c2 = c // 2
        if c % 2 == 1:
            c1 = c // 2 + 1
        boxdslice = (slice(r1, datadim1 + r1), slice(c1, datadim2 + c1))
        boxd[boxdslice] = image
    else:
        boxd = image

    boxp = boxd * 0.
    if boxd.shape[0] == psf.shape[0]:
        boxp = psf.copy()
    else:
        r = boxp.shape[0] - psf.shape[0]
        r1 = r // 2 + 1
        c = boxp.shape[1] - psf.shape[1]
        c1 = c // 2 + 1
        boxpslice = (slice(r1,
                           psf.shape[0] + r1), slice(c1, psf.shape[1] + c1))
        boxp[boxpslice] = psf.copy()

    from pyfft.cuda import Plan
    import pycuda.driver as cuda
    from pycuda.tools import make_default_context
    import pycuda.gpuarray as gpuarray
    cuda.init()
    context = make_default_context()
    stream = cuda.Stream()

    plan = Plan(boxp.shape, stream=stream)
    gdata = gpuarray.to_gpu(boxp.astype(numpy.complex64))
    plan.execute(gdata)
    return gdata, boxd.shape, boxdslice, plan, stream
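A quick check (added for illustration) of the padding logic in prep: 2**len(numpy.binary_repr(ddim - 1)) is the next power of two at or above ddim; for a square input whose size is already a power of two, binary_repr(ddim - 1) contains no '0' and the padding branch is skipped.

import numpy

for ddim in (5, 8, 100, 1024):
    s = numpy.binary_repr(ddim - 1)
    assert 2 ** len(s) == 1 << int(numpy.ceil(numpy.log2(ddim)))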
Example #6
    def run(self):
        drv.init()
        a0 = numpy.zeros((p,), dtype=numpy.complex64)
        self.dev = drv.Device(self.number)
        self.ctx = self.dev.make_context()
        # Verify that all the memory is freed before the next allocation
        # (this does not happen in multithreading).
        print drv.mem_get_info()
        self.gpu_a = garray.empty((self.input_cpu.size,), dtype=numpy.complex64)
        self.gpu_b = garray.zeros_like(self.gpu_a)
        self.gpu_a = garray.to_gpu(self.input_cpu)
        plan = Plan(a0.shape, context=self.ctx)
        plan.execute(self.gpu_a, self.gpu_b, batch=p/m)
        self.temp = self.gpu_b.get()
        print self.output_cpu._closed
        self.output_cpu.put(self.temp)
        self.ctx.pop()  # release this worker's CUDA context
Example #7
def image_cuda(grids):
    """ Run 2d FFT to image each plane of grid array
    """

    from pyfft.cuda import Plan
    from pycuda.tools import make_default_context
    import pycuda.gpuarray as gpuarray
    import pycuda.driver as cuda

    nints, npixx, npixy = grids.shape

    cuda.init()
    context = make_default_context()
    stream = cuda.Stream()

    plan = Plan((npixx, npixy), stream=stream)

    grid_gpu = gpuarray.to_gpu(grids)
    for i in range(0, nints):
        plan.execute(grid_gpu[i], inverse=True)
    grids = grid_gpu.get()

    context.pop()
    return recenter(grids.real, (npixx//2, npixy//2))
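A CPU reference for image_cuda, assuming the same recenter helper used above; depending on the plan's normalization, the GPU result may differ from numpy.fft.ifft2 by a constant factor:

import numpy as np

def image_cpu(grids):
    """ CPU reference: inverse 2D FFT of each plane of the grid array """
    npixx, npixy = grids.shape[1], grids.shape[2]
    imgs = np.fft.ifft2(grids, axes=(1, 2))
    return recenter(imgs.real, (npixx // 2, npixy // 2))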
Example #8
File: ssf.py  Project: cvarin/PyOFTK
def ssfgpuFull(u0,
               dt,
               dz,
               nz,
               alpha,
               betap,
               gamma,
               context,
               maxiter=4,
               tol=1e-5,
               phiNLOut=False):
    '''
	Very simple implementation of the symmetrized split-step Fourier algorithm.
	Solves the NLS equation with the SPM nonlinear term only.

		* error: third order in the step size
		* u0: Input field
		* dt: Time increment
		* dz: Space increment
		* nz: Number of spatial propagation steps
		* alpha: Loss/gain parameter (array)
		* betap: Beta array, beta[2] = GVD, beta[3] = TOD, etc.
		* gamma: Nonlinear parameter
		* maxiter: Maximum number of iterations per step (default 4)
		* tol: Error tolerance for each step (default 1e-5)
		* phiNLOut: If True, also return the nonlinear phase shift (default False)

		--- GPU Version (single precision) ---
	'''

    nt = len(u0)
    e_ini = pow(abs(u0), 2).sum()
    w = wspace(dt * nt, nt)
    phiNL = 0.0

    # Make sure u0 is in single precision
    u0 = u0.astype(complex64)
    alpha = alpha.astype(complex64)
    u1 = u0
    uArch = zeros([nz, nt], float32)
    uv = empty_like(u0)

    # Construction of the linear operator
    halfstep = -alpha / 2.0
    if len(betap) != nt:
        for ii in arange(len(betap)):
            halfstep = halfstep - 1.0j * betap[ii] * pow(w, ii) / factorial(ii)
    halfstep = exp(halfstep * dz / 2.0).astype(complex64)

    # CUDA Kitchen sink
    fftPlan = Plan((1, nt), dtype=numpy.complex64)

    # Allocate memory to the device
    gpu_halfstep = gpuarray.to_gpu(halfstep)
    gpu_u0 = gpuarray.to_gpu(u0)
    gpu_u1 = gpuarray.to_gpu(u1)
    gpu_uhalf = gpuarray.empty_like(gpu_u0)
    gpu_uv = gpuarray.empty_like(gpu_u0)
    gpu_ufft = gpuarray.empty_like(gpu_u0)

    fftPlan.execute(gpu_u0, gpu_ufft)

    # GPU Kernel corresponding to the linear operator
    halfStepKernel = ElementwiseKernel(
        "pycuda::complex<float> *u, pycuda::complex<float> *halfstep, pycuda::complex<float> *uhalf",
        "uhalf[i] = u[i] * halfstep[i]",
        "halfstep_linear",
        preamble="#include <pycuda-complex.hpp>",
    )

    # GPU Kernel corresponding to the nonlinear operator
    nlKernel = ElementwiseKernel(
        "pycuda::complex<float> *uhalf, pycuda::complex<float> *u0, pycuda::complex<float> *u1, pycuda::complex<float> *uv, float gamma, float dz",
        """
		float u0_int = pow(u0[i]._M_re,2) + pow(u0[i]._M_im,2);
		float u1_int = pow(u1[i]._M_re,2) + pow(u1[i]._M_im,2);
		float realArg = -gamma*(u1_int + u0_int)*dz;
		float euler1 = cos(realArg);
		float euler2 = sin(realArg);
		uv[i]._M_re = uhalf[i]._M_re * euler1 - uhalf[i]._M_im * euler2;
		uv[i]._M_im = uhalf[i]._M_im * euler1 + uhalf[i]._M_re * euler2;
		""",
        "halfstep_nonlinear",
        preamble="#include <pycuda-complex.hpp>",
    )

    # GPU reduction kernel computing the error between two complex arrays
    computeError = ReductionKernel(
        numpy.float32,
        neutral="0",
        reduce_expr="a+b",
        map_expr="pow(abs(a[i] - b[i]),2)",
        arguments="pycuda::complex<float> *a, pycuda::complex<float> *b",
        name="error_reduction",
        preamble="#include <pycuda-complex.hpp>",
    )

    # Perform a deep copy of a complex gpuarray
    complexDeepCopy = ElementwiseKernel(
        "pycuda::complex<float> *u1, pycuda::complex<float> *u2",
        "u1[i]._M_re = u2[i]._M_re;u1[i]._M_im = u2[i]._M_im",
        "gpuarray_deepcopy",
        preamble="#include <pycuda-complex.hpp>",
    )

    # Main Loop
    for iz in arange(nz):
        # First application of the linear operator
        halfStepKernel(gpu_ufft, gpu_halfstep, gpu_uhalf)
        fftPlan.execute(gpu_uhalf, inverse=True)
        for ii in arange(maxiter):
            # Apply the nonlinear operator, approximating the integral of
            # N(z)dz with the trapezoidal rule
            nlKernel(gpu_uhalf, gpu_u0, gpu_u1, gpu_uv, float(gamma),
                     float(dz / 2.0))
            fftPlan.execute(gpu_uv)
            # Second application of the linear operator
            halfStepKernel(gpu_uv, gpu_halfstep, gpu_ufft)
            fftPlan.execute(gpu_ufft, gpu_uv, inverse=True)

            error = computeError(gpu_u1, gpu_uv).get() / e_ini
            complexDeepCopy(gpu_u1, gpu_uv)

            if error < tol:
                break
        else:
            # for/else: reached only if the loop never broke, i.e. no convergence
            raise Exception("Failed to converge")

        complexDeepCopy(gpu_u0, gpu_u1)
        uArch[iz] = pow(abs(gpu_u1.get()), 2)

    u1 = gpu_u1.get()

    if phiNLOut:
        return [u1, uArch, phiNL]
    else:
        return [u1, uArch]
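A hypothetical driver for ssfgpuFull, following the parameter description in the docstring above; the pulse, the fibre parameters, and the PyCUDA context ctx are made-up placeholders rather than values from the original project:

import numpy

nt, nz = 2 ** 12, 100                                  # illustrative grid sizes
dt, dz = 0.01, 0.5
t = (numpy.arange(nt) - nt / 2.0) * dt
u0 = numpy.exp(-t ** 2 / 2).astype(numpy.complex64)    # Gaussian input pulse
alpha = numpy.zeros(nt)                                # lossless fibre
betap = numpy.array([0.0, 0.0, -1.0])                  # betap[2] = GVD coefficient
u_out, u_archive = ssfgpuFull(u0, dt, dz, nz, alpha, betap,
                              gamma=1.0, context=ctx)  # ctx: an active CUDA context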
Example #9
def init_stitch(N):
	"""outputs the high resolution k-box, and the smoothed r box

	Input
	-----------
	N:  int32
		size of box to load onto the GPU, should be related to DIM by powers of 2

	"""
	if N is None:
		N = np.int32(HII_DIM) #prepare for stitching
	META_GRID_SIZE = DIM/N
	M = np.int32(HII_DIM/META_GRID_SIZE)
	#HII_DIM = np.int32(HII_DIM)
	f_pixel_factor = DIM/HII_DIM;
	scale = np.float32(BOX_LEN/DIM)
	print 'scale', scale
	HII_scale = np.float32(BOX_LEN/HII_DIM)
	shape = (DIM,DIM,N)
	stitch_grid_size = (DIM/(block_size[0]),
						DIM/(block_size[0]),
						N/(block_size[0]))
	HII_stitch_grid_size = (HII_DIM/(block_size[0]),
						HII_DIM/(block_size[0]),
						M/(block_size[0]))
	#ratio of large box to small size
	kernel_source = open(cmd_folder+"/initialize_stitch.cu").read()
	kernel_code = kernel_source % {

		'DELTAK': DELTA_K,
		'DIM': DIM, 
		'VOLUME': VOLUME,
		'META_BLOCKDIM': N
	}
	main_module = nvcc.SourceModule(kernel_code)
	init_stitch = main_module.get_function("init_kernel")
	HII_filter = main_module.get_function("HII_filter")
	subsample_kernel = main_module.get_function("subsample")
	velocity_kernel = main_module.get_function("set_velocity")
	pspec_texture = main_module.get_texref("pspec")
	MRGgen = MRG32k3aRandomNumberGenerator(seed_getter=seed_getter_uniform, offset=0)
	plan2d = Plan((np.int64(DIM), np.int64(DIM)), dtype=np.complex64)
	plan1d = Plan((np.int64(DIM),), dtype=np.complex64)
	print "init pspec"
	interpPspec, interpSize = init_pspec() #interpPspec contains both k array and P array
	interp_cu = cuda.matrix_to_array(interpPspec, order='F')
	cuda.bind_array_to_texref(interp_cu, pspec_texture)
	#hbox_large = pyfftw.empty_aligned((DIM, DIM, DIM), dtype='complex64')
	hbox_large = np.zeros((DIM, DIM, DIM), dtype=np.complex64)
	hbox_small = np.zeros(HII_shape, dtype=np.float32)  #filled by the subsampling loop below
	smoothR = np.float32(L_FACTOR*BOX_LEN/HII_DIM)

	# Set up pinned memory for transfer
	#largebox_hs = cuda.aligned_empty(shape=shape, dtype=np.float32, alignment=resource.getpagesize())
	largebox_pin = cuda.pagelocked_empty(shape=shape, dtype=np.float32)
	largecbox_pin = cuda.pagelocked_empty(shape=shape, dtype=np.complex64)

	largebox_d = gpuarray.zeros(shape, dtype=np.float32)
	largebox_d_imag = gpuarray.zeros(shape, dtype=np.float32)
	print "init boxes"
	for meta_z in xrange(META_GRID_SIZE):
		# MRGgen = MRG32k3aRandomNumberGenerator(seed_getter=seed_getter_uniform, offset=meta_x*N**3)
		init_stitch(largebox_d, DIM, np.int32(meta_z),block=block_size, grid=stitch_grid_size)
		init_stitch(largebox_d_imag, DIM, np.int32(meta_z),block=block_size, grid=stitch_grid_size)
		largebox_d *= MRGgen.gen_normal(shape, dtype=np.float32)
		largebox_d_imag *= MRGgen.gen_normal(shape, dtype=np.float32)
		largebox_d = largebox_d + np.complex64(1.j) * largebox_d_imag
		cuda.memcpy_dtoh_async(largecbox_pin, largebox_d)
		hbox_large[:, :, meta_z*N:(meta_z+1)*N] = largecbox_pin.copy()
	#if want to get velocity need to use this
	if True:
		print "saving kbox"
		np.save(parent_folder+"/Boxes/deltak_z0.00_{0:d}_{1:.0f}Mpc.npy".format(DIM, BOX_LEN), hbox_large)

	print "Executing FFT on device"
	#hbox_large = pyfftw.interfaces.numpy_fft.ifftn(hbox_large).real
	hbox_large = fft_stitch(N, plan2d, plan1d, hbox_large, largebox_d).real
	print hbox_large.dtype
	print "Finished FFT on device"
	np.save(parent_folder+"/Boxes/deltax_z0.00_{0:d}_{1:.0f}Mpc.npy".format(DIM, BOX_LEN), hbox_large)
	
	if True:
		print "loading kbox"
		hbox_large = np.load(parent_folder+"/Boxes/deltak_z0.00_{0:d}_{1:.0f}Mpc.npy".format(DIM, BOX_LEN))
	for meta_z in xrange(META_GRID_SIZE):
		largebox_pin = hbox_large[:, :, meta_z*N:(meta_z+1)*N].copy()
		#cuda.memcpy_htod_async(largebox_d, largebox_pin)
		largebox_d = gpuarray.to_gpu_async(hbox_large[:, :, meta_z*N:(meta_z+1)*N].copy())
		HII_filter(largebox_d, DIM, np.int32(meta_z), ZERO, smoothR, block=block_size, grid=stitch_grid_size);
		hbox_large[:, :, meta_z*N:(meta_z+1)*N] = largebox_d.get_async()
	#import IPython; IPython.embed()
	print "Executing FFT on host"
	#hbox_large = hifft(hbox_large).astype(np.complex64).real
	#hbox_large = pyfftw.interfaces.numpy_fft.ifftn(hbox_large).real
	hbox_large = fft_stitch(N, plan2d, plan1d, hbox_large, largebox_d).real
	print "Finished FFT on host"
	#import IPython; IPython.embed()

	# for meta_x in xrange(META_GRID_SIZE):
	# 	for meta_y in xrange(META_GRID_SIZE):
	# 		for meta_z in xrange(META_GRID_SIZE):
	# 			largebox_d = gpuarray.to_gpu(hbox_large[meta_x*N:(meta_x+1)*N, meta_y*N:(meta_y+1)*N, meta_z*N:(meta_z+1)*N])
	# 			HII_filter(largebox_d, N, np.int32(meta_x), np.int32(meta_y), np.int32(meta_z), ZERO, smoothR, block=block_size, grid=grid_size);
	# 			hbox_large[meta_x*N:(meta_x+1)*N, meta_y*N:(meta_y+1)*N, meta_z*N:(meta_z+1)*N] = largebox_d.get()
	#plan = Plan(shape, dtype=np.complex64)
	#plan.execute(largebox_d, inverse=True)  #FFT to real space of smoothed box
	#largebox_d /=  VOLUME  #divide by VOLUME if using fft (vs ifft)


	# This saves a large resolution deltax

	
	print "downsampling"
	smallbox_d = gpuarray.zeros((HII_DIM,HII_DIM,M), dtype=np.float32)
	for meta_z in xrange(META_GRID_SIZE):
		largebox_d = gpuarray.to_gpu_async(hbox_large[:, :, meta_z*N:(meta_z+1)*N].copy())
		largebox_d /= scale**3
		subsample_kernel(largebox_d, smallbox_d, DIM, HII_DIM, PIXEL_FACTOR, block=block_size, grid=HII_stitch_grid_size) #subsample in real space
		hbox_small[:, :, meta_z*M:(meta_z+1)*M] = smallbox_d.get_async()
	np.save(parent_folder+"/Boxes/smoothed_deltax_z0.00_{0:d}_{1:.0f}Mpc".format(HII_DIM, BOX_LEN), hbox_small)
	#import IPython; IPython.embed()


	# To get velocities: reload the k-space box
	hbox_large = np.load(parent_folder+"/Boxes/deltak_z0.00_{0:d}_{1:.0f}Mpc.npy".format(DIM, BOX_LEN))
	hvbox_large = np.zeros((DIM, DIM, DIM), dtype=np.float32)
	hvbox_small = np.zeros(HII_shape, dtype=np.float32)
	smoothR = np.float32(L_FACTOR*BOX_LEN/HII_DIM)
	largevbox_d = gpuarray.zeros((DIM,DIM,N), dtype=np.complex64)
	smallvbox_d = gpuarray.zeros((HII_DIM, HII_DIM, M), dtype=np.float32)
	for num, mode in enumerate(['x', 'y', 'z']):
		for meta_z in xrange(META_GRID_SIZE):
			largebox_d = gpuarray.to_gpu_async(hbox_large[:, :, meta_z*N:(meta_z+1)*N].copy())
			#largebox_d /=  VOLUME  #divide by VOLUME if using fft (vs ifft)
			velocity_kernel(largebox_d, largevbox_d, DIM, np.int32(meta_z), np.int32(num), block=block_size, grid=stitch_grid_size)
			HII_filter(largevbox_d, DIM, ZERO, smoothR, block=block_size, grid=stitch_grid_size)
			print hvbox_large.shape, largevbox_d.shape
			hvbox_large[:, :, meta_z*N:(meta_z+1)*N] = largevbox_d.get_async()
		hvbox_large = fft_stitch(N, plan2d, plan1d, hvbox_large, largevbox_d).real
		for meta_z in xrange(META_GRID_SIZE):
			largevbox_d = gpuarray.to_gpu_async(hvbox_large[:, :, meta_z*N:(meta_z+1)*N].copy())
			subsample_kernel(largevbox_d.real, smallvbox_d, DIM, HII_DIM,PIXEL_FACTOR, block=block_size, grid=HII_stitch_grid_size)
			hvbox_small[:, :, meta_z*M:(meta_z+1)*M] = smallvbox_d.get_async()
		np.save(parent_folder+"/Boxes/v{0}overddot_{1:d}_{2:.0f}Mpc".format(mode, HII_DIM, BOX_LEN), smallvbox_d.get())

	return
Example #10
def init():
	"""outputs the high resolution k-box, and the smoothed r box"""
	N = np.int32(DIM) #prepare for stitching
	#HII_DIM = np.int32(HII_DIM)
	f_pixel_factor = DIM/HII_DIM;
	scale = np.float32(BOX_LEN)/DIM
	HII_scale = np.float32(BOX_LEN)/HII_DIM
	shape = (N,N,N)
	
	MRGgen = MRG32k3aRandomNumberGenerator(seed_getter=seed_getter_uniform, offset=0)

	kernel_source = open(cmd_folder+"/initialize.cu").read()
	kernel_code = kernel_source % {

		'DELTAK': DELTA_K,
		'VOLUME': VOLUME,
		'DIM': DIM
	}
	main_module = nvcc.SourceModule(kernel_code)
	init_kernel = main_module.get_function("init_kernel")
	HII_filter = main_module.get_function("HII_filter")
	adj_complex_conj = main_module.get_function("adj_complex_conj")
	subsample_kernel = main_module.get_function("subsample")
	velocity_kernel = main_module.get_function("set_velocity")
	pspec_texture = main_module.get_texref("pspec")

	interpPspec, interpSize = init_pspec() #interpPspec contains both k array and P array
	interp_cu = cuda.matrix_to_array(interpPspec, order='F')
	cuda.bind_array_to_texref(interp_cu, pspec_texture)

	largebox_d = gpuarray.zeros(shape, dtype=np.float32)
	init_kernel(largebox_d, np.int32(DIM), block=block_size, grid=grid_size)

	#import IPython; IPython.embed()
	largebox_d_imag = gpuarray.zeros(shape, dtype=np.float32)
	init_kernel(largebox_d_imag, np.int32(DIM), block=block_size, grid=grid_size)

	largebox_d *= MRGgen.gen_normal(shape, dtype=np.float32)
	largebox_d_imag *= MRGgen.gen_normal(shape, dtype=np.float32)
	largebox_d = largebox_d + np.complex64(1.j) * largebox_d_imag

	#adj_complex_conj(largebox_d, DIM, block=block_size, grid=grid_size)
	largebox = largebox_d.get()
	#np.save(parent_folder+"/Boxes/deltak_z0.00_{0:d}_{1:.0f}Mpc".format(DIM, BOX_LEN), largebox)

	#save real space box before smoothing
	plan = Plan(shape, dtype=np.complex64)
	plan.execute(largebox_d, inverse=True)  #FFT to real space (box not yet smoothed)
	largebox_d /= scale**3
	np.save(parent_folder+"/Boxes/deltax_z0.00_{0:d}_{1:.0f}Mpc".format(DIM, BOX_LEN), largebox_d.real.get_async())

	#save real space box after smoothing and subsampling
	# host largebox is still in k space, no need to reload from disk
	largebox_d = gpuarray.to_gpu(largebox)
	smoothR = np.float32(L_FACTOR*BOX_LEN/HII_DIM)
	HII_filter(largebox_d, N, ZERO, smoothR, block=block_size, grid=grid_size);
	plan.execute(largebox_d, inverse=True)  #FFT to real space of smoothed box
	largebox_d /= scale**3
	smallbox_d = gpuarray.zeros(HII_shape, dtype=np.float32)
	subsample_kernel(largebox_d.real, smallbox_d, N, HII_DIM, PIXEL_FACTOR, block=block_size, grid=HII_grid_size) #subsample in real space
	np.save(parent_folder+"/Boxes/smoothed_deltax_z0.00_{0:d}_{1:.0f}Mpc".format(HII_DIM, BOX_LEN), smallbox_d.get_async())

	# reload the k-space box for velocity boxes
	largebox_d = gpuarray.to_gpu(largebox)
	
	#largebox_d /=  VOLUME  #divide by VOLUME if using fft (vs ifft)
	smoothR = np.float32(L_FACTOR*BOX_LEN/HII_DIM)
	largevbox_d = gpuarray.zeros((DIM,DIM,DIM), dtype=np.complex64)
	smallbox_d = gpuarray.zeros(HII_shape, dtype=np.float32)
	for num, mode in enumerate(['x', 'y', 'z']):
		velocity_kernel(largebox_d, largevbox_d, DIM, np.int32(num), block=block_size, grid=grid_size)
		HII_filter(largevbox_d, DIM, ZERO, smoothR, block=block_size, grid=grid_size)
		plan.execute(largevbox_d, inverse=True)
		largevbox_d /= scale**3
		#import IPython; IPython.embed()
		subsample_kernel(largevbox_d.real, smallbox_d, DIM, HII_DIM,PIXEL_FACTOR, block=block_size, grid=HII_grid_size)
		np.save(parent_folder+"/Boxes/v{0}overddot_{1:d}_{2:.0f}Mpc".format(mode, HII_DIM, BOX_LEN), smallbox_d.get())

	return
Example #11
def evolve_zeldovich(z, deltax):
    """First order Zeldovich approximation. """
    if BOX_LEN > DIM:
        print "perturb_field: WARNING: Resolution is likely too low for accurate evolved density fields"
    #move_mass(updated_d, deltax_d, vx_d, vy_d, vz_d, np.float32(1./primordial_fgrowth))
    kernel_source = open(cmd_folder + "/perturb_field.cu").read()
    kernel_code = kernel_source % {
        'DELTAK': DELTA_K,
        'HII_DIM': HII_DIM,
        'DIM': DIM,
        'PIXEL_FACTOR': PIXEL_FACTOR
    }
    main_module = nvcc.SourceModule(kernel_code)
    move_mass = main_module.get_function("move_mass")
    velocity_kernel = main_module.get_function("set_velocity")
    filter_kernel = main_module.get_function("filter")
    subsample_kernel = main_module.get_function("subsample")

    fgrowth = np.float32(pb.fgrowth(
        z, COSMO['omega_M_0']))  #normalized to 1 at z=0
    primordial_fgrowth = np.float32(
        pb.fgrowth(INITIAL_REDSHIFT,
                   COSMO['omega_M_0']))  #normalized to 1 at z=0

    vx = np.load(
        parent_folder +
        "/Boxes/vxoverddot_{0:d}_{1:.0f}Mpc.npy".format(HII_DIM, BOX_LEN))
    vy = np.load(
        parent_folder +
        "/Boxes/vyoverddot_{0:d}_{1:.0f}Mpc.npy".format(HII_DIM, BOX_LEN))
    vz = np.load(
        parent_folder +
        "/Boxes/vzoverddot_{0:d}_{1:.0f}Mpc.npy".format(HII_DIM, BOX_LEN))
    vx_d = gpuarray.to_gpu(vx)
    vy_d = gpuarray.to_gpu(vy)
    vz_d = gpuarray.to_gpu(vz)
    vx_d *= ((fgrowth - primordial_fgrowth) / BOX_LEN
             )  #this is now comoving displacement in units of box size
    vy_d *= ((fgrowth - primordial_fgrowth) / BOX_LEN)
    vz_d *= ((fgrowth - primordial_fgrowth) / BOX_LEN)

    #updated_d = gpuarray.zeros_like(vx_d)
    start = cuda.Event()
    updated_d = gpuarray.zeros(HII_shape, dtype=np.float32)
    delta_d = gpuarray.to_gpu(deltax)
    start.record()
    start.synchronize()

    move_mass(updated_d,
              delta_d,
              vx_d,
              vy_d,
              vz_d,
              primordial_fgrowth,
              block=block_size,
              grid=grid_size)
    updated_d /= MASS_FACTOR
    updated_d -= np.float32(
        1.)  #renormalize to the new pixel size, and make into delta
    updated = updated_d.get_async()
    #import IPython; IPython.embed()
    np.save(
        parent_folder +
        "/Boxes/updated_smoothed_deltax_z{0:.2f}_{1:d}_{2:.0f}Mpc".format(
            z, HII_DIM, BOX_LEN), updated)

    plan = Plan((DIM, DIM, DIM), dtype=np.complex64)
    delta_d = delta_d.astype(np.complex64)
    #import IPython; IPython.embed()
    vbox_d = gpuarray.zeros_like(delta_d)
    smallvbox_d = gpuarray.zeros(HII_shape, dtype=np.float32)
    plan.execute(delta_d)  #now deltak
    dDdt_D = np.float32(dDdtoverD(z))
    #print dDdt_D
    #import IPython; IPython.embed()
    smoothR = np.float32(L_FACTOR * BOX_LEN / HII_DIM)
    for num, mode in enumerate(['x', 'y', 'z']):
        velocity_kernel(delta_d,
                        vbox_d,
                        dDdt_D,
                        DIM,
                        np.int32(num),
                        block=block_size,
                        grid=grid_size)
        filter_kernel(vbox_d,
                      DIM,
                      ZERO,
                      smoothR,
                      block=block_size,
                      grid=grid_size)
        plan.execute(vbox_d, inverse=True)
        subsample_kernel(vbox_d.real,
                         smallvbox_d,
                         DIM,
                         HII_DIM,
                         PIXEL_FACTOR,
                         block=block_size,
                         grid=HII_grid_size)
        np.save(
            parent_folder +
            "/Boxes/updated_v{0}overddot_{1:d}_{2:.0f}Mpc".format(
                mode, HII_DIM, BOX_LEN), smallvbox_d.get())

    return
Example #12
def simpleFourierTest2D(N=2048):
    """
    Using PyFFT to call CUDA.

    :return:
    """
    from pyfft.cuda import Plan
    import pycuda.driver as cuda
    from pycuda.tools import make_default_context
    import pycuda.gpuarray as gpuarray
    import time

    cuda.init()
    context = make_default_context()
    stream = cuda.Stream()

    plan = Plan((N, N), dtype=np.complex64, stream=stream)
    x = np.ones((N, N), dtype=np.complex64)

    x_gpu = gpuarray.to_gpu(x)

    plan.execute(x_gpu)
    res = x_gpu.get()
    plan.execute(x_gpu, inverse=True)
    result = x_gpu.get()
    context.pop()

    error = np.abs(np.sum(np.abs(x) - np.abs(result)) / x.size)
    #print 'Error:', error

    #Single precision
    print 'Array size %i x %i' % (N, N)
    print 'Single Precision'
    x = np.random.random((N, N))
    x = x.astype(np.complex64)

    start = time.time()
    cuda.init()
    context = make_default_context()
    stream = cuda.Stream()

    plan = Plan((N, N), dtype=np.complex64, stream=stream, fast_math=True)

    x_gpu = gpuarray.to_gpu(x)
    plan.execute(x_gpu)
    result = x_gpu.get()
    context.pop()
    end = time.time()
    cudatime = end - start

    #numpy
    start = time.time()
    xf = np.fft.fft2(x)
    end = time.time()
    numpytime = end - start

    print 'Same to 1e-2?'
    print np.testing.assert_allclose(xf, result, rtol=1e-2)
    print 'Numpy time', numpytime
    print 'CUDA time', cudatime

    #Double precision
    print '\n\nDouble Precision'
    x = np.random.random((N, N))
    x = x.astype(np.complex128)

    start = time.time()

    cuda.init()
    context = make_default_context()
    stream = cuda.Stream()

    plan = Plan((N, N), dtype=np.complex128, stream=stream, fast_math=True)

    x_gpu = gpuarray.to_gpu(x)
    plan.execute(x_gpu)
    result = x_gpu.get()
    context.pop()

    end = time.time()
    cudatime = end - start

    #numpy
    start = time.time()
    xf = np.fft.fft2(x)
    end = time.time()
    numpytime = end - start

    print 'Same to 1e-7?'
    print np.testing.assert_allclose(xf, result, rtol=1e-7)
    print 'Numpy time', numpytime
    print 'CUDA time', cudatime
Example #13
def run(xHfile=None, IO_DIR=None):

    if IO_DIR is None:
        IO_DIR = parent_folder
    if not os.path.exists(IO_DIR + "/Outfiles"):
        os.makedirs(IO_DIR + "/Outfiles")
    if xHfile is None:
        xHfile = find_files(IO_DIR + "/Boxes/",
                            pattern="xH*{0:06.2f}_{1:d}_{2:.0f}*".format(
                                Z, HII_DIM, BOX_LEN))[0]
    if xHfile.endswith('.npy'):
        xH = np.load(xHfile)
        p_dict = boxio.parse_filename(os.path.splitext(xHfile)[0])
    else:
        b = boxio.readbox(xHfile)
        xH = b.box_data
        p_dict = b.param_dict
    Z = p_dict['z']
    #growth_factor = pb.fgrowth(Z, COSMO['omega_M_0'], unnormed=True)
    #overwrite global variables
    HII_DIM = p_dict['dim']
    BOX_LEN = np.float32(p_dict['BoxSize'])
    DELTA_K = np.float32(2 * np.pi / BOX_LEN)
    VOLUME = (BOX_LEN * BOX_LEN * BOX_LEN)
    HII_TOT_NUM_PIXELS = HII_DIM**3
    try:
        deltax = np.load(
            IO_DIR +
            "/Boxes/updated_smoothed_deltax_z0{0:.2f}_{1:d}_{2:.0f}Mpc.npy".
            format(Z, HII_DIM, BOX_LEN))
    except IOError:
        #deltax = boxio.readbox(IO_DIR+"/Boxes/updated_smoothed_deltax_z{0:.2f}_{1:d}_{2:.0f}Mpc".format(Z, HII_DIM, BOX_LEN)).box_data
        deltax = boxio.readbox(
            IO_DIR +
            "/Boxes/updated_smoothed_deltax_z0{0:.2f}_{1:d}_{2:.0f}Mpc".format(
                Z, HII_DIM *
                2, BOX_LEN)).box_data[:HII_DIM, :HII_DIM, :HII_DIM]

    kernel_source = open(cmd_folder + "/delta_T.cu").read()
    kernel_code = kernel_source % {
        'DELTAK': DELTA_K,
        'VOLUME': VOLUME,
        'NUM_BINS': NUM_BINS
    }
    main_module = nvcc.SourceModule(kernel_code)
    pbox_kernel = main_module.get_function("pbox_kernel")
    #pixel_deltax_d = gpuarray.to_gpu(deltax)
    #pixel_xH_d = gpuarray.to_gpu(xH)

    _const_factor = np.float32(
        27 * (COSMO['omega_b_0'] * COSMO['h'] * COSMO['h'] / 0.023) * np.sqrt(
            (0.15 / COSMO['omega_M_0'] / COSMO['h'] / COSMO['h']) *
            (1 + Z) / 10.0))
    delta_T = np.float32(_const_factor * xH * (1.0 + deltax))  #in mK
    ave = np.mean(delta_T)
    np.save(
        IO_DIR +
        "/Boxes/delta_T_no_halos_z{0:.2f}_nf{1:f}_useTs{2:d}_zetaX{3:.1e}_TvirminX{4:.1e}_aveTb{5:.2f}_{6:d}_{7:d}Mpc.npy"
        .format(Z, p_dict['nf'], USE_TS_IN_21CM, p_dict['eff'], ION_Tvir_MIN,
                ave, HII_DIM, int(BOX_LEN)), delta_T)

    deldel_T = (delta_T / ave - 1) * VOLUME / HII_TOT_NUM_PIXELS
    if DIMENSIONAL_T_POWER_SPEC:
        deldel_T *= ave

    plan = Plan(HII_shape, dtype=np.complex64)
    deldel_T_d = gpuarray.to_gpu(deldel_T.astype(np.complex64))
    plan.execute(deldel_T_d)
    K = np.float32(
        np.logspace(np.log10(DELTA_K), np.log10(DELTA_K * np.sqrt(3.) * DIM),
                    NUM_BINS))
    K_d = gpuarray.to_gpu(K)
    k_ave_d = gpuarray.zeros_like(K_d)
    in_bin_ct_d = gpuarray.zeros_like(K_d)
    ps_d = gpuarray.zeros_like(K_d)

    pbox_kernel(deldel_T_d,
                DIM,
                ps_d,
                k_ave_d,
                in_bin_ct_d,
                K_d,
                block=block_size,
                grid=HII_grid_size)
    ps = ps_d.get()
    in_bin_ct = in_bin_ct_d.get()
    k_ave = k_ave_d.get()
    k_ave = np.where(in_bin_ct > 0, k_ave / in_bin_ct, 0.)
    ps_ave = np.where(in_bin_ct > 0, ps / in_bin_ct, 0.)
    #ps_fname = "/ps_nov_no_halos_z{0:.2f}_nf{1:f}_useTs{2:d}_zetaX{3:.1e}_TvirminX{4:.1e}_aveTb{5:.2f}_{6:d}_{7:d}Mpc".format(Z, p_dict['nf'], USE_TS_IN_21CM, p_dict['eff'], ION_Tvir_MIN, ave, HII_DIM, np.int32(BOX_LEN))
    #np.savez(IO_DIR+ps_fname, k_ave=k_ave, ps_ave=ps_ave)

    return K, k_ave, ps_ave
Example #14
def conv_bubbles(I, param_dict, Z, scale=None, fil=1, update=0, LE=False, visualize=False):
	"""uses fft convolution"""
	zeta = 40.
	Lfactor = 0.620350491
	# Z = param_dict['z']
	DELTA_R_FACTOR = 1.05
	print "Using filter_type {}".format(fil)	
	if scale is None:
		scale = param_dict['BoxSize']/param_dict['HIIdim']
	dk = 2*np.pi/(I.shape[0]*scale)  #delta k in inverse Mpc (2*pi/BoxSize)
	RMAX = np.float32(30) #in Mpc
	RMIN = np.float32(1.)
	mm = mmin(Z, Tvir=1.e4)
	smin = sig0(m2R(mm))
	#smin = pb.sigma_r(m2R(mm), Z, **cosmo)[0]
	deltac = Deltac(Z)
	fgrowth = np.float32(deltac/1.686)
	#fgrowth = 1./pb.fgrowth(Z, cosmo['omega_M_0'], unnormed=True)
	fc_mean_ps = pb.collapse_fraction(np.sqrt(smin), deltac).astype(np.float32)  #mean collapse fraction of universe
	print fc_mean_ps
	"""find bubbbles for deltax box I"""
	kernel_source = open("find_bubbles.cu").read()
	kernel_code = kernel_source % {
        'DELTAC': deltac,
        'RMIN': RMIN,
        'SMIN': smin, 
        'ZETA': zeta,
        'DELTAK': dk
    }
	main_module = nvcc.SourceModule(kernel_code)
	fcoll_kernel = main_module.get_function("fcoll_kernel")
	update_kernel = main_module.get_function("update_kernel")
	update_sphere_kernel = main_module.get_function("update_sphere_kernel")
	final_kernel = main_module.get_function("final_kernel")
	HII_filter = main_module.get_function("HII_filter")
	# Get contiguous image + shape.
	height, width, depth = I.shape
	HII_TOT_NUM_PIXELS = height*width*depth
	
	
	# Get block/grid size; make sure it divides evenly (currently only powers of 2, so OK)
	block_size =  (8,8,8)
	grid_size =   (width/(block_size[0]),
				height/(block_size[0]),
				depth/(block_size[0]))
	 # Initialize variables.
	#ionized       = np.zeros([height,width,depth]) 
	#ionized       = np.float32(ionized)
	width         = np.int32(width)
	I             = np.float32(I.copy()) 
	if not LE:
		I *= fgrowth #linearly extrapolate the non-linear density to present
	#filt          = np.ones_like(I)


	# Transfer labels asynchronously.
	ionized_d = gpuarray.zeros([height,width,depth], dtype=np.float32) 
	delta_d = gpuarray.to_gpu_async(I)
	# I_cu = cu.np_to_array(I, order='C')
	# cu.bind_array_to_texref(I_cu, image_texture)

	fftplan = Plan(I.shape, dtype=np.complex64)
	R = RMAX; cnt = 0

	if visualize:
		fig = plt.figure()
		ax0 = fig.add_subplot(131)
		ax0.set_title('Density')
		mydelta0 = plt.imshow(I.real[width/2])
		plt.colorbar()
		ax1 = fig.add_subplot(132)
		fig.suptitle(" Smoothed Density and Ionization")
		ax1.set_title('smoothed Density')
		mydelta = plt.imshow(delta_d.get().real[width/2])
		plt.colorbar()
		ax2 = fig.add_subplot(133)
		ax2.set_title('Ionization')
		myion = plt.imshow(np.ones_like(I)[width/2])
		plt.colorbar()
		if visualize == 'draw':
			plt.pause(.01)
			plt.draw()
		else:
			plt.savefig('tmp/{0:03d}.png'.format(cnt))

		#plt.colorbar()
	final_step = False
	final_denom = -1
	if RMIN < Lfactor*scale:
		temparg = 2*(smin - sig0(Lfactor*scale))
		if temparg < 0:
			raise ValueError("smin must exceed sig0(Lfactor*scale)")
		final_denom = np.sqrt(temparg).astype(np.float32)
	while not final_step:
		print 'R={} Mpc'.format(R)
		if (R/DELTA_R_FACTOR) <= (Lfactor*scale) or ((R/DELTA_R_FACTOR) <= RMIN): #stop if reach either rmin or cell size
			final_step = True
		R = np.float32(R)
		Rpix = np.float32(R/scale)

		S0 = np.float32(sig0(R))
		#S0 = np.float32(pb.sigma_r(R, Z, **cosmo)[0])
		denom = np.sqrt(2*(smin - S0)).astype(np.float32)
		print 'denom', denom

		start = cu.Event()
		step1 = cu.Event()
		step2 = cu.Event()
		step3 = cu.Event()
		step4 = cu.Event()
		end = cu.Event()

		start.record()
		#smoothed_d = conv(delta_d.astype(np.complex64), I.shape, fil=fil)

		delta_d = gpuarray.to_gpu_async(I).astype(np.complex64)
		fcoll_d = gpuarray.zeros(I.shape, dtype=np.float32)
		start.synchronize()
		if R > 1 or True: # smoothing
			fftplan.execute(delta_d)
			step1.record(); step1.synchronize()
			
			HII_filter(delta_d, width, np.int32(fil), R, block=block_size, grid=grid_size)
			step2.record(); step2.synchronize()
			#import IPython; IPython.embed()
			fftplan.execute(delta_d, inverse=True)

		if not final_step:
			fcoll_kernel(fcoll_d, delta_d.real, width, denom, block=block_size, grid=grid_size)
			step3.record(); step3.synchronize()
			if not LE:
				#fcollmean = gpuarray.sum((1+delta_d.real)*fcoll_d).get()/float(HII_TOT_NUM_PIXELS)
				fcollmean = gpuarray.sum(fcoll_d).get()/np.float32(HII_TOT_NUM_PIXELS)
				fcoll_d *= fc_mean_ps/fcollmean# #normalize since we used non-linear density
				step4.record(); step4.synchronize()
			if update == 0:
				update_kernel(ionized_d, fcoll_d, width, block=block_size, grid=grid_size)
			elif update == 1:
				update_sphere_kernel(ionized_d, fcoll_d, width, Rpix, block=block_size, grid=grid_size)
			#import IPython; IPython.embed()
		else:
			if (RMIN > Lfactor*scale) or (final_denom < 0): final_denom = denom
			print 'final denom', final_denom
			fcoll_kernel(fcoll_d, delta_d.real, width, denom, block=block_size, grid=grid_size)
			step3.record(); step3.synchronize()
			if not LE:
				fcollmean = gpuarray.sum(fcoll_d).get()/np.float32(HII_TOT_NUM_PIXELS)
				#fcollmean = gpuarray.sum((1+delta_d.real)*fcoll_d).get()/float(HII_TOT_NUM_PIXELS)
				fcoll_d *= fc_mean_ps/fcollmean
				step4.record(); step4.synchronize()
			if update == 0:
				update_kernel(ionized_d, fcoll_d, width, block=block_size, grid=grid_size)
			elif update == 1:
				update_sphere_kernel(ionized_d, fcoll_d, width, Rpix, block=block_size, grid=grid_size)
			final_kernel(ionized_d, fcoll_d, width, block=block_size, grid=grid_size)
		end.record()
		end.synchronize()
		if visualize:
			mydelta.set_data(delta_d[width/2].real.get())
			myion.set_data(ionized_d[width/2].get())
			ax1.set_title('R = %f'%(R))
			if visualize == 'draw':
				plt.pause(.01)
				plt.draw()
			else:
				plt.savefig('tmp/{0:03d}.png'.format(cnt))


		R = R/DELTA_R_FACTOR
		cnt +=1 

	ionized = ionized_d.get()
	return ionized
Example #15
def conv_bubbles(deltax, param_dict, Z=None, scale=None, fil=1, update=0, LE=False, visualize=0, quiet=False):
	"""
Excursion-set formalism, or Fast Fourier Radiative-Transform. 
Calculates ionization fields from density field provided. 
For each box pixel, it cycles through various bubble radii
  , until it finds the largest radius such that the enclosed collapsed mass fraction 
  (obtained by summing masses from the halo list file of
  halos whose centers are within the bubble, or by taking 
  the mean collapsed mass from conditional press-schechter)
  is larger than 1/ZETA. 

Parameters
----------
deltax : numpy.float32 array
	Real space density box, must have dimensions powers of 2. 
param_dict: python dictionary
	dictionary of parameters created by boxio.parse_filename
Z: float32
	Required if input density is the present day linear density, program would extrapolate to Z. 
fil: int32
	type of filter for smoothing : 0: rtophat; 1: ktophat, 2: Gaussian
update: int32
	Method to update the ionization field 0: center pixel, 1: sphere painting
visualize: bool
	if True, draw slice of density field and created ionization field 
quiet: bool

Returns
----------

ion_field: numpy array float32

"""
	
	if not quiet: 
		print "Using filter_type {}".format(fil)	
	if scale is None:
		scale = param_dict['BoxSize']/param_dict['HIIdim']
	if Z is None:
		Z = param_dict['Z']
	sigmamin, deltac = pb.sig_del(ION_Tvir_MIN, Z, **COSMO)
	fgrowth = np.float32(deltac/1.686)
	smin = sigmamin**2
	fc_mean_ps = pb.collapse_fraction(sigmamin, deltac).astype(np.float32)  #mean collapse fraction of universe

	"""find bubbbles for deltax box I"""
	kernel_source = open(cmd_folder+"/find_bubbles.cu").read()
	kernel_code = kernel_source % {
        'DELTAC': deltac,
        'RMIN': R_BUBBLE_MIN,
        'ZETA': ZETA,
        'DELTAK': DELTA_K
    }
	main_module = nvcc.SourceModule(kernel_code)
	fcoll_kernel = main_module.get_function("fcoll_kernel")
	update_kernel = main_module.get_function("update_kernel")
	update_sphere_kernel = main_module.get_function("update_sphere_kernel")
	final_kernel = main_module.get_function("final_kernel")
	HII_filter = main_module.get_function("HII_filter")
	# Get contiguous image + shape.
	height, width, depth = deltax.shape
	HII_TOT_NUM_PIXELS = height*width*depth
	
	
	 # Initialize variables.
	width         = np.int32(width)
	deltax        = np.float32(deltax.copy()) 
	if not LE:
		deltax *= fgrowth #linearly extrapolate the non-linear density to present
	# Transfer asynchronously.
	ionized_d = gpuarray.zeros([height,width,depth], dtype=np.float32)
	delta_d = gpuarray.to_gpu_async(deltax)


	fftplan = Plan(deltax.shape, dtype=np.complex64)
	R = R_BUBBLE_MAX; cnt = 0

	if visualize > 0:
		fig = plt.figure()
		ax1 = fig.add_subplot(121)
		fig.suptitle(" Smoothed Density and Ionization")
		ax1.set_title('Density')
		mydelta = plt.imshow(delta_d.get().real[width/2])
		plt.colorbar()
		ax2 = fig.add_subplot(122)
		ax2.set_title('Ionization')
		myion = plt.imshow(np.ones_like(deltax)[width/2])
		plt.colorbar()
		if visualize == 1:
			print "HERE"
			plt.pause(.01)
			plt.draw()
		elif visualize == 2:
			plt.savefig('tmp/{0:03d}.png'.format(cnt))

		#plt.colorbar()
	final_step = False
	final_denom = -1
	if R_BUBBLE_MIN < L_FACTOR*scale:
		temparg = 2*(smin - sig0(L_FACTOR*scale))
		if temparg < 0:
			raise ValueError("smin must exceed sig0(L_FACTOR*scale)")
		final_denom = np.sqrt(temparg).astype(np.float32)
	while not final_step:
		
		if (R/DELTA_R_FACTOR) <= (L_FACTOR*scale) or ((R/DELTA_R_FACTOR) <= R_BUBBLE_MIN): #stop if reach either rmin or cell size
			final_step = True
		R = np.float32(R)
		S0 = np.float32(sig0(R))
		#S0 = np.float32(pb.sigma_r(R, Z, **cosmo)[0])
		denom = np.sqrt(2*(smin - S0)).astype(np.float32)
		if not quiet:
			print 'R={} Mpc'.format(R)
			print 'denom', denom

		start = cu.Event()
		step1 = cu.Event()
		step2 = cu.Event()
		step3 = cu.Event()
		step4 = cu.Event()
		end = cu.Event()

		start.record()
		delta_d = gpuarray.to_gpu_async(deltax.astype(np.complex64))
		fcoll_d = gpuarray.zeros(deltax.shape, dtype=np.float32)
		start.synchronize()
		fftplan.execute(delta_d)
		step1.record(); step1.synchronize()
		
		HII_filter(delta_d, width, np.int32(fil), R, block=block_size, grid=grid_size)
		step2.record(); step2.synchronize()
		#import IPython; IPython.embed()
		fftplan.execute(delta_d, inverse=True)
		step2.synchronize()
		

		# if not the final step, get ionized regions, if final step paint partial ionizations
		if not final_step:
			fcoll_kernel(fcoll_d, delta_d.real, width, denom, block=block_size, grid=grid_size)
			step3.record(); step3.synchronize()
			if not LE:
				fcollmean = gpuarray.sum(fcoll_d).get()/float(HII_TOT_NUM_PIXELS)
				fcoll_d *= fc_mean_ps/fcollmean# #normalize since we used non-linear density
				step4.record(); step4.synchronize()
			if update == 0:
				update_kernel(ionized_d, fcoll_d, width, block=block_size, grid=grid_size)
			else:
				update_sphere_kernel(ionized_d, fcoll_d, width, R, block=block_size, grid=grid_size)
		else:
			if final_denom < 0: final_denom = denom
			# print 'final denom', final_denom
			fcoll_kernel(fcoll_d, delta_d.real, width, denom, block=block_size, grid=grid_size)
			step3.record(); step3.synchronize()
			if not LE:
				fcollmean = gpuarray.sum(fcoll_d).get()/float(HII_TOT_NUM_PIXELS)
				fcoll_d *= fc_mean_ps/fcollmean
				step4.record(); step4.synchronize()
			final_kernel(ionized_d, fcoll_d, width, block=block_size, grid=grid_size)
		end.record()
		end.synchronize()
		if visualize > 0:
			mydelta.set_data(delta_d.real.get()[width/2])
			myion.set_data(ionized_d.get()[width/2])
			ax1.set_title('R = %f'%(R))
			if visualize == 1:
				plt.pause(.01)
				plt.draw()
			elif visualize == 2:
				plt.savefig('tmp/{0:03d}.png'.format(cnt))


		R = R/DELTA_R_FACTOR
		cnt +=1 

	ionized = ionized_d.get()
	return ionized
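A hypothetical driver for conv_bubbles, loading a density box through boxio as the docstring suggests; the file name, redshift, and option values are illustrative only:

import numpy as np

fname = parent_folder + "/Boxes/updated_smoothed_deltax_z012.00_256_300Mpc"
b = boxio.readbox(fname)
ion_field = conv_bubbles(b.box_data.astype(np.float32), b.param_dict,
                         Z=12.0,      # redshift of the box
                         fil=1,       # k-space top-hat smoothing
                         update=0,    # flag only the central pixel
                         visualize=0, quiet=True)
np.save(parent_folder + "/Boxes/ionized_z012.00_256_300Mpc", ion_field)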
Example #16
def get_pyfft_plan(shape):
    try:
        from pyfft.cuda import Plan
    except ImportError:
        return None
    return Plan(shape, normalize=True, wait_for_finish=True)
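A short usage sketch for the helper above, assuming an active CUDA context (e.g. via pycuda.autoinit); the helper returns None when pyfft is not installed:

import numpy as np
import pycuda.autoinit
import pycuda.gpuarray as gpuarray

plan = get_pyfft_plan((256, 256))
if plan is not None:
    data = gpuarray.to_gpu(np.ones((256, 256), dtype=np.complex64))
    plan.execute(data)                 # forward FFT, in place
    plan.execute(data, inverse=True)   # normalized inverse: round-trips to the input
    roundtrip = data.get()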
Example #17
import numpy
from pyfft.cuda import Plan
import pycuda.autoinit
import pycuda.gpuarray as gpuarray
# w, h, k are the array dimensions (each a power of 2)
# im1, im2 are the input 3d arrays of dtype complex64
w = h = k = 512
im1 = numpy.random.rand(w, h, k).astype(numpy.complex64)
im2 = numpy.random.rand(w, h, k).astype(numpy.complex64)
%time plan = Plan((w, h, k), normalize=True)
# forward transform on device
%time im1_gpu = gpuarray.to_gpu(im1)
%time plan.execute(im1_gpu)
%time im1_ft = im1_gpu.get()
del im1_gpu
%time im2_gpu = gpuarray.to_gpu(im2)
%time plan.execute(im2_gpu)
%time im2_ft = im2_gpu.get()
del im2_gpu
# do multiplication on host - can be done on device.
%time conv = im1_ft * im2_ft
# inverse transform on device
%time conv_gpu = gpuarray.to_gpu(conv)
# del conv
%time plan.execute(conv_gpu, inverse=True)
%time corr_gpu = conv_gpu.get()
# Reference calculation on CPU:
%time im1_ft = numpy.fft.fftn(im1)
%time im2_ft = numpy.fft.fftn(im2)
%time conv = im1_ft * im2_ft
del im1
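As the comment in the session above notes, the elementwise spectrum product can also be formed on the device; a sketch reusing the plan and the host-side transforms from the session (names as defined there):

im1_gpu = gpuarray.to_gpu(im1_ft.astype(numpy.complex64))
im2_gpu = gpuarray.to_gpu(im2_ft.astype(numpy.complex64))
conv_gpu = im1_gpu * im2_gpu           # product of the spectra on the GPU
plan.execute(conv_gpu, inverse=True)   # inverse transform stays on the device
corr = conv_gpu.get()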