Example 1
def Pool_generic(func, arg, root_Nthreads, do_not_prefilter=False):
    assert len(arg) == 11, arg
    path_to_map, path_to_dx, path_to_dy, buff0, buff1, lside0, lside1, HD_res0, HD_res1, NR_iter, kspl = arg

    assert os.path.exists(path_to_dx) and os.path.exists(path_to_dy)
    assert IsPowerOfTwo(root_Nthreads)
    diff0, diff1 = Log2ofPowerof2((root_Nthreads, root_Nthreads))
    HD_shape = (2**HD_res0, 2**HD_res1)
    LD = (HD_res0 - diff0, HD_res1 - diff1)
    pool = setup_Pool()
    ret_list = pool.map(func, [[
        i, path_to_map, path_to_dx, path_to_dy, buff0, buff1, lside0, lside1,
        HD_res0, HD_res1, NR_iter, kspl, LD[0], LD[1], do_not_prefilter
    ] for i in range(root_Nthreads**2)])
    pool.close()
    pool.join()
    # Recombines from the lensed_chks :
    spliter_lib = periodicmap_spliter()  # library to split periodic maps.
    ret = []  # one map for lens, two for inverse
    for i in range(len(ret_list[0])):
        map = np.empty(HD_shape)
        if verbose:
            for j, N in enumerate_progress(
                    xrange(root_Nthreads**2),
                    label='Pool_generic:patching chks together'):
                sLDs, sHDs = spliter_lib.get_slices_chk_N(N,
                                                          LD,
                                                          (HD_res0, HD_res1),
                                                          (buff0, buff1),
                                                          inverse=True)
                map[sHDs[0]] = ret_list[N][i][sLDs[0]]
            ret.append(map)
        else:
            for j, N in enumerate(xrange(root_Nthreads**2)):
                sLDs, sHDs = spliter_lib.get_slices_chk_N(N,
                                                          LD,
                                                          (HD_res0, HD_res1),
                                                          (buff0, buff1),
                                                          inverse=True)
                map[sHDs[0]] = ret_list[N][i][sLDs[0]]
            ret.append(map)
    return ret
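
To make the recombination step concrete, the following is a minimal, self-contained sketch of the chunk-and-stitch pattern using plain NumPy slicing. It ignores the periodic buffer regions that periodicmap_spliter handles; stitch_chunks and chunk_shape are illustrative names, not part of the library.

import numpy as np

def stitch_chunks(chunks, root_N, chunk_shape):
    # chunks[N] holds the processed, buffer-free chunk with flat index N,
    # laid out row-major on a root_N x root_N grid of patches.
    HD = np.empty((root_N * chunk_shape[0], root_N * chunk_shape[1]))
    for N, chk in enumerate(chunks):
        i, j = divmod(N, root_N)
        HD[i * chunk_shape[0]:(i + 1) * chunk_shape[0],
           j * chunk_shape[1]:(j + 1) * chunk_shape[1]] = chk
    return HD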
Example 2
def apply_FDxiDtFt_GPU_inplace_timed(type,
                                     lib_alm_dat,
                                     lib_alm_sky,
                                     alms_unlCMB,
                                     f,
                                     f_inv,
                                     cls_unl,
                                     func='bicubic',
                                     double_precision_ffts=False):
    """
    Note that the first call might be substantially slower than subsequent calls, as it caches the fft and ifft plans
    for subsequent usage.
    :param type : 'T', 'QU' or 'TQU'
    :param alms_unlCMB: ffs_alms to apply FDxiDtFt to.
    :param func: bicubic or bilinear
    :param cls_unl : unlensed CMB cls dictionary (used in get_P_mat)
    :return: ffs_alms of shape (len(type), lib_alm_dat.alm_size)
    """
    if True:
        ti = time.time()
    assert func in ['bicubic', 'bilinear'], func
    assert alms_unlCMB.shape == (len(type), lib_alm_dat.alm_size)
    assert lib_alm_dat.ell_mat.shape == lib_alm_sky.ell_mat.shape
    assert lib_alm_dat.ell_mat.lsides == lib_alm_sky.ell_mat.lsides
    # Useful declarations :
    nfields = len(type)
    rshape = lib_alm_sky.ell_mat.rshape
    shape = (rshape[0], 2 * (rshape[1] - 1))
    flat_shape = np.prod(shape)

    GPU_grid = (shape[0] / GPU_block[0], shape[1] / GPU_block[1], 1)

    assert shape[0] % GPU_block[0] == 0

    assert shape[0] == shape[1], shape
    assert IsPowerOfTwo(shape[0]), shape
    assert f.shape == shape, (f.shape, shape)
    assert f_inv.shape == shape, (f_inv.shape, shape)
    assert np.all(np.array(shape) % GPU_block[0] == 0), shape

    if shape[0] > 4096:
        print "--- Exercise caution, array shapes larger than 4096 have never been tested so far ---"

    def get_rfft_unlCMB(idx):
        return lib_alm_dat.alm2rfft(alms_unlCMB[idx])

    # TODO : some get_Pij method
    if True:
        t0 = time.time()
    unlPmat = get_Pmat(type, lib_alm_sky, cls_unl)
    if True:
        dt = time.time() - t0
        print "     unl Pmat at %s Mpixel / sec, ex. time %s sec." % (
            np.prod(shape) / 1e6 / dt, dt)
        t0 = time.time()

    # 2D texture references :
    unl_CMB_tex = CUDA_module.get_texref("unl_CMB")
    dx_tex = CUDA_module.get_texref("tex_dx")
    dy_tex = CUDA_module.get_texref("tex_dy")

    # loading fft plans :
    plan, plan_inv = get_rfft_plans(shape,
                                    double_precision=double_precision_ffts)
    # Function references :
    # Spline bicubic prefiltering, bicubic interpolation and multiplication with magnification.
    prefilter = CUDA_module.get_function(
        "cf_outer_w"
    ) if not double_precision_ffts else CUDA_module.get_function("cdd_outer_w")
    lens_func = CUDA_module.get_function("%slensKernel_normtex" % func)
    magn_func = CUDA_module.get_function("detmagn_normtex")

    cplx_type = np.complex64 if not double_precision_ffts else np.complex128
    f_type = np.float32 if not double_precision_ffts else np.float64

    # We will store some maps in host memory for convenience
    temp_alms = np.zeros((nfields, lib_alm_sky.alm_size), dtype=cplx_type)

    setup_texture_nparr(dx_tex, f_inv.get_dx_ingridunits())
    setup_texture_nparr(dy_tex, f_inv.get_dy_ingridunits())
    coeffs_gpu = gpuarray.empty(shape, dtype=f_type, order='C')
    # Setting up the texture references to the displacement
    # (This is what contributes most to the cost, actually)
    rfft2_unlCMB_gpu = gpuarray.empty(rshape, dtype=cplx_type)
    if True:
        dt = time.time() - t0
        print "  First tex. setup at %s Mpixel / sec, ex. time %s sec." % (
            np.prod(shape) / 1e6 / dt, dt)
        t0 = time.time()
    wx_gpu = (6. / (2. * np.cos(
        2. * np.pi * Freq(np.arange(shape[0]), shape[0]) / shape[0]) + 4.) /
              shape[0])
    wx_gpu = gpuarray.to_gpu(wx_gpu.astype(f_type))

    for _f in xrange(nfields):
        # Multiplying with the spline coefficients and Fourier transforming
        rfft2_unlCMB_gpu.set(get_rfft_unlCMB(_f).astype(cplx_type))
        prefilter(rfft2_unlCMB_gpu,
                  wx_gpu,
                  np.int32(rshape[1]),
                  np.int32(rshape[0]),
                  block=GPU_block,
                  grid=GPU_grid)
        ifft(rfft2_unlCMB_gpu, coeffs_gpu, plan_inv, False)
        # coeffs_gpu now contains the prefiltered map to be bicubic-interpolated
        if f_type != np.float32: coeffs_gpu = coeffs_gpu.astype(np.float32)
        setup_texture_gpuarr(unl_CMB_tex, coeffs_gpu)
        if True:
            dt = time.time() - t0
            print "     CMB field %s texture setup at %s Mpixel / sec, ex. time %s sec." % (
                _f, np.prod(shape) / 1e6 / dt, dt)
            t0 = time.time()

        # Now bicubic interpolation with inverse displacement, and mult. with magnification.
        lens_func(coeffs_gpu,
                  np.int32(shape[0]),
                  block=GPU_block,
                  grid=GPU_grid,
                  texrefs=[unl_CMB_tex, dx_tex, dy_tex])
        magn_func(coeffs_gpu,
                  np.int32(shape[0]),
                  np.int32(flat_shape),
                  block=GPU_block,
                  grid=GPU_grid,
                  texrefs=[dx_tex, dy_tex])
        if True:
            dt = time.time() - t0
            print "     CMB field %s lensed and magnified at %s Mpixel / sec, ex. time %s sec." % (
                _f, np.prod(shape) / 1e6 / dt, dt)
            t0 = time.time()

        if f_type != np.float32: coeffs_gpu = coeffs_gpu.astype(f_type)

        fft(coeffs_gpu, rfft2_unlCMB_gpu, plan)

        # To be GPU memory friendly these maps are in the host memory :
        # TODO : should be possible to adapt the code to do everything on the GPU, by using 4 displacement textures.
        temp_alm = lib_alm_sky.rfftmap2alm(rfft2_unlCMB_gpu.get())
        for _g in xrange(nfields):
            temp_alms[_g] += temp_alm * unlPmat[:, _g, _f]

        if True:
            dt = time.time() - t0
            print "     CMB field %s built temp_alms at %s Mpixel / sec, ex. time %s sec." % (
                _f, np.prod(shape) / 1e6 / dt, dt)
            t0 = time.time()

    # We now lens and then fft each map, and return.
    # The lensing is now done with the forward displacement :
    setup_texture_nparr(dx_tex, f.get_dx_ingridunits())
    setup_texture_nparr(dy_tex, f.get_dy_ingridunits())
    if True:
        dt = time.time() - t0
        print "     Setup of forw. displ. textures at %s Mpixel / sec, ex. time %s sec." % (
            np.prod(shape) / 1e6 / dt, dt)
        t0 = time.time()

    lenCMB_gpu = gpuarray.empty(shape, dtype=np.float32, order='C')
    for _g in xrange(nfields):

        rfft2_unlCMB_gpu.set(
            lib_alm_sky.alm2rfft(temp_alms[_g]).astype(cplx_type))
        if True:
            dt = time.time() - t0
            print "     Pushing temp alm field %s at %s Mpixel / sec, ex. time %s sec." % (
                _g, np.prod(shape) / 1e6 / dt, dt)
            t0 = time.time()

        prefilter(rfft2_unlCMB_gpu,
                  wx_gpu,
                  np.int32(rshape[1]),
                  np.int32(rshape[0]),
                  block=GPU_block,
                  grid=GPU_grid)
        ifft(rfft2_unlCMB_gpu, coeffs_gpu, plan_inv, False)
        # Lensing :
        if f_type != np.float32: coeffs_gpu = coeffs_gpu.astype(np.float32)
        setup_texture_gpuarr(unl_CMB_tex, coeffs_gpu)
        lens_func(lenCMB_gpu,
                  np.int32(shape[0]),
                  block=GPU_block,
                  grid=GPU_grid,
                  texrefs=[unl_CMB_tex, dx_tex, dy_tex])
        if f_type != np.float32: lenCMB_gpu = lenCMB_gpu.astype(f_type)

        # lenCMB_gpu is now D xi D^t. Turn this to Fourier space :
        fft(lenCMB_gpu, rfft2_unlCMB_gpu, plan)
        if True:
            dt = time.time() - t0
            print "     Lensing + rfft of field %s at %s Mpixel / sec, ex. time %s sec." % (
                _g, np.prod(shape) / 1e6 / dt, dt)
            t0 = time.time()

        alms_unlCMB[_g] = lib_alm_dat.rfftmap2alm(
            rfft2_unlCMB_gpu.get())  # Pulling result from GPU to CPU
        if True:
            dt = time.time() - t0
            print "     Pulling back field %s at %s Mpixel / sec, ex. time %s sec." % (
                _g, np.prod(shape) / 1e6 / dt, dt)
            t0 = time.time()

    if True:
        dt = time.time() - ti
        print "GPU TQU did G D xi D^t G^t at %s Mpixel / sec, ex. time %s sec." % (
            np.prod(shape) / 1e6 / dt, dt)

    return alms_unlCMB
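
For reference, here is a hedged CPU sketch (NumPy only) of what the cf_outer_w prefiltering step appears to compute, assuming the kernel scales each rfft mode (i, j) by wx[i] * wx[j], with wx_k = 6 / (2 cos(2 pi k / N) + 4) the Fourier-space inverse of the cubic B-spline. The 1/shape[0] factor in the GPU version compensates for its unnormalized inverse FFT and is dropped here, since np.fft.irfft2 is already normalized.

import numpy as np

def prefilter_bicubic_cpu(m):
    # m : square (N, N) real map; returns the cubic B-spline prefiltered map.
    N = m.shape[0]
    w = 6. / (2. * np.cos(2. * np.pi * np.arange(N) / N) + 4.)
    rfft = np.fft.rfft2(m)
    rfft *= np.outer(w, w[:N // 2 + 1])
    return np.fft.irfft2(rfft, s=m.shape)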
Example 3
def inverse_GPU(dx, dy, rmins, NR):
    """
    Inverse displacement field on the GPU.
    The cost is dominated first by transferring the results from the GPU back to the host,
    and to a lesser degree by loading the dx and dy textures. The rest is very fast.
    :return: inverse displacement in physical units
    """
    dx = load_map(dx)
    dy = load_map(dy)
    assert dx.shape == dy.shape
    assert dx.shape[0] % GPU_block[0] == 0, (dx.shape, GPU_block)
    assert dx.shape[1] % GPU_block[1] == 0, (dx.shape, GPU_block)
    assert dx.shape[0] == dx.shape[1], dx.shape
    # FIXME : THIS DOES NOT APPEAR TO WORK PROPERLY FOR NON-POWER-OF-TWO INPUT MAPS, BUT WHY ?
    assert IsPowerOfTwo(dx.shape[0]), dx.shape

    shape = dx.shape
    GPU_grid = (shape[0] / GPU_block[0], shape[1] / GPU_block[1], 1)
    rshape = (shape[0], shape[1] / 2 + 1)

    if shape[0] > 2048:
        print "--- Exercise caution, array shapes larger than 2048 have never been tested so far ---"
    # if NR < 3:
    #    NR = 3
    #    print "--- inverse GPU : NR parameter changed to 3 ----" # This is just better
    # 2D texture references :
    Minvxx_tex = CUDA_inv_module.get_texref("Minv_xx")
    Minvyy_tex = CUDA_inv_module.get_texref("Minv_yy")
    Minvxy_tex = CUDA_inv_module.get_texref("Minv_xy")
    Minvyx_tex = CUDA_inv_module.get_texref("Minv_yx")
    dx_tex = CUDA_inv_module.get_texref("tex_dx")
    dy_tex = CUDA_inv_module.get_texref("tex_dy")

    # loading fft plans :
    plan, plan_inv = get_rfft_plans(shape)
    # Function references :
    # Spline bicubic prefiltering, bicubic interpolation and multiplication with magnification.
    prefilter = CUDA_inv_module.get_function("cf_outer_w")
    mult_inplace = CUDA_inv_module.get_function('ff_mult_inplace')
    divide_detM = CUDA_inv_module.get_function("divide_detmagn")

    cplx_type = np.complex64
    f_type = np.float32

    rminx = rmins[1].astype(f_type)
    rminy = rmins[0].astype(f_type)
    rminx_inv = (1. / rmins[1]).astype(f_type)
    rminy_inv = (1. / rmins[0]).astype(f_type)

    # The prefiltering done in this way requires a square matrix, but we could change that.
    wx_gpu = (6. / (2. * np.cos(2. * np.pi * np.fft.fftfreq(shape[0])) + 4.) / shape[0])
    wx_gpu = gpuarray.to_gpu(wx_gpu.astype(f_type))

    gpu_rfft = gpuarray.empty(rshape, dtype=cplx_type, order='C')

    # Setting up dx texture. The dx texture is in grid units
    gpu_map = gpuarray.to_gpu(dx.astype(f_type))
    mult_inplace(gpu_map, rminx_inv, np.int32(shape[0]), block=GPU_block, grid=GPU_grid)
    fft(gpu_map, gpu_rfft, plan)
    prefilter(gpu_rfft, wx_gpu, np.int32(rshape[1]), np.int32(rshape[0]), block=GPU_block, grid=GPU_grid)
    ifft(gpu_rfft, gpu_map, plan_inv, False)
    setup_texture_gpuarr(dx_tex, gpu_map)

    # Setting up dy texture :
    gpu_map.set(dy.astype(f_type))
    mult_inplace(gpu_map, rminy_inv, np.int32(shape[0]), block=GPU_block, grid=GPU_grid)
    fft(gpu_map, gpu_rfft, plan)
    prefilter(gpu_rfft, wx_gpu, np.int32(rshape[1]), np.int32(rshape[0]), block=GPU_block, grid=GPU_grid)
    ifft(gpu_rfft, gpu_map, plan_inv, False)
    setup_texture_gpuarr(dy_tex, gpu_map)

    # Setting magnification textures  Mxx :
    func = CUDA_inv_module.get_function("get_m1pMyy")
    func(gpu_map, np.int32(shape[0]), block=GPU_block, grid=GPU_grid)
    divide_detM(gpu_map, np.int32(shape[0]), block=GPU_block, grid=GPU_grid)
    fft(gpu_map, gpu_rfft, plan)
    prefilter(gpu_rfft, wx_gpu, np.int32(rshape[1]), np.int32(rshape[0]), block=GPU_block, grid=GPU_grid)
    ifft(gpu_rfft, gpu_map, plan_inv, False)
    setup_texture_gpuarr(Minvxx_tex, gpu_map)

    # Setting magnification textures  Myy :
    func = CUDA_inv_module.get_function("get_m1pMxx")
    func(gpu_map, np.int32(shape[0]), block=GPU_block, grid=GPU_grid)
    divide_detM(gpu_map, np.int32(shape[0]), block=GPU_block, grid=GPU_grid)
    fft(gpu_map, gpu_rfft, plan)
    prefilter(gpu_rfft, wx_gpu, np.int32(rshape[1]), np.int32(rshape[0]), block=GPU_block, grid=GPU_grid)
    ifft(gpu_rfft, gpu_map, plan_inv, False)
    setup_texture_gpuarr(Minvyy_tex, gpu_map)

    # Setting magnification textures  Mxy :
    func = CUDA_inv_module.get_function("get_Mxy")
    func(gpu_map, np.int32(shape[0]), block=GPU_block, grid=GPU_grid)
    divide_detM(gpu_map, np.int32(shape[0]), block=GPU_block, grid=GPU_grid)
    fft(gpu_map, gpu_rfft, plan)
    prefilter(gpu_rfft, wx_gpu, np.int32(rshape[1]), np.int32(rshape[0]), block=GPU_block, grid=GPU_grid)
    ifft(gpu_rfft, gpu_map, plan_inv, False)
    setup_texture_gpuarr(Minvxy_tex, gpu_map)

    # Setting magnification textures  Myx :
    func = CUDA_inv_module.get_function("get_Myx")
    func(gpu_map, np.int32(shape[0]), block=GPU_block, grid=GPU_grid)
    divide_detM(gpu_map, np.int32(shape[0]), block=GPU_block, grid=GPU_grid)
    fft(gpu_map, gpu_rfft, plan)
    prefilter(gpu_rfft, wx_gpu, np.int32(rshape[1]), np.int32(rshape[0]), block=GPU_block, grid=GPU_grid)
    ifft(gpu_rfft, gpu_map, plan_inv, False)
    setup_texture_gpuarr(Minvyx_tex, gpu_map)

    # iterations proper :
    # First iteration is simpler, no need to lens maps :
    iterate_0 = CUDA_inv_module.get_function("displ_0th")
    gpu_map2 = gpuarray.empty(shape, f_type, order='C')  # We use the already declared gpu_map for the dx component.
    iterate_0(gpu_map, gpu_map2, np.int32(shape[0]), block=GPU_block, grid=GPU_grid)
    if NR > 0:
        iterate = CUDA_inv_module.get_function("iterate")
        for i in xrange(NR):
            iterate(gpu_map, gpu_map2, np.int32(shape[0]), block=GPU_block, grid=GPU_grid)
    mult_inplace(gpu_map, rminx, np.int32(shape[0]), block=GPU_block, grid=GPU_grid)  # Turning to physical units
    mult_inplace(gpu_map2, rminy, np.int32(shape[0]), block=GPU_block, grid=GPU_grid)

    return gpu_map.get(), gpu_map2.get()  # in physical units
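
The iteration itself is a Newton-type solve of dinv(x) + d(x + dinv(x)) = 0, starting from dinv = -d (the displ_0th kernel) and refining with the inverse magnification (the iterate kernel). Below is a 1-D NumPy/SciPy sketch of the same idea, for illustration only; it does not reproduce the actual math inside the CUDA kernels.

import numpy as np
from scipy.interpolate import CubicSpline

def inverse_displ_1d(x, d, NR=3):
    spl_d = CubicSpline(x, d)      # interpolant of the forward displacement
    spl_dp = spl_d.derivative()    # its derivative; 1 + d' plays the role of the magnification
    dinv = -np.asarray(d, dtype=float)          # 0th-order guess
    for _ in range(NR):
        res = dinv + spl_d(x + dinv)            # residual of dinv(x) + d(x + dinv(x)) = 0
        dinv -= res / (1. + spl_dp(x + dinv))   # Newton update
    return dinv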
Example 4
def alm2lenmap_onGPU(lib_alm, unlalm, dx_gu, dy_gu, do_not_prefilter=False):
    """
    Lens the input unl_CMB map on the GPU using the pyCUDA interface.
    dx dy displacement in grid units. (f.get_dx_ingridunits() e.g.)
    Can be path to arrays or arrays or memmap.
    Will probably crash for too large maps, with need to split the job.
    Works for 4096 x 4096 at least on my laptop.

    Cost dominated by texture setup. # FIXME : try get rid of texture
    Note that the first call might be substantially slower than subsequent calls, as it caches the fft and ifft plans
    for subsequent usage.
    :param unl_CMB:
    :param func: bicubic or bilinear
    :param normalized_tex: use a modified version of the GPU bicubic spline to account for periodicity of the map
    :return:
    """
    if timed:
        ti = time.time()
    shape = lib_alm.ell_mat.shape
    rshape = (shape[0], shape[1] / 2 + 1)
    assert shape[0] == shape[1], shape
    assert IsPowerOfTwo(shape[0]), shape
    assert load_map(dx_gu).shape == shape, (load_map(dx_gu).shape,
                                            lib_alm.ell_mat.shape)
    assert load_map(dy_gu).shape == shape, (load_map(dy_gu).shape,
                                            lib_alm.ell_mat.shape)

    assert np.all(np.array(shape) % GPU_block[0] == 0), shape
    if shape[0] > 4096:
        print "--- Exercise caution, array shapes larger than 4096 have never been tested so far ---"

    GPU_grid = (shape[0] / GPU_block[0], shape[1] / GPU_block[1], 1)

    # Prefiltering forces the interpolant to pass through the samples and increases accuracy, but dominates the cost.
    rfft2_unlCMB_gpu = gpuarray.to_gpu(
        lib_alm.alm2rfft(unlalm / np.prod(shape)).astype(np.complex64))
    coeffs_gpu = gpuarray.empty(lib_alm.ell_mat.shape, dtype=np.float32)
    plan, plan_inv = get_rfft_plans(shape)

    if not do_not_prefilter:
        # The prefilter makes sure the spline is exact at the nodes.
        # Uncomment this to put coeffs_gpu on pitched memory to allow later for 2D texture binding :
        # alloc,pitch  = cuda.mem_alloc_pitch(shape[0] * 4,shape[1],4) # 4 bytes per float32
        wx = (6. / (2. * np.cos(
            2. * np.pi * Freq(np.arange(shape[0]), shape[0]) / shape[0]) + 4.))
        wx_gpu = gpuarray.to_gpu(wx.astype(np.float32))
        prefilter = CUDA_module.get_function("cf_outer_w")
        prefilter(rfft2_unlCMB_gpu,
                  wx_gpu,
                  np.int32(rshape[1]),
                  np.int32(rshape[0]),
                  block=GPU_block,
                  grid=GPU_grid)
        del wx_gpu

    ifft(rfft2_unlCMB_gpu, coeffs_gpu, plan_inv, False)

    # Binding arrays to textures and getting lensing func.
    if texture_count == 0:
        lens_func = CUDA_module.get_function("bicubiclensKernel_notex")
        tex_refs = []
        dx_gu = gpuarray.to_gpu(load_map(dx_gu).astype(np.float32))
        dy_gu = gpuarray.to_gpu(load_map(dy_gu).astype(np.float32))
    elif texture_count == 1:
        unl_CMB_tex = CUDA_module.get_texref("unl_CMB")
        tex_refs = [unl_CMB_tex]
        unl_CMB_tex.set_array(cuda.gpuarray_to_array(coeffs_gpu, "C"))
        del coeffs_gpu
        dx_gu = gpuarray.to_gpu(load_map(dx_gu).astype(np.float32))
        dy_gu = gpuarray.to_gpu(load_map(dy_gu).astype(np.float32))
        lens_func = CUDA_module.get_function(
            "bicubiclensKernel_normtex_singletex")
    elif texture_count == 3:
        unl_CMB_tex = CUDA_module.get_texref("unl_CMB")
        dx_tex = CUDA_module.get_texref("tex_dx")
        dy_tex = CUDA_module.get_texref("tex_dy")
        tex_refs = ([unl_CMB_tex, dx_tex, dy_tex])
        unl_CMB_tex.set_array(cuda.gpuarray_to_array(coeffs_gpu, "C"))
        del coeffs_gpu
        cuda.matrix_to_texref(load_map(dx_gu).astype(np.float32),
                              dx_tex,
                              order="C")
        cuda.matrix_to_texref(load_map(dy_gu).astype(np.float32),
                              dy_tex,
                              order="C")
        lens_func = CUDA_module.get_function("bicubiclensKernel_normtex")
    else:
        tex_refs = []
        lens_func = 0
        assert 0
    # Wrapping, important for periodic boundary conditions.
    # Note that WRAP has no effect for unnormalized texture coordinates.

    for tex_ref in tex_refs:
        tex_ref.set_address_mode(0, cuda.address_mode.WRAP)
        tex_ref.set_address_mode(1, cuda.address_mode.WRAP)
        tex_ref.set_filter_mode(cuda.filter_mode.POINT)
        tex_ref.set_flags(cuda.TRSF_NORMALIZED_COORDINATES)

    if timed: t0 = time.time()

    len_CMB = np.empty(shape, dtype=np.float32)

    if texture_count == 0:
        lens_func(cuda.Out(len_CMB),
                  coeffs_gpu,
                  dx_gu,
                  dy_gu,
                  np.int32(shape[0]),
                  block=GPU_block,
                  grid=GPU_grid,
                  texrefs=tex_refs)
    elif texture_count == 1:
        lens_func(cuda.Out(len_CMB),
                  dx_gu,
                  dy_gu,
                  np.int32(shape[0]),
                  block=GPU_block,
                  grid=GPU_grid,
                  texrefs=tex_refs)
    elif texture_count == 3:
        lens_func(cuda.Out(len_CMB),
                  np.int32(shape[0]),
                  block=GPU_block,
                  grid=GPU_grid,
                  texrefs=tex_refs)

    if timed:
        dt = time.time() - t0
        t_tot = time.time() - ti
        print "     GPU bicubic spline and transfer at %s Mpixel / sec, time %s sec" % (
            np.prod(lib_alm.ell_mat.shape) / 1e6 / dt, dt)
        print " Total ex. time at %s Mpixel / sec, ex. time %s sec." % (
            np.prod(shape) / 1e6 / t_tot, t_tot)
    return len_CMB.astype(np.float64)
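
For small maps, a CPU cross-check of what the bicubic lensing kernel computes can be written with scipy.ndimage.map_coordinates, which performs its own spline prefiltering. This equivalence, and the dx/dy axis convention used here, are assumptions made for validation purposes; this sketch is not part of the original code.

import numpy as np
from scipy import ndimage

def lens_map_cpu(unl_map, dx_gu, dy_gu):
    # Evaluate the (periodic) map at the displaced grid positions with a cubic spline.
    ny, nx = unl_map.shape
    i, j = np.meshgrid(np.arange(ny), np.arange(nx), indexing='ij')
    coords = np.array([i + dy_gu, j + dx_gu])   # displaced positions, in grid units
    return ndimage.map_coordinates(unl_map, coords, order=3, mode='grid-wrap')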
Example 5
def apply_cond3_GPU_inplace(type,
                            lib_alm_dat,
                            alms_unlCMB,
                            f,
                            f_inv,
                            cls_unl,
                            cl_transf,
                            cls_noise,
                            func='bicubic',
                            double_precision_ffts=False):
    """
    cond3 is F D^-t (B xi B^t + N) D^-1 F^t
    Note that the first call might be substantially slower than subsequent calls, as it caches the fft and ifft plans
    for subsequent usage, if not already in the fft plans (See __init__.py)
    :param type : 'T', 'QU' or 'TQU'
    :param alms_unlCMB: ffs_alms to apply the conditioner to.
    :param func: bicubic or bilinear
    :param cls_unl : unlensed CMB cls dictionary (used in get_P_mat)
    :return: ffs_alms of shape (len(type), lib_alm_dat.alm_size)
    """
    if timed:
        ti = time.time()

    assert func in ['bicubic', 'bilinear'], func
    assert alms_unlCMB.shape == (len(type), lib_alm_dat.alm_size)

    # Useful declarations :
    nfields = len(type)
    rshape = lib_alm_dat.ell_mat.rshape
    shape = (rshape[0], 2 * (rshape[1] - 1))
    flat_shape = np.prod(shape)

    GPU_grid = (shape[0] / GPU_block[0], shape[1] / GPU_block[1], 1)

    assert shape[0] % GPU_block[0] == 0, shape

    assert shape[0] == shape[1], shape
    assert IsPowerOfTwo(shape[0]), shape
    assert f.shape == shape, (f.shape, shape)
    assert f_inv.shape == shape, (f_inv.shape, shape)
    assert f.lsides == lib_alm_dat.ell_mat.lsides, (f.lsides,
                                                    lib_alm_dat.ell_mat.lsides)
    assert f_inv.lsides == lib_alm_dat.ell_mat.lsides, (
        f_inv.lsides, lib_alm_dat.ell_mat.lsides)

    assert np.all(np.array(shape) % GPU_block[0] == 0), shape

    if shape[0] > 4096:
        print "--- Exercise caution, array shapes larger than 4096 have never been tested so far ---"

    def get_rfft_unlCMB(idx):
        return lib_alm_dat.alm2rfft(alms_unlCMB[idx])

    unlPmat = get_Pmat(type,
                       lib_alm_dat,
                       cls_unl,
                       cl_transf=cl_transf,
                       cls_noise=cls_noise,
                       inverse=True)

    # 2D texture references :
    unl_CMB_tex = CUDA_module.get_texref("unl_CMB")
    dx_tex = CUDA_module.get_texref("tex_dx")
    dy_tex = CUDA_module.get_texref("tex_dy")

    # loading fft plans :
    plan, plan_inv = get_rfft_plans(shape,
                                    double_precision=double_precision_ffts)
    # Function references :
    prefilter = CUDA_module.get_function(
        "cf_outer_w"
    ) if not double_precision_ffts else CUDA_module.get_function("cdd_outer_w")
    lens_func = CUDA_module.get_function("%slensKernel_normtex" % func)
    magn_func = CUDA_module.get_function("detmagn_normtex")

    cplx_type = np.complex64 if not double_precision_ffts else np.complex128
    f_type = np.float32 if not double_precision_ffts else np.float64

    # We will store some maps in host memory for convenience
    temp_alm = np.zeros((nfields, lib_alm_dat.alm_size), dtype=cplx_type)

    # Setting up the texture references to the displacement
    # (This is what contributes most to the cost, actually)
    setup_texture_nparr(dx_tex, f_inv.get_dx_ingridunits())
    setup_texture_nparr(dy_tex, f_inv.get_dy_ingridunits())
    # Building spline coefficients (1 / shape[0] comes from ifft convention)
    wx_gpu = (6. / (2. * np.cos(
        2. * np.pi * Freq(np.arange(shape[0]), shape[0]) / shape[0]) + 4.) /
              shape[0])
    wx_gpu = gpuarray.to_gpu(wx_gpu.astype(f_type))
    coeffs_gpu = gpuarray.empty(shape, dtype=f_type, order='C')
    for _f in xrange(nfields):
        # Multiplying with the spline coefficients and Fourier transforming
        rfft2_unlCMB_gpu = gpuarray.to_gpu(
            get_rfft_unlCMB(_f).astype(cplx_type))
        prefilter(rfft2_unlCMB_gpu,
                  wx_gpu,
                  np.int32(rshape[1]),
                  np.int32(rshape[0]),
                  block=GPU_block,
                  grid=GPU_grid)
        ifft(rfft2_unlCMB_gpu, coeffs_gpu, plan_inv, False)
        # coeffs_gpu now contains the prefiltered map to be bicubic-interpolated

        # Now bicubic interpolation with inverse displacement.
        setup_texture_gpuarr(unl_CMB_tex, coeffs_gpu)
        lenCMB_gpu = gpuarray.empty(shape, dtype=np.float32, order='C')
        lens_func(lenCMB_gpu,
                  np.int32(shape[0]),
                  block=GPU_block,
                  grid=GPU_grid,
                  texrefs=[unl_CMB_tex, dx_tex, dy_tex])
        if f_type != np.float32: lenCMB_gpu = lenCMB_gpu.astype(f_type)

        # Back to Fourier space :
        rfft2_unlCMB_gpu = gpuarray.empty(rshape, dtype=cplx_type, order='C')
        fft(lenCMB_gpu, rfft2_unlCMB_gpu, plan)

        # We construct the map P_ij m_j which we will have to lens afterwards.
        # To be GPU memory friendly these maps are in the host memory :
        # for _g in xrange(nfields): ret[_g] += rfft2_unlCMB_gpu.get() * get_unlPmat(_g,_f)
        for _g in xrange(nfields):
            temp_alm[_g] += lib_alm_dat.rfftmap2alm(
                rfft2_unlCMB_gpu.get()) * unlPmat[:, _g, _f]

    # We now lens and then fft each map, and return.
    # The lensing is now done with the forward displacement :
    setup_texture_nparr(dx_tex, f.get_dx_ingridunits())
    setup_texture_nparr(dy_tex, f.get_dy_ingridunits())
    for _g in xrange(nfields):
        rfft2_unlCMB_gpu = gpuarray.to_gpu(
            lib_alm_dat.alm2rfft(temp_alm[_g]).astype(cplx_type))
        prefilter(rfft2_unlCMB_gpu,
                  wx_gpu,
                  np.int32(rshape[1]),
                  np.int32(rshape[0]),
                  block=GPU_block,
                  grid=GPU_grid)
        ifft(rfft2_unlCMB_gpu, coeffs_gpu, plan_inv, False)
        # Lensing by forward displacement, and multiplication by magnification :
        setup_texture_gpuarr(unl_CMB_tex, coeffs_gpu)
        lenCMB_gpu = gpuarray.empty(shape, dtype=np.float32, order='C')
        lens_func(lenCMB_gpu,
                  np.int32(shape[0]),
                  block=GPU_block,
                  grid=GPU_grid,
                  texrefs=[unl_CMB_tex, dx_tex, dy_tex])
        magn_func(lenCMB_gpu,
                  np.int32(shape[0]),
                  np.int32(flat_shape),
                  block=GPU_block,
                  grid=GPU_grid,
                  texrefs=[dx_tex, dy_tex])
        if f_type != np.float32: lenCMB_gpu = lenCMB_gpu.astype(f_type)
        # lenCMB_gpu is now D xi D^t. Turn this to Fourier space :
        fft(lenCMB_gpu, rfft2_unlCMB_gpu, plan)
        alms_unlCMB[_g] = lib_alm_dat.rfftmap2alm(
            rfft2_unlCMB_gpu.get().astype(
                np.complex128))  # Pulling result from GPU to CPU
    if timed:
        dt = time.time() - ti
        print "GPU TQU did conditioner 3 at %s Mpixel / sec, ex. time %s sec." % (
            np.prod(shape) / 1e6 / dt, dt)
    return
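
Both GPU routines above accumulate the (inverse-)lensed fields through the mode-dependent matrix unlPmat. Stripped of the lensing and FFT steps, that coupling reduces to the following contraction; this is a minimal sketch using the shapes seen in the code, with unlPmat of shape (alm_size, nfields, nfields).

import numpy as np

def couple_fields(alms, unlPmat):
    # alms : (nfields, alm_size) complex array; returns temp[g] = sum_f unlPmat[:, g, f] * alms[f].
    temp = np.zeros_like(alms)
    nfields = alms.shape[0]
    for _f in range(nfields):
        for _g in range(nfields):
            temp[_g] += alms[_f] * unlPmat[:, _g, _f]
    return temp   # equivalently: np.einsum('lgf,fl->gl', unlPmat, alms)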