예제 #1
0
 def updateMomentLM(self, dt, A, B, C, LU, LM, U, nstages):
     """Launch the prepared LM moment-update kernel for stage ``nstages``.

     Parameters
     ----------
     dt : scalar
         Time step, forwarded unchanged to the kernel.
     A, B, C : sequences
         Coefficient sequences; concatenated (``A + B + C``) and unpacked
         as individual kernel arguments.
     LU, LM, U : sequences of device arrays
         Concatenated (``LU + LM + U``) and converted with ``self.ptr``
         before being unpacked as kernel arguments.
     nstages : int
         1-based stage count; selects ``self.updateMomKernsLM[nstages-1]``.
     """
     # leading dimension: length of the first U array divided by self.nalph.
     # The builtin int replaces np.int, which was a plain alias of int and
     # no longer exists in NumPy >= 1.24.
     lda = int(U[0].shape[0])//self.nalph
     grid = get_grid_for_block(self.block, lda)
     coeffs = A + B + C
     args = LU + LM + U
     self.updateMomKernsLM[nstages-1].prepared_call(grid, self.block, 
             self._prefactor, lda, dt, *coeffs, *self.ptr(args))
예제 #2
0
파일: bc.py 프로젝트: jaisw7/dgfs1D_gpu
    def __init__(self, xsol, nl, vm, cfg, cfgsect, **kwargs):
        """Set up the boundary distribution and the ``applyBC`` launcher.

        All arguments are forwarded to the parent constructor. The boundary
        distribution is produced by the 'maxwellian' initial-condition class
        (with ``wall=False``), uploaded to the device, and the kernel template
        named by ``self.type`` is rendered and compiled.
        """
        super().__init__(xsol, nl, vm, cfg, cfgsect, **kwargs)

        # boundary distribution stored as a (vsize, 1) device array
        maxwellian_cls = subclass_where(DGFSInitConditionStd,
                                        model='maxwellian')
        bnd_cond = maxwellian_cls(cfg, self._vm, cfgsect, wall=False)
        bnd_vals = bnd_cond.get_init_vals().reshape(self._vm.vsize(), 1)
        self._d_bnd_f0 = gpuarray.to_gpu(bnd_vals)

        # substitutions for the kernel template
        tmpl_args = {
            'dtype': cfg.dtypename,
            'vsize': self._vm.vsize(),
            'cw': self._vm.cw(),
            'nl': nl,
            'x': xsol,
        }
        lookup = DottedTemplateLookup('dgfs1D.std.kernels.bcs', tmpl_args)
        module = compiler.SourceModule(
            lookup.get_template(self.type).render())

        # launch configuration sized by the velocity-mesh size
        threads = (128, 1, 1)
        blocks = get_grid_for_block(threads, self._vm.vsize())

        # applyBC takes four pointers followed by one scalar of cfg.dtype
        apply_fn = get_kernel(module, "applyBC",
                              [np.intp] * 4 + [cfg.dtype])

        def apply_bc(ul, ur, t):
            return apply_fn.prepared_call(
                blocks, threads, ul.ptr, ur.ptr,
                self._vm.d_cvx().ptr, self._d_bnd_f0.ptr, t)

        self._applyBCKern = apply_bc
예제 #3
0
 def updateDistLM(self, dt, A, B, C, L, M, F, fnew, U, nstages):
     """Launch the prepared LM distribution-update kernel for ``nstages``.

     Parameters
     ----------
     dt : scalar
         Time step, forwarded unchanged to the kernel.
     A, B, C : sequences
         Coefficient sequences; concatenated (``A + B + C``) and unpacked
         as individual kernel arguments.
     L, M, F, U : lists of device arrays
         Concatenated as ``L + M + F + [fnew] + U`` and converted with
         ``self.ptr`` before being unpacked as kernel arguments.
     fnew : device array
         Inserted between ``F`` and ``U`` in the argument list.
     nstages : int
         1-based stage count; selects ``self.updateDistKernsLM[nstages-1]``.
     """
     # leading dimension: length of the first F array. The builtin int
     # replaces np.int, which no longer exists in NumPy >= 1.24.
     lda = int(F[0].shape[0])
     grid = get_grid_for_block(self.block, lda)
     coeffs = A + B + C
     args = L + M + F + [fnew] + U
     self.updateDistKernsLM[nstages-1].prepared_call(grid, self.block, 
         self._prefactor, lda, dt, *coeffs, *self.ptr(args))
예제 #4
0
    def constructMaxwellian(self, t, U, M):
        """Normalise a copy of the moments and launch the Maxwellian kernel.

        Parameters
        ----------
        t : scalar
            Not used by this method.
        U : device array
            Moments; ``U.shape[0]`` must equal ``lda * self.nalph``. ``U`` is
            copied into the scratch array ``self.normU`` and left untouched.
        M : device array
            Passed to the Maxwellian kernel; ``M.shape[0]`` is divided by
            the velocity-mesh size to obtain ``lda``.
        """
        lda = M.shape[0]//self.vm.vsize()
        assert lda==U.shape[0]//self.nalph, "Some issue"

        vm = self.vm

        # Allocate the scratch copy on first use. The check is on the type
        # rather than on truthiness: the truth value of an allocated device
        # array depends on its length/contents (or is ambiguous), so
        # ``not self.normU`` is not a reliable "not yet allocated" test.
        if not isinstance(self.normU, gpuarray.GPUArray):
            self.normU = gpuarray.empty_like(U)
        cuda.memcpy_dtod(self.normU.ptr, U.ptr, U.nbytes)

        # normalise the copied moments in place (one launch over lda entries)
        grid = get_grid_for_block(self.block, lda)
        self.momentNormKern.prepared_call(grid, self.block, lda, self.normU.ptr)

        # Maxwellian kernel, launched over lda*vsize entries
        grid = get_grid_for_block(self.block, lda*vm.vsize())
        self.cmaxwellianKern.prepared_call(
                    grid, self.block, lda*vm.vsize(), 
                    vm.d_cvx().ptr, vm.d_cvy().ptr, vm.d_cvz().ptr, 
                    M.ptr, self.normU.ptr)
예제 #5
0
    def collideNu(self, t, U, M, f, Q):
        """Launch the prepared ``collideNuKern`` kernel.

        Parameters
        ----------
        t : scalar
            Not used by this method.
        U, M, f, Q : device arrays
            Their ``.ptr`` values are passed to the kernel in the order
            ``M, U, f, Q``. ``M.shape[0]`` divided by the velocity-mesh size
            must equal ``U.shape[0] // self.nalph``.
        """
        lda = M.shape[0]//self.vm.vsize()
        assert lda==U.shape[0]//self.nalph, "Some issue"

        vm = self.vm
        grid = get_grid_for_block(self.block, lda*vm.vsize())
        # builtin int replaces np.int (removed in NumPy >= 1.24)
        self.collideNuKern.prepared_call(
                    grid, self.block, self._prefactor, int(lda*vm.vsize()), 
                    M.ptr, U.ptr, f.ptr, Q.ptr)
예제 #6
0
    def updateDistNuBDF(self, dt, *args):
        """Launch the prepared BDF distribution-update kernel.

        Parameters
        ----------
        dt : scalar
            Time step, forwarded unchanged to the kernel.
        *args
            Exactly ``4*q + 5`` kernel arguments; ``q`` is derived from the
            count and selects ``self.updateDistNuKernsBDF[q-1]``. The
            arguments are converted with ``self.ptr`` before the launch.
        """
        # the size of args should be 4*q+5 for BDF scheme
        q = (len(args) - 5)//4
        assert len(args)==4*q+5, "Inconsistency in number of parameters"

        # leading dimension comes from the second argument; the builtin int
        # replaces np.int, which no longer exists in NumPy >= 1.24
        lda = int(args[1].shape[0])
        grid = get_grid_for_block(self.block, lda)
        self.updateDistNuKernsBDF[q-1].prepared_call(grid, self.block, 
            self._prefactor, lda, dt, *self.ptr(args))
예제 #7
0
    def updateMomentARS(self, dt, *args):
        """Launch the prepared ARS moment-update kernel.

        Parameters
        ----------
        dt : scalar
            Time step, forwarded unchanged to the kernel.
        *args
            Exactly ``4*q + 1`` kernel arguments; ``q`` is derived from the
            count and selects ``self.updateMomKernsARS[q-1]``. The arguments
            are converted with ``self.ptr`` before the launch.
        """
        # the size of args should be 4*q+1 for ARS scheme
        q = (len(args) - 1)//4
        assert len(args)==4*q+1, "Inconsistency in number of parameters"

        # leading dimension: length of the last argument divided by
        # self.nalph; the builtin int replaces np.int (removed in
        # NumPy >= 1.24)
        lda = int(args[-1].shape[0])//self.nalph
        grid = get_grid_for_block(self.block, lda)
        self.updateMomKernsARS[q-1].prepared_call(grid, self.block, 
                self._prefactor, lda, dt, *self.ptr(args))
예제 #8
0
    def updateDistWeightSSPL(self, dt, *args):
        """Launch the prepared SSPL weighted distribution-update kernel.

        Parameters
        ----------
        dt : scalar
            Time step, forwarded unchanged to the kernel.
        *args
            Exactly ``6*q + 3`` kernel arguments; ``q`` is derived from the
            count and selects ``self.updateDistWeightKernsSSPL[q-1]``. The
            arguments are converted with ``self.ptr`` before the launch.
        """
        # the size of args should be 6*q+3 (this is what the check below
        # enforces)
        q = (len(args) - 3)//6
        assert len(args)==6*q+3, "Inconsistency in number of parameters"

        # leading dimension comes from the last argument; the builtin int
        # replaces np.int, which no longer exists in NumPy >= 1.24
        lda = int(args[-1].shape[0])
        grid = get_grid_for_block(self.block, lda)
        self.updateDistWeightKernsSSPL[q-1].prepared_call(grid, self.block, 
            self._prefactor, lda, dt, *self.ptr(args))
예제 #9
0
    def constructMaxwellian(self, t, U, M):
        """Launch the Maxwellian kernel directly on the moments ``U``.

        ``t`` is not used. ``M.shape[0]`` divided by the velocity-mesh size
        must equal ``U.shape[0] // self.nalph``; the kernel is launched over
        ``lda * vsize`` entries and receives the three velocity-component
        pointers followed by ``M`` and ``U``.
        """
        vmesh = self.vm
        lda = M.shape[0]//vmesh.vsize()
        assert lda==U.shape[0]//self.nalph, "Some issue"

        # total number of entries processed by the kernel
        nentries = lda*vmesh.vsize()
        launch_grid = get_grid_for_block(self.block, nentries)

        velocity_ptrs = (vmesh.d_cvx().ptr, vmesh.d_cvy().ptr,
                         vmesh.d_cvz().ptr)
        self.cmaxwellianKern.prepared_call(
            launch_grid, self.block, nentries,
            *velocity_ptrs, M.ptr, U.ptr)
예제 #10
0
파일: bc.py 프로젝트: jaisw7/dgfs1D_gpu
    def __init__(self, xsol, nl, vm, cfg, cfgsect, **kwargs):
        """Set up device storage and the ``applyBC``/``updateBC`` launchers.

        All arguments are forwarded to the parent constructor. The boundary
        distribution comes from the 'maxwellian' initial-condition class
        (with ``wall=True``). ``self._wall_nden`` is only assigned by the
        update launcher, so that launcher has to run before the apply
        launcher is first used.
        """
        super().__init__(xsol, nl, vm, cfg, cfgsect, **kwargs)

        # boundary distribution stored as a (vsize, 1) device array
        maxwellian_cls = subclass_where(DGFSInitConditionStd,
                                        model='maxwellian')
        bnd_cond = maxwellian_cls(cfg, self._vm, cfgsect, wall=True)
        bnd_vals = bnd_cond.get_init_vals().reshape(self._vm.vsize(), 1)
        self._d_bnd_f0 = gpuarray.to_gpu(bnd_vals)
        unondim = bnd_cond.unondim()

        # per-velocity numerator/denominator buffers filled by updateBC
        self._bc_vals_num = gpuarray.empty(self._vm.vsize(),
                                           self._d_bnd_f0.dtype)
        self._bc_vals_den = gpuarray.empty_like(self._bc_vals_num)

        # substitutions for the kernel template
        tmpl_args = {
            'dtype': cfg.dtypename,
            'vsize': self._vm.vsize(),
            'cw': self._vm.cw(),
            'ux': unondim[0, 0],
            'nl': nl,
            'x': xsol,
        }
        lookup = DottedTemplateLookup('dgfs1D.std.kernels.bcs', tmpl_args)
        module = compiler.SourceModule(
            lookup.get_template(self.type).render())

        # launch configuration sized by the velocity-mesh size
        threads = (128, 1, 1)
        blocks = get_grid_for_block(threads, self._vm.vsize())

        # applyBC: five pointers followed by one scalar of unondim.dtype
        apply_fn = get_kernel(module, "applyBC",
                              [np.intp] * 5 + [unondim.dtype])

        def apply_bc(ul, ur, t):
            return apply_fn.prepared_call(
                blocks, threads, ul.ptr, ur.ptr,
                self._vm.d_cvx().ptr, self._d_bnd_f0.ptr,
                self._wall_nden.ptr, t)

        self._applyBCKern = apply_bc

        # updateBC: same argument signature as applyBC
        update_fn = get_kernel(module, "updateBC",
                               [np.intp] * 5 + [unondim.dtype])

        def update_bc(ul, t):
            update_fn.prepared_call(blocks, threads, ul.ptr,
                                    self._vm.d_cvx().ptr,
                                    self._d_bnd_f0.ptr,
                                    self._bc_vals_num.ptr,
                                    self._bc_vals_den.ptr, t)
            # wall value = -(sum of numerators / sum of denominators)
            num_total = gpuarray.sum(self._bc_vals_num)
            den_total = gpuarray.sum(self._bc_vals_den)
            self._wall_nden = -(num_total / den_total)

        self._updateBCKern = update_bc
예제 #11
0
    def __init__(self, rhs, shape, dtype, **kwargs):
        """Store the right-hand side and build axnpby helpers and scratch.

        Parameters
        ----------
        rhs : callable
            Stored as ``self._rhs``.
        shape : tuple
            ``(K, Ne, Nv)``.
        dtype
            Element type used for the kernels and the scratch arrays.
        **kwargs
            Accepted but not used here.
        """
        self._rhs = rhs

        K, Ne, Nv = shape
        threads = (128, 1, 1)
        blocks = get_grid_for_block(threads, Ne*Nv)

        # axnpby kernel generator for nx vectors
        def make_kerns(nx):
            return get_axnpby_kerns(nx, range(K), Ne*Nv, dtype)

        # wrap a kernel so it accepts interleaved (scalar, vector, ...)
        # arguments; the kernel itself receives all vector pointers first,
        # then all scalars
        def bind(kern):
            def launch(*x):
                pointers = [vec.ptr for vec in x[1::2]]
                scalars = list(x[::2])
                return kern.prepared_call(blocks, threads,
                                          *(pointers + scalars))
            return launch

        self._addGen = make_kerns
        self._add = bind

        self.add2 = self._add(self._addGen(2))  # axnpby on 2 vectors
        self.add3 = self._add(self._addGen(3))  # axnpby on 3 vectors

        # allocate scratch storage
        self.scratch = [gpuarray.empty(K*Ne*Nv, dtype)
                        for _ in range(self._nscratch)]
예제 #12
0
    def init(self, explicit, sm, shape, dtype, **kwargs):
        """Store the integrator inputs and build the axnpby helpers.

        Parameters
        ----------
        explicit, sm, shape, dtype
            Stored on the instance as ``_explicit``, ``_sm``, ``_shape`` and
            ``_dtype``; ``shape`` is ``(K, Ne, Nv)``.
        **kwargs
            Only ``explicitQ`` is read (``None`` when absent).
        """
        self._explicit = explicit
        self._sm = sm
        self._shape = shape
        self._dtype = dtype
        self._explicitQ = kwargs.get('explicitQ')

        K, Ne, Nv = shape
        threads = (128, 1, 1)
        blocks = get_grid_for_block(threads, Ne*Nv)

        # axnpby kernel generator for nx vectors
        def make_kerns(nx):
            return get_axnpby_kerns(nx, range(K), Ne*Nv, dtype)

        # wrap a kernel so it accepts interleaved (scalar, vector, ...)
        # arguments; the kernel itself receives all vector pointers first,
        # then all scalars
        def bind(kern):
            def launch(*x):
                pointers = [vec.ptr for vec in x[1::2]]
                scalars = list(x[::2])
                return kern.prepared_call(blocks, threads,
                                          *(pointers + scalars))
            return launch

        self._addGen = make_kerns
        self._add = bind

        self.add2 = self._add(self._addGen(2))  # axnpby on 2 vectors
        self.add3 = self._add(self._addGen(3))  # axnpby on 3 vectors
예제 #13
0
파일: bc.py 프로젝트: jaisw7/dgfs1D_gpu
    def __init__(self, xsol, nl, vm, cfg, cfgsect, **kwargs):
        """Compile the kernel template for ``self.type`` and wrap ``applyBC``.

        All arguments are forwarded to the parent constructor. The resulting
        launcher takes ``(ul, ur, t)`` but passes only the two array
        pointers to the kernel; ``t`` is ignored.
        """
        super().__init__(xsol, nl, vm, cfg, cfgsect, **kwargs)

        # substitutions for the kernel template
        tmpl_args = {
            'dtype': cfg.dtypename,
            'vsize': self._vm.vsize(),
            'cw': self._vm.cw(),
            'nl': nl,
            'x': xsol,
        }
        lookup = DottedTemplateLookup('dgfs1D.std.kernels.bcs', tmpl_args)
        module = compiler.SourceModule(
            lookup.get_template(self.type).render())

        # launch configuration sized by the velocity-mesh size
        threads = (128, 1, 1)
        blocks = get_grid_for_block(threads, self._vm.vsize())

        # applyBC is prepared with two pointer arguments ('PP')
        apply_fn = get_kernel(module, "applyBC", 'PP')

        def apply_bc(ul, ur, t):
            return apply_fn.prepared_call(blocks, threads, ul.ptr, ur.ptr)

        self._applyBCKern = apply_bc
예제 #14
0
    def __init__(self, xsol, nl, vm, cfg, cfgsect, **kwargs):
        """Build one ``applyBC`` launcher per species.

        All arguments are forwarded to the parent constructor. The boundary
        distributions come from the 'maxwellian' initial-condition class
        (with ``wall=False``); each is flattened and uploaded to the device.
        The launchers are stored by index in ``self._applyBCKern``, which
        must already be an indexable container when this runs.
        """
        super().__init__(xsol, nl, vm, cfg, cfgsect, **kwargs)

        # one flattened device array per entry returned by get_init_vals()
        maxwellian_cls = subclass_where(DGFSInitConditionBi,
                                        model='maxwellian')
        bnd_cond = maxwellian_cls(cfg, self._vm, cfgsect, wall=False)
        bnd_vals = bnd_cond.get_init_vals()
        self._d_bnd_f0 = [gpuarray.to_gpu(vals.ravel()) for vals in bnd_vals]
        unondim = bnd_cond.unondim()

        # substitutions for the kernel template
        tmpl_args = {
            'dtype': cfg.dtypename,
            'vsize': self._vm.vsize(),
            'cw': self._vm.cw(),
            'ux': unondim[0, 0],
            'nl': nl,
            'x': xsol,
        }
        lookup = DottedTemplateLookup('dgfs1D.bi.kernels.bcs', tmpl_args)
        module = compiler.SourceModule(
            lookup.get_template(self.type).render())

        # launch configuration sized by the velocity-mesh size
        threads = (128, 1, 1)
        blocks = get_grid_for_block(threads, self._vm.vsize())

        # a single prepared kernel is shared by every species
        apply_fn = get_kernel(module, "applyBC",
                              [np.intp] * 4 + [unondim.dtype])

        # factory so each launcher keeps its own species index
        def bind_species(idx):
            def apply_bc(ul, ur, t):
                apply_fn.prepared_call(blocks, threads, ul.ptr, ur.ptr,
                                       self._vm.d_cvx().ptr,
                                       self._d_bnd_f0[idx].ptr, t)

            return apply_bc

        for idx in range(vm.nspcs()):
            self._applyBCKern[idx] = bind_species(idx)
예제 #15
0
파일: bc.py 프로젝트: jaisw7/dgfs1D_gpu
    def __init__(self, xsol, nl, vm, cfg, cfgsect, **kwargs):
        """Build the boundary distribution from config floats and wrap
        ``applyBC``.

        All arguments are forwarded to the parent constructor. ``rho``,
        ``ux`` and ``T`` are read as floats from ``cfgsect`` and passed to
        ``self.maxwellian``; the 'dgfs-inlet-nondim' kernel template is then
        rendered and compiled.
        """
        super().__init__(xsol, nl, vm, cfg, cfgsect, **kwargs)

        # self.maxwellian is evaluated after this alias is set
        self.vm = self._vm

        rho, ux, T = (cfg.lookupfloat(cfgsect, key)
                      for key in ('rho', 'ux', 'T'))
        bnd_vals = self.maxwellian(rho, ux, 0, 0, T)
        bnd_vals = bnd_vals.reshape(self._vm.vsize(), 1)
        self._d_bnd_f0 = gpuarray.to_gpu(bnd_vals)

        # substitutions for the kernel template
        tmpl_args = {
            'dtype': cfg.dtypename,
            'vsize': self._vm.vsize(),
            'cw': self._vm.cw(),
            'nl': nl,
            'x': xsol,
            'u': ux,
        }
        lookup = DottedTemplateLookup('dgfs1D.std.kernels.bcs', tmpl_args)
        module = compiler.SourceModule(
            lookup.get_template('dgfs-inlet-nondim').render())

        # launch configuration sized by the velocity-mesh size
        threads = (128, 1, 1)
        blocks = get_grid_for_block(threads, self._vm.vsize())

        # applyBC takes four pointers followed by one scalar of cfg.dtype
        apply_fn = get_kernel(module, "applyBC",
                              [np.intp] * 4 + [cfg.dtype])

        def apply_bc(ul, ur, t):
            return apply_fn.prepared_call(
                blocks, threads, ul.ptr, ur.ptr,
                self._vm.d_cvx().ptr, self._d_bnd_f0.ptr, t)

        self._applyBCKern = apply_bc
예제 #16
0
def main():
    """Run the MPI-distributed, multi-species 1-D solver end to end.

    The routine reads the configuration, builds the spatial mesh, basis,
    velocity mesh and scattering model, compiles the GPU kernels, sets up
    the boundary conditions (physical boundaries only on the first and last
    rank, 'dgfs-periodic' otherwise), defines the ODE right-hand side,
    marches the coefficients of every species from the start time to the
    end time, and finally prints the rank-averaged elapsed time on the root
    rank.
    """
    # who am I in this world? (Bulleh Shah, 18th century sufi poet)
    comm, rank, root = get_comm_rank_root()

    # read the inputs (from people)
    cfg, args = initialize()
    mesh = Mesh(cfg)

    # define 1D mesh (construct a 1D world view)
    xmesh = mesh.xmesh

    # number of elements (how refined perspectives do we want/have?)
    Ne = mesh.Ne

    # define the basis (what is the basis for those perspectives?)
    bsKind = cfg.lookup('basis', 'kind')
    basiscls = subclass_where(Basis, basis_kind=bsKind)
    basis = basiscls(cfg)

    # number of local degrees of freedom (depth/granularity of perspectives)
    K = basis.K

    # number of solution points (how far can I interpolate my learning)
    Nq = basis.Nq

    # left/right face maps
    Nqf = basis.Nqf  # number of points used in reconstruction at faces
    mapL, mapR = np.arange(Ne+1)+(Nqf-1)*Ne-1, np.arange(Ne+1)
    mapL[0], mapR[-1] = 0, Ne*Nqf-1
    Nf = len(mapL)

    # the zeros
    z = basis.z

    # jacobian of the mapping from D^{st}=[-1,1] to D
    jac, invjac = mesh.jac, mesh.invjac

    # load the velocity mesh
    vm = DGFSVelocityMeshBi(cfg)
    Nv = vm.vsize()

    # load the scattering model
    smn = cfg.lookup('scattering-model', 'type')
    scatteringcls = subclass_where(DGFSScatteringModelBi, 
        scattering_model=smn)
    sm = scatteringcls(cfg, vm, Ne=Ne)

    # initial time, time step, final time
    ti, dt, tf = cfg.lookupfloats('time-integrator', ('tstart', 'dt', 'tend'))
    # shrink dt so that an integer number of steps spans [ti, tf]
    nsteps = np.ceil((tf - ti)/dt)
    dt = (tf - ti)/nsteps

    # Compute the location of the solution points 
    xsol = np.array([0.5*(xmesh[j]+xmesh[j+1])+jac[j]*z for j in range(Ne)]).T
    xcoeff = np.einsum("kq,qe->ke", basis.fwdTransMat, xsol)

    # Determine the grid/block
    NeNv = Ne*Nv
    KNeNv = K*Ne*Nv
    NqNeNv = Nq*Ne*Nv
    NqfNeNv = Nqf*Ne*Nv
    NfNv = Nf*Nv
    block = (128, 1, 1)
    grid_Nv = get_grid_for_block(block, Nv)
    grid_NeNv = get_grid_for_block(block, Ne*Nv)
    grid_KNeNv = get_grid_for_block(block, K*Ne*Nv)

    # operator generator for matrix operations
    matOpGen = lambda v: lambda arg0, arg1: v.prepared_call(
                grid_NeNv, block, NeNv, arg0.ptr, NeNv, arg1.ptr, NeNv)

    # forward trans, backward, backward (at faces), derivative kernels
    fwdTrans_Op, bwdTrans_Op, bwdTransFace_Op, deriv_Op, invMass_Op = map(
        matOpGen, (basis.fwdTransOp, basis.bwdTransOp, 
            basis.bwdTransFaceOp, basis.derivOp, basis.invMassOp)
    )

    # U, V operator kernels
    trans_U_Op = tuple(map(matOpGen, basis.uTransOps))
    trans_V_Op = tuple(map(matOpGen, basis.vTransOps))

    # prepare the kernel for extracting face/interface values
    dfltargs = dict(
        K=K, Ne=Ne, Nq=Nq, vsize=Nv, dtype=cfg.dtypename,
        mapL=mapL, mapR=mapR, offsetL=0, offsetR=len(mapR)-1,
        invjac=invjac, gRD=basis.gRD, gLD=basis.gLD)
    kernsrc = DottedTemplateLookup('dgfs1D.bi.kernels', 
                                    dfltargs).get_template('bi').render()
    kernmod = compiler.SourceModule(kernsrc)

    # prepare operators for execution (see bi.mako for description)
    (extLeft_Op, extRight_Op, transferBC_L_Op, transferBC_R_Op, 
        insertBC_L_Op, insertBC_R_Op) = map(lambda v: 
        lambda *args: get_kernel(kernmod, v, 'PP').prepared_call(
            grid_Nv, block, *list(map(lambda c: c.ptr, args))
        ), ("extract_left", "extract_right", "transfer_bc_left", 
            "transfer_bc_right", "insert_bc_left", "insert_bc_right")
    )

    # The boundary conditions (by default all boundaries are processor bnds)
    bcl_type, bcr_type = 'dgfs-periodic', 'dgfs-periodic'

    # the mesh is decomposed in linear fashion, so rank 0 gets left boundary
    if rank==0: bcl_type = cfg.lookup('soln-bcs-xlo', 'type')

    # and the last rank comm.size-1 gets the right boundary
    if rank==comm.size-1:  bcr_type = cfg.lookup('soln-bcs-xhi', 'type')

    # prepare kernels for left boundary    
    bcl_cls = subclass_where(DGFSBCBi, type=bcl_type)
    bcl = bcl_cls(xmesh[0], -1., vm, cfg, 'soln-bcs-xlo')
    updateBC_L_Op = bcl.updateBCKern
    applyBC_L_Op = bcl.applyBCKern

    # prepare kernels for right boundary
    bcr_cls = subclass_where(DGFSBCBi, type=bcr_type)
    bcr = bcr_cls(xmesh[-1], 1., vm, cfg, 'soln-bcs-xhi')
    updateBC_R_Op = bcr.updateBCKern
    applyBC_R_Op = bcr.applyBCKern

    # flux kernel
    flux = get_kernel(kernmod, "flux", 'PPPPP')
    flux_Op = lambda d_uL, d_uR, d_jL, d_jR: flux.prepared_call(
            grid_Nv, block, 
            d_uL.ptr, d_uR.ptr, vm.d_cvx().ptr, d_jL.ptr, d_jR.ptr)

    # multiply the derivative by the advection velocity
    mulbyadv = get_kernel(kernmod, "mul_by_adv", 'PP')
    mulbyadv_Op = lambda d_ux: mulbyadv.prepared_call(
                    grid_KNeNv, block, vm.d_cvx().ptr, d_ux.ptr)

    # multiply the coefficient by the inverse jacobian
    mulbyinvjac = get_kernel(kernmod, "mul_by_invjac", 'P')
    mulbyinvjac_Op = lambda d_ux: mulbyinvjac.prepared_call(
                    grid_Nv, block, d_ux.ptr)

    # \alpha AX + \beta Y kernel (for operations on coefficients)
    axnpbyCoeff = get_axnpby_kerns(2, range(K), NeNv, cfg.dtype)
    axnpbyCoeff_Op = lambda a0, x0, a1, x1: axnpbyCoeff.prepared_call(
                    grid_NeNv, block, x0.ptr, x1.ptr, a0, a1)

    # \alpha AX + \beta Y kernel (for operations on physical solutions)
    axnpbySol = get_axnpby_kerns(2, range(Nq), NeNv, cfg.dtype)
    axnpbySol_Op = lambda a0, x0, a1, x1: axnpbySol.prepared_call(
                    grid_NeNv, block, x0.ptr, x1.ptr, a0, a1)

    # total flux kernel (sums up surface and volume terms)
    totalFlux = get_kernel(kernmod, "totalFlux", 'PPPP')
    totalFlux_Op = lambda d_ux, d_jL, d_jR: totalFlux.prepared_call(
            grid_Nv, block, d_ux.ptr, vm.d_cvx().ptr, d_jL.ptr, d_jR.ptr)

    # allocations on gpu
    d_usol = gpuarray.empty(NqNeNv, dtype=cfg.dtype)
    d_usolF = gpuarray.empty(NqfNeNv, dtype=cfg.dtype)
    d_uL = gpuarray.empty(NfNv, dtype=cfg.dtype) 
    d_uR = gpuarray.empty(NfNv, dtype=cfg.dtype) 
    d_jL = gpuarray.empty(NfNv, dtype=cfg.dtype) 
    d_jR = gpuarray.empty(NfNv, dtype=cfg.dtype) 
    d_bcL = gpuarray.empty(Nv, dtype=cfg.dtype) 
    d_bcR = gpuarray.empty(Nv, dtype=cfg.dtype) 
    d_bcT = gpuarray.empty(Nv, dtype=cfg.dtype) 
    d_ux = gpuarray.empty(KNeNv, dtype=cfg.dtype)
    d_f = gpuarray.empty(KNeNv, dtype=cfg.dtype)
    d_g = gpuarray.empty(KNeNv, dtype=cfg.dtype)

    d_ucoeffs = [gpuarray.empty_like(d_ux) for p in range(vm.nspcs())]
    d_ucoeffPrevs = [gpuarray.empty_like(d_ux) for p in range(vm.nspcs())]

    # check if this is a new run
    if hasattr(args, 'process_run'):
        usol = np.empty((Nq, Ne, Nv), dtype=cfg.dtype)  # temporary storage

        # load the initial condition model
        icn = cfg.lookup('soln-ics', 'type')
        initcondcls = subclass_where(DGFSInitConditionBi, model=icn)
        ic = initcondcls(cfg, vm, 'soln-ics')

        for p in range(vm.nspcs()):
            ic.apply_init_vals(p, usol, Nq, Ne, xsol)

            # transfer the information to the gpu
            d_usol.set(usol.ravel())

            # forward transform to coefficient space
            fwdTrans_Op(d_usol, d_ucoeffs[p])

    # check if we are restarting
    if hasattr(args, 'process_restart'):
        import h5py as h5py
        check(len(args.dist[0])==comm.size, "No. of distributions != nranks")
        with h5py.File(args.dist[0][rank].name, 'r') as h5f:
            for p, d_ucoeff in enumerate(d_ucoeffs): 
                dst = h5f['coeff'+str(p)]
                ti = dst.attrs['time']
                d_ucoeff.set(dst[:])
                check(dst.attrs['K']==K, "Inconsistent distribution K")
                check(dst.attrs['Ne']==Ne, "Inconsistent distribution Ne")
                check(dst.attrs['Nv']==Nv, "Inconsistent distribution N")

                # backward transform to solution space
                #bwdTrans_Op(d_ucoeff, d_usol)

    # prepare the post-processing handlers    
    # For computing moments
    moments = DGFSMomWriterBi(ti, basis.interpMat, xcoeff, d_ucoeffs, vm, cfg, 
        'dgfsmomwriter')

    # For computing residual
    residual = DGFSResidualBi(cfg, 'dgfsresidual')

    # For writing distribution function
    distribution = DGFSDistributionBi(ti, (K, Ne, Nv), cfg, 
        'dgfsdistwriter')

    # Actual algorithm
    # allocation for time integrators

    # initialize
    for p in range(vm.nspcs()):
        axnpbyCoeff_Op(0., d_ucoeffPrevs[p], 1., d_ucoeffs[p])
    sigModes = basis.sigModes

    # define the neighbours
    from mpi4py import MPI
    down_nbr, up_nbr = comm.rank - 1, comm.rank + 1
    if up_nbr >= comm.size: up_nbr = MPI.PROC_NULL
    if down_nbr < 0: down_nbr = MPI.PROC_NULL

    # define the ode rhs    
    def rhs(p, time, d_ucoeffs_in, d_ucoeff_out):
        """Evaluate the right-hand side for species ``p`` at ``time``.

        Reads the coefficients of all species from ``d_ucoeffs_in`` and
        writes the result for species ``p`` into ``d_ucoeff_out``.
        """

        # reconstruct solution at faces
        bwdTransFace_Op(d_ucoeffs_in[p], d_usolF)

        # Step:1 extract the solution at faces
        extLeft_Op(d_usolF, d_uL)
        extRight_Op(d_usolF, d_uR)

        # transfer left boundary information in send buffer
        transferBC_L_Op(d_uL, d_bcL)       # Transfer the left ghost BC info
        transferBC_R_Op(d_uR, d_bcR)       # Transfer the right ghost BC info

        # Stage the ghost values on the host: the lowercase isend/recv of
        # mpi4py serialise their payload, so host arrays (not device arrays)
        # are what gets exchanged.
        # this can be adjusted in case of RDMA enabled MPI support
        h_sendL, h_sendR = d_bcL.get(), d_bcR.get()
        #h_bcL, h_bcR = map(lambda v: v.gpudata.as_buffer(v.nbytes), 
        #               (d_bcL, d_bcR))

        # send information
        req1 = comm.isend(h_sendR, dest=up_nbr)  # to upstream neighbour
        req2 = comm.isend(h_sendL, dest=down_nbr)  # to downstream neighbour 

        # receive information (None when the neighbour is MPI.PROC_NULL)
        h_bcL = comm.recv(source=down_nbr)  # from downstream neighbour
        h_bcR = comm.recv(source=up_nbr)    # from upstream neighbour
        MPI.Request.Waitall([req1, req2])

        # set information at left boundary; compare against None because the
        # truth value of a multi-element array is ambiguous
        if h_bcL is not None:
            d_bcL.set(h_bcL)
        else:
            transferBC_L_Op(d_uL, d_bcL)  # Transfer the ghost BC info

        # set information at right boundary
        if h_bcR is not None:
            d_bcR.set(h_bcR)
        else:
            transferBC_R_Op(d_uR, d_bcR)  # Transfer the ghost BC info

        # At left boundary        
        #transferBC_L_Op(d_uL, d_bcL)       # Transfer the ghost BC info
        updateBC_L_Op[p](d_bcL, time)         # now update boundary info 
        applyBC_L_Op[p](d_bcL, d_bcT, time)   # apply boundary condition
        insertBC_L_Op(d_bcT, d_uL)         # insert info to global face-flux

        # At right boundary        
        #transferBC_R_Op(d_uR, d_bcL)       # Transfer the ghost BC info
        updateBC_R_Op[p](d_bcR, time)         # now update boundary info 
        applyBC_R_Op[p](d_bcR, d_bcT, time)   # apply boundary condition
        insertBC_R_Op(d_bcT, d_uR)         # insert info to global face-flux

        # Step:2 Compute the flux and jumps (all operations in single call)
        #fL, fR = cvx*uL, cvx*uR
        #fupw = 0.5*(fL + fR) + 0.5*np.abs(cvx)*(uL - uR)
        #jL = fupw - fL  # Compute the jump at left boundary
        #jR = fupw - fR  # Compute the jump at right boundary
        flux_Op(d_uL, d_uR, d_jL, d_jR)

        # Step:3 evaluate the derivative 
        # ux = -cvx*np.einsum("ml,em->el", Sx, ucoeff)
        deriv_Op(d_ucoeffs_in[p], d_ux)
        mulbyadv_Op(d_ux)

        # Compute the continuous flux for each element in strong form
        totalFlux_Op(d_ux, d_jL, d_jR)

        # multiply by the inverse jacobian
        mulbyinvjac_Op(d_ux)

        # Step:4 Add collision kernel contribution
        #ux += Q(\sum U^{m}_{ar} ucoeff_{aej}, \sum V^{m}_{ra} ucoeff_{aej})
        cases = [str(p)+str(q) for q in range(vm.nspcs())]
        for m in range(K):
            trans_U_Op[m](d_ucoeffs_in[p], d_f)
            for q in range(vm.nspcs()):            
                trans_V_Op[m](d_ucoeffs_in[q], d_g)
                for r, e in it.product(sigModes[m], range(Ne)):
                    sm.fs(cases[q], d_f, d_g, d_ux, e, r, m)

        #for q in range(vm.nspcs()):            
        #  for r, e in it.product(range(K), range(Ne)):
        #    sm.fs(cases[q], d_ucoeffs_in[p], d_ucoeffs_in[q], d_ux, e, r, r)

        # Step:5 Multiply by inverse mass matrix
        invMass_Op(d_ux, d_ucoeff_out)


    # define a time-integrator
    odestype = cfg.lookup('time-integrator', 'scheme')
    odescls = subclass_where(DGFSIntegratorBi, intg_kind=odestype)
    odes = odescls(rhs, (K, Ne, Nv), cfg.dtype, vm.nspcs())

    # Finally start everything
    time = ti  # initialize time in case of restart
    nacptsteps = 0 # number of elapsed steps in the current run

    # start timer
    start = timer()

    while(time < tf):

        # March in time 
        odes.integrate(time, dt, d_ucoeffs)

        # increment time
        time += dt 
        nacptsteps += 1

        # Final step: post processing routines
        residual(time, nacptsteps, d_ucoeffPrevs, d_ucoeffs)
        moments(dt, time, d_ucoeffs)
        distribution(dt, time, d_ucoeffs)

        # copy the solution for the next time step
        for p in range(vm.nspcs()):
            cuda.memcpy_dtod(d_ucoeffPrevs[p].ptr, d_ucoeffs[p].ptr, 
                d_ucoeffs[p].nbytes)

    # print elapsed time
    end = timer()
    elapsed = np.array([end - start])
    # Allreduce is a collective operation: every rank has to take part,
    # otherwise the participating rank waits forever in multi-rank runs.
    comm.Allreduce(get_mpi('in_place'), elapsed, op=get_mpi('sum'))
    if rank==root:
        avgtime = elapsed[0]/comm.size
        print("Nsteps", nacptsteps, ", elapsed time", avgtime, "s")
예제 #17
0
    def __init__(self, xsol, nl, vm, cfg, cfgsect, **kwargs):
        """Build per-species ``applyBC``/``updateBC`` launchers.

        All arguments are forwarded to the parent constructor. The
        expressions ``ux``, ``uy``, ``uz`` and ``T`` are read from
        ``cfgsect``, divided textually by the reference values
        ``self._vm.u0()`` / ``self._vm.T0()``, and substituted into the
        kernel template named by ``self.type``. The launchers are stored by
        index in ``self._applyBCKern`` and ``self._updateBCKern``, which
        must already be indexable containers when this runs.
        """
        super().__init__(xsol, nl, vm, cfg, cfgsect, **kwargs)

        # wrap an expression string as "((expr)/ref)"
        def scaled(expr, ref):
            return '((' + expr + ')/' + str(ref) + ')'

        # read every raw expression first, then scale each one
        ux = cfg.lookupexpr(cfgsect, 'ux')
        uy = cfg.lookupexpr(cfgsect, 'uy')
        uz = cfg.lookupexpr(cfgsect, 'uz')
        T = cfg.lookupexpr(cfgsect, 'T')
        ux = scaled(ux, self._vm.u0())
        uy = scaled(uy, self._vm.u0())
        uz = scaled(uz, self._vm.u0())
        T = scaled(T, self._vm.T0())

        # mass ratios
        mr = vm.masses()

        # per-species device storage
        self._bc_vals_num = [
            gpuarray.empty(self._vm.vsize(), cfg.dtype)
            for _ in range(vm.nspcs())
        ]
        self._bc_vals_den = [
            gpuarray.empty(self._vm.vsize(), cfg.dtype)
            for _ in range(vm.nspcs())
        ]
        self._wall_nden = [
            gpuarray.empty(1, dtype=cfg.dtype) for _ in range(vm.nspcs())
        ]
        self._mr = [
            gpuarray.to_gpu(np.array(mr[idx], dtype=cfg.dtype))
            for idx in range(vm.nspcs())
        ]

        # substitutions for the kernel template
        tmpl_args = {
            'dtype': cfg.dtypename,
            'vsize': self._vm.vsize(),
            'cw': self._vm.cw(),
            'nl': nl,
            'x': xsol,
            'ux': ux,
            'uy': uy,
            'uz': uz,
            'T': T,
        }
        lookup = DottedTemplateLookup('dgfs1D.bi.kernels.bcs', tmpl_args)
        module = compiler.SourceModule(
            lookup.get_template(self.type).render())

        # launch configuration sized by the velocity-mesh size
        threads = (128, 1, 1)
        blocks = get_grid_for_block(threads, self._vm.vsize())

        # applyBC: seven pointers followed by one scalar of cfg.dtype
        apply_fn = get_kernel(module, "applyBC",
                              [np.intp] * 7 + [cfg.dtype])

        # factory so each launcher keeps its own species index
        def bind_apply(idx):
            def apply_bc(ul, ur, t):
                apply_fn.prepared_call(blocks, threads, ul.ptr, ur.ptr,
                                       self._vm.d_cvx().ptr,
                                       self._vm.d_cvy().ptr,
                                       self._vm.d_cvz().ptr,
                                       self._mr[idx].ptr,
                                       self._wall_nden[idx].ptr, t)

            return apply_bc

        for idx in range(vm.nspcs()):
            self._applyBCKern[idx] = bind_apply(idx)

        # updateBC: same argument signature as applyBC
        update_fn = get_kernel(module, "updateBC",
                               [np.intp] * 7 + [cfg.dtype])

        def bind_update(idx):
            def update_bc(ul, t):
                update_fn.prepared_call(blocks, threads, ul.ptr,
                                        self._vm.d_cvx().ptr,
                                        self._vm.d_cvy().ptr,
                                        self._vm.d_cvz().ptr,
                                        self._mr[idx].ptr,
                                        self._bc_vals_num[idx].ptr,
                                        self._bc_vals_den[idx].ptr, t)
                # wall value = -(sum of numerators / sum of denominators)
                num_total = gpuarray.sum(self._bc_vals_num[idx])
                den_total = gpuarray.sum(self._bc_vals_den[idx])
                self._wall_nden[idx] = -(num_total / den_total)

            return update_bc

        for idx in range(vm.nspcs()):
            self._updateBCKern[idx] = bind_update(idx)
예제 #18
0
파일: bc.py 프로젝트: jaisw7/dgfs1D_gpu
    def __init__(self, xsol, nl, vm, cfg, cfgsect, **kwargs):
        """Build the ``applyBC``/``updateBC`` launchers from config
        expressions.

        All arguments are forwarded to the parent constructor. The
        expressions ``ux``, ``uy``, ``uz`` and ``T`` are read from
        ``cfgsect``, divided textually by the reference values
        ``self._vm.u0()`` / ``self._vm.T0()``, and substituted into the
        kernel template named by ``self.type``. ``self._wall_nden`` is only
        assigned by the update launcher, so that launcher has to run before
        the apply launcher is first used.
        """
        super().__init__(xsol, nl, vm, cfg, cfgsect, **kwargs)

        # wrap an expression string as "((expr)/ref)"
        def scaled(expr, ref):
            return '((' + expr + ')/' + str(ref) + ')'

        # read every raw expression first, then scale each one
        ux = cfg.lookupexpr(cfgsect, 'ux')
        uy = cfg.lookupexpr(cfgsect, 'uy')
        uz = cfg.lookupexpr(cfgsect, 'uz')
        T = cfg.lookupexpr(cfgsect, 'T')
        ux = scaled(ux, self._vm.u0())
        uy = scaled(uy, self._vm.u0())
        uz = scaled(uz, self._vm.u0())
        T = scaled(T, self._vm.T0())

        # per-velocity numerator/denominator buffers filled by updateBC
        self._bc_vals_num = gpuarray.empty(self._vm.vsize(), cfg.dtype)
        self._bc_vals_den = gpuarray.empty_like(self._bc_vals_num)

        # substitutions for the kernel template
        tmpl_args = {
            'dtype': cfg.dtypename,
            'vsize': self._vm.vsize(),
            'cw': self._vm.cw(),
            'nl': nl,
            'x': xsol,
            'ux': ux,
            'uy': uy,
            'uz': uz,
            'T': T,
        }
        lookup = DottedTemplateLookup('dgfs1D.std.kernels.bcs', tmpl_args)
        module = compiler.SourceModule(
            lookup.get_template(self.type).render())

        # launch configuration sized by the velocity-mesh size
        threads = (128, 1, 1)
        blocks = get_grid_for_block(threads, self._vm.vsize())

        # applyBC: six pointers followed by one scalar of cfg.dtype
        apply_fn = get_kernel(module, "applyBC",
                              [np.intp] * 6 + [cfg.dtype])

        def apply_bc(ul, ur, t):
            return apply_fn.prepared_call(
                blocks, threads, ul.ptr, ur.ptr,
                self._vm.d_cvx().ptr,
                self._vm.d_cvy().ptr,
                self._vm.d_cvz().ptr, self._wall_nden.ptr, t)

        self._applyBCKern = apply_bc

        # updateBC: same argument signature as applyBC
        update_fn = get_kernel(module, "updateBC",
                               [np.intp] * 6 + [cfg.dtype])

        def update_bc(ul, t):
            update_fn.prepared_call(blocks, threads, ul.ptr,
                                    self._vm.d_cvx().ptr,
                                    self._vm.d_cvy().ptr,
                                    self._vm.d_cvz().ptr,
                                    self._bc_vals_num.ptr,
                                    self._bc_vals_den.ptr, t)
            # wall value = -(sum of numerators / sum of denominators)
            num_total = gpuarray.sum(self._bc_vals_num)
            den_total = gpuarray.sum(self._bc_vals_den)
            self._wall_nden = -(num_total / den_total)

        self._updateBCKern = update_bc
예제 #19
0
파일: astd.py 프로젝트: jaisw7/dgfs1D_gpu
def main(Ne=None, dt=None):
    """Set up and run the 1D solver on the GPU, marching from ti to tf.

    The routine reads the configuration, builds the spatial mesh, basis,
    velocity mesh and scattering model, compiles the CUDA kernels, prepares
    the boundary-condition and flux operators, and then advances the
    coefficient array in time with the configured integrator while invoking
    the residual / moment / distribution post-processing handlers each step.

    Args:
        Ne: Optional override; when not None it is written (as an int
            string) to the 'Ne' option of the 'mesh' config section before
            the mesh is constructed.
        dt: Optional override; when not None it is written (as a float
            string) to the 'dt' option of the 'time-integrator' section.
            The step actually used is re-derived below so that an integer
            number of steps spans [tstart, tend].

    Returns:
        Tuple ``(d_ucoeff, mesh, vm, basis)``: the device array holding the
        final coefficients, the mesh, the velocity mesh and the basis.
    """
    # who am I in this world? (Bulleh Shah, 18th century sufi poet)
    comm, rank, root = get_comm_rank_root()

    # read the inputs (from people)
    cfg, args = initialize()
    if Ne is not None:
        cfg._cp.set('mesh', 'Ne', str(int(Ne)))
    if dt is not None:
        cfg._cp.set('time-integrator', 'dt', str(float(dt)))
    mesh = Mesh(cfg)

    # define 1D mesh (construct a 1D world view)
    xmesh = mesh.xmesh

    # number of elements (how refined perspectives do we want/have?)
    Ne = mesh.Ne

    # define the basis (what is the basis for those perspectives?)
    bsKind = cfg.lookup('basis', 'kind')
    #assert bsKind == 'nodal-sem-gll', "Only one supported as of now"
    basiscls = subclass_where(Basis, basis_kind=bsKind)
    basis = basiscls(cfg)

    # number of local degrees of freedom (depth/granularity of perspectives)
    K = basis.K

    # number of solution points (how far can I interpolate my learning)
    Nq = basis.Nq

    # left/right face maps
    Nqf = basis.Nqf  # number of points used in reconstruction at faces
    mapL, mapR = np.arange(Ne + 1) + (Nqf - 1) * Ne - 1, np.arange(Ne + 1)
    mapL[0], mapR[-1] = 0, Ne * Nqf - 1
    Nf = len(mapL)

    # the zeros
    z = basis.z

    # jacobian of the mapping from D^{st}=[-1,1] to D
    jac, invjac = mesh.jac, mesh.invjac

    # load the velocity mesh
    vm = DGFSVelocityMeshStd(cfg)
    Nv = vm.vsize()

    # load the scattering model
    smn = cfg.lookup('scattering-model', 'type')
    scatteringcls = subclass_where(DGFSScatteringModelAstd,
                                   scattering_model=smn)
    sm = scatteringcls(cfg, vm, Ne=Ne)

    # initial time, time step, final time
    # (dt is adjusted so that a whole number of steps covers [ti, tf])
    ti, dt, tf = cfg.lookupfloats('time-integrator', ('tstart', 'dt', 'tend'))
    nsteps = np.ceil((tf - ti) / dt)
    dt = (tf - ti) / nsteps

    # Compute the location of the solution points
    xsol = np.array(
        [0.5 * (xmesh[j] + xmesh[j + 1]) + jac[j] * z for j in range(Ne)]).T
    xcoeff = np.einsum("kq,qe->ke", basis.fwdTransMat, xsol)

    # Determine the grid/block
    NeNv = Ne * Nv
    KNeNv = K * Ne * Nv
    NqNeNv = Nq * Ne * Nv
    NqfNeNv = Nqf * Ne * Nv
    NfNv = Nf * Nv
    block = (128, 1, 1)
    grid_Nv = get_grid_for_block(block, Nv)
    grid_NeNv = get_grid_for_block(block, Ne * Nv)
    grid_KNeNv = get_grid_for_block(block, K * Ne * Nv)

    # operator generator for matrix operations
    matOpGen = lambda v: lambda arg0, arg1: v.prepared_call(
        grid_NeNv, block, NeNv, arg0.ptr, NeNv, arg1.ptr, NeNv)

    # forward trans, backward, backward (at faces), derivative kernels
    fwdTrans_Op, bwdTrans_Op, bwdTransFace_Op, deriv_Op, invMass_Op, \
        computeCellAvg_Op, extractDrLin_Op = map(
        matOpGen, (basis.fwdTransOp, basis.bwdTransOp,
            basis.bwdTransFaceOp, basis.derivOp, basis.invMassOp,
            basis.computeCellAvgKern, basis.extractDrLinKern)
    )

    # U, V operator kernels
    trans_U_Op = tuple(map(matOpGen, basis.uTransOps))
    trans_V_Op = tuple(map(matOpGen, basis.vTransOps))

    # prepare the kernel for extracting face/interface values
    dfltargs = dict(K=K,
                    Ne=Ne,
                    Nq=Nq,
                    vsize=Nv,
                    dtype=cfg.dtypename,
                    mapL=mapL,
                    mapR=mapR,
                    offsetL=0,
                    offsetR=len(mapR) - 1,
                    invjac=invjac,
                    gRD=basis.gRD,
                    gLD=basis.gLD,
                    xsol=xsol)
    kernsrc = DottedTemplateLookup('dgfs1D.std.kernels',
                                   dfltargs).get_template('std').render()
    kernmod = compiler.SourceModule(kernsrc)

    dfltargs.update(nalph=sm.nalph, Dr=basis.derivMat)
    kernlimssrc = DottedTemplateLookup(
        'dgfs1D.astd.kernels', dfltargs).get_template('limiters').render()
    kernlimsmod = compiler.SourceModule(kernlimssrc)

    # prepare operators for execution (see std.mako for description)
    # The kernel handle is fetched once here instead of on every call, in
    # line with how the flux/advection kernels below are set up.
    def faceOpGen(name):
        kern = get_kernel(kernmod, name, 'PP')
        return lambda *ops: kern.prepared_call(
            grid_Nv, block, *[c.ptr for c in ops])

    (extLeft_Op, extRight_Op, transferBC_L_Op, transferBC_R_Op, insertBC_L_Op,
     insertBC_R_Op) = map(
         faceOpGen,
         ("extract_left", "extract_right", "transfer_bc_left",
          "transfer_bc_right", "insert_bc_left", "insert_bc_right"))

    # The boundary conditions (by default all boundaries are processor bnds)
    bcl_type, bcr_type = 'dgfs-periodic', 'dgfs-periodic'

    # the mesh is decomposed in linear fashion, so rank 0 gets left boundary
    if rank == 0:
        bcl_type = cfg.lookup('soln-bcs-xlo', 'type')

    # and the last rank comm.size-1 gets the right boundary
    if rank == comm.size - 1:
        bcr_type = cfg.lookup('soln-bcs-xhi', 'type')

    # prepare kernels for left boundary
    bcl_cls = subclass_where(DGFSBCStd, type=bcl_type)
    bcl = bcl_cls(xmesh[0], -1., vm, cfg, 'soln-bcs-xlo')
    updateBC_L_Op = bcl.updateBCKern
    applyBC_L_Op = bcl.applyBCKern

    # prepare kernels for right boundary
    bcr_cls = subclass_where(DGFSBCStd, type=bcr_type)
    bcr = bcr_cls(xmesh[-1], 1., vm, cfg, 'soln-bcs-xhi')
    updateBC_R_Op = bcr.updateBCKern
    applyBC_R_Op = bcr.applyBCKern

    #if bcl_type == 'dgfs-cyclic' or bcr_type == 'dgfs-cyclic':
    #    assert(bcl_type==bcr_type);

    # flux kernel
    flux = get_kernel(kernmod, "flux", 'PPPPP')
    flux_Op = lambda d_uL, d_uR, d_jL, d_jR: flux.prepared_call(
        grid_Nv, block, d_uL.ptr, d_uR.ptr,
        vm.d_cvx().ptr, d_jL.ptr, d_jR.ptr)

    # multiply the derivative by the advection velocity
    mulbyadv = get_kernel(kernmod, "mul_by_adv", 'PP')
    mulbyadv_Op = lambda d_ux: mulbyadv.prepared_call(grid_KNeNv, block,
                                                      vm.d_cvx().ptr, d_ux.ptr)

    # multiply the coefficient by the inverse jacobian
    mulbyinvjac = get_kernel(kernmod, "mul_by_invjac", 'P')
    mulbyinvjac_Op = lambda d_ux: mulbyinvjac.prepared_call(
        grid_Nv, block, d_ux.ptr)

    # \alpha AX + \beta Y kernel (for operations on coefficients)
    axnpbyCoeff = get_axnpby_kerns(2, range(K), NeNv, cfg.dtype)
    axnpbyCoeff_Op = lambda a0, x0, a1, x1: axnpbyCoeff.prepared_call(
        grid_NeNv, block, x0.ptr, x1.ptr, a0, a1)

    # total flux kernel (sums up surface and volume terms)
    totalFlux = get_kernel(kernmod, "totalFlux", 'PPPP')
    totalFlux_Op = lambda d_ux, d_jL, d_jR: totalFlux.prepared_call(
        grid_Nv, block, d_ux.ptr,
        vm.d_cvx().ptr, d_jL.ptr, d_jR.ptr)

    # linear limiter
    limitLin = get_kernel(kernlimsmod, "limitLin", 'PPPP')
    limitLin_Op = lambda d_u, d_ulx, d_uavg, d_ulim: \
        limitLin.prepared_call(grid_Nv, block, d_u.ptr, d_ulx.ptr,
            d_uavg.ptr, d_ulim.ptr)

    # allocations on gpu
    d_usol = gpuarray.empty(NqNeNv, dtype=cfg.dtype)
    d_usolF = gpuarray.empty(NqfNeNv, dtype=cfg.dtype)
    d_uL = gpuarray.empty(NfNv, dtype=cfg.dtype)
    d_uR = gpuarray.empty(NfNv, dtype=cfg.dtype)
    d_jL = gpuarray.empty(NfNv, dtype=cfg.dtype)
    d_jR = gpuarray.empty(NfNv, dtype=cfg.dtype)
    d_bcL = gpuarray.empty(Nv, dtype=cfg.dtype)
    d_bcR = gpuarray.empty(Nv, dtype=cfg.dtype)
    d_bcT = gpuarray.empty(Nv, dtype=cfg.dtype)
    d_ux = gpuarray.empty(KNeNv, dtype=cfg.dtype)
    d_f = gpuarray.empty(KNeNv, dtype=cfg.dtype)
    d_g = gpuarray.empty(KNeNv, dtype=cfg.dtype)

    d_ucoeff = gpuarray.empty(KNeNv, dtype=cfg.dtype)
    d_ucoeffPrev = gpuarray.empty_like(d_ucoeff)

    # check if this is a new run
    if hasattr(args, 'process_run'):
        usol = np.empty((Nq, Ne, Nv), dtype=cfg.dtype)  # temporary storage

        # load the initial condition model
        icn = cfg.lookup('soln-ics', 'type')
        initcondcls = subclass_where(DGFSInitConditionStd, model=icn)
        ic = initcondcls(cfg, vm, 'soln-ics')
        ic.apply_init_vals(usol, Nq, Ne, xsol, mesh=mesh, basis=basis, sm=sm)

        # transfer the information to the gpu
        d_usol.set(usol.ravel())

        # forward transform to coefficient space
        fwdTrans_Op(d_usol, d_ucoeff)

    # check if we are restarting
    if hasattr(args, 'process_restart'):
        import h5py as h5py
        check(len(args.dist[0]) == comm.size, "No. of distributions != nranks")
        with h5py.File(args.dist[0][rank].name, 'r') as h5f:
            dst = h5f['coeff']
            # the stored time replaces the configured start time
            ti = dst.attrs['time']
            d_ucoeff.set(dst[:])
            check(dst.attrs['K'] == K, "Inconsistent distribution K")
            check(dst.attrs['Ne'] == Ne, "Inconsistent distribution Ne")
            check(dst.attrs['Nv'] == Nv, "Inconsistent distribution N")

        # backward transform to solution space
        bwdTrans_Op(d_ucoeff, d_usol)

    # prepare the post-processing handlers
    # For computing moments
    moments = DGFSMomWriterStd(ti, basis.interpMat, xcoeff, d_ucoeff, vm, cfg,
                               'dgfsmomwriter')

    # For computing residual
    residual = DGFSResidualStd(cfg, 'dgfsresidual')

    # For writing distribution function
    distribution = DGFSDistributionStd(ti, (K, Ne, Nv), cfg, 'dgfsdistwriter')

    # Actual algorithm

    # initialize
    axnpbyCoeff_Op(0., d_ucoeffPrev, 1., d_ucoeff)
    sigModes = basis.sigModes

    # define the neighbours
    from mpi4py import MPI
    down_nbr, up_nbr = comm.rank - 1, comm.rank + 1
    if up_nbr >= comm.size:
        up_nbr = MPI.PROC_NULL
    if down_nbr < 0:
        down_nbr = MPI.PROC_NULL

    # define the explicit part
    def explicit(time, d_ucoeff_in, d_ucoeff_out):
        """Evaluate the explicit (advection) operator on d_ucoeff_in and
        store the result in d_ucoeff_out."""

        # reconstruct solution at faces
        bwdTransFace_Op(d_ucoeff_in, d_usolF)

        # Step:1 extract the solution at faces
        extLeft_Op(d_usolF, d_uL)
        extRight_Op(d_usolF, d_uR)

        # transfer left boundary information in send buffer
        transferBC_L_Op(d_uL, d_bcL)  # Transfer the left ghost BC info
        transferBC_R_Op(d_uR, d_bcR)  # Transfer the right ghost BC info

        # this can be adjusted in case of RDMA enabled MPI support
        #h_bcL, h_bcR = d_bcL.get(), d_bcR.get()
        #h_bcL, h_bcR = map(lambda v: v.gpudata.as_buffer(v.nbytes),
        #               (d_bcL, d_bcR))

        # send information
        # NOTE(review): the device arrays are handed directly to the
        # pickle-based isend; whether that round-trips across real ranks
        # has not been verified here (see the commented-out host staging
        # above) -- confirm before relying on multi-rank runs.
        req1 = comm.isend(d_bcR, dest=up_nbr)  # to upstream neighbour
        req2 = comm.isend(d_bcL, dest=down_nbr)  # to downstream neighbour

        # receive information
        h_bcL = comm.recv(source=down_nbr)  # from downstream neighbour
        h_bcR = comm.recv(source=up_nbr)  # from upstream neighbour
        MPI.Request.Waitall([req1, req2])

        # set information at left, right boundary
        # A missing neighbour yields None; compare against None explicitly
        # because the truth value of a multi-element array is ambiguous.
        if h_bcL is not None:
            d_bcL.set(h_bcL)
        else:
            transferBC_L_Op(d_uL, d_bcL)

        if h_bcR is not None:
            d_bcR.set(h_bcR)
        else:
            transferBC_R_Op(d_uR, d_bcR)

        # The physical-periodic boundary condition
        if comm.size == 1 and bcr_type == 'dgfs-cyclic':
            # single rank: swap left/right buffers through the scratch d_bcT
            copy(d_bcT, d_bcL)
            copy(d_bcL, d_bcR)
            copy(d_bcR, d_bcT)
        else:
            # At left, receive from right-most communicator; and vice-versa
            req1 = req2 = MPI.REQUEST_NULL
            if bcl_type == 'dgfs-cyclic':
                req1 = comm.isend(d_bcL, dest=comm.size - 1)
            if bcr_type == 'dgfs-cyclic':
                req2 = comm.isend(d_bcR, dest=0)
            if bcr_type == 'dgfs-cyclic':
                h_bcR = comm.recv(source=0)
            if bcl_type == 'dgfs-cyclic':
                h_bcL = comm.recv(source=comm.size - 1)
            MPI.Request.Waitall([req1, req2])
            if bcl_type == 'dgfs-cyclic':
                d_bcL.set(h_bcL)
            elif bcr_type == 'dgfs-cyclic':
                d_bcR.set(h_bcR)

        # At left boundary
        #transferBC_L_Op(d_uL, d_bcL)       # Transfer the ghost BC info
        updateBC_L_Op(d_bcL, time)  # now update boundary info
        applyBC_L_Op(d_bcL, d_bcT, time)  # apply boundary condition
        insertBC_L_Op(d_bcT, d_uL)  # insert info to global face-flux

        # At right boundary
        #transferBC_R_Op(d_uR, d_bcL)       # Transfer the ghost BC info
        updateBC_R_Op(d_bcR, time)  # now update boundary info
        applyBC_R_Op(d_bcR, d_bcT, time)  # apply boundary condition
        insertBC_R_Op(d_bcT, d_uR)  # insert info to global face-flux

        # Step:2 Compute the flux and jumps (all operations in single call)
        #fL, fR = cvx*uL, cvx*uR
        #fupw = 0.5*(fL + fR) + 0.5*np.abs(cvx)*(uL - uR)
        #jL = fupw - fL  # Compute the jump at left boundary
        #jR = fupw - fR  # Compute the jump at right boundary
        flux_Op(d_uL, d_uR, d_jL, d_jR)

        # Step:3 evaluate the derivative
        # ux = -cvx*np.einsum("ml,em->el", Sx, ucoeff)
        deriv_Op(d_ucoeff_in, d_ux)
        mulbyadv_Op(d_ux)

        # Compute the continuous flux for each element in strong form
        totalFlux_Op(d_ux, d_jL, d_jR)

        # multiply by the inverse jacobian
        # Now we have f* = d_ux
        mulbyinvjac_Op(d_ux)

        # project back to coefficient space
        invMass_Op(d_ux, d_ucoeff_out)

    d_uavg, d_ulx = map(gpuarray.empty_like, [d_ucoeff] * 2)

    def limit(d_ucoeff_in, d_ucoeff_out):
        """Apply the linear limiter to d_ucoeff_in, writing d_ucoeff_out."""
        assert comm.size == 1, "Not implemented"
        #assert basis.basis_kind == 'nodal-sem-gll', "Not implemented"

        # Extract the cell average
        computeCellAvg_Op(d_ucoeff_in, d_uavg)

        # extract gradient of the linear polynomial
        extractDrLin_Op(d_ucoeff_in, d_ulx)
        mulbyinvjac_Op(d_ulx)

        # limit functions in all cells
        limitLin_Op(d_ucoeff_in, d_ulx, d_uavg, d_ucoeff_out)

    # define a time-integrator (we use Euler scheme: good enough for steady)
    odestype = cfg.lookup('time-integrator', 'scheme')
    odescls = subclass_where(DGFSIntegratorAstd, intg_kind=odestype)
    limitOn = cfg.lookupordefault('time-integrator', 'limiter', 0)

    # Finally start everything
    time = ti  # initialize time in case of restart
    nacptsteps = 0  # number of elapsed steps in the current run

    # initialize ode: this performs pre-integration for multi-step schemes
    odes = odescls(explicit,
                   sm, (K, Ne, Nv),
                   cfg.dtype,
                   t=time,
                   dt=dt,
                   f0=d_ucoeff)

    # start timer
    start = timer()
    while (time < tf):

        # March in time
        odes.integrate(time, dt, nacptsteps, d_ucoeff)
        if limitOn:
            limit(d_ucoeff, d_ucoeff)

        # increment time
        time += dt
        nacptsteps += 1

        # Final step: post processing routines
        residual(time, nacptsteps, d_ucoeff, d_ucoeffPrev)
        moments(dt, time, d_ucoeff)
        distribution(dt, time, d_ucoeff)

        # copy the solution for the next time step
        cuda.memcpy_dtod(d_ucoeffPrev.ptr, d_ucoeff.ptr, d_ucoeff.nbytes)

    # print elapsed time (summed over ranks onto root, then averaged)
    end = timer()
    elapsed = np.array([end - start])
    if rank != root:
        comm.Reduce(elapsed, None, op=get_mpi('sum'), root=root)
    else:
        comm.Reduce(get_mpi('in_place'), elapsed, op=get_mpi('sum'), root=root)
        avgtime = elapsed[0] / comm.size
        print("Nsteps", nacptsteps, ", elapsed time", avgtime, "s")

    return d_ucoeff, mesh, vm, basis
예제 #20
0
    def perform_precomputation(self):
        """Precompute the device-side tables and kernels for the scattering
        model.

        Steps carried out below:
          * build the integer index arrays ``l`` (one row per velocity
            direction) and upload them, together with the three columns of
            ``sz``, to the GPU;
          * create the cuFFT plans (a batched Z2Z plan with ``M * Nrho``
            batches and a single 3D Z2Z plan);
          * render and compile the scattering kernel template;
          * launch ``precompute_a`` and, for every case in ``self._cases``
            and every ``(p, q)`` pair, ``precompute_bc_<case>``, passing the
            ``d_aa`` / ``d_bb1`` / ``d_bb2`` buffers (presumably the kernels
            fill them -- verify against the kernel template);
          * prepare the remaining kernels and allocate scratch arrays.

        The results are stored as attributes on ``self``.
        """
        # Precompute aa, bb1, bb2 (required for kernel)
        # compute l
        Nv = self.vm.Nv()
        Nrho = self.vm.Nrho()
        M = self.vm.M()
        L = self.vm.L()
        qz = self.vm.qz()
        qw = self.vm.qw()
        sz = self.vm.sz()
        sw = self.vm.sw()
        vsize = self.vm.vsize()
        szpre = self._szpre
        swpre = self._swpre

        check(self.cfg.dtype == np.float64,
              "Need to extend for single precision")

        # precision control
        dint = np.int32
        dfloat = np.float64
        dcplx = np.complex128

        # NOTE(review): the check above requires float64, so this branch
        # looks unreachable if `check` raises on failure -- confirm.
        if self.cfg.dtype == np.float32:
            dfloat = np.float32
            dcplx = np.complex64

        # 1D index sequence: 0 .. Nv/2-1 followed by -Nv/2 .. -1
        l0 = np.concatenate(
            (np.arange(0, Nv / 2, dtype=dint), np.arange(-Nv / 2,
                                                         0,
                                                         dtype=dint)))
        # expand to the flattened Nv*Nv*Nv layout: the flat index idv is
        # decomposed into (I, J, K) with K varying fastest
        l = np.zeros((3, vsize), dtype=dint)
        for idv in range(vsize):
            I = int(idv / (Nv * Nv))
            J = int((idv % (Nv * Nv)) / Nv)
            K = int((idv % (Nv * Nv)) % Nv)
            l[0, idv] = l0[I]
            l[1, idv] = l0[J]
            l[2, idv] = l0[K]
        d_lx = gpuarray.to_gpu(np.ascontiguousarray(l[0, :]))
        d_ly = gpuarray.to_gpu(np.ascontiguousarray(l[1, :]))
        d_lz = gpuarray.to_gpu(np.ascontiguousarray(l[2, :]))

        # transfer sphere points to gpu
        d_sz_x = gpuarray.to_gpu(np.ascontiguousarray(sz[:, 0]))
        d_sz_y = gpuarray.to_gpu(np.ascontiguousarray(sz[:, 1]))
        d_sz_z = gpuarray.to_gpu(np.ascontiguousarray(sz[:, 2]))

        # define complex to complex plan
        rank = 3
        n = np.array([Nv, Nv, Nv], dtype=np.int32)

        #planD2Z = cufftPlan3d(Nv, Nv, Nv, CUFFT_D2Z)
        # batched plan: M * Nrho transforms of size Nv^3, spaced vsize apart
        self.planZ2Z_MNrho = cufftPlanMany(rank, n.ctypes.data, None, 1, vsize,
                                           None, 1, vsize, CUFFT_Z2Z, M * Nrho)
        self.planZ2Z = cufftPlan3d(Nv, Nv, Nv, CUFFT_Z2Z)

        # values substituted into the kernel template
        dfltargs = dict(
            dtype=self.cfg.dtypename,
            Nrho=Nrho,
            M=M,
            vsize=vsize,
            sw=sw,
            prefac=self._prefactor,
            cases=self._cases,
            masses=self.vm.masses(),
            qw=qw,
            qz=qz,
            L=L,
            sz=sz,
            gamma=self._gamma,
            eta=self._eta,
            Mpre=self._Mpre,
            szpre=szpre,
            swpre=swpre  #, Ne=self._Ne
        )
        src = DottedTemplateLookup('dgfs1D.bi.kernels.scattering',
                                   dfltargs).get_template(
                                       self.scattering_model).render()

        # Compile the source code and retrieve the kernel
        print("\nCompiling scattering kernels, this may take some time ...")
        module = compiler.SourceModule(src)

        self.block = (256, 1, 1)
        self.grid = get_grid_for_block(self.block, vsize)

        # time the precomputation with CUDA events
        print("Starting precomputation, this may take some time ...")
        start, end = cuda.Event(), cuda.Event()
        cuda.Context.synchronize()
        start.record()
        start.synchronize()

        self.d_aa = gpuarray.empty(Nrho * M * vsize, dtype=dfloat)
        precompute_aa = get_kernel(module, "precompute_a", 'PPPP')
        precompute_aa.prepared_call(self.grid, self.block, d_lx.ptr, d_ly.ptr,
                                    d_lz.ptr, self.d_aa.ptr)

        # per-case buffers and kernels, keyed by the string str(cp)+str(cq)
        self.d_bb1 = {}
        self.d_bb2 = {}
        precompute_bb = {}
        for cp, cq in self._cases:
            cpcq = str(cp) + str(cq)
            self.d_bb1[cpcq] = gpuarray.empty(Nrho * M * vsize, dtype=dcplx)
            # d_bb2 starts from zeros (unlike d_bb1, which is uninitialised)
            self.d_bb2[cpcq] = gpuarray.zeros(vsize, dtype=dcplx)
            precompute_bb[cpcq] = module.get_function("precompute_bc_" + cpcq)
            precompute_bb[cpcq].prepare('IIdddPPPPPPPP')
            precompute_bb[cpcq].set_cache_config(cuda.func_cache.PREFER_L1)

            # one launch per (p, q) pair; the scalar factors depend on p only
            for p in range(Nrho):
                fac = np.pi / L * qz[p]
                fac_b = swpre * pow(qz[p], self._gamma[cpcq] + 2)
                fac_c = qw[p] * sw * fac_b
                for q in range(M):
                    precompute_bb[cpcq].prepared_call(
                        self.grid, self.block, dint(p), dint(q), dfloat(fac),
                        dfloat(fac_b), dfloat(fac_c), d_lx.ptr, d_ly.ptr,
                        d_lz.ptr, d_sz_x.ptr, d_sz_y.ptr, d_sz_z.ptr,
                        self.d_bb1[cpcq].ptr, self.d_bb2[cpcq].ptr)

        end.record()
        end.synchronize()
        # the event time difference is scaled by 1e-3 and reported as seconds
        secs = start.time_till(end) * 1e-3
        print("Finished precomputation in: %fs" % (secs))

        # transform scalar to complex
        self.r2zKern = module.get_function("r2z")
        self.r2zKern.prepare('IIIPP')
        self.r2zKern.set_cache_config(cuda.func_cache.PREFER_L1)

        # Prepare the cosSinMul kernel for execution
        self.cosSinMultKern = {}
        #self.computeQGKern = {}
        self.outKern = {}
        for cp, cq in self._cases:
            idx = str(cp) + str(cq)
            self.cosSinMultKern[idx] = module.get_function("cosSinMul_" + idx)
            self.cosSinMultKern[idx].prepare('PPPPP')
            self.cosSinMultKern[idx].set_cache_config(
                cuda.func_cache.PREFER_L1)

            #self.computeQGKern[idx] = module.get_function("computeQG_"+idx)
            #self.computeQGKern[idx].prepare('PPP')
            #self.computeQGKern[idx].set_cache_config(
            #    cuda.func_cache.PREFER_L1)

            self.outKern[idx] = module.get_function("output_" + idx)
            self.outKern[idx].prepare('IIIIPPPP')
            self.outKern[idx].set_cache_config(cuda.func_cache.PREFER_L1)

        # prepare the computeQG kernel
        self.computeQGKern = module.get_function("computeQG")
        self.computeQGKern.prepare('PPP')
        self.computeQGKern.set_cache_config(cuda.func_cache.PREFER_L1)

        # Prepare the prodKern kernel for execution
        self.prodKern = module.get_function("prod")
        self.prodKern.prepare('PPP')
        self.prodKern.set_cache_config(cuda.func_cache.PREFER_L1)

        # Prepare the ax kernel for execution
        self.ax2Kern = module.get_function("ax2")
        self.ax2Kern.prepare('PPP')
        self.ax2Kern.set_cache_config(cuda.func_cache.PREFER_L1)

        # define scratch spaces: vsize-long complex work arrays, plus
        # M * Nrho * vsize-long complex arrays d_t1..d_t3
        self.d_FTf = gpuarray.empty(vsize, dtype=dcplx)
        self.d_FTg = gpuarray.empty(vsize, dtype=dcplx)
        self.d_f1C = gpuarray.empty_like(self.d_FTf)
        self.d_f2C = gpuarray.empty_like(self.d_FTf)
        self.d_QG = gpuarray.empty_like(self.d_FTf)
        self.d_t1 = gpuarray.empty(M * Nrho * vsize, dtype=dcplx)
        self.d_t2 = gpuarray.empty_like(self.d_t1)
        self.d_t3 = gpuarray.empty_like(self.d_t1)