示例#1
0
def flux_header_struct(float_type, dims):
    from cgen import GenerableStruct

    return GenerableStruct("flux_header", [
        POD(numpy.uint16, "same_facepairs_end"),
        POD(numpy.uint16, "diff_facepairs_end"),
        POD(numpy.uint16, "bdry_facepairs_end"),
        ], align_bytes=face_pair_struct(float_type, dims).alignment_requirement())
示例#2
0
def flux_header_struct(float_type, dims):
    from cgen import GenerableStruct

    return GenerableStruct("flux_header", [
        POD(numpy.uint16, "same_facepairs_end"),
        POD(numpy.uint16, "diff_facepairs_end"),
        POD(numpy.uint16, "bdry_facepairs_end"),
    ],
                           align_bytes=face_pair_struct(
                               float_type, dims).alignment_requirement())
示例#3
0
    def shared_mem_use(self):
        from hedge.backends.cuda.fluxgather import face_pair_struct
        d = self.given.ldis.dimensions

        if self.dofs_per_face > 255:
            index_lists_entry_size = 2
        else:
            index_lists_entry_size = 1

        result = (128 # parameters, block header, small extra stuff
                + len(face_pair_struct(self.given.float_type, d))
                * self.face_pair_count())

        if not self.direct_store:
            result += (self.aligned_face_dofs_per_microblock()
                * self.flux_count
                * self.microblocks_per_block()
                * self.given.float_size())

        return result
示例#4
0
    def shared_mem_use(self):
        from hedge.backends.cuda.fluxgather import face_pair_struct
        d = self.given.ldis.dimensions

        if self.dofs_per_face > 255:
            index_lists_entry_size = 2
        else:
            index_lists_entry_size = 1

        result = (
            128  # parameters, block header, small extra stuff
            + len(face_pair_struct(self.given.float_type, d)) *
            self.face_pair_count())

        if not self.direct_store:
            result += (self.aligned_face_dofs_per_microblock() *
                       self.flux_count * self.microblocks_per_block() *
                       self.given.float_size())

        return result
示例#5
0
    def get_kernel(self, fdata, ilist_data, for_benchmark):
        from cgen.cuda import CudaShared, CudaGlobal
        from pycuda.tools import dtype_to_ctype

        discr = self.discr
        given = self.plan.given
        fplan = self.plan
        d = discr.dimensions
        dims = range(d)

        elgroup, = discr.element_groups

        float_type = given.float_type

        f_decl = CudaGlobal(FunctionDeclaration(Value("void", "apply_flux"),
            [
                Pointer(POD(float_type, "debugbuf")),
                Pointer(POD(numpy.uint8, "gmem_facedata")),
                ]+[
                Pointer(POD(float_type, "gmem_fluxes_on_faces%d" % flux_nr))
                for flux_nr in range(len(self.fluxes))
                ]
            ))

        cmod = Module()
        cmod.append(Include("pycuda-helpers.hpp"))

        for dep_expr in self.all_deps:
            cmod.extend([
                Value("texture<%s, 1, cudaReadModeElementType>"
                    % dtype_to_ctype(float_type, with_fp_tex_hack=True),
                    "field%d_tex" % self.dep_to_index[dep_expr])
                ])

        if fplan.flux_count != len(self.fluxes):
            from warnings import warn
            warn("Flux count in flux execution plan different from actual flux count.\n"
                    "You may want to specify the tune_for= kwarg in the Discretization\n"
                    "constructor.")

        cmod.extend([
            Line(),
            Typedef(POD(float_type, "value_type")),
            Line(),
            flux_header_struct(float_type, discr.dimensions),
            Line(),
            face_pair_struct(float_type, discr.dimensions),
            Line(),
            Define("DIMENSIONS", discr.dimensions),
            Define("DOFS_PER_FACE", fplan.dofs_per_face),
            Define("THREADS_PER_FACE", fplan.threads_per_face()),
            Line(),
            Define("CONCURRENT_FACES", fplan.parallel_faces),
            Define("BLOCK_MB_COUNT", fplan.mbs_per_block),
            Line(),
            Define("FACEDOF_NR", "threadIdx.x"),
            Define("BLOCK_FACE", "threadIdx.y"),
            Line(),
            Define("FLUX_COUNT", len(self.fluxes)),
            Line(),
            Define("THREAD_NUM", "(FACEDOF_NR + BLOCK_FACE*THREADS_PER_FACE)"),
            Define("THREAD_COUNT", "(THREADS_PER_FACE*CONCURRENT_FACES)"),
            Define("COALESCING_THREAD_COUNT",
                "(THREAD_COUNT < 0x10 ? THREAD_COUNT : THREAD_COUNT & ~0xf)"),
            Line(),
            Define("DATA_BLOCK_SIZE", fdata.block_bytes),
            Define("ALIGNED_FACE_DOFS_PER_MB", fplan.aligned_face_dofs_per_microblock()),
            Define("ALIGNED_FACE_DOFS_PER_BLOCK",
                "(ALIGNED_FACE_DOFS_PER_MB*BLOCK_MB_COUNT)"),
            Line(),
            Define("FOF_BLOCK_BASE", "(blockIdx.x*ALIGNED_FACE_DOFS_PER_BLOCK)"),
            Line(),
            ] + ilist_data.code + [
            Line(),
            Value("texture<index_list_entry_t, 1, cudaReadModeElementType>",
                "tex_index_lists"),
            Line(),
            fdata.struct,
            Line(),
            CudaShared(Value("flux_data", "data")),
            ])

        if not fplan.direct_store:
            cmod.extend([
                CudaShared(
                    ArrayOf(
                        ArrayOf(
                            POD(float_type, "smem_fluxes_on_faces"),
                            "FLUX_COUNT"),
                        "ALIGNED_FACE_DOFS_PER_MB*BLOCK_MB_COUNT")
                    ),
                Line(),
                ])

        S = Statement
        f_body = Block()

        from hedge.backends.cuda.tools import get_load_code

        f_body.extend(get_load_code(
            dest="&data",
            base="gmem_facedata + blockIdx.x*DATA_BLOCK_SIZE",
            bytes="sizeof(flux_data)",
            descr="load face_pair data")
            +[S("__syncthreads()"), Line() ])

        def get_flux_code(flux_writer):
            flux_code = Block([])

            flux_code.extend([
                Initializer(Pointer(
                    Value("face_pair", "fpair")),
                    "data.facepairs+fpair_nr"),
                Initializer(
                    MaybeUnused(POD(numpy.uint32, "a_index")),
                    "fpair->a_base + tex1Dfetch(tex_index_lists, "
                    "fpair->a_ilist_index + FACEDOF_NR)"),
                Initializer(
                    MaybeUnused(POD(numpy.uint32, "b_index")),
                    "fpair->b_base + tex1Dfetch(tex_index_lists, "
                    "fpair->b_ilist_index + FACEDOF_NR)"),
                Line(),
                flux_writer(),
                Line(),
                S("fpair_nr += CONCURRENT_FACES")
                ])

            return flux_code

        flux_computation = Block([
            Comment("fluxes for dual-sided (intra-block) interior face pairs"),
            While("fpair_nr < data.header.same_facepairs_end",
                get_flux_code(lambda:
                    self.write_interior_flux_code(True))
                ),
            Line(),
            Comment("work around nvcc assertion failure"),
            S("fpair_nr+=1"),
            S("fpair_nr-=1"),
            Line(),
            Comment("fluxes for single-sided (inter-block) interior face pairs"),
            While("fpair_nr < data.header.diff_facepairs_end",
                get_flux_code(lambda:
                    self.write_interior_flux_code(False))
                ),
            Line(),
            Comment("fluxes for single-sided boundary face pairs"),
            While("fpair_nr < data.header.bdry_facepairs_end",
                get_flux_code(
                    lambda: self.write_boundary_flux_code(for_benchmark))
                ),
            ])

        f_body.extend_log_block("compute the fluxes", [
            Initializer(POD(numpy.uint32, "fpair_nr"), "BLOCK_FACE"),
            If("FACEDOF_NR < DOFS_PER_FACE", flux_computation)
            ])

        if not fplan.direct_store:
            f_body.extend([
                Line(),
                S("__syncthreads()"),
                Line()
                ])

            f_body.extend_log_block("store fluxes", [
                    #Assign("debugbuf[blockIdx.x]", "FOF_BLOCK_BASE"),
                    #Assign("debugbuf[0]", "FOF_BLOCK_BASE"),
                    #Assign("debugbuf[0]", "sizeof(face_pair)"),
                    For("unsigned word_nr = THREAD_NUM",
                        "word_nr < ALIGNED_FACE_DOFS_PER_MB*BLOCK_MB_COUNT",
                        "word_nr += COALESCING_THREAD_COUNT",
                        Block([Assign(
                            "gmem_fluxes_on_faces%d[FOF_BLOCK_BASE+word_nr]" % flux_nr,
                            "smem_fluxes_on_faces[%d][word_nr]" % flux_nr)
                            for flux_nr in range(len(self.fluxes))]
                           #+[If("isnan(smem_fluxes_on_faces[%d][word_nr])" % flux_nr,
                               #Block([
                                   #Assign("debugbuf[blockIdx.x]", "word_nr"),
                                   #])
                               #)
                            #for flux_nr in range(len(self.fluxes))]
                        )
                    )
                    ])
        if False:
            f_body.extend([
                    Assign("debugbuf[blockIdx.x*96+32+BLOCK_FACE*32+threadIdx.x]", "fpair_nr"),
                    Assign("debugbuf[blockIdx.x*96+16]", "data.header.same_facepairs_end"),
                    Assign("debugbuf[blockIdx.x*96+17]", "data.header.diff_facepairs_end"),
                    Assign("debugbuf[blockIdx.x*96+18]", "data.header.bdry_facepairs_end"),
                    ]
                    )

        # finish off ----------------------------------------------------------
        cmod.append(FunctionBody(f_decl, f_body))

        if not for_benchmark and "cuda_dump_kernels" in discr.debug:
            from hedge.tools import open_unique_debug_file
            open_unique_debug_file("flux_gather", ".cu").write(str(cmod))

        #from pycuda.tools import allow_user_edit
        mod = SourceModule(
                #allow_user_edit(cmod, "kernel.cu", "the flux kernel"),
                cmod,
                keep="cuda_keep_kernels" in discr.debug)
        expr_to_texture_map = dict(
                (dep_expr, mod.get_texref(
                    "field%d_tex" % self.dep_to_index[dep_expr]))
                for dep_expr in self.all_deps)

        index_list_texref = mod.get_texref("tex_index_lists")
        index_list_texref.set_address(
                ilist_data.device_memory,
                ilist_data.bytes)
        index_list_texref.set_format(
                cuda.dtype_to_array_format(ilist_data.type), 1)
        index_list_texref.set_flags(cuda.TRSF_READ_AS_INTEGER)

        func = mod.get_function("apply_flux")
        block = (fplan.threads_per_face(), fplan.parallel_faces, 1)
        func.prepare(
                (2+len(self.fluxes))*"P",
                texrefs=expr_to_texture_map.values()
                + [index_list_texref])

        if "cuda_flux" in discr.debug:
            print "flux: lmem=%d smem=%d regs=%d" % (
                    func.local_size_bytes,
                    func.shared_size_bytes,
                    func.num_regs)

        return block, func, expr_to_texture_map
示例#6
0
    def fake_flux_face_data_block(self, block_count):
        discr = self.discr
        given = self.plan.given

        fh_struct = flux_header_struct(given.float_type, discr.dimensions)
        fp_struct = face_pair_struct(given.float_type, discr.dimensions)

        min_headers = []
        min_fp_blocks = []

        from random import randrange, choice

        face_dofs = self.plan.dofs_per_face()

        mp_count = discr.device.get_attribute(
                    cuda.device_attribute.MULTIPROCESSOR_COUNT)

        # FIXME
        assert False, "flux planning in the presence of quadrature needs to be fixed"

        for block_nr in range(mp_count):
            fp_structs = []

            faces = [(mb_nr, mb_el_nr, face_nr)
                    for mb_nr in range(self.plan.microblocks_per_block())
                    for mb_el_nr in range(given.microblock.elements)
                    for face_nr in range(self.plan.faces_per_el())]

            def draw_base():
                mb_nr, mb_el_nr, face_nr = choice(faces)
                return (block_nr * given.microblock.aligned_floats
                        * self.plan.microblocks_per_block()
                        + mb_nr * given.microblock.aligned_floats
                        + mb_el_nr * given.dofs_per_el())

            def draw_dest():
                mb_nr, mb_el_nr, face_nr = choice(faces)
                return (mb_nr * given.aligned_face_dofs_per_microblock()
                        + mb_el_nr * face_dofs * given.faces_per_el()
                        + face_nr * face_dofs)

            def bound_int(low, x, hi):
                return int(min(max(low, x), hi))

            from random import gauss
            pdata = self.plan.partition_data
            fp_count = bound_int(
                    0,
                    gauss(
                        pdata.face_pair_avg,
                        (pdata.max_face_pair_count-pdata.face_pair_avg)/2),
                    pdata.max_face_pair_count)


            for i in range(fp_count):
                fp_structs.append(
                        fp_struct.make(
                            h=0.5, order=2, face_jacobian=0.5,
                            normal=discr.dimensions*[0.1],

                            a_base=draw_base(), b_base=draw_base(),

                            a_ilist_index=randrange(self.FAKE_INDEX_LIST_COUNT)*face_dofs,
                            b_ilist_index=randrange(self.FAKE_INDEX_LIST_COUNT)*face_dofs,

                            boundary_bitmap=1,
                            b_write_ilist_index=randrange(self.FAKE_INDEX_LIST_COUNT)*face_dofs,

                            a_dest=draw_dest(), b_dest=draw_dest()
                            ))

            total_ext_face_count = bound_int(0,
                pdata.ext_face_avg + randrange(-1,2),
                fp_count)

            bdry_count = min(total_ext_face_count,
                    randrange(1+int(round(total_ext_face_count/6))))
            diff_count = total_ext_face_count-bdry_count

            min_headers.append(fh_struct.make(
                    same_facepairs_end=len(fp_structs)-total_ext_face_count,
                    diff_facepairs_end=diff_count,
                    bdry_facepairs_end=bdry_count))
            min_fp_blocks.append(fp_structs)

        dups = block_count//mp_count + 1
        headers = (min_headers * dups)[:block_count]
        fp_blocks = (min_fp_blocks * dups)[:block_count]

        from cgen import Value
        from hedge.backends.cuda.tools import make_superblocks

        return make_superblocks(
                given.devdata, "flux_data",
                [(headers, Value(fh_struct.tpname, "header")) ],
                [(fp_blocks, Value(fp_struct.tpname, "facepairs"))]
                )
示例#7
0
    def flux_face_data_block(self, elgroup):
        discr = self.discr
        given = self.plan.given
        fplan = self.plan
        headers = []
        fp_blocks = []

        INVALID_DEST = (1<<16)-1

        from hedge.backends.cuda import GPUBoundaryFaceStorage

        fh_struct = flux_header_struct(given.float_type, discr.dimensions)
        fp_struct = face_pair_struct(given.float_type, discr.dimensions)

        def find_elface_dest(el_face):
            num_in_block = discr.find_number_in_block(el_face[0])
            mb_index, index_in_mb = divmod(num_in_block, given.microblock.elements)
            return (mb_index * fplan.aligned_face_dofs_per_microblock()
                    + index_in_mb * elface_dofs
                    + el_face[1]*face_dofs)

        # {{{ quadrature setup, if necessary
        if fplan.quadrature_tag is not None:
            quad_info = discr.get_cuda_quadrature_info(
                    fplan.quadrature_tag)
            eg_quad_info = discr.get_cuda_elgroup_quadrature_info(
                    elgroup, fplan.quadrature_tag)
            ldis_quad_info = eg_quad_info.ldis_quad_info

            def find_el_src_index(el):
                block = discr.blocks[discr.partition[el.id]]

                mb_nr, in_mb_nr = divmod(block.el_number_map[el],
                    given.microblock.elements)

                return (block.number * fplan.input_dofs_per_block()
                        + mb_nr*eg_quad_info.aligned_int_face_dofs_per_microblock
                        + in_mb_nr*ldis_quad_info.face_node_count()
                        * ldis_quad_info.ldis.face_count())

            face_storage_map = quad_info.face_storage_info.map
        else:
            find_el_src_index = discr.find_el_gpu_index
            face_storage_map = discr.face_storage_info.map

        # }}}

        int_fp_count, ext_fp_count, bdry_fp_count = 0, 0, 0

        for block_nr, block in enumerate(discr.blocks):
            ldis = block.local_discretization
            face_dofs = fplan.dofs_per_face
            elface_dofs = face_dofs*ldis.face_count()

            faces_todo = set((el,face_nbr)
                    for mb in block.microblocks
                    for el in mb
                    for face_nbr in range(ldis.face_count()))
            same_fp_structs = []
            diff_fp_structs = []
            bdry_fp_structs = []

            while faces_todo:
                elface = faces_todo.pop()

                a_face = face_storage_map[elface]
                b_face = a_face.opposite

                if isinstance(b_face, GPUBoundaryFaceStorage):
                    # boundary face
                    b_base = b_face.gpu_bdry_index_in_floats
                    boundary_bitmap = self.executor.elface_to_bdry_bitmap.get(
                            a_face.el_face, 0)
                    b_write_index_list = 0 # doesn't matter
                    b_dest = INVALID_DEST

                    fp_structs = bdry_fp_structs
                    bdry_fp_count += 1
                else:
                    # interior face
                    b_base = find_el_src_index(b_face.el_face[0])
                    boundary_bitmap = 0

                    if b_face.native_block == a_face.native_block:
                        # same block
                        faces_todo.remove(b_face.el_face)
                        b_write_index_list = a_face.ext_write_index_list_id
                        b_dest = find_elface_dest(b_face.el_face)

                        fp_structs = same_fp_structs
                        int_fp_count += 1
                    else:
                        # different block
                        b_write_index_list = 0 # doesn't matter
                        b_dest = INVALID_DEST

                        fp_structs = diff_fp_structs
                        ext_fp_count += 1

                a_base = find_el_src_index(a_face.el_face[0])
                a_dest = find_elface_dest(a_face.el_face)

                fp_structs.append(
                        fp_struct.make(
                            h=a_face.face_pair_side.h,
                            order=a_face.face_pair_side.order,
                            face_jacobian=a_face.face_pair_side.face_jacobian,
                            normal=a_face.face_pair_side.normal,

                            a_base=a_base,
                            b_base=b_base,

                            a_ilist_index= \
                                    a_face.global_int_flux_index_list_id*face_dofs,
                            b_ilist_index= \
                                    a_face.global_ext_flux_index_list_id*face_dofs,

                            boundary_bitmap=boundary_bitmap,
                            b_write_ilist_index= \
                                    b_write_index_list*face_dofs,

                            a_dest=a_dest,
                            b_dest=b_dest
                            ))

            headers.append(fh_struct.make(
                    same_facepairs_end=\
                            len(same_fp_structs),
                    diff_facepairs_end=\
                            len(same_fp_structs)+len(diff_fp_structs),
                    bdry_facepairs_end=\
                            len(same_fp_structs)+len(diff_fp_structs)
                            +len(bdry_fp_structs),
                    ))
            fp_blocks.append(
                    same_fp_structs
                    +diff_fp_structs
                    +bdry_fp_structs)

        #print len(same_fp_structs), len(diff_fp_structs), len(bdry_fp_structs)

        from cgen import Value
        from hedge.backends.cuda.tools import make_superblocks

        return make_superblocks(
                given.devdata, "flux_data",
                [(headers, Value(fh_struct.tpname, "header"))],
                [(fp_blocks, Value(fp_struct.tpname, "facepairs"))],
                extra_fields={
                    "int_fp_count": int_fp_count,
                    "ext_fp_count": ext_fp_count,
                    "bdry_fp_count": bdry_fp_count,
                    "fp_count": int_fp_count+ext_fp_count+bdry_fp_count,
                    }
                )
示例#8
0
    def get_kernel(self, fdata, ilist_data, for_benchmark):
        from cgen.cuda import CudaShared, CudaGlobal
        from pycuda.tools import dtype_to_ctype

        discr = self.discr
        given = self.plan.given
        fplan = self.plan
        d = discr.dimensions
        dims = range(d)

        elgroup, = discr.element_groups

        float_type = given.float_type

        f_decl = CudaGlobal(
            FunctionDeclaration(Value("void", "apply_flux"), [
                Pointer(POD(float_type, "debugbuf")),
                Pointer(POD(numpy.uint8, "gmem_facedata")),
            ] + [
                Pointer(POD(float_type, "gmem_fluxes_on_faces%d" % flux_nr))
                for flux_nr in range(len(self.fluxes))
            ]))

        cmod = Module()
        cmod.append(Include("pycuda-helpers.hpp"))

        for dep_expr in self.all_deps:
            cmod.extend([
                Value(
                    "texture<%s, 1, cudaReadModeElementType>" %
                    dtype_to_ctype(float_type, with_fp_tex_hack=True),
                    "field%d_tex" % self.dep_to_index[dep_expr])
            ])

        if fplan.flux_count != len(self.fluxes):
            from warnings import warn
            warn(
                "Flux count in flux execution plan different from actual flux count.\n"
                "You may want to specify the tune_for= kwarg in the Discretization\n"
                "constructor.")

        cmod.extend([
            Line(),
            Typedef(POD(float_type, "value_type")),
            Line(),
            flux_header_struct(float_type, discr.dimensions),
            Line(),
            face_pair_struct(float_type, discr.dimensions),
            Line(),
            Define("DIMENSIONS", discr.dimensions),
            Define("DOFS_PER_FACE", fplan.dofs_per_face),
            Define("THREADS_PER_FACE", fplan.threads_per_face()),
            Line(),
            Define("CONCURRENT_FACES", fplan.parallel_faces),
            Define("BLOCK_MB_COUNT", fplan.mbs_per_block),
            Line(),
            Define("FACEDOF_NR", "threadIdx.x"),
            Define("BLOCK_FACE", "threadIdx.y"),
            Line(),
            Define("FLUX_COUNT", len(self.fluxes)),
            Line(),
            Define("THREAD_NUM", "(FACEDOF_NR + BLOCK_FACE*THREADS_PER_FACE)"),
            Define("THREAD_COUNT", "(THREADS_PER_FACE*CONCURRENT_FACES)"),
            Define(
                "COALESCING_THREAD_COUNT",
                "(THREAD_COUNT < 0x10 ? THREAD_COUNT : THREAD_COUNT & ~0xf)"),
            Line(),
            Define("DATA_BLOCK_SIZE", fdata.block_bytes),
            Define("ALIGNED_FACE_DOFS_PER_MB",
                   fplan.aligned_face_dofs_per_microblock()),
            Define("ALIGNED_FACE_DOFS_PER_BLOCK",
                   "(ALIGNED_FACE_DOFS_PER_MB*BLOCK_MB_COUNT)"),
            Line(),
            Define("FOF_BLOCK_BASE",
                   "(blockIdx.x*ALIGNED_FACE_DOFS_PER_BLOCK)"),
            Line(),
        ] + ilist_data.code + [
            Line(),
            Value("texture<index_list_entry_t, 1, cudaReadModeElementType>",
                  "tex_index_lists"),
            Line(),
            fdata.struct,
            Line(),
            CudaShared(Value("flux_data", "data")),
        ])

        if not fplan.direct_store:
            cmod.extend([
                CudaShared(
                    ArrayOf(
                        ArrayOf(POD(float_type, "smem_fluxes_on_faces"),
                                "FLUX_COUNT"),
                        "ALIGNED_FACE_DOFS_PER_MB*BLOCK_MB_COUNT")),
                Line(),
            ])

        S = Statement
        f_body = Block()

        from hedge.backends.cuda.tools import get_load_code

        f_body.extend(
            get_load_code(dest="&data",
                          base="gmem_facedata + blockIdx.x*DATA_BLOCK_SIZE",
                          bytes="sizeof(flux_data)",
                          descr="load face_pair data") +
            [S("__syncthreads()"), Line()])

        def get_flux_code(flux_writer):
            flux_code = Block([])

            flux_code.extend([
                Initializer(Pointer(Value("face_pair", "fpair")),
                            "data.facepairs+fpair_nr"),
                Initializer(
                    MaybeUnused(POD(numpy.uint32, "a_index")),
                    "fpair->a_base + tex1Dfetch(tex_index_lists, "
                    "fpair->a_ilist_index + FACEDOF_NR)"),
                Initializer(
                    MaybeUnused(POD(numpy.uint32, "b_index")),
                    "fpair->b_base + tex1Dfetch(tex_index_lists, "
                    "fpair->b_ilist_index + FACEDOF_NR)"),
                Line(),
                flux_writer(),
                Line(),
                S("fpair_nr += CONCURRENT_FACES")
            ])

            return flux_code

        flux_computation = Block([
            Comment("fluxes for dual-sided (intra-block) interior face pairs"),
            While("fpair_nr < data.header.same_facepairs_end",
                  get_flux_code(lambda: self.write_interior_flux_code(True))),
            Line(),
            Comment("work around nvcc assertion failure"),
            S("fpair_nr+=1"),
            S("fpair_nr-=1"),
            Line(),
            Comment(
                "fluxes for single-sided (inter-block) interior face pairs"),
            While("fpair_nr < data.header.diff_facepairs_end",
                  get_flux_code(lambda: self.write_interior_flux_code(False))),
            Line(),
            Comment("fluxes for single-sided boundary face pairs"),
            While(
                "fpair_nr < data.header.bdry_facepairs_end",
                get_flux_code(
                    lambda: self.write_boundary_flux_code(for_benchmark))),
        ])

        f_body.extend_log_block("compute the fluxes", [
            Initializer(POD(numpy.uint32, "fpair_nr"), "BLOCK_FACE"),
            If("FACEDOF_NR < DOFS_PER_FACE", flux_computation)
        ])

        if not fplan.direct_store:
            f_body.extend([Line(), S("__syncthreads()"), Line()])

            f_body.extend_log_block(
                "store fluxes",
                [
                    #Assign("debugbuf[blockIdx.x]", "FOF_BLOCK_BASE"),
                    #Assign("debugbuf[0]", "FOF_BLOCK_BASE"),
                    #Assign("debugbuf[0]", "sizeof(face_pair)"),
                    For(
                        "unsigned word_nr = THREAD_NUM",
                        "word_nr < ALIGNED_FACE_DOFS_PER_MB*BLOCK_MB_COUNT",
                        "word_nr += COALESCING_THREAD_COUNT",
                        Block([
                            Assign(
                                "gmem_fluxes_on_faces%d[FOF_BLOCK_BASE+word_nr]"
                                % flux_nr,
                                "smem_fluxes_on_faces[%d][word_nr]" % flux_nr)
                            for flux_nr in range(len(self.fluxes))
                        ]
                              #+[If("isnan(smem_fluxes_on_faces[%d][word_nr])" % flux_nr,
                              #Block([
                              #Assign("debugbuf[blockIdx.x]", "word_nr"),
                              #])
                              #)
                              #for flux_nr in range(len(self.fluxes))]
                              ))
                ])
        if False:
            f_body.extend([
                Assign("debugbuf[blockIdx.x*96+32+BLOCK_FACE*32+threadIdx.x]",
                       "fpair_nr"),
                Assign("debugbuf[blockIdx.x*96+16]",
                       "data.header.same_facepairs_end"),
                Assign("debugbuf[blockIdx.x*96+17]",
                       "data.header.diff_facepairs_end"),
                Assign("debugbuf[blockIdx.x*96+18]",
                       "data.header.bdry_facepairs_end"),
            ])

        # finish off ----------------------------------------------------------
        cmod.append(FunctionBody(f_decl, f_body))

        if not for_benchmark and "cuda_dump_kernels" in discr.debug:
            from hedge.tools import open_unique_debug_file
            open_unique_debug_file("flux_gather", ".cu").write(str(cmod))

        #from pycuda.tools import allow_user_edit
        mod = SourceModule(
            #allow_user_edit(cmod, "kernel.cu", "the flux kernel"),
            cmod,
            keep="cuda_keep_kernels" in discr.debug)
        expr_to_texture_map = dict(
            (dep_expr,
             mod.get_texref("field%d_tex" % self.dep_to_index[dep_expr]))
            for dep_expr in self.all_deps)

        index_list_texref = mod.get_texref("tex_index_lists")
        index_list_texref.set_address(ilist_data.device_memory,
                                      ilist_data.bytes)
        index_list_texref.set_format(
            cuda.dtype_to_array_format(ilist_data.type), 1)
        index_list_texref.set_flags(cuda.TRSF_READ_AS_INTEGER)

        func = mod.get_function("apply_flux")
        block = (fplan.threads_per_face(), fplan.parallel_faces, 1)
        func.prepare(
            (2 + len(self.fluxes)) * "P",
            texrefs=expr_to_texture_map.values() + [index_list_texref])

        if "cuda_flux" in discr.debug:
            print "flux: lmem=%d smem=%d regs=%d" % (
                func.local_size_bytes, func.shared_size_bytes, func.num_regs)

        return block, func, expr_to_texture_map
示例#9
0
    def fake_flux_face_data_block(self, block_count):
        discr = self.discr
        given = self.plan.given

        fh_struct = flux_header_struct(given.float_type, discr.dimensions)
        fp_struct = face_pair_struct(given.float_type, discr.dimensions)

        min_headers = []
        min_fp_blocks = []

        from random import randrange, choice

        face_dofs = self.plan.dofs_per_face()

        mp_count = discr.device.get_attribute(
            cuda.device_attribute.MULTIPROCESSOR_COUNT)

        # FIXME
        assert False, "flux planning in the presence of quadrature needs to be fixed"

        for block_nr in range(mp_count):
            fp_structs = []

            faces = [(mb_nr, mb_el_nr, face_nr)
                     for mb_nr in range(self.plan.microblocks_per_block())
                     for mb_el_nr in range(given.microblock.elements)
                     for face_nr in range(self.plan.faces_per_el())]

            def draw_base():
                mb_nr, mb_el_nr, face_nr = choice(faces)
                return (block_nr * given.microblock.aligned_floats *
                        self.plan.microblocks_per_block() +
                        mb_nr * given.microblock.aligned_floats +
                        mb_el_nr * given.dofs_per_el())

            def draw_dest():
                mb_nr, mb_el_nr, face_nr = choice(faces)
                return (mb_nr * given.aligned_face_dofs_per_microblock() +
                        mb_el_nr * face_dofs * given.faces_per_el() +
                        face_nr * face_dofs)

            def bound_int(low, x, hi):
                return int(min(max(low, x), hi))

            from random import gauss
            pdata = self.plan.partition_data
            fp_count = bound_int(
                0,
                gauss(pdata.face_pair_avg,
                      (pdata.max_face_pair_count - pdata.face_pair_avg) / 2),
                pdata.max_face_pair_count)

            for i in range(fp_count):
                fp_structs.append(
                    fp_struct.make(
                        h=0.5,
                        order=2,
                        face_jacobian=0.5,
                        normal=discr.dimensions * [0.1],
                        a_base=draw_base(),
                        b_base=draw_base(),
                        a_ilist_index=randrange(self.FAKE_INDEX_LIST_COUNT) *
                        face_dofs,
                        b_ilist_index=randrange(self.FAKE_INDEX_LIST_COUNT) *
                        face_dofs,
                        boundary_bitmap=1,
                        b_write_ilist_index=randrange(
                            self.FAKE_INDEX_LIST_COUNT) * face_dofs,
                        a_dest=draw_dest(),
                        b_dest=draw_dest()))

            total_ext_face_count = bound_int(
                0, pdata.ext_face_avg + randrange(-1, 2), fp_count)

            bdry_count = min(
                total_ext_face_count,
                randrange(1 + int(round(total_ext_face_count / 6))))
            diff_count = total_ext_face_count - bdry_count

            min_headers.append(
                fh_struct.make(same_facepairs_end=len(fp_structs) -
                               total_ext_face_count,
                               diff_facepairs_end=diff_count,
                               bdry_facepairs_end=bdry_count))
            min_fp_blocks.append(fp_structs)

        dups = block_count // mp_count + 1
        headers = (min_headers * dups)[:block_count]
        fp_blocks = (min_fp_blocks * dups)[:block_count]

        from cgen import Value
        from hedge.backends.cuda.tools import make_superblocks

        return make_superblocks(
            given.devdata, "flux_data",
            [(headers, Value(fh_struct.tpname, "header"))],
            [(fp_blocks, Value(fp_struct.tpname, "facepairs"))])
示例#10
0
    def flux_face_data_block(self, elgroup):
        discr = self.discr
        given = self.plan.given
        fplan = self.plan
        headers = []
        fp_blocks = []

        INVALID_DEST = (1 << 16) - 1

        from hedge.backends.cuda import GPUBoundaryFaceStorage

        fh_struct = flux_header_struct(given.float_type, discr.dimensions)
        fp_struct = face_pair_struct(given.float_type, discr.dimensions)

        def find_elface_dest(el_face):
            num_in_block = discr.find_number_in_block(el_face[0])
            mb_index, index_in_mb = divmod(num_in_block,
                                           given.microblock.elements)
            return (mb_index * fplan.aligned_face_dofs_per_microblock() +
                    index_in_mb * elface_dofs + el_face[1] * face_dofs)

        # {{{ quadrature setup, if necessary
        if fplan.quadrature_tag is not None:
            quad_info = discr.get_cuda_quadrature_info(fplan.quadrature_tag)
            eg_quad_info = discr.get_cuda_elgroup_quadrature_info(
                elgroup, fplan.quadrature_tag)
            ldis_quad_info = eg_quad_info.ldis_quad_info

            def find_el_src_index(el):
                block = discr.blocks[discr.partition[el.id]]

                mb_nr, in_mb_nr = divmod(block.el_number_map[el],
                                         given.microblock.elements)

                return (
                    block.number * fplan.input_dofs_per_block() +
                    mb_nr * eg_quad_info.aligned_int_face_dofs_per_microblock +
                    in_mb_nr * ldis_quad_info.face_node_count() *
                    ldis_quad_info.ldis.face_count())

            face_storage_map = quad_info.face_storage_info.map
        else:
            find_el_src_index = discr.find_el_gpu_index
            face_storage_map = discr.face_storage_info.map

        # }}}

        int_fp_count, ext_fp_count, bdry_fp_count = 0, 0, 0

        for block_nr, block in enumerate(discr.blocks):
            ldis = block.local_discretization
            face_dofs = fplan.dofs_per_face
            elface_dofs = face_dofs * ldis.face_count()

            faces_todo = set((el, face_nbr) for mb in block.microblocks
                             for el in mb
                             for face_nbr in range(ldis.face_count()))
            same_fp_structs = []
            diff_fp_structs = []
            bdry_fp_structs = []

            while faces_todo:
                elface = faces_todo.pop()

                a_face = face_storage_map[elface]
                b_face = a_face.opposite

                if isinstance(b_face, GPUBoundaryFaceStorage):
                    # boundary face
                    b_base = b_face.gpu_bdry_index_in_floats
                    boundary_bitmap = self.executor.elface_to_bdry_bitmap.get(
                        a_face.el_face, 0)
                    b_write_index_list = 0  # doesn't matter
                    b_dest = INVALID_DEST

                    fp_structs = bdry_fp_structs
                    bdry_fp_count += 1
                else:
                    # interior face
                    b_base = find_el_src_index(b_face.el_face[0])
                    boundary_bitmap = 0

                    if b_face.native_block == a_face.native_block:
                        # same block
                        faces_todo.remove(b_face.el_face)
                        b_write_index_list = a_face.ext_write_index_list_id
                        b_dest = find_elface_dest(b_face.el_face)

                        fp_structs = same_fp_structs
                        int_fp_count += 1
                    else:
                        # different block
                        b_write_index_list = 0  # doesn't matter
                        b_dest = INVALID_DEST

                        fp_structs = diff_fp_structs
                        ext_fp_count += 1

                a_base = find_el_src_index(a_face.el_face[0])
                a_dest = find_elface_dest(a_face.el_face)

                fp_structs.append(
                        fp_struct.make(
                            h=a_face.face_pair_side.h,
                            order=a_face.face_pair_side.order,
                            face_jacobian=a_face.face_pair_side.face_jacobian,
                            normal=a_face.face_pair_side.normal,

                            a_base=a_base,
                            b_base=b_base,

                            a_ilist_index= \
                                    a_face.global_int_flux_index_list_id*face_dofs,
                            b_ilist_index= \
                                    a_face.global_ext_flux_index_list_id*face_dofs,

                            boundary_bitmap=boundary_bitmap,
                            b_write_ilist_index= \
                                    b_write_index_list*face_dofs,

                            a_dest=a_dest,
                            b_dest=b_dest
                            ))

            headers.append(fh_struct.make(
                    same_facepairs_end=\
                            len(same_fp_structs),
                    diff_facepairs_end=\
                            len(same_fp_structs)+len(diff_fp_structs),
                    bdry_facepairs_end=\
                            len(same_fp_structs)+len(diff_fp_structs)
                            +len(bdry_fp_structs),
                    ))
            fp_blocks.append(same_fp_structs + diff_fp_structs +
                             bdry_fp_structs)

        #print len(same_fp_structs), len(diff_fp_structs), len(bdry_fp_structs)

        from cgen import Value
        from hedge.backends.cuda.tools import make_superblocks

        return make_superblocks(
            given.devdata,
            "flux_data", [(headers, Value(fh_struct.tpname, "header"))],
            [(fp_blocks, Value(fp_struct.tpname, "facepairs"))],
            extra_fields={
                "int_fp_count": int_fp_count,
                "ext_fp_count": ext_fp_count,
                "bdry_fp_count": bdry_fp_count,
                "fp_count": int_fp_count + ext_fp_count + bdry_fp_count,
            })