def flux_header_struct(float_type, dims): from cgen import GenerableStruct return GenerableStruct("flux_header", [ POD(numpy.uint16, "same_facepairs_end"), POD(numpy.uint16, "diff_facepairs_end"), POD(numpy.uint16, "bdry_facepairs_end"), ], align_bytes=face_pair_struct(float_type, dims).alignment_requirement())
def flux_header_struct(float_type, dims): from cgen import GenerableStruct return GenerableStruct("flux_header", [ POD(numpy.uint16, "same_facepairs_end"), POD(numpy.uint16, "diff_facepairs_end"), POD(numpy.uint16, "bdry_facepairs_end"), ], align_bytes=face_pair_struct( float_type, dims).alignment_requirement())
def shared_mem_use(self): from hedge.backends.cuda.fluxgather import face_pair_struct d = self.given.ldis.dimensions if self.dofs_per_face > 255: index_lists_entry_size = 2 else: index_lists_entry_size = 1 result = (128 # parameters, block header, small extra stuff + len(face_pair_struct(self.given.float_type, d)) * self.face_pair_count()) if not self.direct_store: result += (self.aligned_face_dofs_per_microblock() * self.flux_count * self.microblocks_per_block() * self.given.float_size()) return result
def shared_mem_use(self): from hedge.backends.cuda.fluxgather import face_pair_struct d = self.given.ldis.dimensions if self.dofs_per_face > 255: index_lists_entry_size = 2 else: index_lists_entry_size = 1 result = ( 128 # parameters, block header, small extra stuff + len(face_pair_struct(self.given.float_type, d)) * self.face_pair_count()) if not self.direct_store: result += (self.aligned_face_dofs_per_microblock() * self.flux_count * self.microblocks_per_block() * self.given.float_size()) return result
def get_kernel(self, fdata, ilist_data, for_benchmark): from cgen.cuda import CudaShared, CudaGlobal from pycuda.tools import dtype_to_ctype discr = self.discr given = self.plan.given fplan = self.plan d = discr.dimensions dims = range(d) elgroup, = discr.element_groups float_type = given.float_type f_decl = CudaGlobal(FunctionDeclaration(Value("void", "apply_flux"), [ Pointer(POD(float_type, "debugbuf")), Pointer(POD(numpy.uint8, "gmem_facedata")), ]+[ Pointer(POD(float_type, "gmem_fluxes_on_faces%d" % flux_nr)) for flux_nr in range(len(self.fluxes)) ] )) cmod = Module() cmod.append(Include("pycuda-helpers.hpp")) for dep_expr in self.all_deps: cmod.extend([ Value("texture<%s, 1, cudaReadModeElementType>" % dtype_to_ctype(float_type, with_fp_tex_hack=True), "field%d_tex" % self.dep_to_index[dep_expr]) ]) if fplan.flux_count != len(self.fluxes): from warnings import warn warn("Flux count in flux execution plan different from actual flux count.\n" "You may want to specify the tune_for= kwarg in the Discretization\n" "constructor.") cmod.extend([ Line(), Typedef(POD(float_type, "value_type")), Line(), flux_header_struct(float_type, discr.dimensions), Line(), face_pair_struct(float_type, discr.dimensions), Line(), Define("DIMENSIONS", discr.dimensions), Define("DOFS_PER_FACE", fplan.dofs_per_face), Define("THREADS_PER_FACE", fplan.threads_per_face()), Line(), Define("CONCURRENT_FACES", fplan.parallel_faces), Define("BLOCK_MB_COUNT", fplan.mbs_per_block), Line(), Define("FACEDOF_NR", "threadIdx.x"), Define("BLOCK_FACE", "threadIdx.y"), Line(), Define("FLUX_COUNT", len(self.fluxes)), Line(), Define("THREAD_NUM", "(FACEDOF_NR + BLOCK_FACE*THREADS_PER_FACE)"), Define("THREAD_COUNT", "(THREADS_PER_FACE*CONCURRENT_FACES)"), Define("COALESCING_THREAD_COUNT", "(THREAD_COUNT < 0x10 ? THREAD_COUNT : THREAD_COUNT & ~0xf)"), Line(), Define("DATA_BLOCK_SIZE", fdata.block_bytes), Define("ALIGNED_FACE_DOFS_PER_MB", fplan.aligned_face_dofs_per_microblock()), Define("ALIGNED_FACE_DOFS_PER_BLOCK", "(ALIGNED_FACE_DOFS_PER_MB*BLOCK_MB_COUNT)"), Line(), Define("FOF_BLOCK_BASE", "(blockIdx.x*ALIGNED_FACE_DOFS_PER_BLOCK)"), Line(), ] + ilist_data.code + [ Line(), Value("texture<index_list_entry_t, 1, cudaReadModeElementType>", "tex_index_lists"), Line(), fdata.struct, Line(), CudaShared(Value("flux_data", "data")), ]) if not fplan.direct_store: cmod.extend([ CudaShared( ArrayOf( ArrayOf( POD(float_type, "smem_fluxes_on_faces"), "FLUX_COUNT"), "ALIGNED_FACE_DOFS_PER_MB*BLOCK_MB_COUNT") ), Line(), ]) S = Statement f_body = Block() from hedge.backends.cuda.tools import get_load_code f_body.extend(get_load_code( dest="&data", base="gmem_facedata + blockIdx.x*DATA_BLOCK_SIZE", bytes="sizeof(flux_data)", descr="load face_pair data") +[S("__syncthreads()"), Line() ]) def get_flux_code(flux_writer): flux_code = Block([]) flux_code.extend([ Initializer(Pointer( Value("face_pair", "fpair")), "data.facepairs+fpair_nr"), Initializer( MaybeUnused(POD(numpy.uint32, "a_index")), "fpair->a_base + tex1Dfetch(tex_index_lists, " "fpair->a_ilist_index + FACEDOF_NR)"), Initializer( MaybeUnused(POD(numpy.uint32, "b_index")), "fpair->b_base + tex1Dfetch(tex_index_lists, " "fpair->b_ilist_index + FACEDOF_NR)"), Line(), flux_writer(), Line(), S("fpair_nr += CONCURRENT_FACES") ]) return flux_code flux_computation = Block([ Comment("fluxes for dual-sided (intra-block) interior face pairs"), While("fpair_nr < data.header.same_facepairs_end", get_flux_code(lambda: self.write_interior_flux_code(True)) ), Line(), Comment("work around nvcc assertion failure"), S("fpair_nr+=1"), S("fpair_nr-=1"), Line(), Comment("fluxes for single-sided (inter-block) interior face pairs"), While("fpair_nr < data.header.diff_facepairs_end", get_flux_code(lambda: self.write_interior_flux_code(False)) ), Line(), Comment("fluxes for single-sided boundary face pairs"), While("fpair_nr < data.header.bdry_facepairs_end", get_flux_code( lambda: self.write_boundary_flux_code(for_benchmark)) ), ]) f_body.extend_log_block("compute the fluxes", [ Initializer(POD(numpy.uint32, "fpair_nr"), "BLOCK_FACE"), If("FACEDOF_NR < DOFS_PER_FACE", flux_computation) ]) if not fplan.direct_store: f_body.extend([ Line(), S("__syncthreads()"), Line() ]) f_body.extend_log_block("store fluxes", [ #Assign("debugbuf[blockIdx.x]", "FOF_BLOCK_BASE"), #Assign("debugbuf[0]", "FOF_BLOCK_BASE"), #Assign("debugbuf[0]", "sizeof(face_pair)"), For("unsigned word_nr = THREAD_NUM", "word_nr < ALIGNED_FACE_DOFS_PER_MB*BLOCK_MB_COUNT", "word_nr += COALESCING_THREAD_COUNT", Block([Assign( "gmem_fluxes_on_faces%d[FOF_BLOCK_BASE+word_nr]" % flux_nr, "smem_fluxes_on_faces[%d][word_nr]" % flux_nr) for flux_nr in range(len(self.fluxes))] #+[If("isnan(smem_fluxes_on_faces[%d][word_nr])" % flux_nr, #Block([ #Assign("debugbuf[blockIdx.x]", "word_nr"), #]) #) #for flux_nr in range(len(self.fluxes))] ) ) ]) if False: f_body.extend([ Assign("debugbuf[blockIdx.x*96+32+BLOCK_FACE*32+threadIdx.x]", "fpair_nr"), Assign("debugbuf[blockIdx.x*96+16]", "data.header.same_facepairs_end"), Assign("debugbuf[blockIdx.x*96+17]", "data.header.diff_facepairs_end"), Assign("debugbuf[blockIdx.x*96+18]", "data.header.bdry_facepairs_end"), ] ) # finish off ---------------------------------------------------------- cmod.append(FunctionBody(f_decl, f_body)) if not for_benchmark and "cuda_dump_kernels" in discr.debug: from hedge.tools import open_unique_debug_file open_unique_debug_file("flux_gather", ".cu").write(str(cmod)) #from pycuda.tools import allow_user_edit mod = SourceModule( #allow_user_edit(cmod, "kernel.cu", "the flux kernel"), cmod, keep="cuda_keep_kernels" in discr.debug) expr_to_texture_map = dict( (dep_expr, mod.get_texref( "field%d_tex" % self.dep_to_index[dep_expr])) for dep_expr in self.all_deps) index_list_texref = mod.get_texref("tex_index_lists") index_list_texref.set_address( ilist_data.device_memory, ilist_data.bytes) index_list_texref.set_format( cuda.dtype_to_array_format(ilist_data.type), 1) index_list_texref.set_flags(cuda.TRSF_READ_AS_INTEGER) func = mod.get_function("apply_flux") block = (fplan.threads_per_face(), fplan.parallel_faces, 1) func.prepare( (2+len(self.fluxes))*"P", texrefs=expr_to_texture_map.values() + [index_list_texref]) if "cuda_flux" in discr.debug: print "flux: lmem=%d smem=%d regs=%d" % ( func.local_size_bytes, func.shared_size_bytes, func.num_regs) return block, func, expr_to_texture_map
def fake_flux_face_data_block(self, block_count): discr = self.discr given = self.plan.given fh_struct = flux_header_struct(given.float_type, discr.dimensions) fp_struct = face_pair_struct(given.float_type, discr.dimensions) min_headers = [] min_fp_blocks = [] from random import randrange, choice face_dofs = self.plan.dofs_per_face() mp_count = discr.device.get_attribute( cuda.device_attribute.MULTIPROCESSOR_COUNT) # FIXME assert False, "flux planning in the presence of quadrature needs to be fixed" for block_nr in range(mp_count): fp_structs = [] faces = [(mb_nr, mb_el_nr, face_nr) for mb_nr in range(self.plan.microblocks_per_block()) for mb_el_nr in range(given.microblock.elements) for face_nr in range(self.plan.faces_per_el())] def draw_base(): mb_nr, mb_el_nr, face_nr = choice(faces) return (block_nr * given.microblock.aligned_floats * self.plan.microblocks_per_block() + mb_nr * given.microblock.aligned_floats + mb_el_nr * given.dofs_per_el()) def draw_dest(): mb_nr, mb_el_nr, face_nr = choice(faces) return (mb_nr * given.aligned_face_dofs_per_microblock() + mb_el_nr * face_dofs * given.faces_per_el() + face_nr * face_dofs) def bound_int(low, x, hi): return int(min(max(low, x), hi)) from random import gauss pdata = self.plan.partition_data fp_count = bound_int( 0, gauss( pdata.face_pair_avg, (pdata.max_face_pair_count-pdata.face_pair_avg)/2), pdata.max_face_pair_count) for i in range(fp_count): fp_structs.append( fp_struct.make( h=0.5, order=2, face_jacobian=0.5, normal=discr.dimensions*[0.1], a_base=draw_base(), b_base=draw_base(), a_ilist_index=randrange(self.FAKE_INDEX_LIST_COUNT)*face_dofs, b_ilist_index=randrange(self.FAKE_INDEX_LIST_COUNT)*face_dofs, boundary_bitmap=1, b_write_ilist_index=randrange(self.FAKE_INDEX_LIST_COUNT)*face_dofs, a_dest=draw_dest(), b_dest=draw_dest() )) total_ext_face_count = bound_int(0, pdata.ext_face_avg + randrange(-1,2), fp_count) bdry_count = min(total_ext_face_count, randrange(1+int(round(total_ext_face_count/6)))) diff_count = total_ext_face_count-bdry_count min_headers.append(fh_struct.make( same_facepairs_end=len(fp_structs)-total_ext_face_count, diff_facepairs_end=diff_count, bdry_facepairs_end=bdry_count)) min_fp_blocks.append(fp_structs) dups = block_count//mp_count + 1 headers = (min_headers * dups)[:block_count] fp_blocks = (min_fp_blocks * dups)[:block_count] from cgen import Value from hedge.backends.cuda.tools import make_superblocks return make_superblocks( given.devdata, "flux_data", [(headers, Value(fh_struct.tpname, "header")) ], [(fp_blocks, Value(fp_struct.tpname, "facepairs"))] )
def flux_face_data_block(self, elgroup): discr = self.discr given = self.plan.given fplan = self.plan headers = [] fp_blocks = [] INVALID_DEST = (1<<16)-1 from hedge.backends.cuda import GPUBoundaryFaceStorage fh_struct = flux_header_struct(given.float_type, discr.dimensions) fp_struct = face_pair_struct(given.float_type, discr.dimensions) def find_elface_dest(el_face): num_in_block = discr.find_number_in_block(el_face[0]) mb_index, index_in_mb = divmod(num_in_block, given.microblock.elements) return (mb_index * fplan.aligned_face_dofs_per_microblock() + index_in_mb * elface_dofs + el_face[1]*face_dofs) # {{{ quadrature setup, if necessary if fplan.quadrature_tag is not None: quad_info = discr.get_cuda_quadrature_info( fplan.quadrature_tag) eg_quad_info = discr.get_cuda_elgroup_quadrature_info( elgroup, fplan.quadrature_tag) ldis_quad_info = eg_quad_info.ldis_quad_info def find_el_src_index(el): block = discr.blocks[discr.partition[el.id]] mb_nr, in_mb_nr = divmod(block.el_number_map[el], given.microblock.elements) return (block.number * fplan.input_dofs_per_block() + mb_nr*eg_quad_info.aligned_int_face_dofs_per_microblock + in_mb_nr*ldis_quad_info.face_node_count() * ldis_quad_info.ldis.face_count()) face_storage_map = quad_info.face_storage_info.map else: find_el_src_index = discr.find_el_gpu_index face_storage_map = discr.face_storage_info.map # }}} int_fp_count, ext_fp_count, bdry_fp_count = 0, 0, 0 for block_nr, block in enumerate(discr.blocks): ldis = block.local_discretization face_dofs = fplan.dofs_per_face elface_dofs = face_dofs*ldis.face_count() faces_todo = set((el,face_nbr) for mb in block.microblocks for el in mb for face_nbr in range(ldis.face_count())) same_fp_structs = [] diff_fp_structs = [] bdry_fp_structs = [] while faces_todo: elface = faces_todo.pop() a_face = face_storage_map[elface] b_face = a_face.opposite if isinstance(b_face, GPUBoundaryFaceStorage): # boundary face b_base = b_face.gpu_bdry_index_in_floats boundary_bitmap = self.executor.elface_to_bdry_bitmap.get( a_face.el_face, 0) b_write_index_list = 0 # doesn't matter b_dest = INVALID_DEST fp_structs = bdry_fp_structs bdry_fp_count += 1 else: # interior face b_base = find_el_src_index(b_face.el_face[0]) boundary_bitmap = 0 if b_face.native_block == a_face.native_block: # same block faces_todo.remove(b_face.el_face) b_write_index_list = a_face.ext_write_index_list_id b_dest = find_elface_dest(b_face.el_face) fp_structs = same_fp_structs int_fp_count += 1 else: # different block b_write_index_list = 0 # doesn't matter b_dest = INVALID_DEST fp_structs = diff_fp_structs ext_fp_count += 1 a_base = find_el_src_index(a_face.el_face[0]) a_dest = find_elface_dest(a_face.el_face) fp_structs.append( fp_struct.make( h=a_face.face_pair_side.h, order=a_face.face_pair_side.order, face_jacobian=a_face.face_pair_side.face_jacobian, normal=a_face.face_pair_side.normal, a_base=a_base, b_base=b_base, a_ilist_index= \ a_face.global_int_flux_index_list_id*face_dofs, b_ilist_index= \ a_face.global_ext_flux_index_list_id*face_dofs, boundary_bitmap=boundary_bitmap, b_write_ilist_index= \ b_write_index_list*face_dofs, a_dest=a_dest, b_dest=b_dest )) headers.append(fh_struct.make( same_facepairs_end=\ len(same_fp_structs), diff_facepairs_end=\ len(same_fp_structs)+len(diff_fp_structs), bdry_facepairs_end=\ len(same_fp_structs)+len(diff_fp_structs) +len(bdry_fp_structs), )) fp_blocks.append( same_fp_structs +diff_fp_structs +bdry_fp_structs) #print len(same_fp_structs), len(diff_fp_structs), len(bdry_fp_structs) from cgen import Value from hedge.backends.cuda.tools import make_superblocks return make_superblocks( given.devdata, "flux_data", [(headers, Value(fh_struct.tpname, "header"))], [(fp_blocks, Value(fp_struct.tpname, "facepairs"))], extra_fields={ "int_fp_count": int_fp_count, "ext_fp_count": ext_fp_count, "bdry_fp_count": bdry_fp_count, "fp_count": int_fp_count+ext_fp_count+bdry_fp_count, } )
def get_kernel(self, fdata, ilist_data, for_benchmark): from cgen.cuda import CudaShared, CudaGlobal from pycuda.tools import dtype_to_ctype discr = self.discr given = self.plan.given fplan = self.plan d = discr.dimensions dims = range(d) elgroup, = discr.element_groups float_type = given.float_type f_decl = CudaGlobal( FunctionDeclaration(Value("void", "apply_flux"), [ Pointer(POD(float_type, "debugbuf")), Pointer(POD(numpy.uint8, "gmem_facedata")), ] + [ Pointer(POD(float_type, "gmem_fluxes_on_faces%d" % flux_nr)) for flux_nr in range(len(self.fluxes)) ])) cmod = Module() cmod.append(Include("pycuda-helpers.hpp")) for dep_expr in self.all_deps: cmod.extend([ Value( "texture<%s, 1, cudaReadModeElementType>" % dtype_to_ctype(float_type, with_fp_tex_hack=True), "field%d_tex" % self.dep_to_index[dep_expr]) ]) if fplan.flux_count != len(self.fluxes): from warnings import warn warn( "Flux count in flux execution plan different from actual flux count.\n" "You may want to specify the tune_for= kwarg in the Discretization\n" "constructor.") cmod.extend([ Line(), Typedef(POD(float_type, "value_type")), Line(), flux_header_struct(float_type, discr.dimensions), Line(), face_pair_struct(float_type, discr.dimensions), Line(), Define("DIMENSIONS", discr.dimensions), Define("DOFS_PER_FACE", fplan.dofs_per_face), Define("THREADS_PER_FACE", fplan.threads_per_face()), Line(), Define("CONCURRENT_FACES", fplan.parallel_faces), Define("BLOCK_MB_COUNT", fplan.mbs_per_block), Line(), Define("FACEDOF_NR", "threadIdx.x"), Define("BLOCK_FACE", "threadIdx.y"), Line(), Define("FLUX_COUNT", len(self.fluxes)), Line(), Define("THREAD_NUM", "(FACEDOF_NR + BLOCK_FACE*THREADS_PER_FACE)"), Define("THREAD_COUNT", "(THREADS_PER_FACE*CONCURRENT_FACES)"), Define( "COALESCING_THREAD_COUNT", "(THREAD_COUNT < 0x10 ? THREAD_COUNT : THREAD_COUNT & ~0xf)"), Line(), Define("DATA_BLOCK_SIZE", fdata.block_bytes), Define("ALIGNED_FACE_DOFS_PER_MB", fplan.aligned_face_dofs_per_microblock()), Define("ALIGNED_FACE_DOFS_PER_BLOCK", "(ALIGNED_FACE_DOFS_PER_MB*BLOCK_MB_COUNT)"), Line(), Define("FOF_BLOCK_BASE", "(blockIdx.x*ALIGNED_FACE_DOFS_PER_BLOCK)"), Line(), ] + ilist_data.code + [ Line(), Value("texture<index_list_entry_t, 1, cudaReadModeElementType>", "tex_index_lists"), Line(), fdata.struct, Line(), CudaShared(Value("flux_data", "data")), ]) if not fplan.direct_store: cmod.extend([ CudaShared( ArrayOf( ArrayOf(POD(float_type, "smem_fluxes_on_faces"), "FLUX_COUNT"), "ALIGNED_FACE_DOFS_PER_MB*BLOCK_MB_COUNT")), Line(), ]) S = Statement f_body = Block() from hedge.backends.cuda.tools import get_load_code f_body.extend( get_load_code(dest="&data", base="gmem_facedata + blockIdx.x*DATA_BLOCK_SIZE", bytes="sizeof(flux_data)", descr="load face_pair data") + [S("__syncthreads()"), Line()]) def get_flux_code(flux_writer): flux_code = Block([]) flux_code.extend([ Initializer(Pointer(Value("face_pair", "fpair")), "data.facepairs+fpair_nr"), Initializer( MaybeUnused(POD(numpy.uint32, "a_index")), "fpair->a_base + tex1Dfetch(tex_index_lists, " "fpair->a_ilist_index + FACEDOF_NR)"), Initializer( MaybeUnused(POD(numpy.uint32, "b_index")), "fpair->b_base + tex1Dfetch(tex_index_lists, " "fpair->b_ilist_index + FACEDOF_NR)"), Line(), flux_writer(), Line(), S("fpair_nr += CONCURRENT_FACES") ]) return flux_code flux_computation = Block([ Comment("fluxes for dual-sided (intra-block) interior face pairs"), While("fpair_nr < data.header.same_facepairs_end", get_flux_code(lambda: self.write_interior_flux_code(True))), Line(), Comment("work around nvcc assertion failure"), S("fpair_nr+=1"), S("fpair_nr-=1"), Line(), Comment( "fluxes for single-sided (inter-block) interior face pairs"), While("fpair_nr < data.header.diff_facepairs_end", get_flux_code(lambda: self.write_interior_flux_code(False))), Line(), Comment("fluxes for single-sided boundary face pairs"), While( "fpair_nr < data.header.bdry_facepairs_end", get_flux_code( lambda: self.write_boundary_flux_code(for_benchmark))), ]) f_body.extend_log_block("compute the fluxes", [ Initializer(POD(numpy.uint32, "fpair_nr"), "BLOCK_FACE"), If("FACEDOF_NR < DOFS_PER_FACE", flux_computation) ]) if not fplan.direct_store: f_body.extend([Line(), S("__syncthreads()"), Line()]) f_body.extend_log_block( "store fluxes", [ #Assign("debugbuf[blockIdx.x]", "FOF_BLOCK_BASE"), #Assign("debugbuf[0]", "FOF_BLOCK_BASE"), #Assign("debugbuf[0]", "sizeof(face_pair)"), For( "unsigned word_nr = THREAD_NUM", "word_nr < ALIGNED_FACE_DOFS_PER_MB*BLOCK_MB_COUNT", "word_nr += COALESCING_THREAD_COUNT", Block([ Assign( "gmem_fluxes_on_faces%d[FOF_BLOCK_BASE+word_nr]" % flux_nr, "smem_fluxes_on_faces[%d][word_nr]" % flux_nr) for flux_nr in range(len(self.fluxes)) ] #+[If("isnan(smem_fluxes_on_faces[%d][word_nr])" % flux_nr, #Block([ #Assign("debugbuf[blockIdx.x]", "word_nr"), #]) #) #for flux_nr in range(len(self.fluxes))] )) ]) if False: f_body.extend([ Assign("debugbuf[blockIdx.x*96+32+BLOCK_FACE*32+threadIdx.x]", "fpair_nr"), Assign("debugbuf[blockIdx.x*96+16]", "data.header.same_facepairs_end"), Assign("debugbuf[blockIdx.x*96+17]", "data.header.diff_facepairs_end"), Assign("debugbuf[blockIdx.x*96+18]", "data.header.bdry_facepairs_end"), ]) # finish off ---------------------------------------------------------- cmod.append(FunctionBody(f_decl, f_body)) if not for_benchmark and "cuda_dump_kernels" in discr.debug: from hedge.tools import open_unique_debug_file open_unique_debug_file("flux_gather", ".cu").write(str(cmod)) #from pycuda.tools import allow_user_edit mod = SourceModule( #allow_user_edit(cmod, "kernel.cu", "the flux kernel"), cmod, keep="cuda_keep_kernels" in discr.debug) expr_to_texture_map = dict( (dep_expr, mod.get_texref("field%d_tex" % self.dep_to_index[dep_expr])) for dep_expr in self.all_deps) index_list_texref = mod.get_texref("tex_index_lists") index_list_texref.set_address(ilist_data.device_memory, ilist_data.bytes) index_list_texref.set_format( cuda.dtype_to_array_format(ilist_data.type), 1) index_list_texref.set_flags(cuda.TRSF_READ_AS_INTEGER) func = mod.get_function("apply_flux") block = (fplan.threads_per_face(), fplan.parallel_faces, 1) func.prepare( (2 + len(self.fluxes)) * "P", texrefs=expr_to_texture_map.values() + [index_list_texref]) if "cuda_flux" in discr.debug: print "flux: lmem=%d smem=%d regs=%d" % ( func.local_size_bytes, func.shared_size_bytes, func.num_regs) return block, func, expr_to_texture_map
def fake_flux_face_data_block(self, block_count): discr = self.discr given = self.plan.given fh_struct = flux_header_struct(given.float_type, discr.dimensions) fp_struct = face_pair_struct(given.float_type, discr.dimensions) min_headers = [] min_fp_blocks = [] from random import randrange, choice face_dofs = self.plan.dofs_per_face() mp_count = discr.device.get_attribute( cuda.device_attribute.MULTIPROCESSOR_COUNT) # FIXME assert False, "flux planning in the presence of quadrature needs to be fixed" for block_nr in range(mp_count): fp_structs = [] faces = [(mb_nr, mb_el_nr, face_nr) for mb_nr in range(self.plan.microblocks_per_block()) for mb_el_nr in range(given.microblock.elements) for face_nr in range(self.plan.faces_per_el())] def draw_base(): mb_nr, mb_el_nr, face_nr = choice(faces) return (block_nr * given.microblock.aligned_floats * self.plan.microblocks_per_block() + mb_nr * given.microblock.aligned_floats + mb_el_nr * given.dofs_per_el()) def draw_dest(): mb_nr, mb_el_nr, face_nr = choice(faces) return (mb_nr * given.aligned_face_dofs_per_microblock() + mb_el_nr * face_dofs * given.faces_per_el() + face_nr * face_dofs) def bound_int(low, x, hi): return int(min(max(low, x), hi)) from random import gauss pdata = self.plan.partition_data fp_count = bound_int( 0, gauss(pdata.face_pair_avg, (pdata.max_face_pair_count - pdata.face_pair_avg) / 2), pdata.max_face_pair_count) for i in range(fp_count): fp_structs.append( fp_struct.make( h=0.5, order=2, face_jacobian=0.5, normal=discr.dimensions * [0.1], a_base=draw_base(), b_base=draw_base(), a_ilist_index=randrange(self.FAKE_INDEX_LIST_COUNT) * face_dofs, b_ilist_index=randrange(self.FAKE_INDEX_LIST_COUNT) * face_dofs, boundary_bitmap=1, b_write_ilist_index=randrange( self.FAKE_INDEX_LIST_COUNT) * face_dofs, a_dest=draw_dest(), b_dest=draw_dest())) total_ext_face_count = bound_int( 0, pdata.ext_face_avg + randrange(-1, 2), fp_count) bdry_count = min( total_ext_face_count, randrange(1 + int(round(total_ext_face_count / 6)))) diff_count = total_ext_face_count - bdry_count min_headers.append( fh_struct.make(same_facepairs_end=len(fp_structs) - total_ext_face_count, diff_facepairs_end=diff_count, bdry_facepairs_end=bdry_count)) min_fp_blocks.append(fp_structs) dups = block_count // mp_count + 1 headers = (min_headers * dups)[:block_count] fp_blocks = (min_fp_blocks * dups)[:block_count] from cgen import Value from hedge.backends.cuda.tools import make_superblocks return make_superblocks( given.devdata, "flux_data", [(headers, Value(fh_struct.tpname, "header"))], [(fp_blocks, Value(fp_struct.tpname, "facepairs"))])
def flux_face_data_block(self, elgroup): discr = self.discr given = self.plan.given fplan = self.plan headers = [] fp_blocks = [] INVALID_DEST = (1 << 16) - 1 from hedge.backends.cuda import GPUBoundaryFaceStorage fh_struct = flux_header_struct(given.float_type, discr.dimensions) fp_struct = face_pair_struct(given.float_type, discr.dimensions) def find_elface_dest(el_face): num_in_block = discr.find_number_in_block(el_face[0]) mb_index, index_in_mb = divmod(num_in_block, given.microblock.elements) return (mb_index * fplan.aligned_face_dofs_per_microblock() + index_in_mb * elface_dofs + el_face[1] * face_dofs) # {{{ quadrature setup, if necessary if fplan.quadrature_tag is not None: quad_info = discr.get_cuda_quadrature_info(fplan.quadrature_tag) eg_quad_info = discr.get_cuda_elgroup_quadrature_info( elgroup, fplan.quadrature_tag) ldis_quad_info = eg_quad_info.ldis_quad_info def find_el_src_index(el): block = discr.blocks[discr.partition[el.id]] mb_nr, in_mb_nr = divmod(block.el_number_map[el], given.microblock.elements) return ( block.number * fplan.input_dofs_per_block() + mb_nr * eg_quad_info.aligned_int_face_dofs_per_microblock + in_mb_nr * ldis_quad_info.face_node_count() * ldis_quad_info.ldis.face_count()) face_storage_map = quad_info.face_storage_info.map else: find_el_src_index = discr.find_el_gpu_index face_storage_map = discr.face_storage_info.map # }}} int_fp_count, ext_fp_count, bdry_fp_count = 0, 0, 0 for block_nr, block in enumerate(discr.blocks): ldis = block.local_discretization face_dofs = fplan.dofs_per_face elface_dofs = face_dofs * ldis.face_count() faces_todo = set((el, face_nbr) for mb in block.microblocks for el in mb for face_nbr in range(ldis.face_count())) same_fp_structs = [] diff_fp_structs = [] bdry_fp_structs = [] while faces_todo: elface = faces_todo.pop() a_face = face_storage_map[elface] b_face = a_face.opposite if isinstance(b_face, GPUBoundaryFaceStorage): # boundary face b_base = b_face.gpu_bdry_index_in_floats boundary_bitmap = self.executor.elface_to_bdry_bitmap.get( a_face.el_face, 0) b_write_index_list = 0 # doesn't matter b_dest = INVALID_DEST fp_structs = bdry_fp_structs bdry_fp_count += 1 else: # interior face b_base = find_el_src_index(b_face.el_face[0]) boundary_bitmap = 0 if b_face.native_block == a_face.native_block: # same block faces_todo.remove(b_face.el_face) b_write_index_list = a_face.ext_write_index_list_id b_dest = find_elface_dest(b_face.el_face) fp_structs = same_fp_structs int_fp_count += 1 else: # different block b_write_index_list = 0 # doesn't matter b_dest = INVALID_DEST fp_structs = diff_fp_structs ext_fp_count += 1 a_base = find_el_src_index(a_face.el_face[0]) a_dest = find_elface_dest(a_face.el_face) fp_structs.append( fp_struct.make( h=a_face.face_pair_side.h, order=a_face.face_pair_side.order, face_jacobian=a_face.face_pair_side.face_jacobian, normal=a_face.face_pair_side.normal, a_base=a_base, b_base=b_base, a_ilist_index= \ a_face.global_int_flux_index_list_id*face_dofs, b_ilist_index= \ a_face.global_ext_flux_index_list_id*face_dofs, boundary_bitmap=boundary_bitmap, b_write_ilist_index= \ b_write_index_list*face_dofs, a_dest=a_dest, b_dest=b_dest )) headers.append(fh_struct.make( same_facepairs_end=\ len(same_fp_structs), diff_facepairs_end=\ len(same_fp_structs)+len(diff_fp_structs), bdry_facepairs_end=\ len(same_fp_structs)+len(diff_fp_structs) +len(bdry_fp_structs), )) fp_blocks.append(same_fp_structs + diff_fp_structs + bdry_fp_structs) #print len(same_fp_structs), len(diff_fp_structs), len(bdry_fp_structs) from cgen import Value from hedge.backends.cuda.tools import make_superblocks return make_superblocks( given.devdata, "flux_data", [(headers, Value(fh_struct.tpname, "header"))], [(fp_blocks, Value(fp_struct.tpname, "facepairs"))], extra_fields={ "int_fp_count": int_fp_count, "ext_fp_count": ext_fp_count, "bdry_fp_count": bdry_fp_count, "fp_count": int_fp_count + ext_fp_count + bdry_fp_count, })