def test_ifdef(): expected = """#ifdef SOME_DEFINE /* TRUE */ #else /* FALSE */ #endif""" code = IfDef("SOME_DEFINE", [Comment("TRUE")], [Comment("FALSE")]) assert str(code) == expected
def test_ifndef_no_else(): expected = """#ifndef SOME_DEFINE /* TRUE */ #endif""" code = IfNDef("SOME_DEFINE", [Comment("TRUE")]) assert str(code) == expected
def get_load_code(dest, base, bytes, word_type=numpy.uint32, descr=None): from cgen import ( Pointer, POD, Comment, Block, Line, \ Constant, For, Statement) from cgen import dtype_to_ctype copy_dtype = numpy.dtype(word_type) copy_dtype_str = dtype_to_ctype(copy_dtype) code = [] if descr is not None: code.append(Comment(descr)) code.extend([ Block([ Constant(Pointer(POD(copy_dtype, "load_base")), ("(%s *) (%s)" % (copy_dtype_str, base))), For("unsigned word_nr = THREAD_NUM", "word_nr*sizeof(int) < (%s)" % bytes, "word_nr += COALESCING_THREAD_COUNT", Statement("((%s *) (%s))[word_nr] = load_base[word_nr]" % (copy_dtype_str, dest)) ), ]), Line(), ]) return code
def get_cpu_per_element_code(self): from cgen import (Value, Statement, Initializer, While, Comment, Block, For, Line, Pointer) S = Statement return [ # assumes there is more than one coefficient Initializer(Value("cit_type", "el_modes"), "field_it+er.first"), Line(), Comment("zero out reduced_modes"), For("npy_uint32 mode_idx = 0", "mode_idx < max_degree+1", "++mode_idx", S("reduced_modes[mode_idx] = 0")), Line(), Comment("gather modes by degree"), For("npy_uint32 mode_idx = 0", "mode_idx < mode_count", "++mode_idx", S("reduced_modes[mode_degrees_iterator[mode_idx]]" " += el_modes[mode_idx]")), Line(), Comment("perform skyline procedure"), Initializer(Pointer(Value("value_type", "start")), "reduced_modes.get()"), Initializer(Pointer(Value("value_type", "end")), "start+max_degree+1"), Initializer(Value("value_type", "cur_max"), "std::max(*(end-1), *(end-2))"), Line(), While("end != start", Block([ S("--end"), S("*end = std::max(cur_max, *end)"), ])), Line(), Comment("scatter modes by degree"), Initializer(Value("it_type", "tgt_base"), "result_it+er.first"), For("npy_uint32 mode_idx = 0", "mode_idx < mode_count", "++mode_idx", S("tgt_base[mode_idx] = " "reduced_modes[mode_degrees_iterator[mode_idx]]")), ]
def test_cgen(): s = Struct( "yuck", [ POD( np.float32, "h", ), POD(np.float32, "order"), POD(np.float32, "face_jacobian"), ArrayOf(POD(np.float32, "normal"), 17), POD(np.uint16, "a_base"), POD(np.uint16, "b_base"), #CudaGlobal(POD(np.uint8, "a_ilist_number")), POD(np.uint8, "b_ilist_number"), POD(np.uint8, "bdry_flux_number"), # 0 if not on boundary POD(np.uint8, "reserved"), POD(np.uint32, "b_global_base"), ]) f_decl = FunctionDeclaration(POD(np.uint16, "get_num"), [ POD(np.uint8, "reserved"), POD(np.uint32, "b_global_base"), ]) f_body = FunctionBody( f_decl, Block([ POD(np.uint32, "i"), For( "i = 0", "i < 17", "++i", If( "a > b", Assign("a", "b"), Block([ Assign("a", "b-1"), #Break(), ])), ), #BlankLine(), Comment("all done"), ])) t_decl = Template( 'typename T', FunctionDeclaration( Value('CUdeviceptr', 'scan'), [Value('CUdeviceptr', 'inputPtr'), Value('int', 'length')])) print(s) print(f_body) print(t_decl)
def emit_barrier(self, kind, comment): from cgen import Comment, Statement assert comment if kind == "local": return Comment("local barrier: %s" % comment) elif kind == "global": return Statement("sync; /* %s */" % comment) else: raise LoopyError("unknown barrier kind")
def emit_comment(self, s): from cgen import Comment return Comment(s)
def get_scalar_diff_code(): code = [] for inl in range(par.inline): for axis in dims: code.append( Initializer(POD(float_type, "d%drst%d" % (inl, axis)), 0)) code.append(Line()) tex_channels = ["x", "y", "z", "w"] store_code = Block() for inl in range(par.inline): for rst_axis in dims: store_code.append(Assign( "drst%d_global[GLOBAL_MB_IMAGE_DOF_BASE + " "%d*ALIGNED_IMAGE_DOFS_PER_MB + IMAGE_MB_DOF]" % (rst_axis, inl), "d%drst%d" % (inl, rst_axis) )) from hedge.backends.cuda.tools import unroll code.extend([ Comment("everybody needs to be done with the old data"), S("__syncthreads()"), Line(), get_load_code(), Line(), Comment("all the new data must be loaded"), S("__syncthreads()"), Line(), ]) if float_type == numpy.float32: code.append(Value("float%d" % rst_channels, "dmat_entries")) code.extend([ POD(float_type, "field_value%d" % inl) for inl in range(par.inline) ]+[Line()]) def unroll_body(j): result = [ Assign("field_value%d" % inl, "smem_field[PAR_MB_NR][%d][mb_el*PREIMAGE_DOFS_PER_EL+%s]" % (inl, j)) for inl in range(par.inline) ] if float_type == numpy.float32: result.append(Assign("dmat_entries", "tex1Dfetch(diff_rst_mat_tex, IMAGE_EL_DOF + %s*IMAGE_DOFS_PER_EL)" % j)) result.extend( S("d%drst%d += dmat_entries.%s * field_value%d" % (inl, axis, tex_channels[axis], inl)) for inl in range(par.inline) for axis in dims) elif float_type == numpy.float64: result.extend( S("d%(inl)drst%(axis)d += " "fp_tex1Dfetch(diff_rst_mat_tex, %(axis)d " "+ DIMENSIONS*(IMAGE_EL_DOF + %(j)d*IMAGE_DOFS_PER_EL))" "* field_value%(inl)d" % { "inl": inl, "axis": axis, "j": j }) for inl in range(par.inline) for axis in dims) else: assert False return result code.append(If("IMAGE_MB_DOF < IMAGE_DOFS_PER_MB", Block(unroll(unroll_body, total_number=plan.preimage_dofs_per_el) +[store_code]))) return code
def get_kernel(self, fdata, ilist_data, for_benchmark): from cgen.cuda import CudaShared, CudaGlobal from pycuda.tools import dtype_to_ctype discr = self.discr given = self.plan.given fplan = self.plan d = discr.dimensions dims = range(d) elgroup, = discr.element_groups float_type = given.float_type f_decl = CudaGlobal( FunctionDeclaration(Value("void", "apply_flux"), [ Pointer(POD(float_type, "debugbuf")), Pointer(POD(numpy.uint8, "gmem_facedata")), ] + [ Pointer(POD(float_type, "gmem_fluxes_on_faces%d" % flux_nr)) for flux_nr in range(len(self.fluxes)) ])) cmod = Module() cmod.append(Include("pycuda-helpers.hpp")) for dep_expr in self.all_deps: cmod.extend([ Value( "texture<%s, 1, cudaReadModeElementType>" % dtype_to_ctype(float_type, with_fp_tex_hack=True), "field%d_tex" % self.dep_to_index[dep_expr]) ]) if fplan.flux_count != len(self.fluxes): from warnings import warn warn( "Flux count in flux execution plan different from actual flux count.\n" "You may want to specify the tune_for= kwarg in the Discretization\n" "constructor.") cmod.extend([ Line(), Typedef(POD(float_type, "value_type")), Line(), flux_header_struct(float_type, discr.dimensions), Line(), face_pair_struct(float_type, discr.dimensions), Line(), Define("DIMENSIONS", discr.dimensions), Define("DOFS_PER_FACE", fplan.dofs_per_face), Define("THREADS_PER_FACE", fplan.threads_per_face()), Line(), Define("CONCURRENT_FACES", fplan.parallel_faces), Define("BLOCK_MB_COUNT", fplan.mbs_per_block), Line(), Define("FACEDOF_NR", "threadIdx.x"), Define("BLOCK_FACE", "threadIdx.y"), Line(), Define("FLUX_COUNT", len(self.fluxes)), Line(), Define("THREAD_NUM", "(FACEDOF_NR + BLOCK_FACE*THREADS_PER_FACE)"), Define("THREAD_COUNT", "(THREADS_PER_FACE*CONCURRENT_FACES)"), Define( "COALESCING_THREAD_COUNT", "(THREAD_COUNT < 0x10 ? THREAD_COUNT : THREAD_COUNT & ~0xf)"), Line(), Define("DATA_BLOCK_SIZE", fdata.block_bytes), Define("ALIGNED_FACE_DOFS_PER_MB", fplan.aligned_face_dofs_per_microblock()), Define("ALIGNED_FACE_DOFS_PER_BLOCK", "(ALIGNED_FACE_DOFS_PER_MB*BLOCK_MB_COUNT)"), Line(), Define("FOF_BLOCK_BASE", "(blockIdx.x*ALIGNED_FACE_DOFS_PER_BLOCK)"), Line(), ] + ilist_data.code + [ Line(), Value("texture<index_list_entry_t, 1, cudaReadModeElementType>", "tex_index_lists"), Line(), fdata.struct, Line(), CudaShared(Value("flux_data", "data")), ]) if not fplan.direct_store: cmod.extend([ CudaShared( ArrayOf( ArrayOf(POD(float_type, "smem_fluxes_on_faces"), "FLUX_COUNT"), "ALIGNED_FACE_DOFS_PER_MB*BLOCK_MB_COUNT")), Line(), ]) S = Statement f_body = Block() from hedge.backends.cuda.tools import get_load_code f_body.extend( get_load_code(dest="&data", base="gmem_facedata + blockIdx.x*DATA_BLOCK_SIZE", bytes="sizeof(flux_data)", descr="load face_pair data") + [S("__syncthreads()"), Line()]) def get_flux_code(flux_writer): flux_code = Block([]) flux_code.extend([ Initializer(Pointer(Value("face_pair", "fpair")), "data.facepairs+fpair_nr"), Initializer( MaybeUnused(POD(numpy.uint32, "a_index")), "fpair->a_base + tex1Dfetch(tex_index_lists, " "fpair->a_ilist_index + FACEDOF_NR)"), Initializer( MaybeUnused(POD(numpy.uint32, "b_index")), "fpair->b_base + tex1Dfetch(tex_index_lists, " "fpair->b_ilist_index + FACEDOF_NR)"), Line(), flux_writer(), Line(), S("fpair_nr += CONCURRENT_FACES") ]) return flux_code flux_computation = Block([ Comment("fluxes for dual-sided (intra-block) interior face pairs"), While("fpair_nr < data.header.same_facepairs_end", get_flux_code(lambda: self.write_interior_flux_code(True))), Line(), Comment("work around nvcc assertion failure"), S("fpair_nr+=1"), S("fpair_nr-=1"), Line(), Comment( "fluxes for single-sided (inter-block) interior face pairs"), While("fpair_nr < data.header.diff_facepairs_end", get_flux_code(lambda: self.write_interior_flux_code(False))), Line(), Comment("fluxes for single-sided boundary face pairs"), While( "fpair_nr < data.header.bdry_facepairs_end", get_flux_code( lambda: self.write_boundary_flux_code(for_benchmark))), ]) f_body.extend_log_block("compute the fluxes", [ Initializer(POD(numpy.uint32, "fpair_nr"), "BLOCK_FACE"), If("FACEDOF_NR < DOFS_PER_FACE", flux_computation) ]) if not fplan.direct_store: f_body.extend([Line(), S("__syncthreads()"), Line()]) f_body.extend_log_block( "store fluxes", [ #Assign("debugbuf[blockIdx.x]", "FOF_BLOCK_BASE"), #Assign("debugbuf[0]", "FOF_BLOCK_BASE"), #Assign("debugbuf[0]", "sizeof(face_pair)"), For( "unsigned word_nr = THREAD_NUM", "word_nr < ALIGNED_FACE_DOFS_PER_MB*BLOCK_MB_COUNT", "word_nr += COALESCING_THREAD_COUNT", Block([ Assign( "gmem_fluxes_on_faces%d[FOF_BLOCK_BASE+word_nr]" % flux_nr, "smem_fluxes_on_faces[%d][word_nr]" % flux_nr) for flux_nr in range(len(self.fluxes)) ] #+[If("isnan(smem_fluxes_on_faces[%d][word_nr])" % flux_nr, #Block([ #Assign("debugbuf[blockIdx.x]", "word_nr"), #]) #) #for flux_nr in range(len(self.fluxes))] )) ]) if False: f_body.extend([ Assign("debugbuf[blockIdx.x*96+32+BLOCK_FACE*32+threadIdx.x]", "fpair_nr"), Assign("debugbuf[blockIdx.x*96+16]", "data.header.same_facepairs_end"), Assign("debugbuf[blockIdx.x*96+17]", "data.header.diff_facepairs_end"), Assign("debugbuf[blockIdx.x*96+18]", "data.header.bdry_facepairs_end"), ]) # finish off ---------------------------------------------------------- cmod.append(FunctionBody(f_decl, f_body)) if not for_benchmark and "cuda_dump_kernels" in discr.debug: from hedge.tools import open_unique_debug_file open_unique_debug_file("flux_gather", ".cu").write(str(cmod)) #from pycuda.tools import allow_user_edit mod = SourceModule( #allow_user_edit(cmod, "kernel.cu", "the flux kernel"), cmod, keep="cuda_keep_kernels" in discr.debug) expr_to_texture_map = dict( (dep_expr, mod.get_texref("field%d_tex" % self.dep_to_index[dep_expr])) for dep_expr in self.all_deps) index_list_texref = mod.get_texref("tex_index_lists") index_list_texref.set_address(ilist_data.device_memory, ilist_data.bytes) index_list_texref.set_format( cuda.dtype_to_array_format(ilist_data.type), 1) index_list_texref.set_flags(cuda.TRSF_READ_AS_INTEGER) func = mod.get_function("apply_flux") block = (fplan.threads_per_face(), fplan.parallel_faces, 1) func.prepare( (2 + len(self.fluxes)) * "P", texrefs=expr_to_texture_map.values() + [index_list_texref]) if "cuda_flux" in discr.debug: print "flux: lmem=%d smem=%d regs=%d" % ( func.local_size_bytes, func.shared_size_bytes, func.num_regs) return block, func, expr_to_texture_map
def write_boundary_flux_code(self, for_benchmark): given = self.plan.given flux_write_code = Block() fluxes_by_bdry_number = {} for flux_nr, wdflux in enumerate(self.fluxes): for bflux_info in wdflux.boundaries: if for_benchmark: bdry_number = 0 else: bdry_number = self.executor.boundary_tag_to_number[ bflux_info.bpair.tag] fluxes_by_bdry_number.setdefault(bdry_number, [])\ .append((flux_nr, bflux_info)) flux_write_code.extend([ Initializer(MaybeUnused(POD(given.float_type, "flux%d" % flux_nr)), 0) for flux_nr in range(len(self.fluxes)) ]) for bdry_number, nrs_and_fluxes in fluxes_by_bdry_number.iteritems(): bblock = [] from pytools import set_sum int_deps = set_sum(flux_rec.int_dependencies for flux_nr, flux_rec in nrs_and_fluxes) ext_deps = set_sum(flux_rec.ext_dependencies for flux_nr, flux_rec in nrs_and_fluxes) for dep in int_deps: bblock.extend([ Comment(str(dep)), Initializer( MaybeUnused( POD(given.float_type, "val_a_field%d" % self.dep_to_index[dep])), "fp_tex1Dfetch(field%d_tex, a_index)" % self.dep_to_index[dep]) ]) for dep in ext_deps: bblock.extend([ Comment(str(dep)), Initializer( MaybeUnused( POD(given.float_type, "val_b_field%d" % self.dep_to_index[dep])), "fp_tex1Dfetch(field%s_tex, b_index)" % self.dep_to_index[dep]) ]) f2cm = FluxToCodeMapper(given.float_type) comp_code = [Line()] for flux_nr, flux_rec in nrs_and_fluxes: comp_code.append( Statement( ("flux%d += " % flux_nr) + flux_to_code(f2cm, is_flipped=False, int_field_expr=flux_rec.bpair.field, ext_field_expr=flux_rec.bpair.bfield, dep_to_index=self.dep_to_index, flux=flux_rec.flux_expr, prec=PREC_NONE))) if f2cm.cse_name_list: bblock.append(Line()) bblock.extend( Initializer(Value("value_type", cse_name), cse_str) for cse_name, cse_str in f2cm.cse_name_list) flux_write_code.extend([ Line(), Comment(nrs_and_fluxes[0][1].bpair.tag), If("(fpair->boundary_bitmap) & (1 << %d)" % (bdry_number), Block(bblock + comp_code)), ]) flux_write_code.extend( [ Line(), ] + [ self.gen_store(flux_nr, "fpair->a_dest+FACEDOF_NR", "fpair->face_jacobian * flux%d" % flux_nr) for flux_nr in range(len(self.fluxes)) ] #Assign("debugbuf[blockIdx.x*96+fpair_nr]", "10000+fpair->a_dest"), ) return flux_write_code
def write_interior_flux_code(self, is_twosided): given = self.plan.given def get_field(flux_rec, is_interior, flipped): if is_interior ^ flipped: prefix = "a" else: prefix = "b" return ("val_%s_field%d" % (prefix, self.dep_to_index[flux_rec.field_expr])) flux_write_code = Block([]) flux_var_decl = [Initializer(POD(given.float_type, "a_flux"), 0)] if is_twosided: flux_var_decl.append( Initializer(POD(given.float_type, "b_flux"), 0)) prefixes = ["a", "b"] flip_values = [False, True] else: prefixes = ["a"] flip_values = [False] flux_write_code.append(Line()) for dep in self.interior_deps: flux_write_code.append(Comment(str(dep))) for side in ["a", "b"]: flux_write_code.append( Initializer( MaybeUnused( POD( given.float_type, "val_%s_field%d" % (side, self.dep_to_index[dep]))), "fp_tex1Dfetch(field%d_tex, %s_index)" % (self.dep_to_index[dep], side))) f2cm = FluxToCodeMapper(given.float_type) flux_sub_codes = [] for flux_nr, wdflux in enumerate(self.fluxes): my_flux_block = Block(flux_var_decl) for int_rec in wdflux.interiors: for prefix, is_flipped in zip(prefixes, flip_values): my_flux_block.append( Statement("%s_flux += %s" % ( prefix, flux_to_code(f2cm, is_flipped, int_rec.field_expr, int_rec.field_expr, self.dep_to_index, int_rec.flux_expr, PREC_NONE), ))) my_flux_block.append(Line()) my_flux_block.append( self.gen_store(flux_nr, "fpair->a_dest+FACEDOF_NR", "fpair->face_jacobian*a_flux")) #my_flux_block.append( #Statement("if(isnan(val_b_field5)) debugbuf[blockIdx.x] = 1"), #) if is_twosided: my_flux_block.append( self.gen_store( flux_nr, "fpair->b_dest+tex1Dfetch(tex_index_lists, " "fpair->b_write_ilist_index + FACEDOF_NR)", "fpair->face_jacobian*b_flux")) #my_flux_block.append( #Assign("debugbuf[blockIdx.x*96+fpair_nr+8]", "10000+fpair->b_dest"), #) flux_sub_codes.append(my_flux_block) if f2cm.cse_name_list: flux_write_code.append(Line()) flux_write_code.extend( Initializer(Value("value_type", cse_name), cse_str) for cse_name, cse_str in f2cm.cse_name_list) flux_write_code.extend(flux_sub_codes) return flux_write_code