def test_cgen(): s = Struct( "yuck", [ POD( np.float32, "h", ), POD(np.float32, "order"), POD(np.float32, "face_jacobian"), ArrayOf(POD(np.float32, "normal"), 17), POD(np.uint16, "a_base"), POD(np.uint16, "b_base"), #CudaGlobal(POD(np.uint8, "a_ilist_number")), POD(np.uint8, "b_ilist_number"), POD(np.uint8, "bdry_flux_number"), # 0 if not on boundary POD(np.uint8, "reserved"), POD(np.uint32, "b_global_base"), ]) u = Union( "yuck", [ POD( np.float32, "h", ), POD(np.float32, "order"), POD(np.float32, "face_jacobian"), ArrayOf(POD(np.float32, "normal"), 17), POD(np.uint16, "a_base"), POD(np.uint16, "b_base"), #CudaGlobal(POD(np.uint8, "a_ilist_number")), POD(np.uint8, "b_ilist_number"), POD(np.uint8, "bdry_flux_number"), # 0 if not on boundary POD(np.uint8, "reserved"), POD(np.uint32, "b_global_base"), ]) f_decl = FunctionDeclaration(POD(np.uint16, "get_num"), [ POD(np.uint8, "reserved"), POD(np.uint32, "b_global_base"), ]) f_body = FunctionBody( f_decl, Block([ POD(np.uint32, "i"), For( "i = 0", "i < 17", "++i", If( "a > b", Assign("a", "b"), Block([ Assign("a", "b-1"), #Break(), ])), ), #BlankLine(), Comment("all done"), ])) t_decl = Template( 'typename T', FunctionDeclaration( Value('CUdeviceptr', 'scan'), [Value('CUdeviceptr', 'inputPtr'), Value('int', 'length')])) print(s) print(u) print(f_body) print(t_decl)
def get_kernel(self, fdata, ilist_data, for_benchmark): from cgen.cuda import CudaShared, CudaGlobal from pycuda.tools import dtype_to_ctype discr = self.discr given = self.plan.given fplan = self.plan d = discr.dimensions dims = range(d) elgroup, = discr.element_groups float_type = given.float_type f_decl = CudaGlobal( FunctionDeclaration(Value("void", "apply_flux"), [ Pointer(POD(float_type, "debugbuf")), Pointer(POD(numpy.uint8, "gmem_facedata")), ] + [ Pointer(POD(float_type, "gmem_fluxes_on_faces%d" % flux_nr)) for flux_nr in range(len(self.fluxes)) ])) cmod = Module() cmod.append(Include("pycuda-helpers.hpp")) for dep_expr in self.all_deps: cmod.extend([ Value( "texture<%s, 1, cudaReadModeElementType>" % dtype_to_ctype(float_type, with_fp_tex_hack=True), "field%d_tex" % self.dep_to_index[dep_expr]) ]) if fplan.flux_count != len(self.fluxes): from warnings import warn warn( "Flux count in flux execution plan different from actual flux count.\n" "You may want to specify the tune_for= kwarg in the Discretization\n" "constructor.") cmod.extend([ Line(), Typedef(POD(float_type, "value_type")), Line(), flux_header_struct(float_type, discr.dimensions), Line(), face_pair_struct(float_type, discr.dimensions), Line(), Define("DIMENSIONS", discr.dimensions), Define("DOFS_PER_FACE", fplan.dofs_per_face), Define("THREADS_PER_FACE", fplan.threads_per_face()), Line(), Define("CONCURRENT_FACES", fplan.parallel_faces), Define("BLOCK_MB_COUNT", fplan.mbs_per_block), Line(), Define("FACEDOF_NR", "threadIdx.x"), Define("BLOCK_FACE", "threadIdx.y"), Line(), Define("FLUX_COUNT", len(self.fluxes)), Line(), Define("THREAD_NUM", "(FACEDOF_NR + BLOCK_FACE*THREADS_PER_FACE)"), Define("THREAD_COUNT", "(THREADS_PER_FACE*CONCURRENT_FACES)"), Define( "COALESCING_THREAD_COUNT", "(THREAD_COUNT < 0x10 ? THREAD_COUNT : THREAD_COUNT & ~0xf)"), Line(), Define("DATA_BLOCK_SIZE", fdata.block_bytes), Define("ALIGNED_FACE_DOFS_PER_MB", fplan.aligned_face_dofs_per_microblock()), Define("ALIGNED_FACE_DOFS_PER_BLOCK", "(ALIGNED_FACE_DOFS_PER_MB*BLOCK_MB_COUNT)"), Line(), Define("FOF_BLOCK_BASE", "(blockIdx.x*ALIGNED_FACE_DOFS_PER_BLOCK)"), Line(), ] + ilist_data.code + [ Line(), Value("texture<index_list_entry_t, 1, cudaReadModeElementType>", "tex_index_lists"), Line(), fdata.struct, Line(), CudaShared(Value("flux_data", "data")), ]) if not fplan.direct_store: cmod.extend([ CudaShared( ArrayOf( ArrayOf(POD(float_type, "smem_fluxes_on_faces"), "FLUX_COUNT"), "ALIGNED_FACE_DOFS_PER_MB*BLOCK_MB_COUNT")), Line(), ]) S = Statement f_body = Block() from hedge.backends.cuda.tools import get_load_code f_body.extend( get_load_code(dest="&data", base="gmem_facedata + blockIdx.x*DATA_BLOCK_SIZE", bytes="sizeof(flux_data)", descr="load face_pair data") + [S("__syncthreads()"), Line()]) def get_flux_code(flux_writer): flux_code = Block([]) flux_code.extend([ Initializer(Pointer(Value("face_pair", "fpair")), "data.facepairs+fpair_nr"), Initializer( MaybeUnused(POD(numpy.uint32, "a_index")), "fpair->a_base + tex1Dfetch(tex_index_lists, " "fpair->a_ilist_index + FACEDOF_NR)"), Initializer( MaybeUnused(POD(numpy.uint32, "b_index")), "fpair->b_base + tex1Dfetch(tex_index_lists, " "fpair->b_ilist_index + FACEDOF_NR)"), Line(), flux_writer(), Line(), S("fpair_nr += CONCURRENT_FACES") ]) return flux_code flux_computation = Block([ Comment("fluxes for dual-sided (intra-block) interior face pairs"), While("fpair_nr < data.header.same_facepairs_end", get_flux_code(lambda: self.write_interior_flux_code(True))), Line(), Comment("work around nvcc assertion failure"), S("fpair_nr+=1"), S("fpair_nr-=1"), Line(), Comment( "fluxes for single-sided (inter-block) interior face pairs"), While("fpair_nr < data.header.diff_facepairs_end", get_flux_code(lambda: self.write_interior_flux_code(False))), Line(), Comment("fluxes for single-sided boundary face pairs"), While( "fpair_nr < data.header.bdry_facepairs_end", get_flux_code( lambda: self.write_boundary_flux_code(for_benchmark))), ]) f_body.extend_log_block("compute the fluxes", [ Initializer(POD(numpy.uint32, "fpair_nr"), "BLOCK_FACE"), If("FACEDOF_NR < DOFS_PER_FACE", flux_computation) ]) if not fplan.direct_store: f_body.extend([Line(), S("__syncthreads()"), Line()]) f_body.extend_log_block( "store fluxes", [ #Assign("debugbuf[blockIdx.x]", "FOF_BLOCK_BASE"), #Assign("debugbuf[0]", "FOF_BLOCK_BASE"), #Assign("debugbuf[0]", "sizeof(face_pair)"), For( "unsigned word_nr = THREAD_NUM", "word_nr < ALIGNED_FACE_DOFS_PER_MB*BLOCK_MB_COUNT", "word_nr += COALESCING_THREAD_COUNT", Block([ Assign( "gmem_fluxes_on_faces%d[FOF_BLOCK_BASE+word_nr]" % flux_nr, "smem_fluxes_on_faces[%d][word_nr]" % flux_nr) for flux_nr in range(len(self.fluxes)) ] #+[If("isnan(smem_fluxes_on_faces[%d][word_nr])" % flux_nr, #Block([ #Assign("debugbuf[blockIdx.x]", "word_nr"), #]) #) #for flux_nr in range(len(self.fluxes))] )) ]) if False: f_body.extend([ Assign("debugbuf[blockIdx.x*96+32+BLOCK_FACE*32+threadIdx.x]", "fpair_nr"), Assign("debugbuf[blockIdx.x*96+16]", "data.header.same_facepairs_end"), Assign("debugbuf[blockIdx.x*96+17]", "data.header.diff_facepairs_end"), Assign("debugbuf[blockIdx.x*96+18]", "data.header.bdry_facepairs_end"), ]) # finish off ---------------------------------------------------------- cmod.append(FunctionBody(f_decl, f_body)) if not for_benchmark and "cuda_dump_kernels" in discr.debug: from hedge.tools import open_unique_debug_file open_unique_debug_file("flux_gather", ".cu").write(str(cmod)) #from pycuda.tools import allow_user_edit mod = SourceModule( #allow_user_edit(cmod, "kernel.cu", "the flux kernel"), cmod, keep="cuda_keep_kernels" in discr.debug) expr_to_texture_map = dict( (dep_expr, mod.get_texref("field%d_tex" % self.dep_to_index[dep_expr])) for dep_expr in self.all_deps) index_list_texref = mod.get_texref("tex_index_lists") index_list_texref.set_address(ilist_data.device_memory, ilist_data.bytes) index_list_texref.set_format( cuda.dtype_to_array_format(ilist_data.type), 1) index_list_texref.set_flags(cuda.TRSF_READ_AS_INTEGER) func = mod.get_function("apply_flux") block = (fplan.threads_per_face(), fplan.parallel_faces, 1) func.prepare( (2 + len(self.fluxes)) * "P", texrefs=expr_to_texture_map.values() + [index_list_texref]) if "cuda_flux" in discr.debug: print "flux: lmem=%d smem=%d regs=%d" % ( func.local_size_bytes, func.shared_size_bytes, func.num_regs) return block, func, expr_to_texture_map
def get_kernel(self, diff_op, elgroup, for_benchmark=False): from cgen import \ Pointer, POD, Value, ArrayOf, Const, \ Module, FunctionDeclaration, FunctionBody, Block, \ Comment, Line, Define, Include, \ Initializer, If, For, Statement, Assign from pycuda.tools import dtype_to_ctype from cgen.cuda import CudaShared, CudaGlobal discr = self.discr d = discr.dimensions dims = range(d) plan = self.plan given = plan.given elgroup, = discr.element_groups float_type = given.float_type f_decl = CudaGlobal(FunctionDeclaration(Value("void", "apply_diff_mat_smem"), [Pointer(POD(float_type, "debugbuf")), Pointer(POD(float_type, "field")), ] + [Pointer(POD(float_type, "drst%d_global" % i)) for i in dims] )) par = plan.parallelism cmod = Module([ Include("pycuda-helpers.hpp"), ]) if float_type == numpy.float64: cmod.append(Value("texture<fp_tex_double, 1, cudaReadModeElementType>", "diff_rst_mat_tex")) elif float_type == numpy.float32: rst_channels = given.devdata.make_valid_tex_channel_count(d) cmod.append(Value("texture<float%d, 1, cudaReadModeElementType>" % rst_channels, "diff_rst_mat_tex")) else: raise ValueError("unsupported float type: %s" % float_type) # only preimage size variation is supported here assert plan.image_dofs_per_el == given.dofs_per_el() assert plan.aligned_image_dofs_per_microblock == given.microblock.aligned_floats # FIXME: aligned_image_dofs_per_microblock must be divisible # by this, therefore hardcoding for now. chunk_size = 16 cmod.extend([ Line(), Define("DIMENSIONS", discr.dimensions), Define("IMAGE_DOFS_PER_EL", plan.image_dofs_per_el), Define("PREIMAGE_DOFS_PER_EL", plan.preimage_dofs_per_el), Define("ALIGNED_IMAGE_DOFS_PER_MB", plan.aligned_image_dofs_per_microblock), Define("ALIGNED_PREIMAGE_DOFS_PER_MB", plan.aligned_preimage_dofs_per_microblock), Define("ELS_PER_MB", given.microblock.elements), Define("IMAGE_DOFS_PER_MB", "(IMAGE_DOFS_PER_EL*ELS_PER_MB)"), Line(), Define("CHUNK_SIZE", chunk_size), Define("CHUNK_DOF", "threadIdx.x"), Define("PAR_MB_NR", "threadIdx.y"), Define("CHUNK_NR", "threadIdx.z"), Define("IMAGE_MB_DOF", "(CHUNK_NR*CHUNK_SIZE+CHUNK_DOF)"), Define("IMAGE_EL_DOF", "(IMAGE_MB_DOF - mb_el*IMAGE_DOFS_PER_EL)"), Line(), Define("MACROBLOCK_NR", "blockIdx.x"), Line(), Define("PAR_MB_COUNT", par.parallel), Define("INLINE_MB_COUNT", par.inline), Define("SEQ_MB_COUNT", par.serial), Line(), Define("GLOBAL_MB_NR_BASE", "(MACROBLOCK_NR*PAR_MB_COUNT*INLINE_MB_COUNT*SEQ_MB_COUNT)"), Define("GLOBAL_MB_NR", "(GLOBAL_MB_NR_BASE" "+ (seq_mb_number*PAR_MB_COUNT + PAR_MB_NR)*INLINE_MB_COUNT)"), Define("GLOBAL_MB_IMAGE_DOF_BASE", "(GLOBAL_MB_NR*ALIGNED_IMAGE_DOFS_PER_MB)"), Define("GLOBAL_MB_PREIMAGE_DOF_BASE", "(GLOBAL_MB_NR*ALIGNED_PREIMAGE_DOFS_PER_MB)"), Line(), CudaShared( ArrayOf( ArrayOf( ArrayOf( POD(float_type, "smem_field"), "PAR_MB_COUNT"), "INLINE_MB_COUNT"), "ALIGNED_PREIMAGE_DOFS_PER_MB")), Line(), ]) S = Statement f_body = Block([ Initializer(Const(POD(numpy.uint16, "mb_el")), "IMAGE_MB_DOF / IMAGE_DOFS_PER_EL"), Line(), ]) # --------------------------------------------------------------------- def get_load_code(): mb_img_dofs = plan.aligned_image_dofs_per_microblock mb_preimg_dofs = plan.aligned_preimage_dofs_per_microblock preimg_dofs_over_dofs = (mb_preimg_dofs+mb_img_dofs-1) // mb_img_dofs load_code = [] store_code = [] var_num = 0 for load_block in range(preimg_dofs_over_dofs): for inl in range(par.inline): # load and store are split for better pipelining # compiler can't figure that out because of branch var = "tmp%d" % var_num var_num += 1 load_code.append(POD(float_type, var)) block_addr = "%d * ALIGNED_IMAGE_DOFS_PER_MB + IMAGE_MB_DOF" % load_block load_instr = Assign(var, "field[GLOBAL_MB_PREIMAGE_DOF_BASE" " + %d*ALIGNED_PREIMAGE_DOFS_PER_MB" " + %s]" % (inl, block_addr)) store_instr = Assign( "smem_field[PAR_MB_NR][%d][%s]" % (inl, block_addr), var ) if (load_block+1)*mb_img_dofs >= mb_preimg_dofs: cond = "%s < ALIGNED_PREIMAGE_DOFS_PER_MB" % block_addr load_instr = If(cond, load_instr) store_instr = If(cond, store_instr) load_code.append(load_instr) store_code.append(store_instr) return Block(load_code + [Line()] + store_code) def get_scalar_diff_code(): code = [] for inl in range(par.inline): for axis in dims: code.append( Initializer(POD(float_type, "d%drst%d" % (inl, axis)), 0)) code.append(Line()) tex_channels = ["x", "y", "z", "w"] store_code = Block() for inl in range(par.inline): for rst_axis in dims: store_code.append(Assign( "drst%d_global[GLOBAL_MB_IMAGE_DOF_BASE + " "%d*ALIGNED_IMAGE_DOFS_PER_MB + IMAGE_MB_DOF]" % (rst_axis, inl), "d%drst%d" % (inl, rst_axis) )) from hedge.backends.cuda.tools import unroll code.extend([ Comment("everybody needs to be done with the old data"), S("__syncthreads()"), Line(), get_load_code(), Line(), Comment("all the new data must be loaded"), S("__syncthreads()"), Line(), ]) if float_type == numpy.float32: code.append(Value("float%d" % rst_channels, "dmat_entries")) code.extend([ POD(float_type, "field_value%d" % inl) for inl in range(par.inline) ]+[Line()]) def unroll_body(j): result = [ Assign("field_value%d" % inl, "smem_field[PAR_MB_NR][%d][mb_el*PREIMAGE_DOFS_PER_EL+%s]" % (inl, j)) for inl in range(par.inline) ] if float_type == numpy.float32: result.append(Assign("dmat_entries", "tex1Dfetch(diff_rst_mat_tex, IMAGE_EL_DOF + %s*IMAGE_DOFS_PER_EL)" % j)) result.extend( S("d%drst%d += dmat_entries.%s * field_value%d" % (inl, axis, tex_channels[axis], inl)) for inl in range(par.inline) for axis in dims) elif float_type == numpy.float64: result.extend( S("d%(inl)drst%(axis)d += " "fp_tex1Dfetch(diff_rst_mat_tex, %(axis)d " "+ DIMENSIONS*(IMAGE_EL_DOF + %(j)d*IMAGE_DOFS_PER_EL))" "* field_value%(inl)d" % { "inl": inl, "axis": axis, "j": j }) for inl in range(par.inline) for axis in dims) else: assert False return result code.append(If("IMAGE_MB_DOF < IMAGE_DOFS_PER_MB", Block(unroll(unroll_body, total_number=plan.preimage_dofs_per_el) +[store_code]))) return code f_body.extend([ For("unsigned short seq_mb_number = 0", "seq_mb_number < SEQ_MB_COUNT", "++seq_mb_number", Block(get_scalar_diff_code()) ) ]) # finish off ---------------------------------------------------------- cmod.append(FunctionBody(f_decl, f_body)) if not for_benchmark and "cuda_dump_kernels" in discr.debug: from hedge.tools import open_unique_debug_file open_unique_debug_file("diff", ".cu").write(str(cmod)) mod = SourceModule(cmod, keep="cuda_keep_kernels" in discr.debug, #options=["--maxrregcount=16"] ) func = mod.get_function("apply_diff_mat_smem") if "cuda_diff" in discr.debug: print "diff: lmem=%d smem=%d regs=%d" % ( func.local_size_bytes, func.shared_size_bytes, func.registers) diff_rst_mat_texref = mod.get_texref("diff_rst_mat_tex") gpu_diffmats = self.gpu_diffmats(diff_op, elgroup) if given.float_type == numpy.float32: gpu_diffmats.bind_to_texref_ext(diff_rst_mat_texref, rst_channels) elif given.float_type == numpy.float64: gpu_diffmats.bind_to_texref_ext(diff_rst_mat_texref, allow_double_hack=True) else: assert False assert given.microblock.aligned_floats % chunk_size == 0 block = ( chunk_size, plan.parallelism.parallel, given.microblock.aligned_floats//chunk_size) func.prepare( ["PP"] + discr.dimensions*["P"], texrefs=[diff_rst_mat_texref]) return block, func
def get_boundary_flux_mod(fluxes, fvi, discr, dtype): from cgen import \ FunctionDeclaration, FunctionBody, Typedef, Struct, \ Const, Reference, Value, POD, MaybeUnused, \ Statement, Include, Line, Block, Initializer, Assign, \ CustomLoop, For from pytools import to_uncomplex_dtype, flatten from codepy.bpl import BoostPythonModule mod = BoostPythonModule() mod.add_to_preamble([ Include("cstdlib"), Include("algorithm"), Line(), Include("boost/foreach.hpp"), Line(), Include("hedge/face_operators.hpp"), ]) S = Statement mod.add_to_module([ S("using namespace hedge"), S("using namespace pyublas"), Line(), Typedef(POD(dtype, "value_type")), Typedef(POD(to_uncomplex_dtype(dtype), "uncomplex_type")), ]) arg_struct = Struct("arg_struct", [ Value("numpy_array<value_type>", "flux%d_on_faces" % i) for i in range(len(fluxes)) ]+[ Value("numpy_array<value_type>", arg_name) for arg_name in fvi.arg_names ]) mod.add_struct(arg_struct, "ArgStruct") mod.add_to_module([Line()]) fdecl = FunctionDeclaration( Value("void", "gather_flux"), [ Const(Reference(Value("face_group<face_pair<straight_face> >" , "fg"))), Reference(Value("arg_struct", "args")) ]) from pymbolic.mapper.stringifier import PREC_PRODUCT def gen_flux_code(): f2cm = FluxToCodeMapper() result = [ Assign("fof%d_it[loc_fof_base+i]" % flux_idx, "uncomplex_type(fp.int_side.face_jacobian) * " + flux_to_code(f2cm, False, flux_idx, fvi, flux.op.flux, PREC_PRODUCT)) for flux_idx, flux in enumerate(fluxes) ] return [ Initializer(Value("value_type", cse_name), cse_str) for cse_name, cse_str in f2cm.cse_name_list] + result fbody = Block([ Initializer( Const(Value("numpy_array<value_type>::iterator", "fof%d_it" % i)), "args.flux%d_on_faces.begin()" % i) for i in range(len(fluxes)) ]+[ Initializer( Const(Value("numpy_array<value_type>::const_iterator", "%s_it" % arg_name)), "args.%s.begin()" % arg_name) for arg_name in fvi.arg_names ]+[ Line(), CustomLoop("BOOST_FOREACH(const face_pair<straight_face> &fp, fg.face_pairs)", Block( list(flatten([ Initializer(Value("node_number_t", "%s_ebi" % where), "fp.%s.el_base_index" % where), Initializer(Value("index_lists_t::const_iterator", "%s_idx_list" % where), "fg.index_list(fp.%s.face_index_list_number)" % where), Line(), ] for where in ["int_side", "ext_side"] ))+[ Line(), Initializer(Value("node_number_t", "loc_fof_base"), "fg.face_length()*(fp.%(where)s.local_el_number*fg.face_count" " + fp.%(where)s.face_id)" % {"where": "int_side"}), Line(), For( "unsigned i = 0", "i < fg.face_length()", "++i", Block( [ Initializer(MaybeUnused( Value("node_number_t", "%s_idx" % where)), "%(where)s_ebi + %(where)s_idx_list[i]" % {"where": where}) for where in ["int_side", "ext_side"] ]+gen_flux_code() ) ) ])) ]) mod.add_function(FunctionBody(fdecl, fbody)) #print "----------------------------------------------------------------" #print mod.generate() #raw_input("[Enter]") return mod.compile(get_flux_toolchain(discr, fluxes))
def create_native(self): from cgen import (ArrayOf, POD, Block, For, Statement, Struct) from cgen import dtype_to_ctype import numpy members = [] code = [] for pk, pv in config.parameters.iteritems(): if isinstance(pv, int): members.append(POD(numpy.int, pk)) code.append( Statement("params.%s = extract<%s>(cppdict[\"%s\"])" % (pk, dtype_to_ctype(numpy.int), pk))) elif isinstance(pv, float): members.append(POD(numpy.float64, pk)) code.append( Statement("params.%s = extract<%s>(cppdict[\"%s\"])" % (pk, dtype_to_ctype(numpy.float64), pk))) elif isinstance(pv, list): if isinstance(pv[0], int): members.append(ArrayOf(POD(numpy.int, pk), len(pv))) code.append( Block([ Statement("list v = extract<%s>(cppdict[\"%s\"])" % (list.__name__, pk)), For( "unsigned int i = 0", "i<len(v)", "++i", Statement("params.%s[i] = extract<%s>(v[i])" % (pk, dtype_to_ctype(numpy.int)))), ])) elif isinstance(pv[0], float): members.append(ArrayOf(POD(numpy.float64, pk), len(pv))) code.append( Block([ Statement("list v = extract<%s>(cppdict[\"%s\"])" % (list.__name__, pk)), For( "unsigned int i = 0", "i < len(v)", "++i", Block([ Statement( "params.%s[i] = extract<%s>(v[i])" % (pk, dtype_to_ctype(numpy.float64))), Statement( "//std::cout << params.%s[i] << std::endl" % (pk)) ])), ])) mystruct = Struct('Parameters', members) mycode = Block(code) # print mystruct # print mycode from jinja2 import Template tpl = Template(""" #include <boost/python.hpp> #include <boost/python/object.hpp> #include <boost/python/extract.hpp> #include <boost/python/list.hpp> #include <boost/python/dict.hpp> #include <boost/python/str.hpp> #include <stdexcept> #include <iostream> {{my_struct}} Parameters params; void CopyDictionary(boost::python::object pydict) { using namespace boost::python; extract< dict > cppdict_ext(pydict); if(!cppdict_ext.check()){ throw std::runtime_error( "PassObj::pass_dict: type error: not a python dict."); } dict cppdict = cppdict_ext(); list keylist = cppdict.keys(); {{my_extractor}} } BOOST_PYTHON_MODULE({{my_module}}) { boost::python::def("copy_dict", &CopyDictionary); } """) rendered_tpl = tpl.render(my_module="NativeParameters", my_extractor=mycode, my_struct=mystruct) # print rendered_tpl from codepy.toolchain import NVCCToolchain import codepy.toolchain kwargs = codepy.toolchain._guess_toolchain_kwargs_from_python_config() # print kwargs kwargs["cc"] = "nvcc" # kwargs["cflags"]=["-m64","-x","cu","-Xcompiler","-fPIC","-ccbin","/opt/local/bin/g++-mp-4.4"] kwargs["cflags"] = ["-m64", "-x", "cu", "-Xcompiler", "-fPIC"] kwargs["include_dirs"].append("/usr/local/cuda/include") kwargs["defines"] = [] kwargs["ldflags"] = ["-shared"] # kwargs["libraries"]=["python2.7"] kwargs["libraries"] = ["python2.6"] print kwargs toolchain = NVCCToolchain(**kwargs) from codepy.libraries import add_boost_python add_boost_python(toolchain) from codepy.jit import extension_from_string mymod = extension_from_string(toolchain, "NativeParameters", rendered_tpl) mymod.copy_dict(config.parameters)
def get_kernel(self, diff_op_cls, elgroup, for_benchmark=False): from cgen import \ Pointer, POD, Value, ArrayOf, \ Module, FunctionDeclaration, FunctionBody, Block, \ Line, Define, Include, \ Initializer, If, For, Statement, Assign from cgen import dtype_to_ctype from cgen.cuda import CudaShared, CudaGlobal discr = self.discr d = discr.dimensions dims = range(d) given = self.plan.given par = self.plan.parallelism diffmat_data = self.gpu_diffmats(diff_op_cls, elgroup) elgroup, = discr.element_groups float_type = given.float_type f_decl = CudaGlobal( FunctionDeclaration( Value("void", "apply_diff_mat"), [ Pointer(POD(numpy.uint8, "gmem_diff_rst_mat")), #Pointer(POD(float_type, "debugbuf")), ] + [Pointer(POD(float_type, "drst%d_global" % i)) for i in dims])) rst_channels = given.devdata.make_valid_tex_channel_count(d) cmod = Module([ Include("pycuda-helpers.hpp"), Line(), Value( "texture<fp_tex_%s, 1, cudaReadModeElementType>" % dtype_to_ctype(float_type), "field_tex"), Line(), Define("DIMENSIONS", discr.dimensions), Define("DOFS_PER_EL", given.dofs_per_el()), Line(), Define("SEGMENT_DOF", "threadIdx.x"), Define("PAR_MB_NR", "threadIdx.y"), Line(), Define("MB_SEGMENT", "blockIdx.x"), Define("MACROBLOCK_NR", "blockIdx.y"), Line(), Define("DOFS_PER_SEGMENT", self.plan.segment_size), Define("SEGMENTS_PER_MB", self.plan.segments_per_microblock()), Define("ALIGNED_DOFS_PER_MB", given.microblock.aligned_floats), Define("ELS_PER_MB", given.microblock.elements), Line(), Define("PAR_MB_COUNT", par.parallel), Define("INLINE_MB_COUNT", par.inline), Define("SEQ_MB_COUNT", par.serial), Line(), Define("THREAD_NUM", "(SEGMENT_DOF+PAR_MB_NR*DOFS_PER_SEGMENT)"), Define("COALESCING_THREAD_COUNT", "(PAR_MB_COUNT*DOFS_PER_SEGMENT)"), Line(), Define("MB_DOF_BASE", "(MB_SEGMENT*DOFS_PER_SEGMENT)"), Define("MB_DOF", "(MB_DOF_BASE+SEGMENT_DOF)"), Define( "GLOBAL_MB_NR_BASE", "(MACROBLOCK_NR*PAR_MB_COUNT*INLINE_MB_COUNT*SEQ_MB_COUNT)"), Define( "GLOBAL_MB_NR", "(GLOBAL_MB_NR_BASE" "+ (seq_mb_number*PAR_MB_COUNT + PAR_MB_NR)*INLINE_MB_COUNT)"), Define("GLOBAL_MB_DOF_BASE", "(GLOBAL_MB_NR*ALIGNED_DOFS_PER_MB)"), Line(), Define("DIFFMAT_SEGMENT_FLOATS", diffmat_data.block_floats), Define("DIFFMAT_SEGMENT_BYTES", "(DIFFMAT_SEGMENT_FLOATS*%d)" % given.float_size()), Define("DIFFMAT_COLUMNS", diffmat_data.matrix_columns), Line(), CudaShared( ArrayOf(POD(float_type, "smem_diff_rst_mat"), "DIFFMAT_COLUMNS*DOFS_PER_SEGMENT")), Line(), ]) S = Statement f_body = Block() f_body.extend_log_block("calculate responsibility data", [ Initializer(POD(numpy.uint16, "mb_el"), "MB_DOF/DOFS_PER_EL"), ]) from hedge.backends.cuda.tools import get_load_code f_body.extend( get_load_code( dest="smem_diff_rst_mat", base="gmem_diff_rst_mat + MB_SEGMENT*DIFFMAT_SEGMENT_BYTES", bytes="DIFFMAT_SEGMENT_BYTES", descr="load diff mat segment") + [S("__syncthreads()"), Line()]) # --------------------------------------------------------------------- def get_scalar_diff_code(): code = [] for inl in range(par.inline): for axis in dims: code.append( Initializer(POD(float_type, "d%drst%d" % (inl, axis)), 0)) code.append(Line()) def get_mat_entry(row, col, axis): return ("smem_diff_rst_mat[" "%(row)s*DIFFMAT_COLUMNS + %(axis)s*DOFS_PER_EL" " + %(col)s" "]" % { "row": row, "col": col, "axis": axis }) tex_channels = ["x", "y", "z", "w"] from hedge.backends.cuda.tools import unroll code.extend([ POD(float_type, "field_value%d" % inl) for inl in range(par.inline) ] + [Line()] + unroll( lambda j: [ Assign( "field_value%d" % inl, "fp_tex1Dfetch(field_tex, GLOBAL_MB_DOF_BASE + %d*ALIGNED_DOFS_PER_MB " "+ mb_el*DOFS_PER_EL + %s)" % (inl, j)) for inl in range(par.inline) ] + [Line()] + [ S("d%drst%d += %s * field_value%d" % (inl, axis, get_mat_entry("SEGMENT_DOF", j, axis), inl)) for axis in dims for inl in range(par.inline) ] + [Line()], given.dofs_per_el(), self.plan.max_unroll)) store_code = Block() for inl in range(par.inline): for rst_axis in dims: store_code.append( Assign( "drst%d_global[GLOBAL_MB_DOF_BASE" " + %d*ALIGNED_DOFS_PER_MB + MB_DOF]" % (rst_axis, inl), "d%drst%d" % (inl, rst_axis), )) code.append(If("MB_DOF < DOFS_PER_EL*ELS_PER_MB", store_code)) return code f_body.extend([ For("unsigned short seq_mb_number = 0", "seq_mb_number < SEQ_MB_COUNT", "++seq_mb_number", Block(get_scalar_diff_code())) ]) # finish off ---------------------------------------------------------- cmod.append(FunctionBody(f_decl, f_body)) if not for_benchmark and "cuda_dump_kernels" in discr.debug: from hedge.tools import open_unique_debug_file open_unique_debug_file("diff", ".cu").write(str(cmod)) mod = SourceModule( cmod, keep="cuda_keep_kernels" in discr.debug, #options=["--maxrregcount=10"] ) field_texref = mod.get_texref("field_tex") func = mod.get_function("apply_diff_mat") func.prepare(discr.dimensions * [float_type] + ["P"], block=(self.plan.segment_size, par.parallel, 1), texrefs=[field_texref]) if "cuda_diff" in discr.debug: print "diff: lmem=%d smem=%d regs=%d" % ( func.local_size_bytes, func.shared_size_bytes, func.num_regs) return func, field_texref
def make_lift(self, fgroup, with_scale, dtype): discr = self.discr from cgen import (FunctionDeclaration, FunctionBody, Typedef, Const, Reference, Value, POD, Statement, Include, Line, Block, Initializer, Assign, For, If, Define) from pytools import to_uncomplex_dtype from codepy.bpl import BoostPythonModule mod = BoostPythonModule() S = Statement mod.add_to_preamble([ Include("hedge/face_operators.hpp"), Include("hedge/volume_operators.hpp"), Include("boost/foreach.hpp"), ]) mod.add_to_module([ S("namespace ublas = boost::numeric::ublas"), S("using namespace hedge"), S("using namespace pyublas"), Line(), Define("DOFS_PER_EL", fgroup.ldis_loc.node_count()), Define("FACES_PER_EL", fgroup.ldis_loc.face_count()), Define("DIMENSIONS", discr.dimensions), Line(), Typedef(POD(dtype, "value_type")), Typedef(POD(to_uncomplex_dtype(dtype), "uncomplex_type")), ]) def if_(cond, result, else_=None): if cond: return [result] else: if else_ is None: return [] else: return [else_] fdecl = FunctionDeclaration(Value("void", "lift"), [ Const( Reference(Value("face_group<face_pair<straight_face> >", "fg"))), Value("ublas::matrix<uncomplex_type>", "matrix"), Value("numpy_array<value_type>", "field"), Value("numpy_array<value_type>", "result") ] + if_( with_scale, Const( Reference(Value("numpy_array<double>", "elwise_post_scaling"))))) def make_it(name, is_const=True, tpname="value_type"): if is_const: const = "const_" else: const = "" return Initializer( Value("numpy_array<%s>::%siterator" % (tpname, const), name + "_it"), "%s.begin()" % name) fbody = Block([ make_it("field"), make_it("result", is_const=False), ] + if_(with_scale, make_it("elwise_post_scaling", tpname="double")) + [ Line(), For( "unsigned fg_el_nr = 0", "fg_el_nr < fg.element_count()", "++fg_el_nr", Block([ Initializer(Value("node_number_t", "dest_el_base"), "fg.local_el_write_base[fg_el_nr]"), Initializer(Value("node_number_t", "src_el_base"), "FACES_PER_EL*fg.face_length()*fg_el_nr"), Line(), For( "unsigned i = 0", "i < DOFS_PER_EL", "++i", Block([ Initializer(Value("value_type", "tmp"), 0), Line(), For( "unsigned j = 0", "j < FACES_PER_EL*fg.face_length()", "++j", S("tmp += matrix(i, j)*field_it[src_el_base+j]" )), Line(), ] + if_( with_scale, Assign( "result_it[dest_el_base+i]", "tmp * value_type(*elwise_post_scaling_it)"), Assign("result_it[dest_el_base+i]", "tmp")))), ] + if_(with_scale, S("elwise_post_scaling_it++")))) ]) mod.add_function(FunctionBody(fdecl, fbody)) #print "----------------------------------------------------------------" #print FunctionBody(fdecl, fbody) #raw_input() return mod.compile(self.discr.toolchain).lift
def make_diff(self, elgroup, dtype, shape): """ :param shape: If non-square, the resulting code takes two element_ranges arguments and supports non-square matrices. """ from hedge._internal import UniformElementRanges assert isinstance(elgroup.ranges, UniformElementRanges) ldis = elgroup.local_discretization discr = self.discr from cgen import ( FunctionDeclaration, FunctionBody, Typedef, Const, Reference, Value, POD, Statement, Include, Line, Block, Initializer, Assign, For, If, Define) from pytools import to_uncomplex_dtype from codepy.bpl import BoostPythonModule mod = BoostPythonModule() # {{{ preamble S = Statement mod.add_to_preamble([ Include("hedge/volume_operators.hpp"), Include("boost/foreach.hpp"), ]) mod.add_to_module([ S("namespace ublas = boost::numeric::ublas"), S("using namespace hedge"), S("using namespace pyublas"), Line(), Define("ROW_COUNT", shape[0]), Define("COL_COUNT", shape[1]), Define("DIMENSIONS", discr.dimensions), Line(), Typedef(POD(dtype, "value_type")), Typedef(POD(to_uncomplex_dtype(dtype), "uncomplex_type")), ]) fdecl = FunctionDeclaration( Value("void", "diff"), [ Const(Reference(Value("uniform_element_ranges", "from_ers"))), Const(Reference(Value("uniform_element_ranges", "to_ers"))), Value("numpy_array<value_type>", "field") ]+[ Value("ublas::matrix<uncomplex_type>", "diffmat_rst%d" % rst) for rst in range(discr.dimensions) ]+[ Value("numpy_array<value_type>", "result%d" % i) for i in range(discr.dimensions) ] ) # }}} # {{{ set-up def make_it(name, is_const=True, tpname="value_type"): if is_const: const = "const_" else: const = "" return Initializer( Value("numpy_array<%s>::%siterator" % (tpname, const), name+"_it"), "%s.begin()" % name) fbody = Block([ If("ROW_COUNT != diffmat_rst%d.size1()" % i, S('throw(std::runtime_error("unexpected matrix size"))')) for i in range(discr.dimensions) ] + [ If("COL_COUNT != diffmat_rst%d.size2()" % i, S('throw(std::runtime_error("unexpected matrix size"))')) for i in range(discr.dimensions) ]+[ If("ROW_COUNT != to_ers.el_size()", S('throw(std::runtime_error("unsupported image element size"))')), If("COL_COUNT != from_ers.el_size()", S('throw(std::runtime_error("unsupported preimage element size"))')), If("from_ers.size() != to_ers.size()", S('throw(std::runtime_error("image and preimage element groups ' 'do nothave the same element count"))')), Line(), make_it("field"), ]+[ make_it("result%d" % i, is_const=False) for i in range(discr.dimensions) ]+[ Line(), # }}} # {{{ computation For("element_number_t eg_el_nr = 0", "eg_el_nr < to_ers.size()", "++eg_el_nr", Block([ Initializer( Value("node_number_t", "from_el_base"), "from_ers.start() + eg_el_nr*COL_COUNT"), Initializer( Value("node_number_t", "to_el_base"), "to_ers.start() + eg_el_nr*ROW_COUNT"), Line(), For("unsigned i = 0", "i < ROW_COUNT", "++i", Block([ Initializer(Value("value_type", "drst_%d" % rst), 0) for rst in range(discr.dimensions) ]+[ Line(), ]+[ For("unsigned j = 0", "j < COL_COUNT", "++j", Block([ S("drst_%(rst)d += " "diffmat_rst%(rst)d(i, j)*field_it[from_el_base+j]" % {"rst":rst}) for rst in range(discr.dimensions) ]) ), Line(), ]+[ Assign("result%d_it[to_el_base+i]" % rst, "drst_%d" % rst) for rst in range(discr.dimensions) ]) ) ]) ) ]) # }}} # {{{ compilation mod.add_function(FunctionBody(fdecl, fbody)) #print "----------------------------------------------------------------" #print mod.generate() #raw_input() compiled_func = mod.compile(self.discr.toolchain).diff if self.discr.instrumented: from hedge.tools import time_count_flop compiled_func = time_count_flop(compiled_func, discr.diff_timer, discr.diff_counter, discr.diff_flop_counter, flops=discr.dimensions*( 2 # mul+add * ldis.node_count() * len(elgroup.members) * ldis.node_count() + 2 * discr.dimensions * len(elgroup.members) * ldis.node_count()), increment=discr.dimensions) return compiled_func