def generate(self):
    """Generate the source code of the module as a single cgen Module."""
    from cgen import Block, Module, Include, Line, Define, \
            PrivateNamespace

    body = []

    if self.max_arity is not None:
        body.append(Define("BOOST_PYTHON_MAX_ARITY", self.max_arity))

    if self.use_private_namespace:
        mod_body = [PrivateNamespace(self.mod_body)]
    else:
        mod_body = self.mod_body

    body += ([Include("boost/python.hpp")]
            + self.preamble + [Line()]
            + mod_body
            + [Line(), Line(f"BOOST_PYTHON_MODULE({self.name})")]
            + [Block(self.init_body)])

    return Module(body)
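
# A minimal standalone sketch (not part of the method above): str() on any
# cgen node renders the generated source, so the Module returned by generate()
# can be written straight to a .cpp file and built with the usual Boost.Python
# tooling.  The module name "example" and the file name are placeholders.
from cgen import Module, Include, Line, Block, Statement

_demo_bpl_module = Module([
    Include("boost/python.hpp"),
    Line(),
    Line("BOOST_PYTHON_MODULE(example)"),
    Block([Statement("// def(...) registrations would go here")]),
    ])
open("example.cpp", "w").write(str(_demo_bpl_module))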
def get_kernel(self, with_scaling, for_benchmark=False): from cgen import \ Pointer, POD, Value, ArrayOf, \ Module, FunctionDeclaration, FunctionBody, Block, \ Line, Define, Include, \ Initializer, If, For, Statement, Assign, \ ArrayInitializer from cgen import dtype_to_ctype from cgen.cuda import CudaShared, CudaConstant, CudaGlobal discr = self.discr d = discr.dimensions dims = range(d) given = self.plan.given float_type = given.float_type f_decl = CudaGlobal(FunctionDeclaration(Value("void", "apply_el_local_mat_smem_mat"), [ Pointer(POD(float_type, "out_vector")), Pointer(POD(numpy.uint8, "gmem_matrix")), Pointer(POD(float_type, "debugbuf")), POD(numpy.uint32, "microblock_count"), ] )) cmod = Module([ Include("pycuda-helpers.hpp"), Line(), Value("texture<fp_tex_%s, 1, cudaReadModeElementType>" % dtype_to_ctype(float_type), "in_vector_tex"), ]) if with_scaling: cmod.append( Value("texture<fp_tex_%s, 1, cudaReadModeElementType>" % dtype_to_ctype(float_type), "scaling_tex"), ) par = self.plan.parallelism cmod.extend([ Line(), Define("DIMENSIONS", discr.dimensions), Define("DOFS_PER_EL", given.dofs_per_el()), Define("PREIMAGE_DOFS_PER_EL", self.plan.preimage_dofs_per_el), Line(), Define("SEGMENT_DOF", "threadIdx.x"), Define("PAR_MB_NR", "threadIdx.y"), Line(), Define("MB_SEGMENT", "blockIdx.x"), Define("MACROBLOCK_NR", "blockIdx.y"), Line(), Define("DOFS_PER_SEGMENT", self.plan.segment_size), Define("SEGMENTS_PER_MB", self.plan.segments_per_microblock()), Define("ALIGNED_DOFS_PER_MB", given.microblock.aligned_floats), Define("ALIGNED_PREIMAGE_DOFS_PER_MB", self.plan.aligned_preimage_dofs_per_microblock), Define("MB_EL_COUNT", given.microblock.elements), Line(), Define("PAR_MB_COUNT", par.parallel), Define("INLINE_MB_COUNT", par.inline), Define("SEQ_MB_COUNT", par.serial), Line(), Define("THREAD_NUM", "(SEGMENT_DOF+PAR_MB_NR*DOFS_PER_SEGMENT)"), Define("COALESCING_THREAD_COUNT", "(PAR_MB_COUNT*DOFS_PER_SEGMENT)"), Line(), Define("MB_DOF_BASE", "(MB_SEGMENT*DOFS_PER_SEGMENT)"), Define("MB_DOF", "(MB_DOF_BASE+SEGMENT_DOF)"), Define("GLOBAL_MB_NR_BASE", "(MACROBLOCK_NR*PAR_MB_COUNT*INLINE_MB_COUNT*SEQ_MB_COUNT)"), Define("GLOBAL_MB_NR", "(GLOBAL_MB_NR_BASE" "+ (seq_mb_number*PAR_MB_COUNT + PAR_MB_NR)*INLINE_MB_COUNT)"), Define("GLOBAL_MB_DOF_BASE", "(GLOBAL_MB_NR*ALIGNED_DOFS_PER_MB)"), Define("GLOBAL_MB_PREIMG_DOF_BASE", "(GLOBAL_MB_NR*ALIGNED_PREIMAGE_DOFS_PER_MB)"), Line(), Define("MATRIX_COLUMNS", self.plan.gpu_matrix_columns()), Define("MATRIX_SEGMENT_FLOATS", self.plan.gpu_matrix_block_floats()), Define("MATRIX_SEGMENT_BYTES", "(MATRIX_SEGMENT_FLOATS*%d)" % given.float_size()), Line(), CudaShared(ArrayOf(POD(float_type, "smem_matrix"), "MATRIX_SEGMENT_FLOATS")), CudaShared( ArrayOf( ArrayOf( ArrayOf( POD(float_type, "dof_buffer"), "PAR_MB_COUNT"), "INLINE_MB_COUNT"), "DOFS_PER_SEGMENT"), ), CudaShared(POD(numpy.uint16, "segment_start_el")), CudaShared(POD(numpy.uint16, "segment_stop_el")), CudaShared(POD(numpy.uint16, "segment_el_count")), Line(), ArrayInitializer( CudaConstant( ArrayOf( POD(numpy.uint32, "segment_start_el_lookup"), "SEGMENTS_PER_MB")), [(chk*self.plan.segment_size)//given.dofs_per_el() for chk in range(self.plan.segments_per_microblock())] ), ArrayInitializer( CudaConstant( ArrayOf( POD(numpy.uint32, "segment_stop_el_lookup"), "SEGMENTS_PER_MB")), [min(given.microblock.elements, (chk*self.plan.segment_size+self.plan.segment_size-1) //given.dofs_per_el()+1) for chk in range(self.plan.segments_per_microblock())] ), ]) S = Statement f_body = Block() 
f_body.extend_log_block("calculate this dof's element", [ Initializer(POD(numpy.uint8, "mb_el"), "MB_DOF/DOFS_PER_EL") ]) if self.plan.use_prefetch_branch: f_body.extend_log_block("calculate segment responsibility data", [ If("THREAD_NUM==0", Block([ Assign("segment_start_el", "segment_start_el_lookup[MB_SEGMENT]"), Assign("segment_stop_el", "segment_stop_el_lookup[MB_SEGMENT]"), Assign("segment_el_count", "segment_stop_el-segment_start_el"), ]) ), S("__syncthreads()") ]) from hedge.backends.cuda.tools import get_load_code f_body.extend( get_load_code( dest="smem_matrix", base=("gmem_matrix + MB_SEGMENT*MATRIX_SEGMENT_BYTES"), bytes="MATRIX_SEGMENT_BYTES", descr="load matrix segment") +[S("__syncthreads()")] ) # --------------------------------------------------------------------- def get_batched_fetch_mat_mul_code(el_fetch_count): result = [] dofs = range(self.plan.preimage_dofs_per_el) for load_segment_start in range(0, self.plan.preimage_dofs_per_el, self.plan.segment_size): result.extend( [S("__syncthreads()")] +[Assign( "dof_buffer[PAR_MB_NR][%d][SEGMENT_DOF]" % inl, "fp_tex1Dfetch(in_vector_tex, " "GLOBAL_MB_PREIMG_DOF_BASE" " + %d*ALIGNED_PREIMAGE_DOFS_PER_MB" " + (segment_start_el)*PREIMAGE_DOFS_PER_EL + %d + SEGMENT_DOF)" % (inl, load_segment_start) ) for inl in range(par.inline)] +[S("__syncthreads()"), Line(), ]) for dof in dofs[load_segment_start:load_segment_start+self.plan.segment_size]: for inl in range(par.inline): result.append( S("result%d += " "smem_matrix[SEGMENT_DOF*MATRIX_COLUMNS + %d]" "*" "dof_buffer[PAR_MB_NR][%d][%d]" % (inl, dof, inl, dof-load_segment_start)) ) result.append(Line()) return result from hedge.backends.cuda.tools import unroll def get_direct_tex_mat_mul_code(): return ( [POD(float_type, "fof%d" % inl) for inl in range(par.inline)] + [POD(float_type, "lm"), Line()] + unroll( lambda j: [ Assign("fof%d" % inl, "fp_tex1Dfetch(in_vector_tex, " "GLOBAL_MB_PREIMG_DOF_BASE" " + %(inl)d * ALIGNED_PREIMAGE_DOFS_PER_MB" " + mb_el*PREIMAGE_DOFS_PER_EL+%(j)s)" % {"j":j, "inl":inl, "row": "SEGMENT_DOF"},) for inl in range(par.inline) ]+[ Assign("lm", "smem_matrix[" "%(row)s*MATRIX_COLUMNS + %(j)s]" % {"j":j, "row": "SEGMENT_DOF"}, ) ]+[ S("result%(inl)d += fof%(inl)d*lm" % {"inl":inl}) for inl in range(par.inline) ], total_number=self.plan.preimage_dofs_per_el, max_unroll=self.plan.max_unroll) + [Line()]) def get_mat_mul_code(el_fetch_count): if el_fetch_count == 1: return get_batched_fetch_mat_mul_code(el_fetch_count) else: return get_direct_tex_mat_mul_code() def mat_mul_outer_loop(fetch_count): if with_scaling: inv_jac_multiplier = ("fp_tex1Dfetch(scaling_tex," "(GLOBAL_MB_NR + %(inl)d)*MB_EL_COUNT + mb_el)") else: inv_jac_multiplier = "1" write_condition = "MB_DOF < DOFS_PER_EL*MB_EL_COUNT" if self.with_index_check: write_condition += " && GLOBAL_MB_NR < microblock_count" return For("unsigned short seq_mb_number = 0", "seq_mb_number < SEQ_MB_COUNT", "++seq_mb_number", Block([ Initializer(POD(float_type, "result%d" % inl), 0) for inl in range(par.inline) ]+[Line()] +get_mat_mul_code(fetch_count) +[ If(write_condition, Block([ Assign( "out_vector[GLOBAL_MB_DOF_BASE" " + %d*ALIGNED_DOFS_PER_MB" " + MB_DOF]" % inl, "result%d * %s" % (inl, (inv_jac_multiplier % {"inl":inl})) ) for inl in range(par.inline) ]) ) ]) ) if self.plan.use_prefetch_branch: from cgen import make_multiple_ifs f_body.append(make_multiple_ifs([ ("segment_el_count == %d" % fetch_count, mat_mul_outer_loop(fetch_count)) for fetch_count in range(1, self.plan.max_elements_touched_by_segment()+1)] 
                    ))
        else:
            f_body.append(mat_mul_outer_loop(0))

        # finish off ----------------------------------------------------------
        cmod.append(FunctionBody(f_decl, f_body))

        if not for_benchmark and "cuda_dump_kernels" in discr.debug:
            from hedge.tools import open_unique_debug_file
            open_unique_debug_file(self.plan.debug_name, ".cu").write(str(cmod))

        mod = SourceModule(cmod,
                keep="cuda_keep_kernels" in discr.debug,
                #options=["--maxrregcount=12"]
                )

        func = mod.get_function("apply_el_local_mat_smem_mat")

        if self.plan.debug_name in discr.debug:
            print "%s: lmem=%d smem=%d regs=%d" % (
                    self.plan.debug_name,
                    func.local_size_bytes,
                    func.shared_size_bytes,
                    func.num_regs)

        in_vector_texref = mod.get_texref("in_vector_tex")
        texrefs = [in_vector_texref]

        if with_scaling:
            scaling_texref = mod.get_texref("scaling_tex")
            texrefs.append(scaling_texref)
        else:
            scaling_texref = None

        func.prepare(
                "PPPI",
                block=(self.plan.segment_size, self.plan.parallelism.parallel, 1),
                texrefs=texrefs)

        return func, in_vector_texref, scaling_texref
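
# Hedged usage sketch (not hedge's own driver code): the caller binds device
# vectors to the texture references returned above and launches with the
# old-style prepare()/prepared_call() API this kernel uses (the block size was
# already fixed in func.prepare).  All argument names are hypothetical.
def _launch_el_local_mat_kernel(func, in_vector_texref, scaling_texref,
        out_vec, in_vec, scaling, gpu_matrix, debugbuf, microblock_count, grid):
    in_vec.bind_to_texref_ext(in_vector_texref, allow_double_hack=True)
    if scaling_texref is not None:
        scaling.bind_to_texref_ext(scaling_texref, allow_double_hack=True)
    func.prepared_call(grid,
            out_vec.gpudata, gpu_matrix, debugbuf.gpudata,
            numpy.uint32(microblock_count))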
c_gpu = cuda.mem_alloc(a.nbytes)

from cgen import FunctionBody, \
        FunctionDeclaration, POD, Value, \
        Pointer, Module, Block, Initializer, Assign
from cgen.cuda import CudaGlobal

mod = Module([
    FunctionBody(
        CudaGlobal(FunctionDeclaration(
            Value("void", "add"),
            arg_decls=[Pointer(POD(dtype, name))
                for name in ["tgt", "op1", "op2"]])),
        Block([
            Initializer(
                POD(numpy.int32, "idx"),
                "threadIdx.x + %d*blockIdx.x"
                % (block_size*thread_strides)),
            ]+[
            Assign(
                "tgt[idx+%d]" % (o*block_size),
                "op1[idx+%d] + op2[idx+%d]" % (
                    o*block_size, o*block_size))
            for o in range(thread_strides)]))])

mod = SourceModule(mod)

func = mod.get_function("add")
func(c_gpu, a_gpu, b_gpu,
        block=(block_size,1,1),
        grid=(macroblock_count,1))
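
# Hedged sketch (sizes are arbitrary example values, not from the original) of
# the host-side setup the snippet above assumes, plus the copy-back check that
# would follow the kernel call:
import numpy
import pycuda.autoinit  # creates a CUDA context
import pycuda.driver as cuda

dtype = numpy.float32
block_size, thread_strides, macroblock_count = 256, 4, 16
total_size = block_size * thread_strides * macroblock_count

a = numpy.random.randn(total_size).astype(dtype)
b = numpy.random.randn(total_size).astype(dtype)
a_gpu = cuda.to_device(a)
b_gpu = cuda.to_device(b)

# ... allocate c_gpu, build and launch the "add" kernel as shown above, then:

c = cuda.from_device(c_gpu, a.shape, dtype)
assert numpy.allclose(c, a + b)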
def make_cuda_kernel(self, discr, dtype, eg): given = discr.given ldis = eg.local_discretization microblocks_per_block = 1 from cgen.cuda import CudaGlobal from cgen import (Module, Value, Include, Typedef, FunctionBody, FunctionDeclaration, Const, Line, POD, LiteralBlock, Define, Pointer) cmod = Module([ Include("pycuda-helpers.hpp"), Line(), Typedef(POD(dtype, "value_type")), Line(), Define("DOFS_PER_EL", given.dofs_per_el()), Define("ALIGNED_DOFS_PER_MB", given.microblock.aligned_floats), Define("VERTICES_PER_EL", ldis.vertex_count()), Define("ELS_PER_MB", given.microblock.elements), Define("MBS_PER_BLOCK", microblocks_per_block), Line(), Define("DOF_IN_MB_IDX", "threadIdx.x"), Define("DOF_IN_EL_IDX", "(DOF_IN_MB_IDX-el_idx_in_mb*DOFS_PER_EL)"), Define("MB_IN_BLOCK_IDX", "threadIdx.y"), Define("BLOCK_IDX", "blockIdx.x"), Define("MB_NUMBER", "(BLOCK_IDX * MBS_PER_BLOCK + MB_IN_BLOCK_IDX)"), Define("BLOCK_DATA", "whole_block[MB_IN_BLOCK_IDX]")] + self.get_cuda_extra_preamble(discr, dtype, eg) + [FunctionBody( CudaGlobal(FunctionDeclaration( Value("void", "elwise_kernel"), [ Pointer(Const(POD(dtype, "field"))), Pointer(POD(dtype, "result")), POD(numpy.uint32, "mb_count"), ])), LiteralBlock(""" int el_idx_in_mb = DOF_IN_MB_IDX / DOFS_PER_EL; if (MB_NUMBER >= mb_count) return; int idx = MB_NUMBER * ALIGNED_DOFS_PER_MB + DOF_IN_MB_IDX; int element_base_idx = ALIGNED_DOFS_PER_MB * MB_IN_BLOCK_IDX + (DOF_IN_MB_IDX / DOFS_PER_EL) * DOFS_PER_EL; int dof_in_element = DOF_IN_MB_IDX-el_idx_in_mb*DOFS_PER_EL; __shared__ value_type whole_block[MBS_PER_BLOCK][ALIGNED_DOFS_PER_MB+1]; int idx_in_block = ALIGNED_DOFS_PER_MB * MB_IN_BLOCK_IDX + DOF_IN_MB_IDX; BLOCK_DATA[idx_in_block] = field[idx]; __syncthreads(); %s result[idx] = node_result; """ % self.get_cuda_code(discr, dtype, eg))) ]) if False: for i, l in enumerate(str(cmod).split("\n")): print i+1, l raw_input() from pycuda.compiler import SourceModule mod = SourceModule( cmod, keep="cuda_keep_kernels" in discr.debug, ) func = mod.get_function("elwise_kernel") func.prepare( "PPI", block=( given.microblock.aligned_floats, microblocks_per_block, 1)) mb_count = len(discr.blocks) * discr.given.microblocks_per_block grid_dim = (mb_count + microblocks_per_block - 1) \ // microblocks_per_block from pytools import Record class KernelInfo(Record): pass return KernelInfo( func=func, grid_dim=grid_dim, mb_count=mb_count)
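
# Hedged usage sketch (not from hedge): launching the prepared element-wise
# kernel through the KernelInfo record returned above; `field` and `result`
# are assumed to be pycuda GPUArrays in the microblock layout.
def _apply_elwise(kernel_info, field, result):
    kernel_info.func.prepared_call(
            (kernel_info.grid_dim, 1),
            field.gpudata, result.gpudata,
            numpy.uint32(kernel_info.mb_count))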
from cgen import FunctionBody, \
        FunctionDeclaration, Typedef, POD, Value, \
        Pointer, Module, Block, Initializer, Assign
from cgen.cuda import CudaGlobal

mod = Module([
    FunctionBody(
        CudaGlobal(
            FunctionDeclaration(Value("void", "add"),
                arg_decls=[
                    Pointer(POD(dtype, name))
                    for name in ["tgt", "op1", "op2"]
                    ])),
        Block([
            Initializer(
                POD(numpy.int32, "idx"),
                "threadIdx.x + %d*blockIdx.x"
                % (block_size * thread_strides)),
            ] + [
            Assign(
                "tgt[idx+%d]" % (o * block_size),
                "op1[idx+%d] + op2[idx+%d]"
                % (o * block_size, o * block_size))
            for o in range(thread_strides)
            ]))
    ])

mod = SourceModule(mod)

func = mod.get_function("add")
func(c_gpu, a_gpu, b_gpu,
        block=(block_size, 1, 1),
        grid=(macroblock_count, 1))
def get_kernel(self, diff_op, elgroup, for_benchmark=False): from cgen import ( Pointer, POD, Value, ArrayOf, Const, Module, FunctionDeclaration, FunctionBody, Block, Comment, Line, Define, Include, Initializer, If, For, Statement, Assign, ) from pycuda.tools import dtype_to_ctype from cgen.cuda import CudaShared, CudaGlobal discr = self.discr d = discr.dimensions dims = range(d) plan = self.plan given = plan.given elgroup, = discr.element_groups float_type = given.float_type f_decl = CudaGlobal( FunctionDeclaration( Value("void", "apply_diff_mat_smem"), [Pointer(POD(float_type, "debugbuf")), Pointer(POD(float_type, "field"))] + [Pointer(POD(float_type, "drst%d_global" % i)) for i in dims], ) ) par = plan.parallelism cmod = Module([Include("pycuda-helpers.hpp")]) if float_type == numpy.float64: cmod.append(Value("texture<fp_tex_double, 1, cudaReadModeElementType>", "diff_rst_mat_tex")) elif float_type == numpy.float32: rst_channels = given.devdata.make_valid_tex_channel_count(d) cmod.append(Value("texture<float%d, 1, cudaReadModeElementType>" % rst_channels, "diff_rst_mat_tex")) else: raise ValueError("unsupported float type: %s" % float_type) # only preimage size variation is supported here assert plan.image_dofs_per_el == given.dofs_per_el() assert plan.aligned_image_dofs_per_microblock == given.microblock.aligned_floats # FIXME: aligned_image_dofs_per_microblock must be divisible # by this, therefore hardcoding for now. chunk_size = 16 cmod.extend( [ Line(), Define("DIMENSIONS", discr.dimensions), Define("IMAGE_DOFS_PER_EL", plan.image_dofs_per_el), Define("PREIMAGE_DOFS_PER_EL", plan.preimage_dofs_per_el), Define("ALIGNED_IMAGE_DOFS_PER_MB", plan.aligned_image_dofs_per_microblock), Define("ALIGNED_PREIMAGE_DOFS_PER_MB", plan.aligned_preimage_dofs_per_microblock), Define("ELS_PER_MB", given.microblock.elements), Define("IMAGE_DOFS_PER_MB", "(IMAGE_DOFS_PER_EL*ELS_PER_MB)"), Line(), Define("CHUNK_SIZE", chunk_size), Define("CHUNK_DOF", "threadIdx.x"), Define("PAR_MB_NR", "threadIdx.y"), Define("CHUNK_NR", "threadIdx.z"), Define("IMAGE_MB_DOF", "(CHUNK_NR*CHUNK_SIZE+CHUNK_DOF)"), Define("IMAGE_EL_DOF", "(IMAGE_MB_DOF - mb_el*IMAGE_DOFS_PER_EL)"), Line(), Define("MACROBLOCK_NR", "blockIdx.x"), Line(), Define("PAR_MB_COUNT", par.parallel), Define("INLINE_MB_COUNT", par.inline), Define("SEQ_MB_COUNT", par.serial), Line(), Define("GLOBAL_MB_NR_BASE", "(MACROBLOCK_NR*PAR_MB_COUNT*INLINE_MB_COUNT*SEQ_MB_COUNT)"), Define( "GLOBAL_MB_NR", "(GLOBAL_MB_NR_BASE" "+ (seq_mb_number*PAR_MB_COUNT + PAR_MB_NR)*INLINE_MB_COUNT)" ), Define("GLOBAL_MB_IMAGE_DOF_BASE", "(GLOBAL_MB_NR*ALIGNED_IMAGE_DOFS_PER_MB)"), Define("GLOBAL_MB_PREIMAGE_DOF_BASE", "(GLOBAL_MB_NR*ALIGNED_PREIMAGE_DOFS_PER_MB)"), Line(), CudaShared( ArrayOf( ArrayOf(ArrayOf(POD(float_type, "smem_field"), "PAR_MB_COUNT"), "INLINE_MB_COUNT"), "ALIGNED_PREIMAGE_DOFS_PER_MB", ) ), Line(), ] ) S = Statement f_body = Block([Initializer(Const(POD(numpy.uint16, "mb_el")), "IMAGE_MB_DOF / IMAGE_DOFS_PER_EL"), Line()]) # --------------------------------------------------------------------- def get_load_code(): mb_img_dofs = plan.aligned_image_dofs_per_microblock mb_preimg_dofs = plan.aligned_preimage_dofs_per_microblock preimg_dofs_over_dofs = (mb_preimg_dofs + mb_img_dofs - 1) // mb_img_dofs load_code = [] store_code = [] var_num = 0 for load_block in range(preimg_dofs_over_dofs): for inl in range(par.inline): # load and store are split for better pipelining # compiler can't figure that out because of branch var = "tmp%d" % var_num var_num += 1 
load_code.append(POD(float_type, var)) block_addr = "%d * ALIGNED_IMAGE_DOFS_PER_MB + IMAGE_MB_DOF" % load_block load_instr = Assign( var, "field[GLOBAL_MB_PREIMAGE_DOF_BASE" " + %d*ALIGNED_PREIMAGE_DOFS_PER_MB" " + %s]" % (inl, block_addr), ) store_instr = Assign("smem_field[PAR_MB_NR][%d][%s]" % (inl, block_addr), var) if (load_block + 1) * mb_img_dofs >= mb_preimg_dofs: cond = "%s < ALIGNED_PREIMAGE_DOFS_PER_MB" % block_addr load_instr = If(cond, load_instr) store_instr = If(cond, store_instr) load_code.append(load_instr) store_code.append(store_instr) return Block(load_code + [Line()] + store_code) def get_scalar_diff_code(): code = [] for inl in range(par.inline): for axis in dims: code.append(Initializer(POD(float_type, "d%drst%d" % (inl, axis)), 0)) code.append(Line()) tex_channels = ["x", "y", "z", "w"] store_code = Block() for inl in range(par.inline): for rst_axis in dims: store_code.append( Assign( "drst%d_global[GLOBAL_MB_IMAGE_DOF_BASE + " "%d*ALIGNED_IMAGE_DOFS_PER_MB + IMAGE_MB_DOF]" % (rst_axis, inl), "d%drst%d" % (inl, rst_axis), ) ) from hedge.backends.cuda.tools import unroll code.extend( [ Comment("everybody needs to be done with the old data"), S("__syncthreads()"), Line(), get_load_code(), Line(), Comment("all the new data must be loaded"), S("__syncthreads()"), Line(), ] ) if float_type == numpy.float32: code.append(Value("float%d" % rst_channels, "dmat_entries")) code.extend([POD(float_type, "field_value%d" % inl) for inl in range(par.inline)] + [Line()]) def unroll_body(j): result = [ Assign("field_value%d" % inl, "smem_field[PAR_MB_NR][%d][mb_el*PREIMAGE_DOFS_PER_EL+%s]" % (inl, j)) for inl in range(par.inline) ] if float_type == numpy.float32: result.append( Assign("dmat_entries", "tex1Dfetch(diff_rst_mat_tex, IMAGE_EL_DOF + %s*IMAGE_DOFS_PER_EL)" % j) ) result.extend( S("d%drst%d += dmat_entries.%s * field_value%d" % (inl, axis, tex_channels[axis], inl)) for inl in range(par.inline) for axis in dims ) elif float_type == numpy.float64: result.extend( S( "d%(inl)drst%(axis)d += " "fp_tex1Dfetch(diff_rst_mat_tex, %(axis)d " "+ DIMENSIONS*(IMAGE_EL_DOF + %(j)d*IMAGE_DOFS_PER_EL))" "* field_value%(inl)d" % {"inl": inl, "axis": axis, "j": j} ) for inl in range(par.inline) for axis in dims ) else: assert False return result code.append( If( "IMAGE_MB_DOF < IMAGE_DOFS_PER_MB", Block(unroll(unroll_body, total_number=plan.preimage_dofs_per_el) + [store_code]), ) ) return code f_body.extend( [ For( "unsigned short seq_mb_number = 0", "seq_mb_number < SEQ_MB_COUNT", "++seq_mb_number", Block(get_scalar_diff_code()), ) ] ) # finish off ---------------------------------------------------------- cmod.append(FunctionBody(f_decl, f_body)) if not for_benchmark and "cuda_dump_kernels" in discr.debug: from hedge.tools import open_unique_debug_file open_unique_debug_file("diff", ".cu").write(str(cmod)) mod = SourceModule( cmod, keep="cuda_keep_kernels" in discr.debug, # options=["--maxrregcount=16"] ) func = mod.get_function("apply_diff_mat_smem") if "cuda_diff" in discr.debug: print "diff: lmem=%d smem=%d regs=%d" % (func.local_size_bytes, func.shared_size_bytes, func.registers) diff_rst_mat_texref = mod.get_texref("diff_rst_mat_tex") gpu_diffmats = self.gpu_diffmats(diff_op, elgroup) if given.float_type == numpy.float32: gpu_diffmats.bind_to_texref_ext(diff_rst_mat_texref, rst_channels) elif given.float_type == numpy.float64: gpu_diffmats.bind_to_texref_ext(diff_rst_mat_texref, allow_double_hack=True) else: assert False assert given.microblock.aligned_floats % chunk_size == 0 block 
= (chunk_size, plan.parallelism.parallel, given.microblock.aligned_floats // chunk_size) func.prepare(["PP"] + discr.dimensions * ["P"], texrefs=[diff_rst_mat_texref]) return block, func
def get_kernel(self, for_benchmark=False): from cgen import \ Pointer, POD, Value, ArrayOf, Const, \ Module, FunctionDeclaration, FunctionBody, Block, \ Comment, Line, Include, \ Define, \ Initializer, If, For, Statement, Assign from cgen import dtype_to_ctype from cgen.cuda import CudaShared, CudaGlobal discr = self.discr d = discr.dimensions dims = range(d) given = self.plan.given float_type = given.float_type f_decl = CudaGlobal(FunctionDeclaration(Value("void", "apply_el_local_mat_smem_field"), [ Pointer(POD(float_type, "out_vector")), Pointer(POD(float_type, "in_vector")), Pointer(POD(float_type, "debugbuf")), POD(numpy.uint32, "microblock_count"), ] )) cmod = Module([ Include("pycuda-helpers.hpp"), Line(), Value("texture<fp_tex_%s, 2, cudaReadModeElementType>" % dtype_to_ctype(float_type), "mat_tex"), ]) plan = self.plan par = plan.parallelism # FIXME: aligned_image_dofs_per_microblock must be divisible # by this, therefore hardcoding for now. chunk_size = 16 cmod.extend([ Line(), Define("DIMENSIONS", discr.dimensions), Define("IMAGE_DOFS_PER_EL", plan.image_dofs_per_el), Define("PREIMAGE_DOFS_PER_EL", plan.preimage_dofs_per_el), Define("ALIGNED_IMAGE_DOFS_PER_MB", plan.aligned_image_dofs_per_microblock), Define("ALIGNED_PREIMAGE_DOFS_PER_MB", plan.aligned_preimage_dofs_per_microblock), Line(), Define("MB_EL_COUNT", plan.elements_per_microblock), Line(), Define("IMAGE_DOFS_PER_MB", "(IMAGE_DOFS_PER_EL*MB_EL_COUNT)"), Line(), Define("CHUNK_SIZE", chunk_size), Define("CHUNK_DOF", "threadIdx.x"), Define("PAR_MB_NR", "threadIdx.y"), Define("CHUNK_NR", "threadIdx.z"), Define("IMAGE_MB_DOF", "(CHUNK_NR*CHUNK_SIZE+CHUNK_DOF)"), Define("IMAGE_EL_DOF", "(IMAGE_MB_DOF - mb_el*IMAGE_DOFS_PER_EL)"), Line(), Define("MACROBLOCK_NR", "blockIdx.x"), Line(), Define("PAR_MB_COUNT", par.parallel), Define("INLINE_MB_COUNT", par.inline), Define("SEQ_MB_COUNT", par.serial), Line(), Define("GLOBAL_MB_NR_BASE", "(MACROBLOCK_NR*PAR_MB_COUNT*INLINE_MB_COUNT*SEQ_MB_COUNT)"), Define("GLOBAL_MB_NR", "(GLOBAL_MB_NR_BASE" "+ (seq_mb_number*PAR_MB_COUNT + PAR_MB_NR)*INLINE_MB_COUNT)"), Define("GLOBAL_MB_IMAGE_DOF_BASE", "(GLOBAL_MB_NR*ALIGNED_IMAGE_DOFS_PER_MB)"), Define("GLOBAL_MB_PREIMAGE_DOF_BASE", "(GLOBAL_MB_NR*ALIGNED_PREIMAGE_DOFS_PER_MB)"), Line(), CudaShared( ArrayOf( ArrayOf( ArrayOf( POD(float_type, "smem_in_vector"), "PAR_MB_COUNT"), "INLINE_MB_COUNT"), "ALIGNED_PREIMAGE_DOFS_PER_MB")), Line(), ]) S = Statement f_body = Block([ Initializer(Const(POD(numpy.uint16, "mb_el")), "IMAGE_MB_DOF / IMAGE_DOFS_PER_EL"), Line(), ]) def get_load_code(): mb_img_dofs = plan.aligned_image_dofs_per_microblock mb_preimg_dofs = plan.aligned_preimage_dofs_per_microblock preimg_dofs_over_dofs = (mb_preimg_dofs+mb_img_dofs-1) // mb_img_dofs load_code = [] store_code = [] var_num = 0 for load_block in range(preimg_dofs_over_dofs): for inl in range(par.inline): # load and store are split for better pipelining # compiler can't figure that out because of branch var = "tmp%d" % var_num var_num += 1 load_code.append(POD(float_type, var)) block_addr = "%d * ALIGNED_IMAGE_DOFS_PER_MB + IMAGE_MB_DOF" % load_block load_instr = Assign(var, "in_vector[GLOBAL_MB_PREIMAGE_DOF_BASE" " + %d*ALIGNED_PREIMAGE_DOFS_PER_MB" " + %s]" % (inl, block_addr)) store_instr = Assign( "smem_in_vector[PAR_MB_NR][%d][%s]" % (inl, block_addr), var ) if (load_block+1)*mb_img_dofs >= mb_preimg_dofs: cond = "%s < ALIGNED_PREIMAGE_DOFS_PER_MB" % block_addr load_instr = If(cond, load_instr) store_instr = If(cond, store_instr) load_code.append(load_instr) 
store_code.append(store_instr) return Block(load_code + [Line()] + store_code) def get_matmul_code(): from hedge.backends.cuda.tools import unroll index_check_condition = "GLOBAL_MB_NR < microblock_count" def if_(conditions, then): final_cond = " && ".join(cond for cond in conditions if cond) if final_cond: return If(final_cond, then) else: return then result = Block([ Comment("everybody needs to be done with the old data"), S("__syncthreads()"), Line(), ]+[If(index_check_condition, get_load_code())]+[ Line(), Comment("all the new data must be loaded"), S("__syncthreads()"), Line(), ]+[ Initializer(POD(float_type, "result%d" % inl), 0) for inl in range(par.inline) ]+[ Line(), POD(float_type, "mat_entry"), Line(), ]) result.append(if_(["IMAGE_MB_DOF < IMAGE_DOFS_PER_MB", index_check_condition], Block(unroll(lambda j: [Assign("mat_entry", "fp_tex2D(mat_tex, IMAGE_EL_DOF, %s)" % j)] +[ S("result%d += mat_entry " "* smem_in_vector[PAR_MB_NR][%d][mb_el*PREIMAGE_DOFS_PER_EL + %s]" % (inl, inl, j)) for inl in range(par.inline) ], total_number=plan.preimage_dofs_per_el) +[Line()] +[Assign( "out_vector[GLOBAL_MB_IMAGE_DOF_BASE + " "%d*ALIGNED_IMAGE_DOFS_PER_MB + IMAGE_MB_DOF]" % inl, "result%d" % inl) for inl in range(par.inline)] ))) return result f_body.append(For("unsigned short seq_mb_number = 0", "seq_mb_number < SEQ_MB_COUNT", "++seq_mb_number", get_matmul_code())) # finish off ---------------------------------------------------------- cmod.append(FunctionBody(f_decl, f_body)) if not for_benchmark and "cuda_dump_kernels" in discr.debug: from hedge.tools import open_unique_debug_file open_unique_debug_file(plan.debug_name, ".cu").write(str(cmod)) mod = SourceModule(cmod, keep="cuda_keep_kernels" in discr.debug, #options=["--maxrregcount=12"] ) func = mod.get_function("apply_el_local_mat_smem_field") if plan.debug_name in discr.debug: print "%s: lmem=%d smem=%d regs=%d" % ( plan.debug_name, func.local_size_bytes, func.shared_size_bytes, func.num_regs) mat_texref = mod.get_texref("mat_tex") texrefs = [mat_texref] func.prepare( "PPPI", texrefs=texrefs) assert plan.aligned_image_dofs_per_microblock % chunk_size == 0 block = ( chunk_size, plan.parallelism.parallel, plan.aligned_image_dofs_per_microblock //chunk_size) return func, block, mat_texref
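
# Hedged launch sketch (not hedge's own driver code): here prepare() was called
# without a block size, so the block tuple returned above is supplied at launch
# time (newer PyCUDA prepared_call signature).  It assumes the local matrix has
# already been bound to mat_texref; the remaining names are hypothetical.
def _launch_el_local_field_kernel(func, block, out_vec, in_vec, debugbuf,
        microblock_count, grid):
    func.prepared_call(grid, block,
            out_vec.gpudata, in_vec.gpudata, debugbuf.gpudata,
            numpy.uint32(microblock_count))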
def get_kernel(self, fdata, ilist_data, for_benchmark): from cgen.cuda import CudaShared, CudaGlobal from pycuda.tools import dtype_to_ctype discr = self.discr given = self.plan.given fplan = self.plan d = discr.dimensions dims = range(d) elgroup, = discr.element_groups float_type = given.float_type f_decl = CudaGlobal(FunctionDeclaration(Value("void", "apply_flux"), [ Pointer(POD(float_type, "debugbuf")), Pointer(POD(numpy.uint8, "gmem_facedata")), ]+[ Pointer(POD(float_type, "gmem_fluxes_on_faces%d" % flux_nr)) for flux_nr in range(len(self.fluxes)) ] )) cmod = Module() cmod.append(Include("pycuda-helpers.hpp")) for dep_expr in self.all_deps: cmod.extend([ Value("texture<%s, 1, cudaReadModeElementType>" % dtype_to_ctype(float_type, with_fp_tex_hack=True), "field%d_tex" % self.dep_to_index[dep_expr]) ]) if fplan.flux_count != len(self.fluxes): from warnings import warn warn("Flux count in flux execution plan different from actual flux count.\n" "You may want to specify the tune_for= kwarg in the Discretization\n" "constructor.") cmod.extend([ Line(), Typedef(POD(float_type, "value_type")), Line(), flux_header_struct(float_type, discr.dimensions), Line(), face_pair_struct(float_type, discr.dimensions), Line(), Define("DIMENSIONS", discr.dimensions), Define("DOFS_PER_FACE", fplan.dofs_per_face), Define("THREADS_PER_FACE", fplan.threads_per_face()), Line(), Define("CONCURRENT_FACES", fplan.parallel_faces), Define("BLOCK_MB_COUNT", fplan.mbs_per_block), Line(), Define("FACEDOF_NR", "threadIdx.x"), Define("BLOCK_FACE", "threadIdx.y"), Line(), Define("FLUX_COUNT", len(self.fluxes)), Line(), Define("THREAD_NUM", "(FACEDOF_NR + BLOCK_FACE*THREADS_PER_FACE)"), Define("THREAD_COUNT", "(THREADS_PER_FACE*CONCURRENT_FACES)"), Define("COALESCING_THREAD_COUNT", "(THREAD_COUNT < 0x10 ? 
THREAD_COUNT : THREAD_COUNT & ~0xf)"), Line(), Define("DATA_BLOCK_SIZE", fdata.block_bytes), Define("ALIGNED_FACE_DOFS_PER_MB", fplan.aligned_face_dofs_per_microblock()), Define("ALIGNED_FACE_DOFS_PER_BLOCK", "(ALIGNED_FACE_DOFS_PER_MB*BLOCK_MB_COUNT)"), Line(), Define("FOF_BLOCK_BASE", "(blockIdx.x*ALIGNED_FACE_DOFS_PER_BLOCK)"), Line(), ] + ilist_data.code + [ Line(), Value("texture<index_list_entry_t, 1, cudaReadModeElementType>", "tex_index_lists"), Line(), fdata.struct, Line(), CudaShared(Value("flux_data", "data")), ]) if not fplan.direct_store: cmod.extend([ CudaShared( ArrayOf( ArrayOf( POD(float_type, "smem_fluxes_on_faces"), "FLUX_COUNT"), "ALIGNED_FACE_DOFS_PER_MB*BLOCK_MB_COUNT") ), Line(), ]) S = Statement f_body = Block() from hedge.backends.cuda.tools import get_load_code f_body.extend(get_load_code( dest="&data", base="gmem_facedata + blockIdx.x*DATA_BLOCK_SIZE", bytes="sizeof(flux_data)", descr="load face_pair data") +[S("__syncthreads()"), Line() ]) def get_flux_code(flux_writer): flux_code = Block([]) flux_code.extend([ Initializer(Pointer( Value("face_pair", "fpair")), "data.facepairs+fpair_nr"), Initializer( MaybeUnused(POD(numpy.uint32, "a_index")), "fpair->a_base + tex1Dfetch(tex_index_lists, " "fpair->a_ilist_index + FACEDOF_NR)"), Initializer( MaybeUnused(POD(numpy.uint32, "b_index")), "fpair->b_base + tex1Dfetch(tex_index_lists, " "fpair->b_ilist_index + FACEDOF_NR)"), Line(), flux_writer(), Line(), S("fpair_nr += CONCURRENT_FACES") ]) return flux_code flux_computation = Block([ Comment("fluxes for dual-sided (intra-block) interior face pairs"), While("fpair_nr < data.header.same_facepairs_end", get_flux_code(lambda: self.write_interior_flux_code(True)) ), Line(), Comment("work around nvcc assertion failure"), S("fpair_nr+=1"), S("fpair_nr-=1"), Line(), Comment("fluxes for single-sided (inter-block) interior face pairs"), While("fpair_nr < data.header.diff_facepairs_end", get_flux_code(lambda: self.write_interior_flux_code(False)) ), Line(), Comment("fluxes for single-sided boundary face pairs"), While("fpair_nr < data.header.bdry_facepairs_end", get_flux_code( lambda: self.write_boundary_flux_code(for_benchmark)) ), ]) f_body.extend_log_block("compute the fluxes", [ Initializer(POD(numpy.uint32, "fpair_nr"), "BLOCK_FACE"), If("FACEDOF_NR < DOFS_PER_FACE", flux_computation) ]) if not fplan.direct_store: f_body.extend([ Line(), S("__syncthreads()"), Line() ]) f_body.extend_log_block("store fluxes", [ #Assign("debugbuf[blockIdx.x]", "FOF_BLOCK_BASE"), #Assign("debugbuf[0]", "FOF_BLOCK_BASE"), #Assign("debugbuf[0]", "sizeof(face_pair)"), For("unsigned word_nr = THREAD_NUM", "word_nr < ALIGNED_FACE_DOFS_PER_MB*BLOCK_MB_COUNT", "word_nr += COALESCING_THREAD_COUNT", Block([Assign( "gmem_fluxes_on_faces%d[FOF_BLOCK_BASE+word_nr]" % flux_nr, "smem_fluxes_on_faces[%d][word_nr]" % flux_nr) for flux_nr in range(len(self.fluxes))] #+[If("isnan(smem_fluxes_on_faces[%d][word_nr])" % flux_nr, #Block([ #Assign("debugbuf[blockIdx.x]", "word_nr"), #]) #) #for flux_nr in range(len(self.fluxes))] ) ) ]) if False: f_body.extend([ Assign("debugbuf[blockIdx.x*96+32+BLOCK_FACE*32+threadIdx.x]", "fpair_nr"), Assign("debugbuf[blockIdx.x*96+16]", "data.header.same_facepairs_end"), Assign("debugbuf[blockIdx.x*96+17]", "data.header.diff_facepairs_end"), Assign("debugbuf[blockIdx.x*96+18]", "data.header.bdry_facepairs_end"), ] ) # finish off ---------------------------------------------------------- cmod.append(FunctionBody(f_decl, f_body)) if not for_benchmark and "cuda_dump_kernels" in 
discr.debug: from hedge.tools import open_unique_debug_file open_unique_debug_file("flux_gather", ".cu").write(str(cmod)) #from pycuda.tools import allow_user_edit mod = SourceModule( #allow_user_edit(cmod, "kernel.cu", "the flux kernel"), cmod, keep="cuda_keep_kernels" in discr.debug) expr_to_texture_map = dict( (dep_expr, mod.get_texref( "field%d_tex" % self.dep_to_index[dep_expr])) for dep_expr in self.all_deps) index_list_texref = mod.get_texref("tex_index_lists") index_list_texref.set_address( ilist_data.device_memory, ilist_data.bytes) index_list_texref.set_format( cuda.dtype_to_array_format(ilist_data.type), 1) index_list_texref.set_flags(cuda.TRSF_READ_AS_INTEGER) func = mod.get_function("apply_flux") block = (fplan.threads_per_face(), fplan.parallel_faces, 1) func.prepare( (2+len(self.fluxes))*"P", texrefs=expr_to_texture_map.values() + [index_list_texref]) if "cuda_flux" in discr.debug: print "flux: lmem=%d smem=%d regs=%d" % ( func.local_size_bytes, func.shared_size_bytes, func.num_regs) return block, func, expr_to_texture_map
        Pointer, Module, Block, Initializer, Assign, Const
from cgen.opencl import CLKernel, CLGlobal, \
        CLRequiredWorkGroupSize

mod = Module([
    FunctionBody(
        CLKernel(CLRequiredWorkGroupSize((local_size,),
            FunctionDeclaration(
                Value("void", "add"),
                arg_decls=[
                    CLGlobal(Pointer(Const(POD(dtype, name))))
                    for name in ["tgt", "op1", "op2"]
                    ]))),
        Block([
            Initializer(
                POD(numpy.int32, "idx"),
                "get_local_id(0) + %d * get_group_id(0)"
                % (local_size * thread_strides))
            ] + [
            Assign(
                "tgt[idx+%d]" % (o * local_size),
                "op1[idx+%d] + op2[idx+%d]"
                % (o * local_size, o * local_size))
            for o in range(thread_strides)
            ]))
    ])

knl = cl.Program(ctx, str(mod)).build().add

knl(queue, (local_size * macroblock_count,), (local_size,),
        c_buf, a_buf,
def get_kernel(self, diff_op, elgroup, for_benchmark=False): from cgen import \ Pointer, POD, Value, ArrayOf, Const, \ Module, FunctionDeclaration, FunctionBody, Block, \ Comment, Line, Define, Include, \ Initializer, If, For, Statement, Assign from pycuda.tools import dtype_to_ctype from cgen.cuda import CudaShared, CudaGlobal discr = self.discr d = discr.dimensions dims = range(d) plan = self.plan given = plan.given elgroup, = discr.element_groups float_type = given.float_type f_decl = CudaGlobal(FunctionDeclaration(Value("void", "apply_diff_mat_smem"), [Pointer(POD(float_type, "debugbuf")), Pointer(POD(float_type, "field")), ] + [Pointer(POD(float_type, "drst%d_global" % i)) for i in dims] )) par = plan.parallelism cmod = Module([ Include("pycuda-helpers.hpp"), ]) if float_type == numpy.float64: cmod.append(Value("texture<fp_tex_double, 1, cudaReadModeElementType>", "diff_rst_mat_tex")) elif float_type == numpy.float32: rst_channels = given.devdata.make_valid_tex_channel_count(d) cmod.append(Value("texture<float%d, 1, cudaReadModeElementType>" % rst_channels, "diff_rst_mat_tex")) else: raise ValueError("unsupported float type: %s" % float_type) # only preimage size variation is supported here assert plan.image_dofs_per_el == given.dofs_per_el() assert plan.aligned_image_dofs_per_microblock == given.microblock.aligned_floats # FIXME: aligned_image_dofs_per_microblock must be divisible # by this, therefore hardcoding for now. chunk_size = 16 cmod.extend([ Line(), Define("DIMENSIONS", discr.dimensions), Define("IMAGE_DOFS_PER_EL", plan.image_dofs_per_el), Define("PREIMAGE_DOFS_PER_EL", plan.preimage_dofs_per_el), Define("ALIGNED_IMAGE_DOFS_PER_MB", plan.aligned_image_dofs_per_microblock), Define("ALIGNED_PREIMAGE_DOFS_PER_MB", plan.aligned_preimage_dofs_per_microblock), Define("ELS_PER_MB", given.microblock.elements), Define("IMAGE_DOFS_PER_MB", "(IMAGE_DOFS_PER_EL*ELS_PER_MB)"), Line(), Define("CHUNK_SIZE", chunk_size), Define("CHUNK_DOF", "threadIdx.x"), Define("PAR_MB_NR", "threadIdx.y"), Define("CHUNK_NR", "threadIdx.z"), Define("IMAGE_MB_DOF", "(CHUNK_NR*CHUNK_SIZE+CHUNK_DOF)"), Define("IMAGE_EL_DOF", "(IMAGE_MB_DOF - mb_el*IMAGE_DOFS_PER_EL)"), Line(), Define("MACROBLOCK_NR", "blockIdx.x"), Line(), Define("PAR_MB_COUNT", par.parallel), Define("INLINE_MB_COUNT", par.inline), Define("SEQ_MB_COUNT", par.serial), Line(), Define("GLOBAL_MB_NR_BASE", "(MACROBLOCK_NR*PAR_MB_COUNT*INLINE_MB_COUNT*SEQ_MB_COUNT)"), Define("GLOBAL_MB_NR", "(GLOBAL_MB_NR_BASE" "+ (seq_mb_number*PAR_MB_COUNT + PAR_MB_NR)*INLINE_MB_COUNT)"), Define("GLOBAL_MB_IMAGE_DOF_BASE", "(GLOBAL_MB_NR*ALIGNED_IMAGE_DOFS_PER_MB)"), Define("GLOBAL_MB_PREIMAGE_DOF_BASE", "(GLOBAL_MB_NR*ALIGNED_PREIMAGE_DOFS_PER_MB)"), Line(), CudaShared( ArrayOf( ArrayOf( ArrayOf( POD(float_type, "smem_field"), "PAR_MB_COUNT"), "INLINE_MB_COUNT"), "ALIGNED_PREIMAGE_DOFS_PER_MB")), Line(), ]) S = Statement f_body = Block([ Initializer(Const(POD(numpy.uint16, "mb_el")), "IMAGE_MB_DOF / IMAGE_DOFS_PER_EL"), Line(), ]) # --------------------------------------------------------------------- def get_load_code(): mb_img_dofs = plan.aligned_image_dofs_per_microblock mb_preimg_dofs = plan.aligned_preimage_dofs_per_microblock preimg_dofs_over_dofs = (mb_preimg_dofs+mb_img_dofs-1) // mb_img_dofs load_code = [] store_code = [] var_num = 0 for load_block in range(preimg_dofs_over_dofs): for inl in range(par.inline): # load and store are split for better pipelining # compiler can't figure that out because of branch var = "tmp%d" % var_num var_num += 1 
load_code.append(POD(float_type, var)) block_addr = "%d * ALIGNED_IMAGE_DOFS_PER_MB + IMAGE_MB_DOF" % load_block load_instr = Assign(var, "field[GLOBAL_MB_PREIMAGE_DOF_BASE" " + %d*ALIGNED_PREIMAGE_DOFS_PER_MB" " + %s]" % (inl, block_addr)) store_instr = Assign( "smem_field[PAR_MB_NR][%d][%s]" % (inl, block_addr), var ) if (load_block+1)*mb_img_dofs >= mb_preimg_dofs: cond = "%s < ALIGNED_PREIMAGE_DOFS_PER_MB" % block_addr load_instr = If(cond, load_instr) store_instr = If(cond, store_instr) load_code.append(load_instr) store_code.append(store_instr) return Block(load_code + [Line()] + store_code) def get_scalar_diff_code(): code = [] for inl in range(par.inline): for axis in dims: code.append( Initializer(POD(float_type, "d%drst%d" % (inl, axis)), 0)) code.append(Line()) tex_channels = ["x", "y", "z", "w"] store_code = Block() for inl in range(par.inline): for rst_axis in dims: store_code.append(Assign( "drst%d_global[GLOBAL_MB_IMAGE_DOF_BASE + " "%d*ALIGNED_IMAGE_DOFS_PER_MB + IMAGE_MB_DOF]" % (rst_axis, inl), "d%drst%d" % (inl, rst_axis) )) from hedge.backends.cuda.tools import unroll code.extend([ Comment("everybody needs to be done with the old data"), S("__syncthreads()"), Line(), get_load_code(), Line(), Comment("all the new data must be loaded"), S("__syncthreads()"), Line(), ]) if float_type == numpy.float32: code.append(Value("float%d" % rst_channels, "dmat_entries")) code.extend([ POD(float_type, "field_value%d" % inl) for inl in range(par.inline) ]+[Line()]) def unroll_body(j): result = [ Assign("field_value%d" % inl, "smem_field[PAR_MB_NR][%d][mb_el*PREIMAGE_DOFS_PER_EL+%s]" % (inl, j)) for inl in range(par.inline) ] if float_type == numpy.float32: result.append(Assign("dmat_entries", "tex1Dfetch(diff_rst_mat_tex, IMAGE_EL_DOF + %s*IMAGE_DOFS_PER_EL)" % j)) result.extend( S("d%drst%d += dmat_entries.%s * field_value%d" % (inl, axis, tex_channels[axis], inl)) for inl in range(par.inline) for axis in dims) elif float_type == numpy.float64: result.extend( S("d%(inl)drst%(axis)d += " "fp_tex1Dfetch(diff_rst_mat_tex, %(axis)d " "+ DIMENSIONS*(IMAGE_EL_DOF + %(j)d*IMAGE_DOFS_PER_EL))" "* field_value%(inl)d" % { "inl": inl, "axis": axis, "j": j }) for inl in range(par.inline) for axis in dims) else: assert False return result code.append(If("IMAGE_MB_DOF < IMAGE_DOFS_PER_MB", Block(unroll(unroll_body, total_number=plan.preimage_dofs_per_el) +[store_code]))) return code f_body.extend([ For("unsigned short seq_mb_number = 0", "seq_mb_number < SEQ_MB_COUNT", "++seq_mb_number", Block(get_scalar_diff_code()) ) ]) # finish off ---------------------------------------------------------- cmod.append(FunctionBody(f_decl, f_body)) if not for_benchmark and "cuda_dump_kernels" in discr.debug: from hedge.tools import open_unique_debug_file open_unique_debug_file("diff", ".cu").write(str(cmod)) mod = SourceModule(cmod, keep="cuda_keep_kernels" in discr.debug, #options=["--maxrregcount=16"] ) func = mod.get_function("apply_diff_mat_smem") if "cuda_diff" in discr.debug: print "diff: lmem=%d smem=%d regs=%d" % ( func.local_size_bytes, func.shared_size_bytes, func.registers) diff_rst_mat_texref = mod.get_texref("diff_rst_mat_tex") gpu_diffmats = self.gpu_diffmats(diff_op, elgroup) if given.float_type == numpy.float32: gpu_diffmats.bind_to_texref_ext(diff_rst_mat_texref, rst_channels) elif given.float_type == numpy.float64: gpu_diffmats.bind_to_texref_ext(diff_rst_mat_texref, allow_double_hack=True) else: assert False assert given.microblock.aligned_floats % chunk_size == 0 block = ( chunk_size, 
plan.parallelism.parallel, given.microblock.aligned_floats//chunk_size) func.prepare( ["PP"] + discr.dimensions*["P"], texrefs=[diff_rst_mat_texref]) return block, func
def _init_lr_correction_libs(self): sph_gen = self.sph_gen def _re_lm(l, m): return l**2 + l + m assign_gen = 'double rhol = charge;\n' for lx in range(self.L): for mx in range(-lx, lx+1): res, ims = sph_gen.get_y_sym(lx, -mx) offset = _re_lm(lx, mx) assign_gen += ''.join(['MULTIPOLE[{}] += {} * rhol;\n'.format(*args) for args in ( (offset, str(res)), (offset + self.L**2, str(ims)) ) ]) res, ims = sph_gen.get_y_sym(lx, mx) assign_gen += ''.join(['DOT_VEC[{}] += {} * rhol;\n'.format(*args) for args in ( (offset, str(res)), (offset + self.L**2, '-1.0 * ' + str(ims)) ) ]) assign_gen += 'rhol *= radius;\n' co = self.solver.il[1] new27direct = 0.0 ex = self.domain.extent for ox in co: # image of old pos dox = np.array((ex[0] * ox[0], ex[1] * ox[1], ex[2] * ox[2])) if ox != (0,0,0): new27direct -= 1.0 / np.linalg.norm(dox) offset_consts = '' bc27 = '' for oxi, ox in enumerate(co): offset_consts += ''' const REAL dox{oxi} = EX * {OX}; const REAL doy{oxi} = EY * {OY}; const REAL doz{oxi} = EZ * {OZ}; '''.format( oxi=str(oxi), OX=str(ox[0]), OY=str(ox[1]), OZ=str(ox[2]), ) bc27 += ''' const REAL dpx{oxi} = dox{oxi} + opx; const REAL dpy{oxi} = doy{oxi} + opy; const REAL dpz{oxi} = doz{oxi} + opz; const REAL ddx{oxi} = dpx{oxi} - npx; const REAL ddy{oxi} = dpy{oxi} - npy; const REAL ddz{oxi} = dpz{oxi} - npz; const REAL o_bbp{oxi} = 1.0 / sqrt(ddx{oxi}*ddx{oxi} + ddy{oxi}*ddy{oxi} + ddz{oxi}*ddz{oxi}); energy27 += o_bbp{oxi}; '''.format( oxi=str(oxi) ) src = r''' namespace LR_SI {{ static inline REAL apply_dipole_correction_split( const REAL * RESTRICT M, const REAL * RESTRICT E ){{ REAL tmp = 0.0; tmp += (DIPOLE_SX * M[RE_1P1]) * E[RE_1P1]; tmp += (DIPOLE_SX * M[RE_1P1]) * E[RE_1N1]; tmp -= (DIPOLE_SY * M[IM_1P1]) * E[IM_1P1]; tmp += (DIPOLE_SY * M[IM_1P1]) * E[IM_1N1]; tmp += (DIPOLE_SZ * M[RE_1_0]) * E[RE_1_0]; return tmp; }} static inline REAL linop_csr_both( const REAL * RESTRICT linop_data, const INT64 * RESTRICT linop_indptr, const INT64 * RESTRICT linop_indices, const REAL * RESTRICT x1, const REAL * RESTRICT E ){{ INT64 data_ind = 0; REAL dot_tmp = 0.0; for(INT64 row=0 ; row<HALF_NCOMP ; row++){{ REAL row_tmp_1 = 0.0; REAL row_tmp_2 = 0.0; for(INT64 col_ind=linop_indptr[row] ; col_ind<linop_indptr[row+1] ; col_ind++){{ const INT64 col = linop_indices[data_ind]; const REAL data = linop_data[data_ind]; data_ind++; row_tmp_1 += data * x1[col]; row_tmp_2 += data * x1[col + HALF_NCOMP]; }} dot_tmp += row_tmp_1 * E[row] + row_tmp_2 * E[row + HALF_NCOMP]; }} return dot_tmp; }} static inline void vector_diff( const REAL dx, const REAL dy, const REAL dz, const REAL charge, REAL * MULTIPOLE, REAL * DOT_VEC ){{ const double xy2 = dx * dx + dy * dy; const double radius = sqrt(xy2 + dz * dz); const double theta = atan2(sqrt(xy2), dz); const double phi = atan2(dy, dx); {SPH_GEN} {ASSIGN_GEN} }} static inline REAL lr_energy_diff( const INT64 accept_flag, const REAL * RESTRICT old_position, const REAL * RESTRICT new_position, const REAL charge, const REAL old_energy, REAL * RESTRICT existing_multipole, REAL * RESTRICT existing_evector, const REAL * RESTRICT linop_data, const INT64 * RESTRICT linop_indptr, const INT64 * RESTRICT linop_indices ){{ REAL mvector[NCOMP]; REAL evector[NCOMP]; // copy the existing vectors for(int ix=0 ; ix<NCOMP ; ix++){{ mvector[ix] = existing_multipole[ix]; evector[ix] = existing_evector[ix]; }} // remove the old contribution vector_diff(old_position[0], old_position[1], old_position[2], -1.0 * charge, mvector, evector); // add the new contribution vector_diff(new_position[0], 
new_position[1], new_position[2], charge, mvector, evector); // cheap way to reuse this code for accepts if (accept_flag > 0){{ for(int ix=0 ; ix<NCOMP ; ix++){{ existing_multipole[ix] = mvector[ix]; existing_evector[ix] = evector[ix]; }} }} // apply the long range linear operator and get the new energy (minus dipole correction) REAL new_energy = 0.5 * linop_csr_both( linop_data, linop_indptr, linop_indices, mvector, evector ); // add the dipole correction new_energy += 0.5 * apply_dipole_correction_split( mvector, evector ); return new_energy - old_energy; }} static inline REAL self_contributon( const REAL * RESTRICT old_position, const REAL * RESTRICT new_position, const REAL charge ){{ const REAL opx = old_position[0]; const REAL opy = old_position[1]; const REAL opz = old_position[2]; const REAL npx = new_position[0]; const REAL npy = new_position[1]; const REAL npz = new_position[2]; {OFFSET_CONSTS} REAL energy27 = (DOMAIN_27_ENERGY); {BC27} return energy27 * charge * charge; }} int lr_self_interaction_inner( const INT64 accept_flag, const REAL * RESTRICT old_position, const REAL * RESTRICT new_position, const REAL charge, const REAL old_energy, REAL * RESTRICT existing_multipole, REAL * RESTRICT existing_evector, const REAL * RESTRICT linop_data, const INT64 * RESTRICT linop_indptr, const INT64 * RESTRICT linop_indices, REAL * RESTRICT return_energy, REAL * RESTRICT TIME_TAKEN ){{ std::chrono::high_resolution_clock::time_point _loop_timer_t0 = std::chrono::high_resolution_clock::now(); REAL tmpu0 = lr_energy_diff(accept_flag, old_position, new_position, charge, old_energy, existing_multipole, existing_evector, linop_data, linop_indptr, linop_indices); REAL tmpu1 = (accept_flag < 1) ? self_contributon(old_position, new_position, charge) : 0.0 ; *return_energy = -1.0 * tmpu0 + tmpu1; std::chrono::high_resolution_clock::time_point _loop_timer_t1 = std::chrono::high_resolution_clock::now(); std::chrono::duration<double> _loop_timer_res = _loop_timer_t1 - _loop_timer_t0; *TIME_TAKEN = (double) _loop_timer_res.count(); return 0; }} }} extern "C" int lr_self_interaction( const INT64 accept_flag, const REAL * RESTRICT old_position, const REAL * RESTRICT new_position, const REAL charge, const REAL old_energy, REAL * RESTRICT existing_multipole, REAL * RESTRICT existing_evector, const REAL * RESTRICT linop_data, const INT64 * RESTRICT linop_indptr, const INT64 * RESTRICT linop_indices, REAL * RESTRICT return_energy, REAL * RESTRICT TIME_TAKEN ) {{ return LR_SI::lr_self_interaction_inner( accept_flag, old_position, new_position, charge, old_energy, existing_multipole, existing_evector, linop_data, linop_indptr, linop_indices, return_energy, TIME_TAKEN ); }} '''.format( SPH_GEN=str(sph_gen.module), ASSIGN_GEN=str(assign_gen), OFFSET_CONSTS=str(offset_consts), BC27=str(bc27) ) header = str( Module(( Include('omp.h'), Include('stdio.h'), Include('math.h'), Include('chrono'), Define('INT64', 'int64_t'), Define('REAL', 'double'), Define('NCOMP', str(self.ncomp)), Define('HALF_NCOMP', str(self.L**2)), Define('DIPOLE_SX', str(self.lrc.dipole_correction[0])), Define('DIPOLE_SY', str(self.lrc.dipole_correction[1])), Define('DIPOLE_SZ', str(self.lrc.dipole_correction[2])), Define('RE_1P1', str(_re_lm(1, 1))), Define('RE_1_0', str(_re_lm(1, 0))), Define('RE_1N1', str(_re_lm(1,-1))), Define('IM_1P1', str(_re_lm(1, 1) + self.L**2)), Define('IM_1_0', str(_re_lm(1, 0) + self.L**2)), Define('DOMAIN_27_ENERGY', str(new27direct)), Define('IM_1N1', str(_re_lm(1,-1) + self.L**2)), Define('EX', 
self.domain.extent[0]), Define('EY', self.domain.extent[1]), Define('EZ', self.domain.extent[2]), )) ) header_post = ''' #undef NCOMP #undef HALF_NCOMP #undef DIPOLE_SX #undef DIPOLE_SY #undef DIPOLE_SZ #undef RE_1P1 #undef RE_1_0 #undef RE_1N1 #undef IM_1P1 #undef IM_1_0 #undef DOMAIN_27_ENERGY #undef IM_1N1 #undef EX #undef EY #undef EZ ''' src = header + src + header_post self._lr_si_lib = lib.build.simple_lib_creator(header_code='', src_code=src)['lr_self_interaction'] self.lib_sl_source = src self.lib_sl_parameters = [ 'const INT64 LR_SI_accept_flag', 'const REAL * RESTRICT LR_SI_old_position', 'const REAL * RESTRICT LR_SI_new_position', 'const REAL LR_SI_charge', 'const REAL LR_SI_old_energy', ' REAL * RESTRICT LR_SI_existing_multipole', ' REAL * RESTRICT LR_SI_existing_evector', 'const REAL * RESTRICT LR_SI_linop_data', 'const INT64 * RESTRICT LR_SI_linop_indptr', 'const INT64 * RESTRICT LR_SI_linop_indices', ' REAL * RESTRICT LR_SI_return_energy', ' REAL * RESTRICT TIME_TAKEN', ] self.lib_sl_call = ''' LR_SI::lr_self_interaction_inner( LR_SI_accept_flag, LR_SI_old_position, LR_SI_new_position, LR_SI_charge, LR_SI_old_energy, LR_SI_existing_multipole, LR_SI_existing_evector, LR_SI_linop_data, LR_SI_linop_indptr, LR_SI_linop_indices, LR_SI_return_energy, TIME_TAKEN ); ''' if not (self.boundary_condition == BCType.PBC): self.lib_sl_parameters = [] self.lib_sl_call = ''
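
# Hedged call sketch (not part of the class above): PPMD's simple_lib_creator
# returns a plain ctypes function, so the extern "C" entry point built above
# can be driven with scalars wrapped in ctypes and contiguous float64/int64
# numpy arrays passed as pointers.  The calling convention and all names here
# are illustrative assumptions.
import ctypes
import numpy as np

def _lr_si_energy_diff(lr_si_lib, old_pos, new_pos, charge, old_energy,
        multipole, evector, linop_data, linop_indptr, linop_indices,
        accept_flag=0):
    dbl_p = ctypes.POINTER(ctypes.c_double)
    int_p = ctypes.POINTER(ctypes.c_int64)
    return_energy = np.zeros(1, dtype=np.float64)
    time_taken = np.zeros(1, dtype=np.float64)
    lr_si_lib(
        ctypes.c_int64(accept_flag),
        old_pos.ctypes.data_as(dbl_p),
        new_pos.ctypes.data_as(dbl_p),
        ctypes.c_double(charge),
        ctypes.c_double(old_energy),
        multipole.ctypes.data_as(dbl_p),
        evector.ctypes.data_as(dbl_p),
        linop_data.ctypes.data_as(dbl_p),
        linop_indptr.ctypes.data_as(int_p),
        linop_indices.ctypes.data_as(int_p),
        return_energy.ctypes.data_as(dbl_p),
        time_taken.ctypes.data_as(dbl_p),
    )
    return return_energy[0]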
def get_kernel(self, fdata, ilist_data, for_benchmark): from cgen.cuda import CudaShared, CudaGlobal from pycuda.tools import dtype_to_ctype discr = self.discr given = self.plan.given fplan = self.plan d = discr.dimensions dims = range(d) elgroup, = discr.element_groups float_type = given.float_type f_decl = CudaGlobal( FunctionDeclaration(Value("void", "apply_flux"), [ Pointer(POD(float_type, "debugbuf")), Pointer(POD(numpy.uint8, "gmem_facedata")), ] + [ Pointer(POD(float_type, "gmem_fluxes_on_faces%d" % flux_nr)) for flux_nr in range(len(self.fluxes)) ])) cmod = Module() cmod.append(Include("pycuda-helpers.hpp")) for dep_expr in self.all_deps: cmod.extend([ Value( "texture<%s, 1, cudaReadModeElementType>" % dtype_to_ctype(float_type, with_fp_tex_hack=True), "field%d_tex" % self.dep_to_index[dep_expr]) ]) if fplan.flux_count != len(self.fluxes): from warnings import warn warn( "Flux count in flux execution plan different from actual flux count.\n" "You may want to specify the tune_for= kwarg in the Discretization\n" "constructor.") cmod.extend([ Line(), Typedef(POD(float_type, "value_type")), Line(), flux_header_struct(float_type, discr.dimensions), Line(), face_pair_struct(float_type, discr.dimensions), Line(), Define("DIMENSIONS", discr.dimensions), Define("DOFS_PER_FACE", fplan.dofs_per_face), Define("THREADS_PER_FACE", fplan.threads_per_face()), Line(), Define("CONCURRENT_FACES", fplan.parallel_faces), Define("BLOCK_MB_COUNT", fplan.mbs_per_block), Line(), Define("FACEDOF_NR", "threadIdx.x"), Define("BLOCK_FACE", "threadIdx.y"), Line(), Define("FLUX_COUNT", len(self.fluxes)), Line(), Define("THREAD_NUM", "(FACEDOF_NR + BLOCK_FACE*THREADS_PER_FACE)"), Define("THREAD_COUNT", "(THREADS_PER_FACE*CONCURRENT_FACES)"), Define( "COALESCING_THREAD_COUNT", "(THREAD_COUNT < 0x10 ? 
THREAD_COUNT : THREAD_COUNT & ~0xf)"), Line(), Define("DATA_BLOCK_SIZE", fdata.block_bytes), Define("ALIGNED_FACE_DOFS_PER_MB", fplan.aligned_face_dofs_per_microblock()), Define("ALIGNED_FACE_DOFS_PER_BLOCK", "(ALIGNED_FACE_DOFS_PER_MB*BLOCK_MB_COUNT)"), Line(), Define("FOF_BLOCK_BASE", "(blockIdx.x*ALIGNED_FACE_DOFS_PER_BLOCK)"), Line(), ] + ilist_data.code + [ Line(), Value("texture<index_list_entry_t, 1, cudaReadModeElementType>", "tex_index_lists"), Line(), fdata.struct, Line(), CudaShared(Value("flux_data", "data")), ]) if not fplan.direct_store: cmod.extend([ CudaShared( ArrayOf( ArrayOf(POD(float_type, "smem_fluxes_on_faces"), "FLUX_COUNT"), "ALIGNED_FACE_DOFS_PER_MB*BLOCK_MB_COUNT")), Line(), ]) S = Statement f_body = Block() from hedge.backends.cuda.tools import get_load_code f_body.extend( get_load_code(dest="&data", base="gmem_facedata + blockIdx.x*DATA_BLOCK_SIZE", bytes="sizeof(flux_data)", descr="load face_pair data") + [S("__syncthreads()"), Line()]) def get_flux_code(flux_writer): flux_code = Block([]) flux_code.extend([ Initializer(Pointer(Value("face_pair", "fpair")), "data.facepairs+fpair_nr"), Initializer( MaybeUnused(POD(numpy.uint32, "a_index")), "fpair->a_base + tex1Dfetch(tex_index_lists, " "fpair->a_ilist_index + FACEDOF_NR)"), Initializer( MaybeUnused(POD(numpy.uint32, "b_index")), "fpair->b_base + tex1Dfetch(tex_index_lists, " "fpair->b_ilist_index + FACEDOF_NR)"), Line(), flux_writer(), Line(), S("fpair_nr += CONCURRENT_FACES") ]) return flux_code flux_computation = Block([ Comment("fluxes for dual-sided (intra-block) interior face pairs"), While("fpair_nr < data.header.same_facepairs_end", get_flux_code(lambda: self.write_interior_flux_code(True))), Line(), Comment("work around nvcc assertion failure"), S("fpair_nr+=1"), S("fpair_nr-=1"), Line(), Comment( "fluxes for single-sided (inter-block) interior face pairs"), While("fpair_nr < data.header.diff_facepairs_end", get_flux_code(lambda: self.write_interior_flux_code(False))), Line(), Comment("fluxes for single-sided boundary face pairs"), While( "fpair_nr < data.header.bdry_facepairs_end", get_flux_code( lambda: self.write_boundary_flux_code(for_benchmark))), ]) f_body.extend_log_block("compute the fluxes", [ Initializer(POD(numpy.uint32, "fpair_nr"), "BLOCK_FACE"), If("FACEDOF_NR < DOFS_PER_FACE", flux_computation) ]) if not fplan.direct_store: f_body.extend([Line(), S("__syncthreads()"), Line()]) f_body.extend_log_block( "store fluxes", [ #Assign("debugbuf[blockIdx.x]", "FOF_BLOCK_BASE"), #Assign("debugbuf[0]", "FOF_BLOCK_BASE"), #Assign("debugbuf[0]", "sizeof(face_pair)"), For( "unsigned word_nr = THREAD_NUM", "word_nr < ALIGNED_FACE_DOFS_PER_MB*BLOCK_MB_COUNT", "word_nr += COALESCING_THREAD_COUNT", Block([ Assign( "gmem_fluxes_on_faces%d[FOF_BLOCK_BASE+word_nr]" % flux_nr, "smem_fluxes_on_faces[%d][word_nr]" % flux_nr) for flux_nr in range(len(self.fluxes)) ] #+[If("isnan(smem_fluxes_on_faces[%d][word_nr])" % flux_nr, #Block([ #Assign("debugbuf[blockIdx.x]", "word_nr"), #]) #) #for flux_nr in range(len(self.fluxes))] )) ]) if False: f_body.extend([ Assign("debugbuf[blockIdx.x*96+32+BLOCK_FACE*32+threadIdx.x]", "fpair_nr"), Assign("debugbuf[blockIdx.x*96+16]", "data.header.same_facepairs_end"), Assign("debugbuf[blockIdx.x*96+17]", "data.header.diff_facepairs_end"), Assign("debugbuf[blockIdx.x*96+18]", "data.header.bdry_facepairs_end"), ]) # finish off ---------------------------------------------------------- cmod.append(FunctionBody(f_decl, f_body)) if not for_benchmark and "cuda_dump_kernels" in 
discr.debug: from hedge.tools import open_unique_debug_file open_unique_debug_file("flux_gather", ".cu").write(str(cmod)) #from pycuda.tools import allow_user_edit mod = SourceModule( #allow_user_edit(cmod, "kernel.cu", "the flux kernel"), cmod, keep="cuda_keep_kernels" in discr.debug) expr_to_texture_map = dict( (dep_expr, mod.get_texref("field%d_tex" % self.dep_to_index[dep_expr])) for dep_expr in self.all_deps) index_list_texref = mod.get_texref("tex_index_lists") index_list_texref.set_address(ilist_data.device_memory, ilist_data.bytes) index_list_texref.set_format( cuda.dtype_to_array_format(ilist_data.type), 1) index_list_texref.set_flags(cuda.TRSF_READ_AS_INTEGER) func = mod.get_function("apply_flux") block = (fplan.threads_per_face(), fplan.parallel_faces, 1) func.prepare( (2 + len(self.fluxes)) * "P", texrefs=expr_to_texture_map.values() + [index_list_texref]) if "cuda_flux" in discr.debug: print "flux: lmem=%d smem=%d regs=%d" % ( func.local_size_bytes, func.shared_size_bytes, func.num_regs) return block, func, expr_to_texture_map
def get_kernel(self, diff_op_cls, elgroup, for_benchmark=False): from cgen import \ Pointer, POD, Value, ArrayOf, \ Module, FunctionDeclaration, FunctionBody, Block, \ Line, Define, Include, \ Initializer, If, For, Statement, Assign from cgen import dtype_to_ctype from cgen.cuda import CudaShared, CudaGlobal discr = self.discr d = discr.dimensions dims = range(d) given = self.plan.given par = self.plan.parallelism diffmat_data = self.gpu_diffmats(diff_op_cls, elgroup) elgroup, = discr.element_groups float_type = given.float_type f_decl = CudaGlobal(FunctionDeclaration(Value("void", "apply_diff_mat"), [Pointer(POD(numpy.uint8, "gmem_diff_rst_mat")), #Pointer(POD(float_type, "debugbuf")), ] + [Pointer(POD(float_type, "drst%d_global" % i)) for i in dims] )) rst_channels = given.devdata.make_valid_tex_channel_count(d) cmod = Module([ Include("pycuda-helpers.hpp"), Line(), Value("texture<fp_tex_%s, 1, cudaReadModeElementType>" % dtype_to_ctype(float_type), "field_tex"), Line(), Define("DIMENSIONS", discr.dimensions), Define("DOFS_PER_EL", given.dofs_per_el()), Line(), Define("SEGMENT_DOF", "threadIdx.x"), Define("PAR_MB_NR", "threadIdx.y"), Line(), Define("MB_SEGMENT", "blockIdx.x"), Define("MACROBLOCK_NR", "blockIdx.y"), Line(), Define("DOFS_PER_SEGMENT", self.plan.segment_size), Define("SEGMENTS_PER_MB", self.plan.segments_per_microblock()), Define("ALIGNED_DOFS_PER_MB", given.microblock.aligned_floats), Define("ELS_PER_MB", given.microblock.elements), Line(), Define("PAR_MB_COUNT", par.parallel), Define("INLINE_MB_COUNT", par.inline), Define("SEQ_MB_COUNT", par.serial), Line(), Define("THREAD_NUM", "(SEGMENT_DOF+PAR_MB_NR*DOFS_PER_SEGMENT)"), Define("COALESCING_THREAD_COUNT", "(PAR_MB_COUNT*DOFS_PER_SEGMENT)"), Line(), Define("MB_DOF_BASE", "(MB_SEGMENT*DOFS_PER_SEGMENT)"), Define("MB_DOF", "(MB_DOF_BASE+SEGMENT_DOF)"), Define("GLOBAL_MB_NR_BASE", "(MACROBLOCK_NR*PAR_MB_COUNT*INLINE_MB_COUNT*SEQ_MB_COUNT)"), Define("GLOBAL_MB_NR", "(GLOBAL_MB_NR_BASE" "+ (seq_mb_number*PAR_MB_COUNT + PAR_MB_NR)*INLINE_MB_COUNT)"), Define("GLOBAL_MB_DOF_BASE", "(GLOBAL_MB_NR*ALIGNED_DOFS_PER_MB)"), Line(), Define("DIFFMAT_SEGMENT_FLOATS", diffmat_data.block_floats), Define("DIFFMAT_SEGMENT_BYTES", "(DIFFMAT_SEGMENT_FLOATS*%d)" % given.float_size()), Define("DIFFMAT_COLUMNS", diffmat_data.matrix_columns), Line(), CudaShared(ArrayOf(POD(float_type, "smem_diff_rst_mat"), "DIFFMAT_COLUMNS*DOFS_PER_SEGMENT")), Line(), ]) S = Statement f_body = Block() f_body.extend_log_block("calculate responsibility data", [ Initializer(POD(numpy.uint16, "mb_el"), "MB_DOF/DOFS_PER_EL"), ]) from hedge.backends.cuda.tools import get_load_code f_body.extend( get_load_code( dest="smem_diff_rst_mat", base="gmem_diff_rst_mat + MB_SEGMENT*DIFFMAT_SEGMENT_BYTES", bytes="DIFFMAT_SEGMENT_BYTES", descr="load diff mat segment") +[S("__syncthreads()"), Line()]) # --------------------------------------------------------------------- def get_scalar_diff_code(): code = [] for inl in range(par.inline): for axis in dims: code.append( Initializer(POD(float_type, "d%drst%d" % (inl, axis)), 0)) code.append(Line()) def get_mat_entry(row, col, axis): return ("smem_diff_rst_mat[" "%(row)s*DIFFMAT_COLUMNS + %(axis)s*DOFS_PER_EL" " + %(col)s" "]" % {"row":row, "col":col, "axis":axis} ) tex_channels = ["x", "y", "z", "w"] from hedge.backends.cuda.tools import unroll code.extend( [POD(float_type, "field_value%d" % inl) for inl in range(par.inline)] +[Line()] +unroll(lambda j: [ Assign("field_value%d" % inl, "fp_tex1Dfetch(field_tex, GLOBAL_MB_DOF_BASE + 
%d*ALIGNED_DOFS_PER_MB " "+ mb_el*DOFS_PER_EL + %s)" % (inl, j) ) for inl in range(par.inline)] +[Line()] +[S("d%drst%d += %s * field_value%d" % (inl, axis, get_mat_entry("SEGMENT_DOF", j, axis), inl)) for axis in dims for inl in range(par.inline)] +[Line()], given.dofs_per_el(), self.plan.max_unroll) ) store_code = Block() for inl in range(par.inline): for rst_axis in dims: store_code.append(Assign( "drst%d_global[GLOBAL_MB_DOF_BASE" " + %d*ALIGNED_DOFS_PER_MB + MB_DOF]" % (rst_axis, inl), "d%drst%d" % (inl, rst_axis), )) code.append(If("MB_DOF < DOFS_PER_EL*ELS_PER_MB", store_code)) return code f_body.extend([ For("unsigned short seq_mb_number = 0", "seq_mb_number < SEQ_MB_COUNT", "++seq_mb_number", Block(get_scalar_diff_code())) ]) # finish off ---------------------------------------------------------- cmod.append(FunctionBody(f_decl, f_body)) if not for_benchmark and "cuda_dump_kernels" in discr.debug: from hedge.tools import open_unique_debug_file open_unique_debug_file("diff", ".cu").write(str(cmod)) mod = SourceModule(cmod, keep="cuda_keep_kernels" in discr.debug, #options=["--maxrregcount=10"] ) field_texref = mod.get_texref("field_tex") func = mod.get_function("apply_diff_mat") func.prepare( discr.dimensions*[float_type] + ["P"], block=(self.plan.segment_size, par.parallel, 1), texrefs=[field_texref]) if "cuda_diff" in discr.debug: print("diff: lmem=%d smem=%d regs=%d" % ( func.local_size_bytes, func.shared_size_bytes, func.num_regs)) return func, field_texref
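# Worked example for the indexing macros generated above (hypothetical plan
# values, for illustration only): with DOFS_PER_SEGMENT=16, PAR_MB_COUNT=2,
# INLINE_MB_COUNT=2, SEQ_MB_COUNT=2 and ALIGNED_DOFS_PER_MB=128, the thread at
# threadIdx=(3,1) in block blockIdx=(5,4), during sequential pass
# seq_mb_number=1, works on
#   MB_DOF             = 5*16 + 3          = 83
#   GLOBAL_MB_NR_BASE  = 4*2*2*2           = 32
#   GLOBAL_MB_NR       = 32 + (1*2 + 1)*2  = 38
#   GLOBAL_MB_DOF_BASE = 38*128            = 4864
# so its results are stored at drst*_global[4864 + inl*128 + 83] for each
# inlined microblock inl.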
def get_kernel(self, diff_op_cls, elgroup, for_benchmark=False): from cgen import \ Pointer, POD, Value, ArrayOf, \ Module, FunctionDeclaration, FunctionBody, Block, \ Line, Define, Include, \ Initializer, If, For, Statement, Assign from cgen import dtype_to_ctype from cgen.cuda import CudaShared, CudaGlobal discr = self.discr d = discr.dimensions dims = range(d) given = self.plan.given par = self.plan.parallelism diffmat_data = self.gpu_diffmats(diff_op_cls, elgroup) elgroup, = discr.element_groups float_type = given.float_type f_decl = CudaGlobal( FunctionDeclaration( Value("void", "apply_diff_mat"), [ Pointer(POD(numpy.uint8, "gmem_diff_rst_mat")), #Pointer(POD(float_type, "debugbuf")), ] + [Pointer(POD(float_type, "drst%d_global" % i)) for i in dims])) rst_channels = given.devdata.make_valid_tex_channel_count(d) cmod = Module([ Include("pycuda-helpers.hpp"), Line(), Value( "texture<fp_tex_%s, 1, cudaReadModeElementType>" % dtype_to_ctype(float_type), "field_tex"), Line(), Define("DIMENSIONS", discr.dimensions), Define("DOFS_PER_EL", given.dofs_per_el()), Line(), Define("SEGMENT_DOF", "threadIdx.x"), Define("PAR_MB_NR", "threadIdx.y"), Line(), Define("MB_SEGMENT", "blockIdx.x"), Define("MACROBLOCK_NR", "blockIdx.y"), Line(), Define("DOFS_PER_SEGMENT", self.plan.segment_size), Define("SEGMENTS_PER_MB", self.plan.segments_per_microblock()), Define("ALIGNED_DOFS_PER_MB", given.microblock.aligned_floats), Define("ELS_PER_MB", given.microblock.elements), Line(), Define("PAR_MB_COUNT", par.parallel), Define("INLINE_MB_COUNT", par.inline), Define("SEQ_MB_COUNT", par.serial), Line(), Define("THREAD_NUM", "(SEGMENT_DOF+PAR_MB_NR*DOFS_PER_SEGMENT)"), Define("COALESCING_THREAD_COUNT", "(PAR_MB_COUNT*DOFS_PER_SEGMENT)"), Line(), Define("MB_DOF_BASE", "(MB_SEGMENT*DOFS_PER_SEGMENT)"), Define("MB_DOF", "(MB_DOF_BASE+SEGMENT_DOF)"), Define( "GLOBAL_MB_NR_BASE", "(MACROBLOCK_NR*PAR_MB_COUNT*INLINE_MB_COUNT*SEQ_MB_COUNT)"), Define( "GLOBAL_MB_NR", "(GLOBAL_MB_NR_BASE" "+ (seq_mb_number*PAR_MB_COUNT + PAR_MB_NR)*INLINE_MB_COUNT)"), Define("GLOBAL_MB_DOF_BASE", "(GLOBAL_MB_NR*ALIGNED_DOFS_PER_MB)"), Line(), Define("DIFFMAT_SEGMENT_FLOATS", diffmat_data.block_floats), Define("DIFFMAT_SEGMENT_BYTES", "(DIFFMAT_SEGMENT_FLOATS*%d)" % given.float_size()), Define("DIFFMAT_COLUMNS", diffmat_data.matrix_columns), Line(), CudaShared( ArrayOf(POD(float_type, "smem_diff_rst_mat"), "DIFFMAT_COLUMNS*DOFS_PER_SEGMENT")), Line(), ]) S = Statement f_body = Block() f_body.extend_log_block("calculate responsibility data", [ Initializer(POD(numpy.uint16, "mb_el"), "MB_DOF/DOFS_PER_EL"), ]) from hedge.backends.cuda.tools import get_load_code f_body.extend( get_load_code( dest="smem_diff_rst_mat", base="gmem_diff_rst_mat + MB_SEGMENT*DIFFMAT_SEGMENT_BYTES", bytes="DIFFMAT_SEGMENT_BYTES", descr="load diff mat segment") + [S("__syncthreads()"), Line()]) # --------------------------------------------------------------------- def get_scalar_diff_code(): code = [] for inl in range(par.inline): for axis in dims: code.append( Initializer(POD(float_type, "d%drst%d" % (inl, axis)), 0)) code.append(Line()) def get_mat_entry(row, col, axis): return ("smem_diff_rst_mat[" "%(row)s*DIFFMAT_COLUMNS + %(axis)s*DOFS_PER_EL" " + %(col)s" "]" % { "row": row, "col": col, "axis": axis }) tex_channels = ["x", "y", "z", "w"] from hedge.backends.cuda.tools import unroll code.extend([ POD(float_type, "field_value%d" % inl) for inl in range(par.inline) ] + [Line()] + unroll( lambda j: [ Assign( "field_value%d" % inl, "fp_tex1Dfetch(field_tex, 
GLOBAL_MB_DOF_BASE + %d*ALIGNED_DOFS_PER_MB " "+ mb_el*DOFS_PER_EL + %s)" % (inl, j)) for inl in range(par.inline) ] + [Line()] + [ S("d%drst%d += %s * field_value%d" % (inl, axis, get_mat_entry("SEGMENT_DOF", j, axis), inl)) for axis in dims for inl in range(par.inline) ] + [Line()], given.dofs_per_el(), self.plan.max_unroll)) store_code = Block() for inl in range(par.inline): for rst_axis in dims: store_code.append( Assign( "drst%d_global[GLOBAL_MB_DOF_BASE" " + %d*ALIGNED_DOFS_PER_MB + MB_DOF]" % (rst_axis, inl), "d%drst%d" % (inl, rst_axis), )) code.append(If("MB_DOF < DOFS_PER_EL*ELS_PER_MB", store_code)) return code f_body.extend([ For("unsigned short seq_mb_number = 0", "seq_mb_number < SEQ_MB_COUNT", "++seq_mb_number", Block(get_scalar_diff_code())) ]) # finish off ---------------------------------------------------------- cmod.append(FunctionBody(f_decl, f_body)) if not for_benchmark and "cuda_dump_kernels" in discr.debug: from hedge.tools import open_unique_debug_file open_unique_debug_file("diff", ".cu").write(str(cmod)) mod = SourceModule( cmod, keep="cuda_keep_kernels" in discr.debug, #options=["--maxrregcount=10"] ) field_texref = mod.get_texref("field_tex") func = mod.get_function("apply_diff_mat") func.prepare(discr.dimensions * [float_type] + ["P"], block=(self.plan.segment_size, par.parallel, 1), texrefs=[field_texref]) if "cuda_diff" in discr.debug: print("diff: lmem=%d smem=%d regs=%d" % ( func.local_size_bytes, func.shared_size_bytes, func.num_regs)) return func, field_texref
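# Layout note (illustration only, example numbers are assumptions): the
# get_mat_entry helper above addresses the row-major shared-memory segment as
#   smem_diff_rst_mat[row*DIFFMAT_COLUMNS + axis*DOFS_PER_EL + col],
# so with hypothetical values DIFFMAT_COLUMNS=64 and DOFS_PER_EL=20, the entry
# for row=SEGMENT_DOF=3, axis=1 (the "s" direction), col=7 sits at float
# offset 3*64 + 1*20 + 7 = 219 within the loaded matrix segment.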
def get_kernel(self, with_scaling, for_benchmark=False): from cgen import \ Pointer, POD, Value, ArrayOf, \ Module, FunctionDeclaration, FunctionBody, Block, \ Line, Define, Include, \ Initializer, If, For, Statement, Assign, \ ArrayInitializer from cgen import dtype_to_ctype from cgen.cuda import CudaShared, CudaConstant, CudaGlobal discr = self.discr d = discr.dimensions dims = range(d) given = self.plan.given float_type = given.float_type f_decl = CudaGlobal( FunctionDeclaration(Value("void", "apply_el_local_mat_smem_mat"), [ Pointer(POD(float_type, "out_vector")), Pointer(POD(numpy.uint8, "gmem_matrix")), Pointer(POD(float_type, "debugbuf")), POD(numpy.uint32, "microblock_count"), ])) cmod = Module([ Include("pycuda-helpers.hpp"), Line(), Value( "texture<fp_tex_%s, 1, cudaReadModeElementType>" % dtype_to_ctype(float_type), "in_vector_tex"), ]) if with_scaling: cmod.append( Value( "texture<fp_tex_%s, 1, cudaReadModeElementType>" % dtype_to_ctype(float_type), "scaling_tex"), ) par = self.plan.parallelism cmod.extend([ Line(), Define("DIMENSIONS", discr.dimensions), Define("DOFS_PER_EL", given.dofs_per_el()), Define("PREIMAGE_DOFS_PER_EL", self.plan.preimage_dofs_per_el), Line(), Define("SEGMENT_DOF", "threadIdx.x"), Define("PAR_MB_NR", "threadIdx.y"), Line(), Define("MB_SEGMENT", "blockIdx.x"), Define("MACROBLOCK_NR", "blockIdx.y"), Line(), Define("DOFS_PER_SEGMENT", self.plan.segment_size), Define("SEGMENTS_PER_MB", self.plan.segments_per_microblock()), Define("ALIGNED_DOFS_PER_MB", given.microblock.aligned_floats), Define("ALIGNED_PREIMAGE_DOFS_PER_MB", self.plan.aligned_preimage_dofs_per_microblock), Define("MB_EL_COUNT", given.microblock.elements), Line(), Define("PAR_MB_COUNT", par.parallel), Define("INLINE_MB_COUNT", par.inline), Define("SEQ_MB_COUNT", par.serial), Line(), Define("THREAD_NUM", "(SEGMENT_DOF+PAR_MB_NR*DOFS_PER_SEGMENT)"), Define("COALESCING_THREAD_COUNT", "(PAR_MB_COUNT*DOFS_PER_SEGMENT)"), Line(), Define("MB_DOF_BASE", "(MB_SEGMENT*DOFS_PER_SEGMENT)"), Define("MB_DOF", "(MB_DOF_BASE+SEGMENT_DOF)"), Define( "GLOBAL_MB_NR_BASE", "(MACROBLOCK_NR*PAR_MB_COUNT*INLINE_MB_COUNT*SEQ_MB_COUNT)"), Define( "GLOBAL_MB_NR", "(GLOBAL_MB_NR_BASE" "+ (seq_mb_number*PAR_MB_COUNT + PAR_MB_NR)*INLINE_MB_COUNT)"), Define("GLOBAL_MB_DOF_BASE", "(GLOBAL_MB_NR*ALIGNED_DOFS_PER_MB)"), Define("GLOBAL_MB_PREIMG_DOF_BASE", "(GLOBAL_MB_NR*ALIGNED_PREIMAGE_DOFS_PER_MB)"), Line(), Define("MATRIX_COLUMNS", self.plan.gpu_matrix_columns()), Define("MATRIX_SEGMENT_FLOATS", self.plan.gpu_matrix_block_floats()), Define("MATRIX_SEGMENT_BYTES", "(MATRIX_SEGMENT_FLOATS*%d)" % given.float_size()), Line(), CudaShared( ArrayOf(POD(float_type, "smem_matrix"), "MATRIX_SEGMENT_FLOATS")), CudaShared( ArrayOf( ArrayOf( ArrayOf(POD(float_type, "dof_buffer"), "PAR_MB_COUNT"), "INLINE_MB_COUNT"), "DOFS_PER_SEGMENT"), ), CudaShared(POD(numpy.uint16, "segment_start_el")), CudaShared(POD(numpy.uint16, "segment_stop_el")), CudaShared(POD(numpy.uint16, "segment_el_count")), Line(), ArrayInitializer( CudaConstant( ArrayOf(POD(numpy.uint32, "segment_start_el_lookup"), "SEGMENTS_PER_MB")), [(chk * self.plan.segment_size) // given.dofs_per_el() for chk in range(self.plan.segments_per_microblock())]), ArrayInitializer( CudaConstant( ArrayOf(POD(numpy.uint32, "segment_stop_el_lookup"), "SEGMENTS_PER_MB")), [ min(given.microblock.elements, (chk * self.plan.segment_size + self.plan.segment_size - 1) // given.dofs_per_el() + 1) for chk in range(self.plan.segments_per_microblock()) ]), ]) S = Statement f_body = Block() 
f_body.extend_log_block( "calculate this dof's element", [Initializer(POD(numpy.uint8, "mb_el"), "MB_DOF/DOFS_PER_EL")]) if self.plan.use_prefetch_branch: f_body.extend_log_block("calculate segment responsibility data", [ If( "THREAD_NUM==0", Block([ Assign("segment_start_el", "segment_start_el_lookup[MB_SEGMENT]"), Assign("segment_stop_el", "segment_stop_el_lookup[MB_SEGMENT]"), Assign("segment_el_count", "segment_stop_el-segment_start_el"), ])), S("__syncthreads()") ]) from hedge.backends.cuda.tools import get_load_code f_body.extend( get_load_code(dest="smem_matrix", base=( "gmem_matrix + MB_SEGMENT*MATRIX_SEGMENT_BYTES"), bytes="MATRIX_SEGMENT_BYTES", descr="load matrix segment") + [S("__syncthreads()")]) # --------------------------------------------------------------------- def get_batched_fetch_mat_mul_code(el_fetch_count): result = [] dofs = range(self.plan.preimage_dofs_per_el) for load_segment_start in range(0, self.plan.preimage_dofs_per_el, self.plan.segment_size): result.extend([S("__syncthreads()")] + [ Assign( "dof_buffer[PAR_MB_NR][%d][SEGMENT_DOF]" % inl, "fp_tex1Dfetch(in_vector_tex, " "GLOBAL_MB_PREIMG_DOF_BASE" " + %d*ALIGNED_PREIMAGE_DOFS_PER_MB" " + (segment_start_el)*PREIMAGE_DOFS_PER_EL + %d + SEGMENT_DOF)" % (inl, load_segment_start)) for inl in range(par.inline) ] + [ S("__syncthreads()"), Line(), ]) for dof in dofs[load_segment_start:load_segment_start + self.plan.segment_size]: for inl in range(par.inline): result.append( S("result%d += " "smem_matrix[SEGMENT_DOF*MATRIX_COLUMNS + %d]" "*" "dof_buffer[PAR_MB_NR][%d][%d]" % (inl, dof, inl, dof - load_segment_start))) result.append(Line()) return result from hedge.backends.cuda.tools import unroll def get_direct_tex_mat_mul_code(): return ( [POD(float_type, "fof%d" % inl) for inl in range(par.inline)] + [POD(float_type, "lm"), Line()] + unroll( lambda j: [ Assign( "fof%d" % inl, "fp_tex1Dfetch(in_vector_tex, " "GLOBAL_MB_PREIMG_DOF_BASE" " + %(inl)d * ALIGNED_PREIMAGE_DOFS_PER_MB" " + mb_el*PREIMAGE_DOFS_PER_EL+%(j)s)" % { "j": j, "inl": inl, "row": "SEGMENT_DOF" }, ) for inl in range(par.inline) ] + [ Assign( "lm", "smem_matrix[" "%(row)s*MATRIX_COLUMNS + %(j)s]" % { "j": j, "row": "SEGMENT_DOF" }, ) ] + [ S("result%(inl)d += fof%(inl)d*lm" % {"inl": inl}) for inl in range(par.inline) ], total_number=self.plan.preimage_dofs_per_el, max_unroll=self.plan.max_unroll) + [Line()]) def get_mat_mul_code(el_fetch_count): if el_fetch_count == 1: return get_batched_fetch_mat_mul_code(el_fetch_count) else: return get_direct_tex_mat_mul_code() def mat_mul_outer_loop(fetch_count): if with_scaling: inv_jac_multiplier = ( "fp_tex1Dfetch(scaling_tex," "(GLOBAL_MB_NR + %(inl)d)*MB_EL_COUNT + mb_el)") else: inv_jac_multiplier = "1" write_condition = "MB_DOF < DOFS_PER_EL*MB_EL_COUNT" if self.with_index_check: write_condition += " && GLOBAL_MB_NR < microblock_count" return For( "unsigned short seq_mb_number = 0", "seq_mb_number < SEQ_MB_COUNT", "++seq_mb_number", Block([ Initializer(POD(float_type, "result%d" % inl), 0) for inl in range(par.inline) ] + [Line()] + get_mat_mul_code(fetch_count) + [ If( write_condition, Block([ Assign( "out_vector[GLOBAL_MB_DOF_BASE" " + %d*ALIGNED_DOFS_PER_MB" " + MB_DOF]" % inl, "result%d * %s" % (inl, (inv_jac_multiplier % { "inl": inl }))) for inl in range(par.inline) ])) ])) if self.plan.use_prefetch_branch: from cgen import make_multiple_ifs f_body.append( make_multiple_ifs([ ("segment_el_count == %d" % fetch_count, mat_mul_outer_loop(fetch_count)) for fetch_count in range( 1, 
self.plan.max_elements_touched_by_segment() + 1) ])) else: f_body.append(mat_mul_outer_loop(0)) # finish off ---------------------------------------------------------- cmod.append(FunctionBody(f_decl, f_body)) if not for_benchmark and "cuda_dump_kernels" in discr.debug: from hedge.tools import open_unique_debug_file open_unique_debug_file(self.plan.debug_name, ".cu").write(str(cmod)) mod = SourceModule( cmod, keep="cuda_keep_kernels" in discr.debug, #options=["--maxrregcount=12"] ) func = mod.get_function("apply_el_local_mat_smem_mat") if self.plan.debug_name in discr.debug: print("%s: lmem=%d smem=%d regs=%d" % ( self.plan.debug_name, func.local_size_bytes, func.shared_size_bytes, func.num_regs)) in_vector_texref = mod.get_texref("in_vector_tex") texrefs = [in_vector_texref] if with_scaling: scaling_texref = mod.get_texref("scaling_tex") texrefs.append(scaling_texref) else: scaling_texref = None func.prepare("PPPI", block=(self.plan.segment_size, self.plan.parallelism.parallel, 1), texrefs=texrefs) return func, in_vector_texref, scaling_texref
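# Worked example for the segment_start_el_lookup / segment_stop_el_lookup
# tables used by the prefetch branch above (hypothetical plan: segment_size=32,
# dofs_per_el=20, microblock.elements=6):
#   chk=0: start = (0*32)//20 = 0,  stop = min(6, (0*32+31)//20 + 1) = 2
#   chk=1: start = (1*32)//20 = 1,  stop = min(6, (1*32+31)//20 + 1) = 4
#   chk=2: start = (2*32)//20 = 3,  stop = min(6, (2*32+31)//20 + 1) = 5
# e.g. segment 1 touches elements 1..3, so segment_el_count is 3 and
# make_multiple_ifs selects the matrix-multiply loop specialized for a fetch
# count of 3.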