def make_element_local_plan(discr, given,
        aligned_preimage_dofs_per_microblock, preimage_dofs_per_el,
        aligned_image_dofs_per_microblock, image_dofs_per_el,
        elements_per_microblock, microblock_count, op_name):
    """Auto-tune an execution plan for an element-local operator.

    Benchmarks candidate plans (shared-memory segmented-matrix and
    shared-memory field variants) via :func:`optimize_plan` and returns
    the fastest one.

    :param discr: the discretization; supplies debug flags and is passed
        to each candidate kernel for benchmarking.
    :param given: device/data parameters (microblock layout, order, ...).
    :param op_name: used for the debug name and the benchmark log file.
    """
    def generate_plans():
        # The segmented-matrix plans only apply when the operator maps
        # onto the discretization's own element/microblock layout, and
        # may be disabled via the "cuda_no_smem_matrix" debug flag.
        #
        # BUGFIX: dofs_per_el is a method (it is called as
        # given.dofs_per_el() twice below); the original compared the
        # integer image_dofs_per_el against the *bound method*, which is
        # always False and silently disabled this entire branch.
        if ("cuda_no_smem_matrix" not in discr.debug
                and image_dofs_per_el == given.dofs_per_el()
                and aligned_image_dofs_per_microblock
                    == given.microblock.aligned_floats
                and elements_per_microblock == given.microblock.elements):
            from hedge.backends.cuda.el_local_shared_segmat import (
                    ExecutionPlan as SSegPlan)

            for use_prefetch_branch in [True]:
                #for use_prefetch_branch in [True, False]:
                segment_sizes = range(
                        given.microblock.align_size,
                        elements_per_microblock*given.dofs_per_el()+1,
                        given.microblock.align_size)

                for pe in range(1, 32+1):
                    for inline in range(1, MAX_INLINE+1):
                        for seq in range(1, 4+1):
                            for segment_size in segment_sizes:
                                yield SSegPlan(
                                        given, Parallelism(pe, inline, seq),
                                        segment_size,
                                        max_unroll=preimage_dofs_per_el,
                                        use_prefetch_branch=use_prefetch_branch,
                                        debug_name="cuda_%s" % op_name,
                                        aligned_preimage_dofs_per_microblock=
                                            aligned_preimage_dofs_per_microblock,
                                        preimage_dofs_per_el=preimage_dofs_per_el)

        from hedge.backends.cuda.el_local_shared_fld import (
                ExecutionPlan as SFieldPlan)

        for pe in range(1, 32+1):
            # NOTE(review): this loop uses range(1, MAX_INLINE) while the
            # SSegPlan loop above uses MAX_INLINE+1 -- possibly intentional,
            # possibly an off-by-one; left unchanged.
            for inline in range(1, MAX_INLINE):
                yield SFieldPlan(given, Parallelism(pe, inline, 1),
                        debug_name="cuda_%s" % op_name,
                        aligned_preimage_dofs_per_microblock=
                            aligned_preimage_dofs_per_microblock,
                        preimage_dofs_per_el=preimage_dofs_per_el,
                        aligned_image_dofs_per_microblock=
                            aligned_image_dofs_per_microblock,
                        image_dofs_per_el=image_dofs_per_el,
                        elements_per_microblock=elements_per_microblock,
                        microblock_count=microblock_count)

    def target_func(plan):
        # Benchmark cost of a candidate plan; optimize_plan minimizes this.
        return plan.make_kernel(discr).benchmark()

    from hedge.backends.cuda.plan import optimize_plan
    return optimize_plan(
            op_name, generate_plans, target_func, maximize=False,
            debug_flags=discr.debug,
            log_filename="%s-%d" % (op_name, given.order()))
def make_diff_plan(discr, given,
        aligned_preimage_dofs_per_microblock, preimage_dofs_per_el,
        aligned_image_dofs_per_microblock, image_dofs_per_el):
    """Auto-tune an execution plan for the differentiation operator.

    Benchmarks candidate plans (shared-memory segmented-matrix and
    shared-memory field variants) via :func:`optimize_plan` and returns
    the fastest one.

    :param discr: the discretization; supplies debug flags and is passed
        to each candidate kernel for benchmarking.
    :param given: device/data parameters (microblock layout, order, ...).
    """
    def generate_plans():
        segment_sizes = range(
                given.microblock.align_size,
                given.microblock.elements*given.dofs_per_el()+1,
                given.microblock.align_size)

        from hedge.backends.cuda.diff_shared_segmat import (
                ExecutionPlan as SSegPlan)

        # The segmented-matrix plans only apply when the operator maps
        # onto the discretization's own dof layout, and may be disabled
        # via the "cuda_no_smem_matrix" debug flag.
        #
        # BUGFIX: dofs_per_el is a method (it is called as
        # given.dofs_per_el() twice in this function); the original
        # compared the integer image_dofs_per_el against the *bound
        # method*, which is always False and silently disabled this
        # entire branch.
        if ("cuda_no_smem_matrix" not in discr.debug
                and image_dofs_per_el == given.dofs_per_el()
                and aligned_image_dofs_per_microblock
                    == given.microblock.aligned_floats):
            for pe in range(1, 32+1):
                for inline in range(1, MAX_INLINE+1):
                    for seq in range(1, 4):
                        for segment_size in segment_sizes:
                            yield SSegPlan(given,
                                    Parallelism(pe, inline, seq),
                                    segment_size,
                                    max_unroll=given.dofs_per_el())

        from hedge.backends.cuda.diff_shared_fld import (
                ExecutionPlan as SFieldPlan)

        for pe in range(1, 32+1):
            for inline in range(1, MAX_INLINE+1):
                yield SFieldPlan(given, Parallelism(pe, inline, 1),
                        aligned_preimage_dofs_per_microblock=
                            aligned_preimage_dofs_per_microblock,
                        preimage_dofs_per_el=preimage_dofs_per_el,
                        aligned_image_dofs_per_microblock=
                            aligned_image_dofs_per_microblock,
                        image_dofs_per_el=image_dofs_per_el)

    def target_func(plan):
        # Benchmark cost of a candidate plan; optimize_plan minimizes this.
        return plan.make_kernel(discr).benchmark()

    from hedge.backends.cuda.plan import optimize_plan
    return optimize_plan("diff", generate_plans, target_func, maximize=False,
            debug_flags=discr.debug,
            log_filename="diff-%d" % given.order())
def make_plan(discr, eg, given, tune_for, dofs_per_face,
        quadrature_tag, given_mbs_per_block=None):
    """Auto-tune an execution plan for the flux-gather stage.

    Enumerates valid :class:`ExecutionPlan` candidates and, when
    *tune_for* (an operator expression) is given, benchmarks kernels
    built for its first flux batch; otherwise all candidates score 0
    and the first valid plan wins.

    :param given_mbs_per_block: if not None, restricts the search to
        this single microblocks-per-block value.
    """
    from hedge.backends.cuda.execute import Executor

    if tune_for is not None:
        # Benchmark against the fluxes of the first flux batch of the
        # operator we are tuning for.
        fbatch1 = Executor.get_first_flux_batch(discr.mesh, tune_for)
        if fbatch1 is not None:
            fluxes = list(fbatch1.flux_exprs)
            flux_count = len(fluxes)
        else:
            fluxes = None
    else:
        fluxes = None

    if fluxes is None:
        # a reasonable guess?
        flux_count = discr.dimensions

    if quadrature_tag is None:
        input_dofs_per_microblock = given.microblock.aligned_floats
    else:
        input_dofs_per_microblock = discr.get_cuda_elgroup_quadrature_info(
                eg, quadrature_tag,
                given=given).aligned_int_face_dofs_per_microblock

    def generate_valid_plans():
        # Yield only plans whose invalid_reason() is None.
        valid_plan_count = 0

        if given_mbs_per_block is not None:
            mbs_per_block_values = [given_mbs_per_block]
        else:
            mbs_per_block_values = xrange(1, 8)

        for direct_store in [False, True]:
            for parallel_faces in range(1, 32):
                for mbs_per_block in mbs_per_block_values:
                    flux_plan = ExecutionPlan(given, eg, parallel_faces,
                            mbs_per_block, flux_count,
                            direct_store=direct_store,
                            partition_data=discr._get_partition_data(
                                mbs_per_block*given.microblock.elements),
                            dofs_per_face=dofs_per_face,
                            input_dofs_per_microblock=input_dofs_per_microblock,
                            quadrature_tag=quadrature_tag)

                    if flux_plan.invalid_reason() is None:
                        valid_plan_count += 1
                        yield flux_plan

            # if there are valid plans *without* direct_store *and* we're using
            # single precision, then bail now: It's unlikely that direct-store
            # offers any benefit.
            if valid_plan_count and given.float_type == numpy.float32:
                return

    def target_func(plan):
        if tune_for is None:
            # Nothing to benchmark against; any valid plan is as good
            # as any other.
            return 0
        else:
            return plan.make_kernel(discr, executor=None,
                    fluxes=fluxes).benchmark()

    from hedge.backends.cuda.plan import optimize_plan
    return optimize_plan(
            "gather", generate_valid_plans, target_func,
            maximize=False,
            debug_flags=discr.debug)
def make_plan(discr, eg, given, tune_for, dofs_per_face,
        quadrature_tag, given_mbs_per_block=None):
    """Auto-tune an execution plan for the flux-gather stage.

    Walks the space of valid :class:`ExecutionPlan` candidates and picks
    the cheapest via :func:`optimize_plan`. When *tune_for* is supplied,
    candidates are benchmarked against the fluxes of its first flux
    batch; otherwise every candidate scores 0.

    :param given_mbs_per_block: if not None, restricts the search to
        this single microblocks-per-block value.
    """
    from hedge.backends.cuda.execute import Executor

    fluxes = None
    if tune_for is not None:
        first_batch = Executor.get_first_flux_batch(discr.mesh, tune_for)
        if first_batch is not None:
            fluxes = list(first_batch.flux_exprs)
            flux_count = len(fluxes)

    if fluxes is None:
        # a reasonable guess?
        flux_count = discr.dimensions

    if quadrature_tag is not None:
        quad_info = discr.get_cuda_elgroup_quadrature_info(
                eg, quadrature_tag, given=given)
        input_dofs_per_microblock = \
                quad_info.aligned_int_face_dofs_per_microblock
    else:
        input_dofs_per_microblock = given.microblock.aligned_floats

    def generate_valid_plans():
        # Yield only candidates whose invalid_reason() is None.
        n_valid = 0

        if given_mbs_per_block is None:
            mb_counts = xrange(1, 8)
        else:
            mb_counts = [given_mbs_per_block]

        for use_direct_store in [False, True]:
            for par_faces in range(1, 32):
                for mb_count in mb_counts:
                    candidate = ExecutionPlan(
                            given, eg, par_faces, mb_count, flux_count,
                            direct_store=use_direct_store,
                            partition_data=discr._get_partition_data(
                                mb_count*given.microblock.elements),
                            dofs_per_face=dofs_per_face,
                            input_dofs_per_microblock=input_dofs_per_microblock,
                            quadrature_tag=quadrature_tag)

                    if candidate.invalid_reason() is None:
                        n_valid += 1
                        yield candidate

            # if there are valid plans *without* direct_store *and* we're using
            # single precision, then bail now: It's unlikely that direct-store
            # offers any benefit.
            if n_valid and given.float_type == numpy.float32:
                return

    def target_func(plan):
        if tune_for is None:
            # No operator to tune against: all valid plans tie.
            return 0
        return plan.make_kernel(discr, executor=None,
                fluxes=fluxes).benchmark()

    from hedge.backends.cuda.plan import optimize_plan
    return optimize_plan("gather", generate_valid_plans, target_func,
            maximize=False,
            debug_flags=discr.debug)