Example #1
def make_element_local_plan(discr, given,
        aligned_preimage_dofs_per_microblock, preimage_dofs_per_el, 
        aligned_image_dofs_per_microblock, image_dofs_per_el,
        elements_per_microblock, microblock_count,
        op_name):
    def generate_plans():
        if ("cuda_no_smem_matrix" not in discr.debug 
                and image_dofs_per_el == given.dofs_per_el
                and aligned_image_dofs_per_microblock 
                == given.microblock.aligned_floats
                and elements_per_microblock 
                == given.microblock.elements):

            from hedge.backends.cuda.el_local_shared_segmat import ExecutionPlan as SSegPlan

            for use_prefetch_branch in [True]:
            #for use_prefetch_branch in [True, False]:
                segment_sizes = range(given.microblock.align_size,
                        elements_per_microblock*given.dofs_per_el()+1,
                        given.microblock.align_size)

                for pe in range(1,32+1):
                    for inline in range(1, MAX_INLINE+1):
                        for seq in range(1, 4+1):
                            for segment_size in segment_sizes:
                                yield SSegPlan(given,
                                        Parallelism(pe, inline, seq),
                                        segment_size,
                                        max_unroll=preimage_dofs_per_el,
                                        use_prefetch_branch=use_prefetch_branch,

                                        debug_name="cuda_%s" % op_name,
                                        aligned_preimage_dofs_per_microblock=
                                            aligned_preimage_dofs_per_microblock,
                                        preimage_dofs_per_el=preimage_dofs_per_el)

        from hedge.backends.cuda.el_local_shared_fld import ExecutionPlan as SFieldPlan

        for pe in range(1,32+1):
            for inline in range(1, MAX_INLINE+1):
                yield SFieldPlan(given, Parallelism(pe, inline, 1),
                        debug_name="cuda_%s" % op_name,
                        aligned_preimage_dofs_per_microblock=
                            aligned_preimage_dofs_per_microblock,
                        preimage_dofs_per_el=preimage_dofs_per_el,
                        aligned_image_dofs_per_microblock=
                            aligned_image_dofs_per_microblock,
                        image_dofs_per_el=image_dofs_per_el,
                        elements_per_microblock=elements_per_microblock,
                        microblock_count=microblock_count)

    def target_func(plan):
        return plan.make_kernel(discr).benchmark()

    from hedge.backends.cuda.plan import optimize_plan
    return optimize_plan(
            op_name, generate_plans, target_func, maximize=False,
            debug_flags=discr.debug,
            log_filename="%s-%d" % (op_name, given.order()))
Example #2
def make_diff_plan(discr, given,
        aligned_preimage_dofs_per_microblock, preimage_dofs_per_el,
        aligned_image_dofs_per_microblock, image_dofs_per_el):
    def generate_plans():
        segment_sizes = range(given.microblock.align_size,
                given.microblock.elements*given.dofs_per_el()+1,
                given.microblock.align_size)

        from hedge.backends.cuda.diff_shared_segmat import ExecutionPlan as SSegPlan
        if ("cuda_no_smem_matrix" not in discr.debug
                and image_dofs_per_el == given.dofs_per_el
                and aligned_image_dofs_per_microblock 
                == given.microblock.aligned_floats):
            for pe in range(1,32+1):
                for inline in range(1, MAX_INLINE+1):
                    for seq in range(1, 4):
                        for segment_size in segment_sizes:
                            yield SSegPlan(
                                    given, Parallelism(pe, inline, seq),
                                    segment_size,
                                    max_unroll=given.dofs_per_el())

        from hedge.backends.cuda.diff_shared_fld import ExecutionPlan as SFieldPlan
        for pe in range(1,32+1):
            for inline in range(1, MAX_INLINE+1):
                yield SFieldPlan(given, Parallelism(pe, inline, 1),
                        aligned_preimage_dofs_per_microblock=
                            aligned_preimage_dofs_per_microblock,
                        preimage_dofs_per_el=preimage_dofs_per_el,
                        aligned_image_dofs_per_microblock=
                            aligned_image_dofs_per_microblock,
                        image_dofs_per_el=image_dofs_per_el)

    def target_func(plan):
        return plan.make_kernel(discr).benchmark()

    from hedge.backends.cuda.plan import optimize_plan
    return optimize_plan("diff", generate_plans, target_func, maximize=False,
            debug_flags=discr.debug,
            log_filename="diff-%d" % given.order())
Example #3
def make_plan(discr, eg, given, tune_for, dofs_per_face, quadrature_tag, 
        given_mbs_per_block=None):
    from hedge.backends.cuda.execute import Executor
    if tune_for is not None:
        fbatch1 = Executor.get_first_flux_batch(discr.mesh, tune_for)
        if fbatch1 is not None:
            fluxes = list(fbatch1.flux_exprs)
            flux_count = len(fluxes)
        else:
            fluxes = None
    else:
        fluxes = None

    if fluxes is None:
        # a reasonable guess?
        flux_count = discr.dimensions

    if quadrature_tag is None:
        input_dofs_per_microblock = given.microblock.aligned_floats
    else:
        input_dofs_per_microblock = discr.get_cuda_elgroup_quadrature_info(
                eg, quadrature_tag, given=given).aligned_int_face_dofs_per_microblock

    def generate_valid_plans():
        valid_plan_count = 0

        if given_mbs_per_block is not None:
            mbs_per_block_values = [given_mbs_per_block]
        else:
            mbs_per_block_values = xrange(1, 8)
        for direct_store in [False, True]:
            for parallel_faces in range(1, 32):
                for mbs_per_block in mbs_per_block_values:
                    flux_plan = ExecutionPlan(given, eg, parallel_faces,
                            mbs_per_block, flux_count,
                            direct_store=direct_store,
                            partition_data=discr._get_partition_data(
                                mbs_per_block*given.microblock.elements),
                            dofs_per_face=dofs_per_face,
                            input_dofs_per_microblock=input_dofs_per_microblock,
                            quadrature_tag=quadrature_tag)
                    if flux_plan.invalid_reason() is None:
                        valid_plan_count += 1
                        yield flux_plan

            # if there are valid plans *without* direct_store *and* we're using
            # single precision, then bail now: It's unlikely that direct-store
            # offers any benefit.
            if valid_plan_count and given.float_type == numpy.float32:
                return

    def target_func(plan):
        if tune_for is None:
            return 0
        else:
            return plan.make_kernel(discr, executor=None,
                    fluxes=fluxes).benchmark()

    from hedge.backends.cuda.plan import optimize_plan

    return optimize_plan(
            "gather",
            generate_valid_plans, target_func,
            maximize=False,
            debug_flags=discr.debug)
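
The early return in generate_valid_plans is worth spelling out: direct_store=False plans are generated first, so once at least one valid plan exists and the run uses single precision, the generator stops before producing any direct-store plans at all. A toy version of that control flow (not hedge code) looks like this:

def toy_candidates(single_precision=True):
    valid_count = 0
    for direct_store in [False, True]:
        for size in range(3):
            valid_count += 1
            yield (direct_store, size)
        # After the direct_store=False sweep: if anything valid was found
        # and we are in single precision, skip the direct_store=True sweep.
        if valid_count and single_precision:
            return

print(list(toy_candidates()))   # only (False, 0), (False, 1), (False, 2)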
Example #4
def make_plan(discr,
              eg,
              given,
              tune_for,
              dofs_per_face,
              quadrature_tag,
              given_mbs_per_block=None):
    from hedge.backends.cuda.execute import Executor
    if tune_for is not None:
        fbatch1 = Executor.get_first_flux_batch(discr.mesh, tune_for)
        if fbatch1 is not None:
            fluxes = list(fbatch1.flux_exprs)
            flux_count = len(fluxes)
        else:
            fluxes = None
    else:
        fluxes = None

    if fluxes is None:
        # a reasonable guess?
        flux_count = discr.dimensions

    if quadrature_tag is None:
        input_dofs_per_microblock = given.microblock.aligned_floats
    else:
        input_dofs_per_microblock = discr.get_cuda_elgroup_quadrature_info(
            eg, quadrature_tag,
            given=given).aligned_int_face_dofs_per_microblock

    def generate_valid_plans():
        valid_plan_count = 0

        if given_mbs_per_block is not None:
            mbs_per_block_values = [given_mbs_per_block]
        else:
            mbs_per_block_values = xrange(1, 8)
        for direct_store in [False, True]:
            for parallel_faces in range(1, 32):
                for mbs_per_block in mbs_per_block_values:
                    flux_plan = ExecutionPlan(
                        given,
                        eg,
                        parallel_faces,
                        mbs_per_block,
                        flux_count,
                        direct_store=direct_store,
                        partition_data=discr._get_partition_data(
                            mbs_per_block * given.microblock.elements),
                        dofs_per_face=dofs_per_face,
                        input_dofs_per_microblock=input_dofs_per_microblock,
                        quadrature_tag=quadrature_tag)
                    if flux_plan.invalid_reason() is None:
                        valid_plan_count += 1
                        yield flux_plan

            # if there are valid plans *without* direct_store *and* we're using
            # single precision, then bail now: It's unlikely that direct-store
            # offers any benefit.
            if valid_plan_count and given.float_type == numpy.float32:
                return

    def target_func(plan):
        if tune_for is None:
            return 0
        else:
            return plan.make_kernel(discr, executor=None,
                                    fluxes=fluxes).benchmark()

    from hedge.backends.cuda.plan import optimize_plan

    return optimize_plan("gather",
                         generate_valid_plans,
                         target_func,
                         maximize=False,
                         debug_flags=discr.debug)
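
One last observation on target_func in this gather planner: when tune_for is None there is nothing to benchmark, so every plan receives the constant cost 0 and a minimizing search cannot tell the candidates apart; with the usual min-style tie-breaking, the first valid plan wins. The snippet below only illustrates that tie-breaking behavior; the plan names and costs are made up.

costs = [("plan_a", 0), ("plan_b", 0), ("plan_c", 0)]   # tune_for is None: all zero
print(min(costs, key=lambda pair: pair[1])[0])          # plan_a: ties keep the first

costs = [("plan_a", 5.0), ("plan_b", 2.0), ("plan_c", 9.0)]   # benchmarked times
print(min(costs, key=lambda pair: pair[1])[0])                # plan_b: the cheapest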