Exemplo n.º 1
0
    def generate(self):
        """Generate (i.e. yield) the source code of the
        module line-by-line.
        """

        from cgen import Block, Module, Include, Line, Define, \
                PrivateNamespace

        body = []

        if self.max_arity is not None:
            body.append(Define("BOOST_PYTHON_MAX_ARITY", self.max_arity))

        if self.use_private_namespace:
            mod_body = [PrivateNamespace(self.mod_body)]
        else:
            mod_body = self.mod_body

        body += ([Include("boost/python.hpp")]
                + self.preamble + [Line()]
                + mod_body
                + [Line(), Line(f"BOOST_PYTHON_MODULE({self.name})")]
                + [Block(self.init_body)])

        return Module(body)
Exemplo n.º 2
0
 def get_direct_tex_mat_mul_code():
     return (
         [POD(float_type, "fof%d" % inl) for inl in range(par.inline)] +
         [POD(float_type, "lm"), Line()] + unroll(
             lambda j: [
                 Assign(
                     "fof%d" % inl,
                     "fp_tex1Dfetch(in_vector_tex, "
                     "GLOBAL_MB_PREIMG_DOF_BASE"
                     " + %(inl)d * ALIGNED_PREIMAGE_DOFS_PER_MB"
                     " + mb_el*PREIMAGE_DOFS_PER_EL+%(j)s)" % {
                         "j": j,
                         "inl": inl,
                         "row": "SEGMENT_DOF"
                     },
                 ) for inl in range(par.inline)
             ] + [
                 Assign(
                     "lm",
                     "smem_matrix["
                     "%(row)s*MATRIX_COLUMNS + %(j)s]" % {
                         "j": j,
                         "row": "SEGMENT_DOF"
                     },
                 )
             ] + [
                 S("result%(inl)d += fof%(inl)d*lm" % {"inl": inl})
                 for inl in range(par.inline)
             ],
             total_number=self.plan.preimage_dofs_per_el,
             max_unroll=self.plan.max_unroll) + [Line()])
Exemplo n.º 3
0
        def get_batched_fetch_mat_mul_code(el_fetch_count):
            result = []
            dofs = range(self.plan.preimage_dofs_per_el)

            for load_segment_start in range(0, self.plan.preimage_dofs_per_el,
                                            self.plan.segment_size):
                result.extend([S("__syncthreads()")] + [
                    Assign(
                        "dof_buffer[PAR_MB_NR][%d][SEGMENT_DOF]" %
                        inl, "fp_tex1Dfetch(in_vector_tex, "
                        "GLOBAL_MB_PREIMG_DOF_BASE"
                        " + %d*ALIGNED_PREIMAGE_DOFS_PER_MB"
                        " + (segment_start_el)*PREIMAGE_DOFS_PER_EL + %d + SEGMENT_DOF)"
                        % (inl, load_segment_start))
                    for inl in range(par.inline)
                ] + [
                    S("__syncthreads()"),
                    Line(),
                ])

                for dof in dofs[load_segment_start:load_segment_start +
                                self.plan.segment_size]:
                    for inl in range(par.inline):
                        result.append(
                            S("result%d += "
                              "smem_matrix[SEGMENT_DOF*MATRIX_COLUMNS + %d]"
                              "*"
                              "dof_buffer[PAR_MB_NR][%d][%d]" %
                              (inl, dof, inl, dof - load_segment_start)))
                result.append(Line())
            return result
Exemplo n.º 4
0
def generate_c_instruction_code(codegen_state, insn):
    kernel = codegen_state.kernel

    if codegen_state.vectorization_info is not None:
        raise Unvectorizable("C instructions cannot be vectorized")

    body = []

    from loopy.target.c import POD
    from cgen import Initializer, Block, Line

    from pymbolic.primitives import Variable
    for name, iname_expr in insn.iname_exprs:
        if (isinstance(iname_expr, Variable)
                and name not in codegen_state.var_subst_map):
            # No need, the bare symbol will work
            continue

        body.append(
            Initializer(
                POD(codegen_state.ast_builder, kernel.index_dtype, name),
                codegen_state.expression_to_code_mapper(iname_expr,
                                                        prec=PREC_NONE,
                                                        type_context="i")))

    if body:
        body.append(Line())

    body.extend(Line(line) for line in insn.code.split("\n"))

    return Block(body)
Exemplo n.º 5
0
    def make_codepy_module(self, toolchain, dtype):
        from codepy.libraries import add_codepy
        toolchain = toolchain.copy()
        add_codepy(toolchain)

        from cgen import (Value, Include, Statement,
                Typedef, FunctionBody, FunctionDeclaration, Block, Const,
                Line, POD, Initializer, CustomLoop)
        S = Statement

        from codepy.bpl import BoostPythonModule
        mod = BoostPythonModule()

        mod.add_to_preamble([
            Include("vector"),
            Include("algorithm"),
            Include("hedge/base.hpp"),
            Include("hedge/volume_operators.hpp"),
            Include("boost/foreach.hpp"),
            Include("boost/numeric/ublas/io.hpp"),
            ]+self.get_cpu_extra_includes())

        mod.add_to_module([
            S("namespace ublas = boost::numeric::ublas"),
            S("using namespace hedge"),
            S("using namespace pyublas"),
            Line(),
            Typedef(POD(dtype, "value_type")),
            Line(),
            ])

        mod.add_function(FunctionBody(
            FunctionDeclaration(Value("void", "process_elements"), [
                Const(Value("uniform_element_ranges", "ers")),
                Const(Value("numpy_vector<value_type>", "field")),
                Value("numpy_vector<value_type>", "result"),
                ]+self.get_cpu_extra_parameter_declarators()),
            Block([
                Typedef(Value("numpy_vector<value_type>::iterator",
                    "it_type")),
                Typedef(Value("numpy_vector<value_type>::const_iterator",
                    "cit_type")),
                Line(),
                Initializer(Value("it_type", "result_it"),
                    "result.begin()"),
                Initializer(Value("cit_type", "field_it"),
                    "field.begin()"),
                Line() ]+self.get_cpu_extra_preamble()+[ Line(),
                CustomLoop(
                    "BOOST_FOREACH(const element_range er, ers)",
                    Block(self.get_cpu_per_element_code())
                    )
                ])))

        #print mod.generate()
        #toolchain = toolchain.copy()
        #toolchain.enable_debugging
        return mod.compile(toolchain)
Exemplo n.º 6
0
        def get_scalar_diff_code():
            code = []
            for inl in range(par.inline):
                for axis in dims:
                    code.append(
                        Initializer(POD(float_type, "d%drst%d" % (inl, axis)),
                                    0))

            code.append(Line())

            def get_mat_entry(row, col, axis):
                return ("smem_diff_rst_mat["
                        "%(row)s*DIFFMAT_COLUMNS + %(axis)s*DOFS_PER_EL"
                        " + %(col)s"
                        "]" % {
                            "row": row,
                            "col": col,
                            "axis": axis
                        })

            tex_channels = ["x", "y", "z", "w"]
            from hedge.backends.cuda.tools import unroll
            code.extend([
                POD(float_type, "field_value%d" % inl)
                for inl in range(par.inline)
            ] + [Line()] + unroll(
                lambda j: [
                    Assign(
                        "field_value%d" % inl,
                        "fp_tex1Dfetch(field_tex, GLOBAL_MB_DOF_BASE + %d*ALIGNED_DOFS_PER_MB "
                        "+ mb_el*DOFS_PER_EL + %s)" % (inl, j))
                    for inl in range(par.inline)
                ] + [Line()] + [
                    S("d%drst%d += %s * field_value%d" %
                      (inl, axis, get_mat_entry("SEGMENT_DOF", j, axis), inl))
                    for axis in dims for inl in range(par.inline)
                ] + [Line()], given.dofs_per_el(), self.plan.max_unroll))

            store_code = Block()
            for inl in range(par.inline):
                for rst_axis in dims:
                    store_code.append(
                        Assign(
                            "drst%d_global[GLOBAL_MB_DOF_BASE"
                            " + %d*ALIGNED_DOFS_PER_MB + MB_DOF]" %
                            (rst_axis, inl),
                            "d%drst%d" % (inl, rst_axis),
                        ))

            code.append(If("MB_DOF < DOFS_PER_EL*ELS_PER_MB", store_code))

            return code
Exemplo n.º 7
0
def get_elwise_module_descriptor(arguments, operation, name="kernel"):
    from codepy.bpl import BoostPythonModule

    from cgen import FunctionBody, FunctionDeclaration, \
            Value, POD, Struct, For, Initializer, Include, Statement, \
            Line, Block

    S = Statement  # noqa: N806

    mod = BoostPythonModule()
    mod.add_to_preamble([
        Include("pyublas/numpy.hpp"),
    ])

    mod.add_to_module([
        S("namespace ublas = boost::numeric::ublas"),
        S("using namespace pyublas"),
        Line(),
    ])

    body = Block([
        Initializer(
            Value(
                "numpy_array<{} >::iterator".format(dtype_to_ctype(
                    varg.dtype)), varg.name), f"args.{varg.name}_ary.begin()")
        for varg in arguments if isinstance(varg, VectorArg)
    ] + [
        Initializer(sarg.declarator(), f"args.{sarg.name}")
        for sarg in arguments if isinstance(sarg, ScalarArg)
    ])

    body.extend([
        Line(),
        For("unsigned i = 0", "i < codepy_length", "++i",
            Block([S(operation)]))
    ])

    arg_struct = Struct("arg_struct", [arg.declarator() for arg in arguments])
    mod.add_struct(arg_struct, "ArgStruct")
    mod.add_to_module([Line()])

    mod.add_function(
        FunctionBody(
            FunctionDeclaration(Value("void", name), [
                POD(numpy.uintp, "codepy_length"),
                Value("arg_struct", "args")
            ]), body))

    return mod
Exemplo n.º 8
0
        def mat_mul_outer_loop(fetch_count):
            if with_scaling:
                inv_jac_multiplier = (
                    "fp_tex1Dfetch(scaling_tex,"
                    "(GLOBAL_MB_NR + %(inl)d)*MB_EL_COUNT + mb_el)")
            else:
                inv_jac_multiplier = "1"

            write_condition = "MB_DOF < DOFS_PER_EL*MB_EL_COUNT"
            if self.with_index_check:
                write_condition += " && GLOBAL_MB_NR < microblock_count"
            return For(
                "unsigned short seq_mb_number = 0",
                "seq_mb_number < SEQ_MB_COUNT", "++seq_mb_number",
                Block([
                    Initializer(POD(float_type, "result%d" % inl), 0)
                    for inl in range(par.inline)
                ] + [Line()] + get_mat_mul_code(fetch_count) + [
                    If(
                        write_condition,
                        Block([
                            Assign(
                                "out_vector[GLOBAL_MB_DOF_BASE"
                                " + %d*ALIGNED_DOFS_PER_MB"
                                " + MB_DOF]" % inl, "result%d * %s" %
                                (inl, (inv_jac_multiplier % {
                                    "inl": inl
                                }))) for inl in range(par.inline)
                        ]))
                ]))
Exemplo n.º 9
0
def get_load_code(dest, base, bytes, word_type=numpy.uint32,
        descr=None):
    from cgen import (
            Pointer, POD, 
            Comment, Block, Line, \
            Constant, For, Statement)

    from cgen import dtype_to_ctype
    copy_dtype = numpy.dtype(word_type)
    copy_dtype_str = dtype_to_ctype(copy_dtype)

    code = []
    if descr is not None:
        code.append(Comment(descr))

    code.extend([
        Block([
            Constant(Pointer(POD(copy_dtype, "load_base")),
                ("(%s *) (%s)" % (copy_dtype_str, base))),
            For("unsigned word_nr = THREAD_NUM",
                "word_nr*sizeof(int) < (%s)" % bytes,
                "word_nr += COALESCING_THREAD_COUNT",
                Statement("((%s *) (%s))[word_nr] = load_base[word_nr]"
                    % (copy_dtype_str, dest))
                ),
            ]),
        Line(),
        ])

    return code
Exemplo n.º 10
0
    def add_struct(self,
            struct, py_name=None, py_member_name_transform=lambda x: x,
            by_value_members=None):
        if by_value_members is None:
            by_value_members = set()

        from cgen import Block, Line, Statement, Typedef, Value

        if py_name is None:
            py_name = struct.tpname

        self.mod_body.append(struct)

        member_defs = []
        for f in struct.fields:
            py_f_name = py_member_name_transform(f.name)
            tp_lines, declarator = f.get_decl_pair()
            if f.name in by_value_members or tp_lines[0].startswith("numpy_"):
                member_defs.append(
                        ".def(pyublas::by_value_rw_member"
                        f'("{py_f_name}", &cl::{f.name}))')
            else:
                member_defs.append(
                        f'.def_readwrite("{py_f_name}", &cl::{f.name})'
                        )

        self.init_body.append(
            Block([
                Typedef(Value(struct.tpname, "cl")),
                Line(),
                Statement(
                    'boost::python::class_<cl>("{}"){}'.format(
                        py_name, "".join(member_defs))),
                ]))
Exemplo n.º 11
0
def unroll(body_gen, total_number, max_unroll=None, start=0):
    from cgen import For, Line, Block
    from pytools import flatten

    if max_unroll is None:
        max_unroll = total_number

    result = []

    if total_number > max_unroll:
        loop_items = (total_number // max_unroll) * max_unroll

        result.extend([
                For("unsigned j = 0",
                    "j < %d" % loop_items,
                    "j += %d" % max_unroll,
                    Block(list(flatten(
                        body_gen("(j+%d)" % i)
                        for i in range(max_unroll))))
                    ),
                Line()
                ])
        start += loop_items

    result.extend(flatten(
        body_gen(i) for i in range(start, total_number)))

    return result
Exemplo n.º 12
0
        def get_load_code():
            mb_img_dofs = plan.aligned_image_dofs_per_microblock
            mb_preimg_dofs = plan.aligned_preimage_dofs_per_microblock
            preimg_dofs_over_dofs = (mb_preimg_dofs+mb_img_dofs-1) // mb_img_dofs

            load_code = []
            store_code = []

            var_num = 0
            for load_block in range(preimg_dofs_over_dofs):
                for inl in range(par.inline):
                    # load and store are split for better pipelining
                    # compiler can't figure that out because of branch

                    var = "tmp%d" % var_num
                    var_num += 1
                    load_code.append(POD(float_type, var))

                    block_addr = "%d * ALIGNED_IMAGE_DOFS_PER_MB + IMAGE_MB_DOF" % load_block
                    load_instr = Assign(var,
                        "field[GLOBAL_MB_PREIMAGE_DOF_BASE"
                        " + %d*ALIGNED_PREIMAGE_DOFS_PER_MB"
                        " + %s]" % (inl, block_addr))
                    store_instr = Assign(
                            "smem_field[PAR_MB_NR][%d][%s]" % (inl, block_addr),
                            var
                            )
                    if (load_block+1)*mb_img_dofs >= mb_preimg_dofs:
                        cond = "%s < ALIGNED_PREIMAGE_DOFS_PER_MB" % block_addr
                        load_instr = If(cond, load_instr)
                        store_instr = If(cond, store_instr)

                    load_code.append(load_instr)
                    store_code.append(store_instr)
            return Block(load_code + [Line()] + store_code)
Exemplo n.º 13
0
    def get_cpu_per_element_code(self):
        from cgen import (Value, Statement, Initializer, While,
                Comment, Block, For, Line, Pointer)
        S = Statement
        return [
                # assumes there is more than one coefficient
                Initializer(Value("cit_type", "el_modes"), "field_it+er.first"),

                Line(),
                Comment("zero out reduced_modes"),
                For("npy_uint32 mode_idx = 0",
                    "mode_idx < max_degree+1",
                    "++mode_idx",
                    S("reduced_modes[mode_idx] = 0")),

                Line(),
                Comment("gather modes by degree"),
                For("npy_uint32 mode_idx = 0",
                    "mode_idx < mode_count",
                    "++mode_idx",
                    S("reduced_modes[mode_degrees_iterator[mode_idx]]"
                        " += el_modes[mode_idx]")),

                Line(),
                Comment("perform skyline procedure"),
                Initializer(Pointer(Value("value_type", "start")),
                    "reduced_modes.get()"),
                Initializer(Pointer(Value("value_type", "end")),
                    "start+max_degree+1"),
                Initializer(Value("value_type", "cur_max"),
                    "std::max(*(end-1), *(end-2))"),

                Line(),
                While("end != start", Block([
                    S("--end"),
                    S("*end = std::max(cur_max, *end)"),
                    ])),

                Line(),
                Comment("scatter modes by degree"),
                Initializer(Value("it_type", "tgt_base"), "result_it+er.first"),
                For("npy_uint32 mode_idx = 0",
                    "mode_idx < mode_count",
                    "++mode_idx",
                    S("tgt_base[mode_idx] = "
                        "reduced_modes[mode_degrees_iterator[mode_idx]]")),
                ]
Exemplo n.º 14
0
        def get_flux_code(flux_writer):
            flux_code = Block([])

            flux_code.extend([
                Initializer(Pointer(Value("face_pair", "fpair")),
                            "data.facepairs+fpair_nr"),
                Initializer(
                    MaybeUnused(POD(numpy.uint32, "a_index")),
                    "fpair->a_base + tex1Dfetch(tex_index_lists, "
                    "fpair->a_ilist_index + FACEDOF_NR)"),
                Initializer(
                    MaybeUnused(POD(numpy.uint32, "b_index")),
                    "fpair->b_base + tex1Dfetch(tex_index_lists, "
                    "fpair->b_ilist_index + FACEDOF_NR)"),
                Line(),
                flux_writer(),
                Line(),
                S("fpair_nr += CONCURRENT_FACES")
            ])

            return flux_code
Exemplo n.º 15
0
    def get_function_definition(self, codegen_state, codegen_result,
            schedule_index,
            function_decl, function_body):
        kernel = codegen_state.kernel

        from cgen import (
                FunctionBody,

                # Post-mid-2016 cgens have 'Collection', too.
                Module as Collection,
                Initializer,
                Line)

        result = []

        from loopy.kernel.data import AddressSpace
        from loopy.schedule import CallKernel
        # We only need to write declarations for global variables with
        # the first device program. `is_first_dev_prog` determines
        # whether this is the first device program in the schedule.
        is_first_dev_prog = codegen_state.is_generating_device_code
        for i in range(schedule_index):
            if isinstance(kernel.schedule[i], CallKernel):
                is_first_dev_prog = False
                break
        if is_first_dev_prog:
            for tv in sorted(
                    six.itervalues(kernel.temporary_variables),
                    key=lambda tv: tv.name):

                if tv.address_space == AddressSpace.GLOBAL and (
                        tv.initializer is not None):
                    assert tv.read_only

                    decl_info, = tv.decl_info(self.target,
                                    index_dtype=kernel.index_dtype)
                    decl = self.wrap_global_constant(
                            self.get_temporary_decl(
                                codegen_state, schedule_index, tv,
                                decl_info))

                    if tv.initializer is not None:
                        decl = Initializer(decl, generate_array_literal(
                            codegen_state, tv, tv.initializer))

                    result.append(decl)

        fbody = FunctionBody(function_decl, function_body)
        if not result:
            return fbody
        else:
            return Collection(result+[Line(), fbody])
Exemplo n.º 16
0
    def expose_vector_type(self, name, py_name=None):
        self.add_codepy_include()

        if py_name is None:
            py_name = name

        from cgen import (Block, Typedef, Line, Statement, Value)

        self.init_body.append(
            Block([
                Typedef(Value(name, "cl")),
                Line(),
                Statement(
                    f'boost::python::class_<cl>("{py_name}")'
                    ".def(codepy::no_compare_indexing_suite<cl>())"),
                ]))
Exemplo n.º 17
0
    def get_function_definition(self, codegen_state, codegen_result,
            schedule_index,
            function_decl, function_body):
        kernel = codegen_state.kernel

        from cgen import (
                FunctionBody,

                # Post-mid-2016 cgens have 'Collection', too.
                Module as Collection,
                Initializer,
                Line)

        result = []

        from loopy.kernel.data import temp_var_scope

        for tv in sorted(
                six.itervalues(kernel.temporary_variables),
                key=lambda tv: tv.name):

            if tv.scope == temp_var_scope.GLOBAL and tv.initializer is not None:
                assert tv.read_only

                decl_info, = tv.decl_info(self.target,
                                index_dtype=kernel.index_dtype)
                decl = self.wrap_global_constant(
                        self.get_temporary_decl(
                            codegen_state, schedule_index, tv,
                            decl_info))

                if tv.initializer is not None:
                    decl = Initializer(decl, generate_array_literal(
                        codegen_state, tv, tv.initializer))

                result.append(decl)

        fbody = FunctionBody(function_decl, function_body)
        if not result:
            return fbody
        else:
            return Collection(result+[Line(), fbody])
Exemplo n.º 18
0
    def get_kernel(self, diff_op, elgroup, for_benchmark=False):
        from cgen import \
                Pointer, POD, Value, ArrayOf, Const, \
                Module, FunctionDeclaration, FunctionBody, Block, \
                Comment, Line, Define, Include, \
                Initializer, If, For, Statement, Assign

        from pycuda.tools import dtype_to_ctype
        from cgen.cuda import CudaShared, CudaGlobal

        discr = self.discr
        d = discr.dimensions
        dims = range(d)
        plan = self.plan
        given = plan.given

        elgroup, = discr.element_groups
        float_type = given.float_type

        f_decl = CudaGlobal(FunctionDeclaration(Value("void", "apply_diff_mat_smem"),
            [Pointer(POD(float_type, "debugbuf")), Pointer(POD(float_type, "field")), ]
            + [Pointer(POD(float_type, "drst%d_global" % i)) for i in dims]
            ))

        par = plan.parallelism

        cmod = Module([
                Include("pycuda-helpers.hpp"),
                ])

        if float_type == numpy.float64:
            cmod.append(Value("texture<fp_tex_double, 1, cudaReadModeElementType>",
                    "diff_rst_mat_tex"))
        elif float_type == numpy.float32:
            rst_channels = given.devdata.make_valid_tex_channel_count(d)
            cmod.append(Value("texture<float%d, 1, cudaReadModeElementType>"
                    % rst_channels, "diff_rst_mat_tex"))
        else:
            raise ValueError("unsupported float type: %s" % float_type)

        # only preimage size variation is supported here
        assert plan.image_dofs_per_el == given.dofs_per_el()
        assert plan.aligned_image_dofs_per_microblock == given.microblock.aligned_floats

        # FIXME: aligned_image_dofs_per_microblock must be divisible
        # by this, therefore hardcoding for now.
        chunk_size = 16

        cmod.extend([
                Line(),
                Define("DIMENSIONS", discr.dimensions),

                Define("IMAGE_DOFS_PER_EL", plan.image_dofs_per_el),
                Define("PREIMAGE_DOFS_PER_EL", plan.preimage_dofs_per_el),
                Define("ALIGNED_IMAGE_DOFS_PER_MB", plan.aligned_image_dofs_per_microblock),
                Define("ALIGNED_PREIMAGE_DOFS_PER_MB", plan.aligned_preimage_dofs_per_microblock),
                Define("ELS_PER_MB", given.microblock.elements),
                Define("IMAGE_DOFS_PER_MB", "(IMAGE_DOFS_PER_EL*ELS_PER_MB)"),
                Line(),
                Define("CHUNK_SIZE", chunk_size),
                Define("CHUNK_DOF", "threadIdx.x"),
                Define("PAR_MB_NR", "threadIdx.y"),
                Define("CHUNK_NR", "threadIdx.z"),
                Define("IMAGE_MB_DOF", "(CHUNK_NR*CHUNK_SIZE+CHUNK_DOF)"),
                Define("IMAGE_EL_DOF", "(IMAGE_MB_DOF - mb_el*IMAGE_DOFS_PER_EL)"),
                Line(),
                Define("MACROBLOCK_NR", "blockIdx.x"),
                Line(),
                Define("PAR_MB_COUNT", par.parallel),
                Define("INLINE_MB_COUNT", par.inline),
                Define("SEQ_MB_COUNT", par.serial),
                Line(),
                Define("GLOBAL_MB_NR_BASE",
                    "(MACROBLOCK_NR*PAR_MB_COUNT*INLINE_MB_COUNT*SEQ_MB_COUNT)"),
                Define("GLOBAL_MB_NR",
                    "(GLOBAL_MB_NR_BASE"
                    "+ (seq_mb_number*PAR_MB_COUNT + PAR_MB_NR)*INLINE_MB_COUNT)"),
                Define("GLOBAL_MB_IMAGE_DOF_BASE", "(GLOBAL_MB_NR*ALIGNED_IMAGE_DOFS_PER_MB)"),
                Define("GLOBAL_MB_PREIMAGE_DOF_BASE", "(GLOBAL_MB_NR*ALIGNED_PREIMAGE_DOFS_PER_MB)"),
                Line(),
                CudaShared(
                    ArrayOf(
                        ArrayOf(
                            ArrayOf(
                                POD(float_type, "smem_field"),
                                "PAR_MB_COUNT"),
                            "INLINE_MB_COUNT"),
                        "ALIGNED_PREIMAGE_DOFS_PER_MB")),
                Line(),
                ])

        S = Statement
        f_body = Block([
            Initializer(Const(POD(numpy.uint16, "mb_el")),
                "IMAGE_MB_DOF / IMAGE_DOFS_PER_EL"),
            Line(),
            ])

        # ---------------------------------------------------------------------
        def get_load_code():
            mb_img_dofs = plan.aligned_image_dofs_per_microblock
            mb_preimg_dofs = plan.aligned_preimage_dofs_per_microblock
            preimg_dofs_over_dofs = (mb_preimg_dofs+mb_img_dofs-1) // mb_img_dofs

            load_code = []
            store_code = []

            var_num = 0
            for load_block in range(preimg_dofs_over_dofs):
                for inl in range(par.inline):
                    # load and store are split for better pipelining
                    # compiler can't figure that out because of branch

                    var = "tmp%d" % var_num
                    var_num += 1
                    load_code.append(POD(float_type, var))

                    block_addr = "%d * ALIGNED_IMAGE_DOFS_PER_MB + IMAGE_MB_DOF" % load_block
                    load_instr = Assign(var,
                        "field[GLOBAL_MB_PREIMAGE_DOF_BASE"
                        " + %d*ALIGNED_PREIMAGE_DOFS_PER_MB"
                        " + %s]" % (inl, block_addr))
                    store_instr = Assign(
                            "smem_field[PAR_MB_NR][%d][%s]" % (inl, block_addr),
                            var
                            )
                    if (load_block+1)*mb_img_dofs >= mb_preimg_dofs:
                        cond = "%s < ALIGNED_PREIMAGE_DOFS_PER_MB" % block_addr
                        load_instr = If(cond, load_instr)
                        store_instr = If(cond, store_instr)

                    load_code.append(load_instr)
                    store_code.append(store_instr)
            return Block(load_code + [Line()] + store_code)

        def get_scalar_diff_code():
            code = []
            for inl in range(par.inline):
                for axis in dims:
                    code.append(
                        Initializer(POD(float_type, "d%drst%d" % (inl, axis)), 0))

            code.append(Line())

            tex_channels = ["x", "y", "z", "w"]

            store_code = Block()
            for inl in range(par.inline):
                for rst_axis in dims:
                    store_code.append(Assign(
                        "drst%d_global[GLOBAL_MB_IMAGE_DOF_BASE + "
                        "%d*ALIGNED_IMAGE_DOFS_PER_MB + IMAGE_MB_DOF]"
                        % (rst_axis, inl),
                        "d%drst%d" % (inl, rst_axis)
                        ))

            from hedge.backends.cuda.tools import unroll
            code.extend([
                Comment("everybody needs to be done with the old data"),
                S("__syncthreads()"),
                Line(),
                get_load_code(),
                Line(),
                Comment("all the new data must be loaded"),
                S("__syncthreads()"),
                Line(),
                ])

            if float_type == numpy.float32:
                code.append(Value("float%d" % rst_channels, "dmat_entries"))

            code.extend([
                POD(float_type, "field_value%d" % inl)
                for inl in range(par.inline)
                ]+[Line()])

            def unroll_body(j):
                result = [
                    Assign("field_value%d" % inl,
                        "smem_field[PAR_MB_NR][%d][mb_el*PREIMAGE_DOFS_PER_EL+%s]" % (inl, j))
                    for inl in range(par.inline)
                    ]

                if float_type == numpy.float32:
                    result.append(Assign("dmat_entries",
                        "tex1Dfetch(diff_rst_mat_tex, IMAGE_EL_DOF + %s*IMAGE_DOFS_PER_EL)" % j))
                    result.extend(
                        S("d%drst%d += dmat_entries.%s * field_value%d"
                            % (inl, axis, tex_channels[axis], inl))
                        for inl in range(par.inline)
                        for axis in dims)
                elif float_type == numpy.float64:
                    result.extend(
                        S("d%(inl)drst%(axis)d += "
                            "fp_tex1Dfetch(diff_rst_mat_tex, %(axis)d "
                            "+ DIMENSIONS*(IMAGE_EL_DOF + %(j)d*IMAGE_DOFS_PER_EL))"
                            "* field_value%(inl)d" % {
                            "inl": inl,
                            "axis": axis,
                            "j": j
                            })
                        for inl in range(par.inline)
                        for axis in dims)
                else:
                    assert False

                return result

            code.append(If("IMAGE_MB_DOF < IMAGE_DOFS_PER_MB", Block(unroll(unroll_body,
                    total_number=plan.preimage_dofs_per_el)
                    +[store_code])))

            return code

        f_body.extend([
            For("unsigned short seq_mb_number = 0",
                "seq_mb_number < SEQ_MB_COUNT",
                "++seq_mb_number",
                Block(get_scalar_diff_code())
                )
            ])

        # finish off ----------------------------------------------------------
        cmod.append(FunctionBody(f_decl, f_body))

        if not for_benchmark and "cuda_dump_kernels" in discr.debug:
            from hedge.tools import open_unique_debug_file
            open_unique_debug_file("diff", ".cu").write(str(cmod))

        mod = SourceModule(cmod,
                keep="cuda_keep_kernels" in discr.debug,
                #options=["--maxrregcount=16"]
                )

        func = mod.get_function("apply_diff_mat_smem")

        if "cuda_diff" in discr.debug:
            print "diff: lmem=%d smem=%d regs=%d" % (
                    func.local_size_bytes,
                    func.shared_size_bytes,
                    func.registers)

        diff_rst_mat_texref = mod.get_texref("diff_rst_mat_tex")
        gpu_diffmats = self.gpu_diffmats(diff_op, elgroup)

        if given.float_type == numpy.float32:
            gpu_diffmats.bind_to_texref_ext(diff_rst_mat_texref, rst_channels)
        elif given.float_type == numpy.float64:
            gpu_diffmats.bind_to_texref_ext(diff_rst_mat_texref,
                    allow_double_hack=True)
        else:
            assert False

        assert given.microblock.aligned_floats % chunk_size == 0
        block = (
                chunk_size,
                plan.parallelism.parallel,
                given.microblock.aligned_floats//chunk_size)

        func.prepare(
                ["PP"] + discr.dimensions*["P"],
                texrefs=[diff_rst_mat_texref])

        return block, func
Exemplo n.º 19
0
    def make_cuda_kernel(self, discr, dtype, eg):
        given = discr.given
        ldis = eg.local_discretization

        microblocks_per_block = 1

        from cgen.cuda import CudaGlobal

        from cgen import (Module, Value, Include,
                Typedef, FunctionBody, FunctionDeclaration, Const,
                Line, POD, LiteralBlock,
                Define, Pointer)

        cmod = Module([
            Include("pycuda-helpers.hpp"),
            Line(),
            Typedef(POD(dtype, "value_type")),
            Line(),
            Define("DOFS_PER_EL", given.dofs_per_el()),
            Define("ALIGNED_DOFS_PER_MB", given.microblock.aligned_floats),
            Define("VERTICES_PER_EL", ldis.vertex_count()),
            Define("ELS_PER_MB", given.microblock.elements),
            Define("MBS_PER_BLOCK", microblocks_per_block),
            Line(),
            Define("DOF_IN_MB_IDX", "threadIdx.x"),
            Define("DOF_IN_EL_IDX", "(DOF_IN_MB_IDX-el_idx_in_mb*DOFS_PER_EL)"),
            Define("MB_IN_BLOCK_IDX", "threadIdx.y"),
            Define("BLOCK_IDX", "blockIdx.x"),
            Define("MB_NUMBER", "(BLOCK_IDX * MBS_PER_BLOCK + MB_IN_BLOCK_IDX)"),
            Define("BLOCK_DATA", "whole_block[MB_IN_BLOCK_IDX]")]
            + self.get_cuda_extra_preamble(discr, dtype, eg)
            + [FunctionBody(
            CudaGlobal(FunctionDeclaration(
                    Value("void", "elwise_kernel"), [
                    Pointer(Const(POD(dtype, "field"))),
                    Pointer(POD(dtype, "result")),
                    POD(numpy.uint32, "mb_count"),
                    ])),
                LiteralBlock("""
                int el_idx_in_mb = DOF_IN_MB_IDX / DOFS_PER_EL;

                if (MB_NUMBER >= mb_count)
                  return;

                int idx =  MB_NUMBER * ALIGNED_DOFS_PER_MB + DOF_IN_MB_IDX;
                int element_base_idx = ALIGNED_DOFS_PER_MB * MB_IN_BLOCK_IDX +
                    (DOF_IN_MB_IDX / DOFS_PER_EL) * DOFS_PER_EL;
                int dof_in_element = DOF_IN_MB_IDX-el_idx_in_mb*DOFS_PER_EL;

                __shared__ value_type whole_block[MBS_PER_BLOCK][ALIGNED_DOFS_PER_MB+1];
                int idx_in_block = ALIGNED_DOFS_PER_MB * MB_IN_BLOCK_IDX + DOF_IN_MB_IDX;
                BLOCK_DATA[idx_in_block] = field[idx];

                __syncthreads();

                %s

                result[idx] = node_result;
                """ % self.get_cuda_code(discr, dtype, eg)))
                ])


        if False:
            for i, l in enumerate(str(cmod).split("\n")):
                print i+1, l
            raw_input()

        from pycuda.compiler import SourceModule
        mod = SourceModule(
                cmod,
                keep="cuda_keep_kernels" in discr.debug,
                )
        func = mod.get_function("elwise_kernel")
        func.prepare(
            "PPI", block=(
                given.microblock.aligned_floats,
                microblocks_per_block, 1))

        mb_count = len(discr.blocks) * discr.given.microblocks_per_block
        grid_dim = (mb_count + microblocks_per_block - 1) \
                // microblocks_per_block

        from pytools import Record
        class KernelInfo(Record):
            pass

        return KernelInfo(
                func=func,
                grid_dim=grid_dim,
                mb_count=mb_count)
Exemplo n.º 20
0
    def get_kernel(self, with_scaling, for_benchmark=False):
        from cgen import \
                Pointer, POD, Value, ArrayOf, \
                Module, FunctionDeclaration, FunctionBody, Block, \
                Line, Define, Include, \
                Initializer, If, For, Statement, Assign, \
                ArrayInitializer

        from cgen import dtype_to_ctype
        from cgen.cuda import CudaShared, CudaConstant, CudaGlobal

        discr = self.discr
        d = discr.dimensions
        dims = range(d)
        given = self.plan.given

        float_type = given.float_type

        f_decl = CudaGlobal(
            FunctionDeclaration(Value("void", "apply_el_local_mat_smem_mat"), [
                Pointer(POD(float_type, "out_vector")),
                Pointer(POD(numpy.uint8, "gmem_matrix")),
                Pointer(POD(float_type, "debugbuf")),
                POD(numpy.uint32, "microblock_count"),
            ]))

        cmod = Module([
            Include("pycuda-helpers.hpp"),
            Line(),
            Value(
                "texture<fp_tex_%s, 1, cudaReadModeElementType>" %
                dtype_to_ctype(float_type), "in_vector_tex"),
        ])
        if with_scaling:
            cmod.append(
                Value(
                    "texture<fp_tex_%s, 1, cudaReadModeElementType>" %
                    dtype_to_ctype(float_type), "scaling_tex"), )

        par = self.plan.parallelism

        cmod.extend([
            Line(),
            Define("DIMENSIONS", discr.dimensions),
            Define("DOFS_PER_EL", given.dofs_per_el()),
            Define("PREIMAGE_DOFS_PER_EL", self.plan.preimage_dofs_per_el),
            Line(),
            Define("SEGMENT_DOF", "threadIdx.x"),
            Define("PAR_MB_NR", "threadIdx.y"),
            Line(),
            Define("MB_SEGMENT", "blockIdx.x"),
            Define("MACROBLOCK_NR", "blockIdx.y"),
            Line(),
            Define("DOFS_PER_SEGMENT", self.plan.segment_size),
            Define("SEGMENTS_PER_MB", self.plan.segments_per_microblock()),
            Define("ALIGNED_DOFS_PER_MB", given.microblock.aligned_floats),
            Define("ALIGNED_PREIMAGE_DOFS_PER_MB",
                   self.plan.aligned_preimage_dofs_per_microblock),
            Define("MB_EL_COUNT", given.microblock.elements),
            Line(),
            Define("PAR_MB_COUNT", par.parallel),
            Define("INLINE_MB_COUNT", par.inline),
            Define("SEQ_MB_COUNT", par.serial),
            Line(),
            Define("THREAD_NUM", "(SEGMENT_DOF+PAR_MB_NR*DOFS_PER_SEGMENT)"),
            Define("COALESCING_THREAD_COUNT",
                   "(PAR_MB_COUNT*DOFS_PER_SEGMENT)"),
            Line(),
            Define("MB_DOF_BASE", "(MB_SEGMENT*DOFS_PER_SEGMENT)"),
            Define("MB_DOF", "(MB_DOF_BASE+SEGMENT_DOF)"),
            Define(
                "GLOBAL_MB_NR_BASE",
                "(MACROBLOCK_NR*PAR_MB_COUNT*INLINE_MB_COUNT*SEQ_MB_COUNT)"),
            Define(
                "GLOBAL_MB_NR", "(GLOBAL_MB_NR_BASE"
                "+ (seq_mb_number*PAR_MB_COUNT + PAR_MB_NR)*INLINE_MB_COUNT)"),
            Define("GLOBAL_MB_DOF_BASE", "(GLOBAL_MB_NR*ALIGNED_DOFS_PER_MB)"),
            Define("GLOBAL_MB_PREIMG_DOF_BASE",
                   "(GLOBAL_MB_NR*ALIGNED_PREIMAGE_DOFS_PER_MB)"),
            Line(),
            Define("MATRIX_COLUMNS", self.plan.gpu_matrix_columns()),
            Define("MATRIX_SEGMENT_FLOATS",
                   self.plan.gpu_matrix_block_floats()),
            Define("MATRIX_SEGMENT_BYTES",
                   "(MATRIX_SEGMENT_FLOATS*%d)" % given.float_size()),
            Line(),
            CudaShared(
                ArrayOf(POD(float_type, "smem_matrix"),
                        "MATRIX_SEGMENT_FLOATS")),
            CudaShared(
                ArrayOf(
                    ArrayOf(
                        ArrayOf(POD(float_type, "dof_buffer"), "PAR_MB_COUNT"),
                        "INLINE_MB_COUNT"), "DOFS_PER_SEGMENT"), ),
            CudaShared(POD(numpy.uint16, "segment_start_el")),
            CudaShared(POD(numpy.uint16, "segment_stop_el")),
            CudaShared(POD(numpy.uint16, "segment_el_count")),
            Line(),
            ArrayInitializer(
                CudaConstant(
                    ArrayOf(POD(numpy.uint32, "segment_start_el_lookup"),
                            "SEGMENTS_PER_MB")),
                [(chk * self.plan.segment_size) // given.dofs_per_el()
                 for chk in range(self.plan.segments_per_microblock())]),
            ArrayInitializer(
                CudaConstant(
                    ArrayOf(POD(numpy.uint32, "segment_stop_el_lookup"),
                            "SEGMENTS_PER_MB")),
                [
                    min(given.microblock.elements,
                        (chk * self.plan.segment_size +
                         self.plan.segment_size - 1) // given.dofs_per_el() +
                        1)
                    for chk in range(self.plan.segments_per_microblock())
                ]),
        ])

        S = Statement
        f_body = Block()

        f_body.extend_log_block(
            "calculate this dof's element",
            [Initializer(POD(numpy.uint8, "mb_el"), "MB_DOF/DOFS_PER_EL")])

        if self.plan.use_prefetch_branch:
            f_body.extend_log_block("calculate segment responsibility data", [
                If(
                    "THREAD_NUM==0",
                    Block([
                        Assign("segment_start_el",
                               "segment_start_el_lookup[MB_SEGMENT]"),
                        Assign("segment_stop_el",
                               "segment_stop_el_lookup[MB_SEGMENT]"),
                        Assign("segment_el_count",
                               "segment_stop_el-segment_start_el"),
                    ])),
                S("__syncthreads()")
            ])

        from hedge.backends.cuda.tools import get_load_code
        f_body.extend(
            get_load_code(dest="smem_matrix",
                          base=(
                              "gmem_matrix + MB_SEGMENT*MATRIX_SEGMENT_BYTES"),
                          bytes="MATRIX_SEGMENT_BYTES",
                          descr="load matrix segment") +
            [S("__syncthreads()")])

        # ---------------------------------------------------------------------
        def get_batched_fetch_mat_mul_code(el_fetch_count):
            result = []
            dofs = range(self.plan.preimage_dofs_per_el)

            for load_segment_start in range(0, self.plan.preimage_dofs_per_el,
                                            self.plan.segment_size):
                result.extend([S("__syncthreads()")] + [
                    Assign(
                        "dof_buffer[PAR_MB_NR][%d][SEGMENT_DOF]" %
                        inl, "fp_tex1Dfetch(in_vector_tex, "
                        "GLOBAL_MB_PREIMG_DOF_BASE"
                        " + %d*ALIGNED_PREIMAGE_DOFS_PER_MB"
                        " + (segment_start_el)*PREIMAGE_DOFS_PER_EL + %d + SEGMENT_DOF)"
                        % (inl, load_segment_start))
                    for inl in range(par.inline)
                ] + [
                    S("__syncthreads()"),
                    Line(),
                ])

                for dof in dofs[load_segment_start:load_segment_start +
                                self.plan.segment_size]:
                    for inl in range(par.inline):
                        result.append(
                            S("result%d += "
                              "smem_matrix[SEGMENT_DOF*MATRIX_COLUMNS + %d]"
                              "*"
                              "dof_buffer[PAR_MB_NR][%d][%d]" %
                              (inl, dof, inl, dof - load_segment_start)))
                result.append(Line())
            return result

        from hedge.backends.cuda.tools import unroll

        def get_direct_tex_mat_mul_code():
            return (
                [POD(float_type, "fof%d" % inl) for inl in range(par.inline)] +
                [POD(float_type, "lm"), Line()] + unroll(
                    lambda j: [
                        Assign(
                            "fof%d" % inl,
                            "fp_tex1Dfetch(in_vector_tex, "
                            "GLOBAL_MB_PREIMG_DOF_BASE"
                            " + %(inl)d * ALIGNED_PREIMAGE_DOFS_PER_MB"
                            " + mb_el*PREIMAGE_DOFS_PER_EL+%(j)s)" % {
                                "j": j,
                                "inl": inl,
                                "row": "SEGMENT_DOF"
                            },
                        ) for inl in range(par.inline)
                    ] + [
                        Assign(
                            "lm",
                            "smem_matrix["
                            "%(row)s*MATRIX_COLUMNS + %(j)s]" % {
                                "j": j,
                                "row": "SEGMENT_DOF"
                            },
                        )
                    ] + [
                        S("result%(inl)d += fof%(inl)d*lm" % {"inl": inl})
                        for inl in range(par.inline)
                    ],
                    total_number=self.plan.preimage_dofs_per_el,
                    max_unroll=self.plan.max_unroll) + [Line()])

        def get_mat_mul_code(el_fetch_count):
            if el_fetch_count == 1:
                return get_batched_fetch_mat_mul_code(el_fetch_count)
            else:
                return get_direct_tex_mat_mul_code()

        def mat_mul_outer_loop(fetch_count):
            if with_scaling:
                inv_jac_multiplier = (
                    "fp_tex1Dfetch(scaling_tex,"
                    "(GLOBAL_MB_NR + %(inl)d)*MB_EL_COUNT + mb_el)")
            else:
                inv_jac_multiplier = "1"

            write_condition = "MB_DOF < DOFS_PER_EL*MB_EL_COUNT"
            if self.with_index_check:
                write_condition += " && GLOBAL_MB_NR < microblock_count"
            return For(
                "unsigned short seq_mb_number = 0",
                "seq_mb_number < SEQ_MB_COUNT", "++seq_mb_number",
                Block([
                    Initializer(POD(float_type, "result%d" % inl), 0)
                    for inl in range(par.inline)
                ] + [Line()] + get_mat_mul_code(fetch_count) + [
                    If(
                        write_condition,
                        Block([
                            Assign(
                                "out_vector[GLOBAL_MB_DOF_BASE"
                                " + %d*ALIGNED_DOFS_PER_MB"
                                " + MB_DOF]" % inl, "result%d * %s" %
                                (inl, (inv_jac_multiplier % {
                                    "inl": inl
                                }))) for inl in range(par.inline)
                        ]))
                ]))

        if self.plan.use_prefetch_branch:
            from cgen import make_multiple_ifs
            f_body.append(
                make_multiple_ifs([
                    ("segment_el_count == %d" % fetch_count,
                     mat_mul_outer_loop(fetch_count)) for fetch_count in range(
                         1,
                         self.plan.max_elements_touched_by_segment() + 1)
                ]))
        else:
            f_body.append(mat_mul_outer_loop(0))

        # finish off ----------------------------------------------------------
        cmod.append(FunctionBody(f_decl, f_body))

        if not for_benchmark and "cuda_dump_kernels" in discr.debug:
            from hedge.tools import open_unique_debug_file
            open_unique_debug_file(self.plan.debug_name,
                                   ".cu").write(str(cmod))

        mod = SourceModule(
            cmod,
            keep="cuda_keep_kernels" in discr.debug,
            #options=["--maxrregcount=12"]
        )

        func = mod.get_function("apply_el_local_mat_smem_mat")

        if self.plan.debug_name in discr.debug:
            print "%s: lmem=%d smem=%d regs=%d" % (
                self.plan.debug_name, func.local_size_bytes,
                func.shared_size_bytes, func.num_regs)

        in_vector_texref = mod.get_texref("in_vector_tex")
        texrefs = [in_vector_texref]

        if with_scaling:
            scaling_texref = mod.get_texref("scaling_tex")
            texrefs.append(scaling_texref)
        else:
            scaling_texref = None

        func.prepare("PPPI",
                     block=(self.plan.segment_size,
                            self.plan.parallelism.parallel, 1),
                     texrefs=texrefs)

        return func, in_vector_texref, scaling_texref
Exemplo n.º 21
0
    def emit_multiple_assignment(self, codegen_state, insn):
        ecm = codegen_state.expression_to_code_mapper

        from pymbolic.primitives import Variable
        from pymbolic.mapper.stringifier import PREC_NONE

        func_id = insn.expression.function
        parameters = insn.expression.parameters

        if isinstance(func_id, Variable):
            func_id = func_id.name

        assignee_var_descriptors = [
            codegen_state.kernel.get_var_descriptor(a)
            for a in insn.assignee_var_names()
        ]

        par_dtypes = tuple(ecm.infer_type(par) for par in parameters)

        str_parameters = None

        mangle_result = codegen_state.kernel.mangle_function(
            func_id, par_dtypes)
        if mangle_result is None:
            raise RuntimeError(
                "function '%s' unknown--"
                "maybe you need to register a function mangler?" % func_id)

        assert mangle_result.arg_dtypes is not None

        from loopy.expression import dtype_to_type_context
        str_parameters = [
            ecm(par, PREC_NONE, dtype_to_type_context(self.target, tgt_dtype),
                tgt_dtype) for par, par_dtype, tgt_dtype in zip(
                    parameters, par_dtypes, mangle_result.arg_dtypes)
        ]

        from loopy.codegen import SeenFunction
        codegen_state.seen_functions.add(
            SeenFunction(func_id, mangle_result.target_name,
                         mangle_result.arg_dtypes))

        for i, (a, tgt_dtype) in enumerate(
                zip(insn.assignees[1:], mangle_result.result_dtypes[1:])):
            if tgt_dtype != ecm.infer_type(a):
                raise LoopyError("type mismatch in %d'th (1-based) left-hand "
                                 "side of instruction '%s'" % (i + 1, insn.id))
            str_parameters.append(
                "&(%s)" %
                ecm(a, PREC_NONE, dtype_to_type_context(
                    self.target, tgt_dtype), tgt_dtype))

        result = "%s(%s)" % (mangle_result.target_name,
                             ", ".join(str_parameters))

        # In case of no assignees, we are done
        if len(mangle_result.result_dtypes) == 0:
            from cgen import Line
            return Line(result + ';')

        result = ecm.wrap_in_typecast(mangle_result.result_dtypes[0],
                                      assignee_var_descriptors[0].dtype,
                                      result)

        lhs_code = ecm(insn.assignees[0], prec=PREC_NONE, type_context=None)

        from cgen import Assign
        return Assign(lhs_code, result)
Exemplo n.º 22
0
    def get_kernel(self, diff_op_cls, elgroup, for_benchmark=False):
        from cgen import \
                Pointer, POD, Value, ArrayOf, \
                Module, FunctionDeclaration, FunctionBody, Block, \
                Line, Define, Include, \
                Initializer, If, For, Statement, Assign

        from cgen import dtype_to_ctype
        from cgen.cuda import CudaShared, CudaGlobal

        discr = self.discr
        d = discr.dimensions
        dims = range(d)
        given = self.plan.given

        par = self.plan.parallelism

        diffmat_data = self.gpu_diffmats(diff_op_cls, elgroup)
        elgroup, = discr.element_groups

        float_type = given.float_type

        f_decl = CudaGlobal(
            FunctionDeclaration(
                Value("void", "apply_diff_mat"),
                [
                    Pointer(POD(numpy.uint8, "gmem_diff_rst_mat")),
                    #Pointer(POD(float_type, "debugbuf")),
                ] +
                [Pointer(POD(float_type, "drst%d_global" % i)) for i in dims]))

        rst_channels = given.devdata.make_valid_tex_channel_count(d)
        cmod = Module([
            Include("pycuda-helpers.hpp"),
            Line(),
            Value(
                "texture<fp_tex_%s, 1, cudaReadModeElementType>" %
                dtype_to_ctype(float_type), "field_tex"),
            Line(),
            Define("DIMENSIONS", discr.dimensions),
            Define("DOFS_PER_EL", given.dofs_per_el()),
            Line(),
            Define("SEGMENT_DOF", "threadIdx.x"),
            Define("PAR_MB_NR", "threadIdx.y"),
            Line(),
            Define("MB_SEGMENT", "blockIdx.x"),
            Define("MACROBLOCK_NR", "blockIdx.y"),
            Line(),
            Define("DOFS_PER_SEGMENT", self.plan.segment_size),
            Define("SEGMENTS_PER_MB", self.plan.segments_per_microblock()),
            Define("ALIGNED_DOFS_PER_MB", given.microblock.aligned_floats),
            Define("ELS_PER_MB", given.microblock.elements),
            Line(),
            Define("PAR_MB_COUNT", par.parallel),
            Define("INLINE_MB_COUNT", par.inline),
            Define("SEQ_MB_COUNT", par.serial),
            Line(),
            Define("THREAD_NUM", "(SEGMENT_DOF+PAR_MB_NR*DOFS_PER_SEGMENT)"),
            Define("COALESCING_THREAD_COUNT",
                   "(PAR_MB_COUNT*DOFS_PER_SEGMENT)"),
            Line(),
            Define("MB_DOF_BASE", "(MB_SEGMENT*DOFS_PER_SEGMENT)"),
            Define("MB_DOF", "(MB_DOF_BASE+SEGMENT_DOF)"),
            Define(
                "GLOBAL_MB_NR_BASE",
                "(MACROBLOCK_NR*PAR_MB_COUNT*INLINE_MB_COUNT*SEQ_MB_COUNT)"),
            Define(
                "GLOBAL_MB_NR", "(GLOBAL_MB_NR_BASE"
                "+ (seq_mb_number*PAR_MB_COUNT + PAR_MB_NR)*INLINE_MB_COUNT)"),
            Define("GLOBAL_MB_DOF_BASE", "(GLOBAL_MB_NR*ALIGNED_DOFS_PER_MB)"),
            Line(),
            Define("DIFFMAT_SEGMENT_FLOATS", diffmat_data.block_floats),
            Define("DIFFMAT_SEGMENT_BYTES",
                   "(DIFFMAT_SEGMENT_FLOATS*%d)" % given.float_size()),
            Define("DIFFMAT_COLUMNS", diffmat_data.matrix_columns),
            Line(),
            CudaShared(
                ArrayOf(POD(float_type, "smem_diff_rst_mat"),
                        "DIFFMAT_COLUMNS*DOFS_PER_SEGMENT")),
            Line(),
        ])

        S = Statement
        f_body = Block()

        f_body.extend_log_block("calculate responsibility data", [
            Initializer(POD(numpy.uint16, "mb_el"), "MB_DOF/DOFS_PER_EL"),
        ])

        from hedge.backends.cuda.tools import get_load_code
        f_body.extend(
            get_load_code(
                dest="smem_diff_rst_mat",
                base="gmem_diff_rst_mat + MB_SEGMENT*DIFFMAT_SEGMENT_BYTES",
                bytes="DIFFMAT_SEGMENT_BYTES",
                descr="load diff mat segment") +
            [S("__syncthreads()"), Line()])

        # ---------------------------------------------------------------------
        def get_scalar_diff_code():
            code = []
            for inl in range(par.inline):
                for axis in dims:
                    code.append(
                        Initializer(POD(float_type, "d%drst%d" % (inl, axis)),
                                    0))

            code.append(Line())

            def get_mat_entry(row, col, axis):
                return ("smem_diff_rst_mat["
                        "%(row)s*DIFFMAT_COLUMNS + %(axis)s*DOFS_PER_EL"
                        " + %(col)s"
                        "]" % {
                            "row": row,
                            "col": col,
                            "axis": axis
                        })

            tex_channels = ["x", "y", "z", "w"]
            from hedge.backends.cuda.tools import unroll
            code.extend([
                POD(float_type, "field_value%d" % inl)
                for inl in range(par.inline)
            ] + [Line()] + unroll(
                lambda j: [
                    Assign(
                        "field_value%d" % inl,
                        "fp_tex1Dfetch(field_tex, GLOBAL_MB_DOF_BASE + %d*ALIGNED_DOFS_PER_MB "
                        "+ mb_el*DOFS_PER_EL + %s)" % (inl, j))
                    for inl in range(par.inline)
                ] + [Line()] + [
                    S("d%drst%d += %s * field_value%d" %
                      (inl, axis, get_mat_entry("SEGMENT_DOF", j, axis), inl))
                    for axis in dims for inl in range(par.inline)
                ] + [Line()], given.dofs_per_el(), self.plan.max_unroll))

            store_code = Block()
            for inl in range(par.inline):
                for rst_axis in dims:
                    store_code.append(
                        Assign(
                            "drst%d_global[GLOBAL_MB_DOF_BASE"
                            " + %d*ALIGNED_DOFS_PER_MB + MB_DOF]" %
                            (rst_axis, inl),
                            "d%drst%d" % (inl, rst_axis),
                        ))

            code.append(If("MB_DOF < DOFS_PER_EL*ELS_PER_MB", store_code))

            return code

        f_body.extend([
            For("unsigned short seq_mb_number = 0",
                "seq_mb_number < SEQ_MB_COUNT", "++seq_mb_number",
                Block(get_scalar_diff_code()))
        ])

        # finish off ----------------------------------------------------------
        cmod.append(FunctionBody(f_decl, f_body))

        if not for_benchmark and "cuda_dump_kernels" in discr.debug:
            from hedge.tools import open_unique_debug_file
            open_unique_debug_file("diff", ".cu").write(str(cmod))

        mod = SourceModule(
            cmod,
            keep="cuda_keep_kernels" in discr.debug,
            #options=["--maxrregcount=10"]
        )

        field_texref = mod.get_texref("field_tex")

        func = mod.get_function("apply_diff_mat")
        func.prepare(discr.dimensions * [float_type] + ["P"],
                     block=(self.plan.segment_size, par.parallel, 1),
                     texrefs=[field_texref])

        if "cuda_diff" in discr.debug:
            print "diff: lmem=%d smem=%d regs=%d" % (
                func.local_size_bytes, func.shared_size_bytes, func.num_regs)

        return func, field_texref
Exemplo n.º 23
0
 def emit_blank_line(self):
     from cgen import Line
     return Line()
Exemplo n.º 24
0
    def get_temporary_decls(self, codegen_state, schedule_index):
        from loopy.kernel.data import AddressSpace

        kernel = codegen_state.kernel

        base_storage_decls = []
        temp_decls = []

        # {{{ declare temporaries

        base_storage_sizes = {}
        base_storage_to_scope = {}
        base_storage_to_align_bytes = {}

        from cgen import ArrayOf, Initializer, AlignedAttribute, Value, Line
        # Getting the temporary variables that are needed for the current
        # sub-kernel.
        from loopy.schedule.tools import (
                temporaries_read_in_subkernel,
                temporaries_written_in_subkernel)
        subkernel = kernel.schedule[schedule_index].kernel_name
        sub_knl_temps = (
                temporaries_read_in_subkernel(kernel, subkernel) |
                temporaries_written_in_subkernel(kernel, subkernel))

        for tv in sorted(
                six.itervalues(kernel.temporary_variables),
                key=lambda tv: tv.name):
            decl_info = tv.decl_info(self.target, index_dtype=kernel.index_dtype)

            if not tv.base_storage:
                for idi in decl_info:
                    # global temp vars are mapped to arguments or global declarations
                    if tv.address_space != AddressSpace.GLOBAL and (
                            tv.name in sub_knl_temps):
                        decl = self.wrap_temporary_decl(
                                self.get_temporary_decl(
                                    codegen_state, schedule_index, tv, idi),
                                tv.address_space)

                        if tv.initializer is not None:
                            assert tv.read_only
                            decl = Initializer(decl, generate_array_literal(
                                codegen_state, tv, tv.initializer))

                        temp_decls.append(decl)

            else:
                assert tv.initializer is None

                offset = 0
                base_storage_sizes.setdefault(tv.base_storage, []).append(
                        tv.nbytes)
                base_storage_to_scope.setdefault(tv.base_storage, []).append(
                        tv.address_space)

                align_size = tv.dtype.itemsize

                from loopy.kernel.array import VectorArrayDimTag
                for dim_tag, axis_len in zip(tv.dim_tags, tv.shape):
                    if isinstance(dim_tag, VectorArrayDimTag):
                        align_size *= axis_len

                base_storage_to_align_bytes.setdefault(tv.base_storage, []).append(
                        align_size)

                for idi in decl_info:
                    cast_decl = POD(self, idi.dtype, "")
                    temp_var_decl = POD(self, idi.dtype, idi.name)

                    cast_decl = self.wrap_temporary_decl(cast_decl, tv.address_space)
                    temp_var_decl = self.wrap_temporary_decl(
                            temp_var_decl, tv.address_space)

                    if tv._base_storage_access_may_be_aliasing:
                        ptrtype = _ConstPointer
                    else:
                        # The 'restrict' part of this is a complete lie--of course
                        # all these temporaries are aliased. But we're promising to
                        # not use them to shovel data from one representation to the
                        # other. That counts, right?
                        ptrtype = _ConstRestrictPointer

                    cast_decl = ptrtype(cast_decl)
                    temp_var_decl = ptrtype(temp_var_decl)

                    cast_tp, cast_d = cast_decl.get_decl_pair()
                    temp_var_decl = Initializer(
                            temp_var_decl,
                            "(%s %s) (%s + %s)" % (
                                " ".join(cast_tp), cast_d,
                                tv.base_storage,
                                offset))

                    temp_decls.append(temp_var_decl)

                    from pytools import product
                    offset += (
                            idi.dtype.itemsize
                            * product(si for si in idi.shape))

        ecm = self.get_expression_to_code_mapper(codegen_state)

        for bs_name, bs_sizes in sorted(six.iteritems(base_storage_sizes)):
            bs_var_decl = Value("char", bs_name)
            from pytools import single_valued
            bs_var_decl = self.wrap_temporary_decl(
                    bs_var_decl, single_valued(base_storage_to_scope[bs_name]))

            # FIXME: Could try to use isl knowledge to simplify max.
            if all(isinstance(bs, int) for bs in bs_sizes):
                bs_size_max = max(bs_sizes)
            else:
                bs_size_max = p.Max(tuple(bs_sizes))

            bs_var_decl = ArrayOf(bs_var_decl, ecm(bs_size_max))

            alignment = max(base_storage_to_align_bytes[bs_name])
            bs_var_decl = AlignedAttribute(alignment, bs_var_decl)

            base_storage_decls.append(bs_var_decl)

        # }}}

        result = base_storage_decls + temp_decls

        if result:
            result.append(Line())

        return result
Exemplo n.º 25
0
    def write_interior_flux_code(self, is_twosided):
        given = self.plan.given

        def get_field(flux_rec, is_interior, flipped):
            if is_interior ^ flipped:
                prefix = "a"
            else:
                prefix = "b"

            return ("val_%s_field%d" %
                    (prefix, self.dep_to_index[flux_rec.field_expr]))

        flux_write_code = Block([])

        flux_var_decl = [Initializer(POD(given.float_type, "a_flux"), 0)]

        if is_twosided:
            flux_var_decl.append(
                Initializer(POD(given.float_type, "b_flux"), 0))
            prefixes = ["a", "b"]
            flip_values = [False, True]
        else:
            prefixes = ["a"]
            flip_values = [False]

        flux_write_code.append(Line())

        for dep in self.interior_deps:
            flux_write_code.append(Comment(str(dep)))

            for side in ["a", "b"]:
                flux_write_code.append(
                    Initializer(
                        MaybeUnused(
                            POD(
                                given.float_type, "val_%s_field%d" %
                                (side, self.dep_to_index[dep]))),
                        "fp_tex1Dfetch(field%d_tex, %s_index)" %
                        (self.dep_to_index[dep], side)))

        f2cm = FluxToCodeMapper(given.float_type)

        flux_sub_codes = []
        for flux_nr, wdflux in enumerate(self.fluxes):
            my_flux_block = Block(flux_var_decl)

            for int_rec in wdflux.interiors:
                for prefix, is_flipped in zip(prefixes, flip_values):
                    my_flux_block.append(
                        Statement("%s_flux += %s" % (
                            prefix,
                            flux_to_code(f2cm, is_flipped, int_rec.field_expr,
                                         int_rec.field_expr, self.dep_to_index,
                                         int_rec.flux_expr, PREC_NONE),
                        )))

            my_flux_block.append(Line())

            my_flux_block.append(
                self.gen_store(flux_nr, "fpair->a_dest+FACEDOF_NR",
                               "fpair->face_jacobian*a_flux"))

            #my_flux_block.append(
            #Statement("if(isnan(val_b_field5)) debugbuf[blockIdx.x] = 1"),
            #)

            if is_twosided:
                my_flux_block.append(
                    self.gen_store(
                        flux_nr, "fpair->b_dest+tex1Dfetch(tex_index_lists, "
                        "fpair->b_write_ilist_index + FACEDOF_NR)",
                        "fpair->face_jacobian*b_flux"))

                #my_flux_block.append(
                #Assign("debugbuf[blockIdx.x*96+fpair_nr+8]", "10000+fpair->b_dest"),
                #)

            flux_sub_codes.append(my_flux_block)

        if f2cm.cse_name_list:
            flux_write_code.append(Line())

        flux_write_code.extend(
            Initializer(Value("value_type", cse_name), cse_str)
            for cse_name, cse_str in f2cm.cse_name_list)

        flux_write_code.extend(flux_sub_codes)

        return flux_write_code
Exemplo n.º 26
0
    def write_boundary_flux_code(self, for_benchmark):
        given = self.plan.given

        flux_write_code = Block()

        fluxes_by_bdry_number = {}
        for flux_nr, wdflux in enumerate(self.fluxes):
            for bflux_info in wdflux.boundaries:
                if for_benchmark:
                    bdry_number = 0
                else:
                    bdry_number = self.executor.boundary_tag_to_number[
                        bflux_info.bpair.tag]

                fluxes_by_bdry_number.setdefault(bdry_number, [])\
                        .append((flux_nr, bflux_info))

        flux_write_code.extend([
            Initializer(MaybeUnused(POD(given.float_type, "flux%d" % flux_nr)),
                        0) for flux_nr in range(len(self.fluxes))
        ])

        for bdry_number, nrs_and_fluxes in fluxes_by_bdry_number.iteritems():
            bblock = []

            from pytools import set_sum
            int_deps = set_sum(flux_rec.int_dependencies
                               for flux_nr, flux_rec in nrs_and_fluxes)
            ext_deps = set_sum(flux_rec.ext_dependencies
                               for flux_nr, flux_rec in nrs_and_fluxes)

            for dep in int_deps:
                bblock.extend([
                    Comment(str(dep)),
                    Initializer(
                        MaybeUnused(
                            POD(given.float_type,
                                "val_a_field%d" % self.dep_to_index[dep])),
                        "fp_tex1Dfetch(field%d_tex, a_index)" %
                        self.dep_to_index[dep])
                ])
            for dep in ext_deps:
                bblock.extend([
                    Comment(str(dep)),
                    Initializer(
                        MaybeUnused(
                            POD(given.float_type,
                                "val_b_field%d" % self.dep_to_index[dep])),
                        "fp_tex1Dfetch(field%s_tex, b_index)" %
                        self.dep_to_index[dep])
                ])

            f2cm = FluxToCodeMapper(given.float_type)

            comp_code = [Line()]
            for flux_nr, flux_rec in nrs_and_fluxes:
                comp_code.append(
                    Statement(
                        ("flux%d += " % flux_nr) +
                        flux_to_code(f2cm,
                                     is_flipped=False,
                                     int_field_expr=flux_rec.bpair.field,
                                     ext_field_expr=flux_rec.bpair.bfield,
                                     dep_to_index=self.dep_to_index,
                                     flux=flux_rec.flux_expr,
                                     prec=PREC_NONE)))

            if f2cm.cse_name_list:
                bblock.append(Line())

            bblock.extend(
                Initializer(Value("value_type", cse_name), cse_str)
                for cse_name, cse_str in f2cm.cse_name_list)

            flux_write_code.extend([
                Line(),
                Comment(nrs_and_fluxes[0][1].bpair.tag),
                If("(fpair->boundary_bitmap) & (1 << %d)" % (bdry_number),
                   Block(bblock + comp_code)),
            ])

        flux_write_code.extend(
            [
                Line(),
            ] + [
                self.gen_store(flux_nr, "fpair->a_dest+FACEDOF_NR",
                               "fpair->face_jacobian * flux%d" % flux_nr)
                for flux_nr in range(len(self.fluxes))
            ]
            #Assign("debugbuf[blockIdx.x*96+fpair_nr]", "10000+fpair->a_dest"),
        )

        return flux_write_code
Exemplo n.º 27
0
    def get_temporary_decls(self, codegen_state, schedule_index):
        from loopy.kernel.data import temp_var_scope

        kernel = codegen_state.kernel

        base_storage_decls = []
        temp_decls = []

        # {{{ declare temporaries

        base_storage_sizes = {}
        base_storage_to_scope = {}
        base_storage_to_align_bytes = {}

        from cgen import ArrayOf, Initializer, AlignedAttribute, Value, Line

        for tv in sorted(six.itervalues(kernel.temporary_variables),
                         key=lambda tv: tv.name):
            decl_info = tv.decl_info(self.target,
                                     index_dtype=kernel.index_dtype)

            if not tv.base_storage:
                for idi in decl_info:
                    # global temp vars are mapped to arguments or global declarations
                    if tv.scope != temp_var_scope.GLOBAL:
                        decl = self.wrap_temporary_decl(
                            self.get_temporary_decl(kernel, schedule_index, tv,
                                                    idi), tv.scope)

                        if tv.initializer is not None:
                            decl = Initializer(
                                decl,
                                generate_array_literal(codegen_state, tv,
                                                       tv.initializer))

                        temp_decls.append(decl)

            else:
                assert tv.initializer is None

                offset = 0
                base_storage_sizes.setdefault(tv.base_storage,
                                              []).append(tv.nbytes)
                base_storage_to_scope.setdefault(tv.base_storage,
                                                 []).append(tv.scope)

                align_size = tv.dtype.itemsize

                from loopy.kernel.array import VectorArrayDimTag
                for dim_tag, axis_len in zip(tv.dim_tags, tv.shape):
                    if isinstance(dim_tag, VectorArrayDimTag):
                        align_size *= axis_len

                base_storage_to_align_bytes.setdefault(tv.base_storage,
                                                       []).append(align_size)

                for idi in decl_info:
                    cast_decl = POD(self, idi.dtype, "")
                    temp_var_decl = POD(self, idi.dtype, idi.name)

                    cast_decl = self.wrap_temporary_decl(cast_decl, tv.scope)
                    temp_var_decl = self.wrap_temporary_decl(
                        temp_var_decl, tv.scope)

                    # The 'restrict' part of this is a complete lie--of course
                    # all these temporaries are aliased. But we're promising to
                    # not use them to shovel data from one representation to the
                    # other. That counts, right?

                    cast_decl = _ConstRestrictPointer(cast_decl)
                    temp_var_decl = _ConstRestrictPointer(temp_var_decl)

                    cast_tp, cast_d = cast_decl.get_decl_pair()
                    temp_var_decl = Initializer(
                        temp_var_decl, "(%s %s) (%s + %s)" %
                        (" ".join(cast_tp), cast_d, tv.base_storage, offset))

                    temp_decls.append(temp_var_decl)

                    from pytools import product
                    offset += (idi.dtype.itemsize *
                               product(si for si in idi.shape))

        for bs_name, bs_sizes in sorted(six.iteritems(base_storage_sizes)):
            bs_var_decl = Value("char", bs_name)
            from pytools import single_valued
            bs_var_decl = self.wrap_temporary_decl(
                bs_var_decl, single_valued(base_storage_to_scope[bs_name]))
            bs_var_decl = ArrayOf(bs_var_decl, max(bs_sizes))

            alignment = max(base_storage_to_align_bytes[bs_name])
            bs_var_decl = AlignedAttribute(alignment, bs_var_decl)

            base_storage_decls.append(bs_var_decl)

        # }}}

        result = base_storage_decls + temp_decls

        if result:
            result.append(Line())

        return result
Exemplo n.º 28
0
        def get_scalar_diff_code():
            code = []
            for inl in range(par.inline):
                for axis in dims:
                    code.append(
                        Initializer(POD(float_type, "d%drst%d" % (inl, axis)), 0))

            code.append(Line())

            tex_channels = ["x", "y", "z", "w"]

            store_code = Block()
            for inl in range(par.inline):
                for rst_axis in dims:
                    store_code.append(Assign(
                        "drst%d_global[GLOBAL_MB_IMAGE_DOF_BASE + "
                        "%d*ALIGNED_IMAGE_DOFS_PER_MB + IMAGE_MB_DOF]"
                        % (rst_axis, inl),
                        "d%drst%d" % (inl, rst_axis)
                        ))

            from hedge.backends.cuda.tools import unroll
            code.extend([
                Comment("everybody needs to be done with the old data"),
                S("__syncthreads()"),
                Line(),
                get_load_code(),
                Line(),
                Comment("all the new data must be loaded"),
                S("__syncthreads()"),
                Line(),
                ])

            if float_type == numpy.float32:
                code.append(Value("float%d" % rst_channels, "dmat_entries"))

            code.extend([
                POD(float_type, "field_value%d" % inl)
                for inl in range(par.inline)
                ]+[Line()])

            def unroll_body(j):
                result = [
                    Assign("field_value%d" % inl,
                        "smem_field[PAR_MB_NR][%d][mb_el*PREIMAGE_DOFS_PER_EL+%s]" % (inl, j))
                    for inl in range(par.inline)
                    ]

                if float_type == numpy.float32:
                    result.append(Assign("dmat_entries",
                        "tex1Dfetch(diff_rst_mat_tex, IMAGE_EL_DOF + %s*IMAGE_DOFS_PER_EL)" % j))
                    result.extend(
                        S("d%drst%d += dmat_entries.%s * field_value%d"
                            % (inl, axis, tex_channels[axis], inl))
                        for inl in range(par.inline)
                        for axis in dims)
                elif float_type == numpy.float64:
                    result.extend(
                        S("d%(inl)drst%(axis)d += "
                            "fp_tex1Dfetch(diff_rst_mat_tex, %(axis)d "
                            "+ DIMENSIONS*(IMAGE_EL_DOF + %(j)d*IMAGE_DOFS_PER_EL))"
                            "* field_value%(inl)d" % {
                            "inl": inl,
                            "axis": axis,
                            "j": j
                            })
                        for inl in range(par.inline)
                        for axis in dims)
                else:
                    assert False

                return result

            code.append(If("IMAGE_MB_DOF < IMAGE_DOFS_PER_MB", Block(unroll(unroll_body,
                    total_number=plan.preimage_dofs_per_el)
                    +[store_code])))

            return code
Exemplo n.º 29
0
    def get_kernel(self, fdata, ilist_data, for_benchmark):
        from cgen.cuda import CudaShared, CudaGlobal
        from pycuda.tools import dtype_to_ctype

        discr = self.discr
        given = self.plan.given
        fplan = self.plan
        d = discr.dimensions
        dims = range(d)

        elgroup, = discr.element_groups

        float_type = given.float_type

        f_decl = CudaGlobal(
            FunctionDeclaration(Value("void", "apply_flux"), [
                Pointer(POD(float_type, "debugbuf")),
                Pointer(POD(numpy.uint8, "gmem_facedata")),
            ] + [
                Pointer(POD(float_type, "gmem_fluxes_on_faces%d" % flux_nr))
                for flux_nr in range(len(self.fluxes))
            ]))

        cmod = Module()
        cmod.append(Include("pycuda-helpers.hpp"))

        for dep_expr in self.all_deps:
            cmod.extend([
                Value(
                    "texture<%s, 1, cudaReadModeElementType>" %
                    dtype_to_ctype(float_type, with_fp_tex_hack=True),
                    "field%d_tex" % self.dep_to_index[dep_expr])
            ])

        if fplan.flux_count != len(self.fluxes):
            from warnings import warn
            warn(
                "Flux count in flux execution plan different from actual flux count.\n"
                "You may want to specify the tune_for= kwarg in the Discretization\n"
                "constructor.")

        cmod.extend([
            Line(),
            Typedef(POD(float_type, "value_type")),
            Line(),
            flux_header_struct(float_type, discr.dimensions),
            Line(),
            face_pair_struct(float_type, discr.dimensions),
            Line(),
            Define("DIMENSIONS", discr.dimensions),
            Define("DOFS_PER_FACE", fplan.dofs_per_face),
            Define("THREADS_PER_FACE", fplan.threads_per_face()),
            Line(),
            Define("CONCURRENT_FACES", fplan.parallel_faces),
            Define("BLOCK_MB_COUNT", fplan.mbs_per_block),
            Line(),
            Define("FACEDOF_NR", "threadIdx.x"),
            Define("BLOCK_FACE", "threadIdx.y"),
            Line(),
            Define("FLUX_COUNT", len(self.fluxes)),
            Line(),
            Define("THREAD_NUM", "(FACEDOF_NR + BLOCK_FACE*THREADS_PER_FACE)"),
            Define("THREAD_COUNT", "(THREADS_PER_FACE*CONCURRENT_FACES)"),
            Define(
                "COALESCING_THREAD_COUNT",
                "(THREAD_COUNT < 0x10 ? THREAD_COUNT : THREAD_COUNT & ~0xf)"),
            Line(),
            Define("DATA_BLOCK_SIZE", fdata.block_bytes),
            Define("ALIGNED_FACE_DOFS_PER_MB",
                   fplan.aligned_face_dofs_per_microblock()),
            Define("ALIGNED_FACE_DOFS_PER_BLOCK",
                   "(ALIGNED_FACE_DOFS_PER_MB*BLOCK_MB_COUNT)"),
            Line(),
            Define("FOF_BLOCK_BASE",
                   "(blockIdx.x*ALIGNED_FACE_DOFS_PER_BLOCK)"),
            Line(),
        ] + ilist_data.code + [
            Line(),
            Value("texture<index_list_entry_t, 1, cudaReadModeElementType>",
                  "tex_index_lists"),
            Line(),
            fdata.struct,
            Line(),
            CudaShared(Value("flux_data", "data")),
        ])

        if not fplan.direct_store:
            cmod.extend([
                CudaShared(
                    ArrayOf(
                        ArrayOf(POD(float_type, "smem_fluxes_on_faces"),
                                "FLUX_COUNT"),
                        "ALIGNED_FACE_DOFS_PER_MB*BLOCK_MB_COUNT")),
                Line(),
            ])

        S = Statement
        f_body = Block()

        from hedge.backends.cuda.tools import get_load_code

        f_body.extend(
            get_load_code(dest="&data",
                          base="gmem_facedata + blockIdx.x*DATA_BLOCK_SIZE",
                          bytes="sizeof(flux_data)",
                          descr="load face_pair data") +
            [S("__syncthreads()"), Line()])

        def get_flux_code(flux_writer):
            flux_code = Block([])

            flux_code.extend([
                Initializer(Pointer(Value("face_pair", "fpair")),
                            "data.facepairs+fpair_nr"),
                Initializer(
                    MaybeUnused(POD(numpy.uint32, "a_index")),
                    "fpair->a_base + tex1Dfetch(tex_index_lists, "
                    "fpair->a_ilist_index + FACEDOF_NR)"),
                Initializer(
                    MaybeUnused(POD(numpy.uint32, "b_index")),
                    "fpair->b_base + tex1Dfetch(tex_index_lists, "
                    "fpair->b_ilist_index + FACEDOF_NR)"),
                Line(),
                flux_writer(),
                Line(),
                S("fpair_nr += CONCURRENT_FACES")
            ])

            return flux_code

        flux_computation = Block([
            Comment("fluxes for dual-sided (intra-block) interior face pairs"),
            While("fpair_nr < data.header.same_facepairs_end",
                  get_flux_code(lambda: self.write_interior_flux_code(True))),
            Line(),
            Comment("work around nvcc assertion failure"),
            S("fpair_nr+=1"),
            S("fpair_nr-=1"),
            Line(),
            Comment(
                "fluxes for single-sided (inter-block) interior face pairs"),
            While("fpair_nr < data.header.diff_facepairs_end",
                  get_flux_code(lambda: self.write_interior_flux_code(False))),
            Line(),
            Comment("fluxes for single-sided boundary face pairs"),
            While(
                "fpair_nr < data.header.bdry_facepairs_end",
                get_flux_code(
                    lambda: self.write_boundary_flux_code(for_benchmark))),
        ])

        f_body.extend_log_block("compute the fluxes", [
            Initializer(POD(numpy.uint32, "fpair_nr"), "BLOCK_FACE"),
            If("FACEDOF_NR < DOFS_PER_FACE", flux_computation)
        ])

        if not fplan.direct_store:
            f_body.extend([Line(), S("__syncthreads()"), Line()])

            f_body.extend_log_block(
                "store fluxes",
                [
                    #Assign("debugbuf[blockIdx.x]", "FOF_BLOCK_BASE"),
                    #Assign("debugbuf[0]", "FOF_BLOCK_BASE"),
                    #Assign("debugbuf[0]", "sizeof(face_pair)"),
                    For(
                        "unsigned word_nr = THREAD_NUM",
                        "word_nr < ALIGNED_FACE_DOFS_PER_MB*BLOCK_MB_COUNT",
                        "word_nr += COALESCING_THREAD_COUNT",
                        Block([
                            Assign(
                                "gmem_fluxes_on_faces%d[FOF_BLOCK_BASE+word_nr]"
                                % flux_nr,
                                "smem_fluxes_on_faces[%d][word_nr]" % flux_nr)
                            for flux_nr in range(len(self.fluxes))
                        ]
                              #+[If("isnan(smem_fluxes_on_faces[%d][word_nr])" % flux_nr,
                              #Block([
                              #Assign("debugbuf[blockIdx.x]", "word_nr"),
                              #])
                              #)
                              #for flux_nr in range(len(self.fluxes))]
                              ))
                ])
        if False:
            f_body.extend([
                Assign("debugbuf[blockIdx.x*96+32+BLOCK_FACE*32+threadIdx.x]",
                       "fpair_nr"),
                Assign("debugbuf[blockIdx.x*96+16]",
                       "data.header.same_facepairs_end"),
                Assign("debugbuf[blockIdx.x*96+17]",
                       "data.header.diff_facepairs_end"),
                Assign("debugbuf[blockIdx.x*96+18]",
                       "data.header.bdry_facepairs_end"),
            ])

        # finish off ----------------------------------------------------------
        cmod.append(FunctionBody(f_decl, f_body))

        if not for_benchmark and "cuda_dump_kernels" in discr.debug:
            from hedge.tools import open_unique_debug_file
            open_unique_debug_file("flux_gather", ".cu").write(str(cmod))

        #from pycuda.tools import allow_user_edit
        mod = SourceModule(
            #allow_user_edit(cmod, "kernel.cu", "the flux kernel"),
            cmod,
            keep="cuda_keep_kernels" in discr.debug)
        expr_to_texture_map = dict(
            (dep_expr,
             mod.get_texref("field%d_tex" % self.dep_to_index[dep_expr]))
            for dep_expr in self.all_deps)

        index_list_texref = mod.get_texref("tex_index_lists")
        index_list_texref.set_address(ilist_data.device_memory,
                                      ilist_data.bytes)
        index_list_texref.set_format(
            cuda.dtype_to_array_format(ilist_data.type), 1)
        index_list_texref.set_flags(cuda.TRSF_READ_AS_INTEGER)

        func = mod.get_function("apply_flux")
        block = (fplan.threads_per_face(), fplan.parallel_faces, 1)
        func.prepare(
            (2 + len(self.fluxes)) * "P",
            texrefs=expr_to_texture_map.values() + [index_list_texref])

        if "cuda_flux" in discr.debug:
            print "flux: lmem=%d smem=%d regs=%d" % (
                func.local_size_bytes, func.shared_size_bytes, func.num_regs)

        return block, func, expr_to_texture_map
Exemplo n.º 30
0
def get_boundary_flux_mod(fluxes, fvi, discr, dtype):
    from cgen import \
            FunctionDeclaration, FunctionBody, Typedef, Struct, \
            Const, Reference, Value, POD, MaybeUnused, \
            Statement, Include, Line, Block, Initializer, Assign, \
            CustomLoop, For

    from pytools import to_uncomplex_dtype, flatten

    from codepy.bpl import BoostPythonModule
    mod = BoostPythonModule()

    mod.add_to_preamble([
        Include("cstdlib"),
        Include("algorithm"),
        Line(),
        Include("boost/foreach.hpp"),
        Line(),
        Include("hedge/face_operators.hpp"),
        ])

    S = Statement
    mod.add_to_module([
        S("using namespace hedge"),
        S("using namespace pyublas"),
        Line(),
        Typedef(POD(dtype, "value_type")),
        Typedef(POD(to_uncomplex_dtype(dtype), "uncomplex_type")),
        ])

    arg_struct = Struct("arg_struct", [
        Value("numpy_array<value_type>", "flux%d_on_faces" % i)
        for i in range(len(fluxes))
        ]+[
        Value("numpy_array<value_type>", arg_name)
        for arg_name in fvi.arg_names
        ])

    mod.add_struct(arg_struct, "ArgStruct")
    mod.add_to_module([Line()])

    fdecl = FunctionDeclaration(
                Value("void", "gather_flux"),
                [
                    Const(Reference(Value("face_group<face_pair<straight_face> >" , "fg"))),
                    Reference(Value("arg_struct", "args"))
                    ])

    from pymbolic.mapper.stringifier import PREC_PRODUCT

    def gen_flux_code():
        f2cm = FluxToCodeMapper()

        result = [
                Assign("fof%d_it[loc_fof_base+i]" % flux_idx,
                    "uncomplex_type(fp.int_side.face_jacobian) * " +
                    flux_to_code(f2cm, False, flux_idx, fvi, flux.op.flux, PREC_PRODUCT))
                for flux_idx, flux in enumerate(fluxes)
                ]

        return [
            Initializer(Value("value_type", cse_name), cse_str)
            for cse_name, cse_str in f2cm.cse_name_list] + result

    fbody = Block([
        Initializer(
            Const(Value("numpy_array<value_type>::iterator", "fof%d_it" % i)),
            "args.flux%d_on_faces.begin()" % i)
        for i in range(len(fluxes))
        ]+[
        Initializer(
            Const(Value("numpy_array<value_type>::const_iterator",
                "%s_it" % arg_name)),
            "args.%s.begin()" % arg_name)
        for arg_name in fvi.arg_names
        ]+[
        Line(),
        CustomLoop("BOOST_FOREACH(const face_pair<straight_face> &fp, fg.face_pairs)", Block(
            list(flatten([
            Initializer(Value("node_number_t", "%s_ebi" % where),
                "fp.%s.el_base_index" % where),
            Initializer(Value("index_lists_t::const_iterator", "%s_idx_list" % where),
                "fg.index_list(fp.%s.face_index_list_number)" % where),
            Line(),
            ]
            for where in ["int_side", "ext_side"]
            ))+[
            Line(),
            Initializer(Value("node_number_t", "loc_fof_base"),
                "fg.face_length()*(fp.%(where)s.local_el_number*fg.face_count"
                " + fp.%(where)s.face_id)" % {"where": "int_side"}),
            Line(),
            For(
                "unsigned i = 0",
                "i < fg.face_length()",
                "++i",
                Block(
                    [
                    Initializer(MaybeUnused(
                        Value("node_number_t", "%s_idx" % where)),
                        "%(where)s_ebi + %(where)s_idx_list[i]"
                        % {"where": where})
                    for where in ["int_side", "ext_side"]
                    ]+gen_flux_code()
                    )
                )
            ]))
        ])

    mod.add_function(FunctionBody(fdecl, fbody))

    #print "----------------------------------------------------------------"
    #print mod.generate()
    #raw_input("[Enter]")

    return mod.compile(get_flux_toolchain(discr, fluxes))