示例#1
0
    def _arg_names_and_decls(self, codegen_state):
        implemented_data_info = codegen_state.implemented_data_info
        arg_names = [iai.name for iai in implemented_data_info]

        arg_decls = [
            self.idi_to_cgen_declarator(codegen_state.kernel, idi)
            for idi in implemented_data_info
        ]

        # {{{ occa compatibility hackery

        from cgen import Value
        if self.target.occa_mode:
            from cgen import ArrayOf, Const
            from cgen.ispc import ISPCUniform

            arg_decls = [
                Const(ISPCUniform(ArrayOf(Value("int", "loopy_dims")))),
                Const(ISPCUniform(Value("int", "o1"))),
                Const(ISPCUniform(Value("int", "o2"))),
                Const(ISPCUniform(Value("int", "o3"))),
            ] + arg_decls
            arg_names = ["loopy_dims", "o1", "o2", "o3"] + arg_names

        # }}}

        return arg_names, arg_decls
示例#2
0
    def make_codepy_module(self, toolchain, dtype):
        from codepy.libraries import add_codepy
        toolchain = toolchain.copy()
        add_codepy(toolchain)

        from cgen import (Value, Include, Statement,
                Typedef, FunctionBody, FunctionDeclaration, Block, Const,
                Line, POD, Initializer, CustomLoop)
        S = Statement

        from codepy.bpl import BoostPythonModule
        mod = BoostPythonModule()

        mod.add_to_preamble([
            Include("vector"),
            Include("algorithm"),
            Include("hedge/base.hpp"),
            Include("hedge/volume_operators.hpp"),
            Include("boost/foreach.hpp"),
            Include("boost/numeric/ublas/io.hpp"),
            ]+self.get_cpu_extra_includes())

        mod.add_to_module([
            S("namespace ublas = boost::numeric::ublas"),
            S("using namespace hedge"),
            S("using namespace pyublas"),
            Line(),
            Typedef(POD(dtype, "value_type")),
            Line(),
            ])

        mod.add_function(FunctionBody(
            FunctionDeclaration(Value("void", "process_elements"), [
                Const(Value("uniform_element_ranges", "ers")),
                Const(Value("numpy_vector<value_type>", "field")),
                Value("numpy_vector<value_type>", "result"),
                ]+self.get_cpu_extra_parameter_declarators()),
            Block([
                Typedef(Value("numpy_vector<value_type>::iterator",
                    "it_type")),
                Typedef(Value("numpy_vector<value_type>::const_iterator",
                    "cit_type")),
                Line(),
                Initializer(Value("it_type", "result_it"),
                    "result.begin()"),
                Initializer(Value("cit_type", "field_it"),
                    "field.begin()"),
                Line() ]+self.get_cpu_extra_preamble()+[ Line(),
                CustomLoop(
                    "BOOST_FOREACH(const element_range er, ers)",
                    Block(self.get_cpu_per_element_code())
                    )
                ])))

        #print mod.generate()
        #toolchain = toolchain.copy()
        #toolchain.enable_debugging
        return mod.compile(toolchain)
示例#3
0
文件: __init__.py 项目: shigh/loopy
    def emit_initializer(self, codegen_state, dtype, name, val_str, is_const):
        decl = POD(self, dtype, name)

        from cgen import Initializer, Const

        if is_const:
            decl = Const(decl)

        return Initializer(decl, val_str)
示例#4
0
文件: __init__.py 项目: shigh/loopy
    def get_global_arg_decl(self, name, shape, dtype, is_written):
        from cgen import RestrictPointer, Const

        arg_decl = RestrictPointer(POD(self, dtype, name))

        if not is_written:
            arg_decl = Const(arg_decl)

        return arg_decl
示例#5
0
    def get_array_arg_decl(self, name, mem_address_space, shape, dtype, is_written):
        from cgen import RestrictPointer, Const

        arg_decl = RestrictPointer(POD(self, dtype, name))

        if not is_written:
            arg_decl = Const(arg_decl)

        return arg_decl
示例#6
0
    def get_constant_arg_decl(self, name, shape, dtype, is_written):
        from loopy.target.c import POD  # uses the correct complex type
        from cgen import RestrictPointer, Const

        arg_decl = RestrictPointer(POD(self, dtype, name))

        if not is_written:
            arg_decl = Const(arg_decl)

        return arg_decl
示例#7
0
文件: cuda.py 项目: dokempf/loopy
    def get_global_arg_decl(self, name, shape, dtype, is_written):
        from loopy.target.c import POD  # uses the correct complex type
        from cgen import Const
        from cgen.cuda import CudaRestrictPointer

        arg_decl = CudaRestrictPointer(POD(self, dtype, name))

        if not is_written:
            arg_decl = Const(arg_decl)

        return arg_decl
示例#8
0
    def generate_top_of_body(self, codegen_state):
        from loopy.kernel.data import ImageArg
        if any(isinstance(arg, ImageArg) for arg in codegen_state.kernel.args):
            from cgen import Value, Const, Initializer
            return [
                    Initializer(Const(Value("sampler_t", "loopy_sampler")),
                        "CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP "
                        "| CLK_FILTER_NEAREST")
                    ]

        return []
示例#9
0
    def get_global_arg_decl(self, name, shape, dtype, is_written):
        from loopy.target.c import POD  # uses the correct complex type
        from cgen import Const
        from cgen.ispc import ISPCUniformPointer, ISPCUniform

        arg_decl = ISPCUniformPointer(POD(self, dtype, name))

        if not is_written:
            arg_decl = Const(arg_decl)

        arg_decl = ISPCUniform(arg_decl)

        return arg_decl
示例#10
0
文件: __init__.py 项目: shigh/loopy
    def get_temporary_decl(self, knl, schedule_index, temp_var, decl_info):
        temp_var_decl = POD(self, decl_info.dtype, decl_info.name)

        if temp_var.read_only:
            from cgen import Const
            temp_var_decl = Const(temp_var_decl)

        if decl_info.shape:
            from cgen import ArrayOf
            temp_var_decl = ArrayOf(
                temp_var_decl, " * ".join(str(s) for s in decl_info.shape))

        return temp_var_decl
示例#11
0
文件: __init__.py 项目: shigh/loopy
    def get_value_arg_decl(self, name, shape, dtype, is_written):
        assert shape == ()

        result = POD(self, dtype, name)
        if not is_written:
            from cgen import Const
            result = Const(result)

        if self.target.fortran_abi:
            from cgen import Pointer
            result = Pointer(result)

        return result
示例#12
0
def make_greet_mod(greeting):
    from cgen import FunctionBody, FunctionDeclaration, Block, \
            Const, Pointer, Value, Statement
    from codepy.bpl import BoostPythonModule

    mod = BoostPythonModule()

    mod.add_function(
        FunctionBody(
            FunctionDeclaration(Const(Pointer(Value("char", "greet"))), []),
            Block([Statement('return "%s"' % greeting)])))

    from codepy.toolchain import guess_toolchain
    return mod.compile(guess_toolchain(), wait_on_error=True)
示例#13
0
    def get_temporary_decl(self, codegen_state, schedule_index, temp_var, decl_info):
        temp_var_decl = POD(self, decl_info.dtype, decl_info.name)

        if temp_var.read_only:
            from cgen import Const
            temp_var_decl = Const(temp_var_decl)

        if decl_info.shape:
            from cgen import ArrayOf
            ecm = self.get_expression_to_code_mapper(codegen_state)
            temp_var_decl = ArrayOf(temp_var_decl,
                    ecm(p.flattened_product(decl_info.shape),
                        prec=PREC_NONE, type_context="i"))

        return temp_var_decl
示例#14
0
    def get_value_arg_decl(self, name, shape, dtype, is_written):
        result = super().get_value_arg_decl(name, shape, dtype, is_written)

        from cgen import Reference, Const
        was_const = isinstance(result, Const)

        if was_const:
            result = result.subdecl

        if self.target.occa_mode:
            result = Reference(result)

        if was_const:
            result = Const(result)

        from cgen.ispc import ISPCUniform
        return ISPCUniform(result)
示例#15
0
文件: __init__.py 项目: shigh/loopy
 def idi_to_cgen_declarator(self, kernel, idi):
     from loopy.kernel.data import InameArg
     if (idi.offset_for_name is not None
             or idi.stride_for_name_and_axis is not None):
         assert not idi.is_written
         from cgen import Const
         return Const(POD(self, idi.dtype, idi.name))
     elif issubclass(idi.arg_class, InameArg):
         return InameArg(idi.name, idi.dtype).get_arg_decl(self)
     else:
         name = idi.base_name or idi.name
         var_descr = kernel.get_var_descriptor(name)
         from loopy.kernel.data import ArrayBase
         if isinstance(var_descr, ArrayBase):
             return var_descr.get_arg_decl(self, idi.name[len(name):],
                                           idi.shape, idi.dtype,
                                           idi.is_written)
         else:
             return var_descr.get_arg_decl(self)
示例#16
0
a_buf = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=a)
b_buf = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=b)
c_buf = cl.Buffer(ctx, mf.WRITE_ONLY, b.nbytes)

from cgen import FunctionBody, \
        FunctionDeclaration, Typedef, POD, Value, \
        Pointer, Module, Block, Initializer, Assign, Const
from cgen.opencl import CLKernel, CLGlobal, \
        CLRequiredWorkGroupSize

mod = Module([
    FunctionBody(
        CLKernel(CLRequiredWorkGroupSize((local_size,),
            FunctionDeclaration(
            Value("void", "add"),
            arg_decls=[CLGlobal(Pointer(Const(POD(dtype, name))))
                for name in ["tgt", "op1", "op2"]]))),
        Block([
            Initializer(POD(numpy.int32, "idx"),
                "get_local_id(0) + %d * get_group_id(0)"
                % (local_size*thread_strides))
            ]+[
            Assign(
                "tgt[idx+%d]" % (o*local_size),
                "op1[idx+%d] + op2[idx+%d]" % (
                    o*local_size,
                    o*local_size))
            for o in range(thread_strides)]))])

knl = cl.Program(ctx, str(mod)).build().add
示例#17
0
    def get_kernel(self, diff_op, elgroup, for_benchmark=False):
        from cgen import \
                Pointer, POD, Value, ArrayOf, Const, \
                Module, FunctionDeclaration, FunctionBody, Block, \
                Comment, Line, Define, Include, \
                Initializer, If, For, Statement, Assign

        from pycuda.tools import dtype_to_ctype
        from cgen.cuda import CudaShared, CudaGlobal

        discr = self.discr
        d = discr.dimensions
        dims = range(d)
        plan = self.plan
        given = plan.given

        elgroup, = discr.element_groups
        float_type = given.float_type

        f_decl = CudaGlobal(FunctionDeclaration(Value("void", "apply_diff_mat_smem"),
            [Pointer(POD(float_type, "debugbuf")), Pointer(POD(float_type, "field")), ]
            + [Pointer(POD(float_type, "drst%d_global" % i)) for i in dims]
            ))

        par = plan.parallelism

        cmod = Module([
                Include("pycuda-helpers.hpp"),
                ])

        if float_type == numpy.float64:
            cmod.append(Value("texture<fp_tex_double, 1, cudaReadModeElementType>",
                    "diff_rst_mat_tex"))
        elif float_type == numpy.float32:
            rst_channels = given.devdata.make_valid_tex_channel_count(d)
            cmod.append(Value("texture<float%d, 1, cudaReadModeElementType>"
                    % rst_channels, "diff_rst_mat_tex"))
        else:
            raise ValueError("unsupported float type: %s" % float_type)

        # only preimage size variation is supported here
        assert plan.image_dofs_per_el == given.dofs_per_el()
        assert plan.aligned_image_dofs_per_microblock == given.microblock.aligned_floats

        # FIXME: aligned_image_dofs_per_microblock must be divisible
        # by this, therefore hardcoding for now.
        chunk_size = 16

        cmod.extend([
                Line(),
                Define("DIMENSIONS", discr.dimensions),

                Define("IMAGE_DOFS_PER_EL", plan.image_dofs_per_el),
                Define("PREIMAGE_DOFS_PER_EL", plan.preimage_dofs_per_el),
                Define("ALIGNED_IMAGE_DOFS_PER_MB", plan.aligned_image_dofs_per_microblock),
                Define("ALIGNED_PREIMAGE_DOFS_PER_MB", plan.aligned_preimage_dofs_per_microblock),
                Define("ELS_PER_MB", given.microblock.elements),
                Define("IMAGE_DOFS_PER_MB", "(IMAGE_DOFS_PER_EL*ELS_PER_MB)"),
                Line(),
                Define("CHUNK_SIZE", chunk_size),
                Define("CHUNK_DOF", "threadIdx.x"),
                Define("PAR_MB_NR", "threadIdx.y"),
                Define("CHUNK_NR", "threadIdx.z"),
                Define("IMAGE_MB_DOF", "(CHUNK_NR*CHUNK_SIZE+CHUNK_DOF)"),
                Define("IMAGE_EL_DOF", "(IMAGE_MB_DOF - mb_el*IMAGE_DOFS_PER_EL)"),
                Line(),
                Define("MACROBLOCK_NR", "blockIdx.x"),
                Line(),
                Define("PAR_MB_COUNT", par.parallel),
                Define("INLINE_MB_COUNT", par.inline),
                Define("SEQ_MB_COUNT", par.serial),
                Line(),
                Define("GLOBAL_MB_NR_BASE",
                    "(MACROBLOCK_NR*PAR_MB_COUNT*INLINE_MB_COUNT*SEQ_MB_COUNT)"),
                Define("GLOBAL_MB_NR",
                    "(GLOBAL_MB_NR_BASE"
                    "+ (seq_mb_number*PAR_MB_COUNT + PAR_MB_NR)*INLINE_MB_COUNT)"),
                Define("GLOBAL_MB_IMAGE_DOF_BASE", "(GLOBAL_MB_NR*ALIGNED_IMAGE_DOFS_PER_MB)"),
                Define("GLOBAL_MB_PREIMAGE_DOF_BASE", "(GLOBAL_MB_NR*ALIGNED_PREIMAGE_DOFS_PER_MB)"),
                Line(),
                CudaShared(
                    ArrayOf(
                        ArrayOf(
                            ArrayOf(
                                POD(float_type, "smem_field"),
                                "PAR_MB_COUNT"),
                            "INLINE_MB_COUNT"),
                        "ALIGNED_PREIMAGE_DOFS_PER_MB")),
                Line(),
                ])

        S = Statement
        f_body = Block([
            Initializer(Const(POD(numpy.uint16, "mb_el")),
                "IMAGE_MB_DOF / IMAGE_DOFS_PER_EL"),
            Line(),
            ])

        # ---------------------------------------------------------------------
        def get_load_code():
            mb_img_dofs = plan.aligned_image_dofs_per_microblock
            mb_preimg_dofs = plan.aligned_preimage_dofs_per_microblock
            preimg_dofs_over_dofs = (mb_preimg_dofs+mb_img_dofs-1) // mb_img_dofs

            load_code = []
            store_code = []

            var_num = 0
            for load_block in range(preimg_dofs_over_dofs):
                for inl in range(par.inline):
                    # load and store are split for better pipelining
                    # compiler can't figure that out because of branch

                    var = "tmp%d" % var_num
                    var_num += 1
                    load_code.append(POD(float_type, var))

                    block_addr = "%d * ALIGNED_IMAGE_DOFS_PER_MB + IMAGE_MB_DOF" % load_block
                    load_instr = Assign(var,
                        "field[GLOBAL_MB_PREIMAGE_DOF_BASE"
                        " + %d*ALIGNED_PREIMAGE_DOFS_PER_MB"
                        " + %s]" % (inl, block_addr))
                    store_instr = Assign(
                            "smem_field[PAR_MB_NR][%d][%s]" % (inl, block_addr),
                            var
                            )
                    if (load_block+1)*mb_img_dofs >= mb_preimg_dofs:
                        cond = "%s < ALIGNED_PREIMAGE_DOFS_PER_MB" % block_addr
                        load_instr = If(cond, load_instr)
                        store_instr = If(cond, store_instr)

                    load_code.append(load_instr)
                    store_code.append(store_instr)
            return Block(load_code + [Line()] + store_code)

        def get_scalar_diff_code():
            code = []
            for inl in range(par.inline):
                for axis in dims:
                    code.append(
                        Initializer(POD(float_type, "d%drst%d" % (inl, axis)), 0))

            code.append(Line())

            tex_channels = ["x", "y", "z", "w"]

            store_code = Block()
            for inl in range(par.inline):
                for rst_axis in dims:
                    store_code.append(Assign(
                        "drst%d_global[GLOBAL_MB_IMAGE_DOF_BASE + "
                        "%d*ALIGNED_IMAGE_DOFS_PER_MB + IMAGE_MB_DOF]"
                        % (rst_axis, inl),
                        "d%drst%d" % (inl, rst_axis)
                        ))

            from hedge.backends.cuda.tools import unroll
            code.extend([
                Comment("everybody needs to be done with the old data"),
                S("__syncthreads()"),
                Line(),
                get_load_code(),
                Line(),
                Comment("all the new data must be loaded"),
                S("__syncthreads()"),
                Line(),
                ])

            if float_type == numpy.float32:
                code.append(Value("float%d" % rst_channels, "dmat_entries"))

            code.extend([
                POD(float_type, "field_value%d" % inl)
                for inl in range(par.inline)
                ]+[Line()])

            def unroll_body(j):
                result = [
                    Assign("field_value%d" % inl,
                        "smem_field[PAR_MB_NR][%d][mb_el*PREIMAGE_DOFS_PER_EL+%s]" % (inl, j))
                    for inl in range(par.inline)
                    ]

                if float_type == numpy.float32:
                    result.append(Assign("dmat_entries",
                        "tex1Dfetch(diff_rst_mat_tex, IMAGE_EL_DOF + %s*IMAGE_DOFS_PER_EL)" % j))
                    result.extend(
                        S("d%drst%d += dmat_entries.%s * field_value%d"
                            % (inl, axis, tex_channels[axis], inl))
                        for inl in range(par.inline)
                        for axis in dims)
                elif float_type == numpy.float64:
                    result.extend(
                        S("d%(inl)drst%(axis)d += "
                            "fp_tex1Dfetch(diff_rst_mat_tex, %(axis)d "
                            "+ DIMENSIONS*(IMAGE_EL_DOF + %(j)d*IMAGE_DOFS_PER_EL))"
                            "* field_value%(inl)d" % {
                            "inl": inl,
                            "axis": axis,
                            "j": j
                            })
                        for inl in range(par.inline)
                        for axis in dims)
                else:
                    assert False

                return result

            code.append(If("IMAGE_MB_DOF < IMAGE_DOFS_PER_MB", Block(unroll(unroll_body,
                    total_number=plan.preimage_dofs_per_el)
                    +[store_code])))

            return code

        f_body.extend([
            For("unsigned short seq_mb_number = 0",
                "seq_mb_number < SEQ_MB_COUNT",
                "++seq_mb_number",
                Block(get_scalar_diff_code())
                )
            ])

        # finish off ----------------------------------------------------------
        cmod.append(FunctionBody(f_decl, f_body))

        if not for_benchmark and "cuda_dump_kernels" in discr.debug:
            from hedge.tools import open_unique_debug_file
            open_unique_debug_file("diff", ".cu").write(str(cmod))

        mod = SourceModule(cmod,
                keep="cuda_keep_kernels" in discr.debug,
                #options=["--maxrregcount=16"]
                )

        func = mod.get_function("apply_diff_mat_smem")

        if "cuda_diff" in discr.debug:
            print "diff: lmem=%d smem=%d regs=%d" % (
                    func.local_size_bytes,
                    func.shared_size_bytes,
                    func.registers)

        diff_rst_mat_texref = mod.get_texref("diff_rst_mat_tex")
        gpu_diffmats = self.gpu_diffmats(diff_op, elgroup)

        if given.float_type == numpy.float32:
            gpu_diffmats.bind_to_texref_ext(diff_rst_mat_texref, rst_channels)
        elif given.float_type == numpy.float64:
            gpu_diffmats.bind_to_texref_ext(diff_rst_mat_texref,
                    allow_double_hack=True)
        else:
            assert False

        assert given.microblock.aligned_floats % chunk_size == 0
        block = (
                chunk_size,
                plan.parallelism.parallel,
                given.microblock.aligned_floats//chunk_size)

        func.prepare(
                ["PP"] + discr.dimensions*["P"],
                texrefs=[diff_rst_mat_texref])

        return block, func
示例#18
0
from cgen import FunctionBody, \
        FunctionDeclaration, POD, Value, \
        Pointer, Module, Block, Initializer, Assign, Const
from cgen.opencl import CLKernel, CLGlobal, \
        CLRequiredWorkGroupSize

mod = Module([
    FunctionBody(
        CLKernel(
            CLRequiredWorkGroupSize(
                (local_size, ),
                FunctionDeclaration(Value("void", "add"),
                                    arg_decls=[
                                        CLGlobal(
                                            Pointer(Const(POD(dtype, name))))
                                        for name in ["tgt", "op1", "op2"]
                                    ]))),
        Block([
            Initializer(
                POD(numpy.int32,
                    "idx"), "get_local_id(0) + %d * get_group_id(0)" %
                (local_size * thread_strides))
        ] + [
            Assign(
                "tgt[idx+%d]" % (o * local_size), "op1[idx+%d] + op2[idx+%d]" %
                (o * local_size, o * local_size))
            for o in range(thread_strides)
        ]))
])
示例#19
0
    def make_lift(self, fgroup, with_scale, dtype):
        discr = self.discr
        from cgen import (FunctionDeclaration, FunctionBody, Typedef, Const,
                          Reference, Value, POD, Statement, Include, Line,
                          Block, Initializer, Assign, For, If, Define)

        from pytools import to_uncomplex_dtype

        from codepy.bpl import BoostPythonModule
        mod = BoostPythonModule()

        S = Statement
        mod.add_to_preamble([
            Include("hedge/face_operators.hpp"),
            Include("hedge/volume_operators.hpp"),
            Include("boost/foreach.hpp"),
        ])

        mod.add_to_module([
            S("namespace ublas = boost::numeric::ublas"),
            S("using namespace hedge"),
            S("using namespace pyublas"),
            Line(),
            Define("DOFS_PER_EL", fgroup.ldis_loc.node_count()),
            Define("FACES_PER_EL", fgroup.ldis_loc.face_count()),
            Define("DIMENSIONS", discr.dimensions),
            Line(),
            Typedef(POD(dtype, "value_type")),
            Typedef(POD(to_uncomplex_dtype(dtype), "uncomplex_type")),
        ])

        def if_(cond, result, else_=None):
            if cond:
                return [result]
            else:
                if else_ is None:
                    return []
                else:
                    return [else_]

        fdecl = FunctionDeclaration(Value("void", "lift"), [
            Const(
                Reference(Value("face_group<face_pair<straight_face> >",
                                "fg"))),
            Value("ublas::matrix<uncomplex_type>", "matrix"),
            Value("numpy_array<value_type>", "field"),
            Value("numpy_array<value_type>", "result")
        ] + if_(
            with_scale,
            Const(
                Reference(Value("numpy_array<double>",
                                "elwise_post_scaling")))))

        def make_it(name, is_const=True, tpname="value_type"):
            if is_const:
                const = "const_"
            else:
                const = ""

            return Initializer(
                Value("numpy_array<%s>::%siterator" % (tpname, const),
                      name + "_it"), "%s.begin()" % name)

        fbody = Block([
            make_it("field"),
            make_it("result", is_const=False),
        ] + if_(with_scale, make_it("elwise_post_scaling", tpname="double")) + [
            Line(),
            For(
                "unsigned fg_el_nr = 0", "fg_el_nr < fg.element_count()",
                "++fg_el_nr",
                Block([
                    Initializer(Value("node_number_t", "dest_el_base"),
                                "fg.local_el_write_base[fg_el_nr]"),
                    Initializer(Value("node_number_t", "src_el_base"),
                                "FACES_PER_EL*fg.face_length()*fg_el_nr"),
                    Line(),
                    For(
                        "unsigned i = 0", "i < DOFS_PER_EL", "++i",
                        Block([
                            Initializer(Value("value_type", "tmp"), 0),
                            Line(),
                            For(
                                "unsigned j = 0",
                                "j < FACES_PER_EL*fg.face_length()", "++j",
                                S("tmp += matrix(i, j)*field_it[src_el_base+j]"
                                  )),
                            Line(),
                        ] + if_(
                            with_scale,
                            Assign(
                                "result_it[dest_el_base+i]",
                                "tmp * value_type(*elwise_post_scaling_it)"),
                            Assign("result_it[dest_el_base+i]", "tmp")))),
                ] + if_(with_scale, S("elwise_post_scaling_it++"))))
        ])

        mod.add_function(FunctionBody(fdecl, fbody))

        #print "----------------------------------------------------------------"
        #print FunctionBody(fdecl, fbody)
        #raw_input()

        return mod.compile(self.discr.toolchain).lift
示例#20
0
def get_boundary_flux_mod(fluxes, fvi, discr, dtype):
    from cgen import \
            FunctionDeclaration, FunctionBody, Typedef, Struct, \
            Const, Reference, Value, POD, MaybeUnused, \
            Statement, Include, Line, Block, Initializer, Assign, \
            CustomLoop, For

    from pytools import to_uncomplex_dtype, flatten

    from codepy.bpl import BoostPythonModule
    mod = BoostPythonModule()

    mod.add_to_preamble([
        Include("cstdlib"),
        Include("algorithm"),
        Line(),
        Include("boost/foreach.hpp"),
        Line(),
        Include("hedge/face_operators.hpp"),
        ])

    S = Statement
    mod.add_to_module([
        S("using namespace hedge"),
        S("using namespace pyublas"),
        Line(),
        Typedef(POD(dtype, "value_type")),
        Typedef(POD(to_uncomplex_dtype(dtype), "uncomplex_type")),
        ])

    arg_struct = Struct("arg_struct", [
        Value("numpy_array<value_type>", "flux%d_on_faces" % i)
        for i in range(len(fluxes))
        ]+[
        Value("numpy_array<value_type>", arg_name)
        for arg_name in fvi.arg_names
        ])

    mod.add_struct(arg_struct, "ArgStruct")
    mod.add_to_module([Line()])

    fdecl = FunctionDeclaration(
                Value("void", "gather_flux"),
                [
                    Const(Reference(Value("face_group<face_pair<straight_face> >" , "fg"))),
                    Reference(Value("arg_struct", "args"))
                    ])

    from pymbolic.mapper.stringifier import PREC_PRODUCT

    def gen_flux_code():
        f2cm = FluxToCodeMapper()

        result = [
                Assign("fof%d_it[loc_fof_base+i]" % flux_idx,
                    "uncomplex_type(fp.int_side.face_jacobian) * " +
                    flux_to_code(f2cm, False, flux_idx, fvi, flux.op.flux, PREC_PRODUCT))
                for flux_idx, flux in enumerate(fluxes)
                ]

        return [
            Initializer(Value("value_type", cse_name), cse_str)
            for cse_name, cse_str in f2cm.cse_name_list] + result

    fbody = Block([
        Initializer(
            Const(Value("numpy_array<value_type>::iterator", "fof%d_it" % i)),
            "args.flux%d_on_faces.begin()" % i)
        for i in range(len(fluxes))
        ]+[
        Initializer(
            Const(Value("numpy_array<value_type>::const_iterator",
                "%s_it" % arg_name)),
            "args.%s.begin()" % arg_name)
        for arg_name in fvi.arg_names
        ]+[
        Line(),
        CustomLoop("BOOST_FOREACH(const face_pair<straight_face> &fp, fg.face_pairs)", Block(
            list(flatten([
            Initializer(Value("node_number_t", "%s_ebi" % where),
                "fp.%s.el_base_index" % where),
            Initializer(Value("index_lists_t::const_iterator", "%s_idx_list" % where),
                "fg.index_list(fp.%s.face_index_list_number)" % where),
            Line(),
            ]
            for where in ["int_side", "ext_side"]
            ))+[
            Line(),
            Initializer(Value("node_number_t", "loc_fof_base"),
                "fg.face_length()*(fp.%(where)s.local_el_number*fg.face_count"
                " + fp.%(where)s.face_id)" % {"where": "int_side"}),
            Line(),
            For(
                "unsigned i = 0",
                "i < fg.face_length()",
                "++i",
                Block(
                    [
                    Initializer(MaybeUnused(
                        Value("node_number_t", "%s_idx" % where)),
                        "%(where)s_ebi + %(where)s_idx_list[i]"
                        % {"where": where})
                    for where in ["int_side", "ext_side"]
                    ]+gen_flux_code()
                    )
                )
            ]))
        ])

    mod.add_function(FunctionBody(fdecl, fbody))

    #print "----------------------------------------------------------------"
    #print mod.generate()
    #raw_input("[Enter]")

    return mod.compile(get_flux_toolchain(discr, fluxes))
示例#21
0
    def make_cuda_kernel(self, discr, dtype, eg):
        given = discr.given
        ldis = eg.local_discretization

        microblocks_per_block = 1

        from cgen.cuda import CudaGlobal

        from cgen import (Module, Value, Include,
                Typedef, FunctionBody, FunctionDeclaration, Const,
                Line, POD, LiteralBlock,
                Define, Pointer)

        cmod = Module([
            Include("pycuda-helpers.hpp"),
            Line(),
            Typedef(POD(dtype, "value_type")),
            Line(),
            Define("DOFS_PER_EL", given.dofs_per_el()),
            Define("ALIGNED_DOFS_PER_MB", given.microblock.aligned_floats),
            Define("VERTICES_PER_EL", ldis.vertex_count()),
            Define("ELS_PER_MB", given.microblock.elements),
            Define("MBS_PER_BLOCK", microblocks_per_block),
            Line(),
            Define("DOF_IN_MB_IDX", "threadIdx.x"),
            Define("DOF_IN_EL_IDX", "(DOF_IN_MB_IDX-el_idx_in_mb*DOFS_PER_EL)"),
            Define("MB_IN_BLOCK_IDX", "threadIdx.y"),
            Define("BLOCK_IDX", "blockIdx.x"),
            Define("MB_NUMBER", "(BLOCK_IDX * MBS_PER_BLOCK + MB_IN_BLOCK_IDX)"),
            Define("BLOCK_DATA", "whole_block[MB_IN_BLOCK_IDX]")]
            + self.get_cuda_extra_preamble(discr, dtype, eg)
            + [FunctionBody(
            CudaGlobal(FunctionDeclaration(
                    Value("void", "elwise_kernel"), [
                    Pointer(Const(POD(dtype, "field"))),
                    Pointer(POD(dtype, "result")),
                    POD(numpy.uint32, "mb_count"),
                    ])),
                LiteralBlock("""
                int el_idx_in_mb = DOF_IN_MB_IDX / DOFS_PER_EL;

                if (MB_NUMBER >= mb_count)
                  return;

                int idx =  MB_NUMBER * ALIGNED_DOFS_PER_MB + DOF_IN_MB_IDX;
                int element_base_idx = ALIGNED_DOFS_PER_MB * MB_IN_BLOCK_IDX +
                    (DOF_IN_MB_IDX / DOFS_PER_EL) * DOFS_PER_EL;
                int dof_in_element = DOF_IN_MB_IDX-el_idx_in_mb*DOFS_PER_EL;

                __shared__ value_type whole_block[MBS_PER_BLOCK][ALIGNED_DOFS_PER_MB+1];
                int idx_in_block = ALIGNED_DOFS_PER_MB * MB_IN_BLOCK_IDX + DOF_IN_MB_IDX;
                BLOCK_DATA[idx_in_block] = field[idx];

                __syncthreads();

                %s

                result[idx] = node_result;
                """ % self.get_cuda_code(discr, dtype, eg)))
                ])


        if False:
            for i, l in enumerate(str(cmod).split("\n")):
                print i+1, l
            raw_input()

        from pycuda.compiler import SourceModule
        mod = SourceModule(
                cmod,
                keep="cuda_keep_kernels" in discr.debug,
                )
        func = mod.get_function("elwise_kernel")
        func.prepare(
            "PPI", block=(
                given.microblock.aligned_floats,
                microblocks_per_block, 1))

        mb_count = len(discr.blocks) * discr.given.microblocks_per_block
        grid_dim = (mb_count + microblocks_per_block - 1) \
                // microblocks_per_block

        from pytools import Record
        class KernelInfo(Record):
            pass

        return KernelInfo(
                func=func,
                grid_dim=grid_dim,
                mb_count=mb_count)
示例#22
0
    def make_diff(self, elgroup, dtype, shape):
        """
        :param shape: If non-square, the resulting code takes two element_ranges
          arguments and supports non-square matrices.
        """
        from hedge._internal import UniformElementRanges
        assert isinstance(elgroup.ranges, UniformElementRanges)

        ldis = elgroup.local_discretization
        discr = self.discr
        from cgen import (
                FunctionDeclaration, FunctionBody, Typedef,
                Const, Reference, Value, POD,
                Statement, Include, Line, Block, Initializer, Assign,
                For, If,
                Define)

        from pytools import to_uncomplex_dtype

        from codepy.bpl import BoostPythonModule
        mod = BoostPythonModule()

        # {{{ preamble
        S = Statement
        mod.add_to_preamble([
            Include("hedge/volume_operators.hpp"),
            Include("boost/foreach.hpp"),
            ])

        mod.add_to_module([
            S("namespace ublas = boost::numeric::ublas"),
            S("using namespace hedge"),
            S("using namespace pyublas"),
            Line(),
            Define("ROW_COUNT", shape[0]),
            Define("COL_COUNT", shape[1]),
            Define("DIMENSIONS", discr.dimensions),
            Line(),
            Typedef(POD(dtype, "value_type")),
            Typedef(POD(to_uncomplex_dtype(dtype), "uncomplex_type")),
            ])

        fdecl = FunctionDeclaration(
                    Value("void", "diff"),
                    [
                    Const(Reference(Value("uniform_element_ranges", "from_ers"))),
                    Const(Reference(Value("uniform_element_ranges", "to_ers"))),
                    Value("numpy_array<value_type>", "field")
                    ]+[
                    Value("ublas::matrix<uncomplex_type>", "diffmat_rst%d" % rst)
                    for rst in range(discr.dimensions)
                    ]+[
                    Value("numpy_array<value_type>", "result%d" % i)
                    for i in range(discr.dimensions)
                    ]
                    )
        # }}}

        # {{{ set-up
        def make_it(name, is_const=True, tpname="value_type"):
            if is_const:
                const = "const_"
            else:
                const = ""

            return Initializer(
                Value("numpy_array<%s>::%siterator" % (tpname, const), name+"_it"),
                "%s.begin()" % name)

        fbody = Block([
            If("ROW_COUNT != diffmat_rst%d.size1()" % i,
                S('throw(std::runtime_error("unexpected matrix size"))'))
            for i in range(discr.dimensions)
            ] + [
            If("COL_COUNT != diffmat_rst%d.size2()" % i,
                S('throw(std::runtime_error("unexpected matrix size"))'))
            for i in range(discr.dimensions) 
            ]+[
            If("ROW_COUNT != to_ers.el_size()",
                S('throw(std::runtime_error("unsupported image element size"))')),
            If("COL_COUNT != from_ers.el_size()",
                S('throw(std::runtime_error("unsupported preimage element size"))')),
            If("from_ers.size() != to_ers.size()",
                S('throw(std::runtime_error("image and preimage element groups '
                    'do nothave the same element count"))')),
            Line(),
            make_it("field"),
            ]+[
            make_it("result%d" % i, is_const=False)
            for i in range(discr.dimensions)
            ]+[
            Line(),
        # }}}

        # {{{ computation
            For("element_number_t eg_el_nr = 0",
                "eg_el_nr < to_ers.size()",
                "++eg_el_nr",
                Block([
                    Initializer(
                        Value("node_number_t", "from_el_base"),
                        "from_ers.start() + eg_el_nr*COL_COUNT"),
                    Initializer(
                        Value("node_number_t", "to_el_base"),
                        "to_ers.start() + eg_el_nr*ROW_COUNT"),
                    Line(),
                    For("unsigned i = 0",
                        "i < ROW_COUNT",
                        "++i",
                        Block([
                            Initializer(Value("value_type", "drst_%d" % rst), 0)
                            for rst in range(discr.dimensions)
                            ]+[
                            Line(),
                            ]+[
                            For("unsigned j = 0",
                                "j < COL_COUNT",
                                "++j",
                                Block([
                                    S("drst_%(rst)d += "
                                        "diffmat_rst%(rst)d(i, j)*field_it[from_el_base+j]"
                                        % {"rst":rst})
                                    for rst in range(discr.dimensions)
                                    ])
                                ),
                            Line(),
                            ]+[
                            Assign("result%d_it[to_el_base+i]" % rst,
                                "drst_%d" % rst)
                            for rst in range(discr.dimensions)
                            ])
                        )
                    ])
                )
            ])
        # }}}

        # {{{ compilation
        mod.add_function(FunctionBody(fdecl, fbody))

        #print "----------------------------------------------------------------"
        #print mod.generate()
        #raw_input()

        compiled_func = mod.compile(self.discr.toolchain).diff

        if self.discr.instrumented:
            from hedge.tools import time_count_flop

            compiled_func = time_count_flop(compiled_func,
                    discr.diff_timer, discr.diff_counter,
                    discr.diff_flop_counter,
                    flops=discr.dimensions*(
                        2 # mul+add
                        * ldis.node_count() * len(elgroup.members)
                        * ldis.node_count()
                        +
                        2 * discr.dimensions
                        * len(elgroup.members) * ldis.node_count()),
                    increment=discr.dimensions)

        return compiled_func