Exemplo n.º 1
0
 def get_direct_tex_mat_mul_code():
     return (
         [POD(float_type, "fof%d" % inl) for inl in range(par.inline)] +
         [POD(float_type, "lm"), Line()] + unroll(
             lambda j: [
                 Assign(
                     "fof%d" % inl,
                     "fp_tex1Dfetch(in_vector_tex, "
                     "GLOBAL_MB_PREIMG_DOF_BASE"
                     " + %(inl)d * ALIGNED_PREIMAGE_DOFS_PER_MB"
                     " + mb_el*PREIMAGE_DOFS_PER_EL+%(j)s)" % {
                         "j": j,
                         "inl": inl,
                         "row": "SEGMENT_DOF"
                     },
                 ) for inl in range(par.inline)
             ] + [
                 Assign(
                     "lm",
                     "smem_matrix["
                     "%(row)s*MATRIX_COLUMNS + %(j)s]" % {
                         "j": j,
                         "row": "SEGMENT_DOF"
                     },
                 )
             ] + [
                 S("result%(inl)d += fof%(inl)d*lm" % {"inl": inl})
                 for inl in range(par.inline)
             ],
             total_number=self.plan.preimage_dofs_per_el,
             max_unroll=self.plan.max_unroll) + [Line()])
Exemplo n.º 2
0
            def unroll_body(j):
                result = [
                    Assign("field_value%d" % inl,
                        "smem_field[PAR_MB_NR][%d][mb_el*PREIMAGE_DOFS_PER_EL+%s]" % (inl, j))
                    for inl in range(par.inline)
                    ]

                if float_type == numpy.float32:
                    result.append(Assign("dmat_entries",
                        "tex1Dfetch(diff_rst_mat_tex, IMAGE_EL_DOF + %s*IMAGE_DOFS_PER_EL)" % j))
                    result.extend(
                        S("d%drst%d += dmat_entries.%s * field_value%d"
                            % (inl, axis, tex_channels[axis], inl))
                        for inl in range(par.inline)
                        for axis in dims)
                elif float_type == numpy.float64:
                    result.extend(
                        S("d%(inl)drst%(axis)d += "
                            "fp_tex1Dfetch(diff_rst_mat_tex, %(axis)d "
                            "+ DIMENSIONS*(IMAGE_EL_DOF + %(j)d*IMAGE_DOFS_PER_EL))"
                            "* field_value%(inl)d" % {
                            "inl": inl,
                            "axis": axis,
                            "j": j
                            })
                        for inl in range(par.inline)
                        for axis in dims)
                else:
                    assert False

                return result
Exemplo n.º 3
0
        def get_load_code():
            mb_img_dofs = plan.aligned_image_dofs_per_microblock
            mb_preimg_dofs = plan.aligned_preimage_dofs_per_microblock
            preimg_dofs_over_dofs = (mb_preimg_dofs+mb_img_dofs-1) // mb_img_dofs

            load_code = []
            store_code = []

            var_num = 0
            for load_block in range(preimg_dofs_over_dofs):
                for inl in range(par.inline):
                    # load and store are split for better pipelining
                    # compiler can't figure that out because of branch

                    var = "tmp%d" % var_num
                    var_num += 1
                    load_code.append(POD(float_type, var))

                    block_addr = "%d * ALIGNED_IMAGE_DOFS_PER_MB + IMAGE_MB_DOF" % load_block
                    load_instr = Assign(var,
                        "field[GLOBAL_MB_PREIMAGE_DOF_BASE"
                        " + %d*ALIGNED_PREIMAGE_DOFS_PER_MB"
                        " + %s]" % (inl, block_addr))
                    store_instr = Assign(
                            "smem_field[PAR_MB_NR][%d][%s]" % (inl, block_addr),
                            var
                            )
                    if (load_block+1)*mb_img_dofs >= mb_preimg_dofs:
                        cond = "%s < ALIGNED_PREIMAGE_DOFS_PER_MB" % block_addr
                        load_instr = If(cond, load_instr)
                        store_instr = If(cond, store_instr)

                    load_code.append(load_instr)
                    store_code.append(store_instr)
            return Block(load_code + [Line()] + store_code)
Exemplo n.º 4
0
 def gen_store(self, flux_nr, index, what):
     if self.plan.direct_store:
         return Assign(
             "gmem_fluxes_on_faces%d[FOF_BLOCK_BASE + %s]" %
             (flux_nr, index), what)
     else:
         return Assign("smem_fluxes_on_faces[%d][%s]" % (flux_nr, index),
                       what)
Exemplo n.º 5
0
        def get_scalar_diff_code():
            code = []
            for inl in range(par.inline):
                for axis in dims:
                    code.append(
                        Initializer(POD(float_type, "d%drst%d" % (inl, axis)),
                                    0))

            code.append(Line())

            def get_mat_entry(row, col, axis):
                return ("smem_diff_rst_mat["
                        "%(row)s*DIFFMAT_COLUMNS + %(axis)s*DOFS_PER_EL"
                        " + %(col)s"
                        "]" % {
                            "row": row,
                            "col": col,
                            "axis": axis
                        })

            tex_channels = ["x", "y", "z", "w"]
            from hedge.backends.cuda.tools import unroll
            code.extend([
                POD(float_type, "field_value%d" % inl)
                for inl in range(par.inline)
            ] + [Line()] + unroll(
                lambda j: [
                    Assign(
                        "field_value%d" % inl,
                        "fp_tex1Dfetch(field_tex, GLOBAL_MB_DOF_BASE + %d*ALIGNED_DOFS_PER_MB "
                        "+ mb_el*DOFS_PER_EL + %s)" % (inl, j))
                    for inl in range(par.inline)
                ] + [Line()] + [
                    S("d%drst%d += %s * field_value%d" %
                      (inl, axis, get_mat_entry("SEGMENT_DOF", j, axis), inl))
                    for axis in dims for inl in range(par.inline)
                ] + [Line()], given.dofs_per_el(), self.plan.max_unroll))

            store_code = Block()
            for inl in range(par.inline):
                for rst_axis in dims:
                    store_code.append(
                        Assign(
                            "drst%d_global[GLOBAL_MB_DOF_BASE"
                            " + %d*ALIGNED_DOFS_PER_MB + MB_DOF]" %
                            (rst_axis, inl),
                            "d%drst%d" % (inl, rst_axis),
                        ))

            code.append(If("MB_DOF < DOFS_PER_EL*ELS_PER_MB", store_code))

            return code
Exemplo n.º 6
0
def test_cgen():
    s = Struct(
        "yuck",
        [
            POD(
                np.float32,
                "h",
            ),
            POD(np.float32, "order"),
            POD(np.float32, "face_jacobian"),
            ArrayOf(POD(np.float32, "normal"), 17),
            POD(np.uint16, "a_base"),
            POD(np.uint16, "b_base"),
            #CudaGlobal(POD(np.uint8, "a_ilist_number")),
            POD(np.uint8, "b_ilist_number"),
            POD(np.uint8, "bdry_flux_number"),  # 0 if not on boundary
            POD(np.uint8, "reserved"),
            POD(np.uint32, "b_global_base"),
        ])
    f_decl = FunctionDeclaration(POD(np.uint16, "get_num"), [
        POD(np.uint8, "reserved"),
        POD(np.uint32, "b_global_base"),
    ])
    f_body = FunctionBody(
        f_decl,
        Block([
            POD(np.uint32, "i"),
            For(
                "i = 0",
                "i < 17",
                "++i",
                If(
                    "a > b",
                    Assign("a", "b"),
                    Block([
                        Assign("a", "b-1"),
                        #Break(),
                    ])),
            ),
            #BlankLine(),
            Comment("all done"),
        ]))
    t_decl = Template(
        'typename T',
        FunctionDeclaration(
            Value('CUdeviceptr', 'scan'),
            [Value('CUdeviceptr', 'inputPtr'),
             Value('int', 'length')]))

    print(s)
    print(f_body)
    print(t_decl)
Exemplo n.º 7
0
        def mat_mul_outer_loop(fetch_count):
            if with_scaling:
                inv_jac_multiplier = (
                    "fp_tex1Dfetch(scaling_tex,"
                    "(GLOBAL_MB_NR + %(inl)d)*MB_EL_COUNT + mb_el)")
            else:
                inv_jac_multiplier = "1"

            write_condition = "MB_DOF < DOFS_PER_EL*MB_EL_COUNT"
            if self.with_index_check:
                write_condition += " && GLOBAL_MB_NR < microblock_count"
            return For(
                "unsigned short seq_mb_number = 0",
                "seq_mb_number < SEQ_MB_COUNT", "++seq_mb_number",
                Block([
                    Initializer(POD(float_type, "result%d" % inl), 0)
                    for inl in range(par.inline)
                ] + [Line()] + get_mat_mul_code(fetch_count) + [
                    If(
                        write_condition,
                        Block([
                            Assign(
                                "out_vector[GLOBAL_MB_DOF_BASE"
                                " + %d*ALIGNED_DOFS_PER_MB"
                                " + MB_DOF]" % inl, "result%d * %s" %
                                (inl, (inv_jac_multiplier % {
                                    "inl": inl
                                }))) for inl in range(par.inline)
                        ]))
                ]))
Exemplo n.º 8
0
        def get_batched_fetch_mat_mul_code(el_fetch_count):
            result = []
            dofs = range(self.plan.preimage_dofs_per_el)

            for load_segment_start in range(0, self.plan.preimage_dofs_per_el,
                                            self.plan.segment_size):
                result.extend([S("__syncthreads()")] + [
                    Assign(
                        "dof_buffer[PAR_MB_NR][%d][SEGMENT_DOF]" %
                        inl, "fp_tex1Dfetch(in_vector_tex, "
                        "GLOBAL_MB_PREIMG_DOF_BASE"
                        " + %d*ALIGNED_PREIMAGE_DOFS_PER_MB"
                        " + (segment_start_el)*PREIMAGE_DOFS_PER_EL + %d + SEGMENT_DOF)"
                        % (inl, load_segment_start))
                    for inl in range(par.inline)
                ] + [
                    S("__syncthreads()"),
                    Line(),
                ])

                for dof in dofs[load_segment_start:load_segment_start +
                                self.plan.segment_size]:
                    for inl in range(par.inline):
                        result.append(
                            S("result%d += "
                              "smem_matrix[SEGMENT_DOF*MATRIX_COLUMNS + %d]"
                              "*"
                              "dof_buffer[PAR_MB_NR][%d][%d]" %
                              (inl, dof, inl, dof - load_segment_start)))
                result.append(Line())
            return result
Exemplo n.º 9
0
    def emit_tuple_assignment(self, codegen_state, insn):
        ecm = codegen_state.expression_to_code_mapper

        from cgen import Assign, block_if_necessary
        assignments = []

        for i, (assignee, parameter) in enumerate(
                zip(insn.assignees, insn.expression.parameters)):
            lhs_code = ecm(assignee, prec=PREC_NONE, type_context=None)
            assignee_var_name = insn.assignee_var_names()[i]
            lhs_var = codegen_state.kernel.get_var_descriptor(
                assignee_var_name)
            lhs_dtype = lhs_var.dtype

            from loopy.expression import dtype_to_type_context
            rhs_type_context = dtype_to_type_context(
                codegen_state.kernel.target, lhs_dtype)
            rhs_code = ecm(parameter,
                           prec=PREC_NONE,
                           type_context=rhs_type_context,
                           needed_dtype=lhs_dtype)

            assignments.append(Assign(lhs_code, rhs_code))

        return block_if_necessary(assignments)
Exemplo n.º 10
0
    def emit_multiple_assignment(self, codegen_state, insn):

        ecm = codegen_state.expression_to_code_mapper
        func_id = insn.expression.function.name
        in_knl_callable = codegen_state.callables_table[func_id]

        if isinstance(in_knl_callable,
                      ScalarCallable) and (in_knl_callable.name_in_target
                                           == "loopy_make_tuple"):
            return self.emit_tuple_assignment(codegen_state, insn)

        # takes "is_returned" to infer whether insn.assignees[0] is a part of
        # LHS.
        in_knl_callable_as_call, is_returned = in_knl_callable.emit_call_insn(
            insn=insn, target=self.target, expression_to_code_mapper=ecm)

        if is_returned:
            from cgen import Assign
            lhs_code = ecm(insn.assignees[0],
                           prec=PREC_NONE,
                           type_context=None)
            return Assign(
                lhs_code,
                CExpression(self.get_c_expression_to_code_mapper(),
                            in_knl_callable_as_call))
        else:
            from cgen import ExpressionStatement
            return ExpressionStatement(
                CExpression(self.get_c_expression_to_code_mapper(),
                            in_knl_callable_as_call))
Exemplo n.º 11
0
def test_value_param():
    data = np.arange(6, dtype=np.float64).reshape((3, 2))
    kernel = Assign("output_grid[i2][i1]", "input_grid[i2][i1] + offset")
    propagator = Propagator("process", 3, (2, ), [])
    propagator.add_param("input_grid", data.shape, data.dtype)
    propagator.add_param("output_grid", data.shape, data.dtype)
    propagator.add_scalar_param("offset", np.int32)
    propagator.loop_body = kernel
    f = propagator.cfunction
    arr = np.empty_like(data)
    f(data, arr, np.int32(3))
    assert (arr[2][1] == 8)
Exemplo n.º 12
0
    def gen_flux_code():
        f2cm = FluxToCodeMapper()

        result = [
                Assign("fof%d_it[loc_fof_base+i]" % flux_idx,
                    "uncomplex_type(fp.int_side.face_jacobian) * " +
                    flux_to_code(f2cm, False, flux_idx, fvi, flux.op.flux, PREC_PRODUCT))
                for flux_idx, flux in enumerate(fluxes)
                ]

        return [
            Initializer(Value("value_type", cse_name), cse_str)
            for cse_name, cse_str in f2cm.cse_name_list] + result
Exemplo n.º 13
0
    def emit_assignment(self, codegen_state, insn):
        kernel = codegen_state.kernel
        ecm = codegen_state.expression_to_code_mapper

        assignee_var_name, = insn.assignee_var_names()

        lhs_var = codegen_state.kernel.get_var_descriptor(assignee_var_name)
        lhs_dtype = lhs_var.dtype

        if insn.atomicity is not None:
            lhs_atomicity = [
                    a for a in insn.atomicity if a.var_name == assignee_var_name]
            assert len(lhs_atomicity) <= 1
            if lhs_atomicity:
                lhs_atomicity, = lhs_atomicity
            else:
                lhs_atomicity = None
        else:
            lhs_atomicity = None

        from loopy.kernel.data import AtomicInit, AtomicUpdate
        from loopy.expression import dtype_to_type_context

        lhs_code = ecm(insn.assignee, prec=PREC_NONE, type_context=None)
        rhs_type_context = dtype_to_type_context(kernel.target, lhs_dtype)
        if lhs_atomicity is None:
            from cgen import Assign
            return Assign(
                    lhs_code,
                    ecm(insn.expression, prec=PREC_NONE,
                        type_context=rhs_type_context,
                        needed_dtype=lhs_dtype))

        elif isinstance(lhs_atomicity, AtomicInit):
            codegen_state.seen_atomic_dtypes.add(lhs_dtype)
            return codegen_state.ast_builder.emit_atomic_init(
                    codegen_state, lhs_atomicity, lhs_var,
                    insn.assignee, insn.expression,
                    lhs_dtype, rhs_type_context)

        elif isinstance(lhs_atomicity, AtomicUpdate):
            codegen_state.seen_atomic_dtypes.add(lhs_dtype)
            return codegen_state.ast_builder.emit_atomic_update(
                    codegen_state, lhs_atomicity, lhs_var,
                    insn.assignee, insn.expression,
                    lhs_dtype, rhs_type_context)

        else:
            raise ValueError("unexpected lhs atomicity type: %s"
                    % type(lhs_atomicity).__name__)
Exemplo n.º 14
0
    def gen_flux_code():
        f2cm = FluxToCodeMapper()

        result = [
                Assign("fof%d_it[%s_fof_base+%s]" % (flux_idx, where, tgt_idx),
                    "uncomplex_type(fp.int_side.face_jacobian) * " +
                    flux_to_code(f2cm, is_flipped, flux_idx, fvi, flux.op.flux, PREC_PRODUCT))
                for flux_idx, flux in enumerate(fluxes)
                for where, is_flipped, tgt_idx in [
                    ("int_side", False, "i"),
                    ("ext_side", True, "ext_native_write_map[i]")
                    ]]

        return [
            Initializer(Value("value_type", cse_name), cse_str)
            for cse_name, cse_str in f2cm.cse_name_list] + result
Exemplo n.º 15
0
    def get_kernel(self, fdata, ilist_data, for_benchmark):
        from cgen.cuda import CudaShared, CudaGlobal
        from pycuda.tools import dtype_to_ctype

        discr = self.discr
        given = self.plan.given
        fplan = self.plan
        d = discr.dimensions
        dims = range(d)

        elgroup, = discr.element_groups

        float_type = given.float_type

        f_decl = CudaGlobal(
            FunctionDeclaration(Value("void", "apply_flux"), [
                Pointer(POD(float_type, "debugbuf")),
                Pointer(POD(numpy.uint8, "gmem_facedata")),
            ] + [
                Pointer(POD(float_type, "gmem_fluxes_on_faces%d" % flux_nr))
                for flux_nr in range(len(self.fluxes))
            ]))

        cmod = Module()
        cmod.append(Include("pycuda-helpers.hpp"))

        for dep_expr in self.all_deps:
            cmod.extend([
                Value(
                    "texture<%s, 1, cudaReadModeElementType>" %
                    dtype_to_ctype(float_type, with_fp_tex_hack=True),
                    "field%d_tex" % self.dep_to_index[dep_expr])
            ])

        if fplan.flux_count != len(self.fluxes):
            from warnings import warn
            warn(
                "Flux count in flux execution plan different from actual flux count.\n"
                "You may want to specify the tune_for= kwarg in the Discretization\n"
                "constructor.")

        cmod.extend([
            Line(),
            Typedef(POD(float_type, "value_type")),
            Line(),
            flux_header_struct(float_type, discr.dimensions),
            Line(),
            face_pair_struct(float_type, discr.dimensions),
            Line(),
            Define("DIMENSIONS", discr.dimensions),
            Define("DOFS_PER_FACE", fplan.dofs_per_face),
            Define("THREADS_PER_FACE", fplan.threads_per_face()),
            Line(),
            Define("CONCURRENT_FACES", fplan.parallel_faces),
            Define("BLOCK_MB_COUNT", fplan.mbs_per_block),
            Line(),
            Define("FACEDOF_NR", "threadIdx.x"),
            Define("BLOCK_FACE", "threadIdx.y"),
            Line(),
            Define("FLUX_COUNT", len(self.fluxes)),
            Line(),
            Define("THREAD_NUM", "(FACEDOF_NR + BLOCK_FACE*THREADS_PER_FACE)"),
            Define("THREAD_COUNT", "(THREADS_PER_FACE*CONCURRENT_FACES)"),
            Define(
                "COALESCING_THREAD_COUNT",
                "(THREAD_COUNT < 0x10 ? THREAD_COUNT : THREAD_COUNT & ~0xf)"),
            Line(),
            Define("DATA_BLOCK_SIZE", fdata.block_bytes),
            Define("ALIGNED_FACE_DOFS_PER_MB",
                   fplan.aligned_face_dofs_per_microblock()),
            Define("ALIGNED_FACE_DOFS_PER_BLOCK",
                   "(ALIGNED_FACE_DOFS_PER_MB*BLOCK_MB_COUNT)"),
            Line(),
            Define("FOF_BLOCK_BASE",
                   "(blockIdx.x*ALIGNED_FACE_DOFS_PER_BLOCK)"),
            Line(),
        ] + ilist_data.code + [
            Line(),
            Value("texture<index_list_entry_t, 1, cudaReadModeElementType>",
                  "tex_index_lists"),
            Line(),
            fdata.struct,
            Line(),
            CudaShared(Value("flux_data", "data")),
        ])

        if not fplan.direct_store:
            cmod.extend([
                CudaShared(
                    ArrayOf(
                        ArrayOf(POD(float_type, "smem_fluxes_on_faces"),
                                "FLUX_COUNT"),
                        "ALIGNED_FACE_DOFS_PER_MB*BLOCK_MB_COUNT")),
                Line(),
            ])

        S = Statement
        f_body = Block()

        from hedge.backends.cuda.tools import get_load_code

        f_body.extend(
            get_load_code(dest="&data",
                          base="gmem_facedata + blockIdx.x*DATA_BLOCK_SIZE",
                          bytes="sizeof(flux_data)",
                          descr="load face_pair data") +
            [S("__syncthreads()"), Line()])

        def get_flux_code(flux_writer):
            flux_code = Block([])

            flux_code.extend([
                Initializer(Pointer(Value("face_pair", "fpair")),
                            "data.facepairs+fpair_nr"),
                Initializer(
                    MaybeUnused(POD(numpy.uint32, "a_index")),
                    "fpair->a_base + tex1Dfetch(tex_index_lists, "
                    "fpair->a_ilist_index + FACEDOF_NR)"),
                Initializer(
                    MaybeUnused(POD(numpy.uint32, "b_index")),
                    "fpair->b_base + tex1Dfetch(tex_index_lists, "
                    "fpair->b_ilist_index + FACEDOF_NR)"),
                Line(),
                flux_writer(),
                Line(),
                S("fpair_nr += CONCURRENT_FACES")
            ])

            return flux_code

        flux_computation = Block([
            Comment("fluxes for dual-sided (intra-block) interior face pairs"),
            While("fpair_nr < data.header.same_facepairs_end",
                  get_flux_code(lambda: self.write_interior_flux_code(True))),
            Line(),
            Comment("work around nvcc assertion failure"),
            S("fpair_nr+=1"),
            S("fpair_nr-=1"),
            Line(),
            Comment(
                "fluxes for single-sided (inter-block) interior face pairs"),
            While("fpair_nr < data.header.diff_facepairs_end",
                  get_flux_code(lambda: self.write_interior_flux_code(False))),
            Line(),
            Comment("fluxes for single-sided boundary face pairs"),
            While(
                "fpair_nr < data.header.bdry_facepairs_end",
                get_flux_code(
                    lambda: self.write_boundary_flux_code(for_benchmark))),
        ])

        f_body.extend_log_block("compute the fluxes", [
            Initializer(POD(numpy.uint32, "fpair_nr"), "BLOCK_FACE"),
            If("FACEDOF_NR < DOFS_PER_FACE", flux_computation)
        ])

        if not fplan.direct_store:
            f_body.extend([Line(), S("__syncthreads()"), Line()])

            f_body.extend_log_block(
                "store fluxes",
                [
                    #Assign("debugbuf[blockIdx.x]", "FOF_BLOCK_BASE"),
                    #Assign("debugbuf[0]", "FOF_BLOCK_BASE"),
                    #Assign("debugbuf[0]", "sizeof(face_pair)"),
                    For(
                        "unsigned word_nr = THREAD_NUM",
                        "word_nr < ALIGNED_FACE_DOFS_PER_MB*BLOCK_MB_COUNT",
                        "word_nr += COALESCING_THREAD_COUNT",
                        Block([
                            Assign(
                                "gmem_fluxes_on_faces%d[FOF_BLOCK_BASE+word_nr]"
                                % flux_nr,
                                "smem_fluxes_on_faces[%d][word_nr]" % flux_nr)
                            for flux_nr in range(len(self.fluxes))
                        ]
                              #+[If("isnan(smem_fluxes_on_faces[%d][word_nr])" % flux_nr,
                              #Block([
                              #Assign("debugbuf[blockIdx.x]", "word_nr"),
                              #])
                              #)
                              #for flux_nr in range(len(self.fluxes))]
                              ))
                ])
        if False:
            f_body.extend([
                Assign("debugbuf[blockIdx.x*96+32+BLOCK_FACE*32+threadIdx.x]",
                       "fpair_nr"),
                Assign("debugbuf[blockIdx.x*96+16]",
                       "data.header.same_facepairs_end"),
                Assign("debugbuf[blockIdx.x*96+17]",
                       "data.header.diff_facepairs_end"),
                Assign("debugbuf[blockIdx.x*96+18]",
                       "data.header.bdry_facepairs_end"),
            ])

        # finish off ----------------------------------------------------------
        cmod.append(FunctionBody(f_decl, f_body))

        if not for_benchmark and "cuda_dump_kernels" in discr.debug:
            from hedge.tools import open_unique_debug_file
            open_unique_debug_file("flux_gather", ".cu").write(str(cmod))

        #from pycuda.tools import allow_user_edit
        mod = SourceModule(
            #allow_user_edit(cmod, "kernel.cu", "the flux kernel"),
            cmod,
            keep="cuda_keep_kernels" in discr.debug)
        expr_to_texture_map = dict(
            (dep_expr,
             mod.get_texref("field%d_tex" % self.dep_to_index[dep_expr]))
            for dep_expr in self.all_deps)

        index_list_texref = mod.get_texref("tex_index_lists")
        index_list_texref.set_address(ilist_data.device_memory,
                                      ilist_data.bytes)
        index_list_texref.set_format(
            cuda.dtype_to_array_format(ilist_data.type), 1)
        index_list_texref.set_flags(cuda.TRSF_READ_AS_INTEGER)

        func = mod.get_function("apply_flux")
        block = (fplan.threads_per_face(), fplan.parallel_faces, 1)
        func.prepare(
            (2 + len(self.fluxes)) * "P",
            texrefs=expr_to_texture_map.values() + [index_list_texref])

        if "cuda_flux" in discr.debug:
            print "flux: lmem=%d smem=%d regs=%d" % (
                func.local_size_bytes, func.shared_size_bytes, func.num_regs)

        return block, func, expr_to_texture_map
Exemplo n.º 16
0
    def emit_atomic_update(self, codegen_state, lhs_atomicity, lhs_var,
                           lhs_expr, rhs_expr, lhs_dtype, rhs_type_context):
        from pymbolic.mapper.stringifier import PREC_NONE

        # FIXME: Could detect operations, generate atomic_{add,...} when
        # appropriate.

        if isinstance(lhs_dtype, NumpyType) and lhs_dtype.numpy_dtype in [
                np.int32, np.int64, np.float32, np.float64
        ]:
            from cgen import Block, DoWhile, Assign
            from loopy.target.c import POD
            old_val_var = codegen_state.var_name_generator("loopy_old_val")
            new_val_var = codegen_state.var_name_generator("loopy_new_val")

            from loopy.kernel.data import TemporaryVariable, AddressSpace
            ecm = codegen_state.expression_to_code_mapper.with_assignments({
                old_val_var:
                TemporaryVariable(old_val_var, lhs_dtype),
                new_val_var:
                TemporaryVariable(new_val_var, lhs_dtype),
            })

            lhs_expr_code = ecm(lhs_expr, prec=PREC_NONE, type_context=None)

            from pymbolic.mapper.substitutor import make_subst_func
            from pymbolic import var
            from loopy.symbolic import SubstitutionMapper

            subst = SubstitutionMapper(
                make_subst_func({lhs_expr: var(old_val_var)}))
            rhs_expr_code = ecm(subst(rhs_expr),
                                prec=PREC_NONE,
                                type_context=rhs_type_context,
                                needed_dtype=lhs_dtype)

            if lhs_dtype.numpy_dtype.itemsize == 4:
                func_name = "atomic_cmpxchg"
            elif lhs_dtype.numpy_dtype.itemsize == 8:
                func_name = "atom_cmpxchg"
            else:
                raise LoopyError("unexpected atomic size")

            cast_str = ""
            old_val = old_val_var
            new_val = new_val_var

            if lhs_dtype.numpy_dtype.kind == "f":
                if lhs_dtype.numpy_dtype == np.float32:
                    ctype = "int"
                elif lhs_dtype.numpy_dtype == np.float64:
                    ctype = "long"
                else:
                    assert False

                from loopy.kernel.data import (TemporaryVariable, ArrayArg)
                if (isinstance(lhs_var, ArrayArg)
                        and lhs_var.address_space == AddressSpace.GLOBAL):
                    var_kind = "__global"
                elif (isinstance(lhs_var, ArrayArg)
                      and lhs_var.address_space == AddressSpace.LOCAL):
                    var_kind = "__local"
                elif (isinstance(lhs_var, TemporaryVariable)
                      and lhs_var.address_space == AddressSpace.LOCAL):
                    var_kind = "__local"
                elif (isinstance(lhs_var, TemporaryVariable)
                      and lhs_var.address_space == AddressSpace.GLOBAL):
                    var_kind = "__global"
                else:
                    raise LoopyError("unexpected kind of variable '%s' in "
                                     "atomic operation: " %
                                     (lhs_var.name, type(lhs_var).__name__))

                old_val = "*(%s *) &" % ctype + old_val
                new_val = "*(%s *) &" % ctype + new_val
                cast_str = "(%s %s *) " % (var_kind, ctype)

            return Block([
                POD(self, NumpyType(lhs_dtype.dtype, target=self.target),
                    old_val_var),
                POD(self, NumpyType(lhs_dtype.dtype, target=self.target),
                    new_val_var),
                DoWhile(
                    "%(func_name)s("
                    "%(cast_str)s&(%(lhs_expr)s), "
                    "%(old_val)s, "
                    "%(new_val)s"
                    ") != %(old_val)s" % {
                        "func_name": func_name,
                        "cast_str": cast_str,
                        "lhs_expr": lhs_expr_code,
                        "old_val": old_val,
                        "new_val": new_val,
                    },
                    Block([
                        Assign(old_val_var, lhs_expr_code),
                        Assign(new_val_var, rhs_expr_code),
                    ]))
            ])
        else:
            raise NotImplementedError("atomic update for '%s'" % lhs_dtype)
Exemplo n.º 17
0
    def emit_atomic_update(self, codegen_state, lhs_atomicity, lhs_var,
                           lhs_expr, rhs_expr, lhs_dtype, rhs_type_context):

        from pymbolic.primitives import Sum
        from cgen import Statement
        from pymbolic.mapper.stringifier import PREC_NONE

        if isinstance(lhs_dtype, NumpyType) and lhs_dtype.numpy_dtype in [
                np.int32, np.int64, np.float32, np.float64
        ]:
            # atomicAdd
            if isinstance(rhs_expr, Sum):
                ecm = self.get_expression_to_code_mapper(codegen_state)

                new_rhs_expr = Sum(
                    tuple(c for c in rhs_expr.children if c != lhs_expr))
                lhs_expr_code = ecm(lhs_expr)
                rhs_expr_code = ecm(new_rhs_expr)

                return Statement("atomicAdd(&{}, {})".format(
                    lhs_expr_code, rhs_expr_code))
            else:
                from cgen import Block, DoWhile, Assign
                from loopy.target.c import POD
                old_val_var = codegen_state.var_name_generator("loopy_old_val")
                new_val_var = codegen_state.var_name_generator("loopy_new_val")

                from loopy.kernel.data import TemporaryVariable
                ecm = codegen_state.expression_to_code_mapper.with_assignments(
                    {
                        old_val_var: TemporaryVariable(old_val_var, lhs_dtype),
                        new_val_var: TemporaryVariable(new_val_var, lhs_dtype),
                    })

                lhs_expr_code = ecm(lhs_expr,
                                    prec=PREC_NONE,
                                    type_context=None)

                from pymbolic.mapper.substitutor import make_subst_func
                from pymbolic import var
                from loopy.symbolic import SubstitutionMapper

                subst = SubstitutionMapper(
                    make_subst_func({lhs_expr: var(old_val_var)}))
                rhs_expr_code = ecm(subst(rhs_expr),
                                    prec=PREC_NONE,
                                    type_context=rhs_type_context,
                                    needed_dtype=lhs_dtype)

                cast_str = ""
                old_val = old_val_var
                new_val = new_val_var

                if lhs_dtype.numpy_dtype.kind == "f":
                    if lhs_dtype.numpy_dtype == np.float32:
                        ctype = "int"
                    elif lhs_dtype.numpy_dtype == np.float64:
                        ctype = "long"
                    else:
                        raise AssertionError()

                    old_val = "*(%s *) &" % ctype + old_val
                    new_val = "*(%s *) &" % ctype + new_val
                    cast_str = "(%s *) " % (ctype)

                return Block([
                    POD(self, NumpyType(lhs_dtype.dtype, target=self.target),
                        old_val_var),
                    POD(self, NumpyType(lhs_dtype.dtype, target=self.target),
                        new_val_var),
                    DoWhile(
                        "atomicCAS("
                        "%(cast_str)s&(%(lhs_expr)s), "
                        "%(old_val)s, "
                        "%(new_val)s"
                        ") != %(old_val)s" % {
                            "cast_str": cast_str,
                            "lhs_expr": lhs_expr_code,
                            "old_val": old_val,
                            "new_val": new_val,
                        },
                        Block([
                            Assign(old_val_var, lhs_expr_code),
                            Assign(new_val_var, rhs_expr_code),
                        ]))
                ])
        else:
            raise NotImplementedError("atomic update for '%s'" % lhs_dtype)
Exemplo n.º 18
0
    def emit_assignment(self, codegen_state, insn):
        kernel = codegen_state.kernel
        ecm = codegen_state.expression_to_code_mapper

        assignee_var_name, = insn.assignee_var_names()

        lhs_var = codegen_state.kernel.get_var_descriptor(assignee_var_name)
        lhs_dtype = lhs_var.dtype

        if insn.atomicity:
            raise NotImplementedError("atomic ops in ISPC")

        from loopy.expression import dtype_to_type_context
        from pymbolic.mapper.stringifier import PREC_NONE

        rhs_type_context = dtype_to_type_context(kernel.target, lhs_dtype)
        rhs_code = ecm(insn.expression,
                       prec=PREC_NONE,
                       type_context=rhs_type_context,
                       needed_dtype=lhs_dtype)

        lhs = insn.assignee

        # {{{ handle streaming stores

        if "!streaming_store" in insn.tags:
            ary = ecm.find_array(lhs)

            from loopy.kernel.array import get_access_info
            from pymbolic import evaluate

            from loopy.symbolic import simplify_using_aff
            index_tuple = tuple(
                simplify_using_aff(kernel, idx) for idx in lhs.index_tuple)

            access_info = get_access_info(
                kernel.target, ary, index_tuple,
                lambda expr: evaluate(expr, self.codegen_state.var_subst_map),
                codegen_state.vectorization_info)

            from loopy.kernel.data import GlobalArg, TemporaryVariable

            if not isinstance(ary, (GlobalArg, TemporaryVariable)):
                raise LoopyError("array type not supported in ISPC: %s" %
                                 type(ary).__name)

            if len(access_info.subscripts) != 1:
                raise LoopyError("streaming stores must have a subscript")
            subscript, = access_info.subscripts

            from pymbolic.primitives import Sum, flattened_sum, Variable
            if isinstance(subscript, Sum):
                terms = subscript.children
            else:
                terms = (subscript.children, )

            new_terms = []

            from loopy.kernel.data import LocalIndexTag
            from loopy.symbolic import get_dependencies

            saw_l0 = False
            for term in terms:
                if (isinstance(term, Variable) and isinstance(
                        kernel.iname_to_tag.get(term.name), LocalIndexTag)
                        and kernel.iname_to_tag.get(term.name).axis == 0):
                    if saw_l0:
                        raise LoopyError("streaming store must have stride 1 "
                                         "in local index, got: %s" % subscript)
                    saw_l0 = True
                    continue
                else:
                    for dep in get_dependencies(term):
                        if (isinstance(kernel.iname_to_tag.get(dep),
                                       LocalIndexTag)
                                and kernel.iname_to_tag.get(dep).axis == 0):
                            raise LoopyError(
                                "streaming store must have stride 1 "
                                "in local index, got: %s" % subscript)

                    new_terms.append(term)

            if not saw_l0:
                raise LoopyError("streaming store must have stride 1 in "
                                 "local index, got: %s" % subscript)

            if access_info.vector_index is not None:
                raise LoopyError("streaming store may not use a short-vector "
                                 "data type")

            rhs_has_programindex = any(
                isinstance(kernel.iname_to_tag.get(dep), LocalIndexTag)
                and kernel.iname_to_tag.get(dep).axis == 0
                for dep in get_dependencies(insn.expression))

            if not rhs_has_programindex:
                rhs_code = "broadcast(%s, 0)" % rhs_code

            from cgen import Statement
            return Statement(
                "streaming_store(%s + %s, %s)" %
                (access_info.array_name,
                 ecm(flattened_sum(new_terms), PREC_NONE, 'i'), rhs_code))

        # }}}

        from cgen import Assign
        return Assign(ecm(lhs, prec=PREC_NONE, type_context=None), rhs_code)
Exemplo n.º 19
0
from cgen.cuda import CudaGlobal

mod = Module([
    FunctionBody(
        CudaGlobal(
            FunctionDeclaration(Value("void", "add"),
                                arg_decls=[
                                    Pointer(POD(dtype, name))
                                    for name in ["tgt", "op1", "op2"]
                                ])),
        Block([
            Initializer(
                POD(numpy.int32, "idx"), "threadIdx.x + %d*blockIdx.x" %
                (block_size * thread_strides)),
        ] + [
            Assign(
                "tgt[idx+%d]" % (o * block_size), "op1[idx+%d] + op2[idx+%d]" %
                (o * block_size, o * block_size))
            for o in range(thread_strides)
        ]))
])

mod = SourceModule(mod)

func = mod.get_function("add")
func(c_gpu, a_gpu, b_gpu, block=(block_size, 1, 1), grid=(macroblock_count, 1))

c = cuda.from_device_like(c_gpu, a)

assert la.norm(c - (a + b)) == 0
Exemplo n.º 20
0
 def emit_assignment(self, codegen_state, lhs, rhs):
     from cgen import Assign
     return Assign(lhs, rhs)
Exemplo n.º 21
0
            CLRequiredWorkGroupSize(
                (local_size, ),
                FunctionDeclaration(Value("void", "add"),
                                    arg_decls=[
                                        CLGlobal(
                                            Pointer(Const(POD(dtype, name))))
                                        for name in ["tgt", "op1", "op2"]
                                    ]))),
        Block([
            Initializer(
                POD(numpy.int32,
                    "idx"), "get_local_id(0) + %d * get_group_id(0)" %
                (local_size * thread_strides))
        ] + [
            Assign(
                "tgt[idx+%d]" % (o * local_size), "op1[idx+%d] + op2[idx+%d]" %
                (o * local_size, o * local_size))
            for o in range(thread_strides)
        ]))
])

knl = cl.Program(ctx, str(mod)).build().add

knl(queue, (local_size * macroblock_count, ), (local_size, ), c_buf, a_buf,
    b_buf)

c = numpy.empty_like(a)
cl.enqueue_copy(queue, c, c_buf).wait()

assert la.norm(c - (a + b)) == 0
Exemplo n.º 22
0
    def get_kernel(self, with_scaling, for_benchmark=False):
        from cgen import \
                Pointer, POD, Value, ArrayOf, \
                Module, FunctionDeclaration, FunctionBody, Block, \
                Line, Define, Include, \
                Initializer, If, For, Statement, Assign, \
                ArrayInitializer

        from cgen import dtype_to_ctype
        from cgen.cuda import CudaShared, CudaConstant, CudaGlobal

        discr = self.discr
        d = discr.dimensions
        dims = range(d)
        given = self.plan.given

        float_type = given.float_type

        f_decl = CudaGlobal(
            FunctionDeclaration(Value("void", "apply_el_local_mat_smem_mat"), [
                Pointer(POD(float_type, "out_vector")),
                Pointer(POD(numpy.uint8, "gmem_matrix")),
                Pointer(POD(float_type, "debugbuf")),
                POD(numpy.uint32, "microblock_count"),
            ]))

        cmod = Module([
            Include("pycuda-helpers.hpp"),
            Line(),
            Value(
                "texture<fp_tex_%s, 1, cudaReadModeElementType>" %
                dtype_to_ctype(float_type), "in_vector_tex"),
        ])
        if with_scaling:
            cmod.append(
                Value(
                    "texture<fp_tex_%s, 1, cudaReadModeElementType>" %
                    dtype_to_ctype(float_type), "scaling_tex"), )

        par = self.plan.parallelism

        cmod.extend([
            Line(),
            Define("DIMENSIONS", discr.dimensions),
            Define("DOFS_PER_EL", given.dofs_per_el()),
            Define("PREIMAGE_DOFS_PER_EL", self.plan.preimage_dofs_per_el),
            Line(),
            Define("SEGMENT_DOF", "threadIdx.x"),
            Define("PAR_MB_NR", "threadIdx.y"),
            Line(),
            Define("MB_SEGMENT", "blockIdx.x"),
            Define("MACROBLOCK_NR", "blockIdx.y"),
            Line(),
            Define("DOFS_PER_SEGMENT", self.plan.segment_size),
            Define("SEGMENTS_PER_MB", self.plan.segments_per_microblock()),
            Define("ALIGNED_DOFS_PER_MB", given.microblock.aligned_floats),
            Define("ALIGNED_PREIMAGE_DOFS_PER_MB",
                   self.plan.aligned_preimage_dofs_per_microblock),
            Define("MB_EL_COUNT", given.microblock.elements),
            Line(),
            Define("PAR_MB_COUNT", par.parallel),
            Define("INLINE_MB_COUNT", par.inline),
            Define("SEQ_MB_COUNT", par.serial),
            Line(),
            Define("THREAD_NUM", "(SEGMENT_DOF+PAR_MB_NR*DOFS_PER_SEGMENT)"),
            Define("COALESCING_THREAD_COUNT",
                   "(PAR_MB_COUNT*DOFS_PER_SEGMENT)"),
            Line(),
            Define("MB_DOF_BASE", "(MB_SEGMENT*DOFS_PER_SEGMENT)"),
            Define("MB_DOF", "(MB_DOF_BASE+SEGMENT_DOF)"),
            Define(
                "GLOBAL_MB_NR_BASE",
                "(MACROBLOCK_NR*PAR_MB_COUNT*INLINE_MB_COUNT*SEQ_MB_COUNT)"),
            Define(
                "GLOBAL_MB_NR", "(GLOBAL_MB_NR_BASE"
                "+ (seq_mb_number*PAR_MB_COUNT + PAR_MB_NR)*INLINE_MB_COUNT)"),
            Define("GLOBAL_MB_DOF_BASE", "(GLOBAL_MB_NR*ALIGNED_DOFS_PER_MB)"),
            Define("GLOBAL_MB_PREIMG_DOF_BASE",
                   "(GLOBAL_MB_NR*ALIGNED_PREIMAGE_DOFS_PER_MB)"),
            Line(),
            Define("MATRIX_COLUMNS", self.plan.gpu_matrix_columns()),
            Define("MATRIX_SEGMENT_FLOATS",
                   self.plan.gpu_matrix_block_floats()),
            Define("MATRIX_SEGMENT_BYTES",
                   "(MATRIX_SEGMENT_FLOATS*%d)" % given.float_size()),
            Line(),
            CudaShared(
                ArrayOf(POD(float_type, "smem_matrix"),
                        "MATRIX_SEGMENT_FLOATS")),
            CudaShared(
                ArrayOf(
                    ArrayOf(
                        ArrayOf(POD(float_type, "dof_buffer"), "PAR_MB_COUNT"),
                        "INLINE_MB_COUNT"), "DOFS_PER_SEGMENT"), ),
            CudaShared(POD(numpy.uint16, "segment_start_el")),
            CudaShared(POD(numpy.uint16, "segment_stop_el")),
            CudaShared(POD(numpy.uint16, "segment_el_count")),
            Line(),
            ArrayInitializer(
                CudaConstant(
                    ArrayOf(POD(numpy.uint32, "segment_start_el_lookup"),
                            "SEGMENTS_PER_MB")),
                [(chk * self.plan.segment_size) // given.dofs_per_el()
                 for chk in range(self.plan.segments_per_microblock())]),
            ArrayInitializer(
                CudaConstant(
                    ArrayOf(POD(numpy.uint32, "segment_stop_el_lookup"),
                            "SEGMENTS_PER_MB")),
                [
                    min(given.microblock.elements,
                        (chk * self.plan.segment_size +
                         self.plan.segment_size - 1) // given.dofs_per_el() +
                        1)
                    for chk in range(self.plan.segments_per_microblock())
                ]),
        ])

        S = Statement
        f_body = Block()

        f_body.extend_log_block(
            "calculate this dof's element",
            [Initializer(POD(numpy.uint8, "mb_el"), "MB_DOF/DOFS_PER_EL")])

        if self.plan.use_prefetch_branch:
            f_body.extend_log_block("calculate segment responsibility data", [
                If(
                    "THREAD_NUM==0",
                    Block([
                        Assign("segment_start_el",
                               "segment_start_el_lookup[MB_SEGMENT]"),
                        Assign("segment_stop_el",
                               "segment_stop_el_lookup[MB_SEGMENT]"),
                        Assign("segment_el_count",
                               "segment_stop_el-segment_start_el"),
                    ])),
                S("__syncthreads()")
            ])

        from hedge.backends.cuda.tools import get_load_code
        f_body.extend(
            get_load_code(dest="smem_matrix",
                          base=(
                              "gmem_matrix + MB_SEGMENT*MATRIX_SEGMENT_BYTES"),
                          bytes="MATRIX_SEGMENT_BYTES",
                          descr="load matrix segment") +
            [S("__syncthreads()")])

        # ---------------------------------------------------------------------
        def get_batched_fetch_mat_mul_code(el_fetch_count):
            result = []
            dofs = range(self.plan.preimage_dofs_per_el)

            for load_segment_start in range(0, self.plan.preimage_dofs_per_el,
                                            self.plan.segment_size):
                result.extend([S("__syncthreads()")] + [
                    Assign(
                        "dof_buffer[PAR_MB_NR][%d][SEGMENT_DOF]" %
                        inl, "fp_tex1Dfetch(in_vector_tex, "
                        "GLOBAL_MB_PREIMG_DOF_BASE"
                        " + %d*ALIGNED_PREIMAGE_DOFS_PER_MB"
                        " + (segment_start_el)*PREIMAGE_DOFS_PER_EL + %d + SEGMENT_DOF)"
                        % (inl, load_segment_start))
                    for inl in range(par.inline)
                ] + [
                    S("__syncthreads()"),
                    Line(),
                ])

                for dof in dofs[load_segment_start:load_segment_start +
                                self.plan.segment_size]:
                    for inl in range(par.inline):
                        result.append(
                            S("result%d += "
                              "smem_matrix[SEGMENT_DOF*MATRIX_COLUMNS + %d]"
                              "*"
                              "dof_buffer[PAR_MB_NR][%d][%d]" %
                              (inl, dof, inl, dof - load_segment_start)))
                result.append(Line())
            return result

        from hedge.backends.cuda.tools import unroll

        def get_direct_tex_mat_mul_code():
            return (
                [POD(float_type, "fof%d" % inl) for inl in range(par.inline)] +
                [POD(float_type, "lm"), Line()] + unroll(
                    lambda j: [
                        Assign(
                            "fof%d" % inl,
                            "fp_tex1Dfetch(in_vector_tex, "
                            "GLOBAL_MB_PREIMG_DOF_BASE"
                            " + %(inl)d * ALIGNED_PREIMAGE_DOFS_PER_MB"
                            " + mb_el*PREIMAGE_DOFS_PER_EL+%(j)s)" % {
                                "j": j,
                                "inl": inl,
                                "row": "SEGMENT_DOF"
                            },
                        ) for inl in range(par.inline)
                    ] + [
                        Assign(
                            "lm",
                            "smem_matrix["
                            "%(row)s*MATRIX_COLUMNS + %(j)s]" % {
                                "j": j,
                                "row": "SEGMENT_DOF"
                            },
                        )
                    ] + [
                        S("result%(inl)d += fof%(inl)d*lm" % {"inl": inl})
                        for inl in range(par.inline)
                    ],
                    total_number=self.plan.preimage_dofs_per_el,
                    max_unroll=self.plan.max_unroll) + [Line()])

        def get_mat_mul_code(el_fetch_count):
            if el_fetch_count == 1:
                return get_batched_fetch_mat_mul_code(el_fetch_count)
            else:
                return get_direct_tex_mat_mul_code()

        def mat_mul_outer_loop(fetch_count):
            if with_scaling:
                inv_jac_multiplier = (
                    "fp_tex1Dfetch(scaling_tex,"
                    "(GLOBAL_MB_NR + %(inl)d)*MB_EL_COUNT + mb_el)")
            else:
                inv_jac_multiplier = "1"

            write_condition = "MB_DOF < DOFS_PER_EL*MB_EL_COUNT"
            if self.with_index_check:
                write_condition += " && GLOBAL_MB_NR < microblock_count"
            return For(
                "unsigned short seq_mb_number = 0",
                "seq_mb_number < SEQ_MB_COUNT", "++seq_mb_number",
                Block([
                    Initializer(POD(float_type, "result%d" % inl), 0)
                    for inl in range(par.inline)
                ] + [Line()] + get_mat_mul_code(fetch_count) + [
                    If(
                        write_condition,
                        Block([
                            Assign(
                                "out_vector[GLOBAL_MB_DOF_BASE"
                                " + %d*ALIGNED_DOFS_PER_MB"
                                " + MB_DOF]" % inl, "result%d * %s" %
                                (inl, (inv_jac_multiplier % {
                                    "inl": inl
                                }))) for inl in range(par.inline)
                        ]))
                ]))

        if self.plan.use_prefetch_branch:
            from cgen import make_multiple_ifs
            f_body.append(
                make_multiple_ifs([
                    ("segment_el_count == %d" % fetch_count,
                     mat_mul_outer_loop(fetch_count)) for fetch_count in range(
                         1,
                         self.plan.max_elements_touched_by_segment() + 1)
                ]))
        else:
            f_body.append(mat_mul_outer_loop(0))

        # finish off ----------------------------------------------------------
        cmod.append(FunctionBody(f_decl, f_body))

        if not for_benchmark and "cuda_dump_kernels" in discr.debug:
            from hedge.tools import open_unique_debug_file
            open_unique_debug_file(self.plan.debug_name,
                                   ".cu").write(str(cmod))

        mod = SourceModule(
            cmod,
            keep="cuda_keep_kernels" in discr.debug,
            #options=["--maxrregcount=12"]
        )

        func = mod.get_function("apply_el_local_mat_smem_mat")

        if self.plan.debug_name in discr.debug:
            print "%s: lmem=%d smem=%d regs=%d" % (
                self.plan.debug_name, func.local_size_bytes,
                func.shared_size_bytes, func.num_regs)

        in_vector_texref = mod.get_texref("in_vector_tex")
        texrefs = [in_vector_texref]

        if with_scaling:
            scaling_texref = mod.get_texref("scaling_tex")
            texrefs.append(scaling_texref)
        else:
            scaling_texref = None

        func.prepare("PPPI",
                     block=(self.plan.segment_size,
                            self.plan.parallelism.parallel, 1),
                     texrefs=texrefs)

        return func, in_vector_texref, scaling_texref
Exemplo n.º 23
0
        def get_scalar_diff_code():
            code = []
            for inl in range(par.inline):
                for axis in dims:
                    code.append(
                        Initializer(POD(float_type, "d%drst%d" % (inl, axis)), 0))

            code.append(Line())

            tex_channels = ["x", "y", "z", "w"]

            store_code = Block()
            for inl in range(par.inline):
                for rst_axis in dims:
                    store_code.append(Assign(
                        "drst%d_global[GLOBAL_MB_IMAGE_DOF_BASE + "
                        "%d*ALIGNED_IMAGE_DOFS_PER_MB + IMAGE_MB_DOF]"
                        % (rst_axis, inl),
                        "d%drst%d" % (inl, rst_axis)
                        ))

            from hedge.backends.cuda.tools import unroll
            code.extend([
                Comment("everybody needs to be done with the old data"),
                S("__syncthreads()"),
                Line(),
                get_load_code(),
                Line(),
                Comment("all the new data must be loaded"),
                S("__syncthreads()"),
                Line(),
                ])

            if float_type == numpy.float32:
                code.append(Value("float%d" % rst_channels, "dmat_entries"))

            code.extend([
                POD(float_type, "field_value%d" % inl)
                for inl in range(par.inline)
                ]+[Line()])

            def unroll_body(j):
                result = [
                    Assign("field_value%d" % inl,
                        "smem_field[PAR_MB_NR][%d][mb_el*PREIMAGE_DOFS_PER_EL+%s]" % (inl, j))
                    for inl in range(par.inline)
                    ]

                if float_type == numpy.float32:
                    result.append(Assign("dmat_entries",
                        "tex1Dfetch(diff_rst_mat_tex, IMAGE_EL_DOF + %s*IMAGE_DOFS_PER_EL)" % j))
                    result.extend(
                        S("d%drst%d += dmat_entries.%s * field_value%d"
                            % (inl, axis, tex_channels[axis], inl))
                        for inl in range(par.inline)
                        for axis in dims)
                elif float_type == numpy.float64:
                    result.extend(
                        S("d%(inl)drst%(axis)d += "
                            "fp_tex1Dfetch(diff_rst_mat_tex, %(axis)d "
                            "+ DIMENSIONS*(IMAGE_EL_DOF + %(j)d*IMAGE_DOFS_PER_EL))"
                            "* field_value%(inl)d" % {
                            "inl": inl,
                            "axis": axis,
                            "j": j
                            })
                        for inl in range(par.inline)
                        for axis in dims)
                else:
                    assert False

                return result

            code.append(If("IMAGE_MB_DOF < IMAGE_DOFS_PER_MB", Block(unroll(unroll_body,
                    total_number=plan.preimage_dofs_per_el)
                    +[store_code])))

            return code
Exemplo n.º 24
0
    def make_lift(self, fgroup, with_scale, dtype):
        discr = self.discr
        from cgen import (FunctionDeclaration, FunctionBody, Typedef, Const,
                          Reference, Value, POD, Statement, Include, Line,
                          Block, Initializer, Assign, For, If, Define)

        from pytools import to_uncomplex_dtype

        from codepy.bpl import BoostPythonModule
        mod = BoostPythonModule()

        S = Statement
        mod.add_to_preamble([
            Include("hedge/face_operators.hpp"),
            Include("hedge/volume_operators.hpp"),
            Include("boost/foreach.hpp"),
        ])

        mod.add_to_module([
            S("namespace ublas = boost::numeric::ublas"),
            S("using namespace hedge"),
            S("using namespace pyublas"),
            Line(),
            Define("DOFS_PER_EL", fgroup.ldis_loc.node_count()),
            Define("FACES_PER_EL", fgroup.ldis_loc.face_count()),
            Define("DIMENSIONS", discr.dimensions),
            Line(),
            Typedef(POD(dtype, "value_type")),
            Typedef(POD(to_uncomplex_dtype(dtype), "uncomplex_type")),
        ])

        def if_(cond, result, else_=None):
            if cond:
                return [result]
            else:
                if else_ is None:
                    return []
                else:
                    return [else_]

        fdecl = FunctionDeclaration(Value("void", "lift"), [
            Const(
                Reference(Value("face_group<face_pair<straight_face> >",
                                "fg"))),
            Value("ublas::matrix<uncomplex_type>", "matrix"),
            Value("numpy_array<value_type>", "field"),
            Value("numpy_array<value_type>", "result")
        ] + if_(
            with_scale,
            Const(
                Reference(Value("numpy_array<double>",
                                "elwise_post_scaling")))))

        def make_it(name, is_const=True, tpname="value_type"):
            if is_const:
                const = "const_"
            else:
                const = ""

            return Initializer(
                Value("numpy_array<%s>::%siterator" % (tpname, const),
                      name + "_it"), "%s.begin()" % name)

        fbody = Block([
            make_it("field"),
            make_it("result", is_const=False),
        ] + if_(with_scale, make_it("elwise_post_scaling", tpname="double")) + [
            Line(),
            For(
                "unsigned fg_el_nr = 0", "fg_el_nr < fg.element_count()",
                "++fg_el_nr",
                Block([
                    Initializer(Value("node_number_t", "dest_el_base"),
                                "fg.local_el_write_base[fg_el_nr]"),
                    Initializer(Value("node_number_t", "src_el_base"),
                                "FACES_PER_EL*fg.face_length()*fg_el_nr"),
                    Line(),
                    For(
                        "unsigned i = 0", "i < DOFS_PER_EL", "++i",
                        Block([
                            Initializer(Value("value_type", "tmp"), 0),
                            Line(),
                            For(
                                "unsigned j = 0",
                                "j < FACES_PER_EL*fg.face_length()", "++j",
                                S("tmp += matrix(i, j)*field_it[src_el_base+j]"
                                  )),
                            Line(),
                        ] + if_(
                            with_scale,
                            Assign(
                                "result_it[dest_el_base+i]",
                                "tmp * value_type(*elwise_post_scaling_it)"),
                            Assign("result_it[dest_el_base+i]", "tmp")))),
                ] + if_(with_scale, S("elwise_post_scaling_it++"))))
        ])

        mod.add_function(FunctionBody(fdecl, fbody))

        #print "----------------------------------------------------------------"
        #print FunctionBody(fdecl, fbody)
        #raw_input()

        return mod.compile(self.discr.toolchain).lift
Exemplo n.º 25
0
    def emit_multiple_assignment(self, codegen_state, insn):
        ecm = codegen_state.expression_to_code_mapper

        from pymbolic.primitives import Variable
        from pymbolic.mapper.stringifier import PREC_NONE

        func_id = insn.expression.function
        parameters = insn.expression.parameters

        if isinstance(func_id, Variable):
            func_id = func_id.name

        assignee_var_descriptors = [
            codegen_state.kernel.get_var_descriptor(a)
            for a in insn.assignee_var_names()
        ]

        par_dtypes = tuple(ecm.infer_type(par) for par in parameters)

        mangle_result = codegen_state.kernel.mangle_function(
            func_id, par_dtypes)
        if mangle_result is None:
            raise RuntimeError(
                "function '%s' unknown--"
                "maybe you need to register a function mangler?" % func_id)

        assert mangle_result.arg_dtypes is not None

        from loopy.expression import dtype_to_type_context
        c_parameters = [
            ecm(par, PREC_NONE, dtype_to_type_context(self.target, tgt_dtype),
                tgt_dtype).expr for par, par_dtype, tgt_dtype in zip(
                    parameters, par_dtypes, mangle_result.arg_dtypes)
        ]

        from loopy.codegen import SeenFunction
        codegen_state.seen_functions.add(
            SeenFunction(func_id, mangle_result.target_name,
                         mangle_result.arg_dtypes))

        from pymbolic import var
        for i, (a, tgt_dtype) in enumerate(
                zip(insn.assignees[1:], mangle_result.result_dtypes[1:])):
            if tgt_dtype != ecm.infer_type(a):
                raise LoopyError("type mismatch in %d'th (1-based) left-hand "
                                 "side of instruction '%s'" % (i + 1, insn.id))
            c_parameters.append(
                # TODO Yuck: The "where-at function": &(...)
                var("&")(ecm(a, PREC_NONE,
                             dtype_to_type_context(self.target, tgt_dtype),
                             tgt_dtype).expr))

        from pymbolic import var
        result = var(mangle_result.target_name)(*c_parameters)

        # In case of no assignees, we are done
        if len(mangle_result.result_dtypes) == 0:
            from cgen import ExpressionStatement
            return ExpressionStatement(
                CExpression(self.get_c_expression_to_code_mapper(), result))

        result = ecm.wrap_in_typecast(mangle_result.result_dtypes[0],
                                      assignee_var_descriptors[0].dtype,
                                      result)

        lhs_code = ecm(insn.assignees[0], prec=PREC_NONE, type_context=None)

        from cgen import Assign
        return Assign(
            lhs_code,
            CExpression(self.get_c_expression_to_code_mapper(), result))
Exemplo n.º 26
0
    def make_diff(self, elgroup, dtype, shape):
        """
        :param shape: If non-square, the resulting code takes two element_ranges
          arguments and supports non-square matrices.
        """
        from hedge._internal import UniformElementRanges
        assert isinstance(elgroup.ranges, UniformElementRanges)

        ldis = elgroup.local_discretization
        discr = self.discr
        from cgen import (
                FunctionDeclaration, FunctionBody, Typedef,
                Const, Reference, Value, POD,
                Statement, Include, Line, Block, Initializer, Assign,
                For, If,
                Define)

        from pytools import to_uncomplex_dtype

        from codepy.bpl import BoostPythonModule
        mod = BoostPythonModule()

        # {{{ preamble
        S = Statement
        mod.add_to_preamble([
            Include("hedge/volume_operators.hpp"),
            Include("boost/foreach.hpp"),
            ])

        mod.add_to_module([
            S("namespace ublas = boost::numeric::ublas"),
            S("using namespace hedge"),
            S("using namespace pyublas"),
            Line(),
            Define("ROW_COUNT", shape[0]),
            Define("COL_COUNT", shape[1]),
            Define("DIMENSIONS", discr.dimensions),
            Line(),
            Typedef(POD(dtype, "value_type")),
            Typedef(POD(to_uncomplex_dtype(dtype), "uncomplex_type")),
            ])

        fdecl = FunctionDeclaration(
                    Value("void", "diff"),
                    [
                    Const(Reference(Value("uniform_element_ranges", "from_ers"))),
                    Const(Reference(Value("uniform_element_ranges", "to_ers"))),
                    Value("numpy_array<value_type>", "field")
                    ]+[
                    Value("ublas::matrix<uncomplex_type>", "diffmat_rst%d" % rst)
                    for rst in range(discr.dimensions)
                    ]+[
                    Value("numpy_array<value_type>", "result%d" % i)
                    for i in range(discr.dimensions)
                    ]
                    )
        # }}}

        # {{{ set-up
        def make_it(name, is_const=True, tpname="value_type"):
            if is_const:
                const = "const_"
            else:
                const = ""

            return Initializer(
                Value("numpy_array<%s>::%siterator" % (tpname, const), name+"_it"),
                "%s.begin()" % name)

        fbody = Block([
            If("ROW_COUNT != diffmat_rst%d.size1()" % i,
                S('throw(std::runtime_error("unexpected matrix size"))'))
            for i in range(discr.dimensions)
            ] + [
            If("COL_COUNT != diffmat_rst%d.size2()" % i,
                S('throw(std::runtime_error("unexpected matrix size"))'))
            for i in range(discr.dimensions) 
            ]+[
            If("ROW_COUNT != to_ers.el_size()",
                S('throw(std::runtime_error("unsupported image element size"))')),
            If("COL_COUNT != from_ers.el_size()",
                S('throw(std::runtime_error("unsupported preimage element size"))')),
            If("from_ers.size() != to_ers.size()",
                S('throw(std::runtime_error("image and preimage element groups '
                    'do nothave the same element count"))')),
            Line(),
            make_it("field"),
            ]+[
            make_it("result%d" % i, is_const=False)
            for i in range(discr.dimensions)
            ]+[
            Line(),
        # }}}

        # {{{ computation
            For("element_number_t eg_el_nr = 0",
                "eg_el_nr < to_ers.size()",
                "++eg_el_nr",
                Block([
                    Initializer(
                        Value("node_number_t", "from_el_base"),
                        "from_ers.start() + eg_el_nr*COL_COUNT"),
                    Initializer(
                        Value("node_number_t", "to_el_base"),
                        "to_ers.start() + eg_el_nr*ROW_COUNT"),
                    Line(),
                    For("unsigned i = 0",
                        "i < ROW_COUNT",
                        "++i",
                        Block([
                            Initializer(Value("value_type", "drst_%d" % rst), 0)
                            for rst in range(discr.dimensions)
                            ]+[
                            Line(),
                            ]+[
                            For("unsigned j = 0",
                                "j < COL_COUNT",
                                "++j",
                                Block([
                                    S("drst_%(rst)d += "
                                        "diffmat_rst%(rst)d(i, j)*field_it[from_el_base+j]"
                                        % {"rst":rst})
                                    for rst in range(discr.dimensions)
                                    ])
                                ),
                            Line(),
                            ]+[
                            Assign("result%d_it[to_el_base+i]" % rst,
                                "drst_%d" % rst)
                            for rst in range(discr.dimensions)
                            ])
                        )
                    ])
                )
            ])
        # }}}

        # {{{ compilation
        mod.add_function(FunctionBody(fdecl, fbody))

        #print "----------------------------------------------------------------"
        #print mod.generate()
        #raw_input()

        compiled_func = mod.compile(self.discr.toolchain).diff

        if self.discr.instrumented:
            from hedge.tools import time_count_flop

            compiled_func = time_count_flop(compiled_func,
                    discr.diff_timer, discr.diff_counter,
                    discr.diff_flop_counter,
                    flops=discr.dimensions*(
                        2 # mul+add
                        * ldis.node_count() * len(elgroup.members)
                        * ldis.node_count()
                        +
                        2 * discr.dimensions
                        * len(elgroup.members) * ldis.node_count()),
                    increment=discr.dimensions)

        return compiled_func