Example #1
    def __call__(self, op_class, field):
        discr = self.discr
        given = self.plan.given

        d = discr.dimensions
        elgroup, = discr.element_groups

        block, func = self.get_kernel(op_class, elgroup)

        assert field.dtype == given.float_type, "Wrong types: %s: %s, %s: %s" % (field, field.dtype, given, given.float_type)

        use_debugbuf = set(["cuda_diff", "cuda_debugbuf"]) <= discr.debug
        if use_debugbuf:
            debugbuf = gpuarray.zeros((512,), dtype=given.float_type)
        else:
            from hedge.backends.cuda.tools import FakeGPUArray
            debugbuf = FakeGPUArray()

        rst_diff = [discr.volume_empty() for axis in range(d)]
        rst_diff_gpudata = [subarray.gpudata for subarray in rst_diff]

        if discr.instrumented:
            discr.diff_op_timer.add_timer_callable(
                    func.prepared_timed_call(self.grid, block,
                        debugbuf.gpudata, field.gpudata, *rst_diff_gpudata))

            block_gmem_floats = (
                    # matrix fetch
                    given.microblock.aligned_floats
                    * discr.dimensions
                    * given.dofs_per_el()
                    * self.plan.parallelism.serial
                    * self.plan.parallelism.parallel
                    # field fetch
                    + given.microblock.aligned_floats
                    * self.plan.parallelism.total()
                    )

            gmem_bytes = given.float_size() * (
                    self.grid[0] * block_gmem_floats
                    # field store
                    + len(discr.nodes))

            discr.gmem_bytes_diff.add(gmem_bytes)
        else:
            func.prepared_call(self.grid, block,
                    debugbuf.gpudata, field.gpudata, *rst_diff_gpudata)

        if use_debugbuf:
            copied_debugbuf = debugbuf.get()
            print "DEBUG"
            print field.shape
            #print numpy.reshape(copied_debugbuf, (len(copied_debugbuf)//16, 16))
            print copied_debugbuf
            raw_input()

        return rst_diff
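
A minimal sketch of the recurring pattern in Example #1: a real debug buffer is only allocated when the relevant debug flags are present, otherwise a dummy object keeps the kernel's argument list unchanged, and the instrumented branch launches through prepared_timed_call so the elapsed GPU time can be handed to a timer. The fill_debug kernel and the local FakeGPUArray stand-in below are hypothetical illustrations; hedge's own FakeGPUArray lives in hedge.backends.cuda.tools.

import numpy
import pycuda.autoinit  # noqa: F401 -- creates a CUDA context
import pycuda.gpuarray as gpuarray
from pycuda.compiler import SourceModule

mod = SourceModule("""
__global__ void fill_debug(float *debugbuf, float value)
{ debugbuf[threadIdx.x] = value; }
""")
func = mod.get_function("fill_debug")
func.prepare("Pf")  # one device pointer, one float scalar

debug = set(["cuda_diff", "cuda_debugbuf"])  # stands in for discr.debug
use_debugbuf = set(["cuda_diff", "cuda_debugbuf"]) <= debug
if use_debugbuf:
    debugbuf = gpuarray.zeros((512,), dtype=numpy.float32)
else:
    # stand-in for hedge.backends.cuda.tools.FakeGPUArray; the real hedge
    # kernels only emit debug stores when debugging is compiled in, so the
    # dummy pointer is never dereferenced
    class FakeGPUArray(object):
        gpudata = 0
    debugbuf = FakeGPUArray()

grid, block = (1, 1), (512, 1, 1)
instrumented = True
if instrumented:
    # prepared_timed_call returns a 0-ary callable reporting GPU seconds;
    # hedge hands such a callable to discr.diff_op_timer.add_timer_callable()
    get_elapsed = func.prepared_timed_call(grid, block,
            debugbuf.gpudata, numpy.float32(17))
    print("kernel time: %g s" % get_elapsed())
else:
    func.prepared_call(grid, block, debugbuf.gpudata, numpy.float32(17))

if use_debugbuf:
    print(debugbuf.get()[:8])
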
Example #2
    def __call__(self, op_class, field):
        discr = self.discr
        given = self.plan.given

        d = discr.dimensions
        elgroup, = discr.element_groups

        block, func = self.get_kernel(op_class, elgroup)

        assert field.dtype == given.float_type

        use_debugbuf = set(["cuda_diff", "cuda_debugbuf"]) <= discr.debug
        if use_debugbuf:
            debugbuf = gpuarray.zeros((512,), dtype=given.float_type)
        else:
            from hedge.backends.cuda.tools import FakeGPUArray
            debugbuf = FakeGPUArray()

        rst_diff = [discr.volume_empty() for axis in range(d)]
        rst_diff_gpudata = [subarray.gpudata for subarray in rst_diff]

        if discr.instrumented:
            discr.diff_op_timer.add_timer_callable(
                    func.prepared_timed_call(self.grid, block,
                        debugbuf.gpudata, field.gpudata, *rst_diff_gpudata))

            block_gmem_floats = (
                    # matrix fetch
                    given.microblock.aligned_floats
                    * discr.dimensions
                    * given.dofs_per_el()
                    * self.plan.parallelism.serial
                    * self.plan.parallelism.parallel
                    # field fetch
                    + given.microblock.aligned_floats
                    * self.plan.parallelism.total()
                    )

            gmem_bytes = given.float_size() * (
                    self.grid[0] * block_gmem_floats
                    # field store
                    + len(discr.nodes))

            discr.gmem_bytes_diff.add(gmem_bytes)
        else:
            func.prepared_call(self.grid, block,
                    debugbuf.gpudata, field.gpudata, *rst_diff_gpudata)

        if use_debugbuf:
            copied_debugbuf = debugbuf.get()
            print "DEBUG"
            print field.shape
            #print numpy.reshape(copied_debugbuf, (len(copied_debugbuf)//16, 16))
            print copied_debugbuf
            raw_input()

        return rst_diff
Example #3
    def __call__(self, in_vector, prepped_mat, prepped_scaling, out_vector=None):
        discr = self.discr
        elgroup, = discr.element_groups
        given = self.plan.given

        kernel, in_vector_texref, scaling_texref = \
                self.get_kernel(prepped_scaling is not None)

        if out_vector is None:
            out_vector = discr.volume_empty()

        in_vector.bind_to_texref_ext(in_vector_texref, allow_double_hack=True)
        if prepped_scaling is not None:
            prepped_scaling.bind_to_texref_ext(scaling_texref,
                    allow_double_hack=True)

        if set([self.plan.debug_name, "cuda_debugbuf"]) <= discr.debug:
            debugbuf = gpuarray.zeros((1024,), dtype=given.float_type)
        else:
            debugbuf = FakeGPUArray()

        if discr.instrumented:
            discr.el_local_timer.add_timer_callable(
                    kernel.prepared_timed_call(
                        self.grid,
                        out_vector.gpudata,
                        prepped_mat,
                        debugbuf.gpudata,
                        len(discr.blocks)*given.microblocks_per_block,
                        ))

            from pytools import product
            discr.gmem_bytes_el_local.add(
                    given.float_size()
                    * (
                        # matrix fetch
                        self.plan.gpu_matrix_block_floats() * product(self.grid)
                        # field fetch
                        + self.plan.preimage_dofs_per_el
                        * given.dofs_per_el() * given.microblock.elements
                        * self.grid[1] * self.plan.parallelism.total()
                        # field store
                        + len(discr.nodes)
                        ))
        else:
            kernel.prepared_call(
                    self.grid,
                    out_vector.gpudata,
                    prepped_mat,
                    debugbuf.gpudata,
                    len(discr.blocks)*given.microblocks_per_block,
                    )

        if set([self.plan.debug_name, "cuda_debugbuf"]) <= discr.debug:
            copied_debugbuf = debugbuf.get()[:144*7].reshape((144,7))
            print "DEBUG"
            numpy.set_printoptions(linewidth=100)
            copied_debugbuf.shape = (144,7)
            numpy.set_printoptions(threshold=3000)

            print copied_debugbuf
            raw_input()

        return out_vector
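
Example #3 feeds the input vector (and the optional scaling vector) to the kernel through texture references rather than plain pointers. Below is a minimal, hypothetical sketch of that binding step with bind_to_texref_ext; the copy_from_tex kernel is made up for illustration, and it assumes a CUDA toolkit old enough to still accept texture references.

import numpy
import pycuda.autoinit  # noqa: F401 -- creates a CUDA context
import pycuda.gpuarray as gpuarray
from pycuda.compiler import SourceModule

mod = SourceModule("""
texture<float, 1, cudaReadModeElementType> in_vector_tex;

__global__ void copy_from_tex(float *out)
{ out[threadIdx.x] = tex1Dfetch(in_vector_tex, threadIdx.x); }
""")
func = mod.get_function("copy_from_tex")
in_vector_texref = mod.get_texref("in_vector_tex")

in_vector = gpuarray.to_gpu(numpy.arange(16, dtype=numpy.float32))
out_vector = gpuarray.empty_like(in_vector)

# bind_to_texref_ext binds the array's linear memory to the texture reference;
# allow_double_hack additionally lets float64 data be read through an int2
# texture on hardware without double-precision texture support
in_vector.bind_to_texref_ext(in_vector_texref, allow_double_hack=True)

func(out_vector.gpudata, block=(16, 1, 1), grid=(1, 1),
     texrefs=[in_vector_texref])
print(out_vector.get())
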
Example #4
    def __call__(self, in_vector, prepped_mat, out_vector=None):
        discr = self.discr
        elgroup, = discr.element_groups
        given = self.discr.given
        plan = self.plan

        kernel, block, mat_texref = self.get_kernel()

        mat_texref.set_array(prepped_mat)

        if out_vector is None:
            out_vector = discr.volume_empty()

        if set([self.plan.debug_name, "cuda_debugbuf"]) <= discr.debug:
            debugbuf = gpuarray.zeros((1024,), dtype=self.plan.given.float_type)
        else:
            debugbuf = FakeGPUArray()

        if discr.instrumented:
            discr.el_local_timer.add_timer_callable(
                    kernel.prepared_timed_call(self.grid, block,
                        out_vector.gpudata,
                        in_vector.gpudata,
                        debugbuf.gpudata,
                        plan.microblock_count,
                        ))

            block_gmem_floats = (
                        # matrix fetch
                        given.microblock.aligned_floats
                        * plan.preimage_dofs_per_el
                        * plan.parallelism.serial
                        * plan.parallelism.parallel
                        # field fetch
                        + plan.preimage_dofs_per_el
                        * plan.elements_per_microblock
                        * plan.parallelism.total()
                        )
            gmem_bytes = given.float_size() * (
                    self.grid[0] * block_gmem_floats
                    # field store
                    + len(discr.nodes))

            discr.gmem_bytes_el_local.add(gmem_bytes)
        else:
            kernel.prepared_call(self.grid, block,
                    out_vector.gpudata,
                    in_vector.gpudata,
                    debugbuf.gpudata,
                    plan.microblock_count,
                    )

        if set([self.plan.debug_name, "cuda_debugbuf"]) <= discr.debug:
            copied_debugbuf = debugbuf.get()[:144*7].reshape((144,7))
            print "DEBUG"
            numpy.set_printoptions(linewidth=100)
            copied_debugbuf.shape = (144,7)
            numpy.set_printoptions(threshold=3000)

            print copied_debugbuf
            raw_input()

        return out_vector
Example #5
    def __call__(self, eval_dependency, lift_plan):
        discr = self.discr
        fplan = self.plan
        given = fplan.given
        elgroup, = discr.element_groups

        all_fluxes_on_faces = [gpuarray.empty(
                given.matmul_preimage_shape(lift_plan),
                dtype=given.float_type,
                allocator=discr.pool.allocate)
                for i in range(len(self.fluxes))]

        fdata = self.flux_face_data_block(elgroup)
        ilist_data = self.index_list_data()

        block, gather, texref_map = self.get_kernel(fdata, ilist_data,
                for_benchmark=False)

        for dep_expr in self.all_deps:
            dep_field = eval_dependency(dep_expr)

            from hedge.tools import is_zero
            if is_zero(dep_field):
                if dep_expr in self.dep_to_tag:
                    dep_field = discr.boundary_zeros(self.dep_to_tag[dep_expr])
                else:
                    dep_field = discr.volume_zeros()

            assert dep_field.dtype == given.float_type
            dep_field.bind_to_texref_ext(texref_map[dep_expr],
                    allow_double_hack=True)

        if set(["cuda_flux", "cuda_debugbuf"]) <= discr.debug:
            debugbuf = gpuarray.zeros((10000,), dtype=given.float_type)
        else:
            from hedge.backends.cuda.tools import FakeGPUArray
            debugbuf = FakeGPUArray()

        if discr.instrumented:
            discr.flux_gather_timer.add_timer_callable(gather.prepared_timed_call(
                    (len(discr.blocks), 1), block,
                    debugbuf.gpudata,
                    fdata.device_memory,
                    *tuple(fof.gpudata for fof in all_fluxes_on_faces)
                    ))

            discr.gmem_bytes_gather.add(
                    len(discr.blocks) * fdata.block_bytes
                    +
                    given.float_size()
                    * (
                        # fetch
                        len(self.fluxes)
                        * 2*fdata.fp_count
                        * fplan.dofs_per_face

                        # store
                        + len(discr.blocks)
                        * len(self.fluxes)
                        * fplan.microblocks_per_block()
                        * fplan.aligned_face_dofs_per_microblock()
                        ))
        else:
            gather.prepared_call(
                    (len(discr.blocks), 1), block,
                    debugbuf.gpudata,
                    fdata.device_memory,
                    *tuple(fof.gpudata for fof in all_fluxes_on_faces)
                    )

        if set(["cuda_flux", "cuda_debugbuf"]) <= discr.debug:
            from hedge.tools import get_rank, wait_for_keypress
            if get_rank(discr) == 0:
                copied_debugbuf = debugbuf.get()
                print "DEBUG", len(discr.blocks)
                numpy.set_printoptions(linewidth=130)
                #print numpy.reshape(copied_debugbuf, (32, 16))
                print copied_debugbuf[:50]

                #for i in range(len(discr.blocks)*6):
                    #print i, copied_debugbuf[i*16:(i+1)*16]
                    #print i, [x-10000 for x in sorted(copied_debugbuf[i*16:(i+1)*16]) if x != 0]

                wait_for_keypress(discr)

        if "cuda_flux" in discr.debug:
            from hedge.tools import get_rank, wait_for_keypress
            if get_rank(discr) == 0:
                numpy.set_printoptions(linewidth=130, precision=2, threshold=10**6)
                if True:

                    cols = []
                    for k in range(len(all_fluxes_on_faces)):
                        my_fof = all_fluxes_on_faces[k].get()
                        def sstruc(a):
                            result = ""
                            for i in a:
                                if i == 0:
                                    result += "0"
                                elif abs(i) < 1e-10:
                                    result += "-"
                                elif numpy.isnan(i):
                                    result += "N"
                                elif i == 17:
                                    result += "*"
                                else:
                                    result += "#"

                            return result

                        useful_sz = given.block_count \
                                * given.microblocks_per_block \
                                * lift_plan.aligned_preimage_dofs_per_microblock

                        my_col = []
                        i = 0
                        while i < useful_sz:
                            my_col.append(sstruc(my_fof[i:i+16]))
                            i += 16

                        cols.append(my_col)

                    from pytools import Table
                    tbl = Table()
                    tbl.add_row(["num"]+range(len(cols)))
                    i = 0
                    for row in zip(*cols):
                        tbl.add_row((i,)+row)
                        i += 1
                    print tbl
                else:
                    for i in range(len(all_fluxes_on_faces)):
                        print i
                        print all_fluxes_on_faces[i].get()

                wait_for_keypress(discr)
                #print "B", [la.norm(fof.get()) for fof in all_fluxes_on_faces]

        return all_fluxes_on_faces
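
The sstruc helper in Example #5 compresses a slice of a flux array into a one-character-per-entry "structure string" for the debug table. A standalone copy, runnable without a GPU, just to show what the characters mean ("0" exact zero, "-" negligibly small, "N" NaN, "*" the value 17 that the debug output flags specially, "#" anything else):

import numpy

def sstruc(a):
    result = ""
    for i in a:
        if i == 0:
            result += "0"
        elif abs(i) < 1e-10:
            result += "-"
        elif numpy.isnan(i):
            result += "N"
        elif i == 17:
            result += "*"
        else:
            result += "#"
    return result

print(sstruc(numpy.array([0.0, 1e-12, numpy.nan, 17.0, 3.5])))  # prints "0-N*#"
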
Example #6
    def __call__(self, eval_dependency, lift_plan):
        discr = self.discr
        fplan = self.plan
        given = fplan.given
        elgroup, = discr.element_groups

        all_fluxes_on_faces = [
            gpuarray.empty(given.matmul_preimage_shape(lift_plan),
                           dtype=given.float_type,
                           allocator=discr.pool.allocate)
            for i in range(len(self.fluxes))
        ]

        fdata = self.flux_face_data_block(elgroup)
        ilist_data = self.index_list_data()

        block, gather, texref_map = self.get_kernel(fdata,
                                                    ilist_data,
                                                    for_benchmark=False)

        for dep_expr in self.all_deps:
            dep_field = eval_dependency(dep_expr)

            from hedge.tools import is_zero
            if is_zero(dep_field):
                if dep_expr in self.dep_to_tag:
                    dep_field = discr.boundary_zeros(self.dep_to_tag[dep_expr])
                else:
                    dep_field = discr.volume_zeros()

            assert dep_field.dtype == given.float_type, "Wrong types: %s: %s, %s: %s" % (
                dep_expr, dep_field.dtype, given, given.float_type)
            dep_field.bind_to_texref_ext(texref_map[dep_expr],
                                         allow_double_hack=True)

        if set(["cuda_flux", "cuda_debugbuf"]) <= discr.debug:
            debugbuf = gpuarray.zeros((10000, ), dtype=given.float_type)
        else:
            from hedge.backends.cuda.tools import FakeGPUArray
            debugbuf = FakeGPUArray()

        if discr.instrumented:
            discr.flux_gather_timer.add_timer_callable(
                gather.prepared_timed_call(
                    (len(discr.blocks), 1), block, debugbuf.gpudata,
                    fdata.device_memory,
                    *tuple(fof.gpudata for fof in all_fluxes_on_faces)))

            discr.gmem_bytes_gather.add(
                len(discr.blocks) * fdata.block_bytes + given.float_size() * (
                    # fetch
                    len(self.fluxes) * 2 * fdata.fp_count * fplan.dofs_per_face

                    # store
                    + len(discr.blocks) * len(self.fluxes) *
                    fplan.microblocks_per_block() *
                    fplan.aligned_face_dofs_per_microblock()))
        else:
            gather.prepared_call(
                (len(discr.blocks), 1), block, debugbuf.gpudata,
                fdata.device_memory,
                *tuple(fof.gpudata for fof in all_fluxes_on_faces))

        if set(["cuda_flux", "cuda_debugbuf"]) <= discr.debug:
            from hedge.tools import get_rank, wait_for_keypress
            if get_rank(discr) == 0:
                copied_debugbuf = debugbuf.get()
                print "DEBUG", len(discr.blocks)
                numpy.set_printoptions(linewidth=130)
                #print numpy.reshape(copied_debugbuf, (32, 16))
                print copied_debugbuf[:50]

                #for i in range(len(discr.blocks)*6):
                #print i, copied_debugbuf[i*16:(i+1)*16]
                #print i, [x-10000 for x in sorted(copied_debugbuf[i*16:(i+1)*16]) if x != 0]

                wait_for_keypress(discr)

        if "cuda_flux" in discr.debug:
            from hedge.tools import get_rank, wait_for_keypress
            if get_rank(discr) == 0:
                numpy.set_printoptions(linewidth=130,
                                       precision=2,
                                       threshold=10**6)
                if True:

                    cols = []
                    for k in range(len(all_fluxes_on_faces)):
                        my_fof = all_fluxes_on_faces[k].get()

                        def sstruc(a):
                            result = ""
                            for i in a:
                                if i == 0:
                                    result += "0"
                                elif abs(i) < 1e-10:
                                    result += "-"
                                elif numpy.isnan(i):
                                    result += "N"
                                elif i == 17:
                                    result += "*"
                                else:
                                    result += "#"

                            return result

                        useful_sz = given.block_count \
                                * given.microblocks_per_block \
                                * lift_plan.aligned_preimage_dofs_per_microblock

                        my_col = []
                        i = 0
                        while i < useful_sz:
                            my_col.append(sstruc(my_fof[i:i + 16]))
                            i += 16

                        cols.append(my_col)

                    from pytools import Table
                    tbl = Table()
                    tbl.add_row(["num"] + range(len(cols)))
                    i = 0
                    for row in zip(*cols):
                        tbl.add_row((i, ) + row)
                        i += 1
                    print tbl
                else:
                    for i in range(len(all_fluxes_on_faces)):
                        print i
                        print all_fluxes_on_faces[i].get()

                wait_for_keypress(discr)
                #print "B", [la.norm(fof.get()) for fof in all_fluxes_on_faces]

        return all_fluxes_on_faces
Example #7
    def __call__(self,
                 in_vector,
                 prepped_mat,
                 prepped_scaling,
                 out_vector=None):
        discr = self.discr
        elgroup, = discr.element_groups
        given = self.plan.given

        kernel, in_vector_texref, scaling_texref = \
                self.get_kernel(prepped_scaling is not None)

        if out_vector is None:
            out_vector = discr.volume_empty()

        in_vector.bind_to_texref_ext(in_vector_texref, allow_double_hack=True)
        if prepped_scaling is not None:
            prepped_scaling.bind_to_texref_ext(scaling_texref,
                                               allow_double_hack=True)

        if set([self.plan.debug_name, "cuda_debugbuf"]) <= discr.debug:
            debugbuf = gpuarray.zeros((1024, ), dtype=given.float_type)
        else:
            debugbuf = FakeGPUArray()

        if discr.instrumented:
            discr.el_local_timer.add_timer_callable(
                kernel.prepared_timed_call(
                    self.grid,
                    out_vector.gpudata,
                    prepped_mat,
                    debugbuf.gpudata,
                    len(discr.blocks) * given.microblocks_per_block,
                ))

            from pytools import product
            discr.gmem_bytes_el_local.add(given.float_size() * (
                # matrix fetch
                self.plan.gpu_matrix_block_floats() * product(self.grid)
                # field fetch
                + self.plan.preimage_dofs_per_el * given.dofs_per_el() *
                given.microblock.elements * self.grid[1] *
                self.plan.parallelism.total()
                # field store
                + len(discr.nodes)))
        else:
            kernel.prepared_call(
                self.grid,
                out_vector.gpudata,
                prepped_mat,
                debugbuf.gpudata,
                len(discr.blocks) * given.microblocks_per_block,
            )

        if set([self.plan.debug_name, "cuda_debugbuf"]) <= discr.debug:
            copied_debugbuf = debugbuf.get()[:144 * 7].reshape((144, 7))
            print "DEBUG"
            numpy.set_printoptions(linewidth=100)
            copied_debugbuf.shape = (144, 7)
            numpy.set_printoptions(threshold=3000)

            print copied_debugbuf
            raw_input()

        return out_vector