Python Table 예제들, pytools.Table Python 예제들

예제 #1

0

파일 보기

파일: runalyzer.py 프로젝트: inducer/datapyle

def table_from_cursor(cursor):
    from pytools import Table
    tbl = Table()
    tbl.add_row([column[0] for column in cursor.description])
    for row in cursor:
        tbl.add_row(row)
    return tbl

예제 #2

0

파일 보기

파일: convergence.py 프로젝트: damiendr/pytools

    def __str__(self):
        from pytools import Table
        tbl = Table()
        tbl.add_row(("p", "error"))

        for p, err in zip(self.orders, self.errors):
            tbl.add_row((str(p), str(err)))

        return str(tbl)

예제 #3

0

파일 보기

파일: reduction-perf.py 프로젝트: DirkHaehnel/pycuda

def main():
    from pytools import Table
    tbl = Table()
    tbl.add_row(("type", "size [MiB]", "time [ms]", "mem.bw [GB/s]"))

    from random import shuffle
    for dtype_out in [numpy.float32, numpy.float64]:
        for ex in range(15,27):
            sz = 1 << ex
            print sz

            from pycuda.curandom import rand as curand
            a_gpu = curand((sz,))
            b_gpu = curand((sz,))
            assert sz == a_gpu.shape[0]
            assert len(a_gpu.shape) == 1

            from pycuda.reduction import get_sum_kernel, get_dot_kernel
            krnl = get_dot_kernel(dtype_out, a_gpu.dtype)

            elapsed = [0]

            def wrap_with_timer(f):
                def result(*args, **kwargs):
                    start = cuda.Event()
                    stop = cuda.Event()
                    start.record()
                    f(*args, **kwargs)
                    stop.record()
                    stop.synchronize()
                    elapsed[0] += stop.time_since(start)

                return result

            # warm-up
            for i in range(3):
                krnl(a_gpu, b_gpu)

            cnt = 10

            for i in range(cnt):
                krnl(a_gpu, b_gpu,
                #krnl(a_gpu, 
                        kernel_wrapper=wrap_with_timer)

            bytes = a_gpu.nbytes*2*cnt
            secs = elapsed[0]*1e-3

            tbl.add_row((str(dtype_out), a_gpu.nbytes/(1<<20), elapsed[0]/cnt, bytes/secs/1e9))

    print tbl

예제 #4

0

파일 보기

파일: elwise-perf.py 프로젝트: FreddieWitherden/pycuda

def main():
    from pytools import Table
    tbl = Table()
    tbl.add_row(("size [MiB]", "time [s]", "mem.bw [GB/s]"))

    import pycuda.gpuarray as gpuarray

    # they're floats, i.e. 4 bytes each
    for power in range(10, 28):
        size = 1<<power
        print(size)

        a = gpuarray.empty((size,), dtype=numpy.float32)
        b = gpuarray.empty_like(a)
        a.fill(1)
        b.fill(2)

        if power > 20:
            count = 10
        else:
            count = 100

        elapsed = [0]

        def add_timer(_, time):
            elapsed[0] += time()

        for i in range(count):
            a.mul_add(1, b, 2, add_timer)

        bytes = a.nbytes*count*3
        bytes = a.nbytes*count*3

        tbl.add_row((a.nbytes/(1<<20), elapsed[0]/count, bytes/elapsed[0]/1e9))

    print(tbl)

예제 #5

0

파일 보기

파일: convergence.py 프로젝트: allansnielsen/hedge

    def pretty_print(self, abscissa_label="N", error_label="Error", gliding_mean=2):
        from pytools import Table

        tbl = Table()
        tbl.add_row((abscissa_label, error_label, "Running EOC"))

        gm_eoc = self.estimate_order_of_convergence(gliding_mean)
        for i, (absc, err) in enumerate(self.history):
            if i < gliding_mean-1:
                tbl.add_row((str(absc), str(err), ""))
            else:
                tbl.add_row((str(absc), str(err), str(gm_eoc[i-gliding_mean+1,1])))

        if len(self.history) > 1:
            return str(tbl) + "\n\nOverall EOC: %s" % self.estimate_order_of_convergence()[0,1]
        else:
            return str(tbl)

예제 #6

0

파일 보기

def main():
    from pytools import Table
    tbl = Table()
    tbl.add_row(("size [MiB]", "time [s]", "mem.bw [GB/s]"))

    import pycuda.gpuarray as gpuarray

    # they're floats, i.e. 4 bytes each
    for power in range(10, 28):
        size = 1 << power
        print(size)

        a = gpuarray.empty((size, ), dtype=numpy.float32)
        b = gpuarray.empty_like(a)
        a.fill(1)
        b.fill(2)

        if power > 20:
            count = 10
        else:
            count = 100

        elapsed = [0]

        def add_timer(_, time):
            elapsed[0] += time()

        for i in range(count):
            a.mul_add(1, b, 2, add_timer)

        bytes = a.nbytes * count * 3
        bytes = a.nbytes * count * 3

        tbl.add_row((a.nbytes / (1 << 20), elapsed[0] / count,
                     bytes / elapsed[0] / 1e9))

    print(tbl)

예제 #7

0

파일 보기

def main():
    import pycuda.gpuarray as gpuarray

    sizes = []
    times = []
    flops = []
    flopsCPU = []
    timesCPU = []

    for power in range(10, 25):  # 24
        size = 1 << power
        print size
        sizes.append(size)
        a = gpuarray.zeros((size, ), dtype=numpy.float32)

        if power > 20:
            count = 100
        else:
            count = 1000

        #start timer
        start = drv.Event()
        end = drv.Event()
        start.record()

        #cuda operation which fills the array with random numbers
        for i in range(count):
            curandom.rand((size, ))

        #stop timer
        end.record()
        end.synchronize()

        #calculate used time
        secs = start.time_till(end) * 1e-3

        times.append(secs / count)
        flops.append(size)

        #cpu operations which fills teh array with random data
        a = numpy.array((size, ), dtype=numpy.float32)

        #start timer
        start = drv.Event()
        end = drv.Event()
        start.record()

        #cpu operation which fills the array with random data
        for i in range(count):
            numpy.random.rand(size).astype(numpy.float32)

        #stop timer
        end.record()
        end.synchronize()

        #calculate used time
        secs = start.time_till(end) * 1e-3

        #add results to variable
        timesCPU.append(secs / count)
        flopsCPU.append(size)

    #calculate pseudo flops
    flops = [f / t for f, t in zip(flops, times)]
    flopsCPU = [f / t for f, t in zip(flopsCPU, timesCPU)]

    #print the data out
    tbl = Table()
    tbl.add_row(("Size", "Time GPU", "Size/Time GPU", "Time CPU",
                 "Size/Time CPU", "GPU vs CPU speedup"))
    for s, t, f, tCpu, fCpu in zip(sizes, times, flops, timesCPU, flopsCPU):
        tbl.add_row((s, t, f, tCpu, fCpu, f / fCpu))
    print tbl

예제 #8

0

파일 보기

def main():
    from pytools import Table

    tbl = Table()
    tbl.add_row(("type", "size [MiB]", "time [ms]", "mem.bw [GB/s]"))

    from random import shuffle

    for dtype_out in [numpy.float32, numpy.float64]:
        for ex in range(15, 27):
            sz = 1 << ex
            print(sz)

            from pycuda.curandom import rand as curand

            a_gpu = curand((sz, ))
            b_gpu = curand((sz, ))
            assert sz == a_gpu.shape[0]
            assert len(a_gpu.shape) == 1

            from pycuda.reduction import get_sum_kernel, get_dot_kernel

            krnl = get_dot_kernel(dtype_out, a_gpu.dtype)

            elapsed = [0]

            def wrap_with_timer(f):
                def result(*args, **kwargs):
                    start = cuda.Event()
                    stop = cuda.Event()
                    start.record()
                    f(*args, **kwargs)
                    stop.record()
                    stop.synchronize()
                    elapsed[0] += stop.time_since(start)

                return result

            # warm-up
            for i in range(3):
                krnl(a_gpu, b_gpu)

            cnt = 10

            for i in range(cnt):
                krnl(
                    a_gpu,
                    b_gpu,
                    # krnl(a_gpu,
                    kernel_wrapper=wrap_with_timer,
                )

            bytes = a_gpu.nbytes * 2 * cnt
            secs = elapsed[0] * 1e-3

            tbl.add_row((
                str(dtype_out),
                a_gpu.nbytes / (1 << 20),
                elapsed[0] / cnt,
                bytes / secs / 1e9,
            ))

    print(tbl)

예제 #9

0

파일 보기

def main():
    import pycuda.gpuarray as gpuarray

    sizes = []
    times_gpu = []
    flops_gpu = []
    flops_cpu = []
    times_cpu = []

    from pycuda.tools import bitlog2

    max_power = bitlog2(drv.mem_get_info()[0]) - 2
    # they're floats, i.e. 4 bytes each
    for power in range(10, max_power):
        size = 1 << power
        print(size)
        sizes.append(size)
        a = gpuarray.zeros((size,), dtype=numpy.float32)
        b = gpuarray.zeros((size,), dtype=numpy.float32)
        b.fill(1)

        if power > 20:
            count = 100
        else:
            count = 1000

        # gpu -----------------------------------------------------------------
        start = drv.Event()
        end = drv.Event()
        start.record()

        for i in range(count):
            a + b

        end.record()
        end.synchronize()

        secs = start.time_till(end) * 1e-3

        times_gpu.append(secs / count)
        flops_gpu.append(size)
        del a
        del b

        # cpu -----------------------------------------------------------------
        a_cpu = numpy.random.randn(size).astype(numpy.float32)
        b_cpu = numpy.random.randn(size).astype(numpy.float32)

        # start timer
        from time import time

        start = time()
        for i in range(count):
            a_cpu + b_cpu
        secs = time() - start

        times_cpu.append(secs / count)
        flops_cpu.append(size)

    # calculate pseudo flops
    flops_gpu = [f / t for f, t in zip(flops_gpu, times_gpu)]
    flops_cpu = [f / t for f, t in zip(flops_cpu, times_cpu)]

    from pytools import Table

    tbl = Table()
    tbl.add_row(
        (
            "Size",
            "Time GPU",
            "Size/Time GPU",
            "Time CPU",
            "Size/Time CPU",
            "GPU vs CPU speedup",
        )
    )
    for s, t, f, t_cpu, f_cpu in zip(sizes, times_gpu, flops_gpu, times_cpu, flops_cpu):
        tbl.add_row((s, t, f, t_cpu, f_cpu, f / f_cpu))
    print(tbl)

예제 #10

0

파일 보기

파일: test_gpuarray_speed.py 프로젝트: berlinguyinca/pycuda

def main():
    drv.init()
    assert drv.Device.count() >= 1
    ctx = drv.Device(0).make_context()

    import pycuda.gpuarray as gpuarray

    # make sure all the kernels are compiled
    gpuarray.GPUArray.compile_kernels()
    print "done compiling"

    sizes = []
    times = []
    flops = []
    flopsCPU = []
    timesCPU = []

    for power in range(10, 25):  # 24
        size = 1 << power
        print size
        sizes.append(size)
        a = gpuarray.zeros((size,), dtype=numpy.float32)
        b = gpuarray.zeros((size,), dtype=numpy.float32)
        b.fill(1)

        if power > 20:
            count = 100
        else:
            count = 1000

        # start timer
        start = drv.Event()
        end = drv.Event()
        start.record()

        # cuda operation which adds two arrays over count time to get an average
        for i in range(count):
            a + b

        # stop timer
        end.record()
        end.synchronize()

        # calculate used time
        secs = start.time_till(end) * 1e-3

        times.append(secs / count)
        flops.append(size)

        # cpu operations which adds two arrays
        aCpu = numpy.random.randn(size).astype(numpy.float32)
        bCpu = numpy.random.randn(size).astype(numpy.float32)

        # start timer
        start = drv.Event()
        end = drv.Event()
        start.record()

        # cpu operation which adds two arrays over count time to get an average
        for i in range(count):
            aCpu + bCpu

        # stop timer
        end.record()
        end.synchronize()

        # calculate used time
        secs = start.time_till(end) * 1e-3

        # add results to variable
        timesCPU.append(secs / count)
        flopsCPU.append(size)

    # calculate pseudo flops
    flops = [f / t for f, t in zip(flops, times)]
    flopsCPU = [f / t for f, t in zip(flopsCPU, timesCPU)]

    # print the data out

    from pytools import Table

    tbl = Table()
    tbl.add_row(("Size", "Time GPU", "Size/Time GPU", "Time CPU", "Size/Time CPU", "GPU vs CPU speedup"))
    for s, t, f, tCpu, fCpu in zip(sizes, times, flops, timesCPU, flopsCPU):
        tbl.add_row((s, t, f, tCpu, fCpu, f / fCpu))
    print tbl

예제 #11

0

파일 보기

파일: fluxgather.py 프로젝트: felipeh/hedge

    def __call__(self, eval_dependency, lift_plan):
        discr = self.discr
        fplan = self.plan
        given = fplan.given
        elgroup, = discr.element_groups

        all_fluxes_on_faces = [gpuarray.empty(
                given.matmul_preimage_shape(lift_plan),
                dtype=given.float_type,
                allocator=discr.pool.allocate)
                for i in range(len(self.fluxes))]

        fdata = self.flux_face_data_block(elgroup)
        ilist_data = self.index_list_data()

        block, gather, texref_map = self.get_kernel(fdata, ilist_data,
                for_benchmark=False)

        for dep_expr in self.all_deps:
            dep_field = eval_dependency(dep_expr)

            from hedge.tools import is_zero
            if is_zero(dep_field):
                if dep_expr in self.dep_to_tag:
                    dep_field = discr.boundary_zeros(self.dep_to_tag[dep_expr])
                else:
                    dep_field = discr.volume_zeros()

            assert dep_field.dtype == given.float_type
            dep_field.bind_to_texref_ext(texref_map[dep_expr],
                    allow_double_hack=True)

        if set(["cuda_flux", "cuda_debugbuf"]) <= discr.debug:
            debugbuf = gpuarray.zeros((10000,), dtype=given.float_type)
        else:
            from hedge.backends.cuda.tools import FakeGPUArray
            debugbuf = FakeGPUArray()

        if discr.instrumented:
            discr.flux_gather_timer.add_timer_callable(gather.prepared_timed_call(
                    (len(discr.blocks), 1), block,
                    debugbuf.gpudata,
                    fdata.device_memory,
                    *tuple(fof.gpudata for fof in all_fluxes_on_faces)
                    ))

            discr.gmem_bytes_gather.add(
                    len(discr.blocks) * fdata.block_bytes
                    +
                    given.float_size()
                    * (
                        # fetch
                        len(self.fluxes)
                        * 2*fdata.fp_count
                        * fplan.dofs_per_face

                        # store
                        + len(discr.blocks)
                        * len(self.fluxes)
                        * fplan.microblocks_per_block()
                        * fplan.aligned_face_dofs_per_microblock()
                        ))
        else:
            gather.prepared_call(
                    (len(discr.blocks), 1), block,
                    debugbuf.gpudata,
                    fdata.device_memory,
                    *tuple(fof.gpudata for fof in all_fluxes_on_faces)
                    )

        if set(["cuda_flux", "cuda_debugbuf"]) <= discr.debug:
            from hedge.tools import get_rank, wait_for_keypress
            if get_rank(discr) == 0:
                copied_debugbuf = debugbuf.get()
                print "DEBUG", len(discr.blocks)
                numpy.set_printoptions(linewidth=130)
                #print numpy.reshape(copied_debugbuf, (32, 16))
                print copied_debugbuf[:50]

                #for i in range(len(discr.blocks)*6):
                    #print i, copied_debugbuf[i*16:(i+1)*16]
                    #print i, [x-10000 for x in sorted(copied_debugbuf[i*16:(i+1)*16]) if x != 0]

                wait_for_keypress(discr)

        if "cuda_flux" in discr.debug:
            from hedge.tools import get_rank, wait_for_keypress
            if get_rank(discr) == 0:
                numpy.set_printoptions(linewidth=130, precision=2, threshold=10**6)
                if True:

                    cols = []
                    for k in range(len(all_fluxes_on_faces)):
                        my_fof = all_fluxes_on_faces[k].get()
                        def sstruc(a):
                            result = ""
                            for i in a:
                                if i == 0:
                                    result += "0"
                                elif abs(i) < 1e-10:
                                    result += "-"
                                elif numpy.isnan(i):
                                    result += "N"
                                elif i == 17:
                                    result += "*"
                                else:
                                    result += "#"

                            return result

                        useful_sz = given.block_count \
                                * given.microblocks_per_block \
                                * lift_plan.aligned_preimage_dofs_per_microblock

                        my_col = []
                        i = 0
                        while i < useful_sz:
                            my_col.append(sstruc(my_fof[i:i+16]))
                            i += 16

                        cols.append(my_col)

                    from pytools import Table
                    tbl = Table()
                    tbl.add_row(["num"]+range(len(cols)))
                    i = 0
                    for row in zip(*cols):
                        tbl.add_row((i,)+row)
                        i += 1
                    print tbl
                else:
                    for i in range(len(all_fluxes_on_faces)):
                        print i
                        print all_fluxes_on_faces[i].get()

                wait_for_keypress(discr)
                #print "B", [la.norm(fof.get()) for fof in all_fluxes_on_faces]

        return all_fluxes_on_faces

예제 #12

0

파일 보기

파일: MeasureGpuarraySpeedRandom.py 프로젝트: hohiroki/Lattice-Boltzmann

def main():
    import pycuda.gpuarray as gpuarray

    sizes = []
    times = []
    flops = []
    flopsCPU = []
    timesCPU = []
    
    for power in range(10, 25): # 24
        size = 1<<power
        print size
        sizes.append(size)
        a = gpuarray.zeros((size,), dtype=numpy.float32)

        if power > 20:
            count = 100
        else:
            count = 1000

        #start timer
        start = drv.Event()
        end = drv.Event()
        start.record()

        #cuda operation which fills the array with random numbers
        for i in range(count):
            curandom.rand((size, ))
            
        #stop timer
        end.record()
        end.synchronize()
        
        #calculate used time
        secs = start.time_till(end)*1e-3

        times.append(secs/count)
        flops.append(size)

        #cpu operations which fills teh array with random data
        a = numpy.array((size,), dtype=numpy.float32)

        #start timer
        start = drv.Event()
        end = drv.Event()
        start.record()

        #cpu operation which fills the array with random data        
        for i in range(count):
            numpy.random.rand(size).astype(numpy.float32)

        #stop timer
        end.record()
        end.synchronize()
        
        #calculate used time
        secs = start.time_till(end)*1e-3

        #add results to variable
        timesCPU.append(secs/count)
        flopsCPU.append(size)
            
            
    #calculate pseudo flops
    flops = [f/t for f, t in zip(flops,times)]
    flopsCPU = [f/t for f, t in zip(flopsCPU,timesCPU)]

    #print the data out
    tbl = Table()
    tbl.add_row(("Size", "Time GPU", "Size/Time GPU", "Time CPU","Size/Time CPU","GPU vs CPU speedup"))
    for s, t, f,tCpu,fCpu in zip(sizes, times, flops,timesCPU,flopsCPU):
        tbl.add_row((s,t,f,tCpu,fCpu,f/fCpu))
    print tbl

예제 #13

0

파일 보기

파일: test_math_speed.py 프로젝트: berlinguyinca/pycuda

    #stop timer
    end.record()
    end.synchronize()
        
    #calculate used time
    secs = start.time_till(end)

    return secs

#iterate over all methods and time the execution time with different array sizes
print "compile kernels"
kernel._compile_kernels(kernel)

#generate our output table, one for gpu, one for cpu
tblCPU = Table()
tblGPU = Table()
tblSPD = Table()

#contains all the method names
methods = ["size"]

for name in dir(cuma):
    if (name.startswith("__") and name.endswith("__")) == False:
        method = getattr(cuma, name)

        if type(method) == types.FunctionType:
            methods.append(name)
        
tblCPU.add_row(methods)
tblGPU.add_row(methods)

예제 #14

0

파일 보기

파일: measure_gpuarray_speed.py 프로젝트: DirkHaehnel/pycuda

def main():
    import pycuda.gpuarray as gpuarray

    sizes = []
    times_gpu = []
    flops_gpu = []
    flops_cpu = []
    times_cpu = []

    from pycuda.tools import bitlog2
    max_power = bitlog2(drv.mem_get_info()[0]) - 2
    # they're floats, i.e. 4 bytes each
    for power in range(10, max_power):
        size = 1<<power
        print size
        sizes.append(size)
        a = gpuarray.zeros((size,), dtype=numpy.float32)
        b = gpuarray.zeros((size,), dtype=numpy.float32)
        b.fill(1)

        if power > 20:
            count = 100
        else:
            count = 1000

        # gpu -----------------------------------------------------------------
        start = drv.Event()
        end = drv.Event()
        start.record()

        for i in range(count):
            a+b

        end.record()
        end.synchronize()

        secs = start.time_till(end)*1e-3

        times_gpu.append(secs/count)
        flops_gpu.append(size)
        del a
        del b

        # cpu -----------------------------------------------------------------
        a_cpu = numpy.random.randn(size).astype(numpy.float32)
        b_cpu = numpy.random.randn(size).astype(numpy.float32)

        #start timer
        from time import time
        start = time()
        for i in range(count):
            a_cpu + b_cpu
        secs = time() - start

        times_cpu.append(secs/count)
        flops_cpu.append(size)


    # calculate pseudo flops
    flops_gpu = [f/t for f, t in zip(flops_gpu,times_gpu)]
    flops_cpu = [f/t for f, t in zip(flops_cpu,times_cpu)]

    from pytools import Table
    tbl = Table()
    tbl.add_row(("Size", "Time GPU", "Size/Time GPU",
        "Time CPU","Size/Time CPU","GPU vs CPU speedup"))
    for s, t, f, t_cpu, f_cpu in zip(sizes, times_gpu, flops_gpu, times_cpu, flops_cpu):
        tbl.add_row((s, t, f, t_cpu, f_cpu, f/f_cpu))
    print tbl

예제 #15

0

파일 보기

    def __call__(self, eval_dependency, lift_plan):
        discr = self.discr
        fplan = self.plan
        given = fplan.given
        elgroup, = discr.element_groups

        all_fluxes_on_faces = [
            gpuarray.empty(given.matmul_preimage_shape(lift_plan),
                           dtype=given.float_type,
                           allocator=discr.pool.allocate)
            for i in range(len(self.fluxes))
        ]

        fdata = self.flux_face_data_block(elgroup)
        ilist_data = self.index_list_data()

        block, gather, texref_map = self.get_kernel(fdata,
                                                    ilist_data,
                                                    for_benchmark=False)

        for dep_expr in self.all_deps:
            dep_field = eval_dependency(dep_expr)

            from hedge.tools import is_zero
            if is_zero(dep_field):
                if dep_expr in self.dep_to_tag:
                    dep_field = discr.boundary_zeros(self.dep_to_tag[dep_expr])
                else:
                    dep_field = discr.volume_zeros()

            assert dep_field.dtype == given.float_type, "Wrong types: %s: %s, %s: %s" % (
                dep_expr, dep_field.dtype, given, given.float_type)
            dep_field.bind_to_texref_ext(texref_map[dep_expr],
                                         allow_double_hack=True)

        if set(["cuda_flux", "cuda_debugbuf"]) <= discr.debug:
            debugbuf = gpuarray.zeros((10000, ), dtype=given.float_type)
        else:
            from hedge.backends.cuda.tools import FakeGPUArray
            debugbuf = FakeGPUArray()

        if discr.instrumented:
            discr.flux_gather_timer.add_timer_callable(
                gather.prepared_timed_call(
                    (len(discr.blocks), 1), block, debugbuf.gpudata,
                    fdata.device_memory,
                    *tuple(fof.gpudata for fof in all_fluxes_on_faces)))

            discr.gmem_bytes_gather.add(
                len(discr.blocks) * fdata.block_bytes + given.float_size() * (
                    # fetch
                    len(self.fluxes) * 2 * fdata.fp_count * fplan.dofs_per_face

                    # store
                    + len(discr.blocks) * len(self.fluxes) *
                    fplan.microblocks_per_block() *
                    fplan.aligned_face_dofs_per_microblock()))
        else:
            gather.prepared_call(
                (len(discr.blocks), 1), block, debugbuf.gpudata,
                fdata.device_memory,
                *tuple(fof.gpudata for fof in all_fluxes_on_faces))

        if set(["cuda_flux", "cuda_debugbuf"]) <= discr.debug:
            from hedge.tools import get_rank, wait_for_keypress
            if get_rank(discr) == 0:
                copied_debugbuf = debugbuf.get()
                print "DEBUG", len(discr.blocks)
                numpy.set_printoptions(linewidth=130)
                #print numpy.reshape(copied_debugbuf, (32, 16))
                print copied_debugbuf[:50]

                #for i in range(len(discr.blocks)*6):
                #print i, copied_debugbuf[i*16:(i+1)*16]
                #print i, [x-10000 for x in sorted(copied_debugbuf[i*16:(i+1)*16]) if x != 0]

                wait_for_keypress(discr)

        if "cuda_flux" in discr.debug:
            from hedge.tools import get_rank, wait_for_keypress
            if get_rank(discr) == 0:
                numpy.set_printoptions(linewidth=130,
                                       precision=2,
                                       threshold=10**6)
                if True:

                    cols = []
                    for k in range(len(all_fluxes_on_faces)):
                        my_fof = all_fluxes_on_faces[k].get()

                        def sstruc(a):
                            result = ""
                            for i in a:
                                if i == 0:
                                    result += "0"
                                elif abs(i) < 1e-10:
                                    result += "-"
                                elif numpy.isnan(i):
                                    result += "N"
                                elif i == 17:
                                    result += "*"
                                else:
                                    result += "#"

                            return result

                        useful_sz = given.block_count \
                                * given.microblocks_per_block \
                                * lift_plan.aligned_preimage_dofs_per_microblock

                        my_col = []
                        i = 0
                        while i < useful_sz:
                            my_col.append(sstruc(my_fof[i:i + 16]))
                            i += 16

                        cols.append(my_col)

                    from pytools import Table
                    tbl = Table()
                    tbl.add_row(["num"] + range(len(cols)))
                    i = 0
                    for row in zip(*cols):
                        tbl.add_row((i, ) + row)
                        i += 1
                    print tbl
                else:
                    for i in range(len(all_fluxes_on_faces)):
                        print i
                        print all_fluxes_on_faces[i].get()

                wait_for_keypress(discr)
                #print "B", [la.norm(fof.get()) for fof in all_fluxes_on_faces]

        return all_fluxes_on_faces