Example No. 1
# Benchmark one (nvecs, nwarps, i_chunk, k_chunk) configuration of the CUDA
# Legendre-transform kernel.  Assumes numpy as np, numpy.linalg as la and sys
# are imported, and that nblocks, nk, ni, nnz, repeat, m, odd, plan, Lambda_0,
# Lambda_1, i_stops, q, a0 and check are defined by the enclosing script.
def doit(nvecs, nwarps, i_chunk, k_chunk):
    nthreads = 32 * nwarps

    print()
    print()
    print('=== nvecs={nvecs}, nthreads={nthreads}, i_chunk={i_chunk}, k_chunk={k_chunk}'.format(**locals()))

    out = np.zeros((nblocks, 2 * nk, nvecs), dtype=np.double, order='C')

    kernel = CudaLegendreKernel(max_ni=ni,
                                nthreads=nthreads, nvecs=nvecs,
                                k_chunk=k_chunk,
                                i_chunk=i_chunk)

    if False:  # disabled: standalone benchmark of the warp-reduction kernel
        print('======== Reduction ')
        with cuda_profile() as prof:
            for rep in range(repeat):
                output = np.zeros((nblocks, 2, 16, nwarps))
                kernel.test_reduce_kernel(output, repeat=1000, nblocks=nblocks)
        print(prof.format('test_reduce_kernel',
                          nflops=nblocks * 2 * 16 * nthreads * 1000,
                          nwarps=nwarps))

    print('======== Legendre transform ')
    with cuda_profile() as prof:
        for rep in range(repeat):
            kernel.transpose_legendre_transform(m, m + odd,
                                                plan.x_squared, Lambda_0, Lambda_1,
                                                i_stops, q, out)
    # FLOP estimate: (5 + 2 * nvecs) flops per nonzero Lambda entry, presumably
    # ~5 for the recurrence and 2 per right-hand-side vector for the update.
    print(prof.format('transpose_legendre_transform',
                      nflops=nblocks * nnz * (5 + 2 * nvecs),
                      nwarps=nwarps))

    # The kernel writes its result interleaved; the rows of interest are every
    # second row of out.
    a = out[:, ::2, :]
    if check:
        # All nvecs right-hand-side columns should agree with column 0.
        if not np.all(a[:, :, 0:1] == a):
            print('NOT ALL j EQUAL!')

    # Compare block 1 against the reference result a0.
    a = a[1, :, :]
    print('Error', la.norm(a - a0) / la.norm(a0))
    sys.stdout.flush()
    return a
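
The function above only defines the benchmark body; a small driver that sweeps a few kernel configurations could look like the sketch below. The particular (nvecs, nwarps, i_chunk, k_chunk) values are illustrative assumptions, not the original script's settings.

# Hypothetical parameter sweep (values are assumptions, not from the original
# script): run doit() for several kernel configurations and collect results.
results = {}
for nvecs in (2, 4):
    for nwarps in (2, 4):
        for i_chunk, k_chunk in [(4, 32), (8, 64)]:
            results[(nvecs, nwarps, i_chunk, k_chunk)] = doit(
                nvecs, nwarps, i_chunk, k_chunk)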
Example No. 2
# Assumes pycuda.driver as drv (with a CUDA context already initialized),
# numpy as np, time and the cuda_profile helper are available, and that npix,
# nrings and lmax are defined by the enclosing script.
# Page-locked (pinned) host buffers for the input map and the FFT output.
map = drv.pagelocked_zeros(npix, np.float64)
buf = drv.pagelocked_zeros((nrings, (lmax + 1) // 2 + 1), np.complex128)

# Device buffers: 8 bytes per float64 sample, 16 bytes per complex128 entry.
map_gpu = drv.mem_alloc(npix * 8)
buf_gpu = drv.mem_alloc(nrings * ((lmax + 1) // 2 + 1) * 16)

drv.memcpy_htod(map_gpu, map)

from wavemoth.cuda import cufft

print('constructing plan')
plan = cufft.HealpixCuFFTPlan(2048, 8)

repeats = 1
print('plan constructed')
with cuda_profile() as prof:
    t0 = time()
    for i in range(repeats):
        plan.execute(map_gpu, buf_gpu)
    dt = time() - t0
    print('wall time per execute:', dt / repeats)
print('benchmark done')
# Sum the per-kernel GPU times reported by the profiler.
dt = 0
for kernel, stats in prof.kernels.items():
    dt += sum(stats['times'])
print('total kernel time:', dt)
print(prof.kernels)

drv.memcpy_dtoh(buf, buf_gpu)
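
The wall-clock timing above includes Python launch overhead. As a cross-check, the same loop can be timed with CUDA events; the sketch below assumes the same drv, plan, map_gpu, buf_gpu and repeats objects and uses pycuda's Event API.

# Time the plan with CUDA events rather than host wall-clock time.
start, stop = drv.Event(), drv.Event()
start.record()
for i in range(repeats):
    plan.execute(map_gpu, buf_gpu)
stop.record()
stop.synchronize()                 # block until all queued work has finished
gpu_ms = start.time_till(stop)     # elapsed device time in milliseconds
print('CUDA-event time per execute: %e s' % (gpu_ms / 1e3 / repeats))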
Example No. 3
    # Inside a loop (presumably over stream_objects; the loop header is omitted
    # from this excerpt): queue an asynchronous upload, transform and download
    # on each stream.
    cuda.memcpy_htod_async(q_gpu, q_slice, stream=stream)
    plan.execute_transpose_legendre(q_gpu, a_gpu, stream=stream)
    cuda.memcpy_dtoh_async(a_slice, a_gpu, stream=stream)

print('Wall-time taken to set up instruction streams ("Python overhead"): %e' % (time() - t0))
# Wait for every stream to finish before stopping the clock.
for stream, q_gpu, a_gpu in stream_objects:
    stream.synchronize()
dt = time() - t0
print('Wall-time taken to end of execution: %f total, %f per transform' % (dt, dt / ntransforms))
print('Host-to-host compute rate: %f GFLOP/s' % (ntransforms * plan.get_flops() / dt / 1e9))


# Profiled, synchronous run
print()
print('== Profiled run at nside=%d' % nside)
with cuda_profile() as prof:
    for rep in range(3):
        stream, q_gpu, a_gpu = stream_objects[0]
        cuda.memcpy_htod(q_gpu, q[0, ...])
        plan.execute_transpose_legendre(q_gpu, a_gpu)    
        cuda.memcpy_dtoh(a[0, ...], a_gpu)

print('Transfer in:  ', prof.format('memcpyHtoD', nflops=q.nbytes // ntransforms))
print('Compute:      ', prof.format('all_transpose_legendre_transforms',
                                     nflops=plan.get_flops(),
                                     nwarps=2))
print('Transfer out: ', prof.format('memcpyDtoH', nflops=a.nbytes // ntransforms))

# Check result
print()
print('== Accuracy table (m, odd, relative error)')
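
The loop that actually prints the accuracy table is cut off in this excerpt. A minimal sketch of such a loop, assuming a hypothetical CPU reference a_cpu indexed the same way as the GPU result a, and per-transform (m, odd) labels in ms and odds (none of these names are from the original), is:

import numpy.linalg as la
# Hypothetical accuracy loop: relative error of each (m, odd) block.
for idx, (m, odd) in enumerate(zip(ms, odds)):
    err = la.norm(a[idx] - a_cpu[idx]) / la.norm(a_cpu[idx])
    print('%5d  %d  %.2e' % (m, odd, err))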