def test_mwc(rounds=5000, nblocks=64, blockwidth=512): import pycuda.driver as cuda from pycuda.compiler import SourceModule import time nthreads = blockwidth * nblocks seeds = make_seeds(nthreads, host_seed=42) dseeds = cuda.to_device(seeds) mod = SourceModule(assemble_code(mwctestlib)) for trial in range(2): print "Trial %d, on CPU: " % trial, sums = np.zeros(nthreads, dtype=np.uint64) ctime = time.time() mults = seeds[:, 0].astype(np.uint64) states = seeds[:, 1] carries = seeds[:, 2] for i in range(rounds): step = np.frombuffer((mults * states + carries).data, dtype=np.uint32).reshape((nthreads, 2)) states[:] = step[:, 0] carries[:] = step[:, 1] sums += states ctime = time.time() - ctime print "Took %g seconds." % ctime print "Trial %d, on device: " % trial, dsums = cuda.mem_alloc(8 * nthreads) fun = mod.get_function("test_mwc") dtime = fun(dseeds, dsums, np.float32(rounds), block=(blockwidth, 1, 1), grid=(nblocks, 1), time_kernel=True) print "Took %g seconds." % dtime dsums = cuda.from_device(dsums, nthreads, np.uint64) if not np.all(np.equal(sums, dsums)): print "Sum discrepancy!" print sums print dsums
def test_mwc(rounds=5000, nblocks=64, blockwidth=512): import pycuda.driver as cuda from pycuda.compiler import SourceModule import time nthreads = blockwidth * nblocks seeds = make_seeds(nthreads, host_seed=42) dseeds = cuda.to_device(seeds) mod = SourceModule(assemble_code(mwctestlib)) for trial in range(2): print "Trial %d, on CPU: " % trial, sums = np.zeros(nthreads, dtype=np.uint64) ctime = time.time() mults = seeds[0].astype(np.uint64) states = seeds[1] carries = seeds[2] for i in range(rounds): step = np.frombuffer((mults * states + carries).data, dtype=np.uint32).reshape((2, nthreads), order='F') states[:] = step[0] carries[:] = step[1] sums += states ctime = time.time() - ctime print "Took %g seconds." % ctime print "Trial %d, on device: " % trial, dsums = cuda.mem_alloc(8*nthreads) fun = mod.get_function("test_mwc") dtime = fun(dseeds, dsums, np.float32(rounds), block=(blockwidth,1,1), grid=(nblocks,1), time_kernel=True) print "Took %g seconds." % dtime dsums = cuda.from_device(dsums, nthreads, np.uint64) if not np.all(np.equal(sums, dsums)): print "Sum discrepancy!" print sums print dsums
test_cr(const float *times, const float *knots, const float *t, float *r) { int i = threadIdx.x + blockDim.x * blockIdx.x; r[i] = catmull_rom(times, knots, t[i]); } ''')

if __name__ == "__main__":
    # Test spline evaluation. This code will probably drift pretty often.
    import pycuda.driver as cuda
    from pycuda.compiler import SourceModule
    # Imported only for its import-time side effect; the name is not used
    # below (pycuda.autoinit documents that it sets up a CUDA context).
    import pycuda.autoinit
    from cuburn.genome import SplEval

    # A packer that is finalized immediately, with nothing requested from it
    # in between, is handed to assemble_code together with BaseCode.
    gp = GenomePacker("unused")
    gp.finalize()
    mod = SourceModule(assemble_code(BaseCode, gp))

    # 16 sorted sample times: four fixed values (-2, 0, 1, 3) plus twelve
    # random draws from np.random.rand.
    times = np.sort(np.concatenate(([-2.0, 0.0, 1.0, 3.0], np.random.rand(12))))
    # One random knot value per sample time.
    knots = np.random.randn(16)

    print times
    print knots

    # 1024 evenly spaced evaluation points covering [0, 1], as float32.
    evaltimes = np.float32(np.linspace(0, 1, 1024))
    # SplEval receives the pairs flattened to [t0, k0, t1, k1, ...].
    sp = SplEval([x for k in zip(times, knots) for x in k])
    # Host-side spline values at every evaluation point.
    vals = np.array([sp(t) for t in evaltimes], dtype=np.float32)

    # 32-entry float32 buffers: the first 16 slots hold the real times/knots,
    # the rest stay at 1e9 (times) and 0 (knots).
    # NOTE(review): presumably padding expected by the device-side spline
    # lookup -- confirm against the kernel code.
    dtimes = np.empty((32,), dtype=np.float32)
    dtimes.fill(1e9)
    dtimes[:16] = times
    dknots = np.zeros_like(dtimes)
    dknots[:16] = knots
int i = threadIdx.x + blockDim.x * blockIdx.x; r[i] = catmull_rom(times, knots, t[i]); } """ )

if __name__ == "__main__":
    # Test spline evaluation. This code will probably drift pretty often.
    import pycuda.driver as cuda
    from pycuda.compiler import SourceModule
    # Imported only for its import-time side effect; the name is not used
    # below (pycuda.autoinit documents that it sets up a CUDA context).
    import pycuda.autoinit
    from cuburn.genome import SplEval

    # A packer that is finalized immediately, with nothing requested from it
    # in between, is handed to assemble_code together with BaseCode.
    gp = GenomePacker("unused")
    gp.finalize()
    mod = SourceModule(assemble_code(BaseCode, gp))

    # 16 sorted sample times: four fixed values (-2, 0, 1, 3) plus twelve
    # random draws from np.random.rand.
    times = np.sort(np.concatenate(([-2.0, 0.0, 1.0, 3.0], np.random.rand(12))))
    # One random knot value per sample time.
    knots = np.random.randn(16)

    print times
    print knots

    # 1024 evenly spaced evaluation points covering [0, 1], as float32.
    evaltimes = np.float32(np.linspace(0, 1, 1024))
    # SplEval receives the pairs flattened to [t0, k0, t1, k1, ...].
    sp = SplEval([x for k in zip(times, knots) for x in k])
    # Host-side spline values at every evaluation point.
    vals = np.array([sp(t) for t in evaltimes], dtype=np.float32)

    # 32-entry float32 buffers: the first 16 slots hold the real times/knots,
    # the rest stay at 1e9 (times) and 0 (knots).
    # NOTE(review): presumably padding expected by the device-side spline
    # lookup -- confirm against the kernel code.
    dtimes = np.empty((32,), dtype=np.float32)
    dtimes.fill(1e9)
    dtimes[:16] = times
    dknots = np.zeros_like(dtimes)
    dknots[:16] = knots