def test_bitlog(self):
    """Check that bitlog2 returns floor(log2(n)) for a few known inputs."""
    from pycuda.tools import bitlog2

    for value, expected in [(17, 4), (0xAFFE, 15), (0x3AFFE, 17), (0xCC3AFFE, 27)]:
        assert bitlog2(value) == expected
def test_mempool(self):
    """Exercise DeviceMemoryPool by allocating large blocks while holding
    at most ten allocations alive, then releasing everything."""
    from pycuda.tools import DeviceMemoryPool, bitlog2

    pool = DeviceMemoryPool()
    free, _total = drv.mem_get_info()
    top_exp = bitlog2(free)

    # Rolling window of live allocations; old ones are freed back to the pool.
    pending = []
    for exponent in range(top_exp - 6, top_exp - 4):
        for _ in range(100):
            pending.append(pool.allocate(1 << exponent))
            if len(pending) > 10:
                pending.pop(0)

    del pending
    pool.stop_holding()
def test_mempool(self):
    """Stress DeviceMemoryPool: allocate near-maximal blocks while keeping at
    most ``maxlen`` allocations alive, then return everything to the pool.

    Fix: the local ``maxlen`` was defined but never used; the queue-length
    check repeated the magic number 10 instead. The check now uses ``maxlen``
    (same value, so behavior is unchanged for callers).
    """
    from pycuda.tools import bitlog2
    from pycuda.tools import DeviceMemoryPool

    pool = DeviceMemoryPool()
    maxlen = 10  # cap on simultaneously-held allocations
    queue = []
    free, total = drv.mem_get_info()
    e0 = bitlog2(free)
    # Allocate blocks a few powers of two below the free-memory size, so
    # several of them fit at once but the pool still sees large requests.
    for e in range(e0 - 6, e0 - 4):
        for i in range(100):
            queue.append(pool.allocate(1 << e))
            if len(queue) > maxlen:
                queue.pop(0)  # drop the oldest allocation back to the pool
    del queue
    pool.stop_holding()
def test_bitlog(self):
    """Verify bitlog2 against a table of sample values."""
    from pycuda.tools import bitlog2

    cases = {17: 4, 0xAFFE: 15, 0x3AFFE: 17, 0xCC3AFFE: 27}
    for arg, want in cases.items():
        assert bitlog2(arg) == want
def main():
    """Benchmark elementwise addition of float32 arrays on the GPU
    (pycuda.gpuarray) against numpy on the CPU, over a range of sizes,
    and print a comparison table of size/time rates and speedup.

    Relies on module-level ``drv`` (presumably pycuda.driver) and ``numpy``
    being imported elsewhere in the file — TODO confirm.
    """
    import pycuda.gpuarray as gpuarray

    sizes = []
    times_gpu = []
    flops_gpu = []
    flops_cpu = []
    times_cpu = []

    from pycuda.tools import bitlog2

    # Largest power of two that fits in free GPU memory, minus 2:
    # they're floats, i.e. 4 bytes each
    max_power = bitlog2(drv.mem_get_info()[0]) - 2

    for power in range(10, max_power):
        size = 1 << power
        print(size)
        sizes.append(size)
        a = gpuarray.zeros((size,), dtype=numpy.float32)
        b = gpuarray.zeros((size,), dtype=numpy.float32)
        b.fill(1)

        # Fewer repetitions for very large arrays to bound total runtime.
        if power > 20:
            count = 100
        else:
            count = 1000

        # gpu -----------------------------------------------------------------
        # Time `count` additions between two CUDA events; synchronize on the
        # end event before reading the elapsed time.
        start = drv.Event()
        end = drv.Event()
        start.record()

        for i in range(count):
            a + b  # result discarded; only the kernel launch is timed

        end.record()
        end.synchronize()

        # time_till reports milliseconds; convert to seconds.
        secs = start.time_till(end) * 1e-3

        times_gpu.append(secs / count)
        flops_gpu.append(size)
        # Free GPU memory before the next (larger) iteration.
        del a
        del b

        # cpu -----------------------------------------------------------------
        a_cpu = numpy.random.randn(size).astype(numpy.float32)
        b_cpu = numpy.random.randn(size).astype(numpy.float32)

        # start timer
        from time import time

        start = time()
        for i in range(count):
            a_cpu + b_cpu
        secs = time() - start

        times_cpu.append(secs / count)
        flops_cpu.append(size)

    # calculate pseudo flops: elements processed per second (one add per
    # element), not true floating-point operation counts.
    flops_gpu = [f / t for f, t in zip(flops_gpu, times_gpu)]
    flops_cpu = [f / t for f, t in zip(flops_cpu, times_cpu)]

    from pytools import Table

    tbl = Table()
    tbl.add_row(
        (
            "Size",
            "Time GPU",
            "Size/Time GPU",
            "Time CPU",
            "Size/Time CPU",
            "GPU vs CPU speedup",
        )
    )
    for s, t, f, t_cpu, f_cpu in zip(sizes, times_gpu, flops_gpu, times_cpu, flops_cpu):
        tbl.add_row((s, t, f, t_cpu, f_cpu, f / f_cpu))
    print(tbl)
def main():
    """Benchmark elementwise addition of float32 arrays on the GPU
    (pycuda.gpuarray) against numpy on the CPU and print a comparison table.

    Fix: the Python 2 print statements (``print size``, ``print tbl``) are a
    SyntaxError under Python 3; converted to ``print()`` calls, consistent
    with the Python 3 style used elsewhere in this file.

    Relies on module-level ``drv`` (presumably pycuda.driver) and ``numpy``
    being imported elsewhere in the file — TODO confirm.
    """
    import pycuda.gpuarray as gpuarray

    sizes = []
    times_gpu = []
    flops_gpu = []
    flops_cpu = []
    times_cpu = []

    from pycuda.tools import bitlog2

    # Largest power of two that fits in free GPU memory, minus 2:
    # they're floats, i.e. 4 bytes each
    max_power = bitlog2(drv.mem_get_info()[0]) - 2

    for power in range(10, max_power):
        size = 1 << power
        print(size)
        sizes.append(size)
        a = gpuarray.zeros((size,), dtype=numpy.float32)
        b = gpuarray.zeros((size,), dtype=numpy.float32)
        b.fill(1)

        # Fewer repetitions for very large arrays to bound total runtime.
        if power > 20:
            count = 100
        else:
            count = 1000

        # gpu -----------------------------------------------------------------
        # Time `count` additions between two CUDA events; synchronize on the
        # end event before reading the elapsed time.
        start = drv.Event()
        end = drv.Event()
        start.record()

        for i in range(count):
            a + b  # result discarded; only the kernel launch is timed

        end.record()
        end.synchronize()

        # time_till reports milliseconds; convert to seconds.
        secs = start.time_till(end) * 1e-3

        times_gpu.append(secs / count)
        flops_gpu.append(size)
        # Free GPU memory before the next (larger) iteration.
        del a
        del b

        # cpu -----------------------------------------------------------------
        a_cpu = numpy.random.randn(size).astype(numpy.float32)
        b_cpu = numpy.random.randn(size).astype(numpy.float32)

        # start timer
        from time import time

        start = time()
        for i in range(count):
            a_cpu + b_cpu
        secs = time() - start

        times_cpu.append(secs / count)
        flops_cpu.append(size)

    # calculate pseudo flops: elements processed per second (one add per
    # element), not true floating-point operation counts.
    flops_gpu = [f / t for f, t in zip(flops_gpu, times_gpu)]
    flops_cpu = [f / t for f, t in zip(flops_cpu, times_cpu)]

    from pytools import Table

    tbl = Table()
    tbl.add_row(("Size", "Time GPU", "Size/Time GPU", "Time CPU", "Size/Time CPU", "GPU vs CPU speedup"))
    for s, t, f, t_cpu, f_cpu in zip(sizes, times_gpu, flops_gpu, times_cpu, flops_cpu):
        tbl.add_row((s, t, f, t_cpu, f_cpu, f / f_cpu))
    print(tbl)