def table_from_cursor(cursor):
    from pytools import Table

    tbl = Table()
    tbl.add_row([column[0] for column in cursor.description])
    for row in cursor:
        tbl.add_row(row)
    return tbl
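# Usage sketch for table_from_cursor above (not part of the original snippet):
# any DB-API cursor works, since only .description and row iteration are used.
# The sqlite3 table and values below are made up for illustration.
import sqlite3

conn = sqlite3.connect(":memory:")
conn.execute("CREATE TABLE runs (name TEXT, error REAL)")
conn.executemany("INSERT INTO runs VALUES (?, ?)",
        [("coarse", 1e-1), ("fine", 2.5e-2)])
print(table_from_cursor(conn.execute("SELECT name, error FROM runs")))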
def pretty_print(self, abscissa_label="h", error_label="Error", gliding_mean=2,
        abscissa_format="%s", error_format="%s", eoc_format="%s"):
    from pytools import Table

    tbl = Table()
    tbl.add_row((abscissa_label, error_label, "Running EOC"))

    gm_eoc = self.estimate_order_of_convergence(gliding_mean)
    for i, (absc, err) in enumerate(self.history):
        absc_str = abscissa_format % absc
        err_str = error_format % err
        if i < gliding_mean-1:
            eoc_str = ""
        else:
            eoc_str = eoc_format % (gm_eoc[i - gliding_mean + 1, 1])

        tbl.add_row((absc_str, err_str, eoc_str))

    if len(self.history) > 1:
        return "%s\n\nOverall EOC: %s" % (
                str(tbl), self.estimate_order_of_convergence()[0, 1])
    else:
        return str(tbl)
def tabulate_ascii(rows, col_fmt):
    del col_fmt  # unused: the plain-text table ignores per-column formats

    from pytools import Table
    result = Table()

    for row in rows:
        result.add_row(row)

    return str(result)
def __str__(self):
    from pytools import Table

    tbl = Table()
    tbl.add_row(("p", "error"))

    for p, err in zip(self.orders, self.errors):
        tbl.add_row((str(p), str(err)))

    return str(tbl)
def main():
    # Assumes module-level imports: numpy, and pycuda.driver as cuda.
    from pytools import Table
    tbl = Table()
    tbl.add_row(("type", "size [MiB]", "time [ms]", "mem.bw [GB/s]"))

    from random import shuffle
    for dtype_out in [numpy.float32, numpy.float64]:
        for ex in range(15, 27):
            sz = 1 << ex
            print(sz)

            from pycuda.curandom import rand as curand
            a_gpu = curand((sz,))
            b_gpu = curand((sz,))
            assert sz == a_gpu.shape[0]
            assert len(a_gpu.shape) == 1

            from pycuda.reduction import get_sum_kernel, get_dot_kernel
            krnl = get_dot_kernel(dtype_out, a_gpu.dtype)

            elapsed = [0]

            def wrap_with_timer(f):
                def result(*args, **kwargs):
                    start = cuda.Event()
                    stop = cuda.Event()
                    start.record()
                    f(*args, **kwargs)
                    stop.record()
                    stop.synchronize()
                    elapsed[0] += stop.time_since(start)

                return result

            # warm-up
            for i in range(3):
                krnl(a_gpu, b_gpu)

            cnt = 10
            for i in range(cnt):
                krnl(a_gpu, b_gpu,
                        # krnl(a_gpu,
                        kernel_wrapper=wrap_with_timer)

            bytes = a_gpu.nbytes * 2 * cnt
            secs = elapsed[0] * 1e-3

            tbl.add_row((str(dtype_out), a_gpu.nbytes / (1 << 20),
                    elapsed[0] / cnt, bytes / secs / 1e9))

    print(tbl)
def test_table():
    import math

    from pytools import Table

    tbl = Table()
    tbl.add_row(("i", "i^2", "i^3", "sqrt(i)"))

    for i in range(8):
        tbl.add_row((i, i**2, i**3, math.sqrt(i)))

    print(tbl)
    print()
    print(tbl.latex())
def ascii_table(table_format, header, rows):
    from pytools import Table
    table = Table()
    table.add_row(header)

    for input_row in rows:
        row = []
        for item in input_row:
            if item.startswith(r"\num{"):
                # Strip \num{...} formatting
                row.append(item[5:-1])
            else:
                row.append(item)
        table.add_row(row)

    return str(table)
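# Hypothetical call of ascii_table above: header and rows are invented, and the
# \num{...} wrappers mimic siunitx-style LaTeX cell values that get stripped
# for plain-text output. table_format is accepted but ignored by this helper.
print(ascii_table(
    "ascii",
    ["h", "error"],
    [
        [r"\num{0.1}", r"\num{1.2e-3}"],
        [r"\num{0.05}", r"\num{3.1e-4}"],
    ]))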
def test_cost_model(ctx, calibration_params):
    queue = cl.CommandQueue(ctx)
    actx = PyOpenCLArrayContext(queue, force_device_scalars=True)

    cost_model = QBXCostModel()

    for lpot_source in test_geometries(actx):
        lpot_source = lpot_source.copy(cost_model=cost_model)

        from pytential import GeometryCollection
        places = GeometryCollection(lpot_source)
        density_discr = places.get_discretization(places.auto_source.geometry)

        bound_op = get_bound_op(places)
        sigma = get_test_density(actx, density_discr)

        cost_S, _ = bound_op.cost_per_stage(calibration_params, sigma=sigma)
        model_result = one(cost_S.values())

        # Warm-up run.
        bound_op.eval({"sigma": sigma}, array_context=actx)

        temp_timing_results = []
        for _ in range(RUNS):
            timing_data = {}
            bound_op.eval({"sigma": sigma},
                    array_context=actx, timing_data=timing_data)
            temp_timing_results.append(one(timing_data.values()))

        timing_result = {}
        for param in model_result:
            timing_result[param] = (sum(
                    temp_timing_result[param]["process_elapsed"]
                    for temp_timing_result in temp_timing_results)) / RUNS

        from pytools import Table
        table = Table()
        table.add_row(["stage", "actual (s)", "predicted (s)"])
        for stage in model_result:
            row = [
                    stage,
                    f"{timing_result[stage]:.2f}",
                    f"{model_result[stage]:.2f}",
                    ]
            table.add_row(row)

        print(table)
def test_table():
    import math

    from pytools import Table

    tbl = Table()
    tbl.add_row(("i", "i^2", "i^3", "sqrt(i)"))

    for i in range(8):
        tbl.add_row((i, i**2, i**3, math.sqrt(i)))

    print(tbl)
    print()
    print(tbl.latex())

    # {{{ test merging

    from pytools import merge_tables
    tbl = merge_tables(tbl, tbl, tbl, skip_columns=(0,))

    print(tbl.github_markdown())
def _to_table(self, *, abscissa_label="h", error_label="Error",
        gliding_mean=2,
        abscissa_format="%s", error_format="%s", eoc_format="%s"):
    from pytools import Table
    tbl = Table()
    tbl.add_row((abscissa_label, error_label, "Running EOC"))

    gm_eoc = self.estimate_order_of_convergence(gliding_mean)
    for i, (absc, err) in enumerate(self.history):
        absc_str = abscissa_format % absc
        err_str = error_format % err
        if i < gliding_mean-1:
            eoc_str = ""
        else:
            eoc_str = eoc_format % (gm_eoc[i - gliding_mean + 1, 1])

        tbl.add_row((absc_str, err_str, eoc_str))

    if len(self.history) > 1:
        order = self.estimate_order_of_convergence()[0, 1]
        tbl.add_row(("Overall", "", eoc_format % order))

    return tbl
def pretty_print(self, abscissa_label="N", error_label="Error", gliding_mean=2):
    from pytools import Table

    tbl = Table()
    tbl.add_row((abscissa_label, error_label, "Running EOC"))

    gm_eoc = self.estimate_order_of_convergence(gliding_mean)
    for i, (absc, err) in enumerate(self.history):
        if i < gliding_mean - 1:
            tbl.add_row((str(absc), str(err), ""))
        else:
            tbl.add_row((str(absc), str(err),
                    str(gm_eoc[i - gliding_mean + 1, 1])))

    if len(self.history) > 1:
        return str(tbl) + "\n\nOverall EOC: %s" % (
                self.estimate_order_of_convergence()[0, 1])
    else:
        return str(tbl)
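# For context: a minimal driver sketch for the EOC pretty-printers above,
# assuming pytools.convergence.EOCRecorder (which records (abscissa, error)
# pairs via add_data_point). The data points below are made up.
from pytools.convergence import EOCRecorder

eoc = EOCRecorder()
for n, err in [(16, 1e-2), (32, 2.6e-3), (64, 6.4e-4)]:
    eoc.add_data_point(n, err)

print(eoc.pretty_print())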
def test_cost_model(ctx, cost_model):
    queue = cl.CommandQueue(ctx)

    for lpot_source in test_geometries(queue):
        lpot_source = lpot_source.copy(cost_model=cost_model)
        bound_op = get_bound_op(lpot_source)
        sigma = get_test_density(queue, lpot_source)

        cost_S = bound_op.get_modeled_cost(queue, sigma=sigma)
        model_result = (
                one(cost_S.values())
                .get_predicted_times(merge_close_lists=True))

        # Warm-up run.
        bound_op.eval(queue, {"sigma": sigma})

        temp_timing_results = []
        for _ in range(RUNS):
            timing_data = {}
            bound_op.eval(queue, {"sigma": sigma}, timing_data=timing_data)
            temp_timing_results.append(one(timing_data.values()))

        timing_result = {}
        for param in model_result:
            timing_result[param] = (sum(
                    temp_timing_result[param]["process_elapsed"]
                    for temp_timing_result in temp_timing_results)) / RUNS

        from pytools import Table
        table = Table()
        table.add_row(["stage", "actual (s)", "predicted (s)"])
        for stage in model_result:
            row = [
                    stage,
                    "%.2f" % timing_result[stage],
                    "%.2f" % model_result[stage],
                    ]
            table.add_row(row)

        print(table)
def main():
    # Assumes a module-level "import numpy".
    from pytools import Table
    tbl = Table()
    tbl.add_row(("size [MiB]", "time [s]", "mem.bw [GB/s]"))

    import pycuda.gpuarray as gpuarray

    # they're floats, i.e. 4 bytes each
    for power in range(10, 28):
        size = 1 << power
        print(size)

        a = gpuarray.empty((size,), dtype=numpy.float32)
        b = gpuarray.empty_like(a)
        a.fill(1)
        b.fill(2)

        if power > 20:
            count = 10
        else:
            count = 100

        elapsed = [0]

        def add_timer(_, time):
            elapsed[0] += time()

        for i in range(count):
            a.mul_add(1, b, 2, add_timer)

        # three arrays touched per mul_add: two reads, one write
        bytes = a.nbytes * count * 3

        tbl.add_row((a.nbytes / (1 << 20), elapsed[0] / count,
                bytes / elapsed[0] / 1e9))

    print(tbl)
def main():
    # Assumes module-level imports: numpy, pycuda.driver as drv,
    # and pycuda.curandom as curandom.
    import pycuda.gpuarray as gpuarray
    from pytools import Table

    sizes = []
    times = []
    flops = []
    flopsCPU = []
    timesCPU = []

    for power in range(10, 25):  # 24
        size = 1 << power
        print(size)
        sizes.append(size)
        a = gpuarray.zeros((size,), dtype=numpy.float32)

        if power > 20:
            count = 100
        else:
            count = 1000

        # start timer
        start = drv.Event()
        end = drv.Event()
        start.record()

        # cuda operation which fills the array with random numbers
        for i in range(count):
            curandom.rand((size,))

        # stop timer
        end.record()
        end.synchronize()

        # calculate used time
        secs = start.time_till(end) * 1e-3

        times.append(secs / count)
        flops.append(size)

        # cpu operations which fill the array with random data
        a = numpy.array((size,), dtype=numpy.float32)

        # start timer
        start = drv.Event()
        end = drv.Event()
        start.record()

        # cpu operation which fills the array with random data
        for i in range(count):
            numpy.random.rand(size).astype(numpy.float32)

        # stop timer
        end.record()
        end.synchronize()

        # calculate used time
        secs = start.time_till(end) * 1e-3

        # add results to variable
        timesCPU.append(secs / count)
        flopsCPU.append(size)

    # calculate pseudo flops
    flops = [f / t for f, t in zip(flops, times)]
    flopsCPU = [f / t for f, t in zip(flopsCPU, timesCPU)]

    # print the data out
    tbl = Table()
    tbl.add_row(("Size", "Time GPU", "Size/Time GPU",
        "Time CPU", "Size/Time CPU", "GPU vs CPU speedup"))
    for s, t, f, tCpu, fCpu in zip(sizes, times, flops, timesCPU, flopsCPU):
        tbl.add_row((s, t, f, tCpu, fCpu, f / fCpu))
    print(tbl)
def main():
    import pycuda.gpuarray as gpuarray

    sizes = []
    times_gpu = []
    flops_gpu = []
    flops_cpu = []
    times_cpu = []

    from pycuda.tools import bitlog2
    max_power = bitlog2(drv.mem_get_info()[0]) - 2

    # they're floats, i.e. 4 bytes each
    for power in range(10, max_power):
        size = 1 << power
        print(size)
        sizes.append(size)
        a = gpuarray.zeros((size,), dtype=numpy.float32)
        b = gpuarray.zeros((size,), dtype=numpy.float32)
        b.fill(1)

        if power > 20:
            count = 100
        else:
            count = 1000

        # gpu -----------------------------------------------------------------
        start = drv.Event()
        end = drv.Event()
        start.record()

        for i in range(count):
            a + b

        end.record()
        end.synchronize()

        secs = start.time_till(end) * 1e-3

        times_gpu.append(secs / count)
        flops_gpu.append(size)
        del a
        del b

        # cpu -----------------------------------------------------------------
        a_cpu = numpy.random.randn(size).astype(numpy.float32)
        b_cpu = numpy.random.randn(size).astype(numpy.float32)

        # start timer
        from time import time
        start = time()
        for i in range(count):
            a_cpu + b_cpu
        secs = time() - start

        times_cpu.append(secs / count)
        flops_cpu.append(size)

    # calculate pseudo flops
    flops_gpu = [f / t for f, t in zip(flops_gpu, times_gpu)]
    flops_cpu = [f / t for f, t in zip(flops_cpu, times_cpu)]

    from pytools import Table
    tbl = Table()
    tbl.add_row((
        "Size",
        "Time GPU",
        "Size/Time GPU",
        "Time CPU",
        "Size/Time CPU",
        "GPU vs CPU speedup",
        ))
    for s, t, f, t_cpu, f_cpu in zip(
            sizes, times_gpu, flops_gpu, times_cpu, flops_cpu):
        tbl.add_row((s, t, f, t_cpu, f_cpu, f / f_cpu))

    print(tbl)
def main():
    # Assumes module-level imports: numpy and pycuda.driver as drv.
    drv.init()
    assert drv.Device.count() >= 1

    ctx = drv.Device(0).make_context()

    import pycuda.gpuarray as gpuarray

    # make sure all the kernels are compiled
    # (GPUArray.compile_kernels only exists in old pycuda releases)
    gpuarray.GPUArray.compile_kernels()
    print("done compiling")

    sizes = []
    times = []
    flops = []
    flopsCPU = []
    timesCPU = []

    for power in range(10, 25):  # 24
        size = 1 << power
        print(size)
        sizes.append(size)
        a = gpuarray.zeros((size,), dtype=numpy.float32)
        b = gpuarray.zeros((size,), dtype=numpy.float32)
        b.fill(1)

        if power > 20:
            count = 100
        else:
            count = 1000

        # start timer
        start = drv.Event()
        end = drv.Event()
        start.record()

        # cuda operation which adds two arrays over count time to get an average
        for i in range(count):
            a + b

        # stop timer
        end.record()
        end.synchronize()

        # calculate used time
        secs = start.time_till(end) * 1e-3

        times.append(secs / count)
        flops.append(size)

        # cpu operations which add two arrays
        aCpu = numpy.random.randn(size).astype(numpy.float32)
        bCpu = numpy.random.randn(size).astype(numpy.float32)

        # start timer
        start = drv.Event()
        end = drv.Event()
        start.record()

        # cpu operation which adds two arrays over count time to get an average
        for i in range(count):
            aCpu + bCpu

        # stop timer
        end.record()
        end.synchronize()

        # calculate used time
        secs = start.time_till(end) * 1e-3

        # add results to variable
        timesCPU.append(secs / count)
        flopsCPU.append(size)

    # calculate pseudo flops
    flops = [f / t for f, t in zip(flops, times)]
    flopsCPU = [f / t for f, t in zip(flopsCPU, timesCPU)]

    # print the data out
    from pytools import Table
    tbl = Table()
    tbl.add_row(("Size", "Time GPU", "Size/Time GPU",
        "Time CPU", "Size/Time CPU", "GPU vs CPU speedup"))
    for s, t, f, tCpu, fCpu in zip(sizes, times, flops, timesCPU, flopsCPU):
        tbl.add_row((s, t, f, tCpu, fCpu, f / fCpu))
    print(tbl)
def __call__(self, eval_dependency, lift_plan):
    discr = self.discr
    fplan = self.plan
    given = fplan.given
    elgroup, = discr.element_groups

    all_fluxes_on_faces = [
            gpuarray.empty(
                given.matmul_preimage_shape(lift_plan),
                dtype=given.float_type,
                allocator=discr.pool.allocate)
            for i in range(len(self.fluxes))]

    fdata = self.flux_face_data_block(elgroup)
    ilist_data = self.index_list_data()

    block, gather, texref_map = self.get_kernel(fdata, ilist_data,
            for_benchmark=False)

    for dep_expr in self.all_deps:
        dep_field = eval_dependency(dep_expr)

        from hedge.tools import is_zero
        if is_zero(dep_field):
            if dep_expr in self.dep_to_tag:
                dep_field = discr.boundary_zeros(self.dep_to_tag[dep_expr])
            else:
                dep_field = discr.volume_zeros()

        assert dep_field.dtype == given.float_type, \
                "Wrong types: %s: %s, %s: %s" % (
                        dep_expr, dep_field.dtype, given, given.float_type)
        dep_field.bind_to_texref_ext(texref_map[dep_expr],
                allow_double_hack=True)

    if set(["cuda_flux", "cuda_debugbuf"]) <= discr.debug:
        debugbuf = gpuarray.zeros((10000,), dtype=given.float_type)
    else:
        from hedge.backends.cuda.tools import FakeGPUArray
        debugbuf = FakeGPUArray()

    if discr.instrumented:
        discr.flux_gather_timer.add_timer_callable(
                gather.prepared_timed_call(
                    (len(discr.blocks), 1), block,
                    debugbuf.gpudata,
                    fdata.device_memory,
                    *tuple(fof.gpudata for fof in all_fluxes_on_faces)))

        discr.gmem_bytes_gather.add(
                len(discr.blocks) * fdata.block_bytes
                + given.float_size() * (
                    # fetch
                    len(self.fluxes)
                    * 2 * fdata.fp_count
                    * fplan.dofs_per_face

                    # store
                    + len(discr.blocks)
                    * len(self.fluxes)
                    * fplan.microblocks_per_block()
                    * fplan.aligned_face_dofs_per_microblock()))
    else:
        gather.prepared_call(
                (len(discr.blocks), 1), block,
                debugbuf.gpudata,
                fdata.device_memory,
                *tuple(fof.gpudata for fof in all_fluxes_on_faces))

    if set(["cuda_flux", "cuda_debugbuf"]) <= discr.debug:
        from hedge.tools import get_rank, wait_for_keypress
        if get_rank(discr) == 0:
            copied_debugbuf = debugbuf.get()
            print("DEBUG", len(discr.blocks))
            numpy.set_printoptions(linewidth=130)
            #print(numpy.reshape(copied_debugbuf, (32, 16)))
            print(copied_debugbuf[:50])

            #for i in range(len(discr.blocks)*6):
                #print(i, copied_debugbuf[i*16:(i+1)*16])
                #print(i, [x-10000 for x in
                #    sorted(copied_debugbuf[i*16:(i+1)*16]) if x != 0])

            wait_for_keypress(discr)

    if "cuda_flux" in discr.debug:
        from hedge.tools import get_rank, wait_for_keypress
        if get_rank(discr) == 0:
            numpy.set_printoptions(linewidth=130, precision=2, threshold=10**6)

            if True:
                cols = []
                for k in range(len(all_fluxes_on_faces)):
                    my_fof = all_fluxes_on_faces[k].get()

                    def sstruc(a):
                        result = ""
                        for i in a:
                            if i == 0:
                                result += "0"
                            elif abs(i) < 1e-10:
                                result += "-"
                            elif numpy.isnan(i):
                                result += "N"
                            elif i == 17:
                                result += "*"
                            else:
                                result += "#"

                        return result

                    useful_sz = given.block_count \
                            * given.microblocks_per_block \
                            * lift_plan.aligned_preimage_dofs_per_microblock

                    my_col = []
                    i = 0
                    while i < useful_sz:
                        my_col.append(sstruc(my_fof[i:i + 16]))
                        i += 16

                    cols.append(my_col)

                from pytools import Table
                tbl = Table()
                tbl.add_row(["num"] + list(range(len(cols))))

                i = 0
                for row in zip(*cols):
                    tbl.add_row((i,) + row)
                    i += 1

                print(tbl)
            else:
                for i in range(len(all_fluxes_on_faces)):
                    print(i)
                    print(all_fluxes_on_faces[i].get())

            wait_for_keypress(discr)

    #print("B", [la.norm(fof.get()) for fof in all_fluxes_on_faces])

    return all_fluxes_on_faces
# generate our output tables, one for gpu, one for cpu, one for the speedup
tblCPU = Table()
tblGPU = Table()
tblSPD = Table()

# contains all the method names
methods = ["size"]
for name in dir(cuma):
    if not (name.startswith("__") and name.endswith("__")):
        method = getattr(cuma, name)
        if type(method) == types.FunctionType:
            methods.append(name)

tblCPU.add_row(methods)
tblGPU.add_row(methods)
tblSPD.add_row(methods)

# generate arrays with different sizes
for power in range(1, 20):
    size = 1 << power

    # temp variables
    rowCPU = [size]
    rowGPU = [size]
    rowSPD = [size]

    print("calculating: ", size)

    for name in dir(cuma):