def fill(arch, dtype, dsize, repeat=10):
    """Benchmark a constant-fill kernel over `dsize` bytes of `dtype`.

    Returns the minimum profiled kernel time over `repeat` launches
    (after one warm-up launch to trigger compilation).
    """
    repeat = scale_repeat(arch, dsize, repeat)
    n = dsize // dtype_size[dtype]

    # Destination field to fill.
    x = ti.field(dtype, shape=n)

    # Floating-point dtypes get a float constant; integer dtypes an int one.
    if dtype in [ti.f32, ti.f64]:

        @ti.kernel
        def fill_const(n: ti.i32):
            for i in range(n):
                x[i] = 0.1
    else:

        @ti.kernel
        def fill_const(n: ti.i32):
            for i in range(n):
                x[i] = 1

    # Warm-up launch: compile the kernel before profiling starts.
    fill_const(n)
    ti.sync()
    ti.kernel_profiler_clear()
    ti.sync()

    for _ in range(repeat):
        fill_const(n)
    ti.sync()

    # Profiler records kernels under their name plus a "_c" suffix.
    profile = ti.query_kernel_profiler(fill_const.__name__ + "_c")
    return profile.min
def reduction(arch, dtype, dsize, repeat=10):
    """Benchmark a sum-reduction kernel over `dsize` bytes of `dtype`.

    Fills a field with a constant, then times a reduction into a 0-D
    field for `repeat` launches and returns the minimum profiled time.
    """
    repeat = scale_repeat(arch, dsize, repeat)
    n = dsize // dtype_size[dtype]
    ## fill x
    x = ti.field(dtype, shape=n)
    if dtype in [ti.f32, ti.f64]:

        @ti.kernel
        def fill_const(n: ti.i32):
            for i in range(n):
                x[i] = 0.1
    else:

        @ti.kernel
        def fill_const(n: ti.i32):
            for i in range(n):
                x[i] = 1

    # compile the kernel first
    fill_const(n)

    ## reduce
    y = ti.field(dtype, shape=())
    if dtype in [ti.f32, ti.f64]:
        y[None] = 0.0
    else:
        y[None] = 0

    @ti.kernel
    def reduction(n: ti.i32):
        for i in range(n):
            # BUG FIX: the original wrote
            #     y[None] += ti.atomic_add(y[None], x[i])
            # ti.atomic_add already adds x[i] to y[None] and returns the
            # OLD value, so the outer `+=` added that old value on top —
            # double-counting (and racing). One atomic add is correct.
            ti.atomic_add(y[None], x[i])

    # compile the kernel first
    reduction(n)
    ti.sync()
    ti.kernel_profiler_clear()
    ti.sync()
    for i in range(repeat):
        reduction(n)
    ti.sync()
    kernelname = reduction.__name__
    suffix = "_c"
    quering_result = ti.query_kernel_profiler(kernelname + suffix)
    return quering_result.min
def run_benchmark():
    """Compile, warm up, and time `func(*args)`, recording stats via ti.stat_write.

    NOTE(review): `func`, `args`, and `repeat` are read from an enclosing
    scope not visible here — presumably this is a closure inside a
    benchmarking driver; confirm against the caller.
    """
    # Measure compilation: the first call triggers kernel compilation.
    compile_time = time.time()
    func(*args)  # compile the kernel first
    ti.sync()
    compile_time = time.time() - compile_time
    ti.stat_write('compilation_time', compile_time)

    # Parse "key: value" lines emitted by the core's codegen statistics.
    codegen_stat = _ti_core.stat()
    for line in codegen_stat.split('\n'):
        try:
            a, b = line.strip().split(':')
        except ValueError:
            # FIX: was a bare `except:` (swallows SystemExit/KeyboardInterrupt).
            # Only malformed lines — no ':' or more than one — raise ValueError.
            continue
        a = a.strip()
        b = int(float(b))
        # FIX: was an inconsistent if/if/elif chain; keys are mutually
        # exclusive, so one elif chain is equivalent and clearer.
        if a == 'codegen_kernel_statements':
            ti.stat_write('compiled_inst', b)
        elif a == 'codegen_offloaded_tasks':
            ti.stat_write('compiled_tasks', b)
        elif a == 'launched_tasks':
            ti.stat_write('launched_tasks', b)

    # Use 3 initial iterations to warm up
    # instruction/data caches. Discussion:
    # https://github.com/taichi-dev/taichi/pull/1002#discussion_r426312136
    for i in range(3):
        func(*args)
    ti.sync()
    ti.kernel_profiler_clear()

    # Wall-clock timing of `repeat` launches.
    t = time.time()
    for n in range(repeat):
        func(*args)
    ti.sync()
    elapsed = time.time() - t
    avg = elapsed / repeat
    ti.stat_write('wall_clk_t', avg)

    # Device-side time as reported by the kernel profiler.
    device_time = ti.kernel_profiler_total_time()
    avg_device_time = device_time / repeat
    ti.stat_write('exec_t', avg_device_time)
def benchmark():
    """Time the forward and backward MPM substeps and print per-substep ms."""
    print(
        'Also check "nvprof --print-gpu-trace python3 diffmpm_benchmark.py" for more accurate results'
    )
    iters = 100000

    # --- Forward pass: one warm-up substep, then time `iters` substeps ---
    for _ in range(1):
        p2g(0)
        grid_op()
        g2p(0)
    ti.sync()
    ti.kernel_profiler_clear()

    start = time.time()
    for _ in range(iters):
        # clear_grid()
        p2g(0)
        grid_op()
        g2p(0)
    ti.sync()
    # One substep = 3 kernels, hence the * 3 when converting to ms.
    print('forward ', (time.time() - start) / iters * 1000 * 3, 'ms')
    ti.kernel_profiler_print()

    # --- Backward pass: gradient kernels run in reverse order ---
    for _ in range(1):
        p2g.grad(0)
        grid_op.grad()
        g2p.grad(0)
    ti.sync()
    ti.kernel_profiler_clear()

    start = time.time()
    for _ in range(iters):
        # clear_grid()
        g2p.grad(0)
        grid_op.grad()
        p2g.grad(0)
    ti.sync()
    print('backward ', (time.time() - start) / iters * 1000 * 3, 'ms')
    ti.kernel_profiler_print()