def test_remove_clear_list_from_fused_serial():
    """Check task launch counts when a serial kernel sits between struct-fors.

    Two sparse fields are half-initialised (odd indices into ``x``, even
    indices into ``y``), then ``inc`` is run over both fields before and
    after a serial kernel. The test asserts the number of list-generation
    and serial tasks recorded by the kernel stats, and that every
    initialised cell was incremented exactly twice while the rest read 0.
    """
    x = ti.field(ti.i32)
    y = ti.field(ti.i32)
    z = ti.field(ti.i32, shape=())

    n = 32
    # Each field gets its own pointer -> dense tree under the root.
    for sparse_field in (x, y):
        ti.root.pointer(ti.i, n).dense(ti.i, 1).place(sparse_field)

    @ti.kernel
    def init_xy():
        for i in range(n):
            if i & 1:
                x[i] = i
            else:
                y[i] = i

    init_xy()
    ti.sync()

    # Start counting only after initialisation has been flushed.
    kernel_stats = ti.get_kernel_stats()
    kernel_stats.clear()

    @ti.kernel
    def inc(f: ti.template()):
        for i in f:
            f[i] += 1

    @ti.kernel
    def serial_z():
        z[None] = 40
        z[None] += 2

    # Launch order: inc(x), inc(y), serial_z(), inc(x), inc(y).
    for target in (x, y):
        inc(target)
    serial_z()
    for target in (x, y):
        inc(target)
    ti.sync()

    launch_counts = kernel_stats.get_counters()
    # each of x and y has two listgens: root -> pointer -> dense
    assert int(launch_counts['launched_tasks_list_gen']) == 4
    # clear list tasks have been fused into serial_z
    assert int(launch_counts['launched_tasks_serial']) == 1

    x_vals = x.to_numpy()
    y_vals = y.to_numpy()
    for i in range(n):
        # Odd indices were written to x, even indices to y; the other
        # field is expected to read 0 at that index.
        if i & 1:
            written, untouched = x_vals, y_vals
        else:
            written, untouched = y_vals, x_vals
        assert written[i] == i + 2
        assert untouched[i] == 0
def test_listgen_opt_with_offsets():
    """Check that repeated struct-fors over an offset field launch few listgens.

    A pointer -> dense field placed with ``offset=-8`` is incremented ten
    times; afterwards the ``launched_tasks_list_gen`` counter must not
    exceed 2.
    """
    x = ti.field(dtype=ti.i32)
    ti.root.pointer(ti.i, 4).dense(ti.i, 4).place(x, offset=-8)

    @ti.kernel
    def inc():
        for i in x:
            x[i] += 1

    launches = 10
    for _ in range(launches):
        inc()

    ti.sync()

    counters = ti.get_kernel_stats().get_counters()
    list_gen_launches = counters['launched_tasks_list_gen']
    assert list_gen_launches <= 2
def print_async_stats(include_kernel_profiler=False):
    """Print a summary of async benchmark metrics to stdout.

    Args:
        include_kernel_profiler: When true, ``ti.kernel_profiler_print()``
            is called first, followed by an empty line, before the
            summary is printed.
    """
    # Imported locally, as in the original implementation.
    import taichi as ti

    if include_kernel_profiler:
        ti.kernel_profiler_print()
        print()

    counters = ti.get_kernel_stats().get_counters()

    # Header.
    print('=======================')
    print('Async benchmark metrics')
    print('-----------------------')

    # Metrics, read from the current config, the profiler and the counters.
    print(f'Async mode: {ti.current_cfg().async_mode}')
    print(f'Kernel time: {ti.kernel_profiler_total_time():.3f} s')
    print(f'Tasks launched: {int(counters["launched_tasks"])}')
    print(f'Instructions emitted: {int(counters["codegen_statements"])}')
    print(f'Tasks compiled: {int(counters["codegen_offloaded_tasks"])}')

    # Footer.
    print('=======================')
def print_async_stats(include_kernel_profiler=False):
    """Print a summary of async benchmark metrics to stdout.

    The "Tasks fused" line is printed only when the kernel stats counters
    contain a ``num_fused_tasks`` entry.

    Args:
        include_kernel_profiler: When true, ``ti.print_kernel_profile_info()``
            is called first, followed by an empty line, before the
            summary is printed.
    """
    if include_kernel_profiler:
        ti.print_kernel_profile_info()
        print()

    counters = ti.get_kernel_stats().get_counters()

    # Header.
    print('=======================')
    print('Async benchmark metrics')
    print('-----------------------')

    # Metrics, read from the current config, the profiler and the counters.
    print(f'Async mode: {ti.current_cfg().async_mode}')
    print(f'Kernel time: {ti.kernel_profiler_total_time():.3f} s')
    print(f'Tasks launched: {int(counters["launched_tasks"])}')
    print(f'Instructions emitted: {int(counters["codegen_statements"])}')
    print(f'Tasks compiled: {int(counters["codegen_offloaded_tasks"])}')

    # The fused-task counter is optional, so only report it when present.
    NUM_FUSED_TASKS_KEY = 'num_fused_tasks'
    has_fused_counter = NUM_FUSED_TASKS_KEY in counters
    if has_fused_counter:
        print(f'Tasks fused: {int(counters["num_fused_tasks"])}')

    # Footer.
    print('=======================')
def test_sfg_dead_store_elimination():
    """Check that dead-store elimination fires and gradients stay correct.

    ``gather`` accumulates ``x[i]**2`` into ``total_energy`` and ``scatter``
    accumulates ``x[i]`` into ``unused``. Both are run five times under
    ``ti.Tape(total_energy)``. The test then asserts that the
    ``sfg_dse_tasks`` counter is positive and that ``x.grad[i]`` is
    approximately ``2 * i`` for ``x[i] = i``.
    """
    ti.init(arch=ti.cpu, async_mode=True)
    n = 32

    x = ti.field(dtype=float, shape=n, needs_grad=True)
    total_energy = ti.field(dtype=float, shape=(), needs_grad=True)
    unused = ti.field(dtype=float, shape=())

    @ti.kernel
    def gather():
        for i in x:
            e = x[i]**2
            total_energy[None] += e

    @ti.kernel
    def scatter():
        for i in x:
            unused[None] += x[i]

    # x[i] = i, so the expected gradient of sum(x[i]**2) is 2 * i.
    x.from_numpy(np.arange(n, dtype=np.float32))
    ti.sync()

    # Start counting only after the input upload has been flushed.
    kernel_stats = ti.get_kernel_stats()
    kernel_stats.clear()

    repetitions = 5
    for _ in range(repetitions):
        with ti.Tape(total_energy):
            gather()
            scatter()

    ti.sync()
    counters = kernel_stats.get_counters()

    # gather() should be DSE'ed
    assert counters['sfg_dse_tasks'] > 0

    x_grad = x.grad.to_numpy()
    for i in range(n):
        expected_grad = 2.0 * i
        assert ti.approx(x_grad[i]) == expected_grad