def run_benchmark():
    """Compile, profile and time ``func(*args)``, reporting via ``ti.stat_write``.

    Relies on ``func``, ``args`` and ``repeat`` from the enclosing scope.
    Records, in order:

    * ``compilation_time`` -- wall time of the very first call;
    * ``instructions`` / ``offloaded_tasks`` / ``launched_kernels`` -- taken
      from the ``name: value`` lines of ``ti.core.stat()``;
    * ``running_time`` -- average wall time in seconds over ``repeat`` calls.
    """
    # The first call is timed on its own and reported as the compilation time.
    compile_start = time.time()
    func(*args)
    ti.stat_write('compilation_time', time.time() - compile_start)

    # Statistic name as printed by ti.core.stat() -> name it is recorded under.
    recorded_as = {
        'codegen_kernel_statements': 'instructions',
        'codegen_offloaded_tasks': 'offloaded_tasks',
        'launched_kernels': 'launched_kernels',
    }
    for line in ti.core.stat().split('\n'):
        try:
            name, raw_value = line.strip().split(':')
        except ValueError:
            # Not exactly one "name: value" pair on this line; skip it.
            # Only the unpacking error is swallowed so that KeyboardInterrupt
            # and other unrelated exceptions still propagate.
            continue
        name = name.strip()
        value = int(float(raw_value))
        if name in recorded_as:
            ti.stat_write(recorded_as[name], value)

    # The reason why we run 4 times is to warm up instruction/data caches.
    # Discussion: https://github.com/taichi-dev/taichi/pull/1002#discussion_r426312136
    for _ in range(4):
        func(*args)  # compile the kernel first
    ti.sync()

    start = time.time()
    for _ in range(repeat):
        func(*args)
    # Synchronize before reading the clock so pending work is included.
    ti.get_runtime().sync()
    elapsed = time.time() - start
    ti.stat_write('running_time', elapsed / repeat)
def test_2D_bit_array():
    """Write an alternating 0/1 pattern into a 4x8 bit array and read it back."""
    rows, cols = 4, 8

    # 1-bit custom integer type (second argument False).
    one_bit_type = ti.type_factory_.get_custom_int_type(1, False)
    x = ti.field(dtype=one_bit_type)

    bit_array = ti.root._bit_array(ti.ij, (rows, cols), num_bits=32)
    bit_array.place(x)
    ti.get_runtime().materialize()

    @ti.kernel
    def set_val():
        for i in range(rows):
            for j in range(cols):
                x[i, j] = (i * cols + j) % 2

    @ti.kernel
    def verify_val():
        for i in range(rows):
            for j in range(cols):
                assert x[i, j] == (i * cols + j) % 2

    set_val()
    verify_val()
def test_simple_array():
    """Store 13-bit and 19-bit quantized ints in one 32-bit bit_struct per cell.

    The values are written and checked from kernel scope first, then again
    from Python scope through the kernels' ``__wrapped__`` functions.
    """
    N = 12

    int13 = ti.quant.int(13, True)
    uint19 = ti.quant.int(19, False)
    x = ti.field(dtype=int13)
    y = ti.field(dtype=uint19)

    cells = ti.root.dense(ti.i, N)
    cells.bit_struct(num_bits=32).place(x, y)
    ti.get_runtime().materialize()

    @ti.kernel
    def set_val():
        for i in range(N):
            x[i] = -2**i
            y[i] = 2**i - 1

    @ti.kernel
    def verify_val():
        for i in range(N):
            assert x[i] == -2**i
            assert y[i] == 2**i - 1

    set_val()
    verify_val()

    # Test bit_struct SNode read and write in Python-scope by calling the
    # wrapped, untranslated function body
    for python_scope_body in (set_val.__wrapped__, verify_val.__wrapped__):
        python_scope_body()
def test_simple_array():
    """Pack a 13-bit and a 19-bit custom int into a 32-bit bit struct on CPU.

    Runs with debug mode and IR printing enabled and with CFG optimization
    turned off.
    """
    ti.init(arch=ti.cpu, debug=True, print_ir=True, cfg_optimization=False)

    N = 12
    make_int_type = ti.type_factory_.get_custom_int_type
    int13 = make_int_type(13, True)
    uint19 = make_int_type(19, False)

    x = ti.field(dtype=int13)
    y = ti.field(dtype=uint19)

    cells = ti.root.dense(ti.i, N)
    cells._bit_struct(num_bits=32).place(x, y)
    ti.get_runtime().materialize()

    @ti.kernel
    def set_val():
        for i in range(N):
            x[i] = -2**i
            y[i] = 2**i - 1

    @ti.kernel
    def verify_val():
        for i in range(N):
            assert x[i] == -2**i
            assert y[i] == 2**i - 1

    set_val()
    verify_val()
def test_unordered():
    """Check shape, ancestry and string form of a field nested as k -> i -> j."""
    val = ti.field(ti.i32)

    n = 3
    m = 7
    p = 11

    # Dense nodes are nested along axes k, i, j (in that order); the shape
    # asserted below is nevertheless (m, p, n).
    k_block = ti.root.dense(ti.k, n)
    i_block = k_block.dense(ti.i, m)
    j_block = i_block.dense(ti.j, p)
    j_block.place(val)

    assert val.dtype == ti.i32
    assert val.shape == (m, p, n)

    # parent(0) is the node itself; each further step climbs one level.
    assert val.snode.parent() == j_block
    ancestry = (val.snode, j_block, i_block, k_block, ti.root)
    for steps, expected_node in enumerate(ancestry):
        assert val.snode.parent(steps) == expected_node

    assert val.snode in j_block.get_children()
    assert j_block in i_block.get_children()
    assert i_block in k_block.get_children()

    ti.get_runtime().materialize()
    first_root = ti.FieldsBuilder.finalized_roots()[0]
    assert k_block in first_root.get_children()

    expected_str = (f'ti.root => dense {[n]} => dense {[m, n]}'
                    f' => dense {[m, p, n]} => place {[m, p, n]}')
    assert str(val.snode) == expected_str
def test_clear_all_gradients():
    """Gradients of 0-D, 1-D and 2-D fields are zeroed by ti.clear_all_gradients().

    Also checks that the first clear leaves three compiled functions and that
    a second clear does not compile any more.
    """
    n = 128
    expected_compiled = 3

    x = ti.var(ti.f32)
    y = ti.var(ti.f32)
    z = ti.var(ti.f32)
    w = ti.var(ti.f32)

    ti.root.place(x)
    ti.root.dense(ti.i, n).place(y)
    ti.root.dense(ti.i, n).dense(ti.j, n).place(z, w)
    ti.root.lazy_grad()

    def compiled_functions():
        return ti.get_runtime().get_num_compiled_functions()

    # Fill every gradient with a non-zero value.
    x.grad[None] = 3
    for row in range(n):
        y.grad[row] = 3
        for col in range(n):
            z.grad[row, col] = 5
            w.grad[row, col] = 6

    ti.clear_all_gradients()
    assert compiled_functions() == expected_compiled

    assert x.grad[None] == 0
    for row in range(n):
        assert y.grad[row] == 0
        for col in range(n):
            assert z.grad[row, col] == 0
            assert w.grad[row, col] == 0

    ti.clear_all_gradients()
    # No more kernel compilation
    assert compiled_functions() == expected_compiled
def test_fused_kernels():
    """to_torch() and from_torch() on a matrix field each compile exactly one function."""
    n = 12
    X = ti.Matrix(3, 2, ti.f32, shape=(n, n, n))

    def compiled_functions():
        return ti.get_runtime().get_num_compiled_functions()

    baseline = compiled_functions()

    tensor = X.to_torch()
    assert compiled_functions() == baseline + 1

    X.from_torch(tensor)
    assert compiled_functions() == baseline + 2
def test_matrix_field_dynamic_index_different_path_length():
    """No dynamic-index stride when the two components sit at different SNode depths."""
    v = ti.Vector.field(2, ti.i32)
    first, second = v.get_scalar_field(0), v.get_scalar_field(1)

    # One dense level for the first component, two for the second.
    ti.root.dense(ti.i, 8).place(first)
    ti.root.dense(ti.i, 2).dense(ti.i, 4).place(second)

    ti.get_runtime().materialize()
    assert v.dynamic_index_stride is None
def test_matrix_field_dynamic_index_not_pure_dense():
    """No dynamic-index stride when one component lives under a pointer SNode."""
    v = ti.Vector.field(2, ti.i32)
    first, second = v.get_scalar_field(0), v.get_scalar_field(1)

    # Same depth for both components, but the first path is dense -> pointer
    # while the second is dense -> dense.
    ti.root.dense(ti.i, 2).pointer(ti.i, 4).place(first)
    ti.root.dense(ti.i, 2).dense(ti.i, 4).place(second)

    ti.get_runtime().materialize()
    assert v.dynamic_index_stride is None
def test_div_default_ip():
    """Floor division of large floats gives 100000 when the default int type is i64."""
    runtime = ti.get_runtime()
    runtime.set_default_ip(ti.i64)

    z = ti.field(ti.f32, shape=())

    @ti.kernel
    def func():
        a = 1e15 + 1e9
        z[None] = a // 1e10

    func()

    quotient = z[None]
    assert quotient == 100000
def benchmark(func, repeat=100, args=()):
    """Return the average wall-clock time, in seconds, of one ``func(*args)`` call.

    Args:
        func: Callable to time.
        repeat: Number of timed invocations.
        args: Positional arguments forwarded to every invocation.

    Returns:
        Total elapsed seconds of the timed loop divided by ``repeat``.
    """
    import time

    import taichi as ti

    func(*args)  # compile the kernel first
    ti.sync()

    start = time.time()
    for _ in range(repeat):
        func(*args)
    # Synchronize BEFORE reading the clock so that work still pending when the
    # loop finishes is counted in the measurement.
    ti.get_runtime().sync()
    elapsed = time.time() - start
    return elapsed / repeat
def test_matrix_field_dynamic_index_different_stride():
    """No dynamic-index stride when an unrelated field is placed between components."""
    filler = ti.field(ti.f32)
    v = ti.Vector.field(3, ti.i32)
    x, y, z = (v.get_scalar_field(component) for component in range(3))

    # `filler` is placed between the second and third components.
    ti.root.dense(ti.i, 8).place(x, y, filler, z)

    ti.get_runtime().materialize()
    assert v.dynamic_index_stride is None
def test_matrix_field_dynamic_index_different_offset_bytes_in_parent_cell():
    """No dynamic-index stride when components sit at different positions in their cells."""
    padding_before = ti.field(ti.f32)
    padding_after = ti.field(ti.f32)

    v = ti.Vector.field(2, ti.i32)
    first, second = v.get_scalar_field(0), v.get_scalar_field(1)

    # The first component is placed after a padding field, the second one
    # before a padding field.
    ti.root.dense(ti.i, 8).place(padding_before, first)
    ti.root.dense(ti.i, 8).place(second, padding_after)

    ti.get_runtime().materialize()
    assert v.dynamic_index_stride is None
def cast(self, dt):
    """Return a copy of ``self`` with every entry cast to ``dt``.

    The Python types ``float`` and ``int`` are first translated to the
    runtime's default floating-point / integer type. Any other Python class
    that is a ``numbers.Number`` subclass trips an assertion.
    """
    result = self.copy()

    wants_python_number = type(dt) is type and issubclass(
        dt, numbers.Number)
    if wants_python_number:
        import taichi as ti
        if dt is float:
            dt = ti.get_runtime().default_fp
        elif dt is int:
            dt = ti.get_runtime().default_ip
        else:
            assert False

    for index in range(len(self.entries)):
        converted = impl.cast(result.entries[index], dt)
        result.entries[index] = converted
    return result
def test(*func_args, **func_kwargs):
    """Run the wrapped ``func`` once on every supported arch that passes the checkers.

    ``func`` and ``kwargs`` come from the enclosing scope. An optional
    arch-checker collection is popped from ``func_kwargs``; a 64-bit data
    support check is added to it when the default fp/ip type in effect is
    ``ti.f64`` / ``ti.i64``.
    """
    import taichi as ti

    arch_filter = func_kwargs.pop(_tests_arch_checkers_argname,
                                  _ArchCheckers())

    # Filter away archs that don't support 64-bit data.
    runtime_fp = ti.get_runtime().default_fp
    fp = func_kwargs.get('default_fp', runtime_fp)
    runtime_ip = ti.get_runtime().default_ip
    ip = func_kwargs.get('default_ip', runtime_ip)
    needs_data64 = fp == ti.f64 or ip == ti.i64
    if needs_data64:
        arch_filter.register(
            lambda arch: is_supported(arch, extension.data64))

    for arch in ti.supported_archs():
        if not arch_filter(arch):
            continue
        ti.init(arch=arch, **kwargs)
        func(*func_args, **func_kwargs)
def test_indices():
    """Check physical_index_position() for a shape-declared and a manually placed field."""
    a = ti.var(ti.f32, shape=(128, 32, 8))

    b = ti.var(ti.f32)
    # Axis j is the outer dense level, axis i the inner one.
    ti.root.dense(ti.j, 32).dense(ti.i, 16).place(b)

    ti.get_runtime().materialize()

    assert a.snode().physical_index_position() == {0: 0, 1: 1, 2: 2}
    assert b.snode().physical_index_position() == {0: 1, 1: 0}
def benchmark(func, repeat=300, args=()):
    """Time ``func(*args)`` and report the average per-call time via ``ti.stat_write``.

    Args:
        func: Callable to time.
        repeat: Number of timed invocations.
        args: Positional arguments forwarded to every invocation.

    The reported value is in milliseconds; nothing is returned.
    """
    import time

    import taichi as ti

    # The reason why we run 4 times is to warm up instruction/data caches.
    # Discussion: https://github.com/taichi-dev/taichi/pull/1002#discussion_r426312136
    warmup_runs = 4
    for _ in range(warmup_runs):
        func(*args)  # compile the kernel first
    ti.sync()

    started_at = time.time()
    for _ in range(repeat):
        func(*args)
    ti.get_runtime().sync()
    total_seconds = time.time() - started_at

    average_ms = total_seconds / repeat * 1000  # milliseconds
    ti.stat_write(average_ms)
def custom_int(self, bits, signed=True, compute_type=None):
    """Build a custom integer type through ``self.core.get_custom_int_type``.

    Args:
        bits: Forwarded as the bit count.
        signed: Forwarded as the signedness flag.
        compute_type: Type forwarded as the third argument; defaults to the
            runtime's ``default_ip``. A ``ti.core.DataType`` is replaced by
            the result of its ``get_ptr()`` before being forwarded.
    """
    import taichi as ti

    backing_type = (ti.get_runtime().default_ip
                    if compute_type is None else compute_type)
    if isinstance(backing_type, ti.core.DataType):
        backing_type = backing_type.get_ptr()
    return self.core.get_custom_int_type(bits, signed, backing_type)
def main():
    """Load the initial and target images, run the optimization loop, then render.

    Uses the module-level ``n_grid``, ``num_iterations``, ``target``,
    ``smoke``, ``loss``, ``forward`` and ``apply_grad``.
    """
    print("Loading initial and target states...")

    # Only channel 0 of each image is used, rescaled by 1/255.
    initial_smoke_img = cv2.imread("init_smoke.png")[:, :, 0] / 255.0
    resized_target = cv2.resize(cv2.imread('taichi.png'), (n_grid, n_grid))
    target_img = resized_target[:, :, 0] / 255.0

    for row in range(n_grid):
        for col in range(n_grid):
            target[row, col] = target_img[row, col]
            smoke[0, row, col] = initial_smoke_img[row, col]

    for opt in range(num_iterations):
        iteration_start = time.time()
        with ti.Tape(loss):
            # `opt % 10` is never negative here, so this is always None.
            output = "test" if opt % 10 == -1 else None
            forward(output)
        elapsed_ms = (time.time() - iteration_start) * 1000
        print('total time', elapsed_ms, 'ms')

        print('Iter', opt, ' Loss =', loss[None])
        apply_grad()

    total_compilation_time = ti.get_runtime().prog.get_total_compilation_time()
    print("Compilation time:", total_compilation_time)
    # ti.profiler_print()
    forward("output")
def test_default_ip_ndarray(dtype):
    """A vector ndarray declared with Python ``int`` takes the configured default_ip."""
    init_options = dict(arch=supported_archs_taichi_ndarray,
                        default_ip=dtype,
                        ndarray_use_torch=False)
    ti.init(**init_options)

    x = ti.Vector.ndarray(2, int, ())

    assert x.dtype == ti.get_runtime().default_ip
def init(default_fp=None,
         default_ip=None,
         print_preprocessed=None,
         debug=None,
         **kwargs):
    """Reset Taichi, apply the requested configuration and create the program.

    Args:
        default_fp: Default floating-point type to set, if not None.
        default_ip: Default integer type to set, if not None.
        print_preprocessed: Value for the runtime's ``print_preprocessed``
            attribute, if not None.
        debug: Debug flag; when None it is read from the ``TI_DEBUG``
            environment variable (default ``'0'``).
        **kwargs: Every item is assigned as an attribute on ``ti.cfg``.

    A non-empty ``TI_LOG_LEVEL`` environment variable sets the logging level
    after (and therefore on top of) the debug-driven level.
    """
    if debug is None:
        debug_env = os.environ.get('TI_DEBUG', '0')
        debug = bool(int(debug_env))

    # Make a deepcopy in case these args reference to items from ti.cfg, which are
    # actually references. If no copy is made and the args are indeed references,
    # ti.reset() could override the args to their default values.
    default_fp = _deepcopy(default_fp)
    default_ip = _deepcopy(default_ip)
    kwargs = _deepcopy(kwargs)

    import taichi as ti
    ti.reset()

    if default_fp is not None:
        ti.get_runtime().set_default_fp(default_fp)
    if default_ip is not None:
        ti.get_runtime().set_default_ip(default_ip)
    if print_preprocessed is not None:
        ti.get_runtime().print_preprocessed = print_preprocessed

    if debug:
        ti.set_logging_level(ti.DEBUG)
    ti.cfg.debug = debug

    requested_log_level = os.environ.get('TI_LOG_LEVEL', '')
    if requested_log_level:
        ti.set_logging_level(requested_log_level)

    for option_name, option_value in kwargs.items():
        setattr(ti.cfg, option_name, option_value)

    ti.get_runtime().create_program()
def _test_inconsistent_trailing_bits():
    """Materializing children with inconsistent trailing bits raises RuntimeError."""
    ti.init(arch=ti.cpu, debug=True, print_ir=True)

    x, y, z = ti.field(ti.f32), ti.field(ti.f32), ti.field(ti.f32)

    block = ti.root.pointer(ti.i, 8)

    # Here the numbers of bits of x and z are inconsistent,
    # which leads to the RuntimeError below.
    x_cells = block.dense(ti.i, 32)
    x_cells.place(x)
    z_cells = block.dense(ti.i, 16)
    z_cells.place(z)
    y_cells = block.dense(ti.j, 16)
    y_cells.place(y)

    runtime = ti.get_runtime()
    with pytest.raises(RuntimeError):
        runtime.materialize()
def decorated(*args, _gradient=False, **kwargs):
    """Dispatch to ``adjoint`` or ``primal`` (both from the enclosing scope).

    With ``_gradient`` set, only ``adjoint`` is called. Otherwise ``primal``
    is called and, when the runtime has a target tape and is not inside a
    complex kernel, this call is recorded on that tape.
    """
    if _gradient:
        adjoint(*args, **kwargs)
        return

    primal(*args, **kwargs)

    import taichi as ti
    runtime = ti.get_runtime()
    should_record = runtime.target_tape and not runtime.inside_complex_kernel
    if should_record:
        runtime.target_tape.insert(decorated, args)
def fixed(frac, signed=True, range=1.0, compute=None):
    """Create a custom float type with a ``frac``-bit integer and a fixed scale.

    Args:
        frac: Bit count of the underlying integer type.
        signed: Whether that integer type is signed.
        range: Numerator of the scale; the scale is ``range / 2**(frac - 1)``
            when signed and ``range / 2**frac`` otherwise.
        compute: Compute type; defaults to the runtime's ``default_fp``.
    """
    import taichi as ti

    # TODO: handle cases with frac > 32
    frac_type = Quant.int(bits=frac, signed=signed, compute=ti.i32)

    scale_exponent = frac - 1 if signed else frac
    scale = range / 2**scale_exponent

    compute_type = ti.get_runtime().default_fp if compute is None else compute
    return ti.type_factory.custom_float(frac_type, None, compute_type, scale)
def float(exp, frac, signed=True, compute=None):
    """Create a custom float type from an exponent and a fraction bit count.

    Args:
        exp: Bit count of the (unsigned) exponent integer type.
        frac: Bit count of the significand integer type.
        signed: Whether the significand integer type is signed.
        compute: Compute type; defaults to the runtime's ``default_fp``.
    """
    import taichi as ti

    def quant_int(bits, is_signed):
        return Quant.int(bits=bits, signed=is_signed, compute=ti.i32)

    # Exponent is always unsigned
    exp_type = quant_int(exp, False)
    # TODO: handle cases with frac > 32
    frac_type = quant_int(frac, signed)

    compute_type = ti.get_runtime().default_fp if compute is None else compute
    return ti.type_factory.custom_float(significand_type=frac_type,
                                        exponent_type=exp_type,
                                        compute_type=compute_type)
def _test_compiled_functions():
    """Track how the compiled-function count changes with the argument passed to one kernel.

    Expected counts after each call: Taichi vector ndarray -> 1, NumPy
    (6, 10) array -> still 1, torch (6, 11) tensor -> 2, SOA-layout vector
    ndarray -> 3.
    """
    @ti.kernel
    def func(a: ti.any_arr(element_dim=1)):
        for i in range(5):
            for j in range(4):
                a[i][j * j] = j * j

    def compiled_functions():
        return ti.get_runtime().get_num_compiled_functions()

    taichi_vectors = ti.Vector.ndarray(10, ti.i32, 5)
    func(taichi_vectors)
    assert compiled_functions() == 1

    numpy_array = np.zeros((6, 10), dtype=np.int32)
    func(numpy_array)
    assert compiled_functions() == 1

    import torch
    torch_tensor = torch.zeros((6, 11), dtype=torch.int32)
    func(torch_tensor)
    assert compiled_functions() == 2

    soa_vectors = ti.Vector.ndarray(10, ti.i32, 5, layout=ti.Layout.SOA)
    func(soa_vectors)
    assert compiled_functions() == 3
def test_random_int():
    """Moments of ti.random() integers, rescaled to [0, 1), match a uniform distribution.

    For each of i32 and i64, the i-th raw moment (i = 0..3) of the rescaled
    samples must be within 1% (relative) of ``1 / (i + 1)``.
    """
    for precision in (ti.i32, ti.i64):
        ti.init()
        n = 1024
        x = ti.var(ti.f32, shape=(n, n))
        ti.get_runtime().set_default_fp(ti.f64)

        # Shift and divide by the width of the integer range to rescale.
        @ti.kernel
        def fill():
            for i in range(n):
                for j in range(n):
                    v = ti.random(precision)
                    if precision == ti.i32:
                        x[i, j] = (float(v) + float(2**31)) / float(2**32)
                    else:
                        x[i, j] = (float(v) + float(2**63)) / float(2**64)

        fill()
        samples = x.to_numpy()
        for order in range(4):
            moment = (samples**order).mean()
            assert moment == approx(1 / (order + 1), rel=1e-2)
def custom_float(self,
                 significand_type,
                 exponent_type=None,
                 compute_type=None,
                 scale=1.0):
    """Build a custom float type through ``self.core.get_custom_float_type``.

    Args:
        significand_type: Forwarded as the significand type.
        exponent_type: Forwarded as the exponent type (may be None).
        compute_type: Forwarded as the compute type; when None, the result of
            ``get_ptr()`` on the runtime's ``default_fp`` is used instead.
        scale: Forwarded as the ``scale`` keyword.
    """
    import taichi as ti

    resolved_compute = compute_type
    if resolved_compute is None:
        resolved_compute = ti.get_runtime().default_fp.get_ptr()

    return self.core.get_custom_float_type(significand_type,
                                           exponent_type,
                                           resolved_compute,
                                           scale=scale)
def benchmark_flat_range():
    """Return the average seconds per call of a kernel filling a 2D field with range-for loops.

    The field is ``(N * 8) x (N * 8)`` with ``N = 512``. The kernel is called
    once before timing starts so that its first-call compilation is not part
    of the measurement; 100 calls are then timed.
    """
    a = ti.var(dt=ti.f32)
    N = 512
    timed_runs = 100

    @ti.layout
    def place():
        ti.root.dense(ti.ij, N * 8).place(a)

    @ti.kernel
    def fill():
        for j in range(N * 8):
            for i in range(N * 8):
                a[i, j] = 2.0

    # Warm-up: compile the kernel first, outside the timed region.
    fill()
    ti.get_runtime().sync()

    start = time.time()
    for _ in range(timed_runs):
        fill()
    # Synchronize before reading the clock so pending work is included.
    ti.get_runtime().sync()
    elapsed = time.time() - start
    return elapsed / timed_runs
def test_assert():
    """Disabled test: returns immediately, so nothing below the ``return`` runs.

    The unreachable remainder would enable preprocessed-source and IR
    printing and run a kernel whose assertion ``10 <= x < 20`` is evaluated
    with ``x = 20``.
    """
    return

    runtime = ti.get_runtime()
    runtime.print_preprocessed = True
    ti.cfg.print_ir = True
    # ti.cfg.arch = ti.cuda

    @ti.kernel
    def func():
        x = 20
        assert 10 <= x < 20

    func()