def test_tensordot_autotune_pybind(self):
    tensordot_str = """
    def tensordot(float(N, C1, C2, H, W) I0, float(N, C2, C3, H, W) I1) -> (O)
    {
        O(n, c1, c3, h, w) +=! I0(n, c1, c2, h, w) * I1(n, c2, c3, h, w)
    }
    """
    entry_point = "tensordot"
    N, C1, C2, C3, H, W = 40, 16, 8, 20, 13, 15

    with tempfile.NamedTemporaryFile() as cache_file:
        I0 = torch.randn(N, C1, C2, H, W, device='cuda')
        I1 = torch.randn(N, C2, C3, H, W, device='cuda')

        # Tune from naive options; the tuning results are stored in the cache file.
        tuner = tc.Tuner(tensordot_str, cache_file.name)
        top1 = tuner.tune(
            entry_point, (I0, I1), tc.MappingOptions('naive'), tuner_config)

        import tensor_comprehensions.tclib as tclib
        executor = tclib.compile(tensordot_str, entry_point, (I0, I1), top1)
        O = executor.run((I0, I1), ())

        # Reload the best options from the cache; they must match the tuner's top1.
        cache = tc.MappingOptionsCache(cache_file.name)
        best_options, = cache.load(tensordot_str, entry_point, (I0, I1), 10)
        assert str(top1) == str(best_options), (
            "Expected the same but found {}\nand\n{}".format(
                top1, best_options))

        executor = tclib.compile(
            tensordot_str, entry_point, (I0, I1), best_options)
        O = executor.run((I0, I1), ())

        # No simple torch baseline, compare against naive
        executor = tclib.compile(
            tensordot_str, entry_point, (I0, I1), tc.MappingOptions('naive'))
        ref = executor.run((I0, I1), ())

        tc.assert_almost_equal(ref, O, I0, I1, operations=C2)
def compile(tc: str,
            entry_point: str,
            mapping_options: Union[str, MappingOptions],
            *inputs: torch.Tensor) -> Executor:
    r"""Returns a compiled, callable, low-overhead :class:`Executor`.

    An example of usage is provided in :class:`Executor`.

    :param tc: a string containing one or more TC defs.
    :param entry_point: the name of the TC def to compile and execute.
    :param mapping_options: the options to use for compilation.
    :param inputs: PyTorch Tensors for which the compiled kernel is specialized.

    :rtype: :class:`Executor`, a low-overhead callable class to launch the
        kernel compiled from the :code:`entry_point`.
    """
    mapping_options = (
        MappingOptions(mapping_options)
        if isinstance(mapping_options, str) else mapping_options)
    return Executor(tclib.compile(tc, entry_point, inputs, mapping_options))
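# Hedged usage sketch for the `compile` wrapper above, assuming it is exposed at
# the package level as tensor_comprehensions.compile. The TC string, the tensor
# shapes, and the direct call on the returned Executor (the docstring only says
# it is "callable") are illustrative assumptions, not taken from the original file.
import torch
import tensor_comprehensions as tc

mm = """
def matmul(float(M,N) A, float(N,K) B) -> (C) {
    C(m, k) +=! A(m, r_n) * B(r_n, k)
}
"""
A, B = torch.randn(3, 4, device='cuda'), torch.randn(4, 5, device='cuda')
# A plain string is accepted for mapping_options; the wrapper converts it to
# MappingOptions before compiling.
matmul = tc.compile(mm, "matmul", 'naive', A, B)
C = matmul(A, B)  # assumed callable form returning the output tensor(s)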
def test_matmul_pybind(self):
    mm_str = """
    def matmul(float(M,N) A, float(N,K) B) -> (C) {
        C(m, k) +=! A(m, r_n) * B(r_n, k)
    }
    """
    A, B = (torch.randn(3, 4, device='cuda'),
            torch.randn(4, 5, device='cuda'))

    import tensor_comprehensions.tclib as tclib
    executor = tclib.compile(mm_str, "matmul", (A, B), tc.MappingOptions('naive'))
    C = executor.run((A, B), ())
    torch.cuda.synchronize()
    expected = torch.mm(A, B)
    torch.cuda.synchronize()
    tc.assert_almost_equal(C, expected, A, B, operations=4)

    # Run again, writing into the previously allocated output tensor.
    C = executor.run((A, B), (C, ))
    tc.assert_almost_equal(C, torch.mm(A, B), A, B, operations=4)
compilation_cache = CompilationCache(mm)

tuner_config = TunerConfig().threads(8).pop_size(25).generations(3).devices("0")

################################################################################
# 1. Use the simple high-overhead compile/run C++ API
#    If one can keep state in their layer or wishes to experiment with TC,
#    this is a simple entry point.
#    If state cannot be kept, be aware that this API has a non-trivial overhead
#    when output sizes need to be inferred and outputs allocated.
#    Compilation itself has a prohibitive cost and needs to be memoized either
#    by holding on to the executor or by using the low-overhead abstraction,
#    see below.
################################################################################
executor = compile(mm, "matmul", (A, B), MappingOptions('naive'))
C = executor.run((A, B))

time_tc(100,
        "simple API (in place)\t",
        lambda name, ins: executor.unchecked_run(ins, (C, )),
        "matmul",
        (A, B))

time_tc(100,
        "simple API (with allocation overhead)\t",
        lambda name, ins: executor.unchecked_run(ins),
        "matmul",
        (A, B))

################################################################################
# 2. Use the C++ API to build a low-overhead compilation cache and time it
################################################################################
# Compilation returns an allocated tuple of outputs with the proper shapes.
# Allocation overhead is negligible compared to compilation overhead.
compilation_cache.compile("matmul", (A, B), MappingOptions('naive'))
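# Hedged continuation sketch: once "matmul" has been compiled into the cache,
# later launches are served from it by entry-point name. The run/unchecked_run
# method names and their argument layout mirror the Executor calls above and
# are assumptions here, as is the timing label.
C = compilation_cache.run("matmul", (A, B))
time_tc(100,
        "low-overhead cache (in place)\t",
        lambda name, ins: compilation_cache.unchecked_run(name, ins, (C, )),
        "matmul",
        (A, B))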
""" mat1, mat2 = torch.randn(300, 400).cuda(), torch.randn(400, 500).cuda() ################################################################################ # 1. Use the simple high-overhead compile/run C++ API # If one can keep state in their layer or wishes to experiment with TC, # this is a simple entry point. # If state cannot be kept, be aware that this API has a non-trivial overhead # when outputs sizes need to be inferred and outputs allocated. # Compilation itself has a prohibitive cost and needs to be memoized either # by holding on to the executor or by using the low-overhead abstraction, see # below ################################################################################ from tensor_comprehensions.tclib import compile executor = compile(mm, "matmul", (mat1, mat2), MappingOptions()) outputs = executor.run((mat1, mat2), ()) outputs = executor.unchecked_run((mat1, mat2), tuple(outputs)) time_tc(100, "simple API\t", lambda name, ins: executor.unchecked_run(ins, tuple(outputs)), "matmul", (mat1, mat2)) time_tc(100, "simple API (with allocation overhead)\t", lambda name, ins: executor.unchecked_run(ins, ()), "matmul", (mat1, mat2)) ################################################################################ # 2. Use the C++ API to build a low-overhead compilation cache and time it ################################################################################ from tensor_comprehensions.tclib import CompilationCache compilation_cache = CompilationCache(mm)