def make_autotuned_options_factory(
        starting_options: Optional[Union[str, MappingOptions]] = None,
        tuner_config: TunerConfig = TunerConfig(),
        cache_filename: Optional[str] = None,
        load_from_cache: Optional[bool] = False,
        store_to_cache: Optional[bool] = False
) -> (Callable[[str, str, Iterable[torch.Tensor]], MappingOptions]):
    r"""Return a factory that runs autotuning to determine the best
    :class:`~tclib.MappingOptions`.

    The returned factory just calls the :func:`autotune` function, see its
    documentation for more information.

    :rtype: a function that takes a string with multiple TC defs, an
        entry_point and input PyTorch Tensors and produces a
        :class:`~tclib.MappingOptions`.
    """
    def generate(tc: str,
                 entry_point: str,
                 *inputs: torch.Tensor) -> MappingOptions:
        return autotune(tc,
                        entry_point,
                        *inputs,
                        starting_options=starting_options,
                        tuner_config=tuner_config,
                        cache_filename=cache_filename,
                        load_from_cache=load_from_cache,
                        store_to_cache=store_to_cache)

    return generate
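# ------------------------------------------------------------------------------
# Illustrative sketch (not part of the library source): how such a factory is
# typically consumed. It assumes this package is imported as `tc`, that its
# `define` function accepts an option-generating factory (as in the TC PyTorch
# frontend documentation), and that a CUDA device is available.
import torch
import tensor_comprehensions as tc

_example_tc = "def matmul(float(M,N) A, float(N,K) B) -> (C) { C(m, k) +=! A(m, r_n) * B(r_n, k) }"
_matmul = tc.define(
    _example_tc,
    tc.make_autotuned_options_factory(
        starting_options='naive',
        tuner_config=tc.TunerConfig().threads(8).generations(3).pop_size(10)))
_A, _B = torch.randn(32, 64, device='cuda'), torch.randn(64, 16, device='cuda')
_C = _matmul.matmul(_A, _B)  # tuning runs on first use, the result is then reused
# ------------------------------------------------------------------------------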
def __init__(self,
             tc="",
             forward_name="",
             forward_force_reinforcement_tuning=False,
             backward_name="",
             backward_force_reinforcement_tuning=False,
             check_output_shapes=True,
             tuner_cache_file="",
             tuner_config=TunerConfig(),
             debug=False):
    if debug:
        assert isinstance(tc, str), type(tc)
        assert isinstance(forward_name, str), type(forward_name)
        assert isinstance(forward_force_reinforcement_tuning, bool), type(
            forward_force_reinforcement_tuning)
        assert isinstance(backward_name, str), type(backward_name)
        assert isinstance(backward_force_reinforcement_tuning, bool), type(
            backward_force_reinforcement_tuning)
        assert isinstance(check_output_shapes, bool), type(check_output_shapes)
        assert isinstance(tuner_cache_file, str), type(tuner_cache_file)
        assert isinstance(tuner_config, TunerConfig), type(tuner_config)

    self.tc = tc
    self.forward_name = forward_name
    self.forward_force_reinforcement_tuning = forward_force_reinforcement_tuning
    self.backward_name = backward_name
    self.backward_force_reinforcement_tuning = backward_force_reinforcement_tuning
    self.check_output_shapes = check_output_shapes
    self.tuner_cache_file = tuner_cache_file
    self.tuner_config = tuner_config
    self.debug = debug
    self.compilation_cache = CompilationCache(self.tc)
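# ------------------------------------------------------------------------------
# Illustrative sketch (not part of the original source): constructing a
# TcBuilder with the constructor above. The TC string, cache path and tuner
# budget are placeholders chosen only for this example; unspecified arguments
# keep their defaults.
example_builder = TcBuilder(
    tc="def matmul(float(M,N) A, float(N,K) B) -> (C) { C(m, k) +=! A(m, r_n) * B(r_n, k) }",
    forward_name="matmul",
    tuner_cache_file="/tmp/matmul_builder_cache",  # reuse this file to reinforce tuning
    tuner_config=TunerConfig().threads(8).generations(3).pop_size(10),
    debug=True)
# ------------------------------------------------------------------------------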
def autotune_and_compile(
        tc: str,
        entry_point: str,
        *inputs: torch.Tensor,
        starting_options: Optional[Union[str, MappingOptions]] = None,
        tuner_config: Optional[TunerConfig] = TunerConfig(),
        cache_filename: Optional[str] = None,
        load_from_cache: Optional[bool] = False,
        store_to_cache: Optional[bool] = False) -> Executor:
    r"""Calls autotune, compiles with best options then returns an Executor.

    Takes the same arguments as the :func:`autotune` function.

    Example:
        >>> A, B = (
        ...     torch.randn(10 ** 5, device='cuda').fill_(1.0),
        ...     torch.randn(10 ** 5, device='cuda').fill_(1.0))
        ... add = tc.autotune_and_compile(
        ...     "def add(float(N) A, float(N) B) -> (C) { C(i) = A(i) + B(i) }",
        ...     "add",
        ...     A, B,
        ...     starting_options='naive',
        ...     tuner_config=tc.TunerConfig().threads(5).generations(3).pop_size(5)
        ... )
        ... C = add(A, B)
        >>> print(C.min(), C.max())
        tensor(2., device='cuda:0') tensor(2., device='cuda:0')
    """
    best = autotune(
        tc,
        entry_point,
        *inputs,
        starting_options=starting_options,
        tuner_config=tuner_config,
        cache_filename=cache_filename,
        load_from_cache=load_from_cache,
        store_to_cache=store_to_cache)
    if best is None:
        return None
    return compile(tc, entry_point, best, *inputs)
def autotune(tc: str,
             entry_point: str,
             *inputs: torch.Tensor,
             starting_options: Optional[Union[str, MappingOptions]] = None,
             tuner_config: Optional[TunerConfig] = TunerConfig(),
             cache_filename: Optional[str] = None,
             load_from_cache: Optional[bool] = False,
             store_to_cache: Optional[bool] = False) -> MappingOptions:
    r"""Tunes the defined TC function for given inputs.

    The MappingOptions from which tuning starts is either passed explicitly via
    :code:`starting_options` or loaded from a cache file (when both
    :code:`cache_filename` and :code:`load_from_cache` are properly specified).
    Exactly one of :code:`starting_options` and :code:`load_from_cache` must be
    specified.

    Reinforcement tuning behavior can be obtained by tuning over multiple
    executions and specifying both :code:`load_from_cache` and
    :code:`store_to_cache`. It is recommended to use a single cache file for
    all TC defs and reinforce it over time.

    An example of usage is provided with :func:`autotune_and_compile`.

    :param tc: a string containing one or more TC defs.
    :param entry_point: the name of the TC def to compile and execute.
    :param inputs: PyTorch Tensors that TC should tune for. The inputs must be
        passed in the order they are also passed in the definition of
        the TC function.
    :param starting_options: :class:`~tclib.MappingOptions` from which tuning
        should start.
    :param tuner_config: :class:`~tclib.TunerConfig` to control the behavior of
        the autotuner.
    :param cache_filename: the backing cache file used to load starting options
        and/or store the best options found.
    :param load_from_cache: get the starting :class:`~tclib.MappingOptions` by
        loading from :code:`cache_filename`. If loading fails to recover an
        entry from the cache file for the given input sizes an assertion error
        will trigger.
    :param store_to_cache: optionally store the best result by appending it to
        the backing cache file.

    Returns:
        The best options found during this tuning run.
    """
    if cache_filename is not None:
        assert load_from_cache or store_to_cache, (
            "cache_filename specified, "
            "must also specify load_from_cache or store_to_cache")
    if load_from_cache or store_to_cache:
        assert cache_filename is not None, (
            "load_from_cache or store_to_cache specified, "
            "must also specify cache_filename")
    assert starting_options is not None or load_from_cache, (
        "Must specify either starting_options or load_from_cache, choose one!")
    assert starting_options is None or not load_from_cache, (
        "Cannot specify both starting_options and load_from_cache, choose one!")

    base_options = None
    if load_from_cache:
        cache = MappingOptionsCache(cache_filename)
        loaded = cache.load(tc, entry_point, inputs, 1)
        assert len(loaded) > 0, (
            "Could not load from cache for TC {} and sizes {}".format(
                entry_point, "".join(str(i.size()) + " " for i in inputs)))
        base_options = loaded[0]
    else:
        base_options = (
            MappingOptions(starting_options)
            if isinstance(starting_options, str) else starting_options)

    # TODO: This is still an implicit store behavior in the C++ API,
    #     make it explicit...
    tuner = Tuner(tc, cache_filename if store_to_cache else "")

    return tuner.tune(entry_point, inputs, base_options, tuner_config)
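# ------------------------------------------------------------------------------
# Illustrative sketch (not part of the library source): the reinforcement
# tuning pattern described in the docstring above. The TC def, tensor sizes,
# cache path and tuner budget are placeholders chosen only for this example.
import torch

_MM = "def matmul(float(M,N) A, float(N,K) B) -> (C) { C(m, k) +=! A(m, r_n) * B(r_n, k) }"
_A, _B = torch.randn(32, 64, device='cuda'), torch.randn(64, 16, device='cuda')
_CACHE = "/tmp/matmul_tuning_cache"
_small_cfg = TunerConfig().threads(8).generations(3).pop_size(10)

# First run: seed the cache from explicit starting options.
autotune(_MM, "matmul", _A, _B,
         starting_options='naive',
         store_to_cache=True,
         cache_filename=_CACHE,
         tuner_config=_small_cfg)
# Subsequent runs: reload the stored best entry and keep reinforcing it.
best = autotune(_MM, "matmul", _A, _B,
                load_from_cache=True,
                store_to_cache=True,
                cache_filename=_CACHE,
                tuner_config=_small_cfg)
# ------------------------------------------------------------------------------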
def matmul(float(M,N) A, float(N,K) B) -> (C) {
    C(m, k) +=! A(m, r_n) * B(r_n, k)
}
def matmul_agrad(float(N,K) B, float(M,K) d_C) -> (d_A) {
    d_A(m, n) +=! d_C(m, r_k) * B(n, r_k)
}
def matmul_bgrad(float(M,N) A, float(M,K) d_C) -> (d_B) {
    d_B(n, k) +=! d_C(r_m, k) * A(r_m, n)
}
"""

A, B = (
    torch.randn(300, 400, device='cuda', requires_grad=True),
    torch.randn(400, 500, device='cuda', requires_grad=True))

compilation_cache = CompilationCache(mm)

tuner_config = TunerConfig().threads(8).pop_size(25).generations(3).devices(
    "0")

################################################################################
# 1. Use the simple high-overhead compile/run C++ API
#    If one can keep state in their layer or wishes to experiment with TC,
#    this is a simple entry point.
#    If state cannot be kept, be aware that this API has a non-trivial overhead
#    when output sizes need to be inferred and outputs allocated.
#    Compilation itself has a prohibitive cost and needs to be memoized either
#    by holding on to the executor or by using the low-overhead abstraction,
#    see below.
################################################################################
executor = compile(mm, "matmul", (A, B), MappingOptions('naive'))
C = executor.run((A, B))

time_tc(100,
        "simple API (in place)\t",
# compilation cache
################################################################################
from tensor_comprehensions.tclib import Tuner
from tensor_comprehensions.tclib import MappingOptionsCache
from tensor_comprehensions.tclib import TunerConfig

import uuid
unique_filename = "/tmp/" + str(uuid.uuid4())
print("Tune with cache @", unique_filename)
print("Note that if you pass a fixed filename, you can reinforce an " +
      "existing tuning state")

tuner = Tuner(mm, unique_filename)
top1 = tuner.tune(
    "matmul",
    (mat1, mat2),
    MappingOptions(),
    TunerConfig(threads=8, pop_size=25, generations=3, devices="0"))
cache = MappingOptionsCache(unique_filename)
top10 = cache.load(mm, "matmul", (mat1, mat2), 10)
assert top1.__str__() == top10[0].__str__()

# Compile and run with the new options
compilation_cache.compile("matmul", (mat1, mat2), top1)
time_tc(100,
        "raw unchecked_run tuned options\t",
        lambda name, ins: compilation_cache.unchecked_run(name, ins, ()),
        "matmul",
        (mat1, mat2))

################################################################################
# 4. Simple TC builder
################################################################################
class TcBuilder():