def __call__(self, *inputs, **kwargs):
    r"""Runs the defined TC language on given inputs.

    Args:
        *inputs (required): PyTorch Tensors or Variables that TC should
            execute on. The inputs should be passed in the order they are
            also passed in the definition of TC language.

        options (optional): Kernel mapping options of type :attr:`tc.Options`.
            These options provide mapping for kernel like grid, blocks,
            memory etc. It is recommended to always pass kernel options.
            The options can be obtained by:

            * Autotuning, (recommended) OR
            * You can create `Options` object by chosing the closely matching
              "type" of kernel. For example:

              .. code::

                  import tensor_comprehensions as tc
                  options = tc.Options(type)

              where :attr:`type` is a string with value one of below:

              * :attr:`pointwise`: if kernel resembles a pointwise operation
              * :attr:`mlp`: if kernel resembles an Linear layer operation
              * :attr:`conv`: if kernel resembles a convolution operation
              * :attr:`group_conv`: if kernel resembles a group convolution
                operation
              * :attr:`naive`: if none of the above, then chose naive

            *Default*: If no :attr:`Options` are passed, the naive options
            will be used which might not yield great performance.

        outputs (optional): List of Pytorch tensors/Variables. The number of
            outputs is the same as defined in the TC language and are in the
            same order as in TC language. You can chose to allocate the
            outputs tensors/Variables beforehand. Most common use case is to
            reuse output from a previous operation.

        cache (string, optional): A string denoting the absolute filepath
            which contains the mapping options for the kernel. Such file can
            be created by running autotuning. If :attr:`training` = True,
            then the backward options will be obtained from file
            cache + '_backward'. For the backward, separate filename is not
            accepted for now.

        grid (int, 3D list): If :attr:`inject_kernel` is `True`, then user
            needs to specify the kernel grid options for running it. TC will
            simply use those options and will not add any optimizations.

        block (int, 3D list): If :attr:`inject_kernel` is `True`, then user
            needs to specify the kernel `block` options for running it. TC
            will simply use those options and will not add any optimizations.

        reorder_function (optional): If :attr:`training` is set to true in
            :attr:`define` call, then TC infers the inputs for backward layer
            for compilation (1st time the layer is run). The backward layer
            should typically contain the grad_outputs of the forward layer.
            The backward layer should take TC forward inputs + grad_outputs
            in the same order as the forward TC takes inputs and emits
            outputs. If the order of the outputs is changed, or some output
            grad are not required in backwards, then you can pass a function
            which can reorder/drop the layer grad_outputs according to
            backwards layer inputs your TC needs. The function should return
            a :attr:`list`.

    Returns:
        List of PyTorch tensors/Variables which is the output of running TC
        layer. The number of outputs is the same as defined in the TC
        language and are in the same order as in TC language. Returns
        ``None`` if any exception is raised while compiling/running.

    Example:
        >>> LANG = MATMUL_LANG
        >>> matmul = tc.define(LANG, name="matmul")
        >>> mat1, mat2 = torch.randn(3, 4).cuda(), torch.randn(4, 5).cuda()
        >>> out = matmul(mat1, mat2, options=Options("mlp"))
    """
    try:
        validate_input(*inputs)
        kwargs.update(self.kwargs_define)
        name, backward_name = get_tc_names_from_kwargs(**kwargs)
        kwargs.pop("name", None)
        backward = backward_name is not None

        # Seed the options cache from previous autotuning results for this
        # (name, input signature) pair, if any exist.
        hash_key = get_tc_hash_key(name, *inputs)
        if self.tuner and self.tuner.tuner_cache and hash_key in self.tuner.tuner_cache:
            options_cache = self.tuner.tuner_cache[hash_key]
        else:
            options_cache = {}
        kwargs["options_cache"] = options_cache

        # Reuse any previously recorded compilation info for this input
        # signature; start fresh otherwise. Note that compile() is still
        # called below either way.
        tc_info = self.cu.compilation_cache.get(hash_key, {})

        kwargs["type"] = "forward"
        input_tensors = unpack_variables(list(inputs))
        # Manual CUDA injection: the user supplies raw cuda code plus the
        # exact launch configuration; TC performs no optimization on it.
        if "inject_kernel" in kwargs and "cuda_code" in kwargs:
            assert "grid" in kwargs and "block" in kwargs, \
                "For manual cuda injection, please specify the grid and block settings"
            self.cu.manual_cuda_injection(
                name, kwargs["inject_kernel"], kwargs["cuda_code"],
                input_tensors, kwargs["grid"], kwargs["block"])
        handle_forward = self.cu.compile(name, input_tensors, **kwargs)
        tc_info["forward_name"], tc_info["handle_forward"] = name, handle_forward
        if backward:
            tc_info["backward_name"] = backward_name
        self.cu.compilation_cache[hash_key] = tc_info

        # If the caller pre-allocated output tensors, normalize them to a
        # list before handing them to the autograd function.
        if kwargs.get("outputs") is not None:
            out = kwargs["outputs"]
            tc_info["outputs"] = out if isinstance(out, list) else [out]

        out = TCFunction.apply(self.cu, tc_info, kwargs, *inputs)
        # A single-output TC returns the bare tensor rather than a 1-list.
        out = list(out) if len(out) > 1 else out[0]
        return out
    except Exception as e:
        # NOTE(review): broad catch deliberately converts every failure into
        # a logged error + None return, so callers must check for None.
        # Consider re-raising (or narrowing the exception types) instead.
        logger.error("Caught Exception: {}".format(e))
        return None
def autotune(self, *inputs, **kwargs):
    """Autotune kernel mapping options for the TC on the given inputs.

    Tunes the forward layer first. If the TC was defined for training
    (a backward name exists), the freshly tuned forward layer is run once
    to obtain the backward layer's inputs, and the backward layer is then
    tuned as well.

    Args:
        *inputs: Tensors/Variables matching the TC definition's inputs.
        **kwargs: Tuning options; notably ``cache`` (bool or filepath for
            persisting tuned options), ``options`` (seed options), and
            ``reorder_function`` (reorders/drops forward outputs to match
            the backward TC's expected inputs).

    Returns:
        The best forward mapping options, or
        ``[forward_best_options, backward_best_options]`` when a backward
        layer is tuned as well.
    """
    input_tensors = get_tensors(list(inputs))
    kwargs.update(self.kwargs)
    name, backward_name = get_tc_names_from_kwargs(**kwargs)
    kwargs.pop("name", None)
    backward = backward_name is not None

    # Whenever autotune is called, tuning must happen; but if this kernel
    # was tuned earlier for the same input signature, the previous options
    # seed the new search.
    hash_key = get_tc_hash_key(name, *input_tensors)
    options_cache = self.tuner_cache.get(hash_key, {})

    # Resolve where (if anywhere) the tuned options should be persisted.
    # cache=True -> a unique temp file; cache=<str> -> that exact path.
    cache_file = ""
    if "cache" in kwargs and kwargs["cache"]:
        if isinstance(kwargs["cache"], bool):
            cache_file = "/tmp/{}_{}".format(hash_key, str(uuid.uuid4()))
        elif isinstance(kwargs["cache"], str):
            cache_file = kwargs["cache"]
        logger.info(
            'Autotuning cache will be saved to: {}.cuda/options'.format(
                cache_file))
    else:
        logger.warning(
            "Autotuning results won't be cached. "
            "'cache' option is not set")

    # Tune the forward layer first; its inputs are the ones given here.
    kwargs["type"] = "forward"
    # Pass the tuner object through so options can be loaded from file
    # without having to create a special object.
    kwargs["tuner"] = self.autotuner
    options = get_options_from_kwargs_and_tuner_cache(
        name, cache_file, options_cache, *input_tensors, **kwargs)
    forward_best_options = self.tune_and_store(
        name, input_tensors, mapping_options=options, cache_file=cache_file)
    options_cache["forward"] = forward_best_options
    if not backward:
        self.tuner_cache[hash_key] = options_cache
        return forward_best_options

    # The backward layer's inputs include the forward outputs, so run the
    # forward layer once with the best options just found.
    logger.info('Autotuning the backward layer now')
    cu = TcCompilationUnit()
    cu.define(self.tc_lang)
    if "options" in kwargs:
        # Temporarily swap in the tuned options, then restore the caller's.
        orig_options = kwargs["options"]
        kwargs["options"] = forward_best_options
        outputs = cu.compile_and_run(name, input_tensors, **kwargs)
        kwargs["options"] = orig_options
    else:
        outputs = cu.compile_and_run(
            name, input_tensors, options=forward_best_options, **kwargs)

    # Optionally reorder/drop the forward outputs to match the backward
    # TC's expected input order.
    reorder_function = kwargs.get("reorder_function")
    rearranged_outputs = list(outputs)
    if reorder_function is not None:
        rearranged_outputs = reorder_function(list(outputs))
    inputs = make_contiguous(
        unpack_variables(input_tensors + list(rearranged_outputs)))

    if cache_file:
        cache_file = cache_file + "_backward"
        logger.info(
            'Backwards autotuning cache will be saved to: {}.cuda/options'.format(
                cache_file))

    kwargs["type"] = "backward"
    options = get_options_from_kwargs_and_tuner_cache(
        backward_name, cache_file, options_cache, *inputs, **kwargs)
    backward_best_options = self.tune_and_store(
        backward_name, inputs, mapping_options=options, cache_file=cache_file)
    options_cache["backward"] = backward_best_options
    self.tuner_cache[hash_key] = options_cache
    return [forward_best_options, backward_best_options]