Example #1
    def _get_torch_gpu_allocator_function_addresses(self):
        if self._use_external_gpu_allocator and torch.cuda.is_available():
            # CPP extension to get torch GPU allocator's alloc, free and empty-cache function addresses
            from onnxruntime.training.ortmodule.torch_cpp_extensions import torch_gpu_allocator
            self._torch_alloc = torch_gpu_allocator.gpu_caching_allocator_raw_alloc_address()
            self._torch_free = torch_gpu_allocator.gpu_caching_allocator_raw_delete_address()
            self._torch_empty_cache = torch_gpu_allocator.gpu_caching_allocator_empty_cache_address()
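
The three addresses retrieved above are raw C function-pointer values that onnxruntime can use to route its GPU allocations through torch's caching allocator rather than maintaining a separate arena. A minimal sketch of how they could be forwarded to the CUDA execution provider, assuming the gpu_external_alloc, gpu_external_free and gpu_external_empty_cache provider options and a placeholder model file model.onnx:

    import onnxruntime as ort
    import torch
    from onnxruntime.training.ortmodule.torch_cpp_extensions import torch_gpu_allocator

    # Raw function-pointer addresses from torch's CUDA caching allocator.
    alloc_address = torch_gpu_allocator.gpu_caching_allocator_raw_alloc_address()
    free_address = torch_gpu_allocator.gpu_caching_allocator_raw_delete_address()
    empty_cache_address = torch_gpu_allocator.gpu_caching_allocator_empty_cache_address()

    # Provider options are passed as strings; "model.onnx" is a placeholder path.
    session = ort.InferenceSession(
        "model.onnx",
        providers=["CUDAExecutionProvider"],
        provider_options=[{
            "device_id": str(torch.cuda.current_device()),
            "gpu_external_alloc": str(alloc_address),
            "gpu_external_free": str(free_address),
            "gpu_external_empty_cache": str(empty_cache_address),
        }],
    )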
Example #2
    def __init__(self, module, debug_options: DebugOptions):
        """Manages building and execution of onnx graphs

        This class is an abstract class and should not directly be instantiated.
        Please use one of the concrete implementations of GraphExecutionManager.

        Interacts with OrtModuleGraphBuilder to build and optimize
        the onnx graph, and ExecutionAgent to run the onnx graph.
        """

        super(GraphExecutionManager, self).__init__(module._original_module)

        # Original and flattened (transformed) output module
        self._flattened_module = module

        # onnx models
        self._onnx_models = _onnx_models.ONNXModels()

        # Model after inference optimization or gradient building.
        self._optimized_onnx_model = None
        self._graph_builder = None
        self._graph_info = None
        self._graph_initializer_names = None
        self._graph_initializer_names_to_train = None
        self._graph_initializers = None

        # TrainingAgent or InferenceAgent
        self._execution_agent = None

        # Indicators that some logic has been executed previously and thus can be skipped for faster training
        self._skip_check = _SkipCheck.SKIP_CHECK_DISABLED

        # Debug flags
        self._debug_options = debug_options

        # Graph transformer config
        # Specify the cast propagation strategy. Currently three strategies are available: NONE, INSERT_AND_REDUCE and FLOOD_FILL.
        # The default is NONE, which means the transformer performs no cast-propagation transformation.
        self._propagate_cast_ops_strategy = C.PropagateCastOpsStrategy.NONE
        # Optimize by moving Cast operations if propagate_cast_ops_level is non-negative.
        # - If _propagate_cast_ops_level is set to zero, the transformation considers only the opcodes specified by
        #   _propagate_cast_ops_allow as "FP16 safe", and inserts/(re)moves Cast operations before/after them so that
        #   those operations are performed in reduced (16-bit) precision.
        # - If propagate_cast_ops_level is positive (1 or 2), then in addition to the opcodes specified by
        #   propagate_cast_ops_allow, onnxruntime's predetermined list of opcodes considered safe to move
        #   before/after a Cast operation is used.
        # - The Level 1 predetermined "FP16 safe" opcodes include only opcodes that perform no computation, such as
        #   Transpose, Split and Reshape, whereas the Level 2 predetermined "FP16 safe" opcodes also include opcodes
        #   that perform computation using contrib ops: GeLU, Dropout, LayerNormalization, etc.
        # (An illustrative override is sketched below.)
        self._propagate_cast_ops_level = 1
        # List of opcodes considered safe to move before/after a Cast operation when propagate_cast_ops_level is zero.
        self._propagate_cast_ops_allow = []
        # Whether to allow fusion of the layer norm subgraph if doing so will cause modified precision.
        self._allow_layer_norm_mod_precision = False
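        # Illustrative (hypothetical) override, not part of this constructor's defaults:
        # a caller restricting cast propagation to an explicit allow-list could set
        #     self._propagate_cast_ops_level = 0
        #     self._propagate_cast_ops_allow = ["Transpose", "Reshape"]
        # so that only those opcodes are treated as "FP16 safe".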

        # Value can be either torch.onnx.TrainingMode.TRAINING or torch.onnx.TrainingMode.EVAL
        # To be instantiated in the concrete implementation of GraphExecutionManager
        self._export_mode = None

        # Related to training graph shape inference
        self._current_input_shape = None
        # The default execution order is priority-based for both dynamic and static shape inputs for now;
        # if we observe a benefit from static shapes, we can expose this flag to the user.
        self._use_static_shape = False

        # flag to enable symbolic shape inference for dynamic shape inputs to improve performance
        self._run_symbolic_shape_infer = True

        # Flag indicating whether custom autograd.Function is allowed (True) or not (False).
        self._enable_custom_autograd_function = False

        self._input_info = None
        self._module_output_schema = None

        # TODO: Single device support for now
        self._device = _utils.get_device_from_module(module)

        self._module_parameters = inspect.signature(self._original_module.forward).parameters.values()

        # TODO: remove after PyTorch ONNX exporter supports VAR_KEYWORD parameters.
        for input_parameter in self._module_parameters:
            if input_parameter.kind == inspect.Parameter.VAR_KEYWORD:
                if self._debug_options.logging.log_level <= LogLevel.WARNING:
                    warnings.warn(
                        "The model's forward method has **kwargs parameter which has EXPERIMENTAL support!",
                        UserWarning)

        self.is_rocm_pytorch = (torch.version.hip is not None) and (ROCM_HOME is not None)

        self._use_external_gpu_allocator = True
        if self._use_external_gpu_allocator and torch.cuda.is_available():
            # CPP extension to get torch GPU allocator's alloc and free function addresses
            from onnxruntime.training.ortmodule.torch_cpp_extensions import torch_gpu_allocator
            self._torch_alloc = torch_gpu_allocator.gpu_caching_allocator_raw_alloc_address()
            self._torch_free = torch_gpu_allocator.gpu_caching_allocator_raw_delete_address()

        # WIP feature to enable caching in the gradient accumulation scenario.
        self._enable_grad_acc_optimization = False
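
GraphExecutionManager is abstract, as the docstring above notes; end users reach this constructor indirectly by wrapping their torch.nn.Module in ORTModule. A minimal sketch of that entry point, assuming an onnxruntime-training build where ORTModule, DebugOptions and LogLevel are importable:

    import torch
    from onnxruntime.training.ortmodule import ORTModule, DebugOptions, LogLevel

    model = torch.nn.Sequential(torch.nn.Linear(4, 4), torch.nn.ReLU())
    # This DebugOptions instance is the debug_options parameter received by
    # GraphExecutionManager.__init__ above.
    model = ORTModule(model, DebugOptions(log_level=LogLevel.WARNING))
    output = model(torch.randn(2, 4))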