Example #1
    def _get_torch_gpu_allocator_function_addresses(self):
        if self._use_external_gpu_allocator and torch.cuda.is_available():
            # CPP extension to get torch GPU allocator's alloc, free and empty-cache function addresses
            from onnxruntime.training.ortmodule.torch_cpp_extensions import torch_gpu_allocator
            self._torch_alloc = torch_gpu_allocator.gpu_caching_allocator_raw_alloc_address()
            self._torch_free = torch_gpu_allocator.gpu_caching_allocator_raw_delete_address()
            self._torch_empty_cache = torch_gpu_allocator.gpu_caching_allocator_empty_cache_address()
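
The three addresses retrieved above are raw C function-pointer values that onnxruntime can use to route its GPU allocations through torch's caching allocator rather than maintaining a separate arena. A minimal sketch of how they could be forwarded to the CUDA execution provider, assuming the gpu_external_alloc, gpu_external_free and gpu_external_empty_cache provider options and a placeholder model file model.onnx:

    import onnxruntime as ort
    import torch
    from onnxruntime.training.ortmodule.torch_cpp_extensions import torch_gpu_allocator

    # Raw function-pointer addresses from torch's CUDA caching allocator.
    alloc_address = torch_gpu_allocator.gpu_caching_allocator_raw_alloc_address()
    free_address = torch_gpu_allocator.gpu_caching_allocator_raw_delete_address()
    empty_cache_address = torch_gpu_allocator.gpu_caching_allocator_empty_cache_address()

    # Provider options are passed as strings; "model.onnx" is a placeholder path.
    session = ort.InferenceSession(
        "model.onnx",
        providers=["CUDAExecutionProvider"],
        provider_options=[{
            "device_id": str(torch.cuda.current_device()),
            "gpu_external_alloc": str(alloc_address),
            "gpu_external_free": str(free_address),
            "gpu_external_empty_cache": str(empty_cache_address),
        }],
    )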
Example #2
    def __init__(self, module, debug_options: DebugOptions):
        """Manages building and execution of onnx graphs

        This class is an abstract class and should not directly be instantiated.
        Please use one of the concrete implementations of GraphExecutionManager.

        Interacts with OrtModuleGraphBuilder to build and optimize
        the onnx graph, and ExecutionAgent to run the onnx graph.
        """

        super(GraphExecutionManager, self).__init__(module._original_module)

        # Original and flattened (transformed) output module
        self._flattened_module = module

        # onnx models
        self._onnx_models = _onnx_models.ONNXModels()

        # Model after inference optimization or gradient building.
        self._optimized_onnx_model = None
        self._graph_builder = None
        self._graph_info = None
        self._graph_initializer_names = None
        self._graph_initializer_names_to_train = None
        self._graph_initializers = None

        # TrainingAgent or InferenceAgent
        self._execution_agent = None

        # Indicators that some logic has been executed previously and thus can be skipped for faster training
        self._skip_check = _SkipCheck.SKIP_CHECK_DISABLED

        # Debug flags
        self._debug_options = debug_options

        # Graph transformer config
        # Specify the cast propagation strategy. Currently three strategies are available: NONE, INSERT_AND_REDUCE and FLOOD_FILL.
        # The default is NONE, which means the transformer performs no cast-propagation transformation.
        self._propagate_cast_ops_strategy = C.PropagateCastOpsStrategy.NONE
        # Optimize by moving Cast operations if propagate_cast_ops_level is non-negative.
        # - If _propagate_cast_ops_level is set to zero, the transformation considers only the opcodes specified by
        #   _propagate_cast_ops_allow as "FP16 safe", and inserts/(re)moves Cast operations before/after them so that
        #   those operations are performed in reduced (16-bit) precision.
        # - If propagate_cast_ops_level is positive (1 or 2), then in addition to the opcodes specified by
        #   propagate_cast_ops_allow, onnxruntime's predetermined list of opcodes considered safe to move
        #   before/after a Cast operation is used.
        # - The Level 1 predetermined "FP16 safe" opcodes include only opcodes that perform no computation, such as
        #   Transpose, Split and Reshape, whereas the Level 2 predetermined "FP16 safe" opcodes also include opcodes
        #   that perform computation using contrib ops: GeLU, Dropout, LayerNormalization, etc.
        # (An illustrative override is sketched below.)
        self._propagate_cast_ops_level = 1
        # List of opcodes considered safe to move before/after a Cast operation when propagate_cast_ops_level is zero.
        self._propagate_cast_ops_allow = []
        # Whether to allow fusion of the layer norm subgraph if doing so will cause modified precision.
        self._allow_layer_norm_mod_precision = False
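        # Illustrative (hypothetical) override, not part of this constructor's defaults:
        # a caller restricting cast propagation to an explicit allow-list could set
        #     self._propagate_cast_ops_level = 0
        #     self._propagate_cast_ops_allow = ["Transpose", "Reshape"]
        # so that only those opcodes are treated as "FP16 safe".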

        # Value can be either torch.onnx.TrainingMode.TRAINING or torch.onnx.TrainingMode.EVAL
        # To be instantiated in the concrete implementation of GraphExecutionManager
        self._export_mode = None

        # Related to training graph shape inference
        self._current_input_shape = None
        # The default execution order is priority-based for both dynamic and static shape inputs for now;
        # if we observe a benefit from static shapes, we can expose this flag to the user.
        self._use_static_shape = False

        # flag to enable symbolic shape inference for dynamic shape inputs to improve performance
        self._run_symbolic_shape_infer = True

        # Flag indicating whether custom autograd.Function is allowed (True) or not (False).
        self._enable_custom_autograd_function = False

        self._input_info = None
        self._module_output_schema = None

        # TODO: Single device support for now
        self._device = _utils.get_device_from_module(module)

        self._module_parameters = inspect.signature(self._original_module.forward).parameters.values()

        # TODO: remove after PyTorch ONNX exporter supports VAR_KEYWORD parameters.
        for input_parameter in self._module_parameters:
            if input_parameter.kind == inspect.Parameter.VAR_KEYWORD:
                if self._debug_options.logging.log_level <= LogLevel.WARNING:
                    warnings.warn(
                        "The model's forward method has **kwargs parameter which has EXPERIMENTAL support!",
                        UserWarning)

        self.is_rocm_pytorch = (torch.version.hip is not None) and (ROCM_HOME is not None)

        self._use_external_gpu_allocator = True
        if self._use_external_gpu_allocator and torch.cuda.is_available():
            # CPP extension to get torch GPU allocator's alloc and free function addresses
            from onnxruntime.training.ortmodule.torch_cpp_extensions import torch_gpu_allocator
            self._torch_alloc = torch_gpu_allocator.gpu_caching_allocator_raw_alloc_address()
            self._torch_free = torch_gpu_allocator.gpu_caching_allocator_raw_delete_address()

        # WIP feature to enable caching in the gradient accumulation scenario.
        self._enable_grad_acc_optimization = False
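
GraphExecutionManager is abstract, as the docstring above notes; end users reach this constructor indirectly by wrapping their torch.nn.Module in ORTModule. A minimal sketch of that entry point, assuming an onnxruntime-training build where ORTModule, DebugOptions and LogLevel are importable:

    import torch
    from onnxruntime.training.ortmodule import ORTModule, DebugOptions, LogLevel

    model = torch.nn.Sequential(torch.nn.Linear(4, 4), torch.nn.ReLU())
    # This DebugOptions instance is the debug_options parameter received by
    # GraphExecutionManager.__init__ above.
    model = ORTModule(model, DebugOptions(log_level=LogLevel.WARNING))
    output = model(torch.randn(2, 4))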