class profile(object):
    """Context manager that manages autograd profiler state and holds a summary of results.
    Under the hood it just records events of functions being executed in C++ and
    exposes those events to Python. You can wrap any code into it and it will
    only report runtime of PyTorch functions.

    Note: the profiler is thread-local and is automatically propagated into asynchronous tasks.

    Args:
        enabled (bool, optional): Setting this to False makes this context manager a no-op.

        use_cuda (bool, optional): Enables timing of CUDA events as well using the cudaEvent API.
            Adds approximately 4us of overhead to each tensor operation.

        record_shapes (bool, optional): If shapes recording is set, information
            about input dimensions will be collected. This allows one to see which
            dimensions have been used under the hood and further group by them
            using ``prof.key_averages(group_by_input_shape=True)``. Please note that
            shape recording might skew your profiling data. It is recommended to use
            separate runs with and without shape recording to validate the timing.
            Most likely the skew will be negligible for bottommost events (in a case
            of nested function calls). But for higher-level functions the total
            self cpu time might be artificially increased because of the shape
            collection.

        with_flops (bool, optional): If with_flops is set, the profiler will estimate
            the FLOPs (floating point operations) value using the operator's input shape.
            This allows one to estimate the hardware performance. Currently,
            this option only works for the matrix multiplication and 2D convolution operators.

        profile_memory (bool, optional): track tensor memory allocation/deallocation.

        with_stack (bool, optional): record source information (file and line number) for the ops.

        with_modules (bool): record the module hierarchy (including function names)
            corresponding to the callstack of the op. e.g. If module A's forward calls
            module B's forward, which contains an aten::add op, then aten::add's module
            hierarchy is A.B.
            Note that this support exists, at the moment, only for TorchScript models
            and not eager mode models.

        use_kineto (bool, optional): experimental, enable profiling with the Kineto profiler.

        use_cpu (bool, optional): profile CPU events; setting to ``False`` requires
            ``use_kineto=True`` and can be used to lower the overhead for GPU-only profiling.

    .. warning::
        Enabling memory profiling or source attribution incurs additional profiler
        overhead.

    .. warning::
        This context manager should not be called recursively, i.e. no nested
        instances are allowed.

    .. warning::
        Due to some CUDA multiprocessing limitations (multiprocessing-cuda-note_),
        one cannot use the profiler with ``use_cuda = True`` to benchmark
        DataLoaders with ``num_workers > 0``. If you wish to benchmark data loading,
        please use ``use_cuda = False`` or ``num_workers = 0``.

    Example:
        >>> x = torch.randn((1, 1), requires_grad=True)
        >>> with torch.autograd.profiler.profile() as prof:
        >>>     for _ in range(100):  # any normal python code, really!
        >>>         y = x ** 2
        >>>         y.backward()
        >>> # NOTE: some columns were removed for brevity
        >>> print(prof.key_averages().table(sort_by="self_cpu_time_total"))
        -----------------------------------  ---------------  ---------------  ---------------
        Name                                 Self CPU total   CPU time avg     Number of Calls
        -----------------------------------  ---------------  ---------------  ---------------
        mul                                  32.048ms         32.048ms         200
        pow                                  27.041ms         27.041ms         200
        PowBackward0                         9.727ms          55.483ms         100
        torch::autograd::AccumulateGrad      9.148ms          9.148ms          100
        torch::autograd::GraphRoot           691.816us        691.816us        100
        -----------------------------------  ---------------  ---------------  ---------------
    """

    def __init__(
            self,
            enabled=True,
            *,
            use_cuda=False,
            record_shapes=False,
            with_flops=False,
            profile_memory=False,
            with_stack=False,
            with_modules=False,
            use_kineto=False,
            use_cpu=True):
        self.enabled: bool = enabled
        if not self.enabled:
            return
        self.use_cuda = use_cuda
        self.function_events: Optional[EventList] = None
        self.entered = False
        self.record_shapes = record_shapes
        self.with_flops = with_flops
        # FLOPs estimation relies on the operators' input shapes, so shape
        # recording is forced on whenever with_flops is requested.
        self.record_shapes |= self.with_flops
        self.profile_memory = profile_memory
        self.with_stack = with_stack
        self.with_modules = with_modules
        self.use_cpu = use_cpu
        self.kineto_results: Optional[_ProfilerResult] = None

        if not self.use_cpu:
            assert use_kineto, \
                "Device-only events supported only with Kineto (use_kineto=True)"

        if self.use_cuda and not torch.cuda.is_available():
            warn("CUDA is not available, disabling CUDA profiling")
            self.use_cuda = False

        self.kineto_activities = set()
        if self.use_cpu:
            self.kineto_activities.add(ProfilerActivity.CPU)

        self.profiler_kind = ProfilerState.KINETO
        if self.use_cuda:
            if (not use_kineto or
                    ProfilerActivity.CUDA not in _supported_activities()):
                # Kineto CUDA profiling is unavailable (not requested or not supported
                # by this build); fall back to legacy cudaEvent-based GPU timing.
                assert self.use_cpu, "Legacy CUDA profiling requires use_cpu=True"
                self.profiler_kind = ProfilerState.KINETO_GPU_FALLBACK
            else:
                self.kineto_activities.add(ProfilerActivity.CUDA)

        assert len(self.kineto_activities) > 0, \
            "No activities specified for the profiler"

    def config(self):
        return ProfilerConfig(
            self.profiler_kind,
            self.record_shapes,
            self.profile_memory,
            self.with_stack,
            self.with_flops,
            self.with_modules)

    def __enter__(self):
        if not self.enabled:
            return
        if self.entered:
            raise RuntimeError("Profiler context manager is not reentrant")
        self._prepare_trace()
        self._start_trace()
        return self

    def _prepare_trace(self):
        self.entered = True
        _prepare_profiler(self.config(), self.kineto_activities)

    def _start_trace(self):
        self.entered = True
        _enable_profiler(self.config(), self.kineto_activities)

    def __exit__(self, exc_type, exc_val, exc_tb):
        if not self.enabled:
            return
        if self.use_cuda:
            torch.cuda.synchronize()
        self.kineto_results = _disable_profiler()
        parsed_results = self._parse_kineto_results(self.kineto_results)
        self.function_events = EventList(
            parsed_results,
            use_cuda=self.use_cuda,
            profile_memory=self.profile_memory,
            with_flops=self.with_flops)
        self.function_events._build_tree()
        return False

    def __repr__(self):
        if self.function_events is None:
            return '<unfinished torch.autograd.profile>'
        return repr(self.function_events)

    def __str__(self):
        if self.function_events is None:
            return '<unfinished torch.autograd.profile>'
        return str(self.function_events)

    def _check_finish(self):
        if self.function_events is None:
            raise RuntimeError("Profiler didn't finish running")

    def table(self, sort_by=None, row_limit=100, max_src_column_width=75, header=None,
              top_level_events_only=False):
        self._check_finish()
        assert self.function_events is not None
        return self.function_events.table(
            sort_by=sort_by,
            row_limit=row_limit,
            max_src_column_width=max_src_column_width,
            header=header,
            top_level_events_only=top_level_events_only
        )
    table.__doc__ = EventList.table.__doc__

    def export_chrome_trace(self, path):
        self._check_finish()
        if kineto_available():
            self.kineto_results.save(path)  # type: ignore[union-attr]
        else:
            return self.function_events.export_chrome_trace(path)  # type: ignore[union-attr]
    export_chrome_trace.__doc__ = EventList.export_chrome_trace.__doc__

    def export_stacks(self, path: str, metric: str = "self_cpu_time_total"):
        self._check_finish()
        assert self.function_events is not None, "Expected profiling results"
        assert self.with_stack, "export_stacks() requires with_stack=True"
        return self.function_events.export_stacks(path, metric)

    def key_averages(self, group_by_input_shape=False, group_by_stack_n=0):
        self._check_finish()
        assert self.function_events is not None, "Expected profiling results"
        return self.function_events.key_averages(group_by_input_shape, group_by_stack_n)
    key_averages.__doc__ = EventList.key_averages.__doc__

    def total_average(self):
        self._check_finish()
        assert self.function_events is not None, "Expected profiling results"
        return self.function_events.total_average()
    total_average.__doc__ = EventList.total_average.__doc__

    @property
    def self_cpu_time_total(self):
        """Returns total time spent on CPU obtained as a sum of
        all self times across all the events.
        """
        self._check_finish()
        assert self.function_events is not None
        return self.function_events.self_cpu_time_total

    def _parse_kineto_results(self, result):
        # result.events() has most of the events - PyTorch op-level and device-level events

        trace_start_us = result.trace_start_us()
        mem_records = [[evt, False] for evt in result.events() if evt.name() == MEMORY_EVENT_NAME]
        mem_records_acc = MemRecordsAcc(mem_records)

        def _cpu_memory_usage(mem_record):
            return mem_record.nbytes() if \
                mem_record.device_type() in [DeviceType.CPU, DeviceType.MKLDNN, DeviceType.IDEEP] \
                else 0

        def _cuda_memory_usage(mem_record):
            return mem_record.nbytes() if \
                mem_record.device_type() in [DeviceType.CUDA, DeviceType.HIP] \
                else 0

        # Create and return FunctionEvent list
        function_events = []
        cuda_corr_map: Dict[int, List[FunctionEvent]] = {}
        max_evt_id = 0
        for kineto_event in result.events():
            if _filter_name(kineto_event.name()):
                continue
            rel_start_us = kineto_event.start_us() - trace_start_us
            rel_end_us = rel_start_us + kineto_event.duration_us()
            abs_end_us = kineto_event.start_us() + kineto_event.duration_us()

            cpu_memory_usage = 0
            cuda_memory_usage = 0
            if kineto_event.device_type() == DeviceType.CPU:
                # find the corresponding memory allocation events
                for mem_record in mem_records_acc.in_interval(
                        kineto_event.start_us(), abs_end_us):
                    cpu_memory_usage += _cpu_memory_usage(mem_record[0])
                    cuda_memory_usage += _cuda_memory_usage(mem_record[0])
                    mem_record[1] = True

            is_async = kineto_event.is_async() or (
                kineto_event.start_thread_id() != kineto_event.end_thread_id())

            fe = FunctionEvent(
                id=kineto_event.correlation_id(),
                name=_rewrite_name(name=kineto_event.name(), with_wildcard=True),
                trace_name=_rewrite_name(name=kineto_event.name(), with_wildcard=False),
                thread=kineto_event.start_thread_id(),
                start_us=rel_start_us,
                end_us=rel_end_us,
                fwd_thread=kineto_event.fwd_thread_id(),
                input_shapes=kineto_event.shapes(),
                stack=[
                    entry for entry in kineto_event.stack()
                    if _filter_stack_entry(entry)
                ],
                scope=kineto_event.scope(),
                cpu_memory_usage=cpu_memory_usage,
                cuda_memory_usage=cuda_memory_usage,
                is_async=is_async,
                sequence_nr=kineto_event.sequence_nr(),
                device_type=kineto_event.device_type(),
                device_index=kineto_event.device_index(),
                flops=kineto_event.flops(),
            )
            max_evt_id = fe.id if fe.id > max_evt_id else max_evt_id
            if fe.device_type == DeviceType.CPU and not fe.is_async:
                # Check if we have CUDA time as a fallback
                cuda_time = kineto_event.cuda_elapsed_us()
                if cuda_time > 0:
                    fe.append_kernel(
                        fe.name,
                        fe.device_index,
                        cuda_time)
                    fe.is_legacy = True
            function_events.append(fe)
            corr_id = kineto_event.linked_correlation_id()
            if corr_id > 0:
                if corr_id not in cuda_corr_map:
                    cuda_corr_map[corr_id] = []
                cuda_corr_map[corr_id].append(fe)

        # associate CUDA kernels and CUDA runtime (CPU) with CPU events
        for fe in function_events:
            if (fe.device_type == DeviceType.CPU and not fe.is_async and
                    fe.id in cuda_corr_map):
                for f_evt in cuda_corr_map[fe.id]:
                    if f_evt.device_type == DeviceType.CUDA:
                        fe.append_kernel(
                            f_evt.name,
                            f_evt.device_index,
                            f_evt.time_range.end - f_evt.time_range.start)
                    elif f_evt.device_type == DeviceType.CPU:
                        # make sure that 'thread' of a CPU Kineto (e.g. CUDA Runtime) event is
                        # associated with the 'thread' of the corresponding linked PyTorch event
                        # to properly track parents and children
                        f_evt.thread = fe.thread

        # output top-level memory events
        for mem_record in mem_records:
            if not mem_record[1]:
                rel_start_us = mem_record[0].start_us() - trace_start_us
                max_evt_id += 1
                fe = FunctionEvent(
                    id=max_evt_id,
                    name=MEMORY_EVENT_NAME,
                    trace_name=None,  # not outputting in the trace
                    thread=mem_record[0].start_thread_id(),
                    start_us=rel_start_us,
                    end_us=rel_start_us,  # no duration
                    fwd_thread=mem_record[0].start_thread_id(),
                    input_shapes=[],
                    stack=[],
                    scope=0,  # RecordScope::FUNCTION
                    cpu_memory_usage=_cpu_memory_usage(mem_record[0]),
                    cuda_memory_usage=_cuda_memory_usage(mem_record[0]),
                    is_async=False,
                    sequence_nr=-1,
                    device_type=DeviceType.CPU,
                    device_index=0,
                )
                function_events.append(fe)

        function_events.sort(
            key=lambda evt: [evt.time_range.start, -evt.time_range.end])
        return function_events
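
# ---------------------------------------------------------------------------
# Usage sketch (illustrative only, not part of the class above). It mirrors the
# docstring example and exercises the most common calls on the results object;
# the trace filename "trace.json" is an arbitrary choice for the example.
#
#     import torch
#     from torch.autograd import profiler
#
#     x = torch.randn((1, 1), requires_grad=True)
#     with profiler.profile(record_shapes=True, profile_memory=True) as prof:
#         for _ in range(100):
#             y = x ** 2
#             y.backward()
#
#     # Aggregate events with identical names (and, optionally, input shapes).
#     print(prof.key_averages(group_by_input_shape=True)
#               .table(sort_by="self_cpu_time_total", row_limit=10))
#
#     # Export a chrome://tracing-compatible timeline.
#     prof.export_chrome_trace("trace.json")
#
# The class below is the legacy variant of this profiler (its __repr__ reports
# 'profiler_legacy.profile'); in the upstream PyTorch tree it lives in a
# separate module and is kept only for backward compatibility.
# ---------------------------------------------------------------------------
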
class profile(object):
    """DEPRECATED: use torch.profiler instead."""
    def __init__(
            self,
            enabled=True,
            *,
            use_cuda=False,
            record_shapes=False,
            with_flops=False,
            profile_memory=False,
            with_stack=False,
            with_modules=False):
        self.enabled: bool = enabled
        if not self.enabled:
            return
        self.use_cuda = use_cuda
        self.function_events = None
        self.entered = False
        self.record_shapes = record_shapes
        self.with_flops = with_flops
        self.record_shapes |= self.with_flops
        self.profile_memory = profile_memory
        self.with_stack = with_stack
        self.with_modules = with_modules

        if self.use_cuda and not torch.cuda.is_available():
            warn("CUDA is not available, disabling CUDA profiling")
            self.use_cuda = False

        if self.use_cuda:
            self.profiler_kind = ProfilerState.CUDA
        else:
            self.profiler_kind = ProfilerState.CPU

    def config(self):
        return ProfilerConfig(
            self.profiler_kind,
            self.record_shapes,
            self.profile_memory,
            self.with_stack,
            self.with_flops,
            self.with_modules,
            # avoid exposing _ExperimentalConfig in the legacy public API
            torch._C._autograd._ExperimentalConfig(),
        )

    def __enter__(self):
        if not self.enabled:
            return
        if self.entered:
            raise RuntimeError("Profiler context manager is not reentrant")
        self.entered = True
        self._start_trace()
        return self

    def _start_trace(self):
        _enable_profiler_legacy(self.config())

    def __exit__(self, exc_type, exc_val, exc_tb):
        if not self.enabled:
            return
        if self.use_cuda:
            torch.cuda.synchronize()

        records = _disable_profiler_legacy()
        parsed_results = _parse_legacy_records(records)
        self.function_events = EventList(
            parsed_results,
            use_cuda=self.use_cuda,
            profile_memory=self.profile_memory,
            with_flops=self.with_flops)
        self.function_events._build_tree()
        return False

    def __repr__(self):
        if self.function_events is None:
            return '<unfinished profiler_legacy.profile>'
        return repr(self.function_events)

    def __str__(self):
        if self.function_events is None:
            return '<unfinished profiler_legacy.profile>'
        return str(self.function_events)

    def _check_finish(self):
        if self.function_events is None:
            raise RuntimeError("Profiler didn't finish running")

    def table(self, sort_by=None, row_limit=100, max_src_column_width=75, header=None,
              top_level_events_only=False):
        self._check_finish()
        assert self.function_events is not None
        return self.function_events.table(
            sort_by=sort_by,
            row_limit=row_limit,
            max_src_column_width=max_src_column_width,
            header=header,
            top_level_events_only=top_level_events_only
        )
    table.__doc__ = EventList.table.__doc__

    def export_chrome_trace(self, path):
        self._check_finish()
        assert self.function_events is not None
        return self.function_events.export_chrome_trace(path)
    export_chrome_trace.__doc__ = EventList.export_chrome_trace.__doc__

    def export_stacks(self, path: str, metric: str = "self_cpu_time_total"):
        self._check_finish()
        assert self.function_events is not None, "Expected profiling results"
        assert self.with_stack, "export_stacks() requires with_stack=True"
        return self.function_events.export_stacks(path, metric)

    def key_averages(self, group_by_input_shape=False, group_by_stack_n=0):
        self._check_finish()
        assert self.function_events is not None, "Expected profiling results"
        return self.function_events.key_averages(group_by_input_shape, group_by_stack_n)
    key_averages.__doc__ = EventList.key_averages.__doc__

    def total_average(self):
        self._check_finish()
        assert self.function_events is not None, "Expected profiling results"
        return self.function_events.total_average()
    total_average.__doc__ = EventList.total_average.__doc__

    @property
    def self_cpu_time_total(self):
        """Returns total time spent on CPU
        obtained as a sum of all self times across all the events.
        """
        self._check_finish()
        assert self.function_events is not None
        return self.function_events.self_cpu_time_total
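
# ---------------------------------------------------------------------------
# Migration sketch (illustrative). The legacy class above is deprecated in
# favour of torch.profiler; a roughly equivalent modern invocation looks like
# the following, using the public torch.profiler names (profile and
# ProfilerActivity):
#
#     import torch
#     from torch.profiler import profile, ProfilerActivity
#
#     x = torch.randn((1, 1), requires_grad=True)
#     with profile(activities=[ProfilerActivity.CPU], record_shapes=True) as prof:
#         for _ in range(100):
#             y = x ** 2
#             y.backward()
#
#     print(prof.key_averages().table(sort_by="self_cpu_time_total"))
# ---------------------------------------------------------------------------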