def measure_run_time_ms(self, batch_size, initial_repetitions=None):
    """
    Measures the iteration run time in milliseconds.

    NOTE: This method will raise a RuntimeError if there is not enough
          GPU memory to run the iteration.
    """
    with user_code_environment(
            self._path_to_entry_point_dir, self._project_root):
        inputs = self._input_provider(batch_size=batch_size)
        # Warm up
        self._iteration(*inputs)
    torch.cuda.synchronize()

    def measure(iterations):
        with user_code_environment(
                self._path_to_entry_point_dir, self._project_root):
            self._start_event.record()
            for _ in range(iterations):
                self._iteration(*inputs)
            self._end_event.record()
        torch.cuda.synchronize()
        return self._start_event.elapsed_time(self._end_event)

    # When measuring the iteration run time of the model, we want to use as
    # few repetitions as possible. The problem is that we do not know how
    # many repetitions we need ahead of time.
    #
    # So the idea here is to iteratively double the number of repetitions
    # we use until we get a stable measurement (less than a 5% difference
    # between consecutive per-iteration times). If the caller knows how
    # many repetitions to use, they can pass it in and we'll start from
    # there. We stop doubling once the repetition count exceeds 50 (or
    # initial_repetitions, whichever is larger), so a single measurement
    # uses at most 100 iterations (or 2 * initial_repetitions).
    #
    # We will return the smaller number of repetitions. This can be used by
    # future callers as the value of the initial_repetitions argument.
    repetitions = 3 if initial_repetitions is None else initial_repetitions
    max_repetitions = (
        50 if initial_repetitions is None else max(50, initial_repetitions)
    )

    torch.cuda.reset_max_memory_allocated()
    lesser = measure(repetitions) / repetitions
    peak_usage_bytes = torch.cuda.max_memory_allocated()
    logger.debug("Iters: %d, Measured: %f", repetitions, lesser)

    while repetitions <= max_repetitions:
        doubled = repetitions * 2
        greater = measure(doubled) / doubled
        logger.debug("Iters: %d, Measured: %f", doubled, greater)

        # Stop when the difference between the measurements is less than 5%
        if (max(lesser, greater) / min(lesser, greater)) < 1.05:
            break

        repetitions = doubled
        lesser = greater

    return min(lesser, greater), peak_usage_bytes, repetitions
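# The doubling-until-stable strategy above can be illustrated on its own.
# The sketch below is illustrative only and is not part of the profiler: it
# times an arbitrary callable with time.perf_counter instead of CUDA events,
# and the names (_measure_until_stable_sketch, run_once) are hypothetical.
def _measure_until_stable_sketch(run_once, repetitions=3, max_repetitions=50):
    import time

    def measure(iterations):
        # Total wall-clock time for `iterations` calls, in milliseconds.
        start = time.perf_counter()
        for _ in range(iterations):
            run_once()
        return (time.perf_counter() - start) * 1000.0

    lesser = measure(repetitions) / repetitions
    while repetitions <= max_repetitions:
        doubled = repetitions * 2
        greater = measure(doubled) / doubled
        # Accept the measurement once two consecutive per-iteration times
        # agree to within 5%.
        if max(lesser, greater) / min(lesser, greater) < 1.05:
            break
        repetitions, lesser = doubled, greater
    # Per-iteration time in ms, plus the repetition count a caller could
    # reuse as initial_repetitions next time.
    return min(lesser, greater), repetitions
# Example use (hypothetical workload):
#     per_iter_ms, reps = _measure_until_stable_sketch(
#         lambda: sum(i * i for i in range(100_000)))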
def _run_entry_point(
        path_to_entry_point, path_to_entry_point_dir, project_root):
    with open(path_to_entry_point) as file:
        code_str = file.read()

    # Parse and compile the entry point source before executing it.
    with exceptions_as_analysis_errors(project_root):
        tree = ast.parse(code_str, filename=path_to_entry_point)
        code = compile(tree, path_to_entry_point, mode="exec")

    # Execute the compiled entry point in a fresh scope, inside the user's
    # code environment.
    with user_code_environment(path_to_entry_point_dir, project_root):
        scope = {}
        exec(code, scope, scope)

    return code_str, tree, scope
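# A minimal, self-contained sketch of the parse/compile/exec pattern used by
# _run_entry_point, without the profiler's context managers. The file name
# ("entry_point.py") and the source string below are hypothetical.
def _exec_source_sketch():
    import ast

    code_str = "batch_size = 16\n"
    tree = ast.parse(code_str, filename="entry_point.py")
    code = compile(tree, "entry_point.py", mode="exec")
    # Passing the same dict as globals and locals mimics module-level
    # execution, so every top-level name ends up in `scope`.
    scope = {}
    exec(code, scope, scope)
    return code_str, tree, scope  # scope["batch_size"] == 16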
def measure_peak_usage_bytes(self):
    self._prepare_for_memory_profiling()

    # Run one iteration to initialize the gradients
    with user_code_environment(
            self._path_to_entry_point_dir, self._project_root):
        model = self._model_provider()
        iteration = self._iteration_provider(model)
        iteration(*self._input_provider(batch_size=self._batch_size))

    torch.cuda.reset_max_memory_allocated()

    with user_code_environment(
            self._path_to_entry_point_dir, self._project_root):
        # NOTE: It's important to run at least 2 iterations here. It turns
        #       out that >= 2 iterations is the number of iterations needed
        #       to get a stable measurement of the total memory
        #       consumption. When using Adam, if you run one iteration, the
        #       memory usage ends up being too low by a constant factor.
        for _ in range(2):
            iteration(*(self._input_provider(batch_size=self._batch_size)))

    return torch.cuda.max_memory_allocated()
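# A minimal sketch of the peak-memory measurement pattern used above; it
# requires a CUDA-capable GPU, and the model, optimizer, and input shapes are
# hypothetical. The peak counter is reset only after a warm-up step so that
# gradients and optimizer state are already allocated when measuring.
def _peak_usage_sketch():
    import torch

    model = torch.nn.Linear(1024, 1024).cuda()
    optimizer = torch.optim.Adam(model.parameters())
    inputs = torch.randn(32, 1024, device="cuda")

    def iteration():
        optimizer.zero_grad()
        model(inputs).sum().backward()
        optimizer.step()

    iteration()  # Warm up: allocates gradients and Adam's state tensors.
    torch.cuda.reset_max_memory_allocated()
    for _ in range(2):  # >= 2 iterations for a stable peak, as noted above.
        iteration()
    return torch.cuda.max_memory_allocated()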
def _get_grad_function_contexts(
        self, iteration, input_provider, user_code_path):
    grad_function_tracker = GradFunctionTracker(self._project_root)
    backward_interceptor = BackwardInterceptor()
    with grad_function_tracker.track(), \
            backward_interceptor.intercept(), \
            user_code_environment(user_code_path, self._project_root):
        iteration(*input_provider())
    return (
        backward_interceptor.backward_root,
        grad_function_tracker.grad_function_contexts,
    )
@classmethod
def new_from(
    cls,
    model_provider,
    input_provider,
    iteration_provider,
    path_to_entry_point_dir,
    project_root,
):
    with user_code_environment(path_to_entry_point_dir, project_root):
        model = model_provider()
        iteration = iteration_provider(model)
    return cls(
        iteration, input_provider, path_to_entry_point_dir, project_root)
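# A sketch of the provider callables new_from expects, inferred from how they
# are called above: model_provider takes no arguments, iteration_provider maps
# a model to a per-iteration callable, and input_provider builds the
# iteration's positional arguments. All names, shapes, and the optimizer
# choice below are hypothetical.
def _example_model_provider():
    import torch
    return torch.nn.Linear(128, 10).cuda()


def _example_input_provider(batch_size=32):
    import torch
    return (torch.randn(batch_size, 128, device="cuda"),)


def _example_iteration_provider(model):
    import torch
    optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)

    def iteration(batch):
        optimizer.zero_grad()
        model(batch).sum().backward()
        optimizer.step()

    return iteration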
def track_run_time(self):
    if self._tracker_state == _TrackerState.CREATED:
        with user_code_environment(
                self._user_code_path, self._project_root):
            self._model = self._model_provider()
    elif self._tracker_state != _TrackerState.MEMORY_TRACKED:
        raise RuntimeError('Run time tracking has already been performed.')
    self._tracker_state = _TrackerState.RUN_TIME_TRACKED

    # 2. Perform operation run time profiling
    with user_code_environment(self._user_code_path, self._project_root):
        inputs = self._input_provider()
        iteration = self._iteration_provider(self._model)

    self._operation_tracker = OperationRunTimeTracker(self._project_root)
    backward_interceptor = BackwardInterceptor()
    with self._operation_tracker.track():
        with backward_interceptor.intercept():
            with user_code_environment(
                    self._user_code_path, self._project_root):
                iteration(*inputs)
def track_memory(self):
    if self._tracker_state != _TrackerState.CREATED:
        raise RuntimeError(
            'Memory tracking must be the first operation performed '
            'on a new Tracker.'
        )
    self._tracker_state = _TrackerState.MEMORY_TRACKED

    # Sanity check: For accurate profiling numbers, the initial memory
    # usage should be zero bytes.
    initial_memory_bytes = torch.cuda.memory_allocated()
    if initial_memory_bytes != 0:
        logger.debug(
            'Non-zero initial memory usage during memory tracking: '
            '%d bytes',
            initial_memory_bytes,
        )

    # 1. Track and record memory usage associated with model creation
    self._weight_tracker = WeightsTracker(self._project_root)
    with user_code_environment(self._user_code_path, self._project_root):
        with self._weight_tracker.track():
            self._model = self._model_provider()
        # Run one iteration to initialize the gradients
        iteration = self._iteration_provider(self._model)
        iteration(*self._input_provider())

    # 2. Track and record memory usage associated with stored activations
    self._activations_tracker = ActivationsTracker(self._project_root)
    self._activations_tracker.track_memory_usage(
        iteration, self._input_provider, self._user_code_path)

    # 3. Record peak memory usage
    torch.cuda.reset_max_memory_allocated()
    with user_code_environment(self._user_code_path, self._project_root):
        iteration(*(self._input_provider()))
    self._peak_usage_bytes = torch.cuda.max_memory_allocated()
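# A hypothetical usage sketch of the tracker's state machine, as implied by
# the checks above: memory tracking, if requested, must run on a freshly
# created tracker, and run time tracking may follow it (or run on its own).
# The Tracker constructor shown here is an assumption for illustration.
#
#     tracker = Tracker(
#         model_provider, input_provider, iteration_provider,
#         user_code_path, project_root)
#     tracker.track_memory()      # Must be the first call, if used at all.
#     tracker.track_run_time()    # Allowed after memory tracking.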