Example #1
    def measure_run_time_ms(self, batch_size, initial_repetitions=None):
        """
        Measures the iteration run time in milliseconds.

        NOTE: This method will raise a RuntimeError if there is not enough GPU
              memory to run the iteration.
        """
        with user_code_environment(self._path_to_entry_point_dir,
                                   self._project_root):
            inputs = self._input_provider(batch_size=batch_size)
            # Warm up
            self._iteration(*inputs)

        torch.cuda.synchronize()

        def measure(iterations):
            with user_code_environment(self._path_to_entry_point_dir,
                                       self._project_root):
                self._start_event.record()
                for _ in range(iterations):
                    self._iteration(*inputs)
                self._end_event.record()
            torch.cuda.synchronize()
            return self._start_event.elapsed_time(self._end_event)

        # When measuring the iteration run time of the model, we want to use as
        # few repetitions as possible. The problem is that we do not know how
        # many repetitions we need ahead of time.
        #
        # So the idea here is to iteratively double the number of repetitions
        # we use until we get a stable measurement (less than 5% difference).
        # If the caller knows how many repetitions to use, they can pass it in
        # and we'll start from there. We will stop after reaching 100
        # iterations (or 2 * initial_repetitions), whichever is larger.
        #
        # We will return the smaller number of repetitions. This can be used by
        # future callers as the value of the initial_repetitions argument.
        repetitions = 3 if initial_repetitions is None else initial_repetitions
        max_repetitions = (50 if initial_repetitions is None else max(
            50, initial_repetitions))

        torch.cuda.reset_max_memory_allocated()
        lesser = measure(repetitions) / repetitions
        peak_usage_bytes = torch.cuda.max_memory_allocated()

        logger.debug("Iters: %d, Measured: %f", repetitions, lesser)
        while repetitions <= max_repetitions:
            doubled = repetitions * 2
            greater = measure(doubled) / doubled
            logger.debug("Iters: %d, Measured: %f", doubled, greater)

            # Stop when the difference between the measurements is less than 5%
            if (max(lesser, greater) / min(lesser, greater)) < 1.05:
                break

            repetitions = doubled
            lesser = greater

        return min(lesser, greater), peak_usage_bytes, repetitions
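
The doubling strategy described in the comments above is independent of the profiler's internals. Here is a minimal standalone sketch of the same idea; the helper name and the fn callable are illustrative, not part of the code above:

import torch

def stable_run_time_ms(fn, initial_repetitions=None, max_repetitions=50):
    # Hypothetical helper: times fn() on the GPU, doubling the repetition
    # count until two consecutive per-iteration averages agree to within 5%.
    start = torch.cuda.Event(enable_timing=True)
    end = torch.cuda.Event(enable_timing=True)

    def measure(iterations):
        start.record()
        for _ in range(iterations):
            fn()
        end.record()
        torch.cuda.synchronize()
        return start.elapsed_time(end)  # milliseconds

    fn()  # Warm up (e.g., CUDA context creation, cuDNN autotuning)
    torch.cuda.synchronize()

    repetitions = 3 if initial_repetitions is None else initial_repetitions
    limit = max(max_repetitions, repetitions)
    lesser = measure(repetitions) / repetitions
    greater = lesser
    while repetitions <= limit:
        doubled = repetitions * 2
        greater = measure(doubled) / doubled
        # Stop when consecutive measurements differ by less than 5%
        if max(lesser, greater) / min(lesser, greater) < 1.05:
            break
        repetitions, lesser = doubled, greater
    # Return the stable estimate and the repetition count that produced it,
    # which callers can feed back in as initial_repetitions.
    return min(lesser, greater), repetitions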
Example #2
def _run_entry_point(path_to_entry_point, path_to_entry_point_dir,
                     project_root):
    with open(path_to_entry_point) as file:
        code_str = file.read()
    # Parse and compile the entry point first so that syntax errors are
    # reported as analysis errors instead of crashing the profiler.
    with exceptions_as_analysis_errors(project_root):
        tree = ast.parse(code_str, filename=path_to_entry_point)
        code = compile(tree, path_to_entry_point, mode="exec")
    # Execute the user's code with a single dict serving as both globals
    # and locals, mimicking a top-level module run.
    with user_code_environment(path_to_entry_point_dir, project_root):
        scope = {}
        exec(code, scope, scope)
    return code_str, tree, scope
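
Every example on this page enters user_code_environment before running user code, but its implementation is not shown. A plausible minimal sketch, assuming it only needs to make the entry point's directory importable and restore interpreter state on exit (the real version presumably also uses project_root, e.g. to attribute exceptions to user code):

import contextlib
import os
import sys

@contextlib.contextmanager
def user_code_environment(script_root_path, project_root):
    # Sketch only: the real implementation is not shown on this page.
    # project_root is accepted but unused in this sketch.
    sys.path.insert(0, script_root_path)
    previous_cwd = os.getcwd()
    os.chdir(script_root_path)
    try:
        yield
    finally:
        os.chdir(previous_cwd)
        sys.path.remove(script_root_path)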
Example #3
    def measure_peak_usage_bytes(self):
        self._prepare_for_memory_profiling()
        # Run one iteration to initialize the gradients
        with user_code_environment(self._path_to_entry_point_dir,
                                   self._project_root):
            model = self._model_provider()
            iteration = self._iteration_provider(model)
            iteration(*self._input_provider(batch_size=self._batch_size))

        torch.cuda.reset_max_memory_allocated()
        with user_code_environment(self._path_to_entry_point_dir,
                                   self._project_root):
            # NOTE: It's important to run at least 2 iterations here. It turns
            #       out that >= 2 iterations is the number of iterations needed
            #       to get a stable measurement of the total memory
            #       consumption. When using Adam, if you run one iteration, the
            #       memory usage ends up being too low by a constant factor.
            for _ in range(2):
                iteration(*(self._input_provider(batch_size=self._batch_size)))
        return torch.cuda.max_memory_allocated()
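
The same measurement pattern, reduced to its essentials as a hypothetical helper (note that newer PyTorch releases deprecate torch.cuda.reset_max_memory_allocated() in favor of torch.cuda.reset_peak_memory_stats()):

import torch

def peak_usage_bytes(fn, warmup_iterations=1, iterations=2):
    # Illustrative helper. Warm up first so gradients and optimizer state
    # (e.g. Adam's moment buffers) are already allocated, then reset the
    # peak counter and measure over >= 2 iterations, per the NOTE above.
    for _ in range(warmup_iterations):
        fn()
    torch.cuda.reset_max_memory_allocated()
    for _ in range(iterations):
        fn()
    return torch.cuda.max_memory_allocated()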
Example #4
    def _get_grad_function_contexts(self, iteration, input_provider,
                                    user_code_path):
        grad_function_tracker = GradFunctionTracker(self._project_root)
        backward_interceptor = BackwardInterceptor()
        with grad_function_tracker.track(), \
                backward_interceptor.intercept(), \
                user_code_environment(user_code_path, self._project_root):
            iteration(*input_provider())
        return (
            backward_interceptor.backward_root,
            grad_function_tracker.grad_function_contexts,
        )
Example #5
    @classmethod
    def new_from(
        cls,
        model_provider,
        input_provider,
        iteration_provider,
        path_to_entry_point_dir,
        project_root,
    ):
        # Build the model and iteration function inside the user's code
        # environment, then hand them to the regular constructor.
        with user_code_environment(path_to_entry_point_dir, project_root):
            model = model_provider()
            iteration = iteration_provider(model)
        return cls(iteration, input_provider, path_to_entry_point_dir,
                   project_root)
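
The provider contract is implied by the call sites: model_provider takes no arguments, iteration_provider takes the constructed model, and input_provider returns a tuple of positional arguments (with a batch_size keyword in Example #1). A hypothetical set of providers satisfying that contract:

import torch
import torch.nn as nn

# Illustrative providers; the bodies are assumptions, only the call
# signatures are taken from the examples on this page.
def model_provider():
    return nn.Linear(1024, 10).cuda()

def input_provider(batch_size=16):
    # Returns a tuple because callers unpack it: iteration(*inputs)
    return (torch.randn(batch_size, 1024).cuda(),)

def iteration_provider(model):
    optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)

    def iteration(batch):
        optimizer.zero_grad()
        loss = model(batch).sum()
        loss.backward()
        optimizer.step()

    return iteration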
Example #6
    def track_run_time(self):
        # 1. Create the model, unless memory tracking already did so
        if self._tracker_state == _TrackerState.CREATED:
            with user_code_environment(
                    self._user_code_path, self._project_root):
                self._model = self._model_provider()
        elif self._tracker_state != _TrackerState.MEMORY_TRACKED:
            raise RuntimeError('Run time tracking has already been performed.')

        self._tracker_state = _TrackerState.RUN_TIME_TRACKED

        # 2. Perform operation run time profiling
        with user_code_environment(self._user_code_path, self._project_root):
            inputs = self._input_provider()
            iteration = self._iteration_provider(self._model)

        self._operation_tracker = OperationRunTimeTracker(self._project_root)
        backward_interceptor = BackwardInterceptor()
        with self._operation_tracker.track():
            with backward_interceptor.intercept():
                with user_code_environment(
                        self._user_code_path, self._project_root):
                    iteration(*inputs)
Example #7
    def track_memory(self):
        if self._tracker_state != _TrackerState.CREATED:
            raise RuntimeError(
                'Memory tracking must be the first operation performed '
                'on a new Tracker.'
            )

        self._tracker_state = _TrackerState.MEMORY_TRACKED

        # Sanity check: For accurate profiling numbers, the initial memory
        # usage should be zero bytes.
        initial_memory_bytes = torch.cuda.memory_allocated()
        if initial_memory_bytes != 0:
            logger.debug(
                'Non-zero initial memory usage during memory tracking: '
                '%d bytes',
                initial_memory_bytes,
            )

        # 1. Track and record memory usage associated with model creation
        self._weight_tracker = WeightsTracker(self._project_root)
        with user_code_environment(self._user_code_path, self._project_root):
            with self._weight_tracker.track():
                self._model = self._model_provider()
            # Run one iteration to initialize the gradients
            iteration = self._iteration_provider(self._model)
            iteration(*self._input_provider())

        # 2. Track and record memory usage associated with stored activations
        self._activations_tracker = ActivationsTracker(self._project_root)
        self._activations_tracker.track_memory_usage(
            iteration, self._input_provider, self._user_code_path)

        # 3. Record peak memory usage
        torch.cuda.reset_max_memory_allocated()
        with user_code_environment(self._user_code_path, self._project_root):
            iteration(*(self._input_provider()))
        self._peak_usage_bytes = torch.cuda.max_memory_allocated()
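
Taken together, Examples #6 and #7 define a small state machine over _TrackerState: track_memory is only legal as the first operation on a new tracker, while track_run_time accepts either a fresh tracker (creating the model itself) or one that has already been memory-tracked. A hypothetical caller, with the constructor arguments inferred from the attribute usage above:

tracker = Tracker(  # hypothetical constructor; not shown on this page
    model_provider=model_provider,
    input_provider=input_provider,
    iteration_provider=iteration_provider,
    user_code_path='entry_point.py',  # illustrative path
    project_root='.',
)
tracker.track_memory()    # must come first, per the check in Example #7
tracker.track_run_time()  # legal afterwards; reuses the tracked model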