Example #1
from utils_torch import profile_gpu_and_ram


def test_profiling() -> None:
    """
    Test profiling.
    """
    # get the current profile values
    (gpu_names, total_memory_per, used_memory_per, load_per, ram_total,
     ram_used, ram_avail) = profile_gpu_and_ram()
    # average / sum over all GPUs
    gpu_mem_used: float = sum(used_memory_per)
    gpu_mem_total: float = sum(total_memory_per)
    # max(1, ...) guards against division by zero on machines without GPUs
    gpu_mem_percent: float = gpu_mem_used / max(1, gpu_mem_total)
    load_avg: float = sum(load_per) / max(1, len(load_per))

    print("Metrics.PROFILE_GPU_MEM_USED", gpu_mem_used)
    print("Metrics.PROFILE_GPU_MEM_TOTAL", gpu_mem_total)
    print("Metrics.PROFILE_GPU_LOAD", load_avg)
    print("Metrics.PROFILE_RAM_USED", ram_used)
    print("Metrics.PROFILE_RAM_TOTAL", ram_total)
    print("Metrics.PROFILE_GPU_MEM_PERCENT", gpu_mem_percent)
    print("Metrics.PROFILE_RAM_AVAILABLE", ram_avail)

    # log the values
    gpu_names_str = " ".join(set(gpu_names))
    multi_load, multi_mem = "", ""
    if len(load_per) > 1:
        multi_load = " [" + ", ".join(f"{load:.0%}" for load in load_per) + "]"
        multi_mem = " [" + ", ".join(f"{mem:.1f}GB"
                                     for mem in used_memory_per) + "]"
    print(
        f"RAM GB used/avail/total: {ram_used:.1f}/{ram_avail:.1f}/{ram_total:.1f} - "
        f"GPU {gpu_names_str} Load: {load_avg:.1%}{multi_load} "
        f"Mem: {gpu_mem_used:.1f}GB/{gpu_mem_total:.1f}GB{multi_mem}")
Example #2
    def hook_post_step(
            self, epoch_step: int, loss: th.Tensor, lr: float, additional_log: Optional[str] = None,
            disable_grad_clip: bool = False) -> bool:
        """
        Hook called after one optimization step.
        Profile GPU and update step-based meters. Feed everything to TensorBoard.
        Needs some information to be passed down from the trainer for proper logging.
        Args:
            epoch_step: Current step in the epoch.
            loss: Training loss.
            lr: Training learning rate.
            additional_log: Additional string to print in the train step log.
            disable_grad_clip: Disable gradient clipping if it is already done elsewhere.
        Returns:
            Whether log output should be printed in this step or not.
        """
        # compute total time for this step and restart the timer
        total_step_time = timer() - self.timer_step
        self.timer_step = timer()

        # clip gradients
        total_norm = 0
        if self.cfg.train.clip_gradient > -1 and not disable_grad_clip:
            # get all parameters to clip
            _params, _param_names, params_flat = self.model_mgr.get_all_params()
            # clip using pytorch
            total_norm = clip_grad_norm_(params_flat, self.cfg.train.clip_gradient)
            if total_norm > self.cfg.train.clip_gradient:
                # print a log message if gradients were clipped
                grad_clip_coef = self.cfg.train.clip_gradient / (total_norm + 1e-6)
                self.logger.info(f"Clipping gradient: {total_norm} with coef {grad_clip_coef}")
            total_norm = total_norm.item()
        self.state.last_grad_norm = total_norm

        # print infos
        if epoch_step % self.cfg.logging.step_train == 0:
            total_train_time = (timer() - self.timer_train_epoch) / 60
            str_step = f"{epoch_step:{len(str(self.steps_per_epoch))}d}"
            print_string = "".join([
                f"E{self.state.current_epoch}[{str_step}/{self.steps_per_epoch}] T {total_train_time:.3f}m ",
                f"LR {lr:.1e} L {loss:.4f} ",
                f"Grad {self.state.last_grad_norm:.3e} " if self.state.last_grad_norm != 0 else "",
                f"{additional_log}" if additional_log is not None else ""])
            self.logger.info(print_string)

        # check GPU / RAM profiling
        # check the step_gpu > 0 guards first to avoid a modulo-by-zero when profiling is disabled
        if ((self.cfg.logging.step_gpu > 0 and self.state.epoch_step % self.cfg.logging.step_gpu == 0)
                or (self.cfg.logging.step_gpu_once > 0 and self.state.epoch_step == self.cfg.logging.step_gpu_once)):
            # get the current profile values
            (gpu_names, total_memory_per, used_memory_per, load_per, ram_total, ram_used, ram_avail
             ) = utils_torch.profile_gpu_and_ram()
            # average / sum over all GPUs
            gpu_mem_used: float = sum(used_memory_per)
            gpu_mem_total: float = sum(total_memory_per)
            # gpu_mem_percent: float = gpu_mem_used / gpu_mem_total
            load_avg: float = sum(load_per) / max(1, len(load_per))

            self.metrics.update_meter(Metrics.PROFILE_GPU_MEM_USED, gpu_mem_used)
            self.metrics.update_meter(Metrics.PROFILE_GPU_MEM_TOTAL, gpu_mem_total)
            self.metrics.update_meter(Metrics.PROFILE_GPU_LOAD, load_avg)
            self.metrics.update_meter(Metrics.PROFILE_RAM_USED, ram_used)
            self.metrics.update_meter(Metrics.PROFILE_RAM_TOTAL, ram_total)
            # these two are not logged, as they are redundant with the values above.
            # self.metrics.update_meter(Metrics.PROFILE_GPU_MEM_PERCENT, gpu_mem_percent)
            # self.metrics.update_meter(Metrics.PROFILE_RAM_AVAILABLE, ram_avail)

            # log the values
            gpu_names_str = " ".join(set(gpu_names))
            multi_load, multi_mem = "", ""
            if len(load_per) > 1:
                multi_load = " [" + ", ".join(f"{load:.0%}" for load in load_per) + "]"
                multi_mem = " [" + ", ".join(f"{mem:.1f}GB" for mem in used_memory_per) + "]"
            self.logger.info(f"RAM GB used/avail/total: {ram_used:.1f}/{ram_avail:.1f}/{ram_total:.1f} - "
                             f"GPU {gpu_names_str} Load: {load_avg:.1%}{multi_load} "
                             f"Mem: {gpu_mem_used:.1f}GB/{gpu_mem_total:.1f}GB{multi_mem}")

        # update timings
        other_t = total_step_time - self.timedelta_step_forward - self.timedelta_step_backward
        self.metrics.update_meter(Metrics.TIME_STEP_FORWARD, self.timedelta_step_forward)
        self.metrics.update_meter(Metrics.TIME_STEP_BACKWARD, self.timedelta_step_backward)
        self.metrics.update_meter(Metrics.TIME_STEP_TOTAL, total_step_time)
        self.metrics.update_meter(Metrics.TIME_STEP_OTHER, other_t)
        # update clipped gradient
        self.metrics.update_meter(Metrics.TRAIN_GRAD_CLIP, self.state.last_grad_norm)
        # update LR
        self.metrics.update_meter(Metrics.TRAIN_LR, lr)
        # update the training loss meter on every step (not only on logging steps)
        self.metrics.update_meter(Metrics.TRAIN_LOSS, loss.item())
        # Save epoch step and increase total step counter
        self.state.epoch_step = epoch_step
        self.state.total_step += 1

        # feed step-based metrics to tensorboard and collector
        self.metrics.feed_metrics(True, self.state.total_step, self.state.current_epoch)

        # end of batch: step the LR scheduler if one is configured
        if self.lr_scheduler is not None:
            self.lr_scheduler.step()

        # report whether this step printed the train log (matches the condition above)
        return epoch_step % self.cfg.logging.step_train == 0
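
For orientation, a hypothetical sketch of where this hook could sit in a training loop follows. The names model, optimizer, data, and trainer are placeholders, not the project's actual API. Since the hook clips gradients, it is shown here before optimizer.step(), where clipping still has an effect; the actual trainer's call site may differ.

import torch as th
from torch import nn

# placeholder objects; the real trainer wires these up itself
model = nn.Linear(8, 1)
optimizer = th.optim.SGD(model.parameters(), lr=0.01)
data = [th.randn(4, 8) for _ in range(10)]

for epoch_step, batch in enumerate(data):
    optimizer.zero_grad()
    loss = model(batch).pow(2).mean()
    loss.backward()
    # the hook clips gradients and logs metrics, so it runs before
    # the optimizer applies the (possibly clipped) gradients
    trainer.hook_post_step(epoch_step, loss, lr=optimizer.param_groups[0]["lr"])
    optimizer.step()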
Example #3
from pprint import pprint

from utils_torch import profile_gpu_and_ram


def test_profile_gpu_and_ram() -> None:
    pprint(profile_gpu_and_ram())
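
Both tests can be run with pytest, e.g. python -m pytest -k profile, on a machine where GPU profiling is available.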