def on_phase_end(self, task: "tasks.ClassyTask") -> None:
    """
    Called at the end of a phase: synchronize the state of every EMA meter
    of the task, then print and save those meters.

    The print/save step runs only when ``is_primary()`` returns true; the
    meter synchronization runs unconditionally.

    Args:
        task: the running task. Reads ``task.ema_meters`` and
            ``task.train_phase_idx``.
    """
    for ema_meter in task.ema_meters:
        ema_meter.sync_state()

    # nothing left to do when is_primary() is false: reporting is skipped
    if not is_primary():
        return

    LogLossMetricsCheckpointHook.print_and_save_meters(
        task,
        task.train_phase_idx,
        task.ema_meters,
        metric_key_name_suffix="ema",
    )
def on_forward(self, task: "tasks.ClassyTask") -> None:
    """
    Called each time a model forward is done; makes sure that the model
    forward output contains only finite values (no NaN and no +/-inf).

    If a non-finite value is found, the model is checkpointed to enable
    debugging, and the model input sample and the model output are saved to
    per-rank files inside ``task.checkpoint_folder``.

    Args:
        task: the running task. Reads ``task.last_batch`` (its
            ``model_output`` and ``sample``), ``task.iteration`` and
            ``task.checkpoint_folder``. ``model_output`` may be a single
            tensor or a list/tuple of tensors.
    """
    model_output = task.last_batch.model_output

    # A multi-output model may hand back a list or a tuple of tensors;
    # torch.isfinite only accepts tensors, so check the elements one by one.
    if isinstance(model_output, (list, tuple)):
        all_finite = all(bool(torch.isfinite(x).all()) for x in model_output)
    else:
        all_finite = bool(torch.isfinite(model_output).all())
    if all_finite:
        return

    _, dist_rank = get_machine_local_and_dist_rank()
    logging.info(f"Infinite Model output or NaN at iteration={task.iteration}.")

    # TODO - this code was broken during a refactoring: improve it
    # imported inside the function, as in the original code
    from vissl.hooks.log_hooks import LogLossMetricsCheckpointHook

    LogLossMetricsCheckpointHook.checkpoint_model(
        task,
        world_size=self.world_size,
        mode_frequency=1,
        mode_num=task.iteration,
        mode="iteration",
    )

    # one file per distributed rank, so ranks do not overwrite each other
    model_output_file = f"{task.checkpoint_folder}/rank{dist_rank}_model_output.pth"
    input_sample_file = f"{task.checkpoint_folder}/rank{dist_rank}_input_sample.pth"
    with PathManager.open(model_output_file, "wb") as fwrite:
        torch.save(model_output, fwrite)
    with PathManager.open(input_sample_file, "wb") as fwrite:
        torch.save(task.last_batch.sample, fwrite)
    logging.info(f"Saved model output: {model_output_file}")
    logging.info(f"Saved model input: {input_sample_file}")
def default_hook_generator(cfg: AttrDict) -> List[ClassyHook]:
    """
    Build the list of hooks used in training, based on the user config.

    Some basic hooks are always added. Optional hooks, added on demand:
        - perf stats hook (HOOKS.PERF_STATS.MONITOR_PERF_STATS)
        - loss specific hooks (swav loss, swav momentum loss, deepclusterv2
          loss, moco loss), added only when that loss is selected
        - model complexity hook, enabled via
          HOOKS.MODEL_COMPLEXITY.COMPUTE_COMPLEXITY = True
        - GPU stats hook (HOOKS.LOG_GPU_STATS) and GPU memory summary hook
          (HOOKS.MEMORY_SUMMARY.PRINT_MEMORY_SUMMARY)
        - Tensorboard hook (HOOKS.TENSORBOARD_SETUP.USE_TENSORBOARD)
        - gradient clipping hook (MODEL.GRAD_CLIP.USE_GRAD_CLIP)
        - profiling hook, when ProfilingHook.is_enabled(cfg.PROFILING)

    Args:
        cfg (AttrDict): the user configuration.

    Returns:
        hooks (List[ClassyHook]): the hooks that will be used, in the order
        they were added.
    """
    hooks = []

    # ---- optional hooks, selected by the config ----
    perf_cfg = cfg.HOOKS.PERF_STATS
    if perf_cfg.MONITOR_PERF_STATS:
        # a non-positive frequency is passed on as None
        perf_stat_freq = None
        if perf_cfg.PERF_STAT_FREQUENCY > 0:
            perf_stat_freq = perf_cfg.PERF_STAT_FREQUENCY
        hooks.append(LogPerfTimeMetricsHook(perf_stat_freq))

    loss_name = cfg.LOSS.name
    if loss_name == "swav_loss":
        hooks += [SwAVUpdateQueueScoresHook(), NormalizePrototypesHook()]
    elif loss_name == "swav_momentum_loss":
        momentum_hook = SwAVMomentumHook(
            cfg.LOSS["swav_momentum_loss"]["momentum"],
            cfg.LOSS["swav_momentum_loss"]["momentum_eval_mode_iter_start"],
            cfg.LOSS["swav_momentum_loss"]["crops_for_assign"],
        )
        hooks += [momentum_hook, SwAVMomentumNormalizePrototypesHook()]
    elif loss_name == "deepclusterv2_loss":
        hooks += [InitMemoryHook(), ClusterMemoryHook()]
    elif loss_name == "moco_loss":
        # batch shuffling is requested only when BN is not converted to SyncBN
        moco_hook = MoCoHook(
            cfg.LOSS["moco_loss"]["momentum"],
            shuffle_batch=(not cfg.MODEL.SYNC_BN_CONFIG.CONVERT_BN_TO_SYNC_BN),
        )
        hooks.append(moco_hook)

    if cfg.HOOKS.MODEL_COMPLEXITY.COMPUTE_COMPLEXITY:
        hooks.append(SSLModelComplexityHook())
    if cfg.HOOKS.LOG_GPU_STATS:
        hooks.append(LogGpuStatsHook())
    if cfg.HOOKS.MEMORY_SUMMARY.PRINT_MEMORY_SUMMARY:
        hooks.append(LogGpuMemoryHook(cfg.HOOKS.MEMORY_SUMMARY.LOG_ITERATION_NUM))
    if cfg.HOOKS.TENSORBOARD_SETUP.USE_TENSORBOARD:
        assert is_tensorboard_available(), (
            "Tensorboard must be installed to use it. Please install tensorboard using:"
            "If pip environment: `pip install tensorboard` "
            "If using conda and you prefer conda install of tensorboard: "
            "`conda install -c conda-forge tensorboard`"
        )
        hooks.append(get_tensorboard_hook(cfg))
    if cfg.MODEL.GRAD_CLIP.USE_GRAD_CLIP:
        grad_clip_hook = GradClipHook(
            norm_type=cfg.MODEL.GRAD_CLIP.NORM_TYPE,
            max_norm=cfg.MODEL.GRAD_CLIP.MAX_NORM,
        )
        hooks.append(grad_clip_hook)

    # ---- hooks that are used irrespective of workflow type ----
    rolling_btime_freq = None
    if cfg.HOOKS.PERF_STATS.ROLLING_BTIME_FREQ > 0:
        rolling_btime_freq = cfg.HOOKS.PERF_STATS.ROLLING_BTIME_FREQ

    if ProfilingHook.is_enabled(cfg.PROFILING):
        hooks.append(ProfilingHook(profiling_config=cfg.PROFILING))

    world_size = cfg.DISTRIBUTED.NUM_NODES * cfg.DISTRIBUTED.NUM_PROC_PER_NODE
    checkpoint_folder = get_checkpoint_folder(cfg)
    hooks += [
        CheckNanLossHook(),
        SetDataSamplerEpochHook(),
        FreezeParametersHook(),
        UpdateBatchesSeenHook(),
        UpdateTrainBatchTimeHook(),
        UpdateTestBatchTimeHook(),
        UpdateTrainIterationNumHook(),
        LogLossMetricsCheckpointHook(world_size),
        LogLossLrEtaHook(checkpoint_folder, rolling_btime_freq),
    ]
    return hooks
def default_hook_generator(cfg: AttrDict) -> List[ClassyHook]:
    """
    Build the list of hooks used in training, based on the user config.

    Some basic hooks are always added. Optional hooks, added on demand:
        - perf stats hook (MONITOR_PERF_STATS)
        - loss specific hooks (swav loss, swav momentum loss, deepclusterv2
          loss, moco loss), added only when that loss is selected
        - model complexity hook, enabled via
          MODEL.MODEL_COMPLEXITY.COMPUTE_COMPLEXITY = True
        - Tensorboard hook (TENSORBOARD_SETUP.USE_TENSORBOARD)

    Args:
        cfg (AttrDict): the user configuration.

    Returns:
        hooks (List[ClassyHook]): the hooks that will be used, in the order
        they were added.
    """
    hooks = []

    # ---- optional hooks, selected by the config ----
    if cfg.MONITOR_PERF_STATS:
        # a non-positive frequency is passed on as None
        perf_stat_freq = None
        if cfg.PERF_STAT_FREQUENCY > 0:
            perf_stat_freq = cfg.PERF_STAT_FREQUENCY
        hooks.append(LogPerfTimeMetricsHook(perf_stat_freq))

    loss_name = cfg.LOSS.name
    if loss_name == "swav_loss":
        hooks += [SwAVUpdateQueueScoresHook(), NormalizePrototypesHook()]
    elif loss_name == "swav_momentum_loss":
        momentum_hook = SwAVMomentumHook(
            cfg.LOSS["swav_momentum_loss"]["momentum"],
            cfg.LOSS["swav_momentum_loss"]["momentum_eval_mode_iter_start"],
            cfg.LOSS["swav_momentum_loss"]["crops_for_assign"],
        )
        hooks += [momentum_hook, SwAVMomentumNormalizePrototypesHook()]
    elif loss_name == "deepclusterv2_loss":
        hooks += [InitMemoryHook(), ClusterMemoryHook()]
    elif loss_name == "moco_loss":
        # batch shuffling is requested only when BN is not converted to SyncBN
        moco_hook = MoCoHook(
            cfg.LOSS["moco_loss"]["momentum"],
            shuffle_batch=(not cfg.MODEL.SYNC_BN_CONFIG.CONVERT_BN_TO_SYNC_BN),
        )
        hooks.append(moco_hook)

    if cfg.MODEL.MODEL_COMPLEXITY.COMPUTE_COMPLEXITY:
        hooks.append(SSLModelComplexityHook())
    if cfg.TENSORBOARD_SETUP.USE_TENSORBOARD:
        assert is_tensorboard_available(), "Tensorboard must be installed to use it."
        hooks.append(get_tensorboard_hook(cfg))

    # ---- hooks that are used irrespective of workflow type ----
    rolling_btime_freq = None
    if cfg.ROLLING_BTIME_FREQ > 0:
        rolling_btime_freq = cfg.ROLLING_BTIME_FREQ

    hooks += [
        CheckNanLossHook(),
        SetDataSamplerEpochHook(),
        FreezeParametersHook(),
        UpdateBatchesSeenHook(),
        UpdateTrainBatchTimeHook(),
        UpdateTestBatchTimeHook(),
        UpdateTrainIterationNumHook(),
        LogLossMetricsCheckpointHook(),
        LogLossLrEtaHook(rolling_btime_freq),
        LogGpuStatsHook(),
    ]
    return hooks
def default_hook_generator(cfg: AttrDict) -> List[ClassyHook]:
    """
    Build the list of hooks used in training, based on the user config.

    Some basic hooks are always added. Optional hooks, added on demand:
        - perf stats hook (HOOKS.PERF_STATS.MONITOR_PERF_STATS)
        - loss specific hooks, delegated to ``add_loss_hooks``
        - model complexity hook, enabled via
          HOOKS.MODEL_COMPLEXITY.COMPUTE_COMPLEXITY = True
        - GPU stats hook (HOOKS.LOG_GPU_STATS), GPU memory summary hook
          (HOOKS.MEMORY_SUMMARY.PRINT_MEMORY_SUMMARY) and memory dump hook
          (HOOKS.MEMORY_SUMMARY.DUMP_MEMORY_ON_EXCEPTION)
        - Tensorboard hook (HOOKS.TENSORBOARD_SETUP.USE_TENSORBOARD)
        - gradient clipping hook (MODEL.GRAD_CLIP.USE_GRAD_CLIP)
        - CUDA synchronize hook and profiling hook, when their own
          ``is_enabled`` check passes
        - model output mask hook (METERS.model_output_mask)
        - NaN checking hooks (HOOKS.CHECK_NAN)

    Args:
        cfg (AttrDict): the user configuration.

    Returns:
        hooks (List[ClassyHook]): the hooks that will be used, in the order
        they were added.
    """
    hooks = []

    # ---- optional hooks, selected by the config ----
    perf_cfg = cfg.HOOKS.PERF_STATS
    if perf_cfg.MONITOR_PERF_STATS:
        # a non-positive frequency is passed on as None
        perf_stat_freq = None
        if perf_cfg.PERF_STAT_FREQUENCY > 0:
            perf_stat_freq = perf_cfg.PERF_STAT_FREQUENCY
        hooks.append(LogPerfTimeMetricsHook(perf_stat_freq))

    # add the loss hooks based on the loss being used
    hooks = add_loss_hooks(hooks, cfg.LOSS, cfg)

    if cfg.HOOKS.MODEL_COMPLEXITY.COMPUTE_COMPLEXITY:
        hooks.append(SSLModelComplexityHook())
    if cfg.HOOKS.LOG_GPU_STATS:
        hooks.append(LogGpuStatsHook())
    if cfg.HOOKS.MEMORY_SUMMARY.PRINT_MEMORY_SUMMARY:
        hooks.append(LogGpuMemoryHook(cfg.HOOKS.MEMORY_SUMMARY.LOG_ITERATION_NUM))
    if cfg.HOOKS.MEMORY_SUMMARY.DUMP_MEMORY_ON_EXCEPTION:
        hooks.append(DumpMemoryOnException())
    if cfg.HOOKS.TENSORBOARD_SETUP.USE_TENSORBOARD:
        assert is_tensorboard_available(), (
            "Tensorboard must be installed to use it. Please install tensorboard using:"
            "If pip environment: `pip install tensorboard` "
            "If using conda and you prefer conda install of tensorboard: "
            "`conda install -c conda-forge tensorboard`"
        )
        hooks.append(get_tensorboard_hook(cfg))
    if cfg.MODEL.GRAD_CLIP.USE_GRAD_CLIP:
        grad_clip_hook = GradClipHook(
            norm_type=cfg.MODEL.GRAD_CLIP.NORM_TYPE,
            max_norm=cfg.MODEL.GRAD_CLIP.MAX_NORM,
        )
        hooks.append(grad_clip_hook)

    # ---- hooks that are used irrespective of workflow type ----
    rolling_btime_freq = None
    if cfg.HOOKS.PERF_STATS.ROLLING_BTIME_FREQ > 0:
        rolling_btime_freq = cfg.HOOKS.PERF_STATS.ROLLING_BTIME_FREQ

    if CudaSynchronizeHook.is_enabled(cfg.MODEL):
        hooks.append(CudaSynchronizeHook())
    if ProfilingHook.is_enabled(cfg.PROFILING):
        hooks.append(ProfilingHook(profiling_config=cfg.PROFILING))

    world_size = cfg.DISTRIBUTED.NUM_NODES * cfg.DISTRIBUTED.NUM_PROC_PER_NODE
    checkpoint_folder = get_checkpoint_folder(cfg)
    hooks += [
        SetDataSamplerEpochHook(),
        FreezeParametersHook(),
        LogLossMetricsCheckpointHook(world_size),
        LogLossLrEtaHook(checkpoint_folder, rolling_btime_freq),
    ]

    if cfg.METERS.model_output_mask:
        hooks.append(ModelOutputMaskHook())

    if cfg.HOOKS.CHECK_NAN:
        hooks += [CheckNanLossHook(), CheckNanModelOutputHook(world_size)]

    return hooks