def on_start(self, task: "tasks.ClassyTask") -> None: if not is_master() or getattr(task, "test_only", False): return if not PathManager.exists(self.checkpoint_folder): err_msg = "Checkpoint folder '{}' does not exist.".format( self.checkpoint_folder) raise FileNotFoundError(err_msg)
def on_start(self, task) -> None:
    """Make sure the torchscript folder exists before training starts."""
    if not is_master() or getattr(task, "test_only", False):
        return
    if not PathManager.exists(self.torchscript_folder):
        err_msg = "Torchscript folder '{}' does not exist.".format(
            self.torchscript_folder
        )
        raise FileNotFoundError(err_msg)
def on_update(
    self, task: "tasks.ClassyTask", local_variables: Dict[str, Any]
) -> None:
    """Update the progress bar with the number of batches processed."""
    if is_master() and self.progress_bar is not None:
        self.batches += 1
        self.progress_bar.update(min(self.batches, self.bar_size))
def on_phase_end(
    self, task: "tasks.ClassyTask", local_variables: Dict[str, Any]
) -> None:
    """Add the losses and learning rates to tensorboard."""
    if self.learning_rates is None:
        logging.warning("learning_rates is not initialized")
        return

    batches = len(task.losses)
    if batches == 0 or not is_master():
        return

    phase_type = task.phase_type
    phase_type_idx = task.train_phase_idx if task.train else task.eval_phase_idx
    logging.info(f"Plotting to Tensorboard for {phase_type} phase {phase_type_idx}")

    loss_key = f"{phase_type}_loss"
    learning_rate_key = f"{phase_type}_learning_rate_updates"

    if task.train:
        for loss, learning_rate, global_step, wall_time in zip(
            task.losses, self.learning_rates, self.num_steps_global, self.wall_times
        ):
            loss /= task.get_batchsize_per_replica()
            self.tb_writer.add_scalar(
                loss_key, loss, global_step=global_step, walltime=wall_time
            )
            self.tb_writer.add_scalar(
                learning_rate_key,
                learning_rate,
                global_step=global_step,
                walltime=wall_time,
            )

    loss_avg = sum(task.losses) / (batches * task.get_batchsize_per_replica())
    loss_key = f"avg_{phase_type}_loss"
    self.tb_writer.add_scalar(loss_key, loss_avg, global_step=phase_type_idx)

    # plot meters which return a dict
    for meter in task.meters:
        if not isinstance(meter.value, dict):
            logging.warning(f"Skipping meter {meter.name} with value: {meter.value}")
            continue
        for name, value in meter.value.items():
            if isinstance(value, float):
                meter_key = f"{phase_type}_{meter.name}_{name}"
                self.tb_writer.add_scalar(
                    meter_key, value, global_step=phase_type_idx
                )
            else:
                logging.warning(
                    f"Skipping meter name {meter.name}_{name} with value: {value}"
                )

    logging.info("Done plotting to Tensorboard")
def on_phase_end(self, task: "tasks.ClassyTask", local_variables: Dict[str, Any]) -> None: """ Plot the metrics on visdom. """ phase_type = task.phase_type metrics = self.metrics batches = len(task.losses) if batches == 0: return # Loss for the phase loss = sum(task.losses) / (batches * task.get_batchsize_per_replica()) loss_key = phase_type + "_loss" if loss_key not in metrics: metrics[loss_key] = [] metrics[loss_key].append(loss) # Optimizer LR for the phase optimizer_lr = task.optimizer.parameters.lr lr_key = phase_type + "_learning_rate" if lr_key not in metrics: metrics[lr_key] = [] metrics[lr_key].append(optimizer_lr) # Calculate meters for meter in task.meters: if isinstance(meter.value, collections.MutableMapping): flattened_meters_dict = flatten_dict(meter.value, prefix=meter.name) for k, v in flattened_meters_dict.items(): metric_key = phase_type + "_" + k if metric_key not in metrics: metrics[metric_key] = [] metrics[metric_key].append(v) else: metric_key = phase_type + "_" + meter.name if metric_key not in metrics: metrics[metric_key] = [] metrics[metric_key].append(meter.value) # update learning curve visualizations: phase_type = "train" if task.train else "test" title = "%s-%s-%d" % ( phase_type, task.base_model.__class__.__name__, task.base_model.model_depth, ) title += self.title_suffix if not task.train and is_master(): logging.info("Plotting learning curves to visdom") plot_learning_curves(metrics, visdom_server=self.visdom, env=self.env, win=title, title=title)
def on_phase_start(self, task) -> None: """Create and display a progress bar with 0 progress.""" if not progressbar_available: raise RuntimeError( "progressbar module not installed, cannot use ProgressBarHook") if is_master(): self.bar_size = task.num_batches_per_phase self.batches = 0 self.progress_bar = progressbar.ProgressBar(self.bar_size) self.progress_bar.start()
def on_phase_end(self, task: "tasks.ClassyTask") -> None: """Checkpoint the task every checkpoint_period phases. We do not necessarily checkpoint the task at the end of every phase. """ if not is_master() or task.phase_type not in self.phase_types: return self.phase_counter += 1 if self.phase_counter % self.checkpoint_period != 0: return checkpoint_name = "model_phase-{phase}_end.torch".format(phase=task.phase_idx) self._save_checkpoint(task, checkpoint_name)
def on_phase_end(self, task) -> None: """Checkpoint the task every checkpoint_period phases. We do not necessarily checkpoint the task at the end of every phase. """ if not is_master() or task.phase_type not in self.phase_types: return self.phase_counter += 1 if self.phase_counter % self.checkpoint_period != 0: return checkpoint_name = CheckpointHook.get_checkpoint_name(task.phase_idx) self._save_checkpoint(task, checkpoint_name)
def load_and_broadcast_checkpoint(
    checkpoint_path: str, device: torch.device = CPU_DEVICE
) -> Optional[Dict]:
    """Loads a checkpoint on master and broadcasts it to all replicas.

    This is a collective operation which needs to be run in sync on all
    replicas.

    See :func:`load_checkpoint` for the arguments.
    """
    if is_master():
        checkpoint = load_checkpoint(checkpoint_path, device)
    else:
        checkpoint = None
    logging.info(f"Broadcasting checkpoint loaded from {checkpoint_path}")
    return broadcast_object(checkpoint)
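# `broadcast_object` is called above but not defined in this section. One way
# to implement it with PyTorch's collective API is sketched below; this is an
# illustrative assumption, not the library's actual implementation, and it
# requires torch.distributed to be initialized.
import torch.distributed as dist

def broadcast_object(obj, src=0):
    """Broadcast an arbitrary picklable object from `src` to all replicas."""
    object_list = [obj]
    # broadcast_object_list fills `object_list` in place on non-src ranks
    dist.broadcast_object_list(object_list, src=src)
    return object_list[0]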
def on_phase_start(self, task) -> None: """Initialize losses and learning_rates.""" self.learning_rates = [] self.wall_times = [] self.num_updates = [] self.step_idx = 0 if not is_master(): return # log the parameters before training starts if task.train and task.train_phase_idx == 0: for name, parameter in task.base_model.named_parameters(): self.tb_writer.add_histogram(f"Parameters/{name}", parameter, global_step=-1)
def on_start(self, task: "tasks.ClassyTask") -> None: """ Plot the model on Tensorboard. """ if is_master(): try: # Show model in tensorboard: logging.info("Showing model graph in TensorBoard...") plot_model( task.base_model, size=task.base_model.input_shape, input_key=task.base_model.input_key if hasattr( task.base_model, "input_key") else None, writer=self.tb_writer, ) except Exception: logging.warn( "Unable to plot model to tensorboard. Exception: ", exc_info=True)
def on_phase_end(self, task: "tasks.ClassyTask") -> None: """Clear the progress bar at the end of the phase.""" if is_master() and self.progress_bar is not None: self.progress_bar.finish()
def on_step(self, task: "tasks.ClassyTask") -> None:
    """Update the progress bar with the number of batches processed."""
    if task.train and is_master() and self.progress_bar is not None:
        self.batches += 1
        self.progress_bar.update(min(self.batches, self.bar_size))
def on_phase_end(self, task: "tasks.ClassyTask") -> None: """Add the losses and learning rates to tensorboard.""" if self.learning_rates is None: logging.warning("learning_rates is not initialized") return batches = len(task.losses) if batches == 0 or not is_master(): return phase_type = task.phase_type phase_type_idx = task.train_phase_idx if task.train else task.eval_phase_idx logging.info(f"Plotting to Tensorboard for {phase_type} phase {phase_type_idx}") phase_type = task.phase_type loss_key = f"{phase_type}_loss" learning_rate_key = f"{phase_type}_learning_rate_updates" if task.train: for loss, learning_rate, global_step, wall_time in zip( task.losses, self.learning_rates, self.num_steps_global, self.wall_times ): loss /= task.get_batchsize_per_replica() self.tb_writer.add_scalar( loss_key, loss, global_step=global_step, walltime=wall_time ) self.tb_writer.add_scalar( learning_rate_key, learning_rate, global_step=global_step, walltime=wall_time, ) loss_avg = sum(task.losses) / (batches * task.get_batchsize_per_replica()) loss_key = "avg_{phase_type}_loss".format(phase_type=task.phase_type) self.tb_writer.add_scalar(loss_key, loss_avg, global_step=phase_type_idx) # plot meters which return a dict for meter in task.meters: if not isinstance(meter.value, dict): log.warn(f"Skipping meter {meter.name} with value: {meter.value}") continue for name, value in meter.value.items(): if isinstance(value, float): meter_key = f"{phase_type}_{meter.name}_{name}" self.tb_writer.add_scalar( meter_key, value, global_step=phase_type_idx ) else: log.warn( f"Skipping meter name {meter.name}_{name} with value: {value}" ) continue if hasattr(task, "perf_log"): for perf in task.perf_log: phase_idx = perf["phase_idx"] tag = perf["tag"] for metric_name, metric_value in perf.items(): if metric_name in ["phase_idx", "tag"]: continue self.tb_writer.add_scalar( f"Speed/{tag}/{metric_name}", metric_value, global_step=phase_idx, ) # flush so that the plots aren't lost if training crashes soon after self.tb_writer.flush() logging.info(f"Done plotting to Tensorboard")
def on_end(self, task) -> None:
    """Save the model as torchscript at the end of training."""
    if not is_master() or getattr(task, "test_only", False):
        return
    self.save_torchscript(task)
def on_phase_end(self, task: "tasks.ClassyTask", local_variables: Dict[str, Any]) -> None: """Clear the progress bar at the end of the phase.""" if is_master() and self.progress_bar is not None: self.progress_bar.finish()
def on_phase_end(self, task) -> None: """Add the losses and learning rates to tensorboard.""" if self.learning_rates is None: logging.warning("learning_rates is not initialized") return batches = len(task.losses) if batches == 0 or not is_master(): return phase_type = task.phase_type phase_type_idx = task.train_phase_idx if task.train else task.eval_phase_idx logging.info( f"Plotting to Tensorboard for {phase_type} phase {phase_type_idx}") phase_type = task.phase_type learning_rate_key = f"Learning Rate/{phase_type}" if task.train: for learning_rate, global_step, wall_time in zip( self.learning_rates, self.num_updates, self.wall_times): self.tb_writer.add_scalar( learning_rate_key, learning_rate, global_step=global_step, walltime=wall_time, ) for name, parameter in task.base_model.named_parameters(): self.tb_writer.add_histogram(f"Parameters/{name}", parameter, global_step=phase_type_idx) if torch.cuda.is_available() and task.train: self.tb_writer.add_scalar( "Memory/peak_allocated", torch.cuda.max_memory_allocated(), global_step=phase_type_idx, ) loss_avg = sum( task.losses) / (batches * task.get_batchsize_per_replica()) loss_key = "Losses/{phase_type}".format(phase_type=task.phase_type) self.tb_writer.add_scalar(loss_key, loss_avg, global_step=phase_type_idx) # plot meters which return a dict for meter in task.meters: if not isinstance(meter.value, dict): log.warn( f"Skipping meter {meter.name} with value: {meter.value}") continue for name, value in meter.value.items(): if isinstance(value, float): meter_key = f"Meters/{phase_type}/{meter.name}/{name}" self.tb_writer.add_scalar(meter_key, value, global_step=phase_type_idx) else: log.warn( f"Skipping meter name {meter.name}/{name} with value: {value}" ) continue if hasattr(task, "perf_log"): for perf in task.perf_log: phase_idx = perf["phase_idx"] tag = perf["tag"] for metric_name, metric_value in perf.items(): if metric_name in ["phase_idx", "tag"]: continue self.tb_writer.add_scalar( f"Speed/{tag}/{metric_name}", metric_value, global_step=phase_idx, ) # flush so that the plots aren't lost if training crashes soon after self.tb_writer.flush() logging.info(f"Done plotting to Tensorboard")