def on_phase_end(self, task: "tasks.ClassyTask") -> None:
    """
    Called at the end of every epoch if the tensorboard hook is enabled.

    On the primary rank, every meter whose name contains "accuracy" is
    written to tensorboard as one scalar per (top_n, output index) pair.
    If parameter and/or gradient logging is enabled, histograms of the
    model parameters / parameter gradients are written for training phases
    and the CUDA peak-memory counters are reset afterwards.

    Args:
        task: the running task; provides the meters, the base model and
            the current training phase index.
    """
    # Log train/test accuracy
    if is_primary():
        phase_type = "Training" if task.train else "Testing"
        for meter in task.meters:
            if "accuracy" not in meter.name:
                continue
            for top_n, accuracies in meter.value.items():
                for i, acc in accuracies.items():
                    # The space after "Accuracy_" is part of the historical
                    # tag name; it is kept so existing runs stay comparable.
                    tag_name = f"{phase_type}/Accuracy_ {top_n}_Output_{i}"
                    self.tb_writer.add_scalar(
                        tag=tag_name,
                        scalar_value=round(acc, 5),
                        global_step=task.train_phase_idx,
                    )

    # Nothing else to do (including the memory-counter reset below) when
    # neither histogram kind is requested.
    if not (self.log_params or self.log_params_gradients):
        return

    if is_primary() and task.train:
        # Log the weights and bias at the end of the epoch
        if self.log_params:
            for name, parameter in task.base_model.named_parameters():
                self.tb_writer.add_histogram(
                    f"Parameters/{name}",
                    parameter,
                    global_step=task.train_phase_idx,
                )
        # Log the parameter gradients at the end of the epoch
        if self.log_params_gradients:
            for name, parameter in task.base_model.named_parameters():
                if parameter.grad is None:
                    continue
                try:
                    self.tb_writer.add_histogram(
                        f"Gradients/{name}",
                        parameter.grad,
                        global_step=task.train_phase_idx,
                    )
                except ValueError:
                    logging.info(
                        f"Gradient histogram empty for {name}, "
                        f"iteration {task.iteration}. Unable to "
                        f"log gradient."
                    )

    # Reset the GPU Memory counter.
    # reset_max_memory_allocated() / reset_max_memory_cached() are deprecated
    # wrappers that both forward to reset_peak_memory_stats(), so one call
    # replaces the pair without the deprecation warnings.
    # NOTE(review): nesting inferred from flattened source - confirm the reset
    # is meant to run on every rank rather than only inside the block above.
    if torch.cuda.is_available():
        torch.cuda.reset_peak_memory_stats()
def on_start(self, task) -> None:
    """
    Verify that the torchscript output folder exists before training starts.

    Only the primary rank performs the check, and it is skipped when the
    task has a truthy ``test_only`` attribute.

    Raises:
        FileNotFoundError: if ``self.torchscript_folder`` does not exist.
    """
    if not is_primary():
        return
    if getattr(task, "test_only", False):
        return
    if PathManager.exists(self.torchscript_folder):
        return
    raise FileNotFoundError(
        f"Torchscript folder '{self.torchscript_folder}' does not exist."
    )
def on_start(self, task: "tasks.ClassyTask") -> None:
    """
    Called at the start of training.

    When activation statistics logging is enabled, the primary rank attaches
    the activation watcher to the base model and aligns it with the task's
    current iteration.
    """
    if not (self.log_activation_statistics and is_primary()):
        return
    watcher = self.activation_watcher
    watcher.monitor(task.base_model)
    watcher.set_iteration(task.iteration)
def on_start(self, task: "tasks.ClassyTask") -> None:
    """
    Log the GPU stats once at start, on the primary rank of a CUDA task.
    """
    if not is_primary():
        return
    if task.device.type != "cuda":
        return
    # print the nvidia-smi stats
    log_gpu_stats()
def extract(self, output_folder: str) -> None:
    """
    Run multi-gpu feature extraction over every available data split.

    Only the model is prepared (via ``task.prepare_extraction``); it is then
    wrapped for distributed execution and the features of each split are
    extracted into ``output_folder``.

    Args:
        output_folder: destination passed through to the per-split extractor.

    Raises:
        AssertionError: if the task device is not CUDA.
    """
    # support feature extraction on gpu only.
    assert self.task.device.type == "cuda", "Set MACHINE.DEVICE = gpu"
    self.task.prepare_extraction(pin_memory=self.cfg.DATA.PIN_MEMORY)

    # Create distributed model
    self._add_dummy_layer()
    self.task.init_distributed_data_parallel_model()
    if is_primary():
        logging.info(f"Model is:\n {self.task.model}")

    # Names of the features to extract. When the user specifies none, fall
    # back to the full model output ("heads").
    feat_names = get_trunk_output_feature_names(self.cfg.MODEL)
    if not len(feat_names):
        feat_names = ["heads"]

    for split in self.task.available_splits:
        partition = split.lower()
        logging.info(f"============== Split: {split} =======================")
        logging.info(f"Extracting features for partition: {partition}")
        self.task.data_iterator = iter(self.task.dataloaders[partition])
        self._extract_split_features(feat_names, self.task, split, output_folder)
        logging.info(f"Done getting features for partition: {partition}")

    self._cleanup_task()
def on_step(self, task: "tasks.ClassyTask") -> None:
    """
    Log the GPU stats a second time, once the task's local iteration
    number reaches 50, on the primary rank of a CUDA task. Useful for
    monitoring memory usage after training has warmed up.
    """
    if not is_primary():
        return
    on_gpu = task.device.type == "cuda"
    if on_gpu and task.local_iteration_num == 50:
        log_gpu_stats()
def on_phase_end(self, task) -> None:
    """
    Accumulate the phase metrics and plot the learning curves on visdom.

    For the finished phase this appends to ``self.metrics``:
      * ``<phase>_loss``: sum of ``task.losses`` divided by
        (number of batches * batch size per replica),
      * ``<phase>_learning_rate``: the optimizer's current lr,
      * one entry per meter (mapping-valued meters are flattened with
        ``flatten_dict`` and stored under one key per flattened entry).

    The curves are plotted only at the end of a non-training phase and only
    on the primary rank. Nothing happens if the phase recorded no losses.

    Args:
        task: the running task (losses, optimizer, meters, base model).
    """
    # collections.MutableMapping was removed in Python 3.10; the ABC lives
    # in collections.abc. Imported locally so the block is self-contained.
    from collections.abc import MutableMapping

    phase_type = task.phase_type
    metrics = self.metrics
    batches = len(task.losses)

    if batches == 0:
        return

    # Loss for the phase
    loss = sum(task.losses) / (batches * task.get_batchsize_per_replica())
    metrics.setdefault(phase_type + "_loss", []).append(loss)

    # Optimizer LR for the phase
    optimizer_lr = task.optimizer.options_view.lr
    metrics.setdefault(phase_type + "_learning_rate", []).append(optimizer_lr)

    # Calculate meters
    for meter in task.meters:
        if isinstance(meter.value, MutableMapping):
            flattened_meters_dict = flatten_dict(meter.value, prefix=meter.name)
            for k, v in flattened_meters_dict.items():
                metrics.setdefault(phase_type + "_" + k, []).append(v)
        else:
            metrics.setdefault(phase_type + "_" + meter.name, []).append(
                meter.value
            )

    # update learning curve visualizations:
    # the window title uses "train"/"test" rather than task.phase_type.
    phase_type = "train" if task.train else "test"
    title = "%s-%s-%d" % (
        phase_type,
        task.base_model.__class__.__name__,
        task.base_model.model_depth,
    )
    title += self.title_suffix

    if not task.train and is_primary():
        logging.info("Plotting learning curves to visdom")
        plot_learning_curves(
            metrics, visdom_server=self.visdom, env=self.env, win=title, title=title
        )
def extract(
    self,
    output_folder: str,
    extract_features: bool = True,
    extract_predictions: bool = False,
) -> None:
    """
    Run multi-gpu extraction of features and/or predicted labels.

    The model is prepared through ``task.prepare_extraction`` and wrapped
    for distributed execution; the task is put in non-training mode and the
    ``on_start`` / ``on_end`` hooks are run around the per-split work.

    Args:
        output_folder: destination passed to the per-split extractors.
        extract_features: extract features for each split when True.
        extract_predictions: extract label predictions for each split
            when True.

    Raises:
        AssertionError: if the task device is not CUDA.
    """
    # support feature/label predictions extraction on gpu only.
    assert self.task.device.type == "cuda", "Set MACHINE.DEVICE = gpu"
    self.task.prepare_extraction(pin_memory=self.cfg.DATA.PIN_MEMORY)

    # Create distributed model
    self.task.add_dummy_layer()
    self.task.init_distributed_data_parallel_model()
    if is_primary():
        logging.info(f"Model is:\n {self.task.model}")

    # Names of the features to extract. When the user specifies none, fall
    # back to the full model output ("heads").
    feat_names = get_trunk_output_feature_names(self.cfg.MODEL)
    if not len(feat_names):
        feat_names = ["heads"]

    self.task.train = False
    self.task.run_hooks(SSLClassyHookFunctions.on_start.name)

    for split in self.task.available_splits:
        partition = split.lower()
        logging.info(f"============== Split: {split} =======================")
        self.task.data_iterator = iter(self.task.dataloaders[partition])

        if extract_features:
            logging.info(f"Extracting features for partition: {partition}")
            self._extract_split_features(
                feat_names, self.task, split, output_folder
            )
            logging.info(f"Done getting features for partition: {partition}")

        if extract_predictions:
            logging.info(f"Extracting predictions for partition: {partition}")
            self._extract_split_label_predictions(
                feat_names, self.task, split, output_folder
            )
            logging.info(f"Done getting predictions for partition: {partition}")

    self.task.run_hooks(SSLClassyHookFunctions.on_end.name)
    self._cleanup_task()
def _print_memory_summary(self, task: "tasks.ClassyTask", stage_name: str) -> None:
    """
    Log ``torch.cuda.memory_summary()`` for the given stage.

    Only logs on the primary rank of a CUDA task, and only when the task's
    local iteration number equals ``self.log_iteration_num``.

    Args:
        task: the running task.
        stage_name: label inserted in the log header.
    """
    if not is_primary():
        return
    if task.device.type != "cuda":
        return
    if task.local_iteration_num != self.log_iteration_num:
        return
    summary = torch.cuda.memory_summary()
    logging.info(
        f"========= Memory Summary at {stage_name} ======="
        f"\n{summary}\n"
    )
def on_phase_start(self, task) -> None:
    """
    Create and display a progress bar with 0 progress.

    The bar is sized to ``task.num_batches_per_phase`` and is only created
    on the primary rank.

    Raises:
        RuntimeError: if the progressbar module is not available.
    """
    if not progressbar_available:
        raise RuntimeError(
            "progressbar module not installed, cannot use ProgressBarHook"
        )

    if not is_primary():
        return

    self.bar_size = task.num_batches_per_phase
    self.batches = 0
    self.progress_bar = progressbar.ProgressBar(self.bar_size)
    self.progress_bar.start()
def on_phase_end(self, task: "tasks.ClassyTask") -> None:
    """
    Sync the EMA meters on every rank, then print and save them on the
    primary rank under the "ema" metric-key suffix.
    """
    ema_meters = task.ema_meters
    for ema_meter in ema_meters:
        ema_meter.sync_state()

    if not is_primary():
        return

    LogLossMetricsCheckpointHook.print_and_save_meters(
        task,
        task.train_phase_idx,
        task.ema_meters,
        metric_key_name_suffix="ema",
    )
def extract(self):
    """
    Run multi-gpu feature extraction and return the extracted features.

    Only the model is prepared (via ``task.prepare_extraction``). The
    features are extracted for every split listed in
    ``task.available_splits``; afterwards the task's data iterator and
    dataloaders are deleted and garbage-collected.

    Returns:
        dict mapping each lower-cased split name to the value returned by
        ``self._get_split_features`` for that split.

    Raises:
        AssertionError: if the task device is not CUDA.
    """
    # support feature extraction on gpu only.
    assert self.task.device.type == "cuda", "Set MACHINE.DEVICE = gpu"
    self.task.prepare_extraction(pin_memory=self.cfg.DATA.PIN_MEMORY)

    # In feature evaluation mode with both trunk and head frozen, DDP won't
    # work because the model has no trainable parameters. A dummy *head*
    # would alter the features, so a dummy layer is added to the model
    # instead and the model is copied to the gpu again afterwards.
    if self.task.base_model.is_fully_frozen_model():
        self.task.base_model.dummy_layer = torch.nn.Linear(4, 4)
        if self.task.device.type == "cuda":
            self.task.base_model = copy_model_to_gpu(self.task.base_model)

    self.task.init_distributed_data_parallel_model()
    if is_primary():
        logging.info(f"Model is:\n {self.task.model}")

    # Names of the features to extract. When the user specifies none, fall
    # back to the full model output ("heads").
    feat_names = get_trunk_output_feature_names(self.cfg.MODEL)
    if not len(feat_names):
        feat_names = ["heads"]

    features = {}
    for split in self.task.available_splits:
        partition = split.lower()
        logging.info(f"Extracting features for partition: {partition}")
        self.task.data_iterator = iter(self.task.dataloaders[partition])
        features[partition] = self._get_split_features(
            feat_names, self.cfg, self.task
        )
        logging.info(f"Done getting features for partition: {partition}")

    # Release the data pipeline objects held by the task.
    for attr_name in ("data_iterator", "dataloaders"):
        if hasattr(self.task, attr_name):
            delattr(self.task, attr_name)
            gc.collect()

    return features
def on_phase_end(self, task: "tasks.ClassyTask") -> None:
    """
    Called at the end of each phase: log the meters, then checkpoint.

    The meters are printed and saved on the primary rank. The checkpoint
    call uses mode "phase" with the frequency read from
    ``task.config["CHECKPOINT"]["CHECKPOINT_FREQUENCY"]``.
    """
    current_phase = task.train_phase_idx
    if is_primary():
        self._print_and_save_meters(task, current_phase)

    # NOTE(review): nesting inferred from flattened source - confirm the
    # checkpoint call is meant to run on every rank, not only the primary.
    frequency = task.config["CHECKPOINT"]["CHECKPOINT_FREQUENCY"]
    self._checkpoint_model(
        task,
        mode_frequency=frequency,
        mode_num=task.train_phase_idx,
        mode="phase",
    )
def on_phase_end(self, task) -> None:
    """
    Checkpoint the task every ``checkpoint_period`` phases.

    Only the primary rank checkpoints, and only phases whose type is in
    ``self.phase_types`` are counted. A checkpoint is therefore not
    necessarily written at the end of every phase.
    """
    if not (is_primary() and task.phase_type in self.phase_types):
        return

    self.phase_counter += 1
    if self.phase_counter % self.checkpoint_period == 0:
        name = CheckpointHook.get_checkpoint_name(task.phase_idx)
        self._save_checkpoint(task, name)
def load_and_broadcast_checkpoint(
    checkpoint_path: str, device: torch.device = CPU_DEVICE
) -> Optional[Dict]:
    """Load a checkpoint on the primary rank and broadcast it to all replicas.

    This is a collective operation which needs to be run in sync on all
    replicas. See :func:`load_checkpoint` for the arguments.

    Returns:
        The value returned by ``broadcast_object`` for the checkpoint; ranks
        other than the primary contribute ``None`` to the broadcast.
    """
    # Only the primary rank reads the file; the others receive the object.
    checkpoint = load_checkpoint(checkpoint_path, device) if is_primary() else None
    logging.info(f"Broadcasting checkpoint loaded from {checkpoint_path}")
    return broadcast_object(checkpoint)
def on_phase_start(self, task: "tasks.ClassyTask") -> None:
    """
    Called at the start of every epoch if the tensorboard hook is enabled.

    When parameter logging is on, writes one histogram per model parameter
    at global step -1, once, on the primary rank, at the start of the first
    training phase.
    """
    if not self.log_params:
        return

    # log the parameters just once, before training starts
    first_train_phase = is_primary() and task.train and task.train_phase_idx == 0
    if not first_train_phase:
        return

    for param_name, param in task.base_model.named_parameters():
        self.tb_writer.add_histogram(
            f"Parameters/{param_name}", param, global_step=-1
        )
def on_forward(self, task: "tasks.ClassyTask") -> None:
    """
    Called after every forward if the tensorboard hook is enabled.

    Writes one histogram per model parameter when parameter logging is on,
    the logging frequency is positive, this is the primary rank, the task is
    training, and the iteration is a multiple of the logging frequency.
    """
    if not self.log_params:
        return

    frequency = self.log_params_every_n_iterations
    should_log = (
        frequency > 0
        and is_primary()
        and task.train
        and task.iteration % self.log_params_every_n_iterations == 0
    )
    if not should_log:
        return

    for param_name, param in task.base_model.named_parameters():
        self.tb_writer.add_histogram(
            f"Parameters/{param_name}", param, global_step=task.iteration
        )
def init_distributed_data_parallel_model(self):
    """
    Initialize FSDP if needed.

    This method overloads the ClassificationTask class's method from
    ClassyVision.

    Does nothing unless this is a distributed training run. Otherwise:
      1. if the first head is a ``SwAVPrototypesHead``, each of its
         ``prototypes<j>`` layers is wrapped in FSDP;
      2. ``_is_root`` is reset to ``None`` on every FSDP module found in
         the trunk;
      3. the whole base model is wrapped in FSDP and stored as both
         ``self.base_model`` and ``self.distributed_model``.

    Raises:
        AssertionError: if no CUDA device index is set.
    """
    if not is_distributed_training_run():
        return

    # Make sure default cuda device is set. TODO (Min): we should enable FSDP
    # for 1-GPU as well, but the use case there is likely different.
    # I.e. perhaps we use it for cpu_offloading.
    assert get_cuda_device_index() > -1, "Distributed training not setup correctly"

    # The model might be already wrapped by FSDP internally. Check regnet_fsdp.py.
    # Here, we wrap it at the outer most level.
    fsdp_config = self.config["MODEL"]["FSDP_CONFIG"]
    if is_primary():
        logging.info(f"Using FSDP, config: {fsdp_config}")

    # First, wrap the head's prototype_i layers if it is SWAV.
    # TODO (Min): make this more general for different models, which may have
    # multiple heads. Only heads[0] is inspected here.
    head0 = self.base_model.heads[0]
    if isinstance(head0, SwAVPrototypesHead):
        for j in range(head0.nmb_heads):
            module = getattr(head0, "prototypes" + str(j))
            module = FSDP(module=module, **fsdp_config)
            setattr(head0, "prototypes" + str(j), module)

    # TODO (Min): We can load checkpoint, but it ends up setting the trunk's
    # _is_root flag to true. We need to set it back to None here.
    # Also, right now, the head's weight is only partially loaded from the
    # checkpoint because we dump the checkpoint after the head is wrapped,
    # but load it before it is wrapped.
    # For very big models, we need to re-work the checkpoint logic because we
    # don't have enough memory to load the entire model on one node. We need
    # to use the local_state_dict() API to load checkpoint shards.
    for module in self.base_model.trunk.modules():
        if isinstance(module, FSDP):
            module._is_root = None

    # Then, wrap the whole model. We replace the base_model since it is used
    # when checkpoint is taken.
    self.base_model = FSDP(module=self.base_model, **fsdp_config)
    self.distributed_model = self.base_model
def on_update(self, task: "tasks.ClassyTask") -> None:
    """
    Executed after the parameter update.

    During training, the primary rank delegates to
    ``self._log_training_epoch`` to log the training stats. The task's
    ``additional_log_data`` is cleared afterwards.

    The batch-time monitoring frequency is configured with
    cfg.HOOKS.PERF_STATS.PERF_STAT_FREQUENCY=N together with
    cfg.HOOKS.PERF_STATS.MONITOR_PERF_STATS = True.
    """
    # Only log during training and on primary
    should_log = is_primary() and task.train
    if should_log:
        self._log_training_epoch(task)

    # NOTE(review): nesting inferred from flattened source - confirm the
    # clear is meant to run on every rank and phase, not only when logging.
    task.additional_log_data.clear()
def on_phase_start(self, task) -> None:
    """
    Reset the per-phase buffers and log the initial parameters.

    On every rank the ``learning_rates``, ``wall_times`` and
    ``sample_fetch_times`` lists are re-initialised. On the primary rank the
    CUDA peak-memory counters are reset and, at the start of the first
    training phase, one histogram per model parameter is written at
    global step -1.
    """
    self.learning_rates = []
    self.wall_times = []
    self.sample_fetch_times = []

    if not is_primary():
        return

    # reset_max_memory_allocated() is a deprecated wrapper around
    # reset_peak_memory_stats(); call the supported API directly.
    if torch.cuda.is_available():
        torch.cuda.reset_peak_memory_stats()

    # log the parameters before training starts
    if task.train and task.train_phase_idx == 0:
        for name, parameter in task.base_model.named_parameters():
            self.tb_writer.add_histogram(
                f"Parameters/{name}", parameter, global_step=-1
            )
def __init__(self, checkpoint_folder: str, btime_freq: Optional[int] = None) -> None:
    """
    Args:
        checkpoint_folder: checkpoint directory where we will write the
            stdout.json
        btime_freq: if specified, the average batch time over the last
            ``btime_freq`` batches is logged as well.
    """
    super().__init__()
    self.btime_freq: Optional[int] = btime_freq
    self.json_stdout_logger = None

    if not is_primary():
        return

    # Only the primary rank owns the json log; it is opened in append mode
    # with a 10KB buffer and closed at interpreter exit.
    json_log = PathManager.open(
        f"{checkpoint_folder}/stdout.json",
        mode="a",
        buffering=10 * 1024,
    )
    self.json_stdout_logger = json_log
    atexit.register(json_log.close)
def _update(self, model, update_fn): base_model_state_dict = model.state_dict() for key, ema_params in self.module.state_dict().items(): model_params = base_model_state_dict[key] if self.device is not None: model_params = model_params.to(device=self.device) if ema_params.dtype != torch.float32: # This is modification from original code. if self.first_run and is_primary(): logging.warning( f"EMA: will be skipping key: {key} since it is of type: {ema_params.dtype}" # NOQA ) value = model_params else: value = update_fn(ema_params, model_params) ema_params.copy_(value) self.first_run = False
def on_start(self, task) -> None:
    """
    Plot the model on Tensorboard.

    Runs on the primary rank only and is best-effort: any exception raised
    while plotting is logged (warning plus debug traceback) and swallowed so
    that a plotting failure never aborts training.
    """
    if not is_primary():
        return

    try:
        # Show model in tensorboard:
        logging.info("Showing model graph in TensorBoard...")
        plot_model(
            task.base_model,
            size=task.base_model.input_shape,
            # input_key is optional on the model; default to None.
            input_key=getattr(task.base_model, "input_key", None),
            writer=self.tb_writer,
        )
    except Exception:
        # logging.warn is a deprecated alias of logging.warning.
        logging.warning("Unable to plot model to tensorboard")
        logging.debug("Exception encountered:", exc_info=True)
def _sync_and_print_meters(self, task):
    """
    Sync all meters across ranks, then record and log them on the primary.

    Each meter value is appended to ``task.metrics[meter.name]``. During
    training, meters are recorded only when
    ``task.config["METERS"]["enable_training_meter"]`` is truthy.
    """
    for meter in task.meters:
        meter.sync_state()
    logging.info("Meters synced")

    if not is_primary():
        return

    rank, _ = get_machine_local_and_dist_rank()
    for meter in task.meters:
        if len(task.meters) == 0:
            continue
        if task.train and not task.config["METERS"]["enable_training_meter"]:
            continue

        meter_value = meter.value
        metric_key = f"{meter.name}"
        if metric_key not in task.metrics:
            task.metrics[metric_key] = []
        task.metrics[metric_key].append(meter_value)
        logging.info(f"Rank: {rank}, name: {metric_key}, value: {meter_value}")
def on_phase_end(self, task: "tasks.ClassyTask") -> None:
    """
    Called at the end of every epoch if the tensorboard hook is enabled.

    Log histograms of the model parameters and/or parameter gradients, as
    set by the user in the tensorboard configuration, for training phases
    on the primary rank. Also resets the CUDA peak-memory counters. Nothing
    is done (including the reset) when neither histogram kind is enabled.

    Args:
        task: the running task; provides the base model, the iteration and
            the current training phase index.
    """
    if not (self.log_params or self.log_params_gradients):
        return

    if is_primary() and task.train:
        # Log the weights and bias at the end of the epoch
        if self.log_params:
            for name, parameter in task.base_model.named_parameters():
                self.tb_writer.add_histogram(
                    f"Parameters/{name}",
                    parameter,
                    global_step=task.train_phase_idx,
                )
        # Log the parameter gradients at the end of the epoch
        if self.log_params_gradients:
            for name, parameter in task.base_model.named_parameters():
                if parameter.grad is None:
                    continue
                try:
                    self.tb_writer.add_histogram(
                        f"Gradients/{name}",
                        parameter.grad,
                        global_step=task.train_phase_idx,
                    )
                except ValueError:
                    logging.info(
                        f"Gradient histogram empty for {name}, "
                        f"iteration {task.iteration}. Unable to "
                        f"log gradient."
                    )

    # Reset the GPU Memory counter.
    # reset_max_memory_allocated() / reset_max_memory_cached() are deprecated
    # wrappers that both forward to reset_peak_memory_stats(), so one call
    # replaces the pair without the deprecation warnings.
    # NOTE(review): nesting inferred from flattened source - confirm the reset
    # is meant to run on every rank rather than only inside the block above.
    if torch.cuda.is_available():
        torch.cuda.reset_peak_memory_stats()
def extract_clusters(self, output_folder: str) -> Dict[str, Dict[int, int]]:
    """
    Multi-gpu cluster-assignment extraction for pre-trained models based on
    clusterization (SwAV, DeepCluster, etc).

    Args:
        output_folder: destination passed to the per-split extractor.

    Returns:
        The result of ``self._merge_cluster_assignments`` over the per-split
        assignments: for each split, a map from image index to cluster index.

    Raises:
        AssertionError: if the task device is not CUDA, or the base model is
            not a clustering model.
    """
    # Support feature extraction on gpu only.
    assert self.task.device.type == "cuda", "Set MACHINE.DEVICE = gpu"
    self.task.prepare_extraction(pin_memory=self.cfg.DATA.PIN_MEMORY)

    # Assert that the model support extract of clusters
    supports_clusters = self.task.base_model.is_clustering_model()
    assert (
        supports_clusters
    ), "Extracting clusters is only available for cluster based pre-training methods"

    # Create distributed model
    self.task.add_dummy_layer()
    self.task.init_distributed_data_parallel_model()
    if is_primary():
        logging.info(f"Model is:\n {self.task.model}")

    # Compute the cluster assignment on each worker in parallel
    cluster_assignment = {}
    for split in self.task.available_splits:
        progress_msg = f"Extracting cluster assignment for partition: {split}"
        logging.info(progress_msg)
        cluster_assignment[split] = self._get_cluster_assignment_for_split(
            self.task, split, output_folder=output_folder
        )
        logging.info("Done: " + progress_msg)
    self._cleanup_task()

    # Merge the cluster assignments and group by cluster
    return self._merge_cluster_assignments(cluster_assignment)
def on_loss_and_meter(self, task: "tasks.ClassyTask") -> None:
    """
    Log phase type, epoch, iteration, learning rate and last-batch loss.

    Runs on the primary rank only. The loss and a scalar learning rate are
    rounded to 5 decimals; a learning rate given as a set is logged as a
    list.
    """
    if not is_primary():
        return

    phase_type = "train" if task.train else "test"
    train_phase_idx = task.train_phase_idx
    iteration = task.iteration
    loss_val = round(task.last_batch.loss.data.cpu().item(), 5)

    current_lr = task.optimizer.options_view.lr
    if isinstance(current_lr, set):
        lr_val = list(task.optimizer.options_view.lr)
    else:
        lr_val = round(task.optimizer.options_view.lr, 5)

    fields = [
        f"Phase Type: {phase_type}; ",
        f"[ep: {train_phase_idx}] ",
        f"iter: {iteration}; ",
        f"lr: {lr_val}; ",
        f"loss: {loss_val}; ",
    ]
    logging.info("".join(fields))
def _checkpoint_model(self, task, train_phase_idx, mode_frequency, mode_num, mode="phase"):
    """
    Checkpoint model. Can be called in 3 possible scenarios:
    1. If training becomes NaN, then we checkpoint the model to facilitate debugging
    2. After every N epochs (CHECKPOINT_FREQ), model state is checkpointed.
    3. If user wants to checkpoint during the epoch (ie. after every few training
       iterations, the model state is checkpointed.)

    A checkpoint is only written during training, when this is either the
    final training phase (mode "phase") or ``is_checkpoint_phase`` says so.
    The checkpoint is written by the primary rank, or by every rank when the
    base model is an FSDP instance (sharded checkpoint).

    Args:
        task: Self-supervision task that hold information about training iteration,
              epoch number etc.
        train_phase_idx (int): current training phase number. Starts from 0
        mode_frequency (int): mode can be "phase" or "iteration". Frequency
                              of checkpointing for the given mode
        mode_num (int): for the checkpointing mode (phase or iteration), the number
                        of phase or iteration at which checkpointing is being done
        mode (str): "phase" or "iteration"; defaults to "phase".
    """
    phase_idx = task.phase_idx
    # num_train_phases = num_epochs * num_phases_per_epoch
    # For OSS use, num_train_phases will be equal to num_epochs
    num_train_phases = task.num_train_phases

    # check if we need to checkpoint this phase
    is_checkpointing_phase = is_checkpoint_phase(mode_num, mode_frequency, train_phase_idx, num_train_phases, mode)
    is_final_train_phase = ((train_phase_idx == (num_train_phases - 1)) and task.train and mode == "phase")

    # handle checkpoint:
    if task.train and (is_final_train_phase or is_checkpointing_phase):
        # - if sharded state consolidate the state
        # /!\ All the ranks have to participate
        # NOTE(review): consolidation is skipped when mode == "phase" - confirm
        # this is intended for optimizers exposing consolidate_state_dict.
        if hasattr(task.optimizer, "consolidate_state_dict") and mode != "phase":
            logging.info(
                f"[{mode}: {mode_num}] Consolidating sharded state on all replicas"
            )
            task.optimizer.consolidate_state_dict()

        # Depending on whether we are in FSDP mode or not
        # - save the checkpoint on the primary rank
        # - save the sharded checkpoint on all ranks
        if is_primary() or isinstance(task.base_model, FSDP):
            checkpoint_folder = task.checkpoint_folder
            logging.info(
                f"[{mode}: {mode_num}] Saving checkpoint to {checkpoint_folder}"
            )
            model_state_dict = task.get_classy_state()

            # phase_idx is already incremented at the beginning of phase but if we
            # are checkpointing at an iteration in the middle of phase, we should not
            # save the incremented phase_idx as it will incorrectly assume that model
            # trained for that phase already.
            if mode == "iteration":
                model_state_dict[
                    "phase_idx"] = model_state_dict["phase_idx"] - 1
                if task.train:
                    train_phase_idx = train_phase_idx - 1
                    model_state_dict["train_phase_idx"] = train_phase_idx
                restart_phase = phase_idx - 1
                restart_iteration = task.iteration
            else:
                # When loading from a phase checkpoint: the phase index is
                # stored as-is.
                restart_phase = phase_idx
                restart_iteration = task.iteration

            checkpoint_content = {
                "phase_idx": restart_phase,
                "iteration": restart_iteration,
                "loss": task.loss.state_dict(),
                "iteration_num": task.local_iteration_num,
                "train_phase_idx": train_phase_idx,
                "classy_state_dict": model_state_dict,
            }

            checkpoint_writer = CheckpointWriter(
                checkpoint_folder=checkpoint_folder,
                is_final_train_phase=is_final_train_phase,
                mode=mode,
                mode_num=mode_num,
                backend=task.config["CHECKPOINT"]["BACKEND"],
            )

            if isinstance(task.base_model, FSDP):
                # Sharded checkpoint: each rank saves its own shard, keyed by
                # the second value returned by get_machine_local_and_dist_rank().
                _, rank = get_machine_local_and_dist_rank()
                checkpoint_writer.save_sharded_checkpoint(
                    content=checkpoint_content,
                    shard_rank=rank,
                    world_size=self.world_size,
                )
            else:
                checkpoint_writer.save_consolidated_checkpoint(
                    checkpoint_content)
def on_update(self, task: "tasks.ClassyTask") -> None:
    """
    Executed after the parameter update. If the current phase is training,
    and it's a logging iteration, we compute and log several helpful training
    stats to keep track of ongoing training.

    A logging iteration is iteration 1, any multiple of
    ``task.config["LOG_FREQUENCY"]``, or any multiple of 5 up to iteration 100.
    The stats are logged in the legacy text format and also written as one
    json line to ``self.json_stdout_logger``.

    For monitoring the batch size (average training iteration time), we allow
    monitoring the stats (optionally) for every N iterations to get better
    idea about the batch time and training eta.

    Set the btime_freq input using cfg.HOOKS.PERF_STATS.PERF_STAT_FREQUENCY=N
    ensuring that cfg.HOOKS.PERF_STATS.MONITOR_PERF_STATS = True.
    """
    phase_type = "train" if task.train else "test"
    if is_primary() and phase_type == "train":
        train_phase_idx = task.train_phase_idx
        log_freq = task.config["LOG_FREQUENCY"]
        iteration = task.iteration

        # Peak allocated GPU memory converted from bytes to MiB; -1 without CUDA.
        if torch.cuda.is_available():
            peak_mem_used = int(torch.cuda.max_memory_allocated() / 1024.0 / 1024.0)
        else:
            peak_mem_used = -1

        if ((iteration == 1) or (iteration % log_freq == 0) or (iteration <= 100 and iteration % 5 == 0)):
            loss_val = round(task.last_batch.loss.data.cpu().item(), 5)
            # Fall back to a single zero so the average below is defined
            # before any batch time has been recorded.
            if len(task.batch_time) > 0:
                batch_times = task.batch_time
            else:
                batch_times = [0]

            avg_time = sum(batch_times) / len(batch_times)

            # ETA = average batch time * remaining iterations.
            eta_secs = avg_time * (task.max_iteration - iteration)
            eta_string = str(datetime.timedelta(seconds=int(eta_secs)))
            if isinstance(task.optimizer.options_view.lr, set):
                lr_val = list(task.optimizer.options_view.lr)
            else:
                lr_val = round(task.optimizer.options_view.lr, 5)
            batch_time = int(1000.0 * avg_time)
            rank = get_rank()
            # Insertion order matters: it is the order of the fields in the
            # text log line built below.
            log_data = {
                "Rank": rank,
                "ep": train_phase_idx,
                "iter": iteration,
                "lr": lr_val,
                "loss": loss_val,
                "btime(ms)": batch_time,
                "eta": eta_string,
                "peak_mem(M)": peak_mem_used,
            }

            if iteration == 1:
                # Set max iterations. Currently used in benchmark_suite_scheduler.py
                log_data["max_iterations"] = task.max_iteration

            # Rolling stats over the last btime_freq batches, once enough
            # batch times have been collected.
            if self.btime_freq and len(batch_times) >= self.btime_freq:
                rolling_avg_time = (sum(batch_times[-self.btime_freq:]) / self.btime_freq)
                rolling_eta_secs = int(rolling_avg_time * (task.max_iteration - iteration))
                rolling_eta_str = str(
                    datetime.timedelta(seconds=int(rolling_eta_secs)))
                rolling_btime = int(1000.0 * rolling_avg_time)
                log_data[
                    f"btime({self.btime_freq}iters)(ms)"] = rolling_btime
                log_data["rolling_eta"] = rolling_eta_str

            # to maintain the backwards compatibility with the log.txt
            # logs, we convert the json to the previous format.
            # the stdout.json can be used to use the json format of logs.
            stdout_data = ""
            for key, value in log_data.items():
                # "ep" is rendered in brackets; every other field as "key: value; ".
                stdout_data = (f"{stdout_data}[{key}: {value}] " if key == "ep" else f"{stdout_data}{key}: {value}; ")
            logging.info(stdout_data.strip())
            self.json_stdout_logger.write(json.dumps(log_data) + "\n")
def on_phase_end(self, task) -> None:
    """Finish the progress bar at the end of the phase (primary rank only)."""
    if not is_primary() or self.progress_bar is None:
        return
    self.progress_bar.finish()