Example #1
 def on_start(self, task: "tasks.ClassyTask") -> None:
     if not is_master() or getattr(task, "test_only", False):
         return
     if not PathManager.exists(self.checkpoint_folder):
         err_msg = "Checkpoint folder '{}' does not exist.".format(
             self.checkpoint_folder)
         raise FileNotFoundError(err_msg)
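Every hook in these examples guards its side effects (filesystem checks, plotting, checkpointing) with is_master(), so that only one replica performs them during distributed training. As a rough illustration of that pattern only, and not the library's actual implementation of is_master(), a rank-0 check with torch.distributed could look like this:

import torch.distributed as dist

def _is_rank_zero() -> bool:
    # Hypothetical stand-in for is_master(): treat the process as master
    # when distributed training is not initialized, or when this replica
    # has global rank 0.
    if not dist.is_available() or not dist.is_initialized():
        return True
    return dist.get_rank() == 0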
Example #2
 def on_start(self, task) -> None:
     if not is_master() or getattr(task, "test_only", False):
         return
     if not PathManager.exists(self.torchscript_folder):
         err_msg = "Torchscript folder '{}' does not exist.".format(
             self.torchscript_folder)
         raise FileNotFoundError(err_msg)
Example #3
 def on_update(
     self, task: "tasks.ClassyTask", local_variables: Dict[str, Any]
 ) -> None:
     """Update the progress bar with the batch size."""
     if is_master() and self.progress_bar is not None:
         self.batches += 1
         self.progress_bar.update(min(self.batches, self.bar_size))
Example #4
    def on_phase_end(
        self, task: "tasks.ClassyTask", local_variables: Dict[str, Any]
    ) -> None:
        """Add the losses and learning rates to tensorboard."""
        if self.learning_rates is None:
            logging.warning("learning_rates is not initialized")
            return

        batches = len(task.losses)
        if batches == 0 or not is_master():
            return

        phase_type = task.phase_type
        phase_type_idx = task.train_phase_idx if task.train else task.eval_phase_idx

        logging.info(f"Plotting to Tensorboard for {phase_type} phase {phase_type_idx}")

        loss_key = f"{phase_type}_loss"
        learning_rate_key = f"{phase_type}_learning_rate_updates"

        if task.train:
            for loss, learning_rate, global_step, wall_time in zip(
                task.losses, self.learning_rates, self.num_steps_global, self.wall_times
            ):
                loss /= task.get_batchsize_per_replica()
                self.tb_writer.add_scalar(
                    loss_key, loss, global_step=global_step, walltime=wall_time
                )
                self.tb_writer.add_scalar(
                    learning_rate_key,
                    learning_rate,
                    global_step=global_step,
                    walltime=wall_time,
                )

        loss_avg = sum(task.losses) / (batches * task.get_batchsize_per_replica())

        loss_key = "avg_{phase_type}_loss".format(phase_type=task.phase_type)
        self.tb_writer.add_scalar(loss_key, loss_avg, global_step=phase_type_idx)

        # plot meters which return a dict
        for meter in task.meters:
            if not isinstance(meter.value, dict):
                logging.warning(f"Skipping meter {meter.name} with value: {meter.value}")
                continue
            for name, value in meter.value.items():
                if isinstance(value, float):
                    meter_key = f"{phase_type}_{meter.name}_{name}"
                    self.tb_writer.add_scalar(
                        meter_key, value, global_step=phase_type_idx
                    )
                else:
                    logging.warning(
                        f"Skipping meter name {meter.name}_{name} with value: {value}"
                    )
                    continue

        logging.info(f"Done plotting to Tensorboard")
Example #5
    def on_phase_end(self, task: "tasks.ClassyTask",
                     local_variables: Dict[str, Any]) -> None:
        """
        Plot the metrics on visdom.
        """
        phase_type = task.phase_type
        metrics = self.metrics
        batches = len(task.losses)

        if batches == 0:
            return

        # Loss for the phase
        loss = sum(task.losses) / (batches * task.get_batchsize_per_replica())
        loss_key = phase_type + "_loss"
        if loss_key not in metrics:
            metrics[loss_key] = []
        metrics[loss_key].append(loss)

        # Optimizer LR for the phase
        optimizer_lr = task.optimizer.parameters.lr
        lr_key = phase_type + "_learning_rate"
        if lr_key not in metrics:
            metrics[lr_key] = []
        metrics[lr_key].append(optimizer_lr)

        # Calculate meters
        for meter in task.meters:
            if isinstance(meter.value, collections.abc.MutableMapping):
                flattened_meters_dict = flatten_dict(meter.value,
                                                     prefix=meter.name)
                for k, v in flattened_meters_dict.items():
                    metric_key = phase_type + "_" + k
                    if metric_key not in metrics:
                        metrics[metric_key] = []
                    metrics[metric_key].append(v)
            else:
                metric_key = phase_type + "_" + meter.name
                if metric_key not in metrics:
                    metrics[metric_key] = []
                metrics[metric_key].append(meter.value)

        # update learning curve visualizations:
        phase_type = "train" if task.train else "test"
        title = "%s-%s-%d" % (
            phase_type,
            task.base_model.__class__.__name__,
            task.base_model.model_depth,
        )
        title += self.title_suffix

        if not task.train and is_master():
            logging.info("Plotting learning curves to visdom")
            plot_learning_curves(metrics,
                                 visdom_server=self.visdom,
                                 env=self.env,
                                 win=title,
                                 title=title)
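This hook assumes self.visdom is a connected visdom client. A minimal sketch, assuming a locally running visdom server with default settings (the server address and window name are hypothetical):

import numpy as np
from visdom import Visdom

# Connect to the visdom server and plot a small learning curve.
vis = Visdom(server="http://localhost", port=8097)
vis.line(
    X=np.arange(3),
    Y=np.array([0.9, 0.6, 0.4]),
    win="train_loss",
    opts={"title": "train_loss"},
)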
Example #6
    def on_phase_start(self, task) -> None:
        """Create and display a progress bar with 0 progress."""
        if not progressbar_available:
            raise RuntimeError(
                "progressbar module not installed, cannot use ProgressBarHook")

        if is_master():
            self.bar_size = task.num_batches_per_phase
            self.batches = 0
            self.progress_bar = progressbar.ProgressBar(self.bar_size)
            self.progress_bar.start()
Example #7
    def on_phase_end(self, task: "tasks.ClassyTask") -> None:
        """Checkpoint the task every checkpoint_period phases.

        We do not necessarily checkpoint the task at the end of every phase.
        """
        if not is_master() or task.phase_type not in self.phase_types:
            return

        self.phase_counter += 1
        if self.phase_counter % self.checkpoint_period != 0:
            return

        checkpoint_name = "model_phase-{phase}_end.torch".format(phase=task.phase_idx)
        self._save_checkpoint(task, checkpoint_name)
Example #8
    def on_phase_end(self, task) -> None:
        """Checkpoint the task every checkpoint_period phases.

        We do not necessarily checkpoint the task at the end of every phase.
        """
        if not is_master() or task.phase_type not in self.phase_types:
            return

        self.phase_counter += 1
        if self.phase_counter % self.checkpoint_period != 0:
            return

        checkpoint_name = CheckpointHook.get_checkpoint_name(task.phase_idx)
        self._save_checkpoint(task, checkpoint_name)
Example #9
def load_and_broadcast_checkpoint(
    checkpoint_path: str, device: torch.device = CPU_DEVICE
) -> Optional[Dict]:
    """Loads a checkpoint on master and broadcasts it to all replicas.

    This is a collective operation which needs to be run in sync on all replicas.

    See :func:`load_checkpoint` for the arguments.
    """
    if is_master():
        checkpoint = load_checkpoint(checkpoint_path, device)
    else:
        checkpoint = None
    logging.info(f"Broadcasting checkpoint loaded from {checkpoint_path}")
    return broadcast_object(checkpoint)
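The same load-on-master-then-broadcast idea can be expressed with plain torch.distributed primitives. A minimal sketch under that assumption; it does not reproduce classy_vision's broadcast_object or load_checkpoint helpers:

import torch
import torch.distributed as dist

def load_and_broadcast_sketch(path: str):
    # Only rank 0 reads the file; every rank must reach the collective call.
    obj = [None]
    if dist.get_rank() == 0:
        obj[0] = torch.load(path, map_location="cpu")
    dist.broadcast_object_list(obj, src=0)
    return obj[0]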
Example #10
    def on_phase_start(self, task) -> None:
        """Initialize losses and learning_rates."""
        self.learning_rates = []
        self.wall_times = []
        self.num_updates = []
        self.step_idx = 0

        if not is_master():
            return

        # log the parameters before training starts
        if task.train and task.train_phase_idx == 0:
            for name, parameter in task.base_model.named_parameters():
                self.tb_writer.add_histogram(f"Parameters/{name}",
                                             parameter,
                                             global_step=-1)
Example #11
    def on_start(self, task: "tasks.ClassyTask") -> None:
        """
        Plot the model on Tensorboard.
        """
        if is_master():
            try:
                # Show model in tensorboard:
                logging.info("Showing model graph in TensorBoard...")

                plot_model(
                    task.base_model,
                    size=task.base_model.input_shape,
                    input_key=task.base_model.input_key if hasattr(
                        task.base_model, "input_key") else None,
                    writer=self.tb_writer,
                )
            except Exception:
                logging.warning(
                    "Unable to plot model to tensorboard. Exception: ",
                    exc_info=True)
Example #12
 def on_phase_end(self, task: "tasks.ClassyTask") -> None:
     """Clear the progress bar at the end of the phase."""
     if is_master() and self.progress_bar is not None:
         self.progress_bar.finish()
Example #13
 def on_step(self, task: "tasks.ClassyTask") -> None:
     """Update the progress bar with the batch size."""
     if task.train and is_master() and self.progress_bar is not None:
         self.batches += 1
         self.progress_bar.update(min(self.batches, self.bar_size))
    def on_phase_end(self, task: "tasks.ClassyTask") -> None:
        """Add the losses and learning rates to tensorboard."""
        if self.learning_rates is None:
            logging.warning("learning_rates is not initialized")
            return

        batches = len(task.losses)
        if batches == 0 or not is_master():
            return

        phase_type = task.phase_type
        phase_type_idx = task.train_phase_idx if task.train else task.eval_phase_idx

        logging.info(f"Plotting to Tensorboard for {phase_type} phase {phase_type_idx}")

        loss_key = f"{phase_type}_loss"
        learning_rate_key = f"{phase_type}_learning_rate_updates"

        if task.train:
            for loss, learning_rate, global_step, wall_time in zip(
                task.losses, self.learning_rates, self.num_steps_global, self.wall_times
            ):
                loss /= task.get_batchsize_per_replica()
                self.tb_writer.add_scalar(
                    loss_key, loss, global_step=global_step, walltime=wall_time
                )
                self.tb_writer.add_scalar(
                    learning_rate_key,
                    learning_rate,
                    global_step=global_step,
                    walltime=wall_time,
                )

        loss_avg = sum(task.losses) / (batches * task.get_batchsize_per_replica())

        loss_key = "avg_{phase_type}_loss".format(phase_type=task.phase_type)
        self.tb_writer.add_scalar(loss_key, loss_avg, global_step=phase_type_idx)

        # plot meters which return a dict
        for meter in task.meters:
            if not isinstance(meter.value, dict):
                logging.warning(f"Skipping meter {meter.name} with value: {meter.value}")
                continue
            for name, value in meter.value.items():
                if isinstance(value, float):
                    meter_key = f"{phase_type}_{meter.name}_{name}"
                    self.tb_writer.add_scalar(
                        meter_key, value, global_step=phase_type_idx
                    )
                else:
                    logging.warning(
                        f"Skipping meter name {meter.name}_{name} with value: {value}"
                    )
                    continue

        if hasattr(task, "perf_log"):
            for perf in task.perf_log:
                phase_idx = perf["phase_idx"]
                tag = perf["tag"]
                for metric_name, metric_value in perf.items():
                    if metric_name in ["phase_idx", "tag"]:
                        continue

                    self.tb_writer.add_scalar(
                        f"Speed/{tag}/{metric_name}",
                        metric_value,
                        global_step=phase_idx,
                    )

        # flush so that the plots aren't lost if training crashes soon after
        self.tb_writer.flush()
        logging.info(f"Done plotting to Tensorboard")
Example #15
 def on_end(self, task) -> None:
     """Save model into torchscript by the end of training.
     """
     if not is_master() or getattr(task, "test_only", False):
         return
     self.save_torchscript(task)
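save_torchscript() is the hook's own helper and its body is not shown here. A minimal sketch of what a TorchScript export typically boils down to, assuming the model is scriptable:

import torch

def export_torchscript_sketch(model: torch.nn.Module, path: str) -> None:
    # Script the model and serialize it to disk.
    model.eval()
    scripted = torch.jit.script(model)
    torch.jit.save(scripted, path)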
Example #16
 def on_phase_end(self, task: "tasks.ClassyTask",
                  local_variables: Dict[str, Any]) -> None:
     """Clear the progress bar at the end of the phase."""
     if is_master() and self.progress_bar is not None:
         self.progress_bar.finish()
Example #17
    def on_phase_end(self, task) -> None:
        """Add the losses and learning rates to tensorboard."""
        if self.learning_rates is None:
            logging.warning("learning_rates is not initialized")
            return

        batches = len(task.losses)
        if batches == 0 or not is_master():
            return

        phase_type = task.phase_type
        phase_type_idx = task.train_phase_idx if task.train else task.eval_phase_idx

        logging.info(
            f"Plotting to Tensorboard for {phase_type} phase {phase_type_idx}")

        learning_rate_key = f"Learning Rate/{phase_type}"

        if task.train:
            for learning_rate, global_step, wall_time in zip(
                    self.learning_rates, self.num_updates, self.wall_times):
                self.tb_writer.add_scalar(
                    learning_rate_key,
                    learning_rate,
                    global_step=global_step,
                    walltime=wall_time,
                )
            for name, parameter in task.base_model.named_parameters():
                self.tb_writer.add_histogram(f"Parameters/{name}",
                                             parameter,
                                             global_step=phase_type_idx)

        if torch.cuda.is_available() and task.train:
            self.tb_writer.add_scalar(
                "Memory/peak_allocated",
                torch.cuda.max_memory_allocated(),
                global_step=phase_type_idx,
            )

        loss_avg = sum(
            task.losses) / (batches * task.get_batchsize_per_replica())

        loss_key = "Losses/{phase_type}".format(phase_type=task.phase_type)
        self.tb_writer.add_scalar(loss_key,
                                  loss_avg,
                                  global_step=phase_type_idx)

        # plot meters which return a dict
        for meter in task.meters:
            if not isinstance(meter.value, dict):
                logging.warning(
                    f"Skipping meter {meter.name} with value: {meter.value}")
                continue
            for name, value in meter.value.items():
                if isinstance(value, float):
                    meter_key = f"Meters/{phase_type}/{meter.name}/{name}"
                    self.tb_writer.add_scalar(meter_key,
                                              value,
                                              global_step=phase_type_idx)
                else:
                    logging.warning(
                        f"Skipping meter name {meter.name}/{name} with value: {value}"
                    )
                    continue

        if hasattr(task, "perf_log"):
            for perf in task.perf_log:
                phase_idx = perf["phase_idx"]
                tag = perf["tag"]
                for metric_name, metric_value in perf.items():
                    if metric_name in ["phase_idx", "tag"]:
                        continue

                    self.tb_writer.add_scalar(
                        f"Speed/{tag}/{metric_name}",
                        metric_value,
                        global_step=phase_idx,
                    )

        # flush so that the plots aren't lost if training crashes soon after
        self.tb_writer.flush()
        logging.info(f"Done plotting to Tensorboard")