Example #1
    def _build_moco_encoder(self, task: tasks.ClassyTask) -> None:
        """
        Create the model replica called the encoder. This will slowly track
        the main model.
        """
        # Create the encoder, which will slowly track the model
        logging.info(
            "Building MoCo encoder - rank %s %s", *get_machine_local_and_dist_rank()
        )

        # - same architecture
        task.loss.moco_encoder = build_model(
            task.config["MODEL"], task.config["OPTIMIZER"]
        )

        task.loss.moco_encoder.to(task.device)

        # Restore a hypothetical checkpoint, else initialize from the model
        if task.loss.checkpoint is not None:
            task.loss.load_state_dict(task.loss.checkpoint)
        else:
            for param_q, param_k in zip(
                task.base_model.parameters(), task.loss.moco_encoder.parameters()
            ):
                param_k.data.copy_(param_q.data)
                param_k.requires_grad = False
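
Note: the replica created above is only initialized here; during training it is kept in sync by a momentum (exponential moving average) update applied after each step, along the lines of this minimal sketch (the helper name and the momentum coefficient m are illustrative, not part of the snippet above):

import torch


@torch.no_grad()
def momentum_update(base_model, moco_encoder, m: float = 0.999):
    # EMA update: the encoder weights slowly track the main model's weights.
    # m close to 1.0 means the encoder changes very slowly.
    for param_q, param_k in zip(base_model.parameters(), moco_encoder.parameters()):
        param_k.data.mul_(m).add_(param_q.data, alpha=1.0 - m)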
Example #2
    def _build_momentum_network(self, task: tasks.ClassyTask) -> None:
        """
        Create the model replica called the encoder. This will slowly track
        the main model.
        """
        logging.info("Building momentum encoder - rank %s %s",
                     *get_machine_local_and_dist_rank())

        # - same architecture
        task.loss.momentum_encoder = build_model(task.config["MODEL"],
                                                 task.config["OPTIMIZER"])
        task.loss.momentum_encoder = nn.SyncBatchNorm.convert_sync_batchnorm(
            task.loss.momentum_encoder)
        task.loss.momentum_encoder.to(
            torch.device("cuda" if task.use_gpu else "cpu"))

        # Initialize from the model
        if task.loss.checkpoint is None:
            for param_q, param_k in zip(
                    task.base_model.parameters(),
                    task.loss.momentum_encoder.parameters()):
                param_k.data.copy_(param_q.data)
            for buff_q, buff_k in zip(
                    task.base_model.named_buffers(),
                    task.loss.momentum_encoder.named_buffers(),
            ):
                if "running_" not in buff_k[0]:
                    continue
                buff_k[1].data.copy_(buff_q[1].data)
        task.loss.momentum_encoder = init_distributed_data_parallel_model(
            task.loss.momentum_encoder)

        # Restore a hypothetical checkpoint
        if task.loss.checkpoint is not None:
            task.loss.load_state_dict(task.loss.checkpoint)
Example #3
 def __init__(self, profiling_config: AttrDict):
     super().__init__()
     self.output_folder = profiling_config.OUTPUT_FOLDER
     self.start_iteration = (profiling_config.START_ITERATION +
                             profiling_config.WARMUP_ITERATIONS)
     self.end_iteration = self.start_iteration + profiling_config.NUM_ITERATIONS
     self.interrupt_training = profiling_config.STOP_TRAINING_AFTER_PROFILING
     self.dist_rank = get_machine_local_and_dist_rank()[1]
     self.is_profiling_rank = self.dist_rank in profiling_config.PROFILED_RANKS
     self.profile_runtime = (
         self.is_profiling_rank
         and profiling_config.RUNTIME_PROFILING.USE_PROFILER)
     self.runtime_profiler = create_runtime_profiler(
         enabled=self.profile_runtime,
         use_cpu=profiling_config.RUNTIME_PROFILING.PROFILE_CPU,
         use_cuda=profiling_config.RUNTIME_PROFILING.PROFILE_GPU,
         wait=profiling_config.START_ITERATION,
         warmup=profiling_config.WARMUP_ITERATIONS,
         active=profiling_config.NUM_ITERATIONS,
         legacy_profiler=profiling_config.RUNTIME_PROFILING.LEGACY_PROFILER,
     )
     self.profile_by_layer_memory = (
         self.is_profiling_rank
         and profiling_config.MEMORY_PROFILING.TRACK_BY_LAYER_MEMORY)
     if self.profile_by_layer_memory:
         logging.info(
             f"Setting up memory tracker for rank {self.dist_rank}...")
         self.layer_memory_tracker = LayerwiseMemoryTracker()
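
The hook above only consumes configuration fields; gathering the field names it reads, the profiling config it expects presumably looks roughly like the sketch below (values are illustrative defaults, and the dict would be wrapped in an AttrDict for attribute access):

profiling_config = {
    "OUTPUT_FOLDER": ".",
    "START_ITERATION": 0,
    "WARMUP_ITERATIONS": 0,
    "NUM_ITERATIONS": 10,
    "STOP_TRAINING_AFTER_PROFILING": False,
    "PROFILED_RANKS": [0, 1],
    "RUNTIME_PROFILING": {
        "USE_PROFILER": False,
        "PROFILE_CPU": True,
        "PROFILE_GPU": True,
        "LEGACY_PROFILER": False,
    },
    "MEMORY_PROFILING": {
        "TRACK_BY_LAYER_MEMORY": False,
    },
}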
Example #4
 def _print_and_save_meters(self, task, train_phase_idx):
     """
     Executed only on master gpu at the end of each epoch. Computes the
     meters and logs the metrics to the json file and to logger streams
     (stdout, file).
     """
     phase_type = "train" if task.train else "test"
     rank, _ = get_machine_local_and_dist_rank()
     checkpoint_folder = task.checkpoint_folder
     save_metrics = {}
     save_metrics["iteration"] = task.iteration
     save_metrics["phase_idx"] = task.phase_idx
     save_metrics["train_phase_idx"] = train_phase_idx
     for meter in task.meters:
         if len(task.meters) > 0 and (
             (task.train and task.config["METERS"]["enable_training_meter"])
                 or (not task.train)):
             meter_value = meter.value
             metric_key = f"{phase_type}_{meter.name}"
             if metric_key not in task.metrics:
                 task.metrics[metric_key] = []
             task.metrics[metric_key].append(meter_value)
             save_metrics[metric_key] = meter_value
             logging.info(
                 f"Rank: {rank}, name: {metric_key}, value: {meter_value}")
     meter_file = f"{checkpoint_folder}/metrics.json"
     save_file(save_metrics, meter_file, append_to_json=True)
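
With append_to_json=True, each epoch appends one record to {checkpoint_folder}/metrics.json. Assuming a train phase and a meter named accuracy_list_meter (the meter name and values are illustrative), an appended record would look roughly like:

save_metrics = {
    "iteration": 5005,
    "phase_idx": 4,
    "train_phase_idx": 4,
    "train_accuracy_list_meter": {"top_1": {"0": 63.2}},  # meter value; exact shape depends on the meter
}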
Example #5
    def _get_data_files(self, split):
        """
        Get the given dataset split (train or test), get the path to the dataset
        (images and labels).
        1. If the user has explicitly specified the data_sources, we simply
           use those and don't do lookup in the datasets registered with VISSL
           from the dataset catalog.
        2. If the user hasn't specified the path, look for the dataset in
           the datasets catalog registered with VISSL. For a given list of datasets
           and a given partition (train/test), we first verify that we have the
           dataset and the correct source as specified by the user.
           Then for each dataset in the list, we get the data path (make sure it
           exists, sources match). For the label file, the file is optional.
        """
        local_rank, _ = get_machine_local_and_dist_rank()
        self.data_paths, self.label_paths = dataset_catalog.get_data_files(
            split, dataset_config=self.cfg["DATA"]
        )

        logging.info(
            f"Rank: {local_rank} split: {split} Data files:\n{self.data_paths}"
        )
        logging.info(
            f"Rank: {local_rank} split: {split} Label files:\n{self.label_paths}"
        )
Example #6
 def value(self):
     """
     Value of the meter globally synced. mean AP and AP for each class is returned
     """
     _, distributed_rank = get_machine_local_and_dist_rank()
     logging.info(
         f"Rank: {distributed_rank} Mean AP meter: "
         f"scores: {self._scores.shape}, target: {self._targets.shape}")
     ap_matrix = torch.ones(self.num_classes, dtype=torch.float32) * -1
     # targets matrix = 0, 1, -1
     # unknown matrix = 0, 1 where 1 means that it's an unknown
     unknown_matrix = torch.eq(self._targets, -1.0).float().detach().numpy()
     for cls_num in range(self.num_classes):
         # compute AP only for classes that have at least one positive example
         num_pos = len(torch.where(self._targets[:, cls_num] == 1)[0])
         if num_pos == 0:
             continue
         P, R, score, ap = get_precision_recall(
             self._targets[:, cls_num].detach().numpy(),
             self._scores[:, cls_num].detach().numpy(),
             (unknown_matrix[:, cls_num] == 0).astype(np.float64),
         )
         ap_matrix[cls_num] = ap[0]
     nonzero_indices = torch.nonzero(ap_matrix != -1)
     if nonzero_indices.shape[0] < self.num_classes:
         logging.info(
             f"{nonzero_indices.shape[0]} out of {self.num_classes} classes "
             "have meaningful average precision")
     mean_ap = ap_matrix[nonzero_indices].mean().item()
     return {"mAP": mean_ap, "AP": ap_matrix}
Example #7
def extract_clusters(
    cfg: AttrDict,
    dist_run_id: str,
    checkpoint_folder: str,
    local_rank: int = 0,
    node_id: int = 0,
):
    """
    Sets up and executes model visualisation extraction workflow on one node
    """

    # setup the environment variables
    set_env_vars(local_rank, node_id, cfg)
    dist_rank = int(os.environ["RANK"])

    # setup logging
    setup_logging(__name__, output_dir=checkpoint_folder, rank=dist_rank)

    logging.info(f"Env set for rank: {local_rank}, dist_rank: {dist_rank}")
    # print the environment info for the current node
    if local_rank == 0:
        current_env = os.environ.copy()
        print_system_env_info(current_env)

    # setup the multiprocessing to be forkserver.
    # See https://fb.quip.com/CphdAGUaM5Wf
    setup_multiprocessing_method(cfg.MULTI_PROCESSING_METHOD)

    # set seeds
    logging.info("Setting seed....")
    set_seeds(cfg, dist_rank)

    # We set the CUDA device here as well as a safe solution for all downstream
    # `torch.cuda.current_device()` calls to return correct device.
    if cfg.MACHINE.DEVICE == "gpu" and torch.cuda.is_available():
        local_rank, _ = get_machine_local_and_dist_rank()
        torch.cuda.set_device(local_rank)

    # print the training settings and system settings
    if local_rank == 0:
        print_cfg(cfg)
        logging.info("System config:\n{}".format(collect_env_info()))

    # Build the SSL trainer to set up distributed training and then
    # extract the cluster assignments for all entries in the dataset
    trainer = SelfSupervisionTrainer(cfg, dist_run_id)
    cluster_assignments = trainer.extract_clusters()

    # Save the cluster assignments in the output folder
    if dist_rank == 0:
        ClusterAssignmentLoader.save_cluster_assignment(
            output_dir=get_checkpoint_folder(cfg),
            assignments=ClusterAssignment(
                config=cfg, cluster_assignments=cluster_assignments),
        )

    # close the logging streams including the file handlers
    logging.info("All Done!")
    shutdown_logging()
Example #8
 def _save_label_cls_idx_map(self, cls_idx_map: Dict[str, int], split: str):
     local_rank, dist_rank = get_machine_local_and_dist_rank()
     if dist_rank == 0:
         checkpoint_folder = get_checkpoint_folder(self.cfg)
         class_idx_file_path = (
             f"{checkpoint_folder}/{split.lower()}_label_to_index_map.json")
         if not g_pathmgr.exists(class_idx_file_path):
             save_file(cls_idx_map,
                       class_idx_file_path,
                       append_to_json=False)
Example #9
    def _advance_phase(self, task: ClassyTask):
        """
        Advance the training phase to the next phase.
        - Updates the phase number,
        - resets the meters,
        - reset losses,
        - recreates the data iterator and destroys previous iterator
        - set the model to be in train or eval phase depending on what phase we are in
        - execute any optimizer update (normally learning rate updates etc at the end of
          an epoch)
        """
        # reset the meters at the beginning of the epoch
        for meter in task.meters:
            meter.reset()

        # reset the loss history for this epoch
        task.losses = []

        # advance the epoch num to be current
        task.phase_idx += 1
        phase = task.phases[task.phase_idx]
        task.train = True if phase["train"] else False
        if task.train:
            task.train_phase_idx += 1

        # get a new data iterator - delete the iterator at the beginning explicitly
        # so that all dataloader processes are cleaned up
        phase_type = "train" if phase["train"] else "test"
        # we are advancing to the next epoch, so there is no need to compute start_iter;
        # just let it be 0 inside of recreate_data_iterator. However, if we are just
        # starting from a resumed training, we want to compute_start_iter
        # again (if applicable) since we recreate the data iterator and delete
        # the old ones.
        compute_start_iter = False
        if task.checkpoint is not None and task.checkpoint["train_phase_idx"] == (
            task.train_phase_idx - 1
        ):
            compute_start_iter = True

        task.recreate_data_iterator(
            phase_type,
            epoch=task.phase_idx,
            compute_start_iter=compute_start_iter,
            train_phase_idx=task.train_phase_idx,
        )

        # set the model to train or eval depending on what phase we are in
        task.model.train(phase["train"])

        if task.train and task.train_phase_idx >= 0:
            task.optimizer.on_epoch(task.where)

        local_rank, _ = get_machine_local_and_dist_rank()
        logging.info(f"Phase advanced. Rank: {local_rank}")
Example #10
 def __init__(self, model_config, optimizer_config):
     self.model_config = model_config
     self.optimizer_config = optimizer_config
     super().__init__()
     self.eval_mode = None  # this is just informational
     self.local_rank, _ = get_machine_local_and_dist_rank()
     self.trunk = self._get_trunk()
     self.heads = nn.ModuleList()
     self.head_names = []
     self._output_feature_names = get_trunk_output_feature_names(self.model_config)
     self._get_heads()
     self._setup_multi_input_head_mapping()
Example #11
    def _load_labels(self):
        """
        Load the labels if the dataset has labels. In the self-supervised
        pre-training task, we don't use labels. However, we do use labels to
        evaluate the self-supervised models on downstream tasks.

        For labels, two label sources are supported: disk_filelist and disk_folder.

        In the case of disk_filelist, we iteratively read labels from each specified
        file. See load_single_label_file().
        In the case of disk_folder, we use the ImageFolder object created during
        the data loading itself.
        """
        local_rank, _ = get_machine_local_and_dist_rank()
        for idx, label_source in enumerate(self.label_sources):
            if label_source == "disk_filelist":
                paths = self.label_paths[idx]
                # in case of filelist, we support multiple label files.
                # we rely on the user to have a proper collator to handle
                # the multiple labels
                logging.info(f"Loading labels: {paths}")
                if isinstance(paths, list):
                    labels = []
                    for path in paths:
                        path_labels = self.load_single_label_file(path)
                        labels.append(path_labels)
                else:
                    labels = self.load_single_label_file(paths)
                    labels = self._convert_to_numeric_ids(labels)
            elif label_source == "disk_folder":
                # In this case we use the labels inferred from the directory structure
                # We enforce that the data source also be a disk folder in this case
                assert self.data_sources[idx] == self.label_sources[idx]
                if local_rank == 0:
                    logging.info(
                        f"Using {label_source} labels from {self.data_paths[idx]}"
                    )
                # Use the ImageFolder object created when loading images.
                # We do not create it again since it can be an expensive operation.
                labels = [
                    x[1] for x in self.data_objs[idx].image_dataset.samples
                ]
                labels = np.array(labels).astype(np.int64)
            elif label_source == "torchvision_dataset":
                labels = np.array(self.data_objs[idx].get_labels()).astype(
                    np.int64)
            elif label_source == "synthetic":
                labels = np.array([0 for _ in range(len(self.data_objs[idx]))])
            else:
                raise ValueError(f"unknown label source: {label_source}")
            self.label_objs.append(labels)
Example #12
 def load_and_broadcast_checkpoint(
     cls, checkpoint_folder: str, checkpoint_path: str, device
 ):
     """
     Load the checkpoint at the provided path, dealing with the
     potential indirection due to the notion of sharded checkpoint
     """
     checkpoint = load_and_broadcast_checkpoint(checkpoint_path, device)
     if cls._is_shard_aggregator_checkpoint(checkpoint):
         _, global_rank = get_machine_local_and_dist_rank()
         shard_name = checkpoint["shards"][global_rank]
         shard_path = os.path.join(checkpoint_folder, shard_name)
         checkpoint = load_checkpoint(shard_path, device)
     return checkpoint
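
For the sharded case, the loaded checkpoint is assumed to be a small aggregator that maps the global rank to a per-rank shard file living in checkpoint_folder, roughly along these lines (layout and file names illustrative; only the "shards" lookup is implied by the code above):

aggregator_checkpoint = {
    # recognized by cls._is_shard_aggregator_checkpoint(...)
    "shards": {
        0: "checkpoint_shard_rank0.torch",
        1: "checkpoint_shard_rank1.torch",
    },
}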
Example #13
 def __init__(self, profiling_config: AttrDict):
     super().__init__()
     self.output_folder = profiling_config.OUTPUT_FOLDER
     self.start_iteration = profiling_config.START_ITERATION
     self.end_iteration = (profiling_config.START_ITERATION +
                           profiling_config.NUM_ITERATIONS)
     self.dist_rank = get_machine_local_and_dist_rank()[1]
     self.enabled = self.dist_rank in profiling_config.PROFILED_RANKS
     self.profile_memory = (
         self.enabled
         and profiling_config.MEMORY_PROFILING.TRACK_BY_LAYER_MEMORY)
     if self.profile_memory:
         logging.info(
             f"Setting up memory tracker for rank {self.dist_rank}...")
         self.layer_memory_tracker = LayerwiseMemoryTracker()
Example #14
 def to_pytorch_syncbn(group_size):
     logging.info("Converting BN layers to PyTorch SyncBN")
     if group_size is None:
         process_group = None
         logging.info("Not creating process_group for PyTorch SyncBN...")
     else:
         process_group_ids = split_world_in_process_groups(
             world_size=config.DISTRIBUTED.NUM_PROC_PER_NODE
             * config.DISTRIBUTED.NUM_NODES,
             group_size=group_size,
         )
         process_groups = [dist.new_group(pids) for pids in process_group_ids]
         _, dist_rank = get_machine_local_and_dist_rank()
         process_group = process_groups[dist_rank // group_size]
     return nn.SyncBatchNorm.convert_sync_batchnorm(
         model, process_group=process_group
     )
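
split_world_in_process_groups is not shown here; since the group is later selected with process_groups[dist_rank // group_size], it presumably splits the world into contiguous chunks of group_size ranks, as in this hedged sketch (behavior assumed from the call site, not taken from VISSL):

from typing import List


def split_world_in_process_groups(world_size: int, group_size: int) -> List[List[int]]:
    # Contiguous chunks of ranks: [0..group_size-1], [group_size..2*group_size-1], ...
    return [
        list(range(start, min(start + group_size, world_size)))
        for start in range(0, world_size, group_size)
    ]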
Example #15
 def _sync_and_print_meters(self, task):
     for meter in task.meters:
         meter.sync_state()
         logging.info("Meters synced")
     if is_primary():
         rank, _ = get_machine_local_and_dist_rank()
         for meter in task.meters:
             if len(task.meters) > 0 and (
                 (task.train and task.config["METERS"]["enable_training_meter"])
                 or (not task.train)
             ):
                 meter_value = meter.value
                 metric_key = f"{meter.name}"
                 if metric_key not in task.metrics:
                     task.metrics[metric_key] = []
                 task.metrics[metric_key].append(meter_value)
                 logging.info(
                     f"Rank: {rank}, name: {metric_key}, value: {meter_value}"
                 )
Example #16
    def on_forward(self, task: "tasks.ClassyTask") -> None:
        """
        Called each time a model forward is done and make sure that
        the model forward output is not NaN. If we encounter NaN as the model
        output, we checkpoint the model to enable debugging and also checkpoint
        the model input sample, model output.
        """
        # check the model output is not NaN.
        has_nan = False
        model_output = task.last_batch.model_output
        if isinstance(model_output, list):
            has_nan = not torch.tensor(
                [torch.isfinite(x).all() for x in model_output]
            ).all()
        else:
            has_nan = not torch.isfinite(model_output).all()

        if has_nan:
            _, dist_rank = get_machine_local_and_dist_rank()
            logging.info(f"Infinite Model output or NaN at iteration={task.iteration}.")

            # TODO - this code was broken during a refactoring: improve it
            from vissl.hooks.log_hooks import LogLossMetricsCheckpointHook

            LogLossMetricsCheckpointHook.checkpoint_model(
                task,
                world_size=self.world_size,
                mode_frequency=1,
                mode_num=task.iteration,
                mode="iteration",
            )
            model_output_file = (
                f"{task.checkpoint_folder}/rank{dist_rank}_model_output.pth"
            )
            input_sample_file = (
                f"{task.checkpoint_folder}/rank{dist_rank}_input_sample.pth"
            )
            with PathManager.open(model_output_file, "wb") as fwrite:
                torch.save(model_output, fwrite)
            with PathManager.open(input_sample_file, "wb") as fwrite:
                torch.save(task.last_batch.sample, fwrite)
            logging.info(f"Saved model output: {model_output_file}")
            logging.info(f"Saved model input: {input_sample_file}")
Example #17
    def on_forward(self, task: "tasks.ClassyTask") -> None:
        """
        Called each time a model forward is done and make sure that
        the model forward output is not NaN. If we encounter NaN as the model
        output, we checkpoint the model to enable debugging and also checkpoint
        the model input sample, model output.
        """
        # check the model output is not NaN.
        has_nan = False
        model_output = task.last_batch.model_output
        if isinstance(model_output, list):
            has_nan = not torch.tensor(
                [torch.isfinite(x).all() for x in model_output]
            ).all()
        else:
            has_nan = not torch.isfinite(model_output).all()

        if has_nan:
            _, dist_rank = get_machine_local_and_dist_rank()
            logging.info(f"Infinite Model output or NaN at iteration={task.iteration}.")
            self._checkpoint_model(
                task,
                task.train_phase_idx,
                mode_frequency=1,
                mode_num=task.iteration,
                mode="iteration",
            )
            model_output_file = (
                f"{task.checkpoint_folder}/rank{dist_rank}_model_output.pth"
            )
            input_sample_file = (
                f"{task.checkpoint_folder}/rank{dist_rank}_input_sample.pth"
            )
            with PathManager.open(model_output_file, "wb") as fwrite:
                torch.save(model_output, fwrite)
            with PathManager.open(input_sample_file, "wb") as fwrite:
                torch.save(task.last_batch.sample, fwrite)
            logging.info(f"Saved model output: {model_output_file}")
            logging.info(f"Saved model input: {input_sample_file}")
Example #18
    def __init__(
        self,
        cfg: AttrDict,
        dist_run_id: str,
        checkpoint_path: str = None,
        checkpoint_folder: str = None,
        hooks: List[ClassyHook] = None,
    ):
        self.cfg = cfg
        self.dist_run_id = dist_run_id
        self.local_rank, self.distributed_rank = get_machine_local_and_dist_rank()
        self.setup_distributed(self.cfg.MACHINE.DEVICE == "gpu")

        # now we should build the task. The task will also have the State attached
        # to it. It will have information about phases (train, test) both. It will
        # also contain all the other information like optimizers, etc
        self.task = build_task(self.cfg)
        self.task.set_checkpoint_path(checkpoint_path)
        self.task.set_checkpoint_folder(checkpoint_folder)
        if hooks is None:
            hooks = []
        self.task.set_hooks(hooks)
Example #19
    def _checkpoint_model(self,
                          task,
                          train_phase_idx,
                          mode_frequency,
                          mode_num,
                          mode="phase"):
        """
        Checkpoint model. Can be called in 3 possible scenarios:
        1. If training becomes NaN, then we checkpoint the model to facilitate debugging
        2. After every N epochs (CHECKPOINT_FREQ), model state is checkpointed.
        3. If user wants to checkpoint during the epoch (ie. after every few training
           iterations, the model state is checkpointed.)

        Args:
            task: Self-supervision task that hold information about training iteration,
                  epoch number etc.
            train_phase_idx (int): current training phase number. Starts from 0
            mode_frequency (int): mode can be "phase" or "iteration". Frequency
                                  of checkpointing for the given mode
            mode_num (int): for the checkpointing mode (phase or iteration), the number
                            of phase or iteration at which checkpointing is being done
        """
        phase_idx = task.phase_idx
        # num_train_phases = num_epochs * num_phases_per_epoch
        # For OSS use, num_train_phases will be equal to num_epochs
        num_train_phases = task.num_train_phases

        # check if we need to checkpoint this phase
        is_checkpointing_phase = is_checkpoint_phase(mode_num, mode_frequency,
                                                     train_phase_idx,
                                                     num_train_phases, mode)
        is_final_train_phase = ((train_phase_idx == (num_train_phases - 1))
                                and task.train and mode == "phase")

        # handle checkpoint:
        if task.train and (is_final_train_phase or is_checkpointing_phase):
            #  - if sharded state consolidate the state
            # /!\ All the ranks have to participate
            if hasattr(task.optimizer,
                       "consolidate_state_dict") and mode != "phase":
                logging.info(
                    f"[{mode}: {mode_num}] Consolidating sharded state on all replicas"
                )
                task.optimizer.consolidate_state_dict()

            # Depending on whether we are in FSDP mode or not
            # - save the checkpoint on the primary rank
            # - save the sharded checkpoint on all ranks
            if is_primary() or isinstance(task.base_model, FSDP):
                checkpoint_folder = task.checkpoint_folder
                logging.info(
                    f"[{mode}: {mode_num}] Saving checkpoint to {checkpoint_folder}"
                )
                model_state_dict = task.get_classy_state()

                # phase_idx is already incremented at the beginning of phase but if we
                # are checkpointing at an iteration in the middle of phase, we should not
                # save the incremented phase_idx as it will incorrectly assume that model
                # trained for that phase already.
                if mode == "iteration":
                    model_state_dict[
                        "phase_idx"] = model_state_dict["phase_idx"] - 1
                    if task.train:
                        train_phase_idx = train_phase_idx - 1
                        model_state_dict["train_phase_idx"] = train_phase_idx
                    restart_phase = phase_idx - 1
                    restart_iteration = task.iteration

                # When loading from a phase checkpoint:
                else:
                    restart_phase = phase_idx
                    restart_iteration = task.iteration

                checkpoint_content = {
                    "phase_idx": restart_phase,
                    "iteration": restart_iteration,
                    "loss": task.loss.state_dict(),
                    "iteration_num": task.local_iteration_num,
                    "train_phase_idx": train_phase_idx,
                    "classy_state_dict": model_state_dict,
                }

                checkpoint_writer = CheckpointWriter(
                    checkpoint_folder=checkpoint_folder,
                    is_final_train_phase=is_final_train_phase,
                    mode=mode,
                    mode_num=mode_num,
                    backend=task.config["CHECKPOINT"]["BACKEND"],
                )

                if isinstance(task.base_model, FSDP):
                    _, rank = get_machine_local_and_dist_rank()
                    checkpoint_writer.save_sharded_checkpoint(
                        content=checkpoint_content,
                        shard_rank=rank,
                        world_size=self.world_size,
                    )
                else:
                    checkpoint_writer.save_consolidated_checkpoint(
                        checkpoint_content)
Example #20
def init_model_from_consolidated_weights(
    config: AttrDict,
    model,
    state_dict: Dict[str, Any],
    state_dict_key_name: str,
    skip_layers: List[str],
    replace_prefix=None,
    append_prefix=None,
):
    """
    Initialize the model from any given params file. This is particularly useful
    during the feature evaluation process or when we want to evaluate a model on
    a range of tasks.

    Args:
        config (AttrDict): config file
        model (object): instance of base_ssl_model
        state_dict (Dict): torch.load() of user provided params file path.
        state_dict_key_name (string): key name containing the model state dict
        skip_layers (List(string)): layer names with this key are not copied
        replace_prefix (string): remove these prefixes from the layer names (executed first)
        append_prefix (string): append the prefix to the layer names
                                (executed after replace_prefix)

    Returns:
        model (object): the model initialized from the weights file
    """
    # whether it's a model from somewhere else or a model from this codebase, load the
    # state_dict
    if state_dict_key_name and len(state_dict_key_name) > 0:
        assert (
            state_dict_key_name in state_dict.keys()
        ), f"Unknown state dict key: {state_dict_key_name}"
        state_dict = state_dict[state_dict_key_name]

    if state_dict_key_name == "classy_state_dict":
        # get the appropriate model_state_dict so that the model can load. We automatically
        # take care of appending prefixes, suffixes etc to match the layer names.
        state_dict = get_checkpoint_model_state_dict(config, state_dict)
    else:
        # make any corrections to the layer names to load checkpoint successfully
        if replace_prefix:
            state_dict = replace_module_prefix(state_dict, replace_prefix)
        if append_prefix:
            state_dict = append_module_prefix(state_dict, append_prefix)
        check_model_compatibilty(config, state_dict)

    # load the checkpoint now
    all_layers = model.state_dict()

    local_rank, _ = get_machine_local_and_dist_rank()
    max_len_model = max(len(key) for key in all_layers.keys())
    for layername in all_layers.keys():
        if len(skip_layers) > 0 and any(item in layername for item in skip_layers):
            if local_rank == 0:
                logging.info(f"Ignored layer:\t{layername}")
            continue
        if layername in state_dict:
            param = state_dict[layername]
            if not isinstance(param, torch.Tensor):
                param = torch.from_numpy(param)
            # if we are initializing the heads and the feature eval mode is on, we check
            # if we are evaluating the heads as well or not. If not, we don't initialize
            # the heads. Otherwise we initialize the heads.
            if (
                not ("heads" in layername)
                or (
                    "heads" in layername
                    and not config.MODEL.FEATURE_EVAL_SETTINGS.EVAL_MODE_ON
                )
                or (
                    "heads" in layername
                    and config.MODEL.FEATURE_EVAL_SETTINGS.EVAL_MODE_ON
                    and config.MODEL.FEATURE_EVAL_SETTINGS.EVAL_TRUNK_AND_HEAD
                )
            ):
                # Accommodate changing position embeddings. Fine-tuning at a
                # different resolution than that which a model was pretrained
                # at requires interpolating the learned position embeddings.
                if "pos_embedding" in layername:
                    param = interpolate_position_embeddings(
                        model, all_layers[layername], param
                    )
                assert all_layers[layername].shape == param.shape, (
                    f"{layername} have different shapes: "
                    f"checkpoint: {param.shape}, model: {all_layers[layername].shape}"
                )
                all_layers[layername].copy_(param)
                if local_rank == 0:
                    logging.info(
                        f"Loaded: {layername: <{max_len_model}} of "
                        f"shape: {all_layers[layername].size()} from checkpoint"
                    )
            else:
                if local_rank == 0:
                    logging.info(f"Ignored layer:\t{layername}")
        else:
            if local_rank == 0:
                logging.info(f"Not found:\t\t{layername}, not initialized")
    if local_rank == 0:
        extra_layers = []
        # go through the checkpoint state_dict and print what extra layers exist in checkpoint
        for layername in state_dict.keys():
            if layername not in all_layers:
                extra_layers.append(layername)
        logging.info(f"Extra layers not loaded from checkpoint: {extra_layers}")

    ####################### DEBUG ############################
    # print_state_dict_shapes(model.state_dict())
    return model
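
A hedged usage sketch of the initializer above, assuming cfg and model are already built and that the weights file is a consolidated VISSL checkpoint stored under the "classy_state_dict" key (path illustrative):

import torch

weights = torch.load("/path/to/model_final_checkpoint.torch", map_location="cpu")
model = init_model_from_consolidated_weights(
    config=cfg,
    model=model,
    state_dict=weights,
    state_dict_key_name="classy_state_dict",
    skip_layers=[],  # copy every matching layer
)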
Example #21
File: train.py  Project: walleDCR/vissl
def train_main(
    cfg: AttrDict,
    dist_run_id: str,
    checkpoint_path: str,
    checkpoint_folder: str,
    local_rank: int = 0,
    node_id: int = 0,
    hook_generator: Callable[[Any], List[ClassyHook]] = default_hook_generator,
):
    """
    Sets up and executes training workflow per machine.

    Args:
        cfg (AttrDict): user specified input config that has optimizer, loss, meters etc
                        settings relevant to the training
        dist_run_id (str): For multi-gpu training with PyTorch, we have to specify
                           how the gpus are going to rendezvous. This requires specifying
                           the communication method: file, tcp and the unique rendezvous
                           run_id that is specific to 1 run.
                           We recommend:
                                1) for 1node: use init_method=tcp and run_id=auto
                                2) for multi-node, use init_method=tcp and specify
                                run_id={master_node}:{port}
        checkpoint_path (str): if the training is being resumed from a checkpoint, path to
                          the checkpoint. The tools/run_distributed_engines.py automatically
                          looks for the checkpoint in the checkpoint directory.
        checkpoint_folder (str): what directory to use for checkpointing. The
                          tools/run_distributed_engines.py creates the directory based on user
                          input in the yaml config file.
        local_rank (int): id of the current device on the machine. If using gpus,
                        local_rank = gpu number on the current machine
        node_id (int): id of the current machine. starts from 0. valid for multi-gpu
        hook_generator (Callable): The utility function that prepares all the hooks that will
                         be used in training based on user selection. Some basic hooks are used
                         by default.
    """

    # setup the environment variables
    set_env_vars(local_rank, node_id, cfg)
    dist_rank = int(os.environ["RANK"])

    # setup logging
    setup_logging(__name__, output_dir=checkpoint_folder, rank=dist_rank)

    logging.info(f"Env set for rank: {local_rank}, dist_rank: {dist_rank}")
    # print the environment info for the current node
    if local_rank == 0:
        current_env = os.environ.copy()
        print_system_env_info(current_env)

    # setup the multiprocessing to be forkserver.
    # See https://fb.quip.com/CphdAGUaM5Wf
    setup_multiprocessing_method(cfg.MULTI_PROCESSING_METHOD)

    # set seeds
    logging.info("Setting seed....")
    set_seeds(cfg, dist_rank)

    # We set the CUDA device here as well as a safe solution for all downstream
    # `torch.cuda.current_device()` calls to return correct device.
    if cfg.MACHINE.DEVICE == "gpu" and torch.cuda.is_available():
        local_rank, _ = get_machine_local_and_dist_rank()
        torch.cuda.set_device(local_rank)

    # print the training settings and system settings
    if local_rank == 0:
        print_cfg(cfg)
        logging.info("System config:\n{}".format(collect_env_info()))

    # get the hooks - these hooks are executed per replica
    hooks = hook_generator(cfg)

    # build the SSL trainer. The trainer first prepares a "task" object which
    # acts as a container for various things needed in a training: datasets,
    # dataloader, optimizers, losses, hooks, etc. "Task" will also have information
    # about phases (train, test) both. The trainer then sets up distributed
    # training.
    trainer = SelfSupervisionTrainer(
        cfg, dist_run_id, checkpoint_path, checkpoint_folder, hooks
    )
    trainer.train()
    logging.info("All Done!")
    # close the logging streams including the filehandlers
    shutdown_logging()
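
A hedged sketch of invoking train_main directly on a single node, using the tcp-style dist_run_id recommended in the docstring (host, port, and paths are illustrative; in practice tools/run_distributed_engines.py assembles these arguments):

train_main(
    cfg=cfg,  # an already-composed AttrDict config
    dist_run_id="localhost:40050",  # {master_node}:{port} rendezvous
    checkpoint_path=None,  # nothing to resume from
    checkpoint_folder="/tmp/vissl_checkpoints",
    local_rank=0,
    node_id=0,
)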
Example #22
def extract_main(
    cfg: AttrDict,
    dist_run_id: str,
    checkpoint_folder: str,
    local_rank: int = 0,
    node_id: int = 0,
):
    """
    Sets up and executes feature extraction workflow per machine.

    Args:
        cfg (AttrDict): user specified input config that has optimizer, loss, meters etc
                        settings relevant to the training
        dist_run_id (str): For multi-gpu training with PyTorch, we have to specify
                           how the gpus are going to rendezvous. This requires specifying
                           the communication method: file, tcp and the unique rendezvous
                           run_id that is specific to 1 run.
                           We recommend:
                                1) for 1node: use init_method=tcp and run_id=auto
                                2) for multi-node, use init_method=tcp and specify
                                run_id={master_node}:{port}
        local_rank (int): id of the current device on the machine. If using gpus,
                        local_rank = gpu number on the current machine
        node_id (int): id of the current machine. starts from 0. valid for multi-gpu
    """

    # setup the environment variables
    set_env_vars(local_rank, node_id, cfg)
    dist_rank = int(os.environ["RANK"])

    # setup logging
    setup_logging(__name__, output_dir=checkpoint_folder, rank=dist_rank)

    logging.info(f"Env set for rank: {local_rank}, dist_rank: {dist_rank}")
    # print the environment info for the current node
    if local_rank == 0:
        current_env = os.environ.copy()
        print_system_env_info(current_env)

    # setup the multiprocessing to be forkserver.
    # See https://fb.quip.com/CphdAGUaM5Wf
    setup_multiprocessing_method(cfg.MULTI_PROCESSING_METHOD)

    # set seeds
    logging.info("Setting seed....")
    set_seeds(cfg, dist_rank)

    # We set the CUDA device here as well as a safe solution for all downstream
    # `torch.cuda.current_device()` calls to return correct device.
    if cfg.MACHINE.DEVICE == "gpu" and torch.cuda.is_available():
        local_rank, _ = get_machine_local_and_dist_rank()
        torch.cuda.set_device(local_rank)

    # print the training settings and system settings
    if local_rank == 0:
        print_cfg(cfg)
        logging.info("System config:\n{}".format(collect_env_info()))

    trainer = SelfSupervisionTrainer(cfg, dist_run_id)
    features = trainer.extract()

    for split in features.keys():
        logging.info(f"============== Split: {split} =======================")
        for layer_name, layer_features in features[split].items():
            out_feat_file = os.path.join(
                checkpoint_folder,
                f"rank{dist_rank}_{split}_{layer_name}_features.npy")
            out_target_file = os.path.join(
                checkpoint_folder,
                f"rank{dist_rank}_{split}_{layer_name}_targets.npy")
            out_inds_file = os.path.join(
                checkpoint_folder,
                f"rank{dist_rank}_{split}_{layer_name}_inds.npy")
            feat_shape = layer_features["features"].shape
            logging.info(
                f"Saving extracted features of {layer_name} with shape {feat_shape} to: {out_feat_file}"
            )
            save_file(layer_features["features"], out_feat_file)
            logging.info(
                f"Saving extracted targets of {layer_name} to: {out_target_file}"
            )
            save_file(layer_features["targets"], out_target_file)
            logging.info(
                f"Saving extracted indices of {layer_name} to: {out_inds_file}"
            )
            save_file(layer_features["inds"], out_inds_file)

    logging.info("All Done!")
    # close the logging streams including the filehandlers
    shutdown_logging()
Example #23
def extract_label_predictions_main(
    cfg: AttrDict,
    dist_run_id: str,
    checkpoint_folder: str,
    local_rank: int = 0,
    node_id: int = 0,
):
    """
    Sets up and executes the label prediction workflow per machine. Runs the
    model in eval mode only, to extract the predicted labels.

    Args:
        cfg (AttrDict): user specified input config that has optimizer, loss, meters etc
                        settings relevant for the feature extraction.
        dist_run_id (str): For multi-gpu training with PyTorch, we have to specify
                           how the gpus are going to rendezvous. This requires specifying
                           the communication method: file, tcp and the unique rendezvous
                           run_id that is specific to 1 run.
                           We recommend:
                                1) for 1node: use init_method=tcp and run_id=auto
                                2) for multi-node, use init_method=tcp and specify
                                run_id={master_node}:{port}
        local_rank (int): id of the current device on the machine. If using gpus,
                        local_rank = gpu number on the current machine
        node_id (int): id of the current machine. starts from 0. valid for multi-gpu
    """

    # setup the environment variables
    set_env_vars(local_rank, node_id, cfg)
    dist_rank = int(os.environ["RANK"])

    # setup logging
    setup_logging(__name__, output_dir=checkpoint_folder, rank=dist_rank)

    # setup the multiprocessing to be forkserver. See https://fb.quip.com/CphdAGUaM5Wf
    logging.info(
        f"Setting multiprocessing method: {cfg.MULTI_PROCESSING_METHOD}")
    setup_multiprocessing_method(cfg.MULTI_PROCESSING_METHOD)

    # set seeds
    logging.info("Setting seed....")
    set_seeds(cfg, dist_rank)

    # We set the CUDA device here as well as a safe solution for all downstream
    # `torch.cuda.current_device()` calls to return correct device.
    if cfg.MACHINE.DEVICE == "gpu" and torch.cuda.is_available():
        local_rank, _ = get_machine_local_and_dist_rank()
        torch.cuda.set_device(local_rank)

    # print the training settings and system settings
    # print the environment info for the current node
    logging.info(f"Env set for rank: {local_rank}, dist_rank: {dist_rank}")
    if local_rank == 0:
        current_env = os.environ.copy()
        print_system_env_info(current_env)
        print_cfg(cfg)
        logging.info(f"System config:\n{collect_env_info()}")

    # Identify the hooks to run for the extract label engine
    # TODO - we need to plug this better with the engine registry
    #  - we either need to use the global hooks registry
    #  - or we need to create specific hook registry by engine
    hooks = extract_label_hook_generator(cfg)

    trainer = SelfSupervisionTrainer(cfg, dist_run_id, hooks=hooks)
    trainer.extract(
        output_folder=cfg.EXTRACT_FEATURES.OUTPUT_DIR or checkpoint_folder,
        extract_features=False,
        extract_predictions=True,
    )

    logging.info("All Done!")
    # close the logging streams including the filehandlers
    shutdown_logging()
Example #24
def extract_main(cfg: AttrDict,
                 dist_run_id: str,
                 local_rank: int = 0,
                 node_id: int = 0):
    """
    Sets up and executes feature extraction workflow per machine.

    Args:
        cfg (AttrDict): user specified input config that has optimizer, loss, meters etc
                        settings relevant to the training
        dist_run_id (str): For multi-gpu training with PyTorch, we have to specify
                           how the gpus are going to rendezvous. This requires specifying
                           the communication method: file, tcp and the unique rendezvous
                           run_id that is specific to 1 run.
                           We recommend:
                                1) for 1node: use init_method=tcp and run_id=auto
                                2) for multi-node, use init_method=tcp and specify
                                run_id={master_node}:{port}
        local_rank (int): id of the current device on the machine. If using gpus,
                        local_rank = gpu number on the current machine
        node_id (int): id of the current machine. starts from 0. valid for multi-gpu
    """

    # setup logging
    setup_logging(__name__)
    # setup the environment variables
    set_env_vars(local_rank, node_id, cfg)

    # setup the multiprocessing to be forkserver.
    # See https://fb.quip.com/CphdAGUaM5Wf
    setup_multiprocessing_method(cfg.MULTI_PROCESSING_METHOD)

    # set seeds
    logging.info("Setting seed....")
    set_seeds(cfg)

    # print the training settings and system settings
    local_rank, _ = get_machine_local_and_dist_rank()
    if local_rank == 0:
        print_cfg(cfg)
        logging.info("System config:\n{}".format(collect_env_info()))

    output_dir = get_checkpoint_folder(cfg)
    trainer = SelfSupervisionTrainer(cfg, dist_run_id)
    features = trainer.extract()

    for split in features.keys():
        logging.info(f"============== Split: {split} =======================")
        layers = features[split].keys()
        for layer in layers:
            out_feat_file = (
                f"{output_dir}/rank{local_rank}_{split}_{layer}_features.npy")
            out_target_file = (
                f"{output_dir}/rank{local_rank}_{split}_{layer}_targets.npy")
            out_inds_file = f"{output_dir}/rank{local_rank}_{split}_{layer}_inds.npy"
            logging.info("Saving extracted features: {} {} to: {}".format(
                layer, features[split][layer]["features"].shape,
                out_feat_file))
            save_file(features[split][layer]["features"], out_feat_file)
            logging.info("Saving extracted targets: {} to: {}".format(
                features[split][layer]["targets"].shape, out_target_file))
            save_file(features[split][layer]["targets"], out_target_file)
            logging.info("Saving extracted indices: {} to: {}".format(
                features[split][layer]["inds"].shape, out_inds_file))
            save_file(features[split][layer]["inds"], out_inds_file)
    logging.info("All Done!")
    # close the logging streams including the filehandlers
    shutdown_logging()
Example #25
 def __init__(self):
     super().__init__()
     self.dist_rank = get_machine_local_and_dist_rank()[1]
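
Every example on this page calls get_machine_local_and_dist_rank; conceptually it returns the machine-local rank and the global distributed rank, presumably read from the standard LOCAL_RANK / RANK environment variables as in this minimal sketch (the real VISSL helper may differ in details):

import os
from typing import Tuple


def get_machine_local_and_dist_rank() -> Tuple[int, int]:
    # local rank: device index on this machine; dist rank: global rank across machines
    local_rank = int(os.environ.get("LOCAL_RANK", 0))
    dist_rank = int(os.environ.get("RANK", 0))
    return local_rank, dist_rank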
Example #26
def extract_features_main(
    cfg: AttrDict,
    dist_run_id: str,
    checkpoint_folder: str,
    local_rank: int = 0,
    node_id: int = 0,
):
    """
    Sets up and executes feature extraction workflow per machine.

    Args:
        cfg (AttrDict): user specified input config that has optimizer, loss, meters etc
                        settings relevant to the training
        dist_run_id (str): For multi-gpu training with PyTorch, we have to specify
                           how the gpus are going to rendezvous. This requires specifying
                           the communication method: file, tcp and the unique rendezvous
                           run_id that is specific to 1 run.
                           We recommend:
                                1) for 1node: use init_method=tcp and run_id=auto
                                2) for multi-node, use init_method=tcp and specify
                                run_id={master_node}:{port}
        checkpoint_folder (str): what directory to use for checkpointing. This folder
                                 will be used to output the extracted features as well
                                 in case config.EXTRACT_FEATURES.OUTPUT_DIR is not set
        local_rank (int): id of the current device on the machine. If using gpus,
                        local_rank = gpu number on the current machine
        node_id (int): id of the current machine. starts from 0. valid for multi-gpu
    """

    # setup the environment variables
    set_env_vars(local_rank, node_id, cfg)
    dist_rank = int(os.environ["RANK"])

    # setup logging
    setup_logging(__name__, output_dir=checkpoint_folder, rank=dist_rank)

    logging.info(f"Env set for rank: {local_rank}, dist_rank: {dist_rank}")
    # print the environment info for the current node
    if local_rank == 0:
        current_env = os.environ.copy()
        print_system_env_info(current_env)

    # setup the multiprocessing to be forkserver.
    # See https://fb.quip.com/CphdAGUaM5Wf
    setup_multiprocessing_method(cfg.MULTI_PROCESSING_METHOD)

    # set seeds
    logging.info("Setting seed....")
    set_seeds(cfg, dist_rank)

    # We set the CUDA device here as well as a safe solution for all downstream
    # `torch.cuda.current_device()` calls to return correct device.
    if cfg.MACHINE.DEVICE == "gpu" and torch.cuda.is_available():
        local_rank, _ = get_machine_local_and_dist_rank()
        torch.cuda.set_device(local_rank)

    # print the training settings and system settings
    if local_rank == 0:
        print_cfg(cfg)
        logging.info("System config:\n{}".format(collect_env_info()))

    # Identify the hooks to run for the extract label engine
    # TODO - we need to plug this better with the engine registry
    #  - we either need to use the global hooks registry
    #  - or we need to create specific hook registry by engine
    hooks = extract_features_hook_generator(cfg)

    # Run the label prediction extraction
    trainer = SelfSupervisionTrainer(cfg, dist_run_id, hooks=hooks)
    output_dir = cfg.EXTRACT_FEATURES.OUTPUT_DIR or checkpoint_folder
    trainer.extract(
        output_folder=output_dir,
        extract_features=True,
        extract_predictions=False,
    )

    # TODO (prigoyal): merge this function with _extract_features
    if dist_rank == 0 and cfg.EXTRACT_FEATURES.MAP_FEATURES_TO_IMG_NAME:
        # Get the names of the features that we extracted features for. If user doesn't
        # specify the features to evaluate, we get the full model output and freeze
        # head/trunk both as caution.
        layers = get_trunk_output_feature_names(cfg.MODEL)
        if len(layers) == 0:
            layers = ["heads"]
        available_splits = [
            item.lower() for item in trainer.task.available_splits
        ]
        for split in available_splits:
            image_paths = trainer.task.datasets[split].get_image_paths()[0]
            for layer in layers:
                ExtractedFeaturesLoader.map_features_to_img_filepath(
                    image_paths=image_paths,
                    input_dir=output_dir,
                    split=split,
                    layer=layer,
                )

    logging.info("All Done!")
    # close the logging streams including the filehandlers
    shutdown_logging()