def _build_moco_encoder(self, task: tasks.ClassyTask) -> None:
    """
    Build the MoCo encoder and attach it to the loss as ``moco_encoder``.

    The encoder is built from the same MODEL / OPTIMIZER config as the main
    model and moved to ``task.device``. If the loss holds a checkpoint, the
    loss state is restored from it; otherwise every encoder parameter is
    copied from ``task.base_model`` and frozen.
    """
    logging.info(
        "Building MoCo encoder - rank %s %s", *get_machine_local_and_dist_rank()
    )

    # Same architecture as the main model.
    model_cfg = task.config["MODEL"]
    optimizer_cfg = task.config["OPTIMIZER"]
    task.loss.moco_encoder = build_model(model_cfg, optimizer_cfg)
    task.loss.moco_encoder.to(task.device)

    # A stored checkpoint takes precedence over initializing from the model.
    saved_state = task.loss.checkpoint
    if saved_state is not None:
        task.loss.load_state_dict(saved_state)
        return

    param_pairs = zip(
        task.base_model.parameters(), task.loss.moco_encoder.parameters()
    )
    for source_param, encoder_param in param_pairs:
        encoder_param.data.copy_(source_param.data)
        encoder_param.requires_grad = False
def _build_momentum_network(self, task: tasks.ClassyTask) -> None:
    """
    Build the momentum encoder and attach it to the loss.

    The encoder shares the main model's config, has its BatchNorm layers
    converted to SyncBatchNorm, and is wrapped for distributed data parallel.
    Without a loss checkpoint, parameters and ``running_*`` buffers are
    copied from ``task.base_model``; with one, the loss state is restored
    after the wrapping.
    """
    rank_info = get_machine_local_and_dist_rank()
    logging.info("Building momentum encoder - rank %s %s", *rank_info)

    loss = task.loss

    # Same architecture as the main model, with synchronized batch norm.
    loss.momentum_encoder = build_model(
        task.config["MODEL"], task.config["OPTIMIZER"]
    )
    loss.momentum_encoder = nn.SyncBatchNorm.convert_sync_batchnorm(
        loss.momentum_encoder
    )
    if task.use_gpu:
        device_name = "cuda"
    else:
        device_name = "cpu"
    loss.momentum_encoder.to(torch.device(device_name))

    # Initialize from the main model when there is nothing to restore.
    if loss.checkpoint is None:
        param_pairs = zip(
            task.base_model.parameters(), loss.momentum_encoder.parameters()
        )
        for source_param, encoder_param in param_pairs:
            encoder_param.data.copy_(source_param.data)

        buffer_pairs = zip(
            task.base_model.named_buffers(),
            loss.momentum_encoder.named_buffers(),
        )
        for (_, source_buf), (encoder_buf_name, encoder_buf) in buffer_pairs:
            # Only the running statistics are copied.
            if "running_" in encoder_buf_name:
                encoder_buf.data.copy_(source_buf.data)

    loss.momentum_encoder = init_distributed_data_parallel_model(
        loss.momentum_encoder
    )

    # Restore a checkpoint if one is available.
    if loss.checkpoint is not None:
        loss.load_state_dict(loss.checkpoint)
def __init__(self, profiling_config: AttrDict):
    """
    Configure runtime and layer-wise memory profiling from the config.

    Args:
        profiling_config (AttrDict): profiling settings. Reads OUTPUT_FOLDER,
            START_ITERATION, WARMUP_ITERATIONS, NUM_ITERATIONS,
            STOP_TRAINING_AFTER_PROFILING, PROFILED_RANKS, RUNTIME_PROFILING
            and (on profiled ranks) MEMORY_PROFILING.
    """
    super().__init__()
    wait_iters = profiling_config.START_ITERATION
    warmup_iters = profiling_config.WARMUP_ITERATIONS
    active_iters = profiling_config.NUM_ITERATIONS
    runtime_cfg = profiling_config.RUNTIME_PROFILING

    self.output_folder = profiling_config.OUTPUT_FOLDER
    # The profiled window starts after the warm-up iterations.
    self.start_iteration = wait_iters + warmup_iters
    self.end_iteration = self.start_iteration + active_iters
    self.interrupt_training = profiling_config.STOP_TRAINING_AFTER_PROFILING

    _, self.dist_rank = get_machine_local_and_dist_rank()
    self.is_profiling_rank = self.dist_rank in profiling_config.PROFILED_RANKS

    self.profile_runtime = self.is_profiling_rank and runtime_cfg.USE_PROFILER
    self.runtime_profiler = create_runtime_profiler(
        enabled=self.profile_runtime,
        use_cpu=runtime_cfg.PROFILE_CPU,
        use_cuda=runtime_cfg.PROFILE_GPU,
        wait=wait_iters,
        warmup=warmup_iters,
        active=active_iters,
        legacy_profiler=runtime_cfg.LEGACY_PROFILER,
    )

    self.profile_by_layer_memory = (
        self.is_profiling_rank
        and profiling_config.MEMORY_PROFILING.TRACK_BY_LAYER_MEMORY
    )
    if self.profile_by_layer_memory:
        logging.info(f"Setting up memory tracker for rank {self.dist_rank}...")
        self.layer_memory_tracker = LayerwiseMemoryTracker()
def _print_and_save_meters(self, task, train_phase_idx):
    """
    Record the current meter values, log them, and append them to the
    ``metrics.json`` file in the checkpoint folder.

    During a train phase, meters are only recorded when
    ``task.config["METERS"]["enable_training_meter"]`` is set; during a test
    phase they are always recorded.
    """
    rank, _ = get_machine_local_and_dist_rank()
    if task.train:
        phase_type = "train"
    else:
        phase_type = "test"

    save_metrics = {
        "iteration": task.iteration,
        "phase_idx": task.phase_idx,
        "train_phase_idx": train_phase_idx,
    }

    for meter in task.meters:
        # Inside this loop task.meters is non-empty, so only the
        # training-meter switch can disable recording.
        if task.train and not task.config["METERS"]["enable_training_meter"]:
            continue

        meter_value = meter.value
        metric_key = f"{phase_type}_{meter.name}"
        if metric_key not in task.metrics:
            task.metrics[metric_key] = []
        task.metrics[metric_key].append(meter_value)
        save_metrics[metric_key] = meter_value
        logging.info(f"Rank: {rank}, name: {metric_key}, value: {meter_value}")

    save_file(
        save_metrics, f"{task.checkpoint_folder}/metrics.json", append_to_json=True
    )
def _get_data_files(self, split):
    """
    Resolve the data and label paths for the given split.

    The lookup is delegated to ``dataset_catalog.get_data_files`` using the
    ``DATA`` section of the config; the results are stored on
    ``self.data_paths`` and ``self.label_paths`` and logged.

    Args:
        split: dataset split name passed to the dataset catalog.
    """
    local_rank, _ = get_machine_local_and_dist_rank()

    data_paths, label_paths = dataset_catalog.get_data_files(
        split, dataset_config=self.cfg["DATA"]
    )
    self.data_paths = data_paths
    self.label_paths = label_paths

    log_prefix = f"Rank: {local_rank} split: {split}"
    logging.info(f"{log_prefix} Data files:\n{self.data_paths}")
    logging.info(f"{log_prefix} Label files:\n{self.label_paths}")
def value(self):
    """
    Compute the mean average precision (mAP) and the per-class AP from the
    accumulated scores and targets.

    Targets take the values 0, 1 or -1, where -1 marks an unknown label.
    AP is computed only for classes that have at least one positive example;
    the remaining classes keep the sentinel value -1 and are excluded from
    the mean.

    Returns:
        dict: ``{"mAP": float, "AP": torch.Tensor}`` where ``AP`` has one
        entry per class (-1 for classes without a positive example).
    """
    _, distributed_rank = get_machine_local_and_dist_rank()
    logging.info(
        f"Rank: {distributed_rank} Mean AP meter: "
        f"scores: {self._scores.shape}, target: {self._targets.shape}"
    )
    ap_matrix = torch.ones(self.num_classes, dtype=torch.float32) * -1
    # targets matrix = 0, 1, -1
    # unknown matrix = 0, 1 where 1 means that it's an unknown
    unknown_matrix = torch.eq(self._targets, -1.0).float().detach().numpy()
    for cls_num in range(self.num_classes):
        # compute AP only for classes that have at least one positive example
        num_pos = len(torch.where(self._targets[:, cls_num] == 1)[0])
        if num_pos == 0:
            continue
        # `np.float` was removed in NumPy 1.24; it was an alias of the builtin
        # float, i.e. float64, so `np.float64` yields the identical dtype.
        known_mask = (unknown_matrix[:, cls_num] == 0).astype(np.float64)
        # Only the AP is needed from the returned tuple.
        _, _, _, ap = get_precision_recall(
            self._targets[:, cls_num].detach().numpy(),
            self._scores[:, cls_num].detach().numpy(),
            known_mask,
        )
        ap_matrix[cls_num] = ap[0]
    # Classes still holding the -1 sentinel are left out of the mean.
    nonzero_indices = torch.nonzero(ap_matrix != -1)
    if nonzero_indices.shape[0] < self.num_classes:
        logging.info(
            f"{nonzero_indices.shape[0]} out of {self.num_classes} classes "
            "have meaningful average precision"
        )
    mean_ap = ap_matrix[nonzero_indices].mean().item()
    return {"mAP": mean_ap, "AP": ap_matrix}
def extract_clusters(
    cfg: AttrDict,
    dist_run_id: str,
    checkpoint_folder: str,
    local_rank: int = 0,
    node_id: int = 0,
):
    """
    Set up and run the cluster assignment extraction workflow on one node.

    Args:
        cfg (AttrDict): user specified config.
        dist_run_id (str): rendezvous identifier forwarded to the trainer.
        checkpoint_folder (str): directory used for the log output.
        local_rank (int): id of the current device on the machine.
        node_id (int): id of the current machine.
    """
    # Environment variables must be set before the distributed rank is read.
    set_env_vars(local_rank, node_id, cfg)
    dist_rank = int(os.environ["RANK"])

    setup_logging(__name__, output_dir=checkpoint_folder, rank=dist_rank)
    logging.info(f"Env set for rank: {local_rank}, dist_rank: {dist_rank}")

    # Print the environment info once per node.
    if local_rank == 0:
        print_system_env_info(os.environ.copy())

    # Multiprocessing start method comes from the config.
    # See https://fb.quip.com/CphdAGUaM5Wf
    setup_multiprocessing_method(cfg.MULTI_PROCESSING_METHOD)

    logging.info("Setting seed....")
    set_seeds(cfg, dist_rank)

    # Pin the CUDA device so downstream `torch.cuda.current_device()` calls
    # return the correct device.
    use_cuda = cfg.MACHINE.DEVICE == "gpu" and torch.cuda.is_available()
    if use_cuda:
        local_rank, _ = get_machine_local_and_dist_rank()
        torch.cuda.set_device(local_rank)

    # Print the settings once per node.
    if local_rank == 0:
        print_cfg(cfg)
        logging.info(f"System config:\n{collect_env_info()}")

    # The trainer sets up distributed execution and computes the cluster
    # assignments.
    trainer = SelfSupervisionTrainer(cfg, dist_run_id)
    cluster_assignments = trainer.extract_clusters()

    # Only the rank-0 process writes the assignments.
    if dist_rank == 0:
        assignments = ClusterAssignment(
            config=cfg, cluster_assignments=cluster_assignments
        )
        ClusterAssignmentLoader.save_cluster_assignment(
            output_dir=get_checkpoint_folder(cfg), assignments=assignments
        )

    logging.info("All Done!")
    # Close the logging streams including the file handlers.
    shutdown_logging()
def _save_label_cls_idx_map(self, cls_idx_map: Dict[str, int], split: str):
    """
    Write the label-to-index mapping for ``split`` into the checkpoint folder.

    Only the process with distributed rank 0 writes, and an existing file is
    never overwritten.
    """
    _, dist_rank = get_machine_local_and_dist_rank()
    if dist_rank != 0:
        return

    folder = get_checkpoint_folder(self.cfg)
    map_path = f"{folder}/{split.lower()}_label_to_index_map.json"
    if g_pathmgr.exists(map_path):
        return
    save_file(cls_idx_map, map_path, append_to_json=False)
def _advance_phase(self, task: ClassyTask):
    """
    Move the task to its next phase.

    In order: resets the meters and the loss history, increments the phase
    index (and the train phase index for a train phase), recreates the data
    iterator, switches the model between train and eval mode, and runs the
    optimizer's per-epoch update for train phases.
    """
    # Fresh meters and loss history for the new phase.
    for meter in task.meters:
        meter.reset()
    task.losses = []

    task.phase_idx += 1
    phase = task.phases[task.phase_idx]
    task.train = bool(phase["train"])
    if task.train:
        task.train_phase_idx += 1

    if phase["train"]:
        phase_type = "train"
    else:
        phase_type = "test"

    # A regular phase change starts from iteration 0. Right after resuming
    # from a checkpoint the start iteration must be recomputed, because the
    # data iterator is rebuilt here.
    compute_start_iter = (
        task.checkpoint is not None
        and task.checkpoint["train_phase_idx"] == task.train_phase_idx - 1
    )

    task.recreate_data_iterator(
        phase_type,
        epoch=task.phase_idx,
        compute_start_iter=compute_start_iter,
        train_phase_idx=task.train_phase_idx,
    )

    # Train or eval mode, depending on the phase.
    task.model.train(phase["train"])

    if task.train and task.train_phase_idx >= 0:
        task.optimizer.on_epoch(task.where)

    local_rank, _ = get_machine_local_and_dist_rank()
    logging.info(f"Phase advanced. Rank: {local_rank}")
def __init__(self, model_config, optimizer_config):
    """
    Build the model: store the configs, create the trunk, then the heads.

    Args:
        model_config: model settings, also used to derive the trunk output
            feature names.
        optimizer_config: optimizer settings, stored on the instance.
    """
    # The configs are stored before the parent initializer runs.
    self.model_config = model_config
    self.optimizer_config = optimizer_config
    super().__init__()

    self.eval_mode = None  # this is just informational
    rank_info = get_machine_local_and_dist_rank()
    self.local_rank = rank_info[0]

    self.trunk = self._get_trunk()

    self.heads = nn.ModuleList()
    self.head_names = []
    self._output_feature_names = get_trunk_output_feature_names(
        self.model_config
    )
    self._get_heads()
    self._setup_multi_input_head_mapping()
def _load_labels(self):
    """
    Load the labels for every configured label source and append them to
    ``self.label_objs``.

    Supported sources:
      - ``disk_filelist``: labels are read with ``load_single_label_file``;
        a list of paths yields a list of label sets.
      - ``disk_folder``: labels come from the image dataset object created
        while loading the images.
      - ``torchvision_dataset``: labels come from the data object's
        ``get_labels()``.
      - ``synthetic``: every sample gets label 0.

    Raises:
        ValueError: if a label source is not one of the above.
    """
    local_rank, _ = get_machine_local_and_dist_rank()

    for idx, label_source in enumerate(self.label_sources):
        if label_source == "disk_filelist":
            paths = self.label_paths[idx]
            # Multiple label files are supported; the user's collator is
            # expected to handle the resulting multiple labels.
            logging.info(f"Loading labels: {paths}")
            if isinstance(paths, list):
                labels = [self.load_single_label_file(path) for path in paths]
            else:
                single_file_labels = self.load_single_label_file(paths)
                labels = self._convert_to_numeric_ids(single_file_labels)
        elif label_source == "disk_folder":
            # Labels are inferred from the directory structure, so the data
            # source must be a disk folder as well.
            assert self.data_sources[idx] == self.label_sources[idx]
            if local_rank == 0:
                logging.info(
                    f"Using {label_source} labels from {self.data_paths[idx]}"
                )
            # Reuse the dataset object built when loading the images, since
            # creating it again can be expensive.
            samples = self.data_objs[idx].image_dataset.samples
            labels = np.array([sample[1] for sample in samples]).astype(np.int64)
        elif label_source == "torchvision_dataset":
            raw_labels = self.data_objs[idx].get_labels()
            labels = np.array(raw_labels).astype(np.int64)
        elif label_source == "synthetic":
            labels = np.array([0] * len(self.data_objs[idx]))
        else:
            raise ValueError(f"unknown label source: {label_source}")

        self.label_objs.append(labels)
def load_and_broadcast_checkpoint(
    cls, checkpoint_folder: str, checkpoint_path: str, device
):
    """
    Load the checkpoint at ``checkpoint_path``, following the indirection of
    a sharded checkpoint.

    If the loaded checkpoint is a shard aggregator, the shard listed for the
    current distributed rank is loaded from ``checkpoint_folder`` and
    returned instead.
    """
    checkpoint = load_and_broadcast_checkpoint(checkpoint_path, device)
    if not cls._is_shard_aggregator_checkpoint(checkpoint):
        return checkpoint

    global_rank = get_machine_local_and_dist_rank()[1]
    shard_file = checkpoint["shards"][global_rank]
    return load_checkpoint(os.path.join(checkpoint_folder, shard_file), device)
def __init__(self, profiling_config: AttrDict):
    """
    Configure layer-wise memory profiling from the config.

    Args:
        profiling_config (AttrDict): profiling settings. Reads OUTPUT_FOLDER,
            START_ITERATION, NUM_ITERATIONS, PROFILED_RANKS and (on profiled
            ranks) MEMORY_PROFILING.TRACK_BY_LAYER_MEMORY.
    """
    super().__init__()
    first_iteration = profiling_config.START_ITERATION

    self.output_folder = profiling_config.OUTPUT_FOLDER
    self.start_iteration = first_iteration
    self.end_iteration = first_iteration + profiling_config.NUM_ITERATIONS

    _, self.dist_rank = get_machine_local_and_dist_rank()
    self.enabled = self.dist_rank in profiling_config.PROFILED_RANKS

    self.profile_memory = (
        self.enabled and profiling_config.MEMORY_PROFILING.TRACK_BY_LAYER_MEMORY
    )
    if self.profile_memory:
        logging.info(f"Setting up memory tracker for rank {self.dist_rank}...")
        self.layer_memory_tracker = LayerwiseMemoryTracker()
def to_pytorch_syncbn(group_size):
    """
    Convert the BN layers of ``model`` to PyTorch SyncBatchNorm.

    With ``group_size`` set, the world is split into process groups of that
    size, every group is created on every rank, and the group containing the
    current rank is used. With ``group_size=None`` no process group is
    passed.
    """
    logging.info("Converting BN layers to PyTorch SyncBN")

    if group_size is not None:
        world_size = (
            config.DISTRIBUTED.NUM_PROC_PER_NODE * config.DISTRIBUTED.NUM_NODES
        )
        rank_partitions = split_world_in_process_groups(
            world_size=world_size, group_size=group_size
        )
        all_groups = []
        for ranks in rank_partitions:
            all_groups.append(dist.new_group(ranks))
        dist_rank = get_machine_local_and_dist_rank()[1]
        process_group = all_groups[dist_rank // group_size]
    else:
        process_group = None
        logging.info("Not creating process_group for PyTorch SyncBN...")

    return nn.SyncBatchNorm.convert_sync_batchnorm(
        model, process_group=process_group
    )
def _sync_and_print_meters(self, task):
    """
    Sync every meter, then record and log the meter values on the primary
    process.

    During a train phase, values are only recorded when
    ``task.config["METERS"]["enable_training_meter"]`` is set.
    """
    for meter in task.meters:
        meter.sync_state()
    logging.info("Meters synced")

    if not is_primary():
        return

    rank, _ = get_machine_local_and_dist_rank()
    for meter in task.meters:
        # Inside this loop task.meters is non-empty, so only the
        # training-meter switch can disable recording.
        if task.train and not task.config["METERS"]["enable_training_meter"]:
            continue

        meter_value = meter.value
        metric_key = f"{meter.name}"
        if metric_key not in task.metrics:
            task.metrics[metric_key] = []
        task.metrics[metric_key].append(meter_value)
        logging.info(f"Rank: {rank}, name: {metric_key}, value: {meter_value}")
def on_forward(self, task: "tasks.ClassyTask") -> None:
    """
    After each forward pass, verify that the model output is finite.

    When a NaN or infinite value is found, the model is checkpointed and the
    model output and the input sample are saved to the checkpoint folder,
    one file each per distributed rank, to help debugging.
    """
    outputs = task.last_batch.model_output
    if isinstance(outputs, list):
        finite_flags = [torch.isfinite(output).all() for output in outputs]
        all_finite = torch.tensor(finite_flags).all()
    else:
        all_finite = torch.isfinite(outputs).all()

    if all_finite:
        return

    _, dist_rank = get_machine_local_and_dist_rank()
    logging.info(f"Infinite Model output or NaN at iteration={task.iteration}.")

    # TODO - this code was broken during a refactoring: improve it
    from vissl.hooks.log_hooks import LogLossMetricsCheckpointHook

    LogLossMetricsCheckpointHook.checkpoint_model(
        task,
        world_size=self.world_size,
        mode_frequency=1,
        mode_num=task.iteration,
        mode="iteration",
    )

    output_path = f"{task.checkpoint_folder}/rank{dist_rank}_model_output.pth"
    sample_path = f"{task.checkpoint_folder}/rank{dist_rank}_input_sample.pth"
    with PathManager.open(output_path, "wb") as stream:
        torch.save(outputs, stream)
    with PathManager.open(sample_path, "wb") as stream:
        torch.save(task.last_batch.sample, stream)
    logging.info(f"Saved model output: {output_path}")
    logging.info(f"Saved model input: {sample_path}")
def on_forward(self, task: "tasks.ClassyTask") -> None:
    """
    After each forward pass, verify that the model output is finite.

    When a NaN or infinite value is found, the model is checkpointed through
    ``self._checkpoint_model`` and the model output and the input sample are
    saved to the checkpoint folder, one file each per distributed rank.
    """
    outputs = task.last_batch.model_output
    if isinstance(outputs, list):
        finite_flags = [torch.isfinite(output).all() for output in outputs]
        all_finite = torch.tensor(finite_flags).all()
    else:
        all_finite = torch.isfinite(outputs).all()

    if all_finite:
        return

    _, dist_rank = get_machine_local_and_dist_rank()
    logging.info(f"Infinite Model output or NaN at iteration={task.iteration}.")

    self._checkpoint_model(
        task,
        task.train_phase_idx,
        mode_frequency=1,
        mode_num=task.iteration,
        mode="iteration",
    )

    output_path = f"{task.checkpoint_folder}/rank{dist_rank}_model_output.pth"
    sample_path = f"{task.checkpoint_folder}/rank{dist_rank}_input_sample.pth"
    with PathManager.open(output_path, "wb") as stream:
        torch.save(outputs, stream)
    with PathManager.open(sample_path, "wb") as stream:
        torch.save(task.last_batch.sample, stream)
    logging.info(f"Saved model output: {output_path}")
    logging.info(f"Saved model input: {sample_path}")
def __init__(
    self,
    cfg: AttrDict,
    dist_run_id: str,
    checkpoint_path: str = None,
    checkpoint_folder: str = None,
    hooks: List[ClassyHook] = None,
):
    """
    Set up distributed execution and build the task.

    Args:
        cfg (AttrDict): user specified config.
        dist_run_id (str): rendezvous identifier, stored on the instance.
        checkpoint_path (str): checkpoint path handed to the task.
        checkpoint_folder (str): checkpoint folder handed to the task.
        hooks (List[ClassyHook]): hooks to attach to the task; ``None`` is
            treated as an empty list.
    """
    self.cfg = cfg
    self.dist_run_id = dist_run_id

    rank_info = get_machine_local_and_dist_rank()
    self.local_rank, self.distributed_rank = rank_info

    self.setup_distributed(self.cfg.MACHINE.DEVICE == "gpu")

    # The task is built from the config and then receives the checkpoint
    # locations and the hooks.
    self.task = build_task(self.cfg)
    self.task.set_checkpoint_path(checkpoint_path)
    self.task.set_checkpoint_folder(checkpoint_folder)
    self.task.set_hooks(hooks if hooks is not None else [])
def _checkpoint_model(self, task, train_phase_idx, mode_frequency, mode_num, mode="phase"):
    """
    Checkpoint the model state if the current phase/iteration requires it.

    Can be called in 3 possible scenarios:
    1. If training becomes NaN, then we checkpoint the model to facilitate
       debugging
    2. After every N epochs (CHECKPOINT_FREQ), model state is checkpointed.
    3. If user wants to checkpoint during the epoch (ie. after every few
       training iterations, the model state is checkpointed.)

    Nothing is written unless ``task.train`` is set and either
    ``is_checkpoint_phase`` returns true or this is the final train phase
    (the latter only in "phase" mode). A sharded checkpoint is written by
    every rank when ``task.base_model`` is an FSDP instance; otherwise a
    consolidated checkpoint is written by the primary process only.

    Args:
        task: Self-supervision task that hold information about
              training iteration, epoch number etc.
        train_phase_idx (int): current training phase number. Starts from 0
        mode_frequency (int): mode can be "phase" or "iteration". Frequency
                              of checkpointing for the given mode
        mode_num (int): for the checkpointing mode (phase or iteration), the
                        number of phase or iteration at which checkpointing
                        is being done
        mode (str): checkpointing mode, "phase" (default) or "iteration".
    """
    phase_idx = task.phase_idx
    # num_train_phases = num_epochs * num_phases_per_epoch
    # For OSS use, num_train_phases will be equal to num_epochs
    num_train_phases = task.num_train_phases

    # check if we need to checkpoint this phase
    is_checkpointing_phase = is_checkpoint_phase(mode_num, mode_frequency, train_phase_idx, num_train_phases, mode)
    is_final_train_phase = ((train_phase_idx == (num_train_phases - 1)) and task.train and mode == "phase")

    # handle checkpoint:
    if task.train and (is_final_train_phase or is_checkpointing_phase):
        # - if sharded state consolidate the state
        # /!\ All the ranks have to participate
        # NOTE(review): consolidation is skipped when mode == "phase" -
        # confirm this is intended.
        if hasattr(task.optimizer, "consolidate_state_dict") and mode != "phase":
            logging.info(
                f"[{mode}: {mode_num}] Consolidating sharded state on all replicas"
            )
            task.optimizer.consolidate_state_dict()

        # Depending on whether we are in FSDP mode or not
        # - save the checkpoint on the primary rank
        # - save the sharded checkpoint on all ranks
        if is_primary() or isinstance(task.base_model, FSDP):
            checkpoint_folder = task.checkpoint_folder
            logging.info(
                f"[{mode}: {mode_num}] Saving checkpoint to {checkpoint_folder}"
            )
            model_state_dict = task.get_classy_state()

            # phase_idx is already incremented at the beginning of phase but if we
            # are checkpointing at an iteration in the middle of phase, we should not
            # save the incremented phase_idx as it will incorrectly assume that model
            # trained for that phase already.
            if mode == "iteration":
                model_state_dict[
                    "phase_idx"] = model_state_dict["phase_idx"] - 1
                if task.train:
                    train_phase_idx = train_phase_idx - 1
                    model_state_dict["train_phase_idx"] = train_phase_idx
                restart_phase = phase_idx - 1
                restart_iteration = task.iteration

            # When loading from a phase checkpoint:
            else:
                restart_phase = phase_idx
                restart_iteration = task.iteration

            # Everything stored in the checkpoint besides the model state:
            # restart position, loss state and iteration counters.
            checkpoint_content = {
                "phase_idx": restart_phase,
                "iteration": restart_iteration,
                "loss": task.loss.state_dict(),
                "iteration_num": task.local_iteration_num,
                "train_phase_idx": train_phase_idx,
                "classy_state_dict": model_state_dict,
            }

            checkpoint_writer = CheckpointWriter(
                checkpoint_folder=checkpoint_folder,
                is_final_train_phase=is_final_train_phase,
                mode=mode,
                mode_num=mode_num,
                backend=task.config["CHECKPOINT"]["BACKEND"],
            )

            # FSDP models save one shard per rank; other models save a
            # single consolidated checkpoint.
            if isinstance(task.base_model, FSDP):
                _, rank = get_machine_local_and_dist_rank()
                checkpoint_writer.save_sharded_checkpoint(
                    content=checkpoint_content,
                    shard_rank=rank,
                    world_size=self.world_size,
                )
            else:
                checkpoint_writer.save_consolidated_checkpoint(
                    checkpoint_content)
def init_model_from_consolidated_weights(
    config: AttrDict,
    model,
    state_dict: Dict[str, Any],
    state_dict_key_name: str,
    skip_layers: List[str],
    replace_prefix=None,
    append_prefix=None,
):
    """
    Initialize the model from a given params state dict.

    Args:
        config (AttrDict): config file
        model (object): model whose ``state_dict()`` entries are filled in
        state_dict (Dict): loaded contents of the user provided params file
        state_dict_key_name (string): key name containing the model state
            dict; when empty, ``state_dict`` is used as is
        skip_layers (List(string)): layers whose name contains any of these
            strings are not copied
        replace_prefix (string): forwarded to ``replace_module_prefix``
            (applied first; not used for "classy_state_dict")
        append_prefix (string): forwarded to ``append_module_prefix``
            (applied after replace_prefix; not used for "classy_state_dict")

    Returns:
        model (object): the model initialized from the weights

    Raises:
        AssertionError: if ``state_dict_key_name`` is not a key of
            ``state_dict``, or a checkpoint tensor and the matching model
            tensor have different shapes.
    """
    # Select the sub-dictionary that holds the model weights.
    if state_dict_key_name:
        assert (
            state_dict_key_name in state_dict
        ), f"Unknown state dict key: {state_dict_key_name}"
        state_dict = state_dict[state_dict_key_name]

    if state_dict_key_name == "classy_state_dict":
        # Checkpoints in this format are converted by the dedicated helper.
        state_dict = get_checkpoint_model_state_dict(config, state_dict)
    else:
        # Adjust the layer names so they match the model.
        if replace_prefix:
            state_dict = replace_module_prefix(state_dict, replace_prefix)
        if append_prefix:
            state_dict = append_module_prefix(state_dict, append_prefix)
    check_model_compatibilty(config, state_dict)

    model_layers = model.state_dict()
    local_rank, _ = get_machine_local_and_dist_rank()
    should_log = local_rank == 0
    name_width = max(len(name) for name in model_layers)

    for layername, target in model_layers.items():
        if any(item in layername for item in skip_layers):
            if should_log:
                logging.info(f"Ignored layer:\t{layername}")
            continue

        if layername not in state_dict:
            if should_log:
                logging.info(f"Not found:\t\t{layername}, not initialized")
            continue

        param = state_dict[layername]
        if not isinstance(param, torch.Tensor):
            param = torch.from_numpy(param)

        # Heads are loaded unless feature eval mode is on without
        # EVAL_TRUNK_AND_HEAD; every other layer is always loaded.
        load_layer = (
            "heads" not in layername
            or not config.MODEL.FEATURE_EVAL_SETTINGS.EVAL_MODE_ON
            or config.MODEL.FEATURE_EVAL_SETTINGS.EVAL_TRUNK_AND_HEAD
        )
        if not load_layer:
            if should_log:
                logging.info(f"Ignored layer:\t{layername}")
            continue

        # Position embeddings are interpolated to the model's size so a
        # different resolution than the pretraining one can be used.
        if "pos_embedding" in layername:
            param = interpolate_position_embeddings(model, target, param)

        assert target.shape == param.shape, (
            f"{layername} have different shapes: "
            f"checkpoint: {param.shape}, model: {target.shape}"
        )
        target.copy_(param)
        if should_log:
            logging.info(
                f"Loaded: {layername: <{name_width}} of "
                f"shape: {target.size()} from checkpoint"
            )

    if should_log:
        # Report checkpoint entries that have no counterpart in the model.
        extra_layers = [name for name in state_dict if name not in model_layers]
        logging.info(f"Extra layers not loaded from checkpoint: {extra_layers}")

    return model
def train_main(
    cfg: AttrDict,
    dist_run_id: str,
    checkpoint_path: str,
    checkpoint_folder: str,
    local_rank: int = 0,
    node_id: int = 0,
    hook_generator: Callable[[Any], List[ClassyHook]] = default_hook_generator,
):
    """
    Set up and run the training workflow for one process.

    Args:
        cfg (AttrDict): user specified input config that has optimizer, loss,
            meters etc settings relevant to the training
        dist_run_id (str): rendezvous identifier for multi-gpu training,
            forwarded to the trainer. Recommended:
                1) for 1node: use init_method=tcp and run_id=auto
                2) for multi-node, use init_method=tcp and specify
                   run_id={master_node}:{port}
        checkpoint_path (str): path of the checkpoint to resume from, if the
            training is being resumed
        checkpoint_folder (str): directory to use for checkpointing and for
            the log output
        local_rank (int): id of the current device on the machine. If using
            gpus, local_rank = gpu number on the current machine
        node_id (int): id of the current machine. starts from 0
        hook_generator (Callable): function that builds, from the config, the
            list of hooks used in training
    """
    # Environment variables must be set before the distributed rank is read.
    set_env_vars(local_rank, node_id, cfg)
    dist_rank = int(os.environ["RANK"])

    setup_logging(__name__, output_dir=checkpoint_folder, rank=dist_rank)
    logging.info(f"Env set for rank: {local_rank}, dist_rank: {dist_rank}")

    # Print the environment info once per node.
    if local_rank == 0:
        print_system_env_info(os.environ.copy())

    # Multiprocessing start method comes from the config.
    # See https://fb.quip.com/CphdAGUaM5Wf
    setup_multiprocessing_method(cfg.MULTI_PROCESSING_METHOD)

    logging.info("Setting seed....")
    set_seeds(cfg, dist_rank)

    # Pin the CUDA device so downstream `torch.cuda.current_device()` calls
    # return the correct device.
    use_cuda = cfg.MACHINE.DEVICE == "gpu" and torch.cuda.is_available()
    if use_cuda:
        local_rank, _ = get_machine_local_and_dist_rank()
        torch.cuda.set_device(local_rank)

    # Print the training and system settings once per node.
    if local_rank == 0:
        print_cfg(cfg)
        logging.info(f"System config:\n{collect_env_info()}")

    # Hooks are created per replica.
    hooks = hook_generator(cfg)

    # The trainer builds the task, receives the checkpoint locations and the
    # hooks, and runs the training.
    trainer = SelfSupervisionTrainer(
        cfg, dist_run_id, checkpoint_path, checkpoint_folder, hooks
    )
    trainer.train()
    logging.info("All Done!")

    # Close the logging streams including the file handlers.
    shutdown_logging()
def extract_main(
    cfg: AttrDict,
    dist_run_id: str,
    checkpoint_folder: str,
    local_rank: int = 0,
    node_id: int = 0,
):
    """
    Set up and run the feature extraction workflow for one process.

    For every split and layer, the extracted features, targets and indices
    are saved as ``rank{dist_rank}_{split}_{layer}_*.npy`` files in
    ``checkpoint_folder``.

    Args:
        cfg (AttrDict): user specified input config
        dist_run_id (str): rendezvous identifier for multi-gpu runs,
            forwarded to the trainer. Recommended:
                1) for 1node: use init_method=tcp and run_id=auto
                2) for multi-node, use init_method=tcp and specify
                   run_id={master_node}:{port}
        checkpoint_folder (str): directory receiving the logs and the
            extracted files
        local_rank (int): id of the current device on the machine. If using
            gpus, local_rank = gpu number on the current machine
        node_id (int): id of the current machine. starts from 0
    """
    # Environment variables must be set before the distributed rank is read.
    set_env_vars(local_rank, node_id, cfg)
    dist_rank = int(os.environ["RANK"])

    setup_logging(__name__, output_dir=checkpoint_folder, rank=dist_rank)
    logging.info(f"Env set for rank: {local_rank}, dist_rank: {dist_rank}")

    # Print the environment info once per node.
    if local_rank == 0:
        print_system_env_info(os.environ.copy())

    # Multiprocessing start method comes from the config.
    # See https://fb.quip.com/CphdAGUaM5Wf
    setup_multiprocessing_method(cfg.MULTI_PROCESSING_METHOD)

    logging.info("Setting seed....")
    set_seeds(cfg, dist_rank)

    # Pin the CUDA device so downstream `torch.cuda.current_device()` calls
    # return the correct device.
    use_cuda = cfg.MACHINE.DEVICE == "gpu" and torch.cuda.is_available()
    if use_cuda:
        local_rank, _ = get_machine_local_and_dist_rank()
        torch.cuda.set_device(local_rank)

    # Print the settings once per node.
    if local_rank == 0:
        print_cfg(cfg)
        logging.info(f"System config:\n{collect_env_info()}")

    trainer = SelfSupervisionTrainer(cfg, dist_run_id)
    features = trainer.extract()

    for split, split_features in features.items():
        logging.info(f"============== Split: {split} =======================")
        for layer_name, layer_features in split_features.items():
            # All three output files share the same name prefix.
            file_prefix = f"rank{dist_rank}_{split}_{layer_name}"
            out_feat_file = os.path.join(
                checkpoint_folder, f"{file_prefix}_features.npy"
            )
            out_target_file = os.path.join(
                checkpoint_folder, f"{file_prefix}_targets.npy"
            )
            out_inds_file = os.path.join(
                checkpoint_folder, f"{file_prefix}_inds.npy"
            )

            feat_shape = layer_features["features"].shape
            logging.info(
                f"Saving extracted features of {layer_name} "
                f"with shape {feat_shape} to: {out_feat_file}"
            )
            save_file(layer_features["features"], out_feat_file)

            logging.info(
                f"Saving extracted targets of {layer_name} to: {out_target_file}"
            )
            save_file(layer_features["targets"], out_target_file)

            logging.info(
                f"Saving extracted indices of {layer_name} to: {out_inds_file}"
            )
            save_file(layer_features["inds"], out_inds_file)

    logging.info("All Done!")
    # Close the logging streams including the file handlers.
    shutdown_logging()
def extract_label_predictions_main(
    cfg: AttrDict,
    dist_run_id: str,
    checkpoint_folder: str,
    local_rank: int = 0,
    node_id: int = 0,
):
    """
    Run the label prediction extraction workflow for one process.

    The trainer is asked to extract predictions only (no features). Results
    go to cfg.EXTRACT_FEATURES.OUTPUT_DIR when it is set, otherwise to
    `checkpoint_folder`.

    Args:
        cfg (AttrDict): user specified input config that has optimizer, loss,
                        meters etc settings relevant for the extraction.
        dist_run_id (str): rendezvous identifier handed to the trainer. For
                        multi-gpu runs with PyTorch it tells the processes how
                        to find each other. We recommend:
                            1) for 1node: use init_method=tcp and run_id=auto
                            2) for multi-node, use init_method=tcp and specify
                               run_id={master_node}:{port}
        checkpoint_folder (str): directory passed to the logging setup, and
                        fallback output folder for the predictions
        local_rank (int): id of the current device on the machine. If using
                        gpus, local_rank = gpu number on the current machine
        node_id (int): id of the current machine. starts from 0.
                        valid for multi-gpu
    """
    # The distributed rank is read from the environment right after the
    # environment variables have been set up.
    set_env_vars(local_rank, node_id, cfg)
    dist_rank = int(os.environ["RANK"])

    setup_logging(__name__, output_dir=checkpoint_folder, rank=dist_rank)

    # Multiprocessing start method comes from the config.
    # See https://fb.quip.com/CphdAGUaM5Wf
    logging.info(
        f"Setting multiprocessing method: {cfg.MULTI_PROCESSING_METHOD}")
    setup_multiprocessing_method(cfg.MULTI_PROCESSING_METHOD)

    logging.info("Setting seed....")
    set_seeds(cfg, dist_rank)

    # Pin the CUDA device so that every downstream
    # `torch.cuda.current_device()` call returns the correct device.
    gpu_requested = cfg.MACHINE.DEVICE == "gpu"
    if gpu_requested and torch.cuda.is_available():
        local_rank = get_machine_local_and_dist_rank()[0]
        torch.cuda.set_device(local_rank)

    # Environment, config and system settings are printed once per machine.
    logging.info(f"Env set for rank: {local_rank}, dist_rank: {dist_rank}")
    if local_rank == 0:
        print_system_env_info(os.environ.copy())
        print_cfg(cfg)
        logging.info(f"System config:\n{collect_env_info()}")

    # Identify the hooks to run for the extract label engine
    # TODO - we need to plug this better with the engine registry
    # - we either need to use the global hooks registry
    # - or we need to create specific hook registry by engine
    trainer = SelfSupervisionTrainer(
        cfg, dist_run_id, hooks=extract_label_hook_generator(cfg)
    )

    # Predictions only: feature extraction is explicitly switched off.
    extract_options = {
        "output_folder": cfg.EXTRACT_FEATURES.OUTPUT_DIR or checkpoint_folder,
        "extract_features": False,
        "extract_predictions": True,
    }
    trainer.extract(**extract_options)

    logging.info("All Done!")
    # close the logging streams including the filehandlers
    shutdown_logging()
def extract_main(cfg: AttrDict,
                 dist_run_id: str,
                 local_rank: int = 0,
                 node_id: int = 0):
    """
    Sets up and executes feature extraction workflow per machine.

    For every split and layer returned by the trainer, the features, targets
    and indices are saved in the checkpoint folder derived from the config.
    File names are prefixed with the distributed (global) rank of the process
    so that processes running on different machines never write to the same
    file when the output folder is shared.

    Args:
        cfg (AttrDict): user specified input config that has optimizer, loss,
                        meters etc settings relevant to the training
        dist_run_id (str): For multi-gpu training with PyTorch, we have to
                        specify how the gpus are going to rendezvous. This
                        requires specifying the communication method: file,
                        tcp and the unique rendezvous run_id that is specific
                        to 1 run. We recommend:
                            1) for 1node: use init_method=tcp and run_id=auto
                            2) for multi-node, use init_method=tcp and specify
                               run_id={master_node}:{port}
        local_rank (int): id of the current device on the machine. If using
                        gpus, local_rank = gpu number on the current machine
        node_id (int): id of the current machine. starts from 0.
                        valid for multi-gpu
    """
    # setup logging
    setup_logging(__name__)

    # setup the environment variables
    set_env_vars(local_rank, node_id, cfg)

    # setup the multiprocessing start method requested by the config.
    # See https://fb.quip.com/CphdAGUaM5Wf
    setup_multiprocessing_method(cfg.MULTI_PROCESSING_METHOD)

    # set seeds
    logging.info("Setting seed....")
    set_seeds(cfg)

    # print the training settings and system settings (once per machine)
    local_rank, dist_rank = get_machine_local_and_dist_rank()
    if local_rank == 0:
        print_cfg(cfg)
        logging.info("System config:\n{}".format(collect_env_info()))

    output_dir = get_checkpoint_folder(cfg)
    trainer = SelfSupervisionTrainer(cfg, dist_run_id)
    features = trainer.extract()

    for split, split_features in features.items():
        logging.info(f"============== Split: {split} =======================")
        for layer, layer_features in split_features.items():
            # The local rank repeats on every node, so the file names are
            # keyed on the distributed rank, which is unique across the job.
            out_feat_file = (
                f"{output_dir}/rank{dist_rank}_{split}_{layer}_features.npy")
            out_target_file = (
                f"{output_dir}/rank{dist_rank}_{split}_{layer}_targets.npy")
            out_inds_file = (
                f"{output_dir}/rank{dist_rank}_{split}_{layer}_inds.npy")

            logging.info("Saving extracted features: {} {} to: {}".format(
                layer, layer_features["features"].shape, out_feat_file))
            save_file(layer_features["features"], out_feat_file)

            logging.info("Saving extracted targets: {} to: {}".format(
                layer_features["targets"].shape, out_target_file))
            save_file(layer_features["targets"], out_target_file)

            logging.info("Saving extracted indices: {} to: {}".format(
                layer_features["inds"].shape, out_inds_file))
            save_file(layer_features["inds"], out_inds_file)

    logging.info("All Done!")
    # close the logging streams including the filehandlers
    shutdown_logging()
def __init__(self):
    """Initialize the parent class and remember the distributed rank."""
    super().__init__()
    # The helper returns a (local_rank, dist_rank) pair; only the
    # distributed rank is kept on the instance.
    _, self.dist_rank = get_machine_local_and_dist_rank()
def extract_features_main(
    cfg: AttrDict,
    dist_run_id: str,
    checkpoint_folder: str,
    local_rank: int = 0,
    node_id: int = 0,
):
    """
    Sets up and executes feature extraction workflow per machine.

    The trainer is asked to extract features only (no predictions). When
    cfg.EXTRACT_FEATURES.MAP_FEATURES_TO_IMG_NAME is set, the process with
    distributed rank 0 additionally maps the extracted features of every
    split and layer to the image file paths of the corresponding dataset.

    Args:
        cfg (AttrDict): user specified input config that has optimizer, loss,
                        meters etc settings relevant to the training
        dist_run_id (str): For multi-gpu training with PyTorch, we have to
                        specify how the gpus are going to rendezvous. This
                        requires specifying the communication method: file,
                        tcp and the unique rendezvous run_id that is specific
                        to 1 run. We recommend:
                            1) for 1node: use init_method=tcp and run_id=auto
                            2) for multi-node, use init_method=tcp and specify
                               run_id={master_node}:{port}
        checkpoint_folder (str): directory passed to the logging setup. This
                        folder is also used to output the extracted features
                        in case cfg.EXTRACT_FEATURES.OUTPUT_DIR is not set
        local_rank (int): id of the current device on the machine. If using
                        gpus, local_rank = gpu number on the current machine
        node_id (int): id of the current machine. starts from 0.
                        valid for multi-gpu
    """
    # setup the environment variables
    set_env_vars(local_rank, node_id, cfg)
    dist_rank = int(os.environ["RANK"])

    # setup logging
    setup_logging(__name__, output_dir=checkpoint_folder, rank=dist_rank)
    logging.info(f"Env set for rank: {local_rank}, dist_rank: {dist_rank}")

    # print the environment info for the current node
    if local_rank == 0:
        current_env = os.environ.copy()
        print_system_env_info(current_env)

    # setup the multiprocessing start method requested by the config.
    # See https://fb.quip.com/CphdAGUaM5Wf
    setup_multiprocessing_method(cfg.MULTI_PROCESSING_METHOD)

    # set seeds
    logging.info("Setting seed....")
    set_seeds(cfg, dist_rank)

    # We set the CUDA device here as well as a safe solution for all downstream
    # `torch.cuda.current_device()` calls to return correct device.
    if cfg.MACHINE.DEVICE == "gpu" and torch.cuda.is_available():
        local_rank, _ = get_machine_local_and_dist_rank()
        torch.cuda.set_device(local_rank)

    # print the training settings and system settings
    if local_rank == 0:
        print_cfg(cfg)
        logging.info("System config:\n{}".format(collect_env_info()))

    # Identify the hooks to run for the feature extraction engine
    # TODO - we need to plug this better with the engine registry
    # - we either need to use the global hooks registry
    # - or we need to create specific hook registry by engine
    hooks = extract_features_hook_generator(cfg)

    # Run the feature extraction. The output folder is resolved once and
    # shared by the extraction and the optional feature-to-image mapping below
    # so that both always operate on the same directory.
    trainer = SelfSupervisionTrainer(cfg, dist_run_id, hooks=hooks)
    output_dir = cfg.EXTRACT_FEATURES.OUTPUT_DIR or checkpoint_folder
    trainer.extract(
        output_folder=output_dir,
        extract_features=True,
        extract_predictions=False,
    )

    # TODO (prigoyal): merge this function with _extract_features
    if dist_rank == 0 and cfg.EXTRACT_FEATURES.MAP_FEATURES_TO_IMG_NAME:
        # Get the names of the layers that we extracted features for. If no
        # trunk feature names are configured, fall back to the "heads" output.
        layers = get_trunk_output_feature_names(cfg.MODEL)
        if len(layers) == 0:
            layers = ["heads"]

        available_splits = [
            item.lower() for item in trainer.task.available_splits
        ]
        for split in available_splits:
            # Only the first entry returned by get_image_paths() is used.
            image_paths = trainer.task.datasets[split].get_image_paths()[0]
            for layer in layers:
                ExtractedFeaturesLoader.map_features_to_img_filepath(
                    image_paths=image_paths,
                    input_dir=output_dir,
                    split=split,
                    layer=layer,
                )

    logging.info("All Done!")
    # close the logging streams including the filehandlers
    shutdown_logging()