def main(_A: argparse.Namespace):

    # Get the current device as set for current distributed process.
    # Check `launch` function in `virtex.utils.distributed` module.
    device = torch.cuda.current_device()

    # Local process group is needed for detectron2.
    pg = list(range(dist.get_world_size()))
    d2.utils.comm._LOCAL_PROCESS_GROUP = torch.distributed.new_group(pg)

    # Create a config object (this will be immutable) and perform common setup
    # such as logging and setting up serialization directory.
    if _A.weight_init == "imagenet":
        _A.config_override.extend(["MODEL.VISUAL.PRETRAINED", True])
    _C = Config(_A.config, _A.config_override)

    # We use `default_setup` from detectron2 to do some common setup, such as
    # logging, setting up serialization etc. For more info, look into source.
    _D2C = build_detectron2_config(_C, _A)
    default_setup(_D2C, _A)

    # Prepare weights to pass in instantiation call of trainer.
    if _A.weight_init in {"virtex", "torchvision"}:
        if _A.resume:
            # If resuming training, let detectron2 load weights by providing path.
            model = None
            weights = _A.checkpoint_path
        else:
            # Load backbone weights from VirTex pretrained checkpoint.
            model = PretrainingModelFactory.from_config(_C)
            if _A.weight_init == "virtex":
                CheckpointManager(model=model).load(_A.checkpoint_path)
            else:
                model.visual.cnn.load_state_dict(
                    torch.load(_A.checkpoint_path, map_location="cpu")["state_dict"],
                    strict=False,
                )
            weights = model.visual.detectron2_backbone_state_dict()
    else:
        # If random or imagenet init, just load weights after initializing model.
        model = PretrainingModelFactory.from_config(_C)
        weights = model.visual.detectron2_backbone_state_dict()

    # Back up pretrain config and model checkpoint (if provided).
    _C.dump(os.path.join(_A.serialization_dir, "pretrain_config.yaml"))
    if _A.weight_init == "virtex" and not _A.resume:
        torch.save(
            model.state_dict(),
            os.path.join(_A.serialization_dir, "pretrain_model.pth"),
        )

    del model
    trainer = DownstreamTrainer(_D2C, weights)
    trainer.test() if _A.eval_only else trainer.train()
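# ---------------------------------------------------------------------------
# NOTE: `build_detectron2_config` is defined elsewhere in the repo. The sketch
# below only illustrates the kind of bridging such a function performs, i.e.
# overlaying VirTex settings onto a default detectron2 `CfgNode`. The argument
# names `_A.d2_config` and `_A.d2_config_override` are hypothetical; the real
# function may read entirely different fields.
# ---------------------------------------------------------------------------
from detectron2.config import get_cfg


def build_detectron2_config_sketch(_C, _A):
    # Start from detectron2 defaults, then overlay task-specific settings.
    _D2C = get_cfg()
    _D2C.merge_from_file(_A.d2_config)            # hypothetical arg
    _D2C.merge_from_list(_A.d2_config_override)   # hypothetical arg

    # Serialization directory and random seed come from the VirTex side.
    _D2C.OUTPUT_DIR = _A.serialization_dir
    _D2C.SEED = _C.RANDOM_SEED
    return _D2C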
def common_setup(_C: Config, _A: argparse.Namespace, job_type: str = "pretrain"):
    r"""
    Setup common stuff at the start of every pretraining or downstream
    evaluation job, all listed here to avoid code duplication. Basic steps:

    1. Fix random seeds and other PyTorch flags.
    2. Set up a serialization directory and loggers.
    3. Log important stuff such as config, process info (useful during
       distributed training).
    4. Save a copy of config to serialization directory.

    .. note::

        It is assumed that multiple processes for distributed training have
        already been launched from outside. Functions from
        :mod:`virtex.utils.distributed` module are used to get process info.

    Args:
        _C: Config object with all the parameters.
        _A: Argparse command line arguments.
        job_type: Type of job for which setup is to be done; one of
            ``{"pretrain", "downstream"}``.
    """

    # Get process rank and world size (assuming distributed is initialized).
    RANK = dist.get_rank()
    WORLD_SIZE = dist.get_world_size()

    # For reproducibility - refer
    # https://pytorch.org/docs/stable/notes/randomness.html
    torch.manual_seed(_C.RANDOM_SEED)
    torch.backends.cudnn.deterministic = _C.CUDNN_DETERMINISTIC
    torch.backends.cudnn.benchmark = _C.CUDNN_BENCHMARK
    random.seed(_C.RANDOM_SEED)
    np.random.seed(_C.RANDOM_SEED)

    # Create serialization directory and save config in it.
    os.makedirs(_A.serialization_dir, exist_ok=True)
    _C.dump(os.path.join(_A.serialization_dir, f"{job_type}_config.yaml"))

    # Remove default logger, create a logger for each process which writes to a
    # separate log-file. This makes changes in global scope.
    logger.remove(0)
    if dist.get_world_size() > 1:
        logger.add(
            os.path.join(_A.serialization_dir, f"log-rank{RANK}.txt"),
            format="{time} {level} {message}",
        )

    # Add a logger for stdout only for the master process.
    if dist.is_master_process():
        logger.add(
            sys.stdout, format="<g>{time}</g>: <lvl>{message}</lvl>", colorize=True
        )

    # Print process info, config and args.
    logger.info(f"Rank of current process: {RANK}. World size: {WORLD_SIZE}")
    logger.info(str(_C))

    logger.info("Command line args:")
    for arg in vars(_A):
        logger.info("{:<20}: {}".format(arg, getattr(_A, arg)))
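# ---------------------------------------------------------------------------
# NOTE: `common_setup` leans on a few helpers from `virtex.utils.distributed`
# (imported as `dist`). Minimal sketches of what those helpers plausibly do,
# assuming `torch.distributed` has already been initialized by the launcher;
# the real module may differ:
# ---------------------------------------------------------------------------
import torch.distributed as torch_dist


def get_rank_sketch() -> int:
    # Rank of the current process, or 0 in single-process (non-distributed) runs.
    return torch_dist.get_rank() if torch_dist.is_initialized() else 0


def get_world_size_sketch() -> int:
    # Total number of launched processes, or 1 if distributed is unused.
    return torch_dist.get_world_size() if torch_dist.is_initialized() else 1


def is_master_process_sketch() -> bool:
    # Rank 0 is conventionally treated as the master process.
    return get_rank_sketch() == 0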
def main(_A: argparse.Namespace):

    if _A.num_gpus_per_machine == 0:
        # Set device as CPU if num_gpus_per_machine = 0.
        device = torch.device("cpu")
    else:
        # Get the current device (this will be zero here by default).
        device = torch.cuda.current_device()

    # Create a downstream config object (this will be immutable) and perform
    # common setup such as logging and setting up serialization directory.
    _DOWNC = Config(_A.down_config, _A.down_config_override)
    common_setup(_DOWNC, _A, job_type="downstream")

    # Create a (pretraining) config object and backup in serialization directory.
    _C = Config(_A.config, _A.config_override)
    _C.dump(os.path.join(_A.serialization_dir, "pretrain_config.yaml"))

    # -------------------------------------------------------------------------
    #   INSTANTIATE DATALOADER, MODEL, AND FEATURE EXTRACTOR
    # -------------------------------------------------------------------------
    train_dataset = DownstreamDatasetFactory.from_config(_DOWNC, split="trainval")
    train_dataloader = DataLoader(
        train_dataset,
        batch_size=_DOWNC.OPTIM.BATCH_SIZE,
        num_workers=_A.cpu_workers,
        pin_memory=True,
    )
    test_dataset = DownstreamDatasetFactory.from_config(_DOWNC, split="test")
    test_dataloader = DataLoader(
        test_dataset,
        batch_size=_DOWNC.OPTIM.BATCH_SIZE,
        num_workers=_A.cpu_workers,
        pin_memory=True,
    )
    NUM_CLASSES = len(train_dataset.class_names)

    # Initialize from a checkpoint, but only keep the visual module.
    model = PretrainingModelFactory.from_config(_C)

    # Load weights according to the init method, do nothing for `random`, and
    # `imagenet` is already taken care of.
    if _A.weight_init == "virtex":
        ITERATION = CheckpointManager(model=model).load(_A.checkpoint_path)
    elif _A.weight_init == "torchvision":
        # Keep strict=False because this state dict may have weights for
        # the last fc layer.
        model.visual.cnn.load_state_dict(
            torch.load(_A.checkpoint_path, map_location="cpu")["state_dict"],
            strict=False,
        )

    # Wrap in a feature extractor (an illustrative sketch of such a wrapper
    # follows this function).
    model = FeatureExtractor(model, layer_name=_A.layer, flatten_and_normalize=True)
    model = model.to(device).eval()

    # -------------------------------------------------------------------------
    #   EXTRACT FEATURES FOR TRAINING SVMs
    # -------------------------------------------------------------------------
    features_train: List[torch.Tensor] = []
    targets_train: List[torch.Tensor] = []

    features_test: List[torch.Tensor] = []
    targets_test: List[torch.Tensor] = []

    # VOC07 is small, extract all features and keep them in memory.
    with torch.no_grad():
        for batch in tqdm(train_dataloader, desc="Extracting train features:"):
            features = model(batch["image"].to(device))
            features_train.append(features.cpu())
            targets_train.append(batch["label"])

        # Similarly extract test features.
        for batch in tqdm(test_dataloader, desc="Extracting test features:"):
            features = model(batch["image"].to(device))
            features_test.append(features.cpu())
            targets_test.append(batch["label"])

    # Convert batches of features/targets to one large numpy array.
    features_train = torch.cat(features_train, dim=0).numpy()
    targets_train = torch.cat(targets_train, dim=0).numpy().astype(np.int32)

    features_test = torch.cat(features_test, dim=0).numpy()
    targets_test = torch.cat(targets_test, dim=0).numpy().astype(np.int32)

    # -------------------------------------------------------------------------
    #   TRAIN AND TEST SVMs WITH EXTRACTED FEATURES
    # -------------------------------------------------------------------------
    input_args: List[Any] = []

    # Iterate over all VOC07 classes and train one-vs-all linear SVMs.
    for cls_idx in range(NUM_CLASSES):
        # fmt: off
        input_args.append((
            features_train, targets_train[:, cls_idx],
            features_test, targets_test[:, cls_idx],
            train_dataset.class_names[cls_idx],
        ))
        # fmt: on

    pool = mp.Pool(processes=_A.cpu_workers)
    pool_output = pool.map(train_test_single_svm, input_args)

    # -------------------------------------------------------------------------
    #   TENSORBOARD LOGGING (RELEVANT MAINLY FOR weight_init=checkpoint)
    # -------------------------------------------------------------------------

    # Tensorboard writer for logging mAP scores. This is useful especially
    # when weight_init=checkpoint (which may be coming from a training job).
    tensorboard_writer = SummaryWriter(log_dir=_A.serialization_dir)

    # Test set mAP, averaged across all VOC07 classes.
    test_map = torch.tensor(pool_output).mean()
    logger.info(f"mAP: {test_map}")

    # Tensorboard logging only when _A.weight_init == "virtex".
    if _A.weight_init == "virtex":
        tensorboard_writer.add_scalars(
            "metrics/voc07_clf", {"mAP": test_map}, ITERATION
        )
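# ---------------------------------------------------------------------------
# NOTE: `FeatureExtractor` (used above) is defined elsewhere in the repo. An
# illustrative sketch of the usual construction - a forward hook on a named
# submodule, followed by flatten and L2-normalization - is given below. The
# real class may differ (e.g. it may pool features to a fixed size first).
# ---------------------------------------------------------------------------
import torch
import torch.nn as nn
import torch.nn.functional as F


class FeatureExtractorSketch(nn.Module):
    def __init__(
        self, model: nn.Module, layer_name: str, flatten_and_normalize: bool = True
    ):
        super().__init__()
        self.cnn = model.visual.cnn
        self.flatten_and_normalize = flatten_and_normalize
        self._captured = {}

        # Capture the output of the requested layer with a forward hook.
        layer = dict(self.cnn.named_modules())[layer_name]
        layer.register_forward_hook(
            lambda module, inputs, output: self._captured.update(out=output)
        )

    def forward(self, images: torch.Tensor) -> torch.Tensor:
        self.cnn(images)
        features = self._captured["out"]
        if self.flatten_and_normalize:
            features = torch.flatten(features, start_dim=1)
            features = F.normalize(features, dim=1)  # L2-normalize for SVMs
        return features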
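# ---------------------------------------------------------------------------
# NOTE: `train_test_single_svm` is imported from elsewhere in the repo. A
# minimal scikit-learn sketch of a compatible one-vs-all helper follows; the
# cost value, class weights and handling of VOC "difficult" labels here are
# assumptions, not the repo's exact recipe.
# ---------------------------------------------------------------------------
from sklearn.svm import LinearSVC
from sklearn.metrics import average_precision_score


def train_test_single_svm_sketch(args):
    # Unpack exactly the tuple built in `input_args` above.
    feats_train, y_train, feats_test, y_test, class_name = args

    # VOC07 convention: 1 = positive, -1 = negative, 0 = difficult (ignored).
    train_keep = y_train != 0
    test_keep = y_test != 0

    clf = LinearSVC(C=1.0, class_weight={1: 2, -1: 1}, max_iter=2000)
    clf.fit(feats_train[train_keep], y_train[train_keep])

    # Average precision from signed decision scores on the kept test examples.
    scores = clf.decision_function(feats_test[test_keep])
    return average_precision_score(y_test[test_keep] == 1, scores)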
def main(_A: argparse.Namespace):

    if _A.num_gpus_per_machine == 0:
        # Set device as CPU if num_gpus_per_machine = 0.
        device = torch.device("cpu")
    else:
        # Get the current device as set for current distributed process.
        # Check `launch` function in `virtex.utils.distributed` module.
        device = torch.cuda.current_device()

    # Create a downstream config object (this will be immutable) and perform
    # common setup such as logging and setting up serialization directory.
    _DOWNC = Config(_A.down_config, _A.down_config_override)
    common_setup(_DOWNC, _A, job_type="downstream")

    # Create a (pretraining) config object and backup in serialization directory.
    _C = Config(_A.config, _A.config_override)
    _C.dump(os.path.join(_A.serialization_dir, "pretrain_config.yaml"))

    # Get dataset name for tensorboard logging.
    DATASET = _DOWNC.DATA.ROOT.split("/")[-1]

    # Set number of output classes according to dataset.
    NUM_CLASSES_MAPPING = {"imagenet": 1000, "inaturalist": 8142}
    NUM_CLASSES = NUM_CLASSES_MAPPING[DATASET]

    # -------------------------------------------------------------------------
    #   INSTANTIATE DATALOADER, MODEL, OPTIMIZER, SCHEDULER
    # -------------------------------------------------------------------------
    train_dataset = DownstreamDatasetFactory.from_config(_DOWNC, split="train")
    train_dataloader = DataLoader(
        train_dataset,
        batch_size=_DOWNC.OPTIM.BATCH_SIZE // dist.get_world_size(),
        num_workers=_A.cpu_workers,
        sampler=DistributedSampler(
            train_dataset,
            num_replicas=dist.get_world_size(),
            rank=dist.get_rank(),
            shuffle=True,
        ),
        drop_last=False,
        pin_memory=True,
        collate_fn=train_dataset.collate_fn,
    )
    val_dataset = DownstreamDatasetFactory.from_config(_DOWNC, split="val")
    val_dataloader = DataLoader(
        val_dataset,
        batch_size=_DOWNC.OPTIM.BATCH_SIZE // dist.get_world_size(),
        num_workers=_A.cpu_workers,
        sampler=DistributedSampler(
            val_dataset,
            num_replicas=dist.get_world_size(),
            rank=dist.get_rank(),
            shuffle=False,
        ),
        pin_memory=True,
        drop_last=False,
        collate_fn=val_dataset.collate_fn,
    )

    # Initialize model using pretraining config.
    pretrained_model = PretrainingModelFactory.from_config(_C)

    # Load weights according to the init method, do nothing for `random`, and
    # `imagenet` is already taken care of.
    if _A.weight_init == "virtex":
        CheckpointManager(model=pretrained_model).load(_A.checkpoint_path)
    elif _A.weight_init == "torchvision":
        # Keep strict=False because this state dict may have weights for
        # the last fc layer.
        pretrained_model.visual.cnn.load_state_dict(
            torch.load(_A.checkpoint_path, map_location="cpu")["state_dict"],
            strict=False,
        )

    # Pull out the CNN (torchvision-like) from our pretrained model and add
    # back the FC layer - this exists in torchvision models, and is set to
    # `nn.Identity()` during pretraining.
    model = pretrained_model.visual.cnn  # type: ignore
    model.fc = nn.Linear(_DOWNC.MODEL.VISUAL.FEATURE_SIZE, NUM_CLASSES).to(device)
    model = model.to(device)

    # Re-initialize the FC layer.
    torch.nn.init.normal_(model.fc.weight.data, mean=0.0, std=0.01)
    torch.nn.init.constant_(model.fc.bias.data, 0.0)

    # Freeze all layers except FC as per config param.
    if _DOWNC.MODEL.VISUAL.FROZEN:
        for name, param in model.named_parameters():
            if "fc" not in name:
                param.requires_grad = False

    # Cross entropy loss and accuracy meter.
    criterion = nn.CrossEntropyLoss()
    top1 = TopkAccuracy(top_k=1)

    optimizer = OptimizerFactory.from_config(_DOWNC, model.named_parameters())
    scheduler = LRSchedulerFactory.from_config(_DOWNC, optimizer)
    del pretrained_model

    # -------------------------------------------------------------------------
    #   BEFORE TRAINING STARTS
    # -------------------------------------------------------------------------

    # Create an iterator from dataloader to sample batches perpetually
    # (a sketch of `cycle` follows this function).
    train_dataloader_iter = cycle(train_dataloader, device)

    # Wrap model and optimizer using NVIDIA Apex for mixed precision training.
    # NOTE: Always do this before wrapping model with DistributedDataParallel.
    if _DOWNC.FP16_OPT > 0:
        from apex import amp

        model, optimizer = amp.initialize(
            model, optimizer, opt_level=f"O{_DOWNC.FP16_OPT}"
        )

    if dist.get_world_size() > 1:
        dist.synchronize()
        model = nn.parallel.DistributedDataParallel(
            model, device_ids=[device], find_unused_parameters=True
        )

    if dist.is_master_process():
        checkpoint_manager = CheckpointManager(
            _A.serialization_dir,
            model=model,
            optimizer=optimizer,
            scheduler=scheduler,
        )
        tensorboard_writer = SummaryWriter(log_dir=_A.serialization_dir)

    # Keep track of time per iteration and ETA.
    timer = Timer(start_from=1, total_iterations=_DOWNC.OPTIM.NUM_ITERATIONS)

    # -------------------------------------------------------------------------
    #   TRAINING LOOP
    # -------------------------------------------------------------------------
    for iteration in range(1, _DOWNC.OPTIM.NUM_ITERATIONS + 1):
        timer.tic()
        optimizer.zero_grad()
        batch = next(train_dataloader_iter)

        logits = model(batch["image"])
        loss = criterion(logits, batch["label"])

        # Perform dynamic scaling of loss to adjust for mixed precision.
        if _DOWNC.FP16_OPT > 0:
            with amp.scale_loss(loss, optimizer) as scaled_loss:
                scaled_loss.backward()
        else:
            loss.backward()

        optimizer.step()
        scheduler.step(iteration)
        timer.toc()

        if iteration % _A.log_every == 0 and dist.is_master_process():
            logger.info(
                f"{timer.stats} | Loss: {loss:.3f} | GPU: {dist.gpu_mem_usage()} MB"
            )
            tensorboard_writer.add_scalar(f"{DATASET}/train_loss", loss, iteration)
            tensorboard_writer.add_scalar(
                f"{DATASET}/learning_rate",
                optimizer.param_groups[0]["lr"],
                iteration,
            )

        # ---------------------------------------------------------------------
        #   VALIDATION
        # ---------------------------------------------------------------------
        if iteration % _A.checkpoint_every == 0:
            torch.set_grad_enabled(False)
            model.eval()

            total_val_loss = torch.tensor(0.0).to(device)

            for val_iteration, batch in enumerate(val_dataloader, start=1):
                for key in batch:
                    batch[key] = batch[key].to(device)

                logits = model(batch["image"])
                loss = criterion(logits, batch["label"])
                top1(logits, batch["label"])
                total_val_loss += loss

            # Divide each loss component by number of val batches per GPU.
            total_val_loss = total_val_loss / val_iteration
            dist.average_across_processes(total_val_loss)

            # Get accumulated Top-1 accuracy for logging across GPUs.
            acc = top1.get_metric(reset=True)
            dist.average_across_processes(acc)

            torch.set_grad_enabled(True)
            model.train()

            # Save recent checkpoint and best checkpoint based on accuracy.
            if dist.is_master_process():
                checkpoint_manager.step(iteration)

        if iteration % _A.checkpoint_every == 0 and dist.is_master_process():
            logger.info(f"Iter: {iteration} | Top-1 accuracy: {acc}")
            tensorboard_writer.add_scalar(
                f"{DATASET}/val_loss", total_val_loss, iteration
            )
            # This name scoping will result in Tensorboard displaying all metrics
            # (VOC07, caption, etc.) together.
            tensorboard_writer.add_scalars(
                f"metrics/{DATASET}", {"top1": acc}, iteration
            )

        # All processes will wait till master process is done logging.
        dist.synchronize()
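# ---------------------------------------------------------------------------
# NOTE: `cycle` and `dist.average_across_processes` (used above) come from the
# repo's utility modules. Plausible sketches follow, assuming the batch is a
# dict of tensors and that metrics are reduced by an all-reduce mean; the real
# helpers may handle more (e.g. advancing the DistributedSampler epoch).
# ---------------------------------------------------------------------------
import torch
import torch.distributed as torch_dist


def cycle_sketch(dataloader, device):
    # Yield batches forever, restarting the dataloader whenever it is
    # exhausted and moving every tensor in the batch to the target device.
    while True:
        for batch in dataloader:
            yield {key: value.to(device) for key, value in batch.items()}


def average_across_processes_sketch(tensor: torch.Tensor):
    # In-place mean across processes: all-reduce sum, then divide by world
    # size. A no-op for single-process runs.
    if torch_dist.is_initialized() and torch_dist.get_world_size() > 1:
        torch_dist.all_reduce(tensor, op=torch_dist.ReduceOp.SUM)
        tensor /= torch_dist.get_world_size()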
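# ---------------------------------------------------------------------------
# NOTE: `TopkAccuracy` is the repo's metric class; a bare-bones sketch of an
# accumulating top-k accuracy meter with the same call pattern is shown below
# (the real implementation may differ in scaling and reduction details).
# ---------------------------------------------------------------------------
import torch


class TopkAccuracySketch:
    def __init__(self, top_k: int = 1):
        self.top_k = top_k
        self.correct = 0.0
        self.total = 0.0

    def __call__(self, logits: torch.Tensor, labels: torch.Tensor):
        # A prediction counts as correct if the label is among the top-k logits.
        topk_indices = logits.topk(self.top_k, dim=1).indices
        self.correct += (topk_indices == labels.unsqueeze(1)).any(dim=1).sum().item()
        self.total += labels.numel()

    def get_metric(self, reset: bool = False) -> torch.Tensor:
        metric = torch.tensor(100.0 * self.correct / max(self.total, 1.0))
        if reset:
            self.correct, self.total = 0.0, 0.0
        return metric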
def main(_A: argparse.Namespace):

    if _A.num_gpus_per_machine == 0:
        # Set device as CPU if num_gpus_per_machine = 0.
        device = torch.device("cpu")
    else:
        # Get the current device (this will be zero here by default).
        device = torch.cuda.current_device()

    # Create a downstream config object (this will be immutable) and perform
    # common setup such as logging and setting up serialization directory.
    _DOWNC = Config(_A.down_config, _A.down_config_override)
    common_setup(_DOWNC, _A, job_type="downstream")

    # Create a (pretraining) config object and backup in serialization directory.
    _C = Config(_A.config, _A.config_override)
    _C.dump(os.path.join(_A.serialization_dir, "pretrain_config.yaml"))

    # -------------------------------------------------------------------------
    #   INSTANTIATE DATALOADER, MODEL, AND FEATURE EXTRACTOR
    # -------------------------------------------------------------------------
    train_dataset = DownstreamDatasetFactory.from_config(
        _DOWNC, split="trainval", csv=_A.csv
    )
    train_dataloader = DataLoader(
        train_dataset,
        batch_size=_DOWNC.OPTIM.BATCH_SIZE,
        num_workers=_A.cpu_workers,
        pin_memory=True,
        shuffle=False,
    )
    print(f"Train dataset length: {len(train_dataset)}")

    # Initialize from a checkpoint, but only keep the visual module.
    model = PretrainingModelFactory.from_config(_C)

    # Load weights according to the init method, do nothing for `random`, and
    # `imagenet` is already taken care of.
    if _A.weight_init == "virtex":
        ITERATION = CheckpointManager(model=model).load(_A.checkpoint_path)
    elif _A.weight_init == "torchvision":
        # Keep strict=False because this state dict may have weights for
        # the last fc layer.
        model.visual.cnn.load_state_dict(
            torch.load(_A.checkpoint_path, map_location="cpu")["state_dict"],
            strict=False,
        )

    model = FeatureExtractor(model, layer_name=_A.layer, flatten_and_normalize=True)
    model = model.to(device).eval()

    # -------------------------------------------------------------------------
    #   EXTRACT FEATURES
    # -------------------------------------------------------------------------
    features_train: List[torch.Tensor] = []
    print(f"Input csv is {_A.csv}")

    # Extract features for all batches, periodically flushing them to disk in
    # numbered chunks to bound memory usage.
    count = 0
    with torch.no_grad():
        for batch in tqdm(train_dataloader, desc="Extracting train features:"):
            features = model(batch["image"].to(device))
            count += 1
            print(
                f"Train features have shape {features.shape}, "
                f"video_id {batch['image_id']}"
            )
            # Flush accumulated features every 1000 batches; the current batch
            # is appended afterwards, so it lands in the next chunk.
            if count % 1000 == 0:
                torch.save(
                    features_train,
                    f"./features_phoenix/features_{_A.mode}_{count}.pt",
                )
                features_train = []
            features_train.append(features.cpu())

    # Save the remaining features of the final (partial) chunk.
    torch.save(features_train, f"./features_phoenix/features_end_{_A.mode}.pt")
    print("Finished saving features.")
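# ---------------------------------------------------------------------------
# NOTE: the chunked `torch.save` calls above leave features split across
# numbered files plus a final `_end` file. A minimal sketch for stitching them
# back into one tensor (paths mirror the naming used above; assumes each file
# holds a list of per-batch tensors):
# ---------------------------------------------------------------------------
import glob
import torch


def load_saved_features_sketch(mode: str) -> torch.Tensor:
    # Periodic chunks in the order they were written, then the final chunk.
    chunk_paths = sorted(
        glob.glob(f"./features_phoenix/features_{mode}_*.pt"),
        key=lambda path: int(path.rsplit("_", 1)[-1][:-3]),
    )
    chunk_paths.append(f"./features_phoenix/features_end_{mode}.pt")

    batches = []
    for path in chunk_paths:
        batches.extend(torch.load(path, map_location="cpu"))
    return torch.cat(batches, dim=0)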