def train_des(cfg):
    """
    Run the training loop with an AdamW optimizer and a linear learning-rate
    schedule, saving checkpoints and evaluating on the validation loader on
    the epochs selected by the config.
    Args:
        cfg (CfgNode): configs. Details can be found in
            slowfast/config/defaults.py
    """
    # Seed NumPy and PyTorch from the same configured value.
    seed = cfg.RNG_SEED
    np.random.seed(seed)
    torch.manual_seed(seed)

    # Configure logging, then dump the active config.
    logging.setup_logging(cfg.OUTPUT_DIR)
    logger.info("Train with config:")
    logger.info(pprint.pformat(cfg))

    # Model and optimizer; the checkpoint loader returns the epoch to start at.
    model = build_clevrer_model(cfg)
    optimizer = AdamW(model.parameters(), lr=cfg.SOLVER.BASE_LR, eps=1e-8)
    start_epoch = cu.load_train_checkpoint(cfg, model, optimizer)

    # Data loaders for both splits.
    train_loader = build_dataloader(cfg, "train")
    val_loader = build_dataloader(cfg, "val")

    # Schedule length is batches-per-epoch times the configured epoch count,
    # with no warmup steps.
    batches_per_epoch = len(train_loader)
    total_steps = batches_per_epoch * cfg.SOLVER.MAX_EPOCH
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=0,
        num_training_steps=total_steps,
    )

    # Meters are sized by the number of batches in each loader.
    train_meter = ClevrerTrainMeter(batches_per_epoch, cfg)
    val_meter = ClevrerValMeter(len(val_loader), cfg)

    logger.info("Start epoch: {}".format(start_epoch + 1))
    for cur_epoch in range(start_epoch, cfg.SOLVER.MAX_EPOCH):
        train_epoch(
            train_loader,
            model,
            optimizer,
            scheduler,
            train_meter,
            cur_epoch,
            cfg,
        )

        # Both decisions are taken before acting on either of them.
        should_save = cu.is_checkpoint_epoch(cfg, cur_epoch, None)
        should_eval = misc.is_eval_epoch(cfg, cur_epoch, None)

        if should_save:
            cu.save_checkpoint(cfg.OUTPUT_DIR, model, optimizer, cur_epoch, cfg)
        if should_eval:
            eval_epoch(val_loader, model, val_meter, cur_epoch, cfg)
def saveOnnxModel(self):
    """
    Export the model built from ``self.cfg`` to an ONNX file, using the first
    batch of the "train" loader as the example input.

    Side effects:
        - Overwrites ``self.cfg.TRAIN['BATCH_SIZE']`` with
          ``self.cfg.ONNX.BATCH_SIZE`` before the loader is constructed.
        - Writes the ONNX file to the path returned by
          ``self.getOnnxModelPath()`` and logs that path.
    """
    model = build_model(self.cfg)
    optimizer = optim.construct_optimizer(model, self.cfg)
    # Called for its effect on ``model``/``optimizer``; the returned start
    # epoch is not needed for an export.
    cu.load_train_checkpoint(self.cfg, model, optimizer, self.logger)

    # The example batch must have the batch size the ONNX graph is traced with.
    self.cfg.TRAIN['BATCH_SIZE'] = self.cfg.ONNX.BATCH_SIZE
    dl = loader.construct_loader(self.cfg, "train")
    # The loader yields 4-tuples; only the model inputs are used here.
    inputs, _, _, _ = next(iter(dl))

    device = torch.device(self.onnxDevice)

    # ``Tensor.to`` returns a new tensor instead of moving in place, so the
    # result has to be kept; otherwise the inputs stay on their original
    # device while the model is moved below.
    if isinstance(inputs, (list, )):
        inputs = [tensor.to(device) for tensor in inputs]
    elif isinstance(inputs, torch.Tensor):
        inputs = inputs.to(device)

    model.to(device)
    model.eval()

    onnxPath, _ = self.getOnnxModelPath()
    with torch.no_grad():
        torch.onnx.export(
            model,
            inputs,
            onnxPath,
            opset_version=self.cfg.ONNX.OPSET_VER,
            verbose=True,
            input_names=self.cfg.ONNX.INPUT_NAMES,
            output_names=self.cfg.ONNX.OUTPUT_NAMES,
        )
    self.logger.info("Exported {}".format(onnxPath))
def train(cfg):
    """
    Train for the configured number of epochs, evaluate after every epoch
    except epoch 0, and keep the weights with the best top-1 result in
    ``best_ckpt.pth`` under ``cfg.OUTPUT_DIR``.
    Args:
        cfg (CfgNode): configs. Details can be found in
            slowfast/config/defaults.py
    """
    # Model, optimizer and the epoch to resume from.
    model = build_model(cfg)
    optimizer = optim.construct_optimizer(model, cfg)
    start_epoch = cu.load_train_checkpoint(cfg, model, optimizer)

    # Three loaders: training, validation, and a second "train" loader that is
    # only consumed by the precise-BN pass.
    train_loader = dataloader.construct_loader(cfg, "train")
    val_loader = dataloader.construct_loader(cfg, "val")
    precise_bn_loader = dataloader.construct_loader(cfg, "train")

    best_ckpt_path = os.path.join(cfg.OUTPUT_DIR, 'best_ckpt.pth')
    best_accuracy = 0

    for cur_epoch in range(start_epoch, cfg.SOLVER.MAX_EPOCH):
        train_epoch(train_loader, model, optimizer, cur_epoch, cfg)

        # Epoch 0 is never evaluated.
        is_eval_epoch = cur_epoch > 0

        # Precise BN statistics are recomputed only ahead of an evaluation,
        # and only when the model actually has BN modules.
        needs_precise_bn = (
            is_eval_epoch
            and cfg.BN.USE_PRECISE_STATS
            and len(get_bn_modules(model)) > 0
        )
        if needs_precise_bn:
            num_bn_batches = min(
                cfg.BN.NUM_BATCHES_PRECISE, len(precise_bn_loader)
            )
            calculate_and_update_precise_bn(
                precise_bn_loader,
                model,
                num_bn_batches,
                cfg.NUM_GPUS > 0,
            )

        # For SubBatchNorm3d: called before evaluation.
        _ = aggregate_sub_bn_stats(model)

        if is_eval_epoch:
            results = eval_epoch(val_loader, model, cur_epoch, cfg)
            accuracy = results['top1']
            if accuracy > best_accuracy:
                print("*** Saving best ****")
                best_accuracy = accuracy
                snapshot = {
                    'epoch': cur_epoch + 1,
                    'model_state': model.state_dict(),
                    'optimizer_state': optimizer.state_dict()
                }
                torch.save(snapshot, best_ckpt_path)
def train(cfg):
    """
    Train a model for the configured number of epochs, optionally under a
    multigrid schedule, saving checkpoints and evaluating on the validation
    loader on the epochs selected by the config.
    Args:
        cfg (CfgNode): configs. Details can be found in
            slowfast/config/defaults.py
    """
    # Distributed setup comes first, then seeding and logging.
    du.init_distributed_training(cfg)
    seed = cfg.RNG_SEED
    np.random.seed(seed)
    torch.manual_seed(seed)
    logging.setup_logging(cfg.OUTPUT_DIR)

    # A multigrid schedule exists only when a long or short cycle is enabled;
    # it may replace the config object.
    multigrid = None
    if cfg.MULTIGRID.LONG_CYCLE or cfg.MULTIGRID.SHORT_CYCLE:
        multigrid = MultigridSchedule()
        cfg = multigrid.init_multigrid(cfg)
        if cfg.MULTIGRID.LONG_CYCLE:
            cfg, _ = multigrid.update_long_cycle(cfg, cur_epoch=0)

    logger.info("Train with config:")
    logger.info(pprint.pformat(cfg))

    # Build the model; only the master process logs model info.
    model = build_model(cfg)
    if du.is_master_proc() and cfg.LOG_MODEL_INFO:
        misc.log_model_info(model, cfg, use_train_input=True)

    # Optimizer, then resume from a checkpoint if one applies.
    optimizer = optim.construct_optimizer(model, cfg)
    start_epoch = cu.load_train_checkpoint(cfg, model, optimizer)

    # Loaders; the precise-BN loader is built only when that feature is on.
    train_loader = loader.construct_loader(cfg, "train")
    val_loader = loader.construct_loader(cfg, "val")
    if cfg.BN.USE_PRECISE_STATS:
        precise_bn_loader = loader.construct_loader(
            cfg, "train", is_precise_bn=True
        )
    else:
        precise_bn_loader = None

    # Detection runs use the AVA meter for both splits.
    if cfg.DETECTION.ENABLE:
        train_meter = AVAMeter(len(train_loader), cfg, mode="train")
        val_meter = AVAMeter(len(val_loader), cfg, mode="val")
    else:
        train_meter = TrainMeter(len(train_loader), cfg)
        val_meter = ValMeter(len(val_loader), cfg)

    # Tensorboard writer exists only on the master process and when enabled.
    writer = (
        tb.TensorboardWriter(cfg)
        if cfg.TENSORBOARD.ENABLE
        and du.is_master_proc(cfg.NUM_GPUS * cfg.NUM_SHARDS)
        else None
    )

    logger.info("Start epoch: {}".format(start_epoch + 1))
    for cur_epoch in range(start_epoch, cfg.SOLVER.MAX_EPOCH):
        if cfg.MULTIGRID.LONG_CYCLE:
            cfg, changed = multigrid.update_long_cycle(cfg, cur_epoch)
            if changed:
                # The long cycle changed the config: rebuild every training
                # component and reload weights into the fresh model.
                (
                    model,
                    optimizer,
                    train_loader,
                    val_loader,
                    precise_bn_loader,
                    train_meter,
                    val_meter,
                ) = build_trainer(cfg)

                if cu.has_checkpoint(cfg.OUTPUT_DIR):
                    last_checkpoint = cu.get_last_checkpoint(cfg.OUTPUT_DIR)
                    # The newest checkpoint is expected to carry the current
                    # epoch number in its file name.
                    assert "{:05d}.pyth".format(cur_epoch) in last_checkpoint
                else:
                    last_checkpoint = cfg.TRAIN.CHECKPOINT_FILE_PATH
                logger.info("Load from {}".format(last_checkpoint))
                cu.load_checkpoint(
                    last_checkpoint, model, cfg.NUM_GPUS > 1, optimizer
                )

        loader.shuffle_dataset(train_loader, cur_epoch)
        train_epoch(
            train_loader, model, optimizer, train_meter, cur_epoch, cfg, writer
        )

        # Both predicates receive the multigrid schedule when there is one.
        schedule = multigrid.schedule if multigrid is not None else None
        should_save = cu.is_checkpoint_epoch(cfg, cur_epoch, schedule)
        should_eval = misc.is_eval_epoch(cfg, cur_epoch, schedule)

        # Precise BN statistics are refreshed ahead of a save or an eval.
        needs_precise_bn = (
            (should_save or should_eval)
            and cfg.BN.USE_PRECISE_STATS
            and len(get_bn_modules(model)) > 0
        )
        if needs_precise_bn:
            calculate_and_update_precise_bn(
                precise_bn_loader,
                model,
                min(cfg.BN.NUM_BATCHES_PRECISE, len(precise_bn_loader)),
                cfg.NUM_GPUS > 0,
            )
        _ = misc.aggregate_sub_bn_stats(model)

        if should_save:
            cu.save_checkpoint(cfg.OUTPUT_DIR, model, optimizer, cur_epoch, cfg)
        if should_eval:
            eval_epoch(val_loader, model, val_meter, cur_epoch, cfg, writer)

    if writer is not None:
        writer.close()
def test_implementation_des(cfg):
    """
    Run one training epoch and one validation epoch as a smoke test, and
    print, for every parameter, whether and by how much it changed during
    the training epoch.
    Args:
        cfg (CfgNode): configs. Details can be found in
            slowfast/config/defaults.py
    """
    # Seed NumPy and PyTorch from the same configured value.
    seed = cfg.RNG_SEED
    np.random.seed(seed)
    torch.manual_seed(seed)

    logging.setup_logging(cfg.OUTPUT_DIR)
    logger.info("Test implementation")

    # Model, optimizer and the epoch to resume from.
    model = build_clevrer_model(cfg)
    optimizer = optim.construct_optimizer(model, cfg)
    start_epoch = cu.load_train_checkpoint(cfg, model, optimizer)

    # Only the Clevrer_des dataset is handled; anything else ends the process.
    dataset_name = cfg.TRAIN.DATASET
    if dataset_name != 'Clevrer_des':
        print("This train script does not support your dataset: -{}-. Only Clevrer_des".format(dataset_name))
        exit()

    train_loader = build_dataloader(cfg, "train")
    val_loader = build_dataloader(cfg, "val")

    train_meter = ClevrerTrainMeter(len(train_loader), cfg)
    val_meter = ClevrerValMeter(len(val_loader), cfg)

    logger.info("Start epoch: {}".format(start_epoch + 1))

    # Snapshot the weights so they can be compared after the training epoch.
    model_before = copy.deepcopy(model)
    cur_epoch = start_epoch
    train_epoch(
        train_loader,
        model,
        optimizer,
        train_meter,
        cur_epoch,
        cfg,
        test_imp=True,
    )

    print("Check how much parameters changed")
    paired_params = zip(
        model_before.named_parameters(), model.named_parameters()
    )
    for (old_name, old_param), (new_name, new_param) in paired_params:
        if not new_param.requires_grad:
            print("Parameter does not require grad:")
            print(new_name)
            print(new_param)
            continue

        print("Parameter requires grad:")
        print(new_name, old_name)

        # Relative change of the parameter norm across the training epoch.
        old_norm = torch.norm(old_param)
        change = torch.abs(torch.norm(new_param) - old_norm)
        print("Ratio of change = {}".format(torch.true_divide(change, old_norm)))

        # Element-wise comparison: any differing entry counts as "changed".
        if (old_param != new_param).any():
            print("--Check--")
        else:
            print("ALERT - WEIGHTS DID NOT CHANGE WITH TRAINING.")

    print("Val epoch")
    eval_epoch(val_loader, model, val_meter, cur_epoch, cfg, test_imp=True)
def train_des(cfg):
    """
    Run the training loop on the Clevrer_des dataset and evaluate on the
    validation loader on the epochs selected by the config. This variant
    does not save checkpoints and does not reshuffle the dataset per epoch.
    Args:
        cfg (CfgNode): configs. Details can be found in
            slowfast/config/defaults.py
    """
    # Seed NumPy and PyTorch from the same configured value.
    seed = cfg.RNG_SEED
    np.random.seed(seed)
    torch.manual_seed(seed)

    # Configure logging, then dump the active config.
    logging.setup_logging(cfg.OUTPUT_DIR)
    logger.info("Train with config:")
    logger.info(pprint.pformat(cfg))

    # Model, optimizer and the epoch to resume from.
    model = build_clevrer_model(cfg)
    optimizer = optim.construct_optimizer(model, cfg)
    start_epoch = cu.load_train_checkpoint(cfg, model, optimizer)

    # Only the Clevrer_des dataset is handled; anything else ends the process.
    dataset_name = cfg.TRAIN.DATASET
    if dataset_name != 'Clevrer_des':
        print("This train script does not support your dataset: -{}-. Only Clevrer_des".format(dataset_name))
        exit()

    train_loader = build_dataloader(cfg, "train")
    val_loader = build_dataloader(cfg, "val")

    train_meter = ClevrerTrainMeter(len(train_loader), cfg)
    val_meter = ClevrerValMeter(len(val_loader), cfg)

    logger.info("Start epoch: {}".format(start_epoch + 1))
    for cur_epoch in range(start_epoch, cfg.SOLVER.MAX_EPOCH):
        train_epoch(train_loader, model, optimizer, train_meter, cur_epoch, cfg)

        # The checkpoint predicate is still evaluated, but its result is
        # unused because checkpoint saving is disabled in this variant.
        cu.is_checkpoint_epoch(cfg, cur_epoch, None)

        if misc.is_eval_epoch(cfg, cur_epoch, None):
            eval_epoch(val_loader, model, val_meter, cur_epoch, cfg)
def train(cfg):
    """
    Train an audio model for the configured number of epochs, with optional
    Tensorboard and Weights & Biases logging, saving periodic checkpoints and
    an additional checkpoint whenever evaluation reports a best epoch.
    Args:
        cfg (CfgNode): configs. Details can be found in
            slowfast/config/defaults.py
    """
    # Distributed setup comes first, then seeding and logging.
    du.init_distributed_training(cfg)
    seed = cfg.RNG_SEED
    np.random.seed(seed)
    torch.manual_seed(seed)
    logging.setup_logging(cfg.OUTPUT_DIR)

    logger.info("Train with config:")
    logger.info(pprint.pformat(cfg))

    # Build the model; only the master process logs model info.
    model = build_model(cfg)
    if du.is_master_proc() and cfg.LOG_MODEL_INFO:
        misc.log_model_info(model, cfg)

    # With more than one GPU the freeze hook is reached through ``.module``.
    if cfg.BN.FREEZE:
        freeze_target = model.module if cfg.NUM_GPUS > 1 else model
        freeze_target.freeze_fn('bn_parameters')

    # Optimizer, then resume from a checkpoint if one applies.
    optimizer = optim.construct_optimizer(model, cfg)
    start_epoch = cu.load_train_checkpoint(cfg, model, optimizer)

    # EPIC-Kitchens can train on the combined "train+val" split; validation
    # always uses "val", and precise BN reuses the training split.
    use_train_plus_val = (
        cfg.TRAIN.DATASET == 'epickitchens'
        and cfg.EPICKITCHENS.TRAIN_PLUS_VAL
    )
    train_split = "train+val" if use_train_plus_val else "train"
    train_loader = loader.construct_loader(cfg, train_split)
    val_loader = loader.construct_loader(cfg, "val")
    if cfg.BN.USE_PRECISE_STATS:
        precise_bn_loader = loader.construct_loader(cfg, train_split)
    else:
        precise_bn_loader = None

    # EPIC-Kitchens has its own meter classes.
    train_meter_cls, val_meter_cls = (
        (EPICTrainMeter, EPICValMeter)
        if cfg.TRAIN.DATASET == 'epickitchens'
        else (TrainMeter, ValMeter)
    )
    train_meter = train_meter_cls(len(train_loader), cfg)
    val_meter = val_meter_cls(len(val_loader), cfg)

    # Tensorboard writer exists only on the master process and when enabled.
    writer = (
        tb.TensorboardWriter(cfg)
        if cfg.TENSORBOARD.ENABLE
        and du.is_master_proc(cfg.NUM_GPUS * cfg.NUM_SHARDS)
        else None
    )

    # Weights & Biases follows the same master-only rule.
    wandb_log = bool(
        cfg.WANDB.ENABLE and du.is_master_proc(cfg.NUM_GPUS * cfg.NUM_SHARDS)
    )
    if wandb_log:
        init_kwargs = dict(project='slowfast', config=cfg, sync_tensorboard=True)
        # Re-attach to an existing run only when auto-resume is on and a run
        # id has been configured.
        if cfg.TRAIN.AUTO_RESUME and cfg.WANDB.RUN_ID != "":
            init_kwargs['resume'] = cfg.WANDB.RUN_ID
        wandb.init(**init_kwargs)
        wandb.watch(model)

    logger.info("Start epoch: {}".format(start_epoch + 1))
    for cur_epoch in range(start_epoch, cfg.SOLVER.MAX_EPOCH):
        loader.shuffle_dataset(train_loader, cur_epoch)
        train_epoch(
            train_loader,
            model,
            optimizer,
            train_meter,
            cur_epoch,
            cfg,
            writer,
            wandb_log,
        )

        should_save = cu.is_checkpoint_epoch(cfg, cur_epoch)
        should_eval = misc.is_eval_epoch(cfg, cur_epoch)

        # Precise BN statistics are refreshed ahead of a save or an eval.
        needs_precise_bn = (
            (should_save or should_eval)
            and cfg.BN.USE_PRECISE_STATS
            and len(get_bn_modules(model)) > 0
        )
        if needs_precise_bn:
            calculate_and_update_precise_bn(
                precise_bn_loader,
                model,
                min(cfg.BN.NUM_BATCHES_PRECISE, len(precise_bn_loader)),
                cfg.NUM_GPUS > 0,
            )
        _ = misc.aggregate_sub_bn_stats(model)

        if should_save:
            cu.save_checkpoint(cfg.OUTPUT_DIR, model, optimizer, cur_epoch, cfg)

        if should_eval:
            is_best_epoch, _ = eval_epoch(
                val_loader, model, val_meter, cur_epoch, cfg, writer, wandb_log
            )
            # A best epoch gets its own checkpoint on top of the periodic one.
            if is_best_epoch:
                cu.save_checkpoint(
                    cfg.OUTPUT_DIR,
                    model,
                    optimizer,
                    cur_epoch,
                    cfg,
                    is_best_epoch=is_best_epoch,
                )

    if writer is not None:
        writer.close()
def train_des(cfg):
    """
    Run the training loop for a MONET_BERT model with a Lamb optimizer and a
    linear learning-rate schedule with warmup, saving checkpoints and
    evaluating on the validation loader on the epochs selected by the config.
    Args:
        cfg (CfgNode): configs. Details can be found in
            slowfast/config/defaults.py
    """
    # Seed NumPy and PyTorch from the same configured value.
    seed = cfg.RNG_SEED
    np.random.seed(seed)
    torch.manual_seed(seed)

    # Configure logging, then dump the active config.
    logging.setup_logging(cfg.OUTPUT_DIR)
    logger.info("Train with config:")
    logger.info(pprint.pformat(cfg))

    model = MONET_BERT(cfg)
    if cfg.NUM_GPUS:
        # Move the model onto the CUDA device of the current process.
        model = model.cuda(device=torch.cuda.current_device())

    def _new_lamb():
        # Single place for the Lamb hyper-parameters, because the optimizer
        # is constructed twice below.
        return Lamb(
            model.parameters(),
            lr=cfg.SOLVER.BASE_LR,
            betas=(0.9, 0.999),
            eps=1e-6,
            weight_decay=cfg.SOLVER.WEIGHT_DECAY,
            adam=False,
        )

    optimizer = _new_lamb()
    start_epoch = cu.load_train_checkpoint(cfg, model, optimizer)
    # NOTE(review): the optimizer is rebuilt right after the checkpoint load,
    # so the instance passed to the loader is discarded. If the loader
    # restores optimizer state, that state is lost here; confirm this reset
    # is intentional.
    optimizer = _new_lamb()

    train_loader = build_dataloader(cfg, "train")
    val_loader = build_dataloader(cfg, "val")

    # Schedule length is batches-per-epoch times the configured epoch count.
    batches_per_epoch = len(train_loader)
    total_steps = batches_per_epoch * cfg.SOLVER.MAX_EPOCH
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=4000,
        num_training_steps=total_steps,
    )

    # Meters are sized by the number of batches in each loader.
    train_meter = ClevrerTrainMeter(batches_per_epoch, cfg)
    val_meter = ClevrerValMeter(len(val_loader), cfg)

    logger.info("Start epoch: {}".format(start_epoch + 1))
    for cur_epoch in range(start_epoch, cfg.SOLVER.MAX_EPOCH):
        train_epoch(
            train_loader,
            model,
            optimizer,
            scheduler,
            train_meter,
            cur_epoch,
            cfg,
        )

        # Both decisions are taken before acting on either of them.
        should_save = cu.is_checkpoint_epoch(cfg, cur_epoch, None)
        should_eval = misc.is_eval_epoch(cfg, cur_epoch, None)

        if should_save:
            cu.save_checkpoint(cfg.OUTPUT_DIR, model, optimizer, cur_epoch, cfg)
        if should_eval:
            eval_epoch(val_loader, model, val_meter, cur_epoch, cfg)