def train_des(cfg):
    """
    Train a video model for many epochs on train set and evaluate it on val set.
    Args:
        cfg (CfgNode): configs. Details can be found in
            slowfast/config/defaults.py
    """
    # Set random seed from configs.
    np.random.seed(cfg.RNG_SEED)
    torch.manual_seed(cfg.RNG_SEED)

    # Setup logging format.
    logging.setup_logging(cfg.OUTPUT_DIR)

    # Print config.
    logger.info("Train with config:")
    logger.info(pprint.pformat(cfg))

    # Build the video model and print model statistics.
    model = build_clevrer_model(cfg)

    # Construct the optimizer.
    optimizer = AdamW(model.parameters(), lr=cfg.SOLVER.BASE_LR, eps=1e-8)

    # Load a checkpoint to resume training if applicable.
    start_epoch = cu.load_train_checkpoint(cfg, model, optimizer)

    # Create the video train and val loaders.
    train_loader = build_dataloader(cfg, "train")
    val_loader = build_dataloader(cfg, "val")

    total_steps = len(train_loader) * cfg.SOLVER.MAX_EPOCH
    # Create the learning rate scheduler.
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=0,  # Default value in run_glue.py
        num_training_steps=total_steps,
    )

    # Create meters.
    train_meter = ClevrerTrainMeter(len(train_loader), cfg)
    val_meter = ClevrerValMeter(len(val_loader), cfg)

    # Perform the training loop.
    logger.info("Start epoch: {}".format(start_epoch + 1))
    for cur_epoch in range(start_epoch, cfg.SOLVER.MAX_EPOCH):
        # Train for one epoch.
        train_epoch(train_loader, model, optimizer, scheduler, train_meter,
                    cur_epoch, cfg)

        is_checkp_epoch = cu.is_checkpoint_epoch(cfg, cur_epoch, None)
        is_eval_epoch = misc.is_eval_epoch(cfg, cur_epoch, None)

        # Save a checkpoint.
        if is_checkp_epoch:
            cu.save_checkpoint(cfg.OUTPUT_DIR, model, optimizer, cur_epoch,
                               cfg)
        # Evaluate the model on validation set.
        if is_eval_epoch:
            eval_epoch(val_loader, model, val_meter, cur_epoch, cfg)

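
# get_linear_schedule_with_warmup comes from HuggingFace transformers: it
# ramps the learning rate linearly from 0 over num_warmup_steps, then decays
# it linearly to 0 at num_training_steps. A minimal sketch of the same
# schedule via LambdaLR (illustrative, not the library implementation):
def linear_warmup_schedule_sketch(optimizer, num_warmup_steps,
                                  num_training_steps):
    def lr_lambda(step):
        if step < num_warmup_steps:
            # Warmup: multiplier grows linearly from 0 to 1.
            return float(step) / float(max(1, num_warmup_steps))
        # Decay: multiplier shrinks linearly from 1 to 0.
        return max(
            0.0,
            float(num_training_steps - step) /
            float(max(1, num_training_steps - num_warmup_steps)),
        )

    return torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda)
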
def train(cfg):
    """
    Train a video model for many epochs on train set and evaluate it on val set.
    Args:
        cfg (CfgNode): configs. Details can be found in
            slowfast/config/defaults.py
    """
    # Set up environment.
    du.init_distributed_training(cfg)
    # Set random seed from configs.
    np.random.seed(cfg.RNG_SEED)
    torch.manual_seed(cfg.RNG_SEED)

    # Setup logging format.
    logging.setup_logging(cfg.OUTPUT_DIR)

    # Init multigrid.
    multigrid = None
    if cfg.MULTIGRID.LONG_CYCLE or cfg.MULTIGRID.SHORT_CYCLE:
        multigrid = MultigridSchedule()
        cfg = multigrid.init_multigrid(cfg)
        if cfg.MULTIGRID.LONG_CYCLE:
            cfg, _ = multigrid.update_long_cycle(cfg, cur_epoch=0)
    # Print config.
    logger.info("Train with config:")
    logger.info(pprint.pformat(cfg))

    # Build the video model and print model statistics.
    model = build_model(cfg)
    if du.is_master_proc() and cfg.LOG_MODEL_INFO:
        misc.log_model_info(model, cfg, use_train_input=True)

    # Construct the optimizer.
    optimizer = optim.construct_optimizer(model, cfg)

    # Load a checkpoint to resume training if applicable.
    start_epoch = cu.load_train_checkpoint(cfg, model, optimizer)

    # Create the video train and val loaders.
    train_loader = loader.construct_loader(cfg, "train")
    val_loader = loader.construct_loader(cfg, "val")
    precise_bn_loader = (
        loader.construct_loader(cfg, "train", is_precise_bn=True)
        if cfg.BN.USE_PRECISE_STATS
        else None
    )

    # Create meters.
    if cfg.DETECTION.ENABLE:
        train_meter = AVAMeter(len(train_loader), cfg, mode="train")
        val_meter = AVAMeter(len(val_loader), cfg, mode="val")
    else:
        train_meter = TrainMeter(len(train_loader), cfg)
        val_meter = ValMeter(len(val_loader), cfg)

    # Set up writer for logging to Tensorboard format.
    if cfg.TENSORBOARD.ENABLE and du.is_master_proc(
            cfg.NUM_GPUS * cfg.NUM_SHARDS):
        writer = tb.TensorboardWriter(cfg)
    else:
        writer = None

    # Perform the training loop.
    logger.info("Start epoch: {}".format(start_epoch + 1))

    for cur_epoch in range(start_epoch, cfg.SOLVER.MAX_EPOCH):
        if cfg.MULTIGRID.LONG_CYCLE:
            cfg, changed = multigrid.update_long_cycle(cfg, cur_epoch)
            if changed:
                (
                    model,
                    optimizer,
                    train_loader,
                    val_loader,
                    precise_bn_loader,
                    train_meter,
                    val_meter,
                ) = build_trainer(cfg)

                # Load checkpoint.
                if cu.has_checkpoint(cfg.OUTPUT_DIR):
                    last_checkpoint = cu.get_last_checkpoint(cfg.OUTPUT_DIR)
                    assert "{:05d}.pyth".format(cur_epoch) in last_checkpoint
                else:
                    last_checkpoint = cfg.TRAIN.CHECKPOINT_FILE_PATH
                logger.info("Load from {}".format(last_checkpoint))
                cu.load_checkpoint(last_checkpoint, model, cfg.NUM_GPUS > 1,
                                   optimizer)

        # Shuffle the dataset.
        loader.shuffle_dataset(train_loader, cur_epoch)

        # Train for one epoch.
        train_epoch(train_loader, model, optimizer, train_meter, cur_epoch,
                    cfg, writer)

        is_checkp_epoch = cu.is_checkpoint_epoch(
            cfg,
            cur_epoch,
            None if multigrid is None else multigrid.schedule,
        )
        is_eval_epoch = misc.is_eval_epoch(
            cfg, cur_epoch, None if multigrid is None else multigrid.schedule)

        # Compute precise BN stats.
        if ((is_checkp_epoch or is_eval_epoch) and cfg.BN.USE_PRECISE_STATS
                and len(get_bn_modules(model)) > 0):
            calculate_and_update_precise_bn(
                precise_bn_loader,
                model,
                min(cfg.BN.NUM_BATCHES_PRECISE, len(precise_bn_loader)),
                cfg.NUM_GPUS > 0,
            )
            _ = misc.aggregate_sub_bn_stats(model)

        # Save a checkpoint.
        if is_checkp_epoch:
            cu.save_checkpoint(cfg.OUTPUT_DIR, model, optimizer, cur_epoch,
                               cfg)
        # Evaluate the model on validation set.
        if is_eval_epoch:
            eval_epoch(val_loader, model, val_meter, cur_epoch, cfg, writer)

    if writer is not None:
        writer.close()

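
# calculate_and_update_precise_bn re-estimates BatchNorm running statistics
# with extra forward passes, since the EMA statistics collected during
# training can lag behind the final weights. A minimal sketch of the
# underlying idea (the repo delegates to fvcore's update_bn_stats; this is
# illustrative, not that implementation):
@torch.no_grad()
def recompute_bn_stats_sketch(model, loader, num_batches):
    bn_layers = [
        m for m in model.modules()
        if isinstance(m, torch.nn.modules.batchnorm._BatchNorm)
    ]
    for bn in bn_layers:
        bn.reset_running_stats()
        bn.momentum = None  # None => cumulative average over all batches.
    model.train()  # BN layers must see per-batch statistics.
    for i, (inputs, *_rest) in enumerate(loader):
        if i >= num_batches:
            break
        model(inputs)
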
def train(cfg):
    """
    Train a video model for many epochs on train set and evaluate it on val set.
    Args:
        cfg (CfgNode): configs. Details can be found in
            slowfast/config/defaults.py
    """
    # Set random seed from configs.
    np.random.seed(cfg.RNG_SEED)
    torch.manual_seed(cfg.RNG_SEED)

    # Setup logging format.
    logging.setup_logging()

    # Print config.
    logger.info("Train with config:")
    logger.info(pprint.pformat(cfg))

    # Build the video model and print model statistics.
    model = model_builder.build_model(cfg)
    if du.is_master_proc():
        misc.log_model_info(model)

    # Construct the optimizer.
    optimizer = optim.construct_optimizer(model, cfg)

    # Load a checkpoint to resume training if applicable.
    if cfg.TRAIN.AUTO_RESUME and cu.has_checkpoint(cfg.OUTPUT_DIR):
        logger.info("Load from last checkpoint.")
        last_checkpoint = cu.get_last_checkpoint(cfg.OUTPUT_DIR)
        checkpoint_epoch = cu.load_checkpoint(last_checkpoint, model,
                                              cfg.NUM_GPUS > 1, optimizer)
        start_epoch = checkpoint_epoch + 1
    elif cfg.TRAIN.CHECKPOINT_FILE_PATH != "":
        logger.info("Load from given checkpoint file.")
        checkpoint_epoch = cu.load_checkpoint(
            cfg.TRAIN.CHECKPOINT_FILE_PATH,
            model,
            cfg.NUM_GPUS > 1,
            optimizer,
            inflation=cfg.TRAIN.CHECKPOINT_INFLATE,
            convert_from_caffe2=cfg.TRAIN.CHECKPOINT_TYPE == "caffe2",
        )
        start_epoch = checkpoint_epoch + 1
    else:
        start_epoch = 0

    # Create the video train and val loaders.
    train_loader = loader.construct_loader(cfg, "train")
    val_loader = loader.construct_loader(cfg, "val")

    # Create meters.
    if cfg.DETECTION.ENABLE:
        train_meter = AVAMeter(len(train_loader), cfg, mode="train")
        val_meter = AVAMeter(len(val_loader), cfg, mode="val")
    else:
        train_meter = TrainMeter(len(train_loader), cfg)
        val_meter = ValMeter(len(val_loader), cfg)

    # Perform the training loop.
    logger.info("Start epoch: {}".format(start_epoch + 1))

    for cur_epoch in range(start_epoch, cfg.SOLVER.MAX_EPOCH):
        # Shuffle the dataset.
        loader.shuffle_dataset(train_loader, cur_epoch)

        # Train for one epoch.
        train_epoch(train_loader, model, optimizer, train_meter, cur_epoch,
                    cfg)

        # Compute precise BN stats.
        if cfg.BN.USE_PRECISE_STATS and len(get_bn_modules(model)) > 0:
            calculate_and_update_precise_bn(train_loader, model,
                                            cfg.BN.NUM_BATCHES_PRECISE)

        # Save a checkpoint.
        if cu.is_checkpoint_epoch(cur_epoch, cfg.TRAIN.CHECKPOINT_PERIOD):
            cu.save_checkpoint(cfg.OUTPUT_DIR, model, optimizer, cur_epoch,
                               cfg)
        # Evaluate the model on validation set.
        if misc.is_eval_epoch(cfg, cur_epoch):
            eval_epoch(val_loader, model, val_meter, cur_epoch, cfg)

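
# The AUTO_RESUME branch depends on cu.get_last_checkpoint finding the newest
# checkpoint under OUTPUT_DIR. A minimal sketch of that lookup, assuming the
# SlowFast naming convention "checkpoint_epoch_<NNNNN>.pyth" inside a
# "checkpoints" subdirectory (the exact layout is an assumption here):
import os


def get_last_checkpoint_sketch(output_dir):
    d = os.path.join(output_dir, "checkpoints")
    names = ([f for f in os.listdir(d) if f.startswith("checkpoint")]
             if os.path.isdir(d) else [])
    # Lexicographic max works because epoch numbers are zero-padded.
    return os.path.join(d, sorted(names)[-1]) if names else None
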
def train(cfg):
    """
    Train a video model for many epochs on train set and evaluate it on val set.
    Args:
        cfg (CfgNode): configs. Details can be found in
            slowfast/config/defaults.py
    """
    # Set random seed from configs.
    np.random.seed(cfg.RNG_SEED)
    torch.manual_seed(cfg.RNG_SEED)

    # Setup logging format.
    logging.setup_logging(cfg)

    # Print config.
    logger.info("Train with config:")
    logger.info(pprint.pformat(cfg))

    # Set up the Tensorboard writer on the master process only.
    if du.get_rank() == 0 and du.is_master_proc(num_gpus=cfg.NUM_GPUS):
        writer = SummaryWriter(log_dir=cfg.OUTPUT_DIR)
    else:
        writer = None

    # Set up Neptune experiment tracking on the master process only.
    if (du.get_rank() == 0 and du.is_master_proc(num_gpus=cfg.NUM_GPUS)
            and not cfg.DEBUG):
        tags = []
        if 'TAGS' in cfg and cfg.TAGS != []:
            tags = list(cfg.TAGS)
        neptune.set_project('Serre-Lab/motion')

        # Record the command-line overrides as experiment parameters.
        overrides = sys.argv[1:]
        overrides_dict = {}
        for i in range(len(overrides) // 2):
            overrides_dict[overrides[2 * i]] = overrides[2 * i + 1]
        overrides_dict['dir'] = cfg.OUTPUT_DIR

        if 'NEP_ID' in cfg and cfg.NEP_ID != "":
            # Resume an existing Neptune experiment.
            session = Session()
            project = session.get_project(
                project_qualified_name='Serre-Lab/motion')
            nep_experiment = project.get_experiments(id=cfg.NEP_ID)[0]
        else:
            nep_experiment = neptune.create_experiment(
                name=cfg.NAME, params=overrides_dict, tags=tags)
    else:
        nep_experiment = None

    # Build the video model and print model statistics.
    model = build_model(cfg)
    if du.is_master_proc(num_gpus=cfg.NUM_GPUS):
        misc.log_model_info(model, cfg, is_train=True)

    # Construct the optimizer.
    optimizer = optim.construct_optimizer(model, cfg)

    # Load a checkpoint to resume training if applicable.
    if cfg.TRAIN.AUTO_RESUME and cu.has_checkpoint(cfg.OUTPUT_DIR):
        logger.info("Load from last checkpoint.")
        last_checkpoint = cu.get_last_checkpoint(cfg.OUTPUT_DIR)
        checkpoint_epoch = cu.load_checkpoint(last_checkpoint, model,
                                              cfg.NUM_GPUS > 1, optimizer)
        start_epoch = checkpoint_epoch + 1
    elif cfg.TRAIN.CHECKPOINT_FILE_PATH != "":
        logger.info("Load from given checkpoint file.")
        checkpoint_epoch = cu.load_checkpoint(
            cfg.TRAIN.CHECKPOINT_FILE_PATH,
            model,
            cfg.NUM_GPUS > 1,
            optimizer,
            inflation=cfg.TRAIN.CHECKPOINT_INFLATE,
            convert_from_caffe2=cfg.TRAIN.CHECKPOINT_TYPE == "caffe2",
        )
        start_epoch = checkpoint_epoch + 1
    else:
        start_epoch = 0

    # Create the video train and val loaders.
    train_loader = loader.construct_loader(cfg, "train")
    val_loader = loader.construct_loader(cfg, "val")

    # Create meters.
    if cfg.DETECTION.ENABLE:
        train_meter = AVAMeter(len(train_loader), cfg, mode="train")
        val_meter = AVAMeter(len(val_loader), cfg, mode="val")
    else:
        train_meter = TrainMeter(len(train_loader), cfg)
        val_meter = ValMeter(len(val_loader), cfg)

    # Perform the training loop.
    logger.info("Start epoch: {}".format(start_epoch + 1))

    for cur_epoch in range(start_epoch, cfg.SOLVER.MAX_EPOCH):
        # Shuffle the dataset.
        loader.shuffle_dataset(train_loader, cur_epoch)
        # Train for one epoch.
        train_epoch(train_loader, model, optimizer, train_meter, cur_epoch,
                    writer, nep_experiment, cfg)

        # Compute precise BN stats.
        # if cfg.BN.USE_PRECISE_STATS and len(get_bn_modules(model)) > 0:
        #     calculate_and_update_precise_bn(
        #         train_loader, model, cfg.BN.NUM_BATCHES_PRECISE
        #     )

        # Save a checkpoint.
        if cu.is_checkpoint_epoch(cur_epoch, cfg.TRAIN.CHECKPOINT_PERIOD):
            cu.save_checkpoint(cfg.OUTPUT_DIR, model, optimizer, cur_epoch,
                               cfg)
        # Evaluate the model on validation set.
        if misc.is_eval_epoch(cfg, cur_epoch):
            eval_epoch(val_loader, model, val_meter, cur_epoch,
                       nep_experiment, cfg)

        if (du.get_rank() == 0 and du.is_master_proc(num_gpus=cfg.NUM_GPUS)
                and not cfg.DEBUG):
            nep_experiment.log_metric('epoch', cur_epoch)

def train_des(cfg):
    """
    Train a video model for many epochs on train set and evaluate it on val set.
    Args:
        cfg (CfgNode): configs. Details can be found in
            slowfast/config/defaults.py
    """
    # Set random seed from configs.
    np.random.seed(cfg.RNG_SEED)
    torch.manual_seed(cfg.RNG_SEED)

    # Setup logging format.
    logging.setup_logging(cfg.OUTPUT_DIR)

    # Print config.
    logger.info("Train with config:")
    logger.info(pprint.pformat(cfg))

    # Build the video model and print model statistics.
    model = build_clevrer_model(cfg)

    # Construct the optimizer.
    optimizer = optim.construct_optimizer(model, cfg)

    # Load a checkpoint to resume training if applicable.
    start_epoch = cu.load_train_checkpoint(cfg, model, optimizer)

    # This train script only supports the Clevrertext_des dataset.
    if cfg.TRAIN.DATASET != 'Clevrertext_des':
        print("This train script does not support your dataset: -{}-. "
              "Only Clevrertext_des".format(cfg.TRAIN.DATASET))
        exit()

    # Create the video train and val loaders.
    train_loader = build_dataloader(cfg, "train")
    val_loader = build_dataloader(cfg, "val")

    # Create meters.
    train_meter = ClevrerTrainMeter(len(train_loader), cfg)
    val_meter = ClevrerValMeter(len(val_loader), cfg)

    # Perform the training loop.
    logger.info("Start epoch: {}".format(start_epoch + 1))
    for cur_epoch in range(start_epoch, cfg.SOLVER.MAX_EPOCH):
        # Shuffle the dataset.
        # loader.shuffle_dataset(train_loader, cur_epoch)

        # Train for one epoch.
        train_epoch(train_loader, model, optimizer, train_meter, cur_epoch,
                    cfg)

        is_checkp_epoch = cu.is_checkpoint_epoch(cfg, cur_epoch, None)
        is_eval_epoch = misc.is_eval_epoch(cfg, cur_epoch, None)

        # Save a checkpoint.
        if is_checkp_epoch:
            cu.save_checkpoint(cfg.OUTPUT_DIR, model, optimizer, cur_epoch,
                               cfg)
        # Evaluate the model on validation set.
        if is_eval_epoch:
            eval_epoch(val_loader, model, val_meter, cur_epoch, cfg)

def train_epoch(train_loader,
                model,
                optimizer,
                train_meter,
                cur_epoch,
                cfg,
                writer=None):
    """
    Perform the video training for one epoch.
    Args:
        train_loader (loader): video training loader.
        model (model): the video model to train.
        optimizer (optim): the optimizer to perform optimization on the
            model's parameters.
        train_meter (TrainMeter): training meters to log the training
            performance.
        cur_epoch (int): current epoch of training.
        cfg (CfgNode): configs. Details can be found in
            slowfast/config/defaults.py
        writer (TensorboardWriter, optional): TensorboardWriter object to
            write Tensorboard log.
    """
    # Enable train mode.
    model.train()
    train_meter.iter_tic()
    data_size = len(train_loader)

    for cur_iter, (inputs, labels, _, meta) in enumerate(train_loader):
        # Transfer the data to the current GPU device.
        if cfg.NUM_GPUS:
            if isinstance(inputs, (list, )):
                for i in range(len(inputs)):
                    inputs[i] = inputs[i].cuda(non_blocking=True)
            else:
                inputs = inputs.cuda(non_blocking=True)
            labels = labels.cuda()
            for key, val in meta.items():
                if isinstance(val, (list, )):
                    for i in range(len(val)):
                        val[i] = val[i].cuda(non_blocking=True)
                else:
                    meta[key] = val.cuda(non_blocking=True)

        # Update the learning rate.
        lr = optim.get_epoch_lr(cur_epoch + float(cur_iter) / data_size, cfg)
        optim.set_lr(optimizer, lr)

        train_meter.data_toc()

        # ----------------------- save model test ---------------------------
        # Debug hook: periodically save a checkpoint. Note that cur_iter is
        # passed where an epoch index is expected.
        if cur_iter % 100 == 1:
            cu.save_checkpoint(cfg.OUTPUT_DIR, model, optimizer, cur_iter,
                               cfg)  # cur_epoch
            print("----------------------- save done ")
            # exit(0)
        # --------------------------------------------------------------------

        if cfg.DETECTION.ENABLE:
            # inputs [4, 3, 8, 224, 224], preds [32, 2048, 7, 7].
            # Reshape [1, 3, 8, 224, 224] -> [8, 3, 224, 224].
            inputs0 = inputs[0].squeeze(0).permute(1, 0, 2, 3)
            inputs1 = inputs[1].squeeze(0).permute(1, 0, 2, 3)
            meta["boxes"] = meta["boxes"].unsqueeze(0).unsqueeze(0)
            inputs = [inputs0, inputs1]
            preds = model(inputs, meta["boxes"])

            # Commented-out TorchScript export test:
            # import os
            # weights = 'checkpoints/checkpoint_epoch_00007.pyth'
            # os.environ['CUDA_VISIBLE_DEVICES'] = '0'
            # device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
            # chkpt = torch.load(weights, map_location=device)
            # try:
            #     model_dict = model.module.state_dict()
            # except AttributeError:
            #     model_dict = model.state_dict()
            # # Read the original state dict; multi-GPU training prepends a
            # # "module" prefix to the keys of the saved model.
            # # Drop keys from the pretrained dict that are not in model_dict.
            # chkpt = {k: v for k, v in chkpt.items() if k in model_dict}
            # print("load pretrain model")
            # model_dict.update(chkpt)
            # model.load_state_dict(model_dict)
            # model.to(device)
            # # inputs = [inputs.to(device)]
            # model.eval()
            # input_tensor = (inputs, meta["boxes"].to(device))
            # traced_script_module = torch.jit.trace(model, input_tensor)
            # traced_script_module.save("weights/sf_pytorch.pt")
            # print("************************* output saved **********************************")
            # exit(0)
        else:
            preds = model(inputs)

        # Explicitly declare reduction to mean.
        loss_fun = losses.get_loss_func(cfg.MODEL.LOSS_FUNC)(reduction="mean")

        # Compute the loss.
        loss = loss_fun(preds, labels)

        # Check for NaN losses.
        misc.check_nan_losses(loss)

        # Perform the backward pass.
        optimizer.zero_grad()
        loss.backward()
        # Update the parameters.
        optimizer.step()

        if cfg.DETECTION.ENABLE:
            if cfg.NUM_GPUS > 1:
                loss = du.all_reduce([loss])[0]
            loss = loss.item()

            # Update and log stats.
            train_meter.update_stats(None, None, None, loss, lr)
            # Write to tensorboard format if available.
            if writer is not None:
                writer.add_scalars(
                    {
                        "Train/loss": loss,
                        "Train/lr": lr
                    },
                    global_step=data_size * cur_epoch + cur_iter,
                )
        else:
            top1_err, top5_err = None, None
            if cfg.DATA.MULTI_LABEL:
                # Gather all the predictions across all the devices.
                if cfg.NUM_GPUS > 1:
                    [loss] = du.all_reduce([loss])
                loss = loss.item()
            else:
                # Compute the errors.
                num_topks_correct = metrics.topks_correct(
                    preds, labels, (1, 5))
                top1_err, top5_err = [(1.0 - x / preds.size(0)) * 100.0
                                      for x in num_topks_correct]
                # Gather all the predictions across all the devices.
                if cfg.NUM_GPUS > 1:
                    loss, top1_err, top5_err = du.all_reduce(
                        [loss, top1_err, top5_err])

                # Copy the stats from GPU to CPU (sync point).
                loss, top1_err, top5_err = (
                    loss.item(),
                    top1_err.item(),
                    top5_err.item(),
                )

            # Update and log stats.
            train_meter.update_stats(
                top1_err,
                top5_err,
                loss,
                lr,
                inputs[0].size(0) * max(
                    cfg.NUM_GPUS, 1
                ),  # If running on CPU (cfg.NUM_GPUS == 0), use 1 to represent 1 CPU.
            )
            # Write to tensorboard format if available.
            if writer is not None:
                writer.add_scalars(
                    {
                        "Train/loss": loss,
                        "Train/lr": lr,
                        "Train/Top1_err": top1_err,
                        "Train/Top5_err": top5_err,
                    },
                    global_step=data_size * cur_epoch + cur_iter,
                )

        train_meter.iter_toc()  # Measure allreduce for this meter.
        train_meter.log_iter_stats(cur_epoch, cur_iter)
        train_meter.iter_tic()

    # Log epoch stats.
    train_meter.log_epoch_stats(cur_epoch)
    train_meter.reset()

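
# metrics.topks_correct counts, for each k, how many samples have the true
# label among the top-k scores; the list comprehension above then converts
# those counts into top-1/top-5 error percentages. A minimal sketch of the
# computation (illustrative, not the repo's implementation):
def topks_correct_sketch(preds, labels, ks=(1, 5)):
    # preds: (N, C) class scores; labels: (N,) ground-truth indices.
    _, top_idx = preds.topk(max(ks), dim=1, largest=True, sorted=True)
    hits = top_idx.t().eq(labels.view(1, -1))  # (max_k, N) boolean hits.
    return [hits[:k].reshape(-1).float().sum() for k in ks]
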
def train(cfg):
    """
    Train a video model for many epochs on train set and evaluate it on val set.
    Args:
        cfg (CfgNode): configs. Details can be found in
            slowfast/config/defaults.py
    """
    # Set random seed from configs.
    np.random.seed(cfg.RNG_SEED)
    torch.manual_seed(cfg.RNG_SEED)

    # Setup logging format.
    logging.setup_logging()

    if du.is_master_proc():
        if cfg.ENABLE_WANDB:
            wandb.login()
            wandb.init(project='bbox', entity='slowfast')

    # Print config.
    # logger.info("Train with config:")
    # logger.info(pprint.pformat(cfg))

    # Build the video model and print model statistics.
    model = build_model(cfg)
    if cfg.EPICKITCHENS.USE_BBOX and not cu.has_checkpoint(cfg.OUTPUT_DIR):
        slow_fast_model = SlowFast(cfg)
        if (cfg.EPICKITCHENS.USE_BBOX
                and cfg.EPICKITCHENS.LOAD_SLOWFAST_PRETRAIN):
            if cfg.EPICKITCHENS.SLOWFAST_PRETRAIN_CHECKPOINT_FILE_PATH != "":
                _ = cu.load_checkpoint(
                    cfg.EPICKITCHENS.SLOWFAST_PRETRAIN_CHECKPOINT_FILE_PATH,
                    slow_fast_model,
                    False,
                    optimizer=None,
                    inflation=cfg.TRAIN.CHECKPOINT_INFLATE,
                    convert_from_caffe2=False,  # cfg.TRAIN.CHECKPOINT_TYPE == "caffe2"
                )
                logger.info("Load from slowfast.")
            # Copy the pretrained backbone weights into the bbox model.
            if cfg.NUM_GPUS > 1:
                model.module.load_weight_slowfast(slow_fast_model)
            else:
                model.load_weight_slowfast(slow_fast_model)

    # if du.is_master_proc():
    #     misc.log_model_info(model, cfg, is_train=True)

    if cfg.BN.FREEZE:
        model.freeze_fn('bn_parameters')
    if cfg.EPICKITCHENS.USE_BBOX and cfg.EPICKITCHENS.FREEZE_BACKBONE:
        model.freeze_fn('slowfast_bbox')

    # Construct the optimizer.
    optimizer = optim.construct_optimizer(model, cfg)

    # Load a checkpoint to resume training if applicable.
    if cfg.TRAIN.AUTO_RESUME and cu.has_checkpoint(cfg.OUTPUT_DIR):
        logger.info("Load from last checkpoint.")
        last_checkpoint = cu.get_last_checkpoint(cfg.OUTPUT_DIR)
        checkpoint_epoch = cu.load_checkpoint(last_checkpoint, model,
                                              cfg.NUM_GPUS > 1, optimizer)
        start_epoch = checkpoint_epoch + 1
    elif cfg.TRAIN.CHECKPOINT_FILE_PATH != "" and not cfg.TRAIN.FINETUNE:
        logger.info("Load from given checkpoint file.")
        checkpoint_epoch = cu.load_checkpoint(
            cfg.TRAIN.CHECKPOINT_FILE_PATH,
            model,
            cfg.NUM_GPUS > 1,
            optimizer,
            inflation=cfg.TRAIN.CHECKPOINT_INFLATE,
            convert_from_caffe2=cfg.TRAIN.CHECKPOINT_TYPE == "caffe2",
        )
        start_epoch = checkpoint_epoch + 1
    elif cfg.TRAIN.CHECKPOINT_FILE_PATH != "" and cfg.TRAIN.FINETUNE:
        logger.info("Load from given checkpoint file. Finetuning.")
        _ = cu.load_checkpoint(
            cfg.TRAIN.CHECKPOINT_FILE_PATH,
            model,
            cfg.NUM_GPUS > 1,
            None,
            inflation=cfg.TRAIN.CHECKPOINT_INFLATE,
            convert_from_caffe2=cfg.TRAIN.CHECKPOINT_TYPE == "caffe2",
        )
        start_epoch = 0
    else:
        start_epoch = 0

    # Create the video train and val loaders.
    if (cfg.TRAIN.DATASET != 'epickitchens'
            or not cfg.EPICKITCHENS.TRAIN_PLUS_VAL):
        train_loader = loader.construct_loader(cfg, "train")
        val_loader = loader.construct_loader(cfg, "val")
        logger.info("Train loader size: {}".format(len(train_loader)))
        logger.info("Val loader size: {}".format(len(val_loader)))
    else:
        train_loader = loader.construct_loader(cfg, "train+val")
        val_loader = loader.construct_loader(cfg, "val")

    # Create meters.
    if cfg.DETECTION.ENABLE:
        train_meter = AVAMeter(len(train_loader), cfg, mode="train")
        val_meter = AVAMeter(len(val_loader), cfg, mode="val")
    else:
        if cfg.TRAIN.DATASET == 'epickitchens':
            train_meter = EPICTrainMeterSimple(len(train_loader), cfg)
            val_meter = EPICValMeterSimple(len(val_loader), cfg)
        else:
            train_meter = TrainMeter(len(train_loader), cfg)
            val_meter = ValMeter(len(val_loader), cfg)

    # Perform the training loop.
    logger.info("Start epoch: {}".format(start_epoch + 1))

    cnt = 0
    # eval_epoch(val_loader, model, val_meter, 0, cfg, cnt)
    # test_from_train(model, cfg, cnt=cnt)
    for cur_epoch in range(start_epoch, cfg.SOLVER.MAX_EPOCH):
        # Shuffle the dataset.
        loader.shuffle_dataset(train_loader, cur_epoch)
        # Train for one epoch.
        cnt = train_epoch(train_loader, model, optimizer, train_meter,
                          cur_epoch, cfg, cnt)

        # Compute precise BN stats.
        if cfg.BN.USE_PRECISE_STATS and len(get_bn_modules(model)) > 0:
            calculate_and_update_precise_bn(train_loader, model,
                                            cfg.BN.NUM_BATCHES_PRECISE)

        # Save a checkpoint.
        if cu.is_checkpoint_epoch(cur_epoch, cfg.TRAIN.CHECKPOINT_PERIOD):
            cu.save_checkpoint(cfg.OUTPUT_DIR, model, optimizer, cur_epoch,
                               cfg)
        # Evaluate the model on validation set.
        if misc.is_eval_epoch(cfg, cur_epoch):
            is_best_epoch = eval_epoch(val_loader, model, val_meter,
                                       cur_epoch, cfg, cnt)
            if is_best_epoch:
                cu.save_checkpoint(cfg.OUTPUT_DIR, model, optimizer,
                                   cur_epoch, cfg,
                                   is_best_epoch=is_best_epoch)

def train(cfg):
    """
    Train an audio model for many epochs on train set and evaluate it on val set.
    Args:
        cfg (CfgNode): configs. Details can be found in
            slowfast/config/defaults.py
    """
    # Set up environment.
    du.init_distributed_training(cfg)
    # Set random seed from configs.
    np.random.seed(cfg.RNG_SEED)
    torch.manual_seed(cfg.RNG_SEED)

    # Setup logging format.
    logging.setup_logging(cfg.OUTPUT_DIR)

    # Print config.
    logger.info("Train with config:")
    logger.info(pprint.pformat(cfg))

    # Build the audio model and print model statistics.
    model = build_model(cfg)
    if du.is_master_proc() and cfg.LOG_MODEL_INFO:
        misc.log_model_info(model, cfg)

    if cfg.BN.FREEZE:
        if cfg.NUM_GPUS > 1:
            model.module.freeze_fn('bn_parameters')
        else:
            model.freeze_fn('bn_parameters')

    # Construct the optimizer.
    optimizer = optim.construct_optimizer(model, cfg)

    # Load a checkpoint to resume training if applicable.
    start_epoch = cu.load_train_checkpoint(cfg, model, optimizer)

    # Create the audio train and val loaders.
    if (cfg.TRAIN.DATASET != 'epickitchens'
            or not cfg.EPICKITCHENS.TRAIN_PLUS_VAL):
        train_loader = loader.construct_loader(cfg, "train")
        val_loader = loader.construct_loader(cfg, "val")
        precise_bn_loader = (loader.construct_loader(cfg, "train")
                             if cfg.BN.USE_PRECISE_STATS else None)
    else:
        train_loader = loader.construct_loader(cfg, "train+val")
        val_loader = loader.construct_loader(cfg, "val")
        precise_bn_loader = (loader.construct_loader(cfg, "train+val")
                             if cfg.BN.USE_PRECISE_STATS else None)

    # Create meters.
    if cfg.TRAIN.DATASET == 'epickitchens':
        train_meter = EPICTrainMeter(len(train_loader), cfg)
        val_meter = EPICValMeter(len(val_loader), cfg)
    else:
        train_meter = TrainMeter(len(train_loader), cfg)
        val_meter = ValMeter(len(val_loader), cfg)

    # Set up writer for logging to Tensorboard format.
    if cfg.TENSORBOARD.ENABLE and du.is_master_proc(
            cfg.NUM_GPUS * cfg.NUM_SHARDS):
        writer = tb.TensorboardWriter(cfg)
    else:
        writer = None

    # Set up Weights & Biases logging on the master process.
    if cfg.WANDB.ENABLE and du.is_master_proc(cfg.NUM_GPUS * cfg.NUM_SHARDS):
        wandb_log = True
        if cfg.TRAIN.AUTO_RESUME and cfg.WANDB.RUN_ID != "":
            wandb.init(project='slowfast',
                       config=cfg,
                       sync_tensorboard=True,
                       resume=cfg.WANDB.RUN_ID)
        else:
            wandb.init(project='slowfast', config=cfg, sync_tensorboard=True)
        wandb.watch(model)
    else:
        wandb_log = False

    # Perform the training loop.
    logger.info("Start epoch: {}".format(start_epoch + 1))

    for cur_epoch in range(start_epoch, cfg.SOLVER.MAX_EPOCH):
        # Shuffle the dataset.
        loader.shuffle_dataset(train_loader, cur_epoch)
        # Train for one epoch.
        train_epoch(train_loader, model, optimizer, train_meter, cur_epoch,
                    cfg, writer, wandb_log)

        is_checkp_epoch = cu.is_checkpoint_epoch(cfg, cur_epoch)
        is_eval_epoch = misc.is_eval_epoch(cfg, cur_epoch)

        # Compute precise BN stats.
        if ((is_checkp_epoch or is_eval_epoch) and cfg.BN.USE_PRECISE_STATS
                and len(get_bn_modules(model)) > 0):
            calculate_and_update_precise_bn(
                precise_bn_loader,
                model,
                min(cfg.BN.NUM_BATCHES_PRECISE, len(precise_bn_loader)),
                cfg.NUM_GPUS > 0,
            )
            _ = misc.aggregate_sub_bn_stats(model)

        # Save a checkpoint.
        if is_checkp_epoch:
            cu.save_checkpoint(cfg.OUTPUT_DIR, model, optimizer, cur_epoch,
                               cfg)
        # Evaluate the model on validation set.
        if is_eval_epoch:
            is_best_epoch, _ = eval_epoch(val_loader, model, val_meter,
                                          cur_epoch, cfg, writer, wandb_log)
            if is_best_epoch:
                cu.save_checkpoint(cfg.OUTPUT_DIR, model, optimizer,
                                   cur_epoch, cfg,
                                   is_best_epoch=is_best_epoch)

    if writer is not None:
        writer.close()

def train(cfg):
    """
    Train a video model for many epochs on train set and evaluate it on val set.
    Args:
        cfg (CfgNode): configs. Details can be found in
            slowfast/config/defaults.py
    """
    # Set up environment.
    du.init_distributed_training(cfg)
    # Set random seed from configs.
    np.random.seed(cfg.RNG_SEED)
    torch.manual_seed(cfg.RNG_SEED)

    # Setup logging format.
    logging.setup_logging(cfg.OUTPUT_DIR)

    # Init multigrid.
    multigrid = None
    if cfg.MULTIGRID.LONG_CYCLE or cfg.MULTIGRID.SHORT_CYCLE:
        multigrid = MultigridSchedule()
        cfg = multigrid.init_multigrid(cfg)
        if cfg.MULTIGRID.LONG_CYCLE:
            cfg, _ = multigrid.update_long_cycle(cfg, cur_epoch=0)
    # Print config.
    logger.info("Train with config:")
    logger.info(pprint.pformat(cfg))

    # Build the video model and print model statistics.
    model = build_model(cfg)
    if du.is_master_proc() and cfg.LOG_MODEL_INFO:
        misc.log_model_info(model, cfg, use_train_input=True)

    # Construct the optimizer.
    optimizer = optim.construct_optimizer(model, cfg)
    # Create a GradScaler for mixed precision training.
    scaler = torch.cuda.amp.GradScaler(enabled=cfg.TRAIN.MIXED_PRECISION)

    # Load a checkpoint to resume training if applicable.
    if cfg.TRAIN.AUTO_RESUME and cu.has_checkpoint(cfg.OUTPUT_DIR):
        logger.info("Load from last checkpoint.")
        last_checkpoint = cu.get_last_checkpoint(cfg.OUTPUT_DIR,
                                                 task=cfg.TASK)
        if last_checkpoint is not None:
            checkpoint_epoch = cu.load_checkpoint(
                last_checkpoint,
                model,
                cfg.NUM_GPUS > 1,
                optimizer,
                scaler if cfg.TRAIN.MIXED_PRECISION else None,
            )
            start_epoch = checkpoint_epoch + 1
        elif "ssl_eval" in cfg.TASK:
            last_checkpoint = cu.get_last_checkpoint(cfg.OUTPUT_DIR,
                                                     task="ssl")
            checkpoint_epoch = cu.load_checkpoint(
                last_checkpoint,
                model,
                cfg.NUM_GPUS > 1,
                optimizer,
                scaler if cfg.TRAIN.MIXED_PRECISION else None,
                epoch_reset=True,
                clear_name_pattern=cfg.TRAIN.CHECKPOINT_CLEAR_NAME_PATTERN,
            )
            start_epoch = checkpoint_epoch + 1
        else:
            start_epoch = 0
    elif cfg.TRAIN.CHECKPOINT_FILE_PATH != "":
        logger.info("Load from given checkpoint file.")
        checkpoint_epoch = cu.load_checkpoint(
            cfg.TRAIN.CHECKPOINT_FILE_PATH,
            model,
            cfg.NUM_GPUS > 1,
            optimizer,
            scaler if cfg.TRAIN.MIXED_PRECISION else None,
            inflation=cfg.TRAIN.CHECKPOINT_INFLATE,
            convert_from_caffe2=cfg.TRAIN.CHECKPOINT_TYPE == "caffe2",
            epoch_reset=cfg.TRAIN.CHECKPOINT_EPOCH_RESET,
            clear_name_pattern=cfg.TRAIN.CHECKPOINT_CLEAR_NAME_PATTERN,
        )
        start_epoch = checkpoint_epoch + 1
    else:
        start_epoch = 0

    # Create the video train and val loaders.
    train_loader = loader.construct_loader(cfg, "train")
    val_loader = loader.construct_loader(cfg, "val")
    precise_bn_loader = (
        loader.construct_loader(cfg, "train", is_precise_bn=True)
        if cfg.BN.USE_PRECISE_STATS
        else None
    )

    # if (
    #     cfg.TASK == "ssl"
    #     and cfg.MODEL.MODEL_NAME == "ContrastiveModel"
    #     and cfg.CONTRASTIVE.KNN_ON
    # ):
    #     if hasattr(model, "module"):
    #         model.module.init_knn_labels(train_loader)
    #     else:
    #         model.init_knn_labels(train_loader)

    # Create meters.
    if cfg.DETECTION.ENABLE:
        train_meter = AVAMeter(len(train_loader), cfg, mode="train")
        val_meter = AVAMeter(len(val_loader), cfg, mode="val")
    else:
        train_meter = TrainMeter(1e6, cfg)
        val_meter = ValMeter(1e6, cfg)

    # Set up writer for logging to Tensorboard format.
    if cfg.TENSORBOARD.ENABLE and du.is_master_proc(
            cfg.NUM_GPUS * cfg.NUM_SHARDS):
        writer = tb.TensorboardWriter(cfg)
    else:
        writer = None

    # Perform the training loop.
    logger.info("Start epoch: {}".format(start_epoch + 1))

    epoch_timer = EpochTimer()
    for cur_epoch in range(start_epoch, cfg.SOLVER.MAX_EPOCH):
        if cur_epoch > 0 and cfg.DATA.LOADER_CHUNK_SIZE > 0:
            num_chunks = math.ceil(cfg.DATA.LOADER_CHUNK_OVERALL_SIZE /
                                   cfg.DATA.LOADER_CHUNK_SIZE)
            skip_rows = cur_epoch % num_chunks * cfg.DATA.LOADER_CHUNK_SIZE
            logger.info(
                f"=================+++ num_chunks {num_chunks} skip_rows {skip_rows}"
            )
            cfg.DATA.SKIP_ROWS = skip_rows
            logger.info(f"|===========| skip_rows {skip_rows}")
            train_loader = loader.construct_loader(cfg, "train")
            loader.shuffle_dataset(train_loader, cur_epoch)

        if cfg.MULTIGRID.LONG_CYCLE:
            cfg, changed = multigrid.update_long_cycle(cfg, cur_epoch)
            if changed:
                (
                    model,
                    optimizer,
                    train_loader,
                    val_loader,
                    precise_bn_loader,
                    train_meter,
                    val_meter,
                ) = build_trainer(cfg)

                # Load checkpoint.
                if cu.has_checkpoint(cfg.OUTPUT_DIR):
                    last_checkpoint = cu.get_last_checkpoint(cfg.OUTPUT_DIR,
                                                             task=cfg.TASK)
                    assert "{:05d}.pyth".format(cur_epoch) in last_checkpoint
                else:
                    last_checkpoint = cfg.TRAIN.CHECKPOINT_FILE_PATH
                logger.info("Load from {}".format(last_checkpoint))
                cu.load_checkpoint(last_checkpoint, model, cfg.NUM_GPUS > 1,
                                   optimizer)

        # Shuffle the dataset.
        loader.shuffle_dataset(train_loader, cur_epoch)
        if hasattr(train_loader.dataset, "_set_epoch_num"):
            train_loader.dataset._set_epoch_num(cur_epoch)

        # Train for one epoch.
        epoch_timer.epoch_tic()
        train_epoch(
            train_loader,
            model,
            optimizer,
            scaler,
            train_meter,
            cur_epoch,
            cfg,
            writer,
        )
        epoch_timer.epoch_toc()
        logger.info(
            f"Epoch {cur_epoch} takes {epoch_timer.last_epoch_time():.2f}s. "
            f"Epochs from {start_epoch} to {cur_epoch} take "
            f"{epoch_timer.avg_epoch_time():.2f}s on average and "
            f"{epoch_timer.median_epoch_time():.2f}s in median.")
        logger.info(
            f"For epoch {cur_epoch}, each iteration takes "
            f"{epoch_timer.last_epoch_time()/len(train_loader):.2f}s on average. "
            f"From epoch {start_epoch} to {cur_epoch}, each iteration takes "
            f"{epoch_timer.avg_epoch_time()/len(train_loader):.2f}s on average."
        )

        is_checkp_epoch = (cu.is_checkpoint_epoch(
            cfg,
            cur_epoch,
            None if multigrid is None else multigrid.schedule,
        ) or cur_epoch == cfg.SOLVER.MAX_EPOCH - 1)
        is_eval_epoch = misc.is_eval_epoch(
            cfg, cur_epoch, None if multigrid is None else multigrid.schedule)

        # Compute precise BN stats.
        if ((is_checkp_epoch or is_eval_epoch) and cfg.BN.USE_PRECISE_STATS
                and len(get_bn_modules(model)) > 0):
            calculate_and_update_precise_bn(
                precise_bn_loader,
                model,
                min(cfg.BN.NUM_BATCHES_PRECISE, len(precise_bn_loader)),
                cfg.NUM_GPUS > 0,
            )
            _ = misc.aggregate_sub_bn_stats(model)

        # Save a checkpoint.
        if is_checkp_epoch:
            cu.save_checkpoint(
                cfg.OUTPUT_DIR,
                model,
                optimizer,
                cur_epoch,
                cfg,
                scaler if cfg.TRAIN.MIXED_PRECISION else None,
            )
        # Evaluate the model on validation set.
        if is_eval_epoch:
            eval_epoch(
                val_loader,
                model,
                val_meter,
                cur_epoch,
                cfg,
                train_loader,
                writer,
            )

    if writer is not None:
        writer.close()

    result_string = "Top1 Acc: {:.2f} Top5 Acc: {:.2f} MEM: {:.2f}".format(
        100 - val_meter.min_top1_err,
        100 - val_meter.min_top5_err,
        misc.gpu_mem_usage(),
    )
    logger.info("training done: {}".format(result_string))

    return result_string

def train_des(cfg):
    """
    Train a video model for many epochs on train set and evaluate it on val set.
    Args:
        cfg (CfgNode): configs. Details can be found in
            slowfast/config/defaults.py
    """
    # Set random seed from configs.
    np.random.seed(cfg.RNG_SEED)
    torch.manual_seed(cfg.RNG_SEED)

    # Setup logging format.
    logging.setup_logging(cfg.OUTPUT_DIR)

    # Print config.
    logger.info("Train with config:")
    logger.info(pprint.pformat(cfg))

    # Build the video model and print model statistics.
    model = MONET_BERT(cfg)
    if cfg.NUM_GPUS:
        # Determine the GPU used by the current process.
        cur_device = torch.cuda.current_device()
        # Transfer the model to the current GPU device.
        model = model.cuda(device=cur_device)

    # Construct the optimizer.
    # optimizer = AdamW(model.parameters(), lr=cfg.SOLVER.BASE_LR, eps=1e-8)
    # optimizer = optim.Adam(model.parameters(), lr=cfg.SOLVER.BASE_LR)
    optimizer = Lamb(model.parameters(),
                     lr=cfg.SOLVER.BASE_LR,
                     betas=(0.9, 0.999),
                     eps=1e-6,
                     weight_decay=cfg.SOLVER.WEIGHT_DECAY,
                     adam=False)

    # Load a checkpoint to resume training if applicable. (The original code
    # re-created the Lamb optimizer after this call, which discarded the
    # optimizer state restored from the checkpoint; the duplicate
    # construction is removed here.)
    start_epoch = cu.load_train_checkpoint(cfg, model, optimizer)

    # Create the video train and val loaders.
    train_loader = build_dataloader(cfg, "train")
    val_loader = build_dataloader(cfg, "val")

    total_steps = len(train_loader) * cfg.SOLVER.MAX_EPOCH
    # Create the learning rate scheduler with a linear warmup.
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=4000,
        num_training_steps=total_steps,
    )

    # Create meters.
    train_meter = ClevrerTrainMeter(len(train_loader), cfg)
    val_meter = ClevrerValMeter(len(val_loader), cfg)

    # Perform the training loop.
    logger.info("Start epoch: {}".format(start_epoch + 1))
    for cur_epoch in range(start_epoch, cfg.SOLVER.MAX_EPOCH):
        # Train for one epoch.
        train_epoch(train_loader, model, optimizer, scheduler, train_meter,
                    cur_epoch, cfg)

        is_checkp_epoch = cu.is_checkpoint_epoch(cfg, cur_epoch, None)
        is_eval_epoch = misc.is_eval_epoch(cfg, cur_epoch, None)

        # Save a checkpoint.
        if is_checkp_epoch:
            cu.save_checkpoint(cfg.OUTPUT_DIR, model, optimizer, cur_epoch,
                               cfg)
        # Evaluate the model on validation set.
        if is_eval_epoch:
            eval_epoch(val_loader, model, val_meter, cur_epoch, cfg)