def main_worker(cfg):
    """Evaluate the model on the test set on this rank, then merge on rank 0.

    Every rank calls ``test_classification`` with a per-rank output path
    ``<EVAL_DIR>/<rank>.txt``. After all ranks reach the barrier, rank 0
    calls ``merge`` on the evaluation directory.

    Args:
        cfg: Configuration object. It is frozen inside this function and
            must provide ``DDP_CONFIG.GPU_WORLD_RANK`` and
            ``CONFIG.LOG.EVAL_DIR``.
    """
    rank = cfg.DDP_CONFIG.GPU_WORLD_RANK

    # create tensorboard and logs (rank 0 only)
    if rank == 0:
        tb_logdir = build_log_dir(cfg)
        writer = SummaryWriter(log_dir=tb_logdir)
    else:
        writer = None
    cfg.freeze()

    try:
        # create model
        model = get_model(cfg)
        model = deploy_model(model, cfg)

        # create dataset and dataloader
        test_loader = build_dataloader_test(cfg)

        # exist_ok avoids a check-then-create race when several ranks reach
        # this point at the same time.
        eval_path = cfg.CONFIG.LOG.EVAL_DIR
        os.makedirs(eval_path, exist_ok=True)

        criterion = nn.CrossEntropyLoss().cuda()

        result_file = os.path.join(eval_path, str(rank) + '.txt')
        test_classification(model, test_loader, criterion, cfg, result_file)

        # Wait for every rank before merging. The barrier is skipped when no
        # process group has been initialised (single-process runs), where
        # calling it would raise.
        if torch.distributed.is_available() and torch.distributed.is_initialized():
            torch.distributed.barrier()

        if rank == 0:
            print("Start merging results...")
            merge(eval_path, cfg)
        else:
            print(rank, "Evaluation done!")
    finally:
        # The writer was previously never closed.
        if writer is not None:
            writer.close()
def main_worker(cfg):
    """Run a single validation pass of the classification model.

    Builds the model and validation loader, optionally loads weights when
    ``cfg.CONFIG.MODEL.LOAD`` is set, and calls
    ``validation_classification`` with epoch index ``-1``.

    Args:
        cfg: Configuration object; frozen inside this function.
    """
    # Only rank 0 gets a TensorBoard writer; the log directory is built
    # before the config is frozen.
    is_rank_zero = cfg.DDP_CONFIG.GPU_WORLD_RANK == 0
    writer = SummaryWriter(log_dir=build_log_dir(cfg)) if is_rank_zero else None
    cfg.freeze()

    # Model construction and deployment.
    network = deploy_model(get_model(cfg), cfg)

    # Validation data; the second returned value is not used here.
    val_loader, _ignored = build_dataloader_val(cfg)

    if cfg.CONFIG.MODEL.LOAD:
        network, _ignored = load_model(network, cfg, load_fc=True)

    loss_fn = nn.CrossEntropyLoss().cuda()

    # An adversarial evaluation call (adversarial_classification) with the
    # same arguments was disabled in the original and stays disabled.
    validation_classification(network, val_loader, -1, loss_fn, cfg, writer)

    if writer is None:
        return
    writer.close()
def main_worker(cfg):
    """Train a classification model with SGD and a MultiStepLR schedule.

    Each epoch runs ``train_classification`` and steps the scheduler.
    Validation runs every ``cfg.CONFIG.VAL.FREQ`` epochs and on the final
    epoch. A checkpoint is saved every ``cfg.CONFIG.LOG.SAVE_FREQ`` epochs,
    by rank 0 or when not running distributed.

    Args:
        cfg: Configuration object; frozen inside this function.
    """
    # Rank 0 is the only process that gets a TensorBoard writer.
    if cfg.DDP_CONFIG.GPU_WORLD_RANK != 0:
        writer = None
    else:
        writer = SummaryWriter(log_dir=build_log_dir(cfg))
    cfg.freeze()

    train_cfg = cfg.CONFIG.TRAIN
    ddp_cfg = cfg.DDP_CONFIG
    num_epochs = train_cfg.EPOCH_NUM

    # Model.
    model = deploy_model(get_model(cfg), cfg)

    # Data; the last two returned values are not used in this function.
    train_loader, val_loader, train_sampler, _, _ = build_dataloader(cfg)

    # Optimisation set-up.
    optimizer = torch.optim.SGD(
        model.parameters(),
        lr=train_cfg.LR,
        momentum=train_cfg.MOMENTUM,
        weight_decay=train_cfg.W_DECAY,
    )
    if cfg.CONFIG.MODEL.LOAD:
        model, _ = load_model(model, optimizer, cfg, load_fc=True)
    scheduler = torch.optim.lr_scheduler.MultiStepLR(
        optimizer,
        milestones=train_cfg.LR_MILESTONE,
        gamma=train_cfg.STEP,
    )
    criterion = nn.CrossEntropyLoss().cuda()

    base_iter = 0
    for epoch in range(num_epochs):
        if ddp_cfg.DISTRIBUTED:
            train_sampler.set_epoch(epoch)

        base_iter = train_classification(
            base_iter, model, train_loader, epoch, criterion, optimizer, cfg,
            writer=writer)
        scheduler.step()

        is_last_epoch = epoch == num_epochs - 1
        if epoch % cfg.CONFIG.VAL.FREQ == 0 or is_last_epoch:
            validation_classification(model, val_loader, epoch, criterion,
                                      cfg, writer)

        if epoch % cfg.CONFIG.LOG.SAVE_FREQ != 0:
            continue
        # Explicit comparison kept as in the original to preserve semantics.
        if ddp_cfg.GPU_WORLD_RANK == 0 or ddp_cfg.DISTRIBUTED == False:
            save_model(model, optimizer, epoch, cfg)

    if writer is None:
        return
    writer.close()
def main_worker(cfg):
    """Drive iteration-based training through ``DirectposePipeline``.

    The pipeline is stepped until ``pipeline.base_iter`` reaches
    ``pipeline.max_iter``. Validation runs every
    ``cfg.CONFIG.VAL.EVAL_PERIOD`` iterations and at the final iteration.
    A checkpoint is saved every ``cfg.CONFIG.TRAIN.CHECKPOINT_PERIOD``
    iterations, by rank 0 or when not running distributed.

    Args:
        cfg: Configuration object; frozen inside this function.
    """
    # Rank 0 is the only process that gets a TensorBoard writer.
    if cfg.DDP_CONFIG.GPU_WORLD_RANK != 0:
        writer = None
    else:
        writer = SummaryWriter(log_dir=build_log_dir(cfg))
    cfg.freeze()

    # Model.
    model = deploy_model(directpose_resnet_lpf_fpn(cfg), cfg)

    # Data: validation uses the first entry of the configured VAL datasets.
    train_loader = build_pose_train_loader(cfg)
    val_loader = build_pose_test_loader(cfg, cfg.CONFIG.DATA.DATASET.VAL[0])

    # Optimiser and LR schedule.
    optimizer = build_pose_optimizer(cfg, model)
    scheduler = build_lr_scheduler(cfg, optimizer)

    pipeline = DirectposePipeline(
        0, cfg.CONFIG.TRAIN.ITER_NUM, model, train_loader, optimizer,
        scheduler, cfg, writer=writer)

    eval_period = cfg.CONFIG.VAL.EVAL_PERIOD
    checkpoint_period = cfg.CONFIG.TRAIN.CHECKPOINT_PERIOD

    while pipeline.base_iter < pipeline.max_iter:
        pipeline.train_step()

        at_eval_boundary = pipeline.base_iter % eval_period == 0
        if at_eval_boundary or pipeline.base_iter == pipeline.max_iter:
            pipeline.validate(val_loader)

        if pipeline.base_iter % checkpoint_period != 0:
            continue
        # Explicit comparison kept as in the original to preserve semantics.
        if cfg.DDP_CONFIG.GPU_WORLD_RANK == 0 or cfg.DDP_CONFIG.DISTRIBUTED == False:
            pipeline.save_model()

    if writer is None:
        return
    writer.close()
def main_worker(cfg):
    """Train via ``train_coot`` / ``validate_coot`` with early stopping.

    Each epoch runs one training pass. Validation runs every
    ``cfg.CONFIG.VAL.FREQ`` epochs and on the final epoch. The third element
    of the clip metrics decides whether the model improved, and is also
    passed to ``scheduler.step_rop``. Training stops once no improvement has
    been seen for more than 15 epochs.

    Args:
        cfg: Configuration object; frozen inside this function.

    Raises:
        ValueError: If ``cfg.CONFIG.TRAIN.LR_POLICY`` is not ``'Step'``,
            ``'Cosine'`` or ``'LR_Warmup'``.
    """
    # create tensorboard and logs
    if cfg.DDP_CONFIG.GPU_WORLD_RANK == 0:
        tb_logdir = build_log_dir(cfg)
        writer = SummaryWriter(log_dir=tb_logdir)
    else:
        writer = None
    cfg.freeze()

    # NOTE(review): tb_logdir is only bound on rank 0, so this line raises a
    # NameError on any other rank. Left unchanged because the right fallback
    # depends on get_logger/build_log_dir semantics not visible here.
    logger = get_logger(tb_logdir, "trainer", log_file=True)

    try:
        # create model
        model = get_model(cfg)
        model = deploy_model(model, cfg)

        # create dataset and dataloader
        data_path_dict = create_dataloader_path(
            cfg.CONFIG.COOT_DATA.DATA_PATH,
            cfg.CONFIG.COOT_DATA.DATASET_NAME,
            video_feature_name=cfg.CONFIG.COOT_DATA.FEATURE)
        train_set, val_set = create_datasets(
            data_path_dict, cfg,
            cfg.CONFIG.COOT_DATA.VIDEO_PRELOAD,
            cfg.CONFIG.COOT_DATA.TEXT_PRELOAD)
        train_loader, val_loader = create_loaders(
            train_set, val_set,
            cfg.CONFIG.TRAIN.BATCH_SIZE,
            cfg.CONFIG.DATA.NUM_WORKERS)

        optimizer = RAdam(
            model.get_params(),
            lr=cfg.CONFIG.TRAIN.LR,
            betas=(cfg.CONFIG.TRAIN.MOMENTUM, cfg.CONFIG.TRAIN.ADAM_BETA2),
            eps=cfg.CONFIG.TRAIN.ADAM_EPS,
            weight_decay=cfg.CONFIG.TRAIN.W_DECAY)

        if cfg.CONFIG.MODEL.LOAD:
            model, _ = load_model(model, optimizer, cfg, load_fc=True)

        lr_policy = cfg.CONFIG.TRAIN.LR_POLICY
        if lr_policy == 'Step':
            scheduler = torch.optim.lr_scheduler.MultiStepLR(
                optimizer,
                milestones=cfg.CONFIG.TRAIN.LR_MILESTONE,
                gamma=cfg.CONFIG.TRAIN.STEP)
        elif lr_policy == 'Cosine':
            scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
                optimizer,
                T_max=cfg.CONFIG.TRAIN.EPOCH_NUM - cfg.CONFIG.TRAIN.WARMUP_EPOCHS,
                eta_min=0,
                last_epoch=cfg.CONFIG.TRAIN.RESUME_EPOCH)
        elif lr_policy == 'LR_Warmup':
            scheduler = ReduceLROnPlateauWarmup(
                optimizer,
                cfg.CONFIG.TRAIN.WARMUP_EPOCHS,
                mode="max",
                patience=cfg.CONFIG.TRAIN.PATIENCE,
                cooldown=cfg.CONFIG.TRAIN.COOLDOWN)
        else:
            # The original printed the message with a literal "%s" and then
            # carried on with `scheduler` unbound; fail fast instead.
            raise ValueError(
                'Learning rate schedule %s is not supported yet. '
                'Please use Step, Cosine or LR_Warmup.' % lr_policy)

        criterion_cycleconsistency = CycleConsistencyCootLoss(num_samples=1,
                                                              use_cuda=True)
        criterion_alignment = MaxMarginRankingLoss(use_cuda=True)

        base_iter = 0
        det_best_field_best = 0
        # Initialised so the early-stop check below cannot hit an unbound
        # name when the first epoch is not reported as an improvement.
        best_epoch = 0
        clip_metrics = None
        # Number of epochs without improvement tolerated before stopping.
        term_after = 15

        for epoch in range(cfg.CONFIG.TRAIN.EPOCH_NUM):
            ## ======== Training step ===============
            base_iter = train_coot(cfg, base_iter, model, train_loader, epoch,
                                   criterion_alignment,
                                   criterion_cycleconsistency,
                                   optimizer, writer, logger)

            ## ======= Validation step ================
            if epoch % cfg.CONFIG.VAL.FREQ == 0 or epoch == cfg.CONFIG.TRAIN.EPOCH_NUM - 1:
                _, clip_metrics = validate_coot(
                    cfg, model, val_loader, epoch, criterion_alignment,
                    criterion_cycleconsistency, writer, logger, True)

            # Check if the performance of model is improving
            # NOTE(review): this section runs every epoch; on epochs without
            # validation it reuses the most recent clip_metrics - confirm
            # that is intended.
            logger.info("---------- Validating epoch {} ----------".format(epoch))
            clip_best_at_1 = None
            if clip_metrics is not None:
                _, _, clip_best_at_1 = clip_metrics

            # find field which determines is_best
            det_best_field_current = clip_best_at_1

            # check if best
            is_best = compare_metrics(det_best_field_current, det_best_field_best)
            if is_best:
                det_best_field_best = det_best_field_current
                best_epoch = epoch

            # step lr scheduler
            # NOTE(review): step_rop is only visibly tied to the 'LR_Warmup'
            # scheduler; the torch 'Step'/'Cosine' schedulers do not define
            # it - confirm those policies are meant to be usable here.
            scheduler.step_rop(det_best_field_current, True)
            logger.info(f"ROP: model improved: {is_best}, "
                        f"value {det_best_field_current:.3f},"
                        f"new LR: {optimizer.param_groups[0]['lr']:5.3e}")

            if epoch % cfg.CONFIG.LOG.SAVE_FREQ == 0:
                if cfg.DDP_CONFIG.GPU_WORLD_RANK == 0 or not cfg.DDP_CONFIG.DISTRIBUTED:
                    model.save_model(optimizer, epoch, cfg)

            # check if model did not improve for too long
            if epoch - best_epoch > term_after:
                logger.info(f"NO improvements for {term_after} epochs (current "
                            f"{epoch} best {best_epoch}) STOP training.")
                break
    finally:
        # Release logging resources even when training raises.
        if writer is not None:
            writer.close()
        if logger is not None:
            close_logger(logger)
def main_worker(cfg):
    """Train a classification model with optional warmup and multigrid.

    Each epoch runs ``train_classification`` and then steps either the
    warmup wrapper (when ``cfg.CONFIG.TRAIN.USE_WARMUP`` is set) or the base
    scheduler. With ``MULTIGRID.USE_LONG_CYCLE`` set, ``mg_sampler`` is
    advanced on the epochs listed in ``MULTIGRID.LONG_CYCLE_EPOCH``.
    Validation runs every ``cfg.CONFIG.VAL.FREQ`` epochs and on the final
    epoch. A checkpoint is saved every ``cfg.CONFIG.LOG.SAVE_FREQ`` epochs,
    by rank 0 or when not running distributed.

    Args:
        cfg: Configuration object; frozen inside this function.

    Raises:
        ValueError: If ``cfg.CONFIG.TRAIN.LR_POLICY`` is neither ``'Step'``
            nor ``'Cosine'``.
    """
    # create tensorboard and logs
    if cfg.DDP_CONFIG.GPU_WORLD_RANK == 0:
        tb_logdir = build_log_dir(cfg)
        writer = SummaryWriter(log_dir=tb_logdir)
    else:
        writer = None
    cfg.freeze()

    try:
        # create model
        model = get_model(cfg)
        model = deploy_model(model, cfg)

        # create dataset and dataloader
        train_loader, val_loader, train_sampler, val_sampler, mg_sampler = build_dataloader(
            cfg)

        optimizer = torch.optim.SGD(model.parameters(),
                                    lr=cfg.CONFIG.TRAIN.LR,
                                    momentum=cfg.CONFIG.TRAIN.MOMENTUM,
                                    weight_decay=cfg.CONFIG.TRAIN.W_DECAY)

        if cfg.CONFIG.MODEL.LOAD:
            model, _ = load_model(model, optimizer, cfg, load_fc=True)

        lr_policy = cfg.CONFIG.TRAIN.LR_POLICY
        if lr_policy == 'Step':
            scheduler = torch.optim.lr_scheduler.MultiStepLR(
                optimizer,
                milestones=cfg.CONFIG.TRAIN.LR_MILESTONE,
                gamma=cfg.CONFIG.TRAIN.STEP)
        elif lr_policy == 'Cosine':
            scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
                optimizer,
                T_max=cfg.CONFIG.TRAIN.EPOCH_NUM - cfg.CONFIG.TRAIN.WARMUP_EPOCHS,
                eta_min=0,
                last_epoch=cfg.CONFIG.TRAIN.RESUME_EPOCH)
        else:
            # The original printed the message with a literal "%s" and then
            # carried on with `scheduler` unbound, failing later with a
            # NameError; fail fast with the policy name filled in.
            raise ValueError(
                'Learning rate schedule %s is not supported yet. '
                'Please use Step or Cosine.' % lr_policy)

        if cfg.CONFIG.TRAIN.USE_WARMUP:
            scheduler_warmup = GradualWarmupScheduler(
                optimizer,
                multiplier=(cfg.CONFIG.TRAIN.WARMUP_END_LR / cfg.CONFIG.TRAIN.LR),
                total_epoch=cfg.CONFIG.TRAIN.WARMUP_EPOCHS,
                after_scheduler=scheduler)

        criterion = nn.CrossEntropyLoss().cuda()

        base_iter = 0
        for epoch in range(cfg.CONFIG.TRAIN.EPOCH_NUM):
            if cfg.DDP_CONFIG.DISTRIBUTED:
                train_sampler.set_epoch(epoch)

            base_iter = train_classification(base_iter, model, train_loader,
                                             epoch, criterion, optimizer, cfg,
                                             writer=writer)

            # The warmup wrapper receives the base scheduler as
            # after_scheduler, so only one of the two is stepped here.
            if cfg.CONFIG.TRAIN.USE_WARMUP:
                scheduler_warmup.step()
            else:
                scheduler.step()

            if cfg.CONFIG.TRAIN.MULTIGRID.USE_LONG_CYCLE:
                if epoch in cfg.CONFIG.TRAIN.MULTIGRID.LONG_CYCLE_EPOCH:
                    mg_sampler.step_long_cycle()

            if epoch % cfg.CONFIG.VAL.FREQ == 0 or epoch == cfg.CONFIG.TRAIN.EPOCH_NUM - 1:
                validation_classification(model, val_loader, epoch, criterion,
                                          cfg, writer)

            if epoch % cfg.CONFIG.LOG.SAVE_FREQ == 0:
                if cfg.DDP_CONFIG.GPU_WORLD_RANK == 0 or not cfg.DDP_CONFIG.DISTRIBUTED:
                    save_model(model, optimizer, epoch, cfg)
    finally:
        # Close the writer even when training raises.
        if writer is not None:
            writer.close()