def train_epoch(model, data_loader, criterion, optimizer, device, opt):
    model.train()

    losses = AverageMeter('Loss', ':.2f')
    accuracies = AverageMeter('Acc', ':.2f')
    progress = ProgressMeter(
        len(data_loader),
        [losses, accuracies],
        prefix='Train: ')

    # Training loop
    for batch_idx, (data, targets) in enumerate(data_loader):
        # Compute outputs and loss
        data, targets = data.to(device), targets.to(device)
        outputs = model(data)
        loss = criterion(outputs, targets)

        # Record statistics
        acc = accuracy(outputs, targets)
        losses.update(loss.item(), data.size(0))
        accuracies.update(acc[0].item(), data.size(0))

        # Backward pass and parameter update
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Show progress
        if batch_idx % opt.log_interval == 0:
            progress.display(batch_idx)

    # Epoch summary
    print(f' * Train Loss {losses.avg:.3f}, Train Acc {accuracies.avg:.3f}')

    return losses.avg, accuracies.avg
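# The AverageMeter/ProgressMeter helpers used throughout this file are not defined
# here. Below is a minimal sketch modeled on the helpers from the official PyTorch
# ImageNet example; the project's own implementations may differ in detail.
class AverageMeter(object):
    """Tracks the current value, running sum, count, and average of a metric."""

    def __init__(self, name, fmt=':f'):
        self.name = name
        self.fmt = fmt
        self.reset()

    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count if self.count > 0 else 0

    def __str__(self):
        fmtstr = '{name} {val' + self.fmt + '} ({avg' + self.fmt + '})'
        return fmtstr.format(**self.__dict__)


class ProgressMeter(object):
    """Prints a batch counter followed by the string form of each meter."""

    def __init__(self, num_batches, meters, prefix=""):
        self.batch_fmtstr = self._get_batch_fmtstr(num_batches)
        self.meters = meters
        self.prefix = prefix

    def display(self, batch):
        entries = [self.prefix + self.batch_fmtstr.format(batch)]
        entries += [str(meter) for meter in self.meters]
        print('\t'.join(entries))

    def _get_batch_fmtstr(self, num_batches):
        num_digits = len(str(num_batches))
        fmt = '{:' + str(num_digits) + 'd}'
        return '[' + fmt + '/' + fmt.format(num_batches) + ']'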
def evaluate(val_dataloader, img_encoder, text_encoder, fc_model, args):
    m_top1 = AverageMeter('Acc@1', ':6.2f')
    m_iou = AverageMeter('IoU', ':6.2f')
    m_ap50 = AverageMeter('AP50', ':6.2f')
    progress = ProgressMeter(len(val_dataloader), [m_top1, m_iou, m_ap50],
                             prefix='Test: ')

    img_encoder.eval()
    fc_model.eval()

    ignore_index = val_dataloader.dataset.ignore_index

    for i, batch in enumerate(val_dataloader):
        # Data
        region_proposals = batch['rpn_image'].cuda(non_blocking=True)
        commands = batch['command']
        sentence = batch['sentence']
        command_length = batch['command_length'].cuda(non_blocking=True)
        gt = batch['rpn_gt'].cuda(non_blocking=True)
        iou = batch['rpn_iou'].cuda(non_blocking=True).squeeze()
        b, r, c, h, w = region_proposals.size()

        # Image features
        img_features = img_encoder(region_proposals.view(b * r, c, h, w))
        norm = img_features.norm(p=2, dim=1, keepdim=True)
        img_features = img_features.div(norm).view(b, r, -1)

        # Sentence features
        sentence_features = torch.from_numpy(
            np.array(text_encoder.encode(sentence))).cuda(non_blocking=True)
        sentence_features = fc_model(sentence_features)

        # Product in latent space
        scores = torch.bmm(img_features,
                           sentence_features.unsqueeze(2)).squeeze()
        gt = gt.squeeze()

        # Summary
        pred = torch.argmax(scores, 1)
        pred_bin = F.one_hot(pred, r).bool()
        valid = (gt != ignore_index)
        num_valid = torch.sum(valid).float().item()
        m_top1.update(
            torch.sum(pred[valid] == gt[valid]).float().item(), num_valid)
        m_iou.update(
            torch.masked_select(iou, pred_bin).sum().float().item(), b)
        m_ap50.update(
            (torch.masked_select(iou, pred_bin) > 0.5).sum().float().item(), b)

        if i % args.print_freq == 0:
            progress.display(i)

    return m_ap50.avg
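# A minimal, self-contained illustration (with dummy tensors) of the scoring step used
# in evaluate() and train(): each of the r region-proposal features is L2-normalized
# and dotted with the sentence embedding, so scores[b, j] is a cosine-style similarity
# between region j and the command. The sizes below (b=2, r=8, d=256) are arbitrary.
import torch

b, r, d = 2, 8, 256                       # batch, regions per image, embedding dim
img_features = torch.randn(b * r, d)      # stand-in for the image encoder output
img_features = img_features / img_features.norm(p=2, dim=1, keepdim=True)
img_features = img_features.view(b, r, d)

sentence_features = torch.randn(b, d)     # stand-in for fc_model(text_encoder.encode(...))

scores = torch.bmm(img_features, sentence_features.unsqueeze(2)).squeeze(2)  # (b, r)
pred = torch.argmax(scores, 1)            # best-matching region proposal per image
print(scores.shape, pred.shape)           # torch.Size([2, 8]) torch.Size([2])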
def validate(val_loader, model, criterion, local_rank, args):
    batch_time = AverageMeter('Time', ':6.3f')
    losses = AverageMeter('Loss', ':.4e')
    top1 = AverageMeter('Acc@1', ':6.2f')
    top5 = AverageMeter('Acc@5', ':6.2f')
    progress = ProgressMeter(len(val_loader), [batch_time, losses, top1, top5],
                             prefix='Test: ')

    # switch to evaluate mode
    model.eval()

    with torch.no_grad():
        end = time.time()
        for i, (images, target) in enumerate(val_loader):
            images = images.cuda(local_rank, non_blocking=True)
            target = target.cuda(local_rank, non_blocking=True)

            # compute output
            output = model(images)
            loss = criterion(output, target)

            # measure accuracy and record loss, averaged across processes
            acc1, acc5 = accuracy(output, target, topk=(1, 5))

            torch.distributed.barrier()

            reduced_loss = reduce_mean(loss, args.nprocs)
            reduced_acc1 = reduce_mean(acc1, args.nprocs)
            reduced_acc5 = reduce_mean(acc5, args.nprocs)

            losses.update(reduced_loss.item(), images.size(0))
            top1.update(reduced_acc1.item(), images.size(0))
            top5.update(reduced_acc5.item(), images.size(0))

            # measure elapsed time
            batch_time.update(time.time() - end)
            end = time.time()

            # only print on the main process
            if local_rank == 0:
                progress.display(i)

        # TODO: this should also be done with the ProgressMeter
        # only print on the main process
        if local_rank == 0:
            print(' * Acc@1 {top1.avg:.3f} Acc@5 {top5.avg:.3f}'.format(
                top1=top1, top5=top5))

    return top1.avg
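# validate() relies on accuracy() and reduce_mean(), neither of which is defined in
# this file. The sketches below follow the standard patterns (top-k accuracy from the
# PyTorch ImageNet example; mean all-reduce across processes); the project's own
# versions may differ in detail.
import torch
import torch.distributed as dist


def accuracy(output, target, topk=(1,)):
    """Computes precision@k over a batch for each k in topk."""
    with torch.no_grad():
        maxk = max(topk)
        batch_size = target.size(0)

        _, pred = output.topk(maxk, 1, True, True)
        pred = pred.t()
        correct = pred.eq(target.view(1, -1).expand_as(pred))

        res = []
        for k in topk:
            correct_k = correct[:k].reshape(-1).float().sum(0, keepdim=True)
            res.append(correct_k.mul_(100.0 / batch_size))
        return res


def reduce_mean(tensor, nprocs):
    """Averages a tensor over all distributed processes."""
    rt = tensor.clone()
    dist.all_reduce(rt, op=dist.ReduceOp.SUM)
    rt /= nprocs
    return rt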
def train(train_dataloader, img_encoder, text_encoder, fc_model, optimizer,
          criterion, epoch, args):
    m_losses = AverageMeter('Loss', ':.4e')
    m_top1 = AverageMeter('Acc@1', ':6.2f')
    m_iou = AverageMeter('IoU', ':6.2f')
    m_ap50 = AverageMeter('AP50', ':6.2f')
    progress = ProgressMeter(len(train_dataloader),
                             [m_losses, m_top1, m_iou, m_ap50],
                             prefix="Epoch: [{}]".format(epoch))

    img_encoder.train()
    fc_model.train()
    # text_encoder.train()

    ignore_index = train_dataloader.dataset.ignore_index

    for i, batch in enumerate(train_dataloader):
        # Data
        region_proposals = batch['rpn_image'].cuda(non_blocking=True)
        commands = batch['command']
        sentence = batch['sentence']
        command_length = batch['command_length'].cuda(non_blocking=True)
        gt = batch['rpn_gt'].cuda(non_blocking=True)
        iou = batch['rpn_iou'].cuda(non_blocking=True).squeeze()
        b, r, c, h, w = region_proposals.size()

        # Image features
        img_features = img_encoder(region_proposals.view(b * r, c, h, w))
        norm = img_features.norm(p=2, dim=1, keepdim=True)
        img_features = img_features.div(norm).view(b, r, -1)

        # Sentence features
        sentence_features = torch.from_numpy(
            np.array(text_encoder.encode(sentence))).cuda(non_blocking=True)
        sentence_features = fc_model(sentence_features)

        # Product in latent space
        scores = torch.bmm(img_features,
                           sentence_features.unsqueeze(2)).squeeze()
        gt = gt.squeeze()

        # Loss
        total_loss = criterion(scores, gt)

        # Update
        optimizer.zero_grad()
        total_loss.backward()
        optimizer.step()

        # Summary
        pred = torch.argmax(scores, 1)
        pred_bin = F.one_hot(pred, r).bool()
        valid = (gt != ignore_index)
        num_valid = torch.sum(valid).float().item()
        m_top1.update(
            torch.sum(pred[valid] == gt[valid]).float().item(), num_valid)
        m_iou.update(
            torch.masked_select(iou, pred_bin).sum().float().item(), b)
        m_ap50.update(
            (torch.masked_select(iou, pred_bin) > 0.5).sum().float().item(), b)
        m_losses.update(total_loss.item())

        if i % args.print_freq == 0:
            progress.display(i)
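# train() treats the per-image scores over r region proposals as logits and the
# ground-truth proposal index as the class label. A natural choice of criterion
# (an assumption -- the actual loss is supplied by the caller) is a cross-entropy
# that skips samples whose label equals the dataset's ignore_index:
import torch
import torch.nn as nn

ignore_index = -100                          # hypothetical value; the dataset defines its own
criterion = nn.CrossEntropyLoss(ignore_index=ignore_index)

scores = torch.randn(4, 8)                   # (batch, num_region_proposals) logits
gt = torch.tensor([3, 0, ignore_index, 5])   # target proposal index; one sample is ignored
loss = criterion(scores, gt)
print(loss.item())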
def main_worker():
    seed = 1
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)

    opt = parse_opts()

    train_data = get_training_data(cfg)
    val_data = get_validation_data(cfg)

    train_loader = DataLoader(train_data,
                              num_workers=opt.num_workers,
                              collate_fn=collater,
                              batch_size=opt.batch_size,
                              shuffle=True)
    val_loader = DataLoader(val_data,
                            num_workers=opt.num_workers,
                            collate_fn=collater,
                            batch_size=opt.batch_size,
                            shuffle=True)

    print(f"Training dataset size : {len(train_loader.dataset)}")
    print(f"Validation dataset size : {len(val_loader.dataset)}")

    dataiterator = iter(train_loader)

    faster_rcnn = FasterRCNN()

    # if torch.cuda.device_count() > 1 and opt.multi_gpu:
    #     print("Let's use", torch.cuda.device_count(), "GPUs!")
    #     faster_rcnn = nn.DataParallel(faster_rcnn)

    # Load model from a checkpoint
    if opt.weight_path:
        load_from_ckpt(opt, faster_rcnn)
    faster_rcnn.to(cfg.DEVICE)

    if opt.lr is not None:
        cfg.TRAIN.LEARNING_RATE = opt.lr
    lr = cfg.TRAIN.LEARNING_RATE
    print(f"Learning rate : {lr}")

    if opt.weight_decay is not None:
        cfg.TRAIN.WEIGHT_DECAY = opt.weight_decay
    print(f"Weight Decay : {cfg.TRAIN.WEIGHT_DECAY}")

    ### Optimizer ###
    # Record backbone params (conv_body and box_head) and predicate-branch params,
    # split into bias and non-bias groups
    backbone_bias_params = []
    backbone_bias_param_names = []
    prd_branch_bias_params = []
    prd_branch_bias_param_names = []
    backbone_nonbias_params = []
    backbone_nonbias_param_names = []
    prd_branch_nonbias_params = []
    prd_branch_nonbias_param_names = []
    for key, value in dict(faster_rcnn.named_parameters()).items():
        if value.requires_grad:
            if 'fpn' in key or 'box_head' in key or 'box_predictor' in key or 'rpn' in key:
                if 'bias' in key:
                    backbone_bias_params.append(value)
                    backbone_bias_param_names.append(key)
                else:
                    backbone_nonbias_params.append(value)
                    backbone_nonbias_param_names.append(key)
            else:
                if 'bias' in key:
                    prd_branch_bias_params.append(value)
                    prd_branch_bias_param_names.append(key)
                else:
                    prd_branch_nonbias_params.append(value)
                    prd_branch_nonbias_param_names.append(key)

    params = [
        {
            'params': backbone_nonbias_params,
            'lr': cfg.TRAIN.LEARNING_RATE,
            'weight_decay': cfg.TRAIN.WEIGHT_DECAY
        },
        {
            'params': backbone_bias_params,
            'lr': cfg.TRAIN.LEARNING_RATE * (cfg.TRAIN.DOUBLE_BIAS + 1),
            'weight_decay': cfg.TRAIN.WEIGHT_DECAY if cfg.TRAIN.BIAS_DECAY else 0
        },
        {
            'params': prd_branch_nonbias_params,
            'lr': cfg.TRAIN.LEARNING_RATE,
            'weight_decay': cfg.TRAIN.WEIGHT_DECAY
        },
        {
            'params': prd_branch_bias_params,
            'lr': cfg.TRAIN.LEARNING_RATE * (cfg.TRAIN.DOUBLE_BIAS + 1),
            'weight_decay': cfg.TRAIN.WEIGHT_DECAY if cfg.TRAIN.BIAS_DECAY else 0
        },
    ]

    if cfg.TRAIN.TYPE == "ADAM":
        optimizer = torch.optim.Adam(params)
    elif cfg.TRAIN.TYPE == "SGD":
        optimizer = torch.optim.SGD(params, momentum=cfg.TRAIN.MOMENTUM)

    # Scheduler
    if opt.scheduler == "plateau":
        scheduler = lr_scheduler.ReduceLROnPlateau(optimizer, 'min', patience=5)
    elif opt.scheduler == "multi_step":
        scheduler = lr_scheduler.MultiStepLR(optimizer,
                                             milestones=[83631, 111508])
    elif opt.scheduler == "step_lr":
        scheduler = lr_scheduler.StepLR(optimizer,
                                        step_size=5,
                                        gamma=0.1,
                                        last_epoch=-1)

    if opt.weight_path:
        opt.begin_iter = load_train_utils(opt, optimizer, scheduler)

    # lr of non-backbone parameters, for command line outputs.
    lr = optimizer.param_groups[0]['lr']
    # lr of backbone parameters, for command line outputs.
    # backbone_lr = optimizer.param_groups[0]['lr']

    summary_writer = Metrics(log_dir='tf_logs')

    losses_sbj = AverageMeter('Sbj loss: ', ':.2f')
    losses_obj = AverageMeter('Obj loss: ', ':.2f')
    losses_rel = AverageMeter('Rel loss: ', ':.2f')
    losses_total = AverageMeter('Total loss: ', ':.2f')
    progress = ProgressMeter(opt.max_iter,
                             [losses_sbj, losses_obj, losses_rel, losses_total],
                             prefix='Train: ')

    faster_rcnn.train()
    th = 10000

    for step in range(opt.begin_iter, opt.max_iter):
        try:
            input_data = next(dataiterator)
        except StopIteration:
            dataiterator = iter(train_loader)
            input_data = next(dataiterator)

        images, targets = input_data
        _, metrics = faster_rcnn(images, targets)
        final_loss = metrics["loss_objectness"] + metrics["loss_rpn_box_reg"] + \
            metrics["loss_classifier"] + metrics["loss_box_reg"] + \
            metrics["loss_sbj"] + metrics["loss_obj"] + metrics["loss_rlp"]

        optimizer.zero_grad()
        final_loss.backward()
        optimizer.step()

        losses_sbj.update(metrics["loss_sbj"].item(), len(images))
        losses_obj.update(metrics["loss_obj"].item(), len(images))
        losses_rel.update(metrics["loss_rlp"].item(), len(images))
        losses_total.update(final_loss.item(), len(images))

        if opt.scheduler != "plateau":
            scheduler.step()

        if step % 10 == 0:
            progress.display(step)

        if step % 2500 == 0:
            train_losses = {}
            train_losses['total_loss'] = losses_total.avg
            train_losses['sbj_loss'] = losses_sbj.avg
            train_losses['obj_loss'] = losses_obj.avg
            train_losses['rel_loss'] = losses_rel.avg

            val_losses = val_epoch(faster_rcnn, val_loader)

            if opt.scheduler == "plateau":
                scheduler.step(val_losses['total_loss'])

            lr = optimizer.param_groups[0]['lr']

            # if val_losses['total_loss'] < th:
            #     save_model(faster_rcnn, optimizer, scheduler, step)
            #     print(f"*** Saved model ***")
            #     th = val_losses['total_loss']
            save_model(faster_rcnn, optimizer, scheduler, step)

            # write summary
            summary_writer.log_metrics(train_losses, val_losses, step, lr)

            print(
                f"* Average training loss : {train_losses['total_loss']:.3f}")
            print(
                f"* Average validation loss : {val_losses['total_loss']:.3f}")

            losses_sbj.reset()
            losses_obj.reset()
            losses_rel.reset()
            losses_total.reset()

            faster_rcnn.train()
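# main_worker() calls load_from_ckpt(), load_train_utils(), and save_model(), none of
# which are defined in this file. The sketch below is one plausible minimal layout,
# assuming a single checkpoint dict written with torch.save; the real helpers may
# store state differently.
import os
import torch


def save_model(model, optimizer, scheduler, step, ckpt_dir='checkpoints'):
    """Saves model/optimizer/scheduler state plus the current iteration."""
    os.makedirs(ckpt_dir, exist_ok=True)
    torch.save(
        {
            'step': step,
            'model': model.state_dict(),
            'optimizer': optimizer.state_dict(),
            'scheduler': scheduler.state_dict(),
        }, os.path.join(ckpt_dir, f'model_{step}.pth'))


def load_from_ckpt(opt, model):
    """Restores model weights from opt.weight_path."""
    ckpt = torch.load(opt.weight_path, map_location='cpu')
    model.load_state_dict(ckpt['model'])


def load_train_utils(opt, optimizer, scheduler):
    """Restores optimizer/scheduler state and returns the iteration to resume from."""
    ckpt = torch.load(opt.weight_path, map_location='cpu')
    optimizer.load_state_dict(ckpt['optimizer'])
    scheduler.load_state_dict(ckpt['scheduler'])
    return ckpt.get('step', 0)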