Example #1
def train_epoch(model, data_loader, criterion, optimizer, device, opt):
    model.train()

    losses = AverageMeter('Loss', ':.2f')
    accuracies = AverageMeter('Acc', ':.2f')
    progress = ProgressMeter(
        len(data_loader),
        [losses, accuracies],
        prefix='Train: ')

    # Training
    for batch_idx, (data, targets) in enumerate(data_loader):
        # compute outputs
        data, targets = data.to(device), targets.to(device)

        outputs = model(data)
        loss = criterion(outputs, targets)

        acc = accuracy(outputs, targets)
        losses.update(loss.item(), data.size(0))
        accuracies.update(acc[0].item(), data.size(0))

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # show information
        if batch_idx % opt.log_interval == 0:
            progress.display(batch_idx)

    # show information
    print(f' * Train Loss {losses.avg:.3f}, Train Acc {accuracies.avg:.3f}')
    return losses.avg, accuracies.avg
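All of the snippets here assume AverageMeter and ProgressMeter helpers that are not shown. A minimal sketch in the style of the official PyTorch ImageNet example (an assumption; the actual helpers may differ):

class AverageMeter:
    """Tracks the current value, sum, count, and running average of a metric."""

    def __init__(self, name, fmt=':f'):
        self.name = name
        self.fmt = fmt
        self.reset()

    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count

    def __str__(self):
        return ('{name} {val' + self.fmt + '} ({avg' + self.fmt + '})').format(**self.__dict__)


class ProgressMeter:
    """Prints all meters for a given batch index, prefixed with progress info."""

    def __init__(self, num_batches, meters, prefix=''):
        self.num_batches = num_batches
        self.meters = meters
        self.prefix = prefix

    def display(self, batch):
        entries = [self.prefix + '[{}/{}]'.format(batch, self.num_batches)]
        entries += [str(meter) for meter in self.meters]
        print('\t'.join(entries))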
Example #2
@torch.no_grad()  # evaluation only: no gradients needed
def evaluate(val_dataloader, img_encoder, text_encoder, fc_model, args):
    m_top1 = AverageMeter('Acc@1', ':6.2f')
    m_iou = AverageMeter('IoU', ':6.2f')
    m_ap50 = AverageMeter('AP50', ':6.2f')
    progress = ProgressMeter(len(val_dataloader), [m_top1, m_iou, m_ap50],
                             prefix='Test: ')

    img_encoder.eval()
    fc_model.eval()

    ignore_index = val_dataloader.dataset.ignore_index

    for i, batch in enumerate(val_dataloader):

        # Data
        region_proposals = batch['rpn_image'].cuda(non_blocking=True)
        commands = batch['command']
        sentence = batch['sentence']
        command_length = batch['command_length'].cuda(non_blocking=True)
        gt = batch['rpn_gt'].cuda(non_blocking=True)

        iou = batch['rpn_iou'].cuda(non_blocking=True).squeeze()
        b, r, c, h, w = region_proposals.size()

        # Image features
        img_features = img_encoder(region_proposals.view(b * r, c, h, w))
        norm = img_features.norm(p=2, dim=1, keepdim=True)
        img_features = img_features.div(norm).view(b, r, -1)

        # Sentence features
        sentence_features = torch.from_numpy(
            np.array(text_encoder.encode(sentence))).cuda(non_blocking=True)
        sentence_features = fc_model(sentence_features)

        # Product in latent space
        scores = torch.bmm(img_features,
                           sentence_features.unsqueeze(2)).squeeze()
        gt = gt.squeeze()

        # Summary
        pred = torch.argmax(scores, 1)
        pred_bin = F.one_hot(pred, r).bool()
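        # pred_bin: (b, r) boolean mask, True only at the top-scoring proposal per sample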
        valid = (gt != ignore_index)
        num_valid = torch.sum(valid).float().item()
        m_top1.update(
            torch.sum(pred[valid] == gt[valid]).float().item(), num_valid)
        m_iou.update(
            torch.masked_select(iou, pred_bin).sum().float().item(), b)
        m_ap50.update(
            (torch.masked_select(iou, pred_bin) > 0.5).sum().float().item(), b)

        if i % args.print_freq == 0:
            progress.display(i)
    return m_ap50.avg
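The matching above is a batched dot product in the joint embedding space. A small standalone shape check (illustrative values only, not part of the original code):

import torch

b, r, d = 2, 8, 512                    # batch size, proposals per image, embedding dim
img_features = torch.randn(b, r, d)    # one L2-normalized embedding per region proposal
sentence_features = torch.randn(b, d)  # one embedding per sentence

# (b, r, d) x (b, d, 1) -> (b, r, 1) -> (b, r): one score per proposal
scores = torch.bmm(img_features, sentence_features.unsqueeze(2)).squeeze(2)
assert scores.shape == (b, r)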
Example #3
def validate(val_loader, model, criterion, local_rank, args):
    batch_time = AverageMeter('Time', ':6.3f')
    losses = AverageMeter('Loss', ':.4e')
    top1 = AverageMeter('Acc@1', ':6.2f')
    top5 = AverageMeter('Acc@5', ':6.2f')
    progress = ProgressMeter(len(val_loader), [batch_time, losses, top1, top5],
                             prefix='Test: ')

    # switch to evaluate mode
    model.eval()

    with torch.no_grad():
        end = time.time()
        for i, (images, target) in enumerate(val_loader):
            images = images.cuda(local_rank, non_blocking=True)
            target = target.cuda(local_rank, non_blocking=True)

            # compute output
            output = model(images)
            loss = criterion(output, target)

            # measure accuracy and record loss
            acc1, acc5 = accuracy(output, target, topk=(1, 5))

            torch.distributed.barrier()

            reduced_loss = reduce_mean(loss, args.nprocs)
            reduced_acc1 = reduce_mean(acc1, args.nprocs)
            reduced_acc5 = reduce_mean(acc5, args.nprocs)

            losses.update(reduced_loss.item(), images.size(0))
            top1.update(reduced_acc1.item(), images.size(0))
            top5.update(reduced_acc5.item(), images.size(0))

            # measure elapsed time
            batch_time.update(time.time() - end)
            end = time.time()
            # only print on the main process (rank 0)
            if local_rank == 0:
                progress.display(i)

        # TODO: this should also be done with the ProgressMeter
        # only print on the main process (rank 0)
        if local_rank == 0:
            print(' * Acc@1 {top1.avg:.3f} Acc@5 {top5.avg:.3f}'.format(
                top1=top1, top5=top5))

    return top1.avg
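validate (and train_epoch in Example #1) also rely on an accuracy helper and, for the distributed case, a reduce_mean helper. Minimal sketches following common PyTorch DDP boilerplate (assumptions; the originals may differ):

import torch
import torch.distributed as dist


def accuracy(output, target, topk=(1,)):
    """Top-k accuracy (in percent) for each requested k."""
    with torch.no_grad():
        maxk = max(topk)
        batch_size = target.size(0)
        _, pred = output.topk(maxk, 1, True, True)
        pred = pred.t()
        correct = pred.eq(target.view(1, -1).expand_as(pred))
        res = []
        for k in topk:
            correct_k = correct[:k].reshape(-1).float().sum(0, keepdim=True)
            res.append(correct_k.mul_(100.0 / batch_size))
        return res


def reduce_mean(tensor, nprocs):
    """Average a tensor across all distributed processes."""
    rt = tensor.clone()
    dist.all_reduce(rt, op=dist.ReduceOp.SUM)
    rt /= nprocs
    return rt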
Example #4
def train(train_dataloader, img_encoder, text_encoder, fc_model, optimizer,
          criterion, epoch, args):
    m_losses = AverageMeter('Loss', ':.4e')
    m_top1 = AverageMeter('Acc@1', ':6.2f')
    m_iou = AverageMeter('IoU', ':6.2f')
    m_ap50 = AverageMeter('AP50', ':6.2f')
    progress = ProgressMeter(len(train_dataloader),
                             [m_losses, m_top1, m_iou, m_ap50],
                             prefix="Epoch: [{}]".format(epoch))

    img_encoder.train()
    fc_model.train()
    # text_encoder.train()

    ignore_index = train_dataloader.dataset.ignore_index

    for i, batch in enumerate(train_dataloader):

        # Data
        region_proposals = batch['rpn_image'].cuda(non_blocking=True)
        commands = batch['command']
        sentence = batch['sentence']
        command_length = batch['command_length'].cuda(non_blocking=True)
        gt = batch['rpn_gt'].cuda(non_blocking=True)
        iou = batch['rpn_iou'].cuda(non_blocking=True).squeeze()
        b, r, c, h, w = region_proposals.size()

        # Image features
        img_features = img_encoder(region_proposals.view(b * r, c, h, w))
        norm = img_features.norm(p=2, dim=1, keepdim=True)
        img_features = img_features.div(norm).view(b, r, -1)

        # Sentence features
        sentence_features = torch.from_numpy(
            np.array(text_encoder.encode(sentence))).cuda(non_blocking=True)
        sentence_features = fc_model(sentence_features)

        # Product in latent space
        scores = torch.bmm(img_features,
                           sentence_features.unsqueeze(2)).squeeze()
        gt = gt.squeeze()

        # Loss
        total_loss = criterion(scores, gt)

        # Update
        optimizer.zero_grad()
        total_loss.backward()
        optimizer.step()

        # Summary
        pred = torch.argmax(scores, 1)
        pred_bin = F.one_hot(pred, r).bool()
        valid = (gt != ignore_index)
        num_valid = torch.sum(valid).float().item()
        m_top1.update(
            torch.sum(pred[valid] == gt[valid]).float().item(), num_valid)
        m_iou.update(
            torch.masked_select(iou, pred_bin).sum().float().item(), b)
        m_ap50.update(
            (torch.masked_select(iou, pred_bin) > 0.5).sum().float().item(), b)
        m_losses.update(total_loss.item())

        if i % args.print_freq == 0:
            progress.display(i)
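The criterion passed to train is not defined in this snippet. Since gt holds the index of the best-matching proposal and the dataset exposes an ignore_index, a cross-entropy loss over the (b, r) score matrix is one plausible choice (an assumption, not confirmed here):

import torch.nn as nn

# Hypothetical: scores are (b, r) logits over region proposals, gt is the
# index of the matching proposal; 255 is a placeholder for the dataset's
# actual ignore_index.
criterion = nn.CrossEntropyLoss(ignore_index=255)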
def main_worker():
    seed = 1
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)

    opt = parse_opts()
    train_data = get_training_data(cfg)
    val_data = get_validation_data(cfg)
    train_loader = DataLoader(train_data,
                              num_workers=opt.num_workers,
                              collate_fn=collater,
                              batch_size=opt.batch_size,
                              shuffle=True)
    val_loader = DataLoader(val_data,
                            num_workers=opt.num_workers,
                            collate_fn=collater,
                            batch_size=opt.batch_size,
                            shuffle=False)  # validation data need not be shuffled

    print(f"Training dataset size : {len(train_loader.dataset)}")
    print(f"Validation dataset size : {len(val_loader.dataset)}")

    dataiterator = iter(train_loader)

    faster_rcnn = FasterRCNN()

    # if torch.cuda.device_count() > 1 and opt.multi_gpu :
    #     print("Let's use", torch.cuda.device_count(), "GPUs!")
    #     faster_rcnn = nn.DataParallel(faster_rcnn)

    # loading model from a ckpt
    if opt.weight_path:
        load_from_ckpt(opt, faster_rcnn)
    faster_rcnn.to(cfg.DEVICE)

    if opt.lr is not None:
        cfg.TRAIN.LEARNING_RATE = opt.lr
    lr = cfg.TRAIN.LEARNING_RATE
    print(f"Learning rate : {lr}")

    if opt.weight_decay is not None:
        cfg.TRAIN.WEIGHT_DECAY = opt.weight_decay
    print(f"Weight Decay : {cfg.TRAIN.WEIGHT_DECAY}")

    ### Optimizer ###
    # record backbone params, i.e., conv_body and box_head params
    backbone_bias_params = []
    backbone_bias_param_names = []
    prd_branch_bias_params = []
    prd_branch_bias_param_names = []
    backbone_nonbias_params = []
    backbone_nonbias_param_names = []
    prd_branch_nonbias_params = []
    prd_branch_nonbias_param_names = []
    for key, value in dict(faster_rcnn.named_parameters()).items():
        if value.requires_grad:
            if 'fpn' in key or 'box_head' in key or 'box_predictor' in key or 'rpn' in key:
                if 'bias' in key:
                    backbone_bias_params.append(value)
                    backbone_bias_param_names.append(key)
                else:
                    backbone_nonbias_params.append(value)
                    backbone_nonbias_param_names.append(key)
            else:
                if 'bias' in key:
                    prd_branch_bias_params.append(value)
                    prd_branch_bias_param_names.append(key)
                else:
                    prd_branch_nonbias_params.append(value)
                    prd_branch_nonbias_param_names.append(key)
    params = [
        {
            'params': backbone_nonbias_params,
            'lr': cfg.TRAIN.LEARNING_RATE,
            'weight_decay': cfg.TRAIN.WEIGHT_DECAY
        },
        {
            'params': backbone_bias_params,
            'lr': cfg.TRAIN.LEARNING_RATE * (cfg.TRAIN.DOUBLE_BIAS + 1),
            'weight_decay':
            cfg.TRAIN.WEIGHT_DECAY if cfg.TRAIN.BIAS_DECAY else 0
        },
        {
            'params': prd_branch_nonbias_params,
            'lr': cfg.TRAIN.LEARNING_RATE,
            'weight_decay': cfg.TRAIN.WEIGHT_DECAY
        },
        {
            'params': prd_branch_bias_params,
            'lr': cfg.TRAIN.LEARNING_RATE * (cfg.TRAIN.DOUBLE_BIAS + 1),
            'weight_decay':
            cfg.TRAIN.WEIGHT_DECAY if cfg.TRAIN.BIAS_DECAY else 0
        },
    ]

    if cfg.TRAIN.TYPE == "ADAM":
        optimizer = torch.optim.Adam(params)

    elif cfg.TRAIN.TYPE == "SGD":
        optimizer = torch.optim.SGD(params, momentum=cfg.TRAIN.MOMENTUM)

    # scheduler
    if opt.scheduler == "plateau":
        scheduler = lr_scheduler.ReduceLROnPlateau(optimizer,
                                                   'min',
                                                   patience=5)
    elif opt.scheduler == "multi_step":
        scheduler = lr_scheduler.MultiStepLR(optimizer,
                                             milestones=[83631, 111508])
    elif opt.scheduler == "step_lr":
        scheduler = lr_scheduler.StepLR(optimizer,
                                        step_size=5,
                                        gamma=0.1,
                                        last_epoch=-1)

    if opt.weight_path:
        opt.begin_iter = load_train_utils(opt, optimizer, scheduler)

    # lr of non-backbone parameters, for command line outputs.
    lr = optimizer.param_groups[0]['lr']
    # lr of backbone parameters, for command line outputs.
    # backbone_lr = optimizer.param_groups[0]['lr']

    summary_writer = Metrics(log_dir='tf_logs')

    losses_sbj = AverageMeter('Sbj loss: ', ':.2f')
    losses_obj = AverageMeter('Obj loss: ', ':.2f')
    losses_rel = AverageMeter('Rel loss: ', ':.2f')
    losses_total = AverageMeter('Total loss: ', ':.2f')
    progress = ProgressMeter(
        opt.max_iter, [losses_sbj, losses_obj, losses_rel, losses_total],
        prefix='Train: ')

    faster_rcnn.train()
    th = 10000
    for step in range(opt.begin_iter, opt.max_iter):
        try:
            input_data = next(dataiterator)
        except StopIteration:
            dataiterator = iter(train_loader)
            input_data = next(dataiterator)

        images, targets = input_data
        _, metrics = faster_rcnn(images, targets)
        final_loss = metrics["loss_objectness"] + metrics["loss_rpn_box_reg"] + \
            metrics["loss_classifier"] + metrics["loss_box_reg"] + \
            metrics["loss_sbj"] + metrics["loss_obj"] + metrics["loss_rlp"]

        optimizer.zero_grad()
        final_loss.backward()
        optimizer.step()

        losses_sbj.update(metrics["loss_sbj"].item(), len(images))
        losses_obj.update(metrics["loss_obj"].item(), len(images))
        losses_rel.update(metrics["loss_rlp"].item(), len(images))
        losses_total.update(final_loss.item(), len(images))

        if opt.scheduler != "plateau":
            scheduler.step()

        if step % 10 == 0:
            progress.display(step)

        if step % 2500 == 0:
            train_losses = {}
            train_losses['total_loss'] = losses_total.avg
            train_losses['sbj_loss'] = losses_sbj.avg
            train_losses['obj_loss'] = losses_obj.avg
            train_losses['rel_loss'] = losses_rel.avg
            val_losses = val_epoch(faster_rcnn, val_loader)

            if opt.scheduler == "plateau":
                scheduler.step(val_losses['total_loss'])

            lr = optimizer.param_groups[0]['lr']

            # if val_losses['total_loss'] < th:
            #     save_model(faster_rcnn, optimizer, scheduler, step)
            #     print(f"*** Saved model ***")
            #     th = val_losses['total_loss']
            save_model(faster_rcnn, optimizer, scheduler, step)

            # write summary
            summary_writer.log_metrics(train_losses, val_losses, step, lr)

            print(
                f"* Average training loss : {train_losses['total_loss']:.3f}")
            print(
                f"* Average validation loss : {val_losses['total_loss']:.3f}")

            losses_sbj.reset()
            losses_obj.reset()
            losses_rel.reset()
            losses_total.reset()
            faster_rcnn.train()
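The optimizer setup in main_worker doubles the learning rate for bias parameters when cfg.TRAIN.DOUBLE_BIAS is set and exempts them from weight decay unless cfg.TRAIN.BIAS_DECAY is set. A self-contained illustration of the same pattern (hypothetical stand-in names, not this repo's code):

import torch
import torch.nn as nn

model = nn.Linear(10, 2)               # stand-in for faster_rcnn
base_lr, weight_decay = 0.01, 1e-4
double_bias, bias_decay = True, False  # stand-ins for DOUBLE_BIAS / BIAS_DECAY

bias_params = [p for n, p in model.named_parameters() if 'bias' in n]
nonbias_params = [p for n, p in model.named_parameters() if 'bias' not in n]

optimizer = torch.optim.SGD([
    {'params': nonbias_params, 'lr': base_lr, 'weight_decay': weight_decay},
    {'params': bias_params,
     'lr': base_lr * (double_bias + 1),  # 2x lr for biases when double_bias is True
     'weight_decay': weight_decay if bias_decay else 0},
], momentum=0.9)

for g in optimizer.param_groups:
    print(g['lr'], g['weight_decay'])  # 0.01/0.0001, then 0.02/0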