Example #1
import os.path as osp

import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim import lr_scheduler

# dataset, trainer, visualize, Models, and parse_args are project-local
# modules and helpers assumed to be importable within this repository.


def main():
    args = parse_args()

    # Set the GPU to use
    torch.cuda.set_device(args.gpu)

    annotations = osp.expanduser(args.annotations)
    questions = osp.expanduser(args.questions)

    vqa_loader = dataset.get_train_dataloader(annotations, questions,
                                              args.images, args)
    # We always use the vocab from the training set
    vocab = vqa_loader.dataset.vocab

    maps = {
        "word_to_wid": vqa_loader.dataset.word_to_wid,
        "wid_to_word": vqa_loader.dataset.wid_to_word,
        "ans_to_aid": vqa_loader.dataset.ans_to_aid,
        "aid_to_ans": vqa_loader.dataset.aid_to_ans,
    }
    val_loader = dataset.get_val_dataloader(osp.expanduser(
        args.val_annotations),
                                            osp.expanduser(args.val_questions),
                                            args.val_images,
                                            args,
                                            maps=maps,
                                            vocab=vocab,
                                            shuffle=False)

    arch = Models[args.arch].value
    model = arch(len(vocab), output_dim=args.top_answer_limit)

    if torch.cuda.is_available():
        model.cuda()

    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(),
                           lr=args.lr,
                           betas=tuple(args.betas),
                           weight_decay=args.weight_decay)
    scheduler = lr_scheduler.StepLR(optimizer,
                                    step_size=args.decay_interval,
                                    gamma=args.lr_decay)

    vis = visualize.Visualizer(args.port)

    print("Beginning training")
    print("#" * 80)

    for epoch in range(args.start_epoch, args.epochs):
        trainer.train(model,
                      vqa_loader,
                      criterion,
                      optimizer,
                      epoch,
                      args,
                      vis=vis)
        trainer.evaluate(model, val_loader, criterion, epoch, args, vis=vis)

        # Step the LR schedule once per epoch, after the optimizer has run
        # (the call order PyTorch requires since 1.1).
        scheduler.step()

    print("Training complete!")
Example #2
import os.path as osp

import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim import lr_scheduler
from torchvision import transforms

# dataset, trainer, visualize, Models, and parse_args are project-local
# modules and helpers assumed to be importable within this repository.


def main():
    args = parse_args()

    # Set the GPU to use
    torch.cuda.set_device(args.gpu)

    transform = transforms.Compose([
        transforms.Resize(256),
        transforms.RandomResizedCrop(224),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406],
                             std=[0.229, 0.224, 0.225])
    ])
    vqa_loader = dataset.get_train_dataloader(osp.expanduser(args.annotations),
                                              osp.expanduser(args.questions),
                                              args.images,
                                              args,
                                              raw_images=args.raw_images,
                                              transforms=transform)
    # We always use the vocab from the training set
    vocab = vqa_loader.dataset.vocab

    maps = {
        "vocab": vocab,
        "word_to_wid": vqa_loader.dataset.word_to_wid,
        "wid_to_word": vqa_loader.dataset.wid_to_word,
        "ans_to_aid": vqa_loader.dataset.ans_to_aid,
        "aid_to_ans": vqa_loader.dataset.aid_to_ans,
    }
    val_transform = transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406],
                             std=[0.229, 0.224, 0.225])
    ])
    val_loader = dataset.get_val_dataloader(osp.expanduser(
        args.val_annotations),
                                            osp.expanduser(args.val_questions),
                                            args.val_images,
                                            args,
                                            raw_images=args.raw_images,
                                            maps=maps,
                                            vocab=vocab,
                                            shuffle=False,
                                            transforms=val_transform)

    arch = Models[args.arch].value
    model = arch(len(vocab),
                 output_dim=args.top_answer_limit,
                 raw_images=args.raw_images)

    if args.resume:
        state = torch.load(args.resume)
        model.load_state_dict(state["model"])

    model.cuda()

    criterion = nn.CrossEntropyLoss().cuda()
    optimizer = optim.Adam(model.parameters(),
                           lr=args.lr,
                           betas=tuple(args.betas),
                           weight_decay=args.weight_decay)
    scheduler = lr_scheduler.StepLR(optimizer,
                                    step_size=args.decay_interval,
                                    gamma=args.lr_decay)

    if args.visualize:
        vis = visualize.Visualizer(args.port)
    else:
        vis = None

    print("Beginning training")
    print("#" * 80)

    for epoch in range(args.start_epoch, args.epochs):
        trainer.train(model,
                      vqa_loader,
                      criterion,
                      optimizer,
                      epoch,
                      args,
                      vis=vis)
        trainer.evaluate(model, val_loader, criterion, epoch, args, vis=vis)

        # Step the LR schedule once per epoch, after the optimizer has run
        # (the call order PyTorch requires since 1.1).
        scheduler.step()

    print("Training complete!")
Example #3
import os.path as osp

import numpy as np
import torch
from torchvision import transforms

# dataset, Models, parse_args, and evaluate are project-local helpers
# assumed to be importable within this repository.


def main():
    args = parse_args()

    # Set the GPU to use
    torch.cuda.set_device(args.gpu)

    vqa_loader = dataset.get_train_dataloader(osp.expanduser(args.annotations),
                                              osp.expanduser(args.questions),
                                              args.images,
                                              args,
                                              raw_images=args.raw_images,
                                              transforms=None)
    # We always use the vocab from the training set
    vocab = vqa_loader.dataset.vocab

    maps = {
        "vocab": vocab,
        "word_to_wid": vqa_loader.dataset.word_to_wid,
        "wid_to_word": vqa_loader.dataset.wid_to_word,
        "ans_to_aid": vqa_loader.dataset.ans_to_aid,
        "aid_to_ans": vqa_loader.dataset.aid_to_ans,
    }
    val_transform = transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406],
                             std=[0.229, 0.224, 0.225])
    ])
    val_loader = dataset.get_val_dataloader(osp.expanduser(
        args.val_annotations),
                                            osp.expanduser(args.val_questions),
                                            args.val_images,
                                            args,
                                            raw_images=args.raw_images,
                                            maps=maps,
                                            vocab=vocab,
                                            shuffle=False,
                                            transforms=val_transform)

    arch = Models[args.arch].value
    model = arch(len(vocab),
                 output_dim=args.top_answer_limit,
                 raw_images=args.raw_images)

    if args.resume:
        state = torch.load(args.resume)
        model.load_state_dict(state["model"])
    else:
        print(
            "No trained model weights provided. Don't expect the answers to be meaningful."
        )

    if torch.cuda.is_available():
        model.cuda()

    with torch.no_grad():
        results = evaluate(model, val_loader)

    for k in results.keys():
        results[k] = np.asarray(results[k])
        # Mean of the per-question 0/1 correctness values for this type.
        acc = results[k].mean()
        print("Accuracy for {0} type answers: \t\t{1}".format(k, acc))