def __init__(self, encoder, gpu_mode, embed_hidden=300, mlp_hidden=512):
        super(BaseModel, self).__init__()

        self.normalize = tv.transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                                 std=[0.229, 0.224, 0.225])

        self.train_transform = tv.transforms.Compose([
            tv.transforms.RandomRotation(10),
            tv.transforms.RandomResizedCrop(224),
            tv.transforms.RandomHorizontalFlip(),
            tv.transforms.ToTensor(),
            self.normalize,
        ])

        self.dev_transform = tv.transforms.Compose([
            tv.transforms.Resize(224),
            tv.transforms.CenterCrop(224),
            tv.transforms.ToTensor(),
            self.normalize,
        ])

        self.encoder = encoder
        self.gpu_mode = gpu_mode
        self.mlp_hidden = mlp_hidden
        #self.verbq_word_count = len(self.encoder.verb_q_words)
        self.n_verbs = self.encoder.get_num_verbs()

        self.verb_module = model_verbq_0.BaseModel(self.encoder, self.gpu_mode)
        self.role_module = model_roles_recqa_noself.BaseModel(
            self.encoder, self.gpu_mode)
        self.verb_module.eval()
        self.role_module.eval()
        '''self.verb_vqa = TopDown(self.n_verbs)
        self.verb_q_emb = nn.Embedding(self.verb_module.verbq_word_count + 1, embed_hidden, padding_idx=self.verb_module.verbq_word_count)
        self.last_class = nn.Linear(self.mlp_hidden*8, self.n_verbs)

        weight_verbqa = copy.deepcopy(self.verb_module.verb_vqa.state_dict())
        weight_emb = copy.deepcopy(self.verb_module.verb_q_emb.state_dict())
        weight_lastclass = copy.deepcopy(self.verb_module.last_class.state_dict())

        self.verb_vqa.load_state_dict(weight_verbqa)
        self.verb_q_emb.load_state_dict(weight_emb)
        self.last_class.load_state_dict(weight_lastclass)'''

        self.updated_verb_module = model_verbq_0.BaseModel(
            self.encoder, self.gpu_mode)

        self.role_maker = nn.Linear(mlp_hidden, mlp_hidden)
        self.real_comb_concat = nn.Linear(mlp_hidden * 2, mlp_hidden)
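
# Hedged usage sketch (not part of the original example): the train/dev
# transforms above are the standard torchvision ImageNet preprocessing. The
# standalone snippet below shows what dev-style preprocessing produces for a
# synthetic PIL image; every name prefixed with _demo is illustrative only.
import torch
import torchvision as tv
from PIL import Image

_demo_transform = tv.transforms.Compose([
    tv.transforms.Resize(224),
    tv.transforms.CenterCrop(224),
    tv.transforms.ToTensor(),
    tv.transforms.Normalize(mean=[0.485, 0.456, 0.406],
                            std=[0.229, 0.224, 0.225]),
])
_demo_img = Image.new('RGB', (256, 256))     # stand-in for a dataset image
_demo_x = _demo_transform(_demo_img)         # float tensor of shape (3, 224, 224)
assert _demo_x.shape == torch.Size([3, 224, 224])
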
    def __init__(self, encoder, gpu_mode, embed_hidden=300, mlp_hidden=512):
        super(BaseModel, self).__init__()

        self.normalize = tv.transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                                 std=[0.229, 0.224, 0.225])

        self.train_transform = tv.transforms.Compose([
            tv.transforms.RandomRotation(10),
            tv.transforms.RandomResizedCrop(224),
            tv.transforms.RandomHorizontalFlip(),
            tv.transforms.ToTensor(),
            self.normalize,
        ])

        self.dev_transform = tv.transforms.Compose([
            tv.transforms.Resize(224),
            tv.transforms.CenterCrop(224),
            tv.transforms.ToTensor(),
            self.normalize,
        ])

        self.encoder = encoder
        self.gpu_mode = gpu_mode
        self.n_roles = self.encoder.get_num_roles()
        self.n_verbs = self.encoder.get_num_verbs()
        self.vocab_size = self.encoder.get_num_labels()
        self.max_role_count = self.encoder.get_max_role_count()
        self.n_role_q_vocab = len(self.encoder.question_words)
        self.verbq_word_count = len(self.encoder.verb_question_words)

        self.conv = vgg16_modified()

        self.verb_module = model_verbq_0.BaseModel(self.encoder, self.gpu_mode)

        #self.verb_lookup = nn.Embedding(self.n_verbs, embed_hidden)
        self.w_emb = nn.Embedding(self.n_role_q_vocab + 1,
                                  embed_hidden,
                                  padding_idx=self.n_role_q_vocab)
        self.roles = TopDown(self.vocab_size)
        self.role_classifier = SimpleClassifier(mlp_hidden, 2 * mlp_hidden,
                                                self.vocab_size, 0.5)

        self.verb_vqa = model_verbq_0.TopDown(self.n_verbs)
        self.verb_q_emb = nn.Embedding(self.verbq_word_count + 1,
                                       embed_hidden,
                                       padding_idx=self.verbq_word_count)
        self.verb_last_class = nn.Linear(mlp_hidden * 8, self.n_verbs)

        self.verb_role_maker = nn.Linear(mlp_hidden, mlp_hidden)
        self.verb_real_comb_concat = nn.Linear(mlp_hidden * 2, mlp_hidden)

        self.conv_hidden = self.conv.base_size()
        self.mlp_hidden = mlp_hidden
        self.embed_hidden = embed_hidden
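
# Hedged sketch (not part of the original example): the question-word embeddings
# above reserve index n_role_q_vocab / verbq_word_count as a padding slot, so
# padded question positions map to a constant zero vector that is never updated
# during training. Illustrated with a toy vocabulary of 5 words; the _demo names
# are illustrative only.
import torch
import torch.nn as nn

_demo_vocab = 5
_demo_emb = nn.Embedding(_demo_vocab + 1, 8, padding_idx=_demo_vocab)
_demo_q = torch.tensor([[0, 3, _demo_vocab, _demo_vocab]])  # question padded to length 4
_demo_out = _demo_emb(_demo_q)                              # shape (1, 4, 8)
assert torch.all(_demo_out[0, 2] == 0) and torch.all(_demo_out[0, 3] == 0)
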
    def __init__(self, encoder,
                 gpu_mode,
                 embed_hidden=300,
                 mlp_hidden=512
                 ):
        super(BaseModel, self).__init__()

        self.normalize = tv.transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])

        self.train_transform = tv.transforms.Compose([
            tv.transforms.RandomRotation(10),
            tv.transforms.RandomResizedCrop(224),
            tv.transforms.RandomHorizontalFlip(),
            tv.transforms.ToTensor(),
            self.normalize,
        ])

        self.dev_transform = tv.transforms.Compose([
            tv.transforms.Resize(224),
            tv.transforms.CenterCrop(224),
            tv.transforms.ToTensor(),
            self.normalize,
        ])

        self.encoder = encoder
        self.gpu_mode = gpu_mode
        self.mlp_hidden = mlp_hidden
        #self.verbq_word_count = len(self.encoder.verb_q_words)
        self.n_verbs = self.encoder.get_num_verbs()

        self.verb_module = model_verbq_0.BaseModel(self.encoder, self.gpu_mode)
        self.role_module = model_roles_recqa_noself.BaseModel(self.encoder, self.gpu_mode)
        self.verb_module.eval()
        self.role_module.eval()

        self.label_small = nn.Linear(mlp_hidden, embed_hidden)
        self.updating_verb_module = model_verbq_0.BaseModel(self.encoder, self.gpu_mode)
        self.dropout = nn.Dropout(0.5)
    def __init__(self, encoder,
                 gpu_mode,
                 embed_hidden=300,
                 mlp_hidden=512
                 ):
        super(BaseModel, self).__init__()

        self.normalize = tv.transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])

        self.train_transform = tv.transforms.Compose([
            tv.transforms.RandomRotation(10),
            tv.transforms.RandomResizedCrop(224),
            tv.transforms.RandomHorizontalFlip(),
            tv.transforms.ToTensor(),
            self.normalize,
        ])

        self.dev_transform = tv.transforms.Compose([
            tv.transforms.Resize(224),
            tv.transforms.CenterCrop(224),
            tv.transforms.ToTensor(),
            self.normalize,
        ])

        self.encoder = encoder
        self.gpu_mode = gpu_mode
        self.mlp_hidden = mlp_hidden
        self.verbq_word_count = len(self.encoder.verb_q_words)
        self.n_verbs = self.encoder.get_num_verbs()

        self.conv = vgg16_modified()

        self.verb_module = model_verbq_0.BaseModel(self.encoder, self.gpu_mode)
        self.role_module = model_roles_recqa_noself.BaseModel(self.encoder, self.gpu_mode)
        self.verb_module.eval()
        self.role_module.eval()



        '''for param in self.verb_module.parameters():
            param.requires_grad = False

        for param in self.role_module.parameters():
            param.requires_grad = False

        for param in self.conv.parameters():
            param.requires_grad = False'''
        self.verb_vqa = TopDown(self.n_verbs)
        self.verb_q_emb = nn.Embedding(self.verbq_word_count + 1, embed_hidden, padding_idx=self.verbq_word_count)
        self.last_class = nn.Linear(self.mlp_hidden*8, self.n_verbs)
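
# Hedged note (not part of the original example): calling .eval() on the
# pretrained verb/role modules only switches dropout/batch-norm layers to
# inference behaviour; it does not stop gradient updates. Freezing, as the
# commented-out block above intends, requires setting requires_grad on the
# parameters. A minimal standalone illustration with _demo names:
import torch.nn as nn

_demo_module = nn.Sequential(nn.Linear(4, 4), nn.Dropout(0.5))
_demo_module.eval()                       # inference mode only; gradients still flow
for _demo_p in _demo_module.parameters():
    _demo_p.requires_grad = False         # this is what actually freezes the weights
assert not any(p.requires_grad for p in _demo_module.parameters())
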
def main():

    import argparse
    parser = argparse.ArgumentParser(
        description="imsitu VSRL. Training, evaluation and prediction.")
    parser.add_argument("--gpuid",
                        default=-1,
                        help="put GPU id > -1 in GPU mode",
                        type=int)
    #parser.add_argument("--command", choices = ["train", "eval", "resume", 'predict'], required = True)
    parser.add_argument('--resume_training',
                        action='store_true',
                        help='Resume training from the model [resume_model]')
    parser.add_argument('--resume_model',
                        type=str,
                        default='',
                        help='Path to the model to resume from')
    parser.add_argument('--verb_module',
                        type=str,
                        default='',
                        help='pretrained verb module')
    parser.add_argument('--role_module',
                        type=str,
                        default='',
                        help='pretrained role module')
    parser.add_argument('--train_role',
                        action='store_true',
                        help='fix CNN and verb module, train role module from scratch')
    parser.add_argument(
        '--finetune_verb',
        action='store_true',
        help='fix CNN, finetune verb module, train role module from scratch')
    parser.add_argument(
        '--finetune_cnn',
        action='store_true',
        help='finetune CNN and verb module, train role module from scratch')
    parser.add_argument('--output_dir',
                        type=str,
                        default='./trained_models',
                        help='Location to output the model')
    parser.add_argument('--evaluate',
                        action='store_true',
                        help='Only run evaluation on the dev set')
    parser.add_argument('--test',
                        action='store_true',
                        help='Only run evaluation on the test set')
    parser.add_argument('--dataset_folder',
                        type=str,
                        default='./imSitu',
                        help='Location of annotations')
    parser.add_argument('--imgset_dir',
                        type=str,
                        default='./resized_256',
                        help='Location of original images')
    parser.add_argument('--frcnn_feat_dir',
                        type=str,
                        help='Location of output from detectron')
    #todo: train role module separately with gt verbs

    args = parser.parse_args()

    batch_size = 640  # note: the DataLoaders below hard-code batch_size=64
    #lr = 5e-6
    lr = 0.0001
    lr_max = 5e-4
    lr_gamma = 0.1
    lr_step = 15
    clip_norm = 0.5
    weight_decay = 1e-4
    n_epoch = 500
    n_worker = 3

    #dataset_folder = 'imSitu'
    #imgset_folder = 'resized_256'
    dataset_folder = args.dataset_folder
    imgset_folder = args.imgset_dir

    print('model spec: top-down attention with role questions')

    train_set = json.load(open(dataset_folder + "/updated_train_new.json"))
    imsitu_roleq = json.load(open("imsitu_data/imsitu_questions_prev.json"))
    encoder = imsitu_encoder(train_set, imsitu_roleq)

    model = model_verbq_0.BaseModel(encoder, args.gpuid)

    # To group up the features
    #cnn_features, role_features = utils.group_features_noun(model)
    cnn_features, role_features = utils.group_features_noun(model)

    train_set = imsitu_loader_roleq_updated(imgset_folder, train_set, encoder,
                                            model.train_preprocess())

    train_loader = torch.utils.data.DataLoader(train_set,
                                               batch_size=64,
                                               shuffle=True,
                                               num_workers=n_worker)

    dev_set = json.load(open(dataset_folder + "/dev.json"))
    dev_set = imsitu_loader_roleq_updated(imgset_folder, dev_set, encoder,
                                          model.dev_preprocess())
    dev_loader = torch.utils.data.DataLoader(dev_set,
                                             batch_size=64,
                                             shuffle=True,
                                             num_workers=n_worker)

    test_set = json.load(open(dataset_folder + "/test.json"))
    test_set = imsitu_loader_roleq_updated(imgset_folder, test_set, encoder,
                                           model.dev_preprocess())
    test_loader = torch.utils.data.DataLoader(test_set,
                                              batch_size=64,
                                              shuffle=True,
                                              num_workers=n_worker)

    traindev_set = json.load(open(dataset_folder + "/dev.json"))
    traindev_set = imsitu_loader_roleq_updated(imgset_folder, traindev_set,
                                               encoder, model.dev_preprocess())
    traindev_loader = torch.utils.data.DataLoader(traindev_set,
                                                  batch_size=8,
                                                  shuffle=True,
                                                  num_workers=n_worker)

    #utils.load_net(args.verb_module, [model.verb_module])
    #utils.load_net(args.role_module, [model.role_module])
    model_name = 'train_full'

    if not os.path.exists(args.output_dir):
        os.mkdir(args.output_dir)

    torch.manual_seed(1234)
    if args.gpuid >= 0:
        #print('GPU enabled')
        model.cuda()
        torch.cuda.manual_seed(1234)
        torch.backends.cudnn.deterministic = True

    # Two parameter groups: CNN features train at 5e-5, the remaining role
    # features use the default learning rate of 1e-3.
    optimizer = torch.optim.Adam(
        [{'params': cnn_features, 'lr': 5e-5},
         {'params': role_features}],
        lr=1e-3)

    #optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)
    #scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=lr_step, gamma=lr_gamma)
    #gradient clipping, grad check
    scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.9)

    if args.evaluate:
        top1, top5, val_loss = eval(model,
                                    dev_loader,
                                    encoder,
                                    args.gpuid,
                                    write_to_file=True)

        top1_avg = top1.get_average_results()
        top5_avg = top5.get_average_results()

        avg_score = top1_avg["verb"] + top1_avg["value"] + top1_avg["value-all"] + top5_avg["verb"] + \
                    top5_avg["value"] + top5_avg["value-all"] + top5_avg["value*"] + top5_avg["value-all*"]
        avg_score /= 8

        print('Dev average :{:.2f} {} {}'.format(
            avg_score * 100, utils.format_dict(top1_avg, '{:.2f}', '1-'),
            utils.format_dict(top5_avg, '{:.2f}', '5-')))

        #write results to csv file
        role_dict = top1.role_dict
        fail_val_all = top1.value_all_dict
        pass_val_dict = top1.vall_all_correct

        with open('role_pred_data.json', 'w') as fp:
            json.dump(role_dict, fp, indent=4)

        with open('fail_val_all.json', 'w') as fp:
            json.dump(fail_val_all, fp, indent=4)

        with open('pass_val_all.json', 'w') as fp:
            json.dump(pass_val_dict, fp, indent=4)

        print('Writing predictions to file completed!')

    elif args.test:
        top1, top5, val_loss = eval(model,
                                    test_loader,
                                    encoder,
                                    args.gpuid,
                                    write_to_file=True)

        top1_avg = top1.get_average_results()
        top5_avg = top5.get_average_results()

        avg_score = top1_avg["verb"] + top1_avg["value"] + top1_avg["value-all"] + top5_avg["verb"] + \
                    top5_avg["value"] + top5_avg["value-all"] + top5_avg["value*"] + top5_avg["value-all*"]
        avg_score /= 8

        print('Test average :{:.2f} {} {}'.format(
            avg_score * 100, utils.format_dict(top1_avg, '{:.2f}', '1-'),
            utils.format_dict(top5_avg, '{:.2f}', '5-')))

    else:

        print('Model training started!')
        train(model, train_loader, dev_loader, traindev_loader, optimizer,
              scheduler, n_epoch, args.output_dir, encoder, args.gpuid,
              clip_norm, lr_max, model_name, args)
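
# Hedged sketch (not the project's actual train() implementation): how the
# per-group Adam optimizer, gradient clipping at clip_norm, and the ExponentialLR
# schedule configured in main() are typically combined per epoch. A dummy linear
# model and random data stand in for the real model and train_loader so the
# snippet runs standalone.
def _sketch_epoch_loop(n_epoch=2, clip_norm=0.5):
    import torch
    demo_model = torch.nn.Linear(8, 2)
    optimizer = torch.optim.Adam(demo_model.parameters(), lr=1e-3)
    scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.9)
    for _ in range(n_epoch):
        for _ in range(3):               # stands in for iterating train_loader
            optimizer.zero_grad()
            loss = demo_model(torch.randn(4, 8)).pow(2).mean()
            loss.backward()
            torch.nn.utils.clip_grad_norm_(demo_model.parameters(), clip_norm)
            optimizer.step()
        scheduler.step()                 # multiply every param-group lr by gamma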