Example #1
def debug_show_similarity_with_manually_created_examples(
        model_path, data_path=None, split='test'):
    checkpoint = torch.load(model_path)
    opt = checkpoint['opt']
    opt.use_external_captions = False
    if data_path is not None:
        opt.data_path = data_path

    # load vocabulary used by the model
    with open(os.path.join(opt.vocab_path, '%s_vocab.pkl' % opt.data_name),
              'rb') as f:
        vocab = pickle.load(f)
    opt.vocab_size = len(vocab)

    # construct model
    model = VSE(opt)

    # load model state
    model.load_state_dict(checkpoint['model'])

    print('Loading dataset')
    data_loader = get_test_loader(split, opt.data_name, vocab, opt.crop_size,
                                  opt.batch_size, opt.workers, opt)
    img_embs, cap_embs = encode_data(model, data_loader)
    img_embs = img_embs[:100]
    cap_embs = cap_embs[:100]
    data_loader_ex_0 = get_text_loader(split, opt.data_name, vocab,
                                       opt.batch_size, opt.workers, opt,
                                       'manually_ex_%d' % 0)
    encoding_0 = encode_data(model, data_loader_ex_0)[1]
    data_loader_ex_1 = get_text_loader(split, opt.data_name, vocab,
                                       opt.batch_size, opt.workers, opt,
                                       'manually_ex_%d' % 1)
    encoding_1 = encode_data(model, data_loader_ex_1)[1]
    print('Computing results...')

    # compute similarity
    result = list()
    result_0 = list()
    result_1 = list()

    npts = img_embs.shape[0] // 5
    for index in range(npts):
        # Get query image
        im = img_embs[5 * index].reshape(1, img_embs.shape[1])

        # Compute scores
        if opt.measure == 'order':
            raise Exception('Measure order not supported.')
        else:
            result.append(numpy.dot(im, cap_embs.T).flatten())
            result_0.append(numpy.dot(im, encoding_0.T).flatten())
            result_1.append(numpy.dot(im, encoding_1.T).flatten())
    torch.save({
        'orig': result,
        'Tete': result_0,
        'Haoyue': result_1
    }, 'shy_runs/debug.pt')
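The dict written above can be reloaded for offline inspection. A minimal sketch, assuming only the file layout produced by this example (the per-key analysis is illustrative):

import torch

saved = torch.load('shy_runs/debug.pt')
for key in ('orig', 'Tete', 'Haoyue'):
    rows = saved[key]                      # list of 1-D similarity rows, one per query image
    print(key, len(rows), rows[0].shape)   # number of queries and captions scored per query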
Example #2
def eval_with_single_extended(model_path,
                              data_path=None,
                              data_name=None,
                              split='test',
                              backup_vec_ex=None):
    checkpoint = torch.load(model_path)
    opt = checkpoint['opt']
    opt.use_external_captions = False
    if data_path is not None:
        opt.data_path = data_path
    if data_name is not None:
        opt.data_name = data_name

    # load vocabulary used by the model
    with open(os.path.join(opt.vocab_path, '%s_vocab.pkl' % opt.data_name),
              'rb') as f:
        vocab = pickle.load(f)
    opt.vocab_size = len(vocab)

    # construct model
    model = VSE(opt)

    # load model state
    model.load_state_dict(checkpoint['model'])

    print('Loading dataset')
    data_loader = get_test_loader(split, opt.data_name, vocab, opt.crop_size,
                                  opt.batch_size, opt.workers, opt)
    img_embs, cap_embs = encode_data(model, data_loader)
    if backup_vec_ex is None:
        cap_embs_ex = list()
        for i in range(img_embs.shape[0]):
            data_loader_ex = get_text_loader(split, opt.data_name, vocab,
                                             opt.batch_size, opt.workers, opt,
                                             'ex/%d' % i)
            encoding = encode_data(model, data_loader_ex)[1]
            if encoding is not None:
                cap_embs_ex.append(encoding.copy())
            else:
                cap_embs_ex.append(np.zeros(cap_embs[:1].shape))
            print('Caption Embedding: %d' % i)
        # torch.save(cap_embs_ex, 'data/coco_precomp/cap_embs_ex.pth')
    else:
        cap_embs_ex = torch.load(backup_vec_ex)
    print('Computing results...')

    r, rt = i2t_split(img_embs,
                      cap_embs,
                      cap_embs_ex,
                      measure=opt.measure,
                      return_ranks=True)
    ar = (r[0] + r[1] + r[2]) / 3
    print("Average i2t Recall: %.1f" % ar)
    print("Image to text: %.1f\t%.1f\t%.1f\t%.1f\t%.1f" % r)
    torch.save({'rt': rt}, model_path[:model_path.find('model_best')] +
               'ranks_single_extended.pth.tar')
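i2t_split itself is not shown in this excerpt. For orientation, a generic sketch of how image-to-text Recall@K is computed from such embeddings under the standard 5-captions-per-image layout; how i2t_split folds the extended captions into the candidate pool is not reproduced here:

import numpy as np

def i2t_recall(img_embs, cap_embs, ks=(1, 5, 10)):
    # Generic sketch, not the i2t_split used above: rank all captions for each
    # query image and report how often a ground-truth caption lands in the top K.
    npts = img_embs.shape[0] // 5
    ranks = np.zeros(npts)
    for index in range(npts):
        im = img_embs[5 * index].reshape(1, -1)
        scores = np.dot(im, cap_embs.T).flatten()
        order = np.argsort(scores)[::-1]
        # best rank among the 5 ground-truth captions of this image
        ranks[index] = min(np.where(order == i)[0][0]
                           for i in range(5 * index, 5 * index + 5))
    return tuple(100.0 * np.mean(ranks < k) for k in ks)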
Example #3
    def post_init(self):
        checkpoint = torch.load(self.path,
                                map_location=torch.device('cpu' if not self.on_gpu else 'cuda'))
        opt = checkpoint['opt']
        with open(self.vocab_path, 'rb') as f:
            self.vocab = CustomUnpickler(f).load()

        opt.vocab_size = len(self.vocab)
        model = VSE(opt)
        model.load_state_dict(checkpoint['model'])
        model.txt_enc.eval()
        self.model = model.txt_enc
        del model.img_enc
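CustomUnpickler is referenced here but not defined in this excerpt. A plausible minimal version is a pickle.Unpickler subclass that remaps the class path recorded in the vocabulary pickle; the concrete mapping below is an assumption:

import pickle

class CustomUnpickler(pickle.Unpickler):
    def find_class(self, module, name):
        # Hypothetical remap: resolve Vocabulary from the local vocab module,
        # regardless of the module path recorded when the pickle was written.
        if name == 'Vocabulary':
            from vocab import Vocabulary
            return Vocabulary
        return super().find_class(module, name)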
Example #4
def eval_with_manually_extended(model_path, data_path=None, split='test'):
    checkpoint = torch.load(model_path)
    opt = checkpoint['opt']
    opt.use_external_captions = False
    if data_path is not None:
        opt.data_path = data_path

    # load vocabulary used by the model
    with open(os.path.join(opt.vocab_path, '%s_vocab.pkl' % opt.data_name),
              'rb') as f:
        vocab = pickle.load(f)
    opt.vocab_size = len(vocab)

    # construct model
    model = VSE(opt)

    # load model state
    model.load_state_dict(checkpoint['model'])

    print('Loading dataset')
    data_loader = get_test_loader(split, opt.data_name, vocab, opt.crop_size,
                                  opt.batch_size, opt.workers, opt)
    img_embs, cap_embs = encode_data(model, data_loader)
    img_embs = img_embs[:100]
    cap_embs = cap_embs[:100]
    cap_embs_ex = list()
    data_loader_ex_0 = get_text_loader(split, opt.data_name, vocab,
                                       opt.batch_size, opt.workers, opt,
                                       'manually_ex_%d' % 0)
    encoding_0 = encode_data(model, data_loader_ex_0)[1]
    data_loader_ex_1 = get_text_loader(split, opt.data_name, vocab,
                                       opt.batch_size, opt.workers, opt,
                                       'manually_ex_%d' % 1)
    encoding_1 = encode_data(model, data_loader_ex_1)[1]
    for i in range(100):
        cap_emb = np.concatenate(
            (encoding_0[i * 2:i * 2 + 2], encoding_1[i * 2:i * 2 + 2]), axis=0)
        cap_embs_ex.append(cap_emb)
    print('Computing results...')

    r, rt = i2t_split(img_embs,
                      cap_embs,
                      cap_embs_ex,
                      measure=opt.measure,
                      return_ranks=True)
    # r, rt = i2t(img_embs, cap_embs, measure=opt.measure, return_ranks=True)
    ar = (r[0] + r[1] + r[2]) / 3
    print("Average i2t Recall: %.1f" % ar)
    print("Image to text: %.1f %.1f %.1f %.1f %.1f" % r)
    torch.save({'rt': rt}, model_path[:model_path.find('model_best')] +
               'ranks_manually_extended_1.pth.tar')
Example #5
class FeatureExtractor(object):
    def __init__(self, checkpoint, image_encoder, dataset):
        self.load_checkpoint(checkpoint)
        self.image_encoder = image_encoder
        self.dataset = dataset

    def load_checkpoint(self, checkpoint):
        checkpoint = torch.load(checkpoint)
        opt = checkpoint['opt']
        opt.use_external_captions = False
        vocab = Vocab.from_pickle(pjoin(opt.vocab_path, '%s_vocab.pkl' % opt.data_name))
        opt.vocab_size = len(vocab)

        from model import VSE
        self.model = VSE(opt)
        self.model.load_state_dict(checkpoint['model'])
        self.projector = vocab

        self.model.img_enc.eval()
        self.model.txt_enc.eval()
        for p in self.model.img_enc.parameters():
            p.requires_grad = False
        for p in self.model.txt_enc.parameters():
            p.requires_grad = False

    def __call__(self, ind):
        raw_img, img, img_embedding, cap, cap_ext = self.dataset[ind]
        img_embedding_precomp = self.model.img_enc(as_cuda(as_variable(img_embedding).unsqueeze(0)))

        img = as_variable(img)
        img.requires_grad = True
        img_embedding_a = img_embedding = self.image_encoder(as_cuda(img.unsqueeze(0)))
        img_embedding = self.model.img_enc(img_embedding)

        txt = [cap]
        txt.extend(cap_ext)
        txt_embeddings, txt_var = self.enc_txt(txt)

        return Record(
                raw_img, cap, cap_ext,
                img, img_embedding, img_embedding_precomp,
                txt_var, txt_embeddings[0], txt_embeddings[1:]
        )

    def enc_txt(self, caps):
        sents, lengths, _, inv = _prepare_batch(caps, self.projector)
        inv = var_with(as_variable(inv), sents)
        out, x = self.model.txt_enc.forward(sents, lengths, True)
        return out[inv], x
Example #6
    def post_init(self):
        if self.pool_strategy is not None:
            self.pool_fn = getattr(np, self.pool_strategy)

        checkpoint = torch.load(
            self.path,
            map_location=torch.device('cpu' if not self.on_gpu else 'cuda'))
        opt = checkpoint['opt']

        model = VSE(opt)
        model.load_state_dict(checkpoint['model'])
        model.img_enc.eval()
        self.model = model.img_enc
        self.to_device(self.model)
        del model.txt_enc
Example #8
def eval_with_extended(model_path,
                       data_path=None,
                       data_name=None,
                       split='test'):
    checkpoint = torch.load(model_path)
    opt = checkpoint['opt']
    opt.use_external_captions = True
    opt.negative_number = 5
    if data_path is not None:
        opt.data_path = data_path
    if data_name is not None:
        opt.data_name = data_name

    # load vocabulary used by the model
    with open(os.path.join(opt.vocab_path, '%s_vocab.pkl' % opt.data_name),
              'rb') as f:
        vocab = pickle.load(f)
    opt.vocab_size = len(vocab)
    opt.use_external_captions = True

    # construct model
    model = VSE(opt)

    # load model state
    model.load_state_dict(checkpoint['model'])

    print('Loading dataset')
    data_loader = get_test_loader(split, opt.data_name, vocab, opt.crop_size,
                                  opt.batch_size, opt.workers, opt)
    print('Computing results...')
    img_embs, cap_embs = encode_data(model, data_loader)
    print('Images: %d, Captions: %d' %
          (img_embs.shape[0] // 5, cap_embs.shape[0]))

    r, rt = i2t_text_only(img_embs,
                          cap_embs,
                          measure=opt.measure,
                          return_ranks=True)
    ar = (r[0] + r[1] + r[2]) / 3
    print("Average i2t Recall: %.1f" % ar)
    print("Image to text: %.1f\t%.1f\t%.1f\t%.1f\t%.1f" % r)
    torch.save({'rt': rt}, model_path[:model_path.find('model_best')] +
               'ranks_extended.pth.tar')
Example #9
def evalrank(model_path, data_path=None, split='dev', fold5=False, lang=None):
    """
    Evaluate a trained model on either dev or test. If `fold5=True`, 5-fold
    cross-validation is done (only for MSCOCO). Otherwise, the full data is
    used for evaluation.
    """
    # load model and options
    checkpoint = torch.load(model_path)
    opt = checkpoint['opt']
    if data_path is not None:
        opt.data_path = data_path

    # Never use undersample when testing.
    opt.undersample = False
    print(opt)

    # Load vocabulary used by the model
    if opt.data_name != "m30k":
        with open(os.path.join(opt.vocab_path, '%s_vocab.pkl' % opt.data_name),
                  'rb') as f:
            vocab = pickle.load(f)
    else:
        with open(os.path.join(opt.logger_name, 'vocab.pkl'), 'rb') as f:
            vocab = pickle.load(f)
    opt.vocab_size = len(vocab)

    # construct model
    model = VSE(opt)

    # load model state
    model.load_state_dict(checkpoint['model'])
    print('Loading dataset')
    if lang is not None:
        opt.lang = lang
    langs = opt.lang.split('-')
    data_loader = get_test_loader(split, opt.data_name, vocab, opt.crop_size,
                                  opt.batch_size, opt.workers, opt)
    if len(langs) > 1:
        for loader, loader_lang in zip(data_loader, langs):
            run_eval(model, loader, fold5, opt, loader_lang)
    else:
        run_eval(model, data_loader, fold5, opt, opt.lang)
Example #10
def evalrank(model_path, data_path=None, split='dev', fold5=False):
    """
    Evaluate a trained model on either dev or test. If `fold5=True`, 5-fold
    cross-validation is done (only for MSCOCO). Otherwise, the full data is
    used for evaluation.
    """
    # load model and options
    checkpoint = torch.load(model_path)
    opt = checkpoint['opt']
    if data_path is not None:
        opt.data_path = data_path

    print(opt)

    # load vocabulary used by the model
    with open(os.path.join(opt.vocab_path, '%s_vocab.pkl' % opt.data_name),
              'rb') as f:
        vocab = pickle.load(f)
    opt.vocab_size = len(vocab)

    # construct model
    model = VSE(opt)

    # load model state
    model.load_state_dict(checkpoint['model'])

    print('Loading dataset')
    data_loader = get_test_loader(split, opt.data_name, vocab, opt.crop_size,
                                  opt.batch_size, opt.workers, opt)

    print('Computing results...')
    img_embs, cap_embs = encode_data(model, data_loader)
    print('Images: %d, Captions: %d' %
          (img_embs.shape[0] // 5, cap_embs.shape[0]))

    scores = numpy.sum(numpy.multiply(img_embs, cap_embs), -1)
    print(scores.shape)
    print('scores:', np.mean(scores))
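The elementwise multiply-and-sum above is a row-wise dot product; for L2-normalized embeddings (the usual VSE setup) it equals cosine similarity. A small self-contained check:

import numpy as np

a = np.random.randn(4, 8)
b = np.random.randn(4, 8)
a /= np.linalg.norm(a, axis=1, keepdims=True)  # L2-normalize rows
b /= np.linalg.norm(b, axis=1, keepdims=True)

row_dots = np.sum(np.multiply(a, b), -1)       # same expression as above
cosines = np.einsum('ij,ij->i', a, b)          # explicit row-wise cosine
assert np.allclose(row_dots, cosines)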
Example #11
def main():
    # Hyper Parameters
    parser = argparse.ArgumentParser()
    parser.add_argument('--data_path', default='data',
                        help='path to datasets')
    parser.add_argument('--data_name', default='f30k',
                        help='{coco,f8k,f30k,10crop}_precomp|coco|f8k|f30k')
    parser.add_argument('--vocab_path', default='vocab',
                        help='Path to saved vocabulary pickle files.')
    parser.add_argument('--margin', default=0.2, type=float,
                        help='Rank loss margin.')
    parser.add_argument('--num_epochs', default=30, type=int,
                        help='Number of training epochs.')
    parser.add_argument('--batch_size', default=128, type=int,
                        help='Size of a training mini-batch.')
    parser.add_argument('--word_dim', default=300, type=int,
                        help='Dimensionality of the word embedding.')
    parser.add_argument('--embed_size', default=1024, type=int,
                        help='Dimensionality of the joint embedding.')
    parser.add_argument('--grad_clip', default=2., type=float,
                        help='Gradient clipping threshold.')
    parser.add_argument('--crop_size', default=224, type=int,
                        help='Size of an image crop as the CNN input.')
    parser.add_argument('--num_layers', default=1, type=int,
                        help='Number of GRU layers.')
    parser.add_argument('--learning_rate', default=2e-4, type=float,
                        help='Initial learning rate.')
    parser.add_argument('--lr_update', default=15, type=int,
                        help='Number of epochs to update the learning rate.')
    parser.add_argument('--workers', default=10, type=int,
                        help='Number of data loader workers.')
    parser.add_argument('--log_step', default=100, type=int,
                        help='Number of steps to print and record the log.')
    parser.add_argument('--val_step', default=500, type=int,
                        help='Number of steps to run validation.')
    parser.add_argument('--logger_name', default='runs/test',
                        help='Path to save the model and Tensorboard log.')
    parser.add_argument('--resume', default='', type=str, metavar='PATH',
                        help='path to latest checkpoint (default: none)')
    parser.add_argument('--img_dim', default=2048, type=int,
                        help='Dimensionality of the image embedding.')
    parser.add_argument('--finetune', action='store_true',
                        help='Fine-tune the image encoder.')
    parser.add_argument('--use_restval', action='store_true',
                        help='Use the restval data for training on MSCOCO.')
    parser.add_argument('--reset_train', action='store_true',
                        help='Ensure the training is always done in '
                        'train mode (Not recommended).')
    parser.add_argument('--K', default=2, type=int, help='num of JSR.')
    parser.add_argument('--feature_path', default='data/joint-pretrain/flickr30k/region_feat_gvd_wo_bgd/trainval/',
                        type=str, help='path to the pre-computed image features')
    parser.add_argument('--region_bbox_file',
                        default='data/joint-pretrain/flickr30k/region_feat_gvd_wo_bgd/flickr30k_detection_vg_thresh0.2_feat_gvd_checkpoint_trainvaltest.h5',
                        type=str, help='path to the region_bbox_file(.h5)')
    opt = parser.parse_args()
    print(opt)

    logging.basicConfig(format='%(asctime)s %(message)s', level=logging.INFO)
    tb_logger.configure(opt.logger_name, flush_secs=5)

    # Load Vocabulary Wrapper
    vocab = pickle.load(open(os.path.join(
        opt.vocab_path, '%s_vocab.pkl' % opt.data_name), 'rb'))
    opt.vocab_size = len(vocab)

    # Load data loaders
    train_loader, val_loader = data.get_loaders(
        opt.data_name, vocab, opt.crop_size, opt.batch_size, opt.workers, opt)

    # Construct the model
    model = VSE(opt)
    best_rsum = 0
    # optionally resume from a checkpoint
    if opt.resume:
        if os.path.isfile(opt.resume):
            print("=> loading checkpoint '{}'".format(opt.resume))
            checkpoint = torch.load(opt.resume)
            start_epoch = checkpoint['epoch']
            best_rsum = checkpoint['best_rsum']
            model.load_state_dict(checkpoint['model'])
            # Eiters is used to show logs as the continuation of another
            # training
            model.Eiters = checkpoint['Eiters']
            print("=> loaded checkpoint '{}' (epoch {}, best_rsum {})"
                  .format(opt.resume, start_epoch, best_rsum))
            validate(opt, val_loader, model)
            del checkpoint
        else:
            print("=> no checkpoint found at '{}'".format(opt.resume))
    # Train the Model

    for epoch in range(opt.num_epochs):
        adjust_learning_rate(opt, model.optimizer, epoch)

        # train for one epoch
        train(opt, train_loader, model, epoch, val_loader)

        # evaluate on validation set
        rsum = validate(opt, val_loader, model)

        # remember best R@ sum and save checkpoint
        is_best = rsum > best_rsum
        best_rsum = max(rsum, best_rsum)
        save_checkpoint({
            'epoch': epoch + 1,
            'model': model.state_dict(),
            'best_rsum': best_rsum,
            'opt': opt,
            'Eiters': model.Eiters,
        }, is_best, epoch, prefix=opt.logger_name + '/')
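save_checkpoint is assumed by this and the following training loops. A minimal sketch consistent with the call above; the filenames are hypothetical, and other examples pass slightly different keyword arguments (epoch=, save_all=):

import shutil
import torch

def save_checkpoint(state, is_best, epoch, prefix=''):
    # Write the latest state and keep a copy whenever it is the best so far.
    filename = prefix + 'checkpoint_%d.pth.tar' % epoch
    torch.save(state, filename)
    if is_best:
        shutil.copyfile(filename, prefix + 'model_best.pth.tar')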
Example #12
def evalrank(model_path1,
             model_path2,
             data_path=None,
             split='dev',
             fold5=False,
             shared_space='both'):
    """
    Evaluate a trained model.
    """
    # load model and options
    checkpoint = torch.load(model_path1)
    opt = checkpoint['opt']
    print(opt)

    if data_path is not None:
        opt.data_path = data_path
    opt.vocab_path = "./vocab/"
    # load vocabulary used by the model
    vocab = pickle.load(open(os.path.join(opt.vocab_path, 'vocab.pkl'), 'rb'))

    opt.vocab_size = len(vocab)

    # construct model
    model = VSE(opt)

    # load model state
    model.load_state_dict(checkpoint['model'])

    print('Loading dataset')
    data_loader = get_test_loader1(split, opt.data_name, vocab, opt.crop_size,
                                   opt.batch_size, opt.workers, opt)

    print('Computing results...')
    img_embs1, cap_embs1 = encode_data(model, data_loader)

    # load second model and options
    checkpoint2 = torch.load(model_path2)
    opt = checkpoint2['opt']
    print(opt)

    if data_path is not None:
        opt.data_path = data_path
    opt.vocab_path = "./vocab/"
    # load vocabulary used by the model
    vocab = pickle.load(open(os.path.join(opt.vocab_path, 'vocab.pkl'), 'rb'))

    opt.vocab_size = len(vocab)

    # construct model
    model2 = VSE(opt)

    # load model state
    model2.load_state_dict(checkpoint2['model'])

    print('Loading dataset')
    data_loader = get_test_loader2(split, opt.data_name, vocab, opt.crop_size,
                                   opt.batch_size, opt.workers, opt)

    print('Computing results...')
    img_embs2, cap_embs2 = encode_data(model2, data_loader)

    print('Images: %d, Captions: %d' %
          (img_embs2.shape[0] // 20, cap_embs2.shape[0]))

    # no cross-validation, full evaluation
    r, rt = i2t(img_embs1,
                cap_embs1,
                img_embs2,
                cap_embs2,
                shared_space,
                measure=opt.measure,
                return_ranks=True)
    ri, rti = t2i(img_embs1,
                  cap_embs1,
                  img_embs2,
                  cap_embs2,
                  shared_space,
                  measure=opt.measure,
                  return_ranks=True)
    ar = (r[0] + r[1] + r[2]) / 3
    ari = (ri[0] + ri[1] + ri[2]) / 3
    rsum = r[0] + r[1] + r[2] + ri[0] + ri[1] + ri[2]
    print("rsum: %.1f" % rsum)
    print("Average i2t Recall: %.1f" % ar)
    print("Image to text: %.1f %.1f %.1f %.1f %.1f" % r)
    print("Average t2i Recall: %.1f" % ari)
    print("Text to image: %.1f %.1f %.1f %.1f %.1f" % ri)

    torch.save({'rt': rt, 'rti': rti}, 'ranks.pth.tar')
Example #13
def main():
    # Hyper Parameters
    parser = argparse.ArgumentParser()
    parser.add_argument('--data_path',
                        default='/data/stud/jorgebjorn/data/',
                        help='path to datasets')
    parser.add_argument('--data_name',
                        default='f8k_precomp',
                        help='{coco,f8k,f30k,10crop}_precomp|coco|f8k|f30k')
    parser.add_argument('--vocab_path',
                        default='/data/stud/jorgebjorn/data/vocab/',
                        help='Path to saved vocabulary pickle files.')
    parser.add_argument('--margin',
                        default=0.2,
                        type=float,
                        help='Rank loss margin.')
    parser.add_argument('--num_epochs',
                        default=30,
                        type=int,
                        help='Number of training epochs.')
    parser.add_argument('--batch_size',
                        default=128,
                        type=int,
                        help='Size of a training mini-batch.')
    parser.add_argument('--word_dim',
                        default=300,
                        type=int,
                        help='Dimensionality of the word embedding.')
    parser.add_argument('--embed_size',
                        default=1024,
                        type=int,
                        help='Dimensionality of the joint embedding.')
    parser.add_argument('--grad_clip',
                        default=2.,
                        type=float,
                        help='Gradient clipping threshold.')
    parser.add_argument('--crop_size',
                        default=224,
                        type=int,
                        help='Size of an image crop as the CNN input.')
    parser.add_argument('--num_layers',
                        default=1,
                        type=int,
                        help='Number of GRU layers.')
    parser.add_argument('--learning_rate',
                        default=.0002,
                        type=float,
                        help='Initial learning rate.')
    parser.add_argument('--lr_update',
                        default=15,
                        type=int,
                        help='Number of epochs to update the learning rate.')
    parser.add_argument('--workers',
                        default=10,
                        type=int,
                        help='Number of data loader workers.')
    parser.add_argument('--log_step',
                        default=10,
                        type=int,
                        help='Number of steps to print and record the log.')
    parser.add_argument('--val_step',
                        default=500,
                        type=int,
                        help='Number of steps to run validation.')
    parser.add_argument(
        '--logger_name',
        default='/data/stud/jorgebjorn/runs/{}/{}'.format(
            getpass.getuser(),
            datetime.datetime.now().strftime("%d-%m-%y_%H:%M")),
        help='Path to save the model and Tensorboard log.')
    parser.add_argument('--selection',
                        default='uncertainty',
                        help='Active learning selection algorithm')
    parser.add_argument('--primary',
                        default='images',
                        help='Image- or caption-centric active learning')
    parser.add_argument('--resume',
                        default='',
                        type=str,
                        metavar='PATH',
                        help='path to latest checkpoint (default: none)')
    parser.add_argument('--max_violation',
                        action='store_true',
                        help='Use max instead of sum in the rank loss.')
    parser.add_argument('--img_dim',
                        default=4096,
                        type=int,
                        help='Dimensionality of the image embedding.')
    parser.add_argument('--finetune',
                        action='store_true',
                        help='Fine-tune the image encoder.')
    parser.add_argument('--device',
                        default=0,
                        type=int,
                        help='which gpu to use')
    parser.add_argument('--cnn_type',
                        default='vgg19',
                        help="""The CNN used for image encoder
                        (e.g. vgg19, resnet152)""")
    parser.add_argument('--use_restval',
                        action='store_true',
                        help='Use the restval data for training on MSCOCO.')
    parser.add_argument('--measure',
                        default='cosine',
                        help='Similarity measure used (cosine|order)')
    parser.add_argument('--use_abs',
                        action='store_true',
                        help='Take the absolute value of embedding vectors.')
    parser.add_argument('--no_imgnorm',
                        action='store_true',
                        help='Do not normalize the image embeddings.')
    parser.add_argument('--reset_train',
                        action='store_true',
                        help='Ensure the training is always done in '
                        'train mode (Not recommended).')
    parser.add_argument('--no_log',
                        action='store_true',
                        default=False,
                        help='Disable logging')
    opt = parser.parse_args()
    opt.logger_name += "_" + opt.selection + "_" + opt.primary
    print(opt)
    if torch.cuda.is_available():
        torch.cuda.set_device(opt.device)

    # Setup tensorboard logger
    if not opt.no_log:
        logging.basicConfig(format='%(asctime)s %(message)s',
                            level=logging.INFO)
        tb_logger.configure(opt.logger_name, flush_secs=5)

    # Load Vocabulary Wrapper
    vocab = pickle.load(
        open(os.path.join(opt.vocab_path, '%s_vocab.pkl' % opt.data_name),
             'rb'))
    opt.vocab_size = len(vocab)

    # Load data loaders
    active_loader, train_loader, val_loader = data.get_loaders(
        opt.data_name, vocab, opt.crop_size, opt.batch_size, opt.workers, opt)

    # Construct the model
    model = VSE(opt)
    if torch.cuda.is_available():
        model.cuda()

    # optionally resume from a checkpoint
    if opt.resume:
        if os.path.isfile(opt.resume):
            print("=> loading checkpoint '{}'".format(opt.resume))
            checkpoint = torch.load(opt.resume)
            start_epoch = checkpoint['epoch']
            best_rsum = checkpoint['best_rsum']
            model.load_state_dict(checkpoint['model'])
            # Eiters is used to show logs as the continuation of another
            # training
            model.Eiters = checkpoint['Eiters']
            print("=> loaded checkpoint '{}' (epoch {}, best_rsum {})".format(
                opt.resume, start_epoch, best_rsum))
            validate(opt, val_loader, model)
        else:
            print("=> no checkpoint found at '{}'".format(opt.resume))

    n_rounds = 234

    if opt.selection == "uncertainty":
        selection = select_uncertainty
    elif opt.selection == "margin":
        selection = select_margin
    elif opt.selection == "random":
        selection = select_random
    elif opt.selection == "hybrid":
        selection = select_hybrid
    elif opt.selection == "all":
        selection = select_all
    elif opt.selection == "capsim":
        selection = select_captionSimilarity
    else:
        selection = select_uncertainty

    for r in range(n_rounds):
        best_indices = selection(r, model, train_loader)

        for index in best_indices:
            active_loader.dataset.add_single(train_loader.dataset[index][0],
                                             train_loader.dataset[index][1])

        train_loader.dataset.delete_indices(best_indices)

        # Train the Model
        print("Training on {} items ".format(len(active_loader)))

        # Reset the model
        model = VSE(opt)
        if torch.cuda.is_available():
            model.cuda()

        best_rsum = 0
        for epoch in range(opt.num_epochs):
            adjust_learning_rate(opt, model.optimizer, epoch)

            # train for one epoch
            train(opt, active_loader, model, epoch, val_loader)

            # evaluate on validation set
        rsum = validate(opt, val_loader, model, not opt.no_log, r)
Example #14
def main():
  # Hyper Parameters

  torch.cuda.set_device(opt.gpu_id)

  tb_logger.configure(opt.logger_name, flush_secs=5)
  logging.basicConfig(format='%(asctime)s %(message)s', level=logging.INFO, filename=opt.logger_name+'/log.log')
  console = logging.StreamHandler()
  console.setLevel(logging.INFO)
  formatter = logging.Formatter('%(asctime)s %(message)s')
  console.setFormatter(formatter)
  logging.getLogger('').addHandler(console)

  logging.info(opt)

  # Load Vocabulary Wrapper
  vocab_path = os.path.join(opt.vocab_path, '%s_vocab_total.pkl' % opt.data_name)
  print(vocab_path)
  vocab = pickle.load(open(vocab_path, 'rb'))
  opt.vocab_size = len(vocab)

  # Load data loaders
  train_loader, val_loader = data.get_loaders(
    opt.data_name, vocab, opt.batch_size, opt.workers, opt)

  # Construct the model
  model = VSE(opt)

  print('Print out models:')
  print(model.clip_enc)
  print(model.txt_enc)
  print(model.vid_seq_enc)
  print(model.txt_seq_enc)

  start_epoch = 0
  best_rsum = 0
  # optionally resume from a checkpoint
  if opt.resume:
    if os.path.isfile(opt.resume):
      print("=> loading checkpoint '{}'".format(opt.resume))
      checkpoint = torch.load(opt.resume)
      start_epoch = checkpoint['epoch']
      best_rsum = checkpoint['best_rsum']
      model.load_state_dict(checkpoint['model'], opt)
      # Eiters is used to show logs as the continuation of another
      # training
      model.Eiters = checkpoint['Eiters']
      print("=> loaded checkpoint '{}' (epoch {}, best_rsum {})"
          .format(opt.resume, start_epoch, best_rsum))
      validate(opt, val_loader, model)
      if opt.eval_only:
        return
    else:
      print("=> no checkpoint found at '{}'".format(opt.resume))

  # Train the Model
  for epoch in range(start_epoch, opt.num_epochs):
    adjust_learning_rate(opt, model.optimizer, epoch)

    # train for one epoch
    train(opt, train_loader, model, epoch, val_loader)

    # evaluate on validation set
    rsum = validate(opt, val_loader, model)

    # remember best R@ sum and save checkpoint
    is_best = rsum > best_rsum
    best_rsum = max(rsum, best_rsum)
    save_checkpoint({
      'epoch': epoch + 1,
      'model': model.state_dict(opt),
      'best_rsum': best_rsum,
      'opt': opt,
      'Eiters': model.Eiters,
    }, is_best, prefix=opt.logger_name + '/', epoch=epoch)
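adjust_learning_rate is likewise assumed by every training loop in these examples. A common implementation, shown as a sketch only, decays the initial rate by a factor of 10 every opt.lr_update epochs:

def adjust_learning_rate(opt, optimizer, epoch):
    # Step decay: lr = learning_rate * 0.1 ** (epoch // lr_update).
    lr = opt.learning_rate * (0.1 ** (epoch // opt.lr_update))
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr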
Example #15
def main():
    # Hyper Parameters
    parser = argparse.ArgumentParser()
    parser.add_argument('--data_path', default='/w/31/faghri/vsepp_data/',
                        help='path to datasets')
    parser.add_argument('--data_name', default='precomp',
                        help='{coco,f8k,f30k,10crop}_precomp|coco|f8k|f30k')
    parser.add_argument('--vocab_path', default='./vocab/',
                        help='Path to saved vocabulary pickle files.')
    parser.add_argument('--margin', default=0.2, type=float,
                        help='Rank loss margin.')
    parser.add_argument('--num_epochs', default=15, type=int,
                        help='Number of training epochs.')
    parser.add_argument('--batch_size', default=128, type=int,
                        help='Size of a training mini-batch.')
    parser.add_argument('--word_dim', default=300, type=int,
                        help='Dimensionality of the word embedding.')
    parser.add_argument('--embed_size', default=1024, type=int,
                        help='Dimensionality of the joint embedding.')
    parser.add_argument('--grad_clip', default=2., type=float,
                        help='Gradient clipping threshold.')
    parser.add_argument('--crop_size', default=224, type=int,
                        help='Size of an image crop as the CNN input.')
    parser.add_argument('--num_layers', default=1, type=int,
                        help='Number of GRU layers.')
    parser.add_argument('--learning_rate', default=.0002, type=float,
                        help='Initial learning rate.')
    parser.add_argument('--lr_update', default=8, type=int,
                        help='Number of epochs to update the learning rate.')
    parser.add_argument('--workers', default=10, type=int,
                        help='Number of data loader workers.')
    parser.add_argument('--log_step', default=10, type=int,
                        help='Number of steps to print and record the log.')
    parser.add_argument('--val_step', default=500, type=int,
                        help='Number of steps to run validation.')
    parser.add_argument('--logger_name', default='runs/runX',
                        help='Path to save the model and Tensorboard log.')
    parser.add_argument('--resume', default='', type=str, metavar='PATH',
                        help='path to latest checkpoint (default: none)')
    parser.add_argument('--max_violation', action='store_true',
                        help='Use max instead of sum in the rank loss.')
    parser.add_argument('--sum_violation', action='store_true')
    parser.add_argument('--img_dim', default=4096, type=int,
                        help='Dimensionality of the image embedding.')
    parser.add_argument('--finetune', action='store_true',
                        help='Fine-tune the image encoder.')
    parser.add_argument('--cnn_type', default='vgg19',
                        help="""The CNN used for image encoder
                        (e.g. vgg19, resnet152)""")
    parser.add_argument('--use_restval', action='store_true',
                        help='Use the restval data for training on MSCOCO.')
    parser.add_argument('--measure', default='cosine',
                        help='Similarity measure used (cosine|order)')
    parser.add_argument('--use_abs', action='store_true',
                        help='Take the absolute value of embedding vectors.')
    parser.add_argument('--no_imgnorm', action='store_true',
                        help='Do not normalize the image embeddings.')
    parser.add_argument('--reset_train', action='store_true',
                        help='Ensure the training is always done in '
                        'train mode (Not recommended).')
    parser.add_argument('--save_all', action='store_true',
                        help="Save model after the training of each epoch")
    parser.add_argument('--memory_bank', action='store_true',
                        help="Train model with memory bank")
    parser.add_argument('--record_val', action='store_true',
                        help="Record the rsum values on validation set in file during training")
    parser.add_argument('--local_alpha', default=30.0, type=float)
    parser.add_argument('--local_ep', default=0.3, type=float)
    parser.add_argument('--global_alpha', default=40.0, type=float)
    parser.add_argument('--global_beta', default=40.0, type=float)
    parser.add_argument('--global_ep_posi', default=0.2, type=float,
                        help="Global epsilon for positive pairs")
    parser.add_argument('--global_ep_nega', default=0.1, type=float,
                        help="Global epsilon for negative pairs")
    parser.add_argument('--mb_k', default=250, type=int,
                        help="Use top K items in memory bank")
    parser.add_argument('--mb_rate', default=0.05, type=float,
                        help="-")

    opt = parser.parse_args()
    print(opt)

    logging.basicConfig(format='%(message)s', level=logging.INFO)
    tb_logger.configure(opt.logger_name, flush_secs=5)

    # Load Vocabulary Wrapper
    vocab = pickle.load(open(os.path.join(
        opt.vocab_path, '%s_vocab.pkl' % opt.data_name), 'rb'))
    opt.vocab_size = len(vocab)
    print("Vocab Size: %d" % opt.vocab_size)

    # Load data loaders
    train_loader, val_loader = data.get_loaders(
        opt.data_name, vocab, opt.crop_size, opt.batch_size, opt.workers, opt)

    # Construct the model
    model = VSE(opt)

    # optionally resume from a checkpoint
    if opt.resume:
        if os.path.isfile(opt.resume):
            print("=> loading checkpoint '{}'".format(opt.resume))
            checkpoint = torch.load(opt.resume)
            start_epoch = checkpoint['epoch']
            best_rsum = checkpoint['best_rsum']
            model.load_state_dict(checkpoint['model'])
            # Eiters is used to show logs as the continuation of another
            # training
            model.Eiters = checkpoint['Eiters']
            print("=> loaded checkpoint '{}' (epoch {}, best_rsum {})"
                  .format(opt.resume, start_epoch, best_rsum))
            validate(opt, val_loader, model)
        else:
            print("=> no checkpoint found at '{}'".format(opt.resume))

    # Train the Model
    for epoch in range(opt.num_epochs):
        adjust_learning_rate(opt, model.optimizer, epoch)

        memory_bank = opt.memory_bank
        if memory_bank and epoch > 0:
            load_memory_bank(opt, train_loader, model)
        # train for one epoch
        train(opt, train_loader, model, epoch, val_loader)

        # evaluate on validation set
        rsum = validate(opt, val_loader, model)
        print ("rsum: %.1f" % rsum)
        if opt.record_val:
            with open("rst_val_" + opt.logger_name[5:], "a") as f:
                f.write("Epoch: %d ; rsum: %.1f\n" %(epoch, rsum))

        # remember best R@ sum and save checkpoint
        is_best = rsum > best_rsum
        best_rsum = max(rsum, best_rsum)
        save_checkpoint({
            'epoch': epoch + 1,
            'model': model.state_dict(),
            'best_rsum': best_rsum,
            'opt': opt,
            'Eiters': model.Eiters,
        }, is_best, prefix=opt.logger_name + '/', save_all=opt.save_all)

        # reset memory bank
        model.mb_img = None
        model.mb_cap = None
Example #16
def main():
    parser = argparse.ArgumentParser()
    # Directories.
    parser.add_argument('--data_path',
                        default='/DATA/cvpr19',
                        help='path to datasets')
    parser.add_argument('--vocab_path',
                        default='../vocab/',
                        help='Path to saved vocabulary pickle files.')
    parser.add_argument('--resume',
                        default='',
                        type=str,
                        metavar='PATH',
                        help='path to latest checkpoint (default: none)')
    parser.add_argument('--model',
                        type=str,
                        default='beenburger',
                        help='model_name')
    parser.add_argument(
        '--save_dir',
        type=str,
        default='coco2',
        help='save checkpoint and results in DATA_PATH/MODEL_NAME/SAVE_DIR')
    # Dataset.
    parser.add_argument('--data_name', default='coco', help='{coco|ours}')
    parser.add_argument('--use_restval',
                        default='True',
                        type=str2bool,
                        help='Use the restval data for training on MSCOCO.')
    # Model configurations.
    parser.add_argument('--margin',
                        default=0.2,
                        type=float,
                        help='Rank loss margin.')
    parser.add_argument('--K',
                        default=620,
                        type=int,
                        help='Dimensionality of the word embedding.')
    parser.add_argument('--num_layers',
                        default=4,
                        type=int,
                        help='Number of SRU layers.')
    parser.add_argument('--D',
                        type=int,
                        default=2048,
                        help='dimension of image feature from ResNet')
    parser.add_argument('--D_prime',
                        type=int,
                        default=2400,
                        help='dimension of adaptation + pooling')
    parser.add_argument('--d',
                        type=int,
                        default=2400,
                        help='Dimensionality of the joint embedding')
    # Training configurations.
    parser.add_argument('--num_epochs',
                        default=30,
                        type=int,
                        help='Number of training epochs.')
    parser.add_argument('--batch_size',
                        default=160,
                        type=int,
                        help='Size of a training mini-batch.')
    parser.add_argument('--img_size', default=256, type=int, help='image_size')
    parser.add_argument('--crop_size',
                        default=224,
                        type=int,
                        help='Size of an image crop as the CNN input.')
    parser.add_argument('--learning_rate',
                        default=.001,
                        type=float,
                        help='Initial learning rate.')
    parser.add_argument('--lr_decay',
                        type=float,
                        default=0.5,
                        help='learning rate decay')
    parser.add_argument('--workers',
                        default=4,
                        type=int,
                        help='Number of data loader workers.')
    # Miscellaneous.
    parser.add_argument('--log_step',
                        default=10,
                        type=int,
                        help='Number of steps to print and record the log.')
    parser.add_argument('--val_step',
                        default=500,
                        type=int,
                        help='Number of steps to run validation.')
    parser.add_argument('--logger_name',
                        default='../runs/runX',
                        help='Path to save the model and Tensorboard log.')

    opt = parser.parse_args()
    print(opt)

    logging.basicConfig(format='%(asctime)s %(message)s', level=logging.INFO)
    tb_logger.configure(opt.logger_name, flush_secs=5)

    # Load Vocabulary Wrapper
    vocab = pickle.load(
        open(os.path.join(opt.vocab_path, '%s_vocab.pkl' % opt.data_name),
             'rb'))
    opt.vocab_size = len(vocab)
    opt.vocab = vocab

    # Create directories
    create_directory(opt.data_path, opt.model, opt.save_dir)

    # Load data loaders
    train_loader, val_loader = data.get_loaders(opt.data_name, vocab,
                                                opt.crop_size, opt.batch_size,
                                                opt.workers, opt)

    # Construct the model
    model = VSE(opt)

    # optionally resume from a checkpoint
    best_rsum = 0
    start_epoch = 0
    if opt.resume:
        if os.path.isfile(opt.resume):
            print("=> loading checkpoint '{}'".format(opt.resume))
            checkpoint = torch.load(opt.resume)
            start_epoch = checkpoint['epoch']
            model.change_training_state(0)
            if start_epoch > 2:
                model.change_training_state(2)
            if start_epoch > 8:
                model.change_training_state(8)
            best_rsum = checkpoint['best_rsum']
            model.optimizer = checkpoint['optim']
            model.load_state_dict(checkpoint['model'])
            # Eiters is used to show logs as the continuation of another
            # training
            model.Eiters = checkpoint['Eiters']
            print("=> loaded checkpoint '{}' (epoch {}, best_rsum {})".format(
                opt.resume, start_epoch, best_rsum))
            validate(opt, val_loader, model)
        else:
            print("=> no checkpoint found at '{}'".format(opt.resume))

    # Train the Model
    for epoch in range(start_epoch, opt.num_epochs):
        adjust_learning_rate(opt, model.optimizer, epoch)

        model.change_training_state(epoch)

        # train for one epoch
        train(opt, train_loader, model, epoch, val_loader)

        # evaluate on validation set
        rsum = validate(opt, val_loader, model)

        # remember best R@ sum and save checkpoint
        is_best = rsum > best_rsum
        best_rsum = max(rsum, best_rsum)
        save_checkpoint(
            {
                'epoch': epoch + 1,
                'model': model.state_dict(),
                'best_rsum': best_rsum,
                'opt': opt,
                'Eiters': model.Eiters,
                'optim': model.optimizer,
            },
            is_best,
            prefix=os.path.join(opt.data_path, opt.model, opt.save_dir))
Example #17
def evalrank(model_path, data_path=None, split='dev', fold5=False, region_bbox_file=None, feature_path=None):
    """
    Evaluate a trained model on either dev or test. If `fold5=True`, 5-fold
    cross-validation is done (only for MSCOCO). Otherwise, the full data is
    used for evaluation.
    """
    # load model and options
    checkpoint = torch.load(model_path)
    opt = checkpoint['opt']
    if data_path is not None:
        opt.data_path = data_path
    if region_bbox_file is not None:
        opt.region_bbox_file = region_bbox_file
    if feature_path is not None:
        opt.feature_path = feature_path

    # load vocabulary used by the model
    with open(os.path.join(opt.vocab_path,
                           '%s_vocab.pkl' % opt.data_name), 'rb') as f:
        vocab = pickle.load(f)
    opt.vocab_size = len(vocab)
    print(opt)

    # construct model
    model = VSE(opt)
    # load model state
    model.load_state_dict(checkpoint['model'])

    print('Loading dataset')
    data_loader = get_test_loader(split, opt.data_name, vocab, opt.crop_size,
                                  opt.batch_size, opt.workers, opt)
    print('Computing results...')
    img_embs, cap_embs = encode_data(model, data_loader)
    time_sim_start = time.time()

    if not fold5:
        img_emb_new = img_embs[0:img_embs.size(0):5]
        print(img_emb_new.size())

        sims = torch.mm(img_emb_new, cap_embs.t())
        sims_T = torch.mm(cap_embs, cap_embs.t())
        sims_T = sims_T.cpu().numpy()

        sims = sims.cpu().numpy()
        np.save('sims_f.npy', sims)
        np.save('sims_f_T.npy', sims_T)

        print('Images: %d, Captions: %d' %
              (img_embs.shape[0] // 5, cap_embs.shape[0]))

        r = simrank(sims)

        time_sim_end = time.time()
        print('sims_time:%f' % (time_sim_end - time_sim_start))
        del sims
    else:  # fold5, especially for COCO
        print('5k---------------')
        img_emb_new = img_embs[0:img_embs.size(0):5]
        print(img_emb_new.size())

        sims = torch.mm(img_emb_new, cap_embs.t())
        sims_T = torch.mm(cap_embs, cap_embs.t())

        sims = sims.cpu().numpy()
        sims_T = sims_T.cpu().numpy()

        np.save('sims_full_5k.npy', sims)
        np.save('sims_full_T_5k.npy', sims_T)
        print('Images: %d, Captions: %d' %
              (img_embs.shape[0] // 5, cap_embs.shape[0]))

        r = simrank(sims)

        time_sim_end = time.time()
        print('sims_time:%f' % (time_sim_end - time_sim_start))
        del sims, sims_T
        print('1k---------------')
        r_ = [0, 0, 0, 0, 0, 0, 0]
        for i in range(5):
            print(i)
            img_emb_new = img_embs[i * 5000 : int(i * 5000 + img_embs.size(0)/5):5]
            cap_emb_new = cap_embs[i * 5000 : int(i * 5000 + cap_embs.size(0)/5)]

            sims = torch.mm(img_emb_new, cap_emb_new.t())
            sims_T = torch.mm(cap_emb_new, cap_emb_new.t())
            sims_T = sims_T.cpu().numpy()
            sims = sims.cpu().numpy()
            np.save('sims_full_%d.npy' % i, sims)
            np.save('sims_full_T_%d.npy' % i, sims_T)

            print('Images: %d, Captions: %d' %
                  (img_emb_new.size(0), cap_emb_new.size(0)))

            r = simrank(sims)
            r_ = np.array(r_) + np.array(r)

            del sims
            print('--------------------')
        r_ = tuple(r_ / 5)
        print('I2T:%.1f %.1f %.1f' % r_[0:3])
        print('T2I:%.1f %.1f %.1f' % r_[3:6])
        print('Rsum:%.1f' % r_[-1])
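simrank is not defined in this excerpt. Judging from how its 7-element result is unpacked above (three I2T recalls, three T2I recalls, Rsum), a plausible sketch is:

import numpy as np

def simrank_sketch(sims):
    # sims: (n_images, 5 * n_images), with 5 consecutive captions per image.
    # Plausible reconstruction only; the real simrank is not shown here.
    n_images = sims.shape[0]
    i2t_ranks = np.array([
        min(np.where(np.argsort(sims[i])[::-1] == j)[0][0]
            for j in range(5 * i, 5 * i + 5))
        for i in range(n_images)])
    t2i_ranks = np.array([
        np.where(np.argsort(sims[:, j])[::-1] == j // 5)[0][0]
        for j in range(sims.shape[1])])
    r = [100.0 * np.mean(i2t_ranks < k) for k in (1, 5, 10)]
    r += [100.0 * np.mean(t2i_ranks < k) for k in (1, 5, 10)]
    return tuple(r) + (sum(r),)  # I2T r1/r5/r10, T2I r1/r5/r10, Rsum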
Example #18
def main():
  # Hyper Parameters
  parser = argparse.ArgumentParser()
  parser.add_argument('--data_path', default='/data1/hexianghu/activitynet/captions/',
            help='path to datasets')
  parser.add_argument('--data_name', default='anet_precomp',
            help='anet_precomp')
  parser.add_argument('--vocab_path', default='./vocab/',
            help='Path to saved vocabulary pickle files.')
  parser.add_argument('--margin', default=0.2, type=float,
            help='Rank loss margin.')
  parser.add_argument('--num_epochs', default=50, type=int,
            help='Number of training epochs.')
  parser.add_argument('--batch_size', default=64, type=int,
            help='Size of a training mini-batch.')
  parser.add_argument('--word_dim', default=300, type=int,
            help='Dimensionality of the word embedding.')
  parser.add_argument('--embed_size', default=1024, type=int,
            help='Dimensionality of the joint embedding.')
  parser.add_argument('--grad_clip', default=0., type=float,
            help='Gradient clipping threshold.')
  parser.add_argument('--num_layers', default=1, type=int,
            help='Number of GRU layers.')
  parser.add_argument('--learning_rate', default=.001, type=float,
            help='Initial learning rate.')
  parser.add_argument('--lr_update', default=10, type=int,
            help='Number of epochs to update the learning rate.')
  parser.add_argument('--workers', default=10, type=int,
            help='Number of data loader workers.')
  parser.add_argument('--log_step', default=10, type=int,
            help='Number of steps to print and record the log.')
  parser.add_argument('--val_step', default=500, type=int,
            help='Number of steps to run validation.')
  parser.add_argument('--logger_name', default='runs/runX',
            help='Path to save the model and Tensorboard log.')
  parser.add_argument('--resume', default='', type=str, metavar='PATH', required=True,
            help='path to latest checkpoint (default: none)')
  parser.add_argument('--max_violation', action='store_true',
            help='Use max instead of sum in the rank loss.')
  parser.add_argument('--img_dim', default=500, type=int,
            help='Dimensionality of the image embedding.')
  parser.add_argument('--measure', default='cosine',
            help='Similarity measure used (cosine|order)')
  parser.add_argument('--use_abs', action='store_true',
            help='Take the absolute value of embedding vectors.')
  parser.add_argument('--no_imgnorm', action='store_true',
            help='Do not normalize the image embeddings.')
  parser.add_argument('--gpu_id', default=0, type=int,
            help='GPU to use.')
  parser.add_argument('--rnn_type', default='maxout', choices=['maxout', 'seq2seq', 'attention'],
            help='Type of recurrent model.')
  parser.add_argument('--img_first_size', default=1024, type=int,
            help='first img layer emb size')
  parser.add_argument('--cap_first_size', default=1024, type=int,
            help='first cap layer emb size')
  parser.add_argument('--img_first_dropout', default=0, type=float,
            help='first img layer emb size')
  parser.add_argument('--cap_first_dropout', default=0, type=float,
            help='first cap layer emb size')
 
  opt = parser.parse_args()
  print(opt)

  torch.cuda.set_device(opt.gpu_id)

  logging.basicConfig(format='%(asctime)s %(message)s', level=logging.INFO)
  tb_logger.configure(opt.logger_name, flush_secs=5)

  # Load Vocabulary Wrapper
  vocab = pickle.load(open(os.path.join(
    opt.vocab_path, '%s_vocab.pkl' % opt.data_name), 'rb'))
  opt.vocab_size = len(vocab)

  # Load data loaders
  train_loader, val_loader = data.get_loaders(
    opt.data_name, vocab, opt.batch_size, opt.workers, opt)

  # Construct the model
  model = VSE(opt)

  print('Print out models:')
  print(model.img_enc)
  print(model.txt_enc)
  print(model.img_seq_enc)
  print(model.txt_seq_enc)

  # optionally resume from a checkpoint
  if os.path.isfile(opt.resume):
    print("=> loading checkpoint '{}'".format(opt.resume))
    checkpoint = torch.load(opt.resume)
    start_epoch = checkpoint['epoch']
    best_rsum = checkpoint['best_rsum']
    model.load_state_dict(checkpoint['model'])
    # Eiters is used to show logs as the continuation of another
    # training
    model.Eiters = checkpoint['Eiters']
    print("=> loaded checkpoint '{}' (epoch {}, best_rsum {})"
        .format(opt.resume, start_epoch, best_rsum))
    validate(opt, val_loader, model)
  else:
    print("=> no checkpoint found at '{}'".format(opt.resume))
Example #19
def evalrank(model_path, data_path=None, split='dev', fold5=False):
    """
    Evaluate a trained model on either dev or test. If `fold5=True`, 5-fold
    cross-validation is done (only for MSCOCO). Otherwise, the full data is
    used for evaluation.
    """
    # load model and options
    checkpoint = torch.load(model_path)
    opt = checkpoint['opt']
    if data_path is not None:
        opt.data_path = data_path

    # load vocabulary used by the model
    with open(os.path.join(opt.vocab_path, '%s_vocab.pkl' % opt.data_name),
              'rb') as f:
        vocab = pickle.load(f)
    opt.vocab_size = len(vocab)

    # construct model
    model = VSE(opt)

    # load model state
    model.load_state_dict(checkpoint['model'])

    print('Loading dataset')
    data_loader = get_test_loader(split, opt.data_name, vocab, opt.crop_size,
                                  opt.batch_size, opt.workers, opt)

    print('Computing results...')
    img_embs, cap_embs = encode_data(model, data_loader)
    print('Images: %d, Captions: %d' %
          (img_embs.shape[0] // 5, cap_embs.shape[0]))

    if not fold5:
        # no cross-validation, full evaluation
        r, rt = i2t(img_embs, cap_embs, measure=opt.measure, return_ranks=True)
        ri, rti = t2i(img_embs,
                      cap_embs,
                      measure=opt.measure,
                      return_ranks=True)
        ar = (r[0] + r[1] + r[2]) / 3
        ari = (ri[0] + ri[1] + ri[2]) / 3
        rsum = r[0] + r[1] + r[2] + ri[0] + ri[1] + ri[2]
        print("rsum: %.1f" % rsum)
        print("Average i2t Recall: %.1f" % ar)
        print("Image to text: %.1f %.1f %.1f %.1f %.1f" % r)
        print("Average t2i Recall: %.1f" % ari)
        print("Text to image: %.1f %.1f %.1f %.1f %.1f" % ri)
    else:
        # 5fold cross-validation, only for MSCOCO
        results = []
        for i in range(5):
            r, rt0 = i2t(img_embs[i * 5000:(i + 1) * 5000],
                         cap_embs[i * 5000:(i + 1) * 5000],
                         measure=opt.measure,
                         return_ranks=True)
            print("Image to text: %.1f, %.1f, %.1f, %.1f, %.1f" % r)
            ri, rti0 = t2i(img_embs[i * 5000:(i + 1) * 5000],
                           cap_embs[i * 5000:(i + 1) * 5000],
                           measure=opt.measure,
                           return_ranks=True)
            if i == 0:
                rt, rti = rt0, rti0
            print("Text to image: %.1f, %.1f, %.1f, %.1f, %.1f" % ri)
            ar = (r[0] + r[1] + r[2]) / 3
            ari = (ri[0] + ri[1] + ri[2]) / 3
            rsum = r[0] + r[1] + r[2] + ri[0] + ri[1] + ri[2]
            print("rsum: %.1f ar: %.1f ari: %.1f" % (rsum, ar, ari))
            results += [list(r) + list(ri) + [ar, ari, rsum]]

        print("-----------------------------------")
        print("Mean metrics: ")
        # each row of `results` is list(r) + list(ri) + [ar, ari, rsum],
        # so indices 10, 11 and 12 hold ar, ari and rsum respectively
        mean_metrics = tuple(np.array(results).mean(axis=0).flatten())
        print("rsum: %.1f" % mean_metrics[12])
        print("Average i2t Recall: %.1f" % mean_metrics[10])
        print("Image to text: %.1f %.1f %.1f %.1f %.1f" % mean_metrics[:5])
        print("Average t2i Recall: %.1f" % mean_metrics[11])
        print("Text to image: %.1f %.1f %.1f %.1f %.1f" % mean_metrics[5:10])

    torch.save({'rt': rt, 'rti': rti}, 'ranks.pth.tar')
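
A minimal usage sketch for the function above (not part of the original listing; the checkpoint path is a hypothetical placeholder):

if __name__ == '__main__':
    # Runs the 5-fold MSCOCO protocol on the test split; fold5=False would
    # instead evaluate the full split in one pass.
    evalrank('runs/coco_vse/model_best.pth.tar',
             data_path='data/', split='test', fold5=True)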
Example #21
0
class ImageRetriever(object):
    def __init__(self, model_path, img_path, precomp_path, split, vocab_path,
                 batch_size):
        checkpoint = torch.load(model_path)
        self.opt = checkpoint['opt']
        self.opt.data_path = img_path
        self.opt.data_name = 'coco'
        self.opt.batch_size = batch_size

        self.img_loader = get_test_loader(split, self.opt.data_name, None,
                                          self.opt.crop_size,
                                          self.opt.batch_size,
                                          self.opt.workers, self.opt)

        self.opt.data_path = precomp_path
        self.opt.data_name = 'coco_precomp'

        # load vocabulary used by the model
        with open(vocab_path, 'rb') as f:
            self.vocab = pickle.load(f)
        self.vocab_size = len(self.vocab)

        self.precomp_loader = get_test_loader(split, self.opt.data_name,
                                              self.vocab, self.opt.crop_size,
                                              self.opt.batch_size,
                                              self.opt.workers, self.opt)

        self.model = VSE(self.opt)

        # load model state
        self.model.load_state_dict(checkpoint['model'])

        # precompute all image embeddings
        self.model.val_start()

        # numpy array to keep all the embeddings
        self.img_embs = None
        batch_time = AverageMeter()
        val_logger = LogCollector()
        end = time.time()
        log_step = 10
        for i, (images, captions, lengths,
                ids) in enumerate(self.precomp_loader):
            self.model.logger = val_logger

            # compute the embeddings
            images = Variable(images, volatile=True)
            if torch.cuda.is_available():
                images = images.cuda()

            # Forward
            img_emb = self.model.img_enc(images)

            # initialize the numpy arrays given the size of the embeddings
            if self.img_embs is None:
                self.img_embs = np.zeros(
                    (len(self.precomp_loader.dataset), img_emb.size(1)))

            # preserve the embeddings by copying from gpu and converting to numpy
            self.img_embs[ids] = img_emb.data.cpu().numpy().copy()

            # measure elapsed time
            batch_time.update(time.time() - end)
            end = time.time()

            if i % log_step == 0:
                print('Test: [{0}/{1}]\t'
                      '{e_log}\t'
                      'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'.
                      format(i,
                             len(self.precomp_loader),
                             batch_time=batch_time,
                             e_log=str(self.model.logger)))
            del images, captions

    def get_NN(self, query, measure='dot', k=5):
        # convert query from string to index
        tokens = nltk.tokenize.word_tokenize(
            str(query).lower().decode('utf-8'))
        query = []
        query.append(self.vocab('<start>'))
        query.extend([self.vocab(token) for token in tokens])
        query.append(self.vocab('<end>'))

        # embed the query
        query = Variable(torch.LongTensor(query), volatile=True)
        if torch.cuda.is_available():
            query = query.cuda()

        # Forward
        q_emb = self.model.txt_enc(query, [query.size(0)])
        q_embs = q_emb.data.cpu().numpy().copy()

        # run nearest neighbours searching text -> image
        return self.find_NN(self.img_embs, q_embs, measure=measure, k=k)

    def find_NN(self, images, query, measure, k, npts=None):
        """
        Text->Images (Image Search)
        Images: (5N, K) matrix of images
        query: (1, K) matrix of captions
        """
        if npts is None:
            npts = images.shape[0] // 5
        ims = numpy.array([images[i] for i in range(0, len(images), 5)])

        # Compute scores
        tic = time.clock()
        if measure == 'order':
            d2 = order_sim(
                torch.Tensor(ims).cuda(),
                torch.Tensor(query).cuda())
            d2 = d2.cpu().numpy()

            d = d2.T
        else:
            d = numpy.dot(
                query, ims.T
            )  # TODO Try to optimize this computation, see if sorting is bottleneck

        imgs_by_similarity = numpy.argsort(numpy.squeeze(d))[::-1]
        toc = time.clock()
        print('NN search took {} ms over {} images'.format(
            (toc - tic) * 1000.0, ims.shape[0]))

        return imgs_by_similarity[0:k] * 5, query, ims

    def visualise_NN(self, inds, img_identifier, file_name):
        html_file = open(file_name, "w")
        for i, ind in enumerate(inds):
            root, caption, img_id, path, image = self.img_loader.dataset.get_raw_item(
                ind)
            image.save("./out/{}_img{}.png".format(img_identifier, i + 1))
            img_tag = '<img src="./out/{}_img{}.png" style="max-height: 400px; max-width: 400px;">'\
                .format(img_identifier, i + 1)
            html_file.write(img_tag)
        html_file.close()
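
A usage sketch for the retriever above (not from the original source; every path is a hypothetical placeholder, and note that the class relies on Python 2 era idioms such as str.decode and Variable(volatile=True)):

retriever = ImageRetriever(
    model_path='runs/coco_vse/model_best.pth.tar',
    img_path='data/coco/',
    precomp_path='data/coco_precomp/',
    split='test',
    vocab_path='vocab/coco_precomp_vocab.pkl',
    batch_size=128)
# Top-5 images for a free-text query, then an HTML contact sheet of the hits.
inds, q_emb, ims = retriever.get_NN('a man riding a horse', k=5)
retriever.visualise_NN(inds, img_identifier='horse_query',
                       file_name='horse_query.html')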
Example #22
0
def main():
    # Hyper Parameters
    parser = argparse.ArgumentParser()
    parser.add_argument('--data_path',
                        default='/home/dcsaero01/data/datasets/vsepp/',
                        help='path to datasets')
    parser.add_argument('--data_name',
                        default='minicsdv2_precomp',
                        help='{coco,f8k,f30k,10crop}_precomp|coco|f8k|f30k')
    parser.add_argument('--vocab_path',
                        default='./vocab/',
                        help='Path to saved vocabulary pickle files.')
    parser.add_argument('--margin',
                        default=0.2,
                        type=float,
                        help='Rank loss margin.')
    parser.add_argument('--num_epochs',
                        default=300,
                        type=int,
                        help='Number of training epochs.')
    parser.add_argument('--batch_size',
                        default=128,
                        type=int,
                        help='Size of a training mini-batch.')
    parser.add_argument('--word_dim',
                        default=300,
                        type=int,
                        help='Dimensionality of the word embedding.')
    parser.add_argument('--embed_size',
                        default=1024,
                        type=int,
                        help='Dimensionality of the joint embedding.')
    parser.add_argument('--grad_clip',
                        default=2.,
                        type=float,
                        help='Gradient clipping threshold.')
    parser.add_argument('--crop_size',
                        default=224,
                        type=int,
                        help='Size of an image crop as the CNN input.')
    parser.add_argument('--num_layers',
                        default=1,
                        type=int,
                        help='Number of GRU layers.')
    parser.add_argument('--learning_rate',
                        default=.0002,
                        type=float,
                        help='Initial learning rate.')
    parser.add_argument('--lr_update',
                        default=15,
                        type=int,
                        help='Number of epochs to update the learning rate.')
    parser.add_argument(
        '--dropout_value',
        default=0,
        type=float,
        help='Probability value for dropout after linear layer')
    parser.add_argument('--workers',
                        default=10,
                        type=int,
                        help='Number of data loader workers.')
    parser.add_argument('--log_step',
                        default=10,
                        type=int,
                        help='Number of steps to print and record the log.')
    parser.add_argument('--val_step',
                        default=500,
                        type=int,
                        help='Number of steps to run validation.')
    parser.add_argument('--logger_name',
                        default='runs/runX',
                        help='Path to save the model and Tensorboard log.')
    parser.add_argument(
        '--resume',
        default=
        '/home/dcsaero01/data/projects/vsepp/runs/minicsdv2/checkpoint.pth.tar',
        type=str,
        metavar='PATH',
        help='path to latest checkpoint (default: none)')
    parser.add_argument('--max_violation',
                        action='store_true',
                        help='Use max instead of sum in the rank loss.')
    parser.add_argument('--img_dim',
                        default=4096,
                        type=int,
                        help='Dimensionality of the image embedding.')
    parser.add_argument('--text_dim',
                        default=6000,
                        type=int,
                        help='Dimensionality of the text embedding.')
    parser.add_argument('--finetune',
                        action='store_true',
                        help='Fine-tune the image encoder.')
    parser.add_argument('--cnn_type',
                        default='vgg19',
                        help="""The CNN used for image encoder
                        (e.g. vgg19, resnet152)""")
    parser.add_argument('--use_restval',
                        action='store_true',
                        help='Use the restval data for training on MSCOCO.')
    parser.add_argument('--measure',
                        default='cosine',
                        help='Similarity measure used (cosine|order)')
    parser.add_argument('--use_abs',
                        action='store_true',
                        help='Take the absolute value of embedding vectors.')
    parser.add_argument('--no_imgnorm',
                        action='store_true',
                        help='Do not normalize the image embeddings.')
    parser.add_argument('--reset_train',
                        action='store_true',
                        help='Ensure the training is always done in '
                        'train mode (Not recommended).')

    opt = parser.parse_args()
    print(opt)

    logging.basicConfig(format='%(asctime)s %(message)s', level=logging.INFO)
    tb_logger.configure(opt.logger_name, flush_secs=5)

    # Load Vocabulary Wrapper
    if opt.data_name == 'coco_st_precomp' or opt.data_name == 'coco_st_ner_precomp':
        vocab = None
        opt.vocab_size = 0
    else:
        vocab = pickle.load(
            open(os.path.join(opt.vocab_path, '%s_vocab.pkl' % opt.data_name),
                 'rb'))
        opt.vocab_size = len(vocab)

    # Load data loaders
    train_loader, val_loader = data.get_loaders(opt.data_name, vocab,
                                                opt.crop_size, opt.batch_size,
                                                opt.workers, opt)

    # Construct the model
    model = VSE(opt)

    # optionally resume from a checkpoint
    if opt.resume:
        if os.path.isfile(opt.resume):
            print("=> loading checkpoint '{}'".format(opt.resume))
            checkpoint = torch.load(opt.resume)
            start_epoch = checkpoint['epoch']
            best_rsum = checkpoint['best_rsum']
            model.load_state_dict(checkpoint['model'])
            # Eiters is used to show logs as the continuation of another
            # training
            model.Eiters = checkpoint['Eiters']
            print("=> loaded checkpoint '{}' (epoch {}, best_rsum {})".format(
                opt.resume, start_epoch, best_rsum))
            validate(opt, val_loader, model)
        else:
            print("=> no checkpoint found at '{}'".format(opt.resume))
    """
Example #23
0
def main():
    # Hyper Parameters
    parser = argparse.ArgumentParser()
    parser.add_argument('--data_path', default=".", help='path to datasets')
    parser.add_argument(
        '--data_name',
        default='m30k',
        help='{coco,f8k,f30k,10crop}_precomp|coco|f8k|f30k|m30k')
    parser.add_argument(
        '--lang',
        default='en',
        help='Which language(s) to use from m30k, en-de, trains on en+de.')
    parser.add_argument('--sentencepair',
                        action='store_true',
                        help='Train caption-caption ranking as well.')
    parser.add_argument(
        '--sentencepair_p',
        default=0.5,
        type=float,
        help='Probability of training on caption-caption and not image-caption.'
    )
    parser.add_argument(
        '--primary',
        default=None,
        help=
        'Which language to monitor for early stopping. Multiple with l1-l2-l3')
    parser.add_argument(
        '--undersample',
        action='store_true',
        help='Pick only one of the 5 possible captions for m30k task 2.')
    parser.add_argument('--half',
                        action='store_true',
                        help='Use only half of the M30K from task 2.')
    parser.add_argument('--disaligned',
                        action='store_true',
                        help='Use the disaligned half of the M30K from task 2.')
    parser.add_argument('--vocab_path',
                        default='.',
                        help='Path to saved vocabulary pickle files.')
    parser.add_argument('--margin',
                        default=0.2,
                        type=float,
                        help='Rank loss margin.')
    parser.add_argument('--num_epochs',
                        default=30,
                        type=int,
                        help='Number of training epochs.')
    parser.add_argument(
        '--patience',
        default=10,
        type=int,
        help='Number of validation steps to tolerate without improvement.')
    parser.add_argument('--batch_size',
                        default=128,
                        type=int,
                        help='Size of a training mini-batch.')
    parser.add_argument('--word_dim',
                        default=300,
                        type=int,
                        help='Dimensionality of the word embedding.')
    parser.add_argument('--embed_size',
                        default=1024,
                        type=int,
                        help='Dimensionality of the joint embedding.')
    parser.add_argument('--grad_clip',
                        default=2.,
                        type=float,
                        help='Gradient clipping threshold.')
    parser.add_argument('--crop_size',
                        default=224,
                        type=int,
                        help='Size of an image crop as the CNN input.')
    parser.add_argument('--num_layers',
                        default=1,
                        type=int,
                        help='Number of GRU layers.')
    parser.add_argument('--learning_rate',
                        default=.0002,
                        type=float,
                        help='Initial learning rate.')
    parser.add_argument('--lr_update',
                        default=15,
                        type=int,
                        help='Number of epochs to update the learning rate.')
    parser.add_argument('--workers',
                        default=10,
                        type=int,
                        help='Number of data loader workers.')
    parser.add_argument('--log_step',
                        default=10,
                        type=int,
                        help='Number of steps to print and record the log.')
    parser.add_argument('--val_step',
                        default=500,
                        type=int,
                        help='Number of steps to run validation.')
    parser.add_argument(
        '--logger_path',
        default='.',
        help='Path where to save the model and Tensorboard log.')
    parser.add_argument(
        '--logger_name',
        help='Name of the folder where to save the model and Tensorboard log.')
    parser.add_argument('--resume',
                        default='',
                        type=str,
                        metavar='PATH',
                        help='path to latest checkpoint (default: none)')
    parser.add_argument('--sum_violation',
                        dest="max_violation",
                        action='store_false',
                        help='Use sum instead of max in the rank loss.')
    parser.add_argument('--img_dim',
                        default=2048,
                        type=int,
                        help='Dimensionality of the image embedding.')
    parser.add_argument('--finetune',
                        action='store_true',
                        help='Fine-tune the image encoder.')
    parser.add_argument('--cnn_type',
                        default='vgg19',
                        help="""The CNN used for image encoder
                        (e.g. vgg19, resnet152)""")
    parser.add_argument('--use_restval',
                        action='store_true',
                        help='Use the restval data for training on MSCOCO.')
    parser.add_argument('--measure',
                        default='cosine',
                        help='Similarity measure used (cosine|order)')
    parser.add_argument('--use_abs',
                        action='store_true',
                        help='Take the absolute value of embedding vectors.')
    parser.add_argument('--no_imgnorm',
                        action='store_true',
                        help='Do not normalize the image embeddings.')
    parser.add_argument('--reset_train',
                        action='store_true',
                        help='Ensure the training is always done in '
                        'train mode (Not recommended).')
    parser.add_argument('--seed', default=42, type=int, help='Random seed.')
    opt = parser.parse_args()

    if torch.__version__ >= "0.3":
        opt.reset_train = True

    opt.vocab_path = os.path.join(opt.vocab_path, "vocab")
    if opt.logger_name is None:
        name = "lang{}_half-{}_undersample-{}_disaligned-{}_sentencepair-{}_primary-{}_epochs-{}"
        name = name.format(opt.lang, opt.half, opt.undersample, opt.disaligned,
                           opt.sentencepair, opt.primary, opt.num_epochs)
        opt.logger_name = os.path.join(opt.data_name, name)

    opt.logger_name = os.path.join(opt.logger_path, opt.logger_name,
                                   str(opt.seed))
    print(opt)
    random.seed(rseed + opt.seed)
    np.random.seed(rseed + opt.seed)
    torch.cuda.manual_seed(rseed + opt.seed)
    torch.cuda.manual_seed_all(rseed + opt.seed)

    logging.basicConfig(format='%(asctime)s %(message)s', level=logging.INFO)
    tb_logger.configure(opt.logger_name, flush_secs=5)

    # For multi30k compute vocabulary mappings on the fly.
    if opt.data_name == "m30k":
        vocab = None
        langs = opt.lang.split("-")
    # Load Vocabulary Wrapper for COCO or F30K
    else:
        vocab = pickle.load(
            open(os.path.join(opt.vocab_path, '%s_vocab.pkl' % opt.data_name),
                 'rb'))
        opt.vocab_size = len(vocab)
        langs = [opt.data_name]
    # Load data loaders
    train_loader, val_loader = data.get_loaders(opt.data_name, vocab,
                                                opt.crop_size, opt.batch_size,
                                                opt.workers, opt)
    # Construct the model
    model = VSE(opt)
    print(model.txt_enc)
    # optionally resume from a checkpoint
    if opt.resume:
        if os.path.isfile(opt.resume):
            print("=> loading checkpoint '{}'".format(opt.resume))
            checkpoint = torch.load(opt.resume)
            start_epoch = checkpoint['epoch']
            best_rsum = checkpoint['best_rsum']
            model.load_state_dict(checkpoint['model'])
            # Eiters is used to show logs as the continuation of another
            # training
            model.Eiters = checkpoint['Eiters']
            print("=> loaded checkpoint '{}' (epoch {}, best_rsum {})".format(
                opt.resume, start_epoch, best_rsum))
            validate(opt, val_loader, model, "")
        else:
            print("=> no checkpoint found at '{}'".format(opt.resume))

    if len(langs) == 1 or opt.data_name != 'm30k':

        # Train the Model on a single data set
        best_rsum = 0
        patience_count = 0
        model.train_start()
        for epoch in range(opt.num_epochs):
            if opt.reset_train:
                # Always reset to train mode, this is not the default behavior
                model.train_start()
            adjust_learning_rate(opt, model.optimizer, epoch)

            # train for one epoch
            train(opt, train_loader, model, epoch, val_loader)

            # evaluate on validation set
            rsum = validate(opt, val_loader, model, langs[0])

            # remember best R@ sum and save checkpoint
            is_best = rsum > best_rsum
            best_rsum = max(rsum, best_rsum)
            save_checkpoint(
                {
                    'epoch': epoch + 1,
                    'model': model.state_dict(),
                    'best_rsum': best_rsum,
                    'opt': opt,
                    'Eiters': model.Eiters,
                },
                is_best,
                prefix=opt.logger_name + '/')
            if is_best:
                patience_count = 0
                print("New best: {}".format(best_rsum))
            else:
                patience_count += 1
                print("No improvement in {}".format(patience_count))
                if patience_count == opt.patience:
                    print("No improvement in {} epochs, stoppin".format(
                        patience_count))
                    break

    else:
        joint_train(opt, train_loader, model, val_loader)
Example #24
0
def main():
    # Hyper Parameters
    parser = argparse.ArgumentParser()
    parser.add_argument('--data_path',
                        default='/media/shenkev/data/Ubuntu/vsepp/data/data',
                        help='path to datasets')
    parser.add_argument('--data_name',
                        default='coco_precomp',
                        help='{coco,f8k,f30k,10crop}_precomp|coco|f8k|f30k')
    parser.add_argument('--vocab_path',
                        default='./vocab/',
                        help='Path to saved vocabulary pickle files.')
    parser.add_argument('--margin',
                        default=0.2,
                        type=float,
                        help='Rank loss margin.')
    parser.add_argument('--num_epochs',
                        default=30,
                        type=int,
                        help='Number of training epochs.')
    parser.add_argument('--batch_size',
                        default=128,
                        type=int,
                        help='Size of a training mini-batch.')
    parser.add_argument('--word_dim',
                        default=300,
                        type=int,
                        help='Dimensionality of the word embedding.')
    parser.add_argument('--embed_size',
                        default=1024,
                        type=int,
                        help='Dimensionality of the joint embedding.')
    parser.add_argument('--grad_clip',
                        default=2.,
                        type=float,
                        help='Gradient clipping threshold.')
    parser.add_argument('--crop_size',
                        default=224,
                        type=int,
                        help='Size of an image crop as the CNN input.')
    parser.add_argument('--num_layers',
                        default=1,
                        type=int,
                        help='Number of GRU layers.')
    parser.add_argument('--learning_rate',
                        default=.0002,
                        type=float,
                        help='Initial learning rate.')
    parser.add_argument('--lr_update',
                        default=15,
                        type=int,
                        help='Number of epochs to update the learning rate.')
    parser.add_argument('--workers',
                        default=10,
                        type=int,
                        help='Number of data loader workers.')
    parser.add_argument('--log_step',
                        default=10,
                        type=int,
                        help='Number of steps to print and record the log.')
    parser.add_argument('--val_step',
                        default=500,
                        type=int,
                        help='Number of steps to run validation.')
    parser.add_argument('--logger_name',
                        default='runs/coco_vse',
                        help='Path to save the model and Tensorboard log.')
    parser.add_argument('--resume',
                        default='',
                        type=str,
                        metavar='PATH',
                        help='path to latest checkpoint (default: none)')
    parser.add_argument('--max_violation',
                        default=True,
                        action='store_true',
                        help='Use max instead of sum in the rank loss.')
    parser.add_argument('--img_dim',
                        default=4096,
                        type=int,
                        help='Dimensionality of the image embedding.')
    parser.add_argument('--finetune',
                        action='store_true',
                        help='Fine-tune the image encoder.')
    parser.add_argument('--cnn_type',
                        default='vgg19',
                        help="""The CNN used for image encoder
                        (e.g. vgg19, resnet152)""")
    parser.add_argument('--use_restval',
                        action='store_true',
                        help='Use the restval data for training on MSCOCO.')
    parser.add_argument('--measure',
                        default='cosine',
                        help='Similarity measure used (cosine|order)')
    parser.add_argument('--use_abs',
                        action='store_true',
                        help='Take the absolute value of embedding vectors.')
    parser.add_argument('--no_imgnorm',
                        action='store_true',
                        help='Do not normalize the image embeddings.')
    opt = parser.parse_args()
    print(opt)

    logging.basicConfig(format='%(asctime)s %(message)s', level=logging.INFO)
    tb_logger.configure(opt.logger_name, flush_secs=5)

    # Load Vocabulary Wrapper
    vocab = pickle.load(
        open(os.path.join(opt.vocab_path, '%s_vocab.pkl' % opt.data_name),
             'rb'))
    opt.vocab_size = len(vocab)

    # Load data loaders
    train_loader, val_loader = data.get_loaders(opt.data_name, vocab,
                                                opt.crop_size, opt.batch_size,
                                                opt.workers, opt)

    # Construct the model
    model = VSE(opt)

    # optionally resume from a checkpoint
    if opt.resume:
        if os.path.isfile(opt.resume):
            print("=> loading checkpoint '{}'".format(opt.resume))
            checkpoint = torch.load(opt.resume)
            start_epoch = checkpoint['epoch']
            best_rsum = checkpoint['best_rsum']
            model.load_state_dict(checkpoint['model'])
            # Eiters is used to show logs as the continuation of another
            # training
            model.Eiters = checkpoint['Eiters']
            print("=> loaded checkpoint '{}' (epoch {}, best_rsum {})".format(
                opt.resume, start_epoch, best_rsum))
            validate(opt, val_loader, model)
        else:
            print("=> no checkpoint found at '{}'".format(opt.resume))

    # Train the Model
    best_rsum = 0
    for epoch in range(opt.num_epochs):
        adjust_learning_rate(opt, model.optimizer, epoch)

        # train for one epoch
        train(opt, train_loader, model, epoch, val_loader)

        # evaluate on validation set
        rsum = validate(opt, val_loader, model)

        # remember best R@ sum and save checkpoint
        is_best = rsum > best_rsum
        best_rsum = max(rsum, best_rsum)
        save_checkpoint(
            {
                'epoch': epoch + 1,
                'model': model.state_dict(),
                'best_rsum': best_rsum,
                'opt': opt,
                'Eiters': model.Eiters,
            },
            is_best,
            prefix=opt.logger_name + '/')
Example #25
0
def main():
    # Hyper Parameters
    parser = argparse.ArgumentParser()
    parser.add_argument('--data_type', default='rgb',
                        help='type of input features (e.g. rgb)')
    parser.add_argument('--feature_path', default='',
                        help='path to video features')
    parser.add_argument('--anno_path', default='', help='path to annotations')
    parser.add_argument('--feature_prefix',
                        default='',
                        help='prefix of feature')
    parser.add_argument('--dropout',
                        default=0.5,
                        type=float,
                        help='dropout probability')
    parser.add_argument('--split_video_file',
                        default='',
                        help='path to the video split file')
    parser.add_argument('--num_pos_sample',
                        default=10,
                        type=int,
                        help='number of positive samples')
    parser.add_argument('--num_neg_sample',
                        default=10,
                        type=int,
                        help='number of negative samples')
    parser.add_argument('--num_epochs',
                        default=30,
                        type=int,
                        help='Number of training epochs')
    parser.add_argument('--batch_size',
                        default=128,
                        type=int,
                        help='Size of a training mini-batch')
    parser.add_argument('--embed_size',
                        default=10240,
                        type=int,
                        help='Dimensionality of the joint embedding.')
    parser.add_argument('--grad_clip',
                        default=2,
                        type=float,
                        help='Gradient clipping threshold')
    parser.add_argument('--learning_rate',
                        default=.001,
                        type=float,
                        help='Initial learning rate')
    parser.add_argument('--lr_update',
                        default=15,
                        type=int,
                        help='Number of epochs to update the learning rate')
    parser.add_argument('--workers',
                        default=10,
                        type=int,
                        help='Number of data loader workers')
    parser.add_argument('--log_step',
                        default=10,
                        type=int,
                        help='Number of steps to print and record the log')
    parser.add_argument('--val_step',
                        default=500,
                        type=int,
                        help='Number of steps to run validation')
    parser.add_argument('--logger_name',
                        default='runs/runX',
                        help='Path to save the model and Tensorboard log')
    parser.add_argument('--resume',
                        default='',
                        type=str,
                        metavar='PATH',
                        help='path to latest checkpoint (default: none)')
    parser.add_argument('--storage_place',
                        default='',
                        type=str,
                        metavar='PATH',
                        help='path where evaluation features are stored')
    parser.add_argument('--instance_data_path',
                        default='',
                        type=str,
                        metavar='PATH',
                        help='path to instance-level data')

    opt = parser.parse_args()
    logging.basicConfig(format='%(asctime)s %(message)s', level=logging.INFO)
    tb_logger.configure(opt.logger_name, flush_secs=5)

    #Load data loaders
    #1
    train_loader, val_loader = data.get_loaders(opt)

    #Construct the model
    #2
    model = VSE(opt)

    #optionally resume from a checkpoint
    if opt.resume:
        if os.path.isfile(opt.resume):
            print("=> loading checkpoint '{}'".format(opt.resume))
            checkpoint = torch.load(opt.resume)
            start_epoch = checkpoint['epoch']
            best_rsum = checkpoint['best_rsum']
            model.load_state_dict(checkpoint['model'])
            #Eiters is used to show logs as the continuation of another
            #training
            model.Eiters = checkpoint['Eiters']
            print("=> loaded checkpoint '{}' (epoch {}, best_rsum {})".format(
                opt.resume, start_epoch, best_rsum))
        else:
            print("=> no checkpoint found at '{}'".format(opt.resume))

    # Train the Model
    best_rsum = 0

    for epoch in range(opt.num_epochs):
        adjust_learning_rate(opt, model.optimizer, epoch)

        #train for one epoch
        #3
        train(opt, train_loader, model, epoch)
        # evaluate on validation set (disabled here: rsum stays 0 and the
        # extracted features are scored after training via eval_feat below)
        rsum = 0

        # remember best R@ sum and save checkpoint
        is_best = rsum > best_rsum
        best_rsum = max(rsum, best_rsum)
        save_checkpoint(
            {
                'epoch': epoch + 1,
                'model': model.state_dict(),
                'best_rsum': best_rsum,
                'opt': opt,
                'Eiters': model.Eiters,
            },
            is_best,
            prefix=opt.logger_name + '/')
    #4 val_loader (feat,id,labels)
    eval_feat(val_loader, model, opt.storage_place)
Example #26
0
def main():
    # Hyper Parameters
    parser = argparse.ArgumentParser()
    parser.add_argument('--data_path',
                        default='/A/VSE/data/',
                        help='path to datasets')
    parser.add_argument(
        '--data_name',
        default='resnet152_precomp',
        help='{coco,f8k,f30k,10crop,irv2,resnet152}_precomp|coco|f8k|f30k')
    parser.add_argument('--vocab_path',
                        default='./vocab/',
                        help='Path to saved vocabulary pickle files.')
    parser.add_argument('--margin',
                        default=0.05,
                        type=float,
                        help='Rank loss margin.')
    parser.add_argument('--num_epochs',
                        default=30,
                        type=int,
                        help='Number of training epochs.')
    parser.add_argument('--batch_size',
                        default=128,
                        type=int,
                        help='Size of a training mini-batch.')
    parser.add_argument('--word_dim',
                        default=300,
                        type=int,
                        help='Dimensionality of the word embedding.')
    parser.add_argument(
        '--embed_size',
        default=1024,
        type=int,
        help=
        'Dimensionality of the joint embedding. [NOTE: this is used only if <embed_size> differs from <gru_units>]'
    )
    parser.add_argument('--gru_units',
                        default=1024,
                        type=int,
                        help='Number of GRU neurons.')
    parser.add_argument('--grad_clip',
                        default=1.,
                        type=float,
                        help='Gradient clipping threshold.')
    parser.add_argument('--crop_size',
                        default=224,
                        type=int,
                        help='Size of an image crop as the CNN input.')
    parser.add_argument('--num_layers',
                        default=1,
                        type=int,
                        help='Number of GRU layers.')
    parser.add_argument('--learning_rate',
                        default=.001,
                        type=float,
                        help='Initial learning rate.')
    parser.add_argument('--lr_update',
                        default=15,
                        type=int,
                        help='Number of epochs to update the learning rate.')
    parser.add_argument('--workers',
                        default=10,
                        type=int,
                        help='Number of data loader workers.')
    parser.add_argument('--log_step',
                        default=10,
                        type=int,
                        help='Number of steps to print and record the log.')
    parser.add_argument('--val_step',
                        default=500,
                        type=int,
                        help='Number of steps to run validation.')
    parser.add_argument('--logger_name',
                        default='runs/runX',
                        help='Path to save the model and Tensorboard log.')
    parser.add_argument('--resume',
                        default='',
                        type=str,
                        metavar='PATH',
                        help='path to latest checkpoint (default: none)')
    parser.add_argument('--max_violation',
                        action='store_true',
                        help='Use max instead of sum in the rank loss.')
    parser.add_argument('--img_dim',
                        default=2048,
                        type=int,
                        help='Dimensionality of the image embedding.')
    parser.add_argument('--finetune',
                        action='store_true',
                        help='Fine-tune the image encoder.')
    parser.add_argument('--cnn_type',
                        default='vgg19',
                        help="""The CNN used for image encoder
                        (e.g. vgg19, resnet152)""")
    parser.add_argument('--use_restval',
                        action='store_true',
                        help='Use the restval data for training on MSCOCO.')
    parser.add_argument('--measure',
                        default='cosine',
                        help='Similarity measure used (cosine|order)')
    parser.add_argument(
        '--test_measure',
        default=None,
        help=
        'Similarity used for retrieval (None<same used for training>|cosine|order)'
    )
    parser.add_argument('--use_abs',
                        action='store_true',
                        help='Take the absolute value of embedding vectors.')
    parser.add_argument('--no_imgnorm',
                        action='store_true',
                        help='Do not normalize the image embeddings.')
    parser.add_argument('--text_encoder',
                        default='seam-e',
                        choices=text_encoders.text_encoders_alias.keys())
    parser.add_argument(
        '--att_units',
        default=300,
        type=int,
        help=
        'Number of tanh neurons. When using --att_dim=None we apply a tanh directly to the att input. '
    )
    parser.add_argument('--att_hops',
                        default=30,
                        type=int,
                        help='Number of attention hops (viewpoints).')
    parser.add_argument(
        '--att_coef',
        default=0.,
        type=float,
        help='Influence of Frobenius divergence in the loss function.')

    opt = parser.parse_args()

    if opt.test_measure is None:
        opt.test_measure = opt.measure

    print(opt)

    logging.basicConfig(format='%(asctime)s %(message)s', level=logging.INFO)
    tb_logger.configure(opt.logger_name, flush_secs=5)

    tokenizer, vocab_size = data.get_tokenizer(opt.vocab_path, opt.data_name)
    opt.vocab_size = vocab_size

    collate_fn = 'collate_fn'

    # Load data loaders
    train_loader, val_loader = data.get_loaders(opt.data_name, tokenizer,
                                                opt.crop_size, opt.batch_size,
                                                opt.workers, opt, collate_fn)

    # Construct the model
    model = VSE(opt)
    print(model.txt_enc)

    # optionally resume from a checkpoint
    if opt.resume:
        if os.path.isfile(opt.resume):
            print("=> loading checkpoint '{}'".format(opt.resume))
            checkpoint = torch.load(opt.resume)
            start_epoch = checkpoint['epoch']
            best_rsum = checkpoint['best_rsum']
            model.load_state_dict(checkpoint['model'])
            # Eiters is used to show logs as the continuation of another
            # training
            model.Eiters = checkpoint['Eiters']
            print("=> loaded checkpoint '{}' (epoch {}, best_rsum {})".format(
                opt.resume, start_epoch, best_rsum))
            validate(opt, val_loader, model)
        else:
            print("=> no checkpoint found at '{}'".format(opt.resume))

    # Train the Model
    best_rsum = 0
    for epoch in range(opt.num_epochs):
        adjust_learning_rate(opt, model.optimizer, epoch)

        # train for one epoch
        train(opt, train_loader, model, epoch, val_loader)

        # evaluate on validation set
        rsum = validate(opt, val_loader, model)

        # remember best R@ sum and save checkpoint
        is_best = rsum > best_rsum
        best_rsum = max(rsum, best_rsum)
        save_checkpoint(
            {
                'epoch': epoch + 1,
                'model': model.state_dict(),
                'best_rsum': best_rsum,
                'opt': opt,
                'Eiters': model.Eiters,
            },
            is_best,
            prefix=opt.logger_name + '/')
Example #27
0
def create_model(opt, ema=False):
    model = VSE(opt, ema)

    return model
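
A short sketch of how such a factory is typically paired with an exponential-moving-average (mean-teacher style) update. This usage and the update_ema helper are assumptions rather than code from the listing; that VSE exposes img_enc and txt_enc modules follows the other examples above:

model = create_model(opt)                # student, updated by the optimizer
ema_model = create_model(opt, ema=True)  # teacher, updated only via EMA

def update_ema(model, ema_model, alpha=0.999):
    # Hypothetical helper: after each optimizer step, blend the teacher's
    # parameters toward the student's.
    params = list(model.img_enc.parameters()) + \
        list(model.txt_enc.parameters())
    ema_params = list(ema_model.img_enc.parameters()) + \
        list(ema_model.txt_enc.parameters())
    for ema_p, p in zip(ema_params, params):
        ema_p.data.mul_(alpha).add_(1 - alpha, p.data)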