Example #1
def test(args, split, modelfn=None, decoder=None, encoder=None):
    """Runs test on split=val/test with checkpoint file modelfn or loaded model_*"""
    # Device configuration
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # Find model directory
    if args.caption_model not in args.model_path:
        args.model_path += "_" + args.caption_model
        if args.finetune_cnn:
            args.model_path += "_finetune"

    # Get the best model path
    if encoder is None:
        modelfn = os.path.join(args.model_path, 'best_model.ckpt')

    # Load vocabulary
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    # Build data loader
    data_loader = get_loader(args, vocab, split, shuffle=False)

    max_tokens = args.max_tokens
    args.numwords = data_loader.dataset.numwords
    args.vocab_len = len(vocab)
    idx2word = vocab.idx2word
    num_batches = len(data_loader)
    print('[DEBUG] Running inference on %s with %d batches' %
          (split.upper(), num_batches))

    # Load model
    if modelfn is not None:
        print('[INFO] Loading checkpoint %s' % modelfn)
        encoder = ResNetFeats(args)
        decoder = models.setup(args)
        encoder.to(device)
        decoder.to(device)

        checkpoint = torch.load(modelfn, map_location=device)
        decoder.load_state_dict(checkpoint['decoder_state_dict'])
        encoder.load_state_dict(checkpoint['encoder_state_dict'])

    encoder.eval()
    decoder.eval()

    pred_captions = []
    for i, current_batch in enumerate(tqdm(data_loader)):

        images, captions, _, _, _, img_ids, _ = current_batch

        images = images.to(device)

        if args.caption_model == "lstm":
            features = encoder(images)
            sentence_ids = decoder.sample(features).cpu().numpy()

            # Convert word_ids to words
            for j in range(args.batch_size):
                sampled_caption = []
                word_raw_id = []
                for word_id in sentence_ids[j]:
                    word = idx2word[word_id]
                    word_raw_id.append(word_id)
                    if word == '<end>':
                        break
                    sampled_caption.append(word)
                word_raw_id = word_raw_id[1:]
                sentence = ' '.join(sampled_caption[1:])
                word_raw_id = [str(raw) for raw in word_raw_id]
                pred_captions.append({
                    'image_id': img_ids[j],
                    'caption': sentence,
                    "gt_caption": captions[j]
                })

        elif args.caption_model == "convcap":
            imgsfeats, imgsfc7 = encoder(images)
            _, featdim, feat_h, feat_w = imgsfeats.size()

            wordclass_feed = np.zeros((args.batch_size, max_tokens),
                                      dtype='int64')
            wordclass_feed[:, 0] = vocab('<start>')
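            # wordclass_feed holds the token ids fed back to the decoder: column 0
            # is seeded with <start>, and each greedy step in the loop below fills
            # in the next column with the word just predicted.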

            outcaps = np.empty((args.batch_size, 0)).tolist()
            # One raw-id list per sample, created before the decode loop so ids
            # accumulate across steps ([[]] * n would alias every sample to the
            # same list).
            word_raw_id = [[] for _ in range(args.batch_size)]

            for j in range(max_tokens - 1):
                wordclass = Variable(torch.from_numpy(wordclass_feed)).cuda()

                wordact, _ = decoder(imgsfeats, imgsfc7, wordclass)

                wordact = wordact[:, :, :-1]
                # batch_size*max_token_len-1, vocab_len
                wordact_t = wordact.permute(0, 2, 1).contiguous().view(
                    args.batch_size * (max_tokens - 1), -1)

                wordprobs = F.softmax(wordact_t, dim=1).cpu().data.numpy()
                wordids = np.argmax(wordprobs, axis=1)
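                # After the permute/view above, wordact_t rows are laid out as
                # (sample k, timestep t) -> row k * (max_tokens - 1) + t, so
                # wordids[j + k * (max_tokens - 1)] is sample k's greedy word for
                # the current step j.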

                for k in range(args.batch_size):
                    word = idx2word[wordids[j + k * (max_tokens - 1)]]
                    outcaps[k].append(word)
                    word_raw_id[k].append(wordids[j + k * (max_tokens - 1)])
                    if (j < max_tokens - 1):
                        wordclass_feed[k, j + 1] = wordids[j + k *
                                                           (max_tokens - 1)]

            for j in range(args.batch_size):
                num_words = len(outcaps[j])
                if '<end>' in outcaps[j]:
                    num_words = outcaps[j].index('<end>')
                outcap = ' '.join(outcaps[j][:num_words])

                current_word_raw_id = word_raw_id[j]  # [:num_words]
                current_word_raw_id = [str(raw) for raw in current_word_raw_id]
                pred_captions.append({
                    'image_id': img_ids[j],
                    'caption': outcap,
                    "gt_caption": captions[j]
                })

        elif args.caption_model == "transformer":
            features = encoder(images)
            sentence_ids = decoder.evaluate(features,
                                            args.max_tokens).cpu().numpy()

            # Convert word_ids to words
            for j in range(args.batch_size):
                sampled_caption = []
                word_raw_id = []
                for word_id in sentence_ids[j]:
                    word = idx2word[word_id]
                    word_raw_id.append(word_id)
                    if word == '<end>':
                        break
                    sampled_caption.append(word)
                sentence = ' '.join(sampled_caption[1:])
                word_raw_id = word_raw_id[1:]
                word_raw_id = [str(raw) for raw in word_raw_id]
                pred_captions.append({
                    'image_id': img_ids[j],
                    'caption': sentence,
                    "gt_caption": captions[j]
                })
    print(pred_captions[0:2])
    # Calculate scores
    scores = language_eval(args, pred_captions, args.model_path, split)

    if args.vis:
        print("[INFO] visualizing...")
        vis_folder = args.model_path.replace("models", "vis") + "_" + "_".join(
            os.path.basename(args.caption_path).split("_")[-2:])
        target = os.path.join(vis_folder, "imgs")
        if not os.path.exists(target):
            os.makedirs(target)
        data = data_loader.dataset.data
        '''
		# save img
		for pred in pred_captions:
			img_id = pred["image_id"]
			path = data.loadImgs(img_id)[0]['filename']
			img_path = os.path.join(args.image_root, split, path)
			os.system("cp {} {}".format(img_path, target))
		'''
        # in order to save space, we use the original location of img to show them
        for k in range(len(pred_captions)):
            pred = pred_captions[k]
            img_id = pred["image_id"]
            path = data.loadImgs(img_id)[0]['filename'].replace(".jpg", ".png")
            # need absolute path
            img_path = os.path.join(args.image_root, split, path)
            pred_captions[k]["img_path"] = img_path
        with open(os.path.join(vis_folder, "vis.json"), "w") as f:
            json.dump(pred_captions, f)

    encoder.train()
    decoder.train()

    return scores
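
A minimal driver sketch for the `test` function above, assuming an argparse-style command line. The flag names below simply mirror the attributes the function reads (model_path, vocab_path, caption_model, batch_size, max_tokens, finetune_cnn, vis); the real entry point in the source repository may expose different or additional options (for example whatever models.setup and get_loader require).

# Hypothetical driver; flag names are assumptions mirroring the attributes read above.
import argparse

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--model_path', default='models/coco')     # assumed layout
    parser.add_argument('--vocab_path', default='data/vocab.pkl')  # assumed layout
    parser.add_argument('--caption_model', default='lstm',
                        choices=['lstm', 'convcap', 'transformer'])
    parser.add_argument('--batch_size', type=int, default=32)
    parser.add_argument('--max_tokens', type=int, default=20)
    parser.add_argument('--finetune_cnn', action='store_true')
    parser.add_argument('--vis', action='store_true')
    args = parser.parse_args()

    # Evaluate the best checkpoint on the validation split and print the scores.
    print(test(args, 'val'))
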
Example #2
def test(args, split, modelfn=None, model_convcap=None, model_imgcnn=None):
    """Runs test on split=val/test with checkpoint file modelfn or loaded model_*"""

    t_start = time.time()
    data = coco_loader(args.coco_root, split=split, ncap_per_img=1)
    print('[DEBUG] Loading %s data ... %f secs' %
          (split, time.time() - t_start))

    data_loader = DataLoader(dataset=data, num_workers=args.nthreads,\
      batch_size=args.batchsize, shuffle=False, drop_last=True)

    batchsize = args.batchsize
    max_tokens = data.max_tokens
    num_batches = np.int_(np.floor((len(data.ids) * 1.) / batchsize))
    print('[DEBUG] Running inference on %s with %d batches' %
          (split, num_batches))

    if (modelfn is not None):
        model_imgcnn = Resnet101Feats()
        model_imgcnn.cuda()

        model_convcap = convcap(data.numwords,
                                args.num_layers,
                                is_attention=args.attention)
        model_convcap.cuda()

        print('[DEBUG] Loading checkpoint %s' % modelfn)
        checkpoint = torch.load(modelfn)
        model_convcap.load_state_dict(checkpoint['state_dict'])
        model_imgcnn.load_state_dict(checkpoint['img_state_dict'])
    # Otherwise evaluate the model_convcap / model_imgcnn passed in by the caller.

    model_imgcnn.train(False)
    model_convcap.train(False)

    pred_captions = []
    #Test epoch
    for batch_idx, (imgs, _, _, _, img_ids) in \
      tqdm(enumerate(data_loader), total=num_batches):

        imgs = imgs.view(batchsize, 3, 224, 224)

        imgs_v = Variable(imgs.cuda())
        imgsfeats, imgsfc7 = model_imgcnn(imgs_v)
        _, featdim, feat_h, feat_w = imgsfeats.size()

        wordclass_feed = np.zeros((batchsize, max_tokens), dtype='int64')
        wordclass_feed[:, 0] = data.wordlist.index('<S>')

        outcaps = np.empty((batchsize, 0)).tolist()

        for j in range(max_tokens - 1):
            wordclass = Variable(torch.from_numpy(wordclass_feed)).cuda()

            wordact, _ = model_convcap(imgsfeats, imgsfc7, wordclass)

            wordact = wordact[:, :, :-1]
            wordact_t = wordact.permute(0, 2, 1).contiguous().view(
                batchsize * (max_tokens - 1), -1)

            wordprobs = F.softmax(wordact_t, dim=1).cpu().data.numpy()
            wordids = np.argmax(wordprobs, axis=1)

            for k in range(batchsize):
                word = data.wordlist[wordids[j + k * (max_tokens - 1)]]
                outcaps[k].append(word)
                if (j < max_tokens - 1):
                    wordclass_feed[k,
                                   j + 1] = wordids[j + k * (max_tokens - 1)]

        for j in range(batchsize):
            num_words = len(outcaps[j])
            if 'EOS' in outcaps[j]:
                num_words = outcaps[j].index('EOS')
            outcap = ' '.join(outcaps[j][:num_words])
            pred_captions.append({'image_id': img_ids[j], 'caption': outcap})

    scores = language_eval(pred_captions, args.model_dir, split)

    model_imgcnn.train(True)
    model_convcap.train(True)

    return scores
Example #3
def test_beam(args, split, modelfn=None): 
  """Sample generation with beam-search"""

  t_start = time.time()
  data = coco_loader(args.coco_root, split=split, ncap_per_img=1)
  print('[DEBUG] Loading %s data ... %f secs' % (split, time.time() - t_start))

  data_loader = DataLoader(dataset=data, num_workers=args.nthreads,\
    batch_size=args.batchsize, shuffle=False, drop_last=True)

  batchsize = args.batchsize
  max_tokens = data.max_tokens
  num_batches = np.int_(np.floor((len(data.ids)*1.)/batchsize))
  print('[DEBUG] Running test (w/ beam search) on %d batches' % num_batches)
  
  model_imgcnn = Vgg16Feats()
  model_imgcnn.cuda() 

  model_convcap = convcap(data.numwords, args.num_layers, is_attention=args.attention)
  model_convcap.cuda()

  print('[DEBUG] Loading checkpoint %s' % modelfn)
  checkpoint = torch.load(modelfn)
  model_convcap.load_state_dict(checkpoint['state_dict'])
  model_imgcnn.load_state_dict(checkpoint['img_state_dict'])

  model_imgcnn.train(False) 
  model_convcap.train(False)

  pred_captions = []
  for batch_idx, (imgs, _, _, _, img_ids) in \
    tqdm(enumerate(data_loader), total=num_batches):
    
    imgs = imgs.view(batchsize, 3, 224, 224)

    imgs_v = Variable(imgs.cuda())
    imgsfeats, imgsfc7 = model_imgcnn(imgs_v)

    b, f_dim, f_h, f_w = imgsfeats.size()
    imgsfeats = imgsfeats.unsqueeze(1).expand(\
      b, args.beam_size, f_dim, f_h, f_w)
    imgsfeats = imgsfeats.contiguous().view(\
      b*args.beam_size, f_dim, f_h, f_w)
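    # The conv features were just tiled beam_size times along the batch dimension
    # (b -> b * beam_size) so that every beam hypothesis decodes against its own
    # copy of the image features; imgsfc7 is tiled the same way by repeat_img below.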

    beam_searcher = beamsearch(args.beam_size, batchsize, max_tokens)
  
    wordclass_feed = np.zeros((args.beam_size*batchsize, max_tokens), dtype='int64')
    wordclass_feed[:,0] = data.wordlist.index('<S>') 
    imgsfc7 = repeat_img(args, imgsfc7)
    outcaps = np.empty((batchsize, 0)).tolist()

    for j in range(max_tokens-1):
      wordclass = Variable(torch.from_numpy(wordclass_feed)).cuda()

      wordact, _ = model_convcap(imgsfeats, imgsfc7, wordclass)
      wordact = wordact[:,:,:-1]
      wordact_j = wordact[..., j]

      beam_indices, wordclass_indices = beam_searcher.expand_beam(wordact_j)  
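      # expand_beam scores the current step and returns, for each surviving
      # hypothesis, the previous beam it extends (beam_indices) and the word it
      # appends (wordclass_indices); the feeds and features are re-indexed below.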

      if len(beam_indices) == 0 or j == (max_tokens-2): # Beam search is over.
        generated_captions = beam_searcher.get_results()
        for k in range(batchsize):
            g = generated_captions[:, k]
            outcaps[k] = [data.wordlist[x] for x in g]
      else:
        wordclass_feed = wordclass_feed[beam_indices]
        imgsfc7 = imgsfc7.index_select(0, Variable(torch.cuda.LongTensor(beam_indices)))
        imgsfeats = imgsfeats.index_select(0, Variable(torch.cuda.LongTensor(beam_indices)))
        for i, wordclass_idx in enumerate(wordclass_indices):
          wordclass_feed[i, j+1] = wordclass_idx

    for j in range(batchsize):
      num_words = len(outcaps[j]) 
      if 'EOS' in outcaps[j]:
        num_words = outcaps[j].index('EOS')
      outcap = ' '.join(outcaps[j][:num_words])
      pred_captions.append({'image_id': img_ids[j], 'caption': outcap})

  scores = language_eval(pred_captions, args.model_dir, split)

  model_imgcnn.train(True) 
  model_convcap.train(True)

  return scores
Example #4
def test_beam(args, split, modelfn=None):
    """Sample generation with beam-search"""

    t_start = time.time()
    data = coco_loader(args.coco_root, split=split, ncap_per_img=1)
    print('[DEBUG] Loading %s data ... %f secs' %
          (split, time.time() - t_start))

    data_loader = DataLoader(dataset=data, num_workers=args.nthreads,\
      batch_size=args.batchsize, shuffle=False, drop_last=True)

    batchsize = args.batchsize
    max_tokens = data.max_tokens
    num_batches = np.int_(np.floor((len(data.ids) * 1.) / batchsize))
    print('[DEBUG] Running test (w/ beam search) on %d batches' % num_batches)

    #model_imgcnn = Vgg16Feats()
    #model_imgcnn = Resnet101Feats()
    model_imgcnn = Resnet152Feats()
    model_imgcnn.cuda()

    model_convcap = convcap(data.numwords,
                            args.num_layers,
                            is_attention=args.attention)
    model_convcap.cuda()

    print('[DEBUG] Loading checkpoint %s' % modelfn)
    checkpoint = torch.load(modelfn)
    model_convcap.load_state_dict(checkpoint['state_dict'])
    model_imgcnn.load_state_dict(checkpoint['img_state_dict'])

    model_imgcnn.train(False)
    model_convcap.train(False)

    pred_captions = []
    for batch_idx, (imgs, _, _, _, img_ids) in \
      tqdm(enumerate(data_loader), total=num_batches):

        imgs = imgs.view(batchsize, 3, 224, 224)

        imgs_v = Variable(imgs.cuda())
        imgsfeats, imgsfc7 = model_imgcnn(imgs_v)

        b, f_dim, f_h, f_w = imgsfeats.size()
        imgsfeats = imgsfeats.unsqueeze(1).expand(\
          b, args.beam_size, f_dim, f_h, f_w)
        imgsfeats = imgsfeats.contiguous().view(\
          b*args.beam_size, f_dim, f_h, f_w)

        beam_searcher = beamsearch(args.beam_size, batchsize, max_tokens)

        wordclass_feed = np.zeros((args.beam_size * batchsize, max_tokens),
                                  dtype='int64')
        wordclass_feed[:, 0] = data.wordlist.index('<S>')
        imgsfc7 = repeat_img(args, imgsfc7)
        outcaps = np.empty((batchsize, 0)).tolist()

        for j in range(max_tokens - 1):
            wordclass = Variable(torch.from_numpy(wordclass_feed)).cuda()

            wordact, _ = model_convcap(imgsfeats, imgsfc7, wordclass)
            wordact = wordact[:, :, :-1]
            wordact_j = wordact[..., j]

            beam_indices, wordclass_indices = beam_searcher.expand_beam(
                wordact_j)

            if len(beam_indices) == 0 or j == (max_tokens -
                                               2):  # Beam search is over.
                generated_captions = beam_searcher.get_results()
                for k in range(batchsize):
                    g = generated_captions[:, k]
                    outcaps[k] = [data.wordlist[x] for x in g]
            else:
                wordclass_feed = wordclass_feed[beam_indices]
                imgsfc7 = imgsfc7.index_select(
                    0, Variable(torch.cuda.LongTensor(beam_indices)))
                imgsfeats = imgsfeats.index_select(
                    0, Variable(torch.cuda.LongTensor(beam_indices)))
                for i, wordclass_idx in enumerate(wordclass_indices):
                    wordclass_feed[i, j + 1] = wordclass_idx

        for j in range(batchsize):
            num_words = len(outcaps[j])
            if 'EOS' in outcaps[j]:
                num_words = outcaps[j].index('EOS')
            outcap = ' '.join(outcaps[j][:num_words])
            pred_captions.append({'image_id': img_ids[j], 'caption': outcap})

    scores = language_eval(pred_captions, args.model_dir, split)

    model_imgcnn.train(True)
    model_convcap.train(True)

    return scores
Example #5
def test(args, split, modelfn=None, model_convcap=None, model_imgcnn=None):
  """Runs test on split=val/test with checkpoint file modelfn or loaded model_*"""

  t_start = time.time()
  data = coco_loader(args.coco_root, split=split, ncap_per_img=1)
  print('[DEBUG] Loading %s data ... %f secs' % (split, time.time() - t_start))

  data_loader = DataLoader(dataset=data, num_workers=args.nthreads,\
    batch_size=args.batchsize, shuffle=False, drop_last=True)

  batchsize = args.batchsize
  max_tokens = data.max_tokens
  num_batches = np.int_(np.floor((len(data.ids)*1.)/batchsize))
  print('[DEBUG] Running inference on %s with %d batches' % (split, num_batches))

  if(modelfn is not None):
    model_imgcnn = Vgg16Feats()
    model_imgcnn.cuda() 

    model_convcap = convcap(data.numwords, args.num_layers, is_attention=args.attention)
    model_convcap.cuda()

    print('[DEBUG] Loading checkpoint %s' % modelfn)
    checkpoint = torch.load(modelfn)
    model_convcap.load_state_dict(checkpoint['state_dict'])
    model_imgcnn.load_state_dict(checkpoint['img_state_dict'])
  # Otherwise evaluate the model_convcap / model_imgcnn passed in by the caller.

  model_imgcnn.train(False) 
  model_convcap.train(False)

  pred_captions = []
  #Test epoch
  for batch_idx, (imgs, _, _, _, img_ids) in \
    tqdm(enumerate(data_loader), total=num_batches):
    
    imgs = imgs.view(batchsize, 3, 224, 224)

    imgs_v = Variable(imgs.cuda())
    imgsfeats, imgsfc7 = model_imgcnn(imgs_v)
    _, featdim, feat_h, feat_w = imgsfeats.size()
  
    wordclass_feed = np.zeros((batchsize, max_tokens), dtype='int64')
    wordclass_feed[:,0] = data.wordlist.index('<S>') 

    outcaps = np.empty((batchsize, 0)).tolist()

    for j in range(max_tokens-1):
      wordclass = Variable(torch.from_numpy(wordclass_feed)).cuda()

      wordact, _ = model_convcap(imgsfeats, imgsfc7, wordclass)

      wordact = wordact[:,:,:-1]
      wordact_t = wordact.permute(0, 2, 1).contiguous().view(batchsize*(max_tokens-1), -1)

      wordprobs = F.softmax(wordact_t, dim=1).cpu().data.numpy()
      wordids = np.argmax(wordprobs, axis=1)

      for k in range(batchsize):
        word = data.wordlist[wordids[j+k*(max_tokens-1)]]
        outcaps[k].append(word)
        if(j < max_tokens-1):
          wordclass_feed[k, j+1] = wordids[j+k*(max_tokens-1)]

    for j in range(batchsize):
      num_words = len(outcaps[j]) 
      if 'EOS' in outcaps[j]:
        num_words = outcaps[j].index('EOS')
      outcap = ' '.join(outcaps[j][:num_words])
      pred_captions.append({'image_id': img_ids[j], 'caption': outcap})

  scores = language_eval(pred_captions, args.model_dir, split)

  model_imgcnn.train(True) 
  model_convcap.train(True)

  return scores 
Example #6
def test(opt, EncoderRNN, DecoderCNN, Convcap, itow, wtoi, modelfn=None):

    '''
    input : opt, model modules (EncoderRNN, DecoderCNN, Convcap), vocab maps (itow, wtoi), checkpoint path
    output : scores
    '''
    t_start = time.time()

    test_data = VideoDataset(opt, 'test')
    test_loader = DataLoader(test_data, batch_size=opt['batch_size'],
                             num_workers=30, shuffle=False)
    print('[DEBUG] Loading test data ... %f secs' % (time.time() - t_start))

    batchsize = opt['batch_size']
    cap_size = opt['max_len']
    nbatches = np.int_(np.floor((len(test_data) * 1.) / batchsize))
    bestscore = .0
    batchsize_cap = batchsize * 1
    max_tokens = opt['max_len']

    if modelfn is not None:
        encoder = EncoderRNN.EncoderRNN(opt['dim_vid'], opt['dim_hidden'],
                                        bidirectional=opt['bidirectional'],
                                        rnn_cell=opt['rnn_type']).cuda()
        decoder = DecoderCNN.DecoderCNN(test_data.get_vocab_size()).cuda()
        convcap = Convcap.Convcap(encoder, decoder).cuda()
        print('[DEBUG] Loading checkpoint %s' % modelfn)
        checkpoint = torch.load(modelfn)
        convcap.load_state_dict(checkpoint['state_dict'])
        
    convcap.train(False)
    pred_captions = []
    itr = 0
    for data in test_loader:
        print("iteration " + str(itr))
        # print(data['labels'].data[0])
        print("\n")
        print("gt\n")
        for i in data['labels'].data[0]:
            print(itow[int(i)])
        itr += 1
        vid_feat = Variable(data['c3d_feats']).cuda()
        labels = Variable(data['labels'].type(torch.LongTensor)).cuda()
        mask = Variable(data['masks']).cpu()
        word_embed = Variable(data['word_embed']).cuda()
        vid_id = data['video_ids']
        wordclass_feed = np.zeros((batchsize, max_tokens), dtype='int64')
        wordclass_feed[:, 0] = wtoi['<sos>']  # index of <sos>
        outcaps = np.empty((batchsize, 0)).tolist()
        x_outcaps = np.empty((batchsize, 0)).tolist()
        for j in range(max_tokens - 1):
            wordclass = Variable(torch.from_numpy(wordclass_feed)).cuda()
            # Decode once with the predicted prefix (wordclass) and once with the
            # ground-truth labels (a teacher-forced pass used only for the debug
            # printout below).
            wordact = convcap(vid_feat, wordclass, word_embed, 'test')
            x = convcap(vid_feat, labels, word_embed, 'test')
            x = x[:, :, :-1]
            x_t = x.permute(0, 2, 1).contiguous().view(
                batchsize * (max_tokens - 1), -1)
            x_prob = F.softmax(x_t, dim=1).cpu().data.numpy()

            wordact = wordact[:, :, :-1]
            wordact_t = wordact.permute(0, 2, 1).contiguous().view(
                batchsize * (max_tokens - 1), -1)
            wordprobs = F.softmax(wordact_t, dim=1).cpu().data.numpy()
            x_id = np.argmax(x_prob, axis=1)
            wordids = np.argmax(wordprobs, axis=1)
            probs = np.max(wordprobs, axis=1)
            
            for k in range(batchsize):
                word = itow[wordids[j + k * (max_tokens - 1)]]
                x_word = itow[x_id[j + k * (max_tokens - 1)]]
                outcaps[k].append(word)
                x_outcaps[k].append(x_word)
                if j < max_tokens - 1:
                    wordclass_feed[k, j + 1] = wordids[j + k * (max_tokens - 1)]
        for j in range(batchsize):
            num_words = len(outcaps[j])
            x_n_words = len(x_outcaps[j])
            if 'eos' in x_outcaps[j]:
                x_n_words = x_outcaps[j].index('eos')
            # Truncate the teacher-forced caption at its own 'eos'.
            x_outcap = ' '.join(x_outcaps[j][:x_n_words])
            if 'eos' in outcaps[j]:
                num_words = outcaps[j].index('eos')
            outcap = ' '.join(outcaps[j][:num_words])
            # Per-sample video id; [5:] drops the id prefix for evaluation.
            pred_captions.append({'vid_id': vid_id[j][5:], 'caption': outcap})
        print("-" * 78)
        print("videoID \t" + str(vid_id))
        print("caption \n")
        print(x_outcap)
        print("-" * 78)
        
        
    scores = language_eval(pred_captions, '/home/sanjay/Documents/Video_convcap/output', 'test')

    return scores
Example #7
def test(args, split, modelfn=None, model_convcap=None, model_imgcnn=None):
    """Runs test on split=val/test with checkpoint file modelfn or loaded model_*"""

    t_start = time.time()
    data = coco_loader(args.data_root, split=split, ncap_per_img=1)
    print('[DEBUG] Loading %s data ... %f secs' %
          (split, time.time() - t_start))

    data_loader = DataLoader(dataset=data, num_workers=args.nthreads,\
      batch_size=args.batchsize, shuffle=False, drop_last=False)

    batchsize = args.batchsize
    max_tokens = data.max_tokens
    num_batches = np.int_(np.floor((len(data.ids) * 1.) / batchsize))
    if num_batches == 0: num_batches = 1

    print('[DEBUG] Running inference on %s with %d batches' %
          (split, num_batches))

    model_imgcnn = resnet50().cuda()

    model_imgcnn.load_state_dict(rename_keys(
        torch.load(modelfn)['state_dict']))

    word_embeddings = None
    if args.glove:
        word_embeddings = get_glove_vectors(args.ge, data.wordlist)

    model_convcap = convcap(data.numwords,
                            args.num_layers,
                            is_attention=args.attention,
                            embedding_weights=word_embeddings).cuda()

    if modelfn is not None:
        print('[DEBUG] Loading checkpoint %s' % modelfn)
        checkpoint = torch.load(modelfn)
        model_convcap.load_state_dict(checkpoint['state_dict'])
        model_imgcnn.load_state_dict(checkpoint['img_state_dict'])

    model_imgcnn.eval()
    model_convcap.eval()

    pred_captions = []
    attns = []
    pred_tokens = []
    all_img_ids = []
    loss = 0.

    for batch_idx, (imgs, _, wordclass_t, mask, img_ids) in \
      tqdm(enumerate(data_loader), total=num_batches):
        batchsize = len(imgs)
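        # Buffer for per-position logits; it is filled during the greedy decode
        # below and reused afterwards to compute the masked cross-entropy loss.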
        wordact_t_final = [None for _ in range(batchsize * (max_tokens - 1))]

        imgs = imgs.view(batchsize, 3, 128, 128)

        imgs_v = Variable(imgs.cuda())
        imgsfeats, imgsfc7 = model_imgcnn(imgs_v)

        _, featdim, feat_h, feat_w = imgsfeats.size()

        wordclass_feed = np.zeros((batchsize, max_tokens), dtype='int64')
        wordclass_feed[:, 0] = data.wordlist.index('<S>')

        outcaps = np.empty((batchsize, 0)).tolist()

        for j in range(max_tokens - 1):
            wordclass = Variable(torch.from_numpy(wordclass_feed)).cuda()

            wordact, attn = model_convcap(imgsfeats, imgsfc7, wordclass)
            if args.mode == 'attvis':
                attn = attn.view(batchsize, max_tokens, feat_h, feat_w)

            wordact = wordact[:, :, :-1]
            wordact_t = wordact.permute(0, 2, 1).contiguous().view(
                batchsize * (max_tokens - 1), -1)

            wordprobs = F.softmax(wordact_t, dim=1).cpu().data.numpy()
            wordids = np.argmax(wordprobs, axis=1)

            for k in range(batchsize):
                word = data.wordlist[wordids[j + k * (max_tokens - 1)]]
                outcaps[k].append(word)
                if (j < max_tokens - 1):
                    wordclass_feed[k,
                                   j + 1] = wordids[j + k * (max_tokens - 1)]

                wordact_t_final[j + k *
                                (max_tokens - 1)] = wordact_t[j + k *
                                                              (max_tokens - 1)]

        wordclass_t = wordclass_t.view(batchsize, max_tokens)
        mask = mask.view(batchsize, max_tokens)
        wordclass_t = wordclass_t[:, 1:]
        mask = mask[:, 1:].contiguous()
        wordact_t_final = torch.stack(wordact_t_final).cpu()
        wordclass_t = wordclass_t.contiguous().view(
            batchsize * (max_tokens - 1), 1)
        maskids = torch.nonzero(mask.view(-1)).numpy().reshape(-1)
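        # maskids indexes the non-padded target positions, so the running loss
        # below averages the cross-entropy over real caption tokens only.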

        loss += F.cross_entropy(wordact_t_final[maskids, ...], \
          wordclass_t[maskids, ...].contiguous().view(maskids.shape[0])).data.item()

        for j in range(batchsize):
            num_words = len(outcaps[j])
            if 'EOS' in outcaps[j]:
                num_words = outcaps[j].index('EOS')
            outcap = ' '.join(outcaps[j][:num_words])
            if args.mode == 'attvis':
                pred_tokens.append(outcaps[j][:num_words])
            pred_captions.append({
                'image_id': img_ids[j].item(),
                'caption': outcap
            })

    print('{} split testing loss is: {}'.format(split,
                                                (loss * 1.) / (batch_idx + 1)))

    scores = language_eval(pred_captions, args.model_dir, split)
    if args.mode == 'test':
        labelnames = ['Happy', 'Surprise', 'Fear', 'Disgust', 'Angry', 'Sad']
        label_wise_caps = {labelnames[i] : [p for p in pred_captions if data.labels[p['image_id']][i]]\
                           for i in range(6)}
        for k, preds in label_wise_caps.items():
            print(k)
            language_eval(preds, args.model_dir, split)

    return scores