# eval_spice.py: load ground-truth and generated captions from a checkpoint
# directory, score them with SPICE, and write the results back to disk.
import json

from pycocoevalcap.spice.spice import Spice


def main(ckpt_path, gts_name='/gts.json', res_name='/res.json'):
    print("eval_spice.py")
    print(ckpt_path)
    # gts: {image_id: [reference captions]}; res: {image_id: [candidate caption]}
    with open(ckpt_path + gts_name) as f:
        gts = json.load(f)
    with open(ckpt_path + res_name) as f:
        res = json.load(f)
    scorer = Spice()
    # score is the corpus-level SPICE; scores holds one entry per image
    score, scores = scorer.compute_score(gts, res)
    with open(ckpt_path + '/score.json', 'w') as f:
        json.dump(score, f)
    with open(ckpt_path + '/scores.json', 'w') as f:
        json.dump(scores, f)
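# A minimal usage sketch for main() above. The checkpoint path is hypothetical;
# the JSON layout is an assumption inferred from what Spice.compute_score
# expects: both files map an image id to a list of caption strings, e.g.
#   gts.json: {"391895": ["a man riding a motorcycle", "a person on a bike"]}
#   res.json: {"391895": ["a man rides a motorcycle"]}
if __name__ == '__main__':
    main('log/my_checkpoint')  # hypothetical checkpoint directory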
def spice(gts, res):
    # Score candidate captions (res) against references (gts) and print
    # the corpus-level SPICE.
    scorer = Spice()
    score, scores = scorer.compute_score(gts, res)
    print('spice = %s' % score)
def spice(gts, res, out_file):
    # Variant that appends the corpus-level SPICE to an open results file.
    scorer = Spice()
    score, scores = scorer.compute_score(gts, res)
    out_file.write('SPICE = %s' % score + '\n')
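# Usage sketch for the file-writing variant above; the captions and the output
# path are made-up examples. The keys of gts and res must match, with one
# candidate caption per image.
gts = {'1': ['a cat sits on a mat', 'a cat is on the mat']}
res = {'1': ['a cat on a mat']}
with open('spice_results.txt', 'a') as out_file:  # hypothetical output file
    spice(gts, res, out_file)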
# Sanity check: score the first reference caption of every training image
# against the full reference set, using SPICE.
import time
import os
import sys

sys.path.append("coco-caption")
from pycocotools.coco import COCO
from pycocoevalcap.spice.spice import Spice

train_path = '/home/yangxu/project/self-critical.pytorch/data/coco_annotations/captions_train2014.json'
val_path = '/home/yangxu/project/self-critical.pytorch/data/coco_annotations/captions_val2014.json'
coco_train = COCO(train_path)
coco_val = COCO(val_path)
coco_use = coco_train

image_ids = coco_use.getImgIds()
gts = {}
res = {}
for img_id in image_ids:
    # gts: all human annotations for this image
    gts[img_id] = []
    data_temp = coco_use.imgToAnns[img_id]
    for dt in data_temp:
        gts[img_id].append(dt['caption'])
    # res: treat the first reference caption as the "candidate"
    res[img_id] = [gts[img_id][0]]

scorer = Spice()
score, scores = scorer.compute_score(gts, res)
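# Note that res reuses the first reference while gts still contains it, so
# every candidate has an exact match among its own references and the score
# is an inflated human-level ceiling. A stricter sanity check would hold the
# candidate out of its reference set; a sketch of that variant, reusing the
# gts/res dicts built above:
gts_heldout = {img_id: caps[1:] for img_id, caps in gts.items() if len(caps) > 1}
res_heldout = {img_id: [gts[img_id][0]] for img_id in gts_heldout}
score_heldout, _ = Spice().compute_score(gts_heldout, res_heldout)
print('SPICE with self-match: %.4f, without: %.4f' % (score, score_heldout))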
import torch
import torch.nn.functional as F
import torchvision.transforms as transforms
from tqdm import tqdm

from pycocoevalcap.bleu.bleu import Bleu
from pycocoevalcap.meteor.meteor import Meteor
from pycocoevalcap.cider.cider import Cider
from pycocoevalcap.rouge.rouge import Rouge
from pycocoevalcap.spice.spice import Spice

# Assumed to be defined earlier in the script: CaptionDataset, encoder,
# decoder, device, word_map, rev_word_map, vocab_size, attrs_dim,
# data_folder, data_name, normalize.


def evaluate(beam_size):
    """
    Evaluation

    :param beam_size: beam size at which to generate captions for evaluation
    :return: BLEU, METEOR, CIDEr, ROUGE-L and SPICE scores
    """
    # DataLoader
    loader = torch.utils.data.DataLoader(
        CaptionDataset(data_folder, data_name, 'TEST',
                       transform=transforms.Compose([normalize])),
        batch_size=1, shuffle=True, num_workers=0, pin_memory=False)

    # TODO: Batched Beam Search
    # Therefore, do not use a batch_size greater than 1 - IMPORTANT!

    # Dicts to store references (true captions) and hypotheses (predictions) for each image.
    # If for n images we have references a, b, c... and one hypothesis each, we need:
    # references = {'0': [ref1a, ref1b, ref1c], '1': [ref2a, ref2b], ...},
    # hypotheses = {'0': [hyp1], '1': [hyp2], ...}
    references = dict()
    hypotheses = dict()

    # For each image
    for j, (image, caps, caplens, allcaps) in enumerate(
            tqdm(loader, desc="EVALUATING AT BEAM SIZE " + str(beam_size))):

        k = beam_size

        # Move to GPU device, if available
        image = image.to(device)  # (1, 3, 256, 256)

        attrs, encoder_out = encoder(image)
        attrs = attrs.expand(k, attrs_dim)  # one copy per beam
        enc_image_size = encoder_out.size(1)
        encoder_dim = encoder_out.size(3)

        # Flatten encoding, then treat the problem as having a batch size of k
        encoder_out = encoder_out.view(1, -1, encoder_dim)
        num_pixels = encoder_out.size(1)
        encoder_out = encoder_out.expand(k, num_pixels, encoder_dim)

        x0 = decoder.init_x0(attrs)

        # Tensor to store top k previous words at each step; now they're just <start>
        k_prev_words = torch.LongTensor([[word_map['<start>']]] * k).to(device)  # (k, 1)

        # Tensor to store top k sequences; now they're just <start>
        seqs = k_prev_words  # (k, 1)

        # Tensor to store top k sequences' scores; now they're just 0
        top_k_scores = torch.zeros(k, 1).to(device)  # (k, 1)

        # Lists to store completed sequences and scores
        complete_seqs = list()
        complete_seqs_scores = list()

        # Start decoding
        step = 1
        h1, c1, h2, c2 = decoder.init_hidden_state(attrs, encoder_out, zero=True)
        h1, c1 = decoder.decode_step1(x0, (h1, c1))

        # s is a number less than or equal to k, because sequences are removed
        # from this process once they hit <end>
        while True:
            embeddings = decoder.embedding(k_prev_words).squeeze(1)  # (s, embed_dim)
            h1, c1 = decoder.decode_step1(embeddings, (h1, c1))
            awe, _ = decoder.attention(encoder_out, h1, h2)
            # gate = decoder.sigmoid(decoder.f_beta(h2))
            # awe = gate * awe
            h2, c2 = decoder.decode_step2(torch.cat([embeddings, awe], dim=1), (h2, c2))

            scores = decoder.fc2(decoder.dropout2(h2))
            scores = F.log_softmax(scores, dim=1)

            # Add cumulative sequence scores
            scores = top_k_scores.expand_as(scores) + scores  # (s, vocab_size)

            # For the first step, all k points will have the same scores
            # (since same k previous words, h, c)
            if step == 1:
                top_k_scores, top_k_words = scores[0].topk(k, 0, True, True)  # (s)
            else:
                # Unroll and take the k largest of all s * vocab_size scores,
                # with their unrolled indices
                top_k_scores, top_k_words = scores.view(-1).topk(k, 0, True, True)  # (s)

            # Convert unrolled indices to actual indices of scores:
            # prev_word_inds recovers which beam each top score came from
            prev_word_inds = top_k_words // vocab_size  # (s)
            next_word_inds = top_k_words % vocab_size  # (s)

            # Add new words to sequences
            seqs = torch.cat([seqs[prev_word_inds], next_word_inds.unsqueeze(1)],
                             dim=1)  # (s, step+1)

            # Which sequences are incomplete (didn't reach <end>)?
            incomplete_inds = [ind for ind, next_word in enumerate(next_word_inds)
                               if next_word != word_map['<end>']]
            complete_inds = list(set(range(len(next_word_inds))) - set(incomplete_inds))

            # Set aside complete sequences
            if len(complete_inds) > 0:
                complete_seqs.extend(seqs[complete_inds].tolist())
                complete_seqs_scores.extend(top_k_scores[complete_inds])
            k -= len(complete_inds)  # reduce beam length accordingly

            # Proceed with incomplete sequences
            if k == 0:
                break
            seqs = seqs[incomplete_inds]
            h1 = h1[prev_word_inds[incomplete_inds]]
            c1 = c1[prev_word_inds[incomplete_inds]]
            h2 = h2[prev_word_inds[incomplete_inds]]
            c2 = c2[prev_word_inds[incomplete_inds]]
            encoder_out = encoder_out[prev_word_inds[incomplete_inds]]
            top_k_scores = top_k_scores[incomplete_inds].unsqueeze(1)
            k_prev_words = next_word_inds[incomplete_inds].unsqueeze(1)

            # Break if things have been going on too long
            if step > 50:
                break
            step += 1

        # Fall back to the best live beam if no sequence produced <end>
        # within the step limit (otherwise max() below would fail)
        if len(complete_seqs_scores) == 0:
            complete_seqs.extend(seqs.tolist())
            complete_seqs_scores.extend(top_k_scores.squeeze(1))
        i = complete_seqs_scores.index(max(complete_seqs_scores))
        seq = complete_seqs[i]

        # References
        img_caps = allcaps[0].tolist()
        img_captions = list(
            map(lambda c: [rev_word_map[w] for w in c
                           if w not in {word_map['<start>'], word_map['<end>'],
                                        word_map['<pad>']}],
                img_caps))  # remove <start>, <end> and pads
        img_caps = [' '.join(c) for c in img_captions]
        # print(img_caps)
        references[str(j)] = img_caps

        # Hypotheses
        hypothesis = [rev_word_map[w] for w in seq
                      if w not in {word_map['<start>'], word_map['<end>'],
                                   word_map['<pad>']}]
        hypothesis = [' '.join(hypothesis)]
        # print(hypothesis)
        hypotheses[str(j)] = hypothesis

        assert len(references) == len(hypotheses)

    # Calculate BLEU, METEOR, CIDEr, ROUGE-L and SPICE scores
    m1 = Bleu()
    m2 = Meteor()
    m3 = Cider()
    m4 = Rouge()
    m5 = Spice()
    (score1, scores1) = m1.compute_score(references, hypotheses)
    (score2, scores2) = m2.compute_score(references, hypotheses)
    (score3, scores3) = m3.compute_score(references, hypotheses)
    (score4, scores4) = m4.compute_score(references, hypotheses)
    (score5, scores5) = m5.compute_score(references, hypotheses)

    return score1, score2, score3, score4, score5
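# Hypothetical driver for evaluate(); the beam size is illustrative and the
# global model state (encoder, decoder, word maps, dataset paths) is assumed
# to have been set up earlier in the script.
if __name__ == '__main__':
    beam_size = 3
    bleu, meteor, cider, rouge, spice_score = evaluate(beam_size)
    print('BLEU-1..4: %s' % (bleu,))  # Bleu() returns a list of four scores
    print('METEOR: %s, CIDEr: %s, ROUGE-L: %s, SPICE: %s'
          % (meteor, cider, rouge, spice_score))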