import json

from pycocoevalcap.spice.spice import Spice


def main(ckpt_path, gts_name='/gts.json', res_name='/res.json'):
    print("eval_spice.py")
    print(ckpt_path)
    with open(ckpt_path + gts_name) as f:
        gts = json.load(f)
    with open(ckpt_path + res_name) as f:
        res = json.load(f)
    scorer = Spice()
    score, scores = scorer.compute_score(gts, res)
    with open(ckpt_path + '/score.json', 'w') as f:
        json.dump(score, f)
    with open(ckpt_path + '/scores.json', 'w') as f:
        json.dump(scores, f)
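# --- Added usage sketch (not from the original snippet) ---
# Assumption: gts.json and res.json map each image id to a list of caption strings,
# which is the format Spice.compute_score expects. Ids and captions below are invented.
from pycocoevalcap.spice.spice import Spice

example_gts = {
    "1": ["a man riding a motorcycle on a dirt road",
          "a person rides a motorbike outdoors"],
    "2": ["a kitchen with a stove and a sink"],
}
example_res = {
    "1": ["a man rides a motorcycle"],
    "2": ["a kitchen with a sink"],
}
example_score, example_scores = Spice().compute_score(example_gts, example_res)
print(example_score)   # corpus-level SPICE F-score
print(example_scores)  # per-image breakdown (precision/recall/F by semantic category)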
def compute_scores(gts, res):
    """
    Performs the MS COCO evaluation using the Python 3 implementation
    (https://github.com/salaniz/pycocoevalcap)

    :param gts: Dictionary with the image ids and their gold captions
    :param res: Dictionary with the image ids and their generated captions
    :print: Evaluation score (the mean of the scores of all the instances) for each measure
    """
    # Preprocess captions
    gts = preprocess_captions(gts)
    res = preprocess_captions(res)

    # Set up scorers
    scorers = [
        (Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]),
        (Meteor(), "METEOR"),
        (Rouge(), "ROUGE_L"),
        (Spice(), "SPICE"),
        (Cider(), "CIDEr")
    ]

    # Compute score for each metric
    for scorer, method in scorers:
        print("Computing", scorer.method(), "...")
        score, scores = scorer.compute_score(gts, res)
        if type(method) == list:
            for sc, m in zip(score, method):
                print("%s : %0.3f" % (m, sc))
        else:
            print("%s : %0.3f" % (method, score))
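# --- Added usage sketch (not from the original snippet) ---
# Assumption: preprocess_captions() is defined elsewhere in the same module and the
# scorer classes above are imported from pycocoevalcap; the example data is invented.
example_gts = {"img1": ["a dog runs on the beach", "a dog playing in the sand"]}
example_res = {"img1": ["a dog is running on a beach"]}
compute_scores(example_gts, example_res)  # prints Bleu_1..4, METEOR, ROUGE_L, SPICE, CIDEr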
def main(eval_caption_file, output, zh=False): df = pd.read_json(eval_caption_file) if zh: refs = df.groupby("key")["tokens"].apply(list).to_dict() else: refs = df.groupby("key")["caption"].apply(list).to_dict() from pycocoevalcap.bleu.bleu import Bleu from pycocoevalcap.cider.cider import Cider from pycocoevalcap.rouge.rouge import Rouge scorer = Bleu(zh=zh) bleu_scores = coco_score(copy.deepcopy(refs), scorer) scorer = Cider(zh=zh) cider_score = coco_score(copy.deepcopy(refs), scorer) scorer = Rouge(zh=zh) rouge_score = coco_score(copy.deepcopy(refs), scorer) if not zh: from pycocoevalcap.meteor.meteor import Meteor scorer = Meteor() meteor_score = coco_score(copy.deepcopy(refs), scorer) from pycocoevalcap.spice.spice import Spice scorer = Spice() spice_score = coco_score(copy.deepcopy(refs), scorer) with open(output, "w") as f: for n in range(4): f.write("BLEU-{}: {:6.3f}\n".format(n + 1, bleu_scores[n])) f.write("CIDEr: {:6.3f}\n".format(cider_score)) f.write("ROUGE: {:6.3f}\n".format(rouge_score)) if not zh: f.write("Meteor: {:6.3f}\n".format(meteor_score)) f.write("SPICE: {:6.3f}\n".format(spice_score))
def __init__(self, ground_truth_filenames=None, prediction_filename=None,
             tious=None, max_proposals=1000,
             prediction_fields=PREDICTION_FIELDS, verbose=False):
    # Check that the gt and submission files exist and load them
    if len(tious) == 0:
        raise IOError('Please input a valid tIoU.')
    if not ground_truth_filenames:
        raise IOError('Please input a valid ground truth file.')
    if not prediction_filename:
        raise IOError('Please input a valid prediction file.')

    self.verbose = verbose
    self.tious = tious
    self.max_proposals = max_proposals
    self.pred_fields = prediction_fields
    self.ground_truths = self.import_ground_truths(ground_truth_filenames)
    self.prediction = self.import_prediction(prediction_filename)
    self.tokenizer = PTBTokenizer()

    # Set up scorers. If not verbose, we only use the one we're testing on: CIDEr
    if self.verbose:
        self.scorers = [(Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]),
                        (Meteor(), "METEOR"),
                        (Rouge(), "ROUGE_L"),
                        (Cider('corpus'), "CIDEr"),
                        (Spice(), "SPICE")]
    else:
        self.scorers = [(Cider('corpus'), "CIDEr")]
def init_scorer(cached_tokens):
    global CiderD_scorer
    CiderD_scorer = CiderD_scorer or CiderD(df=cached_tokens)
    global Bleu_scorer
    Bleu_scorer = Bleu_scorer or Bleu(4)
    global Spice_scorer
    Spice_scorer = Spice_scorer or Spice()
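# --- Added usage sketch (not from the original snippet) ---
# Assumption: the module initialises CiderD_scorer, Bleu_scorer and Spice_scorer to None
# at import time (as in common self-critical training setups), and the argument names a
# pickled document-frequency file prepared for CIDEr-D; the file name is an example only.
init_scorer('coco-train-idxs')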
def get_dcc_scores(self):
    imgIds = self.params['image_id']
    # imgIds = self.coco.getImgIds()
    gts = {}
    res = {}
    for imgId in imgIds:
        gts[imgId] = self.coco.imgToAnns[imgId]
        res[imgId] = self.cocoRes.imgToAnns[imgId]

    tokenizer = PTBTokenizer()
    gts = tokenizer.tokenize(gts)
    res = tokenizer.tokenize(res)

    scorers = [(Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]),
               (Meteor(), "METEOR"),
               (Rouge(), "ROUGE_L"),
               (Cider(df='noc_test_freq'), "CIDEr"),
               (Spice(), "SPICE")]

    score_dict = {}
    for scorer, method in scorers:
        print('computing %s score...' % (scorer.method()))
        score, scores = scorer.compute_score(gts, res)
        if type(method) == list:
            for sc, scs, m in zip(score, scores, method):
                score_dict[m] = sc
                print("%s: %0.3f" % (m, sc))
        else:
            score_dict[method] = score
            print("%s: %0.3f" % (method, score))
    return score_dict
def score(num, DIR):
    print("Testing results on epoch ", num, " in DIR=", DIR)
    print("Loading coco annotations")
    dataDir = '.'
    dataType = 'val2014'
    algName = 'fakecap'
    annFile = '%s/annotations/captions_%s.json' % (dataDir, dataType)
    subtypes = ['results', 'evalImgs', 'eval']
    [resFile, evalImgsFile, evalFile] = \
        ['%s/results/captions_%s_%s_%s.json' % (dataDir, dataType, algName, subtype)
         for subtype in subtypes]
    coco_anns = COCO(annFile)
    print("COCO anns imported")

    path = DIR + str(num) + '_test_result.tar.gz'
    with open(path, 'rb') as f:
        save = pickle.load(f)

    cocoRes = {}
    coco = {}
    for key, val in save.items():
        reslst = val[u'res']
        res = []
        for data in reslst:
            if data != u'<SEND>':
                res.append(data)
            else:
                break
        res = res[1:]
        # print "RES: ", reslst
        # print "ANN: ", val[u'ann']
        # res = [word for word in res if word != u'<SEND>'][1:]
        # print "RES FIXED: ", res
        if len(res) == 0:
            res = [u'a']  # just not to be empty, and it has a low idf
        cocoRes[key] = [{u'caption': ' '.join(res)}]
        # coco[key] = [{u'caption': ' '.join(val[u'ann'][1:-1])}]
        coco[key] = coco_anns.imgToAnns[key]

    print('examples')
    for key in list(coco.keys())[:5]:
        print("IMG_NUM=", key)
        print("Annotation: ", '\n'.join(
            [coco[key][i][u'caption'] for i in range(len(coco[key]))]))
        print("Generated data: ", ' '.join(save[key][u'res']))
        print("Cleared generation: ", cocoRes[key][0][u'caption'])

    print('tokenization...')
    tokenizer = PTBTokenizer()
    gts = tokenizer.tokenize(coco)
    res = tokenizer.tokenize(cocoRes)

    print('setting up scorers...')
    scorers = [(Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]),
               (Meteor(), "METEOR"),
               (Rouge(), "ROUGE_L"),
               (Cider(), "CIDEr"),
               (Spice(), "SPICE")]
    for scorer, method in scorers:
        print('computing %s score...' % (scorer.method()))
        score, scores = scorer.compute_score(gts, res)
        print(score)
def score_all(ref, hypo):
    scorers = [(Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]),
               (Meteor(), "METEOR"),
               (Rouge(), "ROUGE_L"),
               (Cider(), "CIDEr"),
               (Spice(), "SPICE")]
    final_scores = {}
    for scorer, method in scorers:
        score, scores = scorer.compute_score(ref, hypo)
        if type(score) == list:
            for m, s in zip(method, score):
                final_scores[m] = s
        else:
            final_scores[method] = score
    return final_scores
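# --- Added usage sketch (not from the original snippet) ---
# Both dicts must share keys and hold lists of already-tokenized caption strings
# (run PTBTokenizer first when starting from raw annotations); the data is invented.
ref = {"0": ["a cat sits on a mat", "a cat resting on a rug"]}
hypo = {"0": ["a cat is sitting on a mat"]}
print(score_all(ref, hypo))
# -> {'Bleu_1': ..., 'Bleu_4': ..., 'METEOR': ..., 'ROUGE_L': ..., 'CIDEr': ..., 'SPICE': ...}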
def evaluate(self): res = {} for r in self.rests: res[str(r['image_id'])] = [{'caption': r['caption']}] gts = {} for imgId in self.annos: gts[str(imgId)] = [{'caption': c} for c in self.annos[imgId]] # ================================================= # Set up scorers # ================================================= # print('tokenization...') tokenizer = self.Tokenizer() gts = tokenizer.tokenize(gts) res = tokenizer.tokenize(res) # ================================================= # Set up scorers # ================================================= # print('setting up scorers...') use_scorers = self.use_scorers scorers = [] if 'Bleu' in use_scorers: scorers.append((Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"])) if 'METEOR' in use_scorers: scorers.append((Meteor(), "METEOR")) if 'ROUGE_L' in use_scorers: scorers.append((Rouge(), "ROUGE_L")) if 'CIDEr' in use_scorers: scorers.append((Cider(), "CIDEr")) if 'SPICE' in use_scorers: scorers.append((Spice(), "SPICE")) # ================================================= # Compute scores # ================================================= for scorer, method in scorers: # print('computing %s score...'%(scorer.method())) score, scores = scorer.compute_score(gts, res) if type(method) == list: for sc, scs, m in zip(score, scores, method): self.setEval(sc, m) self.setImgToEvalImgs(scs, gts.keys(), m) # print("%s: %0.1f" % (m, sc*100)) else: self.setEval(score, method) self.setImgToEvalImgs(scores, gts.keys(), method) # print("%s: %0.1f" % (method, score*100)) self.setEvalImgs()
def main(eval_caption_file, output, zh=False, embedding_path=None): df = pd.read_json(eval_caption_file) if zh: refs = df.groupby("key")["tokens"].apply(list).to_dict() else: refs = df.groupby("key")["caption"].apply(list).to_dict() from pycocoevalcap.bleu.bleu import Bleu from pycocoevalcap.cider.cider import Cider from pycocoevalcap.rouge.rouge import Rouge scorer = Bleu(zh=zh) bleu_scores = coco_score(copy.deepcopy(refs), scorer) print(bleu_scores) scorer = Cider(zh=zh) cider_score = coco_score(copy.deepcopy(refs), scorer) print(cider_score) scorer = Rouge(zh=zh) rouge_score = coco_score(copy.deepcopy(refs), scorer) print(rouge_score) if not zh: from pycocoevalcap.meteor.meteor import Meteor scorer = Meteor() meteor_score = coco_score(copy.deepcopy(refs), scorer) from pycocoevalcap.spice.spice import Spice scorer = Spice() spice_score = coco_score(copy.deepcopy(refs), scorer) diverse_score = diversity_score(refs, zh) with open(embedding_path, "rb") as f: ref_embeddings = pickle.load(f) bert_score = embedding_score(ref_embeddings, zh) with open(output, "w") as f: for n in range(4): f.write("BLEU-{}: {:6.3f}\n".format(n + 1, bleu_scores[n])) f.write("CIDEr: {:6.3f}\n".format(cider_score)) f.write("ROUGE: {:6.3f}\n".format(rouge_score)) if not zh: f.write("Meteor: {:6.3f}\n".format(meteor_score)) f.write("SPICE: {:6.3f}\n".format(spice_score)) f.write("SentenceBert: {:6.3f}\n".format(bert_score)) f.write("Diversity: {:6.3f}\n".format(diverse_score))
def coco_evaluate(self, path1: str, path2: str, kaldi_stream: str, kaldi_scp: str, caption_file: str, max_length: int = None, output: str = "coco_scores.txt"): key2pred = self._ensemble(path1, path2, kaldi_stream, kaldi_scp, max_length) caption_df = pd.read_json(caption_file) caption_df["key"] = caption_df["filename"].apply( lambda x: os.path.splitext(x)[0]) key2refs = caption_df.groupby(["key"])["caption"].apply(list).to_dict() from pycocoevalcap.bleu.bleu import Bleu from pycocoevalcap.rouge.rouge import Rouge from pycocoevalcap.cider.cider import Cider from pycocoevalcap.meteor.meteor import Meteor from pycocoevalcap.spice.spice import Spice f = open(output, "w") scorer = Bleu(n=4) score, scores = scorer.compute_score(key2refs, key2pred) for n in range(4): f.write("Bleu-{}: {:6.3f}\n".format(n + 1, score[n])) scorer = Rouge() score, scores = scorer.compute_score(key2refs, key2pred) f.write("ROUGE: {:6.3f}\n".format(score)) scorer = Cider() score, scores = scorer.compute_score(key2refs, key2pred) f.write("CIDEr: {:6.3f}\n".format(score)) scorer = Meteor() score, scores = scorer.compute_score(key2refs, key2pred) f.write("Meteor: {:6.3f}\n".format(score)) scorer = Spice() score, scores = scorer.compute_score(key2refs, key2pred) f.write("Spice: {:6.3f}\n".format(score)) f.close()
def __init__(self, coco, cocoRes, metric):
    super().__init__(coco, cocoRes)
    self.scores = []
    if metric == "Bleu":
        self.scores = [(Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"])]
    elif metric == "METEOR":
        self.scores = [(Meteor(), "METEOR")]
    elif metric == "ROUGE_L":
        self.scores = [(Rouge(), "ROUGE_L")]
    elif metric == "CIDEr":
        self.scores = [(Cider(), "CIDEr")]
    elif metric == "SPICE":
        self.scores = [(Spice(), "SPICE")]
    else:
        raise ValueError(
            f'Unsupported image caption metric: {metric}. '
            'Supported metrics: [Bleu, METEOR, ROUGE_L, CIDEr, SPICE]'
        )
def evaluate(self): gts = self.coco res = self.coco_res # ================================================= # Set up scorers # ================================================= print("tokenization...") tokenizer = PTBTokenizer() # gts = {k:[' '.join(v)] for k,v in tokenizer.tokenize(gts).items()} # res = {k:[' '.join(v)] for k,v in tokenizer.tokenize(res).items()} gts = tokenizer.tokenize(gts) # res = {k: v[:1] for k, v in tokenizer.tokenize(res).items()} res = tokenizer.tokenize(res) # breakpoint() # ================================================= # Set up scorers # ================================================= print("setting up scorers...") scorers = [ (Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]), (Meteor(), "METEOR"), (Rouge(), "ROUGE_L"), (RougeF1(), "ROUGE_F1"), (Cider(), "CIDEr"), (Spice(), "SPICE"), ] # ================================================= # Compute scores # ================================================= for scorer, method in scorers: print("computing %s score..." % (scorer.method())) score, scores = scorer.compute_score(gts, res) if type(method) == list: for sc, scs, m in zip(score, scores, method): self.set_eval(sc, m) self.set_img_to_eval_imgs(scs, gts.keys(), m) print(f"{m}: {sc:0.3f}") else: self.set_eval(score, method) self.set_img_to_eval_imgs(scores, gts.keys(), method) print(f"{method}: {score:0.3f}") self.set_eval_imgs()
def evaluate(self): imgIds = self.params['image_id'] gts = self.gts res = self.res # ================================================= # Set up scorers # ================================================= print('tokenization...') tokenizer = PTBTokenizer() gts = tokenizer.tokenize(gts) res = tokenizer.tokenize(res) # ================================================= # Set up scorers # ================================================= print('setting up scorers...') scorers = [ (Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]), (Meteor(), "METEOR"), (Rouge(), "ROUGE_L"), (Cider(), "CIDEr"), (Spice(), "SPICE"), (WMD(), "WMD"), ] # ================================================= # Compute scores # ================================================= eval = {} for scorer, method in scorers: print('computing %s score...' % (scorer.method())) score, scores = scorer.compute_score(gts, res) if type(method) == list: for sc, scs, m in zip(score, scores, method): self.setEval(sc, m) self.setImgToEvalImgs(scs, imgIds, m) print("%s: %0.3f" % (m, sc)) else: self.setEval(score, method) self.setImgToEvalImgs(scores, imgIds, method) print("%s: %0.3f" % (method, score)) self.setEvalImgs()
def score_func(ref, hypo, idx=None):
    """
    ref, dictionary of reference sentences (id, sentence)
    hypo, dictionary of hypothesis sentences (id, sentence)
    score, dictionary of scores
    """
    scorers = [(Spice(), "SPICE"),
               (Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]),
               (Rouge(), "ROUGE_L"),
               (Cider(), "CIDEr"),
               (Meteor(), "METEOR")]
    final_scores = {}
    if idx is not None:
        scorers = [scorers[idx]]
    for scorer, method in scorers:
        score, scores = scorer.compute_score(ref, hypo)
        print('score', method, score)
        if type(score) == list:
            for m, s in zip(method, score):
                final_scores[m] = s
        else:
            final_scores[method] = score
    return final_scores
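# --- Added usage sketch (not from the original snippet) ---
# idx selects a single scorer from the list above (0 = SPICE, 1 = BLEU, 2 = ROUGE_L,
# 3 = CIDEr, 4 = METEOR); the captions are invented and assumed tokenized.
ref = {"0": ["a group of people standing around a market"]}
hypo = {"0": ["people stand in a market"]}
spice_only = score_func(ref, hypo, idx=0)  # {'SPICE': ...}
all_metrics = score_func(ref, hypo)        # all five metrics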
def evaluate(self): gts = self.coco res = self.coco_res # ================================================= # Set up scorers # ================================================= print("tokenization...") tokenizer = PTBTokenizer() gts = tokenizer.tokenize(gts) res = tokenizer.tokenize(res) # ================================================= # Set up scorers # ================================================= print("setting up scorers...") scorers = [ (Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]), (Meteor(), "METEOR"), (Rouge(), "ROUGE_L"), (Cider(), "CIDEr"), (Spice(), "SPICE"), ] # ================================================= # Compute scores # ================================================= for scorer, method in scorers: print("computing %s score..." % (scorer.method())) score, scores = scorer.compute_score(gts, res) if type(method) == list: for sc, scs, m in zip(score, scores, method): self.set_eval(sc, m) self.set_img_to_eval_imgs(scs, gts.keys(), m) print("%s: %0.3f" % (m, sc)) else: self.set_eval(score, method) self.set_img_to_eval_imgs(scores, gts.keys(), method) print("%s: %0.3f" % (method, score)) self.set_eval_imgs()
def __init__(self, ground_truth_filename=None, prediction_filename=None,
             verbose=False):
    # Check that the gt and submission files exist and load them
    if not ground_truth_filename:
        raise IOError('Please input a valid ground truth file.')
    if not prediction_filename:
        raise IOError('Please input a valid prediction file.')

    self.verbose = verbose
    self.ground_truth = self.import_ground_truth(ground_truth_filename)
    self.prediction = self.import_prediction(prediction_filename)
    self.tokenizer = PTBTokenizer()

    # Set up scorers. If not verbose, we only use the one we're testing on: METEOR
    if self.verbose:
        self.scorers = [(Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]),
                        (Meteor(), "METEOR"),
                        (Rouge(), "ROUGE_L"),
                        (Cider(), "CIDEr"),
                        (Spice(), "SPICE")]
    else:
        self.scorers = [(Meteor(), "METEOR")]
def spice():
    # Relies on module-level gts/res caption dicts being defined before this is called.
    scorer = Spice()
    score, scores = scorer.compute_score(gts, res)
    print('spice = %s' % score)
def validate(val_loader, encoder, decoder, criterion, tok_en, tok_zh): ''' Performs one epoch's validation. ''' decoder.eval() # eval mode (no dropout or batchnorm) if encoder is not None: encoder.eval() references_en = list( ) # references (true captions) for calculating corpus BLEU-4 score hypotheses_en = list() # hypotheses (predictions) references_zh = list( ) # references (true captions) for calculating corpus BLEU-4 score hypotheses_zh = list() # hypotheses (predictions) avg_loss = 0 with torch.no_grad(): # Batches for cnt, (encap, zhcap, video, caplen_en, caplen_zh, enrefs, zhrefs) in enumerate(val_loader, 1): encap, zhcap, video, caplen_en, caplen_zh = encap.cuda( ), zhcap.cuda(), video.cuda(), caplen_en.cuda(), caplen_zh.cuda() # Forward prop. init_hidden, vid_out = encoder( video ) # fea: decoder input from encoder, should be of size (mb, encout_dim) = (mb, decoder_dim) scores_en, pred_lengths_en, scores_zh, pred_lengths_zh = decoder.inference( encap, zhcap, init_hidden, vid_out, args.MAX_INPUT_LENGTH) # Since we decoded starting with <start>, the targets are all words after <start>, up to <end> targets_en = encap[:, 1:] scores_copy_en = scores_en.clone() targets_zh = zhcap[:, 1:] scores_copy_zh = scores_zh.clone() # Calculate loss loss_en = criterion( scores_en[:, 1:].contiguous().view(-1, decoder.vocab_size_en), targets_en.contiguous().view(-1)) loss_zh = criterion( scores_zh[:, 1:].contiguous().view(-1, decoder.vocab_size_zh), targets_zh.contiguous().view(-1)) # Hypotheses _, preds_en = torch.max(scores_copy_en, dim=2) preds_en = preds_en.tolist() temp_preds_en = list() for j, p in enumerate(preds_en): temp_preds_en.append( preds_en[j][1:pred_lengths_en[j]]) # remove pads and idx-0 preds_en = temp_preds_en hypotheses_en.extend(preds_en) # preds= [1,2,3] enrefs = [list(map(int, i.split())) for i in enrefs ] # tgtrefs = [[1,2,3], [2,4,3], [1,4,5,]] for r in enrefs: references_en.append([r]) assert len(references_en) == len(hypotheses_en) _, preds_zh = torch.max(scores_copy_zh, dim=2) preds_zh = preds_zh.tolist() temp_preds_zh = list() for j, p in enumerate(preds_zh): temp_preds_zh.append( preds_zh[j][1:pred_lengths_zh[j]]) # remove pads and idx-0 preds_zh = temp_preds_zh hypotheses_zh.extend(preds_zh) # preds= [1,2,3] zhrefs = [list(map(int, i.split())) for i in zhrefs ] # tgtrefs = [[1,2,3], [2,4,3], [1,4,5,]] for r in zhrefs: references_zh.append([r]) assert len(references_zh) == len(hypotheses_zh) avg_loss += loss_en.item() + loss_zh.item() # Since we decoded starting with <start>, the targets are all words after <start>, up to <end> # Calculate loss # Hypotheses # Calculate metrics avg_loss = avg_loss / cnt scorers = { "Bleu": Bleu(4), "Meteor": Meteor(), "Rouge": Rouge(), "Cider": Cider(), "Spice": Spice() } gts_en = {} res_en = {} for i in range(len(references_en)): gts_en[i] = [tok_en.decode_sentence(references_en[i][0])] res_en[i] = [tok_en.decode_sentence(hypotheses_en[i])] scores = {} for name, scorer in scorers.items(): score, all_scores = scorer.compute_score(gts_en, res_en) if isinstance(score, list): for i, sc in enumerate(score, 1): scores[name + str(i)] = sc else: scores[name] = score print("Score of EN:") print(scores) """ gts_zh = {} res_zh = {} for i in range(len(references_zh)): gts_zh[i] = [tok_zh.decode_sentence(references_zh[i][0])] res_zh[i] = [tok_zh.decode_sentence(hypotheses_zh[i])] scores = {} for name, scorer in scorers.items(): score, all_scores = scorer.compute_score(gts_zh, res_zh) if isinstance(score, list): for i, sc in enumerate(score, 1): 
scores[name + str(i)] = sc else: scores[name] = score print("Score of ZH:") print(scores) """ corpbleu_en = corpus_bleu(references_en, hypotheses_en) sentbleu_en = 0 for i, (r, h) in enumerate(zip(references_en, hypotheses_en), 1): sentbleu_en += sentence_bleu(r, h, smoothing_function=cc.method7) sentbleu_en /= i return avg_loss, sentbleu_en, corpbleu_en
def spice(gts, res):
    scorer = Spice()
    score, scores = scorer.compute_score(gts, res)
    out_file.write('SPICE = %s' % score + '\n')
from pycocoevalcap.spice.spice import Spice
from pycocoevalcap.tokenizer.ptbtokenizer import PTBTokenizer

from utils.logger import setup_logger

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="evaluate")
    parser.add_argument("--gt_caption", type=str)
    parser.add_argument("--pd_caption", type=str)
    parser.add_argument("--save_dir", type=str)
    args = parser.parse_args()

    logger = setup_logger("evaluate", args.save_dir, 0)
    ptb_tokenizer = PTBTokenizer()

    scorers = [(Cider(), "C"), (Spice(), "S"),
               (Bleu(4), ["B1", "B2", "B3", "B4"]),
               (Meteor(), "M"), (Rouge(), "R")]

    logger.info(f"loading ground-truths from {args.gt_caption}")
    with open(args.gt_caption) as f:
        gt_captions = json.load(f)
    gt_captions = ptb_tokenizer.tokenize(gt_captions)

    logger.info(f"loading predictions from {args.pd_caption}")
    with open(args.pd_caption) as f:
        pred_dict = json.load(f)
    pd_captions = dict()
    for image_id, v in pred_dict.items():
        pd_captions[str(image_id)] = [{"caption": v['caption'][0]['caption']}]
    tmp = ptb_tokenizer.tokenize(pd_captions)
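    # --- Added continuation sketch (not from the original snippet; the original is
    # truncated right after tokenizing the predictions). One plausible way to finish
    # the evaluation with the scorers defined above:
    pd_captions = tmp
    for scorer, name in scorers:
        score, _ = scorer.compute_score(gt_captions, pd_captions)
        if isinstance(name, list):  # Bleu returns one score per n-gram order
            for n, s in zip(name, score):
                logger.info(f"{n}: {s:.4f}")
        else:
            logger.info(f"{name}: {score:.4f}")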
def eval_model(root_path, inputs): """ Computes evaluation metrics of the model results against the human annotated captions Parameters: ------------ root_path: str the path to the data folder which contains the raw folder inputs: str the name of the caption file to process Returns: ------------ None, it saves the overall score and individual score files under output path """ # load data try: with open(f'{root_path}/json/{inputs}.json', 'r') as data: ref_data = json.load(data) except: raise (f'Make sure that human-annotated captions are store in', f'{root_path}/json/{inputs}.json.') try: with open(f'{root_path}/json/{inputs}_model_caption.json', 'r') as data: results = json.load(data) except: raise ('Please call generate_captions.py to generate captions first.') # format the inputs img_id_dict = {'image_id': list(ref_data.keys())} imgIds = img_id_dict['image_id'] gts = {} res = {} required_key = {'raw', 'imgid', 'sentid'} for imgId in imgIds: caption_list = ref_data[imgId]['sentences'] caption_list_sel = [] for i in caption_list: lst = { key: value for key, value in i.items() if key in required_key } lst['caption'] = lst.pop('raw') lst['image_id'] = lst.pop('imgid') lst['id'] = lst.pop('sentid') caption_list_sel.append(lst) gts[imgId] = caption_list_sel generated = [{'caption': results[imgId]}] res[imgId] = generated # tokenize print('tokenization...') tokenizer = PTBTokenizer() gts = tokenizer.tokenize(gts) res = tokenizer.tokenize(res) # compute scores scorers = [ (Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]), (Meteor(), "METEOR"), (Rouge(), "ROUGE_L"), (Cider(), "CIDEr"), (Spice(), "SPICE"), (usc_sim(), "USC_similarity"), ] score_dict = {} scores_dict = {} for scorer, method in scorers: print('computing %s score...' % (scorer.method())) score, scores = scorer.compute_score(gts, res) if type(method) == list: for sc, scs, m in zip(score, scores, method): score_dict[m] = sc scores_dict[m] = scs else: score_dict[method] = score scores_dict[method] = scores # format the individual scores img_score_dict = {} for n in range(len(res)): img_name = list(res.keys())[n] img_score_dict[img_name] = {} for metrics in scores_dict.keys(): if metrics == 'SPICE': img_score_dict[img_name][metrics] = scores_dict[metrics][n][ 'All']['f'] else: img_score_dict[img_name][metrics] = scores_dict[metrics][n] output_path = f'{root_path}/score' # save the overall score and individual image score if not os.path.exists(output_path): os.makedirs(output_path, exist_ok=True) with open(f'{output_path}/{inputs}_score.json', 'w') as file: json.dump(score_dict, file) with open(f'{output_path}/{inputs}_img_score.json', 'w') as file: json.dump(img_score_dict, file) assert os.path.isfile(f'{output_path}/{inputs}_score.json'),\ "Average scores are not saved." assert os.path.isfile(f'{output_path}/{inputs}_img_score.json'),\ "Individual scores are not saved."
def train(model, criterion, optimizer, train_loader, val_loader, opt, rl_criterion=None): infos = { 'iter': 0, 'epoch': 0, 'start_epoch': 0, 'best_score': float('-inf'), 'best_iter': 0, 'best_epoch': opt.max_epochs } checkpoint_checked = False rl_training = False seq_per_img = train_loader.get_seq_per_img() infos_history = {} if os.path.exists(opt.start_from): if os.path.isdir(opt.start_from): # loading the same model file at a different experiment dir start_from_file = os.path.join(opt.start_from, os.path.basename(opt.model_file)) else: start_from_file = opt.start_from logger.info('Loading state from: %s', start_from_file) checkpoint = torch.load(start_from_file) model.load_state_dict(checkpoint['model']) infos = checkpoint['infos'] infos['start_epoch'] = infos['epoch'] checkpoint_checked = True # this epoch is already checked else: logger.info('No checkpoint found! Training from the scratch') if opt.use_rl == 1 and opt.use_rl_after == 0: opt.use_rl_after = infos['epoch'] opt.use_cst_after = infos['epoch'] train_loader.set_current_epoch(infos['epoch']) if opt.grounder_type in ['niuc', 'iuc']: # get class weights one_hot_sums = None totes = 0 cur_index = train_loader.get_current_index() train_loader.reset() ep = infos['epoch'] while True: data = train_loader.get_batch() labels_svo = data['labels_svo'] one_hot = torch.clamp( torch.sum(torch.nn.functional.one_hot( labels_svo, num_classes=model.vocab_size), axis=1), 0, 1) one_hot[:, 0] = 0 # make the padding index 0 totes += one_hot.shape[0] if one_hot_sums is None: one_hot_sums = torch.sum(one_hot, axis=0) else: one_hot_sums += torch.sum(one_hot, axis=0) if ep < train_loader.get_current_epoch(): one_hot_negs = -one_hot_sums + totes pos_weight = one_hot_negs.type(torch.FloatTensor) / ( 1 + one_hot_sums.type(torch.FloatTensor)) pos_weight = pos_weight.cuda() train_loader.set_current_index(index=cur_index) break while True: t_start = time.time() model.train() data = train_loader.get_batch() feats = data['feats'] bfeats = data['bfeats'] labels = data['labels'] masks = data['masks'] labels_svo = data['labels_svo'] masks_svo = data['masks_svo'] if torch.cuda.is_available(): feats = [feat.cuda() for feat in feats] bfeats = [bfeat.cuda() for bfeat in bfeats] labels = labels.cuda() masks = masks.cuda() labels_svo = labels_svo.cuda() masks_svo = masks_svo.cuda() # implement scheduled sampling opt.ss_prob = 0 if opt.use_ss == 1 and infos['epoch'] >= opt.use_ss_after: annealing_prob = opt.ss_k / \ (opt.ss_k + np.exp((infos['epoch'] - opt.use_ss_after) / opt.ss_k)) opt.ss_prob = min(1 - annealing_prob, opt.ss_max_prob) model.set_ss_prob(opt.ss_prob) if opt.use_rl == 1 and infos[ 'epoch'] >= opt.use_rl_after and not rl_training: logger.info('Using RL objective...') rl_training = True bcmr_scorer = { 'Bleu_4': Bleu(), 'CIDEr': Cider(df=opt.train_cached_tokens), 'METEOR': Meteor(), 'ROUGE_L': Rouge(), 'SPICE': Spice() }[opt.eval_metric] #logger.info('loading gt refs: %s', train_loader.cocofmt_file) #gt_refs = utils.load_gt_refs(train_loader.cocofmt_file) mixer_from = opt.mixer_from if opt.use_mixer == 1 and rl_training: #annealing_mixer = opt.ss_k / \ # (opt.ss_k + np.exp((infos['epoch'] - opt.use_rl_after) / opt.ss_k)) #annealing_mixer = int(round(annealing_mixer * opt.seq_length)) # -1 for annealing if opt.mixer_from == -1: annealing_mixer = opt.seq_length - int( np.ceil((infos['epoch'] - opt.use_rl_after + 1) / float(opt.mixer_descrease_every))) mixer_from = max(1, annealing_mixer) model.set_mixer_from(mixer_from) scb_captions = opt.scb_captions if 
opt.use_cst == 1 and rl_training: # if opt.use_cst == 1 and opt.ss_k == 0, # then do not using annealing, but the fixed scb_captions provided #annealing_robust = opt.ss_k / \ # (opt.ss_k + np.exp((infos['epoch'] - opt.use_rl_after) / opt.ss_k)) #annealing_robust = int(round((1 - annealing_robust) * seq_per_img)) # do not use robust before fully mixed # if opt.use_mixer == 1 and mixer_from > 1: # opt.use_cst_after = infos['epoch'] # if opt.scb_captions is -1, then use the annealing value, # otherwise, use the set value if opt.scb_captions == -1: annealing_robust = int( np.ceil((infos['epoch'] - opt.use_cst_after + 1) / float(opt.cst_increase_every))) scb_captions = min(annealing_robust, seq_per_img - 1) optimizer.zero_grad() model.set_seq_per_img(seq_per_img) if rl_training: # sampling from model distribution # model_res, logprobs = model.sample( # feats, {'sample_max': 0, 'expand_feat': opt.expand_feat, 'temperature': 1}) # using mixer pred, model_res, logprobs, pred_svo, res_svo, logprobs_svo = model( feats, bfeats, labels, labels_svo) if opt.use_cst == 0: # greedy decoding baseline in SCST paper greedy_baseline, _, _, _ = model.sample( [Variable(f.data, volatile=True) for f in feats], [Variable(f.data, volatile=True) for f in bfeats], { 'sample_max': 1, 'expand_feat': opt.expand_feat }) if opt.use_cst == 1: bcmrscores = data['bcmrscores'] reward, m_score, g_score = utils.get_cst_reward( model_res, data['gts'], bcmr_scorer, bcmrscores=bcmrscores, expand_feat=opt.expand_feat, seq_per_img=train_loader.get_seq_per_img(), scb_captions=scb_captions, scb_baseline=opt.scb_baseline, use_eos=opt.use_eos, use_mixer=opt.use_mixer) else: # use greedy baseline by default, compute self-critical reward reward, m_score, g_score = utils.get_self_critical_reward( model_res, greedy_baseline, data['gts'], bcmr_scorer, expand_feat=opt.expand_feat, seq_per_img=train_loader.get_seq_per_img(), use_eos=opt.use_eos) loss = rl_criterion( model_res, logprobs, Variable(torch.from_numpy(reward).float().cuda(), requires_grad=False)) loss_svo = criterion(pred_svo, labels_svo, torch.ones(labels.shape).cuda()) loss = loss + (opt.labda / 10.0) * loss_svo else: pred, _, _, pred_svo, svo_it, svo_gath = model( feats, bfeats, labels, labels_svo) loss_cap = criterion(pred, labels[:, 1:], masks[:, 1:], bcmrscores=torch.from_numpy( data['bcmrscores'].astype( np.float32)).cuda()) if opt.grounder_type in ['None', 'none']: loss = loss_cap else: if opt.grounder_type in ['niuc', 'iuc']: # unordered svo_criterion = torch.nn.BCEWithLogitsLoss( pos_weight=pos_weight) concepts_one_hot = torch.clamp( torch.sum(torch.nn.functional.one_hot( labels_svo, num_classes=model.vocab_size), axis=1), 0, 1) loss_svo = svo_criterion( pred_svo[:, 0], concepts_one_hot.type(torch.FloatTensor).cuda() ) # pred_svo[: 0] undoes the repeat at the end of non_iterative_grounder() else: loss_svo = criterion(pred_svo, labels_svo, torch.ones(labels.shape).cuda()) # loss_svo = criterion(pred_svo, labels_svo, masks_svo) if random.random() < 0.01: # compare the svos during training print('---------------------') print(utils.decode_sequence(opt.vocab, pred.argmax(-1))) print(utils.decode_sequence(opt.vocab, labels_svo)[0]) print(utils.decode_sequence(opt.vocab, svo_it)[0]) loss = loss_cap + (opt.labda / 10.0) * loss_svo loss.backward() clip_grad_norm_(model.parameters(), opt.grad_clip) optimizer.step() # memReport() del pred, feats, labels, masks, labels_svo torch.cuda.empty_cache() infos['TrainLoss'] = loss.item() infos['CAPTrainLoss'] = loss_cap.item() if 
opt.grounder_type not in ['None', 'none']: infos['SVOTrainLoss'] = loss_svo.item() else: infos['SVOTrainLoss'] = 0 infos['mixer_from'] = mixer_from infos['scb_captions'] = scb_captions if infos['iter'] % opt.print_log_interval == 0: elapsed_time = time.time() - t_start log_info = [('Epoch', infos['epoch']), ('Iter', infos['iter']), ('Loss', infos['TrainLoss']), ('CAP Loss', infos['CAPTrainLoss']), ('SVO Loss', infos['SVOTrainLoss'])] if rl_training: log_info += [('Reward', np.mean(reward[:, 0])), ('{} (m)'.format(opt.eval_metric), m_score), ('{} (b)'.format(opt.eval_metric), g_score)] if opt.use_ss == 1: log_info += [('ss_prob', opt.ss_prob)] if opt.use_mixer == 1: log_info += [('mixer_from', mixer_from)] if opt.use_cst == 1: log_info += [('scb_captions', scb_captions)] log_info += [('Time', elapsed_time)] logger.info( '%s', '\t'.join(['{}: {}'.format(k, v) for (k, v) in log_info])) infos['iter'] += 1 if infos['epoch'] < train_loader.get_current_epoch(): infos['epoch'] = train_loader.get_current_epoch() checkpoint_checked = False learning_rate = utils.adjust_learning_rate( opt, optimizer, infos['epoch'] - infos['start_epoch']) logger.info('===> Learning rate: %f: ', learning_rate) # checkpoint_checked = False # if 1: todo debuging, jump straight to validation if (infos['epoch'] >= opt.save_checkpoint_from and infos['epoch'] % opt.save_checkpoint_every == 0 and not checkpoint_checked): # evaluate the validation performance results = validate(model, criterion, val_loader, opt) logger.info( 'Validation output: %s', json.dumps(results['scores'], indent=4, sort_keys=True)) # infos.update(results['scores']) # todo added training set eval to check for overfitting cur_index = train_loader.get_current_index() train_loader.reset() results_train = validate(model, criterion, train_loader, opt, max_iters=20, type='train') train_loader.set_current_index(index=cur_index) for k, v in results_train['scores'].items(): results['scores']['Train_' + k] = v logger.info( 'Training output: %s', json.dumps(results_train['scores'], indent=4, sort_keys=True)) infos.update(results['scores']) check_model(model, opt, infos, infos_history) checkpoint_checked = True if (infos['epoch'] >= opt.max_epochs or infos['epoch'] - infos['best_epoch'] > opt.max_patience): logger.info('>>> Terminating...') break return infos
import time
import os
import sys

sys.path.append("coco-caption")
from pycocotools.coco import COCO
from pycocoevalcap.spice.spice import Spice

train_path = '/home/yangxu/project/self-critical.pytorch/data/coco_annotations/captions_train2014.json'
val_path = '/home/yangxu/project/self-critical.pytorch/data/coco_annotations/captions_val2014.json'

coco_train = COCO(train_path)
coco_val = COCO(val_path)
coco_use = coco_train

image_ids = coco_use.getImgIds()
gts = {}
res = {}
for img_id in image_ids:
    gts[img_id] = []
    data_temp = coco_use.imgToAnns[img_id]
    for dt in data_temp:
        gts[img_id].append(dt['caption'])
    # Use the first reference caption of each image as its "prediction".
    res[img_id] = []
    res[img_id].append(gts[img_id][0])

scorer = Spice()
score, scores = scorer.compute_score(gts, res)
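# --- Added follow-up (not from the original snippet) ---
# Because the candidate for each image is its own first reference caption, the printed
# value is best read as an upper-bound sanity check rather than a model score.
print("SPICE (first reference vs. full reference set): %.3f" % score)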
def evaluate(self, experiment_path: str, feature_file: str, feature_scp: str, caption_file: str, caption_output: str = "eval_output.json", score_output: str = "scores.txt", **kwargs): """kwargs: {'max_length': int, 'method': str, 'beam_size': int}""" dump = torch.load(os.path.join(experiment_path, "saved.pth"), map_location="cpu") # Load previous training config config = dump["config"] vocabulary = torch.load(config["vocab_file"]) model = self._get_model(config, vocabulary) model.load_state_dict(dump["model"]) # Some scaler (sklearn standardscaler) scaler = dump["scaler"] zh = config["zh"] model = model.to(self.device) dataset = SJTUDatasetEval(feature=feature_file, eval_scp=feature_scp, transform=scaler.transform) dataloader = torch.utils.data.DataLoader(dataset, shuffle=False, collate_fn=collate_fn((1, )), batch_size=32, num_workers=0) caption_df = pd.read_json(caption_file, dtype={"key": str}) if zh: key2refs = caption_df.groupby("key")["tokens"].apply( list).to_dict() else: key2refs = caption_df.groupby("key")["caption"].apply( list).to_dict() model.eval() key2pred = {} def _sample(engine, batch): with torch.no_grad(): model.eval() keys = batch[0] output = self._forward(model, batch, mode="sample", **kwargs) seqs = output["seqs"].cpu().numpy() for idx, seq in enumerate(seqs): caption = self._convert_idx2sentence(seq, vocabulary, zh) key2pred[keys[idx]] = [ caption, ] pbar = ProgressBar(persist=False, ascii=True) sampler = Engine(_sample) pbar.attach(sampler) sampler.run(dataloader) pred_df = [] for key, pred in key2pred.items(): pred_df.append({ "filename": key + ".wav", "caption": "".join(pred[0]) if zh else pred[0], "tokens": pred[0] if zh else pred[0].split() }) pred_df = pd.DataFrame(pred_df) pred_df.to_json(os.path.join(experiment_path, caption_output)) from pycocoevalcap.bleu.bleu import Bleu from pycocoevalcap.rouge.rouge import Rouge from pycocoevalcap.cider.cider import Cider from pycocoevalcap.meteor.meteor import Meteor from pycocoevalcap.spice.spice import Spice f = open(os.path.join(experiment_path, score_output), "w") scorer = Bleu(n=4, zh=zh) score, scores = scorer.compute_score(key2refs, key2pred) for n in range(4): f.write("Bleu-{}: {:6.3f}\n".format(n + 1, score[n])) scorer = Rouge(zh=zh) score, scores = scorer.compute_score(key2refs, key2pred) f.write("ROUGE: {:6.3f}\n".format(score)) scorer = Cider(zh=zh) score, scores = scorer.compute_score(key2refs, key2pred) f.write("CIDEr: {:6.3f}\n".format(score)) if not zh: scorer = Meteor() score, scores = scorer.compute_score(key2refs, key2pred) f.write("Meteor: {:6.3f}\n".format(score)) scorer = Spice() score, scores = scorer.compute_score(key2refs, key2pred) f.write("Spice: {:6.3f}\n".format(score)) f.close()
import model_normal
import data
import helper_datasources
import config

sys.path.append(config.mscoco_dir)
from pycocotools.coco import COCO
from pycocoevalcap.eval import COCOEvalCap
from pycocoevalcap.meteor.meteor import Meteor
from pycocoevalcap.cider.cider import Cider
from pycocoevalcap.spice.spice import Spice
from pycocoevalcap.wmd.wmd import WMD

_meteor_scorer = Meteor()
_cider_scorer = Cider()
_spice_scorer = Spice()
_wmd_scorer = WMD()

########################################################################################
def geomean(xs):
    if np.all(xs):  # If array does not contain a zero
        return 2**np.mean(np.log2(xs))
    else:
        return 0.0

########################################################################################
def get_meteor(test_tokenized_grouped_sents, generated):
    return _meteor_scorer.compute_score(
        {
def eval_model(ref_data, results): """ Computes evaluation metrics of the model results against the human annotated captions Parameters: ------------ ref_data: dict a dictionary containing human annotated captions, with image name as key and a list of human annotated captions as values results: dict a dictionary containing model generated caption, with image name as key and a generated caption as value Returns: ------------ score_dict: a dictionary containing the overall average score for the model img_score_dict: a dictionary containing the individual scores for images scores_dict: a dictionary containing the scores by metric type """ # download stanford nlp library subprocess.call(['../../scr/evaluation/get_stanford_models.sh']) # format the inputs img_id_dict = {'image_id': list(ref_data.keys())} imgIds = img_id_dict['image_id'] gts = {} res = {} required_key = {'raw', 'imgid', 'sentid'} for imgId in imgIds: caption_list = ref_data[imgId]['sentences'] caption_list_sel = [] for i in caption_list: lst = { key: value for key, value in i.items() if key in required_key } lst['caption'] = lst.pop('raw') lst['image_id'] = lst.pop('imgid') lst['id'] = lst.pop('sentid') caption_list_sel.append(lst) gts[imgId] = caption_list_sel generated = [{'caption': results[imgId]}] res[imgId] = generated # tokenize print('tokenization...') tokenizer = PTBTokenizer() gts = tokenizer.tokenize(gts) res = tokenizer.tokenize(res) # compute scores scorers = [ (Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]), (Meteor(), "METEOR"), (Rouge(), "ROUGE_L"), (Cider(), "CIDEr"), (Spice(), "SPICE"), (usc_sim(), "USC_similarity"), ] score_dict = {} scores_dict = {} for scorer, method in scorers: print('computing %s score...' % (scorer.method())) score, scores = scorer.compute_score(gts, res) if type(method) == list: for sc, scs, m in zip(score, scores, method): score_dict[m] = sc scores_dict[m] = scs else: score_dict[method] = score scores_dict[method] = scores # format the individual scores img_score_dict = {} for n in range(len(res)): img_name = list(res.keys())[n] img_score_dict[img_name] = {} for metrics in scores_dict.keys(): if metrics == 'SPICE': img_score_dict[img_name][metrics] = scores_dict[metrics][n][ 'All']['f'] else: img_score_dict[img_name][metrics] = scores_dict[metrics][n] return score_dict, img_score_dict, scores_dict
def evaluate(beam_size): """ Evaluation :param beam_size: beam size at which to generate captions for evaluation :return: BLEU-4 score """ # DataLoader loader = torch.utils.data.DataLoader(CaptionDataset( data_folder, data_name, 'TEST', transform=transforms.Compose([normalize])), batch_size=1, shuffle=True, num_workers=0, pin_memory=False) # TODO: Batched Beam Search # Therefore, do not use a batch_size greater than 1 - IMPORTANT! # Lists to store references (true captions), and hypothesis (prediction) for each image # If for n images, we have n hypotheses, and references a, b, c... for each image, we need - # references = [[ref1a, ref1b, ref1c], [ref2a, ref2b], ...], hypotheses = [hyp1, hyp2, ...] references = dict() hypotheses = dict() # For each image for j, (image, caps, caplens, allcaps) in enumerate( tqdm(loader, desc="EVALUATING AT BEAM SIZE " + str(beam_size))): k = beam_size # Move to GPU device, if available image = image.to(device) # (1, 3, 256, 256) attrs, encoder_out = encoder(image) attrs = attrs.expand(3, attrs_dim) enc_image_size = encoder_out.size(1) encoder_dim = encoder_out.size(3) encoder_out = encoder_out.view(1, -1, encoder_dim) num_pixels = encoder_out.size(1) encoder_out = encoder_out.expand(k, num_pixels, encoder_dim) x0 = decoder.init_x0(attrs) # Tensor to store top k previous words at each step; now they're just <start> k_prev_words = torch.LongTensor([[word_map['<start>']]] * k).to( device) # (k, 1) # Tensor to store top k sequences; now they're just <start> seqs = k_prev_words # (k, 1) # Tensor to store top k sequences' scores; now they're just 0 top_k_scores = torch.zeros(k, 1).to(device) # (k, 1) # Lists to store completed sequences and scores complete_seqs = list() complete_seqs_scores = list() # Start decoding step = 1 h1, c1, h2, c2 = decoder.init_hidden_state(attrs, encoder_out, zero=True) h1, c1 = decoder.decode_step1(x0, (h1, c1)) # s is a number less than or equal to k, because sequences are removed from this process once they hit <end> while True: embeddings = decoder.embedding(k_prev_words).squeeze( 1) # (s, embed_dim) h1, c1 = decoder.decode_step1(embeddings, (h1, c1)) awe, _ = decoder.attention(encoder_out, h1, h2) # gate = decoder.sigmoid(decoder.f_beta(h2)) # awe = gate * awe h2, c2 = decoder.decode_step2(torch.cat([embeddings, awe], dim=1), (h2, c2)) scores = decoder.fc2(decoder.dropout2(h2)) scores = F.log_softmax(scores, dim=1) # Add scores = top_k_scores.expand_as(scores) + scores # (s, vocab_size) # For the first step, all k points will have the same scores (since same k previous words, h, c) if step == 1: top_k_scores, top_k_words = scores[0].topk(k, 0, True, True) # (s) else: # Unroll and find top scores, and their unrolled indices # (s) 所有分数中最大的k个 top_k_scores, top_k_words = scores.view(-1).topk( k, 0, True, True) # Convert unrolled indices to actual indices of scores # 上面展开了,prev_word_inds得到哪些句子是概率最大的 prev_word_inds = top_k_words / vocab_size # (s) next_word_inds = top_k_words % vocab_size # (s) # Add new words to sequences seqs = torch.cat( [seqs[prev_word_inds], next_word_inds.unsqueeze(1)], dim=1) # (s, step+1) # Which sequences are incomplete (didn't reach <end>)? 
incomplete_inds = [ ind for ind, next_word in enumerate(next_word_inds) if next_word != word_map['<end>'] ] complete_inds = list( set(range(len(next_word_inds))) - set(incomplete_inds)) # Set aside complete sequences if len(complete_inds) > 0: complete_seqs.extend(seqs[complete_inds].tolist()) complete_seqs_scores.extend(top_k_scores[complete_inds]) k -= len(complete_inds) # reduce beam length accordingly # Proceed with incomplete sequences if k == 0: break seqs = seqs[incomplete_inds] h1 = h1[prev_word_inds[incomplete_inds]] c1 = c1[prev_word_inds[incomplete_inds]] h2 = h2[prev_word_inds[incomplete_inds]] c2 = c2[prev_word_inds[incomplete_inds]] encoder_out = encoder_out[prev_word_inds[incomplete_inds]] top_k_scores = top_k_scores[incomplete_inds].unsqueeze(1) k_prev_words = next_word_inds[incomplete_inds].unsqueeze(1) # Break if things have been going on too long if step > 50: break step += 1 i = complete_seqs_scores.index(max(complete_seqs_scores)) seq = complete_seqs[i] # References img_caps = allcaps[0].tolist() img_captions = list( map( lambda c: [ rev_word_map[w] for w in c if w not in { word_map['<start>'], word_map['<end>'], word_map[ '<pad>'] } ], img_caps)) # remove <start> and pads img_caps = [' '.join(c) for c in img_captions] # print(img_caps) references[str(j)] = img_caps # Hypotheses hypothesis = ([ rev_word_map[w] for w in seq if w not in {word_map['<start>'], word_map['<end>'], word_map['<pad>']} ]) hypothesis = [' '.join(hypothesis)] # print(hypothesis) hypotheses[str(j)] = hypothesis assert len(references) == len(hypotheses) # Calculate BLEU-1~BLEU4 scores m1 = Bleu() m2 = Meteor() m3 = Cider() m4 = Rouge() m5 = Spice() (score1, scores1) = m1.compute_score(references, hypotheses) (score2, scores2) = m2.compute_score(references, hypotheses) (score3, scores3) = m3.compute_score(references, hypotheses) (score4, scores4) = m4.compute_score(references, hypotheses) (score5, scores5) = m5.compute_score(references, hypotheses) return score1, score2, score3, score4, score5
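# --- Added usage sketch (not from the original snippet) ---
# evaluate() returns the five corpus-level scores in the order Bleu, METEOR, CIDEr,
# ROUGE_L, SPICE; a caller might report them like this (beam size 3 is just an example).
if __name__ == '__main__':
    bleu, meteor, cider, rouge, spice = evaluate(beam_size=3)
    print("BLEU-1..4:", bleu)
    print("METEOR: %.3f  CIDEr: %.3f  ROUGE_L: %.3f  SPICE: %.3f"
          % (meteor, cider, rouge, spice))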