def __init__(self, ground_truth_filenames=None, prediction_filename=None,
             tious=None, max_proposals=1000,
             prediction_fields=PREDICTION_FIELDS, verbose=False):
    # Check that the gt and submission files exist and load them
    if not tious:
        raise IOError('Please input a valid tIoU.')
    if not ground_truth_filenames:
        raise IOError('Please input a valid ground truth file.')
    if not prediction_filename:
        raise IOError('Please input a valid prediction file.')

    self.verbose = verbose
    self.tious = tious
    self.max_proposals = max_proposals
    self.pred_fields = prediction_fields
    self.ground_truths = self.import_ground_truths(ground_truth_filenames)
    self.prediction = self.import_prediction(prediction_filename)
    self.tokenizer = PTBTokenizer()

    # Set up scorers. If not verbose, we only use the one we're
    # testing on: CIDEr.
    if self.verbose:
        self.scorers = [(Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]),
                        (Meteor(), "METEOR"),
                        (Rouge(), "ROUGE_L"),
                        (Cider('corpus'), "CIDEr"),
                        (Spice(), "SPICE")]
    else:
        self.scorers = [(Cider('corpus'), "CIDEr")]

def get_corpus_bleu(model, data_loader, vocabs, device, beam_size):
    import torch
    from pycocoevalcap.bleu.bleu import Bleu
    from pycocoevalcap.cider.cider import Cider
    from pycocoevalcap.rouge.rouge import Rouge

    # Define scorers
    scorer_bleu = Bleu(4)
    scorer_rouge = Rouge()
    scorer_cider = Cider()

    sequences_ref = {}
    sequences_gen = {}
    bad_words = ['<SOS>', '<EOS>', '<UNK>']
    bad_toks = [vocabs['word_vocab'](i) for i in bad_words]

    # Generation loop
    for i, data in enumerate(data_loader):
        with torch.no_grad():
            captions = data['captions']
            length = captions.size(1) - 1
            targets = captions.narrow(1, 1, length)
            images = data['images'].to(device)
            topics = data['topics'].to(device)
            predictions = model.sample_v2(images, topics, beam_size=beam_size)
            sequences_ref[i] = [" ".join([vocabs['word_vocab'](j.item())
                                          for j in targets[0]
                                          if j.item() not in bad_toks])]
            sequences_gen[i] = [" ".join([vocabs['word_vocab'](j.item())
                                          for j in predictions[0][1]
                                          if j.item() not in bad_toks])]
            # sequences_gen[i] = [" ".join([vocabs['word_vocab'](j) for j in predictions[0] if j not in bad_toks])]

    # Compute corpus-level scores
    bleu_score, bleu_scores = scorer_bleu.compute_score(sequences_ref, sequences_gen)
    rouge_score, rouge_scores = scorer_rouge.compute_score(sequences_ref, sequences_gen)
    cider_score, cider_scores = scorer_cider.compute_score(sequences_ref, sequences_gen)

    scores = {'bleu_score': bleu_score,
              'rouge_score': rouge_score,
              'cider_score': cider_score}
    print(scores)
    return scores

def compute_batch_score(decode_res, key2refs, keys, start_idx, end_idx,
                        vocabulary, scorer):
    """
    Args:
        decode_res: decoding results of model, [N, max_length]
        key2refs: references of all samples, dict(<key> -> [ref_1, ref_2, ..., ref_n])
        keys: keys of this batch, used to match decode results and refs
    Return:
        scores of this batch, [N,]
    """
    import numpy as np

    if scorer is None:
        from pycocoevalcap.cider.cider import Cider
        scorer = Cider()

    hypothesis = {}
    references = {}
    for i in range(len(keys)):
        if keys[i] in hypothesis:
            continue
        # prepare candidate sentence
        candidate = []
        for w_t in decode_res[i]:
            if w_t == start_idx:
                continue
            elif w_t == end_idx:
                break
            candidate.append(vocabulary.idx2word[w_t])
        hypothesis[keys[i]] = [" ".join(candidate), ]
        # prepare reference sentences
        references[keys[i]] = key2refs[keys[i]]

    score, scores = scorer.compute_score(references, hypothesis)
    key2score = {key: scores[i] for i, key in enumerate(references.keys())}
    results = np.zeros(decode_res.shape[0])
    for i in range(decode_res.shape[0]):
        results[i] = key2score[keys[i]]
    return results

def get_self_critical_reward(model, feat0, feat1, feat_mask, pos_feat,
                             groundtruth, probability_sample, id_word):
    batch_size = feat0.size(0)
    double_batch_size = batch_size * 2
    seq_length = probability_sample.size(1)

    # Greedy decoding serves as the self-critical baseline
    greedy_sample, _ = model.sample(feat0, feat1, feat_mask, pos_feat)

    res = OrderedDict()
    gts = OrderedDict()
    greedy_sample = greedy_sample.cpu().numpy()
    probability_sample = probability_sample.cpu().numpy()

    # First half of res: sampled captions; second half: greedy captions
    for i in range(batch_size):
        res[i] = [numbers_to_str(probability_sample[i])]
    for i in range(batch_size, double_batch_size):
        res[i] = [numbers_to_str(greedy_sample[i - batch_size])]

    length = len(groundtruth[0])
    for i in range(batch_size):
        gts[i] = [numbers_to_str(groundtruth[i][j]) for j in range(length)]
    gts = {i: gts[i % batch_size] for i in range(double_batch_size)}

    assert len(gts.keys()) == len(res.keys()), \
        'len of gts.keys is not equal to that of res.keys'

    avg_cider_score, cider_score = Cider().compute_score(gts=gts, res=res)
    cider_score = np.array(cider_score)

    # Reward is the CIDEr advantage of the sampled caption over the greedy baseline,
    # broadcast across all time steps
    reward = cider_score[:batch_size] - cider_score[batch_size:]
    reward = np.repeat(reward[:, np.newaxis], seq_length, axis=1)
    return reward

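# `numbers_to_str` is not defined in the snippet above. The sketch below is an
# assumption about its behavior: it looks word indices up in an id->word table
# (the `id_word` argument above suggests such a table exists) and stops at an
# end-of-sentence token. The table and token names here are hypothetical.
id_word = {0: '<EOS>', 1: 'a', 2: 'dog', 3: 'runs'}  # hypothetical vocabulary

def numbers_to_str(ids):
    words = []
    for idx in ids:
        word = id_word[int(idx)]
        if word == '<EOS>':
            break
        words.append(word)
    return ' '.join(words)
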
def __init__(self, ground_truth_filenames, prediction_filename,
             verbose=False, all_scorer=False):
    # Check that the gt and submission files exist and load them
    self.verbose = verbose
    self.all_scorer = all_scorer
    self.ground_truths = self.import_ground_truths(ground_truth_filenames)
    self.prediction = self.import_prediction(prediction_filename)
    self.tokenizer = PTBTokenizer()

    # Set up scorers. If not verbose, we only use the one we're
    # testing on: METEOR.
    # Meteor is Java-based and can crash a lot.
    try:
        met = Meteor()
    except (AttributeError, FileNotFoundError) as e:
        print(f"Meteor couldn't start due to {e}")
        met = None

    if self.verbose or self.all_scorer:
        self.scorers = [
            (Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]),
            (met, "METEOR"),
            (Rouge(), "ROUGE_L"),
            (Cider(), "CIDEr")
        ]
    else:
        self.scorers = [(met, "METEOR")]

    # init some attributes
    self.easy_samples = {}
    self.hard_samples = {}
    self.n_ref_vids = set()
    self.scores = {}

def evaluate(self):
    # =================================================
    # Tokenization
    # =================================================
    print("Tokenization")
    tokenizer = PTBTokenizer()
    gts = tokenizer.tokenize(self.ground_truth)
    preds = tokenizer.tokenize(self.prediction)

    # =================================================
    # Setup scorers
    # =================================================
    print("Setting up scorers...")
    scorers = [
        (Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]),
        (Meteor(), "METEOR"),
        (Rouge(), "ROUGE_L"),
        (Cider(), "CIDEr"),
        # (Spice(), "SPICE")
    ]

    # =================================================
    # Compute scores
    # =================================================
    for scorer, method in scorers:
        print("Computing {} score...".format(scorer.method()))
        score, scores = scorer.compute_score(gts, preds)
        if isinstance(method, list):
            for sc, scs, m in zip(score, scores, method):
                self.eval_res[m] = sc * 100
        else:
            self.eval_res[method] = score * 100

def calc_scores(file1, file2):
    """
    ref, dictionary of reference sentences (id, sentence)
    hypo, dictionary of hypothesis sentences (id, sentence)
    score, dictionary of scores

    ref: ground-truth data, a dict such as {"id": ["sentences"]}
    hypo: generated data, in the same format.
    Each value must satisfy:
        assert(type(hypo) is list)
        assert(len(hypo) == 1)
        assert(type(ref) is list)
        assert(len(ref) >= 1)
    """
    pred = readfiles(file1)
    test = readfiles(file2)

    # Assemble the id -> captions dicts expected by the scorers
    i = [i for i in range(len(pred))]
    hypo = dict(zip(i, pred))
    ref = dict(zip(i, test))

    scorers = [(Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]),
               (Meteor(), "METEOR"),
               (Rouge(), "ROUGE_L"),
               (Cider(), "CIDEr")]

    final_scores = {}
    for scorer, method in scorers:
        score, scores = scorer.compute_score(ref, hypo)
        if type(score) == list:
            for m, s in zip(method, score):
                final_scores[m] = s
        else:
            final_scores[method] = score
    return final_scores

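# `readfiles` is not shown above. This is a minimal sketch under the assumption
# that each line of the input file holds one caption and that every value handed
# to the pycocoevalcap scorers must be a list of strings; the real helper in the
# source repository may differ.
def readfiles(path):
    with open(path, encoding='utf-8') as f:
        return [[line.strip()] for line in f if line.strip()]
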
def get_scorers(self):
    # from pycoco_scorers_vizseq import BLEUScorerAll
    from pycocoevalcap.bleu.bleu import Bleu
    # from pycocoevalcap.spice.spice import Spice
    from pycocoevalcap.cider.cider import Cider
    from pycocoevalcap.rouge.rouge import Rouge
    from pycocoevalcap.meteor.meteor import Meteor
    from pycocoevalcap.tokenizer.ptbtokenizer import PTBTokenizer
    import logging
    import transformers

    transformers.tokenization_utils.logger.setLevel(logging.ERROR)
    transformers.configuration_utils.logger.setLevel(logging.ERROR)
    transformers.modeling_utils.logger.setLevel(logging.ERROR)

    Scorer_ = namedtuple("Scorer_", ["cls_fn", "to_init", "out_str"])
    self.scorer_dict = {
        "bleu": Scorer_(Bleu(4, verbose=0), False,
                        ["bleu@1", "bleu@2", "bleu@3", "bleu@4"]),
        "meteor": Scorer_(Meteor(), False, ["meteor"]),
        "cider": Scorer_(Cider("corpus"), False, ["cider"]),
        "rouge": Scorer_(Rouge(), False, ["rouge"]),
        # "spice": Scorer_(Spice(), False, ["spice"]),
        "bert_score": Scorer_(BertScoreSimple, True, ["bert_score"]),
    }
    self.tokenizer = PTBTokenizer()

def evaluate(gts, res):
    eval_res = {}
    # =================================================
    # Tokenization
    # =================================================
    print('tokenization...')
    tokenizer = PTBTokenizer()
    gts = tokenizer.tokenize(gts)
    res = tokenizer.tokenize(res)

    # =================================================
    # Set up scorers
    # =================================================
    print('setting up scorers...')
    scorers = [(Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]),
               (Meteor(), "METEOR"),
               (Rouge(), "ROUGE_L"),
               (Cider(), "CIDEr")]

    # =================================================
    # Compute scores
    # =================================================
    for scorer, method in scorers:
        print('computing %s score...' % (scorer.method()))
        score, scores = scorer.compute_score(gts, res)
        if type(method) == list:
            for sc, scs, m in zip(score, scores, method):
                eval_res[m] = sc
        else:
            eval_res[method] = score
    return eval_res

def get_scorers(self):
    # from pycoco_scorers_vizseq import BLEUScorerAll
    from pycocoevalcap.bleu.bleu import Bleu
    from pycocoevalcap.cider.cider import Cider
    from pycocoevalcap.rouge.rouge import Rouge
    from pycocoevalcap.meteor.meteor import Meteor
    # from pycocoevalcap.spice.spice import Spice
    from pycocoevalcap.tokenizer.ptbtokenizer import PTBTokenizer

    Scorer_ = namedtuple("Scorer_", ["cls_fn", "to_init", "out_str"])
    self.scorer_dict = {
        "bleu": Scorer_(
            Bleu(4, verbose=0), False,
            ["bleu_1", "bleu_2", "bleu_3", "bleu_4"]
        ),
        "meteor": Scorer_(Meteor(), False, ["meteor"]),
        "cider": Scorer_(Cider("corpus"), False, ["cider"]),
        "rouge": Scorer_(Rouge(), False, ["rouge"]),
        # "spice": Scorer_(Spice(), False, ["spice"]),
    }
    self.tokenizer = PTBTokenizer()

    self.coval_all_metrics = [
        ("mentions", evaluator.mentions),
        ("muc", evaluator.muc),
        ("bcub", evaluator.b_cubed),
        ("ceafe", evaluator.ceafe),
        ("lea", evaluator.lea),
        ("lea_soft", evaluator.lea_soft),
    ]
    self.reset_coval_scorer_dict()

def score(ref, hypo):
    """
    ref, dictionary of reference sentences (id, sentence)
    hypo, dictionary of hypothesis sentences (id, sentence)
    score, dictionary of scores
    """
    scorers = [
        (Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]),
        (Meteor(), "METEOR"),
        (Rouge(), "ROUGE_L"),
        (Cider(), "CIDEr")
    ]
    final_scores = {}
    for scorer, method in scorers:
        score, scores = scorer.compute_score(ref, hypo)
        if type(score) == list:
            for m, s in zip(method, score):
                final_scores[m] = s
        else:
            final_scores[method] = score
    return final_scores

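# Hypothetical usage sketch for score(): pycocoevalcap scorers expect dicts that
# map an id to a list of caption strings (one hypothesis per id, one or more
# references per id). The captions below are made up for illustration.
ref = {0: ["a dog runs across the grass", "a dog is running outside"],
       1: ["a man rides a bicycle down the street"]}
hypo = {0: ["a dog running on grass"],
        1: ["a man riding a bike"]}
print(score(ref, hypo))  # e.g. {'Bleu_1': ..., 'METEOR': ..., 'ROUGE_L': ..., 'CIDEr': ...}
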
def language_eval(sample_seqs, groundtruth_seqs):
    assert len(sample_seqs) == len(groundtruth_seqs), \
        'length of sampled seqs is different from that of groundtruth seqs!'

    references = OrderedDict()
    predictions = OrderedDict()
    for i in range(len(groundtruth_seqs)):
        references[i] = [groundtruth_seqs[i][j] for j in range(len(groundtruth_seqs[i]))]
    for i in range(len(sample_seqs)):
        predictions[i] = [sample_seqs[i]]
    predictions = {i: predictions[i] for i in range(len(sample_seqs))}
    references = {i: references[i] for i in range(len(groundtruth_seqs))}

    avg_bleu_score, bleu_score = Bleu(4).compute_score(references, predictions)
    print('avg_bleu_score == ', avg_bleu_score)
    avg_cider_score, cider_score = Cider().compute_score(references, predictions)
    print('avg_cider_score == ', avg_cider_score)
    avg_meteor_score, meteor_score = Meteor().compute_score(references, predictions)
    print('avg_meteor_score == ', avg_meteor_score)
    avg_rouge_score, rouge_score = Rouge().compute_score(references, predictions)
    print('avg_rouge_score == ', avg_rouge_score)

    return {'BLEU': avg_bleu_score, 'CIDEr': avg_cider_score,
            'METEOR': avg_meteor_score, 'ROUGE': avg_rouge_score}

def CocoScore(ref, hyp, metrics_list=None, language='en'):
    """
    Obtains the COCO scores from the references and hypotheses.

    :param ref: Dictionary of reference sentences (id, sentence)
    :param hyp: Dictionary of hypothesis sentences (id, sentence)
    :param metrics_list: List of metrics to evaluate on
    :param language: Language of the sentences (for METEOR)
    :return: Dictionary of scores
    """
    if metrics_list is None:
        metrics_list = ['bleu', 'ter', 'meteor', 'rouge_l', 'cider']
    else:
        metrics_list = [metric.lower() for metric in metrics_list]

    scorers = []
    if 'bleu' in metrics_list:
        scorers.append((Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]))
    if 'meteor' in metrics_list:
        scorers.append((Meteor(language), "METEOR"))
    if 'ter' in metrics_list:
        scorers.append((Ter(), "TER"))
    if 'rouge_l' in metrics_list or 'rouge' in metrics_list:
        scorers.append((Rouge(), "ROUGE_L"))
    if 'cider' in metrics_list:
        scorers.append((Cider(), "CIDEr"))

    final_scores = {}
    for scorer, method in scorers:
        score, _ = scorer.compute_score(ref, hyp)
        if isinstance(score, list):
            for m, s in zip(method, score):
                final_scores[m] = s
        else:
            final_scores[method] = score
    return final_scores

def score(ref, hypo):
    scorers = [
        (Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]),
        # (Meteor(), "METEOR"),  # disabled due to a Meteor setup issue
        (Rouge(), "ROUGE_L"),
        (Cider(), "CIDEr")
    ]
    final_scores = {}
    for scorer, method in scorers:
        score, scores = scorer.compute_score(ref, hypo)
        if type(score) == list:
            for m, s in zip(method, score):
                final_scores[m] = s
        else:
            final_scores[method] = score
    return final_scores

def __init__(self):
    self.scorers = [
        (Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]),
        # (Meteor(), "METEOR"),
        (Rouge(), "ROUGE_L"),
        (Cider(), "CIDEr")
    ]

def main(eval_caption_file, output, zh=False):
    df = pd.read_json(eval_caption_file)
    if zh:
        refs = df.groupby("key")["tokens"].apply(list).to_dict()
    else:
        refs = df.groupby("key")["caption"].apply(list).to_dict()

    from pycocoevalcap.bleu.bleu import Bleu
    from pycocoevalcap.cider.cider import Cider
    from pycocoevalcap.rouge.rouge import Rouge

    # Note: the `zh` keyword assumes a patched/forked pycocoevalcap;
    # the upstream scorers do not accept it.
    scorer = Bleu(zh=zh)
    bleu_scores = coco_score(copy.deepcopy(refs), scorer)
    scorer = Cider(zh=zh)
    cider_score = coco_score(copy.deepcopy(refs), scorer)
    scorer = Rouge(zh=zh)
    rouge_score = coco_score(copy.deepcopy(refs), scorer)

    if not zh:
        from pycocoevalcap.meteor.meteor import Meteor
        scorer = Meteor()
        meteor_score = coco_score(copy.deepcopy(refs), scorer)
        from pycocoevalcap.spice.spice import Spice
        scorer = Spice()
        spice_score = coco_score(copy.deepcopy(refs), scorer)

    with open(output, "w") as f:
        for n in range(4):
            f.write("BLEU-{}: {:6.3f}\n".format(n + 1, bleu_scores[n]))
        f.write("CIDEr: {:6.3f}\n".format(cider_score))
        f.write("ROUGE: {:6.3f}\n".format(rouge_score))
        if not zh:
            f.write("Meteor: {:6.3f}\n".format(meteor_score))
            f.write("SPICE: {:6.3f}\n".format(spice_score))

def get_coco_score(gt_list, pred_list, verbose, extra_vars):
    """
    gt_list, list of reference sentences
    pred_list, list of hypothesis sentences
    verbose - if greater than 0 the metric measures are printed out
    extra_vars - extra variables, here:
        extra_vars['language'] - the target language
    score, dictionary of scores
    """
    x_trgs = [x.lower() for x in gt_list]
    hypo = {idx: [lines.strip()] for (idx, lines) in enumerate(pred_list)}
    refs = {idx: [rr] for idx, rr in enumerate(x_trgs)}

    scorers = [
        (Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]),
        # (Meteor(language=extra_vars['language']), "METEOR"),
        (Rouge(), "ROUGE_L"),
        (Cider(), "CIDEr")
    ]
    final_scores = {}
    for scorer, method in scorers:
        score, scores = scorer.compute_score(refs, hypo)
        if type(score) == list:
            for m, s in zip(method, score):
                final_scores[m] = s
        else:
            final_scores[method] = score
    return final_scores

def compute_scores(gts, res):
    """
    Performs the MS COCO evaluation using the Python 3 implementation
    (https://github.com/salaniz/pycocoevalcap)

    :param gts: Dictionary with the image ids and their gold captions
    :param res: Dictionary with the image ids and their generated captions
    :print: Evaluation score (the mean of the scores of all the instances)
            for each measure
    """
    # Preprocess captions
    gts = preprocess_captions(gts)
    res = preprocess_captions(res)

    # Set up scorers
    scorers = [
        (Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]),
        (Meteor(), "METEOR"),
        (Rouge(), "ROUGE_L"),
        (Spice(), "SPICE"),
        (Cider(), "CIDEr")
    ]

    # Compute score for each metric
    for scorer, method in scorers:
        print("Computing", scorer.method(), "...")
        score, scores = scorer.compute_score(gts, res)
        if type(method) == list:
            for sc, m in zip(score, method):
                print("%s : %0.3f" % (m, sc))
        else:
            print("%s : %0.3f" % (method, score))

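# `preprocess_captions` is not shown above. This is a minimal sketch under the
# assumption that it lowercases captions, strips punctuation, and guarantees
# each id maps to a list of strings; the preprocessing in the source repository
# may differ.
import re

def preprocess_captions(id2captions):
    cleaned = {}
    for img_id, captions in id2captions.items():
        if isinstance(captions, str):
            captions = [captions]
        cleaned[img_id] = [re.sub(r"[^\w\s]", "", c.lower()).strip() for c in captions]
    return cleaned
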
def __init__(self, ground_truth_filenames=None, prediction_filename=None,
             verbose=False, all_scorer=False):
    # Check that the gt and submission files exist and load them
    if not ground_truth_filenames:
        raise IOError('Please input a valid ground truth file.')
    if not prediction_filename:
        raise IOError('Please input a valid prediction file.')

    self.verbose = verbose
    self.all_scorer = all_scorer
    self.ground_truths = self.import_ground_truths(ground_truth_filenames)
    self.prediction = self.import_prediction(prediction_filename)
    self.tokenizer = PTBTokenizer()

    # Set up scorers. If not verbose, we only use the one we're
    # testing on: METEOR.
    if self.verbose or self.all_scorer:
        self.scorers = [
            (Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]),
            (Meteor(), "METEOR"),
            (Rouge(), "ROUGE_L"),
            (Cider(), "CIDEr")
        ]
    else:
        self.scorers = [(Meteor(), "METEOR")]

def get_dcc_scores(self):
    imgIds = self.params['image_id']
    # imgIds = self.coco.getImgIds()
    gts = {}
    res = {}
    for imgId in imgIds:
        gts[imgId] = self.coco.imgToAnns[imgId]
        res[imgId] = self.cocoRes.imgToAnns[imgId]

    tokenizer = PTBTokenizer()
    gts = tokenizer.tokenize(gts)
    res = tokenizer.tokenize(res)

    scorers = [(Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]),
               (Meteor(), "METEOR"),
               (Rouge(), "ROUGE_L"),
               (Cider(), "CIDEr")]

    score_dict = {}
    for scorer, method in scorers:
        print('computing %s score...' % (scorer.method()))
        score, scores = scorer.compute_score(gts, res)
        if type(method) == list:
            for sc, scs, m in zip(score, scores, method):
                score_dict[m] = sc
                print("%s: %0.3f" % (m, sc))
        else:
            score_dict[method] = score
            print("%s: %0.3f" % (method, score))
    return score_dict

def evaluate(self):
    imgIds = self.params['image_id']
    gts = self.gts
    res = self.res

    # =================================================
    # Tokenization
    # =================================================
    tokenizer = PTBTokenizer()
    gts = tokenizer.tokenize(gts)
    res = tokenizer.tokenize(res)

    # =================================================
    # Set up scorers
    # =================================================
    scorers = [(Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]),
               (Meteor(), "METEOR"),
               (Rouge(), "ROUGE_L"),
               (Cider(), "CIDEr")]

    # =================================================
    # Compute scores
    # =================================================
    for scorer, method in scorers:
        score, scores = scorer.compute_score(gts, res)
        if type(method) == list:
            for sc, scs, m in zip(score, scores, method):
                self.setEval(sc, m)
                self.setImgToEvalImgs(scs, imgIds, m)
        else:
            self.setEval(score, method)
            self.setImgToEvalImgs(scores, imgIds, method)
    self.setEvalImgs()

def evaluate(self):
    assert len(self.ground) == len(self.predictions)

    # =================================================
    # Set up scorers
    # =================================================
    scorers = [
        (Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]),
        # (Meteor(), "METEOR"),
        (Rouge(), "ROUGE_L"),
        (Cider(), "CIDEr")
    ]

    # =================================================
    # Compute scores
    # =================================================
    for scorer, method in scorers:
        score, scores = scorer.compute_score(self.ground, self.predictions)
        if type(method) == list:
            for sc, scs, m in zip(score, scores, method):
                self.setEval(sc, m)
        else:
            self.setEval(score, method)

def score(self, GT, RES, IDs):
    # edited by rgh: use an OrderedDict so metrics keep insertion order
    # self.eval = {}
    self.eval = OrderedDict()
    self.imgToEval = {}
    gts = {}
    res = {}
    for ID in IDs:
        gts[ID] = GT[ID]
        res[ID] = RES[ID]

    print('tokenization...')
    tokenizer = PTBTokenizer()
    gts = tokenizer.tokenize(gts)
    res = tokenizer.tokenize(res)

    # =================================================
    # Set up scorers
    # =================================================
    print('setting up scorers...')
    scorers = [
        (Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]),
        (Meteor(), "METEOR"),
        (Cider(), "CIDEr"),
        (Rouge(), "ROUGE_L"),
        # (Spice(), "SPICE")
    ]

    # =================================================
    # Compute scores
    # =================================================
    for scorer, method in scorers:
        print('computing %s score...' % (scorer.method()))
        score, scores = scorer.compute_score(gts, res)
        if type(method) == list:
            # edited by rgh: for BLEU, keep only the last entry (Bleu_4)
            self.setEval("%.4f" % score[-1], method[-1])
            self.setImgToEvalImgs(scores[-1], IDs, method[-1])
            print("%s: %0.4f" % (method[-1], score[-1]))
        else:
            self.setEval("%.4f" % score, method)
            self.setImgToEvalImgs(scores, IDs, method)
            print("%s: %0.4f" % (method, score))
    return self.eval

def score(num, DIR):
    print("Testing results on epoch", num, "in DIR=", DIR)
    print("Loading coco annotations")
    dataDir = '.'
    dataType = 'val2014'
    algName = 'fakecap'
    annFile = '%s/annotations/captions_%s.json' % (dataDir, dataType)
    subtypes = ['results', 'evalImgs', 'eval']
    [resFile, evalImgsFile, evalFile] = \
        ['%s/results/captions_%s_%s_%s.json' % (dataDir, dataType, algName, subtype)
         for subtype in subtypes]
    coco_anns = COCO(annFile)
    print("COCO anns imported")

    path = DIR + str(num) + '_test_result.tar.gz'
    save = pickle.load(open(path, 'rb'))

    cocoRes = {}
    coco = {}
    for key, val in save.items():
        reslst = val[u'res']
        # Keep generated tokens up to the end-of-sentence marker, drop the start token
        res = []
        for data in reslst:
            if data != u'<SEND>':
                res.append(data)
            else:
                break
        res = res[1:]
        if len(res) == 0:
            res = [u'a']  # just not to be empty, and it has a low idf
        cocoRes[key] = [{u'caption': ' '.join(res)}]
        coco[key] = coco_anns.imgToAnns[key]

    print('examples')
    for key in list(coco.keys())[:5]:
        print("IMG_NUM=", key)
        print("Annotation:", '\n'.join([coco[key][i][u'caption'] for i in range(len(coco[key]))]))
        print("Generated data:", ' '.join(save[key][u'res']))
        print("Cleared generation:", cocoRes[key][0][u'caption'])

    print('tokenization...')
    tokenizer = PTBTokenizer()
    gts = tokenizer.tokenize(coco)
    res = tokenizer.tokenize(cocoRes)

    print('setting up scorers...')
    scorers = [(Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]),
               (Meteor(), "METEOR"),
               (Rouge(), "ROUGE_L"),
               (Cider(), "CIDEr"),
               (Spice(), "SPICE")]
    for scorer, method in scorers:
        print('computing %s score...' % (scorer.method()))
        score, scores = scorer.compute_score(gts, res)
        print(score)

def _define_metrics(gts, res):
    bleu_scorer = Bleu(n=4)
    bleu, _ = bleu_scorer.compute_score(gts=gts, res=res)

    rouge_scorer = Rouge()
    rouge, _ = rouge_scorer.compute_score(gts=gts, res=res)

    cider_scorer = Cider()
    cider, _ = cider_scorer.compute_score(gts=gts, res=res)

    meteor_scorer = Meteor()
    meteor, _ = meteor_scorer.compute_score(gts=gts, res=res)

    for i in range(4):
        bleu[i] = round(bleu[i], 4)

    return bleu, round(meteor, 4), round(rouge, 4), round(cider, 4)

def cider_scores(trues, pred, n=4):
    """
    Compute CIDEr and CIDEr-D for a fixed prediction, with pycocoevalcap
    """
    trues = dict([(i, [r]) for i, r in enumerate(trues)])
    preds = dict([(i, [pred]) for i, _ in enumerate(trues)])
    ciders, _ = Cider().compute_score(trues, preds)
    return ciders

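# Hypothetical usage sketch for cider_scores(): `trues` is a list of reference
# captions and `pred` is one candidate caption scored against each of them.
# The sentences below are made up for illustration.
refs = ["a cat sits on a mat", "a cat is sitting on the mat", "a small cat on a mat"]
print(cider_scores(refs, "a cat sitting on a mat"))
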
def _train_batch(engine, batch):
    model.train()
    with torch.enable_grad():
        optimizer.zero_grad()
        train_scorer = Cider(zh=zh)
        output = self._forward(model, batch, "train",
                               key2refs=train_key2refs, scorer=train_scorer)
        output["loss"].backward()
        optimizer.step()
        return output

def init_eval_metric(bleu_n=4):
    global Meteor_scorer
    global Cider_scorer
    global Bleu_scorer
    global Bleu_N
    # Lazily construct each scorer only once
    Meteor_scorer = Meteor_scorer or Meteor()
    Cider_scorer = Cider_scorer or Cider()
    Bleu_scorer = Bleu_scorer or Bleu(bleu_n)
    Bleu_N = bleu_n

def __init__(self, args, task):
    super().__init__(args, task)
    self.task = task
    self.generator = SimpleSequenceGenerator(
        beam=args.scst_beam,
        penalty=args.scst_penalty,
        max_pos=args.max_target_positions,
        eos_index=task.target_dictionary.eos_index)

    # Needed for decoding model output to string
    self.conf_tokenizer = encoders.build_tokenizer(args)
    self.conf_decoder = encoders.build_bpe(args)
    self.captions_dict = task.target_dictionary

    # Tokenizer needed for computing CIDEr scores
    self.tokenizer = PTBTokenizer()
    self.scorer = Cider()

def compute_cider_score(decode_res, keys, gts, start_idx, end_idx, vocabulary):
    """
    Args:
        decode_res: decoding results of model, [B, max_length]
        keys: keys of this batch, tuple [B,]
        gts: ground truth sentences of all audios,
             dict(<key> -> [ref_1, ref_2, ..., ref_n])
    Return:
        score: scores of this batch, [B,]
    """
    import numpy as np
    from pycocoevalcap.cider.cider import Cider
    scorer = Cider()

    hypothesis = {}
    references = {}
    for i in range(decode_res.shape[0]):
        if keys[i] in hypothesis:
            continue
        # prepare candidate
        candidate = []
        for t, w_t in enumerate(decode_res[i]):
            if w_t == start_idx:
                continue
            elif w_t == end_idx:
                break
            else:
                candidate.append(vocabulary.idx2word[w_t])
        hypothesis[keys[i]] = [" ".join(candidate), ]
        # prepare reference
        references[keys[i]] = gts[keys[i]]

    (score, scores) = scorer.compute_score(references, hypothesis)
    key2score = {key: scores[i] for i, key in enumerate(hypothesis.keys())}
    results = np.zeros(decode_res.shape[0])
    for i in range(decode_res.shape[0]):
        results[i] = key2score[keys[i]]
    return results

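# Hypothetical usage sketch for compute_cider_score(); the toy vocabulary,
# indices, and references below are made up only to show the expected shapes
# (a CIDEr score over a single sample is degenerate, so treat the number as
# illustrative, not meaningful).
import numpy as np
from types import SimpleNamespace

vocab = SimpleNamespace(idx2word={0: "<start>", 1: "<end>", 2: "a", 3: "dog", 4: "barks"})
decode_res = np.array([[0, 2, 3, 4, 1]])          # one decoded index sequence
keys = ("audio_0",)
gts = {"audio_0": ["a dog barks", "a dog is barking"]}
print(compute_cider_score(decode_res, keys, gts, start_idx=0, end_idx=1, vocabulary=vocab))
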