def language_eval_excoco(predictions, predictions_bleu, sents_label_eval, loader):
    Scorer = CiderD()
    Bleu_scorer = Bleu(4)
    METEOR_scorer = Meteor()
    ROUGE_scorer = Rouge()

    c_score, _ = Scorer.compute_score(sents_label_eval, predictions)
    b_score, _ = Bleu_scorer.compute_score(sents_label_eval, predictions_bleu)
    m_score, _ = METEOR_scorer.compute_score(sents_label_eval, predictions_bleu)
    r_score, _ = ROUGE_scorer.compute_score(sents_label_eval, predictions_bleu)

    print('Evaluating {} samples'.format(len(predictions)))
    print('Bleu_1 : ' + str(b_score[0]))
    print('Bleu_2 : ' + str(b_score[1]))
    print('Bleu_3 : ' + str(b_score[2]))
    print('Bleu_4 : ' + str(b_score[3]))
    print('METEOR : ' + str(m_score))
    print('ROUGE_L : ' + str(r_score))
    print('CIDEr : ' + str(c_score))

    lang_stat = {}
    lang_stat['BLEU_1'] = b_score[0]
    lang_stat['BLEU_2'] = b_score[1]
    lang_stat['BLEU_3'] = b_score[2]
    lang_stat['BLEU_4'] = b_score[3]
    lang_stat['METEOR'] = m_score
    lang_stat['ROUGE_L'] = r_score
    lang_stat['CIDEr'] = c_score

    return lang_stat
def test(model, dataloader, args):
    scorer = Bleu(4)
    m_scorer = Meteor()
    r_scorer = Rouge()
    hyp = []
    ref = []
    model.eval()
    gold_file = open('tmp_gold.txt', 'w')
    pred_file = open('tmp_pred.txt', 'w')
    with tqdm(dataloader, desc='Test ', mininterval=1) as tq:
        for batch in tq:
            with torch.no_grad():
                seq = model(batch, beam_size=args.beam_size)
            r = write_txt(batch, batch['tgt_text'], gold_file, args)
            h = write_txt(batch, seq, pred_file, args)
            hyp.extend(h)
            ref.extend(r)
    hyp = dict(zip(range(len(hyp)), hyp))
    ref = dict(zip(range(len(ref)), ref))
    print(hyp[0], ref[0])
    print('BLEU INP', len(hyp), len(ref))
    print('BLEU', scorer.compute_score(ref, hyp)[0])
    print('METEOR', m_scorer.compute_score(ref, hyp)[0])
    print('ROUGE_L', r_scorer.compute_score(ref, hyp)[0])
    gold_file.close()
    pred_file.close()
def scst(self, x, x_mask, s):
    self.scorer = Meteor()
    encoding = self.encoder(x, x_mask)

    # greedy (baseline) part
    _, pred = self.decoder.greedy(encoding, s.size(1) - 1)
    pred_greedy = []
    for i in range(pred.data.size(0)):
        pred_greedy.append(self.denum(pred.data[i]))
    del pred

    # sampling part
    model_pred = self.decoder.sampling(encoding, s, s.size(1) - 2,
                                       sample_prob=1, is_argmax=False)
    model_pred.detach_()
    new_y = torch.cat((Variable(
        model_pred.data.new(s.size(0), 1).long().fill_(
            self.decoder.vocab.stoi['<init>'])), model_pred), 1)
    h = self.decoder(new_y, encoding)
    B, T, H = h.size()
    logits = self.decoder.out(h.view(-1, H))  # .view(B, T, -1)
    mask = (s[:, 1:] != 1).float()
    _, pred_sample = torch.max(logits, -1)
    p_model = F.log_softmax(logits, dim=-1)
    logp = p_model[torch.arange(0, B * T).type(logits.data.type()).long(),
                   pred_sample.data].view(B, T)
    pred_sample = pred_sample.view(B, T)
    assert pred_sample.size(0) == len(pred_greedy), (
        'pred_sample should have the same number of sentences as in '
        'pred_greedy, got {} and {} instead'.format(B, len(pred_greedy)))
    assert pred_sample.size() == (B, T), (
        'pred_sample should have size ({}, {}), got {} instead'.format(
            B, T, tuple(pred_sample.size())))
    pred_sample.detach_()

    # rewards: METEOR of the greedy and sampled captions against the ground truth
    sentence_greedy, sentence_sample, sentence_gt = {}, {}, {}
    for i in range(len(pred_greedy)):
        sentence_greedy[i] = [{'caption': pred_greedy[i]}]
        sentence_sample[i] = [{'caption': self.denum(pred_sample.data[i])}]
        sentence_gt[i] = [{'caption': self.denum(s.data[i, 1:])}]
    tok_greedy = self.tokenizer.tokenize(sentence_greedy)
    tok_sample = self.tokenizer.tokenize(sentence_sample)
    tok_gt = self.tokenizer.tokenize(sentence_gt)
    _, r_greedy = self.scorer.compute_score(tok_gt, tok_greedy)
    _, r_sample = self.scorer.compute_score(tok_gt, tok_sample)

    # self-critical advantage: sampled reward minus the greedy baseline reward
    r_diff = [r_s - r_g for (r_s, r_g) in zip(r_sample, r_greedy)]
    r_diff = Variable(torch.Tensor(r_diff).type(logp.data.type()))
    loss = -torch.mean(torch.sum(r_diff.view(-1, 1) * logp * mask, 1))
    return loss
def __init__(self, ground_truth_filenames=None, prediction_filename=None,
             verbose=False, all_scorer=False):
    # Check that the gt and submission files exist and load them
    if not ground_truth_filenames:
        raise IOError('Please input a valid ground truth file.')
    if not prediction_filename:
        raise IOError('Please input a valid prediction file.')

    self.verbose = verbose
    self.all_scorer = all_scorer
    self.ground_truths = self.import_ground_truths(ground_truth_filenames)
    self.prediction = self.import_prediction(prediction_filename)
    self.tokenizer = PTBTokenizer()

    # Set up scorers; if not verbose, we only use the one we're
    # testing on: METEOR
    if self.verbose or self.all_scorer:
        self.scorers = [
            (Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]),
            (Meteor(), "METEOR"),
            (Rouge(), "ROUGE_L"),
            (Cider(), "CIDEr")
        ]
    else:
        self.scorers = [(Meteor(), "METEOR")]
def test(model_path='models/model-61', video_feat_path=video_feat_path):
    train_data, test_data = get_video_data(video_data_path, video_feat_path, train_ratio=0.7)
    test_videos = test_data['video_path'].values
    test_captions = test_data['Description'].values
    ixtoword = pd.Series(np.load('./data/ixtoword.npy').tolist())

    # group all captions belonging to the same test video
    test_videos_unique = list()
    test_captions_list = list()
    for (video, caption) in zip(test_videos, test_captions):
        if len(test_videos_unique) == 0 or test_videos_unique[-1] != video:
            test_videos_unique.append(video)
            test_captions_list.append([caption])
        else:
            test_captions_list[-1].append(caption)

    model = Video_Caption_Generator(
        dim_image=dim_image,
        n_words=len(ixtoword),
        dim_embed=dim_embed,
        dim_hidden=dim_hidden,
        batch_size=batch_size,
        encoder_max_sequence_length=encoder_step,
        decoder_max_sentence_length=decoder_step,
        bias_init_vector=None)

    video_tf, video_mask_tf, caption_tf, probs_tf, last_embed_tf = model.build_generator()
    sess = tf.InteractiveSession()
    saver = tf.train.Saver()
    saver.restore(sess, model_path)

    scorer = Meteor()
    scorer_bleu = Bleu(4)
    GTS = defaultdict(list)
    RES = defaultdict(list)
    counter = 0
    for (video_feat_path, caption) in zip(test_videos_unique, test_captions_list):
        generated_sentence = gen_sentence(
            sess, video_tf, video_mask_tf, caption_tf, video_feat_path, ixtoword)
        print(video_feat_path, generated_sentence)
        # print(caption)
        GTS[str(counter)] = [{'image_id': str(counter), 'cap_id': i, 'caption': s}
                             for i, s in enumerate(caption)]
        RES[str(counter)] = [{'image_id': str(counter),
                              'caption': generated_sentence[:-2] + '.'}]
        # GTS[video_feat_path] = caption
        # RES[video_feat_path] = [generated_sentence[:-2] + '.']
        counter += 1
        # ipdb.set_trace()

    tokenizer = PTBTokenizer()
    GTS = tokenizer.tokenize(GTS)
    RES = tokenizer.tokenize(RES)
    score, scores = scorer.compute_score(GTS, RES)
    print("METEOR", score)
    score, scores = scorer_bleu.compute_score(GTS, RES)
    print("BLEU", score)
def __init__(self, ground_truth_filenames=None, prediction_filename=None,
             tious=None, max_proposals=1000,
             prediction_fields=PREDICTION_FIELDS, verbose=False):
    # Check that the gt and submission files exist and load them
    if tious is None or len(tious) == 0:
        raise IOError('Please input a valid tIoU.')
    if not ground_truth_filenames:
        raise IOError('Please input a valid ground truth file.')
    if not prediction_filename:
        raise IOError('Please input a valid prediction file.')

    self.verbose = verbose
    self.tious = tious
    self.max_proposals = max_proposals
    self.pred_fields = prediction_fields
    self.ground_truths = self.import_ground_truths(ground_truth_filenames)
    self.prediction = self.import_prediction(prediction_filename)
    self.tokenizer = PTBTokenizer()

    # Set up scorers; if not verbose, we only use the one we're
    # testing on: METEOR
    if self.verbose:
        self.scorers = [
            (Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]),
            (Meteor(), "METEOR"),
            (Rouge(), "ROUGE_L"),
            (Cider(), "CIDEr")
        ]
    else:
        self.scorers = [(Meteor(), "METEOR")]
def main(eval_caption_file, output, zh=False):
    df = pd.read_json(eval_caption_file)
    if zh:
        refs = df.groupby("key")["tokens"].apply(list).to_dict()
    else:
        refs = df.groupby("key")["caption"].apply(list).to_dict()

    from pycocoevalcap.bleu.bleu import Bleu
    from pycocoevalcap.cider.cider import Cider
    from pycocoevalcap.rouge.rouge import Rouge

    scorer = Bleu(zh=zh)
    bleu_scores = coco_score(copy.deepcopy(refs), scorer)
    scorer = Cider(zh=zh)
    cider_score = coco_score(copy.deepcopy(refs), scorer)
    scorer = Rouge(zh=zh)
    rouge_score = coco_score(copy.deepcopy(refs), scorer)

    if not zh:
        from pycocoevalcap.meteor.meteor import Meteor
        scorer = Meteor()
        meteor_score = coco_score(copy.deepcopy(refs), scorer)
        from pycocoevalcap.spice.spice import Spice
        scorer = Spice()
        spice_score = coco_score(copy.deepcopy(refs), scorer)

    with open(output, "w") as f:
        for n in range(4):
            f.write("BLEU-{}: {:6.3f}\n".format(n + 1, bleu_scores[n]))
        f.write("CIDEr: {:6.3f}\n".format(cider_score))
        f.write("ROUGE: {:6.3f}\n".format(rouge_score))
        if not zh:
            f.write("Meteor: {:6.3f}\n".format(meteor_score))
            f.write("SPICE: {:6.3f}\n".format(spice_score))
def score(ref, hypo):
    """
    ref, dictionary of reference sentences (id, sentence)
    hypo, dictionary of hypothesis sentences (id, sentence)
    score, dictionary of scores
    """
    # print('ref')
    # print(ref)
    # print('hypo')
    # print(hypo)
    scorers = [
        (Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]),
        (Meteor(), "METEOR"),
        (Rouge(), "ROUGE_L"),
        (Cider(), "CIDEr")
    ]
    final_scores = {}
    for scorer, method in scorers:
        score, scores = scorer.compute_score(ref, hypo)
        if type(score) == list:
            for m, s in zip(method, score):
                final_scores[m] = s
        else:
            final_scores[method] = score
    return final_scores
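# A minimal usage sketch for score() above (illustrative data, not from the original
# source). Keys of `ref` and `hypo` must match; every value is a list of plain strings,
# with exactly one candidate per key in `hypo` and one or more references per key in `ref`.
example_ref = {
    '0': ['a man is playing a guitar', 'someone plays the guitar'],
    '1': ['a dog runs across the field'],
}
example_hypo = {
    '0': ['a man plays a guitar'],
    '1': ['a dog is running on the grass'],
}
print(score(example_ref, example_hypo))  # {'Bleu_1': ..., 'METEOR': ..., 'ROUGE_L': ..., 'CIDEr': ...}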
def __init__(self, ground_truth_filenames, prediction_filename,
             verbose=False, all_scorer=False):
    # Check that the gt and submission files exist and load them
    self.verbose = verbose
    self.all_scorer = all_scorer
    self.ground_truths = self.import_ground_truths(ground_truth_filenames)
    self.prediction = self.import_prediction(prediction_filename)
    self.tokenizer = PTBTokenizer()

    # Set up scorers; if not verbose, we only use the one we're
    # testing on: METEOR.
    # Meteor is java-based and can crash a lot.
    try:
        met = Meteor()
    except (AttributeError, FileNotFoundError) as e:
        print(f"Meteor couldn't start due to {e}")
        met = None
    if self.verbose or self.all_scorer:
        self.scorers = [
            (Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]),
            (met, "METEOR"),
            (Rouge(), "ROUGE_L"),
            (Cider(), "CIDEr")
        ]
    else:
        self.scorers = [(met, "METEOR")]

    # init some attributes
    self.easy_samples = {}
    self.hard_samples = {}
    self.n_ref_vids = set()
    self.scores = {}
def score(self, GT, RES, IDs):
    # edited by rgh
    # self.eval = {}
    self.eval = OrderedDict()
    self.imgToEval = {}
    gts = {}
    res = {}
    for ID in IDs:
        # print ID
        gts[ID] = GT[ID]
        res[ID] = RES[ID]
    print('tokenization...')
    tokenizer = PTBTokenizer()
    gts = tokenizer.tokenize(gts)
    res = tokenizer.tokenize(res)

    # =================================================
    # Set up scorers
    # =================================================
    print('setting up scorers...')
    # edited by rgh
    # scorers = [
    #     (Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]),
    #     (Meteor(), "METEOR"),
    #     (Rouge(), "ROUGE_L"),
    #     (Cider(), "CIDEr"),
    #     # (Spice(), "SPICE")
    # ]
    scorers = [
        (Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]),
        (Meteor(), "METEOR"),
        (Cider(), "CIDEr"),
        (Rouge(), "ROUGE_L"),
        # (Spice(), "SPICE")
    ]

    # =================================================
    # Compute scores
    # =================================================
    eval = {}
    for scorer, method in scorers:
        print('computing %s score...' % (scorer.method()))
        score, scores = scorer.compute_score(gts, res)
        if type(method) == list:
            # added by rgh
            # for sc, scs, m in zip(score, scores, method):
            #     self.setEval(sc, m)
            #     self.setImgToEvalImgs(scs, IDs, m)
            #     print("%s: %0.3f" % (m, sc))
            self.setEval("%.4f" % score[-1], method[-1])
            self.setImgToEvalImgs(scores[-1], IDs, method[-1])
            print("%s: %0.4f" % (method[-1], score[-1]))
        else:
            self.setEval("%.4f" % score, method)
            self.setImgToEvalImgs(scores, IDs, method)
            print("%s: %0.4f" % (method, score))
    # for metric, score in self.eval.items():
    #     print('%s: %.3f' % (metric, score))
    return self.eval
def calc_scores(file1, file2):
    """
    ref, dictionary of reference sentences (id, sentence)
    hypo, dictionary of hypothesis sentences (id, sentence)
    score, dictionary of scores

    ref:  the ground-truth data, a dict mapping each id to a list of sentences,
          e.g. dict{"id": ["sentences"]}
    hypo: the generated data, in the same format.
    Each value must satisfy:
        assert(type(hypo) is list); assert(len(hypo) == 1)
        assert(type(ref) is list);  assert(len(ref) >= 1)
    """
    pred = readfiles(file1)
    test = readfiles(file2)

    # assemble the dicts, keyed by line index
    i = [i for i in range(len(pred))]
    hypo = dict(zip(i, pred))
    ref = dict(zip(i, test))

    scorers = [
        (Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]),
        (Meteor(), "METEOR"),
        (Rouge(), "ROUGE_L"),
        (Cider(), "CIDEr")
    ]
    final_scores = {}
    for scorer, method in scorers:
        score, scores = scorer.compute_score(ref, hypo)
        if type(score) == list:
            for m, s in zip(method, score):
                final_scores[m] = s
        else:
            final_scores[method] = score
    return final_scores
def get_scorers(self):
    # from pycoco_scorers_vizseq import BLEUScorerAll
    from pycocoevalcap.bleu.bleu import Bleu
    # from pycocoevalcap.spice.spice import Spice
    from pycocoevalcap.cider.cider import Cider
    from pycocoevalcap.rouge.rouge import Rouge
    from pycocoevalcap.meteor.meteor import Meteor
    from pycocoevalcap.tokenizer.ptbtokenizer import PTBTokenizer
    import logging
    import transformers

    transformers.tokenization_utils.logger.setLevel(logging.ERROR)
    transformers.configuration_utils.logger.setLevel(logging.ERROR)
    transformers.modeling_utils.logger.setLevel(logging.ERROR)

    Scorer_ = namedtuple("Scorer_", ["cls_fn", "to_init", "out_str"])
    self.scorer_dict = {
        "bleu": Scorer_(Bleu(4, verbose=0), False,
                        ["bleu@1", "bleu@2", "bleu@3", "bleu@4"]),
        "meteor": Scorer_(Meteor(), False, ["meteor"]),
        "cider": Scorer_(Cider("corpus"), False, ["cider"]),
        "rouge": Scorer_(Rouge(), False, ["rouge"]),
        # "spice": Scorer_(Spice(), False, ["spice"]),
        "bert_score": Scorer_(BertScoreSimple, True, ["bert_score"]),
    }
    self.tokenizer = PTBTokenizer()
def evaluate(gts, res):
    eval = {}

    # =================================================
    # Tokenization
    # =================================================
    print('tokenization...')
    tokenizer = PTBTokenizer()
    gts = tokenizer.tokenize(gts)
    res = tokenizer.tokenize(res)

    # =================================================
    # Set up scorers
    # =================================================
    print('setting up scorers...')
    scorers = [
        (Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]),
        (Meteor(), "METEOR"),
        (Rouge(), "ROUGE_L"),
        (Cider(), "CIDEr")
    ]

    # =================================================
    # Compute scores
    # =================================================
    for scorer, method in scorers:
        print('computing %s score...' % (scorer.method()))
        score, scores = scorer.compute_score(gts, res)
        if type(method) == list:
            for sc, scs, m in zip(score, scores, method):
                eval[m] = sc
        else:
            eval[method] = score
    return eval
def get_scorers(self):
    # from pycoco_scorers_vizseq import BLEUScorerAll
    from pycocoevalcap.bleu.bleu import Bleu
    from pycocoevalcap.cider.cider import Cider
    from pycocoevalcap.rouge.rouge import Rouge
    from pycocoevalcap.meteor.meteor import Meteor
    # from pycocoevalcap.spice.spice import Spice
    from pycocoevalcap.tokenizer.ptbtokenizer import PTBTokenizer

    Scorer_ = namedtuple("Scorer_", ["cls_fn", "to_init", "out_str"])
    self.scorer_dict = {
        "bleu": Scorer_(
            Bleu(4, verbose=0), False, ["bleu_1", "bleu_2", "bleu_3", "bleu_4"]
        ),
        "meteor": Scorer_(Meteor(), False, ["meteor"]),
        "cider": Scorer_(Cider("corpus"), False, ["cider"]),
        "rouge": Scorer_(Rouge(), False, ["rouge"]),
        # "spice": Scorer_(Spice(), False, ["spice"]),
    }
    self.tokenizer = PTBTokenizer()

    self.coval_all_metrics = [
        ("mentions", evaluator.mentions),
        ("muc", evaluator.muc),
        ("bcub", evaluator.b_cubed),
        ("ceafe", evaluator.ceafe),
        ("lea", evaluator.lea),
        ("lea_soft", evaluator.lea_soft),
    ]
    self.reset_coval_scorer_dict()
def language_eval(sample_seqs, groundtruth_seqs):
    assert len(sample_seqs) == len(groundtruth_seqs), \
        'length of sampled seqs is different from that of groundtruth seqs!'

    references = OrderedDict()
    predictions = OrderedDict()
    for i in range(len(groundtruth_seqs)):
        references[i] = [groundtruth_seqs[i][j] for j in range(len(groundtruth_seqs[i]))]
    for i in range(len(sample_seqs)):
        predictions[i] = [sample_seqs[i]]

    predictions = {i: predictions[i] for i in range(len(sample_seqs))}
    references = {i: references[i] for i in range(len(groundtruth_seqs))}

    avg_bleu_score, bleu_score = Bleu(4).compute_score(references, predictions)
    print('avg_bleu_score == ', avg_bleu_score)
    avg_cider_score, cider_score = Cider().compute_score(references, predictions)
    print('avg_cider_score == ', avg_cider_score)
    avg_meteor_score, meteor_score = Meteor().compute_score(references, predictions)
    print('avg_meteor_score == ', avg_meteor_score)
    avg_rouge_score, rouge_score = Rouge().compute_score(references, predictions)
    print('avg_rouge_score == ', avg_rouge_score)

    # print('BLEU1:{}\nBLEU2:{}\nBLEU3:{}\nBLEU4:{}\nMETEOR:{}\nROUGE:{}\nCIDEr:{}'.format(
    #     avg_bleu_score[0], avg_bleu_score[1], avg_bleu_score[2], avg_bleu_score[3],
    #     avg_meteor_score, avg_rouge_score, avg_cider_score))

    return {'BLEU': avg_bleu_score, 'CIDEr': avg_cider_score,
            'METEOR': avg_meteor_score, 'ROUGE': avg_rouge_score}
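# Illustrative call to language_eval() above (the sentences are assumptions):
# sample_seqs is a flat list of generated captions, and groundtruth_seqs is a parallel
# list where each entry holds all reference captions for that sample.
sample_seqs = ['a man is playing guitar']
groundtruth_seqs = [['a man plays a guitar', 'someone is playing the guitar']]
stats = language_eval(sample_seqs, groundtruth_seqs)
print(stats['BLEU'][3], stats['METEOR'], stats['ROUGE'], stats['CIDEr'])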
def evaluate(self):
    # =================================================
    # Tokenization
    # =================================================
    print("Tokenization")
    tokenizer = PTBTokenizer()
    gts = tokenizer.tokenize(self.ground_truth)
    preds = tokenizer.tokenize(self.prediction)

    # =================================================
    # Set up scorers
    # =================================================
    print("Setting up scorers...")
    scorers = [
        (Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]),
        (Meteor(), "METEOR"),
        (Rouge(), "ROUGE_L"),
        (Cider(), "CIDEr"),
        # (Spice(), "SPICE")
    ]

    # =================================================
    # Compute scores
    # =================================================
    for scorer, method in scorers:
        print("Computing {} score...".format(scorer.method()))
        score, scores = scorer.compute_score(gts, preds)
        if isinstance(method, list):
            for sc, scs, m in zip(score, scores, method):
                self.eval_res[m] = sc * 100
        else:
            self.eval_res[method] = score * 100
def compute_ms_coco(self):
    """Performs the MS COCO evaluation using the Python 3 implementation
    (https://github.com/salaniz/pycocoevalcap)

    :param gts: Dictionary with the image ids and their gold captions
    :param res: Dictionary with the image ids and their generated captions
    :print: Evaluation score (the mean of the scores of all the instances)
            for each measure
    """
    # Load the csv files containing the results and gold data.
    self.logger.info("Loading data")
    self._load_data()

    # Preprocess captions
    self.logger.info("Preprocessing captions")
    self.gold_data = self._preprocess_captions(self.gold_data)
    self.result_data = self._preprocess_captions(self.result_data)

    if len(self.gold_data) == len(self.result_data):
        # Set up scorers
        scorers = [
            (Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]),
            (Meteor(), "METEOR"),
            (Rouge(), "ROUGE_L")
        ]
        # Compute score for each metric
        self.logger.info("Computing COCO score.")
        for scorer, method in scorers:
            print("Computing", scorer.method(), "...")
            score, scores = scorer.compute_score(self.gold_data, self.result_data)
            if type(method) == list:
                for sc, m in zip(score, method):
                    print("%s : %0.3f" % (m, sc))
            else:
                print("%s : %0.3f" % (method, score))
    else:
        self.logger.error(
            "Gold data len={0} and results data len={1} are not of equal size"
            .format(len(self.gold_data), len(self.result_data)))
def evaluate(self):
    imgIds = self.params['image_id']
    gts = self.gts
    res = self.res

    # =================================================
    # Tokenization
    # =================================================
    tokenizer = PTBTokenizer()
    gts = tokenizer.tokenize(gts)
    res = tokenizer.tokenize(res)

    # =================================================
    # Set up scorers
    # =================================================
    scorers = [
        (Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]),
        (Meteor(), "METEOR"),
        (Rouge(), "ROUGE_L"),
        (Cider(), "CIDEr")
    ]

    # =================================================
    # Compute scores
    # =================================================
    eval = {}
    for scorer, method in scorers:
        score, scores = scorer.compute_score(gts, res)
        if type(method) == list:
            for sc, scs, m in zip(score, scores, method):
                self.setEval(sc, m)
                self.setImgToEvalImgs(scs, imgIds, m)
        else:
            self.setEval(score, method)
            self.setImgToEvalImgs(scores, imgIds, method)
    self.setEvalImgs()
def CocoScore(ref, hyp, metrics_list=None, language='en'):
    """
    Obtains the COCO scores from the references and hypotheses.

    :param ref: Dictionary of reference sentences (id, sentence)
    :param hyp: Dictionary of hypothesis sentences (id, sentence)
    :param metrics_list: List of metrics to evaluate on
    :param language: Language of the sentences (for METEOR)
    :return: Dictionary of scores
    """
    if metrics_list is None:
        metrics_list = ['bleu', 'ter', 'meteor', 'rouge_l', 'cider']
    else:
        metrics_list = [metric.lower() for metric in metrics_list]

    scorers = []
    if 'bleu' in metrics_list:
        scorers.append((Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]))
    if 'meteor' in metrics_list:
        scorers.append((Meteor(language), "METEOR"))
    if 'ter' in metrics_list:
        scorers.append((Ter(), "TER"))
    if 'rouge_l' in metrics_list or 'rouge' in metrics_list:
        scorers.append((Rouge(), "ROUGE_L"))
    if 'cider' in metrics_list:
        scorers.append((Cider(), "CIDEr"))

    final_scores = {}
    for scorer, method in scorers:
        score, _ = scorer.compute_score(ref, hyp)
        if isinstance(score, list):
            for m, s in zip(method, score):
                final_scores[m] = s
        else:
            final_scores[method] = score
    return final_scores
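# A small usage sketch for CocoScore() above (the sentences and metric subset are only
# examples; note that Ter() and Meteor(language) come from the surrounding project rather
# than vanilla pycocoevalcap).
refs = {0: ['the cat sits on the mat'], 1: ['a man rides a horse']}
hyps = {0: ['a cat is sitting on the mat'], 1: ['a man is riding a horse']}
scores = CocoScore(refs, hyps, metrics_list=['bleu', 'rouge_l', 'cider'])
print(scores)  # e.g. {'Bleu_1': ..., 'Bleu_2': ..., 'Bleu_3': ..., 'Bleu_4': ..., 'ROUGE_L': ..., 'CIDEr': ...}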
def compute_scores(gts, res):
    """
    Performs the MS COCO evaluation using the Python 3 implementation
    (https://github.com/salaniz/pycocoevalcap)

    :param gts: Dictionary with the image ids and their gold captions
    :param res: Dictionary with the image ids and their generated captions
    :print: Evaluation score (the mean of the scores of all the instances)
            for each measure
    """
    # Preprocess captions
    gts = preprocess_captions(gts)
    res = preprocess_captions(res)

    # Set up scorers
    scorers = [
        (Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]),
        (Meteor(), "METEOR"),
        (Rouge(), "ROUGE_L"),
        (Spice(), "SPICE"),
        (Cider(), "CIDEr")
    ]

    # Compute score for each metric
    for scorer, method in scorers:
        print("Computing", scorer.method(), "...")
        score, scores = scorer.compute_score(gts, res)
        if type(method) == list:
            for sc, m in zip(score, method):
                print("%s : %0.3f" % (m, sc))
        else:
            print("%s : %0.3f" % (method, score))
def score(gts, res, ids):
    origingts = gts
    originres = res
    tokenizer = PTBTokenizer()
    gts = tokenizer.tokenize(gts)
    res = tokenizer.tokenize(res)

    # scorers = [
    #     (Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]),
    #     (Meteor(), "METEOR"),
    #     (Rouge(), "ROUGE_L"),
    #     (Cider(), "CIDEr")]
    scorers = [(Meteor(), "METEOR")]
    for scorer, method in scorers:
        score, scores = scorer.compute_score(gts, res)
        print("{:<14}:\t{:0.4f}".format(method, score))

    F1_score = F1(originres, origingts)
    avg = 0.0
    for noc_word in sorted(F1_score.keys()):
        print("{:<14}:\t{:0.4f}".format(noc_word, F1_score[noc_word]))
        avg += F1_score[noc_word]
    avg = avg / len(F1_score.keys())
    print("{:<14}:\t{:0.4f}".format("Average", avg))
def get_dcc_scores(self):
    imgIds = self.params['image_id']
    # imgIds = self.coco.getImgIds()
    gts = {}
    res = {}
    for imgId in imgIds:
        gts[imgId] = self.coco.imgToAnns[imgId]
        res[imgId] = self.cocoRes.imgToAnns[imgId]

    tokenizer = PTBTokenizer()
    gts = tokenizer.tokenize(gts)
    res = tokenizer.tokenize(res)

    scorers = [
        (Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]),
        (Meteor(), "METEOR"),
        (Rouge(), "ROUGE_L"),
        (Cider(), "CIDEr")
    ]
    score_dict = {}
    for scorer, method in scorers:
        print('computing %s score...' % (scorer.method()))
        score, scores = scorer.compute_score(gts, res)
        if type(method) == list:
            for sc, scs, m in zip(score, scores, method):
                score_dict[m] = sc
                print("%s: %0.3f" % (m, sc))
        else:
            score_dict[method] = score
            print("%s: %0.3f" % (method, score))
    return score_dict
def get_scorers(cider_idx_path):
    return {
        'cider': CiderD(df=cider_idx_path),
        'bleu': Bleu(),
        'rouge': Rouge(),
        'meteor': Meteor()
    }
def score(num, DIR):
    print("Testing results on epoch ", num, " in DIR=", DIR)
    print("Loading coco annotations")
    dataDir = '.'
    dataType = 'val2014'
    algName = 'fakecap'
    annFile = '%s/annotations/captions_%s.json' % (dataDir, dataType)
    subtypes = ['results', 'evalImgs', 'eval']
    [resFile, evalImgsFile, evalFile] = \
        ['%s/results/captions_%s_%s_%s.json' % (dataDir, dataType, algName, subtype)
         for subtype in subtypes]
    coco_anns = COCO(annFile)
    print("COCO anns imported")

    path = DIR + str(num) + '_test_result.tar.gz'
    save = pickle.load(open(path))
    cocoRes = {}
    coco = {}
    for key, val in save.items():
        reslst = val[u'res']
        # cut the generated sequence at the first <SEND> token and drop the start token
        res = []
        for data in reslst:
            if data != u'<SEND>':
                res.append(data)
            else:
                break
        res = res[1:]
        # print("RES: ", reslst)
        # print("ANN: ", val[u'ann'])
        # res = [word for word in res if word != u'<SEND>'][1:]
        # print("RES FIXED: ", res)
        if len(res) == 0:
            res = [u'a']  # just not to be empty, and it has a low idf
        cocoRes[key] = [{u'caption': ' '.join(res)}]
        # coco[key] = [{u'caption': ' '.join(val[u'ann'][1:-1])}]
        coco[key] = coco_anns.imgToAnns[key]

    print('examples')
    for key in list(coco.keys())[:5]:
        print("IMG_NUM=", key)
        print("Annotation: ", '\n'.join(
            [coco[key][i][u'caption'] for i in range(len(coco[key]))]))
        print("Generated data: ", ' '.join(save[key][u'res']))
        print("Cleared generation: ", cocoRes[key][0][u'caption'])

    print('tokenization...')
    tokenizer = PTBTokenizer()
    gts = tokenizer.tokenize(coco)
    res = tokenizer.tokenize(cocoRes)

    print('setting up scorers...')
    scorers = [
        (Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]),
        (Meteor(), "METEOR"),
        (Rouge(), "ROUGE_L"),
        (Cider(), "CIDEr"),
        (Spice(), "SPICE")
    ]
    for scorer, method in scorers:
        print('computing %s score...' % (scorer.method()))
        score, scores = scorer.compute_score(gts, res)
        print(score)
def _define_metrics(gts, res):
    bleu_scorer = Bleu(n=4)
    bleu, _ = bleu_scorer.compute_score(gts=gts, res=res)
    rouge_scorer = Rouge()
    rouge, _ = rouge_scorer.compute_score(gts=gts, res=res)
    cider_scorer = Cider()
    cider, _ = cider_scorer.compute_score(gts=gts, res=res)
    meteor_scorer = Meteor()
    meteor, _ = meteor_scorer.compute_score(gts=gts, res=res)

    for i in range(4):
        bleu[i] = round(bleu[i], 4)

    return bleu, round(meteor, 4), round(rouge, 4), round(cider, 4)
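# Hypothetical call to _define_metrics() above, assuming gts and res are dicts that map
# each id to a list of plain-string captions (the ids and captions here are made up).
gts = {'img1': ['a woman is slicing a tomato', 'someone cuts a tomato']}
res = {'img1': ['a woman cuts a tomato']}
bleu, meteor, rouge, cider = _define_metrics(gts, res)
print(bleu, meteor, rouge, cider)  # four rounded BLEU values plus the three rounded single-value metrics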
def init_scorer(cached_tokens):
    global CiderD_scorer
    CiderD_scorer = CiderD_scorer or CiderD(df=cached_tokens)
    global Bleu_scorer
    Bleu_scorer = Bleu_scorer or Bleu(4)
    global Meteor_scorer
    Meteor_scorer = Meteor()
    global Rouge_scorer
    Rouge_scorer = Rouge()
def init_eval_metric(bleu_n=4):
    global Meteor_scorer
    global Cider_scorer
    global Bleu_scorer
    global Bleu_N
    Meteor_scorer = Meteor_scorer or Meteor()
    Cider_scorer = Cider_scorer or Cider()
    Bleu_scorer = Bleu_scorer or Bleu(bleu_n)
    Bleu_N = bleu_n
def eval(result_gts_path, result_res_path):
    with open(result_gts_path, 'r') as file:
        gts_dict = json.load(file)
    with open(result_res_path, 'r') as file:
        res_dict = json.load(file)

    bleu_scorer = Bleu(n=4)
    bleu, _ = bleu_scorer.compute_score(gts=gts_dict, res=res_dict)
    meteor_scorer = Meteor()
    meteor, _ = meteor_scorer.compute_score(gts=gts_dict, res=res_dict)
    rouge_scorer = Rouge()
    rouge, _ = rouge_scorer.compute_score(gts=gts_dict, res=res_dict)
    cider_scorer = Cider()
    cider, _ = cider_scorer.compute_score(gts=gts_dict, res=res_dict)

    return bleu, meteor, rouge, cider
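# A sketch of the JSON layout eval() above expects (file names are illustrative): both
# files hold a dict keyed by the same ids, each mapping to a list of caption strings, e.g.
#   result_gts.json: {"0": ["a dog runs in the park", "a dog is running"], ...}
#   result_res.json: {"0": ["a dog running in a park"], ...}
bleu, meteor, rouge, cider = eval('result_gts.json', 'result_res.json')
print('BLEU-4: {:.3f}, METEOR: {:.3f}, ROUGE_L: {:.3f}, CIDEr: {:.3f}'.format(
    bleu[3], meteor, rouge, cider))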
def __init__(self, generation_l, top_K, crossover_c, iterations_M,
             sentiment_based_neighbours, alpha, beta, replacements_Z, replLimit):
    self.generation = generation_l
    self.topK = top_K
    self.crossover = crossover_c
    self.iterations = iterations_M
    self.neighboursDictionary = sentiment_based_neighbours
    self.replacements = replacements_Z
    self.replacementsLimit = replLimit
    self.allowedPOStags = [
        'ADJ', 'ADP', 'ADV', 'AUX', 'CONJ', 'CCONJ', 'DET', 'INTJ',
        'NOUN', 'PRON', 'SCONJ', 'VERB'
    ]
    self.alpha = alpha
    self.beta = beta
    self.meteor = Meteor()
def evaluate_tiou(self, tiou):
    # For every prediction, find its respective references with tIoU > the passed-in threshold.
    res = {}
    gts = {}
    unique_index = 0
    for vid_id in self.prediction:
        for pred in self.prediction[vid_id]:
            res[unique_index] = [{'caption': pred['sentence']}]
            matches = []
            for gt in self.ground_truths:
                refs = gt[vid_id]
                for ref_i, ref_timestamp in enumerate(refs['timestamps']):
                    if self.iou(pred['timestamp'], ref_timestamp) > tiou:
                        matches.append(refs['sentences'][ref_i])
            if len(matches) == 0:
                gts[unique_index] = [{'caption': 'abc123!@#'}]
            else:
                gts[unique_index] = [{'caption': v} for v in matches]
            unique_index += 1

    # Set up tokenizer
    if self.verbose:
        print('| Tokenizing ...')

    # Suppressing tokenizer output
    tokenizer = PTBTokenizer()
    gts = tokenizer.tokenize(gts)
    res = tokenizer.tokenize(res)

    # Set up scorers
    if self.verbose:
        print('| Setting up scorers ...')
    scorers = [
        (Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]),
        (Meteor(), "METEOR"),
        (Rouge(), "ROUGE_L"),
        (Cider(), "CIDEr")
    ]

    # Compute scores
    output = {}
    for scorer, method in scorers:
        if self.verbose:
            print('computing %s score...' % (scorer.method()))
        score, scores = scorer.compute_score(gts, res)
        if type(method) == list:
            for sc, scs, m in zip(score, scores, method):
                output[m] = sc
                if self.verbose:
                    print("Calculated tIoU: %1.1f, %s: %0.3f" % (tiou, m, sc))
        else:
            output[method] = score
            if self.verbose:
                print("Calculated tIoU: %1.1f, %s: %0.3f" % (tiou, method, score))
    return output