def init_scorer(cached_tokens):
    global CiderD_scorer
    CiderD_scorer = CiderD_scorer or CiderD(df=cached_tokens)
    global Bleu_scorer
    Bleu_scorer = Bleu_scorer or Bleu(4)
    global Meteor_scorer
    Meteor_scorer = Meteor()
    global Rouge_scorer
    Rouge_scorer = Rouge()
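# Sketch of the module-level state init_scorer() above appears to assume: the globals
# must already exist (e.g. initialized to None next to the CiderD/Bleu/Meteor/Rouge
# imports) for the `X = X or ...` pattern to work. The None defaults are an assumption,
# not taken from the original file.
CiderD_scorer = None
Bleu_scorer = None
Meteor_scorer = None
Rouge_scorer = None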
def cal_BLEU(generated, reference, is_corpus=False):
    # print 'in BLEU score calculation'
    # Accumulate BLEU-2/3/4 (score[1:]) for each generated sentence, then average.
    BLEUscore = [0.0, 0.0, 0.0]
    for idx, g in enumerate(generated):
        if is_corpus:
            score, scores = Bleu(4).compute_score(reference, {0: [g]})
        else:
            score, scores = Bleu(4).compute_score({0: [reference[0][idx]]}, {0: [g]})
        # print g, score
        for i, s in zip([0, 1, 2], score[1:]):
            BLEUscore[i] += s
        # BLEUscore += nltk.translate.bleu_score.sentence_bleu(reference, g, weight)
    BLEUscore[0] = BLEUscore[0] / len(generated)
    BLEUscore[1] = BLEUscore[1] / len(generated)
    BLEUscore[2] = BLEUscore[2] / len(generated)
    return BLEUscore
def compute_bleu_score(decode_res, keys, gts, start_idx, end_idx, vocabulary):
    """
    Args:
        decode_res: decoding results of model, [B, max_length]
        keys: keys of this batch, tuple [B,]
        gts: ground truth sentences of all audios, dict(<key> -> [ref_1, ref_2, ..., ref_n])
    Return:
        score: scores of this batch, [B,]
    """
    from pycocoevalcap.bleu.bleu import Bleu
    import numpy as np
    scorer = Bleu(4)
    hypothesis = {}
    references = {}

    for i in range(decode_res.shape[0]):
        if keys[i] in hypothesis:
            continue
        # prepare candidate: decode word ids, skipping <start> and stopping at <end>
        candidate = []
        for t, w_t in enumerate(decode_res[i]):
            if w_t == start_idx:
                continue
            elif w_t == end_idx:
                break
            else:
                candidate.append(vocabulary.idx2word[w_t])
        hypothesis[keys[i]] = [" ".join(candidate), ]
        # prepare reference
        references[keys[i]] = gts[keys[i]]

    (score, scores) = scorer.compute_score(references, hypothesis)

    key2score = {key: scores[3][i] for i, key in enumerate(hypothesis.keys())}
    results = np.zeros(decode_res.shape[0])
    for i in range(decode_res.shape[0]):
        results[i] = key2score[keys[i]]
    return results
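# Hedged usage sketch for compute_bleu_score() above. The toy vocabulary, ids, and
# references below are invented for illustration; only the calling convention follows
# the function body.
import numpy as np

class _ToyVocab:
    idx2word = {0: "<start>", 1: "<end>", 2: "a", 3: "dog", 4: "barks"}

decode_res = np.array([[0, 2, 3, 4, 1]])        # one decoded sequence: "a dog barks"
keys = ("clip_0",)
gts = {"clip_0": ["a dog barks", "a dog is barking"]}
per_sample_bleu4 = compute_bleu_score(decode_res, keys, gts,
                                      start_idx=0, end_idx=1, vocabulary=_ToyVocab())
# -> numpy array with one sentence-level BLEU-4 value per batch item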
def computeBleuScore(self):
    methods = ["Blue 1.0", "Blue 2.0", "Blue 3.0", "Blue 4.0"]
    # Compute score
    scores, blueList = Bleu(4).compute_score(self.gtc_tokens, self.pc_tokens)
    for score, blue, method in zip(scores, blueList, methods):
        self.evalResults[method] = score
        self.setVideoEvalResults(blue, method)
def __init__(self, vocab_file='graph2text/data/vocabs.txt'):
    super(Evaluate, self).__init__()
    self.scorers = [
        (Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]),
        # (Rouge(), "ROUGE_L")
    ]
    with open(vocab_file, encoding='utf-8') as f:
        vocab_list = f.readlines()
    self.vocab = [_.strip('\n') for _ in vocab_list]
    self.padding_idx = self.vocab.index('<blank>')
def eval(result_gts_path, result_res_path):
    with open(result_gts_path, 'r') as file:
        gts_dict = json.load(file)
    with open(result_res_path, 'r') as file:
        res_dict = json.load(file)

    bleu_score = Bleu(n=4)
    bleu, _ = bleu_score.compute_score(gts=gts_dict, res=res_dict)

    meteor_score = Meteor()
    meteor, _ = meteor_score.compute_score(gts=gts_dict, res=res_dict)

    rouge_scorer = Rouge()
    rouge, _ = rouge_scorer.compute_score(gts=gts_dict, res=res_dict)

    cider_scorer = Cider()
    cider, _ = cider_scorer.compute_score(gts=gts_dict, res=res_dict)

    return bleu, meteor, rouge, cider
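# Hedged sketch of the JSON layout eval() above appears to expect: both files map the
# same ids to lists of pre-tokenized caption strings, with one hypothesis per id on the
# result side. File names, ids, and captions below are invented for illustration.
import json

with open("result_gts.json", "w") as f:
    json.dump({"video0": ["a dog runs in the park", "a dog is running outside"]}, f)
with open("result_res.json", "w") as f:
    json.dump({"video0": ["a dog runs in a park"]}, f)

# bleu, meteor, rouge, cider = eval("result_gts.json", "result_res.json")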
def get_qg_metrics(generated, question, promptQuestion, metrics):
    evaluation = {}

    # computing bleu scores
    for name, score in zip(['bleu{}'.format(i) for i in range(1, 5)],
                           Bleu(4).compute_score(question, generated)[0]):
        if name in metrics:
            evaluation[name] = score

    # computing edit-f1 score
    if 'edit-f1' in metrics:

        def _get_edits(tokens1, tokens2):
            allCommon = []
            while True:
                commons = list(set(tokens1) & set(tokens2))
                if len(commons) == 0:
                    break
                allCommon += commons
                for c in commons:
                    ind1, ind2 = tokens1.index(c), tokens2.index(c)
                    tokens1 = tokens1[:ind1] + tokens1[ind1 + 1:]
                    tokens2 = tokens2[:ind2] + tokens2[ind2 + 1:]
            deleted = ["[DELETED]" + token for token in tokens1]
            added = ["[ADDED]" + token for token in tokens2]
            common = ["[FIXED]" + token for token in allCommon]
            return deleted + added  # +common

        assert len(generated) == len(promptQuestion) == 1
        generated = generated["sent"][0].split(" ")
        promptQuestion = promptQuestion["sent"][0].split(" ")
        prediction = _get_edits(promptQuestion, generated)
        edit_f1 = 0
        for _question in question["sent"]:
            _question = _question.split(" ")
            reference = _get_edits(promptQuestion, _question)
            # now compare the reference edits and predicted edits
            if len(reference) == len(prediction) == 0:
                # rarely, the reference has no edits after normalization;
                # then, if the prediction also has no edits, it gets full score
                edit_f1 = 1
            elif len(reference) == 0 or len(prediction) == 0:
                # if only one of them has no edits, zero score
                edit_f1 = max(edit_f1, 0)
            else:
                # otherwise, compute F1 score between prediction and reference
                edit_f1 = max(edit_f1,
                              get_f1(prediction, reference, is_equal=lambda x, y: x == y))
        evaluation["edit-f1"] = edit_f1

    assert len(metrics) == len(evaluation)
    return evaluation
def bleu_scorer(reference, hypothesis):
    # =================================================
    # Compute scores
    # =================================================
    scorer = Bleu(4)
    method = ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]
    # print('computing %s score...' % (scorer.method()))
    score, scores = scorer.compute_score(reference, hypothesis)
    bleus = {}
    if type(method) == list:
        for sc, scs, m in zip(score, scores, method):
            # print("%s: %0.3f" % (m, sc))
            bleus[m] = sc
    else:
        # print("%s: %0.3f" % (method, score))
        bleus[method] = score
    return bleus
def get_auxiliary_features(contexts, gtresponses, modelresponses, num_examples):
    aux_features = np.zeros((num_examples, 5))
    bleu1 = []
    bleu2 = []
    bleu3 = []
    bleu4 = []
    meteor = []
    rouge = []
    for i in xrange(num_examples):
        bleu1.append(Bleu(1).compute_score({0: [gtresponses[i]]}, {0: [modelresponses[i]]})[0][0])
        bleu2.append(Bleu(2).compute_score({0: [gtresponses[i]]}, {0: [modelresponses[i]]})[0][1])
        bleu3.append(Bleu(3).compute_score({0: [gtresponses[i]]}, {0: [modelresponses[i]]})[0][2])
        bleu4.append(Bleu(4).compute_score({0: [gtresponses[i]]}, {0: [modelresponses[i]]})[0][3])
        rouge.append(Rouge().compute_score({0: [gtresponses[i]]}, {0: [modelresponses[i]]})[0])
    aux_features[:, 0] = bleu1
    aux_features[:, 1] = bleu2
    aux_features[:, 2] = bleu3
    aux_features[:, 3] = bleu4
    aux_features[:, 4] = rouge
    return aux_features
class Metrics:
    def __init__(self):
        pass

    def bleu(self, hypo, ref):
        self.bleu_scorer = Bleu(4)
        final_scores = {}
        score, scores = self.bleu_scorer.compute_score(ref, hypo)
        for m, s in zip(["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"], score):
            final_scores[m] = s
        return final_scores
def score(ref, hypo):
    scorers = [(Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"])]
    final_scores = {}
    for scorer, method in scorers:
        score, scores = scorer.compute_score(ref, hypo)
        if type(score) == list:
            for m, s in zip(method, score):
                final_scores[m] = s
        else:
            final_scores[method] = score
    return final_scores
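# Hedged usage sketch for score() above (ids and sentences are invented): pycocoevalcap
# scorers expect both arguments to map an id to a list of whitespace-tokenized strings,
# with exactly one hypothesis string per id.
ref = {"0": ["a man is riding a horse", "a person rides a brown horse"]}
hypo = {"0": ["a man rides a horse"]}
print(score(ref, hypo))  # e.g. {'Bleu_1': ..., 'Bleu_2': ..., 'Bleu_3': ..., 'Bleu_4': ...}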
def init_scorer(cache_tokens):
    global CiderD_scorer
    if CiderD_scorer is None:
        CiderD_scorer = CiderD(df=cache_tokens)
    else:
        CiderD_scorer = CiderD_scorer
    # CiderD_scorer = CiderD_scorer or CiderD(df=cache_tokens)
    global Bleu_scorer
    if Bleu_scorer is None:
        Bleu_scorer = Bleu(4)
    else:
        Bleu_scorer = Bleu_scorer
def evaluate_tiou(self, tiou):
    # For every prediction, find its respective references with tIoU > the passed-in threshold.
    res = {}
    gts = {}
    unique_index = 0
    for vid_id in self.prediction:
        for pred in self.prediction[vid_id]:
            res[unique_index] = [{'caption': pred['sentence']}]
            matches = []
            for gt in self.ground_truths:
                refs = gt[vid_id]
                for ref_i, ref_timestamp in enumerate(refs['timestamps']):
                    if self.iou(pred['timestamp'], ref_timestamp) > tiou:
                        matches.append(refs['sentences'][ref_i])
            if len(matches) == 0:
                gts[unique_index] = [{'caption': 'abc123!@#'}]
            else:
                gts[unique_index] = [{'caption': v} for v in matches]
            unique_index += 1

    # Tokenize
    if self.verbose:
        print '| Tokenizing ...'

    # Suppressing tokenizer output
    tokenizer = PTBTokenizer()
    gts = tokenizer.tokenize(gts)
    res = tokenizer.tokenize(res)

    # Set up scorers
    if self.verbose:
        print '| Setting up scorers ...'
    scorers = [(Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]),
               (Meteor(), "METEOR"),
               (Rouge(), "ROUGE_L"),
               (Cider(), "CIDEr")]

    # Compute scores
    output = {}
    for scorer, method in scorers:
        if self.verbose:
            print 'computing %s score...' % (scorer.method())
        score, scores = scorer.compute_score(gts, res)
        if type(method) == list:
            for sc, scs, m in zip(score, scores, method):
                output[m] = sc
                if self.verbose:
                    print "Calculated tIoU: %1.1f, %s: %0.3f" % (tiou, m, sc)
        else:
            output[method] = score
            if self.verbose:
                print "Calculated tIoU: %1.1f, %s: %0.3f" % (tiou, method, score)
    return output
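# Hedged sketch (invented values) of the shapes evaluate_tiou() above walks over:
# self.prediction maps a video id to a list of {'sentence', 'timestamp'} dicts, and
# self.ground_truths is a list of annotation dicts, each mapping the same video id to
# parallel 'timestamps' / 'sentences' lists.
prediction = {
    "v_example": [
        {"sentence": "a man is playing guitar", "timestamp": [0.0, 12.5]},
    ],
}
ground_truths = [
    {
        "v_example": {
            "timestamps": [[0.0, 13.0], [13.0, 30.2]],
            "sentences": ["a man plays the guitar", "he bows to the crowd"],
        },
    },
]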
def calculate_metric(rnn, meteor=None):
    gts = {}
    res = {}
    lp_avg = 0.0
    lp_c = 0
    for idx in range(rnn.V_valid.shape[0]):
        iid = rnn.Id_valid[idx]
        if iid not in gts:
            gts[iid] = []
            # gts[iid].append(' '.join([rnn.dp.i2w[w] for w in rnn.X_valid[idx] if w != 0][::-1]))
            gts[iid] = [' '.join(rnn.dp.tokens[i][::-1]) for i in rnn.dp.img_id_to_tokens[iid]]
        if iid in res:
            continue
        res[iid] = []
        # pos_sen, pos_att = rnn.get_sentence(rnn.V_valid[idx], senti=np.array([1.0], dtype=theano.config.floatX))
        (lp, pos_sen) = decoder_beamsearch(rnn, rnn.V_valid[idx], senti=1.0, beam_size=1)
        pos_sen = pos_sen[:-1]
        print(' '.join(pos_sen[::-1]))
        res[iid].append(' '.join(pos_sen[::-1]))
        lp_avg += np.exp(lp)
        lp_c += 1
    lp_avg /= float(lp_c)
    return lp_avg

    # NOTE: everything below this early return is unreachable as written.
    bleu = Bleu()
    print("Bleu:")
    print("Positive:", bleu.compute_score(gts, res)[0])
    rouge = Rouge()
    print("Rouge:")
    print("Positive:", rouge.compute_score(gts, res)[0])
    if meteor is None:
        meteor = Meteor()
    print("Meteor:")
    mscore = meteor.compute_score(gts, res)[0]
    print("Positive:", mscore)
    return mscore
def val_score(self, s_start=0, num_batches=2):
    bs = self.imp["BATCH_SIZE"]
    bleu = Bleu()
    eval_store_gen = {}
    eval_store_gt = {}
    num_examples = self.test_data.dec_in.get_num_seqs()
    max_num_batches = num_examples / bs
    for i in xrange(min(num_batches, max_num_batches)):
        s = s_start + bs * i
        e = s_start + bs * (i + 1)
        gen_txt = self.generate(s=s, allow_unk=False)
        gt_txt = self.test_data.dec_out.get_text(s, e)
        fnames = self.test_data.filenames[s:e]
        for g, f in zip(gen_txt, fnames):
            if f not in eval_store_gen:
                eval_store_gen[f] = [" ".join(g)]
        for g, f in zip(gt_txt, fnames):
            if f not in eval_store_gt:
                eval_store_gt[f] = []
            eval_store_gt[f].append(" ".join(g))
    print bleu.compute_score(eval_store_gt, eval_store_gen)[0]
def score(ref, hypo):
    scorers = [(Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]),
               (Meteor(), "METEOR"),
               (Rouge(), "ROUGE_L"),
               (Cider(), "CIDEr")]
    final_scores = {}
    for scorer, method in scorers:
        score, scores = scorer.compute_score(ref, hypo)
        if type(score) == list:
            for m, s in zip(method, score):
                final_scores[m] = s
        else:
            final_scores[method] = score
    return final_scores
def score(self, GT, RES, IDs):
    self.eval = {}
    self.imgToEval = {}
    gts = {}
    res = {}
    for ID in IDs:
        # print ID
        gts[ID] = GT[ID]
        res[ID] = RES[ID]
    print('tokenization...')
    tokenizer = PTBTokenizer()
    gts = tokenizer.tokenize(gts)
    res = tokenizer.tokenize(res)

    # =================================================
    # Set up scorers
    # =================================================
    print('setting up scorers...')
    scorers = [
        (Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]),
        (Meteor(), "METEOR"),
        (Rouge(), "ROUGE_L"),
        (Cider(), "CIDEr"),
        # (Spice(), "SPICE")
    ]

    # =================================================
    # Compute scores
    # =================================================
    eval = {}
    sub_category_score = None
    for scorer, method in scorers:
        print('computing %s score...' % (scorer.method()))
        if method == 'SPICE':
            score, scores, sub_category_score = scorer.compute_score(gts, res)
        else:
            score, scores = scorer.compute_score(gts, res)
        if type(method) == list:
            for sc, scs, m in zip(score, scores, method):
                self.setEval(sc, m)
                self.setImgToEvalImgs(scs, IDs, m)
                print("%s: %0.3f" % (m, sc))
        else:
            self.setEval(score, method)
            self.setImgToEvalImgs(scores, IDs, method)
            print("%s: %0.3f" % (method, score))
    # for metric, score in self.eval.items():
    #     print '%s: %.3f' % (metric, score)
    return self.eval, sub_category_score
def eval_epoch_bleu(model, validation_data, device, vocab, list_of_refs_dev, args):
    ''' Epoch operation in evaluation phase '''
    model.eval()

    total_loss = 0
    n_word_total = 0
    n_word_correct = 0

    hypotheses = {}
    count = 0

    with torch.no_grad():
        for batch in tqdm(validation_data, mininterval=2,
                          desc=' - (Validation) ', leave=False):
            # prepare data
            image0, image1, image0_attribute, image1_attribute = map(lambda x: x.to(device), batch)

            """[src/tgt/memory]_key_padding_mask should be a ByteTensor where True values are
            positions that should be masked with float('-inf') and False values will be
            unchanged. This mask ensures that no information will be taken from position i
            if it is masked, and has a separate mask for each sequence in a batch."""

            hyp = beam_search(image0, image1, model, args, vocab, image0_attribute, image1_attribute)
            hyp = hyp.split("<end>")[0].strip()
            hypotheses[count] = [hyp]
            count += 1

    scorer = Bleu(4)
    score, _ = scorer.compute_score(list_of_refs_dev, hypotheses)

    return score
def score(gts, res, ids, log_out):
    tokenizer = PTBTokenizer()
    gts = tokenizer.tokenize(gts)
    res = tokenizer.tokenize(res)
    scorers = [(Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]),
               (Meteor(), "METEOR"),
               (Rouge(), "ROUGE_L"),
               (Cider(), "CIDEr")]
    for scorer, method in scorers:
        # print 'computing %s score...' % (scorer.method())
        score, scores = scorer.compute_score(gts, res)
        if type(method) == list:
            for sc, scs, m in zip(score, scores, method):
                print >> log_out, "%s: %f" % (m, sc)
        else:
            print >> log_out, "%s: %f" % (method, score)
def __init__(self, ground_truth_fname, lang=DEFAULT_LANG):
    self.eval = {}
    self.imgToEval = {}
    self.gts = {}
    data = open(ground_truth_fname).readlines() if 0 == lang else codecs.open(
        ground_truth_fname, 'r', 'utf-8').readlines()
    for line in data:
        sent_id, sent = line.strip().split(' ', 1)
        sent = ' '.join(TextTool.tokenize(sent, lang))  # process_sent(sent)
        img_id = os.path.splitext(sent_id.split('#')[0])[0]
        self.gts.setdefault(img_id, []).append(sent)

    logger.info('setting up scorers...')
    if 0 == lang:
        self.scorers = [(Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]),
                        (Meteor(), "METEOR"),
                        (Rouge(), "ROUGE_L"),
                        (Cider(), "CIDEr")]
    else:
        self.scorers = [(Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]),
                        (Rouge(), "ROUGE_L"),
                        (Cider(), "CIDEr")]
def score(ref, sample):
    # ref and sample are both dicts
    scorers = [(Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]),
               (Meteor(), "METEOR"),
               (Rouge(), "ROUGE_L"),
               (Cider(), "CIDEr")]
    final_scores = {}
    for scorer, method in scorers:
        print 'computing %s score with COCO-EVAL...' % (scorer.method())
        score, scores = scorer.compute_score(ref, sample)
        if type(score) == list:
            for m, s in zip(method, score):
                final_scores[m] = s
        else:
            final_scores[method] = score
    return final_scores
def evaluate_captions_bleu(ref, cand):
    hypo = {}
    refe = {}
    for i, caption in enumerate(cand):
        hypo[i] = [caption]
        refe[i] = ref[i]
    scorers = [(Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"])]
    final_scores = {}
    for scorer, method in scorers:
        _, scores = scorer.compute_score(refe, hypo)
        for m, s in zip(method, scores):
            final_scores[m] = s
            assert len(s) == len(cand)
    return final_scores['Bleu_4']
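# Hedged usage sketch for evaluate_captions_bleu() above (sentences invented): `ref`
# maps each index to its reference list and `cand` holds one hypothesis per index; the
# return value is the per-sentence BLEU-4 list, e.g. usable as a sequence-level reward.
refs = {0: ["a cat sits on a mat"], 1: ["two dogs play in the snow"]}
cands = ["a cat is sitting on a mat", "two dogs are playing in snow"]
per_sentence_bleu4 = evaluate_captions_bleu(refs, cands)  # list with len == len(cands)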
def score(self, GT, RES, IDs):
    self.eval = {}
    self.imgToEval = {}
    gts = {}
    res = {}
    for ID in IDs:
        gts[ID] = GT[ID]
        res[ID] = RES[ID]
    print 'tokenization...'
    tokenizer = PTBTokenizer()
    gts = tokenizer.tokenize(gts)
    res = tokenizer.tokenize(res)

    with open('all_samples.txt', 'w') as f:
        for i in res.keys():
            print >> f, 'valid stuff'
            print >> f, '\t'.join(res[i])
            print >> f, 'ground truth'
            print >> f, '\n'.join(gts[i])

    # =================================================
    # Set up scorers
    # =================================================
    print 'setting up scorers...'
    scorers = [
        (Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]),
        # (Meteor(), "METEOR"),
        (Rouge(), "ROUGE_L"),
        (Cider(), "CIDEr")
    ]

    # =================================================
    # Compute scores
    # =================================================
    eval = {}
    for scorer, method in scorers:
        print 'computing %s score...' % (scorer.method())
        score, scores = scorer.compute_score(gts, res)
        if type(method) == list:
            for sc, scs, m in zip(score, scores, method):
                self.setEval(sc, m)
                self.setImgToEvalImgs(scs, IDs, m)
                print "%s: %0.3f" % (m, sc)
        else:
            self.setEval(score, method)
            self.setImgToEvalImgs(scores, IDs, method)
            print "%s: %0.3f" % (method, score)
    for metric, score in self.eval.items():
        print '%s: %.3f' % (metric, score)
    return self.eval
def score(self, GT, RES, IDs, result_file):
    self.eval = {}
    self.imgToEval = {}
    gts = {}
    res = {}
    for ID in IDs:
        # print ID
        gts[ID] = GT[ID]
        res[ID] = RES[ID]
    print 'tokenization...'
    tokenizer = PTBTokenizer()
    gts = tokenizer.tokenize(gts)
    res = tokenizer.tokenize(res)

    # =================================================
    # Set up scorers
    # =================================================
    print 'setting up scorers...'
    scorers = [(Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]),
               (Meteor(), "METEOR"),
               (Rouge(), "ROUGE_L"),
               (Cider(), "CIDEr")]

    # result_file = '/home/anguyen/workspace/paper_src/2018.icra.v2c.source/output/' + net_id + '/prediction/score_result.txt'
    print 'RESULT FILE: ', result_file
    fwriter = open(result_file, 'w')

    # =================================================
    # Compute scores
    # =================================================
    eval = {}
    for scorer, method in scorers:
        print 'computing %s score...' % (scorer.method())
        score, scores = scorer.compute_score(gts, res)
        if type(method) == list:
            for sc, scs, m in zip(score, scores, method):
                self.setEval(sc, m)
                self.setImgToEvalImgs(scs, IDs, m)
                print "%s: %0.3f" % (m, sc)
                fwriter.write("%s %0.3f\n" % (m, sc))
        else:
            self.setEval(score, method)
            self.setImgToEvalImgs(scores, IDs, method)
            print "%s: %0.3f" % (method, score)
            fwriter.write("%s %0.3f\n" % (method, score))
    fwriter.close()
    # for metric, score in self.eval.items():
    #     print '%s: %.3f' % (metric, score)
    return self.eval
def evaluate(self):
    res = {}
    for r in self.rests:
        res[str(r['image_id'])] = [{'caption': r['caption']}]
    gts = {}
    for imgId in self.annos:
        gts[str(imgId)] = [{'caption': c} for c in self.annos[imgId]]

    # =================================================
    # Tokenize
    # =================================================
    # print('tokenization...')
    tokenizer = self.Tokenizer()
    gts = tokenizer.tokenize(gts)
    res = tokenizer.tokenize(res)

    # =================================================
    # Set up scorers
    # =================================================
    # print('setting up scorers...')
    use_scorers = self.use_scorers
    scorers = []
    if 'Bleu' in use_scorers:
        scorers.append((Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]))
    if 'METEOR' in use_scorers:
        scorers.append((Meteor(), "METEOR"))
    if 'ROUGE_L' in use_scorers:
        scorers.append((Rouge(), "ROUGE_L"))
    if 'CIDEr' in use_scorers:
        scorers.append((Cider(), "CIDEr"))
    if 'SPICE' in use_scorers:
        scorers.append((Spice(), "SPICE"))

    # =================================================
    # Compute scores
    # =================================================
    for scorer, method in scorers:
        # print('computing %s score...' % (scorer.method()))
        score, scores = scorer.compute_score(gts, res)
        if type(method) == list:
            for sc, scs, m in zip(score, scores, method):
                self.setEval(sc, m)
                self.setImgToEvalImgs(scs, gts.keys(), m)
                # print("%s: %0.1f" % (m, sc * 100))
        else:
            self.setEval(score, method)
            self.setImgToEvalImgs(scores, gts.keys(), method)
            # print("%s: %0.1f" % (method, score * 100))
    self.setEvalImgs()
def evaluate():
    with open(os.path.join(FLAGS.data_dir, 'feature.test'), 'rb') as f:
        feature = cPickle.load(f)
    with open(os.path.join(FLAGS.data_dir, 'caption.test'), 'rb') as f:
        sentence = cPickle.load(f)
    scorers = [(Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]),
               (Meteor(), "METEOR"),
               (Rouge(), "ROUGE_L"),
               (Cider(), "CIDEr")]
    vocab, re_vocab = data_utils.initialize_vocabulary()
    GTS = {}
    RES = {}
    batch_size = 1
    max_meteor = 0
    with tf.Session() as sess:
        model = Seq2Seq(FLAGS.num_units, FLAGS.use_lstm, FLAGS.epsilon, FLAGS.max_computation,
                        FLAGS.encoder_max_sequence_length, FLAGS.decoder_max_sentence_length,
                        FLAGS.feature_size, FLAGS.vocab_size, FLAGS.learning_rate,
                        FLAGS.learning_rate_decay_factor, FLAGS.time_penalty,
                        FLAGS.max_gradient_norm, forward_only=True)
        step = 0
        while True:
            step += FLAGS.steps_per_checkpoint
            ckpt_path = os.path.join(FLAGS.checkpoint_dir, 'ckpt-%d' % step)
            if os.path.isfile(ckpt_path + '.meta'):
                model.saver.restore(sess, ckpt_path)
                for vid, _ in feature.iteritems():
                    feature_inputs, batch_decoder_inputs, batch_weights = model.get_batch(feature, [(vid, [0])])
                    output_logits, remainders, iterations = model.step(sess, feature_inputs,
                                                                       batch_decoder_inputs,
                                                                       batch_weights,
                                                                       forward_only=True)
                    outputs = [int(np.argmax(logit, axis=1)) for logit in output_logits]
                    if data_utils.EOS_ID in outputs:
                        outputs = outputs[:outputs.index(data_utils.EOS_ID)]
                    R = ['%.3f' % remainders[j][0] for j in xrange(FLAGS.encoder_max_sequence_length)]
                    I = ['%d' % iterations[j][0] for j in xrange(FLAGS.encoder_max_sequence_length)]
                    print(' '.join(R))
                    print(' '.join(I))
                    sen = " ".join([tf.compat.as_str(re_vocab[output]) for output in outputs])
                    print("%s - %s: %s" % (vid, sen, sentence[vid][9]))
                    GTS[vid] = sentence[vid]
                    RES[vid] = [sen]
                print('STEP: %d' % step)
                for scorer, method in scorers:
                    score, scores = scorer.compute_score(GTS, RES)
                    if method == "METEOR" and score > max_meteor:
                        max_meteor = score
                    if isinstance(method, list):
                        for k, v in zip(method, score):
                            print("%s:\t%f" % (k, v))
                    else:
                        print("%s:\t%f" % (method, score))
                sys.stdout.flush()
            else:
                break
    print("Max METEOR:\t%f" % max_meteor)
def score(self, GT, RES, IDs):
    self.eval = {}
    self.imgToEval = {}
    gts = {}
    res = {}
    for ID in IDs:
        gts[ID] = GT[ID]
        res[ID] = RES[ID]
    print('tokenization...')
    tokenizer = PTBTokenizer()
    '''
    print("gts: ")
    for key in gts:
        print(key)
        for value in gts[key]:
            print(value)
    '''
    gts = tokenizer.tokenize(gts)
    res = tokenizer.tokenize(res)

    # =================================================
    # Set up scorers
    # =================================================
    print('setting up scorers...')
    scorers = [(Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]),
               (Meteor(), "METEOR"),
               (Rouge(), "ROUGE_L"),
               (Cider(), "CIDEr")]

    # =================================================
    # Compute scores
    # =================================================
    eval = {}
    for scorer, method in scorers:
        print('computing %s score...' % (scorer.method()))
        score, scores = scorer.compute_score(gts, res)
        if type(method) == list:
            for sc, scs, m in zip(score, scores, method):
                self.setEval(sc, m)
                self.setImgToEvalImgs(scs, IDs, m)
                print("%s: %0.3f" % (m, sc))
        else:
            self.setEval(score, method)
            self.setImgToEvalImgs(scores, IDs, method)
            print("%s: %0.3f" % (method, score))
    for metric, score in self.eval.items():
        print('%s: %.3f' % (metric, score))
    return self.eval
def evaluate(self):
    imgIds = self.params['image_id']
    # imgIds = self.coco.getImgIds()
    gts = {}
    res = {}
    for imgId in imgIds:
        gts[imgId] = self.coco.imgToAnns[imgId]
        res[imgId] = self.cocoRes.imgToAnns[imgId]

    # =================================================
    # Tokenize
    # =================================================
    print('you')
    print('tokenization...')
    tokenizer = PTBTokenizer()
    gts = tokenizer.tokenize(gts)
    res = tokenizer.tokenize(res)

    # =================================================
    # Set up scorers
    # =================================================
    print('setting up scorers...')
    scorers = [
        (Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]),
        (Meteor(), "METEOR"),
        (Rouge(), "ROUGE_L"),
        (Cider(), "CIDEr")
    ]

    # =================================================
    # Compute scores
    # =================================================
    for scorer, method in scorers:
        print('computing %s score...' % (scorer.method()))
        score, scores = scorer.compute_score(gts, res)
        if type(method) == list:
            for sc, scs, m in zip(score, scores, method):
                self.setEval(sc, m)
                self.setImgToEvalImgs(scs, gts.keys(), m)
                print("%s: %0.3f" % (m, sc))
        else:
            self.setEval(score, method)
            self.setImgToEvalImgs(scores, gts.keys(), method)
            print("%s: %0.3f" % (method, score))
    self.setEvalImgs()
def score(ref, hypo): """ ref, dictionary of reference sentences (id, sentence) hypo, dictionary of hypothesis sentences (id, sentence) score, dictionary of scores """ scorers = [(Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"])] final_scores = {} for scorer, method in scorers: score, scores = scorer.compute_score(ref, hypo) if type(score) == list: for m, s in zip(method, score): final_scores[m] = s else: final_scores[method] = score return final_scores
def main(eval_caption_file, output, zh=False, embedding_path=None):
    df = pd.read_json(eval_caption_file)
    if zh:
        refs = df.groupby("key")["tokens"].apply(list).to_dict()
    else:
        refs = df.groupby("key")["caption"].apply(list).to_dict()

    from pycocoevalcap.bleu.bleu import Bleu
    from pycocoevalcap.cider.cider import Cider
    from pycocoevalcap.rouge.rouge import Rouge

    scorer = Bleu(zh=zh)
    bleu_scores = coco_score(copy.deepcopy(refs), scorer)
    print(bleu_scores)

    scorer = Cider(zh=zh)
    cider_score = coco_score(copy.deepcopy(refs), scorer)
    print(cider_score)

    scorer = Rouge(zh=zh)
    rouge_score = coco_score(copy.deepcopy(refs), scorer)
    print(rouge_score)

    if not zh:
        from pycocoevalcap.meteor.meteor import Meteor
        scorer = Meteor()
        meteor_score = coco_score(copy.deepcopy(refs), scorer)
        from pycocoevalcap.spice.spice import Spice
        scorer = Spice()
        spice_score = coco_score(copy.deepcopy(refs), scorer)

    diverse_score = diversity_score(refs, zh)

    with open(embedding_path, "rb") as f:
        ref_embeddings = pickle.load(f)
    bert_score = embedding_score(ref_embeddings, zh)

    with open(output, "w") as f:
        for n in range(4):
            f.write("BLEU-{}: {:6.3f}\n".format(n + 1, bleu_scores[n]))
        f.write("CIDEr: {:6.3f}\n".format(cider_score))
        f.write("ROUGE: {:6.3f}\n".format(rouge_score))
        if not zh:
            f.write("Meteor: {:6.3f}\n".format(meteor_score))
            f.write("SPICE: {:6.3f}\n".format(spice_score))
        f.write("SentenceBert: {:6.3f}\n".format(bert_score))
        f.write("Diversity: {:6.3f}\n".format(diverse_score))