def test(model, dataloader, args):
    """Run beam-search decoding over *dataloader* and print BLEU/METEOR/ROUGE-L.

    Writes references to ``tmp_gold.txt`` and generated hypotheses to
    ``tmp_pred.txt`` via the module-level ``write_txt`` helper.

    Fix: the two output files were opened with bare ``open`` and only closed
    at the end, so any exception during decoding leaked the handles; they are
    now managed with ``with``.
    """
    scorer = Bleu(4)
    m_scorer = Meteor()
    r_scorer = Rouge()
    hyp = []
    ref = []
    model.eval()
    with open('tmp_gold.txt', 'w') as gold_file, \
            open('tmp_pred.txt', 'w') as pred_file:
        with tqdm(dataloader, desc='Test ', mininterval=1) as tq:
            for batch in tq:
                with torch.no_grad():
                    seq = model(batch, beam_size=args.beam_size)
                r = write_txt(batch, batch['tgt_text'], gold_file, args)
                h = write_txt(batch, seq, pred_file, args)
                hyp.extend(h)
                ref.extend(r)
        # The pycocoevalcap scorers expect {id: [sentence, ...]} dicts.
        hyp = dict(zip(range(len(hyp)), hyp))
        ref = dict(zip(range(len(ref)), ref))
        print(hyp[0], ref[0])
        print('BLEU INP', len(hyp), len(ref))
        print('BLEU', scorer.compute_score(ref, hyp)[0])
        print('METEOR', m_scorer.compute_score(ref, hyp)[0])
        print('ROUGE_L', r_scorer.compute_score(ref, hyp)[0])
def language_eval_excoco(predictions, predictions_bleu, sents_label_eval, loader):
    """Score generated captions and return a dict of language metrics.

    ``predictions`` feeds the CIDEr-D scorer, ``predictions_bleu`` feeds
    BLEU/METEOR/ROUGE-L, ``sents_label_eval`` holds the reference captions.
    (``loader`` is accepted for interface compatibility but unused.)
    """
    c_score, _ = CiderD().compute_score(sents_label_eval, predictions)
    b_score, _ = Bleu(4).compute_score(sents_label_eval, predictions_bleu)
    m_score, _ = Meteor().compute_score(sents_label_eval, predictions_bleu)
    r_score, _ = Rouge().compute_score(sents_label_eval, predictions_bleu)

    print('Evaluating {} samples'.format(len(predictions)))
    for order, b in enumerate(b_score[:4], 1):
        print('Bleu_' + str(order) + ' : ' + str(b))
    print('METEOR : ' + str(m_score))
    print('ROUGE_L : ' + str(r_score))
    print('CIDEr : ' + str(c_score))

    return {
        'BLEU_1': b_score[0],
        'BLEU_2': b_score[1],
        'BLEU_3': b_score[2],
        'BLEU_4': b_score[3],
        'METEOR': m_score,
        'ROUGE_L': r_score,
        'CIDEr': c_score,
    }
def rouge_scorer(reference, hypothesis):
    """Compute ROUGE-L for *hypothesis* against *reference*.

    Returns the ``(average_score, per_sentence_scores)`` pair exactly as
    produced by ``Rouge.compute_score``.
    """
    return Rouge().compute_score(reference, hypothesis)
def get_corpus_bleu(model, data_loader, vocabs, device, beam_size):
    """Generate one caption per sample via beam search and score the corpus.

    Returns a dict with ``bleu_score`` (list of BLEU-1..4), ``rouge_score``
    and ``cider_score``.
    """
    import torch
    from pycocoevalcap.bleu.bleu import Bleu
    from pycocoevalcap.cider.cider import Cider
    from pycocoevalcap.rouge.rouge import Rouge

    # Scorers.
    bleu_scorer = Bleu(4)
    rouge_scorer = Rouge()
    cider_scorer = Cider()

    refs = {}
    hyps = {}
    # Special-token ids that must not appear in detokenized sentences.
    skip_ids = [vocabs['word_vocab'](w) for w in ['<SOS>', '<EOS>', '<UNK>']]

    def detokenize(token_ids):
        # Map ids back to words, dropping the special tokens above.
        return " ".join([vocabs['word_vocab'](t.item()) for t in token_ids
                         if t.item() not in skip_ids])

    # Generation loop: one reference/hypothesis pair per batch index.
    for idx, sample in enumerate(data_loader):
        with torch.no_grad():
            caps = sample['captions']
            # Drop the leading token to form the target sequence.
            targets = caps.narrow(1, 1, caps.size(1) - 1)
            images = sample['images'].to(device)
            topics = sample['topics'].to(device)
            predictions = model.sample_v2(images, topics, beam_size=beam_size)
            refs[idx] = [detokenize(targets[0])]
            hyps[idx] = [detokenize(predictions[0][1])]

    # Corpus-level scores.
    bleu_score, _ = bleu_scorer.compute_score(refs, hyps)
    rouge_score, _ = rouge_scorer.compute_score(refs, hyps)
    cider_score, _ = cider_scorer.compute_score(refs, hyps)
    scores = {
        'bleu_score': bleu_score,
        'rouge_score': rouge_score,
        'cider_score': cider_score
    }
    print(scores)
    return scores
def _define_metrics(gts, res):
    """Score *res* against *gts* with BLEU-1..4, METEOR, ROUGE-L and CIDEr.

    Returns ``(bleu, meteor, rouge, cider)`` where ``bleu`` is a list of the
    four BLEU scores; every value is rounded to 4 decimal places.
    """
    bleu, _ = Bleu(n=4).compute_score(gts=gts, res=res)
    rouge, _ = Rouge().compute_score(gts=gts, res=res)
    cider, _ = Cider().compute_score(gts=gts, res=res)
    meteor, _ = Meteor().compute_score(gts=gts, res=res)
    bleu = [round(b, 4) for b in bleu]
    return bleu, round(meteor, 4), round(rouge, 4), round(cider, 4)
def eval(result_gts_path, result_res_path):
    """Load reference and hypothesis JSON files and score them.

    Returns ``(bleu, meteor, rouge, cider)`` as produced by the
    pycocoevalcap scorers.

    NOTE(review): the name shadows the builtin ``eval``; kept unchanged for
    caller compatibility.
    """
    with open(result_gts_path, 'r') as fp:
        gts = json.load(fp)
    with open(result_res_path, 'r') as fp:
        res = json.load(fp)

    bleu, _ = Bleu(n=4).compute_score(gts=gts, res=res)
    meteor, _ = Meteor().compute_score(gts=gts, res=res)
    rouge, _ = Rouge().compute_score(gts=gts, res=res)
    cider, _ = Cider().compute_score(gts=gts, res=res)
    return bleu, meteor, rouge, cider
class RougeBleuScore(Metric):
    """Metric that reports BLEU-1..n followed by ROUGE-L for COCO captions."""

    def __init__(self, coco, vocab, n=4):
        self.coco = coco
        self.vocab = vocab
        self.bleu = Bleu(n)
        self.n = n
        self.rouge = Rouge()

    def evaluate(self, y_pred, y, image_ids):
        """Return ``[bleu_1, ..., bleu_n, rouge_l]`` for this batch."""
        if type(y_pred) == list:
            pred_words = caption_list_to_words(y_pred, self.vocab)
        else:
            pred_words = tensor_to_words(y_pred, y, self.vocab)
        captions_pred, captions_gt = extract_captions(
            image_ids, pred_words, self.coco)
        # The BLEU scorer prints progress; silence stdout while it runs.
        blockPrint()
        result = self.bleu.compute_score(captions_gt, captions_pred)[0]
        enablePrint()
        result.append(self.rouge.compute_score(captions_gt, captions_pred)[0])
        return result
def calculate_metric(rnn, meteor=None):
    """Greedy-decode (beam size 1) a caption per validation image and return
    the mean per-caption probability ``mean(exp(log_prob))``.

    NOTE(review): the original body contained a large BLEU/ROUGE/METEOR
    scoring block *after* the ``return lp_avg`` statement, making it
    unreachable (which is also why the ``meteor`` parameter was never used).
    The dead code has been removed; ``meteor`` stays in the signature for
    caller compatibility.
    """
    gts = {}
    res = {}
    lp_avg = 0.0
    lp_c = 0
    for idx in range(rnn.V_valid.shape[0]):
        iid = rnn.Id_valid[idx]
        if iid not in gts:
            gts[iid] = []
        # All reference captions for this image id (tokens are stored reversed).
        gts[iid] = [
            ' '.join(rnn.dp.tokens[i][::-1])
            for i in rnn.dp.img_id_to_tokens[iid]
        ]
        if iid in res:
            continue
        res[iid] = []
        (lp, pos_sen) = decoder_beamsearch(rnn, rnn.V_valid[idx],
                                           senti=1.0, beam_size=1)
        pos_sen = pos_sen[:-1]  # drop the terminator token
        print(' '.join(pos_sen[::-1]))
        res[iid].append(' '.join(pos_sen[::-1]))
        lp_avg += np.exp(lp)
        lp_c += 1
    # Guard against an empty validation set (previously a ZeroDivisionError).
    if lp_c:
        lp_avg /= float(lp_c)
    return lp_avg
def rouge():
    """Compute ROUGE-L over the module-level ``gts``/``res`` and print it.

    NOTE(review): relies on ``gts`` and ``res`` existing at module scope.
    """
    average, _ = Rouge().compute_score(gts, res)
    print('rouge = %s' % average)
def rouge(gts, res):
    """Compute ROUGE-L for *gts*/*res* and append the result to the
    module-level ``out_file`` handle."""
    average, _ = Rouge().compute_score(gts, res)
    out_file.write('ROUGE = %s' % average + '\n')
def coco_caption_metrics(predictions_list, image_id_list, vocabulary_path='data/vocabulary.json', max_caption_length=25, batch_size=32, is_training=True):
    """Decode predicted token ids into sentences and score them against the
    ground-truth captions in ``data/captions_gt.json``.

    ``predictions_list[i][k][j]`` is the token id at timestep ``k`` for
    sample ``j`` of batch ``i`` (time-major layout — TODO confirm).
    When ``is_training`` is false the gts/res dicts are also dumped to
    ``data/result/``.

    Returns ``(bleu, meteor, rouge, cider)`` with ``bleu`` a list of four
    scores, all rounded to 4 decimals.

    Cleanups: replaced ``.__len__()`` calls and manual index loops with
    idiomatic ``enumerate``, and dropped the unused ``sen_ground_truth`` /
    ``word2id`` locals.
    """
    with open(vocabulary_path, 'r') as file:
        vocabulary_list = json.load(file)
    # id -> word lookup (the vocabulary file stores words in id order).
    id2word = dict(enumerate(vocabulary_list))
    with open('data/captions_gt.json', 'r') as file:
        captions_gt_dict = json.load(file)

    gts = {}
    res = {}
    for i, batch_preds in enumerate(predictions_list):
        for j in range(batch_size):
            words = [id2word[int(batch_preds[k][j])]
                     for k in range(max_caption_length)]
            # Truncate at the first end-of-sentence marker.
            sen_pre = []
            for word in words:
                if word == '</S>':
                    break
                sen_pre.append(word)
            image_id = image_id_list[i][j][0]
            res[image_id] = [' '.join(sen_pre)]
            gts[image_id] = captions_gt_dict[str(image_id)]

    if not is_training:
        with open('data/result/result_res.json', 'w') as file:
            json.dump(res, file)
        with open('data/result/result_gts.json', 'w') as file:
            json.dump(gts, file)

    bleu, _ = Bleu(n=4).compute_score(gts=gts, res=res)
    rouge, _ = Rouge().compute_score(gts=gts, res=res)
    cider, _ = Cider().compute_score(gts=gts, res=res)
    meteor, _ = Meteor().compute_score(gts=gts, res=res)
    bleu = [round(b, 4) for b in bleu]
    return bleu, round(meteor, 4), round(rouge, 4), round(cider, 4)
def evaluate(beam_size):
    """
    Beam-search evaluation over the TEST split.

    :param beam_size: beam size at which to generate captions for evaluation
    :return: (BLEU, METEOR, CIDEr, ROUGE, SPICE) corpus scores

    NOTE(review): depends on module-level globals (``encoder``, ``decoder``,
    ``word_map``, ``rev_word_map``, ``vocab_size``, ``device``, ``data_folder``,
    ``data_name``, ``attrs_dim``, ``normalize``) — confirm they are defined
    before calling.
    """
    # DataLoader
    loader = torch.utils.data.DataLoader(CaptionDataset(
        data_folder, data_name, 'TEST',
        transform=transforms.Compose([normalize])),
        batch_size=1, shuffle=True, num_workers=0, pin_memory=False)
    # TODO: Batched Beam Search
    # Therefore, do not use a batch_size greater than 1 - IMPORTANT!

    # Lists to store references (true captions), and hypothesis (prediction) for each image
    # If for n images, we have n hypotheses, and references a, b, c... for each image, we need -
    # references = [[ref1a, ref1b, ref1c], [ref2a, ref2b], ...], hypotheses = [hyp1, hyp2, ...]
    references = dict()
    hypotheses = dict()

    # For each image
    for j, (image, caps, caplens, allcaps) in enumerate(
            tqdm(loader, desc="EVALUATING AT BEAM SIZE " + str(beam_size))):
        k = beam_size  # live beam width; shrinks as sequences finish

        # Move to GPU device, if available
        image = image.to(device)  # (1, 3, 256, 256)
        attrs, encoder_out = encoder(image)
        attrs = attrs.expand(3, attrs_dim)
        enc_image_size = encoder_out.size(1)
        encoder_dim = encoder_out.size(3)
        # Flatten the spatial grid, then replicate the image encoding k times
        # so each beam hypothesis attends over its own copy.
        encoder_out = encoder_out.view(1, -1, encoder_dim)
        num_pixels = encoder_out.size(1)
        encoder_out = encoder_out.expand(k, num_pixels, encoder_dim)
        x0 = decoder.init_x0(attrs)

        # Tensor to store top k previous words at each step; now they're just <start>
        k_prev_words = torch.LongTensor([[word_map['<start>']]] * k).to(
            device)  # (k, 1)
        # Tensor to store top k sequences; now they're just <start>
        seqs = k_prev_words  # (k, 1)
        # Tensor to store top k sequences' scores; now they're just 0
        top_k_scores = torch.zeros(k, 1).to(device)  # (k, 1)
        # Lists to store completed sequences and scores
        complete_seqs = list()
        complete_seqs_scores = list()

        # Start decoding
        step = 1
        h1, c1, h2, c2 = decoder.init_hidden_state(attrs, encoder_out,
                                                   zero=True)
        h1, c1 = decoder.decode_step1(x0, (h1, c1))
        # s is a number less than or equal to k, because sequences are removed
        # from this process once they hit <end>
        while True:
            embeddings = decoder.embedding(k_prev_words).squeeze(
                1)  # (s, embed_dim)
            h1, c1 = decoder.decode_step1(embeddings, (h1, c1))
            awe, _ = decoder.attention(encoder_out, h1, h2)
            # gate = decoder.sigmoid(decoder.f_beta(h2))
            # awe = gate * awe
            h2, c2 = decoder.decode_step2(torch.cat([embeddings, awe], dim=1),
                                          (h2, c2))
            scores = decoder.fc2(decoder.dropout2(h2))
            scores = F.log_softmax(scores, dim=1)
            # Add cumulative beam scores to the per-word log-probs.
            scores = top_k_scores.expand_as(scores) + scores  # (s, vocab_size)
            # For the first step, all k points will have the same scores (since same k previous words, h, c)
            if step == 1:
                top_k_scores, top_k_words = scores[0].topk(k, 0, True,
                                                           True)  # (s)
            else:
                # Unroll and find top scores, and their unrolled indices
                # (the k largest among all s * vocab_size candidate scores)
                top_k_scores, top_k_words = scores.view(-1).topk(
                    k, 0, True, True)
            # Convert unrolled indices to actual indices of scores:
            # prev_word_inds selects which beam each winner extends.
            # NOTE(review): on newer PyTorch `/` on integer tensors returns
            # floats — this likely needs `//` (floor division); confirm the
            # torch version this runs on.
            prev_word_inds = top_k_words / vocab_size  # (s)
            next_word_inds = top_k_words % vocab_size  # (s)

            # Add new words to sequences
            seqs = torch.cat(
                [seqs[prev_word_inds], next_word_inds.unsqueeze(1)],
                dim=1)  # (s, step+1)

            # Which sequences are incomplete (didn't reach <end>)?
            incomplete_inds = [
                ind for ind, next_word in enumerate(next_word_inds)
                if next_word != word_map['<end>']
            ]
            complete_inds = list(
                set(range(len(next_word_inds))) - set(incomplete_inds))

            # Set aside complete sequences
            if len(complete_inds) > 0:
                complete_seqs.extend(seqs[complete_inds].tolist())
                complete_seqs_scores.extend(top_k_scores[complete_inds])
            k -= len(complete_inds)  # reduce beam length accordingly

            # Proceed with incomplete sequences
            if k == 0:
                break
            seqs = seqs[incomplete_inds]
            h1 = h1[prev_word_inds[incomplete_inds]]
            c1 = c1[prev_word_inds[incomplete_inds]]
            h2 = h2[prev_word_inds[incomplete_inds]]
            c2 = c2[prev_word_inds[incomplete_inds]]
            encoder_out = encoder_out[prev_word_inds[incomplete_inds]]
            top_k_scores = top_k_scores[incomplete_inds].unsqueeze(1)
            k_prev_words = next_word_inds[incomplete_inds].unsqueeze(1)

            # Break if things have been going on too long
            if step > 50:
                break
            step += 1

        # Pick the completed sequence with the best cumulative score.
        i = complete_seqs_scores.index(max(complete_seqs_scores))
        seq = complete_seqs[i]

        # References
        img_caps = allcaps[0].tolist()
        img_captions = list(
            map(
                lambda c: [
                    rev_word_map[w] for w in c if w not in {
                        word_map['<start>'], word_map['<end>'], word_map[
                            '<pad>']
                    }
                ],
                img_caps))  # remove <start> and pads
        img_caps = [' '.join(c) for c in img_captions]
        # print(img_caps)
        references[str(j)] = img_caps

        # Hypotheses
        hypothesis = ([
            rev_word_map[w] for w in seq if w not in
            {word_map['<start>'], word_map['<end>'], word_map['<pad>']}
        ])
        hypothesis = [' '.join(hypothesis)]
        # print(hypothesis)
        hypotheses[str(j)] = hypothesis
        assert len(references) == len(hypotheses)

    # Calculate BLEU-1~BLEU4 scores
    m1 = Bleu()
    m2 = Meteor()
    m3 = Cider()
    m4 = Rouge()
    m5 = Spice()
    (score1, scores1) = m1.compute_score(references, hypotheses)
    (score2, scores2) = m2.compute_score(references, hypotheses)
    (score3, scores3) = m3.compute_score(references, hypotheses)
    (score4, scores4) = m4.compute_score(references, hypotheses)
    (score5, scores5) = m5.compute_score(references, hypotheses)
    return score1, score2, score3, score4, score5
# Score a system-output file against two reference files with ROUGE and BLEU.
# NOTE(review): ``system``, ``sys_strs``, ``ref1_strs``, ``ref2_strs``,
# ``rouge_obj`` and ``bleu_obj`` are defined earlier in the file (outside
# this chunk) — confirm before moving this code.
with open(system, 'r') as f:
    for line in f:
        sys_strs.append(line.strip())

# All three files must contribute the same number of lines.
assert len(ref1_strs) == len(ref2_strs)
assert len(ref2_strs) == len(sys_strs)

word_target_dict = {}
word_response_dict = {}
rouges = []
# Per-sentence ROUGE: score each system line against its two references,
# then report the mean and dump the per-line scores alongside the input.
for i in range(len(ref1_strs)):
    wtd = {i: [ref1_strs[i], ref2_strs[i]]}
    wrd = {i: [sys_strs[i]]}
    rouge, _ = rouge_obj.compute_score(wtd, wrd)
    rouges.append(rouge)
print(np.mean(rouges))
with open("%s-rouges.txt" % system, 'w') as outf:
    for r in rouges:
        outf.write(str(r) + '\n')

# Corpus-level BLEU over the full target/response dictionaries.
for i in range(len(ref1_strs)):
    word_target_dict[i] = [ref1_strs[i], ref2_strs[i]]
    word_response_dict[i] = [sys_strs[i]]
bleu_score, bleu_scores = bleu_obj.compute_score(word_target_dict,
                                                 word_response_dict)
def coco_caption_metrics_hier(predicts_list, sentences_list, image_id_list, config, batch_size=26, is_training=True):
    """Decode hierarchical (multi-sentence) predictions and score them.

    ``predicts_list[i][k][j]`` is the predicted token id at flat timestep
    ``k`` (``max_sentence_num * max_sentence_length`` steps) for sample ``j``
    of batch ``i``; ``sentences_list[i][j][k]`` holds the ground-truth ids.
    When ``is_training`` is false, gts/res are dumped to the paths in
    ``config``. Returns ``(bleu, rouge, cider)``, rounded to 4 decimals.

    Cleanups: replaced ``.__eq__(...)`` / ``.__len__()`` calls with ordinary
    operators and manual index loops with ``enumerate``.
    """
    with open(config.vocabulary_path, 'r') as file:
        vocabulary_list = json.load(file)
    # id -> word lookup (vocabulary file stores words in id order).
    id2word = dict(enumerate(vocabulary_list))

    gts = {}
    res = {}
    num_tokens = config.max_sentence_num * config.max_sentence_length
    for i, batch_preds in enumerate(predicts_list):
        for j in range(batch_size):
            sent_pre, sent_gt = [], []
            for k in range(num_tokens):
                sent_pre.append(id2word[int(batch_preds[k][j])])
                gt_word = id2word[sentences_list[i][j][k]]
                # Ground truth keeps every word except the end markers.
                if gt_word != '</S>' and gt_word != '<EOS>':
                    sent_gt.append(gt_word)
            # Each predicted sentence is truncated at its own '</S>'.
            sent_pre2 = []
            for n in range(config.max_sentence_num):
                for m in range(config.max_sentence_length):
                    word = sent_pre[n * config.max_sentence_length + m]
                    if word == '</S>':
                        break
                    sent_pre2.append(word)
            image_id = image_id_list[i][j][0]
            gts[str(image_id)] = [' '.join(sent_gt)]
            res[str(image_id)] = [' '.join(sent_pre2)]

    if not is_training:
        with open(config.result_gts_path, 'w') as file:
            json.dump(gts, file)
        with open(config.result_res_path, 'w') as file:
            json.dump(res, file)

    bleu, _ = Bleu(n=4).compute_score(gts=gts, res=res)
    rouge, _ = Rouge().compute_score(gts=gts, res=res)
    cider, _ = Cider().compute_score(gts=gts, res=res)
    # METEOR intentionally disabled in the original code.
    bleu = [round(b, 4) for b in bleu]
    return bleu, round(rouge, 4), round(cider, 4)
def run_load_gap_filler(pretrained_filename,
                        do_bleu=False,
                        must_have_anp=False,
                        copy_if_no_anp=False,
                        replace_adj=False,
                        get_human=False,
                        semi_human=False):
    """Decode positive ("pos") and descriptive ("des") captions for the TEST
    split of a pretrained RNN model, dump them (with attention weights) to
    ``output_data/sen_att_pos_01.pik`` and ``eval/``, and print perplexity
    plus BLEU/ROUGE/CIDEr/METEOR scores.

    Fixes: ``zip(...)[::-1]`` is a TypeError on Python 3 (zip objects are not
    subscriptable) — now materialized with ``list`` first; pickle file
    handles are opened via ``with`` instead of being leaked.

    NOTE(review): tokens are stored reversed throughout (hence the pervasive
    ``[::-1]``); ``do_bleu`` is accepted but unused, kept for compatibility.
    """
    rnn = RNNModel()
    rnn.load_model(pretrained_filename)
    rnn.conf['VAL_SPLIT'] = RNNDataProvider.TEST
    if get_human:
        with open("coco_mturk/id_to_caps.pik", "rb") as pik_f:
            id_to_caps = pickle.load(pik_f)
    rnn.build_model_core()
    rnn.load_val_dataset()
    rnn.build_sentence_generator()
    rnn.build_perplexity_calculator()

    pos_sentence_res = []
    pos_att_res = []
    des_sentence_res = []
    des_att_res = []
    img_files = []
    img_ids = []
    id_to_sentences = {}
    seen_ids = set()
    # Words added on top of the base vocabulary (used to decide what to echo).
    if 'added_words' in rnn.conf:
        new_words = set([w[0] for w in rnn.conf['added_words']])
    else:
        new_words = set()
    num_ignore = 0
    num_not_ignore = 0

    for idx in range(rnn.V_valid.shape[0]):
        img_file = rnn.dp.img_id_to_filename[rnn.Id_valid[idx]]
        img_id = rnn.Id_valid[idx]
        if img_id not in id_to_sentences:
            id_to_sentences[img_id] = []
        # Collect the reference sentences for this image.
        if replace_adj:
            id_to_sentences[img_id] = [
                ' '.join(do_replace_adj(rnn.dp.tokens[i])[::-1])
                for i in rnn.dp.img_id_to_tokens[img_id]
            ]
        elif get_human:
            id_to_sentences[img_id] = [
                ' '.join(rnn.dp.tokens[i][::-1])
                for i in rnn.dp.img_id_to_tokens[img_id]
            ]
            # Hold one human caption out as the "generated" sentence.
            np.random.shuffle(id_to_sentences[img_id])
            print(len(id_to_sentences[img_id]))
            human_sen_pos = id_to_sentences[img_id].pop()
            print(len(id_to_sentences[img_id]))
            if not id_to_sentences[img_id]:
                continue
        else:
            id_to_sentences[img_id] = [
                ' '.join(rnn.dp.tokens[i][::-1])
                for i in rnn.dp.img_id_to_tokens[img_id]
            ]
        if img_id in seen_ids:
            continue
        seen_ids.add(img_id)

        if get_human and not semi_human:
            pos_sen = human_sen_pos.split()[::-1]
            np.random.shuffle(id_to_caps[img_id])
            des_sen = id_to_caps[img_id][0][::-1]
        else:
            # Beam-search decode with positive / descriptive sentiment.
            lp, pos_sen, pos_att = decoder_beamsearch_with_attention(
                rnn, rnn.V_valid[idx], senti=1.0, beam_size=5)
            lp, des_sen, des_att = decoder_beamsearch_with_attention(
                rnn, rnn.V_valid[idx], senti=-1.0, beam_size=5)
            pos_sen = pos_sen[:-1]  # drop terminator token
            des_sen = des_sen[:-1]
            pos_att = pos_att[:-1]
            pos_att = np.array(pos_att)
            pos_att = pos_att.flatten()
            # Descriptive attention is not used downstream; zero-fill it.
            des_att = np.zeros((len(des_sen), ))

        if must_have_anp:
            # Skip captions without an adjective-noun pair.
            if not sentence_has_anp(pos_sen[::-1]):
                num_ignore += 1
                continue
            num_not_ignore += 1
        if copy_if_no_anp:
            if not sentence_has_anp(pos_sen[::-1]):
                pos_sen = des_sen
        if replace_adj:
            pos_sen = do_replace_adj(pos_sen[::-1])[::-1]
            des_sen = do_replace_adj(des_sen[::-1])[::-1]

        # Wrap strongly-attended words in a colored <font> tag for the
        # HTML visualization.
        new_pos_sen = []
        for vv, a in zip(pos_sen, pos_att):
            out = vv
            col = ""
            if a > 0.75:
                col = "#FF3300"
            elif a > 0.5:
                col = "#FF5C33"
            elif a > 0.25:
                col = "#FF8566"
            if col:
                out = "<font style='background-color: %s'>%s</font>" % (col,
                                                                        vv)
            new_pos_sen.append(out)
        pos_sen = new_pos_sen
        print(pos_sen)
        print(pos_att)
        print(des_sen)

        # Echo any caption containing a newly-added vocabulary word.
        print_it = False
        for v in pos_sen:
            if v in new_words:
                print_it = True
        if print_it:
            # zip() must be materialized before slicing on Python 3.
            for x in list(zip(pos_sen, pos_att))[::-1]:
                print(x[0], end=' ')
            print("")

        pos_att = pos_att[:len(pos_sen)]
        des_att = des_att[:len(des_sen)]
        pos_sentence_res.append(pos_sen[::-1])
        pos_att_res.append(np.exp(pos_att[::-1]))
        des_sentence_res.append(des_sen[::-1])
        des_att_res.append(np.exp(des_att[::-1]))
        img_files.append(img_file)
        img_ids.append(img_id)

    output = {
        'pos_sen': pos_sentence_res,
        'pos_att': pos_att_res,
        'des_sen': des_sentence_res,
        'des_att': des_att_res,
        'img_files': img_files,
        'img_ids': img_ids
    }
    with open("output_data/sen_att_pos_01.pik", "wb") as pik_f:
        pickle.dump(output, pik_f, protocol=2)

    if must_have_anp:
        print("Must have ANP % removed:",
              num_ignore / float(num_not_ignore) * 100.0)

    print("getting Positive perplexity")
    print(rnn.get_val_perplexity())
    print("got perplexity")
    print("getting Descriptive perplexity")
    print(rnn.get_val_perplexity(base=True))
    print("got perplexity")

    # Build gts/res dicts and mirror them to text files for external eval.
    gts = {}
    res = {}
    with open("eval/output_pos", "w") as fout:
        for line, iid in zip(pos_sentence_res, img_ids):
            fout.write(' '.join(line) + '\n')
            if iid not in res:
                res[iid] = []
            res[iid].append(' '.join(line))
    res_des = {}
    with open("eval/output_des", "w") as fout:
        for line, iid in zip(des_sentence_res, img_ids):
            fout.write(' '.join(line) + '\n')
            if iid not in res_des:
                res_des[iid] = []
            res_des[iid].append(' '.join(line))
    for i in range(3):
        with open("eval/reference%d" % i, "w") as fout:
            for cid in img_ids:
                if cid not in gts:
                    gts[cid] = []
                if len(id_to_sentences[cid]) > i:
                    gts[cid].append(id_to_sentences[cid][i])
                    fout.write(id_to_sentences[cid][i] + "\n")
                else:
                    fout.write("\n")

    bleu = Bleu()
    total_ref_sentences = 0
    for i in list(gts.keys()):
        total_ref_sentences += len(gts[i])
    print("Total ref sentences:", total_ref_sentences)
    print("Bleu:")
    print("Positive:", bleu.compute_score(gts, res)[0])
    print("Descriptive:", bleu.compute_score(gts, res_des)[0])
    rouge = Rouge()
    print("Rouge:")
    print("Positive:", rouge.compute_score(gts, res)[0])
    print("Descriptive:", rouge.compute_score(gts, res_des)[0])
    cider = Cider()
    print("Cider:")
    print("Positive:", cider.compute_score(gts, res)[0])
    print("Descriptive:", cider.compute_score(gts, res_des)[0])
    meteor = Meteor()
    print("Meteor:")
    print("Positive:", meteor.compute_score(gts, res)[0])
    print("Descriptive:", meteor.compute_score(gts, res_des)[0])
class SentenceEvaluator(object):
    """Accumulates (generated, ground-truth) sentence pairs and computes
    BLEU/ROUGE, edit-distance statistics, length statistics and readability
    scores over the accumulated corpus.

    Fix: this class still used Python 2 ``print`` statements (syntax errors
    in the Python 3 file it lives in); they are converted to ``print()``
    calls with identical output. Two message typos were also corrected.
    """

    def __init__(self):
        self.gt = {}    # index -> [ground-truth sentence]
        self.gen = {}   # index -> [generated sentence]
        self.count = 0  # next pair index
        self.bleu = Bleu()
        self.rouge = Rouge()
        self.rb = pyrb.Readability(syllable_counter=pyrb.CMUDictCounter())
        # NOTE(review): meteor/cider are disabled here, but meteor_score()
        # and cider_score() below still reference self.meteor/self.cider and
        # would raise AttributeError if called — confirm intent.
        #self.meteor = Meteor()
        #self.cider = Cider()

    def add_sentence_pair(self, generated, ground_truth):
        """Record one generated/ground-truth pair (both must be str)."""
        if not isinstance(generated, str):
            print("ERROR:", generated)
            print(type(generated))
        assert isinstance(generated, str)
        assert isinstance(ground_truth, str)
        self.gt[self.count] = [ground_truth]
        self.gen[self.count] = [generated]
        self.count += 1

    def add_pairs(self, generated, ground_truth):
        """Record aligned lists of generated and ground-truth sentences."""
        assert len(generated) == len(ground_truth)
        for gen, gt in zip(generated, ground_truth):
            self.add_sentence_pair(gen, gt)

    def clear(self):
        """Drop all accumulated pairs."""
        self.gt = {}
        self.gen = {}
        self.count = 0

    def edit_distance(self):
        """Return (mean_dist, mean_normalized_dist, op_counts, normalized_op_counts)
        of word-level edit distance between ground truth and generation."""
        ed = EditDistance()
        total_dist = 0
        total_norm_dist = 0
        op_count = {'m': 0, 'i': 0, 'd': 0, 'r': 0}
        op_count_norm = {'m': 0, 'i': 0, 'd': 0, 'r': 0}
        num_examples = len(self.gt)
        num_examples = max(num_examples, 1)  # avoid division by zero
        for i in self.gt.keys():
            gt = self.gt[i][0].split()
            gen = self.gen[i][0].split()
            max_len = float(max(len(gt), len(gen)))
            max_len = max(max_len, 1.0)
            dist = ed.compute(gt, gen)
            total_dist += dist
            total_norm_dist += dist / max_len
            ops = ed.operations()
            for op in ops:
                op_count[op] += 1
                op_count_norm[op] += 1.0 / max_len
        mean_dist = total_dist / float(num_examples)
        mean_norm_dist = total_norm_dist / float(num_examples)
        for op in op_count:
            op_count[op] /= float(num_examples)
            op_count_norm[op] /= float(num_examples)
        return mean_dist, mean_norm_dist, op_count, op_count_norm

    def bleu_score(self):
        """Corpus-level BLEU score list."""
        score, scores = self.bleu.compute_score(self.gt, self.gen)
        return score

    def bleu_scores(self):
        """Per-sentence BLEU scores, one row per pair."""
        score, scores = self.bleu.compute_score(self.gt, self.gen)
        return np.array(scores).T

    def rouge_score(self):
        """Corpus-level ROUGE-L score."""
        return self.rouge.compute_score(self.gt, self.gen)[0]

    def meteor_score(self):
        # NOTE(review): self.meteor is commented out in __init__ — this
        # raises AttributeError unless it is re-enabled.
        return self.meteor.compute_score(self.gt, self.gen)[0]

    def cider_score(self):
        # NOTE(review): self.cider is commented out in __init__ — this
        # raises AttributeError unless it is re-enabled.
        return self.cider.compute_score(self.gt, self.gen)[0]

    def _get_words_per_sequence(self, lst):
        # Word count of each stored sequence ([sentence] singleton lists).
        lens = [len(a[0].split()) for a in lst]
        return np.array(lens, dtype=np.int32)

    def _get_words_per_sentence(self, lst):
        # Word count per sentence after NLTK sentence splitting.
        lens = []
        for a in lst:
            for s in nltk.sent_tokenize(a[0]):
                lens.append(len(s.split()))
        return np.array(lens, dtype=np.int32)

    def mean_words_per_sentence_gt(self):
        return np.mean(self._get_words_per_sentence(self.gt.values()))

    def mean_words_per_sentence_gen(self):
        return np.mean(self._get_words_per_sentence(self.gen.values()))

    def mean_words_per_sentence_diff(self):
        """Mean of (gt length - generated length) per pair, in words."""
        gt_wps = self._get_words_per_sequence(self.gt.values())
        gen_wps = self._get_words_per_sequence(self.gen.values())
        return np.mean(gt_wps - gen_wps)

    def _get_sentence_list(self, sent_map):
        text = []
        for sent in sent_map.values():
            text.append(sent[0])
        return text

    def _get_sentence_list_gt(self):
        return self._get_sentence_list(self.gt)

    def _get_sentence_list_gen(self):
        return self._get_sentence_list(self.gen)

    def _text_stats_str(self, sentences):
        """Readability statistics of *sentences* joined into one text."""
        text = []
        for sent in sentences:
            sent_strip = sent.strip()
            # Ensure every sentence ends with a period before joining.
            if len(sent_strip) == 0 or sent_strip[-1] != '.':
                text.append(sent_strip + '.')
            else:
                text.append(sent_strip)
        text = " ".join(text)
        stat_str = ""
        try:
            fre = self.rb.flesch_kincaid_reading_ease(text)
            stat_str += "Flesch reading ease: %s\n" % str(fre)
            fkg = self.rb.flesch_kincaid_grade_level(text)
            stat_str += "Flesch kincaid grade: %s\n" % str(fkg)
            cli = self.rb.coleman_liau_index(text)
            stat_str += "Coleman liau index: %s\n" % str(cli)
            ari = self.rb.automated_readability_index(text)
            stat_str += "Automated readability index: %s\n" % str(ari)
            dcrs = self.rb.dale_chall_readability(text)
            stat_str += "Dale chall readability score: %s\n" % str(dcrs)
        except Exception as e:
            stat_str += "Text quality is poor: caused an exception during evaluation."
            print(e)
        return stat_str

    def __repr__(self):
        bleu = self.bleu_score()
        rouge = self.rouge_score()
        #meteor = self.meteor_score()
        #cider = self.cider_score()
        rep = "Evaluation Results (%d pairs):\n" % len(self.gt)
        rep += "Bleu: %s\n" % str(bleu)
        rep += "Rouge: %s\n" % str(rouge)
        #rep += "Meteor: %s\n" % str(meteor)
        #rep += "Cider: %s\n" % str(cider)
        words_per_sentence_gt = self.mean_words_per_sentence_gt()
        rep += "Mean words per sentence ground-truth: %f\n" % words_per_sentence_gt
        words_per_sentence_gen = self.mean_words_per_sentence_gen()
        rep += "Mean words per sentence generated: %f\n" % words_per_sentence_gen
        words_per_sentence_diff = self.mean_words_per_sentence_diff()
        rep += "Mean words per sentence diff 'mean(|gt| - |gen|)': %f\n" % words_per_sentence_diff
        rep += "--------Generated Readability Stats:--------\n"
        rep += self._text_stats_str(self._get_sentence_list_gen())
        rep += "--------Ground Truth Readability Stats:--------\n"
        rep += self._text_stats_str(self._get_sentence_list_gt())
        return rep

    def print_edit_distance(self):
        """Print the edit-distance statistics from :meth:`edit_distance`."""
        mean_dist, mean_norm_dist, op_average, op_average_norm = self.edit_distance(
        )
        print("EditDistance Stats:")
        print("mean_dist:", mean_dist)
        print("mean_norm_dist", mean_norm_dist)
        print("op_average:", op_average)
        print("op_average_norm:", op_average_norm)