def compute_score(self, gts, res):
    assert(list(gts) == list(res))
    imgIds = list(gts)

    bleu_scorer = BleuScorer(n=self._n)
    for id in imgIds:
        hypo = res[id]
        ref = gts[id]

        # Sanity check.
        assert(type(hypo) is list)
        assert(len(hypo) == 1)
        assert(type(ref) is list)
        assert(len(ref) >= 1)

        # Under Python 2, decode byte strings to unicode first.
        if sys.version_info.major == 2:
            for j in range(len(hypo)):
                if type(hypo[j]) == str:
                    hypo[j] = hypo[j].decode('utf-8')
            for j in range(len(ref)):
                if type(ref[j]) == str:
                    ref[j] = ref[j].decode('utf-8')

        bleu_scorer += (hypo[0], ref)

    # score, scores = bleu_scorer.compute_score(option='shortest')
    score, scores = bleu_scorer.compute_score(option='closest', verbose=1)
    # score, scores = bleu_scorer.compute_score(option='average', verbose=1)

    # return (bleu, bleu_info)
    return score, scores
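# A minimal, self-contained sketch of the same scoring pattern on toy data.
# It assumes BleuScorer is importable from pycocoevalcap.bleu.bleu_scorer
# (the packaged coco-caption code); the image ids and captions below are
# made up for illustration and do not come from any snippet in this file.
from pycocoevalcap.bleu.bleu_scorer import BleuScorer

gts = {  # image id -> list of reference captions
    0: ['a dog runs in the park', 'a brown dog is running outside'],
    1: ['two people ride bikes down the street'],
}
res = {  # image id -> single-element list holding the generated caption
    0: ['a dog is running in a park'],
    1: ['two people are riding bicycles on a street'],
}

scorer = BleuScorer(n=4)
for img_id in gts:
    scorer += (res[img_id][0], gts[img_id])

# score is [BLEU-1, BLEU-2, BLEU-3, BLEU-4] over the whole corpus;
# scores is four parallel lists with one value per image.
score, scores = scorer.compute_score(option='closest', verbose=0)
print(score)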
def get_scores(self, preds, target):
    if self.bleu_scorer == 'coco':
        bleu_scorer = BleuScorer(n=self.bleu_order)
        coco = True
    else:
        coco = False
    scores = []
    # Go to sentence space to compute scores:
    hypo = decode_sequence(self.vocab, preds)        # candidate
    refs = decode_sequence(self.vocab, target.data)  # references
    num_img = target.size(0) // self.seq_per_img
    for e, h in enumerate(hypo):
        ix_start = e // self.seq_per_img * self.seq_per_img
        ix_end = ix_start + self.seq_per_img
        if coco:
            bleu_scorer += (h, refs[ix_start:ix_end])
        else:
            scores.append(
                sentence_bleu(h, ' '.join(refs[ix_start:ix_end]),
                              order=self.bleu_order))
    if coco:
        (score, scores) = bleu_scorer.compute_score()
        scores = scores[-1]
    self.logger.debug("Bleu scores: %s" % str(scores))
    return scores
def compute_score(self, refs, hypos):
    '''
    :param refs: instance_num x refer_num x str
    :param hypos: instance_num x 1 x str
    :return: (score, scores) - corpus-level BLEU-1..n and per-instance scores
    '''
    bleu_scorer = BleuScorer(n=self._n)
    for ref, hypo in zip(refs, hypos):
        # Sanity check.
        assert(type(hypo) is list)
        assert(len(hypo) == 1)
        assert(type(ref) is list)
        assert(len(ref) >= 1)

        bleu_scorer += (hypo[0], ref)

    # score, scores = bleu_scorer.compute_score(option='shortest')
    score, scores = bleu_scorer.compute_score(option='closest', verbose=1)
    # score, scores = bleu_scorer.compute_score(option='average', verbose=1)

    # return (bleu, bleu_info)
    return score, scores
def generate_refs(sentences, order_method, align_method, minus, sentid,
                  simi_mat=None):
    tokens = [s['tokens'] for s in sentences]
    if align_method == 'soft':
        hiddens = [s['hidden'] for s in sentences]
    else:
        hiddens = None
    refs = generate_lattice(tokens, hiddens, order_method, align_method,
                            simi_mat=simi_mat, minus=minus)
    for e in tokens:
        refs.add(' '.join(e))
    refs = list(refs)

    # Score every candidate reference against the original sentences.
    bleu_scorer = BleuScorer(n=4)
    for ref in refs:
        bleu_scorer += (ref, [' '.join(e) for e in tokens])
    score, scores = bleu_scorer.compute_score(option='closest', verbose=0)

    # scores[3] holds the per-candidate BLEU-4 values.
    new_sentences = []
    for i, s in enumerate(scores[3]):
        new_ref = {}
        new_ref['imgid'] = sentences[0]['imgid']
        new_ref['raw'] = refs[i]
        new_ref['tokens'] = refs[i].split(' ')
        new_ref['sentid'] = sentid
        new_ref['bleu'] = s
        new_sentences.append(new_ref)
        sentid += 1
    return new_sentences
def compute_score(self, gts, res):
    assert(gts.keys() == res.keys())
    imgIds = gts.keys()

    bleu_scorer = BleuScorer(n=self._n)
    for id in imgIds:
        hypo = res[id]
        ref = gts[id]

        # Sanity check.
        assert(type(hypo) is list)
        assert(len(hypo) == 1)
        assert(type(ref) is list)
        # print(ref)
        # assert(len(ref) > 1)

        bleu_scorer += (hypo[0], ref)

    # score, scores = bleu_scorer.compute_score(option='shortest')
    score, scores = bleu_scorer.compute_score(option='closest', verbose=1)
    # score, scores = bleu_scorer.compute_score(option='average', verbose=1)

    # return (bleu, bleu_info)
    return score, scores
def compute_score(self, gts, res):
    assert(sorted(gts.keys()) == sorted(res.keys()))
    # imgIds = sorted(gts.keys())

    bleu_scorer = BleuScorer(n=self._n)
    for id in gts:
        hypo = res[id]
        ref = gts[id]

        # Sanity check.
        assert(type(hypo) is list)
        assert(len(hypo) == 1)
        assert(type(ref) is list)
        assert(len(ref) >= 1)

        bleu_scorer += (hypo[0], ref)

    # Reduce verbosity
    score, scores = bleu_scorer.compute_score(option='closest', verbose=0)

    # return (bleu, bleu_info)
    return score, scores
def forward(self,  # type: ignore
            context: Dict[str, torch.LongTensor],
            image: torch.Tensor,
            caption: Dict[str, torch.LongTensor],
            face_embeds: torch.Tensor,
            obj_embeds: torch.Tensor,
            metadata: List[Dict[str, Any]],
            names: Dict[str, torch.LongTensor] = None,
            attn_idx=None) -> Dict[str, torch.Tensor]:

    caption_ids, target_ids, contexts = self._forward(
        context, image, caption, face_embeds, obj_embeds)
    decoder_out = self.decoder(caption, contexts)

    # Assume we're using adaptive loss
    loss, sample_size = self.criterion(
        self.decoder.adaptive_softmax, decoder_out, target_ids)

    loss = loss / math.log(2)
    output_dict = {
        'loss': loss / sample_size,
        'sample_size': sample_size,
    }

    # During evaluation, we will generate a caption and compute BLEU, etc.
    if not self.training and self.evaluate_mode:
        _, gen_ids, attns = self._generate(caption_ids, contexts, attn_idx)
        # We ignore <s> and <pad>
        gen_texts = [self.roberta.decode(x[x > 1]) for x in gen_ids.cpu()]
        captions = [m['caption'] for m in metadata]

        output_dict['captions'] = captions
        output_dict['generations'] = gen_texts
        output_dict['metadata'] = metadata
        output_dict['attns'] = attns
        output_dict['gen_ids'] = gen_ids.cpu().detach().numpy()

        # Remove punctuation
        gen_texts = [re.sub(r'[^\w\s]', '', t) for t in gen_texts]
        captions = [re.sub(r'[^\w\s]', '', t) for t in captions]

        for gen, ref in zip(gen_texts, captions):
            bleu_scorer = BleuScorer(n=4)
            bleu_scorer += (gen, [ref])
            score, _ = bleu_scorer.compute_score(option='closest')
            self.sample_history['bleu-1'] += score[0] * 100
            self.sample_history['bleu-2'] += score[1] * 100
            self.sample_history['bleu-3'] += score[2] * 100
            self.sample_history['bleu-4'] += score[3] * 100

            # rogue_scorer = Rouge()
            # score = rogue_scorer.calc_score([gen], [ref])
            # self.sample_history['rogue'] += score * 100

        if 'rare_tokens' in caption:
            for gen, ref, rare_list in zip(gen_texts, captions,
                                           caption['rare_tokens']):
                bleu_scorer = BleuScorer(n=4)
                rare_words = ' '.join(rare_list)
                gen = gen + ' ' + rare_words
                if rare_words:
                    print(ref)
                    print(gen)
                    print()
                bleu_scorer += (gen, [ref])
                score, _ = bleu_scorer.compute_score(option='closest')
                self.sample_history['bleu-1r'] += score[0] * 100

    self.n_samples += caption_ids.shape[0]
    self.n_batches += 1

    return output_dict
def forward(self,  # type: ignore
            context: Dict[str, torch.LongTensor],
            image: torch.Tensor,
            caption: Dict[str, torch.LongTensor],
            face_embeds: torch.Tensor,
            metadata: List[Dict[str, Any]],
            names=None) -> Dict[str, torch.Tensor]:

    caption_ids, target_ids, contexts, X_sections_hiddens, article_padding_mask = self._forward(
        context, image, caption, face_embeds)
    decoder_out = self.decoder(caption, contexts)

    # Assume we're using adaptive loss
    gen_loss, sample_size = self.criterion(
        self.decoder.adaptive_softmax, decoder_out, target_ids)

    entity_loss, copy_loss = self.pointer_loss(
        decoder_out, context, caption, target_ids,
        X_sections_hiddens, article_padding_mask)

    gen_loss = gen_loss / sample_size / math.log(2)
    entity_loss = entity_loss / math.log(2)
    copy_loss = copy_loss / math.log(2)
    loss = entity_loss + copy_loss

    if (self.training and not loss.requires_grad) or torch.isnan(loss):
        loss = None

    if not torch.isnan(gen_loss):
        self.batch_history['gen_loss'] += gen_loss.item()
    if not torch.isnan(entity_loss):
        self.batch_history['entity_loss'] += entity_loss.item()
    if not torch.isnan(copy_loss):
        self.batch_history['copy_loss'] += copy_loss.item()

    output_dict = {
        'loss': loss,
        'sample_size': sample_size,
    }

    # During evaluation, we will generate a caption and compute BLEU, etc.
    if not self.training and self.evaluate_mode:
        log_probs, copy_probs, should_copy_mask, gen_ids = self._generate(
            caption_ids, contexts, X_sections_hiddens,
            article_padding_mask, context)

        gen_texts = [self.roberta.decode(x[x > 1]) for x in gen_ids.cpu()]
        captions = [m['caption'] for m in metadata]
        copied_texts = [self.roberta.decode(x[should_copy_mask[i]])
                        for i, x in enumerate(gen_ids.cpu())]

        output_dict['captions'] = captions
        output_dict['generations'] = gen_texts
        output_dict['metadata'] = metadata
        output_dict['copied_texts'] = copied_texts

        # Remove punctuation
        gen_texts = [re.sub(r'[^\w\s]', '', t) for t in gen_texts]
        captions = [re.sub(r'[^\w\s]', '', t) for t in captions]

        for gen, ref in zip(gen_texts, captions):
            bleu_scorer = BleuScorer(n=4)
            bleu_scorer += (gen, [ref])
            score, _ = bleu_scorer.compute_score(option='closest')
            self.sample_history['bleu-1'] += score[0] * 100
            self.sample_history['bleu-2'] += score[1] * 100
            self.sample_history['bleu-3'] += score[2] * 100
            self.sample_history['bleu-4'] += score[3] * 100

            # rogue_scorer = Rouge()
            # score = rogue_scorer.calc_score([gen], [ref])
            # self.sample_history['rogue'] += score * 100

    self.n_samples += caption_ids.shape[0]
    self.n_batches += 1

    return output_dict
def main():
    args = docopt(__doc__, version='0.0.1')
    args = validate(args)

    if args['ptvsd']:
        address = ('0.0.0.0', args['ptvsd'])
        ptvsd.enable_attach(address)
        ptvsd.wait_for_attach()

    with open(args['counters'], 'rb') as f:
        counters = pickle.load(f)
    full_counter = counters['context'] + counters['caption']

    bleu_scorer = BleuScorer(n=4)
    rouge_scorer = Rouge()
    rouge_scores = []
    cider_scorer = CiderScorer(n=4, sigma=6.0)

    meteor_scorer = Meteor()
    meteor_scorer._stat = types.MethodType(_stat, meteor_scorer)
    meteor_scores = []
    eval_line = 'EVAL'
    meteor_scorer.lock.acquire()
    count = 0

    recalls, precisions = [], []
    rare_recall, rare_recall_total = 0, 0
    rare_precision, rare_precision_total = 0, 0
    full_recall, full_recall_total = 0, 0
    full_precision, full_precision_total = 0, 0
    full_rare_recall, full_rare_recall_total = 0, 0
    full_rare_precision, full_rare_precision_total = 0, 0
    lengths, gt_lengths = [], []
    n_uniques, gt_n_uniques = [], []
    gen_ttrs, cap_ttrs = [], []
    gen_flesch, cap_flesch = [], []
    ent_counter = defaultdict(int)

    with open(args['file']) as f:
        for line in tqdm(f):
            obj = json.loads(line)
            if args['use_processed']:
                caption = obj['caption']
                obj['caption_names'] = obj['processed_caption_names']
            else:
                caption = obj['raw_caption']
            generation = obj['generation']

            if obj['caption_names']:
                recalls.append(compute_recall(obj))
            if obj['generated_names']:
                precisions.append(compute_precision(obj))

            c, t = compute_full_recall(obj)
            full_recall += c
            full_recall_total += t

            c, t = compute_full_precision(obj)
            full_precision += c
            full_precision_total += t

            c, t = compute_rare_recall(obj, counters['caption'])
            rare_recall += c
            rare_recall_total += t

            c, t = compute_rare_precision(obj, counters['caption'])
            rare_precision += c
            rare_precision_total += t

            c, t = compute_rare_recall(obj, full_counter)
            full_rare_recall += c
            full_rare_recall_total += t

            c, t = compute_rare_precision(obj, full_counter)
            full_rare_precision += c
            full_rare_precision_total += t

            # Remove punctuation
            caption = re.sub(r'[^\w\s]', '', caption)
            generation = re.sub(r'[^\w\s]', '', generation)

            lengths.append(len(generation.split()))
            gt_lengths.append(len(caption.split()))
            n_uniques.append(len(set(generation.split())))
            gt_n_uniques.append(len(set(caption.split())))

            bleu_scorer += (generation, [caption])
            rouge_score = rouge_scorer.calc_score([generation], [caption])
            rouge_scores.append(rouge_score)
            cider_scorer += (generation, [caption])

            stat = meteor_scorer._stat(generation, [caption])
            eval_line += ' ||| {}'.format(stat)
            count += 1

            gen_ttrs.append(obj['gen_np']['basic_ttr'])
            cap_ttrs.append(obj['caption_np']['basic_ttr'])
            gen_flesch.append(obj['gen_readability']['flesch_reading_ease'])
            cap_flesch.append(
                obj['caption_readability']['flesch_reading_ease'])

            compute_entities(obj, ent_counter)

    meteor_scorer.meteor_p.stdin.write('{}\n'.format(eval_line).encode())
    meteor_scorer.meteor_p.stdin.flush()
    for _ in range(count):
        meteor_scores.append(
            float(meteor_scorer.meteor_p.stdout.readline().strip()))
    meteor_score = float(meteor_scorer.meteor_p.stdout.readline().strip())
    meteor_scorer.lock.release()

    blue_score, _ = bleu_scorer.compute_score(option='closest')
    rouge_score = np.mean(np.array(rouge_scores))
    cider_score, _ = cider_scorer.compute_score()

    final_metrics = {
        'BLEU-1': blue_score[0],
        'BLEU-2': blue_score[1],
        'BLEU-3': blue_score[2],
        'BLEU-4': blue_score[3],
        'ROUGE': rouge_score,
        'METEOR': meteor_score,
        'CIDEr': cider_score,
        'All names - recall': {
            'count': full_recall,
            'total': full_recall_total,
            'percentage': (full_recall / full_recall_total)
                          if full_recall_total else None,
        },
        'All names - precision': {
            'count': full_precision,
            'total': full_precision_total,
            'percentage': (full_precision / full_precision_total)
                          if full_precision_total else None,
        },
        'Caption rare names - recall': {
            'count': rare_recall,
            'total': rare_recall_total,
            'percentage': (rare_recall / rare_recall_total)
                          if rare_recall_total else None,
        },
        'Caption rare names - precision': {
            'count': rare_precision,
            'total': rare_precision_total,
            'percentage': (rare_precision / rare_precision_total)
                          if rare_precision_total else None,
        },
        'Article rare names - recall': {
            'count': full_rare_recall,
            'total': full_rare_recall_total,
            'percentage': (full_rare_recall / full_rare_recall_total)
                          if full_rare_recall_total else None,
        },
        'Article rare names - precision': {
            'count': full_rare_precision,
            'total': full_rare_precision_total,
            'percentage': (full_rare_precision / full_rare_precision_total)
                          if full_rare_precision_total else None,
        },
        'Length - generation': sum(lengths) / len(lengths),
        'Length - reference': sum(gt_lengths) / len(gt_lengths),
        'Unique words - generation': sum(n_uniques) / len(n_uniques),
        'Unique words - reference': sum(gt_n_uniques) / len(gt_n_uniques),
        'Caption TTR': sum(cap_ttrs) / len(cap_ttrs),
        'Generation TTR': sum(gen_ttrs) / len(gen_ttrs),
        'Caption Flesch Reading Ease': sum(cap_flesch) / len(cap_flesch),
        'Generation Flesch Reading Ease': sum(gen_flesch) / len(gen_flesch),
        'Entity all - recall': {
            'count': ent_counter['n_caption_ent_matches'],
            'total': ent_counter['n_caption_ents'],
            'percentage': ent_counter['n_caption_ent_matches'] /
                          ent_counter['n_caption_ents'],
        },
        'Entity all - precision': {
            'count': ent_counter['n_gen_ent_matches'],
            'total': ent_counter['n_gen_ents'],
            'percentage': ent_counter['n_gen_ent_matches'] /
                          ent_counter['n_gen_ents'],
        },
        'Entity person - recall': {
            'count': ent_counter['n_caption_person_matches'],
            'total': ent_counter['n_caption_persons'],
            'percentage': ent_counter['n_caption_person_matches'] /
                          ent_counter['n_caption_persons'],
        },
        'Entity person - precision': {
            'count': ent_counter['n_gen_person_matches'],
            'total': ent_counter['n_gen_persons'],
            'percentage': ent_counter['n_gen_person_matches'] /
                          ent_counter['n_gen_persons'],
        },
        'Entity GPE - recall': {
            'count': ent_counter['n_caption_gpes_matches'],
            'total': ent_counter['n_caption_gpes'],
            'percentage': ent_counter['n_caption_gpes_matches'] /
                          ent_counter['n_caption_gpes'],
        },
        'Entity GPE - precision': {
            'count': ent_counter['n_gen_gpes_matches'],
            'total': ent_counter['n_gen_gpes'],
            'percentage': ent_counter['n_gen_gpes_matches'] /
                          ent_counter['n_gen_gpes'],
        },
        'Entity ORG - recall': {
            'count': ent_counter['n_caption_orgs_matches'],
            'total': ent_counter['n_caption_orgs'],
            'percentage': ent_counter['n_caption_orgs_matches'] /
                          ent_counter['n_caption_orgs'],
        },
        'Entity ORG - precision': {
            'count': ent_counter['n_gen_orgs_matches'],
            'total': ent_counter['n_gen_orgs'],
            'percentage': ent_counter['n_gen_orgs_matches'] /
                          ent_counter['n_gen_orgs'],
        },
        'Entity DATE - recall': {
            'count': ent_counter['n_caption_date_matches'],
            'total': ent_counter['n_caption_date'],
            'percentage': ent_counter['n_caption_date_matches'] /
                          ent_counter['n_caption_date'],
        },
        'Entity DATE - precision': {
            'count': ent_counter['n_gen_date_matches'],
            'total': ent_counter['n_gen_date'],
            'percentage': ent_counter['n_gen_date_matches'] /
                          ent_counter['n_gen_date'],
        },
    }

    serialization_dir = os.path.dirname(args['file'])
    filename = os.path.basename(args['file']).split('.')[0]
    if args['use_processed']:
        filename += '_processed'
    output_file = os.path.join(serialization_dir,
                               f'{filename}_reported_metrics.json')
    with open(output_file, 'w') as file:
        json.dump(final_metrics, file, indent=4)

    for key, metric in final_metrics.items():
        print(f"{key}: {metric}")
        Caps[k]['gt']['scores'], Caps[k]['gen']['scores'])
    print("Mass distribution:",
          "gt:", sum(np.exp(Caps[k]['gt']['scores'])),
          "gen:", sum(np.exp(Caps[k]['gen']['scores'])))
    # print(Caps[k])

print('Gen:', np.unique(np.array(gens)))
print('Gt:', np.unique(np.array(gts)))

keys = np.array(list(Caps))
batches = np.array_split(keys, 1000)
print("Processing in %d batches" % len(batches))

cnt = 0
for batch in batches:
    cnt += 1
    cider_scorer = CiderScorer(n=4, sigma=6)
    bleu4 = BleuScorer(n=4)
    infer = []
    print('batch indices:', batch)
    for k in batch:
        # print('all caps:', Caps[k])
        refs = Caps[k]['gt']['sents']
        print("Refs:", refs)
        # Leave-one-out: score each ground-truth sentence against the rest.
        for e, ref in enumerate(refs):
            _refs = refs.copy()
            _refs.pop(e)
            cider_scorer += (ref, _refs)
            bleu4 += (ref, _refs)
        # Score each generated sentence against all ground-truth sentences.
        for c in Caps[k]['gen']['sents']:
            cider_scorer += (c, refs)
            bleu4 += (c, refs)
        infer += infer_cosine_gp(Caps[k]['gen']['sents'], refs)
def bleu(hypo, ref1):
    scorer = BleuScorer(n=4)
    # hypo[0] = 'word1 word2 word3 ...'
    # ref1 = ['word1 word2 word3 ...', 'word1 word2 word3 ...']
    scorer += (hypo[0], ref1)
    score, _ = scorer.compute_score()
    print(score)
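# A minimal invocation sketch for bleu() above, using toy sentences. It
# assumes BleuScorer is importable from pycocoevalcap.bleu.bleu_scorer (the
# packaged coco-caption code); the captions are made up for illustration.
from pycocoevalcap.bleu.bleu_scorer import BleuScorer

hypo = ['a man rides a horse on the beach']          # single-element list
ref1 = ['a man is riding a horse along the beach',   # one or more references
        'someone rides a horse near the ocean']
bleu(hypo, ref1)  # prints [BLEU-1, BLEU-2, BLEU-3, BLEU-4]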