def language_eval(dataset, preds, model_id, split):
    import sys
    if 'coco' in dataset:
        sys.path.append("coco-caption")
        annFile = 'coco-caption/annotations/captions_val2014.json'
    else:
        # TODO: NYTIMES
        if split == 'val':
            annFile = './data/val.json'
            with open(annFile, 'rb') as f:
                dataset = json.load(f)
        else:
            annFile = './data/test.json'
            with open(annFile, 'rb') as f:
                dataset = json.load(f)
        # TODO: BREAKINGNEWS
        # with open("/home/abiten/Desktop/Thesis/newspaper/breakingnews/bnews_caps.json", "rb") as f: dataset = json.load(f)
        id_to_ix = {v['cocoid']: ix for ix, v in enumerate(dataset)}
        hypo = {v['image_id']: [v['caption']] for v in preds}
        ref = {
            k: [i['raw'] for i in dataset[id_to_ix[k]]['sentences']]
            for k in hypo.keys()
        }
        final_scores = evaluate(ref, hypo)
        print('Bleu_1:\t', final_scores['Bleu_1'])
        print('Bleu_2:\t', final_scores['Bleu_2'])
        print('Bleu_3:\t', final_scores['Bleu_3'])
        print('Bleu_4:\t', final_scores['Bleu_4'])
        # print('METEOR:\t', final_scores['METEOR'])
        print('ROUGE_L:', final_scores['ROUGE_L'])
        print('CIDEr:\t', final_scores['CIDEr'])
        # print('Spice:\t', final_scores['Spice'])
        return final_scores

    # sys.path.append("f30k-caption")
    # annFile = 'f30k-caption/annotations/dataset_flickr30k.json'
    from pycocotools.coco import COCO
    from pycocoevalcap.eval import COCOEvalCap

    encoder.FLOAT_REPR = lambda o: format(o, '.3f')

    if not os.path.isdir('eval_results'):
        os.mkdir('eval_results')
    cache_path = os.path.join('eval_results/', model_id + '_' + split + '.json')

    coco = COCO(annFile)
    valids = coco.getImgIds()

    # filter results to only those in MSCOCO validation set (will be about a third)
    preds_filt = [p for p in preds if p['image_id'] in valids]
    print('using %d/%d predictions' % (len(preds_filt), len(preds)))
    json.dump(preds_filt, open(cache_path, 'w'))  # serialize to temporary json file. Sigh, COCO API...

    cocoRes = coco.loadRes(cache_path)
    cocoEval = COCOEvalCap(coco, cocoRes)
    cocoEval.params['image_id'] = cocoRes.getImgIds()
    cocoEval.evaluate()

    # create output dictionary
    out = {}
    for metric, score in cocoEval.eval.items():
        out[metric] = score

    imgToEval = cocoEval.imgToEval
    for p in preds_filt:
        image_id, caption = p['image_id'], p['caption']
        imgToEval[image_id]['caption'] = caption
    with open(cache_path, 'w') as outfile:
        json.dump({'overall': out, 'imgToEval': imgToEval}, outfile)

    return out
def language_eval(dataset, preds, preds_n, eval_kwargs, split): model_id = eval_kwargs['id'] eval_oracle = eval_kwargs.get('eval_oracle', 0) import sys sys.path.append("coco-caption") annFile = 'coco-caption/annotations/captions_val2014.json' from pycocotools.coco import COCO from pycocoevalcap.eval import COCOEvalCap # encoder.FLOAT_REPR = lambda o: format(o, '.3f') if not os.path.isdir('eval_results'): os.mkdir('eval_results') cache_path = os.path.join('eval_results/', '.cache_' + model_id + '_' + split + '.json') coco = COCO(annFile) valids = coco.getImgIds() # filter results to only those in MSCOCO validation set (will be about a third) preds_filt = [p for p in preds if p['image_id'] in valids] mean_perplexity = sum([_['perplexity'] for _ in preds_filt]) / len(preds_filt) mean_entropy = sum([_['entropy'] for _ in preds_filt]) / len(preds_filt) print('using %d/%d predictions' % (len(preds_filt), len(preds))) json.dump(preds_filt, open(cache_path, 'w')) # serialize to temporary json file. Sigh, COCO API... cocoRes = coco.loadRes(cache_path) cocoEval = COCOEvalCap(coco, cocoRes) cocoEval.params['image_id'] = cocoRes.getImgIds() cocoEval.evaluate() # create output dictionary out = {} for metric, score in cocoEval.eval.items(): out[metric] = score # Add mean perplexity out['perplexity'] = mean_perplexity out['entropy'] = mean_entropy imgToEval = cocoEval.imgToEval for k in list(imgToEval.values())[0]['SPICE'].keys(): if k != 'All': out['SPICE_' + k] = np.array( [v['SPICE'][k]['f'] for v in imgToEval.values()]) out['SPICE_' + k] = (out['SPICE_' + k][out['SPICE_' + k] == out['SPICE_' + k]]).mean() for p in preds_filt: image_id, caption = p['image_id'], p['caption'] imgToEval[image_id]['caption'] = caption if len(preds_n) > 0: cache_path_n = os.path.join( 'eval_results/', '.cache_' + model_id + '_' + split + '_n.json') spice_n = eval_multi.eval_spice_n(preds_n, model_id, split) out.update(spice_n['overall']) div_stats = eval_multi.eval_div_stats(preds_n, model_id, split) out.update(div_stats['overall']) if eval_oracle: oracle = eval_multi.eval_oracle(preds_n, model_id, split) out.update(oracle['overall']) with open(cache_path_n, 'w') as outfile: json.dump( { 'spice_n': spice_n, 'div_stats': div_stats, 'oracle': oracle }, outfile) out['bad_count_rate'] = sum([count_bad(_['caption']) for _ in preds_filt]) / float(len(preds_filt)) outfile_path = os.path.join('eval_results/', model_id + '_' + split + '.json') with open(outfile_path, 'w') as outfile: json.dump({'overall': out, 'imgToEval': imgToEval}, outfile) return out
def mscoco_eval(test_tokenized_sent_groups, generated_sents_tokenized):
    with open(config.mscoco_eval_dir + '/annotations/references.json', 'w', encoding='utf-8') as f:
        json.dump(
            {
                'info': {
                    'description': None,
                    'url': None,
                    'version': None,
                    'year': None,
                    'contributor': None,
                    'date_created': None
                },
                'images': [{
                    'license': None,
                    'url': None,
                    'file_name': None,
                    'id': image_id,
                    'width': None,
                    'date_captured': None,
                    'height': None
                } for image_id in range(len(test_tokenized_sent_groups))],
                'licenses': [],
                'type': 'captions',
                'annotations': [{
                    'image_id': image_id,
                    'id': caption_id,
                    'caption': ' '.join(sent)
                } for (caption_id, (image_id, sent)) in enumerate(
                    (image_id, sent)
                    for (image_id, sent_group) in enumerate(test_tokenized_sent_groups)
                    for sent in sent_group)]
            }, f)

    with open(config.mscoco_eval_dir + '/results/generated.json', 'w', encoding='utf-8') as f:
        json.dump([{
            'image_id': image_id,
            'caption': ' '.join(sent)
        } for (image_id, sent) in enumerate(generated_sents_tokenized)], f)

    coco = COCO(config.mscoco_eval_dir + '/annotations/references.json')
    cocoRes = coco.loadRes(config.mscoco_eval_dir + '/results/generated.json')
    cocoEval = COCOEvalCap(coco, cocoRes)
    cocoEval.evaluate()

    return {
        'Bleu_1': cocoEval.eval['Bleu_1'],
        'Bleu_2': cocoEval.eval['Bleu_2'],
        'Bleu_3': cocoEval.eval['Bleu_3'],
        'Bleu_4': cocoEval.eval['Bleu_4'],
        'METEOR': cocoEval.eval['METEOR'],
        'ROUGE_L': cocoEval.eval['ROUGE_L'],
        'CIDEr': cocoEval.eval['CIDEr'],
        'SPICE': cocoEval.eval['SPICE'],
        'WMD': cocoEval.eval['WMD'],
        'Bleu_1_all': [item['Bleu_1'] for item in cocoEval.evalImgs],
        'Bleu_2_all': [item['Bleu_2'] for item in cocoEval.evalImgs],
        'Bleu_3_all': [item['Bleu_3'] for item in cocoEval.evalImgs],
        'Bleu_4_all': [item['Bleu_4'] for item in cocoEval.evalImgs],
        'METEOR_all': [item['METEOR'] for item in cocoEval.evalImgs],
        'ROUGE_L_all': [item['ROUGE_L'] for item in cocoEval.evalImgs],
        'CIDEr_all': [item['CIDEr'] for item in cocoEval.evalImgs],
        'SPICE_all': [item['SPICE']['All']['f'] for item in cocoEval.evalImgs],
        'WMD_all': [item['WMD'] for item in cocoEval.evalImgs],
    }
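# A minimal usage sketch for mscoco_eval() above. The toy inputs below are
# hypothetical; real callers pass the tokenized test references and the model's
# tokenized output, and config.mscoco_eval_dir must contain annotations/ and
# results/ subdirectories.
test_tokenized_sent_groups = [
    [['a', 'dog', 'runs', 'on', 'grass'], ['a', 'dog', 'is', 'running']],   # references for image 0
    [['two', 'people', 'ride', 'bikes'], ['cyclists', 'on', 'a', 'road']],  # references for image 1
]
generated_sents_tokenized = [
    ['a', 'dog', 'running', 'on', 'the', 'grass'],  # hypothesis for image 0
    ['people', 'riding', 'bikes'],                  # hypothesis for image 1
]
scores = mscoco_eval(test_tokenized_sent_groups, generated_sents_tokenized)
print(scores['Bleu_4'], scores['CIDEr'])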
def score_generation(gt_filename=None, generation_result=None):
    coco = COCO(gt_filename)
    generation_coco = coco.loadRes(generation_result)
    coco_evaluator = COCOEvalCap(coco, generation_coco, 'noc_test_freq')
    coco_evaluator.evaluate()
def coco_metrics(val_captions_file, result_captions, metric):
    coco = COCO(val_captions_file)
    cocoRes = coco.loadRes(result_captions)
    cocoEval = COCOEvalCap(coco, cocoRes)
    cocoEval.evaluate()
    return cocoEval.eval[metric]
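# Usage sketch for coco_metrics(); the two paths are placeholders, and the
# metric key must be one of the names pycocoevalcap reports
# (e.g. 'Bleu_4', 'METEOR', 'ROUGE_L', 'CIDEr').
cider = coco_metrics('annotations/captions_val2014.json',
                     'results/my_model_captions.json',
                     'CIDEr')
print('CIDEr: %.3f' % cider)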
def coco_eval(model, args, epoch, split=None): ''' model: trained model to be evaluated args: pre-set parameters epoch: epoch #, for disp purpose ''' model.eval() # Validation images are required to be resized to 224x224 already transform = transforms.Compose([ transforms.ToTensor(), transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)) ]) # Load the vocabulary with open(args.vocab_path, 'rb') as f: vocab = pickle.load(f) # Wrapper the COCO VAL dataset eval_data_loader = torch.utils.data.DataLoader( CocoImageFolder(args.image_dir, args.caption_path, transform, split=split), batch_size=args.eval_size, shuffle=False, num_workers=args.num_workers, drop_last=False) # Generated captions to be compared with GT results = [] print( '---------------------Start evaluation on MS-COCO dataset %s-----------------------' % split) for i, (images, image_ids, filename) in enumerate(eval_data_loader): images = to_var(images) if torch.cuda.device_count() > 1: device_ids = range(torch.cuda.device_count()) encoder_parallel = nn.DataParallel(model.encoder, device_ids=device_ids) features, probs = encoder_parallel(images) else: features, probs = model.encoder(images) if args.pattern == 'truelabel': if args.dataset == 'ucm' or args.dataset == 'sydney': preds = torch.LongTensor(image_ids) // 100 preds += 4 preds = to_var(preds).unsqueeze(1) elif args.dataset == 'rsicd': trueLabels = [ args.vocab(str(fn).split('_')[0]) for fn in filename ] trueLabels = torch.LongTensor(trueLabels) preds = to_var(trueLabels).unsqueeze(1) elif args.pattern == 'label': preds = torch.max(probs.data, 1)[1].unsqueeze(1) preds += 4 preds = to_var(preds) else: preds = None # generated_captions, _ = model.decoder.sample(features) generated_captions, _ = model.decoder.sample( features, preds, args.pattern) #sat(adaptive) # generated_captions, _ = model.decoder.sample(features, probs, args.pattern) #fc_lstm captions = generated_captions.cpu().data.numpy() # Build caption based on Vocabulary and the '<end>' token for image_idx in range(captions.shape[0]): sampled_ids = captions[image_idx] sampled_caption = [] for word_id in sampled_ids: word = vocab.idx2word[word_id] if word == '<end>': break else: sampled_caption.append(word) sentence = ' '.join(sampled_caption) temp = {'image_id': int(image_ids[image_idx]), 'caption': sentence} results.append(temp) # Disp evaluation process if (i + 1) % 100 == 0: print('[%d/%d]' % ((i + 1), len(eval_data_loader))) print( '------------------------Caption Generated-------------------------------------' ) # Evaluate the results based on the COCO API # name = str(args.yml).split('.')[0].split('/')[-1] if not os.path.exists(os.path.join(args.checkpoint_path, 'results')): os.mkdir(os.path.join(args.checkpoint_path, 'results')) if split is 'val': resFile = os.path.join( args.checkpoint_path, 'results', args.dataset + '-' + '{0:03d}'.format(epoch) + '.json') else: resFile = os.path.join( args.checkpoint_path, 'results', args.dataset + '-' + split + '-{0:03d}'.format(epoch) + '.json') # resFile = os.path.join(args.checkpoint_path, # args.dataset + '-' + split + '.json') json.dump(results, open(resFile, 'w'), indent=4) annFile = args.caption_val_path coco = COCO(annFile) cocoRes = coco.loadRes(resFile) cocoEval = COCOEvalCap(coco, cocoRes) cocoEval.params['image_id'] = cocoRes.getImgIds() cocoEval.evaluate() # Get CIDEr score for validation evaluation cider = 0. 
    print(
        '-----------Evaluation performance on MS-COCO validation dataset for Epoch %d----------'
        % (epoch))
    for metric, score in cocoEval.eval.items():
        print('%s: %.4f' % (metric, score))
        if metric == 'CIDEr':
            cider = score
    return cider, cocoEval.eval
from pycocotools.coco import COCO
from pycocoevalcap.eval import COCOEvalCap

annotation_file = 'captions_val2014.json'
results_file = 'captions_val2014_fakecap_results.json'

# create coco object and coco_result object
coco = COCO(annotation_file)
coco_result = coco.loadRes(results_file)

# create coco_eval object by taking coco and coco_result
coco_eval = COCOEvalCap(coco, coco_result, exclude_scorers=['spice'])

# evaluate on a subset of images by setting
# coco_eval.params['image_id'] = coco_result.getImgIds()
# please remove this line when evaluating the full validation set
coco_eval.params['image_id'] = coco_result.getImgIds()

# evaluate results
# SPICE will take a few minutes the first time, but speeds up due to caching
coco_eval.evaluate()

# print output evaluation scores
for metric, score in coco_eval.eval.items():
    print(f'{metric}: {score:.3f}')
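# For reference, coco.loadRes() accepts a results file that is a JSON array
# with one record per image: an 'image_id' present in the annotation file and
# a single 'caption' string. A hypothetical example of writing such a file
# (the ids below are made up):
import json

fake_results = [
    {"image_id": 404464, "caption": "a black and white photo of a street sign"},
    {"image_id": 380932, "caption": "a group of people standing on a beach"},
]
with open('captions_val2014_fakecap_results.json', 'w') as f:
    json.dump(fake_results, f)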
def evaluate(beam_size): """ Evaluation :param beam_size: beam size at which to generate captions for evaluation :return: BLEU-4 score """ # Test Dataset testDataset = CaptionDatasetFastText(data_folder, data_name, 'TEST', transform=transforms.Compose( [normalize])) # DataLoader loader = torch.utils.data.DataLoader(testDataset, batch_size=1, shuffle=False, num_workers=1, pin_memory=True, collate_fn=my_collate_test) # TODO: Batched Beam Search # Therefore, do not use a batch_size greater than 1 - IMPORTANT! # Lists to store references (true captions), and hypothesis (prediction) for each image # If for n images, we have n hypotheses, and references a, b, c... for each image, we need - # references = [[ref1a, ref1b, ref1c], [ref2a, ref2b], ...], hypotheses = [hyp1, hyp2, ...] references = list() hypotheses = list() #Fields necessary for json used to compute the chair metric resultJson = {} resultJson['overall'] = { 'Bleu_1': 0, 'Bleu_2': 0, 'Bleu_3': 0, 'Bleu_4': 0, 'METEOR': 0, 'CIDEr': 0, 'SPICE': 0, 'ROUGE_L': 0, } captionsJson = [] #Text file with captions captionOutFile = open('evalCaptions.txt', 'w') synonyms = get_word_synonyms() # For each image for i, (tensor_fg, img_bg, caps, caplens, allcaps) in enumerate( tqdm(loader, desc="EVALUATING AT BEAM SIZE " + str(beam_size))): #Only generate one caption per image, a limitation of the coco evaluation code (only one result per id) if i % 5 != 0: continue imgId = testDataset.getImgId(i) if tensor_fg is None: continue captionString = ' '.join([ rev_word_map[caps[0][idx].item()] for idx in range(caplens[0]) if rev_word_map[caps[0][idx].item()] != '<unk>' ][1:-1]) hasPerson = False for synonym in synonyms[0]: pattern = r"\b{}s?\b".format(synonym) if re.search(pattern, captionString) is not None: hasPerson = True if hasPerson is False: continue k = beam_size # Move to GPU device, if available tensor_fg = tensor_fg.to(device) # (1, 3, 256, 256) img_bg = img_bg.to(device) # (1, 3, 256, 256) # Encode encoder_out = encoder( tensor_fg, img_bg) # (1, enc_image_size, enc_image_size, encoder_dim) enc_image_size = encoder_out.size(1) encoder_dim = encoder_out.size(3) # Flatten encoding encoder_out = encoder_out.view( 1, -1, encoder_dim) # (1, num_pixels, encoder_dim) num_pixels = encoder_out.size(1) # We'll treat the problem as having a batch size of k encoder_out = encoder_out.expand( k, num_pixels, encoder_dim) # (k, num_pixels, encoder_dim) # Tensor to store top k previous words at each step; now they're just <start> k_prev_words = torch.LongTensor([[word_map['<start>']]] * k).to( device) # (k, 1) # Tensor to store top k sequences; now they're just <start> seqs = k_prev_words # (k, 1) # Tensor to store top k sequences' scores; now they're just 0 top_k_scores = torch.zeros(k, 1).to(device) # (k, 1) # Lists to store completed sequences and scores complete_seqs = list() complete_seqs_scores = list() # Start decoding step = 1 h, c = decoder.init_hidden_state(encoder_out) # s is a number less than or equal to k, because sequences are removed from this process once they hit <end> while True: embeddings = decoder.embedding(k_prev_words).squeeze( 1) # (s, embed_dim) awe, _ = decoder.attention(encoder_out, h) # (s, encoder_dim), (s, num_pixels) gate = decoder.sigmoid( decoder.f_beta(h)) # gating scalar, (s, encoder_dim) awe = gate * awe h, c = decoder.decode_step(torch.cat([embeddings, awe], dim=1), (h, c)) # (s, decoder_dim) scores = decoder.fc(h) # (s, vocab_size) scores = F.log_softmax(scores, dim=1) # Add scores = top_k_scores.expand_as(scores) + scores # 
(s, vocab_size) # For the first step, all k points will have the same scores (since same k previous words, h, c) if step == 1: top_k_scores, top_k_words = scores[0].topk(k, 0, True, True) # (s) else: # Unroll and find top scores, and their unrolled indices top_k_scores, top_k_words = scores.view(-1).topk( k, 0, True, True) # (s) # Convert unrolled indices to actual indices of scores prev_word_inds = top_k_words / vocab_size # (s) next_word_inds = top_k_words % vocab_size # (s) # Add new words to sequences seqs = torch.cat( [seqs[prev_word_inds], next_word_inds.unsqueeze(1)], dim=1) # (s, step+1) # Which sequences are incomplete (didn't reach <end>)? incomplete_inds = [ ind for ind, next_word in enumerate(next_word_inds) if next_word != word_map['<end>'] ] complete_inds = list( set(range(len(next_word_inds))) - set(incomplete_inds)) # Set aside complete sequences if len(complete_inds) > 0: complete_seqs.extend(seqs[complete_inds].tolist()) complete_seqs_scores.extend(top_k_scores[complete_inds]) k -= len(complete_inds) # reduce beam length accordingly # Proceed with incomplete sequences if k == 0: break seqs = seqs[incomplete_inds] h = h[prev_word_inds[incomplete_inds]] c = c[prev_word_inds[incomplete_inds]] encoder_out = encoder_out[prev_word_inds[incomplete_inds]] top_k_scores = top_k_scores[incomplete_inds].unsqueeze(1) k_prev_words = next_word_inds[incomplete_inds].unsqueeze(1) # Break if things have been going on too long if step > 50: break step += 1 if len(complete_seqs_scores) == 0: print("Skipping item with no scores") continue seqIdx = complete_seqs_scores.index(max(complete_seqs_scores)) seq = complete_seqs[seqIdx] # References img_caps = allcaps[0].tolist() img_captions = list( map( lambda c: [ w for w in c if w not in { word_map['<start>'], word_map['<end>'], word_map[ '<pad>'] } ], img_caps)) # remove <start> and pads references.append(img_captions) # Hypotheses hypo = [ w for w in seq if w not in {word_map['<start>'], word_map['<end>'], word_map['<pad>']} ] hypotheses.append(hypo) captionWords = [rev_word_map[item] for item in hypo] captionString = ' '.join(captionWords) resultDict = { 'image_id': imgId, 'caption': captionString, 'Bleu_1': 0, 'Bleu_2': 0, 'Bleu_3': 0, 'Bleu_4': 0, 'METEOR': 0, 'CIDEr': 0, 'SPICE': 0, 'ROUGE_L': 0, } captionsJson.append(resultDict) assert len(references) == len(hypotheses) #Print some captions and their image ids if i % 1000 == 0: captionOutFile.write('Image ID: {}\n'.format(imgId)) captionOutFile.write('Caption: {}\n'.format(captionString)) coco = COCO('testGTCaptions.json') cocoRes = coco.loadRes(captionsJson) cocoEval = COCOEvalCap(coco, cocoRes) cocoEval.params['image_id'] = cocoRes.getImgIds() cocoEval.evaluate() # Save results to json file for chair metric resultJson['imgToEval'] = captionsJson with open('evalCaptions.json', 'w') as fp: json.dump(resultJson, fp) # Calculate BLEU-4 scores bleu4 = corpus_bleu(references, hypotheses) return bleu4
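# Several beam-search snippets in this section (this one and the transformer
# evaluator further down) compute prev_word_inds = top_k_words / vocab_size.
# On recent PyTorch versions '/' is true division and returns a float tensor,
# which can no longer be used as an index. A sketch of the usual fix, using
# floor division (torch.div with rounding_mode='floor', PyTorch 1.8+):
import torch

vocab_size = 9490                             # example size only
top_k_words = torch.tensor([3, 9500, 18981])  # flattened (beam * vocab) indices

prev_word_inds = torch.div(top_k_words, vocab_size, rounding_mode='floor')  # beam index
next_word_inds = top_k_words % vocab_size                                   # word index
print(prev_word_inds.tolist(), next_word_inds.tolist())  # [0, 1, 2] [3, 10, 1]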
def evaluate(dataset, predictions, lemmatizer_path=None, extended=False,
             embeddings_file=None, downcase=False):
    data = dataset[DATA_KEY]
    comm = lemmatizer(lemmatizer_path) if lemmatizer_path else None
    f1 = exact_match = total = 0
    n_unanswered = 0
    datum_count = 0
    for datum in data:
        for qa in datum[DOC_KEY][QAS_KEY]:
            total += 1
            if qa[ID_KEY] not in predictions:
                n_unanswered += 1
                continue
            ground_truths = list(map(lambda x: x[TXT_KEY], qa[ANS_KEY]))
            prediction = predictions[qa[ID_KEY]]
            exact_match += metric_max_over_ground_truths(
                exact_match_score, prediction, ground_truths, comm=comm)
            f1 += metric_max_over_ground_truths(
                f1_score, prediction, ground_truths, comm=comm)
        datum_count += 1
    print("There were {} unanswered instances".format(n_unanswered))

    exact_match_all = 100.0 * exact_match / total
    f1_all = 100.0 * f1 / total
    assert exact_match_all <= f1_all
    scores = {'exact_match': exact_match_all, 'f1': f1_all}

    exact_match_ans = 100.0 * exact_match / (total - n_unanswered)
    f1_ans = 100.0 * f1 / (total - n_unanswered)
    assert exact_match_ans <= f1_ans
    scores['exact_match_ans'] = exact_match_ans
    scores['f1_ans'] = f1_ans

    if extended:
        from pycocoevalcap.eval import COCOEvalCap
        from embedding_eval import EmbeddingEval

        # COCO
        ground = {}
        for id, ans in to_id_answertxt(dataset).items():
            normalized_ans = []
            for a in ans:
                normalized_ans.append(normalize_answer(a, comm))
            ground[id] = normalized_ans
        _predictions = {
            id: [normalize_answer(ans, comm)]
            for id, ans in predictions.items()
        }
        cocoEval = COCOEvalCap(ground, _predictions)
        cocoEval.evaluate()

        # Embeddings evaluation
        embEval = EmbeddingEval(ground, _predictions, embeddings_file, downcase)
        embEval.evaluate()

        # scores = {**scores, **cocoEval.eval, **embEval.eval}  # only python3.5
        scores.update(cocoEval.eval)
        scores.update(embEval.eval)

    return scores
def beam_evaluate_trans(data_name, checkpoint_file, data_folder, beam_size, outdir): """ Evaluation :param data_name: name of the data files :param checkpoint_file: which checkpoint file to use :param data_folder: folder where data is stored :param beam_size: beam size at which to generate captions for evaluation :param outdir: place where the outputs are stored, so the checkpoint file :return: Official MSCOCO evaluator scores - bleu4, cider, rouge, meteor """ global word_map device = torch.device("cuda" if torch.cuda.is_available() else "cpu") def load_model(): # Load model using checkpoint file provided torch.nn.Module.dump_patches = True checkpoint = torch.load(os.path.join(outdir, checkpoint_file), map_location=device) decoder = checkpoint['decoder'] decoder = decoder.to(device) decoder.eval() return decoder def load_dictionary(): # Load word map (word2ix) using data folder provided word_map_file = os.path.join(data_folder, 'WORDMAP_' + data_name + '.json') with open(word_map_file, 'r') as j: word_map = json.load(j) rev_word_map = {v: k for k, v in word_map.items()} vocab_size = len(word_map) return word_map, rev_word_map, vocab_size decoder = load_model() word_map, rev_word_map, vocab_size = load_dictionary() # DataLoader loader = torch.utils.data.DataLoader(CaptionDataset( data_folder, data_name, 'TEST'), batch_size=1, shuffle=False, num_workers=1, collate_fn=collate_fn, pin_memory=torch.cuda.is_available()) # Lists to store references (true captions), and hypothesis (prediction) for each image # If for n images, we have n hypotheses, and references a, b, c... for each image, we need - # references = [[ref1a, ref1b, ref1c], [ref2a, ref2b], ...], hypotheses = [hyp1, hyp2, ...] references = list() hypotheses = list() # For each image for caption_idx, (image_features, caps, caplens, orig_caps) in enumerate( tqdm(loader, desc="EVALUATING AT BEAM SIZE " + str(beam_size))): if caption_idx % 5 != 0: continue k = beam_size # Move to GPU device, if available image_features = image_features.to(device) # (1, 36, 2048) image_features_mean = image_features.mean(1) image_features_mean = image_features_mean.expand(k, 2048) # Tensor to store top k previous words at each step; now they're just <start> k_prev_words = torch.tensor([[word_map['<start>']]] * k, dtype=torch.long).to(device) # (k, 1) # Tensor to store top k sequences; now they're just <start> seqs = k_prev_words # (k, 1) # Tensor to store top k sequences' scores; now they're just 0 top_k_scores = torch.zeros(k, 1).to(device) # (k, 1) # Lists to store completed sequences and scores complete_seqs = list() complete_seqs_scores = list() # Start decoding step = 1 h1, c1 = decoder.init_hidden_state(k) # (batch_size, decoder_dim) h2, c2 = decoder.init_hidden_state(k) # s is a number less than or equal to k, because sequences are removed from this process once they hit <end> while True: embeddings = decoder.embedding(k_prev_words).squeeze( 1) # (s, embed_dim) h1, c1 = decoder.top_down_attention( torch.cat([h2, image_features_mean, embeddings], dim=1), (h1, c1)) # (batch_size_t, decoder_dim) trans_obj = decoder.transformer_encoder( image_features.transpose(0, 1)).transpose(0, 1) attention_weighted_encoding = decoder.attention(trans_obj, h1) h2, c2 = decoder.language_model( torch.cat([attention_weighted_encoding, h1], dim=1), (h2, c2)) scores = decoder.fc(h2) # (s, vocab_size) scores = F.log_softmax(scores, dim=1) # Add scores = top_k_scores.expand_as(scores) + scores # (s, vocab_size) # For the first step, all k points will have the same scores 
(since same k previous words, h, c) if step == 1: top_k_scores, top_k_words = scores[0].topk(k, 0, True, True) # (s) else: # Unroll and find top scores, and their unrolled indices top_k_scores, top_k_words = scores.view(-1).topk( k, 0, True, True) # (s) # Convert unrolled indices to actual indices of scores prev_word_inds = top_k_words / vocab_size # (s) next_word_inds = top_k_words % vocab_size # (s) # Add new words to sequences seqs = torch.cat( [seqs[prev_word_inds], next_word_inds.unsqueeze(1)], dim=1) # (s, step+1) # Which sequences are incomplete (didn't reach <end>)? incomplete_inds = [ ind for ind, next_word in enumerate(next_word_inds) if next_word != word_map['<end>'] ] complete_inds = list( set(range(len(next_word_inds))) - set(incomplete_inds)) # Set aside complete sequences if len(complete_inds) > 0: complete_seqs.extend(seqs[complete_inds].tolist()) complete_seqs_scores.extend(top_k_scores[complete_inds]) k -= len(complete_inds) # reduce beam length accordingly # Proceed with incomplete sequences if k == 0: break seqs = seqs[incomplete_inds] h1 = h1[prev_word_inds[incomplete_inds]] c1 = c1[prev_word_inds[incomplete_inds]] h2 = h2[prev_word_inds[incomplete_inds]] c2 = c2[prev_word_inds[incomplete_inds]] image_features_mean = image_features_mean[ prev_word_inds[incomplete_inds]] top_k_scores = top_k_scores[incomplete_inds].unsqueeze(1) k_prev_words = next_word_inds[incomplete_inds].unsqueeze(1) # Break if things have been going on too long if step > 50: break step += 1 i = complete_seqs_scores.index(max(complete_seqs_scores)) seq = complete_seqs[i] # References # img_caps = [' '.join(c) for c in orig_caps] img_caps = [c for c in orig_caps] references.append(img_caps) # Hypotheses hypothesis = ([ rev_word_map[w] for w in seq if w not in {word_map['<start>'], word_map['<end>'], word_map['<pad>']} ]) # hypothesis = ' '.join(hypothesis) hypotheses.append(hypothesis) assert len(references) == len(hypotheses) # Calculate scores # metrics_dict = nlgeval.compute_metrics(references, hypotheses) hypotheses_file = os.path.join(outdir, 'hypotheses', 'TEST.Hypotheses.json') references_file = os.path.join(outdir, 'references', 'TEST.References.json') create_captions_file(range(len(hypotheses)), hypotheses, hypotheses_file) create_captions_file(range(len(references)), references, references_file) coco = COCO(references_file) # add the predicted results to the object coco_results = coco.loadRes(hypotheses_file) # create the evaluation object with both the ground-truth and the predictions coco_eval = COCOEvalCap(coco, coco_results) # change to use the image ids in the results object, not those from the ground-truth coco_eval.params['image_id'] = coco_results.getImgIds() # run the evaluation coco_eval.evaluate(verbose=False, metrics=['bleu', 'meteor', 'rouge', 'cider']) # Results contains: "Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4", "METEOR", "ROUGE_L", "CIDEr", "SPICE" results = coco_eval.eval return results
def language_eval(dataset, preds, model_id, split):
    import sys
    sys.path.append("coco-caption")
    if 'coco' in dataset:
        annFile = 'coco-caption/annotations/captions_val2014.json'
    elif 'flickr30k' in dataset or 'f30k' in dataset:
        annFile = 'coco-caption/f30k_captions4eval.json'
    from pycocotools.coco import COCO
    from pycocoevalcap.eval import COCOEvalCap

    # encoder.FLOAT_REPR = lambda o: format(o, '.3f')

    if '+' in model_id:  # ensemble
        save_to = 'eval_results/ensemble'
    else:
        save_to = 'eval_results/single'
    if not os.path.isdir(save_to):
        os.mkdir(save_to)
    cache_path = os.path.join(save_to, model_id + '_' + split + '.json')
    # cache_path = os.path.join(save_to, 'tmp_' + split + '.json')

    coco = COCO(annFile)
    valids = coco.getImgIds()

    # filter results to only those in MSCOCO validation set (will be about a third)
    preds_filt = [p for p in preds if p['image_id'] in valids]
    print('using %d/%d predictions' % (len(preds_filt), len(preds)))
    # json.dump(preds_filt, open(cache_path, 'w'))  # serialize to temporary json file. Sigh, COCO API...
    with open(cache_path, 'w') as f:
        json.dump(preds_filt, f)
    print("Write prediction results to {}".format(cache_path))

    cocoRes = coco.loadRes(cache_path)
    cocoEval = COCOEvalCap(coco, cocoRes)
    cocoEval.params['image_id'] = cocoRes.getImgIds()
    cocoEval.evaluate()

    # create output dictionary
    out = {}
    for metric, score in cocoEval.eval.items():
        out[metric] = score

    imgToEval = cocoEval.imgToEval
    for p in preds_filt:
        image_id, caption = p['image_id'], p['caption']
        imgToEval[image_id]['caption'] = caption
    out['bad_count_rate'] = sum([count_bad(_['caption']) for _ in preds_filt]) / float(len(preds_filt))

    outfile_path = os.path.join(save_to, model_id + '_' + split + '_imgToEval.json')
    # outfile_path = os.path.join(save_to, 'tmp_' + split + '_imgToEval.json')
    with open(outfile_path, 'w') as outfile:
        json.dump(
            {
                'overall': out,
                'imgToEval': imgToEval,
                'predCaption': preds_filt
            }, outfile)
    print("Write prediction results to {}".format(outfile_path))
    return out
def validate(val_loader, decoder, criterion_ce, criterion_dis, epoch): """ Performs one epoch's validation. :param val_loader: DataLoader for validation data. :param decoder: decoder model :param criterion_ce: cross entropy loss layer :param criterion_dis : discriminative loss layer :return: BLEU-4 score """ decoder.eval() # eval mode (no dropout or batchnorm) batch_time = AverageMeter() losses = AverageMeter() top5accs = AverageMeter() start = time.time() references = list() # references (true captions) for calculating BLEU-4 score hypotheses = list() # hypotheses (predictions) # Batches with torch.no_grad(): # for i, (imgs, caps, caplens,allcaps) in enumerate(val_loader): for i, sample in enumerate(val_loader): if i % 5 != 0: # only decode every 5th caption, starting from idx 0. # this is because the iterator iterates over all captions in the dataset, not all images. if i % args.print_freq_val == 0: print('Validation: [{0}/{1}]\t' 'Batch Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t' 'Loss {loss.val:.4f} ({loss.avg:.4f})\t' 'Top-5 Accuracy {top5.val:.3f} ({top5.avg:.3f})\t'.format(i, len(val_loader), batch_time=batch_time, loss=losses, top5=top5accs)) continue if scene_graph: (obj, rel, caps, caplens, orig_caps, obj_mask, rel_mask, pair_idx) = sample obj = obj.to(device) rel = rel.to(device) obj_mask = obj_mask.to(device) rel_mask = rel_mask.to(device) pair_idx = pair_idx.to(device) else: (imgs, caps, caplens, orig_caps) = sample imgs = imgs.to(device) # Move to device, if available caps = caps.to(device) caplens = caplens.to(device) # Forward prop. if scene_graph: scores, scores_d, caps_sorted, decode_lengths, sort_ind = decoder(object_features=obj, relation_features=rel, encoded_captions=caps, caption_lengths=caplens, object_mask=obj_mask, relation_mask=rel_mask, rel_pair_idx=pair_idx) else: scores, scores_d, caps_sorted, decode_lengths, sort_ind = decoder(imgs, caps, caplens) # Max-pooling across predicted words across time steps for discriminative supervision scores_d = scores_d.max(1)[0] # Since we decoded starting with <start>, the targets are all words after <start>, up to <end> targets = caps_sorted[:, 1:] targets_d = torch.zeros(scores_d.size(0), scores_d.size(1)).to(device) targets_d.fill_(-1) for length in decode_lengths: targets_d[:, :length - 1] = targets[:, :length - 1] # Remove timesteps that we didn't decode at, or are pads # pack_padded_sequence is an easy trick to do this scores_copy = scores.clone() scores = pack_padded_sequence(scores, decode_lengths, batch_first=True, enforce_sorted=True).data targets = pack_padded_sequence(targets, decode_lengths, batch_first=True, enforce_sorted=True).data #scores, _ = pack_padded_sequence(scores, decode_lengths, batch_first=True) #targets, _ = pack_padded_sequence(targets, decode_lengths, batch_first=True) # Calculate loss loss_d = criterion_dis(scores_d, targets_d.long()) loss_g = criterion_ce(scores, targets) loss = loss_g + (10 * loss_d) # Keep track of metrics losses.update(loss.item(), sum(decode_lengths)) top5 = accuracy(scores, targets, 5) top5accs.update(top5, sum(decode_lengths)) batch_time.update(time.time() - start) start = time.time() if i % args.print_freq_val == 0: print('Validation: [{0}/{1}]\t' 'Batch Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t' 'Loss {loss.val:.4f} ({loss.avg:.4f})\t' 'Top-5 Accuracy {top5.val:.3f} ({top5.avg:.3f})\t'.format(i, len(val_loader), batch_time=batch_time, loss=losses, top5=top5accs)) # Store references (true captions), and hypothesis (prediction) for each image # If for n 
images, we have n hypotheses, and references a, b, c... for each image, we need - # references = [[ref1a, ref1b, ref1c], [ref2a, ref2b], ...], hypotheses = [hyp1, hyp2, ...] # References assert (len(sort_ind) == 1), "Cannot have batch_size>1 for validation." # a reference is a list of lists: # [['the', 'cat', 'sat', 'on', 'the', 'mat'], ['a', 'cat', 'on', 'the', 'mat']] references.append(orig_caps) # Hypotheses _, preds = torch.max(scores_copy, dim=2) preds = preds.tolist() preds_idxs_no_pads = list() for j, p in enumerate(preds): preds_idxs_no_pads.append(preds[j][:decode_lengths[j]]) # remove pads preds_idxs_no_pads = list(map(lambda c: [w for w in c if w not in {word_map['<start>'], word_map['<pad>']}], preds_idxs_no_pads)) temp_preds = list() # remove <start> and pads and convert idxs to string for hyp in preds_idxs_no_pads: temp_preds.append([]) for w in hyp: assert (not w == word_map['pad']), "Should have removed all pads." if not w == word_map['<start>']: temp_preds[-1].append(word_map_inv[w]) preds = temp_preds hypotheses.extend(preds) assert len(references) == len(hypotheses) # Calculate BLEU-4 scores # bleu4 = corpus_bleu(references, hypotheses) # bleu4 = round(bleu4, 4) # compute the metrics hypotheses_file = os.path.join(args.outdir, 'hypotheses', 'Epoch{:0>3d}.Hypotheses.json'.format(epoch)) references_file = os.path.join(args.outdir, 'references', 'Epoch{:0>3d}.References.json'.format(epoch)) create_captions_file(range(len(hypotheses)), hypotheses, hypotheses_file) create_captions_file(range(len(references)), references, references_file) coco = COCO(references_file) # add the predicted results to the object coco_results = coco.loadRes(hypotheses_file) # create the evaluation object with both the ground-truth and the predictions coco_eval = COCOEvalCap(coco, coco_results) # change to use the image ids in the results object, not those from the ground-truth coco_eval.params['image_id'] = coco_results.getImgIds() # run the evaluation coco_eval.evaluate(verbose=False, metrics=['bleu', 'meteor', 'rouge', 'cider']) # Results contains: "Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4", "METEOR", "ROUGE_L", "CIDEr", "SPICE" results = coco_eval.eval results['loss'] = losses.avg results['top5'] = top5accs.avg for k, v in results.items(): print(k+':\t'+str(v)) # print('\n * LOSS - {loss.avg:.3f}, TOP-5 ACCURACY - {top5.avg:.3f}, BLEU-4 - {bleu}, CIDEr - {cider}\n' # .format(loss=losses, top5=top5accs, bleu=round(results['Bleu_4'], 4), cider=round(results['CIDEr'], 1))) return results
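# create_captions_file() is not shown in these snippets. For the stock
# pycocotools API used afterwards, COCO(references_file) needs a full
# annotation-style dict, while coco.loadRes(hypotheses_file) needs a flat list
# of result records. A hedged sketch of a writer producing both layouts (the
# function name and field handling here are illustrative assumptions, not the
# original implementation):
import json

def write_coco_eval_files(image_ids, references, hypotheses,
                          references_file, hypotheses_file):
    """references[i] is a list of tokenized reference captions for image_ids[i];
    hypotheses[i] is one tokenized predicted caption for the same image."""
    images, annotations, results = [], [], []
    ann_id = 0
    for img_id, refs, hyp in zip(image_ids, references, hypotheses):
        images.append({'id': img_id})
        for ref in refs:
            annotations.append({'image_id': img_id, 'id': ann_id, 'caption': ' '.join(ref)})
            ann_id += 1
        results.append({'image_id': img_id, 'caption': ' '.join(hyp)})
    with open(references_file, 'w') as f:
        json.dump({'images': images, 'annotations': annotations,
                   'type': 'captions', 'info': {}, 'licenses': []}, f)
    with open(hypotheses_file, 'w') as f:
        json.dump(results, f)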
def language_eval(dataset, preds, model_id, split): import sys sys.path.append("coco-caption") if 'coco' in dataset: annFile = 'coco-caption/annotations/captions_val2014.json' elif 'flickr30k' in dataset or 'f30k' in dataset: annFile = 'coco-caption/f30k_captions4eval.json' elif 'person' in dataset: annFile='coco-caption/person_captions4eval.json' from pycocotools.coco import COCO from pycocoevalcap.eval import COCOEvalCap # encoder.FLOAT_REPR = lambda o: format(o, '.3f') if not os.path.isdir('eval_results'): os.mkdir('eval_results') cache_path = os.path.join('eval_results/', '.cache_'+ model_id + '_' + split + '.json') best_cider=0 #gdindex=[0,1,2,3,4] gdindex=[-1] cider_list =[] for i in gdindex: annFile='coco-caption/person_captions4eval_'+str(i)+'.json' print(annFile) coco = COCO(annFile) valids = coco.getImgIds() # filter results to only those in MSCOCO validation set (will be about a third) preds_filt = [p for p in preds if p['image_id'] in valids] print('using %d/%d predictions' % (len(preds_filt), len(preds))) json.dump(preds_filt, open(cache_path, 'w')) # serialize to temporary json file. Sigh, COCO API... cocoRes = coco.loadRes(cache_path) cocoEval = COCOEvalCap(coco, cocoRes) cocoEval.params['image_id'] = cocoRes.getImgIds() cocoEval.evaluate() cider_list.append(cocoEval.eval['CIDEr']) # create output dictionary if cocoEval.eval['CIDEr']>=best_cider: best_cider = cocoEval.eval['CIDEr'] out = {} for metric, score in cocoEval.eval.items(): out[metric] = score imgToEval = cocoEval.imgToEval # collect SPICE_sub_score #for k in imgToEval.values()[0]['SPICE'].keys(): # if k != 'All': # out['SPICE_'+k] = np.array([v['SPICE'][k]['f'] for v in imgToEval.values()]) # out['SPICE_'+k] = (out['SPICE_'+k][out['SPICE_'+k]==out['SPICE_'+k]]).mean() for p in preds_filt: image_id, caption = p['image_id'], p['caption'] imgToEval[image_id]['caption'] = caption #update predictions for i in range(len(preds)): if preds[i]['image_id'] in imgToEval: preds[i]['eval'] = imgToEval[preds[i]['image_id']] out['bad_count_rate'] = sum([count_bad(_['caption']) for _ in preds_filt]) / float(len(preds_filt)) else: continue outfile_path = os.path.join('eval_results/', model_id + '_' + split + '.json') with open(outfile_path, 'w') as outfile: json.dump({'overall': out, 'imgToEval': imgToEval}, outfile) cider_list=np.array(cider_list) print("min:",np.min(cider_list)," max:",np.max(cider_list)," mean:",np.mean(cider_list)," std:",np.std(cider_list)) return out
def language_eval(dataset, preds, preds_n, job_id, split, eval_oracle=False): # # create output dictionary out = {} # # Diversity not implemented # =========== # if len(preds_n) > 0: # # vocab size and novel sentences # if "coco" in dataset: # dataset_file = "data/dataset_coco.json" # elif "flickr30k" in dataset or "f30k" in dataset: # dataset_file = "data/dataset_flickr30k.json" # training_sentences = set( # [ # " ".join(__["tokens"]) # for _ in json.load(open(dataset_file))["images"] # if not _["split"] in ["val", "test"] # for __ in _["sentences"] # ] # ) # generated_sentences = set([_["caption"] for _ in preds_n]) # novels = generated_sentences - training_sentences # out["novel_sentences"] = float(len(novels)) / len(preds_n) # tmp = [_.split() for _ in generated_sentences] # words = [] # for _ in tmp: # words += _ # out["vocab_size"] = len(set(words)) # # Set cache path cache_path = os.path.join("eval_results/", f".cache_{job_id}_{split}.json") # # Extract image ids in current data set coco = getCOCO(dataset) image_ids = coco.getImgIds() # # Filter results to only those in MSCOCO validation set filtered_predictions = [p for p in preds if p["image_id"] in image_ids] num_filtered_predictions = float(len(filtered_predictions)) num_predictions = float(len(preds)) # # Save predictions mean_perplexity = (sum([p["perplexity"] for p in filtered_predictions]) / num_filtered_predictions) mean_entropy = (sum([p["entropy"] for p in filtered_predictions]) / num_filtered_predictions) print(f"using {num_filtered_predictions}/{num_predictions} predictions") json.dump(filtered_predictions, open(cache_path, "w")) # serialize to temporary json file. Sigh, COCO API... # # Evaluate captions # NOTE: loadRes() API call requires a json file, hence the above comment cocoRes = coco.loadRes(cache_path) cocoEval = COCOEvalCap(coco, cocoRes) cocoEval.params["image_id"] = cocoRes.getImgIds() cocoEval.evaluate() # # Compile results so far out["perplexity"] = mean_perplexity out["entropy"] = mean_entropy # for metric, score in cocoEval.eval.items(): out[metric] = score # # Record SPICE scores?? imgToEval = cocoEval.imgToEval for k in list(imgToEval.values())[0]["SPICE"].keys(): if k != "All": out["SPICE_" + k] = np.array( [v["SPICE"][k]["f"] for v in imgToEval.values()]) out["SPICE_" + k] = (out["SPICE_" + k][out["SPICE_" + k] == out["SPICE_" + k]]).mean() # # Overwrite caption or set? 
for p in filtered_predictions: image_id, caption = p["image_id"], p["caption"] imgToEval[image_id]["caption"] = caption # # Diverse sampling not implemented # ================== # if len(preds_n) > 0: # import eval_multi # cache_path_n = os.path.join( # "eval_results/", ".cache_" + job_id + "_" + split + "_n.json" # ) # allspice = eval_multi.eval_allspice(dataset, preds_n, job_id, split) # out.update(allspice["overall"]) # div_stats = eval_multi.eval_div_stats(dataset, preds_n, job_id, split) # out.update(div_stats["overall"]) # if eval_oracle: # oracle = eval_multi.eval_oracle(dataset, preds_n, job_id, split) # out.update(oracle["overall"]) # else: # oracle = None # self_cider = eval_multi.eval_self_cider(dataset, preds_n, job_id, split) # out.update(self_cider["overall"]) # with open(cache_path_n, "w") as outfile: # json.dump( # { # "allspice": allspice, # "div_stats": div_stats, # "oracle": oracle, # "self_cider": self_cider, # }, # outfile, # ) # # Fraction of captions that have illegal endings # SEE: bad_endings in /utils/constants.py num_bad_endings = sum( [count_bad(_["caption"]) for _ in filtered_predictions]) out["bad_count_rate"] = num_bad_endings / num_filtered_predictions # # Write evaluation results to json outfile_path = os.path.join("eval_results/", f"{job_id}_{split}.json") with open(outfile_path, "w") as outfile: json.dump({"overall": out, "imgToEval": imgToEval}, outfile) # return out
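# The SPICE sub-score aggregation in the two language_eval() variants above
# drops NaN entries with the x[x == x] identity trick (NaN is the only value
# not equal to itself) before averaging. numpy.nanmean expresses the same
# computation more directly:
import numpy as np

spice_f = np.array([0.21, np.nan, 0.18, 0.25])     # per-image sub-scores, some undefined

mean_trick = spice_f[spice_f == spice_f].mean()    # original idiom
mean_nan = np.nanmean(spice_f)                     # equivalent, more readable
assert abs(mean_trick - mean_nan) < 1e-12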
def main(test_json_path): model_list = [] for i in xrange(3): model_list.append( os.path.join(os.path.dirname(__file__), 'models/train', 'model.ckpt' + str(i))) var_list = tf.contrib.framework.list_variables(model_list[0]) var_values, var_dtypes = {}, {} for (name, shape) in var_list: if not name.startswith("global_step"): var_values[name] = np.zeros(shape) for model_path in model_list: reader = tf.contrib.framework.load_checkpoint(model_path) for name in var_values: tensor = reader.get_tensor(name) var_dtypes[name] = tensor.dtype var_values[name] += tensor for name in var_values: var_values[name] /= len(model_list) tf_vars = [ tf.get_variable(v, shape=var_values[v].shape, dtype=var_dtypes[name]) for v in var_values ] placeholders = [ tf.placeholder(v.dtype, shape=v.get_shape()) for v in tf_vars ] assign_ops = [tf.assign(v, p) for (v, p) in zip(tf_vars, placeholders)] global_step = tf.Variable(0, name="global_step", trainable=False, dtype=tf.int32) saver = tf.train.Saver(tf.all_variables()) with tf.Session() as sess: sess.run(tf.global_variables_initializer()) for p, assign_op, (name, value) in zip(placeholders, assign_ops, six.iteritems(var_values)): sess.run(assign_op, {p: value}) saver.save(sess, os.path.join(os.path.dirname(__file__), 'models/tmp/model.ckpt'), global_step=global_step) with open(os.path.join(os.path.dirname(__file__), 'data/features.pkl'), 'r') as f: keyword_data = cPickle.load(f) with open(test_json_path) as f: test_json = json.load(f) id_to_filename = test_json['images'] id_to_path = [{ 'path': os.path.join('./Data/test', x['file_name']), 'id': x['id'] } for x in id_to_filename] result_json = [] g = tf.Graph() with g.as_default(): model = inference_wrapper.InferenceWrapper() restore_fn = model.build_graph_from_config( configuration.ModelConfig(), os.path.join(os.path.dirname(__file__), 'models/tmp/model.ckpt-0')) g.finalize() vocab = vocabulary.Vocabulary(os.path.join('./Data/word_counts.txt')) with tf.Session(graph=g) as sess: restore_fn(sess) generator = caption_generator.CaptionGenerator(model, vocab) for data in id_to_path: filename = data['path'] with tf.gfile.GFile(filename, "r") as f: image = f.read() captions = generator.beam_search( sess, image, keyword_data[os.path.basename(filename)]) print("Captions for image %s:" % os.path.basename(filename)) result = { 'image_id': data['id'], 'caption': (" ".join([ vocab.id_to_word(w) for w in captions[0].sentence[1:-1] ])).decode('utf-8') } print(result) result_json.append(result) with open(os.path.join(os.path.dirname(__file__), "result.json"), 'w') as f: json.dump(result_json, f) coco = COCO(test_json_path) cocoRes = coco.loadRes( os.path.join(os.path.dirname(__file__), "result.json")) cocoEval = COCOEvalCap(coco, cocoRes) cocoEval.evaluate()
base_dir + '/data2/anja/xai/captions/val-confident-man-500new.txt').readlines() confident_man = [int(c.strip()) for c in confident_man] confident_woman = open( base_dir + '/data2/anja/xai/captions/val-confident-woman-500new.txt').readlines() confident_woman = [int(c.strip()) for c in confident_woman] confident_ims = set(confident_man + confident_woman) & set(bias_ids) image_set = confident_ims else: print "Invalid set specified" for caption_path in caption_paths: generation_coco = coco.loadRes(caption_path[1]) coco_evaluator = COCOEvalCap(coco, generation_coco) coco_evaluator.params['image_id'] = list( set(image_set) & set(generation_coco.getImgIds())) coco_evaluator.evaluate() predicted_caps = json.load(open(caption_path[1])) for cap in predicted_caps: words = nltk.word_tokenize(cap['caption'].lower()) words = [ 'person' if word in gendered_words else word for word in words ] cap['caption'] = ' '.join(words) if len(set(words) & gendered_words) > 0: pdb.set_trace() """ person_caps = 'tmp/person_caps.json'
def language_eval(dataset, preds, model_id, split, detail_flg=False, wbleu_set=None, option='closest'):
    import sys
    sys.path.append("coco-caption")
    if dataset == 'coco':
        annFile = 'coco-caption/annotations/captions_val2014.json'
    elif dataset == 'vg':
        annFile = '/mnt/poplin/share/dataset/visualgenome/captions_vg.json'
    elif dataset == 'iapr':
        annFile = '/mnt/workspace2018/nakamura/IAPR/captions_iapr.json'
    from pycocotools.coco import COCO
    from pycocoevalcap.eval import COCOEvalCap

    # encoder.FLOAT_REPR = lambda o: format(o, '.3f')

    if not os.path.isdir('eval_results'):
        os.mkdir('eval_results')
    cache_path = os.path.join('eval_results/', model_id + '_' + split + '.json')

    coco = COCO(annFile)
    valids = coco.getImgIds()

    # filter results to only those in MSCOCO validation set (will be about a third)
    preds_filt = [p for p in preds if p['image_id'] in valids]
    print('using %d/%d predictions' % (len(preds_filt), len(preds)))
    json.dump(preds_filt, open(cache_path, 'w'))  # serialize to temporary json file. Sigh, COCO API...

    cocoRes = coco.loadRes(cache_path)
    cocoEval = COCOEvalCap(coco, cocoRes, wbleu_set=wbleu_set)
    cocoEval.params['image_id'] = cocoRes.getImgIds()
    cocoEval.evaluate()

    # create output dictionary
    out = {}
    for metric, score in cocoEval.eval.items():
        out[metric] = score

    imgToEval = cocoEval.imgToEval
    for p in preds_filt:
        image_id, caption = p['image_id'], p['caption']
        imgToEval[image_id]['caption'] = caption
    with open(cache_path, 'w') as outfile:
        json.dump({'overall': out, 'imgToEval': imgToEval}, outfile)

    if detail_flg:
        out_detail_scores = {}
        # for i in range(len(cocoEval.imgToEval.items())):
        #     pdb.set_trace()
        #     out_detail_scores[str(cocoEval.imgToEval.items()[i][0])] = cocoEval.imgToEval.items()[i][1]
        for key in cocoEval.imgToEval.keys():
            out_detail_scores[str(key)] = cocoEval.imgToEval[key]
        return [out, out_detail_scores]

    return out
import tqdm
import json
import pandas as pd
from pycocotools.coco import COCO
from torchvision.datasets import CocoCaptions
from pycocoevalcap.eval import COCOEvalCap

coco = COCO("./data/annotations/captions_val2014.json")
res_file = "./results/captions_model.json"
out_file = "./results/val2014_scores.xlsx"

# evaluate best captions against gt
coco_result = coco.loadRes(res_file)
cocoEval = COCOEvalCap(coco, coco_result)
cocoEval.params['image_id'] = coco_result.getImgIds()
cocoEval.evaluate()

indices = [
    "BLEU 1-gram", "BLEU 2-gram", "BLEU 3-gram", "BLEU 4-gram",
    "METEOR", "ROUGE_L", "CIDEr"
]
data = [cocoEval.eval['Bleu_1']] + [cocoEval.eval['Bleu_2']] + [cocoEval.eval['Bleu_3']] + [cocoEval.eval['Bleu_4']] + \
       [cocoEval.eval['METEOR']] + [cocoEval.eval['ROUGE_L']] + [cocoEval.eval['CIDEr']]

results = pd.DataFrame(columns=[f"3 epochs, lr=0.001"], index=indices, data=data)
results.to_excel(out_file)
print(f"Results saved to {out_file}")
def main(args):
    # Image preprocessing
    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))])

    # Load vocabulary
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    # Build models
    encoder = Encoder(args.embed_size).eval()
    decoder = Decoder(args.embed_size, args.hidden_size, len(vocab), args.num_layers)
    encoder = encoder.to(device)
    decoder = decoder.to(device)

    # Load the trained model parameters
    encoder.load_state_dict(torch.load(args.encoder_path))
    decoder.load_state_dict(torch.load(args.decoder_path))

    # load validation image set
    lis = os.listdir(args.image_dir)
    num = len(lis)
    captions = []
    for i in range(num):
        im_pth = os.path.join(args.image_dir, lis[i])
        image = load_image(im_pth, transform)
        image_tensor = image.to(device)

        # Generate a caption from the image
        feature = encoder(image_tensor)
        sampled_ids = decoder.sample(feature)
        sampled_ids = sampled_ids[0].cpu().numpy()  # (1, max_seq_length) -> (max_seq_length)

        # Convert word_ids to words
        sampled_caption = []
        for word_id in sampled_ids:
            word = vocab.idx2word[word_id]
            if word == '<start>':
                continue
            if word == '<end>':
                break
            sampled_caption.append(word)
        sentence = ' '.join(sampled_caption)

        cap = {}
        id = int(lis[i][14:-4])  # extract image id
        cap['image_id'] = id
        cap['caption'] = sentence
        captions.append(cap)

    # save results
    with open('captions_res.json', 'w') as f:
        json.dump(captions, f)

    # evaluation with coco-caption evaluation tools
    coco = COCO(args.caption_path)
    cocoRes = coco.loadRes('captions_res.json')
    cocoEval = COCOEvalCap(coco, cocoRes)
    cocoEval.params['image_id'] = cocoRes.getImgIds()
    cocoEval.evaluate()
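# The slice lis[i][14:-4] above assumes the numeric image id sits at a fixed
# offset before a 4-character extension. A more defensive sketch (hypothetical
# helper, assuming the id is the trailing digit run of the file name):
import os
import re

def extract_image_id(filename):
    """Return the trailing integer id from names like 'COCO_val2014_000000123456.jpg'."""
    stem = os.path.splitext(os.path.basename(filename))[0]
    match = re.search(r'(\d+)$', stem)
    if match is None:
        raise ValueError('no numeric image id in %r' % filename)
    return int(match.group(1))

print(extract_image_id('COCO_val2014_000000123456.jpg'))  # 123456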
def consensus_rerank(self, method='cider', flag_eval=True): # Only support cider and bleu currently assert (method == 'cider' or method == 'bleu') assert (len(self.NNimg_list) == len(self.anno_list_hypo)) fol_cache = os.path.join(self.conf.fol_root_cache, self.conf.name_cache) if not os.path.exists(fol_cache): os.makedirs(fol_cache) if method == 'cider': # prepare cider coco = COCO(self.conf.fname_eval_ref) cocoEvalCider = COCOEvalCapPairCider(coco) cocoEvalCider.setup() k = self.conf.k_cider m = self.conf.m_cider key_reranking = 'rerank_%s_k%d_m%d_cider' \ % (self.conf.gen_method, k, m) else: k = self.conf.k_bleu m = self.conf.m_bleu key_reranking = 'rerank_%s_k%d_m%d_bleu' \ % (self.conf.gen_method, k, m) # start reranking rerank_ind = { } # the ind is used to rerank the sentences that are in original order, for sGPN\dagger in grounding evaluation if self.anno_list_reranked == []: anno_list_reranked = self.anno_list_hypo else: anno_list_reranked = self.anno_list_reranked for (ind_te, anno) in enumerate( anno_list_reranked ): # anno: a dict for an image, 10 sentences are in 'gen_beam_search_10' sentences_gen = anno[self.conf.gen_method] sentences_ret = [] for ind_NN in range(k): ind_tr = self.NNimg_list[ind_te][ind_NN] sentences_ret += self.anno_list_ref[ind_tr]['sentences'] sim = [] for (ind_g, sen_gen) in enumerate(sentences_gen): b_s_arr = [] for (ind_r, sen_ret) in enumerate(sentences_ret): if method == 'cider': b_s_arr.append(cocoEvalCider.calculate_cider_sentence( \ ' '.join(sen_gen), ' '.join(sen_ret) ) ) else: b_s_arr.append(bs_util.calculate_bleu_sentence( \ sen_gen, sen_ret, self.conf.bleu_ngram, fpr=self.conf.fpr_bleu) ) b_s_arr.sort(reverse=True) sim.append(sum(b_s_arr[:m])) # Sort the sentence according to sim arg_sim = np.argsort(-np.array(sim)).tolist() anno[key_reranking] = [ sentences_gen[x] for x in arg_sim ] # put the sentences in the order of ranking score, each sentence is decompose to a list of words rerank_ind[anno['id']] = arg_sim if (ind_te + 1) % self.conf.num_show_finished == 0: logger.info('%d image reranking finished' % (ind_te + 1)) np.save( "consensus_rerank_ind.npy", rerank_ind ) # the index of sorted sentences; the index will be used in grounding evaluation self.anno_list_reranked = anno_list_reranked # list of dicts, each dict is an image fname_anno_list_reranked = os.path.join(fol_cache, 'anno_list_hypo_rerank_%s_%s.npy' \ % (self.conf.name_feat, self.conf.distance_metric) ) np.save( fname_anno_list_reranked, anno_list_reranked ) # save the sentences in the order of reranked, in key of 'rerank_caption_k60_m125_cider' # Write the statistics to hard disk and evaluate the performance if flag_eval: fname_coco_json = os.path.join(fol_cache, 'coco_json_%s_%s_%s.json' \ % (self.conf.name_feat, self.conf.distance_metric, method) ) #fout_eval = open(os.path.join(fol_cache, 'eval_stat_%s_%s_%s.txt' \ #% (self.conf.name_feat, self.conf.distance_metric, method) ), 'w') self._anno_genS2coco( anno_list_reranked, key_reranking, 0, fname_coco_json ) # 0: only save top-1 sentence after re-ranking coco = COCO(self.conf.fname_eval_ref) cocoRes = coco.loadRes(fname_coco_json) cocoEval = COCOEvalCap(coco, cocoRes) cocoEval.params['image_id'] = cocoRes.getImgIds() cocoEval.evaluate() # print output evaluation scores for metric, score in cocoEval.eval.items(): print('%s: %.3f' % (metric, score)) #print >>fout_eval, '%s: %.3f'%(metric, score) #fout_eval.close() self.cocoEval = cocoEval
def language_eval(dataset, align_pred, model_id, split, save_path, is_flickr=False): ''' evaluate the generated sentences ''' sys.path.append("misc/coco-caption") from pycocotools.coco import COCO from pycocoevalcap.eval import COCOEvalCap if is_flickr: annFile = 'misc/coco-caption/annotations/caption_flickr30k.json' else: annFile = 'misc/coco-caption/annotations/captions_val2014.json' coco = COCO(annFile) valids = coco.getImgIds() if not os.path.isdir('eval_results'): os.mkdir('eval_results') all_scores = {} num_oracle = len(align_pred[0]['caption']) num_test_img = len(align_pred) all_scores['Bleu_1'] = np.zeros((num_oracle, num_test_img)) all_scores['Bleu_2'] = np.zeros((num_oracle, num_test_img)) all_scores['Bleu_3'] = np.zeros((num_oracle, num_test_img)) all_scores['Bleu_4'] = np.zeros((num_oracle, num_test_img)) all_scores['CIDEr'] = np.zeros((num_oracle, num_test_img)) all_scores['METEOR'] = np.zeros((num_oracle, num_test_img)) all_scores['ROUGE_L'] = np.zeros((num_oracle, num_test_img)) all_scores['SPICE'] = np.zeros((num_oracle, num_test_img)) all_scores['subgraph_bleu_material'] = [] for sen_i in range(len(align_pred[0]['caption'])): # for each sentence ind cache_path = os.path.join( 'eval_results/', '.cache_' + model_id + '_' + split + str(sen_i) + '.json') preds = [] for img_j in range( len(align_pred )): # extract the sentence in same position for all images entry = { 'image_id': align_pred[img_j]['image_id'], 'caption': align_pred[img_j]['caption'][sen_i] } preds.append(entry) # filter results to only those in MSCOCO validation set (will be about a third) preds_filt = [p for p in preds if p['image_id'] in valids] print('using %d/%d predictions' % (len(preds_filt), len(preds))) json.dump(preds_filt, open( cache_path, 'w')) # serialize to temporary json file. Sigh, COCO API... 
cocoRes = coco.loadRes(cache_path) if sen_i == 0: # since every time cocoRes_ImgIds is the same, then only initiate cocoEval once cocoRes_ImgIds = cocoRes.getImgIds() cocoEval = COCOEvalCap(coco, cocoRes_ImgIds=cocoRes_ImgIds) #cocoEval.params['image_id'] = cocoRes.getImgIds() all_scores['image_id_list'] = list( cocoEval.gts.keys()) # fixed order of output image cocoEval.evaluate(cocoRes=cocoRes) for method in cocoEval.eval_scores.keys(): all_scores[method][sen_i, :] = np.array( cocoEval.eval_scores[method]).reshape(-1) all_scores['subgraph_bleu_material'].append( cocoEval.subgraph_training_bleu) # pick the bleu material of best subgraph in terms of individual sentence bleu score, # then re-compute the score over selected sentences top_k = len(align_pred[0]['caption']) if top_k != 1: print('\n\nThe following is top-{}: '.format(top_k)) bleu_dict = {'Bleu_1': [], 'Bleu_2': [], 'Bleu_3': [], 'Bleu_4': []} for metric in bleu_dict.keys(): best_ind = np.argmax(all_scores[metric][:top_k], axis=0) bleu_dict[metric] = cal_bleu( best_ind, all_scores['subgraph_bleu_material'][:top_k]) all_scores['bleu_dict'] = bleu_dict for b_i in range(5)[1:]: print('oracle {}: {}'.format( 'Bleu_' + str(b_i), bleu_dict['Bleu_' + str(b_i)][b_i - 1])) # pick maximum spice/cider/rouge/meteor score and average over images print('oracle spice: {}'.format( np.mean(np.max(all_scores['SPICE'][:top_k], axis=0)))) print('oracle cider: {}'.format( np.mean(np.max(all_scores['CIDEr'][:top_k], axis=0)))) print('oracle rouge: {}'.format( np.mean(np.max(all_scores['ROUGE_L'][:top_k], axis=0)))) print('oracle meteor: {}'.format( np.mean(np.max(all_scores['METEOR'][:top_k], axis=0)))) name = 'all_scores_{}_{}-subgraph.npy'.format( save_path[-1].split('-')[1].split('.')[0], len(align_pred[0]['caption'])) np.save(save_path[0] + '/' + save_path[1] + '/' + name, all_scores) print('\n{}'.format(save_path[0] + '/' + save_path[1] + '/' + name))
def evaluate():
    coco = COCO(VAL_CAP_FILE)
    coco_res = coco.loadRes(RESULTS_FILE)
    coco_eval = COCOEvalCap(coco, coco_res)
    coco_eval.params["image_id"] = coco_res.getImgIds()
    coco_eval.evaluate()
def generation_experiment(self, strategy, max_batch_size=1000): # Compute image descriptors. print 'Computing image descriptors' self.compute_descriptors() do_batches = (strategy['type'] == 'beam' and strategy['beam_size'] == 1) or \ (strategy['type'] == 'sample' and ('temp' not in strategy or strategy['temp'] in (1, float('inf'))) and ('num' not in strategy or strategy['num'] == 1)) num_images = len(self.images) batch_size = min(max_batch_size, num_images) if do_batches else 1 # Generate captions for all images. all_captions = [None] * num_images for image_index in xrange(0, num_images, batch_size): batch_end_index = min(image_index + batch_size, num_images) sys.stdout.write("\rGenerating captions for image %d/%d" % (image_index, num_images)) sys.stdout.flush() if do_batches: if strategy['type'] == 'beam' or \ ('temp' in strategy and strategy['temp'] == float('inf')): temp = float('inf') else: temp = strategy['temp'] if 'temp' in strategy else 1 output_captions, output_probs = self.captioner.sample_captions( self.descriptors[image_index:batch_end_index], temp=temp) for batch_index, output in zip( range(image_index, batch_end_index), output_captions): all_captions[batch_index] = output else: for batch_image_index in xrange(image_index, batch_end_index): captions, caption_probs = self.captioner.predict_caption( self.descriptors[batch_image_index], strategy=strategy) best_caption, max_log_prob = None, None for caption, probs in zip(captions, caption_probs): log_prob = gen_stats(probs)['log_p'] if best_caption is None or \ (best_caption is not None and log_prob > max_log_prob): best_caption, max_log_prob = caption, log_prob all_captions[batch_image_index] = best_caption sys.stdout.write('\n') # Compute the number of reference files as the maximum number of ground # truth captions of any image in the dataset. num_reference_files = 0 for captions in self.dataset.values(): if len(captions) > num_reference_files: num_reference_files = len(captions) if num_reference_files <= 0: raise Exception('No reference captions.') # Collect model/reference captions, formatting the model's captions and # each set of reference captions as a list of len(self.images) strings. exp_dir = '%s/generation' % self.cache_dir if not os.path.exists(exp_dir): os.makedirs(exp_dir) # For each image, write out the highest probability caption. model_captions = [''] * len(self.images) reference_captions = [([''] * len(self.images)) for _ in xrange(num_reference_files)] for image_index, image in enumerate(self.images): caption = self.captioner.sentence(all_captions[image_index]) model_captions[image_index] = caption for reference_index, (_, caption) in enumerate(self.dataset[image]): caption = ' '.join(caption) reference_captions[reference_index][image_index] = caption coco_image_ids = [ self.sg.image_path_to_id[image_path] for image_path in self.images ] generation_result = [{ 'image_id': self.sg.image_path_to_id[image_path], 'caption': model_captions[image_index] } for (image_index, image_path) in enumerate(self.images)] json_filename = '%s/generation_result.json' % self.cache_dir print 'Dumping result to file: %s' % json_filename with open(json_filename, 'w') as json_file: json.dump(generation_result, json_file) generation_result = self.sg.coco.loadRes(json_filename) coco_evaluator = COCOEvalCap(self.sg.coco, generation_result) coco_evaluator.params['image_id'] = coco_image_ids coco_evaluator.evaluate()
import argparse from pycocotools.coco import COCO from pycocoevalcap.eval import COCOEvalCap # the imports, parser creation and the hypothesis argument below are assumed additions: the original fragment only defines the reference argument but reads args.hypo_path parser = argparse.ArgumentParser() parser.add_argument('-hypo', '--hypo_path', type=str, required=True, help='path for the generated-captions (hypothesis) json file') parser.add_argument('-ref', '--ref_path', type=str, default='./data/annotations/captions_val2014.json', help='path for the reference (ground-truth) annotations json file') args = parser.parse_args() annFile = args.ref_path resFile = args.hypo_path print('reference: ', annFile) print('hypothesis: ', resFile) coco = COCO(annFile) cocoRes = coco.loadRes(resFile) # create cocoEval object by taking coco and cocoRes cocoEval = COCOEvalCap(coco, cocoRes) # the line below restricts evaluation to the images present in the result file; remove it to evaluate the full validation set cocoEval.params['image_id'] = cocoRes.getImgIds() # evaluate results; SPICE will take a few minutes the first time, but speeds up due to caching cocoEval.evaluate() print('\n\n-----results-----') for metric, score in cocoEval.eval.items(): print('%s: %.3f' % (metric, score))
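# Example invocation of the script above; the script name and file locations are
# placeholders, and --hypo_path is the assumed hypothesis flag from the parser setup:
#
#   python coco_caption_eval.py \
#       --ref_path ./data/annotations/captions_val2014.json \
#       --hypo_path ./results/captions_val2014_mymodel_results.json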
def main(args): # Image preprocessing transform = transforms.Compose([ transforms.RandomCrop(args.crop_size), transforms.RandomHorizontalFlip(), transforms.ToTensor(), transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)) ]) # Load vocabulary wrapper with open(args.vocab_path, 'rb') as f: vocab = pickle.load(f) glove = create_glove_dict() weights_matrix = create_weights_matrix(vocab, len(vocab), args.embed_size, glove) weights_matrix = torch.tensor(weights_matrix).detach().cpu() # load the evaluation data print(args.encoder_path) print(args.decoder_path) # Build models encoder = EncoderCNN(args.hidden_size, len(vocab)).eval().to(device) # eval mode (batchnorm uses moving mean/variance) decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab), args.num_layers, weights_matrix).to(device) # Load the trained model parameters if torch.cuda.is_available(): encoder.load_state_dict(torch.load(args.encoder_path)) decoder.load_state_dict(torch.load(args.decoder_path)) else: encoder.load_state_dict(torch.load(args.encoder_path, map_location='cpu')) decoder.load_state_dict(torch.load(args.decoder_path, map_location='cpu')) # perform evaluation here object_list = [ args.bottle_test, args.bus_test, args.couch_test, args.microwave_test, args.pizza_test, args.racket_test, args.suitcase_test, args.zebra_test ] object_names = [ 'bottle', 'bus', 'couch', 'microwave', 'pizza', 'racket', 'suitcase', 'zebra' ] # iterate through each of the held-out objects for object_class, name in zip(object_list, object_names): data_loader = get_loader(args.image_dir, object_class, vocab, transform, args.batch_size, shuffle=False, num_workers=args.num_workers) f1_count = 0 total_count = 0 results = [] id_occurences = {} # gets the results into a json file for the captions # Sums up the F1 scores and the total number of captions for j, (images, captions, lengths, img_ids) in enumerate(data_loader): for image, length, img_id in zip(images, lengths, img_ids): print(j) total_count += 1 image = image.view(1, *image.size()) image = image.to(device) #print(image.size()) with torch.no_grad(): feature = encoder(image) start_token = torch.tensor(vocab.word2idx['<start>']).to(device) sampled_ids = decoder.sample(feature, start_token) #sampled_ids = sampled_ids[0].cpu().numpy() # (1, max_seq_length) -> (max_seq_length) # Convert word_ids to words sampled_caption = [] for word_id in sampled_ids: word = vocab.idx2word[word_id] if word == '<end>': break sampled_caption.append(word) contained = False for word in sampled_caption: if word in object_names: contained = True if contained: f1_count += 1 # removes start token # not needed for parallel captioner # sentence = ' '.join(sampled_caption[1:]) sentence = ' '.join(sampled_caption) # for the general evaluation if str(img_id) not in id_occurences: results.append({ "image_id": int(img_id), "caption": sentence }) id_occurences[str(img_id)] = 1 #name = 'alexnet' with open(f'{name}_val_results', 'w') as outfile: json.dump(results, outfile) print("saved") # Evaluation section coco = COCO(object_class) cocoRes = coco.loadRes(f'{name}_val_results') # create cocoEval object by taking coco and cocoRes cocoEval = COCOEvalCap(coco, cocoRes, 'corpus') # please remove this line when evaluating the full validation set cocoEval.params['image_id'] = cocoRes.getImgIds() cocoEval.evaluate() # write the results to the object file with open(os.path.join(args.results_path, f'{name}_scores.txt'), 'a') as f: for metric, score in cocoEval.eval.items(): f.write('%s: %.3f \n' % (metric, score)) f.write('F1: %.3f' % (f1_count / total_count))
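# create_glove_dict() and create_weights_matrix() are called in main() but not shown. A
# common pattern, sketched here as an assumption (the glove.6B.300d.txt path is hypothetical
# and its dimensionality must match args.embed_size), is to load pre-trained GloVe vectors
# into a dict and build a |vocab| x embed_size matrix, with random vectors for words that
# have no pre-trained embedding.
import numpy as np

def create_glove_dict(glove_path='glove.6B.300d.txt'):
    glove = {}
    with open(glove_path, encoding='utf-8') as f:
        for line in f:
            parts = line.rstrip().split(' ')
            glove[parts[0]] = np.asarray(parts[1:], dtype=np.float32)
    return glove

def create_weights_matrix(vocab, vocab_size, embed_size, glove):
    weights = np.zeros((vocab_size, embed_size), dtype=np.float32)
    for idx in range(vocab_size):
        word = vocab.idx2word[idx]
        if word in glove:
            weights[idx] = glove[word]
        else:
            weights[idx] = np.random.normal(scale=0.6, size=(embed_size,)).astype(np.float32)
    return weights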
def language_eval(dataset, preds, preds_n, eval_kwargs, split): model_id = eval_kwargs['id'] eval_oracle = eval_kwargs.get('eval_oracle', 0) # create output dictionary out = {} if len(preds_n) > 0: # vocab size and novel sentences if 'coco' in dataset: dataset_file = 'data/dataset_coco.json' elif 'flickr30k' in dataset or 'f30k' in dataset: dataset_file = 'data/dataset_flickr30k.json' training_sentences = set([ ' '.join(__['tokens']) for _ in json.load(open(dataset_file))['images'] if not _['split'] in ['val', 'test'] for __ in _['sentences'] ]) generated_sentences = set([_['caption'] for _ in preds_n]) novels = generated_sentences - training_sentences out['novel_sentences'] = float(len(novels)) / len(preds_n) tmp = [_.split() for _ in generated_sentences] words = [] for _ in tmp: words += _ out['vocab_size'] = len(set(words)) # encoder.FLOAT_REPR = lambda o: format(o, '.3f') cache_path = os.path.join('eval_results/', '.cache_' + model_id + '_' + split + '.json') coco = getCOCO(dataset) valids = coco.getImgIds() # filter results to only those in MSCOCO validation set preds_filt = [p for p in preds if p['image_id'] in valids] mean_perplexity = sum([_['perplexity'] for _ in preds_filt]) / len(preds_filt) mean_entropy = sum([_['entropy'] for _ in preds_filt]) / len(preds_filt) print('using %d/%d predictions' % (len(preds_filt), len(preds))) json.dump(preds_filt, open(cache_path, 'w')) # serialize to temporary json file. Sigh, COCO API... cocoRes = coco.loadRes(cache_path) cocoEval = COCOEvalCap(coco, cocoRes) cocoEval.params['image_id'] = cocoRes.getImgIds() cocoEval.evaluate() for metric, score in cocoEval.eval.items(): out[metric] = score # Add mean perplexity out['perplexity'] = mean_perplexity out['entropy'] = mean_entropy imgToEval = cocoEval.imgToEval for k in list(imgToEval.values())[0]['SPICE'].keys(): if k != 'All': out['SPICE_' + k] = np.array( [v['SPICE'][k]['f'] for v in imgToEval.values()]) out['SPICE_' + k] = (out['SPICE_' + k][out['SPICE_' + k] == out['SPICE_' + k]]).mean() for p in preds_filt: image_id, caption = p['image_id'], p['caption'] imgToEval[image_id]['caption'] = caption if len(preds_n) > 0: import eval_multi cache_path_n = os.path.join( 'eval_results/', '.cache_' + model_id + '_' + split + '_n.json') spice_n = eval_multi.eval_spice_n(dataset, preds_n, model_id, split) out.update(spice_n['overall']) div_stats = eval_multi.eval_div_stats(dataset, preds_n, model_id, split) out.update(div_stats['overall']) if eval_oracle: oracle = eval_multi.eval_oracle(dataset, preds_n, model_id, split) out.update(oracle['overall']) else: oracle = None self_cider = eval_multi.eval_self_cider(dataset, preds_n, model_id, split) out.update(self_cider['overall']) with open(cache_path_n, 'w') as outfile: json.dump( { 'spice_n': spice_n, 'div_stats': div_stats, 'oracle': oracle, 'self_cider': self_cider }, outfile) out['bad_count_rate'] = sum([count_bad(_['caption']) for _ in preds_filt]) / float(len(preds_filt)) outfile_path = os.path.join('eval_results/', model_id + '_' + split + '.json') with open(outfile_path, 'w') as outfile: json.dump({'overall': out, 'imgToEval': imgToEval}, outfile) return out
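# The SPICE sub-score aggregation above uses the x == x comparison to drop NaN entries
# (images for which a sub-category such as Color or Size has no tuples) before averaging.
# A small self-contained illustration of why that indexing matches np.nanmean:
import numpy as np

scores = np.array([0.21, np.nan, 0.35, 0.18])
valid = scores == scores  # NaN != NaN, so the mask is False exactly at the NaN positions
assert np.isclose(scores[valid].mean(), np.nanmean(scores))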
def test(): word_index = dataset.tokenizer.word_index gen = dataset.generator(b_s, 'test', show_ids=True) m.load_weights('data/mpiimd_model.pkl') seen = 0 hypothesis_for_coco = [] references_for_coco = [] images_for_coco = [] while seen < dataset.nb_test_samples: [X_video, X_caption], Y_gt, snippet_ids = gen.next() Y_t = m.predict_on_batch([X_video, X_caption]) Y_t = np.argmax(Y_t, axis=-1) for t in range(dataset.max_caption_len - 1): X_caption[:, t + 1] = Y_t[:, t] if all(w == 'stoptoken' for w in [ word_index.keys()[word_index.values().index(Yi)] for Yi in X_caption[:, t + 1] ]): break Y_t = m.predict_on_batch([X_video, X_caption]) Y_t = np.argmax(Y_t, axis=-1) for pred, gt, snippet_id in zip(X_caption, Y_gt, snippet_ids): gt_caption = [] pred_caption = [] for gt_t in gt: gt_word = word_index.keys()[word_index.values().index(gt_t[0])] gt_caption.append(gt_word) if gt_word == 'stoptoken': break for pred_t in pred: pred_word = word_index.keys()[word_index.values().index( pred_t)] pred_caption.append(pred_word) if pred_word == 'stoptoken': break gt_caption = gt_caption[1:-1] pred_caption = pred_caption[1:-1] if not any(h for h in hypothesis_for_coco if h['image_id'] == snippet_id): images_for_coco.append({ "id": snippet_id, "url": "", "file_name": "" }) hypothesis_for_coco.append({ "image_id": snippet_id, "id": seen, "caption": ' '.join([ c.decode('utf-8').encode('ascii', 'ignore') for c in pred_caption ]) }) references_for_coco.append({ "image_id": snippet_id, "id": seen, "caption": ' '.join([ c.decode('utf-8').encode('ascii', 'ignore') for c in gt_caption ]) }) seen += 1 print('%d / %d - %s GT: %s' % (seen, dataset.nb_test_samples, snippet_id, ' '.join(gt_caption))) print('%d / %d - %s PR: %s' % (seen, dataset.nb_test_samples, snippet_id, ' '.join(pred_caption))) # Evaluation print(exp_name) import json json.dump(hypothesis_for_coco, open("%s_hypothesis.json" % exp_name, 'w')) json.dump( { 'images': images_for_coco, 'annotations': references_for_coco, 'type': 'captions', 'info': {}, 'licenses': [] }, open("%s_references.json" % exp_name, 'w')) sys.path.append('coco_caption') from pycocotools.coco import COCO from pycocoevalcap.eval import COCOEvalCap coco = COCO("%s_references.json" % exp_name) cocoRes = coco.loadRes("%s_hypothesis.json" % exp_name) cocoEval = COCOEvalCap(coco, cocoRes) cocoEval.evaluate() for metric, score in cocoEval.eval.items(): print('%s: %.3f' % (metric, score))
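# The test() loop above decodes greedily by re-running the model at every step and copying
# the argmax prediction for position t into the input at position t+1. A minimal standalone
# sketch of that feed-back pattern; predict_step is a stand-in for m.predict_on_batch and
# every name here is illustrative only.
import numpy as np

def greedy_feedback_decode(predict_step, start_id, stop_id, max_len):
    caption = np.full((max_len,), start_id, dtype=np.int64)
    for t in range(max_len - 1):
        scores = predict_step(caption)          # (max_len, vocab_size) scores per position
        caption[t + 1] = int(np.argmax(scores[t]))
        if caption[t + 1] == stop_id:
            break
    return caption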
def language_eval(dataset, preds, model_id, split): import sys if 'coco' in dataset: sys.path.append("coco-caption") annFile = 'coco-caption/annotations/captions_val2014.json' else: sys.path.append("f30k-caption") annFile = 'f30k-caption/annotations/dataset_flickr30k.json' from pycocotools.coco import COCO from pycocoevalcap.eval import COCOEvalCap encoder.FLOAT_REPR = lambda o: format(o, '.3f') if not os.path.isdir('eval_results'): os.mkdir('eval_results') cache_path = os.path.join('eval_results/', model_id + '_' + split + '.json') coco = COCO(annFile) valids = coco.getImgIds() preds_filt = [p for p in preds if p['image_id'] in valids] print('using %d/%d predictions' % (len(preds_filt), len(preds))) json.dump(preds_filt, open(cache_path, 'w')) # serialize to temporary json file. Sigh, COCO API... cocoRes = coco.loadRes(cache_path) cocoEval = COCOEvalCap(coco, cocoRes) cocoEval.params['image_id'] = cocoRes.getImgIds() cocoEval.evaluate() # create output dictionary out = {} for metric, score in cocoEval.eval.items(): out[metric] = score imgToEval = cocoEval.imgToEval # collect SPICE_sub_score for k in imgToEval.values()[0]['SPICE'].keys(): if k != 'All': out['SPICE_' + k] = np.array( [v['SPICE'][k]['f'] for v in imgToEval.values()]) out['SPICE_' + k] = (out['SPICE_' + k][out['SPICE_' + k] == out['SPICE_' + k]]).mean() for p in preds_filt: image_id, caption = p['image_id'], p['caption'] imgToEval[image_id]['caption'] = caption for i in range(len(preds)): if preds[i]['image_id'] in imgToEval: preds[i]['eval'] = imgToEval[preds[i]['image_id']] # filter results to only those in MSCOCO validation set (will be about a third) json.dump( preds, open( os.path.join('eval_results/', model_id + '_' + split + '_nofilt.json'), 'w')) with open(cache_path, 'w') as outfile: json.dump({'overall': out, 'imgToEval': imgToEval}, outfile) return out
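# A short sketch of reading back the files that language_eval() above writes: the cache file
# holds {'overall': metrics, 'imgToEval': per-image scores}, and the *_nofilt.json file keeps
# every prediction with its per-image eval attached. The model_id and split values below are
# placeholders.
import json
import os

model_id, split = 'mymodel', 'val'
with open(os.path.join('eval_results/', model_id + '_' + split + '.json')) as f:
    cached = json.load(f)
print('overall CIDEr:', cached['overall'].get('CIDEr'))
for image_id, scores in list(cached['imgToEval'].items())[:3]:
    print(image_id, scores.get('CIDEr'), scores.get('caption'))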
def language_eval(type, preds, model_id, split): import sys if 'coco' in type: annFile = 'coco-caption/annotations/captions_val2014.json' sys.path.append("coco-caption") print("Load reference file from: {}".format(annFile)) from pycocotools.coco import COCO from pycocoevalcap.eval import COCOEvalCap elif '30k' in type: annFile = 'coco-caption/annotations/flickr30k_val.json' sys.path.append("coco-caption") print("Load reference file from: {}".format(annFile)) from pycocotools.coco import COCO from pycocoevalcap.eval import COCOEvalCap elif 'zh' in type: annFile = 'data/aic_i2t/eval_reference.json' sys.path.append("AI_Challenger/Evaluation/caption_eval") print("Load reference file from: {}".format(annFile)) from coco_caption.pycxtools.coco import COCO from coco_caption.pycxevalcap.eval import COCOEvalCap else: raise Exception('Current eval type is not recognizable.') encoder.FLOAT_REPR = lambda o: format(o, '.3f') if not os.path.isdir('eval_results'): os.mkdir('eval_results') cache_path = os.path.join('eval_results/', type + '_' + model_id + '_' + split + '.json') print("Cache path is: " + cache_path) coco = COCO(annFile) valids = coco.getImgIds() # filter results to only those in the reference validation set (for COCO this keeps about a third) if 'coco' in type: preds_filt = [p for p in preds if p['image_id'] in valids] print('using %d/%d predictions' % (len(preds_filt), len(preds))) json.dump(preds_filt, open(cache_path, 'w')) # serialize to temporary json file. Sigh, COCO API... elif '30k' in type: preds_filt = [{ 'caption': p['caption'], 'image_id': str(p['image_id']) } for p in preds if p['image_id'] in valids] print('using %d/%d predictions' % (len(preds_filt), len(preds))) json.dump(preds_filt, open(cache_path, 'w')) # serialize the filtered predictions so loadRes below can read them else: json.dump(preds, open(cache_path, 'w')) # serialize to temporary json file. Sigh, COCO API... cocoRes = coco.loadRes(cache_path) cocoEval = COCOEvalCap(coco, cocoRes) cocoEval.params['image_id'] = cocoRes.getImgIds() print(len(set(cocoRes.getImgIds()) & set(coco.getImgIds()))) cocoEval.evaluate() # create output dictionary out = {} for metric, score in cocoEval.eval.items(): out[metric] = score imgToEval = cocoEval.imgToEval # for p in preds: # image_id, caption = p['image_id'], p['caption'] # imgToEval[image_id]['caption'] = caption with open(cache_path, 'w') as outfile: json.dump({'overall': out, 'imgToEval': imgToEval}, outfile) return out
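# A hypothetical call showing the inputs language_eval() above expects: `type` selects the
# reference annotations ('coco', '30k', or 'zh'), and preds is the model's list of
# {"image_id", "caption"} dicts. The id, caption and model name below are placeholders.
preds = [
    {"image_id": 184613, "caption": "a group of people standing around a table"},
]
scores = language_eval('coco', preds, 'demo_model', 'val')
print(scores.get('CIDEr'))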