Example #1
def language_eval(dataset, preds, model_id, split):
    import sys
    if 'coco' in dataset:
        sys.path.append("coco-caption")
        annFile = 'coco-caption/annotations/captions_val2014.json'
    else:
        # TODO: NYTIMES
        if split == 'val':
            annFile = './data/val.json'
            with open(annFile, 'rb') as f:
                dataset = json.load(f)
        else:
            annFile = './data/test.json'
            with open(annFile, 'rb') as f:
                dataset = json.load(f)

        # TODO: BREAKINGNEWS
        # with open("/home/abiten/Desktop/Thesis/newspaper/breakingnews/bnews_caps.json", "rb") as f: dataset = json.load(f)

        id_to_ix = {v['cocoid']: ix for ix, v in enumerate(dataset)}
        hypo = {v['image_id']: [v['caption']] for v in preds}
        ref = {
            k: [i['raw'] for i in dataset[id_to_ix[k]]['sentences']]
            for k in hypo.keys()
        }
        final_scores = evaluate(ref, hypo)
        print('Bleu_1:\t', final_scores['Bleu_1'])
        print('Bleu_2:\t', final_scores['Bleu_2'])
        print('Bleu_3:\t', final_scores['Bleu_3'])
        print('Bleu_4:\t', final_scores['Bleu_4'])
        # print('METEOR:\t', final_scores['METEOR'])
        print('ROUGE_L:', final_scores['ROUGE_L'])
        print('CIDEr:\t', final_scores['CIDEr'])
        # print('Spice:\t', final_scores['Spice'])
        return final_scores

        # sys.path.append("f30k-caption")
        # annFile = 'f30k-caption/annotations/dataset_flickr30k.json'
    from pycocotools.coco import COCO
    from pycocoevalcap.eval import COCOEvalCap

    encoder.FLOAT_REPR = lambda o: format(o, '.3f')  # assumes a module-level "from json import encoder"

    if not os.path.isdir('eval_results'):
        os.mkdir('eval_results')
    cache_path = os.path.join('eval_results/',
                              model_id + '_' + split + '.json')

    coco = COCO(annFile)
    valids = coco.getImgIds()

    # filter results to only those in MSCOCO validation set (will be about a third)
    preds_filt = [p for p in preds if p['image_id'] in valids]
    print('using %d/%d predictions' % (len(preds_filt), len(preds)))
    json.dump(preds_filt,
              open(cache_path,
                   'w'))  # serialize to temporary json file. Sigh, COCO API...

    cocoRes = coco.loadRes(cache_path)
    cocoEval = COCOEvalCap(coco, cocoRes)
    cocoEval.params['image_id'] = cocoRes.getImgIds()
    cocoEval.evaluate()

    # create output dictionary
    out = {}
    for metric, score in cocoEval.eval.items():
        out[metric] = score

    imgToEval = cocoEval.imgToEval
    for p in preds_filt:
        image_id, caption = p['image_id'], p['caption']
        imgToEval[image_id]['caption'] = caption
    with open(cache_path, 'w') as outfile:
        json.dump({'overall': out, 'imgToEval': imgToEval}, outfile)

    return out
Example #2
def language_eval(dataset, preds, preds_n, eval_kwargs, split):
    model_id = eval_kwargs['id']
    eval_oracle = eval_kwargs.get('eval_oracle', 0)

    import sys
    sys.path.append("coco-caption")
    annFile = 'coco-caption/annotations/captions_val2014.json'
    from pycocotools.coco import COCO
    from pycocoevalcap.eval import COCOEvalCap

    # encoder.FLOAT_REPR = lambda o: format(o, '.3f')

    if not os.path.isdir('eval_results'):
        os.mkdir('eval_results')
    cache_path = os.path.join('eval_results/',
                              '.cache_' + model_id + '_' + split + '.json')

    coco = COCO(annFile)
    valids = coco.getImgIds()

    # filter results to only those in MSCOCO validation set (will be about a third)
    preds_filt = [p for p in preds if p['image_id'] in valids]
    mean_perplexity = sum([_['perplexity']
                           for _ in preds_filt]) / len(preds_filt)
    mean_entropy = sum([_['entropy'] for _ in preds_filt]) / len(preds_filt)
    print('using %d/%d predictions' % (len(preds_filt), len(preds)))
    json.dump(preds_filt,
              open(cache_path,
                   'w'))  # serialize to temporary json file. Sigh, COCO API...

    cocoRes = coco.loadRes(cache_path)
    cocoEval = COCOEvalCap(coco, cocoRes)
    cocoEval.params['image_id'] = cocoRes.getImgIds()
    cocoEval.evaluate()

    # create output dictionary
    out = {}
    for metric, score in cocoEval.eval.items():
        out[metric] = score
    # Add mean perplexity
    out['perplexity'] = mean_perplexity
    out['entropy'] = mean_entropy

    imgToEval = cocoEval.imgToEval
    for k in list(imgToEval.values())[0]['SPICE'].keys():
        if k != 'All':
            out['SPICE_' + k] = np.array(
                [v['SPICE'][k]['f'] for v in imgToEval.values()])
            out['SPICE_' + k] = (out['SPICE_' + k][out['SPICE_' +
                                                       k] == out['SPICE_' +
                                                                 k]]).mean()
    for p in preds_filt:
        image_id, caption = p['image_id'], p['caption']
        imgToEval[image_id]['caption'] = caption

    if len(preds_n) > 0:
        cache_path_n = os.path.join(
            'eval_results/', '.cache_' + model_id + '_' + split + '_n.json')
        spice_n = eval_multi.eval_spice_n(preds_n, model_id, split)
        out.update(spice_n['overall'])
        div_stats = eval_multi.eval_div_stats(preds_n, model_id, split)
        out.update(div_stats['overall'])
        if eval_oracle:
            oracle = eval_multi.eval_oracle(preds_n, model_id, split)
            out.update(oracle['overall'])
        else:
            oracle = None
        with open(cache_path_n, 'w') as outfile:
            json.dump(
                {
                    'spice_n': spice_n,
                    'div_stats': div_stats,
                    'oracle': oracle
                }, outfile)

    out['bad_count_rate'] = sum([count_bad(_['caption'])
                                 for _ in preds_filt]) / float(len(preds_filt))
    outfile_path = os.path.join('eval_results/',
                                model_id + '_' + split + '.json')
    with open(outfile_path, 'w') as outfile:
        json.dump({'overall': out, 'imgToEval': imgToEval}, outfile)

    return out
Example #3
def mscoco_eval(test_tokenized_sent_groups, generated_sents_tokenized):
    with open(config.mscoco_eval_dir + '/annotations/references.json',
              'w',
              encoding='utf-8') as f:
        json.dump(
            {
                'info': {
                    'description': None,
                    'url': None,
                    'version': None,
                    'year': None,
                    'contributor': None,
                    'date_created': None
                },
                'images': [{
                    'license': None,
                    'url': None,
                    'file_name': None,
                    'id': image_id,
                    'width': None,
                    'date_captured': None,
                    'height': None
                } for image_id in range(len(test_tokenized_sent_groups))],
                'licenses': [],
                'type':
                'captions',
                'annotations': [{
                    'image_id': image_id,
                    'id': caption_id,
                    'caption': ' '.join(sent)
                } for (caption_id, (image_id, sent)) in enumerate(
                    (image_id, sent)
                    for (image_id,
                         sent_group) in enumerate(test_tokenized_sent_groups)
                    for sent in sent_group)]
            }, f)

    with open(config.mscoco_eval_dir + '/results/generated.json',
              'w',
              encoding='utf-8') as f:
        json.dump([{
            'image_id': image_id,
            'caption': ' '.join(sent)
        } for (image_id, sent) in enumerate(generated_sents_tokenized)], f)

    coco = COCO(config.mscoco_eval_dir + '/annotations/references.json')
    cocoRes = coco.loadRes(config.mscoco_eval_dir + '/results/generated.json')
    cocoEval = COCOEvalCap(coco, cocoRes)
    cocoEval.evaluate()
    return {
        'Bleu_1': cocoEval.eval['Bleu_1'],
        'Bleu_2': cocoEval.eval['Bleu_2'],
        'Bleu_3': cocoEval.eval['Bleu_3'],
        'Bleu_4': cocoEval.eval['Bleu_4'],
        'METEOR': cocoEval.eval['METEOR'],
        'ROUGE_L': cocoEval.eval['ROUGE_L'],
        'CIDEr': cocoEval.eval['CIDEr'],
        'SPICE': cocoEval.eval['SPICE'],
        'WMD': cocoEval.eval['WMD'],
        'Bleu_1_all': [item['Bleu_1'] for item in cocoEval.evalImgs],
        'Bleu_2_all': [item['Bleu_2'] for item in cocoEval.evalImgs],
        'Bleu_3_all': [item['Bleu_3'] for item in cocoEval.evalImgs],
        'Bleu_4_all': [item['Bleu_4'] for item in cocoEval.evalImgs],
        'METEOR_all': [item['METEOR'] for item in cocoEval.evalImgs],
        'ROUGE_L_all': [item['ROUGE_L'] for item in cocoEval.evalImgs],
        'CIDEr_all': [item['CIDEr'] for item in cocoEval.evalImgs],
        'SPICE_all': [item['SPICE']['All']['f'] for item in cocoEval.evalImgs],
        'WMD_all': [item['WMD'] for item in cocoEval.evalImgs],
    }
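
For reference, a hypothetical call to mscoco_eval with toy tokenized data; the sentences below are illustrative only, and the call assumes config.mscoco_eval_dir points at a writable directory that already contains annotations/ and results/ subfolders:

reference_groups = [
    [['a', 'cat', 'sits', 'on', 'a', 'mat'], ['a', 'cat', 'on', 'a', 'mat']],
    [['a', 'dog', 'runs', 'in', 'a', 'field']],
]
generated = [
    ['a', 'cat', 'on', 'the', 'mat'],
    ['a', 'dog', 'running', 'in', 'a', 'field'],
]
scores = mscoco_eval(reference_groups, generated)  # image ids are the list indices
print(scores['Bleu_4'], scores['CIDEr'])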
Example #4
def score_generation(gt_filename=None, generation_result=None):

    coco = COCO(gt_filename)
    generation_coco = coco.loadRes(generation_result)
    coco_evaluator = COCOEvalCap(coco, generation_coco, 'noc_test_freq')
    coco_evaluator.evaluate()
Example #5
def coco_metrics(val_captions_file, result_captions, metric):
    coco = COCO(val_captions_file)
    cocoRes = coco.loadRes(result_captions)
    cocoEval = COCOEvalCap(coco, cocoRes)
    cocoEval.evaluate()
    return cocoEval.eval[metric]
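
A minimal usage sketch for coco_metrics; the file paths are placeholders for a standard MS-COCO annotation file and a results file in the loadRes format:

val_captions_file = 'annotations/captions_val2014.json'    # ground-truth captions (placeholder path)
result_captions = 'results/captions_val2014_results.json'  # generated captions (placeholder path)
cider = coco_metrics(val_captions_file, result_captions, 'CIDEr')
print('CIDEr: %.4f' % cider)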
Example #6
def coco_eval(model, args, epoch, split=None):
    '''
    model: trained model to be evaluated
    args: pre-set parameters
    epoch: epoch #, for disp purpose
    '''

    model.eval()

    # Validation images are required to be resized to 224x224 already
    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
    ])

    # Load the vocabulary
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    # Wrapper the COCO VAL dataset
    eval_data_loader = torch.utils.data.DataLoader(
        CocoImageFolder(args.image_dir,
                        args.caption_path,
                        transform,
                        split=split),
        batch_size=args.eval_size,
        shuffle=False,
        num_workers=args.num_workers,
        drop_last=False)

    # Generated captions to be compared with GT
    results = []
    print(
        '---------------------Start evaluation on MS-COCO dataset %s-----------------------'
        % split)
    for i, (images, image_ids, filename) in enumerate(eval_data_loader):
        images = to_var(images)

        if torch.cuda.device_count() > 1:
            device_ids = range(torch.cuda.device_count())
            encoder_parallel = nn.DataParallel(model.encoder,
                                               device_ids=device_ids)
            features, probs = encoder_parallel(images)
        else:
            features, probs = model.encoder(images)
        if args.pattern == 'truelabel':
            if args.dataset == 'ucm' or args.dataset == 'sydney':
                preds = torch.LongTensor(image_ids) // 100
                preds += 4
                preds = to_var(preds).unsqueeze(1)
            elif args.dataset == 'rsicd':
                trueLabels = [
                    args.vocab(str(fn).split('_')[0]) for fn in filename
                ]
                trueLabels = torch.LongTensor(trueLabels)
                preds = to_var(trueLabels).unsqueeze(1)
        elif args.pattern == 'label':
            preds = torch.max(probs.data, 1)[1].unsqueeze(1)
            preds += 4
            preds = to_var(preds)
        else:
            preds = None

        # generated_captions, _ = model.decoder.sample(features)
        generated_captions, _ = model.decoder.sample(
            features, preds, args.pattern)  #sat(adaptive)
        # generated_captions, _ = model.decoder.sample(features, probs, args.pattern)  #fc_lstm

        captions = generated_captions.cpu().data.numpy()

        # Build caption based on Vocabulary and the '<end>' token
        for image_idx in range(captions.shape[0]):

            sampled_ids = captions[image_idx]
            sampled_caption = []

            for word_id in sampled_ids:

                word = vocab.idx2word[word_id]
                if word == '<end>':
                    break
                else:
                    sampled_caption.append(word)

            sentence = ' '.join(sampled_caption)

            temp = {'image_id': int(image_ids[image_idx]), 'caption': sentence}
            results.append(temp)

        # Disp evaluation process
        if (i + 1) % 100 == 0:
            print('[%d/%d]' % ((i + 1), len(eval_data_loader)))

    print(
        '------------------------Caption Generated-------------------------------------'
    )

    # Evaluate the results based on the COCO API
    # name = str(args.yml).split('.')[0].split('/')[-1]
    if not os.path.exists(os.path.join(args.checkpoint_path, 'results')):
        os.mkdir(os.path.join(args.checkpoint_path, 'results'))
    if split == 'val':
        resFile = os.path.join(
            args.checkpoint_path, 'results',
            args.dataset + '-' + '{0:03d}'.format(epoch) + '.json')
    else:
        resFile = os.path.join(
            args.checkpoint_path, 'results',
            args.dataset + '-' + split + '-{0:03d}'.format(epoch) + '.json')
        # resFile = os.path.join(args.checkpoint_path,
        #                        args.dataset + '-' + split + '.json')
    json.dump(results, open(resFile, 'w'), indent=4)

    annFile = args.caption_val_path
    coco = COCO(annFile)
    cocoRes = coco.loadRes(resFile)

    cocoEval = COCOEvalCap(coco, cocoRes)
    cocoEval.params['image_id'] = cocoRes.getImgIds()
    cocoEval.evaluate()

    # Get CIDEr score for validation evaluation
    cider = 0.
    print(
        '-----------Evaluation performance on MS-COCO validation dataset for Epoch %d----------'
        % (epoch))
    for metric, score in cocoEval.eval.items():

        print('%s: %.4f' % (metric, score))
        if metric == 'CIDEr':
            cider = score

    return cider, cocoEval.eval
Example #7
from pycocotools.coco import COCO
from pycocoevalcap.eval import COCOEvalCap

annotation_file = 'captions_val2014.json'
results_file = 'captions_val2014_fakecap_results.json'

# create coco object and coco_result object
coco = COCO(annotation_file)
coco_result = coco.loadRes(results_file)

# create coco_eval object by taking coco and coco_result
coco_eval = COCOEvalCap(coco, coco_result, exclude_scorers=['spice'])

# evaluate on a subset of images by setting
# coco_eval.params['image_id'] = coco_result.getImgIds()
# please remove this line when evaluating the full validation set
coco_eval.params['image_id'] = coco_result.getImgIds()

# evaluate results
# SPICE will take a few minutes the first time, but speeds up due to caching
coco_eval.evaluate()

# print output evaluation scores
for metric, score in coco_eval.eval.items():
    print(f'{metric}: {score:.3f}')
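
The results file passed to coco.loadRes() above is simply a JSON array of {"image_id", "caption"} records, the same structure several of the other examples serialize with json.dump(). A minimal sketch of producing such a file (ids and captions are illustrative; each image_id must exist in the annotation file):

import json

fake_results = [
    {'image_id': 1, 'caption': 'a black and white photo of a train on the tracks'},
    {'image_id': 2, 'caption': 'a group of people standing on a beach'},
]
with open('captions_val2014_fakecap_results.json', 'w') as f:
    json.dump(fake_results, f)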
Example #8
def evaluate(beam_size):
    """
    Evaluation

    :param beam_size: beam size at which to generate captions for evaluation
    :return: BLEU-4 score
    """
    # Test Dataset
    testDataset = CaptionDatasetFastText(data_folder,
                                         data_name,
                                         'TEST',
                                         transform=transforms.Compose(
                                             [normalize]))
    # DataLoader
    loader = torch.utils.data.DataLoader(testDataset,
                                         batch_size=1,
                                         shuffle=False,
                                         num_workers=1,
                                         pin_memory=True,
                                         collate_fn=my_collate_test)

    # TODO: Batched Beam Search
    # Therefore, do not use a batch_size greater than 1 - IMPORTANT!

    # Lists to store references (true captions), and hypothesis (prediction) for each image
    # If for n images, we have n hypotheses, and references a, b, c... for each image, we need -
    # references = [[ref1a, ref1b, ref1c], [ref2a, ref2b], ...], hypotheses = [hyp1, hyp2, ...]
    references = list()
    hypotheses = list()
    #Fields necessary for json used to compute the chair metric
    resultJson = {}
    resultJson['overall'] = {
        'Bleu_1': 0,
        'Bleu_2': 0,
        'Bleu_3': 0,
        'Bleu_4': 0,
        'METEOR': 0,
        'CIDEr': 0,
        'SPICE': 0,
        'ROUGE_L': 0,
    }
    captionsJson = []

    #Text file with captions
    captionOutFile = open('evalCaptions.txt', 'w')

    synonyms = get_word_synonyms()

    # For each image
    for i, (tensor_fg, img_bg, caps, caplens, allcaps) in enumerate(
            tqdm(loader, desc="EVALUATING AT BEAM SIZE " + str(beam_size))):

        #Only generate one caption per image, a limitation of the coco evaluation code (only one result per id)
        if i % 5 != 0:
            continue

        imgId = testDataset.getImgId(i)

        if tensor_fg is None:
            continue

        captionString = ' '.join([
            rev_word_map[caps[0][idx].item()] for idx in range(caplens[0])
            if rev_word_map[caps[0][idx].item()] != '<unk>'
        ][1:-1])

        hasPerson = False

        for synonym in synonyms[0]:
            pattern = r"\b{}s?\b".format(synonym)
            if re.search(pattern, captionString) is not None:
                hasPerson = True

        if hasPerson is False:
            continue

        k = beam_size

        # Move to GPU device, if available
        tensor_fg = tensor_fg.to(device)  # (1, 3, 256, 256)
        img_bg = img_bg.to(device)  # (1, 3, 256, 256)

        # Encode
        encoder_out = encoder(
            tensor_fg,
            img_bg)  # (1, enc_image_size, enc_image_size, encoder_dim)
        enc_image_size = encoder_out.size(1)
        encoder_dim = encoder_out.size(3)

        # Flatten encoding
        encoder_out = encoder_out.view(
            1, -1, encoder_dim)  # (1, num_pixels, encoder_dim)
        num_pixels = encoder_out.size(1)

        # We'll treat the problem as having a batch size of k
        encoder_out = encoder_out.expand(
            k, num_pixels, encoder_dim)  # (k, num_pixels, encoder_dim)

        # Tensor to store top k previous words at each step; now they're just <start>
        k_prev_words = torch.LongTensor([[word_map['<start>']]] * k).to(
            device)  # (k, 1)

        # Tensor to store top k sequences; now they're just <start>
        seqs = k_prev_words  # (k, 1)

        # Tensor to store top k sequences' scores; now they're just 0
        top_k_scores = torch.zeros(k, 1).to(device)  # (k, 1)

        # Lists to store completed sequences and scores
        complete_seqs = list()
        complete_seqs_scores = list()

        # Start decoding
        step = 1
        h, c = decoder.init_hidden_state(encoder_out)

        # s is a number less than or equal to k, because sequences are removed from this process once they hit <end>
        while True:

            embeddings = decoder.embedding(k_prev_words).squeeze(
                1)  # (s, embed_dim)

            awe, _ = decoder.attention(encoder_out,
                                       h)  # (s, encoder_dim), (s, num_pixels)

            gate = decoder.sigmoid(
                decoder.f_beta(h))  # gating scalar, (s, encoder_dim)
            awe = gate * awe

            h, c = decoder.decode_step(torch.cat([embeddings, awe], dim=1),
                                       (h, c))  # (s, decoder_dim)

            scores = decoder.fc(h)  # (s, vocab_size)
            scores = F.log_softmax(scores, dim=1)

            # Add
            scores = top_k_scores.expand_as(scores) + scores  # (s, vocab_size)

            # For the first step, all k points will have the same scores (since same k previous words, h, c)
            if step == 1:
                top_k_scores, top_k_words = scores[0].topk(k, 0, True,
                                                           True)  # (s)
            else:
                # Unroll and find top scores, and their unrolled indices
                top_k_scores, top_k_words = scores.view(-1).topk(
                    k, 0, True, True)  # (s)

            # Convert unrolled indices to actual indices of scores
            prev_word_inds = top_k_words // vocab_size  # (s), integer (floor) division keeps valid indices
            next_word_inds = top_k_words % vocab_size  # (s)

            # Add new words to sequences
            seqs = torch.cat(
                [seqs[prev_word_inds],
                 next_word_inds.unsqueeze(1)], dim=1)  # (s, step+1)

            # Which sequences are incomplete (didn't reach <end>)?
            incomplete_inds = [
                ind for ind, next_word in enumerate(next_word_inds)
                if next_word != word_map['<end>']
            ]
            complete_inds = list(
                set(range(len(next_word_inds))) - set(incomplete_inds))

            # Set aside complete sequences
            if len(complete_inds) > 0:
                complete_seqs.extend(seqs[complete_inds].tolist())
                complete_seqs_scores.extend(top_k_scores[complete_inds])
            k -= len(complete_inds)  # reduce beam length accordingly

            # Proceed with incomplete sequences
            if k == 0:
                break
            seqs = seqs[incomplete_inds]
            h = h[prev_word_inds[incomplete_inds]]
            c = c[prev_word_inds[incomplete_inds]]
            encoder_out = encoder_out[prev_word_inds[incomplete_inds]]
            top_k_scores = top_k_scores[incomplete_inds].unsqueeze(1)
            k_prev_words = next_word_inds[incomplete_inds].unsqueeze(1)

            # Break if things have been going on too long
            if step > 50:
                break
            step += 1

        if len(complete_seqs_scores) == 0:
            print("Skipping item with no scores")
            continue

        seqIdx = complete_seqs_scores.index(max(complete_seqs_scores))
        seq = complete_seqs[seqIdx]

        # References
        img_caps = allcaps[0].tolist()
        img_captions = list(
            map(
                lambda c: [
                    w for w in c if w not in {
                        word_map['<start>'], word_map['<end>'], word_map[
                            '<pad>']
                    }
                ], img_caps))  # remove <start> and pads
        references.append(img_captions)

        # Hypotheses
        hypo = [
            w for w in seq if w not in
            {word_map['<start>'], word_map['<end>'], word_map['<pad>']}
        ]
        hypotheses.append(hypo)
        captionWords = [rev_word_map[item] for item in hypo]
        captionString = ' '.join(captionWords)
        resultDict = {
            'image_id': imgId,
            'caption': captionString,
            'Bleu_1': 0,
            'Bleu_2': 0,
            'Bleu_3': 0,
            'Bleu_4': 0,
            'METEOR': 0,
            'CIDEr': 0,
            'SPICE': 0,
            'ROUGE_L': 0,
        }

        captionsJson.append(resultDict)

        assert len(references) == len(hypotheses)

        #Print some captions and their image ids
        if i % 1000 == 0:
            captionOutFile.write('Image ID: {}\n'.format(imgId))
            captionOutFile.write('Caption: {}\n'.format(captionString))

    coco = COCO('testGTCaptions.json')
    cocoRes = coco.loadRes(captionsJson)
    cocoEval = COCOEvalCap(coco, cocoRes)
    cocoEval.params['image_id'] = cocoRes.getImgIds()
    cocoEval.evaluate()

    # Save results to json file for chair metric
    resultJson['imgToEval'] = captionsJson
    with open('evalCaptions.json', 'w') as fp:
        json.dump(resultJson, fp)
    # Calculate BLEU-4 scores
    bleu4 = corpus_bleu(references, hypotheses)

    return bleu4
Example #9
def evaluate(dataset,
             predictions,
             lemmatizer_path=None,
             extended=False,
             embeddings_file=None,
             downcase=False):
    data = dataset[DATA_KEY]
    comm = lemmatizer(lemmatizer_path) if lemmatizer_path else None
    f1 = exact_match = total = 0

    n_unanswered = 0
    datum_count = 0
    for datum in data:
        for qa in datum[DOC_KEY][QAS_KEY]:
            total += 1
            if qa[ID_KEY] not in predictions:
                n_unanswered += 1
                continue
            ground_truths = list(map(lambda x: x[TXT_KEY], qa[ANS_KEY]))
            prediction = predictions[qa[ID_KEY]]
            exact_match += metric_max_over_ground_truths(exact_match_score,
                                                         prediction,
                                                         ground_truths,
                                                         comm=comm)
            f1 += metric_max_over_ground_truths(f1_score,
                                                prediction,
                                                ground_truths,
                                                comm=comm)
        datum_count += 1
    print("There were {} unanswered instances".format(n_unanswered))
    exact_match_all = 100.0 * exact_match / total
    f1_all = 100.0 * f1 / total
    assert exact_match_all <= f1_all
    scores = {'exact_match': exact_match_all, 'f1': f1_all}

    exact_match_ans = 100.0 * exact_match / (total - n_unanswered)
    f1_ans = 100.0 * f1 / (total - n_unanswered)
    assert exact_match_ans <= f1_ans
    scores['exact_match_ans'] = exact_match_ans
    scores['f1_ans'] = f1_ans

    if extended:
        from pycocoevalcap.eval import COCOEvalCap
        from embedding_eval import EmbeddingEval

        # COCO
        ground = {}
        for id, ans in to_id_answertxt(dataset).items():
            normalized_ans = []
            for a in ans:
                normalized_ans.append(normalize_answer(a, comm))
            ground[id] = normalized_ans
        _predictions = {
            id: [normalize_answer(ans, comm)]
            for id, ans in predictions.items()
        }
        cocoEval = COCOEvalCap(ground, _predictions)
        cocoEval.evaluate()

        # Embeddings evaluation
        embEval = EmbeddingEval(ground, _predictions, embeddings_file,
                                downcase)
        embEval.evaluate()

        #scores = {**scores, **cocoEval.eval, **embEval.eval}  # only python3.5
        scores.update(cocoEval.eval)
        scores.update(embEval.eval)

    return scores
Example #10
def beam_evaluate_trans(data_name, checkpoint_file, data_folder, beam_size,
                        outdir):
    """
    Evaluation
    :param data_name: name of the data files
    :param checkpoint_file: which checkpoint file to use
    :param data_folder: folder where data is stored
    :param beam_size: beam size at which to generate captions for evaluation
    :param outdir: place where the outputs are stored, so the checkpoint file
    :return: Official MSCOCO evaluator scores - bleu4, cider, rouge, meteor
    """
    global word_map
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    def load_model():
        # Load model using checkpoint file provided
        torch.nn.Module.dump_patches = True
        checkpoint = torch.load(os.path.join(outdir, checkpoint_file),
                                map_location=device)
        decoder = checkpoint['decoder']
        decoder = decoder.to(device)
        decoder.eval()
        return decoder

    def load_dictionary():
        # Load word map (word2ix) using data folder provided
        word_map_file = os.path.join(data_folder,
                                     'WORDMAP_' + data_name + '.json')
        with open(word_map_file, 'r') as j:
            word_map = json.load(j)
        rev_word_map = {v: k for k, v in word_map.items()}
        vocab_size = len(word_map)
        return word_map, rev_word_map, vocab_size

    decoder = load_model()
    word_map, rev_word_map, vocab_size = load_dictionary()

    # DataLoader
    loader = torch.utils.data.DataLoader(CaptionDataset(
        data_folder, data_name, 'TEST'),
                                         batch_size=1,
                                         shuffle=False,
                                         num_workers=1,
                                         collate_fn=collate_fn,
                                         pin_memory=torch.cuda.is_available())

    # Lists to store references (true captions), and hypothesis (prediction) for each image
    # If for n images, we have n hypotheses, and references a, b, c... for each image, we need -
    # references = [[ref1a, ref1b, ref1c], [ref2a, ref2b], ...], hypotheses = [hyp1, hyp2, ...]
    references = list()
    hypotheses = list()

    # For each image
    for caption_idx, (image_features, caps, caplens, orig_caps) in enumerate(
            tqdm(loader, desc="EVALUATING AT BEAM SIZE " + str(beam_size))):

        if caption_idx % 5 != 0:
            continue

        k = beam_size

        # Move to GPU device, if available
        image_features = image_features.to(device)  # (1, 36, 2048)
        image_features_mean = image_features.mean(1)
        image_features_mean = image_features_mean.expand(k, 2048)

        # Tensor to store top k previous words at each step; now they're just <start>
        k_prev_words = torch.tensor([[word_map['<start>']]] * k,
                                    dtype=torch.long).to(device)  # (k, 1)

        # Tensor to store top k sequences; now they're just <start>
        seqs = k_prev_words  # (k, 1)

        # Tensor to store top k sequences' scores; now they're just 0
        top_k_scores = torch.zeros(k, 1).to(device)  # (k, 1)

        # Lists to store completed sequences and scores
        complete_seqs = list()
        complete_seqs_scores = list()

        # Start decoding
        step = 1
        h1, c1 = decoder.init_hidden_state(k)  # (batch_size, decoder_dim)
        h2, c2 = decoder.init_hidden_state(k)

        # s is a number less than or equal to k, because sequences are removed from this process once they hit <end>
        while True:

            embeddings = decoder.embedding(k_prev_words).squeeze(
                1)  # (s, embed_dim)
            h1, c1 = decoder.top_down_attention(
                torch.cat([h2, image_features_mean, embeddings], dim=1),
                (h1, c1))  # (batch_size_t, decoder_dim)
            trans_obj = decoder.transformer_encoder(
                image_features.transpose(0, 1)).transpose(0, 1)
            attention_weighted_encoding = decoder.attention(trans_obj, h1)
            h2, c2 = decoder.language_model(
                torch.cat([attention_weighted_encoding, h1], dim=1), (h2, c2))
            scores = decoder.fc(h2)  # (s, vocab_size)
            scores = F.log_softmax(scores, dim=1)

            # Add
            scores = top_k_scores.expand_as(scores) + scores  # (s, vocab_size)

            # For the first step, all k points will have the same scores (since same k previous words, h, c)
            if step == 1:
                top_k_scores, top_k_words = scores[0].topk(k, 0, True,
                                                           True)  # (s)
            else:
                # Unroll and find top scores, and their unrolled indices
                top_k_scores, top_k_words = scores.view(-1).topk(
                    k, 0, True, True)  # (s)

            # Convert unrolled indices to actual indices of scores
            prev_word_inds = top_k_words // vocab_size  # (s), integer (floor) division keeps valid indices
            next_word_inds = top_k_words % vocab_size  # (s)

            # Add new words to sequences
            seqs = torch.cat(
                [seqs[prev_word_inds],
                 next_word_inds.unsqueeze(1)], dim=1)  # (s, step+1)

            # Which sequences are incomplete (didn't reach <end>)?
            incomplete_inds = [
                ind for ind, next_word in enumerate(next_word_inds)
                if next_word != word_map['<end>']
            ]
            complete_inds = list(
                set(range(len(next_word_inds))) - set(incomplete_inds))

            # Set aside complete sequences
            if len(complete_inds) > 0:
                complete_seqs.extend(seqs[complete_inds].tolist())
                complete_seqs_scores.extend(top_k_scores[complete_inds])
            k -= len(complete_inds)  # reduce beam length accordingly

            # Proceed with incomplete sequences
            if k == 0:
                break
            seqs = seqs[incomplete_inds]
            h1 = h1[prev_word_inds[incomplete_inds]]
            c1 = c1[prev_word_inds[incomplete_inds]]
            h2 = h2[prev_word_inds[incomplete_inds]]
            c2 = c2[prev_word_inds[incomplete_inds]]
            image_features_mean = image_features_mean[
                prev_word_inds[incomplete_inds]]
            top_k_scores = top_k_scores[incomplete_inds].unsqueeze(1)
            k_prev_words = next_word_inds[incomplete_inds].unsqueeze(1)

            # Break if things have been going on too long
            if step > 50:
                break
            step += 1

        i = complete_seqs_scores.index(max(complete_seqs_scores))
        seq = complete_seqs[i]

        # References
        # img_caps = [' '.join(c) for c in orig_caps]
        img_caps = [c for c in orig_caps]
        references.append(img_caps)

        # Hypotheses
        hypothesis = ([
            rev_word_map[w] for w in seq if w not in
            {word_map['<start>'], word_map['<end>'], word_map['<pad>']}
        ])
        # hypothesis = ' '.join(hypothesis)
        hypotheses.append(hypothesis)
        assert len(references) == len(hypotheses)

    # Calculate scores
    # metrics_dict = nlgeval.compute_metrics(references, hypotheses)
    hypotheses_file = os.path.join(outdir, 'hypotheses',
                                   'TEST.Hypotheses.json')
    references_file = os.path.join(outdir, 'references',
                                   'TEST.References.json')
    create_captions_file(range(len(hypotheses)), hypotheses, hypotheses_file)
    create_captions_file(range(len(references)), references, references_file)
    coco = COCO(references_file)
    # add the predicted results to the object
    coco_results = coco.loadRes(hypotheses_file)
    # create the evaluation object with both the ground-truth and the predictions
    coco_eval = COCOEvalCap(coco, coco_results)
    # change to use the image ids in the results object, not those from the ground-truth
    coco_eval.params['image_id'] = coco_results.getImgIds()
    # run the evaluation
    coco_eval.evaluate(verbose=False,
                       metrics=['bleu', 'meteor', 'rouge', 'cider'])
    # Results contains: "Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4", "METEOR", "ROUGE_L", "CIDEr", "SPICE"
    results = coco_eval.eval
    return results
Example #11
def language_eval(dataset, preds, model_id, split):
    import sys
    sys.path.append("coco-caption")
    if 'coco' in dataset:
        annFile = 'coco-caption/annotations/captions_val2014.json'
    elif 'flickr30k' in dataset or 'f30k' in dataset:
        annFile = 'coco-caption/f30k_captions4eval.json'
    from pycocotools.coco import COCO
    from pycocoevalcap.eval import COCOEvalCap

    # encoder.FLOAT_REPR = lambda o: format(o, '.3f')

    if '+' in model_id:  # ensemble
        save_to = 'eval_results/ensemble'
    else:
        save_to = 'eval_results/single'

    if not os.path.isdir(save_to):
        os.mkdir(save_to)
    cache_path = os.path.join(save_to, model_id + '_' + split + '.json')
    # cache_path = os.path.join(save_to, 'tmp_' + split + '.json')
    coco = COCO(annFile)
    valids = coco.getImgIds()

    # filter results to only those in MSCOCO validation set (will be about a third)
    preds_filt = [p for p in preds if p['image_id'] in valids]
    print('using %d/%d predictions' % (len(preds_filt), len(preds)))
    # json.dump(preds_filt, open(cache_path, 'w')) # serialize to temporary json file. Sigh, COCO API...
    with open(cache_path, 'w') as f:
        json.dump(preds_filt, f)
    print("Write prediction results to {}".format(cache_path))

    cocoRes = coco.loadRes(cache_path)
    cocoEval = COCOEvalCap(coco, cocoRes)
    cocoEval.params['image_id'] = cocoRes.getImgIds()
    cocoEval.evaluate()

    # create output dictionary
    out = {}
    for metric, score in cocoEval.eval.items():
        out[metric] = score

    imgToEval = cocoEval.imgToEval
    for p in preds_filt:
        image_id, caption = p['image_id'], p['caption']
        imgToEval[image_id]['caption'] = caption

    out['bad_count_rate'] = sum([count_bad(_['caption'])
                                 for _ in preds_filt]) / float(len(preds_filt))
    outfile_path = os.path.join(save_to,
                                model_id + '_' + split + '_imgToEval.json')
    # outfile_path = os.path.join(save_to, 'tmp_' + split + '_imgToEval.json')
    with open(outfile_path, 'w') as outfile:
        json.dump(
            {
                'overall': out,
                'imgToEval': imgToEval,
                'predCaption': preds_filt
            }, outfile)
    print("Write prediction results to {}".format(outfile_path))
    return out
Example #12
def validate(val_loader, decoder, criterion_ce, criterion_dis, epoch):
    """
    Performs one epoch's validation.
    :param val_loader: DataLoader for validation data.
    :param decoder: decoder model
    :param criterion_ce: cross entropy loss layer
    :param criterion_dis : discriminative loss layer
    :return: BLEU-4 score
    """
    decoder.eval()  # eval mode (no dropout or batchnorm)

    batch_time = AverageMeter()
    losses = AverageMeter()
    top5accs = AverageMeter()

    start = time.time()

    references = list()  # references (true captions) for calculating BLEU-4 score
    hypotheses = list()  # hypotheses (predictions)

    # Batches
    with torch.no_grad():
        # for i, (imgs, caps, caplens,allcaps) in enumerate(val_loader):
        for i, sample in enumerate(val_loader):

            if i % 5 != 0:
                # only decode every 5th caption, starting from idx 0.
                # this is because the iterator iterates over all captions in the dataset, not all images.
                if i % args.print_freq_val == 0:
                    print('Validation: [{0}/{1}]\t'
                          'Batch Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                          'Loss {loss.val:.4f} ({loss.avg:.4f})\t'
                          'Top-5 Accuracy {top5.val:.3f} ({top5.avg:.3f})\t'.format(i, len(val_loader),
                                                                                    batch_time=batch_time,
                                                                                    loss=losses, top5=top5accs))
                continue

            if scene_graph:
                (obj, rel, caps, caplens, orig_caps, obj_mask, rel_mask, pair_idx) = sample
                obj = obj.to(device)
                rel = rel.to(device)
                obj_mask = obj_mask.to(device)
                rel_mask = rel_mask.to(device)
                pair_idx = pair_idx.to(device)
            else:
                (imgs, caps, caplens, orig_caps) = sample
                imgs = imgs.to(device)

            # Move to device, if available
            caps = caps.to(device)
            caplens = caplens.to(device)

            # Forward prop.
            if scene_graph:
                scores, scores_d, caps_sorted, decode_lengths, sort_ind = decoder(object_features=obj,
                                                                                  relation_features=rel,
                                                                                  encoded_captions=caps,
                                                                                  caption_lengths=caplens,
                                                                                  object_mask=obj_mask,
                                                                                  relation_mask=rel_mask,
                                                                                  rel_pair_idx=pair_idx)
            else:
                scores, scores_d, caps_sorted, decode_lengths, sort_ind = decoder(imgs, caps, caplens)

            # Max-pooling across predicted words across time steps for discriminative supervision
            scores_d = scores_d.max(1)[0]

            # Since we decoded starting with <start>, the targets are all words after <start>, up to <end>
            targets = caps_sorted[:, 1:]
            targets_d = torch.zeros(scores_d.size(0), scores_d.size(1)).to(device)
            targets_d.fill_(-1)

            for length in decode_lengths:
                targets_d[:, :length - 1] = targets[:, :length - 1]

            # Remove timesteps that we didn't decode at, or are pads
            # pack_padded_sequence is an easy trick to do this
            scores_copy = scores.clone()
            scores = pack_padded_sequence(scores, decode_lengths, batch_first=True, enforce_sorted=True).data
            targets = pack_padded_sequence(targets, decode_lengths, batch_first=True, enforce_sorted=True).data
            #scores, _ = pack_padded_sequence(scores, decode_lengths, batch_first=True)
            #targets, _ = pack_padded_sequence(targets, decode_lengths, batch_first=True)

            # Calculate loss
            loss_d = criterion_dis(scores_d, targets_d.long())
            loss_g = criterion_ce(scores, targets)
            loss = loss_g + (10 * loss_d)

            # Keep track of metrics
            losses.update(loss.item(), sum(decode_lengths))
            top5 = accuracy(scores, targets, 5)
            top5accs.update(top5, sum(decode_lengths))
            batch_time.update(time.time() - start)

            start = time.time()

            if i % args.print_freq_val == 0:
                print('Validation: [{0}/{1}]\t'
                      'Batch Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                      'Loss {loss.val:.4f} ({loss.avg:.4f})\t'
                      'Top-5 Accuracy {top5.val:.3f} ({top5.avg:.3f})\t'.format(i, len(val_loader),
                                                                                batch_time=batch_time,
                                                                                loss=losses, top5=top5accs))

            # Store references (true captions), and hypothesis (prediction) for each image
            # If for n images, we have n hypotheses, and references a, b, c... for each image, we need -
            # references = [[ref1a, ref1b, ref1c], [ref2a, ref2b], ...], hypotheses = [hyp1, hyp2, ...]

            # References
            assert (len(sort_ind) == 1), "Cannot have batch_size>1 for validation."
            # a reference is a list of lists:
            # [['the', 'cat', 'sat', 'on', 'the', 'mat'], ['a', 'cat', 'on', 'the', 'mat']]
            references.append(orig_caps)

            # Hypotheses
            _, preds = torch.max(scores_copy, dim=2)
            preds = preds.tolist()
            preds_idxs_no_pads = list()
            for j, p in enumerate(preds):
                preds_idxs_no_pads.append(preds[j][:decode_lengths[j]])  # remove pads
                preds_idxs_no_pads = list(map(lambda c: [w for w in c if w not in {word_map['<start>'],
                                                                                   word_map['<pad>']}],
                                              preds_idxs_no_pads))
            temp_preds = list()
            # remove <start> and pads and convert idxs to string
            for hyp in preds_idxs_no_pads:
                temp_preds.append([])
                for w in hyp:
                    assert (not w == word_map['<pad>']), "Should have removed all pads."
                    if not w == word_map['<start>']:
                        temp_preds[-1].append(word_map_inv[w])
            preds = temp_preds
            hypotheses.extend(preds)
            assert len(references) == len(hypotheses)

    # Calculate BLEU-4 scores
    # bleu4 = corpus_bleu(references, hypotheses)
    # bleu4 = round(bleu4, 4)
    # compute the metrics
    hypotheses_file = os.path.join(args.outdir, 'hypotheses', 'Epoch{:0>3d}.Hypotheses.json'.format(epoch))
    references_file = os.path.join(args.outdir, 'references', 'Epoch{:0>3d}.References.json'.format(epoch))
    create_captions_file(range(len(hypotheses)), hypotheses, hypotheses_file)
    create_captions_file(range(len(references)), references, references_file)
    coco = COCO(references_file)
    # add the predicted results to the object
    coco_results = coco.loadRes(hypotheses_file)
    # create the evaluation object with both the ground-truth and the predictions
    coco_eval = COCOEvalCap(coco, coco_results)
    # change to use the image ids in the results object, not those from the ground-truth
    coco_eval.params['image_id'] = coco_results.getImgIds()
    # run the evaluation
    coco_eval.evaluate(verbose=False, metrics=['bleu', 'meteor', 'rouge', 'cider'])
    # Results contains: "Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4", "METEOR", "ROUGE_L", "CIDEr", "SPICE"
    results = coco_eval.eval
    results['loss'] = losses.avg
    results['top5'] = top5accs.avg

    for k, v in results.items():
        print(k+':\t'+str(v))
    # print('\n * LOSS - {loss.avg:.3f}, TOP-5 ACCURACY - {top5.avg:.3f}, BLEU-4 - {bleu}, CIDEr - {cider}\n'
    #       .format(loss=losses, top5=top5accs, bleu=round(results['Bleu_4'], 4), cider=round(results['CIDEr'], 1)))
    return results
Example #13
def language_eval(dataset, preds, model_id, split):
    import sys
    sys.path.append("coco-caption")
    if 'coco' in dataset:
        annFile = 'coco-caption/annotations/captions_val2014.json'
    elif 'flickr30k' in dataset or 'f30k' in dataset:
        annFile = 'coco-caption/f30k_captions4eval.json'
    elif 'person' in dataset:
        annFile='coco-caption/person_captions4eval.json'
    from pycocotools.coco import COCO
    from pycocoevalcap.eval import COCOEvalCap
    # encoder.FLOAT_REPR = lambda o: format(o, '.3f')

    if not os.path.isdir('eval_results'):
        os.mkdir('eval_results')
    cache_path = os.path.join('eval_results/', '.cache_'+ model_id + '_' + split + '.json')
    best_cider = 0
    # gdindex = [0, 1, 2, 3, 4]
    gdindex = [-1]
    cider_list = []
    for i in gdindex:
        annFile='coco-caption/person_captions4eval_'+str(i)+'.json'
        print(annFile)
        coco = COCO(annFile)    
        valids = coco.getImgIds()

        # filter results to only those in MSCOCO validation set (will be about a third)
        preds_filt = [p for p in preds if p['image_id'] in valids]
        print('using %d/%d predictions' % (len(preds_filt), len(preds)))
        json.dump(preds_filt, open(cache_path, 'w')) # serialize to temporary json file. Sigh, COCO API...
        cocoRes = coco.loadRes(cache_path)
        cocoEval = COCOEvalCap(coco, cocoRes)
        cocoEval.params['image_id'] = cocoRes.getImgIds()
        cocoEval.evaluate()
        cider_list.append(cocoEval.eval['CIDEr'])
        # create output dictionary
        if cocoEval.eval['CIDEr']>=best_cider:
            best_cider = cocoEval.eval['CIDEr']
            out = {}
            for metric, score in cocoEval.eval.items():
                out[metric] = score

            imgToEval = cocoEval.imgToEval
                # collect SPICE_sub_score
            #for k in imgToEval.values()[0]['SPICE'].keys():
            #    if k != 'All':
            #        out['SPICE_'+k] = np.array([v['SPICE'][k]['f'] for v in  imgToEval.values()])
            #        out['SPICE_'+k] = (out['SPICE_'+k][out['SPICE_'+k]==out['SPICE_'+k]]).mean()
            
            for p in preds_filt:
                image_id, caption = p['image_id'], p['caption']
                imgToEval[image_id]['caption'] = caption
            # update predictions (use a separate loop variable so the outer gdindex loop's `i` is not shadowed)
            for j in range(len(preds)):
                if preds[j]['image_id'] in imgToEval:
                    preds[j]['eval'] = imgToEval[preds[j]['image_id']]

            out['bad_count_rate'] = sum([count_bad(_['caption']) for _ in preds_filt]) / float(len(preds_filt))
        else:
            continue
    outfile_path = os.path.join('eval_results/', model_id + '_' + split + '.json')
    with open(outfile_path, 'w') as outfile:
        json.dump({'overall': out, 'imgToEval': imgToEval}, outfile)
    cider_list = np.array(cider_list)
    print("min:", np.min(cider_list), " max:", np.max(cider_list),
          " mean:", np.mean(cider_list), " std:", np.std(cider_list))
    return out
Example #14
def language_eval(dataset, preds, preds_n, job_id, split, eval_oracle=False):

    #
    # create output dictionary
    out = {}

    #
    # Diversity not implemented
    # ===========
    # if len(preds_n) > 0:
    #     # vocab size and novel sentences
    #     if "coco" in dataset:
    #         dataset_file = "data/dataset_coco.json"
    #     elif "flickr30k" in dataset or "f30k" in dataset:
    #         dataset_file = "data/dataset_flickr30k.json"
    #     training_sentences = set(
    #         [
    #             " ".join(__["tokens"])
    #             for _ in json.load(open(dataset_file))["images"]
    #             if not _["split"] in ["val", "test"]
    #             for __ in _["sentences"]
    #         ]
    #     )
    #     generated_sentences = set([_["caption"] for _ in preds_n])
    #     novels = generated_sentences - training_sentences
    #     out["novel_sentences"] = float(len(novels)) / len(preds_n)
    #     tmp = [_.split() for _ in generated_sentences]
    #     words = []
    #     for _ in tmp:
    #         words += _
    #     out["vocab_size"] = len(set(words))

    #
    # Set cache path
    cache_path = os.path.join("eval_results/", f".cache_{job_id}_{split}.json")

    #
    # Extract image ids in current data set
    coco = getCOCO(dataset)
    image_ids = coco.getImgIds()

    #
    # Filter results to only those in MSCOCO validation set
    filtered_predictions = [p for p in preds if p["image_id"] in image_ids]
    num_filtered_predictions = float(len(filtered_predictions))
    num_predictions = float(len(preds))

    #
    # Save predictions
    mean_perplexity = (sum([p["perplexity"] for p in filtered_predictions]) /
                       num_filtered_predictions)
    mean_entropy = (sum([p["entropy"] for p in filtered_predictions]) /
                    num_filtered_predictions)
    print(f"using {num_filtered_predictions}/{num_predictions} predictions")
    json.dump(filtered_predictions,
              open(cache_path,
                   "w"))  # serialize to temporary json file. Sigh, COCO API...

    #
    # Evaluate captions
    # NOTE: loadRes() API call requires a json file, hence the above comment
    cocoRes = coco.loadRes(cache_path)
    cocoEval = COCOEvalCap(coco, cocoRes)
    cocoEval.params["image_id"] = cocoRes.getImgIds()
    cocoEval.evaluate()

    #
    # Compile results so far
    out["perplexity"] = mean_perplexity
    out["entropy"] = mean_entropy
    #
    for metric, score in cocoEval.eval.items():
        out[metric] = score

    #
    # Record SPICE scores??
    imgToEval = cocoEval.imgToEval
    for k in list(imgToEval.values())[0]["SPICE"].keys():
        if k != "All":
            out["SPICE_" + k] = np.array(
                [v["SPICE"][k]["f"] for v in imgToEval.values()])
            out["SPICE_" + k] = (out["SPICE_" + k][out["SPICE_" +
                                                       k] == out["SPICE_" +
                                                                 k]]).mean()
    #
    # Overwrite caption or set?
    for p in filtered_predictions:
        image_id, caption = p["image_id"], p["caption"]
        imgToEval[image_id]["caption"] = caption

    #
    # Diverse sampling not implemented
    # ==================
    # if len(preds_n) > 0:
    #     import eval_multi

    #     cache_path_n = os.path.join(
    #         "eval_results/", ".cache_" + job_id + "_" + split + "_n.json"
    #     )
    #     allspice = eval_multi.eval_allspice(dataset, preds_n, job_id, split)
    #     out.update(allspice["overall"])
    #     div_stats = eval_multi.eval_div_stats(dataset, preds_n, job_id, split)
    #     out.update(div_stats["overall"])
    #     if eval_oracle:
    #         oracle = eval_multi.eval_oracle(dataset, preds_n, job_id, split)
    #         out.update(oracle["overall"])
    #     else:
    #         oracle = None
    #     self_cider = eval_multi.eval_self_cider(dataset, preds_n, job_id, split)
    #     out.update(self_cider["overall"])
    #     with open(cache_path_n, "w") as outfile:
    #         json.dump(
    #             {
    #                 "allspice": allspice,
    #                 "div_stats": div_stats,
    #                 "oracle": oracle,
    #                 "self_cider": self_cider,
    #             },
    #             outfile,
    #         )

    #
    # Fraction of captions that have illegal endings
    # SEE: bad_endings in /utils/constants.py
    num_bad_endings = sum(
        [count_bad(_["caption"]) for _ in filtered_predictions])
    out["bad_count_rate"] = num_bad_endings / num_filtered_predictions

    #
    # Write evaluation results to json
    outfile_path = os.path.join("eval_results/", f"{job_id}_{split}.json")
    with open(outfile_path, "w") as outfile:
        json.dump({"overall": out, "imgToEval": imgToEval}, outfile)
    #
    return out
Example #15
def main(test_json_path):
    model_list = []
    for i in xrange(3):
        model_list.append(
            os.path.join(os.path.dirname(__file__), 'models/train',
                         'model.ckpt' + str(i)))
    var_list = tf.contrib.framework.list_variables(model_list[0])
    var_values, var_dtypes = {}, {}
    for (name, shape) in var_list:
        if not name.startswith("global_step"):
            var_values[name] = np.zeros(shape)
    for model_path in model_list:
        reader = tf.contrib.framework.load_checkpoint(model_path)
        for name in var_values:
            tensor = reader.get_tensor(name)
            var_dtypes[name] = tensor.dtype
            var_values[name] += tensor
    for name in var_values:
        var_values[name] /= len(model_list)
    tf_vars = [
        tf.get_variable(v, shape=var_values[v].shape, dtype=var_dtypes[v])
        for v in var_values
    ]
    placeholders = [
        tf.placeholder(v.dtype, shape=v.get_shape()) for v in tf_vars
    ]
    assign_ops = [tf.assign(v, p) for (v, p) in zip(tf_vars, placeholders)]
    global_step = tf.Variable(0,
                              name="global_step",
                              trainable=False,
                              dtype=tf.int32)
    saver = tf.train.Saver(tf.all_variables())
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        for p, assign_op, (name, value) in zip(placeholders, assign_ops,
                                               six.iteritems(var_values)):
            sess.run(assign_op, {p: value})
        saver.save(sess,
                   os.path.join(os.path.dirname(__file__),
                                'models/tmp/model.ckpt'),
                   global_step=global_step)
    with open(os.path.join(os.path.dirname(__file__), 'data/features.pkl'),
              'r') as f:
        keyword_data = cPickle.load(f)
    with open(test_json_path) as f:
        test_json = json.load(f)
    id_to_filename = test_json['images']
    id_to_path = [{
        'path': os.path.join('./Data/test', x['file_name']),
        'id': x['id']
    } for x in id_to_filename]
    result_json = []
    g = tf.Graph()
    with g.as_default():
        model = inference_wrapper.InferenceWrapper()
        restore_fn = model.build_graph_from_config(
            configuration.ModelConfig(),
            os.path.join(os.path.dirname(__file__), 'models/tmp/model.ckpt-0'))
    g.finalize()
    vocab = vocabulary.Vocabulary(os.path.join('./Data/word_counts.txt'))
    with tf.Session(graph=g) as sess:
        restore_fn(sess)
        generator = caption_generator.CaptionGenerator(model, vocab)
        for data in id_to_path:
            filename = data['path']
            with tf.gfile.GFile(filename, "r") as f:
                image = f.read()
            captions = generator.beam_search(
                sess, image, keyword_data[os.path.basename(filename)])
            print("Captions for image %s:" % os.path.basename(filename))
            result = {
                'image_id':
                data['id'],
                'caption': (" ".join([
                    vocab.id_to_word(w) for w in captions[0].sentence[1:-1]
                ])).decode('utf-8')
            }
            print(result)
            result_json.append(result)
    with open(os.path.join(os.path.dirname(__file__), "result.json"),
              'w') as f:
        json.dump(result_json, f)
    coco = COCO(test_json_path)
    cocoRes = coco.loadRes(
        os.path.join(os.path.dirname(__file__), "result.json"))
    cocoEval = COCOEvalCap(coco, cocoRes)
    cocoEval.evaluate()
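
# The model-averaging loop above can be hard to see through the TF checkpoint
# plumbing; this is a minimal, framework-free sketch of the same idea
# (illustrative only, not part of the original script).
import numpy as np

def average_checkpoints(checkpoints):
    """checkpoints: list of {variable_name: np.ndarray} dicts with identical keys."""
    averaged = {name: np.zeros_like(value, dtype=np.float64)
                for name, value in checkpoints[0].items()}
    for ckpt in checkpoints:
        for name, value in ckpt.items():
            averaged[name] += value
    return {name: value / len(checkpoints) for name, value in averaged.items()}
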
    confident_man = open(
        base_dir +
        '/data2/anja/xai/captions/val-confident-man-500new.txt').readlines()
    confident_man = [int(c.strip()) for c in confident_man]
    confident_woman = open(
        base_dir +
        '/data2/anja/xai/captions/val-confident-woman-500new.txt').readlines()
    confident_woman = [int(c.strip()) for c in confident_woman]
    confident_ims = set(confident_man + confident_woman) & set(bias_ids)
    image_set = confident_ims
else:
    print "Invalid set specified"

for caption_path in caption_paths:

    generation_coco = coco.loadRes(caption_path[1])
    coco_evaluator = COCOEvalCap(coco, generation_coco)
    coco_evaluator.params['image_id'] = list(
        set(image_set) & set(generation_coco.getImgIds()))
    coco_evaluator.evaluate()

    predicted_caps = json.load(open(caption_path[1]))

    for cap in predicted_caps:
        words = nltk.word_tokenize(cap['caption'].lower())
        words = [
            'person' if word in gendered_words else word for word in words
        ]
        cap['caption'] = ' '.join(words)
        if len(set(words) & gendered_words) > 0: pdb.set_trace()
    """
    person_caps = 'tmp/person_caps.json'
Пример #17
0
def language_eval(dataset,
                  preds,
                  model_id,
                  split,
                  detail_flg=False,
                  wbleu_set=None,
                  option='closest'):
    import sys
    sys.path.append("coco-caption")
    if dataset == 'coco':
        annFile = 'coco-caption/annotations/captions_val2014.json'
    elif dataset == 'vg':
        annFile = '/mnt/poplin/share/dataset/visualgenome/captions_vg.json'
    elif dataset == 'iapr':
        annFile = '/mnt/workspace2018/nakamura/IAPR/captions_iapr.json'

    from pycocotools.coco import COCO
    from pycocoevalcap.eval import COCOEvalCap

    # encoder.FLOAT_REPR = lambda o: format(o, '.3f')

    if not os.path.isdir('eval_results'):
        os.mkdir('eval_results')
    cache_path = os.path.join('eval_results/',
                              model_id + '_' + split + '.json')

    coco = COCO(annFile)
    valids = coco.getImgIds()

    # filter results to only those in MSCOCO validation set (will be about a third)
    preds_filt = [p for p in preds if p['image_id'] in valids]
    print('using %d/%d predictions' % (len(preds_filt), len(preds)))
    json.dump(preds_filt,
              open(cache_path,
                   'w'))  # serialize to temporary json file. Sigh, COCO API...

    cocoRes = coco.loadRes(cache_path)
    cocoEval = COCOEvalCap(coco, cocoRes, wbleu_set=wbleu_set)
    cocoEval.params['image_id'] = cocoRes.getImgIds()
    cocoEval.evaluate()

    # create output dictionary
    out = {}
    for metric, score in cocoEval.eval.items():
        out[metric] = score

    imgToEval = cocoEval.imgToEval
    for p in preds_filt:
        image_id, caption = p['image_id'], p['caption']
        imgToEval[image_id]['caption'] = caption
    with open(cache_path, 'w') as outfile:
        json.dump({'overall': out, 'imgToEval': imgToEval}, outfile)

    if detail_flg:
        out_detail_scores = {}
        # for i in range(len(cocoEval.imgToEval.items())):
        #     pdb.set_trace()
        #     out_detail_scores[str(cocoEval.imgToEval.items()[i][0])] = cocoEval.imgToEval.items()[i][1]
        for key in cocoEval.imgToEval.keys():
            out_detail_scores[str(key)] = cocoEval.imgToEval[key]
        return [out, out_detail_scores]

    return out
Пример #18
0
import tqdm
import json
import pandas as pd
from pycocotools.coco import COCO
from torchvision.datasets import CocoCaptions
from pycocoevalcap.eval import COCOEvalCap

coco = COCO("./data/annotations/captions_val2014.json")
res_file = "./results/captions_model.json"
out_file = "./results/val2014_scores.xlsx"

# evaluate best captions against gt
coco_result = coco.loadRes(res_file)
cocoEval = COCOEvalCap(coco, coco_result)
cocoEval.params['image_id'] = coco_result.getImgIds()
cocoEval.evaluate()

indices = [
    "BLEU 1-gram", "BLEU 2-gram", "BLEU 3-gram", "BLEU 4-gram", "METEOR",
    "ROUGE_L", "CIDEr"
]
data = [cocoEval.eval['Bleu_1']] + [cocoEval.eval['Bleu_2']] + [cocoEval.eval['Bleu_3']] + [cocoEval.eval['Bleu_4']] + \
       [cocoEval.eval['METEOR']] + [cocoEval.eval['ROUGE_L']] + [cocoEval.eval['CIDEr']]
results = pd.DataFrame(columns=["3 epochs, lr=0.001"],
                       index=indices,
                       data=data)
results.to_excel(out_file)
print(f"Results saved to {out_file}")
Пример #19
0
def main(args):
    # Image preprocessing
    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406),
                             (0.229, 0.224, 0.225))])

    # Load vocabulary
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    # Build models
    encoder = Encoder(args.embed_size).eval()
    decoder = Decoder(args.embed_size, args.hidden_size, len(vocab), args.num_layers)
    encoder = encoder.to(device)
    decoder = decoder.to(device)

    # Load the trained model parameters
    encoder.load_state_dict(torch.load(args.encoder_path))
    decoder.load_state_dict(torch.load(args.decoder_path))

    # load validation image set
    lis = os.listdir(args.image_dir)
    num = len(lis)
    captions = []
    for i in range(num):

        im_pth = os.path.join(args.image_dir, lis[i])

        image = load_image(im_pth, transform)
        image_tensor = image.to(device)

        # Generate a caption from the image
        feature = encoder(image_tensor)
        sampled_ids = decoder.sample(feature)
        sampled_ids = sampled_ids[0].cpu().numpy()  # (1, max_seq_length) -> (max_seq_length)

        # Convert word_ids to words
        sampled_caption = []
        for word_id in sampled_ids:
            word = vocab.idx2word[word_id]
            if word == '<start>':
                continue
            if word == '<end>':
                break

            sampled_caption.append(word)

        sentence = ' '.join(sampled_caption)
        cap = {}
        cap['image_id'] = int(lis[i][14:-4])  # extract the numeric image id from the file name
        cap['caption'] = sentence
        captions.append(cap)
    # save results
    with open('captions_res.json', 'w') as f:
        json.dump(captions, f)

    # evaluation with coco-caption evaluation tools
    coco = COCO(args.caption_path)
    cocoRes = coco.loadRes('captions_res.json')
    cocoEval = COCOEvalCap(coco, cocoRes)
    cocoEval.params['image_id'] = cocoRes.getImgIds()
    cocoEval.evaluate()
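
# The id extraction above (int(lis[i][14:-4])) assumes COCO-style file names such
# as "COCO_val2014_000000391895.jpg"; a small regex makes that assumption explicit
# (sketch only, not part of the original script).
import re

def coco_image_id(filename):
    match = re.search(r"(\d{12})\.jpg$", filename)
    return int(match.group(1)) if match else None

assert coco_image_id("COCO_val2014_000000391895.jpg") == 391895
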
Пример #20
0
    def consensus_rerank(self, method='cider', flag_eval=True):
        # Only support cider and bleu currently
        assert (method == 'cider' or method == 'bleu')
        assert (len(self.NNimg_list) == len(self.anno_list_hypo))
        fol_cache = os.path.join(self.conf.fol_root_cache,
                                 self.conf.name_cache)
        if not os.path.exists(fol_cache):
            os.makedirs(fol_cache)

        if method == 'cider':
            # prepare cider
            coco = COCO(self.conf.fname_eval_ref)
            cocoEvalCider = COCOEvalCapPairCider(coco)
            cocoEvalCider.setup()
            k = self.conf.k_cider
            m = self.conf.m_cider
            key_reranking = 'rerank_%s_k%d_m%d_cider' \
                % (self.conf.gen_method, k, m)
        else:
            k = self.conf.k_bleu
            m = self.conf.m_bleu
            key_reranking = 'rerank_%s_k%d_m%d_bleu' \
                % (self.conf.gen_method, k, m)

        # start reranking
        rerank_ind = {
        }  # the ind is used to rerank the sentences that are in original order, for sGPN\dagger in grounding evaluation
        if self.anno_list_reranked == []:
            anno_list_reranked = self.anno_list_hypo
        else:
            anno_list_reranked = self.anno_list_reranked

        for (ind_te, anno) in enumerate(
                anno_list_reranked
        ):  # anno: a dict for an image, 10 sentences are in 'gen_beam_search_10'
            sentences_gen = anno[self.conf.gen_method]
            sentences_ret = []
            for ind_NN in range(k):
                ind_tr = self.NNimg_list[ind_te][ind_NN]
                sentences_ret += self.anno_list_ref[ind_tr]['sentences']
            sim = []
            for (ind_g, sen_gen) in enumerate(sentences_gen):
                b_s_arr = []
                for (ind_r, sen_ret) in enumerate(sentences_ret):
                    if method == 'cider':
                        b_s_arr.append(cocoEvalCider.calculate_cider_sentence( \
                            ' '.join(sen_gen), ' '.join(sen_ret) ) )
                    else:
                        b_s_arr.append(bs_util.calculate_bleu_sentence( \
                            sen_gen, sen_ret, self.conf.bleu_ngram, fpr=self.conf.fpr_bleu) )
                b_s_arr.sort(reverse=True)
                sim.append(sum(b_s_arr[:m]))

            # Sort the sentence according to sim
            arg_sim = np.argsort(-np.array(sim)).tolist()
            anno[key_reranking] = [
                sentences_gen[x] for x in arg_sim
            ]  # put the sentences in the order of ranking score, each sentence is decompose to a list of words
            rerank_ind[anno['id']] = arg_sim

            if (ind_te + 1) % self.conf.num_show_finished == 0:
                logger.info('%d image reranking finished' % (ind_te + 1))

        np.save(
            "consensus_rerank_ind.npy", rerank_ind
        )  # the index of sorted sentences; the index will be used in grounding evaluation
        self.anno_list_reranked = anno_list_reranked  # list of dicts, each dict is an image
        fname_anno_list_reranked = os.path.join(fol_cache, 'anno_list_hypo_rerank_%s_%s.npy' \
            % (self.conf.name_feat, self.conf.distance_metric) )
        np.save(
            fname_anno_list_reranked, anno_list_reranked
        )  # save the sentences in the order of reranked, in key of 'rerank_caption_k60_m125_cider'

        # Write the statistics to hard disk and evaluate the performance
        if flag_eval:
            fname_coco_json = os.path.join(fol_cache, 'coco_json_%s_%s_%s.json' \
                % (self.conf.name_feat, self.conf.distance_metric, method) )
            #fout_eval = open(os.path.join(fol_cache, 'eval_stat_%s_%s_%s.txt' \
            #% (self.conf.name_feat, self.conf.distance_metric, method) ), 'w')
            self._anno_genS2coco(
                anno_list_reranked, key_reranking, 0, fname_coco_json
            )  # 0: only save top-1 sentence after re-ranking

            coco = COCO(self.conf.fname_eval_ref)
            cocoRes = coco.loadRes(fname_coco_json)
            cocoEval = COCOEvalCap(coco, cocoRes)
            cocoEval.params['image_id'] = cocoRes.getImgIds()
            cocoEval.evaluate()

            # print output evaluation scores
            for metric, score in cocoEval.eval.items():
                print('%s: %.3f' % (metric, score))
                #print >>fout_eval, '%s: %.3f'%(metric, score)
            #fout_eval.close()

            self.cocoEval = cocoEval
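
# Bare-bones sketch of the consensus re-ranking performed above: each candidate
# caption is scored by the sum of its top-m similarities (sentence-level CIDEr or
# BLEU in the class above) to the reference captions of the k nearest-neighbour
# images, and candidates are returned best-first. The names here are illustrative,
# not part of the original code.
def consensus_rerank_sketch(candidates, neighbour_captions, similarity, m=125):
    scored = []
    for cand in candidates:
        sims = sorted((similarity(cand, ref) for ref in neighbour_captions), reverse=True)
        scored.append((sum(sims[:m]), cand))
    scored.sort(key=lambda pair: pair[0], reverse=True)
    return [cand for _, cand in scored]
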
Пример #21
0
def language_eval(dataset,
                  align_pred,
                  model_id,
                  split,
                  save_path,
                  is_flickr=False):
    '''
    evaluate the generated sentences
    '''
    sys.path.append("misc/coco-caption")
    from pycocotools.coco import COCO
    from pycocoevalcap.eval import COCOEvalCap

    if is_flickr:
        annFile = 'misc/coco-caption/annotations/caption_flickr30k.json'
    else:
        annFile = 'misc/coco-caption/annotations/captions_val2014.json'
    coco = COCO(annFile)
    valids = coco.getImgIds()
    if not os.path.isdir('eval_results'):
        os.mkdir('eval_results')

    all_scores = {}
    num_oracle = len(align_pred[0]['caption'])
    num_test_img = len(align_pred)
    all_scores['Bleu_1'] = np.zeros((num_oracle, num_test_img))
    all_scores['Bleu_2'] = np.zeros((num_oracle, num_test_img))
    all_scores['Bleu_3'] = np.zeros((num_oracle, num_test_img))
    all_scores['Bleu_4'] = np.zeros((num_oracle, num_test_img))
    all_scores['CIDEr'] = np.zeros((num_oracle, num_test_img))
    all_scores['METEOR'] = np.zeros((num_oracle, num_test_img))
    all_scores['ROUGE_L'] = np.zeros((num_oracle, num_test_img))
    all_scores['SPICE'] = np.zeros((num_oracle, num_test_img))
    all_scores['subgraph_bleu_material'] = []
    for sen_i in range(len(align_pred[0]['caption'])):  # for each sentence ind
        cache_path = os.path.join(
            'eval_results/',
            '.cache_' + model_id + '_' + split + str(sen_i) + '.json')
        preds = []
        for img_j in range(
                len(align_pred
                    )):  # extract the sentence in same position for all images
            entry = {
                'image_id': align_pred[img_j]['image_id'],
                'caption': align_pred[img_j]['caption'][sen_i]
            }
            preds.append(entry)

        # filter results to only those in MSCOCO validation set (will be about a third)
        preds_filt = [p for p in preds if p['image_id'] in valids]
        print('using %d/%d predictions' % (len(preds_filt), len(preds)))
        json.dump(preds_filt, open(
            cache_path,
            'w'))  # serialize to temporary json file. Sigh, COCO API...
        cocoRes = coco.loadRes(cache_path)
        if sen_i == 0:  # cocoRes image ids are identical on every iteration, so initialize cocoEval only once
            cocoRes_ImgIds = cocoRes.getImgIds()
            cocoEval = COCOEvalCap(coco, cocoRes_ImgIds=cocoRes_ImgIds)
            #cocoEval.params['image_id'] = cocoRes.getImgIds()
            all_scores['image_id_list'] = list(
                cocoEval.gts.keys())  # fixed order of output image

        cocoEval.evaluate(cocoRes=cocoRes)

        for method in cocoEval.eval_scores.keys():
            all_scores[method][sen_i, :] = np.array(
                cocoEval.eval_scores[method]).reshape(-1)
        all_scores['subgraph_bleu_material'].append(
            cocoEval.subgraph_training_bleu)

    # pick the bleu material of best subgraph in terms of individual sentence bleu score,
    # then re-compute the score over selected sentences
    top_k = len(align_pred[0]['caption'])
    if top_k != 1:
        print('\n\nThe following is top-{}: '.format(top_k))
        bleu_dict = {'Bleu_1': [], 'Bleu_2': [], 'Bleu_3': [], 'Bleu_4': []}
        for metric in bleu_dict.keys():
            best_ind = np.argmax(all_scores[metric][:top_k], axis=0)
            bleu_dict[metric] = cal_bleu(
                best_ind, all_scores['subgraph_bleu_material'][:top_k])
        all_scores['bleu_dict'] = bleu_dict
        for b_i in range(1, 5):
            print('oracle {}: {}'.format(
                'Bleu_' + str(b_i), bleu_dict['Bleu_' + str(b_i)][b_i - 1]))

        # pick maximum spice/cider/rouge/meteor score and average over images
        print('oracle spice: {}'.format(
            np.mean(np.max(all_scores['SPICE'][:top_k], axis=0))))
        print('oracle cider: {}'.format(
            np.mean(np.max(all_scores['CIDEr'][:top_k], axis=0))))
        print('oracle rouge: {}'.format(
            np.mean(np.max(all_scores['ROUGE_L'][:top_k], axis=0))))
        print('oracle meteor: {}'.format(
            np.mean(np.max(all_scores['METEOR'][:top_k], axis=0))))

        name = 'all_scores_{}_{}-subgraph.npy'.format(
            save_path[-1].split('-')[1].split('.')[0],
            len(align_pred[0]['caption']))
        np.save(save_path[0] + '/' + save_path[1] + '/' + name, all_scores)
        print('\n{}'.format(save_path[0] + '/' + save_path[1] + '/' + name))
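
# Minimal sketch of the "oracle" aggregation reported above: for every image keep
# the best score among its candidate captions, then average over images
# (illustrative helper, not part of the original code).
import numpy as np

def oracle_score(score_matrix):
    """score_matrix: array of shape (num_candidates, num_images)."""
    return float(np.mean(np.max(score_matrix, axis=0)))
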
Пример #22
0
def evaluate():
    coco = COCO(VAL_CAP_FILE)
    coco_res = coco.loadRes(RESULTS_FILE)
    coco_eval = COCOEvalCap(coco, coco_res)
    coco_eval.params["image_id"] = coco_res.getImgIds()
    coco_eval.evaluate()
Пример #23
0
    def generation_experiment(self, strategy, max_batch_size=1000):
        # Compute image descriptors.
        print 'Computing image descriptors'
        self.compute_descriptors()

        do_batches = (strategy['type'] == 'beam' and strategy['beam_size'] == 1) or \
            (strategy['type'] == 'sample' and
             ('temp' not in strategy or strategy['temp'] in (1, float('inf'))) and
             ('num' not in strategy or strategy['num'] == 1))

        num_images = len(self.images)
        batch_size = min(max_batch_size, num_images) if do_batches else 1

        # Generate captions for all images.
        all_captions = [None] * num_images
        for image_index in xrange(0, num_images, batch_size):
            batch_end_index = min(image_index + batch_size, num_images)
            sys.stdout.write("\rGenerating captions for image %d/%d" %
                             (image_index, num_images))
            sys.stdout.flush()
            if do_batches:
                if strategy['type'] == 'beam' or \
                    ('temp' in strategy and strategy['temp'] == float('inf')):
                    temp = float('inf')
                else:
                    temp = strategy['temp'] if 'temp' in strategy else 1
                output_captions, output_probs = self.captioner.sample_captions(
                    self.descriptors[image_index:batch_end_index], temp=temp)
                for batch_index, output in zip(
                        range(image_index, batch_end_index), output_captions):
                    all_captions[batch_index] = output
            else:
                for batch_image_index in xrange(image_index, batch_end_index):
                    captions, caption_probs = self.captioner.predict_caption(
                        self.descriptors[batch_image_index], strategy=strategy)
                    best_caption, max_log_prob = None, None
                    for caption, probs in zip(captions, caption_probs):
                        log_prob = gen_stats(probs)['log_p']
                        if best_caption is None or \
                            (best_caption is not None and log_prob > max_log_prob):
                            best_caption, max_log_prob = caption, log_prob
                    all_captions[batch_image_index] = best_caption
        sys.stdout.write('\n')

        # Compute the number of reference files as the maximum number of ground
        # truth captions of any image in the dataset.
        num_reference_files = 0
        for captions in self.dataset.values():
            if len(captions) > num_reference_files:
                num_reference_files = len(captions)
        if num_reference_files <= 0:
            raise Exception('No reference captions.')

        # Collect model/reference captions, formatting the model's captions and
        # each set of reference captions as a list of len(self.images) strings.
        exp_dir = '%s/generation' % self.cache_dir
        if not os.path.exists(exp_dir):
            os.makedirs(exp_dir)
        # For each image, write out the highest probability caption.
        model_captions = [''] * len(self.images)
        reference_captions = [([''] * len(self.images))
                              for _ in xrange(num_reference_files)]
        for image_index, image in enumerate(self.images):
            caption = self.captioner.sentence(all_captions[image_index])
            model_captions[image_index] = caption
            for reference_index, (_,
                                  caption) in enumerate(self.dataset[image]):
                caption = ' '.join(caption)
                reference_captions[reference_index][image_index] = caption

        coco_image_ids = [
            self.sg.image_path_to_id[image_path] for image_path in self.images
        ]
        generation_result = [{
            'image_id': self.sg.image_path_to_id[image_path],
            'caption': model_captions[image_index]
        } for (image_index, image_path) in enumerate(self.images)]
        json_filename = '%s/generation_result.json' % self.cache_dir
        print 'Dumping result to file: %s' % json_filename
        with open(json_filename, 'w') as json_file:
            json.dump(generation_result, json_file)
        generation_result = self.sg.coco.loadRes(json_filename)
        coco_evaluator = COCOEvalCap(self.sg.coco, generation_result)
        coco_evaluator.params['image_id'] = coco_image_ids
        coco_evaluator.evaluate()
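
# The inner selection loop above keeps the candidate whose summed log-probability
# is highest; assuming gen_stats(probs)['log_p'] is that sum, the choice can be
# written compactly as below (sketch only).
import math

def best_by_log_prob(candidates_with_probs):
    """candidates_with_probs: iterable of (caption, [p_1, ..., p_T]) pairs with p_t > 0."""
    return max(candidates_with_probs,
               key=lambda pair: sum(math.log(p) for p in pair[1]))[0]
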
Пример #24
0
parser.add_argument('-ref', '--ref_path', type=str, default='./data/annotations/captions_val2014.json',
                    help='path for the reference (ground-truth) annotations json file')

args = parser.parse_args()

annFile = args.ref_path
resFile = args.hypo_path

print('reference: ', annFile)
print('hypothesis: ', resFile)


coco = COCO(annFile)
cocoRes = coco.loadRes(resFile)

# create cocoEval object by taking coco and cocoRes
cocoEval = COCOEvalCap(coco, cocoRes)

# evaluate on a subset of images by setting
# cocoEval.params['image_id'] = cocoRes.getImgIds()
# please remove this line when evaluating the full validation set
cocoEval.params['image_id'] = cocoRes.getImgIds()

# evaluate results
# SPICE will take a few minutes the first time, but speeds up due to caching
cocoEval.evaluate()

print('\n\n-----results-----')
for metric, score in cocoEval.eval.items():
    print('%s: %.3f'%(metric, score))
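
# As the note above suggests, restricting cocoEval.params['image_id'] evaluates
# only a subset; a quick smoke test over, say, the first 100 images could look
# like this (illustrative, re-using coco and cocoRes from the script above).
quick_eval = COCOEvalCap(coco, cocoRes)
quick_eval.params['image_id'] = cocoRes.getImgIds()[:100]
quick_eval.evaluate()
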
Пример #25
0
def main(args):
    # Image preprocessing
    transform = transforms.Compose([
        transforms.RandomCrop(args.crop_size),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
    ])

    # Load vocabulary wrapper
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    glove = create_glove_dict()
    weights_matrix = create_weights_matrix(vocab, len(vocab), args.embed_size,
                                           glove)
    weights_matrix = torch.tensor(weights_matrix).detach().cpu()

    # load the evaluation data
    print(args.encoder_path)
    print(args.decoder_path)

    # Build models
    encoder = EncoderCNN(args.hidden_size, len(vocab)).eval().to(
        device)  # eval mode (batchnorm uses moving mean/variance)
    decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab),
                         args.num_layers, weights_matrix).to(device)
    # Load the trained model parameters
    if torch.cuda.is_available():
        encoder.load_state_dict(torch.load(args.encoder_path))
        decoder.load_state_dict(torch.load(args.decoder_path))
    else:
        encoder.load_state_dict(
            torch.load(args.encoder_path, map_location='cpu'))
        decoder.load_state_dict(
            torch.load(args.decoder_path, map_location='cpu'))

    # perform evaluation here

    object_list = [
        args.bottle_test, args.bus_test, args.couch_test, args.microwave_test,
        args.pizza_test, args.racket_test, args.suitcase_test, args.zebra_test
    ]
    object_names = [
        'bottle', 'bus', 'couch', 'microwave', 'pizza', 'racket', 'suitcase',
        'zebra'
    ]

    # iterate through each of the held out objects
    for object_class, name in zip(object_list, object_names):

        data_loader = get_loader(args.image_dir,
                                 object_class,
                                 vocab,
                                 transform,
                                 args.batch_size,
                                 shuffle=False,
                                 num_workers=args.num_workers)
        f1_count = 0
        total_count = 0
        results = []
        id_occurences = {}

        # gets the results into a json file for the captions
        # Sums up the F1 scores and the total number of captions
        for j, (images, captions, lengths, img_ids) in enumerate(data_loader):
            for image, length, img_id in zip(images, lengths, img_ids):
                print(j)
                total_count += 1
                image = image.view(1, *image.size())
                image = image.to(device)
                #print(image.size())

                with torch.no_grad():
                    feature = encoder(image)

                    start_token = torch.tensor(
                        vocab.word2idx['<start>']).to(device)
                    sampled_ids = decoder.sample(feature, start_token)

                #sampled_ids = sampled_ids[0].cpu().numpy()          # (1, max_seq_length) -> (max_seq_length)

                # Convert word_ids to words
                sampled_caption = []
                for word_id in sampled_ids:
                    word = vocab.idx2word[word_id]
                    if word == '<end>':
                        break
                    sampled_caption.append(word)

                contained = False
                for word in sampled_caption:
                    if word in object_names:
                        contained = True
                if contained:
                    f1_count += 1

                # removes start token
                # not needed for parallel captioner
                # sentence = ' '.join(sampled_caption[1:])
                sentence = ' '.join(sampled_caption)

                # for the general evaluation
                if str(img_id) not in id_occurences:
                    results.append({
                        "image_id": int(img_id),
                        "caption": sentence
                    })
                    id_occurences[str(img_id)] = 1
        #name = 'alexnet'
        with open(f'{name}_val_results', 'w') as outfile:
            json.dump(results, outfile)
        print("saved")

        # Evaluation section
        coco = COCO(object_class)
        cocoRes = coco.loadRes(f'{name}_val_results')

        # create cocoEval object by taking coco and cocoRes
        cocoEval = COCOEvalCap(coco, cocoRes, 'corpus')

        # please remove this line when evaluating the full validation set
        cocoEval.params['image_id'] = cocoRes.getImgIds()

        cocoEval.evaluate()

        # write the results to the object file

        with open(os.path.join(args.results_path, f'{name}_scores.txt'),
                  'a') as f:
            for metric, score in cocoEval.eval.items():
                f.write('%s: %.3f \n' % (metric, score))
            f.write('F1: %.3f' % (f1_count / total_count))
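
# Compact sketch of the per-object bookkeeping above: the reported "F1" is the
# fraction of generated captions that mention any held-out object word. (The full
# F1 protocol from the novel-object-captioning literature also uses ground-truth
# labels; this mirrors only what the loop above actually counts.)
def mention_rate(captions, object_words):
    object_words = set(object_words)
    hits = sum(1 for cap in captions if set(cap.lower().split()) & object_words)
    return hits / len(captions) if captions else 0.0
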
Пример #26
0
def language_eval(dataset, preds, preds_n, eval_kwargs, split):
    model_id = eval_kwargs['id']
    eval_oracle = eval_kwargs.get('eval_oracle', 0)

    # create output dictionary
    out = {}

    if len(preds_n) > 0:
        # vocab size and novel sentences
        if 'coco' in dataset:
            dataset_file = 'data/dataset_coco.json'
        elif 'flickr30k' in dataset or 'f30k' in dataset:
            dataset_file = 'data/dataset_flickr30k.json'
        training_sentences = set([
            ' '.join(__['tokens'])
            for _ in json.load(open(dataset_file))['images']
            if not _['split'] in ['val', 'test'] for __ in _['sentences']
        ])
        generated_sentences = set([_['caption'] for _ in preds_n])
        novels = generated_sentences - training_sentences
        out['novel_sentences'] = float(len(novels)) / len(preds_n)
        tmp = [_.split() for _ in generated_sentences]
        words = []
        for _ in tmp:
            words += _
        out['vocab_size'] = len(set(words))

    # encoder.FLOAT_REPR = lambda o: format(o, '.3f')

    cache_path = os.path.join('eval_results/',
                              '.cache_' + model_id + '_' + split + '.json')

    coco = getCOCO(dataset)
    valids = coco.getImgIds()

    # filter results to only those in MSCOCO validation set
    preds_filt = [p for p in preds if p['image_id'] in valids]
    mean_perplexity = sum([_['perplexity']
                           for _ in preds_filt]) / len(preds_filt)
    mean_entropy = sum([_['entropy'] for _ in preds_filt]) / len(preds_filt)
    print('using %d/%d predictions' % (len(preds_filt), len(preds)))
    json.dump(preds_filt,
              open(cache_path,
                   'w'))  # serialize to temporary json file. Sigh, COCO API...

    cocoRes = coco.loadRes(cache_path)
    cocoEval = COCOEvalCap(coco, cocoRes)
    cocoEval.params['image_id'] = cocoRes.getImgIds()
    cocoEval.evaluate()

    for metric, score in cocoEval.eval.items():
        out[metric] = score
    # Add mean perplexity
    out['perplexity'] = mean_perplexity
    out['entropy'] = mean_entropy

    imgToEval = cocoEval.imgToEval
    for k in list(imgToEval.values())[0]['SPICE'].keys():
        if k != 'All':
            out['SPICE_' + k] = np.array(
                [v['SPICE'][k]['f'] for v in imgToEval.values()])
            out['SPICE_' + k] = (out['SPICE_' + k][out['SPICE_' +
                                                       k] == out['SPICE_' +
                                                                 k]]).mean()
    for p in preds_filt:
        image_id, caption = p['image_id'], p['caption']
        imgToEval[image_id]['caption'] = caption

    if len(preds_n) > 0:
        import eval_multi
        cache_path_n = os.path.join(
            'eval_results/', '.cache_' + model_id + '_' + split + '_n.json')
        spice_n = eval_multi.eval_spice_n(dataset, preds_n, model_id, split)
        out.update(spice_n['overall'])
        div_stats = eval_multi.eval_div_stats(dataset, preds_n, model_id,
                                              split)
        out.update(div_stats['overall'])
        if eval_oracle:
            oracle = eval_multi.eval_oracle(dataset, preds_n, model_id, split)
            out.update(oracle['overall'])
        else:
            oracle = None
        self_cider = eval_multi.eval_self_cider(dataset, preds_n, model_id,
                                                split)
        out.update(self_cider['overall'])
        with open(cache_path_n, 'w') as outfile:
            json.dump(
                {
                    'spice_n': spice_n,
                    'div_stats': div_stats,
                    'oracle': oracle,
                    'self_cider': self_cider
                }, outfile)

    out['bad_count_rate'] = sum([count_bad(_['caption'])
                                 for _ in preds_filt]) / float(len(preds_filt))
    outfile_path = os.path.join('eval_results/',
                                model_id + '_' + split + '.json')
    with open(outfile_path, 'w') as outfile:
        json.dump({'overall': out, 'imgToEval': imgToEval}, outfile)

    return out
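
# Stand-alone sketch of the diversity statistics computed above: the share of
# generated captions never seen in training and the size of the generated
# vocabulary (illustrative helper, not part of the original function).
def diversity_stats(generated_captions, training_captions):
    generated = set(generated_captions)
    novel = generated - set(training_captions)
    vocab = {word for caption in generated for word in caption.split()}
    return {"novel_sentences": len(novel) / len(generated_captions),
            "vocab_size": len(vocab)}
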
Пример #27
0
def test():
    word_index = dataset.tokenizer.word_index
    gen = dataset.generator(b_s, 'test', show_ids=True)
    m.load_weights('data/mpiimd_model.pkl')
    seen = 0
    hypothesis_for_coco = []
    references_for_coco = []
    images_for_coco = []

    while seen < dataset.nb_test_samples:
        [X_video, X_caption], Y_gt, snippet_ids = gen.next()
        Y_t = m.predict_on_batch([X_video, X_caption])
        Y_t = np.argmax(Y_t, axis=-1)

        for t in range(dataset.max_caption_len - 1):
            X_caption[:, t + 1] = Y_t[:, t]
            if all(w == 'stoptoken' for w in [
                    word_index.keys()[word_index.values().index(Yi)]
                    for Yi in X_caption[:, t + 1]
            ]):
                break
            Y_t = m.predict_on_batch([X_video, X_caption])
            Y_t = np.argmax(Y_t, axis=-1)

        for pred, gt, snippet_id in zip(X_caption, Y_gt, snippet_ids):
            gt_caption = []
            pred_caption = []
            for gt_t in gt:
                gt_word = word_index.keys()[word_index.values().index(gt_t[0])]
                gt_caption.append(gt_word)
                if gt_word == 'stoptoken':
                    break
            for pred_t in pred:
                pred_word = word_index.keys()[word_index.values().index(
                    pred_t)]
                pred_caption.append(pred_word)
                if pred_word == 'stoptoken':
                    break

            gt_caption = gt_caption[1:-1]
            pred_caption = pred_caption[1:-1]

            if not any(h for h in hypothesis_for_coco
                       if h['image_id'] == snippet_id):
                images_for_coco.append({
                    "id": snippet_id,
                    "url": "",
                    "file_name": ""
                })
                hypothesis_for_coco.append({
                    "image_id":
                    snippet_id,
                    "id":
                    seen,
                    "caption":
                    ' '.join([
                        c.decode('utf-8').encode('ascii', 'ignore')
                        for c in pred_caption
                    ])
                })
            references_for_coco.append({
                "image_id":
                snippet_id,
                "id":
                seen,
                "caption":
                ' '.join([
                    c.decode('utf-8').encode('ascii', 'ignore')
                    for c in gt_caption
                ])
            })

            seen += 1

            print('%d / %d - %s GT: %s' % (seen, dataset.nb_test_samples,
                                           snippet_id, ' '.join(gt_caption)))
            print('%d / %d - %s PR: %s' % (seen, dataset.nb_test_samples,
                                           snippet_id, ' '.join(pred_caption)))

    # Evaluation
    print(exp_name)
    import json

    json.dump(hypothesis_for_coco, open("%s_hypothesis.json" % exp_name, 'w'))
    json.dump(
        {
            'images': images_for_coco,
            'annotations': references_for_coco,
            'type': 'captions',
            'info': {},
            'licenses': []
        }, open("%s_references.json" % exp_name, 'w'))

    sys.path.append('coco_caption')
    from pycocotools.coco import COCO
    from pycocoevalcap.eval import COCOEvalCap

    coco = COCO("%s_references.json" % exp_name)
    cocoRes = coco.loadRes("%s_hypothesis.json" % exp_name)
    cocoEval = COCOEvalCap(coco, cocoRes)
    cocoEval.evaluate()

    for metric, score in cocoEval.eval.items():
        print('%s: %.3f' % (metric, score))
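
# Minimal shape of the ad-hoc COCO-style reference file written above, with only
# the keys the COCO API needs; the ids and caption are illustrative placeholders.
example_references = {
    "images": [{"id": "clip_0001", "url": "", "file_name": ""}],
    "annotations": [{"image_id": "clip_0001", "id": 0, "caption": "someone opens the door"}],
    "type": "captions",
    "info": {},
    "licenses": [],
}
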
Пример #28
0
def language_eval(dataset, preds, model_id, split):
    import sys
    if 'coco' in dataset:
        sys.path.append("coco-caption")
        annFile = 'coco-caption/annotations/captions_val2014.json'
    else:
        sys.path.append("f30k-caption")
        annFile = 'f30k-caption/annotations/dataset_flickr30k.json'
    from pycocotools.coco import COCO
    from pycocoevalcap.eval import COCOEvalCap

    encoder.FLOAT_REPR = lambda o: format(o, '.3f')

    if not os.path.isdir('eval_results'):
        os.mkdir('eval_results')
    cache_path = os.path.join('eval_results/',
                              model_id + '_' + split + '.json')

    coco = COCO(annFile)
    valids = coco.getImgIds()

    preds_filt = [p for p in preds if p['image_id'] in valids]
    print('using %d/%d predictions' % (len(preds_filt), len(preds)))
    json.dump(preds_filt,
              open(cache_path,
                   'w'))  # serialize to temporary json file. Sigh, COCO API...

    cocoRes = coco.loadRes(cache_path)
    cocoEval = COCOEvalCap(coco, cocoRes)
    cocoEval.params['image_id'] = cocoRes.getImgIds()
    cocoEval.evaluate()

    # create output dictionary
    out = {}
    for metric, score in cocoEval.eval.items():
        out[metric] = score

    imgToEval = cocoEval.imgToEval
    # collect SPICE_sub_score
    for k in list(imgToEval.values())[0]['SPICE'].keys():
        if k != 'All':
            out['SPICE_' + k] = np.array(
                [v['SPICE'][k]['f'] for v in imgToEval.values()])
            out['SPICE_' + k] = (out['SPICE_' + k][out['SPICE_' +
                                                       k] == out['SPICE_' +
                                                                 k]]).mean()
    for p in preds_filt:
        image_id, caption = p['image_id'], p['caption']
        imgToEval[image_id]['caption'] = caption
    for i in range(len(preds)):
        if preds[i]['image_id'] in imgToEval:
            preds[i]['eval'] = imgToEval[preds[i]['image_id']]
    # filter results to only those in MSCOCO validation set (will be about a third)
    json.dump(
        preds,
        open(
            os.path.join('eval_results/',
                         model_id + '_' + split + '_nofilt.json'), 'w'))
    with open(cache_path, 'w') as outfile:
        json.dump({'overall': out, 'imgToEval': imgToEval}, outfile)

    return out
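
# The indexing trick above, x[x == x], drops NaN entries (NaN never compares equal
# to itself) before averaging; np.nanmean expresses the same thing directly.
import numpy as np

values = np.array([0.2, np.nan, 0.4])
assert np.isclose(values[values == values].mean(), np.nanmean(values))
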
Пример #29
0
def language_eval(type, preds, model_id, split):
    import sys
    if 'coco' in type:
        annFile = 'coco-caption/annotations/captions_val2014.json'
        sys.path.append("coco-caption")
        print("Load reference file from: {}".format(annFile))
        from pycocotools.coco import COCO
        from pycocoevalcap.eval import COCOEvalCap
    elif '30k' in type:
        annFile = 'coco-caption/annotations/flickr30k_val.json'
        sys.path.append("coco-caption")
        print("Load reference file from: {}".format(annFile))
        from pycocotools.coco import COCO
        from pycocoevalcap.eval import COCOEvalCap
    elif 'zh' in type:
        annFile = 'data/aic_i2t/eval_reference.json'
        sys.path.append("AI_Challenger/Evaluation/caption_eval")
        print("Load reference file from: {}".format(annFile))
        from coco_caption.pycxtools.coco import COCO
        from coco_caption.pycxevalcap.eval import COCOEvalCap
    else:
        raise Exception('Current eval type is not recognizable.')

    encoder.FLOAT_REPR = lambda o: format(o, '.3f')

    if not os.path.isdir('eval_results'):
        os.mkdir('eval_results')
    cache_path = os.path.join('eval_results/',
                              type + '_' + model_id + '_' + split + '.json')
    print("Load cache path is:" + cache_path)
    coco = COCO(annFile)
    valids = coco.getImgIds()
    # filter results to only those in MSCOCO validation set (will be about a third)
    if 'coco' in type:
        preds_filt = [p for p in preds if p['image_id'] in valids]
        print('using %d/%d predictions' % (len(preds_filt), len(preds)))
        json.dump(preds_filt, open(
            cache_path,
            'w'))  # serialize to temporary json file. Sigh, COCO API...
    elif '30k' in type:
        preds_filt = [{
            'caption': p['caption'],
            'image_id': str(p['image_id'])
        } for p in preds if p['image_id'] in valids]
    else:
        json.dump(preds, open(
            cache_path,
            'w'))  # serialize to temporary json file. Sigh, COCO API...

    cocoRes = coco.loadRes(cache_path)
    cocoEval = COCOEvalCap(coco, cocoRes)
    cocoEval.params['image_id'] = cocoRes.getImgIds()
    print(len(set(cocoRes.getImgIds()) & set(coco.getImgIds())))
    cocoEval.evaluate()

    # create output dictionary
    out = {}
    for metric, score in cocoEval.eval.items():
        out[metric] = score

    imgToEval = cocoEval.imgToEval
    # for p in preds:
    #     image_id, caption = p['image_id'], p['caption']
    #     imgToEval[image_id]['caption'] = caption
    with open(cache_path, 'w') as outfile:
        json.dump({'overall': out, 'imgToEval': imgToEval}, outfile)

    return out