Example #1
def language_eval(dataset, preds, model_id, split):
    import sys
    sys.path.append("coco-caption")
    import os
    import json
    # `utils.pickle_dump` and `count_bad` are assumed to be provided by the
    # surrounding module in which this function is defined.
    # if 'coco' in dataset:
    #     annFile = 'coco-caption/annotations/captions_val2014.json'
    # elif 'flickr30k' in dataset or 'f30k' in dataset:
    #     annFile = 'coco-caption/f30k_captions4eval.json'
    annFile = 'C:\\Users\\anke\\PycharmProjects\\pythonProject\\scripts\\wiki_captions4eval.json'
    from cococaption.pycocotools.coco import COCO
    from cococaption.pycocoevalcap.eval import COCOEvalCap

    # encoder.FLOAT_REPR = lambda o: format(o, '.3f')

    if not os.path.isdir('eval_results'):
        os.mkdir('eval_results')
    cache_path = os.path.join('eval_results/',
                              '.cache_' + model_id + '_' + split + '.json')

    coco = COCO(annFile)
    valids = coco.getImgIds()

    # filter results to only those in MSCOCO validation set (will be about a third)
    # preds_filt = [p for p in preds if p['image_id'] in valids]  # uncomment for the non-folder test
    preds_filt = preds  # comment this line out for the non-folder test
    print('using %d/%d predictions' % (len(preds_filt), len(preds)))
    with open(cache_path, 'wb') as f:
        utils.pickle_dump(
            preds_filt,
            f)  # serialize to temporary json file. Sigh, COCO API...

    cocoRes = coco.loadRes(cache_path)
    cocoEval = COCOEvalCap(coco, cocoRes)
    cocoEval.params['image_id'] = cocoRes.getImgIds()
    cocoEval.evaluate()

    # create output dictionary
    out = {}
    for metric, score in cocoEval.eval.items():
        out[metric] = score

    imgToEval = cocoEval.imgToEval
    for p in preds_filt:
        image_id, caption = p['image_id'], p['caption']
        imgToEval[image_id]['caption'] = caption

    out['bad_count_rate'] = sum([count_bad(_['caption'])
                                 for _ in preds_filt]) / float(len(preds_filt))
    outfile_path = os.path.join('eval_results/',
                                model_id + '_' + split + '.json')
    with open(outfile_path, 'w') as outfile:
        json.dump({'overall': out, 'imgToEval': imgToEval}, outfile)

    return out
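
A minimal, hypothetical call site for language_eval (not part of the original module): preds is a list of {'image_id', 'caption'} dicts, and the function both returns the metric dict and writes it to eval_results/<model_id>_<split>.json. The image ids and names below are assumptions for illustration only.

# Hypothetical usage sketch, assuming the cococaption package and the
# annotation JSON referenced above are available:
preds = [
    {'image_id': 42, 'caption': 'a man riding a horse on a beach'},
    {'image_id': 43, 'caption': 'a plate of food on a wooden table'},
]
scores = language_eval('wiki', preds, model_id='baseline', split='val')
print(scores.get('CIDEr'), scores.get('Bleu_4'))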
Example #2
def evalscores(hypotheses, references):
	targ_annotations = list()
	res_annotations = list()
	img_annotations = list()
	coco_ann_file = 'coco.json'
	res_ann_file = 'res.json'

	for i in range(len(hypotheses)):
		targ_anno_dict = {"image_id": i,
						  "id": i,
						  "caption": " ".join(references[i][0])}

		targ_annotations.append(targ_anno_dict)

		res_anno_dict = {"image_id": i,
						 "id": i,
						 "caption": " ".join(hypotheses[i])}

		res_annotations.append(res_anno_dict)

		image_anno_dict = {"id": i,
						   "file_name": i}

		img_annotations.append(image_anno_dict)

	coco_dict = {"type": 'captions', 
				 "images": img_annotations, 
				 "annotations": targ_annotations}

	res_dict = {"type": 'captions', 
				"images": img_annotations, 
				"annotations": res_annotations}

	with open(coco_ann_file, 'w') as fp:
		json.dump(coco_dict, fp)

	with open(res_ann_file, 'w') as fs:
		json.dump(res_annotations, fs)

	coco = COCO(coco_ann_file)
	cocoRes = coco.loadRes(res_ann_file)

	cocoEval = COCOEvalCap(coco, cocoRes)

	cocoEval.evaluate()

	for metric, score in cocoEval.eval.items():
		print('%s: %.3f'%(metric, score))
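
A toy invocation of evalscores, assuming json, COCO, and COCOEvalCap are imported at module level from the coco-caption toolkit. Each hypothesis is a list of tokens, and each entry of references is a list of tokenized reference captions (only the first reference is used). Note that some versions of the COCO API also expect 'info' and 'licenses' keys in the annotation dict, so the minimal coco_dict above may need extending.

# Hypothetical example data (not from the original module):
hypotheses = [
    ['a', 'dog', 'runs', 'on', 'the', 'grass'],
    ['a', 'man', 'rides', 'a', 'bike'],
]
references = [
    [['a', 'dog', 'is', 'running', 'on', 'the', 'grass']],
    [['a', 'man', 'is', 'riding', 'a', 'bicycle']],
]
evalscores(hypotheses, references)  # prints Bleu_1..4, METEOR, ROUGE_L, CIDEr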
Example #3
def evaluate(loader, decoder, beam_size, epoch, vocab_size, word_map):
    # Relies on module-level globals (`device`, `annFile`) and on COCO /
    # COCOEvalCap being imported at module level.
    decoder.eval()
    results = []
    rev_word_map = {v: k for k, v in word_map.items()}

    # For each image
    for i, (img, image_id, previous_caption, prev_caplen) in enumerate(
            tqdm(loader, desc="EVALUATING AT BEAM SIZE " + str(beam_size))):

        k = beam_size
        infinite_pred = False

        image_features = img.to(device)
        image_id = image_id.to(device)  # (1,1)
        encoded_previous_captions = previous_caption.to(device)
        previous_cap_length = prev_caplen.to(device)
        img_mean = image_features.mean(1)
        previous_encoded_h, previous_encoded_m, final_hidden, prev_cap_mask = decoder.caption_encoder(
            encoded_previous_captions, previous_cap_length)
        # Expand all
        image_features = image_features.expand(k, -1, -1)
        img_mean = img_mean.expand(k, -1)
        previous_encoded_h = previous_encoded_h.expand(k, -1, -1)
        previous_encoded_m = previous_encoded_m.expand(k, -1, -1)
        final_hidden = final_hidden.expand(k, -1)
        prev_cap_mask = prev_cap_mask.expand(k, -1)

        # Tensor to store top k previous words at each step; now they're just <start>
        k_prev_words = torch.LongTensor([[word_map['<start>']]] * k).to(
            device)  # (k, 1)

        # Tensor to store top k sequences; now they're just <start>
        seqs = k_prev_words  # (k, 1)

        # Tensor to store top k sequences' scores; now they're just 0
        top_k_scores = torch.zeros(k, 1).to(device)  # (k, 1)

        # Lists to store completed sequences and scores
        complete_seqs = list()
        complete_seqs_scores = list()

        # Start decoding
        step = 1

        h1, c1 = decoder.init_hidden_state(k)  # (k, decoder_dim)
        h2, c2 = decoder.init_hidden_state(k)  # (k, decoder_dim)

        # s is a number less than or equal to k, because sequences are removed from this process once they hit <end>
        while True:

            embeddings = decoder.embed(k_prev_words).squeeze(1)
            topdown_input = torch.cat([embeddings, final_hidden, h2, img_mean],
                                      dim=1)
            h1, c1 = decoder.attention_lstm(topdown_input, (h1, c1))
            attend_cap, alpha_c = decoder.caption_attention(
                previous_encoded_h, h1, embeddings, prev_cap_mask)
            attend_img = decoder.visual_attention(image_features, h1)
            language_input = torch.cat([h1, attend_cap, attend_img], dim=1)
            selected_memory = decoder.select(previous_encoded_m, alpha_c)
            h2, c2 = decoder.copy_lstm(language_input, (h2, c2),
                                       selected_memory)
            scores = decoder.fc(h2)
            scores = F.log_softmax(scores, dim=1)

            scores = top_k_scores.expand_as(scores) + scores  # (s, vocab_size)

            # For the first step, all k points will have the same scores (since same k previous words, h, c)
            if step == 1:
                top_k_scores, top_k_words = scores[0].topk(k, 0, True,
                                                           True)  # (s)
            else:
                # Unroll and find top scores, and their unrolled indices
                top_k_scores, top_k_words = scores.view(-1).topk(
                    k, 0, True, True)  # (s)

            # Convert unrolled indices to actual indices of scores
            # (use integer floor-division: on newer PyTorch, `/` returns floats
            # that cannot be used to index tensors)
            prev_word_inds = top_k_words // vocab_size  # (s)
            next_word_inds = top_k_words % vocab_size  # (s)

            # Add new words to sequences
            seqs = torch.cat(
                [seqs[prev_word_inds],
                 next_word_inds.unsqueeze(1)], dim=1)  # (s, step+1)

            # Which sequences are incomplete (didn't reach <end>)?
            incomplete_inds = [
                ind for ind, next_word in enumerate(next_word_inds)
                if next_word != word_map['<end>']
            ]
            complete_inds = list(
                set(range(len(next_word_inds))) - set(incomplete_inds))

            # Set aside complete sequences
            if len(complete_inds) > 0:
                complete_seqs.extend(seqs[complete_inds].tolist())
                complete_seqs_scores.extend(top_k_scores[complete_inds])
            k -= len(complete_inds)  # reduce beam length accordingly

            # Proceed with incomplete sequences
            if k == 0:
                break

            seqs = seqs[incomplete_inds]
            h1 = h1[prev_word_inds[incomplete_inds]]
            c1 = c1[prev_word_inds[incomplete_inds]]
            h2 = h2[prev_word_inds[incomplete_inds]]
            c2 = c2[prev_word_inds[incomplete_inds]]
            image_features = image_features[prev_word_inds[incomplete_inds]]
            img_mean = img_mean[prev_word_inds[incomplete_inds]]
            final_hidden = final_hidden[prev_word_inds[incomplete_inds]]
            previous_encoded_h = previous_encoded_h[
                prev_word_inds[incomplete_inds]]
            previous_encoded_m = previous_encoded_m[
                prev_word_inds[incomplete_inds]]
            prev_cap_mask = prev_cap_mask[prev_word_inds[incomplete_inds]]

            top_k_scores = top_k_scores[incomplete_inds].unsqueeze(1)
            k_prev_words = next_word_inds[incomplete_inds].unsqueeze(1)

            # Break if things have been going on too long
            if step > 50:
                infinite_pred = True
                break
            step += 1

        if infinite_pred is not True:
            i = complete_seqs_scores.index(max(complete_seqs_scores))
            seq = complete_seqs[i]
        else:
            seq = seqs[0][:18]
            seq = [seq[i].item() for i in range(len(seq))]

        # Construct Sentence
        sen_idx = [
            w for w in seq if w not in
            {word_map['<start>'], word_map['<end>'], word_map['<pad>']}
        ]
        sentence = ' '.join(
            [rev_word_map[sen_idx[i]] for i in range(len(sen_idx))])
        item_dict = {"image_id": image_id.item(), "caption": sentence}
        results.append(item_dict)

    print("Calculating Evalaution Metric Scores......\n")
    resFile = 'cococaption/results/captions_val2014_results_' + str(
        epoch) + '.json'
    evalFile = 'cococaption/results/captions_val2014_eval_' + str(
        epoch) + '.json'
    # Calculate Evaluation Scores
    with open(resFile, 'w') as wr:
        json.dump(results, wr)

    coco = COCO(annFile)
    cocoRes = coco.loadRes(resFile)
    # create cocoEval object by taking coco and cocoRes
    cocoEval = COCOEvalCap(coco, cocoRes)
    # evaluate on a subset of images
    # please remove this line when evaluating the full validation set
    cocoEval.params['image_id'] = cocoRes.getImgIds()
    # evaluate results
    cocoEval.evaluate()
    # Save Scores for all images in resFile
    with open(evalFile, 'w') as w:
        json.dump(cocoEval.eval, w)

    return cocoEval.eval['CIDEr'], cocoEval.eval['Bleu_4']
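
evaluate returns the epoch's CIDEr and BLEU-4 scores, so a typical use is checkpoint selection inside a training loop. The sketch below is an assumption about the surrounding training code (train, train_loader, val_loader, num_epochs are hypothetical names), not part of the original script.

# Hypothetical epoch loop using the returned metrics for model selection:
best_cider = 0.0
for epoch in range(num_epochs):
    train(train_loader, decoder, epoch)
    cider, bleu4 = evaluate(val_loader, decoder, beam_size=3, epoch=epoch,
                            vocab_size=len(word_map), word_map=word_map)
    if cider > best_cider:
        best_cider = cider
        torch.save(decoder.state_dict(), 'BEST_checkpoint.pth')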
Example #4
def validate(val_loader, encoder, decoder, beam_size, epoch, vocab_size):
    """
    Funtion to validate over the complete dataset
    """
    encoder.eval()
    decoder.eval()
    results = []

    for i, (img, image_id) in enumerate(
            tqdm(val_loader,
                 desc="EVALUATING AT BEAM SIZE " + str(beam_size))):

        k = beam_size
        infinite_pred = False

        # Encode
        image = img.to(device)  # (1, 3, 224, 224)
        enc_image, global_features = encoder(
            image)  # enc_image of shape (1,num_pixels,features)
        # Flatten encoding
        num_pixels = enc_image.size(1)
        encoder_dim = enc_image.size(2)
        # We'll treat the problem as having a batch size of k
        enc_image = enc_image.expand(
            k, num_pixels, encoder_dim)  # (k, num_pixels, encoder_dim)
        # Tensor to store top k previous words at each step; now they're just <start>
        k_prev_words = torch.LongTensor([[word_map['<start>']]] * k).to(
            device)  # (k, 1)
        # Tensor to store top k sequences; now they're just <start>
        seqs = k_prev_words  # (k, 1)
        # Tensor to store top k sequences' scores; now they're just 0
        top_k_scores = torch.zeros(k, 1).to(device)  # (k, 1)
        # Lists to store completed sequences, their alphas and scores
        complete_seqs = list()
        complete_seqs_scores = list()
        # Start decoding
        step = 1
        h, c = decoder.init_hidden_state(enc_image)
        spatial_image = F.relu(
            decoder.encoded_to_hidden(enc_image))  # (k,num_pixels,hidden_size)
        global_image = F.relu(
            decoder.global_features(global_features))  # (1,embed_dim)
        # s is a number less than or equal to k, because sequences are removed from this process once they hit <end>
        while True:
            embeddings = decoder.embedding(k_prev_words).squeeze(
                1)  # (k,embed_dim)
            inputs = torch.cat(
                (embeddings, global_image.expand_as(embeddings)), dim=1)
            h, c, st = decoder.LSTM(inputs,
                                    (h, c))  # (batch_size_t, hidden_size)
            # Run the adaptive attention model
            out_l, _, _ = decoder.adaptive_attention(spatial_image, h, st)
            # Compute the probability over the vocabulary
            scores = decoder.fc(out_l)  # (batch_size, vocab_size)
            scores = F.log_softmax(scores, dim=1)  # (s, vocab_size)
            # (k,1) will be (k,vocab_size), then (k,vocab_size) + (s,vocab_size) --> (s, vocab_size)
            scores = top_k_scores.expand_as(scores) + scores
            # For the first step, all k points will have the same scores (since same k previous words, h, c)
            if step == 1:
                #Remember: torch.topk returns the top k scores in the first argument, and their respective indices in the second argument
                top_k_scores, top_k_words = scores[0].topk(k, 0, True,
                                                           True)  # (s)
            else:
                # Unroll and find top scores, and their unrolled indices
                top_k_scores, top_k_words = scores.view(-1).topk(
                    k, 0, True, True)  # (s)

            # Convert unrolled indices to actual indices of scores
            # (use integer floor-division: on newer PyTorch, `/` returns floats
            # that cannot be used to index tensors)
            prev_word_inds = top_k_words // vocab_size  # (s)
            next_word_inds = top_k_words % vocab_size  # (s)
            # Add new words to sequences, alphas
            seqs = torch.cat(
                [seqs[prev_word_inds],
                 next_word_inds.unsqueeze(1)], dim=1)  # (s, step+1)
            # Which sequences are incomplete (didn't reach <end>)?
            incomplete_inds = [
                ind for ind, next_word in enumerate(next_word_inds)
                if next_word != word_map['<end>']
            ]
            complete_inds = list(
                set(range(len(next_word_inds))) - set(incomplete_inds))

            # Set aside complete sequences
            if len(complete_inds) > 0:
                complete_seqs.extend(seqs[complete_inds].tolist())
                complete_seqs_scores.extend(top_k_scores[complete_inds])
            k -= len(complete_inds)  # reduce beam length accordingly

            if k == 0:
                break

            # Proceed with incomplete sequences
            seqs = seqs[incomplete_inds]
            h = h[prev_word_inds[incomplete_inds]]
            c = c[prev_word_inds[incomplete_inds]]
            spatial_image = spatial_image[prev_word_inds[incomplete_inds]]
            top_k_scores = top_k_scores[incomplete_inds].unsqueeze(1)
            k_prev_words = next_word_inds[incomplete_inds].unsqueeze(1)

            # Break if things have been going on too long
            if step > 50:
                infinite_pred = True
                break

            step += 1

        if infinite_pred is not True:
            i = complete_seqs_scores.index(max(complete_seqs_scores))
            seq = complete_seqs[i]
        else:
            seq = seqs[0][:20]
            seq = [seq[i].item() for i in range(len(seq))]

        # Construct Sentence
        sen_idx = [
            w for w in seq if w not in
            {word_map['<start>'], word_map['<end>'], word_map['<pad>']}
        ]
        sentence = ' '.join(
            [rev_word_map[sen_idx[i]] for i in range(len(sen_idx))])
        item_dict = {"image_id": image_id.item(), "caption": sentence}
        results.append(item_dict)

    print("Calculating Evalaution Metric Scores......\n")

    resFile = 'cococaptioncider/results/captions_val2014_results_' + str(
        epoch) + '.json'
    evalFile = 'cococaptioncider/results/captions_val2014_eval_' + str(
        epoch) + '.json'
    # Calculate Evaluation Scores
    with open(resFile, 'w') as wr:
        json.dump(results, wr)

    coco = COCO(annFile)
    cocoRes = coco.loadRes(resFile)
    # create cocoEval object by taking coco and cocoRes
    cocoEval = COCOEvalCap(coco, cocoRes)
    # evaluate on a subset of images
    # please remove this line when evaluating the full validation set
    cocoEval.params['image_id'] = cocoRes.getImgIds()
    # evaluate results
    cocoEval.evaluate()
    # Save Scores for all images in resFile
    with open(evalFile, 'w') as w:
        json.dump(cocoEval.eval, w)

    return cocoEval.eval['CIDEr'], cocoEval.eval['Bleu_4']
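
Both beam searches above flatten the (k, vocab_size) score matrix before topk and then recover the beam index and the word index with floor division and modulo. A small standalone illustration of that index arithmetic, with a made-up vocabulary size:

import torch

vocab_size = 10                               # toy vocabulary size, for illustration only
scores = torch.randn(3, vocab_size)           # 3 live beams
top_k_scores, top_k_words = scores.view(-1).topk(3)
prev_word_inds = top_k_words // vocab_size    # which beam each candidate extends
next_word_inds = top_k_words % vocab_size     # which word id is appended
print(prev_word_inds, next_word_inds)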
Example #5
def evaluate_full(loader, dae_ar, decoder, beam_size, epoch, word_map):
    # Relies on module-level globals (`device`, `annFile`) and on COCO /
    # COCOEvalCap being imported at module level.
    vocab_size = len(word_map)
    decoder.eval()
    dae_ar.eval()
    results = []
    rev_word_map = {v: k for k, v in word_map.items()}

    for i, (img, image_id, previous_caption, prev_caplen) in enumerate(
            tqdm(loader, desc="EVALUATING AT BEAM SIZE " + str(beam_size))):
        k = beam_size
        infinite_pred = False

        image_features = img.to(device)
        image_id = image_id.to(device)  # (1,1)
        encoded_previous_captions = previous_caption.to(device)
        previous_cap_length = prev_caplen.to(device)
        img_mean = image_features.mean(1)

        eprevious_encoded_h, eprevious_encoded_m, efinal_hidden, eprev_cap_mask = decoder.caption_encoder(
            encoded_previous_captions, previous_cap_length)
        dprevious_encoded, dfinal_hidden, dprev_caption_mask = dae_ar.dae.caption_encoder(
            encoded_previous_captions, previous_cap_length)
        image_features = image_features.expand(k, -1, -1)
        img_mean = img_mean.expand(k, -1)
        eprevious_encoded_h = eprevious_encoded_h.expand(k, -1, -1)
        eprevious_encoded_m = eprevious_encoded_m.expand(k, -1, -1)
        efinal_hidden = efinal_hidden.expand(k, -1)
        eprev_cap_mask = eprev_cap_mask.expand(k, -1)
        dprevious_encoded = dprevious_encoded.expand(k, -1, -1)
        dprev_cap_mask = dprev_caption_mask.expand(k, -1)
        dfinal_hidden = dfinal_hidden.expand(k, -1)

        k_prev_words = torch.LongTensor([[word_map['<start>']]] * k).to(
            device)  # (k, 1)
        seqs = k_prev_words  # (k, 1)
        top_k_scores = torch.zeros(k, 1).to(device)  # (k, 1)
        complete_seqs = list()
        complete_seqs_scores = list()
        step = 1
        eh1, ec1 = decoder.init_hidden_state(k)  # (k, decoder_dim)
        eh2, ec2 = decoder.init_hidden_state(k)  # (k, decoder_dim)
        dh1, dc1 = dae_ar.dae.init_hidden_state(k)  # (batch_size, decoder_dim)
        dh2, dc2 = dae_ar.dae.init_hidden_state(k)

        while True:
            eembeddings = decoder.embed(k_prev_words).squeeze(1)
            etopdown_input = torch.cat(
                [eembeddings, efinal_hidden, eh2, img_mean], dim=1)
            eh1, ec1 = decoder.attention_lstm(etopdown_input, (eh1, ec1))
            eattend_cap, ealpha_c = decoder.caption_attention(
                eprevious_encoded_h, eh1, eembeddings, eprev_cap_mask)
            eattend_img = decoder.visual_attention(image_features, eh1)
            elanguage_input = torch.cat([eh1, eattend_cap, eattend_img], dim=1)
            eselected_memory = decoder.select(eprevious_encoded_m, ealpha_c)
            eh2, ec2 = decoder.copy_lstm(elanguage_input, (eh2, ec2),
                                         eselected_memory)
            escores = decoder.fc(eh2)
            ########################################################################################
            dembeddings = dae_ar.dae.embed(k_prev_words).squeeze(1)
            dtopdown_input = torch.cat([dembeddings, dfinal_hidden, dh2],
                                       dim=1)
            dh1, dc1 = dae_ar.dae.attention_lstm(dtopdown_input, (dh1, dc1))
            dattend_cap = dae_ar.dae.caption_attention(dprevious_encoded, dh1,
                                                       dprev_cap_mask)
            dlanguage_input = torch.cat([dh1, dattend_cap], dim=1)
            dh2, dc2 = dae_ar.dae.language_lstm(dlanguage_input, (dh2, dc2))
            dscores = dae_ar.dae.fc(dh2)
            ########################################################################################
            # Ensemble the two decoders: average their word distributions in
            # probability space, then move back to log-space for the beam search
            soft_escores = F.softmax(escores, dim=1)
            soft_dscores = F.softmax(dscores, dim=1)
            scores = ((soft_escores + soft_dscores) / 2).log()
            ########################################################################################
            scores = top_k_scores.expand_as(scores) + scores  # (s, vocab_size)
            if step == 1:
                top_k_scores, top_k_words = scores[0].topk(k, 0, True,
                                                           True)  # (s)
            else:
                # Unroll and find top scores, and their unrolled indices
                top_k_scores, top_k_words = scores.view(-1).topk(
                    k, 0, True, True)  # (s)

            prev_word_inds = top_k_words // vocab_size  # (s) floor-division recovers the beam index
            next_word_inds = top_k_words % vocab_size  # (s)
            seqs = torch.cat(
                [seqs[prev_word_inds],
                 next_word_inds.unsqueeze(1)], dim=1)  # (s, step+1)
            incomplete_inds = [
                ind for ind, next_word in enumerate(next_word_inds)
                if next_word != word_map['<end>']
            ]
            complete_inds = list(
                set(range(len(next_word_inds))) - set(incomplete_inds))

            if len(complete_inds) > 0:
                complete_seqs.extend(seqs[complete_inds].tolist())
                complete_seqs_scores.extend(top_k_scores[complete_inds])
            k -= len(complete_inds)

            if k == 0:
                break

            seqs = seqs[incomplete_inds]
            eh1 = eh1[prev_word_inds[incomplete_inds]]
            ec1 = ec1[prev_word_inds[incomplete_inds]]
            eh2 = eh2[prev_word_inds[incomplete_inds]]
            ec2 = ec2[prev_word_inds[incomplete_inds]]
            image_features = image_features[prev_word_inds[incomplete_inds]]
            img_mean = img_mean[prev_word_inds[incomplete_inds]]
            efinal_hidden = efinal_hidden[prev_word_inds[incomplete_inds]]
            eprevious_encoded_h = eprevious_encoded_h[
                prev_word_inds[incomplete_inds]]
            eprevious_encoded_m = eprevious_encoded_m[
                prev_word_inds[incomplete_inds]]
            eprev_cap_mask = eprev_cap_mask[prev_word_inds[incomplete_inds]]
            ###########################################################################################
            dh1 = dh1[prev_word_inds[incomplete_inds]]
            dc1 = dc1[prev_word_inds[incomplete_inds]]
            dh2 = dh2[prev_word_inds[incomplete_inds]]
            dc2 = dc2[prev_word_inds[incomplete_inds]]
            dprevious_encoded = dprevious_encoded[
                prev_word_inds[incomplete_inds]]
            dprev_cap_mask = dprev_cap_mask[prev_word_inds[incomplete_inds]]
            dfinal_hidden = dfinal_hidden[prev_word_inds[incomplete_inds]]
            ###########################################################################################
            top_k_scores = top_k_scores[incomplete_inds].unsqueeze(1)
            k_prev_words = next_word_inds[incomplete_inds].unsqueeze(1)

            if step > 50:
                infinite_pred = True
                break
            step += 1

        if infinite_pred is not True:
            i = complete_seqs_scores.index(max(complete_seqs_scores))
            seq = complete_seqs[i]
        else:
            seq = seqs[0][:18]
            seq = [seq[i].item() for i in range(len(seq))]

        # Construct Sentence
        sen_idx = [
            w for w in seq if w not in
            {word_map['<start>'], word_map['<end>'], word_map['<pad>']}
        ]
        sentence = ' '.join(
            [rev_word_map[sen_idx[i]] for i in range(len(sen_idx))])
        item_dict = {"image_id": image_id.item(), "caption": sentence}
        results.append(item_dict)

    print("Calculating Evalaution Metric Scores......\n")
    resFile = 'cococaption/results/captions_val2014_results_' + str(
        epoch) + '.json'
    evalFile = 'cococaption/results/captions_val2014_eval_' + str(
        epoch) + '.json'
    # Calculate Evaluation Scores
    with open(resFile, 'w') as wr:
        json.dump(results, wr)

    coco = COCO(annFile)
    cocoRes = coco.loadRes(resFile)
    # create cocoEval object by taking coco and cocoRes
    cocoEval = COCOEvalCap(coco, cocoRes)
    # evaluate on a subset of images
    # please remove this line when evaluating the full validation set
    cocoEval.params['image_id'] = cocoRes.getImgIds()
    # evaluate results
    cocoEval.evaluate()
    # Save Scores for all images in resFile
    with open(evalFile, 'w') as w:
        json.dump(cocoEval.eval, w)

    return cocoEval.eval['CIDEr'], cocoEval.eval['Bleu_4']
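
evaluate_full fuses the two decoders by averaging their softmax distributions and only then taking the log, which is not the same as averaging log-probabilities. A tiny standalone illustration of the difference, with made-up logits:

import torch
import torch.nn.functional as F

a = torch.tensor([[2.0, 0.0, -1.0]])   # logits from decoder 1 (made up)
b = torch.tensor([[0.0, 1.0, -2.0]])   # logits from decoder 2 (made up)

prob_avg = ((F.softmax(a, dim=1) + F.softmax(b, dim=1)) / 2).log()
logprob_avg = (F.log_softmax(a, dim=1) + F.log_softmax(b, dim=1)) / 2
print(prob_avg)       # log of the averaged probabilities
print(logprob_avg)    # averaged log-probabilities; generally different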