def main(ckpt_path, gts_name='/gts.json', res_name='/res.json'):
    print("eval_spice.py")
    print(ckpt_path)
    with open(ckpt_path + gts_name) as f:
        gts = json.load(f)
    with open(ckpt_path + res_name) as f:
        res = json.load(f)
    scorer = Spice()
    score, scores = scorer.compute_score(gts, res)
    with open(ckpt_path + '/score.json', 'w') as f:
        json.dump(score, f)
    with open(ckpt_path + '/scores.json', 'w') as f:
        json.dump(scores, f)
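
For reference, a minimal sketch of the gts.json / res.json layout this script assumes: the pycocoevalcap scorers take two dictionaries keyed by the same image ids, references as a list of caption strings and the candidate as a single-element list. File names and ids below are illustrative only.

import json

# Hypothetical contents produced by an earlier step; ids and captions are made up.
gts = {
    "391895": ["a man riding a motorcycle on a dirt road",
               "a person on a motorbike in the countryside"],
    "522418": ["a woman cutting a large cake"],
}
res = {
    "391895": ["a man rides a motorcycle down a dirt road"],
    "522418": ["a woman is cutting a cake"],
}

with open("gts.json", "w") as f:
    json.dump(gts, f)
with open("res.json", "w") as f:
    json.dump(res, f)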
Example No. 2
def compute_scores(gts, res):
    """
    Performs the MS COCO evaluation using the Python 3 implementation (https://github.com/salaniz/pycocoevalcap)

    :param gts: Dictionary mapping image ids to their gold (reference) captions
    :param res: Dictionary mapping image ids to their generated captions
    :print: Evaluation score (the mean of the scores over all instances) for each measure
    """

    # Preprocess captions
    gts = preprocess_captions(gts)
    res = preprocess_captions(res)

    # Set up scorers
    scorers = [
        (Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]),
        (Meteor(), "METEOR"),
        (Rouge(), "ROUGE_L"),
        (Spice(), "SPICE"),
        (Cider(), "CIDEr")
    ]

    # Compute score for each metric
    for scorer, method in scorers:
        print("Computing", scorer.method(), "...")
        score, scores = scorer.compute_score(gts, res)
        if type(method) == list:
            for sc, m in zip(score, method):
                print("%s : %0.3f" % (m, sc))
        else:
            print("%s : %0.3f" % (method, score))
Example No. 3
def main(eval_caption_file, output, zh=False):
    df = pd.read_json(eval_caption_file)
    if zh:
        refs = df.groupby("key")["tokens"].apply(list).to_dict()
    else:
        refs = df.groupby("key")["caption"].apply(list).to_dict()

    from pycocoevalcap.bleu.bleu import Bleu
    from pycocoevalcap.cider.cider import Cider
    from pycocoevalcap.rouge.rouge import Rouge

    scorer = Bleu(zh=zh)
    bleu_scores = coco_score(copy.deepcopy(refs), scorer)
    scorer = Cider(zh=zh)
    cider_score = coco_score(copy.deepcopy(refs), scorer)
    scorer = Rouge(zh=zh)
    rouge_score = coco_score(copy.deepcopy(refs), scorer)

    if not zh:
        from pycocoevalcap.meteor.meteor import Meteor
        scorer = Meteor()
        meteor_score = coco_score(copy.deepcopy(refs), scorer)

        from pycocoevalcap.spice.spice import Spice
        scorer = Spice()
        spice_score = coco_score(copy.deepcopy(refs), scorer)

    with open(output, "w") as f:
        for n in range(4):
            f.write("BLEU-{}: {:6.3f}\n".format(n + 1, bleu_scores[n]))
        f.write("CIDEr: {:6.3f}\n".format(cider_score))
        f.write("ROUGE: {:6.3f}\n".format(rouge_score))
        if not zh:
            f.write("Meteor: {:6.3f}\n".format(meteor_score))
            f.write("SPICE: {:6.3f}\n".format(spice_score))
Example No. 4
    def __init__(self,
                 ground_truth_filenames=None,
                 prediction_filename=None,
                 tious=None,
                 max_proposals=1000,
                 prediction_fields=PREDICTION_FIELDS,
                 verbose=False):
        # Check that the gt and submission files exist and load them
        if len(tious) == 0:
            raise IOError('Please input a valid tIoU.')
        if not ground_truth_filenames:
            raise IOError('Please input a valid ground truth file.')
        if not prediction_filename:
            raise IOError('Please input a valid prediction file.')

        self.verbose = verbose
        self.tious = tious
        self.max_proposals = max_proposals
        self.pred_fields = prediction_fields
        self.ground_truths = self.import_ground_truths(ground_truth_filenames)
        self.prediction = self.import_prediction(prediction_filename)
        self.tokenizer = PTBTokenizer()

        # Set up scorers, if not verbose, we only use the one we're
        # testing on: CIDEr
        if self.verbose:
            self.scorers = [(Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3",
                                       "Bleu_4"]), (Meteor(), "METEOR"),
                            (Rouge(), "ROUGE_L"), (Cider('corpus'), "CIDEr"),
                            (Spice(), "SPICE")]
        else:
            self.scorers = [(Cider('corpus'), "CIDEr")]
Example No. 5
def init_scorer(cached_tokens):
    global CiderD_scorer
    CiderD_scorer = CiderD_scorer or CiderD(df=cached_tokens)
    global Bleu_scorer
    Bleu_scorer = Bleu_scorer or Bleu(4)
    global Spice_scorer
    Spice_scorer = Spice_scorer or Spice()
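
A hedged usage sketch, assuming this function lives in a module that already imports CiderD, Bleu and Spice and defines the three module-level names it updates; cached_tokens is typically "corpus" or the stem of a pre-computed CIDEr document-frequency file.

# Module-level globals must exist before init_scorer reads them.
CiderD_scorer = None
Bleu_scorer = None
Spice_scorer = None

init_scorer("corpus")  # or e.g. the name of a cached-tokens df file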
Example No. 6
    def get_dcc_scores(self):

        imgIds = self.params['image_id']
        # imgIds = self.coco.getImgIds()
        gts = {}
        res = {}
        for imgId in imgIds:
            gts[imgId] = self.coco.imgToAnns[imgId]
            res[imgId] = self.cocoRes.imgToAnns[imgId]

        tokenizer = PTBTokenizer()
        gts = tokenizer.tokenize(gts)
        res = tokenizer.tokenize(res)
        scorers = [(Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]),
                   (Meteor(), "METEOR"), (Rouge(), "ROUGE_L"),
                   (Cider(df='noc_test_freq'), "CIDEr"), (Spice(), "SPICE")]
        score_dict = {}
        for scorer, method in scorers:
            print('computing %s score...' % (scorer.method()))
            score, scores = scorer.compute_score(gts, res)
            if type(method) == list:
                for sc, scs, m in zip(score, scores, method):
                    score_dict[m] = sc
                    print("%s: %0.3f" % (m, sc))
            else:
                score_dict[method] = score
                print("%s: %0.3f" % (method, score))

        return score_dict
Example No. 7
def score(num, DIR):
    print("Testing results on epoch ", num, " in DIR=", DIR)
    print("Loading coco annotations")
    dataDir = '.'
    dataType = 'val2014'
    algName = 'fakecap'
    annFile = '%s/annotations/captions_%s.json' % (dataDir, dataType)
    subtypes = ['results', 'evalImgs', 'eval']
    resFile, evalImgsFile, evalFile = [
        '%s/results/captions_%s_%s_%s.json' % (dataDir, dataType, algName, subtype)
        for subtype in subtypes
    ]
    coco_anns = COCO(annFile)
    print("COCO anns imported")

    path = DIR + str(num) + '_test_result.tar.gz'
    save = pickle.load(open(path, 'rb'))
    cocoRes = {}
    coco = {}
    for key, val in save.items():
        reslst = val[u'res']
        res = []
        for data in reslst:
            if data != u'<SEND>':
                res.append(data)
            else:
                break
        res = res[1:]
        #print "RES: ",reslst
        #print "ANN: ", val[u'ann']
        #res = [word for word in res if word!=u'<SEND>'][1:]
        #print "RES FIXED: ", res

        if len(res) == 0:
            res = [u'a']  #just not to be empty, and it has low low idf
        cocoRes[key] = [{u'caption': ' '.join(res)}]

        #coco[key] = [{u'caption':' '.join(val[u'ann'][1:-1])}]
        coco[key] = coco_anns.imgToAnns[key]
    print('examples')
    for key in list(coco.keys())[:5]:
        print("IMG_NUM=", key)
        print("Annotation: ", '\n'.join(
            [coco[key][i][u'caption'] for i in range(len(coco[key]))]))
        print("Generated data: ", ' '.join(save[key][u'res']))
        print("Cleared generation: ", cocoRes[key][0][u'caption'])

    print('tokenization...')
    tokenizer = PTBTokenizer()
    gts = tokenizer.tokenize(coco)
    res = tokenizer.tokenize(cocoRes)

    print('setting up scorers...')
    scorers = [(Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]),
               (Meteor(), "METEOR"), (Rouge(), "ROUGE_L"), (Cider(), "CIDEr"),
               (Spice(), "SPICE")]

    for scorer, method in scorers:
        print('computing %s score...' % (scorer.method()))
        score, scores = scorer.compute_score(gts, res)
        print(score)
Example No. 8
def score_all(ref, hypo):
    scorers = [(Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]),
               (Meteor(), "METEOR"), (Rouge(), "ROUGE_L"), (Cider(), "CIDEr"),
               (Spice(), "SPICE")]
    final_scores = {}
    for scorer, method in scorers:
        score, scores = scorer.compute_score(ref, hypo)
        if type(score) == list:
            for m, s in zip(method, score):
                final_scores[m] = s
        else:
            final_scores[method] = score

    return final_scores
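
Example call, assuming pycocoevalcap and its Java backends (METEOR, SPICE) are installed; keys and captions are illustrative only.

ref = {"1": ["a cat sits on a mat", "a cat is sitting on the mat"],
       "2": ["a man rides a horse"]}
hypo = {"1": ["a cat sitting on a mat"],
        "2": ["a man riding a horse"]}
print(score_all(ref, hypo))
# -> {'Bleu_1': ..., 'Bleu_2': ..., 'Bleu_3': ..., 'Bleu_4': ...,
#     'METEOR': ..., 'ROUGE_L': ..., 'CIDEr': ..., 'SPICE': ...}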
Example No. 9
    def evaluate(self):
        res = {}
        for r in self.rests:
            res[str(r['image_id'])] = [{'caption': r['caption']}]

        gts = {}
        for imgId in self.annos:
            gts[str(imgId)] = [{'caption': c} for c in self.annos[imgId]]

        # =================================================
        # Tokenization
        # =================================================
        # print('tokenization...')
        tokenizer = self.Tokenizer()
        gts = tokenizer.tokenize(gts)
        res = tokenizer.tokenize(res)
        # =================================================
        # Set up scorers
        # =================================================
        # print('setting up scorers...')
        use_scorers = self.use_scorers
        scorers = []
        if 'Bleu' in use_scorers:
            scorers.append((Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]))
        if 'METEOR' in use_scorers:
            scorers.append((Meteor(), "METEOR"))
        if 'ROUGE_L' in use_scorers:
            scorers.append((Rouge(), "ROUGE_L"))
        if 'CIDEr' in use_scorers:
            scorers.append((Cider(), "CIDEr"))
        if 'SPICE' in use_scorers:
            scorers.append((Spice(), "SPICE"))

        # =================================================
        # Compute scores
        # =================================================
        for scorer, method in scorers:
            # print('computing %s score...'%(scorer.method()))
            score, scores = scorer.compute_score(gts, res)
            if type(method) == list:
                for sc, scs, m in zip(score, scores, method):
                    self.setEval(sc, m)
                    self.setImgToEvalImgs(scs, gts.keys(), m)
                    # print("%s: %0.1f" % (m, sc*100))
            else:
                self.setEval(score, method)
                self.setImgToEvalImgs(scores, gts.keys(), method)
                # print("%s: %0.1f" % (method, score*100))
        self.setEvalImgs()
Example No. 10
def main(eval_caption_file, output, zh=False, embedding_path=None):
    df = pd.read_json(eval_caption_file)
    if zh:
        refs = df.groupby("key")["tokens"].apply(list).to_dict()
    else:
        refs = df.groupby("key")["caption"].apply(list).to_dict()

    from pycocoevalcap.bleu.bleu import Bleu
    from pycocoevalcap.cider.cider import Cider
    from pycocoevalcap.rouge.rouge import Rouge

    scorer = Bleu(zh=zh)
    bleu_scores = coco_score(copy.deepcopy(refs), scorer)
    print(bleu_scores)
    scorer = Cider(zh=zh)
    cider_score = coco_score(copy.deepcopy(refs), scorer)
    print(cider_score)
    scorer = Rouge(zh=zh)
    rouge_score = coco_score(copy.deepcopy(refs), scorer)
    print(rouge_score)

    if not zh:
        from pycocoevalcap.meteor.meteor import Meteor
        scorer = Meteor()
        meteor_score = coco_score(copy.deepcopy(refs), scorer)

        from pycocoevalcap.spice.spice import Spice
        scorer = Spice()
        spice_score = coco_score(copy.deepcopy(refs), scorer)

    diverse_score = diversity_score(refs, zh)

    with open(embedding_path, "rb") as f:
        ref_embeddings = pickle.load(f)

    bert_score = embedding_score(ref_embeddings, zh)

    with open(output, "w") as f:
        for n in range(4):
            f.write("BLEU-{}: {:6.3f}\n".format(n + 1, bleu_scores[n]))
        f.write("CIDEr: {:6.3f}\n".format(cider_score))
        f.write("ROUGE: {:6.3f}\n".format(rouge_score))
        if not zh:
            f.write("Meteor: {:6.3f}\n".format(meteor_score))
            f.write("SPICE: {:6.3f}\n".format(spice_score))
        f.write("SentenceBert: {:6.3f}\n".format(bert_score))
        f.write("Diversity: {:6.3f}\n".format(diverse_score))
Example No. 11
    def coco_evaluate(self,
                      path1: str,
                      path2: str,
                      kaldi_stream: str,
                      kaldi_scp: str,
                      caption_file: str,
                      max_length: int = None,
                      output: str = "coco_scores.txt"):
        key2pred = self._ensemble(path1, path2, kaldi_stream, kaldi_scp,
                                  max_length)

        caption_df = pd.read_json(caption_file)
        caption_df["key"] = caption_df["filename"].apply(
            lambda x: os.path.splitext(x)[0])
        key2refs = caption_df.groupby(["key"])["caption"].apply(list).to_dict()

        from pycocoevalcap.bleu.bleu import Bleu
        from pycocoevalcap.rouge.rouge import Rouge
        from pycocoevalcap.cider.cider import Cider
        from pycocoevalcap.meteor.meteor import Meteor
        from pycocoevalcap.spice.spice import Spice

        f = open(output, "w")

        scorer = Bleu(n=4)
        score, scores = scorer.compute_score(key2refs, key2pred)
        for n in range(4):
            f.write("Bleu-{}: {:6.3f}\n".format(n + 1, score[n]))

        scorer = Rouge()
        score, scores = scorer.compute_score(key2refs, key2pred)
        f.write("ROUGE: {:6.3f}\n".format(score))

        scorer = Cider()
        score, scores = scorer.compute_score(key2refs, key2pred)
        f.write("CIDEr: {:6.3f}\n".format(score))

        scorer = Meteor()
        score, scores = scorer.compute_score(key2refs, key2pred)
        f.write("Meteor: {:6.3f}\n".format(score))

        scorer = Spice()
        score, scores = scorer.compute_score(key2refs, key2pred)
        f.write("Spice: {:6.3f}\n".format(score))

        f.close()
Example No. 12
    def __init__(self, coco, cocoRes, metric):
        super().__init__(coco, cocoRes)
        self.scores = []
        if metric == "Bleu":
            self.scores = [(Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"])]
        elif metric == "METEOR":
            self.scores = [(Meteor(), "METEOR")]
        elif metric == "ROUGE_L":
            self.scores = [(Rouge(), "ROUGE_L")]
        elif metric == "CIDEr":
            self.scores = [(Cider(), "CIDEr")]
        elif metric == "SPICE":
            self.scores = [(Spice(), "SPICE")]
        else:
            raise ValueError(
                f'Unsupported image caption metric: {metric}. Supported metrics: [Bleu, METEOR, ROUGE_L, CIDEr, SPICE]'
            )
Example No. 13
    def evaluate(self):
        gts = self.coco
        res = self.coco_res

        # =================================================
        # Tokenization
        # =================================================
        print("tokenization...")
        tokenizer = PTBTokenizer()
        # gts = {k:[' '.join(v)] for k,v in tokenizer.tokenize(gts).items()}
        # res = {k:[' '.join(v)] for k,v in tokenizer.tokenize(res).items()}
        gts = tokenizer.tokenize(gts)
        # res = {k: v[:1] for k, v in tokenizer.tokenize(res).items()}
        res = tokenizer.tokenize(res)
        # breakpoint()

        # =================================================
        # Set up scorers
        # =================================================
        print("setting up scorers...")
        scorers = [
            (Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]),
            (Meteor(), "METEOR"),
            (Rouge(), "ROUGE_L"),
            (RougeF1(), "ROUGE_F1"),
            (Cider(), "CIDEr"),
            (Spice(), "SPICE"),
        ]

        # =================================================
        # Compute scores
        # =================================================
        for scorer, method in scorers:
            print("computing %s score..." % (scorer.method()))
            score, scores = scorer.compute_score(gts, res)
            if type(method) == list:
                for sc, scs, m in zip(score, scores, method):
                    self.set_eval(sc, m)
                    self.set_img_to_eval_imgs(scs, gts.keys(), m)
                    print(f"{m}: {sc:0.3f}")
            else:
                self.set_eval(score, method)
                self.set_img_to_eval_imgs(scores, gts.keys(), method)
                print(f"{method}: {score:0.3f}")
        self.set_eval_imgs()
Example No. 14
    def evaluate(self):
        imgIds = self.params['image_id']
        gts = self.gts
        res = self.res

        # =================================================
        # Tokenization
        # =================================================
        print('tokenization...')
        tokenizer = PTBTokenizer()
        gts = tokenizer.tokenize(gts)
        res = tokenizer.tokenize(res)

        # =================================================
        # Set up scorers
        # =================================================
        print('setting up scorers...')
        scorers = [
            (Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]),
            (Meteor(), "METEOR"),
            (Rouge(), "ROUGE_L"),
            (Cider(), "CIDEr"),
            (Spice(), "SPICE"),
            (WMD(), "WMD"),
        ]

        # =================================================
        # Compute scores
        # =================================================
        eval = {}
        for scorer, method in scorers:
            print('computing %s score...' % (scorer.method()))
            score, scores = scorer.compute_score(gts, res)
            if type(method) == list:
                for sc, scs, m in zip(score, scores, method):
                    self.setEval(sc, m)
                    self.setImgToEvalImgs(scs, imgIds, m)
                    print("%s: %0.3f" % (m, sc))
            else:
                self.setEval(score, method)
                self.setImgToEvalImgs(scores, imgIds, method)
                print("%s: %0.3f" % (method, score))
        self.setEvalImgs()
Example No. 15
def score_func(ref, hypo, idx=None):
    """
    ref, dictionary of reference sentences (id, sentence)
    hypo, dictionary of hypothesis sentences (id, sentence)
    score, dictionary of scores
    """
    scorers = [(Spice(), "SPICE"),
               (Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]),
               (Rouge(), "ROUGE_L"), (Cider(), "CIDEr"), (Meteor(), "METEOR")]
    final_scores = {}
    if idx is not None:
        scorers = [scorers[idx]]
    for scorer, method in scorers:
        score, scores = scorer.compute_score(ref, hypo)
        print('score', method, score)
        if type(score) == list:
            for m, s in zip(method, score):
                final_scores[m] = s
        else:
            final_scores[method] = score
    return final_scores
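
Example: the idx argument selects a single scorer from the list above, so SPICE (and its Java backend) can be skipped when only one metric is needed. Keys and captions are illustrative only.

ref = {"0": ["a red bus parked on the street"]}
hypo = {"0": ["a red bus is parked on a street"]}
bleu_only = score_func(ref, hypo, idx=1)   # index 1 -> (Bleu(4), [...])
all_metrics = score_func(ref, hypo)        # runs every scorer, including SPICE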
Example No. 16
    def evaluate(self):
        gts = self.coco
        res = self.coco_res

        # =================================================
        # Tokenization
        # =================================================
        print("tokenization...")
        tokenizer = PTBTokenizer()
        gts = tokenizer.tokenize(gts)
        res = tokenizer.tokenize(res)

        # =================================================
        # Set up scorers
        # =================================================
        print("setting up scorers...")
        scorers = [
            (Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]),
            (Meteor(), "METEOR"),
            (Rouge(), "ROUGE_L"),
            (Cider(), "CIDEr"),
            (Spice(), "SPICE"),
        ]

        # =================================================
        # Compute scores
        # =================================================
        for scorer, method in scorers:
            print("computing %s score..." % (scorer.method()))
            score, scores = scorer.compute_score(gts, res)
            if type(method) == list:
                for sc, scs, m in zip(score, scores, method):
                    self.set_eval(sc, m)
                    self.set_img_to_eval_imgs(scs, gts.keys(), m)
                    print("%s: %0.3f" % (m, sc))
            else:
                self.set_eval(score, method)
                self.set_img_to_eval_imgs(scores, gts.keys(), method)
                print("%s: %0.3f" % (method, score))
        self.set_eval_imgs()
Example No. 17
    def __init__(self,
                 ground_truth_filename=None,
                 prediction_filename=None,
                 verbose=False):
        # Check that the gt and submission files exist and load them
        if not ground_truth_filename:
            raise IOError('Please input a valid ground truth file.')
        if not prediction_filename:
            raise IOError('Please input a valid prediction file.')

        self.verbose = verbose
        self.ground_truth = self.import_ground_truth(ground_truth_filename)
        self.prediction = self.import_prediction(prediction_filename)
        self.tokenizer = PTBTokenizer()

        # Set up scorers, if not verbose, we only use the one we're
        # testing on: METEOR
        if self.verbose:
            self.scorers = [(Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3",
                                       "Bleu_4"]), (Meteor(), "METEOR"),
                            (Rouge(), "ROUGE_L"), (Cider(), "CIDEr"),
                            (Spice(), "SPICE")]
        else:
            self.scorers = [(Meteor(), "METEOR")]
Example No. 18
def spice():
    scorer = Spice()
    score, scores = scorer.compute_score(gts, res)
    print('spice = %s' % score)
Example No. 19
def validate(val_loader, encoder, decoder, criterion, tok_en, tok_zh):
    '''
    Performs one epoch's validation.
    '''
    decoder.eval()  # eval mode (no dropout or batchnorm)
    if encoder is not None:
        encoder.eval()

    references_en = list(
    )  # references (true captions) for calculating corpus BLEU-4 score
    hypotheses_en = list()  # hypotheses (predictions)

    references_zh = list(
    )  # references (true captions) for calculating corpus BLEU-4 score
    hypotheses_zh = list()  # hypotheses (predictions)

    avg_loss = 0

    with torch.no_grad():
        # Batches
        for cnt, (encap, zhcap, video, caplen_en, caplen_zh, enrefs,
                  zhrefs) in enumerate(val_loader, 1):
            encap, zhcap, video, caplen_en, caplen_zh = encap.cuda(
            ), zhcap.cuda(), video.cuda(), caplen_en.cuda(), caplen_zh.cuda()

            # Forward prop.
            init_hidden, vid_out = encoder(
                video
            )  # fea: decoder input from encoder, should be of size (mb, encout_dim) = (mb, decoder_dim)
            scores_en, pred_lengths_en, scores_zh, pred_lengths_zh = decoder.inference(
                encap, zhcap, init_hidden, vid_out, args.MAX_INPUT_LENGTH)

            # Since we decoded starting with <start>, the targets are all words after <start>, up to <end>
            targets_en = encap[:, 1:]
            scores_copy_en = scores_en.clone()
            targets_zh = zhcap[:, 1:]
            scores_copy_zh = scores_zh.clone()

            # Calculate loss
            loss_en = criterion(
                scores_en[:, 1:].contiguous().view(-1, decoder.vocab_size_en),
                targets_en.contiguous().view(-1))
            loss_zh = criterion(
                scores_zh[:, 1:].contiguous().view(-1, decoder.vocab_size_zh),
                targets_zh.contiguous().view(-1))

            # Hypotheses
            _, preds_en = torch.max(scores_copy_en, dim=2)
            preds_en = preds_en.tolist()
            temp_preds_en = list()
            for j, p in enumerate(preds_en):
                temp_preds_en.append(
                    preds_en[j][1:pred_lengths_en[j]])  # remove pads and idx-0

            preds_en = temp_preds_en
            hypotheses_en.extend(preds_en)  # preds= [1,2,3]

            enrefs = [list(map(int, i.split())) for i in enrefs
                      ]  # tgtrefs = [[1,2,3], [2,4,3], [1,4,5,]]

            for r in enrefs:
                references_en.append([r])

            assert len(references_en) == len(hypotheses_en)

            _, preds_zh = torch.max(scores_copy_zh, dim=2)
            preds_zh = preds_zh.tolist()
            temp_preds_zh = list()
            for j, p in enumerate(preds_zh):
                temp_preds_zh.append(
                    preds_zh[j][1:pred_lengths_zh[j]])  # remove pads and idx-0

            preds_zh = temp_preds_zh
            hypotheses_zh.extend(preds_zh)  # preds= [1,2,3]

            zhrefs = [list(map(int, i.split())) for i in zhrefs
                      ]  # tgtrefs = [[1,2,3], [2,4,3], [1,4,5,]]

            for r in zhrefs:
                references_zh.append([r])

            assert len(references_zh) == len(hypotheses_zh)

            avg_loss += loss_en.item() + loss_zh.item()

            # Since we decoded starting with <start>, the targets are all words after <start>, up to <end>

            # Calculate loss

            # Hypotheses

        # Calculate metrics
        avg_loss = avg_loss / cnt

        scorers = {
            "Bleu": Bleu(4),
            "Meteor": Meteor(),
            "Rouge": Rouge(),
            "Cider": Cider(),
            "Spice": Spice()
        }

        gts_en = {}
        res_en = {}
        for i in range(len(references_en)):
            gts_en[i] = [tok_en.decode_sentence(references_en[i][0])]
            res_en[i] = [tok_en.decode_sentence(hypotheses_en[i])]
        scores = {}
        for name, scorer in scorers.items():
            score, all_scores = scorer.compute_score(gts_en, res_en)
            if isinstance(score, list):
                for i, sc in enumerate(score, 1):
                    scores[name + str(i)] = sc
            else:
                scores[name] = score
        print("Score of EN:")
        print(scores)
        """
        gts_zh = {}
        res_zh = {}
        for i in range(len(references_zh)):
            gts_zh[i] = [tok_zh.decode_sentence(references_zh[i][0])]
            res_zh[i] = [tok_zh.decode_sentence(hypotheses_zh[i])]
        scores = {}
        for name, scorer in scorers.items():
            score, all_scores = scorer.compute_score(gts_zh, res_zh)
            if isinstance(score, list):
                for i, sc in enumerate(score, 1):
                    scores[name + str(i)] = sc
            else:
                scores[name] = score
        print("Score of ZH:")
        print(scores)
        """
        corpbleu_en = corpus_bleu(references_en, hypotheses_en)
        sentbleu_en = 0
        for i, (r, h) in enumerate(zip(references_en, hypotheses_en), 1):
            sentbleu_en += sentence_bleu(r, h, smoothing_function=cc.method7)
        sentbleu_en /= i

    return avg_loss, sentbleu_en, corpbleu_en
Example No. 20
def spice(gts, res):
    scorer = Spice()
    score, scores = scorer.compute_score(gts, res)
    out_file.write('SPICE = %s' % score + '\n')
Example No. 21
import argparse
import json

from pycocoevalcap.bleu.bleu import Bleu
from pycocoevalcap.cider.cider import Cider
from pycocoevalcap.meteor.meteor import Meteor
from pycocoevalcap.rouge.rouge import Rouge
from pycocoevalcap.spice.spice import Spice
from pycocoevalcap.tokenizer.ptbtokenizer import PTBTokenizer

from utils.logger import setup_logger

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="evaluate")
    parser.add_argument("--gt_caption", type=str)
    parser.add_argument("--pd_caption", type=str)
    parser.add_argument("--save_dir", type=str)
    args = parser.parse_args()

    logger = setup_logger("evaluate", args.save_dir, 0)
    ptb_tokenizer = PTBTokenizer()

    scorers = [(Cider(), "C"), (Spice(), "S"),
               (Bleu(4), ["B1", "B2", "B3", "B4"]),
               (Meteor(), "M"), (Rouge(), "R")]

    logger.info(f"loading ground-truths from {args.gt_caption}")
    with open(args.gt_caption) as f:
        gt_captions = json.load(f)
    gt_captions = ptb_tokenizer.tokenize(gt_captions)

    logger.info(f"loading predictions from {args.pd_caption}")
    with open(args.pd_caption) as f:
        pred_dict = json.load(f)
    pd_captions = dict()
    for image_id, v in pred_dict.items():
        pd_captions[str(image_id)] = [{"caption":v['caption'][0]['caption'],}]
    tmp = ptb_tokenizer.tokenize(pd_captions)
Example No. 22
def eval_model(root_path, inputs):
    """
    Computes evaluation metrics of the model results against the human annotated captions

    Parameters:
    ------------
    root_path: str
        the path to the data folder which contains the raw folder
    inputs: str
        the name of the caption file to process

    Returns:
    ------------
    None, it saves the overall score and individual score files under output path
    """

    # load data
    try:
        with open(f'{root_path}/json/{inputs}.json', 'r') as data:
            ref_data = json.load(data)

    except FileNotFoundError:
        raise FileNotFoundError(
            f'Make sure that human-annotated captions are stored in '
            f'{root_path}/json/{inputs}.json.')

    try:
        with open(f'{root_path}/json/{inputs}_model_caption.json',
                  'r') as data:
            results = json.load(data)

    except FileNotFoundError:
        raise FileNotFoundError(
            'Please call generate_captions.py to generate captions first.')

    # format the inputs
    img_id_dict = {'image_id': list(ref_data.keys())}

    imgIds = img_id_dict['image_id']

    gts = {}
    res = {}

    required_key = {'raw', 'imgid', 'sentid'}

    for imgId in imgIds:

        caption_list = ref_data[imgId]['sentences']
        caption_list_sel = []

        for i in caption_list:

            lst = {
                key: value
                for key, value in i.items() if key in required_key
            }
            lst['caption'] = lst.pop('raw')
            lst['image_id'] = lst.pop('imgid')
            lst['id'] = lst.pop('sentid')
            caption_list_sel.append(lst)
        gts[imgId] = caption_list_sel

        generated = [{'caption': results[imgId]}]
        res[imgId] = generated

    # tokenize

    print('tokenization...')

    tokenizer = PTBTokenizer()
    gts = tokenizer.tokenize(gts)
    res = tokenizer.tokenize(res)

    # compute scores
    scorers = [
        (Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]),
        (Meteor(), "METEOR"),
        (Rouge(), "ROUGE_L"),
        (Cider(), "CIDEr"),
        (Spice(), "SPICE"),
        (usc_sim(), "USC_similarity"),
    ]

    score_dict = {}
    scores_dict = {}

    for scorer, method in scorers:

        print('computing %s score...' % (scorer.method()))
        score, scores = scorer.compute_score(gts, res)

        if type(method) == list:
            for sc, scs, m in zip(score, scores, method):
                score_dict[m] = sc
                scores_dict[m] = scs

        else:
            score_dict[method] = score
            scores_dict[method] = scores

    # format the individual scores
    img_score_dict = {}

    for n in range(len(res)):

        img_name = list(res.keys())[n]
        img_score_dict[img_name] = {}

        for metrics in scores_dict.keys():

            if metrics == 'SPICE':
                img_score_dict[img_name][metrics] = scores_dict[metrics][n][
                    'All']['f']
            else:
                img_score_dict[img_name][metrics] = scores_dict[metrics][n]

    output_path = f'{root_path}/score'

    # save the overall score and individual image score
    if not os.path.exists(output_path):
        os.makedirs(output_path, exist_ok=True)

    with open(f'{output_path}/{inputs}_score.json', 'w') as file:
        json.dump(score_dict, file)

    with open(f'{output_path}/{inputs}_img_score.json', 'w') as file:
        json.dump(img_score_dict, file)

    assert os.path.isfile(f'{output_path}/{inputs}_score.json'),\
    "Average scores are not saved."

    assert os.path.isfile(f'{output_path}/{inputs}_img_score.json'),\
    "Individual scores are not saved."
Example No. 23
def train(model,
          criterion,
          optimizer,
          train_loader,
          val_loader,
          opt,
          rl_criterion=None):

    infos = {
        'iter': 0,
        'epoch': 0,
        'start_epoch': 0,
        'best_score': float('-inf'),
        'best_iter': 0,
        'best_epoch': opt.max_epochs
    }

    checkpoint_checked = False
    rl_training = False
    seq_per_img = train_loader.get_seq_per_img()
    infos_history = {}

    if os.path.exists(opt.start_from):
        if os.path.isdir(opt.start_from):
            # loading the same model file at a different experiment dir
            start_from_file = os.path.join(opt.start_from,
                                           os.path.basename(opt.model_file))
        else:
            start_from_file = opt.start_from
        logger.info('Loading state from: %s', start_from_file)
        checkpoint = torch.load(start_from_file)
        model.load_state_dict(checkpoint['model'])
        infos = checkpoint['infos']
        infos['start_epoch'] = infos['epoch']
        checkpoint_checked = True  # this epoch is already checked
    else:
        logger.info('No checkpoint found! Training from scratch')

    if opt.use_rl == 1 and opt.use_rl_after == 0:
        opt.use_rl_after = infos['epoch']
        opt.use_cst_after = infos['epoch']
        train_loader.set_current_epoch(infos['epoch'])

    if opt.grounder_type in ['niuc', 'iuc']:
        # get class weights
        one_hot_sums = None
        totes = 0
        cur_index = train_loader.get_current_index()
        train_loader.reset()
        ep = infos['epoch']
        while True:
            data = train_loader.get_batch()
            labels_svo = data['labels_svo']
            one_hot = torch.clamp(
                torch.sum(torch.nn.functional.one_hot(
                    labels_svo, num_classes=model.vocab_size),
                          axis=1), 0, 1)
            one_hot[:, 0] = 0  # make the padding index 0
            totes += one_hot.shape[0]
            if one_hot_sums is None:
                one_hot_sums = torch.sum(one_hot, axis=0)
            else:
                one_hot_sums += torch.sum(one_hot, axis=0)

            if ep < train_loader.get_current_epoch():
                one_hot_negs = -one_hot_sums + totes
                pos_weight = one_hot_negs.type(torch.FloatTensor) / (
                    1 + one_hot_sums.type(torch.FloatTensor))
                pos_weight = pos_weight.cuda()

                train_loader.set_current_index(index=cur_index)
                break

    while True:
        t_start = time.time()
        model.train()
        data = train_loader.get_batch()
        feats = data['feats']
        bfeats = data['bfeats']
        labels = data['labels']
        masks = data['masks']
        labels_svo = data['labels_svo']
        masks_svo = data['masks_svo']

        if torch.cuda.is_available():
            feats = [feat.cuda() for feat in feats]
            bfeats = [bfeat.cuda() for bfeat in bfeats]
            labels = labels.cuda()
            masks = masks.cuda()
            labels_svo = labels_svo.cuda()
            masks_svo = masks_svo.cuda()

        # implement scheduled sampling
        opt.ss_prob = 0
        if opt.use_ss == 1 and infos['epoch'] >= opt.use_ss_after:
            annealing_prob = opt.ss_k / \
                (opt.ss_k + np.exp((infos['epoch'] - opt.use_ss_after) / opt.ss_k))
            opt.ss_prob = min(1 - annealing_prob, opt.ss_max_prob)
            model.set_ss_prob(opt.ss_prob)

        if opt.use_rl == 1 and infos[
                'epoch'] >= opt.use_rl_after and not rl_training:
            logger.info('Using RL objective...')
            rl_training = True
            bcmr_scorer = {
                'Bleu_4': Bleu(),
                'CIDEr': Cider(df=opt.train_cached_tokens),
                'METEOR': Meteor(),
                'ROUGE_L': Rouge(),
                'SPICE': Spice()
            }[opt.eval_metric]

            #logger.info('loading gt refs: %s', train_loader.cocofmt_file)
            #gt_refs = utils.load_gt_refs(train_loader.cocofmt_file)

        mixer_from = opt.mixer_from
        if opt.use_mixer == 1 and rl_training:
            #annealing_mixer = opt.ss_k / \
            #    (opt.ss_k + np.exp((infos['epoch'] - opt.use_rl_after) / opt.ss_k))
            #annealing_mixer = int(round(annealing_mixer * opt.seq_length))

            # -1 for annealing
            if opt.mixer_from == -1:
                annealing_mixer = opt.seq_length - int(
                    np.ceil((infos['epoch'] - opt.use_rl_after + 1) /
                            float(opt.mixer_descrease_every)))
                mixer_from = max(1, annealing_mixer)

            model.set_mixer_from(mixer_from)

        scb_captions = opt.scb_captions
        if opt.use_cst == 1 and rl_training:
            # if opt.use_cst == 1 and opt.ss_k == 0,
            # then do not using annealing, but the fixed scb_captions provided
            #annealing_robust = opt.ss_k / \
            #    (opt.ss_k + np.exp((infos['epoch'] - opt.use_rl_after) / opt.ss_k))
            #annealing_robust = int(round((1 - annealing_robust) * seq_per_img))

            # do not use robust before fully mixed
            # if opt.use_mixer == 1 and mixer_from > 1:
            #    opt.use_cst_after = infos['epoch']

            # if opt.scb_captions is -1, then use the annealing value,
            # otherwise, use the set value
            if opt.scb_captions == -1:
                annealing_robust = int(
                    np.ceil((infos['epoch'] - opt.use_cst_after + 1) /
                            float(opt.cst_increase_every)))
                scb_captions = min(annealing_robust, seq_per_img - 1)

        optimizer.zero_grad()
        model.set_seq_per_img(seq_per_img)

        if rl_training:
            # sampling from model distribution
            # model_res, logprobs = model.sample(
            #    feats, {'sample_max': 0, 'expand_feat': opt.expand_feat, 'temperature': 1})

            # using mixer
            pred, model_res, logprobs, pred_svo, res_svo, logprobs_svo = model(
                feats, bfeats, labels, labels_svo)

            if opt.use_cst == 0:
                # greedy decoding baseline in SCST paper
                greedy_baseline, _, _, _ = model.sample(
                    [Variable(f.data, volatile=True) for f in feats],
                    [Variable(f.data, volatile=True) for f in bfeats], {
                        'sample_max': 1,
                        'expand_feat': opt.expand_feat
                    })

            if opt.use_cst == 1:
                bcmrscores = data['bcmrscores']
                reward, m_score, g_score = utils.get_cst_reward(
                    model_res,
                    data['gts'],
                    bcmr_scorer,
                    bcmrscores=bcmrscores,
                    expand_feat=opt.expand_feat,
                    seq_per_img=train_loader.get_seq_per_img(),
                    scb_captions=scb_captions,
                    scb_baseline=opt.scb_baseline,
                    use_eos=opt.use_eos,
                    use_mixer=opt.use_mixer)
            else:
                # use greedy baseline by default, compute self-critical reward
                reward, m_score, g_score = utils.get_self_critical_reward(
                    model_res,
                    greedy_baseline,
                    data['gts'],
                    bcmr_scorer,
                    expand_feat=opt.expand_feat,
                    seq_per_img=train_loader.get_seq_per_img(),
                    use_eos=opt.use_eos)

            loss = rl_criterion(
                model_res, logprobs,
                Variable(torch.from_numpy(reward).float().cuda(),
                         requires_grad=False))
            loss_svo = criterion(pred_svo, labels_svo,
                                 torch.ones(labels.shape).cuda())
            loss = loss + (opt.labda / 10.0) * loss_svo

        else:
            pred, _, _, pred_svo, svo_it, svo_gath = model(
                feats, bfeats, labels, labels_svo)
            loss_cap = criterion(pred,
                                 labels[:, 1:],
                                 masks[:, 1:],
                                 bcmrscores=torch.from_numpy(
                                     data['bcmrscores'].astype(
                                         np.float32)).cuda())
            if opt.grounder_type in ['None', 'none']:
                loss = loss_cap
            else:
                if opt.grounder_type in ['niuc', 'iuc']:  # unordered
                    svo_criterion = torch.nn.BCEWithLogitsLoss(
                        pos_weight=pos_weight)
                    concepts_one_hot = torch.clamp(
                        torch.sum(torch.nn.functional.one_hot(
                            labels_svo, num_classes=model.vocab_size),
                                  axis=1), 0, 1)
                    loss_svo = svo_criterion(
                        pred_svo[:, 0],
                        concepts_one_hot.type(torch.FloatTensor).cuda()
                    )  # pred_svo[: 0] undoes the repeat at the end of non_iterative_grounder()
                else:
                    loss_svo = criterion(pred_svo, labels_svo,
                                         torch.ones(labels.shape).cuda())
                    # loss_svo = criterion(pred_svo, labels_svo, masks_svo)

                if random.random() < 0.01:  # compare the svos during training
                    print('---------------------')
                    print(utils.decode_sequence(opt.vocab, pred.argmax(-1)))
                    print(utils.decode_sequence(opt.vocab, labels_svo)[0])
                    print(utils.decode_sequence(opt.vocab, svo_it)[0])
                loss = loss_cap + (opt.labda / 10.0) * loss_svo

        loss.backward()
        clip_grad_norm_(model.parameters(), opt.grad_clip)
        optimizer.step()
        # memReport()
        del pred, feats, labels, masks, labels_svo
        torch.cuda.empty_cache()

        infos['TrainLoss'] = loss.item()
        infos['CAPTrainLoss'] = loss_cap.item()
        if opt.grounder_type not in ['None', 'none']:
            infos['SVOTrainLoss'] = loss_svo.item()
        else:
            infos['SVOTrainLoss'] = 0
        infos['mixer_from'] = mixer_from
        infos['scb_captions'] = scb_captions

        if infos['iter'] % opt.print_log_interval == 0:
            elapsed_time = time.time() - t_start

            log_info = [('Epoch', infos['epoch']), ('Iter', infos['iter']),
                        ('Loss', infos['TrainLoss']),
                        ('CAP Loss', infos['CAPTrainLoss']),
                        ('SVO Loss', infos['SVOTrainLoss'])]

            if rl_training:
                log_info += [('Reward', np.mean(reward[:, 0])),
                             ('{} (m)'.format(opt.eval_metric), m_score),
                             ('{} (b)'.format(opt.eval_metric), g_score)]

            if opt.use_ss == 1:
                log_info += [('ss_prob', opt.ss_prob)]

            if opt.use_mixer == 1:
                log_info += [('mixer_from', mixer_from)]

            if opt.use_cst == 1:
                log_info += [('scb_captions', scb_captions)]

            log_info += [('Time', elapsed_time)]
            logger.info(
                '%s',
                '\t'.join(['{}: {}'.format(k, v) for (k, v) in log_info]))

        infos['iter'] += 1

        if infos['epoch'] < train_loader.get_current_epoch():
            infos['epoch'] = train_loader.get_current_epoch()
            checkpoint_checked = False
            learning_rate = utils.adjust_learning_rate(
                opt, optimizer, infos['epoch'] - infos['start_epoch'])
            logger.info('===> Learning rate: %f: ', learning_rate)

        # checkpoint_checked = False
        # if 1:   todo debugging, jump straight to validation
        if (infos['epoch'] >= opt.save_checkpoint_from
                and infos['epoch'] % opt.save_checkpoint_every == 0
                and not checkpoint_checked):
            # evaluate the validation performance
            results = validate(model, criterion, val_loader, opt)
            logger.info(
                'Validation output: %s',
                json.dumps(results['scores'], indent=4, sort_keys=True))
            # infos.update(results['scores'])

            # todo added training set eval to check for overfitting
            cur_index = train_loader.get_current_index()
            train_loader.reset()
            results_train = validate(model,
                                     criterion,
                                     train_loader,
                                     opt,
                                     max_iters=20,
                                     type='train')
            train_loader.set_current_index(index=cur_index)
            for k, v in results_train['scores'].items():
                results['scores']['Train_' + k] = v

            logger.info(
                'Training output: %s',
                json.dumps(results_train['scores'], indent=4, sort_keys=True))
            infos.update(results['scores'])

            check_model(model, opt, infos, infos_history)
            checkpoint_checked = True

        if (infos['epoch'] >= opt.max_epochs
                or infos['epoch'] - infos['best_epoch'] > opt.max_patience):
            logger.info('>>> Terminating...')
            break

    return infos
Example No. 24
import time
import os

import sys
sys.path.append("coco-caption")
from pycocotools.coco import COCO
from pycocoevalcap.spice.spice import Spice

train_path = '/home/yangxu/project/self-critical.pytorch/data/coco_annotations/captions_train2014.json'
val_path = '/home/yangxu/project/self-critical.pytorch/data/coco_annotations/captions_val2014.json'

coco_train = COCO(train_path)
coco_val = COCO(val_path)

coco_use = coco_train

image_ids = coco_use.getImgIds()
gts = {}
res = {}
for img_id in image_ids:
    gts[img_id] = []
    data_temp = coco_use.imgToAnns[img_id]
    for dt in data_temp:
        gts[img_id].append(dt['caption'])
    res[img_id] = []
    res[img_id].append(gts[img_id][0])

scorer = Spice()
score, scores = scorer.compute_score(gts, res)
Example No. 25
    def evaluate(self,
                 experiment_path: str,
                 feature_file: str,
                 feature_scp: str,
                 caption_file: str,
                 caption_output: str = "eval_output.json",
                 score_output: str = "scores.txt",
                 **kwargs):
        """kwargs: {'max_length': int, 'method': str, 'beam_size': int}"""

        dump = torch.load(os.path.join(experiment_path, "saved.pth"),
                          map_location="cpu")
        # Load previous training config
        config = dump["config"]

        vocabulary = torch.load(config["vocab_file"])
        model = self._get_model(config, vocabulary)
        model.load_state_dict(dump["model"])
        # Some scaler (sklearn standardscaler)
        scaler = dump["scaler"]
        zh = config["zh"]
        model = model.to(self.device)

        dataset = SJTUDatasetEval(feature=feature_file,
                                  eval_scp=feature_scp,
                                  transform=scaler.transform)
        dataloader = torch.utils.data.DataLoader(dataset,
                                                 shuffle=False,
                                                 collate_fn=collate_fn((1, )),
                                                 batch_size=32,
                                                 num_workers=0)

        caption_df = pd.read_json(caption_file, dtype={"key": str})
        if zh:
            key2refs = caption_df.groupby("key")["tokens"].apply(
                list).to_dict()
        else:
            key2refs = caption_df.groupby("key")["caption"].apply(
                list).to_dict()

        model.eval()

        key2pred = {}

        def _sample(engine, batch):
            with torch.no_grad():
                model.eval()
                keys = batch[0]
                output = self._forward(model, batch, mode="sample", **kwargs)
                seqs = output["seqs"].cpu().numpy()

                for idx, seq in enumerate(seqs):
                    caption = self._convert_idx2sentence(seq, vocabulary, zh)
                    key2pred[keys[idx]] = [
                        caption,
                    ]

        pbar = ProgressBar(persist=False, ascii=True)
        sampler = Engine(_sample)
        pbar.attach(sampler)
        sampler.run(dataloader)

        pred_df = []
        for key, pred in key2pred.items():
            pred_df.append({
                "filename": key + ".wav",
                "caption": "".join(pred[0]) if zh else pred[0],
                "tokens": pred[0] if zh else pred[0].split()
            })
        pred_df = pd.DataFrame(pred_df)
        pred_df.to_json(os.path.join(experiment_path, caption_output))

        from pycocoevalcap.bleu.bleu import Bleu
        from pycocoevalcap.rouge.rouge import Rouge
        from pycocoevalcap.cider.cider import Cider
        from pycocoevalcap.meteor.meteor import Meteor
        from pycocoevalcap.spice.spice import Spice

        f = open(os.path.join(experiment_path, score_output), "w")

        scorer = Bleu(n=4, zh=zh)
        score, scores = scorer.compute_score(key2refs, key2pred)
        for n in range(4):
            f.write("Bleu-{}: {:6.3f}\n".format(n + 1, score[n]))

        scorer = Rouge(zh=zh)
        score, scores = scorer.compute_score(key2refs, key2pred)
        f.write("ROUGE: {:6.3f}\n".format(score))

        scorer = Cider(zh=zh)
        score, scores = scorer.compute_score(key2refs, key2pred)
        f.write("CIDEr: {:6.3f}\n".format(score))

        if not zh:
            scorer = Meteor()
            score, scores = scorer.compute_score(key2refs, key2pred)
            f.write("Meteor: {:6.3f}\n".format(score))

            scorer = Spice()
            score, scores = scorer.compute_score(key2refs, key2pred)
            f.write("Spice: {:6.3f}\n".format(score))

        f.close()
Example No. 26
import sys

import numpy as np

import model_normal
import data
import helper_datasources
import config

sys.path.append(config.mscoco_dir)
from pycocotools.coco import COCO
from pycocoevalcap.eval import COCOEvalCap
from pycocoevalcap.meteor.meteor import Meteor
from pycocoevalcap.cider.cider import Cider
from pycocoevalcap.spice.spice import Spice
from pycocoevalcap.wmd.wmd import WMD

_meteor_scorer = Meteor()
_cider_scorer = Cider()
_spice_scorer = Spice()
_wmd_scorer = WMD()


########################################################################################
def geomean(xs):
    if np.all(xs):  #If array does not contain a zero
        return 2**np.mean(np.log2(xs))
    else:
        return 0.0


########################################################################################
def get_meteor(test_tokenized_grouped_sents, generated):
    return _meteor_scorer.compute_score(
        {
Example No. 27
def eval_model(ref_data, results):
    """
    Computes evaluation metrics of the model results against the human annotated captions
    
    Parameters:
    ------------
    ref_data: dict
        a dictionary containing human annotated captions, with image name as key and a list of human annotated captions as values
    
    results: dict
        a dictionary containing model generated caption, with image name as key and a generated caption as value
        
    Returns:
    ------------
    score_dict: a dictionary containing the overall average score for the model
    img_score_dict: a dictionary containing the individual scores for images
    scores_dict: a dictionary containing the scores by metric type
    """
    # download stanford nlp library
    subprocess.call(['../../scr/evaluation/get_stanford_models.sh'])

    # format the inputs
    img_id_dict = {'image_id': list(ref_data.keys())}

    imgIds = img_id_dict['image_id']
    gts = {}
    res = {}

    required_key = {'raw', 'imgid', 'sentid'}

    for imgId in imgIds:
        caption_list = ref_data[imgId]['sentences']
        caption_list_sel = []
        for i in caption_list:
            lst = {
                key: value
                for key, value in i.items() if key in required_key
            }
            lst['caption'] = lst.pop('raw')
            lst['image_id'] = lst.pop('imgid')
            lst['id'] = lst.pop('sentid')
            caption_list_sel.append(lst)
        gts[imgId] = caption_list_sel

        generated = [{'caption': results[imgId]}]
        res[imgId] = generated

    # tokenize
    print('tokenization...')
    tokenizer = PTBTokenizer()
    gts = tokenizer.tokenize(gts)
    res = tokenizer.tokenize(res)

    # compute scores
    scorers = [
        (Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]),
        (Meteor(), "METEOR"),
        (Rouge(), "ROUGE_L"),
        (Cider(), "CIDEr"),
        (Spice(), "SPICE"),
        (usc_sim(), "USC_similarity"),
    ]
    score_dict = {}
    scores_dict = {}
    for scorer, method in scorers:
        print('computing %s score...' % (scorer.method()))
        score, scores = scorer.compute_score(gts, res)
        if type(method) == list:
            for sc, scs, m in zip(score, scores, method):
                score_dict[m] = sc
                scores_dict[m] = scs
        else:
            score_dict[method] = score
            scores_dict[method] = scores

    # format the individual scores
    img_score_dict = {}
    for n in range(len(res)):
        img_name = list(res.keys())[n]
        img_score_dict[img_name] = {}
        for metrics in scores_dict.keys():
            if metrics == 'SPICE':
                img_score_dict[img_name][metrics] = scores_dict[metrics][n][
                    'All']['f']
            else:
                img_score_dict[img_name][metrics] = scores_dict[metrics][n]

    return score_dict, img_score_dict, scores_dict
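
A hedged sketch of the input shapes the docstring describes, assuming the repository's helper scorers (usc_sim), the Stanford models script and the Java backends are available; image names, sentences and ids are purely illustrative.

ref_data = {
    "dog.jpg": {
        "sentences": [
            {"raw": "A dog runs on the beach.", "imgid": 0, "sentid": 0},
            {"raw": "A brown dog running by the sea.", "imgid": 0, "sentid": 1},
        ]
    }
}
results = {"dog.jpg": "a dog running on a beach"}

score_dict, img_score_dict, scores_dict = eval_model(ref_data, results)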
Example No. 28
def evaluate(beam_size):
    """
    Evaluation

    :param beam_size: beam size at which to generate captions for evaluation
    :return: BLEU-4 score
    """
    # DataLoader
    loader = torch.utils.data.DataLoader(CaptionDataset(
        data_folder,
        data_name,
        'TEST',
        transform=transforms.Compose([normalize])),
                                         batch_size=1,
                                         shuffle=True,
                                         num_workers=0,
                                         pin_memory=False)

    # TODO: Batched Beam Search
    # Therefore, do not use a batch_size greater than 1 - IMPORTANT!

    # Lists to store references (true captions), and hypothesis (prediction) for each image
    # If for n images, we have n hypotheses, and references a, b, c... for each image, we need -
    # references = [[ref1a, ref1b, ref1c], [ref2a, ref2b], ...], hypotheses = [hyp1, hyp2, ...]
    references = dict()
    hypotheses = dict()

    # For each image
    for j, (image, caps, caplens, allcaps) in enumerate(
            tqdm(loader, desc="EVALUATING AT BEAM SIZE " + str(beam_size))):

        k = beam_size

        # Move to GPU device, if available
        image = image.to(device)  # (1, 3, 256, 256)

        attrs, encoder_out = encoder(image)
        attrs = attrs.expand(3, attrs_dim)

        enc_image_size = encoder_out.size(1)
        encoder_dim = encoder_out.size(3)
        encoder_out = encoder_out.view(1, -1, encoder_dim)
        num_pixels = encoder_out.size(1)
        encoder_out = encoder_out.expand(k, num_pixels, encoder_dim)
        x0 = decoder.init_x0(attrs)

        # Tensor to store top k previous words at each step; now they're just <start>
        k_prev_words = torch.LongTensor([[word_map['<start>']]] * k).to(
            device)  # (k, 1)

        # Tensor to store top k sequences; now they're just <start>
        seqs = k_prev_words  # (k, 1)

        # Tensor to store top k sequences' scores; now they're just 0
        top_k_scores = torch.zeros(k, 1).to(device)  # (k, 1)

        # Lists to store completed sequences and scores
        complete_seqs = list()
        complete_seqs_scores = list()

        # Start decoding
        step = 1
        h1, c1, h2, c2 = decoder.init_hidden_state(attrs,
                                                   encoder_out,
                                                   zero=True)
        h1, c1 = decoder.decode_step1(x0, (h1, c1))
        # s is a number less than or equal to k, because sequences are removed from this process once they hit <end>
        while True:

            embeddings = decoder.embedding(k_prev_words).squeeze(
                1)  # (s, embed_dim)

            h1, c1 = decoder.decode_step1(embeddings, (h1, c1))

            awe, _ = decoder.attention(encoder_out, h1, h2)
            # gate = decoder.sigmoid(decoder.f_beta(h2))
            # awe = gate * awe

            h2, c2 = decoder.decode_step2(torch.cat([embeddings, awe], dim=1),
                                          (h2, c2))

            scores = decoder.fc2(decoder.dropout2(h2))
            scores = F.log_softmax(scores, dim=1)

            # Add
            scores = top_k_scores.expand_as(scores) + scores  # (s, vocab_size)

            # For the first step, all k points will have the same scores (since same k previous words, h, c)
            if step == 1:
                top_k_scores, top_k_words = scores[0].topk(k, 0, True,
                                                           True)  # (s)
            else:
                # Unroll and find top scores, and their unrolled indices
                # (s) the k largest scores over all beams
                top_k_scores, top_k_words = scores.view(-1).topk(
                    k, 0, True, True)

            # Convert unrolled indices to actual indices of scores
            # scores were unrolled above; prev_word_inds recovers which beam each top score came from
            prev_word_inds = top_k_words // vocab_size  # (s)
            next_word_inds = top_k_words % vocab_size  # (s)

            # Add new words to sequences
            seqs = torch.cat(
                [seqs[prev_word_inds],
                 next_word_inds.unsqueeze(1)], dim=1)  # (s, step+1)

            # Which sequences are incomplete (didn't reach <end>)?
            incomplete_inds = [
                ind for ind, next_word in enumerate(next_word_inds)
                if next_word != word_map['<end>']
            ]
            complete_inds = list(
                set(range(len(next_word_inds))) - set(incomplete_inds))

            # Set aside complete sequences
            if len(complete_inds) > 0:
                complete_seqs.extend(seqs[complete_inds].tolist())
                complete_seqs_scores.extend(top_k_scores[complete_inds])
            k -= len(complete_inds)  # reduce beam length accordingly

            # Proceed with incomplete sequences
            if k == 0:
                break
            seqs = seqs[incomplete_inds]
            h1 = h1[prev_word_inds[incomplete_inds]]
            c1 = c1[prev_word_inds[incomplete_inds]]
            h2 = h2[prev_word_inds[incomplete_inds]]
            c2 = c2[prev_word_inds[incomplete_inds]]
            encoder_out = encoder_out[prev_word_inds[incomplete_inds]]

            top_k_scores = top_k_scores[incomplete_inds].unsqueeze(1)
            k_prev_words = next_word_inds[incomplete_inds].unsqueeze(1)

            # Break if things have been going on too long
            if step > 50:
                break
            step += 1

        i = complete_seqs_scores.index(max(complete_seqs_scores))
        seq = complete_seqs[i]

        # References
        img_caps = allcaps[0].tolist()
        img_captions = list(
            map(
                lambda c: [
                    rev_word_map[w] for w in c if w not in {
                        word_map['<start>'], word_map['<end>'], word_map[
                            '<pad>']
                    }
                ], img_caps))  # remove <start> and pads
        img_caps = [' '.join(c) for c in img_captions]
        # print(img_caps)
        references[str(j)] = img_caps

        # Hypotheses
        hypothesis = ([
            rev_word_map[w] for w in seq if w not in
            {word_map['<start>'], word_map['<end>'], word_map['<pad>']}
        ])
        hypothesis = [' '.join(hypothesis)]
        # print(hypothesis)
        hypotheses[str(j)] = hypothesis

        assert len(references) == len(hypotheses)

    # Calculate BLEU, METEOR, CIDEr, ROUGE-L and SPICE scores
    m1 = Bleu()
    m2 = Meteor()
    m3 = Cider()
    m4 = Rouge()
    m5 = Spice()
    (score1, scores1) = m1.compute_score(references, hypotheses)
    (score2, scores2) = m2.compute_score(references, hypotheses)
    (score3, scores3) = m3.compute_score(references, hypotheses)
    (score4, scores4) = m4.compute_score(references, hypotheses)
    (score5, scores5) = m5.compute_score(references, hypotheses)

    return score1, score2, score3, score4, score5