Example #1
    def evaluate(self, gts, res):
        # =================================================
        # Tokenization
        # =================================================
        logging.info('tokenization...')
        tokenizer = PTBTokenizer()
        gts = tokenizer.tokenize(gts)
        res = tokenizer.tokenize(res)

        # =================================================
        # Set up scorers
        # =================================================
        logging.info('setting up scorers...')
        scorers = [(Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]),
                   (Meteor(), "METEOR"), (Rouge(), "ROUGE_L"),
                   (Cider(), "CIDEr")
                   #(Spice(), "SPICE")
                   ]

        # =================================================
        # Compute scores
        # =================================================
        bleu_4_score = 0
        for scorer, method in scorers:
            logging.info('computing %s score...' % (scorer.method()))
            score, scores = scorer.compute_score(gts, res)
            if type(method) == list:
                for sc, scs, m in zip(score, scores, method):
                    self.setEval(sc, m)
                    self.setImgToEvalImgs(scs, gts.keys(), m)
                    if m == "Bleu_4":
                        bleu_4_score = sc
                    logging.info("%s: %0.3f" % (m, sc))
                    print("%s: %0.3f" % (m, sc))
            else:
                self.setEval(score, method)
                self.setImgToEvalImgs(scores, gts.keys(), method)
                logging.info("%s: %0.3f" % (method, score))
                print("%s: %0.3f" % (method, score))
        self.setEvalImgs()
        return bleu_4_score
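
All of the evaluate()-style examples below follow the coco-caption input convention. A minimal, self-contained sketch of that convention is shown here; it assumes the pip-installable pycocoevalcap package (PTBTokenizer also needs a Java runtime), and the ids and captions are made up for illustration.

from pycocoevalcap.tokenizer.ptbtokenizer import PTBTokenizer
from pycocoevalcap.bleu.bleu import Bleu

# several reference captions per id, exactly one candidate per id
gts = {42: [{'caption': 'a man rides a horse'},
            {'caption': 'a person on a horse'}]}
res = {42: [{'caption': 'a man riding a horse'}]}

tokenizer = PTBTokenizer()
gts_tok = tokenizer.tokenize(gts)   # -> {42: ['a man rides a horse', 'a person on a horse']}
res_tok = tokenizer.tokenize(res)   # -> {42: ['a man riding a horse']}
score, scores = Bleu(4).compute_score(gts_tok, res_tok)
print(score)   # corpus-level [Bleu_1, Bleu_2, Bleu_3, Bleu_4]
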
Example #2
    def evaluate(self):
        imgIds = self.params['image_id']
        # imgIds = self.coco.getImgIds()
        gts = {}
        res = {}
        for imgId in imgIds:
            gts[imgId] = self.coco.imgToAnns[imgId]
            res[imgId] = self.cocoRes.imgToAnns[imgId]

        # =================================================
        # Tokenization
        # =================================================
        print 'tokenization...'
        tokenizer = PTBTokenizer()
        gts = tokenizer.tokenize(gts)
        res = tokenizer.tokenize(res)

        # =================================================
        # Set up scorers
        # =================================================
        print 'setting up scorers...'
        scorers = [(Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]),
                   (Meteor(), "METEOR"), (Rouge(), "ROUGE_L"),
                   (Cider(), "CIDEr"), (Spice(), "SPICE")]

        # =================================================
        # Compute scores
        # =================================================
        for scorer, method in scorers:
            print 'computing %s score...' % (scorer.method())
            score, scores = scorer.compute_score(gts, res)
            if type(method) == list:
                for sc, scs, m in zip(score, scores, method):
                    self.setEval(sc, m)
                    self.setImgToEvalImgs(scs, gts.keys(), m)
                    print "%s: %0.3f" % (m, sc)
            else:
                self.setEval(score, method)
                self.setImgToEvalImgs(scores, gts.keys(), method)
                print "%s: %0.3f" % (method, score)
        self.setEvalImgs()
Example #3
def __score(ref, hypo):
    """
    ref, dictionary of reference sentences (id, sentence)
    hypo, dictionary of hypothesis sentences (id, sentence)
    score, dictionary of scores
    """
    scorers = [
        (Bleu(4), ["Bleu_1"]),
        (Rouge(), "ROUGE_L"),
    ]

    final_scores = {}
    for scorer, method in scorers:
        score, scores = scorer.compute_score(ref, hypo)
        if type(score) == list:
            for m, s in zip(method, score):
                final_scores[m] = s
        else:
            final_scores[method] = score

    return final_scores
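
A hedged usage sketch for __score, assuming it is called from within the same module and that each id maps to a list of already-tokenized sentence strings (ids and sentences are made up):

ref = {'1': ['a cat sits on the mat'], '2': ['a dog runs in the park']}
hypo = {'1': ['a cat is on the mat'], '2': ['the dog runs in a park']}
final_scores = __score(ref, hypo)   # -> {'Bleu_1': <float>, 'ROUGE_L': <float>}
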
Example #4
def main(hyp, ref, max_len):
    with open(hyp, 'r') as r:
        hypothesis = r.readlines()
        # keep at most max_len tokens per hypothesis
        res = {k: [" ".join(v.strip().lower().split()[:max_len])] for k, v in enumerate(hypothesis)}
    with open(ref, 'r') as r:
        references = r.readlines()
        gts = {k: [v.strip().lower()] for k, v in enumerate(references)}

    score_Bleu, scores_Bleu = Bleu(4).compute_score(gts, res)
    print("Bleu_1: %0.4f" % np.mean(scores_Bleu[0]))
    print("Bleu_2: %0.4f" % np.mean(scores_Bleu[1]))
    print("Bleu_3: %0.4f" % np.mean(scores_Bleu[2]))
    print("Bleu_4: %0.4f" % np.mean(scores_Bleu[3]))

    score_Meteor, scores_Meteor = Meteor().compute_score(gts, res)
    print("Meteor: %0.4f" % score_Meteor)

    score_Rouge, scores_Rouge = Rouge().compute_score(gts, res)
    print("ROUGE: %0.4f" % score_Rouge)

    score_Cider, scores_Cider = Cider().compute_score(gts, res)
    print("Cider: %0.4f" % score_Cider)
Example #5
    def evaluate(self):
        output = []
        scorers = [
            (Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]),  #,
            (Rouge(), "ROUGE_L")
        ]

        # =================================================
        # Compute scores
        # =================================================
        for scorer, method in scorers:
            print("scorer:", scorer)
            print("method:", method)
            score, scores = scorer.compute_score(self.gts, self.res)
            if type(method) == list:
                for sc, scs, m in zip(score, scores, method):
                    print "%s: %0.5f" % (m, sc)
                    output.append(sc)
            else:
                print "%s: %0.5f" % (method, score)
                output.append(score)
        return output
Example #6
    def evaluate(self):
        imgIds = self.params['image_id']
        # imgIds = self.coco.getImgIds()
        gts = {}
        res = {}
        for imgId in imgIds:
            gts[imgId] = self.coco.imgToAnns[imgId]
            res[imgId] = self.cocoRes.imgToAnns[imgId]

        # =================================================
        # Tokenization
        # =================================================
        tokenizer = PTBTokenizer()
        gts  = tokenizer.tokenize(gts)
        res = tokenizer.tokenize(res)

        # =================================================
        # Set up scorers
        # =================================================
        scorers = [
            (Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]),
            (Meteor(),"METEOR"),
            (Rouge(), "ROUGE_L"),
            (Cider(), "CIDEr")
        ]

        # =================================================
        # Compute scores
        # =================================================
        for scorer, method in scorers:
            score, scores = scorer.compute_score(gts, res)
            if type(method) == list:
                for sc, scs, m in zip(score, scores, method):
                    self.setEval(sc, m)
                    self.setImgToEvalImgs(scs, gts.keys(), m)
            else:
                self.setEval(score, method)
                self.setImgToEvalImgs(scores, gts.keys(), method)
        self.setEvalImgs()
Example #7
    def evaluate(self):

        # imgIds = self.coco.getImgIds()
        gts = dict(zip(range(0, len(self.predicted_list)),
                       self.predicted_list))
        res = dict(zip(range(0, len(self.label_list)), self.label_list))

        # =================================================
        # Tokenization
        # =================================================
        print 'tokenization...'
        tokenizer = PTBTokenizer()
        gts = tokenizer.tokenize(gts)
        res = tokenizer.tokenize(res)

        # =================================================
        # Set up scorers
        # =================================================
        print 'setting up scorers...'
        scorers = [(Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]),
                   (Meteor(), "METEOR"), (Rouge(), "ROUGE_L"),
                   (Cider(), "CIDEr")]

        # =================================================
        # Compute scores
        # =================================================
        for scorer, method in scorers:
            print 'computing %s score...' % (scorer.method())
            score, scores = scorer.compute_score(gts, res)
            if type(method) == list:
                for sc, scs, m in zip(score, scores, method):
                    self.setEval(sc, m)
                    self.set_textid_to_eval(scs, gts.keys(), m)
                    print "%s: %0.3f" % (m, sc)
            else:
                self.setEval(score, method)
                self.set_textid_to_eval(scores, gts.keys(), method)
                print "%s: %0.3f" % (method, score)
        self.set_eval()
Example #8
    def evaluate(self):
        output = []
        scorers = [
            (Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]),
            # (Meteor(),"METEOR"),
            (Rouge(), "ROUGE_L"),
        ]

        # =================================================
        # Compute scores
        # =================================================
        for scorer, method in scorers:
            # print 'computing %s score...'%(scorer.method())
            score, scores = scorer.compute_score(self.gts, self.res)
            if type(method) == list:
                for sc, scs, m in zip(score, scores, method):
                    print "%s: %0.5f" % (m, sc)
                    output.append(sc)
            else:
                print "%s: %0.5f" % (method, score)
                output.append(score)
        return output
Example #9
    def evaluate(self):
        # ==================================================
        # Tokenization, remove punctuations
        # =================================================
        '''
        print "tokenization ..."
        tokenizer = PTBTokenizer()
        gts = tokenizer.tokenize(self.ref)
        res = tokenizer.tokenize(self.res)
        '''
        gts = self.ref
        res = self.res
        # ==================================================
        # Set up scorers
        # ==================================================
        print "setting up scorers ..."
        scorers = [
            (Bleu(4), ("Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4")),
            (Meteor(), "METEOR"),
            (Rouge(), "ROUGE_L"),
            (Cider(), "CIDEr")
        ]

        # ==================================================
        # Compute scores
        # ==================================================
        out = {}
        for scorer, method in scorers:
            print "computing %s score ..." %(scorer.method())
            score, scores = scorer.compute_score(gts, res)
            if isinstance(method, tuple):
                for sc, scs, m in zip(score, scores, method):
                    out[m] = sc
                    print "%s: %0.4f" %(m, sc)
            else:
                print "%s: %0.4f" %(method, score)
                out[method] = score

        return out
Example #10
def language_eval(
    sample_seqs, gt_seqs
):  # sample_seqs:list[[x,x],[x,x],...], gt_seqs:list[[list1,list2,...],[list1,list2,...],...]
    import sys
    #sys.path.append("caption-eval")
    sys.path.append("coco-caption/pycocoevalcap")
    from collections import OrderedDict
    from bleu.bleu import Bleu
    from cider.cider import Cider
    from meteor.meteor import Meteor
    from rouge.rouge import Rouge

    assert len(sample_seqs) == len(gt_seqs), "number of predictions and ground truths must match"
    res = OrderedDict()  # res: {0:[xx],1:[xx],...}
    for i in range(len(sample_seqs)):  # for each data(sent)
        res[i] = [sample_seqs[i]]

    gts = OrderedDict()  # gts: {0:[sent1,sent2,...],1:[sent1,sent2,...], ...}
    for i in range(len(gt_seqs)):
        gts[i] = [gt_seqs[i][j] for j in range(len(gt_seqs[i]))]

    res = {i: res[i] for i in range(len(sample_seqs))}
    gts = {i: gts[i] for i in range(len(gt_seqs))}

    avg_bleu_score, bleu_scores = Bleu(4).compute_score(gts, res)
    avg_cider_score, cider_scores = Cider().compute_score(gts, res)
    avg_meteor_score, meteor_scores = Meteor().compute_score(gts, res)
    avg_rouge_score, rouge_scores = Rouge().compute_score(gts, res)

    print(" BLEU1:{}\n BLEU2:{}\n BLEU3:{}\n BLEU4:{}\n METEOR:{}\n ROUGE:{}\n CIDEr:{}\n"\
     .format(avg_bleu_score[0], avg_bleu_score[1], avg_bleu_score[2], avg_bleu_score[3], \
       avg_meteor_score, avg_rouge_score, avg_cider_score))

    return {
        'BLEU': avg_bleu_score,
        'METEOR': avg_meteor_score,
        'ROUGE': avg_rouge_score,
        'CIDEr': avg_cider_score
    }
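
An illustrative call of language_eval with made-up sentences; it assumes the coco-caption scorers imported above are available (METEOR additionally needs a Java runtime).

sample_seqs = ['a man is riding a horse', 'two dogs play in the snow']
gt_seqs = [['a man rides a horse', 'a person riding a horse'],
           ['dogs are playing in the snow', 'two dogs playing outside']]
metrics = language_eval(sample_seqs, gt_seqs)
# metrics['BLEU'] is [Bleu_1, Bleu_2, Bleu_3, Bleu_4]; METEOR, ROUGE and CIDEr are single floats
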
Example #11
    def evaluate(self):
        output = {}
        scorers = [
            (Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]),
            (Meteor(), "METEOR"),
            (Rouge(), "ROUGE_L"),
            # (Cider(), "CIDEr")
        ]

        # =================================================
        # Compute scores
        # =================================================
        for scorer, method in scorers:
            # print 'computing %s score...'%(scorer.method())
            score, scores = scorer.compute_score(self.gts, self.res)
            if type(method) == list:
                for sc, scs, m in zip(score, scores, method):
                    #print ("%s: %0.5f"%(m, sc))
                    output[m] = sc
            else:
                #print ("%s: %0.5f"%(method, score))
                output[method] = score
        return output
Example #12
def score(ref, hypo):
    """
    ref, dictionary of reference sentences (id, sentence)
    hypo, dictionary of hypothesis sentences (id, sentence)
    score, dictionary of scores
    """
    #print('setting up scorers...')
    scorers = [
        (Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]),
        #(Meteor(), "METEOR"), # hidde currently due to slow speed
        (Rouge(), "ROUGE_L")
        #(Cider(), "CIDEr")
    ]
    final_scores = {}
    for scorer, method in scorers:
        score, scores = scorer.compute_score(ref, hypo)
        if type(score) == list:
            for m, s in zip(method, score):
                final_scores[m] = s
        else:
            final_scores[method] = score
    # print('final_scores: ', final_scores)
    return final_scores
Example #13
    subparsers = parser.add_subparsers(help="choose between different modes of operation")
    preprocess = subparsers.add_parser("preprocess",
                                       help="preprocessing related commands")
    preprocess.add_argument("--op",
                            help="operation",
                            type=str,
                            choices=["make", "vocab"], required=True)
    exclusive = preprocess.add_mutually_exclusive_group(required=True)
    exclusive.add_argument("--dataset",
                           help="the dataset to process, should be subdir of iobasedir",
                           action="append",
                           default=[])
    exclusive.add_argument("--topic",
                           help="the topic to process, should be subdir of iobasedir, and contain a processed topic",
                           action="append",
                           default=[])

    # preprocess_data = preprocess.add_subparsers(help="preprocessing commands")
    # preprocess_dataset = preprocess_data.add_parser("dataset",
    #                                                 help="data preprocessing tool. prepare raw dataset for summarization")
    # preprocess_dataset.add_argument("")

    # postprocess = subparsers.add_parser("postprocess",
    #                                     help="Postprocessing of results. Converting raw results into pretty pictures and reports")


    args = parser.parse_args()

    do_preprocess(rouge=Rouge(args.rouge), datasets=args.dataset, topics=args.topic, operation=args.op)
Example #14
    def test_n_score(self):
        models = PerlScriptUtils._parse_models(self.models_path)
        systems = PerlScriptUtils._parse_systems(self.systems_path)

        # no swr, no stem
        n_scores = {}
        for system_id, cand_texts in tqdm(systems.items()):
            if system_id not in n_scores:
                n_scores[system_id] = {}
            for topic_id, cand_text in cand_texts.items():
                if topic_id not in n_scores[system_id]:
                    n_scores[system_id][topic_id] = {}
                ref_texts = models[topic_id].values()
                n_score = self.rouge.n_score(ref_texts, cand_text)
                n_scores[system_id][topic_id] = n_score

        n_scores_perl = PerlScriptUtils.parse_rouge_perl_out("duc2005_subset/rouge_perl.out")
        for system_id, pyrouge_topics in n_scores.items():
            for topic_id, pyrouge_scores in pyrouge_topics.items():
                scores_perl = n_scores_perl[system_id][topic_id]
                for n, scores in scores_perl.items():
                    for k,v in scores.items():
                        # ROUGE truncates, while we round.
                        self.assertAlmostEqual(v, pyrouge_scores[n][k], 4,
                            "Results different from original ROUGE.")
        # swr, no stem
        self.rouge = Rouge.from_rouge155_args({"s": True})
        n_scores = {}
        for system_id, cand_texts in tqdm(systems.items()):
            if system_id not in n_scores:
                n_scores[system_id] = {}
            for topic_id, cand_text in cand_texts.items():
                if topic_id not in n_scores[system_id]:
                    n_scores[system_id][topic_id] = {}
                ref_texts = models[topic_id].values()
                n_score = self.rouge.n_score(ref_texts, cand_text)
                n_scores[system_id][topic_id] = n_score

        n_scores_perl = PerlScriptUtils.parse_rouge_perl_out("duc2005_subset/rouge_perl_swr.out")
        for system_id, pyrouge_topics in n_scores.items():
            for topic_id, pyrouge_scores in pyrouge_topics.items():
                scores_perl = n_scores_perl[system_id][topic_id]
                for n, scores in scores_perl.items():
                    for k,v in scores.items():
                        # ROUGE truncates, while we round.
                        self.assertAlmostEqual(v, pyrouge_scores[n][k], 4,
                            "Results different from original ROUGE (swr).")

        # stem, no swr
        self.rouge = Rouge.from_rouge155_args({"m": True})
        n_scores = {}
        for system_id, cand_texts in tqdm(systems.items()):
            if system_id not in n_scores:
                n_scores[system_id] = {}
            for topic_id, cand_text in cand_texts.items():
                if topic_id not in n_scores[system_id]:
                    n_scores[system_id][topic_id] = {}
                ref_texts = models[topic_id].values()
                n_score = self.rouge.n_score(ref_texts, cand_text)
                n_scores[system_id][topic_id] = n_score

        n_scores_perl = PerlScriptUtils.parse_rouge_perl_out("duc2005_subset/rouge_perl_stem.out")
        for system_id, pyrouge_topics in n_scores.items():
            for topic_id, pyrouge_scores in pyrouge_topics.items():
                scores_perl = n_scores_perl[system_id][topic_id]
                for n, scores in scores_perl.items():
                    for k,v in scores.items():
                        # ROUGE truncates, while we round.
                        self.assertAlmostEqual(v, pyrouge_scores[n][k], 4,
                            "Results different from original ROUGE (stem).")

        # stem, swr
        self.rouge = Rouge.from_rouge155_args({"m": True, "s": True})
        n_scores = {}
        for system_id, cand_texts in tqdm(systems.items()):
            if system_id not in n_scores:
                n_scores[system_id] = {}
            for topic_id, cand_text in cand_texts.items():
                if topic_id not in n_scores[system_id]:
                    n_scores[system_id][topic_id] = {}
                ref_texts = models[topic_id].values()
                n_score = self.rouge.n_score(ref_texts, cand_text)
                n_scores[system_id][topic_id] = n_score

        n_scores_perl = PerlScriptUtils.parse_rouge_perl_out("duc2005_subset/rouge_perl_stem_swr.out")
        for system_id, pyrouge_topics in n_scores.items():
            for topic_id, pyrouge_scores in pyrouge_topics.items():
                scores_perl = n_scores_perl[system_id][topic_id]
                for n, scores in scores_perl.items():
                    for k,v in scores.items():
                        # ROUGE truncates, while we round.
                        self.assertAlmostEqual(v, pyrouge_scores[n][k], 4,
                            "Results different from original ROUGE (stem, swr).")
Example #15
    'オレンジ色 の Tシャツ を 着ている 人 が 立って います',
]
# predicted must contain exactly one caption per key
predicted = {}
predicted['262148'] = ['人 が オレンジ色 の シャツ を 着て 立って います']

# keys can be numbers or strings, as long as they match between ground_truth and predicted!

#compute BLEU
scorer = Bleu(4)
score, scores = scorer.compute_score(ground_truth, predicted)
print(scores)
for i, value in enumerate(scores):
    # the arithmetic mean of per-sentence scores differs from the corpus score above,
    # because corpus-level BLEU is a weighted geometric mean, not a simple average
    print(i, np.mean(value))

#METEOR requires an additional thesaurus/paraphrase resource, so it is skipped here

#compute Rouge
scorer = Rouge()
score, scores = scorer.compute_score(ground_truth, predicted)
print(score)
print(np.mean(scores))

#compute CIDEr
scorer = Cider()
score, scores = scorer.compute_score(ground_truth, predicted)
print(score)
print(np.mean(scores))
Example #16
    def cross_evaluate(self):
        """
		We will evaluate how relevant is the generated expression to the ground-truth expressions,
		and how different it is to the expressions of the other objects within the same image.
		Thus, the prerequisite is the dataset is split by image_id, and each ann has multiple
		expressions, e.g., our new RefCOCO dataset whose tesing object has ~10 expressions.
		We first compute score on sc_ii = (sent_i, gd_sents_i), then compute score on 
		sc_ij = (sent_i, gd_sents_j), the margin of max(0, sc_ii - sc_ij) will be considered
		as final score.
		Speficically, we choose METEOR and CIDEr for this kind of evaluation.

		For doing so, we need to prepare ref_to_gts and ref_to_res. As we want to do cross evaluation,
		our key would be paird_id, i.e., "ref_id1_to_ref_id2", e.g, '123_456', then 
		input:
		- Gts[123_456] = [456's gd sents]
		- Res[123_456] = [123's predicted sents]. 
		return:
		- ref_to_eval[123_456] = {method: score}, which measures 123's generation over 456's gd-sents
		Note, we also compute score of 123_123
		
		We will use "sids" and "cids" to donate source_ref_ids and cross_ref_ids.
		"""
        source_ref_ids = [pred['ref_id'] for pred in self.preds]
        Preds = {pred['ref_id']: pred['sent'] for pred in self.preds}

        # construct pair_id, which is [source_ref_id]_[target_ref_id], i.e, 123_456
        Gts = {}
        Res = {}
        for source_ref_id in source_ref_ids:
            image_id = self.refer.Refs[source_ref_id]['image_id']
            cross_refs = self.refer.imgToRefs[
                image_id]  # including source_ref itself
            for cross_ref in cross_refs:
                pair_id = str(source_ref_id) + '_' + str(cross_ref['ref_id'])
                Res[pair_id] = [Preds[source_ref_id]]
                Gts[pair_id] = [
                    sent['sent'] for sent in cross_ref['sentences']
                ]

        # tokenize
        print 'tokenization...'
        tokenizer = PTBTokenizer()
        Gts = tokenizer.tokenize(Gts)
        Res = tokenizer.tokenize(Res)

        # set up scorers
        print 'setting up scorers...'
        scorers = [(Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]),
                   (Meteor(), "METEOR"), (Rouge(), "ROUGE_L"),
                   (Cider(), "CIDEr")]

        # compute scores
        for scorer, method in scorers:
            print 'computing %s score...' % (scorer.method())
            score, scores = scorer.compute_score(Gts, Res)
            if type(method) == list:
                for sc, scs, m in zip(score, scores, method):
                    self.setEvals(scs, Gts.keys(), m)
                    print "%s: %0.3f" % (m, sc)
            else:
                self.setEvals(scores, Gts.keys(), method)
                print "%s: %0.3f" % (method, score)
Example #17
def valid(model, mode='all'):
    model.eval()
    with open(DATASET_DEV_CLS3, 'rb') as f:
        dataset_cls3 = pickle.load(f)
    with open(DATASET_DEV_CLS18, 'rb') as f:
        dataset_cls18 = pickle.load(f)
    dataset_summ_qa = data.ConcatDataset([dataset_cls3, dataset_cls18])

    cls3_loader = torch.utils.data.DataLoader(dataset=dataset_cls3,
                                              batch_size=VALID_BATCH,
                                              shuffle=False,
                                              collate_fn=lambda x: x)
    cls3_iterator = iter(cls3_loader)
    cls18_loader = torch.utils.data.DataLoader(dataset=dataset_cls18,
                                               batch_size=VALID_BATCH,
                                               shuffle=False,
                                               collate_fn=lambda x: x)
    cls18_iterator = iter(cls18_loader)

    rouge_summ = rouge_qa = None
    acc_cls3 = acc_cls18 = 0
    # --------------------------------------------------------------------
    if mode in ['all', 'summ', 'qa']:
        data_val_sum_qa = []
        if VALID_NUM > 0:
            for i in range(VALID_NUM):
                data_val_sum_qa.append(dataset_summ_qa[i])
        else:
            for i in range(len(dataset_summ_qa)):
                data_val_sum_qa.append(dataset_summ_qa[i])

    if mode in ['all', 'summ']:
        refs = [' '.join(data['question']) for data in data_val_sum_qa]
        x = [data['description'] for data in data_val_sum_qa]
        hyps = beam_search('summ', model, x)
        hyps = [' '.join(list(sent)) for sent in hyps]
        rouge = Rouge()
        try:
            rouge_summ = rouge.get_scores(hyps,
                                          refs,
                                          avg=True,
                                          ignore_empty=True)
            print_rouge(rouge_summ)
        except RuntimeError:
            print('Failed to compute Rouge!')

    if mode in ['all', 'qa']:
        refs = [' '.join(data['answer']) for data in data_val_sum_qa]
        x = [data['question'] for data in data_val_sum_qa]
        hyps = beam_search('qa', model, x)
        hyps = [' '.join(list(sent)) for sent in hyps]
        rouge = Rouge()
        try:
            rouge_qa = rouge.get_scores(hyps,
                                        refs,
                                        avg=True,
                                        ignore_empty=True)
            print_rouge(rouge_qa)
        except RuntimeError:
            print('Failed to compute Rouge!')

    # cls3 & cls18
    def iter_through_cls_dev(iterator, mode):
        val_correct = 0
        val_num = 0
        for i in range(math.ceil(VALID_NUM / VALID_BATCH)):
            mini_batch = next(iterator)
            question = [data['question'] for data in mini_batch]
            description = [data['description'] for data in mini_batch]
            y_gt = torch.tensor([data['category']
                                 for data in mini_batch]).to(device)
            y_pred = model(source=description,
                           source2=question,
                           target=None,
                           mode=mode)
            y_pred_labels = torch.argmax(y_pred, dim=1)
            val_correct += (y_gt == y_pred_labels).sum().item()
            val_num += len(mini_batch)

        return val_correct / val_num

    if mode in ['all', 'cls3']:
        acc_cls3 = iter_through_cls_dev(cls3_iterator, 'cls3')
        print('Acc_cls3:', acc_cls3)
    if mode in ['all', 'cls18']:
        acc_cls18 = iter_through_cls_dev(cls18_iterator, 'cls18')
        print('Acc_cls18:', acc_cls18)

    if is_training:
        model.train()

    return rouge_summ, rouge_qa, acc_cls3, acc_cls18
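
The rouge package's get_scores(..., avg=True), as used above, returns nested dicts of recall/precision/F1 per ROUGE variant. Below is a hypothetical printer in the spirit of the print_rouge helper, which is defined elsewhere and not shown in this example.

from rouge import Rouge

def print_rouge_sketch(scores):
    # scores: {'rouge-1': {'r': .., 'p': .., 'f': ..}, 'rouge-2': {...}, 'rouge-l': {...}}
    for name in ('rouge-1', 'rouge-2', 'rouge-l'):
        s = scores[name]
        print('%s  f: %.4f  p: %.4f  r: %.4f' % (name, s['f'], s['p'], s['r']))

print_rouge_sketch(Rouge().get_scores(['the cat sat on the mat'],
                                      ['a cat was sitting on the mat'],
                                      avg=True))
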
Example #18
class CaptionEvaluater(object):
    def __init__(self):
        self.blue_scorer = Bleu(4)
        self.rouge_scorer = Rouge()
        self.cider_scorer = Cider()
        self.truth = None
        remove = string.punctuation + "、。,."
        self.remove_pattern = r"[{}]".format(remove)  # create the pattern

    def remove_punctuation(self, line):
        #I am not sure how unicode works in python, so just in case.
        line = line.replace(u"<unk>", "")
        line = line.replace("<unk>", "")
        line = line.replace(u"。", "")
        line = line.replace('\u3002', "")
        return re.sub(self.remove_pattern, "", line)

    def trnasform_utf8(self, line):
        # return u' '.join(line).encode('utf-8').strip()
        return line

    def set_ground_truth(self, ground_truth):
        '''
        ground_truth should be a python dictionary whose shape is:
            {"image_identifier": ["a caption", "a similar caption", ...], ...}
        "image_identifier" can be either a string or a number.
        '''
        for img in ground_truth:
            # ground_truth[img]=map(self.trnasform_utf8,ground_truth[img])
            # list() keeps this working on Python 3, where map returns an iterator
            ground_truth[img] = list(map(self.remove_punctuation, ground_truth[img]))
        self.truth = ground_truth

    def evaluate(self, predicetd_captions):
        '''
        predicetd_captions should be a python dictionary whose shape is:
            {"image_identifier": ["the predicted caption"], ...}
        "image_identifier" needs to be the same as used in the ground truth.
        Make sure there is exactly one caption per image, even though it is wrapped in a python list.
        '''
        for img in predicetd_captions:
            # predicetd_captions[img]=map(self.trnasform_utf8,predicetd_captions[img])
            # list() keeps this working on Python 3, where map returns an iterator
            predicetd_captions[img] = list(map(self.remove_punctuation,
                                               predicetd_captions[img]))

        results = {}
        for i, score in enumerate(self.get_bleu(predicetd_captions)[0]):
            results["bleu-%d" % i] = score
        results["rouge"] = self.get_rouge(predicetd_captions)[0]
        results["cider"] = self.get_cider(predicetd_captions)[0]

        return results

    def get_bleu(self, predicetd_captions):
        score, scores = self.blue_scorer.compute_score(self.truth,
                                                       predicetd_captions)
        #output is a python list [bleu-1,bleu-2,bleu-3,bleu-4]
        return score, scores

    def get_rouge(self, predicetd_captions):
        score, scores = self.rouge_scorer.compute_score(
            self.truth, predicetd_captions)
        return score, scores

    def get_cider(self, predicetd_captions):
        score, scores = self.cider_scorer.compute_score(
            self.truth, predicetd_captions)
        return score, scores
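
A hedged usage sketch for CaptionEvaluater; the image identifiers and captions are made up, and captions are assumed to be pre-segmented into space-separated tokens as the class expects.

evaluater = CaptionEvaluater()
evaluater.set_ground_truth({'img1': ['a cat sits on the mat', 'a cat is sitting on a mat'],
                            'img2': ['a dog runs in the park']})
results = evaluater.evaluate({'img1': ['a cat sat on the mat'],
                              'img2': ['the dog is running in a park']})
print(results)   # keys: 'bleu-0' .. 'bleu-3', 'rouge', 'cider'
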
Example #19
    def evaluate(self):

        evalRefIds = [ann['ref_id'] for ann in self.Res]

        refToGts = {}
        refToGtRanks1 = {}
        refToGtRanks2 = {}
        for ref_id in evalRefIds:
            ref = self.refer.Refs[ref_id]
            gt_sents = [sent['sent'] for sent in ref['sentences']]
            refToGts[ref_id] = gt_sents
            if self.eval_cider_r:
                gt_ranks1 = self.refer.get_rank1(ref)
                gt_ranks2 = self.refer.get_rank2(ref)
                refToGtRanks1[ref_id] = gt_ranks1
                refToGtRanks2[ref_id] = gt_ranks2

        refToRes = {ann['ref_id']: [ann['sent']] for ann in self.Res}

        print('tokenization...')
        tokenizer = PTBTokenizer()
        self.refToRes = tokenizer.tokenize(refToRes)
        self.refToGts = tokenizer.tokenize(refToGts)

        # =================================================
        # Set up scorers
        # =================================================
        print('setting up scorers...')
        scorers = [
            (Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]),
            (Meteor(), "METEOR"),
            (Rouge(), "ROUGE_L"),
            (Cider(), "CIDEr"),
        ]

        if self.eval_cider_r:
            scorers.append((CiderR(), "R_CIDEr"))
            scorers.append((CiderRa(), "Ra_CIDEr"))

        # =================================================
        # Compute scores
        # =================================================
        for scorer, method in scorers:
            print('computing %s score...' % (scorer.method()))
            if method == "R_CIDEr":
                score, scores = scorer.compute_score(self.refToGts,
                                                     self.refToRes,
                                                     refToGtRanks1)
            elif method == "Ra_CIDEr":
                score, scores = scorer.compute_score(self.refToGts,
                                                     self.refToRes,
                                                     refToGtRanks2)
            else:
                score, scores = scorer.compute_score(self.refToGts,
                                                     self.refToRes)
            if type(method) == list:
                for sc, scs, m in zip(score, scores, method):
                    self.setEval(sc, m)
                    self.setRefToEvalRefs(scs, self.refToGts.keys(), m)
                    print("%s: %0.3f" % (m, sc))
            else:
                self.setEval(score, method)
                self.setRefToEvalRefs(scores, self.refToGts.keys(), method)
                print("%s: %0.3f" % (method, score))
        self.setEvalRefs()
Example #20
File: eval.py, Project: vinaypn/PQG
    def evaluate(self):
        imgIds = self.params['image_id']
        # imgIds1 = self.params1
        # print "type of imgids ",type(imgIds)
        # # print "imgids",imgIds
        # # print "imgids1",imgIds1
        # for x in xrange(1,10):
        #     print "image ids in evaluate function",imgIds[x]
        # imgIds = self.coco.getImgIds()
        gts = {}
        res = {}
        # count =0
        # print "image ids for gts COCO type ",type(self.coco.imgToAnns)
        for imgId in imgIds:
            # if count <= 10:
            #     # for x in xrange(1,10):
            #     # print "image ids for gts",self.coco.imgToAnns[imgId]
            #     # print "image ids for gts len ",len(self.cocoRes.imgToAnns[imgId])
            #     # gts[imgId] = self.coco.imgToAnns[imgId]
            res[imgId] = self.cocoRes.imgToAnns[imgId]
            gts[imgId] = self.coco.imgToAnns[imgId]
            # print "image ids for res ",self.cocoRes.imgToAnns[imgId]
            # print "image ids for res typr ",type(self.cocoRes.imgToAnns[imgId])
            # count = count+1;

        # for imgId in imgIds1:
        #     if count <= 10:
        #         # for x in xrange(1,10):
        #         # print "image ids for gts",self.coco.imgToAnns[imgId]
        #         # print "image ids for gts len ",len(self.cocoRes.imgToAnns[imgId])
        #         gts[imgId] = self.coco.imgToAnns[imgId]
        #         # res[imgId] = self.cocoRes.imgToAnns[imgId]
        #         print "image ids for gts ",self.coco.imgToAnns[imgId]
        #         print "image ids for gts typr ",type(self.coco.imgToAnns[imgId])
        #         count = count+1;

        # =================================================
        # Tokenization
        # =================================================
        print 'tokenization...'
        tokenizer = PTBTokenizer()
        gts = tokenizer.tokenize_gt(gts)
        res = tokenizer.tokenize(res)

        # =================================================
        # Set up scorers
        # =================================================
        print 'setting up scorers...'
        scorers = [(Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]),
                   (Meteor(), "METEOR"), (Rouge(), "ROUGE_L"),
                   (Cider(), "CIDEr")]

        # =================================================
        # Compute scores
        # =================================================
        for scorer, method in scorers:
            print 'computing %s score...' % (scorer.method())
            score, scores = scorer.compute_score(gts, res)
            if type(method) == list:
                for sc, scs, m in zip(score, scores, method):
                    self.setEval(sc, m)
                    self.setImgToEvalImgs(scs, gts.keys(), m)
                    print "%s: %0.3f" % (m, sc)
            else:
                self.setEval(score, method)
                self.setImgToEvalImgs(scores, gts.keys(), method)
                print "%s: %0.3f" % (method, score)
        self.setEvalImgs()
Example #21
def evaluate_summ_qa(model, dataset, mode, batch_size=64):
    assert mode in ('summ', 'qa'), 'Invalid mode!'

    model.eval()

    data_loader = torch.utils.data.DataLoader(dataset=dataset,
                                              batch_size=batch_size,
                                              shuffle=False,
                                              collate_fn=lambda x: x)

    rouge1_f_sum = rouge2_f_sum = rougeL_f_sum = bleu_sum = 0
    examples_rouge = examples_bleu = 0

    rouge = Rouge()
    count = 0
    if mode == 'summ':
        for mini_batch in tqdm(data_loader):
            count += 1
            refs = [' '.join(data['question']) for data in mini_batch]
            x = [data['description'] for data in mini_batch]
            hyps_raw = beam_search('summ', model, x)
            hyps = [' '.join(list(sent)) for sent in hyps_raw]
            try:
                rouge_score = rouge.get_scores(hyps,
                                               refs,
                                               avg=True,
                                               ignore_empty=True)
                rouge1_f_sum += rouge_score['rouge-1']['f'] * len(mini_batch)
                rouge2_f_sum += rouge_score['rouge-2']['f'] * len(mini_batch)
                rougeL_f_sum += rouge_score['rouge-l']['f'] * len(mini_batch)
                examples_rouge += len(mini_batch)
            except ValueError as e:
                print(str(e) + ' | continuing...')
                continue

    elif mode == 'qa':
        for mini_batch in tqdm(data_loader):
            count += 1
            refs = [' '.join(data['answer']) for data in mini_batch]
            x = [data['question'] for data in mini_batch]
            hyps_raw = beam_search('qa', model, x)
            hyps = [' '.join(list(sent)) for sent in hyps_raw]
            try:
                rouge_score = rouge.get_scores(hyps,
                                               refs,
                                               avg=True,
                                               ignore_empty=True)
                rouge1_f_sum += rouge_score['rouge-1']['f'] * len(mini_batch)
                rouge2_f_sum += rouge_score['rouge-2']['f'] * len(mini_batch)
                rougeL_f_sum += rouge_score['rouge-l']['f'] * len(mini_batch)
                examples_rouge += len(mini_batch)
            except ValueError as e:
                print(str(e) + ' | continuing...')
                continue

            # calculate BLEU score
            refs = [data['answer'] for data in mini_batch]
            hyps = [list(sent) for sent in hyps_raw]
            smoothie = SmoothingFunction().method4
            for i in range(len(hyps)):
                try:
                    bleu = sentence_bleu([refs[i]],
                                         hyps[i],
                                         smoothing_function=smoothie)
                    bleu_sum += bleu
                    examples_bleu += 1
                except ZeroDivisionError as e:
                    print(str(e) + ' | continuing...')
                    continue

    rouge_1_f = rouge1_f_sum / examples_rouge
    rouge_2_f = rouge2_f_sum / examples_rouge
    rouge_L_f = rougeL_f_sum / examples_rouge
    if mode == 'qa':
        bleu_score = bleu_sum / examples_bleu

    # with open('output/test_{}.txt'.format(mode), 'w', encoding='utf-8') as f:
    #     f.write('rouge-1 f: ' + str(rouge_1_f) + '\n')
    #     f.write('rouge-2 f: ' + str(rouge_2_f) + '\n')
    #     f.write('rouge-L f: ' + str(rouge_L_f) + '\n')
    #     f.write('\n')
    #
    #     for i in range((len(candidates)):
    #         f.write('input: ' + inputs[i] + '\n')
    #         f.write('hyp: ' + ''.join(candidates[i]) + '\n')
    #         f.write('ref: ' + targets[i] + '\n\n')

    if is_training:
        model.train()
    print('rouge-1 f: ' + str(rouge_1_f))
    print('rouge-2 f: ' + str(rouge_2_f))
    print('rouge-L f: ' + str(rouge_L_f))
    if mode == 'qa':
        print('bleu: ', bleu_score)
Example #22
        hdf = hdf.loc[hdf['LP'] == lp]
        hdf = hdf.loc[hdf['SYSTEM'] == sys]
        hdf.reset_index(drop=True, inplace=True)

        cands = []
        fc = open(csdir + '/' + cs, "r", encoding='utf-8')
        while True:
            line = fc.readline()
            if not line:
                break
            cands.append(line)

        assert len(cands) == len(refs)

        rouge = Rouge()
        scores = rouge.get_scores(cands, refs)
        R1 = [one['rouge-1']['f'] for one in scores]
        R2 = [one['rouge-2']['f'] for one in scores]
        R3 = [one['rouge-l']['f'] for one in scores]
        outlist.append([
            lp, sys,
            np.mean(R1),
            np.mean(R2),
            np.mean(R3), hdf['HUMAN'].item()
        ])
        end = perf_counter()
        print("LP : {0:10}SYS: {1:30s}time taken: {2:5.3f}".format(
            lp, sys, end - start))
    sz = len(cses)
    pees = [row[2] for row in outlist[-sz:]]
Example #23
    def setUp(self):
        self.rouge = Rouge.from_rouge155_args()
        self.models_path = Path("duc2005_subset/models")
        self.systems_path = Path("duc2005_subset/peers")
Example #24
    def evaluate(self, gt_path, results_path):
        #imgIds = self.params['image_id']
        # imgIds = self.coco.getImgIds()
        gts = {}
        gt_pkl = pickle.load(open(gt_path, 'rb'))
        for key in gt_pkl.keys():
            sample = gt_pkl[key]
            img_id = int(sample['image_id'])
            caption = sample['caption']
            if img_id in gts:
                gts[img_id].append({'image_id': img_id, 'caption': caption})
            else:
                gts[img_id] = [{'image_id': img_id, 'caption': caption}]
        #print(gts)

        res = {}        
        res_json = json.load(open(results_path,'rb'))
        for sample in res_json:
            img_id = int(sample['image_id'])
            res[img_id] = [sample]
        #print(res)
        #print(1/0)
        #for imgId in imgIds:
            #gts[imgId] = self.coco.imgToAnns[imgId]
            #print(gts[imgId])
            #res[imgId] = self.cocoRes.imgToAnns[imgId]
            #print(imgId)
            #print(res[imgId])
            #print(1/0)
        #print(res)
        #print(1/0)
        # =================================================
        # Tokenization
        # =================================================
        print 'tokenization...'
        tokenizer = PTBTokenizer()
        gts  = tokenizer.tokenize(gts)
        res = tokenizer.tokenize(res)

        # =================================================
        # Set up scorers
        # =================================================
        print 'setting up scorers...'
        scorers = [
            (Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]),
            (Meteor(),"METEOR"),
            (Rouge(), "ROUGE_L"),
            (Cider(), "CIDEr")
            #(Spice(), "SPICE")
        ]

        # =================================================
        # Compute scores
        # =================================================
        for scorer, method in scorers:
            print 'computing %s score...'%(scorer.method())
            score, scores = scorer.compute_score(gts, res)
            if type(method) == list:
                for sc, scs, m in zip(score, scores, method):
                    self.setEval(sc, m)
                    self.setImgToEvalImgs(scs, gts.keys(), m)
                    print "%s: %0.3f"%(m, sc)
            else:
                self.setEval(score, method)
                self.setImgToEvalImgs(scores, gts.keys(), method)
                print "%s: %0.3f"%(method, score)
        self.setEvalImgs()
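
The shapes this loader appears to expect, written out with made-up content and hypothetical file names: gt_path is a pickle of {any_key: {'image_id': ..., 'caption': ...}} with one entry per reference caption, and results_path is a JSON list holding one {'image_id': ..., 'caption': ...} entry per image.

import json
import pickle

with open('gt_captions.pkl', 'wb') as f:
    pickle.dump({0: {'image_id': 1, 'caption': 'a dog on a couch'},
                 1: {'image_id': 1, 'caption': 'a small dog sits on a sofa'}}, f)
with open('results.json', 'w') as f:
    json.dump([{'image_id': 1, 'caption': 'a dog sitting on a sofa'}], f)
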
Example #25
        if i % 2 == 1:
            res[int(line.strip('\n').split(':')[0])] = [
                line.strip('\n').split(':')[2]
            ]
        elif i % 2 == 0:
            gts[int(line.strip('\n').split(':')[0])] = [
                line.strip('\n').split(':')[2]
            ]

hyps = []
refs = []
bleu_score = 0.0

for k in res:
    assert k in gts
    hyps.append(res[k][0])
    refs.append(gts[k][0])
for hyp, ref in zip(hyps, refs):
    hyp = hyp.strip().split()
    ref = ref.strip().split()
    bleu_score += sentence_bleu([ref],
                                hyp,
                                smoothing_function=SmoothingFunction().method4)

print("score_Bleu: "), bleu_score * 1.0 / len(hyps)

score_Meteor, scores_Meteor = Meteor().compute_score(gts, res)
print("Meteor: "), score_Meteor
score_Rouge, scores_Rouge = Rouge().compute_score(gts, res)
print("ROUGe: "), score_Rouge
Example #26
#     res[imgId] = res_results[imgId]
# =================================================
# Set up scorers
# =================================================
# print 'tokenization...'
# tokenizer = PTBTokenizer()
# gts  = tokenizer.tokenize(gts)
# res = tokenizer.tokenize(res)
# =================================================
# Set up scorers
# =================================================
print 'setting up scorers...'
scorers = [
    (Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]),
    #(Meteor(),"METEOR"),
    (Rouge(), "ROUGE_L"),
    (Cider(), "CIDEr")
]
# =================================================
# Compute scores
# =================================================
for scorer, method in scorers:
    print 'computing %s score...' % (scorer.method())
    score, scores = scorer.compute_score(gts, res)
    if type(method) == list:
        for sc, scs, m in zip(score, scores, method):
            #self.setEval(sc, m)
            #self.setImgToEvalImgs(scs, gts.keys(), m)
            print "%s: %0.3f" % (m, sc)
    else:
        #self.setEval(score, method)
Example #27
filename = '../data/ASAP_AES/training_set_rel3.tsv'
ref_file = '../data/ASAP_AES/reference_3_aes.tsv'
df = pd.read_csv(filename, delimiter='\t', encoding='ISO-8859-1')
ref1 = pd.read_csv(ref_file, delimiter='\t')

can1 = df.loc[df['essay_set'] == 3]
can1.reset_index(drop=True, inplace=True)
cands = list(can1['essay'])
print("Candidate Sentences: ", len(cands))
# print(cands[0])

ref1.reset_index(drop=True, inplace=True)
ref = ref1['Reference'][0]

rouge = Rouge()
sc = []
for cand, canid, hscore in zip(cands, can1['essay_id'], can1['domain1_score']):
    scores = rouge.get_scores(cand, ref)[0]
    sc.append([ canid, scores['rouge-1']['f'], scores['rouge-1']['p'], scores['rouge-1']['r'], \
       scores['rouge-2']['f'], scores['rouge-2']['p'], scores['rouge-2']['r'], \
       scores['rouge-l']['f'], scores['rouge-l']['p'], scores['rouge-l']['r'], hscore ])

# print(can1.columns)
odf = pd.DataFrame(sc,
                   columns=[
                       'cand id', 'R1f', 'R1p', 'R1r', 'R2f', 'R2p', 'R2r',
                       'RLf', 'RLp', 'RLr', 'score'
                   ])
odf.reset_index(drop=True, inplace=True)
odf.to_csv("outs.tsv", sep="\t", index=False, header=True)
Example #28
from six.moves import cPickle

import torch
import torch.nn as nn

import opts
opt = opts.parse_opt()

from bleu.bleu import Bleu
from meteor.meteor import Meteor
from cider.cider import Cider
from rouge.rouge import Rouge
Bleu_score = Bleu(4)
Meteor_score = Meteor()
Cider_score = Cider()
Rouge_score = Rouge()

with open(opt.train_data_path, 'rb') as f:
    print('\nload  {}'.format(opt.train_data_path))
    train_data = cPickle.load(f)

with open(opt.val_data_path, 'rb') as f:
    print('\nload  {}'.format(opt.val_data_path))
    val_data = cPickle.load(f)

with open(opt.test_data_path, 'rb') as f:
    print('\nload  {}'.format(opt.test_data_path))
    test_data = cPickle.load(f)

with open(opt.token2index_path, 'rb') as f:
    print('\nload  {}'.format(opt.token2index_path))
Example #29
def compute_metrics_from_files(p_path_to_reference_file,
                               p_path_to_candidate_file,
                               p_max_bleu_order):
    """Compute BLEU-N and ROUGE-L metrics.
    IMPORTANT: No-answer reference will be excluded from calculation.

    Args:
    p_path_to_reference_file (str): path to reference file.
    p_path_to_candidate_file (str): path to candidate file.
        Both files should be in format:
            {QUERY_ID_JSON_ID: <a_query_id_int>,
             ANSWERS_JSON_ID: [<list_of_answers_string>]}
    p_max_bleu_order: the maximum n order in bleu_n calculation.

    Returns:
    dict: dictionary of {'bleu_n': <bleu_n score>, 'rouge_l': <rouge_l score>}
    """

    reference_dictionary, reference_no_answer_query_ids = \
        load_file(p_path_to_reference_file)
    candidate_dictionary, candidate_no_answer_query_ids = load_file(p_path_to_candidate_file)
    query_id_answerable = set(reference_dictionary.keys())-reference_no_answer_query_ids
    query_id_answerable_candidate = set(candidate_dictionary.keys())-candidate_no_answer_query_ids
    
    true_positives = len(query_id_answerable_candidate.intersection(query_id_answerable))
    false_negatives = len(query_id_answerable)-true_positives
    true_negatives = len(candidate_no_answer_query_ids.intersection(reference_no_answer_query_ids))
    false_positives = len(reference_no_answer_query_ids)-true_negatives
    precision = float(true_positives)/(true_positives+false_positives) if (true_positives+false_positives)>0 else 1.
    recall = float(true_positives)/(true_positives+false_negatives) if (true_positives+false_negatives)>0 else 1.
    F1 = 2 *((precision*recall)/(precision+recall))
    filtered_reference_dictionary = \
        {key: value for key, value in reference_dictionary.items() \
                    if key not in reference_no_answer_query_ids}

    filtered_candidate_dictionary = \
        {key: value for key, value in candidate_dictionary.items() \
                    if key not in reference_no_answer_query_ids}

    for query_id, answers in filtered_candidate_dictionary.items():
        assert \
            len(answers) <= 1, \
            'query_id %d contains more than 1 answer \"%s\" in candidate file' % \
            (query_id, str(answers))

    reference_query_ids = set(filtered_reference_dictionary.keys())
    candidate_query_ids = set(filtered_candidate_dictionary.keys())
    common_query_ids = reference_query_ids.intersection(candidate_query_ids)
    assert (len(common_query_ids) == len(reference_query_ids)) and \
            (len(common_query_ids) == len(candidate_query_ids)), \
           'Reference and candidate files must share same query ids'

    all_scores = {}
    bleu_scores, _ = \
        Bleu(p_max_bleu_order).compute_score(filtered_reference_dictionary, \
                                             filtered_candidate_dictionary)
    for i, bleu_score in enumerate(bleu_scores):
        all_scores['bleu_%d' % (i+1)] = bleu_score

    rouge_score, _ = Rouge().compute_score(filtered_reference_dictionary, \
                                           filtered_candidate_dictionary)
    all_scores['rouge_l'] = rouge_score
    all_scores['F1'] = F1
    similarity = 0
    for key in filtered_reference_dictionary:
        candidate_answer = nlp(filtered_candidate_dictionary[key][0])
        reference_answer = filtered_reference_dictionary[key]
        answersimilarity = 0
        for answer in reference_answer:
            answersimilarity += candidate_answer.similarity(nlp(answer))
        similarity += answersimilarity/len(reference_answer)
    semantic_similarity = similarity/len(filtered_reference_dictionary)
    all_scores['Semantic_Similarity'] = semantic_similarity
    return all_scores
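
Illustrative inputs for compute_metrics_from_files, assuming the script's QUERY_ID_JSON_ID and ANSWERS_JSON_ID constants are 'query_id' and 'answers', that load_file reads one JSON object per line, and that a spaCy model is bound to the global nlp (all of these are defined elsewhere and not shown here). The file names are made up.

# references.jsonl / candidates.jsonl, one JSON object per line, e.g.:
#   {"query_id": 10, "answers": ["rouge-l measures longest common subsequence overlap"]}
all_scores = compute_metrics_from_files('references.jsonl', 'candidates.jsonl', 4)
print(all_scores['bleu_1'], all_scores['rouge_l'], all_scores['F1'])
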
Example #30
    def evaluate(self, preds, measure=None):
        """
        measure is a subset of ['bleu', 'meteor', 'rouge', 'cider']
        if measure is None, we will apply all the above.
        """

        # story_img_ids -> pred story str
        stimgids_to_Res = {
            item['stimgids']:
            [item['pred_story_str'].encode('ascii', 'ignore').decode('ascii')]
            for item in preds
        }

        # story_img_ids -> gt storie str(s)
        stimgids_to_stories = {}
        for story in self.vist_sis.stories:
            story_img_ids = '_'.join(
                [str(img_id) for img_id in story['img_ids']])
            if story_img_ids in stimgids_to_stories:
                stimgids_to_stories[story_img_ids] += [story]
            else:
                stimgids_to_stories[story_img_ids] = [story]

        stimgids_to_Gts = {}
        for stimgids in stimgids_to_Res.keys():
            gd_story_strs = []
            related_stories = stimgids_to_stories[stimgids]
            for story in related_stories:
                gd_sent_ids = self.vist_sis.Stories[story['id']]['sent_ids']
                gd_story_str = ' '.join([
                    self.vist_sis.Sents[sent_id]['text']
                    for sent_id in gd_sent_ids
                ])
                gd_story_str = gd_story_str.encode('ascii', 'ignore').decode(
                    'ascii')  # ignore some weird token
                gd_story_strs += [gd_story_str]
            stimgids_to_Gts[stimgids] = gd_story_strs

        # tokenize
        # print 'tokenization ... '
        # tokenizer = PTBTokenizer()
        # self.stimgids_to_Res = tokenizer.tokenize(stimgids_to_Res)
        # self.stimgids_to_Gts = tokenizer.tokenize(stimgids_to_Gts)
        self.stimgids_to_Res = stimgids_to_Res
        self.stimgids_to_Gts = stimgids_to_Gts

        # =================================================
        # Set up scorers
        # =================================================
        print 'setting up scorers...'
        scorers = []
        if not measure:
            scorers = [(Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]),
                       (Meteor(), "METEOR"), (Rouge(), "ROUGE_L"),
                       (Cider(), "CIDEr")]
        else:
            if 'bleu' in measure:
                scorers += [(Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3",
                                       "Bleu_4"])]
            if 'meteor' in measure:
                scorers += [(Meteor(), "METEOR")]
            if 'rouge' in measure:
                scorers += [(Rouge(), "ROUGE_L")]
            if 'cider' in measure:
                scorers += [(Cider(), "CIDEr")]

        # =================================================
        # Compute scores
        # =================================================
        for scorer, method in scorers:
            print 'computing %s score ...' % (scorer.method())
            score, scores = scorer.compute_score(self.stimgids_to_Gts,
                                                 self.stimgids_to_Res)
            if isinstance(method, list):
                for sc, scs, m in zip(score, scores, method):
                    self.setEval(sc, m)
                    self.setStimgidsToEval(scs, self.stimgids_to_Gts.keys(), m)
                    print '%s: %.3f' % (m, sc)
            else:
                self.setEval(score, method)
                self.setStimgidsToEval(scores, self.stimgids_to_Gts.keys(),
                                       method)
                print '%s: %.3f' % (method, score)

        self.setEvalStimgids()