예제 #1
0
def calculate_nlg_score(generated_question_tensor, reference_question_tensor,
                        wid_to_word):
    """Average NLG metrics over a batch of generated/reference questions.

    :param generated_question_tensor: (batch, seq_len) tensor of token ids
        produced by the model — assumed shape; TODO confirm.
    :param reference_question_tensor: (batch, seq_len) tensor of token ids;
        position 0 of each row is skipped (presumably a start-of-sequence
        token — confirm against the dataset).
    :param wid_to_word: mapping from token id to word, consumed by
        ``translate_tokens``.
    :return: dict mapping metric name -> value averaged over the batch.
    """
    batch_size = generated_question_tensor.size(0)
    val = {}
    for j in range(batch_size):
        new_question = translate_tokens(generated_question_tensor[j].tolist(),
                                        wid_to_word)
        ref_question = translate_tokens(
            reference_question_tensor[j][1:].tolist(), wid_to_word)
        scores = compute_individual_metrics(' '.join(ref_question),
                                            ' '.join(new_question))
        # Accumulate per-metric sums; the first iteration seeds the dict.
        for key, value in scores.items():
            val[key] = val.get(key, 0) + value
    # Average over the batch.  Iterate over the accumulator itself — the
    # original iterated the loop-local `temp`, which is undefined (NameError)
    # whenever batch_size <= 1.
    for key in val:
        val[key] = val[key] / batch_size
    return val
예제 #2
0
def compute_metrics(hypothesis, references):
    '''Compute word-overlap NLG metrics for one hypothesis/reference pair.

    Delegates to ``nlgeval.compute_individual_metrics`` with skip-thought
    vectors disabled (they require extra model files and are slow) while
    keeping the n-gram overlap and glove-based metrics enabled.

    NOTE(review): upstream nlgeval's signature is ``(ref, hyp, ...)`` but
    this wrapper passes ``hypothesis`` first — the arguments look swapped
    relative to the public API; confirm the intended order with the callers.

    :param hypothesis: generated sentence.
    :param references: reference sentence(s).
    :return: dict mapping metric name -> score.
    '''
    # Keyword arguments replace the original opaque positional booleans
    # (False, True, False) == (no_overlap, no_skipthoughts, no_glove).
    return nlgeval.compute_individual_metrics(hypothesis, references,
                                              no_overlap=False,
                                              no_skipthoughts=True,
                                              no_glove=False)
예제 #3
0
def event_type_nlg_eval(config, logger, result):
    """Compute and log the average nlg-eval scores for event-type chains.

    Compares each ground-truth event-type chain against its prediction with
    the (locally patched) nlg-eval package and logs the per-metric averages.
    """
    print_and_log(logger, "====================================")
    print_and_log(logger, "event type average bleu scores:")
    print_and_log(
        logger, "Please install nlg-eval package!\n"
        "Reference: https://github.com/Maluuba/nlg-eval")
    print_and_log(
        logger, "After installing, please change the package "
        "__init__.py file (contact: [email protected]).")

    sys.path.append(config['nlgeval_repo_dir'])
    from nlgeval import compute_individual_metrics

    num_chains = len(result['chain_name'])
    totals = dict()
    for idx in range(num_chains):
        gt_chain = " ".join(result['gt_all_event_type'][idx])
        hy_chain = " ".join(result['pred_all_event_type'][idx])
        # NOTE(review): no_overlap is handed a tuple, which stock nlgeval
        # would simply treat as truthy; presumably the locally patched
        # package expects this form — confirm.
        scores = compute_individual_metrics(gt_chain,
                                            hy_chain,
                                            no_overlap=(False, True),
                                            no_skipthoughts=True,
                                            no_glove=True)
        for name, score in scores.items():
            totals[name] = totals.get(name, 0) + score

    for name in totals:
        totals[name] = totals[name] / num_chains

    for name in totals:
        print_and_log(logger, "{}: {}".format(name, round(totals[name], 2)))
    return
예제 #4
0
def user_cluster_nlg_eval(config, logger, result):
    """Compute and log average BLEU-1..4 for predicted user-cluster chains.

    Also stores the per-chain scores back into ``result`` under the keys
    ``uc_bleu1`` .. ``uc_bleu4`` (a score, or the string ``'null'`` when the
    ground-truth chain is too short for that n-gram order).

    :param config: dict; ``config['nlgeval_repo_dir']`` is appended to
        ``sys.path`` so the (locally patched) nlg-eval package imports.
    :param logger: logger handed through to ``print_and_log``.
    :param result: dict with parallel lists ``chain_name``,
        ``gt_all_user_cluster`` and ``pred_all_user_cluster``.
    :return: the mutated ``result`` dict.
    """
    print_and_log(logger, "====================================")
    print_and_log(logger, "user cluster average bleu scores:")

    sys.path.append(config['nlgeval_repo_dir'])
    from nlgeval import compute_individual_metrics

    # Per-metric score lists for the final averages.  (The original
    # initialized avg_bleu twice; the duplicate was dead code.)
    avg_bleu = {'Bleu_1': [], 'Bleu_2': [], 'Bleu_3': [], 'Bleu_4': []}

    result['uc_bleu1'] = list()
    result['uc_bleu2'] = list()
    result['uc_bleu3'] = list()
    result['uc_bleu4'] = list()
    for i in range(len(result['chain_name'])):
        gt_events = result['gt_all_user_cluster'][i]
        pred_events = result['pred_all_user_cluster'][i]
        # Empty chains are replaced by a sentinel token so nlg-eval always
        # receives a non-empty sentence.
        if len(gt_events) == 0:
            gt_chain = " ".join(['no_event_in_simperiod'])
        else:
            gt_chain = " ".join([str(ele) for ele in gt_events])
        if len(pred_events) == 0:
            hy_chain = " ".join(['no_event_in_simperiod'])
        else:
            hy_chain = " ".join([str(ele) for ele in pred_events])
        # NOTE(review): no_overlap receives a tuple, which stock nlgeval
        # would treat as truthy; presumably the locally patched package
        # expects this form — confirm.
        metrics_dict = compute_individual_metrics(gt_chain,
                                                  hy_chain,
                                                  no_overlap=(False, True),
                                                  no_skipthoughts=True,
                                                  no_glove=True)
        result['uc_bleu1'].append(metrics_dict['Bleu_1'])
        avg_bleu['Bleu_1'].append(metrics_dict['Bleu_1'])

        # BLEU-n is only meaningful when the ground-truth chain has at
        # least n tokens; shorter chains record 'null' instead.
        if len(gt_events) >= 2:
            result['uc_bleu2'].append(metrics_dict['Bleu_2'])
            avg_bleu['Bleu_2'].append(metrics_dict['Bleu_2'])
        else:
            result['uc_bleu2'].append('null')

        if len(gt_events) >= 3:
            result['uc_bleu3'].append(metrics_dict['Bleu_3'])
            avg_bleu['Bleu_3'].append(metrics_dict['Bleu_3'])
        else:
            result['uc_bleu3'].append('null')

        if len(gt_events) >= 4:
            result['uc_bleu4'].append(metrics_dict['Bleu_4'])
            avg_bleu['Bleu_4'].append(metrics_dict['Bleu_4'])
        else:
            result['uc_bleu4'].append('null')

    for metric in avg_bleu:
        print_and_log(
            logger, "{}: {}".format(metric,
                                    round(np.average(avg_bleu[metric]), 4)))
    return result
예제 #5
0
from nltk.translate.bleu_score import SmoothingFunction, sentence_bleu
from nlgeval import compute_individual_metrics
from nltk.tokenize import RegexpTokenizer
import numpy as np
tokenizer = RegexpTokenizer(r'\w+')

# This function only calculates bleu scores 
def calculate_bleu(actual, predicted):
    """Return smoothed BLEU-1..4 scores for one sentence pair.

    Both strings are word-tokenized with the module-level ``tokenizer``;
    ``actual`` is the single reference, ``predicted`` the hypothesis.
    """
    smoother = SmoothingFunction()
    # Cumulative n-gram weight vectors for BLEU-1 through BLEU-4.
    all_weights = [(1, 0, 0, 0), (0.5, 0.5, 0, 0),
                   (0.33, 0.33, 0.33, 0), (0.25, 0.25, 0.25, 0.25)]
    names = ['BLEU-1', 'BLEU-2', 'BLEU-3', 'BLEU-4']
    reference = [tokenizer.tokenize(actual)]
    hypothesis = tokenizer.tokenize(predicted)
    return {
        name: sentence_bleu(reference, hypothesis, weights=w,
                            smoothing_function=smoother.method4)
        for name, w in zip(names, all_weights)
    }
    
# Demo inputs for the two scoring paths below.
actual = 'This is a small test'
predicted = 'This is a test'

# Fast path: BLEU-only scoring via nltk.
results = calculate_bleu(actual, predicted)
print(results)
# Slow path: the full nlg-eval metric suite on the same pair.
metrics_dict = compute_individual_metrics(actual, predicted)
print(metrics_dict)
__author__ = 'yhd'


# Demo: score a system-generated summary against a human-written reference
# using the full nlg-eval metric suite.
system_generated_summary = " The Kyrgyz President pushed through the law requiring " \
                           "the use of ink during the upcoming Parliamentary and Presidential " \
                           "elections In an effort to live up to its reputation in the 1990s as an " \
                           "island of democracy. The use of ink is one part of a general effort to show commitment " \
                           "towards more open elections. improper use of this type of ink can cause " \
                           "additional problems as the elections in Afghanistan showed. The use of ink " \
                           "and readers by itself is not a panacea for election ills."
manual_summmary = " The use of invisible ink and ultraviolet readers in the elections of the Kyrgyz Republic which is a small, mountainous state of the former Soviet republic, causing both worries and guarded optimism among different sectors of the population. Though the actual technology behind the ink is not complicated, the presence of ultraviolet light (of the kind used to verify money) causes the ink to glow with a neon yellow light. But, this use of the new technology has caused a lot of problems. "


from nlgeval import compute_individual_metrics
# nlgeval expects (reference, hypothesis): the manual summary is the
# reference, the system output the hypothesis.
metrics_dict = compute_individual_metrics(manual_summmary, system_generated_summary)
print(metrics_dict)