from nltk.translate import bleu_score
from nltk.translate import gleu_score as gleu
from nlgeval import NLGEval


def get_evaluation_scores(hypothesis, references, testing_mode=False):
    gleu_scores = {"Gleu_1": gleu.corpus_gleu(references, hypothesis, min_len=1, max_len=1),
                   "Gleu_2": gleu.corpus_gleu(references, hypothesis, min_len=1, max_len=2),
                   "Gleu_3": gleu.corpus_gleu(references, hypothesis, min_len=1, max_len=3),
                   "Gleu_4": gleu.corpus_gleu(references, hypothesis, min_len=1, max_len=4)
                   }

    if testing_mode:
        # NLGEval expects plain strings, so join the token lists back into sentences.
        for i in range(len(hypothesis)):
            hypothesis[i] = ' '.join(hypothesis[i])

        refs = [[]]
        for i in range(len(references)):
            refs[0].append(' '.join(references[i][0]))
            if refs[0][-1] == "":
                # Replace empty references with a placeholder token.
                refs[0][-1] = "no"
        references = refs

        n = NLGEval()
        scores = n.compute_metrics(ref_list=references, hyp_list=hypothesis)
    else:
        scores = {"Bleu_1": bleu_score.corpus_bleu(references, hypothesis, weights=[1.0]),
                  "Bleu_2": bleu_score.corpus_bleu(references, hypothesis, weights=[1. / 2, 1. / 2]),
                  "Bleu_3": bleu_score.corpus_bleu(references, hypothesis, weights=[1. / 3, 1. / 3, 1. / 3]),
                  "Bleu_4": bleu_score.corpus_bleu(references, hypothesis, weights=[1. / 4, 1. / 4, 1. / 4, 1. / 4])}

    for key, val in gleu_scores.items():
        scores[key] = val
    return scores
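For context, NLTK's corpus_gleu and corpus_bleu both expect a list of reference lists (one list of tokenized references per hypothesis) plus a list of tokenized hypotheses. Below is a minimal usage sketch of the function above; the toy sentences are illustrative assumptions, not data from the original project.

# Hedged usage sketch: made-up tokenized data, default testing_mode=False path.
hypotheses = [['the', 'cat', 'sat', 'on', 'the', 'mat']]
references = [[['the', 'cat', 'is', 'on', 'the', 'mat'],
               ['there', 'is', 'a', 'cat', 'on', 'the', 'mat']]]

scores = get_evaluation_scores(hypotheses, references)
print(scores)  # dict with Bleu_1..Bleu_4 and Gleu_1..Gleu_4 keys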
Example #2
def calculate_bleu(data, src_field, model, device, decodeType, max_len=30):
    cc = SmoothingFunction()
    sentBleu = 0.0
    sentGleu = 0.0
    trgs = []
    pred_trgs = []
    #bs = Beam_Search(model)
    for datum in tqdm(data):
        trg = vars(datum)['correction1']
        src = vars(datum)['orig']
        #translate_sentence(src, src_field, model, device, max_len = 25)
        #HERE
        if decodeType == "greedy":
            pred_trg = translate_sentence(src, src_field, model, device,
                                          max_len)
        else:
            #pred_trg = bs(src, src_field, device)
            pred_trg = beam_search(src, src_field, model, device, max_len)
        #cut off <eos> token
        #HERE
        #pred_trg = pred_trg[1:-1]
        #if len(pred_trg) < 2: pred_trg.append(".")
        sentBleu += sentence_bleu([trg],
                                  pred_trg,
                                  smoothing_function=cc.method3)
        sentGleu += sentence_gleu([trg], pred_trg)
        pred_trgs.append(pred_trg)
        trgs.append([trg])
    sentBleu = sentBleu / len(data)
    sentGleu = sentGleu / len(data)
    corpusBleu = corpus_bleu(trgs, pred_trgs, smoothing_function=cc.method3)
    corpusGleu = corpus_gleu(trgs, pred_trgs)
    return sentBleu, sentGleu, corpusBleu, corpusGleu
Example #3
def corpus_gleu(references: List[str], predictions: List[str]):
    if len(references) != len(predictions):
        raise ValueError("The lists must have the same length")

    references = [[o] for o in references]

    return gleu_score.corpus_gleu(references, predictions)
Example #4
def test():
    hyp1 = [
        'It', 'is', 'a', 'guide', 'to', 'action', 'which', 'ensures', 'that',
        'the', 'military', 'always', 'obeys', 'the', 'commands', 'of', 'the',
        'party'
    ]
    ref1a = [
        'It', 'is', 'a', 'guide', 'to', 'action', 'that', 'ensures', 'that',
        'the', 'military', 'will', 'forever', 'heed', 'Party', 'commands'
    ]
    ref1b = [
        'It', 'is', 'the', 'guiding', 'principle', 'which', 'guarantees',
        'the', 'military', 'forces', 'always', 'being', 'under', 'the',
        'command', 'of', 'the', 'Party'
    ]
    ref1c = [
        'It', 'is', 'the', 'practical', 'guide', 'for', 'the', 'army',
        'always', 'to', 'heed', 'the', 'directions', 'of', 'the', 'party'
    ]

    hyp2 = str(
        'he read the book because he was interested in world history').split()
    ref2a = str(
        'he was interested in world history because he read the book').split()

    list_of_references = [[ref1a, ref1b, ref1c], [ref2a]]
    hypotheses = [hyp1, hyp2]
    corpus_score = gleu.corpus_gleu(list_of_references, hypotheses)
    print("Corpus score: " + str(corpus_score))

    # The example below shows that corpus_gleu() differs from averaging sentence_gleu() over the hypotheses
    score1 = gleu.sentence_gleu([ref1a], hyp1)
    score2 = gleu.sentence_gleu([ref2a], hyp2)
    average_score = (score1 + score2) / 2
    print("Sentence score average: " + str(average_score))
Example #5
def computeGLEU(outputs, targets, corpus=False, tokenizer=None):
    if tokenizer is None:
        tokenizer = revtok.tokenize

    if not corpus:
        return [my_sentence_gleu([t], o) for o, t in zip(outputs, targets)]

    return corpus_gleu([[t] for t in targets], [o for o in outputs])
Example #6
def computeGLEU(outputs, targets, corpus=False, tokenizer=None):
    if tokenizer is None:
        tokenizer = revtok.tokenize

    outputs = [tokenizer(o) for o in outputs]
    targets = [tokenizer(t) for t in targets]

    if not corpus:
        return torch.Tensor(
            [sentence_gleu([t], o) for o, t in zip(outputs, targets)])
    return corpus_gleu([[t] for t in targets], [o for o in outputs])
Example #7
File: utils.py  Project: GZJAS/Squirrel
def computeGLEU(outputs, targets, corpus=False, tokenizer=None, segmenter=None):
    outputs = [tokenizer(o) for o in outputs]
    targets = [tokenizer(t) for t in targets]

    if segmenter is not None:
        outputs = segmenter(outputs)
        targets = segmenter(targets)

    if not corpus:
        return [sentence_gleu([t],  o) for o, t in zip(outputs, targets)]
    return corpus_gleu([[t] for t in targets], [o for o in outputs])
Example #8
    def _compute(
        self,
        predictions: List[List[List[str]]],
        references: List[List[str]],
        min_len: int = 1,
        max_len: int = 4,
    ) -> Dict[str, float]:
        return {
            "google_bleu":
            gleu_score.corpus_gleu(list_of_references=references,
                                   hypotheses=predictions,
                                   min_len=min_len,
                                   max_len=max_len)
        }
Example #9
    def corpus_score(self, list_of_references, hypotheses, score_type="BLEU"):
        """ Score specifically implemented for corpus

        :param list_of_references: list of reference texts
        :param hypotheses: hypotheses relative to reference
        :param score_type: metric being used
        :return: corpus score
        """

        corpus_score = None
        if utils.BLEU_NAME in score_type:
            corpus_score = bleu.corpus_bleu(list_of_references, hypotheses)
        elif utils.GOOGLE_BLEU_NAME in score_type:
            corpus_score = gleu.corpus_gleu(list_of_references, hypotheses)

        print("%s corpus score: %.4f" % (score_type, corpus_score))
        return corpus_score
Example #10
def txt_to_error_detection_tsv(source_txt, target_txt, main_label='i'):
    # Convert txt to tsv
    # returns DataTSV data structure
    overall_correct = 0
    overall_incorrect = 0
    average_incorrect = 0
    average_correct = 0
    tracker = 1

    wrong_sents = read_txt(source_txt)
    correct_sents = read_txt(target_txt)
    assert len(wrong_sents) == len(
        correct_sents), "Files " + source_txt + " and " + target_txt + " do not have the same number of sentences! Aborting..."
    data_list = []
    for wrong, correct in zip(wrong_sents, correct_sents):
        label_string, incorrect_counter, correct_counter = get_alignment(correct, wrong)
        overall_correct += correct_counter
        overall_incorrect += incorrect_counter
        data_list += [[wrong, label_string]]
        average_incorrect = average_incorrect * (tracker - 1) / tracker + incorrect_counter / tracker
        average_correct = average_correct * (tracker - 1) / tracker + correct_counter / tracker
        tracker += 1

    # Calculating GLEU score
    wrong_lister = []
    right_lister = []
    for sent in wrong_sents:
        split_sent = sent.split()
        wrong_lister.append(split_sent)

    for sent in correct_sents:
        split_sent = sent.split()
        right_lister.append(split_sent)

    # word_error_rate = wer(correct_sents, wrong_sents)
    # corpus_gleu = gleu_score.corpus_gleu(wrong_lister, right_lister)
    smoother = bleu_score.SmoothingFunction()
    # corpus_gleu expects a list of reference lists, so wrap each correct sentence.
    corpus_gleu = gleu_score.corpus_gleu([[r] for r in right_lister], wrong_lister)
    #corpus_bleu = bleu_score.corpus_bleu(right_lister, wrong_lister, smoothing_function=smoother.method1)
    corpus_bleu = 0
    tsv_data = DataTSV(main_label)
    tsv_data.data_list = data_list
    tsv_data.propagate_list()
    return tsv_data, corpus_gleu, corpus_bleu, overall_correct, overall_incorrect, average_incorrect, average_correct
Example #11
File: SMT.py  Project: huykyk/-
    def test(self, testData):
        """
		Args:
			testData (list): [[FS, SS], ...], type(FS)=type(SS)=unicode
		"""
        TSSList = self.generateSSList([FS for FS, SS in testData])
        filepath = SMT_RESULT_PATH + '/poemSMT_lmn%d_sm%.3f_lmw%.3f_be%d.txt' % (
            self.LM_GRAM_NUM, self.SMOOTHING_LAMBDA, self.LM_WEIGHT,
            self.BEAM_SIZE)
        saveResult(testData, TSSList, filepath)
        refList = [[SS] for FS, SS in testData]
        bleu = corpus_bleu(refList, TSSList, weights=SCORE_WEIGHT)
        gleu = corpus_gleu(refList,
                           TSSList,
                           min_len=1,
                           max_len=len(SCORE_WEIGHT))
        infoStr = 'BLEU=%.4f, GLEU=%.4f |poemSMT_lmn%d_sm%.3f_lmw%.3f_be%d' % (
            bleu, gleu, self.LM_GRAM_NUM, self.SMOOTHING_LAMBDA,
            self.LM_WEIGHT, self.BEAM_SIZE)
        return bleu, gleu, infoStr
Example #12
def get_bleu_score():
    """
    :return: Bleu (optionally Gleu) score for the translation.
    """

    folds = os.listdir(tr)
    # folds = [dir for dir in os.listdir(tr) if os.path.isdir(tr + dir)]
    print(folds)
    fbleu = 0
    fgleu = 0
    for fold in folds:
        target_file = os.path.join(data, fold + '/train.de')
        trans_file = os.path.join(tr, fold + '/train.en.trans.de')
        target = codecs.open(target_file, 'r', encoding='utf-8')  # reference
        trans = codecs.open(trans_file, 'r', encoding='utf-8')  # correction
        references = []
        hypotheses = []
        for pair in zip(target, trans):
            ref = [pair[0].split()]
            hp = pair[1].split()
            if '10_gram' in tr:
                h1 = ''.join(hp)
                hp = h1.split('_')
            # print(hp)
            references.append(ref)
            hypotheses.append(hp)
            # print(hypotheses)
            # print(references)
            # sys.exit()
        bleu_score = nltk.translate.bleu_score.corpus_bleu(
            references, hypotheses)
        gleu_score = gleu.corpus_gleu(references, hypotheses)
        fbleu += bleu_score
        fgleu += gleu_score
        print(f'Bleu score for {fold}: {bleu_score}')
        # print(f'Gleu score for {fold}: {gleu_score}')
    av_bleu = fbleu / len(folds)
    av_gleu = fgleu / len(folds)
    print(f'Average bleu score: {av_bleu}')
Example #13
def toy(batch_size, max_len, vocab_size, seed, score_type, gs_type, output_dir,
        iterations, use_reg):
    assert gs_type in ['softmax', 'gs_hard']

    min_len = max(5, max_len // 4)
    eos_id = 2
    pad_id = 0

    random.seed(seed)
    np.random.seed(seed)
    tf.set_random_seed(seed)

    reference_lengths = np.random.randint(low=min_len,
                                          high=max_len - 1,
                                          size=batch_size)

    reference_tokens = generate_reference(max_len, reference_lengths,
                                          vocab_size, eos_id, pad_id)

    ref_tokens_var = tf.Variable(reference_tokens, trainable=False)
    ref_onehot = onehot(reference_tokens, vocab_size)
    ref = tf.Variable(ref_onehot, dtype=tf.float32, trainable=False)
    hyp_shape = (batch_size, max_len, vocab_size)
    hyp_logits = tf.Variable(np.random.rand(*hyp_shape), dtype=tf.float32)
    preds = tf.to_int32(tf.arg_max(hyp_logits, dimension=-1))
    #preds = p(preds, 'preds')
    global_step_var = tf.Variable(0,
                                  name='global_step',
                                  dtype=tf.int32,
                                  trainable=False)

    weights = tf.sequence_mask(reference_lengths,
                               maxlen=max_len,
                               dtype=tf.float32)
    # w_noise = tf.distributions.Bernoulli(probs=0.01, dtype=tf.float32).sample(tf.shape(weights))
    # weights = tf.multiply(weights, w_noise)
    # mle_loss = sequence_loss(targets=ref_tokens_var,
    #                          logits=hyp_logits,
    #                          weights=weights,
    #                          average_across_batch=True)

    if gs_type == 'softmax':
        scorer_input = tf.nn.softmax(hyp_logits)
    else:
        scorer_input = gumbel_softmax(hyp_logits, 0.5, hard=True)

    scorer_class = BleuScorer if score_type == 'bleu' else GleuScorer

    scorer = scorer_class(seq_length=max_len,
                          vocab_size=vocab_size,
                          eos_idx=eos_id,
                          reference=ref,
                          hypothesis=scorer_input,
                          ngram_lengths=[1, 2, 3, 4],
                          input_type=ONEHOT_SOFT)

    score = scorer.batch_score
    length_diff = tf.abs(scorer.ref_lengths - scorer.hyp_lengths)
    ref_hyp_length_diff = tf.reduce_mean(length_diff)
    target_prob = .95
    mean_max_prob = tf.reduce_mean(
        tf.reduce_max(tf.clip_by_value(scorer_input, -.1, target_prob),
                      axis=-1))
    #mean_max_prob =  p(mean_max_prob, 'maxmean')
    #reg_on_softmax=  -mean_max_prob
    reg_on_softmax = tf.reduce_mean(-tf.square(scorer_input) + scorer_input)
    #score_loss = -tf.log(1e-7 + score) + length_penalty_loss + mle_loss
    #score_loss = -tf.log(1e-7 + score) + length_penalty_loss
    #score_loss = -tf.log(1e-7 + score) + mle_loss
    scale = tf.clip_by_value(tf.to_float(global_step_var) / 1., 0., 50000.)
    #score_loss = -tf.log(1e-7 + score) + scale * reg_on_softmax + length_penalty_loss
    score_loss = -tf.log(1e-7 + score)
    if use_reg:
        score_loss = score_loss + 10000. * reg_on_softmax
    #score_loss = -score
    #score_loss = mle_loss

    optimizer = tf.train.AdamOptimizer(learning_rate=.01,
                                       beta1=0.9,
                                       beta2=0.98,
                                       epsilon=1e-8)

    grads_and_vars = optimizer.compute_gradients(score_loss)
    gradients, variables = list(zip(*grads_and_vars))
    gradient_norm = tf.global_norm(gradients)
    #gradients, _ = tf.clip_by_global_norm(gradients, 5.0)
    train_op = optimizer.apply_gradients(zip(gradients, variables),
                                         global_step=global_step_var)

    init_op = tf.group(tf.global_variables_initializer(),
                       tf.local_variables_initializer())

    tf.summary.scalar('score', scorer.batch_score)
    tf.summary.scalar('score_loss', score_loss)
    sums_op = tf.summary.merge_all()

    with tf.train.MonitoredTrainingSession(checkpoint_dir=output_dir,
                                           save_summaries_steps=5,
                                           save_checkpoint_secs=1200) as sess:

        sum_writer = SummaryWriterCache.get(output_dir)

        sess.run(init_op)
        targets = [
            train_op, global_step_var, sums_op, score_loss, score,
            gradient_norm, preds, ref_hyp_length_diff, mean_max_prob
        ]

        best_score = -np.infty
        nltk_score = 0

        for step in tqdm(range(iterations),
                         ncols=70,
                         leave=False,
                         unit='batch'):
            _, global_step, graph_sums, loss_value, score_value, norm, pred_values, diff, mmp = sess.run(
                targets)

            # Compute batch BLEU and GLEU and save summaries of them
            cropped_y = [[_crop(reference_tokens[k, :], eos_id)]
                         for k in range(batch_size)]
            cropped_preds = [
                _crop(pred_values[k, :], eos_id) for k in range(batch_size)
            ]
            nltk_bleu = corpus_bleu(cropped_y, cropped_preds)
            nltk_gleu = corpus_gleu(cropped_y, cropped_preds)
            nltk_score = nltk_bleu if score_type == 'bleu' else nltk_gleu

            if nltk_score > best_score:
                best_score = nltk_score

            if step % 10 == 0:
                msg = "Loss: {:.5e}, score: {:.5e}, nltk.score: {:.5e}, norm: {:.5e}, diff: {:01.2f}, maxprob: {:.2f}"
                #print(msg.format(loss_value, score_value, nltk_score, norm, diff, mmp))

            sums = {
                'nltk.bleu': nltk_bleu,
                'nltk.gleu': nltk_gleu,
            }

            for label, measure in sums.items():
                summary = tf.Summary(
                    value=[tf.Summary.Value(tag=label, simple_value=measure)])
                sum_writer.add_summary(summary, global_step=global_step)

        best_score_file = os.path.join(output_dir, 'best_score.txt')
        with open(best_score_file, 'w') as f:
            print("best score: {}".format(best_score), file=f)
            print("last score: {}".format(nltk_score), file=f)
Example #14
    print("Restoring model from {}...".format(ckpt_path))
    optimistic_restore(sess, ckpt_path)
    print("done.")
    try:

        def cond(idx):
            return idx < args.training if args.training else True

        k = 0
        i = 0
        while cond(k):
            pred_values, y_values = sess.run([preds, y])
            bleu_preds.extend([_crop(p, EOS) for p in pred_values.tolist()])
            bleu_references.extend([[_crop(r, EOS)]
                                    for r in y_values.tolist()])
            if args.decode:
                for t, p in zip(y_values.tolist(), pred_values.tolist()):
                    i += 1
                    print("T[{}] : {}".format(i, decode(t)))
                    print("P[{}] : {}".format(i, decode(p)))
            else:
                print("|", end='', flush=True)
            k += 1
    except tf.errors.OutOfRangeError:
        pass

bleu_score = corpus_bleu(bleu_references, bleu_preds)
gleu_score = corpus_gleu(bleu_references, bleu_preds)
print("BLEU: {}".format(bleu_score))
print("GLEU: {}".format(gleu_score))
Example #15
def main():
    from nltk.translate.gleu_score import corpus_gleu, sentence_gleu

    eos = 6
    reference_batch = [[1, 1, 2, 1,
                        eos]]  #, [5, 1, eos, 0, 0], [2, 5, 3, eos, 1]]
    candidate_batch = [[1, 3, 1, eos,
                        0]]  #, [5, 2, eos, 0, 0], [2, 2, 3, eos, 0]]
    row = 0

    seq_length = len(candidate_batch[row])

    true_batch_gleu = corpus_gleu([[_crop(r, eos)] for r in reference_batch],
                                  [_crop(c, eos) for c in candidate_batch])

    gleu_score, n_match, tpfp, tpfn = custom_sentence_gleu(
        [_crop(reference_batch[row], eos)], _crop(candidate_batch[row], eos))

    true_gleu_scores = [
        sentence_gleu([_crop(reference_batch[k], eos)],
                      _crop(candidate_batch[k], eos))
        for k in range(len(candidate_batch))
    ]
    print("true gleu: {}, n_match: {}, tpfp: {}, tpfn: {}".format(
        gleu_score, n_match, tpfp, tpfn))

    gleu_scorer = GleuScorer(seq_length=seq_length,
                             vocab_size=eos + 1,
                             eos_idx=eos,
                             input_type=ONEHOT_SOFT)

    #feed_hyp = np_label_smoothing(np_onehot(np.array(candidate_batch)), epsilon=1e-5)
    #feed_refs = np_label_smoothing(np_onehot(np.array(reference_batch)), epsilon=1e-5)

    feed_hyp = np_onehot(np.array(candidate_batch))
    feed_refs = np_onehot(np.array(reference_batch))

    #print("---> {}".format(feed_refs))
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        feed_dict = {
            gleu_scorer.hypothesis: feed_hyp,
            gleu_scorer.reference: feed_refs
        }

        targets = [
            gleu_scorer.batch_gleu_score, gleu_scorer.sentence_n_match,
            gleu_scorer.tpfn, gleu_scorer.tpfp,
            gleu_scorer.sentence_gleu_score, gleu_scorer.individual_ngrams[0]
        ]
        (batch_gleu, n_match, tpfn, tpfp, gleu,
         ngram) = sess.run(targets, feed_dict=feed_dict)

    print("our gleu: {}, n_match: {}, tpfp: {}, tpfn: {}".format(
        gleu[row], n_match[row], tpfp[row], tpfn[row]))

    print("\n\nBatch gleu's. official: {}. ours: {}".format(
        true_batch_gleu, batch_gleu))

    print("\n\nall gleus....")
    print("true ones: {}".format(true_gleu_scores))
    print("ours: {}".format(gleu))

    print("ngram: {}".format(ngram))
Example #16
def computeCorpGleu(target, reference):
    return corpus_gleu(reference, target)
Example #17
def evaluation_metrics(dataset, steps, size):
    references = []
    hypotheses = []

    rouge = Rouge()
    rouge_dict = {
        "rouge-1": {
            "f": 0.0,
            "p": 0.0,
            "r": 0.0
        },
        "rouge-2": {
            "f": 0.0,
            "p": 0.0,
            "r": 0.0
        },
        "rouge-l": {
            "f": 0.0,
            "p": 0.0,
            "r": 0.0
        }
    }

    # make references & hypotheses lists
    for inputs, targets in dataset.take(steps):
        for labels in target_tokenizer.sequences_to_texts(
                test_step(inputs, targets)):
            if len(labels) > 0:
                hypotheses.append(labels.split())
            else:
                hypotheses.append([""])

        for labels in input_tokenizer.sequences_to_texts(inputs.numpy()):
            references.append(word_split(labels))

    for index, hypothesis in enumerate(hypotheses):
        max_score = {
            "rouge-1": {
                "f": 0.0,
                "p": 0.0,
                "r": 0.0
            },
            "rouge-2": {
                "f": 0.0,
                "p": 0.0,
                "r": 0.0
            },
            "rouge-l": {
                "f": 0.0,
                "p": 0.0,
                "r": 0.0
            }
        }

        # one hypothesis may have several references
        for reference in references[index]:
            try:
                rouge_score = rouge.get_scores(" ".join(hypothesis),
                                               " ".join(reference))[0]
                # keep the best score
                if rouge_sum_score(rouge_score) > rouge_sum_score(max_score):
                    max_score = rouge_score
            except ValueError:
                pass

        for method_key in rouge_dict:
            # fpr for traversing f1 precision recall
            for fpr in rouge_dict[method_key]:
                rouge_dict[method_key][fpr] += max_score[method_key][fpr]

    # average
    for method_key in rouge_dict:
        for fpr in rouge_dict[method_key]:
            rouge_dict[method_key][fpr] /= size

    bleu = bleu_score.corpus_bleu(references, hypotheses, weights=(1, ))
    gleu = gleu_score.corpus_gleu(references, hypotheses, max_len=1)
    nist = nist_score.corpus_nist(references, hypotheses, n=1)

    print("BLEU-1 Score: %.4f" % bleu)
    print("GLEU-1 Score: %.4f" % gleu)
    print("NIST-1 Score: %.4f" % nist)
    print("ROUGE Scores: %s" % rouge_dict_format(rouge_dict))

    return bleu, gleu, nist, rouge_dict
Example #18
def score_compute(comp_res):

    res_wer = []
    bleu_indi1 = []
    bleu_indi2 = []
    bleu_indi3 = []
    bleu_indi4 = []
    bleu_cum2 = []
    bleu_cum3 = []
    bleu_cum4 = []
    gleu_sent = []
    meteor_score = []
    rouge_score = []

    translated = []
    reference = []
    for i in range(len(comp_res)):
        reference.append([comp_res[i][0].split(' ')])
        translated.append(comp_res[i][1].split(' '))
    bleu_corpus = corpus_bleu(reference, translated)
    #sacrebleu_corpus = sacrebleu.corpus_bleu( translated, reference)
    gleu_corpus = corpus_gleu(reference, translated)

    # evaluator obj for rouge-l metric
    evaluator = Rouge(
        metrics=['rouge-l'],
        limit_length=True,
        length_limit=100,
        length_limit_type='words',
        apply_avg=True,
        apply_best=False,
        alpha=0.5,  # Default F1_score
        weight_factor=1.2,
        stemming=True)

    #for result_pair in compare_results:
    for result_pair in comp_res:
        # ------------ WER
        #res_back = wer( result_pair[0].split(' '), result_pair[1].split(' '))
        res_back = wer(result_pair[0].split(' '), result_pair[1].split(' '))

        res_wer.append(res_back)

        # ----------- BLEU
        indi1_gr = sentence_bleu([result_pair[0].split(' ')],
                                 result_pair[1].split(' '),
                                 weights=(1, 0, 0, 0))  # individual 1-gram
        indi2_gr = sentence_bleu([result_pair[0].split(' ')],
                                 result_pair[1].split(' '),
                                 weights=(0, 1, 0, 0))  # individual 2-gram
        indi3_gr = sentence_bleu([result_pair[0].split(' ')],
                                 result_pair[1].split(' '),
                                 weights=(0, 0, 1, 0))  # individual 3-gram
        indi4_gr = sentence_bleu([result_pair[0].split(' ')],
                                 result_pair[1].split(' '),
                                 weights=(0, 0, 0, 1))  # individual 4-gram

        # cumulative 2-gram, 3-gram, 4-gram bleu
        cum2_gr = sentence_bleu([result_pair[0].split(' ')],
                                result_pair[1].split(' '),
                                weights=(0.5, 0.5, 0, 0))
        cum3_gr = sentence_bleu([result_pair[0].split(' ')],
                                result_pair[1].split(' '),
                                weights=(0.33, 0.33, 0.33, 0))
        cum4_gr = sentence_bleu([result_pair[0].split(' ')],
                                result_pair[1].split(' '),
                                weights=(0.25, 0.25, 0.25, 0.25))

        gleu_s = sentence_gleu([result_pair[0].split(' ')],
                               result_pair[1].split(' '))
        meteor = round(single_meteor_score(result_pair[0], result_pair[1]), 4)
        rouge_all = evaluator.get_scores(result_pair[1], result_pair[0])
        rouge_l_f1 = rouge_all['rouge-l']['f']

        bleu_indi1.append(indi1_gr)
        bleu_indi2.append(indi2_gr)
        bleu_indi3.append(indi3_gr)
        bleu_indi4.append(indi4_gr)
        bleu_cum2.append(cum2_gr)
        bleu_cum3.append(cum3_gr)
        bleu_cum4.append(cum4_gr)
        gleu_sent.append(gleu_s)
        meteor_score.append(meteor)
        rouge_score.append(rouge_l_f1)

    wer_mean = np.mean(res_wer)
    wer_var = np.var(res_wer)
    bleu_indi1_mean = np.mean(bleu_indi1)
    bleu_indi2_mean = np.mean(bleu_indi2)
    bleu_indi3_mean = np.mean(bleu_indi3)
    bleu_indi4_mean = np.mean(bleu_indi4)
    bleu_cum2_mean = np.mean(bleu_cum2)
    bleu_cum3_mean = np.mean(bleu_cum3)
    bleu_cum4_mean = np.mean(bleu_cum4)
    gleu_s_mean = np.mean(gleu_sent)
    meteor_s_mean = np.mean(meteor_score)
    rouge_s_mean = np.mean(rouge_score)

    bleus = (bleu_indi1_mean, bleu_indi2_mean, bleu_indi3_mean,
             bleu_indi4_mean, bleu_cum2_mean, bleu_cum3_mean, bleu_cum4_mean,
             bleu_corpus)
    gleus = (gleu_s_mean, gleu_corpus)

    return wer_mean, wer_var, bleus, gleus, meteor_s_mean, rouge_s_mean
Example #19
def compute_gleu(targets, translations):
    references, translations = [
        [target.replace("@@ ", "").split(" ")] for target in targets
    ], [t.replace("@@ ", "").split(" ") for t in translations]
    return gleu_score.corpus_gleu(references, translations)
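A small assumed usage example for the snippet above: the "@@ " marker is the subword-nmt BPE continuation symbol, so removing it restores whole words before whitespace tokenization and corpus-level GLEU scoring. The sample strings below are made up.

from nltk.translate import gleu_score

targets = ['the new@@ est model is here']
translations = ['the newest model is here']
print(compute_gleu(targets, translations))  # 1.0 for an exact match after de-BPE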
Example #20
def get_gleu_score(targets, decodes):
    return corpus_gleu([[t] for t in targets], [o for o in decodes])

#################################################################################################
Example #21
def gleu(prediction, ground_truth):
    return corpus_gleu([[x] for x in ground_truth], prediction) * 100.
Example #22
        all_records_prediction.append(record_translation)
    # print("=" * 40)
    print("Done : ", text_index + 1)

bleu_score_4 = corpus_bleu(list_of_references, hypothesis)
bleu_score_1 = corpus_bleu(list_of_references,
                           hypothesis,
                           weights=(1.0, 0, 0, 0))
bleu_score_2 = corpus_bleu(list_of_references,
                           hypothesis,
                           weights=(0.5, 0.5, 0, 0))
bleu_score_3 = corpus_bleu(list_of_references,
                           hypothesis,
                           weights=(0.3, 0.3, 0.3, 0))

gleu_score = corpus_gleu(list_of_references, hypothesis)

bleu_score_final = "Overall BLEU Score on FFR v1.0 Test Dataset : {}".format(
    round(
        max(bleu_score_1, bleu_score_2, bleu_score_3, bleu_score_4) * 100, 2))

gleu_score_ = "Overall GLEU Score on FFR v1.0 Test Dataset : {}".format(
    round(gleu_score * 100, 2))

testing_scores = list()
testing_scores.append(bleu_score_final)
testing_scores.append(gleu_score_)
# np_all_results = np.array(all_bleu_scores)
# np_all_predictions = np.array(all_records_prediction)
# np.save("all_bleu_results_fr", np_all_results)
# np.save("all_records_prediction", np_all_predictions)
Example #23
        outputs.append(tokens_orig)

        # bleu = bleu_score([outputs[-1]], [targets[-1]])
        # bleu_overall = bleu_score(outputs, targets)
        # BleuScores.append(bleu)
        # if np.int(np.floor((i + 1) /5)+ 1) % 2 == 0:
        #     # bleu = bleu_score(outputs, targets)
        #     print(np.int(np.floor((i + 1) /5)+1))
        #     bleu = bleu_score(outputs, targets)
        #     BleuScores.append(bleu)
        #     outputs = []
        #     targets = []

    if (i - 1) % 5 == 0:

        targ = line.strip().split(".", 1)[0]
        print(line.strip())
        print("+++++", targ)

        tokens_targ = [token.text.lower() for token in spacy_eng(targ)]

        targets.append([tokens_targ])

# BleuScores = np.array(BleuScores)
# print("average = ", np.mean(BleuScores))
print(bleu_score(outputs, targets))
print(corpus_bleu(targets, outputs))
print(corpus_gleu(targets, outputs))
print(BleuScores)
Example #24
import os

os.system('pip install nltk')
from nltk.translate.gleu_score import corpus_gleu

ref_final = []
hyp_final = []
ref_list = input('Enter the list of references : ').split('.')
hyp_list = input('Enter the list of hypotheses : ').split('.')
for r in ref_list[:-1]:
    ref_final.append(r.split(' '))
for h in hyp_list[:-1]:
    hyp_final.append(h.split(' '))
print(corpus_gleu(ref_final, hyp_final))
# bleu = bleu_score([[tokens_orig]],[tokens_orig])
#
# print(bleu)

# f = open("test10k.tsv", "r")
#9:13pm
f = open("/data/chaudhryz/ankit/test10k.tsv", "r")

BleuScores = []

targets = []
outputs = []

for i, line in enumerate(f):
    orig = line.split("\t")[0]
    targ = line.split("\t")[1]
    # print(orig)

    tokens_orig = [token.text.lower() for token in spacy_eng(orig)]
    tokens_targ = [token.text.lower() for token in spacy_eng(targ)]

    outputs.append(tokens_orig)
    targets.append([tokens_targ])
    if i % 100 == 0:
        print(i)
# print(bleu_score(outputs, targets))

print("NLTK BLEU score: ", corpus_bleu(targets, outputs))
print("NLTK GLEU score: ", corpus_gleu(targets, outputs))