Example #1
 def get_metric(self, reset: bool = False):
     metrics = {"BLEU": bleu.list_bleu([self.ref_list], self.hypo_list)}
     # Compute a Score per (reference, hypothesis) pair and rouge type.
     rouge_list: List[List[Score]] = list(
         map(self._unpack_rouge, self.ref_list, self.hypo_list))
     # Transpose so each iteration yields every pair's scores for one rouge
     # type, then average precision/recall/F1 over pairs as percentages.
     for i, current_rouge in enumerate(zip(*rouge_list)):
         for j, metric_name in enumerate(self.metric_mapping.keys()):
             key = (f"{self.rouge.rouge_types[i].upper()} "
                    f"{self.metric_mapping[metric_name]}")
             metrics[key] = (
                 sum(score[j] for score in current_rouge) /
                 len(current_rouge) * 100)
     if reset:
         self.reset()
     return metrics
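For context, a minimal sketch of the class state this method assumes; the names mirror the snippet, but the actual class is not shown in the original:

# Hypothetical setup for get_metric above; assumed, not from the original file.
from typing import List

import bleu
from rouge_score import rouge_scorer
from rouge_score.scoring import Score

class BleuRougeMetric:
    def __init__(self):
        self.ref_list: List[str] = []
        self.hypo_list: List[str] = []
        self.rouge = rouge_scorer.RougeScorer(["rouge1", "rouge2", "rougeL"])
        # maps Score fields (precision, recall, fmeasure) to display names
        self.metric_mapping = {"precision": "P", "recall": "R", "fmeasure": "F1"}

    def _unpack_rouge(self, ref: str, hypo: str) -> List[Score]:
        # one Score per rouge type, ordered like self.rouge.rouge_types
        scores = self.rouge.score(ref, hypo)
        return [scores[t] for t in self.rouge.rouge_types]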
Example #2
 def __call__(self, prediction):
     preds = prediction.predictions
     preds_size = prediction.predictions_size
     label_ids = prediction.label_ids
     label_size = prediction.label_size
     p_start, l_start = 0, 0
     correct, total = 0, 0
     ref = []
     hyp = []
     fout = open(self.output_path, "w") if is_rank_0() else None
     for idx, (p_size, l_size) in enumerate(zip(preds_size, label_size)):
         # Predictions and labels are stored in flat arrays; slice out the
         # span belonging to the current example.
         p_end = p_start + p_size
         l_end = l_start + l_size
         pred = self.get_sequence(preds[p_start:p_end])
         label = self.get_sequence(label_ids[l_start:l_end])
         p_start = p_end
         l_start = l_end
         if pred == label:
             correct += 1
         total += 1
         # Decode on every rank so the BLEU lists are populated everywhere
         # (the original decoded only on rank 0, leaving ref/hyp empty for
         # the list_bleu call on other ranks); only rank 0 writes the dump.
         pred_text = self.tokenizer.decode(
             pred,
             skip_special_tokens=True,
             clean_up_tokenization_spaces=True).strip()
         label_text = self.tokenizer.decode(
             label,
             skip_special_tokens=True,
             clean_up_tokenization_spaces=True).strip()
         ref.append(label_text)
         hyp.append(pred_text)
         if fout is not None:
             fout.write(
                 json.dumps({
                     "idx": idx,
                     "pred": pred_text,
                     "label": label_text
                 }) + "\n")
     if fout is not None:
         fout.close()
     score = list_bleu([ref], hyp)
     return {
         "bleu": score,
         "accuracy": correct / total,
         "correct": correct,
         "total": total
     }
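The prediction argument only needs the four fields read above; a hypothetical container (the real type is not shown in the original):

# Hypothetical container; field names mirror what __call__ reads.
from collections import namedtuple

EvalPrediction = namedtuple(
    "EvalPrediction",
    ["predictions", "predictions_size", "label_ids", "label_size"])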
Example #3
                                     K.TOKENIZER_TRG_PATH, True, encoder,
                                     decoder)
            target_hyp_sequences.append(hyp)

        # pad predicted target sequences
        target_hyp_sequences = pad_sequences(target_hyp_sequences,
                                             padding="post",
                                             maxlen=K.SEQUENCE_LENGTH)
        target_hyp_sentences = [
            target_tokenizer.decode_ids(s.tolist())
            for s in target_hyp_sequences
        ]

        print("Computing Scores.")
        # compute BLEU score
        bleu_score = list_bleu([target_ref_sentences], target_hyp_sentences)

        # compute average ROUGE score
        rouge = Rouge()
        rouge_scores = rouge.get_scores(target_hyp_sentences,
                                        target_ref_sentences,
                                        avg=True)

        # compute average accuracy score
        # important: padding tokens present in both sequences would inflate the
        # accuracy score. To avoid this, for each reference/hypothesis pair we
        # find the index of the first occurrence of a padding token id in each
        # sequence and take the higher of the two, then truncate both
        # sequences at that index so padding tokens minimally affect the score.

        # truncate both sequences
        # will store sequences with removed padding tokens
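The implementing code is cut off here; a minimal sketch of the truncation just described, where pad_id and target_ref_sequences are assumed names, not taken from the original file:

        pad_id = target_tokenizer.pad_id()  # assumed padding-id lookup
        truncated_refs, truncated_hyps = [], []
        for ref_seq, hyp_seq in zip(target_ref_sequences, target_hyp_sequences):
            ref_seq, hyp_seq = list(ref_seq), list(hyp_seq)
            # first padding position in each sequence (full length if absent)
            ref_cut = ref_seq.index(pad_id) if pad_id in ref_seq else len(ref_seq)
            hyp_cut = hyp_seq.index(pad_id) if pad_id in hyp_seq else len(hyp_seq)
            cut = max(ref_cut, hyp_cut)  # keep the longer unpadded span
            truncated_refs.append(ref_seq[:cut])
            truncated_hyps.append(hyp_seq[:cut])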
Example #4
files = [open(i, "r") for i in filenames]

# "pred_file" is written below but never opened in this fragment; an assumed
# output path, not from the original:
pred_file = open("predictions.txt", "w")

cnt = 0

for rows in zip(*files):
	# Re-rank the candidates from model (defined elsewhere in the original
	# file) for each source line by sentence BLEU against the gold line.
	preds = model.predict([rows[0]])
	hyp = rows[1][:-1]  # strip the trailing newline from the gold line
	result = ''
	max_score = -1

	for ele in preds[0]:
		bleu_score = list_bleu([ele], [hyp])
		if bleu_score > max_score:
			max_score = bleu_score
			result = ele

	cnt += 1
	pred_file.write(result + "\n")
	print(cnt, " : ", max_score, " : ", result)


Example #5
from bleu import list_bleu

ref = ['it is a white cat .', 'wow , this dog is huge .']
ref1 = ['This cat is white .', 'wow , this is a huge dog .']
hyp = ['it is a white kitten .', 'wowww , the dog is huge !']
hyp1 = ["it 's a white kitten .", 'wow , this dog is huge !']
assert 34.99 == list_bleu([ref], hyp)
assert 34.99 == list_bleu(ref, hyp)

assert 57.91 == list_bleu([ref, ref1], hyp1)

from bleu import multi_list_bleu

assert [34.99, 53.28] == multi_list_bleu(ref, [hyp, hyp1])
assert [34.99, 57.91] == multi_list_bleu([ref, ref1], [hyp, hyp1])

# to also get the files holding the detokenized versions of your input lists
bleus, ref_files, hyp_files = multi_list_bleu([ref, ref1], [hyp, hyp1],
                                              return_files=True)
print(ref_files)  # ['TMP_DIR/ref0.txt', 'TMP_DIR/ref1.txt']
print(hyp_files)  # ['TMP_DIR/hyp0.txt', 'TMP_DIR/hyp1.txt']

# detok=False skips detokenization of the inputs before scoring
assert 39.76 == list_bleu([ref], hyp, detok=False)

# or if you want to test multiple hypotheses
assert [39.76, 47.47] == multi_list_bleu([ref, ref1], [hyp, hyp1], detok=False)

from bleu import file_bleu

hyp_file, hyp_file1 = hyp_files
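# (snippet truncated here; presumably file_bleu then scores the saved files
# directly, e.g. file_bleu([ref_files[0]], hyp_file); call shape assumed from
# the list_bleu API above)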
Example #6

from bleu import list_bleu
ref = ['it is a white cat .',
       'wow , this dog is huge .']
ref1 = ['This cat is white .',
        'wow , this is a huge dog .']
hyp = ['it is a white kitten .',
       'wowww , the dog is huge !']
hyp1 = ["it 's a white kitten .",
        'wow , this dog is huge !']
list_bleu([ref], hyp)         # 34.99, per the asserts in Example #5
list_bleu([ref, ref1], hyp1)  # 57.91
Example #7
TOP_K = 5
EMBEDDING_DIM = 64  # assumed value; used below but not defined in this fragment
HIDDEN_DIM = 32
BATCH_SIZE = 16
CHUNK_SIZE = 32
TRAIN_PATH = "data/french.txt"

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
dataset = CorpusDataset(TRAIN_PATH, CHUNK_SIZE, BATCH_SIZE)
model = Model(EMBEDDING_DIM, HIDDEN_DIM, len(dataset.vocabulary), device)

model.load_state_dict(torch.load('models/french.pt'))
model.eval()
input_sequence = "Le"

# running BLEU evaluation: note that the sampled continuation from the model
# is used as the reference here, and the fixed phrase below as the hypothesis
ref = [
    top_k(model,
          input_sequence.split(),
          25,
          dataset.word_to_integer,
          dataset.integer_to_word,
          TOP_K,
          sample=True)
]
hyp = [
    'Le comportement de la',
]

output = list_bleu([ref], hyp)
with open('results/turing.txt', 'w+') as f:
    f.write("BLEU Metric: " + str(output))
Example #8

'''
F1       : 0.18038845327110314
Precision: 0.26704042769616537
Recall   : 0.14676986008799853
'''

"""### BLEU SCORE"""

!pip install bleu

from bleu import list_bleu

KPs = [r[1] for r in RougeScores]
Args = [r[2] for r in RougeScores]

BlueScore = list_bleu([Args], KPs)

BlueScore  # notebook cell output: 0.98, per the note below

"""Blue Score Result:
0.98 (below 1.0%!!!)

###BERTScore
"""

!pip install bert-score

from bert_score import score
def calc_bert_score(cands, refs):
    # bert-score returns torch tensors; convert them to plain Python lists
    Precision, Recall, F1 = score(cands, refs, lang="en", verbose=True)
    return (Precision.numpy().tolist(), Recall.numpy().tolist(),
            F1.numpy().tolist())
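
A hypothetical invocation, reusing the KPs/Args lists from the BLEU cell above:

# Hypothetical usage of calc_bert_score; KPs and Args come from the cell above.
P, R, F1 = calc_bert_score(KPs, Args)
print(f"BERTScore F1 (mean): {sum(F1) / len(F1):.4f}")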