import numpy as np
import nltk.translate.gleu_score as gleu


def G_bleu_score(tru, summ, rev):
    actual = []
    predicted = []
    review = []
    for i in range(len(tru)):
        actual.append(tru[i].split(' '))
        predicted.append(summ[i].split(' '))
        review.append(rev[i].split(' '))
    gleu_actual = []
    gleu_predicted = []
    gleu_pred_to_actual = []
    for i in range(len(actual)):
        # sentence_gleu expects a list of tokenized references and a tokenized
        # hypothesis, so wrap each reference in a list and keep token lists
        gleu_actual.append(gleu.sentence_gleu([actual[i]], review[i]))
        gleu_predicted.append(gleu.sentence_gleu([predicted[i]], review[i]))
        gleu_pred_to_actual.append(gleu.sentence_gleu([predicted[i]], actual[i]))
    ar = np.mean(gleu_actual)
    pr = np.mean(gleu_predicted)
    ap = np.mean(gleu_pred_to_actual)
    print('actual GLEU score: ', ar)
    print('predicted GLEU score: ', pr)
    print('predicted to actual GLEU score: ', ap)
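# Minimal usage sketch for G_bleu_score above. The toy strings are
# hypothetical and only illustrate the expected input shape: three
# parallel lists of plain-text sentences.
truths = ['the food was great']
summaries = ['food was good']
reviews = ['the food at this place was really great']
G_bleu_score(truths, summaries, reviews)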
import nltk.translate.gleu_score as gleu


def test():
    hyp1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'which', 'ensures',
            'that', 'the', 'military', 'always', 'obeys', 'the', 'commands',
            'of', 'the', 'party']
    ref1a = ['It', 'is', 'a', 'guide', 'to', 'action', 'that', 'ensures',
             'that', 'the', 'military', 'will', 'forever', 'heed', 'Party',
             'commands']
    ref1b = ['It', 'is', 'the', 'guiding', 'principle', 'which', 'guarantees',
             'the', 'military', 'forces', 'always', 'being', 'under', 'the',
             'command', 'of', 'the', 'Party']
    ref1c = ['It', 'is', 'the', 'practical', 'guide', 'for', 'the', 'army',
             'always', 'to', 'heed', 'the', 'directions', 'of', 'the', 'party']
    hyp2 = 'he read the book because he was interested in world history'.split()
    ref2a = 'he was interested in world history because he read the book'.split()

    list_of_references = [[ref1a, ref1b, ref1c], [ref2a]]
    hypotheses = [hyp1, hyp2]
    corpus_score = gleu.corpus_gleu(list_of_references, hypotheses)
    print("Corpus score: " + str(corpus_score))

    # The example below shows that corpus_gleu() differs from averaging
    # sentence_gleu() over the hypotheses.
    score1 = gleu.sentence_gleu([ref1a], hyp1)
    score2 = gleu.sentence_gleu([ref2a], hyp2)
    average_score = (score1 + score2) / 2
    print("Sentence score average: " + str(average_score))
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from nltk.translate.gleu_score import sentence_gleu
from nltk.translate.ribes_score import sentence_ribes

cc = SmoothingFunction()


def calc_seq_mt_features(ql, qr, signature=""):
    # method3 is NIST geometric-sequence smoothing
    bleu_score_l = sentence_bleu([ql], qr, smoothing_function=cc.method3)
    bleu_score_r = sentence_bleu([qr], ql, smoothing_function=cc.method3)
    # sentence_gleu expects a list of references, so wrap the single reference
    gleu_score_l = sentence_gleu([ql], qr)
    gleu_score_r = sentence_gleu([qr], ql)
    try:
        ribes_score_l = sentence_ribes([ql], qr)
    except ZeroDivisionError:
        ribes_score_l = 0
    try:
        ribes_score_r = sentence_ribes([qr], ql)
    except ZeroDivisionError:
        ribes_score_r = 0
    feature_dict = {}
    if signature:
        signature = signature + "_"
    feature_names = [
        "bleu_score_l", "bleu_score_r", "gleu_score_l", "gleu_score_r",
        "ribes_score_l", "ribes_score_r"
    ]
    for feature_name in feature_names:
        feature_dict[signature + feature_name] = locals()[feature_name]
    return feature_dict
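# Hypothetical usage sketch for calc_seq_mt_features: ql and qr are token
# lists, and the optional signature prefixes every feature key.
left = ['how', 'do', 'i', 'learn', 'python']
right = ['how', 'can', 'i', 'learn', 'python']
features = calc_seq_mt_features(left, right, signature="q1q2")
print(features)  # e.g. {'q1q2_bleu_score_l': ..., 'q1q2_gleu_score_l': ..., ...}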
def sentence_average_score(self, list_of_references, hypotheses,
                           score_type="BLEU"):
    """
    Averages the score over every sentence.

    :param list_of_references: list of reference texts (separated into words)
    :param hypotheses: hypotheses relative to the references (separated into words)
    :param score_type: metric being used
    :return: average sentence score
    """
    sent_average_score = 0
    if utils.BLEU_NAME in score_type:
        for ref, hyp in zip(list_of_references, hypotheses):
            # n-gram order: default is between 1 and 4
            sent_average_score += bleu.sentence_bleu(ref, hyp)
    elif utils.GOOGLE_BLEU_NAME in score_type:
        for ref, hyp in zip(list_of_references, hypotheses):
            # n-gram order: default is between 1 and 4
            sent_average_score += gleu.sentence_gleu(ref, hyp)
    elif utils.WER_NAME in score_type:
        for ref, hyp in zip(list_of_references, hypotheses):
            # assumes only one reference
            sent_average_score += self.wer_score(ref[0], hyp)
    elif utils.TER_NAME in score_type:
        for ref, hyp in zip(list_of_references, hypotheses):
            sent_average_score += self.ter_score(ref[0], hyp)
    sent_average_score /= len(list_of_references)
    print("%s sentence average score: %.4f" % (score_type, sent_average_score))
    return sent_average_score
from nltk.translate.bleu_score import sentence_bleu, corpus_bleu, SmoothingFunction
from nltk.translate.gleu_score import sentence_gleu, corpus_gleu
from tqdm import tqdm


def calculate_bleu(data, src_field, model, device, decodeType, max_len=30):
    cc = SmoothingFunction()
    sentBleu = 0.0
    sentGleu = 0.0
    trgs = []
    pred_trgs = []
    # bs = Beam_Search(model)
    for datum in tqdm(data):
        trg = vars(datum)['correction1']
        src = vars(datum)['orig']
        if decodeType == "greedy":
            pred_trg = translate_sentence(src, src_field, model, device, max_len)
        else:
            # pred_trg = bs(src, src_field, device)
            pred_trg = beam_search(src, src_field, model, device, max_len)
        # cut off the <eos> token
        # pred_trg = pred_trg[1:-1]
        # if len(pred_trg) < 2: pred_trg.append(".")
        sentBleu += sentence_bleu([trg], pred_trg, smoothing_function=cc.method3)
        sentGleu += sentence_gleu([trg], pred_trg)
        pred_trgs.append(pred_trg)
        trgs.append([trg])
    sentBleu = sentBleu / len(data)
    sentGleu = sentGleu / len(data)
    corpusBleu = corpus_bleu(trgs, pred_trgs, smoothing_function=cc.method3)
    corpusGleu = corpus_gleu(trgs, pred_trgs)
    return sentBleu, sentGleu, corpusBleu, corpusGleu
import math

import tensorflow as tf
import nltk.translate.gleu_score as gleu
from nltk.translate.bleu_score import sentence_bleu


def main(_):
    model = ShowAndTellModel(FLAGS.model_path)
    vocab = Vocabulary(FLAGS.vocab_file)
    filenames = _load_filenames()
    can1 = "a table with different kinds of food"
    candidate = can1.split()
    generator = CaptionGenerator(model, vocab)
    for filename in filenames:
        with tf.gfile.GFile(filename, "rb") as f:
            image = f.read()
        captions = generator.beam_search(image)
        print("Captions: ")
        for i, caption in enumerate(captions):
            sentence = [vocab.id_to_token(w) for w in caption.sentence[1:-1]]
            sentence = " ".join(sentence)
            temp = "  %d) %s (p=%f)" % (i + 1, sentence, math.exp(caption.logprob))
            print(temp)
            comp = [sentence.split()]
            # Calculating the BLEU score
            print('BLEU cumulative 1-gram: %f' %
                  sentence_bleu(comp, candidate, weights=(1, 0, 0, 0)))
            print('BLEU cumulative 2-gram: %f' %
                  sentence_bleu(comp, candidate, weights=(0.5, 0.5, 0, 0)))
            # GLEU score
            G = gleu.sentence_gleu(comp, candidate, min_len=1, max_len=2)
            print("GLEU score for this sentence: {}".format(G))
from typing import List

import nltk.translate.gleu_score as gleu


def get_score(actual_list: List[str], desired_list: List[List[str]],
              n_gram: int):
    """
    :param desired_list: a list of tokenized reference sentences,
        e.g. [['cats', 'are', 'cute'], ['dogs', 'are', 'cute']]
    :param actual_list: the tokenized sentence to be scored,
        e.g. ['cats', 'are', 'cute']
    :param n_gram: the maximum n-gram size, e.g. n_gram = 3 for trigrams
    :return: a float with the sentence-level GLEU score
    """
    if n_gram <= 4:
        return gleu.sentence_gleu(desired_list, actual_list, max_len=n_gram)
    # if the n-gram order is at least 5, fall back to the standard setting
    return gleu.sentence_gleu(desired_list, actual_list)
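# Usage sketch for get_score with properly tokenized toy inputs; note that
# sentence_gleu compares token lists, not raw strings, and takes the
# best-matching reference.
references = [['cats', 'are', 'cute'], ['dogs', 'are', 'cute']]
hypothesis = ['cats', 'are', 'cute']
print(get_score(hypothesis, references, n_gram=3))  # 1.0 for an exact match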
def get_gleu_score(sentence_gleu, hyp, ref):
    """
    Returns the GLEU score.

    :param sentence_gleu: nltk.translate.gleu_score.sentence_gleu
    :param hyp: hypothesis sentence, list(str)
    :param ref: reference sentences, list(list(str))
    :return: GLEU score
    """
    return sentence_gleu(ref, hyp)
from typing import Dict, List, Optional

from nltk.translate.gleu_score import sentence_gleu


def _get_sent_gleu(
        hypothesis: List[str],
        references: List[List[str]],
        extra_args: Optional[Dict[str, str]] = None) -> List[float]:
    # transpose: one list per reference stream -> one tuple of references per sentence
    joined_references = list(zip(*references))
    return [
        sentence_gleu([rr.split() for rr in r], h.split())
        for r, h in zip(joined_references, hypothesis)
    ]
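# Usage sketch for _get_sent_gleu with hypothetical toy data: `references`
# holds one list per reference *stream*, aligned by sentence index, so
# zip(*references) regroups them per sentence before scoring.
hyps = ['the cat sat', 'he reads books']
ref_stream_a = ['the cat sat', 'he read the book']
ref_stream_b = ['a cat was sitting', 'he reads many books']
print(_get_sent_gleu(hyps, [ref_stream_a, ref_stream_b]))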
def score_sentence(self, sentence, target):
    tgt = self.itos(target)
    sen = self.itos(sentence)
    # if self.i == 0:
    #     print(tgt)
    #     print(sen)
    #     print()
    self.i = 1
    return sentence_gleu([tgt], sen)
import torch
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from nltk.translate.gleu_score import sentence_gleu


def _get_reward(y_hat, y, n_gram=6, method='gleu'):
    # Computes the reward from a sampled sequence and its reference sentence.
    # For now we use GLEU from NLTK, but you can plug in your own well-defined
    # reward function. GLEU is a variation of BLEU that is better suited to
    # per-sentence rewards in reinforcement learning.
    sf = SmoothingFunction()
    score_func = {
        'gleu': lambda ref, hyp: sentence_gleu([ref], hyp, max_len=n_gram),
        'bleu1': lambda ref, hyp: sentence_bleu([ref], hyp,
                                                weights=[1. / n_gram] * n_gram,
                                                smoothing_function=sf.method1),
        'bleu2': lambda ref, hyp: sentence_bleu([ref], hyp,
                                                weights=[1. / n_gram] * n_gram,
                                                smoothing_function=sf.method2),
        'bleu4': lambda ref, hyp: sentence_bleu([ref], hyp,
                                                weights=[1. / n_gram] * n_gram,
                                                smoothing_function=sf.method4),
    }[method]
    # Since the reward is not computed exactly as in multi-bleu.perl
    # (in particular, the tokenization differs), setting n_gram to 6 is
    # recommended.
    # |y|     = (batch_size, length1)
    # |y_hat| = (batch_size, length2)
    with torch.no_grad():
        scores = []
        for b in range(y.size(0)):
            ref, hyp = [], []
            for t in range(y.size(-1)):
                ref += [str(int(y[b, t]))]
                if y[b, t] == data_loader.EOS:
                    break
            for t in range(y_hat.size(-1)):
                hyp += [str(int(y_hat[b, t]))]
                if y_hat[b, t] == data_loader.EOS:
                    break
            # The masked_select variants below are slower than the naive
            # loops above:
            # ref = y[b].masked_select(y[b] != data_loader.PAD).tolist()
            # hyp = y_hat[b].masked_select(y_hat[b] != data_loader.PAD).tolist()
            scores += [score_func(ref, hyp) * 100.]
        scores = torch.FloatTensor(scores).to(y.device)
        # |scores| = (batch_size,)
    return scores
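# Hypothetical usage sketch for _get_reward. It assumes a data_loader module
# whose EOS constant marks end-of-sequence; here we pretend EOS == 2.
import torch
y = torch.tensor([[5, 7, 9, 2, 0]])      # reference token ids, EOS assumed = 2
y_hat = torch.tensor([[5, 9, 7, 2, 0]])  # sampled token ids
rewards = _get_reward(y_hat, y, n_gram=6, method='gleu')
print(rewards)  # tensor of shape (batch_size,), GLEU * 100 per sample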
from nltk.translate.gleu_score import sentence_gleu, corpus_gleu


def computeGLEU(outputs, targets, corpus=False, tokenizer=None, segmenter=None):
    outputs = [tokenizer(o) for o in outputs]
    targets = [tokenizer(t) for t in targets]
    if segmenter is not None:
        outputs = segmenter(outputs)
        targets = segmenter(targets)
    if not corpus:
        return [sentence_gleu([t], o) for o, t in zip(outputs, targets)]
    return corpus_gleu([[t] for t in targets], outputs)
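# Usage sketch for computeGLEU with toy strings: any callable that turns a
# string into a token list works as the tokenizer; str.split is the
# simplest choice.
outs = ['the cat sat on the mat', 'he read the book']
refs = ['the cat is on the mat', 'he reads the book']
print(computeGLEU(outs, refs, corpus=False, tokenizer=str.split))  # per-sentence scores
print(computeGLEU(outs, refs, corpus=True, tokenizer=str.split))   # single corpus score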
import csv

import nltk.translate.gleu_score as gleu


def evaluateUsingGLEU(transliterated_file, row_name):
    j, score = 0, 0
    dataReader = csv.DictReader(transliterated_file)
    for j, row in enumerate(dataReader):
        sent_man_written = row['man_written']
        reference = [tokenizer.tokenize(sent_man_written)]
        sent_machine_gen = row[row_name]
        candidate = tokenizer.tokenize(sent_machine_gen)
        score += gleu.sentence_gleu(reference, candidate)
    avg_score = score / (j + 1)
    print("Score using GLEU : ", avg_score)
import revtok
import torch
from nltk.translate.gleu_score import sentence_gleu, corpus_gleu


def computeGLEU(outputs, targets, corpus=False, tokenizer=None):
    if tokenizer is None:
        tokenizer = revtok.tokenize
    outputs = [tokenizer(o) for o in outputs]
    targets = [tokenizer(t) for t in targets]
    if not corpus:
        return torch.Tensor(
            [sentence_gleu([t], o) for o, t in zip(outputs, targets)])
    return corpus_gleu([[t] for t in targets], outputs)
import torch
from nltk.translate.bleu_score import sentence_bleu, corpus_bleu


def computeBLEU(outputs, targets, corpus=False, tokenizer=None, segmenter=None):
    outputs = [tokenizer(o) for o in outputs]
    targets = [tokenizer(t) for t in targets]
    if segmenter is not None:
        outputs = segmenter(outputs)
        targets = segmenter(targets)
    if not corpus:
        # use sentence_bleu here: the original called sentence_gleu, which is
        # a different metric than the function name promises
        return torch.Tensor(
            [sentence_bleu([t], o) for o, t in zip(outputs, targets)])
    # emulate_multibleu is kept from the original; only older NLTK releases
    # accept this keyword
    return corpus_bleu([[t] for t in targets], outputs, emulate_multibleu=True)
from collections import Counter

import numpy as np
from nltk import word_tokenize
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from nltk.translate.gleu_score import sentence_gleu


def correlation(self, prev_sentences, response, dialog):
    """
    Evaluate the relationship between every pair of sentences
    <previous_i, response> for i in [0, len(prev_sentences)] using averaged
    Word2Vec vectors with term frequencies as weights. Also compute
    translation-metric scores using the previous lines as references and
    the response as the translation hypothesis.

    TODO:
        - use bigrams
        - use tf-idf trained on the opposite character's dialog
        - add more metrics (ROUGE, NIST, LEPOR, ...)

    INPUT:
        prev_sentences - list of previous sentences in the dialog
        response - response to the last sentence
        dialog - dialog generated so far (used as context)
    OUTPUT:
        score between 0 and MAX, where MAX is the maximum correlation score.
        MAX is just the number of metrics used to evaluate the correlation;
        it has yet to be established.
    """
    # 0) Compute the term frequency of each word in the given corpus
    score = 0.0
    corpus = prev_sentences + [response]
    if dialog:
        corpus += dialog
    counts = Counter(word_tokenize('\n'.join(corpus)))

    # 1) Calculate the weighted vector distance between every pair
    #    <previous_i, response> and compute the mean of those distances
    vect_dist = 0.0
    for previous in prev_sentences:
        vect_dist += 1 - np.linalg.norm(
            self._tf_weighted_sum(previous, counts) -
            self._tf_weighted_sum(response, counts))
        # TODO: scale down the weight when going further back in the history
    score += vect_dist / len(prev_sentences)

    # 2) Sentence-level translation metrics (BLEU, GLEU). Both expect
    #    tokenized references and hypothesis; the original passed raw
    #    strings, which NLTK scores character by character.
    refs = [word_tokenize(s) for s in prev_sentences]
    hyp = word_tokenize(response)
    f = SmoothingFunction().method3
    bleu = sentence_bleu(refs, hyp, smoothing_function=f)
    gleu = sentence_gleu(refs, hyp)
    metrics_score = np.mean([bleu, gleu])
    score += metrics_score
    return score
def gleu(reference, predict):
    """Compute the sentence-level GLEU score.

    Args:
        reference (list[str])
        predict (list[str])
    """
    from nltk.translate import gleu_score
    if len(predict) == 0:
        # by convention, an empty prediction only matches an empty reference
        if len(reference) == 0:
            return 1.0
        else:
            return 0.0
    return gleu_score.sentence_gleu([reference], predict)
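# Quick check of the edge-case handling in gleu() above: empty predictions
# are scored without calling into NLTK at all.
print(gleu([], []))                          # 1.0: both empty
print(gleu(['the', 'cat'], []))              # 0.0: empty prediction
print(gleu(['the', 'cat'], ['the', 'cat']))  # 1.0: exact match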
from nltk.translate import gleu_score


def sentence_gleu(reference: str, prediction: str):
    # split into word tokens: passing raw strings to NLTK's sentence_gleu
    # would score character n-grams instead of word n-grams
    return gleu_score.sentence_gleu([reference.strip().split()],
                                    prediction.strip().split())
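# Why the .split() above matters: with raw strings, NLTK iterates over
# characters, so the two calls below on hypothetical toy strings score
# very different things.
from nltk.translate import gleu_score
ref, hyp = 'the cat sat', 'the dog sat'
print(gleu_score.sentence_gleu([ref], hyp))                  # character n-grams
print(gleu_score.sentence_gleu([ref.split()], hyp.split()))  # word n-grams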
import random

import torch
import torch.nn.functional as F
from torch.distributions import Categorical
from nltk.translate import gleu_score


def reinforce(input_tensor,
              target_tensor,
              target_sentence,  # used for calculating the GLEU score
              encoder,
              decoder,
              output_lang,
              encoder_optimizer,
              decoder_optimizer,
              max_length=MAX_LENGTH,
              teacher_forcing_ratio=0.5,
              hypothesis_to_generate=20,
              baseline_reward=0.2):
    # Part 1: generate k hypothesis sentences
    hyp_sents = []  # list of generated sentences
    hyp_probs = []  # their respective log probabilities
    for k in range(hypothesis_to_generate):
        encoder_hidden = encoder.init_hidden()
        encoder_optimizer.zero_grad()
        decoder_optimizer.zero_grad()
        input_length = input_tensor.size(0)
        target_length = target_tensor.size(0)
        encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device=DEV_)
        for ei in range(input_length):
            encoder_output, encoder_hidden = encoder(input_tensor[ei], encoder_hidden)
            encoder_outputs[ei] = encoder_output[0, 0]
        decoder_input = torch.tensor([[Lang.SOS_token]], device=DEV_)
        decoder_hidden = encoder_hidden
        use_teacher_forcing = random.random() < teacher_forcing_ratio
        out_sent = []  # the k-th generated sentence
        out_prob = 0   # and its respective log probability
        if use_teacher_forcing:
            # teacher forcing: feed the target as the next decoder input
            for di in range(target_length):
                decoder_output, decoder_hidden, decoder_attention = decoder(
                    decoder_input, decoder_hidden, encoder_outputs)
                try:
                    m = Categorical(logits=decoder_output)
                    action = m.sample()
                    if action.cpu().item() == Lang.EOS_token:
                        break
                    out_sent.append(output_lang.idx2word[action.cpu().item()])
                    out_prob += decoder_output[0][action.cpu().item()]
                    decoder_input = target_tensor[di]  # teacher forcing
                except Exception as e:
                    print(e)
                    breakpoint()
        else:
            # without teacher forcing: use the decoder's own predictions
            for di in range(target_length):
                decoder_output, decoder_hidden, decoder_attention = decoder(
                    decoder_input, decoder_hidden, encoder_outputs)
                try:
                    m = Categorical(logits=decoder_output)
                    action = m.sample()
                    if action.item() == Lang.EOS_token:
                        break
                    out_sent.append(output_lang.idx2word[action.cpu().item()])
                    out_prob += decoder_output[0][action.cpu().item()]
                    decoder_input = action.detach()
                except Exception as e:
                    print(e)
                    breakpoint()  # FIXME check this!
        hyp_sents.append(out_sent)
        hyp_probs.append(out_prob)

    hyp_probs = torch.tensor(hyp_probs, device=DEV_)
    # FIXME normalize the probability values
    hyp_probs = F.softmax(hyp_probs, dim=0)
    print(hyp_probs)

    # Part 2: score every hypothesis against the target sentence using GLEU
    scores = []
    for k in range(hypothesis_to_generate):
        scores.append(gleu_score.sentence_gleu([target_sentence], hyp_sents[k]))
    scores = torch.tensor(scores, device=DEV_)

    reward = torch.sum(hyp_probs * scores)
    baseline = reward / hypothesis_to_generate  # TODO check the baseline
    loss = -torch.sum(torch.log(hyp_probs)) * (reward - baseline)
    print('loss {} - reward {} - baseline {}'.format(loss, reward, baseline))
    if loss < 1e-3:
        breakpoint()
    loss.backward()
    encoder_optimizer.step()
    decoder_optimizer.step()
    return loss.item() / target_length  # TODO check this
from nltk.translate.gleu_score import sentence_gleu


def computeGleu(target, reference):
    # tokenize with .split(): raw strings would be scored as character n-grams
    return [sentence_gleu([ref.strip().split()], tgt.strip().split())
            for (ref, tgt) in zip(reference, target)]
import re

import numpy as np
from nltk.translate.chrf_score import sentence_chrf
from nltk.translate.gleu_score import sentence_gleu


def compute_features(data):
    # Initialize all feature placeholders
    data[ratio_num_char_source_candidate] = []
    data[ratio_num_tokens_source_candidate] = []
    data[ratio_mean_token_length_source_candidate] = []
    data[ratio_common_bigrams_candidate_reference] = []
    data[ratio_num_token_candidate_reference] = []
    data[gleu_scores] = []
    data[bleu_scores] = []
    data[chrf_scores] = []
    data[labels] = []
    data[ratio_tree_height_candidate_reference] = []

    tokens = re.compile(r'\S+').findall
    for line_idx in range(len(data[source_lines])):
        source = data[source_lines][line_idx]
        candidate = data[candidate_lines][line_idx]
        reference = data[reference_lines][line_idx]

        # Feature: gleu_scores (tokenize first: sentence_gleu expects a list
        # of tokenized references and a tokenized hypothesis)
        data[gleu_scores].append(
            sentence_gleu([reference.split()], candidate.split()))

        # Feature: chrf_scores
        data[chrf_scores].append(sentence_chrf(reference, candidate))

        # Feature: bleu_scores
        data[bleu_scores].append(float(data[bleu_scores_lines][line_idx]))

        # Feature: ratio_num_char_source_candidate (whitespace stripped out)
        data[ratio_num_char_source_candidate].append(
            len(re.sub(r'\s+', '', source)) / len(re.sub(r'\s+', '', candidate)))

        # Feature: ratio_num_tokens_source_candidate
        data[ratio_num_tokens_source_candidate].append(
            len(tokens(source)) / len(tokens(candidate)))

        # Feature: ratio_num_token_candidate_reference (counts the remaining
        # whitespace characters as a proxy for the token count)
        data[ratio_num_token_candidate_reference].append(
            len(re.sub(r'\S+', '', candidate)) / len(re.sub(r'\S+', '', reference)))

        # Feature: ratio_mean_token_length_source_candidate
        data[ratio_mean_token_length_source_candidate].append(
            np.mean(list(map(len, tokens(source)))) /
            np.mean(list(map(len, tokens(candidate)))))

        # Feature: ratio_common_bigrams_candidate_reference
        ref_bigrams = list(zip(tokens(reference)[:-1], tokens(reference)[1:]))
        cand_bigrams = list(zip(tokens(candidate)[:-1], tokens(candidate)[1:]))
        data[ratio_common_bigrams_candidate_reference].append(
            len(set(ref_bigrams) & set(cand_bigrams)) / len(ref_bigrams))

        # Feature: ratio_tree_height_candidate_reference
        data[ratio_tree_height_candidate_reference].append(
            data[candidate_tree_heights][line_idx] /
            data[reference_tree_heights][line_idx])

        # Feature: labels
        data[labels].append(1 if data[provided_labels][line_idx] == "H" else 0)
def main():
    from nltk.translate.gleu_score import corpus_gleu, sentence_gleu

    eos = 6
    reference_batch = [[1, 1, 2, 1, eos]]  # , [5, 1, eos, 0, 0], [2, 5, 3, eos, 1]]
    candidate_batch = [[1, 3, 1, eos, 0]]  # , [5, 2, eos, 0, 0], [2, 2, 3, eos, 0]]
    row = 0
    seq_length = len(candidate_batch[row])

    true_batch_gleu = corpus_gleu([[_crop(r, eos)] for r in reference_batch],
                                  [_crop(c, eos) for c in candidate_batch])
    gleu_score, n_match, tpfp, tpfn = custom_sentence_gleu(
        [_crop(reference_batch[row], eos)], _crop(candidate_batch[row], eos))
    true_gleu_scores = [
        sentence_gleu([_crop(reference_batch[k], eos)],
                      _crop(candidate_batch[k], eos))
        for k in range(len(candidate_batch))
    ]
    print("true gleu: {}, n_match: {}, tpfp: {}, tpfn: {}".format(
        gleu_score, n_match, tpfp, tpfn))

    gleu_scorer = GleuScorer(seq_length=seq_length, vocab_size=eos + 1,
                             eos_idx=eos, input_type=ONEHOT_SOFT)
    # feed_hyp = np_label_smoothing(np_onehot(np.array(candidate_batch)), epsilon=1e-5)
    # feed_refs = np_label_smoothing(np_onehot(np.array(reference_batch)), epsilon=1e-5)
    feed_hyp = np_onehot(np.array(candidate_batch))
    feed_refs = np_onehot(np.array(reference_batch))
    # print("---> {}".format(feed_refs))

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        feed_dict = {
            gleu_scorer.hypothesis: feed_hyp,
            gleu_scorer.reference: feed_refs
        }
        targets = [
            gleu_scorer.batch_gleu_score, gleu_scorer.sentence_n_match,
            gleu_scorer.tpfn, gleu_scorer.tpfp,
            gleu_scorer.sentence_gleu_score, gleu_scorer.individual_ngrams[0]
        ]
        (batch_gleu, n_match, tpfn, tpfp, gleu, ngram) = sess.run(
            targets, feed_dict=feed_dict)
        print("our gleu: {}, n_match: {}, tpfp: {}, tpfn: {}".format(
            gleu[row], n_match[row], tpfp[row], tpfn[row]))
        print("\n\nBatch gleu's. official: {}. ours: {}".format(
            true_batch_gleu, batch_gleu))
        print("\n\nall gleus....")
        print("true ones: {}".format(true_gleu_scores))
        print("ours: {}".format(gleu))
        print("ngram: {}".format(ngram))
import nltk.translate.gleu_score as gleu


def glue_similarity(hyp, ref):
    # tokenize both sides; the original left `ref` as a raw string, which
    # NLTK would have scored as a character sequence
    return gleu.sentence_gleu([hyp.split()], ref.split())
import numpy as np
from nltk.translate.bleu_score import sentence_bleu, corpus_bleu
from nltk.translate.gleu_score import sentence_gleu, corpus_gleu
from nltk.translate.meteor_score import single_meteor_score
from rouge import Rouge  # py-rouge package


def score_compute(comp_res):
    res_wer = []
    bleu_indi1, bleu_indi2, bleu_indi3, bleu_indi4 = [], [], [], []
    bleu_cum2, bleu_cum3, bleu_cum4 = [], [], []
    gleu_sent = []
    meteor_score = []
    rouge_score = []
    translated = []
    reference = []
    for i in range(len(comp_res)):
        reference.append([comp_res[i][0].split(' ')])
        translated.append(comp_res[i][1].split(' '))
    bleu_corpus = corpus_bleu(reference, translated)
    # sacrebleu_corpus = sacrebleu.corpus_bleu(translated, reference)
    gleu_corpus = corpus_gleu(reference, translated)

    # evaluator object for the ROUGE-L metric
    evaluator = Rouge(metrics=['rouge-l'],
                      limit_length=True,
                      length_limit=100,
                      length_limit_type='words',
                      apply_avg=True,
                      apply_best=False,
                      alpha=0.5,  # default F1 weight
                      weight_factor=1.2,
                      stemming=True)

    for result_pair in comp_res:
        # ------------ WER
        res_wer.append(wer(result_pair[0].split(' '), result_pair[1].split(' ')))

        # ------------ BLEU
        ref_tokens = [result_pair[0].split(' ')]
        hyp_tokens = result_pair[1].split(' ')
        # individual 1- to 4-gram scores
        indi1_gr = sentence_bleu(ref_tokens, hyp_tokens, weights=(1, 0, 0, 0))
        indi2_gr = sentence_bleu(ref_tokens, hyp_tokens, weights=(0, 1, 0, 0))
        indi3_gr = sentence_bleu(ref_tokens, hyp_tokens, weights=(0, 0, 1, 0))
        indi4_gr = sentence_bleu(ref_tokens, hyp_tokens, weights=(0, 0, 0, 1))
        # cumulative 2-gram, 3-gram and 4-gram BLEU
        cum2_gr = sentence_bleu(ref_tokens, hyp_tokens, weights=(0.5, 0.5, 0, 0))
        cum3_gr = sentence_bleu(ref_tokens, hyp_tokens, weights=(0.33, 0.33, 0.33, 0))
        cum4_gr = sentence_bleu(ref_tokens, hyp_tokens, weights=(0.25, 0.25, 0.25, 0.25))

        gleu_s = sentence_gleu(ref_tokens, hyp_tokens)
        # note: newer NLTK versions expect pre-tokenized input here
        meteor = round(single_meteor_score(result_pair[0], result_pair[1]), 4)
        rouge_all = evaluator.get_scores(result_pair[1], result_pair[0])
        rouge_l_f1 = rouge_all['rouge-l']['f']

        bleu_indi1.append(indi1_gr)
        bleu_indi2.append(indi2_gr)
        bleu_indi3.append(indi3_gr)
        bleu_indi4.append(indi4_gr)
        bleu_cum2.append(cum2_gr)
        bleu_cum3.append(cum3_gr)
        bleu_cum4.append(cum4_gr)
        gleu_sent.append(gleu_s)
        meteor_score.append(meteor)
        rouge_score.append(rouge_l_f1)

    wer_mean = np.mean(res_wer)
    wer_var = np.var(res_wer)
    bleus = (np.mean(bleu_indi1), np.mean(bleu_indi2), np.mean(bleu_indi3),
             np.mean(bleu_indi4), np.mean(bleu_cum2), np.mean(bleu_cum3),
             np.mean(bleu_cum4), bleu_corpus)
    gleus = (np.mean(gleu_sent), gleu_corpus)
    return (wer_mean, wer_var, bleus, gleus,
            np.mean(meteor_score), np.mean(rouge_score))
from nltk.translate.gleu_score import sentence_gleu


def _gleu(x, y):
    return sentence_gleu([x.split(" ")], y.split(" "))
def gleu_multi(x, y):
    # x is a list of reference strings, y a single hypothesis string
    return sentence_gleu([xx.split(" ") for xx in x], y.split(" "))
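# Usage sketch for gleu_multi with hypothetical toy strings: several
# references for one hypothesis; NLTK keeps the best-matching reference.
refs = ['the cat sat on the mat', 'a cat was sitting on the mat']
print(gleu_multi(refs, 'the cat sat on a mat'))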
from nltk.translate.gleu_score import sentence_gleu


def gleu_one(prediction, ground_truth):
    '''
    Character-level inputs, e.g. (Korean for "I am a fool." / "Am I a fool?"):
    prediction:   ['나', '는', ' ', '바', '보', '다', '.']
    ground_truth: ['나', '는', ' ', '바', '보', '일', '까', '?']
    '''
    return sentence_gleu([ground_truth], prediction) * 100.
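# Usage sketch for gleu_one: list() on a string yields exactly the
# character tokens shown in the docstring, which suits languages where
# word segmentation is unreliable.
print(gleu_one(list('나는 바보다.'), list('나는 바보일까?')))  # 0-100 scale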
import nltk.translate.gleu_score as gleu

num_sentences = len(hypothesis) - 1
rf = open(ref_file, "r")
reference = rf.read().split("\n")
sf = open(scores_file, "w")
gleu_score_average = 0.0
real_num_sentences = 0
for i in range(0, num_sentences):
    if len(reference[i].strip()) != 0 or len(hypothesis[i].strip()) != 0:
        print("Ref" + str(i) + ": " + reference[i])
        print("Hyp" + str(i) + ": " + hypothesis[i])
        ref, hypo = reference[i].split(), hypothesis[i].split()
        gleu_score_average = gleu_score_average + gleu.sentence_gleu([ref], hypo)
        real_num_sentences = real_num_sentences + 1
gleu_score_average = gleu_score_average / real_num_sentences
print("Sentences: " + str(real_num_sentences) +
      "; GLEU score average: " + str(gleu_score_average))
scores_str = 'GLEU: ' + str(gleu_score_average)
sf.write(scores_str)
sf.close()
# print('Average Metric Score for All Review Summary Pairs:')
# print(scores_str)
def predictTestset(self, sess):
    """ Try predicting the sentences from the samples.txt file.
    The sentences are saved in modelDir under the same name.
    Args:
        sess: the current running session
    """
    lines = []
    hypseqs = []
    refs = []
    hyps = []
    average_bleu = 0
    average_gleu = 0
    average_uni_ratio = 0
    average_bi_ratio = 0
    total_token = 0
    uni_dict = {}
    bi_dict = {}
    av_total_uni_ratio = 0.0
    av_total_bi_ratio = 0.0

    flag = self.textData.loadTestData(self.textData.testSamplesDir)
    if not flag:
        # Loading the file to predict
        with open(os.path.join(self.args.rootDir, self.TEST_IN_NAME), 'r') as f:
            lines = f.readlines()
    else:
        for sample in self.textData.testingSamples:
            lines.append(self.textData.sequence2str(sample[0], clean=True))
            hypseqs.append(self.textData.sequence2str(sample[1], clean=True))

    modelList = self._getModelList()
    if not modelList:
        print('Warning: No model found in \'{}\'. Please train a model before '
              'trying to predict'.format(self.modelDir))
        return

    # Predicting for each model present in modelDir
    for modelName in sorted(modelList):  # TODO: natural sorting
        print('Restoring previous model from {}'.format(modelName))
        self.saver.restore(sess, modelName)
        print('Testing...')

        # Remove the model extension and add the prediction suffix
        saveName = modelName[:-len(self.MODEL_EXT)] + self.TEST_OUT_SUFFIX
        with open(saveName, 'w') as f:
            nbIgnored = 0
            index = 0
            for line in tqdm(lines, desc='Sentences'):
                if not flag:
                    question = line[:-1]  # remove the endline character
                else:
                    question = line
                answer = self.singlePredict(question)
                if not answer:
                    nbIgnored += 1
                    continue  # back to the beginning, try again

                if not flag:
                    predString = '{x[0]}{0}\n{x[1]}{1}\n\n'.format(
                        question,
                        self.textData.sequence2str(answer, clean=True),
                        x=self.SENTENCES_PREFIX)
                else:
                    predString = '{x[0]}{0}\n{x[1]}{1}\n{y}{2}\n\n'.format(
                        question,
                        self.textData.sequence2str(answer, clean=True),
                        hypseqs[index],
                        x=self.SENTENCES_PREFIX,
                        y='T: ')
                    ref = nltk.word_tokenize(
                        self.textData.sequence2str(answer, clean=True))
                    refs.append([ref])
                    hyp = nltk.word_tokenize(hypseqs[index])
                    hyps.append(hyp)

                    # per-sentence unigram/bigram diversity statistics
                    tokens = len(ref)
                    total_token += tokens
                    dic1 = {k: ref.count(k) for k in set(ref)}
                    uni_types = len(dic1)
                    dic2 = {}
                    for i in range(len(ref) - 1):
                        item = ref[i] + ' ' + ref[i + 1]
                        if item in dic2:
                            dic2[item] += 1
                        else:
                            dic2[item] = 1
                    bi_types = len(dic2)
                    uni_ratio = float(uni_types) / float(tokens) if tokens > 0 else 0
                    bi_ratio = float(bi_types) / float(tokens - 1) if tokens > 1 else 0
                    for it1 in dic1:
                        uni_dict[it1] = uni_dict.get(it1, 0) + 1
                    for it2 in dic2:
                        bi_dict[it2] = bi_dict.get(it2, 0) + 1

                    bleu = bleu_score.sentence_bleu(
                        [ref], hyp,
                        smoothing_function=bleu_score.SmoothingFunction().method2)
                    try:
                        # sentence_gleu expects a list of references
                        gleu = gleu_score.sentence_gleu([ref], hyp)
                    except ZeroDivisionError:
                        print("Error: division by zero, need a smoothing function.")
                        gleu = 0.0
                    predString = predString + (
                        "Sentence BLEU %.4f, Sentence Google-BLEU %.4f.\n"
                        "Unigram diversity %.4f, Bigram diversity %.4f.\n\n"
                        % (bleu, gleu, uni_ratio, bi_ratio))
                    average_bleu += bleu
                    average_gleu += gleu
                    average_bi_ratio += bi_ratio
                    average_uni_ratio += uni_ratio

                if self.args.verbose:
                    tqdm.write(predString)
                f.write(predString)
                index += 1

            if flag:
                average_bleu /= (len(lines) - nbIgnored)
                average_gleu /= (len(lines) - nbIgnored)
                average_uni_ratio /= (len(lines) - nbIgnored)
                average_bi_ratio /= (len(lines) - nbIgnored)
                av_total_uni_ratio = float(len(uni_dict)) / float(total_token)
                av_total_bi_ratio = float(len(bi_dict)) / float(
                    total_token - len(lines) + nbIgnored)
                corpus_bleu = bleu_score.corpus_bleu(
                    refs, hyps,
                    smoothing_function=bleu_score.SmoothingFunction().method2)
                f.write(
                    "Average BLEU %.4f, Average Google-BLEU %.4f, Corpus BLEU %.4f.\n"
                    "Average Unigram diversity %.4f, Average Bigram diversity %.4f.\n"
                    "Average T-Unigram diversity %.4f, Average T-Bigram diversity %.4f."
                    % (average_bleu, average_gleu, corpus_bleu,
                       average_uni_ratio, average_bi_ratio,
                       av_total_uni_ratio, av_total_bi_ratio))
        print('Prediction finished, {}/{} sentences ignored (too long)'.format(
            nbIgnored, len(lines)))
from nltk.translate import gleu_score


def compute_gleu(target, translation):
    # undo BPE segmentation ("@@ " joins subwords) before scoring
    gleu = gleu_score.sentence_gleu(
        [target.replace("@@ ", "").split(" ")],
        translation.replace("@@ ", "").split(" "))
    return gleu
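# Usage sketch for compute_gleu with hypothetical toy strings: "@@ " is the
# continuation marker that subword-nmt-style BPE inserts, so removing it
# restores full words before word-level scoring.
tgt = 'the olymp@@ ics were great'
hyp = 'the olymp@@ ics were good'
print(compute_gleu(tgt, hyp))  # scored on ['the', 'olympics', 'were', ...]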