def compute_novelty(sentences, corpus_file, opt, idx_to_word):
    """Computes the novelty of a batch of sentences given a corpus."""
    # Prepare sampled sentences and the corpus to compare against
    ref = sentences[0].split("\n")
    sentences = [s.split(" ") for s in sentences[1].split("\n")]
    with open(corpus_file, 'r') as f:
        corpus = [s.rstrip().split(" ") for s in f.readlines()]

    # Drop corpus sentences much longer than the sampled sentence length
    corpus = [s for s in corpus if len(s) < opt.sample_len + 5]

    # Novelty of a sentence = TER to its closest corpus sentence
    novelty = []
    closest = []
    for i, sen in enumerate(sentences):
        print("Computing novelty for sentence {}/{}.\n".format(
            i + 1, len(sentences)))
        mindex = np.argmin(np.array([ter(sen, s) for s in corpus]))
        novelty.append(ter(sen, corpus[mindex]))
        closest.append(" ".join(
            [idx_to_word[int(idx)] for idx in corpus[mindex]]))
        print("Novelty: {}, Sentence: {}, Closest: {}\n".format(
            novelty[i], ref[i], closest[i]))
    return sum(novelty) / float(len(novelty)), sorted(
        zip(novelty, ref, closest))
def spl(mt_path, ht_path):
    """Scores per line (BLEU and TER)."""
    logger.info([mt_path, ht_path])
    sacreBLEU = subprocess.Popen(
        "cat {} | sacrebleu -sl -b {} > {}.bpl".format(mt_path, ht_path, mt_path),
        cwd=app.config['TMP_FOLDER'],
        shell=True,
        stdout=subprocess.PIPE)
    sacreBLEU.wait()

    with open('{}.bpl'.format(mt_path), 'r') as bl_file:
        rows = [{"bleu": line.strip()} for line in bl_file]
    os.remove("{}.bpl".format(mt_path))

    with open(ht_path) as ht_file, open(mt_path) as mt_file:
        for i, row in enumerate(rows):
            ht_line = ht_file.readline().strip()
            mt_line = mt_file.readline().strip()
            if ht_line and mt_line:
                ter = round(pyter.ter(ht_line.split(), mt_line.split()), 2)
                rows[i]['ter'] = 100 if ter > 1 else utils.parse_number(
                    ter * 100, 2)
                rows[i]['text'] = mt_line
    return rows
def ter(rw, hw):
    # TER between reference words `rw` and hypothesis words `hw`,
    # formatted to three decimal places
    terScore = '%.3f' % pyter.ter(hw, rw)
    return terScore

# Progress message from the calling loop (`sentCounter` is defined there;
# in the original it sat unreachably after the return):
# print("SENTENCE", sentCounter + 1, "calculation completed")
def score_instance(self,
                   hypothesis: List[str],
                   reference: List[str]) -> float:
    # Both sides present: regular TER
    if reference and hypothesis:
        return pyter.ter(hypothesis, reference)
    # Both empty: treat as a perfect match
    if not reference and not hypothesis:
        return 0.0
    # Exactly one side empty: worst-case error rate
    return 1.0
def ter(ref, gen):
    '''
    Args:
        ref - list of reference sentences (strings)
        gen - list of generated sentences (strings)
    Returns:
        TER score averaged over all sentence pairs
    '''
    # zip pairs the sentences one-to-one; this subsumes the original
    # single-sentence special case and avoids indexing past `ref`
    total_score = 0.0
    for hyp, r in zip(gen, ref):
        total_score += pyter.ter(hyp.split(), r.split())
    return total_score / len(gen)
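# A minimal usage sketch for the averaged-TER helper above (assumes the
# `pyter` package is installed; the sentences are illustrative data).
import pyter

refs = ['the cat sat on the mat', 'hello world']
gens = ['the cat sat on a mat', 'hello world']
print('avg TER: %.3f' % ter(refs, gens))  # 0.0 only if every pair matches exactly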
def get_value(self, mt_path, ht_path):
    ter = 0.0
    count = 0
    with open(mt_path, 'r') as mt_file, open(ht_path, 'r') as ht_file:
        for mt_line, ht_line in zip(mt_file, ht_file):
            ter += pyter.ter(ht_line.split(), mt_line.split())
            count += 1
    # Average sentence-level TER as a percentage; guard against empty files
    # (the original raised a NameError when no lines were read)
    ter = round((ter / count) * 100, 2) if count else 0.0
    return 100.0, float(ter), 0.0
def __call__(self, decoded, references):
    ter_sum = 0.
    for hyp, ref in zip(decoded, references):
        if ref and hyp:
            ter_sum += pyter.ter(hyp, ref)
        elif not ref and not hyp:
            ter_sum += 0.  # both empty: perfect match
        else:
            ter_sum += 1.  # one side empty: worst-case error
    return ter_sum / len(decoded)
def __call__(self, decoded, references) -> float:
    ter_sum = 0.
    count = 0
    for hyp, ref in zip(decoded, references):
        count += 1
        if ref and hyp:
            ter_sum += pyter.ter(hyp, ref)
        elif not ref and not hyp:
            ter_sum += 0.
        else:
            ter_sum += 1.
    return ter_sum / count
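# Behaviour sketch for the __call__ metric above, assuming it is bound to a
# metric instance `metric` (hypothetical name; `decoded` and `references`
# are lists of token lists). Per pair: both non-empty -> pyter.ter(hyp, ref);
# both empty -> 0.0; one side empty -> 1.0; the result is the mean.
# Example: metric([['a', 'b'], []], [['a', 'c'], []])
#          == (0.5 + 0.0) / 2 == 0.25   (one substitution over a 2-word ref)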
from typing import List

def get_ter_score(hypothesis: List[List[str]], reference: List[str]) -> list:
    ter_score_list = []
    for hyps, ref in zip(hypothesis, reference):
        try:
            ter_score = 0.0
            for hyp_n in hyps:
                ter_score += pyter.ter(hyp_n, ref)
            ter_score_list.append(ter_score / len(hyps))
        except Exception:
            # Skip pairs that cannot be scored (e.g. empty n-best lists)
            continue
    return ter_score_list
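# Usage sketch for the n-best helper above (illustrative data; requires
# `pyter`). The annotation says List[str] for the references, but pyter.ter
# expects token sequences, so tokenised references are assumed here.
import pyter

hyps = [[['the', 'cat', 'sat'], ['a', 'cat', 'sat']]]
refs = [['the', 'cat', 'sat']]
print(get_ter_score(hyps, refs))  # one mean-TER value per n-best list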
def compute_ter(pred, data, pad_idx):
    """Computes the translation error rate of predicted sentences.

    Args:
        pred (list): [num_sentences, max_len]. Predictions in index form.
        data (list): [num_sentences, max_len]. Gold-standard indices.
    Returns:
        float: corpus TER between 0 and 1.
    """
    pred = [remove_padding(p, pad_idx) for p in pred]
    data = [remove_padding(d, pad_idx) for d in data]
    return sum(ter(p, d) for p, d in zip(pred, data)) / float(len(pred))
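# Hypothetical usage of compute_ter above. `remove_padding` and `ter` are
# assumed helpers from the same module; the minimal stand-ins below are mine,
# not from the original source, so the example runs standalone (needs `pyter`).
import pyter

def remove_padding(seq, pad_idx):
    # Keep everything that is not the padding index
    return [tok for tok in seq if tok != pad_idx]

def ter(hyp, ref):
    return pyter.ter(hyp, ref)

pred = [[7, 8, 9, 0, 0]]  # 0 is the padding index
gold = [[7, 8, 0, 0, 0]]
print(compute_ter(pred, gold, pad_idx=0))  # 1 edit / 2 ref tokens = 0.5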
def cal_seq(data: pd.DataFrame, selected_list: list, remaining_list: list,
            n: int, memo_ter: list, alpha: float, beta: float):
    # assert n > 0, "Number of selected predictions has to be a positive integer."
    # Base case: pick the top score in data["Scores"] as the first selection
    if n == 1:
        selected_idx = np.argmax(data["Scores"].to_numpy())
        selected_list.append(selected_idx)
        remaining_list[selected_idx] = False
        return selected_list, remaining_list, memo_ter

    # Recursive case: select the first n-1, then score the remainder
    selected_list, remaining_list, memo_ter = cal_seq(
        data, selected_list, remaining_list, n - 1, memo_ter, alpha, beta)

    # ter_list stores the TER of every hypothesis against the latest selection;
    # indexes already marked False in remaining_list are excluded (left at 0.0)
    ter_list = [0.0] * len(data.index)
    ref = data["Predictions"][selected_list[n - 2]]  # latest selected index
    for iter_i in range(len(data.index)):
        if remaining_list[iter_i]:
            ter_list[iter_i] = pyter.ter(data["Predictions"][iter_i], ref)
    memo_ter[n - 2] = ter_list  # save TER scores to memo_ter

    # Diversity score: mean TER against all previously selected hypotheses
    sum_ter = np.zeros((len(data.index), 1))
    z_scores = np.full(sum_ter.shape, -np.inf)  # unselected by default
    for j in range(n - 1):
        # reshape(-1, 1) replaces the original hard-coded (50, 1), which only
        # worked for an n-best size of exactly 50
        sum_ter += np.array(memo_ter[j], dtype=np.float64).reshape(-1, 1)
    sum_ter = sum_ter / (n - 1)

    # Combined score for the remaining hypotheses: quality plus diversity
    z_scores[remaining_list] = alpha * np.array(
        data["Scores"][remaining_list]).reshape(-1, 1) + beta * sum_ter[remaining_list]
    selected_idx = np.argmax(z_scores)
    selected_list.append(selected_idx)
    remaining_list[selected_idx] = False
    return selected_list, remaining_list, memo_ter
def escolha_ref_ter(references, candidate):
    """
    TER uses the reference closest to the translator's output. This function
    computes the TER against every reference and picks the smallest value.

    :param references: list of translations in the test corpus.
    :param candidate: list of translations from the translation system.
    :return: minimum TER, and the reference used to obtain that minimum.
    """
    score = [pyter.ter(candidate, r) for r in references]
    index_score = score.index(min(score))
    reference_escolhida = references[index_score]
    return min(score), reference_escolhida
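# Usage sketch for escolha_ref_ter (illustrative data; requires `pyter`):
# the reference with the lowest TER against the candidate is returned.
import pyter

references = ['the cat sat on the mat'.split(), 'a cat was on the mat'.split()]
candidate = 'the cat sat on a mat'.split()
best_ter, best_ref = escolha_ref_ter(references, candidate)
print(best_ter, ' '.join(best_ref))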
def ter(ref_path, hyp_path):
    """Compute the average Translation Edit Rate between two aligned files."""
    ter_score = 0.0
    line_cpt = 0
    with open(ref_path) as ref_fp, open(hyp_path) as hyp_fp:
        for ref_line, hyp_line in zip(ref_fp, hyp_fp):
            ter_score += pyter.ter(hyp_line.strip().split(),
                                   ref_line.strip().split())
            line_cpt += 1
    # Default to the worst score when the files are empty
    mean_ter = ter_score / line_cpt if line_cpt > 0 else 1.0
    return mean_ter
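# Runnable sketch of the file-based helper above: two tiny aligned files are
# written on the fly (file names are illustrative; requires `pyter`).
import pyter

with open('ref.txt', 'w') as f:
    f.write('the cat sat on the mat\n')
with open('hyp.txt', 'w') as f:
    f.write('the cat sat on a mat\n')
print('mean TER: %.3f' % ter('ref.txt', 'hyp.txt'))  # 1 edit / 6 ref words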
def ter_score(self, ref, hyp):
    """
    pyter: https://pypi.python.org/pypi/pyter/0.2.2.1
    Java tercom.jar: http://www.cs.umd.edu/~snover/tercom/
    Tercom on GitHub: https://github.com/jhclark/tercom

    os.system('java -jar {dir}dependencies/tercom.7.25.jar -r {ref_file} '
              '-h {hyp_file} -n {dir}{output_file}'.format(
                  hyp_file=hyp, ref_file=ref, dir=utils.project_dir_name(),
                  output_file="assets/test_ter.txt"))

    :param ref: reference text (separated into words)
    :param hyp: hypothesis text (separated into words)
    :return: TER score
    """
    return pyter.ter(hyp, ref)
def ter_score(references, hypothesis, num_refs):
    logging.info('STARTING TO COMPUTE TER...')
    print('STARTING TO COMPUTE TER...')
    ter_scores = []
    for hyp, refs in zip(hypothesis, references):
        candidates = []
        for ref in refs[:num_refs]:
            if len(ref) == 0:
                ter_score = 1
            else:
                try:
                    ter_score = pyter.ter(hyp.split(), ref.split())
                except Exception:
                    # Treat unscorable pairs as the worst case
                    ter_score = 1
            candidates.append(ter_score)
        # Keep the best (lowest) TER across the available references
        ter_scores.append(min(candidates))
    logging.info('FINISHED COMPUTING TER...')
    print('FINISHED COMPUTING TER...')
    return sum(ter_scores) / len(ter_scores)
def test_paper():
    ref = ('SAUDI ARABIA denied THIS WEEK information published '
           'in the AMERICAN new york times').split()
    hyp = 'THIS WEEK THE SAUDIS denied information published in the new york times'.split()
    assert 0.3076923076923077 == pyter.ter(hyp, ref)
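# Why 0.3076923...: TER is the number of edits divided by the reference
# length. The reference above has 13 words and the paper's example needs
# 4 edits, so TER = 4 / 13 ≈ 0.3077.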
def _add_cache(self, iwords, mat):
    node = self._cache
    skipnum = len(iwords) - len(mat)
    # Walk down the part of the trie that is already cached
    for i in range(skipnum):
        node = node[iwords[i]][0]
    assert len(iwords[skipnum:]) == len(mat)
    # Store one edit-distance row per remaining word
    for word, row in zip(iwords[skipnum:], mat):
        if word not in node:
            node[word] = [{}, None]
        value = node[word]
        if value[1] is None:
            value[1] = tuple(row)
        node = value[0]

def _find_cache(self, iwords):
    node = self._cache
    start_position, row = 0, None
    # Follow the longest cached prefix of iwords
    for idx, word in enumerate(iwords):
        if word in node:
            start_position = idx + 1
            node, row = node[word]
        else:
            break
    return start_position, row

# Standalone demo (module level in the original source)
ref = ' hello how are you '.split()
hyp = 'bonjour toi hellojdioro how '.split()
print('%.3f' % pyter.ter(hyp, ref))
def ter_score(self, src_x, src_y):
    return pyter.ter(src_x.split(), src_y.split())
def metrics(fname):
    """Compares candidate sentences in `fname` against poc_english.txt using
    BLEU, sentence-embedding cosine similarity, WER and TER."""
    from nltk.translate.bleu_score import sentence_bleu

    def read_pairs():
        # Each candidate line looks like "[sentence one, sentence two]";
        # returns (reference tokens, list of candidate token lists) pairs.
        with open("poc_english.txt", "r") as f, open(fname, "r") as f2:
            lines = f.readlines()
            cand = f2.readlines()
        pairs = []
        for i in range(len(cand)):
            candidate = [
                item.strip('.').split(" ")
                for item in cand[i].lower().strip('\n')[1:len(cand[i]) - 2].split(", ")
            ]
            reference = [w.lower() for w in lines[i].strip('.\n').split(" ")]
            pairs.append((reference, candidate))
        return pairs

    # BLEU (unigram precision only, via weights=(1, 0))
    scores = []
    for reference, candidate in read_pairs():
        for c in candidate:
            scores.append(sentence_bleu([reference], c, weights=(1, 0)))
    print("BLEU: " + str(sum(scores) / (1.0 * len(scores))))

    # Sentence-embedding cosine similarity (RoBERTa sentence embeddings;
    # the printed label keeps the original wording)
    import torch
    import torch.nn.functional as F
    from sentence_transformers import SentenceTransformer
    from nltk import tokenize

    def similarity(par1, par2):
        transformer = SentenceTransformer('roberta-base-nli-stsb-mean-tokens')
        transformer.eval()
        vec1 = torch.Tensor(transformer.encode(tokenize.sent_tokenize(par1))).mean(0)
        vec2 = torch.Tensor(transformer.encode(tokenize.sent_tokenize(par2))).mean(0)
        return F.cosine_similarity(vec1, vec2, dim=0).item()

    scores = []
    for reference, candidate in read_pairs():
        if len(reference) == 1:
            continue
        for c in candidate:
            scores.append(similarity(" ".join(reference), " ".join(c)))
    print("Word2Vec Cosine Similarity: " + str(sum(scores) / (1.0 * len(scores))))

    # WER (wer_score is an external helper in the original source)
    scores = []
    for reference, candidate in read_pairs():
        if len(reference) == 1:
            continue
        for c in candidate:
            scores.append(wer_score(c, reference))
    print("WER: " + str(sum(scores) / (1.0 * len(scores))))

    # TER. Note: pyter.ter is typically called as ter(hypothesis, reference);
    # the original passes the reference first, which is preserved here.
    import pyter
    scores = []
    for reference, candidate in read_pairs():
        if len(reference) == 1:
            continue
        for c in candidate:
            scores.append(pyter.ter(reference, c))
    print("TER: " + str(sum(scores) / (1.0 * len(scores))))
def ter_sim(text, hypo):
    return ter(text, hypo)
def test_same():
    s = '''Since the visigoth period, the term Hispania, up until then used geographically, began to be also used with a political connotation, as an example the use of the expression Laus Hispaniae to describe the history of the towns of the peninsula in the chronicles of Isodoro de Sevilla.'''
    assert pyter.ter(s.split(), s.split()) == 0
def test(corpus, test_pairs, max_length, enable_cuda, epoch, transformer=False):
    scores_bleu = []
    scores_ter = []
    chencherry = SmoothingFunction()

    # Greedy decoding
    greedy_ref = open("greedy.ref", 'w', encoding='utf8')
    greedy_hyp = open("greedy.hyp", 'w', encoding='utf8')
    for i, (english, french) in tqdm(enumerate(list(zip(test_pairs[0], test_pairs[1])))):
        positions = corpus.word_positions(english)
        indices = corpus.to_indices(english)
        translation, attention = greedy(encoder, decoder, indices, positions,
                                        corpus.dict_f.word2index,
                                        corpus.dict_f.index2word,
                                        max_length, enable_cuda)

        # Attention visualisation for one fixed example sentence
        if i == 35 and transformer:
            data = [go.Heatmap(z=attention, x=english, y=translation,
                               colorscale='Viridis')]
            layout = go.Layout(width=800, height=600)
            fig = go.Figure(data=data, layout=layout)
            py.image.save_as(fig, filename='weights_{}.png'.format(epoch))
            attention1 = encoder.layer1.attention.last_weights1
            attention2 = encoder.layer1.attention.last_weights2
            attention3 = encoder.layer1.attention.last_weights3
            with open("weights_{}.txt".format(epoch), 'w') as f:
                for att in (attention1, attention2, attention3):
                    f.write("\n".join(["\t".join([str(num) for num in line])
                                       for line in att]))
                    f.write("\n")
                f.write("\t".join(english))
                f.write("\t".join(translation))
        elif i == 35:
            data = [go.Heatmap(z=attention, x=english, y=translation,
                               colorscale='Viridis')]
            layout = go.Layout(width=800, height=600)
            fig = go.Figure(data=data, layout=layout)
            py.image.save_as(fig, filename='weights_{}.png'.format(epoch))
            with open("weights_{}.txt".format(epoch), 'w') as f:
                f.write("\n".join(["\t".join([str(num) for num in line])
                                   for line in attention]))
                f.write("\n")
                f.write("\t".join(english))
                f.write("\t".join(translation))

        french = clean(corpus.bpe_to_sentence(french))
        translation = clean(corpus.bpe_to_sentence(translation))
        scores_bleu.append(sentence_bleu([french], translation,
                                         smoothing_function=chencherry.method1))
        scores_ter.append(pyter.ter(translation, french))
        greedy_ref.write(" ".join(french) + "\n")
        greedy_hyp.write(" ".join(translation) + "\n")
    greedy_ref.close()
    greedy_hyp.close()

    score_bleu = sum(scores_bleu) / len(scores_bleu)
    score_ter = sum(scores_ter) / len(scores_ter)
    logging.info("Greedy, BLEU: {}, TER: {}".format(score_bleu, score_ter))

    # Beam-search decoding
    scores_bleu = []
    scores_ter = []
    beam_ref = open("beam.ref", 'w', encoding='utf8')
    beam_hyp = open("beam.hyp", 'w', encoding='utf8')
    lengths = []
    for i, (english, french) in tqdm(enumerate(list(zip(test_pairs[0], test_pairs[1])))):
        positions = corpus.word_positions(english)
        indices = corpus.to_indices(english)
        translation, attention = beam(encoder, decoder, indices, positions,
                                      corpus.dict_f.word2index,
                                      corpus.dict_f.index2word,
                                      max_length, enable_cuda)

        if i == 35:
            # Attention visualisation (the original reused a stale loop index
            # from the greedy loop here; enumerate makes the intent explicit)
            data = [go.Heatmap(z=attention, x=english, y=translation,
                               colorscale='Viridis')]
            layout = go.Layout(width=800, height=600)
            fig = go.Figure(data=data, layout=layout)
            py.image.save_as(fig, filename='weights_{}.png'.format(epoch))
            with open("weights_{}.txt".format(epoch), 'w') as f:
                f.write("\n".join(["\t".join([str(num) for num in line])
                                   for line in attention]))
                f.write("\n")
                f.write("\t".join(english))
                f.write("\t".join(translation))

        french = clean(corpus.bpe_to_sentence(french))
        translation = clean(corpus.bpe_to_sentence(translation))
        scores_bleu.append(sentence_bleu([french], translation,
                                         smoothing_function=chencherry.method1))
        scores_ter.append(pyter.ter(translation, french))
        beam_ref.write(" ".join(french) + "\n")
        beam_hyp.write(" ".join(translation) + "\n")
        lengths.append(len(french))
    beam_ref.close()
    beam_hyp.close()

    score_bleu = sum(scores_bleu) / len(scores_bleu)
    score_ter = sum(scores_ter) / len(scores_ter)
    logging.info("Beam, BLEU: {}, TER: {}".format(score_bleu, score_ter))

    with open("lengths.txt", 'w') as f:
        f.write("\n".join([str(l) for l in lengths]))
    with open("bleu.txt", 'w') as f:
        f.write("\n".join([str(l) for l in scores_bleu]))
    with open("ter.txt", 'w') as f:
        f.write("\n".join([str(l) for l in scores_ter]))
def ter(rw, hw):
    # TER between reference words `rw` and hypothesis words `hw`,
    # rounded to two decimals and expressed as a percentage
    terScore = 100 * float("{:.2f}".format(pyter.ter(hw, rw)))
    return terScore
def get_ter_score(candidate, reference):
    return pyter.ter(candidate.split(), reference.split())
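# Quick sanity check for the wrapper above (requires `pyter`): identical
# sentences score 0.0; each edit raises the rate.
import pyter

assert get_ter_score('hello world', 'hello world') == 0.0
print(get_ter_score('hello there world', 'hello world'))  # 1 edit / 2 ref words = 0.5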
n_best = 50
num_pred = 1000  # default = 100
predictions = readcsv_to_df(file_pth, num_pred)
hyps_data = clear_pad(predictions)  # <class 'pandas.core.frame.DataFrame'>

# Step 1: for one sequence, take the top prediction as the reference and the
# remaining predictions as hypotheses, then compute TER scores against it.
ter_scores = [0.0] * len(hyps_data.index)
list_of_hyps = hyps_data["Predictions"].to_numpy(dtype=str)
ref = list_of_hyps[0]
for i, hyp in enumerate(list_of_hyps):
    # Note: pyter.ter over raw strings compares characters; tokenised input
    # (hyp.split(), ref.split()) may be what is actually intended here.
    ter_scores[i] = pyter.ter(hyp, ref)
hyps_data["TER scores"] = ter_scores

# Step 2: combine quality scores and TER scores as the selection criterion.
#   Scores:     the higher, the better the quality
#   TER scores: the higher, the larger the difference (diversity)
# Simple version: z_scores = beta * Scores + alpha * TER_scores, where beta
# can be 0 when quality scores should play no role; here alpha = beta = 1.
hyps_data["Z_scores"] = hyps_data["Scores"] + hyps_data["TER scores"]

# Sort the data within every block of n_best sequences
sorted_data = pd.DataFrame(columns=hyps_data.columns)
def compute_ter_score(hyp, ref):
    return pyter.ter(hyp, ref)
def train(self, savepoint=None):
    # Python 2 / TF1-style training loop, kept in its original dialect
    print 'Start training...'
    with tf.Session() as sess:
        init = tf.global_variables_initializer()
        sess.run(init)
        if savepoint is not None:
            tf.train.Saver().restore(sess, savepoint)
        last_val_ter = 100.0
        patience_counter = 0
        for epoch in range(1000):
            print '[Epoch #' + str(epoch) + ']'
            train_pair_list = self._prepareTrainPairList()
            percent = 0
            for i in range(0, len(train_pair_list) - self.batch_size,
                           self.batch_size):
                feed_dict = self._prepareTrainFeedDictList(train_pair_list, i)
                # _, caption, wp_loss, seq_loss, prior_factor = sess.run(
                #     [self.train_step, self.caption, self.word_predict_loss,
                #      self.seq_loss, self.prior_factor], feed_dict=feed_dict)
                _, caption, seq_loss = sess.run(
                    [self.train_step, self.caption, self.seq_loss],
                    feed_dict=feed_dict)
                caption_str = self.data.tokenListToCaption(
                    [self.data.word_list[word] for word in caption[0].tolist()])
                print 'Caption: "{}", Seq loss: {}'.format(caption_str, seq_loss)
                if i * 100 / len(train_pair_list) > percent:
                    percent += 1
                    print '{}%'.format(percent)

            if epoch > 1:
                # Validate with TER against one randomly chosen reference
                # caption; a disabled BLEU-based variant is kept below.
                ter_score_list = []
                for i in range(len(self.data.val_feat_list)):
                    feed_dict = self._prepareTestFeedDictList(
                        self.data.val_feat_list, i)
                    caption = sess.run(self.caption, feed_dict=feed_dict)
                    caption_str = self.data.tokenListToCaption(
                        [self.data.word_list[word]
                         for word in caption[0].tolist()])
                    '''
                    bleu_list = []
                    for ref_caption in self.data.val_caption_str_list[i]:
                        if caption_str != '':
                            #bleu = bleu_eval.BLEU_fn(caption_str, ref_caption)
                            bleu = pyter.ter(caption_str, ref_caption)
                        else:
                            bleu = 0.0
                        bleu_list.append(bleu)
                    mean_bleu = np.mean(bleu_list)
                    max_bleu = max(bleu_list)
                    '''
                    # Pick the reference once so the scored and the printed
                    # caption match (the original sampled random.choice twice)
                    ref_caption = random.choice(self.data.val_caption_str_list[i])
                    ter_score = pyter.ter(caption_str, ref_caption)
                    ter_score_list.append(ter_score)
                    print 'Caption: "{}", Correct: {}, TER: {}'.format(
                        caption_str, ref_caption, ter_score)
                val_ter = np.mean(ter_score_list)
                print 'Validation TER: {}'.format(val_ter)

                # Early stopping on validation TER
                if val_ter > last_val_ter:
                    patience_counter += 1
                    print 'Patience Counter: {}'.format(patience_counter)
                    if patience_counter > self.patience:
                        break
                else:
                    patience_counter = 0
                last_val_ter = val_ter
                tf.train.Saver().save(sess, self.save_path, global_step=epoch)
def ter(hyp, ref):
    return pyter.ter(hyp, ref)
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
# MA 02110-1301, USA.
#
import sys
import argparse

import pyter
from nltk.tokenize import word_tokenize

# Python 2 idiom to force UTF-8 handling of the input files
reload(sys)
sys.setdefaultencoding("utf-8")

parser = argparse.ArgumentParser()
parser.add_argument("hyptextfile", help="Hypothesis sentences")
parser.add_argument("reftextfile", help="Reference sentences")
parser.add_argument("resultfile", help="Result file")
args = parser.parse_args()

reftext = open(args.reftextfile).readlines()
hyptext = open(args.hyptextfile).readlines()
result = open(args.resultfile, "w")

# zip yields (reference line, hypothesis line) pairs; the original tokenized
# the reference into `tokenizedhyp` and vice versa, which is fixed here.
# Scaling per-line TER by the reference length turns it into an edit count
# that can be summed into a corpus-level figure.
for ref_line, hyp_line in zip(reftext, hyptext):
    tokenizedref = word_tokenize(ref_line)
    tokenizedhyp = word_tokenize(hyp_line)
    result.write("{0}\n".format(
        pyter.ter(tokenizedhyp, tokenizedref) * len(tokenizedref)))