import argparse

import sacrebleu
from sacremoses import MosesDetokenizer


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--translation_file', type=str,
                        help='File with translations (must be desegmented)')
    parser.add_argument('--reference_file', type=str,
                        help='Reference file for target language')
    parser.add_argument('--tgt_lang', type=str, default='de',
                        help='Target language code')
    args = parser.parse_args()

    # Detokenize the files
    detok = MosesDetokenizer(lang=args.tgt_lang)
    with open(args.translation_file, 'r') as f:
        translations = [detok.detokenize(l.strip().split()) for l in f]
    with open(args.reference_file, 'r') as f:
        refs = [detok.detokenize(l.strip().split()) for l in f]
    refs = [refs]  # sacrebleu expects a list of reference streams

    # Compute BLEU score
    bleu = sacrebleu.corpus_bleu(translations, refs)
    print(bleu.score)
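# A minimal, self-contained sketch of the same sacrebleu call on in-memory
# strings (illustrative data only; assumes sacrebleu is installed). Note the
# extra nesting on the references: sacrebleu takes a list of reference
# streams, each stream covering the whole corpus.
import sacrebleu

hyps = ["The cat sat on the mat."]
refs = [["The cat sat on the mat."]]  # one reference stream
print(sacrebleu.corpus_bleu(hyps, refs).score)  # 100.0 for an exact match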
def eval_epoch_end(self, outputs, mode):
    eval_loss = self.eval_loss.compute()
    translations = list(itertools.chain(*[x['translations'] for x in outputs]))
    ground_truths = list(itertools.chain(*[x['ground_truths'] for x in outputs]))

    # TODO: add target language so detokenizer can be lang specific.
    detokenizer = MosesDetokenizer()
    translations = [detokenizer.detokenize(sent.split()) for sent in translations]
    ground_truths = [detokenizer.detokenize(sent.split()) for sent in ground_truths]
    assert len(translations) == len(ground_truths)

    sacre_bleu = corpus_bleu(translations, [ground_truths], tokenize="13a")
    dataset_name = "Validation" if mode == 'val' else "Test"
    logging.info(f"\n\n\n\n{dataset_name} set size: {len(translations)}")
    logging.info(f"{dataset_name} Sacre BLEU = {sacre_bleu.score}")
    logging.info(f"{dataset_name} TRANSLATION EXAMPLES:".upper())
    for i in range(0, 3):
        ind = random.randint(0, len(translations) - 1)
        # U+0332 (combining low line) underlines the heading in the log output.
        logging.info("    " + '\u0332'.join(f"EXAMPLE {i}:"))
        logging.info(f"    Prediction:   {translations[ind]}")
        logging.info(f"    Ground Truth: {ground_truths[ind]}")

    ans = {f"{mode}_loss": eval_loss, f"{mode}_sacreBLEU": sacre_bleu.score}
    ans['log'] = dict(ans)
    return ans
def search_nli(self, prem_token_inflections, hypo_token_inflections,
               orig_prem_tokenized, orig_hypo_tokenized, original_loss,
               label, conservative=True, backward=False):
    perturbed_prem_tokenized = orig_prem_tokenized.copy()
    perturbed_hypo_tokenized = orig_hypo_tokenized.copy()
    max_loss = original_loss
    num_queries = 0
    max_predicted = label
    if backward:
        prem_token_inflections = reversed(prem_token_inflections)
        hypo_token_inflections = reversed(hypo_token_inflections)
    detokenizer = MosesDetokenizer(lang='en')

    # Greedily perturb the hypothesis first, one token position at a time.
    premise = detokenizer.detokenize(perturbed_prem_tokenized)
    for curr_token in hypo_token_inflections:
        max_infl = orig_hypo_tokenized[curr_token[0]]
        for infl in curr_token[1]:
            perturbed_hypo_tokenized[curr_token[0]] = infl
            perturbed = detokenizer.detokenize(perturbed_hypo_tokenized)
            loss, predicted = self.get_loss(premise, perturbed, label)
            num_queries += 1
            if loss > max_loss:
                max_loss = loss
                max_infl = infl
                max_predicted = predicted
                if conservative and predicted != label:
                    break
        perturbed_hypo_tokenized[curr_token[0]] = max_infl

    # Then perturb the premise against the (already perturbed) hypothesis.
    hypothesis = detokenizer.detokenize(perturbed_hypo_tokenized)
    for curr_token in prem_token_inflections:
        max_infl = orig_prem_tokenized[curr_token[0]]
        for infl in curr_token[1]:
            perturbed_prem_tokenized[curr_token[0]] = infl
            perturbed = detokenizer.detokenize(perturbed_prem_tokenized)
            loss, predicted = self.get_loss(perturbed, hypothesis, label)
            num_queries += 1
            if loss > max_loss:
                max_loss = loss
                max_infl = infl
                max_predicted = predicted
                if conservative and predicted != label:
                    break
        perturbed_prem_tokenized[curr_token[0]] = max_infl

    premise = detokenizer.detokenize(perturbed_prem_tokenized)
    return premise, hypothesis, max_loss, max_predicted, num_queries
def postprocess(sents, lang, common_lang="hi"):
    """
    Postprocess a list of translated sentences: convert the script back to the
    native Indic script (for Indic target languages) and detokenize.

    sents: list of translated sentences
    lang: target language
    common_lang: common script/language used during training (default 'hi')
    """
    postprocessed_sents = []
    if lang == "en":
        en_detok = MosesDetokenizer(lang="en")
        for sent in sents:
            postprocessed_sents.append(en_detok.detokenize(sent.split(" ")))
    else:
        xliterator = unicode_transliterate.UnicodeIndicTransliterator()
        for sent in sents:
            outstr = indic_detokenize.trivial_detokenize(
                xliterator.transliterate(sent, common_lang, lang), lang)
            postprocessed_sents.append(outstr)

    postprocessed_sents = [i.replace("<unk>", "") for i in postprocessed_sents]
    return postprocessed_sents
import sys

from sacremoses import MosesDetokenizer


def main():
    assert len(sys.argv) == 3, 'Usage: detokenizer.py $input $output'
    with open(sys.argv[1]) as f:
        tokens = f.read().split(' ')
    md = MosesDetokenizer(lang='en')
    with open(sys.argv[2], 'w') as out:
        out.write(md.detokenize(tokens, unescape=False))
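# Hedged illustration of the unescape flag used above (assuming sacremoses
# semantics): unescape=True, the default, converts Moses XML entities such as
# &#124; back to literal characters, while unescape=False leaves them escaped.
from sacremoses import MosesDetokenizer

md = MosesDetokenizer(lang='en')
tokens = ['a', '&#124;', 'b']
print(md.detokenize(tokens))                  # entities restored: 'a | b'
print(md.detokenize(tokens, unescape=False))  # entities kept: 'a &#124; b'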
def span_tokenize(
    self,
    text,
    aggressive_dash_splits=False,
    escape=True,
    protected_patterns=None,
):
    # https://stackoverflow.com/a/35634472
    import re

    detokenizer = MosesDetokenizer(lang=self.lang)
    tokens = self.tokenize(text=text,
                           aggressive_dash_splits=aggressive_dash_splits,
                           return_str=False,
                           escape=escape,
                           protected_patterns=protected_patterns)
    tail = text
    accum = 0
    tokens_spans = []
    for token in tokens:
        # Detokenize the single token so it matches its surface form in `text`,
        # then locate it in the not-yet-consumed tail of the original string.
        detokenized_token = detokenizer.detokenize(tokens=[token],
                                                   return_str=True,
                                                   unescape=escape)
        escaped_token = re.escape(detokenized_token)
        m = re.search(escaped_token, tail)
        tok_start_pos, tok_end_pos = m.span()
        sent_start_pos = accum + tok_start_pos
        sent_end_pos = accum + tok_end_pos
        accum += tok_end_pos
        tail = tail[tok_end_pos:]
        tokens_spans.append((detokenized_token, (sent_start_pos, sent_end_pos)))
    return tokens_spans
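# A standalone sketch of the span-recovery idea behind span_tokenize (the
# surrounding tokenizer class is not shown here), using sacremoses directly:
# detokenize each token to its surface form, then find it in the
# not-yet-consumed tail of the original text.
import re
from sacremoses import MosesDetokenizer, MosesTokenizer

text = "Hello, world!"
tok = MosesTokenizer(lang='en')
detok = MosesDetokenizer(lang='en')
tail, accum, spans = text, 0, []
for token in tok.tokenize(text, escape=False):
    piece = detok.detokenize([token])
    start, end = re.search(re.escape(piece), tail).span()
    spans.append((piece, (accum + start, accum + end)))
    accum += end
    tail = tail[end:]
print(spans)  # [('Hello', (0, 5)), (',', (5, 6)), ('world', (7, 12)), ('!', (12, 13))]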
class MosesTokenizer(Tokenizer):

    def __init__(self, language, glossaries=None,
                 aggressive_dash_splits=True, escape=False):
        super(MosesTokenizer, self).__init__(language=language,
                                             glossaries=glossaries)
        self._aggressive_dash_splits = aggressive_dash_splits
        self._escape = escape
        try:
            from sacremoses import MosesDetokenizer as MDetok
            from sacremoses import MosesTokenizer as MTok
            self._tok = MTok(lang=self.language)
            self._detok = MDetok(lang=self.language)
        except ImportError:
            raise ImportError(
                'Please install Moses tokenizer with: pip3 install sacremoses')

    def tokenize(self, text, return_str=False):
        return self._tok.tokenize(
            self._convert_to_str(text),
            aggressive_dash_splits=self._aggressive_dash_splits,
            return_str=return_str,
            escape=self._escape,
            protected_patterns=self._glossaries)

    def detokenize(self, text, return_str=True):
        return self._detok.detokenize(self._convert_to_list(text),
                                      return_str=return_str,
                                      unescape=True)
class MosesProcessor:
    """
    Tokenizer, Detokenizer and Normalizer utilities in Moses
    """

    def __init__(self, lang_id: str):
        self.moses_tokenizer = MosesTokenizer(lang=lang_id)
        self.moses_detokenizer = MosesDetokenizer(lang=lang_id)
        self.normalizer = MosesPunctNormalizer(lang=lang_id)

    def detokenize(self, tokens: List[str]) -> str:
        """
        Detokenizes a list of tokens

        Args:
            tokens: list of strings as tokens

        Returns:
            detokenized string
        """
        return self.moses_detokenizer.detokenize(tokens)

    def tokenize(self, text: str):
        """
        Tokenizes text using Moses.
        """
        return self.moses_tokenizer.tokenize(text, escape=False, return_str=True)

    def normalize(self, text: str):
        return self.normalizer.normalize(text)
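# Hypothetical round trip through MosesProcessor above, assuming the sacremoses
# MosesTokenizer / MosesDetokenizer / MosesPunctNormalizer classes back it.
processor = MosesProcessor(lang_id='en')
normalized = processor.normalize("Hello, world!")
tokens = processor.tokenize(normalized).split()  # tokenize() returns a string
print(tokens)                        # ['Hello', ',', 'world', '!']
print(processor.detokenize(tokens))  # Hello, world!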
class IndicProcessor:
    """
    Tokenizer, Detokenizer and Normalizer utilities for Indic languages.
    Currently supports: 'hi'
    """

    def __init__(self, lang_id: str):
        if lang_id != 'hi':
            raise NotImplementedError
        self.moses_tokenizer = MosesTokenizer(lang=lang_id)
        self.moses_detokenizer = MosesDetokenizer(lang=lang_id)
        self.normalizer = MosesPunctNormalizer(lang=lang_id)

    def detokenize(self, tokens: List[str]) -> str:
        """
        Detokenizes a list of tokens

        Args:
            tokens: list of strings as tokens

        Returns:
            detokenized string
        """
        return self.moses_detokenizer.detokenize(tokens)

    def tokenize(self, text: str):
        return text

    def normalize(self, text: str):
        return text
def process_tokens(token_folder_name):
    """Processes data, outputs tokens and labels.

    :param token_folder_name: name of the directory containing tokenized data
    :return: list of list of tokens, list of list of labels,
             list of sentence texts, list of sentence labels
    """
    m = MosesDetokenizer()
    tokens_by_sent, labels_by_sent, sent_text, sent_label = [], [], [], []
    for file_ in os.listdir(token_folder_name):
        with open(os.path.join(token_folder_name, file_), 'r',
                  encoding='utf-8') as f:
            f = f.readlines()
            sent = []
            for line in f:
                line = line.split()
                if line:
                    sent.append(line)
                else:
                    words, labels = zip(*sent)
                    detokenized_sent = m.detokenize(tokens=words, return_str=True)
                    cum_label = 0 if all(label == 'O' for label in labels) else 1
                    tokens_by_sent.append(words)
                    labels_by_sent.append(labels)
                    sent_text.append(detokenized_sent)
                    sent_label.append(cum_label)
                    sent = []
    return tokens_by_sent, labels_by_sent, sent_text, sent_label
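# Hypothetical input format assumed by process_tokens above: one
# "token<whitespace>label" pair per line, with a blank line closing each
# sentence (note a sentence is only flushed when a blank line follows it):
#
#   John   B-PER
#   lives  O
#   here   O
#   .      O
#
#   (blank line, then the next sentence)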
class PyMosesTokenizer(GenericTokenizer):
    """
    The call to the standard Moses tokenizer
    """

    def __init__(self, lang, lowercase):
        self.mpn = MosesPunctNormalizer()
        self.tokenizer = MosesTokenizer(lang=lang)
        self.detokenizer = MosesDetokenizer(lang=lang)
        self.lowercase = lowercase
        self.lang = lang

    def tokenize(self, text):
        return self.tokenizer.tokenize(
            self.mpn.normalize(text.lower() if self.lowercase else text))

    def detokenize(self, tokenized_list):
        temp_result = ""
        t_list_len = len(tokenized_list)
        for t_ind, token in enumerate(tokenized_list):
            # Glue possessive "'" + "s" and "/" directly onto the previous token.
            apos_cnd = (token == "'" and t_ind < t_list_len - 1
                        and tokenized_list[t_ind + 1] == "s")
            if apos_cnd or token == "/":
                temp_result = temp_result.strip() + token
            else:
                temp_result += token + " "
        f_result = self.detokenizer.detokenize(temp_result.strip().split())
        # Drop the stray space between final punctuation and a closing quote.
        if (len(f_result) > 3 and f_result[-3] in string.punctuation
                and f_result[-2] == " " and f_result[-1] == "\""):
            f_result = f_result[:-2] + f_result[-1]
        return f_result

    @property
    def model_name(self):
        return "Moses"
class MosesTokenizer(object):

    @staticmethod
    def add_args(parser):
        # fmt: off
        parser.add_argument('--moses-source-lang', default='en', metavar='SRC',
                            help='source language')
        parser.add_argument('--moses-target-lang', default='en', metavar='TARGET',
                            help='target language')
        parser.add_argument('--moses-no-dash-splits', action='store_true', default=False,
                            help='don\'t apply dash split rules')
        parser.add_argument('--moses-no-escape', action='store_true', default=False,
                            help='don\'t perform HTML escaping on apostrophe, quotes, etc.')
        # fmt: on

    def __init__(self, args):
        self.args = args
        try:
            from sacremoses import MosesTokenizer, MosesDetokenizer
            self.tok = MosesTokenizer(args.moses_source_lang)
            self.detok = MosesDetokenizer(args.moses_target_lang)
        except ImportError:
            raise ImportError('Please install Moses tokenizer with: pip install sacremoses')

    def encode(self, x: str) -> str:
        return self.tok.tokenize(
            x,
            aggressive_dash_splits=(not self.args.moses_no_dash_splits),
            return_str=True,
            escape=(not self.args.moses_no_escape),
        )

    def decode(self, x: str) -> str:
        return self.detok.detokenize(x.split())
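# A minimal sketch of driving the fairseq-style wrapper above without the full
# fairseq CLI, using a bare argparse.Namespace in place of parsed args.
from argparse import Namespace

args = Namespace(moses_source_lang='en', moses_target_lang='en',
                 moses_no_dash_splits=False, moses_no_escape=True)
moses = MosesTokenizer(args)
encoded = moses.encode("Hello, world!")
print(encoded)                # Hello , world !
print(moses.decode(encoded))  # Hello, world!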
def search_qa(self, token_inflections, orig_tokenized, original_loss,
              question_dict, context, conservative=True, backward=False):
    perturbed_tokenized = orig_tokenized.copy()
    max_loss = original_loss
    num_queries = 0
    max_predicted = ''
    if backward:
        token_inflections = reversed(token_inflections)
    detokenizer = MosesDetokenizer(lang='en')
    for curr_token in token_inflections:
        max_infl = orig_tokenized[curr_token[0]]
        for infl in curr_token[1]:
            perturbed_tokenized[curr_token[0]] = infl
            perturbed = detokenizer.detokenize(perturbed_tokenized)
            loss, predicted = self.get_loss(perturbed, question_dict, context)
            num_queries += 1
            if loss > max_loss:
                max_loss = loss
                max_infl = infl
                max_predicted = predicted
                if conservative and metric_max_over_ground_truths(
                        compute_f1, predicted,
                        question_dict['gold_texts']) == 0:
                    break
        perturbed_tokenized[curr_token[0]] = max_infl
    return perturbed_tokenized, max_loss, max_predicted, num_queries
class MosesTokenizer(Tokenizer):

    def __init__(self):
        super().__init__()
        self._tokenizer = SacreMosesTokenizer()
        self._detokenizer = MosesDetokenizer()

    def tokenize(self, sentence):
        return self._tokenizer.tokenize(sentence)

    def detokenize(self, tokens):
        """Unescape Moses punctuation tokens.

        Replaces escape sequences like ``&#91;`` with the original characters
        (such as '['), so they better align to the original text.
        """
        return [self._detokenizer.unescape_xml(t) for t in tokens]

    def detokenize_ptb(self, tokens):
        # Not a perfect detokenizer, but a "good-enough" stand-in.
        rep_dict = {
            "-LSB-": "[",
            "-RSB-": "]",
            "-LRB-": "(",
            "-RRB-": ")",
            "-LCB-": "{",
            "-RCB-": "}",
            "``": '"',
            "''": '"',
        }
        str1 = self._detokenizer.detokenize(replace_list(tokens, rep_dict))
        return str1
class Tokenizer:

    def __init__(self):
        self.detokenizer = MosesDetokenizer(lang='en')

    def tokenize(self, s):
        tokens = []
        for sentence in sent_tokenize(s):
            # remove underscores
            sentence = re.sub(r'_', r' ', sentence)
            # split basic camel case, lowercase first letters
            sentence = re.sub(
                r"([a-z])([A-Z])",
                lambda m: rf"{m.group(1)} {m.group(2).lower()}",
                sentence)
            # NLTK word tokenize
            tokens += word_tokenize(sentence)
        res = " ".join(tokens)
        return res

    def detokenize(self, s):
        tokens = s.split()
        return self.detokenizer.detokenize(tokens)
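# Usage sketch for the camelCase-aware tokenizer above; assumes NLTK with the
# 'punkt' model downloaded, plus sacremoses.
t = Tokenizer()
print(t.tokenize("helloWorld quickly"))  # hello world quickly
print(t.detokenize("hello , world !"))   # hello, world!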
def search_nmt(self, token_inflections, orig_tokenized, original,
               original_bleu, reference, backward=False):
    perturbed_tokenized = orig_tokenized.copy()
    max_bleu = original_bleu
    num_queries = 0
    max_predicted = ''
    if backward:
        token_inflections = reversed(token_inflections)
    detokenizer = MosesDetokenizer(lang='en')
    for curr_token in token_inflections:
        max_infl = orig_tokenized[curr_token[0]]
        for infl in curr_token[1]:
            perturbed_tokenized[curr_token[0]] = infl
            perturbed = detokenizer.detokenize(perturbed_tokenized)
            curr_bleu, predicted = self.get_bleu(perturbed, reference)
            num_queries += 1
            if curr_bleu < max_bleu:  # the smaller, the better
                max_bleu = curr_bleu
                max_infl = infl
                max_predicted = predicted
        perturbed_tokenized[curr_token[0]] = max_infl
    return perturbed_tokenized, max_bleu, max_predicted, num_queries
def search_seq2seq(self, token_inflections, orig_tokenized, original,
                   original_score, reference, backward=False):
    perturbed_tokenized = orig_tokenized.copy()
    max_score = original_score
    num_queries = 0
    max_predicted = ''
    if backward:
        token_inflections = reversed(token_inflections)
    detokenizer = MosesDetokenizer(lang='en')
    for curr_token in token_inflections:
        max_infl = orig_tokenized[curr_token[0]]
        for infl in curr_token[1]:
            perturbed_tokenized[curr_token[0]] = infl
            perturbed = detokenizer.detokenize(perturbed_tokenized)
            curr_score, predicted = self.get_score(perturbed, reference)
            num_queries += 1
            if curr_score < max_score:
                max_score = curr_score
                max_infl = infl
                max_predicted = predicted
        perturbed_tokenized[curr_token[0]] = max_infl
    return perturbed_tokenized, max_score, max_predicted, num_queries
class Detokenizer(BatchProcessor):

    def __init__(self, lang):
        self.handler = MosesDetokenizer(lang=lang)

    def process(self, input):
        return self.handler.detokenize(clean_list(input.split()))
def detokenize(l_data):
    """General detokenizing for augmented train informal data."""
    print("detokenizing...")
    detokenizer = MosesDetokenizer(lang='en')
    # file_root = '../hybrid/'
    # infile = file_root + 'train_decap_agmt.informal.raw'
    # outfile = file_root + 'train_decap_agmt_2.informal.raw'
    # df = pd.read_csv(infile, header=None, sep='\t', quoting=csv.QUOTE_NONE, encoding='utf-8')
    # df = list(df[0])
    data = []
    for line in tqdm(l_data):
        # Detokenize repeatedly until the string reaches a fixed point.
        while detokenizer.detokenize(line.split()) != line:
            line = detokenizer.detokenize(line.split())
        data.append(line)
    return data
def evaluate_qg(qg_src, qg_ref, qg_result):
    detokenizer = MosesDetokenizer()
    hypos = generate_questions(load_file_lines(qg_src))
    refs = load_file_lines(qg_ref)
    refs_detok = []
    for ref in refs:
        refs_detok.append(detokenizer.detokenize(ref.split()))
    write_result(qg_result, hypos)
    return evaluate(hypos, [refs_detok])
def generate_questions(sentence_list):
    qg = xlingqg.QuestionGenerator()
    detokenizer = MosesDetokenizer(lang='en')
    generated_questions = []
    for sentence in tqdm(sentence_list):
        question = qg.generate_question(sentence)
        generated_questions.append(detokenizer.detokenize(question))
    return generated_questions
class Detokenizer:

    def __init__(self, language):
        self.language = language
        self.detokenizer = MosesDetokenizer(lang=language)

    def __repr__(self):
        return f"Detokenizer({self.language})"

    def __call__(self, line):
        return self.detokenizer.detokenize(line.split())
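# Usage sketch for the callable wrapper above.
detok = Detokenizer('en')
print(repr(detok))               # Detokenizer(en)
print(detok("Hello , world !"))  # Hello, world!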
def get_moses_detokenizer(lang):
    try:
        moses_detokenizer = MosesDetokenizer(lang=lang)
    except Exception:
        print("WARNING: Moses doesn't have a tokenizer for", lang)
        moses_detokenizer = MosesDetokenizer(lang=MOSES_TOKENIZER_DEFAULT_LANG)
    # string IN -> string OUT
    tokenizer = lambda x: moses_detokenizer.detokenize(x.split(), return_str=True)
    return tokenizer
def generate_tweet(chain, length=0):
    """Generate a tweet of a given length (or until it ends naturally)."""
    detokenizer = MosesDetokenizer()
    size = 1
    sentence = [get_next_word(chain, "START")]
    while length == 0 or size < length:
        next_word = get_next_word(chain, sentence[-1])
        if next_word == "END":
            break
        sentence.append(next_word)
        size += 1
    return detokenizer.detokenize(sentence)
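# Hypothetical shapes for the pieces generate_tweet relies on (neither `chain`
# nor get_next_word is shown in this snippet): a dict mapping each word to its
# candidate successors, sampled uniformly, with "START"/"END" as sentinels.
import random

def get_next_word(chain, word):
    return random.choice(chain[word])

chain = {"START": ["hello"], "hello": ["world"], "world": ["END"]}
print(generate_tweet(chain))  # hello world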
def postprocess(target_lang):
    md = MosesDetokenizer(lang=target_lang)
    # The input comes from Marian, which outputs UTF-8.
    utf8_stdin = io.TextIOWrapper(sys.stdin.buffer, encoding='utf-8')
    for line in utf8_stdin:
        desegmented = line.replace("@@ ", "")  # undo BPE segmentation
        detokenized = md.detokenize(desegmented.split())
        sys.stderr.write("sentence processed\n")
        sys.stdout.buffer.write((detokenized + "\n").encode("utf-8"))
        sys.stdout.flush()
def generate_answers(config, model, processor, qn_uuid_data,
                     context_token_data, qn_token_data):
    uuid2ans = {}  # maps uuid to string containing predicted answer
    data_size = len(qn_uuid_data)
    num_batches = ((data_size - 1) // config.batch_size) + 1
    batch_num = 0
    detokenizer = MosesDetokenizer()

    print("Generating answers...")

    for batch in get_batch_generator(processor.word2id, qn_uuid_data,
                                     context_token_data, qn_token_data,
                                     config.batch_size, config.context_len,
                                     config.question_len):

        # Get the predicted spans
        pred_start_batch, pred_end_batch = processor.test_one_batch(batch, model)

        # Convert pred_start_batch and pred_end_batch to lists of length batch_size
        pred_start_batch = pred_start_batch.tolist()
        pred_end_batch = pred_end_batch.tolist()

        # For each example in the batch:
        for ex_idx, (pred_start, pred_end) in enumerate(
                zip(pred_start_batch, pred_end_batch)):

            # Original context tokens (no UNKs or padding) for this example
            context_tokens = batch.context_tokens[ex_idx]  # list of strings

            # Check the predicted span is in range
            assert pred_start in range(len(context_tokens))
            assert pred_end in range(len(context_tokens))

            # Predicted answer tokens
            pred_ans_tokens = context_tokens[pred_start:pred_end + 1]  # list of strings

            # Detokenize and add to dict
            uuid = batch.uuids[ex_idx]
            uuid2ans[uuid] = detokenizer.detokenize(pred_ans_tokens, return_str=True)

        batch_num += 1

        if batch_num % 10 == 0:
            print("Generated answers for %i/%i batches = %.2f%%" %
                  (batch_num, num_batches, batch_num * 100.0 / num_batches))

    print("Finished generating answers for dataset.")

    return uuid2ans
def postprocess(infname, outfname, input_size, lang, common_lang="hi",
                transliterate=False):
    """
    parse fairseq interactive output, convert the script back to native Indic
    script (in case of Indic languages) and detokenize.

    infname: fairseq log file
    outfname: output file of translation (sentences not translated contain
              the dummy string 'DUMMY_OUTPUT')
    input_size: expected number of output sentences
    lang: language
    """
    consolidated_testoutput = []
    # with open(infname, 'r', encoding='utf-8') as infile:
    #     consolidated_testoutput = list(map(lambda x: x.strip(), filter(lambda x: x.startswith('H-'), infile)))
    #     consolidated_testoutput.sort(key=lambda x: int(x.split('\t')[0].split('-')[1]))
    #     consolidated_testoutput = [x.split('\t')[2] for x in consolidated_testoutput]

    consolidated_testoutput = [(x, 0.0, "") for x in range(input_size)]
    temp_testoutput = []
    with open(infname, "r", encoding="utf-8") as infile:
        temp_testoutput = list(
            map(
                lambda x: x.strip().split("\t"),
                filter(lambda x: x.startswith("H-"), infile),
            ))
        temp_testoutput = list(
            map(lambda x: (int(x[0].split("-")[1]), float(x[1]), x[2]),
                temp_testoutput))
        for sid, score, hyp in temp_testoutput:
            consolidated_testoutput[sid] = (sid, score, hyp)
        consolidated_testoutput = [x[2] for x in consolidated_testoutput]

    if lang == "en":
        en_detok = MosesDetokenizer(lang="en")
        with open(outfname, "w", encoding="utf-8") as outfile:
            for sent in consolidated_testoutput:
                outfile.write(en_detok.detokenize(sent.split(" ")) + "\n")
    else:
        xliterator = unicode_transliterate.UnicodeIndicTransliterator()
        with open(outfname, "w", encoding="utf-8") as outfile:
            for sent in consolidated_testoutput:
                if transliterate:
                    outstr = indic_detokenize.trivial_detokenize(
                        xliterator.transliterate(sent, common_lang, lang), lang)
                else:
                    outstr = indic_detokenize.trivial_detokenize(sent, lang)
                outfile.write(outstr + "\n")
def save_predictions(preds, evaluate_path, detokenize):
    md = MosesDetokenizer()
    with open(evaluate_path, 'w') as f:
        for pred in preds:
            if '<EOS>' in pred:
                pred = pred[:pred.index('<EOS>')]
            if detokenize:
                output = md.detokenize(' '.join(pred).replace('@@ ', '').split())
            else:
                output = ' '.join(pred)
            f.write(output + '\n')
def detokenize_data(token_list):
    detokenized_sentences = []
    detokenizer = MosesDetokenizer()
    for one_document_token in token_list:
        sentence = detokenizer.detokenize(one_document_token, return_str=True)
        detokenized_sentences.append(sentence)
    return detokenized_sentences
class SacreMosesDetokenizer:
    r"""Apply the Moses Detokenizer implemented in sacremoses.

    Users of this class are required to `install sacremoses
    <https://github.com/alvations/sacremoses>`_. For example, one can use
    :samp:`pip install sacremoses`.

    .. note::
        sacremoses carries an LGPL 2.1+ license.

    Parameters
    ----------
    return_str: bool, default True
        True: return a single string
        False: return a list of words

    Examples
    --------
    >>> detokenizer = gluonnlp.data.SacreMosesDetokenizer()
    >>> detokenizer(['Gluon', 'NLP', 'toolkit', 'provides', 'a', 'suite', 'of',
    ...              'text', 'processing', 'tools', '.'], return_str=True)
    'Gluon NLP toolkit provides a suite of text processing tools.'
    >>> detokenizer(['Das', 'Gluon', 'NLP-Toolkit', 'stellt', 'eine', 'Reihe', 'von',
    ...              'Textverarbeitungstools', 'zur', 'Verfügung', '.'], return_str=True)
    'Das Gluon NLP-Toolkit stellt eine Reihe von Textverarbeitungstools zur Verfügung.'
    """

    def __init__(self, return_str=True):
        self._return_str = return_str
        from sacremoses import MosesDetokenizer  # pylint: disable=import-outside-toplevel
        self._detokenizer = MosesDetokenizer()

    def __call__(self, sample: List[str], return_str: Optional[bool] = None):
        """
        Parameters
        ----------
        sample
            The sentence to detokenize
        return_str
            True: return a single string
            False: return a list of words
            None: use constructor setting

        Returns
        -------
        ret : list of strs or str
            List of words or detokenized text
        """
        ret_str = self._return_str if return_str is None else return_str
        return self._detokenizer.detokenize(sample, return_str=ret_str)