class MosesTokenizer(Tokenizer):
    """Tokenizer/detokenizer backed by sacremoses' Moses implementation."""

    def __init__(self):
        super().__init__()
        self._tokenizer = SacreMosesTokenizer()
        self._detokenizer = MosesDetokenizer()

    def tokenize(self, sentence):
        """Tokenize ``sentence`` into a list of Moses tokens."""
        return self._tokenizer.tokenize(sentence)

    def detokenize(self, tokens):
        """Unescape Moses punctuation tokens.

        Replaces XML escape sequences like ``&#91;`` with the original
        characters (such as ``[``), so they better align to the original
        text.

        Note: returns a list of unescaped tokens, not a joined string.
        """
        return [self._detokenizer.unescape_xml(t) for t in tokens]

    def detokenize_ptb(self, tokens):
        """Detokenize Penn-Treebank-style tokens into a single string.

        Not a perfect detokenizer, but a "good-enough" stand in: PTB
        bracket and quote placeholders are mapped back to their literal
        characters before Moses detokenization.
        """
        rep_dict = {
            "-LSB-": "[",
            "-RSB-": "]",
            "-LRB-": "(",
            "-RRB-": ")",
            "-LCB-": "{",
            "-RCB-": "}",
            "``": '"',
            "''": '"',
        }
        return self._detokenizer.detokenize(replace_list(tokens, rep_dict))
# NOTE(review): this re-definition shadows the earlier MosesTokenizer in this
# file (which also provided detokenize_ptb) — looks like an accidental
# duplicate; confirm which definition is intended to survive.
class MosesTokenizer(Tokenizer):
    """Wraps sacremoses for Moses-style tokenization and unescaping."""

    def __init__(self):
        super().__init__()
        self._detokenizer = MosesDetokenizer()
        self._tokenizer = SacreMosesTokenizer()

    def tokenize(self, sentence):
        """Split ``sentence`` into a list of Moses tokens."""
        return self._tokenizer.tokenize(sentence)

    def detokenize(self, tokens):
        """Unescape Moses punctuation tokens.

        Replaces Moses XML escape sequences with the original characters
        (e.g. the escaped form of ``[`` becomes ``[``), so the tokens
        better align to the original text. Returns a list of tokens,
        one per input token.
        """
        unescape = self._detokenizer.unescape_xml
        return [unescape(token) for token in tokens]