Example #1
class MosesText(markovify.NewlineText):
    mt = sacremoses.MosesTokenizer()
    md = sacremoses.MosesDetokenizer()
    def word_join(self, words):
        return self.md.detokenize(words)
    def word_split(self, text):
        return self.mt.tokenize(text)
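A minimal usage sketch for the class above; the two-line corpus and the `state_size`/`tries` values are illustrative only.

import markovify
import sacremoses

# Tiny illustrative corpus; a real model needs far more text.
corpus = "The cat sat on the mat.\nThe dog slept by the door."
model = MosesText(corpus, state_size=1)
print(model.make_sentence(tries=100))  # may print None for such a small corpus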
Example #2
    def __init__(self, config_file):
        with open(config_file) as f:
            self.__dict__.update(yaml.safe_load(f))
        assert self.type in {"cn2en", "en2cn"}
        codes = codecs.open(self.codes_file, encoding='utf-8')
        cur_path = os.path.dirname(os.path.realpath(__file__))
        self.tokenizer = BPE(codes)

        if self.type == "en2cn":
            # pre_process: normalize, tokenize, subEntity,to_lower,bpe
            # post_process: delbpe,remove_space
            self.en_tokenizer = os.path.join(cur_path, self.en_tokenizer)
            self.en_normalize_punctuation = sacremoses.MosesPunctNormalizer(
                lang="en")
            self.en_tokenizer = sacremoses.MosesTokenizer(
                lang='en', custom_nonbreaking_prefixes_file=self.en_tokenizer)
        elif self.type == "cn2en":
            # pre_process: tokenize, bpe
            # post_process: delbpe,detruecase,detokenize
            self.detruecase = sacremoses.MosesDetruecaser()
            self.detokenize = sacremoses.MosesDetokenizer(lang='en')
            self.client = aiohttp.ClientSession(
                timeout=aiohttp.ClientTimeout(total=3600),
                connector=aiohttp.TCPConnector(limit=sys.maxsize,
                                               limit_per_host=sys.maxsize))
            self.cn2en_trans_dict = slang_dict(self.trans_dict_file)
            self.chinese_char_pattern = re.compile(u"[\u4E00-\u9FA5]+")
            self.stops = re.compile(u"[.!?!?。。]+")
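A sketch of the en2cn pre-processing chain described by the comments above (the method name and the lowercasing/BPE call are assumptions; only the punctuation normalizer, Moses tokenizer and BPE objects come from the constructor, and the subEntity step is omitted):

    def preprocess_en(self, text):
        # normalize punctuation -> tokenize -> lowercase -> apply BPE
        text = self.en_normalize_punctuation.normalize(text)
        tokens = self.en_tokenizer.tokenize(text, return_str=True)
        return self.tokenizer.process_line(tokens.lower())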
Example #3
    def __init__(self,
                 vocab_address=None,
                 bpe_code_address=None,
                 src_en='en',
                 tgt_de='de',
                 vocab_pad=8,
                 isolator='@@'):
        """
        Constructor for the Tokenizer class.

        Args:
            vocab_address: path to the vocabulary file.
            bpe_code_address: path to the file with BPE codes.
            src_en: source language code, passed to the Moses tokenizer.
            tgt_de: target language code, passed to the Moses detokenizer.
            vocab_pad: pads vocabulary to a multiple of 'vocab_pad' tokens.
            isolator: BPE subword isolator string (e.g. '@@').
        """
        self.padding_index = 0
        self.unk_index = 1
        self.bos_index = 2
        self.eos_index = 3
        self.pad_word = '<pad>'
        self.unk_word = '<unk>'
        self.bos_word = '<s>'
        self.eos_word = r'<\s>'
        self.isolator = isolator
        self.init_bpe(bpe_code_address)
        self.vocab_establist(vocab_address, vocab_pad)
        self.sacremoses_tokenizer = sacremoses.MosesTokenizer(src_en)
        self.sacremoses_detokenizer = sacremoses.MosesDetokenizer(tgt_de)
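The class's tokenize/detokenize methods are not shown here; a plausible detokenize sketch (hypothetical, not the project's actual code) would strip the BPE isolator and then apply the Moses detokenizer:

    def detokenize(self, tokens):
        # Join subwords split by the isolator (e.g. 'fa@@ st' -> 'fast'),
        # then let sacremoses restore normal spacing and punctuation.
        text = ' '.join(tokens).replace(self.isolator + ' ', '')
        return self.sacremoses_detokenizer.detokenize(text.split())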
Example #4
def init_word_detokenizers(main, lang):
    if lang not in ['zho_cn', 'zho_tw', 'jpn', 'tha', 'bod']:
        # Sacremoses
        lang_sacremoses = wl_conversion.remove_lang_code_suffixes(main, wl_conversion.to_iso_639_1(main, lang))
        lang = wl_conversion.remove_lang_code_suffixes(main, lang)

        if f'sacremoses_moses_detokenizer_{lang}' not in main.__dict__:
            main.__dict__[f'sacremoses_moses_detokenizer_{lang}'] = sacremoses.MosesDetokenizer(lang = lang_sacremoses)
Example #5
def postprocess(outputs, truecaser=None, tokenizer=None):
    if truecaser:
        tr = sacremoses.MosesDetruecaser()
        outputs = [tr.detruecase(hyp, return_str=True) for hyp in outputs]
    if tokenizer:
        tk = sacremoses.MosesDetokenizer(tokenizer)
        outputs = [
            tk.detokenize(hyp.split(), return_str=True) for hyp in outputs
        ]
    return outputs
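An illustrative call to the function above, treating `truecaser` as a truthy flag and `tokenizer` as a target-language code, which is what the function body suggests:

hyps = ["the quick brown fox .", "it runs fast ."]
print(postprocess(hyps, truecaser=True, tokenizer="en"))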
Example #6
    def __init__(self, lang: str = 'en', vocab: Optional[Vocab] = None):
        self._lang = lang
        self._vocab = vocab
        if lang == 'zh':
            warnings.warn(
                'You may not use MosesTokenizer for Chinese sentences because it is '
                'not accurate. Try to use JiebaTokenizer. You may also tokenize the '
                'chinese sentence to characters and learn a BPE.')
        self._tokenizer = sacremoses.MosesTokenizer(lang=lang)
        self._detokenizer = sacremoses.MosesDetokenizer(lang=lang)

        # Here, we need to warm-up the tokenizer to compile the regex
        # This will boost the performance in MacOS
        # For benchmarking results, see
        # https://gist.github.com/sxjscience/f59d2b88262fefd4fb08565c9dec6099
        self._warmup()
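The `_warmup` body is not shown here; a plausible sketch (an assumption, not the library's actual implementation) is a single tokenize/detokenize round trip that forces sacremoses to compile its regexes up front:

    def _warmup(self):
        # One throwaway round trip is enough to trigger regex compilation.
        sample = 'hello , world !'
        self._detokenizer.detokenize(self._tokenizer.tokenize(sample))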
Example #7
File: detok.py  Project: StatNLP/ada4asr
def main():
    parser = argparse.ArgumentParser(description="")
    parser.add_argument("files", nargs="*", help="input files")
    args = parser.parse_args()

    detok = sacremoses.MosesDetokenizer()

    for line in fileinput.input(args.files,
                                openhook=fileinput.hook_compressed):
        print(
            detok.detokenize(line.strip().split(" ")).replace(
                " @",
                "").replace("@ ",
                            "").replace(" =",
                                        "=").replace("= ",
                                                     "=").replace(" – ", "–"))
Example #8
File: detok.py  Project: bcmi220/d2gpo
def main():
    parser = argparse.ArgumentParser(description='')
    parser.add_argument('files', nargs='*', help='input files')
    args = parser.parse_args()

    detok = sacremoses.MosesDetokenizer()

    for line in fileinput.input(args.files,
                                openhook=fileinput.hook_compressed):
        print(
            detok.detokenize(line.strip().split(' ')).replace(
                ' @',
                '').replace('@ ',
                            '').replace(' =',
                                        '=').replace('= ',
                                                     '=').replace(' – ', '–'))
Example #9
File: eval.py  Project: hnt4499/seq2seq
def calc_detokenized_bleu(hyp, refs, tgt_lang="fr", unescape=False):
    """Calculate detokenized BLEU using raw (tokenized) texts. The input
    tokenized texts will be detokenized using `sacremoses`.

    References:
        https://github.com/pytorch/fairseq/blob/409032596bd80240f7fbc833b5d37485dee85b0e/fairseq/tasks/translation.py#L414
        https://github.com/pytorch/fairseq/blob/409032596bd80240f7fbc833b5d37485dee85b0e/fairseq_cli/score.py#L79

    Parameters
    ----------
    hyp : list of str
        A list containing hypotheses for each source sentence.
    refs : list of list of str
        A list of lists of candidate reference translations.
    tgt_lang : str
        Target language. Used for detokenizer.
    unescape : bool
        Set this to True if the training data was tokenized using `moses` with
        escaping, e.g., "'" gets turned into "&apos;". Defaults to False since
        the data in this project is preprocessed with `--no-escape`.

    Returns
    -------
    sacrebleu.metrics.bleu.BLEUScore
        Detokenized BLEU score.
    """
    # Check for validity
    for ref in refs:
        assert len(ref) == len(hyp), ("Number of sentences in hypothesis and "
                                      f"reference does not match: {len(hyp)} "
                                      f"and {len(ref)}")

    # Make sure unknown words are escaped
    hyp = [hyp_sent.replace("<unk>", "<unk_hyp>") for hyp_sent in hyp]

    # Detokenize
    detokenizer = sacremoses.MosesDetokenizer(lang=tgt_lang)
    hyp = [detokenizer.detokenize(hyp_sent.split()) for hyp_sent in hyp]
    refs_detok = []
    for ref in refs:
        ref = [
            detokenizer.detokenize(ref_sent.split(), unescape=unescape)
            for ref_sent in ref
        ]
        refs_detok.append(ref)

    return sacrebleu.corpus_bleu(hyp, refs_detok, tokenize="none")
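A toy call against the function above; the sentences are illustrative only, and `sacremoses` and `sacrebleu` must both be importable.

hyp = ["the cat sat on the mat .", "it was quite happy ."]
refs = [["the cat sat on the mat .", "it was very happy ."]]
score = calc_detokenized_bleu(hyp, refs, tgt_lang="en")
print(score.score)  # corpus-level BLEU as a float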
Example #10
 def __init__(self):
     self.question_generator = xlingqg.QuestionGenerator()
     self.translator = xlingqg.Translator()
     self.tokenizer = sacremoses.MosesTokenizer()
     self.detokenizer = sacremoses.MosesDetokenizer()
     self.answer_encoder = AnswerEncoder()
Example #11
    def __init__(self,
                 special=None,
                 min_freq=0,
                 max_size=None,
                 lower_case=False,
                 delimiter=None,
                 vocab_file=None,
                 pretrained_vocab_file: str = None,
                 never_split=None,
                 unk_token="<unk>",
                 eos_token="<eos>",
                 additional_special_tokens=["<formula>"],
                 language="en",
                 **kwargs):
        super().__init__(unk_token=unk_token,
                         eos_token=eos_token,
                         additional_special_tokens=additional_special_tokens,
                         **kwargs)

        if never_split is None:
            never_split = self.all_special_tokens
        if special is None:
            special = []
        self.counter = Counter()
        self.special = special
        self.min_freq = min_freq
        self.max_size = max_size
        self.lower_case = lower_case
        self.delimiter = delimiter
        self.vocab_file = vocab_file
        self.never_split = never_split
        self.punctuation_symbols = '!"#$%&()*+,-./\\:;<=>?@[\\]^_`{|}~'
        self.punction_without_space_before_pattern = re.compile(
            r"[^\s][{}]".format(self.punctuation_symbols))
        self.punctuation_with_space_around_pattern = self._compile_space_around_punctuation_pattern(
        )
        self.language = language
        self.moses_punct_normalizer = sm.MosesPunctNormalizer(language)
        self.moses_tokenizer = sm.MosesTokenizer(language)
        self.moses_detokenizer = sm.MosesDetokenizer(language)

        # This try... catch... is not beautiful but honestly this tokenizer was not made to be used
        # in a library like ours, at all.
        try:
            vocab_dict = None
            if pretrained_vocab_file is not None:
                # Priority on pickle files (support PyTorch and TF)
                with open(pretrained_vocab_file, "rb") as f:
                    vocab_dict = pickle.load(f)

                # Loading a torch-saved transfo-xl vocab dict with pickle results in an integer
                # Entering this if statement means that we tried to load a torch-saved file with pickle, and we failed.
                # We therefore load it with torch, if it's available.
                if type(vocab_dict) == int:
                    if not is_torch_available():
                        raise ImportError(
                            "Not trying to load dict with PyTorch as you need to install pytorch to load "
                            "from a PyTorch pretrained vocabulary, "
                            "or activate it with environment variables USE_TORCH=1 and USE_TF=0."
                        )
                    vocab_dict = torch.load(pretrained_vocab_file)

            if vocab_dict is not None:
                for key, value in vocab_dict.items():
                    if key not in self.__dict__:
                        self.__dict__[key] = value
            elif vocab_file is not None:
                self.build_vocab()

        except Exception as e:
            raise ValueError(
                "Unable to parse file {}. Unknown format. "
                "If you tried to load a model saved through TransfoXLTokenizerFast, "
                "please note they are not compatible.".format(
                    pretrained_vocab_file)) from e

        if vocab_file is not None:
            self.build_vocab()
Example #12
 def moses_detokenize(self, tokens, lang):
     if lang not in self.cache_moses_detokenizer:
         moses_detokenizer = sm.MosesDetokenizer(lang=lang)
         self.cache_moses_detokenizer[lang] = moses_detokenizer
     return self.cache_moses_detokenizer[lang].detokenize(tokens)
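The same lazy-caching idea as a self-contained sketch; the function and cache names here are illustrative, not taken from the project.

import sacremoses as sm

_detokenizer_cache = {}

def cached_detokenize(tokens, lang):
    # Build each language's MosesDetokenizer once and reuse it afterwards.
    if lang not in _detokenizer_cache:
        _detokenizer_cache[lang] = sm.MosesDetokenizer(lang=lang)
    return _detokenizer_cache[lang].detokenize(tokens)

print(cached_detokenize(['Hello', ',', 'world', '!'], 'en'))  # Hello, world!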
Example #13
def wordless_word_detokenize(main, tokens, lang, word_detokenizer='default'):
    sentence_start = 0
    sentences = []
    text = ''

    if lang not in main.settings_global['word_detokenizers']:
        lang = 'other'

    if word_detokenizer == 'default':
        word_detokenizer = main.settings_custom['word_detokenization'][
            'word_detokenizers'][lang]

    for i, token in enumerate(tokens):
        if type(token
                ) == wordless_text.Wordless_Token and token.sentence_ending:
            sentences.append(tokens[sentence_start:i + 1])

            sentence_start = i + 1
        elif i == len(tokens) - 1:
            sentences.append(tokens[sentence_start:])

    # English & Other Languages
    if word_detokenizer == main.tr('NLTK - Penn Treebank Detokenizer'):
        treebank_detokenizer = nltk.tokenize.treebank.TreebankWordDetokenizer()

        for sentence in sentences:
            text += treebank_detokenizer.detokenize(sentence)
    elif word_detokenizer == main.tr('Sacremoses - Moses Detokenizer'):
        moses_detokenizer = sacremoses.MosesDetokenizer(
            lang=wordless_conversion.to_iso_639_1(main, lang))

        for sentence in sentences:
            text += moses_detokenizer.detokenize(sentence)
    # Chinese
    elif word_detokenizer == main.tr('Wordless - Chinese Word Detokenizer'):
        non_cjk_start = 0

        for i, token in enumerate(tokens):
            if i >= non_cjk_start:
                if (wordless_checking_unicode.has_han(token)
                        or all(map(str.isnumeric, token))):
                    text += token

                    non_cjk_start += 1
                else:
                    # English
                    if wordless_checking_unicode.is_eng_token(token):
                        for j, token in enumerate(tokens[i:]):
                            if i + j + 1 == len(
                                    tokens
                            ) or not wordless_checking_unicode.is_eng_token(
                                    tokens[i + j + 1]):
                                text += wordless_word_detokenize(
                                    main,
                                    tokens[non_cjk_start:i + j + 1],
                                    lang='eng')

                                non_cjk_start = i + j + 1

                                break
                    # Other Languages
                    else:
                        for j, token in enumerate(tokens[i:]):
                            if (i + j + 1 == len(tokens)
                                    or wordless_checking_unicode.has_han(
                                        tokens[i + j + 1])):
                                text += wordless_word_detokenize(
                                    main,
                                    tokens[non_cjk_start:i + j + 1],
                                    lang='other')

                                non_cjk_start = i + j + 1

                                break
    elif word_detokenizer == main.tr('Wordless - Japanese Word Detokenizer'):
        non_cjk_start = 0

        for i, token in enumerate(tokens):
            if i < non_cjk_start:
                continue

            if (wordless_checking_unicode.has_han(token)
                    or wordless_checking_unicode.has_kana(token)
                    or all(map(str.isnumeric, token))):
                text += token

                non_cjk_start = i + 1
            else:
                # English
                if wordless_checking_unicode.is_eng_token(token):
                    for j, token in enumerate(tokens[i:]):
                        if i + j + 1 == len(
                                tokens
                        ) or not wordless_checking_unicode.is_eng_token(
                                tokens[i + j + 1]):
                            text += wordless_word_detokenize(
                                main,
                                tokens[non_cjk_start:i + j + 1],
                                lang='eng')

                            non_cjk_start = i + j + 1

                            break
                # Other Languages
                else:
                    for j, token in enumerate(tokens[i:]):
                        if (i + j + 1 == len(tokens)
                                or wordless_checking_unicode.has_han(
                                    tokens[i + j + 1])
                                or wordless_checking_unicode.has_kana(
                                    tokens[i + j + 1])):
                            text += wordless_word_detokenize(
                                main,
                                tokens[non_cjk_start:i + j + 1],
                                lang='other')

                            non_cjk_start = i + j + 1

                            break
    # Thai
    elif word_detokenizer == main.tr('Wordless - Thai Word Detokenizer'):
        non_thai_start = 0

        for i, token in enumerate(tokens):
            if i < non_thai_start:
                continue

            if wordless_checking_unicode.has_thai(token):
                if type(token) == wordless_text.Wordless_Token:
                    text += token + token.boundary
                else:
                    text += token

                non_thai_start = i + 1
            else:
                # English
                if wordless_checking_unicode.is_eng_token(token):
                    for j, token in enumerate(tokens[i:]):
                        if i + j + 1 == len(
                                tokens
                        ) or not wordless_checking_unicode.is_eng_token(
                                tokens[i + j + 1]):
                            text += wordless_word_detokenize(
                                main,
                                tokens[non_thai_start:i + j + 1],
                                lang='eng')

                            non_thai_start = i + j + 1

                            break
                # Other Languages
                else:
                    for j, token in enumerate(tokens[i:]):
                        if (i + j + 1 == len(tokens)
                                or wordless_checking_unicode.has_thai(
                                    tokens[i + j + 1])):
                            text += wordless_word_detokenize(
                                main,
                                tokens[non_thai_start:i + j + 1],
                                lang='other')

                            non_thai_start = i + j + 1

                            break
    # Tibetan
    elif word_detokenizer == main.tr('Wordless - Tibetan Word Detokenizer'):
        non_tibetan_start = 0

        for i, token in enumerate(tokens):
            if i < non_tibetan_start:
                continue

            if wordless_checking_unicode.has_tibetan(token):
                # Check for Tibetan Mark Shad
                # See: https://w3c.github.io/tlreq/#section_breaks
                if i > 0 and token[0] == '།':
                    text += token
                else:
                    text += token

                non_tibetan_start = i + 1
            else:
                # English
                if wordless_checking_unicode.is_eng_token(token):
                    for j, token in enumerate(tokens[i:]):
                        if i + j + 1 == len(
                                tokens
                        ) or not wordless_checking_unicode.is_eng_token(
                                tokens[i + j + 1]):
                            text += wordless_word_detokenize(
                                main,
                                tokens[non_tibetan_start:i + j + 1],
                                lang='eng')

                            non_tibetan_start = i + j + 1

                            break
                # Other Languages
                else:
                    for j, token in enumerate(tokens[i:]):
                        if (i + j + 1 == len(tokens)
                                or wordless_checking_unicode.has_tibetan(
                                    tokens[i + j + 1])):
                            text += wordless_word_detokenize(
                                main,
                                tokens[non_tibetan_start:i + j + 1],
                                lang='other')

                            non_tibetan_start = i + j + 1

                            break

    return re.sub(r'\s{2,}', ' ', text)
Example #14
 def __init__(self, lang):
     self.detokenizer = sacremoses.MosesDetokenizer(lang)
Example #15
    def __init__(self,
                 special=None,
                 min_freq=0,
                 max_size=None,
                 lower_case=False,
                 delimiter=None,
                 vocab_file=None,
                 pretrained_vocab_file=None,
                 never_split=None,
                 unk_token="<unk>",
                 eos_token="<eos>",
                 additional_special_tokens=["<formula>"],
                 language="en",
                 **kwargs):
        super().__init__(unk_token=unk_token,
                         eos_token=eos_token,
                         additional_special_tokens=additional_special_tokens,
                         **kwargs)

        if never_split is None:
            never_split = self.all_special_tokens
        if special is None:
            special = []
        self.counter = Counter()
        self.special = special
        self.min_freq = min_freq
        self.max_size = max_size
        self.lower_case = lower_case
        self.delimiter = delimiter
        self.vocab_file = vocab_file
        self.never_split = never_split
        self.punctuation_symbols = '!"#$%&()*+,-./\\:;<=>?@[\\]^_`{|}~'
        self.punction_without_space_before_pattern = re.compile(
            r"[^\s][{}]".format(self.punctuation_symbols))
        self.punctuation_with_space_around_pattern = self._compile_space_around_punctuation_pattern(
        )
        self.language = language
        self.moses_punct_normalizer = sm.MosesPunctNormalizer(language)
        self.moses_tokenizer = sm.MosesTokenizer(language)
        self.moses_detokenizer = sm.MosesDetokenizer(language)

        try:
            if pretrained_vocab_file is not None:
                # Hack because, honestly this tokenizer was not made to be used
                # in a library like ours, at all.
                vocab_dict = torch.load(pretrained_vocab_file)
                for key, value in vocab_dict.items():
                    if key not in self.__dict__:
                        self.__dict__[key] = value

            if vocab_file is not None:
                self.build_vocab()
        except Exception:
            raise ValueError(
                "Unable to parse file {}. Unknown format. "
                "If you tried to load a model saved through TransfoXLTokenizerFast, "
                "please note they are not compatible.".format(
                    pretrained_vocab_file))

        if vocab_file is not None:
            self.build_vocab()
Example #16
 def __init__(
     self,
     special=None,
     min_freq=0,
     max_size=None,
     lower_case=False,
     delimiter=None,
     vocab_file=None,
     pretrained_vocab_file=None,
     never_split=None,
     unk="<unk>",
     eos="<eos>",
     additional_special_tokens=["<formula>"],
     language="en",
     **kw,
 ):
     super().__init__(
         special=special,
         min_freq=min_freq,
         max_size=max_size,
         lower_case=lower_case,
         delimiter=delimiter,
         vocab_file=vocab_file,
         pretrained_vocab_file=pretrained_vocab_file,
         never_split=never_split,
         unk=unk,
         eos=eos,
         additional_special_tokens=additional_special_tokens,
         language=language,
         **kw,
     )
     if never_split is None:
         never_split = self.all_special_tokens
     if special is None:
         special = []
     self.counter = Counter()
     self.special = special
     self.min_freq = min_freq
     self.max_size = max_size
     self.lower_case = lower_case
     self.delimiter = delimiter
     self.vocab_file = vocab_file
     self.never_split = never_split
     self.punctuation_symbols = '!"#$%&()*+,-./\\:;<=>?@[\\]^_`{|}~'
     self.punction_without_space_before_pattern = re.compile(
         rf"[^\s][{self.punctuation_symbols}]"
     )
     self.punctuation_with_space_around_pattern = (
         self._compile_space_around_punctuation_pattern()
     )
     self.language = language
     self.moses_punct_normalizer = sm.MosesPunctNormalizer(language)
     self.moses_tokenizer = sm.MosesTokenizer(language)
     self.moses_detokenizer = sm.MosesDetokenizer(language)
     try:
         vocab_dict = None
         if pretrained_vocab_file is not None:
             with open(pretrained_vocab_file, "rb") as f:
                 vocab_dict = pickle.load(f)
             if type(vocab_dict) == int:
                 if not is_torch_available():
                     raise ImportError(
                         "Not trying to load dict with PyTorch as you need to install pytorch to load "
                         "from a PyTorch pretrained vocabulary, "
                         "or activate it with environment variables USE_TORCH=1 and USE_TF=0."
                     )
                 vocab_dict = torch.load(pretrained_vocab_file)
         if vocab_dict is not None:
             for key, value in vocab_dict.items():
                 if key not in self.__dict__:
                     self.__dict__[key] = value
         elif vocab_file is not None:
             self.build_vocab()
     except Exception as e:
         raise ValueError(
             f"Unable to parse file {pretrained_vocab_file}. Unknown format. "
             "If you tried to load a model saved through TokenizerFast, "
             "please note they are not compatible."
         ) from e
     if vocab_file is not None:
         self.build_vocab()
Example #17
 def init_moses(self, lang):
     self.moses_tokenizer = sacremoses.MosesTokenizer(lang['src'])
     self.moses_detokenizer = sacremoses.MosesDetokenizer(lang['tgt'])