Example #1
    def __init__(self, language: str = "en") -> None:
        """
        MosesNormalizer constructor.

        Parameters
        ----------
        language : str
            Language argument for the normalizer. Default: "en".

        Raises
        ------
        ImportError
            If sacremoses is not installed.
        """
        try:
            from sacremoses import MosesPunctNormalizer
        except ImportError:
            print(
                "Problem occured while trying to import sacremoses. "
                "If the library is not installed visit "
                "https://github.com/alvations/sacremoses for more details."
            )
            raise

        self._normalizer = MosesPunctNormalizer(language)
Example #2
    def __init__(self,
                 lang: str = 'en',
                 lower_case: bool = True,
                 romanize: bool = False,
                 descape: bool = False):
        assert lower_case, 'lower case is needed by all the models'

        if lang in ('cmn', 'wuu', 'yue'):
            lang = 'zh'
        if lang == 'jpn':
            lang = 'ja'

        if lang == 'zh':
            raise NotImplementedError('jieba is not yet implemented')
        if lang == 'ja':
            raise NotImplementedError('mecab is not yet implemented')
        if romanize:
            raise NotImplementedError('romanize is not yet implemented')

        self.lower_case = lower_case
        self.romanize = romanize
        self.descape = descape

        self.normalizer = MosesPunctNormalizer(lang=lang)
        self.tokenizer = MosesTokenizer(lang=lang)
Example #3
    def __init__(self,
                 lang: str = 'en',
                 lower_case: bool = True,
                 romanize: Optional[bool] = None,
                 descape: bool = False):
        assert lower_case, 'lower case is needed by all the models'

        if lang in ('cmn', 'wuu', 'yue'):
            lang = 'zh'
        if lang == 'jpn':
            lang = 'ja'

        if lang == 'zh' and jieba is None:
            raise ModuleNotFoundError(
                '''No module named 'jieba'. Install laserembeddings with 'zh' extra to fix that: "pip install laserembeddings[zh]"'''
            )
        if lang == 'ja' and MeCab is None:
            raise ModuleNotFoundError(
                '''No module named 'MeCab'. Install laserembeddings with 'ja' extra to fix that: "pip install laserembeddings[ja]"'''
            )

        self.lang = lang
        self.lower_case = lower_case
        self.romanize = romanize if romanize is not None else lang == 'el'
        self.descape = descape

        self.normalizer = MosesPunctNormalizer(lang=lang)
        self.tokenizer = MosesTokenizer(lang=lang)
        self.mecab_tokenizer = MeCab.Tagger(
            "-O wakati -b 50000") if lang == 'ja' else None
Example #4
 def __init__(self, lang_id: str):
     self.lang_id = lang_id
     self.moses_tokenizer = MosesTokenizer(lang=lang_id)
     self.moses_detokenizer = MosesDetokenizer(lang=lang_id)
     self.normalizer = MosesPunctNormalizer(lang=lang_id,
                                            pre_replace_unicode_punct=True,
                                            post_remove_control_chars=True)
Example #5
 def __init__(self, lang, lowercase=True):
     # the tokenizer names are the same for BertTokenizer and PreTrainedTokenizer since they have both been distributed by huggingface
     pre_trained_model_name = PreTrainedTokenizer.get_default_model_name(
         lang, lowercase)
     self.tokenizer = BertTokenizer.from_pretrained(pre_trained_model_name)
     self.mpn = MosesPunctNormalizer()
     self.detokenizer = MosesDetokenizer(lang=lang)
     self._model_name_ = pre_trained_model_name
Example #6
    def __init__(self, lm_type: LMType, language: str, tokenizer_command):
        """
            lm_type: LMType
            language: language code
            tokenizer_command: tokenizer full command (with flags if needed)
        """

        self.language = language
        self.tokenizer = Tokenizer(tokenizer_command, self.language)
        self.normalizer = MosesPunctNormalizer(lang=self.language)
        self.type = lm_type
Example #7
    def __init__(self, exp):
        self.exp = exp
        self.tokr = MosesTokenizer()
        self.detokr = MosesDetokenizer()
        self.punct_normr = MosesPunctNormalizer()
        #self.true_caser = MosesTruecaser()

        self.punct_normalize = True
        self.tokenize = True
        self.html_unesc = True
        self.drop_unks = True
        #self.truecase = True
        self.detokenize = True
Example #8
 def translate(self,
               text: List[str],
               source_lang: str = None,
               target_lang: str = None) -> List[str]:
     """
     Translates list of sentences from source language to target language.
     Should be regular text; this method performs its own tokenization/de-tokenization.
     Args:
         text: list of strings to translate
         source_lang: if not None, corresponding MosesTokenizer and MosesPunctNormalizer will be run
         target_lang: if not None, corresponding MosesDetokenizer will be run
     Returns:
         list of translated strings
     """
     mode = self.training
     if source_lang != "None":
         tokenizer = MosesTokenizer(lang=source_lang)
         normalizer = MosesPunctNormalizer(lang=source_lang)
     if target_lang != "None":
         detokenizer = MosesDetokenizer(lang=target_lang)
     try:
         self.eval()
         res = []
         for txt in text:
             if source_lang != "None":
                 txt = normalizer.normalize(txt)
                 txt = tokenizer.tokenize(txt,
                                          escape=False,
                                          return_str=True)
             ids = self.encoder_tokenizer.text_to_ids(txt)
             ids = [self.encoder_tokenizer.bos_id
                    ] + ids + [self.encoder_tokenizer.eos_id]
             src = torch.Tensor(ids).long().to(self._device).unsqueeze(0)
             src_mask = torch.ones_like(src)
             src_hiddens = self.encoder(input_ids=src,
                                        encoder_mask=src_mask)
             beam_results = self.beam_search(
                 encoder_hidden_states=src_hiddens,
                 encoder_input_mask=src_mask)
             beam_results = self.filter_predicted_ids(beam_results)
             translation_ids = beam_results.cpu()[0].numpy()
             translation = self.decoder_tokenizer.ids_to_text(
                 translation_ids)
             if target_lang != "None":
                 translation = detokenizer.detokenize(translation.split())
             res.append(translation)
     finally:
         self.train(mode=mode)
     return res
Example #9
class MosesProcessor:
    """
    Tokenizer, Detokenizer and Normalizer utilities in Moses
    """

    def __init__(self, lang_id: str):
        self.moses_tokenizer = MosesTokenizer(lang=lang_id)
        self.moses_detokenizer = MosesDetokenizer(lang=lang_id)
        self.normalizer = MosesPunctNormalizer(lang=lang_id)

    def detokenize(self, tokens: List[str]) -> str:
        """
        Detokenizes a list of tokens
        Args:
            tokens: list of strings as tokens
        Returns:
            detokenized string
        """
        return self.moses_detokenizer.detokenize(tokens)

    def tokenize(self, text: str):
        """
        Tokenizes text using Moses -> Sentencepiece.
        """
        return self.moses_tokenizer.tokenize(text, escape=False, return_str=True)

    def normalize(self, text: str):
        return self.normalizer.normalize(text)
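A minimal usage sketch for the MosesProcessor above, assuming sacremoses is installed and the imports the snippet omits (MosesTokenizer, MosesDetokenizer, MosesPunctNormalizer); the sample sentence and language code are illustrative:

processor = MosesProcessor(lang_id="en")
normalized = processor.normalize("Hello ,  «world» !")
tokenized = processor.tokenize(normalized)        # space-separated token string
print(tokenized)
print(processor.detokenize(tokenized.split()))    # back to plain text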
Example #10
class PTBertTokenizer:
    """
    The pre-trained tokenizer trained alongside BERT, distributed by huggingface
    """
    def __init__(self, lang, lowercase=True):
        # the tokenizer names are the same for BertTokenizer and PreTrainedTokenizer since they have both been distributed by huggingface
        pre_trained_model_name = PreTrainedTokenizer.get_default_model_name(
            lang, lowercase)
        self.tokenizer = BertTokenizer.from_pretrained(pre_trained_model_name)
        self.mpn = MosesPunctNormalizer()
        self.detokenizer = MosesDetokenizer(lang=lang)
        self._model_name_ = pre_trained_model_name

    def tokenize(self, text):
        return self.tokenizer.tokenize(self.mpn.normalize(text))

    def detokenize(self, tokenized_list):
        # WARNING! this is a one way tokenizer, the detokenized sentences do not necessarily align with the actual tokenized sentences!
        return self.tokenizer.decode(
            self.tokenizer.convert_tokens_to_ids(tokenized_list))

    @staticmethod
    def get_default_model_name(lang, lowercase):
        return PreTrainedTokenizer.get_default_model_name(lang, lowercase)

    @property
    def model_name(self):
        return self._model_name_
Example #11
class PyMosesTokenizer(GenericTokenizer):
    """
    The call to standard moses tokenizer
    """
    def __init__(self, lang, lowercase):
        self.mpn = MosesPunctNormalizer()
        self.tokenizer = MosesTokenizer(lang=lang)
        self.detokenizer = MosesDetokenizer(lang=lang)
        self.lowercase = lowercase
        self.lang = lang

    def tokenize(self, text):
        return self.tokenizer.tokenize(
            self.mpn.normalize(text.lower() if self.lowercase else text))

    def detokenize(self, tokenized_list):
        temp_result = ""
        t_list_len = len(tokenized_list)
        for t_ind, token in enumerate(tokenized_list):
            apos_cnd = token == "&apos;" and t_ind < t_list_len - 1 and tokenized_list[
                t_ind + 1] == "s"
            if apos_cnd or token == "/":
                temp_result = temp_result.strip() + token
            else:
                temp_result += token + " "
        f_result = self.detokenizer.detokenize(temp_result.strip().split())
        if len(f_result
               ) > 3 and f_result[-3] in string.punctuation and f_result[
                   -2] == " " and f_result[-1] == "\"":
            f_result = f_result[:-2] + f_result[-1]
        return f_result

    @property
    def model_name(self):
        return "Moses"
Example #12
    def _setup_normalizer(self):
        try:
            from sacremoses import MosesPunctNormalizer

            self.punc_normalizer = MosesPunctNormalizer(self.source_lang).normalize
        except (ImportError, FileNotFoundError):
            warnings.warn("Recommended: pip install sacremoses.")
            self.punc_normalizer = lambda x: x
Example #13
class MosesPreTokenizer:
    def __init__(self, lng, do_lowercase):
        self.mpn = MosesPunctNormalizer()
        self.moses_tokenizer = MosesTokenizer(lang=lng)
        self.do_lowercase = do_lowercase

    def pre_tokenize(self, text):
        return self.moses_tokenizer.tokenize(self.mpn.normalize(text.lower() if self.do_lowercase else text))
Example #14
class Normalizepunctuation(BatchProcessor):
    def __init__(self, lang):

        self.handler = MosesPunctNormalizer(lang=lang)

    def process(self, input):

        return self.handler.normalize(input)
Example #15
class PunctNormalizer:
    def __init__(self, language):
        self.language = language
        self.normalizer = MosesPunctNormalizer(lang=language)

    def __repr__(self):
        return f"PunctNormalizer({self.language})"

    def __call__(self, line):
        return self.normalizer.normalize(line).strip()
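One hypothetical way to apply the PunctNormalizer above line by line (the file name is illustrative):

normalizer = PunctNormalizer("en")
print(normalizer)                               # PunctNormalizer(en)
with open("corpus.txt", encoding="utf-8") as f:
    cleaned = [normalizer(line) for line in f]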
Example #16
def old_preprocess(infname, outfname, lang):
    """
    Preparing each corpus file:
      - Normalization
      - Tokenization
      - Script conversion to Devanagari for Indic scripts
    """
    n = 0
    num_lines = sum(1 for line in open(infname, "r"))
    # reading
    with open(infname, "r",
              encoding="utf-8") as infile, open(outfname,
                                                "w",
                                                encoding="utf-8") as outfile:

        if lang == "en":
            en_tok = MosesTokenizer(lang="en")
            en_normalizer = MosesPunctNormalizer()
            for line in tqdm(infile, total=num_lines):
                outline = " ".join(
                    en_tok.tokenize(en_normalizer.normalize(line.strip()),
                                    escape=False))
                outfile.write(outline + "\n")
                n += 1

        else:
            normfactory = indic_normalize.IndicNormalizerFactory()
            normalizer = normfactory.get_normalizer(lang)
            for line in tqdm(infile, total=num_lines):
                outline = (unicode_transliterate.UnicodeIndicTransliterator.
                           transliterate(
                               " ".join(
                                   indic_tokenize.trivial_tokenize(
                                       normalizer.normalize(line.strip()),
                                       lang)),
                               lang,
                               "hi",
                           ).replace(" ् ", "्"))

                outfile.write(outline + "\n")
                n += 1
    return n
Example #17
def Token(inp_fname,
          out_fname,
          lang='en',
          lower_case=True,
          romanize=False,
          descape=False,
          verbose=False,
          over_write=False,
          gzip=False):
    assert lower_case, 'lower case is needed by all the models'
    assert not over_write, 'over-write is not yet implemented'
    if not os.path.isfile(out_fname):
        cat = 'zcat ' if gzip else 'cat '
        roman = lang if romanize else 'none'
        # handle some iso3 language codes
        if lang in ('cmn', 'wuu', 'yue'):
            lang = 'zh'
        if lang == 'jpn':
            lang = 'ja'
        if verbose:
            print(' - Tokenizer: {} in language {} {} {} {}'.format(
                os.path.basename(inp_fname), lang, '(gzip)' if gzip else '',
                '(de-escaped)' if descape else '',
                '(romanized)' if romanize else ''))
        '''
        run(cat + inp_fname
            + '|' + REM_NON_PRINT_CHAR
            + '|' + NORM_PUNC + lang
            + ('|' + DESCAPE if descape else '')
            + '|' + MOSES_TOKENIZER + lang
            + ('| python3 -m jieba -d ' if lang == 'zh' else '')
            + ('|' + MECAB + '/bin/mecab -O wakati -b 50000 ' if lang == 'ja' else '')
            + '|' + ROMAN_LC + roman
            + '>' + out_fname,
            env=dict(os.environ, LD_LIBRARY_PATH=MECAB + '/lib'),
            shell=True)
        '''
        curNormalizer = sacremoses_norm_punct.get(
            lang, MosesPunctNormalizer(lang=lang))
        curTokenizer = sacremoses_tokenizers.get(lang,
                                                 MosesTokenizer(lang=lang))
        with open(out_fname, 'w') as outF:
            for line in sys.stdin:
                tok_norm_punct = curNormalizer.normalize(line)
                tok = curTokenizer.tokenize(tok_norm_punct,
                                            return_str=True,
                                            escape=False)
                outF.write(tok.strip() + "\n")
    elif not over_write and verbose:
        print(' - Tokenizer: {} exists already'.format(
            os.path.basename(out_fname)))
Example #18
    def __init__(self, expdir):
        self.expdir = expdir
        self.en_tok = MosesTokenizer(lang="en")
        self.en_normalizer = MosesPunctNormalizer()
        self.en_detok = MosesDetokenizer(lang="en")
        self.xliterator = unicode_transliterate.UnicodeIndicTransliterator()
        print("Initializing vocab and bpe")
        self.vocabulary = read_vocabulary(
            codecs.open(f"{expdir}/vocab/vocab.SRC", encoding="utf-8"), 5
        )
        self.bpe = BPE(
            codecs.open(f"{expdir}/vocab/bpe_codes.32k.SRC", encoding="utf-8"),
            -1,
            "@@",
            self.vocabulary,
            None,
        )

        print("Initializing model for translation")
        # initialize the model
        self.translator = Translator(
            f"{expdir}/final_bin", f"{expdir}/model/checkpoint_best.pt", batch_size=100
        )
Example #19
def get_normalize_preprocessor():
    """
    get sacremoses normalize processor
    >>> processor = get_normalize_preprocessor()
    >>> text = "Hi…"
    >>> processor(text)
    'Hi...'
    """
    from sacremoses import MosesPunctNormalizer
    mpn = MosesPunctNormalizer()

    def preprocessor(line):
        return mpn.normalize(line)

    return preprocessor
Example #20
class MosesNormalizer:
    """
    Pretokenization tool that normalizes the raw textual data.

    Uses sacremoses.MosesPunctNormalizer to perform normalization.
    """

    def __init__(self, language: str = "en") -> None:
        """
        MosesNormalizer constructor.

        Parameters
        ----------
        language : str
            Language argument for the normalizer. Default: "en".

        Raises
        ------
        ImportError
            If sacremoses is not installed.
        """
        try:
            from sacremoses import MosesPunctNormalizer
        except ImportError:
            print(
                "Problem occured while trying to import sacremoses. "
                "If the library is not installed visit "
                "https://github.com/alvations/sacremoses for more details."
            )
            raise

        self._normalizer = MosesPunctNormalizer(language)

    def __call__(self, raw: str) -> str:
        """
        Applies normalization to the raw textual data.

        Parameters
        ----------
        raw : str
            Raw textual data.

        Returns
        -------
        str
            Normalized textual data.
        """
        return self._normalizer.normalize(raw)
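A short usage sketch for MosesNormalizer; the input string is just an illustration of the kind of punctuation sacremoses rewrites:

normalizer = MosesNormalizer(language="en")
print(normalizer("«Hello»  …  “world”"))   # guillemets, curly quotes, "…" and extra spaces are normalized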
Example #21
def lowercase_and_remove_punctuations(language,
                                      text,
                                      lowercase=True,
                                      remove_punctuation=True):
    if lowercase:
        text = text.lower()
    if language not in ["zh", "ja"]:
        if language not in PUNC_NORMERS:
            PUNC_NORMERS[language] = MosesPunctNormalizer(lang=language)
        text = PUNC_NORMERS[language].normalize(text)
        text = text.replace("' s ", "'s ").replace("' ve ", "'ve ").replace(
            "' m ", "'m ").replace("' t ", "'t ").replace("' re ", "'re ")
    if remove_punctuation:
        text = PUNC_PATTERN.sub(" ", text)
    text = " ".join(text.strip().split())
    return text
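The function above depends on two module-level globals that this snippet does not show. A minimal sketch of what they might look like; both definitions (the regex in particular) are assumptions, not taken from the source:

import re
from sacremoses import MosesPunctNormalizer

PUNC_NORMERS = {}                        # per-language MosesPunctNormalizer cache, filled lazily
PUNC_PATTERN = re.compile(r"[^\w\s]")    # assumed: strip anything that is not a word character or whitespace

print(lowercase_and_remove_punctuations("en", "Hello,   World!"))   # -> "hello world" with this pattern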
Example #22
def prepare_csv(path, cleanup=False):
    if path.endswith('.processed.csv'):
        return path

    compression = 'xz' if path.endswith('.xz') else None
    df = pd.read_csv(
        path,
        compression=compression,
        sep='\t',
        encoding='utf-8',
        usecols=['id', 'hyperp', 'bias', 'publisher', 'title',
                 'text']).dropna()

    print(' > Cleaning up data')
    df.bias = df.bias.astype('category')
    df.publisher = df.publisher

    print(' > Initializing Moses')
    mt = MosesTokenizer()
    mn = MosesPunctNormalizer()

    line_blacklist, token_blacklist = set(), set()
    if cleanup:
        print(' > Generating line blacklist')
        line_blacklist = get_line_blacklist(df)
        print(' > Generating token blacklist')
        token_blacklist = get_token_blacklist(df)

    tqdm.pandas()
    print(' > Processing titles')
    df.title = df.title.apply(clean_text).apply(mn.normalize).progress_apply(
        mt.tokenize, return_str=True)

    print(' > Processing texts')
    df.text = df.text.apply(clean_text,
                            line_blacklist=line_blacklist,
                            token_blacklist=token_blacklist).progress_apply(
                                mn.normalize).progress_apply(mt.tokenize,
                                                             return_str=True)

    new_path = path.replace('.xz', '').replace('.csv', '.processed.csv')
    df.to_csv(new_path, index=False)

    return new_path
Example #23
class RtgIO:
    def __init__(self, exp):
        self.exp = exp
        self.tokr = MosesTokenizer()
        self.detokr = MosesDetokenizer()
        self.punct_normr = MosesPunctNormalizer()
        #self.true_caser = MosesTruecaser()

        self.punct_normalize = True
        self.tokenize = True
        self.html_unesc = True
        self.drop_unks = True
        #self.truecase = True
        self.detokenize = True

    def pre_process(self, text):
        # Any pre-processing on input
        if self.html_unesc:
            text = unescape(text)
        if self.punct_normalize:
            text = self.punct_normr.normalize(text)
        if self.tokenize:
            text = self.tokr.tokenize(text,
                                      escape=False,
                                      return_str=True,
                                      aggressive_dash_splits=True)
            # protected_patterns=self.tokr.WEB_PROTECTED_PATTERNS
        return text

    def post_process(self, tokens):
        # Any post-processing on output
        assert isinstance(tokens, list)
        if self.detokenize:
            text = self.detokr.detokenize(tokens=tokens,
                                          return_str=True,
                                          unescape=True)
        else:
            text = " ".join(tokens)
        if self.drop_unks:
            text = text.replace("<unk>", "")
        #if self.truecase:
        #    text = self.true_caser.truecase(text, return_str=True)
        return text
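A rough sketch of the pre_process/post_process round trip for RtgIO; passing exp=None is an assumption that only works because the constructor merely stores it:

io = RtgIO(exp=None)
tokens = io.pre_process("I&apos;m here &amp; ready.").split()   # unescape -> normalize -> tokenize
print(io.post_process(tokens))                                   # detokenized plain text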
Example #24
def TokenLine(line, lang='en', lower_case=True, romanize=False):
    assert lower_case, 'lower case is needed by all the models'
    '''
    roman = lang if romanize else 'none'
    tok = check_output(
            REM_NON_PRINT_CHAR
            + '|' + NORM_PUNC + lang
            + '|' + DESCAPE
            + '|' + MOSES_TOKENIZER + lang
            + ('| python3 -m jieba -d ' if lang == 'zh' else '')
            + ('|' + MECAB + '/bin/mecab -O wakati -b 50000 ' if lang == 'ja' else '')
            + '|' + ROMAN_LC + roman,
            input=line,
            encoding='UTF-8',
            shell=True)
    '''
    tok_norm_punct = sacremoses_norm_punct.get(
        lang, MosesPunctNormalizer(lang=lang)).normalize(line)
    tok = sacremoses_tokenizers.get(lang, MosesTokenizer(lang=lang))\
        .tokenize(tok_norm_punct, return_str=True, escape=False)
    return tok.strip()
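TokenLine above (and Token in Example #17) read per-language instances from two module-level caches, sacremoses_norm_punct and sacremoses_tokenizers, which this page does not show. A plausible sketch, with the cache contents assumed:

from sacremoses import MosesPunctNormalizer, MosesTokenizer

# assumed caches keyed by language code; any language that is missing falls back to a
# freshly constructed instance inside TokenLine / Token
sacremoses_norm_punct = {'en': MosesPunctNormalizer(lang='en')}
sacremoses_tokenizers = {'en': MosesTokenizer(lang='en')}

print(TokenLine("Hello,   «world»!", lang='en'))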
Example #25
class EnJaProcessor:
    """
    Tokenizer, Detokenizer and Normalizer utilities for Japanese & English
    Args:
        lang_id: One of ['en', 'ja'].
    """
    def __init__(self, lang_id: str):
        self.lang_id = lang_id
        self.moses_tokenizer = MosesTokenizer(lang=lang_id)
        self.moses_detokenizer = MosesDetokenizer(lang=lang_id)
        self.normalizer = MosesPunctNormalizer(lang=lang_id,
                                               pre_replace_unicode_punct=True,
                                               post_remove_control_chars=True)

    def detokenize(self, tokens: List[str]) -> str:
        """
        Detokenizes a list of tokens
        Args:
            tokens: list of strings as tokens
        Returns:
            detokenized Japanese or English string
        """
        return self.moses_detokenizer.detokenize(tokens)

    def tokenize(self, text) -> str:
        """
        Tokenizes text using Moses. Returns a string of tokens.
        """
        tokens = self.moses_tokenizer.tokenize(text)
        return ' '.join(tokens)

    def normalize(self, text) -> str:
        # Normalization doesn't handle Japanese periods correctly;
        # '。' becomes '.'.
        if self.lang_id == 'en':
            return self.normalizer.normalize(text)
        else:
            return text
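A brief usage sketch for EnJaProcessor; as the comment in normalize() notes, Japanese input is returned unchanged while English punctuation is normalized (the sample strings are illustrative):

en_proc = EnJaProcessor(lang_id='en')
ja_proc = EnJaProcessor(lang_id='ja')

print(en_proc.normalize('Hello… “world”'))   # English: punctuation is normalized
print(ja_proc.normalize('こんにちは。'))       # Japanese: returned as-is
print(en_proc.detokenize(en_proc.tokenize('Hello, world!').split()))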
Example #26
    tmp_data = news_item(
        url=kaz_url,
        section=kaz_section,
        title=kaz_title,
        date_time=kaz_date_time,
        text=kaz_text,
    )

    news_items.append(tmp_data)

# tokenize, normalize and run morphological analysis on the collected data;
# the result of each processing step is written to its own list(namedtuple)

# tokenization
tokenized_news_items = []
mpn = MosesPunctNormalizer()
mtok = MosesTokenizer()

for item in tqdm(news_items):
    tokenized_text = mpn.normalize(text=item.text)
    tokenized_text = mtok.tokenize(text=tokenized_text, return_str=True)

    tmp_data = news_item(
        url=item.url,
        section=item.section,
        title=item.title,
        date_time=item.date_time,
        text=tokenized_text,
    )

    tokenized_news_items.append(tmp_data)
Example #27
from tqdm import tqdm
from speechbrain.utils.data_utils import get_all_files
from speechbrain.utils.torch_audio_backend import check_torchaudio_backend
from speechbrain.processing.speech_augmentation import Resample

try:
    from sacremoses import MosesPunctNormalizer, MosesTokenizer
except ImportError:
    err_msg = "The optional dependency sacremoses must be installed to run this recipe.\n"
    err_msg += "Install using `pip install sacremoses`.\n"
    raise ImportError(err_msg)

logger = logging.getLogger(__name__)
check_torchaudio_backend()

es_normalizer = MosesPunctNormalizer(lang="es")
en_normalizer = MosesPunctNormalizer(lang="en")

en_tokenizer = MosesTokenizer(lang="en")

SAMPLE_RATE = 16000


@dataclass
class TDF:
    """
    channel: int
        channel of utterance
    start: int
        start time of utterance
    end: int
Example #28
class Model:
    def __init__(self, expdir):
        self.expdir = expdir
        self.en_tok = MosesTokenizer(lang="en")
        self.en_normalizer = MosesPunctNormalizer()
        self.en_detok = MosesDetokenizer(lang="en")
        self.xliterator = unicode_transliterate.UnicodeIndicTransliterator()
        print("Initializing vocab and bpe")
        self.vocabulary = read_vocabulary(
            codecs.open(f"{expdir}/vocab/vocab.SRC", encoding="utf-8"), 5
        )
        self.bpe = BPE(
            codecs.open(f"{expdir}/vocab/bpe_codes.32k.SRC", encoding="utf-8"),
            -1,
            "@@",
            self.vocabulary,
            None,
        )

        print("Initializing model for translation")
        # initialize the model
        self.translator = Translator(
            f"{expdir}/final_bin", f"{expdir}/model/checkpoint_best.pt", batch_size=100
        )

    # translate a batch of sentences from src_lang to tgt_lang
    def batch_translate(self, batch, src_lang, tgt_lang):

        assert isinstance(batch, list)
        preprocessed_sents = self.preprocess(batch, lang=src_lang)
        bpe_sents = self.apply_bpe(preprocessed_sents)
        tagged_sents = apply_lang_tags(bpe_sents, src_lang, tgt_lang)
        tagged_sents = truncate_long_sentences(tagged_sents)

        translations = self.translator.translate(tagged_sents)
        postprocessed_sents = self.postprocess(translations, tgt_lang)

        return postprocessed_sents

    # translate a paragraph from src_lang to tgt_lang
    def translate_paragraph(self, paragraph, src_lang, tgt_lang):

        assert isinstance(paragraph, str)
        sents = split_sentences(paragraph, src_lang)

        postprocessed_sents = self.batch_translate(sents, src_lang, tgt_lang)

        translated_paragraph = " ".join(postprocessed_sents)

        return translated_paragraph

    def preprocess_sent(self, sent, normalizer, lang):
        if lang == "en":
            return " ".join(
                self.en_tok.tokenize(
                    self.en_normalizer.normalize(sent.strip()), escape=False
                )
            )
        else:
            # line = indic_detokenize.trivial_detokenize(line.strip(), lang)
            return unicode_transliterate.UnicodeIndicTransliterator.transliterate(
                " ".join(
                    indic_tokenize.trivial_tokenize(
                        normalizer.normalize(sent.strip()), lang
                    )
                ),
                lang,
                "hi",
            ).replace(" ् ", "्")

    def preprocess(self, sents, lang):
        """
        Normalize, tokenize and script-convert (for Indic) the input sentences;
        returns the list of processed sentences.

        """

        if lang == "en":

            # processed_sents = Parallel(n_jobs=-1, backend="multiprocessing")(
            #     delayed(preprocess_line)(line, None, lang) for line in tqdm(sents, total=num_lines)
            # )
            processed_sents = [
                self.preprocess_sent(line, None, lang) for line in tqdm(sents)
            ]

        else:
            normfactory = indic_normalize.IndicNormalizerFactory()
            normalizer = normfactory.get_normalizer(lang)

            # processed_sents = Parallel(n_jobs=-1, backend="multiprocessing")(
            #     delayed(preprocess_line)(line, normalizer, lang) for line in tqdm(infile, total=num_lines)
            # )
            processed_sents = [
                self.preprocess_sent(line, normalizer, lang) for line in tqdm(sents)
            ]

        return processed_sents

    def postprocess(self, sents, lang, common_lang="hi"):
        """
        parse fairseq interactive output, convert script back to native Indic script (in case of Indic languages) and detokenize.

        infname: fairseq log file
        outfname: output file of translation (sentences not translated contain the dummy string 'DUMMY_OUTPUT')
        input_size: expected number of output sentences
        lang: language
        """
        postprocessed_sents = []

        if lang == "en":
            for sent in sents:
                # outfile.write(en_detok.detokenize(sent.split(" ")) + "\n")
                postprocessed_sents.append(self.en_detok.detokenize(sent.split(" ")))
        else:
            for sent in sents:
                outstr = indic_detokenize.trivial_detokenize(
                    self.xliterator.transliterate(sent, common_lang, lang), lang
                )
                # outfile.write(outstr + "\n")
                postprocessed_sents.append(outstr)
        return postprocessed_sents

    def apply_bpe(self, sents):

        return [self.bpe.process_line(sent) for sent in sents]
Example #29
 def __init__(self, lang, lowercase):
     self.mpn = MosesPunctNormalizer()
     self.tokenizer = MosesTokenizer(lang=lang)
     self.detokenizer = MosesDetokenizer(lang=lang)
     self.lowercase = lowercase
     self.lang = lang
Example #30
"""
Normalizes and tokenizes every sentence in a corpus using the Moses normalizer and tokenizer.

Takes three arguments:
* the language of the corpus (language code)
* the path to the corpus file
* the path to the output file
"""
from sacremoses import MosesPunctNormalizer, MosesTokenizer
import argparse

parser = argparse.ArgumentParser()
parser.add_argument("lang", type=str, help="Language of the corpus")
parser.add_argument("f_in", type=str, help="Path to the corpus")
parser.add_argument("f_out", type=str, help="Output path")

args = parser.parse_args()

normalizer = MosesPunctNormalizer(args.lang)
tokenizer = MosesTokenizer(args.lang)

with open(args.f_in,
          'r', encoding='UTF-8') as f_in, open(args.f_out,
                                               'w',
                                               encoding='UTF-8') as f_out:
    for line in f_in:
        line = line.strip()
        if line != '':
            line = normalizer.normalize(line)
            line = tokenizer.tokenize(line, return_str=True, escape=False)
            f_out.write(line + '\n')