Example No. 1
    def __init__(self,
                 lang: str = 'en',
                 lower_case: bool = True,
                 romanize: bool = False,
                 descape: bool = False):
        assert lower_case, 'lower case is needed by all the models'

        if lang in ('cmn', 'wuu', 'yue'):
            lang = 'zh'
        if lang == 'jpn':
            lang = 'ja'

        if lang == 'zh':
            raise NotImplementedError('jieba is not yet implemented')
        if lang == 'ja':
            raise NotImplementedError('mecab is not yet implemented')
        if romanize:
            raise NotImplementedError('romanize is not yet implemented')

        self.lower_case = lower_case
        self.romanize = romanize
        self.descape = descape

        self.normalizer = MosesPunctNormalizer(lang=lang)
        self.tokenizer = MosesTokenizer(lang=lang)
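
The wrapper above reduces to a normalize → tokenize → lower-case pipeline. A minimal standalone sketch of that pipeline with plain sacremoses (the enclosing class and how it is invoked are not shown in the snippet, so the usage below is an assumption):

from sacremoses import MosesPunctNormalizer, MosesTokenizer

normalizer = MosesPunctNormalizer(lang='en')
tokenizer = MosesTokenizer(lang='en')

line = 'Hello, "world" -- this is a test.'
normalized = normalizer.normalize(line)
# escape=False keeps characters such as & and " literal instead of HTML-escaping them
tokenized = tokenizer.tokenize(normalized, return_str=True, escape=False)
print(tokenized.lower())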
Example No. 2
    def __init__(self,
                 lang: str = 'en',
                 lower_case: bool = True,
                 romanize: Optional[bool] = None,
                 descape: bool = False):
        assert lower_case, 'lower case is needed by all the models'

        if lang in ('cmn', 'wuu', 'yue'):
            lang = 'zh'
        if lang == 'jpn':
            lang = 'ja'

        if lang == 'zh' and jieba is None:
            raise ModuleNotFoundError(
                '''No module named 'jieba'. Install laserembeddings with 'zh' extra to fix that: "pip install laserembeddings[zh]"'''
            )
        if lang == 'ja' and MeCab is None:
            raise ModuleNotFoundError(
                '''No module named 'MeCab'. Install laserembeddings with 'ja' extra to fix that: "pip install laserembeddings[ja]"'''
            )

        self.lang = lang
        self.lower_case = lower_case
        self.romanize = romanize if romanize is not None else lang == 'el'
        self.descape = descape

        self.normalizer = MosesPunctNormalizer(lang=lang)
        self.tokenizer = MosesTokenizer(lang=lang)
        self.mecab_tokenizer = MeCab.Tagger(
            "-O wakati -b 50000") if lang == 'ja' else None
Example No. 3
    def __init__(self, language: str = "en") -> None:
        """
        MosesNormalizer constructor.

        Parameters
        ----------
        language : str
            Language argument for the normalizer. Default: "en".

        Raises
        ------
        ImportError
            If sacremoses is not installed.
        """
        try:
            from sacremoses import MosesPunctNormalizer
        except ImportError:
            print(
                "Problem occured while trying to import sacremoses. "
                "If the library is not installed visit "
                "https://github.com/alvations/sacremoses for more details."
            )
            raise

        self._normalizer = MosesPunctNormalizer(language)
Example No. 4
 def __init__(self, lang_id: str):
     self.lang_id = lang_id
     self.moses_tokenizer = MosesTokenizer(lang=lang_id)
     self.moses_detokenizer = MosesDetokenizer(lang=lang_id)
     self.normalizer = MosesPunctNormalizer(lang=lang_id,
                                            pre_replace_unicode_punct=True,
                                            post_remove_control_chars=True)
Example No. 5
    def _setup_normalizer(self):
        try:
            from sacremoses import MosesPunctNormalizer

            self.punc_normalizer = MosesPunctNormalizer(self.source_lang).normalize
        except (ImportError, FileNotFoundError):
            warnings.warn("Recommended: pip install sacremoses.")
            self.punc_normalizer = lambda x: x
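
The same graceful-fallback pattern as a standalone helper (a sketch, not code from the snippet above; the function name is hypothetical):

import warnings

def make_punct_normalizer(lang='en'):
    # Returns MosesPunctNormalizer.normalize, or the identity function when sacremoses is missing.
    try:
        from sacremoses import MosesPunctNormalizer
        return MosesPunctNormalizer(lang).normalize
    except (ImportError, FileNotFoundError):
        warnings.warn('Recommended: pip install sacremoses.')
        return lambda x: x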
Example No. 6
 def __init__(self, lang, lowercase=True):
     # the tokenizer names are the same for BertTokenizer and PreTrainedTokenizer, since both are distributed by Hugging Face
     pre_trained_model_name = PreTrainedTokenizer.get_default_model_name(
         lang, lowercase)
     self.tokenizer = BertTokenizer.from_pretrained(pre_trained_model_name)
     self.mpn = MosesPunctNormalizer()
     self.detokenizer = MosesDetokenizer(lang=lang)
     self._model_name_ = pre_trained_model_name
Example No. 7
    def __init__(self, lm_type: LMType, language: str, tokenizer_command):
        """
            lm_type: LMType
            language: language code
            tokenizer_command: tokenizer full command (with flags if needed)
        """

        self.language = language
        self.tokenizer = Tokenizer(tokenizer_command, self.language)
        self.normalizer = MosesPunctNormalizer(lang=self.language)
        self.type = lm_type
Example No. 8
    def __init__(self, exp):
        self.exp = exp
        self.tokr = MosesTokenizer()
        self.detokr = MosesDetokenizer()
        self.punct_normr = MosesPunctNormalizer()
        #self.true_caser = MosesTruecaser()

        self.punct_normalize = True
        self.tokenize = True
        self.html_unesc = True
        self.drop_unks = True
        #self.truecase = True
        self.detokenize = True
Example No. 9
def Token(inp_fname,
          out_fname,
          lang='en',
          lower_case=True,
          romanize=False,
          descape=False,
          verbose=False,
          over_write=False,
          gzip=False):
    assert lower_case, 'lower case is needed by all the models'
    assert not over_write, 'over-write is not yet implemented'
    if not os.path.isfile(out_fname):
        cat = 'zcat ' if gzip else 'cat '
        roman = lang if romanize else 'none'
        # handle some ISO 639-3 language codes
        if lang in ('cmn', 'wuu', 'yue'):
            lang = 'zh'
        if lang == 'jpn':
            lang = 'ja'
        if verbose:
            print(' - Tokenizer: {} in language {} {} {} {}'.format(
                os.path.basename(inp_fname), lang, '(gzip)' if gzip else '',
                '(de-escaped)' if descape else '',
                '(romanized)' if romanize else ''))
        '''
        run(cat + inp_fname
            + '|' + REM_NON_PRINT_CHAR
            + '|' + NORM_PUNC + lang
            + ('|' + DESCAPE if descape else '')
            + '|' + MOSES_TOKENIZER + lang
            + ('| python3 -m jieba -d ' if lang == 'zh' else '')
            + ('|' + MECAB + '/bin/mecab -O wakati -b 50000 ' if lang == 'ja' else '')
            + '|' + ROMAN_LC + roman
            + '>' + out_fname,
            env=dict(os.environ, LD_LIBRARY_PATH=MECAB + '/lib'),
            shell=True)
        '''
        curNormalizer = sacremoses_norm_punct.get(
            lang, MosesPunctNormalizer(lang=lang))
        curTokenizer = sacremoses_tokenizers.get(lang,
                                                 MosesTokenizer(lang=lang))
        with open(out_fname, 'w') as outF:
            for line in sys.stdin:
                tok_norm_punct = curNormalizer.normalize(line)
                tok = curTokenizer.tokenize(tok_norm_punct,
                                            return_str=True,
                                            escape=False)
                outF.write(tok.strip() + "\n")
    elif not over_write and verbose:
        print(' - Tokenizer: {} exists already'.format(
            os.path.basename(out_fname)))
Example No. 10
 def translate(self,
               text: List[str],
               source_lang: str = None,
               target_lang: str = None) -> List[str]:
     """
     Translates list of sentences from source language to target language.
     Expects regular text; this method performs its own tokenization/de-tokenization.
     Args:
         text: list of strings to translate
         source_lang: if not None, corresponding MosesTokenizer and MosesPunctNormalizer will be run
         target_lang: if not None, corresponding MosesDetokenizer will be run
     Returns:
         list of translated strings
     """
     mode = self.training
     if source_lang != "None":
         tokenizer = MosesTokenizer(lang=source_lang)
         normalizer = MosesPunctNormalizer(lang=source_lang)
     if target_lang != "None":
         detokenizer = MosesDetokenizer(lang=target_lang)
     try:
         self.eval()
         res = []
         for txt in text:
             if source_lang != "None":
                 txt = normalizer.normalize(txt)
                 txt = tokenizer.tokenize(txt,
                                          escape=False,
                                          return_str=True)
             ids = self.encoder_tokenizer.text_to_ids(txt)
             ids = [self.encoder_tokenizer.bos_id
                    ] + ids + [self.encoder_tokenizer.eos_id]
             src = torch.Tensor(ids).long().to(self._device).unsqueeze(0)
             src_mask = torch.ones_like(src)
             src_hiddens = self.encoder(input_ids=src,
                                        encoder_mask=src_mask)
             beam_results = self.beam_search(
                 encoder_hidden_states=src_hiddens,
                 encoder_input_mask=src_mask)
             beam_results = self.filter_predicted_ids(beam_results)
             translation_ids = beam_results.cpu()[0].numpy()
             translation = self.decoder_tokenizer.ids_to_text(
                 translation_ids)
             if target_lang != "None":
                 translation = detokenizer.detokenize(translation.split())
             res.append(translation)
     finally:
         self.train(mode=mode)
     return res
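
The Moses pre- and post-processing that translate() wraps around the model, shown in isolation (the decoder output string below is illustrative, not a real model prediction):

from sacremoses import MosesPunctNormalizer, MosesTokenizer, MosesDetokenizer

normalizer = MosesPunctNormalizer(lang='en')
tokenizer = MosesTokenizer(lang='en')
detokenizer = MosesDetokenizer(lang='de')

txt = normalizer.normalize('Hello, world!')
txt = tokenizer.tokenize(txt, escape=False, return_str=True)  # what gets fed to encoder_tokenizer
# ... model inference would happen here ...
hypothetical_output = 'Hallo , Welt !'                        # illustrative decoder output
print(detokenizer.detokenize(hypothetical_output.split()))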
Example No. 11
def get_normalize_preprocessor():
    """
    get sacremoses normalization preprocessor
    >>> processor = get_normalize_preprocessor()
    >>> text = "Hi…"
    >>> processor(text)
    'Hi...'
    """
    from sacremoses import MosesPunctNormalizer
    mpn = MosesPunctNormalizer()

    def preprocessor(line):
        return mpn.normalize(line)

    return preprocessor
Example No. 12
def lowercase_and_remove_punctuations(language,
                                      text,
                                      lowercase=True,
                                      remove_punctuation=True):
    if lowercase:
        text = text.lower()
    if language not in ["zh", "ja"]:
        if language not in PUNC_NORMERS:
            PUNC_NORMERS[language] = MosesPunctNormalizer(lang=language)
        text = PUNC_NORMERS[language].normalize(text)
        text = text.replace("' s ", "'s ").replace("' ve ", "'ve ").replace(
            "' m ", "'m ").replace("' t ", "'t ").replace("' re ", "'re ")
    if remove_punctuation:
        text = PUNC_PATTERN.sub(" ", text)
    text = " ".join(text.strip().split())
    return text
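
The function relies on two module-level globals that are not shown in the snippet. A hypothetical definition consistent with how they are used (the exact regex is an assumption):

import re
from sacremoses import MosesPunctNormalizer

PUNC_NORMERS = {}                      # per-language cache filled lazily by the function above
PUNC_PATTERN = re.compile(r"[^\w\s]")  # hypothetical pattern: drop anything that is not a word character or whitespace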
Example No. 13
def prepare_csv(path, cleanup=False):
    if path.endswith('.processed.csv'):
        return path

    compression = 'xz' if path.endswith('.xz') else None
    df = pd.read_csv(
        path,
        compression=compression,
        sep='\t',
        encoding='utf-8',
        usecols=['id', 'hyperp', 'bias', 'publisher', 'title',
                 'text']).dropna()

    print(' > Cleaning up data')
    df.bias = df.bias.astype('category')
    df.publisher = df.publisher

    print(' > Initializing Moses')
    mt = MosesTokenizer()
    mn = MosesPunctNormalizer()

    line_blacklist, token_blacklist = set(), set()
    if cleanup:
        print(' > Generating line blacklist')
        line_blacklist = get_line_blacklist(df)
        print(' > Generating token blacklist')
        token_blacklist = get_token_blacklist(df)

    tqdm.pandas()
    print(' > Processing titles')
    df.title = df.title.apply(clean_text).apply(mn.normalize).progress_apply(
        mt.tokenize, return_str=True)

    print(' > Processing texts')
    df.text = df.text.apply(clean_text,
                            line_blacklist=line_blacklist,
                            token_blacklist=token_blacklist).progress_apply(
                                mn.normalize).progress_apply(mt.tokenize,
                                                             return_str=True)

    new_path = path.replace('.xz', '').replace('.csv', '.processed.csv')
    df.to_csv(new_path, index=False)

    return new_path
Example No. 14
def old_preprocess(infname, outfname, lang):
    """
    Preparing each corpus file:
      - Normalization
      - Tokenization
      - Script conversion to Devanagari for Indic scripts
    """
    n = 0
    num_lines = sum(1 for line in open(infname, "r"))
    # reading
    with open(infname, "r",
              encoding="utf-8") as infile, open(outfname,
                                                "w",
                                                encoding="utf-8") as outfile:

        if lang == "en":
            en_tok = MosesTokenizer(lang="en")
            en_normalizer = MosesPunctNormalizer()
            for line in tqdm(infile, total=num_lines):
                outline = " ".join(
                    en_tok.tokenize(en_normalizer.normalize(line.strip()),
                                    escape=False))
                outfile.write(outline + "\n")
                n += 1

        else:
            normfactory = indic_normalize.IndicNormalizerFactory()
            normalizer = normfactory.get_normalizer(lang)
            for line in tqdm(infile, total=num_lines):
                outline = (unicode_transliterate.UnicodeIndicTransliterator.
                           transliterate(
                               " ".join(
                                   indic_tokenize.trivial_tokenize(
                                       normalizer.normalize(line.strip()),
                                       lang)),
                               lang,
                               "hi",
                           ).replace(" ् ", "्"))

                outfile.write(outline + "\n")
                n += 1
    return n
Example No. 15
def TokenLine(line, lang='en', lower_case=True, romanize=False):
    assert lower_case, 'lower case is needed by all the models'
    '''
    roman = lang if romanize else 'none'
    tok = check_output(
            REM_NON_PRINT_CHAR
            + '|' + NORM_PUNC + lang
            + '|' + DESCAPE
            + '|' + MOSES_TOKENIZER + lang
            + ('| python3 -m jieba -d ' if lang == 'zh' else '')
            + ('|' + MECAB + '/bin/mecab -O wakati -b 50000 ' if lang == 'ja' else '')
            + '|' + ROMAN_LC + roman,
            input=line,
            encoding='UTF-8',
            shell=True)
    '''
    tok_norm_punct = sacremoses_norm_punct.get(
        lang, MosesPunctNormalizer(lang=lang)).normalize(line)
    tok = sacremoses_tokenizers.get(lang, MosesTokenizer(lang=lang))\
        .tokenize(tok_norm_punct, return_str=True, escape=False)
    return tok.strip()
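
Example calls, assuming the module-level caches sacremoses_norm_punct and sacremoses_tokenizers from Example No. 22 are in scope:

print(TokenLine('Hello, "world"!', lang='en'))
print(TokenLine('Hej, "världen"!', lang='sv'))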
Example No. 16
    def __init__(self, expdir):
        self.expdir = expdir
        self.en_tok = MosesTokenizer(lang="en")
        self.en_normalizer = MosesPunctNormalizer()
        self.en_detok = MosesDetokenizer(lang="en")
        self.xliterator = unicode_transliterate.UnicodeIndicTransliterator()
        print("Initializing vocab and bpe")
        self.vocabulary = read_vocabulary(
            codecs.open(f"{expdir}/vocab/vocab.SRC", encoding="utf-8"), 5
        )
        self.bpe = BPE(
            codecs.open(f"{expdir}/vocab/bpe_codes.32k.SRC", encoding="utf-8"),
            -1,
            "@@",
            self.vocabulary,
            None,
        )

        print("Initializing model for translation")
        # initialize the model
        self.translator = Translator(
            f"{expdir}/final_bin", f"{expdir}/model/checkpoint_best.pt", batch_size=100
        )
Example No. 17
 def __init__(self, lang_id: str):
     if lang_id != 'hi':
         raise NotImplementedError
     self.moses_tokenizer = MosesTokenizer(lang=lang_id)
     self.moses_detokenizer = MosesDetokenizer(lang=lang_id)
     self.normalizer = MosesPunctNormalizer(lang=lang_id)
Example No. 18
 def __init__(self, lang, lowercase):
     self.mpn = MosesPunctNormalizer()
     self.tokenizer = MosesTokenizer(lang=lang)
     self.detokenizer = MosesDetokenizer(lang=lang)
     self.lowercase = lowercase
     self.lang = lang
Example No. 19
"""
Normalizes and tokenizes every sentence in a corpus using the Moses normalizer and tokenizer.

Takes three arguments:
* the language of the corpus (language code)
* the path to the corpus file
* the path to the output file
"""
from sacremoses import MosesPunctNormalizer, MosesTokenizer
import argparse

parser = argparse.ArgumentParser()
parser.add_argument("lang", type=str, help="Language of the corpus")
parser.add_argument("f_in", type=str, help="Path to the corpus")
parser.add_argument("f_out", type=str, help="Output path")

args = parser.parse_args()

normalizer = MosesPunctNormalizer(args.lang)
tokenizer = MosesTokenizer(args.lang)

with open(args.f_in,
          'r', encoding='UTF-8') as f_in, open(args.f_out,
                                               'w',
                                               encoding='UTF-8') as f_out:
    for line in f_in:
        line = line.strip()
        if line != '':
            line = normalizer.normalize(line)
            line = tokenizer.tokenize(line, return_str=True, escape=False)
            f_out.write(line + '\n')
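
Example invocation (the script filename is hypothetical):

python normalize_tokenize.py de corpus.de corpus.tok.de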
Example No. 20
    def translate(self,
                  text: List[str],
                  source_lang: str = None,
                  target_lang: str = None) -> List[str]:
        """
        Translates list of sentences from source language to target language.
        Expects regular text; this method performs its own tokenization/de-tokenization.
        Args:
            text: list of strings to translate
            source_lang: if not None, corresponding MosesTokenizer and MosesPunctNormalizer will be run
            target_lang: if not None, corresponding MosesDetokenizer will be run
        Returns:
            list of translated strings
        """
        if source_lang is None:
            source_lang = self.src_language
        if target_lang is None:
            target_lang = self.tgt_language

        mode = self.training
        if source_lang not in ['zh', 'ja']:
            tokenizer = MosesTokenizer(lang=source_lang)
            normalizer = MosesPunctNormalizer(lang=source_lang)
        elif source_lang == 'ja':
            raise NotImplementedError(
                "Input tokenization for Japanese is not implemented yet")
        elif source_lang == 'zh':
            normalizer = opencc.OpenCC('t2s.json')

        if target_lang not in ['zh']:
            detokenizer = MosesDetokenizer(lang=target_lang)
        elif target_lang == 'zh':
            detokenizer = PanguJiebaDetokenizer()

        try:
            self.eval()
            res = []
            for txt in text:
                if source_lang != "None":
                    if source_lang == "zh":
                        txt = normalizer.convert(txt)
                        txt = ' '.join(jieba.cut(txt))
                    else:
                        txt = normalizer.normalize(txt)
                        txt = tokenizer.tokenize(txt,
                                                 escape=False,
                                                 return_str=True)
                ids = self.encoder_tokenizer.text_to_ids(txt)
                ids = [self.encoder_tokenizer.bos_id
                       ] + ids + [self.encoder_tokenizer.eos_id]
                src = torch.Tensor(ids).long().to(self._device).unsqueeze(0)
                src_mask = torch.ones_like(src)
                src_hiddens = self.encoder(input_ids=src,
                                           encoder_mask=src_mask)
                beam_results = self.beam_search(
                    encoder_hidden_states=src_hiddens,
                    encoder_input_mask=src_mask)
                beam_results = self.filter_predicted_ids(beam_results)
                translation_ids = beam_results.cpu()[0].numpy()
                translation = self.decoder_tokenizer.ids_to_text(
                    translation_ids)
                if target_lang == 'ja':
                    sp_detokenizer = SentencePieceDetokenizer()
                    translation = sp_detokenizer.detokenize(
                        translation.split())
                translation = detokenizer.detokenize(translation.split())
                res.append(translation)
        finally:
            self.train(mode=mode)
        return res
Example No. 21
import chardet
import re

from sacremoses import MosesPunctNormalizer

mosesNorm = MosesPunctNormalizer()

DATES = re.compile(r'\d{1,2}\/\d{1,2}\/\d{4}$|\d{1,2}\.\d{1,2}\.\d{4}')
LIST = re.compile(r'^([0-9][\.\-\)])+([0-9]+|/s)(?!([a-zA-Z]|\%))')
EMAIL = re.compile(r'[a-zA-Z0-9_\-\.]+@[a-zA-Z0-9_\-\.]+\.[a-zA-Z]{2,6}')
LINK = re.compile(
    r'(http|ftp|https|www)(://)?([\w_-]+(?:(?:\.[\w_-]+)+))([\w.,@?^=%&:/~+#-]*[\w@?^=%&/~+#-])?'
)


def turnLink(sent_s, sent_t):
    link = LINK.findall(sent_s)
    if link:
        link = ''.join(link[0])
        sent_t = re.sub('LINNK', link, sent_t)
        sent_t = re.sub('linnk', link, sent_t)
    return sent_t


def turnDates(sent_s, sent_t):
    date = DATES.findall(sent_s)
    if date:
        sent_t = re.sub('DAATTE', date[0], sent_t)
        sent_t = re.sub('daatte', date[0], sent_t)
    return sent_t
Example No. 22
MOSES_BDIR = LASER + '/tools-external/moses-tokenizer/tokenizer/'
MOSES_TOKENIZER = MOSES_BDIR + 'tokenizer.perl -q -no-escape -threads 20 -l '
MOSES_LC = MOSES_BDIR + 'lowercase.perl'
NORM_PUNC = MOSES_BDIR + 'normalize-punctuation.perl -l '
DESCAPE = MOSES_BDIR + 'deescape-special-chars.perl'
REM_NON_PRINT_CHAR = MOSES_BDIR + 'remove-non-printing-char.perl'

# Romanization (Greek only)
ROMAN_LC = 'python3 ' + LASER + '/source/lib/romanize_lc.py -l '

# Mecab tokenizer for Japanese
MECAB = LASER + '/tools-external/mecab'

# Dictionaries to keep all sacremoses objects for different languages
sacremoses_norm_punct = {
    'en': MosesPunctNormalizer(lang='en'),
    'sv': MosesPunctNormalizer(lang='sv')
}
sacremoses_tokenizers = {
    'en': MosesTokenizer(lang='en'),
    'sv': MosesTokenizer(lang='sv')
}

###############################################################################
#
# Tokenize a line of text
#
###############################################################################


def TokenLine(line, lang='en', lower_case=True, romanize=False):
Example No. 23
loader.load()
from sacremoses import MosesPunctNormalizer
from sacremoses import MosesTokenizer
from sacremoses import MosesDetokenizer
from collections import defaultdict

from tqdm import tqdm
from joblib import Parallel, delayed

from indicnlp.tokenize import indic_tokenize
from indicnlp.tokenize import indic_detokenize
from indicnlp.normalize import indic_normalize
from indicnlp.transliterate import unicode_transliterate

en_tok = MosesTokenizer(lang="en")
en_normalizer = MosesPunctNormalizer()


def preprocess_line(line, normalizer, lang, transliterate=False):
    if lang == "en":
        return " ".join(
            en_tok.tokenize(en_normalizer.normalize(line.strip()),
                            escape=False))
    elif transliterate:
        # line = indic_detokenize.trivial_detokenize(line.strip(), lang)
        return unicode_transliterate.UnicodeIndicTransliterator.transliterate(
            " ".join(
                indic_tokenize.trivial_tokenize(
                    normalizer.normalize(line.strip()), lang)),
            lang,
            "hi",
Example No. 24
 def __init__(self, lang_id: str):
     self.moses_tokenizer = MosesTokenizer(lang=lang_id)
     self.moses_detokenizer = MosesDetokenizer(lang=lang_id)
     self.normalizer = MosesPunctNormalizer(lang=lang_id)
Example No. 25
    if normalizer:
        normalizer: MosesPunctNormalizer = normalizer

    def tokenizer(text: str) -> List[Token]:
        if normalizer:
            text = normalizer.normalize(text=text)
        doc = moses_tokenizer.span_tokenize(text=text, escape=False)
        previous_token = None
        tokens: List[Token] = []
        for word, (start_pos, end_pos) in doc:
            word: str = word
            token = Token(text=word,
                          start_position=start_pos,
                          whitespace_after=True)
            tokens.append(token)

            if (previous_token is not None) and (token.start_pos - 1
                                                 == previous_token.start_pos +
                                                 len(previous_token.text)):
                previous_token.whitespace_after = False

            previous_token = token
        return tokens

    return tokenizer


MOSES_TOKENIZER = build_moses_tokenizer(
    tokenizer=MosesTokenizerSpans(lang="fr"),
    normalizer=MosesPunctNormalizer(lang="fr"))
Example No. 26
	def __init__(self, lang):

		self.handler = MosesPunctNormalizer(lang=lang)
Example No. 27
 def __init__(self, lng, do_lowercase):
     self.mpn = MosesPunctNormalizer()
     self.moses_tokenizer = MosesTokenizer(lang=lng)
     self.do_lowercase = do_lowercase
Example No. 28
from tqdm import tqdm
from speechbrain.utils.data_utils import get_all_files
from speechbrain.utils.torch_audio_backend import check_torchaudio_backend
from speechbrain.processing.speech_augmentation import Resample

try:
    from sacremoses import MosesPunctNormalizer, MosesTokenizer
except ImportError:
    err_msg = "The optional dependency sacremoses must be installed to run this recipe.\n"
    err_msg += "Install using `pip install sacremoses`.\n"
    raise ImportError(err_msg)

logger = logging.getLogger(__name__)
check_torchaudio_backend()

es_normalizer = MosesPunctNormalizer(lang="es")
en_normalizer = MosesPunctNormalizer(lang="en")

en_tokenizer = MosesTokenizer(lang="en")

SAMPLE_RATE = 16000


@dataclass
class TDF:
    """
    channel: int
        channel of utterance
    start: int
        start time of utterance
    end: int
Example No. 29
    tmp_data = news_item(
        url=kaz_url,
        section=kaz_section,
        title=kaz_title,
        date_time=kaz_date_time,
        text=kaz_text,
    )

    news_items.append(tmp_data)

# tokenize and normalize the collected data, and run morphological analysis;
# the result of each processing step is written to its own list(namedtuple)

# tokenization
tokenized_news_items = []
mpn = MosesPunctNormalizer()
mtok = MosesTokenizer()

for item in tqdm(news_items):
    tokenized_text = mpn.normalize(text=item.text)
    tokenized_text = mtok.tokenize(text=tokenized_text, return_str=True)

    tmp_data = news_item(
        url=item.url,
        section=item.section,
        title=item.title,
        date_time=item.date_time,
        text=tokenized_text,
    )

    tokenized_news_items.append(tmp_data)
Example No. 30
def row_to_tsv(row):
    return f'{row.lang}\t{row.ds_name}\t{row.src}\t{row.eng}'

log.basicConfig(level=log.INFO)


import sacremoses
print(sacremoses.__version__)

normr = MosesPunctNormalizer(
        lang='en',
        norm_quote_commas=True,
        norm_numbers=True,
        pre_replace_unicode_punct=True,
        post_remove_control_chars=True,
    )
tok = MosesTokenizer(lang='en')

def tokenize_eng(text):
    try:
        text=unescape(text)
        text = normr.normalize(text)
        text = tok.tokenize(text, escape=False, return_str=True, aggressive_dash_splits=True,
            protected_patterns=tok.WEB_PROTECTED_PATTERNS)
        return text
    except Exception:
        if text:
            log.exception(f"error: {text}")