def __init__(self, language: str = "en") -> None: """ MosesNormalizer constructor. Parameters ---------- language : str Language argument for the normalizer. Default: "en". Raises ------ ImportError If sacremoses is not installed. """ try: from sacremoses import MosesPunctNormalizer except ImportError: print( "Problem occured while trying to import sacremoses. " "If the library is not installed visit " "https://github.com/alvations/sacremoses for more details." ) raise self._normalizer = MosesPunctNormalizer(language)
def __init__(self,
             lang: str = 'en',
             lower_case: bool = True,
             romanize: bool = False,
             descape: bool = False):
    assert lower_case, 'lower case is needed by all the models'

    if lang in ('cmn', 'wuu', 'yue'):
        lang = 'zh'
    if lang == 'jpn':
        lang = 'ja'

    if lang == 'zh':
        raise NotImplementedError('jieba is not yet implemented')
    if lang == 'ja':
        raise NotImplementedError('mecab is not yet implemented')
    if romanize:
        raise NotImplementedError('romanize is not yet implemented')

    self.lower_case = lower_case
    self.romanize = romanize
    self.descape = descape

    self.normalizer = MosesPunctNormalizer(lang=lang)
    self.tokenizer = MosesTokenizer(lang=lang)
def __init__(self,
             lang: str = 'en',
             lower_case: bool = True,
             romanize: Optional[bool] = None,
             descape: bool = False):
    assert lower_case, 'lower case is needed by all the models'

    if lang in ('cmn', 'wuu', 'yue'):
        lang = 'zh'
    if lang == 'jpn':
        lang = 'ja'

    if lang == 'zh' and jieba is None:
        raise ModuleNotFoundError(
            '''No module named 'jieba'. Install laserembeddings with 'zh' extra to fix that: "pip install laserembeddings[zh]"'''
        )
    if lang == 'ja' and MeCab is None:
        raise ModuleNotFoundError(
            '''No module named 'MeCab'. Install laserembeddings with 'ja' extra to fix that: "pip install laserembeddings[ja]"'''
        )

    self.lang = lang
    self.lower_case = lower_case
    self.romanize = romanize if romanize is not None else lang == 'el'
    self.descape = descape

    self.normalizer = MosesPunctNormalizer(lang=lang)
    self.tokenizer = MosesTokenizer(lang=lang)
    self.mecab_tokenizer = MeCab.Tagger(
        "-O wakati -b 50000") if lang == 'ja' else None
def __init__(self, lm_type: LMType, language: str, tokenizer_command):
    """
    lm_type: LMType
    language: language code
    tokenizer_command: tokenizer full command (with flags if needed)
    """
    self.language = language
    self.tokenizer = Tokenizer(tokenizer_command, self.language)
    self.normalizer = MosesPunctNormalizer(lang=self.language)
    self.type = lm_type
def translate(self, text: List[str], source_lang: str = None, target_lang: str = None) -> List[str]:
    """
    Translates a list of sentences from the source language to the target language.
    Expects regular text; this method performs its own tokenization/de-tokenization.

    Args:
        text: list of strings to translate
        source_lang: if not None, the corresponding MosesTokenizer and MosesPunctNormalizer will be run
        target_lang: if not None, the corresponding MosesDetokenizer will be run

    Returns:
        list of translated strings
    """
    mode = self.training
    if source_lang != "None":
        tokenizer = MosesTokenizer(lang=source_lang)
        normalizer = MosesPunctNormalizer(lang=source_lang)
    if target_lang != "None":
        detokenizer = MosesDetokenizer(lang=target_lang)
    try:
        self.eval()
        res = []
        for txt in text:
            if source_lang != "None":
                txt = normalizer.normalize(txt)
                txt = tokenizer.tokenize(txt, escape=False, return_str=True)
            ids = self.encoder_tokenizer.text_to_ids(txt)
            ids = [self.encoder_tokenizer.bos_id] + ids + [self.encoder_tokenizer.eos_id]
            src = torch.Tensor(ids).long().to(self._device).unsqueeze(0)
            src_mask = torch.ones_like(src)
            src_hiddens = self.encoder(input_ids=src, encoder_mask=src_mask)
            beam_results = self.beam_search(
                encoder_hidden_states=src_hiddens, encoder_input_mask=src_mask)
            beam_results = self.filter_predicted_ids(beam_results)
            translation_ids = beam_results.cpu()[0].numpy()
            translation = self.decoder_tokenizer.ids_to_text(translation_ids)
            if target_lang != "None":
                translation = detokenizer.detokenize(translation.split())
            res.append(translation)
    finally:
        self.train(mode=mode)
    return res
class MosesProcessor:
    """
    Tokenizer, Detokenizer and Normalizer utilities in Moses
    """

    def __init__(self, lang_id: str):
        self.moses_tokenizer = MosesTokenizer(lang=lang_id)
        self.moses_detokenizer = MosesDetokenizer(lang=lang_id)
        self.normalizer = MosesPunctNormalizer(lang=lang_id)

    def detokenize(self, tokens: List[str]) -> str:
        """
        Detokenizes a list of tokens

        Args:
            tokens: list of strings as tokens

        Returns:
            detokenized string
        """
        return self.moses_detokenizer.detokenize(tokens)

    def tokenize(self, text: str):
        """
        Tokenizes text using Moses -> Sentencepiece.
        """
        return self.moses_tokenizer.tokenize(text, escape=False, return_str=True)

    def normalize(self, text: str):
        return self.normalizer.normalize(text)
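# Usage sketch for MosesProcessor (assumes the imports of the original module, e.g.
# sacremoses and typing.List, are in place; the sample sentence is made up):
processor = MosesProcessor(lang_id="en")
normalized = processor.normalize("Hi… what's up?")   # the unicode ellipsis becomes "..."
tokens = processor.tokenize(normalized).split()
print(processor.detokenize(tokens))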
class PTBertTokenizer:
    """
    The pre-trained tokenizer trained alongside BERT, as distributed by huggingface
    """

    def __init__(self, lang, lowercase=True):
        # the tokenizer names are the same for BertTokenizer and PreTrainedTokenizer
        # since both are distributed by huggingface
        pre_trained_model_name = PreTrainedTokenizer.get_default_model_name(lang, lowercase)
        self.tokenizer = BertTokenizer.from_pretrained(pre_trained_model_name)
        self.mpn = MosesPunctNormalizer()
        self.detokenizer = MosesDetokenizer(lang=lang)
        self._model_name_ = pre_trained_model_name

    def tokenize(self, text):
        return self.tokenizer.tokenize(self.mpn.normalize(text))

    def detokenize(self, tokenized_list):
        # WARNING! this is a one-way tokenizer: the detokenized sentences do not
        # necessarily align with the originally tokenized sentences!
        return self.tokenizer.decode(
            self.tokenizer.convert_tokens_to_ids(tokenized_list))

    @staticmethod
    def get_default_model_name(lang, lowercase):
        return PreTrainedTokenizer.get_default_model_name(lang, lowercase)

    @property
    def model_name(self):
        return self._model_name_
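# Standalone sketch of the same normalize-then-subword-tokenize pattern, using the
# public "bert-base-uncased" checkpoint directly instead of the project-specific
# PreTrainedTokenizer.get_default_model_name lookup (an assumption made for the
# example; requires the transformers package):
from sacremoses import MosesPunctNormalizer
from transformers import BertTokenizer

mpn = MosesPunctNormalizer()
bert_tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

pieces = bert_tokenizer.tokenize(mpn.normalize("Hi… this is a test."))
# lossy round-trip, as the WARNING above notes
print(bert_tokenizer.decode(bert_tokenizer.convert_tokens_to_ids(pieces)))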
class PyMosesTokenizer(GenericTokenizer):
    """
    The call to the standard Moses tokenizer
    """

    def __init__(self, lang, lowercase):
        self.mpn = MosesPunctNormalizer()
        self.tokenizer = MosesTokenizer(lang=lang)
        self.detokenizer = MosesDetokenizer(lang=lang)
        self.lowercase = lowercase
        self.lang = lang

    def tokenize(self, text):
        return self.tokenizer.tokenize(
            self.mpn.normalize(text.lower() if self.lowercase else text))

    def detokenize(self, tokenized_list):
        temp_result = ""
        t_list_len = len(tokenized_list)
        for t_ind, token in enumerate(tokenized_list):
            # glue a trailing "'s" or "/" onto the previous token
            apos_cnd = (token == "'" and t_ind < t_list_len - 1
                        and tokenized_list[t_ind + 1] == "s")
            if apos_cnd or token == "/":
                temp_result = temp_result.strip() + token
            else:
                temp_result += token + " "
        f_result = self.detokenizer.detokenize(temp_result.strip().split())
        # drop the stray space before a closing double quote that follows punctuation
        if (len(f_result) > 3 and f_result[-3] in string.punctuation
                and f_result[-2] == " " and f_result[-1] == "\""):
            f_result = f_result[:-2] + f_result[-1]
        return f_result

    @property
    def model_name(self):
        return "Moses"
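# Usage sketch (assumes the GenericTokenizer base class and the module's imports,
# including string and sacremoses, are available; the sample text is illustrative):
moses_tok = PyMosesTokenizer(lang="en", lowercase=True)
tokens = moses_tok.tokenize("The dog's bone is gone…")
print(moses_tok.detokenize(tokens))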
def _setup_normalizer(self):
    try:
        from sacremoses import MosesPunctNormalizer

        self.punc_normalizer = MosesPunctNormalizer(self.source_lang).normalize
    except (ImportError, FileNotFoundError):
        warnings.warn("Recommended: pip install sacremoses.")
        self.punc_normalizer = lambda x: x
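# Standalone sketch of the same graceful-fallback pattern outside a class; the
# variable names and the "en" language choice are made up for the example:
import warnings

try:
    from sacremoses import MosesPunctNormalizer
    punct_normalize = MosesPunctNormalizer("en").normalize
except (ImportError, FileNotFoundError):
    warnings.warn("Recommended: pip install sacremoses.")
    punct_normalize = lambda x: x  # no-op fallback when sacremoses is missing

print(punct_normalize("Hi… «quotes» and   extra   spaces"))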
class MosesPreTokenizer:

    def __init__(self, lng, do_lowercase):
        self.mpn = MosesPunctNormalizer()
        self.moses_tokenizer = MosesTokenizer(lang=lng)
        self.do_lowercase = do_lowercase

    def pre_tokenize(self, text):
        return self.moses_tokenizer.tokenize(
            self.mpn.normalize(text.lower() if self.do_lowercase else text))
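# Usage sketch for MosesPreTokenizer (assumes sacremoses is imported in the module;
# the sample sentence is illustrative):
pre_tokenizer = MosesPreTokenizer(lng="en", do_lowercase=True)
print(pre_tokenizer.pre_tokenize("Hello, World…"))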
class Normalizepunctuation(BatchProcessor):

    def __init__(self, lang):
        self.handler = MosesPunctNormalizer(lang=lang)

    def process(self, input):
        return self.handler.normalize(input)
class PunctNormalizer:

    def __init__(self, language):
        self.language = language
        self.normalizer = MosesPunctNormalizer(lang=language)

    def __repr__(self):
        return f"PunctNormalizer({self.language})"

    def __call__(self, line):
        return self.normalizer.normalize(line).strip()
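# Usage sketch: PunctNormalizer is callable, so it can be mapped over lines
# (assumes sacremoses is imported; the sample lines are made up):
normalize_line = PunctNormalizer("en")
print([normalize_line(line) for line in ["Hi…  ", "He said «hello» "]])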
def old_preprocess(infname, outfname, lang):
    """
    Prepares each corpus file:
      - Normalization
      - Tokenization
      - Script conversion to Devanagari for Indic scripts
    """
    n = 0
    num_lines = sum(1 for line in open(infname, "r"))
    # reading
    with open(infname, "r", encoding="utf-8") as infile, open(
        outfname, "w", encoding="utf-8"
    ) as outfile:

        if lang == "en":
            en_tok = MosesTokenizer(lang="en")
            en_normalizer = MosesPunctNormalizer()
            for line in tqdm(infile, total=num_lines):
                outline = " ".join(
                    en_tok.tokenize(en_normalizer.normalize(line.strip()), escape=False)
                )
                outfile.write(outline + "\n")
                n += 1
        else:
            normfactory = indic_normalize.IndicNormalizerFactory()
            normalizer = normfactory.get_normalizer(lang)
            for line in tqdm(infile, total=num_lines):
                outline = (
                    unicode_transliterate.UnicodeIndicTransliterator.transliterate(
                        " ".join(
                            indic_tokenize.trivial_tokenize(
                                normalizer.normalize(line.strip()), lang
                            )
                        ),
                        lang,
                        "hi",
                    ).replace(" ् ", "्")
                )
                outfile.write(outline + "\n")
                n += 1

    return n
def Token(inp_fname, out_fname, lang='en',
          lower_case=True, romanize=False, descape=False,
          verbose=False, over_write=False, gzip=False):
    assert lower_case, 'lower case is needed by all the models'
    assert not over_write, 'over-write is not yet implemented'
    if not os.path.isfile(out_fname):
        cat = 'zcat ' if gzip else 'cat '
        roman = lang if romanize else 'none'
        # handle some iso3 language codes
        if lang in ('cmn', 'wuu', 'yue'):
            lang = 'zh'
        if lang == 'jpn':
            lang = 'ja'
        if verbose:
            print(' - Tokenizer: {} in language {} {} {} {}'.format(
                os.path.basename(inp_fname), lang,
                '(gzip)' if gzip else '',
                '(de-escaped)' if descape else '',
                '(romanized)' if romanize else ''))
        '''
        run(cat + inp_fname
            + '|' + REM_NON_PRINT_CHAR
            + '|' + NORM_PUNC + lang
            + ('|' + DESCAPE if descape else '')
            + '|' + MOSES_TOKENIZER + lang
            + ('| python3 -m jieba -d ' if lang == 'zh' else '')
            + ('|' + MECAB + '/bin/mecab -O wakati -b 50000 ' if lang == 'ja' else '')
            + '|' + ROMAN_LC + roman
            + '>' + out_fname,
            env=dict(os.environ, LD_LIBRARY_PATH=MECAB + '/lib'),
            shell=True)
        '''
        curNormalizer = sacremoses_norm_punct.get(lang, MosesPunctNormalizer(lang=lang))
        curTokenizer = sacremoses_tokenizers.get(lang, MosesTokenizer(lang=lang))
        with open(out_fname, 'w') as outF:
            for line in sys.stdin:
                tok_norm_punct = curNormalizer.normalize(line)
                tok = curTokenizer.tokenize(tok_norm_punct, return_str=True, escape=False)
                outF.write(tok.strip() + "\n")
    elif not over_write and verbose:
        print(' - Tokenizer: {} exists already'.format(os.path.basename(out_fname)))
def get_normalize_preprocessor():
    """
    get sacremoses normalize preprocessor

    >>> processor = get_normalize_preprocessor()
    >>> text = "Hi…"
    >>> processor(text)
    'Hi...'
    """
    from sacremoses import MosesPunctNormalizer

    mpn = MosesPunctNormalizer()

    def preprocessor(line):
        return mpn.normalize(line)

    return preprocessor
class MosesNormalizer:
    """
    Pretokenization tool that normalizes the raw textual data.

    Uses sacremoses.MosesPunctNormalizer to perform normalization.
    """

    def __init__(self, language: str = "en") -> None:
        """
        MosesNormalizer constructor.

        Parameters
        ----------
        language : str
            Language argument for the normalizer. Default: "en".

        Raises
        ------
        ImportError
            If sacremoses is not installed.
        """
        try:
            from sacremoses import MosesPunctNormalizer
        except ImportError:
            print(
                "Problem occurred while trying to import sacremoses. "
                "If the library is not installed visit "
                "https://github.com/alvations/sacremoses for more details."
            )
            raise
        self._normalizer = MosesPunctNormalizer(language)

    def __call__(self, raw: str) -> str:
        """
        Applies normalization to the raw textual data.

        Parameters
        ----------
        raw : str
            Raw textual data.

        Returns
        -------
        str
            Normalized textual data.
        """
        return self._normalizer.normalize(raw)
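# Usage sketch for MosesNormalizer: the instance is callable on raw strings
# (the sample string is illustrative):
normalizer = MosesNormalizer(language="en")
print(normalizer("Hi… «hello»"))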
def lowercase_and_remove_punctuations(language, text, lowercase=True, remove_punctuation=True):
    if lowercase:
        text = text.lower()
    if language not in ["zh", "ja"]:
        if language not in PUNC_NORMERS:
            PUNC_NORMERS[language] = MosesPunctNormalizer(lang=language)
        text = PUNC_NORMERS[language].normalize(text)
        text = text.replace("' s ", "'s ").replace("' ve ", "'ve ").replace(
            "' m ", "'m ").replace("' t ", "'t ").replace("' re ", "'re ")
    if remove_punctuation:
        text = PUNC_PATTERN.sub(" ", text)
    text = " ".join(text.strip().split())
    return text
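# lowercase_and_remove_punctuations relies on two module-level globals that are not
# shown above. A minimal sketch of plausible definitions plus a call; the regex is an
# assumption for illustration, not the original PUNC_PATTERN:
import re
from sacremoses import MosesPunctNormalizer

PUNC_NORMERS = {}                      # per-language normalizer cache
PUNC_PATTERN = re.compile(r"[^\w\s]")  # assumed: drop any non-word, non-space character

print(lowercase_and_remove_punctuations("en", "Hi… «Hello, World!»"))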
def prepare_csv(path, cleanup=False):
    if path.endswith('.processed.csv'):
        return path
    compression = 'xz' if path.endswith('.xz') else None
    df = pd.read_csv(
        path,
        compression=compression,
        sep='\t',
        encoding='utf-8',
        usecols=['id', 'hyperp', 'bias', 'publisher', 'title', 'text']).dropna()

    print(' > Cleaning up data')
    df.bias = df.bias.astype('category')
    df.publisher = df.publisher

    print(' > Initializing Moses')
    mt = MosesTokenizer()
    mn = MosesPunctNormalizer()

    line_blacklist, token_blacklist = set(), set()
    if cleanup:
        print(' > Generating line blacklist')
        line_blacklist = get_line_blacklist(df)
        print(' > Generating token blacklist')
        token_blacklist = get_token_blacklist(df)

    tqdm.pandas()
    print(' > Processing titles')
    df.title = df.title.apply(clean_text).apply(mn.normalize).progress_apply(
        mt.tokenize, return_str=True)
    print(' > Processing texts')
    df.text = df.text.apply(clean_text,
                            line_blacklist=line_blacklist,
                            token_blacklist=token_blacklist).progress_apply(
                                mn.normalize).progress_apply(mt.tokenize, return_str=True)

    new_path = path.replace('.xz', '').replace('.csv', '.processed.csv')
    df.to_csv(new_path, index=False)
    return new_path
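# Minimal in-memory sketch of the same pandas apply pattern, without the file I/O,
# blacklists, or the project-specific clean_text helper (the sample frame is made up):
import pandas as pd
from sacremoses import MosesPunctNormalizer, MosesTokenizer

mt = MosesTokenizer()
mn = MosesPunctNormalizer()

frame = pd.DataFrame({"title": ["Hi… a headline", "Another «quoted» title"]})
frame.title = frame.title.apply(mn.normalize).apply(mt.tokenize, return_str=True)
print(frame.title.tolist())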
class RtgIO:

    def __init__(self, exp):
        self.exp = exp
        self.tokr = MosesTokenizer()
        self.detokr = MosesDetokenizer()
        self.punct_normr = MosesPunctNormalizer()
        #self.true_caser = MosesTruecaser()
        self.punct_normalize = True
        self.tokenize = True
        self.html_unesc = True
        self.drop_unks = True
        #self.truecase = True
        self.detokenize = True

    def pre_process(self, text):
        # Any pre-processing on input
        if self.html_unesc:
            text = unescape(text)
        if self.punct_normalize:
            text = self.punct_normr.normalize(text)
        if self.tokenize:
            text = self.tokr.tokenize(text, escape=False, return_str=True,
                                      aggressive_dash_splits=True)
            # protected_patterns=self.tokr.WEB_PROTECTED_PATTERNS
        return text

    def post_process(self, tokens):
        # Any post-processing on output
        assert isinstance(tokens, list)
        if self.detokenize:
            text = self.detokr.detokenize(tokens=tokens, return_str=True, unescape=True)
        else:
            text = " ".join(tokens)
        if self.drop_unks:
            text = text.replace("<unk>", "")
        #if self.truecase:
        #    text = self.true_caser.truecase(text, return_str=True)
        return text
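# Usage sketch for RtgIO (assumes `from html import unescape` and the sacremoses
# imports of the original module; exp is unused by these two methods, so None is passed):
io = RtgIO(exp=None)
tokenized = io.pre_process("Tom &amp; Jerry aren't here…")
print(io.post_process(tokenized.split()))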
def TokenLine(line, lang='en', lower_case=True, romanize=False):
    assert lower_case, 'lower case is needed by all the models'
    '''
    roman = lang if romanize else 'none'
    tok = check_output(
        REM_NON_PRINT_CHAR
        + '|' + NORM_PUNC + lang
        + '|' + DESCAPE
        + '|' + MOSES_TOKENIZER + lang
        + ('| python3 -m jieba -d ' if lang == 'zh' else '')
        + ('|' + MECAB + '/bin/mecab -O wakati -b 50000 ' if lang == 'ja' else '')
        + '|' + ROMAN_LC + roman,
        input=line, encoding='UTF-8', shell=True)
    '''
    tok_norm_punct = sacremoses_norm_punct.get(
        lang, MosesPunctNormalizer(lang=lang)).normalize(line)
    tok = sacremoses_tokenizers.get(lang, MosesTokenizer(lang=lang))\
        .tokenize(tok_norm_punct, return_str=True, escape=False)
    return tok.strip()
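# TokenLine (and Token above) look up pre-built sacremoses objects in two module-level
# dicts that are not shown in the snippets. A minimal sketch of how those caches might
# be populated; the chosen languages are an assumption:
from sacremoses import MosesPunctNormalizer, MosesTokenizer

sacremoses_norm_punct = {lang: MosesPunctNormalizer(lang=lang) for lang in ("en", "de", "fr")}
sacremoses_tokenizers = {lang: MosesTokenizer(lang=lang) for lang in ("en", "de", "fr")}

print(TokenLine("Hi… «Hello, World!»", lang="en"))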
class EnJaProcessor:
    """
    Tokenizer, Detokenizer and Normalizer utilities for Japanese & English

    Args:
        lang_id: One of ['en', 'ja'].
    """

    def __init__(self, lang_id: str):
        self.lang_id = lang_id
        self.moses_tokenizer = MosesTokenizer(lang=lang_id)
        self.moses_detokenizer = MosesDetokenizer(lang=lang_id)
        self.normalizer = MosesPunctNormalizer(
            lang=lang_id,
            pre_replace_unicode_punct=True,
            post_remove_control_chars=True,
        )

    def detokenize(self, tokens: List[str]) -> str:
        """
        Detokenizes a list of tokens

        Args:
            tokens: list of strings as tokens

        Returns:
            detokenized Japanese or English string
        """
        return self.moses_detokenizer.detokenize(tokens)

    def tokenize(self, text) -> str:
        """
        Tokenizes text using Moses. Returns a string of tokens.
        """
        tokens = self.moses_tokenizer.tokenize(text)
        return ' '.join(tokens)

    def normalize(self, text) -> str:
        # Normalization doesn't handle Japanese periods correctly:
        # '。' becomes '.'.
        if self.lang_id == 'en':
            return self.normalizer.normalize(text)
        else:
            return text
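# Usage sketch: normalization is only applied on the English side, as the comment in
# normalize() explains (the sample strings are illustrative):
en_processor = EnJaProcessor(lang_id="en")
ja_processor = EnJaProcessor(lang_id="ja")
print(en_processor.normalize("Hi… ok"))      # punctuation is normalized
print(ja_processor.normalize("こんにちは。"))   # returned unchanged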
tmp_data = news_item(
    url=kaz_url,
    section=kaz_section,
    title=kaz_title,
    date_time=kaz_date_time,
    text=kaz_text,
)
news_items.append(tmp_data)

# The collected data is tokenized, normalized and morphologically analysed;
# the result of each processing step is written to its own list(namedtuple).

# tokenization
tokenized_news_items = []
mpn = MosesPunctNormalizer()
mtok = MosesTokenizer()

for item in tqdm(news_items):
    tokenized_text = mpn.normalize(text=item.text)
    tokenized_text = mtok.tokenize(text=tokenized_text, return_str=True)
    tmp_data = news_item(
        url=item.url,
        section=item.section,
        title=item.title,
        date_time=item.date_time,
        text=tokenized_text,
    )
    tokenized_news_items.append(tmp_data)
from tqdm import tqdm
from speechbrain.utils.data_utils import get_all_files
from speechbrain.utils.torch_audio_backend import check_torchaudio_backend
from speechbrain.processing.speech_augmentation import Resample

try:
    from sacremoses import MosesPunctNormalizer, MosesTokenizer
except ImportError:
    err_msg = "The optional dependency sacremoses must be installed to run this recipe.\n"
    err_msg += "Install using `pip install sacremoses`.\n"
    raise ImportError(err_msg)

logger = logging.getLogger(__name__)
check_torchaudio_backend()

es_normalizer = MosesPunctNormalizer(lang="es")
en_normalizer = MosesPunctNormalizer(lang="en")
en_tokenizer = MosesTokenizer(lang="en")

SAMPLE_RATE = 16000


@dataclass
class TDF:
    """
    channel: int
        channel of utterance
    start: int
        start time of utterance
    end: int
class Model:

    def __init__(self, expdir):
        self.expdir = expdir
        self.en_tok = MosesTokenizer(lang="en")
        self.en_normalizer = MosesPunctNormalizer()
        self.en_detok = MosesDetokenizer(lang="en")
        self.xliterator = unicode_transliterate.UnicodeIndicTransliterator()
        print("Initializing vocab and bpe")
        self.vocabulary = read_vocabulary(
            codecs.open(f"{expdir}/vocab/vocab.SRC", encoding="utf-8"), 5
        )
        self.bpe = BPE(
            codecs.open(f"{expdir}/vocab/bpe_codes.32k.SRC", encoding="utf-8"),
            -1,
            "@@",
            self.vocabulary,
            None,
        )

        print("Initializing model for translation")
        # initialize the model
        self.translator = Translator(
            f"{expdir}/final_bin", f"{expdir}/model/checkpoint_best.pt", batch_size=100
        )

    # translate a batch of sentences from src_lang to tgt_lang
    def batch_translate(self, batch, src_lang, tgt_lang):
        assert isinstance(batch, list)
        preprocessed_sents = self.preprocess(batch, lang=src_lang)
        bpe_sents = self.apply_bpe(preprocessed_sents)
        tagged_sents = apply_lang_tags(bpe_sents, src_lang, tgt_lang)
        tagged_sents = truncate_long_sentences(tagged_sents)
        translations = self.translator.translate(tagged_sents)
        postprocessed_sents = self.postprocess(translations, tgt_lang)
        return postprocessed_sents

    # translate a paragraph from src_lang to tgt_lang
    def translate_paragraph(self, paragraph, src_lang, tgt_lang):
        assert isinstance(paragraph, str)
        sents = split_sentences(paragraph, src_lang)
        postprocessed_sents = self.batch_translate(sents, src_lang, tgt_lang)
        translated_paragraph = " ".join(postprocessed_sents)
        return translated_paragraph

    def preprocess_sent(self, sent, normalizer, lang):
        if lang == "en":
            return " ".join(
                self.en_tok.tokenize(
                    self.en_normalizer.normalize(sent.strip()), escape=False
                )
            )
        else:
            # line = indic_detokenize.trivial_detokenize(line.strip(), lang)
            return unicode_transliterate.UnicodeIndicTransliterator.transliterate(
                " ".join(
                    indic_tokenize.trivial_tokenize(
                        normalizer.normalize(sent.strip()), lang
                    )
                ),
                lang,
                "hi",
            ).replace(" ् ", "्")

    def preprocess(self, sents, lang):
        """
        Normalize, tokenize and script-convert (for Indic languages);
        returns the list of processed sentences.
        """
        if lang == "en":
            # processed_sents = Parallel(n_jobs=-1, backend="multiprocessing")(
            #     delayed(preprocess_line)(line, None, lang) for line in tqdm(sents, total=num_lines)
            # )
            processed_sents = [
                self.preprocess_sent(line, None, lang) for line in tqdm(sents)
            ]
        else:
            normfactory = indic_normalize.IndicNormalizerFactory()
            normalizer = normfactory.get_normalizer(lang)
            # processed_sents = Parallel(n_jobs=-1, backend="multiprocessing")(
            #     delayed(preprocess_line)(line, normalizer, lang) for line in tqdm(infile, total=num_lines)
            # )
            processed_sents = [
                self.preprocess_sent(line, normalizer, lang) for line in tqdm(sents)
            ]
        return processed_sents

    def postprocess(self, sents, lang, common_lang="hi"):
        """
        Parse fairseq interactive output, convert the script back to the native
        Indic script (in the case of Indic languages) and detokenize.

        infname: fairseq log file
        outfname: output file of translation (sentences not translated contain the dummy string 'DUMMY_OUTPUT')
        input_size: expected number of output sentences
        lang: language
        """
        postprocessed_sents = []

        if lang == "en":
            for sent in sents:
                # outfile.write(en_detok.detokenize(sent.split(" ")) + "\n")
                postprocessed_sents.append(self.en_detok.detokenize(sent.split(" ")))
        else:
            for sent in sents:
                outstr = indic_detokenize.trivial_detokenize(
                    self.xliterator.transliterate(sent, common_lang, lang), lang
                )
                # outfile.write(outstr + "\n")
                postprocessed_sents.append(outstr)
        return postprocessed_sents

    def apply_bpe(self, sents):
        return [self.bpe.process_line(sent) for sent in sents]
Normalizes and tokenizes every sentence in a corpus using the Moses normalizer
and tokenizer. Takes three arguments:
  * the language of the corpus (language code)
  * the path to the corpus file
  * the path to the output file
"""

from sacremoses import MosesPunctNormalizer, MosesTokenizer
import argparse

parser = argparse.ArgumentParser()
parser.add_argument("lang", type=str, help="Language of the corpus")
parser.add_argument("f_in", type=str, help="Path to the corpus")
parser.add_argument("f_out", type=str, help="Output path")
args = parser.parse_args()

normalizer = MosesPunctNormalizer(args.lang)
tokenizer = MosesTokenizer(args.lang)

with open(args.f_in, 'r', encoding='UTF-8') as f_in, open(args.f_out, 'w', encoding='UTF-8') as f_out:
    for line in f_in:
        line = line.strip()
        if line != '':
            line = normalizer.normalize(line)
            line = tokenizer.tokenize(line, return_str=True, escape=False)
        f_out.write(line + '\n')