def input_pipeline(sentence, lang, bpe=None):
    """
    1. word segmentation (zh)
    2. lowercasing (en)
    3. tokenize
    4. bpe
    """
    if lang == 'zh':
        seg = [term.word for term in HanLP.segment(sentence)]
        seg_str = ' '.join(seg)
        #print('after segmentation:', seg)
        mt = MosesTokenizer(lang='zh')
        tokenized_str = mt.tokenize(seg_str, return_str=True)
        #print('after tokenization:', tokenized_str)
        if bpe is not None:
            bpe_str = bpe.apply([tokenized_str])[0]
            #print('after BPE:', bpe_str)
            return bpe_str.split()
        return tokenized_str.split()
    elif lang == 'en':
        lower = sentence.lower()
        #print('after lowercasing:', lower)
        mt = MosesTokenizer(lang='en')
        tokenized_str = mt.tokenize(lower, return_str=True)
        #print('after tokenization:', tokenized_str)
        if bpe is not None:
            bpe_str = bpe.apply([tokenized_str])[0]
            #print('after BPE:', bpe_str)
            return bpe_str.split()
        return tokenized_str.split()
    else:
        raise ValueError(f"Unsupported language: {lang}")
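# Hypothetical usage sketch for the English path of input_pipeline (no BPE model),
# assuming `from sacremoses import MosesTokenizer` is in scope.
tokens = input_pipeline("Hello, world!", lang='en')
print(tokens)  # ['hello', ',', 'world', '!']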
def clean(
        l1="C:/Users/azaninello/Desktop/experiments/nuovo/exp1/zing_phrases_examples",
        l2="C:/Users/azaninello/Desktop/experiments/nuovo/exp1/zing_phrases_examples"
):
    # MT is presumably sacremoses' MosesTokenizer imported under an alias
    en_tok = MT(lang='en')
    it_tok = MT(lang='it')
    with open(l1, "r", encoding="utf-8") as en, open(l2, "r", encoding="utf-8") as it:
        en_text = en.readlines()
        it_text = it.readlines()
    with open("STOCAZZO.en", "w+", encoding="utf-8") as cl_en, \
         open("DAJE.it", "w+", encoding="utf-8") as cl_it:
        c = 0
        for line_en, line_it in zip(en_text, it_text):
            # undo Moses' HTML escaping of apostrophes and quotes
            line_en = " ".join(en_tok.tokenize(line_en)).lower().replace(
                "&apos;", "'").replace("&quot;", '"')
            line_it = " ".join(it_tok.tokenize(line_it)).lower().replace(
                "&apos;", "'").replace("&quot;", '"')
            cl_en.write(line_en + "\n")
            cl_it.write(line_it + "\n")
            c += 1
            if c % 500 == 0:
                print("Processed {} sentences".format(c))
def score(path_to_segmentation: str, path_to_reference: str) -> None:
    path_to_segmentation = Path(path_to_segmentation)
    path_to_reference = Path(path_to_reference)

    # init tokenizer and detokenizer
    mt, md = MosesTokenizer(lang="de"), MosesDetokenizer(lang="de")

    # extract the reference sentences from the xml file
    reference = []
    with open(path_to_reference, "r", encoding="utf-8") as f:
        for line in f.read().splitlines():
            if line[:4] == "<seg":
                reference.append(
                    line.split(">", maxsplit=1)[1].split("</seg>")[0])

    scores = {}
    for path_to_segmentation_file_i in path_to_segmentation.glob("own_*.xml"):
        max_segm_len = int(path_to_segmentation_file_i.stem.split("_")[-1])

        # extract generated translations from the xml file
        segm_translation = load_segm_file(path_to_segmentation_file_i)

        # detokenize (have to tokenize first with the python implementation of Moses)
        segm_translation = [
            md.detokenize(mt.tokenize(s)) for s in segm_translation
        ]
        assert len(reference) == len(segm_translation)

        # get bleu score
        bleu = sacrebleu.corpus_bleu(segm_translation, [reference])
        scores[max_segm_len] = bleu.score

    scores = dict(sorted(scores.items()))

    # do the same process for the original segmentation
    path_to_original_segmentation_file = path_to_segmentation / "original_segm.xml"
    original_segm_translation = load_segm_file(path_to_original_segmentation_file)
    original_segm_translation = [
        md.detokenize(mt.tokenize(s)) for s in original_segm_translation
    ]
    assert len(reference) == len(original_segm_translation)
    bleu = sacrebleu.corpus_bleu(original_segm_translation, [reference])
    scores["original"] = bleu.score

    for n, s in scores.items():
        print(f"{n}: {s} BLEU")
class EnThTranslator:
    def __init__(self):
        self._tokenizer = MosesTokenizer("en")

        self._model_name = _EN_TH_MODEL_NAME

        _download_install(self._model_name)
        self._model = TransformerModel.from_pretrained(
            model_name_or_path=_get_translate_path(
                self._model_name,
                _EN_TH_FILE_NAME,
                "models",
            ),
            checkpoint_file="checkpoint.pt",
            data_name_or_path=_get_translate_path(
                self._model_name,
                _EN_TH_FILE_NAME,
                "vocab",
            ),
        )

    def translate(self, text: str) -> str:
        """
        Translate text from English to Thai

        :param str text: input text in source language
        :return: translated text in target language
        :rtype: str
        """
        tokens = " ".join(self._tokenizer.tokenize(text))
        translated = self._model.translate(tokens)
        return translated.replace(" ", "").replace("▁", " ").strip()
def _moses_tokenize(text, lang):
    """
    Tokenize a given string using moses tokenizer

    Tokenization: https://github.com/alvations/sacremoses
    """
    from sacremoses import MosesTokenizer

    mt = MosesTokenizer(lang)
    return [string_unescape(t) for t in mt.tokenize(text)]
def generate(corpus: Optional[str] = None, test: Optional[str] = None):
    moses = MosesTokenizer(lang='fr')
    pos_tagged = loadCorpusFromStr(corpus)
    grammar_str = FormatGrammarAsCFG(InductGrammar(pos_tagged))
    grammar = nltk.CFG.fromstring(grammar_str.split('\n'))
    parsed = []
    valid = []
    not_valide = []
    for s in test.split('$'):
        try:
            tagged_sent = [
                token[1] for token in tagger.tag(moses.tokenize(s, escape=False))
            ]
            parsed = parse(tagged_sent, grammar)
            if parsed is not None:
                valid.append((s, str(parsed)))
            else:
                not_valide.append(s)
        except Exception:
            not_valide.append(s)
    return {
        "grammar": grammar_str,
        "test_results": {
            "valide": valid,
            "not_valide": not_valide
        }
    }
class SacreMosesTokenizer(object):
    def __init__(self):
        try:
            from sacremoses import MosesTokenizer
            self._tokenizer = MosesTokenizer()
        except (ImportError, TypeError) as err:
            print('sacremoses is not installed. '
                  'To install sacremoses, use pip install -U sacremoses '
                  'Now try NLTKMosesTokenizer using NLTK ...')
            raise

    def __call__(self, sample, return_str=False):
        """
        Parameters
        ----------
        sample: str
            The sentence to tokenize
        return_str: bool, default False
            True: return a single string
            False: return a list of tokens

        Returns
        -------
        ret : list of strs or str
            List of tokens or tokenized text
        """
        return self._tokenizer.tokenize(sample, return_str=return_str, escape=False)
class Tokenizer:
    def __init__(self, command=None, l="en"):
        if command:
            self.tokenizer = ToolWrapper(command.split(' '))
            self.external = True
            self.spm = command.find('spm_encode') > -1
        else:
            self.tokenizer = MosesTokenizer(lang=l)
            self.external = False
            self.spm = False

    def tokenize(self, text):
        if self.external:
            self.tokenizer.writeline(text.rstrip('\n'))
            return [
                no_escaping(t)
                for t in self.tokenizer.readline().rstrip('\n').split()
            ]
        else:
            return self.tokenizer.tokenize(text, escape=False)

    def detokenize(self, text):
        if self.spm:
            return ''.join(text).replace('\u2581', ' ')
        else:
            return ' '.join(text)

    def close(self):
        if self.external:
            try:
                self.tokenizer.close()
            except Exception:
                return
def get_tokenized_review_list():
    mt = MosesTokenizer()
    reviews = get_reviews()[0]
    tokenized_list = [
        mt.tokenize(review_text, escape=False) for review_text in reviews
    ]
    return (tokenized_list, reviews[1])
class MosesPreprocessingFunc():
    def __init__(self, lang: str):
        self.mt = MosesTokenizer(lang)

    def __call__(self, t: str) -> str:
        return self.mt.tokenize(t, return_str=True, escape=True)
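# Hypothetical usage of MosesPreprocessingFunc, assuming sacremoses is installed.
preproc = MosesPreprocessingFunc("en")
print(preproc("It's a test."))  # "It &apos;s a test ." -- escape=True keeps Moses' HTML entities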
def build_vocab(num):
    vocab_file = f"vocab_{num}.txt"
    vocab_list = list()
    tokenizer = MosesTokenizer(lang='en')
    i = 0
    for file in os.listdir("dataset/train/body"):
        filename = os.fsdecode(file)
        with open(f"dataset/train/body/{filename}", 'r', encoding='utf-8') as in_file:
            corpus_lines = in_file.readlines()
        # tokenize line by line: MosesTokenizer.tokenize expects a string, not a list
        corpus_lines = [
            tokenizer.tokenize(line, return_str=True) for line in corpus_lines
        ]
        for line in corpus_lines:
            for word in line.split():
                if word.lower() not in vocab_list:
                    vocab_list.append(word.lower())
                    i += 1
                if i >= 1000:
                    break
    with open(vocab_file, 'w', encoding='utf-8') as out_file:
        for word in vocab_list:
            out_file.write(f"{word}\n")
class PyMosesTokenizer(GenericTokenizer):
    """
    The call to standard moses tokenizer
    """
    def __init__(self, lang, lowercase):
        self.mpn = MosesPunctNormalizer()
        self.tokenizer = MosesTokenizer(lang=lang)
        self.detokenizer = MosesDetokenizer(lang=lang)
        self.lowercase = lowercase
        self.lang = lang

    def tokenize(self, text):
        return self.tokenizer.tokenize(
            self.mpn.normalize(text.lower() if self.lowercase else text))

    def detokenize(self, tokenized_list):
        temp_result = ""
        t_list_len = len(tokenized_list)
        for t_ind, token in enumerate(tokenized_list):
            # glue a possessive "'s" or a "/" onto the previous token
            apos_cnd = (token == "'" and t_ind < t_list_len - 1
                        and tokenized_list[t_ind + 1] == "s")
            if apos_cnd or token == "/":
                temp_result = temp_result.strip() + token
            else:
                temp_result += token + " "
        f_result = self.detokenizer.detokenize(temp_result.strip().split())
        # drop the stray space when the string ends in punctuation + space + double quote
        if (len(f_result) > 3 and f_result[-3] in string.punctuation
                and f_result[-2] == " " and f_result[-1] == "\""):
            f_result = f_result[:-2] + f_result[-1]
        return f_result

    @property
    def model_name(self):
        return "Moses"
class MosesTokenizer(object):
    @staticmethod
    def add_args(parser):
        # fmt: off
        parser.add_argument('--moses-source-lang', default='en', metavar='SRC',
                            help='source language')
        parser.add_argument('--moses-target-lang', default='en', metavar='TARGET',
                            help='target language')
        parser.add_argument('--moses-no-dash-splits', action='store_true', default=False,
                            help='don\'t apply dash split rules')
        parser.add_argument('--moses-no-escape', action='store_true', default=False,
                            help='don\'t perform HTML escaping on apostrophes, quotes, etc.')
        # fmt: on

    def __init__(self, args):
        self.args = args
        try:
            from sacremoses import MosesTokenizer, MosesDetokenizer
            self.tok = MosesTokenizer(args.moses_source_lang)
            self.detok = MosesDetokenizer(args.moses_target_lang)
        except ImportError:
            raise ImportError('Please install Moses tokenizer with: pip install sacremoses')

    def encode(self, x: str) -> str:
        return self.tok.tokenize(
            x,
            aggressive_dash_splits=(not self.args.moses_no_dash_splits),
            return_str=True,
            escape=(not self.args.moses_no_escape),
        )

    def decode(self, x: str) -> str:
        return self.detok.detokenize(x.split())
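# Hypothetical usage of the fairseq-style MosesTokenizer wrapper above,
# assuming sacremoses is installed; the Namespace mirrors the add_args flags.
from argparse import Namespace

args = Namespace(moses_source_lang='en', moses_target_lang='en',
                 moses_no_dash_splits=False, moses_no_escape=True)
wrapper = MosesTokenizer(args)
encoded = wrapper.encode("Hello, world!")   # 'Hello , world !'
restored = wrapper.decode(encoded)          # 'Hello, world!'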
class MosesTokenizer(Tokenizer):
    def __init__(self,
                 language,
                 glossaries=None,
                 aggressive_dash_splits=True,
                 escape=False):
        super(MosesTokenizer, self).__init__(language=language,
                                             glossaries=glossaries)
        self._aggressive_dash_splits = aggressive_dash_splits
        self._escape = escape
        try:
            from sacremoses import MosesDetokenizer as MDetok
            from sacremoses import MosesTokenizer as MTok
            self._tok = MTok(lang=self.language)
            self._detok = MDetok(lang=self.language)
        except ImportError:
            raise ImportError(
                'Please install Moses tokenizer with: pip3 install sacremoses')

    def tokenize(self, text, return_str=False):
        return self._tok.tokenize(
            self._convert_to_str(text),
            aggressive_dash_splits=self._aggressive_dash_splits,
            return_str=return_str,
            escape=self._escape,
            protected_patterns=self._glossaries)

    def detokenize(self, text, return_str=True):
        return self._detok.detokenize(self._convert_to_list(text),
                                      return_str=return_str,
                                      unescape=True)
class MosesProcessor:
    """
    Tokenizer, Detokenizer and Normalizer utilities in Moses
    """
    def __init__(self, lang_id: str):
        self.moses_tokenizer = MosesTokenizer(lang=lang_id)
        self.moses_detokenizer = MosesDetokenizer(lang=lang_id)
        self.normalizer = MosesPunctNormalizer(lang=lang_id)

    def detokenize(self, tokens: List[str]) -> str:
        """
        Detokenizes a list of tokens

        Args:
            tokens: list of strings as tokens

        Returns:
            detokenized string
        """
        return self.moses_detokenizer.detokenize(tokens)

    def tokenize(self, text: str):
        """
        Tokenizes text with Moses and returns the tokenized string.
        """
        return self.moses_tokenizer.tokenize(text, escape=False, return_str=True)

    def normalize(self, text: str):
        return self.normalizer.normalize(text)
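# Hypothetical usage of MosesProcessor, assuming sacremoses is installed and
# MosesTokenizer, MosesDetokenizer, MosesPunctNormalizer and typing.List are in scope.
proc = MosesProcessor(lang_id="en")
clean = proc.normalize("Hello,   world…")       # normalize punctuation and whitespace
tokens_str = proc.tokenize(clean)               # e.g. 'Hello , world ...'
print(proc.detokenize(tokens_str.split()))      # e.g. 'Hello, world...'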
class MosesTokenizer(Tokenizer):
    def __init__(self):
        super().__init__()
        self._tokenizer = SacreMosesTokenizer()
        self._detokenizer = MosesDetokenizer()

    def tokenize(self, sentence):
        return self._tokenizer.tokenize(sentence)

    def detokenize(self, tokens):
        """Unescape Moses punctuation tokens.

        Replaces escape sequences like &#91; with the original characters
        (such as '['), so they better align to the original text.
        """
        return [self._detokenizer.unescape_xml(t) for t in tokens]

    def detokenize_ptb(self, tokens):
        # Not a perfect detokenizer, but a "good-enough" stand in.
        rep_dict = {
            "-LSB-": "[",
            "-RSB-": "]",
            "-LRB-": "(",
            "-RRB-": ")",
            "-LCB-": "{",
            "-RCB-": "}",
            "``": '"',
            "''": '"',
        }
        str1 = self._detokenizer.detokenize(replace_list(tokens, rep_dict))
        return str1
class MosesPreTokenizer:
    def __init__(self, lng, do_lowercase):
        self.mpn = MosesPunctNormalizer()
        self.moses_tokenizer = MosesTokenizer(lang=lng)
        self.do_lowercase = do_lowercase

    def pre_tokenize(self, text):
        return self.moses_tokenizer.tokenize(
            self.mpn.normalize(text.lower() if self.do_lowercase else text))
def tokenize_captions(captions, lang='en'):
    """Tokenizes captions list with Moses tokenizer.
    """
    tokenizer = MosesTokenizer(lang=lang)
    return [
        tokenizer.tokenize(caption, return_str=True) for caption in captions
    ]
def tokenize(txt, to_lower=False):
    assert isinstance(txt, str)
    tokenizer = MosesTokenizer()
    lines = txt.split('\n')
    t = [tokenizer.tokenize(line) for line in lines]
    if to_lower:
        return [[word.lower() for word in line] for line in t]
    else:
        return t
def run_sentence_bleu(candidates: list, references: list, language: str) -> list:
    """ Runs sentence BLEU from Sacrebleu. """
    tokenizer = MosesTokenizer(lang=language)
    candidates = [tokenizer.tokenize(mt, return_str=True) for mt in candidates]
    references = [
        tokenizer.tokenize(ref, return_str=True) for ref in references
    ]
    assert len(candidates) == len(references)
    bleu_scores = []
    for i in tqdm(range(len(candidates)), desc="Running BLEU..."):
        bleu_scores.append(
            corpus_bleu([
                candidates[i],
            ], [[
                references[i],
            ]]).score)
    return bleu_scores
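# Hypothetical usage of run_sentence_bleu, assuming the imports the function relies on
# (sacremoses' MosesTokenizer, sacrebleu's corpus_bleu, tqdm) are in scope.
candidates = ["the cat sat on the mat"]
references = ["the cat is on the mat"]
scores = run_sentence_bleu(candidates, references, language="en")
print(scores)  # one sentence-level BLEU score per candidate/reference pair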
def preprocess(source_lang, tcmodel, escape):
    mtok = MosesTokenizer(lang=source_lang)
    mtr = MosesTruecaser(tcmodel)
    sys.stderr.write("model loaded\n")
    for line in sys.stdin:
        tokenized = mtok.tokenize(line, escape=escape)
        truecased = mtr.truecase(" ".join(tokenized))
        sys.stderr.write("sentence processed\n")
        sys.stdout.buffer.write((" ".join(truecased) + "\n").encode("utf-8"))
        sys.stdout.flush()
class Tokenizer:
    def __init__(self, language):
        self.language = language
        self.tokenizer = MosesTokenizer(lang=language)

    def __repr__(self):
        return f"Tokenizer({self.language})"

    def __call__(self, line):
        return " ".join(self.tokenizer.tokenize(line, escape=False))
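# Hypothetical usage of the Tokenizer wrapper above, assuming sacremoses is installed.
tok = Tokenizer("en")
print(repr(tok))             # Tokenizer(en)
print(tok("Hello, world!"))  # 'Hello , world !'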
class MosesTokenizerWrapper(AbstractTokenizer):
    def __init__(self, do_lower_case: bool = False, escape: bool = False):
        self._tokenizer = MosesTokenizer()
        self._do_lower_case = do_lower_case
        self._escape = escape

    def tokenize_single(self, sentence: str):
        if self._do_lower_case:
            sentence = sentence.lower()
        return self._tokenizer.tokenize(sentence, escape=self._escape)
def get_moses_tokenizer(lang):
    try:
        moses_tokenizer = MosesTokenizer(lang=lang)
    except Exception:
        print("WARNING: Moses doesn't have tokenizer for", lang)
        moses_tokenizer = MosesTokenizer(lang='en')
    # string IN -> string OUT
    tokenizer = lambda x: moses_tokenizer.tokenize(x, return_str=True)
    return tokenizer
class Tokenizer(BatchProcessor):
    # default args: ["-a", "-no-escape"]
    def __init__(self, lang, args=["-a"]):
        self.handler = MosesTokenizer(lang=lang)
        self.escape = not ("-no-escape" in args or "--no-escape" in args)
        self.aggressive = "-a" in args

    def process(self, input):
        return self.handler.tokenize(input,
                                     aggressive_dash_splits=self.aggressive,
                                     return_str=True,
                                     escape=self.escape)
def main():
    tic = time.time()
    args = parse_args()
    logging.basicConfig(level=logging.DEBUG)
    logging.debug(args)

    if args.tokenize:
        tokenizer = MosesTokenizer(lang=args.lang)

    lines = sys.stdin.readlines()
    all_tokens = []
    for line in lines:
        if args.tokenize:
            t = tokenizer.tokenize(line)
        else:
            t = line.split()
        all_tokens.append(t)

    flat_tokens = chain.from_iterable(all_tokens)
    counter = Counter(flat_tokens)
    # try to free up memory early
    del flat_tokens

    logging.debug("Vocabulary size before/after/max_allowed = %d/%d/%d" %
                  (len(counter.keys()),
                   min(args.vocab_size, len(counter.keys())),
                   args.vocab_size))

    vocabulary = [
        token for token, frequency in counter.most_common(args.vocab_size)
    ]

    for tokens in all_tokens:
        output_tokens = []
        for token in tokens:
            if token in vocabulary:
                output_tokens.append(token)
            else:
                output_tokens.append(args.unk_string)
        output_string = " ".join(output_tokens)
        sys.stdout.write(output_string + "\n")

    toc = time.time() - tic
    logging.debug("Time taken: %f seconds" % toc)
class MosesTokenizerFunc(BaseTokenizer):
    "Wrapper around a MosesTokenizer to make it a `BaseTokenizer`."
    def __init__(self, lang: str):
        self.tok = MosesTokenizer(lang)

    def tokenizer(self, t: str) -> List[str]:
        return self.tok.tokenize(t, return_str=False, escape=False)

    def add_special_cases(self, toks: Collection[str]):
        for w in toks:
            assert len(self.tokenizer(w)) == 1, \
                f"Tokenizer is unable to keep {w} as one token!"
def _test_tokenize(self, test_file, language='en'):
    """
    Compares MosesTokenizer's output to the output of the original Perl script.
    """
    tokenizer = MosesTokenizer(lang=language)

    # Tokenize test file with the original Perl script and the given flags
    path_gold = self._create_gold(test_file, language)

    # Compare to output of original Perl script
    with open(test_file, encoding='utf-8') as u, open(path_gold, encoding='utf-8') as g:
        for text, gold in zip(u, g):
            tokenized = tokenizer.tokenize(text, return_str=True)
            self.assertEqual(tokenized.rstrip(), gold.rstrip())
class EnThTranslator:
    """
    English-Thai Machine Translation

    from VISTEC-depa Thailand Artificial Intelligence Research Institute

    Website: https://airesearch.in.th/releases/machine-translation-models/
    """
    def __init__(self):
        self._tokenizer = MosesTokenizer("en")

        self._model_name = _EN_TH_MODEL_NAME

        _download_install(self._model_name)
        self._model = TransformerModel.from_pretrained(
            model_name_or_path=_get_translate_path(
                self._model_name,
                _EN_TH_FILE_NAME,
                "models",
            ),
            checkpoint_file="checkpoint.pt",
            data_name_or_path=_get_translate_path(
                self._model_name,
                _EN_TH_FILE_NAME,
                "vocab",
            ),
        )

    def translate(self, text: str) -> str:
        """
        Translate text from English to Thai

        :param str text: input text in source language
        :return: translated text in target language
        :rtype: str

        :Example:

        Translate text from English to Thai::

            from pythainlp.translate import EnThTranslator

            enth = EnThTranslator()

            enth.translate("I love cat.")
            # output: ฉันรักแมว
        """
        tokens = " ".join(self._tokenizer.tokenize(text))
        translated = self._model.translate(tokens)
        return translated.replace(" ", "").replace("▁", " ").strip()
def translate(self,
              text: List[str],
              source_lang: str = None,
              target_lang: str = None) -> List[str]:
    """
    Translates a list of sentences from source language to target language.
    Should be regular text; this method performs its own tokenization/de-tokenization.

    Args:
        text: list of strings to translate
        source_lang: if not None, corresponding MosesTokenizer and MosesPunctNormalizer will be run
        target_lang: if not None, corresponding MosesDetokenizer will be run

    Returns:
        list of translated strings
    """
    mode = self.training
    if source_lang != "None":
        tokenizer = MosesTokenizer(lang=source_lang)
        normalizer = MosesPunctNormalizer(lang=source_lang)
    if target_lang != "None":
        detokenizer = MosesDetokenizer(lang=target_lang)
    try:
        self.eval()
        res = []
        for txt in text:
            if source_lang != "None":
                txt = normalizer.normalize(txt)
                txt = tokenizer.tokenize(txt, escape=False, return_str=True)
            ids = self.encoder_tokenizer.text_to_ids(txt)
            ids = [self.encoder_tokenizer.bos_id] + ids + [self.encoder_tokenizer.eos_id]
            src = torch.Tensor(ids).long().to(self._device).unsqueeze(0)
            src_mask = torch.ones_like(src)
            src_hiddens = self.encoder(input_ids=src, encoder_mask=src_mask)
            beam_results = self.beam_search(
                encoder_hidden_states=src_hiddens, encoder_input_mask=src_mask)
            beam_results = self.filter_predicted_ids(beam_results)
            translation_ids = beam_results.cpu()[0].numpy()
            translation = self.decoder_tokenizer.ids_to_text(translation_ids)
            if target_lang != "None":
                translation = detokenizer.detokenize(translation.split())
            res.append(translation)
    finally:
        self.train(mode=mode)
    return res