def test_normalization_pipeline(self):
    moses_norm_unicode = MosesPunctNormalizer(
        pre_replace_unicode_punct=True, post_remove_control_chars=True
    )
    text = u"0《123》 456% '' 【789】"
    expected = u'0"123" 456% " [789]'
    assert moses_norm_unicode.normalize(text) == expected
def normalize_file(iterator, language, processes, quiet):
    moses = MosesPunctNormalizer(language)
    moses_normalize = partial(moses.normalize)
    return parallel_or_not(iterator, moses_normalize, processes, quiet)
def normalize_file(
    language, processes, normalize_quote_commas, normalize_numbers, encoding
):
    moses = MosesPunctNormalizer(
        language,
        norm_quote_commas=normalize_quote_commas,
        norm_numbers=normalize_numbers,
    )
    moses_normalize = partial(moses.normalize)

    with click.get_text_stream("stdin", encoding=encoding) as fin:
        with click.get_text_stream("stdout", encoding=encoding) as fout:
            # If it's a single process, joblib parallelization is slower,
            # so just process line by line normally.
            if processes == 1:
                # TODO: Actually moses_normalize(fin.read()) gives the same output
                # and it's a lot better, but it's inconsistent with the other
                # preprocessing interfaces, so we're doing it line by line here.
                for line in tqdm(fin.readlines()):
                    # Note: not stripping newlines, so don't need end='\n' when printing to stdout.
                    print(moses_normalize(line), end="", file=fout)
            else:
                for outline in parallelize_preprocess(
                    moses_normalize, fin.readlines(), processes, progress_bar=True
                ):
                    # Note: not stripping newlines, so don't need end='\n' when printing to stdout.
                    print(outline, end="", file=fout)
def normalize_file(iterator, language, processes, quiet, replace_unicode_puncts):
    moses = MosesPunctNormalizer(
        language,
        pre_replace_unicode_punct=replace_unicode_puncts,
    )
    moses_normalize = partial(moses.normalize)
    return parallel_or_not(iterator, moses_normalize, processes, quiet)
def test_moses_normalize_documents(self):
    moses = MosesPunctNormalizer()
    # Examples from normalizing big.txt
    inputs = [
        "The United States in 1805 (color map) _Facing_ 193",
        "=Formation of the Constitution.=--(1) The plans before the convention,",
        "directions--(1) The infective element must be eliminated. When the ulcer",
        "College of Surgeons, Edinburgh.)]",
    ]
    expected = [
        "The United States in 1805 (color map) _Facing_ 193",
        "=Formation of the Constitution.=-- (1) The plans before the convention,",
        "directions-- (1) The infective element must be eliminated. When the ulcer",
        "College of Surgeons, Edinburgh.) ]",
    ]
    for text, expect in zip(inputs, expected):
        assert moses.normalize(text) == expect
def test_moses_normalize_quote_comma(self):
    moses_norm_quote = MosesPunctNormalizer("en", norm_quote_commas=True)
    moses_no_norm_quote = MosesPunctNormalizer("en", norm_quote_commas=False)
    text = 'THIS EBOOK IS OTHERWISE PROVIDED TO YOU "AS-IS".'

    expected_norm_quote = 'THIS EBOOK IS OTHERWISE PROVIDED TO YOU "AS-IS."'
    assert moses_norm_quote.normalize(text) == expected_norm_quote

    expected_no_norm_quote = 'THIS EBOOK IS OTHERWISE PROVIDED TO YOU "AS-IS".'
    assert moses_no_norm_quote.normalize(text) == expected_no_norm_quote
def test_moses_normalize_numbers(self):
    # See https://stackoverflow.com/a/55233871/610569
    moses_norm_num = MosesPunctNormalizer("en", norm_numbers=True)
    moses_no_norm_num = MosesPunctNormalizer("en", norm_numbers=False)

    # With number normalization, a non-breaking space between digits
    # becomes a decimal point.
    text = u"12{}123".format(u"\u00A0")
    expected = u"12.123"
    assert moses_norm_num.normalize(text) == expected

    # Without it, the digits pass through unchanged.
    text = expected = u"12 123"
    assert moses_no_norm_num.normalize(text) == expected
class MosesNormalizer:
    """Normalizes the input sentence.

    Currently, we support the combination of the Moses punctuation normalizer
    'normalize-punctuation.perl' and 'remove-non-printing-char.perl' in
    [mosesdecoder](https://github.com/moses-smt/mosesdecoder).
    Also, we will optionally apply a unicode normalization form to the input.

    Parameters
    ----------
    lang
        The input language
    remove_non_printable_char
        Whether to remove the non-printable unicode characters in the input
    unicode_norm_form
        The unicode normalization form used. Supported values are
        'NFC', 'NFKC', 'NFD' and 'NFKD'.
    """
    def __init__(self, lang: str, remove_non_printable_char: bool = True,
                 unicode_norm_form: Optional[str] = None):
        self._remove_non_printable_char = remove_non_printable_char
        self._moses_normalizer = MosesPunctNormalizer(lang)
        self._unicode_norm_form = unicode_norm_form
        if unicode_norm_form is not None:
            assert unicode_norm_form in ['NFC', 'NFKC', 'NFD', 'NFKD'], \
                'Unsupported unicode normalization format, you may refer to ' \
                'https://docs.python.org/3/library/unicodedata.html#unicodedata.normalize ' \
                'for more details.'
        self.__warmup()

    def __warmup(self):
        self('hello world')

    def __call__(self, sentence: str) -> str:
        if self._unicode_norm_form:
            sentence = unicodedata.normalize(self._unicode_norm_form, sentence)
        sentence = self._moses_normalizer.normalize(sentence)
        if self._remove_non_printable_char:
            return non_printing_char_regex.sub(' ', sentence)
        else:
            return sentence
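# For reference, a minimal usage sketch of the class above. It assumes
# sacremoses is installed and that non_printing_char_regex is the module-level
# compiled regex referenced by __call__; the sample sentence and expected
# output are illustrative, not taken from the original codebase.
normalizer = MosesNormalizer(lang='en', remove_non_printable_char=True,
                             unicode_norm_form='NFKC')
# NFKC folds the non-breaking space and the Moses rules straighten the curly quotes.
print(normalizer(u"Hello\u00A0“world”!"))  # e.g. 'Hello "world"!'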
def normalize_file(
    iterator,
    language,
    processes,
    quiet,
    normalize_quote_commas,
    normalize_numbers,
    replace_unicode_puncts,
    remove_control_chars,
):
    moses = MosesPunctNormalizer(
        language,
        norm_quote_commas=normalize_quote_commas,
        norm_numbers=normalize_numbers,
        pre_replace_unicode_punct=replace_unicode_puncts,
        post_remove_control_chars=remove_control_chars,
    )
    moses_normalize = partial(moses.normalize)
    return parallel_or_not(iterator, moses_normalize, processes, quiet)
def normalize_file(language, processes, encoding, quiet):
    moses = MosesPunctNormalizer(language)
    moses_normalize = partial(moses.normalize)

    def processor(iterator):
        if processes == 1:
            for line in list(iterator):
                yield moses_normalize(line)
        else:
            for outline in parallelize_preprocess(
                moses_normalize, list(iterator), processes, progress_bar=(not quiet)
            ):
                yield outline

    return processor
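# A sketch of consuming the processor returned above. The argument values and
# input lines are assumptions for illustration only.
processor = normalize_file(language="en", processes=1, encoding="utf-8", quiet=True)
for outline in processor([u"“Quoted” text\n", u"Plain text\n"]):
    print(outline, end="")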
def cleanup_transcript(language, transcript, lowercase=True, remove_punctuation=True):
    if lowercase:
        transcript = transcript.lower()
    if language not in ["zh", "ja"]:
        # Cache one punctuation normalizer per language.
        if language not in TranscriptDataPipeline.PUNC_NORMERS:
            TranscriptDataPipeline.PUNC_NORMERS[language] = MosesPunctNormalizer(lang=language)
        transcript = TranscriptDataPipeline.PUNC_NORMERS[language].normalize(transcript)
        # Re-attach contractions that were split off by a space.
        transcript = (
            transcript.replace("' s ", "'s ")
            .replace("' ve ", "'ve ")
            .replace("' m ", "'m ")
            .replace("' t ", "'t ")
            .replace("' re ", "'re ")
        )
    if remove_punctuation:
        transcript = PUNC_PATTERN.sub(" ", transcript)
    transcript = " ".join(transcript.strip().split())
    return transcript
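# An illustrative call to the helper above; it assumes PUNC_PATTERN and
# TranscriptDataPipeline.PUNC_NORMERS exist in the surrounding module, and
# the transcript text is invented.
clean = cleanup_transcript("en", "Well , it ' s a TEST !")
# yields a lowercased transcript with contractions re-joined, punctuation
# stripped, and whitespace collapsed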
def test_replace_unicode_punct(self):
    moses_norm_unicode = MosesPunctNormalizer()
    text = u"0《123》 456% 【789】"
    expected = u'0"123" 456% [789]'
    assert moses_norm_unicode.replace_unicode_punct(text) == expected
def test_moses_normalize_single_apostrophe(self):
    moses_norm = MosesPunctNormalizer("en")
    text = u"yesterday ’s reception"
    expected = u"yesterday 's reception"
    assert moses_norm.normalize(text) == expected
def get_moses_punct_normalizer(language='en'):
    return MosesPunctNormalizer(pre_replace_unicode_punct=True, lang=language)
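# A short usage sketch for the factory above. The sample input mirrors the
# unicode-punctuation tests earlier in this file; the printed result is
# indicative, not verified output.
norm = get_moses_punct_normalizer('en')
print(norm.normalize(u"0《123》 456% 【789】"))  # e.g. '0"123" 456% [789]'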
def preprocess(src_file, mt_file, output_dir, tokenize_lang=None):
    """
    Pre-process input files before post-editing:
    split at <br>, remove <i> tags and music symbols,
    and store everything in a codes file in output_dir.

    Args:
        src_file: src file of the translation to be preprocessed
        mt_file: output file of the MT system to be preprocessed
        output_dir: output directory for the preprocessed files and codes file
    """
    punct_normalizer = MosesPunctNormalizer()

    # set tokenizer
    tokenizer = None
    if tokenize_lang:
        tokenizer = MosesTokenizer(lang=tokenize_lang)

    code_file = output_dir + '/codes.' + os.path.basename(mt_file)
    src_out_file = output_dir + '/' + os.path.basename(src_file) + '.pre'
    mt_out_file = output_dir + '/' + os.path.basename(mt_file) + '.pre'

    with open(src_out_file, 'w') as fosrc, open(mt_out_file, 'w') as fomt, \
            open(code_file, 'w') as fcodes, open(src_file) as fsrc, open(mt_file) as fmt:
        idx = 0
        for src, mt in zip(fsrc, fmt):
            src, mt = src.strip(), mt.strip()
            idx += 1

            # standardize br tags
            src = re.sub(r'<\s*br\s*/*>', '<br>', src, flags=re.IGNORECASE)
            mt = re.sub(r'<\s*br\s*/*>', '<br>', mt, flags=re.IGNORECASE)

            # if the number of <br> is the same, split and save as multiple lines
            src_split = re.split(r'\s*<br>\s*', src)
            mt_split = re.split(r'\s*<br>\s*', mt)

            # if src and mt do not have the same number of <br>, do not split
            if not (len(src_split) == len(mt_split)):
                src_split = [src]
                mt_split = [mt]

            for src_part, mt_part in zip(src_split, mt_split):
                code = "{}\t".format(idx)

                # check if they start with a hyphen
                has_hyphen = False
                if src_part.startswith('-'):
                    has_hyphen = True
                    src_part = src_part[1:].lstrip()
                if mt_part.startswith('-'):
                    has_hyphen = True
                    mt_part = mt_part[1:].lstrip()

                # check if they contain a music symbol
                music_syms = ('♫', '♬', '♪')
                has_music = False
                if re.search(r'\s*[{}]\s*'.format(''.join(music_syms)), src_part):
                    has_music = True
                    src_part = re.sub(r'\s*[{}]\s*'.format(''.join(music_syms)), '', src_part)
                if re.search(r'\s*[{}]\s*'.format(''.join(music_syms)), mt_part):
                    has_music = True
                    mt_part = re.sub(r'\s*[{}]\s*'.format(''.join(music_syms)), '', mt_part)

                # check if it has enclosing italics tags; otherwise leave it as is
                itag = '<i>'
                eitag = '</i>'
                has_itag = False
                if src_part.startswith(itag) or src_part.endswith(eitag):
                    has_itag = True
                if mt_part.startswith(itag) or mt_part.endswith(eitag):
                    has_itag = True

                if has_hyphen:
                    code += 'HYPHENBEGIN\t'
                if has_music:
                    code += 'MUSIC\t'
                if has_itag:
                    code += 'ITALICTAGS\t'

                src_part = punct_normalizer.normalize(cleanup(src_part))
                mt_part = punct_normalizer.normalize(cleanup(mt_part))

                if tokenizer:
                    src_part = " ".join(tokenizer.tokenize(src_part, escape=False))
                    mt_part = " ".join(tokenizer.tokenize(mt_part, escape=False))

                fosrc.write(src_part.strip() + '\n')
                fomt.write(mt_part.strip() + '\n')
                fcodes.write("{}\n".format(code))
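# A hypothetical invocation of preprocess(); the paths are made up. Per the
# docstring above, this writes episode1.src.pre, episode1.mt.pre, and
# codes.episode1.mt into the output directory.
preprocess('data/episode1.src', 'data/episode1.mt', 'out', tokenize_lang='en')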
def main(args):
    normalizer = MosesPunctNormalizer(lang=args.lang, penn=args.penn)
    for line in sys.stdin:
        print(normalizer.normalize(line.rstrip()), flush=True)
def main(args):
    print(args, file=sys.stderr)

    if args.human_scores:
        systems = []
        scores = {}
        for line in open(args.human_scores):
            system, score = line.rstrip().split()
            scores[system] = float(score)
        for system in args.systems:
            system_name = '.'.join(os.path.basename(system).split('.')[1:-1])
            if system_name not in scores:
                print(f"COULDN'T FIND SYSTEM {system_name}", file=sys.stderr)
            elif scores[system_name] > args.scope:
                systems.append(system)
    else:
        systems = args.systems

    if args.normalize:
        normalizer = MosesPunctNormalizer(lang='en', penn=False)

    if args.spm:
        sp = spm.SentencePieceProcessor()
        sp.Load(args.spm)

    # leave one out
    fds = [open(file) for file in systems]
    num_constraints = 0
    num_skipped = 0
    for lineno, (ref, *systems) in enumerate(zip(open(args.reference), *fds), 1):

        def preprocess(text):
            if args.normalize:
                text = normalizer.normalize(text)
            if args.spm:
                text = ' '.join(sp.EncodeAsPieces(text))
            return ' '.join(text.split()[:args.maxlen])

        if len(ref.split()) > args.maxlen:
            continue

        ref_ngrams = sacrebleu.extract_ngrams(ref, min_order=args.ngram_min,
                                              max_order=args.ngram_max)
        ngrams = Counter()
        for system in systems:
            ngrams += sacrebleu.extract_ngrams(system, min_order=args.ngram_min,
                                               max_order=args.ngram_max)

        # zero out n-grams that appear in the reference before subtracting it
        for ngram in ref_ngrams.keys():
            ngrams[ngram] = 0
        ngrams -= ref_ngrams

        if args.threshold <= 1:
            attested_ngrams = [
                ngram for ngram in ngrams.keys()
                if (ngrams[ngram] / len(systems)) >= args.threshold
            ]
        else:
            attested_ngrams = [
                ngram for ngram in ngrams.keys() if ngrams[ngram] >= args.threshold
            ]

        used_ngrams = []
        for ngram in sorted(attested_ngrams, key=len, reverse=True):
            for used in used_ngrams:
                if ngram in used:
                    # print(f"** {lineno} already saw '{ngram}' in '{used}', skipping", file=sys.stderr)
                    num_skipped += 1
                    break
            else:
                num_constraints += 1
                used_ngrams.append(ngram)
                j = {
                    'sentno': lineno,
                    'text': preprocess(ref),
                    'constraints': [preprocess(ngram)],
                }
                print(json.dumps(j, ensure_ascii=False), flush=True)
                # print(*attested_ngrams, sep='\t', flush=True)

    print(
        f"Created {num_constraints} constrained sentences, skipping {num_skipped} smaller ones",
        file=sys.stderr,
    )