def moses_punct_norm(self, text, lang):
    """Normalize punctuation in *text* using a per-language cached Moses normalizer.

    The normalizer for each language is constructed at most once and memoized
    in ``self.cache_moses_punct_normalizer``.
    """
    normalizer = self.cache_moses_punct_normalizer.get(lang)
    if normalizer is None:
        # First request for this language: build the normalizer and memoize it.
        normalizer = sm.MosesPunctNormalizer(lang=lang)
        self.cache_moses_punct_normalizer[lang] = normalizer
    return normalizer.normalize(text)
def __init__(self, config_file):
    """Build a zh<->en translation pre/post-processing pipeline from a YAML config.

    Args:
        config_file: Path to a YAML file. Every top-level key is copied onto
            the instance as an attribute; it must provide at least ``type``
            ("cn2en" or "en2cn") and ``codes_file`` (BPE codes), plus
            ``trans_dict_file`` for the slang dictionary.

    Raises:
        AssertionError: If ``type`` is not one of {"cn2en", "en2cn"}.
    """
    with open(config_file) as f:
        # safe_load avoids arbitrary object construction; config keys become
        # instance attributes directly.
        self.__dict__.update(yaml.safe_load(f))
    assert self.type in {"cn2en", "en2cn"}
    codes = codecs.open(self.codes_file, encoding='utf-8')
    cur_path = os.path.dirname(os.path.realpath(__file__))
    self.tokenizer = BPE(codes)
    if self.type == "en2cn":
        # pre_process: normalize, tokenize, subEntity, to_lower, bpe
        # post_process: delbpe, remove_space
        # Keep the prefixes path in a local instead of temporarily storing a
        # string in self.en_tokenizer, which finally holds a MosesTokenizer.
        prefixes_file = os.path.join(cur_path, self.en_tokenizer)
        self.en_normalize_punctuation = sacremoses.MosesPunctNormalizer(
            lang="en")
        self.en_tokenizer = sacremoses.MosesTokenizer(
            lang='en', custom_nonbreaking_prefixes_file=prefixes_file)
    elif self.type == "cn2en":
        # pre_process: tokenize, bpe
        # post_process: delbpe, detruecase, detokenize
        self.detruecase = sacremoses.MosesDetruecaser()
        self.detokenize = sacremoses.MosesDetokenizer(lang='en')
    # NOTE(review): the collapsed original does not show whether the members
    # below belonged inside the cn2en branch or were shared by both
    # directions; they are kept unconditional here — confirm against callers.
    self.client = aiohttp.ClientSession(
        timeout=aiohttp.ClientTimeout(total=3600),
        connector=aiohttp.TCPConnector(limit=sys.maxsize,
                                       limit_per_host=sys.maxsize))
    self.cn2en_trans_dict = slang_dict(self.trans_dict_file)
    # Runs of CJK Unified Ideographs (basic block).
    self.chinese_char_pattern = re.compile(u"[\u4E00-\u9FA5]+")
    # Sentence-ending punctuation, ASCII and fullwidth forms.
    self.stops = re.compile(u"[.!?!?。。]+")
def __init__(self,
             special=None,
             min_freq=0,
             max_size=None,
             lower_case=False,
             delimiter=None,
             vocab_file=None,
             pretrained_vocab_file: str = None,
             never_split=None,
             unk_token="<unk>",
             eos_token="<eos>",
             additional_special_tokens=None,
             language="en",
             **kwargs):
    """Initialize the tokenizer, loading a vocabulary from either a pickled
    (or torch-saved) pretrained vocab dict or a plain vocab file.

    Args:
        special: Extra special symbols to reserve in the vocabulary.
        min_freq: Minimum token frequency kept when building the vocab.
        max_size: Maximum vocabulary size (None = unbounded).
        lower_case: Whether to lowercase input text.
        delimiter: Token delimiter used when splitting lines.
        vocab_file: Plain-text vocabulary file; triggers build_vocab().
        pretrained_vocab_file: Pickled/torch-saved vocab dict whose entries
            are merged into this instance's attributes.
        never_split: Tokens protected from splitting (defaults to the
            registered special tokens).
        unk_token / eos_token / additional_special_tokens / language:
            Standard special-token and Moses-language configuration.

    Raises:
        ValueError: If ``pretrained_vocab_file`` cannot be parsed.
    """
    # Avoid the shared-mutable-default pitfall: a list default would be
    # shared across all instances, so build it per call instead.
    if additional_special_tokens is None:
        additional_special_tokens = ["<formula>"]
    super().__init__(unk_token=unk_token,
                     eos_token=eos_token,
                     additional_special_tokens=additional_special_tokens,
                     **kwargs)
    if never_split is None:
        never_split = self.all_special_tokens
    if special is None:
        special = []
    self.counter = Counter()
    self.special = special
    self.min_freq = min_freq
    self.max_size = max_size
    self.lower_case = lower_case
    self.delimiter = delimiter
    self.vocab_file = vocab_file
    self.never_split = never_split
    self.punctuation_symbols = '!"#$%&()*+,-./\\:;<=>?@[\\]^_`{|}~'
    # Matches a punctuation char directly preceded by a non-space char.
    self.punction_without_space_before_pattern = re.compile(
        r"[^\s][{}]".format(self.punctuation_symbols))
    self.punctuation_with_space_around_pattern = self._compile_space_around_punctuation_pattern(
    )
    self.language = language
    self.moses_punct_normalizer = sm.MosesPunctNormalizer(language)
    self.moses_tokenizer = sm.MosesTokenizer(language)
    self.moses_detokenizer = sm.MosesDetokenizer(language)

    # This try... catch... is not beautiful but honestly this tokenizer was not made to be used
    # in a library like ours, at all.
    try:
        vocab_dict = None
        if pretrained_vocab_file is not None:
            # Priority on pickle files (support PyTorch and TF)
            with open(pretrained_vocab_file, "rb") as f:
                vocab_dict = pickle.load(f)

            # Loading a torch-saved transfo-xl vocab dict with pickle results in an integer
            # Entering this if statement means that we tried to load a torch-saved file with pickle, and we failed.
            # We therefore load it with torch, if it's available.
            if isinstance(vocab_dict, int):
                if not is_torch_available():
                    raise ImportError(
                        "Not trying to load dict with PyTorch as you need to install pytorch to load "
                        "from a PyTorch pretrained vocabulary, "
                        "or activate it with environment variables USE_TORCH=1 and USE_TF=0."
                    )
                vocab_dict = torch.load(pretrained_vocab_file)

        if vocab_dict is not None:
            for key, value in vocab_dict.items():
                # Only adopt keys not already set on the instance.
                if key not in self.__dict__:
                    self.__dict__[key] = value
        elif vocab_file is not None:
            self.build_vocab()
    except Exception as e:
        raise ValueError(
            "Unable to parse file {}. Unknown format. "
            "If you tried to load a model saved through TransfoXLTokenizerFast,"
            "please note they are not compatible.".format(
                pretrained_vocab_file)) from e

    # NOTE(review): build_vocab() may already have run inside the try block
    # above; the duplicate call is preserved from the original — confirm it
    # is idempotent before removing either call.
    if vocab_file is not None:
        self.build_vocab()
# CLI flags: Moses language code, plus overwrite/verbosity toggles.
parser.add_argument('-L', '--lang', default='en')
parser.add_argument('-o', '--overwrite', action='store_true')
parser.add_argument('-v', '--verbose', action='store_true')
args = parser.parse_args()
# Refuse to clobber an existing output file unless -o/--overwrite was given.
if os.path.isfile(args.output_tsv) and not args.overwrite:
    print(
        f'output file: {args.output_tsv} exists, use -o/--overwrite to force overwrite'
    )
    exit(1)
verbose(args, args)
# Moses punctuation normalizer: also converts unicode punctuation before
# normalizing and strips control characters afterwards.
normalizer = sacremoses.MosesPunctNormalizer(
    lang=args.lang,
    pre_replace_unicode_punct=True,
    post_remove_control_chars=True,
)
# Punctuation set to act on, keeping apostrophes and hyphens untouched.
p_list = set(string.punctuation) - set("'-")
lines = []
with open(args.input_tsv, 'r') as f:
    # TSV reader with quoting fully disabled so literal quote characters in
    # the text columns pass through unchanged.
    reader = csv.DictReader(
        f,
        delimiter='\t',
        quotechar=None,
        doublequote=False,
        lineterminator='\n',
        quoting=csv.QUOTE_NONE,
    )
    for line in reader:
def __init__(self,
             special=None,
             min_freq=0,
             max_size=None,
             lower_case=False,
             delimiter=None,
             vocab_file=None,
             pretrained_vocab_file=None,
             never_split=None,
             unk_token="<unk>",
             eos_token="<eos>",
             additional_special_tokens=None,
             language="en",
             **kwargs):
    """Initialize the tokenizer, optionally loading a torch-saved pretrained
    vocab dict and/or building the vocabulary from a plain vocab file.

    Args:
        special: Extra special symbols to reserve in the vocabulary.
        min_freq: Minimum token frequency kept when building the vocab.
        max_size: Maximum vocabulary size (None = unbounded).
        lower_case: Whether to lowercase input text.
        delimiter: Token delimiter used when splitting lines.
        vocab_file: Plain-text vocabulary file; triggers build_vocab().
        pretrained_vocab_file: torch-saved vocab dict whose entries are
            merged into this instance's attributes.
        never_split: Tokens protected from splitting (defaults to the
            registered special tokens).
        unk_token / eos_token / additional_special_tokens / language:
            Standard special-token and Moses-language configuration.

    Raises:
        ValueError: If ``pretrained_vocab_file`` cannot be parsed.
    """
    # Avoid the shared-mutable-default pitfall: a list default would be
    # shared across all instances, so build it per call instead.
    if additional_special_tokens is None:
        additional_special_tokens = ["<formula>"]
    super().__init__(unk_token=unk_token,
                     eos_token=eos_token,
                     additional_special_tokens=additional_special_tokens,
                     **kwargs)
    if never_split is None:
        never_split = self.all_special_tokens
    if special is None:
        special = []
    self.counter = Counter()
    self.special = special
    self.min_freq = min_freq
    self.max_size = max_size
    self.lower_case = lower_case
    self.delimiter = delimiter
    self.vocab_file = vocab_file
    self.never_split = never_split
    self.punctuation_symbols = '!"#$%&()*+,-./\\:;<=>?@[\\]^_`{|}~'
    # Matches a punctuation char directly preceded by a non-space char.
    self.punction_without_space_before_pattern = re.compile(
        r"[^\s][{}]".format(self.punctuation_symbols))
    self.punctuation_with_space_around_pattern = self._compile_space_around_punctuation_pattern(
    )
    self.language = language
    self.moses_punct_normalizer = sm.MosesPunctNormalizer(language)
    self.moses_tokenizer = sm.MosesTokenizer(language)
    self.moses_detokenizer = sm.MosesDetokenizer(language)

    try:
        if pretrained_vocab_file is not None:
            # Hack because, honestly this tokenizer was not made to be used
            # in a library like ours, at all.
            # WARNING: torch.load is pickle-based — only call it on trusted
            # vocabulary files.
            vocab_dict = torch.load(pretrained_vocab_file)
            for key, value in vocab_dict.items():
                # Only adopt keys not already set on the instance.
                if key not in self.__dict__:
                    self.__dict__[key] = value

        if vocab_file is not None:
            self.build_vocab()
    except Exception as e:
        # Chain the original exception so the root cause is not lost.
        raise ValueError(
            "Unable to parse file {}. Unknown format. "
            "If you tried to load a model saved through TransfoXLTokenizerFast,"
            "please note they are not compatible.".format(
                pretrained_vocab_file)) from e

    # NOTE(review): build_vocab() may already have run inside the try block
    # above; the duplicate call is preserved from the original — confirm it
    # is idempotent before removing either call.
    if vocab_file is not None:
        self.build_vocab()
def __init__(
    self,
    special=None,
    min_freq=0,
    max_size=None,
    lower_case=False,
    delimiter=None,
    vocab_file=None,
    pretrained_vocab_file=None,
    never_split=None,
    unk="<unk>",
    eos="<eos>",
    additional_special_tokens=None,
    language="en",
    **kw,
):
    """Initialize the tokenizer, loading a vocabulary from either a pickled
    (or torch-saved) pretrained vocab dict or a plain vocab file.

    Args:
        special: Extra special symbols to reserve in the vocabulary.
        min_freq: Minimum token frequency kept when building the vocab.
        max_size: Maximum vocabulary size (None = unbounded).
        lower_case: Whether to lowercase input text.
        delimiter: Token delimiter used when splitting lines.
        vocab_file: Plain-text vocabulary file; triggers build_vocab().
        pretrained_vocab_file: Pickled/torch-saved vocab dict whose entries
            are merged into this instance's attributes.
        never_split: Tokens protected from splitting (defaults to the
            registered special tokens).
        unk / eos / additional_special_tokens / language: Standard
            special-token and Moses-language configuration.

    Raises:
        ValueError: If ``pretrained_vocab_file`` cannot be parsed.
    """
    # Avoid the shared-mutable-default pitfall: a list default would be
    # shared across all instances, so build it per call instead.
    if additional_special_tokens is None:
        additional_special_tokens = ["<formula>"]
    super().__init__(
        special=special,
        min_freq=min_freq,
        max_size=max_size,
        lower_case=lower_case,
        delimiter=delimiter,
        vocab_file=vocab_file,
        pretrained_vocab_file=pretrained_vocab_file,
        never_split=never_split,
        unk=unk,
        eos=eos,
        additional_special_tokens=additional_special_tokens,
        language=language,
        **kw,
    )
    if never_split is None:
        never_split = self.all_special_tokens
    if special is None:
        special = []
    self.counter = Counter()
    self.special = special
    self.min_freq = min_freq
    self.max_size = max_size
    self.lower_case = lower_case
    self.delimiter = delimiter
    self.vocab_file = vocab_file
    self.never_split = never_split
    self.punctuation_symbols = '!"#$%&()*+,-./\\:;<=>?@[\\]^_`{|}~'
    # Matches a punctuation char directly preceded by a non-space char.
    self.punction_without_space_before_pattern = re.compile(
        rf"[^\s][{self.punctuation_symbols}]"
    )
    self.punctuation_with_space_around_pattern = (
        self._compile_space_around_punctuation_pattern()
    )
    self.language = language
    self.moses_punct_normalizer = sm.MosesPunctNormalizer(language)
    self.moses_tokenizer = sm.MosesTokenizer(language)
    self.moses_detokenizer = sm.MosesDetokenizer(language)
    try:
        vocab_dict = None
        if pretrained_vocab_file is not None:
            # Try pickle first; a torch-saved dict read via pickle yields an
            # int, in which case we fall back to torch.load below.
            with open(pretrained_vocab_file, "rb") as f:
                vocab_dict = pickle.load(f)
            if isinstance(vocab_dict, int):
                if not is_torch_available():
                    raise ImportError(
                        "Not trying to load dict with PyTorch as you need to install pytorch to load "
                        "from a PyTorch pretrained vocabulary, "
                        "or activate it with environment variables USE_TORCH=1 and USE_TF=0."
                    )
                vocab_dict = torch.load(pretrained_vocab_file)

        if vocab_dict is not None:
            for key, value in vocab_dict.items():
                # Only adopt keys not already set on the instance.
                if key not in self.__dict__:
                    self.__dict__[key] = value
        elif vocab_file is not None:
            self.build_vocab()
    except Exception as e:
        raise ValueError(
            f"Unable to parse file {pretrained_vocab_file}. Unknown format. "
            "If you tried to load a model saved through TokenizerFast, "
            "please note they are not compatible."
        ) from e

    # NOTE(review): build_vocab() may already have run inside the try block
    # above; the duplicate call is preserved from the original — confirm it
    # is idempotent before removing either call.
    if vocab_file is not None:
        self.build_vocab()