Пример #1
0
 def from_pretrained(cls,
                     pretrained_model_name_or_path,
                     use_fast=True,
                     do_basic_tokenize=True) -> PreTrainedTokenizer:
     """Build a tokenizer, special-casing models whose auto-detection is unreliable.

     Args:
         pretrained_model_name_or_path: Either the transformer identifier as a
             string, or an object exposing a ``.transformer`` attribute holding it.
         use_fast: Whether to prefer the fast (Rust-backed) tokenizer.
         do_basic_tokenize: Whether the tokenizer performs basic tokenization.

     Returns:
         The instantiated tokenizer, with ``name_or_path`` set to the original
         (un-mirrored) transformer identifier.
     """
     transformer = pretrained_model_name_or_path if isinstance(
         pretrained_model_name_or_path, str) else pretrained_model_name_or_path.transformer
     extra_config = {}
     if transformer.startswith('voidful/albert_chinese_'):
         # These checkpoints need BertTokenizer despite their ALBERT config.
         tokenizer_cls = BertTokenizer
     elif transformer == 'cl-tohoku/bert-base-japanese-char':
         # Since it's char level model, it's OK to use char level tok instead of fugashi
         from transformers import BertJapaneseTokenizer
         tokenizer_cls = BertJapaneseTokenizer
         extra_config['word_tokenizer_type'] = 'basic'
     else:
         tokenizer_cls = AutoTokenizer
     if use_fast and not do_basic_tokenize:
         # Fast tokenizers may silently ignore do_basic_tokenize=False.
         warnings.warn(
             '`do_basic_tokenize=False` might not work when `use_fast=True`'
         )
     tokenizer = tokenizer_cls.from_pretrained(get_tokenizer_mirror(transformer),
                                               use_fast=use_fast,
                                               do_basic_tokenize=do_basic_tokenize,
                                               **extra_config)
     tokenizer.name_or_path = transformer
     return tokenizer
Пример #2
0
 def from_pretrained(cls, pretrained_model_name_or_path, *model_args, training=True, **kwargs):
     """Load a model, using config-only construction when not training.

     Args:
         pretrained_model_name_or_path: Model identifier string (mirrored before
             loading), or an already-built config object.
         *model_args: Positional arguments forwarded to the parent loader.
         training: If True, load full pretrained weights; otherwise build the
             model from its config alone (no weight download).
         **kwargs: Keyword arguments forwarded to the parent loader / config.
     """
     pretrained_model_name_or_path = get_model_mirror(pretrained_model_name_or_path)
     if training:
         return super().from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
     # Inference path: construct from config only, skipping pretrained weights.
     if isinstance(pretrained_model_name_or_path, str):
         pretrained_model_name_or_path = get_tokenizer_mirror(pretrained_model_name_or_path)
         return super().from_config(AutoConfig.from_pretrained(pretrained_model_name_or_path, **kwargs))
     # A config object was passed directly; extra kwargs would be silently lost.
     assert not kwargs
     return super().from_config(pretrained_model_name_or_path)
Пример #3
0
 def build_tokenizer(self, additional_tokens) -> PENMANBartTokenizer:
     """Create and cache the PENMAN tokenizer matching the configured transformer.

     Args:
         additional_tokens: Extra tokens to register with the tokenizer.

     Returns:
         The constructed tokenizer, also stored on ``self._tokenizer``.

     Raises:
         NotImplementedError: If the transformer is neither a T5 nor a BART model.
     """
     transformer = self.config.transformer
     if 't5-' in transformer:
         cls = PENMANT5Tokenizer
     elif 'bart-' in transformer:
         cls = PENMANBartTokenizer
     else:
         # Fix: `NotImplemented` is a non-callable singleton — calling it raised
         # TypeError instead of the intended exception. Use NotImplementedError.
         raise NotImplementedError(f'Unsupported transformer {transformer}')
     transformer = get_tokenizer_mirror(transformer)
     self._tokenizer = cls.from_pretrained(
         transformer,
         collapse_name_ops=self.config.collapse_name_ops,
         use_pointer_tokens=self.config.use_pointer_tokens,
         raw_graph=self.config.raw_graph,
         additional_tokens=additional_tokens,
         recategorization_tokens=self.config.recategorization_tokens,
         config=self._transformer_config,
     )
     return self._tokenizer
Пример #4
0
 def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
     """Resolve the identifier through the tokenizer mirror, then delegate to the parent loader."""
     mirrored = get_tokenizer_mirror(pretrained_model_name_or_path)
     return super().from_pretrained(mirrored, **kwargs)