def from_pretrained(cls, pretrained_model_name_or_path, use_fast=True,
                    do_basic_tokenize=True) -> PreTrainedTokenizer:
    """Instantiate a tokenizer, routing known special checkpoints to the
    tokenizer class that matches their vocabulary.

    Args:
        pretrained_model_name_or_path: Either a transformer identifier
            (``str``) or an object exposing a ``.transformer`` attribute
            that holds the identifier.
        use_fast: Prefer the Rust-backed fast tokenizer when available.
        do_basic_tokenize: Whether to run basic (whitespace/punctuation)
            tokenization before sub-word tokenization.

    Returns:
        A ``PreTrainedTokenizer`` whose ``name_or_path`` is reset to the
        original (un-mirrored) identifier.
    """
    if isinstance(pretrained_model_name_or_path, str):
        transformer = pretrained_model_name_or_path
    else:
        transformer = pretrained_model_name_or_path.transformer
    additional_config = {}
    if transformer.startswith('voidful/albert_chinese_'):
        # NOTE(review): these Chinese ALBERT checkpoints apparently need a
        # BERT-style tokenizer rather than what AutoTokenizer would pick.
        cls = BertTokenizer
    elif transformer == 'cl-tohoku/bert-base-japanese-char':
        # Since it's a char-level model, it's OK to use char-level (basic)
        # tokenization instead of fugashi.
        from transformers import BertJapaneseTokenizer
        cls = BertJapaneseTokenizer
        additional_config['word_tokenizer_type'] = 'basic'
    else:
        cls = AutoTokenizer
    if use_fast and not do_basic_tokenize:
        # Fast tokenizers may ignore do_basic_tokenize — warn the caller.
        warnings.warn(
            '`do_basic_tokenize=False` might not work when `use_fast=True`'
        )
    tokenizer = cls.from_pretrained(get_mirror(transformer),
                                    use_fast=use_fast,
                                    do_basic_tokenize=do_basic_tokenize,
                                    **additional_config)
    # Restore the canonical identifier so callers never see the mirror URL.
    tokenizer.name_or_path = transformer
    return tokenizer
def from_pretrained(cls, pretrained_model_name_or_path, *model_args, training=True, **kwargs):
    """Load a transformer model, either with pretrained weights (training)
    or from its configuration alone (inference, weights restored elsewhere).

    Args:
        pretrained_model_name_or_path: A transformer identifier (``str``)
            or an already-built config object.
        *model_args: Positional arguments forwarded to
            ``super().from_pretrained`` (training path only).
        training: If ``True``, download/load pretrained weights; if
            ``False``, build the architecture from config only.
        **kwargs: Forwarded to ``super().from_pretrained`` or
            ``AutoConfig.from_pretrained``. Must be empty when a config
            object is passed directly.

    Returns:
        The instantiated model.
    """
    if training:
        # Fix: resolve the download mirror here too, consistent with the
        # non-training branch below and the other from_pretrained
        # overloads in this file; previously the raw identifier bypassed
        # mirror substitution on the training path.
        if isinstance(pretrained_model_name_or_path, str):
            pretrained_model_name_or_path = get_mirror(pretrained_model_name_or_path)
        return super().from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
    if isinstance(pretrained_model_name_or_path, str):
        pretrained_model_name_or_path = get_mirror(pretrained_model_name_or_path)
        # Architecture only — no weight download.
        return super().from_config(
            AutoConfig.from_pretrained(pretrained_model_name_or_path, **kwargs))
    # A config object was passed directly; extra kwargs would be silently
    # dropped, so forbid them.
    assert not kwargs
    return super().from_config(pretrained_model_name_or_path)
def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
    """Load a pretrained configuration, transparently substituting the
    identifier with its download mirror when one is configured.

    Args:
        pretrained_model_name_or_path: Transformer identifier or path.
        **kwargs: Forwarded verbatim to ``super().from_pretrained``.

    Returns:
        The loaded configuration object.
    """
    mirrored = get_mirror(pretrained_model_name_or_path)
    return super().from_pretrained(mirrored, **kwargs)