def test_tokenization_distilbert(self):
    # Given
    self.base_tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-cased',
                                                              do_lower_case=False,
                                                              cache_dir=self.test_dir)
    self.rust_tokenizer = PyBertTokenizer(
        get_from_cache(self.base_tokenizer.pretrained_vocab_files_map['vocab_file']['distilbert-base-cased']),
        do_lower_case=False)
    output_baseline = []
    for example in self.examples:
        output_baseline.append(self.base_tokenizer.encode_plus(example.text_a,
                                                               text_pair=example.text_b,
                                                               add_special_tokens=True,
                                                               return_overflowing_tokens=True,
                                                               return_special_tokens_mask=True,
                                                               max_length=128))

    # When
    output_rust = self.rust_tokenizer.encode_pair_list(
        [(example.text_a, example.text_b) for example in self.examples],
        max_len=128,
        truncation_strategy='longest_first',
        stride=0)

    # Then
    for idx, (rust, baseline) in enumerate(zip(output_rust, output_baseline)):
        assert rust.token_ids == baseline['input_ids'], \
            f'Difference in tokenization for {self.rust_tokenizer.__class__}: \n' \
            f'Sentence a: {self.examples[idx].text_a} \n' \
            f'Sentence b: {self.examples[idx].text_b} \n' \
            f'Token mismatch: {self.get_token_diff(rust.token_ids, baseline["input_ids"])} \n' \
            f'Rust: {rust.token_ids} \n' \
            f'Python: {baseline["input_ids"]}'
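# The assertion above calls `get_token_diff`, a helper that is not shown in
# this snippet. Below is a minimal sketch of what such a helper might look
# like, inferred purely from how it is called; the name of its parameters and
# the 5-token context window are assumptions, not the original implementation.
def get_token_diff(self, rust_ids, python_ids):
    # Walk both id sequences in lockstep and return the first region where
    # they diverge, so the assertion message points at the offending tokens.
    for i, (rust_id, python_id) in enumerate(zip(rust_ids, python_ids)):
        if rust_id != python_id:
            return rust_ids[i:i + 5], python_ids[i:i + 5]
    # Identical prefix but different lengths: return the trailing remainders.
    return rust_ids[len(python_ids):], python_ids[len(rust_ids):]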
def test_tokenization_distilbert(self):
    # Given
    self.base_tokenizer = DistilBertTokenizer.from_pretrained(
        'distilbert-base-uncased', do_lower_case=True, cache_dir=self.test_dir)
    self.rust_tokenizer = PyBertTokenizer(
        get_from_cache(
            self.base_tokenizer.pretrained_vocab_files_map['vocab_file']['distilbert-base-uncased']))
    output_baseline = []
    for example in self.examples:
        output_baseline.append(
            self.base_tokenizer.encode_plus(example.text_a,
                                            add_special_tokens=True,
                                            return_overflowing_tokens=True,
                                            return_special_tokens_mask=True,
                                            max_length=128))

    # When
    output_rust = self.rust_tokenizer.encode_list(
        [example.text_a for example in self.examples],
        max_len=128,
        truncation_strategy='longest_first',
        stride=0)

    # Then
    for rust, baseline in zip(output_rust, output_baseline):
        assert rust.token_ids == baseline['input_ids']
        assert rust.segment_ids == baseline['token_type_ids']
        assert rust.special_tokens_mask == baseline['special_tokens_mask']
def load(cls, pretrained_model_name_or_path, tokenizer_class=None, **kwargs):
    """
    Enables loading of different Tokenizer classes with a uniform interface.
    Either infer the class from `pretrained_model_name_or_path` or define it
    manually via `tokenizer_class`.

    :param pretrained_model_name_or_path: The path of the saved pretrained model or its name (e.g. `bert-base-uncased`)
    :type pretrained_model_name_or_path: str
    :param tokenizer_class: (Optional) Name of the tokenizer class to load (e.g. `BertTokenizer`)
    :type tokenizer_class: str
    :param kwargs:
    :return: Tokenizer
    """
    pretrained_model_name_or_path = str(pretrained_model_name_or_path)

    # guess tokenizer type from name
    if tokenizer_class is None:
        if "albert" in pretrained_model_name_or_path.lower():
            tokenizer_class = "AlbertTokenizer"
        elif "xlm-roberta" in pretrained_model_name_or_path.lower():
            tokenizer_class = "XLMRobertaTokenizer"
        elif "roberta" in pretrained_model_name_or_path.lower():
            tokenizer_class = "RobertaTokenizer"
        elif "distilbert" in pretrained_model_name_or_path.lower():
            tokenizer_class = "DistilBertTokenizer"
        elif "bert" in pretrained_model_name_or_path.lower():
            tokenizer_class = "BertTokenizer"
        elif "xlnet" in pretrained_model_name_or_path.lower():
            tokenizer_class = "XLNetTokenizer"
        else:
            raise ValueError(
                f"Could not infer tokenizer_class from name '{pretrained_model_name_or_path}'. "
                f"Set arg `tokenizer_class` in Tokenizer.load() to one of: AlbertTokenizer, "
                f"XLMRobertaTokenizer, RobertaTokenizer, DistilBertTokenizer, BertTokenizer, "
                f"or XLNetTokenizer.")
    logger.info(f"Loading tokenizer of type '{tokenizer_class}'")

    # return appropriate tokenizer object
    ret = None
    if tokenizer_class == "AlbertTokenizer":
        ret = AlbertTokenizer.from_pretrained(pretrained_model_name_or_path, keep_accents=True, **kwargs)
    elif tokenizer_class == "XLMRobertaTokenizer":
        ret = XLMRobertaTokenizer.from_pretrained(pretrained_model_name_or_path, **kwargs)
    elif tokenizer_class == "RobertaTokenizer":
        ret = RobertaTokenizer.from_pretrained(pretrained_model_name_or_path, **kwargs)
    elif tokenizer_class == "DistilBertTokenizer":
        ret = DistilBertTokenizer.from_pretrained(pretrained_model_name_or_path, **kwargs)
    elif tokenizer_class == "BertTokenizer":
        ret = BertTokenizer.from_pretrained(pretrained_model_name_or_path, **kwargs)
    elif tokenizer_class == "XLNetTokenizer":
        ret = XLNetTokenizer.from_pretrained(pretrained_model_name_or_path, keep_accents=True, **kwargs)
    if ret is None:
        raise Exception("Unable to load tokenizer")
    return ret
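# Usage sketch for the `load` classmethod above, assuming it lives on a
# `Tokenizer` class as the error message suggests; the local path in the
# second call is a placeholder.
#
# Inferred from the name: "distilbert" is checked before "bert", so this
# resolves to DistilBertTokenizer rather than BertTokenizer.
tokenizer = Tokenizer.load("distilbert-base-uncased")

# Or pin the class explicitly and forward extra kwargs to from_pretrained():
tokenizer = Tokenizer.load("path/to/saved/model",
                           tokenizer_class="BertTokenizer",
                           do_lower_case=True)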
def test_sequence_builders(self):
    tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")

    text = tokenizer.encode("sequence builders")
    text_2 = tokenizer.encode("multi-sequence build")

    encoded_sentence = tokenizer.build_inputs_with_special_tokens(text)
    encoded_pair = tokenizer.build_inputs_with_special_tokens(text, text_2)

    assert encoded_sentence == [tokenizer.cls_token_id] + text + [tokenizer.sep_token_id]
    assert encoded_pair == [tokenizer.cls_token_id] + text + [tokenizer.sep_token_id] + \
        text_2 + [tokenizer.sep_token_id]
def setup_class(self):
    self.use_gpu = torch.cuda.is_available()
    self.test_dir = Path(tempfile.mkdtemp())

    self.base_tokenizer = DistilBertTokenizer.from_pretrained(
        'distilbert-base-uncased', do_lower_case=True, cache_dir=self.test_dir)
    self.rust_tokenizer = PyBertTokenizer(
        get_from_cache(
            self.base_tokenizer.pretrained_vocab_files_map['vocab_file']['distilbert-base-uncased']),
        do_lower_case=True,
        strip_accents=True)
    self.model = DistilBertForSequenceClassification.from_pretrained(
        'distilbert-base-uncased', output_attentions=False).eval()
    if self.use_gpu:
        self.model.cuda()

    self.sentence_list = [
        'For instance, on the planet Earth, man had always assumed that he was more intelligent '
        'than dolphins because he had achieved so much—the wheel, New York, wars and so on—whilst'
        ' all the dolphins had ever done was muck about in the water having a good time. But '
        'conversely, the dolphins had always believed that they were far more intelligent than '
        'man—for precisely the same reasons.'] * 64

    # Pre-allocate GPU memory by running one forward pass over the full batch.
    tokens_list = [self.base_tokenizer.tokenize(sentence) for sentence in self.sentence_list]
    features = [self.base_tokenizer.convert_tokens_to_ids(tokens) for tokens in tokens_list]
    features = [self.base_tokenizer.prepare_for_model(input_ids,
                                                      None,
                                                      add_special_tokens=True,
                                                      max_length=128)
                for input_ids in features]
    all_input_ids = torch.tensor([f['input_ids'] for f in features], dtype=torch.long)

    if self.use_gpu:
        all_input_ids = all_input_ids.cuda()

    with torch.no_grad():
        _ = self.model(all_input_ids)[0].cpu().numpy()
def preprocess(data: List[Dict], model: str, label2idx: Dict,
               max_seq_length: int) -> List[BertInputItem]:
    """
    Runs the full preprocessing pipeline on a list of data items.

    Args:
        data: a list of examples as dicts of the form {"text": ..., "label": ...}
        model: the name of the BERT model
        label2idx: a dict that maps label strings to label ids
        max_seq_length: the maximum sequence length for the input items

    Returns:
        a list of BertInputItems
    """
    if "distilbert" in model:
        tokenizer = DistilBertTokenizer.from_pretrained(model)
    else:
        tokenizer = BertTokenizer.from_pretrained(model)
    bert_items = convert_data_to_input_items(data, label2idx, max_seq_length, tokenizer)
    return bert_items
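# Hypothetical call to the pipeline above, assuming BertInputItem and
# convert_data_to_input_items are defined elsewhere in the same module;
# the example texts and labels are made up for illustration.
data = [
    {"text": "The movie was great.", "label": "positive"},
    {"text": "A waste of two hours.", "label": "negative"},
]
label2idx = {"positive": 0, "negative": 1}

# "distilbert" in the model name routes to DistilBertTokenizer.
items = preprocess(data, model="distilbert-base-uncased",
                   label2idx=label2idx, max_seq_length=128)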
def get_tokenizer(self, **kwargs):
    return DistilBertTokenizer.from_pretrained(self.tmpdirname, **kwargs)
def setup_base_tokenizer(self):
    self.base_tokenizer = DistilBertTokenizer.from_pretrained(
        'distilbert-base-uncased', do_lower_case=True, cache_dir=self.test_dir)
def load(cls, pretrained_model_name_or_path, tokenizer_class=None, use_fast=False, **kwargs):
    """
    Enables loading of different Tokenizer classes with a uniform interface.
    Either infer the class from `pretrained_model_name_or_path` or define it
    manually via `tokenizer_class`.

    :param pretrained_model_name_or_path: The path of the saved pretrained model or its name (e.g. `bert-base-uncased`)
    :type pretrained_model_name_or_path: str
    :param tokenizer_class: (Optional) Name of the tokenizer class to load (e.g. `BertTokenizer`)
    :type tokenizer_class: str
    :param use_fast: (Optional, False by default) Indicate if FARM should try to load the fast version
                     of the tokenizer (True) or use the Python one (False).
                     Only DistilBERT, BERT and Electra fast tokenizers are supported.
    :type use_fast: bool
    :param kwargs:
    :return: Tokenizer
    """
    pretrained_model_name_or_path = str(pretrained_model_name_or_path)

    # guess tokenizer type from name
    if tokenizer_class is None:
        if "albert" in pretrained_model_name_or_path.lower():
            tokenizer_class = "AlbertTokenizer"
        elif "xlm-roberta" in pretrained_model_name_or_path.lower():
            tokenizer_class = "XLMRobertaTokenizer"
        elif "roberta" in pretrained_model_name_or_path.lower():
            tokenizer_class = "RobertaTokenizer"
        elif "codebert" in pretrained_model_name_or_path.lower():
            if "mlm" in pretrained_model_name_or_path.lower():
                raise NotImplementedError("MLM part of codebert is currently not supported in FARM")
            else:
                tokenizer_class = "RobertaTokenizer"
        elif "camembert" in pretrained_model_name_or_path.lower() or "umberto" in pretrained_model_name_or_path:
            tokenizer_class = "CamembertTokenizer"
        elif "distilbert" in pretrained_model_name_or_path.lower():
            tokenizer_class = "DistilBertTokenizer"
        elif "bert" in pretrained_model_name_or_path.lower():
            tokenizer_class = "BertTokenizer"
        elif "xlnet" in pretrained_model_name_or_path.lower():
            tokenizer_class = "XLNetTokenizer"
        elif "electra" in pretrained_model_name_or_path.lower():
            tokenizer_class = "ElectraTokenizer"
        elif "word2vec" in pretrained_model_name_or_path.lower() or \
                "glove" in pretrained_model_name_or_path.lower() or \
                "fasttext" in pretrained_model_name_or_path.lower():
            tokenizer_class = "EmbeddingTokenizer"
        elif "minilm" in pretrained_model_name_or_path.lower():
            tokenizer_class = "BertTokenizer"
        elif "dpr-question_encoder" in pretrained_model_name_or_path.lower():
            tokenizer_class = "DPRQuestionEncoderTokenizer"
        elif "dpr-ctx_encoder" in pretrained_model_name_or_path.lower():
            tokenizer_class = "DPRContextEncoderTokenizer"
        else:
            raise ValueError(
                f"Could not infer tokenizer_class from name '{pretrained_model_name_or_path}'. Set "
                f"arg `tokenizer_class` in Tokenizer.load() to one of: AlbertTokenizer, "
                f"XLMRobertaTokenizer, RobertaTokenizer, DistilBertTokenizer, BertTokenizer, or "
                f"XLNetTokenizer.")
    logger.info(f"Loading tokenizer of type '{tokenizer_class}'")

    # return appropriate tokenizer object
    ret = None
    if tokenizer_class == "AlbertTokenizer":
        if use_fast:
            logger.error('AlbertTokenizerFast is not supported! Using AlbertTokenizer instead.')
        ret = AlbertTokenizer.from_pretrained(pretrained_model_name_or_path, keep_accents=True, **kwargs)
    elif tokenizer_class == "XLMRobertaTokenizer":
        if use_fast:
            logger.error('XLMRobertaTokenizerFast is not supported! Using XLMRobertaTokenizer instead.')
        ret = XLMRobertaTokenizer.from_pretrained(pretrained_model_name_or_path, **kwargs)
    elif "RobertaTokenizer" in tokenizer_class:  # "in" so the fast tokenizer class name also matches
        if use_fast:
            logger.error('RobertaTokenizerFast is not supported! Using RobertaTokenizer instead.')
        ret = RobertaTokenizer.from_pretrained(pretrained_model_name_or_path, **kwargs)
    elif "DistilBertTokenizer" in tokenizer_class:  # "in" so the fast tokenizer class name also matches
        if use_fast:
            ret = DistilBertTokenizerFast.from_pretrained(pretrained_model_name_or_path, **kwargs)
        else:
            ret = DistilBertTokenizer.from_pretrained(pretrained_model_name_or_path, **kwargs)
    elif "BertTokenizer" in tokenizer_class:  # "in" so the fast tokenizer class name also matches
        if use_fast:
            ret = BertTokenizerFast.from_pretrained(pretrained_model_name_or_path, **kwargs)
        else:
            ret = BertTokenizer.from_pretrained(pretrained_model_name_or_path, **kwargs)
    elif tokenizer_class == "XLNetTokenizer":
        if use_fast:
            logger.error('XLNetTokenizerFast is not supported! Using XLNetTokenizer instead.')
        ret = XLNetTokenizer.from_pretrained(pretrained_model_name_or_path, keep_accents=True, **kwargs)
    elif "ElectraTokenizer" in tokenizer_class:  # "in" so the fast tokenizer class name also matches
        if use_fast:
            ret = ElectraTokenizerFast.from_pretrained(pretrained_model_name_or_path, **kwargs)
        else:
            ret = ElectraTokenizer.from_pretrained(pretrained_model_name_or_path, **kwargs)
    elif tokenizer_class == "EmbeddingTokenizer":
        if use_fast:
            logger.error('EmbeddingTokenizerFast is not supported! Using EmbeddingTokenizer instead.')
        ret = EmbeddingTokenizer.from_pretrained(pretrained_model_name_or_path, **kwargs)
    elif tokenizer_class == "CamembertTokenizer":
        if use_fast:
            logger.error('CamembertTokenizerFast is not supported! Using CamembertTokenizer instead.')
        ret = CamembertTokenizer._from_pretrained(pretrained_model_name_or_path, **kwargs)
    elif tokenizer_class in ("DPRQuestionEncoderTokenizer", "DPRQuestionEncoderTokenizerFast"):
        if use_fast or tokenizer_class == "DPRQuestionEncoderTokenizerFast":
            ret = DPRQuestionEncoderTokenizerFast.from_pretrained(pretrained_model_name_or_path, **kwargs)
        else:
            ret = DPRQuestionEncoderTokenizer.from_pretrained(pretrained_model_name_or_path, **kwargs)
    elif tokenizer_class in ("DPRContextEncoderTokenizer", "DPRContextEncoderTokenizerFast"):
        if use_fast or tokenizer_class == "DPRContextEncoderTokenizerFast":
            ret = DPRContextEncoderTokenizerFast.from_pretrained(pretrained_model_name_or_path, **kwargs)
        else:
            ret = DPRContextEncoderTokenizer.from_pretrained(pretrained_model_name_or_path, **kwargs)
    if ret is None:
        raise Exception("Unable to load tokenizer")
    return ret
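# Usage sketch of the `use_fast` switch above. Only the DistilBERT, BERT,
# Electra and DPR branches actually return a fast tokenizer; the remaining
# branches log an error and fall back to the Python implementation.
fast_tokenizer = Tokenizer.load("distilbert-base-uncased", use_fast=True)
# -> DistilBertTokenizerFast (Rust-backed)

slow_tokenizer = Tokenizer.load("xlnet-base-cased", use_fast=True)
# -> XLNetTokenizer, after logging "XLNetTokenizerFast is not supported!"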
def __init__(self, args=None, labels=None, device='cuda',
             bert_model_path='bert-base-uncased', architecture="DocumentBertLSTM",
             batch_size=10, bert_batch_size=7, learning_rate=5e-5,
             weight_decay=0, use_tensorboard=False):
    if args is not None:
        self.args = vars(args)
    if not args:
        self.args = {}
        self.args['bert_model_path'] = bert_model_path
        self.args['device'] = device
        self.args['learning_rate'] = learning_rate
        self.args['weight_decay'] = weight_decay
        self.args['batch_size'] = batch_size
        self.args['labels'] = labels
        self.args['bert_batch_size'] = bert_batch_size
        self.args['architecture'] = architecture
        self.args['use_tensorboard'] = use_tensorboard
    if 'fold' not in self.args:
        self.args['fold'] = 0

    assert self.args['labels'] is not None, "Must specify all labels in prediction"

    self.log = logging.getLogger()

    # account for some random tensorflow naming scheme
    if 'Distil' in self.args['architecture']:
        ArchitectureConfig = DistilBertConfig
        self.bert_tokenizer = DistilBertTokenizer.from_pretrained(self.args['bert_model_path'])
    else:
        ArchitectureConfig = BertConfig
        self.bert_tokenizer = BertTokenizer.from_pretrained(self.args['bert_model_path'])

    if os.path.exists(self.args['bert_model_path']):
        if os.path.exists(os.path.join(self.args['bert_model_path'], CONFIG_NAME)):
            config = ArchitectureConfig.from_json_file(
                os.path.join(self.args['bert_model_path'], CONFIG_NAME))
        elif os.path.exists(os.path.join(self.args['bert_model_path'], 'bert_config.json')):
            config = ArchitectureConfig.from_json_file(
                os.path.join(self.args['bert_model_path'], 'bert_config.json'))
        else:
            raise ValueError(
                "Cannot find a configuration for the BERT based model you are attempting to load.")
    else:
        config = ArchitectureConfig.from_pretrained(self.args['bert_model_path'])

    config.num_labels = len(self.args['labels'])
    config.bert_batch_size = self.args['bert_batch_size']

    if 'use_tensorboard' in self.args and self.args['use_tensorboard']:
        assert 'model_directory' in self.args, "Must have a logging and checkpoint directory set."
        from torch.utils.tensorboard import SummaryWriter
        self.tensorboard_writer = SummaryWriter(os.path.join(
            self.args['model_directory'],
            "..",
            "runs",
            self.args['model_directory'].split(os.path.sep)[-1] + '_'
            + self.args['architecture'] + '_' + str(self.args['fold'])))

    self.bert_doc_classification = document_bert_architectures[self.args['architecture']].from_pretrained(
        self.args['bert_model_path'], config=config)

    # Alternative optimizer setup with per-group weight decay:
    # param_optimizer = list(self.bert_doc_classification.named_parameters())
    # no_decay = ['bias', 'gamma', 'beta']
    # optimizer_grouped_parameters = [
    #     {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
    #      'weight_decay_rate': 0.01},
    #     {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
    #      'weight_decay_rate': 0.0}
    # ]
    # self.optimizer = torch.optim.Adam(optimizer_grouped_parameters, lr=self.args['learning_rate'])

    # Change these lines if you want to freeze BERT, unfreeze BERT,
    # or only freeze the last layers of BERT.
    self.bert_doc_classification.freeze_bert_encoder()
    self.bert_doc_classification.unfreeze_bert_encoder_last_layers()

    self.optimizer = torch.optim.Adam(
        self.bert_doc_classification.parameters(),
        weight_decay=self.args['weight_decay'],
        lr=self.args['learning_rate'])
def load(cls, pretrained_model_name_or_path, tokenizer_class=None, use_fast=False, **kwargs):
    """
    Enables loading of different Tokenizer classes with a uniform interface.
    Either infer the class from model config or define it manually via `tokenizer_class`.

    :param pretrained_model_name_or_path: The path of the saved pretrained model or its name (e.g. `bert-base-uncased`)
    :type pretrained_model_name_or_path: str
    :param tokenizer_class: (Optional) Name of the tokenizer class to load (e.g. `BertTokenizer`)
    :type tokenizer_class: str
    :param use_fast: (Optional, False by default) Indicate if FARM should try to load the fast version
                     of the tokenizer (True) or use the Python one (False).
                     Only DistilBERT, BERT and Electra fast tokenizers are supported.
    :type use_fast: bool
    :param kwargs:
    :return: Tokenizer
    """
    pretrained_model_name_or_path = str(pretrained_model_name_or_path)

    if tokenizer_class is None:
        tokenizer_class = cls._infer_tokenizer_class(pretrained_model_name_or_path)

    logger.info(f"Loading tokenizer of type '{tokenizer_class}'")

    # return appropriate tokenizer object
    ret = None
    if tokenizer_class == "AlbertTokenizer":
        if use_fast:
            logger.error('AlbertTokenizerFast is not supported! Using AlbertTokenizer instead.')
        ret = AlbertTokenizer.from_pretrained(pretrained_model_name_or_path, keep_accents=True, **kwargs)
    elif tokenizer_class == "XLMRobertaTokenizer":
        if use_fast:
            logger.error('XLMRobertaTokenizerFast is not supported! Using XLMRobertaTokenizer instead.')
        ret = XLMRobertaTokenizer.from_pretrained(pretrained_model_name_or_path, **kwargs)
    elif "RobertaTokenizer" in tokenizer_class:  # "in" so the fast tokenizer class name also matches
        if use_fast:
            logger.error('RobertaTokenizerFast is not supported! Using RobertaTokenizer instead.')
        ret = RobertaTokenizer.from_pretrained(pretrained_model_name_or_path, **kwargs)
    elif "DistilBertTokenizer" in tokenizer_class:  # "in" so the fast tokenizer class name also matches
        if use_fast:
            ret = DistilBertTokenizerFast.from_pretrained(pretrained_model_name_or_path, **kwargs)
        else:
            ret = DistilBertTokenizer.from_pretrained(pretrained_model_name_or_path, **kwargs)
    elif "BertTokenizer" in tokenizer_class:  # "in" so the fast tokenizer class name also matches
        if use_fast:
            ret = BertTokenizerFast.from_pretrained(pretrained_model_name_or_path, **kwargs)
        else:
            ret = BertTokenizer.from_pretrained(pretrained_model_name_or_path, **kwargs)
    elif tokenizer_class == "XLNetTokenizer":
        if use_fast:
            logger.error('XLNetTokenizerFast is not supported! Using XLNetTokenizer instead.')
        ret = XLNetTokenizer.from_pretrained(pretrained_model_name_or_path, keep_accents=True, **kwargs)
    elif "ElectraTokenizer" in tokenizer_class:  # "in" so the fast tokenizer class name also matches
        if use_fast:
            ret = ElectraTokenizerFast.from_pretrained(pretrained_model_name_or_path, **kwargs)
        else:
            ret = ElectraTokenizer.from_pretrained(pretrained_model_name_or_path, **kwargs)
    elif tokenizer_class == "EmbeddingTokenizer":
        if use_fast:
            logger.error('EmbeddingTokenizerFast is not supported! Using EmbeddingTokenizer instead.')
        ret = EmbeddingTokenizer.from_pretrained(pretrained_model_name_or_path, **kwargs)
    elif tokenizer_class == "CamembertTokenizer":
        if use_fast:
            logger.error('CamembertTokenizerFast is not supported! Using CamembertTokenizer instead.')
        ret = CamembertTokenizer._from_pretrained(pretrained_model_name_or_path, **kwargs)
    elif tokenizer_class in ("DPRQuestionEncoderTokenizer", "DPRQuestionEncoderTokenizerFast"):
        if use_fast or tokenizer_class == "DPRQuestionEncoderTokenizerFast":
            ret = DPRQuestionEncoderTokenizerFast.from_pretrained(pretrained_model_name_or_path, **kwargs)
        else:
            ret = DPRQuestionEncoderTokenizer.from_pretrained(pretrained_model_name_or_path, **kwargs)
    elif tokenizer_class in ("DPRContextEncoderTokenizer", "DPRContextEncoderTokenizerFast"):
        if use_fast or tokenizer_class == "DPRContextEncoderTokenizerFast":
            ret = DPRContextEncoderTokenizerFast.from_pretrained(pretrained_model_name_or_path, **kwargs)
        else:
            ret = DPRContextEncoderTokenizer.from_pretrained(pretrained_model_name_or_path, **kwargs)
    if ret is None:
        raise Exception("Unable to load tokenizer")
    return ret
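# In the DPR branches above, the fast class names are matched explicitly, so
# passing the `...Fast` class name has the same effect as `use_fast=True`.
# A sketch; the model name is the public DPR question encoder on the
# Hugging Face hub and is an assumption about what callers would pass.
t1 = Tokenizer.load("facebook/dpr-question_encoder-single-nq-base", use_fast=True)
t2 = Tokenizer.load("facebook/dpr-question_encoder-single-nq-base",
                    tokenizer_class="DPRQuestionEncoderTokenizerFast")
# Both t1 and t2 are DPRQuestionEncoderTokenizerFast instances.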
def load(cls, pretrained_model_name_or_path, tokenizer_class=None, **kwargs):
    """
    Enables loading of different Tokenizer classes with a uniform interface.
    Either infer the class from `pretrained_model_name_or_path` or define it
    manually via `tokenizer_class`.

    :param pretrained_model_name_or_path: The path of the saved pretrained model or its name (e.g. `bert-base-uncased`)
    :type pretrained_model_name_or_path: str
    :param tokenizer_class: (Optional) Name of the tokenizer class to load (e.g. `BertTokenizer`)
    :type tokenizer_class: str
    :param kwargs:
    :return: Tokenizer
    """
    pretrained_model_name_or_path = str(pretrained_model_name_or_path)

    # guess tokenizer type from name
    if tokenizer_class is None:
        if "albert" in pretrained_model_name_or_path.lower():
            tokenizer_class = "AlbertTokenizer"
        elif "xlm-roberta" in pretrained_model_name_or_path.lower():
            tokenizer_class = "XLMRobertaTokenizer"
        elif "roberta" in pretrained_model_name_or_path.lower():
            tokenizer_class = "RobertaTokenizer"
        elif "codebert" in pretrained_model_name_or_path.lower():
            if "mlm" in pretrained_model_name_or_path.lower():
                raise NotImplementedError("MLM part of codebert is currently not supported in FARM")
            else:
                tokenizer_class = "RobertaTokenizer"
        elif "camembert" in pretrained_model_name_or_path.lower() or "umberto" in pretrained_model_name_or_path:
            tokenizer_class = "CamembertTokenizer"
        elif "distilbert" in pretrained_model_name_or_path.lower():
            tokenizer_class = "DistilBertTokenizer"
        elif "bert" in pretrained_model_name_or_path.lower():
            tokenizer_class = "BertTokenizer"
        elif "xlnet" in pretrained_model_name_or_path.lower():
            tokenizer_class = "XLNetTokenizer"
        elif "electra" in pretrained_model_name_or_path.lower():
            tokenizer_class = "ElectraTokenizer"
        elif "word2vec" in pretrained_model_name_or_path.lower() or \
                "glove" in pretrained_model_name_or_path.lower() or \
                "fasttext" in pretrained_model_name_or_path.lower():
            tokenizer_class = "EmbeddingTokenizer"
        elif "minilm" in pretrained_model_name_or_path.lower():
            tokenizer_class = "BertTokenizer"
        else:
            raise ValueError(
                f"Could not infer tokenizer_class from name '{pretrained_model_name_or_path}'. Set "
                f"arg `tokenizer_class` in Tokenizer.load() to one of: AlbertTokenizer, "
                f"XLMRobertaTokenizer, RobertaTokenizer, DistilBertTokenizer, BertTokenizer, or "
                f"XLNetTokenizer.")
    logger.info(f"Loading tokenizer of type '{tokenizer_class}'")

    # return appropriate tokenizer object
    ret = None
    if tokenizer_class == "AlbertTokenizer":
        ret = AlbertTokenizer.from_pretrained(pretrained_model_name_or_path, keep_accents=True, **kwargs)
    elif tokenizer_class == "XLMRobertaTokenizer":
        ret = XLMRobertaTokenizer.from_pretrained(pretrained_model_name_or_path, **kwargs)
    elif tokenizer_class == "RobertaTokenizer":
        ret = RobertaTokenizer.from_pretrained(pretrained_model_name_or_path, **kwargs)
    elif tokenizer_class == "DistilBertTokenizer":
        ret = DistilBertTokenizer.from_pretrained(pretrained_model_name_or_path, **kwargs)
    elif tokenizer_class == "BertTokenizer":
        ret = BertTokenizer.from_pretrained(pretrained_model_name_or_path, **kwargs)
    elif tokenizer_class == "XLNetTokenizer":
        ret = XLNetTokenizer.from_pretrained(pretrained_model_name_or_path, keep_accents=True, **kwargs)
    elif tokenizer_class == "ElectraTokenizer":
        ret = ElectraTokenizer.from_pretrained(pretrained_model_name_or_path, **kwargs)
    elif tokenizer_class == "EmbeddingTokenizer":
        ret = EmbeddingTokenizer.from_pretrained(pretrained_model_name_or_path, **kwargs)
    elif tokenizer_class == "CamembertTokenizer":
        ret = CamembertTokenizer._from_pretrained(pretrained_model_name_or_path, **kwargs)
    if ret is None:
        raise Exception("Unable to load tokenizer")
    return ret
os.remove(f"./distilbert-squad-{SEQUENCE_LENGTH}.onnx") # fp16 try: model_fp16_spec = coremltools.utils.convert_neural_network_spec_weights_to_fp16( mlmodel.get_spec()) coremltools.utils.save_spec( model_fp16_spec, f"../Resources/distilbert-squad-{SEQUENCE_LENGTH}_FP16.mlmodel") except Exception as e: print(e) ##### Now check the outputs. print("––––––\n") tokenizer = DistilBertTokenizer.from_pretrained( "distilbert-base-uncased-distilled-squad") def generate_input_ids() -> np.array: """ Returns: np.array of shape (1, seq_len) """ x = tokenizer.encode( "Here is some text to encode, Here is some text to encode, Here is some text to encode", add_special_tokens=True, ) x += (SEQUENCE_LENGTH - len(x)) * [tokenizer.pad_token_id] return np.array([x], dtype=np.long)