def _ensure_bert():
    if "tokenizer" not in BERT_SINGLETONS:
        tokenizer = BertTokenizer.from_pretrained(FLAGS.bert_version)
        BERT_SINGLETONS["tokenizer"] = tokenizer
    if "representer" not in BERT_SINGLETONS:
        representer = BertModel.from_pretrained(
            FLAGS.bert_version, output_hidden_states=True).to(_device())
        BERT_SINGLETONS["representer"] = representer
    return BERT_SINGLETONS["tokenizer"], BERT_SINGLETONS["representer"]

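# Minimal standalone sketch of the same lazy-singleton pattern, assuming the
# `transformers` package; MODEL_NAME and _SINGLETONS are stand-ins for
# FLAGS.bert_version and BERT_SINGLETONS above.
from transformers import BertModel, BertTokenizer

MODEL_NAME = "bert-base-uncased"
_SINGLETONS = {}

def ensure_bert_demo():
    # Load each heavyweight object at most once per process.
    if "tokenizer" not in _SINGLETONS:
        _SINGLETONS["tokenizer"] = BertTokenizer.from_pretrained(MODEL_NAME)
    if "model" not in _SINGLETONS:
        _SINGLETONS["model"] = BertModel.from_pretrained(
            MODEL_NAME, output_hidden_states=True)
    return _SINGLETONS["tokenizer"], _SINGLETONS["model"]
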
def __init__(self):
    super(BertForWordSegmentation_4, self).__init__()
    self.tokenizer = BertTokenizer.from_pretrained(
        'bert-base-multilingual-cased', do_lower_case=False)
    # do_lower_case is a tokenizer option, not a model option, so it is
    # dropped from the model call.
    self.model = BertModel.from_pretrained(
        'bert-base-multilingual-cased',
        output_hidden_states=True).to('cuda')
    self.classifier = DropoutClassifier(768 * 4, 2).to('cuda')

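# Hedged sketch of why the classifier input above is 768 * 4: with
# output_hidden_states=True the model returns every layer's output, and a
# common feature is the concatenation of the last four 768-dim layers.
# Assumes the `transformers` convention that hidden_states is a tuple of
# (batch, seq_len, 768) tensors, embeddings first.
import torch

def last_four_concat(hidden_states):
    # Concatenate the last four layers along the feature dimension.
    return torch.cat(hidden_states[-4:], dim=-1)  # (batch, seq_len, 3072)
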
def load_tokenizer(self):
    if self.model_configuration.is_xlnet:
        self.tokenizer = XLNetTokenizer.from_pretrained(
            self.model_configuration.bert_model,
            do_lower_case=self.model_configuration.do_lower)
    elif not self.model_configuration.is_scibert:
        self.tokenizer = BertTokenizer.from_pretrained(
            self.model_configuration.bert_model,
            do_lower_case=self.model_configuration.do_lower)
    else:
        self.tokenizer = BertTokenizer(
            self.model_configuration.vocab_file,
            do_lower_case=self.model_configuration.do_lower)

def initialize(self):
    super().initialize()
    bert_model = bert_models.get_model(self.bert_base, self.logger)
    self.tokenizer = BertTokenizer.from_pretrained(bert_model)
    # TODO: HACK! Until the transformers library adopts tokenizers,
    # save and re-load the vocab.
    with tempfile.TemporaryDirectory() as d:
        self.tokenizer.save_vocabulary(d)
        # This tokenizer is ~4x faster than BertTokenizer, per my measurements.
        self.tokenizer = tk.BertWordPieceTokenizer(os.path.join(d, 'vocab.txt'))

def __init__(self, vocab, encoding_length=20, added_special_tokens=[]):  # <NAV>, <ORA>, <TAR>
    from pytorch_transformers import BertTokenizer
    self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    # added_tok = {'additional_special_tokens': added_special_tokens}
    # self.tokenizer.add_special_tokens(added_tok)
    self.encoding_length = encoding_length
    # Split on any non-alphanumeric character.
    self.split_regex = re.compile(r'(\W+)')
    self.vocab = vocab

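# Hedged aside on the regex above: because the capture group is kept, the
# separators are preserved in the split output, e.g.
#   re.split(r'(\W+)', "turn left!") -> ['turn', ' ', 'left', '!', '']
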
def __init__(self,
             device=torch.device("cuda" if torch.cuda.is_available() else "cpu"),
             batch_size=16):
    self.device = device if isinstance(device, torch.device) else torch.device(device)
    self.model_type = "distress"
    self.batch_size = batch_size
    model_path = os.path.join(os.path.dirname(__file__),
                              f"models/{self.model_type}.pth")
    self.tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
    self.model = load_model(self.model_type, model_path, self.device)

def __init__(self, inv_dict):
    super(Model, self).__init__()
    self.config = config.SNLIConfig()
    model = BertModel.from_pretrained(self.config.BERT_MODEL)
    self.model = ModelTrainer(model, 3)
    self.model.load_state_dict(torch.load(self.config.model_name))
    self.model = self.model.eval().cuda()
    self.inv_dict = inv_dict
    self.tokenizer = BertTokenizer.from_pretrained(self.config.BERT_MODEL)
    self.m = nn.Softmax(dim=1)

def __init__(self, device='cuda'):
    super().__init__()
    self._tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    self._model = BertForMultipleChoice.from_pretrained('bert-base-uncased')
    self._blank = '[unused0]'
    self._question = '[SEP]'
    self._context = '[SEP]'
    self._choice = '[SEP]'
    self._choice_split = '[SEP]'
    self._device = device

def train_abs_single(args, device_id):
    setattr(args, "device_id", device_id)
    init_logger(args.log_file)
    logger.info(str(args))
    device = "cpu" if args.visible_gpus == '-1' else "cuda"
    logger.info('Device ID %d' % device_id)
    logger.info('Device %s' % device)

    torch.manual_seed(args.seed)
    random.seed(args.seed)
    torch.backends.cudnn.deterministic = True

    if device_id >= 0:
        torch.cuda.set_device(device_id)
        torch.cuda.manual_seed(args.seed)

    if args.train_from != '':
        logger.info('Loading checkpoint from %s' % args.train_from)
        checkpoint = torch.load(args.train_from,
                                map_location=lambda storage, loc: storage)
        opt = vars(checkpoint['opt'])
        for k in opt.keys():
            if k in model_flags:
                setattr(args, k, opt[k])
    else:
        checkpoint = None

    def train_iter_fct():
        return data_loader.Dataloader(args, load_dataset(args, 'train', shuffle=True),
                                      args.batch_size, device,
                                      shuffle=True, is_test=False)

    model = AbsSummarizer(args, device, checkpoint)
    if args.sep_optim:
        optim_bert = model_builder.build_optim_bert(args, model, checkpoint)
        optim_dec = model_builder.build_optim_dec(args, model, checkpoint)
        optim = [optim_bert, optim_dec]
    else:
        optim = [model_builder.build_optim(args, model, checkpoint)]
    logger.info(model)

    tokenizer = BertTokenizer.from_pretrained(
        path.join(args.bert_model_path, model.bert.model_name),
        do_lower_case=True, cache_dir=args.temp_dir)
    symbols = {'BOS': tokenizer.vocab['[unused0]'],
               'EOS': tokenizer.vocab['[unused1]'],
               'PAD': tokenizer.vocab['[PAD]'],
               'EOQ': tokenizer.vocab['[unused2]']}

    train_loss = abs_loss(model.generator, symbols['PAD'], model.vocab_size, device,
                          train=True, label_smoothing=args.label_smoothing)

    trainer = build_trainer(args, device_id, model, optim, train_loss)
    trainer.train(train_iter_fct, args.train_steps)

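# Hedged aside on the `symbols` mapping above: BERT's WordPiece vocab reserves
# placeholder entries ('[unused0]', '[unused1]', ...) that fine-tuning code can
# repurpose as BOS/EOS/EOQ markers. For bert-base-uncased the ids are:
#   tokenizer.vocab['[PAD]']     == 0
#   tokenizer.vocab['[unused0]'] == 1
#   tokenizer.vocab['[unused1]'] == 2
#   tokenizer.vocab['[unused2]'] == 3
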
def gen_dataloader(_train_path, _test_path, batch_size,
                   preprocess_inputs=False,
                   tokenizer_type='bert-base-uncased',
                   input_len=128, **kwargs):
    """
    Helper function that takes either just the train data path or both the
    train and test data paths, and outputs the appropriate dataloader instance.

    kwargs are:
        for preprocessing:
            sample_size = None
            weak_supervision = True
            max_len = 128
            filter_bad_rows = True
            tokenizer = DEFAULT_TOKENIZER
        for dataloaders:
            val_sample_dataloader = True
            pin_memory = False
            num_workers = 0
    """
    if 'bert' in tokenizer_type.lower():
        tokenizer = BertTokenizer.from_pretrained(tokenizer_type)
    elif 'xlnet' in tokenizer_type.lower():
        tokenizer = XLNetTokenizer.from_pretrained(tokenizer_type)
    else:
        raise NotImplementedError(
            'model {} is not implemented'.format(tokenizer_type))

    train_dataset = read_data_to_dataframe(_train_path)
    if preprocess_inputs:
        df_train = preprocess_model_inputs(train_dataset, tokenizer=tokenizer,
                                           output_len=input_len, **kwargs)
    else:
        df_train = train_dataset

    if _test_path:
        test_dataset = read_data_to_dataframe(_test_path)
        if preprocess_inputs:
            df_test = preprocess_model_inputs(test_dataset, tokenizer=tokenizer,
                                              **kwargs)
        else:
            df_test = test_dataset
        return TrainValDataloader(df_train, df_test, batch_size, kwargs)

    return TrainValSplitDataloader(df_train, batch_size, kwargs)

def bert_word_data_variable(batch, config):
    tokenizer = BertTokenizer.from_pretrained('Data/ms/.')
    batch_size = len(batch) * 2
    src_premise_matrix = np.zeros((batch_size, config.max_sen_len + 2))
    src_hypothesis_matrix = np.zeros((batch_size, config.max_sen_len + 2))
    p_mask = np.zeros((batch_size, config.max_sen_len + 2))
    h_mask = np.zeros((batch_size, config.max_sen_len + 2))
    tag_matrix = np.zeros(batch_size)

    for idx, instance in enumerate(batch):
        premise = tokenizer.encode(instance[0])
        hypothesis_b = tokenizer.encode(instance[1])
        hypothesis_c = tokenizer.encode(instance[2])

        # Keep only the last max_sen_len tokens of over-long sequences.
        if len(premise) > config.max_sen_len:
            premise = premise[len(premise) - config.max_sen_len:]
        if len(hypothesis_b) > config.max_sen_len:
            hypothesis_b = hypothesis_b[len(hypothesis_b) - config.max_sen_len:]
        if len(hypothesis_c) > config.max_sen_len:
            hypothesis_c = hypothesis_c[len(hypothesis_c) - config.max_sen_len:]

        # Wrap each sequence with [CLS] (id 101) and [SEP] (id 102).
        premise.insert(0, 101)
        premise.append(102)
        p_len = len(premise)
        hypothesis_b.insert(0, 101)
        hypothesis_b.append(102)
        hb_len = len(hypothesis_b)
        hypothesis_c.insert(0, 101)
        hypothesis_c.append(102)
        hc_len = len(hypothesis_c)

        # Each instance fills two rows: (premise, hypothesis_b) labeled 1
        # and (premise, hypothesis_c) labeled 0.
        for jdx in range(p_len):
            src_premise_matrix[idx * 2][jdx] = premise[jdx]
            src_premise_matrix[idx * 2 + 1][jdx] = premise[jdx]
            p_mask[idx * 2][jdx] = 1
            p_mask[idx * 2 + 1][jdx] = 1
        for kdx in range(hb_len):
            src_hypothesis_matrix[idx * 2][kdx] = hypothesis_b[kdx]
            h_mask[idx * 2][kdx] = 1
        for gdx in range(hc_len):
            src_hypothesis_matrix[idx * 2 + 1][gdx] = hypothesis_c[gdx]
            h_mask[idx * 2 + 1][gdx] = 1
        tag_matrix[idx * 2] = 1
        tag_matrix[idx * 2 + 1] = 0

    src_premise_matrix = torch.from_numpy(src_premise_matrix).long()
    src_hypothesis_matrix = torch.from_numpy(src_hypothesis_matrix).long()
    p_mask = torch.from_numpy(p_mask).float()
    h_mask = torch.from_numpy(h_mask).float()
    tag_matrix = torch.from_numpy(tag_matrix).long()

    if config.use_cuda:
        src_premise_matrix = src_premise_matrix.cuda()
        src_hypothesis_matrix = src_hypothesis_matrix.cuda()
        p_mask = p_mask.cuda()
        h_mask = h_mask.cuda()
        tag_matrix = tag_matrix.cuda()

    return [src_premise_matrix, src_hypothesis_matrix, p_mask, h_mask, tag_matrix]

def getBertSentenceFromRaw(raw_sent):
    """
    Convert the original tokenization to the BERT tokenization.
    Returns the BERT (WordPiece) token list.
    """
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    bert_sent_list = []
    for raw_word in raw_sent:
        bert_tokens = tokenizer.tokenize(raw_word)
        bert_sent_list += bert_tokens
    return bert_sent_list

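# Usage sketch for getBertSentenceFromRaw(): the input is an already-split
# word list; rare words come back as several '##'-prefixed WordPiece pieces,
# so the output can be longer than the input (word-level labels then need
# re-alignment). Common words pass through unchanged:
#   getBertSentenceFromRaw(["the", "cat"]) -> ['the', 'cat']
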
def build_lm_data(raw_data: List) -> List:
    tokenizer = BertTokenizer.from_pretrained(config.lm_name)
    sents = []
    for data in raw_data:
        sent = data[0]
        sub = data[1]
        obj = data[4]
        # Special tokens are inserted by hand here, so encode() must not add
        # its own [CLS]/[SEP] pair (pass add_special_tokens=False on newer APIs).
        sent = '[CLS]' + sent + '[SEP]' + sub + '[SEP]' + obj + '[SEP]'
        input_ids = torch.tensor([tokenizer.encode(sent)])
        sents.append(input_ids)
    return sents

def __init__(self, test_text, ref_text, batch_size, device, method='mean'):
    super().__init__()
    self.name = 'BertDist'
    self.ref_text = ref_text
    self.test_text = test_text
    self.batch_size = batch_size
    self.device = device
    self.method = method
    self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def __init__(self, batches, batch_size, device, bert_model_path):
    self.bert_tokenizer = BertTokenizer.from_pretrained(bert_model_path)
    self.batch_size = batch_size
    self.batches = batches
    self.n_batches = len(batches) // batch_size
    # Records whether a leftover partial batch exists.
    self.residue = False
    if len(batches) % batch_size != 0:
        self.residue = True
    self.index = 0
    self.device = device

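# A minimal sketch (hypothetical function name) of how the `residue` flag is
# typically consumed: yield full batches first, then one short final batch.
def iter_batches(batches, batch_size):
    n_batches = len(batches) // batch_size
    for i in range(n_batches):
        yield batches[i * batch_size:(i + 1) * batch_size]
    if len(batches) % batch_size != 0:
        # The residue: a final batch with fewer than batch_size examples.
        yield batches[n_batches * batch_size:]
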
def validate(args, device_id, pt, step):
    device = "cpu" if args.visible_gpus == '-1' else "cuda"
    if pt != '':
        test_from = pt
    else:
        test_from = args.test_from
    logger.info('Loading checkpoint from %s' % test_from)

    checkpoint = torch.load(test_from,
                            map_location=lambda storage, loc: storage)
    opt = vars(checkpoint['opt'])
    for k in opt.keys():
        if k in model_flags:
            setattr(args, k, opt[k])
    print(args)

    model = Z_AbsSummarizer(args, device, checkpoint)
    model.eval()

    valid_iter = data_loader.Dataloader(args, load_dataset(args, 'valid', shuffle=False),
                                        args.batch_size, device,
                                        shuffle=False, is_test=False)

    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased',
                                              do_lower_case=True,
                                              cache_dir=args.temp_dir)
    symbols = {'BOS': tokenizer.vocab['[unused0]'],
               'EOS': tokenizer.vocab['[unused1]'],
               'PAD': tokenizer.vocab['[PAD]'],
               'EOQ': tokenizer.vocab['[unused2]']}

    if COPY:
        valid_loss = abs_loss(model.generator, symbols, model.vocab_size,
                              train=False, device=device,
                              copy_generator=model.copy_generator)
    else:
        valid_loss = abs_loss(model.generator, symbols, model.vocab_size,
                              train=False, device=device)

    trainer = build_trainer(args, device_id, model, None, valid_loss)
    stats = trainer.validate(valid_iter, step)
    return stats.xent()

def init(args):
    BERTTool.multi_bert = BertModel.from_pretrained(args.multi_bert.location)
    BERTTool.multi_tokener = BertTokenizer.from_pretrained(args.multi_bert.location)
    BERTTool.multi_pad = BERTTool.multi_tokener.convert_tokens_to_ids(["[PAD]"])[0]
    BERTTool.multi_sep = BERTTool.multi_tokener.convert_tokens_to_ids(["[SEP]"])[0]
    BERTTool.multi_cls = BERTTool.multi_tokener.convert_tokens_to_ids(["[CLS]"])[0]

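# Hedged aside: recent transformers releases expose the same ids directly as
# tokenizer attributes, so the lookups above could also be written as:
#   BERTTool.multi_pad = BERTTool.multi_tokener.pad_token_id
#   BERTTool.multi_sep = BERTTool.multi_tokener.sep_token_id
#   BERTTool.multi_cls = BERTTool.multi_tokener.cls_token_id
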
def __init__(self, args=None, device='cuda', bert_model_path='bert-base-uncased',
             batch_size=10, learning_rate=5e-5, weight_decay=0,
             additional_features=None):
    if args is not None:
        self.args = vars(args)
    assert device in ['cuda', 'cpu']
    if not args:
        self.args = {}
        self.args['bert_model_path'] = bert_model_path
        self.args['device'] = device
        self.args['learning_rate'] = learning_rate
        self.args['weight_decay'] = weight_decay
        self.args['batch_size'] = batch_size
    self.log = logging.getLogger()
    self.bert_tokenizer = BertTokenizer.from_pretrained(self.args['bert_model_path'])

    # NOTE: assumes bert_model_path is a local directory with a config file;
    # `config` is only bound inside this branch.
    if os.path.exists(self.args['bert_model_path']):
        if os.path.exists(os.path.join(self.args['bert_model_path'], CONFIG_NAME)):
            config = BertConfig.from_json_file(
                os.path.join(self.args['bert_model_path'], CONFIG_NAME))
        elif os.path.exists(os.path.join(self.args['bert_model_path'],
                                         'bert_config.json')):
            config = BertConfig.from_json_file(
                os.path.join(self.args['bert_model_path'], 'bert_config.json'))
        else:
            raise ValueError("Cannot find a configuration for the BERT model "
                             "you are attempting to load.")

    self.loss_function = torch.nn.MSELoss()
    # HACK: smuggle additional_features to the model through the config.
    config.pretrained_config_archive_map['additional_features'] = additional_features
    self.regressor_net = BertSimilarityRegressor.from_pretrained(
        self.args['bert_model_path'], config=config)
    self.optimizer = torch.optim.Adam(self.regressor_net.parameters(),
                                      weight_decay=self.args['weight_decay'],
                                      lr=self.args['learning_rate'])
    self.log.info('Initialized BertSentencePairSimilarity model from %s'
                  % self.args['bert_model_path'])

def __init__(self, ext_vocab=None,
             key_name=None,
             bert_vocab_name='bert-base-uncased'):
    # Initialize with default values (may be overwritten by a subclass).
    self.ext_vocab = ext_vocab or ["[PAD]", "[UNK]", "[CLS]", "[SEP]"]
    self.tokenizer = BertTokenizer.from_pretrained(bert_vocab_name)
    self._build_bert_vocab()
    super().__init__(self.ext_vocab, key_name)

def main():
    parser = argparse.ArgumentParser(
        description="Preprocess the data to avoid re-doing it several times "
                    "(tokenization + token_to_ids).")
    parser.add_argument('--file_path', type=str, default='data/dump.txt',
                        help='The path to the data.')
    parser.add_argument('--bert_tokenizer', type=str, default='bert-base-uncased',
                        help='The tokenizer to use.')
    parser.add_argument('--dump_file', type=str, default='data/dump',
                        help='The dump file prefix.')
    args = parser.parse_args()

    logger.info(f'Loading Tokenizer ({args.bert_tokenizer})')
    bert_tokenizer = BertTokenizer.from_pretrained(args.bert_tokenizer)

    logger.info(f'Loading text from {args.file_path}')
    with open(args.file_path, 'r', encoding='utf8') as fp:
        data = fp.readlines()

    logger.info('Start encoding')
    logger.info(f'{len(data)} examples to process.')

    rslt = []
    iter = 0
    interval = 10000
    start = time.time()
    for text in data:
        text = f'[CLS] {text.strip()} [SEP]'
        token_ids = bert_tokenizer.encode(text)
        rslt.append(token_ids)

        iter += 1
        if iter % interval == 0:
            end = time.time()
            logger.info(f'{iter} examples processed. - {(end-start)/interval:.2f}s/expl')
            start = time.time()

    logger.info('Finished binarization')
    logger.info(f'{len(data)} examples processed.')

    dp_file = f'{args.dump_file}.{args.bert_tokenizer}.pickle'
    # Token ids fit in uint16 because BERT vocabularies have < 65536 entries.
    rslt_ = [np.uint16(d) for d in rslt]
    random.shuffle(rslt_)
    logger.info(f'Dump to {dp_file}')
    with open(dp_file, 'wb') as handle:
        pickle.dump(rslt_, handle, protocol=pickle.HIGHEST_PROTOCOL)

def tokenize(input_string, output_file):
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    tokens = tokenizer.tokenize(input_string)
    line = " ".join(tokens)
    line = '{}\n'.format(line)
    with open(output_file + ".src", 'w', encoding='utf-8') as src:
        src.write(line)
    with open(output_file + ".tgt", 'w', encoding='utf-8') as tgt:
        # Write the tokenized line itself, not the literal string "line".
        tgt.write(line)

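# Usage sketch for tokenize() (output path is illustrative): with
# bert-base-uncased, text is lowercased and punctuation is split off, so
#   tokenize("The quick brown fox.", "/tmp/example")
# writes "the quick brown fox ." to both /tmp/example.src and /tmp/example.tgt.
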
def __init__(self, model_path='bert-base-uncased', tokenizer_path=None, device=None):
    super().__init__(device)
    self.model_path = model_path
    # Fall back to model_path when no separate tokenizer path is given
    # (the original ignored tokenizer_path entirely).
    self.tokenizer = BertTokenizer.from_pretrained(tokenizer_path or model_path)
    self.model = BertForMaskedLM.from_pretrained(model_path)
    self.model.to(device)
    self.model.eval()

def _add_lm_data(data: List[Dict]) -> List[Dict]:
    """Serialize the input sentences using the language model's vocabulary."""
    tokenizer = BertTokenizer.from_pretrained(config.lm.lm_file)
    for d in data:
        sent = d['sentence']
        sent += '[SEP]' + d['head'] + '[SEP]' + d['tail']
        d['lm_idx'] = tokenizer.encode(sent, add_special_tokens=True)
        d['seq_len'] = len(d['lm_idx'])
    return data

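# Hedged demo of the convention used above, assuming a recent `transformers`
# release and bert-base-uncased: '[SEP]' markers embedded in the text are
# recognized as special tokens, while add_special_tokens=True wraps the whole
# sequence in one extra [CLS] ... [SEP] pair.
from transformers import BertTokenizer

def demo_inline_sep():
    tok = BertTokenizer.from_pretrained("bert-base-uncased")
    ids = tok.encode("london[SEP]england", add_special_tokens=True)
    print(tok.convert_ids_to_tokens(ids))
    # -> ['[CLS]', 'london', '[SEP]', 'england', '[SEP]']
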
def main(text):
    tokenizer = BertTokenizer.from_pretrained('./', do_lower_case=True)
    model = BertForSequenceClassification.from_pretrained('./')
    model.to(device)

    texts = ["[CLS] " + text[:509] + " [SEP]"]
    tokenized_texts = [tokenizer.tokenize(sent) for sent in texts]
    input_ids = [tokenizer.convert_tokens_to_ids(x) for x in tokenized_texts]
    input_ids = pad_sequences(input_ids, maxlen=100, dtype="long",
                              truncating="post", padding="post")
    attention_masks = [[float(i > 0) for i in seq] for seq in input_ids]

    prediction_inputs = torch.tensor(input_ids)
    prediction_masks = torch.tensor(attention_masks)
    prediction_data = TensorDataset(prediction_inputs, prediction_masks)
    prediction_dataloader = DataLoader(prediction_data,
                                       sampler=SequentialSampler(prediction_data),
                                       batch_size=1)

    model.eval()
    preds = []
    for batch in prediction_dataloader:
        # Move the batch to the GPU.
        batch = tuple(t.to(device) for t in batch)
        # Unpack the inputs from the dataloader.
        b_input_ids, b_input_mask = batch
        # Under torch.no_grad() the model does not compute or store gradients,
        # which speeds up prediction on the test data.
        with torch.no_grad():
            logits = model(b_input_ids, token_type_ids=None,
                           attention_mask=b_input_mask)
        # Move the logits to the CPU for further processing.
        logits = logits[0].detach().cpu().numpy()
        # Store the predicted classes.
        batch_preds = np.argmax(logits, axis=1)
        preds.extend(batch_preds)
    return preds

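# Hedged note on the mask construction above: pad_sequences pads with id 0,
# so `float(i > 0)` yields 1.0 for real tokens and 0.0 for padding, e.g.
#   ids  [101, 7592, 102, 0, 0]   ([CLS] hello [SEP] pad pad)
#   mask [1.0, 1.0, 1.0, 0.0, 0.0]
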
def get_hf_path():
    if not os.path.isdir("cache/gpt-2"):
        os.makedirs("cache/gpt-2")
        model = AutoModelWithLMHead.from_pretrained("gpt2")
        model.save_pretrained("cache/gpt-2")
        tokenizer = AutoTokenizer.from_pretrained("gpt2")
        tokenizer.save_pretrained("cache/gpt-2")
    if not os.path.isdir("cache/bert"):
        os.makedirs("cache/bert")
        reward_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased',
                                                         do_lower_case=True)
        reward_tokenizer.save_pretrained("cache/bert")

def __init__(self, model_state_dict) -> None:
    no_cuda = True
    self.device = torch.device(
        "cuda" if torch.cuda.is_available() and not no_cuda else "cpu")
    self.tokenizer = BertTokenizer.from_pretrained('bert-base-chinese',
                                                   do_lower_case=False)
    config = BertConfig.from_pretrained('bert-base-chinese')
    self.model = BertForQuestionAnswering(config)
    self.model.load_state_dict(torch.load(model_state_dict, map_location='cpu'))
    self.model.to(self.device)
    self.model.eval()  # TODO

def __init__(self, config, vocab):
    super(BERT_PRETRAINED_MODEL_JAPANESE, self).__init__()
    self.config = config
    self.vocab = vocab
    self.BERT_config = BertConfig.from_json_file(
        '../published_model/bert_spm/bert_config.json')
    self.tokenizer = BertTokenizer.from_pretrained('./spm_model/wiki-ja.vocab.txt')
    self.pretrained_BERT_model = BertModel.from_pretrained(
        '../published_model/bert_spm/pytorch_model.bin', config=self.BERT_config)

def __init__(self, text=None, tokenizerName="bert-base-chinese"):
    self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    self.predict_map = {
        0: "High Positive",
        1: "Calm Positive",
        2: "High Negative",
        3: "Calm Negative"
    }
    self.modelPath = "bert_sentiment_wordmax_128_loss_0.033_lr_2e-05.pkl"
    self.tokenizer = BertTokenizer.from_pretrained(tokenizerName)
    self.text = text

def main(args):
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    ontology = json.load(open(os.path.join(args.data_root, args.ontology_data)))
    slot_meta, _ = make_slot_meta(ontology)

    tokenizer = BertTokenizer.from_pretrained(args.bert_config)
    special_tokens = ['[SLOT]', '[NULL]']
    special_tokens_dict = {'additional_special_tokens': special_tokens}
    tokenizer.add_special_tokens(special_tokens_dict)

    data = prepare_dataset(data_path=os.path.join(args.data_root, args.test_data),
                           data_list=None,
                           tokenizer=tokenizer,
                           slot_meta=slot_meta,
                           n_history=args.n_history,
                           max_seq_length=args.max_seq_length,
                           op_code=args.op_code)

    model_config = BertConfig.from_json_file(args.bert_config_path)
    model_config.dropout = 0.1
    op2id = OP_SET[args.op_code]
    model = TransformerDST(model_config, len(op2id), len(domain2id), op2id['update'])
    ckpt = torch.load(args.model_ckpt_path, map_location='cpu')
    model.load_state_dict(ckpt)
    model.eval()
    model.to(device)

    if args.eval_all:
        # Evaluate all eight (gt_op, gt_p_state, gt_gen) combinations.
        from itertools import product
        for gt_op, gt_p_state, gt_gen in product([False, True], repeat=3):
            model_evaluation(model, data, tokenizer, slot_meta, 0, args.op_code,
                             gt_op, gt_p_state, gt_gen)
    else:
        model_evaluation(model, data, tokenizer, slot_meta, 0, args.op_code,
                         args.gt_op, args.gt_p_state, args.gt_gen)

def add_pytorch_transformers_vocab(vocab, tokenizer_name):
    """Add vocabulary from tokenizers in pytorch_transformers for use with pre-tokenized data.

    These tokenizers have a convert_tokens_to_ids method, but this doesn't do
    anything special, so we can just use the standard indexers.
    """
    do_lower_case = "uncased" in tokenizer_name

    if tokenizer_name.startswith("bert-"):
        tokenizer = BertTokenizer.from_pretrained(tokenizer_name,
                                                  do_lower_case=do_lower_case)
    elif tokenizer_name.startswith("roberta-"):
        tokenizer = RobertaTokenizer.from_pretrained(tokenizer_name)
    elif tokenizer_name.startswith("xlnet-"):
        tokenizer = XLNetTokenizer.from_pretrained(tokenizer_name,
                                                   do_lower_case=do_lower_case)
    elif tokenizer_name.startswith("openai-gpt"):
        tokenizer = OpenAIGPTTokenizer.from_pretrained(tokenizer_name)
    elif tokenizer_name.startswith("gpt2"):
        tokenizer = GPT2Tokenizer.from_pretrained(tokenizer_name)
    elif tokenizer_name.startswith("transfo-xl-"):
        tokenizer = TransfoXLTokenizer.from_pretrained(tokenizer_name)
    elif tokenizer_name.startswith("xlm-"):
        tokenizer = XLMTokenizer.from_pretrained(tokenizer_name)

    if (tokenizer_name.startswith("openai-gpt")
            or tokenizer_name.startswith("gpt2")
            or tokenizer_name.startswith("transfo-xl-")):
        tokenizer.add_special_tokens({
            "bos_token": "<start>",
            "sep_token": "<delim>",
            "cls_token": "<extract>"
        })
    # TODO: this is another place that can be simplified by the
    # "model-before-preprocess" reorganization; we can pass the tokenizer
    # created in the model here, see issue <TBD>.

    # Do not use tokenizer.vocab_size: it does not include newly added tokens.
    vocab_size = len(tokenizer)
    if tokenizer_name.startswith("roberta-"):
        if tokenizer.convert_ids_to_tokens(vocab_size - 1) is None:
            vocab_size -= 1
        else:
            log.info("Time to delete vocab_size-1 in preprocess.py !!!")
    # Due to a quirk in huggingface's file, the last token of RobertaTokenizer
    # is None; remove this when they fix the problem.

    ordered_vocab = tokenizer.convert_ids_to_tokens(range(vocab_size))
    log.info("Added pytorch_transformers vocab (%s): %d tokens",
             tokenizer_name, len(ordered_vocab))
    for word in ordered_vocab:
        vocab.add_token_to_namespace(word, input_module_tokenizer_name(tokenizer_name))

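# Hedged demo of the len(tokenizer) vs tokenizer.vocab_size caveat noted
# above, assuming a recent transformers release:
from transformers import BertTokenizer

def demo_vocab_size():
    tok = BertTokenizer.from_pretrained("bert-base-uncased")
    base = tok.vocab_size
    tok.add_special_tokens({"additional_special_tokens": ["<extract>"]})
    assert tok.vocab_size == base   # vocab_size ignores added tokens
    assert len(tok) == base + 1     # len() includes them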