def convert_tf_checkpoint_to_pytorch(tf_hub_path, pytorch_dump_path, is_encoder_named_decoder, vocab_size, is_encoder):
    # Initialise PyTorch model
    bert_config = BertConfig.from_pretrained(
        "bert-large-cased",
        vocab_size=vocab_size,
        max_position_embeddings=512,
        is_decoder=True,
        add_cross_attention=True,
    )
    bert_config_dict = bert_config.to_dict()
    del bert_config_dict["type_vocab_size"]
    config = BertGenerationConfig(**bert_config_dict)
    if is_encoder:
        model = BertGenerationEncoder(config)
    else:
        model = BertGenerationDecoder(config)
    print("Building PyTorch model from configuration: {}".format(str(config)))

    # Load weights from tf checkpoint
    load_tf_weights_in_bert_generation(
        model,
        tf_hub_path,
        model_class="bert",
        is_encoder_named_decoder=is_encoder_named_decoder,
        is_encoder=is_encoder,
    )

    # Save pytorch-model
    print("Save PyTorch model and config to {}".format(pytorch_dump_path))
    model.save_pretrained(pytorch_dump_path)
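# A minimal sketch of invoking the converter above. The checkpoint paths are
# placeholder assumptions; vocab_size=50358 is the BertGenerationConfig default
# used by the published bert_for_seq_generation checkpoints.
convert_tf_checkpoint_to_pytorch(
    tf_hub_path="/path/to/tf_hub_checkpoint",
    pytorch_dump_path="/path/to/pytorch_dump",
    is_encoder_named_decoder=False,
    vocab_size=50358,
    is_encoder=True,
)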
def __init__(self, config, dataset):
    super(BERT2BERT, self).__init__(config, dataset)

    self.sos_token_idx = 101
    self.eos_token_idx = 102
    self.pretrained_model_path = config['pretrained_model_path']
    self.tokenizer = BertTokenizer.from_pretrained(self.pretrained_model_path)
    self.encoder_configure = BertConfig.from_pretrained(self.pretrained_model_path)
    self.decoder_configure = BertConfig.from_pretrained(self.pretrained_model_path)
    self.encoder_decoder_config = EncoderDecoderConfig.from_encoder_decoder_configs(
        encoder_config=self.encoder_configure, decoder_config=self.decoder_configure
    )

    self.encoder = BertGenerationEncoder.from_pretrained(
        self.pretrained_model_path, bos_token_id=self.sos_token_idx, eos_token_id=self.eos_token_idx
    )
    self.decoder = BertGenerationDecoder.from_pretrained(
        self.pretrained_model_path,
        bos_token_id=self.sos_token_idx,
        eos_token_id=self.eos_token_idx,
        add_cross_attention=True,
        is_decoder=True,
    )
    self.model = EncoderDecoderModel(encoder=self.encoder, decoder=self.decoder, config=self.encoder_decoder_config)

    self.padding_token_idx = self.tokenizer.pad_token_id
    self.loss = nn.CrossEntropyLoss(ignore_index=self.padding_token_idx, reduction='none')
def test_inference_no_head_absolute_embedding(self):
    model = BertGenerationEncoder.from_pretrained("google/bert_for_seq_generation_L-24_bbc_encoder")
    input_ids = torch.tensor([[101, 7592, 1010, 2026, 3899, 2003, 10140, 102]])
    with torch.no_grad():
        output = model(input_ids)[0]
    expected_shape = torch.Size([1, 8, 1024])
    self.assertEqual(output.shape, expected_shape)
    expected_slice = torch.tensor(
        [[[0.1775, 0.0083, -0.0321], [1.6002, 0.1287, 0.3912], [2.1473, 0.5791, 0.6066]]]
    )
    self.assertTrue(torch.allclose(output[:, :3, :3], expected_slice, atol=1e-4))
def test_torch_encode_plus_sent_to_model(self):
    import torch

    from transformers import BertGenerationConfig, BertGenerationEncoder

    # Build sequence
    first_ten_tokens = list(self.big_tokenizer.get_vocab().keys())[:10]
    sequence = " ".join(first_ten_tokens)
    encoded_sequence = self.big_tokenizer.encode_plus(sequence, return_tensors="pt", return_token_type_ids=False)
    batch_encoded_sequence = self.big_tokenizer.batch_encode_plus(
        [sequence + " " + sequence], return_tensors="pt", return_token_type_ids=False
    )

    config = BertGenerationConfig()
    model = BertGenerationEncoder(config)

    assert model.get_input_embeddings().weight.shape[0] >= self.big_tokenizer.vocab_size

    with torch.no_grad():
        model(**encoded_sequence)
        model(**batch_encoded_sequence)
def create_and_check_model_as_decoder(
    self,
    config,
    input_ids,
    input_mask,
    token_labels,
    encoder_hidden_states,
    encoder_attention_mask,
    **kwargs,
):
    config.add_cross_attention = True
    model = BertGenerationEncoder(config=config)
    model.to(torch_device)
    model.eval()
    result = model(
        input_ids,
        attention_mask=input_mask,
        encoder_hidden_states=encoder_hidden_states,
        encoder_attention_mask=encoder_attention_mask,
    )
    result = model(
        input_ids,
        attention_mask=input_mask,
        encoder_hidden_states=encoder_hidden_states,
    )
    self.parent.assertEqual(
        result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)
    )
def get_model(args):
    if args.model_path:
        # Resume from a saved EncoderDecoderModel checkpoint plus its two tokenizers.
        model = EncoderDecoderModel.from_pretrained(args.model_path)
        src_tokenizer = BertTokenizer.from_pretrained(
            os.path.join(args.model_path, "src_tokenizer")
        )
        tgt_tokenizer = GPT2Tokenizer.from_pretrained(
            os.path.join(args.model_path, "tgt_tokenizer")
        )
        tgt_tokenizer.build_inputs_with_special_tokens = types.MethodType(
            build_inputs_with_special_tokens, tgt_tokenizer
        )
        if local_rank == 0 or local_rank == -1:
            print("model and tokenizer loaded from checkpoint successfully")
    else:
        # Build a fresh BERT-encoder / GPT2-decoder model from pretrained checkpoints.
        src_tokenizer = BertTokenizer.from_pretrained(args.src_pretrain_dataset_name)
        tgt_tokenizer = GPT2Tokenizer.from_pretrained(args.tgt_pretrain_dataset_name)
        tgt_tokenizer.add_special_tokens(
            {"bos_token": "[BOS]", "eos_token": "[EOS]", "pad_token": "[PAD]"}
        )
        tgt_tokenizer.build_inputs_with_special_tokens = types.MethodType(
            build_inputs_with_special_tokens, tgt_tokenizer
        )
        encoder = BertGenerationEncoder.from_pretrained(args.src_pretrain_dataset_name)
        decoder = GPT2LMHeadModel.from_pretrained(
            args.tgt_pretrain_dataset_name, add_cross_attention=True, is_decoder=True
        )
        decoder.resize_token_embeddings(len(tgt_tokenizer))
        decoder.config.bos_token_id = tgt_tokenizer.bos_token_id
        decoder.config.eos_token_id = tgt_tokenizer.eos_token_id
        decoder.config.vocab_size = len(tgt_tokenizer)
        decoder.config.add_cross_attention = True
        decoder.config.is_decoder = True
        model_config = EncoderDecoderConfig.from_encoder_decoder_configs(
            encoder.config, decoder.config
        )
        model = EncoderDecoderModel(encoder=encoder, decoder=decoder, config=model_config)

    if local_rank != -1:
        model = model.to(device)
        if args.ngpu > 1:
            print("{}/{} GPU start".format(local_rank, torch.cuda.device_count()))
            model = torch.nn.parallel.DistributedDataParallel(
                model, device_ids=[local_rank], output_device=local_rank
            )
    optimizer, scheduler = get_optimizer_and_schedule(args, model)
    return model, src_tokenizer, tgt_tokenizer, optimizer, scheduler
def create_slt_transformer(input_vocab_size=1, output_vocab_size=1, **bert_params):
    if input_vocab_size == 1:
        print('WARNING: Input vocab size is 1')
    if output_vocab_size == 1:
        print('WARNING: Output vocab size is 1')

    params = {
        'vocab_size': input_vocab_size,
        'hidden_size': 512,
        'intermediate_size': 2048,
        'max_position_embeddings': 500,
        'num_attention_heads': 8,
        'num_hidden_layers': 3,
        'hidden_act': 'relu',
        'type_vocab_size': 1,
        'hidden_dropout_prob': 0.1,
        'attention_probs_dropout_prob': 0.1,
    }
    params.update(bert_params)

    config = BertGenerationConfig(**params)
    encoder = BertGenerationEncoder(config=config)

    params['vocab_size'] = output_vocab_size
    decoder_config = BertGenerationConfig(is_decoder=True, add_cross_attention=True, **params)
    decoder = BertGenerationDecoder(config=decoder_config)

    transformer = EncoderDecoderModel(encoder=encoder, decoder=decoder)

    def count_parameters(m):
        return sum(p.numel() for p in m.parameters() if p.requires_grad)

    print(f'The encoder has {count_parameters(encoder):,} trainable parameters')
    print(f'The decoder has {count_parameters(decoder):,} trainable parameters')
    print(f'The whole model has {count_parameters(transformer):,} trainable parameters')

    return transformer
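# A minimal sketch of building a small encoder-decoder with the factory above;
# the vocabulary sizes and the num_hidden_layers override are illustrative assumptions.
slt_model = create_slt_transformer(
    input_vocab_size=3000,   # assumed source vocabulary size
    output_vocab_size=5000,  # assumed target vocabulary size
    num_hidden_layers=2,     # any BertGenerationConfig field can be overridden via **bert_params
)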
def __init__(self, lr, **args):
    super(BERT2BERTTrainer, self).__init__()
    self.save_hyperparameters()
    encoder = BertGenerationEncoder.from_pretrained(
        "ckiplab/bert-base-chinese",
        bos_token_id=101,
        eos_token_id=102,
        # force_download=True
    )
    decoder = BertGenerationDecoder.from_pretrained(
        "ckiplab/bert-base-chinese",
        add_cross_attention=True,
        is_decoder=True,
        bos_token_id=101,
        eos_token_id=102,
    )
    self.bert2bert = EncoderDecoderModel(encoder=encoder, decoder=decoder)
    if args['with_keywords_loss']:
        self.loss_fct2 = KeywordsLoss(
            alpha=args['keywords_loss_alpha'], loss_fct=args['keywords_loss_fct']
        )
def create_model(model_checkpoint_name):
    # use BERT's cls token as BOS token and sep token as EOS token
    encoder = BertGenerationEncoder.from_pretrained(
        model_checkpoint_name, bos_token_id=BOS_TOKEN_ID, eos_token_id=EOS_TOKEN_ID
    )
    # add cross attention layers and use BERT's cls token as BOS token and sep token as EOS token
    decoder = BertGenerationDecoder.from_pretrained(
        model_checkpoint_name,
        add_cross_attention=True,
        is_decoder=True,
        bos_token_id=BOS_TOKEN_ID,
        eos_token_id=EOS_TOKEN_ID,
    )
    # Freeze the encoder and the decoder embeddings; train only the decoder's
    # transformer layers and its LM head.
    decoder.bert.encoder.requires_grad_(True)
    decoder.lm_head.requires_grad_(True)
    encoder.requires_grad_(False)
    decoder.bert.embeddings.requires_grad_(False)
    model = EncoderDecoderModel(encoder=encoder, decoder=decoder)
    return model
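# Sketch of using the factory above. BOS_TOKEN_ID/EOS_TOKEN_ID are assumed to be
# BERT's [CLS]=101 and [SEP]=102, and the checkpoint name is illustrative.
BOS_TOKEN_ID, EOS_TOKEN_ID = 101, 102
model = create_model("bert-base-uncased")
# Confirm the freezing scheme: the encoder is frozen, the decoder layers trainable.
assert not any(p.requires_grad for p in model.encoder.parameters())
assert all(p.requires_grad for p in model.decoder.bert.encoder.parameters())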
def __init__(self, config, dataset):
    super(BERT2BERT, self).__init__(config, dataset)

    self.tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
    self.encoder_configure = BertConfig.from_pretrained('bert-base-cased')
    self.decoder_configure = BertConfig.from_pretrained('bert-base-cased')
    self.encoder_decoder_config = EncoderDecoderConfig.from_encoder_decoder_configs(
        encoder_config=self.encoder_configure, decoder_config=self.decoder_configure
    )

    self.encoder = BertGenerationEncoder.from_pretrained(
        'bert-base-cased', bos_token_id=101, eos_token_id=102
    )
    self.decoder = BertGenerationDecoder.from_pretrained(
        'bert-base-cased', add_cross_attention=True, is_decoder=True, bos_token_id=101, eos_token_id=102
    )
    self.encoder_decoder = EncoderDecoderModel(
        encoder=self.encoder, decoder=self.decoder, config=self.encoder_decoder_config
    )

    self.sos_token = dataset.sos_token
    self.eos_token = dataset.eos_token
    self.padding_token_idx = self.tokenizer.pad_token_id
    self.max_source_length = config['source_max_seq_length']
    self.max_target_length = config['target_max_seq_length']
    self.loss = nn.CrossEntropyLoss(ignore_index=self.padding_token_idx, reduction='none')
# SPDX-License-Identifier: Apache-2.0
# based on: https://huggingface.co/blog/how-to-generate
from transformers import BertTokenizer, EncoderDecoderModel, AutoModel
from transformers import BertGenerationEncoder, GPT2LMHeadModel, BertGenerationDecoder

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Version 1: load encoder-decoder together.
#model = EncoderDecoderModel.from_encoder_decoder_pretrained("bert-base-uncased", "gpt2")

# Version 2: load pretrained modules separately and join them.
encoder = BertGenerationEncoder.from_pretrained("bert-base-uncased", bos_token_id=101, eos_token_id=102)
# add cross attention layers and use the same BOS and EOS tokens.
decoder = GPT2LMHeadModel.from_pretrained(
    "gpt2", add_cross_attention=True, is_decoder=True, bos_token_id=101, eos_token_id=102
)
model = EncoderDecoderModel(encoder=encoder, decoder=decoder)

# encode context the generation is conditioned on
input_ids = tokenizer.encode('I enjoy walking with my cute dog', return_tensors='pt')

# Activate beam search and early_stopping.
# A simple remedy is to introduce n-gram (a.k.a. word sequences of n words) penalties
# as introduced by Paulus et al. (2017) and Klein et al. (2017).
# The most common n-gram penalty makes sure that no n-gram appears twice by
# manually setting the probability of next words that could create an already
# seen n-gram to 0.
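# A sketch of the beam-search generation the comments above build toward, following
# the referenced blog; the exact parameter values are illustrative assumptions.
beam_output = model.generate(
    input_ids,
    max_length=50,
    num_beams=5,
    no_repeat_ngram_size=2,  # forbid repeating any 2-gram
    early_stopping=True,
    decoder_start_token_id=model.config.decoder.bos_token_id,  # assumption: start decoding from BOS
)
print(tokenizer.decode(beam_output[0], skip_special_tokens=True))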
decoder_tokenizer.add_special_tokens({'bos_token': '[BOS]'})
decoder_tokenizer.add_special_tokens({'eos_token': '[EOS]'})
#print(f"\nDecoder tokenizer vocabulary ({len(decoder_tokenizer.get_vocab())}):\n" + "-"*50)
#for k, v in decoder_tokenizer.get_vocab().items():
#    print(k, ": ", v)
# decoder_tokenizer.model_max_length = 512 ??

# Create dataset/dataloader.
sierra_ds = SierraDataset(data_path=data_path)
sierra_dl = DataLoader(sierra_ds, batch_size=64, shuffle=True, num_workers=2)

# leverage checkpoints for Bert2Bert model...
# use BERT's cls token as BOS token and sep token as EOS token
encoder = BertGenerationEncoder.from_pretrained(
    "bert-base-uncased",
    # Set required tokens.
    #bos_token_id=encoder_tokenizer.vocab["[CLS]"],
    #eos_token_id=encoder_tokenizer.vocab["[SEP]"],
)

# Fresh decoder config.
decoder_config = BertConfig(
    is_decoder=True,
    add_cross_attention=True,  # add cross attention layers
    vocab_size=len(decoder_tokenizer),
    # Set required tokens.
    unk_token_id=decoder_tokenizer.vocab["[UNK]"],
    sep_token_id=decoder_tokenizer.vocab["[SEP]"],
    pad_token_id=decoder_tokenizer.vocab["[PAD]"],
    cls_token_id=decoder_tokenizer.vocab["[CLS]"],
    mask_token_id=decoder_tokenizer.vocab["[MASK]"],
    bos_token_id=decoder_tokenizer.vocab["[BOS]"],
    eos_token_id=decoder_tokenizer.vocab["[EOS]"],
)
def test_model_from_pretrained(self):
    model = BertGenerationEncoder.from_pretrained("google/bert_for_seq_generation_L-24_bbc_encoder")
    self.assertIsNotNone(model)
from transformers import (EncoderDecoderModel, PreTrainedModel, BertTokenizer,
                          BertGenerationEncoder, BertGenerationDecoder)

encoder = BertGenerationEncoder.from_pretrained(
    model_type, bos_token_id=BOS_TOKEN_ID, eos_token_id=EOS_TOKEN_ID
)
# add cross attention layers and use BERT's cls token as BOS token and sep token as EOS token
decoder = BertGenerationDecoder.from_pretrained(
    model_type,
    add_cross_attention=True,
    is_decoder=True,
    bos_token_id=BOS_TOKEN_ID,
    eos_token_id=EOS_TOKEN_ID,
)
model = EncoderDecoderModel(encoder=encoder, decoder=decoder).to(device)
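# Sketch of a single training-style forward pass with the model above. model_type,
# device, BOS_TOKEN_ID, and EOS_TOKEN_ID are assumed to be defined earlier in the
# script (e.g. model_type="bert-base-uncased", BOS_TOKEN_ID=101, EOS_TOKEN_ID=102).
tokenizer = BertTokenizer.from_pretrained(model_type)
batch = tokenizer("a source sentence", return_tensors="pt").to(device)
labels = tokenizer("a target sentence", return_tensors="pt").input_ids.to(device)
loss = model(input_ids=batch.input_ids, attention_mask=batch.attention_mask, labels=labels).loss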
import torch
from transformers import (BertGenerationConfig, BertGenerationEncoder, BertGenerationDecoder,
                          BertTokenizer, BertGenerationTokenizer,
                          DistilBertModel, DistilBertForMaskedLM, DistilBertTokenizer, DistilBertConfig,
                          DataCollatorForLanguageModeling, Trainer, TrainingArguments, EncoderDecoderModel)
from datasets import load_dataset

model_name = 'distilbert-base-multilingual-cased'
tokenizer_name = 'distilbert-base-multilingual-cased'

config = BertGenerationConfig.from_pretrained(model_name)
tokenizer = BertGenerationTokenizer.from_pretrained(tokenizer_name)

# leverage checkpoints for Bert2Bert model...
# use BERT's cls token as BOS token and sep token as EOS token
encoder = BertGenerationEncoder.from_pretrained("bert-large-uncased")
# add cross attention layers and use BERT's cls token as BOS token and sep token as EOS token
decoder = BertGenerationDecoder.from_pretrained("bert-large-uncased", add_cross_attention=True, is_decoder=True)
bert2bert = EncoderDecoderModel(encoder=encoder, decoder=decoder)

# create tokenizer...
tokenizer = DistilBertTokenizer.from_pretrained("bert-large-uncased")

input_ids = tokenizer('This is a long article to summarize', add_special_tokens=False, return_tensors="pt").input_ids
labels = tokenizer('This is a short summary', return_tensors="pt").input_ids

# train...
# loss = bert2bert(input_ids=input_ids, decoder_input_ids=labels, labels=labels).loss
# loss.backward()

config.attention_type = 'performer'
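# Sketch of running the commented-out training step above without explicit
# decoder_input_ids. In that case EncoderDecoderModel shifts the labels itself,
# which requires decoder_start_token_id and pad_token_id on the top-level config;
# using BERT's [CLS]/[PAD] ids here is an assumption.
bert2bert.config.decoder_start_token_id = tokenizer.cls_token_id
bert2bert.config.pad_token_id = tokenizer.pad_token_id
loss = bert2bert(input_ids=input_ids, labels=labels).loss
loss.backward()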
def get_encoder_decoder_model(self, config, decoder_config):
    encoder_model = BertGenerationEncoder(config)
    decoder_model = BertGenerationDecoder(decoder_config)
    return encoder_model, decoder_model
data_path = "/home/tkornuta/data/local-leonardo-sierra5k" decoder_tokenizer_path = os.path.join( data_path, "leonardo_sierra.decoder_tokenizer.json") # Let's see how to increase the vocabulary of Bert model and tokenizer encoder_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') #decoder_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') decoder_tokenizer = PreTrainedTokenizerFast( tokenizer_file=decoder_tokenizer_path) print(len(decoder_tokenizer)) # leverage checkpoints for Bert2Bert model... # use BERT's cls token as BOS token and sep token as EOS token encoder = BertGenerationEncoder.from_pretrained( "bert-base-uncased", bos_token_id=encoder_tokenizer.vocab["[CLS]"], eos_token_id=encoder_tokenizer.vocab["[SEP]"], ) # add cross attention layers and use BERT's cls token as BOS token and sep token as EOS token #decoder = BertGenerationDecoder.from_pretrained("bert-base-uncased", # add_cross_attention=True, is_decoder=True, # bos_token_id=decoder_tokenizer.vocab["[CLS]"], # eos_token_id=decoder_tokenizer.vocab["[SEP]"], # ) #decoder.resize_token_embeddings(len(decoder_tokenizer)) # Fresh decoder config. decoder_config = BertConfig( is_decoder=True, add_cross_attention=True, vocab_size=len(decoder_tokenizer),