def __init__(self, config, dataset):
    super(BERT2BERT, self).__init__(config, dataset)

    # BERT's [CLS] (101) and [SEP] (102) ids double as BOS/EOS.
    self.sos_token_idx = 101
    self.eos_token_idx = 102
    self.pretrained_model_path = config['pretrained_model_path']
    self.tokenizer = BertTokenizer.from_pretrained(self.pretrained_model_path)

    self.encoder_configure = BertConfig.from_pretrained(self.pretrained_model_path)
    self.decoder_configure = BertConfig.from_pretrained(self.pretrained_model_path)
    self.encoder_decoder_config = EncoderDecoderConfig.from_encoder_decoder_configs(
        encoder_config=self.encoder_configure, decoder_config=self.decoder_configure
    )

    self.encoder = BertGenerationEncoder.from_pretrained(
        self.pretrained_model_path,
        bos_token_id=self.sos_token_idx,
        eos_token_id=self.eos_token_idx,
    )
    self.decoder = BertGenerationDecoder.from_pretrained(
        self.pretrained_model_path,
        bos_token_id=self.sos_token_idx,
        eos_token_id=self.eos_token_idx,
        add_cross_attention=True,
        is_decoder=True,
    )
    self.model = EncoderDecoderModel(
        encoder=self.encoder, decoder=self.decoder, config=self.encoder_decoder_config
    )

    self.padding_token_idx = self.tokenizer.pad_token_id
    # Per-token loss; padding positions are masked out downstream.
    self.loss = nn.CrossEntropyLoss(ignore_index=self.padding_token_idx, reduction='none')
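# A minimal sketch (an assumption, not part of the original class) of how a
# reduction='none' CrossEntropyLoss like self.loss above is typically masked
# and averaged; `loss_fn`, `logits`, `target_ids`, and `target_mask` are
# hypothetical names.
import torch

def masked_nll(loss_fn, logits, target_ids, target_mask):
    # logits: (batch, seq_len, vocab); target_ids and float target_mask: (batch, seq_len)
    token_loss = loss_fn(logits.permute(0, 2, 1), target_ids)  # -> (batch, seq_len)
    token_loss = token_loss * target_mask
    # average over non-padding tokens per sequence, then over the batch
    return (token_loss.sum(dim=1) / target_mask.sum(dim=1)).mean()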
def convert_tf_checkpoint_to_pytorch(tf_hub_path, pytorch_dump_path, is_encoder_named_decoder, vocab_size, is_encoder):
    # Initialise PyTorch model
    bert_config = BertConfig.from_pretrained(
        "bert-large-cased",
        vocab_size=vocab_size,
        max_position_embeddings=512,
        is_decoder=True,
        add_cross_attention=True,
    )
    bert_config_dict = bert_config.to_dict()
    del bert_config_dict["type_vocab_size"]
    config = BertGenerationConfig(**bert_config_dict)
    if is_encoder:
        model = BertGenerationEncoder(config)
    else:
        model = BertGenerationDecoder(config)
    print("Building PyTorch model from configuration: {}".format(str(config)))

    # Load weights from tf checkpoint
    load_tf_weights_in_bert_generation(
        model,
        tf_hub_path,
        model_class="bert",
        is_encoder_named_decoder=is_encoder_named_decoder,
        is_encoder=is_encoder,
    )

    # Save pytorch-model
    print("Save PyTorch model and config to {}".format(pytorch_dump_path))
    model.save_pretrained(pytorch_dump_path)
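# Hypothetical invocation of the converter above; the paths are placeholders,
# and the vocab size is an assumption matching the 50358-entry
# bert_for_seq_generation checkpoints used elsewhere in this section.
if __name__ == "__main__":
    convert_tf_checkpoint_to_pytorch(
        tf_hub_path="/path/to/tf_hub_checkpoint",
        pytorch_dump_path="/path/to/pytorch_dump",
        is_encoder_named_decoder=False,
        vocab_size=50358,
        is_encoder=True,
    )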
def create_and_check_decoder_model_past_large_inputs(
    self,
    config,
    input_ids,
    input_mask,
    token_labels,
    encoder_hidden_states,
    encoder_attention_mask,
    **kwargs,
):
    config.is_decoder = True
    config.add_cross_attention = True
    model = BertGenerationDecoder(config=config).to(torch_device).eval()

    # first forward pass
    outputs = model(
        input_ids,
        attention_mask=input_mask,
        encoder_hidden_states=encoder_hidden_states,
        encoder_attention_mask=encoder_attention_mask,
        use_cache=True,
    )
    past_key_values = outputs.past_key_values

    # create hypothetical next tokens and extend next_input_ids with them
    next_tokens = ids_tensor((self.batch_size, 3), config.vocab_size)
    next_mask = ids_tensor((self.batch_size, 3), vocab_size=2)

    # append to input ids and attention mask
    next_input_ids = torch.cat([input_ids, next_tokens], dim=-1)
    next_attention_mask = torch.cat([input_mask, next_mask], dim=-1)

    output_from_no_past = model(
        next_input_ids,
        attention_mask=next_attention_mask,
        encoder_hidden_states=encoder_hidden_states,
        encoder_attention_mask=encoder_attention_mask,
        output_hidden_states=True,
    )["hidden_states"][0]
    output_from_past = model(
        next_tokens,
        attention_mask=next_attention_mask,
        encoder_hidden_states=encoder_hidden_states,
        encoder_attention_mask=encoder_attention_mask,
        past_key_values=past_key_values,
        output_hidden_states=True,
    )["hidden_states"][0]

    # select random slice
    random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item()
    output_from_no_past_slice = output_from_no_past[:, -3:, random_slice_idx].detach()
    output_from_past_slice = output_from_past[:, :, random_slice_idx].detach()

    self.parent.assertTrue(output_from_past_slice.shape[1] == next_tokens.shape[1])

    # test that outputs are equal for slice
    self.parent.assertTrue(torch.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-3))
def test_inference_no_head_absolute_embedding(self):
    model = BertGenerationDecoder.from_pretrained("google/bert_for_seq_generation_L-24_bbc_encoder")
    input_ids = torch.tensor([[101, 7592, 1010, 2026, 3899, 2003, 10140, 102]])
    with torch.no_grad():
        output = model(input_ids)[0]
    expected_shape = torch.Size([1, 8, 50358])
    self.assertEqual(output.shape, expected_shape)
    expected_slice = torch.tensor(
        [[[-0.5788, -2.5994, -3.7054], [0.0438, 4.7997, 1.8795], [1.5862, 6.6409, 4.4638]]]
    )
    self.assertTrue(torch.allclose(output[:, :3, :3], expected_slice, atol=1e-4))
def create_and_check_for_causal_lm(
    self,
    config,
    input_ids,
    input_mask,
    token_labels,
    *args,
):
    model = BertGenerationDecoder(config)
    model.to(torch_device)
    model.eval()
    result = model(input_ids, attention_mask=input_mask, labels=token_labels)
    self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size))
def create_slt_transformer(input_vocab_size=1, output_vocab_size=1, **bert_params):
    if input_vocab_size == 1:
        print('WARNING: Input vocab size is 1')
    if output_vocab_size == 1:
        print('WARNING: Output vocab size is 1')

    params = {
        'vocab_size': input_vocab_size,
        'hidden_size': 512,
        'intermediate_size': 2048,
        'max_position_embeddings': 500,
        'num_attention_heads': 8,
        'num_hidden_layers': 3,
        'hidden_act': 'relu',
        'type_vocab_size': 1,
        'hidden_dropout_prob': 0.1,
        'attention_probs_dropout_prob': 0.1,
    }
    params.update(bert_params)

    config = BertGenerationConfig(**params)
    encoder = BertGenerationEncoder(config=config)

    params['vocab_size'] = output_vocab_size
    decoder_config = BertGenerationConfig(is_decoder=True, add_cross_attention=True, **params)
    decoder = BertGenerationDecoder(config=decoder_config)

    transformer = EncoderDecoderModel(encoder=encoder, decoder=decoder)

    def count_parameters(m):
        return sum(p.numel() for p in m.parameters() if p.requires_grad)

    print(f'The encoder has {count_parameters(encoder):,} trainable parameters')
    print(f'The decoder has {count_parameters(decoder):,} trainable parameters')
    print(f'The whole model has {count_parameters(transformer):,} trainable parameters')

    return transformer
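# Example call (illustrative vocabulary sizes): any extra keyword argument is
# forwarded into BertGenerationConfig via **bert_params.
slt_model = create_slt_transformer(
    input_vocab_size=1024,
    output_vocab_size=4096,
    num_hidden_layers=2,  # overrides the default of 3
)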
def __init__(self, lr, **args):
    super(BERT2BERTTrainer, self).__init__()
    self.save_hyperparameters()
    encoder = BertGenerationEncoder.from_pretrained(
        "ckiplab/bert-base-chinese",
        bos_token_id=101,
        eos_token_id=102,
        # force_download=True
    )
    decoder = BertGenerationDecoder.from_pretrained(
        "ckiplab/bert-base-chinese",
        add_cross_attention=True,
        is_decoder=True,
        bos_token_id=101,
        eos_token_id=102,
    )
    self.bert2bert = EncoderDecoderModel(encoder=encoder, decoder=decoder)
    if args['with_keywords_loss']:
        self.loss_fct2 = KeywordsLoss(alpha=args['keywords_loss_alpha'], loss_fct=args['keywords_loss_fct'])
def create_model(model_checkpoint_name):
    # use BERT's cls token as BOS token and sep token as EOS token
    encoder = BertGenerationEncoder.from_pretrained(
        model_checkpoint_name, bos_token_id=BOS_TOKEN_ID, eos_token_id=EOS_TOKEN_ID
    )
    # add cross attention layers so the decoder can attend to the encoder output
    decoder = BertGenerationDecoder.from_pretrained(
        model_checkpoint_name,
        add_cross_attention=True,
        is_decoder=True,
        bos_token_id=BOS_TOKEN_ID,
        eos_token_id=EOS_TOKEN_ID,
    )

    # freeze the encoder and the decoder embeddings; train only the decoder's
    # transformer layers and LM head
    decoder.bert.encoder.requires_grad_(True)
    decoder.lm_head.requires_grad_(True)
    encoder.requires_grad_(False)
    decoder.bert.embeddings.requires_grad_(False)

    model = EncoderDecoderModel(encoder=encoder, decoder=decoder)
    return model
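# Sanity-check sketch (assumes BOS_TOKEN_ID and EOS_TOKEN_ID are defined; the
# checkpoint name is illustrative): after the freezing above, only the
# decoder's transformer layers and LM head should remain trainable.
model = create_model("bert-base-uncased")
trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
total = sum(p.numel() for p in model.parameters())
print(f"trainable parameters: {trainable:,} of {total:,}")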
def __init__(self, config, dataset):
    super(BERT2BERT, self).__init__(config, dataset)

    self.tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
    self.encoder_configure = BertConfig.from_pretrained('bert-base-cased')
    self.decoder_configure = BertConfig.from_pretrained('bert-base-cased')
    self.encoder_decoder_config = EncoderDecoderConfig.from_encoder_decoder_configs(
        encoder_config=self.encoder_configure, decoder_config=self.decoder_configure
    )

    self.encoder = BertGenerationEncoder.from_pretrained('bert-base-cased', bos_token_id=101, eos_token_id=102)
    self.decoder = BertGenerationDecoder.from_pretrained(
        'bert-base-cased',
        add_cross_attention=True,
        is_decoder=True,
        bos_token_id=101,
        eos_token_id=102,
    )
    self.encoder_decoder = EncoderDecoderModel(
        encoder=self.encoder, decoder=self.decoder, config=self.encoder_decoder_config
    )

    self.sos_token = dataset.sos_token
    self.eos_token = dataset.eos_token
    self.padding_token_idx = self.tokenizer.pad_token_id
    self.max_source_length = config['source_max_seq_length']
    self.max_target_length = config['target_max_seq_length']
    self.loss = nn.CrossEntropyLoss(ignore_index=self.padding_token_idx, reduction='none')
def get_encoder_decoder_model(self, config, decoder_config):
    encoder_model = BertGenerationEncoder(config)
    decoder_model = BertGenerationDecoder(decoder_config)
    return encoder_model, decoder_model
# Fresh decoder config.
decoder_config = BertConfig(
    is_decoder=True,
    add_cross_attention=True,  # add cross attention layers
    vocab_size=len(decoder_tokenizer),
    # Set required tokens.
    unk_token_id=decoder_tokenizer.vocab["[UNK]"],
    sep_token_id=decoder_tokenizer.vocab["[SEP]"],
    pad_token_id=decoder_tokenizer.vocab["[PAD]"],
    cls_token_id=decoder_tokenizer.vocab["[CLS]"],
    mask_token_id=decoder_tokenizer.vocab["[MASK]"],
    bos_token_id=decoder_tokenizer.vocab["[BOS]"],
    eos_token_id=decoder_tokenizer.vocab["[EOS]"],
)
# Initialize a brand new BERT-based decoder.
decoder = BertGenerationDecoder(config=decoder_config)

# Set up the encoder-decoder model.
bert2bert = EncoderDecoderModel(encoder=encoder, decoder=decoder)
bert2bert.config.decoder_start_token_id = decoder_tokenizer.vocab["[CLS]"]
bert2bert.config.pad_token_id = decoder_tokenizer.vocab["[PAD]"]

# Elementary training.
optimizer = torch.optim.Adam(bert2bert.parameters(), lr=1e-6)
bert2bert.cuda()

for epoch in range(30):
    print("*" * 50, "Epoch", epoch, "*" * 50)
    for batch in tqdm(sierra_dl):
        # tokenize commands and goals.
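# A generic sketch (an assumption, not the original pipeline) of one
# optimization step for the bert2bert model built above; `encoder_tokenizer`
# and the batch field names are hypothetical. With decoder_start_token_id and
# pad_token_id set as above, EncoderDecoderModel shifts the labels internally.
def training_step(batch):
    input_ids = encoder_tokenizer(batch["inputs"], return_tensors="pt", padding=True).input_ids.cuda()
    labels = decoder_tokenizer(batch["targets"], return_tensors="pt", padding=True).input_ids.cuda()
    loss = bert2bert(input_ids=input_ids, labels=labels).loss
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    return loss.item()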
# https://medium.com/huggingface/encoder-decoders-in-transformers-a-hybrid-pre-trained-architecture-for-seq2seq-af4d7bf14bb8
from transformers import BertTokenizer, BertTokenizerFast, EncoderDecoderModel, BertGenerationEncoder, BertGenerationDecoder

# leverage checkpoints for Bert2Bert model...
#model = EncoderDecoderModel.from_encoder_decoder_pretrained("bert-base-uncased", "bert-base-uncased")

# use BERT's cls token as BOS token and sep token as EOS token
encoder = BertGenerationEncoder.from_pretrained("bert-large-uncased", bos_token_id=101, eos_token_id=102)
# add cross attention layers and use BERT's cls token as BOS token and sep token as EOS token
decoder = BertGenerationDecoder.from_pretrained(
    "bert-large-uncased", add_cross_attention=True, is_decoder=True, bos_token_id=101, eos_token_id=102
)
bert2bert = EncoderDecoderModel(encoder=encoder, decoder=decoder)

# create tokenizer...
tokenizer = BertTokenizer.from_pretrained("bert-large-uncased")

# Inputs.
#input_ids = tokenizer('This is a long article to summarize', add_special_tokens=False, return_tensors="pt").input_ids
#labels = tokenizer('This is a short summary', return_tensors="pt").input_ids

# train...
#loss = bert2bert(input_ids=input_ids, decoder_input_ids=labels, labels=labels).loss
#loss.backward()
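# A matching inference sketch (an assumption: greedy decoding defaults; the
# freshly combined model's cross-attention is randomly initialized, so outputs
# are not meaningful before fine-tuning).
input_ids = tokenizer("This is a long article to summarize", add_special_tokens=False, return_tensors="pt").input_ids
generated = bert2bert.generate(input_ids, decoder_start_token_id=101, max_length=32)
print(tokenizer.decode(generated[0], skip_special_tokens=True))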
from transformers import (EncoderDecoderModel, PreTrainedModel,
                          BertTokenizer, BertGenerationEncoder, BertGenerationDecoder)

# use BERT's cls token as BOS token and sep token as EOS token
encoder = BertGenerationEncoder.from_pretrained(
    model_type, bos_token_id=BOS_TOKEN_ID, eos_token_id=EOS_TOKEN_ID
)
# add cross attention layers and use BERT's cls token as BOS token and sep token as EOS token
decoder = BertGenerationDecoder.from_pretrained(
    model_type,
    add_cross_attention=True,
    is_decoder=True,
    bos_token_id=BOS_TOKEN_ID,
    eos_token_id=EOS_TOKEN_ID,
)
model = EncoderDecoderModel(encoder=encoder, decoder=decoder).to(device)
from transformers import BertGenerationTokenizer, BertGenerationDecoder, BertGenerationConfig
import torch

tokenizer = BertGenerationTokenizer.from_pretrained('google/bert_for_seq_generation_L-24_bbc_encoder')
config = BertGenerationConfig.from_pretrained("google/bert_for_seq_generation_L-24_bbc_encoder")
config.is_decoder = True
model = BertGenerationDecoder.from_pretrained(
    'google/bert_for_seq_generation_L-24_bbc_encoder', config=config, return_dict=True
)

inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
outputs = model(**inputs)
prediction_logits = outputs.logits
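# Continuing the example: a greedy next-token read-out from the logits
# (illustrative only; argmax over the vocabulary at the last position).
next_token_id = prediction_logits[0, -1].argmax(dim=-1).item()
print(tokenizer.decode([next_token_id]))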
from transformers import (DistilBertModel, DistilBertForMaskedLM, DistilBertTokenizer, DistilBertConfig,
                          DataCollatorForLanguageModeling, Trainer, TrainingArguments, EncoderDecoderModel,
                          BertGenerationEncoder, BertGenerationDecoder)
from datasets import load_dataset

model_name = 'distilbert-base-multilingual-cased'
tokenizer_name = 'distilbert-base-multilingual-cased'
config = DistilBertConfig.from_pretrained(model_name)
tokenizer = DistilBertTokenizer.from_pretrained(tokenizer_name)

# leverage checkpoints for Bert2Bert model...
# use BERT's cls token as BOS token and sep token as EOS token
encoder = BertGenerationEncoder.from_pretrained("bert-large-uncased")
# add cross attention layers and use BERT's cls token as BOS token and sep token as EOS token
decoder = BertGenerationDecoder.from_pretrained("bert-large-uncased", add_cross_attention=True, is_decoder=True)
bert2bert = EncoderDecoderModel(encoder=encoder, decoder=decoder)

# create tokenizer (DistilBertTokenizer reads BERT's WordPiece vocab format)...
tokenizer = DistilBertTokenizer.from_pretrained("bert-large-uncased")
input_ids = tokenizer('This is a long article to summarize', add_special_tokens=False, return_tensors="pt").input_ids
labels = tokenizer('This is a short summary', return_tensors="pt").input_ids

# train...
# loss = bert2bert(input_ids=input_ids, decoder_input_ids=labels, labels=labels).loss
# loss.backward()

config.attention_type = 'performer'  # custom flag; not read by stock DistilBERT
model = DistilBertForMaskedLM.from_pretrained(model_name, config=config)
# SPDX-License-Identifier: Apache-2.0
# based on: https://huggingface.co/docs/transformers/v4.15.0/en/internal/tokenization_utils#transformers.SpecialTokensMixin
from transformers import BertTokenizerFast, BertModel, BertGenerationDecoder

# Let's see how to increase the vocabulary of a BERT model and tokenizer.
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')
print(tokenizer.all_special_tokens)  # --> ['[UNK]', '[SEP]', '[PAD]', '[CLS]', '[MASK]']
print(tokenizer.all_special_ids)     # --> [100, 102, 0, 101, 103]
model = BertGenerationDecoder.from_pretrained('bert-base-uncased')

print("Original tokenizer\n" + "*" * 50)
print("Vocabulary size: ", tokenizer.vocab_size)
#print("Number of special tokens: ", len(tokenizer.added_tokens_encoder))
print("Size of the full vocabulary with the added tokens: ", len(tokenizer))

# Add special tokens.
#num_added_special_toks = tokenizer.add_special_tokens({"[OBJ]":10001,"[YO]":10002})
num_added_special_toks = tokenizer.add_tokens(["[OBJ]", "[YO]"], special_tokens=True)
print('We have added', num_added_special_toks, 'special tokens')

# Add "regular" tokens.
num_added_toks = tokenizer.add_tokens(['new_tok1', 'my_new-tok2', 'my_new-tok3', 'new_tok3'], special_tokens=False)
print('We have added', num_added_toks, 'tokens')
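# After growing the tokenizer, the model's token embeddings must be resized to
# match the new vocabulary size; this is the standard companion step in
# Transformers.
model.resize_token_embeddings(len(tokenizer))
print("Embedding rows after resize: ", model.get_input_embeddings().weight.shape[0])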