def __init__(self, config, dataset):
    """Assemble a BERT2BERT encoder-decoder model from a local pretrained BERT checkpoint."""
    super(BERT2BERT, self).__init__(config, dataset)

    # BERT's [CLS] (101) and [SEP] (102) ids double as BOS / EOS for generation.
    self.sos_token_idx = 101
    self.eos_token_idx = 102

    self.pretrained_model_path = config['pretrained_model_path']
    self.tokenizer = BertTokenizer.from_pretrained(self.pretrained_model_path)

    # Encoder and decoder start from the same underlying BERT configuration.
    self.encoder_configure = BertConfig.from_pretrained(self.pretrained_model_path)
    self.decoder_configure = BertConfig.from_pretrained(self.pretrained_model_path)
    self.encoder_decoder_config = EncoderDecoderConfig.from_encoder_decoder_configs(
        encoder_config=self.encoder_configure,
        decoder_config=self.decoder_configure,
    )

    self.encoder = BertGenerationEncoder.from_pretrained(
        self.pretrained_model_path,
        bos_token_id=self.sos_token_idx,
        eos_token_id=self.eos_token_idx,
    )
    # Decoder needs cross-attention layers to attend over encoder hidden states.
    self.decoder = BertGenerationDecoder.from_pretrained(
        self.pretrained_model_path,
        bos_token_id=self.sos_token_idx,
        eos_token_id=self.eos_token_idx,
        add_cross_attention=True,
        is_decoder=True,
    )
    self.model = EncoderDecoderModel(
        encoder=self.encoder,
        decoder=self.decoder,
        config=self.encoder_decoder_config,
    )

    self.padding_token_idx = self.tokenizer.pad_token_id
    # reduction='none' keeps per-token losses so the caller can mask/average itself.
    self.loss = nn.CrossEntropyLoss(ignore_index=self.padding_token_idx, reduction='none')
def test_inference_no_head_absolute_embedding(self):
    """Regression test: encoder hidden states match recorded values on a fixed input."""
    model = BertGenerationEncoder.from_pretrained("google/bert_for_seq_generation_L-24_bbc_encoder")
    input_ids = torch.tensor([[101, 7592, 1010, 2026, 3899, 2003, 10140, 102]])

    with torch.no_grad():
        output = model(input_ids)[0]

    # Hidden states come back as (batch=1, seq_len=8, hidden=1024).
    expected_shape = torch.Size([1, 8, 1024])
    self.assertEqual(output.shape, expected_shape)

    # Top-left 3x3 corner of the hidden states, captured from a reference run.
    expected_slice = torch.tensor(
        [[[0.1775, 0.0083, -0.0321], [1.6002, 0.1287, 0.3912], [2.1473, 0.5791, 0.6066]]]
    )
    self.assertTrue(torch.allclose(output[:, :3, :3], expected_slice, atol=1e-4))
def get_model(args):
    """Create or restore a BERT-to-GPT2 encoder-decoder model with its tokenizers.

    Returns (model, src_tokenizer, tgt_tokenizer, optimizer, scheduler).
    NOTE(review): relies on module-level `local_rank` and `device`.
    """
    if args.model_path:
        # Restore a previously saved model together with both tokenizers.
        model = EncoderDecoderModel.from_pretrained(args.model_path)
        src_tokenizer = BertTokenizer.from_pretrained(
            os.path.join(args.model_path, "src_tokenizer"))
        tgt_tokenizer = GPT2Tokenizer.from_pretrained(
            os.path.join(args.model_path, "tgt_tokenizer"))
        # Patch in the custom special-token handling on the target tokenizer.
        tgt_tokenizer.build_inputs_with_special_tokens = types.MethodType(
            build_inputs_with_special_tokens, tgt_tokenizer)
        if local_rank in (0, -1):
            print("model and tokenizer load from save success")
    else:
        # Build a fresh model: BERT encoder + GPT-2 decoder with cross-attention.
        src_tokenizer = BertTokenizer.from_pretrained(args.src_pretrain_dataset_name)
        tgt_tokenizer = GPT2Tokenizer.from_pretrained(args.tgt_pretrain_dataset_name)
        tgt_tokenizer.add_special_tokens(
            {"bos_token": "[BOS]", "eos_token": "[EOS]", "pad_token": "[PAD]"})
        tgt_tokenizer.build_inputs_with_special_tokens = types.MethodType(
            build_inputs_with_special_tokens, tgt_tokenizer)

        encoder = BertGenerationEncoder.from_pretrained(args.src_pretrain_dataset_name)
        decoder = GPT2LMHeadModel.from_pretrained(
            args.tgt_pretrain_dataset_name, add_cross_attention=True, is_decoder=True)

        # The decoder vocabulary grew by the added special tokens above.
        decoder.resize_token_embeddings(len(tgt_tokenizer))
        decoder.config.bos_token_id = tgt_tokenizer.bos_token_id
        decoder.config.eos_token_id = tgt_tokenizer.eos_token_id
        decoder.config.vocab_size = len(tgt_tokenizer)
        decoder.config.add_cross_attention = True
        decoder.config.is_decoder = True

        model_config = EncoderDecoderConfig.from_encoder_decoder_configs(
            encoder.config, decoder.config)
        model = EncoderDecoderModel(encoder=encoder, decoder=decoder, config=model_config)

    if local_rank != -1:
        model = model.to(device)
    if args.ngpu > 1:
        # Wrap for multi-GPU distributed training.
        print("{}/{} GPU start".format(local_rank, torch.cuda.device_count()))
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[local_rank], output_device=local_rank)

    optimizer, scheduler = get_optimizer_and_schedule(args, model)
    return model, src_tokenizer, tgt_tokenizer, optimizer, scheduler
def __init__(self, lr, **args):
    """Lightning-style trainer wrapping a Chinese BERT2BERT encoder-decoder."""
    super(BERT2BERTTrainer, self).__init__()
    self.save_hyperparameters()

    checkpoint = "ckiplab/bert-base-chinese"
    # [CLS]=101 serves as BOS and [SEP]=102 as EOS for generation.
    encoder = BertGenerationEncoder.from_pretrained(
        checkpoint,
        bos_token_id=101,
        eos_token_id=102,
        # force_download=True
    )
    # Decoder gets cross-attention layers so it can attend to encoder states.
    decoder = BertGenerationDecoder.from_pretrained(
        checkpoint,
        add_cross_attention=True,
        is_decoder=True,
        bos_token_id=101,
        eos_token_id=102,
    )
    self.bert2bert = EncoderDecoderModel(encoder=encoder, decoder=decoder)

    # Optional auxiliary keyword loss, switched on via hyperparameters.
    if args['with_keywords_loss']:
        self.loss_fct2 = KeywordsLoss(
            alpha=args['keywords_loss_alpha'],
            loss_fct=args['keywords_loss_fct'],
        )
def create_model(model_checkpoint_name):
    """Build a partially-frozen BERT2BERT seq2seq model from a single checkpoint."""
    encoder = BertGenerationEncoder.from_pretrained(
        model_checkpoint_name,
        bos_token_id=BOS_TOKEN_ID,
        eos_token_id=EOS_TOKEN_ID,
    )
    # add cross attention layers and use BERT's cls token as BOS token and sep token as EOS token
    decoder = BertGenerationDecoder.from_pretrained(
        model_checkpoint_name,
        add_cross_attention=True,
        is_decoder=True,
        bos_token_id=BOS_TOKEN_ID,
        eos_token_id=EOS_TOKEN_ID,
    )

    # Train only the decoder transformer stack and the LM head;
    # freeze the whole encoder and the decoder embeddings.
    decoder.bert.encoder.requires_grad_(True)
    decoder.lm_head.requires_grad_(True)
    encoder.requires_grad_(False)
    decoder.bert.embeddings.requires_grad_(False)

    return EncoderDecoderModel(encoder=encoder, decoder=decoder)
def __init__(self, config, dataset):
    """BERT2BERT built on the public 'bert-base-cased' checkpoint."""
    super(BERT2BERT, self).__init__(config, dataset)

    checkpoint = 'bert-base-cased'
    self.tokenizer = BertTokenizer.from_pretrained(checkpoint)

    # Encoder and decoder share the same base configuration.
    self.encoder_configure = BertConfig.from_pretrained(checkpoint)
    self.decoder_configure = BertConfig.from_pretrained(checkpoint)
    self.encoder_decoder_config = EncoderDecoderConfig.from_encoder_decoder_configs(
        encoder_config=self.encoder_configure,
        decoder_config=self.decoder_configure,
    )

    # [CLS]=101 is BOS and [SEP]=102 is EOS for generation.
    self.encoder = BertGenerationEncoder.from_pretrained(
        checkpoint, bos_token_id=101, eos_token_id=102)
    self.decoder = BertGenerationDecoder.from_pretrained(
        checkpoint,
        add_cross_attention=True,
        is_decoder=True,
        bos_token_id=101,
        eos_token_id=102,
    )
    self.encoder_decoder = EncoderDecoderModel(
        encoder=self.encoder,
        decoder=self.decoder,
        config=self.encoder_decoder_config,
    )

    self.sos_token = dataset.sos_token
    self.eos_token = dataset.eos_token
    self.padding_token_idx = self.tokenizer.pad_token_id
    self.max_source_length = config['source_max_seq_length']
    self.max_target_length = config['target_max_seq_length']

    # reduction='none' keeps per-token losses so the caller can mask/average itself.
    self.loss = nn.CrossEntropyLoss(ignore_index=self.padding_token_idx, reduction='none')
# SPDX-License-Identifier: Apache-2.0
# based on: https://huggingface.co/blog/how-to-generate
from transformers import BertTokenizer, EncoderDecoderModel, AutoModel
from transformers import BertGenerationEncoder, GPT2LMHeadModel, BertGenerationDecoder

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Version 1: load encoder-decoder together.
#model = EncoderDecoderModel.from_encoder_decoder_pretrained("bert-base-uncased", "gpt2")

# Version 2: load pretrained modules separatelly and join them.
# BERT's [CLS]=101 and [SEP]=102 act as BOS / EOS for both halves.
encoder = BertGenerationEncoder.from_pretrained(
    "bert-base-uncased", bos_token_id=101, eos_token_id=102)
# add cross attention layers and use the same BOS and EOS tokens.
decoder = GPT2LMHeadModel.from_pretrained(
    "gpt2",
    add_cross_attention=True,
    is_decoder=True,
    bos_token_id=101,
    eos_token_id=102,
)
model = EncoderDecoderModel(encoder=encoder, decoder=decoder)

# encode context the generation is conditioned on
input_ids = tokenizer.encode('I enjoy walking with my cute dog', return_tensors='pt')

# Activate beam search and early_stopping.
# A simple remedy is to introduce n-grams (a.k.a word sequences of n words) penalties
# as introduced by Paulus et al. (2017) and Klein et al. (2017).
# The most common n-grams penalty makes sure that no n-gram appears twice by
decoder_tokenizer.add_special_tokens({'bos_token': '[BOS]'}) decoder_tokenizer.add_special_tokens({'eos_token': '[EOS]'}) #print(f"\Decoder tokenizer vocabulary ({len(decoder_tokenizer.get_vocab())}):\n" + "-"*50) #for k, v in decoder_tokenizer.get_vocab().items(): # print(k, ": ", v) # decoder_tokenizer.model_max_length=512 ?? # Create dataset/dataloader. sierra_ds = SierraDataset(data_path=data_path) sierra_dl = DataLoader(sierra_ds, batch_size=64, shuffle=True, num_workers=2) # leverage checkpoints for Bert2Bert model... # use BERT's cls token as BOS token and sep token as EOS token encoder = BertGenerationEncoder.from_pretrained("bert-base-uncased", # Set required tokens. #bos_token_id=encoder_tokenizer.vocab["[CLS]"], #eos_token_id=encoder_tokenizer.vocab["[SEP]"], ) # Fresh decoder config. decoder_config = BertConfig( is_decoder = True, add_cross_attention = True, # add cross attention layers vocab_size = len(decoder_tokenizer), # Set required tokens. unk_token_id = decoder_tokenizer.vocab["[UNK]"], sep_token_id = decoder_tokenizer.vocab["[SEP]"], pad_token_id = decoder_tokenizer.vocab["[PAD]"], cls_token_id = decoder_tokenizer.vocab["[CLS]"], mask_token_id = decoder_tokenizer.vocab["[MASK]"], bos_token_id = decoder_tokenizer.vocab["[BOS]"],
def test_model_from_pretrained(self):
    """Smoke test: the public checkpoint loads into a BertGenerationEncoder."""
    checkpoint = "google/bert_for_seq_generation_L-24_bbc_encoder"
    model = BertGenerationEncoder.from_pretrained(checkpoint)
    self.assertIsNotNone(model)
from transformers import (EncoderDecoderModel, PreTrainedModel, BertTokenizer,
                          BertGenerationEncoder, BertGenerationDecoder)

# NOTE(review): `model_type`, `BOS_TOKEN_ID`, `EOS_TOKEN_ID` and `device`
# are expected to be defined elsewhere in this module.
encoder = BertGenerationEncoder.from_pretrained(
    model_type,
    bos_token_id=BOS_TOKEN_ID,
    eos_token_id=EOS_TOKEN_ID,
)
# add cross attention layers and use BERT’s cls token as BOS token and sep token as EOS token
decoder = BertGenerationDecoder.from_pretrained(
    model_type,
    add_cross_attention=True,
    is_decoder=True,
    bos_token_id=BOS_TOKEN_ID,
    eos_token_id=EOS_TOKEN_ID,
)
model = EncoderDecoderModel(encoder=encoder, decoder=decoder).to(device)
import torch
from transformers import BertGenerationConfig, BertGenerationEncoder, BertGenerationDecoder, BertTokenizer, BertGenerationTokenizer, \
    DistilBertModel, DistilBertForMaskedLM, DistilBertTokenizer, DistilBertConfig, \
    DataCollatorForLanguageModeling, Trainer, TrainingArguments, EncoderDecoderModel
from datasets import load_dataset

model_name = 'distilbert-base-multilingual-cased'
tokenizer_name = 'distilbert-base-multilingual-cased'

config = BertGenerationConfig.from_pretrained(model_name)
tokenizer = BertGenerationTokenizer.from_pretrained(tokenizer_name)

# leverage checkpoints for Bert2Bert model...
# use BERT's cls token as BOS token and sep token as EOS token
encoder = BertGenerationEncoder.from_pretrained("bert-large-uncased")
# add cross attention layers so the decoder can attend to encoder states
decoder = BertGenerationDecoder.from_pretrained(
    "bert-large-uncased", add_cross_attention=True, is_decoder=True)
bert2bert = EncoderDecoderModel(encoder=encoder, decoder=decoder)

# NOTE(review): this rebinds `tokenizer` (the BertGenerationTokenizer above is
# discarded), and loading a DistilBertTokenizer from "bert-large-uncased" looks
# unintended — confirm which tokenizer this script actually needs.
tokenizer = DistilBertTokenizer.from_pretrained("bert-large-uncased")

input_ids = tokenizer('This is a long article to summarize',
                      add_special_tokens=False, return_tensors="pt").input_ids
labels = tokenizer('This is a short summary', return_tensors="pt").input_ids

# train...
# loss = bert2bert(input_ids=input_ids, decoder_input_ids=labels, labels=labels).loss
# loss.backward()

config.attention_type = 'performer'
data_path = "/home/tkornuta/data/local-leonardo-sierra5k" decoder_tokenizer_path = os.path.join( data_path, "leonardo_sierra.decoder_tokenizer.json") # Let's see how to increase the vocabulary of Bert model and tokenizer encoder_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') #decoder_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') decoder_tokenizer = PreTrainedTokenizerFast( tokenizer_file=decoder_tokenizer_path) print(len(decoder_tokenizer)) # leverage checkpoints for Bert2Bert model... # use BERT's cls token as BOS token and sep token as EOS token encoder = BertGenerationEncoder.from_pretrained( "bert-base-uncased", bos_token_id=encoder_tokenizer.vocab["[CLS]"], eos_token_id=encoder_tokenizer.vocab["[SEP]"], ) # add cross attention layers and use BERT's cls token as BOS token and sep token as EOS token #decoder = BertGenerationDecoder.from_pretrained("bert-base-uncased", # add_cross_attention=True, is_decoder=True, # bos_token_id=decoder_tokenizer.vocab["[CLS]"], # eos_token_id=decoder_tokenizer.vocab["[SEP]"], # ) #decoder.resize_token_embeddings(len(decoder_tokenizer)) # Fresh decoder config. decoder_config = BertConfig( is_decoder=True, add_cross_attention=True, vocab_size=len(decoder_tokenizer),