def __init__(self, config, sequence_length, use_pretrained=True, pretrained_model=None):
    """Constructor"""
    super().__init__(config, sequence_length)
    # Suspend logging due to hellish verbosity.
    lvl = logging.getLogger().level
    logging.getLogger().setLevel(logging.WARNING)
    if pretrained_model is None:
        if use_pretrained:
            model = EncoderDecoderModel.from_encoder_decoder_pretrained(
                self.pretrained_id, self.pretrained_id)
        else:
            # Build an untrained BERT2BERT: the decoder config needs
            # `is_decoder` and cross-attention enabled.
            enc, dec = BertConfig(), BertConfig()
            dec.is_decoder = True
            dec.add_cross_attention = True
            enc_dec_config = EncoderDecoderConfig.from_encoder_decoder_configs(enc, dec)
            model = EncoderDecoderModel(config=enc_dec_config)
        logging.getLogger().setLevel(lvl)
        self.model = model
    else:
        self.model = pretrained_model
    # Restore the configured log level.
    logging.getLogger().setLevel(self.config.print.log_level.upper())

def test_real_bert_model_save_load_from_pretrained(self):
    model_2 = EncoderDecoderModel.from_encoder_decoder_pretrained(
        "bert-base-uncased", "bert-base-uncased")
    model_2.to(torch_device)
    input_ids = ids_tensor([13, 5], model_2.config.encoder.vocab_size)
    decoder_input_ids = ids_tensor([13, 1], model_2.config.encoder.vocab_size)
    attention_mask = ids_tensor([13, 5], vocab_size=2)
    with torch.no_grad():
        outputs = model_2(
            input_ids=input_ids,
            decoder_input_ids=decoder_input_ids,
            attention_mask=attention_mask,
        )
        out_2 = outputs[0].cpu().numpy()
        out_2[np.isnan(out_2)] = 0

        with tempfile.TemporaryDirectory() as tmp_dirname:
            model_2.save_pretrained(tmp_dirname)
            model_1 = EncoderDecoderModel.from_pretrained(tmp_dirname)
            model_1.to(torch_device)

            after_outputs = model_1(
                input_ids=input_ids,
                decoder_input_ids=decoder_input_ids,
                attention_mask=attention_mask,
            )
            out_1 = after_outputs[0].cpu().numpy()
            out_1[np.isnan(out_1)] = 0
            max_diff = np.amax(np.abs(out_1 - out_2))
            self.assertLessEqual(max_diff, 1e-5)

def check_equivalence_tf_to_pt(self, config, decoder_config, inputs_dict):
    encoder_decoder_config = EncoderDecoderConfig.from_encoder_decoder_configs(config, decoder_config)

    # If `_tf_model` were used directly, the test would fail, because the weights of `_tf_model`
    # get extended before saving the encoder/decoder models.
    # There was a (very) ugly potential fix, which wasn't integrated into `transformers`: see
    # https://github.com/huggingface/transformers/pull/13222/commits/dbb3c9de76eee235791d2064094654637c99f36d#r697304245
    # (the change in `src/transformers/modeling_tf_utils.py`)
    _tf_model = TFEncoderDecoderModel(encoder_decoder_config)
    # Make sure the model is built.
    _tf_model(**inputs_dict)

    # Use `tf_model` instead, so the test passes.
    encoder = _tf_model.encoder.__class__(encoder_decoder_config.encoder)
    decoder = _tf_model.decoder.__class__(encoder_decoder_config.decoder)
    # Make sure the submodels are built.
    encoder(encoder.dummy_inputs)
    decoder(decoder.dummy_inputs)
    tf_model = TFEncoderDecoderModel(encoder=encoder, decoder=decoder)

    with tempfile.TemporaryDirectory() as encoder_tmp_dirname, tempfile.TemporaryDirectory() as decoder_tmp_dirname:
        tf_model.encoder.save_pretrained(encoder_tmp_dirname)
        tf_model.decoder.save_pretrained(decoder_tmp_dirname)
        pt_model = EncoderDecoderModel.from_encoder_decoder_pretrained(
            encoder_tmp_dirname, decoder_tmp_dirname, encoder_from_tf=True, decoder_from_tf=True
        )
        # This is only for copying some specific attributes of this particular model.
        pt_model.config = tf_model.config

    self.check_pt_tf_equivalence(pt_model, tf_model, inputs_dict)

def __init__(
    self,
    model_name_or_path,
    tokenizer_name,
    model_cache_dir,
    input_max_length,
    target_max_length,
    summary_column_name,
    document_column_name,
    wandb_project,
    wandb_run_name,
    **kwargs,
):
    super().__init__(
        input_max_length,
        target_max_length,
        summary_column_name,
        document_column_name,
        wandb_project,
        wandb_run_name,
    )
    self.tokenizer = BertTokenizer.from_pretrained(
        tokenizer_name if tokenizer_name else model_name_or_path,
        cache_dir=model_cache_dir,
    )
    self.model = EncoderDecoderModel.from_encoder_decoder_pretrained(
        model_name_or_path,
        model_name_or_path,
        cache_dir=model_cache_dir,
    )

def __init__(self, is_eval=False):
    super().__init__()
    self.model = EncoderDecoderModel.from_encoder_decoder_pretrained(
        'bert-base-uncased', 'bert-base-uncased')
    if is_eval:
        self.model = self.model.eval()
    self.optimizer = torch.optim.Adam(self.parameters(), lr=config.lr)
    if config.use_sgd:
        self.optimizer = torch.optim.SGD(self.parameters(), lr=config.lr)
    if config.USE_CUDA:
        self.model = self.model.cuda()
    self.model_dir = config.save_path
    if not os.path.exists(self.model_dir):
        os.makedirs(self.model_dir)
    self.best_path = ""

def check_encoder_decoder_model_from_pretrained(
        self, config, input_ids, attention_mask, encoder_hidden_states,
        decoder_config, decoder_input_ids, decoder_attention_mask, **kwargs):
    encoder_model, decoder_model = self.get_encoder_decoder_model(config, decoder_config)
    kwargs = {"encoder_model": encoder_model, "decoder_model": decoder_model}
    enc_dec_model = EncoderDecoderModel.from_encoder_decoder_pretrained(**kwargs)
    enc_dec_model.to(torch_device)
    outputs_encoder_decoder = enc_dec_model(
        input_ids=input_ids,
        decoder_input_ids=decoder_input_ids,
        attention_mask=attention_mask,
        decoder_attention_mask=decoder_attention_mask,
    )
    self.assertEqual(outputs_encoder_decoder[0].shape,
                     decoder_input_ids.shape + (decoder_config.vocab_size,))
    self.assertEqual(outputs_encoder_decoder[1].shape,
                     input_ids.shape + (config.hidden_size,))

def check_save_and_load_encoder_decoder_model(
        self, config, input_ids, attention_mask, encoder_hidden_states,
        decoder_config, decoder_input_ids, decoder_attention_mask, **kwargs):
    encoder_model, decoder_model = self.get_encoder_decoder_model(config, decoder_config)
    enc_dec_model = EncoderDecoderModel(encoder=encoder_model, decoder=decoder_model)
    enc_dec_model.to(torch_device)
    enc_dec_model.eval()
    with torch.no_grad():
        outputs = enc_dec_model(
            input_ids=input_ids,
            decoder_input_ids=decoder_input_ids,
            attention_mask=attention_mask,
            decoder_attention_mask=decoder_attention_mask,
        )
        out_2 = outputs[0].cpu().numpy()
        out_2[np.isnan(out_2)] = 0

        with tempfile.TemporaryDirectory() as encoder_tmp_dirname, \
                tempfile.TemporaryDirectory() as decoder_tmp_dirname:
            enc_dec_model.encoder.save_pretrained(encoder_tmp_dirname)
            enc_dec_model.decoder.save_pretrained(decoder_tmp_dirname)
            # Reload from the saved checkpoints, so the comparison below
            # actually exercises save/load.
            enc_dec_model = EncoderDecoderModel.from_encoder_decoder_pretrained(
                encoder_pretrained_model_name_or_path=encoder_tmp_dirname,
                decoder_pretrained_model_name_or_path=decoder_tmp_dirname,
            )
            enc_dec_model.to(torch_device)

            after_outputs = enc_dec_model(
                input_ids=input_ids,
                decoder_input_ids=decoder_input_ids,
                attention_mask=attention_mask,
                decoder_attention_mask=decoder_attention_mask,
            )
            out_1 = after_outputs[0].cpu().numpy()
            out_1[np.isnan(out_1)] = 0
            max_diff = np.amax(np.abs(out_1 - out_2))
            self.assertLessEqual(max_diff, 1e-5)

def __init__(self, cfg: DictConfig, trainer: Trainer = None):
    # Tokenizers must be assigned before init.
    if cfg.language_model.pretrained_model_name:
        if cfg.language_model.pretrained_encoder_model_name or cfg.language_model.pretrained_decoder_model_name:
            raise ValueError(
                "Must have either pretrained_model_name or both pretrained_encoder_model_name and "
                "pretrained_decoder_model_name."
            )
        # Set up the tokenizer.
        self.encoder_tokenizer = self.setup_tokenizer(cfg.encoder_tokenizer)
        self.encoder_add_special_tokens = cfg.encoder_tokenizer.add_special_tokens

        # Reuse the encoder tokenizer for the decoder.
        self.decoder_tokenizer = self.encoder_tokenizer
        self.decoder_add_special_tokens = self.encoder_add_special_tokens
    else:
        if not (
            cfg.language_model.pretrained_encoder_model_name
            and cfg.language_model.pretrained_decoder_model_name
        ):
            raise ValueError("Both encoder and decoder must be specified")

        # Set up separate tokenizers.
        self.encoder_tokenizer = self.setup_tokenizer(cfg.encoder_tokenizer)
        self.encoder_add_special_tokens = cfg.encoder_tokenizer.add_special_tokens
        self.decoder_tokenizer = self.setup_tokenizer(cfg.decoder_tokenizer)
        self.decoder_add_special_tokens = cfg.decoder_tokenizer.add_special_tokens

    if not self.encoder_tokenizer:
        raise TypeError("encoder_tokenizer failed to initialize")
    if not self.decoder_tokenizer:
        raise TypeError("decoder_tokenizer failed to initialize")

    # Init superclass.
    super().__init__(cfg=cfg, trainer=trainer)

    # Modules must be assigned after init.
    if cfg.language_model.pretrained_model_name:
        # Set up an end-to-end model.
        if "bart" in cfg.language_model.pretrained_model_name:
            self.model = BartForConditionalGeneration.from_pretrained(cfg.language_model.pretrained_model_name)
        else:
            self.model = AutoModel.from_pretrained(cfg.language_model.pretrained_model_name)
    else:
        if not (
            cfg.language_model.pretrained_encoder_model_name
            and cfg.language_model.pretrained_decoder_model_name
        ):
            raise ValueError("Both encoder and decoder must be specified")

        # Set up an encoder/decoder model. Note that `from_encoder_decoder_pretrained`
        # takes `encoder_pretrained_model_name_or_path` and
        # `decoder_pretrained_model_name_or_path`, not `encoder`/`decoder`.
        self.model = EncoderDecoderModel.from_encoder_decoder_pretrained(
            encoder_pretrained_model_name_or_path=cfg.language_model.pretrained_encoder_model_name,
            decoder_pretrained_model_name_or_path=cfg.language_model.pretrained_decoder_model_name,
        )

    self.validation_perplexity = Perplexity(compute_on_step=False)

    self.setup_optimization(cfg.optim)

def __init__(self):
    super().__init__()
    from transformers import EncoderDecoderModel
    from transformers import BertTokenizer

    # Initialize Bert2Bert from pre-trained checkpoints.
    self.seq2seq_model = EncoderDecoderModel.from_encoder_decoder_pretrained(
        'bert-base-uncased', 'bert-base-uncased'
    )
    self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def prep(encoder_model, decoder_model, seq_length):
    tokenizer = AutoTokenizer.from_pretrained(encoder_model, model_max_length=seq_length)
    model = EncoderDecoderModel.from_encoder_decoder_pretrained(
        encoder_model,
        decoder_model,
        max_length=40,
    )
    return tokenizer, model

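# A minimal sketch of calling `prep` above; the checkpoint names and sequence
# length are illustrative assumptions, not values from the original code.
tokenizer, model = prep("bert-base-uncased", "bert-base-uncased", seq_length=512)
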
def build(self):
    # To be further set.
    self.image_feature_module = build_image_encoder(
        self.config.image_feature_processor, direct_features=True
    )
    if self.config.concate_trace:
        self.trace_feature_module = build_encoder(self.config.trace_feature_encoder)
    if self.config.base_model_name == "bert-base-uncased":
        self.encoderdecoder = EncoderDecoderModel.from_encoder_decoder_pretrained(
            "bert-base-uncased", "bert-base-uncased"
        )
    elif self.config.base_model_name == "2layer-base":
        config_encoder = BertConfig()
        config_decoder = BertConfig()
        config_encoder.max_position_embeddings = 1090
        config_encoder.num_hidden_layers = 2
        config_decoder.num_hidden_layers = 2
        self.codec_config = EncoderDecoderConfig.from_encoder_decoder_configs(
            config_encoder, config_decoder
        )
        self.encoderdecoder = EncoderDecoderModel(config=self.codec_config)
    elif self.config.base_model_name == "3layer-base":
        config_encoder = BertConfig()
        config_decoder = BertConfig()
        config_encoder.num_hidden_layers = 3
        config_decoder.num_hidden_layers = 3
        self.codec_config = EncoderDecoderConfig.from_encoder_decoder_configs(
            config_encoder, config_decoder
        )
        self.encoderdecoder = EncoderDecoderModel(config=self.codec_config)
    if self.config.loop_contrastive:
        self.trace_caption_contrastive = TraceCaptionContrastiveModel(
            self.config.tc_contrastive_aggregate_method
        )
    if hasattr(self.config, "pretrans_attention") and self.config.pretrans_attention:
        tempconf = self.encoderdecoder.config.encoder
        num_heads = tempconf.num_attention_heads
        num_layers = tempconf.num_hidden_layers
        self.attention_trans = AttentionTransform(num_layers, num_heads, 100)
    self.BOS_ID = 101
    self.vae = OpenAIDiscreteVAE()
    image_code_dim = 768
    image_fmap_size = self.vae.image_size // (2 ** self.vae.num_layers)
    self.image_seq_len = image_fmap_size ** 2
    self.image_emb = torch.nn.Embedding(self.vae.num_tokens, image_code_dim)
    self.image_pos_emb = AxialPositionalEmbedding(
        image_code_dim, axial_shape=(image_fmap_size, image_fmap_size)
    )

@classmethod
def from_encoder_decoder_pretrained(
        cls,
        encoder_pretrained_model_name_or_path: str = None,
        decoder_pretrained_model_name_or_path: str = None,
        *model_args,
        **kwargs):
    instance = EncoderDecoderModel.from_encoder_decoder_pretrained(
        encoder_pretrained_model_name_or_path,
        decoder_pretrained_model_name_or_path,
        *model_args,
        **kwargs)
    return BottleneckEncoderDecoderModel(instance.config, instance.encoder, instance.decoder)

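# A usage sketch for the classmethod above, assuming BottleneckEncoderDecoderModel
# is importable; the checkpoint names are illustrative assumptions.
model = BottleneckEncoderDecoderModel.from_encoder_decoder_pretrained(
    "bert-base-uncased", "bert-base-uncased")
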
def __init__(self, args=args, max_seq_len=64, max_seq_len_title=32,
             max_img_seq_len=args.num_features, tr_name=args.tr):
    """
    max_seq_len: Or Repo - VQA: 128
    max_img_seq_len: Or Repo - NLVR2: 40 // GQA: 45 // VQA: 50
    --- Set to args.num_features, as we don't have padding implemented
    tr_name: transformer model
    """
    super().__init__()
    self.max_seq_len = max_seq_len
    self.max_seq_len_title = max_seq_len_title
    self.tr_name = tr_name
    self.max_img_seq_len = max_img_seq_len

    ### BUILD TOKENIZER ###
    self.tokenizer = AutoTokenizer.from_pretrained(tr_name)

    ### BUILD MODEL ###
    if tr_name.startswith("bert"):
        self.model, loading_info = BertO.from_pretrained(
            tr_name, output_loading_info=True, img_feature_dim=2048 + args.num_pos)
        print("UNEXPECTED: ", loading_info["unexpected_keys"])
        print("MISSING: ", loading_info["missing_keys"])
        print("ERRORS: ", loading_info["error_msgs"])

    ### CLASSIFICATION HEADS ###
    # The LXRT default classifier tends to perform best; for Albert, gelu_new outperforms gelu.
    # Make sure to keep only what is used below, as it seems to affect random initialization!
    self.encoder_decoder = EncoderDecoderModel.from_encoder_decoder_pretrained(
        'bert-base-uncased', 'gpt2')
    self.decoder = self.encoder_decoder.decoder
    self.decoder = self.decoder.cuda(0)
    self.decoder.config.max_length = 128
    self.decoder.config.min_length = 8
    self.decoder.config.no_repeat_ngram_size = 3
    self.decoder.config.early_stopping = True
    self.decoder.config.length_penalty = 2.0
    self.decoder.config.num_beams = 4

    if args.from_scratch:
        print("initializing all the weights")
        self.model.apply(self.model.init_weights)

def define_G(model, source='en', dest='de', gpu_ids=[], use_init_net=True, freeze_encoder=False):
    """Create a generator.

    Parameters:
        model (str)           -- the type of network: t5 | marianMT | (default) bert2bert
        source (str)          -- source language code, e.g. 'en'
        dest (str)            -- target language code, e.g. 'de'
        gpu_ids (int list)    -- which GPUs the network runs on: e.g., 0,1,2
        use_init_net (bool)   -- if True, wrap the network with <init_net>
        freeze_encoder (bool) -- if True, freeze the encoder weights

    Returns a generator.
    """
    net = None
    if model == 't5':
        src_lang = define_language(source)
        tgt_lang = define_language(dest)
        model_name = 't5-small'
        net = EncDecT5Model(model_name, freeze_encoder=freeze_encoder,
                            source_language=src_lang, target_language=tgt_lang)
    elif model == 'marianMT':
        model_name = 'Helsinki-NLP/opus-mt-' + source + '-' + dest
        net = EncDecModel(model_name, freeze_encoder=freeze_encoder)
    else:
        net = EncoderDecoderModel.from_encoder_decoder_pretrained(
            'bert-base-cased', 'bert-base-german-cased')
    if use_init_net:
        return init_net(net, gpu_ids)
    return net

def __init__(self):
    super().__init__()
    # Model - load a pretrained BERT-based encoder-decoder.
    self.bert2bert = EncoderDecoderModel.from_encoder_decoder_pretrained(
        "bert-base-uncased", "bert-base-uncased")

    # Set special tokens.
    self.bert2bert.config.decoder_start_token_id = tokenizer.bos_token_id
    self.bert2bert.config.eos_token_id = tokenizer.eos_token_id
    self.bert2bert.config.pad_token_id = tokenizer.pad_token_id

    # Sensible parameters for beam search.
    self.bert2bert.config.vocab_size = self.bert2bert.config.decoder.vocab_size
    self.bert2bert.config.max_length = 142
    self.bert2bert.config.min_length = 56
    self.bert2bert.config.no_repeat_ngram_size = 3
    self.bert2bert.config.early_stopping = True
    self.bert2bert.config.length_penalty = 2.0
    self.bert2bert.config.num_beams = 4

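# A minimal, self-contained generation sketch for a bert2bert configured as
# above; the checkpoint names, special-token choices, and sample text are
# illustrative assumptions, not values from the original code.
from transformers import BertTokenizer, EncoderDecoderModel

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
bert2bert = EncoderDecoderModel.from_encoder_decoder_pretrained(
    "bert-base-uncased", "bert-base-uncased")
bert2bert.config.decoder_start_token_id = tokenizer.cls_token_id
bert2bert.config.pad_token_id = tokenizer.pad_token_id

inputs = tokenizer("an input document to summarize", return_tensors="pt")
summary_ids = bert2bert.generate(
    inputs["input_ids"], attention_mask=inputs["attention_mask"],
    num_beams=4, max_length=142)
print(tokenizer.decode(summary_ids[0], skip_special_tokens=True))
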
def build(self):
    # To be further set.
    self.image_feature_module = build_image_encoder(
        self.config.image_feature_processor, direct_features=True)
    if self.config.concate_trace:
        self.trace_feature_module = build_encoder(self.config.trace_feature_encoder)
    if self.config.base_model_name == "bert-base-uncased":
        self.encoderdecoder = EncoderDecoderModel.from_encoder_decoder_pretrained(
            "bert-base-uncased", "bert-base-uncased")
    elif self.config.base_model_name == "2layer-base":
        config_encoder = BertConfig()
        config_decoder = BertConfig()
        config_encoder.num_hidden_layers = 2
        config_decoder.num_hidden_layers = 2
        self.codec_config = EncoderDecoderConfig.from_encoder_decoder_configs(
            config_encoder, config_decoder)
        self.encoderdecoder = EncoderDecoderModel(config=self.codec_config)
    elif self.config.base_model_name == "3layer-base":
        config_encoder = BertConfig()
        config_decoder = BertConfig()
        config_encoder.num_hidden_layers = 3
        config_decoder.num_hidden_layers = 3
        self.codec_config = EncoderDecoderConfig.from_encoder_decoder_configs(
            config_encoder, config_decoder)
        self.encoderdecoder = EncoderDecoderModel(config=self.codec_config)
    if self.config.loop_contrastive:
        self.trace_caption_contrastive = TraceCaptionContrastiveModel(
            self.config.tc_contrastive_aggregate_method)
    if hasattr(self.config, "pretrans_attention") and self.config.pretrans_attention:
        tempconf = self.encoderdecoder.config.encoder
        num_heads = tempconf.num_attention_heads
        num_layers = tempconf.num_hidden_layers
        self.attention_trans = AttentionTransform(num_layers, num_heads, 100)
    self.BOS_ID = 101

def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--device', default='0,1,2,3', type=str, required=False,
                        help='which GPUs to use')
    parser.add_argument('--model_path', default='model/epoch_0/model.pth', type=str,
                        required=False, help='model location')
    args = parser.parse_args()
    print('args:\n' + args.__repr__())
    model_path = args.model_path

    # Device: set which GPUs the program may use.
    os.environ["CUDA_VISIBLE_DEVICES"] = args.device
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    print('using device:', device)

    # Model.
    model = EncoderDecoderModel.from_encoder_decoder_pretrained(
        "bert-base-multilingual-cased", "bert-base-multilingual-cased")
    model.load_state_dict(torch.load(model_path))
    model.eval()

    # Tokenizer.
    tokenizer = BertTokenizerFast.from_pretrained("bert-base-multilingual-cased")

    # Print the parameter count.
    num_parameters = 0
    for parameter in model.parameters():
        num_parameters += parameter.numel()
    print('number of parameters: {}'.format(num_parameters))

    while True:
        question = input('Enter a question: ')
        ids = tokenizer.encode(question)
        input_ids = torch.tensor([ids], dtype=torch.long)
        generated = model.generate(
            input_ids, decoder_start_token_id=model.config.decoder.pad_token_id)
        answer = tokenizer.decode(generated[0, :])
        print(answer)

def generate_summaries(
    examples: list, out_file: str, model_name: str, batch_size: int = 8, device: str = DEFAULT_DEVICE
):
    fout = Path(out_file).open("w")
    # model = EncoderDecoderModel.from_pretrained(model_name, output_past=True).to(device)
    model = EncoderDecoderModel.from_encoder_decoder_pretrained(
        'bert-base-uncased', 'bert-base-uncased')
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    model.to(device)
    # max_length = 140
    # min_length = 55

    for batch in tqdm(list(chunks(examples, batch_size))):
        dct = tokenizer.batch_encode_plus(
            batch, max_length=128, return_tensors="pt",
            pad_to_max_length=True, add_special_tokens=True)
        print(dct["input_ids"][0])
        print(dct["attention_mask"][0])
        summaries = model.generate(
            input_ids=dct["input_ids"].to(device),
            attention_mask=dct["attention_mask"].to(device),
            num_beams=4,
            length_penalty=10.0,
            repetition_penalty=5.0,
            max_length=20,  # +2 from original because we start at step=1 and stop before max_length
            min_length=3,   # +1 from original because we start at step=1
            no_repeat_ngram_size=3,
            early_stopping=True,
            # decoder_start_token_id=model.config.decoder.bos_token_id
            decoder_start_token_id=0,
        )
        dec = [tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=False)
               for g in summaries]
        in_ids = dct["input_ids"].to(device)
        in_dec = [tokenizer.decode(i, skip_special_tokens=True, clean_up_tokenization_spaces=False)
                  for i in in_ids]
        for source, hypothesis in zip(in_dec, dec):
            fout.write(source + ' ||| ' + hypothesis + "\n")
            fout.flush()

def check_encoder_decoder_model_from_pretrained_using_model_paths(
        self, config, input_ids, attention_mask, encoder_hidden_states,
        decoder_config, decoder_input_ids, decoder_attention_mask, **kwargs):
    encoder_model, decoder_model = self.get_encoder_decoder_model(config, decoder_config)
    with tempfile.TemporaryDirectory() as encoder_tmp_dirname, \
            tempfile.TemporaryDirectory() as decoder_tmp_dirname:
        encoder_model.save_pretrained(encoder_tmp_dirname)
        decoder_model.save_pretrained(decoder_tmp_dirname)
        model_kwargs = {"encoder_hidden_dropout_prob": 0.0}

        # BartConfig has no hidden_dropout_prob.
        if not hasattr(decoder_config, "hidden_dropout_prob"):
            model_kwargs["decoder_activation_function"] = "gelu"
        else:
            model_kwargs["decoder_hidden_dropout_prob"] = 0.0
        enc_dec_model = EncoderDecoderModel.from_encoder_decoder_pretrained(
            encoder_tmp_dirname, decoder_tmp_dirname, **model_kwargs)
    enc_dec_model.to(torch_device)
    outputs_encoder_decoder = enc_dec_model(
        input_ids=input_ids,
        decoder_input_ids=decoder_input_ids,
        attention_mask=attention_mask,
        decoder_attention_mask=decoder_attention_mask,
        return_dict=True,
    )
    self.assertEqual(outputs_encoder_decoder["logits"].shape,
                     decoder_input_ids.shape + (decoder_config.vocab_size,))
    self.assertEqual(outputs_encoder_decoder["encoder_last_hidden_state"].shape,
                     input_ids.shape + (config.hidden_size,))

from transformers import EncoderDecoderModel

model_path = 'rubert_cased_L-12_H-768_A-12_pt'
model = EncoderDecoderModel.from_encoder_decoder_pretrained(model_path, model_path)
model.save_pretrained('pretrained_init_enc_dec')

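# The directory written above can later be restored as a single encoder-decoder
# model; a minimal sketch, assuming the save above has run:
model = EncoderDecoderModel.from_pretrained('pretrained_init_enc_dec')
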
def test_finetune_bert2bert(self):
    bert2bert = EncoderDecoderModel.from_encoder_decoder_pretrained(
        "prajjwal1/bert-tiny", "prajjwal1/bert-tiny")
    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

    bert2bert.config.vocab_size = bert2bert.config.encoder.vocab_size
    bert2bert.config.eos_token_id = tokenizer.sep_token_id
    bert2bert.config.decoder_start_token_id = tokenizer.cls_token_id
    bert2bert.config.max_length = 128

    train_dataset = datasets.load_dataset("cnn_dailymail", "3.0.0", split="train[:1%]")
    val_dataset = datasets.load_dataset("cnn_dailymail", "3.0.0", split="validation[:1%]")

    train_dataset = train_dataset.select(range(32))
    val_dataset = val_dataset.select(range(16))

    batch_size = 4

    def _map_to_encoder_decoder_inputs(batch):
        # The tokenizer automatically sets [BOS] <text> [EOS].
        inputs = tokenizer(batch["article"], padding="max_length", truncation=True, max_length=512)
        outputs = tokenizer(batch["highlights"], padding="max_length", truncation=True, max_length=128)
        batch["input_ids"] = inputs.input_ids
        batch["attention_mask"] = inputs.attention_mask
        batch["decoder_input_ids"] = outputs.input_ids
        batch["labels"] = outputs.input_ids.copy()
        # Mask out padding so it does not contribute to the loss.
        batch["labels"] = [
            [-100 if token == tokenizer.pad_token_id else token for token in labels]
            for labels in batch["labels"]
        ]
        batch["decoder_attention_mask"] = outputs.attention_mask

        assert all(len(x) == 512 for x in inputs.input_ids)
        assert all(len(x) == 128 for x in outputs.input_ids)

        return batch

    def _compute_metrics(pred):
        labels_ids = pred.label_ids
        pred_ids = pred.predictions

        # All unnecessary tokens are removed.
        pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
        label_str = tokenizer.batch_decode(labels_ids, skip_special_tokens=True)

        accuracy = sum(int(pred_str[i] == label_str[i]) for i in range(len(pred_str))) / len(pred_str)
        return {"accuracy": accuracy}

    # Map the train dataset.
    train_dataset = train_dataset.map(
        _map_to_encoder_decoder_inputs,
        batched=True,
        batch_size=batch_size,
        remove_columns=["article", "highlights"],
    )
    train_dataset.set_format(
        type="torch",
        columns=["input_ids", "attention_mask", "decoder_input_ids", "decoder_attention_mask", "labels"],
    )

    # Same for the validation dataset.
    val_dataset = val_dataset.map(
        _map_to_encoder_decoder_inputs,
        batched=True,
        batch_size=batch_size,
        remove_columns=["article", "highlights"],
    )
    val_dataset.set_format(
        type="torch",
        columns=["input_ids", "attention_mask", "decoder_input_ids", "decoder_attention_mask", "labels"],
    )

    output_dir = self.get_auto_remove_tmp_dir()

    training_args = Seq2SeqTrainingArguments(
        output_dir=output_dir,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        predict_with_generate=True,
        evaluation_strategy="steps",
        do_train=True,
        do_eval=True,
        warmup_steps=0,
        eval_steps=2,
        logging_steps=2,
    )

    # Instantiate the trainer.
    trainer = Seq2SeqTrainer(
        model=bert2bert,
        args=training_args,
        compute_metrics=_compute_metrics,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        tokenizer=tokenizer,
    )

    # Start training.
    trainer.train()

def get_pretrained_model(self):
    return EncoderDecoderModel.from_encoder_decoder_pretrained(
        "bert-large-uncased", "patrickvonplaten/prophetnet-decoder-clm-large-uncased")

def get_pretrained_model(self):
    return EncoderDecoderModel.from_encoder_decoder_pretrained(
        "bert-base-cased", "gpt2")

def get_pretrained_model(self):
    return EncoderDecoderModel.from_encoder_decoder_pretrained(
        "roberta-base", "roberta-base")

def get_pretrained_model(self):
    return EncoderDecoderModel.from_encoder_decoder_pretrained(
        "google/bert_for_seq_generation_L-24_bbc_encoder",
        "google/bert_for_seq_generation_L-24_bbc_encoder")

def __init__(self):
    super(BERTEncDecModel, self).__init__()
    print("Model creation...")
    self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    self.model = EncoderDecoderModel.from_encoder_decoder_pretrained(
        'bert-base-uncased', 'bert-base-uncased')

def test_real_bert_model_from_pretrained_add_cross_attention(self):
    model = EncoderDecoderModel.from_encoder_decoder_pretrained(
        "bert-base-uncased", "bert-base-uncased")
    self.assertTrue(hasattr(model.decoder.bert.encoder.layer[0], "crossattention"))

def test_real_bert_model_from_pretrained(self):
    model = EncoderDecoderModel.from_encoder_decoder_pretrained(
        "bert-base-uncased", "bert-base-uncased")
    self.assertIsNotNone(model)

val_data = val_data.map(
    process_data_to_model_inputs,
    batched=True,
    batch_size=batch_size,
    # remove_columns=['name', 'note'],
)
val_data.set_format(
    type='torch',
    columns=[
        'input_ids', 'attention_mask', 'decoder_input_ids',
        'decoder_attention_mask', 'labels'
    ],
)

ed_model = EncoderDecoderModel.from_encoder_decoder_pretrained(
    'bert-base-uncased', 'bert-base-uncased')

# Set special tokens.
ed_model.config.decoder_start_token_id = 1
ed_model.config.eos_token_id = input_tokenizer.eos_token_id
ed_model.config.pad_token_id = input_tokenizer.pad_token_id

# Sensible parameters for beam search.
ed_model.config.vocab_size = len(output_vocab)
ed_model.config.max_length = 142
ed_model.config.min_length = 56
ed_model.config.no_repeat_ngram_size = 3
ed_model.config.early_stopping = True
ed_model.config.length_penalty = 2.0
ed_model.config.num_beams = 4

def get_model(model=0, seed=8888):
    set_seed(seed)
    print("loading :", config.MODEL_LIST[model])
    config.TOKENIZER = AutoTokenizer.from_pretrained(config.MODEL_LIST[model])
    return EncoderDecoderModel.from_encoder_decoder_pretrained(
        config.MODEL_LIST[model], config.MODEL_LIST[model])

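# Hypothetical call to `get_model` above; index 0 assumes `config.MODEL_LIST`
# holds at least one checkpoint name.
seq2seq = get_model(model=0, seed=8888)
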