def train_model(config_path: str):
    writer = SummaryWriter()
    config = read_training_pipeline_params(config_path)
    logger.info("pretrained_emb {b}", b=config.net_params.pretrained_emb)
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    logger.info("Device is {device}", device=device)
    SRC, TRG, dataset = get_dataset(config.dataset_path, False)
    train_data, valid_data, test_data = split_data(
        dataset, **config.split_ration.__dict__)
    SRC.build_vocab(train_data, min_freq=3)
    TRG.build_vocab(train_data, min_freq=3)
    torch.save(SRC.vocab, config.src_vocab_name)
    torch.save(TRG.vocab, config.trg_vocab_name)
    logger.info("Vocab saved")
    print(f"Unique tokens in source (ru) vocabulary: {len(SRC.vocab)}")
    print(f"Unique tokens in target (en) vocabulary: {len(TRG.vocab)}")
    train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
        (train_data, valid_data, test_data),
        batch_size=config.BATCH_SIZE,
        device=device,
        sort_key=_len_sort_key,
    )
    INPUT_DIM = len(SRC.vocab)
    OUTPUT_DIM = len(TRG.vocab)
    # Set the causal-LM flags on the decoder config before building the model;
    # the original built the model twice with identical effect.
    config_encoder = BertConfig(vocab_size=INPUT_DIM)
    config_decoder = BertConfig(vocab_size=OUTPUT_DIM)
    config_decoder.is_decoder = True
    config_decoder.add_cross_attention = True
    # Use a separate name so the training-pipeline `config` is not shadowed.
    model_config = EncoderDecoderConfig.from_encoder_decoder_configs(
        config_encoder, config_decoder)
    model = EncoderDecoderModel(config=model_config)
    args = TrainingArguments(
        output_dir="output",
        evaluation_strategy="steps",
        eval_steps=500,
        per_device_train_batch_size=128,
        per_device_eval_batch_size=128,
        num_train_epochs=10,
        save_steps=3000,
        seed=0,
        load_best_model_at_end=True,
    )
    # args.place_model_on_device = device
    trainer = Trainer(
        model=model,
        args=args,
        train_dataset=train_iterator,
        eval_dataset=valid_iterator,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
    )
    trainer.train()
    model.save_pretrained("bert2bert")
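# `_len_sort_key` is referenced above but not defined in this snippet. A
# minimal sketch of what such a helper typically looks like for torchtext's
# BucketIterator (an assumption, not the original definition):
def _len_sort_key(example):
    # Bucket examples by source-sentence length to reduce padding per batch.
    return len(example.src)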
def build(self):
    # to be further set
    # breakpoint()
    self.image_feature_module = build_image_encoder(
        self.config.image_feature_processor, direct_features=True
    )
    if self.config.concate_trace:
        self.trace_feature_module = build_encoder(self.config.trace_feature_encoder)
    if self.config.base_model_name == "bert-base-uncased":
        self.encoderdecoder = EncoderDecoderModel.from_encoder_decoder_pretrained(
            "bert-base-uncased", "bert-base-uncased"
        )
    elif self.config.base_model_name == "2layer-base":
        config_encoder = BertConfig()
        config_decoder = BertConfig()
        config_encoder.max_position_embeddings = 1090
        config_encoder.num_hidden_layers = 2
        config_decoder.num_hidden_layers = 2
        self.codec_config = EncoderDecoderConfig.from_encoder_decoder_configs(
            config_encoder, config_decoder
        )
        self.encoderdecoder = EncoderDecoderModel(config=self.codec_config)
    elif self.config.base_model_name == "3layer-base":
        config_encoder = BertConfig()
        config_decoder = BertConfig()
        config_encoder.num_hidden_layers = 3
        config_decoder.num_hidden_layers = 3
        self.codec_config = EncoderDecoderConfig.from_encoder_decoder_configs(
            config_encoder, config_decoder
        )
        self.encoderdecoder = EncoderDecoderModel(config=self.codec_config)
    if self.config.loop_contrastive:
        self.trace_caption_contrastive = TraceCaptionContrastiveModel(
            self.config.tc_contrastive_aggregate_method
        )
    if (
        hasattr(self.config, "pretrans_attention")
        and self.config.pretrans_attention
    ):
        # import ipdb; ipdb.set_trace()
        tempconf = self.encoderdecoder.config.encoder
        num_heads = tempconf.num_attention_heads
        num_layers = tempconf.num_hidden_layers
        self.attention_trans = AttentionTransform(num_layers, num_heads, 100)
    self.BOS_ID = 101
    self.vae = OpenAIDiscreteVAE()
    image_code_dim = 768
    image_fmap_size = self.vae.image_size // (2 ** self.vae.num_layers)
    self.image_seq_len = image_fmap_size ** 2
    self.image_emb = torch.nn.Embedding(self.vae.num_tokens, image_code_dim)
    self.image_pos_emb = AxialPositionalEmbedding(
        image_code_dim, axial_shape=(image_fmap_size, image_fmap_size)
    )
def __init__(self, word_num, embedding_dim, batch_size):
    super().__init__()
    self.word_num = word_num
    self.embedding_dim = embedding_dim
    self.batch_size = batch_size
    self.config_encoder = BertConfig(
        vocab_size=word_num,
        hidden_size=embedding_dim,
        num_hidden_layers=6,
        num_attention_heads=2,
        intermediate_size=512,
        output_hidden_states=False,
        output_attentions=False)  # shape (bs, inp_len, inp_len)
    self.config_decoder = BertConfig(
        vocab_size=word_num,
        hidden_size=embedding_dim,
        num_hidden_layers=6,
        num_attention_heads=2,
        intermediate_size=512,
        output_hidden_states=True,
        output_attentions=False)  # shape (bs, tar_len, tar_len)
    self.config = EncoderDecoderConfig.from_encoder_decoder_configs(
        self.config_encoder, self.config_decoder)
    self.encoder = BertModel(config=self.config_encoder)
    # self.seq2seq = EncoderDecoderModel(config=self.config)
    # self.fc1 = nn.Linear(word_num, 1)
    self.fc2 = nn.Linear(embedding_dim, 1)
def __init__(self, config, sequence_length, use_pretrained=True, pretrained_model=None):
    """Constructor"""
    super().__init__(config, sequence_length)
    # suspend logging due to hellish verbosity
    lvl = logging.getLogger().level
    logging.getLogger().setLevel(logging.WARN)
    config_args = {"pretrained_model_name_or_path": self.pretrained_id}
    if pretrained_model is None:
        if use_pretrained:
            model = EncoderDecoderModel.from_encoder_decoder_pretrained(
                self.pretrained_id, self.pretrained_id)
        else:
            enc, dec = BertConfig(), BertConfig()
            dec.is_decoder = True
            dec.add_cross_attention = True
            enc_dec_config = EncoderDecoderConfig.from_encoder_decoder_configs(
                enc, dec)
            model = EncoderDecoderModel(config=enc_dec_config)
        logging.getLogger().setLevel(lvl)
        self.model = model
    else:
        self.model = pretrained_model
    logging.getLogger().setLevel(self.config.print.log_level.upper())
def check_encoder_decoder_model_from_pretrained_configs(
        self, config, input_ids, attention_mask, encoder_hidden_states,
        decoder_config, decoder_input_ids, decoder_attention_mask, **kwargs):
    encoder_decoder_config = EncoderDecoderConfig.from_encoder_decoder_configs(
        config, decoder_config)
    self.assertTrue(encoder_decoder_config.decoder.is_decoder)

    enc_dec_model = EncoderDecoderModel(encoder_decoder_config)
    enc_dec_model.to(torch_device)
    enc_dec_model.eval()

    self.assertTrue(enc_dec_model.config.is_encoder_decoder)

    outputs_encoder_decoder = enc_dec_model(
        input_ids=input_ids,
        decoder_input_ids=decoder_input_ids,
        attention_mask=attention_mask,
        decoder_attention_mask=decoder_attention_mask,
        return_dict=True,
    )

    self.assertEqual(outputs_encoder_decoder["logits"].shape,
                     (decoder_input_ids.shape + (decoder_config.vocab_size,)))
    self.assertEqual(
        outputs_encoder_decoder["encoder_last_hidden_state"].shape,
        (input_ids.shape + (config.hidden_size,)))
def __init__(self, config, pad_id):
    super(Transformer, self).__init__()
    encoder_config = BertConfig(
        vocab_size=config.src_vocab_size,
        hidden_size=config.h_size,
        num_hidden_layers=config.enc_layers,
        num_attention_heads=config.n_heads,
        intermediate_size=config.d_ff,
        hidden_dropout_prob=config.dropout,
        pad_token_id=pad_id,
    )
    decoder_config = BertConfig(
        vocab_size=config.tgt_vocab_size,
        hidden_size=config.h_size,
        num_hidden_layers=config.dec_layers,
        num_attention_heads=config.n_heads,
        intermediate_size=config.d_ff,
        hidden_dropout_prob=config.dropout,
        pad_token_id=pad_id,
        is_decoder=True,
        add_cross_attention=True,
    )
    encoder_decoder_config = EncoderDecoderConfig.from_encoder_decoder_configs(
        encoder_config, decoder_config)
    self.tr = EncoderDecoderModel(config=encoder_decoder_config)
    if config.joined_vocab:
        # Share the embedding matrix between encoder and decoder when the
        # source and target vocabularies are joined.
        self.tr.encoder.embeddings.word_embeddings = \
            self.tr.decoder.bert.embeddings.word_embeddings
def __init__(self, config, dataset):
    super(BERT2BERT, self).__init__(config, dataset)

    self.sos_token_idx = 101
    self.eos_token_idx = 102
    self.pretrained_model_path = config['pretrained_model_path']
    self.tokenizer = BertTokenizer.from_pretrained(self.pretrained_model_path)

    self.encoder_configure = BertConfig.from_pretrained(self.pretrained_model_path)
    self.decoder_configure = BertConfig.from_pretrained(self.pretrained_model_path)
    self.encoder_decoder_config = EncoderDecoderConfig.from_encoder_decoder_configs(
        encoder_config=self.encoder_configure,
        decoder_config=self.decoder_configure
    )

    self.encoder = BertGenerationEncoder.from_pretrained(
        self.pretrained_model_path,
        bos_token_id=self.sos_token_idx,
        eos_token_id=self.eos_token_idx
    )
    self.decoder = BertGenerationDecoder.from_pretrained(
        self.pretrained_model_path,
        bos_token_id=self.sos_token_idx,
        eos_token_id=self.eos_token_idx,
        add_cross_attention=True,
        is_decoder=True
    )
    self.model = EncoderDecoderModel(
        encoder=self.encoder, decoder=self.decoder,
        config=self.encoder_decoder_config)

    self.padding_token_idx = self.tokenizer.pad_token_id
    self.loss = nn.CrossEntropyLoss(ignore_index=self.padding_token_idx,
                                    reduction='none')
def check_equivalence_tf_to_pt(self, config, decoder_config, inputs_dict):
    encoder_decoder_config = EncoderDecoderConfig.from_encoder_decoder_configs(config, decoder_config)

    # Using `_tf_model`, the test will fail, because the weights of `_tf_model` get extended before saving
    # the encoder/decoder models.
    # There was a (very) ugly potential fix, which wasn't integrated to `transformers`: see
    # https://github.com/huggingface/transformers/pull/13222/commits/dbb3c9de76eee235791d2064094654637c99f36d#r697304245
    # (the change in `src/transformers/modeling_tf_utils.py`)
    _tf_model = TFEncoderDecoderModel(encoder_decoder_config)
    # Make sure model is built
    _tf_model(**inputs_dict)

    # Using `tf_model` to pass the test.
    encoder = _tf_model.encoder.__class__(encoder_decoder_config.encoder)
    decoder = _tf_model.decoder.__class__(encoder_decoder_config.decoder)
    # Make sure models are built
    encoder(encoder.dummy_inputs)
    decoder(decoder.dummy_inputs)
    tf_model = TFEncoderDecoderModel(encoder=encoder, decoder=decoder)

    with tempfile.TemporaryDirectory() as encoder_tmp_dirname, tempfile.TemporaryDirectory() as decoder_tmp_dirname:
        tf_model.encoder.save_pretrained(encoder_tmp_dirname)
        tf_model.decoder.save_pretrained(decoder_tmp_dirname)
        pt_model = EncoderDecoderModel.from_encoder_decoder_pretrained(
            encoder_tmp_dirname, decoder_tmp_dirname, encoder_from_tf=True, decoder_from_tf=True
        )
        # This is only for copying some specific attributes of this particular model.
        pt_model.config = tf_model.config

    self.check_pt_tf_equivalence(pt_model, tf_model, inputs_dict)
def get_encoder_decoder_config(self):
    encoder_config = AutoConfig.from_pretrained("bert-base-uncased")
    decoder_config = AutoConfig.from_pretrained(
        "bert-base-uncased", is_decoder=True, add_cross_attention=True)
    return EncoderDecoderConfig.from_encoder_decoder_configs(
        encoder_config, decoder_config)
def get_encoder_decoder_config_small(self):
    encoder_config = AutoConfig.from_pretrained(
        "hf-internal-testing/tiny-bert")
    decoder_config = AutoConfig.from_pretrained(
        "hf-internal-testing/tiny-bert",
        is_decoder=True,
        add_cross_attention=True)
    return EncoderDecoderConfig.from_encoder_decoder_configs(
        encoder_config, decoder_config)
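# A minimal sketch of how the combined configs returned by the two helpers
# above are typically consumed; the function name and save path here are
# illustrative assumptions, not part of the original test code.
from transformers import EncoderDecoderConfig, EncoderDecoderModel

def demo_config_roundtrip(combined_config: EncoderDecoderConfig):
    model = EncoderDecoderModel(combined_config)  # randomly initialised weights
    model.config.save_pretrained("tiny-enc-dec")  # writes config.json
    reloaded = EncoderDecoderConfig.from_pretrained("tiny-enc-dec")
    assert reloaded.encoder.hidden_size == combined_config.encoder.hidden_size
    return model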
def load_model(path, model=0):
    config_encoder = AutoConfig.from_pretrained(config.MODEL_LIST[model])
    config_decoder = AutoConfig.from_pretrained(config.MODEL_LIST[model])
    configer = EncoderDecoderConfig.from_encoder_decoder_configs(
        config_encoder, config_decoder)
    config.TOKENIZER = AutoTokenizer.from_pretrained(config.MODEL_LIST[model])
    # note: `model` is rebound below, shadowing the index argument
    model = EncoderDecoderModel.from_pretrained(path, config=configer)
    print('MODEL LOADED!')
    return model
def check_equivalence_flax_to_pt(self, config, decoder_config, inputs_dict):
    encoder_decoder_config = EncoderDecoderConfig.from_encoder_decoder_configs(config, decoder_config)

    pt_model = EncoderDecoderModel(encoder_decoder_config)
    fx_model = FlaxEncoderDecoderModel(encoder_decoder_config)

    pt_model = load_flax_weights_in_pytorch_model(pt_model, fx_model.params)

    self.check_pt_flax_equivalence(pt_model, fx_model, inputs_dict)
def build(self):
    # to be further set
    # breakpoint()
    self.image_feature_module = build_image_encoder(
        self.config.image_feature_processor, direct_features=True)
    if self.config.concate_trace:
        self.trace_feature_module = build_encoder(
            self.config.trace_feature_encoder)
    if self.config.base_model_name == "bert-base-uncased":
        self.encoderdecoder = EncoderDecoderModel.from_encoder_decoder_pretrained(
            "bert-base-uncased", "bert-base-uncased")
    elif self.config.base_model_name == "2layer-base":
        config_encoder = BertConfig()
        config_decoder = BertConfig()
        config_encoder.num_hidden_layers = 2
        config_decoder.num_hidden_layers = 2
        self.codec_config = EncoderDecoderConfig.from_encoder_decoder_configs(
            config_encoder, config_decoder)
        self.encoderdecoder = EncoderDecoderModel(config=self.codec_config)
    elif self.config.base_model_name == "3layer-base":
        config_encoder = BertConfig()
        config_decoder = BertConfig()
        config_encoder.num_hidden_layers = 3
        config_decoder.num_hidden_layers = 3
        self.codec_config = EncoderDecoderConfig.from_encoder_decoder_configs(
            config_encoder, config_decoder)
        self.encoderdecoder = EncoderDecoderModel(config=self.codec_config)
    if self.config.loop_contrastive:
        self.trace_caption_contrastive = TraceCaptionContrastiveModel(
            self.config.tc_contrastive_aggregate_method)
    if (hasattr(self.config, "pretrans_attention")
            and self.config.pretrans_attention):
        # import ipdb; ipdb.set_trace()
        tempconf = self.encoderdecoder.config.encoder
        num_heads = tempconf.num_attention_heads
        num_layers = tempconf.num_hidden_layers
        self.attention_trans = AttentionTransform(num_layers, num_heads, 100)
    self.BOS_ID = 101
def check_equivalence_pt_to_flax(self, config, decoder_config, inputs_dict):
    encoder_decoder_config = EncoderDecoderConfig.from_encoder_decoder_configs(config, decoder_config)

    pt_model = EncoderDecoderModel(encoder_decoder_config)
    fx_model = FlaxEncoderDecoderModel(encoder_decoder_config)

    fx_state = convert_pytorch_state_dict_to_flax(pt_model.state_dict(), fx_model)
    fx_model.params = fx_state

    self.check_pt_flax_equivalence(pt_model, fx_model, inputs_dict)
def test_pt_tf_equivalence(self):
    config_inputs_dict = self.prepare_config_and_inputs()
    # Keep only common arguments
    arg_names = [
        "config",
        "input_ids",
        "attention_mask",
        "decoder_config",
        "decoder_input_ids",
        "decoder_attention_mask",
        "encoder_hidden_states",
    ]
    config_inputs_dict = {k: v for k, v in config_inputs_dict.items() if k in arg_names}

    config = config_inputs_dict.pop("config")
    decoder_config = config_inputs_dict.pop("decoder_config")

    inputs_dict = config_inputs_dict
    # `encoder_hidden_states` is not used in model call/forward
    del inputs_dict["encoder_hidden_states"]

    # Avoid the case where a sequence has no place to attend (after combined with the causal attention mask)
    batch_size = inputs_dict["decoder_attention_mask"].shape[0]
    inputs_dict["decoder_attention_mask"] = tf.constant(
        np.concatenate([np.ones(shape=(batch_size, 1)), inputs_dict["decoder_attention_mask"][:, 1:]], axis=1)
    )

    # TF models don't use the `use_cache` option and cache is not returned as a default.
    # So we disable `use_cache` here for PyTorch model.
    decoder_config.use_cache = False

    self.assertTrue(decoder_config.cross_attention_hidden_size is None)

    # check without `enc_to_dec_proj` projection
    self.assertTrue(config.hidden_size == decoder_config.hidden_size)
    self.check_equivalence_pt_to_tf(config, decoder_config, inputs_dict)
    self.check_equivalence_tf_to_pt(config, decoder_config, inputs_dict)

    # This is not working, because the pt/tf equivalence test for encoder-decoder uses `from_encoder_decoder_pretrained`,
    # which randomly initializes `enc_to_dec_proj`.
    #
    # # check `enc_to_dec_proj` work as expected
    # decoder_config.hidden_size = decoder_config.hidden_size * 2
    # self.assertTrue(config.hidden_size != decoder_config.hidden_size)
    # self.check_equivalence_pt_to_tf(config, decoder_config, inputs_dict)
    # self.check_equivalence_tf_to_pt(config, decoder_config, inputs_dict)

    # Let's just check `enc_to_dec_proj` can run for now
    decoder_config.hidden_size = decoder_config.hidden_size * 2
    self.assertTrue(config.hidden_size != decoder_config.hidden_size)
    encoder_decoder_config = EncoderDecoderConfig.from_encoder_decoder_configs(config, decoder_config)
    model = TFEncoderDecoderModel(encoder_decoder_config)
    model(**inputs_dict)
def inference():
    step = sys.argv[1]
    encoder_config = BertConfig.from_pretrained("monologg/kobert")
    decoder_config = BertConfig.from_pretrained("monologg/kobert")
    config = EncoderDecoderConfig.from_encoder_decoder_configs(
        encoder_config, decoder_config)
    tokenizer = KoBertTokenizer()
    model = EncoderDecoderModel(config=config)
    ckpt = "model.pt"
    device = "cuda"

    model.load_state_dict(
        torch.load(f"saved/{ckpt}.{step}", map_location="cuda"),
        strict=True,
    )
    model = model.half().eval().to(device)

    test_data = open("dataset/abstractive_test_v2.jsonl", "r").read().splitlines()
    submission = open(f"submission_{step}.csv", "w")

    test_set = []
    for data in test_data:
        data = json.loads(data)
        article_original = data["article_original"]
        article_original = " ".join(article_original)
        news_id = data["id"]
        test_set.append((news_id, article_original))

    for i, (news_id, text) in tqdm(enumerate(test_set)):
        tokens = tokenizer.encode_batch([text], max_length=512)
        generated = model.generate(
            input_ids=tokens["input_ids"].to(device),
            attention_mask=tokens["attention_mask"].to(device),
            use_cache=True,
            bos_token_id=tokenizer.token2idx["[CLS]"],
            eos_token_id=tokenizer.token2idx["[SEP]"],
            pad_token_id=tokenizer.token2idx["[PAD]"],
            num_beams=12,
            do_sample=False,
            temperature=1.0,
            no_repeat_ngram_size=3,
            bad_words_ids=[[tokenizer.token2idx["[UNK]"]]],
            length_penalty=1.0,
            repetition_penalty=1.5,
            max_length=512,
        )
        output = tokenizer.decode_batch(generated.tolist())[0]
        submission.write(f"{news_id},{output}" + "\n")
        print(news_id, output)
def get_model(vocab_size=30000):
    config_encoder = BertConfig()
    config_decoder = BertConfig()
    config_encoder.vocab_size = vocab_size
    config_decoder.vocab_size = vocab_size
    config_decoder.is_decoder = True
    config_decoder.add_cross_attention = True
    config = EncoderDecoderConfig.from_encoder_decoder_configs(
        config_encoder, config_decoder)
    model = EncoderDecoderModel(config=config)
    return model
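# A brief usage sketch for get_model above; the dummy tensors, shapes, and
# helper name are illustrative assumptions. When `labels` is supplied,
# EncoderDecoderModel computes the LM cross-entropy loss itself.
import torch

def _demo_get_model():
    model = get_model(vocab_size=30000)
    src = torch.randint(0, 30000, (2, 16))  # (batch, src_len)
    tgt = torch.randint(0, 30000, (2, 12))  # (batch, tgt_len)
    out = model(input_ids=src, decoder_input_ids=tgt, labels=tgt)
    return out.loss, out.logits             # logits: (2, 12, 30000)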
def get_model(args):
    if args.model_path:
        model = EncoderDecoderModel.from_pretrained(args.model_path)
        src_tokenizer = BertTokenizer.from_pretrained(
            os.path.join(args.model_path, "src_tokenizer")
        )
        tgt_tokenizer = GPT2Tokenizer.from_pretrained(
            os.path.join(args.model_path, "tgt_tokenizer")
        )
        tgt_tokenizer.build_inputs_with_special_tokens = types.MethodType(
            build_inputs_with_special_tokens, tgt_tokenizer
        )
        if local_rank == 0 or local_rank == -1:
            print("model and tokenizer load from save success")
    else:
        src_tokenizer = BertTokenizer.from_pretrained(args.src_pretrain_dataset_name)
        tgt_tokenizer = GPT2Tokenizer.from_pretrained(args.tgt_pretrain_dataset_name)
        tgt_tokenizer.add_special_tokens(
            {"bos_token": "[BOS]", "eos_token": "[EOS]", "pad_token": "[PAD]"}
        )
        tgt_tokenizer.build_inputs_with_special_tokens = types.MethodType(
            build_inputs_with_special_tokens, tgt_tokenizer
        )
        encoder = BertGenerationEncoder.from_pretrained(args.src_pretrain_dataset_name)
        decoder = GPT2LMHeadModel.from_pretrained(
            args.tgt_pretrain_dataset_name, add_cross_attention=True, is_decoder=True
        )
        decoder.resize_token_embeddings(len(tgt_tokenizer))
        decoder.config.bos_token_id = tgt_tokenizer.bos_token_id
        decoder.config.eos_token_id = tgt_tokenizer.eos_token_id
        decoder.config.vocab_size = len(tgt_tokenizer)
        decoder.config.add_cross_attention = True
        decoder.config.is_decoder = True
        model_config = EncoderDecoderConfig.from_encoder_decoder_configs(
            encoder.config, decoder.config
        )
        model = EncoderDecoderModel(
            encoder=encoder, decoder=decoder, config=model_config
        )
    if local_rank != -1:
        model = model.to(device)
        if args.ngpu > 1:
            print("{}/{} GPU start".format(local_rank, torch.cuda.device_count()))
            model = torch.nn.parallel.DistributedDataParallel(
                model, device_ids=[local_rank], output_device=local_rank
            )
    optimizer, scheduler = get_optimizer_and_schedule(args, model)
    return model, src_tokenizer, tgt_tokenizer, optimizer, scheduler
def check_equivalence_pt_to_tf(self, config, decoder_config, inputs_dict):
    encoder_decoder_config = EncoderDecoderConfig.from_encoder_decoder_configs(config, decoder_config)
    pt_model = EncoderDecoderModel(encoder_decoder_config)

    with tempfile.TemporaryDirectory() as encoder_tmp_dirname, tempfile.TemporaryDirectory() as decoder_tmp_dirname:
        pt_model.encoder.save_pretrained(encoder_tmp_dirname)
        pt_model.decoder.save_pretrained(decoder_tmp_dirname)
        tf_model = TFEncoderDecoderModel.from_encoder_decoder_pretrained(
            encoder_tmp_dirname, decoder_tmp_dirname, encoder_from_pt=True, decoder_from_pt=True
        )
        # This is only for copying some specific attributes of this particular model.
        tf_model.config = pt_model.config

    self.check_pt_tf_equivalence(pt_model, tf_model, inputs_dict)
def test_relative_position_embeds(self):
    config_and_inputs = self.prepare_config_and_inputs()

    encoder_config = config_and_inputs["config"]
    decoder_config = config_and_inputs["decoder_config"]

    encoder_config.position_embedding_type = "relative_key_query"
    decoder_config.position_embedding_type = "relative_key_query"

    config = EncoderDecoderConfig.from_encoder_decoder_configs(encoder_config, decoder_config)
    model = EncoderDecoderModel(config).eval().to(torch_device)

    logits = model(
        input_ids=config_and_inputs["input_ids"],
        decoder_input_ids=config_and_inputs["decoder_input_ids"]
    ).logits

    # The original used `assertTrue(logits.shape, (13, 7))`, which always passes;
    # compare the (batch, sequence) dimensions explicitly instead.
    self.assertEqual(logits.shape[:2], (13, 7))
class EncoderDecoderAdapterTestBase(AdapterTestBase):
    model_class = EncoderDecoderModel
    config_class = EncoderDecoderConfig
    config = staticmethod(
        lambda: EncoderDecoderConfig.from_encoder_decoder_configs(
            BertConfig(
                hidden_size=32,
                num_hidden_layers=4,
                num_attention_heads=4,
                intermediate_size=37,
            ),
            BertConfig(
                hidden_size=32,
                num_hidden_layers=4,
                num_attention_heads=4,
                intermediate_size=37,
                is_decoder=True,
                add_cross_attention=True,
            ),
        ))
    tokenizer_name = "bert-base-uncased"
def __init__(
    self,
    model_save_path: str,
    batch_size: int,
    num_gpus: int,
    max_len: int = 512,
    lr: float = 3e-5,
    weight_decay: float = 1e-4,
    save_step_interval: int = 1000,
    accelerator: str = "ddp",
    precision: int = 16,
    use_amp: bool = True,
) -> None:
    super(Bert2Bert, self).__init__(
        model_save_path=model_save_path,
        max_len=max_len,
        batch_size=batch_size,
        num_gpus=num_gpus,
        lr=lr,
        weight_decay=weight_decay,
        save_step_interval=save_step_interval,
        accelerator=accelerator,
        precision=precision,
        use_amp=use_amp,
    )

    encoder_config = BertConfig.from_pretrained("monologg/kobert")
    decoder_config = BertConfig.from_pretrained("monologg/kobert")
    config = EncoderDecoderConfig.from_encoder_decoder_configs(
        encoder_config, decoder_config)

    self.model = EncoderDecoderModel(config)
    self.tokenizer = KoBertTokenizer()

    # Initialise both halves from the pretrained KoBERT weights; the decoder
    # load is non-strict because its cross-attention weights have no source.
    state_dict = BertModel.from_pretrained("monologg/kobert").state_dict()
    self.model.encoder.load_state_dict(state_dict)
    self.model.decoder.bert.load_state_dict(state_dict, strict=False)
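# Behavioural detail relied on above: `from_encoder_decoder_configs` itself
# forces `is_decoder=True` and `add_cross_attention=True` on the decoder
# config, which is why the plain KoBERT decoder config works here. A quick
# standalone check (assumes only `transformers` is installed):
from transformers import BertConfig, EncoderDecoderConfig

combined = EncoderDecoderConfig.from_encoder_decoder_configs(BertConfig(), BertConfig())
assert combined.decoder.is_decoder and combined.decoder.add_cross_attention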
def __init__(self, config, dataset):
    super(BERT2BERT, self).__init__(config, dataset)

    self.tokenizer = BertTokenizer.from_pretrained('bert-base-cased')

    self.encoder_configure = BertConfig.from_pretrained('bert-base-cased')
    self.decoder_configure = BertConfig.from_pretrained('bert-base-cased')
    self.encoder_decoder_config = EncoderDecoderConfig.from_encoder_decoder_configs(
        encoder_config=self.encoder_configure,
        decoder_config=self.decoder_configure)

    self.encoder = BertGenerationEncoder.from_pretrained(
        'bert-base-cased', bos_token_id=101, eos_token_id=102)
    self.decoder = BertGenerationDecoder.from_pretrained(
        'bert-base-cased', add_cross_attention=True, is_decoder=True,
        bos_token_id=101, eos_token_id=102)
    self.encoder_decoder = EncoderDecoderModel(
        encoder=self.encoder, decoder=self.decoder,
        config=self.encoder_decoder_config)

    self.sos_token = dataset.sos_token
    self.eos_token = dataset.eos_token
    self.padding_token_idx = self.tokenizer.pad_token_id
    self.max_source_length = config['source_max_seq_length']
    self.max_target_length = config['target_max_seq_length']

    self.loss = nn.CrossEntropyLoss(ignore_index=self.padding_token_idx,
                                    reduction='none')
def sample_generate(top_k=50, temperature=1.0,
                    model_path='/content/BERT checkpoints/model-9.pth',
                    gpu_id=0):
    # make sure your model is on GPU
    device = torch.device(f"cuda:{gpu_id}")

    # ------------------------LOAD MODEL-----------------
    print('load the model....')
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    bert_encoder = BertConfig.from_pretrained('bert-base-uncased')
    bert_decoder = BertConfig.from_pretrained('bert-base-uncased', is_decoder=True)
    config = EncoderDecoderConfig.from_encoder_decoder_configs(
        bert_encoder, bert_decoder)
    model = EncoderDecoderModel(config)
    model.load_state_dict(torch.load(model_path, map_location='cuda'))
    model = model.to(device)
    encoder = model.get_encoder()
    decoder = model.get_decoder()
    model.eval()
    print('load success')
    # ------------------------END LOAD MODEL--------------

    # ------------------------LOAD VALIDATE DATA------------------
    test_data = torch.load("/content/test_data.pth")
    test_dataset = TensorDataset(*test_data)
    test_dataloader = DataLoader(dataset=test_dataset, shuffle=False, batch_size=1)
    # ------------------------END LOAD VALIDATE DATA--------------

    # ------------------------START GENERATE-------------------
    update_count = 0
    bleu_2scores = 0
    bleu_4scores = 0
    nist_2scores = 0
    nist_4scores = 0
    sentences = []
    meteor_scores = 0

    print('start generating....')
    for batch in test_dataloader:
        with torch.no_grad():
            batch = [item.to(device) for item in batch]
            encoder_input, decoder_input, mask_encoder_input, _ = batch
            past, _ = encoder(encoder_input, mask_encoder_input)

            prev_pred = decoder_input[:, :1]
            sentence = prev_pred

            # decoding loop: sample one token at a time with top-k filtering
            for i in range(100):
                logits = decoder(sentence, encoder_hidden_states=past)
                logits = logits[0][:, -1]
                logits = logits.squeeze(1) / temperature
                logits = top_k_logits(logits, k=top_k)
                probs = F.softmax(logits, dim=-1)
                prev_pred = torch.multinomial(probs, num_samples=1)
                sentence = torch.cat([sentence, prev_pred], dim=-1)
                if prev_pred[0][0] == 102:  # [SEP] ends the sequence
                    break

            predict = tokenizer.convert_ids_to_tokens(sentence[0].tolist())

            encoder_input = encoder_input.squeeze(dim=0)
            encoder_input_num = (encoder_input != 0).sum()
            inputs = tokenizer.convert_ids_to_tokens(
                encoder_input[:encoder_input_num].tolist())

            decoder_input = decoder_input.squeeze(dim=0)
            decoder_input_num = (decoder_input != 0).sum()
            reference = tokenizer.convert_ids_to_tokens(
                decoder_input[:decoder_input_num].tolist())

            print('-' * 20 + f"example {update_count}" + '-' * 20)
            print(f"input: {' '.join(inputs)}")
            print(f"output: {' '.join(reference)}")
            print(f"predict: {' '.join(predict)}")

            temp_bleu_2, \
                temp_bleu_4, \
                temp_nist_2, \
                temp_nist_4, \
                temp_meteor_scores = calculate_metrics(predict[1:-1], reference[1:-1])

            bleu_2scores += temp_bleu_2
            bleu_4scores += temp_bleu_4
            nist_2scores += temp_nist_2
            nist_4scores += temp_nist_4
            meteor_scores += temp_meteor_scores

            sentences.append(" ".join(predict[1:-1]))
            update_count += 1

    entro, dist = cal_entropy(sentences)
    mean_len, var_len = cal_length(sentences)
    print(f'avg: {mean_len}, var: {var_len}')
    print(f'entro: {entro}')
    print(f'dist: {dist}')
    print(f'test bleu_2scores: {bleu_2scores / update_count}')
    print(f'test bleu_4scores: {bleu_4scores / update_count}')
    print(f'test nist_2scores: {nist_2scores / update_count}')
    print(f'test nist_4scores: {nist_4scores / update_count}')
    print(f'test meteor_scores: {meteor_scores / update_count}')
def create_and_check_encoder_decoder_shared_weights(
        self, config, input_ids, attention_mask, encoder_hidden_states,
        decoder_config, decoder_input_ids, decoder_attention_mask, labels,
        **kwargs):
    torch.manual_seed(0)
    encoder_model, decoder_model = self.get_encoder_decoder_model(
        config, decoder_config)
    model = EncoderDecoderModel(encoder=encoder_model, decoder=decoder_model)
    model.to(torch_device)
    model.eval()

    # load state dict copies weights but does not tie them
    decoder_state_dict = model.decoder._modules[
        model.decoder.base_model_prefix].state_dict()
    model.encoder.load_state_dict(decoder_state_dict, strict=False)

    torch.manual_seed(0)
    tied_encoder_model, tied_decoder_model = self.get_encoder_decoder_model(
        config, decoder_config)
    config = EncoderDecoderConfig.from_encoder_decoder_configs(
        tied_encoder_model.config,
        tied_decoder_model.config,
        tie_encoder_decoder=True)
    tied_model = EncoderDecoderModel(encoder=tied_encoder_model,
                                     decoder=tied_decoder_model,
                                     config=config)
    tied_model.to(torch_device)
    tied_model.eval()

    model_result = model(
        input_ids=input_ids,
        decoder_input_ids=decoder_input_ids,
        attention_mask=attention_mask,
        decoder_attention_mask=decoder_attention_mask,
    )

    tied_model_result = tied_model(
        input_ids=input_ids,
        decoder_input_ids=decoder_input_ids,
        attention_mask=attention_mask,
        decoder_attention_mask=decoder_attention_mask,
    )

    # check that the tied model has fewer parameters
    self.assertLess(sum(p.numel() for p in tied_model.parameters()),
                    sum(p.numel() for p in model.parameters()))
    random_slice_idx = ids_tensor((1,), model_result[0].shape[-1]).item()

    # check that outputs are equal
    self.assertTrue(
        torch.allclose(model_result[0][0, :, random_slice_idx],
                       tied_model_result[0][0, :, random_slice_idx],
                       atol=1e-4))

    # check that outputs after saving and loading are equal
    with tempfile.TemporaryDirectory() as tmpdirname:
        tied_model.save_pretrained(tmpdirname)
        tied_model = EncoderDecoderModel.from_pretrained(tmpdirname)
        tied_model.to(torch_device)
        tied_model.eval()

        # check that the tied model has fewer parameters
        self.assertLess(sum(p.numel() for p in tied_model.parameters()),
                        sum(p.numel() for p in model.parameters()))
        random_slice_idx = ids_tensor((1,), model_result[0].shape[-1]).item()

        tied_model_result = tied_model(
            input_ids=input_ids,
            decoder_input_ids=decoder_input_ids,
            attention_mask=attention_mask,
            decoder_attention_mask=decoder_attention_mask,
        )

        # check that outputs are equal
        self.assertTrue(
            torch.allclose(model_result[0][0, :, random_slice_idx],
                           tied_model_result[0][0, :, random_slice_idx],
                           atol=1e-4))
def main():
    parser = argparse.ArgumentParser()

    # Required parameters
    parser.add_argument(
        "--data_dir",
        default=None,
        type=str,
        required=True,
        help="The input data dir. Should contain the training files for the CoNLL-2003 NER task.",
    )
    parser.add_argument(
        "--encoder_model_type",
        default=None,
        type=str,
        required=True,
        help="Model type selected",
    )
    parser.add_argument(
        "--encoder_model_name_or_path",
        default=None,
        type=str,
        required=True,
        help="Path to pre-trained model or shortcut name",
    )
    parser.add_argument(
        "--decoder_model_type",
        default=None,
        type=str,
        required=True,
        help="Model type selected",
    )
    parser.add_argument(
        "--decoder_model_name_or_path",
        default=None,
        type=str,
        required=True,
        help="Path to pre-trained model or shortcut name",
    )
    parser.add_argument(
        "--output_dir",
        default=None,
        type=str,
        required=True,
        help="The output directory where the model predictions and checkpoints will be written.",
    )
    parser.add_argument(
        "--config_name",
        default="",
        type=str,
        help="Pretrained config name or path if not the same as model_name",
    )
    parser.add_argument(
        "--tokenizer_name",
        default="",
        type=str,
        help="Pretrained tokenizer name or path if not the same as model_name",
    )
    parser.add_argument(
        "--cache_dir",
        default="",
        type=str,
        help="Where do you want to store the pre-trained models downloaded from s3",
    )
    parser.add_argument(
        "--max_seq_length",
        default=128,
        type=int,
        help="The maximum total input sequence length after tokenization. Sequences longer "
        "than this will be truncated, sequences shorter will be padded.",
    )
    parser.add_argument("--do_train", action="store_true",
                        help="Whether to run training.")
    parser.add_argument("--do_eval", action="store_true",
                        help="Whether to run eval on the dev set.")
    parser.add_argument(
        "--do_predict",
        action="store_true",
        help="Whether to run predictions on the test set.",
    )
    parser.add_argument(
        "--evaluate_during_training",
        action="store_true",
        help="Whether to run evaluation during training at each logging step.",
    )
    parser.add_argument(
        "--do_lower_case",
        action="store_true",
        help="Set this flag if you are using an uncased model.",
    )
    parser.add_argument(
        "--keep_accents",
        action="store_const",
        const=True,
        help="Set this flag if model is trained with accents.",
    )
    parser.add_argument(
        "--strip_accents",
        action="store_const",
        const=True,
        help="Set this flag if model is trained without accents.",
    )
    parser.add_argument(
        "--use_fast",
        action="store_const",
        const=True,
        help="Set this flag to use fast tokenization.",
    )
    parser.add_argument(
        "--per_gpu_train_batch_size",
        default=8,
        type=int,
        help="Batch size per GPU/CPU for training.",
    )
    parser.add_argument(
        "--per_gpu_eval_batch_size",
        default=8,
        type=int,
        help="Batch size per GPU/CPU for evaluation.",
    )
    parser.add_argument(
        "--optimizer",
        default="lamb",
        type=str,
        help="Optimizer (AdamW or lamb)",
    )
    parser.add_argument(
        "--gradient_accumulation_steps",
        type=int,
        default=1,
        help="Number of updates steps to accumulate before performing a backward/update pass.",
    )
    parser.add_argument(
        "--learning_rate",
        default=5e-5,
        type=float,
        help="The initial learning rate for Adam.",
    )
    parser.add_argument("--weight_decay", default=0.0, type=float,
                        help="Weight decay if we apply some.")
    parser.add_argument("--adam_epsilon", default=1e-8, type=float,
                        help="Epsilon for Adam optimizer.")
    parser.add_argument("--max_grad_norm", default=1.0, type=float,
                        help="Max gradient norm.")
    parser.add_argument(
        "--num_train_epochs",
        default=3.0,
        type=float,
        help="Total number of training epochs to perform.",
    )
    parser.add_argument(
"--max_steps", default=-1, type=int, help= "If > 0: set total number of training steps to perform. Override num_train_epochs.", ) parser.add_argument("--warmup_steps", default=0, type=int, help="Linear warmup over warmup_steps.") parser.add_argument("--logging_steps", type=int, default=500, help="Log every X updates steps.") parser.add_argument( "--save_steps", type=int, default=500, help="Save checkpoint every X updates steps.", ) parser.add_argument( "--eval_all_checkpoints", action="store_true", help= "Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number", ) parser.add_argument("--no_cuda", action="store_true", help="Avoid using CUDA when available") parser.add_argument( "--overwrite_output_dir", action="store_true", help="Overwrite the content of the output directory", ) parser.add_argument( "--overwrite_cache", action="store_true", help="Overwrite the cached training and evaluation sets", ) parser.add_argument("--seed", type=int, default=42, help="random seed for initialization") parser.add_argument( "--fp16", action="store_true", help= "Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit", ) parser.add_argument( "--fp16_opt_level", type=str, default="O1", help= "For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']." "See details at https://nvidia.github.io/apex/amp.html", ) parser.add_argument( "--local_rank", type=int, default=-1, help="For distributed training: local_rank", ) parser.add_argument("--server_ip", type=str, default="", help="For distant debugging.") parser.add_argument("--server_port", type=str, default="", help="For distant debugging.") args = parser.parse_args() if (os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train and not args.overwrite_output_dir): raise ValueError( "Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome." 
            .format(args.output_dir))

    # Setup distant debugging if needed
    if args.server_ip and args.server_port:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd

        print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(args.server_ip, args.server_port),
                            redirect_output=True)
        ptvsd.wait_for_attach()

    # Setup CUDA, GPU & distributed training
    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda:0" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        args.n_gpu = 0 if args.no_cuda else torch.cuda.device_count()
    else:
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.cuda.set_device(args.local_rank)
        # `torch.device` rejects an indexed type string combined with an
        # explicit index, so pass the bare device type plus the local rank.
        device = torch.device("cuda", args.local_rank)
        torch.distributed.init_process_group(backend="nccl")
        args.n_gpu = 1
    args.device = device
    print('DEVICE : ' + str(args.device))

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN,
    )
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        args.local_rank,
        device,
        args.n_gpu,
        bool(args.local_rank != -1),
        args.fp16,
    )

    # Set seed
    set_seed(args)

    # Load pretrained model and tokenizer
    if args.local_rank not in [-1, 0]:
        # Make sure only the first process in distributed training will download model & vocab
        torch.distributed.barrier()

    args.encoder_model_type = args.encoder_model_type.lower()
    args.decoder_model_type = args.decoder_model_type.lower()

    tokenizer_args = {
        k: v
        for k, v in vars(args).items()
        if v is not None and k in TOKENIZER_ARGS
    }
    logger.info("Tokenizer arguments: %s", tokenizer_args)
    tokenizer = AutoTokenizer.from_pretrained(
        args.tokenizer_name
        if args.tokenizer_name else args.encoder_model_name_or_path,
        cache_dir=args.cache_dir if args.cache_dir else None,
        **tokenizer_args,
    )
    # ensure there's a pad token
    if tokenizer.pad_token is None:
        tokenizer.pad_token = "<PAD>"

    # Use cross entropy ignore index as padding label id so that only real label ids contribute to the loss later
    # pad_token_label_id = CrossEntropyLoss().ignore_index
    pad_token_label_id = tokenizer.pad_token_id

    if args.encoder_model_type == 'bert':
        config_encoder = BertConfig()
    elif args.encoder_model_type == 'gpt2':
        config_encoder = GPT2Config()
    elif args.encoder_model_type == 'xlnet':
        config_encoder = XLNetConfig()

    if args.decoder_model_type == 'bert':
        config_decoder = BertConfig()
    elif args.decoder_model_type == 'gpt2':
        config_decoder = GPT2Config()
    elif args.decoder_model_type == 'xlnet':
        config_decoder = XLNetConfig()

    config = EncoderDecoderConfig.from_encoder_decoder_configs(
        config_encoder, config_decoder)

    logger.info('Defining model...')
    model = EncoderDecoderModel.from_encoder_decoder_pretrained(
        args.encoder_model_name_or_path,
        args.decoder_model_name_or_path,
        config=config,
        cache_dir=args.cache_dir if args.cache_dir else None,
    )

    if args.local_rank == 0:
        # Make sure only the first process in distributed training will download model & vocab
        torch.distributed.barrier()

    model.to(args.device)

    logger.info("Training/evaluation parameters %s", args)

    # Training
    if args.do_train:
        train_dataset = load_and_cache_examples(args, tokenizer,
                                                pad_token_label_id,
                                                mode="train")
        global_step, tr_loss = train(args, train_dataset, model, tokenizer,
                                     pad_token_label_id)
        logger.info(" global_step = %s, average loss = %s", global_step,
                    tr_loss)

    # Saving best-practices: if you
    # use default names for the model, you can reload it using from_pretrained()
    if args.do_train and (args.local_rank == -1
                          or torch.distributed.get_rank() == 0):
        # Create output directory if needed
        if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]:
            os.makedirs(args.output_dir)

        logger.info("Saving model checkpoint to %s", args.output_dir)
        # Save a trained model, configuration and tokenizer using `save_pretrained()`.
        # They can then be reloaded using `from_pretrained()`
        model_to_save = (model.module if hasattr(model, "module") else model
                         )  # Take care of distributed/parallel training
        model_to_save.save_pretrained(args.output_dir)
        tokenizer.save_pretrained(args.output_dir)

        # Good practice: save your training arguments together with the trained model
        torch.save(args, os.path.join(args.output_dir, "training_args.bin"))

    # Evaluation
    results = {}
    if args.do_eval and args.local_rank in [-1, 0]:
        # since the config is prefaced with `tokenizer_`, AutoTokenizer doesn't instantiate this correctly
        # config = AutoConfig.from_pretrained(os.path.join(args.output_dir, "tokenizer_config.json"))
        # config = {"do_lower_case": False, "model_max_length": 512}
        # tokenizer = AutoTokenizer.from_pretrained(args.output_dir, config=config, **tokenizer_args)
        checkpoints = [args.output_dir]
        if args.eval_all_checkpoints:
            checkpoints = list(
                os.path.dirname(c) for c in sorted(
                    glob.glob(args.output_dir + "/**/" + WEIGHTS_NAME,
                              recursive=True)))
            logging.getLogger("pytorch_transformers.modeling_utils").setLevel(
                logging.WARN)  # Reduce logging
        logger.info("Evaluate the following checkpoints: %s", checkpoints)
        for checkpoint in checkpoints:
            global_step = checkpoint.split(
                "-")[-1] if len(checkpoints) > 1 else ""
            # model = EncoderDecoderModel.from_pretrained(
            #     os.path.join(args.output_dir, "encoder"), os.path.join(args.output_dir, "decoder"),
            # )
            model.to(args.device)
            result, _ = evaluate(
                args,
                model,
                tokenizer,
                pad_token_label_id,
                mode="dev",
                prefix=global_step,
            )
            if global_step:
                result = {
                    "{}_{}".format(global_step, k): v
                    for k, v in result.items()
                }
            results.update(result)
        output_eval_file = os.path.join(args.output_dir, "eval_results.txt")
        with open(output_eval_file, "w", encoding="utf-8") as writer:
            for key in sorted(results.keys()):
                writer.write("{} = {}\n".format(key, str(results[key])))

    if args.do_predict and args.local_rank in [-1, 0]:
        # since the config is prefaced with `tokenizer_`, AutoTokenizer doesn't instantiate this correctly
        # config = AutoConfig.from_pretrained(os.path.join(args.output_dir, "tokenizer_config.json"))
        # config = {"do_lower_case": False, "model_max_length": 512}
        # tokenizer = AutoTokenizer.from_pretrained(args.output_dir, config=config, **tokenizer_args)
        # model = EncoderDecoderModel.from_pretrained(
        #     os.path.join(args.output_dir, "encoder"), os.path.join(args.output_dir, "decoder"),
        # )
        model.to(args.device)
        result, predictions = evaluate(args, model, tokenizer,
                                       pad_token_label_id, mode="test")

        # Save results
        output_test_results_file = os.path.join(args.output_dir,
                                                "test_results.txt")
        with open(output_test_results_file, "w", encoding="utf-8") as writer:
            for key in sorted(result.keys()):
                writer.write("{} = {}\n".format(key, str(result[key])))

        # Save predictions
        output_test_predictions_file = os.path.join(args.output_dir,
                                                    "test_predictions.txt")
        with open(output_test_predictions_file, "w", encoding="utf-8") as writer:
            for example in predictions:
                output_line = ("output: " + tokenizer.decode(
                    example,
                    skip_special_tokens=True,
                    clean_up_tokenization_spaces=True,
                ) + "\n")
                writer.write(output_line)

    return results
def encoder_decoder_example():
    import torch
    from transformers import EncoderDecoderConfig, EncoderDecoderModel
    from transformers import BertConfig, GPT2Config
    from transformers import BertTokenizer, GPT2Tokenizer

    pretrained_model_name = 'bert-base-uncased'
    #pretrained_model_name = 'gpt2'

    if 'bert' in pretrained_model_name:
        # Initialize a BERT bert-base-uncased style configuration.
        config_encoder, config_decoder = BertConfig(), BertConfig()
    elif 'gpt2' in pretrained_model_name:
        config_encoder, config_decoder = GPT2Config(), GPT2Config()
    else:
        print('Invalid model, {}.'.format(pretrained_model_name))
        return

    config = EncoderDecoderConfig.from_encoder_decoder_configs(config_encoder, config_decoder)

    if 'bert' in pretrained_model_name:
        # Initialize a Bert2Bert model from the bert-base-uncased style configurations.
        model = EncoderDecoderModel(config=config)
        #model = EncoderDecoderModel.from_encoder_decoder_pretrained(pretrained_model_name, pretrained_model_name)  # Initialize Bert2Bert from pre-trained checkpoints.
        tokenizer = BertTokenizer.from_pretrained(pretrained_model_name)
    elif 'gpt2' in pretrained_model_name:
        model = EncoderDecoderModel(config=config)
        tokenizer = GPT2Tokenizer.from_pretrained(pretrained_model_name)

    #print('Configuration of the encoder & decoder:\n{}.\n{}.'.format(model.config.encoder, model.config.decoder))
    #print('Encoder type = {}, decoder type = {}.'.format(type(model.encoder), type(model.decoder)))

    if False:
        # Access the model configuration.
        config_encoder = model.config.encoder
        config_decoder = model.config.decoder
        # Set decoder config to causal LM.
        config_decoder.is_decoder = True
        config_decoder.add_cross_attention = True

    #--------------------
    input_ids = torch.tensor(tokenizer.encode('Hello, my dog is cute', add_special_tokens=True)).unsqueeze(0)  # Batch size 1.

    if False:
        # Forward.
        outputs = model(input_ids=input_ids, decoder_input_ids=input_ids)

        # Train.
        outputs = model(input_ids=input_ids, decoder_input_ids=input_ids, labels=input_ids)
        loss, logits = outputs.loss, outputs.logits

        # Save the model, including its configuration.
        model.save_pretrained('my-model')

        #--------------------
        # Load model and config from pretrained folder.
        encoder_decoder_config = EncoderDecoderConfig.from_pretrained('my-model')
        model = EncoderDecoderModel.from_pretrained('my-model', config=encoder_decoder_config)

    #--------------------
    # Generate.
    # REF [site] >>
    #   https://huggingface.co/transformers/internal/generation_utils.html
    #   https://huggingface.co/blog/how-to-generate
    generated = model.generate(input_ids, decoder_start_token_id=model.config.decoder.pad_token_id)
    #generated = model.generate(input_ids, max_length=50, num_beams=5, no_repeat_ngram_size=2, num_return_sequences=5, do_sample=True, top_k=0, temperature=0.7, early_stopping=True, decoder_start_token_id=model.config.decoder.pad_token_id)
    print('Generated = {}.'.format(tokenizer.decode(generated[0], skip_special_tokens=True)))
# Pad the encoder/decoder token-id lists to max_len, build attention masks,
# and fine-tune a Bert2Bert model on a single example. `tokenized_encoder`,
# `tokenized_decoder`, and `max_len` are assumed to be defined earlier in the
# original script.
import numpy as np
import torch
from transformers import BertConfig, EncoderDecoderConfig, EncoderDecoderModel

padded = np.array([i + [0] * (max_len - len(i)) for i in tokenized_decoder])
input_ids_ = torch.LongTensor(np.array(padded))
attention_mask_ = np.where(padded != 0, 1, 0)

padded = np.array([i + [0] * (max_len - len(i)) for i in tokenized_encoder])
input_ids = torch.LongTensor(np.array(padded))
attention_mask = np.where(padded != 0, 1, 0)

attention_mask = torch.Tensor(attention_mask)
attention_mask_ = torch.Tensor(attention_mask_)

device = torch.device("cuda:7" if torch.cuda.is_available() else "cpu")

#model = EncoderDecoderModel.from_encoder_decoder_pretrained('bert-base-multilingual-cased', 'bert-base-multilingual-cased')
config_encoder = BertConfig()
config_decoder = BertConfig()
config_encoder.max_length = 1566
config_decoder.max_length = 101
config = EncoderDecoderConfig.from_encoder_decoder_configs(config_encoder, config_decoder)
model = EncoderDecoderModel.from_encoder_decoder_pretrained(
    'bert-base-multilingual-cased', 'bert-base-multilingual-cased',
    config=config)  # initialize Bert2Bert
model.to(device)

# The original fragment used an `optimizer` that was never defined and the
# pre-4.x `lm_labels` keyword; an AdamW optimizer and the current `labels`
# keyword are assumed here.
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)

for i in range(10):
    optimizer.zero_grad()
    outputs = model(input_ids=input_ids[:1].to(device),
                    decoder_input_ids=input_ids_[:1].to(device),
                    labels=input_ids_[:1].to(device),
                    attention_mask=attention_mask[:1].to(device),
                    decoder_attention_mask=attention_mask_[:1].to(device))
    loss = outputs[0]
    print(loss.item())
    loss.backward()
    optimizer.step()
def train_model(epochs=10, num_gradients_accumulation=4, batch_size=4,
                gpu_id=0, lr=1e-5, load_dir='/content/BERT checkpoints'):
    # make sure your model is on GPU
    device = torch.device(f"cuda:{gpu_id}")

    # ------------------------LOAD MODEL-----------------
    print('load the model....')
    bert_encoder = BertConfig.from_pretrained('bert-base-uncased')
    bert_decoder = BertConfig.from_pretrained('bert-base-uncased', is_decoder=True)
    config = EncoderDecoderConfig.from_encoder_decoder_configs(
        bert_encoder, bert_decoder)
    model = EncoderDecoderModel(config)
    model = model.to(device)
    print('load success')
    # ------------------------END LOAD MODEL--------------

    # ------------------------LOAD TRAIN DATA------------------
    train_data = torch.load("/content/train_data.pth")
    train_dataset = TensorDataset(*train_data)
    train_dataloader = DataLoader(dataset=train_dataset, shuffle=True,
                                  batch_size=batch_size)
    val_data = torch.load("/content/validate_data.pth")
    val_dataset = TensorDataset(*val_data)
    val_dataloader = DataLoader(dataset=val_dataset, shuffle=True,
                                batch_size=batch_size)
    # ------------------------END LOAD TRAIN DATA--------------

    # ------------------------SET OPTIMIZER-------------------
    num_train_optimization_steps = len(
        train_dataset) * epochs // batch_size // num_gradients_accumulation
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params': [p for n, p in param_optimizer
                   if not any(nd in n for nd in no_decay)],
        'weight_decay': 0.01
    }, {
        'params': [p for n, p in param_optimizer
                   if any(nd in n for nd in no_decay)],
        'weight_decay': 0.0
    }]
    optimizer = AdamW(
        optimizer_grouped_parameters,
        lr=lr,
        weight_decay=0.01,
    )
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=num_train_optimization_steps // 10,
        num_training_steps=num_train_optimization_steps)
    # `max_grad_norm` is used below but was not defined in this snippet; a
    # typical clipping threshold is assumed here.
    max_grad_norm = 1.0

    # ------------------------START TRAINING-------------------
    update_count = 0
    start = time.time()
    print('start training....')
    for epoch in range(epochs):
        # ------------------------training------------------------
        model.train()
        losses = 0
        times = 0
        print('\n' + '-' * 20 + f'epoch {epoch}' + '-' * 20)
        for batch in tqdm(train_dataloader):
            batch = [item.to(device) for item in batch]
            encoder_input, decoder_input, mask_encoder_input, mask_decoder_input = batch
            logits = model(input_ids=encoder_input,
                           attention_mask=mask_encoder_input,
                           decoder_input_ids=decoder_input,
                           decoder_attention_mask=mask_decoder_input)

            out = logits[0][:, :-1].contiguous()
            target = decoder_input[:, 1:].contiguous()
            target_mask = mask_decoder_input[:, 1:].contiguous()
            loss = util.sequence_cross_entropy_with_logits(out, target,
                                                           target_mask,
                                                           average="token")
            loss.backward()

            losses += loss.item()
            times += 1
            update_count += 1

            if update_count % num_gradients_accumulation == num_gradients_accumulation - 1:
                torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
                optimizer.step()
                scheduler.step()
                optimizer.zero_grad()
        end = time.time()
        print(f'time: {(end - start)}')
        print(f'loss: {losses / times}')
        start = end

        # ------------------------validate------------------------
        model.eval()
        perplexity = 0
        batch_count = 0
        print('\nstart calculate the perplexity....')
        with torch.no_grad():
            for batch in tqdm(val_dataloader):
                batch = [item.to(device) for item in batch]
                encoder_input, decoder_input, mask_encoder_input, mask_decoder_input = batch
                logits = model(input_ids=encoder_input,
                               attention_mask=mask_encoder_input,
                               decoder_input_ids=decoder_input,
                               decoder_attention_mask=mask_decoder_input)

                out = logits[0][:, :-1].contiguous()
                target = decoder_input[:, 1:].contiguous()
                target_mask = mask_decoder_input[:, 1:].contiguous()
                # print(out.shape, target.shape, target_mask.shape)
                loss = util.sequence_cross_entropy_with_logits(
                    out, target, target_mask, average="token")
                perplexity += np.exp(loss.item())
                batch_count += 1

        print(f'\nvalidate perplexity: {perplexity / batch_count}')

        torch.save(
            model.state_dict(),
            os.path.join(os.path.abspath('.'), load_dir,
                         "model-" + str(epoch) + ".pth"))