def main(args):
    tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")
    tokenizer.bos_token = tokenizer.cls_token
    tokenizer.eos_token = tokenizer.sep_token

    train_texts, train_labels = read_split('train', args.data_path, args.context)
    test_texts, test_labels = read_split('test', args.data_path, args.context)
    train_texts, val_texts, train_labels, val_labels = train_test_split(
        train_texts, train_labels, test_size=.2)

    encoder_max_length = 128
    decoder_max_length = 128

    train_encodings = tokenizer(train_texts, truncation=True, padding=True,
                                max_length=encoder_max_length)
    val_encodings = tokenizer(val_texts, truncation=True, padding=True,
                              max_length=encoder_max_length)
    test_encodings = tokenizer(test_texts, truncation=True, padding=True,
                               max_length=encoder_max_length)
    train_decodings = tokenizer(train_labels, truncation=True, padding=True,
                                max_length=decoder_max_length)
    val_decodings = tokenizer(val_labels, truncation=True, padding=True,
                              max_length=decoder_max_length)
    test_decodings = tokenizer(test_labels, truncation=True, padding=True,
                               max_length=decoder_max_length)

    train_data = ReviewDataset(train_texts, train_labels, train_encodings,
                               train_decodings, tokenizer.pad_token_id)
    val_data = ReviewDataset(val_texts, val_labels, val_encodings,
                             val_decodings, tokenizer.pad_token_id)
    test_data = ReviewDataset(test_texts, test_labels, test_encodings,
                              test_decodings, tokenizer.pad_token_id)

    bert2bert = EncoderDecoderModel.from_encoder_decoder_pretrained(
        "bert-base-uncased", "bert-base-uncased")

    # set special tokens
    bert2bert.config.decoder_start_token_id = tokenizer.bos_token_id
    bert2bert.config.eos_token_id = tokenizer.eos_token_id
    bert2bert.config.pad_token_id = tokenizer.pad_token_id

    # sensible parameters for beam search
    bert2bert.config.vocab_size = bert2bert.config.decoder.vocab_size
    bert2bert.config.max_length = 142
    bert2bert.config.min_length = 56
    bert2bert.config.no_repeat_ngram_size = 3
    bert2bert.config.early_stopping = True
    bert2bert.config.length_penalty = 2.0
    bert2bert.config.num_beams = 4

    # set training arguments - these params are not really tuned, feel free to change
    batch_size = 10  # change to 16 for full training
    training_args = Seq2SeqTrainingArguments(
        output_dir="./cptk_yelp",
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        predict_with_generate=True,
        evaluate_during_training=True,
        do_train=True,
        do_eval=True,
        logging_steps=1000,  # set to 1000 for full training
        save_steps=800,  # set to 500 for full training
        eval_steps=800,  # set to 8000 for full training
        warmup_steps=2000,  # set to 2000 for full training
        overwrite_output_dir=True,
        save_total_limit=10,
        fp16=True,
        num_train_epochs=1000,
    )

    # instantiate trainer
    trainer = Seq2SeqTrainer(
        model=bert2bert,
        args=training_args,
        compute_metrics=build_compute_metrics_fn,
        train_dataset=train_data,
        eval_dataset=val_data,
    )
    trainer.train()
    add_cross_attention=True,  # add cross attention layers
    vocab_size=len(decoder_tokenizer),
    # Set required tokens.
    unk_token_id=decoder_tokenizer.vocab["[UNK]"],
    sep_token_id=decoder_tokenizer.vocab["[SEP]"],
    pad_token_id=decoder_tokenizer.vocab["[PAD]"],
    cls_token_id=decoder_tokenizer.vocab["[CLS]"],
    mask_token_id=decoder_tokenizer.vocab["[MASK]"],
    #bos_token_id = decoder_tokenizer.vocab["[BOS]"],
    #eos_token_id = decoder_tokenizer.vocab["[EOS]"],
)

# Initialize a brand new bert-based decoder.
decoder = BertGenerationDecoder(config=decoder_config)

# Setup enc-decoder mode.
bert2bert = EncoderDecoderModel(encoder=encoder, decoder=decoder)
bert2bert.config.decoder_start_token_id = decoder_tokenizer.vocab["[CLS]"]
bert2bert.config.pad_token_id = decoder_tokenizer.vocab["[PAD]"]

# Elementary Training.
optimizer = torch.optim.Adam(bert2bert.parameters(), lr=0.000001)
bert2bert.cuda()

for epoch in range(30):
    print("*" * 50, "Epoch", epoch, "*" * 50)
    for batch in tqdm(sierra_dl):
        # tokenize commands and goals.
        inputs = encoder_tokenizer(batch["command"],
                                   add_special_tokens=True,
                                   return_tensors="pt",
                                   padding=True,
def main():
    parser = argparse.ArgumentParser()

    # Required parameters
    parser.add_argument(
        "--data_dir",
        default=None,
        type=str,
        required=True,
        help="The input data dir. Should contain the training files for the CoNLL-2003 NER task.",
    )
    parser.add_argument(
        "--config_name",
        default="",
        type=str,
        help="Pretrained config name or path if not the same as model_name",
    )
    parser.add_argument(
        "--tokenizer_name",
        default="",
        type=str,
        help="Pretrained tokenizer name or path if not the same as model_name",
    )
    parser.add_argument(
        "--cache_dir",
        default="",
        type=str,
        help="Where do you want to store the pre-trained models downloaded from s3",
    )
    parser.add_argument(
        "--max_seq_length",
        default=128,
        type=int,
        help="The maximum total input sequence length after tokenization. Sequences longer "
        "than this will be truncated, sequences shorter will be padded.",
    )
    parser.add_argument("--do_train", action="store_true", help="Whether to run training.")
    parser.add_argument("--do_eval", action="store_true", help="Whether to run eval on the dev set.")
    parser.add_argument(
        "--do_predict",
        action="store_true",
        help="Whether to run predictions on the test set.",
    )
    parser.add_argument(
        "--evaluate_during_training",
        action="store_true",
        help="Whether to run evaluation during training at each logging step.",
    )
    parser.add_argument(
        "--do_lower_case",
        action="store_true",
        help="Set this flag if you are using an uncased model.",
    )
    parser.add_argument(
        "--keep_accents",
        action="store_const",
        const=True,
        help="Set this flag if model is trained with accents.",
    )
    parser.add_argument(
        "--strip_accents",
        action="store_const",
        const=True,
        help="Set this flag if model is trained without accents.",
    )
    parser.add_argument(
        "--use_fast",
        action="store_const",
        const=True,
        help="Set this flag to use fast tokenization.",
    )
    parser.add_argument(
        "--train_batch_size",
        default=8,
        type=int,
        help="Batch size per GPU/CPU for training.",
    )
    parser.add_argument(
        "--per_gpu_eval_batch_size",
        default=8,
        type=int,
        help="Batch size per GPU/CPU for evaluation.",
    )
    parser.add_argument(
        "--optimizer",
        default="lamb",
        type=str,
        help="Optimizer (AdamW or lamb)",
    )
    parser.add_argument(
        "--gradient_accumulation_steps",
        type=int,
        default=1,
        help="Number of updates steps to accumulate before performing a backward/update pass.",
    )
    parser.add_argument(
        "--learning_rate",
        default=5e-5,
        type=float,
        help="The initial learning rate for Adam.",
    )
    parser.add_argument("--weight_decay", default=0.0, type=float, help="Weight decay if we apply some.")
    parser.add_argument("--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer.")
    parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.")
    parser.add_argument(
        "--num_train_epochs",
        default=3.0,
        type=float,
        help="Total number of training epochs to perform.",
    )
    parser.add_argument(
        "--max_steps",
        default=-1,
        type=int,
        help="If > 0: set total number of training steps to perform. Override num_train_epochs.",
    )
    parser.add_argument("--warmup_steps", default=0, type=int, help="Linear warmup over warmup_steps.")
    parser.add_argument("--logging_steps", type=int, default=500, help="Log every X updates steps.")
    parser.add_argument(
        "--save_steps",
        type=int,
        default=500,
        help="Save checkpoint every X updates steps.",
    )
    parser.add_argument(
        "--eval_all_checkpoints",
        action="store_true",
        help="Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number",
    )
    parser.add_argument("--no_cuda", action="store_true", help="Avoid using CUDA when available")
    parser.add_argument(
        "--overwrite_output_dir",
        action="store_true",
        help="Overwrite the content of the output directory",
    )
    parser.add_argument(
        "--overwrite_cache",
        action="store_true",
        help="Overwrite the cached training and evaluation sets",
    )
    parser.add_argument("--seed", type=int, default=42, help="random seed for initialization")
    parser.add_argument(
        "--fp16",
        action="store_true",
        help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit",
    )
    parser.add_argument(
        "--fp16_opt_level",
        type=str,
        default="O1",
        help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
        "See details at https://nvidia.github.io/apex/amp.html",
    )
    parser.add_argument(
        "--local_rank",
        type=int,
        default=-1,
        help="For distributed training: local_rank",
    )
    parser.add_argument("--server_ip", type=str, default="", help="For distant debugging.")
    parser.add_argument("--server_port", type=str, default="", help="For distant debugging.")
    args = parser.parse_args()

    # New example based on https://colab.research.google.com/drive/1uVP09ynQ1QUmSE2sjEysHjMfKgo4ssb7?usp=sharing
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print('DEVICE: ' + str(device))

    tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
    model = EncoderDecoderModel.from_encoder_decoder_pretrained(
        'bert-base-cased', 'bert-base-cased')
    model.to(device)

    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'gamma', 'beta']
    optimizer_grouped_parameters = [{
        'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay_rate': 0.01
    }, {
        'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay_rate': 0.0
    }]
    optimizer = AdamW(optimizer_grouped_parameters, lr=2e-5)

    model.train()
    train_loss_set = []
    train_loss = 0
    save_step = 500

    for epoch in range(int(args.num_train_epochs)):
        batches = tqdm(batch_loader(tokenizer,
                                    args.data_dir,
                                    step='train',
                                    batch_size=args.train_batch_size,
                                    start_pad=False),
                       desc='Training')
        for step, batch in enumerate(batches):
            batch = tuple(t.to(device) for t in batch)
            input_ids_encode, attention_mask_encode, input_ids_decode, attention_mask_decode, lm_labels = batch

            optimizer.zero_grad()
            model.zero_grad()
            loss, outputs = model(input_ids=input_ids_encode,
                                  decoder_input_ids=input_ids_decode,
                                  attention_mask=attention_mask_encode,
                                  decoder_attention_mask=attention_mask_decode,
                                  lm_labels=lm_labels)[:2]
            train_loss_set.append(loss.item())
            loss.backward()
            optimizer.step()
            train_loss += loss.item()

        print(epoch)
        clear_output(True)
        plt.plot(train_loss_set)
        plt.title(f'Training loss. Epoch {epoch}')
        plt.xlabel(f'Batch {step}')
        plt.ylabel('Loss')
        plt.show()

    print('STARTING EVALUATION')
    model.eval()
    test_batches = tqdm(batch_loader(tokenizer,
                                     args.data_dir,
                                     step='test',
                                     batch_size=1,
                                     start_pad=True),
                        desc='Evaluating')
    for step, batch in enumerate(test_batches):
        batch = tuple(t.to(device) for t in batch)
        input_ids_encode, attention_mask_encode, input_ids_decode, attention_mask_decode, lm_labels = batch
        with torch.no_grad():
            generated = model.generate(
                input_ids_encode,
                attention_mask=attention_mask_encode,
                decoder_start_token_id=model.config.decoder.pad_token_id,
                do_sample=True,
                max_length=10,
                top_k=200,
                top_p=0.75,
                num_return_sequences=10,
                #num_beams=5,
                #no_repeat_ngram_size=2,
            )
        for i in range(len(generated)):
            print(
                f'Generated {i}: {tokenizer.decode(generated[i], skip_special_tokens=True, clean_up_tokenization_spaces=True)}'
            )
        print(
            'Expected: ',
            ' '.join([
                tokenizer.decode(elem, skip_special_tokens=True, clean_up_tokenization_spaces=True)
                for elem in input_ids_decode
            ]))
        print(
            'Lm Labels: ',
            ' '.join([
                tokenizer.decode(elem, skip_special_tokens=True, clean_up_tokenization_spaces=True)
                for elem in lm_labels
            ]))
        print(
            'Input: ',
            ' '.join([
                tokenizer.decode(elem, skip_special_tokens=True, clean_up_tokenization_spaces=True)
                for elem in input_ids_encode
            ]))
        print()
def create_and_check_encoder_decoder_shared_weights(
    self,
    config,
    input_ids,
    attention_mask,
    encoder_hidden_states,
    decoder_config,
    decoder_input_ids,
    decoder_attention_mask,
    labels,
    **kwargs
):
    torch.manual_seed(0)
    encoder_model, decoder_model = self.get_encoder_decoder_model(config, decoder_config)
    model = EncoderDecoderModel(encoder=encoder_model, decoder=decoder_model)
    model.to(torch_device)
    model.eval()

    # load_state_dict copies weights but does not tie them
    decoder_state_dict = model.decoder._modules[model.decoder.base_model_prefix].state_dict()
    model.encoder.load_state_dict(decoder_state_dict, strict=False)

    torch.manual_seed(0)
    tied_encoder_model, tied_decoder_model = self.get_encoder_decoder_model(config, decoder_config)
    config = EncoderDecoderConfig.from_encoder_decoder_configs(
        tied_encoder_model.config, tied_decoder_model.config, tie_encoder_decoder=True
    )
    tied_model = EncoderDecoderModel(encoder=tied_encoder_model, decoder=tied_decoder_model, config=config)
    tied_model.to(torch_device)
    tied_model.eval()

    model_result = model(
        input_ids=input_ids,
        decoder_input_ids=decoder_input_ids,
        attention_mask=attention_mask,
        decoder_attention_mask=decoder_attention_mask,
    )

    tied_model_result = tied_model(
        input_ids=input_ids,
        decoder_input_ids=decoder_input_ids,
        attention_mask=attention_mask,
        decoder_attention_mask=decoder_attention_mask,
    )

    # check that the tied model has fewer parameters
    self.assertLess(sum(p.numel() for p in tied_model.parameters()), sum(p.numel() for p in model.parameters()))
    random_slice_idx = ids_tensor((1,), model_result[0].shape[-1]).item()

    # check that outputs are equal
    self.assertTrue(
        torch.allclose(
            model_result[0][0, :, random_slice_idx], tied_model_result[0][0, :, random_slice_idx], atol=1e-4
        )
    )

    # check that outputs after saving and loading are equal
    with tempfile.TemporaryDirectory() as tmpdirname:
        tied_model.save_pretrained(tmpdirname)
        tied_model = EncoderDecoderModel.from_pretrained(tmpdirname)
        tied_model.to(torch_device)
        tied_model.eval()

        # check that the tied model has fewer parameters
        self.assertLess(
            sum(p.numel() for p in tied_model.parameters()), sum(p.numel() for p in model.parameters())
        )
        random_slice_idx = ids_tensor((1,), model_result[0].shape[-1]).item()

        tied_model_result = tied_model(
            input_ids=input_ids,
            decoder_input_ids=decoder_input_ids,
            attention_mask=attention_mask,
            decoder_attention_mask=decoder_attention_mask,
        )

        # check that outputs are equal
        self.assertTrue(
            torch.allclose(
                model_result[0][0, :, random_slice_idx], tied_model_result[0][0, :, random_slice_idx], atol=1e-4
            )
        )
def get_pretrained_model(self):
    return EncoderDecoderModel.from_encoder_decoder_pretrained("bert-base-cased", "gpt2")
def test_real_bert_model_from_pretrained(self):
    model = EncoderDecoderModel.from_encoder_decoder_pretrained(
        "bert-base-uncased", "bert-base-uncased")
    self.assertIsNotNone(model)
def __init__(self, model_name, device):
    self.tokenizer = BertTokenizerFast.from_pretrained(model_name)
    self.model = EncoderDecoderModel.from_pretrained(model_name)
    self.model = self.model.to(device)
if args.checkpoint != None:
    model_created = True
    if args.bart:
        config = BartConfig.from_json_file(args.checkpoint + "/config.json")
        model = BartForConditionalGeneration.from_pretrained(
            args.checkpoint + "/pytorch_model.bin", config=config)
    if args.t5:
        config = T5Config.from_json_file(args.checkpoint + "/config.json")
        model = T5ForConditionalGeneration.from_pretrained(
            args.checkpoint + "/pytorch_model.bin", config=config)
    elif not args.bart and not args.t5:
        config = EncoderDecoderConfig.from_json_file(args.checkpoint + "/config.json")
        model = EncoderDecoderModel.from_pretrained(args.checkpoint + "/pytorch_model.bin",
                                                    config=config)
    model_name = args.checkpoint

if args.bart:
    if args.checkpoint == None:
        model_name = "WikinewsSum/bart-large-multi-fr-wiki-news" if args.model_name == "" else args.model_name
    tokenizer = BartTokenizer.from_pretrained(
        args.tokenizer) if args.tokenizer != None else BartTokenizer.from_pretrained(model_name)
    if not model_created:
        model = BartForConditionalGeneration.from_pretrained(model_name)
        model_created = True

if args.t5:
    return batch


def format_rouge_output(rouge_output):
    return {
        "rouge1_precision": round(rouge_output["rouge1"].mid.precision, 4),
        "rouge1_recall": round(rouge_output["rouge1"].mid.recall, 4),
        "rouge1_fmeasure": round(rouge_output["rouge1"].mid.fmeasure, 4),
        "rouge2_precision": round(rouge_output["rouge2"].mid.precision, 4),
        "rouge2_recall": round(rouge_output["rouge2"].mid.recall, 4),
        "rouge2_fmeasure": round(rouge_output["rouge2"].mid.fmeasure, 4),
        "rougeL_precision": round(rouge_output["rougeL"].mid.precision, 4),
        "rougeL_recall": round(rouge_output["rougeL"].mid.recall, 4),
        "rougeL_fmeasure": round(rouge_output["rougeL"].mid.fmeasure, 4)
    }


model = EncoderDecoderModel.from_encoder_decoder_pretrained(
    "bert-base-uncased", "bert-base-uncased")
tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")
tokenizer.bos_token = tokenizer.cls_token
tokenizer.eos_token = tokenizer.sep_token

wiki_train_dataset = load_dataset('wikihow', 'all',
                                  data_dir='manual_wikihow_data',
                                  split='train')
wiki_val_dataset = load_dataset('wikihow', 'all',
                                data_dir='manual_wikihow_data',
                                split='validation')
rouge = load_metric('rouge')
    return [json.loads(l) for l in istream]


if __name__ == "__main__":
    args = parse_args()
    token_to_index = TokenToIndexConverter(
        "vocab/github_python_minus_ethpy150open_deduplicated_vocabulary.txt")
    set_global_seed(19)

    DATA_FOLDER = Path("data")
    train = read_jsonl(DATA_FOLDER / "train_preprocessed.jsonl")
    test = read_jsonl(DATA_FOLDER / "test_preprocessed.jsonl")

    model = EncoderDecoderModel.from_pretrained(args.model)

    train_dataset = get_method_name_dataset(
        train, token_to_index, token_to_index.pad_index,
        model.encoder.config.max_position_embeddings)
    test_dataset = get_method_name_dataset(
        test, token_to_index, token_to_index.pad_index,
        model.encoder.config.max_position_embeddings)

    DEVICE = torch.device(args.device)
    model.to(DEVICE).eval()

    metrics = []
    with torch.no_grad():
        for i in tqdm(range(len(test_dataset))):
import streamlit as st
from ..components.fetch import *
from .translation import *
from transformers import BertTokenizerFast, EncoderDecoderModel
import torch

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Load the mini2bert model fine-tuned on cnn_daily_mail
tokenizer = BertTokenizerFast.from_pretrained('mrm8488/bert-mini2bert-mini-finetuned-cnn_daily_mail-summarization')
model = EncoderDecoderModel.from_pretrained('mrm8488/bert-mini2bert-mini-finetuned-cnn_daily_mail-summarization').to(device)


# Inference for the summarization model, English only for now.
def get_answer(text):
    inputs = tokenizer([text], padding="max_length", truncation=True, max_length=512, return_tensors="pt")
    input_ids = inputs.input_ids.to(device)
    attention_mask = inputs.attention_mask.to(device)
    output = model.generate(input_ids, attention_mask=attention_mask)
    return tokenizer.decode(output[0], skip_special_tokens=True)


def main():
    front_up()
    st.title('Sistema de Sumarizacion de texto')
    context = st.text_area(label="Ingrese el texto a resumir", height=320)
    if st.button("Cargar modelo"):
        context2 = get_answer_es_en(context)
        answer_summ = get_answer(context2)
        answer_summ2 = get_answer_en_es(answer_summ)
        st.text("El resumen del texto es:")
        st.write(answer_summ2)
    if perplexity:
        # eval mode
        mean_nll = loss_sum / count_eles
        ppl = math.exp(mean_nll)
        print("Perplexity: ", datatype, ppl)
    else:
        # training going on
        print("Mean loss", datatype, (epoch_loss / len(dataloader)))


if globalparams["do_train"]:
    # load model from pretrained/scratch and train it/save it in the provided dir.
    print("TRAIN MODE: ")
    if globalparams["pretrained"]:
        # load pretrained encoder and pretrained decoder.
        model = EncoderDecoderModel.from_encoder_decoder_pretrained(
            globalparams['pretrained_path'], globalparams['pretrained_path'])
        print("pretrained model loaded.", globalparams["pretrained_path"])
    else:
        pass

    model.to(device)
    print(f'The model has {count_parameters(model):,} trainable parameters')

    optimizer = optim.Adam(model.parameters(), lr=modelparams['lr'])
    criterion = nn.NLLLoss(ignore_index=de_tokenizer.pad_token_id)

    num_train_batches = len(train_dataloader)
    num_valid_batches = len(valid_dataloader)
    print("num batches: ", num_train_batches, num_valid_batches)
class PhonetizerModel:
    phon_tokenizer = {
        'e': 7, 'i': 8, 'R': 9, 'a': 10, 'o': 11, 't': 12, 's': 13, 'l': 14,
        'k': 15, 'p': 16, 'm': 17, 'n': 18, 'd': 19, 'y': 20, '@': 21, 'f': 22,
        'z': 23, 'b': 24, '§': 25, 'v': 26, '2': 27, '1': 28, 'Z': 29, 'g': 30,
        'u': 31, 'S': 32
    }
    phon_untokenizer = {v: k for k, v in phon_tokenizer.items()}
    char_tokenizer = {
        'e': 7, 'i': 8, 'a': 9, 'r': 10, 'o': 11, 's': 12, 't': 13, 'n': 14,
        'l': 15, 'é': 16, 'c': 17, 'p': 18, 'u': 19, 'm': 20, 'd': 21, '-': 22,
        'h': 23, 'g': 24, 'b': 25, 'v': 26, 'f': 27, 'k': 28, 'y': 29, 'x': 30,
        'è': 31, 'ï': 32, 'j': 33, 'z': 34, 'w': 35, 'q': 36
    }

    def __init__(self, device='cpu', model=None):
        vocabsize = 37
        max_length = 50
        encoder_config = BertConfig(vocab_size=vocabsize,
                                    max_position_embeddings=max_length + 64,
                                    num_attention_heads=4,
                                    num_hidden_layers=4,
                                    hidden_size=128,
                                    type_vocab_size=1)
        encoder = BertModel(config=encoder_config)

        vocabsize = 33
        max_length = 50
        decoder_config = BertConfig(vocab_size=vocabsize,
                                    max_position_embeddings=max_length + 64,
                                    num_attention_heads=4,
                                    num_hidden_layers=4,
                                    hidden_size=128,
                                    type_vocab_size=1,
                                    is_decoder=True)
        decoder_config.add_cross_attention = True
        decoder = BertLMHeadModel(config=decoder_config)

        # Define encoder decoder model
        self.model = EncoderDecoderModel(encoder=encoder, decoder=decoder)
        self.model.to(device)
        self.device = device
        if model is not None:
            self.model.load_state_dict(torch.load(model))

    def phonetize(self, word):
        word = word.replace('à', 'a')
        word = word.replace('û', 'u')
        word = word.replace('ù', 'u')
        word = word.replace('î', 'i')
        word = word.replace('ç', 'ss')
        word = word.replace('ô', 'o')
        word = word.replace('â', 'a')
        word = word.replace('qu', 'k')
        word = word.replace('ê', 'e')
        assert set(word).issubset(set(PhonetizerModel.char_tokenizer.keys()))
        encoded = torch.tensor(
            [0] + [PhonetizerModel.char_tokenizer[p] for p in word] + [2])
        output = self.model.generate(
            encoded.unsqueeze(0).to(self.device),
            max_length=50,
            decoder_start_token_id=0,
            eos_token_id=2,
            pad_token_id=1,
        ).detach().cpu().numpy()[0]
        bound = np.where(output == 2)[0][0] if 2 in output else 1000
        phon_pred = ''.join([
            PhonetizerModel.phon_untokenizer[c] for c in output[:bound] if c > 6
        ])
        return phon_pred

    def check_phonetization_error(self, word, phon):
        prediction = self.phonetize(word)[:5]
        score = pairwise2.align.globalms(list(phon[:5]),
                                         list(prediction),
                                         2, -1, -1, -.5,
                                         score_only=True,
                                         gap_char=['-']) / len(phon[:5])
        return score
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--device', default='0,1,2,3', type=str, required=False,
                        help='which GPUs to use')
    parser.add_argument('--raw_data_path', default='data/train.txt', type=str, required=False,
                        help='path to the raw training corpus')
    parser.add_argument('--output_dir', default='model/', type=str, required=False,
                        help='model output directory')
    parser.add_argument('--batch_size', default=1, type=int, required=False,
                        help='training batch size')
    parser.add_argument('--lr', default=1.5e-4, type=float, required=False,
                        help='learning rate')
    parser.add_argument('--epochs', default=5, type=int, required=False,
                        help='number of training epochs')
    args = parser.parse_args()
    print('args:\n' + args.__repr__())

    raw_data_path = args.raw_data_path
    output_dir = args.output_dir
    batch_size = args.batch_size
    lr = args.lr
    epochs = args.epochs

    # device
    os.environ["CUDA_VISIBLE_DEVICES"] = args.device  # select which GPUs the program uses
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    print('using device:', device)

    # model
    model = EncoderDecoderModel.from_encoder_decoder_pretrained(
        "bert-base-multilingual-cased", "bert-base-multilingual-cased")
    model.to(device)

    # dataset
    tokenizer = BertTokenizerFast.from_pretrained("bert-base-multilingual-cased")
    dataset = TextDataset(tokenizer, './dataset/train.jsonl')

    # print the number of parameters
    num_parameters = 0
    parameters = model.parameters()
    for parameter in parameters:
        num_parameters += parameter.numel()
    print('number of parameters: {}'.format(num_parameters))

    # dataloader
    def pad_collate_fn(batch):
        batch_size = len(batch)
        # find longest sequence
        source_max_len = max(map(lambda x: x['source'].shape[0], batch))
        target_max_len = max(map(lambda x: x['target'].shape[0], batch))
        # pad according to max_len
        ret = {
            'source': torch.full((batch_size, source_max_len), tokenizer.pad_token_id, dtype=torch.long),
            'target': torch.full((batch_size, target_max_len), tokenizer.pad_token_id, dtype=torch.long)
        }
        for i, sample in enumerate(batch):
            sample_source = sample['source']
            sample_target = sample['target']
            ret['source'][i, :sample_source.numel()] = sample_source
            ret['target'][i, :sample_target.numel()] = sample_target
        return ret

    dataloader = torch.utils.data.DataLoader(dataset,
                                             batch_size=batch_size,
                                             shuffle=True,
                                             collate_fn=pad_collate_fn)

    # optimizer
    optimizer = torch.optim.AdamW(model.parameters(), lr=lr, betas=(0.9, 0.999), eps=1e-8)

    print('start training')
    for epoch in range(epochs):
        with tqdm(total=len(dataloader), ascii=True) as t:
            for i, sample in enumerate(dataloader):
                optimizer.zero_grad()
                input_ids = sample['source'].to(device)
                decoder_input_ids = sample['target'].to(device)
                loss, *_ = model(input_ids=input_ids,
                                 decoder_input_ids=decoder_input_ids,
                                 labels=decoder_input_ids)
                # backward
                loss.backward()
                optimizer.step()
                t.set_postfix({'loss': loss.item()})
                t.update(1)

        # save model
        if not os.path.exists(output_dir):
            os.mkdir(output_dir)
        output_epoch_dir = os.path.join(output_dir, f'epoch_{str(epoch)}')
        if not os.path.exists(output_epoch_dir):
            os.mkdir(output_epoch_dir)
        torch.save(model.state_dict(), os.path.join(output_epoch_dir, 'model.pth'))
# SPDX-License-Identifier: Apache-2.0
# based on:
# https://huggingface.co/docs/transformers/model_doc/bertgeneration
from transformers import AutoTokenizer, EncoderDecoderModel

# instantiate sentence fusion model
model = EncoderDecoderModel.from_pretrained(
    "google/roberta2roberta_L-24_discofuse")
tokenizer = AutoTokenizer.from_pretrained(
    "google/roberta2roberta_L-24_discofuse")

input_ids = tokenizer(
    'This is the first sentence. This is the second sentence.',
    add_special_tokens=False,
    return_tensors="pt").input_ids

greedy_output = model.generate(input_ids)

print(f"Output ({greedy_output.shape}): {greedy_output}")
print(
    f"Detokenized: `{tokenizer.decode(greedy_output[0], skip_special_tokens=False)}`"
)
class TracedEncoderDecoder(BaseModel):
    def __init__(self, config):
        super().__init__(config)
        self.build()

    @classmethod
    def config_path(cls):
        return "configs/models/ted/defaults.yaml"

    def build(self):
        # to be further set
        # breakpoint()
        self.image_feature_module = build_image_encoder(
            self.config.image_feature_processor, direct_features=True)
        if self.config.concate_trace:
            self.trace_feature_module = build_encoder(
                self.config.trace_feature_encoder)

        if self.config.base_model_name == "bert-base-uncased":
            self.encoderdecoder = EncoderDecoderModel.from_encoder_decoder_pretrained(
                "bert-base-uncased", "bert-base-uncased")
        elif self.config.base_model_name == "2layer-base":
            config_encoder = BertConfig()
            config_decoder = BertConfig()
            config_encoder.num_hidden_layers = 2
            config_decoder.num_hidden_layers = 2
            self.codec_config = EncoderDecoderConfig.from_encoder_decoder_configs(
                config_encoder, config_decoder)
            self.encoderdecoder = EncoderDecoderModel(config=self.codec_config)
        elif self.config.base_model_name == "3layer-base":
            config_encoder = BertConfig()
            config_decoder = BertConfig()
            config_encoder.num_hidden_layers = 3
            config_decoder.num_hidden_layers = 3
            self.codec_config = EncoderDecoderConfig.from_encoder_decoder_configs(
                config_encoder, config_decoder)
            self.encoderdecoder = EncoderDecoderModel(config=self.codec_config)

        if self.config.loop_contrastive:
            self.trace_caption_contrastive = TraceCaptionContrastiveModel(
                self.config.tc_contrastive_aggregate_method)
        if (hasattr(self.config, "pretrans_attention")
                and self.config.pretrans_attention):
            # import ipdb; ipdb.set_trace()
            tempconf = self.encoderdecoder.config.encoder
            num_heads = tempconf.num_attention_heads
            num_layers = tempconf.num_hidden_layers
            self.attention_trans = AttentionTransform(num_layers, num_heads, 100)
        self.BOS_ID = 101

    def forward(self, sample_list, *args, **kwargs):
        # breakpoint()
        decoder_input_ids = sample_list["input_ids"][:, :-1]
        # using default mask
        # target_mask = sample_list["input_mask"]
        # segment_ids = sample_list["segment_ids"]
        # token_attends = sample_list["token_attends"]
        other_kwargs = {}
        if self.config.image_feature_processor.type == "spatial":
            bbox_feature = sample_list["image_feature_0"]
            spatial_feature = sample_list["image_info_0"]["bbox"]
            inputs_embeds = self.image_feature_module(bbox_feature, spatial_feature)
        else:
            bbox_feature = sample_list["image_feature_0"]
            inputs_embeds = self.image_feature_module(bbox_feature)
        if hasattr(self.config, "no_vision") and self.config.no_vision:
            inputs_embeds = inputs_embeds * 0
        batch_size = inputs_embeds.shape[0]

        if self.config.concate_trace:
            trace_boxes = sample_list["trace_boxes"]
            trace_boxes_mask = sample_list["trace_boxes_mask"]
            trace_feature = self.trace_feature_module(trace_boxes)
            trace_seg_id = sample_list["trace_boxes_seg_id"]
            inputs_embeds = torch.cat((inputs_embeds, trace_feature), dim=1)
            image_feats_mask = trace_boxes_mask.new_ones((batch_size, 100))
            image_feats_seg_id = trace_seg_id.new_zeros((batch_size, 100))
            attention_mask = torch.cat((image_feats_mask, trace_boxes_mask), dim=1)
            token_type_ids = torch.cat((image_feats_seg_id, trace_seg_id), dim=1)
            position_ids = trace_seg_id.new_zeros(
                (batch_size, attention_mask.shape[1]))
            other_kwargs.update({
                "attention_mask": attention_mask,
                "token_type_ids": token_type_ids,
                "position_ids": position_ids,
            })

        if self.training:
            decoder_output = self.encoderdecoder(
                decoder_input_ids=decoder_input_ids,
                inputs_embeds=inputs_embeds,
                output_attentions=True,
                output_hidden_states=True,
                return_dict=True,
                **other_kwargs)

            logits = decoder_output["logits"]
            cross_attentions = []
            # import ipdb; ipdb.set_trace()
            for cross_attention in decoder_output["cross_attentions"]:
                if self.config.concate_trace:
                    cross_attention = cross_attention[:, :, :, :100]
                # cross_attentions.append(cross_attention.mean(dim=1))
                cross_attentions.append(cross_attention)
            # breakpoint()
            if (hasattr(self.config, "pretrans_attention")
                    and self.config.pretrans_attention):
                cross_attentions = self.attention_trans(cross_attentions)
            else:
                cross_attentions = [crs.mean(dim=1) for crs in cross_attentions]

            model_output = {}
            model_output["captions"] = torch.max(logits, dim=-1)[1]
            model_output["scores"] = logits
            model_output["cross_attentions"] = cross_attentions
            sample_list["targets"] = sample_list["input_ids"][:, 1:]
            if self.config.loop_contrastive:
                cap_feat, vision_trace_feat = self.trace_caption_contrastive(
                    decoder_output["encoder_hidden_states"][-1],
                    sample_list["trace_boxes_loop_contrastive_seg_id"],
                    decoder_output["decoder_hidden_states"][-1],
                    sample_list["segment_ids"],
                )
                model_output["contrastive_a"] = cap_feat
                model_output["contrastive_b"] = vision_trace_feat
        else:
            if self.config.inference.type == "beam_search":
                generate_output = self.encoderdecoder.generate(
                    input_ids=None,
                    input_embeds=inputs_embeds,
                    bos_token_id=self.BOS_ID,
                    decoder_start_token_id=self.BOS_ID,
                    **self.config.inference.args,
                    **other_kwargs)
            elif self.config.inference.type == "greedy":
                generate_output = self.encoderdecoder.generate(
                    input_ids=None,
                    input_embeds=inputs_embeds,
                    max_length=self.config.max_gen_length,
                    bos_token_id=self.BOS_ID,
                    decoder_start_token_id=self.BOS_ID,
                    **other_kwargs)
            elif self.config.inference.type == "nucleus_sampling":
                generate_output = self.encoderdecoder.generate(
                    input_ids=None,
                    input_embeds=inputs_embeds,
                    bos_token_id=self.BOS_ID,
                    decoder_start_token_id=self.BOS_ID,
                    **self.config.inference.args,
                    **other_kwargs)

            model_output = {}
            # breakpoint()
            if ("return_attention" in self.config.inference
                    and self.config.inference.return_attention):
                with torch.no_grad():
                    attention_temp_output = self.encoderdecoder(
                        decoder_input_ids=generate_output,
                        inputs_embeds=inputs_embeds,
                        output_attentions=True,
                        return_dict=True,
                    )
                    cross_attentions = []
                    for cross_attention in attention_temp_output["cross_attentions"]:
                        if self.config.concate_trace:
                            cross_attention = cross_attention[:, :, :, :100]
                        cross_attentions.append(cross_attention.mean(dim=1))
                    # breakpoint()
                    cross_attentions = (
                        torch.stack(cross_attentions).max(dim=0)[0].max(dim=-1)[1])
                model_output["cross_attention"] = cross_attentions
            # breakpoint()
            model_output["captions"] = generate_output
            model_output["losses"] = {}
            loss_key = "{}/{}".format(sample_list.dataset_name,
                                      sample_list.dataset_type)
            # Add a dummy loss so that loss calculation is not required
            model_output["losses"][loss_key + "/dummy_loss"] = torch.zeros(
                batch_size, device=sample_list.image_feature_0.device)
        # breakpoint()
        return model_output
def train_model(epochs=10,
                num_gradients_accumulation=4,
                batch_size=4,
                gpu_id=0,
                lr=1e-5,
                load_dir='/content/GPT CheckPoints/'):
    # make sure your model is on GPU
    device = torch.device(f"cuda:{gpu_id}")

    # ------------------------LOAD MODEL-----------------
    print('load the model....')
    model = EncoderDecoderModel.from_encoder_decoder_pretrained(
        "gpt2", "gpt2", use_cache=False)
    model.load_state_dict(
        torch.load("/content/EnglishGPT/decoder_model/model2.pth",
                   map_location='cuda'))
    model = model.to(device)
    print('load success')
    # ------------------------END LOAD MODEL--------------

    # ------------------------LOAD TRAIN DATA------------------
    train_data = torch.load("/content/train_data.pth")
    train_dataset = TensorDataset(*train_data)
    train_dataloader = DataLoader(dataset=train_dataset,
                                  shuffle=True,
                                  batch_size=batch_size)
    val_data = torch.load("/content/validate_data.pth")
    val_dataset = TensorDataset(*val_data)
    val_dataloader = DataLoader(dataset=val_dataset,
                                shuffle=True,
                                batch_size=batch_size)
    # ------------------------END LOAD TRAIN DATA--------------

    # ------------------------SET OPTIMIZER-------------------
    num_train_optimization_steps = len(
        train_dataset) * epochs // batch_size // num_gradients_accumulation
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay': 0.01
    }, {
        'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay': 0.0
    }]
    optimizer = AdamW(
        optimizer_grouped_parameters,
        lr=lr,
        weight_decay=0.01,
    )
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=num_train_optimization_steps // 10,
        num_training_steps=num_train_optimization_steps
    )
    # ------------------------END SET OPTIMIZER--------------

    # ------------------------START TRAINING-------------------
    update_count = 0
    start = time.time()
    print('start training....')
    for epoch in range(epochs):
        # ------------------------training------------------------
        model.train()
        losses = 0
        times = 0
        print('\n' + '-' * 20 + f'epoch {epoch}' + '-' * 20)
        for batch in tqdm(train_dataloader):
            batch = [item.to(device) for item in batch]
            encoder_input, decoder_input, mask_encoder_input, mask_decoder_input = batch

            _, past = model.encoder(input_ids=encoder_input,
                                    attention_mask=mask_encoder_input)
            mask = torch.cat([mask_encoder_input, mask_decoder_input], dim=1)
            logits, _ = model.decoder(decoder_input,
                                      attention_mask=mask,
                                      past=list(past))

            out = logits[:, :-1].contiguous()
            target = decoder_input[:, 1:].contiguous()
            target_mask = mask_decoder_input[:, 1:].contiguous()
            loss = util.sequence_cross_entropy_with_logits(out, target, target_mask,
                                                           average="token")
            loss.backward()

            losses += loss.item()
            times += 1
            update_count += 1

            if update_count % num_gradients_accumulation == num_gradients_accumulation - 1:
                torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
                optimizer.step()
                scheduler.step()
                optimizer.zero_grad()

        end = time.time()
        print(f'time: {(end - start)}')
        print(f'loss: {losses / times}')
        start = end

        # ------------------------validate------------------------
        model.eval()
        perplexity = 0
        batch_count = 0
        print('\nstart calculate the perplexity....')
        with torch.no_grad():
            for batch in tqdm(val_dataloader):
                batch = [item.to(device) for item in batch]
                encoder_input, decoder_input, mask_encoder_input, mask_decoder_input = batch

                _, past = model.encoder(input_ids=encoder_input,
                                        attention_mask=mask_encoder_input)
                mask = torch.cat([mask_encoder_input, mask_decoder_input], dim=1)
                logits, _ = model.decoder(decoder_input,
                                          attention_mask=mask,
                                          past=list(past))

                out = logits[:, :-1].contiguous()
                target = decoder_input[:, 1:].contiguous()
                target_mask = mask_decoder_input[:, 1:].contiguous()
                # print(out.shape,target.shape,target_mask.shape)
                loss = util.sequence_cross_entropy_with_logits(out, target, target_mask,
                                                               average="token")
                perplexity += np.exp(loss.item())
                batch_count += 1

        print(f'\nvalidate perplexity: {perplexity / batch_count}')

        torch.save(
            model.state_dict(),
            os.path.join(os.path.abspath('.'), load_dir,
                         "model-" + str(epoch) + ".pth"))
def __init__(self, cfg: DictConfig, trainer: Trainer = None):
    # must assign tokenizers before init
    if cfg.language_model.pretrained_model_name:
        if cfg.language_model.pretrained_encoder_model_name or cfg.language_model.pretrained_decoder_model_name:
            raise ValueError(
                "Must have either pretrained_model_name or both pretrained_encoder_model name and "
                "pretrained_decoder_model_name.")
        # setup tokenizer
        self.encoder_tokenizer = self.setup_tokenizer(cfg.encoder_tokenizer)
        self.encoder_add_special_tokens = cfg.encoder_tokenizer.add_special_tokens

        # set decoder to encoder
        self.decoder_tokenizer = self.encoder_tokenizer
        self.decoder_add_special_tokens = self.encoder_add_special_tokens
    else:
        if not (cfg.language_model.pretrained_encoder_model_name
                and cfg.language_model.pretrained_decoder_model_name):
            raise ValueError("Both encoder and decoder must be specified")

        # setup tokenizers
        self.encoder_tokenizer = self.setup_tokenizer(cfg.encoder_tokenizer)
        self.encoder_add_special_tokens = cfg.encoder_tokenizer.add_special_tokens

        self.decoder_tokenizer = self.setup_tokenizer(cfg.decoder_tokenizer)
        self.decoder_add_special_tokens = cfg.decoder_tokenizer.add_special_tokens

    if not self.encoder_tokenizer:
        raise TypeError("encoder_tokenizer failed to initialize")
    if not self.decoder_tokenizer:
        raise TypeError("decoder_tokenizer failed to initialize")

    # init superclass
    super().__init__(cfg=cfg, trainer=trainer)

    # must assign modules after init
    if cfg.language_model.pretrained_model_name:
        # Setup end-to-end model
        if "bart" in cfg.language_model.pretrained_model_name:
            self.model = BartForConditionalGeneration.from_pretrained(
                cfg.language_model.pretrained_model_name)
        else:
            self.model = AutoModel.from_pretrained(
                cfg.language_model.pretrained_model_name)
    else:
        if not (cfg.language_model.pretrained_encoder_model_name
                and cfg.language_model.pretrained_decoder_model_name):
            raise ValueError("Both encoder and decoder must be specified")

        # Setup encoder/decoder model
        self.model = EncoderDecoderModel.from_encoder_decoder_pretrained(
            encoder=cfg.language_model.pretrained_encoder_model_name,
            decoder=cfg.language_model.pretrained_decoder_model_name,
        )

    self.validation_perplexity = Perplexity(compute_on_step=False)

    self.setup_optimization(cfg.optim)
def test_real_bert_model_from_pretrained_has_cross_attention(self):
    model = EncoderDecoderModel.from_encoder_decoder_pretrained(
        "bert-base-uncased", "bert-base-uncased")
    self.assertTrue(
        hasattr(model.decoder.bert.encoder.layer[0], "crossattention"))
def check_save_and_load(self, config, input_ids, attention_mask,
                        encoder_hidden_states, decoder_config,
                        decoder_input_ids, decoder_attention_mask, **kwargs):
    encoder_model, decoder_model = self.get_encoder_decoder_model(
        config, decoder_config)
    enc_dec_model = EncoderDecoderModel(encoder=encoder_model, decoder=decoder_model)
    enc_dec_model.to(torch_device)
    enc_dec_model.eval()
    with torch.no_grad():
        outputs = enc_dec_model(
            input_ids=input_ids,
            decoder_input_ids=decoder_input_ids,
            attention_mask=attention_mask,
            decoder_attention_mask=decoder_attention_mask,
        )
        out_2 = outputs[0].cpu().numpy()
        out_2[np.isnan(out_2)] = 0

        with tempfile.TemporaryDirectory() as tmpdirname:
            enc_dec_model.save_pretrained(tmpdirname)
            enc_dec_model = EncoderDecoderModel.from_pretrained(tmpdirname)
            enc_dec_model.to(torch_device)

            after_outputs = enc_dec_model(
                input_ids=input_ids,
                decoder_input_ids=decoder_input_ids,
                attention_mask=attention_mask,
                decoder_attention_mask=decoder_attention_mask,
            )
            out_1 = after_outputs[0].cpu().numpy()
            out_1[np.isnan(out_1)] = 0
            max_diff = np.amax(np.abs(out_1 - out_2))
            self.assertLessEqual(max_diff, 1e-5)
def evaluate_style_gen_title(
    existing_run_name: str,
    existing_run_id: str,
    config_file: str,
    gen_model_file: str,
    discr_model_file: str,
    test_file: str,
    test_sample_rate: float,
):
    logging.set_verbosity_info()
    init_wandb(existing_run_name, None, existing_run_id)

    config = json.loads(jsonnet_evaluate_file(config_file))

    tokenizer_model_path = config["tokenizer_model_path"]
    tokenizer = BertTokenizer.from_pretrained(tokenizer_model_path,
                                              do_lower_case=False,
                                              do_basic_tokenize=False)

    max_tokens_text = config["max_tokens_text"]
    max_tokens_title = config["max_tokens_title"]
    setattr(tokenizer, 'max_tokens_text', max_tokens_text)

    batch_size = config["batch_size"]

    print("Loading model...")
    model = EncoderDecoderModel.from_pretrained(gen_model_file)
    model.eval()
    model.cuda()

    agency_list = config['agency_list']
    discriminator = AutoModelForSequenceClassification.from_pretrained(
        discr_model_file, num_labels=len(agency_list)).cuda()

    print("Fetching TG data...")
    test_records = [r for r in tqdm.tqdm(tg_reader(test_file))
                    if random.random() <= test_sample_rate]

    print("Building datasets...")
    agency_to_special_token_id = {
        a: tokenizer.vocab[f'[unused{i+1}]'] for i, a in enumerate(agency_list)
    }
    agency_to_target = {a: i for i, a in enumerate(sorted(agency_list))}

    test_dataset = AgencyTitleDatasetGeneration(
        test_records,
        tokenizer,
        filter_agencies=list(agency_to_special_token_id.keys()),
        agency_to_special_token_id=agency_to_special_token_id,
        max_tokens_text=max_tokens_text,
        max_tokens_title=max_tokens_title
    )
    print('Dataset size:', len(test_dataset))

    y_pred = []
    y_true = []
    for i in tqdm.trange(0, len(test_dataset), batch_size):
        data = test_dataset[i]
        for k in tuple(data.keys()):
            if k not in ('input_ids', 'attention_mask'):
                del data[k]
            else:
                data[k] = data[k].unsqueeze(0)
        for j in range(i + 1, min(i + batch_size, len(test_dataset))):
            for k in data.keys():
                data[k] = torch.cat((data[k], test_dataset[j][k].unsqueeze(0)), dim=0)

        y_true.extend([
            agency_to_target[test_dataset.get_strings(j)['agency']]
            for j in range(i, min(i + batch_size, len(test_dataset)))])

        data['input_ids'] = data['input_ids'].cuda()
        data['attention_mask'] = data['attention_mask'].cuda()

        output_ids = model.generate(
            **data,
            decoder_start_token_id=model.config.decoder.pad_token_id,
            min_length=7,
            max_length=20,
            num_beams=6
        )
        preds = [
            tokenizer.decode(first_sent(x, tokenizer.sep_token_id), skip_special_tokens=True)
            for x in output_ids
        ]

        for title in preds:
            inp = tokenizer(title,
                            add_special_tokens=True,
                            max_length=max_tokens_title,
                            padding='max_length',
                            truncation=True)
            logits = discriminator(
                input_ids=torch.LongTensor(inp['input_ids']).cuda().unsqueeze(0),
                attention_mask=torch.LongTensor(inp['attention_mask']).cuda().unsqueeze(0))[0]
            y_pred.append(torch.argmax(logits).item())

    wandb.summary.update({
        'D-Style': classification_report(y_true, y_pred, output_dict=True)
    })
def get_pretrained_model(self):
    return EncoderDecoderModel.from_encoder_decoder_pretrained(
        "google/bert_for_seq_generation_L-24_bbc_encoder",
        "google/bert_for_seq_generation_L-24_bbc_encoder")
def get_pretrained_model(self):
    return EncoderDecoderModel.from_encoder_decoder_pretrained("roberta-base", "roberta-base")
def get_pretrained_model(self):
    return EncoderDecoderModel.from_encoder_decoder_pretrained(
        "bert-large-uncased", "microsoft/prophetnet-large-uncased")
def get_from_pretrained(path):
    conf_path = join(dirname(path), "config.json")
    conf = EncoderDecoderConfig.from_pretrained(conf_path)
    model = EncoderDecoderModel.from_pretrained(path, config=conf)
    return model
def get_pretrained_model(self):
    return EncoderDecoderModel.from_encoder_decoder_pretrained(
        "bert-large-uncased", "facebook/bart-large")
# In[4]:

tokenizer_model_path = config["tokenizer_model_path"]
tokenizer = BertTokenizer.from_pretrained(tokenizer_model_path,
                                          do_lower_case=False,
                                          do_basic_tokenize=False)

max_tokens_text = config["max_tokens_text"]
max_tokens_title = config["max_tokens_title"]

# In[5]:

enc_model_path = config["enc_model_path"]
dec_model_path = config["dec_model_path"]

model = EncoderDecoderModel.from_encoder_decoder_pretrained(
    enc_model_path, dec_model_path)

# In[8]:

from torch.utils.data import Dataset


class StyleModelDataset(Dataset):
    def __init__(
            self,
            path,
            tokenizer,
            agency,  # lenta or ria
            is_train=True,
            max_tokens_text=250,
            max_tokens_title=50):
def get_encoderdecoder_model(self):
    return EncoderDecoderModel.from_pretrained(
        "patrickvonplaten/bert2bert-cnn_dailymail-fp16")
def test_finetune_bert2bert(self):
    if not is_datasets_available():
        return

    import datasets

    bert2bert = EncoderDecoderModel.from_encoder_decoder_pretrained(
        "prajjwal1/bert-tiny", "prajjwal1/bert-tiny")
    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

    bert2bert.config.vocab_size = bert2bert.config.encoder.vocab_size
    bert2bert.config.eos_token_id = tokenizer.sep_token_id
    bert2bert.config.decoder_start_token_id = tokenizer.cls_token_id
    bert2bert.config.max_length = 128

    train_dataset = datasets.load_dataset("cnn_dailymail", "3.0.0", split="train[:1%]")
    val_dataset = datasets.load_dataset("cnn_dailymail", "3.0.0", split="validation[:1%]")

    train_dataset = train_dataset.select(range(32))
    val_dataset = val_dataset.select(range(16))

    rouge = datasets.load_metric("rouge")

    batch_size = 4

    def _map_to_encoder_decoder_inputs(batch):
        # Tokenizer will automatically set [BOS] <text> [EOS]
        inputs = tokenizer(batch["article"], padding="max_length",
                           truncation=True, max_length=512)
        outputs = tokenizer(batch["highlights"], padding="max_length",
                            truncation=True, max_length=128)
        batch["input_ids"] = inputs.input_ids
        batch["attention_mask"] = inputs.attention_mask

        batch["decoder_input_ids"] = outputs.input_ids
        batch["labels"] = outputs.input_ids.copy()
        batch["labels"] = [[
            -100 if token == tokenizer.pad_token_id else token for token in labels
        ] for labels in batch["labels"]]
        batch["decoder_attention_mask"] = outputs.attention_mask

        assert all([len(x) == 512 for x in inputs.input_ids])
        assert all([len(x) == 128 for x in outputs.input_ids])

        return batch

    def _compute_metrics(pred):
        labels_ids = pred.label_ids
        pred_ids = pred.predictions

        # all unnecessary tokens are removed
        pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
        label_str = tokenizer.batch_decode(labels_ids, skip_special_tokens=True)

        rouge_output = rouge.compute(predictions=pred_str,
                                     references=label_str,
                                     rouge_types=["rouge2"])["rouge2"].mid

        return {
            "rouge2_precision": round(rouge_output.precision, 4),
            "rouge2_recall": round(rouge_output.recall, 4),
            "rouge2_fmeasure": round(rouge_output.fmeasure, 4),
        }

    # map train dataset
    train_dataset = train_dataset.map(
        _map_to_encoder_decoder_inputs,
        batched=True,
        batch_size=batch_size,
        remove_columns=["article", "highlights"],
    )
    train_dataset.set_format(
        type="torch",
        columns=["input_ids", "attention_mask", "decoder_input_ids",
                 "decoder_attention_mask", "labels"],
    )

    # same for validation dataset
    val_dataset = val_dataset.map(
        _map_to_encoder_decoder_inputs,
        batched=True,
        batch_size=batch_size,
        remove_columns=["article", "highlights"],
    )
    val_dataset.set_format(
        type="torch",
        columns=["input_ids", "attention_mask", "decoder_input_ids",
                 "decoder_attention_mask", "labels"],
    )

    output_dir = self.get_auto_remove_tmp_dir()

    training_args = Seq2SeqTrainingArguments(
        output_dir=output_dir,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        predict_with_generate=True,
        evaluate_during_training=True,
        do_train=True,
        do_eval=True,
        warmup_steps=0,
        eval_steps=2,
        logging_steps=2,
    )

    # instantiate trainer
    trainer = Seq2SeqTrainer(
        model=bert2bert,
        args=training_args,
        compute_metrics=_compute_metrics,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
    )

    # start training
    trainer.train()
    eos_token_id=decoder_tokenizer.vocab["[EOS]"],
)  # AutoConfig.from_pretrained("bert-base-uncased")
#decoder_config = BertGenerationDecoderConfig()

# From: https://github.com/huggingface/transformers/blob/master/src/transformers/models/encoder_decoder/modeling_encoder_decoder.py#L464
#>>> model.config.decoder_start_token_id = tokenizer.cls_token_id
#>>> model.config.pad_token_id = tokenizer.pad_token_id
#>>> model.config.vocab_size = model.config.decoder.vocab_size
#decoder_config.decoder_start_token_id = decoder_tokenizer.vocab["[CLS]"]
# decoder_config.pad_token_type_id = 0 ?

decoder = BertGenerationDecoder(config=decoder_config)

#enc_dec_config = EncoderDecoderConfig(encoder=encoder.config, decoder=decoder.config, decoder_start_token_id=decoder_tokenizer.vocab["[CLS]"])
bert2bert = EncoderDecoderModel(encoder=encoder, decoder=decoder)
bert2bert.config.decoder_start_token_id = decoder_tokenizer.vocab["[CLS]"]
bert2bert.config.pad_token_id = decoder_tokenizer.vocab["[PAD]"]

# Tokenize inputs and labels.
inputs = encoder_tokenizer(
    'Make a stack of all blocks except the green block.',
    add_special_tokens=False,
    return_tensors="pt")
print("Inputs: ", inputs)

labels = decoder_tokenizer(
    "has_anything(robot),on_surface(blue_block, tabletop),stacked(blue_block, red_block),on_surface(yellow_block, tabletop)",
    return_tensors="pt",
    padding=True,
    truncation=True)