def create_and_check_for_masked_lm(
    self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
):
    model = RobertaForMaskedLM(config=config)
    model.to(torch_device)
    model.eval()
    result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels)
    self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size))
def create_and_check_roberta_for_masked_lm(
    self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
):
    model = RobertaForMaskedLM(config=config)
    model.to(torch_device)
    model.eval()
    loss, prediction_scores = model(
        input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, masked_lm_labels=token_labels
    )
    result = {
        "loss": loss,
        "prediction_scores": prediction_scores,
    }
    self.parent.assertListEqual(
        list(result["prediction_scores"].size()), [self.batch_size, self.seq_length, self.vocab_size]
    )
    self.check_loss_output(result)
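# The two test helpers above exercise the older tuple-returning call signature
# (masked_lm_labels=..., unpacked into loss and prediction_scores) and the newer
# ModelOutput-based one (labels=..., result.logits). Outside the test harness, a
# minimal fill-mask sketch with the current API could look like this; the checkpoint
# name and example sentence are illustrative and not taken from the snippets above.
import torch
from transformers import RobertaForMaskedLM, RobertaTokenizer

tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
mlm = RobertaForMaskedLM.from_pretrained("roberta-base")
mlm.eval()

text = f"The capital of France is {tokenizer.mask_token}."
encoded = tokenizer(text, return_tensors="pt")
with torch.no_grad():
    logits = mlm(**encoded).logits  # shape: (batch, seq_len, vocab_size)

# pick the highest-scoring token at the masked position
mask_pos = (encoded["input_ids"] == tokenizer.mask_token_id).nonzero(as_tuple=True)[1]
print(tokenizer.decode(logits[0, mask_pos].argmax(dim=-1)))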
import argparse
import logging

import torch
from torch.utils.data import TensorDataset
from transformers import AutoConfig, AutoTokenizer, RobertaForMaskedLM

# set_seed, text_to_ids_tensor, mask_tokens and train are helper functions defined
# elsewhere in the original script.


def main():
    parser = argparse.ArgumentParser()

    # model arguments
    parser.add_argument("--model_type", default='roberta', type=str)
    parser.add_argument("--model_name_or_path", default='roberta-base', type=str)

    # data arguments
    parser.add_argument("--output_dir", default="./output", type=str)
    parser.add_argument("--train_data_file", default=None, type=str)
    parser.add_argument("--eval_data_file", default=None, type=str)
    parser.add_argument("--mlm_probability", default=0.15, type=float)
    parser.add_argument("--block_size", default=-1, type=int)

    # training arguments
    parser.add_argument("--per_gpu_train_batch_size", default=8, type=int)
    parser.add_argument("--per_gpu_eval_batch_size", default=8, type=int)
    parser.add_argument("--gradient_accumulation_steps", type=int, default=1)
    parser.add_argument("--learning_rate", default=5e-5, type=float)
    parser.add_argument("--weight_decay", default=0.0, type=float)
    parser.add_argument("--adam_beta1", default=0.9, type=float)
    parser.add_argument("--adam_beta2", default=0.999, type=float)
    parser.add_argument("--adam_epsilon", default=1e-8, type=float)
    parser.add_argument("--max_grad_norm", default=1.0, type=float)
    parser.add_argument("--num_train_epochs", default=1.0, type=float)
    # If > 0, set the total number of training steps to perform; overrides num_train_epochs.
    parser.add_argument("--max_steps", default=-1, type=int)
    parser.add_argument("--warmup_steps", default=0, type=int)  # linear warmup over warmup_steps
    parser.add_argument("--logging_steps", type=int, default=-1)  # log every X update steps
    parser.add_argument("--save_steps", type=int, default=-1)  # save a checkpoint every X update steps
    parser.add_argument("--seed", type=int, default=42)
    # Drop the last incomplete batch if it is not divisible by the batch size.
    parser.add_argument("--dataloader_drop_last", type=bool, default=False)
    parser.add_argument("--device", type=str, default='cuda')
    args, _ = parser.parse_known_args()

    logger = logging.getLogger(__name__)
    set_seed(args)

    # load config and tokenizer
    config = AutoConfig.from_pretrained(args.model_name_or_path)
    tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path)

    # build the model with the vanilla RoBERTa architecture (weights are freshly initialized here),
    # or resume training from a saved checkpoint with the commented line below
    model = RobertaForMaskedLM(config=config)
    # model.load_state_dict(torch.load("../input/biomedical-questionanswer/roberta-base-pretrain-pubmed8252.bin"))
    model.to(args.device)

    # load data and train the model
    print("load the pubmed abstract text and generate dataset")
    file_path = '../input/biomedical-questionanswer/abstract.txt'
    inputs_ids = text_to_ids_tensor(file_path, tokenizer)
    inputs_ids, labels = mask_tokens(inputs_ids, tokenizer, args)
    train_dataset = TensorDataset(inputs_ids, labels)

    print("start to train")
    global_step, tr_loss = train(train_dataset, model, tokenizer, args)

    # save the pretrained model
    print("save the model")
    output_dir = "roberta-base-pretrain-pubmed.bin"
    torch.save(model.state_dict(), output_dir)
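# As an illustration of what the mask_tokens helper referenced above presumably does,
# here is a sketch of the standard masking scheme used by the Hugging Face
# run_language_modeling example: mask ~15% of non-special tokens (args.mlm_probability);
# of those, 80% become the mask token, 10% a random token, and 10% stay unchanged.
# This is a sketch of the common recipe, not the author's actual implementation.
import torch


def mask_tokens(inputs, tokenizer, args):
    """Prepare masked inputs and labels for masked language modeling."""
    labels = inputs.clone()
    # sample positions to mask with probability args.mlm_probability, skipping special tokens
    probability_matrix = torch.full(labels.shape, args.mlm_probability)
    special_tokens_mask = [
        tokenizer.get_special_tokens_mask(val, already_has_special_tokens=True) for val in labels.tolist()
    ]
    probability_matrix.masked_fill_(torch.tensor(special_tokens_mask, dtype=torch.bool), value=0.0)
    masked_indices = torch.bernoulli(probability_matrix).bool()
    labels[~masked_indices] = -100  # loss is only computed on masked positions

    # 80% of masked positions are replaced by the mask token
    indices_replaced = torch.bernoulli(torch.full(labels.shape, 0.8)).bool() & masked_indices
    inputs[indices_replaced] = tokenizer.convert_tokens_to_ids(tokenizer.mask_token)

    # 10% are replaced by a random token; the remaining 10% keep the original token
    indices_random = torch.bernoulli(torch.full(labels.shape, 0.5)).bool() & masked_indices & ~indices_replaced
    inputs[indices_random] = torch.randint(len(tokenizer), labels.shape, dtype=torch.long)[indices_random]

    return inputs, labels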
import os

import torch
from transformers import RobertaConfig, RobertaForMaskedLM

# `Config` (a dict of hyperparameters) and `train_start_datetime` are defined earlier in
# the original script. The excerpt also begins mid-way through the config construction,
# so the leading keyword arguments (e.g. vocab_size) are not shown.
config = RobertaConfig(
    hidden_size=Config['embedding_dim'],
    num_attention_heads=Config['attention_heads'],
    num_hidden_layers=Config['encoder_layers'],
    intermediate_size=Config['intermediate_size'],
    type_vocab_size=Config['type_vocab_size'])

# start from scratch or resume from the last saved checkpoint
if Config['last_ckpt_path'] is None or Config['last_ckpt_path'] == '':
    model = RobertaForMaskedLM(config)
else:
    model = RobertaForMaskedLM.from_pretrained(Config['last_ckpt_path'], config=config)

# move the model to the selected device and activate training mode
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)
model.train()

optim = torch.optim.AdamW(model.parameters(), lr=Config['learning_rate'])

# output locations for this run, keyed by the training start time
dt_str = train_start_datetime.strftime("D%Y_%m_%d_T%H_%M_%S")
model_folder = os.path.join(Config['model_path'], dt_str)
ckpt_path = os.path.join(model_folder, 'checkpoints')
config_path = os.path.join(model_folder, 'train_config.json')
results_path = os.path.join(model_folder, 'results.json')

start_epoch = Config['start_epoch']
end_epoch = start_epoch + Config['num_epochs']
results = {}
for epoch in range(start_epoch, end_epoch):
    ...  # the per-epoch training loop continues beyond this excerpt (see the sketch below)
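# The excerpt stops at the epoch loop. A minimal sketch of what its body could look
# like, under assumptions NOT present in the excerpt: a train_dataset of
# (input_ids, labels) pairs prepared as in the previous snippet, a Config['batch_size']
# entry, and per-epoch checkpoints written under ckpt_path. This is illustrative, not
# the author's actual training loop.
from torch.utils.data import DataLoader

loader = DataLoader(train_dataset, batch_size=Config['batch_size'], shuffle=True)  # assumed dataset and batch size

for epoch in range(start_epoch, end_epoch):
    epoch_loss = 0.0
    for input_ids, labels in loader:
        input_ids, labels = input_ids.to(device), labels.to(device)
        optim.zero_grad()
        outputs = model(input_ids, labels=labels)  # RobertaForMaskedLM computes the MLM loss internally
        outputs.loss.backward()
        optim.step()
        epoch_loss += outputs.loss.item()

    results[epoch] = epoch_loss / len(loader)  # track the average training loss per epoch
    os.makedirs(ckpt_path, exist_ok=True)
    model.save_pretrained(os.path.join(ckpt_path, f"epoch_{epoch}"))  # checkpoint after every epoch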