class SoftMaskedBertTrainer():
    def __init__(self, bert, tokenizer, device, hidden=256, layer_n=1, lr=2e-5, gama=0.8,
                 betas=(0.9, 0.999), weight_decay=0.01, warmup_steps=10000):
        self.device = device
        self.bert = bert
        self.tokenizer = tokenizer
        self.model = SoftMaskedBert(self.bert, self.tokenizer, hidden, layer_n,
                                    self.device).to(self.device)

        # if torch.cuda.device_count() > 1:
        #     print("Using %d GPUS for train" % torch.cuda.device_count())
        #     self.model = nn.DataParallel(self.model, device_ids=[0, 1, 2])

        self.optim = Adam(self.model.parameters(), lr=lr, betas=betas, weight_decay=weight_decay)
        self.optim_schedule = ScheduledOptim(self.optim, hidden, n_warmup_steps=warmup_steps)
        self.criterion_c = nn.NLLLoss()
        self.criterion_d = nn.BCELoss()
        self.gama = gama
        self.log_freq = 10

    def train(self, train_data, epoch):
        self.model.train()
        return self.iteration(epoch, train_data)

    def evaluate(self, val_data, epoch):
        self.model.eval()
        return self.iteration(epoch, val_data, train=False)

    def inference(self, data_loader):
        self.model.eval()
        out_put = []
        data_iter = tqdm.tqdm(enumerate(data_loader),
                              desc="Inference:",
                              total=len(data_loader),
                              bar_format="{l_bar}{r_bar}")
        for i, data in data_iter:
            # 0. batch_data will be sent into the device (GPU or CPU)
            data = {key: value.to(self.device) for key, value in data.items()}
            out, prob = self.model(data["random_text"])  # prob: [batch_size, seq_len, 1]
            out_put.extend(out.argmax(dim=-1))
        return [self.tokenizer.convert_ids_to_tokens(x) for x in out_put]

    def save(self, file_path):
        torch.save(self.model.cpu(), file_path)
        self.model.to(self.device)
        print('Model saved to {}'.format(file_path))

    def load(self, file_path):
        if not os.path.exists(file_path):
            return
        self.model = torch.load(file_path)

    def iteration(self, epoch, data_loader, train=True):
        str_code = "train" if train else "val"

        # Setting the tqdm progress bar
        data_iter = tqdm.tqdm(enumerate(data_loader),
                              desc="EP_%s:%d" % (str_code, epoch),
                              total=len(data_loader),
                              bar_format="{l_bar}{r_bar}")

        avg_loss = 0.0
        total_correct = 0
        total_element = 0
        for i, data in data_iter:
            # 0. batch_data will be sent into the device (GPU or CPU)
            data = {key: value.to(self.device) for key, value in data.items()}
            out, prob = self.model(data["random_text"])  # prob: [batch_size, seq_len, 1]
            prob = prob.reshape(-1, prob.shape[1])       # prob: [batch_size, seq_len]

            # detection loss (per-token error probability) and correction loss (masked LM)
            loss_d = self.criterion_d(prob, data['label'].float())
            loss_c = self.criterion_c(out.transpose(1, 2), data["origin_text"])
            loss = self.gama * loss_c + (1 - self.gama) * loss_d

            if train:
                self.optim_schedule.zero_grad()
                loss.backward(retain_graph=True)
                self.optim_schedule.step_and_update_lr()

            correct = out.argmax(dim=-1).eq(data["origin_text"]).sum().item()
            avg_loss += loss.item()
            total_correct += correct
            total_element += data["label"].nelement()

            post_fix = {
                "epoch": epoch,
                "iter": i,
                "avg_loss": avg_loss / (i + 1),
                "avg_acc": total_correct / total_element * 100,
                "loss": loss.item()
            }
            if i % self.log_freq == 0:
                data_iter.write(str(post_fix))

        print("EP%d_%s, avg_loss=" % (epoch, str_code), avg_loss / len(data_iter),
              "total_acc=", total_correct * 100.0 / total_element)
        return avg_loss / len(data_iter)
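# Illustrative sketch (not part of the trainer): how the joint Soft-Masked loss above is
# put together. The detection probabilities go through BCELoss, the correction
# log-probabilities go through NLLLoss (which expects [batch, vocab, seq], hence the
# transpose), and `gama` weights the two terms. Shapes mirror the trainer; the tensors
# are random stand-ins and the function name is hypothetical.
def _soft_masked_loss_example(gama=0.8):
    import torch
    import torch.nn as nn

    batch_size, seq_len, vocab_size = 2, 8, 100

    # correction output: log-probabilities over the vocabulary, [batch, seq, vocab]
    out = torch.log_softmax(torch.randn(batch_size, seq_len, vocab_size), dim=-1)
    # detection output: per-token error probability, [batch, seq]
    prob = torch.sigmoid(torch.randn(batch_size, seq_len))

    origin_text = torch.randint(0, vocab_size, (batch_size, seq_len))  # gold token ids
    label = torch.randint(0, 2, (batch_size, seq_len)).float()         # 1 = token is wrong

    loss_c = nn.NLLLoss()(out.transpose(1, 2), origin_text)  # correction loss
    loss_d = nn.BCELoss()(prob, label)                       # detection loss
    return gama * loss_c + (1 - gama) * loss_d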
class SoftMaskedErnieTrainer():
    def __init__(self, args, ernie, tokenizer, device, hidden=256, layer_n=1, lr=2e-5, gama=0.8,
                 betas=(0.9, 0.999), weight_decay=0.01, warmup_steps=10000, g_clip=0.001):
        self.device = device
        self.tokenizer = tokenizer
        self.model = SoftMaskedErnie(ernie, self.tokenizer, hidden, layer_n,
                                     self.device).to(self.device)

        opt = AdamW(learning_rate=LinearDecay(args.lr,
                                              int(args.warmup_proportion * args.max_steps),
                                              args.max_steps),
                    parameter_list=self.model.parameters(),
                    weight_decay=args.wd,
                    grad_clip=g_clip)
        self.optim_schedule = ScheduledOptim(opt, hidden, n_warmup_steps=warmup_steps)
        self.criterion_c = fluid.dygraph.NLLLoss()
        self.criterion_d = fluid.dygraph.BCELoss()
        self.gama = gama
        self.log_freq = 10

    def train(self, train_data, epoch):
        self.model.train()
        return self.iteration(epoch, train_data)

    def evaluate(self, val_data, epoch):
        self.model.eval()
        return self.iteration(epoch, val_data, train=False)

    def save(self, file_path):
        torch.save(self.model.cpu(), file_path)
        self.model.to(self.device)
        print('Model saved to {}'.format(file_path))

    def load(self, file_path):
        if not os.path.exists(file_path):
            return
        self.model = torch.load(file_path)

    def inference(self, data_loader):
        self.model.eval()
        out_put = []
        data_loader = tqdm.tqdm(enumerate(data_loader),
                                desc="Inference:",
                                total=len(data_loader),
                                bar_format="{l_bar}{r_bar}")
        for i, data in data_loader:
            data = {key: value for key, value in data.items()}
            out, prob = self.model(data["input_ids"],
                                   data["input_mask"],
                                   data["segment_ids"])  # prob: [batch_size, seq_len, 1]
            out_put.extend(out.argmax(dim=-1))
        return [''.join(self.tokenizer.convert_ids_to_tokens(x)) for x in out_put]

    def iteration(self, epoch, data_loader, train=True):
        str_code = "train" if train else "val"

        # Setting the tqdm progress bar
        data_loader = tqdm.tqdm(enumerate(data_loader),
                                desc="EP_%s:%d" % (str_code, epoch),
                                total=len(data_loader),
                                bar_format="{l_bar}{r_bar}")

        avg_loss = 0.0
        total_correct = 0
        total_element = 0
        for i, data in data_loader:
            # 0. batch_data will be sent into the device (GPU or CPU)
            data = {key: value.to(self.device) for key, value in data.items()}
            out, prob = self.model(data["input_ids"],
                                   data["input_mask"],
                                   data["segment_ids"])  # prob: [batch_size, seq_len, 1]
            prob = prob.reshape(-1, prob.shape[1])

            loss_d = self.criterion_d(prob, data['label'])
            # note: .detach() cuts the gradient path through the correction output here
            loss_c = self.criterion_c(out.transpose(1, 2).detach(), data["output_ids"])
            loss = self.gama * loss_c + (1 - self.gama) * loss_d

            if train:
                # with torch.autograd.set_detect_anomaly(True):
                self.optim_schedule.zero_grad()
                loss.backward(retain_graph=True)
                self.optim_schedule.step_and_update_lr()

            correct = out.argmax(dim=-1).eq(data["output_ids"]).sum().item()
            avg_loss += loss.item()
            total_correct += correct
            total_element += data["label"].nelement()

            post_fix = {
                "epoch": epoch,
                "iter": i,
                "avg_loss": avg_loss / (i + 1),
                "avg_acc": total_correct / total_element * 100,
                "loss": loss.item()
            }
            if i % self.log_freq == 0:
                data_loader.write(str(post_fix))

        print("EP%d_%s, avg_loss=" % (epoch, str_code), avg_loss / len(data_loader),
              "total_acc=", total_correct * 100.0 / total_element)
        return avg_loss / len(data_loader)
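# ScheduledOptim is not defined in this file. In BERT trainers of this style it usually
# implements the inverse-square-root warmup schedule from "Attention Is All You Need",
# scaled by the hidden size passed to the constructor. The sketch below shows that
# schedule under this assumption; the function name is hypothetical.
def _noam_lr_example(step, hidden=256, warmup_steps=10000):
    # linear warmup for `warmup_steps` steps, then inverse-square-root decay:
    # lr ~ hidden^-0.5 * min(step^-0.5, step * warmup_steps^-1.5)
    return (hidden ** -0.5) * min(step ** -0.5, step * warmup_steps ** -1.5)

# e.g. _noam_lr_example(100) < _noam_lr_example(10000), and the rate decays afterwards.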
class BERTTrainer:
    """
    BERTTrainer pretrains the BERT model with the two LM training methods:

    1. Masked Language Model : 3.3.1 Task #1: Masked LM
    2. Next Sentence Prediction : 3.3.2 Task #2: Next Sentence Prediction

    Please check the details in README.md with a simple example.
    """

    def __init__(self, bert: BERT, vocab_size: int,
                 train_dataloader: DataLoader, test_dataloader: DataLoader = None,
                 lr: float = 1e-4, betas=(0.9, 0.999), weight_decay: float = 0.01, warmup_steps=10000,
                 with_cuda: bool = True, cuda_devices=None, log_freq: int = 10):
        """
        :param bert: BERT model which you want to train
        :param vocab_size: total word vocab size
        :param train_dataloader: train dataset data loader
        :param test_dataloader: test dataset data loader [can be None]
        :param lr: learning rate of optimizer
        :param betas: Adam optimizer betas
        :param weight_decay: Adam optimizer weight decay param
        :param with_cuda: training with cuda
        :param log_freq: logging frequency of the batch iteration
        """

        # Setup cuda device for BERT training, argument -c, --cuda should be true
        cuda_condition = torch.cuda.is_available() and with_cuda
        self.device = torch.device("cuda:0" if cuda_condition else "cpu")

        # This BERT model will be saved every epoch
        self.bert = bert
        # Initialize the BERT Language Model with the BERT model
        self.model = BERTLM(bert, vocab_size).to(self.device)

        # Distributed GPU training if CUDA can detect more than 1 GPU
        if with_cuda and torch.cuda.device_count() > 1:
            print("Using %d GPUS for BERT" % torch.cuda.device_count())
            self.model = nn.DataParallel(self.model, device_ids=cuda_devices)

        # Setting the train and test data loader
        self.train_data = train_dataloader
        self.test_data = test_dataloader

        # Setting the Adam optimizer with hyper-params
        self.optim = Adam(self.model.parameters(), lr=lr, betas=betas, weight_decay=weight_decay)
        self.optim_schedule = ScheduledOptim(self.optim, self.bert.hidden, n_warmup_steps=warmup_steps)

        # Using Negative Log Likelihood Loss function for predicting the masked_token
        self.criterion = nn.NLLLoss(ignore_index=0)

        self.log_freq = log_freq

        print("Total Parameters:", sum([p.nelement() for p in self.model.parameters()]))

    def train(self, epoch):
        self.iteration(epoch, self.train_data)

    def test(self, epoch):
        self.iteration(epoch, self.test_data, train=False)

    def iteration(self, epoch, data_loader, train=True):
        """
        Loop over the data_loader for training or testing.
        If on train status, the backward operation is activated and the model is
        also auto-saved every epoch.

        :param epoch: current epoch index
        :param data_loader: torch.utils.data.DataLoader for iteration
        :param train: boolean value of is train or test
        :return: None
        """
        str_code = "train" if train else "test"

        # Setting the tqdm progress bar
        data_iter = tqdm.tqdm(enumerate(data_loader),
                              desc="EP_%s:%d" % (str_code, epoch),
                              total=len(data_loader),
                              bar_format="{l_bar}{r_bar}")

        avg_loss = 0.0
        total_correct = 0
        total_element = 0

        for i, data in data_iter:
            # 0. batch_data will be sent into the device (GPU or CPU)
            data = {key: value.to(self.device) for key, value in data.items()}

            # 1. forward the next_sentence_prediction and masked_lm model
            next_sent_output, mask_lm_output = self.model.forward(data["bert_input"], data["segment_label"])

            # 2-1. NLL (negative log likelihood) loss of is_next classification result
            next_loss = self.criterion(next_sent_output, data["is_next"])

            # 2-2. NLLLoss of predicting masked token words
            mask_loss = self.criterion(mask_lm_output.transpose(1, 2), data["bert_label"])

            # 2-3. Adding next_loss and mask_loss : 3.4 Pre-training Procedure
            loss = next_loss + mask_loss

            # 3. backward and optimization only in train
            if train:
                self.optim_schedule.zero_grad()
                loss.backward()
                self.optim_schedule.step_and_update_lr()

            # next sentence prediction accuracy
            correct = next_sent_output.argmax(dim=-1).eq(data["is_next"]).sum().item()
            avg_loss += loss.item()
            total_correct += correct
            total_element += data["is_next"].nelement()

            post_fix = {
                "epoch": epoch,
                "iter": i,
                "avg_loss": avg_loss / (i + 1),
                "avg_acc": total_correct / total_element * 100,
                "loss": loss.item()
            }

            if i % self.log_freq == 0:
                data_iter.write(str(post_fix))

        print("EP%d_%s, avg_loss=" % (epoch, str_code), avg_loss / len(data_iter),
              "total_acc=", total_correct * 100.0 / total_element)

    def save(self, epoch, file_path="output/bert_trained.model"):
        """
        Saving the current BERT model on file_path

        :param epoch: current epoch number
        :param file_path: model output path, which will be file_path + ".ep%d" % epoch
        :return: final_output_path
        """
        output_path = file_path + ".ep%d" % epoch
        torch.save(self.bert.cpu(), output_path)
        self.bert.to(self.device)
        print("EP:%d Model Saved on:" % epoch, output_path)
        return output_path
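# Illustrative sketch (not part of the trainer): the MLM criterion above is built with
# ignore_index=0, so positions whose label is 0 (padding and tokens that were not masked)
# contribute nothing to the loss, and the logits must be transposed to [batch, vocab, seq]
# before being passed in. The helper name and tensor values are made up for illustration.
def _masked_lm_loss_example():
    import torch
    import torch.nn as nn

    criterion = nn.NLLLoss(ignore_index=0)

    batch_size, seq_len, vocab_size = 2, 6, 50
    mask_lm_output = torch.log_softmax(torch.randn(batch_size, seq_len, vocab_size), dim=-1)

    # bert_label is 0 everywhere except at masked positions, where it holds the original id
    bert_label = torch.zeros(batch_size, seq_len, dtype=torch.long)
    bert_label[0, 2] = 17
    bert_label[1, 4] = 33

    # only the two masked positions contribute to the loss
    return criterion(mask_lm_output.transpose(1, 2), bert_label)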
class BERTTrainerFull(BasicTrainer):
    def __init__(self, bert: BERTFull, vocab_size: int, epochs: int,
                 tensorboard_log_dir: str, output_path: str,
                 train_dataloader: DataLoader,
                 lr: float = 1e-7, betas=(0.9, 0.999), weight_decay: float = 0.01, warmup_steps=10000,
                 lambda_beta: float = 1e-2,
                 with_cuda: bool = True, log_freq: int = 10, save_steps: int = -1):
        super(BERTTrainerFull, self).__init__(bert=bert, epochs=epochs,
                                              tensorboard_log_dir=tensorboard_log_dir,
                                              output_path=output_path,
                                              train_dataloader=train_dataloader,
                                              with_cuda=with_cuda, log_freq=log_freq,
                                              save_steps=save_steps)

        self.model = BERTLMFull(bert, vocab_size).to(self.device)
        self.lambda_beta = lambda_beta

        if with_cuda and torch.cuda.device_count() > 1:
            print("Using %d GPUS for BERT" % torch.cuda.device_count())
            self.model = nn.DataParallel(self.model)

        # Setting the Adam optimizer with hyper-params
        self.optim = Adam(self.model.parameters(), lr=lr, betas=betas, weight_decay=weight_decay)
        self.optim_schedule = ScheduledOptim(self.optim, self.bert.hidden, n_warmup_steps=warmup_steps)

        print("Total Parameters:", sum([p.nelement() for p in self.model.parameters()]))

    def iteration(self, epoch, data_loader, pos_mask):
        data_iter = tqdm.tqdm(enumerate(data_loader),
                              desc="EP:%d" % epoch,
                              total=self.n_batches,
                              bar_format="{l_bar}{r_bar}",
                              disable=False)

        avg_loss = 0.0
        for i, data in data_iter:
            global_step = epoch * self.n_batches + i + 1
            data = {key: value.to(self.device) for key, value in data.items()}

            # 1. forward the masked time prediction and masked LM model
            mask_tp_output, mask_lm_output = self.model.forward(data["token_input"],
                                                                data["time_input"],
                                                                pos_mask)

            # 2-1. NLLLoss of predicting masked token words
            mask_lm_loss = self.mask_lm_criterion(mask_lm_output.transpose(1, 2), data["token_label"])

            # 2-2. masked time prediction loss
            # mask_tp_loss = self.mask_time_prediction_criterion(mask_tp_output, data["time_label"])
            mask_tp_loss = get_mask_time_prediction_loss(mask_tp_output, data["time_label"])

            # 2-3. Adding mask_lm_loss and weighted mask_tp_loss : 3.4 Pre-training Procedure
            loss = mask_lm_loss + self.lambda_beta * mask_tp_loss

            # 3. backward and optimization
            self.optim_schedule.zero_grad()
            # loss.backward(retain_graph=True)  # todo: maybe be removed later
            loss.backward()
            self.optim_schedule.step_and_update_lr()

            avg_loss += loss.item()
            self.tensorborad_writer.add_scalar("Masked_language_model loss", mask_lm_loss.item(), global_step)
            self.tensorborad_writer.add_scalar("Masked_time_prediction loss", mask_tp_loss.item(), global_step)
            self.tensorborad_writer.add_scalar("Average loss in epoch", avg_loss / (i + 1), global_step)

            post_fix = {
                "epoch": epoch,
                "iter": i + 1,
                "avg_loss": avg_loss / (i + 1),
                "loss": loss.item()
            }
            if (i + 1) % self.log_freq == 0:
                data_iter.write(str(post_fix))

            if self.save_steps > 0 and ((i + 1) % self.save_steps == 0 or (i + 1) == self.n_batches):
                self.save(epoch, i + 1)
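# Illustrative sketch (not part of the trainer): global_step = epoch * n_batches + i + 1
# gives every batch a monotonically increasing x-axis value, so TensorBoard curves continue
# smoothly across epoch boundaries. The writer below uses the standard
# torch.utils.tensorboard API; the log directory, loop sizes and loss values are placeholders.
def _tensorboard_logging_example(n_batches=100, epochs=2):
    from torch.utils.tensorboard import SummaryWriter

    writer = SummaryWriter("runs/demo")               # placeholder log directory
    for epoch in range(epochs):
        for i in range(n_batches):
            global_step = epoch * n_batches + i + 1   # increases across epoch boundaries
            fake_loss = 1.0 / global_step             # stand-in for loss.item()
            writer.add_scalar("Masked_language_model loss", fake_loss, global_step)
    writer.close()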
class BERTTrainer(BasicTrainer):
    def __init__(self, bert: BERT, vocab_size: int, epochs: int,
                 tensorboard_log_dir: str, output_path: str,
                 train_dataloader: DataLoader,
                 lr: float = 1e-4, betas=(0.9, 0.999), weight_decay: float = 0.01, warmup_steps=10000,
                 with_cuda: bool = True, log_freq: int = 10, save_steps: int = -1):
        super(BERTTrainer, self).__init__(bert=bert, epochs=epochs,
                                          tensorboard_log_dir=tensorboard_log_dir,
                                          output_path=output_path,
                                          train_dataloader=train_dataloader,
                                          with_cuda=with_cuda, log_freq=log_freq,
                                          save_steps=save_steps)

        self.model = BERTLM(bert, vocab_size).to(self.device)

        if with_cuda and torch.cuda.device_count() > 1:
            print("Using %d GPUS for BERT" % torch.cuda.device_count())
            self.model = nn.DataParallel(self.model)

        # Setting the Adam optimizer with hyper-params
        self.optim = Adam(self.model.parameters(), lr=lr, betas=betas, weight_decay=weight_decay)
        self.optim_schedule = ScheduledOptim(self.optim, self.bert.hidden, n_warmup_steps=warmup_steps)

        print("Total Parameters:", sum([p.nelement() for p in self.model.parameters()]))

    def iteration(self, epoch, data_loader):
        data_iter = tqdm.tqdm(enumerate(data_loader),
                              desc="EP:%d" % epoch,
                              total=self.n_batches,
                              bar_format="{l_bar}{r_bar}",
                              disable=False)

        avg_loss = 0.0
        for i, data in data_iter:
            global_step = epoch * self.n_batches + i + 1
            data = {key: value.to(self.device) for key, value in data.items()}

            mask_lm_output = self.model.forward(data["bert_input"])
            mask_loss = self.criterion(mask_lm_output.transpose(1, 2), data["bert_label"])
            loss = mask_loss

            self.optim_schedule.zero_grad()
            loss.backward()
            self.optim_schedule.step_and_update_lr()

            avg_loss += loss.item()
            self.tensorborad_writer.add_scalar("Masked_language_model loss", mask_loss.item(), global_step)
            self.tensorborad_writer.add_scalar("Average loss in epoch", avg_loss / (i + 1), global_step)

            post_fix = {
                "epoch": epoch,
                "iter": i + 1,
                "avg_loss": avg_loss / (i + 1),
                "loss": loss.item()
            }
            if (i + 1) % self.log_freq == 0:
                data_iter.write(str(post_fix))

            if self.save_steps > 0 and ((i + 1) % self.save_steps == 0 or (i + 1) == self.n_batches):
                self.save(epoch, i + 1)
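# Illustrative sketch (not part of the trainers): when the mid-epoch checkpoints above fire.
# A positive save_steps saves every save_steps batches and always at the last batch of the
# epoch; save_steps <= 0 disables mid-epoch checkpoints. The helper name is hypothetical.
def _should_save_example(i, n_batches, save_steps):
    return save_steps > 0 and ((i + 1) % save_steps == 0 or (i + 1) == n_batches)

# e.g. with n_batches=10, save_steps=4 a checkpoint is written after batches 4, 8 and 10:
# [i + 1 for i in range(10) if _should_save_example(i, 10, 4)] == [4, 8, 10]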