Example #1
class SoftMaskedBertTrainer():
    def __init__(self,
                 bert,
                 tokenizer,
                 device,
                 hidden=256,
                 layer_n=1,
                 lr=2e-5,
                 gama=0.8,
                 betas=(0.9, 0.999),
                 weight_decay=0.01,
                 warmup_steps=10000):

        self.device = device
        self.bert = bert
        self.tokenizer = tokenizer
        self.model = SoftMaskedBert(self.bert, self.tokenizer, hidden, layer_n,
                                    self.device).to(self.device)

        # if torch.cuda.device_count() > 1:
        #     print("Using %d GPUS for train" % torch.cuda.device_count())
        #     self.model = nn.DataParallel(self.model, device_ids=[0,1,2])

        self.optim = Adam(self.model.parameters(),
                          lr=lr,
                          betas=betas,
                          weight_decay=weight_decay)
        self.optim_schedule = ScheduledOptim(self.optim,
                                             hidden,
                                             n_warmup_steps=warmup_steps)
        self.criterion_c = nn.NLLLoss()
        self.criterion_d = nn.BCELoss()
        self.gama = gama
        self.log_freq = 10

    def train(self, train_data, epoch):
        self.model.train()
        return self.iteration(epoch, train_data)

    def evaluate(self, val_data, epoch):
        self.model.eval()
        return self.iteration(epoch, val_data, train=False)

    def inference(self, data_loader):
        self.model.eval()
        out_put = []
        data_iter = tqdm.tqdm(enumerate(data_loader),
                              desc="%s" % 'Inference:',
                              total=len(data_loader),
                              bar_format="{l_bar}{r_bar}")
        for i, data in data_iter:
            # 0. batch_data will be sent into the device(GPU or cpu)
            data = {key: value.to(self.device) for key, value in data.items()}

            out, prob = self.model(
                data["random_text"])  #prob [batch_size, seq_len, 1]
            out_put.extend(out.argmax(dim=-1))

        return [self.tokenizer.convert_ids_to_tokens(x) for x in out_put]

    def save(self, file_path):
        torch.save(self.model.cpu(), file_path)
        self.model.to(self.device)
        print('Model saved to {}'.format(file_path))

    def load(self, file_path):
        if not os.path.exists(file_path):
            return
        self.model = torch.load(file_path)

    def iteration(self, epoch, data_loader, train=True):
        str_code = "train" if train else "val"

        # Setting the tqdm progress bar
        data_iter = tqdm.tqdm(enumerate(data_loader),
                              desc="EP_%s:%d" % (str_code, epoch),
                              total=len(data_loader),
                              bar_format="{l_bar}{r_bar}")

        avg_loss = 0.0
        total_correct = 0
        total_element = 0

        for i, data in data_iter:
            # 0. batch_data will be sent into the device(GPU or cpu)
            data = {key: value.to(self.device) for key, value in data.items()}

            out, prob = self.model(
                data["random_text"])  #prob [batch_size, seq_len, 1]
            # Flatten the detection probabilities to [batch_size, seq_len]
            # so they line up with the binary error labels.
            prob = prob.reshape(-1, prob.shape[1])
            loss_d = self.criterion_d(prob, data['label'].float())
            loss_c = self.criterion_c(out.transpose(1, 2), data["origin_text"])
            loss = self.gama * loss_c + (1 - self.gama) * loss_d

            if train:
                self.optim_schedule.zero_grad()
                loss.backward(retain_graph=True)
                self.optim_schedule.step_and_update_lr()

            correct = out.argmax(dim=-1).eq(data["origin_text"]).sum().item()
            avg_loss += loss.item()
            total_correct += correct
            total_element += data["label"].nelement()

            post_fix = {
                "epoch": epoch,
                "iter": i,
                "avg_loss": avg_loss / (i + 1),
                "avg_acc": total_correct / total_element * 100,
                "loss": loss.item()
            }

            if i % self.log_freq == 0:
                data_iter.write(str(post_fix))

        print("EP%d_%s, avg_loss=" % (epoch, str_code),
              avg_loss / len(data_iter), "total_acc=",
              total_correct * 100.0 / total_element)
        return avg_loss / len(data_iter)
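
The core of the trainer above is the weighted sum of the detection loss (BCE over per-token error probabilities) and the correction loss (NLL over the corrected-token log-probabilities). A minimal, self-contained sketch of that combination, with dummy tensors standing in for the model outputs, looks like this:

import torch
import torch.nn as nn

batch_size, seq_len, vocab_size = 2, 8, 100
gama = 0.8  # weight on the correction loss, as in the trainer above

# Correction head: log-probabilities over the vocabulary for every position
# (the trainer's `out`), faked here with random numbers.
out = torch.log_softmax(torch.randn(batch_size, seq_len, vocab_size), dim=-1)
origin_text = torch.randint(0, vocab_size, (batch_size, seq_len))

# Detection head: per-token probability that the character is an error
# (the trainer's `prob` after reshaping to [batch_size, seq_len]).
prob = torch.sigmoid(torch.randn(batch_size, seq_len))
label = torch.randint(0, 2, (batch_size, seq_len)).float()

criterion_c = nn.NLLLoss()   # correction loss
criterion_d = nn.BCELoss()   # detection loss

loss_c = criterion_c(out.transpose(1, 2), origin_text)  # NLLLoss wants (N, C, L)
loss_d = criterion_d(prob, label)
loss = gama * loss_c + (1 - gama) * loss_d
print(loss.item())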
Example #2
class SoftMaskedErnieTrainer():
    def __init__(self,
                 args,
                 ernie,
                 tokenizer,
                 device,
                 hidden=256,
                 layer_n=1,
                 lr=2e-5,
                 gama=0.8,
                 betas=(0.9, 0.999),
                 weight_decay=0.01,
                 warmup_steps=10000,
                 g_clip=0.001):

        self.device = device
        self.tokenizer = tokenizer
        self.model = SoftMaskedErnie(ernie, self.tokenizer, hidden, layer_n,
                                     self.device).to(self.device)

        opt = AdamW(learning_rate=LinearDecay(
            args.lr, int(args.warmup_proportion * args.max_steps),
            args.max_steps),
                    parameter_list=self.model.parameters(),
                    weight_decay=args.wd,
                    grad_clip=g_clip)

        self.optim_schedule = ScheduledOptim(opt,
                                             hidden,
                                             n_warmup_steps=warmup_steps)
        self.criterion_c = fluid.dygraph.NLLLoss()
        self.criterion_d = fluid.dygraph.BCELoss()

        self.gama = gama
        self.log_freq = 10

    def train(self, train_data, epoch):
        self.model.train()
        return self.iteration(epoch, train_data)

    def evaluate(self, val_data, epoch):
        self.model.eval()
        return self.iteration(epoch, val_data, train=False)

    def save(self, file_path):
        torch.save(self.model.cpu(), file_path)
        self.model.to(self.device)
        print('Model saved to {}'.format(file_path))

    def load(self, file_path):
        if not os.path.exists(file_path):
            return
        self.model = torch.load(file_path)

    def inference(self, data_loader):
        self.model.eval()
        out_put = []
        data_loader = tqdm.tqdm(enumerate(data_loader),
                                desc="%s" % 'Inference:',
                                total=len(data_loader),
                                bar_format="{l_bar}{r_bar}")
        for i, data in data_loader:
            data = {key: value for key, value in data.items()}

            out, prob = self.model(
                data["input_ids"], data["input_mask"],
                data["segment_ids"])  # prob [batch_size, seq_len, 1]
            out_put.extend(out.argmax(dim=-1))
        return [
            ''.join(self.tokenizer.convert_ids_to_tokens(x)) for x in out_put
        ]

    def iteration(self, epoch, data_loader, train=True):
        str_code = "train" if train else "val"

        # Setting the tqdm progress bar
        data_loader = tqdm.tqdm(enumerate(data_loader),
                                desc="EP_%s:%d" % (str_code, epoch),
                                total=len(data_loader),
                                bar_format="{l_bar}{r_bar}")

        avg_loss = 0.0
        total_correct = 0
        total_element = 0

        for i, data in data_loader:
            # 0. batch_data will be sent into the device(GPU or cpu)
            data = {key: value.to(self.device) for key, value in data.items()}

            out, prob = self.model(
                data["input_ids"], data["input_mask"],
                data["segment_ids"])  # prob [batch_size, seq_len, 1]
            prob = prob.reshape(-1, prob.shape[1])
            loss_d = self.criterion_d(prob, data['label'])
            loss_c = self.criterion_c(
                out.transpose(1, 2).detach(), data["output_ids"])
            loss = self.gama * loss_c + (1 - self.gama) * loss_d

            if train:
                # with torch.autograd.set_detect_anomaly(True):
                self.optim_schedule.zero_grad()
                loss.backward(retain_graph=True)
                self.optim_schedule.step_and_update_lr()

            correct = out.argmax(dim=-1).eq(data["output_ids"]).sum().item()
            avg_loss += loss.item()
            total_correct += correct
            total_element += data["label"].nelement()

            post_fix = {
                "epoch": epoch,
                "iter": i,
                "avg_loss": avg_loss / (i + 1),
                "avg_acc": total_correct / total_element * 100,
                "loss": loss.item()
            }

            if i % self.log_freq == 0:
                data_loader.write(str(post_fix))

        print("EP%d_%s, avg_loss=" % (epoch, str_code),
              avg_loss / len(data_loader), "total_acc=",
              total_correct * 100.0 / total_element)
        return avg_loss / len(data_loader)
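
Both soft-masked trainers expose the same train / evaluate / save / inference surface. A hedged sketch of an outer epoch loop for the PyTorch variant is shown below; `bert`, `tokenizer`, `train_loader` and `val_loader` are placeholders for whatever the surrounding project constructs (a pretrained encoder, its tokenizer, and DataLoaders yielding the dict batches the trainer indexes into), not part of the original code.

import torch

# Placeholders: supply a pretrained BERT encoder, its tokenizer, and
# DataLoaders that yield {"random_text", "origin_text", "label"} batches.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
trainer = SoftMaskedBertTrainer(bert, tokenizer, device)

best_val = float("inf")
for epoch in range(10):
    trainer.train(train_loader, epoch)
    val_loss = trainer.evaluate(val_loader, epoch)
    if val_loss < best_val:  # keep the checkpoint with the lowest validation loss
        best_val = val_loss
        trainer.save("soft_masked_bert.model")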
Example #3
class BERTTrainer:
    """
    BERTTrainer make the pretrained BERT model with two LM training method.

        1. Masked Language Model : 3.3.1 Task #1: Masked LM
        2. Next Sentence prediction : 3.3.2 Task #2: Next Sentence Prediction

    please check the details on README.md with simple example.

    """
    def __init__(self,
                 bert: BERT,
                 vocab_size: int,
                 train_dataloader: DataLoader,
                 test_dataloader: DataLoader = None,
                 lr: float = 1e-4,
                 betas=(0.9, 0.999),
                 weight_decay: float = 0.01,
                 warmup_steps=10000,
                 with_cuda: bool = True,
                 cuda_devices=None,
                 log_freq: int = 10):
        """
        :param bert: BERT model which you want to train
        :param vocab_size: total word vocab size
        :param train_dataloader: train dataset data loader
        :param test_dataloader: test dataset data loader [can be None]
        :param lr: learning rate of optimizer
        :param betas: Adam optimizer betas
        :param weight_decay: Adam optimizer weight decay param
        :param with_cuda: training with CUDA
        :param log_freq: logging frequency of the batch iteration
        """

        # Setup cuda device for BERT training, argument -c, --cuda should be true
        cuda_condition = torch.cuda.is_available() and with_cuda
        self.device = torch.device("cuda:0" if cuda_condition else "cpu")

        # This BERT model will be saved every epoch
        self.bert = bert
        # Initialize the BERT Language Model, with BERT model
        self.model = BERTLM(bert, vocab_size).to(self.device)

        # Distributed GPU training if CUDA can detect more than 1 GPU
        if with_cuda and torch.cuda.device_count() > 1:
            print("Using %d GPUS for BERT" % torch.cuda.device_count())
            self.model = nn.DataParallel(self.model, device_ids=cuda_devices)

        # Setting the train and test data loader
        self.train_data = train_dataloader
        self.test_data = test_dataloader

        # Setting the Adam optimizer with hyper-param
        self.optim = Adam(self.model.parameters(),
                          lr=lr,
                          betas=betas,
                          weight_decay=weight_decay)
        self.optim_schedule = ScheduledOptim(self.optim,
                                             self.bert.hidden,
                                             n_warmup_steps=warmup_steps)

        # Using Negative Log Likelihood Loss function for predicting the masked_token
        self.criterion = nn.NLLLoss(ignore_index=0)

        self.log_freq = log_freq

        print("Total Parameters:",
              sum([p.nelement() for p in self.model.parameters()]))

    def train(self, epoch):
        self.iteration(epoch, self.train_data)

    def test(self, epoch):
        self.iteration(epoch, self.test_data, train=False)

    def iteration(self, epoch, data_loader, train=True):
        """
        loop over the data_loader for training or testing
        if on train status, backward operation is activated
        and also auto save the model every peoch

        :param epoch: current epoch index
        :param data_loader: torch.utils.data.DataLoader for iteration
        :param train: boolean value of is train or test
        :return: None
        """
        str_code = "train" if train else "test"

        # Setting the tqdm progress bar
        data_iter = tqdm.tqdm(enumerate(data_loader),
                              desc="EP_%s:%d" % (str_code, epoch),
                              total=len(data_loader),
                              bar_format="{l_bar}{r_bar}")

        avg_loss = 0.0
        total_correct = 0
        total_element = 0

        for i, data in data_iter:
            # 0. batch_data will be sent into the device(GPU or cpu)
            data = {key: value.to(self.device) for key, value in data.items()}

            # 1. forward the next_sentence_prediction and masked_lm model
            next_sent_output, mask_lm_output = self.model.forward(
                data["bert_input"], data["segment_label"])

            # 2-1. NLL(negative log likelihood) loss of is_next classification result
            next_loss = self.criterion(next_sent_output, data["is_next"])

            # 2-2. NLLLoss of predicting masked token word
            mask_loss = self.criterion(mask_lm_output.transpose(1, 2),
                                       data["bert_label"])

            # 2-3. Adding next_loss and mask_loss : 3.4 Pre-training Procedure
            loss = next_loss + mask_loss

            # 3. backward and optimization only in train
            if train:
                self.optim_schedule.zero_grad()
                loss.backward()
                self.optim_schedule.step_and_update_lr()

            # next sentence prediction accuracy
            correct = next_sent_output.argmax(dim=-1).eq(
                data["is_next"]).sum().item()
            avg_loss += loss.item()
            total_correct += correct
            total_element += data["is_next"].nelement()

            post_fix = {
                "epoch": epoch,
                "iter": i,
                "avg_loss": avg_loss / (i + 1),
                "avg_acc": total_correct / total_element * 100,
                "loss": loss.item()
            }

            if i % self.log_freq == 0:
                data_iter.write(str(post_fix))

        print("EP%d_%s, avg_loss=" % (epoch, str_code),
              avg_loss / len(data_iter), "total_acc=",
              total_correct * 100.0 / total_element)

    def save(self, epoch, file_path="output/bert_trained.model"):
        """
        Saving the current BERT model on file_path

        :param epoch: current epoch number
        :param file_path: model output path which gonna be file_path+"ep%d" % epoch
        :return: final_output_path
        """
        output_path = file_path + ".ep%d" % epoch
        torch.save(self.bert.cpu(), output_path)
        self.bert.to(self.device)
        print("EP:%d Model Saved on:" % epoch, output_path)
        return output_path
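
The ScheduledOptim wrapper used in every example is not shown in these snippets. A common implementation (the inverse-square-root warmup schedule from "Attention Is All You Need", as used in BERT-pytorch-style trainers) looks roughly like this; the actual class used above may differ in details.

import numpy as np


class ScheduledOptim:
    """Wraps an optimizer and scales its learning rate with warmup (sketch)."""

    def __init__(self, optimizer, d_model, n_warmup_steps):
        self._optimizer = optimizer
        self.n_warmup_steps = n_warmup_steps
        self.n_current_steps = 0
        self.init_lr = np.power(d_model, -0.5)

    def zero_grad(self):
        self._optimizer.zero_grad()

    def step_and_update_lr(self):
        self._update_learning_rate()
        self._optimizer.step()

    def _get_lr_scale(self):
        # Linear warmup for n_warmup_steps, then inverse square-root decay.
        return np.min([
            np.power(self.n_current_steps, -0.5),
            np.power(self.n_warmup_steps, -1.5) * self.n_current_steps,
        ])

    def _update_learning_rate(self):
        self.n_current_steps += 1
        lr = self.init_lr * self._get_lr_scale()
        for param_group in self._optimizer.param_groups:
            param_group['lr'] = lr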
Example #4
class BERTTrainerFull(BasicTrainer):
    def __init__(self, bert: BERTFull, vocab_size: int, epochs: int,
                 tensorboard_log_dir: str, output_path: str,
                 train_dataloader: DataLoader,
                 lr: float = 1e-7, betas=(0.9, 0.999), weight_decay: float = 0.01, warmup_steps=10000,
                 lambda_beta: float = 1e-2,
                 with_cuda: bool = True, log_freq: int = 10, save_steps: int = -1):

        super(BERTTrainerFull, self).__init__(bert=bert, epochs=epochs,
                                              tensorboard_log_dir=tensorboard_log_dir,
                                              output_path=output_path,
                                              train_dataloader=train_dataloader,
                                              with_cuda=with_cuda, log_freq=log_freq, save_steps=save_steps)

        self.model = BERTLMFull(bert, vocab_size).to(self.device)
        self.lambda_beta = lambda_beta

        if with_cuda and torch.cuda.device_count() > 1:
            print("Using %d GPUS for BERT" % torch.cuda.device_count())
            self.model = nn.DataParallel(self.model)

        # Setting the Adam optimizer with hyper-param
        self.optim = Adam(self.model.parameters(), lr=lr, betas=betas, weight_decay=weight_decay)
        self.optim_schedule = ScheduledOptim(self.optim, self.bert.hidden, n_warmup_steps=warmup_steps)
        print("Total Parameters:", sum([p.nelement() for p in self.model.parameters()]))

    def iteration(self, epoch, data_loader, pos_mask):
        data_iter = tqdm.tqdm(enumerate(data_loader),
                              desc="EP:%d" % epoch,
                              total=self.n_batches,
                              bar_format="{l_bar}{r_bar}",
                              disable=False)

        avg_loss = 0.0
        for i, data in data_iter:
            global_step = epoch * self.n_batches + i + 1

            data = {key: value.to(self.device) for key, value in data.items()}

            mask_tp_output, mask_lm_output = self.model.forward(data["token_input"], data["time_input"], pos_mask)

            # 2-2. NLLLoss for predicting the masked token ids
            mask_lm_loss = self.mask_lm_criterion(mask_lm_output.transpose(1, 2), data["token_label"])
            # 2-3. Loss of the masked time-prediction head
            mask_tp_loss = get_mask_time_prediction_loss(mask_tp_output, data["time_label"])
            # 2-4. Combine the two objectives, weighting the time loss by lambda_beta
            loss = mask_lm_loss + self.lambda_beta * mask_tp_loss

            # 3. backward and optimization
            self.optim_schedule.zero_grad()
            loss.backward()
            self.optim_schedule.step_and_update_lr()

            avg_loss += loss.item()

            self.tensorborad_writer.add_scalar("Masked_language_model loss", mask_lm_loss.item(), global_step)
            self.tensorborad_writer.add_scalar("Masked_time_prediction loss", mask_tp_loss.item(), global_step)
            self.tensorborad_writer.add_scalar("Average loss in epoch", avg_loss / (i + 1), global_step)

            post_fix = {
                "epoch": epoch,
                "iter": i+1,
                "avg_loss": avg_loss / (i + 1),
                "loss": loss.item()
            }

            if (i+1) % self.log_freq == 0:
                data_iter.write(str(post_fix))

            if self.save_steps > 0 and ((i + 1) % self.save_steps == 0 or (i + 1) == self.n_batches):
                self.save(epoch, i + 1)
Example #5
class BERTTrainer(BasicTrainer):
    def __init__(self, bert: BERT, vocab_size: int, epochs: int,
                 tensorboard_log_dir: str, output_path: str,
                 train_dataloader: DataLoader,
                 lr: float = 1e-4, betas=(0.9, 0.999), weight_decay: float = 0.01, warmup_steps=10000,
                 with_cuda: bool = True, log_freq: int = 10, save_steps: int = -1):

        super(BERTTrainer, self).__init__(bert=bert, epochs=epochs,
                                          tensorboard_log_dir=tensorboard_log_dir,
                                          output_path=output_path,
                                          train_dataloader=train_dataloader,
                                          with_cuda=with_cuda, log_freq=log_freq, save_steps=save_steps)

        self.model = BERTLM(bert, vocab_size).to(self.device)

        if with_cuda and torch.cuda.device_count() > 1:
            print("Using %d GPUS for BERT" % torch.cuda.device_count())
            self.model = nn.DataParallel(self.model)

        # Setting the Adam optimizer with hyper-param
        self.optim = Adam(self.model.parameters(), lr=lr, betas=betas, weight_decay=weight_decay)
        self.optim_schedule = ScheduledOptim(self.optim, self.bert.hidden, n_warmup_steps=warmup_steps)
        print("Total Parameters:", sum([p.nelement() for p in self.model.parameters()]))

    def iteration(self, epoch, data_loader):
        data_iter = tqdm.tqdm(enumerate(data_loader),
                              desc="EP:%d" % epoch,
                              total=self.n_batches,
                              bar_format="{l_bar}{r_bar}",
                              disable=False)

        avg_loss = 0.0
        for i, data in data_iter:
            global_step = epoch * self.n_batches + i + 1

            data = {key: value.to(self.device) for key, value in data.items()}

            mask_lm_output = self.model.forward(data["bert_input"])

            mask_loss = self.criterion(mask_lm_output.transpose(1, 2), data["bert_label"])

            loss = mask_loss

            self.optim_schedule.zero_grad()
            loss.backward()
            self.optim_schedule.step_and_update_lr()

            avg_loss += loss.item()

            self.tensorborad_writer.add_scalar("Masked_language_model loss", mask_loss.item(), global_step)
            self.tensorborad_writer.add_scalar("Average loss in epoch", avg_loss / (i + 1), global_step)

            post_fix = {
                "epoch": epoch,
                "iter": i+1,
                "avg_loss": avg_loss / (i + 1),
                "loss": loss.item()
            }

            if (i+1) % self.log_freq == 0:
                data_iter.write(str(post_fix))

            if self.save_steps > 0 and ((i + 1) % self.save_steps == 0 or (i + 1) == self.n_batches):
                self.save(epoch, i + 1)
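
Example #3 builds its masked-LM criterion as nn.NLLLoss(ignore_index=0); Examples #4 and #5 presumably do the same in their BasicTrainer base class, which is not shown here. With ignore_index=0, padding and unmasked positions (labelled 0 in the usual BERT data pipeline) contribute nothing to the loss. A small self-contained illustration:

import torch
import torch.nn as nn

batch_size, seq_len, vocab_size = 2, 6, 50

# Log-probabilities from the masked-LM head, shape [batch, seq_len, vocab].
mask_lm_output = torch.log_softmax(torch.randn(batch_size, seq_len, vocab_size), dim=-1)

# Labels are 0 everywhere except the masked positions, which hold the true token id.
bert_label = torch.zeros(batch_size, seq_len, dtype=torch.long)
bert_label[0, 2] = 17   # pretend position 2 of sample 0 was masked
bert_label[1, 4] = 33   # pretend position 4 of sample 1 was masked

criterion = nn.NLLLoss(ignore_index=0)
# transpose(1, 2) gives the (N, C, L) layout NLLLoss expects; only the two
# masked positions contribute to the loss.
mask_loss = criterion(mask_lm_output.transpose(1, 2), bert_label)
print(mask_loss.item())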