Example #1
    def get_optimizers(
        self, num_training_steps: int
    ) -> Tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR]:
        # Prepare optimizer and schedule (linear warmup and decay)
        no_decay = ["bias", "LayerNorm.weight"]
        optimizer_grouped_parameters = [
            {
                "params": [p for n, p in self.model.named_parameters() if not any(nd in n for nd in no_decay)],
                "weight_decay": self.args.weight_decay,
            },
            {
                "params": [p for n, p in self.model.named_parameters() if any(nd in n for nd in no_decay)],
                "weight_decay": 0.0,
            },
        ]
        optimizer = AdamW(optimizer_grouped_parameters, lr=self.args.learning_rate, eps=self.args.adam_epsilon)
        if self.args.warmup_steps > 0:
            logger.info("***** Linear warmup over %d warmup steps *****", self.args.warmup_steps)
            scheduler = get_linear_schedule_with_warmup(
                optimizer, num_warmup_steps=self.args.warmup_steps, num_training_steps=num_training_steps
            )
        else:
            logger.info("***** Linear warmup over %.1f%% of training *****", self.args.warmup_proportion * 100)
            scheduler = get_linear_schedule_with_warmup(
                optimizer,
                num_warmup_steps=int(self.args.warmup_proportion * num_training_steps),
                num_training_steps=num_training_steps,
            )

        return optimizer, scheduler
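All of the examples on this page follow the same pattern: build an optimizer over (possibly grouped) parameters, wrap it with get_linear_schedule_with_warmup, and call scheduler.step() once after every optimizer.step(). A minimal, self-contained sketch of that loop with a toy model and random data (illustrative only, not taken from any repository shown here):

import torch
from torch import nn
from transformers import get_linear_schedule_with_warmup

model = nn.Linear(10, 2)
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)

epochs, steps_per_epoch = 3, 100
num_training_steps = epochs * steps_per_epoch
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=int(0.1 * num_training_steps),  # 10% linear warmup
    num_training_steps=num_training_steps,
)

loss_fn = nn.CrossEntropyLoss()
for _ in range(epochs):
    for _ in range(steps_per_epoch):
        inputs, targets = torch.randn(8, 10), torch.randint(0, 2, (8,))
        optimizer.zero_grad()
        loss_fn(model(inputs), targets).backward()
        optimizer.step()
        scheduler.step()  # LR ramps up during warmup, then decays linearly to 0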
Example #2
def init_schedule(config, optimizer, train_loader, fp16=False):
    t_total = len(train_loader) * config.epochs
    warmup_steps = int(t_total * config.warmup_ratio)
    if fp16:
        # Wrap the optimizer for mixed-precision training; the schedule is
        # attached to the inner optimizer that actually updates the parameters.
        optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
        scheduler = get_linear_schedule_with_warmup(
            optimizer.optimizer,
            num_warmup_steps=warmup_steps,
            num_training_steps=t_total)
    else:
        scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=warmup_steps,
            num_training_steps=t_total)
    return scheduler
Example #3
def start_train(args):
    '''data loaders'''
    train_token_ids, train_att_masks, train_labels = load_tokens(args.train_sent_tokens, args.train_att_tokens, args.train_labels)
    train_dataset = TensorDataset(train_token_ids, train_att_masks, train_labels.reshape(-1,1))
    train_loader = DataLoader(train_dataset, batch_size=args.batch_size, shuffle=True, num_workers=4)
    
    test_token_ids, test_att_masks, test_labels = load_tokens(args.test_sent_tokens, args.test_att_tokens, args.test_labels)
    test_dataset = TensorDataset(test_token_ids, test_att_masks, test_labels.reshape(-1,1))
    test_loader = DataLoader(test_dataset, batch_size=args.batch_size, shuffle=False, num_workers=4)

    '''model setup'''
    model = BertAttractor()
    optimizer = optim.AdamW(model.parameters(), lr=1e-5)

    num_train_steps = int(len(train_dataset) / args.batch_size * args.epochs)
    scheduler = get_linear_schedule_with_warmup(optimizer,
                                                num_warmup_steps=0,
                                                num_training_steps=num_train_steps)

    '''init model'''
    if args.init:
        print(f'Initializing model from {args.init}')
        checkpoint = torch.load(args.init)
        model.load_state_dict(checkpoint['model_state_dict'])

    '''device allocation'''
    if torch.cuda.is_available():
        print('Using gpu')
        device = torch.device('cuda:1')
    else:
        print('Cannot train without GPU')
        sys.exit()

    train_acc, train_loss = [], []
    test_acc, test_loss = [], []
    start_epoch = 0
    if args.resume:
        print(f'Resuming model from {args.resume}')
        checkpoint = torch.load(args.resume)
        start_epoch = checkpoint['epoch']
        model.load_state_dict(checkpoint['model_state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
        train_acc, train_loss = checkpoint['train_acc'], checkpoint['train_loss']
        test_acc, test_loss = checkpoint['test_acc'], checkpoint['test_loss']
        for state in optimizer.state.values():
            for key, value in state.items():
                if isinstance(value, torch.Tensor):
                    state[key] = value.to(device)

    '''freeze bert model'''
#    for param in model.parameters():
#        param.requires_grad = False
#    for param in model.bert.classifier.parameters():
#        param.requires_grad = True
#    for param in model.attractor.parameters():
#        param.requires_grad = True

    '''train the model'''
    train_model(model, train_loader, test_loader, optimizer, device, start_epoch, args.epochs, 
                train_acc, train_loss, test_acc, test_loss, args.out_dir, scheduler)
Example #4
def fit(model,
        epochs,
        train_loader,
        valid_loader,
        lr=1e-3,
        class_weights=None):
    num_training_steps = len(train_loader) * epochs
    num_warmup_steps = int(0.1 * num_training_steps)  # assumption: warm up over 10% of training
    criterion = nn.CrossEntropyLoss(
        weight=class_weights)  # see which loss to use for multi-label
    optimizer = optim.AdamW(model.parameters(), lr=lr)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=num_warmup_steps,
        num_training_steps=num_training_steps)
    for epoch in range(epochs):
        model.train()
        total_loss = num = 0
        for x, y in train_loader:
            optimizer.zero_grad()
            y_scores = model(x)
            loss = criterion(y_scores, y)
            loss.backward()
            optimizer.step()
            scheduler.step()
            total_loss += loss.item()
            num += len(y)
        print(epoch, total_loss / num, *perf(model, valid_loader))
Example #5
    def get_optimizers(
        self, num_training_steps: int
    ) -> Tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR]:
        """
        Setup the optimizer and the learning rate scheduler.

        We provide a reasonable default that works well.
        If you want to use something else, you can pass a tuple in the Trainer's init,
        or override this method in a subclass.
        """
        if self.optimizers is not None:
            return self.optimizers
        # Prepare optimizer and schedule (linear warmup and decay)
        no_decay = ["bias", "LayerNorm.weight"]
        optimizer_grouped_parameters = [
            {
                "params": [p for n, p in self.model.named_parameters() if "relational_transformer" not in n and not any(nd in n for nd in no_decay)],
                "weight_decay": self.args.weight_decay,
            },
            {
                "params": [p for n, p in self.model.named_parameters() if any(nd in n for nd in no_decay)],
                "weight_decay": 0.0,
            },
            {
                "params": [p for n, p in self.model.named_parameters() if "relational_transformer" in n and not any(nd in n for nd in no_decay)],
                "weight_decay": self.args.weight_decay,
                "lr": 7e-5
            }
        ]
        optimizer = AdamW(optimizer_grouped_parameters, lr=self.args.learning_rate, eps=self.args.adam_epsilon)
        scheduler = get_linear_schedule_with_warmup(
            optimizer, num_warmup_steps=self.args.warmup_steps, num_training_steps=num_training_steps
        )
        return optimizer, scheduler
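Example #5 above gives the relational_transformer parameters their own learning rate ("lr": 7e-5) on top of the optimizer-level default. A quick self-contained check (toy modules, not the classes above) that per-group entries override the defaults passed to AdamW:

import torch
from torch import nn

base, extra = nn.Linear(4, 4), nn.Linear(4, 4)
optimizer = torch.optim.AdamW(
    [
        {"params": base.parameters(), "weight_decay": 0.01},
        {"params": extra.parameters(), "weight_decay": 0.01, "lr": 7e-5},
    ],
    lr=5e-5,
)
# The first group falls back to the default lr; the second keeps its override.
print([group["lr"] for group in optimizer.param_groups])  # [5e-05, 7e-05]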
Example #6
 def get_optimizer(self, num_training_steps: int):
     no_decay = ["bias", "LayerNorm.weight"]
     optimizer_grouped_parameters = [
         {
             "params": [
                 p for n, p in self.model.named_parameters()
                 if not any(nd in n for nd in no_decay)
             ],
             "weight_decay":
             self.args.weight_decay,
         },
         {
             "params": [
                 p for n, p in self.model.named_parameters()
                 if any(nd in n for nd in no_decay)
             ],
             "weight_decay":
             0.0,
         },
     ]
     optimizer = AdamW(optimizer_grouped_parameters,
                       lr=self.args.learning_rate,
                       eps=self.args.adam_epsilon)
     scheduler = get_linear_schedule_with_warmup(
         optimizer,
         num_warmup_steps=self.args.warmup_steps,
         num_training_steps=num_training_steps)
     return optimizer, scheduler
Example #7
 def get_optimizers(
     self, num_training_steps: int
 ) -> Tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR]:
     if self.optimizers is not None:
         return self.optimizers
     # Prepare optimizer and schedule (linear warmup and decay)
     no_decay = ["bias", "LayerNorm.weight"]
     optimizer_grouped_parameters = [{
         "params": [
             p for n, p in self.model.named_parameters()
             if not any(nd in n for nd in no_decay)
         ],
         "weight_decay":
         self.args.weight_decay,
     }, {
         "params": [
             p for n, p in self.model.named_parameters()
             if any(nd in n for nd in no_decay)
         ],
         "weight_decay":
         0.0,
     }, {
         "params": self.fc.parameters(),
         "weight_decay": self.args.weight_decay
     }]
     optimizer = AdamW(optimizer_grouped_parameters,
                       lr=self.args.learning_rate,
                       eps=self.args.adam_epsilon)
     scheduler = get_linear_schedule_with_warmup(
         optimizer,
         num_warmup_steps=self.args.warmup_steps,
         num_training_steps=num_training_steps)
     return optimizer, scheduler
Example #8
 def get_optimizers(self):
     # Setup the optimizer and the learning rate scheduler.
     no_decay = ["bias", "LayerNorm.weight"]
     optimizer_grouped_parameters = [
         {
             "params": [
                 p for n, p in self.model.named_parameters()
                 if not any(nd in n for nd in no_decay)
             ],
             "weight_decay":
             cfg.weight_decay,
         },
         {
             "params": [
                 p for n, p in self.model.named_parameters()
                 if any(nd in n for nd in no_decay)
             ],
             "weight_decay":
             0.0,
         },
     ]
     optimizer = AdamW(optimizer_grouped_parameters, lr=cfg.lr)
     num_training_steps = self.reader.set_stats['train']['num_dials'] * \
                          cfg.epoch_num // (cfg.gradient_accumulation_steps * cfg.batch_size)
     num_warmup_steps = cfg.warmup_steps if cfg.warmup_steps >= 0 else int(
         num_training_steps * 0.2)
     scheduler = get_linear_schedule_with_warmup(
         optimizer,
         num_warmup_steps=num_warmup_steps,
         num_training_steps=num_training_steps)
     return optimizer, scheduler
Example #9
def get_optimizers(model, learning_rate, adam_epsilon, weight_decay,
                   num_training_steps):
    # Prepare optimizer and schedule (linear warmup and decay)
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [
                p for n, p in model.named_parameters()
                if not any(nd in n for nd in no_decay)
            ],
            "weight_decay":
            weight_decay
        },
        {
            "params": [
                p for n, p in model.named_parameters()
                if any(nd in n for nd in no_decay)
            ],
            "weight_decay":
            0.0
        },
    ]
    optimizer = AdamW(optimizer_grouped_parameters,
                      lr=learning_rate,
                      eps=adam_epsilon)

    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)
    return optimizer, scheduler
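This example passes num_warmup_steps=0 (as do Examples #17, #20 and #26 below), which turns the schedule into a plain linear decay from the initial learning rate down to zero over num_training_steps. A toy check of the resulting values (illustrative only):

import torch
from transformers import get_linear_schedule_with_warmup

optimizer = torch.optim.AdamW([torch.nn.Parameter(torch.zeros(1))], lr=1e-3)
scheduler = get_linear_schedule_with_warmup(
    optimizer, num_warmup_steps=0, num_training_steps=4)

lrs = []
for _ in range(4):
    lrs.append(optimizer.param_groups[0]["lr"])
    optimizer.step()
    scheduler.step()
print([round(lr, 6) for lr in lrs])  # [0.001, 0.00075, 0.0005, 0.00025]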
Example #10
 def create_scheduler(self, num_training_steps: int):
     scheduler = get_linear_schedule_with_warmup(
         self.optimizer,
         num_warmup_steps=self.args.warmup_steps,
         num_training_steps=num_training_steps,
     )
     return scheduler
Example #11
def build_default_model(args):
    """
    自定义模型
    规格要求返回模型(model)、优化器(optimizer)、调度器(scheduler)三元组。
    """

    # -------- model --------
    model = load_pretrained_model(args)
    model.to(args.device)

    # -------- optimizer --------
    from transformers.optimization import AdamW
    optimizer_parameters = get_default_optimizer_parameters(
        model, args.weight_decay)
    optimizer = AdamW(optimizer_parameters,
                      lr=args.learning_rate,
                      eps=args.adam_epsilon,
                      correct_bias=False)

    # -------- scheduler --------
    from transformers.optimization import get_linear_schedule_with_warmup

    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=args.total_steps * args.warmup_rate,
        num_training_steps=args.total_steps)

    return model, optimizer, scheduler
Example #12
    def configure_optimizers(self):
        no_decay = ["bias", "LayerNorm.weight"]

        optimizer_grouped_parameters = [
            {
                "params": [
                    p for n, p in self.named_parameters()
                    if not any(nd in n for nd in no_decay) and p.requires_grad
                ],
                "weight_decay":
                self.args.weight_decay,
            },
            {
                "params": [
                    p for n, p in self.named_parameters()
                    if any(nd in n for nd in no_decay) and p.requires_grad
                ],
                "weight_decay":
                0.0,
            },
        ]
        optimizer = AdamW(optimizer_grouped_parameters,
                          lr=self.args.lr,
                          eps=self.args.adam_epsilon)
        scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=self.args.warmup_steps,
            num_training_steps=self.args.train_steps)
        return [optimizer], [{"scheduler": scheduler, "interval": "step"}]
Example #13
def build_optimizer_scheduler(model, num_train_steps, learning_rate):
    optimizer = AdamW(
        model.parameters(), lr=learning_rate, correct_bias=False)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=100, 
        num_training_steps=num_train_steps)

    return optimizer, scheduler
Example #14
    def train(self, epochs):
        """
        Runs the training.
        """
        pretrained_model = self.config.get("pretrained_mtb_model", None)
        pretrained_model = ("pretrained"
                            if pretrained_model else "no_pretraining")
        no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
        optimizer_grouped_parameters = [
            {
                "params": [
                    p for n, p in self.model.named_parameters()
                    if not any(nd in n for nd in no_decay)
                ],
                "weight_decay":
                0.01,
            },
            {
                "params": [
                    p for n, p in self.model.named_parameters()
                    if any(nd in n for nd in no_decay)
                ],
                "weight_decay":
                0.0,
            },
        ]
        optimizer = AdamW(optimizer_grouped_parameters,
                          lr=self.config.get("lr"))
        ovr_steps = (epochs * len(self.data_loader.train_generator) *
                     self.config.get("mini_batch_size") /
                     self.config.get("batch_size"))
        scheduler = get_linear_schedule_with_warmup(optimizer, ovr_steps // 10,
                                                    ovr_steps)

        results_path = os.path.join("results", "sem_eval", pretrained_model,
                                    str(epochs))
        best_model_path = os.path.join(self.checkpoint_dir,
                                       "best_model.pth.tar")
        resume = self.config.get("resume", False)
        if resume and os.path.exists(best_model_path):
            (
                self._start_epoch,
                self._best_test_f1,
                self._train_loss,
                self._train_acc,
                self._test_f1,
                self._train_acc,
            ) = self.load_best_model(self.checkpoint_dir)

        logger.info("Starting training process")
        pad_id = self.tokenizer.pad_token_id
        for epoch in range(self._start_epoch, epochs):
            self._train_epoch(epoch, pad_id, optimizer, scheduler)
            data = self._write_kpis(results_path)
            self._plot_results(data, results_path)

        logger.info("Finished Training.")
        return self.model
Example #15
 def create_optimizer_and_scheduler(self, num_training_steps: int):
     """
     Based on Transformers' default one, we add fixing layer option where the bottom n layers' parameters
     are fixed and only the top layers are further fine-tuned.
     """
     if self.optimizer is None:
         params = {}
         for n, p in self.model.named_parameters():
             if self.args.fix_layers > 0:
                 if 'encoder.layer' in n:
                     try:
                         layer_num = int(n[n.find('encoder.layer') +
                                           14:].split('.')[0])
                     except:
                         print(n)
                         raise Exception("")
                     if layer_num >= self.args.fix_layers:
                         print('yes', n)
                         params[n] = p
                     else:
                         print('no ', n)
                 elif 'embeddings' in n:
                     print('no ', n)
                 else:
                     print('yes', n)
                     params[n] = p
             else:
                 params[n] = p
         no_decay = ["bias", "LayerNorm.weight"]
         optimizer_grouped_parameters = [
             {
                 "params": [
                     p for n, p in params.items()
                     if not any(nd in n for nd in no_decay)
                 ],
                 "weight_decay":
                 self.args.weight_decay,
             },
             {
                 "params": [
                     p for n, p in params.items()
                     if any(nd in n for nd in no_decay)
                 ],
                 "weight_decay":
                 0.0,
             },
         ]
         self.optimizer = AdamW(
             optimizer_grouped_parameters,
             lr=self.args.learning_rate,
             betas=(self.args.adam_beta1, self.args.adam_beta2),
             eps=self.args.adam_epsilon,
         )
     if self.lr_scheduler is None:
         self.lr_scheduler = get_linear_schedule_with_warmup(
             self.optimizer,
             num_warmup_steps=self.args.warmup_steps,
             num_training_steps=num_training_steps)
Example #16
 def configure_optimizers(self):
     optimizer = torch.optim.Adam(self.parameters(), lr=1e-5)
     dataset_size = self.train_dataloader.dataloader.dataset.__len__()
     num_steps = dataset_size * self.args.epochs / self.args.grad_accum / self.args.batch_size
     scheduler = get_linear_schedule_with_warmup(
         optimizer,
         num_warmup_steps=num_steps * 0.1,
         num_training_steps=num_steps)
     return [optimizer], [{"scheduler": scheduler, "interval": "step"}]
Example #17
def run_train():
    data_dir = config.DATA_DIR
    nerProcessor = NerProcessor()
    train_example = nerProcessor.get_train_examples(data_dir)
    label_list = nerProcessor.get_labels()
    tokenizer = transformers.BertTokenizer.from_pretrained(
        config.BERT_TOKENIZER_PATH)
    train_features = convert_examples_to_features(train_example, label_list,
                                                  config.MAX_SEQ_LEN,
                                                  tokenizer)

    input_ids = torch.tensor([f["input_ids"] for f in train_features],
                             dtype=torch.long)
    attention_mask = torch.tensor(
        [f["attention_mask"] for f in train_features], dtype=torch.long)
    token_type_ids = torch.tensor(
        [f["token_type_ids"] for f in train_features], dtype=torch.long)
    label_ids = torch.tensor([f["label_ids"] for f in train_features],
                             dtype=torch.long)
    label_ids = F.one_hot(label_ids).float()

    train_dataset = TensorDataset(input_ids, attention_mask, token_type_ids,
                                  label_ids)
    sampler = SequentialSampler(train_dataset)
    train_dataloader = DataLoader(train_dataset,
                                  sampler=sampler,
                                  batch_size=config.TRAIN_BATCH_SIZE)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = BertNER(config.BERT_MODEL_PATH, len(label_list) + 1)
    model.to(device)

    optimizer = AdamW(model.parameters(), lr=config.LEARNING_RATE)
    num_training_step = len(
        train_dataset) // config.TRAIN_BATCH_SIZE * config.TRAIN_EPOCHS
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=0, num_training_steps=num_training_step)

    for epoch in range(config.TRAIN_EPOCHS):
        train_fn(model, device, train_dataloader, optimizer, scheduler)

        model_to_save = model.module if hasattr(model, "module") else model
        model_save_path = os.path.join(f"{config.BERT_OUTPUT}/{epoch+1}",
                                       WEIGHTS_NAME)
        torch.save(model_to_save.state_dict(), model_save_path)
        tokenizer.save_vocabulary(f"{config.BERT_OUTPUT}/{epoch+1}/vocab.txt")

    model_to_save = model.module if hasattr(model, "module") else model
    model_save_path = os.path.join(config.BERT_OUTPUT, WEIGHTS_NAME)
    torch.save(model_to_save.state_dict(), model_save_path)
    tokenizer.save_vocabulary(f"{config.BERT_OUTPUT}/vocab.txt")
Example #18
    def finetune(features, optimizer, num_epoch, num_steps):
        best_score = -1
        train_dataloader = DataLoader(features,
                                      batch_size=args.train_batch_size,
                                      shuffle=True,
                                      collate_fn=collate_fn,
                                      drop_last=True)
        train_iterator = range(int(num_epoch))
        total_steps = int(len(train_dataloader) * num_epoch)
        warmup_steps = int(total_steps * args.warmup_ratio)
        scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=warmup_steps,
            num_training_steps=total_steps)
        print("Total steps: {}".format(total_steps))
        print("Warmup steps: {}".format(warmup_steps))
        for epoch in train_iterator:
            for step, batch in enumerate(train_dataloader):
                num_steps += 1
                model.train()
                inputs = {
                    'input_ids': batch[0].to(args.device),
                    'attention_mask': batch[1].to(args.device),
                    'labels': batch[2],
                    'entity_pos': batch[3],
                    'hts': batch[4],
                    'index': batch[5].to(args.device),
                }
                outputs = model(**inputs)
                loss = outputs[0]
                if args.n_gpu > 1:
                    loss = loss.mean()
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
                if args.max_grad_norm > 0:
                    torch.nn.utils.clip_grad_norm_(
                        amp.master_params(optimizer), args.max_grad_norm)
                optimizer.step()
                scheduler.step()
                model.zero_grad()
                wandb.log({"loss": loss.item()}, step=num_steps)
                if (step + 1) == len(train_dataloader) - 1:
                    dev_score, dev_output = evaluate(args,
                                                     model,
                                                     dev_features,
                                                     tag="dev")
                    wandb.log(dev_output, step=num_steps)
                    print(dev_output)
                    if dev_score > best_score:
                        best_score = dev_score
                        if test_features is not None:
                            pred = test(args, model, test_features)
                            with open("result.json", "w") as fh:
                                json.dump(pred, fh)

        return num_steps
Example #19
def get_scheduler(optimizer, len_train_data):
    batch_size = config_dict["accumulated_batch_size"]
    epochs = config_dict["epochs"]

    num_train_steps = int(len_train_data / batch_size) * epochs
    num_warmup_steps = int(num_train_steps * config_dict["warmup_proportion"])

    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps,
                                                num_train_steps)
    return scheduler
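Several examples (#8, #16, #19, #27, #29) derive num_training_steps and num_warmup_steps from the dataset size, batch size, gradient accumulation and number of epochs. A short sketch of that arithmetic with made-up numbers:

num_batches_per_epoch = 1250          # e.g. len(train_loader)
gradient_accumulation_steps = 4
epochs = 3
warmup_proportion = 0.1

# The scheduler is stepped once per optimizer step, so divide the batch count
# by the accumulation factor before multiplying by the number of epochs.
num_training_steps = num_batches_per_epoch // gradient_accumulation_steps * epochs
num_warmup_steps = int(num_training_steps * warmup_proportion)
print(num_training_steps, num_warmup_steps)  # 936 93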
Example #20
 def configure_optimizers(self):
     if (self.trial):
         lr = self.trial.suggest_loguniform('learning_rate', 2e-5, 5e-5)
         self.logger.log_hyperparams({'lr': lr})
         optimizer = AdamW(self.parameters(), lr=lr, correct_bias=False)
     else:
         optimizer = AdamW(self.parameters(), lr=2e-5, correct_bias=False)
     scheduler = get_linear_schedule_with_warmup(
         optimizer, num_warmup_steps=0, num_training_steps=self.total_steps)
     return [optimizer], [scheduler]
Example #21
    def __init__(self, dataset, batch_size=32):
        """Creates a new model for sentiment analysis using BERT."""
        # The pretrained weights to use.
        pretrained_weights = 'bert-base-uncased'

        # Create the transform used to convert text to indexed tokens.
        transformer = BertTransform(62, pretrained_weights)

        # Setup the train loader
        train_dataset = dataset('./',
                                train=True,
                                transforms=DataToTensor(),
                                vectorizer=transformer,
                                download=True)
        self.train_loader = DataLoader(train_dataset,
                                       batch_size=batch_size,
                                       shuffle=False)

        # Setup the validation loader
        val_dataset = dataset('./',
                              train=False,
                              transforms=DataToTensor(),
                              vectorizer=transformer,
                              download=True)
        self.val_loader = DataLoader(val_dataset,
                                     batch_size=batch_size,
                                     shuffle=False)

        # Retrieve the CUDA device if available, otherwise fall back to the CPU
        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")
        device_name = torch.cuda.get_device_name(
            0) if torch.cuda.is_available() else "CPU"
        print("Training on:", device_name)

        # Load the pretrained BERT model with a classification layer
        self.model = BertForSequenceClassification.from_pretrained(
            pretrained_weights, num_labels=2)
        self.model.to(self.device)

        # Set the learning rate
        self.lr = 1e-5

        # Set the optimizer and scheduler
        training_steps = len(train_dataset) // batch_size
        self.optimizer = optim.AdamW(self.model.parameters(),
                                     lr=self.lr,
                                     correct_bias=False)
        # Warm up over 10% of the training steps
        self.scheduler = optim.get_linear_schedule_with_warmup(
            self.optimizer,
            num_warmup_steps=int(0.1 * training_steps),
            num_training_steps=training_steps)

        # Maximum gradient norm (used for gradient clipping)
        self.max_grad_norm = 1.0
Example #22
def get_linear_schedule_with_warmup_frac(optimizer,
                                         num_training_steps,
                                         num_warmup_steps=0,
                                         frac_training_steps=0,
                                         last_epoch=-1):
    # When a warmup fraction is given, derive the warmup steps from it;
    # otherwise fall back to the explicit num_warmup_steps.
    if frac_training_steps:
        num_warmup_steps = int(num_training_steps * frac_training_steps)
    return get_linear_schedule_with_warmup(
        optimizer,
        num_training_steps=num_training_steps,
        num_warmup_steps=num_warmup_steps,
        last_epoch=last_epoch)
Example #23
 def get_lr_scheduler(self, opt):
     scheduler = get_linear_schedule_with_warmup(
         opt,
         num_warmup_steps=self.hparams.warmup_steps,
         num_training_steps=self.train_steps)
     scheduler = {
         "scheduler": scheduler,
         "interval": "step",
         "frequency": 1
     }
     return scheduler
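Examples #12, #16, #23 and #27 return the scheduler to PyTorch Lightning as a dict with "interval": "step", which makes Lightning call scheduler.step() after every optimizer step rather than once per epoch. A minimal sketch of that pattern (assumes pytorch_lightning; the toy module below is not from any of the repositories above):

import torch
from torch import nn
import pytorch_lightning as pl
from transformers import get_linear_schedule_with_warmup


class ToyModule(pl.LightningModule):
    def __init__(self, train_steps=1000, warmup_steps=100):
        super().__init__()
        self.net = nn.Linear(10, 2)
        self.train_steps = train_steps
        self.warmup_steps = warmup_steps

    def training_step(self, batch, batch_idx):
        x, y = batch
        return nn.functional.cross_entropy(self.net(x), y)

    def configure_optimizers(self):
        optimizer = torch.optim.AdamW(self.parameters(), lr=2e-5)
        scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=self.warmup_steps,
            num_training_steps=self.train_steps)
        # "interval": "step" -> advance the LR schedule every training step.
        return [optimizer], [{"scheduler": scheduler, "interval": "step"}]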
Example #24
    def pre_train_bert(self):
        tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
        optimizer = optim.Adam(self.bert_model.parameters(), 2e-5)
        scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=31, num_training_steps=310)
        step = 0
        train_dataloader = get_bert_lm_dataloader(self.lm_file_path, 64)
        print("Training LM")
        if torch.cuda.is_available():
            self.bert_model.cuda()
        for epoch in range(2):
            print("Epoch : " + str(epoch))
            for ind, batch in enumerate(train_dataloader):
                step += 1

                optimizer.zero_grad()
                if torch.cuda.is_available():
                    inp = batch[0].cuda()
                else:
                    inp = batch[0]

                labels = inp.clone()
                # We sample a few tokens in each sequence for masked-LM training (with probability args.mlm_probability defaults to 0.15 in Bert/RoBERTa)
                probability_matrix = torch.full(labels.shape, 0.15)
                special_tokens_mask = [
                    tokenizer.get_special_tokens_mask(val, already_has_special_tokens=True) for val in
                    labels.tolist()
                ]
                probability_matrix.masked_fill_(torch.tensor(special_tokens_mask, dtype=torch.bool), value=0.0)
                if tokenizer._pad_token is not None:
                    padding_mask = labels.eq(tokenizer.pad_token_id)
                    padding_mask = padding_mask.detach().cpu()
                    probability_matrix.masked_fill_(padding_mask, value=0.0)
                masked_indices = torch.bernoulli(probability_matrix).bool()
                labels[~masked_indices] = -100  # We only compute loss on masked tokens

                # 80% of the time, we replace masked input tokens with tokenizer.mask_token ([MASK])
                indices_replaced = torch.bernoulli(torch.full(labels.shape, 0.8)).bool() & masked_indices
                inp[indices_replaced] = tokenizer.convert_tokens_to_ids(tokenizer.mask_token)

                # 10% of the time, we replace masked input tokens with random word
                indices_random = torch.bernoulli(
                    torch.full(labels.shape, 0.5)).bool() & masked_indices & ~indices_replaced
                random_words = torch.randint(len(tokenizer), labels.shape, dtype=torch.long)
                inp[indices_random] = random_words[indices_random]
                outputs = self.bert_model(inp, masked_lm_labels=labels.long(),attention_mask=(inp!=tokenizer.pad_token_id).long())
                loss, prediction_scores = outputs[:2]
                loss.backward()
                #torch.nn.utils.clip_grad_norm_(self.bert_model.parameters(), 1.0)
                print(str(step) + " Loss is :" + str(loss.item()))
                optimizer.step()
                scheduler.step()
        print("LM training done")
        torch.save(self.bert_model.state_dict(), "lm_joke_bert.pth")
Example #25
    def fit(self,
            dataset,
            validation=True,
            batch_size=1,
            patience=3,
            delta=0.):
        """ Fits the model to the given dataset.

        Usage:
        >>> rge = Framework(**config)
        >>> rge.fit(train_data)
        """
        self.model.to(self.device)
        train_data = dataset.get_train(batch_size)

        if self.config['half']:
            self.model, self.optimizer = amp.initialize(
                self.model,
                self.optimizer,
                opt_level='O2',
                keep_batchnorm_fp32=True)

        if self.config['linear_scheduler']:
            num_training_steps = int(
                len(train_data) // self.grad_acc * self.config['epochs'])
            scheduler = get_linear_schedule_with_warmup(
                self.optimizer,
                num_warmup_steps=self.config.get('warmup_steps', 0),
                num_training_steps=num_training_steps)
        else:
            scheduler = None

        early_stopping = EarlyStopping(patience, delta, self._save_checkpoint)

        for epoch in range(self.config['epochs']):
            self.optimizer.zero_grad()
            loss = self._train_step(train_data, epoch, scheduler=scheduler)
            if validation:
                val_loss, _, _, _ = self._val_step(dataset.get_val(batch_size),
                                                   epoch)
                if early_stopping(val_loss,
                                  dataset=dataset,
                                  epoch=epoch,
                                  loss=loss):
                    break

        # Recover the best epoch
        path = os.path.join("checkpoints", f"{dataset.name}.pth")
        config_path = os.path.join("checkpoints",
                                   f"{dataset.name}_config.json")
        _, _ = self._load_checkpoint(path, config_path)
Example #26
def run_train():
    data_dir = config.DATA_DIR
    kgp = KGProcessor()
    rela_list = kgp.get_all_relations()
    examples = kgp.get_train_examples(data_dir)
    tokenizer = transformers.BertTokenizer.from_pretrained(
        config.BERT_TOKENIZER_PATH)
    features = kgp.convert_examples_to_features(examples, config.MAX_SEQ_LEN,
                                                tokenizer)

    input_ids = torch.tensor([f["input_ids"] for f in features],
                             dtype=torch.long)
    attention_mask = torch.tensor([f["attention_mask"] for f in features],
                                  dtype=torch.long)
    token_type_ids = torch.tensor([f["token_type_ids"] for f in features],
                                  dtype=torch.long)
    labels = torch.tensor([f["label"] for f in features], dtype=torch.long)
    labels = F.one_hot(labels).float()

    dataset = TensorDataset(input_ids, attention_mask, token_type_ids, labels)
    sampler = SequentialSampler(dataset)
    data_loader = DataLoader(dataset,
                             sampler=sampler,
                             batch_size=config.TRAIN_BATCH_SIZE)

    num_training_steps = len(
        input_ids) / config.TRAIN_BATCH_SIZE * config.TRAIN_EPOCHS
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = BertKG(config.BERT_MODEL_PATH, len(rela_list))
    model.to(device)
    optimizer = AdamW(model.parameters(), lr=config.LEARNING_RATE)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)

    for epoch in range(config.TRAIN_EPOCHS):
        print(
            f"\n---------------------------epoch: {epoch+1}---------------------------"
        )
        train_fn(model, device, data_loader, optimizer, scheduler)

        model_to_save = model.module if hasattr(model, "module") else model
        output_path = os.path.join(f"{config.BERT_OUTPUT_PATH}/{epoch+1}",
                                   WEIGHTS_NAME)
        torch.save(model_to_save.state_dict(), output_path)
        tokenizer.save_vocabulary(
            f"{config.BERT_OUTPUT_PATH}/{epoch+1}/vocab.txt")

    model_to_save = model.module if hasattr(model, "module") else model
    output_path = os.path.join(f"{config.BERT_OUTPUT_PATH}", WEIGHTS_NAME)
    torch.save(model_to_save.state_dict(), output_path)
    tokenizer.save_vocabulary(f"{config.BERT_OUTPUT_PATH}/vocab.txt")
Example #27
 def configure_optimizers(self):
     if self.args.adafactor:
         optimizer = Adafactor(self.model.parameters(), lr=self.args.lr, scale_parameter=False, relative_step=False)
     else:
         optimizer = torch.optim.Adam(self.model.parameters(), lr=self.args.lr)
     if self.args.debug:
         return optimizer  # const LR
     num_gpus = torch.cuda.device_count() if torch.cuda.is_available() else 1
     num_steps = self.args.dataset_size * self.args.epochs / num_gpus / self.args.grad_accum / self.args.batch_size
     scheduler = get_linear_schedule_with_warmup(
         optimizer, num_warmup_steps=self.args.warmup, num_training_steps=num_steps
     )
     return [optimizer], [{"scheduler": scheduler, "interval": "step"}]
Example #28
    def _get_scheduler(self):
        """Get scheduler for different models.

        Returns:
            scheduler
        """
        if self.config.model_type == 'bert':
            scheduler = get_linear_schedule_with_warmup(
                self.optimizer,
                num_warmup_steps=self.config.num_warmup_steps,
                num_training_steps=self.config.num_training_steps)
        else:  # rnn
            scheduler = get_constant_schedule(self.optimizer)
        return scheduler
Example #29
    def train_dataloader(self):
        train_batch_size = self.hparams.train_batch_size
        dataloader = self.load_dataset("train", train_batch_size)

        t_total = (
            (len(dataloader.dataset) // (train_batch_size * max(1, self.hparams.n_gpu)))
            // self.hparams.gradient_accumulation_steps
            * float(self.hparams.num_train_epochs)
        )
        scheduler = get_linear_schedule_with_warmup(
            self.opt, num_warmup_steps=self.hparams.warmup_steps, num_training_steps=t_total
        )
        self.lr_scheduler = scheduler
        return dataloader
Example #30
def train(output_dim, num_layers, embedding_dim, hidden_size, model_path,
          model_type, dropout, criterion, lr, epochs, sequence_length,
          num_channels, kernels, nhead, warmup_epochs, batch_size, device):
    model_types = ('Transformer', 'CNN', 'LSTM')
    assert model_type in model_types, f'model_type must be one of {", ".join(model_types)}'
    song_loader = get_song_loader(batch_size, sequence_length)
    if model_type == 'Transformer':
        model = Classical_Music_Transformer(embedding_dim, hidden_size,
                                            output_dim, num_layers, dropout,
                                            device, nhead,
                                            sequence_length).to(device)
    elif model_type == 'CNN':
        model = Classical_Music_CNN(embedding_dim, output_dim, num_channels,
                                    kernels, dropout, device,
                                    sequence_length).to(device)

    elif model_type == 'LSTM':
        model = Classical_Music_LSTM(embedding_dim, hidden_size, output_dim,
                                     num_layers, dropout, device,
                                     sequence_length).to(device)
    model.train()
    optimizer = torch.optim.AdamW(model.parameters(), lr)
    num_training_steps = len(song_loader) * epochs
    num_warmup_steps = len(song_loader) * warmup_epochs
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=num_warmup_steps,
        num_training_steps=num_training_steps)
    for epoch in tqdm(range(1, epochs + 1), total=epochs):
        batch_losses = []
        start = time.time()
        for inputs, targets in song_loader:
            inputs, targets = inputs.to(device), targets.to(device)
            optimizer.zero_grad()
            if model_type == 'Transformer':
                output, mu, logvar = model(inputs, targets)
                loss = cross_entropy_and_KL(output, targets, mu, logvar)
            else:
                outputs = model(inputs)
                loss = criterion(outputs, targets)

            batch_losses.append(loss.item())
            loss.backward()
            optimizer.step()
            scheduler.step()

        print(f'Epoch {epoch}/{epochs},\tLoss {np.mean(batch_losses)},'
              f'\tDuration {time.time() - start}')
        torch.save(model, model_path)
    torch.save(model.to('cpu'), model_path)