Example #1
def getModel(path_config, gpu='0', fp16=False):
    print("load model......")
    torch.cuda.set_device(int(gpu))
    #os.environ["CUDA_VISIBLE_DEVICES"] = gpu
    with open(path_config, 'r') as f:
        config = json.load(f)
    from tokenizations import tokenization_bert
    tokenizer_path = config['tokenizer_path']
    model_path = config['model_path']
    device = "cuda" if torch.cuda.is_available() else "cpu"
    print("use device:%s" % device)
    tokenizer = tokenization_bert.BertTokenizer(vocab_file=tokenizer_path)
    model = GPT2LMHeadModel.from_pretrained(model_path)
    model.to(device)
    model.eval()
    if fp16:
        optimizer = transformers.AdamW(model.parameters(),
                                       lr=0.1,
                                       correct_bias=True)
        from apex import amp
        fp16_opt_level = 'O1'
        model, optimizer = amp.initialize(model,
                                          optimizer,
                                          opt_level=fp16_opt_level)
    return model, tokenizer, config, device
Example #2
def instantiate_model(config, tokenizer):
    configure_devices(config)
    model = Model(config)
    optimizer = transformers.AdamW(model.parameters(), lr=config.learning_rate, weight_decay=0)
    metrics = None

    if config.continue_training:
        state_dict = torch.load(config.continue_training, map_location='cpu')
        model.load_state_dict(state_dict['model'])
        if 'optimizer_state_dict' in state_dict:
            optimizer.load_state_dict(state_dict['optimizer_state_dict'])
            for g in optimizer.param_groups:
                g['lr'] = config.learning_rate
        
        try:
            print(f"Loaded model:\nEpochs: {state_dict['epoch']}\nLoss: {state_dict['loss']}\n", 
                  f"Recall: {state_dict['rec']}\nMRR: {state_dict['mrr']}")
        except KeyError:
            pass
        
    if config.use_cuda:
        model = model.cuda()
        optimizer_to(optimizer, config.device)
        model = torch.nn.DataParallel(model, device_ids=config.devices)
    return model, optimizer, metrics
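
Example #2 calls optimizer_to, which is defined elsewhere in that project. A minimal sketch of such a helper, assuming its only job is to move the optimizer's state tensors onto the target device (the name and signature come from the call above; the body is an assumption):

import torch

def optimizer_to(optimizer, device):
    # Move every tensor held in the optimizer state (e.g. AdamW's exp_avg buffers)
    # onto the given device, so a checkpoint loaded on CPU can keep training on GPU.
    for state in optimizer.state.values():
        for key, value in state.items():
            if torch.is_tensor(value):
                state[key] = value.to(device)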
Example #3
    def train_on_batch(self, batch):
        if self.optimizer is None:
            no_decay = ["bias", "LayerNorm.weight"]
            optimizer_grouped_parameters = [
                {
                    "params": [p for n, p in self.model.named_parameters() if not any(nd in n for nd in no_decay)],
                    "weight_decay": float(self.config["decay"]),
                },
                {"params": [p for n, p in self.model.named_parameters() if any(nd in n for nd in no_decay)],
                 "weight_decay": 0.0},
            ]
            self.optimizer = transformers.AdamW(optimizer_grouped_parameters, lr=float(self.config["learning_rate"]))
            self.scheduler = transformers.get_cosine_schedule_with_warmup(
                self.optimizer,
                num_warmup_steps=int(self.config["num_warmup_steps"]),
                num_training_steps=int(self.config["num_train_steps"]))
            self.optimizer.zero_grad()

        self.model.train()
        for k, v in batch.items():
            batch[k] = v.to(self.device)
        batch_loss = torch.mean(self.model(**batch)["loss"])
        batch_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.model.parameters(), 1.0)
        self.optimizer.step()
        self.scheduler.step()
        self.optimizer.zero_grad()

        return batch_loss.cpu().detach().numpy()
Example #4
def _default_train_setup(opt, model, batch_loader):
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params': [
            p for n, p in model.named_parameters()
            if not any(nd in n for nd in no_decay)
        ],
        'weight_decay':
        0.01
    }, {
        'params': [
            p for n, p in model.named_parameters()
            if any(nd in n for nd in no_decay)
        ],
        'weight_decay':
        0.0
    }]
    optimizer = transformers.AdamW(optimizer_grouped_parameters,
                                   lr=opt["learning_rate"])
    loss_function = torch.nn.CrossEntropyLoss()

    t_total = len(batch_loader) * opt["num_epochs"]
    warmup_steps = int(t_total * opt["warmup_ratio"])
    scheduler = transformers.optimization.get_linear_schedule_with_warmup(
        optimizer, warmup_steps, t_total)

    return loss_function, optimizer, scheduler,
Example #5
    def _get_optimizer(self):
        parameters = self.model.parameters()
        optimizer = transformers.AdamW(
            params=parameters,
            lr=self._learning_rate)

        return optimizer
Example #6
    def _configure_training(self, n_batches_train):
        """
        Configures training component:
            1. optimizer
            2. scheduler

        Parameters
        ----------
        n_batches_train : int
            Number of batches of training data

        Returns
        -------
        optimizer : transformers.optimization.AdamW
            Optimizer
        scheduler : torch.optim.lr_scheduler.LambdaLR
            Scheduler

        """

        # Create optimizer
        params = filter(lambda x: x.requires_grad, self.model.parameters())
        optimizer = transformers.AdamW(params, lr=self.alpha, eps=1e-8)

        # Total number of training steps is number of batches * number of epochs.
        total_steps = n_batches_train * self.epochs

        # Create the learning rate scheduler
        scheduler = transformers.get_linear_schedule_with_warmup(optimizer,
                                                                 num_warmup_steps=0, # Default value in run_glue.py
                                                                 num_training_steps=total_steps)

        return optimizer, scheduler
Example #7
 def configure_optimizers(self):
     model = self.model
     no_decay = ["bias", "LayerNorm.weight", "LayerNorm.bias"]
     optimizer_grouped_parameters = [
         {
             "params": [
                 p for n, p in model.named_parameters()
                 if not any(nd in n for nd in no_decay)
             ],
             "weight_decay":
             self.hparams.weight_decay,
         },
         {
             "params": [
                 p for n, p in model.named_parameters()
                 if any(nd in n for nd in no_decay)
             ],
             "weight_decay":
             0.0,
         },
     ]
     optimizer = transformers.AdamW(optimizer_grouped_parameters,
                                    lr=self.hparams.learning_rate,
                                    eps=self.hparams.adam_epsilon)
     scheduler = transformers.get_linear_schedule_with_warmup(
         optimizer,
         num_warmup_steps=self.hparams.warmup_steps,
         num_training_steps=self.total_steps)
     scheduler = {
         'scheduler': scheduler,
         'interval': 'step',
         'frequency': 1
     }
     return [optimizer], [scheduler]
Example #8
def main():

    hvd.init()
    config = json.load(open('config.json'))
    torch.cuda.set_device(hvd.local_rank())
    writer = Logging(user='******', name=f'albert_mlm_{hvd.local_rank()}')
    writer.add_hparams(config)
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    trainset = dataset.dataset.MaskedLMDataset(
        csv_file=config['mask_train_file'], config=config)
    trainsampler = torch.utils.data.distributed.DistributedSampler(
        trainset, num_replicas=hvd.size(), rank=hvd.rank())
    trainloader = torch.utils.data.DataLoader(
        dataset=trainset,
        batch_size=config['batch_size'],
        num_workers=config['num_workers'],
        sampler=trainsampler)

    validset = dataset.dataset.MaskedLMDataset(
        csv_file=config['mask_valid_file'], config=config)
    validsampler = torch.utils.data.distributed.DistributedSampler(
        validset, num_replicas=hvd.size(), rank=hvd.rank())
    validloader = torch.utils.data.DataLoader(
        dataset=validset,
        batch_size=config['batch_size'],
        num_workers=config['num_workers'],
        sampler=validsampler)

    net = model.bert.Model(config=config).to(device)

    optimizer = transformers.AdamW(params=net.parameters(),
                                   lr=config['start_lr'],
                                   weight_decay=config['weight_decay'])
    optimizer = hvd.DistributedOptimizer(
        optimizer=optimizer, named_parameters=net.named_parameters())

    scheduler_1 = torch.optim.lr_scheduler.CosineAnnealingLR(
        optimizer=optimizer, T_max=config['T_max'], eta_min=config['eta_min'])
    scheduler_2 = torch.optim.lr_scheduler.CyclicLR(
        optimizer=optimizer,
        base_lr=config['base_lr'],
        max_lr=config['max_lr'],
        step_size_up=config['step_size_up'],
        step_size_down=config['step_size_down'],
        cycle_momentum=False)

    scheduler = [scheduler_1, scheduler_2]

    hvd.broadcast_parameters(net.state_dict(), root_rank=0)

    training_method = engine.bert.TrainingClass(model=net,
                                                optimizer=optimizer,
                                                scheduler=scheduler,
                                                config=config,
                                                trainloader=trainloader,
                                                validloader=validloader,
                                                writer=writer,
                                                device=device,
                                                tokenizer=trainset.tokenizer)
Example #9
 def get_opt_and_sched(model):
     no_decay = ["bias", "LayerNorm.weight"]
     optimizer_grouped_parameters = [
         {
             "params": [
                 p for n, p in model.named_parameters()
                 if not any(nd in n for nd in no_decay)
             ],
             "weight_decay":
             0,
         },
         {
             "params": [
                 p for n, p in model.named_parameters()
                 if any(nd in n for nd in no_decay)
             ],
             "weight_decay":
             0.0,
         },
     ]
     optimizer = transformers.AdamW(optimizer_grouped_parameters,
                                    lr=start_lr,
                                    eps=1e-8)
     scheduler = transformers.get_linear_schedule_with_warmup(
         optimizer,
         num_warmup_steps=0,
         num_training_steps=num_epochs * len(dataset),
     )
     return optimizer, scheduler
Example #10
    def __setup_model_data(self, dataset, lower_case):
        """ set up data/language model """
        if self.model is not None:
            return
        if self.args.is_trained:
            self.model = transformers.AutoModelForTokenClassification.from_pretrained(self.args.transformers_model)
            self.transforms = Transforms(self.args.transformers_model, cache_dir=self.cache_dir)
            self.label_to_id = self.model.config.label2id
            self.dataset_split, self.label_to_id, self.language, self.unseen_entity_set = get_dataset_ner(
                dataset, label_to_id=self.label_to_id, fix_label_dict=True, lower_case=lower_case)
            self.id_to_label = {v: str(k) for k, v in self.label_to_id.items()}
        else:
            self.dataset_split, self.label_to_id, self.language, self.unseen_entity_set = get_dataset_ner(
                dataset, lower_case=lower_case)
            self.id_to_label = {v: str(k) for k, v in self.label_to_id.items()}
            config = transformers.AutoConfig.from_pretrained(
                self.args.transformers_model,
                num_labels=len(self.label_to_id),
                id2label=self.id_to_label,
                label2id=self.label_to_id,
                cache_dir=self.cache_dir)

            self.model = transformers.AutoModelForTokenClassification.from_pretrained(
                self.args.transformers_model, config=config)
            self.transforms = Transforms(self.args.transformers_model, cache_dir=self.cache_dir)

        # optimizer
        no_decay = ["bias", "LayerNorm.weight"]
        optimizer_grouped_parameters = [
            {"params": [p for n, p in self.model.named_parameters() if not any(nd in n for nd in no_decay)],
             "weight_decay": self.args.weight_decay},
            {"params": [p for n, p in self.model.named_parameters() if any(nd in n for nd in no_decay)],
             "weight_decay": 0.0}]
        self.optimizer = transformers.AdamW(optimizer_grouped_parameters, lr=self.args.lr, eps=1e-8)

        # scheduler
        self.scheduler = transformers.get_linear_schedule_with_warmup(
            self.optimizer, num_warmup_steps=self.args.warmup_step, num_training_steps=self.args.total_step)

        # GPU allocation
        self.model.to(self.device)

        # GPU mixture precision
        if self.args.fp16:
            try:
                from apex import amp  # noqa: F401
                self.model, self.optimizer = amp.initialize(
                    self.model, self.optimizer, opt_level='O1', max_loss_scale=2 ** 13, min_loss_scale=1e-5)
                self.master_params = amp.master_params
                self.scale_loss = amp.scale_loss
                logging.info('using `apex.amp`')
            except ImportError:
                logging.exception("Skip apex: please install apex from https://www.github.com/nvidia/apex to use fp16")

        # multi-gpus
        if self.n_gpu > 1:
            # multi-gpu training (should be after apex fp16 initialization)
            self.model = torch.nn.DataParallel(self.model.cuda())
            logging.info('using `torch.nn.DataParallel`')
        logging.info('running on %i GPUs' % self.n_gpu)
Example #11
    def setup_optimizer_and_scheduler(self):
        def _filter_params(parameters: List,
                           filters: List[str],
                           exclude: bool = True) -> List[str]:
            if exclude:
                return [
                    parameter for name, parameter in parameters
                    if not any(param in name for param in filters)
                ]
            else:
                return [
                    parameter for name, parameter in parameters
                    if any(param in name for param in filters)
                ]

        model_params = list(self.model.named_parameters())
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        optimizer_params = [{
            'params': _filter_params(model_params, no_decay, exclude=True),
            'weight_decay': 0.001
        }, {
            'params': _filter_params(model_params, no_decay, exclude=False),
            'weight_decay': 0.0
        }]
        self.optimizer = transformers.AdamW(optimizer_params, lr=1e-4)
        self.scheduler = transformers.get_linear_schedule_with_warmup(
            self.optimizer,
            num_warmup_steps=0,
            num_training_steps=self.num_training_steps)
Example #12
def config_optimizer(
    model, 
    learning_rate, 
    adam_eps,
    freeze_decoder=False, 
    freeze_bert=False,
    freeze_embeddings=False,
    ):
    for param in model.model.parameters():
        param.requires_grad = False
        
    if not freeze_decoder:
        for param in model.model.mt_model.model.decoder.parameters():
            param.requires_grad = True
 
    if not freeze_bert:
        for param in model.model.bert.parameters():
            param.requires_grad = True

    if freeze_embeddings:
        for param in model.model.mt_model.model.decoder.embed_tokens.parameters() :
            param.requires_grad = False
        for param in model.model.bert.bert.embeddings.parameters():
            param.requires_grad = False        

        #optimizer = AdamW(self.model.parameters(), lr=self.hparams.learning_rate, eps=self.hparams.adam_epsilon)
    print(f'Optimizer will update parameters for the decoder:{not freeze_decoder} AND for bert:{not freeze_bert}. freeze_embeddings:{freeze_embeddings}')
    optimizer = transformers.AdamW(
            filter(lambda p: p.requires_grad, model.model.parameters()),
            lr=learning_rate, 
            eps=adam_eps 
        )
    return optimizer
Example #13
def initialize_model(epochs):
    model = RegressionModel()
    model.to(device)
    optimizer = transformers.AdamW(model.parameters())
    total_steps = len(train_dataloader) * epochs
    scheduler = transformers.get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=0, num_training_steps=total_steps)
    return model, optimizer, scheduler
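
For reference, a hypothetical training step using the objects returned by initialize_model; the batch layout and the regression loss are assumptions, not part of the original example:

model, optimizer, scheduler = initialize_model(epochs=3)
model.train()
for inputs, targets in train_dataloader:  # assumed (inputs, targets) batches
    optimizer.zero_grad()
    predictions = model(inputs.to(device))
    loss = torch.nn.functional.mse_loss(predictions, targets.to(device))
    loss.backward()
    optimizer.step()
    scheduler.step()  # the linear warmup schedule is stepped once per batch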
Example #14
 def configure_optimizers(self):
     opt = transformers.AdamW(self.siamese_model.parameters(),
                              **self.hparams["optimizer"])
     output = opt
     if "scheduler" in self.hparams:
         scheduler = get_linear_schedule_with_warmup(
             opt, **self.hparams["scheduler"])
         output = ([opt], [scheduler])
     return output
Example #15
def train(tokenizer, model, train_dataset, validate_dataset, args):
    train_dataloader = DataLoader(
        train_dataset, batch_size=args.batch_size, shuffle=True, num_workers=args.num_workers, collate_fn=collate_fn,
        drop_last=True
    )
    validate_dataloader = DataLoader(validate_dataset, batch_size=args.batch_size, shuffle=True,
                                     num_workers=args.num_workers, collate_fn=collate_fn, drop_last=True)
    early_stopping = EarlyStopping(args.patience, verbose=True, save_path=args.save_model_path)
    t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.epochs
    optimizer = transformers.AdamW(model.parameters(), lr=args.lr, eps=args.eps)
    num_warmup_steps = int(t_total * args.warmup_steps_rate)
    scheduler = transformers.get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=num_warmup_steps, num_training_steps=t_total
    )
    logger.info('starting training')

    # record training and validation loss for each epoch
    train_losses, validate_losses = [], []
    # track the minimum loss on the validation set
    best_val_loss = 10000
    # start training
    train_iterator = trange(int(args.epochs), desc='Epoch', mininterval=0)
    for epoch in train_iterator:
        # ========== train ========== #
        train_loss = train_epoch(
            model=model, train_dataloader=train_dataloader,
            optimizer=optimizer, scheduler=scheduler,
            epoch=epoch, args=args)
        train_losses.append(train_loss)

        # ========== validate ========== #
        validate_loss = validate_epoch(
            model=model, validate_dataloader=validate_dataloader,
            epoch=epoch, args=args)
        validate_losses.append(validate_loss)

        # save the model with the lowest perplexity so far
        if validate_loss < best_val_loss:
            best_val_loss = validate_loss
            logger.info('saving current best model for epoch {}'.format(epoch + 1))
            model_path = join(args.save_model_path, 'min_ppl_model')
            if not os.path.exists(model_path):
                os.mkdir(model_path)
            model_to_save = model.module if hasattr(model, 'module') else model
            model_to_save.save_pretrained(model_path)
            tokenizer.save_pretrained(model_path)

        # if patience == 0, early stopping is disabled
        if args.patience == 0:
            continue
        early_stopping(validate_loss, model)
        if early_stopping.early_stop:
            logger.info("Early stopping")
            break
    logger.info('training finished')
    logger.info("train_losses:{}".format(train_losses))
    logger.info("validate_losses:{}".format(validate_losses))
Example #16
 def __init__(self, model: Any, model_name: str = None):
     super().__init__(model)
     self.model_name = model_name
     self.device = xm.xla_device()
     self.optimizer = transformers.AdamW(self.model.parameters(),
                                         lr=1e-4 * xm.xrt_world_size())
     self.criterion = nn.BCEWithLogitsLoss()
     self.early_stopping = utils.EarlyStopping(patience=5, verbose=True)
     self.scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
         self.optimizer, mode="max", patience=5, factor=0.3, verbose=True)
Example #17
 def __init__(self, model: Any, model_name: str = None):
     super().__init__(model)
     self.model_name = model_name
     self.device = torch.device(
         'cuda' if torch.cuda.is_available() else 'cpu')
     self.optimizer = transformers.AdamW(self.model.parameters(), lr=1e-4)
     self.criterion = nn.BCEWithLogitsLoss()
     self.early_stopping = EarlyStopping(patience=5, verbose=True)
     self.scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
         self.optimizer, mode="max", patience=5, factor=0.3, verbose=True)
Example #18
def train(dataloader, model, device, total_steps=None):
    model.train()
    # count total and trainable parameters
    total = sum(p.numel() for p in model.parameters())
    trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
    logging.info(' *** start training, parameter total:{}, trainable:{} *** '.format(total, trainable))

    # define optimizer and scheduler
    optimizer = transformers.AdamW(model.parameters(), lr=args.lr, correct_bias=True)
    scheduler = transformers.get_linear_schedule_with_warmup(optimizer,
                                                             args.warmup_step,
                                                             total_steps)

    logging.info(' ----------- Start Training --------------')

    for epoch in range(args.num_epoch):
        total_loss = 0

        for i, batch_inputs in enumerate(dataloader):
            optimizer.zero_grad()
            batch_inputs = batch_inputs.to(device)  # .to() is not in-place for tensors
            outputs = model.forward(input_ids=batch_inputs,
                                    labels=batch_inputs)
            loss, logits = outputs[:2]

            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)

            total_loss += loss.item()
            optimizer.step()
            scheduler.step()

            if (i + 1) % 500 == 0 or i == 0:
                logging.info('[ {4}: {5}, Epoch {0}: {1}/{2} AVG_LOSS: {3} ]'.format(
                    epoch + 1,
                    i + 1,
                    len(dataloader),
                    total_loss / (i + 1),
                    datetime.now().hour,
                    datetime.now().minute))

        logging.info('\n *** In Epoch {0}, average loss: {1} *** \n'.format(
            epoch + 1,
            total_loss / len(dataloader))
        )

        logging.info('Saving model for epoch {}'.format(epoch + 1))
        if not os.path.exists(args.output_dir):
            os.mkdir(args.output_dir)
        epoch_model_dir = os.path.join(args.output_dir, 'model_epoch_{}'.format(epoch + 1 + args.pretrained_epoch))
        if not os.path.exists(epoch_model_dir):
            os.mkdir(epoch_model_dir)
        model_to_save = model.module if hasattr(model, 'module') else model
        model_to_save.save_pretrained(epoch_model_dir)
Example #19
    def build_model(self):
        args = self.args
        output_dim = self.input_dim
        input_dim = self.input_dim

        self.best_model = None
        # TODO: change all modules param to single config,
        #       change input_dim and output_dim to args.vocab_size
        self.model = models.AR.build(
            args, input_dim, output_dim, self.vocab, self.embeddings,
            self.pretrain_feature_model).to(self.device)

        # self.optimizer = optim.AdamW(self.model.parameters(), lr=args.lr,
        # self.optimizer = toptim.Lamb(self.model.parameters(), lr=args.lr,
        self.optimizer = transformers.AdamW(
            self.model.parameters(),
            lr=args.lr,
            correct_bias=True,
            weight_decay=args.weight_decay)
        self.logger.info(self.model)
        self.logger.info(
            f'The model has {utils.count_parameters(self.model):,} trainable parameters'
        )

        if args.use_scheduler:
            #self.scheduler = optim.lr_scheduler.StepLR(self.optimizer, 1.0, gamma=0.95)
            #self.scheduler = optim.lr_scheduler.CosineAnnealingWarmRestarts(self.optimizer, 2)
            if args.warmup_steps == 0:
                self.scheduler = optim.lr_scheduler.ReduceLROnPlateau(
                    self.optimizer,
                    mode='min',
                    factor=0.5,
                    min_lr=1.5e-4,
                    patience=60,
                    verbose=True)
            else:
                # XXX: scheduler will run once at start, even if has no scheduler.step()
                total_steps = int(
                    len(self.train_iter.dataset) * args.n_epochs /
                    args.batch_size / args.gradient_accumulation)
                self.scheduler = transformers.get_linear_schedule_with_warmup(
                    self.optimizer,
                    num_warmup_steps=args.warmup_steps,
                    num_training_steps=total_steps)

        if args.pretrained_fname is None:
            pass
            # pytorch module will auto init_weights with uniform
            # self.model.apply(models.init_weights)
        else:
            self.logger.info()
            self.logger.info(
                f'Load pretrained model {args.pretrained_fname}...')
            self.load_model()
Example #20
    def configure_optimizers(self):
        optimizer = transformers.AdamW(self.parameters(),
                                       lr=self.learning_rate)

        warmup_steps = self.steps_per_epoch // 3
        total_steps = self.steps_per_epoch * self.n_epochs - warmup_steps

        scheduler = transformers.get_cosine_schedule_with_warmup(
            optimizer, warmup_steps, total_steps)

        return [optimizer], [scheduler]
Example #21
def build_model_and_get_results(encoded_plus_list, labels_torch, part, n_parts,
                                device, n_classes, learning_rate, epochs):

    val_len = len(encoded_plus_list) // n_parts

    start = val_len * part
    end = val_len * (part + 1)

    encoded_plus_val = encoded_plus_list[start:end]
    labels_val = labels_torch[start:end]

    encoded_plus_train = encoded_plus_list[:start] + encoded_plus_list[end:]
    labels_train = torch.cat([labels_torch[:start], labels_torch[end:]])

    model = Classifier(n_classes=n_classes).to(device)

    optimizer = transformers.AdamW(model.parameters(),
                                   lr=learning_rate,
                                   correct_bias=False)

    loss_fn = nn.CrossEntropyLoss().to(device)
    best_f1_score = -1

    for epoch in range(epochs):

        print(f"On epoch {epoch + 1} of {epochs}")

        train_acc, train_f1_score, train_loss = train_epoch(
            model, encoded_plus_train, labels_train, loss_fn, optimizer,
            device, len(encoded_plus_train))

        print(
            f'Train loss {train_loss} Train f1-score {train_f1_score} accuracy {train_acc}'
        )

        val_acc, val_f1_score, val_loss, cf_matrix = eval_model(
            model, encoded_plus_val, labels_val, loss_fn, device,
            len(encoded_plus_val))

        print(
            f'Val   loss {val_loss} f1-score {val_f1_score} accuracy {val_acc}'
        )
        print(f"confusion matrix: {cf_matrix}")

        if val_f1_score > best_f1_score:

            best_model_dict = deepcopy(model.state_dict())
            best_cf_matrix = cf_matrix
            best_f1_score = val_f1_score

    torch.save(best_model_dict, p.NEW_MODEL_NAME.format(part))

    return best_f1_score, best_cf_matrix
Example #22
    def configure_optimizers(
            self):  # Scheduler can be changed with a one without hard_restarts
        optimizer = transformers.AdamW(self.parameters(),
                                       lr=self.learning_rate)

        warmup_steps = self.steps_per_epoch // 3  # First third of the epoch is warmup to fasten the training process
        total_steps = self.steps_per_epoch * self.n_epochs - warmup_steps

        # We use default 1 hard restart
        scheduler = transformers.get_cosine_with_hard_restarts_schedule_with_warmup(
            optimizer, warmup_steps, total_steps)

        return [optimizer], [scheduler]
Example #23
 def __init__(self):
     self.max_seq = 100
     self.classifier = Net(classes=2)
     if torch.cuda.is_available():
         self.classifier.to('cuda')
     self.criterion = nn.CrossEntropyLoss()
     self.optim = transformers.AdamW(self.classifier.parameters(), lr=5e-5)
     self.writer = SummaryWriter('logs/')
     self.val_epoch_step = 0
     self.epochs = 10
     self.log_file = 'logs/logs.txt'
     self.prepare_dataset()
     self.start_epoch = 0
Example #24
    def init_optim(self):
        param_optimizer = list(self.model.named_parameters())  # list of named model parameters
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [
            {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
            {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}]
        # self.optimizer = BertAdam(optimizer_grouped_parameters,
        #                     lr=self.opt['lr'],
        #                     warmup=self.opt['warmup'],
        #                     t_total=len(self.dataset_loader['train']) * self.epoch
        #                     )

        # self.optimizer = transformers.AdamW(self.model.parameters(), lr=self.opt['lr'])
        self.optimizer = transformers.AdamW(optimizer_grouped_parameters, lr=self.opt['lr'])
Example #25
 def configure_optimizers(self):
     # optimizer = torch.optim.Adam(self.parameters(), lr=self.config['lr'])
     optimizer = transformers.AdamW(
         self.parameters(), lr=self.config['lr'])  #, weight_decay=0.01
     scheduler = transformers.get_cosine_with_hard_restarts_schedule_with_warmup(
         optimizer,
         num_warmup_steps=350,
         num_training_steps=3000,
         num_cycles=1)
     schedulers = [{
         'scheduler': scheduler,
         'interval': 'step',
         'frequency': 1
     }]
     return [optimizer], schedulers
Example #26
    def _create_optimizer(self):
        """Create the optimizer instance used for training and wrap it with the distributed library if needed.

        Return:
            True if optimizer instance is created successfully.
        """
        if self._args.distributed_impl == DistributedImpl.DDP:
            self._model = torch.nn.parallel.DistributedDataParallel(
                self._model,
                device_ids=[self._local_rank],
                output_device=self._local_rank)

        if self._optimizer_type == Optimizer.SGD:
            self._optimizer = torch.optim.SGD(self._model.parameters(),
                                              lr=1e-5,
                                              momentum=0.9,
                                              weight_decay=1e-4,
                                              nesterov=True)
        elif self._optimizer_type == Optimizer.ADAM:
            self._optimizer = torch.optim.Adam(self._model.parameters(),
                                               lr=1e-5,
                                               betas=(0.9, 0.999),
                                               eps=1e-08)
        elif self._optimizer_type == Optimizer.ADAMW:
            self._optimizer = transformers.AdamW(self._model.parameters(),
                                                 lr=1e-5,
                                                 betas=(0.9, 0.999),
                                                 eps=1e-08)
        else:
            self._optimizer = None

        if not self._optimizer:
            logger.error(
                'Create optimizer failed - model: {}, optimizer type: {}.'.
                format(self._name, self._optimizer_type))
            return False

        if self._args.distributed_impl == DistributedImpl.HOROVOD:
            import horovod.torch as hvd
            self._optimizer = hvd.DistributedOptimizer(
                self._optimizer,
                named_parameters=self._model.named_parameters(),
                compression=hvd.Compression.none,
                op=hvd.Average)
            hvd.broadcast_parameters(self._model.state_dict(), root_rank=0)
            hvd.broadcast_optimizer_state(self._optimizer, root_rank=0)

        return True
Example #27
def make_optimizer(model, optimizer_name="AdamW", sam=False):
    optimizer_grouped_parameters = get_optimizer_params(model)
    kwargs = {
        'lr': 5e-5,
        'weight_decay': 0.01,
        # 'betas': (0.9, 0.98),
        # 'eps': 1e-06
    }
    if not sam:  # plain optimizers when SAM is disabled
        if optimizer_name == "LAMB":
            optimizer = Lamb(optimizer_grouped_parameters, **kwargs)
            return optimizer
        elif optimizer_name == "Adam":
            from torch.optim import Adam
            optimizer = Adam(optimizer_grouped_parameters, **kwargs)
            return optimizer
        elif optimizer_name == "AdamW":
            optimizer = transformers.AdamW(optimizer_grouped_parameters,
                                           **kwargs)
            return optimizer
        else:
            raise Exception('Unknown optimizer: {}'.format(optimizer_name))
    else:
        if optimizer_name == "LAMB":
            base_optimizer = Lamb
            optimizer = SAM(optimizer_grouped_parameters,
                            base_optimizer,
                            rho=0.05,
                            **kwargs)
            return optimizer
        elif optimizer_name == "Adam":
            from torch.optim import Adam
            base_optimizer = Adam
            optimizer = SAM(optimizer_grouped_parameters,
                            base_optimizer,
                            rho=0.05,
                            **kwargs)
            return optimizer
        elif optimizer_name == "AdamW":
            from transformers import AdamW
            base_optimizer = AdamW
            optimizer = SAM(optimizer_grouped_parameters,
                            base_optimizer,
                            rho=0.05,
                            **kwargs)
            return optimizer
        else:
            raise Exception('Unknown optimizer: {}'.format(optimizer_name))
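
Example #27 depends on get_optimizer_params, Lamb, and SAM from its own project. A plausible sketch of get_optimizer_params only, assuming it applies the same bias/LayerNorm weight-decay split used in the other examples on this page:

def get_optimizer_params(model):
    # Assumed implementation: exclude bias and LayerNorm weights from weight decay.
    no_decay = ["bias", "LayerNorm.weight"]
    return [
        {"params": [p for n, p in model.named_parameters()
                    if not any(nd in n for nd in no_decay)],
         "weight_decay": 0.01},
        {"params": [p for n, p in model.named_parameters()
                    if any(nd in n for nd in no_decay)],
         "weight_decay": 0.0},
    ]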
Example #28
    def init_optim(self):
        param_optimizer = list(self.model.bert.named_parameters())  # list of named BERT parameters
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [{
            'params': [p for n, p in param_optimizer]
        }]

        fc_optimizer = list(self.model.fc.named_parameters())  # list of named fc-layer parameters
        optimizer_grouped_parameters += [{
            'params': [p for n, p in fc_optimizer],
            'lr': self.opt['lr_sasrec']
        }]

        # self.optimizer = transformers.AdamW(self.model.parameters(), lr=self.opt['lr'])
        self.optimizer = transformers.AdamW(optimizer_grouped_parameters,
                                            lr=self.opt['lr_bert'])
Example #29
def configure_optimizer(m, h):
    prelim_groups = {}
    opt_config = h["opt_config"]
    for (name, param) in m.named_parameters():
        add_to_group(opt_config, prelim_groups, name, param)

    groups = []
    for _, gps in prelim_groups.items():
        for gp in gps:
            if len(gp["params"]) > 0:
                groups.append(gp)

    optimizer = transformers.AdamW(params=groups)

    lr_scheduler = transformers.get_constant_schedule_with_warmup(
        optimizer, num_warmup_steps=h["num_warmup_steps"])
    return {"optimizer": optimizer, "lr_scheduler": lr_scheduler}
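
Example #29 relies on add_to_group, which is not shown. One possible sketch, assuming opt_config maps substrings of parameter names to per-group settings such as lr or weight_decay (the config layout is an assumption):

def add_to_group(opt_config, prelim_groups, name, param):
    # Assumed behavior: route each parameter into the first config entry whose key
    # appears in the parameter name; unmatched parameters fall back to a default group.
    for pattern, settings in opt_config.items():
        if pattern in name:
            group = prelim_groups.setdefault(pattern, [{"params": [], **settings}])
            group[0]["params"].append(param)
            return
    default = prelim_groups.setdefault("default", [{"params": []}])
    default[0]["params"].append(param)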
Example #30
    def get_optim(self, model):
        optimizer_grouped_parameters = [
            {
                "params": [p for n, p in model.named_parameters()],
                "weight_decay": self.weight_decay
            },
        ]

        optimizer = transformers.AdamW(optimizer_grouped_parameters,
                                       lr=self.lr,
                                       weight_decay=self.weight_decay)
        scheduler = transformers.get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=self.warmup_steps,
            num_training_steps=self.total_steps)

        return optimizer, scheduler