Example No. 1
 def init_optimizer(self):
     optimize_steps = iceil(len(self.sampler) / self.batch_size)
     no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
     optimizer_params = [
         {
             'params': [
                 i for n, i in self.model.named_parameters()
                 if not any(nd in n for nd in no_decay)
             ],
             'weight_decay':
             0.01
         },
         {
             'params': [
                 i for n, i in self.model.named_parameters()
                 if any(nd in n for nd in no_decay)
             ],
             'weight_decay':
             0.0
         },
     ]
     self.optimizer = BertAdam(optimizer_params,
                               lr=self.learning_rate,
                               warmup=self.warmup_prop,
                               t_total=optimize_steps)
     return self
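
Nearly every example on this page repeats the same `no_decay` grouping. Below is a minimal, reusable sketch of that pattern, assuming the pytorch_pretrained_bert package; the helper name build_grouped_parameters is hypothetical, not taken from any example here.

# Hedged sketch of the shared grouping pattern; not part of the example above.
from pytorch_pretrained_bert.optimization import BertAdam

NO_DECAY = ('bias', 'LayerNorm.bias', 'LayerNorm.weight')

def build_grouped_parameters(model, weight_decay=0.01, no_decay=NO_DECAY):
    """Split trainable parameters into decayed and non-decayed groups for BertAdam."""
    decayed, exempt = [], []
    for name, param in model.named_parameters():
        if not param.requires_grad:
            continue
        (exempt if any(nd in name for nd in no_decay) else decayed).append(param)
    return [
        {'params': decayed, 'weight_decay': weight_decay},
        {'params': exempt, 'weight_decay': 0.0},
    ]

# Hypothetical usage:
# optimizer = BertAdam(build_grouped_parameters(model), lr=2e-5,
#                      warmup=0.1, t_total=total_steps)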
Example No. 2
    def __init__(self):
        self.tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
        self.model = BertForQuestionAnswering.from_pretrained("bert-base-uncased")
        train_dir = os.path.join("./save", "qa")
        self.save_dir = os.path.join(train_dir, "train_%d" % int(time.strftime("%m%d%H%M%S")))
        if not os.path.exists(self.save_dir):
            os.makedirs(self.save_dir)
        # read data-set and prepare iterator
        self.train_loader = self.get_data_loader("./squad/train-v1.1.json")
        self.dev_loader = self.get_data_loader("./squad/new_dev-v1.1.json")

        num_train_optimization_steps = len(self.train_loader) * config.num_epochs
        # optimizer
        param_optimizer = list(self.model.named_parameters())
        # hack: drop the unused pooler parameters, whose None gradients break apex
        param_optimizer = [n for n in param_optimizer if "pooler" not in n[0]]
        no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
        optimizer_grouped_parameters = [
            {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
            {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
        ]
        self.qa_opt = BertAdam(optimizer_grouped_parameters,
                               lr=config.qa_lr,
                               warmup=config.warmup_proportion,
                               t_total=num_train_optimization_steps)

        # self.qg_lr = config.lr

        # assign model to device
        self.model = self.model.to(config.device)
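
Here `t_total` counts optimizer steps, which equals the number of batches only because no gradient accumulation is used. A hedged sketch of the more general computation follows; the function and variable names are illustrative, not from this example.

import math

def estimate_t_total(batches_per_epoch, num_epochs, accumulation_steps=1):
    # One optimizer.step() happens every `accumulation_steps` batches.
    return math.ceil(batches_per_epoch / accumulation_steps) * num_epochs

# e.g. estimate_t_total(len(self.train_loader), config.num_epochs)  # equals the value above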
Example No. 3
    def __init__(self, opt, batch_num):
        self.opt = opt
        self.model = Extraction(opt)
        self.model.cuda()

        param_optimizer = list(self.model.named_parameters())
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [{
            'params': [
                p for n, p in param_optimizer
                if not any(nd in n for nd in no_decay)
            ],
            'weight_decay_rate':
            0.01
        }, {
            'params':
            [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
            'weight_decay_rate':
            0.0
        }]
        num_train_optimization_steps = batch_num * (opt['num_epoch'] + 1)
        self.optimizer = BertAdam(optimizer_grouped_parameters,
                                  lr=opt['lr'],
                                  warmup=0.1,
                                  t_total=num_train_optimization_steps)
        self.bce = nn.BCELoss(reduction='none')

        self.ema = layers.EMA(self.model, opt['ema'])
        self.ema.register()
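
The layers.EMA class used above is project-specific and not shown. A minimal sketch of a typical exponential-moving-average tracker with the same register() entry point might look like this; it is an assumption, not the project's implementation.

class SimpleEMA:
    def __init__(self, model, decay):
        self.model = model
        self.decay = decay
        self.shadow = {}

    def register(self):
        # Take an initial snapshot of every trainable parameter.
        for name, param in self.model.named_parameters():
            if param.requires_grad:
                self.shadow[name] = param.data.clone()

    def update(self):
        # Blend current weights into the shadow copy after each optimizer step.
        for name, param in self.model.named_parameters():
            if param.requires_grad:
                self.shadow[name].mul_(self.decay).add_(param.data, alpha=1 - self.decay)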
Example No. 4
    def _init_nn(self, train_dataset_len):
        """Initialize the nn model for training."""
        self.model = MCBertForPretrainingModel(vis_feat_dim=self.vis_feat_dim,
                                               spatial_size=self.spatial_size,
                                               hidden_dim=self.hidden_dim,
                                               cmb_feat_dim=self.cmb_feat_dim,
                                               kernel_size=self.kernel_size)

        # Prepare optimizer
        param_optimizer = list(self.model.named_parameters())
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [{
            'params': [
                p for n, p in param_optimizer
                if not any(nd in n for nd in no_decay)
            ],
            'weight_decay':
            0.01
        }, {
            'params':
            [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
            'weight_decay':
            0.0
        }]

        self.optimizer = BertAdam(
            optimizer_grouped_parameters,
            lr=self.learning_rate,
            warmup=self.warmup_proportion,
            t_total=int(train_dataset_len / 40 / self.batch_size *
                        self.num_epochs))

        if self.USE_CUDA:
            self.model = self.model.cuda()
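
With `warmup=self.warmup_proportion`, BertAdam's default 'warmup_linear' schedule ramps the learning rate up over the first fraction of `t_total` and then decays it linearly. A simplified illustration of that multiplier, not the library's exact code:

def warmup_linear_multiplier(step, t_total, warmup=0.1):
    progress = step / t_total
    if progress < warmup:
        return progress / warmup          # linear ramp-up
    return max((1.0 - progress) / (1.0 - warmup), 0.0)  # linear decay to zero

# e.g. effective lr at step 500 of 10000 with base lr 2e-5:
# 2e-5 * warmup_linear_multiplier(500, 10000)   # still ramping up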
Example No. 5
    def __init__(self, qa_model_path, ca2q_model_path, c2q_model_path, c2a_model_path):
        self.tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
        self.model = DualNet(qa_model_path, ca2q_model_path, c2q_model_path, c2a_model_path)
        train_dir = os.path.join("./save", "dual")
        self.save_dir = os.path.join(train_dir, "train_%d" % int(time.strftime("%m%d%H%M%S")))
        # read data-set and prepare iterator
        self.train_loader = self.get_data_loader("./squad/train-v1.1.json")
        self.dev_loader = self.get_data_loader("./squad/new_dev-v1.1.json")

        num_train_optimization_steps = len(self.train_loader) * config.num_epochs
        # optimizer
        param_optimizer = list(self.model.qa_model.named_parameters())
        # hack: drop the unused pooler parameters, whose None gradients break apex
        param_optimizer = [n for n in param_optimizer if "pooler" not in n[0]]
        no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
        optimizer_grouped_parameters = [
            {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
            {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
        ]
        self.qa_opt = BertAdam(optimizer_grouped_parameters,
                               lr=config.qa_lr,
                               warmup=config.warmup_proportion,
                               t_total=num_train_optimization_steps)

        params = list(self.model.ca2q_model.encoder.parameters()) \
                 + list(self.model.ca2q_model.decoder.parameters())
        # self.qg_lr = config.lr
        self.qg_opt = optim.Adam(params, config.qa_lr)

        # assign model to device and wrap it with DataParallel
        torch.cuda.set_device(0)
        self.model.cuda()
        self.model = nn.DataParallel(self.model)
Example No. 6
def train(config, model, train_iter, dev_iter):
    start_time = time.time()
    model.train()
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}]
    # optimizer = torch.optim.Adam(model.parameters(), lr=config.learning_rate)
    optimizer = BertAdam(optimizer_grouped_parameters,
                         lr=config.learning_rate,
                         warmup=0.05,
                         t_total=len(train_iter) * config.num_epochs)
    total_batch = 0  # number of batches processed so far
    # dev_best_loss = float('inf')
    dev_best_acc = -float('inf')
    last_improve = 0  # batch index of the last improvement on the dev set
    flag = False  # set when training has gone too long without improvement
    model.train()
    for epoch in range(config.num_epochs):
        print('Epoch [{}/{}]'.format(epoch + 1, config.num_epochs))
        for i, batch in enumerate(train_iter):
            trains = (batch[0], batch[1], batch[2])
            labels = torch.squeeze(batch[3], dim=1)
            outputs = model(trains)

            model.zero_grad()
            class_weight = torch.FloatTensor([1, 1, 1, 1, 1, 1, 1, 0.4, 2, 1]).cuda()
            loss = F.cross_entropy(outputs, labels, weight=class_weight)
            # loss = F.cross_entropy(outputs, labels)
            loss.backward()
            optimizer.step()

            if total_batch % 100 == 0:
                # periodically report accuracy on the training and dev sets
                true = labels.data.cpu()
                predicted = torch.max(outputs.data, 1)[1].cpu()
                train_acc = metrics.accuracy_score(true, predicted)
                dev_acc, dev_loss, outputs_all = evaluate(config, model, dev_iter)
                if dev_acc > dev_best_acc:
                    # dev_best_loss = dev_loss
                    dev_best_acc = dev_acc
                    torch.save(model.state_dict(), config.save_path)
                    improve = '*'
                    last_improve = total_batch
                else:
                    improve = ''
                time_dif = get_time_dif(start_time)
                msg = 'Iter: {0:>6},  Train Loss: {1:>5.2},  Train Acc: {2:>6.2%},  Val Loss: {3:>5.2},  Val Acc: {4:>6.2%},  Time: {5} {6}'
                print(msg.format(total_batch, loss.item(), train_acc, dev_loss, dev_acc, time_dif, improve))
                model.train()
            total_batch += 1
            if total_batch - last_improve > config.require_improvement:
                # stop training if the dev metric has not improved for config.require_improvement batches
                print("No optimization for a long time, auto-stopping...")
                flag = True
                break
        if flag:
            break
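
The evaluate() function called above is not shown. A minimal sketch consistent with how it is used, returning accuracy, mean loss, and the collected outputs, and reusing the torch / F / metrics imports of the surrounding module; this is an assumption, not the original implementation.

def evaluate(config, model, dev_iter):
    model.eval()
    losses, all_true, all_pred, all_outputs = [], [], [], []
    with torch.no_grad():
        for batch in dev_iter:
            trains = (batch[0], batch[1], batch[2])
            labels = torch.squeeze(batch[3], dim=1)
            outputs = model(trains)
            losses.append(F.cross_entropy(outputs, labels).item())
            all_true.extend(labels.cpu().tolist())
            all_pred.extend(torch.max(outputs, 1)[1].cpu().tolist())
            all_outputs.append(outputs.cpu())
    acc = metrics.accuracy_score(all_true, all_pred)
    return acc, sum(losses) / max(len(losses), 1), torch.cat(all_outputs)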
Example No. 7
def train(opt):

    # basics definition
    opt.experiment = os.path.join(root_dir, opt.experiment)
    if not os.path.exists(opt.experiment):
        os.makedirs(opt.experiment)
    opt.save_model = os.path.join(opt.experiment, opt.save_model)
    opt.log_path = os.path.join(opt.experiment, 'log.train')
    opt.logger = make_logger(opt.log_path)

    bert_tokenizer, bert_model = make_bert()

    # dataIter definition
    class2idx = build_class_vocab(opt.data_root + 'class.all')
    opt.class_size = len(class2idx)
    train_iter = BertIter4STC(opt.data_root + 'train', bert_tokenizer,
                              class2idx, opt.batch_size, opt.cuda, True)
    valid_iter = BertIter4STC(opt.data_root + 'valid', bert_tokenizer,
                              class2idx, opt.batch_size, opt.cuda, False)

    # model definition
    model = make_model(opt, bert_model)

    # criterion definition
    criterion = nn.BCELoss(reduction='sum')
    if opt.cuda:
        criterion = criterion.cuda()

    # optimizer definition
    if opt.fix_bert:
        for (name, parameter) in model.bert.named_parameters():
            parameter.requires_grad = False

    if opt.optim == 'bert':
        params = list(
            filter(lambda x: x[1].requires_grad == True,
                   model.named_parameters()))
        print('Trainable parameter number: {}'.format(len(params)))
        print('Trainer: bert')
        no_decay = ['bias', 'gamma', 'beta']
        grouped_params = [{
            'params': [p for n, p in params if n not in no_decay],
            'weight_decay_rate': 0.01
        }, {
            'params': [p for n, p in params if n in no_decay],
            'weight_decay_rate': 0.0
        }]
        optimizer = BertAdam(grouped_params,
                             opt.lr,
                             warmup=0.1,
                             t_total=len(train_iter) * opt.epochs)
    else:
        optimizer = Optim(opt.optim, opt.lr, max_grad_norm=opt.max_norm)
        optimizer.set_parameters(model.named_parameters())
        print('Trainable parameter number: {}'.format(len(optimizer.params)))

    # training procedure
    trainer = BertTrainer4STC(model, criterion, optimizer, opt.logger)
    trainer.train(opt.epochs, train_iter, valid_iter, opt.save_model)
Example No. 8
def train(config, model, train_iter):
    """
    模型训练方法
    :param config:
    :param model:
    :param train_iter:
    :param dev_iter:
    :param test_iter:
    :return:
    """
    # put the model in training mode (enables dropout and BatchNorm updates)
    model.train()
    # collect all of the model's named parameters
    param_optimizer = list(model.named_parameters())
    # parameters that should not receive weight decay
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']

    optimizer_grouped_parameters = [{
        'params':
        [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay':
        0.01
    }, {
        'params':
        [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay':
        0.0
    }]

    optimizer = BertAdam(params=optimizer_grouped_parameters,
                         lr=config.learning_rate,
                         warmup=0.05,
                         t_total=len(train_iter) * config.num_epochs)

    total_batch = 0  # number of batches processed so far
    model.train()
    for epoch in range(config.num_epochs):
        print('Epoch [{}/{}]'.format(epoch + 1, config.num_epochs))
        # each batch from the iterator is ((x, seq_len, mask), y)  ==> (list(list(int)), list(int), list(list(int)), int)
        for i, (trains, labels) in enumerate(train_iter):
            outputs = model(trains)
            model.zero_grad()
            loss = F.cross_entropy(outputs, labels)
            loss.backward()
            optimizer.step()

            if total_batch % 10 == 0:  # periodically report accuracy on the training set
                true = labels.data.cpu()
                predicted = torch.max(outputs.data, 1)[1].cpu()
                train_acc = metrics.accuracy_score(true, predicted)
                print(f"##### {loss.item()}, acc {train_acc}")
                model.train()
            total_batch = total_batch + 1
Example No. 9
 def test_adam(self):
     w = torch.tensor([0.1, -0.2, -0.1], requires_grad=True)
     target = torch.tensor([0.4, 0.2, -0.5])
     criterion = torch.nn.MSELoss()
     # No warmup, constant schedule, no gradient clipping
     optimizer = BertAdam(params=[w], lr=2e-1,
                                       weight_decay=0.0,
                                       max_grad_norm=-1)
     for _ in range(100):
         loss = criterion(w, target)
         loss.backward()
         optimizer.step()
         w.grad.detach_()  # plain tensors have no zero_grad(); clear the gradient manually
         w.grad.zero_()
     self.assertListAlmostEqual(w.tolist(), [0.4, 0.2, -0.5], tol=1e-2)
Example No. 10
def get_optimizer(model, args):
    if args.model in ["bert", "concatbert", "mmbt"]:
        total_steps = (args.train_data_len / args.batch_sz /
                       args.gradient_accumulation_steps * args.max_epochs)
        param_optimizer = list(model.named_parameters())
        no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
        optimizer_grouped_parameters = [
            {
                "params": [
                    p for n, p in param_optimizer
                    if not any(nd in n for nd in no_decay)
                ],
                "weight_decay":
                0.01
            },
            {
                "params": [
                    p for n, p in param_optimizer
                    if any(nd in n for nd in no_decay)
                ],
                "weight_decay":
                0.0,
            },
        ]
        optimizer = BertAdam(
            optimizer_grouped_parameters,
            lr=args.lr,
            warmup=args.warmup,
            t_total=total_steps,
        )
    else:
        optimizer = optim.Adam(model.parameters(), lr=args.lr)

    return optimizer
Example No. 11
def set_bertadam_optimizer(model,
                           lr,
                           t_total,
                           warmup=0.1,
                           schedule='warmup_linear',
                           weight_decay=0.01):
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    grouped_parameters = [{
        'params': [
            p for n, p in param_optimizer
            if p.requires_grad and not any(nd in n for nd in no_decay)
        ],
        'weight_decay':
        weight_decay
    }, {
        'params': [
            p for n, p in param_optimizer
            if p.requires_grad and any(nd in n for nd in no_decay)
        ],
        'weight_decay':
        0.0
    }]
    optimizer = BertAdam(grouped_parameters,
                         lr=lr,
                         schedule=schedule,
                         warmup=warmup,
                         t_total=t_total,
                         max_grad_norm=1.0,
                         weight_decay=weight_decay)
    return optimizer
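
For reference, a hedged usage sketch of the helper above; the Linear module is only a stand-in for a real BERT model, and 2e-5 with 10% linear warmup are common fine-tuning defaults.

import torch

model = torch.nn.Linear(768, 2)          # stand-in for a real BERT model
t_total = 1000                           # e.g. len(train_loader) * num_epochs
optimizer = set_bertadam_optimizer(model, lr=2e-5, t_total=t_total,
                                   warmup=0.1, schedule='warmup_linear')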
Example No. 12
def getOptim(model):
    FULL_FINETUNING = True
    if FULL_FINETUNING:
        param_optimizer = list(model.named_parameters())
        no_decay = ['bias', 'gamma', 'beta']
        optimizer_grouped_parameters = [{
            'params': [
                p for n, p in param_optimizer
                if not any(nd in n for nd in no_decay)
            ],
            'weight_decay_rate':
            0.01
        }, {
            'params':
            [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
            'weight_decay_rate':
            0.0
        }]
    else:
        param_optimizer = list(model.classifier.named_parameters())
        optimizer_grouped_parameters = [{
            "params": [p for n, p in param_optimizer]
        }]

    optimizer = BertAdam(optimizer_grouped_parameters, lr=3e-5)

    return optimizer
Example No. 13
 def test_bert_sched_init(self):
     m = torch.nn.Linear(50, 50)
     optim = BertAdam(m.parameters(),
                      lr=0.001,
                      warmup=.1,
                      t_total=1000,
                      schedule=None)
     self.assertTrue(
         isinstance(optim.param_groups[0]["schedule"], ConstantLR))
     optim = BertAdam(m.parameters(),
                      lr=0.001,
                      warmup=.1,
                      t_total=1000,
                      schedule="none")
     self.assertTrue(
         isinstance(optim.param_groups[0]["schedule"], ConstantLR))
     optim = BertAdam(m.parameters(), lr=0.001, warmup=.01, t_total=1000)
     self.assertTrue(
         isinstance(optim.param_groups[0]["schedule"],
                    WarmupLinearSchedule))
Example No. 14
def train_main(args):

    train_loader, num_train = load_dataset(
        os.path.join(args.input, "train_retrievedsents_leetal.json"),
        args.bs,
        shuffle=True,
    )
    print("loaded train, # samples: %d" % num_train)
    dev_loader, num_dev = load_dataset(
        os.path.join(args.input, "dev_retrievedsents_leetal.json"), args.bs)
    print("loaded dev, # samples: %d" % num_dev)

    pretrained_model = BertForSequenceClassification.from_pretrained(
        args.checkpoint)
    pretrained_state_dict = pretrained_model.state_dict()
    bert_state_dict = {
        key: pretrained_state_dict[key]
        for key in pretrained_state_dict if not key.startswith("classifier")
    }
    """ only load the encoder layers from ROVER/base-cased-base """
    model = BertForSequenceClassification.from_pretrained(
        args.checkpoint, state_dict=bert_state_dict, num_labels=3)
    print("loaded pretrained model")

    if torch.cuda.is_available():
        model = model.cuda()

    no_decay = ["bias", "LayerNorm.weight", "LayerNorm.bias"]
    optimizer_grouped_parameters = [
        {
            "params": [
                p for n, p in model.named_parameters()
                if not any(nd in n for nd in no_decay)
            ],
            "weight_decay":
            args.weight_decay,
        },
        {
            "params": [
                p for n, p in model.named_parameters()
                if any(nd in n for nd in no_decay)
            ],
            "weight_decay":
            0.0,
        },
    ]
    num_training_steps = int(len(train_loader) * args.max_epoch)
    optimizer = BertAdam(optimizer_grouped_parameters,
                         lr=args.lr,
                         warmup=0.1,
                         t_total=num_training_steps)

    train(model, train_loader, dev_loader, optimizer, args.max_epoch,
          args.eval_steps)
Example No. 15
    def _create_net_and_optim(self, net_cfg, optim_cfg, num_train_optimization_steps):
        net = BertForSequenceClassification.from_pretrained(net_cfg.bert_pretrain, net_cfg.num_labels)
        net.to(device=self._device)

        param_optimizer = filter(lambda p: p.requires_grad, net.parameters())
        if num_train_optimization_steps is not None:
            optim = BertAdam(param_optimizer,
                             t_total=num_train_optimization_steps,
                             **optim_cfg.kwargs)
        else:
            optim = None
        return net, optim
Example No. 16
def get_bert_optimizer(model, lr=2e-5, **kwargs):
    """
    A convenience function to build the BERT-Adam optimizer.
    :param model: the model to apply this optimizer on.
    :param lr: the learning rate; for a large model like BERT, a small learning rate such as 2e-5 is usually desired.
    :return: a BERT-Adam optimizer.
    """
    if lr > 2e-4:
        warnings.warn(
            f"for a complicated model like bert, a tiny learning rate like 2e-5 is usually desired; got `{lr}`"
        )
    return BertAdam(model.parameters(), lr=lr, **kwargs)
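
A quick usage sketch: an overly large learning rate only triggers the warning, it does not raise. The Linear module here is a stand-in for a real BERT model.

import torch

model = torch.nn.Linear(768, 2)          # stand-in for a real BERT model
optimizer = get_bert_optimizer(model, lr=2e-5, warmup=0.1, t_total=1000)

# An unusually large learning rate still returns an optimizer, but emits a UserWarning:
optimizer = get_bert_optimizer(model, lr=1e-3, warmup=0.1, t_total=1000)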
Example No. 17
    def run_LR_training(self, config, dev_labels, dev_results, labels,
                        lossfunction, results, total_labels):
        model = SecondaryCls(config).cuda()
        glorot_param_init(model)
        optimizer = BertAdam(filter(lambda p: p.requires_grad,
                                    model.parameters()),
                             lr=config["hyperparameters"]["learning_rate"],
                             weight_decay=0.02)
        best_distribution = None
        best_F1 = 0
        for i in range(1000):
            pred_logits = model(results)
            loss = lossfunction(pred_logits, labels)
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()

            dev_pred_logits = model(dev_results)
            dev_loss = lossfunction(dev_pred_logits, dev_labels)
            maxpreds, argmaxpreds = torch.max(F.softmax(dev_pred_logits, -1),
                                              dim=1)
            total_preds = list(argmaxpreds.cpu().numpy())
            correct_vec = argmaxpreds == dev_labels
            total_correct = torch.sum(correct_vec).item()
            loss, acc = dev_loss, total_correct / results.shape[0]
            F1 = metrics.f1_score(total_labels, total_preds, average="macro")
            if F1 > best_F1:
                best_F1 = F1
                best_distribution = F.softmax(model.a, dim=-1)

            # logging.info(
            #     f"Validation loss|acc|F1|BEST: {loss:.6f}|{acc:.6f}|{F1:.6f} || {best_F1} || ")
        return best_F1, best_distribution
Example No. 18
def configure_model():
	model = BertForSequenceClassification.from_pretrained(BERT_MODEL_PATH,cache_dir=None,num_labels=1)
	model.zero_grad()
	model = model.to(device)

	param_optimizer = list(model.named_parameters())
	no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
	lr = 3e-5
	epsilon=1
	lr_d = {}
	weight_d = {}
	for n, p in param_optimizer:
	    if any(nd in n for nd in no_decay):
	        weight_d[n] = 0.0
	    else:
	        weight_d[n] = 0.01
	for n, p in param_optimizer[:5]:
	    lr_d[n] = lr*(epsilon**(11))
	for n, p in param_optimizer:
	    if 'bert.encoder.layer.' in n:
	        for i in range(0, 12):
	            if 'bert.encoder.layer.'+str(i)+'.'  in n:
	                lr_d[n] = lr*(epsilon**(11-i))
	                break
	for n, p in param_optimizer[-4:]:
	    lr_d[n] = lr
	comb_dict = {}
	for n, p in param_optimizer:
	    para = (weight_d[n], lr_d[n])
	    if para in comb_dict:
	        comb_dict[para].append(p)
	    else:
	        comb_dict[para] = [p]
	optimizer_grouped_parameters = []
	for i, j in comb_dict.items():
	    optimizer_grouped_parameters.append({'params':j, 'weight_decay' : i[0], 'lr' : i[1]})

	train = train_dataset

	num_train_optimization_steps = int(EPOCHS*len(train)/batch_size/accumulation_steps)

	optimizer = BertAdam(optimizer_grouped_parameters,
	                     lr=lr,
	                     warmup=0.05,
	                     t_total=num_train_optimization_steps)

	model, optimizer = amp.initialize(model, optimizer, opt_level="O1",verbosity=0)
	model=model.train()

	return model, optimizer, train
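
Because `epsilon` is set to 1 above, every layer ends up with the same learning rate; with `epsilon < 1` the same grouping scheme becomes layer-wise learning-rate decay. A small illustration of the resulting per-layer rates; the 0.95 value is hypothetical.

lr, epsilon = 3e-5, 0.95  # epsilon < 1 shrinks the lr of lower layers
layer_lrs = {i: lr * (epsilon ** (11 - i)) for i in range(12)}
# layer_lrs[11] == 3e-5 (top encoder layer); layer_lrs[0] is about 1.7e-5 (bottom layer)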
Example No. 19
    def __fit_net(self, train_dataloader):
        self.model.cuda()

        param_optimizer = list(self.model.named_parameters())

        no_decay = ['bias', 'gamma', 'beta']

        optimizer_grouped_parameters = [{
            'params': [
                p for n, p in param_optimizer
                if not any(nd in n for nd in no_decay)
            ],
            'weight_decay_rate':
            0.01
        }, {
            'params':
            [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
            'weight_decay_rate':
            0.0
        }]

        optimizer = BertAdam(optimizer_grouped_parameters,
                             lr=self.lr,
                             warmup=.1)

        train_loss_set = []

        for _ in trange(self.n_epochs, desc="Epoch"):
            self.model.train()

            tr_loss = 0
            nb_tr_examples, nb_tr_steps = 0, 0

            for step, batch in enumerate(train_dataloader):
                batch = tuple(t.to(self.device) for t in batch)
                b_input_ids, b_input_mask, b_labels, = batch
                optimizer.zero_grad()

                loss = self.model(b_input_ids,
                                  token_type_ids=None,
                                  attention_mask=b_input_mask,
                                  labels=b_labels)
                train_loss_set.append(loss.item())

                loss.backward()
                optimizer.step()

                # Update tracking variables
                tr_loss += loss.item()
                nb_tr_examples += b_input_ids.size(0)
                nb_tr_steps += 1

            print("Train loss: {}".format(tr_loss / nb_tr_steps))

            # add Validation

            self.model.eval()
Example No. 20
    def __init__(self, model: SketchPredictor, num_train_step: int,
                 freeze_bert_for_niter: int, config: Dict):
        self.model = model

        bert_params = list([
            (p_name, p)
            for (p_name,
                 p) in model.encoder_model.bert_model.named_parameters()
            if p.requires_grad
        ])
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']

        bert_grouped_parameters = [{
            'params':
            [p for n, p in bert_params if not any(nd in n for nd in no_decay)],
            'weight_decay':
            0.01
        }, {
            'params':
            [p for n, p in bert_params if any(nd in n for nd in no_decay)],
            'weight_decay':
            0.0
        }]

        self.other_params = [
            p for n, p in model.named_parameters()
            if 'bert_model' not in n and p.requires_grad
        ]

        self.bert_optimizer = BertAdam(bert_grouped_parameters,
                                       lr=config['bert_learning_rate'],
                                       warmup=0.1,
                                       t_total=num_train_step)

        self.optimizer = torch.optim.Adam(self.other_params, lr=0.001)

        self.freeze_bert_for_niter = freeze_bert_for_niter
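
The training step that goes with this setup is not shown; below is a minimal sketch of how the two optimizers are typically interleaved with the freeze window. The method name and its arguments are hypothetical.

# Hedged sketch of a training-step method for the trainer above.
def step(self, loss, iter_idx):
    loss.backward()
    self.optimizer.step()                      # non-BERT parameters always update
    if iter_idx >= self.freeze_bert_for_niter:
        self.bert_optimizer.step()             # BERT parameters update only after the freeze window
    self.optimizer.zero_grad()
    self.bert_optimizer.zero_grad()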
Example No. 21
 def _get_optimizer_(self, model):
     no_decay = ['bias', 'gamma', 'beta']
     param_optimizer = list(model.named_parameters())
     optimizer_grouped_parameters = [
         {'params': [p for n, p in param_optimizer \
             if not any(nd in n for nd in no_decay)],
         'weight_decay_rate': 0.01},
         {'params': [p for n, p in param_optimizer \
             if any(nd in n for nd in no_decay)],
         'weight_decay_rate': 0.0}
     ]
     # TODO allow diff lr for G & D
     return BertAdam(optimizer_grouped_parameters,
                     lr=self.args.lr,
                     warmup=self.args.warmup)
Example No. 22
def main():
    data = ingest.getTrainData(tokenize=False, lower=False)[:20]
    validData = ingest.getValidationData(tokenize=False, lower=False)[:20]
    transformedDataSet = transform(data)
    transformedValidationDataSet = transform(validData)

    optimizer = BertAdam(model.parameters(), lr=2e-6, warmup=.1)

    epochs = 100

    for epoch in range(epochs):
        model.train()
        optimizer.zero_grad()

        print("Training started for epoch {}".format(epoch + 1))

        correct = 0
        total = 0
        tr_loss = 0

        for dataPoint in tqdm(transformedDataSet):
            story = dataPoint["story"]
            segmentMask = dataPoint["segmentMask"]
            label = torch.LongTensor([0 if dataPoint["label"] else 1])

            # Forward pass
            seq_relationship_score = model(story, token_type_ids=segmentMask)

            if (dataPoint["label"] and
                (seq_relationship_score[0][0] > seq_relationship_score[0][1])):
                correct += 1
            elif (
                    not dataPoint["label"] and
                (seq_relationship_score[0][0] < seq_relationship_score[0][1])):
                correct += 1
            total += 1

            # Backward pass
            loss_fct = CrossEntropyLoss(ignore_index=-1)
            loss = loss_fct(seq_relationship_score.view(-1, 2), label.view(-1))
            tr_loss += loss.item()
            loss.backward()
            optimizer.step()

        print("Training accuracy for epoch {}: {}".format(
            epoch + 1, correct / total))
        print("Training loss for epoch {}: {}".format(epoch + 1,
                                                      tr_loss / total))

        correct = 0
        total = 0
Example No. 23
    def buildOptimizer(self,
                       neural,
                       epochs,
                       batch_size,
                       accumulation_steps,
                       lr=2e-5,
                       warmup=0.05):
        """

    build bert optimizer
  
    """
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']

        param_optimizer = list(neural.named_parameters())

        optimizer_grouped_parameters = [{
            'params': [
                p for n, p in param_optimizer
                if not any(nd in n for nd in no_decay)
            ],
            'weight_decay':
            0.01
        }, {
            'params':
            [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
            'weight_decay':
            0.0
        }]

        num_train_optimization_steps = int(epochs * len(self.sentences) /
                                           batch_size / accumulation_steps)

        if self.optimizer == 'BertAdam':
            return BertAdam(optimizer_grouped_parameters,
                            lr=lr,
                            warmup=warmup,
                            t_total=num_train_optimization_steps)
        else:
            return OpenAIAdam(optimizer_grouped_parameters,
                              lr=lr,
                              warmup=warmup,
                              t_total=num_train_optimization_steps)
Example No. 24
def setup_bert_optimizer_for_model(model, epochs, lrate, lrate_clf, batch_size,
                                   accum_steps, warmup, apex_mixed_precision,
                                   train_loader):
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params': [
            p for n, p in param_optimizer
            if ('classifier' not in n) and not any(nd in n for nd in no_decay)
        ],
        'weight_decay':
        0.01
    }, {
        'params': [
            p for n, p in param_optimizer
            if ('classifier' not in n) and any(nd in n for nd in no_decay)
        ],
        'weight_decay':
        0.0
    }, {
        'params': [p for n, p in param_optimizer if 'classifier' in n],
        'weight_decay':
        0.01,
        'lr':
        lrate_clf
    }]

    num_train_optimization_steps = math.ceil(
        (epochs + 1) * len(train_loader) / accum_steps)

    optimizer = BertAdam(optimizer_grouped_parameters,
                         lr=lrate,
                         warmup=warmup,
                         t_total=num_train_optimization_steps)
    if apex_mixed_precision:
        from apex import amp
        model, optimizer = amp.initialize(model,
                                          optimizer,
                                          opt_level="O1",
                                          verbosity=0)

    return model, optimizer
Example No. 25
def init_optimizer(model, config, *args, **params):
    optimizer_type = config.get("train", "optimizer")
    learning_rate = config.getfloat("train", "learning_rate")
    if optimizer_type == "adam":
        optimizer = optim.Adam(model.parameters(), lr=learning_rate,
                               weight_decay=config.getfloat("train", "weight_decay"))
    elif optimizer_type == "sgd":
        optimizer = optim.SGD(model.parameters(), lr=learning_rate,
                              weight_decay=config.getfloat("train", "weight_decay"))
    elif optimizer_type == "bert_adam":
        optimizer = BertAdam(model.parameters(), lr=learning_rate,
                             weight_decay=config.getfloat("train", "weight_decay"))
    elif optimizer_type == "lamb":
        optimizer = Lamb(model.parameters(), lr=learning_rate,
                             weight_decay=config.getfloat("train", "weight_decay"))

    else:
        raise NotImplementedError

    return optimizer
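
A self-contained usage sketch, assuming init_optimizer above is in scope and a configparser-style config object whose section and option names follow the accessors it uses; the Linear module is a stand-in for a real model.

import configparser
import torch

config = configparser.ConfigParser()
config['train'] = {'optimizer': 'bert_adam', 'learning_rate': '2e-5', 'weight_decay': '0.01'}

model = torch.nn.Linear(768, 2)            # placeholder module
optimizer = init_optimizer(model, config)  # builds a BertAdam with the configured lr and decay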
Example No. 26
def main():
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    n_gpu = torch.cuda.device_count()
    # print(torch.cuda.get_device_name(0))
    datadir = '../cola_public/raw/'
    filename = '../cndata/cntext.json'
    classes = [
        'C31-Enviornment', 'C32-Agriculture', 'C34-Economy', 'C38-Politics',
        'C39-Sports'
    ]
    # loadcn(filename, classes)
    MAX_LEN = 128
    batch_size = 32
    lr = 2e-5
    epoch = 4
    train_dataloader, dev_dataloader = loadcn(filename, classes, MAX_LEN,
                                              batch_size)
    model = BertForSequenceClassification.from_pretrained(
        "../bert-base-chinese", num_labels=len(classes))
    model.to(device)
    if n_gpu > 1:
        model = torch.nn.DataParallel(model)
    no_decay = ['bias', 'gamma', 'beta']
    param_optimizer = list(model.named_parameters())
    optimizer_grouped_parameters = [{
        'params':
        [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay_rate':
        0.0
    }, {
        'params':
        [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay_rate':
        0.01
    }]
    optimizer = BertAdam(optimizer_grouped_parameters, lr, warmup=0.1)
    train(model, device, epoch, train_dataloader, dev_dataloader, optimizer,
          n_gpu)
    torch.save(model.state_dict(), "../save/bert_cn%d" % epoch)
Example No. 27
def train(config):
    train_loader = load_dataset(config.train_path, config)
    dev_loader = load_dataset(config.dev_path, config)

    model = Model(config).to(config.device)
    optimizer = BertAdam(model.parameters(),
                         lr=config.lr,
                         warmup=0.05,
                         t_total=len(train_loader) * config.num_epoches)
    loss_func = torch.nn.CrossEntropyLoss()
    print_loss = 0
    best_acc = 0
    model.train()
    for epoch in range(config.num_epoches):
        for step, (batch_texts, batch_span) in enumerate(train_loader):
            max_len = max([len(i) for i in batch_texts])
            x = config.tokenizer.batch_encode_plus(batch_texts, add_special_tokens=True,
                                                   return_tensors="pt", max_length=max_len, pad_to_max_length=True)
            x["input_ids"] = x["input_ids"].to(config.device)
            x["attention_mask"] = x["attention_mask"].to(config.device)
            x["token_type_ids"] = x["token_type_ids"].to(config.device)
            batch_span = batch_span.to(config.device)

            out = model(input_ids=x["input_ids"], attention_mask=x["attention_mask"], token_type_ids=x["token_type_ids"])
            optimizer.zero_grad()
            loss = loss_func(out, batch_span)
            loss.backward()
            optimizer.step()

            if step % 1  == 0:
                corrects = (torch.max(out, 1)[1].view(batch_span.size()).data == batch_span.data).sum()
                train_acc = 100.0 * corrects / config.batch_size
                # print("epoch:", epoch, "step:", step, "loss:", print_loss.item() / 50)
                sys.stdout.write(
                    '\rBatch[{}] - loss: {:.6f}  acc: {:.4f}%({}/{})'.format(step,
                                                                             loss.item(),
                                                                             train_acc,
                                                                             corrects,
                                                                             config.batch_size))
            if step % 50 == 0:
                dev_acc = eval(dev_loader, model, config)
                if dev_acc > best_acc:
                    best_acc = dev_acc
                    print(
                        'Saving best model, acc: {:.4f}%\n'.format(best_acc))
                    save(model, config.model_path, 'best', step)
Example No. 28
def build_optimizer(model, num_train_steps, learning_rate, warmup_proportion,
                    weight_decay):
    # Prepare optimizer
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params':
        [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay':
        weight_decay
    }, {
        'params':
        [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay':
        0.0
    }]

    optimizer = BertAdam(optimizer_grouped_parameters,
                         lr=learning_rate,
                         warmup=warmup_proportion,
                         t_total=num_train_steps)

    return optimizer
Example No. 29
    def configure_optimizers(self):
        if self.hparams.model in ["bert", "concatbert", "mmbt"]:
            total_steps = (self.hparams.train_data_len /
                           self.hparams.batch_sz /
                           self.hparams.gradient_accumulation_steps *
                           self.hparams.max_epochs)

            param_optimizer = self.exclude_from_wt_decay(
                list(self.named_parameters()),
                ["bias", "LayerNorm.bias", "LayerNorm.weight"])

            optimizer = BertAdam(
                param_optimizer,
                lr=self.hparams.lr,
                warmup=self.hparams.warmup,
                t_total=total_steps,
            )
        else:
            optimizer = optim.Adam(self.parameters(), lr=self.hparams.lr)

        scheduler = optim.lr_scheduler.ReduceLROnPlateau(
            optimizer,
            "max",
            patience=self.hparams.lr_patience,
            verbose=True,
            factor=self.hparams.lr_factor,
        )

        scheduler = {
            'scheduler': scheduler,
            'monitor': 'val_checkpoint_on',
            'interval': 'epoch',
            'frequency': self.hparams.lr_patience
        }

        return [optimizer], [scheduler]
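
exclude_from_wt_decay() is referenced above but not defined here; the sketch below is consistent with how it is called and mirrors the grouping pattern used throughout this page. It is an assumption about the actual implementation.

def exclude_from_wt_decay(self, named_params, skip_list, weight_decay=0.01):
    decayed, exempt = [], []
    for name, param in named_params:
        if not param.requires_grad:
            continue
        (exempt if any(s in name for s in skip_list) else decayed).append(param)
    return [
        {"params": decayed, "weight_decay": weight_decay},
        {"params": exempt, "weight_decay": 0.0},
    ]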
Example No. 30
def bert_train(inputs, token_type_ids, masked_lm_labels):
    # Pre-training / fine-tuning BERT for text generation
    model = BertForMaskedLM.from_pretrained('bert-base-uncased')
    # Prepare optimizer
    param_optimizer = list(model.named_parameters())
    # hack: drop the unused pooler parameters, whose None gradients break apex
    param_optimizer = [n for n in param_optimizer if 'pooler' not in n[0]]

    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params':
        [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay':
        0.01
    }, {
        'params':
        [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay':
        0.0
    }]
    optimizer = BertAdam(optimizer_grouped_parameters,
                         lr=5e-5,
                         warmup=0.1,
                         t_total=300)
    model.train()
    n_steps = 10
    n_batches = len(inputs)
    for epoch in range(0, n_steps):
        eveloss = 0
        for i in range(n_batches):
            loss = model(inputs[i],
                         token_type_ids=token_type_ids[i],
                         masked_lm_labels=masked_lm_labels[i])
            eveloss += loss.mean().item()
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()
        print("step " + str(epoch) + " : " + str(eveloss))
    return model
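
The inputs, token_type_ids, and masked_lm_labels batches are prepared elsewhere. Below is a hedged sketch of how a single masked-LM batch could be built with the same library; the 15% masking rate follows the standard BERT recipe, and the helper is illustrative, not part of the code above.

import random
import torch
from pytorch_pretrained_bert import BertTokenizer

def make_masked_lm_batch(text, tokenizer, mask_prob=0.15):
    tokens = ['[CLS]'] + tokenizer.tokenize(text) + ['[SEP]']
    ids = tokenizer.convert_tokens_to_ids(tokens)
    labels = [-1] * len(ids)                 # -1 positions are ignored by the MLM loss
    for i in range(1, len(ids) - 1):         # never mask [CLS] or [SEP]
        if random.random() < mask_prob:
            labels[i] = ids[i]
            ids[i] = tokenizer.convert_tokens_to_ids(['[MASK]'])[0]
    input_ids = torch.tensor([ids])
    token_type_ids = torch.zeros_like(input_ids)
    masked_lm_labels = torch.tensor([labels])
    return input_ids, token_type_ids, masked_lm_labels

# tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# batch = make_masked_lm_batch("text generation with BERT", tokenizer)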