def get_optimizer(args, model):
    logger = get_logger(args.log_name)
    args.warmup_steps = math.ceil(args.warmup_prop * args.max_train_steps)
    if args.optimizer == 'adamw-bertology':
        if args.different_lr:
            optimizer_grouped_parameters = _get_bertology_different_lr_grouped_parameters(
                args, model)
        else:
            optimizer_grouped_parameters = _get_bertology_optimizer_grouped_parameters(
                args, model)
        optimizer = huggingfaceOptim.AdamW(optimizer_grouped_parameters,
                                           lr=args.learning_rate,
                                           eps=args.adam_epsilon,
                                           betas=(args.beta1, args.beta2))
        scheduler = huggingfaceOptim.WarmupLinearSchedule(
            optimizer,
            warmup_steps=args.warmup_steps,
            t_total=args.max_train_steps)
        if args.local_rank in [-1, 0]:
            logger.info('Use Huggingface\'s AdamW Optimizer')
    elif args.optimizer == 'adamw-torch':
        try:
            from torch.optim import AdamW
        except ImportError as e:
            debug_print(f'torch version: {torch.__version__}')
            raise e
        if args.different_lr:
            optimizer_grouped_parameters = _get_bertology_different_lr_grouped_parameters(
                args, model)
        else:
            optimizer_grouped_parameters = _get_bertology_optimizer_grouped_parameters(
                args, model)
        optimizer = AdamW(optimizer_grouped_parameters,
                          lr=args.learning_rate,
                          eps=args.adam_epsilon,
                          betas=(args.beta1, args.beta2))
        scheduler = huggingfaceOptim.WarmupLinearSchedule(
            optimizer,
            warmup_steps=args.warmup_steps,
            t_total=args.max_train_steps)
    elif args.optimizer == 'sgd':
        optimizer = torch.optim.SGD(model.parameters(), lr=args.learning_rate)
        scheduler = None
    elif args.optimizer == 'adagrad':
        optimizer = torch.optim.Adagrad(model.parameters(),
                                        lr=args.learning_rate)
        scheduler = None
    elif args.optimizer == 'adam':
        optimizer = torch.optim.Adam(model.parameters(),
                                     lr=args.learning_rate,
                                     betas=args.betas,
                                     eps=args.eps,
                                     weight_decay=args.weight_decay)
        scheduler = None
    elif args.optimizer == 'adamax':
        optimizer = torch.optim.Adamax(model.parameters())  # use default lr
        scheduler = None
    else:
        raise Exception("Unsupported optimizer: {}".format(args.optimizer))
    return optimizer, scheduler
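
For reference, the grouping and scheduling done by the helpers above can be reproduced with plain PyTorch. The sketch below is a minimal, self-contained version of the same pattern (the toy model, hyperparameter values, and the hand-rolled lr_lambda are illustrative stand-ins, not the project's _get_bertology_optimizer_grouped_parameters or WarmupLinearSchedule): biases and LayerNorm weights are excluded from weight decay, and the learning rate warms up linearly before decaying linearly to zero.

import torch
import torch.nn as nn

# Toy stand-in for the real model; the attribute names mirror the BERT
# convention so the no-decay filter below has something to match.
class TinyEncoder(nn.Module):
    def __init__(self):
        super().__init__()
        self.dense = nn.Linear(16, 16)
        self.LayerNorm = nn.LayerNorm(16)
        self.classifier = nn.Linear(16, 2)

    def forward(self, x):
        return self.classifier(self.LayerNorm(self.dense(x)))

model = TinyEncoder()
learning_rate, weight_decay = 2e-5, 0.01      # illustrative values
warmup_steps, max_train_steps = 100, 1000     # illustrative values

# Exclude biases and LayerNorm weights from weight decay, as above.
no_decay = ['bias', 'LayerNorm.weight']
grouped_params = [
    {'params': [p for n, p in model.named_parameters()
                if not any(nd in n for nd in no_decay)],
     'weight_decay': weight_decay},
    {'params': [p for n, p in model.named_parameters()
                if any(nd in n for nd in no_decay)],
     'weight_decay': 0.0},
]

optimizer = torch.optim.AdamW(grouped_params, lr=learning_rate,
                              betas=(0.9, 0.999), eps=1e-8)

# Linear warmup followed by linear decay, expressed as an lr multiplier.
def lr_lambda(step):
    if step < warmup_steps:
        return step / max(1, warmup_steps)
    return max(0.0, (max_train_steps - step) / max(1, max_train_steps - warmup_steps))

scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda)
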
Example #2
# GPU placement; the enclosing condition is cut off in this snippet,
# so a CUDA-availability check is assumed here as a stand-in.
if torch.cuda.is_available():  # stand-in for the original, unshown condition
    JointModel = JointModel.cuda()
    criterion = criterion.cuda()
    # relation_model = relation_model.cuda()
    if args.use_RL:
        RL_model = RL_model.cuda()
    # RL_model.cuda()

out_losses = []
# RL_RE_losses = []
# RE_rewardsall = []
# TOTAL_rewardsall = []
print_loss_total = 0  # Reset every print_every
# plot_loss_total = 0  # Reset every plot_every

# Optimize only the trainable parameters of the joint model.
_params = filter(lambda p: p.requires_grad, JointModel.parameters())
if args.encoder_model == 'BERT':
    optimizer = optimization.AdamW(_params,
                                   lr=learning_rate,
                                   weight_decay=l2)
else:
    optimizer = optim.Adam(_params, lr=learning_rate, weight_decay=l2)

logger.info(JointModel)
logger.info(
    "hidden_dim: %s, dropout_NER: %s, dropout_RE: %s, lr: %s, epoch: %s, batch_size: %s"
    % (args.hidden_dim, args.dropout_NER, args.dropout_RE, args.lr,
       args.epochRL, args.batchsize))
# encoder_optimizer = optim.Adam(encoder.parameters(), lr=learning_rate, weight_decay=l2)  # SGD
# decoder_optimizer = optim.Adam(decoder.parameters(), lr=learning_rate, weight_decay=l2)
# RE_optimizer = optim.Adam(relation_model.parameters(), lr=learning_rate, weight_decay=l2)
# RL_optimizer = optim.Adam(RL_model.parameters(), lr=args.lr_RL, weight_decay=l2)
# sentence_reward_noisy = [0 for i in range(args.batchsize)]
# noisy_sentences_vec = Variable(torch.FloatTensor(1, dim).fill_(0))
torch.manual_seed(args.seed)
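
The snippet above hands the optimizer only the parameters with requires_grad=True. Below is a minimal sketch of that pattern with a toy two-part model (module names, sizes, and hyperparameters are invented for illustration): freeze one part, collect the remaining trainable parameters, and build the optimizer from those.

import torch
import torch.nn as nn

# Hypothetical two-part model: a frozen encoder plus a trainable head.
encoder = nn.Linear(32, 32)
head = nn.Linear(32, 5)
joint_model = nn.Sequential(encoder, head)

# Freeze the encoder; only the head should be updated.
for p in encoder.parameters():
    p.requires_grad = False

# Same pattern as above: hand the optimizer only the trainable parameters.
trainable_params = [p for p in joint_model.parameters() if p.requires_grad]
optimizer = torch.optim.Adam(trainable_params, lr=1e-3, weight_decay=1e-5)

print(sum(p.numel() for p in trainable_params))  # only the head's parameters train
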
Example #3
def layerwise_decay_optimizer(model, lr, layerwise_decay=None):
    """Build an AdamW optimizer, optionally with layer-wise learning-rate decay.

    If `layerwise_decay` is given (e.g. 0.95), encoder layer i of the 12 BERT
    layers is trained with lr * layerwise_decay ** (11 - i), so lower layers
    receive smaller learning rates than the top layer.
    """
    # Parameters that should not receive weight decay.
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']

    # Default: two groups (with and without weight decay), all at the same lr.
    optimizer_grouped_parameters = [
        {
            'params': [p for n, p in model.named_parameters()
                       if not any(nd in n for nd in no_decay)],
            'weight_decay': 0.01
        },
        {
            'params': [p for n, p in model.named_parameters()
                       if any(nd in n for nd in no_decay)],
            'weight_decay': 0.0
        },
    ]

    if layerwise_decay:  # a decay factor was passed
        optimizer_grouped_parameters = []
        # Per-layer groups for the 12 BERT encoder layers, with decayed lr.
        for i in range(12):
            layer_prefix = 'bert.encoder.layer.' + str(i) + '.'
            layer_lr = lr * (layerwise_decay ** (11 - i))
            optimizer_grouped_parameters += [
                {
                    'params': [p for n, p in model.named_parameters()
                               if layer_prefix in n
                               and not any(nd in n for nd in no_decay)],
                    'lr': layer_lr,
                    'weight_decay': 0.01
                },
                {
                    'params': [p for n, p in model.named_parameters()
                               if layer_prefix in n
                               and any(nd in n for nd in no_decay)],
                    'lr': layer_lr,
                    'weight_decay': 0
                },
            ]

        # Everything outside the encoder layers (embeddings, pooler, classifier)
        # keeps the base learning rate.
        optimizer_grouped_parameters += [
            {
                'params': [p for n, p in model.named_parameters()
                           if 'bert.encoder.layer.' not in n
                           and not any(nd in n for nd in no_decay)],
                'weight_decay': 0.01
            },
            {
                'params': [p for n, p in model.named_parameters()
                           if 'bert.encoder.layer.' not in n
                           and any(nd in n for nd in no_decay)],
                'weight_decay': 0
            },
        ]

    optimizer = optimization.AdamW(optimizer_grouped_parameters,
                                   lr=lr,
                                   correct_bias=False)
    return optimizer

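To see what the layer-wise groups produce, the sketch below rebuilds the same grouping on a toy module whose parameter names mimic bert.encoder.layer.<i>. (the module, dimensions, and learning-rate values are invented for illustration); the lowest layers end up with the smallest learning rates.

import torch
import torch.nn as nn

# Toy module whose parameter names look like bert.encoder.layer.<i>. so the
# grouping logic above can be exercised without loading a real checkpoint.
class FakeBert(nn.Module):
    def __init__(self, num_layers=12, dim=8):
        super().__init__()
        self.encoder = nn.Module()
        self.encoder.layer = nn.ModuleList(nn.Linear(dim, dim) for _ in range(num_layers))

class FakeModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.bert = FakeBert()
        self.classifier = nn.Linear(8, 2)

model = FakeModel()
base_lr, decay = 2e-5, 0.95   # illustrative values

# Rebuild per-layer groups the same way and inspect the effective learning rates.
groups = []
for i in range(12):
    prefix = 'bert.encoder.layer.' + str(i) + '.'
    groups.append({
        'params': [p for n, p in model.named_parameters() if prefix in n],
        'lr': base_lr * decay ** (11 - i),
    })
# Everything outside the encoder layers keeps the base learning rate.
groups.append({'params': [p for n, p in model.named_parameters()
                          if 'bert.encoder.layer.' not in n]})

optimizer = torch.optim.AdamW(groups, lr=base_lr)
for g in optimizer.param_groups:
    print(round(g['lr'], 8))   # lowest layers get the smallest learning rates
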
def get_optimizer(args, model):
    args.warmup_steps = math.ceil(args.warmup_prop * args.max_train_steps)
    if args.optimizer == 'adamw-bert':
        no_decay = ['bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [
            {
                'params': [p for n, p in model.named_parameters()
                           if not any(nd in n for nd in no_decay)],
                'weight_decay': args.weight_decay
            },
            {
                'params': [p for n, p in model.named_parameters()
                           if any(nd in n for nd in no_decay)],
                'weight_decay': 0.0
            },
        ]
        optimizer = huggingfaceOptim.AdamW(optimizer_grouped_parameters,
                                           lr=args.learning_rate,
                                           eps=args.adam_epsilon,
                                           betas=(args.beta1, args.beta2))
        scheduler = huggingfaceOptim.WarmupLinearSchedule(
            optimizer,
            warmup_steps=args.warmup_steps,
            t_total=args.max_train_steps)
        debug_print('\n - Use Huggingface\'s AdamW Optimizer')
    elif args.optimizer == 'adamw-torch':
        try:
            from torch.optim import AdamW
        except ImportError as e:
            debug_print(f'torch version: {torch.__version__}')
            raise e
        no_decay = ['bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [
            {
                'params': [p for n, p in model.named_parameters()
                           if not any(nd in n for nd in no_decay) and p.requires_grad],
                'weight_decay': args.weight_decay
            },
            {
                'params': [p for n, p in model.named_parameters()
                           if any(nd in n for nd in no_decay) and p.requires_grad],
                'weight_decay': 0.0
            },
        ]
        optimizer = AdamW(optimizer_grouped_parameters,
                          lr=args.learning_rate,
                          eps=args.adam_epsilon,
                          betas=(args.beta1, args.beta2))
        scheduler = huggingfaceOptim.WarmupLinearSchedule(
            optimizer,
            warmup_steps=args.warmup_steps,
            t_total=args.max_train_steps)
    elif args.optimizer == 'sgd':
        optimizer = torch.optim.SGD(model.parameters(), lr=args.learning_rate)
        scheduler = None
    elif args.optimizer == 'adagrad':
        optimizer = torch.optim.Adagrad(model.parameters(),
                                        lr=args.learning_rate)
        scheduler = None
    elif args.optimizer == 'adam':
        optimizer = torch.optim.Adam(model.parameters(),
                                     lr=args.learning_rate,
                                     betas=args.betas,
                                     eps=args.eps,
                                     weight_decay=args.weight_decay)
        scheduler = None
    elif args.optimizer == 'adamax':
        optimizer = torch.optim.Adamax(model.parameters())  # use default lr
        scheduler = None
    else:
        raise Exception("Unsupported optimizer: {}".format(args.optimizer))
    return optimizer, scheduler
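
WarmupLinearSchedule used in both versions of get_optimizer comes from the older pytorch-transformers optimization module; in current transformers releases the same linear warmup-then-linear-decay schedule is exposed as get_linear_schedule_with_warmup. Below is a minimal sketch of the modern equivalent, assuming a recent transformers install (the stand-in model and step counts are placeholders for the args values used above).

import torch
from transformers import get_linear_schedule_with_warmup

model = torch.nn.Linear(8, 2)          # stand-in for the real model
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, eps=1e-8)
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=100,              # plays the role of args.warmup_steps
    num_training_steps=1000)           # plays the role of args.max_train_steps
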