def get_optimizer(args, model):
    logger = get_logger(args.log_name)
    args.warmup_steps = math.ceil(args.warmup_prop * args.max_train_steps)
    if args.optimizer == 'adamw-bertology':
        if args.different_lr:
            optimizer_grouped_parameters = _get_bertology_different_lr_grouped_parameters(args, model)
        else:
            optimizer_grouped_parameters = _get_bertology_optimizer_grouped_parameters(args, model)
        optimizer = huggingfaceOptim.AdamW(optimizer_grouped_parameters,
                                           lr=args.learning_rate,
                                           eps=args.adam_epsilon,
                                           betas=(args.beta1, args.beta2))
        scheduler = huggingfaceOptim.WarmupLinearSchedule(
            optimizer, warmup_steps=args.warmup_steps, t_total=args.max_train_steps)
        if args.local_rank in [-1, 0]:
            logger.info("Use Huggingface's AdamW Optimizer")
    elif args.optimizer == 'adamw-torch':
        try:
            from torch.optim import AdamW
        except ImportError as e:
            debug_print(f'torch version: {torch.__version__}')
            raise e
        if args.different_lr:
            optimizer_grouped_parameters = _get_bertology_different_lr_grouped_parameters(args, model)
        else:
            optimizer_grouped_parameters = _get_bertology_optimizer_grouped_parameters(args, model)
        optimizer = AdamW(optimizer_grouped_parameters,
                          lr=args.learning_rate,
                          eps=args.adam_epsilon,
                          betas=(args.beta1, args.beta2))
        scheduler = huggingfaceOptim.WarmupLinearSchedule(
            optimizer, warmup_steps=args.warmup_steps, t_total=args.max_train_steps)
    elif args.optimizer == 'sgd':
        optimizer = torch.optim.SGD(model.parameters(), lr=args.learning_rate)
        scheduler = None
    elif args.optimizer == 'adagrad':
        optimizer = torch.optim.Adagrad(model.parameters(), lr=args.learning_rate)
        scheduler = None
    elif args.optimizer == 'adam':
        optimizer = torch.optim.Adam(model.parameters(),
                                     lr=args.learning_rate,
                                     betas=args.betas,
                                     eps=args.eps,
                                     weight_decay=args.weight_decay)
        scheduler = None
    elif args.optimizer == 'adamax':  # originally checked args.rnn_optimizer, which this chain on args.optimizer never reaches consistently
        optimizer = torch.optim.Adamax(model.parameters())  # use default lr
        scheduler = None
    else:
        raise ValueError("Unsupported optimizer: {}".format(args.optimizer))
    return optimizer, scheduler
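
# --- Usage sketch (illustrative, not part of the original code) ---
# Shows how the (optimizer, scheduler) pair returned above is typically consumed.
# `train_dataloader` and the `model(**batch) -> (loss, ...)` interface are
# hypothetical placeholders; only get_optimizer itself comes from this file.
def _example_training_step(args, model, train_dataloader):
    optimizer, scheduler = get_optimizer(args, model)
    model.train()
    for batch in train_dataloader:
        loss = model(**batch)[0]  # assumes the model returns the loss first
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        if scheduler is not None:  # the sgd/adagrad/adam/adamax branches return None
            scheduler.step()       # WarmupLinearSchedule advances once per step
        optimizer.zero_grad()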
JointModel = JointModel.cuda()
criterion = criterion.cuda()
# relation_model = relation_model.cuda()
if args.use_RL:
    RL_model = RL_model.cuda()  # RL_model.cuda()

out_losses = []
# RL_RE_losses = []
# RE_rewardsall = []
# TOTAL_rewardsall = []
print_loss_total = 0  # Reset every print_every
# plot_loss_total = 0  # Reset every plot_every

_params = filter(lambda p: p.requires_grad, JointModel.parameters())
if args.encoder_model == 'BERT':
    optimizer = optimization.AdamW(_params, lr=learning_rate, weight_decay=l2)
else:
    optimizer = optim.Adam(_params, lr=learning_rate, weight_decay=l2)

logger.info(JointModel)
logger.info(
    "hidden_dim: %s, dropout_NER: %s, dropout_RE: %s, lr: %s, epoch: %s, batch_size: %s"
    % (args.hidden_dim, args.dropout_NER, args.dropout_RE, args.lr,
       args.epochRL, args.batchsize))

# encoder_optimizer = optim.Adam(encoder.parameters(), lr=learning_rate, weight_decay=l2)  # SGD
# decoder_optimizer = optim.Adam(decoder.parameters(), lr=learning_rate, weight_decay=l2)
# RE_optimizer = optim.Adam(relation_model.parameters(), lr=learning_rate, weight_decay=l2)
# RL_optimizer = optim.Adam(RL_model.parameters(), lr=args.lr_RL, weight_decay=l2)
# sentence_reward_noisy = [0 for i in range(args.batchsize)]
# noisy_sentences_vec = Variable(torch.FloatTensor(1, dim).fill_(0))

torch.manual_seed(args.seed)
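
# --- Illustrative sketch (assumption, not original code) ---
# If the commented-out per-component optimizers above (encoder_optimizer,
# decoder_optimizer, RE_optimizer, RL_optimizer) were re-enabled, each one would
# need its own zero_grad()/step() around a shared backward pass, e.g.:
def _example_multi_optimizer_step(loss, optimizers):
    for opt in optimizers:
        opt.zero_grad()
    loss.backward()       # gradients flow into every component's parameters
    for opt in optimizers:
        opt.step()        # each optimizer updates only the parameters it owns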
def layerwise_decay_optimizer(model, lr, layerwise_decay=None):
    # Optimizer setup, optionally with layer-wise learning-rate decay.
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params': [
            p for n, p in model.named_parameters()
            if not any(nd in n for nd in no_decay)
        ],
        'weight_decay': 0.01
    }, {
        'params': [
            p for n, p in model.named_parameters()
            if any(nd in n for nd in no_decay)
        ],
        'weight_decay': 0.0
    }]
    if layerwise_decay is not None:  # was `is True`; layerwise_decay is a float decay factor, not a flag
        # Assumes a 12-layer BERT-base encoder: deeper layers get a larger lr.
        optimizer_grouped_parameters = []
        for i in range(12):
            layer_prefix = 'bert.encoder.layer.' + str(i) + '.'
            tmp = [{
                'params': [
                    p for n, p in model.named_parameters()
                    if layer_prefix in n and not any(nd in n for nd in no_decay)
                ],
                'lr': lr * (layerwise_decay ** (11 - i)),
                'weight_decay': 0.01
            }, {
                'params': [
                    p for n, p in model.named_parameters()
                    if layer_prefix in n and any(nd in n for nd in no_decay)
                ],
                'lr': lr * (layerwise_decay ** (11 - i)),
                'weight_decay': 0.0
            }]
            optimizer_grouped_parameters += tmp
        # Everything outside the encoder layers (embeddings, pooler, task head) keeps the base lr.
        tmp = [{
            'params': [
                p for n, p in model.named_parameters()
                if 'bert.encoder.layer.' not in n and not any(nd in n for nd in no_decay)
            ],
            'weight_decay': 0.01
        }, {
            'params': [
                p for n, p in model.named_parameters()
                if 'bert.encoder.layer.' not in n and any(nd in n for nd in no_decay)
            ],
            'weight_decay': 0.0
        }]
        optimizer_grouped_parameters += tmp
    optimizer = optimization.AdamW(optimizer_grouped_parameters, lr=lr, correct_bias=False)
    return optimizer
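
# --- Usage sketch (assumption, not original code) ---
# Builds the optimizer with a layer-wise decay factor: the deepest encoder layer
# (11) keeps the base lr, while layer 0 trains at lr * layerwise_decay**11. The
# checkpoint and hyper-parameters are illustrative; any 12-layer model whose
# parameters are named 'bert.encoder.layer.{i}.' works the same way.
def _example_layerwise_optimizer():
    from transformers import BertForSequenceClassification
    model = BertForSequenceClassification.from_pretrained('bert-base-uncased')
    return layerwise_decay_optimizer(model, lr=2e-5, layerwise_decay=0.95)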
def get_optimizer(args, model):
    args.warmup_steps = math.ceil(args.warmup_prop * args.max_train_steps)
    if args.optimizer == 'adamw-bert':
        no_decay = ['bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [{
            'params': [
                p for n, p in model.named_parameters()
                if not any(nd in n for nd in no_decay)
            ],
            'weight_decay': args.weight_decay
        }, {
            'params': [
                p for n, p in model.named_parameters()
                if any(nd in n for nd in no_decay)
            ],
            'weight_decay': 0.0
        }]
        optimizer = huggingfaceOptim.AdamW(optimizer_grouped_parameters,
                                           lr=args.learning_rate,
                                           eps=args.adam_epsilon,
                                           betas=(args.beta1, args.beta2))
        scheduler = huggingfaceOptim.WarmupLinearSchedule(
            optimizer, warmup_steps=args.warmup_steps, t_total=args.max_train_steps)
        debug_print("\n - Use Huggingface's AdamW Optimizer")
    elif args.optimizer == 'adamw-torch':
        try:
            from torch.optim import AdamW
        except ImportError as e:
            debug_print(f'torch version: {torch.__version__}')
            raise e
        no_decay = ['bias', 'LayerNorm.weight']
        # Unlike the 'adamw-bert' branch, this branch only optimizes trainable parameters.
        optimizer_grouped_parameters = [{
            'params': [
                p for n, p in model.named_parameters()
                if not any(nd in n for nd in no_decay) and p.requires_grad
            ],
            'weight_decay': args.weight_decay
        }, {
            'params': [
                p for n, p in model.named_parameters()
                if any(nd in n for nd in no_decay) and p.requires_grad
            ],
            'weight_decay': 0.0
        }]
        optimizer = AdamW(optimizer_grouped_parameters,
                          lr=args.learning_rate,
                          eps=args.adam_epsilon,
                          betas=(args.beta1, args.beta2))
        scheduler = huggingfaceOptim.WarmupLinearSchedule(
            optimizer, warmup_steps=args.warmup_steps, t_total=args.max_train_steps)
    elif args.optimizer == 'sgd':
        optimizer = torch.optim.SGD(model.parameters(), lr=args.learning_rate)
        scheduler = None
    elif args.optimizer == 'adagrad':
        optimizer = torch.optim.Adagrad(model.parameters(), lr=args.learning_rate)
        scheduler = None
    elif args.optimizer == 'adam':
        optimizer = torch.optim.Adam(model.parameters(),
                                     lr=args.learning_rate,
                                     betas=args.betas,
                                     eps=args.eps,
                                     weight_decay=args.weight_decay)
        scheduler = None
    elif args.optimizer == 'adamax':  # originally checked args.rnn_optimizer, which this chain on args.optimizer never reaches consistently
        optimizer = torch.optim.Adamax(model.parameters())  # use default lr
        scheduler = None
    else:
        raise ValueError("Unsupported optimizer: {}".format(args.optimizer))
    return optimizer, scheduler