def get_optimizer(model, args):
    """Set up the optimizer."""

    # Unwrap the model from its DDP / FP16 wrappers, then build parameter
    # groups (weight decay and non-decay).
    while isinstance(model, (args.DDP_type, FP16_Module)):
        model = model.module
    layers = model.model.bert.encoder.layer
    pooler = model.model.bert.pooler
    lmheads = model.model.cls.predictions
    nspheads = model.model.cls.seq_relationship
    embeddings = model.model.bert.embeddings
    param_groups = []
    param_groups += list(get_params_for_weight_decay_optimization(layers))
    param_groups += list(get_params_for_weight_decay_optimization(pooler))
    param_groups += list(get_params_for_weight_decay_optimization(nspheads))
    param_groups += list(get_params_for_weight_decay_optimization(embeddings))
    param_groups += list(
        get_params_for_weight_decay_optimization(lmheads.transform))
    # param_groups[1] is the no-weight-decay group built from the encoder
    # layers, so the LM head bias is excluded from weight decay.
    param_groups[1]['params'].append(lmheads.bias)

    # Add model parallel attribute if it is not set.
    for param_group in param_groups:
        for param in param_group['params']:
            if not hasattr(param, 'model_parallel'):
                param.model_parallel = False

    # Use Adam.
    betas = (0.9, 0.999)
    optimizer = Adam(param_groups,
                     betas=betas,
                     lr=args.lr,
                     weight_decay=args.weight_decay)

    # Wrap into fp16 optimizer.
    if args.fp16:
        optimizer = FP16_Optimizer(optimizer,
                                   static_loss_scale=args.loss_scale,
                                   dynamic_loss_scale=args.dynamic_loss_scale,
                                   dynamic_loss_args={
                                       'scale_window': args.loss_scale_window,
                                       'min_scale': args.min_scale,
                                       'delayed_shift': args.hysteresis})

    return optimizer
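

# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original file): `get_optimizer` assumes
# that `get_params_for_weight_decay_optimization` returns two groups per call,
# the first with weight decay applied and the second (weight_decay=0.0) for
# LayerNorm weights and biases; that is why `param_groups[1]` above receives
# the LM head bias. The helper below is a minimal, hedged sketch of that
# convention, not the project's actual implementation, and it carries a
# `_sketch` suffix to avoid clashing with the real helper.
# ---------------------------------------------------------------------------
def get_params_for_weight_decay_optimization_sketch(module):
    """Split a module's parameters into decay / no-decay groups (sketch)."""
    import torch  # local import to keep the sketch self-contained

    weight_decay_params = {'params': []}
    no_weight_decay_params = {'params': [], 'weight_decay': 0.0}
    for module_ in module.modules():
        # Assumption: LayerNorm weights and all biases skip weight decay; a
        # real BERT implementation may use its own LayerNorm subclass instead
        # of torch.nn.LayerNorm.
        if isinstance(module_, torch.nn.LayerNorm):
            no_weight_decay_params['params'].extend(
                p for p in module_._parameters.values() if p is not None)
        else:
            weight_decay_params['params'].extend(
                p for n, p in module_._parameters.items()
                if p is not None and n != 'bias')
            no_weight_decay_params['params'].extend(
                p for n, p in module_._parameters.items()
                if p is not None and n == 'bias')
    return weight_decay_params, no_weight_decay_params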