# NOTE: these imports are inferred from the calls below; the fp16 helper path
# matches the inline `from utils.fp16 import ...` used in the last variant, and
# `log` is assumed to be a plain module-level logger.
import logging

import paddle.fluid as fluid

from utils.fp16 import (apply_dynamic_loss_scaling, create_master_params_grads,
                        master_param_to_train_param)

log = logging.getLogger(__name__)


def optimization(loss,
                 warmup_steps,
                 num_train_steps,
                 learning_rate,
                 train_program,
                 startup_prog,
                 weight_decay,
                 scheduler='linear_warmup_decay',
                 use_fp16=False,
                 use_dynamic_loss_scaling=False,
                 init_loss_scaling=1.0,
                 incr_every_n_steps=1000,
                 decr_every_n_nan_or_inf=2,
                 incr_ratio=2.0,
                 decr_ratio=0.8):
    """Build the training optimizer for `train_program`.

    Sets up the learning-rate schedule, Adam, global-norm gradient clipping,
    optional fp16 training with (dynamic) loss scaling, and decoupled weight
    decay that skips LayerNorm and bias parameters.
    """
    scheduled_lr, loss_scaling = None, None
    if scheduler == 'noam_decay':
        if warmup_steps > 0:
            scheduled_lr = fluid.layers.learning_rate_scheduler\
                .noam_decay(1 / (warmup_steps * (learning_rate ** 2)),
                            warmup_steps)
        else:
            print(
                "WARNING: noam decay of learning rate should have positive "
                "warmup steps but given {}, using constant learning rate "
                "instead!".format(warmup_steps))
            scheduled_lr = fluid.layers.create_global_var(
                name=fluid.unique_name.generate("learning_rate"),
                shape=[1],
                value=learning_rate,
                dtype='float32',
                persistable=True)
    elif scheduler == 'linear_warmup_decay':
        if warmup_steps > 0:
            scheduled_lr = linear_warmup_decay(learning_rate, warmup_steps,
                                               num_train_steps)
        else:
            print(
                "WARNING: linear warmup decay of learning rate should have "
                "positive warmup steps but given {}, using constant learning "
                "rate instead!".format(warmup_steps))
            scheduled_lr = fluid.layers.create_global_var(
                name=fluid.unique_name.generate("learning_rate"),
                shape=[1],
                value=learning_rate,
                dtype='float32',
                persistable=True)
    else:
        raise ValueError("Unknown learning rate scheduler, should be "
                         "'noam_decay' or 'linear_warmup_decay'")

    optimizer = fluid.optimizer.Adam(learning_rate=scheduled_lr)
    fluid.clip.set_gradient_clip(
        clip=fluid.clip.GradientClipByGlobalNorm(clip_norm=1.0))

    def exclude_from_weight_decay(param):
        # NOTE: str.rstrip strips a trailing character set, not the literal
        # ".master" suffix.
        name = param.name.rstrip(".master")
        if name.find("layer_norm") > -1:
            return True
        bias_suffix = ["_bias", "_b", ".b_0"]
        for suffix in bias_suffix:
            if name.endswith(suffix):
                return True
        return False

    param_list = dict()

    if use_fp16:
        loss_scaling = fluid.layers.create_global_var(
            name=fluid.unique_name.generate("loss_scaling"),
            shape=[1],
            value=init_loss_scaling,
            dtype='float32',
            persistable=True)
        loss *= loss_scaling
        param_grads = optimizer.backward(loss)

        master_param_grads = create_master_params_grads(
            param_grads, train_program, startup_prog, loss_scaling)

        # Snapshot pre-update parameter values so the decay term below uses
        # the weights as they were before the Adam step.
        if weight_decay > 0:
            for param, _ in master_param_grads:
                param_list[param.name] = param * 1.0
                param_list[param.name].stop_gradient = True

        if use_dynamic_loss_scaling:
            apply_dynamic_loss_scaling(
                loss_scaling, master_param_grads, incr_every_n_steps,
                decr_every_n_nan_or_inf, incr_ratio, decr_ratio)

        optimizer.apply_gradients(master_param_grads)

        # Decoupled (AdamW-style) weight decay, applied outside the optimizer.
        if weight_decay > 0:
            for param, grad in master_param_grads:
                if exclude_from_weight_decay(param):
                    continue
                with param.block.program._optimized_guard(
                    [param, grad]), fluid.framework.name_scope("weight_decay"):
                    updated_param = param - param_list[
                        param.name] * weight_decay * scheduled_lr
                    fluid.layers.assign(output=param, input=updated_param)

        master_param_to_train_param(master_param_grads, param_grads,
                                    train_program)
    else:
        if weight_decay > 0:
            for param in train_program.global_block().all_parameters():
                param_list[param.name] = param * 1.0
                param_list[param.name].stop_gradient = True

        _, param_grads = optimizer.minimize(loss)

        if weight_decay > 0:
            for param, grad in param_grads:
                if exclude_from_weight_decay(param):
                    continue
                with param.block.program._optimized_guard(
                    [param, grad]), fluid.framework.name_scope("weight_decay"):
                    updated_param = param - param_list[
                        param.name] * weight_decay * scheduled_lr
                    fluid.layers.assign(output=param, input=updated_param)

    return scheduled_lr, loss_scaling
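# All `optimization` variants in this file call `linear_warmup_decay`, which is
# not part of this excerpt. Below is a minimal sketch, assuming the usual
# schedule (linear warmup from 0 to `learning_rate` over `warmup_steps`, then
# linear decay to 0 over `num_train_steps`); the original helper may differ in
# detail.
def linear_warmup_decay(learning_rate, warmup_steps, num_train_steps):
    """Linear warmup followed by linear (polynomial power=1) decay to zero."""
    with fluid.default_main_program()._lr_schedule_guard():
        lr = fluid.layers.tensor.create_global_var(
            shape=[1],
            value=0.0,
            dtype='float32',
            persistable=True,
            name="scheduled_learning_rate")

        global_step = fluid.layers.learning_rate_scheduler._decay_step_counter()

        with fluid.layers.control_flow.Switch() as switch:
            with switch.case(global_step < warmup_steps):
                # Ramp the LR linearly from 0 to learning_rate.
                warmup_lr = learning_rate * (global_step / warmup_steps)
                fluid.layers.tensor.assign(warmup_lr, lr)
            with switch.default():
                # Afterwards decay linearly to 0 over num_train_steps.
                decayed_lr = fluid.layers.learning_rate_scheduler.polynomial_decay(
                    learning_rate=learning_rate,
                    decay_steps=num_train_steps,
                    end_learning_rate=0.0,
                    power=1.0,
                    cycle=False)
                fluid.layers.tensor.assign(decayed_lr, lr)

        return lr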
def optimization(loss,
                 warmup_steps,
                 num_train_steps,
                 learning_rate,
                 train_program,
                 startup_prog,
                 weight_decay,
                 d_model,
                 scheduler='linear_warmup_decay',
                 use_fp16=False,
                 use_dynamic_loss_scaling=False,
                 init_loss_scaling=1.0,
                 incr_every_n_steps=1000,
                 decr_every_n_nan_or_inf=2,
                 incr_ratio=2.0,
                 decr_ratio=0.8,
                 grad_norm=1.0,
                 beta1=0.9,
                 beta2=0.999,
                 epsilon=1e-8):
    """Optimization function: LR schedule (model-size-aware noam decay or
    linear warmup decay), Adam with configurable betas/epsilon, global-norm
    gradient clipping, optional fp16 loss scaling and weight decay."""
    if warmup_steps > 0:
        if scheduler == 'noam_decay':
            # scheduled_lr = fluid.layers.learning_rate_scheduler \
            #     .noam_decay(1 / (warmup_steps * (learning_rate ** 2)),
            #                 warmup_steps)
            with fluid.default_main_program()._lr_schedule_guard():
                scheduled_lr = fluid.layers.learning_rate_scheduler.noam_decay(
                    d_model, warmup_steps) * learning_rate
        elif scheduler == 'linear_warmup_decay':
            scheduled_lr = linear_warmup_decay(learning_rate, warmup_steps,
                                               num_train_steps)
        else:
            raise ValueError("Unknown learning rate scheduler, should be "
                             "'noam_decay' or 'linear_warmup_decay'")
        optimizer = fluid.optimizer.Adam(
            learning_rate=scheduled_lr,
            beta1=beta1,
            beta2=beta2,
            epsilon=epsilon)
    else:
        scheduled_lr = fluid.layers.create_global_var(
            name=fluid.unique_name.generate("learning_rate"),
            shape=[1],
            value=learning_rate,
            dtype='float32',
            persistable=True)
        optimizer = fluid.optimizer.Adam(learning_rate=scheduled_lr)
        optimizer._learning_rate_map[
            fluid.default_main_program()] = scheduled_lr

    fluid.clip.set_gradient_clip(
        clip=fluid.clip.GradientClipByGlobalNorm(clip_norm=grad_norm))

    def exclude_from_weight_decay(name):
        """Parameters excluded from weight decay (LayerNorm and biases)."""
        if name.find("layer_norm") > -1:
            return True
        bias_suffix = ["_bias", "_b", ".b_0"]
        for suffix in bias_suffix:
            if name.endswith(suffix):
                return True
        return False

    param_list = dict()

    loss_scaling = fluid.layers.create_global_var(
        name=fluid.unique_name.generate("loss_scaling"),
        shape=[1],
        value=init_loss_scaling,
        dtype='float32',
        persistable=True)

    if use_fp16:
        loss *= loss_scaling
        param_grads = optimizer.backward(loss)

        master_param_grads = create_master_params_grads(
            param_grads, train_program, startup_prog, loss_scaling)

        for param, _ in master_param_grads:
            param_list[param.name] = param * 1.0
            param_list[param.name].stop_gradient = True

        if use_dynamic_loss_scaling:
            apply_dynamic_loss_scaling(
                loss_scaling, master_param_grads, incr_every_n_steps,
                decr_every_n_nan_or_inf, incr_ratio, decr_ratio)

        optimizer.apply_gradients(master_param_grads)

        if weight_decay > 0:
            for param, grad in master_param_grads:
                if exclude_from_weight_decay(param.name.rstrip(".master")):
                    continue
                with param.block.program._optimized_guard(
                    [param, grad]), fluid.framework.name_scope("weight_decay"):
                    updated_param = param - param_list[
                        param.name] * weight_decay * scheduled_lr
                    fluid.layers.assign(output=param, input=updated_param)

        master_param_to_train_param(master_param_grads, param_grads,
                                    train_program)
    else:
        for param in train_program.global_block().all_parameters():
            param_list[param.name] = param * 1.0
            param_list[param.name].stop_gradient = True

        _, param_grads = optimizer.minimize(loss)

        if weight_decay > 0:
            for param, grad in param_grads:
                if exclude_from_weight_decay(param.name):
                    continue
                with param.block.program._optimized_guard(
                    [param, grad]), fluid.framework.name_scope("weight_decay"):
                    updated_param = param - param_list[
                        param.name] * weight_decay * scheduled_lr
                    fluid.layers.assign(output=param, input=updated_param)

    return scheduled_lr, loss_scaling
def optimization(loss,
                 warmup_steps,
                 num_train_steps,
                 learning_rate,
                 train_program,
                 startup_prog,
                 weight_decay,
                 scheduler='linear_warmup_decay',
                 use_fp16=False,
                 loss_scaling=1.0):
    """Build the optimizer: LR schedule, Adam, gradient clipping scaled for
    static fp16 loss scaling, and weight decay applied to all parameters
    (the usual LayerNorm/bias exclusion is commented out in this variant)."""
    if warmup_steps > 0:
        if scheduler == 'noam_decay':
            scheduled_lr = fluid.layers.learning_rate_scheduler\
                .noam_decay(1 / (warmup_steps * (learning_rate ** 2)),
                            warmup_steps)
        elif scheduler == 'linear_warmup_decay':
            scheduled_lr = linear_warmup_decay(learning_rate, warmup_steps,
                                               num_train_steps)
        else:
            raise ValueError("Unknown learning rate scheduler, should be "
                             "'noam_decay' or 'linear_warmup_decay'")
        optimizer = fluid.optimizer.Adam(
            learning_rate=scheduled_lr, epsilon=1e-6)
    else:
        optimizer = fluid.optimizer.Adam(
            learning_rate=learning_rate, epsilon=1e-6)
        scheduled_lr = learning_rate

    clip_norm_thres = 1.0
    # When using mixed precision training, scale the gradient clip threshold
    # by loss_scaling
    if use_fp16 and loss_scaling > 1.0:
        clip_norm_thres *= loss_scaling
    fluid.clip.set_gradient_clip(
        clip=fluid.clip.GradientClipByGlobalNorm(clip_norm=clip_norm_thres))

    def exclude_from_weight_decay(name):
        if name.find("layer_norm") > -1:
            return True
        bias_suffix = ["_bias", "_b", ".b_0"]
        for suffix in bias_suffix:
            if name.endswith(suffix):
                return True
        return False

    param_list = dict()

    if use_fp16:
        param_grads = optimizer.backward(loss)
        master_param_grads = create_master_params_grads(
            param_grads, train_program, startup_prog, loss_scaling)

        for param, _ in master_param_grads:
            param_list[param.name] = param * 1.0
            param_list[param.name].stop_gradient = True

        optimizer.apply_gradients(master_param_grads)

        if weight_decay > 0:
            for param, grad in master_param_grads:
                # if exclude_from_weight_decay(param.name.rstrip(".master")):
                #     continue
                with param.block.program._optimized_guard(
                    [param, grad]), fluid.framework.name_scope("weight_decay"):
                    updated_param = param - param_list[
                        param.name] * weight_decay * scheduled_lr
                    fluid.layers.assign(output=param, input=updated_param)

        master_param_to_train_param(master_param_grads, param_grads,
                                    train_program)
    else:
        for param in train_program.global_block().all_parameters():
            param_list[param.name] = param * 1.0
            param_list[param.name].stop_gradient = True

        _, param_grads = optimizer.minimize(loss)

        if weight_decay > 0:
            for param, grad in param_grads:
                # if exclude_from_weight_decay(param.name):
                #     continue
                with param.block.program._optimized_guard(
                    [param, grad]), fluid.framework.name_scope("weight_decay"):
                    updated_param = param - param_list[
                        param.name] * weight_decay * scheduled_lr
                    fluid.layers.assign(output=param, input=updated_param)

    return scheduled_lr
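# A usage sketch (not part of the original file) for the variant directly
# above: build the train/startup programs and let `optimization` attach the
# optimizer ops. `create_model` and the `args` fields are hypothetical
# placeholders for whatever model and config the caller provides.
def build_train_program(args):
    train_program = fluid.Program()
    startup_prog = fluid.Program()
    with fluid.program_guard(train_program, startup_prog):
        with fluid.unique_name.guard():
            # hypothetical model builder returning a scalar loss Variable
            loss = create_model(args)
            scheduled_lr = optimization(
                loss=loss,
                warmup_steps=args.warmup_steps,
                num_train_steps=args.num_train_steps,
                learning_rate=args.learning_rate,
                train_program=train_program,
                startup_prog=startup_prog,
                weight_decay=args.weight_decay,
                scheduler='linear_warmup_decay',
                use_fp16=args.use_fp16,
                loss_scaling=args.loss_scaling)
    return train_program, startup_prog, loss, scheduled_lr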
def optimization(loss,
                 warmup_steps,
                 num_train_steps,
                 learning_rate,
                 train_program,
                 startup_prog,
                 weight_decay,
                 scheduler='linear_warmup_decay',
                 use_fp16=False,
                 use_lamb=False,
                 use_dynamic_loss_scaling=False,
                 init_loss_scaling=1.0,
                 incr_every_n_steps=1000,
                 decr_every_n_nan_or_inf=2,
                 incr_ratio=2.0,
                 decr_ratio=0.8,
                 layer_decay_rate=0.0,
                 n_layers=12):
    """Build the optimizer: Adam or LAMB, LR schedule, gradient clipping,
    optional fp16 with dynamic loss scaling, layer-wise LR decay, and manual
    weight decay (skipped when LAMB handles decay internally)."""

    def exclude_from_weight_decay(param):
        name = param.name.rstrip('.master')
        if name.find("layer_norm") > -1:
            return True
        bias_suffix = ["_bias", "_b", ".b_0"]
        for suffix in bias_suffix:
            if name.endswith(suffix):
                return True
        return False

    if warmup_steps > 0:
        if scheduler == 'noam_decay':
            scheduled_lr = fluid.layers.learning_rate_scheduler\
                .noam_decay(1 / (warmup_steps * (learning_rate ** 2)),
                            warmup_steps)
        elif scheduler == 'linear_warmup_decay':
            scheduled_lr = linear_warmup_decay(learning_rate, warmup_steps,
                                               num_train_steps)
        else:
            raise ValueError("Unknown learning rate scheduler, should be "
                             "'noam_decay' or 'linear_warmup_decay'")
        if not use_lamb:
            log.debug('using Adam')
            optimizer = fluid.optimizer.Adam(learning_rate=scheduled_lr)
        else:
            log.debug('using Lamb')
            optimizer = fluid.optimizer.Lamb(
                learning_rate=scheduled_lr,
                lamb_weight_decay=weight_decay,
                exclude_from_weight_decay_fn=exclude_from_weight_decay)
    else:
        scheduled_lr = fluid.layers.create_global_var(
            name=fluid.unique_name.generate("learning_rate"),
            shape=[1],
            value=learning_rate,
            dtype='float32',
            persistable=True)
        if not use_lamb:
            log.debug('using Adam')
            optimizer = fluid.optimizer.Adam(learning_rate=scheduled_lr)
        else:
            log.debug('using Lamb')
            optimizer = fluid.optimizer.Lamb(
                learning_rate=scheduled_lr,
                lamb_weight_decay=weight_decay,
                exclude_from_weight_decay_fn=exclude_from_weight_decay)
        optimizer._learning_rate_map[fluid.default_main_program(
        )] = scheduled_lr

    fluid.clip.set_gradient_clip(
        clip=fluid.clip.GradientClipByGlobalNorm(clip_norm=1.0))

    param_list = dict()

    loss_scaling = fluid.layers.create_global_var(
        name=fluid.unique_name.generate("loss_scaling"),
        shape=[1],
        value=init_loss_scaling,
        dtype='float32',
        persistable=True)

    if use_fp16:
        from utils.fp16 import create_master_params_grads, master_param_to_train_param, apply_dynamic_loss_scaling
        loss *= loss_scaling
        param_grads = optimizer.backward(loss)

        master_param_grads = create_master_params_grads(
            param_grads, train_program, startup_prog, loss_scaling)

        for param, _ in master_param_grads:
            param_list[param.name] = param * 1.0
            param_list[param.name].stop_gradient = True

        if use_dynamic_loss_scaling:
            apply_dynamic_loss_scaling(
                loss_scaling, master_param_grads, incr_every_n_steps,
                decr_every_n_nan_or_inf, incr_ratio, decr_ratio)

        optimizer.apply_gradients(master_param_grads)

        if not use_lamb and weight_decay > 0:
            for param, grad in master_param_grads:
                if exclude_from_weight_decay(param):
                    continue
                with param.block.program._optimized_guard(
                    [param, grad]), fluid.framework.name_scope("weight_decay"):
                    updated_param = param - param_list[
                        param.name] * weight_decay * scheduled_lr
                    fluid.layers.assign(output=param, input=updated_param)

        master_param_to_train_param(master_param_grads, param_grads,
                                    train_program)
    else:
        for param in train_program.global_block().all_parameters():
            param_list[param.name] = param * 1.0
            param_list[param.name].stop_gradient = True

        _, param_grads = optimizer.minimize(loss)

        # Layer-wise learning-rate decay: rescale each parameter's update
        # according to its depth in the network.
        if layer_decay_rate > 0:
            for param, grad in param_grads:
                with param.block.program._optimized_guard(
                    [param, grad]), fluid.framework.name_scope("layer_decay"):
                    param_decay = layer_decay(param, param_list[param.name],
                                              scheduled_lr, layer_decay_rate,
                                              n_layers)
                    if param_decay:
                        fluid.layers.assign(output=param, input=param_decay)

        if not use_lamb and weight_decay > 0:
            for param, grad in param_grads:
                if exclude_from_weight_decay(param):
                    continue
                with param.block.program._optimized_guard(
                    [param, grad]), fluid.framework.name_scope("weight_decay"):
                    updated_param = param - param_list[
                        param.name] * weight_decay * scheduled_lr
                    fluid.layers.assign(output=param, input=updated_param)

    return scheduled_lr, loss_scaling
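# The variant above calls a `layer_decay` helper that is not included in this
# excerpt. Below is a rough sketch, assuming ERNIE-style parameter names
# ("encoder_layer_<k>_..." for transformer blocks, names containing "embedding"
# for the embedding tables); the exact name parsing in the original helper may
# differ. `scheduled_lr` is accepted to match the call site but unused here.
def layer_decay(param, param_last, scheduled_lr, decay_rate, n_layers):
    """Shrink the optimizer update of `param` by decay_rate ** depth_from_top.

    Returns the corrected parameter value, or None for parameters the
    layer-wise decay does not touch (the caller then skips the assign).
    """
    delta = param - param_last  # the update the optimizer just applied
    if "encoder_layer" in param.name:
        layer = int(param.name.split("encoder_layer_")[1].split("_")[0])
        ratio = decay_rate ** (n_layers - layer)
        return param_last + ratio * delta
    elif "embedding" in param.name:
        # Embeddings sit below the first encoder layer, so decay them hardest.
        ratio = decay_rate ** (n_layers + 1)
        return param_last + ratio * delta
    return None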