def test_adam(self):
    # Minimizing mean squared error between w and a fixed target x should
    # drive w to x; check that after 100 optimizer steps.
    with self.test_session() as sess:
        w = tf.compat.v1.get_variable(
            "w",
            shape=[3],
            initializer=tf.compat.v1.constant_initializer([0.1, -0.2, -0.1]))
        x = tf.compat.v1.constant([0.4, 0.2, -0.5])
        loss = tf.compat.v1.reduce_mean(tf.compat.v1.square(x - w))
        tvars = tf.compat.v1.trainable_variables()
        grads = tf.compat.v1.gradients(loss, tvars)
        global_step = tf.compat.v1.train.get_or_create_global_step()
        optimizer = optimization.AdamWeightDecayOptimizer(learning_rate=0.2)
        train_op = optimizer.apply_gradients(zip(grads, tvars), global_step)
        init_op = tf.compat.v1.group(
            tf.compat.v1.global_variables_initializer(),
            tf.compat.v1.local_variables_initializer())
        sess.run(init_op)
        for _ in range(100):
            sess.run(train_op)
        w_np = sess.run(w)
        self.assertAllClose(w_np.flat, [0.4, 0.2, -0.5], rtol=1e-2, atol=1e-2)
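# For reference, a minimal NumPy sketch of the per-variable update that
# AdamWeightDecayOptimizer performs (a simplification under assumptions: it
# ignores exclude_from_weight_decay, and BERT's variant applies no Adam bias
# correction):
import numpy as np

def adamw_step(param, grad, m, v, lr=0.2, beta_1=0.9, beta_2=0.999,
               epsilon=1e-6, weight_decay_rate=0.0):
    m = beta_1 * m + (1.0 - beta_1) * grad       # first-moment estimate
    v = beta_2 * v + (1.0 - beta_2) * grad ** 2  # second-moment estimate
    update = m / (np.sqrt(v) + epsilon)          # Adam step direction
    update += weight_decay_rate * param          # decoupled weight decay
    return param - lr * update, m, v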
def add_train_op(self, lr_method, lr, loss, clip=-1):
    """Defines self.train_op that performs an update on a batch

    Args:
        lr_method: (string) sgd method, for example "adam"
        lr: (tf.placeholder) tf.float32, learning rate
        loss: (tensor) tf.float32 loss to minimize
        clip: (python float) clipping of gradient. If < 0, no clipping
    """
    _lr_m = lr_method.lower()  # lower-case so matching is case-insensitive

    with tf.variable_scope("train_step"):
        if _lr_m == 'adam':
            optimizer = tf.train.AdamOptimizer(lr)
        elif _lr_m == 'adagrad':
            optimizer = tf.train.AdagradOptimizer(lr)
        elif _lr_m == 'sgd':
            optimizer = tf.train.GradientDescentOptimizer(lr)
        elif _lr_m == 'rmsprop':
            optimizer = tf.train.RMSPropOptimizer(lr)
        elif _lr_m == 'adam_w':
            optimizer = optimization.AdamWeightDecayOptimizer(lr)
        else:
            raise NotImplementedError("Unknown method {}".format(_lr_m))

        # Compute gradients once and keep them for later inspection.
        self.grads, self.vs = zip(*optimizer.compute_gradients(loss))
        if clip > 0:  # apply global-norm gradient clipping if clip is positive
            grads, gnorm = tf.clip_by_global_norm(self.grads, clip)
            self.train_op = optimizer.apply_gradients(zip(grads, self.vs))
        else:
            self.train_op = optimizer.minimize(loss)
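# A hypothetical call site for add_train_op (self.loss, self.lr and the feed
# values below are illustrative assumptions, not part of the class above):
#
#   self.lr = tf.placeholder(tf.float32, shape=[], name="lr")
#   self.add_train_op("adam_w", self.lr, self.loss, clip=5.0)
#   ...
#   _, batch_loss = sess.run([self.train_op, self.loss],
#                            feed_dict={self.lr: 1e-3, **batch_feeds})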
def model(use_custom_op, inputs, targets):
    weights = tf.get_variable("weights", shape=[3, 1],
                              initializer=tf.zeros_initializer(),
                              dtype=tf.float16)
    # Forward function:
    preds = tf.matmul(inputs, weights)
    # Tanh-based squashing into (0, 1); equivalent to sigmoid(2 * preds).
    sigmoid = 0.5 * (tf.math.tanh(preds) + 1)
    probs = sigmoid * targets + (1 - sigmoid) * (1 - targets)
    training_loss = tf.math.reduce_sum(-tf.math.log(probs))
    gradOfLossWrtInput = tf.gradients(training_loss, [inputs])[0]

    # Optimiser:
    if use_custom_op == 'sgd':
        opt = tf.train.GradientDescentOptimizer(learning_rate=0.05)
    elif use_custom_op == 'momentum':
        opt = tf.train.MomentumOptimizer(learning_rate=0.05, momentum=0.9,
                                         use_nesterov=False)
    elif use_custom_op == 'lamb':
        opt = _opt.LAMBOptimizer(0.05, high_precision=False)
    elif use_custom_op == 'adamw':
        opt = _opt.AdamWeightDecayOptimizer(0.05)
    else:
        # Guard against a silent NameError on `opt` for unknown values.
        raise ValueError("Unknown optimiser {}".format(use_custom_op))
    train_op = opt.minimize(training_loss)
    return training_loss, weights, gradOfLossWrtInput, train_op
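# A sketch of driving the graph above in a v1 session (placeholder shapes and
# the batch tensors x_batch / y_batch are illustrative assumptions):
#
#   inputs = tf.placeholder(tf.float16, shape=[None, 3])
#   targets = tf.placeholder(tf.float16, shape=[None, 1])
#   loss, weights, dloss_dinput, train_op = model('adamw', inputs, targets)
#   with tf.Session() as sess:
#       sess.run(tf.global_variables_initializer())
#       _, l = sess.run([train_op, loss],
#                       feed_dict={inputs: x_batch, targets: y_batch})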
global_steps_int = tf.cast(global_step, tf.int32)
warmup_steps_int = tf.constant(num_warmup_steps, dtype=tf.int32)

global_steps_float = tf.cast(global_steps_int, tf.float32)
warmup_steps_float = tf.cast(warmup_steps_int, tf.float32)

# Linear warmup: scale the learning rate from 0 up to `lr` over the first
# `num_warmup_steps` steps; afterwards use `learning_rate`, which is assumed
# to be defined earlier (e.g. via BERT's polynomial decay of the base rate).
warmup_percent_done = global_steps_float / warmup_steps_float
warmup_learning_rate = lr * warmup_percent_done

is_warmup = tf.cast(global_steps_int < warmup_steps_int, tf.float32)
learning_rate = ((1.0 - is_warmup) * learning_rate
                 + is_warmup * warmup_learning_rate)

optimizer = optimization.AdamWeightDecayOptimizer(
    learning_rate=learning_rate,
    weight_decay_rate=0.01,
    beta_1=0.9,
    beta_2=0.999,
    epsilon=1e-6,
    exclude_from_weight_decay=["LayerNorm", "layer_norm", "bias"])

tvars = tf.trainable_variables()
grads = tf.gradients(loss, tvars)

# This is how the model was pre-trained.
(grads, _) = tf.clip_by_global_norm(grads, clip_norm=1.0)

train_op = optimizer.apply_gradients(
    # Inside this call you can adjust whether the BERT parameters get trained.
    zip(grads, tvars), global_step=global_step)

# AdamWeightDecayOptimizer does not increment global_step itself, so advance
# it explicitly and group the two ops.
new_global_step = global_step + 1
train_op = tf.group(train_op, [global_step.assign(new_global_step)])
# train_op = tf.train.AdamOptimizer(lr).minimize(loss)  # the conventional optimizer
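# The warmup branch restated as a plain-Python schedule for clarity (a sketch;
# decayed_lr stands in for whatever defines `learning_rate` before the warmup
# blend above, e.g. a polynomial decay of the base rate):

def lr_at_step(step, base_lr, num_warmup_steps, decayed_lr):
    """Linear warmup to base_lr, then hand off to the decayed schedule."""
    if step < num_warmup_steps:
        return base_lr * float(step) / num_warmup_steps
    return decayed_lr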