Example #1
    def test_adam(self):
        with self.test_session() as sess:
            w = tf.compat.v1.get_variable(
                "w",
                shape=[3],
                initializer=tf.compat.v1.constant_initializer(
                    [0.1, -0.2, -0.1]))
            x = tf.compat.v1.constant([0.4, 0.2, -0.5])
            loss = tf.compat.v1.reduce_mean(tf.compat.v1.square(x - w))
            tvars = tf.compat.v1.trainable_variables()
            grads = tf.compat.v1.gradients(loss, tvars)
            global_step = tf.compat.v1.train.get_or_create_global_step()
            optimizer = optimization.AdamWeightDecayOptimizer(
                learning_rate=0.2)
            train_op = optimizer.apply_gradients(zip(grads, tvars),
                                                 global_step)
            init_op = tf.compat.v1.group(
                tf.compat.v1.global_variables_initializer(),
                tf.compat.v1.local_variables_initializer())
            sess.run(init_op)
            # 100 optimizer steps on the mean-squared-error loss should
            # drive w toward x.
            for _ in range(100):
                sess.run(train_op)
            w_np = sess.run(w)
            self.assertAllClose(w_np.flat, [0.4, 0.2, -0.5],
                                rtol=1e-2,
                                atol=1e-2)
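
For reference, the update this test exercises can be written down in a few lines of NumPy: an Adam-style step (without the bias correction of standard Adam) whose weight-decay term is added directly to the update rather than folded into the gradient. This is a sketch under the defaults shown above (weight_decay_rate=0.0, beta_1=0.9, beta_2=0.999, epsilon=1e-6), not a line-for-line copy of optimization.py; the helper name adam_weight_decay_step is illustrative.

import numpy as np

def adam_weight_decay_step(w, grad, m, v, learning_rate=0.2,
                           weight_decay_rate=0.0, beta_1=0.9,
                           beta_2=0.999, epsilon=1e-6):
    # Moving averages of the gradient and its square (no bias correction).
    m = beta_1 * m + (1.0 - beta_1) * grad
    v = beta_2 * v + (1.0 - beta_2) * grad ** 2
    update = m / (np.sqrt(v) + epsilon)
    # Decoupled weight decay: added to the update, not to the gradient.
    update += weight_decay_rate * w
    return w - learning_rate * update, m, v

w = np.array([0.1, -0.2, -0.1])
x = np.array([0.4, 0.2, -0.5])
m = np.zeros_like(w)
v = np.zeros_like(w)
for _ in range(100):
    grad = 2.0 * (w - x) / w.size       # gradient of mean((x - w) ** 2)
    w, m, v = adam_weight_decay_step(w, grad, m, v)
print(w)  # should end close to [0.4, 0.2, -0.5], as the assertion above expects
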
Example #2
    def add_train_op(self, lr_method, lr, loss, clip=-1):
        """Defines self.train_op that performs an update on a batch
        Args:
            lr_method: (string) sgd method, for example "adam"
            lr: (tf.placeholder) tf.float32, learning rate
            loss: (tensor) tf.float32 loss to minimize
            clip: (python float) clipping of gradient. If < 0, no clipping
        """
        _lr_m = lr_method.lower()  # normalize the method name to lowercase

        with tf.variable_scope("train_step"):
            if _lr_m == 'adam':
                optimizer = tf.train.AdamOptimizer(lr)
            elif _lr_m == 'adagrad':
                optimizer = tf.train.AdagradOptimizer(lr)
            elif _lr_m == 'sgd':
                optimizer = tf.train.GradientDescentOptimizer(lr)
            elif _lr_m == 'rmsprop':
                optimizer = tf.train.RMSPropOptimizer(lr)
            elif _lr_m == 'adam_w':
                optimizer = optimization.AdamWeightDecayOptimizer(lr)
            else:
                raise NotImplementedError("Unknown method {}".format(_lr_m))
            # self.trainable_variables = tf.trainable_variables()
            # Compute the gradients once and keep handles for later inspection.
            self.grads, self.vs = zip(*optimizer.compute_gradients(loss))
            if clip > 0:  # gradient clipping if clip is positive
                grads, gnorm = tf.clip_by_global_norm(self.grads, clip)
                self.train_op = optimizer.apply_gradients(zip(grads, self.vs))
            else:
                self.train_op = optimizer.apply_gradients(
                    zip(self.grads, self.vs))
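
The clip branch above relies on tf.clip_by_global_norm, which rescales the entire gradient list by clip_norm / max(global_norm, clip_norm) so the joint norm never exceeds the threshold. A small NumPy sketch of that rescaling (the helper name clip_by_global_norm_np is illustrative; it mirrors the TF op, which also returns the pre-clipping global norm):

import numpy as np

def clip_by_global_norm_np(grads, clip_norm):
    # Global norm over every element of every gradient tensor.
    global_norm = np.sqrt(sum(np.sum(g ** 2) for g in grads))
    # Scale factor is 1.0 when already within the budget, < 1.0 otherwise.
    scale = clip_norm / max(global_norm, clip_norm)
    return [g * scale for g in grads], global_norm

grads = [np.array([3.0, 4.0]), np.array([12.0])]   # global norm = 13
clipped, gnorm = clip_by_global_norm_np(grads, clip_norm=5.0)
print(gnorm)                             # 13.0
print([g.tolist() for g in clipped])     # each gradient scaled by 5/13
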
Example #3
def model(use_custom_op, inputs, targets):
    weights = tf.get_variable("weights",
                              shape=[3, 1],
                              initializer=tf.zeros_initializer(),
                              dtype=tf.float16)
    # Forward function:
    preds = tf.matmul(inputs, weights)

    # 0.5 * (tanh(z) + 1) equals sigmoid(2 * z), i.e. a logistic with doubled slope.
    sigmoid = 0.5 * (tf.math.tanh(preds) + 1)
    # Bernoulli likelihood: picks `sigmoid` where target == 1, else (1 - sigmoid).
    probs = sigmoid * targets + (1 - sigmoid) * (1 - targets)
    training_loss = tf.math.reduce_sum(-tf.math.log(probs))

    gradOfLossWrtInput = tf.gradients(training_loss, [inputs])[0]

    # Optimiser:
    if use_custom_op == 'sgd':
        opt = tf.train.GradientDescentOptimizer(learning_rate=0.05)
    elif use_custom_op == 'momentum':
        opt = tf.train.MomentumOptimizer(learning_rate=0.05,
                                         momentum=0.9,
                                         use_nesterov=False)
    elif use_custom_op == 'lamb':
        opt = _opt.LAMBOptimizer(0.05, high_precision=False)
    elif use_custom_op == 'adamw':
        opt = _opt.AdamWeightDecayOptimizer(0.05)
    else:
        raise ValueError("Unknown optimiser: {}".format(use_custom_op))
    train_op = opt.minimize(training_loss)

    return training_loss, weights, gradOfLossWrtInput, train_op
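
Two details of model() are worth spelling out: 0.5 * (tanh(z) + 1) is exactly sigmoid(2z), so the forward pass is a logistic with doubled slope, and probs = p*t + (1-p)*(1-t) turns -sum(log(probs)) into the usual binary cross-entropy. A quick standalone check in NumPy:

import numpy as np

z = np.linspace(-4.0, 4.0, 9)
tanh_form = 0.5 * (np.tanh(z) + 1.0)
sigmoid_2z = 1.0 / (1.0 + np.exp(-2.0 * z))
print(np.allclose(tanh_form, sigmoid_2z))   # True: 0.5*(tanh(z)+1) == sigmoid(2z)

# probs picks p where the target is 1 and (1 - p) where it is 0,
# so -sum(log(probs)) is the ordinary binary cross-entropy.
p = np.array([0.9, 0.2, 0.7])
t = np.array([1.0, 0.0, 1.0])
probs = p * t + (1.0 - p) * (1.0 - t)
print(-np.sum(np.log(probs)))   # == -(log 0.9 + log 0.8 + log 0.7)
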
Example #4
# Linear warm-up: ramp the learning rate from 0 to `lr` over the first
# `num_warmup_steps` steps. `lr` and `learning_rate` (the post-warm-up rate)
# are defined before this fragment.
global_steps_int = tf.cast(global_step, tf.int32)
warmup_steps_int = tf.constant(num_warmup_steps, dtype=tf.int32)

global_steps_float = tf.cast(global_steps_int, tf.float32)
warmup_steps_float = tf.cast(warmup_steps_int, tf.float32)

warmup_percent_done = global_steps_float / warmup_steps_float
warmup_learning_rate = lr * warmup_percent_done

# is_warmup is a 0/1 mask that switches between the two schedules.
is_warmup = tf.cast(global_steps_int < warmup_steps_int, tf.float32)
learning_rate = (
    (1.0 - is_warmup) * learning_rate + is_warmup * warmup_learning_rate)
optimizer = optimization.AdamWeightDecayOptimizer(
    learning_rate=learning_rate,
    weight_decay_rate=0.01,
    beta_1=0.9,
    beta_2=0.999,
    epsilon=1e-6,
    exclude_from_weight_decay=["LayerNorm", "layer_norm", "bias"])
tvars = tf.trainable_variables()
grads = tf.gradients(loss, tvars)
# This is how the model was pre-trained.
(grads, _) = tf.clip_by_global_norm(grads, clip_norm=1.0)
train_op = optimizer.apply_gradients(  # inside this call you can control whether the BERT parameters get trained
    zip(grads, tvars), global_step=global_step)
new_global_step = global_step + 1
train_op = tf.group(train_op, [global_step.assign(new_global_step)])

# train_op = tf.train.AdamOptimizer(lr).minimize(loss)  # this is the conventional optimizer
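
Stripped of the tensor plumbing, the warm-up logic above is a linear ramp from 0 to lr over num_warmup_steps steps, after which the separately computed learning_rate takes over; the tf.cast(... < ..., tf.float32) mask is simply an if-statement written as arithmetic so it stays inside the graph. A plain-Python sketch of the resulting schedule (warmup_schedule and its argument names are illustrative; in BERT's optimization.py the post-warm-up rate additionally follows a polynomial decay):

def warmup_schedule(step, init_lr, num_warmup_steps, post_warmup_lr):
    # Linear warm-up, then hand over to the already-computed learning rate.
    if step < num_warmup_steps:
        return init_lr * step / num_warmup_steps
    return post_warmup_lr

for step in (0, 250, 500, 1000, 5000):
    print(step, warmup_schedule(step, init_lr=5e-5,
                                num_warmup_steps=1000, post_warmup_lr=5e-5))
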