def _create_variables(accum_count):
  global_step = tf.Variable(0, trainable=False, dtype=tf.int64)
  optimizer = tf.train.AdamOptimizer(1.0)
  gradient = tf.placeholder(tf.float32, shape=[2])
  variable = tf.Variable([1.0, 2.0])
  optim.delayed_update(
      optimizer, [(gradient, variable)], global_step, accum_count=accum_count)
  return list(sorted(var.name for var in tf.global_variables()))
def testDelayedUpdate(self):
  global_step = tf.Variable(0, trainable=False, dtype=tf.int64)
  optimizer = tf.train.GradientDescentOptimizer(1.0)
  gradient = tf.placeholder(tf.float32, shape=[2])
  variable = tf.Variable([1.0, 2.0])
  train_op, extra_variables = optim.delayed_update(
      optimizer, [(gradient, variable)], global_step, accum_count=3)
  with self.test_session() as sess:
    sess.run(tf.global_variables_initializer())
    sess.run(tf.variables_initializer(extra_variables))

    def _check_step(grad, expected_variable, expected_step):
      _, variable_value, step_value = sess.run(
          [train_op, variable, global_step], feed_dict={gradient: grad})
      self.assertAllEqual(variable_value, expected_variable)
      self.assertAllEqual(step_value, expected_step)

    _check_step([3.0, 3.0], [1.0, 2.0], 0)     # accum_grad = [3.0, 3.0]
    _check_step([4.0, 1.0], [1.0, 2.0], 0)     # accum_grad = [7.0, 4.0]
    _check_step([-1.0, 0.0], [-5.0, -2.0], 1)  # accum_grad = [6.0, 4.0], apply
    _check_step([-3.0, 1.0], [-5.0, -2.0], 1)  # accum_grad = [-3.0, 1.0]
    _check_step([0.0, -3.0], [-5.0, -2.0], 1)  # accum_grad = [-3.0, -2.0]
    _check_step([2.0, -1.0], [-4.0, 1.0], 2)   # accum_grad = [-1.0, -3.0], apply
def testDelayedUpdateSparseGradients(self):
  # Test that delayed update does not crash on sparse gradients.
  global_step = tf.Variable(0, trainable=False, dtype=tf.int64)
  optimizer = tf.train.GradientDescentOptimizer(1.0)
  embeddings = tf.Variable([[1.0, 2.0], [3.0, 4.0]])
  x = tf.nn.embedding_lookup(embeddings, [0])
  loss = tf.losses.mean_squared_error([[1.1, 2.1]], x)
  gradients = optimizer.compute_gradients(loss)
  _ = optim.delayed_update(optimizer, gradients, global_step, accum_count=3)
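# The tests above pin down the contract of optim.delayed_update: incoming
# gradients are summed into accumulator variables, and only every
# `accum_count` calls are the accumulated gradients applied, the global step
# incremented, and the accumulators reset. The helper below is NOT the
# library implementation, only a minimal sketch of that accumulate-then-apply
# pattern in TF 1.x graph mode for a stateless optimizer such as
# GradientDescentOptimizer (a stateful optimizer like Adam would additionally
# need its slot variables created outside the tf.cond branch).
def _delayed_update_sketch(optimizer, grads_and_vars, global_step, accum_count):
  accum_counter = tf.Variable(0, trainable=False, dtype=tf.int64)
  accum_grads = [
      tf.Variable(tf.zeros_like(var), trainable=False)
      for _, var in grads_and_vars]
  extra_variables = [accum_counter] + accum_grads

  # Add the incoming gradients to the accumulators, then bump the counter.
  # The apply condition below reads the updated counter, which serializes the
  # accumulation before the conditional apply.
  accum_ops = [accum.assign_add(grad)
               for accum, (grad, _) in zip(accum_grads, grads_and_vars)]
  with tf.control_dependencies(accum_ops):
    count = accum_counter.assign_add(1)

  def _apply_and_reset():
    # Apply the summed gradients, advance the global step, then zero the
    # accumulators for the next cycle.
    apply_op = optimizer.apply_gradients(
        [(accum, var) for accum, (_, var) in zip(accum_grads, grads_and_vars)],
        global_step=global_step)
    with tf.control_dependencies([apply_op]):
      return tf.group(
          *[accum.assign(tf.zeros_like(accum)) for accum in accum_grads])

  train_op = tf.cond(
      tf.equal(count % accum_count, 0), _apply_and_reset, tf.no_op)
  return train_op, extra_variables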
def train(model_dir,
          example_inputter,
          source_file,
          target_file,
          maximum_length=100,
          shuffle_buffer_size=1000000,
          gradients_accum=8,
          train_steps=100000,
          save_every=1000,
          report_every=50):
  """Runs the training loop.

  Args:
    model_dir: Directory where checkpoints are saved.
    example_inputter: The inputter instance that produces the training examples.
    source_file: The source training file.
    target_file: The target training file.
    maximum_length: Filter sequences longer than this.
    shuffle_buffer_size: How many examples to load for shuffling.
    gradients_accum: Accumulate gradients of this many iterations.
    train_steps: Train for this many iterations.
    save_every: Save a checkpoint every this many iterations.
    report_every: Report training progress every this many iterations.
  """
  mode = tf.estimator.ModeKeys.TRAIN

  # Create the dataset.
  dataset = example_inputter.make_training_dataset(
      source_file,
      target_file,
      batch_size=3072,
      batch_type="tokens",
      shuffle_buffer_size=shuffle_buffer_size,
      bucket_width=1,  # Bucketize sequences by the same length for efficiency.
      maximum_features_length=maximum_length,
      maximum_labels_length=maximum_length)
  iterator = dataset.make_initializable_iterator()
  source, target = iterator.get_next()

  # Encode the source.
  with tf.variable_scope("encoder"):
    source_embedding = source_inputter.make_inputs(source, training=True)
    memory, _, _ = encoder.encode(source_embedding, source["length"], mode=mode)

  # Decode the target.
  with tf.variable_scope("decoder"):
    target_embedding = target_inputter.make_inputs(target, training=True)
    logits, _, _ = decoder.decode(
        target_embedding,
        target["length"],
        vocab_size=target_inputter.vocabulary_size,
        mode=mode,
        memory=memory,
        memory_sequence_length=source["length"])

  # Compute the loss.
  loss, normalizer, _ = losses.cross_entropy_sequence_loss(
      logits,
      target["ids_out"],
      target["length"],
      label_smoothing=0.1,
      average_in_time=True,
      mode=mode)
  loss /= normalizer

  # Define the learning rate schedule.
  step = tf.train.create_global_step()
  learning_rate = decay.noam_decay_v2(2.0, step, model_dim=512, warmup_steps=4000)

  # Define the optimization op.
  optimizer = tf.train.AdamOptimizer(learning_rate)
  gradients = optimizer.compute_gradients(loss)
  train_op, optim_variables = optim.delayed_update(
      optimizer,
      gradients,
      step,
      accum_count=gradients_accum)

  # Runs the training loop.
  saver = tf.train.Saver()
  checkpoint_path = None
  if os.path.exists(model_dir):
    checkpoint_path = tf.train.latest_checkpoint(model_dir)
  with tf.Session() as sess:
    if checkpoint_path is not None:
      print("Restoring parameters from %s" % checkpoint_path)
      saver.restore(sess, checkpoint_path)
    else:
      sess.run(tf.global_variables_initializer())
    sess.run(tf.variables_initializer(optim_variables))
    sess.run(tf.tables_initializer())
    sess.run(iterator.initializer)

    last_step = -1
    while True:
      step_, lr_, loss_, _ = sess.run([step, learning_rate, loss, train_op])
      if step_ != last_step:
        if step_ % report_every == 0:
          print("Step = %d ; Learning rate = %f ; Loss = %f" % (step_, lr_, loss_))
        if step_ % save_every == 0:
          print("Saving checkpoint for step %d" % step_)
          saver.save(sess, "%s/model" % model_dir, global_step=step_)
        if step_ == train_steps:
          break
      last_step = step_
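# A hedged usage sketch: the run directory and training file names below are
# placeholders, and `example_inputter` (together with the module-level
# `source_inputter`, `target_inputter`, `encoder`, and `decoder` referenced
# inside train()) is assumed to have been built earlier in the script. Only
# the call signature of train() itself comes from the function above.
if __name__ == "__main__":
  train(
      model_dir="checkpoints",      # hypothetical run directory
      example_inputter=example_inputter,
      source_file="data/train.en",  # hypothetical source training file
      target_file="data/train.de",  # hypothetical target training file
      maximum_length=100,
      gradients_accum=8,
      train_steps=100000,
      save_every=1000,
      report_every=50)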