def main(_):
    data_path = os.path.join(FLAGS.datadir, "training-monolingual.tokenized.shuffled/*")
    # Effective global batch size across all GPUs in the AutoDist resource spec.
    distribute_batch_size = FLAGS.batch_size * autodist._resource_spec.num_gpus

    with tf.Graph().as_default(), autodist.scope():
        train_dataset = gen_lm1b_train_dataset(data_path, FLAGS.num_steps)
        train_dataset = train_dataset.batch(FLAGS.batch_size)
        train_iterator = train_dataset.make_one_shot_iterator().get_next()

        model = language_model.LM(FLAGS.num_steps)
        # TODO (Hao): need to improve this.
        train_step = autodist.function(model.train_step)

        prev_time = time.time()
        for local_step in range(FLAGS.max_steps):
            loss, _ = train_step(train_iterator)
            if local_step % FLAGS.log_frequency == 0:
                cur_time = time.time()
                elapsed_time = cur_time - prev_time
                # Words-per-second throughput since the last log line.
                num_words = distribute_batch_size * FLAGS.log_frequency
                wps = float(num_words) / elapsed_time
                print("Iteration %d, time = %.2fs, wps = %.0f, train loss = %.4f"
                      % (local_step, elapsed_time, wps, loss))
                prev_time = cur_time
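# For context, a minimal sketch (an assumption, not shown in this excerpt) of how the
# module-level `autodist` handle used in main() above is typically created, following
# the AutoDist getting-started pattern; the resource spec path is a placeholder.
from autodist import AutoDist

autodist = AutoDist(resource_spec_file='resource_spec.yml')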
def build_model():
    model = language_model.LM(FLAGS.num_steps)
    global_step = tf.train.get_or_create_global_step()

    with tf.device('/gpu:0'):
        placeholder_x = tf.placeholder(tf.int32, [FLAGS.batch_size, FLAGS.num_steps])
        placeholder_y = tf.placeholder(tf.int32, [FLAGS.batch_size, FLAGS.num_steps])
        placeholder_w = tf.placeholder(tf.int32, [FLAGS.batch_size, FLAGS.num_steps])
        initial_state_c = tf.placeholder(
            dtype=tf.float32, shape=[FLAGS.batch_size, model.state_size], name='initial_c')
        initial_state_h = tf.placeholder(
            dtype=tf.float32, shape=[FLAGS.batch_size, model.projected_size], name='initial_h')

        loss, final_state_c, final_state_h = model(
            placeholder_x, placeholder_y, placeholder_w,
            initial_state_c, initial_state_h, training=True)
        scaled_loss = loss * FLAGS.num_steps

        # Group the variables so each group can receive its own gradient treatment.
        emb_vars = list(model.emb)
        lstm_vars = [model.W, model.B, model.W_P]
        softmax_vars = list(model.softmax_w) + [model.softmax_b]
        all_vars = emb_vars + lstm_vars + softmax_vars
        grads = tf.gradients(scaled_loss, all_vars)

        # Embedding gradients are sparse IndexedSlices; rescale them by the batch size.
        emb_grads = grads[:len(emb_vars)]
        emb_grads = [tf.IndexedSlices(grad.values * FLAGS.batch_size,
                                      grad.indices,
                                      grad.dense_shape) for grad in emb_grads]
        # Clip only the LSTM gradients by global norm.
        lstm_grads = grads[len(emb_vars):len(emb_vars) + len(lstm_vars)]
        lstm_grads, _ = tf.clip_by_global_norm(lstm_grads, FLAGS.max_grad_norm)
        softmax_grads = grads[len(emb_vars) + len(lstm_vars):]

        clipped_grads = emb_grads + lstm_grads + softmax_grads
        grads_and_vars = list(zip(clipped_grads, all_vars))

        optimizer = tf.train.AdagradOptimizer(FLAGS.learning_rate,
                                              initial_accumulator_value=1.0)
        train_op = optimizer.apply_gradients(grads_and_vars, global_step=global_step)

        # Maintain an exponential moving average of the LSTM weights after each update.
        ema = tf.train.ExponentialMovingAverage(decay=0.999)
        with tf.control_dependencies([train_op]):
            train_op = ema.apply(lstm_vars)

    # Attach the graph endpoints to the model so callers can run and feed them.
    model.global_step = global_step
    model.loss = loss
    model.train_op = train_op
    model.final_state_c = final_state_c
    model.final_state_h = final_state_h
    model.initial_state_c = initial_state_c
    model.initial_state_h = initial_state_h
    model.x = placeholder_x
    model.y = placeholder_y
    model.w = placeholder_w
    return model
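# A minimal usage sketch for the graph returned by build_model(), assuming a TF1
# session and a hypothetical next_batch() helper that yields (x, y, w) numpy arrays;
# the LSTM state starts at zero and the final state of each step is fed back as the
# initial state of the next.
import numpy as np

def run_training(model, num_iterations):
    state_c = np.zeros([FLAGS.batch_size, model.state_size], dtype=np.float32)
    state_h = np.zeros([FLAGS.batch_size, model.projected_size], dtype=np.float32)
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        for _ in range(num_iterations):
            x, y, w = next_batch()  # hypothetical data helper
            loss, state_c, state_h, _ = sess.run(
                [model.loss, model.final_state_c, model.final_state_h, model.train_op],
                feed_dict={model.x: x, model.y: y, model.w: w,
                           model.initial_state_c: state_c,
                           model.initial_state_h: state_h})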
def main(argv):
    data_path = os.path.join(FLAGS.datadir, "training-monolingual.tokenized.shuffled/*")
    train_dataset = gen_lm1b_train_dataset(data_path, FLAGS.num_steps)
    train_dataset = train_dataset.batch(FLAGS.batch_size)

    model = language_model.LM(FLAGS.num_steps)
    prev_time = time.time()
    for local_step, input_data in enumerate(train_dataset.take(10)):
        loss, _ = model.train_step(input_data)
        if local_step % FLAGS.log_frequency == 0:
            cur_time = time.time()
            elapsed_time = cur_time - prev_time
            num_words = FLAGS.batch_size * FLAGS.log_frequency
            wps = float(num_words) / elapsed_time
            logging.info("Iteration %d, time = %.2fs, wps = %.0f, train loss = %.4f"
                         % (local_step, elapsed_time, wps, loss))
            prev_time = cur_time
def main():
    with tf.Graph().as_default() as g:
        with tf.device('/gpu:0'):
            model = lm1b_model_graph.LM(FLAGS.num_steps)
            placeholder_x = tf.placeholder(tf.int32, [FLAGS.batch_size, FLAGS.num_steps])
            placeholder_y = tf.placeholder(tf.int32, [FLAGS.batch_size, FLAGS.num_steps])
            placeholder_w = tf.placeholder(tf.int32, [FLAGS.batch_size, FLAGS.num_steps])
            initial_state_c = tf.placeholder(
                dtype=tf.float32, shape=[FLAGS.batch_size, model.state_size], name='initial_c')
            initial_state_h = tf.placeholder(
                dtype=tf.float32, shape=[FLAGS.batch_size, model.projected_size], name='initial_h')
            loss, final_state_c, final_state_h = model(
                placeholder_x, placeholder_y, placeholder_w,
                initial_state_c, initial_state_h, training=False)

            ema = tf.train.ExponentialMovingAverage(decay=0.999)
            lstm_vars = tf.trainable_variables()[-3:]
            avg_dict = ema.variables_to_restore(lstm_vars)
            new_dict = {}
            for key, value in avg_dict.items():
                new_dict[new_names[key]] = value
            saver = tf.train.Saver(new_dict)

        ckpt = tf.train.get_checkpoint_state(FLAGS.model_dir)
        with tf.Session(config=config) as sess:
            for i in range(len(ckpt.all_model_checkpoint_paths)):
                if i % FLAGS.evaluate_every_nth_ckpt != 0:
                    continue
                evaluate(sess, loss, final_state_c, final_state_h,
                         placeholder_x, placeholder_y, placeholder_w,
                         initial_state_c, initial_state_h, saver, ckpt, i)
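# A hypothetical sketch of an evaluate() helper with the signature used above (the
# actual implementation is not shown in this excerpt): restore the i-th checkpoint,
# run the eval graph over a hypothetical eval_batches() iterator while carrying the
# LSTM state forward, and report perplexity as exp(mean per-step loss).
import numpy as np

def evaluate(sess, loss, final_state_c, final_state_h,
             placeholder_x, placeholder_y, placeholder_w,
             initial_state_c, initial_state_h, saver, ckpt, i):
    saver.restore(sess, ckpt.all_model_checkpoint_paths[i])
    state_c = np.zeros(initial_state_c.shape.as_list(), dtype=np.float32)
    state_h = np.zeros(initial_state_h.shape.as_list(), dtype=np.float32)
    total_loss, num_batches = 0.0, 0
    for x, y, w in eval_batches():  # hypothetical held-out data iterator
        batch_loss, state_c, state_h = sess.run(
            [loss, final_state_c, final_state_h],
            feed_dict={placeholder_x: x, placeholder_y: y, placeholder_w: w,
                       initial_state_c: state_c, initial_state_h: state_h})
        total_loss += batch_loss
        num_batches += 1
    print("ckpt %d: eval perplexity = %.3f" % (i, np.exp(total_loss / num_batches)))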