def main(unused_argv):
  del unused_argv  # Unused

  tokenizer = get_tokenizer()

  input_fn = input_func_builder.get_input_fn(
      doc_dir=FLAGS.doc_dir,
      semi_dir=FLAGS.semi_dir,
      sent_dir=FLAGS.sent_dir,
      split=FLAGS.split,
      uncased=FLAGS.uncased,
      seq_len=FLAGS.seq_len,
      bsz_per_host=FLAGS.bsz_per_host,
      num_hosts=1,
      num_core_per_host=FLAGS.num_core_per_host,
  )

  bsz_per_core = FLAGS.bsz_per_host // FLAGS.num_core_per_host
  params = {"batch_size": bsz_per_core}

  dataset = input_fn(params)
  example = tf.compat.v1.data.make_one_shot_iterator(dataset).get_next()

  for k, v in example.items():
    print(k, v.shape)

  # Keys whose values are token ids and should be decoded back to text
  text_keys = ["gen_inp", "gen_tgt", "dec_inp", "dec_tgt", "inputs",
               "dec_masked_tgt"]

  with tf.Session() as sess:
    for _ in range(FLAGS.num_example):
      example_np = sess.run(example)
      print("=" * 160)
      for k, v in example_np.items():
        if v.ndim == 2:
          for i in range(v.shape[0]):
            if k in text_keys:
              print(k, v[i].shape,
                    tokenizer.convert_ids_to_text(v[i].tolist()))
            else:
              print(k, v[i].shape,
                    " ".join([str(j) for j in v[i].tolist()]))
        elif v.ndim == 1:
          if k in text_keys:
            print(k, v.shape, tokenizer.convert_ids_to_text(v.tolist()))
          else:
            print(k, v.shape, " ".join([str(j) for j in v.tolist()]))
        elif v.ndim >= 3:  # print higher-rank tensors batch entry by entry
          for i in range(v.shape[0]):
            print(k, v.shape, v[i])
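# The inspection script above assumes a set of absl flags defined elsewhere in
# the repo. The block below is only a hypothetical sketch of what those
# definitions might look like (names mirror the FLAGS.* uses above; defaults
# are mine, not the repo's), included so the script can be read in isolation.
from absl import flags

flags.DEFINE_string("doc_dir", None, "Directory of document-level data.")
flags.DEFINE_string("semi_dir", None, "Directory of semi-document data.")
flags.DEFINE_string("sent_dir", None, "Directory of sentence-level data.")
flags.DEFINE_string("split", "train", "Data split to load.")
flags.DEFINE_bool("uncased", False, "Whether the data is uncased.")
flags.DEFINE_integer("seq_len", 512, "Sequence length.")
flags.DEFINE_integer("bsz_per_host", 8, "Batch size per host.")
flags.DEFINE_integer("num_core_per_host", 8, "Cores per host.")
flags.DEFINE_integer("num_example", 4, "Number of batches to print.")

FLAGS = flags.FLAGS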
def main(unused_argv):
  del unused_argv  # Unused
  tf.logging.set_verbosity(tf.logging.INFO)

  #### Tokenizer
  tokenizer = get_tokenizer()

  #### Get corpus info
  n_token = tokenizer.get_vocab_size()
  tf.logging.info("n_token %d", n_token)

  # test data
  inputs_np = [3933, 7752, 15179, 893, 24249, 703, 19119, 4, 2919, 335,
               8511, 1094, 43, 1661, 669, 5481, 1106, 7029, 891, 891]
  type_id_np = [0] * len(inputs_np)
  inputs_np = np.array(inputs_np)[None]
  type_id_np = np.array(type_id_np)[None]

  # tensorflow graph
  inputs = tf.placeholder(tf.int64, [1, None])
  type_id = tf.placeholder(tf.int64, [1, None])
  hiddens = model_func_builder.extract_hiddens(
      inputs, type_id, n_token, is_training=False)

  # run session
  saver = tf.train.Saver()
  with tf.Session(config=tf.ConfigProto(allow_soft_placement=False)) as sess:
    sess.run(tf.global_variables_initializer())
    saver.restore(sess, FLAGS.init_checkpoint)

    feed_dict = {
        inputs: inputs_np,
        type_id: type_id_np,
    }

    hiddens_np = sess.run(hiddens, feed_dict=feed_dict)
    tf.logging.info("number of hidden states: %d", len(hiddens_np))
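# Optional sanity check (a sketch, not part of the original script): decode
# the hard-coded test ids back to text before running the graph, so a vocab
# mismatch is visible immediately. Assumes the tokenizer exposes
# convert_ids_to_text, as the inspection script further above does.
def debug_print_test_input(tokenizer, inputs_np):
  ids = inputs_np.reshape(-1).tolist()  # drop the leading batch axis
  tf.logging.info("test input text: %s", tokenizer.convert_ids_to_text(ids))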
def main(unused_argv):
  del unused_argv  # Unused

  tokenizer = get_tokenizer()

  input_fn, _ = input_func_builder.get_input_fn(
      tfrecord_dir=FLAGS.record_dir,
      split=FLAGS.split,
      max_length=FLAGS.max_length,
      num_hosts=1,
      uncased=FLAGS.uncased,
      num_threads=FLAGS.num_threads,
  )

  bsz_per_core = FLAGS.bsz_per_host // FLAGS.num_core_per_host
  params = {"batch_size": bsz_per_core}

  dataset = input_fn(params)
  example = dataset.make_one_shot_iterator().get_next()

  # Keys whose values are token ids and should be decoded back to tokens
  text_keys = ["source", "target", "inputs", "targets"]

  with tf.Session() as sess:
    for _ in range(FLAGS.num_example):
      example_np = sess.run(example)
      print("=" * 160)
      for k, v in example_np.items():
        print(k, v.shape)
        if v.ndim == 2:
          for i in range(v.shape[0]):
            if k in text_keys:
              print(tokenizer.convert_ids_to_tokens(v[i].tolist()))
            else:
              print(v[i].tolist())
        elif v.ndim == 1:
          if k in text_keys:
            print(tokenizer.convert_ids_to_tokens(v.tolist()))
          else:
            print(v.tolist())
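# The two dataset-inspection scripts above share the same printing pattern; a
# small helper like this (a sketch, not part of the repo) would remove the
# duplication. `decode_fn` is whichever tokenizer method applies
# (convert_ids_to_text in the first script, convert_ids_to_tokens here), and
# `text_keys` lists the keys holding token ids.
def print_example(example_np, decode_fn, text_keys):
  for k, v in example_np.items():
    rows = v if v.ndim == 2 else v[None] if v.ndim == 1 else []
    for row in rows:
      if k in text_keys:
        print(k, row.shape, decode_fn(row.tolist()))
      else:
        print(k, row.shape, " ".join(str(j) for j in row.tolist()))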
def create_data(_):
  """Create pretraining data (TFRecords)."""
  tokenizer = tokenization.get_tokenizer()

  # Make workdirs
  if not tf.gfile.Exists(FLAGS.save_dir):
    tf.gfile.MakeDirs(FLAGS.save_dir)

  # Split the work across FLAGS.num_task tasks in an interleaved fashion
  file_paths = sorted(tf.gfile.Glob(FLAGS.input_glob))
  tf.logging.info("Use glob: %s", FLAGS.input_glob)
  tf.logging.info("Find %d files", len(file_paths))

  task_file_paths = file_paths[FLAGS.task::FLAGS.num_task]
  if not task_file_paths:
    tf.logging.info("Exit: task %d has no file to process.", FLAGS.task)
    return

  tf.logging.info("Task %d processes %d files:", FLAGS.task,
                  len(task_file_paths))
  for task_file in task_file_paths:
    tf.logging.debug("  - %s", task_file)

  _create_data(task_file_paths, tokenizer)
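# A quick illustration of the interleaved sharding used above: task k takes
# every num_task-th file starting at index k, so the tasks jointly cover the
# sorted file list exactly once, with no overlap.
files = ["a", "b", "c", "d", "e"]
assert files[0::2] == ["a", "c", "e"]  # task 0 of 2
assert files[1::2] == ["b", "d"]       # task 1 of 2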
def main(_):
  """Create pretraining data (TFRecords)."""
  # Load tokenizer
  tokenizer = get_tokenizer()

  # Make workdirs
  if not tf.io.gfile.exists(FLAGS.save_dir):
    tf.io.gfile.makedirs(FLAGS.save_dir)

  # Split the work across FLAGS.num_task tasks in an interleaved fashion
  file_paths = sorted(tf.io.gfile.glob(FLAGS.input_glob))
  tf.logging.info("Use glob: %s", FLAGS.input_glob)
  tf.logging.info("Find %d files: %s", len(file_paths), file_paths)

  task_file_paths = file_paths[FLAGS.task::FLAGS.num_task]
  if not task_file_paths:
    tf.logging.info("Exit: task %d has no file to process.", FLAGS.task)
    return

  tf.logging.info("Task %d processes %d files: %s", FLAGS.task,
                  len(task_file_paths), task_file_paths)

  _create_data(task_file_paths, src_tok=tokenizer, tgt_tok=tokenizer)
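# _create_data is defined elsewhere in the repo and is not shown here. The
# skeleton below is only a hypothetical sketch of the usual pattern for such
# a function (encode each input file to ids and write them as tf.Example
# records into a TFRecord); the tokenizer method name is an assumption, not
# the repo's actual API.
def _create_data_sketch(file_paths, tokenizer, save_path):
  with tf.io.TFRecordWriter(save_path) as writer:
    for path in file_paths:
      with tf.io.gfile.GFile(path) as f:
        ids = tokenizer.convert_text_to_ids(f.read())  # assumed method name
      feature = {
          "inputs": tf.train.Feature(
              int64_list=tf.train.Int64List(value=ids)),
      }
      example = tf.train.Example(features=tf.train.Features(feature=feature))
      writer.write(example.SerializeToString())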
def main(unused_argv):
  del unused_argv  # Unused
  tf.logging.set_verbosity(tf.logging.INFO)

  #### Validate FLAGS
  if FLAGS.save_steps == 0:
    FLAGS.save_steps = None
  assert FLAGS.seq_len > 0

  #### Tokenizer
  tokenizer = get_tokenizer()

  #### Get corpus info
  n_token = tokenizer.get_vocab_size()
  tf.logging.info("n_token %d", n_token)

  if FLAGS.do_train:
    # Get train input function
    train_input_fn = get_input_fn("train")

    # Get train cache function
    train_cache_fn = get_cache_fn(FLAGS.mem_len)
  else:
    train_cache_fn = None

  if FLAGS.do_eval:
    assert FLAGS.num_hosts == 1
    # Get eval input function
    eval_input_fn = get_input_fn(FLAGS.eval_split)
    tf.logging.info("num of eval batches %d", FLAGS.eval_steps)

    # Get eval cache function
    eval_cache_fn = get_cache_fn(FLAGS.mem_len)
  else:
    eval_cache_fn = None

  ##### Get model function
  model_fn = get_model_fn(n_token)

  ##### Create TPUEstimator
  # TPU Configuration
  if not run_internal and FLAGS.use_tpu:
    tpu_cluster = tf.contrib.cluster_resolver.TPUClusterResolver(
        FLAGS.tpu, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)
  else:
    tpu_cluster = None

  per_host_input = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2
  run_config = tf.contrib.tpu.RunConfig(
      cluster=tpu_cluster,
      master=FLAGS.master,
      model_dir=FLAGS.model_dir,
      session_config=tf.ConfigProto(allow_soft_placement=True),
      tpu_config=tf.contrib.tpu.TPUConfig(
          iterations_per_loop=FLAGS.iterations,
          per_host_input_for_training=per_host_input),
      keep_checkpoint_max=FLAGS.max_save,
      save_checkpoints_secs=None,
      save_checkpoints_steps=FLAGS.save_steps)

  # warm start
  warm_start_from = None
  if FLAGS.warm_start_path is not None:
    warm_start_from = tf.estimator.WarmStartSettings(
        ckpt_to_initialize_from=FLAGS.warm_start_path)

  # TPU Estimator
  estimator = tpu_estimator.TPUEstimator(
      model_fn=model_fn,
      train_cache_fn=train_cache_fn,
      eval_cache_fn=eval_cache_fn,
      use_tpu=FLAGS.use_tpu,
      config=run_config,
      params={},
      train_batch_size=FLAGS.train_batch_size,
      eval_batch_size=FLAGS.eval_batch_size,
      eval_on_tpu=FLAGS.use_tpu,
      warm_start_from=warm_start_from)

  #### Training
  if FLAGS.do_train:
    estimator.train(input_fn=train_input_fn, max_steps=FLAGS.train_steps)

  #### Evaluation
  if FLAGS.do_eval:
    if FLAGS.eval_ckpt_path is not None:
      # A path ending in "latest" means: evaluate the newest checkpoint
      # in that directory.
      if FLAGS.eval_ckpt_path.endswith("latest"):
        ckpt_dir = os.path.dirname(FLAGS.eval_ckpt_path)
        FLAGS.eval_ckpt_path = tf.train.latest_checkpoint(ckpt_dir)

      ret = estimator.evaluate(input_fn=eval_input_fn,
                               steps=FLAGS.eval_steps,
                               checkpoint_path=FLAGS.eval_ckpt_path)
      tf.logging.info("=" * 200)
      log_str = "Eval results | "
      for key, val in ret.items():
        log_str += "{} {} | ".format(key, val)
      tf.logging.info(log_str)
      tf.logging.info("=" * 200)
    else:
      ckpt_state = tf.train.get_checkpoint_state(FLAGS.model_dir)
      eval_results = []
      for eval_checkpoint in ckpt_state.all_model_checkpoint_paths:
        if not tf.gfile.Exists(eval_checkpoint + ".index"):
          continue
        global_step = int(eval_checkpoint.split("-")[-1])
        if (global_step < FLAGS.start_eval_steps
            or global_step > FLAGS.train_steps):
          continue
        tf.logging.info("Evaluate ckpt %d", global_step)
        ret = estimator.evaluate(input_fn=eval_input_fn,
                                 steps=FLAGS.eval_steps,
                                 checkpoint_path=eval_checkpoint)
        eval_results.append(ret)

      # Lower perplexity is better, so after sorting the first element
      # holds the best checkpoint's metrics.
      eval_results.sort(key=lambda x: x["perplexity"])

      tf.logging.info("=" * 200)
      log_str = "Best results | "
      for key, val in eval_results[0].items():
        log_str += "{} {} | ".format(key, val)
      tf.logging.info(log_str)
      tf.logging.info("=" * 200)
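# The checkpoint sweep in the eval branch above could be factored into a
# reusable helper; this is a sketch with names of my own choosing, not a repo
# function. It evaluates every retained checkpoint whose global step lies in
# [lo_step, hi_step] and returns the metrics dict with the lowest perplexity.
def sweep_checkpoints(estimator, eval_input_fn, model_dir,
                      lo_step, hi_step, eval_steps):
  ckpt_state = tf.train.get_checkpoint_state(model_dir)
  results = []
  for path in ckpt_state.all_model_checkpoint_paths:
    if not tf.gfile.Exists(path + ".index"):
      continue  # checkpoint was garbage-collected
    step = int(path.split("-")[-1])
    if step < lo_step or step > hi_step:
      continue
    results.append(estimator.evaluate(
        input_fn=eval_input_fn, steps=eval_steps, checkpoint_path=path))
  return min(results, key=lambda r: r["perplexity"])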
def main(unused_argv):
  del unused_argv  # Unused
  tf.logging.set_verbosity(tf.logging.INFO)

  tokenizer = tokenization.get_tokenizer()

  ##### Get train cache function
  train_cache_fn = get_cache_fn(FLAGS.mem_len)
  eval_cache_fn = get_cache_fn(FLAGS.mem_len)

  ##### Get model function
  model_fn = get_model_fn(tokenizer.get_vocab_size())

  ##### Create TPUEstimator
  # TPU Configuration
  if not run_internal and FLAGS.use_tpu:
    tpu_cluster = tf.contrib.cluster_resolver.TPUClusterResolver(
        FLAGS.tpu, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)
  else:
    tpu_cluster = None

  per_host_input = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2
  run_config = tf.contrib.tpu.RunConfig(
      cluster=tpu_cluster,
      master=FLAGS.master,
      model_dir=FLAGS.model_dir,
      session_config=tf.ConfigProto(allow_soft_placement=True),
      tpu_config=tf.contrib.tpu.TPUConfig(
          iterations_per_loop=FLAGS.iterations,
          per_host_input_for_training=per_host_input),
      keep_checkpoint_max=FLAGS.max_save,
      save_checkpoints_secs=None,
      save_checkpoints_steps=FLAGS.save_steps)

  # warm start
  warm_start_from = None
  if FLAGS.warm_start_path is not None:
    warm_start_from = tf.estimator.WarmStartSettings(
        ckpt_to_initialize_from=FLAGS.warm_start_path)

  # TPU Estimator
  estimator = tpu_estimator.TPUEstimator(
      model_fn=model_fn,
      train_cache_fn=train_cache_fn,
      eval_cache_fn=eval_cache_fn,
      use_tpu=FLAGS.use_tpu,
      config=run_config,
      train_batch_size=FLAGS.train_batch_size,
      eval_batch_size=FLAGS.eval_batch_size,
      eval_on_tpu=FLAGS.use_tpu,
      warm_start_from=warm_start_from)

  ##### Training
  if FLAGS.do_train:
    # Get train input function
    train_input_fn = get_input_fn("train")
    estimator.train(input_fn=train_input_fn, max_steps=FLAGS.train_steps)

  #### Evaluation
  if FLAGS.do_eval:
    # Get eval input function
    eval_input_fn = get_input_fn(FLAGS.eval_split)
    estimator.evaluate(input_fn=eval_input_fn,
                       steps=FLAGS.eval_steps,
                       checkpoint_path=FLAGS.eval_ckpt_path)
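# Note that the evaluate() call above discards the returned metrics dict,
# unlike the first training script. A small helper like this (a sketch) would
# surface the metrics in the logs using that script's formatting:
def log_metrics(prefix, metrics):
  log_str = prefix + " | "
  for key, val in metrics.items():
    log_str += "{} {} | ".format(key, val)
  tf.logging.info("=" * 200)
  tf.logging.info(log_str)
  tf.logging.info("=" * 200)

# Usage: log_metrics("Eval results", estimator.evaluate(...))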