def next_undecoded_checkpoint(model_dir, timeout_mins=240):
  """Yields checkpoints from model_dir in increasing global-step order.

  Waits for a checkpoint newer than the last one seen, then scans the full
  checkpoint state so that no intermediate checkpoint is skipped: the next
  checkpoint yielded is always the one with the smallest step strictly
  greater than the step of the last checkpoint yielded.

  Args:
    model_dir: The directory in which checkpoints are saved.
    timeout_mins: Maximum number of minutes to wait for a new checkpoint.

  Yields:
    Path of the next checkpoint to evaluate. Stops when the wait times out
    and there is no unevaluated checkpoint left.
  """
  last_ckpt = None
  last_step = 0
  while True:
    # Block until a checkpoint newer than last_ckpt appears, or time out.
    last_ckpt = contrib.training().wait_for_new_checkpoint(
        model_dir, last_ckpt, seconds_to_sleep=60, timeout=60 * timeout_mins)
    # Enumerate every checkpoint currently recorded for this model.
    ckpt_state = tf.train.get_checkpoint_state(model_dir)
    # Pair each checkpoint path with its global step (parsed from the
    # "<prefix>-<step>" basename), keeping only steps newer than last_step.
    pending = [
        (int(os.path.basename(path).split("-")[1]), path)
        for path in ckpt_state.all_model_checkpoint_paths
    ]
    pending = [(step, path) for step, path in pending if step > last_step]
    if not pending:
      if last_ckpt is None:
        # Timed out and every recorded checkpoint has been yielded: stop.
        tf.logging.info("Eval timeout: no new checkpoints within %dm" %
                        timeout_mins)
        break
      # Nothing newer was recorded; re-yield the checkpoint we already have.
    else:
      # min() returns the first minimal element, so on (unlikely) duplicate
      # steps the earlier path wins, as in a linear first-smaller scan.
      last_step, last_ckpt = min(pending, key=lambda item: item[0])
    yield last_ckpt
def test_model_shapes(self):
  """Checks output shapes of NeuralStackModel's bottom() and body()."""
  batch_size = 100
  seq_length = 80
  embedding_size = 64
  vocab_size = 128

  hparams = neural_stack.neural_stack()
  problem_hparams = contrib.training().HParams()
  problem_hparams.add_hparam("modality", {
      "inputs": modalities.ModalityType.SYMBOL,
      "targets": modalities.ModalityType.SYMBOL,
  })
  problem_hparams.add_hparam("vocab_size", {
      "inputs": vocab_size,
      "targets": vocab_size,
  })
  model = neural_stack.NeuralStackModel(
      hparams, problem_hparams=problem_hparams)

  # Token ids come in as [batch, length, 1, 1] int32 tensors.
  token_shape = [batch_size, seq_length, 1, 1]
  features = {
      "inputs": tf.ones(token_shape, dtype=tf.int32),
      "targets": tf.ones(token_shape, dtype=tf.int32),
  }

  # bottom() should embed the inputs to [batch, length, 1, embedding].
  embedded = model.bottom(features)
  expected_shape = [batch_size, seq_length, 1, embedding_size]
  self.assertEqual(expected_shape, embedded["inputs"].shape)

  # body() keeps the embedding-sized shape (logits come later, in top()).
  body_output = model.body(embedded)
  self.assertEqual(expected_shape, body_output.shape)
def next_checkpoint(model_dir, timeout_mins=240):
  """Yields successive checkpoints from model_dir as they appear.

  Args:
    model_dir: The directory in which checkpoints are saved.
    timeout_mins: The maximum amount of time in minutes to wait between
      checkpoints. Set this to -1 to wait indefinitely.

  Yields:
    Path of each new checkpoint. Iteration stops (without yielding) once
    the timeout elapses with no new checkpoint.
  """
  # -1 means wait forever: pass no deadline to the checkpoint waiter.
  timeout_secs = None if timeout_mins == -1 else timeout_mins * 60
  last_ckpt = None
  while True:
    last_ckpt = contrib.training().wait_for_new_checkpoint(
        model_dir, last_ckpt, seconds_to_sleep=60, timeout=timeout_secs)
    if last_ckpt is None:
      tf.logging.info("Eval timeout: no new checkpoints within %dm" %
                      timeout_mins)
      break
    yield last_ckpt