Пример #1
0
 def _maybe_create_checkpoint_manager(self,
                                      checkpoint_or_checkpoint_manager,
                                      checkpoint_dir, cluster_resolver):
     """Install the read and write CheckpointManagers on this object.

     A `CheckpointManager` argument is reused as-is for both reading and
     writing. Any other argument is wrapped in managers created here: the
     read manager always targets `checkpoint_dir`; the chief writes through
     that same manager, while a non-chief task gets a separate write manager
     rooted at `_non_chief_checkpoint_dir(checkpoint_dir, task_id)`.
     `_api_made_checkpoint_manager` records whether the managers were
     created here.
     """
     supplied = checkpoint_or_checkpoint_manager
     if isinstance(supplied, checkpoint_management.CheckpointManager):
         # The caller owns the manager; share it for both roles.
         self._read_checkpoint_manager = supplied
         self._write_checkpoint_manager = supplied
         self._api_made_checkpoint_manager = False
         return

     self._api_made_checkpoint_manager = True
     # MultiWorkerMirroredStrategy needs a different setup on the chief than
     # on the other workers, so the write manager is chosen per task below.
     self._read_checkpoint_manager = checkpoint_management.CheckpointManager(
         supplied, directory=checkpoint_dir, max_to_keep=1)
     on_chief = multi_worker_util.is_chief(
         cluster_spec=cluster_resolver.cluster_spec(),
         task_type=cluster_resolver.task_type,
         task_id=cluster_resolver.task_id)
     if on_chief:
         self._write_checkpoint_manager = self._read_checkpoint_manager
         return

     non_chief_dir = _non_chief_checkpoint_dir(checkpoint_dir,
                                               cluster_resolver.task_id)
     self._write_checkpoint_manager = checkpoint_management.CheckpointManager(
         supplied, non_chief_dir, max_to_keep=1)
def _make_checkpoint_manager(checkpoint, checkpoint_dir, cluster_resolver):
    """Build a keep-one CheckpointManager for the current task.

    The chief task gets a manager rooted at `checkpoint_dir`; every other
    task gets one rooted at the directory returned by
    `failure_handling._non_chief_checkpoint_dir` for its task id.
    """
    on_chief = multi_worker_util.is_chief(
        cluster_spec=cluster_resolver.cluster_spec(),
        task_type=cluster_resolver.task_type,
        task_id=cluster_resolver.task_id)
    if on_chief:
        target_dir = checkpoint_dir
    else:
        target_dir = failure_handling._non_chief_checkpoint_dir(
            checkpoint_dir, cluster_resolver.task_id)
    return checkpoint_management.CheckpointManager(
        checkpoint, directory=target_dir, max_to_keep=1)
 def testCustomCheckpointPrefix(self):
     """Saved file names use `checkpoint_name`, or "ckpt" when it is omitted."""
     ckpt_dir = self.get_temp_dir()
     tracked = util.Checkpoint()

     # Explicit prefix.
     named_manager = checkpoint_management.CheckpointManager(
         tracked, ckpt_dir, max_to_keep=2, checkpoint_name="ckpt_name")
     named_path = named_manager.save(checkpoint_number=5)
     self.assertEqual(os.path.basename(named_path), "ckpt_name-5")

     # No prefix given.
     default_manager = checkpoint_management.CheckpointManager(
         tracked, ckpt_dir, max_to_keep=2)
     default_path = default_manager.save(checkpoint_number=5)
     self.assertEqual(os.path.basename(default_path), "ckpt-5")
 def testClockReset(self, mock_time):
     """Checks CheckpointManager behavior when the clock jumps backwards.

     `mock_time` replaces the clock read by the code under test (presumably
     injected by a patch decorator that is outside this view). The test saves
     three checkpoints one hour apart, rewinds the clock, re-creates the
     manager and verifies that a warning is logged and that the older
     checkpoints are left on disk.
     """
     directory = self.get_temp_dir()
     mock_time.time.return_value = 10000.
     checkpoint = util.Checkpoint()
     first_manager = checkpoint_management.CheckpointManager(
         checkpoint,
         directory,
         max_to_keep=1,
         keep_checkpoint_every_n_hours=1.)
     # Three saves at mocked times 10000, 13600 and 17200.
     first_path = first_manager.save()
     mock_time.time.return_value += 3600.
     second_path = first_manager.save()
     mock_time.time.return_value += 3600.
     third_path = first_manager.save()
     # Only the first checkpoint is gone; the second is still on disk even
     # though the manager lists just the third.
     self.assertFalse(checkpoint_management.checkpoint_exists(first_path))
     self.assertTrue(checkpoint_management.checkpoint_exists(second_path))
     self.assertTrue(checkpoint_management.checkpoint_exists(third_path))
     self.assertEqual([third_path], first_manager.checkpoints)
     state = checkpoint_management.get_checkpoint_state(directory)
     self.assertEqual(13600., state.last_preserved_timestamp)
     # Set the clock back in time
     mock_time.time.return_value = 5000.
     del first_manager
     # Constructing a manager with the rewound clock must log a warning.
     with test.mock.patch.object(logging, "warning") as mock_log:
         second_manager = checkpoint_management.CheckpointManager(
             checkpoint, directory, max_to_keep=1)
         self.assertRegex(str(mock_log.call_args),
                          "behind the last preserved checkpoint timestamp")
     # We should err on the side of keeping checkpoints around when we're not
     # sure whether they were preserved or not due to clock funkiness.
     self.assertTrue(checkpoint_management.checkpoint_exists(second_path))
     # We know about the existing checkpoints, but they'll never be deleted and
     # so won't go in the CheckpointState proto on save.
     self.assertEqual(third_path, second_manager.latest_checkpoint)
     self.assertEqual([], second_manager.checkpoints)
     # Save at mocked time 5010.
     mock_time.time.return_value += 10.
     fourth_path = second_manager.save()
     self.assertTrue(checkpoint_management.checkpoint_exists(second_path))
     self.assertTrue(checkpoint_management.checkpoint_exists(third_path))
     self.assertEqual(fourth_path, second_manager.latest_checkpoint)
     self.assertEqual([fourth_path], second_manager.checkpoints)
     # Save at mocked time 5020; the pre-reset checkpoints must still exist.
     mock_time.time.return_value += 10.
     fifth_path = second_manager.save()
     self.assertTrue(checkpoint_management.checkpoint_exists(second_path))
     self.assertTrue(checkpoint_management.checkpoint_exists(third_path))
     self.assertEqual([fifth_path], second_manager.checkpoints)
     # The stored state now reflects the rewound clock.
     state = checkpoint_management.get_checkpoint_state(directory)
     self.assertEqual(5000., state.last_preserved_timestamp)
     self.assertEqual([5020.], state.all_model_checkpoint_timestamps)
Пример #5
0
    def test_training_loop(self):
        """Runs a small training loop on `self.device` with checkpointing.

        The test is skipped unconditionally by the first statement (see
        b/216201668), so the loop below does not currently execute.
        """
        self.skipTest("b/216201668: revisit parallel device and checkpointing")
        learning_rate = 0.01
        for _ in range(5):
            dense = _Dense(5)
            ckpt = tracking.Checkpoint(layer=dense)
            ckpt_manager = checkpoint_management.CheckpointManager(
                ckpt, directory=self.get_temp_dir(), max_to_keep=5)
            ckpt_manager.restore_or_initialize()

            for _ in range(10):
                per_component_inputs = [
                    constant_op.constant([[-0.5]]),
                    constant_op.constant([[0.5]]),
                ]
                packed_inputs = self.device.pack(per_component_inputs)
                with self.device:
                    with backprop.GradientTape() as tape:
                        outputs = dense(packed_inputs)
                        squared_error = (outputs - math_ops.range(5.))**2.
                    weights = dense.trainable_variables
                    raw_grads = tape.gradient(squared_error, weights)
                    # Sum the gradients across the device's components
                    # before applying the update.
                    summed_grads = _collective_sum(
                        raw_grads,
                        num_replicas=len(self.device.components))
                    for weight_grad, weight in zip(summed_grads, weights):
                        weight.assign_sub(learning_rate * weight_grad)

                ckpt_manager.save()
Пример #6
0
def save(self,
         path,
         compression=None,
         shard_func=None,
         checkpoint_args=None):
  """Implements the save function and checkpoint functionality.

  Two code paths exist:

  * Eager execution with a non-empty `checkpoint_args`: the dataset is
    wrapped in a `_SaveDataset`, iterated element by element, and a
    `CheckpointManager` built from `checkpoint_args` saves the iterator
    state after each element (subject to the manager's interval check).
    Any existing latest checkpoint is restored before iteration starts.
  * Otherwise: the whole dataset is written with a single
    `ged_ops.save_dataset` op.

  Args:
    path: Destination passed to the save implementation.
    compression: Compression setting forwarded to the save implementation.
    shard_func: Optional sharding function forwarded to the save
      implementation.
    checkpoint_args: Optional mapping of keyword arguments for
      `checkpoint_management.CheckpointManager`. It must not contain a
      "checkpoint" entry; that one is supplied here. If it contains a
      "step_counter", the counter is incremented once per element. The
      mapping itself is not modified.

  Raises:
    ValueError: If `checkpoint_args` contains the key "checkpoint".
  """
  if context.executing_eagerly() and checkpoint_args:
    # Validate first so that nothing is constructed for a call that is
    # going to be rejected anyway.
    if "checkpoint" in checkpoint_args:
      raise ValueError(
          "Invalid `checkpoint_args`. `checkpoint_args` are not allowed "
          "to include 'checkpoint'."
      )
    save_dataset = _SaveDataset(self, path, shard_func, compression)
    save_iterator = iter(save_dataset)

    checkpoint = checkpoint_lib.Checkpoint(iterator=save_iterator)
    # Work on a copy: writing "checkpoint" into the caller's mapping would
    # make a second call with the same mapping fail the validation above.
    manager_kwargs = dict(checkpoint_args)
    manager_kwargs["checkpoint"] = checkpoint
    manager = checkpoint_management.CheckpointManager(**manager_kwargs)
    checkpoint.restore(manager.latest_checkpoint)

    step_counter = manager_kwargs.get("step_counter")
    # Iterating drives the save; the produced elements are not needed.
    for _ in save_iterator:
      if step_counter is not None:
        step_counter.assign_add(delta=1)
      manager.save(check_interval=True)
  else:
    dataset, shard_func, use_shard_func, path = set_save_dataset_attributes(
        self, shard_func, path)
    ged_ops.save_dataset(
        dataset._variant_tensor,   # pylint: disable=protected-access
        path=path,
        shard_func_other_args=shard_func.captured_inputs,
        compression=compression,
        shard_func=shard_func,
        use_shard_func=use_shard_func)
    def testRestoreOrInitialize(self):
        """`restore_or_initialize` uses `init_fn` first, then real checkpoints."""
        root_dir = self.get_temp_dir()

        # Write a standalone checkpoint holding 2.0 to initialize from.
        seed_prefix = os.path.join(root_dir, "init")
        seed_var = variables.Variable(2.0)
        seed_ckpt = util.Checkpoint(v=seed_var)
        self.evaluate(seed_var.initializer)
        seed_path = seed_ckpt.save(seed_prefix)

        # Build the manager under test around a variable that starts at 1.0.
        managed_dir = os.path.join(root_dir, "ckpt")
        managed_var = variables.Variable(1.0)
        managed_ckpt = util.Checkpoint(v=managed_var)

        def restore_from_seed():
            return managed_ckpt.restore(seed_path).run_restore_ops()

        manager = checkpoint_management.CheckpointManager(
            managed_ckpt,
            managed_dir,
            max_to_keep=None,
            init_fn=restore_from_seed)
        self.evaluate(managed_var.initializer)

        # Nothing has been saved by the manager yet, so `init_fn` runs and
        # the call returns None.
        restored_path = manager.restore_or_initialize()
        self.assertIsNone(restored_path)
        self.assertEqual(2.0, self.evaluate(managed_var))

        # Once a checkpoint exists, the call restores it and returns a value.
        manager.save()
        restored_path = manager.restore_or_initialize()
        self.assertIsNotNone(restored_path)
Пример #8
0
 def _assertNotCheckpointable(self, dataset):
     """Asserts that saving an iterator over `dataset` fails.

     The save is expected to raise `errors.FailedPreconditionError`.
     """
     dataset_iterator = iter(dataset)
     step_var = variables.Variable(0)
     tracked = trackable_utils.Checkpoint(step=step_var,
                                          iterator=dataset_iterator)
     ckpt_manager = checkpoint_management.CheckpointManager(
         tracked, self.get_temp_dir(), max_to_keep=3)
     with self.assertRaises(errors.FailedPreconditionError):
         ckpt_manager.save()
    def testLatestCheckpointFSpathDirectory(self):
        """`latest_checkpoint` accepts a `pathlib.Path` directory."""
        ckpt_dir = pathlib.Path(self.get_temp_dir())
        tracked = util.Checkpoint()
        ckpt_manager = checkpoint_management.CheckpointManager(
            tracked, ckpt_dir, max_to_keep=2, checkpoint_name="ckpt_name")
        ckpt_manager.save()

        latest_path = checkpoint_management.latest_checkpoint(ckpt_dir)
        expected_path = str(ckpt_dir / "ckpt_name-1")
        self.assertEqual(expected_path, latest_path)
 def testKeepAll(self):
     """Checks `max_to_keep=None` retention and a later switch to a limit.

     Managers created with `max_to_keep=None` keep every checkpoint, also
     across manager re-creation. A manager re-created with `max_to_keep=3`
     still lists all existing checkpoints, and only its next save removes
     the oldest ones.
     """
     checkpoint = util.Checkpoint()
     directory = os.path.join(
         self.get_temp_dir(),
         # Avoid sharing directories between eager and graph
         # TODO(allenl): stop run_in_graph_and_eager_modes reusing directories
         str(context.executing_eagerly()))
     manager = checkpoint_management.CheckpointManager(checkpoint,
                                                       directory,
                                                       max_to_keep=None)
     first_path = manager.save()
     second_path = manager.save()
     third_path = manager.save()
     # With no limit, all three saves remain on disk and are tracked.
     self.assertTrue(checkpoint_management.checkpoint_exists(third_path))
     self.assertTrue(checkpoint_management.checkpoint_exists(second_path))
     self.assertTrue(checkpoint_management.checkpoint_exists(first_path))
     self.assertEqual(third_path, manager.latest_checkpoint)
     self.assertEqual([first_path, second_path, third_path],
                      manager.checkpoints)
     del manager
     # A fresh unlimited manager picks up the existing checkpoints.
     manager = checkpoint_management.CheckpointManager(checkpoint,
                                                       directory,
                                                       max_to_keep=None)
     fourth_path = manager.save()
     self.assertEqual([first_path, second_path, third_path, fourth_path],
                      manager.checkpoints)
     del manager
     # Re-creating the manager with a limit does not delete anything by
     # itself: all four checkpoints are still listed and present.
     manager = checkpoint_management.CheckpointManager(checkpoint,
                                                       directory,
                                                       max_to_keep=3)
     self.assertEqual([first_path, second_path, third_path, fourth_path],
                      manager.checkpoints)
     self.assertTrue(checkpoint_management.checkpoint_exists(fourth_path))
     self.assertTrue(checkpoint_management.checkpoint_exists(third_path))
     self.assertTrue(checkpoint_management.checkpoint_exists(second_path))
     self.assertTrue(checkpoint_management.checkpoint_exists(first_path))
     # The next save applies the limit: only the newest three survive.
     fifth_path = manager.save()
     self.assertEqual([third_path, fourth_path, fifth_path],
                      manager.checkpoints)
     self.assertTrue(checkpoint_management.checkpoint_exists(fifth_path))
     self.assertTrue(checkpoint_management.checkpoint_exists(fourth_path))
     self.assertTrue(checkpoint_management.checkpoint_exists(third_path))
     self.assertFalse(checkpoint_management.checkpoint_exists(second_path))
     self.assertFalse(checkpoint_management.checkpoint_exists(first_path))
Пример #11
0
    def __init__(self, model, checkpoint_dir):
        """Set up the epoch variable and the read/write checkpoint managers.

        Args:
          model: Model to back up; its `distribute_strategy` decides whether
            this worker shares the read manager for writing.
          checkpoint_dir: Base directory under which checkpoints are kept.
        """
        self._model = model

        # Epoch at which the checkpoint was saved; used for fault tolerance.
        # It is int64 because the GPU device only has the int64 dtype
        # registered for VarHandleOp.
        unused_epoch = constant_op.constant(CKPT_SAVED_EPOCH_UNUSED_VALUE,
                                            dtype=dtypes.int64)
        self._ckpt_saved_epoch = variables.Variable(
            initial_value=unused_epoch, name='ckpt_saved_epoch')

        # Initialize the variable's value.
        backend.set_value(self._ckpt_saved_epoch,
                          CKPT_SAVED_EPOCH_UNUSED_VALUE)

        # Tracking `_ckpt_saved_epoch` here puts it into the checkpoint file
        # together with the model when backing up.
        backup_checkpoint = trackable_util.Checkpoint(
            model=self._model, ckpt_saved_epoch=self._ckpt_saved_epoch)

        # Single-worker training: the read and write managers are one and
        # the same object.
        #
        # Multi-worker training, on a worker that should not checkpoint: the
        # write manager points at a temporary path that is removed at the end
        # of the back_up() call. Every worker still has to run `save()`
        # because a SyncOnReadVariable must be synced across all workers to
        # be read. Restoring, however, always uses the read manager's
        # directory on every worker.
        chief_dir = os.path.join(checkpoint_dir, 'chief')
        self.read_checkpoint_manager = checkpoint_management.CheckpointManager(
            backup_checkpoint, directory=chief_dir, max_to_keep=1)
        write_dir = distributed_file_utils.write_dirpath(
            checkpoint_dir, self._model.distribute_strategy)
        if not self._model.distribute_strategy.extended.should_checkpoint:
            self.write_checkpoint_manager = (
                checkpoint_management.CheckpointManager(
                    backup_checkpoint, directory=write_dir, max_to_keep=1))
        else:
            self.write_checkpoint_manager = self.read_checkpoint_manager
    def testCheckpointManagerFSpathDirectory(self):
        """A manager built on a `pathlib.Path` reports string checkpoint paths."""
        ckpt_dir = pathlib.Path(self.get_temp_dir())
        tracked_var = variables.Variable(0.0)
        tracked = util.Checkpoint(v=tracked_var)
        self.evaluate(tracked_var.initializer)
        ckpt_manager = checkpoint_management.CheckpointManager(
            tracked, ckpt_dir, max_to_keep=2, checkpoint_name="ckpt_name")

        saved_to = ckpt_manager.save()
        self.assertEqual(str(ckpt_dir / "ckpt_name-1"), saved_to)

        restored_from = ckpt_manager.restore_or_initialize()
        self.assertEqual(str(ckpt_dir / "ckpt_name-1"), restored_from)
 def testDeletion(self):
     """With `max_to_keep=3`, a fourth save removes the oldest checkpoint."""
     ckpt_manager = checkpoint_management.CheckpointManager(
         util.Checkpoint(), self.get_temp_dir(), max_to_keep=3)
     saved_paths = [ckpt_manager.save() for _ in range(4)]
     oldest, second, third, newest = saved_paths
     for surviving in (newest, third, second):
         self.assertTrue(checkpoint_management.checkpoint_exists(surviving))
     self.assertFalse(checkpoint_management.checkpoint_exists(oldest))
Пример #14
0
 def testCheckpointLargeShuffleBuffer(self):
     """Saving an iterator with a very large shuffle buffer must not fail."""
     # Each element is a 512M float32 tensor.
     big_element = array_ops.ones((128, 1024, 1024), dtype=dtypes.float32)
     # A shuffle buffer of 5 such elements exceeds the 2GB protobuf limit.
     shuffled = dataset_ops.Dataset.from_tensors(big_element).repeat().shuffle(5)
     shuffled_iterator = iter(shuffled)
     # Pull one element so the shuffle buffer is filled before saving.
     next(shuffled_iterator)
     tracked = trackable_utils.Checkpoint(iterator=shuffled_iterator)
     ckpt_manager = checkpoint_management.CheckpointManager(
         tracked, self.get_temp_dir(), max_to_keep=1)
     ckpt_manager.save()
Пример #15
0
 def testCheckpointLargeBatches(self):
     """Saving an iterator that buffers very large batches must not fail."""
     if pywrap_sanitizers.is_tsan_enabled():
         self.skipTest(
             'Creating a large buffer causes OOM when using tsan.')
     # Two 64x1024x1024 float32 elements per batch give 512M batches.
     half_batch = array_ops.ones((64, 1024, 1024), dtype=dtypes.float32)
     batched = dataset_ops.Dataset.from_tensors(half_batch).repeat()
     batched = batched.batch(2, num_parallel_calls=5)
     batch_iterator = iter(batched)
     # Pull one element so the buffer is filled before saving.
     next(batch_iterator)
     tracked = trackable_utils.Checkpoint(iterator=batch_iterator)
     ckpt_manager = checkpoint_management.CheckpointManager(
         tracked, self.get_temp_dir(), max_to_keep=1)
     ckpt_manager.save()
Пример #16
0
    def testSaveRestoreModifiedDataset(self):
        """Restoring into an iterator over a different dataset raises."""
        shared_dir = self.get_temp_dir()

        # Save the state of an iterator that has consumed five elements.
        source = dataset_ops.Dataset.range(10)
        source_iter = iter(source)
        tracked = trackable_utils.Checkpoint(iterator=source_iter)
        ckpt_manager = checkpoint_management.CheckpointManager(
            tracked, shared_dir, max_to_keep=3)
        for _ in range(5):
            next(source_iter)
        ckpt_manager.save()

        # Rebuild everything around a different dataset definition, reusing
        # the same directory, and try to restore into the new iterator.
        source = dataset_ops.Dataset.from_tensor_slices([1, 2, 3])
        source_iter = iter(source)
        tracked = trackable_utils.Checkpoint(iterator=source_iter)
        ckpt_manager = checkpoint_management.CheckpointManager(
            tracked, shared_dir, max_to_keep=3)
        expected_message = "Make sure the dataset definition has not changed"
        with self.assertRaisesRegex(errors.NotFoundError, expected_message):
            tracked.restore(ckpt_manager.latest_checkpoint)
Пример #17
0
    def testCheckpointFinishedCache(self):
        """An exhausted cached iterator stays exhausted after save/restore."""
        element_count = 10
        cached = dataset_ops.Dataset.range(element_count).cache()

        cached_iter = iter(cached)
        # Consume every element so the iterator is at its end.
        for expected_value in range(element_count):
            self.assertEqual(next(cached_iter).numpy(), expected_value)

        tracked = trackable_utils.Checkpoint(iterator=cached_iter)
        ckpt_manager = checkpoint_management.CheckpointManager(
            tracked, self.get_temp_dir(), max_to_keep=1)
        ckpt_manager.save()
        ckpt_manager.restore_or_initialize()
        with self.assertRaises(StopIteration):
            next(cached_iter)
Пример #18
0
    def testSaveRestoreReshuffleDataset(self):
        """Restoring a shuffled iterator replays the elements seen after save."""
        shuffled = dataset_ops.Dataset.range(10).shuffle(
            10, reshuffle_each_iteration=True)
        shuffled_iter = iter(shuffled)
        tracked = trackable_utils.Checkpoint(step=variables.Variable(0),
                                             iterator=shuffled_iter)
        ckpt_manager = checkpoint_management.CheckpointManager(
            tracked, self.get_temp_dir(), max_to_keep=3)

        def take_five():
            return [next(shuffled_iter).numpy() for _ in range(5)]

        before_save = take_five()

        ckpt_manager.save()
        after_save = take_five()

        tracked.restore(ckpt_manager.latest_checkpoint)
        after_restore = take_five()

        self.assertNotEqual(before_save, after_save)
        self.assertCountEqual(after_save, after_restore)
    def testCheckpointInterval(self):
        """`save(check_interval=True)` honors `checkpoint_interval=2`."""
        v = variables.Variable(1.0)
        step_counter = variables.Variable(0)
        self.evaluate([v.initializer, step_counter.initializer])
        tracked = util.Checkpoint(v=v)
        ckpt_manager = checkpoint_management.CheckpointManager(
            tracked,
            self.get_temp_dir(),
            max_to_keep=None,
            step_counter=step_counter,
            checkpoint_interval=2)

        def advance_step():
            self.evaluate(step_counter.assign_add(1))

        # Step 0: the initial checkpoint is written.
        initial_path = ckpt_manager.save(check_interval=True)
        self.assertTrue(checkpoint_management.checkpoint_exists(initial_path))

        # Step 1: nothing is written.
        advance_step()
        self.assertIsNone(ckpt_manager.save(check_interval=True))

        # Step 2: a checkpoint is written.
        advance_step()
        interval_path = ckpt_manager.save(check_interval=True)
        self.assertTrue(checkpoint_management.checkpoint_exists(interval_path))

        # Same step again: a repeated call writes nothing.
        self.assertIsNone(ckpt_manager.save(check_interval=True))

        # Step 3: nothing is written.
        advance_step()
        self.assertIsNone(ckpt_manager.save(check_interval=True))

        # Disabling the interval check always writes a checkpoint.
        forced_path = ckpt_manager.save(check_interval=False)
        self.assertTrue(checkpoint_management.checkpoint_exists(forced_path))
 def testContinueFromUnmanaged(self):
     """A manager continues numbering from checkpoints it did not create.

     Checkpoints written directly through `Checkpoint.save` stay on disk;
     only checkpoints saved by the manager are subject to `max_to_keep`.
     """
     ckpt_dir = self.get_temp_dir()
     unmanaged_prefix = os.path.join(ckpt_dir, "unusual_prefix")
     checkpoint = util.Checkpoint()
     unmanaged_first = checkpoint.save(unmanaged_prefix)
     unmanaged_second = checkpoint.save(unmanaged_prefix)
     del checkpoint

     # Start over with a fresh Checkpoint object managed by a manager.
     checkpoint = util.Checkpoint()
     ckpt_manager = checkpoint_management.CheckpointManager(
         checkpoint, ckpt_dir, max_to_keep=2)
     checkpoint.restore(ckpt_manager.latest_checkpoint).run_restore_ops()
     self.assertEqual(2, self.evaluate(checkpoint.save_counter))

     managed_third = ckpt_manager.save()
     self.assertEqual([managed_third], ckpt_manager.checkpoints)
     managed_fourth = ckpt_manager.save()
     self.assertEqual([managed_third, managed_fourth],
                      ckpt_manager.checkpoints)
     managed_fifth = ckpt_manager.save()
     self.assertEqual([managed_fourth, managed_fifth],
                      ckpt_manager.checkpoints)

     # The unmanaged checkpoints are untouched; the oldest managed one is
     # gone.
     self.assertTrue(checkpoint_management.checkpoint_exists(unmanaged_first))
     self.assertTrue(checkpoint_management.checkpoint_exists(unmanaged_second))
     self.assertFalse(checkpoint_management.checkpoint_exists(managed_third))
     self.assertTrue(checkpoint_management.checkpoint_exists(managed_fourth))
     self.assertTrue(checkpoint_management.checkpoint_exists(managed_fifth))
    def testCheckpointIntervalWithRestore(self):
        """No checkpoint is written right after restoring at the same step."""
        ckpt_dir = self.get_temp_dir()
        v = variables.Variable(1.0)
        step_counter = variables.Variable(0)
        self.evaluate([v.initializer, step_counter.initializer])

        # Write a checkpoint directly so there is something to restore.
        tracked = util.Checkpoint(v=v)
        tracked.save(os.path.join(ckpt_dir, "ckpt"))

        ckpt_manager = checkpoint_management.CheckpointManager(
            tracked,
            ckpt_dir,
            max_to_keep=None,
            step_counter=step_counter,
            checkpoint_interval=2)

        # The manager finds and restores the checkpoint written above.
        restored_from = ckpt_manager.restore_or_initialize()
        self.assertIsNotNone(restored_from)

        # The step counter is still 0, the step that was just restored, so
        # this save is expected to write nothing.
        self.assertIsNone(ckpt_manager.save())
 def testCustomNumbering(self):
     """`checkpoint_number` accepts both a variable and a plain integer."""
     ckpt_dir = self.get_temp_dir()
     step = variables.Variable(0, dtype=dtypes.int64)
     tracked = util.Checkpoint(step=step)
     ckpt_manager = checkpoint_management.CheckpointManager(
         tracked, ckpt_dir, max_to_keep=2)
     self.evaluate(step.initializer)

     # Number checkpoints with the variable, which advances by 2 per save.
     for expected_number in range(0, 10, 2):
         numbered_path = ckpt_manager.save(checkpoint_number=step)
         expected_suffix = "-%d" % (expected_number, )
         if not numbered_path.endswith(expected_suffix):
             self.fail("%s should have suffix %s" % (numbered_path,
                                                     expected_suffix))
         self.evaluate(step.assign_add(2))
     self.assertEqual(5, self.evaluate(tracked.save_counter))

     # A regular Python integer works as a checkpoint number too.
     integer_path = ckpt_manager.save(checkpoint_number=32)
     self.assertIn("-32", integer_path)
     self.assertEqual(integer_path, ckpt_manager.latest_checkpoint)
     self.assertEqual(integer_path,
                      checkpoint_management.latest_checkpoint(ckpt_dir))

     # Only the two most recent checkpoints are recorded in the state.
     recorded_state = checkpoint_management.get_checkpoint_state(ckpt_dir)
     self.assertEqual([numbered_path, integer_path],
                      recorded_state.all_model_checkpoint_paths)
    def testSaveRestoreState(self, mock_time):
        """Checks that manager state survives re-creating the manager.

        `mock_time` replaces the clock read by the code under test
        (presumably injected by a patch decorator that is outside this
        view). Three managers are created in turn over the same directory;
        the test checks the tracked checkpoint list, the recorded timestamps
        and which files remain on disk after each step.
        """
        directory = self.get_temp_dir()
        mock_time.time.return_value = 3.
        checkpoint = util.Checkpoint()
        first_manager = checkpoint_management.CheckpointManager(checkpoint,
                                                                directory,
                                                                max_to_keep=2)
        # First save at mocked time 10000.
        first_time = 10000.
        first_name = os.path.join(directory, "ckpt-1")
        mock_time.time.return_value = first_time
        first_manager.save()
        # NOTE(review): this `state` is overwritten below before it is read.
        state = checkpoint_management.get_checkpoint_state(directory)
        # Second save at mocked time 13610.
        second_time = first_time + 3610.
        second_name = os.path.join(directory, "ckpt-2")
        mock_time.time.return_value = second_time
        first_manager.save()
        state = checkpoint_management.get_checkpoint_state(directory)
        self.assertEqual([first_time, second_time],
                         state.all_model_checkpoint_timestamps)
        self.assertEqual([first_name, second_name], first_manager.checkpoints)
        self.assertEqual(second_name, first_manager.latest_checkpoint)
        del first_manager

        # A new manager over the same directory sees the same checkpoints.
        second_manager = checkpoint_management.CheckpointManager(
            checkpoint,
            directory,
            max_to_keep=2,
            keep_checkpoint_every_n_hours=1.5)
        self.assertEqual([first_name, second_name], second_manager.checkpoints)
        self.assertEqual(second_name, second_manager.latest_checkpoint)
        # Third save, 0.2 hours after the second.
        third_name = os.path.join(directory, "ckpt-3")
        third_time = second_time + 3600. * 0.2
        mock_time.time.return_value = third_time
        second_manager.save()
        # The first checkpoint left the tracked list but is still on disk,
        # and its timestamp is recorded as the last preserved one.
        self.assertTrue(checkpoint_management.checkpoint_exists(first_name))
        self.assertTrue(checkpoint_management.checkpoint_exists(second_name))
        self.assertEqual([second_name, third_name], second_manager.checkpoints)
        state = checkpoint_management.get_checkpoint_state(directory)
        self.assertEqual(first_time, state.last_preserved_timestamp)
        # Fourth save, another 0.5 hours later: the second checkpoint is
        # deleted while the first stays.
        fourth_time = third_time + 3600. * 0.5
        mock_time.time.return_value = fourth_time
        fourth_name = os.path.join(directory, "ckpt-4")
        second_manager.save()
        self.assertTrue(checkpoint_management.checkpoint_exists(first_name))
        self.assertFalse(checkpoint_management.checkpoint_exists(second_name))
        self.assertEqual([third_name, fourth_name], second_manager.checkpoints)
        # Fifth save, another 0.5 hours later; the last preserved timestamp
        # is still that of the first checkpoint.
        fifth_time = fourth_time + 3600. * 0.5
        mock_time.time.return_value = fifth_time
        fifth_name = os.path.join(directory, "ckpt-5")
        second_manager.save()
        self.assertEqual([fourth_name, fifth_name], second_manager.checkpoints)
        state = checkpoint_management.get_checkpoint_state(directory)
        self.assertEqual(first_time, state.last_preserved_timestamp)
        del second_manager
        # A third manager with the same settings continues from that state.
        third_manager = checkpoint_management.CheckpointManager(
            checkpoint,
            directory,
            max_to_keep=2,
            keep_checkpoint_every_n_hours=1.5)
        self.assertEqual(fifth_name, third_manager.latest_checkpoint)
        # Sixth save, 10 seconds later: the fourth checkpoint's timestamp
        # becomes the last preserved one and its file stays on disk.
        mock_time.time.return_value += 10.
        third_manager.save()
        sixth_name = os.path.join(directory, "ckpt-6")
        state = checkpoint_management.get_checkpoint_state(directory)
        self.assertEqual(fourth_time, state.last_preserved_timestamp)
        self.assertTrue(checkpoint_management.checkpoint_exists(first_name))
        self.assertTrue(checkpoint_management.checkpoint_exists(fourth_name))
        self.assertTrue(checkpoint_management.checkpoint_exists(fifth_name))
        self.assertTrue(checkpoint_management.checkpoint_exists(sixth_name))
        self.assertFalse(checkpoint_management.checkpoint_exists(second_name))
        self.assertFalse(checkpoint_management.checkpoint_exists(third_name))
        self.assertEqual([fifth_name, sixth_name], third_manager.checkpoints)