Example #1
    def callableForTestModelCheckpointSavesOnChiefButNotOtherwise(
            model, test_obj, train_ds, num_epoch, steps, strategy,
            saving_filepath, **kwargs):

        extension = os.path.splitext(saving_filepath)[1]

        # Incorporate type/index information and thread id in saving_filepath to
        # ensure every worker has a unique path. Note that in normal use case the
        # saving_filepath will be the same for all workers, but we use different
        # ones here just to test out chief saves checkpoint but non-chief doesn't.

        saving_filepath = os.path.join(
            test_obj.get_temp_dir(), 'checkpoint_%s_%d%s' %
            (test_base.get_task_type(), test_base.get_task_index(), extension))

        # The saving_filepath shouldn't exist at the beginning (as it's unique).
        test_obj.assertFalse(training_state.checkpoint_exists(saving_filepath))

        model.fit(
            x=train_ds,
            epochs=num_epoch,
            steps_per_epoch=steps,
            callbacks=[callbacks.ModelCheckpoint(filepath=saving_filepath)])

        # If it's chief, the model should be saved; if not, the model shouldn't.
        test_obj.assertEqual(training_state.checkpoint_exists(saving_filepath),
                             test_base.is_chief())
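
Note on where test_base.get_task_type() / test_base.get_task_index() come from: in a real multi-worker job this information is carried by the TF_CONFIG environment variable. The following is a minimal, self-contained sketch (the helper name per_worker_checkpoint_path is made up for illustration, not part of the test harness) of how an equivalent per-worker path could be built from TF_CONFIG directly.

import json
import os


def per_worker_checkpoint_path(base_dir, extension='.ckpt'):
    # Hypothetical helper: read the task type/index from TF_CONFIG and
    # mirror the 'checkpoint_%s_%d%s' pattern used in the test above.
    tf_config = json.loads(os.environ.get('TF_CONFIG', '{}'))
    task = tf_config.get('task', {})
    return os.path.join(
        base_dir,
        'checkpoint_%s_%d%s' % (task.get('type', 'worker'),
                                task.get('index', 0), extension))
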
Example #2
    def callableForTestModelCheckpointSavesOnChiefButNotOtherwise(
            model, test_obj, train_ds, num_epoch, steps, strategy,
            saving_filepath, **kwargs):
        # Incorporate type/index information and thread id in saving_filepath to
        # ensure every worker has a unique path. Note that in normal use case the
        # saving_filepath will be the same for all workers, but we use different
        # ones here just to test out chief saves checkpoint but non-chief doesn't.

        # TODO(b/134551335): Must save to hdf5 until bug with copying
        # MirroredVariables is resolved.
        saving_filepath = os.path.join(
            test_obj.get_temp_dir(), 'checkpoint_%s_%d.h5' %
            (test_base.get_task_type(), test_base.get_task_index()))

        # The saving_filepath shouldn't exist at the beginning (as it's unique).
        test_obj.assertFalse(os.path.exists(saving_filepath))

        model.fit(
            x=train_ds,
            epochs=num_epoch,
            steps_per_epoch=steps,
            callbacks=[callbacks.ModelCheckpoint(filepath=saving_filepath)])

        # If it's chief, the model should be saved; if not, the model shouldn't.
        test_obj.assertEqual(os.path.exists(saving_filepath),
                             test_base.is_chief())
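
The test_base.is_chief() check used in the assertion encodes the usual multi-worker convention: the chief is the task whose type is 'chief', or worker 0 in clusters that have no dedicated chief task. A rough standalone sketch of that check (the helper name is illustrative, not the real test_base API):

import json
import os


def _is_chief_sketch():
    # Illustrative only; the real test harness derives the same answer from
    # its own cluster configuration. No TF_CONFIG means a single process,
    # which is treated as chief.
    task = json.loads(os.environ.get('TF_CONFIG', '{}')).get('task', {})
    task_type = task.get('type', 'chief')
    task_index = task.get('index', 0)
    return task_type == 'chief' or (task_type == 'worker' and task_index == 0)
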
Example #3
        def proc_tensorboard_saves_on_chief_but_not_otherwise(test_obj):
            model, _, train_ds, steps = _model_setup(test_obj, file_format='')
            num_epoch = 2

            # Incorporate type/index information and thread id in saving_filepath to
            # ensure every worker has a unique path. Note that in normal use case the
            # saving_filepath will be the same for all workers, but we use different
            # ones here just to test out chief saves summaries but non-chief doesn't.
            task_config = _get_task_config()
            saving_filepath = os.path.join(
                test_obj.get_temp_dir(),
                'logfile_%s_%d' % (task_config['type'], task_config['index']))

            # The saving_filepath shouldn't exist at the beginning (as it's unique).
            test_obj.assertFalse(file_io.file_exists_v2(saving_filepath))

            model.fit(
                x=train_ds,
                epochs=num_epoch,
                steps_per_epoch=steps,
                callbacks=[callbacks.TensorBoard(log_dir=saving_filepath)])

            # If it's chief, the summaries should be saved in the filepath; if not,
            # the directory should be empty (although created). Using
            # `file_io.list_directory()` since the directory may be created at this
            # point.
            test_obj.assertEqual(
                bool(file_io.list_directory_v2(saving_filepath)),
                test_base.is_chief())
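
The assertion above relies on summaries being written only by the chief worker. A related user-side pattern (a hedged sketch with illustrative names, not the mechanism Keras uses internally) is to keep the chief on the canonical directory and give every other worker a disposable sub-directory, so stray writes never collide with the chief's logs:

import os


def worker_log_dir(base_dir, task_type, task_index):
    # Chief keeps the canonical log directory; every other worker gets a
    # per-worker sub-directory that can be deleted once training finishes.
    if task_type == 'chief' or (task_type == 'worker' and task_index == 0):
        return base_dir
    return os.path.join(base_dir, 'workertemp_%d' % task_index)
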
Example #4
        def proc_model_checkpoint_saves_on_chief_but_not_otherwise(
                test_obj, file_format):

            model, saving_filepath, train_ds, steps = _model_setup(
                test_obj, file_format)
            num_epoch = 2
            extension = os.path.splitext(saving_filepath)[1]

            # Incorporate type/index information and thread id in saving_filepath to
            # ensure every worker has a unique path. Note that in normal use case the
            # saving_filepath will be the same for all workers, but we use different
            # ones here just to test out chief saves checkpoint but non-chief doesn't.
            saving_filepath = os.path.join(
                test_obj.get_temp_dir(),
                'checkpoint_%s_%d%s' % (test_base.get_task_type(),
                                        test_base.get_task_index(), extension))

            # The saving_filepath shouldn't exist at the beginning (as it's unique).
            test_obj.assertFalse(
                training_state.checkpoint_exists(saving_filepath))

            model.fit(x=train_ds,
                      epochs=num_epoch,
                      steps_per_epoch=steps,
                      validation_data=train_ds,
                      validation_steps=steps,
                      callbacks=[
                          callbacks.ModelCheckpoint(
                              filepath=saving_filepath,
                              save_weights_only=save_weights_only)
                      ])

            # If it's chief, the model should be saved; if not, the model shouldn't.
            test_obj.assertEqual(
                training_state.checkpoint_exists(saving_filepath),
                test_base.is_chief())

            # If it's chief, the model should be saved (`write_filepath` should
            # simply return `saving_filepath`); if not, i.e. for non-chief workers,
            # the temporary path generated by `write_filepath` should no longer
            # contain a checkpoint, because it is deleted once saving completes.
            test_obj.assertEqual(
                training_state.checkpoint_exists(
                    distributed_file_utils.write_filepath(
                        saving_filepath, model._distribution_strategy)),
                test_base.is_chief())
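
The final assertion depends on the contract of distributed_file_utils.write_filepath that the comment describes: the chief gets the requested path back unchanged, while a non-chief worker gets a path inside a temporary per-worker directory that is cleaned up after saving. A rough behavioral sketch of that contract (not the real implementation):

import os


def write_filepath_sketch(filepath, task_type, task_index):
    # Chief: write to the requested location.
    if task_type == 'chief' or (task_type == 'worker' and task_index == 0):
        return filepath
    # Non-chief: write into a temporary sibling directory, which is removed
    # (together with the checkpoint in it) once saving is done, so the
    # existence check above evaluates to False for non-chief workers.
    dirpath, base = os.path.split(filepath)
    return os.path.join(dirpath, 'workertemp_%d' % task_index, base)
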
Example #5
 def mocked_mkstemp():
     # Only non-chief should call tempfile.mkstemp() inside fit() in sync
     # training.
     assert not test_base.is_chief()
     file_handle, temp_file_name = real_mkstemp()
     extension = os.path.splitext(saving_filepath)[1]
     temp_filepath = temp_file_name + extension
     filepaths.append(temp_filepath)
     return file_handle, temp_file_name
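
The function above depends on real_mkstemp, filepaths and saving_filepath from its enclosing test. Below is a sketch of how such a mock is typically wired up with the standard library; it reuses model, train_ds, num_epoch, steps, callbacks and saving_filepath from the examples above, and the exact patching done by the test harness may differ.

import tempfile
from unittest import mock

real_mkstemp = tempfile.mkstemp   # captured before patching, as the mock expects
filepaths = []                    # collects the temp checkpoint paths created in fit()

# While the patch is active, every tempfile.mkstemp() call made inside fit()
# (i.e. non-chief checkpoint saving) goes through mocked_mkstemp.
with mock.patch.object(tempfile, 'mkstemp', mocked_mkstemp):
    model.fit(x=train_ds,
              epochs=num_epoch,
              steps_per_epoch=steps,
              callbacks=[callbacks.ModelCheckpoint(filepath=saving_filepath)])
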
Example #7
  def callableForTestModelCheckpointSavesOnChiefButNotOtherwise(
      model, test_obj, train_ds, num_epoch, steps, strategy, saving_filepath):
    # Incorporate type/index information and thread id in saving_filepath to
    # ensure every worker has a unique path. Note that in normal use case the
    # saving_filepath will be the same for all workers, but we use different
    # ones here just to test out chief saves checkpoint but non-chief doesn't.
    saving_filepath = os.path.join(
        test_obj.get_temp_dir(), 'checkpoint_%s_%d' %
        (test_base.get_task_type(), test_base.get_task_index()))

    # The saving_filepath shouldn't exist at the beginning (as it's unique).
    test_obj.assertFalse(os.path.exists(saving_filepath))

    model.fit(
        x=train_ds,
        epochs=num_epoch,
        steps_per_epoch=steps,
        callbacks=[callbacks.ModelCheckpoint(filepath=saving_filepath)])

    # If it's chief, the model should be saved; if not, the model shouldn't.
    test_obj.assertEqual(os.path.exists(saving_filepath), test_base.is_chief())
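
As a usage note (not part of the test): once fit() returns, only the chief has produced a checkpoint, so restoring should target the chief's path. A hedged sketch reusing saving_filepath and test_base from the example above:

import tensorflow as tf

if test_base.is_chief():
    # Only the chief wrote a checkpoint, so only its path can be restored.
    restored_model = tf.keras.models.load_model(saving_filepath)
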
Example #8
        def _independent_worker_fn(*args, **kwargs):  # pylint: disable=unused-argument
            with test.mock.patch.object(dc, '_run_std_server',
                                        self._make_mock_run_std_server()):
                # Condition variable that blocks the thread that represents the
                # restarted chief.
                cv = kwargs.get('cv', None)
                # `before_restart` is True for the threads that represent the original
                # chief and non-chief worker, and False for threads that represent the
                # restarted chief and non-chief workers.
                before_restart = kwargs['before_restart']
                if kwargs['new_chief']:
                    # `new_chief` is only True for the restarted chief thread. It waits
                    # until non-chief is preempted and restarted to simulate the causality
                    # where chief's restart results from non-chief's failure.
                    cv.acquire()
                    while not hasattr(cv, 'preempted'):
                        cv.wait()
                    cv.release()

                # Model building under strategy scope. The following is the code we
                # expect the user to run on every worker.
                strategy = get_strategy_object(strategy_cls)
                batch_size = 64
                steps = 3
                train_ds, _ = _mnist_synthetic_dataset(batch_size, steps)
                with strategy.scope():
                    model = _get_model((28, 28, 1))

                # Function to start a new thread. This will be called twice in the
                # following code: one represents the restart of the non-chief, and one
                # represents the restart of the chief as a result of the restart of the
                # non-chief (so the training can continue in sync).
                def start_new_thread(new_chief=False):
                    new_thread_tf_config = json.loads(os.environ['TF_CONFIG'])
                    new_thread_tf_config['cluster']['worker'] = kwargs[
                        'reserved_ports']
                    return self._run_task_in_thread(
                        task_fn=_independent_worker_fn,
                        cluster_spec=None,
                        task_type=None,
                        task_id=None,
                        tf_config=new_thread_tf_config,
                        before_restart=False,
                        cv=cv,
                        new_chief=new_chief)

                if test_base.is_chief() and before_restart:
                    # Chief to start a new thread (that will be blocked by a condition
                    # variable until the non-chief's new thread is started). The thread
                    # for (recovered) chief is started before entering `fit()` because
                    # the original chief thread will eventually hang and be ignored.
                    start_new_thread(new_chief=True)

                try:

                    class CkptSavedEpochAssertingCallback(callbacks.Callback):
                        def __init__(self, test_obj):
                            super(CkptSavedEpochAssertingCallback,
                                  self).__init__()
                            self.test_obj = test_obj

                        def on_epoch_begin(self, epoch, logs=None):
                            # `_ckpt_saved_epoch` attribute is set at the end of every epoch.
                            self.test_obj.assertEqual(
                                self.model._ckpt_saved_epoch is None,
                                epoch == 0)

                    callbacks_list = [
                        callbacks.ModelCheckpoint(
                            filepath=saving_filepath,
                            save_weights_only=True,
                            load_weights_on_restart=True),
                        CkptSavedEpochAssertingCallback(self)
                    ]
                    if before_restart:
                        callbacks_list.append(preemption_callback())

                    self.assertIsNone(model._ckpt_saved_epoch)
                    history = model.fit(x=train_ds,
                                        epochs=num_epoch,
                                        steps_per_epoch=steps,
                                        callbacks=callbacks_list)
                    self.assertIsNone(model._ckpt_saved_epoch)

                    # The `history` of each training run is collected so the runs can
                    # be compared against each other. The training results (loss and
                    # accuracy) are expected to be the same with or without preemption.
                    self._histories.append(history.history)

                except RuntimeError:
                    # pylint: disable=g-assert-in-except
                    self.assertTrue(before_restart)
                    # Reset the barrier so the new threads simulating recovery can
                    # continue.
                    self._barrier._counter = 0
                    self._barrier._flag = False

                    # Now that the non-chief has been preempted, it notifies the thread
                    # that simulates the restarted chief to start so they can be back in
                    # sync.
                    cv.acquire()
                    cv.preempted = True
                    cv.notify()
                    cv.release()

                    # At this point we should discard the original non-chief thread and
                    # start the new thread that simulates the restarted non-chief, hence
                    # we join that thread and return.
                    self.join_independent_workers([start_new_thread()])
                    return

                # Successful end of a `fit()` call.
                self._successful_thread_ends += 1
                self.assertFalse(before_restart)
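
Stripped of the cluster plumbing, the chief-restart handshake between the `new_chief` branch and the `except RuntimeError` branch is a plain condition-variable pattern: one thread waits until another marks the condition as preempted. A self-contained sketch with illustrative names:

import threading

cv = threading.Condition()


def restarted_chief():
    # Waiting side, as in the `new_chief` branch above: block until some
    # other thread has flagged the preemption.
    with cv:
        while not getattr(cv, 'preempted', False):
            cv.wait()


def preempted_worker():
    # Notifying side, as in the `except RuntimeError` branch above.
    with cv:
        cv.preempted = True
        cv.notify()


t = threading.Thread(target=restarted_chief)
t.start()
preempted_worker()
t.join()
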
Example #9
 def on_epoch_begin(self, epoch, logs=None):
     if epoch == 1 and not test_base.is_chief():
         # Simulate preemption at the start of the second epoch.
         raise RuntimeError('Preemption!')
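
In the test this method lives on a small callbacks.Callback subclass that is appended to the callback list only before the simulated restart (see the `before_restart` branch in Example #8). A self-contained sketch of such a callback; the class name is illustrative, and test_base.is_chief() is the same helper used throughout these examples:

import tensorflow as tf


class PreemptionSimulatingCallback(tf.keras.callbacks.Callback):

    def on_epoch_begin(self, epoch, logs=None):
        # Simulate a non-chief worker being preempted at the start of the
        # second epoch, which makes fit() raise RuntimeError and exercises
        # the recovery path shown in Example #8.
        if epoch == 1 and not test_base.is_chief():
            raise RuntimeError('Preemption!')
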