def test_preempt_workers_ask_master(dummy: bool, auto_ack: bool) -> None:
    """
    Exercise should_preempt() in WorkersAskMaster mode on every rank.

    The per-rank distributed contexts are collected first and then each one is tested serially
    in this process, so the test only passes if neither chief nor worker relies on distributed
    communication in this mode.

    Arguments:
        dummy: when True, test core.DummyPreemptContext, which is only checked to report False;
            when False, test the context built by make_test_preempt_context() and drive a
            preemption signal through it.
        auto_ack: forwarded to should_preempt(); decides whether a post on the mock session
            (the acknowledgement, per the comments below) is expected.
    """
    with parallel.Execution(2) as pex:
        # Steal the automatically-created pex.distributed contexts, then test chief/worker serially
        # so we know they're not using distributed comms.
        @pex.run
        def distributed_contexts() -> core.DistributedContext:
            return pex.distributed

        # Test steps are identical for chief and worker.
        for dist in distributed_contexts:
            if not dummy:
                state, context = make_test_preempt_context(dist, core.PreemptMode.WorkersAskMaster)
            else:
                context = core.DummyPreemptContext(dist, core.PreemptMode.WorkersAskMaster)

            with context:
                assert context.should_preempt() is False
                if not dummy:
                    # No ack preemption calls yet.
                    state.mock_session.post.assert_not_called()

                    # Send the preemption signal.
                    state.preempt()
                    wait_on_watcher(context)
                    assert context.should_preempt(auto_ack=auto_ack) is True

                    # Call again, to make sure we only ack once.  Without the first call above,
                    # assert_called_once() could not detect a duplicate acknowledgement.
                    assert context.should_preempt(auto_ack=auto_ack) is True

                    if auto_ack:
                        state.mock_session.post.assert_called_once()
                    else:
                        state.mock_session.post.assert_not_called()
def _dummy_init(
    *,
    distributed: Optional[core.DistributedContext] = None,
    # TODO(DET-6153): allow a Union[StorageManager, str] here.
    storage_manager: Optional[storage.StorageManager] = None,
    preempt_mode: core.PreemptMode = core.PreemptMode.WorkersAskChief,
) -> Context:
    """
    Build a core.Context suitable for running off-cluster.

    This is normally called by init() when it is detected that there is no ClusterInfo available,
    but can be invoked directly for e.g. local test mode.

    Arguments:
        distributed: distributed context to build on; a core.DummyDistributedContext is created
            when omitted.
        storage_manager: where checkpoints are stored; when omitted, a SharedFSStorageManager
            rooted at the per-user "determined" data directory is used.
        preempt_mode: mode handed to the dummy preemption context.

    Returns:
        A Context whose checkpoint, preempt, train and searcher members are all dummy
        implementations.
    """
    distributed = distributed or core.DummyDistributedContext()
    preempt = core.DummyPreemptContext(distributed, preempt_mode)

    if storage_manager is None:
        base_path = appdirs.user_data_dir("determined")
        # The message must be an f-string, otherwise the literal text "{base_path}" is logged
        # instead of the directory that was chosen.
        logger.info(f"no storage_manager provided; storing checkpoints in {base_path}")
        storage_manager = storage.SharedFSStorageManager(base_path)
    checkpoint = core.DummyCheckpointContext(distributed, storage_manager)

    train = core.DummyTrainContext()
    searcher = core.DummySearcherContext(distributed)

    _install_stacktrace_on_sigusr1()

    return Context(
        distributed=distributed,
        checkpoint=checkpoint,
        preempt=preempt,
        train=train,
        searcher=searcher,
    )
def test_preempt_chief_only(dummy: bool, auto_ack: bool) -> None:
    """
    Exercise should_preempt() in ChiefOnly mode.

    The chief's context must report preemption (acknowledging at most once, and only when
    auto_ack is set), while calling should_preempt() on the worker's context must raise.
    Both ranks are tested serially in this process rather than inside the parallel execution.
    """
    mode = core.PreemptMode.ChiefOnly

    with parallel.Execution(2) as pex:
        # Collect the per-rank pex.distributed contexts up front; running the chief and worker
        # checks one after the other shows that no distributed comms are involved.
        @pex.run
        def distributed_contexts() -> core.DistributedContext:
            return pex.distributed

        chief_dist = distributed_contexts[0]
        worker_dist = distributed_contexts[1]

        # --- Chief ---
        if dummy:
            context = core.DummyPreemptContext(chief_dist, mode)
        else:
            state, context = make_test_preempt_context(chief_dist, mode)

        with context:
            assert context.should_preempt() is False
            if not dummy:
                # Nothing has been acknowledged before the signal arrives.
                state.mock_session.post.assert_not_called()

                # Deliver the preemption signal and wait for the watcher to pick it up.
                state.preempt()
                wait_on_watcher(context)
                assert context.should_preempt(auto_ack=auto_ack) is True

                # A second call must not produce a second acknowledgement.
                assert context.should_preempt(auto_ack=auto_ack) is True

                post = state.mock_session.post
                if auto_ack:
                    post.assert_called_once()
                else:
                    post.assert_not_called()

        # --- Worker ---
        if dummy:
            context = core.DummyPreemptContext(worker_dist, mode)
        else:
            state, context = make_test_preempt_context(worker_dist, mode)

        # The context is entered first and pytest.raises second, matching nested `with` blocks.
        with context, pytest.raises(RuntimeError, match="should_preempt.*called from non-chief"):
            context.should_preempt()
def __init__(
    self,
    checkpoint: core.CheckpointContext,
    distributed: Optional[core.DistributedContext] = None,
    preempt: Optional[core.PreemptContext] = None,
    train: Optional[core.TrainContext] = None,
    searcher: Optional[core.SearcherContext] = None,
) -> None:
    """
    Store the given sub-contexts, substituting a dummy for each optional one that is missing.

    Only ``checkpoint`` is required.  Any other argument that is falsy (normally ``None``) is
    replaced by the corresponding ``core.Dummy*Context``.
    """
    self.checkpoint = checkpoint

    # Settle the distributed context first: the preempt and searcher fallbacks are built from it.
    if not distributed:
        distributed = core.DummyDistributedContext()
    self.distributed = distributed

    if not preempt:
        preempt = core.DummyPreemptContext(self.distributed)
    self.preempt = preempt

    if not train:
        train = core.DummyTrainContext()
    self.train = train

    if not searcher:
        searcher = core.DummySearcherContext(self.distributed)
    self.searcher = searcher
def do_test() -> None:
    """
    Check that should_preempt() raises before the context is entered and works afterwards.

    Reads ``dummy`` and ``pex`` from the enclosing scope.
    """
    mode = core.PreemptMode.WorkersAskChief
    if dummy:
        context = core.DummyPreemptContext(pex.distributed, mode)
    else:
        # The mock state returned alongside the context is not inspected by this check.
        _, context = make_test_preempt_context(pex.distributed, mode)

    # The context has not been entered yet, so the call is expected to be rejected.
    expected_error = pytest.raises(
        RuntimeError, match="cannot call.*should_preempt.*before.*start"
    )
    with expected_error:
        context.should_preempt()

    # Once inside the `with` block the same call succeeds and reports no preemption.
    with context:
        assert context.should_preempt() is False
def do_test() -> None:
    """
    Exercise should_preempt() in WorkersAskChief mode from within each rank.

    Rank 0 acts as the chief: it checks preemption, sends one manual broadcast, and (for a
    non-dummy context) triggers the preemption signal and verifies the acknowledgement count.
    Every other rank acts as a worker and verifies what it receives from the chief.  Reads
    ``dummy``, ``auto_ack`` and ``pex`` from the enclosing scope.
    """
    mode = core.PreemptMode.WorkersAskChief
    if dummy:
        context = core.DummyPreemptContext(pex.distributed, mode)
    else:
        state, context = make_test_preempt_context(pex.distributed, mode)

    with context:
        if pex.rank != 0:
            # Worker: intercept the chief's first broadcast directly to prove it is being sent.
            intercepted = pex.distributed.broadcast(None)
            assert intercepted is False, intercepted

            # Now receive from the chief through the preempt context itself.
            assert context.should_preempt() is False

            if not dummy:
                # The chief should send a True now.
                assert context.should_preempt() is True
                # Only the chief acknowledges the preemption signal.
                state.mock_session.post.assert_not_called()
            return

        # Chief: check preemption.
        assert context.should_preempt() is False
        # Make sure the worker is receiving broadcasts.
        pex.distributed.broadcast(False)

        if dummy:
            return

        # Nothing has been acknowledged before the signal arrives.
        state.mock_session.post.assert_not_called()

        # Deliver the preemption signal and wait for the watcher to pick it up.
        state.preempt()
        wait_on_watcher(context)
        assert context.should_preempt(auto_ack=auto_ack) is True

        # A second call must not produce a second acknowledgement.
        assert context.should_preempt(auto_ack=auto_ack) is True

        post = state.mock_session.post
        if auto_ack:
            post.assert_called_once()
        else:
            post.assert_not_called()