def test_fix_lost_trials_race_condition(self, monkeypatch, caplog):
    """Check that a lost trial already fixed by a concurrent process is harmless.

    The stored trial is ``'interrupted'``, but the storage class is patched so
    that ``fetch_lost_trials`` reports it as a lost ``'reserved'`` trial. The
    test then asserts that ``fix_lost_trials`` ends with a ``'failed'`` DEBUG
    record and that the stored statuses are unchanged.
    """
    stale_heartbeat = datetime.datetime.utcnow() - datetime.timedelta(seconds=360)
    stored_trial = copy.deepcopy(base_trial)
    stored_trial['status'] = 'interrupted'
    stored_trial['heartbeat'] = stale_heartbeat

    def report_trial_as_reserved(self, query):
        # Rebuild the stored trial but present it as still reserved.
        fake_lost = Trial(**stored_trial)
        fake_lost.status = 'reserved'
        return [fake_lost]

    with OrionState(trials=[stored_trial]) as state:
        experiment = Experiment('supernaekei')
        experiment._id = state.trials[0]['experiment']

        interrupted_before = experiment.fetch_trials_by_status('interrupted')
        assert len(interrupted_before) == 1

        # Nothing is lost as far as the real storage is concerned.
        lost_before = experiment._storage.fetch_lost_trials(experiment)
        assert len(lost_before) == 0

        # Force the fetch of a trial marked as reserved (and lost) while actually
        # interrupted (as if already failed-over by another process).
        with monkeypatch.context() as patcher:
            patcher.setattr(
                experiment._storage.__class__,
                'fetch_lost_trials',
                report_trial_as_reserved,
            )

            lost_when_patched = experiment._storage.fetch_lost_trials(experiment)
            assert len(lost_when_patched) == 1

            with caplog.at_level(logging.DEBUG):
                experiment.fix_lost_trials()

        last_record = caplog.records[-1]
        assert last_record.levelname == 'DEBUG'
        assert last_record.msg == 'failed'

        assert len(experiment.fetch_trials_by_status('interrupted')) == 1
        assert len(experiment.fetch_trials_by_status('reserved')) == 0
def test_fix_lost_trials(self):
    """Test that a reserved trial with an old heartbeat is set to interrupted.

    A single trial is stored as ``'reserved'`` with a heartbeat 360 seconds in
    the past. After ``fix_lost_trials`` the trial must no longer be reserved
    and must be found among the interrupted trials.
    """
    trial = copy.deepcopy(base_trial)
    trial['status'] = 'reserved'
    trial['heartbeat'] = datetime.datetime.utcnow() - datetime.timedelta(seconds=360)

    with OrionState(trials=[trial]) as cfg:
        exp = Experiment('supernaekei')
        exp._id = cfg.trials[0]['experiment']

        assert len(exp.fetch_trials_by_status('reserved')) == 1

        exp.fix_lost_trials()

        assert len(exp.fetch_trials_by_status('reserved')) == 0
        # Leaving 'reserved' is not enough: the docstring promises the trial
        # ends up interrupted, so verify the resulting status explicitly.
        assert len(exp.fetch_trials_by_status('interrupted')) == 1
def test_fix_lost_trials(self):
    """Test that a reserved trial with an old heartbeat is set to interrupted.

    A single trial is stored as ``"reserved"`` with a heartbeat ten minutes in
    the past. After ``fix_lost_trials`` the trial must no longer be reserved
    and must be found among the interrupted trials.
    """
    trial = copy.deepcopy(base_trial)
    trial["status"] = "reserved"
    trial["heartbeat"] = datetime.datetime.utcnow() - datetime.timedelta(
        seconds=60 * 10
    )

    with OrionState(trials=[trial]) as cfg:
        exp = Experiment("supernaekei", mode="x")
        exp._id = cfg.trials[0]["experiment"]

        assert len(exp.fetch_trials_by_status("reserved")) == 1

        exp.fix_lost_trials()

        assert len(exp.fetch_trials_by_status("reserved")) == 0
        # Leaving "reserved" is not enough: the docstring promises the trial
        # ends up interrupted, so verify the resulting status explicitly.
        assert len(exp.fetch_trials_by_status("interrupted")) == 1
def test_fix_lost_trials_configurable_hb(self):
    """Test that the configured worker heartbeat decides when a trial is lost.

    A reserved trial has a heartbeat 180 seconds old. With the worker heartbeat
    set to 360 the trial must stay reserved; with it set to 180 the trial must
    no longer be reserved after ``fix_lost_trials``.
    """
    trial = copy.deepcopy(base_trial)
    trial['status'] = 'reserved'
    trial['heartbeat'] = datetime.datetime.utcnow() - datetime.timedelta(seconds=180)

    with OrionState(trials=[trial]) as cfg:
        exp = Experiment('supernaekei')
        exp._id = cfg.trials[0]['experiment']

        assert len(exp.fetch_trials_by_status('reserved')) == 1

        # The heartbeat lives in process-wide configuration; remember it so the
        # values set below do not leak into tests that run afterwards.
        original_heartbeat = orion.core.config.worker.heartbeat
        try:
            orion.core.config.worker.heartbeat = 360
            exp.fix_lost_trials()
            assert len(exp.fetch_trials_by_status('reserved')) == 1

            orion.core.config.worker.heartbeat = 180
            exp.fix_lost_trials()
            assert len(exp.fetch_trials_by_status('reserved')) == 0
        finally:
            # Restore even when an assertion above fails.
            orion.core.config.worker.heartbeat = original_heartbeat
def test_fix_lost_trials_configurable_hb(self):
    """Test that the configured worker heartbeat decides when a trial is lost.

    A reserved trial has a heartbeat two minutes old. With the worker heartbeat
    set to ``60 * 2`` the trial must stay reserved; with it set to a tenth of
    that the trial must no longer be reserved after ``fix_lost_trials``.
    """
    trial = copy.deepcopy(base_trial)
    trial["status"] = "reserved"
    trial["heartbeat"] = datetime.datetime.utcnow() - datetime.timedelta(
        seconds=60 * 2
    )

    with OrionState(trials=[trial]) as cfg:
        exp = Experiment("supernaekei", mode="x")
        exp._id = cfg.trials[0]["experiment"]

        assert len(exp.fetch_trials_by_status("reserved")) == 1

        # The heartbeat lives in process-wide configuration; remember it so the
        # values set below do not leak into tests that run afterwards.
        original_heartbeat = orion.core.config.worker.heartbeat
        try:
            orion.core.config.worker.heartbeat = 60 * 2
            exp.fix_lost_trials()
            assert len(exp.fetch_trials_by_status("reserved")) == 1

            orion.core.config.worker.heartbeat = 60 * 2 / 10.0
            exp.fix_lost_trials()
            assert len(exp.fetch_trials_by_status("reserved")) == 0
        finally:
            # Restore even when an assertion above fails.
            orion.core.config.worker.heartbeat = original_heartbeat
def test_fix_only_lost_trials(self):
    """Check that only the trial with the old heartbeat gets interrupted.

    Two reserved trials are stored: one with a heartbeat ten minutes old and
    one with a current heartbeat. After ``fix_lost_trials`` the test asserts
    that the recent one is still reserved and the old one is interrupted,
    matching each by its params.
    """
    stale, fresh = generate_trials(["reserved", "reserved"])
    ten_minutes = datetime.timedelta(seconds=60 * 10)
    stale["heartbeat"] = datetime.datetime.utcnow() - ten_minutes
    fresh["heartbeat"] = datetime.datetime.utcnow()

    with OrionState(trials=[stale, fresh]) as state:
        experiment = Experiment("supernaekei", mode="x")
        experiment._id = state.trials[0]["experiment"]

        reserved_before = experiment.fetch_trials_by_status("reserved")
        assert len(reserved_before) == 2

        experiment.fix_lost_trials()

        # The recent trial is the only one left reserved.
        still_reserved = experiment.fetch_trials_by_status("reserved")
        assert len(still_reserved) == 1
        kept_params = still_reserved[0].to_dict()["params"]
        assert kept_params == fresh["params"]

        # The stale trial is the only one interrupted.
        interrupted = experiment.fetch_trials_by_status("interrupted")
        assert len(interrupted) == 1
        interrupted_params = interrupted[0].to_dict()["params"]
        assert interrupted_params == stale["params"]
def test_fix_only_lost_trials(self):
    """Check that only the trial with the old heartbeat gets interrupted.

    Two reserved trials are stored: one with a heartbeat 360 seconds old and
    one with a current heartbeat. After ``fix_lost_trials`` the test asserts
    that the recent one is still reserved and the old one is interrupted,
    matching each by its params.
    """
    stale, fresh = generate_trials(['reserved', 'reserved'])
    staleness = datetime.timedelta(seconds=360)
    stale['heartbeat'] = datetime.datetime.utcnow() - staleness
    fresh['heartbeat'] = datetime.datetime.utcnow()

    with OrionState(trials=[stale, fresh]) as state:
        experiment = Experiment('supernaekei')
        experiment._id = state.trials[0]['experiment']

        reserved_before = experiment.fetch_trials_by_status('reserved')
        assert len(reserved_before) == 2

        experiment.fix_lost_trials()

        # The recent trial is the only one left reserved.
        still_reserved = experiment.fetch_trials_by_status('reserved')
        assert len(still_reserved) == 1
        kept_params = still_reserved[0].to_dict()['params']
        assert kept_params == fresh['params']

        # The stale trial is the only one interrupted.
        interrupted = experiment.fetch_trials_by_status('interrupted')
        assert len(interrupted) == 1
        interrupted_params = interrupted[0].to_dict()['params']
        assert interrupted_params == stale['params']