def test_monitor_failing_ensemble(make_ee_config, unused_tcp_port):
    """Track an ensemble with one registered failing job via a narrative proxy.

    The ensemble is started on the first snapshot event, and the monitor
    signals done once an event carries the stopped ensemble status.
    """
    ensemble = TestEnsemble(iter=1, reals=2, steps=2, jobs=2)
    ensemble.addFailJob(real=1, step=0, job=1)
    config = make_ee_config(use_token=False, generate_cert=False)
    evaluator = EnsembleEvaluator(ensemble, config, 0, ee_id="ee-0")

    # The evaluator context is entered and left again before monitoring.
    with evaluator.run():
        pass

    narrative = monitor_failing_ensemble.on_uri(
        f"ws://localhost:{unused_tcp_port}"
    )
    # The monitor connects to the port handed out by the proxy.
    with NarrativeProxy(narrative).proxy(config.url) as port, ee_monitor.create(
        "localhost", port, "ws", None, None
    ) as monitor:
        for event in monitor.track():
            if event["type"] == identifiers.EVTYPE_EE_SNAPSHOT:
                ensemble.start()
            payload = event.data
            if payload and payload.get(identifiers.STATUS) == ENSEMBLE_STATE_STOPPED:
                monitor.signal_done()

    ensemble.join()
def test_monitor_failing_evaluation(make_ee_config):
    """Track an ensemble configured with ``with_failure()`` via a narrative proxy.

    The ensemble is started on the first snapshot event, and the monitor
    signals done once an event carries the failed ensemble status.
    """
    config = make_ee_config(use_token=False, generate_cert=False)
    ensemble = TestEnsemble(iter=1, reals=1, steps=1, jobs=1)
    ensemble.with_failure()
    evaluator = EnsembleEvaluator(ensemble, config, 0, ee_id="ee-0")
    evaluator.run()

    narrative = monitor_failing_evaluation().on_uri(
        f"ws://localhost:{config.port}"
    )
    with NarrativeProxy(narrative).proxy(config.url), ee_monitor.create(
        config.get_connection_info()
    ) as monitor:
        for event in monitor.track():
            if event["type"] == identifiers.EVTYPE_EE_SNAPSHOT:
                ensemble.start()
            payload = event.data
            if payload and payload.get(identifiers.STATUS) == ENSEMBLE_STATE_FAILED:
                monitor.signal_done()

    ensemble.join()
def test_monitor_successful_ensemble(make_ee_config):
    """Track an ensemble through the ``monitor_successful_ensemble`` narrative.

    The ensemble is started on the first snapshot event, and the monitor
    signals done once an event carries the stopped ensemble status.
    """
    ensemble = TestEnsemble(iter=1, reals=2, steps=2, jobs=2)
    ensemble.addFailJob(real=1, step=0, job=1)
    config = make_ee_config(use_token=False, generate_cert=False)
    evaluator = EnsembleEvaluator(ensemble, config, 0, ee_id="ee-0")
    evaluator.run()

    narrative = monitor_successful_ensemble()
    with NarrativeProxy(narrative).proxy(config.url), ee_monitor.create(
        config.get_connection_info()
    ) as monitor:
        for event in monitor.track():
            if event["type"] == identifiers.EVTYPE_EE_SNAPSHOT:
                ensemble.start()
            payload = event.data
            if payload and payload.get(identifiers.STATUS) == ENSEMBLE_STATE_STOPPED:
                monitor.signal_done()

    ensemble.join()
def test_dispatchers_can_connect_and_monitor_can_shut_down_evaluator(evaluator):
    """Check that dispatcher events reach monitors and a monitor can cancel.

    Two dispatch clients report job state changes; after each report the next
    event seen by the first monitor is asserted to reflect it.  A second
    monitor then connects, one monitor signals cancel, and both monitors must
    receive a terminated event followed by no further events.
    """
    with evaluator.run() as monitor:
        events = monitor.track()

        host = evaluator._config.host
        port = evaluator._config.port

        # first snapshot before any event occurs
        snapshot_event = next(events)
        snapshot = Snapshot(snapshot_event.data)
        assert snapshot.get_status() == ENSEMBLE_STATE_STARTED

        # two dispatchers connect
        with Client(host, port, "/dispatch") as dispatch1, Client(
            host, port, "/dispatch"
        ) as dispatch2:
            # first dispatcher informs that job 0 is running
            send_dispatch_event(
                dispatch1,
                identifiers.EVTYPE_FM_JOB_RUNNING,
                "/ert/ee/0/real/0/step/0/job/0",
                "event1",
                {"current_memory_usage": 1000},
            )
            snapshot = Snapshot(next(events).data)
            assert snapshot.get_job("0", "0", "0").status == JOB_STATE_RUNNING

            # second dispatcher informs that job 0 is running
            send_dispatch_event(
                dispatch2,
                identifiers.EVTYPE_FM_JOB_RUNNING,
                "/ert/ee/0/real/1/step/0/job/0",
                "event1",
                {"current_memory_usage": 1000},
            )
            snapshot = Snapshot(next(events).data)
            assert snapshot.get_job("1", "0", "0").status == JOB_STATE_RUNNING

            # second dispatcher informs that job 0 is done
            send_dispatch_event(
                dispatch2,
                identifiers.EVTYPE_FM_JOB_SUCCESS,
                "/ert/ee/0/real/1/step/0/job/0",
                "event1",
                {"current_memory_usage": 1000},
            )
            snapshot = Snapshot(next(events).data)
            assert snapshot.get_job("1", "0", "0").status == JOB_STATE_FINISHED

            # second dispatcher informs that job 1 is failed
            send_dispatch_event(
                dispatch2,
                identifiers.EVTYPE_FM_JOB_FAILURE,
                "/ert/ee/0/real/1/step/0/job/1",
                "event_job_1_fail",
                {identifiers.ERROR_MSG: "error"},
            )
            snapshot = Snapshot(next(events).data)
            assert snapshot.get_job("1", "0", "1").status == JOB_STATE_FAILURE

            # a second monitor connects
            with ee_monitor.create(host, port) as monitor2:
                events2 = monitor2.track()
                snapshot = Snapshot(next(events2).data)
                assert snapshot.get_status() == ENSEMBLE_STATE_STARTED
                assert snapshot.get_job("0", "0", "0").status == JOB_STATE_RUNNING
                assert snapshot.get_job("1", "0", "0").status == JOB_STATE_FINISHED

                # one monitor requests that server exit
                monitor.signal_cancel()

                # both monitors should get a terminated event
                terminated = next(events)
                terminated2 = next(events2)
                assert terminated["type"] == identifiers.EVTYPE_EE_TERMINATED
                assert terminated2["type"] == identifiers.EVTYPE_EE_TERMINATED

                # Any further event on either stream is a failure.  Raise
                # explicitly: `assert False` is removed when run with -O.
                for stream in (events, events2):
                    for _ in stream:
                        raise AssertionError("got unexpected event from monitor")
def test_dispatchers_can_connect_and_monitor_can_shut_down_evaluator(evaluator):
    """Check that dispatcher events reach monitors and a monitor can cancel.

    Two dispatch clients send four job events back to back; the next event
    read by the first monitor is asserted to reflect the resulting job
    statuses.  A second monitor then connects and must first receive a
    snapshot event.  One monitor signals cancel, and both monitors must
    receive a terminated event followed by no further events.
    """
    with evaluator.run() as monitor:
        events = monitor.track()

        token = evaluator._config.token
        cert = evaluator._config.cert
        url = evaluator._config.url

        # first snapshot before any event occurs
        snapshot_event = next(events)
        print(snapshot_event)
        snapshot = Snapshot(snapshot_event.data)
        assert snapshot.status == ENSEMBLE_STATE_UNKNOWN

        # two dispatchers connect
        with Client(
            url + "/dispatch",
            cert=cert,
            token=token,
            max_retries=1,
            timeout_multiplier=1,
        ) as dispatch1, Client(
            url + "/dispatch",
            cert=cert,
            token=token,
            max_retries=1,
            timeout_multiplier=1,
        ) as dispatch2:
            # first dispatcher informs that job 0 is running
            send_dispatch_event(
                dispatch1,
                identifiers.EVTYPE_FM_JOB_RUNNING,
                f"/ert/ee/{evaluator._ee_id}/real/0/step/0/job/0",
                "event1",
                {"current_memory_usage": 1000},
            )

            # second dispatcher informs that job 0 is running
            send_dispatch_event(
                dispatch2,
                identifiers.EVTYPE_FM_JOB_RUNNING,
                f"/ert/ee/{evaluator._ee_id}/real/1/step/0/job/0",
                "event1",
                {"current_memory_usage": 1000},
            )

            # second dispatcher informs that job 0 is done
            send_dispatch_event(
                dispatch2,
                identifiers.EVTYPE_FM_JOB_SUCCESS,
                f"/ert/ee/{evaluator._ee_id}/real/1/step/0/job/0",
                "event1",
                {"current_memory_usage": 1000},
            )

            # second dispatcher informs that job 1 is failed
            send_dispatch_event(
                dispatch2,
                identifiers.EVTYPE_FM_JOB_FAILURE,
                f"/ert/ee/{evaluator._ee_id}/real/1/step/0/job/1",
                "event_job_1_fail",
                {identifiers.ERROR_MSG: "error"},
            )

            # a single monitor event is expected to reflect all four reports
            evt = next(events)
            print(evt)
            snapshot = Snapshot(evt.data)
            assert snapshot.get_job("1", "0", "0").status == JOB_STATE_FINISHED
            assert snapshot.get_job("0", "0", "0").status == JOB_STATE_RUNNING
            assert snapshot.get_job("1", "0", "1").status == JOB_STATE_FAILURE

            # a second monitor connects
            with ee_monitor.create(
                evaluator._config.get_connection_info()
            ) as monitor2:
                events2 = monitor2.track()
                full_snapshot_event = next(events2)
                assert full_snapshot_event["type"] == identifiers.EVTYPE_EE_SNAPSHOT
                snapshot = Snapshot(full_snapshot_event.data)
                assert snapshot.status == ENSEMBLE_STATE_UNKNOWN
                assert snapshot.get_job("0", "0", "0").status == JOB_STATE_RUNNING
                assert snapshot.get_job("1", "0", "0").status == JOB_STATE_FINISHED

                # one monitor requests that server exit
                monitor.signal_cancel()

                # both monitors should get a terminated event
                terminated = next(events)
                terminated2 = next(events2)
                assert terminated["type"] == identifiers.EVTYPE_EE_TERMINATED
                assert terminated2["type"] == identifiers.EVTYPE_EE_TERMINATED

                # Any further event on either stream is a failure.  Raise
                # explicitly: `assert False` is removed when run with -O.
                for stream in (events, events2):
                    for unexpected_event in stream:
                        raise AssertionError(
                            f"got unexpected event {unexpected_event} from monitor"
                        )
def test_dispatchers_can_connect_and_monitor_can_shut_down_evaluator(evaluator):
    """Check that dispatcher events reach monitors and a monitor can cancel.

    Two dispatch clients report job state changes; after each report the next
    event seen by the first monitor is asserted to reflect it.  A second
    monitor then connects, one monitor signals cancel, and both monitors must
    receive a terminated event followed by no further events.
    """
    # NOTE(review): neither monitor is used as a context manager here; whether
    # this API version supports `with` is not visible from this block.
    monitor = evaluator.run()
    events = monitor.track()

    host = evaluator._config.host
    port = evaluator._config.port

    # first snapshot before any event occurs
    snapshot_event = next(events)
    snapshot = Snapshot(snapshot_event.data)
    assert snapshot.get_status() == "Unknown"

    # two dispatchers connect
    with Client(host, port, "/dispatch") as dispatch1, Client(
        host, port, "/dispatch"
    ) as dispatch2:
        # first dispatcher informs that job 0 is running
        send_dispatch_event(
            dispatch1,
            identifiers.EVTYPE_FM_JOB_RUNNING,
            "/ert/ee/0/real/0/stage/0/step/0/job/0",
            "event1",
            {"current_memory_usage": 1000},
        )
        snapshot = Snapshot(next(events).data)
        assert snapshot.get_job("0", "0", "0", "0")["status"] == "Running"

        # second dispatcher informs that job 0 is running
        send_dispatch_event(
            dispatch2,
            identifiers.EVTYPE_FM_JOB_RUNNING,
            "/ert/ee/0/real/1/stage/0/step/0/job/0",
            "event1",
            {"current_memory_usage": 1000},
        )
        snapshot = Snapshot(next(events).data)
        assert snapshot.get_job("1", "0", "0", "0")["status"] == "Running"

        # second dispatcher informs that job 0 is done
        send_dispatch_event(
            dispatch2,
            identifiers.EVTYPE_FM_JOB_SUCCESS,
            "/ert/ee/0/real/1/stage/0/step/0/job/0",
            "event1",
            {"current_memory_usage": 1000},
        )
        snapshot = Snapshot(next(events).data)
        assert snapshot.get_job("1", "0", "0", "0")["status"] == "Finished"

        # a second monitor connects
        monitor2 = ee_monitor.create(host, port)
        events2 = monitor2.track()
        snapshot = Snapshot(next(events2).data)
        assert snapshot.get_status() == "Unknown"
        assert snapshot.get_job("0", "0", "0", "0")["status"] == "Running"
        assert snapshot.get_job("1", "0", "0", "0")["status"] == "Finished"

        # one monitor requests that server exit
        monitor.signal_cancel()

        # both monitors should get a terminated event
        terminated = next(events)
        terminated2 = next(events2)
        assert terminated["type"] == identifiers.EVTYPE_EE_TERMINATED
        assert terminated2["type"] == identifiers.EVTYPE_EE_TERMINATED

        # Any further event on either stream is a failure.  Raise explicitly:
        # `assert False` is removed when run with -O.
        for stream in (events, events2):
            for _ in stream:
                raise AssertionError("got unexpected event from monitor")