def test_report_with_failed_start_message_argument(self):
    """An error'd Start must surface in the STATUS file, status.json and the
    in-memory status dict (job end_time set)."""
    failed_start = Start(Job({"name": "job1"}, 0)).with_error("massive_failure")
    self.reporter.status_dict = self.reporter._init_job_status_dict(
        failed_start.timestamp, 0, [failed_start.job]
    )

    self.reporter.report(failed_start)

    # The STATUS file records the failure as an EXIT line.
    with open(self.reporter.STATUS_file, "r") as status_f:
        first_line = status_f.readline()
    self.assertIn(
        "EXIT: {}/{}".format(-10, "massive_failure"),
        first_line,
        "STATUS file missing EXIT message",
    )

    # status.json records both the Failure status and the error text.
    with open(self.reporter.STATUS_json, "r") as json_f:
        contents = json_f.read()
    self.assertIn(
        '"status": "Failure"', contents, "status.json missing Failure status"
    )
    self.assertIn(
        '"error": "massive_failure"',
        contents,
        "status.json missing error message",
    )

    self.assertIsNotNone(
        self.reporter.status_dict["jobs"][0]["end_time"],
        "end_time not set for job1",
    )
def test_report_with_successful_start_message_argument(self):
    """A successful Start must show the job in STATUS, log the exec call in
    the job log, and mark the job Running in status.json."""
    job_fields = {
        "name": "job1",
        "stdout": "/stdout.0",
        "stderr": "/stderr.0",
        "argList": ["--foo", "1", "--bar", "2"],
        "executable": "/bin/bash",
    }
    start_msg = Start(Job(job_fields, 0))
    self.reporter.status_dict = self.reporter._init_job_status_dict(
        start_msg.timestamp, 0, [start_msg.job]
    )

    self.reporter.report(start_msg)

    with open(self.reporter.STATUS_file, "r") as status_f:
        self.assertIn("job1", status_f.readline(), "STATUS file missing job1")

    # The job log records the exact command line that was invoked.
    with open(self.reporter.LOG_file, "r") as log_f:
        self.assertIn(
            "Calling: /bin/bash --foo 1 --bar 2",
            log_f.readline(),
            "JOB_LOG file missing executable and arguments",
        )

    with open(self.reporter.STATUS_json, "r") as json_f:
        contents = json_f.read()
    self.assertIn(
        '"status": "Running"', contents, "status.json missing Running status"
    )
    self.assertNotIn('"start_time": null', contents, "start_time not set")
def test_status_file_is_correct(self):
    """The STATUS file is a file to which we append data about jobs as they
    are run. So this involves multiple reports, and should be tested as
    such.
    See https://github.com/equinor/libres/issues/764
    """
    job_a = Job({"name": "j_1", "executable": "", "argList": []}, 0)
    job_b = Job({"name": "j_2", "executable": "", "argList": []}, 0)

    # Report a full two-job lifecycle: init, then start/exit per job,
    # with the second job failing.
    init = Init([job_a, job_b], 1, 1)
    start_a = Start(job_a)
    exited_a = Exited(job_a, 0)
    start_b = Start(job_b)
    exited_b = Exited(job_b, 9).with_error("failed horribly")
    for message in (init, start_a, exited_a, start_b, exited_b):
        self.reporter.report(message)

    # Expected per-job lines: name padded to 32, start/end timestamps,
    # and for the failed job an EXIT suffix with code and message.
    line_for_a = "{:32}: {start_ts:%H:%M:%S} .... {end_ts:%H:%M:%S} \n".format(  # noqa
        job_a.name(), start_ts=start_a.timestamp, end_ts=exited_a.timestamp
    )
    line_for_b = "{:32}: {start_ts:%H:%M:%S} .... {end_ts:%H:%M:%S} EXIT: {code}/{msg}\n".format(  # noqa
        job_b.name(),
        start_ts=start_b.timestamp,
        end_ts=exited_b.timestamp,
        code=exited_b.exit_code,
        msg=exited_b.error_message,
    )

    with open(self.reporter.STATUS_file, "r") as status_f:
        for expected in ("Current host", line_for_a, line_for_b):
            self.assertIn(expected, status_f.readline())
        # EOF: nothing may follow the last job line.
        self.assertEqual("", status_f.readline())
def test_report_with_successful_start_message_argument(tmpdir):
    """A Start report must append a start event to the event log with the
    correct type and source path."""
    reporter = Event(event_log=tmpdir / "event_log")
    first_job = Job({"name": "job1", "stdout": "stdout", "stderr": "stderr"}, 0)
    reporter.report(Init([first_job], 1, 19, ee_id="ee_id", real_id=0, stage_id=0))

    reporter.report(Start(first_job))

    with open(reporter._event_log, "r") as event_f:
        logged = event_f.readlines()
    # One event from Init, one from Start.
    assert len(logged) == 2
    start_event = json.loads(logged[1])
    assert start_event["type"] == _FM_JOB_START
    assert start_event["source"] == "/ert/ee/ee_id/real/0/stage/0/step/0/job/0"
def test_report_with_failed_start_message_argument(tmpdir):
    """A failed Start must append a failure event carrying the error text."""
    reporter = Event(event_log=tmpdir / "event_log")
    first_job = Job({"name": "job1", "stdout": "stdout", "stderr": "stderr"}, 0)
    reporter.report(Init([first_job], 1, 19, ee_id="ee_id", real_id=0, stage_id=0))

    failed_start = Start(first_job).with_error("massive_failure")
    reporter.report(failed_start)

    with open(reporter._event_log, "r") as event_f:
        logged = event_f.readlines()
    # Init, start, then the failure event.
    assert len(logged) == 3
    failure_event = json.loads(logged[2])
    assert failure_event["type"] == _FM_JOB_FAILURE
    assert failure_event["data"]["error_msg"] == "massive_failure"
def test_report_with_successful_start_message_argument(unused_tcp_port):
    """A Start sent over the websocket must arrive as a start event with the
    job's source path and its stdout/stderr file names."""
    host = "localhost"
    url = f"ws://{host}:{unused_tcp_port}"
    reporter = Event(evaluator_url=url)
    first_job = Job({"name": "job1", "stdout": "stdout", "stderr": "stderr"}, 0)

    received = []
    with _mock_ws_thread(host, unused_tcp_port, received):
        reporter.report(Init([first_job], 1, 19, ee_id="ee_id", real_id=0, step_id=0))
        reporter.report(Start(first_job))
        reporter.report(Finish())

    assert len(received) == 1
    start_event = json.loads(received[0])
    assert start_event["type"] == _FM_JOB_START
    assert start_event["source"] == "/ert/ee/ee_id/real/0/step/0/job/0"
    # Only the basenames of the redirect files are checked; the reporter may
    # absolutize the paths.
    assert os.path.basename(start_event["data"]["stdout"]) == "stdout"
    assert os.path.basename(start_event["data"]["stderr"]) == "stderr"
def test_report_with_failed_start_message_argument(unused_tcp_port):
    """A failed Start sent over the websocket must arrive as a failure event
    carrying the error text."""
    host = "localhost"
    url = f"ws://{host}:{unused_tcp_port}"
    reporter = Event(evaluator_url=url)
    first_job = Job({"name": "job1", "stdout": "stdout", "stderr": "stderr"}, 0)

    received = []
    with _mock_ws_thread(host, unused_tcp_port, received):
        reporter.report(Init([first_job], 1, 19, ee_id="ee_id", real_id=0, step_id=0))
        failed_start = Start(first_job).with_error("massive_failure")
        reporter.report(failed_start)
        reporter.report(Finish())

    # Start event followed by the failure event.
    assert len(received) == 2
    failure_event = json.loads(received[1])
    assert failure_event["type"] == _FM_JOB_FAILURE
    assert failure_event["data"]["error_msg"] == "massive_failure"
def run(self):
    """Run this job as a subprocess, yielding status messages as it goes.

    Generator protocol (as shown by the yields below):
      - first yields a ``Start`` message (with an error attached if the
        pre-flight checks fail, in which case the generator stops);
      - while the process runs, yields ``Running`` messages with current
        and peak RSS memory usage;
      - finally yields exactly one ``Exited`` message, with an error
        attached on nonzero exit code, on a detected error file, on a
        missing/unchanged target file, or when the max runtime is exceeded.
    """
    start_message = Start(self)

    # Pre-flight validation: required files and the argument list.
    errors = self._check_job_files()
    errors.extend(self._assert_arg_list())

    self._dump_exec_env()

    if errors:
        # Report all validation errors in one Start message and stop.
        yield start_message.with_error("\n".join(errors))
        return
    yield start_message

    executable = self.job_data.get("executable")
    assert_file_executable(executable)

    arg_list = [executable]
    if self.job_data.get("argList"):
        arg_list += self.job_data["argList"]

    # Optional redirections for the child process.
    # NOTE(review): these handles are never explicitly closed in this
    # method — presumably dropped on GC; confirm whether that is intended.
    if self.job_data.get("stdin"):
        stdin = open(self.job_data.get("stdin"))
    else:
        stdin = None

    if self.std_err:
        stderr = open(self.std_err, "w")
    else:
        stderr = None

    if self.std_out:
        stdout = open(self.std_out, "w")
    else:
        stdout = None

    # Record the target file's mtime before the run so we can later tell
    # whether the job actually (re)wrote it.
    if self.job_data.get("target_file"):
        target_file_mtime = 0
        if os.path.exists(self.job_data["target_file"]):
            stat = os.stat(self.job_data["target_file"])
            target_file_mtime = stat.st_mtime

    # Dump the exec environment to <executable>_exec_env.json, next to
    # the job's working directory, for the child to pick up.
    exec_env = self.job_data.get("exec_env")
    if exec_env:
        exec_name, _ = os.path.splitext(
            os.path.basename(self.job_data.get("executable"))
        )
        with open("%s_exec_env.json" % exec_name, "w") as f:
            f.write(json.dumps(exec_env))

    max_running_minutes = self.job_data.get("max_running_minutes")
    run_start_time = dt.now()

    proc = Popen(
        arg_list,
        stdin=stdin,
        stdout=stdout,
        stderr=stderr,
        env=self.job_data.get("environment"),
    )

    exit_code = None

    # Poll the child: sample memory, emit Running messages, and enforce
    # the optional wall-clock limit until the process exits.
    process = Process(proc.pid)
    max_memory_usage = 0
    while exit_code is None:
        try:
            memory = process.memory_info().rss
        except (NoSuchProcess, AccessDenied, ZombieProcess):
            """In case of a process that has died and is in some transitional
            state, we ignore any failures. Only seen on OSX thus far.

            See https://github.com/giampaolo/psutil/issues/1044#issuecomment-298745532
            """
            memory = 0
        if memory > max_memory_usage:
            max_memory_usage = memory

        yield Running(self, max_memory_usage, memory)

        try:
            # Blocks for at most MEMORY_POLL_PERIOD; on timeout we loop
            # around to take another memory sample.
            exit_code = process.wait(timeout=self.MEMORY_POLL_PERIOD)
        except TimeoutExpired:
            run_time = dt.now() - run_start_time
            if (
                max_running_minutes is not None
                and run_time.seconds > max_running_minutes * 60
            ):
                """
                If the spawned process is not in the same process group as
                the callee (job_dispatch), we will kill the process group
                explicitly.

                Propagating the unsuccessful Exited message will kill the
                callee group. See job_dispatch.py.
                """
                process_group_id = os.getpgid(proc.pid)
                this_group_id = os.getpgid(os.getpid())
                if process_group_id != this_group_id:
                    os.killpg(process_group_id, signal.SIGKILL)

                # exit_code is still None here, so the Exited message
                # carries no real exit status — only the timeout error.
                yield Exited(self, exit_code).with_error(
                    "Job:{} has been running for more than {} minutes - explicitly killed.".format(
                        self.name(), max_running_minutes
                    )
                )
                return

    exited_message = Exited(self, exit_code)

    if exit_code != 0:
        yield exited_message.with_error(
            "Process exited with status code {}".format(exit_code)
        )
        return

    # exit_code is 0

    # Even on a zero exit code, the presence of the configured error file
    # marks the job as failed.
    if self.job_data.get("error_file"):
        if os.path.exists(self.job_data["error_file"]):
            yield exited_message.with_error(
                "Found the error file:{} - job failed.".format(
                    self.job_data["error_file"]
                )
            )
            return

    # A configured target file must exist and be newer than the pre-run
    # mtime captured above.
    if self.job_data.get("target_file"):
        target_file_error = self._check_target_file_is_written(target_file_mtime)
        if target_file_error:
            yield exited_message.with_error(target_file_error)
            return

    yield exited_message