def test_report_with_failed_exit_message_argument(self):
    """A failed Exited message shows up in STATUS, ERROR and status.json."""
    failure = Exited(Job({"name": "job1"}, 0), 1).with_error("massive_failure")
    self.reporter.status_dict = self.reporter._init_job_status_dict(
        failure.timestamp, 0, [failure.job]
    )

    self.reporter.report(failure)

    # STATUS: the first line carries the exit code and the error text.
    with open(self.reporter.STATUS_file, "r") as status_file:
        first_status_line = status_file.readline()
    self.assertIn("EXIT: {}/{}".format(1, "massive_failure"), first_status_line)

    # ERROR: job name, reason and stderr information must all be present.
    with open(self.reporter.ERROR_file, "r") as error_file:
        error_text = error_file.read()
    for fragment, complaint in (
        ("<job>job1</job>", "ERROR file missing job"),
        ("<reason>massive_failure</reason>", "ERROR file missing reason"),
        ("<stderr: Not redirected>", "ERROR had invalid stderr information"),
    ):
        self.assertIn(fragment, error_text, complaint)

    # status.json: failure status and the error message.
    with open(self.reporter.STATUS_json, "r") as json_file:
        json_text = json_file.read()
    for fragment, complaint in (
        ('"status": "Failure"', "status.json missing Failure status"),
        ('"error": "massive_failure"', "status.json missing error message"),
    ):
        self.assertIn(fragment, json_text, complaint)

    first_job_status = self.reporter.status_dict["jobs"][0]
    self.assertIsNotNone(first_job_status["end_time"])
def test_exited_success_msg(self, post_mock):
    """An Exited message without an error is posted with an OK status.

    ``post_mock`` is the injected mock whose recorded call is inspected.
    """
    self.reporter.start_time = dt.now()

    self.reporter.report(Exited(None, 9))

    # Check that the call happened *before* unpacking call_args: on a mock
    # that was never called call_args is None, and unpacking it would raise
    # TypeError instead of producing a proper assertion failure.
    self.assertTrue(post_mock.called)
    _, data = post_mock.call_args
    self.assertIn('"status": "OK"', data["data"])
def test_report_with_successful_exit_message_argument(self):
    """A successful Exited message marks the job as Success in status.json."""
    success = Exited(Job({"name": "job1"}, 0), 0)
    self.reporter.status_dict = self.reporter._init_job_status_dict(
        success.timestamp, 0, [success.job]
    )

    self.reporter.report(success)

    with open(self.reporter.STATUS_json, "r") as json_file:
        json_text = json_file.read()
    self.assertIn(
        '"status": "Success"',
        json_text,
        "status.json missing Success status",
    )
def test_report_with_successful_exit_message_argument(tmpdir):
    """A successful Exited message is logged as a job-success event."""
    log_path = tmpdir / "event_log"
    reporter = Event(event_log=log_path)
    job = Job({"name": "job1", "stdout": "stdout", "stderr": "stderr"}, 0)

    reporter.report(Init([job], 1, 19, ee_id="ee_id", real_id=0, stage_id=0))
    reporter.report(Exited(job, 0))

    with open(reporter._event_log, "r") as event_log:
        logged = list(event_log)

    # Two lines are expected in the log; the second one is inspected.
    assert len(logged) == 2
    exit_event = json.loads(logged[1])
    assert exit_event["type"] == _FM_JOB_SUCCESS
def test_status_file_is_correct(self):
    """Check the STATUS file after a complete sequence of reports.

    The STATUS file is appended to while jobs run, so its content can only
    be verified after several messages have been reported in order.

    See https://github.com/equinor/libres/issues/764
    """
    first_job = Job({"name": "j_1", "executable": "", "argList": []}, 0)
    second_job = Job({"name": "j_2", "executable": "", "argList": []}, 0)

    init = Init([first_job, second_job], 1, 1)
    first_start = Start(first_job)
    first_exit = Exited(first_job, 0)
    second_start = Start(second_job)
    second_exit = Exited(second_job, 9).with_error("failed horribly")

    for message in (init, first_start, first_exit, second_start, second_exit):
        self.reporter.report(message)

    success_line = "{:32}: {start_ts:%H:%M:%S} .... {end_ts:%H:%M:%S} \n".format(
        first_job.name(),
        start_ts=first_start.timestamp,
        end_ts=first_exit.timestamp,
    )
    failure_line = "{:32}: {start_ts:%H:%M:%S} .... {end_ts:%H:%M:%S} EXIT: {code}/{msg}\n".format(  # noqa
        second_job.name(),
        start_ts=second_start.timestamp,
        end_ts=second_exit.timestamp,
        code=second_exit.exit_code,
        msg=second_exit.error_message,
    )

    with open(self.reporter.STATUS_file, "r") as status_file:
        # One line per expectation, in file order.
        for fragment in ("Current host", success_line, failure_line):
            self.assertIn(fragment, status_file.readline())

        # Nothing may follow the last job line (readline() yields "" at EOF).
        self.assertEqual("", status_file.readline())
def test_report_with_failed_exit_message_argument(tmpdir):
    """A failed Exited message is logged as a job-failure event with its error."""
    log_path = tmpdir / "event_log"
    reporter = Event(event_log=log_path)
    job = Job({"name": "job1", "stdout": "stdout", "stderr": "stderr"}, 0)

    reporter.report(Init([job], 1, 19, ee_id="ee_id", real_id=0, stage_id=0))
    reporter.report(Exited(job, 1).with_error("massive_failure"))

    with open(reporter._event_log, "r") as event_log:
        logged = list(event_log)

    # Two lines are expected in the log; the second one is inspected.
    assert len(logged) == 2
    failure_event = json.loads(logged[1])
    assert failure_event["type"] == _FM_JOB_FAILURE
    assert failure_event["data"]["error_msg"] == "massive_failure"
def test_failed_job_is_reported(self, post_mock):
    """A failed Exited message is posted with an exit status and error flag.

    ``post_mock`` is the injected mock whose recorded call is inspected.
    """
    self.reporter.start_time = dt.now()
    job = Job(
        {"name": "failing job", "executable": "/dev/null", "argList": []},
        0,
    )

    self.reporter.report(Exited(job, 9).with_error("failed"))

    # Check that the call happened *before* unpacking call_args: on a mock
    # that was never called call_args is None, so unpacking first would
    # raise TypeError and the descriptive message below would never be shown.
    self.assertTrue(post_mock.called, "post not called for failed Exit")
    _, data = post_mock.call_args
    self.assertIn('"status": "exit"', data["data"], "no exit in data")
    self.assertIn('"error": true', data["data"], "no set err flag in data")
def test_report_with_successful_exit_message_argument(unused_tcp_port):
    """A successful Exited message reaches the mocked websocket as a success event."""
    host = "localhost"
    url = f"ws://{host}:{unused_tcp_port}"
    reporter = Event(evaluator_url=url)
    job = Job({"name": "job1", "stdout": "stdout", "stderr": "stderr"}, 0)

    # Filled by the mocked websocket thread while the context is active.
    lines = []
    with _mock_ws_thread(host, unused_tcp_port, lines):
        reporter.report(Init([job], 1, 19, ee_id="ee_id", real_id=0, step_id=0))
        reporter.report(Exited(job, 0))
        reporter.report(Finish().with_error("failed"))

    assert len(lines) == 1
    received = json.loads(lines[0])
    assert received["type"] == _FM_JOB_SUCCESS
def test_successful_job_not_reported(self, post_mock):
    """Reporting an Exited message that carries no error must not post."""
    error_free_exit = Exited(None, 9)

    self.reporter.report(error_free_exit)

    self.assertFalse(post_mock.called, "post called on successful Exit")
def run(self):
    """Run the job as a subprocess, yielding status messages along the way.

    Yields:
        Start: once. It carries the joined pre-flight errors when
            ``_check_job_files`` or ``_assert_arg_list`` report any, in
            which case nothing is launched and the generator stops.
        Running: once per polling iteration, with the peak and the current
            resident memory of the process.
        Exited: once, as the last message. It carries an error when the job
            ran longer than ``max_running_minutes``, exited with a non-zero
            code, left the configured ``error_file`` behind, or failed the
            ``target_file`` check.
    """
    start_message = Start(self)

    errors = self._check_job_files()
    errors.extend(self._assert_arg_list())

    self._dump_exec_env()

    if errors:
        yield start_message.with_error("\n".join(errors))
        return

    yield start_message

    executable = self.job_data.get("executable")
    assert_file_executable(executable)

    arg_list = [executable]
    if self.job_data.get("argList"):
        arg_list += self.job_data["argList"]

    # Redirection targets; None means "do not redirect".
    stdin = None
    stderr = None
    stdout = None
    try:
        if self.job_data.get("stdin"):
            stdin = open(self.job_data.get("stdin"))
        if self.std_err:
            stderr = open(self.std_err, "w")
        if self.std_out:
            stdout = open(self.std_out, "w")

        if self.job_data.get("target_file"):
            # Remember the pre-run mtime (0 if the file does not exist yet)
            # for the target-file check performed after the job has exited.
            target_file_mtime = 0
            if os.path.exists(self.job_data["target_file"]):
                stat = os.stat(self.job_data["target_file"])
                target_file_mtime = stat.st_mtime

        exec_env = self.job_data.get("exec_env")
        if exec_env:
            exec_name, _ = os.path.splitext(os.path.basename(executable))
            with open("%s_exec_env.json" % exec_name, "w") as f:
                f.write(json.dumps(exec_env))

        max_running_minutes = self.job_data.get("max_running_minutes")
        run_start_time = dt.now()

        proc = Popen(
            arg_list,
            stdin=stdin,
            stdout=stdout,
            stderr=stderr,
            env=self.job_data.get("environment"),
        )

        exit_code = None

        process = Process(proc.pid)
        max_memory_usage = 0
        while exit_code is None:
            try:
                memory = process.memory_info().rss
            except (NoSuchProcess, AccessDenied, ZombieProcess):
                # In case of a process that has died and is in some
                # transitional state, we ignore any failures. Only seen on
                # OSX thus far.
                # See https://github.com/giampaolo/psutil/issues/1044#issuecomment-298745532
                memory = 0
            max_memory_usage = max(max_memory_usage, memory)

            yield Running(self, max_memory_usage, memory)

            try:
                exit_code = process.wait(timeout=self.MEMORY_POLL_PERIOD)
            except TimeoutExpired:
                run_time = dt.now() - run_start_time
                # total_seconds() rather than .seconds: the latter is only
                # the sub-day component of the timedelta and wraps around
                # every 24 hours, so long limits would never be reached.
                if (
                    max_running_minutes is not None
                    and run_time.total_seconds() > max_running_minutes * 60
                ):
                    # If the spawned process is not in the same process
                    # group as the callee (job_dispatch), we will kill the
                    # process group explicitly.
                    #
                    # Propagating the unsuccessful Exited message will kill
                    # the callee group. See job_dispatch.py.
                    process_group_id = os.getpgid(proc.pid)
                    this_group_id = os.getpgid(os.getpid())
                    if process_group_id != this_group_id:
                        os.killpg(process_group_id, signal.SIGKILL)

                    yield Exited(self, exit_code).with_error(
                        "Job:{} has been running for more than {} minutes - explicitly killed.".format(
                            self.name(), max_running_minutes
                        )
                    )
                    return

        exited_message = Exited(self, exit_code)

        if exit_code != 0:
            yield exited_message.with_error(
                "Process exited with status code {}".format(exit_code)
            )
            return

        # exit_code is 0

        if self.job_data.get("error_file"):
            if os.path.exists(self.job_data["error_file"]):
                yield exited_message.with_error(
                    "Found the error file:{} - job failed.".format(
                        self.job_data["error_file"]
                    )
                )
                return

        if self.job_data.get("target_file"):
            target_file_error = self._check_target_file_is_written(target_file_mtime)
            if target_file_error:
                yield exited_message.with_error(target_file_error)
                return

        yield exited_message
    finally:
        # Release our redirection handles on every exit path (normal end,
        # early return, exception, or the generator being closed).
        for handle in (stdin, stdout, stderr):
            if handle is not None:
                handle.close()
def test_exited_failure_msg(self, post_mock):
    """Reporting an Exited message with exit code 9 triggers a post."""
    self.reporter.start_time = dt.now()

    # NOTE(review): the message carries no error despite the test name;
    # confirm whether `.with_error(...)` was intended here.
    exit_message = Exited(None, 9)
    self.reporter.report(exit_message)

    self.assertTrue(post_mock.called)