def test_report_with_failed_finish_message_argument(unused_tcp_port):
    host = "localhost"
    url = f"ws://{host}:{unused_tcp_port}"
    reporter = Event(evaluator_url=url)
    job1 = Job({"name": "job1", "stdout": "stdout", "stderr": "stderr"}, 0)

    lines = []
    with _mock_ws_thread(host, unused_tcp_port, lines):
        reporter.report(Init([job1], 1, 19, ee_id="ee_id", real_id=0, step_id=0))
        reporter.report(Running(job1, 100, 10))
        reporter.report(Finish().with_error("massive_failure"))

    assert len(lines) == 1

def test_report_only_job_running_for_successful_run(unused_tcp_port):
    host = "localhost"
    url = f"ws://{host}:{unused_tcp_port}"
    reporter = Event(evaluator_url=url)
    job1 = Job({"name": "job1", "stdout": "stdout", "stderr": "stderr"}, 0)

    lines = []
    with _mock_ws_thread(host, unused_tcp_port, lines):
        reporter.report(Init([job1], 1, 19, ee_id="ee_id", real_id=0, step_id=0))
        reporter.report(Running(job1, 100, 10))
        reporter.report(Finish())

    assert len(lines) == 1

def test_report_with_successful_finish_message_argument(tmpdir):
    reporter = Event(event_log=tmpdir / "event_log")
    job1 = Job({"name": "job1", "stdout": "stdout", "stderr": "stderr"}, 0)

    reporter.report(Init([job1], 1, 19, ee_id="ee_id", real_id=0, stage_id=0))
    reporter.report(Running(job1, 100, 10))
    reporter.report(Finish())

    with open(reporter._event_log, "r") as f:
        lines = f.readlines()
    assert len(lines) == 3
    event = json.loads(lines[2])
    assert event["type"] == _FM_STEP_SUCCESS

def test_report_with_failed_finish_message_argument(tmpdir):
    reporter = Event(event_log=tmpdir / "event_log")
    job1 = Job({"name": "job1", "stdout": "stdout", "stderr": "stderr"}, 0)

    reporter.report(Init([job1], 1, 19, ee_id="ee_id", real_id=0, stage_id=0))
    reporter.report(Running(job1, 100, 10))
    reporter.report(Finish().with_error("massive_failure"))

    with open(reporter._event_log, "r") as f:
        lines = f.readlines()
    assert len(lines) == 3
    event = json.loads(lines[2])
    assert event["type"] == _FM_STEP_FAILURE
    assert event["data"]["error_msg"] == "massive_failure"

def test_report_with_running_message_argument(tmpdir):
    reporter = Event(event_log=tmpdir / "event_log")
    job1 = Job({"name": "job1", "stdout": "stdout", "stderr": "stderr"}, 0)

    reporter.report(Init([job1], 1, 19, ee_id="ee_id", real_id=0, stage_id=0))
    reporter.report(Running(job1, 100, 10))

    with open(reporter._event_log, "r") as f:
        lines = f.readlines()
    assert len(lines) == 2
    event = json.loads(lines[1])
    assert event["type"] == _FM_JOB_RUNNING
    assert event["data"]["max_memory_usage"] == 100
    assert event["data"]["current_memory_usage"] == 10

def test_report_with_running_message_argument(self):
    msg = Running(Job({"name": "job1"}, 0), 100, 10)
    self.reporter.status_dict = self.reporter._init_job_status_dict(
        msg.timestamp, 0, [msg.job]
    )

    self.reporter.report(msg)

    with open(self.reporter.STATUS_json, "r") as f:
        contents = "".join(f.readlines())
    self.assertIn('"status": "Running"', contents, "status.json missing status")
    self.assertIn(
        '"max_memory_usage": 100', contents, "status.json missing max_memory_usage"
    )
    self.assertIn(
        '"current_memory_usage": 10',
        contents,
        "status.json missing current_memory_usage",
    )

def test_report_with_running_message_argument(unused_tcp_port):
    host = "localhost"
    url = f"ws://{host}:{unused_tcp_port}"
    reporter = Event(evaluator_url=url)
    job1 = Job({"name": "job1", "stdout": "stdout", "stderr": "stderr"}, 0)

    lines = []
    with _mock_ws_thread(host, unused_tcp_port, lines):
        reporter.report(Init([job1], 1, 19, ee_id="ee_id", real_id=0, step_id=0))
        reporter.report(Running(job1, 100, 10))
        reporter.report(Finish())

    assert len(lines) == 1
    event = json.loads(lines[0])
    assert event["type"] == _FM_JOB_RUNNING
    assert event["data"]["max_memory_usage"] == 100
    assert event["data"]["current_memory_usage"] == 10

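
# The websocket-based tests above rely on a `_mock_ws_thread` helper defined
# elsewhere in the test module. The context manager below is a minimal sketch of
# such a helper (the name `_sketch_mock_ws_thread` and its implementation are
# assumptions, not the repo's actual code): it serves a websocket endpoint on a
# background thread and collects every received message into `messages`.
import asyncio
import threading
from contextlib import contextmanager

import websockets


@contextmanager
def _sketch_mock_ws_thread(host, port, messages):
    loop = asyncio.new_event_loop()
    stop = loop.create_future()

    async def _handler(websocket, *_):
        # Append every message the reporter sends to the shared list.
        async for message in websocket:
            messages.append(message)

    async def _serve():
        async with websockets.serve(_handler, host, port):
            await stop

    thread = threading.Thread(target=loop.run_until_complete, args=(_serve(),))
    thread.start()
    try:
        yield
    finally:
        # Stop the server and wait for the background thread to exit.
        loop.call_soon_threadsafe(stop.set_result, None)
        thread.join()
        loop.close()
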
def run(self):
    start_message = Start(self)

    errors = self._check_job_files()
    errors.extend(self._assert_arg_list())

    self._dump_exec_env()

    if errors:
        yield start_message.with_error("\n".join(errors))
        return

    yield start_message

    executable = self.job_data.get("executable")
    assert_file_executable(executable)

    arg_list = [executable]
    if self.job_data.get("argList"):
        arg_list += self.job_data["argList"]

    if self.job_data.get("stdin"):
        stdin = open(self.job_data.get("stdin"))
    else:
        stdin = None

    if self.std_err:
        stderr = open(self.std_err, "w")
    else:
        stderr = None

    if self.std_out:
        stdout = open(self.std_out, "w")
    else:
        stdout = None

    if self.job_data.get("target_file"):
        target_file_mtime = 0
        if os.path.exists(self.job_data["target_file"]):
            stat = os.stat(self.job_data["target_file"])
            target_file_mtime = stat.st_mtime

    exec_env = self.job_data.get("exec_env")
    if exec_env:
        exec_name, _ = os.path.splitext(
            os.path.basename(self.job_data.get("executable"))
        )
        with open("%s_exec_env.json" % exec_name, "w") as f:
            f.write(json.dumps(exec_env))

    max_running_minutes = self.job_data.get("max_running_minutes")
    run_start_time = dt.now()

    proc = Popen(
        arg_list,
        stdin=stdin,
        stdout=stdout,
        stderr=stderr,
        env=self.job_data.get("environment"),
    )

    exit_code = None

    process = Process(proc.pid)
    max_memory_usage = 0
    while exit_code is None:
        try:
            memory = process.memory_info().rss
        except (NoSuchProcess, AccessDenied, ZombieProcess):
            """In case of a process that has died and is in some transitional
            state, we ignore any failures. Only seen on OSX thus far.

            See https://github.com/giampaolo/psutil/issues/1044#issuecomment-298745532
            """
            memory = 0
        if memory > max_memory_usage:
            max_memory_usage = memory

        yield Running(self, max_memory_usage, memory)

        try:
            exit_code = process.wait(timeout=self.MEMORY_POLL_PERIOD)
        except TimeoutExpired:
            run_time = dt.now() - run_start_time
            if (
                max_running_minutes is not None
                and run_time.seconds > max_running_minutes * 60
            ):
                """
                If the spawned process is not in the same process group as
                the callee (job_dispatch), we will kill the process group
                explicitly.

                Propagating the unsuccessful Exited message will kill the
                callee group. See job_dispatch.py.
                """
                process_group_id = os.getpgid(proc.pid)
                this_group_id = os.getpgid(os.getpid())
                if process_group_id != this_group_id:
                    os.killpg(process_group_id, signal.SIGKILL)

                yield Exited(self, exit_code).with_error(
                    "Job:{} has been running for more than {} minutes - explicitly killed.".format(
                        self.name(), max_running_minutes
                    )
                )
                return

    exited_message = Exited(self, exit_code)

    if exit_code != 0:
        yield exited_message.with_error(
            "Process exited with status code {}".format(exit_code)
        )
        return

    # exit_code is 0

    if self.job_data.get("error_file"):
        if os.path.exists(self.job_data["error_file"]):
            yield exited_message.with_error(
                "Found the error file:{} - job failed.".format(
                    self.job_data["error_file"]
                )
            )
            return

    if self.job_data.get("target_file"):
        target_file_error = self._check_target_file_is_written(target_file_mtime)
        if target_file_error:
            yield exited_message.with_error(target_file_error)
            return

    yield exited_message
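
# Standalone sketch (not part of this module) of the memory-polling pattern used in
# run() above: spawn a child process, poll its RSS while waiting with a short
# timeout, and track the peak usage. The command line below is a placeholder
# assumption; only psutil and the standard library are required.
import sys
from subprocess import Popen

from psutil import AccessDenied, NoSuchProcess, Process, TimeoutExpired, ZombieProcess

if __name__ == "__main__":
    proc = Popen([sys.executable, "-c", "import time; time.sleep(1)"])
    process = Process(proc.pid)
    max_memory_usage = 0
    exit_code = None
    while exit_code is None:
        try:
            memory = process.memory_info().rss
        except (NoSuchProcess, AccessDenied, ZombieProcess):
            # The process may be in a transitional state; ignore and retry.
            memory = 0
        max_memory_usage = max(max_memory_usage, memory)
        try:
            # Returns the exit code once the child terminates, or raises
            # TimeoutExpired so another memory sample can be taken.
            exit_code = process.wait(timeout=0.1)
        except TimeoutExpired:
            pass
    print(f"exit_code={exit_code}, peak_rss={max_memory_usage} bytes")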