def test_kill_jobs(self):
    with TestAreaContext("job_queue_test_kill") as work_area:
        job_queue = create_queue(never_ending_script)

        assert job_queue.queue_size == 10
        assert job_queue.is_active()

        pool_sema = BoundedSemaphore(value=10)
        start_all(job_queue, pool_sema)

        # make sure never ending jobs are running
        wait_until(lambda: self.assertTrue(job_queue.is_active()))

        for job in job_queue.job_list:
            job.stop()

        wait_until(lambda: self.assertFalse(job_queue.is_active()))

        job_queue._differ.transition(job_queue.job_list)

        for q_index, job in enumerate(job_queue.job_list):
            assert job.status == JobStatusType.JOB_QUEUE_IS_KILLED
            iens = job_queue._differ.qindex_to_iens(q_index)
            assert job_queue.snapshot()[iens] == str(
                JobStatusType.JOB_QUEUE_IS_KILLED
            )

        for job in job_queue.job_list:
            job.wait_for()
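# The wait_until helper used throughout these tests is assumed to poll an
# assertion until it passes or a timeout expires. A minimal sketch of that
# contract (hypothetical; the real helper lives in the shared test utilities):
#
#     def wait_until(func, interval=0.5, timeout=30):
#         elapsed = 0.0
#         while True:
#             time.sleep(interval)
#             elapsed += interval
#             try:
#                 func()  # an assert* callable; raises AssertionError on failure
#                 return
#             except AssertionError:
#                 if elapsed >= timeout:
#                     raise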
def test_failing_jobs(self):
    with TestAreaContext("job_queue_test_add") as work_area:
        # max_submit=1 means failing jobs are not resubmitted
        job_queue = create_queue(failing_script, max_submit=1)

        assert job_queue.queue_size == 10
        assert job_queue.is_active()

        pool_sema = BoundedSemaphore(value=10)
        start_all(job_queue, pool_sema)

        wait_until(
            func=(lambda: self.assertFalse(job_queue.is_active())),
        )

        for job in job_queue.job_list:
            job.wait_for()

        job_queue._differ.transition(job_queue.job_list)

        assert job_queue.fetch_next_waiting() is None

        for q_index, job in enumerate(job_queue.job_list):
            assert job.status == JobStatusType.JOB_QUEUE_FAILED
            iens = job_queue._differ.qindex_to_iens(q_index)
            assert job_queue.snapshot()[iens] == str(
                JobStatusType.JOB_QUEUE_FAILED
            )
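# The script fixtures (simple_script, failing_script, never_ending_script)
# referenced by these tests are assumed to be small self-contained
# executables that create_queue writes to disk. Hypothetical sketches of
# what they might look like (the real definitions live elsewhere in this
# module):
#
#     simple_script = """#!/usr/bin/env python
#     with open("STATUS", "w") as f:
#         f.write("finished successfully")
#     """
#
#     failing_script = """#!/usr/bin/env python
#     import sys
#     sys.exit(1)
#     """
#
#     never_ending_script = """#!/usr/bin/env python
#     import time
#     while True:
#         time.sleep(0.5)
#     """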
def test_workflow_thread_cancel_external(self):
    with TestAreaContext(
        "python/job_queue/workflow_runner_external"
    ) as work_area:
        WorkflowCommon.createWaitJob()

        joblist = WorkflowJoblist()
        self.assertTrue(joblist.addJobFromFile("WAIT", "external_wait_job"))
        self.assertTrue("WAIT" in joblist)

        workflow = Workflow("wait_workflow", joblist)
        self.assertEqual(len(workflow), 3)

        workflow_runner = WorkflowRunner(
            workflow, ert=None, context=SubstitutionList()
        )
        self.assertFalse(workflow_runner.isRunning())

        with workflow_runner:
            wait_until(lambda: self.assertTrue(workflow_runner.isRunning()))
            wait_until(lambda: self.assertFileExists("wait_started_0"))

            wait_until(lambda: self.assertFileExists("wait_finished_0"))
            wait_until(lambda: self.assertFileExists("wait_started_1"))

            workflow_runner.cancel()
            self.assertTrue(workflow_runner.isCancelled())

        self.assertFileDoesNotExist("wait_finished_1")
        self.assertFileDoesNotExist("wait_started_2")
        self.assertFileDoesNotExist("wait_cancelled_2")
        self.assertFileDoesNotExist("wait_finished_2")
def test_add_jobs(self):
    with TestAreaContext("job_queue_test_add") as work_area:
        job_queue = create_queue(simple_script)

        assert job_queue.queue_size == 10
        assert job_queue.is_active()
        assert job_queue.fetch_next_waiting() is not None

        pool_sema = BoundedSemaphore(value=10)
        start_all(job_queue, pool_sema)

        for job in job_queue.job_list:
            job.stop()

        wait_until(lambda: self.assertFalse(job_queue.is_active()))

        for job in job_queue.job_list:
            job.wait_for()
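# start_all is assumed to drain the waiting queue, starting each job under
# the bounded semaphore so that at most `value` jobs run concurrently. A
# hypothetical sketch of that pattern (names and signatures are assumptions,
# not the real helper):
#
#     def start_all(job_queue, pool_sema):
#         job = job_queue.fetch_next_waiting()
#         while job is not None:
#             job.run(driver=job_queue.driver, pool_sema=pool_sema)
#             job = job_queue.fetch_next_waiting()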
def test_timeout_jobs(self):
    with TestAreaContext("job_queue_test_kill") as work_area:
        job_numbers = set()

        def callback(arg):
            nonlocal job_numbers
            job_numbers.add(arg[0]["job_number"])

        job_queue = create_queue(
            never_ending_script,
            max_submit=1,
            max_runtime=5,
            callback_timeout=callback,
        )

        assert job_queue.queue_size == 10
        assert job_queue.is_active()

        pool_sema = BoundedSemaphore(value=10)
        start_all(job_queue, pool_sema)

        # make sure never ending jobs are running
        wait_until(lambda: self.assertTrue(job_queue.is_active()))
        wait_until(lambda: self.assertFalse(job_queue.is_active()))

        job_queue._differ.transition(job_queue.job_list)

        for q_index, job in enumerate(job_queue.job_list):
            assert job.status == JobStatusType.JOB_QUEUE_IS_KILLED
            iens = job_queue._differ.qindex_to_iens(q_index)
            assert job_queue.snapshot()[iens] == str(
                JobStatusType.JOB_QUEUE_IS_KILLED
            )

        assert job_numbers == set(range(10))

        for job in job_queue.job_list:
            job.wait_for()
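# Note on the test above: the callback_timeout hook is assumed to fire once
# for each job that exceeds max_runtime, receiving that job's run-argument
# list. The test collects arg[0]["job_number"] from every invocation and then
# checks that all ten jobs (numbers 0-9) were reported as timed out and
# killed.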
def test_workflow_thread_cancel_ert_script(self):
    with TestAreaContext("python/job_queue/workflow_runner_ert_script"):
        WorkflowCommon.createWaitJob()

        joblist = WorkflowJoblist()
        self.assertTrue(joblist.addJobFromFile("WAIT", "wait_job"))
        self.assertTrue("WAIT" in joblist)

        workflow = Workflow("wait_workflow", joblist)
        self.assertEqual(len(workflow), 3)

        workflow_runner = WorkflowRunner(workflow)
        self.assertFalse(workflow_runner.isRunning())

        with workflow_runner:
            self.assertIsNone(workflow_runner.workflowResult())

            wait_until(lambda: self.assertTrue(workflow_runner.isRunning()))
            wait_until(lambda: self.assertFileExists("wait_started_0"))
            wait_until(lambda: self.assertFileExists("wait_finished_0"))
            wait_until(lambda: self.assertFileExists("wait_started_1"))

            workflow_runner.cancel()
            wait_until(lambda: self.assertFileExists("wait_cancelled_1"))

            self.assertTrue(workflow_runner.isCancelled())

        self.assertFileDoesNotExist("wait_finished_1")
        self.assertFileDoesNotExist("wait_started_2")
        self.assertFileDoesNotExist("wait_cancelled_2")
        self.assertFileDoesNotExist("wait_finished_2")
def test_simulation_context(self):
    config_file = self.createTestPath("local/batch_sim/sleepy_time.ert")
    with ErtTestContext(
        "res/sim/simulation_context", config_file
    ) as test_context:
        ert = test_context.getErt()

        size = 4
        even_mask = [True, False] * (size // 2)
        odd_mask = [False, True] * (size // 2)

        fs_manager = ert.getEnkfFsManager()
        even_half = fs_manager.getFileSystem("even_half")
        odd_half = fs_manager.getFileSystem("odd_half")

        # i represents geo_id
        case_data = [(i, {}) for i in range(size)]
        even_ctx = SimulationContext(ert, even_half, even_mask, 0, case_data)
        odd_ctx = SimulationContext(ert, odd_half, odd_mask, 0, case_data)

        for iens in range(size):
            # do we have the proper geo_id in run_args?
            if iens % 2 == 0:
                self.assertFalse(even_ctx.isRealizationFinished(iens))
                self.assertEqual(even_ctx.get_run_args(iens).geo_id, iens)
            else:
                self.assertFalse(odd_ctx.isRealizationFinished(iens))
                self.assertEqual(odd_ctx.get_run_args(iens).geo_id, iens)

        def any_is_running():
            return even_ctx.isRunning() or odd_ctx.isRunning()

        wait_until(
            func=(lambda: self.assertFalse(any_is_running())), timeout=90
        )

        self.assertEqual(even_ctx.getNumFailed(), 0)
        self.assertEqual(even_ctx.getNumRunning(), 0)
        self.assertEqual(even_ctx.getNumSuccess(), size / 2)

        self.assertEqual(odd_ctx.getNumFailed(), 0)
        self.assertEqual(odd_ctx.getNumRunning(), 0)
        self.assertEqual(odd_ctx.getNumSuccess(), size / 2)

        even_state_map = even_half.getStateMap()
        odd_state_map = odd_half.getStateMap()

        for iens in range(size):
            if iens % 2 == 0:
                self.assertTrue(even_ctx.didRealizationSucceed(iens))
                self.assertFalse(even_ctx.didRealizationFail(iens))
                self.assertTrue(even_ctx.isRealizationFinished(iens))
                self.assertEqual(
                    even_state_map[iens], RealizationStateEnum.STATE_HAS_DATA
                )
            else:
                self.assertTrue(odd_ctx.didRealizationSucceed(iens))
                self.assertFalse(odd_ctx.didRealizationFail(iens))
                self.assertTrue(odd_ctx.isRealizationFinished(iens))
                self.assertEqual(
                    odd_state_map[iens], RealizationStateEnum.STATE_HAS_DATA
                )
def test_terminate_jobs(self):
    # Executes itself recursively and sleeps for 100 seconds
    with open("dummy_executable", "w") as f:
        f.write(
            """#!/usr/bin/env python
import sys, os, time
counter = eval(sys.argv[1])
if counter > 0:
    os.fork()
    os.execv(sys.argv[0], [sys.argv[0], str(counter - 1)])
else:
    time.sleep(100)"""
        )

    executable = os.path.realpath("dummy_executable")
    os.chmod("dummy_executable", stat.S_IRWXU | stat.S_IRWXO | stat.S_IRWXG)

    self.job_list = {
        "umask": "0002",
        "DATA_ROOT": "",
        "global_environment": {},
        "global_update_path": {},
        "jobList": [
            {
                "name": "dummy_executable",
                "executable": executable,
                "target_file": None,
                "error_file": None,
                "start_file": None,
                "stdout": "dummy.stdout",
                "stderr": "dummy.stderr",
                "stdin": None,
                "argList": ["3"],
                "environment": None,
                "exec_env": None,
                "license_path": None,
                "max_running_minutes": None,
                "max_running": None,
                "min_arg": 1,
                "arg_types": [],
                "max_arg": None,
            }
        ],
        "run_id": "",
        "ert_pid": "",
    }

    with open("jobs.json", "w") as f:
        f.write(json.dumps(self.job_list))

    # macOS doesn't provide /usr/bin/setsid, so we roll our own
    with open("setsid", "w") as f:
        f.write(
            dedent(
                """\
                #!/usr/bin/env python
                import os
                import sys
                os.setsid()
                os.execvp(sys.argv[1], sys.argv[1:])
                """
            )
        )
    os.chmod("setsid", 0o755)

    job_dispatch_script = importlib.util.find_spec(
        "job_runner.job_dispatch"
    ).origin
    job_dispatch_process = Popen(
        [
            os.getcwd() + "/setsid",
            sys.executable,
            job_dispatch_script,
            os.getcwd(),
        ]
    )

    p = psutil.Process(job_dispatch_process.pid)

    # Three levels of processes should spawn 8 children in total
    wait_until(lambda: self.assertEqual(len(p.children(recursive=True)), 8))

    p.terminate()

    wait_until(lambda: self.assertEqual(len(p.children(recursive=True)), 0))

    os.wait()  # allow os to clean up zombie processes