def test_get_timestamps(self):
    """
    Test that LogUtils.get_timestamps() works correctly.

    Runs a small job so at least one accounting record is written,
    locates today's accounting log under
    PBS_HOME/server_priv/accounting, and asks PBSLogUtils for a
    single timestamp from it.
    """
    # Submit a 1 cpu job, this will create some accounting records
    a = {'Resource_List.ncpus': 1}
    j = Job(attrs=a)
    jid = self.server.submit(j)
    self.server.expect(JOB, {'job_state': 'R'}, id=jid)
    acctpath = os.path.join(self.server.pbs_conf["PBS_HOME"],
                            "server_priv", "accounting")
    self.assertTrue(
        self.du.isdir(self.server.hostname, acctpath, sudo=True))
    logs = self.du.listdir(self.server.hostname, acctpath, sudo=True)
    # accounting logs are named by date, so the lexicographically
    # greatest entry is today's log
    log_today = sorted(logs, reverse=True)[0]
    # BUG FIX: the original passed 'acctpath' (the accounting
    # *directory*, already verified by isdir above) to isfile();
    # the intent is to verify today's log is a regular file.
    self.assertTrue(
        self.du.isfile(self.server.hostname, log_today, sudo=True))
    # Call LogUtils.get_timestamps() for today's accounting log file
    lu = PBSLogUtils()
    tm = lu.get_timestamps(log_today, self.server.hostname, num=1,
                           sudo=True)
    self.assertIsNotNone(tm)
    self.assertEqual(len(tm), 1)
def test_job_array(self):
    """
    Test that a job array switches from accruing eligible time to
    ineligible time when its last subjob starts running
    """
    log_utils = PBSLogUtils()
    # two cpus: subjobs 1 and 2 run, subjob 3 has to wait
    self.server.manager(MGR_CMD_SET, NODE,
                        {'resources_available.ncpus': 2},
                        id=self.mom.shortname)
    self.server.manager(MGR_CMD_SET, SERVER, {'log_events': 2047})
    jarray = Job(TEST_USER, attrs={ATTR_J: '1-3'})
    jarray.set_sleep_time(20)
    jid = self.server.submit(jarray)
    jid_short = jid.split('[')[0]
    sjid1, sjid2, sjid3 = (jid_short + '[%d]' % i for i in (1, 2, 3))
    # Capture the time stamp when accrue type changed to
    # eligible time
    msg1 = jid + ";Accrue type has changed to eligible_time"
    t1 = log_utils.convert_date_time(
        self.server.log_match(msg1)[1].split(';')[0])
    for sjid, state in ((sjid1, 'R'), (sjid2, 'R'), (sjid3, 'Q')):
        self.server.expect(JOB, {ATTR_state: state}, id=sjid,
                           extend='t')
    # accrue_type = 2 is eligible_time
    self.server.expect(JOB, {'accrue_type': 2}, id=jid)
    self.logger.info("subjobs 1 and 2 finished; subjob 3 must run now")
    self.server.expect(JOB, {ATTR_state: 'R'}, id=sjid3, extend='t',
                       offset=20)
    self.server.expect(JOB, {'accrue_type': 1}, id=jid)
    # Capture the time stamp when accrue type changed to
    # ineligible time
    msg2 = jid + ";Accrue type has changed to ineligible_time"
    t2 = log_utils.convert_date_time(
        self.server.log_match(msg2)[1].split(';')[0])
    eligible_time = t2 - t1
    # the server should report exactly the interval measured above
    expected = (jid + ";Accrue type has changed to ineligible_time, "
                      "previous accrue type was eligible_time"
                      " for %d secs, "
                      "total eligible_time=00:00:%d"
                % (eligible_time, eligible_time))
    self.server.log_match(expected)
class TestPythonRestartSettings(TestFunctional):
    """
    For addressing memory leak in server due to python objects
    python interpreter needs to be restarted.
    Previously there were macros in code to do that.
    The new design has added attributes in server to configure
    how frequently python interpreter should be restarted
    This test suite is to validate the server attributes.
    Actual memory leak test is still manual
    """
    # shared helper for parsing daemon log time stamps
    logutils = PBSLogUtils()

    def test_non_integer(self):
        """
        This is to test that qmgr will throw error
        when non-integer values are provided
        """
        exp_err = "Illegal attribute or resource value"
        # -1 will throw error
        try:
            self.server.manager(MGR_CMD_SET, SERVER,
                                {'python_restart_max_hooks': '-1'},
                                runas=ROOT_USER, logerr=True)
        # BUG FIX: 'except PbsManagerError, e:' is Python-2-only
        # syntax (a SyntaxError on Python 3); use the 'as' form.
        except PbsManagerError as e:
            self.assertTrue(exp_err in e.msg[0],
                            "Error message is not expected")
        try:
            self.server.manager(MGR_CMD_SET, SERVER,
                                {'python_restart_max_objects': '-1'},
                                runas=ROOT_USER, logerr=True)
        except PbsManagerError as e:
            self.assertTrue(exp_err in e.msg[0],
                            "Error message is not expected")
def match_str_in_input_file(self, file_path, file_pattern, search_str):
    """
    Assert that search string appears in the input file
    that matches file_pattern
    """
    # first directory entry whose name matches the pattern, or None
    matching = (entry
                for entry in self.du.listdir(path=file_path, sudo=True)
                if fnmatch.fnmatch(entry, file_pattern))
    input_file = next(matching, None)
    self.assertTrue(input_file is not None)
    with PBSLogUtils().open_log(input_file, sudo=True) as log_fd:
        contents = log_fd.read().decode()
    self.assertTrue(search_str in contents)
    self.logger.info("Found \"%s\" in %s" % (search_str, input_file))
def match_queue_name_in_input_file(self, input_file_pattern, qname):
    """
    Assert that qname appears in the hook debug input file
    that matches input_file_pattern
    """
    input_file = None
    entries = self.du.listdir(path=self.server_hooks_tmp_dir, sudo=True)
    for entry in entries:
        if not fnmatch.fnmatch(entry, input_file_pattern):
            continue
        input_file = entry
        break
    self.assertTrue(input_file is not None)
    search_str = 'pbs.event().job.queue=%s' % qname
    with PBSLogUtils().open_log(input_file, sudo=True) as log_fd:
        self.assertTrue(search_str in log_fd.read().decode())
    # clean up the debug file so later matches start fresh
    self.remove_files_match(input_file_pattern)
def wait_and_read_file(self, filename=''):
    """
    Wait up to ten seconds for a file to appear and then read it.

    :param filename: path of the file to read; must be non-empty
    :returns: list of lines (newlines stripped)
    :raises ValueError: if filename is empty
    """
    from ptl.utils.pbs_logutils import PBSLogUtils
    self.logger.info("Reading file: %s" % filename)
    if not filename:
        raise ValueError('Invalid filename')
    # poll once per second, giving up after ten attempts
    for _attempt in range(10):
        if os.path.isfile(filename):
            break
        time.sleep(1)
    self.assertTrue(os.path.isfile(filename),
                    "File not found: %s" % filename)
    with PBSLogUtils().open_log(filename, sudo=True) as log_fd:
        file_lines = log_fd.read().splitlines()
    return file_lines
def match_in_debug_file(self, input_file_pattern, search_list, mom=False):
    """
    Assert that all the strings in 'search_list' appears in the
    hook debug file that matches input_file_pattern

    :param mom: when True search the mom-side hook debug directory,
                otherwise the server-side one
    """
    hooks_tmp_dir = (self.mom_hooks_tmp_dir if mom
                     else self.server_hooks_tmp_dir)
    # first matching entry, or None when nothing matched
    input_file = next(
        (entry
         for entry in self.du.listdir(path=hooks_tmp_dir, sudo=True)
         if fnmatch.fnmatch(entry, input_file_pattern)), None)
    self.assertTrue(input_file is not None)
    with PBSLogUtils().open_log(input_file, sudo=True) as log_fd:
        contents = log_fd.read().decode()
    for expected in search_list:
        self.assertTrue(expected in contents)
    self.remove_files_match(input_file_pattern, mom)
def common_log_match(self, daemon):
    """
    Validate the env variable output in daemon logs

    :param daemon: which daemon's log to search: "mom" or "server".
                   Any other value logs a usage hint and returns.
    :raises PtlLogMatchError: if any expected message never appears
                              within the retry window
    """
    logutils = PBSLogUtils()
    # Expected environment assignments, verbatim as they appear in the
    # log; the backslashes are part of the matched text.
    logmsg = ["TEST_COMMA=1\,2\,3\,4",
              "TEST_SEMICOLON=;",
              "TEST_ENCLOSED=\\'\,\\'",
              "TEST_COLON=:",
              "TEST_BACKSLASH=\\\\",
              "TEST_DQUOTE=\\\"",
              "TEST_DQUOTE2=happy days\\\"are\\\"here to stay",
              "TEST_DQUOTE3=nothing compares\\\" to you",
              "TEST_DQUOTE4=\\\"music makes the people\\\"",
              "TEST_DQUOTE5=music \\\"makes \\'the\\'\\\"people",
              "TEST_DQUOTE6=lalaland\\\"",
              "TEST_SQUOTE=\\'",
              "TEST_SQUOTE2=happy\\'days",
              "TEST_SQUOTE3=the days\\'are here now\\'then",
              "TEST_SQUOTE4=\\'the way that was\\'",
              "TEST_SQUOTE5=music \\'makes \\\"the\\'\\\"people",
              "TEST_SQUOTE6=loving\\'",
              "TEST_SPECIAL={}[]()~@#$%^&*!",
              "TEST_SPECIAL2=<dumb-test_text>",
              "TEST_RETURN=\\'3\,",
              # Cannot add '\n' here because '\n' is not included in
              # the items of the list returned by log_lines(), (though
              # lines are split by '\n')
              "4\,",
              "5\\',"]
    # Pick which daemon's log file to scan.
    if (daemon == "mom"):
        self.logger.info("Matching in mom logs")
        logfile_type = self.mom
    elif (daemon == "server"):
        self.logger.info("Matching in server logs")
        logfile_type = self.server
    else:
        self.logger.info("Provide a valid daemon name; server or mom")
        return
    lines = None
    # line number at which the TEST_RETURN message matched; the "4\,"
    # and "5\\'" continuation lines must sit directly below it
    ret_linenum = 0
    search_msg = 'log match: searching for '
    nomatch_msg = ' No match for '
    for msg in logmsg:
        # retry each message for up to ~30s (60 attempts, 0.5s sleeps
        # on the no-match path) since log writes are asynchronous
        for attempt in range(1, 61):
            lines = self.server.log_lines(
                logfile_type, starttime=self.server.ctime)
            match = logutils.match_msg(lines, msg=msg)
            if match:
                # Dont want the test to pass if there are
                # unwanted matched for "4\," and "5\\'.
                if msg == "TEST_RETURN=\\'3\,":
                    ret_linenum = match[0]
                # NOTE(review): the list item above is "5\\'," (with a
                # trailing comma inside the string) but this comparison
                # uses "5\\'" — the second condition can never be true;
                # confirm which spelling is intended.
                if (msg == "4\," and match[0] != (ret_linenum - 1)) or \
                        (msg == "5\\'" and match[0] != (ret_linenum - 2)):
                    # a stray match elsewhere in the log; keep retrying
                    pass
                else:
                    self.logger.info(search_msg + msg + ' ... OK')
                    break
            else:
                self.logger.info(nomatch_msg + msg + ' attempt ' +
                                 str(attempt))
                time.sleep(0.5)
        if match is None:
            _msg = nomatch_msg + msg
            raise PtlLogMatchError(rc=1, rv=False, msg=_msg)
class TestPythonRestartSettings(TestFunctional):
    """
    For addressing memory leak in server due to python objects
    python interpreter needs to be restarted.
    Previously there were macros in code to do that.
    The new design has added attributes in server to configure
    how frequently python interpreter should be restarted
    This test suite is to validate the server attributes.
    Actual memory leak test is still manual
    """
    # shared helper for converting log time stamps to epoch seconds
    logutils = PBSLogUtils()

    def test_non_integer(self):
        """
        This is to test that qmgr will throw error
        when non-integer values are provided
        """
        exp_err = "Illegal attribute or resource value"
        # -1 will throw error
        try:
            self.server.manager(MGR_CMD_SET, SERVER,
                                {'python_restart_max_hooks': '-1'},
                                runas=ROOT_USER, logerr=True)
        except PbsManagerError as e:
            self.assertTrue(exp_err in e.msg[0],
                            "Error message is not expected")
        try:
            self.server.manager(MGR_CMD_SET, SERVER,
                                {'python_restart_max_objects': '-1'},
                                runas=ROOT_USER, logerr=True)
        except PbsManagerError as e:
            self.assertTrue(exp_err in e.msg[0],
                            "Error message is not expected")
        try:
            self.server.manager(MGR_CMD_SET, SERVER,
                                {'python_restart_min_interval': '-1'},
                                runas=ROOT_USER, logerr=True)
        except PbsManagerError as e:
            self.assertTrue(exp_err in e.msg[0],
                            "Error message is not expected")
        # 0 will also give error
        try:
            self.server.manager(MGR_CMD_SET, SERVER,
                                {'python_restart_max_hooks': 0},
                                runas=ROOT_USER, logerr=True)
        except PbsManagerError as e:
            self.assertTrue(exp_err in e.msg[0],
                            "Error message is not expected")
        try:
            self.server.manager(MGR_CMD_SET, SERVER,
                                {'python_restart_max_objects': 0},
                                runas=ROOT_USER, logerr=True)
        except PbsManagerError as e:
            self.assertTrue(exp_err in e.msg[0],
                            "Error message is not expected")
        try:
            self.server.manager(MGR_CMD_SET, SERVER,
                                {'python_restart_min_interval': 0},
                                runas=ROOT_USER, logerr=True)
        except PbsManagerError as e:
            self.assertTrue(exp_err in e.msg[0],
                            "Error message is not expected")
        # time-string forms that are zero or malformed must also fail
        try:
            self.server.manager(MGR_CMD_SET, SERVER,
                                {'python_restart_min_interval':
                                 "00:00:00"},
                                runas=ROOT_USER, logerr=True)
        except PbsManagerError as e:
            self.assertTrue(exp_err in e.msg[0],
                            "Error message is not expected")
        try:
            self.server.manager(MGR_CMD_SET, SERVER,
                                {'python_restart_min_interval':
                                 "HH:MM:SS"},
                                runas=ROOT_USER, logerr=True)
        except PbsManagerError as e:
            self.assertTrue(exp_err in e.msg[0],
                            "Error message is not expected")

    def test_non_manager(self):
        """
        Test that hook values can not be set as operator or users.
        """
        # operators get a permission error
        exp_err = "Cannot set attribute, read only or insufficient permission"
        try:
            self.server.manager(MGR_CMD_SET, SERVER,
                                {'python_restart_max_hooks': 30},
                                runas=OPER_USER, logerr=True)
        except PbsManagerError as e:
            self.assertIn(exp_err, e.msg[0],
                          "Error message is not expected")
        try:
            self.server.manager(MGR_CMD_SET, SERVER,
                                {'python_restart_max_objects': 2000},
                                runas=OPER_USER, logerr=True)
        except PbsManagerError as e:
            self.assertIn(exp_err, e.msg[0],
                          "Error message is not expected")
        try:
            self.server.manager(MGR_CMD_SET, SERVER,
                                {'python_restart_min_interval': 10},
                                runas=OPER_USER, logerr=True)
        except PbsManagerError as e:
            self.assertIn(exp_err, e.msg[0],
                          "Error message is not expected")
        # plain users get a different error than operators
        exp_err = "Unauthorized Request"
        try:
            self.server.manager(MGR_CMD_SET, SERVER,
                                {'python_restart_max_hooks': 30},
                                runas=TEST_USER, logerr=True)
        except PbsManagerError as e:
            self.assertIn(exp_err, e.msg[0],
                          "Error message is not expected")
        try:
            self.server.manager(MGR_CMD_SET, SERVER,
                                {'python_restart_max_objects': 2000},
                                runas=TEST_USER, logerr=True)
        except PbsManagerError as e:
            self.assertIn(exp_err, e.msg[0],
                          "Error message is not expected")
        try:
            self.server.manager(MGR_CMD_SET, SERVER,
                                {'python_restart_min_interval': 10},
                                runas=TEST_USER, logerr=True)
        except PbsManagerError as e:
            self.assertIn(exp_err, e.msg[0],
                          "Error message is not expected")

    def test_log_message(self):
        """
        Test that message logged in server_logs when values get set
        """
        self.server.manager(MGR_CMD_SET, SERVER,
                            {'python_restart_max_hooks': 200},
                            runas=ROOT_USER, logerr=True)
        self.server.log_match("python_restart_max_hooks = 200",
                              max_attempts=5)
        self.server.manager(MGR_CMD_SET, SERVER,
                            {'python_restart_max_objects': 2000},
                            runas=ROOT_USER, logerr=True)
        self.server.log_match("python_restart_max_objects = 2000",
                              max_attempts=5)
        self.server.manager(MGR_CMD_SET, SERVER,
                            {'python_restart_min_interval': "00:01:00"},
                            runas=ROOT_USER, logerr=True)
        self.server.log_match("python_restart_min_interval = 00:01:00",
                              max_attempts=5)

    def test_long_values(self):
        """
        Test that very long values are accepted
        """
        # 2147483647 is the largest 32-bit signed value
        self.server.manager(MGR_CMD_SET, SERVER,
                            {'python_restart_max_hooks': 2147483647},
                            runas=ROOT_USER, logerr=True)
        self.server.manager(MGR_CMD_SET, SERVER,
                            {'python_restart_max_objects': 2147483647},
                            runas=ROOT_USER, logerr=True)
        self.server.manager(MGR_CMD_SET, SERVER,
                            {'python_restart_min_interval': 2147483647},
                            runas=ROOT_USER, logerr=True)
        # presumably the HH:MM:SS equivalent of roughly the same
        # maximum interval — TODO confirm
        self.server.manager(MGR_CMD_SET, SERVER,
                            {'python_restart_min_interval':
                             "596523:00:00"},
                            runas=ROOT_USER, logerr=True)

    def test_set_unset(self):
        """
        Test that when unset attribte is not visible in qmgr.
        Also values will not change after server restart.
        """
        self.server.manager(MGR_CMD_SET, SERVER,
                            {'python_restart_max_hooks': 20},
                            runas=ROOT_USER, logerr=True)
        self.server.manager(MGR_CMD_SET, SERVER,
                            {'python_restart_max_objects': 20},
                            runas=ROOT_USER, logerr=True)
        self.server.manager(MGR_CMD_SET, SERVER,
                            {'python_restart_min_interval': "00:00:20"},
                            runas=ROOT_USER, logerr=True)
        # Restart server
        self.server.restart()
        # values must survive the restart
        self.server.expect(SERVER, {'python_restart_max_hooks': 20},
                           op=SET, runas=ROOT_USER)
        self.server.expect(SERVER, {'python_restart_max_objects': 20},
                           op=SET, runas=ROOT_USER)
        # "00:00:20" is expected back as 20 (seconds)
        self.server.expect(SERVER, {'python_restart_min_interval': 20},
                           op=SET, runas=ROOT_USER)
        self.server.manager(MGR_CMD_UNSET, SERVER,
                            'python_restart_max_hooks',
                            runas=ROOT_USER, logerr=True)
        self.server.manager(MGR_CMD_UNSET, SERVER,
                            'python_restart_max_objects',
                            runas=ROOT_USER, logerr=True)
        self.server.manager(MGR_CMD_UNSET, SERVER,
                            'python_restart_min_interval',
                            runas=ROOT_USER, logerr=True)
        # Restart server again
        self.server.restart()
        # unset attributes must stay invisible after restart
        self.server.expect(SERVER, "python_restart_max_hooks",
                           op=UNSET, runas=ROOT_USER)
        self.server.expect(SERVER, "python_restart_max_objects",
                           op=UNSET, runas=ROOT_USER)
        self.server.expect(SERVER, "python_restart_min_interval",
                           op=UNSET, runas=ROOT_USER)

    def test_max_hooks(self):
        """
        Test that python restarts at set interval
        """
        # create a hook
        hook_body = """
import pbs
e = pbs.event()
s = pbs.server()
localnode = pbs.get_local_nodename()
vn = pbs.server().vnode(localnode)
pbs.event().accept()
"""
        a = {'event': ["queuejob", "movejob", "modifyjob", "runjob"],
             'enabled': "True"}
        self.server.create_import_hook("test", a, hook_body,
                                       overwrite=True)
        # Create workq2
        a = {'queue_type': 'e', 'started': 't', 'enabled': 't'}
        self.server.manager(MGR_CMD_CREATE, QUEUE, a, "workq2")
        # Set max_hooks and min_interval so that further changes
        # will generate a log message.
        self.server.manager(MGR_CMD_SET, SERVER,
                            {'python_restart_max_hooks': 100},
                            runas=ROOT_USER)
        self.server.manager(MGR_CMD_SET, SERVER,
                            {'python_restart_min_interval': 30},
                            runas=ROOT_USER)
        # Need to run a job so these new settings are remembered
        j = Job()
        jid = self.server.submit(j)
        # Set server log_events
        self.server.manager(MGR_CMD_SET, SERVER, {"log_events": 2047})
        # Set time to start scanning logs
        time.sleep(1)
        stime = time.time()
        # Set max_hooks to low to hit max_hooks only
        self.server.manager(MGR_CMD_SET, SERVER,
                            {'python_restart_max_hooks': 1},
                            runas=ROOT_USER)
        # Set min_interval to 3
        self.server.manager(MGR_CMD_SET, SERVER,
                            {'python_restart_min_interval': 3},
                            runas=ROOT_USER)
        # Submit multiple jobs; each held job is altered, moved and
        # released to drive many hook executions
        for x in range(6):
            j = Job()
            j.set_attributes({ATTR_h: None})
            j.set_sleep_time(1)
            jid = self.server.submit(j)
            self.server.expect(JOB, {'job_state': "H"}, id=jid)
            self.server.alterjob(jid, {ATTR_N: "yaya"})
            self.server.movejob(jid, "workq2")
            self.server.rlsjob(jid, None)
            time.sleep(1)
        # Verify the logs and make sure that python interpreter is
        # restarted every 3s
        logs = self.server.log_match(
            "Restarting Python interpreter to reduce mem usage",
            allmatch=True, starttime=stime, max_attempts=8)
        self.assertTrue(len(logs) > 1)
        log1 = logs[0][1]
        log2 = logs[1][1]
        tmp = log1.split(';')
        # Convert the time into epoch time
        time1 = int(self.logutils.convert_date_time(tmp[0]))
        tmp = log2.split(';')
        time2 = int(self.logutils.convert_date_time(tmp[0]))
        # Difference between log message should not be less than 3
        diff = time2 - time1
        self.logger.info("Time difference between log message is " +
                         str(diff) + " seconds")
        # Leave a little wiggle room for slow systems
        self.assertTrue(diff >= 3 and diff <= 5)
        # This message only gets printed if /proc/self/statm is present
        if os.path.isfile("/proc/self/statm"):
            self.server.log_match("Current memory usage:",
                                  starttime=self.server.ctime,
                                  max_attempts=5)
        else:
            self.server.log_match("unknown", max_attempts=5)
        # Verify other log messages
        self.server.log_match("python_restart_max_hooks is now 1",
                              starttime=stime, max_attempts=5)
        self.server.log_match("python_restart_min_interval is now 3",
                              starttime=stime, max_attempts=5)

    def test_max_objects(self):
        """
        Test that python restarts if max objects limit have met
        """
        hook_body = """
import pbs
pbs.event().accept()
"""
        a = {'event': ["queuejob", "modifyjob"], 'enabled': 'True'}
        self.server.create_import_hook("test", a, hook_body,
                                       overwrite=True)
        # Set max_objects and min_interval so that further changes
        # will generate a log message.
        self.server.manager(MGR_CMD_SET, SERVER,
                            {'python_restart_max_objects': 1000},
                            runas=ROOT_USER)
        self.server.manager(MGR_CMD_SET, SERVER,
                            {'python_restart_min_interval': 30},
                            runas=ROOT_USER)
        # Need to run a job so these new settings are remembered
        j = Job()
        jid = self.server.submit(j)
        # Set server log_events
        self.server.manager(MGR_CMD_SET, SERVER, {"log_events": 2047})
        # Set time to start scanning logs
        time.sleep(1)
        stime = time.time()
        # Set max_objects only
        self.server.manager(MGR_CMD_SET, SERVER,
                            {'python_restart_max_objects': 1},
                            runas=ROOT_USER)
        # Set min_interval to 1
        self.server.manager(MGR_CMD_SET, SERVER,
                            {'python_restart_min_interval': '00:00:01'},
                            runas=ROOT_USER)
        # Submit held jobs
        for x in range(3):
            j = Job()
            j.set_attributes({ATTR_h: None})
            j.set_sleep_time(1)
            jid = self.server.submit(j)
            self.server.expect(JOB, {'job_state': "H"}, id=jid)
            self.server.alterjob(jid, {ATTR_N: "yaya"})
        # Verify that python is restarted
        self.server.log_match(
            "Restarting Python interpreter to reduce mem usage",
            starttime=self.server.ctime, max_attempts=5)
        # This message only gets printed if
        # /proc/self/statm presents
        if os.path.isfile("/proc/self/statm"):
            self.server.log_match("Current memory usage:",
                                  starttime=self.server.ctime,
                                  max_attempts=5)
        else:
            self.server.log_match("unknown", max_attempts=5)
        # Verify other log messages
        self.server.log_match("python_restart_max_objects is now 1",
                              starttime=stime, max_attempts=5)
        self.server.log_match("python_restart_min_interval is now 1",
                              starttime=stime, max_attempts=5)

    def test_no_restart(self):
        """
        Test that if limit not reached then python interpreter
        will not be started
        """
        hook_body = """
import pbs
pbs.event().accept()
"""
        a = {'event': "queuejob", 'enabled': "True"}
        self.server.create_import_hook("test", a, hook_body,
                                       overwrite=True)
        # Set max_hooks, max_objects, and min_interval to large values
        # to avoid restarting the Python interpreter.
        self.server.manager(MGR_CMD_SET, SERVER,
                            {'python_restart_max_hooks': 10000},
                            runas=ROOT_USER)
        self.server.manager(MGR_CMD_SET, SERVER,
                            {'python_restart_max_objects': 10000},
                            runas=ROOT_USER)
        self.server.manager(MGR_CMD_SET, SERVER,
                            {'python_restart_min_interval': 10000},
                            runas=ROOT_USER)
        stime = time.time()
        # Submit a job
        for x in range(10):
            j = Job()
            j.set_sleep_time(1)
            jid = self.server.submit(j)
        # Verify no restart message
        msg = "Restarting Python interpreter to reduce mem usage"
        self.server.log_match(msg, starttime=stime,
                              max_attempts=8, existence=False)
class TestPbsExecjobEnd(TestFunctional):
    """
    This tests the feature in PBS that allows execjob_end hook to
    execute such that pbs_mom is not blocked upon execution.

    The shared execjob_end hook logs a start message, sleeps 10s, then
    logs an end message; the tests verify other work (periodic hooks,
    other jobs) proceeds during that sleep.
    """
    # shared helper for converting log time stamps to epoch seconds
    logutils = PBSLogUtils()

    def setUp(self):
        TestFunctional.setUp(self)
        # common attributes and body for the execjob_end hook used by
        # most tests in this class
        self.attr = {'event': 'execjob_end', 'enabled': 'True',
                     'alarm': '50'}
        self.hook_body = ("import pbs\n"
                          "import time\n"
                          "e = pbs.event()\n"
                          "pbs.logjobmsg(e.job.id, "
                          "'executed execjob_end hook')\n"
                          "time.sleep(10)\n"
                          "pbs.logjobmsg(e.job.id, "
                          "'execjob_end hook ended')\n"
                          "e.accept()\n")

    def test_execjob_end_non_blocking(self):
        """
        Test to make sure that mom is unblocked and running
        exechost_periodic hook while it's child process is executing
        execjob_end hook.
        """
        hook_name = "execjob_end_logmsg"
        self.server.create_import_hook(hook_name, self.attr,
                                       self.hook_body)
        hook_name = "exechost_periodic_logmsg"
        hook_body = ("import pbs\n"
                     "e = pbs.event()\n"
                     "pbs.logmsg(pbs.LOG_DEBUG, "
                     "'executed exechost_periodic hook')\n"
                     "e.accept()\n")
        attr = {'event': 'exechost_periodic', 'freq': '3',
                'enabled': 'True'}
        j = Job(TEST_USER)
        j.set_sleep_time(1)
        self.server.create_import_hook(hook_name, attr, hook_body)
        jid = self.server.submit(j)
        # need to verify hook messages in the below mentioned order to
        # confirm mom is not blocked on execjob_end hook execution.
        # The order is verified with the use of starttime and endtime
        # parameters.
        (_, str1) = self.mom.log_match("Job;%s;executed execjob_end hook"
                                       % jid, n=100, max_attempts=10,
                                       interval=2)
        date_time1 = str1.split(";")[0]
        epoch1 = self.logutils.convert_date_time(date_time1)
        # following message should be logged while execjob_end hook is
        # in sleep
        (_, str1) = self.mom.log_match("executed exechost_periodic hook",
                                       starttime=epoch1 - 1,
                                       endtime=epoch1 + 10, n=100,
                                       max_attempts=10, interval=1)
        date_time2 = str1.split(";")[0]
        epoch2 = self.logutils.convert_date_time(date_time2)
        (_, str1) = self.mom.log_match("Job;%s;execjob_end hook ended"
                                       % jid, starttime=epoch2 - 1,
                                       n=100, max_attempts=10,
                                       interval=2)
        date_time3 = str1.split(";")[0]
        self.logger.info(
            "execjob_end hook executed at: %s,"
            "exechost_periodic at: %s and execjob_end hook ended at: %s"
            % (date_time1, date_time2, date_time3))

    def test_execjob_end_hook_order_and_reject(self):
        """
        Test with multiple execjob_end hooks having different order
        with one of the hooks rejecting the job.
        """
        hook_name1 = "execjob_end_logmsg1"
        hook_body_accept = ("import pbs\n"
                            "e = pbs.event()\n"
                            "pbs.logjobmsg(e.job.id, "
                            "'executed %s hook' % e.hook_name)\n"
                            "e.accept()\n")
        attr = {'event': 'execjob_end', 'order': '1', 'enabled': 'True'}
        self.server.create_import_hook(hook_name1, attr,
                                       hook_body_accept)
        hook_name = "execjob_end_logmsg2"
        hook_body_reject = (
            "import pbs\n"
            "e = pbs.event()\n"
            "pbs.logjobmsg(e.job.id, 'executed execjob_end hook')\n"
            "e.reject('Job is rejected')\n")
        attr = {'event': 'execjob_end', 'order': '2', 'enabled': 'True'}
        self.server.create_import_hook(hook_name, attr,
                                       hook_body_reject)
        hook_name2 = "execjob_end_logmsg3"
        attr = {'event': 'execjob_end', 'order': '170',
                'enabled': 'True'}
        self.server.create_import_hook(hook_name2, attr,
                                       hook_body_accept)
        j = Job(TEST_USER)
        j.set_sleep_time(1)
        jid = self.server.submit(j)
        # first hook runs, second hook rejects ...
        self.mom.log_match("Job;%s;executed %s hook" % (jid, hook_name1),
                           n=100, max_attempts=10, interval=2)
        self.mom.log_match("Job;%s;Job is rejected" % jid,
                           n=100, max_attempts=10, interval=2)
        # ... so the third (order=170) hook must never run
        self.mom.log_match("Job;%s;executed %s hook" % (jid, hook_name2),
                           n=100, max_attempts=10, interval=2,
                           existence=False)

    def test_execjob_end_multi_job(self):
        """
        Test to make sure that mom is unblocked
        with execjob_end hook with mutiple jobs
        """
        status = self.server.status(NODE, id=self.mom.shortname)
        # BUG FIX: compare ncpus numerically; the original compared
        # strings ('< "2"'), which mis-orders values such as "10".
        if int(status[0]["resources_available.ncpus"]) < 2:
            self.skip_test(reason="need 2 or more available ncpus")
        hook_name = "execjob_end_logmsg4"
        self.server.create_import_hook(hook_name, self.attr,
                                       self.hook_body)
        # jobs need to land on the same host even in a multi-node setup
        a = {'Resource_List.select':
             '1:ncpus=1:host=%s' % self.mom.shortname}
        j = Job(TEST_USER, attrs=a)
        j.set_sleep_time(1)
        jid1 = self.server.submit(j)
        # jid1 should be in E state, in sleep of execjob_end hook for
        # jid2 submmision.
        self.server.expect(JOB, {'job_state': 'E'}, id=jid1)
        j = Job(TEST_USER, attrs=a)
        j.set_sleep_time(1)
        jid2 = self.server.submit(j)
        # hook message of jid2 should be logged after the message of
        # jid1 and before completion of sleep in hook for jid1 inorder
        # to prove mom is not in blocked state.
        (_, str1) = self.mom.log_match("Job;%s;executed execjob_end hook"
                                       % jid1, n=100, max_attempts=10,
                                       interval=2)
        date_time1 = str1.split(";")[0]
        epoch1 = self.logutils.convert_date_time(date_time1)
        # hook message for jid2 should appear while hook is in sleep
        # for jid1
        (_, str1) = self.mom.log_match("Job;%s;executed execjob_end hook"
                                       % jid2, starttime=epoch1 - 1,
                                       endtime=epoch1 + 10, n=100,
                                       max_attempts=10, interval=1)
        date_time1 = str1.split(";")[0]
        epoch1 = self.logutils.convert_date_time(date_time1)
        (_, str1) = self.mom.log_match("Job;%s;execjob_end hook ended"
                                       % jid1, starttime=epoch1 - 1,
                                       n=100, max_attempts=10,
                                       interval=2)
        self.mom.log_match("Job;%s;execjob_end hook ended" % jid2,
                           n=100, max_attempts=10, interval=2)

    def test_execjob_end_non_blocking_multi_node(self):
        """
        Test to make sure sister mom is unblocked
        when execjob_end hook is running on sister mom
        """
        if len(self.moms) != 2:
            self.skip_test(reason="need 2 mom hosts: -p moms=<m1>:<m2>")
        # BUG FIX: dict views are not indexable on Python 3;
        # materialize the values before subscripting.
        moms = list(self.moms.values())
        self.momA = moms[0]
        self.momB = moms[1]
        hook_name = "execjob_end_logmsg5"
        self.server.create_import_hook(hook_name, self.attr,
                                       self.hook_body)
        hook_name = "exechost_periodic_logmsg2"
        hook_body = ("import pbs\n"
                     "e = pbs.event()\n"
                     "pbs.logmsg(pbs.LOG_DEBUG, "
                     "'executed exechost_periodic hook')\n"
                     "e.accept()\n")
        attr = {'event': 'exechost_periodic', 'freq': '3',
                'enabled': 'True'}
        a = {'Resource_List.select':
             '1:ncpus=1:host=%s+1:ncpus=1:host=%s' %
             (self.momA.shortname, self.momB.shortname)}
        j = Job(TEST_USER, attrs=a)
        j.set_sleep_time(1)
        self.server.create_import_hook(hook_name, attr, hook_body)
        jid = self.server.submit(j)
        # BUG FIX: iteritems() does not exist on Python 3; items()
        # behaves identically here.
        for host, mom in self.moms.items():
            (_, str1) = mom.log_match("Job;%s;executed execjob_end hook"
                                      % jid, n=100, max_attempts=10,
                                      interval=2)
            date_time1 = str1.split(";")[0]
            epoch1 = self.logutils.convert_date_time(date_time1)
            # periodic hook must fire while execjob_end is sleeping
            (_, str1) = mom.log_match("executed exechost_periodic hook",
                                      starttime=epoch1 - 1,
                                      endtime=epoch1 + 10, n=100,
                                      max_attempts=10, interval=1)
            date_time2 = str1.split(";")[0]
            epoch2 = self.logutils.convert_date_time(date_time2)
            (_, str1) = mom.log_match("Job;%s;execjob_end hook ended"
                                      % jid, starttime=epoch2 - 1,
                                      n=100, max_attempts=10,
                                      interval=2)
            date_time3 = str1.split(";")[0]
            msg = "Got expected log_msg on host:%s" % host
            self.logger.info(msg)
            self.logger.info(
                "execjob_end hook executed at: %s,"
                "exechost_periodic at: %s and execjob_end hook ended "
                "at: %s" % (date_time1, date_time2, date_time3))

    def test_execjob_end_delete_request(self):
        """
        Test to make sure execjob_end hook is running
        after job force deletion request(IS_DISCARD_JOB)
        when mom is unblocked.
        """
        hook_name = "execjob_end_logmsg6"
        self.server.create_import_hook(hook_name, self.attr,
                                       self.hook_body)
        # NOTE(review): with more than two moms 'j' would be unbound
        # below; the test presumably only runs with 1 or 2 moms.
        if len(self.moms) == 2:
            # BUG FIX: dict views are not indexable on Python 3
            moms = list(self.moms.values())
            self.momA = moms[0]
            self.momB = moms[1]
            a = {'Resource_List.select':
                 '1:ncpus=1:host=%s+1:ncpus=1:host=%s' %
                 (self.momA.shortname, self.momB.shortname)}
            j = Job(TEST_USER, attrs=a)
        elif len(self.moms) == 1:
            j = Job(TEST_USER)
        j.set_sleep_time(10)
        jid = self.server.submit(j)
        self.server.expect(JOB, {'job_state': 'R'}, id=jid)
        self.server.deljob(id=jid, wait=True, attr_W="force")
        # BUG FIX: iteritems() does not exist on Python 3
        for host, mom in self.moms.items():
            mom.log_match("Job;%s;executed execjob_end hook" % jid,
                          n=100, max_attempts=10, interval=2)
            mom.log_match("Job;%s;execjob_end hook ended" % jid,
                          n=100, max_attempts=10, interval=2)
            msg = "Got expected log_msg on host:%s" % host
            self.logger.info(msg)
class TestPbsExecjobSuspendResume(TestFunctional): """ Tests the hook events execjob_postsuspend, execjob_preresume which are called after a job is suspended, and before the job is resumed. """ logutils = PBSLogUtils() def setUp(self): if len(self.moms) != 2: self.skipTest('test requires two MoMs as input, ' + 'use -p moms=<mom1>:<mom2>') TestFunctional.setUp(self) self.momA = self.moms.values()[0] self.momB = self.moms.values()[1] # execjob_postsuspend hook self.postsuspend_hook_body = """import pbs e=pbs.event() def proc_status(pid): try: for line in open("/proc/%d/status" % pid).readlines(): if line.startswith("State:"): return line.split(":",1)[1].strip().split(' ')[0] except: pass return None def print_attribs(pbs_obj, header): for a in pbs_obj.attributes: v = getattr(pbs_obj, a) if v and str(v): pbs.logjobmsg(e.job.id, "%s: %s = %s" % (header, a, v)) if a == "session_id": st = proc_status(v) if st == 'T': pbs.logjobmsg(e.job.id, "%s: process seen as suspended" % header) if e.type == pbs.EXECJOB_POSTSUSPEND: pbs.logmsg(pbs.LOG_DEBUG, "%s;called execjob_postsuspend hook" %e.job.id) print_attribs(e.job, "JOB") for vn in e.vnode_list: v = e.vnode_list[vn] print_attribs(v, "vnode_list[" + vn + "]") """ # execjob_postsuspend hook, reject self.postsuspend_hook_reject_body = """import pbs e=pbs.event() job=e.job e.reject("bad suspend on ms") """ # execjob_postsuspend hook, reject by sister only self.postsuspend_hook_sis_reject_body = """import pbs e=pbs.event() job=e.job if not e.job.in_ms_mom(): e.reject("bad suspend on sis") """ # hook with an unhandled exception self.hook_error_body = """import pbs e=pbs.event() job=e.job raise NameError """ # hook with an unhandled exception, sister only self.hook_sis_error_body = """import pbs e=pbs.event() job=e.job if not job.in_ms_mom(): raise NameError """ # execjob_preresume hook self.preresume_hook_body = """import pbs e=pbs.event() def proc_status(pid): try: for line in open("/proc/%d/status" % pid).readlines(): if 
line.startswith("State:"): return line.split(":",1)[1].strip().split(' ')[0] except: pass return None def print_attribs(pbs_obj, header): for a in pbs_obj.attributes: v = getattr(pbs_obj, a) if v and str(v): pbs.logjobmsg(e.job.id, "%s: %s = %s" % (header, a, v)) if a == "session_id": st = proc_status(v) if st == 'T': pbs.logjobmsg(e.job.id, "%s: process seen as suspended" % header) if e.type == pbs.EXECJOB_PRERESUME: pbs.logmsg(pbs.LOG_DEBUG, "%s;called execjob_preresume hook" %e.job.id) print_attribs(e.job, "JOB") for vn in e.vnode_list: v = e.vnode_list[vn] print_attribs(v, "vnode_list[" + vn + "]") """ # execjob_preresume hook, reject self.preresume_hook_reject_body = """import pbs e=pbs.event() job=e.job e.reject("bad resumption on ms") """ # execjob_preresume hook, reject by sister only self.preresume_hook_sis_reject_body = """import pbs e=pbs.event() job=e.job if not e.job.in_ms_mom(): e.reject("bad resumption on sis") """ # job used in the tests self.j = Job(self.du.get_current_user()) script = """ #PBS -l select=2:ncpus=1 #PBS -l place=scatter #PBS -S /bin/bash pbsdsh -n 1 -- sleep 60 & sleep 60 """ self.j.create_script(script) def test_execjob_postsuspend(self): """ An execjob_postsuspend hook is executed by primary mom and then by the connected sister moms after a job has been suspended. 
""" # instantiate execjob_postsuspend hook hook_event = 'execjob_postsuspend' hook_name = 'psus' a = {'event': hook_event, 'enabled': 'true'} self.server.create_import_hook(hook_name, a, self.postsuspend_hook_body) # Submit a job jid = self.server.submit(self.j) self.server.expect(JOB, {ATTR_state: 'R'}, id=jid) self.server.status(JOB, 'exec_vnodes', id=jid) job_node = self.j.get_vnodes()[0] # Suspend job self.server.sigjob(jobid=jid, signal="suspend") for vn in [self.momA, self.momB]: if vn == self.momA: vn.log_match("Job;%s;signal job with suspend" % jid) else: vn.log_match("Job;%s;SUSPEND received" % jid) vn.log_match("%s;called execjob_postsuspend hook" % jid) if vn == self.momA: # as postsuspend hook is executing, # job's process should be seen as suspended vn.log_match("Job;%s;JOB: process seen as suspended" % jid) # Check presence of pbs.event().job vn.log_match("Job;%s;JOB: id = %s" % (jid, jid)) # Check presence of vnode_list[] parameter vnode_list = [self.momA.name, self.momB.name] for v in vnode_list: vn.log_match("Job;%s;vnode_list[%s]: name = %s" % (jid, job_node, job_node)) # after hook executes, job continues to be suspended self.server.expect(JOB, {ATTR_state: 'S'}, id=jid) def test_execjob_preresume(self): """ An execjob_preresume hook is executed by primary mom and then by the connected sister moms just before a job is resumed. 
""" # instantiate execjob_preresume hook hook_event = 'execjob_preresume' hook_name = 'pres' a = {'event': hook_event, 'enabled': 'true'} self.server.create_import_hook(hook_name, a, self.preresume_hook_body) # Submit a job jid = self.server.submit(self.j) self.server.expect(JOB, {ATTR_state: 'R'}, id=jid) self.server.status(JOB, 'exec_vnodes', id=jid) job_node = self.j.get_vnodes()[0] # Suspend, then resume job self.server.sigjob(jobid=jid, signal="suspend") self.server.expect(JOB, {ATTR_state: 'S'}, id=jid) self.server.sigjob(jobid=jid, signal="resume") for vn in [self.momA, self.momB]: if vn == self.momA: vn.log_match("Job;%s;signal job with resume" % jid) else: vn.log_match("Job;%s;RESUME received" % jid) vn.log_match("%s;called execjob_preresume hook" % jid) if vn == self.momA: # as preresume hook is executing, # job's process should be seen as suspended vn.log_match("Job;%s;JOB: process seen as suspended" % jid) # Check presence of pbs.event().job vn.log_match("Job;%s;JOB: id = %s" % (jid, jid)) # Check presence of vnode_list[] parameter vnode_list = [self.momA.name, self.momB.name] for v in vnode_list: vn.log_match("Job;%s;vnode_list[%s]: name = %s" % (jid, job_node, job_node)) # after hook executes, job should be running again. self.server.expect(JOB, {ATTR_state: 'R'}, id=jid) def test_execjob_postsuspend_reject(self): """ An execjob_postsuspend hook that results in a reject action would not affect the currently suspended job. 
""" # instantiate execjob_postsuspend hook hook_event = 'execjob_postsuspend' hook_name = 'psus' a = {'event': hook_event, 'enabled': 'true'} self.server.create_import_hook(hook_name, a, self.postsuspend_hook_reject_body) # Submit a job jid = self.server.submit(self.j) self.server.expect(JOB, {ATTR_state: 'R'}, id=jid) # Suspend job self.server.sigjob(jobid=jid, signal="suspend") hook_msg = "bad suspend on ms" reject_msg = "%s hook rejected request: %s" % (hook_event, hook_msg) for vn in [self.momA, self.momB]: if vn == self.momA: vn.log_match("Job;%s;signal job with suspend" % jid) else: vn.log_match("Job;%s;SUSPEND received" % jid) vn.log_match("Job;%s;%s" % (jid, hook_msg)) vn.log_match("Job;%s;%s" % (jid, reject_msg)) # after hook executes, job continues to be suspended self.server.expect(JOB, {ATTR_state: 'S'}, id=jid) def test_execjob_postsuspend_reject_sis(self): """ An execjob_postsuspend hook that results in a reject action by sister mom only would not affect the currently suspended job. 
""" # instantiate execjob_postsuspend hook hook_event = 'execjob_postsuspend' hook_name = 'psus' a = {'event': hook_event, 'enabled': 'true'} self.server.create_import_hook(hook_name, a, self.postsuspend_hook_sis_reject_body) # Submit a job jid = self.server.submit(self.j) self.server.expect(JOB, {ATTR_state: 'R'}, id=jid) # Suspend job self.server.sigjob(jobid=jid, signal="suspend") hook_msg = "bad suspend on sis" reject_msg = "%s hook rejected request: %s" % (hook_event, hook_msg) for vn in [self.momA, self.momB]: if vn == self.momA: vn.log_match("Job;%s;signal job with suspend" % jid) vn.log_match("Job;%s;%s" % (jid, hook_msg), existence=False, max_attempts=30) vn.log_match("Job;%s;%s" % (jid, reject_msg), existence=False, max_attempts=30) else: vn.log_match("Job;%s;SUSPEND received" % jid) vn.log_match("Job;%s;%s" % (jid, hook_msg)) vn.log_match("Job;%s;%s" % (jid, reject_msg)) # after hook executes, job continues to be suspended self.server.expect(JOB, {ATTR_state: 'S'}, id=jid) def test_execjob_postsuspend_error(self): """ An execjob_postsuspend hook that results in an error action would not affect the currently suspended job. 
""" # instantiate execjob_postsuspend hook hook_event = 'execjob_postsuspend' hook_name = 'psus' a = {'event': hook_event, 'enabled': 'true'} self.server.create_import_hook(hook_name, a, self.hook_error_body) # Submit a job jid = self.server.submit(self.j) self.server.expect(JOB, {ATTR_state: 'R'}, id=jid) # Suspend job self.server.sigjob(jobid=jid, signal="suspend") error_msg = \ "%s hook \'%s\' encountered an exception, request rejected" \ % (hook_event, hook_name) for vn in [self.momA, self.momB]: if vn == self.momA: vn.log_match("Job;%s;signal job with suspend" % jid) else: vn.log_match("Job;%s;SUSPEND received" % jid) vn.log_match("Job;%s;%s" % (jid, error_msg)) # after hook executes, job continues to be suspended self.server.expect(JOB, {ATTR_state: 'S'}, id=jid) def test_execjob_postsuspend_error_sis(self): """ An execjob_postsuspend hook that results in a error action by sister mom only would not affect the currently suspended job. """ # instantiate execjob_postsuspend hook hook_event = 'execjob_postsuspend' hook_name = 'psus' a = {'event': hook_event, 'enabled': 'true'} self.server.create_import_hook(hook_name, a, self.hook_sis_error_body) # Submit a job jid = self.server.submit(self.j) self.server.expect(JOB, {ATTR_state: 'R'}, id=jid) # Suspend job self.server.sigjob(jobid=jid, signal="suspend") error_msg = \ "%s hook \'%s\' encountered an exception, request rejected" \ % (hook_event, hook_name) for vn in [self.momA, self.momB]: if vn == self.momA: vn.log_match("Job;%s;signal job with suspend" % jid) vn.log_match("Job;%s;%s" % (jid, error_msg), existence=False, max_attempts=30) else: vn.log_match("Job;%s;SUSPEND received" % jid) vn.log_match("Job;%s;%s" % (jid, error_msg)) # after hook executes, job continues to be suspended self.server.expect(JOB, {ATTR_state: 'S'}, id=jid) def test_execjob_preresume_reject(self): """ An execjob_preresume hook that results in a reject action would prevent suspended job from being resumed. 
""" # instantiate execjob_preresume hook hook_event = 'execjob_preresume' hook_name = 'pres' a = {'event': hook_event, 'enabled': 'true'} self.server.create_import_hook(hook_name, a, self.preresume_hook_reject_body) # Submit a job jid = self.server.submit(self.j) self.server.expect(JOB, {ATTR_state: 'R'}, id=jid) # Suspend, then resume job self.server.sigjob(jobid=jid, signal="suspend") self.server.expect(JOB, {ATTR_state: 'S'}, id=jid) self.server.sigjob(jobid=jid, signal="resume") hook_msg = "bad resumption on ms" reject_msg = "%s hook rejected request: %s" % (hook_event, hook_msg) # Mom hook executes on momA and gets a rejection, # so a resume request is not sent to sister momB. self.momA.log_match("Job;%s;signal job with resume" % jid) self.momA.log_match("Job;%s;%s" % (jid, hook_msg)) self.momA.log_match("Job;%s;%s" % (jid, reject_msg)) # after hook executes, job continues to be suspended self.server.expect(JOB, {ATTR_state: 'S'}, id=jid) def test_execjob_preresume_reject_sis(self): """ An execjob_preresume hook that results in a reject action by sister mom only would not affect the currently suspended job. 
""" # instantiate execjob_preresume hook hook_event = 'execjob_preresume' hook_name = 'psus' a = {'event': hook_event, 'enabled': 'true'} self.server.create_import_hook(hook_name, a, self.preresume_hook_sis_reject_body) # Submit a job jid = self.server.submit(self.j) self.server.expect(JOB, {ATTR_state: 'R'}, id=jid) # Suspend, then resume job self.server.sigjob(jobid=jid, signal="suspend") self.server.expect(JOB, {ATTR_state: 'S'}, id=jid) self.server.sigjob(jobid=jid, signal="resume") hook_msg = "bad resumption on sis" reject_msg = "%s hook rejected request: %s" % (hook_event, hook_msg) for vn in [self.momA, self.momB]: if vn == self.momA: vn.log_match("Job;%s;signal job with resume" % jid) vn.log_match("Job;%s;%s" % (jid, hook_msg), existence=False, max_attempts=30) vn.log_match("Job;%s;%s" % (jid, reject_msg), existence=False, max_attempts=30) else: vn.log_match("Job;%s;RESUME received" % jid) vn.log_match("Job;%s;%s" % (jid, hook_msg)) vn.log_match("Job;%s;%s" % (jid, reject_msg)) # after hook executes, job continues to be suspended self.server.expect(JOB, {ATTR_state: 'S'}, id=jid) def test_execjob_preresume_error(self): """ An execjob_preresume hook that results in a error action would prevent suspended job from being resumed. """ # instantiate execjob_preresume hook hook_event = 'execjob_preresume' hook_name = 'pres' a = {'event': hook_event, 'enabled': 'true'} self.server.create_import_hook(hook_name, a, self.hook_error_body) # Submit a job jid = self.server.submit(self.j) self.server.expect(JOB, {ATTR_state: 'R'}, id=jid) # Suspend, then resume job self.server.sigjob(jobid=jid, signal="suspend") self.server.expect(JOB, {ATTR_state: 'S'}, id=jid) self.server.sigjob(jobid=jid, signal="resume") error_msg = \ "%s hook \'%s\' encountered an exception, request rejected" \ % (hook_event, hook_name) # Mom hook executes on momA and gets a errorion, # so a resume request is not sent to sister momB. 
self.momA.log_match("Job;%s;signal job with resume" % jid) self.momA.log_match("Job;%s;%s" % (jid, error_msg)) # after hook executes, job continues to be suspended self.server.expect(JOB, {ATTR_state: 'S'}, id=jid) def test_execjob_preresume_error_sis(self): """ An execjob_preresume hook that results in a error action by sister mom only would not affect the currently suspended job. """ # instantiate execjob_preresume hook hook_event = 'execjob_preresume' hook_name = 'psus' a = {'event': hook_event, 'enabled': 'true'} self.server.create_import_hook(hook_name, a, self.hook_sis_error_body) # Submit a job jid = self.server.submit(self.j) self.server.expect(JOB, {ATTR_state: 'R'}, id=jid) # Suspend, then resume job self.server.sigjob(jobid=jid, signal="suspend") self.server.expect(JOB, {ATTR_state: 'S'}, id=jid) self.server.sigjob(jobid=jid, signal="resume") error_msg = \ "%s hook \'%s\' encountered an exception, request rejected" \ % (hook_event, hook_name) for vn in [self.momA, self.momB]: if vn == self.momA: vn.log_match("Job;%s;signal job with resume" % jid) vn.log_match("Job;%s;%s" % (jid, error_msg), existence=False, max_attempts=30) else: vn.log_match("Job;%s;RESUME received" % jid) vn.log_match("Job;%s;%s" % (jid, error_msg)) # after hook executes, job continues to be suspended self.server.expect(JOB, {ATTR_state: 'S'}, id=jid)
def get_epoch(msg):
    """Return the epoch timestamp parsed from a PBS log line *msg*."""
    # A log line looks like "<date time>;<rest>": everything before the
    # first ';' is the timestamp, which convert_date_time turns into epoch
    timestamp, _, _ = msg.partition(';')
    return PBSLogUtils.convert_date_time(timestamp)
def test_job_array(self): """ Test that a job array switches from accruing eligible time to ineligible time when its last subjob starts running """ logutils = PBSLogUtils() a = {'resources_available.ncpus': 2} self.server.manager(MGR_CMD_SET, NODE, a, id=self.mom.shortname) a = {'log_events': 2047} self.server.manager(MGR_CMD_SET, SERVER, a) J1 = Job(TEST_USER, attrs={ATTR_J: '1-3'}) J1.set_sleep_time(20) jid = self.server.submit(J1) jid_short = jid.split('[')[0] sjid1 = jid_short + '[1]' sjid2 = jid_short + '[2]' sjid3 = jid_short + '[3]' # Capture the time stamp when subjob 1 starts run. Accrue type changes # to eligible time msg1 = J1.create_subjob_id(jid, 1) + ";Job Run at request of Scheduler" m1 = self.server.log_match(msg1) t1 = logutils.convert_date_time(m1[1].split(';')[0]) self.server.expect(JOB, {ATTR_state: 'R'}, id=sjid1, extend='t') self.server.expect(JOB, {ATTR_state: 'R'}, id=sjid2, extend='t') self.server.expect(JOB, {ATTR_state: 'Q'}, id=sjid3, extend='t') # accrue_type = 2 is eligible_time self.server.expect(JOB, {'accrue_type': 2}, id=jid) self.logger.info("subjobs 1 and 2 finished; subjob 3 must run now") self.server.expect(JOB, {ATTR_state: 'R'}, id=sjid3, extend='t', offset=20) self.server.expect(JOB, {'accrue_type': 1}, id=jid) # Capture the time stamp when subjob 3 starts run. Accrue type changes # to ineligible time. eligible_time calculation is completed. 
msg2 = J1.create_subjob_id(jid, 3) + ";Job Run at request of Scheduler" m2 = self.server.log_match(msg2) t2 = logutils.convert_date_time(m2[1].split(';')[0]) eligible_time = t2 - t1 m1 = jid + ";Accrue type has changed to ineligible_time, " m1 += "previous accrue type was eligible_time" m2 = m1 + " for %d secs, " % eligible_time # Format timedelta object as it does not print a preceding 0 for # hours in HH:MM:SS m2 += "total eligible_time={:0>8}".format( datetime.timedelta(seconds=eligible_time)) try: self.server.log_match(m2) except PtlLogMatchError as e: # In some slow machines, there is a delay observed between # job run and accrue type change. # Checking if log_match failed because eligible_time # value was off only by a few seconds(5 seconds). # This is done to acommodate differences in the eligible # time calculated by the test and the eligible time # calculated by PBS. # If the eligible_time value was off by > 5 seconds, test fails. match = self.server.log_match(m1) e_time = re.search('(\d+) secs', match[1]) if e_time: self.logger.info("Checking if log_match failed because " "the eligible_time value was off by " "a few seconds, but within the allowed " "range (5 secs). Expected %d secs Got: %s" % (eligible_time, e_time.group(1))) if int(e_time.group(1)) - eligible_time > 5: raise PtlLogMatchError(rc=1, rv=False, msg=e.msg) else: raise PtlLogMatchError(rc=1, rv=False, msg=e.msg)
def test_job_array(self): """ Test that a job array switches from accruing eligible time to ineligible time when its last subjob starts running """ logutils = PBSLogUtils() a = {'resources_available.ncpus': 2} self.server.manager(MGR_CMD_SET, NODE, a, id=self.mom.shortname) a = {'log_events': 2047} self.server.manager(MGR_CMD_SET, SERVER, a) J1 = Job(TEST_USER, attrs={ATTR_J: '1-3'}) J1.set_sleep_time(20) jid = self.server.submit(J1) jid_short = jid.split('[')[0] sjid1 = jid_short + '[1]' sjid2 = jid_short + '[2]' sjid3 = jid_short + '[3]' # Capture the time stamp when subjob 1 starts run. Accrue type changes # to eligible time msg1 = J1.create_subjob_id(jid, 1) + ";Job Run at request of Scheduler" m1 = self.server.log_match(msg1) t1 = logutils.convert_date_time(m1[1].split(';')[0]) self.server.expect(JOB, {ATTR_state: 'R'}, id=sjid1, extend='t') self.server.expect(JOB, {ATTR_state: 'R'}, id=sjid2, extend='t') self.server.expect(JOB, {ATTR_state: 'Q'}, id=sjid3, extend='t') self.server.expect(JOB, {'accrue_type': self.accrue['eligible']}, id=jid) self.logger.info("subjobs 1 and 2 finished; subjob 3 must run now") self.server.expect(JOB, {ATTR_state: 'R'}, id=sjid3, extend='t', offset=20) self.server.expect(JOB, {'accrue_type': self.accrue['ineligible']}, id=jid) # Capture the time stamp when subjob 3 starts run. Accrue type changes # to ineligible time. eligible_time calculation is completed. 
msg2 = J1.create_subjob_id(jid, 3) + ";Job Run at request of Scheduler" m2 = self.server.log_match(msg2) t2 = logutils.convert_date_time(m2[1].split(';')[0]) eligible_time = int(t2) - int(t1) m1 = jid + ";Accrue type has changed to ineligible_time, " m1 += "previous accrue type was eligible_time" m2 = m1 + " for %d secs, " % eligible_time # Format timedelta object as it does not print a preceding 0 for # hours in HH:MM:SS m2 += "total eligible_time={!s:0>8}".format( datetime.timedelta(seconds=eligible_time)) try: self.server.log_match(m2) except PtlLogMatchError as e: # In some slow machines, there is a delay observed between # job run and accrue type change. # Checking if log_match failed because eligible_time # value was off only by a few seconds(5 seconds). # This is done to acommodate differences in the eligible # time calculated by the test and the eligible time # calculated by PBS. # If the eligible_time value was off by > 5 seconds, test fails. match = self.server.log_match(m1) e_time = re.search(r'(\d+) secs', match[1]) if e_time: self.logger.info("Checking if log_match failed because " "the eligible_time value was off by " "a few seconds, but within the allowed " "range (5 secs). Expected %d secs Got: %s" % (eligible_time, e_time.group(1))) if int(e_time.group(1)) - eligible_time > 5: raise PtlLogMatchError(rc=1, rv=False, msg=e.msg) else: raise PtlLogMatchError(rc=1, rv=False, msg=e.msg)
class TestJobArray(TestFunctional): """ Test suite for job array feature """ lu = PBSLogUtils() qjh = """ import pbs e = pbs.event() j = e.job if j.max_run_subjobs is None: j.max_run_subjobs = %d pbs.logmsg(pbs.LOG_DEBUG, "max_run_subjobs set to %%d" %% j.max_run_subjobs) e.accept() """ mjh = """ import pbs e = pbs.event() j = e.job if j.max_run_subjobs != 0: if j.max_run_subjobs > 10: j.max_run_subjobs = %d pbs.logmsg(pbs.LOG_DEBUG, "max_run_subjobs set to %%d" %% j.max_run_subjobs) e.accept() """ mjh2 = """ import pbs e = pbs.event() j = e.job j.max_run_subjobs = %d pbs.logmsg(pbs.LOG_DEBUG, "max_run_subjobs set to %%d" %% j.max_run_subjobs) e.accept() """ def create_max_run_subjobs_hook(self, max_run, event, name, script): """ function to create a hook - max_run Number of subjobs that can concurrently run - event queuejob or modifyjob - name hook name - script hook script """ hook = script % int(max_run) attrs = {'event': event} self.server.create_import_hook(name, attrs, hook, overwrite=True) def test_arrayjob_Erecord_startval(self): """ Check that an arrayjob's E record's 'start' value is not set to 0 """ j = Job(TEST_USER, attrs={ ATTR_J: '1-2', ATTR_k: 'oe', 'Resource_List.select': 'ncpus=1' }) j.set_sleep_time(1) j_id = self.server.submit(j) # Check for the E record for the arrayjob acct_string = ";E;" + str(j_id) _, record = self.server.accounting_match(acct_string, max_attempts=10, interval=1) # Extract the 'start' value from the E record values = record.split(";", 3)[3] start_str = " start=" values_temp = values.split(start_str, 1)[1] start_val = int(values_temp.split()[0]) # Verify that the value of 'start' isn't 0 self.assertNotEqual(start_val, 0, "E record value of 'start' for arrayjob is 0") def kill_and_restart_svr(self): try: self.server.stop('-KILL') except PbsServiceError as e: # The server failed to stop raise self.failureException("Server failed to stop:" + e.msg) try: self.server.start() except PbsServiceError as e: # The server failed to 
start raise self.failureException("Server failed to start:" + e.msg) self.server.isUp() rv = self.is_server_licensed(self.server) _msg = 'No license found on server %s' % (self.server.shortname) self.assertTrue(rv, _msg) attr = {'state': (MATCH_RE, 'free|job-busy')} self.server.expect(NODE, attr, id=self.mom.shortname) def test_running_subjob_survive_restart(self): """ Test to check if a running subjob of an array job survive a pbs_server restart """ a = {'resources_available.ncpus': 1} self.server.manager(MGR_CMD_SET, NODE, a, self.mom.shortname) j = Job(TEST_USER, attrs={ ATTR_J: '1-3', 'Resource_List.select': 'ncpus=1' }) j_id = self.server.submit(j) subjid_1 = j.create_subjob_id(j_id, 1) # 1. check job array has begun self.server.expect(JOB, {'job_state': 'B'}, j_id) # 2. check subjob 1 started running self.server.expect(JOB, {'job_state': 'R'}, subjid_1) # 3. Kill and restart the server self.kill_and_restart_svr() # 4. array job should be B self.server.expect(JOB, {'job_state': 'B'}, j_id) # 5. subjob 1 should be R self.server.expect(JOB, {'job_state': 'R'}, subjid_1) def test_running_subjob_survive_restart_with_history(self): """ Test to check if a running subjob of an array job survive a pbs_server restart when history is enabled """ attr = {'job_history_enable': 'true'} self.server.manager(MGR_CMD_SET, SERVER, attr) self.test_running_subjob_survive_restart() def test_suspended_subjob_survive_restart(self): """ Test to check if a suspended subjob of an array job survive a pbs_server restart """ a = {'resources_available.ncpus': 1} self.server.manager(MGR_CMD_SET, NODE, a, self.mom.shortname) j = Job(TEST_USER, attrs={ ATTR_J: '1-3', 'Resource_List.select': 'ncpus=1' }) j.set_sleep_time(10) j_id = self.server.submit(j) subjid_2 = j.create_subjob_id(j_id, 2) # 1. check job array has begun self.server.expect(JOB, {'job_state': 'B'}, j_id) # 2. 
wait till subjob_2 starts running self.server.expect(JOB, {'job_state': 'R'}, subjid_2) try: self.server.sigjob(subjid_2, 'suspend') except PbsSignalError as e: raise self.failureException("Failed to suspend subjob:" + e.msg) self.server.expect(JOB, {'job_state': 'S'}, subjid_2, max_attempts=1) # 3. Kill and restart the server self.kill_and_restart_svr() # 4. array job should be B self.server.expect(JOB, {'job_state': 'B'}, j_id, max_attempts=1) # 5. subjob_2 should be S self.server.expect(JOB, {'job_state': 'S'}, subjid_2, max_attempts=1) try: self.server.sigjob(subjid_2, 'resume') except PbsSignalError as e: raise self.failureException("Failed to resume subjob:" + e.msg) def test_suspended_subjob_survive_restart_with_history(self): """ Test to check if a suspended subjob of an array job survive a pbs_server restart when history is enabled """ attr = {'job_history_enable': 'true'} self.server.manager(MGR_CMD_SET, SERVER, attr) self.test_suspended_subjob_survive_restart() def test_deleted_q_subjob_survive_restart(self): """ Test to check if a deleted queued subjob of an array job survive a pbs_server restart when history is disabled """ a = {'resources_available.ncpus': 1} self.server.manager(MGR_CMD_SET, NODE, a, self.mom.shortname) j = Job(TEST_USER, attrs={ ATTR_J: '1-3', 'Resource_List.select': 'ncpus=1' }) j.set_sleep_time(10) j_id = self.server.submit(j) subjid_3 = j.create_subjob_id(j_id, 3) self.server.expect(JOB, {'job_state': 'B'}, j_id) self.server.deljob(subjid_3) self.server.expect(JOB, {'job_state': 'X'}, subjid_3) self.kill_and_restart_svr() self.server.expect(JOB, {'job_state': 'B'}, j_id, max_attempts=1) self.server.expect(JOB, {'job_state': 'X'}, subjid_3, max_attempts=1) def test_deleted_q_subjob_survive_restart_w_history(self): """ Test to check if a deleted queued subjob of an array job survive a pbs_server restart when history is enabled """ attr = {'job_history_enable': 'true'} self.server.manager(MGR_CMD_SET, SERVER, attr) 
self.test_deleted_q_subjob_survive_restart() def test_deleted_r_subjob_survive_restart(self): """ Test to check if a deleted running subjob of an array job survive a pbs_server restart when history is disabled """ a = {'resources_available.ncpus': 1} self.server.manager(MGR_CMD_SET, NODE, a, self.mom.shortname) j = Job(TEST_USER, attrs={ ATTR_J: '1-3', 'Resource_List.select': 'ncpus=1' }) j.set_sleep_time(10) j_id = self.server.submit(j) subjid_1 = j.create_subjob_id(j_id, 1) self.server.expect(JOB, {'job_state': 'B'}, j_id) self.server.expect(JOB, {'job_state': 'R'}, subjid_1) self.server.deljob(subjid_1) self.server.expect(JOB, {'job_state': 'X'}, subjid_1) self.kill_and_restart_svr() self.server.expect(JOB, {'job_state': 'B'}, j_id, max_attempts=1) self.server.expect(JOB, {'job_state': 'X'}, subjid_1, max_attempts=1) def test_deleted_r_subjob_survive_restart_w_history(self): """ Test to check if a deleted running subjob of an array job survive a pbs_server restart when history is enabled """ attr = {'job_history_enable': 'true'} self.server.manager(MGR_CMD_SET, SERVER, attr) self.test_deleted_q_subjob_survive_restart() def test_qdel_expired_subjob(self): """ Test to check if qdel of a subjob is disallowed """ attr = {'job_history_enable': 'true'} self.server.manager(MGR_CMD_SET, SERVER, attr) a = {'resources_available.ncpus': 1} self.server.manager(MGR_CMD_SET, NODE, a, self.mom.shortname) j = Job(TEST_USER, attrs={ ATTR_J: '1-3', 'Resource_List.select': 'ncpus=1' }) j.set_sleep_time(5) j_id = self.server.submit(j) subjid_1 = j.create_subjob_id(j_id, 1) # 1. check job array has begun self.server.expect(JOB, {'job_state': 'B'}, j_id) # 2. 
        wait till subjob 1 becomes expired
        """
        # Subjob 1 is expired ('X'); deleting a finished subjob must be
        # rejected both with and without the deletehist extension.
        self.server.expect(JOB, {'job_state': 'X'}, subjid_1)
        try:
            self.server.deljob(subjid_1)
        except PbsDeljobError as e:
            err_msg = "Request invalid for finished array subjob"
            self.assertTrue(err_msg in e.msg[0],
                            "Error message is not expected")
        else:
            raise self.failureException("subjob in X state can be deleted")
        try:
            self.server.deljob(subjid_1, extend="deletehist")
        except PbsDeljobError as e:
            err_msg = "Request invalid for finished array subjob"
            self.assertTrue(err_msg in e.msg[0],
                            "Error message is not expected")
        else:
            raise self.failureException("subjob in X state can be deleted")

    def test_subjob_comments(self):
        """
        Test subjob comments for finished and terminated subjobs
        """
        a = {'resources_available.ncpus': 1}
        self.server.manager(MGR_CMD_SET, NODE, a, self.mom.shortname)
        j = Job(TEST_USER, attrs={
            ATTR_J: '1-30',
            'Resource_List.select': 'ncpus=1'
        })
        j.set_sleep_time(8)
        j_id = self.server.submit(j)
        subjid_1 = j.create_subjob_id(j_id, 1)
        subjid_2 = j.create_subjob_id(j_id, 2)
        # offset=8 allows the 8s sleep job to finish before checking
        self.server.expect(JOB, {'comment': 'Subjob finished'},
                           subjid_1, offset=8)
        # force-deleted subjob gets the same 'Subjob finished' comment
        self.server.delete(subjid_2, extend='force')
        self.server.expect(JOB, {'comment': 'Subjob finished'}, subjid_2)
        # comment must survive a server kill/restart
        self.kill_and_restart_svr()
        self.server.expect(JOB, {'comment': 'Subjob finished'},
                           subjid_1, max_attempts=1)

    def test_subjob_comments_with_history(self):
        """
        Test subjob comments for finished, failed and terminated subjobs
        when history is enabled
        """
        a = {'resources_available.ncpus': 1}
        self.server.manager(MGR_CMD_SET, NODE, a, self.mom.shortname)
        a = {'job_history_enable': 'True'}
        self.server.manager(MGR_CMD_SET, SERVER, a)
        j = Job(TEST_USER, attrs={
            ATTR_J: '1-2',
            'Resource_List.select': 'ncpus=1'
        })
        j.set_sleep_time(5)
        j_id = self.server.submit(j)
        subjid_1 = j.create_subjob_id(j_id, 1)
        subjid_2 = j.create_subjob_id(j_id, 2)
        self.server.delete(subjid_2, extend='force')
        # extend='x' includes history (finished) jobs in the status query
        self.server.expect(JOB, {'comment': (MATCH_RE, 'finished')},
                           subjid_2, extend='x')
        self.server.expect(JOB,
                           {'comment': (MATCH_RE, 'Job run at.*and finished')},
                           subjid_1, extend='x')
        self.kill_and_restart_svr()
        self.server.expect(JOB,
                           {'comment': (MATCH_RE, 'Job run at.*and finished')},
                           subjid_1, extend='x', max_attempts=1)
        # A job script exiting non-zero must yield a 'failed' comment
        script_body = "exit 1"
        j = Job(TEST_USER, attrs={
            ATTR_J: '1-2',
            'Resource_List.select': 'ncpus=1'
        })
        j.create_script(body=script_body)
        j_id = self.server.submit(j)
        subjid_1 = j.create_subjob_id(j_id, 1)
        subjid_2 = j.create_subjob_id(j_id, 2)
        self.server.expect(JOB,
                           {'comment': (MATCH_RE, 'Job run at.*and failed')},
                           subjid_1, extend='x')
        self.server.expect(JOB,
                           {'comment': (MATCH_RE, 'Job run at.*and failed')},
                           subjid_2, extend='x')
        self.kill_and_restart_svr()
        self.server.expect(JOB,
                           {'comment': (MATCH_RE, 'Job run at.*and failed')},
                           subjid_1, extend='x', max_attempts=1)
        self.server.expect(JOB,
                           {'comment': (MATCH_RE, 'Job run at.*and failed')},
                           subjid_2, extend='x')

    def test_multiple_server_restarts(self):
        """
        Test subjobs wont rerun after multiple server restarts
        """
        a = {'resources_available.ncpus': 1}
        self.server.manager(MGR_CMD_SET, NODE, a, self.mom.shortname)
        a = {'job_history_enable': 'True'}
        self.server.manager(MGR_CMD_SET, SERVER, a)
        j = Job(TEST_USER, attrs={
            ATTR_J: '1-2',
            'Resource_List.select': 'ncpus=1'
        })
        j.set_sleep_time(300)
        j_id = self.server.submit(j)
        subjid_1 = j.create_subjob_id(j_id, 1)
        a = {'job_state': 'R', 'run_count': 1}
        self.server.expect(JOB, a, subjid_1, attrop=PTL_AND)
        # run_count must stay 1 across repeated restarts (no rerun)
        for _ in range(5):
            self.kill_and_restart_svr()
            self.server.expect(JOB, a, subjid_1, attrop=PTL_AND)

    def test_job_array_history_duration(self):
        """
        Test that job array and subjobs are purged after history duration
        """
        a = {'resources_available.ncpus': 1}
        self.server.manager(MGR_CMD_SET, NODE, a, self.mom.shortname)
        a = {'job_history_enable': 'True'}
        self.server.manager(MGR_CMD_SET, SERVER, a)
        a = {'job_history_duration': 30}
        self.server.manager(MGR_CMD_SET, SERVER, a)
        j = Job(TEST_USER, attrs={
            ATTR_J: '1-2',
            'Resource_List.select': 'ncpus=1'
        })
        j.set_sleep_time(15)
        j_id = self.server.submit(j)
        subjid_1 = j.create_subjob_id(j_id, 1)
        subjid_2 = j.create_subjob_id(j_id, 2)
        a = {'job_state': 'R', 'run_count': 1}
        self.server.expect(JOB, a, subjid_1, attrop=PTL_AND)
        self.server.delete(subjid_1, extend='force')
        b = {'job_state': 'X'}
        self.server.expect(JOB, b, subjid_1)
        self.server.expect(JOB, a, subjid_2, attrop=PTL_AND)
        msg = "Waiting for 150 secs as server will purge db once"
        msg += " in 2 mins plus 30 sec of history duration"
        self.logger.info(msg)
        # after the purge both subjobs must be gone even from history
        self.server.expect(JOB, 'job_state', op=UNSET, id=subjid_1,
                           offset=150, extend='x')
        self.server.expect(JOB, 'job_state', op=UNSET, id=subjid_2,
                           extend='x')

    def test_queue_deletion_after_terminated_subjob(self):
        """
        Test that queue can be deleted after the job array is terminated
        and server is restarted.
        """
        a = {'resources_available.ncpus': 1}
        self.server.manager(MGR_CMD_SET, NODE, a, self.mom.shortname)
        j = Job(TEST_USER, attrs={
            ATTR_J: '1-2',
            'Resource_List.select': 'ncpus=1'
        })
        j_id = self.server.submit(j)
        subjid_1 = j.create_subjob_id(j_id, 1)
        a = {'job_state': 'R', 'run_count': 1}
        self.server.expect(JOB, a, subjid_1, attrop=PTL_AND)
        self.server.delete(subjid_1, extend='force')
        self.kill_and_restart_svr()
        subjid_2 = j.create_subjob_id(j_id, 2)
        self.server.expect(JOB, {'job_state': 'R'}, subjid_2)
        # once the whole array is deleted the queue must be deletable
        self.server.delete(j_id, wait=True)
        self.server.manager(MGR_CMD_DELETE, QUEUE, id='workq')

    def test_held_job_array_survive_server_restart(self):
        """
        Test held job array can be released after server restart
        """
        a = {'resources_available.ncpus': 1}
        self.server.manager(MGR_CMD_SET, NODE, a, self.mom.shortname)
        j = Job(TEST_USER, attrs={
            ATTR_J: '1-2',
            'Resource_List.select': 'ncpus=1'
        })
        j.set_sleep_time(60)
        j_id = self.server.submit(j)
        j_id2 = self.server.submit(j)
        subjid_1 = j.create_subjob_id(j_id, 1)
        subjid_3 = j.create_subjob_id(j_id2, 1)
        a = {'job_state': 'R', 'run_count': 1}
        self.server.expect(JOB, a, subjid_1, attrop=PTL_AND)
        self.server.holdjob(j_id2, USER_HOLD)
        self.server.expect(JOB, {'job_state': 'H'}, j_id2)
        self.kill_and_restart_svr()
        # free the node, then verify the hold survived the restart
        self.server.delete(j_id, wait=True)
        self.server.expect(JOB, {'job_state': 'H'}, j_id2)
        self.server.rlsjob(j_id2, USER_HOLD)
        self.server.expect(JOB, {'job_state': 'B'}, j_id2)
        self.server.expect(JOB, a, subjid_3, attrop=PTL_AND)

    def test_held_job_array_survive_server_restart_w_history(self):
        """
        Test held job array can be released after server restart
        when history is enabled
        """
        a = {'job_history_enable': 'True'}
        self.server.manager(MGR_CMD_SET, SERVER, a)
        self.test_held_job_array_survive_server_restart()

    def test_subjobs_qrun(self):
        """
        Test that job array's subjobs can be qrun
        """
        a = {'resources_available.ncpus': 1}
        self.server.manager(MGR_CMD_SET, NODE, a, self.mom.shortname)
        # scheduling off so the subjob can only run via explicit qrun
        a = {'scheduling': 'false'}
        self.server.manager(MGR_CMD_SET, SERVER, a)
        j = Job(TEST_USER, attrs={
            ATTR_J: '1-2',
            'Resource_List.select': 'ncpus=1'
        })
        j.set_sleep_time(60)
        j_id = self.server.submit(j)
        subjid_1 = j.create_subjob_id(j_id, 1)
        self.server.runjob(subjid_1)
        self.server.expect(JOB, {'job_state': 'B'}, j_id)
        self.server.expect(JOB, {'job_state': 'R'}, subjid_1)

    def test_dependent_job_array_server_restart(self):
        """
        Check Job array dependency is not released after server restart
        """
        a = {'job_history_enable': 'true'}
        self.server.manager(MGR_CMD_SET, SERVER, a)
        a = {'resources_available.ncpus': 2}
        self.server.manager(MGR_CMD_SET, NODE, a, self.mom.shortname)
        j = Job(TEST_USER, attrs={
            ATTR_J: '1-2',
            'Resource_List.select': 'ncpus=1'
        })
        j.set_sleep_time(10)
        j_id = self.server.submit(j)
        subjid_1 = j.create_subjob_id(j_id, 1)
        subjid_2 = j.create_subjob_id(j_id, 2)
        self.server.expect(JOB, {'job_state': 'B'}, j_id)
        self.server.expect(JOB, {'job_state': 'R'}, subjid_1)
        self.server.expect(JOB, {'job_state': 'R'}, subjid_2)
        # second array depends on successful completion of the first
        depend_value = 'afterok:' + j_id
        j = Job(TEST_USER, attrs={
            ATTR_J: '1-2',
            'Resource_List.select': 'ncpus=1',
            ATTR_depend: depend_value
        })
        j_id2 = self.server.submit(j)
        self.server.expect(JOB,
                           {'job_state': 'H'}, j_id2)
        self.kill_and_restart_svr()
        # first array completes ('F' in history); dependency then releases
        # the second array into 'B'
        self.server.expect(JOB, {'job_state': 'F'}, j_id, extend='x',
                           interval=5)
        self.server.expect(JOB, {'job_state': 'B'}, j_id2, interval=5)

    def test_rerun_subjobs_server_restart(self):
        """
        Test that subjobs which are requeued remain queued after
        server restart
        """
        a = {'job_history_enable': 'true'}
        self.server.manager(MGR_CMD_SET, SERVER, a)
        a = {'resources_available.ncpus': 1}
        self.server.manager(MGR_CMD_SET, NODE, a, self.mom.shortname)
        j = Job(TEST_USER, attrs={
            ATTR_J: '1-2',
            'Resource_List.select': 'ncpus=1'
        })
        j.set_sleep_time(60)
        j_id = self.server.submit(j)
        subjid_1 = j.create_subjob_id(j_id, 1)
        self.server.expect(JOB, {'job_state': 'R'}, subjid_1)
        # disable scheduling so the rerun subjob stays queued
        a = {'scheduling': 'false'}
        self.server.manager(MGR_CMD_SET, SERVER, a)
        self.server.rerunjob(subjid_1)
        self.server.expect(JOB, {'job_state': 'Q'}, subjid_1)
        self.kill_and_restart_svr()
        self.server.expect(JOB, {'job_state': 'Q'}, subjid_1)
        a = {'scheduling': 'true'}
        self.server.manager(MGR_CMD_SET, SERVER, a)
        a = {'job_state': 'R'}
        self.server.expect(JOB, a, subjid_1)

    def test_rerun_node_fail_requeue(self):
        """
        Test sub jobs gets requeued after node_fail_requeue time
        """
        # running subjobs are requeued 10s after their node goes down
        a = {'node_fail_requeue': 10}
        self.server.manager(MGR_CMD_SET, SERVER, a)
        a = {'resources_available.ncpus': 1}
        self.server.manager(MGR_CMD_SET, NODE, a, self.mom.shortname)
        j = Job(TEST_USER, attrs={
            ATTR_J: '1-2',
            'Resource_List.select': 'ncpus=1'
        })
        j.set_sleep_time(60)
        j_id = self.server.submit(j)
        subjid_1 = j.create_subjob_id(j_id, 1)
        self.server.expect(JOB, {'job_state': 'R'}, subjid_1)
        self.mom.stop()
        self.server.expect(JOB, {'job_state': 'Q'}, subjid_1, offset=5)

    def test_qmove_job_array(self):
        """
        Test job array's can be qmoved to a high priority queue and
        qmoved job array preempts running subjob
        """
        a = {
            'queue_type': 'execution',
            'started': 'True',
            'enabled': 'True',
            'priority': 150
        }
        self.server.manager(MGR_CMD_CREATE, QUEUE, a, id='wq1')
        a = {'job_history_enable': 'true'}
        self.server.manager(MGR_CMD_SET, SERVER, a)
        a = {'resources_available.ncpus': 1}
        self.server.manager(MGR_CMD_SET, NODE, a, self.mom.shortname)
        j = Job(TEST_USER, attrs={
            ATTR_J: '1-2',
            'Resource_List.select': 'ncpus=1'
        })
        j.set_sleep_time(60)
        j_id = self.server.submit(j)
        subjid_1 = j.create_subjob_id(j_id, 1)
        self.server.expect(JOB, {'job_state': 'R'}, subjid_1)
        j_id2 = self.server.submit(j)
        subjid_3 = j.create_subjob_id(j_id2, 1)
        # move the second array into the high-priority queue; its subjob
        # should preempt (suspend) the running one
        self.server.movejob(j_id2, 'wq1')
        a = {'scheduling': 'true'}
        self.server.manager(MGR_CMD_SET, SERVER, a)
        self.server.expect(JOB, {'job_state': 'S'}, subjid_1)
        self.server.expect(JOB, {'job_state': 'R'}, subjid_3)

    def test_delete_history_subjob_server_restart(self):
        """
        Test that subjobs can be deleted from history and they remain
        deleted after server restart
        """
        a = {'job_history_enable': 'true'}
        self.server.manager(MGR_CMD_SET, SERVER, a)
        a = {'resources_available.ncpus': 2}
        self.server.manager(MGR_CMD_SET, NODE, a, self.mom.shortname)
        j = Job(TEST_USER, attrs={
            ATTR_J: '1-2',
            'Resource_List.select': 'ncpus=1',
            ATTR_k: 'oe'
        })
        j.set_sleep_time(5)
        j_id = self.server.submit(j)
        self.server.expect(JOB, {'job_state': 'F'}, j_id, extend='x',
                           offset=5)
        # purge from history; must stay gone after restart too
        self.server.delete(j_id, extend='deletehist')
        self.server.expect(JOB, 'job_state', op=UNSET, extend='x', id=j_id)
        self.kill_and_restart_svr()
        self.server.expect(JOB, 'job_state', op=UNSET, extend='x', id=j_id)

    def test_job_id_duplicate_server_restart(self):
        """
        Test that after server restart there is no duplication of
        job identifiers
        """
        a = {'resources_available.ncpus': 1}
        self.server.manager(MGR_CMD_SET, NODE, a, self.mom.shortname)
        j = Job(TEST_USER, attrs={
            ATTR_J: '1-2',
            'Resource_List.select': 'ncpus=1'
        })
        self.server.submit(j)
        j = Job(TEST_USER)
        self.server.submit(j)
        self.kill_and_restart_svr()
        # a fresh submission must not collide with pre-restart job ids
        try:
            j = Job(TEST_USER, attrs={
                ATTR_J: '1-2',
                'Resource_List.select': 'ncpus=1'
            })
            self.server.submit(j)
        except PbsSubmitError as e:
            raise self.failureException("Failed to submit job: " +
                                        str(e.msg))

    def test_expired_subjobs_not_reported(self):
        """
        Test if a subjob is finished and moves to expired state, it is
        not reported to scheduler in the next scheduling cycle.
        Scheduler expects only running subjobs to be reported to it.
        """
        a = {'job_history_enable': 'True'}
        self.server.manager(MGR_CMD_SET, SERVER, a)
        req_node = ":host=" + self.mom.shortname
        res_req = {
            'Resource_List.select': '1:ncpus=1' + req_node,
            'array_indices_submitted': '1-16',
            'Resource_List.place': 'excl'
        }
        j1 = Job(TEST_USER, attrs=res_req)
        j1.set_sleep_time(2)
        jid1 = self.server.submit(j1)
        j1_sub1 = j1.create_subjob_id(jid1, 1)
        self.server.expect(JOB, {'job_state': 'X'}, j1_sub1)
        # Trigger a sched cycle
        a = {'scheduling': 'True'}
        self.server.manager(MGR_CMD_SET, SERVER, a)
        msg = j1_sub1 + ";" + "Subjob found in undesirable state"
        msg += ", ignoring this job"
        # the scheduler must NOT log the undesirable-state message
        self.scheduler.log_match(msg, existence=False, max_attempts=10)

    def test_subjob_stdfile_custom_dir(self):
        """
        Test that subjobs standard error and out files are generated in
        the custom directory provided with oe qsub options
        """
        tmp_dir = self.du.create_temp_dir(asuser=TEST_USER)
        a = {ATTR_e: tmp_dir, ATTR_o: tmp_dir, ATTR_J: '1-4'}
        j = Job(TEST_USER, attrs=a)
        j.set_sleep_time(2)
        jid = self.server.submit(j)
        self.server.expect(JOB, {ATTR_state: 'B'}, id=jid)
        self.server.expect(JOB, ATTR_state, op=UNSET, id=jid)
        file_list = [
            name for name in os.listdir(tmp_dir)
            if os.path.isfile(os.path.join(tmp_dir, name))
        ]
        # 4 subjobs x (.OU + .ER) = 8 files expected
        self.assertEqual(8, len(file_list), 'expected 8 std files')
        for ext in ['.OU', '.ER']:
            for sub_ind in range(1, 5):
                f_name = j.create_subjob_id(jid, sub_ind) + ext
                if f_name not in file_list:
                    raise self.failureException("std file " + f_name +
                                                " not found")

    @skipOnCray
    def test_subjob_wrong_state(self):
        """
        Test that after submitting a job and restarting the server, the
        subjobs are not in the wrong substate and can be scheduled.
        """
        a = {'resources_available.ncpus': 200}
        self.server.manager(MGR_CMD_SET, NODE, a, self.mom.shortname)
        j = Job(attrs={ATTR_J: '1-200'})
        j.set_sleep_time(200)
        self.server.submit(j)
        # while the server is sending the jobs to the MoM, restart the server
        self.server.restart()
        # triggering scheduling cycle all jobs are in R state.
        self.scheduler.run_scheduling_cycle()
        # ensure all the subjobs are running
        self.server.expect(JOB, {'job_state=R': 200}, extend='t')

    def test_recover_big_array_job(self):
        """
        Test that during server restart, server is able to recover valid
        array jobs which are bigger than the current value of
        max_array_size server attribute
        """
        # submit a medium size array job
        a = {'resources_available.ncpus': 4}
        self.server.manager(MGR_CMD_SET, NODE, a, self.mom.shortname)
        j = Job(attrs={ATTR_J: '1-200'})
        j_id = self.server.submit(j)
        self.server.expect(JOB, {ATTR_state: 'B'}, id=j_id)
        # reduce max_array_size
        a = {ATTR_maxarraysize: 40}
        self.server.manager(MGR_CMD_SET, SERVER, a)
        self.server.expect(SERVER, a)
        # new oversized arrays are rejected, but the existing one must
        # still be recoverable
        try:
            self.server.submit(Job(attrs={ATTR_J: '1-200'}))
        except PbsSubmitError as e:
            exp_msg = 'qsub: Array job exceeds server or queue size limit'
            self.assertEqual(exp_msg, e.msg[0])
        # restart the server to check for crash
        try:
            self.server.restart()
        except PbsServiceError as e:
            if 'pbs_server startup failed' in e.msg:
                # recover the test bed by recreating the server database
                reset_db = 'echo y | ' + \
                    os.path.join(self.server.pbs_conf['PBS_EXEC'],
                                 'sbin', 'pbs_server') + ' -t create'
                self.du.run_cmd(cmd=reset_db, sudo=True, as_script=True)
                self.fail('TC failed as server recovery failed')
        else:
            self.server.expect(JOB, {ATTR_state: 'B'}, id=j_id)

    def test_max_run_subjobs_basic(self):
        """
        Test that if a job is submitted with 'max_run_subjobs' attribute
        number of subjobs that run do not exceed the attribute value.
        """
        a = {'resources_available.ncpus': 8}
        self.mom.create_vnodes(a, 1)
        # '%2' caps concurrently running subjobs at 2
        j = Job(attrs={ATTR_J: '1-20%2'})
        j_id = self.server.submit(j)
        self.server.expect(JOB, {ATTR_state: 'B'}, id=j_id)
        self.server.expect(JOB, {'job_state=R': 2}, extend='t')
        # raise the cap to 5 via qalter and re-run the scheduler
        self.server.alterjob(j_id, {ATTR_W: 'max_run_subjobs=5'})
        self.server.manager(MGR_CMD_SET, SERVER, {'scheduling': 'True'})
        self.server.expect(JOB, {'job_state=R': 5}, extend='t')
        msg = "Number of concurrent running subjobs limit reached"
        self.scheduler.log_match(j_id + ';' + msg)

    @skipOnCpuSet
    def test_max_run_subjobs_equiv_class(self):
        """
        Test that if a job is submitted with 'max_run_subjobs' attribute
        it does not stop jobs in equivalence class from running
        """
        a = {'resources_available.ncpus': 8}
        self.server.manager(MGR_CMD_SET, NODE, a, self.mom.shortname)
        j = Job(
            attrs={
                ATTR_J: '1-20%2',
                'Resource_List.walltime': 3600,
                'Resource_List.select': 'ncpus=2'
            })
        j_id = self.server.submit(j)
        self.server.expect(JOB, {ATTR_state: 'B'}, id=j_id)
        self.server.expect(JOB, {'job_state=R': 2}, extend='t')
        # same resource request => same equivalence class as the array;
        # it must still run despite the array hitting its subjob limit
        j = Job(attrs={
            'Resource_List.walltime': 3600,
            'Resource_List.select': 'ncpus=2'
        })
        j_id_equiv = self.server.submit(j)
        self.server.expect(JOB, {ATTR_state: 'R'}, id=j_id_equiv)

    @skipOnCpuSet
    def test_max_run_subjobs_calendar(self):
        """
        Test that if a job is submitted with 'max_run_subjobs' attribute
        gets into calendar when it cannot run.
        """
        a = {'resources_available.ncpus': 8}
        self.mom.create_vnodes(a, 1)
        # strict ordering + backfill so blocked jobs get calendared
        a = {'backfill_depth': '2'}
        self.server.manager(MGR_CMD_SET, SERVER, a)
        self.scheduler.set_sched_config({'strict_ordering': 'True'})
        j1 = Job(attrs={'Resource_List.walltime': 200})
        j1.set_sleep_time(200)
        j1_id = self.server.submit(j1)
        self.server.expect(JOB, {ATTR_state: 'R'}, id=j1_id)
        j2 = Job(attrs={ATTR_J: '1-20%2', 'Resource_List.walltime': 300})
        j2.set_sleep_time(300)
        j2_id = self.server.submit(j2)
        self.server.expect(JOB, {ATTR_state: 'B'}, id=j2_id)
        self.server.expect(JOB, {'job_state=R': 3}, extend='t')
        j2_sub1 = j2.create_subjob_id(j2_id, 1)
        job_arr = self.server.status(JOB, id=j2_sub1)
        stime = self.lu.convert_date_time(job_arr[0]['stime'],
                                          fmt="%a %b %d %H:%M:%S %Y")
        job_arr = self.server.status(JOB, id=j2_id)
        # check estimated start time is set on job array
        self.assertIn('estimated.start_time', job_arr[0])
        errmsg = j2_id + ";Error in calculation of start time of top job"
        self.scheduler.log_match(errmsg, existence=False, max_attempts=10)
        est = self.lu.convert_date_time(job_arr[0]['estimated.start_time'],
                                        fmt="%a %b %d %H:%M:%S %Y")
        # estimate should be subjob stime + 300s walltime (1s tolerance)
        self.assertAlmostEqual(stime + 300, est, 1)

    def test_max_run_subjobs_queuejob_hook(self):
        """
        Test that a queuejob hook is able to set max_run_subjobs attribute.
        """
        a = {'resources_available.ncpus': 8}
        self.mom.create_vnodes(a, 1)
        self.create_max_run_subjobs_hook(3, "queuejob", "h1", self.qjh)
        j1 = Job(attrs={ATTR_J: '1-20'})
        jid1 = self.server.submit(j1)
        self.server.log_match("max_run_subjobs set to 3")
        self.server.expect(JOB, {ATTR_state: 'B'}, id=jid1)
        self.server.expect(JOB, {'job_state=R': 3}, extend='t')
        # Submit a normal job and see if queuejob hook cannot set the
        # attribute.
        with self.assertRaises(PbsSubmitError) as e:
            self.server.submit(Job())
        self.assertIn("Attribute has to be set on an array job",
                      e.exception.msg[0])

    def test_max_run_subjobs_modifyjob_hook(self):
        """
        Submit array job with large max_run_subjobs limit see if modifyjob
        modifies it.
        """
        a = {'resources_available.ncpus': 20}
        self.mom.create_vnodes(a, 1)
        self.create_max_run_subjobs_hook(3, "modifyjob", "h1", self.mjh)
        self.server.manager(MGR_CMD_SET, SERVER, {'scheduling': 'False'})
        j = Job(attrs={ATTR_J: '1-50'})
        jid = self.server.submit(j)
        self.server.expect(JOB, {ATTR_state: 'Q'}, id=jid)
        # qalter asks for 20 but the modifyjob hook overrides it to 3
        self.server.alterjob(jid, {ATTR_W: 'max_run_subjobs=20'})
        self.server.log_match("max_run_subjobs set to 3")
        self.server.manager(MGR_CMD_SET, SERVER, {'scheduling': 'True'})
        self.server.expect(JOB, {'job_state=R': 3}, extend='t')
        # Modify a normal job and see if queuejob hook cannot set the
        # attribute.
        self.create_max_run_subjobs_hook(3, "modifyjob", "h1", self.mjh2)
        self.server.manager(MGR_CMD_SET, SERVER, {'scheduling': 'False'})
        nj = self.server.submit(Job())
        with self.assertRaises(PbsAlterError) as e:
            self.server.alterjob(nj, {'Resource_List.soft_walltime': 50})
        self.assertIn("Attribute has to be set on an array job",
                      e.exception.msg[0])

    def test_max_run_subjobs_preemption(self):
        """
        Submit array job with max_run_subjobs limit and see if such a job
        hits the limit, no preemption is attempted.
        """
        a = {
            'queue_type': 'execution',
            'started': 'True',
            'enabled': 'True',
            'Priority': 200
        }
        self.server.manager(MGR_CMD_CREATE, QUEUE, a, "wq2")
        a = {'resources_available.ncpus': 8}
        self.mom.create_vnodes(a, 1)
        a = {'Resource_List.select': 'ncpus=2'}
        j = Job(attrs=a)
        jid = self.server.submit(j)
        self.server.expect(JOB, {ATTR_state: 'R'}, id=jid)
        # high-priority array capped at 3 running subjobs: the cap, not
        # preemption, should bound it — the normal job must keep running
        a = {
            ATTR_J: '1-20%3',
            'Resource_List.select': 'ncpus=2',
            ATTR_q: 'wq2'
        }
        j_arr = Job(attrs=a)
        jid_arr = self.server.submit(j_arr)
        self.server.expect(JOB, {ATTR_state: 'B'}, id=jid_arr)
        self.server.expect(JOB, {'job_state=R': 4}, extend='t')
        self.server.expect(JOB, {ATTR_state: 'R'}, id=jid)

    def test_max_run_subjobs_qrun(self):
        """
        Submit array job with max_run_subjobs limit and see if such a job
        is run using qrun, max_run_subjobs limit is ignored.
        """
        a = {'resources_available.ncpus': 8}
        self.mom.create_vnodes(a, 1)
        a = {'Resource_List.select': 'ncpus=2'}
        j = Job(attrs=a)
        jid = self.server.submit(j)
        self.server.expect(JOB, {ATTR_state: 'R'}, id=jid)
        a = {ATTR_J: '1-20%3', 'Resource_List.select': 'ncpus=2'}
        j_arr = Job(attrs=a)
        jid_arr = self.server.submit(j_arr)
        self.server.expect(JOB, {ATTR_state: 'B'}, id=jid_arr)
        self.server.expect(JOB, {'job_state=R': 4}, extend='t')
        self.server.expect(JOB, {ATTR_state: 'R'}, id=jid)
        subjid_4 = j_arr.create_subjob_id(jid_arr, 4)
        self.server.expect(JOB, {ATTR_state: 'Q'}, id=subjid_4)
        # qrun bypasses the max_run_subjobs cap, preempting the normal job
        self.server.runjob(subjid_4)
        self.server.expect(JOB, {ATTR_state: 'B'}, id=jid_arr)
        self.server.expect(JOB, {'job_state=R': 4}, extend='t')
        self.server.expect(JOB, {ATTR_state: 'S'}, id=jid)
        self.server.expect(JOB, {ATTR_state: 'R'}, id=subjid_4)

    def test_max_run_subjobs_suspend(self):
        """
        Submit array job with max_run_subjobs limit and see if such a job
        is has suspended subjobs, those subjobs are not counted against
        the limit.
        """
        a = {'resources_available.ncpus': 8}
        self.mom.create_vnodes(a, 1)
        a = {ATTR_J: '1-20%3', 'Resource_List.select': 'ncpus=2'}
        j_arr = Job(attrs=a)
        jid_arr = self.server.submit(j_arr)
        self.server.expect(JOB, {ATTR_state: 'B'}, id=jid_arr)
        self.server.expect(JOB, {'job_state=R': 3}, extend='t')
        subjid_2 = j_arr.create_subjob_id(jid_arr, 2)
        self.server.expect(JOB, {ATTR_state: 'R'}, id=subjid_2)
        # suspend one subjob; a queued one should take its slot so the
        # running count returns to the cap of 3
        self.server.sigjob(jobid=subjid_2, signal="suspend")
        self.server.expect(JOB, {ATTR_state: 'S'}, id=subjid_2)
        self.server.expect(JOB, {'job_state=R': 3}, extend='t')
        subjid_4 = j_arr.create_subjob_id(jid_arr, 4)
        self.server.expect(JOB, {ATTR_state: 'R'}, id=subjid_4)

    def test_max_run_subjobs_eligible_time(self):
        """
        Test that array jobs hitting max_run_subjobs limit still accrues
        eligible time.
        """
        a = {'resources_available.ncpus': 8}
        self.mom.create_vnodes(a, 1)
        a = {'eligible_time_enable': 'True'}
        self.server.manager(MGR_CMD_SET, SERVER, a)
        # accrue_type codes as reported by the server
        accrue = {'ineligible': 1, 'eligible': 2, 'run': 3, 'exit': 4}
        a = {ATTR_J: '1-20%3', 'Resource_List.select': 'ncpus=2'}
        j_arr = Job(attrs=a)
        jid_arr = self.server.submit(j_arr)
        self.server.expect(JOB, {ATTR_state: 'B'}, id=jid_arr)
        self.server.expect(JOB, {'job_state=R': 3}, extend='t')
        # array blocked only by its own subjob cap must stay 'eligible'
        self.server.expect(JOB, {'accrue_type': accrue['eligible']},
                           id=jid_arr)

    def test_max_run_subjobs_on_non_array(self):
        """
        Test that setting max_run_subjobs on non-array jobs is rejected.
        """
        a = {ATTR_W: 'max_run_subjobs=4'}
        with self.assertRaises(PbsSubmitError) as e:
            self.server.submit(Job(attrs=a))
        self.assertIn("Attribute has to be set on an array job",
                      e.exception.msg[0])

    def test_multiple_max_run_subjobs_values(self):
        """
        Test that setting max_run_subjobs more than once on an array job
        is rejected.
        """
        # both '%2' in -J and -W max_run_subjobs supply the limit: qsub
        # must reject the duplicate specification
        qsub_cmd = os.path.join(self.server.pbs_conf['PBS_EXEC'],
                                'bin', 'qsub')
        cmd = [
            qsub_cmd, '-J1-4%2', '-Wmax_run_subjobs=4', '--',
            self.mom.sleep_cmd, '100'
        ]
        rv = self.du.run_cmd(self.server.hostname, cmd=cmd)
        self.assertNotEqual(rv['rc'], 0, 'qsub must fail')
        msg = "qsub: multiple max_run_subjobs values found"
        self.assertEqual(rv['err'][0], msg)

    def test_qdel_job_array_downed_mom(self):
        """
        Test to check if qdel of a job array returns an error when mom
        is downed.
        """
        a = {'resources_available.ncpus': 1}
        self.server.manager(MGR_CMD_SET, NODE, a, self.mom.shortname)
        j = Job(TEST_USER, attrs={
            ATTR_J: '1-3',
            'Resource_List.select': 'ncpus=1'
        })
        j_id = self.server.submit(j)
        # 1. check job array has begun
        self.server.expect(JOB, {'job_state': 'B'}, j_id)
        self.mom.stop()
        try:
            self.server.deljob(j_id)
        except PbsDeljobError as e:
            err_msg = "could not connect to MOM"
            self.assertTrue(err_msg in e.msg[0],
                            "Did not get the expected message")
            self.assertTrue(e.rc != 0, "Exit code shows success")
        else:
            raise self.failureException("qdel job array did not return error")
def test_multi_sched_perf(self): """ Test time taken to schedule and run 5k jobs with single scheduler and workload divided among 5 schedulers. """ a = {'resources_available.ncpus': 1000} self.server.create_vnodes(self.mom.shortname, a, 5, self.mom) a = {'scheduling': 'False'} self.server.manager(MGR_CMD_SET, SERVER, a) self.submit_njobs(5000) start = time.time() self.scheduler.run_scheduling_cycle() c = self.scheduler.cycles(lastN=1)[0] cyc_dur = c.end - c.start self.perf_test_result(cyc_dur, "default_cycle_duration", "secs") msg = 'Time taken by default scheduler to run 5k jobs is ' self.logger.info(msg + str(cyc_dur)) self.server.cleanup_jobs() self.setup_scheds() self.setup_queues_nodes() for sc in self.scheds: a = {'scheduling': 'False'} self.server.manager(MGR_CMD_SET, SCHED, a, id=sc) a = {ATTR_q: 'wq1'} self.submit_njobs(1000, a) a = {ATTR_q: 'wq2'} self.submit_njobs(1000, a) a = {ATTR_q: 'wq3'} self.submit_njobs(1000, a) a = {ATTR_q: 'wq4'} self.submit_njobs(1000, a) a = {ATTR_q: 'wq5'} self.submit_njobs(1000, a) start = time.time() for sc in self.scheds: a = {'scheduling': 'True'} self.server.manager(MGR_CMD_SET, SCHED, a, id=sc) for sc in self.scheds: a = {'scheduling': 'False'} self.server.manager(MGR_CMD_SET, SCHED, a, id=sc) sc_dur = [] for sc in self.scheds: if sc != 'default': self.logger.info("searching log for scheduler " + str(sc)) log_msg = self.scheds[sc].log_match("Leaving Scheduling Cycle", starttime=int(start), max_attempts=30) endtime = PBSLogUtils.convert_date_time( log_msg[1].split(';')[0]) dur = endtime - start sc_dur.append(dur) max_dur = max(sc_dur) self.perf_test_result(max_dur, "max_multisched_cycle_duration", "secs") msg = 'Max time taken by one of the multi sched to run 1k jobs is ' self.logger.info(msg + str(max_dur)) self.perf_test_result(cyc_dur - max_dur, "multisched_defaultsched_cycle_diff", "secs") self.assertLess(max_dur, cyc_dur) msg1 = 'Multi scheduler is faster than single scheduler by ' msg2 = 'secs in scheduling 5000 
jobs with 5 schedulers' self.logger.info(msg1 + str(cyc_dur - max_dur) + msg2)
class TestPbsExecjobEnd(TestFunctional):
    """
    This tests the feature in PBS that allows execjob_end hook to execute
    such that pbs_mom is not blocked upon execution.
    """
    # shared log-timestamp parser for all tests in this class
    logutils = PBSLogUtils()

    def setUp(self):
        TestFunctional.setUp(self)
        # default execjob_end hook: logs, sleeps 10s (to expose blocking),
        # logs again, then accepts; alarm 50s keeps it from being killed
        self.attr = {'event': 'execjob_end', 'enabled': 'True', 'alarm': '50'}
        self.hook_body = ("import pbs\n"
                          "import time\n"
                          "e = pbs.event()\n"
                          "pbs.logjobmsg(e.job.id, \
'executed execjob_end hook')\n"
                          "time.sleep(10)\n"
                          "pbs.logjobmsg(e.job.id, \
'execjob_end hook ended')\n"
                          "e.accept()\n")

    def test_execjob_end_non_blocking(self):
        """
        Test to make sure that mom is unblocked and running
        exechost_periodic hook while it's child process is executing
        execjob_end hook.
        """
        hook_name = "execjob_end_logmsg"
        self.server.create_import_hook(hook_name, self.attr, self.hook_body)
        hook_name = "exechost_periodic_logmsg"
        hook_body = ("import pbs\n"
                     "e = pbs.event()\n"
                     "pbs.logmsg(pbs.LOG_DEBUG, \
'executed exechost_periodic hook')\n"
                     "e.accept()\n")
        attr = {'event': 'exechost_periodic', 'freq': '3', 'enabled': 'True'}
        j = Job(TEST_USER)
        j.set_sleep_time(1)
        self.server.create_import_hook(hook_name, attr, hook_body)
        jid = self.server.submit(j)
        # need to verify hook messages in the below mentioned order to
        # confirm mom is not blocked on execjob_end hook execution.
        # The order is verified with the use of starttime and endtime
        # parameters.
        (_, str1) = self.mom.log_match("Job;%s;executed execjob_end hook"
                                       % jid, n=100, max_attempts=10,
                                       interval=2)
        date_time1 = str1.split(";")[0]
        epoch1 = self.logutils.convert_date_time(date_time1)
        # following message should be logged while execjob_end hook is in
        # sleep
        (_, str1) = self.mom.log_match("executed exechost_periodic hook",
                                       starttime=epoch1 - 1,
                                       endtime=epoch1 + 10, n=100,
                                       max_attempts=10, interval=1)
        date_time2 = str1.split(";")[0]
        epoch2 = self.logutils.convert_date_time(date_time2)
        (_, str1) = self.mom.log_match("Job;%s;execjob_end hook ended" % jid,
                                       starttime=epoch2 - 1, n=100,
                                       max_attempts=10, interval=2)
        date_time3 = str1.split(";")[0]
        self.logger.info(
            "execjob_end hook executed at: %s,"
            "exechost_periodic at: %s and execjob_end hook ended at: %s" %
            (date_time1, date_time2, date_time3))

    def test_execjob_end_hook_order_and_reject(self):
        """
        Test with multiple execjob_end hooks having different order with
        one of the hooks rejecting the job.
        """
        hook_name1 = "execjob_end_logmsg1"
        hook_body_accept = ("import pbs\n"
                            "e = pbs.event()\n"
                            "pbs.logjobmsg(e.job.id, \
'executed %s hook' % e.hook_name)\n"
                            "e.accept()\n")
        attr = {'event': 'execjob_end', 'order': '1', 'enabled': 'True'}
        self.server.create_import_hook(hook_name1, attr, hook_body_accept)
        hook_name = "execjob_end_logmsg2"
        hook_body_reject = (
            "import pbs\n"
            "e = pbs.event()\n"
            "pbs.logjobmsg(e.job.id, 'executed execjob_end hook')\n"
            "e.reject('Job is rejected')\n")
        attr = {'event': 'execjob_end', 'order': '2', 'enabled': 'True'}
        self.server.create_import_hook(hook_name, attr, hook_body_reject)
        hook_name2 = "execjob_end_logmsg3"
        attr = {'event': 'execjob_end', 'order': '170', 'enabled': 'True'}
        self.server.create_import_hook(hook_name2, attr, hook_body_accept)
        j = Job(TEST_USER)
        j.set_sleep_time(1)
        jid = self.server.submit(j)
        # hook 1 runs, hook 2 rejects, so hook 3 (order 170) must NOT run
        self.mom.log_match("Job;%s;executed %s hook" % (jid, hook_name1),
                           n=100, max_attempts=10, interval=2)
        self.mom.log_match("Job;%s;Job is rejected" % jid,
                           n=100, max_attempts=10,
                           interval=2)
        self.mom.log_match("Job;%s;executed %s hook" % (jid, hook_name2),
                           n=100, max_attempts=10, interval=2,
                           existence=False)

    def test_execjob_end_multi_job(self):
        """
        Test to make sure that mom is unblocked with execjob_end hook
        with mutiple jobs
        """
        status = self.server.status(NODE, id=self.mom.shortname)
        # NOTE(review): string comparison of ncpus values — works for
        # single digits; confirm against PTL status value types
        if status[0]["resources_available.ncpus"] < "2":
            self.skip_test(reason="need 2 or more available ncpus")
        hook_name = "execjob_end_logmsg4"
        self.server.create_import_hook(hook_name, self.attr, self.hook_body)
        # jobs need to land on the same host even in a multi-node setup
        a = {'Resource_List.select': '1:ncpus=1:host=%s' % self.mom.shortname}
        j = Job(TEST_USER, attrs=a)
        j.set_sleep_time(1)
        jid1 = self.server.submit(j)
        # jid1 should be in E state, in sleep of execjob_end hook for
        # jid2 submmision.
        self.server.expect(JOB, {'job_state': 'E'}, id=jid1)
        j = Job(TEST_USER, attrs=a)
        j.set_sleep_time(1)
        jid2 = self.server.submit(j)
        # hook message of jid2 should be logged after the message of jid1 and
        # before completion of sleep in hook for jid1 inorder to prove mom
        # is not in blocked state.
        (_, str1) = self.mom.log_match("Job;%s;executed execjob_end hook"
                                       % jid1, n=100, max_attempts=10,
                                       interval=2)
        date_time1 = str1.split(";")[0]
        epoch1 = self.logutils.convert_date_time(date_time1)
        # hook message for jid2 should appear while hook is in sleep for jid1
        (_, str1) = self.mom.log_match("Job;%s;executed execjob_end hook"
                                       % jid2, starttime=epoch1 - 1,
                                       endtime=epoch1 + 10, n=100,
                                       max_attempts=10, interval=1)
        date_time1 = str1.split(";")[0]
        epoch1 = self.logutils.convert_date_time(date_time1)
        (_, str1) = self.mom.log_match("Job;%s;execjob_end hook ended" % jid1,
                                       starttime=epoch1 - 1, n=100,
                                       max_attempts=10, interval=2)
        self.mom.log_match("Job;%s;execjob_end hook ended" % jid2,
                           n=100, max_attempts=10, interval=2)

    @requirements(num_moms=2)
    def test_execjob_end_non_blocking_multi_node(self):
        """
        Test to make sure sister mom is unblocked when execjob_end hook
        is running on sister mom
        """
        if len(self.moms) != 2:
            self.skip_test(reason="need 2 mom hosts: -p moms=<m1>:<m2>")
        self.momA = self.moms.values()[0]
        self.momB = self.moms.values()[1]
        hook_name = "execjob_end_logmsg5"
        self.server.create_import_hook(hook_name, self.attr, self.hook_body)
        hook_name = "exechost_periodic_logmsg2"
        hook_body = ("import pbs\n"
                     "e = pbs.event()\n"
                     "pbs.logmsg(pbs.LOG_DEBUG, \
'executed exechost_periodic hook')\n"
                     "e.accept()\n")
        attr = {'event': 'exechost_periodic', 'freq': '3', 'enabled': 'True'}
        # one chunk on each mom so the hook fires on both hosts
        a = {
            'Resource_List.select': '1:ncpus=1:host=%s+1:ncpus=1:host=%s' %
            (self.momA.shortname, self.momB.shortname)
        }
        j = Job(TEST_USER, attrs=a)
        j.set_sleep_time(1)
        self.server.create_import_hook(hook_name, attr, hook_body)
        jid = self.server.submit(j)
        # verify the periodic hook runs during the end hook's sleep on
        # every mom (i.e. neither mom is blocked)
        for host, mom in self.moms.items():
            (_, str1) = mom.log_match("Job;%s;executed execjob_end hook"
                                      % jid, n=100, max_attempts=10,
                                      interval=2)
            date_time1 = str1.split(";")[0]
            epoch1 = self.logutils.convert_date_time(date_time1)
            (_, str1) = mom.log_match("executed exechost_periodic hook",
                                      starttime=epoch1 - 1,
                                      endtime=epoch1 + 10, n=100,
                                      max_attempts=10, interval=1)
            date_time2 = str1.split(";")[0]
            epoch2 = self.logutils.convert_date_time(date_time2)
            (_, str1) = mom.log_match("Job;%s;execjob_end hook ended" % jid,
                                      starttime=epoch2 - 1, n=100,
                                      max_attempts=10, interval=2)
            date_time3 = str1.split(";")[0]
            msg = "Got expected log_msg on host:%s" % host
            self.logger.info(msg)
            self.logger.info(
                "execjob_end hook executed at: %s,"
                "exechost_periodic at: %s and execjob_end hook ended at: %s" %
                (date_time1, date_time2, date_time3))

    @requirements(num_moms=2)
    def test_execjob_end_delete_request(self):
        """
        Test to make sure execjob_end hook is running after job force
        deletion request(IS_DISCARD_JOB) when mom is unblocked.
        """
        hook_name = "execjob_end_logmsg6"
        self.server.create_import_hook(hook_name, self.attr, self.hook_body)
        # multi-node job when two moms are available, single-node otherwise
        if len(self.moms) == 2:
            self.momA = self.moms.values()[0]
            self.momB = self.moms.values()[1]
            a = {
                'Resource_List.select':
                '1:ncpus=1:host=%s+1:ncpus=1:host=%s' %
                (self.momA.shortname, self.momB.shortname)
            }
            j = Job(TEST_USER, attrs=a)
        elif len(self.moms) == 1:
            j = Job(TEST_USER)
        j.set_sleep_time(10)
        jid = self.server.submit(j)
        self.server.expect(JOB, {'job_state': 'R'}, id=jid)
        # force delete triggers job discard; the end hook must still run
        self.server.deljob(id=jid, wait=True, attr_W="force")
        for host, mom in self.moms.items():
            mom.log_match("Job;%s;executed execjob_end hook" % jid,
                          n=100, max_attempts=10, interval=2)
            mom.log_match("Job;%s;execjob_end hook ended" % jid,
                          n=100, max_attempts=10, interval=2)
            msg = "Got expected log_msg on host:%s" % host
            self.logger.info(msg)

    @requirements(num_moms=2)
    def test_execjob_end_reject_request(self):
        """
        Test to make sure hook job reject message should appear in mom log
        in case sister mom went down before executing execjob_end hook
        """
        if len(self.moms) != 2:
            self.skip_test(reason="need 2 mom hosts: -p moms=<m1>:<m2>")
        self.momA = self.moms.values()[0]
        self.momB = self.moms.values()[1]
        # Create hook
        hook_name = "execjob_end_logmsg7"
        self.server.create_import_hook(hook_name, self.attr, self.hook_body)
        # Submit a multi-node job
        a = {
            'Resource_List.select': '1:ncpus=1:host=%s+1:ncpus=1:host=%s' %
            (self.momA.shortname, self.momB.shortname)
        }
        j = Job(TEST_USER, attrs=a)
        j.set_sleep_time(30)
        jid = self.server.submit(j)
        # Verify job spawn on sisterm mom
        self.momB.log_match("Job;%s;JOIN_JOB as node" % jid,
                            n=100, max_attempts=10, interval=2)
        # When the job run approx 5 sec, bring sister mom down
        self.server.expect(JOB, {'job_state': 'R'}, id=jid, offset=5)
        msg = 'mom is not down'
        self.assertTrue(self.momB.stop(), msg)
        # Verify momB is down and job is running
        a = {'state': (MATCH_RE, "down")}
        self.server.expect(NODE, a, id=self.momB.shortname)
        self.server.expect(JOB, {'job_state': "R"}, id=jid)
        # Wait for job to be in E state
        time.sleep(15)
        hook_execution_time = int(time.time())
        self.server.expect(JOB, {'job_state': "E"}, id=jid, offset=10)
        # Following message should be logged on momA after job delete request
        # received
        msg = "%s;Unable to send delete job request to one or more" % jid
        msg += " sisters"
        self.momA.log_match(msg, interval=2, starttime=hook_execution_time)
        # Following message should be logged on momA while execjob_end hook is
        # in sleep
        self.momA.log_match("Job;%s;executed execjob_end hook" % jid,
                            starttime=hook_execution_time,
                            max_attempts=10, interval=2)
        self.momA.log_match("Job;%s;execjob_end hook ended" % jid,
                            starttime=hook_execution_time,
                            max_attempts=10, interval=2)
        # Verify reject reply code 15059 for hook job logged in mother
        # superior(momA)
        self.momA.log_match("Req;req_reject;Reject reply code=15059,",
                            starttime=hook_execution_time,
                            max_attempts=10, interval=2)
        # Start pbs on MomA
        # NOTE(review): restart targets the server host — assumes momA is
        # colocated with the server; confirm in the test-bed layout
        self.server.pi.restart(hostname=self.server.hostname)
        # Verify mother superior is not down
        self.assertTrue(self.momA.isUp())
        # Start pbs on MomB
        self.momB.start()
        # Verify sister mom is not down
        self.assertTrue(self.momB.isUp())

    def test_rerun_on_epilogue_hook(self):
        """
        Test force qrerun when epilogue hook is running
        """
        hook_name = "epiend_hook"
        # hook_body is a module-level script shared by these tests
        global hook_body
        attr = {'event': 'execjob_epilogue,execjob_end', 'enabled': 'True'}
        self.server.create_import_hook(hook_name, attr, hook_body)
        j = Job(TEST_USER)
        j.set_sleep_time(1)
        jid = self.server.submit(j)
        self.mom.log_match("starting hook event EXECJOB_EPILOGUE")
        # Force rerun job
        self.server.rerunjob(jid, extend='force')
        self.mom.log_match("starting hook event EXECJOB_END")
        self.server.expect(JOB, {'job_state': 'R'}, jid)
        self.mom.log_match("ending hook event EXECJOB_END")

    def common_steps(self, jid, host):
        """
        Function to run common steps for test job on mom breakdown
        and restarts
        """
        self.server.expect(JOB, {ATTR_state: 'R'}, id=jid)
        host.signal('-KILL')
        self.logger.info("Successfully killed mom process on %s"
                         % host.shortname)
        # set scheduling false
        self.server.manager(MGR_CMD_SET, SERVER, {'scheduling': 'False'})
        # check for the job's state after the node_fail_requeue time hits
        self.logger.info("Waiting for 30s so that node_fail_requeue time hits")
        self.server.expect(JOB, {ATTR_state: 'Q'}, id=jid, offset=30)
        host.start()
        self.logger.info("Successfully started mom process on %s"
                         % host.shortname)
        self.server.expect(NODE, {'state': 'free'}, id=host.shortname)
        self.server.expect(JOB, {ATTR_state: 'Q'}, id=jid)
        # run job
        try:
            # qrun will fail as it is discarding the job
            self.server.runjob(jid)
        except PbsRunError as e:
            self.logger.info("Runjob throws error: " + e.msg[0])
            self.assertTrue(
                'qrun: Request invalid for state of job' in e.msg[0])
            # wait for the end hook of the discarded run to finish, then
            # the job can be run again
            now = int(time.time())
            self.mom.log_match("ending hook event EXECJOB_END",
                               starttime=now, interval=2)
            time.sleep(5)
            self.server.runjob(jid)
        self.server.expect(JOB, {'job_state': 'R'}, jid)
        now = int(time.time())
        self.mom.log_match("starting hook event EXECJOB_END",
                           starttime=now, interval=2)
        time.sleep(5)
        self.mom.log_match("ending hook event EXECJOB_END",
                           starttime=now, interval=2)

    def test_qrun_on_mom_breakdown(self):
        """
        Test qrun when mom breaksdown and restarts
        """
        hook_name = "end_hook"
        global hook_body
        attr = {'event': 'execjob_end',
                'enabled': 'True', 'alarm': '50'}
        self.server.create_import_hook(hook_name, attr, hook_body)
        attrib = {ATTR_nodefailrq: 30}
        self.server.manager(MGR_CMD_SET, SERVER, attrib=attrib)
        j = Job(TEST_USER)
        j.set_sleep_time(10)
        jid = self.server.submit(j)
        self.common_steps(jid, self.mom)

    def test_qrun_arrayjob_on_mom_breakdown(self):
        """
        Test qrun array job when mom breaksdown and restarts
        """
        hook_name = "end_hook"
        global hook_body
        attr = {'event': 'execjob_end', 'enabled': 'True', 'alarm': '50'}
        self.server.create_import_hook(hook_name, attr, hook_body)
        attrib = {ATTR_nodefailrq: 30}
        self.server.manager(MGR_CMD_SET, SERVER, attrib=attrib)
        j = Job(TEST_USER, attrs={
            ATTR_J: '1-2',
            ATTR_k: 'oe',
            'Resource_List.select': 'ncpus=1'
        })
        j.set_sleep_time(10)
        jid = self.server.submit(j)
        # check job array has begun
        self.server.expect(JOB, {'job_state': 'B'}, jid)
        subjid_1 = j.create_subjob_id(jid, 1)
        self.common_steps(subjid_1, self.mom)
class TestPbsExecjobAbort(TestFunctional):
    """
    Tests the hook event execjob_abort for when a job prematurely exits
    during startup and the epilogue hook or end hook may not always
    execute.

    Requires three MoMs (-p moms=<mom1>:<mom2>:<mom3>).
    """
    logutils = PBSLogUtils()

    def setUp(self):
        """
        Instantiate the default execjob_abort hook and prepare the hook
        script bodies and the 3-chunk scatter job used by every test.
        """
        if len(self.moms) != 3:
            self.skipTest('test requires three MoMs as input, ' +
                          'use -p moms=<mom1>:<mom2>:<mom3>')
        TestFunctional.setUp(self)
        # list() is required: dict.values() returns a non-subscriptable
        # view object on Python 3
        self.momC = list(self.moms.values())[2]
        # execjob_abort hook: logs a marker and dumps job/vnode attributes
        self.abort_hook_body = """import pbs
e=pbs.event()
pbs.logmsg(pbs.LOG_DEBUG, "called execjob_abort hook")
def print_attribs(pbs_obj, header):
    for a in pbs_obj.attributes:
        v = getattr(pbs_obj, a)
        if v and str(v) != "":
            pbs.logmsg(pbs.LOG_DEBUG, "%s: %s = %s" % (header, a, v))
print_attribs(e.job, "JOB")
for vn in e.vnode_list:
    v = e.vnode_list[vn]
    print_attribs(v, "vnode_list[" + vn + "]")
"""
        # instantiate execjob_abort hook
        hook_event = 'execjob_abort'
        hook_name = 'abort'
        a = {'event': hook_event, 'enabled': 'true'}
        self.server.create_import_hook(hook_name, a, self.abort_hook_body)
        # execjob_abort hook with sleep (used to make ordering observable)
        self.abort1_hook_body = """import pbs
import time
e=pbs.event()
pbs.logmsg(pbs.LOG_DEBUG, "called execjob_abort hook")
time.sleep(2)
"""
        # execjob_prologue hook, unhandled exception; the %s slot lets a
        # test insert 'not ' to target sister moms instead of the MS mom
        self.prolo_hook_body = """import pbs
e=pbs.event()
job=e.job
if %s job.in_ms_mom():
    raise NameError
"""
        # execjob_launch hook, unhandled exception on MS mom
        self.launch_hook_body = """import pbs
e=pbs.event()
job=e.job
if job.in_ms_mom():
    raise NameError
"""
        # execjob_end hook
        self.end_hook_body = """import pbs
e=pbs.event()
e.reject("end hook rejected")
"""
        # execjob_begin hook with sleep
        self.begin_hook_body = """import pbs
import time
e=pbs.event()
e.job.delete()
pbs.logmsg(pbs.LOG_DEBUG, "called execjob_begin hook with job.delete()")
time.sleep(2)
"""
        # job used in the tests: one chunk on each of the three moms
        a = {ATTR_l + '.select': '3:ncpus=1', ATTR_l + '.place': 'scatter'}
        self.j = Job(TEST_USER, attrs=a)

    def test_execjob_abort_ms_prologue(self):
        """
        An execjob_abort hook is executed in the primary mom and then in
        the connected sister moms when a job has problems starting up and
        needing to be aborted because execjob_prologue hook rejected in
        the primary mom. Job is requeued, gets held (H state).
        """
        # instantiate execjob_prologue hook to run on MS mom
        hook_event = 'execjob_prologue'
        hook_name = 'prolo'
        a = {'event': hook_event, 'enabled': 'true'}
        self.server.create_import_hook(hook_name, a,
                                       self.prolo_hook_body % (''))
        # Submit a job that eventually goes in H state
        start_time = int(time.time())
        jid = self.server.submit(self.j)
        msg = "Job;%s;Job requeued, execution node down" % jid
        self.server.log_match(msg, starttime=start_time)
        # Check for abort hook message in each of the moms
        msg = "called execjob_abort hook"
        for mom in self.moms.values():
            mom.log_match(msg, starttime=start_time)
        self.server.expect(JOB, {ATTR_state: 'H'}, id=jid, offset=7)

    def test_execjob_abort_exit_job_launch_reject(self):
        """
        An execjob_abort hook is executed in the primary mom and then in
        the connected sister moms when a job has problems starting up and
        needing to be aborted because execjob_launch hook rejected in the
        primary mom. Job exits.
        """
        # instantiate execjob_launch hook to run on primary moms
        hook_event = 'execjob_launch'
        hook_name = 'launch'
        a = {'event': hook_event, 'enabled': 'true'}
        self.server.create_import_hook(hook_name, a, self.launch_hook_body)
        # Submit a job
        start_time = int(time.time())
        jid = self.server.submit(self.j)
        self.server.expect(JOB, 'exec_host', id=jid, op=SET)
        job_stat = self.server.status(JOB, id=jid)
        exechost = job_stat[0]['exec_host'].split('/')[0]
        mom_superior = self.moms[exechost]
        msg = "Job;%s;execjob_launch hook 'launch' " % (
            jid) + "encountered an exception, request rejected"
        mom_superior.log_match(msg, starttime=start_time)
        # Check for abort hook message in each of the moms
        msg = "called execjob_abort hook"
        for mom in self.moms.values():
            mom.log_match(msg, starttime=start_time)
        # job is gone from the server once aborted
        self.server.expect(JOB, 'queue', op=UNSET, id=jid, offset=1)

    def msg_order(self, node, msg1, msg2, stime):
        """
        Checks if msg1 appears after stime, and msg1 appears before msg2
        in mom logs of node. Returns date and time when msg1 and msg2
        appeared.

        :param node: the MoM whose log is inspected
        :param msg1: message expected first
        :param msg2: message expected after msg1
        :param stime: epoch time msg1 must appear after
        :returns: tuple of (msg1 timestamp string, msg2 timestamp string)
        """
        # msg1 appears before msg2
        (_, str1) = node.log_match(msg1, starttime=stime)
        date_time1 = str1.split(";")[0]
        epoch1 = self.logutils.convert_date_time(date_time1)
        # Use epoch1 to mark the starttime of msg2
        (_, str2) = node.log_match(msg2, starttime=epoch1)
        date_time2 = str2.split(";")[0]
        return (date_time1, date_time2)

    def test_execjob_abort_sis_joinjob_requeue(self):
        """
        An execjob_abort hook is executed on a sister mom when a sister
        mom fails to join job. On connected primary mom an execjob_abort
        hook is executed first then execjob_end hook. On connected mom
        only an execjob_abort hook is executed. Job gets requeued.
        """
        # instantiate execjob_abort hook with sleep
        hook_event = 'execjob_abort'
        hook_name = 'abort'
        a = {'event': hook_event, 'enabled': 'true'}
        self.server.create_import_hook(hook_name, a, self.abort1_hook_body)
        # instantiate execjob_end hook
        hook_event = 'execjob_end'
        hook_name = 'end'
        a = {'event': hook_event, 'enabled': 'true'}
        self.server.create_import_hook(hook_name, a, self.end_hook_body)
        # Simulate a sister failure to join job
        # kill -STOP momC, submit a multi-node job, kill -9 momC
        self.momC.signal("-STOP")
        stime = int(time.time())
        jid = self.server.submit(self.j)
        self.server.expect(JOB, 'exec_host', id=jid, op=SET)
        job_stat = self.server.status(JOB, id=jid)
        exechost = job_stat[0]['exec_host'].split('/')[0]
        pri_mom = self.moms[exechost]
        self.momC.signal("-KILL")
        msg = "Job;%s;job_start_error.+from node %s " % (
            jid, self.momC.hostname) + "could not JOIN_JOB successfully"
        msg_abort = "called execjob_abort hook"
        msg_end = "end hook rejected"
        # momC failed to join job
        pri_mom.log_match(msg, starttime=stime, regexp=True)
        # abort hook executed before end hook on primary mom
        (dt1, dt2) = self.msg_order(pri_mom, msg_abort, msg_end, stime)
        self.logger.info("\n%s: abort hook executed at: %s"
                         "\n%s: end hook executed at: %s" %
                         (pri_mom.shortname, dt1, pri_mom.shortname, dt2))
        # only abort hook executed on connected sister mom
        for mom in self.moms.values():
            if mom != pri_mom and mom != self.momC:
                mom.log_match(msg_abort, starttime=stime)
                mom.log_match(msg_end, starttime=stime, max_attempts=10,
                              interval=2, existence=False)
        self.server.expect(JOB, {ATTR_state: 'Q'}, id=jid, offset=1)

    def test_execjob_abort_sis_joinjob_exit(self):
        """
        An execjob_abort hook is executed on a sister mom when a sister
        mom fails to join job. An execjob_begin hook which instructs the
        job to be deleted via the pbs.event().job.delete() call executes
        before the execjob_abort hook. Job exits.
        """
        # instantiate execjob_begin hook
        hook_event = 'execjob_begin'
        hook_name = 'begin'
        a = {'event': hook_event, 'enabled': 'true'}
        self.server.create_import_hook(hook_name, a, self.begin_hook_body)
        # Simulate a sister failure to join job
        # kill -STOP momC, submit a multi-node job, kill -9 momC
        self.momC.signal("-STOP")
        stime = int(time.time())
        jid = self.server.submit(self.j)
        self.server.expect(JOB, 'exec_host', id=jid, op=SET)
        job_stat = self.server.status(JOB, id=jid)
        exechost = job_stat[0]['exec_host'].split('/')[0]
        pri_mom = self.moms[exechost]
        self.momC.signal("-KILL")
        msg = "Job;%s;job_start_error.+from node %s " % (
            jid, self.momC.hostname) + "could not JOIN_JOB successfully"
        msg_begin = "called execjob_begin hook with job.delete()"
        msg_abort = "called execjob_abort hook"
        # momC failed to join job
        pri_mom.log_match(msg, starttime=stime, regexp=True)
        # begin hook executed before abort hook on each connected sister mom
        for mom in self.moms.values():
            if mom != pri_mom and mom != self.momC:
                # BUG FIX: check the sister mom's own log; the original
                # passed pri_mom here (copy-paste of the primary-mom check
                # below) while logging the sister's shortname
                (dt1, dt2) = self.msg_order(mom, msg_begin, msg_abort,
                                            stime)
                self.logger.info("\n%s: begin hook executed at: %s"
                                 "\n%s: abort hook executed at: %s" %
                                 (mom.shortname, dt1, mom.shortname, dt2))
        # begin hook executed before abort hook executed on primary mom
        (dt1, dt2) = self.msg_order(pri_mom, msg_begin, msg_abort, stime)
        self.logger.info("\n%s: begin hook executed at: %s"
                         "\n%s: abort hook executed at: %s" %
                         (pri_mom.shortname, dt1, pri_mom.shortname, dt2))
        self.server.expect(JOB, 'queue', op=UNSET, id=jid, offset=1)
class TestPreemptPerformance(TestPerformance):
    """
    Check the preemption performance
    """
    # NOTE(review): a later class in this file re-declares the name
    # TestPreemptPerformance, which shadows this definition — confirm
    # whether both were intended to coexist.
    lu = PBSLogUtils()

    def setUp(self):
        TestPerformance.setUp(self)
        # set poll cycle to a high value because mom spends a lot of time
        # in gathering job's resources used. We don't need that in this test
        self.mom.add_config({'$min_check_poll': 7200,
                             '$max_check_poll': 9600})

    def create_workload_and_preempt(self):
        """
        Fill the complex with ~1590 running jobs subject to soft/hard
        limits, submit one large high-priority job into workq2, and log
        the time the scheduler took from considering that job to running
        it (i.e. the preemption time).
        """
        a = {'queue_type': 'execution', 'started': 'True',
             'enabled': 'True'}
        self.server.manager(MGR_CMD_CREATE, QUEUE, a, 'workq2')
        a = {'max_run_res_soft.ncpus': "[u:PBS_GENERIC=2]"}
        self.server.manager(MGR_CMD_SET, QUEUE, a, 'workq')
        a = {'max_run_res.mem': "[u:" + str(TEST_USER) + "=1500mb]"}
        self.server.manager(MGR_CMD_SET, SERVER, a)
        a = {'Resource_List.select': '1:ncpus=3:mem=90mb',
             'Resource_List.walltime': 9999}
        # jobs that consume the mem limits
        for _ in range(8):
            j = Job(TEST_USER, attrs=a)
            j.set_sleep_time(9999)
            self.server.submit(j)
        for _ in range(7):
            j = Job(TEST_USER1, attrs=a)
            j.set_sleep_time(9999)
            self.server.submit(j)
        sched_off = {'scheduling': 'False'}
        self.server.manager(MGR_CMD_SET, SERVER, sched_off)
        a = {'Resource_List.select': '1:ncpus=3',
             'Resource_List.walltime': 9999}
        # bulk cpu-only workload, submitted with scheduling off so one
        # cycle starts them all
        for _ in range(775):
            j = Job(TEST_USER, attrs=a)
            j.set_sleep_time(9999)
            self.server.submit(j)
        for _ in range(800):
            j = Job(TEST_USER1, attrs=a)
            j.set_sleep_time(9999)
            self.server.submit(j)
        sched_on = {'scheduling': 'True'}
        self.server.manager(MGR_CMD_SET, SERVER, sched_on)
        self.server.expect(JOB, {'job_state=R': 1590}, offset=15,
                           interval=20)
        # the high-priority preemptor
        a = {'Resource_List.select': '1:ncpus=90:mem=1350mb',
             'Resource_List.walltime': 9999,
             ATTR_queue: 'workq2'}
        j1 = Job(TEST_USER, attrs=a)
        j1.set_sleep_time(9999)
        j1id = self.server.submit(j1)
        self.server.expect(JOB, {'job_state': 'R'}, id=j1id, offset=15,
                           interval=5)
        self.server.expect(JOB, {'job_state=S': 20}, interval=5)
        # time between "Considering job to run" and "Job run" is the
        # preemption cost
        (_, str1) = self.scheduler.log_match(j1id + ";Considering job to run",
                                             id=j1id, n='ALL',
                                             max_attempts=1, interval=2)
        (_, str2) = self.scheduler.log_match(j1id + ";Job run", id=j1id,
                                             n='ALL', max_attempts=1,
                                             interval=2)
        date_time1 = str1.split(";")[0]
        date_time2 = str2.split(";")[0]
        epoch1 = self.lu.convert_date_time(date_time1)
        epoch2 = self.lu.convert_date_time(date_time2)
        time_diff = epoch2 - epoch1
        self.logger.info('#' * 80)
        self.logger.info('#' * 80)
        res_str = "RESULT: THE TIME TAKEN IS : " + \
            str(time_diff) + " SECONDS"
        self.logger.info(res_str)
        self.logger.info('#' * 80)
        self.logger.info('#' * 80)

    @timeout(3600)
    @tags('sched', 'scheduling_policy')
    def test_preemption_with_limits(self):
        """
        Measure the time scheduler takes to preempt when the high
        priority job hits soft/hard limits under a considerable amount
        of workload.
        """
        a = {'resources_available.ncpus': 4800,
             'resources_available.mem': '2800mb'}
        self.server.create_vnodes('vn', a, 1, self.mom, usenatvnode=True)
        p = '"express_queue, normal_jobs, server_softlimits, ' \
            'queue_softlimits"'
        a = {'preempt_prio': p}
        self.server.manager(MGR_CMD_SET, SCHED, a, runas=ROOT_USER)
        self.create_workload_and_preempt()

    @timeout(3600)
    @tags('sched', 'scheduling_policy')
    def test_preemption_with_insufficient_resc(self):
        """
        Measure the time scheduler takes to preempt when the high
        priority job hits soft/hard limits and there is scarcity of
        resources under a considerable amount of workload.
        """
        # same workload as test_preemption_with_limits, but with less
        # memory available so resources are scarce
        a = {'resources_available.ncpus': 4800,
             'resources_available.mem': '1500mb'}
        self.server.create_vnodes('vn', a, 1, self.mom, usenatvnode=True)
        p = '"express_queue, normal_jobs, server_softlimits, ' \
            'queue_softlimits"'
        a = {'preempt_prio': p}
        self.server.manager(MGR_CMD_SET, SCHED, a, runas=ROOT_USER)
        self.create_workload_and_preempt()

    @timeout(3600)
    @tags('sched', 'scheduling_policy')
    def test_insufficient_resc_non_cons(self):
        """
        Submit a number of low priority job and then submit a high
        priority job that needs a non-consumable resource which is
        assigned to last running job. This will make scheduler go
        through all running jobs to find the preemptable job.
        """
        a = {'type': 'string', 'flag': 'h'}
        self.server.manager(MGR_CMD_CREATE, RSC, a, id='qlist')
        a = {ATTR_rescavail + ".qlist": "list1",
             ATTR_rescavail + ".ncpus": "8"}
        self.server.create_vnodes("vn1", a, 400, self.mom, additive=True,
                                  fname="vnodedef1")
        # a single vnode carries the qlist value the preemptor needs
        a = {ATTR_rescavail + ".qlist": "list2",
             ATTR_rescavail + ".ncpus": "1"}
        self.server.create_vnodes("vn2", a, 1, self.mom, additive=True,
                                  fname="vnodedef2")
        self.server.manager(MGR_CMD_SET, SERVER, {'scheduling': 'False'})
        a = {ATTR_l + '.select': '1:ncpus=1:qlist=list1'}
        for _ in range(3200):
            j = Job(TEST_USER, attrs=a)
            j.set_sleep_time(3000)
            self.server.submit(j)
        a = {ATTR_l + '.select': '1:ncpus=1:qlist=list2'}
        j = Job(TEST_USER, attrs=a)
        j.set_sleep_time(3000)
        # Add qlist to the resources scheduler checks for
        self.scheduler.add_resource('qlist')
        self.server.manager(MGR_CMD_UNSET, SCHED, 'preempt_sort',
                            runas=ROOT_USER)
        jid = self.server.submit(j)
        self.server.manager(MGR_CMD_SET, SERVER, {'scheduling': 'True'})
        self.server.expect(JOB, {'job_state=R': 3201}, interval=20,
                           offset=15)
        qname = 'highp'
        a = {'queue_type': 'execution', 'priority': '200',
             'started': 'True', 'enabled': 'True'}
        self.server.manager(MGR_CMD_CREATE, QUEUE, a, qname)
        a = {ATTR_l + '.select': '1:ncpus=1:qlist=list2', ATTR_q: 'highp'}
        j = Job(TEST_USER, attrs=a)
        j.set_sleep_time(3000)
        jid_highp = self.server.submit(j)
        self.server.expect(JOB, {ATTR_state: 'R'}, id=jid_highp,
                           interval=10)
        self.server.expect(JOB, {ATTR_state: (MATCH_RE, 'S|Q')}, id=jid)
        search_str = jid_highp + ";Considering job to run"
        (_, str1) = self.scheduler.log_match(search_str, id=jid_highp,
                                             n='ALL', max_attempts=1,
                                             interval=2)
        search_str = jid_highp + ";Job run"
        (_, str2) = self.scheduler.log_match(search_str, id=jid_highp,
                                             n='ALL', max_attempts=1,
                                             interval=2)
        date_time1 = str1.split(";")[0]
        date_time2 = str2.split(";")[0]
        epoch1 = self.lu.convert_date_time(date_time1)
        epoch2 = self.lu.convert_date_time(date_time2)
        time_diff = epoch2 - epoch1
        self.logger.info('#' * 80)
        self.logger.info('#' * 80)
        res_str = "RESULT: PREEMPTION TOOK: " + str(time_diff) + " SECONDS"
        self.logger.info(res_str)
        self.logger.info('#' * 80)
        self.logger.info('#' * 80)
        self.perf_test_result(time_diff, "preempt_time_nonconsumable_resc",
                              "sec")

    @timeout(3600)
    @tags('sched', 'scheduling_policy')
    def test_insufficient_resc_multiple_non_cons(self):
        """
        Submit a number of low priority jobs and then submit a high
        priority job that needs a non-consumable resource in 2 chunks.
        These resources are assigned to last two running jobs. This will
        make scheduler go through all running jobs to find preemptable
        jobs.
        """
        a = {'type': 'string', 'flag': 'h'}
        self.server.manager(MGR_CMD_CREATE, RSC, a, id='qlist')
        a = {ATTR_rescavail + ".qlist": "list1",
             ATTR_rescavail + ".ncpus": "8"}
        self.server.create_vnodes("vn1", a, 400, self.mom, additive=True,
                                  fname="vnodedef1")
        # two single-cpu vnodes carry the qlist values the preemptor needs
        a = {ATTR_rescavail + ".qlist": "list2",
             ATTR_rescavail + ".ncpus": "1"}
        self.server.create_vnodes("vn2", a, 1, self.mom, additive=True,
                                  fname="vnodedef2")
        a = {ATTR_rescavail + ".qlist": "list3",
             ATTR_rescavail + ".ncpus": "1"}
        self.server.create_vnodes("vn3", a, 1, self.mom, additive=True,
                                  fname="vnodedef3")
        self.server.manager(MGR_CMD_SET, SERVER, {'scheduling': 'False'})
        a = {ATTR_l + '.select': '1:ncpus=1:qlist=list1'}
        for _ in range(3200):
            j = Job(TEST_USER, attrs=a)
            j.set_sleep_time(3000)
            self.server.submit(j)
        a = {ATTR_l + '.select': '1:ncpus=1:qlist=list2'}
        j = Job(TEST_USER, attrs=a)
        j.set_sleep_time(3000)
        b = {ATTR_l + '.select': '1:ncpus=1:qlist=list3'}
        j2 = Job(TEST_USER, attrs=b)
        j2.set_sleep_time(3000)
        # Add qlist to the resources scheduler checks for
        self.scheduler.add_resource('qlist')
        self.server.manager(MGR_CMD_UNSET, SCHED, 'preempt_sort',
                            runas=ROOT_USER)
        jid = self.server.submit(j)
        jid2 = self.server.submit(j2)
        self.server.manager(MGR_CMD_SET, SERVER, {'scheduling': 'True'})
        self.server.expect(JOB, {'job_state=R': 3202}, interval=20,
                           offset=15)
        qname = 'highp'
        a = {'queue_type': 'execution', 'priority': '200',
             'started': 'True', 'enabled': 'True'}
        self.server.manager(MGR_CMD_CREATE, QUEUE, a, qname)
        a = {ATTR_l + '.select':
             '1:ncpus=1:qlist=list2+1:ncpus=1:qlist=list3',
             ATTR_q: 'highp'}
        j = Job(TEST_USER, attrs=a)
        j.set_sleep_time(3000)
        jid_highp = self.server.submit(j)
        self.server.expect(JOB, {ATTR_state: 'R'}, id=jid_highp,
                           interval=10)
        self.server.expect(JOB, {ATTR_state: (MATCH_RE, 'S|Q')}, id=jid)
        self.server.expect(JOB, {ATTR_state: (MATCH_RE, 'S|Q')}, id=jid2)
        search_str = jid_highp + ";Considering job to run"
        (_, str1) = self.scheduler.log_match(search_str, id=jid_highp,
                                             n='ALL', max_attempts=1,
                                             interval=2)
        search_str = jid_highp + ";Job run"
        (_, str2) = self.scheduler.log_match(search_str, id=jid_highp,
                                             n='ALL', max_attempts=1,
                                             interval=2)
        date_time1 = str1.split(";")[0]
        date_time2 = str2.split(";")[0]
        epoch1 = self.lu.convert_date_time(date_time1)
        epoch2 = self.lu.convert_date_time(date_time2)
        time_diff = epoch2 - epoch1
        self.logger.info('#' * 80)
        self.logger.info('#' * 80)
        res_str = "RESULT: PREEMPTION TOOK: " + str(time_diff) + " SECONDS"
        self.logger.info(res_str)
        self.logger.info('#' * 80)
        self.logger.info('#' * 80)
        self.perf_test_result(time_diff,
                              "preempt_time_multiplenonconsumable_resc",
                              "sec")

    @timeout(3600)
    @tags('sched', 'scheduling_policy')
    def test_insufficient_server_resc(self):
        """
        Submit a number of low priority jobs and then make the last low
        priority job to consume some server level resources. Submit a
        high priority job that request for this server level resource
        and measure the time it takes for preemption.
        """
        a = {'type': 'long', 'flag': 'q'}
        self.server.manager(MGR_CMD_CREATE, RSC, a, id='foo')
        a = {ATTR_rescavail + ".ncpus": "8"}
        self.server.create_vnodes("vn1", a, 401, self.mom, additive=True,
                                  fname="vnodedef1")
        # Make resource foo available on server
        a = {ATTR_rescavail + ".foo": 50, 'scheduling': 'False'}
        self.server.manager(MGR_CMD_SET, SERVER, a)
        a = {ATTR_l + '.select': '1:ncpus=1'}
        for _ in range(3200):
            j = Job(TEST_USER, attrs=a)
            j.set_sleep_time(3000)
            self.server.submit(j)
        # Add foo to the resources scheduler checks for
        self.scheduler.add_resource('foo')
        self.server.manager(MGR_CMD_UNSET, SCHED, 'preempt_sort',
                            runas=ROOT_USER)
        # last low-priority job consumes half the server-level foo
        a = {ATTR_l + '.select': '1:ncpus=1', ATTR_l + '.foo': 25}
        j = Job(TEST_USER, attrs=a)
        j.set_sleep_time(3000)
        jid = self.server.submit(j)
        self.server.manager(MGR_CMD_SET, SERVER, {'scheduling': 'True'})
        self.server.expect(JOB, {'job_state=R': 3201}, interval=20,
                           offset=15)
        qname = 'highp'
        a = {'queue_type': 'execution', 'priority': '200',
             'started': 'True', 'enabled': 'True'}
        self.server.manager(MGR_CMD_CREATE, QUEUE, a, qname)
        # preemptor needs all of foo, forcing preemption of jid
        a = {ATTR_l + '.select': '1:ncpus=1', ATTR_l + '.foo': 50,
             ATTR_q: 'highp'}
        j2 = Job(TEST_USER, attrs=a)
        j2.set_sleep_time(3000)
        jid_highp = self.server.submit(j2)
        self.server.expect(JOB, {ATTR_state: 'R'}, id=jid_highp,
                           interval=10)
        self.server.expect(JOB, {ATTR_state: (MATCH_RE, 'S|Q')}, id=jid)
        search_str = jid_highp + ";Considering job to run"
        (_, str1) = self.scheduler.log_match(search_str, id=jid_highp,
                                             n='ALL', max_attempts=1,
                                             interval=2)
        search_str = jid_highp + ";Job run"
        (_, str2) = self.scheduler.log_match(search_str, id=jid_highp,
                                             n='ALL', max_attempts=1,
                                             interval=2)
        date_time1 = str1.split(";")[0]
        date_time2 = str2.split(";")[0]
        epoch1 = self.lu.convert_date_time(date_time1)
        epoch2 = self.lu.convert_date_time(date_time2)
        time_diff = epoch2 - epoch1
        self.logger.info('#' * 80)
        self.logger.info('#' * 80)
        res_str = "RESULT: PREEMPTION TOOK: " + str(time_diff) + " SECONDS"
        self.logger.info(res_str)
        self.logger.info('#' * 80)
        self.logger.info('#' * 80)
        self.perf_test_result(time_diff, "High_priority_preemption", "sec")

    @timeout(7200)
    def test_preemption_basic(self):
        """
        Submit a number of low priority job and then submit a high
        priority job.
        """
        a = {ATTR_rescavail + ".ncpus": "8"}
        self.server.create_vnodes("vn1", a, 400, self.mom, additive=True,
                                  fname="vnodedef1")
        self.server.manager(MGR_CMD_SET, SERVER, {'scheduling': 'False'})
        a = {ATTR_l + '.select': '1:ncpus=1'}
        for _ in range(3200):
            j = Job(TEST_USER, attrs=a)
            j.set_sleep_time(3000)
            self.server.submit(j)
        self.server.manager(MGR_CMD_SET, SERVER, {'scheduling': 'True'})
        self.server.expect(JOB, {'job_state=R': 3200}, interval=20,
                           offset=15)
        qname = 'highp'
        a = {'queue_type': 'execution', 'priority': '200',
             'started': 'True', 'enabled': 'True'}
        self.server.manager(MGR_CMD_CREATE, QUEUE, a, qname)
        # escalate the preemptor size each round: 20, 60, 180, ... cpus
        ncpus = 20
        S_jobs = 20
        for _ in range(5):
            a = {ATTR_l + '.select': ncpus, ATTR_q: 'highp'}
            j = Job(TEST_USER, attrs=a)
            j.set_sleep_time(3000)
            jid_highp = self.server.submit(j)
            self.server.expect(JOB, {ATTR_state: 'R'}, id=jid_highp,
                               interval=10)
            self.server.expect(JOB, {'job_state=S': S_jobs}, interval=5)
            search_str = jid_highp + ";Considering job to run"
            (_, str1) = self.scheduler.log_match(search_str, id=jid_highp,
                                                 n='ALL', max_attempts=1)
            search_str = jid_highp + ";Job run"
            (_, str2) = self.scheduler.log_match(search_str, id=jid_highp,
                                                 n='ALL', max_attempts=1)
            date_time1 = str1.split(";")[0]
            date_time2 = str2.split(";")[0]
            epoch1 = self.lu.convert_date_time(date_time1)
            epoch2 = self.lu.convert_date_time(date_time2)
            time_diff = epoch2 - epoch1
            self.logger.info('#' * 80)
            self.logger.info('#' * 80)
            res_str = "RESULT: PREEMPTION OF " + str(ncpus) + \
                " JOBS TOOK: " + str(time_diff) + " SECONDS"
            self.logger.info(res_str)
            self.logger.info('#' * 80)
            self.logger.info('#' * 80)
            ncpus *= 3
            S_jobs += ncpus
        # records only the last round's measurement
        self.perf_test_result(time_diff, "preemption_time", "sec")

    @timeout(3600)
    def test_preemption_with_unrelated_soft_limits(self):
        """
        Measure the time scheduler takes to preempt when there are user
        soft limits in the system and preemptor and preemptee jobs are
        submitted as different user.
        """
        a = {'resources_available.ncpus': 4,
             'resources_available.mem': '6400mb'}
        self.server.create_vnodes('vn', a, 500, self.mom,
                                  usenatvnode=False, sharednode=False)
        p = "express_queue, normal_jobs, server_softlimits, " \
            "queue_softlimits"
        a = {'preempt_prio': p}
        self.server.manager(MGR_CMD_SET, SCHED, a)
        # soft limit applies to TEST_USER, who submits no preemptee jobs
        a = {'max_run_res_soft.ncpus': "[u:" + str(TEST_USER) + "=1]"}
        self.server.manager(MGR_CMD_SET, QUEUE, a, 'workq')
        self.server.manager(MGR_CMD_SET, SERVER, {'scheduling': 'False'})
        # submit a bunch of jobs as TEST_USER2
        a = {ATTR_l + '.select=1:ncpus': 1}
        for _ in range(2000):
            j = Job(TEST_USER2, attrs=a)
            j.set_sleep_time(3000)
            self.server.submit(j)
        self.server.manager(MGR_CMD_SET, SERVER, {'scheduling': 'True'})
        self.server.expect(JOB, {'job_state=R': 2000}, interval=10,
                           offset=5, max_attempts=100)
        qname = 'highp'
        a = {'queue_type': 'execution', 'priority': '200',
             'started': 'True', 'enabled': 'True'}
        self.server.manager(MGR_CMD_CREATE, QUEUE, a, qname)
        self.server.manager(MGR_CMD_SET, SERVER, {'scheduling': 'False'})
        a = {ATTR_l + '.select=2000:ncpus': 1, ATTR_q: qname}
        j = Job(TEST_USER3, attrs=a)
        j.set_sleep_time(3000)
        hjid = self.server.submit(j)
        # NOTE(review): scycle is never used afterwards
        scycle = time.time()
        self.server.manager(MGR_CMD_SET, SERVER, {'scheduling': 'True'})
        (_, str1) = self.scheduler.log_match(hjid +
                                             ";Considering job to run")
        date_time1 = str1.split(";")[0]
        epoch1 = self.lu.convert_date_time(date_time1)
        # make sure 2000 jobs were suspended
        self.server.expect(JOB, {'job_state=S': 2000}, interval=10,
                           offset=5, max_attempts=100)
        # check when server received the request
        (_, req_svr) = self.server.log_match(";Type 93 request received",
                                             starttime=epoch1)
        date_time_svr = req_svr.split(";")[0]
        epoch_svr = self.lu.convert_date_time(date_time_svr)
        # check when scheduler gets first reply from server
        (_, resp_sched) = self.scheduler.log_match(";Job preempted ",
                                                   starttime=epoch1)
        date_time_sched = resp_sched.split(";")[0]
        epoch_sched = self.lu.convert_date_time(date_time_sched)
        svr_delay = epoch_sched - epoch_svr
        # record the start time of high priority job
        (_, str2) = self.scheduler.log_match(hjid + ";Job run", n='ALL',
                                             interval=2)
        date_time2 = str2.split(";")[0]
        epoch2 = self.lu.convert_date_time(date_time2)
        time_diff = epoch2 - epoch1
        self.logger.info('#' * 80)
        self.logger.info('#' * 80)
        res_str = "RESULT: TOTAL PREEMPTION TIME: " + \
            str(time_diff) + " SECONDS, SERVER TOOK: " + \
            str(svr_delay) + " , SCHED TOOK: " + \
            str(time_diff - svr_delay)
        self.logger.info(res_str)
        self.logger.info('#' * 80)
        self.logger.info('#' * 80)
        # NOTE(review): unlike the sibling tests, no perf_test_result()
        # call here — confirm whether the metric should be recorded

    @timeout(3600)
    def test_preemption_with_user_soft_limits(self):
        """
        Measure the time scheduler takes to preempt when there are user
        soft limits in the system for one user and only some preemptee
        jobs are submitted as that user.
        """
        a = {'resources_available.ncpus': 4,
             'resources_available.mem': '6400mb'}
        self.server.create_vnodes('vn', a, 500, self.mom,
                                  usenatvnode=False, sharednode=False)
        p = "express_queue, normal_jobs, server_softlimits, " \
            "queue_softlimits"
        a = {'preempt_prio': p}
        self.server.manager(MGR_CMD_SET, SCHED, a)
        a = {'max_run_res_soft.ncpus': "[u:" + str(TEST_USER) + "=1]"}
        self.server.manager(MGR_CMD_SET, QUEUE, a, 'workq')
        self.server.manager(MGR_CMD_SET, SERVER, {'scheduling': 'False'})
        # submit a bunch of jobs as different users
        a = {ATTR_l + '.select=1:ncpus': 1}
        usr_list = [TEST_USER, TEST_USER2, TEST_USER3, TEST_USER4]
        num_usr = len(usr_list)
        for ind in range(2000):
            j = Job(usr_list[ind % num_usr], attrs=a)
            j.set_sleep_time(3000)
            self.server.submit(j)
        self.server.manager(MGR_CMD_SET, SERVER, {'scheduling': 'True'})
        self.server.expect(JOB, {'job_state=R': 2000}, interval=10,
                           offset=5, max_attempts=100)
        qname = 'highp'
        a = {'queue_type': 'execution', 'priority': '200',
             'started': 'True', 'enabled': 'True'}
        self.server.manager(MGR_CMD_CREATE, QUEUE, a, qname)
        self.server.manager(MGR_CMD_SET, SERVER, {'scheduling': 'False'})
        a = {ATTR_l + '.select=2000:ncpus': 1, ATTR_q: qname}
        j = Job(TEST_USER5, attrs=a)
        j.set_sleep_time(3000)
        hjid = self.server.submit(j)
        # NOTE(review): scycle is never used afterwards
        scycle = time.time()
        self.server.manager(MGR_CMD_SET, SERVER, {'scheduling': 'True'})
        (_, str1) = self.scheduler.log_match(hjid +
                                             ";Considering job to run")
        date_time1 = str1.split(";")[0]
        epoch1 = self.lu.convert_date_time(date_time1)
        # make sure 2000 jobs were suspended
        self.server.expect(JOB, {'job_state=S': 2000}, interval=10,
                           offset=5, max_attempts=100)
        # check when server received the request
        (_, req_svr) = self.server.log_match(";Type 93 request received",
                                             starttime=epoch1)
        date_time_svr = req_svr.split(";")[0]
        epoch_svr = self.lu.convert_date_time(date_time_svr)
        # check when scheduler gets first reply from server
        (_, resp_sched) = self.scheduler.log_match(";Job preempted ",
                                                   starttime=epoch1)
        date_time_sched = resp_sched.split(";")[0]
        epoch_sched = self.lu.convert_date_time(date_time_sched)
        svr_delay = epoch_sched - epoch_svr
        # record the start time of high priority job
        (_, str2) = self.scheduler.log_match(hjid + ";Job run", n='ALL',
                                             interval=2)
        date_time2 = str2.split(";")[0]
        epoch2 = self.lu.convert_date_time(date_time2)
        time_diff = epoch2 - epoch1
        self.logger.info('#' * 80)
        self.logger.info('#' * 80)
        res_str = "RESULT: TOTAL PREEMPTION TIME: " + \
            str(time_diff) + " SECONDS, SERVER TOOK: " + \
            str(svr_delay) + " , SCHED TOOK: " + \
            str(time_diff - svr_delay)
        self.logger.info(res_str)
        self.logger.info('#' * 80)
        self.logger.info('#' * 80)
        self.perf_test_result(time_diff, "preempt_time_soft_limits", "sec")

    def tearDown(self):
        TestPerformance.tearDown(self)
        # stop scheduling before mass-deleting the workload jobs
        self.server.manager(MGR_CMD_SET, SERVER, {'scheduling': 'False'})
        job_ids = self.server.select()
        self.server.delete(id=job_ids)
class TestPreemptPerformance(TestPerformance):

    """
    Check the preemption performance.

    Each test floods the complex with low-priority running jobs, submits
    high-priority work, and measures the wall time between the scheduler's
    "Considering job to run" and "Job run" log lines for the preemptor.
    """
    lu = PBSLogUtils()

    def setUp(self):
        """
        Slow mom polling down and install a checkpoint_abort action so
        preempted jobs can be checkpointed via a simple kill script.
        """
        TestPerformance.setUp(self)
        # set poll cycle to a high value because mom spends a lot of time
        # in gathering job's resources used. We don't need that in this test
        self.mom.add_config({'$min_check_poll': 7200,
                             '$max_check_poll': 9600})
        abort_script = """#!/bin/bash
kill $1
exit 0
"""
        self.abort_file = self.du.create_temp_file(body=abort_script)
        # FIX: 0o755 is the Python 3 octal literal; the old 0755 spelling
        # is a SyntaxError on Python 3 (PEP 3127)
        self.du.chmod(path=self.abort_file, mode=0o755)
        self.du.chown(path=self.abort_file, uid=0, gid=0, runas=ROOT_USER)
        c = {'$action': 'checkpoint_abort 30 !' + self.abort_file + ' %sid'}
        self.mom.add_config(c)
        self.platform = self.du.get_platform()

    def create_workload_and_preempt(self):
        """
        Shared workload driver: fill the complex with ~1590 running jobs
        under soft/hard limits, then submit one large job to 'workq2' and
        log how long the scheduler took from considering it to running it.
        """
        a = {'queue_type': 'execution', 'started': 'True', 'enabled': 'True'}
        self.server.manager(MGR_CMD_CREATE, QUEUE, a, 'workq2')
        a = {'max_run_res_soft.ncpus': "[u:PBS_GENERIC=2]"}
        self.server.manager(MGR_CMD_SET, QUEUE, a, 'workq')
        a = {'max_run_res.mem': "[u:" + str(TEST_USER) + "=1500mb]"}
        self.server.manager(MGR_CMD_SET, SERVER, a)
        # small memory-consuming jobs: these push TEST_USER toward the
        # server-level mem hard limit
        a = {
            'Resource_List.select': '1:ncpus=3:mem=90mb',
            'Resource_List.walltime': 9999
        }
        for _ in range(8):
            j = Job(TEST_USER, attrs=a)
            j.set_sleep_time(9999)
            self.server.submit(j)
        for _ in range(7):
            j = Job(TEST_USER1, attrs=a)
            j.set_sleep_time(9999)
            self.server.submit(j)
        sched_off = {'scheduling': 'False'}
        self.server.manager(MGR_CMD_SET, SERVER, sched_off)
        # bulk cpu-only filler jobs; submitted with scheduling off so the
        # scheduler starts them all in one cycle
        a = {
            'Resource_List.select': '1:ncpus=3',
            'Resource_List.walltime': 9999
        }
        for _ in range(775):
            j = Job(TEST_USER, attrs=a)
            j.set_sleep_time(9999)
            self.server.submit(j)
        for _ in range(800):
            j = Job(TEST_USER1, attrs=a)
            j.set_sleep_time(9999)
            self.server.submit(j)
        sched_on = {'scheduling': 'True'}
        self.server.manager(MGR_CMD_SET, SERVER, sched_on)
        # 8 + 7 + 775 + 800 = 1590 jobs expected running
        self.server.expect(JOB, {'job_state=R': 1590}, offset=15, interval=20)
        # the preemptor: large enough to force suspensions in workq
        a = {
            'Resource_List.select': '1:ncpus=90:mem=1350mb',
            'Resource_List.walltime': 9999,
            ATTR_queue: 'workq2'
        }
        j1 = Job(TEST_USER, attrs=a)
        j1.set_sleep_time(9999)
        j1id = self.server.submit(j1)
        self.server.expect(JOB, {'job_state': 'R'}, id=j1id, offset=15,
                           interval=5)
        self.server.expect(JOB, {'job_state=S': 20}, interval=5)
        (_, str1) = self.scheduler.log_match(j1id + ";Considering job to run",
                                             id=j1id, n='ALL',
                                             max_attempts=1, interval=2)
        (_, str2) = self.scheduler.log_match(j1id + ";Job run", id=j1id,
                                             n='ALL', max_attempts=1,
                                             interval=2)
        # scheduler log lines are "<timestamp>;<message>"
        date_time1 = str1.split(";")[0]
        date_time2 = str2.split(";")[0]
        epoch1 = self.lu.convert_date_time(date_time1)
        epoch2 = self.lu.convert_date_time(date_time2)
        time_diff = epoch2 - epoch1
        self.logger.info('#' * 80)
        self.logger.info('#' * 80)
        res_str = "RESULT: THE TIME TAKEN IS : " + str(time_diff) + " SECONDS"
        self.logger.info(res_str)
        self.logger.info('#' * 80)
        self.logger.info('#' * 80)

    @timeout(3600)
    @tags('sched', 'scheduling_policy')
    def test_preemption_with_limits(self):
        """
        Measure the time scheduler takes to preempt when the high priority
        job hits soft/hard limits under a considerable amount of workload.
        """
        a = {
            'resources_available.ncpus': 4800,
            'resources_available.mem': '2800mb'
        }
        self.server.create_vnodes('vn', a, 1, self.mom, usenatvnode=True)
        p = '"express_queue, normal_jobs, server_softlimits, queue_softlimits"'
        a = {'preempt_prio': p}
        self.server.manager(MGR_CMD_SET, SCHED, a, runas=ROOT_USER)
        self.create_workload_and_preempt()

    @timeout(3600)
    @tags('sched', 'scheduling_policy')
    def test_preemption_with_insufficient_resc(self):
        """
        Measure the time scheduler takes to preempt when the high priority
        job hits soft/hard limits and there is scarcity of resources
        under a considerable amount of workload.
        """
        # same as test_preemption_with_limits but with less memory, so the
        # preemptor is also short of resources
        a = {
            'resources_available.ncpus': 4800,
            'resources_available.mem': '1500mb'
        }
        self.server.create_vnodes('vn', a, 1, self.mom, usenatvnode=True)
        p = '"express_queue, normal_jobs, server_softlimits, queue_softlimits"'
        a = {'preempt_prio': p}
        self.server.manager(MGR_CMD_SET, SCHED, a, runas=ROOT_USER)
        self.create_workload_and_preempt()

    @timeout(3600)
    @tags('sched', 'scheduling_policy')
    def test_insufficient_resc_non_cons(self):
        """
        Submit a number of low priority job and then submit a high
        priority job that needs a non-consumable resource which is
        assigned to last running job. This will make scheduler go through
        all running jobs to find the preemptable job.
        """
        # host-level string resource used to pin jobs to specific vnodes
        a = {'type': 'string', 'flag': 'h'}
        self.server.manager(MGR_CMD_CREATE, RSC, a, id='qlist')
        a = {
            ATTR_rescavail + ".qlist": "list1",
            ATTR_rescavail + ".ncpus": "8"
        }
        self.server.create_vnodes("vn1", a, 400, self.mom, additive=True,
                                  fname="vnodedef1")
        # a single 1-cpu vnode holding the contended resource value
        a = {
            ATTR_rescavail + ".qlist": "list2",
            ATTR_rescavail + ".ncpus": "1"
        }
        self.server.create_vnodes("vn2", a, 1, self.mom, additive=True,
                                  fname="vnodedef2")
        self.server.manager(MGR_CMD_SET, SERVER, {'scheduling': 'False'})
        a = {ATTR_l + '.select': '1:ncpus=1:qlist=list1'}
        for _ in range(3200):
            j = Job(TEST_USER, attrs=a)
            j.set_sleep_time(3000)
            self.server.submit(j)
        a = {ATTR_l + '.select': '1:ncpus=1:qlist=list2'}
        j = Job(TEST_USER, attrs=a)
        j.set_sleep_time(3000)
        # Add qlist to the resources scheduler checks for
        self.scheduler.add_resource('qlist')
        # unset preempt_sort so the scheduler scans jobs in submission
        # order, forcing a full walk before finding the preemptee
        self.server.manager(MGR_CMD_UNSET, SCHED, 'preempt_sort',
                            runas=ROOT_USER)
        jid = self.server.submit(j)
        self.server.manager(MGR_CMD_SET, SERVER, {'scheduling': 'True'})
        self.server.expect(JOB, {'job_state=R': 3201}, interval=20, offset=15)
        qname = 'highp'
        a = {
            'queue_type': 'execution',
            'priority': '200',
            'started': 'True',
            'enabled': 'True'
        }
        self.server.manager(MGR_CMD_CREATE, QUEUE, a, qname)
        # high priority job wants the qlist value held by the last job
        a = {ATTR_l + '.select': '1:ncpus=1:qlist=list2', ATTR_q: 'highp'}
        j = Job(TEST_USER, attrs=a)
        j.set_sleep_time(3000)
        jid_highp = self.server.submit(j)
        self.server.expect(JOB, {ATTR_state: 'R'}, id=jid_highp, interval=10)
        self.server.expect(JOB, {ATTR_state: (MATCH_RE, 'S|Q')}, id=jid)
        search_str = jid_highp + ";Considering job to run"
        (_, str1) = self.scheduler.log_match(search_str, id=jid_highp,
                                             n='ALL', max_attempts=1,
                                             interval=2)
        search_str = jid_highp + ";Job run"
        (_, str2) = self.scheduler.log_match(search_str, id=jid_highp,
                                             n='ALL', max_attempts=1,
                                             interval=2)
        date_time1 = str1.split(";")[0]
        date_time2 = str2.split(";")[0]
        epoch1 = self.lu.convert_date_time(date_time1)
        epoch2 = self.lu.convert_date_time(date_time2)
        time_diff = epoch2 - epoch1
        self.logger.info('#' * 80)
        self.logger.info('#' * 80)
        res_str = "RESULT: PREEMPTION TOOK: " + str(time_diff) + " SECONDS"
        self.logger.info(res_str)
        self.logger.info('#' * 80)
        self.logger.info('#' * 80)

    @timeout(3600)
    @tags('sched', 'scheduling_policy')
    def test_insufficient_resc_multiple_non_cons(self):
        """
        Submit a number of low priority jobs and then submit a high
        priority job that needs a non-consumable resource in 2 chunks.
        These resources are assigned to last two running jobs. This will
        make scheduler go through all running jobs to find preemptable
        jobs.
        """
        a = {'type': 'string', 'flag': 'h'}
        self.server.manager(MGR_CMD_CREATE, RSC, a, id='qlist')
        a = {
            ATTR_rescavail + ".qlist": "list1",
            ATTR_rescavail + ".ncpus": "8"
        }
        self.server.create_vnodes("vn1", a, 400, self.mom, additive=True,
                                  fname="vnodedef1")
        # two single-cpu vnodes, each holding one contended qlist value
        a = {
            ATTR_rescavail + ".qlist": "list2",
            ATTR_rescavail + ".ncpus": "1"
        }
        self.server.create_vnodes("vn2", a, 1, self.mom, additive=True,
                                  fname="vnodedef2")
        a = {
            ATTR_rescavail + ".qlist": "list3",
            ATTR_rescavail + ".ncpus": "1"
        }
        self.server.create_vnodes("vn3", a, 1, self.mom, additive=True,
                                  fname="vnodedef3")
        self.server.manager(MGR_CMD_SET, SERVER, {'scheduling': 'False'})
        a = {ATTR_l + '.select': '1:ncpus=1:qlist=list1'}
        for _ in range(3200):
            j = Job(TEST_USER, attrs=a)
            j.set_sleep_time(3000)
            self.server.submit(j)
        a = {ATTR_l + '.select': '1:ncpus=1:qlist=list2'}
        j = Job(TEST_USER, attrs=a)
        j.set_sleep_time(3000)
        b = {ATTR_l + '.select': '1:ncpus=1:qlist=list3'}
        j2 = Job(TEST_USER, attrs=b)
        j2.set_sleep_time(3000)
        # Add qlist to the resources scheduler checks for
        self.scheduler.add_resource('qlist')
        # unset preempt_sort so the preemptees are found only after a
        # full scan of the running jobs
        self.server.manager(MGR_CMD_UNSET, SCHED, 'preempt_sort',
                            runas=ROOT_USER)
        jid = self.server.submit(j)
        jid2 = self.server.submit(j2)
        self.server.manager(MGR_CMD_SET, SERVER, {'scheduling': 'True'})
        self.server.expect(JOB, {'job_state=R': 3202}, interval=20, offset=15)
        qname = 'highp'
        a = {
            'queue_type': 'execution',
            'priority': '200',
            'started': 'True',
            'enabled': 'True'
        }
        self.server.manager(MGR_CMD_CREATE, QUEUE, a, qname)
        # high priority job needs both contended qlist values at once
        a = {
            ATTR_l + '.select': '1:ncpus=1:qlist=list2+1:ncpus=1:qlist=list3',
            ATTR_q: 'highp'
        }
        j = Job(TEST_USER, attrs=a)
        j.set_sleep_time(3000)
        jid_highp = self.server.submit(j)
        self.server.expect(JOB, {ATTR_state: 'R'}, id=jid_highp, interval=10)
        self.server.expect(JOB, {ATTR_state: (MATCH_RE, 'S|Q')}, id=jid)
        self.server.expect(JOB, {ATTR_state: (MATCH_RE, 'S|Q')}, id=jid2)
        search_str = jid_highp + ";Considering job to run"
        (_, str1) = self.scheduler.log_match(search_str, id=jid_highp,
                                             n='ALL', max_attempts=1,
                                             interval=2)
        search_str = jid_highp + ";Job run"
        (_, str2) = self.scheduler.log_match(search_str, id=jid_highp,
                                             n='ALL', max_attempts=1,
                                             interval=2)
        date_time1 = str1.split(";")[0]
        date_time2 = str2.split(";")[0]
        epoch1 = self.lu.convert_date_time(date_time1)
        epoch2 = self.lu.convert_date_time(date_time2)
        time_diff = epoch2 - epoch1
        self.logger.info('#' * 80)
        self.logger.info('#' * 80)
        res_str = "RESULT: PREEMPTION TOOK: " + str(time_diff) + " SECONDS"
        self.logger.info(res_str)
        self.logger.info('#' * 80)
        self.logger.info('#' * 80)

    @timeout(3600)
    @tags('sched', 'scheduling_policy')
    def test_insufficient_server_resc(self):
        """
        Submit a number of low priority jobs and then make the last low
        priority job to consume some server level resources. Submit a
        high priority job that request for this server level resource and
        measure the time it takes for preemption.
        """
        # server-level consumable resource
        a = {'type': 'long', 'flag': 'q'}
        self.server.manager(MGR_CMD_CREATE, RSC, a, id='foo')
        a = {ATTR_rescavail + ".ncpus": "8"}
        self.server.create_vnodes("vn1", a, 401, self.mom, additive=True,
                                  fname="vnodedef1")
        # Make resource foo available on server
        a = {ATTR_rescavail + ".foo": 50, 'scheduling': 'False'}
        self.server.manager(MGR_CMD_SET, SERVER, a)
        a = {ATTR_l + '.select': '1:ncpus=1'}
        for _ in range(3200):
            j = Job(TEST_USER, attrs=a)
            j.set_sleep_time(3000)
            self.server.submit(j)
        # Add foo to the resources scheduler checks for
        self.scheduler.add_resource('foo')
        self.server.manager(MGR_CMD_UNSET, SCHED, 'preempt_sort',
                            runas=ROOT_USER)
        # the last low-priority job takes half of server foo
        a = {ATTR_l + '.select': '1:ncpus=1', ATTR_l + '.foo': 25}
        j = Job(TEST_USER, attrs=a)
        j.set_sleep_time(3000)
        jid = self.server.submit(j)
        self.server.manager(MGR_CMD_SET, SERVER, {'scheduling': 'True'})
        self.server.expect(JOB, {'job_state=R': 3201}, interval=20, offset=15)
        qname = 'highp'
        a = {
            'queue_type': 'execution',
            'priority': '200',
            'started': 'True',
            'enabled': 'True'
        }
        self.server.manager(MGR_CMD_CREATE, QUEUE, a, qname)
        # high priority job demands all of foo, so the foo-holding job
        # must be preempted
        a = {
            ATTR_l + '.select': '1:ncpus=1',
            ATTR_l + '.foo': 50,
            ATTR_q: 'highp'
        }
        j2 = Job(TEST_USER, attrs=a)
        j2.set_sleep_time(3000)
        jid_highp = self.server.submit(j2)
        self.server.expect(JOB, {ATTR_state: 'R'}, id=jid_highp, interval=10)
        self.server.expect(JOB, {ATTR_state: (MATCH_RE, 'S|Q')}, id=jid)
        search_str = jid_highp + ";Considering job to run"
        (_, str1) = self.scheduler.log_match(search_str, id=jid_highp,
                                             n='ALL', max_attempts=1,
                                             interval=2)
        search_str = jid_highp + ";Job run"
        (_, str2) = self.scheduler.log_match(search_str, id=jid_highp,
                                             n='ALL', max_attempts=1,
                                             interval=2)
        date_time1 = str1.split(";")[0]
        date_time2 = str2.split(";")[0]
        epoch1 = self.lu.convert_date_time(date_time1)
        epoch2 = self.lu.convert_date_time(date_time2)
        time_diff = epoch2 - epoch1
        self.logger.info('#' * 80)
        self.logger.info('#' * 80)
        res_str = "RESULT: PREEMPTION TOOK: " + str(time_diff) + " SECONDS"
        self.logger.info(res_str)
        self.logger.info('#' * 80)
        self.logger.info('#' * 80)

    @timeout(7200)
    def test_preemption_basic(self):
        """
        Submit a number of low priority job and then submit a high
        priority job.
        """
        a = {ATTR_rescavail + ".ncpus": "8"}
        self.server.create_vnodes("vn1", a, 400, self.mom, additive=True,
                                  fname="vnodedef1")
        self.server.manager(MGR_CMD_SET, SERVER, {'scheduling': 'False'})
        a = {ATTR_l + '.select': '1:ncpus=1'}
        for _ in range(3200):
            j = Job(TEST_USER, attrs=a)
            j.set_sleep_time(3000)
            self.server.submit(j)
        self.server.manager(MGR_CMD_SET, SERVER, {'scheduling': 'True'})
        self.server.expect(JOB, {'job_state=R': 3200}, interval=20, offset=15)
        qname = 'highp'
        a = {
            'queue_type': 'execution',
            'priority': '200',
            'started': 'True',
            'enabled': 'True'
        }
        self.server.manager(MGR_CMD_CREATE, QUEUE, a, qname)
        # each round triples the preemptor size; S_jobs accumulates the
        # total number of suspended 1-cpu jobs expected so far
        ncpus = 20
        S_jobs = 20
        for _ in range(5):
            a = {ATTR_l + '.select': ncpus, ATTR_q: 'highp'}
            j = Job(TEST_USER, attrs=a)
            j.set_sleep_time(3000)
            jid_highp = self.server.submit(j)
            self.server.expect(JOB, {ATTR_state: 'R'}, id=jid_highp,
                               interval=10)
            self.server.expect(JOB, {'job_state=S': S_jobs}, interval=5)
            search_str = jid_highp + ";Considering job to run"
            (_, str1) = self.scheduler.log_match(search_str, id=jid_highp,
                                                 n='ALL', max_attempts=1)
            search_str = jid_highp + ";Job run"
            (_, str2) = self.scheduler.log_match(search_str, id=jid_highp,
                                                 n='ALL', max_attempts=1)
            date_time1 = str1.split(";")[0]
            date_time2 = str2.split(";")[0]
            epoch1 = self.lu.convert_date_time(date_time1)
            epoch2 = self.lu.convert_date_time(date_time2)
            time_diff = epoch2 - epoch1
            self.logger.info('#' * 80)
            self.logger.info('#' * 80)
            res_str = "RESULT: PREEMPTION OF " + str(ncpus) + " JOBS TOOK: " \
                + str(time_diff) + " SECONDS"
            self.logger.info(res_str)
            self.logger.info('#' * 80)
            self.logger.info('#' * 80)
            ncpus *= 3
            S_jobs += ncpus

    def tearDown(self):
        """
        Stop scheduling and delete all leftover jobs so the next test
        starts from a clean server.
        """
        TestPerformance.tearDown(self)
        self.server.manager(MGR_CMD_SET, SERVER, {'scheduling': 'False'})
        job_ids = self.server.select()
        self.server.delete(id=job_ids)