def test_soak_3(self):
    """Submit an IOR job followed by a dmg job and check both complete.

    Test ID: DAOS-2192

    Test Description: this time try a dmg command combined with IOR run

    Use Cases:

    :avocado: tags=soak3
    """
    script1 = None
    script2 = None
    expected_results = 2
    try:
        # first job: IOR batch script built from the 'job1' parameters
        script1 = self.build_ior_script('job1')
        ior_job_id = slurm_utils.run_slurm_script(script1)
        slurm_utils.register_for_job_results(
            ior_job_id, self, maxwait=3600)

        # second job: dmg commands written out as a slurm batch script
        dmg_commands = dmg_utils.get_dmg_script(
            "dmg1", self.params, self.basepath)
        dmg_job_name = self.params.get("name", '/run/job3/')
        dmg_job_nodes = self.params.get("nodes", '/run/job3/')
        results_file = os.path.join(
            self.tmpdir, dmg_job_name + "_results.out")
        script2 = slurm_utils.write_slurm_script(
            self.tmpdir, dmg_job_name, results_file, dmg_job_nodes,
            dmg_commands)
        dmg_job_id = slurm_utils.run_slurm_script(script2)
        slurm_utils.register_for_job_results(
            dmg_job_id, self, maxwait=3600)

        # poll until a result has been recorded for each submitted job
        # NOTE(review): there is no local timeout on this loop; it
        # presumably relies on the registered callbacks always reporting
        # within maxwait - confirm against slurm_utils.
        while len(self.soak_results) < expected_results:
            time.sleep(10)

        # any result other than COMPLETED fails the test immediately
        for job, result in self.soak_results.iteritems():
            if result == "COMPLETED":
                continue
            self.fail(
                "Soak job: {} didn't complete as expected: {}".format(
                    job, result))
    except (DaosApiError, ior_utils.IorFailed) as error:
        self.fail("Soak Test 3 Failed\n {}".format(error))
    finally:
        # best-effort removal of the generated batch scripts; a script
        # that was never created (still None) or that cannot be removed
        # is silently ignored
        for script in (script1, script2):
            try:
                os.remove(script)
            except StandardError:
                pass
def test_soak_3(self):
    """Queue an IOR batch job and a dmg batch job, then verify both.

    Test ID: DAOS-2192

    Test Description: this time try a dmg command combined with IOR run

    Use Cases:

    :avocado: tags=soak3
    """
    script1 = None
    script2 = None
    try:
        # --- IOR job -----------------------------------------------------
        script1 = self.build_ior_script('job1')
        first_id = slurm_utils.run_slurm_script(script1)
        slurm_utils.register_for_job_results(first_id, self, maxwait=3600)

        # --- dmg job -----------------------------------------------------
        commands = dmg_utils.get_dmg_script(
            "dmg1", self.params, self.basepath)
        job_name = self.params.get("name", '/run/job3/')
        job_nodes = self.params.get("nodes", '/run/job3/')
        out_path = os.path.join(self.tmpdir, job_name + "_results.out")
        script2 = slurm_utils.write_slurm_script(
            self.tmpdir, job_name, out_path, job_nodes, commands)
        second_id = slurm_utils.run_slurm_script(script2)
        slurm_utils.register_for_job_results(second_id, self, maxwait=3600)

        # block until both jobs have an entry in self.soak_results
        while len(self.soak_results) < 2:
            time.sleep(10)

        # the first job whose result is not COMPLETED fails the test
        for job, result in self.soak_results.iteritems():
            if result != "COMPLETED":
                message = "Soak job: {} didn't complete as expected: {}"
                self.fail(message.format(job, result))
    except (DaosApiError, ior_utils.IorFailed) as error:
        self.fail("Soak Test 3 Failed\n {}".format(error))
    finally:
        # remove whatever batch scripts were generated; failures here
        # (including a script that is still None) are ignored on purpose
        for leftover in (script1, script2):
            try:
                os.remove(leftover)
            except StandardError:
                pass
def test_soak_1(self):
    """Submit two IOR jobs to slurm and check that both complete.

    Test ID: DAOS-2192

    Test Description: This test runs 2 DAOS API IOR jobs.

    :avocado: tags=soak1
    """
    # Bind both names before the try block so the cleanup in ``finally``
    # never touches an unbound local when an early step raises.
    script1 = None
    script2 = None
    try:
        # turn job parameters into slurm script
        script1 = self.build_ior_script('job1')

        # queue it up to run and register a callback to retrieve results
        job_id1 = slurm_utils.run_slurm_script(script1)
        slurm_utils.register_for_job_results(job_id1, self, maxwait=3600)

        # queue up a second job
        script2 = self.build_ior_script('job2')
        job_id2 = slurm_utils.run_slurm_script(script2)
        slurm_utils.register_for_job_results(job_id2, self, maxwait=3600)

        # wait for all the jobs to finish
        # NOTE(review): no local timeout here; presumably the registered
        # callbacks always report a result within maxwait - confirm.
        while len(self.soak_results) < 2:
            time.sleep(10)

        # any result other than COMPLETED fails the test
        for job, result in self.soak_results.iteritems():
            if result != "COMPLETED":
                self.fail(
                    "Soak job: {} didn't complete as expected: {}".format(
                        job, result))
    except (DaosApiError, ior_utils.IorFailed) as error:
        self.fail("<Soak Test 1 Failed>\n {}".format(error))
    finally:
        # best-effort cleanup of the scripts that were actually created
        for script in (script1, script2):
            if script is None:
                continue
            try:
                os.remove(script)
            except StandardError:
                # a leftover script must not change the test outcome
                pass
def test_soak_1(self):
    """Queue two IOR batch jobs and require both to finish COMPLETED.

    Test ID: DAOS-2192

    Test Description: This test runs 2 DAOS API IOR jobs.

    :avocado: tags=soak1
    """
    # Pre-bind the script names: the ``finally`` block below reads them
    # even if building or submitting the first script raises.
    script1 = None
    script2 = None
    try:
        # turn job parameters into slurm script
        script1 = self.build_ior_script('job1')

        # queue it up to run and register a callback to retrieve results
        job_id1 = slurm_utils.run_slurm_script(script1)
        slurm_utils.register_for_job_results(job_id1, self, maxwait=3600)

        # queue up a second job
        script2 = self.build_ior_script('job2')
        job_id2 = slurm_utils.run_slurm_script(script2)
        slurm_utils.register_for_job_results(job_id2, self, maxwait=3600)

        # wait for all the jobs to finish
        while len(self.soak_results) < 2:
            time.sleep(10)

        # fail on the first job that did not end in the COMPLETED state
        for job, result in self.soak_results.iteritems():
            if result != "COMPLETED":
                self.fail("Soak job: {} didn't complete as expected: {}".
                          format(job, result))
    except (DaosApiError, ior_utils.IorFailed) as error:
        self.fail("<Soak Test 1 Failed>\n {}".format(error))
    finally:
        # remove only the scripts that were really generated; removal
        # problems are ignored so they cannot mask the test result
        for script in (script1, script2):
            if script is not None:
                try:
                    os.remove(script)
                except StandardError:
                    pass
def job_startup(self, job_cmdlist):
    """Submit job batch script.

    Args:
        job_cmdlist (list): list of batch script files to submit

    Returns:
        list: integer IDs of each job submitted to slurm; empty when the
            soak end time has already passed on entry.

    Raises:
        SoakTestError: if slurm fails to submit any one of the scripts

    """
    self.log.info(
        "<<Job Startup - %s >> at %s", self.test_name, time.ctime())
    submitted_ids = []

    # nothing is queued once the overall soak end time has passed
    if time.time() > self.end_time:
        self.log.info("<< SOAK test timeout in Job Startup>>")
        return submitted_ids

    for script in job_cmdlist:
        try:
            job_id = slurm_utils.run_slurm_script(str(script))
        except slurm_utils.SlurmFailed as error:
            self.log.error(error)
            # a missing job id forces the failure path below
            job_id = None

        if not job_id:
            # one of the jobs failed to queue; exit on first fail for now.
            err_msg = "Slurm failed to submit job for {}".format(script)
            raise SoakTestError("<<FAILED: Soak {}: {}>>".format(
                self.test_name, err_msg))

        self.log.info(
            "<<Job %s started with %s >> at %s",
            job_id, script, time.ctime())
        slurm_utils.register_for_job_results(
            job_id, self, maxwait=self.test_timeout)
        # keep a list of the job_id's
        submitted_ids.append(int(job_id))

    return submitted_ids
def test_soak_2(self):
    """Submit a dmg batch script to slurm and check that it completes.

    Test ID: DAOS-2192

    Test Description: This test verifies that a dmg script can be
    submitted.

    :avocado: tags=soak2
    """
    script = None
    try:
        # wrap the dmg commands in a slurm batch script
        dmgcmds = dmg_utils.get_dmg_script(
            "dmg1", self.params, self.basepath)
        s2_job1_name = self.params.get("name", '/run/job3/')
        s2_job1_nodes = self.params.get("nodes", '/run/job3/')
        output = os.path.join(self.tmpdir, s2_job1_name + "_results.out")
        script = slurm_utils.write_slurm_script(
            self.tmpdir, s2_job1_name, output, s2_job1_nodes, dmgcmds)

        job_id = slurm_utils.run_slurm_script(script)
        slurm_utils.register_for_job_results(job_id, self, maxwait=3600)

        # wait for all the jobs to finish
        while len(self.soak_results) < 1:
            time.sleep(10)

        # any result other than COMPLETED fails the test
        for job, result in self.soak_results.iteritems():
            if result != "COMPLETED":
                self.fail(
                    "Soak job: {} didn't complete as expected: {}".format(
                        job, result))
    except (DaosApiError, ior_utils.IorFailed) as error:
        self.fail("Soak Test 2 Failed\n {}".format(error))
    finally:
        # Best-effort cleanup. Skip it when the script was never written:
        # os.remove(None) would raise from ``finally`` and replace the
        # exception that describes the real failure.
        if script is not None:
            try:
                os.remove(script)
            except OSError:
                # a leftover script must not change the test outcome
                pass
def test_soak_2(self):
    """Queue a single dmg batch job and require it to finish COMPLETED.

    Test ID: DAOS-2192

    Test Description: This test verifies that a dmg script can be
    submitted.

    :avocado: tags=soak2
    """
    script = None
    try:
        # build the slurm batch script that runs the dmg commands
        dmgcmds = dmg_utils.get_dmg_script("dmg1", self.params,
                                           self.basepath)
        s2_job1_name = self.params.get("name", '/run/job3/')
        s2_job1_nodes = self.params.get("nodes", '/run/job3/')
        output = os.path.join(self.tmpdir, s2_job1_name + "_results.out")
        script = slurm_utils.write_slurm_script(self.tmpdir, s2_job1_name,
                                                output, s2_job1_nodes,
                                                dmgcmds)

        job_id = slurm_utils.run_slurm_script(script)
        slurm_utils.register_for_job_results(job_id, self, maxwait=3600)

        # wait for all the jobs to finish
        while len(self.soak_results) < 1:
            time.sleep(10)

        # fail on the first job that did not end in the COMPLETED state
        for job, result in self.soak_results.iteritems():
            if result != "COMPLETED":
                self.fail("Soak job: {} didn't complete as expected: {}".
                          format(job, result))
    except (DaosApiError, ior_utils.IorFailed) as error:
        self.fail("Soak Test 2 Failed\n {}".format(error))
    finally:
        # Remove the generated script if there is one. Errors from the
        # removal are suppressed so that cleanup can never hide the
        # exception raised by the test body itself.
        if script is not None:
            try:
                os.remove(script)
            except OSError:
                pass
def job_startup(self, scripts):
    """Submit job batch script.

    Args:
        scripts (list): list of slurm batch scripts to submit to queue

    Returns:
        list: integer IDs of each job submitted to slurm.

    Raises:
        SoakTestError: if slurm fails to submit any one of the scripts

    """
    self.log.info(
        "<<Job Startup - %s >> at %s", self.test_name, time.ctime())
    job_id_list = []
    # scripts is a list of batch script files
    for script in scripts:
        try:
            job_id = slurm_utils.run_slurm_script(str(script))
        except slurm_utils.SlurmFailed as error:
            self.log.error(error)
            # Force the test to exit with failure
            job_id = None

        if not job_id:
            # one of the jobs failed to queue; exit on first fail for now.
            err_msg = "Slurm failed to submit job for {}".format(script)
            raise SoakTestError("<<FAILED: Soak {}: {}>>".format(
                self.test_name, err_msg))

        # report through the test logger like the rest of this method
        self.log.info(
            "<<Job %s started with %s >> at %s",
            job_id, script, time.ctime())
        slurm_utils.register_for_job_results(
            job_id, self, maxwait=self.test_timeout)
        # keep a list of the job_id's
        job_id_list.append(int(job_id))
    return job_id_list