def test_run_and_log_case_status_custom_msg(self):
    test_lines = [
        "00:00:00 default starting starting extra\n",
        "00:00:00 default success success extra\n",
    ]

    starting_func = mock.MagicMock(return_value="starting extra")
    success_func = mock.MagicMock(return_value="success extra")

    def normal_func():
        return "data"

    with tempfile.TemporaryDirectory() as tempdir, MockTime():
        run_and_log_case_status(
            normal_func,
            "default",
            custom_starting_msg_functor=starting_func,
            custom_success_msg_functor=success_func,
            caseroot=tempdir,
        )

        self.assertMatchAllLines(tempdir, test_lines)

    starting_func.assert_called_with()
    success_func.assert_called_with("data")
def test_run_and_log_case_status_custom_msg_error_on_batch(self):
    test_lines = [
        "00:00:00 default starting starting extra\n",
        "00:00:00 default success success extra\n",
    ]

    starting_func = mock.MagicMock(return_value="starting extra")
    success_func = mock.MagicMock(return_value="success extra")

    def error_func():
        raise Exception("Error")

    with tempfile.TemporaryDirectory() as tempdir, MockTime(), self.assertRaises(
        Exception
    ):
        run_and_log_case_status(
            error_func,
            "default",
            custom_starting_msg_functor=starting_func,
            custom_success_msg_functor=success_func,
            caseroot=tempdir,
        )

        self.assertMatchAllLines(tempdir, test_lines)

    starting_func.assert_called_with()
    success_func.assert_not_called()
def submit(self, job=None, no_batch=False, prereq=None, resubmit=False,
           skip_pnl=False, mail_user=None, mail_type=None, batch_args=None):
    if self.get_value("TEST"):
        caseroot = self.get_value("CASEROOT")
        casebaseid = self.get_value("CASEBASEID")
        # This should take care of the race condition where the submitted job
        # begins immediately and tries to set RUN phase. We proactively assume
        # a passed SUBMIT phase. If this state is already PASS, don't set it again
        # because then we'll lose RUN phase info if it's there. This info is important
        # for system_tests_common to know if it needs to reinitialize the test or not.
        with TestStatus(test_dir=caseroot, test_name=casebaseid) as ts:
            phase_status = ts.get_status(SUBMIT_PHASE)
            if phase_status != TEST_PASS_STATUS:
                ts.set_status(SUBMIT_PHASE, TEST_PASS_STATUS)

    try:
        functor = lambda: _submit(self, job=job, no_batch=no_batch, prereq=prereq,
                                  resubmit=resubmit, skip_pnl=skip_pnl,
                                  mail_user=mail_user, mail_type=mail_type,
                                  batch_args=batch_args)
        run_and_log_case_status(functor, "case.submit",
                                caseroot=self.get_value("CASEROOT"),
                                custom_success_msg_functor=verbatim_success_msg)
    except:
        # If something failed in the batch system, make sure to mark
        # the test as failed if we are running a test.
        if self.get_value("TEST"):
            with TestStatus(test_dir=caseroot, test_name=casebaseid) as ts:
                ts.set_status(SUBMIT_PHASE, TEST_FAIL_STATUS)
        raise
def case_setup(self, clean=False, test_mode=False, reset=False, keep=None):
###############################################################################
    caseroot, casebaseid = self.get_value("CASEROOT"), self.get_value("CASEBASEID")
    phase = "setup.clean" if clean else "case.setup"
    functor = lambda: _case_setup_impl(self, caseroot, clean=clean,
                                       test_mode=test_mode, reset=reset, keep=keep)

    if self.get_value("TEST") and not test_mode:
        test_name = casebaseid if casebaseid is not None else self.get_value("CASE")
        with TestStatus(test_dir=caseroot, test_name=test_name) as ts:
            try:
                run_and_log_case_status(functor, phase, caseroot=caseroot)
            except BaseException:  # Want to catch KeyboardInterrupt too
                ts.set_status(SETUP_PHASE, TEST_FAIL_STATUS)
                raise
            else:
                if clean:
                    ts.set_status(SETUP_PHASE, TEST_PEND_STATUS)
                else:
                    ts.set_status(SETUP_PHASE, TEST_PASS_STATUS)
    else:
        run_and_log_case_status(functor, phase, caseroot=caseroot)
def case_st_archive(self, last_date_str=None, archive_incomplete_logs=True,
                    copy_only=False, resubmit=True):
###############################################################################
    """
    Create archive object and perform short term archiving
    """
    caseroot = self.get_value("CASEROOT")
    self.load_env(job="case.st_archive")
    if last_date_str is not None:
        try:
            last_date = get_file_date(last_date_str)
        except ValueError:
            expect(False, 'Could not parse the last date to archive')
    else:
        last_date = None

    dout_s_root = self.get_value('DOUT_S_ROOT')
    if dout_s_root is None or dout_s_root == 'UNSET':
        expect(False,
               'XML variable DOUT_S_ROOT is required for short-term archiver')
    if not isdir(dout_s_root):
        os.makedirs(dout_s_root)

    dout_s_save_interim = self.get_value('DOUT_S_SAVE_INTERIM_RESTART_FILES')
    if dout_s_save_interim == 'FALSE' or dout_s_save_interim == 'UNSET':
        rest_n = self.get_value('REST_N')
        stop_n = self.get_value('STOP_N')
        if rest_n < stop_n:
            logger.warning('Restart files from end of run will be saved, '
                           'interim restart files will be deleted')

    logger.info("st_archive starting")

    archive = self.get_env('archive')
    functor = lambda: _archive_process(self, archive, last_date,
                                       archive_incomplete_logs, copy_only)
    run_and_log_case_status(functor, "st_archive", caseroot=caseroot)

    logger.info("st_archive completed")

    # resubmit case if appropriate
    resubmit_cnt = self.get_value("RESUBMIT")
    logger.debug("resubmit_cnt {} resubmit {}".format(resubmit_cnt, resubmit))
    if resubmit_cnt > 0 and resubmit:
        logger.info("resubmitting from st_archive, resubmit={:d}".format(resubmit_cnt))
        if self.get_value("MACH") == "mira":
            expect(os.path.isfile(".original_host"), "ERROR alcf host file not found")
            with open(".original_host", "r") as fd:
                sshhost = fd.read()
            run_cmd("ssh cooleylogin1 ssh {} '{}/case.submit {} --resubmit' "
                    .format(sshhost, caseroot, caseroot), verbose=True)
        else:
            self.submit(resubmit=True)

    return True
def test_run_and_log_case_status(self):
    test_lines = [
        "00:00:00 default starting \n",
        "00:00:00 default success \n",
    ]

    with tempfile.TemporaryDirectory() as tempdir, MockTime():
        run_and_log_case_status(self.base_func, "default", caseroot=tempdir)

        self.assertMatchAllLines(tempdir, test_lines)
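# The fixtures these tests rely on (MockTime, self.base_func, self.error_func and
# assertMatchAllLines) are not shown in this excerpt. Below is a minimal sketch of what
# they might look like, inferred from the expected output lines; the "CaseStatus" file
# name, the patched target and the helper signatures are assumptions, not the verbatim
# test-suite code.
import glob
import unittest
from unittest import mock


class MockTime:
    """Freeze time.strftime so every logged timestamp renders as 00:00:00."""

    def __enter__(self):
        self._patch = mock.patch("time.strftime", return_value="00:00:00")
        self._patch.start()
        return self

    def __exit__(self, *exc):
        self._patch.stop()


class CaseStatusTestBase(unittest.TestCase):
    @staticmethod
    def base_func():
        # Functor that succeeds and returns no extra message.
        return ""

    @staticmethod
    def error_func():
        # Functor that fails with the message the error tests expect to see logged.
        raise Exception("Something went wrong")

    def assertMatchAllLines(self, tempdir, test_lines):
        # Assumes run_and_log_case_status appends to a CaseStatus file in caseroot.
        status_file = glob.glob("{}/CaseStatus*".format(tempdir))[0]
        with open(status_file) as fd:
            lines = fd.readlines()
        for expected, actual in zip(test_lines, lines):
            self.assertEqual(expected.strip(), actual.strip())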
def submit(self, job=None, no_batch=False, prereq=None, allow_fail=False,
           resubmit=False, resubmit_immediate=False, skip_pnl=False,
           mail_user=None, mail_type=None, batch_args=None, workflow=True):
    if resubmit_immediate and self.get_value("MACH") in ['mira', 'cetus']:
        logger.warning("resubmit_immediate does not work on Mira/Cetus, submitting normally")
        resubmit_immediate = False

    caseroot = self.get_value("CASEROOT")
    if self.get_value("TEST"):
        casebaseid = self.get_value("CASEBASEID")
        # This should take care of the race condition where the submitted job
        # begins immediately and tries to set RUN phase. We proactively assume
        # a passed SUBMIT phase. If this state is already PASS, don't set it again
        # because then we'll lose RUN phase info if it's there. This info is important
        # for system_tests_common to know if it needs to reinitialize the test or not.
        with TestStatus(test_dir=caseroot, test_name=casebaseid) as ts:
            phase_status = ts.get_status(SUBMIT_PHASE)
            if phase_status != TEST_PASS_STATUS:
                ts.set_status(SUBMIT_PHASE, TEST_PASS_STATUS)

    # If this is a resubmit check the hidden file .submit_options for
    # any submit options used on the original submit and use them again
    submit_options = os.path.join(caseroot, ".submit_options")
    if resubmit and os.path.exists(submit_options):
        config = configparser.RawConfigParser()
        config.read(submit_options)
        if not skip_pnl and config.has_option('SubmitOptions', 'skip_pnl'):
            skip_pnl = config.getboolean('SubmitOptions', 'skip_pnl')
        if mail_user is None and config.has_option('SubmitOptions', 'mail_user'):
            mail_user = config.get('SubmitOptions', 'mail_user')
        if mail_type is None and config.has_option('SubmitOptions', 'mail_type'):
            mail_type = str(config.get('SubmitOptions', 'mail_type')).split(',')
        if batch_args is None and config.has_option('SubmitOptions', 'batch_args'):
            batch_args = config.get('SubmitOptions', 'batch_args')

    is_batch = self.get_value("BATCH_SYSTEM") is not None

    try:
        functor = lambda: _submit(self, job=job, no_batch=no_batch, prereq=prereq,
                                  allow_fail=allow_fail, resubmit=resubmit,
                                  resubmit_immediate=resubmit_immediate,
                                  skip_pnl=skip_pnl, mail_user=mail_user,
                                  mail_type=mail_type, batch_args=batch_args,
                                  workflow=workflow)
        run_and_log_case_status(functor, "case.submit", caseroot=caseroot,
                                custom_success_msg_functor=lambda x: x.split(":")[-1],
                                is_batch=is_batch)
    except BaseException:  # Want to catch KeyboardInterrupt too
        # If something failed in the batch system, make sure to mark
        # the test as failed if we are running a test.
        if self.get_value("TEST"):
            with TestStatus(test_dir=caseroot, test_name=casebaseid) as ts:
                ts.set_status(SUBMIT_PHASE, TEST_FAIL_STATUS)
        raise
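# The read side of ".submit_options" above implies an INI file with a [SubmitOptions]
# section holding skip_pnl, mail_user, mail_type (comma-separated) and batch_args.
# The writer is not shown in this excerpt; a minimal sketch of producing a compatible
# file (the function name and where it is called from are assumptions) could be:
import configparser
import os


def write_submit_options(caseroot, skip_pnl, mail_user, mail_type, batch_args):
    """Persist the original submit options so a later --resubmit can reuse them."""
    config = configparser.RawConfigParser()
    config.add_section("SubmitOptions")
    config.set("SubmitOptions", "skip_pnl", str(skip_pnl))
    if mail_user is not None:
        config.set("SubmitOptions", "mail_user", mail_user)
    if mail_type is not None:
        # stored comma-separated, matching the .split(',') on the read side
        config.set("SubmitOptions", "mail_type", ",".join(mail_type))
    if batch_args is not None:
        config.set("SubmitOptions", "batch_args", batch_args)
    with open(os.path.join(caseroot, ".submit_options"), "w") as fd:
        config.write(fd)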
def test_run_and_log_case_status_case_submit_no_batch(self):
    test_lines = [
        "00:00:00 case.submit starting \n",
        "00:00:00 case.submit success \n",
    ]

    with TemporaryDirectory() as tempdir, MockTime():
        run_and_log_case_status(self.base_func, "case.submit",
                                caseroot=tempdir, is_batch=False)

        self.assertMatchAllLines(tempdir, test_lines)
def case_st_archive(self, last_date_str=None, archive_incomplete_logs=True,
                    copy_only=False, resubmit=True):
###############################################################################
    """
    Create archive object and perform short term archiving
    """
    caseroot = self.get_value("CASEROOT")
    self.load_env(job="case.st_archive")
    if last_date_str is not None:
        try:
            last_date = get_file_date(last_date_str)
        except ValueError:
            expect(False, 'Could not parse the last date to archive')
    else:
        last_date = None

    dout_s_root = self.get_value('DOUT_S_ROOT')
    if dout_s_root is None or dout_s_root == 'UNSET':
        expect(False,
               'XML variable DOUT_S_ROOT is required for short-term archiver')
    if not isdir(dout_s_root):
        os.makedirs(dout_s_root)

    dout_s_save_interim = self.get_value('DOUT_S_SAVE_INTERIM_RESTART_FILES')
    if dout_s_save_interim == 'FALSE' or dout_s_save_interim == 'UNSET':
        rest_n = self.get_value('REST_N')
        stop_n = self.get_value('STOP_N')
        if rest_n < stop_n:
            logger.warning('Restart files from end of run will be saved, '
                           'interim restart files will be deleted')

    logger.info("st_archive starting")

    archive = self.get_env('archive')
    functor = lambda: _archive_process(self, archive, last_date,
                                       archive_incomplete_logs, copy_only)
    run_and_log_case_status(functor, "st_archive", caseroot=caseroot)

    logger.info("st_archive completed")

    # resubmit case if appropriate
    resubmit_cnt = self.get_value("RESUBMIT")
    logger.debug("resubmit_cnt {} resubmit {}".format(resubmit_cnt, resubmit))
    if resubmit_cnt > 0 and resubmit:
        logger.info("resubmitting from st_archive, resubmit={:d}".format(resubmit_cnt))
        if self.get_value("MACH") == "mira":
            expect(os.path.isfile(".original_host"), "ERROR alcf host file not found")
            with open(".original_host", "r") as fd:
                sshhost = fd.read()
            run_cmd("ssh cooleylogin1 ssh {} '{case}/case.submit {case} --resubmit' "
                    .format(sshhost, case=caseroot), verbose=True)
        else:
            self.submit(resubmit=True)

    return True
def test_run_and_log_case_status_error(self):
    test_lines = [
        "00:00:00 default starting \n",
        "00:00:00 default error \n",
        "Something went wrong\n",
    ]

    with tempfile.TemporaryDirectory() as tempdir, MockTime():
        with self.assertRaises(Exception):
            run_and_log_case_status(self.error_func, "default", caseroot=tempdir)

        self.assertMatchAllLines(tempdir, test_lines)
def submit(self, job=None, no_batch=False, prereq=None, allow_fail=False,
           resubmit=False, resubmit_immediate=False, skip_pnl=False,
           mail_user=None, mail_type=None, batch_args=None):
    if resubmit_immediate and self.get_value("MACH") in ['mira', 'cetus']:
        logger.warning("resubmit_immediate does not work on Mira/Cetus, submitting normally")
        resubmit_immediate = False

    if self.get_value("TEST"):
        caseroot = self.get_value("CASEROOT")
        casebaseid = self.get_value("CASEBASEID")
        # This should take care of the race condition where the submitted job
        # begins immediately and tries to set RUN phase. We proactively assume
        # a passed SUBMIT phase. If this state is already PASS, don't set it again
        # because then we'll lose RUN phase info if it's there. This info is important
        # for system_tests_common to know if it needs to reinitialize the test or not.
        with TestStatus(test_dir=caseroot, test_name=casebaseid) as ts:
            phase_status = ts.get_status(SUBMIT_PHASE)
            if phase_status != TEST_PASS_STATUS:
                ts.set_status(SUBMIT_PHASE, TEST_PASS_STATUS)

    # If this is a resubmit check the hidden file .submit_options for
    # any submit options used on the original submit and use them again
    caseroot = self.get_value("CASEROOT")
    submit_options = os.path.join(caseroot, ".submit_options")
    if resubmit and os.path.exists(submit_options):
        config = configparser.SafeConfigParser()
        config.read(submit_options)
        if not skip_pnl and config.has_option('SubmitOptions', 'skip_pnl'):
            skip_pnl = config.getboolean('SubmitOptions', 'skip_pnl')
        if mail_user is None and config.has_option('SubmitOptions', 'mail_user'):
            mail_user = config.get('SubmitOptions', 'mail_user')
        if mail_type is None and config.has_option('SubmitOptions', 'mail_type'):
            mail_type = str(config.get('SubmitOptions', 'mail_type')).split(',')
        if batch_args is None and config.has_option('SubmitOptions', 'batch_args'):
            batch_args = config.get('SubmitOptions', 'batch_args')

    try:
        functor = lambda: _submit(self, job=job, no_batch=no_batch, prereq=prereq,
                                  allow_fail=allow_fail, resubmit=resubmit,
                                  resubmit_immediate=resubmit_immediate,
                                  skip_pnl=skip_pnl, mail_user=mail_user,
                                  mail_type=mail_type, batch_args=batch_args)
        run_and_log_case_status(functor, "case.submit", caseroot=caseroot,
                                custom_success_msg_functor=verbatim_success_msg)
    except:
        # If something failed in the batch system, make sure to mark
        # the test as failed if we are running a test.
        if self.get_value("TEST"):
            with TestStatus(test_dir=caseroot, test_name=casebaseid) as ts:
                ts.set_status(SUBMIT_PHASE, TEST_FAIL_STATUS)
        raise
def test_run_and_log_case_status_case_submit_error_on_batch(self):
    test_lines = [
        "00:00:00 case.submit starting \n",
        "00:00:00 case.submit error \n",
        "Something went wrong\n",
    ]

    with TemporaryDirectory() as tempdir, MockTime():
        with self.assertRaises(Exception):
            run_and_log_case_status(self.error_func, "case.submit",
                                    caseroot=tempdir, is_batch=True)

        self.assertMatchAllLines(tempdir, test_lines)
def test_run_and_log_case_status_custom_msg(self):
    test_lines = [
        "00:00:00 default starting starting extra\n",
        "00:00:00 default success success extra\n",
    ]

    starting_func = lambda *args: "starting extra"
    success_func = lambda *args: "success extra"

    with TemporaryDirectory() as tempdir, MockTime():
        run_and_log_case_status(self.base_func, "default",
                                custom_starting_msg_functor=starting_func,
                                custom_success_msg_functor=success_func,
                                caseroot=tempdir)

        self.assertMatchAllLines(tempdir, test_lines)
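# For reference, the behaviour these tests pin down can be summarised by a simplified
# sketch of run_and_log_case_status. This is NOT the CIME implementation (which lives
# in CIME's utilities and appends to the case's CaseStatus file with extra locking and
# batch-aware messaging); the file name and the _append_status helper are assumptions.
import os
import time


def run_and_log_case_status_sketch(func, phase, caseroot=".",
                                   custom_starting_msg_functor=None,
                                   custom_success_msg_functor=None,
                                   is_batch=False):
    # is_batch changes the messaging in the real implementation; not modelled here.
    def _append_status(status, msg=""):
        line = "{} {} {} {}\n".format(time.strftime("%H:%M:%S"), phase, status, msg)
        with open(os.path.join(caseroot, "CaseStatus"), "a") as fd:
            fd.write(line)

    starting_msg = (custom_starting_msg_functor()
                    if custom_starting_msg_functor is not None else "")
    _append_status("starting", starting_msg)
    try:
        rv = func()
    except Exception as exc:
        # On failure the error status and the exception text are recorded, then re-raised.
        _append_status("error")
        with open(os.path.join(caseroot, "CaseStatus"), "a") as fd:
            fd.write("{}\n".format(exc))
        raise
    success_msg = (custom_success_msg_functor(rv)
                   if custom_success_msg_functor is not None else "")
    _append_status("success", success_msg)
    # The functor's return value is passed through, which is why the case.build,
    # case.run and clean wrappers below can simply "return run_and_log_case_status(...)".
    return rv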
def submit(self, job=None, no_batch=False, prereq=None, resubmit=False,
           skip_pnl=False, mail_user=None, mail_type=None, batch_args=None):
    if self.get_value("TEST"):
        caseroot = self.get_value("CASEROOT")
        casebaseid = self.get_value("CASEBASEID")
        # This should take care of the race condition where the submitted job
        # begins immediately and tries to set RUN phase. We proactively assume
        # a passed SUBMIT phase. If this state is already PASS, don't set it again
        # because then we'll lose RUN phase info if it's there. This info is important
        # for system_tests_common to know if it needs to reinitialize the test or not.
        with TestStatus(test_dir=caseroot, test_name=casebaseid) as ts:
            phase_status = ts.get_status(SUBMIT_PHASE)
            if phase_status != TEST_PASS_STATUS:
                ts.set_status(SUBMIT_PHASE, TEST_PASS_STATUS)

    try:
        functor = lambda: _submit(self, job=job, no_batch=no_batch, prereq=prereq,
                                  resubmit=resubmit, skip_pnl=skip_pnl,
                                  mail_user=mail_user, mail_type=mail_type,
                                  batch_args=batch_args)
        run_and_log_case_status(
            functor, "case.submit",
            caseroot=self.get_value("CASEROOT"),
            custom_success_msg_functor=verbatim_success_msg)
    except:
        # If something failed in the batch system, make sure to mark
        # the test as failed if we are running a test.
        if self.get_value("TEST"):
            with TestStatus(test_dir=caseroot, test_name=casebaseid) as ts:
                ts.set_status(SUBMIT_PHASE, TEST_FAIL_STATUS)
        raise
def case_setup(self, clean=False, test_mode=False, reset=False):
###############################################################################
    caseroot, casebaseid = self.get_value("CASEROOT"), self.get_value("CASEBASEID")
    phase = "setup.clean" if clean else "case.setup"
    functor = lambda: _case_setup_impl(self, caseroot, clean, test_mode, reset)

    if self.get_value("TEST") and not test_mode:
        test_name = casebaseid if casebaseid is not None else self.get_value("CASE")
        with TestStatus(test_dir=caseroot, test_name=test_name) as ts:
            try:
                run_and_log_case_status(functor, phase, caseroot=caseroot)
            except:
                ts.set_status(SETUP_PHASE, TEST_FAIL_STATUS)
                raise
            else:
                if clean:
                    ts.set_status(SETUP_PHASE, TEST_PEND_STATUS)
                else:
                    ts.set_status(SETUP_PHASE, TEST_PASS_STATUS)
    else:
        run_and_log_case_status(functor, phase, caseroot=caseroot)
def case_setup(self, clean=False, test_mode=False, reset=False, keep=None):
###############################################################################
    caseroot, casebaseid = self.get_value("CASEROOT"), self.get_value("CASEBASEID")
    phase = "setup.clean" if clean else "case.setup"
    functor = lambda: _case_setup_impl(self, caseroot, clean=clean,
                                       test_mode=test_mode, reset=reset, keep=keep)

    is_batch = self.get_value("BATCH_SYSTEM") is not None
    msg_func = None

    if is_batch:
        jobid = batch_jobid()
        msg_func = lambda *args: jobid if jobid is not None else ""

    if self.get_value("TEST") and not test_mode:
        test_name = casebaseid if casebaseid is not None else self.get_value("CASE")
        with TestStatus(test_dir=caseroot, test_name=test_name) as ts:
            try:
                run_and_log_case_status(functor, phase,
                                        custom_starting_msg_functor=msg_func,
                                        custom_success_msg_functor=msg_func,
                                        caseroot=caseroot,
                                        is_batch=is_batch)
            except BaseException:  # Want to catch KeyboardInterrupt too
                ts.set_status(SETUP_PHASE, TEST_FAIL_STATUS)
                raise
            else:
                if clean:
                    ts.set_status(SETUP_PHASE, TEST_PEND_STATUS)
                else:
                    ts.set_status(SETUP_PHASE, TEST_PASS_STATUS)
    else:
        run_and_log_case_status(functor, phase,
                                custom_starting_msg_functor=msg_func,
                                custom_success_msg_functor=msg_func,
                                caseroot=caseroot,
                                is_batch=is_batch)
def case_build(caseroot, case, sharedlib_only=False, model_only=False,
               buildlist=None, save_build_provenance=True):
###############################################################################
    functor = lambda: _case_build_impl(caseroot, case, sharedlib_only, model_only,
                                       buildlist, save_build_provenance)
    return run_and_log_case_status(functor, "case.build", caseroot=caseroot)
    model_log("e3sm", logger,
              "{} SAVE_PRERUN_PROVENANCE BEGINS HERE".format(
                  time.strftime("%Y-%m-%d %H:%M:%S")))
    save_prerun_provenance(case)
    model_log("e3sm", logger,
              "{} SAVE_PRERUN_PROVENANCE HAS FINISHED".format(
                  time.strftime("%Y-%m-%d %H:%M:%S")))

    model_log("e3sm", logger,
              "{} MODEL EXECUTION BEGINS HERE".format(
                  time.strftime("%Y-%m-%d %H:%M:%S")))
    run_func = lambda: run_cmd_no_fail(cmd, from_dir=rundir)
    case.flush()

    try:
        run_and_log_case_status(run_func, "model execution",
                                caseroot=case.get_value("CASEROOT"))
        cmd_success = True
    except CIMEError:
        cmd_success = False

    # The run will potentially take a very long time. We need to
    # allow the user to xmlchange things in their case.
    #
    # WARNING: All case variables are reloaded after this call to get the
    #          new values of any variables that may have been changed by
    #          the user during model execution. Thus, any local variables
    #          set from case variables before this point may be
    #          inconsistent with their latest values in the xml files, so
    #          should generally be reloaded (via case.get_value(XXX)) if they
    #          are still needed.
    case.read_xml()
        os.makedirs(dout_s_root)

    dout_s_save_interim = case.get_value('DOUT_S_SAVE_INTERIM_RESTART_FILES')
    if dout_s_save_interim == 'FALSE' or dout_s_save_interim == 'UNSET':
        rest_n = case.get_value('REST_N')
        stop_n = case.get_value('STOP_N')
        if rest_n < stop_n:
            logger.warning('Restart files from end of run will be saved, '
                           'interim restart files will be deleted')

    logger.info("st_archive starting")

    archive = case.get_env('archive')
    functor = lambda: _archive_process(case, archive, last_date,
                                       archive_incomplete_logs, copy_only)
    run_and_log_case_status(functor, "st_archive", caseroot=caseroot)

    logger.info("st_archive completed")

    # resubmit case if appropriate
    resubmit = case.get_value("RESUBMIT")
    if resubmit > 0 and not no_resubmit:
        logger.info("resubmitting from st_archive, resubmit={:d}".format(resubmit))
        if case.get_value("MACH") == "mira":
            expect(os.path.isfile(".original_host"), "ERROR alcf host file not found")
            with open(".original_host", "r") as fd:
                sshhost = fd.read()
            run_cmd("ssh cooleylogin1 ssh {} '{}/case.submit {} --resubmit' "
                    .format(sshhost, caseroot, caseroot), verbose=True)
            phase_status = ts.get_status(SUBMIT_PHASE)
            if phase_status != TEST_PASS_STATUS:
                ts.set_status(SUBMIT_PHASE, TEST_PASS_STATUS)

    try:
        functor = lambda: _submit(case, job=job, no_batch=no_batch, prereq=prereq,
                                  resubmit=resubmit, skip_pnl=skip_pnl,
                                  mail_user=mail_user, mail_type=mail_type,
                                  batch_args=batch_args)
        run_and_log_case_status(
            functor, "case.submit",
            caseroot=case.get_value("CASEROOT"),
            custom_success_msg_functor=verbatim_success_msg)
    except:
        # If something failed in the batch system, make sure to mark
        # the test as failed if we are running a test.
        if case.get_value("TEST"):
            with TestStatus(test_dir=caseroot, test_name=casebaseid) as ts:
                ts.set_status(SUBMIT_PHASE, TEST_FAIL_STATUS)
        raise


def check_case(case):
    check_lockedfiles(case)
    create_namelists(case)  # Must be called before check_all_input_data
            time.strftime("%Y-%m-%d %H:%M:%S")),
    )
    model_log(
        "e3sm",
        logger,
        "{} MODEL EXECUTION BEGINS HERE".format(
            time.strftime("%Y-%m-%d %H:%M:%S")),
    )
    run_func = lambda: run_cmd_no_fail(cmd, from_dir=rundir)
    case.flush()

    try:
        run_and_log_case_status(
            run_func,
            "model execution",
            custom_starting_msg_functor=msg_func,
            custom_success_msg_functor=msg_func,
            caseroot=case.get_value("CASEROOT"),
            is_batch=is_batch,
        )
        cmd_success = True
    except CIMEError:
        cmd_success = False

    # The run will potentially take a very long time. We need to
    # allow the user to xmlchange things in their case.
    #
    # WARNING: All case variables are reloaded after this call to get the
    #          new values of any variables that may have been changed by
    #          the user during model execution. Thus, any local variables
    #          set from case variables before this point may be
    #          inconsistent with their latest values in the xml files, so
    #          should generally be reloaded (via case.get_value(XXX)) if they
    #          are still needed.
def case_setup(self, clean=False, test_mode=False, reset=False, keep=None):
###############################################################################
    caseroot, casebaseid = self.get_value("CASEROOT"), self.get_value("CASEBASEID")
    phase = "setup.clean" if clean else "case.setup"
    functor = lambda: _case_setup_impl(self, caseroot, clean=clean,
                                       test_mode=test_mode, reset=reset, keep=keep)

    is_batch = self.get_value("BATCH_SYSTEM") is not None
    msg_func = None

    if is_batch:
        jobid = batch_jobid()
        msg_func = lambda *args: jobid if jobid is not None else ""

    if self.get_value("TEST") and not test_mode:
        test_name = casebaseid if casebaseid is not None else self.get_value("CASE")
        with TestStatus(test_dir=caseroot, test_name=test_name) as ts:
            try:
                run_and_log_case_status(
                    functor,
                    phase,
                    custom_starting_msg_functor=msg_func,
                    custom_success_msg_functor=msg_func,
                    caseroot=caseroot,
                    is_batch=is_batch,
                )
            except BaseException:  # Want to catch KeyboardInterrupt too
                ts.set_status(SETUP_PHASE, TEST_FAIL_STATUS)
                raise
            else:
                if clean:
                    ts.set_status(SETUP_PHASE, TEST_PEND_STATUS)
                else:
                    ts.set_status(SETUP_PHASE, TEST_PASS_STATUS)
    else:
        run_and_log_case_status(
            functor,
            phase,
            custom_starting_msg_functor=msg_func,
            custom_success_msg_functor=msg_func,
            caseroot=caseroot,
            is_batch=is_batch,
        )

    # put the following section here to make sure the rundir is generated first
    machdir = self.get_value("MACHDIR")
    mach = self.get_value("MACH")
    ngpus_per_node = self.get_value("NGPUS_PER_NODE")
    overrides = {}
    overrides["ngpus_per_node"] = ngpus_per_node
    input_template = os.path.join(machdir, "mpi_run_gpu.{}".format(mach))
    if os.path.isfile(input_template):
        # update the wrapper script that sets the device id for each MPI rank
        output_text = transform_vars(open(input_template, "r").read(),
                                     case=self, overrides=overrides)

        # write it out to the run dir
        rundir = self.get_value("RUNDIR")
        output_name = os.path.join(rundir, "set_device_rank.sh")
        logger.info("Creating file {}".format(output_name))
        with open(output_name, "w") as f:
            f.write(output_text)

        # make the wrapper script executable
        if os.path.isfile(output_name):
            os.system("chmod +x " + output_name)
        else:
            expect(False,
                   "The file {} is not written out correctly.".format(output_name))
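# batch_jobid() above is expected to return the id of the batch job this process is
# running inside, or None when there is none, so that CaseStatus lines can be tagged
# with the scheduler job id. A minimal stand-in (an assumption, not CIME's
# implementation) could consult the environment variables common schedulers set:
import os


def batch_jobid_sketch(environ=os.environ):
    """Return the current scheduler job id, or None if not running under a scheduler."""
    for var in ("SLURM_JOB_ID", "PBS_JOBID", "LSB_JOBID", "COBALT_JOBID"):
        jobid = environ.get(var)
        if jobid:
            return jobid
    return None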
def case_st_archive(
    self,
    last_date_str=None,
    archive_incomplete_logs=True,
    copy_only=False,
    resubmit=True,
):
###############################################################################
    """
    Create archive object and perform short term archiving
    """
    logger.debug("resubmit {}".format(resubmit))
    caseroot = self.get_value("CASEROOT")
    self.load_env(job="case.st_archive")
    if last_date_str is not None:
        try:
            last_date = get_file_date(last_date_str)
        except ValueError:
            expect(False, "Could not parse the last date to archive")
    else:
        last_date = None

    dout_s_root = self.get_value("DOUT_S_ROOT")
    if dout_s_root is None or dout_s_root == "UNSET":
        expect(False,
               "XML variable DOUT_S_ROOT is required for short-term archiver")
    if not isdir(dout_s_root):
        os.makedirs(dout_s_root)

    dout_s_save_interim = self.get_value("DOUT_S_SAVE_INTERIM_RESTART_FILES")
    if dout_s_save_interim == "FALSE" or dout_s_save_interim == "UNSET":
        rest_n = self.get_value("REST_N")
        stop_n = self.get_value("STOP_N")
        if rest_n < stop_n:
            logger.warning("Restart files from end of run will be saved, "
                           "interim restart files will be deleted")

    logger.info("st_archive starting")

    is_batch = self.get_value("BATCH_SYSTEM")
    msg_func = None

    if is_batch:
        jobid = batch_jobid()
        msg_func = lambda *args: jobid if jobid is not None else ""

    archive = self.get_env("archive")
    functor = lambda: _archive_process(self, archive, last_date,
                                       archive_incomplete_logs, copy_only)
    run_and_log_case_status(
        functor,
        "st_archive",
        custom_starting_msg_functor=msg_func,
        custom_success_msg_functor=msg_func,
        caseroot=caseroot,
        is_batch=is_batch,
    )

    logger.info("st_archive completed")

    # resubmit case if appropriate
    if not self.get_value("EXTERNAL_WORKFLOW") and resubmit:
        resubmit_cnt = self.get_value("RESUBMIT")
        logger.debug("resubmit_cnt {} resubmit {}".format(resubmit_cnt, resubmit))
        if resubmit_cnt > 0:
            logger.info("resubmitting from st_archive, resubmit={:d}".format(resubmit_cnt))
            if self.get_value("MACH") == "mira":
                expect(os.path.isfile(".original_host"),
                       "ERROR alcf host file not found")
                with open(".original_host", "r") as fd:
                    sshhost = fd.read()
                run_cmd(
                    "ssh cooleylogin1 ssh {} '{case}/case.submit {case} --resubmit' "
                    .format(sshhost, case=caseroot),
                    verbose=True,
                )
            else:
                self.submit(resubmit=True)

    return True
def case_lt_archive(case):
###############################################################################
    functor = lambda: _case_lt_archive_impl(case)
    return run_and_log_case_status(functor, "lt_archiving",
                                   caseroot=case.get_value("CASEROOT"))
def clean(case, cleanlist=None, clean_all=False, clean_depends=None):
###############################################################################
    functor = lambda: _clean_impl(case, cleanlist, clean_all, clean_depends)
    return run_and_log_case_status(functor, "build.clean",
                                   caseroot=case.get_value("CASEROOT"))
def _run_model_impl(case, lid, skip_pnl=False, da_cycle=0):
###############################################################################
    logger.debug("{} PRE_RUN_CHECK BEGINS HERE".format(time.strftime("%Y-%m-%d %H:%M:%S")))
    _pre_run_check(case, lid, skip_pnl=skip_pnl, da_cycle=da_cycle)
    logger.debug("{} PRE_RUN_CHECK HAS FINISHED".format(time.strftime("%Y-%m-%d %H:%M:%S")))

    model = case.get_value("MODEL")

    # Set OMP_NUM_THREADS
    os.environ["OMP_NUM_THREADS"] = str(case.thread_count)

    # Run the model
    cmd = case.get_mpirun_cmd(allow_unresolved_envvars=False)
    logger.info("run command is {} ".format(cmd))

    rundir = case.get_value("RUNDIR")
    loop = True

    # MPIRUN_RETRY_REGEX allows the mpi command to be reattempted if the
    # failure described by that regular expression is matched in the model log
    # case.spare_nodes is overloaded and may also represent the number of
    # retries to attempt if ALLOCATE_SPARE_NODES is False
    retry_run_re = case.get_value("MPIRUN_RETRY_REGEX")
    node_fail_re = case.get_value("NODE_FAIL_REGEX")
    retry_count = 0
    if retry_run_re:
        retry_run_regex = re.compile(re.escape(retry_run_re))
        retry_count = case.get_value("MPIRUN_RETRY_COUNT")
    if node_fail_re:
        node_fail_regex = re.compile(re.escape(node_fail_re))

    while loop:
        loop = False

        logger.debug("{} SAVE_PRERUN_PROVENANCE BEGINS HERE".format(time.strftime("%Y-%m-%d %H:%M:%S")))
        save_prerun_provenance(case)
        logger.debug("{} SAVE_PRERUN_PROVENANCE HAS FINISHED".format(time.strftime("%Y-%m-%d %H:%M:%S")))

        logger.debug("{} MODEL EXECUTION BEGINS HERE".format(time.strftime("%Y-%m-%d %H:%M:%S")))
        run_func = lambda: run_cmd(cmd, from_dir=rundir)[0]
        stat = run_and_log_case_status(run_func, "model execution",
                                       caseroot=case.get_value("CASEROOT"))
        logger.debug("{} MODEL EXECUTION HAS FINISHED".format(time.strftime("%Y-%m-%d %H:%M:%S")))

        model_logfile = os.path.join(rundir, model + ".log." + lid)
        # Determine if failure was due to a failed node, if so, try to restart
        if retry_run_re or node_fail_re:
            model_logfile = os.path.join(rundir, model + ".log." + lid)
            if os.path.exists(model_logfile):
                num_node_fails = 0
                num_retry_fails = 0
                if node_fail_re:
                    num_node_fails = len(node_fail_regex.findall(open(model_logfile, 'r').read()))
                if retry_run_re:
                    num_retry_fails = len(retry_run_regex.findall(open(model_logfile, 'r').read()))
                logger.debug("RETRY: num_retry_fails {} spare_nodes {} retry_count {}".format(
                    num_retry_fails, case.spare_nodes, retry_count))
                if num_node_fails > 0 and case.spare_nodes >= num_node_fails:
                    # We failed due to node failure!
                    logger.warning("Detected model run failed due to node failure, restarting")
                    case.spare_nodes -= num_node_fails
                    loop = True
                    case.set_value("CONTINUE_RUN", case.get_value("RESUBMIT_SETS_CONTINUE_RUN"))
                elif num_retry_fails > 0 and retry_count >= num_retry_fails:
                    logger.warning("Detected model run failed, restarting")
                    retry_count -= 1
                    loop = True

                if loop:
                    # Archive the last consistent set of restart files and restore them
                    if case.get_value("DOUT_S"):
                        case.case_st_archive(resubmit=False)
                        case.restore_from_archive()

                    lid = new_lid()
                    case.create_namelists()

        if stat != 0 and not loop:
            # We failed and we're not restarting
            expect(False, "RUN FAIL: Command '{}' failed\nSee log file for details: {}".format(cmd, model_logfile))

    logger.debug("{} POST_RUN_CHECK BEGINS HERE".format(time.strftime("%Y-%m-%d %H:%M:%S")))
    _post_run_check(case, lid)
    logger.debug("{} POST_RUN_CHECK HAS FINISHED".format(time.strftime("%Y-%m-%d %H:%M:%S")))

    return lid
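# The retry logic above treats MPIRUN_RETRY_REGEX as a literal string (note the
# re.escape) and simply counts how often it appears in the model log, retrying while
# MPIRUN_RETRY_COUNT is at least that count. A small standalone illustration, using a
# made-up log excerpt and made-up values for the two XML variables:
import re

log_text = (
    "step 100 ok\n"
    "MPT ERROR: connection dropped\n"
    "step 101 ok\n"
)
retry_run_re = "MPT ERROR"                                 # hypothetical MPIRUN_RETRY_REGEX
retry_run_regex = re.compile(re.escape(retry_run_re))      # matched literally, not as a pattern
num_retry_fails = len(retry_run_regex.findall(log_text))   # -> 1
retry_count = 1                                            # hypothetical MPIRUN_RETRY_COUNT
should_retry = num_retry_fails > 0 and retry_count >= num_retry_fails   # -> True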
    env_module = case.get_env("mach_specific")
    env_module.make_env_mach_specific_file(compiler, debug, mpilib, "sh")
    env_module.make_env_mach_specific_file(compiler, debug, mpilib, "csh")
    env_module.save_all_env_info("software_environment.txt")

###############################################################################
def case_setup(case, clean=False, test_mode=False, reset=False):
###############################################################################
    caseroot, casebaseid = case.get_value("CASEROOT"), case.get_value("CASEBASEID")
    phase = "setup.clean" if clean else "case.setup"
    functor = lambda: _case_setup_impl(case, caseroot, clean, test_mode, reset)

    if case.get_value("TEST") and not test_mode:
        test_name = casebaseid if casebaseid is not None else case.get_value("CASE")
        with TestStatus(test_dir=caseroot, test_name=test_name) as ts:
            try:
                run_and_log_case_status(functor, phase, caseroot=caseroot)
            except:
                ts.set_status(SETUP_PHASE, TEST_FAIL_STATUS)
                raise
            else:
                if clean:
                    ts.set_status(SETUP_PHASE, TEST_PEND_STATUS)
                else:
                    ts.set_status(SETUP_PHASE, TEST_PASS_STATUS)
    else:
        run_and_log_case_status(functor, phase, caseroot=caseroot)
            if not loop:
                # We failed and we're not restarting
                expect(False, "RUN FAIL: Command '{}' failed\nSee log file for details: {}".format(cmd, model_logfile))

    logger.info("{} MODEL EXECUTION HAS FINISHED".format(time.strftime("%Y-%m-%d %H:%M:%S")))

    post_run_check(case, lid)

    return lid

###############################################################################
def run_model(case, lid, skip_pnl=False):
###############################################################################
    functor = lambda: _run_model_impl(case, lid, skip_pnl=skip_pnl)
    return run_and_log_case_status(functor, "case.run",
                                   caseroot=case.get_value("CASEROOT"))

###############################################################################
def post_run_check(case, lid):
###############################################################################
    rundir = case.get_value("RUNDIR")
    model = case.get_value("MODEL")

    # find the last model.log and cpl.log
    model_logfile = os.path.join(rundir, model + ".log." + lid)
    cpl_logfile = os.path.join(rundir, "cpl" + ".log." + lid)

    if not os.path.isfile(model_logfile):
        expect(False, "Model did not complete, no {} log file ".format(model_logfile))
    elif not os.path.isfile(cpl_logfile):
def _run_model_impl(case, lid, skip_pnl=False, da_cycle=0):
###############################################################################
    pre_run_check(case, lid, skip_pnl=skip_pnl, da_cycle=da_cycle)

    model = case.get_value("MODEL")

    # Set OMP_NUM_THREADS
    env_mach_pes = case.get_env("mach_pes")
    comp_classes = case.get_values("COMP_CLASSES")
    thread_count = env_mach_pes.get_max_thread_count(comp_classes)
    os.environ["OMP_NUM_THREADS"] = str(thread_count)

    # Run the model
    logger.info("{} MODEL EXECUTION BEGINS HERE".format(time.strftime("%Y-%m-%d %H:%M:%S")))

    cmd = case.get_mpirun_cmd(job="case.run")
    cmd = case.get_resolved_value(cmd)
    logger.info("run command is {} ".format(cmd))

    rundir = case.get_value("RUNDIR")
    loop = True

    while loop:
        loop = False
        save_prerun_provenance(case)
        run_func = lambda: run_cmd(cmd, from_dir=rundir)[0]
        stat = run_and_log_case_status(run_func, "model execution",
                                       caseroot=case.get_value("CASEROOT"))
        model_logfile = os.path.join(rundir, model + ".log." + lid)
        # Determine if failure was due to a failed node, if so, try to restart
        if stat != 0:
            node_fail_re = case.get_value("NODE_FAIL_REGEX")
            if node_fail_re:
                node_fail_regex = re.compile(node_fail_re)
                model_logfile = os.path.join(rundir, model + ".log." + lid)
                if os.path.exists(model_logfile):
                    num_fails = len(node_fail_regex.findall(open(model_logfile, 'r').read()))
                    if num_fails > 0 and case.spare_nodes >= num_fails:
                        # We failed due to node failure!
                        logger.warning("Detected model run failed due to node failure, restarting")

                        # Archive the last consistent set of restart files and restore them
                        case_st_archive(case, no_resubmit=True)
                        restore_from_archive(case)

                        case.set_value("CONTINUE_RUN", case.get_value("RESUBMIT_SETS_CONTINUE_RUN"))
                        create_namelists(case)

                        lid = new_lid()
                        loop = True
                        case.spare_nodes -= num_fails

            if not loop:
                # We failed and we're not restarting
                expect(False, "RUN FAIL: Command '{}' failed\nSee log file for details: {}".format(cmd, model_logfile))

    logger.info("{} MODEL EXECUTION HAS FINISHED".format(time.strftime("%Y-%m-%d %H:%M:%S")))

    post_run_check(case, lid)

    return lid
    lock_file("env_build.xml", caseroot=case.get_value("CASEROOT"))

###############################################################################
def case_build(caseroot, case, sharedlib_only=False, model_only=False,
               buildlist=None, save_build_provenance=True):
###############################################################################
    functor = lambda: _case_build_impl(caseroot, case, sharedlib_only, model_only,
                                       buildlist, save_build_provenance)

    cb = "case.build"
    if (sharedlib_only == True):
        cb = cb + " (SHAREDLIB_BUILD)"
    if (model_only == True):
        cb = cb + " (MODEL_BUILD)"

    return run_and_log_case_status(functor, cb, caseroot=caseroot)

###############################################################################
def clean(case, cleanlist=None, clean_all=False, clean_depends=None):
###############################################################################
    functor = lambda: _clean_impl(case, cleanlist, clean_all, clean_depends)
    return run_and_log_case_status(functor, "build.clean",
                                   caseroot=case.get_value("CASEROOT"))
def _run_model(case, lid, skip_pnl=False, da_cycle=0):
###############################################################################
    functor = lambda: _run_model_impl(case, lid, skip_pnl=skip_pnl, da_cycle=da_cycle)
    return run_and_log_case_status(functor, "case.run",
                                   caseroot=case.get_value("CASEROOT"))