示例#1
0
    def test_run_and_log_case_status_custom_msg(self):
        """Custom starting/success functor return values must appear in the log."""
        expected = [
            "00:00:00 default starting starting extra\n",
            "00:00:00 default success success extra\n",
        ]

        msg_starting = mock.MagicMock(return_value="starting extra")
        msg_success = mock.MagicMock(return_value="success extra")

        def payload_func():
            return "data"

        with tempfile.TemporaryDirectory() as tempdir, MockTime():
            run_and_log_case_status(
                payload_func,
                "default",
                custom_starting_msg_functor=msg_starting,
                custom_success_msg_functor=msg_success,
                caseroot=tempdir,
            )

            self.assertMatchAllLines(tempdir, expected)

        # starting functor is called with no args; success functor receives
        # the functor's return value
        msg_starting.assert_called_with()
        msg_success.assert_called_with("data")
示例#2
0
    def test_run_and_log_case_status_custom_msg_error_on_batch(self):
        """A raising functor must log an error (never success), and the
        success functor must not be called.

        Fixes two defects in the original test: the expected lines claimed a
        "success" entry even though ``error_func`` raises, and
        ``assertMatchAllLines`` was placed inside the ``assertRaises`` block
        *after* the raising call, so it could never execute.
        """
        # NOTE(review): error-line format mirrors
        # test_run_and_log_case_status_error ("... error \n" + exception
        # message) — confirm against run_and_log_case_status output.
        test_lines = [
            "00:00:00 default starting starting extra\n",
            "00:00:00 default error \n",
            "Error\n",
        ]

        starting_func = mock.MagicMock(return_value="starting extra")
        success_func = mock.MagicMock(return_value="success extra")

        def error_func():
            raise Exception("Error")

        with tempfile.TemporaryDirectory() as tempdir, MockTime():
            # Close the assertRaises context before inspecting the log so the
            # assertion below is actually reachable.
            with self.assertRaises(Exception):
                run_and_log_case_status(
                    error_func,
                    "default",
                    custom_starting_msg_functor=starting_func,
                    custom_success_msg_functor=success_func,
                    caseroot=tempdir,
                )

            self.assertMatchAllLines(tempdir, test_lines)

        starting_func.assert_called_with()
        success_func.assert_not_called()
示例#3
0
def submit(self, job=None, no_batch=False, prereq=None, resubmit=False,
           skip_pnl=False, mail_user=None, mail_type=None, batch_args=None):
    """Submit the case's jobs to the batch system.

    For a test case, the SUBMIT phase is proactively marked PASS before
    submission (to avoid racing the submitted job's RUN-phase update) and is
    reset to FAIL if submission raises.
    """
    if self.get_value("TEST"):
        caseroot = self.get_value("CASEROOT")
        casebaseid = self.get_value("CASEBASEID")
        # This should take care of the race condition where the submitted job
        # begins immediately and tries to set RUN phase. We proactively assume
        # a passed SUBMIT phase. If this state is already PASS, don't set it again
        # because then we'll lose RUN phase info if it's there. This info is important
        # for system_tests_common to know if it needs to reinitialize the test or not.
        with TestStatus(test_dir=caseroot, test_name=casebaseid) as ts:
            phase_status = ts.get_status(SUBMIT_PHASE)
            if phase_status != TEST_PASS_STATUS:
                ts.set_status(SUBMIT_PHASE, TEST_PASS_STATUS)

    try:
        functor = lambda: _submit(self, job=job, no_batch=no_batch, prereq=prereq,
                                  resubmit=resubmit, skip_pnl=skip_pnl,
                                  mail_user=mail_user, mail_type=mail_type,
                                  batch_args=batch_args)
        run_and_log_case_status(functor, "case.submit", caseroot=self.get_value("CASEROOT"),
                                custom_success_msg_functor=verbatim_success_msg)
    except BaseException:  # Want to catch KeyboardInterrupt too (was a bare except)
        # If something failed in the batch system, make sure to mark
        # the test as failed if we are running a test.
        if self.get_value("TEST"):
            with TestStatus(test_dir=caseroot, test_name=casebaseid) as ts:
                ts.set_status(SUBMIT_PHASE, TEST_FAIL_STATUS)

        raise
示例#4
0
文件: case_setup.py 项目: piokuc/cime
def case_setup(self, clean=False, test_mode=False, reset=False, keep=None):
    ###############################################################################
    """Set up (or clean) the case, recording SETUP phase status for tests."""
    caseroot = self.get_value("CASEROOT")
    casebaseid = self.get_value("CASEBASEID")
    phase = "setup.clean" if clean else "case.setup"

    def functor():
        return _case_setup_impl(self,
                                caseroot,
                                clean=clean,
                                test_mode=test_mode,
                                reset=reset,
                                keep=keep)

    if self.get_value("TEST") and not test_mode:
        # Prefer the test's base id; fall back to the plain case name.
        test_name = casebaseid if casebaseid is not None else self.get_value("CASE")
        with TestStatus(test_dir=caseroot, test_name=test_name) as ts:
            try:
                run_and_log_case_status(functor, phase, caseroot=caseroot)
            except BaseException:  # Want to catch KeyboardInterrupt too
                ts.set_status(SETUP_PHASE, TEST_FAIL_STATUS)
                raise
            else:
                # A clean resets the phase to pending; a setup passes it.
                new_status = TEST_PEND_STATUS if clean else TEST_PASS_STATUS
                ts.set_status(SETUP_PHASE, new_status)
    else:
        run_and_log_case_status(functor, phase, caseroot=caseroot)
示例#5
0
def case_st_archive(self,
                    last_date_str=None,
                    archive_incomplete_logs=True,
                    copy_only=False,
                    resubmit=True):
    ###############################################################################
    """
    Create archive object and perform short term archiving

    Archives into DOUT_S_ROOT (created if missing) up to last_date_str if
    given, then resubmits the case when RESUBMIT > 0 and resubmit is True.
    """
    caseroot = self.get_value("CASEROOT")
    self.load_env(job="case.st_archive")
    if last_date_str is not None:
        try:
            last_date = get_file_date(last_date_str)
        except ValueError:
            expect(False, 'Could not parse the last date to archive')
    else:
        last_date = None

    dout_s_root = self.get_value('DOUT_S_ROOT')
    if dout_s_root is None or dout_s_root == 'UNSET':
        # Fixed typo in user-facing message: "achiver" -> "archiver".
        expect(False,
               'XML variable DOUT_S_ROOT is required for short-term archiver')
    if not isdir(dout_s_root):
        os.makedirs(dout_s_root)

    dout_s_save_interim = self.get_value('DOUT_S_SAVE_INTERIM_RESTART_FILES')
    if dout_s_save_interim == 'FALSE' or dout_s_save_interim == 'UNSET':
        rest_n = self.get_value('REST_N')
        stop_n = self.get_value('STOP_N')
        if rest_n < stop_n:
            # Fixed implicit string concatenation that logged
            # "...will be savedinterim restart files...".
            logger.warning('Restart files from end of run will be saved, '
                           'interim restart files will be deleted')

    logger.info("st_archive starting")

    archive = self.get_env('archive')
    functor = lambda: _archive_process(self, archive, last_date,
                                       archive_incomplete_logs, copy_only)
    run_and_log_case_status(functor, "st_archive", caseroot=caseroot)

    logger.info("st_archive completed")

    # resubmit case if appropriate
    resubmit_cnt = self.get_value("RESUBMIT")
    logger.debug("resubmit_cnt {} resubmit {}".format(resubmit_cnt, resubmit))
    if resubmit_cnt > 0 and resubmit:
        logger.info(
            "resubmitting from st_archive, resubmit={:d}".format(resubmit_cnt))
        if self.get_value("MACH") == "mira":
            # mira submits via an ssh hop recorded in .original_host
            expect(os.path.isfile(".original_host"),
                   "ERROR alcf host file not found")
            with open(".original_host", "r") as fd:
                sshhost = fd.read()
            run_cmd("ssh cooleylogin1 ssh {} '{}/case.submit {} --resubmit' "\
                        .format(sshhost, caseroot, caseroot), verbose=True)
        else:
            self.submit(resubmit=True)

    return True
示例#6
0
    def test_run_and_log_case_status(self):
        """The default phase writes matching starting/success log lines."""
        expected = [
            "00:00:00 default starting \n",
            "00:00:00 default success \n",
        ]

        with tempfile.TemporaryDirectory() as tempdir, MockTime():
            run_and_log_case_status(self.base_func, "default", caseroot=tempdir)
            self.assertMatchAllLines(tempdir, expected)
示例#7
0
def submit(self, job=None, no_batch=False, prereq=None, allow_fail=False, resubmit=False,
           resubmit_immediate=False, skip_pnl=False, mail_user=None, mail_type=None,
           batch_args=None, workflow=True):
    """Submit the case's jobs to the batch system (or run directly).

    For a test case, the SUBMIT phase is proactively marked PASS before
    submission and reset to FAIL if submission raises. On a resubmit, options
    saved in ``.submit_options`` from the original submit are re-applied for
    any option the caller left unset.
    """
    # resubmit_immediate is known not to work on these ALCF machines; degrade
    # to a normal submission rather than failing.
    if resubmit_immediate and self.get_value("MACH") in ['mira', 'cetus']:
        logger.warning("resubmit_immediate does not work on Mira/Cetus, submitting normally")
        resubmit_immediate = False

    caseroot = self.get_value("CASEROOT")
    if self.get_value("TEST"):
        casebaseid = self.get_value("CASEBASEID")
        # This should take care of the race condition where the submitted job
        # begins immediately and tries to set RUN phase. We proactively assume
        # a passed SUBMIT phase. If this state is already PASS, don't set it again
        # because then we'll lose RUN phase info if it's there. This info is important
        # for system_tests_common to know if it needs to reinitialize the test or not.
        with TestStatus(test_dir=caseroot, test_name=casebaseid) as ts:
            phase_status = ts.get_status(SUBMIT_PHASE)
            if phase_status != TEST_PASS_STATUS:
                ts.set_status(SUBMIT_PHASE, TEST_PASS_STATUS)

    # If this is a resubmit check the hidden file .submit_options for
    # any submit options used on the original submit and use them again
    submit_options = os.path.join(caseroot, ".submit_options")
    if resubmit and os.path.exists(submit_options):
        config = configparser.RawConfigParser()
        config.read(submit_options)
        # Saved options only fill in values the caller did not provide.
        if not skip_pnl and config.has_option('SubmitOptions','skip_pnl'):
            skip_pnl = config.getboolean('SubmitOptions', 'skip_pnl')
        if mail_user is None and config.has_option('SubmitOptions', 'mail_user'):
            mail_user = config.get('SubmitOptions', 'mail_user')
        if mail_type is None and config.has_option('SubmitOptions', 'mail_type'):
            mail_type = str(config.get('SubmitOptions', 'mail_type')).split(',')
        if batch_args is None and config.has_option('SubmitOptions', 'batch_args'):
            batch_args = config.get('SubmitOptions', 'batch_args')

    # Whether a real batch system is configured; forwarded to the status
    # logger so it can adjust its messaging.
    is_batch = self.get_value("BATCH_SYSTEM") is not None

    try:
        functor = lambda: _submit(self, job=job, no_batch=no_batch, prereq=prereq,
                                  allow_fail=allow_fail, resubmit=resubmit,
                                  resubmit_immediate=resubmit_immediate, skip_pnl=skip_pnl,
                                  mail_user=mail_user, mail_type=mail_type,
                                  batch_args=batch_args, workflow=workflow)
        # Log only the portion of the submit result after the last ':'.
        run_and_log_case_status(functor, "case.submit", caseroot=caseroot,
                                custom_success_msg_functor=lambda x: x.split(":")[-1],
                                is_batch=is_batch)
    except BaseException: # Want to catch KeyboardInterrupt too
        # If something failed in the batch system, make sure to mark
        # the test as failed if we are running a test.
        if self.get_value("TEST"):
            with TestStatus(test_dir=caseroot, test_name=casebaseid) as ts:
                ts.set_status(SUBMIT_PHASE, TEST_FAIL_STATUS)

        raise
示例#8
0
    def test_run_and_log_case_status_case_submit_no_batch(self):
        """case.submit with is_batch=False logs plain starting/success lines."""
        expected = [
            "00:00:00 case.submit starting \n",
            "00:00:00 case.submit success \n",
        ]

        with TemporaryDirectory() as tempdir, MockTime():
            run_and_log_case_status(self.base_func, "case.submit",
                                    is_batch=False, caseroot=tempdir)
            self.assertMatchAllLines(tempdir, expected)
示例#9
0
def case_st_archive(self, last_date_str=None, archive_incomplete_logs=True, copy_only=False, resubmit=True):
###############################################################################
    """
    Create archive object and perform short term archiving

    Archives into DOUT_S_ROOT (created if missing) up to last_date_str if
    given, then resubmits the case when RESUBMIT > 0 and resubmit is True.
    """
    caseroot = self.get_value("CASEROOT")
    self.load_env(job="case.st_archive")
    if last_date_str is not None:
        try:
            last_date = get_file_date(last_date_str)
        except ValueError:
            expect(False, 'Could not parse the last date to archive')
    else:
        last_date = None

    dout_s_root = self.get_value('DOUT_S_ROOT')
    if dout_s_root is None or dout_s_root == 'UNSET':
        # Fixed typo in user-facing message: "achiver" -> "archiver".
        expect(False,
               'XML variable DOUT_S_ROOT is required for short-term archiver')
    if not isdir(dout_s_root):
        os.makedirs(dout_s_root)

    dout_s_save_interim = self.get_value('DOUT_S_SAVE_INTERIM_RESTART_FILES')
    if dout_s_save_interim == 'FALSE' or dout_s_save_interim == 'UNSET':
        rest_n = self.get_value('REST_N')
        stop_n = self.get_value('STOP_N')
        if rest_n < stop_n:
            # Fixed implicit string concatenation that logged
            # "...will be savedinterim restart files...".
            logger.warning('Restart files from end of run will be saved, '
                        'interim restart files will be deleted')

    logger.info("st_archive starting")

    archive = self.get_env('archive')
    functor = lambda: _archive_process(self, archive, last_date, archive_incomplete_logs, copy_only)
    run_and_log_case_status(functor, "st_archive", caseroot=caseroot)

    logger.info("st_archive completed")

    # resubmit case if appropriate
    resubmit_cnt = self.get_value("RESUBMIT")
    logger.debug("resubmit_cnt {} resubmit {}".format(resubmit_cnt, resubmit))
    if resubmit_cnt > 0 and resubmit:
        logger.info("resubmitting from st_archive, resubmit={:d}".format(resubmit_cnt))
        if self.get_value("MACH") == "mira":
            # mira submits via an ssh hop recorded in .original_host
            expect(os.path.isfile(".original_host"), "ERROR alcf host file not found")
            with open(".original_host", "r") as fd:
                sshhost = fd.read()
            run_cmd("ssh cooleylogin1 ssh {} '{case}/case.submit {case} --resubmit' "\
                        .format(sshhost, case=caseroot), verbose=True)
        else:
            self.submit(resubmit=True)

    return True
示例#10
0
    def test_run_and_log_case_status_error(self):
        """A raising functor logs an error line followed by the message."""
        expected = [
            "00:00:00 default starting \n",
            "00:00:00 default error \n",
            "Something went wrong\n",
        ]

        with tempfile.TemporaryDirectory() as tempdir, MockTime():
            # The exception propagates; the log is checked after it is caught.
            with self.assertRaises(Exception):
                run_and_log_case_status(self.error_func, "default",
                                        caseroot=tempdir)

            self.assertMatchAllLines(tempdir, expected)
示例#11
0
def submit(self, job=None, no_batch=False, prereq=None, allow_fail=False, resubmit=False,
           resubmit_immediate=False, skip_pnl=False, mail_user=None, mail_type=None,
           batch_args=None):
    """Submit the case's jobs, restoring saved submit options on resubmit.

    For a test case the SUBMIT phase is proactively set to PASS before
    submitting and reset to FAIL if submission raises.
    """
    # resubmit_immediate is known not to work on these ALCF machines; degrade
    # to a normal submission rather than failing.
    if resubmit_immediate and self.get_value("MACH") in ['mira', 'cetus']:
        logger.warning("resubmit_immediate does not work on Mira/Cetus, submitting normally")
        resubmit_immediate = False

    if self.get_value("TEST"):
        caseroot = self.get_value("CASEROOT")
        casebaseid = self.get_value("CASEBASEID")
        # This should take care of the race condition where the submitted job
        # begins immediately and tries to set RUN phase. We proactively assume
        # a passed SUBMIT phase. If this state is already PASS, don't set it again
        # because then we'll lose RUN phase info if it's there. This info is important
        # for system_tests_common to know if it needs to reinitialize the test or not.
        with TestStatus(test_dir=caseroot, test_name=casebaseid) as ts:
            phase_status = ts.get_status(SUBMIT_PHASE)
            if phase_status != TEST_PASS_STATUS:
                ts.set_status(SUBMIT_PHASE, TEST_PASS_STATUS)

    # If this is a resubmit check the hidden file .submit_options for
    # any submit options used on the original submit and use them again
    caseroot = self.get_value("CASEROOT")
    submit_options = os.path.join(caseroot, ".submit_options")
    if resubmit and os.path.exists(submit_options):
        # RawConfigParser instead of SafeConfigParser: the latter was
        # deprecated and removed in Python 3.12, and other submit()
        # implementations in this codebase already use RawConfigParser.
        config = configparser.RawConfigParser()
        config.read(submit_options)
        # Saved options only fill in values the caller did not provide.
        if not skip_pnl and config.has_option('SubmitOptions','skip_pnl'):
            skip_pnl = config.getboolean('SubmitOptions', 'skip_pnl')
        if mail_user is None and config.has_option('SubmitOptions', 'mail_user'):
            mail_user = config.get('SubmitOptions', 'mail_user')
        if mail_type is None and config.has_option('SubmitOptions', 'mail_type'):
            mail_type = str(config.get('SubmitOptions', 'mail_type')).split(',')
        if batch_args is None and config.has_option('SubmitOptions', 'batch_args'):
            batch_args = config.get('SubmitOptions', 'batch_args')

    try:
        functor = lambda: _submit(self, job=job, no_batch=no_batch, prereq=prereq,
                                  allow_fail=allow_fail, resubmit=resubmit,
                                  resubmit_immediate=resubmit_immediate, skip_pnl=skip_pnl,
                                  mail_user=mail_user, mail_type=mail_type,
                                  batch_args=batch_args)
        run_and_log_case_status(functor, "case.submit", caseroot=caseroot,
                                custom_success_msg_functor=verbatim_success_msg)
    except BaseException:  # Want to catch KeyboardInterrupt too (was a bare except)
        # If something failed in the batch system, make sure to mark
        # the test as failed if we are running a test.
        if self.get_value("TEST"):
            with TestStatus(test_dir=caseroot, test_name=casebaseid) as ts:
                ts.set_status(SUBMIT_PHASE, TEST_FAIL_STATUS)

        raise
示例#12
0
    def test_run_and_log_case_status_case_submit_error_on_batch(self):
        """A failing case.submit with is_batch=True logs the error and message."""
        expected = [
            "00:00:00 case.submit starting \n",
            "00:00:00 case.submit error \n",
            "Something went wrong\n",
        ]

        with TemporaryDirectory() as tempdir, MockTime():
            # The exception propagates; the log is checked after it is caught.
            with self.assertRaises(Exception):
                run_and_log_case_status(self.error_func, "case.submit",
                                        is_batch=True, caseroot=tempdir)

            self.assertMatchAllLines(tempdir, expected)
示例#13
0
    def test_run_and_log_case_status_custom_msg(self):
        """Custom functor return values are appended to the log lines."""
        expected = [
            "00:00:00 default starting starting extra\n",
            "00:00:00 default success success extra\n",
        ]

        def msg_starting(*args):
            return "starting extra"

        def msg_success(*args):
            return "success extra"

        with TemporaryDirectory() as tempdir, MockTime():
            run_and_log_case_status(self.base_func, "default",
                                    caseroot=tempdir,
                                    custom_starting_msg_functor=msg_starting,
                                    custom_success_msg_functor=msg_success)

            self.assertMatchAllLines(tempdir, expected)
示例#14
0
def submit(self,
           job=None,
           no_batch=False,
           prereq=None,
           resubmit=False,
           skip_pnl=False,
           mail_user=None,
           mail_type=None,
           batch_args=None):
    """Submit the case's jobs to the batch system.

    For a test case, the SUBMIT phase is proactively marked PASS before
    submission (to avoid racing the submitted job's RUN-phase update) and is
    reset to FAIL if submission raises.
    """
    if self.get_value("TEST"):
        caseroot = self.get_value("CASEROOT")
        casebaseid = self.get_value("CASEBASEID")
        # This should take care of the race condition where the submitted job
        # begins immediately and tries to set RUN phase. We proactively assume
        # a passed SUBMIT phase. If this state is already PASS, don't set it again
        # because then we'll lose RUN phase info if it's there. This info is important
        # for system_tests_common to know if it needs to reinitialize the test or not.
        with TestStatus(test_dir=caseroot, test_name=casebaseid) as ts:
            phase_status = ts.get_status(SUBMIT_PHASE)
            if phase_status != TEST_PASS_STATUS:
                ts.set_status(SUBMIT_PHASE, TEST_PASS_STATUS)

    try:
        functor = lambda: _submit(self,
                                  job=job,
                                  no_batch=no_batch,
                                  prereq=prereq,
                                  resubmit=resubmit,
                                  skip_pnl=skip_pnl,
                                  mail_user=mail_user,
                                  mail_type=mail_type,
                                  batch_args=batch_args)
        run_and_log_case_status(
            functor,
            "case.submit",
            caseroot=self.get_value("CASEROOT"),
            custom_success_msg_functor=verbatim_success_msg)
    except BaseException:  # Want to catch KeyboardInterrupt too (was a bare except)
        # If something failed in the batch system, make sure to mark
        # the test as failed if we are running a test.
        if self.get_value("TEST"):
            with TestStatus(test_dir=caseroot, test_name=casebaseid) as ts:
                ts.set_status(SUBMIT_PHASE, TEST_FAIL_STATUS)

        raise
示例#15
0
def case_setup(self, clean=False, test_mode=False, reset=False):
###############################################################################
    """Set up (or clean) the case, recording SETUP phase status for tests."""
    caseroot, casebaseid = self.get_value("CASEROOT"), self.get_value("CASEBASEID")
    phase = "setup.clean" if clean else "case.setup"
    functor = lambda: _case_setup_impl(self, caseroot, clean, test_mode, reset)

    if self.get_value("TEST") and not test_mode:
        # Prefer the test's base id; fall back to the plain case name.
        test_name = casebaseid if casebaseid is not None else self.get_value("CASE")
        with TestStatus(test_dir=caseroot, test_name=test_name) as ts:
            try:
                run_and_log_case_status(functor, phase, caseroot=caseroot)
            except BaseException:  # Want to catch KeyboardInterrupt too (was a bare except)
                ts.set_status(SETUP_PHASE, TEST_FAIL_STATUS)
                raise
            else:
                if clean:
                    ts.set_status(SETUP_PHASE, TEST_PEND_STATUS)
                else:
                    ts.set_status(SETUP_PHASE, TEST_PASS_STATUS)
    else:
        run_and_log_case_status(functor, phase, caseroot=caseroot)
示例#16
0
def case_setup(self, clean=False, test_mode=False, reset=False, keep=None):
###############################################################################
    """Set up (or clean) the case, tagging log messages with the batch job id."""
    caseroot = self.get_value("CASEROOT")
    casebaseid = self.get_value("CASEBASEID")
    phase = "setup.clean" if clean else "case.setup"

    def functor():
        return _case_setup_impl(self, caseroot, clean=clean, test_mode=test_mode,
                                reset=reset, keep=keep)

    is_batch = self.get_value("BATCH_SYSTEM") is not None
    msg_func = None

    if is_batch:
        # Tag the starting/success log lines with the batch job id (if any).
        jobid = batch_jobid()
        msg_func = lambda *args: jobid if jobid is not None else ""

    if self.get_value("TEST") and not test_mode:
        # Prefer the test's base id; fall back to the plain case name.
        test_name = casebaseid if casebaseid is not None else self.get_value("CASE")
        with TestStatus(test_dir=caseroot, test_name=test_name) as ts:
            try:
                run_and_log_case_status(functor, phase,
                                        custom_starting_msg_functor=msg_func,
                                        custom_success_msg_functor=msg_func,
                                        caseroot=caseroot,
                                        is_batch=is_batch)
            except BaseException:  # Want to catch KeyboardInterrupt too
                ts.set_status(SETUP_PHASE, TEST_FAIL_STATUS)
                raise
            else:
                ts.set_status(SETUP_PHASE,
                              TEST_PEND_STATUS if clean else TEST_PASS_STATUS)
    else:
        run_and_log_case_status(functor, phase,
                                custom_starting_msg_functor=msg_func,
                                custom_success_msg_functor=msg_func,
                                caseroot=caseroot,
                                is_batch=is_batch)
示例#17
0
文件: build.py 项目: bertinia/cime
def case_build(caseroot, case, sharedlib_only=False, model_only=False, buildlist=None, save_build_provenance=True):
###############################################################################
    """Build the case, logging case.build phase status to caseroot."""
    def functor():
        return _case_build_impl(caseroot, case, sharedlib_only, model_only,
                                buildlist, save_build_provenance)

    return run_and_log_case_status(functor, "case.build", caseroot=caseroot)
示例#18
0
        model_log(
            "e3sm", logger, "{} SAVE_PRERUN_PROVENANCE BEGINS HERE".format(
                time.strftime("%Y-%m-%d %H:%M:%S")))
        save_prerun_provenance(case)
        model_log(
            "e3sm", logger, "{} SAVE_PRERUN_PROVENANCE HAS FINISHED".format(
                time.strftime("%Y-%m-%d %H:%M:%S")))

        model_log(
            "e3sm", logger, "{} MODEL EXECUTION BEGINS HERE".format(
                time.strftime("%Y-%m-%d %H:%M:%S")))
        run_func = lambda: run_cmd_no_fail(cmd, from_dir=rundir)
        case.flush()
        try:
            run_and_log_case_status(run_func,
                                    "model execution",
                                    caseroot=case.get_value("CASEROOT"))
            cmd_success = True
        except CIMEError:
            cmd_success = False

        # The run will potentially take a very long time. We need to
        # allow the user to xmlchange things in their case.
        #
        # WARNING: All case variables are reloaded after this call to get the
        # new values of any variables that may have been changed by
        # the user during model execution. Thus, any local variables
        # set from case variables before this point may be
        # inconsistent with their latest values in the xml files, so
        # should generally be reloaded (via case.get_value(XXX)) if they are still needed.
        case.read_xml()
示例#19
0
        os.makedirs(dout_s_root)

    dout_s_save_interim = case.get_value('DOUT_S_SAVE_INTERIM_RESTART_FILES')
    if dout_s_save_interim == 'FALSE' or dout_s_save_interim == 'UNSET':
        rest_n = case.get_value('REST_N')
        stop_n = case.get_value('STOP_N')
        if rest_n < stop_n:
            logger.warning('Restart files from end of run will be saved'
                           'interim restart files will be deleted')

    logger.info("st_archive starting")

    archive = case.get_env('archive')
    functor = lambda: _archive_process(case, archive, last_date,
                                       archive_incomplete_logs, copy_only)
    run_and_log_case_status(functor, "st_archive", caseroot=caseroot)

    logger.info("st_archive completed")

    # resubmit case if appropriate
    resubmit = case.get_value("RESUBMIT")
    if resubmit > 0 and not no_resubmit:
        logger.info(
            "resubmitting from st_archive, resubmit={:d}".format(resubmit))
        if case.get_value("MACH") == "mira":
            expect(os.path.isfile(".original_host"),
                   "ERROR alcf host file not found")
            with open(".original_host", "r") as fd:
                sshhost = fd.read()
            run_cmd("ssh cooleylogin1 ssh {} '{}/case.submit {} --resubmit' "\
                        .format(sshhost, caseroot, caseroot), verbose=True)
示例#20
0
            phase_status = ts.get_status(SUBMIT_PHASE)
            if phase_status != TEST_PASS_STATUS:
                ts.set_status(SUBMIT_PHASE, TEST_PASS_STATUS)

    try:
        functor = lambda: _submit(case,
                                  job=job,
                                  no_batch=no_batch,
                                  prereq=prereq,
                                  resubmit=resubmit,
                                  skip_pnl=skip_pnl,
                                  mail_user=mail_user,
                                  mail_type=mail_type,
                                  batch_args=batch_args)
        run_and_log_case_status(
            functor,
            "case.submit",
            caseroot=case.get_value("CASEROOT"),
            custom_success_msg_functor=verbatim_success_msg)
    except:
        # If something failed in the batch system, make sure to mark
        # the test as failed if we are running a test.
        if case.get_value("TEST"):
            with TestStatus(test_dir=caseroot, test_name=casebaseid) as ts:
                ts.set_status(SUBMIT_PHASE, TEST_FAIL_STATUS)

        raise


def check_case(case):
    """Run pre-submit consistency checks on *case*.

    NOTE(review): this view may be truncated at the chunk boundary — the
    original function likely continues past these two calls; verify against
    the full source.
    """
    check_lockedfiles(case)
    create_namelists(case)  # Must be called before check_all_input_data
示例#21
0
                time.strftime("%Y-%m-%d %H:%M:%S")),
        )

        model_log(
            "e3sm",
            logger,
            "{} MODEL EXECUTION BEGINS HERE".format(
                time.strftime("%Y-%m-%d %H:%M:%S")),
        )
        run_func = lambda: run_cmd_no_fail(cmd, from_dir=rundir)
        case.flush()

        try:
            run_and_log_case_status(
                run_func,
                "model execution",
                custom_starting_msg_functor=msg_func,
                custom_success_msg_functor=msg_func,
                caseroot=case.get_value("CASEROOT"),
                is_batch=is_batch,
            )
            cmd_success = True
        except CIMEError:
            cmd_success = False

        # The run will potentially take a very long time. We need to
        # allow the user to xmlchange things in their case.
        #
        # WARNING: All case variables are reloaded after this call to get the
        # new values of any variables that may have been changed by
        # the user during model execution. Thus, any local variables
        # set from case variables before this point may be
示例#22
0
def case_setup(self, clean=False, test_mode=False, reset=False, keep=None):
    ###############################################################################
    """Set up (or clean) the case, then generate the GPU MPI-rank wrapper.

    When run as part of a test (and not in test_mode), the SETUP phase status
    is recorded: FAIL on any exception, PEND after a clean, PASS otherwise.
    Afterwards, if the machine provides an mpi_run_gpu template, a
    set_device_rank.sh wrapper is rendered into RUNDIR and made executable.
    """
    caseroot, casebaseid = self.get_value("CASEROOT"), self.get_value(
        "CASEBASEID")
    phase = "setup.clean" if clean else "case.setup"
    functor = lambda: _case_setup_impl(self,
                                       caseroot,
                                       clean=clean,
                                       test_mode=test_mode,
                                       reset=reset,
                                       keep=keep)

    is_batch = self.get_value("BATCH_SYSTEM") is not None
    msg_func = None

    if is_batch:
        # Tag the starting/success log lines with the batch job id (if any).
        jobid = batch_jobid()
        msg_func = lambda *args: jobid if jobid is not None else ""

    if self.get_value("TEST") and not test_mode:
        # Prefer the test's base id; fall back to the plain case name.
        test_name = casebaseid if casebaseid is not None else self.get_value(
            "CASE")
        with TestStatus(test_dir=caseroot, test_name=test_name) as ts:
            try:
                run_and_log_case_status(
                    functor,
                    phase,
                    custom_starting_msg_functor=msg_func,
                    custom_success_msg_functor=msg_func,
                    caseroot=caseroot,
                    is_batch=is_batch,
                )
            except BaseException:  # Want to catch KeyboardInterrupt too
                ts.set_status(SETUP_PHASE, TEST_FAIL_STATUS)
                raise
            else:
                if clean:
                    ts.set_status(SETUP_PHASE, TEST_PEND_STATUS)
                else:
                    ts.set_status(SETUP_PHASE, TEST_PASS_STATUS)
    else:
        run_and_log_case_status(
            functor,
            phase,
            custom_starting_msg_functor=msg_func,
            custom_success_msg_functor=msg_func,
            caseroot=caseroot,
            is_batch=is_batch,
        )

    # put the following section here to make sure the rundir is generated first
    machdir = self.get_value("MACHDIR")
    mach = self.get_value("MACH")
    ngpus_per_node = self.get_value("NGPUS_PER_NODE")
    overrides = {}
    overrides["ngpus_per_node"] = ngpus_per_node
    # Machine-specific template for the GPU device-binding wrapper; only
    # rendered when the machine actually ships one.
    input_template = os.path.join(machdir, "mpi_run_gpu.{}".format(mach))
    if os.path.isfile(input_template):
        # update the wrapper script that sets the device id for each MPI rank
        output_text = transform_vars(open(input_template, "r").read(),
                                     case=self,
                                     overrides=overrides)

        # write it out to the run dir
        rundir = self.get_value("RUNDIR")
        output_name = os.path.join(rundir, "set_device_rank.sh")
        logger.info("Creating file {}".format(output_name))
        with open(output_name, "w") as f:
            f.write(output_text)

        # make the wrapper script executable
        if os.path.isfile(output_name):
            os.system("chmod +x " + output_name)
        else:
            expect(
                False, "The file {} is not written out correctly.".format(
                    output_name))
示例#23
0
def case_st_archive(
    self,
    last_date_str=None,
    archive_incomplete_logs=True,
    copy_only=False,
    resubmit=True,
):
    ###############################################################################
    """
    Create archive object and perform short term archiving.

    Args:
        last_date_str: optional date string parsed by get_file_date(); only
            files up to this date are archived. Raises via expect() if the
            string cannot be parsed.
        archive_incomplete_logs: if True, logs from incomplete runs are
            archived as well.
        copy_only: if True, files are copied into the archive instead of moved.
        resubmit: if True (and EXTERNAL_WORKFLOW is unset and RESUBMIT > 0),
            the case is resubmitted after archiving completes.

    Returns:
        True on completion.
    """
    logger.debug("resubmit {}".format(resubmit))
    caseroot = self.get_value("CASEROOT")
    self.load_env(job="case.st_archive")
    if last_date_str is not None:
        try:
            last_date = get_file_date(last_date_str)
        except ValueError:
            expect(False, "Could not parse the last date to archive")
    else:
        last_date = None

    # DOUT_S_ROOT must be set; create the directory tree if it is missing.
    dout_s_root = self.get_value("DOUT_S_ROOT")
    if dout_s_root is None or dout_s_root == "UNSET":
        expect(False,
               "XML variable DOUT_S_ROOT is required for short-term archiver")
    if not isdir(dout_s_root):
        os.makedirs(dout_s_root)

    dout_s_save_interim = self.get_value("DOUT_S_SAVE_INTERIM_RESTART_FILES")
    if dout_s_save_interim == "FALSE" or dout_s_save_interim == "UNSET":
        rest_n = self.get_value("REST_N")
        stop_n = self.get_value("STOP_N")
        if rest_n < stop_n:
            # BUGFIX: the two adjacent string literals were concatenated with
            # no separator, logging "...will be savedinterim restart...".
            logger.warning("Restart files from end of run will be saved, "
                           "interim restart files will be deleted")

    logger.info("st_archive starting")

    is_batch = self.get_value("BATCH_SYSTEM")
    msg_func = None

    if is_batch:
        jobid = batch_jobid()
        # Tag CaseStatus messages with the batch job id when one is available.
        msg_func = lambda *args: jobid if jobid is not None else ""

    archive = self.get_env("archive")
    functor = lambda: _archive_process(self, archive, last_date,
                                       archive_incomplete_logs, copy_only)
    run_and_log_case_status(
        functor,
        "st_archive",
        custom_starting_msg_functor=msg_func,
        custom_success_msg_functor=msg_func,
        caseroot=caseroot,
        is_batch=is_batch,
    )

    logger.info("st_archive completed")

    # resubmit case if appropriate
    if not self.get_value("EXTERNAL_WORKFLOW") and resubmit:
        resubmit_cnt = self.get_value("RESUBMIT")
        logger.debug("resubmit_cnt {} resubmit {}".format(
            resubmit_cnt, resubmit))
        if resubmit_cnt > 0:
            logger.info("resubmitting from st_archive, resubmit={:d}".format(
                resubmit_cnt))
            if self.get_value("MACH") == "mira":
                # On mira the resubmit must be launched from the original
                # login host recorded at case-creation time.
                expect(os.path.isfile(".original_host"),
                       "ERROR alcf host file not found")
                with open(".original_host", "r") as fd:
                    sshhost = fd.read()
                run_cmd(
                    "ssh cooleylogin1 ssh {} '{case}/case.submit {case} --resubmit' "
                    .format(sshhost, case=caseroot),
                    verbose=True,
                )
            else:
                self.submit(resubmit=True)

    return True
示例#24
0
def case_lt_archive(case):
    ###############################################################################
    """Perform long-term archiving for *case*, logging status to CASEROOT."""
    caseroot = case.get_value("CASEROOT")

    def _do_lt_archive():
        return _case_lt_archive_impl(case)

    return run_and_log_case_status(_do_lt_archive, "lt_archiving",
                                   caseroot=caseroot)
示例#25
0
文件: build.py 项目: bertinia/cime
def clean(case, cleanlist=None, clean_all=False, clean_depends=None):
###############################################################################
    """Clean build artifacts for *case*, logging status as "build.clean"."""
    caseroot = case.get_value("CASEROOT")

    def _do_clean():
        return _clean_impl(case, cleanlist, clean_all, clean_depends)

    return run_and_log_case_status(_do_clean, "build.clean", caseroot=caseroot)
示例#26
0
文件: case_run.py 项目: srinathv/cime
def _run_model_impl(case, lid, skip_pnl=False, da_cycle=0):
###############################################################################
    """
    Execute the model run for *case*, retrying on matched log failures.

    Args:
        case: Case object supplying XML values, thread count, and the mpirun
            command; its ``spare_nodes`` attribute is decremented on node
            failure (it is overloaded as a retry budget when spare nodes are
            not allocated -- see comment below).
        lid: log id used to name the model log file; rebound to a new lid on
            each retry.
        skip_pnl: forwarded to _pre_run_check (presumably skips namelist
            preview -- TODO confirm against _pre_run_check).
        da_cycle: data-assimilation cycle index forwarded to _pre_run_check.

    Returns:
        The lid of the final (successful) attempt.

    Raises:
        Via expect(): if the run command fails and no retry applies.
    """
    logger.debug("{} PRE_RUN_CHECK BEGINS HERE".format(time.strftime("%Y-%m-%d %H:%M:%S")))
    _pre_run_check(case, lid, skip_pnl=skip_pnl, da_cycle=da_cycle)
    logger.debug("{} PRE_RUN_CHECK HAS FINISHED".format(time.strftime("%Y-%m-%d %H:%M:%S")))

    model = case.get_value("MODEL")

    # Set OMP_NUM_THREADS
    os.environ["OMP_NUM_THREADS"] = str(case.thread_count)

    # Run the model
    cmd = case.get_mpirun_cmd(allow_unresolved_envvars=False)
    logger.info("run command is {} ".format(cmd))

    rundir = case.get_value("RUNDIR")
    loop = True

    # MPIRUN_RETRY_REGEX allows the mpi command to be reattempted if the
    # failure described by that regular expression is matched in the model log
    # case.spare_nodes is overloaded and may also represent the number of
    # retries to attempt if ALLOCATE_SPARE_NODES is False
    retry_run_re = case.get_value("MPIRUN_RETRY_REGEX")
    node_fail_re = case.get_value("NODE_FAIL_REGEX")
    retry_count = 0
    if retry_run_re:
        # NOTE: re.escape means the XML value is matched literally, not as a
        # regex pattern.
        retry_run_regex = re.compile(re.escape(retry_run_re))
        retry_count = case.get_value("MPIRUN_RETRY_COUNT")
    if node_fail_re:
        node_fail_regex = re.compile(re.escape(node_fail_re))

    while loop:
        loop = False

        logger.debug("{} SAVE_PRERUN_PROVENANCE BEGINS HERE".format(time.strftime("%Y-%m-%d %H:%M:%S")))
        save_prerun_provenance(case)
        logger.debug("{} SAVE_PRERUN_PROVENANCE HAS FINISHED".format(time.strftime("%Y-%m-%d %H:%M:%S")))

        logger.debug("{} MODEL EXECUTION BEGINS HERE".format(time.strftime("%Y-%m-%d %H:%M:%S")))
        # run_cmd returns (stat, out, err); only the exit status is kept.
        run_func = lambda: run_cmd(cmd, from_dir=rundir)[0]
        stat = run_and_log_case_status(run_func, "model execution", caseroot=case.get_value("CASEROOT"))
        logger.debug("{} MODEL EXECUTION HAS FINISHED".format(time.strftime("%Y-%m-%d %H:%M:%S")))

        model_logfile = os.path.join(rundir, model + ".log." + lid)
        # Determine if failure was due to a failed node, if so, try to restart
        if retry_run_re or node_fail_re:
            model_logfile = os.path.join(rundir, model + ".log." + lid)
            if os.path.exists(model_logfile):
                # Count how many times each failure pattern appears in the log.
                num_node_fails=0
                num_retry_fails=0
                if node_fail_re:
                    num_node_fails = len(node_fail_regex.findall(open(model_logfile, 'r').read()))
                if retry_run_re:
                    num_retry_fails = len(retry_run_regex.findall(open(model_logfile, 'r').read()))
                logger.debug ("RETRY: num_retry_fails {} spare_nodes {} retry_count {}".
                              format(num_retry_fails, case.spare_nodes, retry_count))
                if num_node_fails > 0 and case.spare_nodes >= num_node_fails:
                    # We failed due to node failure! Spend spare nodes and retry.
                    logger.warning("Detected model run failed due to node failure, restarting")
                    case.spare_nodes -= num_node_fails
                    loop = True
                    case.set_value("CONTINUE_RUN",
                                   case.get_value("RESUBMIT_SETS_CONTINUE_RUN"))
                elif num_retry_fails > 0 and retry_count >= num_retry_fails:
                    logger.warning("Detected model run failed, restarting")
                    retry_count -= 1
                    loop = True
                if loop:
                    # Archive the last consistent set of restart files and restore them
                    if case.get_value("DOUT_S"):
                        case.case_st_archive(resubmit=False)
                        case.restore_from_archive()

                    # Fresh lid and namelists for the retry attempt.
                    lid = new_lid()
                    case.create_namelists()

        if stat != 0 and not loop:
            # We failed and we're not restarting
            expect(False, "RUN FAIL: Command '{}' failed\nSee log file for details: {}".format(cmd, model_logfile))

    logger.debug("{} POST_RUN_CHECK BEGINS HERE".format(time.strftime("%Y-%m-%d %H:%M:%S")))
    _post_run_check(case, lid)
    logger.debug("{} POST_RUN_CHECK HAS FINISHED".format(time.strftime("%Y-%m-%d %H:%M:%S")))

    return lid
        env_module = case.get_env("mach_specific")
        env_module.make_env_mach_specific_file(compiler, debug, mpilib, "sh")
        env_module.make_env_mach_specific_file(compiler, debug, mpilib, "csh")
        env_module.save_all_env_info("software_environment.txt")


###############################################################################
def case_setup(case, clean=False, test_mode=False, reset=False):
    ###############################################################################
    """
    Run (or clean) case.setup for *case*, recording the outcome in CaseStatus.

    When the case is a test (TEST is set) and not running in test_mode, the
    SETUP phase of the TestStatus file is also updated: FAIL on any error,
    PEND after a clean, PASS after a successful setup.
    """
    caseroot, casebaseid = case.get_value("CASEROOT"), case.get_value(
        "CASEBASEID")
    phase = "setup.clean" if clean else "case.setup"
    functor = lambda: _case_setup_impl(case, caseroot, clean, test_mode, reset)

    if case.get_value("TEST") and not test_mode:
        test_name = casebaseid if casebaseid is not None else case.get_value(
            "CASE")
        with TestStatus(test_dir=caseroot, test_name=test_name) as ts:
            try:
                run_and_log_case_status(functor, phase, caseroot=caseroot)
            # BUGFIX: was a bare "except:"; narrowed to BaseException to match
            # the convention used elsewhere in this file (same behavior, but
            # explicit and lint-clean).
            except BaseException:  # Want to catch KeyboardInterrupt too
                ts.set_status(SETUP_PHASE, TEST_FAIL_STATUS)
                raise
            else:
                if clean:
                    # A cleaned setup leaves the phase pending for a re-run.
                    ts.set_status(SETUP_PHASE, TEST_PEND_STATUS)
                else:
                    ts.set_status(SETUP_PHASE, TEST_PASS_STATUS)
    else:
        run_and_log_case_status(functor, phase, caseroot=caseroot)
示例#28
0
            if not loop:
                # We failed and we're not restarting
                expect(False, "RUN FAIL: Command '{}' failed\nSee log file for details: {}".format(cmd, model_logfile))

    logger.info("{} MODEL EXECUTION HAS FINISHED".format(time.strftime("%Y-%m-%d %H:%M:%S")))

    post_run_check(case, lid)

    return lid

###############################################################################
def run_model(case, lid, skip_pnl=False):
###############################################################################
    """Run the model phase for *case*, logging status as "case.run"."""
    caseroot = case.get_value("CASEROOT")

    def _do_run():
        return _run_model_impl(case, lid, skip_pnl=skip_pnl)

    return run_and_log_case_status(_do_run, "case.run", caseroot=caseroot)

###############################################################################
def post_run_check(case, lid):
###############################################################################

    rundir = case.get_value("RUNDIR")
    model = case.get_value("MODEL")

    # find the last model.log and cpl.log
    model_logfile = os.path.join(rundir, model + ".log." + lid)
    cpl_logfile = os.path.join(rundir, "cpl" + ".log." + lid)

    if not os.path.isfile(model_logfile):
        expect(False, "Model did not complete, no {} log file ".format(model_logfile))
    elif not os.path.isfile(cpl_logfile):
示例#29
0
文件: case_run.py 项目: Katetc/cime
def _run_model_impl(case, lid, skip_pnl=False, da_cycle=0):
###############################################################################
    """
    Execute the model run for *case*, restarting once per detected node failure.

    Args:
        case: Case object supplying XML values and the mpirun command; its
            ``spare_nodes`` attribute is decremented when a node failure is
            detected and the run is retried.
        lid: log id used to name the model log file; rebound to a new lid on
            each retry.
        skip_pnl: forwarded to pre_run_check (presumably skips namelist
            preview -- TODO confirm against pre_run_check).
        da_cycle: data-assimilation cycle index forwarded to pre_run_check.

    Returns:
        The lid of the final (successful) attempt.

    Raises:
        Via expect(): if the run command fails and no restart applies.
    """
    pre_run_check(case, lid, skip_pnl=skip_pnl, da_cycle=da_cycle)

    model = case.get_value("MODEL")

    # Set OMP_NUM_THREADS
    env_mach_pes = case.get_env("mach_pes")
    comp_classes = case.get_values("COMP_CLASSES")
    thread_count = env_mach_pes.get_max_thread_count(comp_classes)
    os.environ["OMP_NUM_THREADS"] = str(thread_count)

    # Run the model
    logger.info("{} MODEL EXECUTION BEGINS HERE".format(time.strftime("%Y-%m-%d %H:%M:%S")))

    cmd = case.get_mpirun_cmd(job="case.run")
    cmd = case.get_resolved_value(cmd)
    logger.info("run command is {} ".format(cmd))

    rundir = case.get_value("RUNDIR")
    loop = True

    while loop:
        loop = False

        save_prerun_provenance(case)
        # run_cmd returns (stat, out, err); only the exit status is kept.
        run_func = lambda: run_cmd(cmd, from_dir=rundir)[0]
        stat = run_and_log_case_status(run_func, "model execution", caseroot=case.get_value("CASEROOT"))

        model_logfile = os.path.join(rundir, model + ".log." + lid)
        # Determine if failure was due to a failed node, if so, try to restart
        if stat != 0:
            node_fail_re = case.get_value("NODE_FAIL_REGEX")
            if node_fail_re:
                node_fail_regex = re.compile(node_fail_re)
                model_logfile = os.path.join(rundir, model + ".log." + lid)
                if os.path.exists(model_logfile):
                    # Count node-failure pattern matches in the model log.
                    num_fails = len(node_fail_regex.findall(open(model_logfile, 'r').read()))
                    if num_fails > 0 and case.spare_nodes >= num_fails:
                        # We failed due to node failure!
                        logger.warning("Detected model run failed due to node failure, restarting")

                        # Archive the last consistent set of restart files and restore them
                        case_st_archive(case, no_resubmit=True)
                        restore_from_archive(case)

                        case.set_value("CONTINUE_RUN",
                                       case.get_value("RESUBMIT_SETS_CONTINUE_RUN"))
                        create_namelists(case)

                        # Fresh lid for the retry attempt; spend spare nodes.
                        lid = new_lid()
                        loop = True

                        case.spare_nodes -= num_fails

            if not loop:
                # We failed and we're not restarting
                expect(False, "RUN FAIL: Command '{}' failed\nSee log file for details: {}".format(cmd, model_logfile))

    logger.info("{} MODEL EXECUTION HAS FINISHED".format(time.strftime("%Y-%m-%d %H:%M:%S")))

    post_run_check(case, lid)

    return lid
示例#30
0
        lock_file("env_build.xml", caseroot=case.get_value("CASEROOT"))


###############################################################################
def case_build(caseroot,
               case,
               sharedlib_only=False,
               model_only=False,
               buildlist=None,
               save_build_provenance=True):
    ###############################################################################
    """
    Build the case, logging status under *caseroot*.

    Args:
        caseroot: path to the case directory.
        case: Case object being built.
        sharedlib_only: build only the shared libraries.
        model_only: build only the model components.
        buildlist: optional list restricting which components are built.
        save_build_provenance: forwarded to the build implementation.

    Returns:
        The result of run_and_log_case_status (the build functor's return).
    """
    functor = lambda: _case_build_impl(caseroot, case, sharedlib_only,
                                       model_only, buildlist,
                                       save_build_provenance)
    # Annotate the phase name so the CaseStatus log distinguishes partial
    # builds. Idiom fix: truthiness tests instead of "== True" comparisons.
    cb = "case.build"
    if sharedlib_only:
        cb += " (SHAREDLIB_BUILD)"
    if model_only:
        cb += " (MODEL_BUILD)"
    return run_and_log_case_status(functor, cb, caseroot=caseroot)


###############################################################################
def clean(case, cleanlist=None, clean_all=False, clean_depends=None):
    ###############################################################################
    """Clean build artifacts for *case*, logging status as "build.clean"."""
    caseroot = case.get_value("CASEROOT")

    def _do_clean():
        return _clean_impl(case, cleanlist, clean_all, clean_depends)

    return run_and_log_case_status(_do_clean, "build.clean", caseroot=caseroot)
示例#31
0
文件: case_run.py 项目: srinathv/cime
def _run_model(case, lid, skip_pnl=False, da_cycle=0):
###############################################################################
    """Run _run_model_impl under case-status logging ("case.run")."""
    caseroot = case.get_value("CASEROOT")

    def _do_run():
        return _run_model_impl(case, lid, skip_pnl=skip_pnl, da_cycle=da_cycle)

    return run_and_log_case_status(_do_run, "case.run", caseroot=caseroot)