Example #1
 def _process_message_failed(self, itask, event_time, message):
     """Helper for process_message, handle a failed message."""
     if event_time is None:
         event_time = get_current_time_string()
     itask.set_summary_time('finished', event_time)
     job_d = get_task_job_id(itask.point, itask.tdef.name, itask.submit_num)
     self.job_pool.set_job_time(job_d, 'finished', event_time)
     self.suite_db_mgr.put_update_task_jobs(itask, {
         "run_status": 1,
         "time_run_exit": event_time,
     })
     if (TASK_STATUS_RETRYING not in itask.try_timers
             or itask.try_timers[TASK_STATUS_RETRYING].next() is None):
         # No retry lined up: definitive failure.
         self.pflag = True
         if itask.state.reset(TASK_STATUS_FAILED):
             self.setup_event_handlers(itask, "failed", message)
             self.job_pool.set_job_state(job_d, TASK_STATUS_FAILED)
         LOG.critical("[%s] -job(%02d) %s", itask, itask.submit_num,
                      "failed")
     elif itask.state.reset(TASK_STATUS_RETRYING):
         delay_msg = "retrying in %s" % (
             itask.try_timers[TASK_STATUS_RETRYING].delay_timeout_as_str())
         if itask.state.is_held:
             delay_msg = "held (%s)" % delay_msg
         msg = "failed, %s" % (delay_msg)
         LOG.info("[%s] -job(%02d) %s", itask, itask.submit_num, msg)
         itask.set_summary_message(msg)
         self.setup_event_handlers(itask, "retry",
                                   "%s, %s" % (self.JOB_FAILED, delay_msg))
     self._reset_job_timers(itask)
Example #2
 def put_broadcast(self, modified_settings, is_cancel=False):
     """Put or clear broadcasts in runtime database."""
     now = get_current_time_string(display_sub_seconds=True)
     for broadcast_change in (
             get_broadcast_change_iter(modified_settings, is_cancel)):
         broadcast_change["time"] = now
         self.db_inserts_map[self.TABLE_BROADCAST_EVENTS].append(
             broadcast_change)
         if is_cancel:
             self.db_deletes_map[self.TABLE_BROADCAST_STATES].append({
                 "point": broadcast_change["point"],
                 "namespace": broadcast_change["namespace"],
                 "key": broadcast_change["key"]})
             # Delete statements are currently executed before insert
             # statements, so we should clear out any insert statements that
             # are deleted here.
             # (Not the most efficient logic here, but unless we have a
             # large number of inserts, then this should not be a big
             # concern.)
             inserts = []
             for insert in self.db_inserts_map[self.TABLE_BROADCAST_STATES]:
                 if any(insert[key] != broadcast_change[key]
                        for key in ["point", "namespace", "key"]):
                     inserts.append(insert)
             self.db_inserts_map[self.TABLE_BROADCAST_STATES] = inserts
         else:
             self.db_inserts_map[self.TABLE_BROADCAST_STATES].append({
                 "point": broadcast_change["point"],
                 "namespace": broadcast_change["namespace"],
                 "key": broadcast_change["key"],
                 "value": broadcast_change["value"]})
Example #3
 def _process_message_submit_failed(self, itask, event_time):
     """Helper for process_message, handle a submit-failed message."""
     LOG.error('[%s] -%s', itask, self.EVENT_SUBMIT_FAILED)
     if event_time is None:
         event_time = get_current_time_string()
     self.suite_db_mgr.put_update_task_jobs(itask, {
         "time_submit_exit": event_time,
         "submit_status": 1,
     })
     job_d = get_task_job_id(itask.point, itask.tdef.name, itask.submit_num)
     self.job_pool.set_job_attr(job_d, 'batch_sys_job_id', None)
     itask.summary['submit_method_id'] = None
     self.pflag = True
     if (TASK_STATUS_SUBMIT_RETRYING not in itask.try_timers or
             itask.try_timers[TASK_STATUS_SUBMIT_RETRYING].next() is None):
         # No submission retry lined up: definitive failure.
         # See github #476.
         if itask.state.reset(TASK_STATUS_SUBMIT_FAILED):
             self.setup_event_handlers(itask, self.EVENT_SUBMIT_FAILED,
                                       'job %s' % self.EVENT_SUBMIT_FAILED)
             self.job_pool.set_job_state(job_d, TASK_STATUS_SUBMIT_FAILED)
     elif itask.state.reset(TASK_STATUS_SUBMIT_RETRYING):
         # There is a submission retry lined up.
         timer = itask.try_timers[TASK_STATUS_SUBMIT_RETRYING]
         delay_msg = "submit-retrying in %s" % timer.delay_timeout_as_str()
         if itask.state.is_held:
             delay_msg = "held (%s)" % delay_msg
         msg = "%s, %s" % (self.EVENT_SUBMIT_FAILED, delay_msg)
         LOG.info("[%s] -job(%02d) %s", itask, itask.submit_num, msg)
         itask.set_summary_message(msg)
         self.setup_event_handlers(
             itask, self.EVENT_SUBMIT_RETRY,
             "job %s, %s" % (self.EVENT_SUBMIT_FAILED, delay_msg))
         self.job_pool.set_job_state(job_d, TASK_STATUS_SUBMIT_RETRYING)
     self._reset_job_timers(itask)
Example #4
File: rundb.py  Project: cylc/cylc
    def take_checkpoints(self, event, other_daos=None):
        """Add insert items to *_checkpoints tables.

        Select items in suite_params, broadcast_states and task_pool and
        prepare them for insert into the relevant *_checkpoints tables, and
        prepare an insert of the event and the current time into the
        checkpoint_id table.

        If other_daos is specified, it should be a list of CylcSuiteDAO
        objects.  The logic will prepare insertion of the same items into the
        *_checkpoints tables of these DAOs as well.
        """
        id_ = 1
        for max_id, in self.connect().execute(
                "SELECT MAX(id) FROM checkpoint_id"):
            if max_id is not None and max_id >= id_:
                id_ = max_id + 1
        daos = [self]
        if other_daos:
            daos.extend(other_daos)
        for dao in daos:
            dao.tables[self.TABLE_CHECKPOINT_ID].add_insert_item([
                id_, get_current_time_string(), event])
        for table_name in [
                self.TABLE_SUITE_PARAMS,
                self.TABLE_BROADCAST_STATES,
                self.TABLE_TASK_POOL]:
            for row in self.connect().execute("SELECT * FROM %s" % table_name):
                for dao in daos:
                    dao.tables[table_name + "_checkpoints"].add_insert_item(
                        [id_] + list(row))
Example #5
    def take_checkpoints(self, event, other_daos=None):
        """Add insert items to *_checkpoints tables.

        Select items in suite_params, broadcast_states and task_pool and
        prepare them for insert into the relevant *_checkpoints tables, and
        prepare an insert of the event and the current time into the
        checkpoint_id table.

        If other_daos is specified, it should be a list of CylcSuiteDAO
        objects.  The logic will prepare insertion of the same items into the
        *_checkpoints tables of these DAOs as well.
        """
        id_ = 1
        for max_id, in self.connect().execute(
                "SELECT MAX(id) FROM checkpoint_id"):
            if max_id is not None and max_id >= id_:
                id_ = max_id + 1
        daos = [self]
        if other_daos:
            daos.extend(other_daos)
        for dao in daos:
            dao.tables[self.TABLE_CHECKPOINT_ID].add_insert_item(
                [id_, get_current_time_string(), event])
        for table_name in [
                self.TABLE_SUITE_PARAMS, self.TABLE_BROADCAST_STATES,
                self.TABLE_TASK_POOL
        ]:
            for row in self.connect().execute("SELECT * FROM %s" % table_name):
                for dao in daos:
                    dao.tables[table_name + "_checkpoints"].add_insert_item(
                        [id_] + list(row))
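
The checkpoint ID allocation at the top of take_checkpoints (scan MAX(id), then add one) can be exercised against a throwaway SQLite database. A short sketch under a simplified, assumed checkpoint_id schema:

import sqlite3

conn = sqlite3.connect(":memory:")
conn.execute(
    "CREATE TABLE checkpoint_id(id INTEGER, time TEXT, event TEXT)")

def next_checkpoint_id(conn):
    """Return MAX(id) + 1, or 1 if the table is empty."""
    id_ = 1
    for max_id, in conn.execute("SELECT MAX(id) FROM checkpoint_id"):
        if max_id is not None and max_id >= id_:
            id_ = max_id + 1
    return id_

assert next_checkpoint_id(conn) == 1  # empty table
conn.execute("INSERT INTO checkpoint_id VALUES (3, '', 'restart')")
assert next_checkpoint_id(conn) == 4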
Example #6
def test_render_node__task__running():
    """It renders running tasks."""
    child = Mock()
    child.get_value = lambda: {'data': {
        'startedTime': get_current_time_string(),
        'state': 'running'
    }}
    node = Mock()
    node.get_child_node = lambda _: child
    assert render_node(
        node,
        {
            'name': 'foo',
            'state': 'running',
            'isHeld': False,
            'isQueued': False,
            'task': {'meanElapsedTime': 100}
        },
        'task'
    ) == [
        TASK_ICONS['running'],
        ' ',
        ('job_running', JOB_ICON),
        ' ',
        'foo'
    ]
Example #7
 def _run_command_exit(cls, ctx, callback=None, callback_args=None):
     """Process command completion."""
     ctx.timestamp = get_current_time_string()
     if callable(callback):
         if not callback_args:
             callback_args = []
         callback(ctx, *callback_args)
Example #8
    def jobs_kill(self, job_log_root, job_log_dirs):
        """Kill multiple jobs.

        job_log_root -- The log/job/ sub-directory of the suite.
        job_log_dirs -- A list containing point/name/submit_num for task jobs.

        """
        # Note: The more efficient way to do this is to group the jobs by their
        # job runners, and call the kill command for each job runner once.
        # However, this will make it more difficult to determine if the kill
        # command for a particular job is successful or not.
        if "$" in job_log_root:
            job_log_root = os.path.expandvars(job_log_root)
        self.configure_suite_run_dir(job_log_root.rsplit(os.sep, 2)[0])
        now = get_current_time_string()
        for job_log_dir in job_log_dirs:
            ret_code, err = self.job_kill(
                os.path.join(job_log_root, job_log_dir, JOB_LOG_STATUS))
            sys.stdout.write(
                "%s%s|%s|%d\n" %
                (self.OUT_PREFIX_SUMMARY, now, job_log_dir, ret_code))
            # Note: Print STDERR to STDOUT may look a bit strange, but it
            # requires less logic for the suite to parse the output.
            if err.strip():
                for line in err.splitlines(True):
                    if not line.endswith("\n"):
                        line += "\n"
                    sys.stdout.write(
                        "%s%s|%s|%s" %
                        (self.OUT_PREFIX_CMD_ERR, now, job_log_dir, line))
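
Each line jobs_kill writes is a known prefix followed by pipe-delimited fields, which keeps parsing on the suite side simple. A sketch of a reader for the summary lines; the prefix string below is a stand-in, since the real OUT_PREFIX_SUMMARY constant is defined elsewhere in this class:

OUT_PREFIX_SUMMARY = "[TASK JOB SUMMARY]"  # stand-in value for illustration

def parse_kill_summary(line):
    """Parse '<prefix><time>|<job_log_dir>|<ret_code>' into a tuple."""
    if not line.startswith(OUT_PREFIX_SUMMARY):
        return None
    time_str, job_log_dir, ret_code = (
        line[len(OUT_PREFIX_SUMMARY):].rstrip("\n").split("|"))
    return time_str, job_log_dir, int(ret_code)

line = "[TASK JOB SUMMARY]2020-01-01T00:00:00Z|1/foo/01|0\n"
assert parse_kill_summary(line) == ("2020-01-01T00:00:00Z", "1/foo/01", 0)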
Example #9
 def _filter_submit_output(self, st_file_path, job_runner, out, err):
     """Filter submit command output, if relevant."""
     job_id = None
     rec_id = None
     if hasattr(job_runner, "REC_ID_FROM_SUBMIT_ERR"):
         text = err
         rec_id = job_runner.REC_ID_FROM_SUBMIT_ERR
     elif hasattr(job_runner, "REC_ID_FROM_SUBMIT_OUT"):
         text = out
         rec_id = job_runner.REC_ID_FROM_SUBMIT_OUT
     if rec_id:
         for line in str(text).splitlines():
             match = rec_id.match(line)
             if match:
                 job_id = match.group("id")
                 if hasattr(job_runner, "manip_job_id"):
                     job_id = job_runner.manip_job_id(job_id)
                 job_status_file = open(st_file_path, "a")
                 job_status_file.write("{0}={1}\n".format(
                     self.CYLC_JOB_ID, job_id))
                 job_status_file.write("{0}={1}\n".format(
                     self.CYLC_JOB_RUNNER_SUBMIT_TIME,
                     get_current_time_string()))
                 job_status_file.close()
                 break
     if hasattr(job_runner, "filter_submit_output"):
         out, err = job_runner.filter_submit_output(out, err)
     return out, err, job_id
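
_filter_submit_output depends only on optional attributes of the job runner handler: a handler opts in to job ID extraction by defining a REC_ID_FROM_SUBMIT_OUT (or REC_ID_FROM_SUBMIT_ERR) regex with an 'id' group, and may post-process the ID via manip_job_id. A hypothetical handler sketch, not a real cylc job runner:

import re

class ExampleJobRunner:
    """Hypothetical handler showing the attributes the filter looks for."""

    # Matches submit output such as 'Submitted batch job 1234'.
    REC_ID_FROM_SUBMIT_OUT = re.compile(
        r"^Submitted batch job (?P<id>\d+)")

    @staticmethod
    def manip_job_id(job_id):
        # Optional hook: normalise the extracted ID.
        return job_id.strip()

match = ExampleJobRunner.REC_ID_FROM_SUBMIT_OUT.match(
    "Submitted batch job 42")
assert ExampleJobRunner.manip_job_id(match.group("id")) == "42"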
Example #10
File: subprocpool.py  Project: cylc/cylc
 def _run_command_exit(cls, ctx, callback=None, callback_args=None):
     """Process command completion."""
     ctx.timestamp = get_current_time_string()
     if callable(callback):
         if not callback_args:
             callback_args = []
         callback(ctx, *callback_args)
Example #11
File: suite_db_mgr.py  Project: cylc/cylc
 def put_broadcast(self, modified_settings, is_cancel=False):
     """Put or clear broadcasts in runtime database."""
     now = get_current_time_string(display_sub_seconds=True)
     for broadcast_change in (
             get_broadcast_change_iter(modified_settings, is_cancel)):
         broadcast_change["time"] = now
         self.db_inserts_map[self.TABLE_BROADCAST_EVENTS].append(
             broadcast_change)
         if is_cancel:
             self.db_deletes_map[self.TABLE_BROADCAST_STATES].append({
                 "point": broadcast_change["point"],
                 "namespace": broadcast_change["namespace"],
                 "key": broadcast_change["key"]})
             # Delete statements are currently executed before insert
             # statements, so we should clear out any insert statements that
             # are deleted here.
             # (Not the most efficient logic here, but unless we have a
             # large number of inserts, then this should not be a big
             # concern.)
             inserts = []
             for insert in self.db_inserts_map[self.TABLE_BROADCAST_STATES]:
                 if any(insert[key] != broadcast_change[key]
                        for key in ["point", "namespace", "key"]):
                     inserts.append(insert)
             self.db_inserts_map[self.TABLE_BROADCAST_STATES] = inserts
         else:
             self.db_inserts_map[self.TABLE_BROADCAST_STATES].append({
                 "point": broadcast_change["point"],
                 "namespace": broadcast_change["namespace"],
                 "key": broadcast_change["key"],
                 "value": broadcast_change["value"]})
Example #12
 def _prep_submit_task_job_error(self, suite, itask, dry_run, action, exc):
     """Helper for self._prep_submit_task_job. On error."""
     LOG.debug("submit_num %s" % itask.submit_num)
     LOG.debug(traceback.format_exc())
     LOG.error(exc)
     log_task_job_activity(SubProcContext(self.JOBS_SUBMIT,
                                          action,
                                          err=exc,
                                          ret_code=1),
                           suite,
                           itask.point,
                           itask.tdef.name,
                           submit_num=itask.submit_num)
     if not dry_run:
         # Persist
         self.suite_db_mgr.put_insert_task_jobs(
             itask, {
                 'is_manual_submit': itask.is_manual_submit,
                 'try_num': itask.get_try_num(),
                 'time_submit': get_current_time_string(),
                 'batch_sys_name': itask.summary.get('batch_sys_name'),
             })
         itask.is_manual_submit = False
         self.task_events_mgr.process_message(
             itask, CRITICAL, self.task_events_mgr.EVENT_SUBMIT_FAILED)
Example #13
    def jobs_kill(self, job_log_root, job_log_dirs):
        """Kill multiple jobs.

        job_log_root -- The log/job/ sub-directory of the suite.
        job_log_dirs -- A list containing point/name/submit_num for task jobs.

        """
        # Note: The more efficient way to do this is to group the jobs by their
        # batch systems, and call the kill command for each batch system once.
        # However, this will make it more difficult to determine if the kill
        # command for a particular job is successful or not.
        if "$" in job_log_root:
            job_log_root = os.path.expandvars(job_log_root)
        self.configure_suite_run_dir(job_log_root.rsplit(os.sep, 2)[0])
        now = get_current_time_string()
        for job_log_dir in job_log_dirs:
            ret_code, err = self.job_kill(
                os.path.join(job_log_root, job_log_dir, JOB_LOG_STATUS))
            sys.stdout.write("%s%s|%s|%d\n" % (
                self.OUT_PREFIX_SUMMARY, now, job_log_dir, ret_code))
            # Note: Print STDERR to STDOUT may look a bit strange, but it
            # requires less logic for the suite to parse the output.
            if err.strip():
                for line in err.splitlines(True):
                    if not line.endswith("\n"):
                        line += "\n"
                    sys.stdout.write("%s%s|%s|%s" % (
                        self.OUT_PREFIX_CMD_ERR, now, job_log_dir, line))
Example #14
 def _filter_submit_output(self, st_file_path, batch_sys, out, err):
     """Filter submit command output, if relevant."""
     job_id = None
     rec_id = None
     if hasattr(batch_sys, "REC_ID_FROM_SUBMIT_ERR"):
         text = err
         rec_id = batch_sys.REC_ID_FROM_SUBMIT_ERR
     elif hasattr(batch_sys, "REC_ID_FROM_SUBMIT_OUT"):
         text = out
         rec_id = batch_sys.REC_ID_FROM_SUBMIT_OUT
     if rec_id:
         for line in str(text).splitlines():
             match = rec_id.match(line)
             if match:
                 job_id = match.group("id")
                 if hasattr(batch_sys, "manip_job_id"):
                     job_id = batch_sys.manip_job_id(job_id)
                 job_status_file = open(st_file_path, "a")
                 job_status_file.write("%s=%s\n" % (
                     self.CYLC_BATCH_SYS_JOB_ID, job_id))
                 job_status_file.write("%s=%s\n" % (
                     self.CYLC_BATCH_SYS_JOB_SUBMIT_TIME,
                     get_current_time_string()))
                 job_status_file.close()
                 break
     if hasattr(batch_sys, "filter_submit_output"):
         out, err = batch_sys.filter_submit_output(out, err)
     return out, err, job_id
Example #15
 def _set_state(self, status, respect_hold_swap=False):
     """Set state to new status and log."""
     if self.status == self.hold_swap:
         self.hold_swap = None
     if (self.status, self.hold_swap) == (status, None):
         return
     if (respect_hold_swap and
         (self.status, self.hold_swap) == (TASK_STATUS_HELD, status)):
         return
     prev_message = str(self)
     prev_status, prev_hold_swap = self.status, self.hold_swap
     if status == TASK_STATUS_HELD:
         self.hold_swap = self.status
     elif status in TASK_STATUSES_ACTIVE:
         if self.status == TASK_STATUS_HELD:
             self.hold_swap = TASK_STATUS_HELD
     elif (self.hold_swap == TASK_STATUS_HELD
           and status not in TASK_STATUSES_FINAL):
         self.hold_swap = status
         status = TASK_STATUS_HELD
     elif self.hold_swap:
         self.hold_swap = None
     self.status = status
     self.time_updated = get_current_time_string()
     self.is_updated = True
     LOG.debug("[%s] -%s => %s", self.identity, prev_message, str(self))
     return (prev_status, prev_hold_swap)
Example #16
def ses_test_dir(request, run_dir):
    """The root reg dir for test flows in this test session."""
    timestamp = get_current_time_string(use_basic_format=True)
    uuid = f'cit-{timestamp}'
    path = Path(run_dir, uuid)
    path.mkdir(exist_ok=True)
    yield path
    _rm_if_empty(path)
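
The fixture asks for the basic ISO 8601 format because the timestamp is embedded in a directory name; the basic format drops the ':' and '-' separators that can be awkward in paths. A small sketch of the difference, assuming cylc.flow.wallclock is importable (output shapes are examples, not guaranteed values):

from cylc.flow.wallclock import get_current_time_string

# Extended format, e.g. '2038-01-19T03:14:07Z' in UTC mode.
print(get_current_time_string(override_use_utc=True))
# Basic format, e.g. '20380119T031407Z': separator-free, so safe to
# embed in file and directory names like the 'cit-<timestamp>' dir above.
print(get_current_time_string(override_use_utc=True, use_basic_format=True))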
Example #17
 def _db_events_insert(self, itask, event="", message=""):
     """Record an event to the DB."""
     self.suite_db_mgr.put_insert_task_events(
         itask, {
             "time": get_current_time_string(),
             "event": event,
             "message": message
         })
Example #18
File: subprocctx.py  Project: cylc/cylc
    def __init__(self, cmd_key, cmd, **cmd_kwargs):
        self.timestamp = get_current_time_string()
        self.cmd_key = cmd_key
        self.cmd = cmd
        self.cmd_kwargs = cmd_kwargs

        self.err = cmd_kwargs.get('err')
        self.ret_code = cmd_kwargs.get('ret_code')
        self.out = cmd_kwargs.get('out')
Example #19
    def __init__(self, cmd_key, cmd, **cmd_kwargs):
        self.timestamp = get_current_time_string()
        self.cmd_key = cmd_key
        self.cmd = cmd
        self.cmd_kwargs = cmd_kwargs

        self.err = cmd_kwargs.get('err')
        self.ret_code = cmd_kwargs.get('ret_code')
        self.out = cmd_kwargs.get('out')
Example #20
def _backup(tgt: Path) -> None:
    """Make a timestamped backup of a dir or file."""
    tstamp = get_current_time_string(use_basic_format=True)
    backup = Path(tgt).parent / (tgt.name + f'.{tstamp}')
    LOG.warning('Replacing an existing cylc-tutorials folder which will'
                f' be copied to {backup}')
    # NOTE: shutil interfaces don't fully support Path objects at all
    # python versions
    shutil.move(str(tgt), str(backup))
Example #21
 def test_set_job_time(self):
     """Test method setting event time."""
     event_time = get_current_time_string()
     self.job_pool.insert_job(JOB_CONFIG)
     job = self.job_pool.pool[self.ext_id]
     old_time = copy(job.submitted_time)
     self.job_pool.set_job_time(self.int_id, 'jumped', event_time)
     self.assertEqual(old_time, job.submitted_time)
     self.job_pool.set_job_time(self.int_id, 'submitted', event_time)
     self.assertNotEqual(old_time, job.submitted_time)
Example #22
def test_delta_job_time(harness):
    """Test method setting job state change time."""
    schd, data = harness
    event_time = get_current_time_string()
    schd.data_store_mgr.delta_job_time(int_id(schd), 'submitted', event_time)
    job_updated = schd.data_store_mgr.updated[JOBS][ext_id(schd)]
    with pytest.raises(ValueError):
        job_updated.HasField('jumped_time')
    assert job_updated.submitted_time != (
        schd.data_store_mgr.added[JOBS][ext_id(schd)].submitted_time)
Example #23
def test_set_job_time(myflow):
    """Test method setting event time."""
    event_time = get_current_time_string()
    myflow.job_pool.insert_job(job_config(myflow))
    job_added = myflow.job_pool.added[ext_id(myflow)]
    myflow.job_pool.set_job_time(int_id(myflow), 'submitted', event_time)
    job_updated = myflow.job_pool.updated[ext_id(myflow)]
    with pytest.raises(ValueError):
        job_updated.HasField('jumped_time')
    assert job_added.submitted_time != job_updated.submitted_time
Example #24
    def jobs_poll(self, job_log_root, job_log_dirs):
        """Poll multiple jobs.

        job_log_root -- The log/job/ sub-directory of the suite.
        job_log_dirs -- A list containing point/name/submit_num for task jobs.

        """
        if "$" in job_log_root:
            job_log_root = os.path.expandvars(job_log_root)
        self.configure_suite_run_dir(job_log_root.rsplit(os.sep, 2)[0])

        ctx_list = []  # Contexts for all relevant jobs
        ctx_list_by_batch_sys = {}  # {batch_sys_name1: [ctx1, ...], ...}

        for job_log_dir in job_log_dirs:
            ctx = self._jobs_poll_status_files(job_log_root, job_log_dir)
            if ctx is None:
                continue
            ctx_list.append(ctx)

            if not ctx.batch_sys_name or not ctx.batch_sys_job_id:
                # Lost batch system information for some reason.
                # Mark the job as if it is no longer in the batch system.
                ctx.batch_sys_exit_polled = 1
                sys.stderr.write(
                    "%s/%s: incomplete batch system info\n" % (
                        ctx.job_log_dir, JOB_LOG_STATUS))

            # We can trust:
            # * Jobs previously polled to have exited the batch system.
            # * Jobs succeeded or failed with ERR/EXIT.
            if (ctx.batch_sys_exit_polled or ctx.run_status == 0 or
                    ctx.run_signal in ["ERR", "EXIT"]):
                continue

            if ctx.batch_sys_name not in ctx_list_by_batch_sys:
                ctx_list_by_batch_sys[ctx.batch_sys_name] = []
            ctx_list_by_batch_sys[ctx.batch_sys_name].append(ctx)

        for batch_sys_name, my_ctx_list in ctx_list_by_batch_sys.items():
            self._jobs_poll_batch_sys(
                job_log_root, batch_sys_name, my_ctx_list)

        cur_time_str = get_current_time_string()
        for ctx in ctx_list:
            for message in ctx.messages:
                sys.stdout.write("%s%s|%s|%s\n" % (
                    self.OUT_PREFIX_MESSAGE,
                    cur_time_str,
                    ctx.job_log_dir,
                    message))
            sys.stdout.write("%s%s|%s\n" % (
                self.OUT_PREFIX_SUMMARY,
                cur_time_str,
                ctx.get_summary_str()))
Example #25
    def jobs_poll(self, job_log_root, job_log_dirs):
        """Poll multiple jobs.

        job_log_root -- The log/job/ sub-directory of the workflow.
        job_log_dirs -- A list containing point/name/submit_num for task jobs.

        """
        if "$" in job_log_root:
            job_log_root = os.path.expandvars(job_log_root)
        self.configure_workflow_run_dir(job_log_root.rsplit(os.sep, 2)[0])

        ctx_list = []  # Contexts for all relevant jobs
        ctx_list_by_job_runner = {}  # {job_runner_name1: [ctx1, ...], ...}

        for job_log_dir in job_log_dirs:
            ctx = self._jobs_poll_status_files(job_log_root, job_log_dir)
            if ctx is None:
                continue
            ctx_list.append(ctx)

            if not ctx.job_runner_name or not ctx.job_id:
                # Lost job runner information for some reason.
                # Mark the job as if it is no longer in the job runner.
                ctx.job_runner_exit_polled = 1
                sys.stderr.write(
                    "%s/%s: incomplete job runner info\n" % (
                        ctx.job_log_dir, JOB_LOG_STATUS))

            # We can trust:
            # * Jobs previously polled to have exited the job runner.
            # * Jobs succeeded or failed with ERR/EXIT.
            if (ctx.job_runner_exit_polled or ctx.run_status == 0 or
                    ctx.run_signal in ["ERR", "EXIT"]):
                continue

            if ctx.job_runner_name not in ctx_list_by_job_runner:
                ctx_list_by_job_runner[ctx.job_runner_name] = []
            ctx_list_by_job_runner[ctx.job_runner_name].append(ctx)

        for job_runner_name, my_ctx_list in ctx_list_by_job_runner.items():
            self._jobs_poll_runner(
                job_log_root, job_runner_name, my_ctx_list)

        cur_time_str = get_current_time_string()
        for ctx in ctx_list:
            for message in ctx.messages:
                sys.stdout.write("%s%s|%s|%s\n" % (
                    self.OUT_PREFIX_MESSAGE,
                    cur_time_str,
                    ctx.job_log_dir,
                    message))
            sys.stdout.write("%s%s|%s\n" % (
                self.OUT_PREFIX_SUMMARY,
                cur_time_str,
                ctx.get_summary_str()))
Example #26
def record_messages(suite, task_job, messages):
    """Record task job messages.

    Print the messages according to their severity.
    Write the messages in the job status file.
    Send the messages to the suite, if possible.

    Arguments:
        suite (str): Suite name.
        task_job (str): Task job identifier "CYCLE/TASK_NAME/SUBMIT_NUM".
        messages (list): List of messages "[[severity, message], ...]".
    """
    # Record the event time, in case the message is delayed in some way.
    event_time = get_current_time_string(
        override_use_utc=(os.getenv('CYLC_UTC') == 'True'))
    # Print to stdout/stderr
    for severity, message in messages:
        if severity in STDERR_LEVELS:
            handle = sys.stderr
        else:
            handle = sys.stdout
        handle.write('%s %s - %s\n' % (event_time, severity, message))
        handle.flush()
    # Write to job.status
    _append_job_status_file(suite, task_job, event_time, messages)
    # Send messages
    suite = os.path.normpath(suite)
    try:
        pclient = get_client(suite)
    except SuiteStopped:
        # On a remote host this means the contact file is not present:
        # either the suite is stopped, or the contact file is not
        # present on the job host (i.e. the comms method is polling).
        # Either way, don't try messaging.
        pass
    except Exception:
        # Backward communication not possible
        if cylc.flow.flags.debug:
            import traceback
            traceback.print_exc()
    else:
        mutation_kwargs = {
            'request_string': MUTATION,
            'variables': {
                'wFlows': [suite],
                'taskJob': task_job,
                'eventTime': event_time,
                'messages': messages,
            }
        }
        pclient('graphql', mutation_kwargs)
Example #27
def record_messages(workflow, task_job, messages):
    """Record task job messages.

    Print the messages according to their severity.
    Write the messages in the job status file.
    Send the messages to the workflow, if possible.

    Arguments:
        workflow (str): Workflow name.
        task_job (str): Task job identifier "CYCLE/TASK_NAME/SUBMIT_NUM".
        messages (list): List of messages "[[severity, message], ...]".
    """
    # Record the event time, in case the message is delayed in some way.
    event_time = get_current_time_string(
        override_use_utc=(os.getenv('CYLC_UTC') == 'True'))
    write_messages(workflow, task_job, messages, event_time)
    if get_comms_method() != CommsMeth.POLL:
        send_messages(workflow, task_job, messages, event_time)
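
Both versions of record_messages stamp the event time up front, so a message delayed in transit (e.g. under polling comms) still carries the time it was generated; the CYLC_UTC variable exported to the job environment decides whether the stamp is in UTC. The timestamp call in isolation, assuming cylc.flow.wallclock is importable:

import os

from cylc.flow.wallclock import get_current_time_string

# CYLC_UTC is exported to job environments as the literal string 'True'
# when the workflow runs in UTC mode.
event_time = get_current_time_string(
    override_use_utc=(os.getenv('CYLC_UTC') == 'True'))
print(event_time)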
Example #28
def _open_install_log(rund, logger):
    """Open Cylc log handlers for install/reinstall."""
    time_str = get_current_time_string(
        override_use_utc=True, use_basic_format=True,
        display_sub_seconds=False
    )
    rund = Path(rund).expanduser()
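    # Strip any leading 'cylc-' from the logger name: the boolean
    # short-circuit below evaluates to len('cylc-') or 0 (False) as the
    # slice start, so e.g. 'cylc-install' becomes 'install'.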
    log_type = logger.name[logger.name.startswith('cylc-') and len('cylc-'):]
    log_path = Path(
        rund,
        SuiteFiles.LOG_DIR,
        'install',
        f"{time_str}-{log_type}.log")
    log_parent_dir = log_path.parent
    log_parent_dir.mkdir(exist_ok=True, parents=True)
    handler = logging.FileHandler(log_path)
    handler.setFormatter(CylcLogFormatter())
    logger.addHandler(handler)
Example #29
    def jobs_submit(self,
                    job_log_root,
                    job_log_dirs,
                    remote_mode=False,
                    utc_mode=False):
        """Submit multiple jobs.

        job_log_root -- The log/job/ sub-directory of the suite.
        job_log_dirs -- A list containing point/name/submit_num for task jobs.
        remote_mode -- am I running on the remote job host?
        utc_mode -- is the suite running in UTC mode?

        """
        if "$" in job_log_root:
            job_log_root = os.path.expandvars(job_log_root)
        self.configure_suite_run_dir(job_log_root.rsplit(os.sep, 2)[0])

        if remote_mode:
            items = self._jobs_submit_prep_by_stdin(job_log_root, job_log_dirs)
        else:
            items = self._jobs_submit_prep_by_args(job_log_root, job_log_dirs)
        now = get_current_time_string(override_use_utc=utc_mode)
        for job_log_dir, batch_sys_name, submit_opts in items:
            job_file_path = os.path.join(job_log_root, job_log_dir,
                                         JOB_LOG_JOB)
            if not batch_sys_name:
                sys.stdout.write("%s%s|%s|1|\n" %
                                 (self.OUT_PREFIX_SUMMARY, now, job_log_dir))
                continue
            ret_code, out, err, job_id = self._job_submit_impl(
                job_file_path, batch_sys_name, submit_opts)
            sys.stdout.write(
                "%s%s|%s|%d|%s\n" %
                (self.OUT_PREFIX_SUMMARY, now, job_log_dir, ret_code, job_id))
            for key, value in [("STDERR", err), ("STDOUT", out)]:
                if value is None or not value.strip():
                    continue
                for line in value.splitlines(True):
                    if not line.endswith("\n"):
                        line += "\n"
                    sys.stdout.write(
                        "%s%s|%s|[%s] %s" %
                        (self.OUT_PREFIX_COMMAND, now, job_log_dir, key, line))
Example #30
File: task_job_mgr.py  Project: cylc/cylc
 def _prep_submit_task_job_error(self, suite, itask, dry_run, action, exc):
     """Helper for self._prep_submit_task_job. On error."""
     LOG.debug("submit_num %s" % itask.submit_num)
     LOG.debug(traceback.format_exc())
     LOG.error(exc)
     log_task_job_activity(
         SubProcContext(self.JOBS_SUBMIT, action, err=exc, ret_code=1),
         suite, itask.point, itask.tdef.name, submit_num=itask.submit_num)
     if not dry_run:
         # Persist
         self.suite_db_mgr.put_insert_task_jobs(itask, {
             'is_manual_submit': itask.is_manual_submit,
             'try_num': itask.get_try_num(),
             'time_submit': get_current_time_string(),
             'batch_sys_name': itask.summary.get('batch_sys_name'),
         })
         itask.is_manual_submit = False
         self.task_events_mgr.process_message(
             itask, CRITICAL, self.task_events_mgr.EVENT_SUBMIT_FAILED)
Example #31
    def _process_message_submit_failed(self, itask, event_time):
        """Helper for process_message, handle a submit-failed message.

        Return True if no retries (hence go to the submit-failed state).
        """
        no_retries = False
        LOG.error('[%s] -%s', itask, self.EVENT_SUBMIT_FAILED)
        if event_time is None:
            event_time = get_current_time_string()
        self.suite_db_mgr.put_update_task_jobs(itask, {
            "time_submit_exit": event_time,
            "submit_status": 1,
        })
        job_d = get_task_job_id(itask.point, itask.tdef.name, itask.submit_num)
        self.job_pool.set_job_state(job_d, TASK_STATUS_SUBMIT_FAILED)
        itask.summary['submit_method_id'] = None
        self.pflag = True
        if (
                TimerFlags.SUBMISSION_RETRY not in itask.try_timers
                or itask.try_timers[TimerFlags.SUBMISSION_RETRY].next() is None
        ):
            # No submission retry lined up: definitive failure.
            # See github #476.
            no_retries = True
            if itask.state.reset(TASK_STATUS_SUBMIT_FAILED):
                self.setup_event_handlers(
                    itask, self.EVENT_SUBMIT_FAILED,
                    f'job {self.EVENT_SUBMIT_FAILED}')
        else:
            # There is a submission retry lined up.
            timer = itask.try_timers[TimerFlags.SUBMISSION_RETRY]
            self._retry_task(itask, timer.timeout, submit_retry=True)
            delay_msg = f"submit-retrying in {timer.delay_timeout_as_str()}"
            if itask.state.is_held:
                delay_msg = f"held ({delay_msg})"
            msg = "%s, %s" % (self.EVENT_SUBMIT_FAILED, delay_msg)
            LOG.info("[%s] -job(%02d) %s", itask, itask.submit_num, msg)
            itask.set_summary_message(msg)
            self.setup_event_handlers(
                itask, self.EVENT_SUBMIT_RETRY,
                f"job {self.EVENT_SUBMIT_FAILED}, {delay_msg}")
        self._reset_job_timers(itask)
        return no_retries
Example #32
    def _process_message_failed(self, itask, event_time, message):
        """Helper for process_message, handle a failed message.

        Return True if no retries (hence go to the failed state).
        """
        no_retries = False
        if event_time is None:
            event_time = get_current_time_string()
        itask.set_summary_time('finished', event_time)
        job_d = get_task_job_id(
            itask.point, itask.tdef.name, itask.submit_num)
        self.job_pool.set_job_time(job_d, 'finished', event_time)
        self.job_pool.set_job_state(job_d, TASK_STATUS_FAILED)
        self.suite_db_mgr.put_update_task_jobs(itask, {
            "run_status": 1,
            "time_run_exit": event_time,
        })
        self.pflag = True
        if (
                TimerFlags.EXECUTION_RETRY not in itask.try_timers
                or itask.try_timers[TimerFlags.EXECUTION_RETRY].next() is None
        ):
            # No retry lined up: definitive failure.
            if itask.state.reset(TASK_STATUS_FAILED):
                self.setup_event_handlers(itask, self.EVENT_FAILED, message)
            LOG.critical(
                "[%s] -job(%02d) %s", itask, itask.submit_num, "failed")
            no_retries = True
        else:
            # There is an execution retry lined up.
            timer = itask.try_timers[TimerFlags.EXECUTION_RETRY]
            self._retry_task(itask, timer.timeout)
            delay_msg = f"retrying in {timer.delay_timeout_as_str()}"
            if itask.state.is_held:
                delay_msg = "held (%s)" % delay_msg
            msg = "failed, %s" % (delay_msg)
            LOG.info("[%s] -job(%02d) %s", itask, itask.submit_num, msg)
            itask.set_summary_message(msg)
            self.setup_event_handlers(
                itask, self.EVENT_RETRY, f"{self.JOB_FAILED}, {delay_msg}")
        self._reset_job_timers(itask)
        return no_retries
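
The retry decision in both failure handlers hinges on a small timer contract: look up the timer under a retry key in try_timers, and treat a next() that returns None as "no retry lined up". A minimal stand-in, not cylc's real timer class, showing just that contract:

class StubRetryTimer:
    """Illustrative stand-in for a task retry timer."""

    def __init__(self, delays):
        self.delays = list(delays)

    def next(self):
        """Return the next retry delay, or None once exhausted."""
        return self.delays.pop(0) if self.delays else None

try_timers = {"execution-retry": StubRetryTimer([30.0])}
timer = try_timers["execution-retry"]
assert timer.next() == 30.0   # a retry is lined up: reset to retrying
assert timer.next() is None   # exhausted: definitive failure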
Example #33
 def do_rollover(self):
     """Create and rollover log file if necessary."""
     # Generate new file name
     self.stamp = get_current_time_string(use_basic_format=True)
     filename = self.baseFilename + '.' + self.stamp
     os.makedirs(os.path.dirname(filename), exist_ok=True)
     # Touch file
     with open(filename, 'w+'):
         os.utime(filename, None)
     # Update symlink
     if (os.path.exists(self.baseFilename)
             or os.path.lexists(self.baseFilename)):
         os.unlink(self.baseFilename)
     os.symlink(os.path.basename(filename), self.baseFilename)
     # Housekeep log files
     arch_len = glbl_cfg().get(
         ['scheduler', 'logging', 'rolling archive length'])
     if arch_len:
         log_files = glob(self.baseFilename + '.*')
         log_files.sort()
         while len(log_files) > arch_len:
             os.unlink(log_files.pop(0))
     # Reopen stream, redirect STDOUT and STDERR to log
     if self.stream:
         self.stream.close()
         self.stream = None
     self.stream = self._open()
     # Dup STDOUT and STDERR in detach mode
     if not self.no_detach:
         os.dup2(self.stream.fileno(), sys.stdout.fileno())
         os.dup2(self.stream.fileno(), sys.stderr.fileno())
     # Emit header records (should only do this for subsequent log files)
     for header_record in self.header_records:
         if self.FILE_NUM in header_record.__dict__:
             # Increment log file number
             header_record.__dict__[self.FILE_NUM] += 1
             header_record.args = header_record.args[0:-1] + (
                 header_record.__dict__[self.FILE_NUM], )
         logging.FileHandler.emit(self, header_record)
Example #34
def record_messages(suite, task_job, messages):
    """Record task job messages.

    Print the messages according to their severity.
    Write the messages in the job status file.
    Send the messages to the suite, if possible.

    Arguments:
        suite (str): Suite name.
        task_job (str): Task job identifier "CYCLE/TASK_NAME/SUBMIT_NUM".
        messages (list): List of messages "[[severity, message], ...]".
    """
    # Record the event time, in case the message is delayed in some way.
    event_time = get_current_time_string(
        override_use_utc=(os.getenv('CYLC_UTC') == 'True'))
    # Print to stdout/stderr
    for severity, message in messages:
        if severity in STDERR_LEVELS:
            handle = sys.stderr
        else:
            handle = sys.stdout
        handle.write('%s %s - %s\n' % (event_time, severity, message))
        handle.flush()
    # Write to job.status
    _append_job_status_file(suite, task_job, event_time, messages)
    # Send messages
    try:
        pclient = SuiteRuntimeClient(suite)
    except Exception:
        # Backward communication not possible
        if cylc.flow.flags.debug:
            import traceback
            traceback.print_exc()
    else:
        pclient('put_messages', {
            'task_job': task_job,
            'event_time': event_time,
            'messages': messages
        })
Example #35
 def _prep_submit_task_job_error(self, workflow, itask, action, exc):
     """Helper for self._prep_submit_task_job. On error."""
     LOG.debug("submit_num %s" % itask.submit_num)
     log_task_job_activity(SubProcContext(self.JOBS_SUBMIT,
                                          action,
                                          err=exc,
                                          ret_code=1),
                           workflow,
                           itask.point,
                           itask.tdef.name,
                           submit_num=itask.submit_num)
     # Persist
     self.workflow_db_mgr.put_insert_task_jobs(
         itask, {
             'is_manual_submit': itask.is_manual_submit,
             'try_num': itask.get_try_num(),
             'time_submit': get_current_time_string(),
             'job_runner_name': itask.summary.get('job_runner_name'),
         })
     itask.is_manual_submit = False
     self.task_events_mgr.process_message(
         itask, CRITICAL, self.task_events_mgr.EVENT_SUBMIT_FAILED)
Example #36
    def jobs_submit(self, job_log_root, job_log_dirs, remote_mode=False,
                    utc_mode=False):
        """Submit multiple jobs.

        job_log_root -- The log/job/ sub-directory of the suite.
        job_log_dirs -- A list containing point/name/submit_num for task jobs.
        remote_mode -- am I running on the remote job host?
        utc_mode -- is the suite running in UTC mode?

        """
        if "$" in job_log_root:
            job_log_root = os.path.expandvars(job_log_root)
        self.configure_suite_run_dir(job_log_root.rsplit(os.sep, 2)[0])

        if remote_mode:
            items = self._jobs_submit_prep_by_stdin(job_log_root, job_log_dirs)
        else:
            items = self._jobs_submit_prep_by_args(job_log_root, job_log_dirs)
        now = get_current_time_string(override_use_utc=utc_mode)
        for job_log_dir, batch_sys_name, submit_opts in items:
            job_file_path = os.path.join(
                job_log_root, job_log_dir, JOB_LOG_JOB)
            if not batch_sys_name:
                sys.stdout.write("%s%s|%s|1|\n" % (
                    self.OUT_PREFIX_SUMMARY, now, job_log_dir))
                continue
            ret_code, out, err, job_id = self._job_submit_impl(
                job_file_path, batch_sys_name, submit_opts)
            sys.stdout.write("%s%s|%s|%d|%s\n" % (
                self.OUT_PREFIX_SUMMARY, now, job_log_dir, ret_code, job_id))
            for key, value in [("STDERR", err), ("STDOUT", out)]:
                if value is None or not value.strip():
                    continue
                for line in value.splitlines(True):
                    if not line.endswith("\n"):
                        line += "\n"
                    sys.stdout.write("%s%s|%s|[%s] %s" % (
                        self.OUT_PREFIX_COMMAND, now, job_log_dir, key, line))
Example #37
File: loggingutil.py  Project: cylc/cylc
 def do_rollover(self):
     """Create and rollover log file if necessary."""
     # Generate new file name
     self.stamp = get_current_time_string(use_basic_format=True)
     filename = self.baseFilename + '.' + self.stamp
     os.makedirs(os.path.dirname(filename), exist_ok=True)
     # Touch file
     with open(filename, 'w+'):
         os.utime(filename, None)
     # Update symlink
     if (os.path.exists(self.baseFilename) or
             os.path.lexists(self.baseFilename)):
         os.unlink(self.baseFilename)
     os.symlink(os.path.basename(filename), self.baseFilename)
     # Housekeep log files
     arch_len = glbl_cfg().get([self.GLBL_KEY, 'rolling archive length'])
     if arch_len:
         log_files = glob(self.baseFilename + '.*')
         log_files.sort()
         while len(log_files) > arch_len:
             os.unlink(log_files.pop(0))
     # Reopen stream, redirect STDOUT and STDERR to log
     if self.stream:
         self.stream.close()
         self.stream = None
     self.stream = self._open()
     # Dup STDOUT and STDERR in detach mode
     if not self.no_detach:
         os.dup2(self.stream.fileno(), sys.stdout.fileno())
         os.dup2(self.stream.fileno(), sys.stderr.fileno())
     # Emit header records (should only do this for subsequent log files)
     for header_record in self.header_records:
         if self.FILE_NUM in header_record.__dict__:
             # Increment log file number
             header_record.__dict__[self.FILE_NUM] += 1
             header_record.args = header_record.args[0:-1] + (
                 header_record.__dict__[self.FILE_NUM],)
         logging.FileHandler.emit(self, header_record)
Example #38
File: task_message.py  Project: cylc/cylc
def record_messages(suite, task_job, messages):
    """Record task job messages.

    Print the messages according to their severity.
    Write the messages in the job status file.
    Send the messages to the suite, if possible.

    Arguments:
        suite (str): Suite name.
        task_job (str): Task job identifier "CYCLE/TASK_NAME/SUBMIT_NUM".
        messages (list): List of messages "[[severity, message], ...]".
    """
    # Record the event time, in case the message is delayed in some way.
    event_time = get_current_time_string(
        override_use_utc=(os.getenv('CYLC_UTC') == 'True'))
    # Print to stdout/stderr
    for severity, message in messages:
        if severity in STDERR_LEVELS:
            handle = sys.stderr
        else:
            handle = sys.stdout
        handle.write('%s %s - %s\n' % (event_time, severity, message))
        handle.flush()
    # Write to job.status
    _append_job_status_file(suite, task_job, event_time, messages)
    # Send messages
    try:
        pclient = SuiteRuntimeClient(suite)
    except Exception:
        # Backward communication not possible
        if cylc.flow.flags.debug:
            import traceback
            traceback.print_exc()
    else:
        pclient(
            'put_messages',
            {'task_job': task_job, 'event_time': event_time,
             'messages': messages}
        )
Example #39
    def _jobs_poll_batch_sys(self, job_log_root, batch_sys_name, my_ctx_list):
        """Helper 2 for self.jobs_poll(job_log_root, job_log_dirs)."""
        exp_job_ids = [ctx.batch_sys_job_id for ctx in my_ctx_list]
        bad_job_ids = list(exp_job_ids)
        exp_pids = []
        bad_pids = []
        items = [[self._get_sys(batch_sys_name), exp_job_ids, bad_job_ids]]
        if getattr(items[0][0], "SHOULD_POLL_PROC_GROUP", False):
            exp_pids = [ctx.pid for ctx in my_ctx_list if ctx.pid is not None]
            bad_pids.extend(exp_pids)
            items.append([self._get_sys("background"), exp_pids, bad_pids])
        debug_messages = []
        for batch_sys, exp_ids, bad_ids in items:
            if hasattr(batch_sys, "get_poll_many_cmd"):
                # Some poll commands may not be as simple
                cmd = batch_sys.get_poll_many_cmd(exp_ids)
            else:  # if hasattr(batch_sys, "POLL_CMD"):
                # Simple poll command that takes a list of job IDs
                cmd = [batch_sys.POLL_CMD] + exp_ids
            try:
                proc = procopen(cmd, stdin=open(os.devnull),
                                stderrpipe=True, stdoutpipe=True)
            except OSError as exc:
                # subprocess.Popen has a bad habit of not setting the
                # filename of the executable when it raises an OSError.
                if not exc.filename:
                    exc.filename = cmd[0]
                sys.stderr.write(str(exc) + "\n")
                return
            ret_code = proc.wait()
            out, err = (f.decode() for f in proc.communicate())
            debug_messages.append('%s - %s' % (
                batch_sys, len(out.split('\n'))))
            sys.stderr.write(err)
            if (ret_code and hasattr(batch_sys, "POLL_CANT_CONNECT_ERR") and
                    batch_sys.POLL_CANT_CONNECT_ERR in err):
                # Poll command failed because it cannot connect to batch system
                # Assume jobs are still healthy until the batch system is back.
                bad_ids[:] = []
            elif hasattr(batch_sys, "filter_poll_many_output"):
                # Allow custom filter
                for id_ in batch_sys.filter_poll_many_output(out):
                    try:
                        bad_ids.remove(id_)
                    except ValueError:
                        pass
            else:
                # Just about all poll commands return a table, with column 1
                # being the job ID. The logic here should be sufficient to
                # ensure that any table header is ignored.
                for line in out.splitlines():
                    try:
                        head = line.split(None, 1)[0]
                    except IndexError:
                        continue
                    if head in exp_ids:
                        try:
                            bad_ids.remove(head)
                        except ValueError:
                            pass

        debug_flag = False
        for ctx in my_ctx_list:
            ctx.batch_sys_exit_polled = int(
                ctx.batch_sys_job_id in bad_job_ids)
            # Exited batch system, but process still running
            # This can happen to jobs in some "at" implementation
            if ctx.batch_sys_exit_polled and ctx.pid in exp_pids:
                if ctx.pid not in bad_pids:
                    ctx.batch_sys_exit_polled = 0
                else:
                    debug_flag = True
            # Add information to "job.status"
            if ctx.batch_sys_exit_polled:
                try:
                    handle = open(os.path.join(
                        job_log_root, ctx.job_log_dir, JOB_LOG_STATUS), "a")
                    handle.write("%s=%s\n" % (
                        self.CYLC_BATCH_SYS_EXIT_POLLED,
                        get_current_time_string()))
                    handle.close()
                except IOError as exc:
                    sys.stderr.write(str(exc) + "\n")

        if debug_flag:
            ctx.batch_sys_call_no_lines = ', '.join(debug_messages)
Example #40
    def _poll_task_job_callback(self, suite, itask, cmd_ctx, line):
        """Helper for _poll_task_jobs_callback, on one task job."""
        ctx = SubProcContext(self.JOBS_POLL, None)
        ctx.out = line
        ctx.ret_code = 0

        # See cylc.flow.batch_sys_manager.JobPollContext
        job_d = get_task_job_id(itask.point, itask.tdef.name, itask.submit_num)
        try:
            job_log_dir, context = line.split('|')[1:3]
            items = json.loads(context)
            jp_ctx = JobPollContext(job_log_dir, **items)
        except TypeError:
            itask.set_summary_message(self.POLL_FAIL)
            self.job_pool.add_job_msg(job_d, self.POLL_FAIL)
            ctx.cmd = cmd_ctx.cmd  # print original command on failure
            return
        except ValueError:
            # back compat for cylc 7.7.1 and previous
            try:
                values = line.split('|')
                items = dict(  # done this way to ensure IndexError is raised
                    (key, values[x])
                    for x, key in enumerate(JobPollContext.CONTEXT_ATTRIBUTES))
                job_log_dir = items.pop('job_log_dir')
            except (ValueError, IndexError):
                itask.set_summary_message(self.POLL_FAIL)
                self.job_pool.add_job_msg(job_d, self.POLL_FAIL)
                ctx.cmd = cmd_ctx.cmd  # print original command on failure
                return
        finally:
            log_task_job_activity(ctx, suite, itask.point, itask.tdef.name)

        flag = self.task_events_mgr.FLAG_POLLED
        if jp_ctx.run_status == 1 and jp_ctx.run_signal in ["ERR", "EXIT"]:
            # Failed normally
            self.task_events_mgr.process_message(itask, INFO,
                                                 TASK_OUTPUT_FAILED,
                                                 jp_ctx.time_run_exit, flag)
        elif jp_ctx.run_status == 1 and jp_ctx.batch_sys_exit_polled == 1:
            # Failed by a signal, and no longer in batch system
            self.task_events_mgr.process_message(itask, INFO,
                                                 TASK_OUTPUT_FAILED,
                                                 jp_ctx.time_run_exit, flag)
            self.task_events_mgr.process_message(
                itask, INFO, FAIL_MESSAGE_PREFIX + jp_ctx.run_signal,
                jp_ctx.time_run_exit, flag)
        elif jp_ctx.run_status == 1:
            # The job has terminated, but is still managed by batch system.
            # Some batch system may restart a job in this state, so don't
            # mark as failed yet.
            self.task_events_mgr.process_message(itask, INFO,
                                                 TASK_OUTPUT_STARTED,
                                                 jp_ctx.time_run, flag)
        elif jp_ctx.run_status == 0:
            # The job succeeded
            self.task_events_mgr.process_message(itask, INFO,
                                                 TASK_OUTPUT_SUCCEEDED,
                                                 jp_ctx.time_run_exit, flag)
        elif jp_ctx.time_run and jp_ctx.batch_sys_exit_polled == 1:
            # The job has terminated without executing the error trap
            self.task_events_mgr.process_message(itask, INFO,
                                                 TASK_OUTPUT_FAILED,
                                                 get_current_time_string(),
                                                 flag)
        elif jp_ctx.time_run:
            # The job has started, and is still managed by batch system
            self.task_events_mgr.process_message(itask, INFO,
                                                 TASK_OUTPUT_STARTED,
                                                 jp_ctx.time_run, flag)
        elif jp_ctx.batch_sys_exit_polled == 1:
            # The job never ran, and no longer in batch system
            self.task_events_mgr.process_message(
                itask, INFO, self.task_events_mgr.EVENT_SUBMIT_FAILED,
                jp_ctx.time_submit_exit, flag)
        else:
            # The job never ran, and is in batch system
            self.task_events_mgr.process_message(itask, INFO,
                                                 TASK_STATUS_SUBMITTED,
                                                 jp_ctx.time_submit_exit, flag)
Example #41
    def submit_task_jobs(self, suite, itasks, is_simulation=False):
        """Prepare and submit task jobs.

        Submit tasks where possible. Ignore tasks that are waiting for host
        select command to complete, or tasks that are waiting for remote
        initialisation. A bad host select command, an error writing to a job
        file, or bad remote initialisation will cause a bad task, leading to
        submission failure.

        This method uses prep_submit_task_job() as a helper.

        Return (list): list of tasks that attempted submission.
        """
        if is_simulation:
            return self._simulation_submit_task_jobs(itasks)

        # Prepare tasks for job submission
        prepared_tasks, bad_tasks = self.prep_submit_task_jobs(suite, itasks)

        # Reset consumed host selection results
        self.task_remote_mgr.remote_host_select_reset()

        if not prepared_tasks:
            return bad_tasks

        # Group task jobs by (host, owner)
        auth_itasks = {}  # {(host, owner): [itask, ...], ...}
        for itask in prepared_tasks:
            auth_itasks.setdefault((itask.task_host, itask.task_owner), [])
            auth_itasks[(itask.task_host, itask.task_owner)].append(itask)
        # Submit task jobs for each (host, owner) group
        done_tasks = bad_tasks
        for (host, owner), itasks in sorted(auth_itasks.items()):
            is_init = self.task_remote_mgr.remote_init(host, owner)
            if is_init is None:
                # Remote is waiting to be initialised
                for itask in itasks:
                    itask.set_summary_message(self.REMOTE_INIT_MSG)
                    self.job_pool.add_job_msg(
                        get_task_job_id(itask.point, itask.tdef.name,
                                        itask.submit_num),
                        self.REMOTE_INIT_MSG)
                continue
            # Ensure that localhost background/at jobs are recorded as running
            # on the host name of the current suite host, rather than just
            # "localhost". On suite restart on a different suite host, this
            # allows the restart logic to correctly poll the status of the
            # background/at jobs that may still be running on the previous
            # suite host.
            if (self.batch_sys_mgr.is_job_local_to_host(
                    itask.summary['batch_sys_name'])
                    and not is_remote_host(host)):
                owner_at_host = get_host()
            else:
                owner_at_host = host
            # Persist
            if owner:
                owner_at_host = owner + '@' + owner_at_host
            now_str = get_current_time_string()
            done_tasks.extend(itasks)
            for itask in itasks:
                # Log and persist
                LOG.info('[%s] -submit-num=%02d, owner@host=%s', itask,
                         itask.submit_num, owner_at_host)
                self.suite_db_mgr.put_insert_task_jobs(
                    itask, {
                        'is_manual_submit': itask.is_manual_submit,
                        'try_num': itask.get_try_num(),
                        'time_submit': now_str,
                        'user_at_host': owner_at_host,
                        'batch_sys_name': itask.summary['batch_sys_name'],
                    })
                itask.is_manual_submit = False
            if is_init == REMOTE_INIT_FAILED:
                # Remote has failed to initialise
                # Set submit-failed for all affected tasks
                for itask in itasks:
                    itask.local_job_file_path = None  # reset for retry
                    log_task_job_activity(
                        SubProcContext(self.JOBS_SUBMIT,
                                       '(init %s)' % owner_at_host,
                                       err=REMOTE_INIT_FAILED,
                                       ret_code=1), suite, itask.point,
                        itask.tdef.name)
                    self.task_events_mgr.process_message(
                        itask, CRITICAL,
                        self.task_events_mgr.EVENT_SUBMIT_FAILED)
                continue
            # Build the "cylc jobs-submit" command
            cmd = ['cylc', self.JOBS_SUBMIT]
            if LOG.isEnabledFor(DEBUG):
                cmd.append('--debug')
            if get_utc_mode():
                cmd.append('--utc-mode')
            remote_mode = False
            kwargs = {}
            for key, value, test_func in [('host', host, is_remote_host),
                                          ('user', owner, is_remote_user)]:
                if test_func(value):
                    cmd.append('--%s=%s' % (key, value))
                    remote_mode = True
                    kwargs[key] = value
            if remote_mode:
                cmd.append('--remote-mode')
            cmd.append('--')
            cmd.append(get_remote_suite_run_job_dir(host, owner, suite))
            # Chop itasks into a series of shorter lists if it's very big
            # to prevent overloading of stdout and stderr pipes.
            itasks = sorted(itasks, key=lambda itask: itask.identity)
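            # e.g. 101 tasks -> chunk_size = 101 // (101 // 100 + 1) + 1 = 51,
            # giving batches of 51 and 50; the target batch size is ~100.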
            chunk_size = len(itasks) // ((len(itasks) // 100) + 1) + 1
            itasks_batches = [
                itasks[i:i + chunk_size]
                for i in range(0, len(itasks), chunk_size)
            ]
            LOG.debug('%s ... # will invoke in batches, sizes=%s', cmd,
                      [len(b) for b in itasks_batches])
            for i, itasks_batch in enumerate(itasks_batches):
                stdin_files = []
                job_log_dirs = []
                for itask in itasks_batch:
                    if remote_mode:
                        stdin_files.append(
                            get_task_job_job_log(suite, itask.point,
                                                 itask.tdef.name,
                                                 itask.submit_num))
                    job_log_dirs.append(
                        get_task_job_id(itask.point, itask.tdef.name,
                                        itask.submit_num))
                    # The job file is now (about to be) used: reset the file
                    # write flag so that subsequent manual retrigger will
                    # generate a new job file.
                    itask.local_job_file_path = None
                    itask.state.reset(TASK_STATUS_READY)
                    if itask.state.outputs.has_custom_triggers():
                        self.suite_db_mgr.put_update_task_outputs(itask)
                self.proc_pool.put_command(
                    SubProcContext(self.JOBS_SUBMIT,
                                   cmd + job_log_dirs,
                                   stdin_files=stdin_files,
                                   job_log_dirs=job_log_dirs,
                                   **kwargs), self._submit_task_jobs_callback,
                    [suite, itasks_batch])
        return done_tasks
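
A minimal sketch of how the "cylc jobs-submit" command line above grows under
remote mode. The helper name and the host/user stubs are illustrative only;
the real tests are cylc's is_remote_host and is_remote_user:

def build_jobs_submit_cmd(host, owner, run_dir):
    # Illustrative stand-ins for is_remote_host / is_remote_user.
    def is_remote_host(name):
        return name is not None and name != 'localhost'

    def is_remote_user(name):
        return name is not None

    cmd = ['cylc', 'jobs-submit']
    remote_mode = False
    for key, value, test_func in [('host', host, is_remote_host),
                                  ('user', owner, is_remote_user)]:
        if test_func(value):
            cmd.append('--%s=%s' % (key, value))
            remote_mode = True
    if remote_mode:
        cmd.append('--remote-mode')
    cmd += ['--', run_dir]
    return cmd

print(build_jobs_submit_cmd('hpc1', 'alice', '~/cylc-run/suite/log/job'))
# ['cylc', 'jobs-submit', '--host=hpc1', '--user=alice', '--remote-mode',
#  '--', '~/cylc-run/suite/log/job']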
Example #42
    def _jobs_poll_runner(self, job_log_root, job_runner_name, my_ctx_list):
        """Helper 2 for self.jobs_poll(job_log_root, job_log_dirs)."""
        exp_job_ids = [ctx.job_id for ctx in my_ctx_list]
        bad_job_ids = list(exp_job_ids)
        exp_pids = []
        bad_pids = []
        items = [[self._get_sys(job_runner_name), exp_job_ids, bad_job_ids]]
        if getattr(items[0][0], "SHOULD_POLL_PROC_GROUP", False):
            exp_pids = [ctx.pid for ctx in my_ctx_list if ctx.pid is not None]
            bad_pids.extend(exp_pids)
            items.append([self._get_sys("background"), exp_pids, bad_pids])
        debug_messages = []
        for job_runner, exp_ids, bad_ids in items:
            if hasattr(job_runner, "get_poll_many_cmd"):
                # Some poll commands may not be as simple
                cmd = job_runner.get_poll_many_cmd(exp_ids)
            else:  # if hasattr(job_runner, "POLL_CMD"):
                # Simple poll command that takes a list of job IDs
                cmd = [job_runner.POLL_CMD, *exp_ids]
            try:
                proc = procopen(cmd,
                                stdindevnull=True,
                                stderrpipe=True,
                                stdoutpipe=True)
            except OSError as exc:
                # subprocess.Popen has a bad habit of not setting the
                # filename of the executable when it raises an OSError.
                if not exc.filename:
                    exc.filename = cmd[0]
                sys.stderr.write(f"{exc}\n")
                return
            # communicate() drains the pipes first (avoiding a possible
            # deadlock on large poll output) and reaps the exit status.
            out, err = (f.decode() for f in proc.communicate())
            ret_code = proc.returncode
            debug_messages.append('{0} - {1}'.format(job_runner,
                                                     len(out.split('\n'))))
            sys.stderr.write(err)
            if (ret_code and hasattr(job_runner, "POLL_CANT_CONNECT_ERR")
                    and job_runner.POLL_CANT_CONNECT_ERR in err):
                # Poll command failed because it cannot connect to job runner
                # Assume jobs are still healthy until the job runner is back.
                bad_ids[:] = []
            elif hasattr(job_runner, "filter_poll_many_output"):
                # Allow custom filter
                for id_ in job_runner.filter_poll_many_output(out):
                    try:
                        bad_ids.remove(id_)
                    except ValueError:
                        pass
            else:
                # Just about all poll commands return a table, with column 1
                # being the job ID. The logic here should be sufficient to
                # ensure that any table header is ignored.
                for line in out.splitlines():
                    try:
                        head = line.split(None, 1)[0]
                    except IndexError:
                        continue
                    if head in exp_ids:
                        try:
                            bad_ids.remove(head)
                        except ValueError:
                            pass

        debug_flag = False
        for ctx in my_ctx_list:
            ctx.job_runner_exit_polled = int(ctx.job_id in bad_job_ids)
            # The job runner reports the job as gone, but its process is
            # still running; this can happen with some "at" implementations.
            if ctx.job_runner_exit_polled and ctx.pid in exp_pids:
                if ctx.pid not in bad_pids:
                    ctx.job_runner_exit_polled = 0
                else:
                    debug_flag = True
            # Add information to "job.status"
            if ctx.job_runner_exit_polled:
                try:
                    with open(
                            os.path.join(job_log_root, ctx.job_log_dir,
                                         JOB_LOG_STATUS), "a") as handle:
                        handle.write("{0}={1}\n".format(
                            self.CYLC_JOB_RUNNER_EXIT_POLLED,
                            get_current_time_string()))
                except IOError as exc:
                    sys.stderr.write(f"{exc}\n")

        if debug_flag:
            # Attach the per-runner output line counts gathered above to
            # the last polled context, for debugging.
            ctx.job_runner_call_no_lines = ', '.join(debug_messages)
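
The fallback table parsing at the end of the poll loop assumes column 1 holds
the job ID. A minimal sketch with made-up squeue-style output (the IDs and
columns are illustrative) shows why a header row is ignored safely:

out = '''JOBID USER ST
12345 alice R
67890 bob   PD
'''
exp_ids = ['12345', '67890', '99999']
bad_ids = list(exp_ids)
for line in out.splitlines():
    try:
        head = line.split(None, 1)[0]  # first whitespace-delimited column
    except IndexError:
        continue  # blank line
    if head in exp_ids and head in bad_ids:
        bad_ids.remove(head)
print(bad_ids)  # ['99999'] - the only job the runner no longer reports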
Example #43
File: suite_db_mgr.py Project: cylc/cylc
    def put_task_pool(self, pool):
        """Put statements to update the task_pool table in runtime database.

        Update the task_pool table and the task_action_timers table.
        Queue delete (everything) statements to wipe the tables, and queue the
        relevant insert statements for the current tasks in the pool.
        """
        self.db_deletes_map[self.TABLE_TASK_POOL].append({})
        # No need to do:
        # self.db_deletes_map[self.TABLE_TASK_ACTION_TIMERS].append({})
        # Should already be done by self.put_task_event_timers above.
        self.db_deletes_map[self.TABLE_TASK_TIMEOUT_TIMERS].append({})
        for itask in pool.get_all_tasks():
            self.db_inserts_map[self.TABLE_TASK_POOL].append({
                "name": itask.tdef.name,
                "cycle": str(itask.point),
                "spawned": int(itask.has_spawned),
                "status": itask.state.status,
                "hold_swap": itask.state.hold_swap})
            if itask.timeout is not None:
                self.db_inserts_map[self.TABLE_TASK_TIMEOUT_TIMERS].append({
                    "name": itask.tdef.name,
                    "cycle": str(itask.point),
                    "timeout": itask.timeout})
            if itask.poll_timer is not None:
                self.db_inserts_map[self.TABLE_TASK_ACTION_TIMERS].append({
                    "name": itask.tdef.name,
                    "cycle": str(itask.point),
                    "ctx_key": json.dumps("poll_timer"),
                    "ctx": self._namedtuple2json(itask.poll_timer.ctx),
                    "delays": json.dumps(itask.poll_timer.delays),
                    "num": itask.poll_timer.num,
                    "delay": itask.poll_timer.delay,
                    "timeout": itask.poll_timer.timeout})
            for ctx_key_1, timer in itask.try_timers.items():
                if timer is None:
                    continue
                self.db_inserts_map[self.TABLE_TASK_ACTION_TIMERS].append({
                    "name": itask.tdef.name,
                    "cycle": str(itask.point),
                    "ctx_key": json.dumps(("try_timers", ctx_key_1)),
                    "ctx": self._namedtuple2json(timer.ctx),
                    "delays": json.dumps(timer.delays),
                    "num": timer.num,
                    "delay": timer.delay,
                    "timeout": timer.timeout})
            if itask.state.time_updated:
                set_args = {
                    "time_updated": itask.state.time_updated,
                    "submit_num": itask.submit_num,
                    "try_num": itask.get_try_num(),
                    "status": itask.state.status}
                where_args = {
                    "cycle": str(itask.point),
                    "name": itask.tdef.name,
                }
                self.db_updates_map.setdefault(self.TABLE_TASK_STATES, [])
                self.db_updates_map[self.TABLE_TASK_STATES].append(
                    (set_args, where_args))
                itask.state.time_updated = None

        self.db_inserts_map[self.TABLE_CHECKPOINT_ID].append({
            # id = -1 for latest
            "id": CylcSuiteDAO.CHECKPOINT_LATEST_ID,
            "time": get_current_time_string(),
            "event": CylcSuiteDAO.CHECKPOINT_LATEST_EVENT})
Example #44
File: task_job_mgr.py Project: cylc/cylc
    def _poll_task_job_callback(self, suite, itask, cmd_ctx, line):
        """Helper for _poll_task_jobs_callback, on one task job."""
        ctx = SubProcContext(self.JOBS_POLL, None)
        ctx.out = line
        ctx.ret_code = 0

        # See cylc.flow.batch_sys_manager.JobPollContext
        try:
            job_log_dir, context = line.split('|')[1:3]
            items = json.loads(context)
            jp_ctx = JobPollContext(job_log_dir, **items)
        except TypeError:
            itask.set_summary_message(self.POLL_FAIL)
            ctx.cmd = cmd_ctx.cmd  # print original command on failure
            return
        except ValueError:
            # back compat for cylc 7.7.1 and previous
            try:
                values = line.split('|')
                items = dict(  # done this way to ensure IndexError is raised
                    (key, values[x]) for
                    x, key in enumerate(JobPollContext.CONTEXT_ATTRIBUTES))
                job_log_dir = items.pop('job_log_dir')
            except (ValueError, IndexError):
                itask.set_summary_message(self.POLL_FAIL)
                ctx.cmd = cmd_ctx.cmd  # print original command on failure
                return
        finally:
            log_task_job_activity(ctx, suite, itask.point, itask.tdef.name)

        flag = self.task_events_mgr.FLAG_POLLED
        if jp_ctx.run_status == 1 and jp_ctx.run_signal in ["ERR", "EXIT"]:
            # Failed normally
            self.task_events_mgr.process_message(
                itask, INFO, TASK_OUTPUT_FAILED, jp_ctx.time_run_exit, flag)
        elif jp_ctx.run_status == 1 and jp_ctx.batch_sys_exit_polled == 1:
            # Failed by a signal and is no longer in the batch system
            self.task_events_mgr.process_message(
                itask, INFO, TASK_OUTPUT_FAILED, jp_ctx.time_run_exit, flag)
            self.task_events_mgr.process_message(
                itask, INFO, FAIL_MESSAGE_PREFIX + jp_ctx.run_signal,
                jp_ctx.time_run_exit,
                flag)
        elif jp_ctx.run_status == 1:
            # The job has terminated but is still managed by the batch
            # system. Some batch systems may restart a job in this state,
            # so don't mark it as failed yet.
            self.task_events_mgr.process_message(
                itask, INFO, TASK_OUTPUT_STARTED, jp_ctx.time_run, flag)
        elif jp_ctx.run_status == 0:
            # The job succeeded
            self.task_events_mgr.process_message(
                itask, INFO, TASK_OUTPUT_SUCCEEDED, jp_ctx.time_run_exit,
                flag)
        elif jp_ctx.time_run and jp_ctx.batch_sys_exit_polled == 1:
            # The job has terminated without executing the error trap
            self.task_events_mgr.process_message(
                itask, INFO, TASK_OUTPUT_FAILED, get_current_time_string(),
                flag)
        elif jp_ctx.time_run:
            # The job has started and is still managed by the batch system
            self.task_events_mgr.process_message(
                itask, INFO, TASK_OUTPUT_STARTED, jp_ctx.time_run, flag)
        elif jp_ctx.batch_sys_exit_polled == 1:
            # The job never ran and is no longer in the batch system
            self.task_events_mgr.process_message(
                itask, INFO, self.task_events_mgr.EVENT_SUBMIT_FAILED,
                jp_ctx.time_submit_exit, flag)
        else:
            # The job never ran and is still in the batch system
            self.task_events_mgr.process_message(
                itask, INFO, TASK_STATUS_SUBMITTED, jp_ctx.time_submit_exit,
                flag)
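
The summary line parsed at the top of this callback is pipe-delimited, with
the job log directory in field 2 and a JSON context in field 3. A minimal
sketch with a made-up line (the real attribute set is defined by
cylc.flow.batch_sys_manager.JobPollContext):

import json

line = ('poll|20200101T0000Z/foo/01|'
        '{"run_status": 0, "time_run_exit": "2020-01-01T00:05:00Z"}')
job_log_dir, context = line.split('|')[1:3]
items = json.loads(context)
print(job_log_dir)          # 20200101T0000Z/foo/01
print(items['run_status'])  # 0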
Example #45
File: task_job_mgr.py Project: cylc/cylc
    def submit_task_jobs(self, suite, itasks, is_simulation=False):
        """Prepare and submit task jobs.

        Submit tasks where possible. Ignore tasks that are waiting for a host
        select command to complete, or for remote initialisation. A bad host
        select command, an error writing to a job file, or a bad remote
        initialisation marks the task as bad, leading to submission failure.

        This method uses prep_submit_task_job() as helper.

        Return (list): list of tasks that attempted submission.
        """
        if is_simulation:
            return self._simulation_submit_task_jobs(itasks)

        # Prepare tasks for job submission
        prepared_tasks, bad_tasks = self.prep_submit_task_jobs(suite, itasks)

        # Reset consumed host selection results
        self.task_remote_mgr.remote_host_select_reset()

        if not prepared_tasks:
            return bad_tasks

        # Group task jobs by (host, owner)
        auth_itasks = {}  # {(host, owner): [itask, ...], ...}
        for itask in prepared_tasks:
            auth_itasks.setdefault((itask.task_host, itask.task_owner), [])
            auth_itasks[(itask.task_host, itask.task_owner)].append(itask)
        # Submit task jobs for each (host, owner) group
        done_tasks = bad_tasks
        for (host, owner), itasks in sorted(auth_itasks.items()):
            is_init = self.task_remote_mgr.remote_init(host, owner)
            if is_init is None:
                # Remote is waiting to be initialised
                for itask in itasks:
                    itask.set_summary_message(self.REMOTE_INIT_MSG)
                continue
            # Ensure that localhost background/at jobs are recorded as running
            # on the host name of the current suite host, rather than just
            # "localhost". On suite restart on a different suite host, this
            # allows the restart logic to correctly poll the status of the
            # background/at jobs that may still be running on the previous
            # suite host.
            if (
                self.batch_sys_mgr.is_job_local_to_host(
                    itask.summary['batch_sys_name']) and
                not is_remote_host(host)
            ):
                owner_at_host = get_host()
            else:
                owner_at_host = host
            # Persist
            if owner:
                owner_at_host = owner + '@' + owner_at_host
            now_str = get_current_time_string()
            done_tasks.extend(itasks)
            for itask in itasks:
                # Log and persist
                LOG.info(
                    '[%s] -submit-num=%02d, owner@host=%s',
                    itask, itask.submit_num, owner_at_host)
                self.suite_db_mgr.put_insert_task_jobs(itask, {
                    'is_manual_submit': itask.is_manual_submit,
                    'try_num': itask.get_try_num(),
                    'time_submit': now_str,
                    'user_at_host': owner_at_host,
                    'batch_sys_name': itask.summary['batch_sys_name'],
                })
                itask.is_manual_submit = False
            if is_init == REMOTE_INIT_FAILED:
                # Remote has failed to initialise
                # Set submit-failed for all affected tasks
                for itask in itasks:
                    itask.local_job_file_path = None  # reset for retry
                    log_task_job_activity(
                        SubProcContext(
                            self.JOBS_SUBMIT,
                            '(init %s)' % owner_at_host,
                            err=REMOTE_INIT_FAILED,
                            ret_code=1),
                        suite, itask.point, itask.tdef.name)
                    self.task_events_mgr.process_message(
                        itask, CRITICAL,
                        self.task_events_mgr.EVENT_SUBMIT_FAILED)
                continue
            # Build the "cylc jobs-submit" command
            cmd = ['cylc', self.JOBS_SUBMIT]
            if LOG.isEnabledFor(DEBUG):
                cmd.append('--debug')
            if get_utc_mode():
                cmd.append('--utc-mode')
            remote_mode = False
            kwargs = {}
            for key, value, test_func in [
                    ('host', host, is_remote_host),
                    ('user', owner, is_remote_user)]:
                if test_func(value):
                    cmd.append('--%s=%s' % (key, value))
                    remote_mode = True
                    kwargs[key] = value
            if remote_mode:
                cmd.append('--remote-mode')
            cmd.append('--')
            cmd.append(glbl_cfg().get_derived_host_item(
                suite, 'suite job log directory', host, owner))
            # Chop itasks into a series of shorter lists if it's very big
            # to prevent overloading of stdout and stderr pipes.
            itasks = sorted(itasks, key=lambda itask: itask.identity)
            chunk_size = len(itasks) // ((len(itasks) // 100) + 1) + 1
            itasks_batches = [
                itasks[i:i + chunk_size] for i in range(0,
                                                        len(itasks),
                                                        chunk_size)]
            LOG.debug(
                '%s ... # will invoke in batches, sizes=%s',
                cmd, [len(b) for b in itasks_batches])
            for i, itasks_batch in enumerate(itasks_batches):
                stdin_files = []
                job_log_dirs = []
                for itask in itasks_batch:
                    if remote_mode:
                        stdin_files.append(
                            get_task_job_job_log(
                                suite, itask.point, itask.tdef.name,
                                itask.submit_num))
                    job_log_dirs.append(get_task_job_id(
                        itask.point, itask.tdef.name, itask.submit_num))
                    # The job file is now (about to be) used: reset the file
                    # write flag so that subsequent manual retrigger will
                    # generate a new job file.
                    itask.local_job_file_path = None
                    itask.state.reset_state(TASK_STATUS_READY)
                    if itask.state.outputs.has_custom_triggers():
                        self.suite_db_mgr.put_update_task_outputs(itask)
                self.proc_pool.put_command(
                    SubProcContext(
                        self.JOBS_SUBMIT,
                        cmd + job_log_dirs,
                        stdin_files=stdin_files,
                        job_log_dirs=job_log_dirs,
                        **kwargs
                    ),
                    self._submit_task_jobs_callback, [suite, itasks_batch])
        return done_tasks
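
Both versions of submit_task_jobs share the same batch-size arithmetic. As a
standalone sketch (the helper name and the ~100 target are illustrative, not
cylc API), the formula splits a list into near-equal batches:

def make_batches(items, target=100):
    """Split items into near-equal chunks of roughly <= target each."""
    chunk_size = len(items) // ((len(items) // target) + 1) + 1
    return [items[i:i + chunk_size]
            for i in range(0, len(items), chunk_size)]

# 250 dummy job IDs -> batch sizes [84, 84, 82]
print([len(b) for b in make_batches(['job%03d' % i for i in range(250)])])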