def _get_job_scripts(itask, rtconfig):
    """Return pre-script, script, post-script for a job."""
    script = rtconfig['script']
    pre_script = rtconfig['pre-script']
    post_script = rtconfig['post-script']
    if itask.tdef.suite_polling_cfg:
        # Automatic suite state polling script
        comstr = "cylc suite-state " + \
                 " --task=" + itask.tdef.suite_polling_cfg['task'] + \
                 " --point=" + str(itask.point)
        if LOG.isEnabledFor(DEBUG):
            comstr += ' --debug'
        for key, fmt in [
                ('user', ' --%s=%s'),
                ('host', ' --%s=%s'),
                ('interval', ' --%s=%d'),
                ('max-polls', ' --%s=%s'),
                ('run-dir', ' --%s=%s')]:
            if rtconfig['suite state polling'][key]:
                comstr += fmt % (key, rtconfig['suite state polling'][key])
        if rtconfig['suite state polling']['message']:
            comstr += " --message='%s'" % (
                rtconfig['suite state polling']['message'])
        else:
            comstr += " --status=" + itask.tdef.suite_polling_cfg['status']
        comstr += " " + itask.tdef.suite_polling_cfg['suite']
        script = "echo " + comstr + "\n" + comstr
    return pre_script, script, post_script
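
# --- Illustrative sketch (not part of the Cylc source) ----------------------
# A worked example of the string assembly above, using hypothetical stand-in
# values for itask.tdef.suite_polling_cfg and the task's
# [suite state polling] settings, to show what the generated `script` looks
# like: the polling command is echoed into the job log, then executed.
def _example_polling_script():
    polling_cfg = {'task': 'foo', 'status': 'succeeded', 'suite': 'other'}
    polling_opts = {'interval': 60, 'max-polls': 10}
    comstr = ("cylc suite-state "
              " --task=" + polling_cfg['task'] +
              " --point=20200101T0000Z")
    for key, fmt in [('interval', ' --%s=%d'), ('max-polls', ' --%s=%s')]:
        if polling_opts.get(key):
            comstr += fmt % (key, polling_opts[key])
    comstr += " --status=" + polling_cfg['status']
    comstr += " " + polling_cfg['suite']
    # The returned script echoes the command, then runs it.
    return "echo " + comstr + "\n" + comstr
# -----------------------------------------------------------------------------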
def _run_job_cmd(self, cmd_key, suite, itasks, callback):
    """Run job commands, e.g. poll, kill, etc.

    Group itasks with their user@host.
    Put a job command for each user@host to the multiprocess pool.

    """
    if not itasks:
        return
    auth_itasks = {}
    for itask in itasks:
        if (itask.task_host, itask.task_owner) not in auth_itasks:
            auth_itasks[(itask.task_host, itask.task_owner)] = []
        auth_itasks[(itask.task_host, itask.task_owner)].append(itask)
    for (host, owner), itasks in sorted(auth_itasks.items()):
        cmd = ["cylc", cmd_key]
        if LOG.isEnabledFor(DEBUG):
            cmd.append("--debug")
        if is_remote_host(host):
            cmd.append("--host=%s" % (host))
        if is_remote_user(owner):
            cmd.append("--user=%s" % (owner))
        cmd.append("--")
        cmd.append(get_remote_suite_run_job_dir(host, owner, suite))
        job_log_dirs = []
        for itask in sorted(itasks, key=lambda itask: itask.identity):
            job_log_dirs.append(
                get_task_job_id(
                    itask.point, itask.tdef.name, itask.submit_num))
        cmd += job_log_dirs
        self.proc_pool.put_command(
            SubProcContext(cmd_key, cmd), callback, [suite, itasks])
def _run_job_cmd(self, cmd_key, suite, itasks, callback):
    """Run job commands, e.g. poll, kill, etc.

    Group itasks with their user@host.
    Put a job command for each user@host to the multiprocess pool.

    """
    if not itasks:
        return
    auth_itasks = {}
    for itask in itasks:
        if (itask.task_host, itask.task_owner) not in auth_itasks:
            auth_itasks[(itask.task_host, itask.task_owner)] = []
        auth_itasks[(itask.task_host, itask.task_owner)].append(itask)
    for (host, owner), itasks in sorted(auth_itasks.items()):
        cmd = ["cylc", cmd_key]
        if LOG.isEnabledFor(DEBUG):
            cmd.append("--debug")
        if is_remote_host(host):
            cmd.append("--host=%s" % (host))
        if is_remote_user(owner):
            cmd.append("--user=%s" % (owner))
        cmd.append("--")
        cmd.append(glbl_cfg().get_derived_host_item(
            suite, "suite job log directory", host, owner))
        job_log_dirs = []
        for itask in sorted(itasks, key=lambda itask: itask.identity):
            job_log_dirs.append(get_task_job_id(
                itask.point, itask.tdef.name, itask.submit_num))
        cmd += job_log_dirs
        self.proc_pool.put_command(
            SubProcContext(cmd_key, cmd), callback, [suite, itasks])
def _process_job_logs_retrieval(self, schd_ctx, ctx, id_keys):
    """Process retrieval of task job logs from remote user@host."""
    if ctx.user_at_host and "@" in ctx.user_at_host:
        s_user, s_host = ctx.user_at_host.split("@", 1)
    else:
        s_user, s_host = (None, ctx.user_at_host)
    ssh_str = str(glbl_cfg().get_host_item("ssh command", s_host, s_user))
    rsync_str = str(glbl_cfg().get_host_item(
        "retrieve job logs command", s_host, s_user))

    cmd = shlex.split(rsync_str) + ["--rsh=" + ssh_str]
    if LOG.isEnabledFor(DEBUG):
        cmd.append("-v")
    if ctx.max_size:
        cmd.append("--max-size=%s" % (ctx.max_size,))
    # Includes and excludes
    includes = set()
    for _, point, name, submit_num in id_keys:
        # Include relevant directories, all levels needed
        includes.add("/%s" % (point))
        includes.add("/%s/%s" % (point, name))
        includes.add("/%s/%s/%02d" % (point, name, submit_num))
        includes.add("/%s/%s/%02d/**" % (point, name, submit_num))
    cmd += ["--include=%s" % (include) for include in sorted(includes)]
    cmd.append("--exclude=/**")  # exclude everything else
    # Remote source
    cmd.append("%s:%s/" % (
        ctx.user_at_host,
        get_remote_suite_run_job_dir(s_host, s_user, schd_ctx.suite)))
    # Local target
    cmd.append(get_suite_run_job_dir(schd_ctx.suite) + "/")
    self.proc_pool.put_command(
        SubProcContext(ctx, cmd, env=dict(os.environ), id_keys=id_keys),
        self._job_logs_retrieval_callback, [schd_ctx])
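
# --- Illustrative sketch (not part of the Cylc source) ----------------------
# The retrieval command above relies on rsync filter rules: every ancestor
# directory of a wanted job log directory must be included before everything
# else is excluded. A self-contained example of the filter list built for two
# hypothetical (cycle point, task name, submit number) keys:
def _example_retrieval_filters():
    id_keys = [('20200101T0000Z', 'foo', 1), ('20200101T0000Z', 'bar', 2)]
    includes = set()
    for point, name, submit_num in id_keys:
        includes.add("/%s" % point)
        includes.add("/%s/%s" % (point, name))
        includes.add("/%s/%s/%02d" % (point, name, submit_num))
        includes.add("/%s/%s/%02d/**" % (point, name, submit_num))
    # e.g. ['--include=/20200101T0000Z', '--include=/20200101T0000Z/bar', ...,
    #       '--exclude=/**']
    return (["--include=%s" % inc for inc in sorted(includes)]
            + ["--exclude=/**"])
# -----------------------------------------------------------------------------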
def _run_job_cmd(self, cmd_key, suite, itasks, callback):
    """Run job commands, e.g. poll, kill, etc.

    Group itasks with their platform_name and host.
    Put a job command for each group to the multiprocess pool.

    """
    if not itasks:
        return
    # Sort itasks into lists based upon where they were run.
    auth_itasks = {}
    for itask in itasks:
        platform_n = itask.platform['name']
        if platform_n not in auth_itasks:
            auth_itasks[platform_n] = []
        auth_itasks[platform_n].append(itask)
    # Go through each list of itasks and carry out commands as required.
    for platform_n, itasks in sorted(auth_itasks.items()):
        platform = get_platform(platform_n)
        if is_remote_platform(platform):
            remote_mode = True
            cmd = [cmd_key]
        else:
            cmd = ["cylc", cmd_key]
            remote_mode = False
        if LOG.isEnabledFor(DEBUG):
            cmd.append("--debug")
        cmd.append("--")
        cmd.append(get_remote_suite_run_job_dir(platform, suite))
        job_log_dirs = []
        if remote_mode:
            cmd = construct_ssh_cmd(cmd, platform)
        for itask in sorted(itasks, key=lambda itask: itask.identity):
            job_log_dirs.append(
                get_task_job_id(
                    itask.point, itask.tdef.name, itask.submit_num))
        cmd += job_log_dirs
        self.proc_pool.put_command(
            SubProcContext(cmd_key, cmd), callback, [suite, itasks])
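
# --- Illustrative sketch (not part of the Cylc source) ----------------------
# The grouping step above is a plain "bucket by key" pattern: each itask is
# filed under its platform name so that one job command is issued per
# platform rather than per task. A minimal stand-alone version, assuming
# hypothetical task objects represented as dicts with a 'platform' entry:
def _example_group_by_platform(itasks):
    auth_itasks = {}
    for itask in itasks:
        auth_itasks.setdefault(itask['platform']['name'], []).append(itask)
    return auth_itasks
# e.g. _example_group_by_platform(
#     [{'platform': {'name': 'hpc'}}, {'platform': {'name': 'localhost'}}])
# -> {'hpc': [...], 'localhost': [...]}
# -----------------------------------------------------------------------------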
def submit_task_jobs(self, suite, itasks, is_simulation=False):
    """Prepare and submit task jobs.

    Submit tasks where possible. Ignore tasks that are waiting for host
    select command to complete, or tasks that are waiting for remote
    initialisation. Bad host select command, error writing to a job file or
    bad remote initialisation will cause a bad task - leading to submission
    failure.

    This method uses prep_submit_task_job() as helper.

    Return (list): list of tasks that attempted submission.
    """
    if is_simulation:
        return self._simulation_submit_task_jobs(itasks)

    # Prepare tasks for job submission
    prepared_tasks, bad_tasks = self.prep_submit_task_jobs(suite, itasks)

    # Reset consumed host selection results
    self.task_remote_mgr.remote_host_select_reset()

    if not prepared_tasks:
        return bad_tasks

    # Group task jobs by (host, owner)
    auth_itasks = {}  # {(host, owner): [itask, ...], ...}
    for itask in prepared_tasks:
        auth_itasks.setdefault((itask.task_host, itask.task_owner), [])
        auth_itasks[(itask.task_host, itask.task_owner)].append(itask)
    # Submit task jobs for each (host, owner) group
    done_tasks = bad_tasks
    for (host, owner), itasks in sorted(auth_itasks.items()):
        is_init = self.task_remote_mgr.remote_init(host, owner)
        if is_init is None:
            # Remote is waiting to be initialised
            for itask in itasks:
                itask.set_summary_message(self.REMOTE_INIT_MSG)
                self.job_pool.add_job_msg(
                    get_task_job_id(
                        itask.point, itask.tdef.name, itask.submit_num),
                    self.REMOTE_INIT_MSG)
            continue
        # Ensure that localhost background/at jobs are recorded as running
        # on the host name of the current suite host, rather than just
        # "localhost". On suite restart on a different suite host, this
        # allows the restart logic to correctly poll the status of the
        # background/at jobs that may still be running on the previous
        # suite host.
        if (
                self.batch_sys_mgr.is_job_local_to_host(
                    itask.summary['batch_sys_name'])
                and not is_remote_host(host)):
            owner_at_host = get_host()
        else:
            owner_at_host = host
        # Persist
        if owner:
            owner_at_host = owner + '@' + owner_at_host
        now_str = get_current_time_string()
        done_tasks.extend(itasks)
        for itask in itasks:
            # Log and persist
            LOG.info(
                '[%s] -submit-num=%02d, owner@host=%s',
                itask, itask.submit_num, owner_at_host)
            self.suite_db_mgr.put_insert_task_jobs(itask, {
                'is_manual_submit': itask.is_manual_submit,
                'try_num': itask.get_try_num(),
                'time_submit': now_str,
                'user_at_host': owner_at_host,
                'batch_sys_name': itask.summary['batch_sys_name'],
            })
            itask.is_manual_submit = False
        if is_init == REMOTE_INIT_FAILED:
            # Remote has failed to initialise
            # Set submit-failed for all affected tasks
            for itask in itasks:
                itask.local_job_file_path = None  # reset for retry
                log_task_job_activity(
                    SubProcContext(
                        self.JOBS_SUBMIT,
                        '(init %s)' % owner_at_host,
                        err=REMOTE_INIT_FAILED,
                        ret_code=1),
                    suite, itask.point, itask.tdef.name)
                self.task_events_mgr.process_message(
                    itask, CRITICAL,
                    self.task_events_mgr.EVENT_SUBMIT_FAILED)
            continue
        # Build the "cylc jobs-submit" command
        cmd = ['cylc', self.JOBS_SUBMIT]
        if LOG.isEnabledFor(DEBUG):
            cmd.append('--debug')
        if get_utc_mode():
            cmd.append('--utc-mode')
        remote_mode = False
        kwargs = {}
        for key, value, test_func in [
                ('host', host, is_remote_host),
                ('user', owner, is_remote_user)]:
            if test_func(value):
                cmd.append('--%s=%s' % (key, value))
                remote_mode = True
                kwargs[key] = value
        if remote_mode:
            cmd.append('--remote-mode')
        cmd.append('--')
        cmd.append(get_remote_suite_run_job_dir(host, owner, suite))
        # Chop itasks into a series of shorter lists if it's very big
        # to prevent overloading of stdout and stderr pipes.
        itasks = sorted(itasks, key=lambda itask: itask.identity)
        chunk_size = len(itasks) // ((len(itasks) // 100) + 1) + 1
        itasks_batches = [
            itasks[i:i + chunk_size]
            for i in range(0, len(itasks), chunk_size)]
        LOG.debug(
            '%s ... # will invoke in batches, sizes=%s',
            cmd, [len(b) for b in itasks_batches])
        for i, itasks_batch in enumerate(itasks_batches):
            stdin_files = []
            job_log_dirs = []
            for itask in itasks_batch:
                if remote_mode:
                    stdin_files.append(
                        get_task_job_job_log(
                            suite, itask.point, itask.tdef.name,
                            itask.submit_num))
                job_log_dirs.append(
                    get_task_job_id(
                        itask.point, itask.tdef.name, itask.submit_num))
                # The job file is now (about to be) used: reset the file
                # write flag so that subsequent manual retrigger will
                # generate a new job file.
                itask.local_job_file_path = None
                itask.state.reset(TASK_STATUS_READY)
                if itask.state.outputs.has_custom_triggers():
                    self.suite_db_mgr.put_update_task_outputs(itask)
            self.proc_pool.put_command(
                SubProcContext(
                    self.JOBS_SUBMIT,
                    cmd + job_log_dirs,
                    stdin_files=stdin_files,
                    job_log_dirs=job_log_dirs,
                    **kwargs),
                self._submit_task_jobs_callback, [suite, itasks_batch])
    return done_tasks
def submit_task_jobs(self, suite, itasks, is_simulation=False):
    """Prepare and submit task jobs.

    Submit tasks where possible. Ignore tasks that are waiting for host
    select command to complete, or tasks that are waiting for remote
    initialisation. Bad host select command, error writing to a job file or
    bad remote initialisation will cause a bad task - leading to submission
    failure.

    This method uses prep_submit_task_job() as helper.

    Return (list): list of tasks that attempted submission.
    """
    if is_simulation:
        return self._simulation_submit_task_jobs(itasks)

    # Prepare tasks for job submission
    prepared_tasks, bad_tasks = self.prep_submit_task_jobs(suite, itasks)

    # Reset consumed host selection results
    self.task_remote_mgr.remote_host_select_reset()

    if not prepared_tasks:
        return bad_tasks

    # Group task jobs by (host, owner)
    auth_itasks = {}  # {(host, owner): [itask, ...], ...}
    for itask in prepared_tasks:
        auth_itasks.setdefault((itask.task_host, itask.task_owner), [])
        auth_itasks[(itask.task_host, itask.task_owner)].append(itask)
    # Submit task jobs for each (host, owner) group
    done_tasks = bad_tasks
    for (host, owner), itasks in sorted(auth_itasks.items()):
        is_init = self.task_remote_mgr.remote_init(host, owner)
        if is_init is None:
            # Remote is waiting to be initialised
            for itask in itasks:
                itask.set_summary_message(self.REMOTE_INIT_MSG)
            continue
        # Ensure that localhost background/at jobs are recorded as running
        # on the host name of the current suite host, rather than just
        # "localhost". On suite restart on a different suite host, this
        # allows the restart logic to correctly poll the status of the
        # background/at jobs that may still be running on the previous
        # suite host.
        if (
                self.batch_sys_mgr.is_job_local_to_host(
                    itask.summary['batch_sys_name'])
                and not is_remote_host(host)):
            owner_at_host = get_host()
        else:
            owner_at_host = host
        # Persist
        if owner:
            owner_at_host = owner + '@' + owner_at_host
        now_str = get_current_time_string()
        done_tasks.extend(itasks)
        for itask in itasks:
            # Log and persist
            LOG.info(
                '[%s] -submit-num=%02d, owner@host=%s',
                itask, itask.submit_num, owner_at_host)
            self.suite_db_mgr.put_insert_task_jobs(itask, {
                'is_manual_submit': itask.is_manual_submit,
                'try_num': itask.get_try_num(),
                'time_submit': now_str,
                'user_at_host': owner_at_host,
                'batch_sys_name': itask.summary['batch_sys_name'],
            })
            itask.is_manual_submit = False
        if is_init == REMOTE_INIT_FAILED:
            # Remote has failed to initialise
            # Set submit-failed for all affected tasks
            for itask in itasks:
                itask.local_job_file_path = None  # reset for retry
                log_task_job_activity(
                    SubProcContext(
                        self.JOBS_SUBMIT,
                        '(init %s)' % owner_at_host,
                        err=REMOTE_INIT_FAILED,
                        ret_code=1),
                    suite, itask.point, itask.tdef.name)
                self.task_events_mgr.process_message(
                    itask, CRITICAL,
                    self.task_events_mgr.EVENT_SUBMIT_FAILED)
            continue
        # Build the "cylc jobs-submit" command
        cmd = ['cylc', self.JOBS_SUBMIT]
        if LOG.isEnabledFor(DEBUG):
            cmd.append('--debug')
        if get_utc_mode():
            cmd.append('--utc-mode')
        remote_mode = False
        kwargs = {}
        for key, value, test_func in [
                ('host', host, is_remote_host),
                ('user', owner, is_remote_user)]:
            if test_func(value):
                cmd.append('--%s=%s' % (key, value))
                remote_mode = True
                kwargs[key] = value
        if remote_mode:
            cmd.append('--remote-mode')
        cmd.append('--')
        cmd.append(glbl_cfg().get_derived_host_item(
            suite, 'suite job log directory', host, owner))
        # Chop itasks into a series of shorter lists if it's very big
        # to prevent overloading of stdout and stderr pipes.
        itasks = sorted(itasks, key=lambda itask: itask.identity)
        chunk_size = len(itasks) // ((len(itasks) // 100) + 1) + 1
        itasks_batches = [
            itasks[i:i + chunk_size]
            for i in range(0, len(itasks), chunk_size)]
        LOG.debug(
            '%s ... # will invoke in batches, sizes=%s',
            cmd, [len(b) for b in itasks_batches])
        for i, itasks_batch in enumerate(itasks_batches):
            stdin_files = []
            job_log_dirs = []
            for itask in itasks_batch:
                if remote_mode:
                    stdin_files.append(
                        get_task_job_job_log(
                            suite, itask.point, itask.tdef.name,
                            itask.submit_num))
                job_log_dirs.append(get_task_job_id(
                    itask.point, itask.tdef.name, itask.submit_num))
                # The job file is now (about to be) used: reset the file
                # write flag so that subsequent manual retrigger will
                # generate a new job file.
                itask.local_job_file_path = None
                itask.state.reset_state(TASK_STATUS_READY)
                if itask.state.outputs.has_custom_triggers():
                    self.suite_db_mgr.put_update_task_outputs(itask)
            self.proc_pool.put_command(
                SubProcContext(
                    self.JOBS_SUBMIT,
                    cmd + job_log_dirs,
                    stdin_files=stdin_files,
                    job_log_dirs=job_log_dirs,
                    **kwargs),
                self._submit_task_jobs_callback, [suite, itasks_batch])
    return done_tasks
def submit_task_jobs(self, suite, itasks, curve_auth,
                     client_pub_key_dir, is_simulation=False):
    """Prepare for job submission and submit task jobs.

    Preparation (host selection, remote host init, and remote install) is
    done asynchronously. Newly released tasks may be sent here several times
    until these init subprocesses have returned. Failure during preparation
    is considered to be job submission failure.

    Once preparation has completed or failed, reset .waiting_on_job_prep in
    task instances so the scheduler knows to stop sending them back here.

    This method uses prep_submit_task_job() as helper.

    Return (list): list of tasks that attempted submission.
    """
    if is_simulation:
        return self._simulation_submit_task_jobs(itasks)

    # Prepare tasks for job submission
    prepared_tasks, bad_tasks = self.prep_submit_task_jobs(suite, itasks)

    # Reset consumed host selection results
    self.task_remote_mgr.subshell_eval_reset()

    if not prepared_tasks:
        return bad_tasks

    auth_itasks = {}  # {platform: [itask, ...], ...}
    for itask in prepared_tasks:
        platform_name = itask.platform['name']
        auth_itasks.setdefault(platform_name, [])
        auth_itasks[platform_name].append(itask)
    # Submit task jobs for each platform
    done_tasks = bad_tasks
    for platform_name, itasks in sorted(auth_itasks.items()):
        platform = itasks[0].platform
        install_target = get_install_target_from_platform(platform)
        ri_map = self.task_remote_mgr.remote_init_map

        if ri_map.get(install_target) != REMOTE_FILE_INSTALL_DONE:
            if install_target == get_localhost_install_target():
                # Skip init and file install for localhost.
                LOG.debug(f"REMOTE INIT NOT REQUIRED for {install_target}")
                ri_map[install_target] = REMOTE_FILE_INSTALL_DONE

            elif install_target not in ri_map:
                # Remote init not in progress for target, so start it.
                self.task_remote_mgr.remote_init(
                    platform, curve_auth, client_pub_key_dir)
                for itask in itasks:
                    itask.set_summary_message(self.REMOTE_INIT_MSG)
                    self.data_store_mgr.delta_job_msg(
                        get_task_job_id(
                            itask.point, itask.tdef.name, itask.submit_num),
                        self.REMOTE_INIT_MSG)
                continue

            elif ri_map[install_target] == REMOTE_INIT_DONE:
                # Already done remote init so move on to file install
                self.task_remote_mgr.file_install(platform)
                continue

            elif ri_map[install_target] in self.IN_PROGRESS.keys():
                # Remote init or file install in progress.
                for itask in itasks:
                    msg = self.IN_PROGRESS[ri_map[install_target]]
                    itask.set_summary_message(msg)
                    self.data_store_mgr.delta_job_msg(
                        get_task_job_id(
                            itask.point, itask.tdef.name, itask.submit_num),
                        msg)
                continue

        # Ensure that localhost background/at jobs are recorded as running
        # on the host name of the current suite host, rather than just
        # "localhost". On suite restart on a different suite host, this
        # allows the restart logic to correctly poll the status of the
        # background/at jobs that may still be running on the previous
        # suite host.
        host = get_host_from_platform(platform)
        if (self.job_runner_mgr.is_job_local_to_host(
                itask.summary['job_runner_name'])
                and not is_remote_platform(platform)):
            host = get_host()

        now_str = get_current_time_string()
        done_tasks.extend(itasks)
        for itask in itasks:
            # Log and persist
            LOG.info(
                '[%s] -submit-num=%02d, host=%s',
                itask, itask.submit_num, host)
            self.suite_db_mgr.put_insert_task_jobs(itask, {
                'is_manual_submit': itask.is_manual_submit,
                'try_num': itask.get_try_num(),
                'time_submit': now_str,
                'platform_name': itask.platform['name'],
                'job_runner_name': itask.summary['job_runner_name'],
            })
            itask.is_manual_submit = False

        if (ri_map[install_target]
                in [REMOTE_INIT_FAILED, REMOTE_FILE_INSTALL_FAILED]):
            # Remote init or install failed. Set submit-failed for all
            # affected tasks and remove target from remote init map
            # - this enables new tasks to re-initialise that target
            init_error = ri_map[install_target]
            del ri_map[install_target]
            for itask in itasks:
                itask.waiting_on_job_prep = False
                itask.local_job_file_path = None  # reset for retry
                log_task_job_activity(
                    SubProcContext(
                        self.JOBS_SUBMIT,
                        '(init %s)' % host,
                        err=init_error,
                        ret_code=1),
                    suite, itask.point, itask.tdef.name)
                self._prep_submit_task_job_error(
                    suite, itask, '(remote init)', '')
            continue

        # Build the "cylc jobs-submit" command
        cmd = [self.JOBS_SUBMIT]
        if LOG.isEnabledFor(DEBUG):
            cmd.append('--debug')
        if get_utc_mode():
            cmd.append('--utc-mode')
        if is_remote_platform(itask.platform):
            remote_mode = True
            cmd.append('--remote-mode')
        else:
            remote_mode = False
        if itask.platform['clean job submission environment']:
            cmd.append('--clean-env')
        for var in itask.platform[
                'job submission environment pass-through']:
            cmd.append(f"--env={var}")
        for path in itask.platform[
                'job submission executable paths'] + SYSPATH:
            cmd.append(f"--path={path}")
        cmd.append('--')
        cmd.append(get_remote_suite_run_job_dir(platform, suite))
        # Chop itasks into a series of shorter lists if it's very big
        # to prevent overloading of stdout and stderr pipes.
        itasks = sorted(itasks, key=lambda itask: itask.identity)
        chunk_size = (
            len(itasks) // (
                (len(itasks) // platform['max batch submit size']) + 1
            ) + 1
        )
        itasks_batches = [
            itasks[i:i + chunk_size]
            for i in range(0, len(itasks), chunk_size)]
        LOG.debug(
            '%s ... # will invoke in batches, sizes=%s',
            cmd, [len(b) for b in itasks_batches])

        if remote_mode:
            cmd = construct_ssh_cmd(cmd, platform)
        else:
            cmd = ['cylc'] + cmd

        for i, itasks_batch in enumerate(itasks_batches):
            stdin_files = []
            job_log_dirs = []
            for itask in itasks_batch:
                if remote_mode:
                    stdin_files.append(
                        os.path.expandvars(
                            get_task_job_job_log(
                                suite, itask.point, itask.tdef.name,
                                itask.submit_num)))
                job_log_dirs.append(
                    get_task_job_id(
                        itask.point, itask.tdef.name, itask.submit_num))
                # The job file is now (about to be) used: reset the file
                # write flag so that subsequent manual retrigger will
                # generate a new job file.
                itask.local_job_file_path = None

                if itask.state.outputs.has_custom_triggers():
                    self.suite_db_mgr.put_update_task_outputs(itask)

                itask.waiting_on_job_prep = False
            self.proc_pool.put_command(
                SubProcContext(
                    self.JOBS_SUBMIT,
                    cmd + job_log_dirs,
                    stdin_files=stdin_files,
                    job_log_dirs=job_log_dirs,
                ),
                self._submit_task_jobs_callback, [suite, itasks_batch])
    return done_tasks
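
# --- Illustrative sketch (not part of the Cylc source) ----------------------
# The batching arithmetic above splits N prepared tasks into roughly even
# chunks of at most platform['max batch submit size'] tasks, so a single
# "jobs-submit" invocation never floods the stdout/stderr pipes. A worked
# example with hypothetical numbers:
def _example_batch_sizes(n_tasks, max_batch):
    chunk_size = n_tasks // ((n_tasks // max_batch) + 1) + 1
    return [min(chunk_size, n_tasks - i)
            for i in range(0, n_tasks, chunk_size)]
# _example_batch_sizes(250, 100) -> [84, 84, 82]  (three batches, all <= 100)
# _example_batch_sizes(90, 100)  -> [90]          (one batch, no splitting)
# -----------------------------------------------------------------------------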