def remote_clean(reg, platform_names, timeout):
    """Run subprocesses to clean workflows on remote install targets
    (skip localhost), given a set of platform names to look up.

    One clean command is issued per install target, using one of that
    target's platforms (tried in shuffled order). If a command exits with a
    non-zero code the next platform for the same install target is tried,
    until the list of platforms is exhausted.

    Args:
        reg (str): Workflow name.
        platform_names (list): List of platform names to look up in the global
            config, in order to determine the install targets to clean on.
        timeout (str): Number of seconds to wait before cancelling.

    Raises:
        PlatformLookupError: If the platform names cannot be mapped to
            install targets.
        CylcError: If cleaning failed on every platform of at least one
            install target.
    """
    try:
        install_targets_map = (
            get_install_target_to_platforms_map(platform_names))
    except PlatformLookupError as exc:
        raise PlatformLookupError(
            "Cannot clean on remote platforms as the workflow database is "
            f"out of date/inconsistent with the global config - {exc}"
        ) from exc

    # Look this up once rather than on every iteration.
    localhost_target = get_localhost_install_target()

    pool = []
    for target, platforms in install_targets_map.items():
        if target == localhost_target:
            continue
        shuffle(platforms)
        LOG.info(
            f"Cleaning on install target: {platforms[0]['install target']}")
        # Issue ssh command:
        pool.append(
            (_remote_clean_cmd(reg, platforms[0], timeout), target, platforms)
        )

    failed_targets = []
    # Handle subproc pool results almost concurrently:
    while pool:
        # Iterate over a snapshot: entries are removed from / appended to
        # the live pool below, which would otherwise make the loop skip
        # items.
        for item in list(pool):
            proc, target, platforms = item
            ret_code = proc.poll()
            if ret_code is None:  # proc still running
                continue
            pool.remove(item)
            out, err = (f.decode() for f in proc.communicate())
            if out:
                LOG.debug(out)
            if ret_code:
                # Try again using the next platform for this install target:
                this_platform = platforms.pop(0)
                excn = TaskRemoteMgmtError(
                    TaskRemoteMgmtError.MSG_TIDY, this_platform['name'],
                    " ".join(proc.args), ret_code, out, err)
                LOG.debug(excn)
                if platforms:
                    pool.append(
                        (_remote_clean_cmd(reg, platforms[0], timeout),
                         target, platforms)
                    )
                else:  # Exhausted list of platforms
                    failed_targets.append(target)
            elif err:
                LOG.debug(err)
        if pool:
            # Only pause when there is still something to wait for.
            time.sleep(0.2)

    if failed_targets:
        raise CylcError(
            f"Could not clean on install targets: {', '.join(failed_targets)}")
def make_localhost_symlinks(rund, named_sub_dir):
    """Create the symlinks configured for the localhost install target.

    Args:
        rund: the entire run directory path
        named_sub_dir: e.g flow_name/run1

    Returns:
        dict - the symlinks that were created, mapping each source to its
        destination: ``{source: destination}``

    Raises:
        WorkflowFilesError: if a configured path still contains a ``$``
            after expansion.
    """
    symlinks_created = {}
    configured_dirs = get_dirs_to_symlink(
        get_localhost_install_target(), named_sub_dir)
    for key, value in configured_dirs.items():
        # The 'run' entry refers to the run directory itself; every other
        # key is a directory inside it.
        dst = rund if key == 'run' else os.path.join(rund, key)
        src = expand_path(value)
        if '$' in src:
            raise WorkflowFilesError(
                f'Unable to create symlink to {src}.'
                f' \'{value}\' contains an invalid environment variable.'
                ' Please check configuration.')
        # Record only the links actually made; the result is returned for
        # logging purposes. Symlinks are created before logs as this dir
        # may be a symlink.
        if make_symlink(src, dst):
            symlinks_created[src] = dst
    return symlinks_created
def test_localhost_different_install_target(mock_glbl_cfg):
    """Check the localhost install target follows the mocked global config."""
    global_config = '''
        [platforms]
            [[localhost]]
                install target = file_system_1
        '''
    mock_glbl_cfg('cylc.flow.platforms.glbl_cfg', global_config)
    assert get_localhost_install_target() == 'file_system_1'
def remote_tidy(self):
    """Remove workflow contact files and keys from initialised remotes.

    Call "cylc remote-tidy".
    This method is called on workflow shutdown, so we want nothing to hang.
    Timeout any incomplete commands after 10 seconds.

    Install targets whose remote file installation did not complete, and
    the localhost install target, are skipped. Failures are logged as
    warnings; nothing is raised for a failed command.
    """
    # Imported locally so that this block does not depend on the module
    # importing ``sleep`` as well as ``time``.
    from time import sleep

    def reap(platform_n, cmd, proc):
        """Collect the output of a process and warn if it failed."""
        out, err = (f.decode() for f in proc.communicate())
        if proc.wait():
            LOG.warning(
                TaskRemoteMgmtError(
                    TaskRemoteMgmtError.MSG_TIDY,
                    platform_n, ' '.join(quote(item) for item in cmd),
                    proc.returncode, out, err))

    # Issue all SSH commands in parallel
    procs = {}
    for install_target, message in self.remote_init_map.items():
        if message != REMOTE_FILE_INSTALL_DONE:
            continue
        if install_target == get_localhost_install_target():
            continue
        platform = get_random_platform_for_install_target(install_target)
        platform_n = platform['name']
        cmd = ['remote-tidy']
        if cylc.flow.flags.verbosity > 1:
            cmd.append('--debug')
        cmd.append(install_target)
        cmd.append(get_remote_workflow_run_dir(self.workflow))
        cmd = construct_ssh_cmd(cmd, platform, timeout='10s')
        LOG.debug("Removing authentication keys and contact file "
                  f"from remote: \"{install_target}\"")
        procs[platform_n] = (
            cmd,
            Popen(cmd, stdout=PIPE, stderr=PIPE, stdin=DEVNULL))

    # Wait for commands to complete for a max of 10 seconds
    timeout = time() + 10.0
    while procs and time() < timeout:
        for platform_n, (cmd, proc) in procs.copy().items():
            if proc.poll() is None:
                continue
            del procs[platform_n]
            reap(platform_n, cmd, proc)
        if procs:
            # Pause between polling passes instead of spinning the CPU
            # while the remaining commands run.
            sleep(0.1)

    # Terminate any remaining commands
    for platform_n, (cmd, proc) in procs.items():
        try:
            proc.terminate()
        except OSError:
            # Best effort: the process may already have gone away.
            pass
        reap(platform_n, cmd, proc)
def remote_init(self, platform: Dict[str, Any],
                curve_auth: 'ThreadAuthenticator',
                client_pub_key_dir: str) -> None:
    """Initialise a remote host if necessary.

    Call "cylc remote-init" to install workflow items to remote:
        ".service/contact": For TCP task communication
        "python/": if source exists

    For the localhost install target nothing is run: the target is marked
    as REMOTE_FILE_INSTALL_DONE straight away. Otherwise the target is
    marked REMOTE_INIT_IN_PROGRESS and the command is queued on the
    process pool, with ``_remote_init_callback`` as the callback.

    Args:
        platform: A dict containing settings relating to platform used in
            this remote installation.
        curve_auth: The ZMQ authenticator.
        client_pub_key_dir: Client public key directory, used by the
            ZMQ authenticator.

    """
    install_target = platform['install target']
    if install_target == get_localhost_install_target():
        self.remote_init_map[install_target] = REMOTE_FILE_INSTALL_DONE
        return
    # Set status of install target to in progress while waiting for remote
    # initialisation to finish
    self.remote_init_map[install_target] = REMOTE_INIT_IN_PROGRESS

    # Determine what items to install
    comms_meth: CommsMeth = CommsMeth(platform['communication method'])
    items = self._remote_init_items(comms_meth)

    # Create a TAR archive with the service files,
    # so they can be sent later via SSH's STDIN to the task remote.
    tmphandle = self.proc_pool.get_temporary_file()
    # The context manager closes the archive even if adding a file fails.
    with tarfile.open(fileobj=tmphandle, mode='w') as tarhandle:
        for path, arcname in items:
            tarhandle.add(path, arcname=arcname)
    # Rewind so the archive is read from the start when used as stdin.
    tmphandle.seek(0)

    # Build the remote-init command to be run over ssh
    cmd = ['remote-init']
    if cylc.flow.flags.verbosity > 1:
        cmd.append('--debug')
    cmd.append(str(install_target))
    cmd.append(get_remote_workflow_run_dir(self.workflow))
    dirs_to_symlink = get_dirs_to_symlink(install_target, self.workflow)
    for key, value in dirs_to_symlink.items():
        if value is not None:
            # NOTE(review): the trailing space is part of the argument as
            # written - confirm it is intended.
            cmd.append(f"{key}={quote(value)} ")
    # Create the ssh command
    cmd = construct_ssh_cmd(cmd, platform)
    self.proc_pool.put_command(
        SubProcContext('remote-init', cmd, stdin_files=[tmphandle]),
        self._remote_init_callback,
        [platform, tmphandle, curve_auth, client_pub_key_dir])
def make_localhost_symlinks(
    rund: Union[Path, str],
    named_sub_dir: str,
    symlink_conf: Optional[Dict[str, Dict[str, str]]] = None
) -> Dict[str, Union[Path, str]]:
    """Create the symlinks configured for the localhost install target.

    Args:
        rund: the entire run directory path
        named_sub_dir: e.g workflow_name/run1
        symlink_conf: Symlinks dirs configuration passed from cli

    Returns:
        Dictionary of the symlinks that were created, with sources as keys
        and destinations as values: ``{source: destination}``

    Raises:
        WorkflowFilesError: if a configured path still contains a ``$``
            after expansion.
    """
    configured_dirs = get_dirs_to_symlink(
        get_localhost_install_target(),
        named_sub_dir,
        symlink_conf=symlink_conf,
    )
    symlinks_created = {}
    for key, value in configured_dirs.items():
        # Entries with no configured value are skipped.
        if value is None:
            continue
        # The 'run' entry refers to the run directory itself; every other
        # key is a directory inside it.
        symlink_path = rund if key == 'run' else os.path.join(rund, key)
        target = expand_path(value)
        if '$' in target:
            raise WorkflowFilesError(
                f'Unable to create symlink to {target}.'
                f' \'{value}\' contains an invalid environment variable.'
                ' Please check configuration.')
        # Symlink info returned for logging purposes. Symlinks should be
        # created before logs as the log dir may be a symlink.
        if make_symlink(symlink_path, target):
            symlinks_created[target] = symlink_path
    return symlinks_created
def test_get_localhost_install_target():
    """Check the localhost install target when this test mocks no config."""
    expected = 'localhost'
    assert get_localhost_install_target() == expected
def submit_task_jobs(self, suite, itasks, curve_auth,
                     client_pub_key_dir, is_simulation=False):
    """Prepare for job submission and submit task jobs.

    Preparation (host selection, remote host init, and remote install)
    is done asynchronously. Newly released tasks may be sent here several
    times until these init subprocesses have returned. Failure during
    preparation is considered to be job submission failure.

    Once preparation has completed or failed, reset .waiting_on_job_prep in
    task instances so the scheduler knows to stop sending them back here.

    This method uses prep_submit_task_job() as helper.

    Args:
        suite: Suite name, passed on to the job preparation, logging and
            job directory helpers.
        itasks: Task instances to submit.
        curve_auth: Passed through to remote initialisation.
        client_pub_key_dir: Passed through to remote initialisation.
        is_simulation: If True, hand over to the simulation submit method
            and return its result.

    Return (list): list of tasks that attempted submission.

    """
    if is_simulation:
        return self._simulation_submit_task_jobs(itasks)

    # Prepare tasks for job submission
    prepared_tasks, bad_tasks = self.prep_submit_task_jobs(suite, itasks)

    # Reset consumed host selection results
    self.task_remote_mgr.subshell_eval_reset()

    if not prepared_tasks:
        return bad_tasks

    # Group the prepared tasks by platform name.
    auth_itasks = {}  # {platform: [itask, ...], ...}
    for itask in prepared_tasks:
        platform_name = itask.platform['name']
        auth_itasks.setdefault(platform_name, [])
        auth_itasks[platform_name].append(itask)

    # Submit task jobs for each platform
    # NOTE: done_tasks is the same list object as bad_tasks.
    done_tasks = bad_tasks
    for platform_name, itasks in sorted(auth_itasks.items()):
        platform = itasks[0].platform
        install_target = get_install_target_from_platform(platform)
        ri_map = self.task_remote_mgr.remote_init_map

        if (ri_map.get(install_target) != REMOTE_FILE_INSTALL_DONE):
            if install_target == get_localhost_install_target():
                # Skip init and file install for localhost.
                LOG.debug(f"REMOTE INIT NOT REQUIRED for {install_target}")
                ri_map[install_target] = (REMOTE_FILE_INSTALL_DONE)

            elif install_target not in ri_map:
                # Remote init not in progress for target, so start it.
                self.task_remote_mgr.remote_init(
                    platform, curve_auth, client_pub_key_dir)
                for itask in itasks:
                    itask.set_summary_message(self.REMOTE_INIT_MSG)
                    self.data_store_mgr.delta_job_msg(
                        get_task_job_id(
                            itask.point,
                            itask.tdef.name,
                            itask.submit_num),
                        self.REMOTE_INIT_MSG)
                continue

            elif (ri_map[install_target] == REMOTE_INIT_DONE):
                # Already done remote init so move on to file install
                self.task_remote_mgr.file_install(platform)
                continue

            elif (ri_map[install_target] in self.IN_PROGRESS.keys()):
                # Remote init or file install in progress.
                for itask in itasks:
                    msg = self.IN_PROGRESS[ri_map[install_target]]
                    itask.set_summary_message(msg)
                    self.data_store_mgr.delta_job_msg(
                        get_task_job_id(
                            itask.point,
                            itask.tdef.name,
                            itask.submit_num),
                        msg)
                continue

        # Ensure that localhost background/at jobs are recorded as running
        # on the host name of the current suite host, rather than just
        # "localhost". On suite restart on a different suite host, this
        # allows the restart logic to correctly poll the status of the
        # background/at jobs that may still be running on the previous
        # suite host.
        host = get_host_from_platform(platform)
        # Use a task from the current platform group: a bare ``itask`` here
        # would be whatever an earlier loop left behind, possibly a task on
        # a different platform.
        if (
            self.job_runner_mgr.is_job_local_to_host(
                itasks[0].summary['job_runner_name'])
            and not is_remote_platform(platform)
        ):
            host = get_host()

        now_str = get_current_time_string()
        done_tasks.extend(itasks)
        for itask in itasks:
            # Log and persist
            LOG.info(
                '[%s] -submit-num=%02d, host=%s',
                itask, itask.submit_num, host)
            self.suite_db_mgr.put_insert_task_jobs(itask, {
                'is_manual_submit': itask.is_manual_submit,
                'try_num': itask.get_try_num(),
                'time_submit': now_str,
                'platform_name': itask.platform['name'],
                'job_runner_name': itask.summary['job_runner_name'],
            })
            itask.is_manual_submit = False

        if (ri_map[install_target]
                in [REMOTE_INIT_FAILED, REMOTE_FILE_INSTALL_FAILED]):
            # Remote init or install failed. Set submit-failed for all
            # affected tasks and remove target from remote init map
            # - this enables new tasks to re-initialise that target
            init_error = (ri_map[install_target])
            del ri_map[install_target]
            for itask in itasks:
                itask.waiting_on_job_prep = False
                itask.local_job_file_path = None  # reset for retry
                log_task_job_activity(
                    SubProcContext(
                        self.JOBS_SUBMIT,
                        '(init %s)' % host,
                        err=init_error,
                        ret_code=1),
                    suite, itask.point, itask.tdef.name)
                self._prep_submit_task_job_error(
                    suite, itask, '(remote init)', '')
            continue

        # Build the "cylc jobs-submit" command
        # (``itask`` below is the last task of this platform group, left
        # over from the logging loop above.)
        cmd = [self.JOBS_SUBMIT]
        if LOG.isEnabledFor(DEBUG):
            cmd.append('--debug')
        if get_utc_mode():
            cmd.append('--utc-mode')
        if is_remote_platform(itask.platform):
            remote_mode = True
            cmd.append('--remote-mode')
        else:
            remote_mode = False
        if itask.platform['clean job submission environment']:
            cmd.append('--clean-env')
        for var in itask.platform[
                'job submission environment pass-through']:
            cmd.append(f"--env={var}")
        for path in itask.platform[
                'job submission executable paths'] + SYSPATH:
            cmd.append(f"--path={path}")
        cmd.append('--')
        cmd.append(get_remote_suite_run_job_dir(platform, suite))

        # Chop itasks into a series of shorter lists if it's very big
        # to prevent overloading of stdout and stderr pipes.
        itasks = sorted(itasks, key=lambda itask: itask.identity)
        chunk_size = (
            len(itasks) // (
                (len(itasks) // platform['max batch submit size']) + 1
            ) + 1
        )
        itasks_batches = [
            itasks[i:i + chunk_size]
            for i in range(0, len(itasks), chunk_size)
        ]
        LOG.debug(
            '%s ... # will invoke in batches, sizes=%s',
            cmd, [len(b) for b in itasks_batches])

        if remote_mode:
            cmd = construct_ssh_cmd(cmd, platform)
        else:
            cmd = ['cylc'] + cmd

        for itasks_batch in itasks_batches:
            stdin_files = []
            job_log_dirs = []
            for itask in itasks_batch:
                if remote_mode:
                    stdin_files.append(
                        os.path.expandvars(
                            get_task_job_job_log(
                                suite,
                                itask.point,
                                itask.tdef.name,
                                itask.submit_num)))
                job_log_dirs.append(
                    get_task_job_id(
                        itask.point,
                        itask.tdef.name,
                        itask.submit_num))
                # The job file is now (about to be) used: reset the file
                # write flag so that subsequent manual retrigger will
                # generate a new job file.
                itask.local_job_file_path = None
                if itask.state.outputs.has_custom_triggers():
                    self.suite_db_mgr.put_update_task_outputs(itask)

                itask.waiting_on_job_prep = False
            self.proc_pool.put_command(
                SubProcContext(
                    self.JOBS_SUBMIT,
                    cmd + job_log_dirs,
                    stdin_files=stdin_files,
                    job_log_dirs=job_log_dirs,
                ),
                self._submit_task_jobs_callback,
                [suite, itasks_batch])
    return done_tasks
def remote_tidy(self) -> None:
    """Remove workflow contact files and keys from initialised remotes.

    Call "cylc remote-tidy".
    This method is called on workflow shutdown, so we want nothing to hang.
    Timeout any incomplete commands after 10 seconds.

    Install targets whose remote file installation did not complete, and
    the localhost install target, are skipped. A command that exits with
    code 255 marks its host as bad, and the command is retried on another
    host of the same platform with a fresh 10 second allowance. Failures
    are logged as warnings.
    """
    # Issue all SSH commands in parallel

    def construct_remote_tidy_ssh_cmd(
        platform: Dict[str, Any]
    ) -> Tuple[List[str], str]:
        """Return the remote-tidy ssh command and the host it targets."""
        cmd = ['remote-tidy']
        cmd.extend(verbosity_to_opts(cylc.flow.flags.verbosity))
        cmd.append(get_install_target_from_platform(platform))
        cmd.append(get_remote_workflow_run_dir(self.workflow))
        host = get_host_from_platform(platform, bad_hosts=self.bad_hosts)
        cmd = construct_ssh_cmd(cmd, platform, host, timeout='10s')
        return cmd, host

    queue: Deque[RemoteTidyQueueTuple] = deque()
    for install_target, message in self.remote_init_map.items():
        if message != REMOTE_FILE_INSTALL_DONE:
            continue
        if install_target == get_localhost_install_target():
            continue
        # Reset on every iteration so the error handler below never reads
        # an unbound name or the platform of a previous install target.
        platform = None
        try:
            platform = get_random_platform_for_install_target(
                install_target)
            cmd, host = construct_remote_tidy_ssh_cmd(platform)
        except (NoHostsError, PlatformLookupError) as exc:
            LOG.warning(
                PlatformError(
                    f'{PlatformError.MSG_TIDY}\n{exc}',
                    # Fall back to the install target name when the
                    # platform lookup itself failed.
                    (
                        platform['name'] if platform is not None
                        else install_target
                    ),
                ))
        else:
            log_platform_event('remote tidy', platform, host)
            queue.append(
                RemoteTidyQueueTuple(
                    platform,
                    host,
                    Popen(  # nosec
                        cmd, stdout=PIPE, stderr=PIPE, stdin=DEVNULL,
                        text=True
                    )  # * command constructed by internal interface
                ))

    # Wait for commands to complete for a max of 10 seconds
    timeout = time() + 10.0
    while queue and time() < timeout:
        item = queue.popleft()
        if item.proc.poll() is None:  # proc still running
            queue.append(item)
        else:
            out, err = item.proc.communicate()
            # 255 error has to be handled here because remote tidy doesn't
            # use SubProcPool.
            if item.proc.returncode == 255:
                timeout = time() + 10.0
                self.bad_hosts.add(item.host)
                try:
                    retry_cmd, retry_host = construct_remote_tidy_ssh_cmd(
                        item.platform)
                except (NoHostsError, PlatformLookupError) as exc:
                    LOG.warning(
                        PlatformError(
                            f'{PlatformError.MSG_TIDY}\n{exc}',
                            item.platform['name']))
                else:
                    queue.append(
                        item._replace(
                            host=retry_host,
                            proc=Popen(  # nosec
                                retry_cmd, stdout=PIPE, stderr=PIPE,
                                stdin=DEVNULL, text=True
                            )  # * command constructed by internal interface
                        ))
            elif item.proc.returncode:
                LOG.warning(
                    PlatformError(
                        PlatformError.MSG_TIDY,
                        item.platform['name'],
                        cmd=item.proc.args,
                        ret_code=item.proc.returncode,
                        out=out,
                        err=err))
        # Sleep on every pass, including when the process is still running,
        # so that waiting does not turn into a busy loop.
        sleep(0.1)

    # Terminate any remaining commands
    for item in queue:
        with suppress(OSError):
            item.proc.terminate()
        out, err = item.proc.communicate()
        if item.proc.wait():
            LOG.warning(
                PlatformError(
                    PlatformError.MSG_TIDY,
                    item.platform['name'],
                    cmd=item.proc.args,
                    ret_code=item.proc.returncode,
                    out=out,
                    err=err,
                ))
def remote_tidy(self):
    """Remove workflow contact files and keys from initialised remotes.

    Call "cylc remote-tidy".
    This method is called on workflow shutdown, so we want nothing to hang.
    Timeout any incomplete commands after 10 seconds.

    Install targets whose remote file installation did not complete, and
    the localhost install target, are skipped. A command that exits with
    code 255 marks its host as bad, and the command is retried on another
    host of the same platform with a fresh 10 second allowance. Failures
    are logged as warnings.
    """
    # Imported locally so that this block does not depend on the module
    # importing ``sleep`` as well as ``time``.
    from time import sleep

    from cylc.flow.platforms import PlatformLookupError

    # Issue all SSH commands in parallel

    def construct_remote_tidy_ssh_cmd(install_target, platform):
        """Return the remote-tidy ssh command and the host it targets."""
        cmd = ['remote-tidy']
        if cylc.flow.flags.verbosity > 1:
            cmd.append('--debug')
        cmd.append(install_target)
        cmd.append(get_remote_workflow_run_dir(self.workflow))
        host = get_host_from_platform(platform, bad_hosts=self.bad_hosts)
        cmd = construct_ssh_cmd(cmd, platform, host, timeout='10s')
        return cmd, host

    # {platform_name: (install_target, platform, cmd, host, proc)}
    # The install target and platform are stored per entry so that a retry
    # is built for the right remote rather than for whatever the launch
    # loop last iterated over.
    procs = {}
    for install_target, message in self.remote_init_map.items():
        if message != REMOTE_FILE_INSTALL_DONE:
            continue
        if install_target == get_localhost_install_target():
            continue
        platform = get_random_platform_for_install_target(install_target)
        platform_n = platform['name']
        try:
            cmd, host = construct_remote_tidy_ssh_cmd(
                install_target, platform)
        except (NoHostsError, PlatformLookupError) as exc:
            # Argument order matches the other TaskRemoteMgmtError calls
            # in this method: (message, platform, cmd, ret_code, out, err).
            LOG.warning(
                TaskRemoteMgmtError(
                    TaskRemoteMgmtError.MSG_TIDY,
                    platform_n, 'remote tidy',
                    1, '', str(exc)))
        else:
            LOG.debug(
                "Removing authentication keys and contact file "
                f"from remote: \"{install_target}\"")
            procs[platform_n] = (
                install_target,
                platform,
                cmd,
                host,
                Popen(cmd, stdout=PIPE, stderr=PIPE, stdin=DEVNULL))

    # Wait for commands to complete for a max of 10 seconds
    timeout = time() + 10.0
    while procs and time() < timeout:
        for platform_n, entry in procs.copy().items():
            install_target, platform, cmd, host, proc = entry
            if proc.poll() is None:
                continue
            del procs[platform_n]
            out, err = (f.decode() for f in proc.communicate())
            # 255 error has to be handled here because remote tidy doesn't
            # use SubProcPool.
            if proc.returncode == 255:
                timeout = time() + 10.0
                self.bad_hosts.add(host)
                LOG.warning(
                    f'Tried to tidy remote platform: \'{platform_n}\' '
                    f'using host \'{host}\' but failed; '
                    'trying a different host')
                try:
                    retry_cmd, retry_host = construct_remote_tidy_ssh_cmd(
                        install_target, platform)
                except (NoHostsError, PlatformLookupError) as exc:
                    LOG.warning(
                        TaskRemoteMgmtError(
                            TaskRemoteMgmtError.MSG_TIDY,
                            platform_n, 'remote tidy',
                            1, '', str(exc)))
                else:
                    procs[platform_n] = (
                        install_target,
                        platform,
                        retry_cmd,
                        retry_host,
                        Popen(
                            retry_cmd,
                            stdout=PIPE, stderr=PIPE, stdin=DEVNULL))
            elif proc.returncode:
                LOG.warning(
                    TaskRemoteMgmtError(
                        TaskRemoteMgmtError.MSG_TIDY,
                        platform_n, ' '.join(quote(item) for item in cmd),
                        proc.returncode, out, err))
        if procs:
            # Pause between polling passes instead of spinning the CPU
            # while the remaining commands run.
            sleep(0.1)

    # Terminate any remaining commands
    for platform_n, (_, _, cmd, _, proc) in procs.items():
        with suppress(OSError):
            proc.terminate()
        out, err = (f.decode() for f in proc.communicate())
        if proc.wait():
            LOG.warning(
                TaskRemoteMgmtError(
                    TaskRemoteMgmtError.MSG_TIDY,
                    platform_n, ' '.join(quote(item) for item in cmd),
                    proc.returncode, out, err))