def _run_job_cmd(self, cmd_key, suite, itasks, callback):
    """Run job commands, e.g. poll, kill, etc.

    Group itasks with their user@host.
    Put a job command for each user@host to the multiprocess pool.

    """
    if not itasks:
        return
    auth_itasks = {}
    for itask in itasks:
        if (itask.task_host, itask.task_owner) not in auth_itasks:
            auth_itasks[(itask.task_host, itask.task_owner)] = []
        auth_itasks[(itask.task_host, itask.task_owner)].append(itask)
    for (host, owner), itasks in sorted(auth_itasks.items()):
        cmd = ["cylc", cmd_key]
        if LOG.isEnabledFor(DEBUG):
            cmd.append("--debug")
        if is_remote_host(host):
            cmd.append("--host=%s" % (host))
        if is_remote_user(owner):
            cmd.append("--user=%s" % (owner))
        cmd.append("--")
        cmd.append(glbl_cfg().get_derived_host_item(
            suite, "suite job log directory", host, owner))
        job_log_dirs = []
        for itask in sorted(itasks, key=lambda itask: itask.identity):
            job_log_dirs.append(get_task_job_id(
                itask.point, itask.tdef.name, itask.submit_num))
        cmd += job_log_dirs
        self.proc_pool.put_command(
            SubProcContext(cmd_key, cmd), callback, [suite, itasks])
def _run_job_cmd(self, cmd_key, suite, itasks, callback):
    """Run job commands, e.g. poll, kill, etc.

    Group itasks with their user@host.
    Put a job command for each user@host to the multiprocess pool.

    """
    if not itasks:
        return
    auth_itasks = {}
    for itask in itasks:
        if (itask.task_host, itask.task_owner) not in auth_itasks:
            auth_itasks[(itask.task_host, itask.task_owner)] = []
        auth_itasks[(itask.task_host, itask.task_owner)].append(itask)
    for (host, owner), itasks in sorted(auth_itasks.items()):
        cmd = ["cylc", cmd_key]
        if cylc.flags.debug:
            cmd.append("--debug")
        if is_remote_host(host):
            cmd.append("--host=%s" % (host))
        if is_remote_user(owner):
            cmd.append("--user=%s" % (owner))
        cmd.append("--")
        cmd.append(
            GLOBAL_CFG.get_derived_host_item(
                suite, "suite job log directory", host, owner))
        job_log_dirs = []
        for itask in sorted(itasks, key=lambda itask: itask.identity):
            job_log_dirs.append(
                self.task_events_mgr.get_task_job_id(
                    itask.point, itask.tdef.name, itask.submit_num))
        cmd += job_log_dirs
        self.proc_pool.put_command(
            SuiteProcContext(cmd_key, cmd), callback, [suite, itasks])
async def scan_one(reg, host, port, timeout=None, methods=None):
    if not methods:
        methods = ['identify']

    if is_remote_host(host):
        try:
            host = get_host_ip_by_name(host)  # IP reduces DNS traffic
        except socket.error as exc:
            if cylc.flags.debug:
                raise
            sys.stderr.write("ERROR: %s: %s\n" % (exc, host))
            return (reg, host, port, None)

    # NOTE: Connect to the suite by host:port; this way the
    # SuiteRuntimeClient will not attempt to check the contact file,
    # which would be unnecessary as we have already done so.
    # NOTE: This part of the scan *is* IO blocking.
    client = SuiteRuntimeClient(reg, host=host, port=port, timeout=timeout)

    result = {}
    for method in methods:
        # work our way up the chain of identity methods, extract as much
        # information as we can before the suite rejects us
        try:
            msg = await client.async_request(method)
        except ClientTimeout as exc:
            return (reg, host, port, MSG_TIMEOUT)
        except ClientError as exc:
            return (reg, host, port, result or None)
        else:
            result.update(msg)
    return (reg, host, port, result)
def construct_newtree(self):
    """construct self.newtree[one][two]...[nnn] = [auth, descr, dir ]"""
    regd_choices = {}
    for suite, suite_dir, descr in sorted(self.regd_choices):
        regd_choices[suite] = (suite, suite_dir, descr)

    self.newtree = {}
    for suite, auth in self.running_choices:
        if suite in regd_choices:
            if is_remote_host(auth.split(':', 1)[0]):
                descr, suite_dir = (None, None)
            else:
                # local suite
                suite_dir, descr = regd_choices[suite][1:3]
            del regd_choices[suite]
        nest2 = self.newtree
        regp = suite.split(SuiteSrvFilesManager.DELIM)
        for key in regp[:-1]:
            if key not in nest2:
                nest2[key] = {}
            nest2 = nest2[key]
        nest2[(regp[-1], suite, auth)] = [auth, descr, suite_dir]

    for suite, suite_dir, descr in regd_choices.values():
        suite_dir = re.sub('^' + os.environ['HOME'], '~', suite_dir)
        nest2 = self.newtree
        regp = suite.split(SuiteSrvFilesManager.DELIM)
        for key in regp[:-1]:
            if key not in nest2:
                nest2[key] = {}
            nest2 = nest2[key]
        nest2[(regp[-1], suite, '-')] = ['-', descr, suite_dir]
def _scan_item(timeout, my_uuid, srv_files_mgr, item):
    """Connect to item host:port (item) to get suite identify."""
    host, port = item
    host_anon = host
    if is_remote_host(host):
        try:
            host_anon = get_host_ip_by_name(host)  # IP reduces DNS traffic
        except socket.error as exc:
            if cylc.flags.debug:
                raise
            sys.stderr.write("ERROR: %s: %s\n" % (exc, host))
            return (host, port, None)
    client = SuiteRuntimeServiceClient(
        None, host=host_anon, port=port, my_uuid=my_uuid, timeout=timeout,
        auth=SuiteRuntimeServiceClient.ANON_AUTH)
    try:
        result = client.identify()
    except ClientTimeout:
        return (host, port, MSG_TIMEOUT)
    except ClientError:
        return (host, port, None)
    else:
        owner = result.get(KEY_OWNER)
        name = result.get(KEY_NAME)
        states = result.get(KEY_STATES, None)
        if cylc.flags.debug:
            sys.stderr.write(' suite: %s %s\n' % (name, owner))
        if states is None:
            # This suite keeps its state info private.
            # Try again with the passphrase if I have it.
            try:
                pphrase = srv_files_mgr.get_auth_item(
                    srv_files_mgr.FILE_BASE_PASSPHRASE,
                    name, owner, host, content=True)
            except SuiteServiceFileError:
                pass
            else:
                if pphrase:
                    client.suite = name
                    client.owner = owner
                    client.auth = None
                    try:
                        result = client.identify()
                    except ClientError:
                        # Nope (private suite, wrong passphrase).
                        if cylc.flags.debug:
                            sys.stderr.write(' (wrong passphrase)\n')
                    else:
                        if cylc.flags.debug:
                            sys.stderr.write(
                                ' (got states with passphrase)\n')
        return (host, port, result)
def remote_tidy(self):
    """Remove suite contact files from initialised remotes.

    Call "cylc remote-tidy".
    This method is called on suite shutdown, so we want nothing to hang.
    Timeout any incomplete commands after 10 seconds.

    Also remove UUID file on suite host ".service/uuid".
    """
    # Remove UUID file
    uuid_fname = os.path.join(
        self.suite_srv_files_mgr.get_suite_srv_dir(self.suite),
        FILE_BASE_UUID)
    try:
        os.unlink(uuid_fname)
    except OSError:
        pass
    # Issue all SSH commands in parallel
    procs = {}
    for (host, owner), init_with_contact in self.remote_init_map.items():
        if init_with_contact != REMOTE_INIT_DONE:
            continue
        cmd = ['timeout', '10', 'cylc', 'remote-tidy']
        if is_remote_host(host):
            cmd.append('--host=%s' % host)
        if is_remote_user(owner):
            cmd.append('--user=%s' % owner)
        if cylc.flags.debug:
            cmd.append('--debug')
        cmd.append(os.path.join(glbl_cfg().get_derived_host_item(
            self.suite, 'suite run directory', host, owner)))
        procs[(host, owner)] = (
            cmd,
            Popen(cmd, stdout=PIPE, stderr=PIPE, stdin=open(os.devnull)))
    # Wait for commands to complete for a max of 10 seconds
    timeout = time() + 10.0
    while procs and time() < timeout:
        for (host, owner), (cmd, proc) in procs.copy().items():
            if proc.poll() is None:
                continue
            del procs[(host, owner)]
            out, err = (f.decode() for f in proc.communicate())
            if proc.wait():
                LOG.warning(TaskRemoteMgmtError(
                    TaskRemoteMgmtError.MSG_TIDY, (host, owner),
                    ' '.join(quote(item) for item in cmd),
                    proc.returncode, out, err))
    # Terminate any remaining commands
    for (host, owner), (cmd, proc) in procs.items():
        try:
            proc.terminate()
        except OSError:
            pass
        out, err = proc.communicate()
        if proc.wait():
            LOG.warning(TaskRemoteMgmtError(
                TaskRemoteMgmtError.MSG_TIDY, (host, owner),
                ' '.join(quote(item) for item in cmd),
                proc.returncode, out, err))
def main(is_restart=False):
    """CLI main."""
    options, args = parse_commandline(is_restart)
    if not args:
        # Auto-registration: "cylc run" (no args) in source dir.
        try:
            reg = SuiteSrvFilesManager().register()
        except SuiteServiceFileError as exc:
            sys.exit(exc)
        # Replace this process with "cylc run REG ..." for 'ps -f'.
        os.execv(sys.argv[0], [sys.argv[0]] + [reg] + sys.argv[1:])

    # Check suite is not already running before start of host selection.
    try:
        SuiteSrvFilesManager().detect_old_contact_file(args[0])
    except SuiteServiceFileError as exc:
        sys.exit(exc)

    # Create auth files if needed. On a shared FS if the suite host changes
    # this may (will?) renew the ssl.cert to reflect the change in host name.
    SuiteSrvFilesManager().create_auth_files(args[0])

    # Check whether a run host is explicitly specified, else select one.
    if not options.host:
        try:
            host = HostAppointer().appoint_host()
        except EmptyHostList as exc:
            if cylc.flags.debug:
                raise
            else:
                sys.exit(str(exc))
        if is_remote_host(host):
            if is_restart:
                base_cmd = ["restart"] + sys.argv[1:]
            else:
                base_cmd = ["run"] + sys.argv[1:]
            # Prevent recursive host selection
            base_cmd.append("--host=localhost")
            return remote_cylc_cmd(base_cmd, host=host)
    if remrun(set_rel_local=True):
        # State localhost as above.
        sys.exit()

    try:
        SuiteSrvFilesManager().get_suite_source_dir(args[0], options.owner)
    except SuiteServiceFileError:
        # Source path is assumed to be the run directory
        SuiteSrvFilesManager().register(
            args[0],
            glbl_cfg().get_derived_host_item(args[0], 'suite run directory'))

    try:
        scheduler = Scheduler(is_restart, options, args)
    except SuiteServiceFileError as exc:
        sys.exit(exc)
    scheduler.start()
def remote_host_select(self, host_str):
    """Evaluate a task host string.

    Arguments:
        host_str (str):
            An explicit host name, a command in back-tick or $(command)
            format, or an environment variable holding a hostname.

    Return (str):
        None if evaluation of host_str is still taking place.
        'localhost' if host_str is not defined or if the evaluated host
        name is equivalent to 'localhost'.
        Otherwise, return the evaluated host name on success.

    Raise TaskRemoteMgmtError on error.

    """
    if not host_str:
        return 'localhost'

    # Host selection command: $(command) or `command`
    match = REC_COMMAND.match(host_str)
    if match:
        cmd_str = match.groups()[1]
        if cmd_str in self.remote_host_str_map:
            # Command recently launched
            value = self.remote_host_str_map[cmd_str]
            if isinstance(value, TaskRemoteMgmtError):
                raise value  # command failed
            elif value is None:
                return  # command not yet ready
            else:
                host_str = value  # command succeeded
        else:
            # Command not launched (or already reset)
            timeout = glbl_cfg().get(['task host select command timeout'])
            if timeout:
                cmd = ['timeout', str(int(timeout)), 'bash', '-c', cmd_str]
            else:
                cmd = ['bash', '-c', cmd_str]
            self.proc_pool.put_command(
                SuiteProcContext(
                    'remote-host-select', cmd, env=dict(os.environ)),
                self._remote_host_select_callback, [cmd_str])
            self.remote_host_str_map[cmd_str] = None
            return self.remote_host_str_map[cmd_str]

    # Environment variable substitution
    host_str = os.path.expandvars(host_str)
    # Remote?
    if is_remote_host(host_str):
        return host_str
    else:
        return 'localhost'
def remote_host_select(self, host_str):
    """Evaluate a task host string.

    Arguments:
        host_str (str):
            An explicit host name, a command in back-tick or $(command)
            format, or an environment variable holding a hostname.

    Return (str):
        None if evaluation of host_str is still taking place.
        'localhost' if host_str is not defined or if the evaluated host
        name is equivalent to 'localhost'.
        Otherwise, return the evaluated host name on success.

    Raise TaskRemoteMgmtError on error.

    """
    if not host_str:
        return 'localhost'

    # Host selection command: $(command) or `command`
    match = REC_COMMAND.match(host_str)
    if match:
        cmd_str = match.groups()[1]
        if cmd_str in self.remote_host_str_map:
            # Command recently launched
            value = self.remote_host_str_map[cmd_str]
            if isinstance(value, TaskRemoteMgmtError):
                raise value  # command failed
            elif value is None:
                return  # command not yet ready
            else:
                host_str = value  # command succeeded
        else:
            # Command not launched (or already reset)
            timeout = glbl_cfg().get(['task host select command timeout'])
            if timeout:
                cmd = ['timeout', str(int(timeout)), 'bash', '-c', cmd_str]
            else:
                cmd = ['bash', '-c', cmd_str]
            self.proc_pool.put_command(
                SubProcContext(
                    'remote-host-select', cmd, env=dict(os.environ)),
                self._remote_host_select_callback, [cmd_str])
            self.remote_host_str_map[cmd_str] = None
            return self.remote_host_str_map[cmd_str]

    # Environment variable substitution
    host_str = os.path.expandvars(host_str)
    # Remote?
    if is_remote_host(host_str):
        return host_str
    else:
        return 'localhost'
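# Hedged usage sketch for remote_host_select() above (names such as
# `task_remote_mgr`, `queue_for_submission` and the selector command are
# hypothetical, for illustration only).  The method is non-blocking: while
# the $(command) host selector is still running in the process pool it
# returns None, so the caller simply retries on a later main-loop pass.
host = task_remote_mgr.remote_host_select('$(my-host-selector)')
if host is None:
    pass  # selection still in progress; ask again on the next pass
else:
    queue_for_submission(host)  # 'localhost' or the evaluated remote host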
def main(is_restart=False):
    """CLI main."""
    options, args = parse_commandline(is_restart)
    if not args:
        # Auto-registration: "cylc run" (no args) in source dir.
        try:
            reg = SuiteSrvFilesManager().register()
        except SuiteServiceFileError as exc:
            sys.exit(exc)
        # Replace this process with "cylc run REG ..." for 'ps -f'.
        os.execv(sys.argv[0], [sys.argv[0]] + [reg] + sys.argv[1:])

    # Check suite is not already running before start of host selection.
    try:
        SuiteSrvFilesManager().detect_old_contact_file(args[0])
    except SuiteServiceFileError as exc:
        sys.exit(exc)

    # Create auth files if needed.
    SuiteSrvFilesManager().create_auth_files(args[0])

    # Check whether a run host is explicitly specified, else select one.
    if not options.host:
        try:
            host = HostAppointer().appoint_host()
        except EmptyHostList as exc:
            if cylc.flags.debug:
                raise
            else:
                sys.exit(str(exc))
        if is_remote_host(host):
            if is_restart:
                base_cmd = ["restart"] + sys.argv[1:]
            else:
                base_cmd = ["run"] + sys.argv[1:]
            # Prevent recursive host selection
            base_cmd.append("--host=localhost")
            return remote_cylc_cmd(base_cmd, host=host)
    if remrun(set_rel_local=True):
        # State localhost as above.
        sys.exit()

    try:
        SuiteSrvFilesManager().get_suite_source_dir(args[0], options.owner)
    except SuiteServiceFileError:
        # Source path is assumed to be the run directory
        SuiteSrvFilesManager().register(
            args[0],
            glbl_cfg().get_derived_host_item(args[0], 'suite run directory'))

    try:
        scheduler = Scheduler(is_restart, options, args)
    except SuiteServiceFileError as exc:
        sys.exit(exc)
    scheduler.start()
def _get_host_metrics(self):
    """Run "cylc get-host-metrics" commands on hosts.

    Return (dict): {host: host-metrics-dict, ...}
    """
    host_stats = {}
    # Run "cylc get-host-metrics" commands on hosts
    host_proc_map = {}
    cmd = [self.CMD_BASE] + sorted(self._get_host_metrics_opts())
    # Start up commands on hosts
    for host in self.hosts:
        if is_remote_host(host):
            host_proc_map[host] = remote_cylc_cmd(
                cmd, stdin=None, host=host, capture_process=True)
        elif 'localhost' in host_proc_map:
            continue  # Don't duplicate localhost
        else:  # 1st instance of localhost
            host_proc_map['localhost'] = run_cmd(
                ['cylc'] + cmd, capture_process=True)
    # Collect results from commands
    while host_proc_map:
        for host, proc in list(host_proc_map.copy().items()):
            if proc.poll() is None:
                continue
            del host_proc_map[host]
            out, err = (f.decode() for f in proc.communicate())
            if proc.wait():
                # Command failed in verbose/debug mode
                LOG.warning(
                    "can't get host metric from '%s'" +
                    "%s # returncode=%d, err=%s\n",
                    host, ' '.join((quote(item) for item in cmd)),
                    proc.returncode, err)
            else:
                # Command OK
                # Users may have profile scripts that write to STDOUT.
                # Drop all output lines until the first character of a
                # line is '{'. Hopefully this is enough to find us the
                # first line that denotes the beginning of the expected
                # JSON data structure.
                out = ''.join(
                    dropwhile(lambda s: not s.startswith('{'),
                              out.splitlines(True)))
                host_stats[host] = json.loads(out)
        sleep(0.01)
    return host_stats
def _scan_item(timeout, my_uuid, srv_files_mgr, item):
    """Connect to item host:port (item) to get suite identify."""
    host, port = item
    host_anon = host
    if is_remote_host(host):
        host_anon = get_host_ip_by_name(host)  # IP reduces DNS traffic
    client = SuiteRuntimeServiceClient(
        None, host=host_anon, port=port, my_uuid=my_uuid, timeout=timeout,
        auth=SuiteRuntimeServiceClient.ANON_AUTH)
    try:
        result = client.identify()
    except ClientTimeout:
        return (host, port, MSG_TIMEOUT)
    except ClientError:
        return (host, port, None)
    else:
        owner = result.get(KEY_OWNER)
        name = result.get(KEY_NAME)
        states = result.get(KEY_STATES, None)
        if cylc.flags.debug:
            sys.stderr.write(' suite: %s %s\n' % (name, owner))
        if states is None:
            # This suite keeps its state info private.
            # Try again with the passphrase if I have it.
            try:
                pphrase = srv_files_mgr.get_auth_item(
                    srv_files_mgr.FILE_BASE_PASSPHRASE,
                    name, owner, host, content=True)
            except SuiteServiceFileError:
                pass
            else:
                if pphrase:
                    client.suite = name
                    client.owner = owner
                    client.auth = None
                    try:
                        result = client.identify()
                    except ClientError:
                        # Nope (private suite, wrong passphrase).
                        if cylc.flags.debug:
                            sys.stderr.write(' (wrong passphrase)\n')
                    else:
                        if cylc.flags.debug:
                            sys.stderr.write(
                                ' (got states with passphrase)\n')
        return (host, port, result)
def _get_host_metrics(self):
    """Run "cylc get-host-metrics" commands on hosts.

    Return (dict): {host: host-metrics-dict, ...}
    """
    host_stats = {}
    # Run "cylc get-host-metrics" commands on hosts
    host_proc_map = {}
    cmd = [self.CMD_BASE] + sorted(self._get_host_metrics_opts())
    # Start up commands on hosts
    for host in self.hosts:
        if is_remote_host(host):
            host_proc_map[host] = remote_cylc_cmd(
                cmd, stdin=None, host=host, capture_process=True)
        elif 'localhost' in host_proc_map:
            continue  # Don't duplicate localhost
        else:  # 1st instance of localhost
            host_proc_map['localhost'] = run_cmd(
                ['cylc'] + cmd, capture_process=True)
    # Collect results from commands
    while host_proc_map:
        for host, proc in host_proc_map.copy().items():
            if proc.poll() is None:
                continue
            del host_proc_map[host]
            out, err = proc.communicate()
            if proc.wait():
                # Command failed in verbose/debug mode
                LOG.warning(
                    "can't get host metric from '%s'" +
                    "%s # returncode=%d, err=%s\n",
                    host, ' '.join((quote(item) for item in cmd)),
                    proc.returncode, err)
            else:
                # Command OK
                host_stats[host] = json.loads(out)
        sleep(0.01)
    return host_stats
def remote_init(self, host, owner):
    """Initialise a remote [owner@]host if necessary.

    Create UUID file on suite host ".service/uuid" for remotes to identify
    shared file system with suite host.

    Call "cylc remote-init" to install suite items to remote:
        ".service/contact": HTTP(S) and SSH+HTTP(S) task comm
        ".service/passphrase": HTTP(S) task comm
        ".service/ssl.cert": HTTPS task comm
        "python/": if source exists

    Return:
        REMOTE_INIT_NOT_REQUIRED: If remote init is not required, e.g. not
            remote
        REMOTE_INIT_DONE: If remote init done.
        REMOTE_INIT_FAILED: If init of the remote failed.
            Note: this will reset to None to allow retry.
        None: If waiting for remote init command to complete

    """
    if self.single_task_mode or not is_remote(host, owner):
        return REMOTE_INIT_NOT_REQUIRED
    try:
        status = self.remote_init_map[(host, owner)]
    except KeyError:
        pass  # Not yet initialised
    else:
        if status == REMOTE_INIT_FAILED:
            del self.remote_init_map[(host, owner)]  # reset to allow retry
        return status

    # Determine what items to install
    items = self._remote_init_items(host, owner)
    # No item to install
    if not items:
        self.remote_init_map[(host, owner)] = REMOTE_INIT_NOT_REQUIRED
        return self.remote_init_map[(host, owner)]

    # Create "stdin_file_paths" file, with "items" in it.
    tmphandle = NamedTemporaryFile()
    tarhandle = tarfile.open(fileobj=tmphandle, mode='w')
    for path, arcname in items:
        tarhandle.add(path, arcname=arcname)
    tarhandle.close()
    tmphandle.seek(0)
    # UUID file - for remote to identify shared file system with suite host
    uuid_fname = os.path.join(
        self.suite_srv_files_mgr.get_suite_srv_dir(self.suite), 'uuid')
    if not os.path.exists(uuid_fname):
        open(uuid_fname, 'wb').write(str(self.uuid))
    # Build the command
    cmd = ['cylc', 'remote-init']
    if is_remote_host(host):
        cmd.append('--host=%s' % host)
    if is_remote_user(owner):
        cmd.append('--user=%s' % owner)
    if cylc.flags.debug:
        cmd.append('--debug')
    cmd.append(str(self.uuid))
    cmd.append(glbl_cfg().get_derived_host_item(
        self.suite, 'suite run directory', host, owner))
    self.proc_pool.put_command(
        SuiteProcContext(
            'remote-init', cmd, stdin_file_paths=[tmphandle.name]),
        self._remote_init_callback, [host, owner, tmphandle])
    # None status: Waiting for command to finish
    self.remote_init_map[(host, owner)] = None
    return self.remote_init_map[(host, owner)]
def _load_remote_item(self, item, reg, owner, host):
    """Load content of service item from remote [owner@]host via SSH."""
    if not is_remote(host, owner):
        return
    if host is None:
        host = 'localhost'
    if owner is None:
        owner = get_user()
    if item == self.FILE_BASE_CONTACT and not is_remote_host(host):
        # Attempt to read suite contact file via the local filesystem.
        path = r'%(run_d)s/%(srv_base)s' % {
            'run_d': glbl_cfg().get_derived_host_item(
                reg, 'suite run directory', 'localhost', owner,
                replace_home=False),
            'srv_base': self.DIR_BASE_SRV,
        }
        content = self._load_local_item(item, path)
        if content is not None:
            return content
        # Else drop through and attempt via ssh to the suite account.
    # Prefix STDOUT to ensure returned content is relevant
    prefix = r'[CYLC-AUTH] %(suite)s' % {'suite': reg}
    # Attempt to cat passphrase file under suite service directory
    script = (
        r"""echo '%(prefix)s'; """
        r'''cat "%(run_d)s/%(srv_base)s/%(item)s"'''
    ) % {
        'prefix': prefix,
        'run_d': glbl_cfg().get_derived_host_item(
            reg, 'suite run directory', host, owner),
        'srv_base': self.DIR_BASE_SRV,
        'item': item
    }
    import shlex
    command = shlex.split(
        glbl_cfg().get_host_item('ssh command', host, owner))
    command += ['-n', owner + '@' + host, script]
    from subprocess import Popen, PIPE
    try:
        proc = Popen(
            command, stdin=open(os.devnull), stdout=PIPE, stderr=PIPE)
    except OSError:
        if cylc.flags.debug:
            import traceback
            traceback.print_exc()
        return
    out, err = proc.communicate()
    ret_code = proc.wait()
    # Extract passphrase from STDOUT
    # It should live in the line with the correct prefix
    content = ""
    can_read = False
    for line in out.splitlines(True):
        if can_read:
            content += line
        elif line.strip() == prefix:
            can_read = True
    if not content or ret_code:
        LOG.debug(
            '$ %(command)s # code=%(ret_code)s\n%(err)s',
            {
                'command': command,
                # STDOUT may contain passphrase, so not safe to print
                # 'out': out,
                'err': err,
                'ret_code': ret_code,
            })
        return
    return content
def detect_old_contact_file(self, reg, check_host_port=None):
    """Detect old suite contact file.

    If an old contact file does not exist, do nothing. If one does exist
    but the suite process is definitely not alive, remove it. If one
    exists and the suite process is still alive, raise
    SuiteServiceFileError.

    If check_host_port is specified and does not match the (host, port)
    value in the old contact file, raise AssertionError.

    Args:
        reg (str): suite name
        check_host_port (tuple): (host, port) to check against

    Raise:
        AssertionError:
            If old contact file exists but does not have matching
            (host, port) with value of check_host_port.
        SuiteServiceFileError:
            If old contact file exists and the suite process is still
            alive.
    """
    # An old suite of the same name may be running if a contact file exists
    # and can be loaded.
    try:
        data = self.load_contact_file(reg)
        old_host = data[self.KEY_HOST]
        old_port = data[self.KEY_PORT]
        old_proc_str = data[self.KEY_PROCESS]
    except (IOError, ValueError, SuiteServiceFileError):
        # Contact file does not exist or corrupted, should be OK to proceed
        return
    if check_host_port and check_host_port != (old_host, int(old_port)):
        raise AssertionError("%s != (%s, %s)" % (
            check_host_port, old_host, old_port))
    # Run the "ps" command to see if the process is still running or not.
    # If the old suite process is still running, it should show up with the
    # same command line as before.
    # Terminate command after 10 seconds to prevent hanging, etc.
    old_pid_str = old_proc_str.split(None, 1)[0].strip()
    cmd = ["timeout", "10", "ps", self.PS_OPTS, str(old_pid_str)]
    if is_remote_host(old_host):
        import shlex
        ssh_str = str(glbl_cfg().get_host_item("ssh command", old_host))
        cmd = shlex.split(ssh_str) + ["-n", old_host] + cmd
    from subprocess import Popen, PIPE
    from time import sleep, time
    proc = Popen(cmd, stdin=open(os.devnull), stdout=PIPE, stderr=PIPE)
    # Terminate command after 10 seconds to prevent hanging SSH, etc.
    timeout = time() + 10.0
    while proc.poll() is None:
        if time() > timeout:
            proc.terminate()
        sleep(0.1)
    fname = self.get_contact_file(reg)
    ret_code = proc.wait()
    out, err = proc.communicate()
    if ret_code:
        LOG.debug("$ %s # return %d\n%s", ' '.join(cmd), ret_code, err)
    for line in reversed(out.splitlines()):
        if line.strip() == old_proc_str:
            # Suite definitely still running
            break
        elif line.split(None, 1)[0].strip() == "PID":
            # Only "ps" header - "ps" has run, but no matching results.
            # Suite not running. Attempt to remove suite contact file.
            try:
                os.unlink(fname)
                return
            except OSError:
                break
    raise SuiteServiceFileError(
        (r"""ERROR, suite contact file exists: %(fname)s

Suite "%(suite)s" is already running, and listening at "%(host)s:%(port)s".

To start a new run, stop the old one first with one or more of these:
* cylc stop %(suite)s              # wait for active tasks/event handlers
* cylc stop --kill %(suite)s       # kill active tasks and wait
* cylc stop --now %(suite)s        # don't wait for active tasks
* cylc stop --now --now %(suite)s  # don't wait
* ssh -n "%(host)s" kill %(pid)s   # final brute force!
""") % {
            "host": old_host,
            "port": old_port,
            "pid": old_pid_str,
            "fname": fname,
            "suite": reg,
        })
def test_is_remote_host_on_localhost(self):
    """is_remote_host with localhost."""
    self.assertFalse(is_remote_host(None))
    self.assertFalse(is_remote_host('localhost'))
    self.assertFalse(is_remote_host(os.getenv('HOSTNAME')))
    self.assertFalse(is_remote_host(get_host()))
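# Illustrative sketch (not the Cylc implementation): the unit test above
# only pins down the contract of is_remote_host() for local values, i.e.
# None, 'localhost', $HOSTNAME and get_host() must all be treated as local.
# A minimal standalone approximation of such a check, assuming name
# resolution via the standard socket module, could look like this:
import socket


def is_remote_host_sketch(name):
    """Return True if `name` appears to refer to another machine.

    Hypothetical helper for illustration only; unresolvable names are
    conservatively treated as remote.
    """
    if not name or name == 'localhost':
        return False
    try:
        return socket.gethostbyname(name) != socket.gethostbyname(
            socket.gethostname())
    except socket.error:
        return True


# Mirrors the assertions in the unit test above:
assert is_remote_host_sketch(None) is False
assert is_remote_host_sketch('localhost') is False
assert is_remote_host_sketch(socket.gethostname()) is False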
def scan_many(items=None, timeout=None, updater=None):
    """Call "identify" method of suites on many host:port.

    Args:
        items (list): list of 'host' string or ('host', port) tuple to scan.
        timeout (float): connection timeout, default is CONNECT_TIMEOUT.
        updater (object): quit scan cleanly if updater.quit is set.

    Return:
        list: [(host, port, identify_result), ...]
    """
    try:
        timeout = float(timeout)
    except (TypeError, ValueError):
        timeout = CONNECT_TIMEOUT
    my_uuid = uuid4()
    # Determine hosts to scan
    if not items:
        items = GLOBAL_CFG.get(["suite host scanning", "hosts"])
    # Ensure that it does "localhost" only once
    items = set(items)
    for item in list(items):
        if not isinstance(item, tuple) and not is_remote_host(item):
            items.remove(item)
            items.add("localhost")
    # To do and wait (submitted, waiting for results) sets
    todo_set = set()
    wait_set = set()
    # Determine ports to scan
    base_port = None
    max_ports = None
    for item in items:
        if isinstance(item, tuple):
            # Assume item is ("host", port)
            todo_set.add(item)
        else:
            # Full port range for a host
            if base_port is None or max_ports is None:
                base_port = GLOBAL_CFG.get(['communication', 'base port'])
                max_ports = GLOBAL_CFG.get(
                    ['communication', 'maximum number of ports'])
            for port in range(base_port, base_port + max_ports):
                todo_set.add((item, port))
    proc_items = []
    results = []
    # Number of child processes
    max_procs = GLOBAL_CFG.get(["process pool size"])
    if max_procs is None:
        max_procs = cpu_count()
    try:
        while todo_set or proc_items:
            no_action = True
            # Get results back from child processes where possible
            busy_proc_items = []
            while proc_items:
                if updater and updater.quit:
                    raise KeyboardInterrupt()
                proc, my_conn, terminate_time = proc_items.pop()
                if my_conn.poll():
                    host, port, result = my_conn.recv()
                    if result is None:
                        # Can't connect, ignore
                        wait_set.remove((host, port))
                    elif result == MSG_TIMEOUT:
                        # Connection timeout, leave in "wait_set"
                        pass
                    else:
                        # Connection success
                        results.append((host, port, result))
                        wait_set.remove((host, port))
                    if todo_set:
                        # Immediately give the child process something to do
                        host, port = todo_set.pop()
                        wait_set.add((host, port))
                        my_conn.send((host, port))
                        busy_proc_items.append(
                            (proc, my_conn, time() + INACTIVITY_TIMEOUT))
                    else:
                        # Or quit if there is nothing left to do
                        my_conn.send(MSG_QUIT)
                        my_conn.close()
                        proc.join()
                    no_action = False
                elif time() > terminate_time:
                    # Terminate child process if it is taking too long
                    proc.terminate()
                    proc.join()
                    no_action = False
                else:
                    busy_proc_items.append((proc, my_conn, terminate_time))
            proc_items += busy_proc_items
            # Create some child processes where necessary
            while len(proc_items) < max_procs and todo_set:
                if updater and updater.quit:
                    raise KeyboardInterrupt()
                my_conn, conn = Pipe()
                try:
                    proc = Process(target=_scan_worker,
                                   args=(conn, timeout, my_uuid))
                except OSError:
                    # Die if unable to start any worker process.
                    # OK to wait and see if any worker process already running.
                    if not proc_items:
                        raise
                    if cylc.flags.debug:
                        traceback.print_exc()
                else:
                    proc.start()
                    host, port = todo_set.pop()
                    wait_set.add((host, port))
                    my_conn.send((host, port))
                    proc_items.append(
                        (proc, my_conn, time() + INACTIVITY_TIMEOUT))
                    no_action = False
            if no_action:
                sleep(SLEEP_INTERVAL)
    except KeyboardInterrupt:
        return []
    # Report host:port with no results
    if wait_set:
        sys.stderr.write(
            'WARNING, scan timed out, no result for the following:\n')
        for key in sorted(wait_set):
            sys.stderr.write(' %s:%s\n' % key)
    return results
def _load_remote_item(self, item, reg, owner, host):
    """Load content of service item from remote [owner@]host via SSH."""
    if not is_remote(host, owner):
        return
    if host is None:
        host = 'localhost'
    if owner is None:
        owner = get_user()
    if item == self.FILE_BASE_CONTACT and not is_remote_host(host):
        # Attempt to read suite contact file via the local filesystem.
        path = r'%(run_d)s/%(srv_base)s' % {
            'run_d': glbl_cfg().get_derived_host_item(
                reg, 'suite run directory', 'localhost', owner,
                replace_home=False),
            'srv_base': self.DIR_BASE_SRV,
        }
        content = self._load_local_item(item, path)
        if content is not None:
            return content
        # Else drop through and attempt via ssh to the suite account.
    # Prefix STDOUT to ensure returned content is relevant
    prefix = r'[CYLC-AUTH] %(suite)s' % {'suite': reg}
    # Attempt to cat passphrase file under suite service directory
    script = (
        r"""echo '%(prefix)s'; """
        r'''cat "%(run_d)s/%(srv_base)s/%(item)s"'''
    ) % {
        'prefix': prefix,
        'run_d': glbl_cfg().get_derived_host_item(
            reg, 'suite run directory', host, owner),
        'srv_base': self.DIR_BASE_SRV,
        'item': item
    }
    import shlex
    command = shlex.split(
        glbl_cfg().get_host_item('ssh command', host, owner))
    command += ['-n', owner + '@' + host, script]
    from subprocess import Popen, PIPE
    try:
        proc = Popen(
            command, stdin=open(os.devnull), stdout=PIPE, stderr=PIPE)
    except OSError:
        if cylc.flags.debug:
            import traceback
            traceback.print_exc()
        return
    out, err = (f.decode() for f in proc.communicate())
    ret_code = proc.wait()
    # Extract passphrase from STDOUT
    # It should live in the line with the correct prefix
    content = ""
    can_read = False
    for line in out.splitlines(True):
        if can_read:
            content += line
        elif line.strip() == prefix:
            can_read = True
    if not content or ret_code:
        LOG.debug(
            '$ %(command)s # code=%(ret_code)s\n%(err)s',
            {
                'command': command,
                # STDOUT may contain passphrase, so not safe to print
                # 'out': out,
                'err': err,
                'ret_code': ret_code,
            })
        return
    return content
# 1) host selection command: $(command) or `command`
match = REC_COMMAND.match(host)
if match:
    # extract the command and execute it
    hs_command = match.groups()[1]
    timeout = GLOBAL_CFG.get(["task host select command timeout"])
    is_ok, outlines = run_get_stdout(hs_command, timeout)
    if is_ok:
        # host selection command succeeded
        host = outlines[0]
    else:
        # host selection command failed
        raise HostSelectError(host, "\n".join(outlines))

# 2) environment variable: ${VAR} or $VAR
# (any quotes are stripped by file parsing)
match = REC_ENVIRON.match(host)
if match:
    name = match.groups()[0]
    try:
        host = os.environ[name]
    except KeyError as exc:
        raise HostSelectError(host, "Variable not defined: " + str(exc))

try:
    if is_remote_host(host):
        return host
    else:
        return "localhost"
except Exception:
    return host
def scan_many(items, timeout=None, updater=None):
    """Call "identify" method of suites on many host:port.

    Args:
        items (list): list of 'host' string or ('host', port) tuple to scan.
        timeout (float): connection timeout, default is CONNECT_TIMEOUT.
        updater (object): quit scan cleanly if updater.quit is set.

    Return:
        list: [(host, port, identify_result), ...]
    """
    if not items:
        return []
    try:
        timeout = float(timeout)
    except (TypeError, ValueError):
        timeout = CONNECT_TIMEOUT
    my_uuid = uuid4()
    # Ensure that it does "localhost" only once
    items = set(items)
    for item in list(items):
        if not isinstance(item, tuple) and not is_remote_host(item):
            items.remove(item)
            items.add("localhost")
    # To do and wait (submitted, waiting for results) sets
    todo_set = set()
    wait_set = set()
    # Determine ports to scan
    base_port = None
    max_ports = None
    for item in items:
        if isinstance(item, tuple):
            # Assume item is ("host", port)
            todo_set.add(item)
        else:
            # Full port range for a host
            if base_port is None or max_ports is None:
                base_port = glbl_cfg().get(['communication', 'base port'])
                max_ports = glbl_cfg().get(
                    ['communication', 'maximum number of ports'])
            for port in range(base_port, base_port + max_ports):
                todo_set.add((item, port))
    proc_items = []
    results = []
    # Number of child processes
    max_procs = glbl_cfg().get(["process pool size"])
    if max_procs is None:
        max_procs = cpu_count()
    try:
        while todo_set or proc_items:
            no_action = True
            # Get results back from child processes where possible
            busy_proc_items = []
            while proc_items:
                if updater and updater.quit:
                    raise KeyboardInterrupt()
                proc, my_conn, terminate_time = proc_items.pop()
                if my_conn.poll():
                    host, port, result = my_conn.recv()
                    if result is None:
                        # Can't connect, ignore
                        wait_set.remove((host, port))
                    elif result == MSG_TIMEOUT:
                        # Connection timeout, leave in "wait_set"
                        pass
                    else:
                        # Connection success
                        results.append((host, port, result))
                        wait_set.remove((host, port))
                    if todo_set:
                        # Immediately give the child process something to do
                        host, port = todo_set.pop()
                        wait_set.add((host, port))
                        my_conn.send((host, port))
                        busy_proc_items.append(
                            (proc, my_conn, time() + INACTIVITY_TIMEOUT))
                    else:
                        # Or quit if there is nothing left to do
                        my_conn.send(MSG_QUIT)
                        my_conn.close()
                        proc.join()
                    no_action = False
                elif time() > terminate_time:
                    # Terminate child process if it is taking too long
                    proc.terminate()
                    proc.join()
                    no_action = False
                else:
                    busy_proc_items.append((proc, my_conn, terminate_time))
            proc_items += busy_proc_items
            # Create some child processes where necessary
            while len(proc_items) < max_procs and todo_set:
                if updater and updater.quit:
                    raise KeyboardInterrupt()
                my_conn, conn = Pipe()
                try:
                    proc = Process(
                        target=_scan_worker, args=(conn, timeout, my_uuid))
                except OSError:
                    # Die if unable to start any worker process.
                    # OK to wait and see if any worker process already running.
                    if not proc_items:
                        raise
                    if cylc.flags.debug:
                        traceback.print_exc()
                else:
                    proc.start()
                    host, port = todo_set.pop()
                    wait_set.add((host, port))
                    my_conn.send((host, port))
                    proc_items.append(
                        (proc, my_conn, time() + INACTIVITY_TIMEOUT))
                    no_action = False
            if no_action:
                sleep(SLEEP_INTERVAL)
    except KeyboardInterrupt:
        return []
    # Report host:port with no results
    if wait_set:
        sys.stderr.write(
            'WARNING, scan timed out, no result for the following:\n')
        for key in sorted(wait_set):
            sys.stderr.write(' %s:%s\n' % key)
    return results
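# Hedged usage sketch for scan_many() above (the host names are made up):
# plain host names are expanded to the full configured port range, while
# (host, port) tuples probe a single port.
for host, port, info in scan_many(
        ['localhost', ('hpc-login-1', 43001)], timeout=5.0):
    print('%s:%s -> %s' % (host, port, info))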
def submit_task_jobs(self, suite, itasks, is_simulation=False):
    """Prepare and submit task jobs.

    Submit tasks where possible. Ignore tasks that are waiting for host
    select command to complete, or tasks that are waiting for remote
    initialisation. Bad host select command, error writing to a job file or
    bad remote initialisation will cause a bad task - leading to submission
    failure.

    This method uses prep_submit_task_job() as helper.

    Return (list): list of tasks that attempted submission.
    """
    if is_simulation:
        return self._simulation_submit_task_jobs(itasks)

    # Prepare tasks for job submission
    prepared_tasks, bad_tasks = self.prep_submit_task_jobs(suite, itasks)

    # Reset consumed host selection results
    self.task_remote_mgr.remote_host_select_reset()

    if not prepared_tasks:
        return bad_tasks

    # Group task jobs by (host, owner)
    auth_itasks = {}  # {(host, owner): [itask, ...], ...}
    for itask in prepared_tasks:
        auth_itasks.setdefault((itask.task_host, itask.task_owner), [])
        auth_itasks[(itask.task_host, itask.task_owner)].append(itask)
    # Submit task jobs for each (host, owner) group
    done_tasks = bad_tasks
    for (host, owner), itasks in sorted(auth_itasks.items()):
        is_init = self.task_remote_mgr.remote_init(host, owner)
        if is_init is None:
            # Remote is waiting to be initialised
            for itask in itasks:
                itask.set_summary_message(self.REMOTE_INIT_MSG)
            continue
        # Ensure that localhost background/at jobs are recorded as running
        # on the host name of the current suite host, rather than just
        # "localhost". On suite restart on a different suite host, this
        # allows the restart logic to correctly poll the status of the
        # background/at jobs that may still be running on the previous
        # suite host.
        if (
            self.batch_sys_mgr.is_job_local_to_host(
                itask.summary['batch_sys_name']) and
            not is_remote_host(host)
        ):
            owner_at_host = get_host()
        else:
            owner_at_host = host
        # Persist
        if owner:
            owner_at_host = owner + '@' + owner_at_host
        now_str = get_current_time_string()
        done_tasks.extend(itasks)
        for itask in itasks:
            # Log and persist
            LOG.info(
                '[%s] -submit-num=%d, owner@host=%s',
                itask, itask.submit_num, owner_at_host)
            self.suite_db_mgr.put_insert_task_jobs(itask, {
                'is_manual_submit': itask.is_manual_submit,
                'try_num': itask.get_try_num(),
                'time_submit': now_str,
                'user_at_host': owner_at_host,
                'batch_sys_name': itask.summary['batch_sys_name'],
            })
            itask.is_manual_submit = False
        if is_init == REMOTE_INIT_FAILED:
            # Remote has failed to initialise
            # Set submit-failed for all affected tasks
            for itask in itasks:
                itask.local_job_file_path = None  # reset for retry
                log_task_job_activity(
                    SubProcContext(
                        self.JOBS_SUBMIT,
                        '(init %s)' % owner_at_host,
                        err=REMOTE_INIT_FAILED,
                        ret_code=1),
                    suite, itask.point, itask.tdef.name)
                self.task_events_mgr.process_message(
                    itask, CRITICAL,
                    self.task_events_mgr.EVENT_SUBMIT_FAILED)
            continue
        # Build the "cylc jobs-submit" command
        cmd = ['cylc', self.JOBS_SUBMIT]
        if LOG.isEnabledFor(DEBUG):
            cmd.append('--debug')
        if get_utc_mode():
            cmd.append('--utc-mode')
        remote_mode = False
        kwargs = {}
        for key, value, test_func in [
                ('host', host, is_remote_host),
                ('user', owner, is_remote_user)]:
            if test_func(value):
                cmd.append('--%s=%s' % (key, value))
                remote_mode = True
                kwargs[key] = value
        if remote_mode:
            cmd.append('--remote-mode')
        cmd.append('--')
        cmd.append(glbl_cfg().get_derived_host_item(
            suite, 'suite job log directory', host, owner))
        # Chop itasks into a series of shorter lists if it's very big
        # to prevent overloading of stdout and stderr pipes.
        itasks = sorted(itasks, key=lambda itask: itask.identity)
        chunk_size = len(itasks) // ((len(itasks) // 100) + 1) + 1
        itasks_batches = [
            itasks[i:i + chunk_size]
            for i in range(0, len(itasks), chunk_size)]
        LOG.debug(
            '%s ... # will invoke in batches, sizes=%s',
            cmd, [len(b) for b in itasks_batches])
        for i, itasks_batch in enumerate(itasks_batches):
            stdin_files = []
            job_log_dirs = []
            for itask in itasks_batch:
                if remote_mode:
                    stdin_files.append(
                        get_task_job_job_log(
                            suite, itask.point, itask.tdef.name,
                            itask.submit_num))
                job_log_dirs.append(get_task_job_id(
                    itask.point, itask.tdef.name, itask.submit_num))
                # The job file is now (about to be) used: reset the file
                # write flag so that subsequent manual retrigger will
                # generate a new job file.
                itask.local_job_file_path = None
                itask.state.reset_state(TASK_STATUS_READY)
                if itask.state.outputs.has_custom_triggers():
                    self.suite_db_mgr.put_update_task_outputs(itask)
            self.proc_pool.put_command(
                SubProcContext(
                    self.JOBS_SUBMIT,
                    cmd + job_log_dirs,
                    stdin_files=stdin_files,
                    job_log_dirs=job_log_dirs,
                    **kwargs
                ),
                self._submit_task_jobs_callback, [suite, itasks_batch])
    return done_tasks
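# Worked example (illustration only) of the batching arithmetic used in
# submit_task_jobs() above: chunk_size is chosen so that no single
# "cylc jobs-submit" invocation gets much more than ~100 job-log-dir
# arguments, keeping the stdout/stderr pipes from being overloaded.
def batch_sizes(n_itasks):
    """Return the batch sizes produced for n_itasks prepared tasks."""
    chunk_size = n_itasks // ((n_itasks // 100) + 1) + 1
    return [min(chunk_size, n_itasks - i)
            for i in range(0, n_itasks, chunk_size)]


assert batch_sizes(50) == [50]           # a small list stays in one batch
assert batch_sizes(250) == [84, 84, 82]  # a big list splits roughly evenly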
def remote_init(self, host, owner):
    """Initialise a remote [owner@]host if necessary.

    Create UUID file on suite host ".service/uuid" for remotes to identify
    shared file system with suite host.

    Call "cylc remote-init" to install suite items to remote:
        ".service/contact": For TCP task communication
        ".service/passphrase": For TCP task communication
        "python/": if source exists

    Return:
        REMOTE_INIT_NOT_REQUIRED: If remote init is not required, e.g. not
            remote
        REMOTE_INIT_DONE: If remote init done.
        REMOTE_INIT_FAILED: If init of the remote failed.
            Note: this will reset to None to allow retry.
        None: If waiting for remote init command to complete

    """
    if self.single_task_mode or not is_remote(host, owner):
        return REMOTE_INIT_NOT_REQUIRED
    try:
        status = self.remote_init_map[(host, owner)]
    except KeyError:
        pass  # Not yet initialised
    else:
        if status == REMOTE_INIT_FAILED:
            del self.remote_init_map[(host, owner)]  # reset to allow retry
        return status

    # Determine what items to install
    comm_meth = glbl_cfg().get_host_item(
        'task communication method', host, owner)
    owner_at_host = 'localhost'
    if host:
        owner_at_host = host
    if owner:
        owner_at_host = owner + '@' + owner_at_host
    LOG.debug('comm_meth[%s]=%s' % (owner_at_host, comm_meth))
    items = self._remote_init_items(comm_meth)
    # No item to install
    if not items:
        self.remote_init_map[(host, owner)] = REMOTE_INIT_NOT_REQUIRED
        return self.remote_init_map[(host, owner)]

    # Create a TAR archive with the service files,
    # so they can be sent later via SSH's STDIN to the task remote.
    tmphandle = TemporaryFile()
    tarhandle = tarfile.open(fileobj=tmphandle, mode='w')
    for path, arcname in items:
        tarhandle.add(path, arcname=arcname)
    tarhandle.close()
    tmphandle.seek(0)
    # UUID file - for remote to identify shared file system with suite host
    uuid_fname = os.path.join(
        self.suite_srv_files_mgr.get_suite_srv_dir(self.suite),
        FILE_BASE_UUID)
    if not os.path.exists(uuid_fname):
        open(uuid_fname, 'wb').write(str(self.uuid_str).encode())
    # Build the command
    cmd = ['cylc', 'remote-init']
    if is_remote_host(host):
        cmd.append('--host=%s' % host)
    if is_remote_user(owner):
        cmd.append('--user=%s' % owner)
    if cylc.flags.debug:
        cmd.append('--debug')
    if comm_meth in ['ssh']:
        cmd.append('--indirect-comm=%s' % comm_meth)
    cmd.append(str(self.uuid_str))
    cmd.append(glbl_cfg().get_derived_host_item(
        self.suite, 'suite run directory', host, owner))
    self.proc_pool.put_command(
        SubProcContext('remote-init', cmd, stdin_files=[tmphandle]),
        self._remote_init_callback, [host, owner, tmphandle])
    # None status: Waiting for command to finish
    self.remote_init_map[(host, owner)] = None
    return self.remote_init_map[(host, owner)]
def detect_old_contact_file(self, reg, check_host_port=None):
    """Detect old suite contact file.

    If an old contact file does not exist, do nothing. If one does exist
    but the suite process is definitely not alive, remove it. If one
    exists and the suite process is still alive, raise
    SuiteServiceFileError.

    If check_host_port is specified and does not match the (host, port)
    value in the old contact file, raise AssertionError.

    Args:
        reg (str): suite name
        check_host_port (tuple): (host, port) to check against

    Raise:
        AssertionError:
            If old contact file exists but does not have matching
            (host, port) with value of check_host_port.
        SuiteServiceFileError:
            If old contact file exists and the suite process is still
            alive.
    """
    # An old suite of the same name may be running if a contact file exists
    # and can be loaded.
    try:
        data = self.load_contact_file(reg)
        old_host = data[self.KEY_HOST]
        old_port = data[self.KEY_PORT]
        old_proc_str = data[self.KEY_PROCESS]
    except (IOError, ValueError, SuiteServiceFileError):
        # Contact file does not exist or corrupted, should be OK to proceed
        return
    if check_host_port and check_host_port != (old_host, int(old_port)):
        raise AssertionError("%s != (%s, %s)" % (
            check_host_port, old_host, old_port))
    # Run the "ps" command to see if the process is still running or not.
    # If the old suite process is still running, it should show up with the
    # same command line as before.
    # Terminate command after 10 seconds to prevent hanging, etc.
    old_pid_str = old_proc_str.split(None, 1)[0].strip()
    cmd = ["timeout", "10", "ps", self.PS_OPTS, str(old_pid_str)]
    if is_remote_host(old_host):
        import shlex
        ssh_str = str(glbl_cfg().get_host_item("ssh command", old_host))
        cmd = shlex.split(ssh_str) + ["-n", old_host] + cmd
    from subprocess import Popen, PIPE
    from time import sleep, time
    proc = Popen(cmd, stdin=open(os.devnull), stdout=PIPE, stderr=PIPE)
    # Terminate command after 10 seconds to prevent hanging SSH, etc.
    timeout = time() + 10.0
    while proc.poll() is None:
        if time() > timeout:
            proc.terminate()
        sleep(0.1)
    fname = self.get_contact_file(reg)
    ret_code = proc.wait()
    out, err = (f.decode() for f in proc.communicate())
    if ret_code:
        LOG.debug("$ %s # return %d\n%s", ' '.join(cmd), ret_code, err)
    for line in reversed(out.splitlines()):
        if line.strip() == old_proc_str:
            # Suite definitely still running
            break
        elif line.split(None, 1)[0].strip() == "PID":
            # Only "ps" header - "ps" has run, but no matching results.
            # Suite not running. Attempt to remove suite contact file.
            try:
                os.unlink(fname)
                return
            except OSError:
                break
    raise SuiteServiceFileError(
        (
            r"""suite contact file exists: %(fname)s

Suite "%(suite)s" is already running, and listening at "%(host)s:%(port)s".

To start a new run, stop the old one first with one or more of these:
* cylc stop %(suite)s              # wait for active tasks/event handlers
* cylc stop --kill %(suite)s       # kill active tasks and wait
* cylc stop --now %(suite)s        # don't wait for active tasks
* cylc stop --now --now %(suite)s  # don't wait
* ssh -n "%(host)s" kill %(pid)s   # final brute force!
"""
        ) % {
            "host": old_host,
            "port": old_port,
            "pid": old_pid_str,
            "fname": fname,
            "suite": reg,
        }
    )
def remote_init(self, host, owner):
    """Initialise a remote [owner@]host if necessary.

    Create UUID file on suite host ".service/uuid" for remotes to identify
    shared file system with suite host.

    Call "cylc remote-init" to install suite items to remote:
        ".service/contact": For TCP task communication
        ".service/passphrase": For TCP task communication
        "python/": if source exists

    Return:
        REMOTE_INIT_NOT_REQUIRED: If remote init is not required, e.g. not
            remote
        REMOTE_INIT_DONE: If remote init done.
        REMOTE_INIT_FAILED: If init of the remote failed.
            Note: this will reset to None to allow retry.
        None: If waiting for remote init command to complete

    """
    if self.single_task_mode or not is_remote(host, owner):
        return REMOTE_INIT_NOT_REQUIRED
    try:
        status = self.remote_init_map[(host, owner)]
    except KeyError:
        pass  # Not yet initialised
    else:
        if status == REMOTE_INIT_FAILED:
            del self.remote_init_map[(host, owner)]  # reset to allow retry
        return status

    # Determine what items to install
    comm_meth = glbl_cfg().get_host_item(
        'task communication method', host, owner)
    owner_at_host = 'localhost'
    if host:
        owner_at_host = host
    if owner:
        owner_at_host = owner + '@' + owner_at_host
    LOG.debug('comm_meth[%s]=%s' % (owner_at_host, comm_meth))
    items = self._remote_init_items(comm_meth)
    # No item to install
    if not items:
        self.remote_init_map[(host, owner)] = REMOTE_INIT_NOT_REQUIRED
        return self.remote_init_map[(host, owner)]

    # Create a TAR archive with the service files,
    # so they can be sent later via SSH's STDIN to the task remote.
    tmphandle = self.proc_pool.get_temporary_file()
    tarhandle = tarfile.open(fileobj=tmphandle, mode='w')
    for path, arcname in items:
        tarhandle.add(path, arcname=arcname)
    tarhandle.close()
    tmphandle.seek(0)
    # UUID file - for remote to identify shared file system with suite host
    uuid_fname = os.path.join(
        self.suite_srv_files_mgr.get_suite_srv_dir(self.suite),
        FILE_BASE_UUID)
    if not os.path.exists(uuid_fname):
        open(uuid_fname, 'wb').write(str(self.uuid_str).encode())
    # Build the command
    cmd = ['cylc', 'remote-init']
    if is_remote_host(host):
        cmd.append('--host=%s' % host)
    if is_remote_user(owner):
        cmd.append('--user=%s' % owner)
    if cylc.flags.debug:
        cmd.append('--debug')
    if comm_meth in ['ssh']:
        cmd.append('--indirect-comm=%s' % comm_meth)
    cmd.append(str(self.uuid_str))
    cmd.append(glbl_cfg().get_derived_host_item(
        self.suite, 'suite run directory', host, owner))
    self.proc_pool.put_command(
        SubProcContext('remote-init', cmd, stdin_files=[tmphandle]),
        self._remote_init_callback, [host, owner, tmphandle])
    # None status: Waiting for command to finish
    self.remote_init_map[(host, owner)] = None
    return self.remote_init_map[(host, owner)]