def __init__(self, cached=False):
    """Read the suite-server host settings from the global configuration.

    Sets ``self.hosts``, ``self.rank_method`` and
    ``self.parsed_thresholds``.

    Args:
        cached (bool): Forwarded to ``glbl_cfg``; with ``False`` a new
            config instance holding the up-to-date configuration is
            returned.
    """
    cfg = glbl_cfg(cached=cached)

    # Condemned entries may be suffixed with `!`; strip the suffix and
    # resolve each one to its fully qualified domain name.
    condemned_fqdns = []
    for entry in cfg.get(['suite servers', 'condemned hosts']):
        condemned_fqdns.append(get_fqdn_by_host(entry.split('!')[0]))

    # Keep the configured run hosts (falling back to localhost) whose
    # name resolves and whose FQDN is not condemned.
    self.hosts = []
    candidates = cfg.get(['suite servers', 'run hosts']) or ['localhost']
    for candidate in candidates:
        try:
            fqdn = get_fqdn_by_host(candidate)
        except socket.gaierror:
            # host name could not be resolved - leave it out
            continue
        if fqdn not in condemned_fqdns:
            self.hosts.append(candidate)

    # Ranking method and acceptance thresholds, if configured.
    select_section = ['suite servers', 'run host select']
    self.rank_method = cfg.get(select_section + ['rank'])
    raw_thresholds = cfg.get(select_section + ['thresholds'])
    self.parsed_thresholds = self.parse_thresholds(raw_thresholds)
def __init__(self, cached=False):
    """Initialise host selection state from the global configuration.

    Populates ``self.hosts`` with the usable run hosts and stores the
    configured ranking method and parsed thresholds.

    Args:
        cached (bool): Passed to ``glbl_cfg``. If ``False`` a new config
            instance with the up-to-date configuration is used.
    """
    config = glbl_cfg(cached=cached)

    # FQDNs of the condemned hosts (a trailing `!` marker is dropped).
    raw_condemned = config.get(['suite servers', 'condemned hosts'])
    condemned = [
        get_fqdn_by_host(item.split('!')[0]) for item in raw_condemned
    ]

    # Run hosts default to localhost when none are configured.
    run_hosts = config.get(['suite servers', 'run hosts'])
    if not run_hosts:
        run_hosts = ['localhost']

    self.hosts = []
    for name in run_hosts:
        try:
            resolved = get_fqdn_by_host(name)
        except socket.gaierror:
            # unresolvable hosts are silently excluded
            pass
        else:
            if resolved not in condemned:
                self.hosts.append(name)

    # Server ranking and acceptance thresholds, if configured.
    self.rank_method = config.get(
        ['suite servers', 'run host select', 'rank'])
    threshold_setting = config.get(
        ['suite servers', 'run host select', 'thresholds'])
    self.parsed_thresholds = self.parse_thresholds(threshold_setting)
def test_get_fqdn_by_host_on_bad_host(self):
    """get_fqdn_by_host must raise IOError for an unresolvable host.

    The exception is expected to carry the host name in ``filename``
    and to render as the "Name or service not known" message.
    """
    bad_host = 'nosuchhost.nosuchdomain.org'
    # assertRaises fails the test when nothing is raised; a bare
    # try/except would let that case pass without checking anything.
    with self.assertRaises(IOError) as ctx:
        get_fqdn_by_host(bad_host)
    exc = ctx.exception
    self.assertEqual(exc.filename, bad_host)
    self.assertEqual(
        "[Errno -2] Name or service not known: '%s'" % bad_host,
        str(exc))
def test_get_fqdn_by_host_on_bad_host(self):
    """Check the error raised by get_fqdn_by_host for a bad host.

    An ``IOError`` must be raised whose ``filename`` is the host name
    and whose string form is the "Name or service not known" message.
    """
    bad_host = 'nosuchhost.nosuchdomain.org'
    # Use the context-manager form so that the absence of an exception
    # is reported as a failure instead of skipping the assertions.
    with self.assertRaises(IOError) as raised:
        get_fqdn_by_host(bad_host)
    error = raised.exception
    self.assertEqual(error.filename, bad_host)
    self.assertEqual(
        "[Errno -2] Name or service not known: '%s'" % bad_host,
        str(error))
def test_get_fqdn_by_host_on_bad_host(self):
    """get_fqdn_by_host raises IOError naming the unresolvable host."""
    bad_host = 'nosuchhost.nosuchdomain.org'
    # Either of two "not known" message prefixes is accepted.
    expected_pattern = (
        r"(\[Errno -2\] Name or service|"
        r"\[Errno 8\] nodename nor servname provided, or)"
        r" not known: '{}'"
    ).format(bad_host)
    with self.assertRaisesRegex(IOError, expected_pattern) as ctx:
        get_fqdn_by_host(bad_host)
    raised = ctx.exception
    self.assertEqual(raised.filename, bad_host)
def get_location(suite: str, owner: str, host: str):
    """Extract host and ports from a suite's contact file.

    Args:
        suite (str): suite name
        owner (str): owner of the suite
        host (str): host name; when falsy, the host recorded in the
            contact file is used instead.

    Returns:
        Tuple[str, int, int]: the host name as returned by
        ``get_fqdn_by_host``, the port and the publish port.

    Raises:
        ClientError: if the contact file cannot be loaded
            (``SuiteServiceFileError``), i.e. the suite is presumably
            not running.
    """
    try:
        contact = load_contact_file(suite, owner, host)
    except SuiteServiceFileError as exc:
        # Chain explicitly so the underlying file error is reported as
        # the cause of the ClientError.
        raise ClientError(
            'Contact info not found for suite '
            f'"{suite}", suite not running?') from exc

    if not host:
        host = contact[ContactFileFields.HOST]
    host = get_fqdn_by_host(host)

    port = int(contact[ContactFileFields.PORT])
    pub_port = int(contact[ContactFileFields.PUBLISH_PORT])
    return host, port, pub_port
def get_location(cls, suite: str, owner: str, host: str):
    """Extract host and port from a suite's contact file.

    Args:
        suite (str): suite name
        owner (str): owner of the suite
        host (str): host name; when falsy, the host recorded in the
            contact file is used instead.

    Returns:
        Tuple[str, int]: the host name as returned by
        ``get_fqdn_by_host`` and the port number.

    Raises:
        ClientError: if the contact file cannot be loaded
            (``SuiteServiceFileError``), i.e. the suite is presumably
            not running.
    """
    try:
        contact = SuiteSrvFilesManager().load_contact_file(
            suite, owner, host)
    except SuiteServiceFileError as exc:
        # Chain explicitly so the underlying file error is reported as
        # the cause of the ClientError.
        raise ClientError(
            'Contact info not found for suite '
            f'"{suite}", suite not running?') from exc

    if not host:
        host = contact[SuiteSrvFilesManager.KEY_HOST]
    host = get_fqdn_by_host(host)

    port = int(contact[SuiteSrvFilesManager.KEY_PORT])
    return host, port
def get_location(workflow: str):
    """Extract host and ports from a workflow's contact file.

    Args:
        workflow (str): workflow name

    Returns:
        Tuple[str, int, int]: the host name as returned by
        ``get_fqdn_by_host``, the port and the publish port.

    Raises:
        WorkflowStopped: if the contact file cannot be loaded
            (``ServiceFileError``), i.e. the workflow is presumably
            not running.
        CylcVersionError: if the contact file has no publish port
            entry, i.e. the target is a Cylc 7 (or earlier) workflow.
    """
    try:
        contact = load_contact_file(workflow)
    except ServiceFileError as exc:
        # Chain explicitly so the underlying file error is reported as
        # the cause of WorkflowStopped.
        raise WorkflowStopped(workflow) from exc

    host = get_fqdn_by_host(contact[ContactFileFields.HOST])
    port = int(contact[ContactFileFields.PORT])

    if ContactFileFields.PUBLISH_PORT not in contact:
        # No publish port recorded: report the version found in the
        # contact file, if any.
        version = (
            contact['CYLC_VERSION'] if 'CYLC_VERSION' in contact else None
        )
        raise CylcVersionError(version=version)

    pub_port = int(contact[ContactFileFields.PUBLISH_PORT])
    return host, port, pub_port
def test_get_fqdn_by_host_on_bad_host():
    """get_fqdn_by_host raises IOError naming the unresolvable host.

    Warning:
        This test can fail due to ISP/network configuration
        (for example ISP may reroute failed DNS to custom search page)
        e.g: https://www.virginmedia.com/help/advanced-network-error-search

    """
    bad_host = 'nosuchhost.nosuchdomain.org'
    # Either of two "not known" message prefixes is accepted.
    expected_pattern = (
        r"(\[Errno -2\] Name or service|"
        r"\[Errno 8\] nodename nor servname provided, or)"
        r" not known: '{}'"
    ).format(bad_host)

    with pytest.raises(IOError) as exc:
        get_fqdn_by_host(bad_host)

    error = exc.value
    assert re.match(expected_pattern, str(error))
    assert error.filename == bad_host
def test_should_auto_restart(
        host,
        stop_mode,
        condemned_hosts,
        auto_restart_time,
        should_auto_restart
):
    """Ensure the suite only auto-restarts when appropriate."""
    # stand-in scheduler exposing only the attributes that are read
    scheduler = Mock()
    scheduler.host = get_fqdn_by_host(host)
    scheduler.stop_mode = stop_mode
    scheduler.auto_restart_time = auto_restart_time

    # stand-in configuration: every lookup yields the condemned hosts
    cfg = Mock(get=lambda _keys: condemned_hosts)

    outcome = _should_auto_restart(scheduler, cfg)
    assert outcome == should_auto_restart
def _should_auto_restart(scheduler, current_glbl_cfg):
    """Decide whether the workflow should auto-restart off this host.

    Args:
        scheduler: object providing ``stop_mode``, ``auto_restart_time``
            and ``host``.
        current_glbl_cfg: configuration object; its ``get`` is called
            with ``['scheduler', 'run hosts', 'condemned']``.

    Returns:
        The ``AutoRestartMode`` to apply if ``scheduler.host`` is
        condemned, otherwise ``False``.
    """
    # a workflow which is already stopping is left alone
    if scheduler.stop_mode is not None:
        return False

    condemned = current_glbl_cfg.get(['scheduler', 'run hosts', 'condemned'])
    for entry in condemned:
        if entry.endswith('!'):
            # trailing `!` -> force shutdown mode
            mode = AutoRestartMode.FORCE_STOP
            hostname = entry[:-1]
        else:
            if scheduler.auto_restart_time is not None:
                # a stop-restart is already scheduled; only
                # AutoRestartMode.FORCE_STOP can override this
                continue
            # normal mode (stop and restart the workflow)
            mode = AutoRestartMode.RESTART_NORMAL
            hostname = entry

        if get_fqdn_by_host(hostname) == scheduler.host:
            # the workflow host is condemned
            return mode

    return False
def get_location(workflow: str):
    """Extract host and ports from a workflow's contact file.

    Args:
        workflow (str): workflow name

    Returns:
        Tuple[str, int, int]: the host name as returned by
        ``get_fqdn_by_host``, the port and the publish port.

    Raises:
        WorkflowStopped: if the contact file cannot be loaded
            (``ServiceFileError``), i.e. the workflow is presumably
            not running.
    """
    try:
        contact = load_contact_file(workflow)
    except ServiceFileError as exc:
        # Chain explicitly so the underlying file error is reported as
        # the cause of WorkflowStopped.
        raise WorkflowStopped(workflow) from exc

    host = get_fqdn_by_host(contact[ContactFileFields.HOST])
    port = int(contact[ContactFileFields.PORT])
    pub_port = int(contact[ContactFileFields.PUBLISH_PORT])
    return host, port, pub_port
def select_host(hosts, ranking_string=None, blacklist=None,
                blacklist_name=None):
    """Select a host from the provided list.

    If no ranking is provided (in `ranking_string`) then random
    selection is used.

    Args:
        hosts (list):
            List of host names to choose from.
            NOTE: Host names must be identifiable from the host where
            the call is executed.
        ranking_string (str):
            A multiline string containing Python expressions to filter
            hosts by e.g::

               # only consider hosts with less than 70% cpu usage
               # and a server load of less than 5
               cpu_percent() < 70
               getloadavg()[0] < 5

            And/or Python statements to rank hosts by e.g::

               # rank by used cpu, then by load average as a tie-break
               # (lower scores are better)
               cpu_percent()
               getloadavg()

            Comments are allowed using `#` but not inline comments.
        blacklist (list):
            List of host names to filter out.
            Can be short host names (do not have to be fqdn values)
        blacklist_name (str):
            The reason for blacklisting these hosts
            (used for exceptions).

    Raises:
        HostSelectException:
            In the event that no hosts are available / meet the
            specified criterion.
        socket.gaierror:
            This may be raised in the event of unknown host names
            for some installations or not for others.

    Returns:
        tuple - (hostname, fqdn) the chosen host

        hostname (str):
            The hostname as provided to this function.
        fqdn (str):
            The fully qualified domain name of this host.

    """
    # standardise host names - remove duplicate items
    hostname_map = {  # note dictionary keys filter out duplicates
        get_fqdn_by_host(host): host
        for host in hosts
    }
    hosts = list(hostname_map)

    # dict of conditions and whether they have been met (for error reporting)
    data = {host: {} for host in hosts}

    # filter out blacklisted hosts if provided (compared by fqdn)
    if blacklist:
        blacklist = list(set(map(get_fqdn_by_host, blacklist)))
        hosts, data = _filter_by_hostname(
            hosts, blacklist, blacklist_name, data=data)

    if not hosts:
        # no hosts provided / left after filtering
        raise HostSelectException(data)

    # parse rankings
    rankings = list(_get_rankings(ranking_string)) if ranking_string else []

    if not rankings:
        # no metrics or ranking required, pick host at random
        chosen = random.choice(list(hosts))  # nosec
        return hostname_map[chosen], chosen

    # From here on rankings is non-empty, so metrics must be gathered.
    metrics = list({x for x, _ in rankings})  # required metrics
    results, data = _get_metrics(  # get data from each host
        hosts, metrics, data)
    hosts = list(results)  # some hosts might not be contactable

    if not hosts:
        # no hosts left after gathering metrics
        raise HostSelectException(data)

    hosts, data = _filter_by_ranking(  # filter by rankings, sort by ranking
        hosts, rankings, results, data=data)

    if not hosts:
        # no hosts left after filtering
        raise HostSelectException(data)

    return hostname_map[hosts[0]], hosts[0]
NOTE: these are functional tests, for unit tests see the docstrings in the host_select module. """ import socket import pytest from cylc.flow.exceptions import HostSelectException from cylc.flow.host_select import (select_host, select_suite_host) from cylc.flow.hostuserutil import get_fqdn_by_host from cylc.flow.parsec.exceptions import ListValueError localhost, localhost_aliases, _ = socket.gethostbyname_ex('localhost') localhost_fqdn = get_fqdn_by_host(localhost) # NOTE: ensure that all localhost aliases are actually aliases of localhost, # it would appear that this is not always the case # on Travis-CI on of the aliases has a different fqdn from the fqdn # of the host it is an alias of localhost_aliases = [ alias for alias in localhost_aliases if get_fqdn_by_host(alias) == localhost_fqdn ] def test_localhost(): """Basic test with one host to choose from.""" assert select_host([localhost]) == (localhost, localhost_fqdn)
the host_select module. """ from shlex import quote import socket from subprocess import call, DEVNULL import pytest from cylc.flow.cfgspec.glbl_cfg import glbl_cfg from cylc.flow.exceptions import HostSelectException from cylc.flow.host_select import (select_host, select_suite_host) from cylc.flow.hostuserutil import get_fqdn_by_host local_host, local_host_alises, _ = socket.gethostbyname_ex('localhost') local_host_fqdn = get_fqdn_by_host(local_host) try: # get a suitable remote host for running tests on # NOTE: do NOT copy this testing approach in other python tests remote_platform = glbl_cfg().get( ['platforms', '_remote_background_shared_tcp', 'hosts'], [])[0] # don't run tests unless host is contactable if call(['ssh', quote(remote_platform), 'hostname'], stdin=DEVNULL, stdout=DEVNULL, stderr=DEVNULL): raise KeyError('remote platform') # get the fqdn for this host remote_platform_fqdn = get_fqdn_by_host(remote_platform) except (KeyError, IndexError):