def _setup_event_mail(self, itask, event):
    """Set up task event notification, by email."""
    if event in self.NON_UNIQUE_EVENTS:
        key1 = (
            self.HANDLER_MAIL,
            '%s-%d' % (event, itask.non_unique_events.get(event, 1)))
    else:
        key1 = (self.HANDLER_MAIL, event)
    id_key = (key1, str(itask.point), itask.tdef.name, itask.submit_num)
    if (id_key in self.event_timers
            or event not in self._get_events_conf(
                itask, "mail events", [])):
        return
    retry_delays = self._get_events_conf(itask, "mail retry delays")
    if not retry_delays:
        retry_delays = [0]
    self.event_timers[id_key] = TaskActionTimer(
        TaskEventMailContext(
            self.HANDLER_MAIL,  # key
            self.HANDLER_MAIL,  # ctx_type
            self._get_events_conf(  # mail_from
                itask, "mail from", "notifications@" + get_host(),
            ),
            self._get_events_conf(itask, "mail to", get_user()),  # mail_to
            self._get_events_conf(itask, "mail smtp"),  # mail_smtp
        ),
        retry_delays)
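# --- Illustrative sketch (not part of the original module) ---
# The id_key above deduplicates notifications: non-unique events such as
# retries get a counter suffix so each occurrence gets its own timer.
# The NON_UNIQUE_EVENTS tuple and counter dict below are assumptions
# modelled on the method above, not the real cylc constants.
NON_UNIQUE_EVENTS = ('warning', 'critical', 'custom')

def _event_mail_key(event, non_unique_counts):
    if event in NON_UNIQUE_EVENTS:
        return ('event-mail', '%s-%d' % (event, non_unique_counts.get(event, 1)))
    return ('event-mail', event)

assert _event_mail_key('warning', {'warning': 2}) == ('event-mail', 'warning-2')
assert _event_mail_key('failed', {}) == ('event-mail', 'failed')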
def test_remove_bad_hosts(self):
    """Test the '_remove_bad_hosts' method.

    Test using 'localhost' only since remote host functionality is
    contained only inside remote_cylc_cmd() so is outside of the scope
    of HostAppointer.
    """
    self.mock_global_config(set_hosts=['localhost'])
    self.assertTrue(self.app._remove_bad_hosts().get('localhost', False))
    # Test 'localhost' true identifier is treated properly too.
    self.mock_global_config(set_hosts=[get_host()])
    self.assertTrue(self.app._remove_bad_hosts().get('localhost', False))

    self.mock_global_config(set_hosts=['localhost', 'FAKE_HOST'])
    # Check for no exceptions and 'localhost' but not 'FAKE_HOST' data.
    # Difficult to unittest for specific stderr string; this is
    # sufficient.
    self.assertTrue(self.app._remove_bad_hosts().get('localhost', False))
    self.assertTrue(self.app._remove_bad_hosts().get('FAKE_HOST', True))

    # Apply thresholds impossible to pass; check results in host removal.
    self.mock_global_config(
        set_hosts=['localhost'], set_thresholds='load:15 0.0')
    self.assertEqual(self.app._remove_bad_hosts(), {})
    self.mock_global_config(
        set_hosts=['localhost'], set_thresholds='memory 1000000000')
    self.assertEqual(self.app._remove_bad_hosts(), {})
def _is_local_auth_ok(reg, owner, host):
    """Return True if it is OK to use local passphrase file.

    Use values in ~/cylc-run/REG/.service/contact to make a judgement.
    """
    if is_remote(host, owner):
        fname = os.path.join(
            get_suite_srv_dir(reg), SuiteFiles.Service.CONTACT)
        data = {}
        try:
            for line in open(fname):
                key, value = (
                    [item.strip() for item in line.split("=", 1)])
                data[key] = value
        except (IOError, ValueError):
            # No contact file
            return False
        else:
            # Contact file exists, check values match
            if owner is None:
                owner = get_user()
            if host is None:
                host = get_host()
            host_value = data.get(ContactFileFields.HOST, "")
            return (
                reg == data.get(ContactFileFields.NAME)
                and owner == data.get(ContactFileFields.OWNER)
                and (
                    host == host_value
                    or host == host_value.split(".", 1)[0]  # no domain
                )
            )
    else:
        return True
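# --- Illustrative sketch (not part of the original module) ---
# The host comparison above accepts either the fully-qualified name
# recorded in the contact file or its short (no-domain) form; a minimal
# standalone restatement of that rule:
def _host_matches(requested, recorded):
    return requested == recorded or requested == recorded.split(".", 1)[0]

assert _host_matches("node1.example.com", "node1.example.com")
assert _host_matches("node1", "node1.example.com")
assert not _host_matches("node2", "node1.example.com")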
def test_rose_fileinstall_rose_suite_cylc_install_conf(fixture_install_flow):
    _, _, _, result, destpath = fixture_install_flow
    host = get_host()
    assert (destpath / 'opt/rose-suite-cylc-install.conf').read_text() == (
        "# This file records CLI Options.\n\n"
        "!opts=A B\n\n"
        "[env]\n"
        "FOO=42\n"
        f"#{ROSE_ORIG_HOST_INSTALLED_OVERRIDE_STRING}\n"
        f"ROSE_ORIG_HOST={host}\n\n"
        "[jinja2:suite.rc]\n"
        "BAR=84\n"
        "CORNETTO=120\n"
        "FLAKE=99\n"
        f"#{ROSE_ORIG_HOST_INSTALLED_OVERRIDE_STRING}\n"
        f"ROSE_ORIG_HOST={host}\n"
    )
def _run_event_mail(self, config, ctx):
    """Helper for "run_event_handlers", do mail notification."""
    if ctx.event in self.get_events_conf(config, 'mail events', []):
        # SMTP server
        env = dict(os.environ)
        mail_smtp = self.get_events_conf(config, 'mail smtp')
        if mail_smtp:
            env['smtp'] = mail_smtp
        subject = '[suite %(event)s] %(suite)s' % {
            'suite': ctx.suite, 'event': ctx.event}
        stdin_str = ''
        for name, value in [
                ('suite event', ctx.event),
                ('reason', ctx.reason),
                ('suite', ctx.suite),
                ('host', ctx.host),
                ('port', ctx.port),
                ('owner', ctx.owner)]:
            if value:
                stdin_str += '%s: %s\n' % (name, value)
        mail_footer_tmpl = self.get_events_conf(config, 'mail footer')
        if mail_footer_tmpl:
            stdin_str += (mail_footer_tmpl + '\n') % {
                'host': ctx.host,
                'port': ctx.port,
                'owner': ctx.owner,
                'suite': ctx.suite}
        proc_ctx = SubProcContext(
            (self.SUITE_EVENT_HANDLER, ctx.event),
            [
                'mail',
                '-s', subject,
                '-r', self.get_events_conf(
                    config,
                    'mail from', 'notifications@' + get_host()),
                self.get_events_conf(config, 'mail to', get_user()),
            ],
            env=env, stdin_str=stdin_str)
        if self.proc_pool.closed:
            # Run command in foreground if process pool is closed
            self.proc_pool.run_command(proc_ctx)
            self._run_event_handlers_callback(proc_ctx)
        else:
            # Run command using process pool otherwise
            self.proc_pool.put_command(
                proc_ctx, self._run_event_mail_callback)
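# --- Illustrative sketch (not part of the original module) ---
# The SubProcContext above amounts to piping the message body into the
# system "mail" command; roughly equivalent to the following plain
# subprocess call (subject and addresses are illustrative):
import subprocess
try:
    subprocess.run(
        ['mail', '-s', '[suite startup] my.suite',
         '-r', 'notifications@schedhost', 'alice'],
        input='suite event: startup\nsuite: my.suite\n',
        text=True,
        check=False,  # the real code reports failure via a callback instead
    )
except FileNotFoundError:
    pass  # "mail" not installed on this machine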
def _send_mail(self, event, subject, message, schd, env):
    proc_ctx = SubProcContext(
        (self.WORKFLOW_EVENT_HANDLER, event),
        [
            'mail',
            '-s', subject,
            '-r', self.get_events_conf(
                schd.config,
                'from', 'notifications@' + get_host()),
            self.get_events_conf(schd.config, 'to', get_user()),
        ],
        env=env, stdin_str=message)
    if self.proc_pool.closed:
        # Run command in foreground if process pool is closed
        self.proc_pool.run_command(proc_ctx)
        self._run_event_handlers_callback(proc_ctx)
    else:
        # Run command using process pool otherwise
        self.proc_pool.put_command(
            proc_ctx, callback=self._run_event_mail_callback)
def cache_passphrase(self, reg, owner, host, value):
    """Cache and dump passphrase for a remote suite in standard location.

    Save passphrase to ~/.cylc/auth/owner@host/reg if possible.

    This is normally called on a successful authentication, and will
    cache the remote passphrase in memory as well.
    """
    if owner is None:
        owner = get_user()
    if host is None:
        host = get_host()
    path = self._get_cache_dir(reg, owner, host)
    self.cache[self.FILE_BASE_PASSPHRASE][(reg, owner, host)] = value
    # Dump to a file only for remote suites loaded via SSH.
    if self.can_disk_cache_passphrases.get((reg, owner, host)):
        # Although not desirable, failing to dump the passphrase to a
        # file is not disastrous.
        try:
            self._dump_item(path, self.FILE_BASE_PASSPHRASE, value)
        except (IOError, OSError):
            if cylc.flow.flags.debug:
                import traceback
                traceback.print_exc()
def _is_local_auth_ok(self, reg, owner, host):
    """Return True if it is OK to use local passphrase file.

    Use values in ~/cylc-run/REG/.service/contact to make a judgement.
    Cache results in self.can_use_load_auths.
    """
    if (reg, owner, host) not in self.can_use_load_auths:
        if is_remote(host, owner):
            fname = os.path.join(
                self.get_suite_srv_dir(reg), self.FILE_BASE_CONTACT)
            data = {}
            try:
                for line in open(fname):
                    key, value = (
                        [item.strip() for item in line.split("=", 1)])
                    data[key] = value
            except (IOError, ValueError):
                # No contact file
                self.can_use_load_auths[(reg, owner, host)] = False
            else:
                # Contact file exists, check values match
                if owner is None:
                    owner = get_user()
                if host is None:
                    host = get_host()
                host_value = data.get(self.KEY_HOST, "")
                self.can_use_load_auths[(reg, owner, host)] = (
                    reg == data.get(self.KEY_NAME)
                    and owner == data.get(self.KEY_OWNER)
                    and (
                        host == host_value
                        or host == host_value.split(".", 1)[0]  # no domain
                    )
                )
        else:
            self.can_use_load_auths[(reg, owner, host)] = True
    return self.can_use_load_auths[(reg, owner, host)]
def get_auth_item(self, item, reg, owner=None, host=None, content=False):
    """Locate/load passphrase, SSL private key, SSL certificate, etc.

    Return file name, or content of file if content=True is set.
    Files are searched from these locations in order:

    1/ For running task jobs, service directory under:
       a/ $CYLC_SUITE_RUN_DIR for remote jobs.
       b/ $CYLC_SUITE_RUN_DIR_ON_SUITE_HOST for local jobs or remote
          jobs with SSH messaging.

    2/ (Passphrases only) From memory cache, for remote suite
       passphrases. Don't use if content=False.

    3/ For suite on local user@host. The suite service directory.

    4/ Location under $HOME/.cylc/ for remote suite control from
       accounts that do not actually need the suite definition directory
       to be installed:
       $HOME/.cylc/auth/SUITE_OWNER@SUITE_HOST/SUITE_NAME/

    5/ For remote suites, try locating the file from the suite service
       directory on remote owner@host via SSH. If content=False, the
       value of the located file will be dumped under:
       $HOME/.cylc/auth/SUITE_OWNER@SUITE_HOST/SUITE_NAME/
    """
    if item not in [
            self.FILE_BASE_PASSPHRASE, self.FILE_BASE_CONTACT,
            self.FILE_BASE_CONTACT2]:
        raise ValueError("%s: item not recognised" % item)
    if item == self.FILE_BASE_PASSPHRASE:
        self.can_disk_cache_passphrases[(reg, owner, host)] = False

    if reg == os.getenv('CYLC_SUITE_NAME'):
        env_keys = []
        if 'CYLC_SUITE_RUN_DIR' in os.environ:
            # 1(a)/ Task messaging call.
            env_keys.append('CYLC_SUITE_RUN_DIR')
        elif self.KEY_SUITE_RUN_DIR_ON_SUITE_HOST in os.environ:
            # 1(b)/ Task messaging call via ssh messaging.
            env_keys.append(self.KEY_SUITE_RUN_DIR_ON_SUITE_HOST)
        for key in env_keys:
            path = os.path.join(os.environ[key], self.DIR_BASE_SRV)
            if content:
                value = self._load_local_item(item, path)
            else:
                value = self._locate_item(item, path)
            if value:
                return value
    # 2/ From memory cache
    if item in self.cache:
        my_owner = owner
        my_host = host
        if my_owner is None:
            my_owner = get_user()
        if my_host is None:
            my_host = get_host()
        try:
            return self.cache[item][(reg, my_owner, my_host)]
        except KeyError:
            pass
    # 3/ Local suite service directory
    if self._is_local_auth_ok(reg, owner, host):
        path = self.get_suite_srv_dir(reg)
        if content:
            value = self._load_local_item(item, path)
        else:
            value = self._locate_item(item, path)
        if value:
            return value
    # 4/ Disk cache for remote suites
    if owner is not None and host is not None:
        paths = [self._get_cache_dir(reg, owner, host)]
        short_host = host.split('.', 1)[0]
        if short_host != host:
            paths.append(self._get_cache_dir(reg, owner, short_host))
        for path in paths:
            if content:
                value = self._load_local_item(item, path)
            else:
                value = self._locate_item(item, path)
            if value:
                return value
    # 5/ Use SSH to load content from remote owner@host
    # Note: It is not possible to find ".service/contact2" on the suite
    # host, because it is installed on task host by "cylc remote-init"
    # on demand.
    if item != self.FILE_BASE_CONTACT2:
        value = self._load_remote_item(item, reg, owner, host)
        if value:
            if item == self.FILE_BASE_PASSPHRASE:
                self.can_disk_cache_passphrases[(reg, owner, host)] = True
            if not content:
                path = self._get_cache_dir(reg, owner, host)
                self._dump_item(path, item, value)
                value = os.path.join(path, item)
            return value

    raise SuiteServiceFileError("Couldn't get %s" % item)
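# --- Illustrative sketch (not part of the original module) ---
# get_auth_item is a first-hit-wins cascade over the five locations in
# its docstring; stripped of the cylc specifics, the control flow
# reduces to this generic pattern:
def _first_hit(locators):
    for locate in locators:  # ordered: job dirs, memory cache, local, ...
        value = locate()
        if value:
            return value
    raise LookupError("no location yielded the item")

assert _first_hit(
    [lambda: None, lambda: '', lambda: '/a/passphrase']
) == '/a/passphrase'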
""" import os import pytest import re import shutil import subprocess from pathlib import Path from uuid import uuid4 from cylc.flow.hostuserutil import get_host from cylc.flow.pathutil import get_workflow_run_dir HOST = get_host() @pytest.fixture(scope='module') def monkeymodule(): from _pytest.monkeypatch import MonkeyPatch mpatch = MonkeyPatch() yield mpatch mpatch.undo() @pytest.fixture(scope='module') def fixture_provide_flow(tmp_path_factory): """Provide a cylc workflow based on the contents of a folder which can be either validated or installed. """
def test_is_remote_host_on_localhost():
    """is_remote_host with localhost."""
    assert not is_remote_host(None)
    assert not is_remote_host('localhost')
    assert not is_remote_host(os.getenv('HOSTNAME'))
    assert not is_remote_host(get_host())
def _run_event_mail(self, config, ctx):
    """Helper for "run_event_handlers", do mail notification."""
    if ctx.event in self.get_events_conf(config, 'mail events', []):
        # SMTP server
        env = dict(os.environ)
        mail_smtp = self.get_events_conf(config, 'smtp')
        if mail_smtp:
            env['smtp'] = mail_smtp
        subject = '[workflow %(event)s] %(workflow)s' % {
            'workflow': ctx.workflow, 'event': ctx.event}
        stdin_str = ''
        for name, value in [
                ('workflow event', ctx.event),
                ('reason', ctx.reason),
                ('workflow', ctx.workflow),
                ('host', ctx.host),
                ('port', ctx.port),
                ('owner', ctx.owner)]:
            if value:
                stdin_str += '%s: %s\n' % (name, value)
        mail_footer_tmpl = self.get_events_conf(config, 'footer')
        if mail_footer_tmpl:
            # BACK COMPAT: "suite" deprecated
            # url:
            #     https://github.com/cylc/cylc-flow/pull/4174
            # from:
            #     Cylc 8
            # remove at:
            #     Cylc 9
            try:
                stdin_str_footer = (mail_footer_tmpl + '\n') % {
                    'host': ctx.host,
                    'port': ctx.port,
                    'owner': ctx.owner,
                    'suite': ctx.workflow,  # deprecated
                    'workflow': ctx.workflow}
            except KeyError:
                LOG.warning(
                    "Ignoring bad mail footer template: %s" % (
                        mail_footer_tmpl))
            else:
                stdin_str += stdin_str_footer
        proc_ctx = SubProcContext(
            (self.WORKFLOW_EVENT_HANDLER, ctx.event),
            [
                'mail',
                '-s', subject,
                '-r', self.get_events_conf(
                    config,
                    'from', 'notifications@' + get_host()),
                self.get_events_conf(config, 'to', get_user()),
            ],
            env=env, stdin_str=stdin_str)
        if self.proc_pool.closed:
            # Run command in foreground if process pool is closed
            self.proc_pool.run_command(proc_ctx)
            self._run_event_handlers_callback(proc_ctx)
        else:
            # Run command using process pool otherwise
            self.proc_pool.put_command(
                proc_ctx, callback=self._run_event_mail_callback)
def get_rose_vars_from_config_node(config, config_node, environ):
    """Load template variables from a Rose config node.

    This uses only the provided config node and environment variables -
    there is no system interaction.

    Args:
        config (dict):
            Object which will be populated with the results.
        config_node (metomi.rose.config.ConfigNode):
            Configuration node representing the Rose suite configuration.
        environ (dict):
            Dictionary of environment variables.
    """
    templating = None
    sections = {'jinja2:suite.rc', 'empy:suite.rc', 'template variables'}

    # Don't allow multiple templating sections.
    defined_sections = sections.intersection(set(config_node.value))
    if len(defined_sections) > 1:
        raise MultipleTemplatingEnginesError(
            "You should not define more than one templating section. "
            f"You defined:\n\t{'; '.join(defined_sections)}"
        )
    elif len(defined_sections) == 1:
        templating, = defined_sections
        if templating != 'template variables':
            config['templating_detected'] = templating.replace(
                ':suite.rc', '')
        else:
            config['templating_detected'] = templating

    # Create env section if it doesn't already exist.
    if 'env' not in config_node.value:
        config_node.set(['env'])

    # Get values for standard ROSE variables (ROSE_ORIG_HOST and
    # ROSE_SITE).
    rose_orig_host = get_host()
    rose_site = ResourceLocator().get_conf().get_value(['site'], '')

    # For each section process variables and add standard variables.
    for section in ['env', templating]:
        if section not in config_node.value:
            continue

        # Add standard ROSE_ variables.
        config_node[section].set(['ROSE_SITE'], rose_site)
        config_node[section].set(['ROSE_VERSION'], ROSE_VERSION)
        config_node[section].set(['ROSE_ORIG_HOST'], rose_orig_host)

        # Use env_var_process to process variables which may need
        # expanding.
        for key, node in config_node.value[section].value.items():
            try:
                config_node.value[section].value[key].value = (
                    env_var_process(node.value, environ=environ))
                if section == 'env':
                    environ[key] = node.value
            except UnboundEnvironmentVariableError as exc:
                raise ConfigProcessError(['env', key], node.value, exc)

    # For each of the template language sections extract items to a
    # simple dict to be returned.
    if 'env' in config_node.value:
        config['env'] = {
            item[0][1]: item[1].value
            for item in config_node.value['env'].walk()
        }
    if templating in config_node.value:
        config['template_variables'] = {
            item[0][1]: item[1].value
            for item in config_node.value[templating].walk()
        }
    elif 'template variables' in config_node.value:
        config['template_variables'] = {
            item[0][1]: item[1].value
            for item in config_node.value['template variables'].walk()
        }

    # Add the entire config to ROSE_SUITE_VARIABLES to allow for
    # programmatic access.
    if templating is not None:
        parser = Parser()
        for key, value in config['template_variables'].items():
            # The special variables are already Python variables.
            if key not in ['ROSE_ORIG_HOST', 'ROSE_VERSION', 'ROSE_SITE']:
                try:
                    config['template_variables'][key] = (
                        parser.literal_eval(value))
                except Exception:
                    raise ConfigProcessError(
                        [templating, key],
                        value,
                        f'Invalid template variable: {value}'
                        '\nMust be a valid Python or Jinja2 literal'
                        ' (note strings "must be quoted").'
                    ) from None

    # Add ROSE_SUITE_VARIABLES to config of templating engines in use.
    if templating is not None:
        config['template_variables'][
            'ROSE_SUITE_VARIABLES'] = config['template_variables']
def record_cylc_install_options(
    rundir=None,
    opts=None,
    srcdir=None,
):
    """Create/modify files recording Cylc install config options.

    Creates a new config based on CLI options and writes it to the
    workflow install location as ``rose-suite-cylc-install.conf``. If
    ``rose-suite-cylc-install.conf`` already exists over-writes changed
    items, except for ``!opts=`` which is merged and simplified.

    If ``!opts=`` have been changed these are appended to those that
    have been written in the installed ``rose-suite.conf``.

    Args:
        srcdir (pathlib.Path):
            Used to check whether the source directory contains a rose
            config.
        opts:
            Cylc option parser object - we want to extract the following
            values:
            - opt_conf_keys (list or str):
                Equivalent of ``rose suite-run --option KEY``
            - defines (list of str):
                Equivalent of ``rose suite-run --define KEY=VAL``
            - rose_template_vars (list of str):
                Equivalent of ``rose suite-run --define-suite KEY=VAL``
        rundir (pathlib.Path):
            Path to dump the rose-suite-cylc-install.conf at.

    Returns:
        cli_config - Config Node which has been dumped to
        ``rose-suite-cylc-install.conf``.
        rose_suite_conf['opts'] - Opts section of the config node dumped
        to installed ``rose-suite.conf``.
    """
    # Create a config based on command line options:
    cli_config = get_cli_opts_node(opts, srcdir)

    # Leave now if there is nothing to do:
    if not cli_config:
        return False

    # Construct path objects representing our target files.
    (Path(rundir) / 'opt').mkdir(exist_ok=True)
    conf_filepath = Path(rundir) / 'opt/rose-suite-cylc-install.conf'
    rose_conf_filepath = Path(rundir) / 'rose-suite.conf'
    dumper = ConfigDumper()
    loader = ConfigLoader()

    # If the file exists we need to merge with our new config,
    # over-writing with new items where there are duplicates.
    if conf_filepath.is_file():
        if opts.clear_rose_install_opts:
            conf_filepath.unlink()
        else:
            oldconfig = loader.load(str(conf_filepath))
            cli_config = merge_rose_cylc_suite_install_conf(
                oldconfig, cli_config)

    # Get the value for the standard ROSE_ORIG_HOST variable.
    rose_orig_host = get_host()
    for section in [
        'env', 'jinja2:suite.rc', 'empy:suite.rc', 'template variables'
    ]:
        if section in cli_config:
            cli_config[section].set(['ROSE_ORIG_HOST'], rose_orig_host)
            cli_config[section]['ROSE_ORIG_HOST'].comments = [
                ROSE_ORIG_HOST_INSTALLED_OVERRIDE_STRING]

    cli_config.comments = [' This file records CLI Options.']
    identify_templating_section(cli_config)
    dumper.dump(cli_config, str(conf_filepath))

    # Merge the opts section of the rose-suite.conf with those set by CLI:
    if not rose_conf_filepath.is_file():
        rose_conf_filepath.touch()
    rose_suite_conf = loader.load(str(rose_conf_filepath))
    rose_suite_conf = add_cylc_install_to_rose_conf_node_opts(
        rose_suite_conf, cli_config)
    identify_templating_section(rose_suite_conf)
    dumper(rose_suite_conf, rose_conf_filepath)

    return cli_config, rose_suite_conf
def get_rose_vars_from_config_node(config, config_node, environ):
    """Load template variables from a Rose config node.

    This uses only the provided config node and environment variables -
    there is no system interaction.

    Args:
        config (dict):
            Object which will be populated with the results.
        config_node (metomi.rose.config.ConfigNode):
            Configuration node representing the Rose suite configuration.
        environ (dict):
            Dictionary of environment variables.
    """
    templating = None

    # Don't allow multiple templating sections.
    templating = identify_templating_section(config_node)

    if templating != 'template variables':
        config['templating_detected'] = templating.replace(':suite.rc', '')
    else:
        config['templating_detected'] = templating

    # Create env section if it doesn't already exist.
    if 'env' not in config_node.value:
        config_node.set(['env'])
    if templating not in config_node.value:
        config_node.set([templating])

    # Get the ROSE_ORIG_HOST value:
    rose_orig_host = get_host()

    # For each section process variables and add standard variables.
    for section in ['env', templating]:

        # This loop handles standard variables.
        # CYLC_VERSION - If it's in the config, remove it.
        # ROSE_VERSION - If it's in the config, replace it.
        # ROSE_ORIG_HOST - If it's in the config, replace it, unless it
        # has a comment marking it as having been saved by
        # ``cylc install``.
        # In all cases warn users if the value in their config is not
        # used.
        for var_name, replace_with in [
            ('ROSE_ORIG_HOST', rose_orig_host),
            ('ROSE_VERSION', ROSE_VERSION),
            ('CYLC_VERSION', SET_BY_CYLC),
        ]:
            # Warn if we're going to override a variable:
            if override_this_variable(config_node, section, var_name):
                user_var = config_node[section].value[var_name].value
                LOG.warning(
                    f'[{section}]{var_name}={user_var} from '
                    f'rose-suite.conf will be ignored: {var_name} '
                    f'will be: {replace_with}')

            # Handle replacement of stored variable if appropriate:
            if replace_with == SET_BY_CYLC:
                config_node[section].unset([var_name])
            elif not rose_orig_host_set_by_cylc_install(
                    config_node, section, var_name):
                config_node[section].set([var_name], replace_with)

        # Use env_var_process to process variables which may need
        # expanding.
        for key, node in config_node.value[section].value.items():
            try:
                config_node.value[section].value[key].value = (
                    env_var_process(node.value, environ=environ))
                if section == 'env':
                    environ[key] = node.value
            except UnboundEnvironmentVariableError as exc:
                raise ConfigProcessError(['env', key], node.value, exc)

    # For each of the template language sections extract items to a
    # simple dict to be returned.
    if 'env' in config_node.value:
        config['env'] = {
            item[0][1]: item[1].value
            for item in config_node.value['env'].walk()
        }
    if templating in config_node.value:
        config['template_variables'] = {
            item[0][1]: item[1].value
            for item in config_node.value[templating].walk()
        }
    elif 'template variables' in config_node.value:
        config['template_variables'] = {
            item[0][1]: item[1].value
            for item in config_node.value['template variables'].walk()
        }

    # Add the entire config to ROSE_SUITE_VARIABLES to allow for
    # programmatic access.
    if templating is not None:
        with patch_jinja2_leading_zeros():
            # BACK COMPAT: patch_jinja2_leading_zeros
            # back support zero-padded integers for a limited time to
            # help users migrate before upgrading cylc-flow to
            # Jinja2>=3.1
            parser = Parser()
            for key, value in config['template_variables'].items():
                # The special variables are already Python variables.
                if key not in [
                        'ROSE_ORIG_HOST', 'ROSE_VERSION', 'ROSE_SITE']:
                    try:
                        config['template_variables'][key] = (
                            parser.literal_eval(value))
                    except Exception:
                        raise ConfigProcessError(
                            [templating, key],
                            value,
                            f'Invalid template variable: {value}'
                            '\nMust be a valid Python or Jinja2 literal'
                            ' (note strings "must be quoted").'
                        ) from None

    # Add ROSE_SUITE_VARIABLES to config of templating engines in use.
    if templating is not None:
        config['template_variables'][
            'ROSE_SUITE_VARIABLES'] = config['template_variables']
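# --- Illustrative sketch (not part of the original module) ---
# The literal_eval step above is why template variables "must be quoted"
# to stay strings; ast.literal_eval (standing in here for the Rose/Jinja2
# Parser) shows the rule:
import ast
assert ast.literal_eval('42') == 42              # becomes an int
assert ast.literal_eval('"quoted"') == 'quoted'  # stays a str
try:
    ast.literal_eval('unquoted')                 # not a valid literal
except (ValueError, SyntaxError):
    pass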
""" import os import pytest import shutil import subprocess from pathlib import Path from shlex import split from uuid import uuid4 from cylc.flow.pathutil import get_workflow_run_dir from cylc.flow.hostuserutil import get_host HOST = get_host().split('.')[0] class SubprocessesError(Exception): ... # Check that FCM is present on system, skipping checks elsewise: try: subprocess.run(['fcm', '--version']) except FileNotFoundError: pytest.skip("\"FCM\" not installed", allow_module_level=True) @pytest.fixture(scope='module') def monkeymodule():
def test_is_remote_host_on_localhost(self):
    """is_remote_host with localhost."""
    self.assertFalse(is_remote_host(None))
    self.assertFalse(is_remote_host('localhost'))
    self.assertFalse(is_remote_host(os.getenv('HOSTNAME')))
    self.assertFalse(is_remote_host(get_host()))
def submit_task_jobs(self, suite, itasks, is_simulation=False):
    """Prepare and submit task jobs.

    Submit tasks where possible. Ignore tasks that are waiting for host
    select command to complete, or tasks that are waiting for remote
    initialisation. Bad host select command, error writing to a job file
    or bad remote initialisation will cause a bad task - leading to
    submission failure.

    This method uses prep_submit_task_job() as helper.

    Return (list): list of tasks that attempted submission.
    """
    if is_simulation:
        return self._simulation_submit_task_jobs(itasks)

    # Prepare tasks for job submission
    prepared_tasks, bad_tasks = self.prep_submit_task_jobs(suite, itasks)

    # Reset consumed host selection results
    self.task_remote_mgr.remote_host_select_reset()

    if not prepared_tasks:
        return bad_tasks

    # Group task jobs by (host, owner)
    auth_itasks = {}  # {(host, owner): [itask, ...], ...}
    for itask in prepared_tasks:
        auth_itasks.setdefault((itask.task_host, itask.task_owner), [])
        auth_itasks[(itask.task_host, itask.task_owner)].append(itask)
    # Submit task jobs for each (host, owner) group
    done_tasks = bad_tasks
    for (host, owner), itasks in sorted(auth_itasks.items()):
        is_init = self.task_remote_mgr.remote_init(host, owner)
        if is_init is None:
            # Remote is waiting to be initialised
            for itask in itasks:
                itask.set_summary_message(self.REMOTE_INIT_MSG)
            continue
        # Ensure that localhost background/at jobs are recorded as
        # running on the host name of the current suite host, rather
        # than just "localhost". On suite restart on a different suite
        # host, this allows the restart logic to correctly poll the
        # status of the background/at jobs that may still be running on
        # the previous suite host.
        if (
            self.batch_sys_mgr.is_job_local_to_host(
                itask.summary['batch_sys_name'])
            and not is_remote_host(host)
        ):
            owner_at_host = get_host()
        else:
            owner_at_host = host
        # Persist
        if owner:
            owner_at_host = owner + '@' + owner_at_host
        now_str = get_current_time_string()
        done_tasks.extend(itasks)
        for itask in itasks:
            # Log and persist
            LOG.info(
                '[%s] -submit-num=%02d, owner@host=%s',
                itask, itask.submit_num, owner_at_host)
            self.suite_db_mgr.put_insert_task_jobs(itask, {
                'is_manual_submit': itask.is_manual_submit,
                'try_num': itask.get_try_num(),
                'time_submit': now_str,
                'user_at_host': owner_at_host,
                'batch_sys_name': itask.summary['batch_sys_name'],
            })
            itask.is_manual_submit = False
        if is_init == REMOTE_INIT_FAILED:
            # Remote has failed to initialise
            # Set submit-failed for all affected tasks
            for itask in itasks:
                itask.local_job_file_path = None  # reset for retry
                log_task_job_activity(
                    SubProcContext(
                        self.JOBS_SUBMIT,
                        '(init %s)' % owner_at_host,
                        err=REMOTE_INIT_FAILED,
                        ret_code=1),
                    suite, itask.point, itask.tdef.name)
                self.task_events_mgr.process_message(
                    itask, CRITICAL,
                    self.task_events_mgr.EVENT_SUBMIT_FAILED)
            continue
        # Build the "cylc jobs-submit" command
        cmd = ['cylc', self.JOBS_SUBMIT]
        if LOG.isEnabledFor(DEBUG):
            cmd.append('--debug')
        if get_utc_mode():
            cmd.append('--utc-mode')
        remote_mode = False
        kwargs = {}
        for key, value, test_func in [
                ('host', host, is_remote_host),
                ('user', owner, is_remote_user)]:
            if test_func(value):
                cmd.append('--%s=%s' % (key, value))
                remote_mode = True
                kwargs[key] = value
        if remote_mode:
            cmd.append('--remote-mode')
        cmd.append('--')
        cmd.append(glbl_cfg().get_derived_host_item(
            suite, 'suite job log directory', host, owner))
        # Chop itasks into a series of shorter lists if it's very big
        # to prevent overloading of stdout and stderr pipes.
        itasks = sorted(itasks, key=lambda itask: itask.identity)
        chunk_size = len(itasks) // ((len(itasks) // 100) + 1) + 1
        itasks_batches = [
            itasks[i:i + chunk_size]
            for i in range(0, len(itasks), chunk_size)]
        LOG.debug(
            '%s ... # will invoke in batches, sizes=%s',
            cmd, [len(b) for b in itasks_batches])
        for i, itasks_batch in enumerate(itasks_batches):
            stdin_files = []
            job_log_dirs = []
            for itask in itasks_batch:
                if remote_mode:
                    stdin_files.append(
                        get_task_job_job_log(
                            suite, itask.point, itask.tdef.name,
                            itask.submit_num))
                job_log_dirs.append(get_task_job_id(
                    itask.point, itask.tdef.name, itask.submit_num))
                # The job file is now (about to be) used: reset the file
                # write flag so that subsequent manual retrigger will
                # generate a new job file.
                itask.local_job_file_path = None
                itask.state.reset_state(TASK_STATUS_READY)
                if itask.state.outputs.has_custom_triggers():
                    self.suite_db_mgr.put_update_task_outputs(itask)
            self.proc_pool.put_command(
                SubProcContext(
                    self.JOBS_SUBMIT,
                    cmd + job_log_dirs,
                    stdin_files=stdin_files,
                    job_log_dirs=job_log_dirs,
                    **kwargs
                ),
                self._submit_task_jobs_callback, [suite, itasks_batch])
    return done_tasks
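# --- Illustrative sketch (not part of the original module) ---
# The batching arithmetic above caps each "cylc jobs-submit" invocation
# at roughly 100 tasks while keeping the batch sizes even:
def _batch(items, limit=100):
    chunk_size = len(items) // ((len(items) // limit) + 1) + 1
    return [items[i:i + chunk_size] for i in range(0, len(items), chunk_size)]

batches = _batch(list(range(250)))
assert [len(b) for b in batches] == [84, 84, 82]
assert all(len(b) <= 100 for b in batches)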
def get_cli_opts_node(opts=None, srcdir=None):
    """Create a ConfigNode representing options set on the command line.

    Args:
        opts (CylcOptionParser object):
            Object with values from the command line.

    Returns:
        Rose ConfigNode.

    Example:
        >>> from types import SimpleNamespace
        >>> opts = SimpleNamespace(
        ...     opt_conf_keys='A B',
        ...     defines=["[env]FOO=BAR"],
        ...     rose_template_vars=["QUX=BAZ"]
        ... )
        >>> node = get_cli_opts_node(opts)
        >>> node['opts']
        {'value': 'A B', 'state': '!', 'comments': []}
        >>> node['env']['FOO']
        {'value': 'BAR', 'state': '', 'comments': []}
        >>> node['template variables']['QUX']
        {'value': 'BAZ', 'state': '', 'comments': []}
    """
    # Unpack info we want from opts:
    opt_conf_keys = []
    defines = []
    rose_template_vars = []
    if opts and 'opt_conf_keys' in dir(opts):
        opt_conf_keys = opts.opt_conf_keys
    if opts and 'defines' in dir(opts):
        defines = opts.defines
    if opts and 'rose_template_vars' in dir(opts):
        rose_template_vars = opts.rose_template_vars

    rose_orig_host = get_host()
    defines.append(f'[env]ROSE_ORIG_HOST={rose_orig_host}')
    rose_template_vars.append(f'ROSE_ORIG_HOST={rose_orig_host}')

    # Construct new output based on optional configs:
    newconfig = ConfigNode()

    # For each define, determine whether it is an env or template define.
    for define in defines:
        match = re.match(
            (r'^\[(?P<key1>.*)\](?P<state>!{0,2})'
             r'(?P<key2>.*)\s*=\s*(?P<value>.*)'),
            define
        ).groupdict()
        if match['key1'] == '' and match['state'] in ['!', '!!']:
            LOG.warning(
                'CLI opts set to ignored or trigger-ignored will be '
                'ignored.')
        else:
            newconfig.set(
                keys=[match['key1'], match['key2']],
                value=match['value'],
                state=match['state'])

    # For each template variable define, add a define.
    if srcdir is not None:
        config_node = rose_config_tree_loader(srcdir, opts).node
        templating = identify_templating_section(config_node)
    else:
        templating = 'template variables'
    for define in rose_template_vars:
        match = re.match(
            r'(?P<state>!{0,2})(?P<key>.*)\s*=\s*(?P<value>.*)', define
        ).groupdict()
        # Guess templating type?
        newconfig.set(
            keys=[templating, match['key']],
            value=match['value'],
            state=match['state'])

    # Specialised treatment of optional configs.
    if 'opts' not in newconfig:
        newconfig['opts'] = ConfigNode()
        newconfig['opts'].value = ''
    newconfig['opts'].value = merge_opts(newconfig, opt_conf_keys)
    newconfig['opts'].state = '!'

    return newconfig
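# --- Illustrative sketch (not part of the original module) ---
# The define-parsing regex above splits a CLI define into section, state
# and key=value; applied to a typical --define argument:
import re
_match = re.match(
    r'^\[(?P<key1>.*)\](?P<state>!{0,2})(?P<key2>.*)\s*=\s*(?P<value>.*)',
    '[env]FOO=BAR',
).groupdict()
assert _match == {'key1': 'env', 'state': '', 'key2': 'FOO', 'value': 'BAR'}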
def submit_task_jobs(self, suite, itasks, curve_auth,
                     client_pub_key_dir, is_simulation=False):
    """Prepare for job submission and submit task jobs.

    Preparation (host selection, remote host init, and remote install)
    is done asynchronously. Newly released tasks may be sent here
    several times until these init subprocesses have returned. Failure
    during preparation is considered to be job submission failure.

    Once preparation has completed or failed, reset .waiting_on_job_prep
    in task instances so the scheduler knows to stop sending them back
    here.

    This method uses prep_submit_task_job() as helper.

    Return (list): list of tasks that attempted submission.
    """
    if is_simulation:
        return self._simulation_submit_task_jobs(itasks)

    # Prepare tasks for job submission
    prepared_tasks, bad_tasks = self.prep_submit_task_jobs(suite, itasks)

    # Reset consumed host selection results
    self.task_remote_mgr.subshell_eval_reset()

    if not prepared_tasks:
        return bad_tasks

    auth_itasks = {}  # {platform: [itask, ...], ...}
    for itask in prepared_tasks:
        platform_name = itask.platform['name']
        auth_itasks.setdefault(platform_name, [])
        auth_itasks[platform_name].append(itask)
    # Submit task jobs for each platform
    done_tasks = bad_tasks
    for platform_name, itasks in sorted(auth_itasks.items()):
        platform = itasks[0].platform
        install_target = get_install_target_from_platform(platform)
        ri_map = self.task_remote_mgr.remote_init_map

        if ri_map.get(install_target) != REMOTE_FILE_INSTALL_DONE:
            if install_target == get_localhost_install_target():
                # Skip init and file install for localhost.
                LOG.debug(f"REMOTE INIT NOT REQUIRED for {install_target}")
                ri_map[install_target] = REMOTE_FILE_INSTALL_DONE

            elif install_target not in ri_map:
                # Remote init not in progress for target, so start it.
                self.task_remote_mgr.remote_init(
                    platform, curve_auth, client_pub_key_dir)
                for itask in itasks:
                    itask.set_summary_message(self.REMOTE_INIT_MSG)
                    self.data_store_mgr.delta_job_msg(
                        get_task_job_id(
                            itask.point, itask.tdef.name,
                            itask.submit_num),
                        self.REMOTE_INIT_MSG)
                continue

            elif ri_map[install_target] == REMOTE_INIT_DONE:
                # Already done remote init so move on to file install
                self.task_remote_mgr.file_install(platform)
                continue

            elif ri_map[install_target] in self.IN_PROGRESS.keys():
                # Remote init or file install in progress.
                for itask in itasks:
                    msg = self.IN_PROGRESS[ri_map[install_target]]
                    itask.set_summary_message(msg)
                    self.data_store_mgr.delta_job_msg(
                        get_task_job_id(
                            itask.point, itask.tdef.name,
                            itask.submit_num),
                        msg)
                continue

        # Ensure that localhost background/at jobs are recorded as
        # running on the host name of the current suite host, rather
        # than just "localhost". On suite restart on a different suite
        # host, this allows the restart logic to correctly poll the
        # status of the background/at jobs that may still be running on
        # the previous suite host.
        host = get_host_from_platform(platform)
        if (self.job_runner_mgr.is_job_local_to_host(
                itask.summary['job_runner_name'])
                and not is_remote_platform(platform)):
            host = get_host()

        now_str = get_current_time_string()
        done_tasks.extend(itasks)
        for itask in itasks:
            # Log and persist
            LOG.info(
                '[%s] -submit-num=%02d, host=%s',
                itask, itask.submit_num, host)
            self.suite_db_mgr.put_insert_task_jobs(itask, {
                'is_manual_submit': itask.is_manual_submit,
                'try_num': itask.get_try_num(),
                'time_submit': now_str,
                'platform_name': itask.platform['name'],
                'job_runner_name': itask.summary['job_runner_name'],
            })
            itask.is_manual_submit = False

        if (ri_map[install_target]
                in [REMOTE_INIT_FAILED, REMOTE_FILE_INSTALL_FAILED]):
            # Remote init or install failed. Set submit-failed for all
            # affected tasks and remove target from remote init map
            # - this enables new tasks to re-initialise that target
            init_error = ri_map[install_target]
            del ri_map[install_target]
            for itask in itasks:
                itask.waiting_on_job_prep = False
                itask.local_job_file_path = None  # reset for retry
                log_task_job_activity(
                    SubProcContext(
                        self.JOBS_SUBMIT,
                        '(init %s)' % host,
                        err=init_error,
                        ret_code=1),
                    suite, itask.point, itask.tdef.name)
                self._prep_submit_task_job_error(
                    suite, itask, '(remote init)', '')
            continue

        # Build the "cylc jobs-submit" command
        cmd = [self.JOBS_SUBMIT]
        if LOG.isEnabledFor(DEBUG):
            cmd.append('--debug')
        if get_utc_mode():
            cmd.append('--utc-mode')
        if is_remote_platform(itask.platform):
            remote_mode = True
            cmd.append('--remote-mode')
        else:
            remote_mode = False
        if itask.platform['clean job submission environment']:
            cmd.append('--clean-env')
        for var in itask.platform[
                'job submission environment pass-through']:
            cmd.append(f"--env={var}")
        for path in itask.platform[
                'job submission executable paths'] + SYSPATH:
            cmd.append(f"--path={path}")
        cmd.append('--')
        cmd.append(get_remote_suite_run_job_dir(platform, suite))
        # Chop itasks into a series of shorter lists if it's very big
        # to prevent overloading of stdout and stderr pipes.
        itasks = sorted(itasks, key=lambda itask: itask.identity)
        chunk_size = (
            len(itasks) // (
                (len(itasks) // platform['max batch submit size']) + 1
            ) + 1
        )
        itasks_batches = [
            itasks[i:i + chunk_size]
            for i in range(0, len(itasks), chunk_size)
        ]
        LOG.debug(
            '%s ... # will invoke in batches, sizes=%s',
            cmd, [len(b) for b in itasks_batches])

        if remote_mode:
            cmd = construct_ssh_cmd(cmd, platform)
        else:
            cmd = ['cylc'] + cmd

        for i, itasks_batch in enumerate(itasks_batches):
            stdin_files = []
            job_log_dirs = []
            for itask in itasks_batch:
                if remote_mode:
                    stdin_files.append(
                        os.path.expandvars(
                            get_task_job_job_log(
                                suite, itask.point, itask.tdef.name,
                                itask.submit_num)))
                job_log_dirs.append(
                    get_task_job_id(
                        itask.point, itask.tdef.name, itask.submit_num))
                # The job file is now (about to be) used: reset the file
                # write flag so that subsequent manual retrigger will
                # generate a new job file.
                itask.local_job_file_path = None

                if itask.state.outputs.has_custom_triggers():
                    self.suite_db_mgr.put_update_task_outputs(itask)

                itask.waiting_on_job_prep = False
            self.proc_pool.put_command(
                SubProcContext(
                    self.JOBS_SUBMIT,
                    cmd + job_log_dirs,
                    stdin_files=stdin_files,
                    job_log_dirs=job_log_dirs,
                ),
                self._submit_task_jobs_callback, [suite, itasks_batch])
    return done_tasks
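# --- Illustrative sketch (not part of the original module) ---
# The remote_init_map bookkeeping above behaves like a per-install-target
# state machine. The state names and transitions below are assumptions
# modelled on the function; the real constants live in cylc.flow.
REMOTE_INIT_DONE = 'remote init done'
REMOTE_FILE_INSTALL_DONE = 'file install done'

def _advance(ri_map, target, is_localhost):
    if is_localhost:
        ri_map[target] = REMOTE_FILE_INSTALL_DONE  # skip init and install
    elif target not in ri_map:
        ri_map[target] = 'remote init in progress'  # kick off remote init
    elif ri_map[target] == REMOTE_INIT_DONE:
        ri_map[target] = 'file install in progress'
    return ri_map[target]

rim = {}
assert _advance(rim, 'hpc1', False) == 'remote init in progress'
rim['hpc1'] = REMOTE_INIT_DONE
assert _advance(rim, 'hpc1', False) == 'file install in progress'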
def submit_task_jobs(self, suite, itasks, is_simulation=False):
    """Prepare and submit task jobs.

    Submit tasks where possible. Ignore tasks that are waiting for host
    select command to complete, or tasks that are waiting for remote
    initialisation. Bad host select command, error writing to a job file
    or bad remote initialisation will cause a bad task - leading to
    submission failure.

    This method uses prep_submit_task_job() as helper.

    Return (list): list of tasks that attempted submission.
    """
    if is_simulation:
        return self._simulation_submit_task_jobs(itasks)

    # Prepare tasks for job submission
    prepared_tasks, bad_tasks = self.prep_submit_task_jobs(suite, itasks)

    # Reset consumed host selection results
    self.task_remote_mgr.remote_host_select_reset()

    if not prepared_tasks:
        return bad_tasks

    # Group task jobs by (host, owner)
    auth_itasks = {}  # {(host, owner): [itask, ...], ...}
    for itask in prepared_tasks:
        auth_itasks.setdefault((itask.task_host, itask.task_owner), [])
        auth_itasks[(itask.task_host, itask.task_owner)].append(itask)
    # Submit task jobs for each (host, owner) group
    done_tasks = bad_tasks
    for (host, owner), itasks in sorted(auth_itasks.items()):
        is_init = self.task_remote_mgr.remote_init(host, owner)
        if is_init is None:
            # Remote is waiting to be initialised
            for itask in itasks:
                itask.set_summary_message(self.REMOTE_INIT_MSG)
                self.job_pool.add_job_msg(
                    get_task_job_id(
                        itask.point, itask.tdef.name, itask.submit_num),
                    self.REMOTE_INIT_MSG)
            continue
        # Ensure that localhost background/at jobs are recorded as
        # running on the host name of the current suite host, rather
        # than just "localhost". On suite restart on a different suite
        # host, this allows the restart logic to correctly poll the
        # status of the background/at jobs that may still be running on
        # the previous suite host.
        if (self.batch_sys_mgr.is_job_local_to_host(
                itask.summary['batch_sys_name'])
                and not is_remote_host(host)):
            owner_at_host = get_host()
        else:
            owner_at_host = host
        # Persist
        if owner:
            owner_at_host = owner + '@' + owner_at_host
        now_str = get_current_time_string()
        done_tasks.extend(itasks)
        for itask in itasks:
            # Log and persist
            LOG.info(
                '[%s] -submit-num=%02d, owner@host=%s',
                itask, itask.submit_num, owner_at_host)
            self.suite_db_mgr.put_insert_task_jobs(itask, {
                'is_manual_submit': itask.is_manual_submit,
                'try_num': itask.get_try_num(),
                'time_submit': now_str,
                'user_at_host': owner_at_host,
                'batch_sys_name': itask.summary['batch_sys_name'],
            })
            itask.is_manual_submit = False
        if is_init == REMOTE_INIT_FAILED:
            # Remote has failed to initialise
            # Set submit-failed for all affected tasks
            for itask in itasks:
                itask.local_job_file_path = None  # reset for retry
                log_task_job_activity(
                    SubProcContext(
                        self.JOBS_SUBMIT,
                        '(init %s)' % owner_at_host,
                        err=REMOTE_INIT_FAILED,
                        ret_code=1),
                    suite, itask.point, itask.tdef.name)
                self.task_events_mgr.process_message(
                    itask, CRITICAL,
                    self.task_events_mgr.EVENT_SUBMIT_FAILED)
            continue
        # Build the "cylc jobs-submit" command
        cmd = ['cylc', self.JOBS_SUBMIT]
        if LOG.isEnabledFor(DEBUG):
            cmd.append('--debug')
        if get_utc_mode():
            cmd.append('--utc-mode')
        remote_mode = False
        kwargs = {}
        for key, value, test_func in [
                ('host', host, is_remote_host),
                ('user', owner, is_remote_user)]:
            if test_func(value):
                cmd.append('--%s=%s' % (key, value))
                remote_mode = True
                kwargs[key] = value
        if remote_mode:
            cmd.append('--remote-mode')
        cmd.append('--')
        cmd.append(get_remote_suite_run_job_dir(host, owner, suite))
        # Chop itasks into a series of shorter lists if it's very big
        # to prevent overloading of stdout and stderr pipes.
        itasks = sorted(itasks, key=lambda itask: itask.identity)
        chunk_size = len(itasks) // ((len(itasks) // 100) + 1) + 1
        itasks_batches = [
            itasks[i:i + chunk_size]
            for i in range(0, len(itasks), chunk_size)
        ]
        LOG.debug(
            '%s ... # will invoke in batches, sizes=%s',
            cmd, [len(b) for b in itasks_batches])
        for i, itasks_batch in enumerate(itasks_batches):
            stdin_files = []
            job_log_dirs = []
            for itask in itasks_batch:
                if remote_mode:
                    stdin_files.append(
                        get_task_job_job_log(
                            suite, itask.point, itask.tdef.name,
                            itask.submit_num))
                job_log_dirs.append(
                    get_task_job_id(
                        itask.point, itask.tdef.name, itask.submit_num))
                # The job file is now (about to be) used: reset the file
                # write flag so that subsequent manual retrigger will
                # generate a new job file.
                itask.local_job_file_path = None
                itask.state.reset(TASK_STATUS_READY)
                if itask.state.outputs.has_custom_triggers():
                    self.suite_db_mgr.put_update_task_outputs(itask)
            self.proc_pool.put_command(
                SubProcContext(
                    self.JOBS_SUBMIT,
                    cmd + job_log_dirs,
                    stdin_files=stdin_files,
                    job_log_dirs=job_log_dirs,
                    **kwargs),
                self._submit_task_jobs_callback, [suite, itasks_batch])
    return done_tasks