def put_suite_params(self, schd):
    """Put various suite parameters from schd in runtime database.

    This method queues the relevant insert statements.

    Arguments:
        schd (cylc.flow.scheduler.Scheduler): scheduler object.
    """
    if schd.final_point is None:
        # Store None as proper null value in database. No need to do this
        # for initial cycle point, which should never be None.
        final_point_str = None
    else:
        final_point_str = str(schd.final_point)
    self.db_inserts_map[self.TABLE_SUITE_PARAMS].extend([
        {"key": "uuid_str", "value": str(schd.uuid_str)},
        {"key": "run_mode", "value": schd.run_mode},
        {"key": "cylc_version", "value": CYLC_VERSION},
        {"key": "UTC_mode", "value": get_utc_mode()},
        {"key": "initial_point", "value": str(schd.initial_point)},
        {"key": "final_point", "value": final_point_str},
    ])
    if schd.config.cfg['cylc']['cycle point format']:
        self.db_inserts_map[self.TABLE_SUITE_PARAMS].append({
            "key": "cycle_point_format",
            "value": schd.config.cfg['cylc']['cycle point format']})
    if schd.pool.is_held:
        self.db_inserts_map[self.TABLE_SUITE_PARAMS].append({
            "key": "is_held", "value": 1})
    if schd.cli_start_point_string:
        self.db_inserts_map[self.TABLE_SUITE_PARAMS].append({
            "key": "start_point", "value": schd.cli_start_point_string})
def put_workflow_params(self, schd):
    """Put various workflow parameters from schd in runtime database.

    This method queues the relevant insert statements.

    Arguments:
        schd (cylc.flow.scheduler.Scheduler): scheduler object.
    """
    self.db_deletes_map[self.TABLE_WORKFLOW_PARAMS].append({})
    self.db_inserts_map[self.TABLE_WORKFLOW_PARAMS].extend([
        {
            "key": self.KEY_UUID_STR,
            "value": schd.uuid_str
        },
        {
            "key": self.KEY_CYLC_VERSION,
            "value": CYLC_VERSION
        },
        {
            "key": self.KEY_UTC_MODE,
            "value": get_utc_mode()
        },
    ])
    if schd.config.cycle_point_dump_format is not None:
        self.db_inserts_map[self.TABLE_WORKFLOW_PARAMS].append({
            "key": self.KEY_CYCLE_POINT_FORMAT,
            "value": schd.config.cycle_point_dump_format
        })
    if schd.is_paused:
        self.db_inserts_map[self.TABLE_WORKFLOW_PARAMS].append({
            "key": self.KEY_PAUSED,
            "value": 1
        })
    for key in (
            self.KEY_INITIAL_CYCLE_POINT,
            self.KEY_FINAL_CYCLE_POINT,
            self.KEY_START_CYCLE_POINT,
            self.KEY_STOP_CYCLE_POINT,
            self.KEY_RUN_MODE,
            self.KEY_CYCLE_POINT_TIME_ZONE):
        value = getattr(schd.options, key, None)
        if value is not None and value != 'ignore':
            self.db_inserts_map[self.TABLE_WORKFLOW_PARAMS].append({
                "key": key,
                "value": value
            })
    for key in (self.KEY_STOP_CLOCK_TIME, self.KEY_STOP_TASK):
        value = getattr(schd, key, None)
        if value is not None:
            self.db_inserts_map[self.TABLE_WORKFLOW_PARAMS].append({
                "key": key,
                "value": value
            })
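# A minimal, self-contained sketch (not the actual Cylc implementation) of
# the queue-then-flush pattern used by put_workflow_params() above: deletes
# and inserts are buffered in dicts of lists keyed by table name, then
# flushed to SQLite in one transaction. The table name, flush_params()
# helper and its signature are hypothetical illustrations only.
import sqlite3


def flush_params(conn, db_deletes_map, db_inserts_map):
    """Apply queued deletes, then queued inserts, for workflow_params."""
    for _ in db_deletes_map.get('workflow_params', []):
        # An empty dict queued above means "delete all rows".
        conn.execute('DELETE FROM workflow_params')
    for row in db_inserts_map.get('workflow_params', []):
        conn.execute(
            'INSERT INTO workflow_params (key, value) VALUES (?, ?)',
            (row['key'], str(row['value'])))
    conn.commit()


conn = sqlite3.connect(':memory:')
conn.execute('CREATE TABLE workflow_params (key TEXT, value TEXT)')
flush_params(
    conn,
    {'workflow_params': [{}]},
    {'workflow_params': [{'key': 'UTC_mode', 'value': True}]})
print(conn.execute('SELECT * FROM workflow_params').fetchall())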
def _test(utc_mode, expected, expected_warnings=0):
    mock_glbl_cfg(
        'cylc.flow.config.glbl_cfg',
        f'''
        [cylc]
            UTC mode = {utc_mode['glbl']}
        ''')
    mock_config = Mock()
    mock_config.cfg = {'cylc': {'UTC mode': utc_mode['suite']}}
    mock_config.options.utc_mode = utc_mode['stored']
    SuiteConfig.process_utc_mode(mock_config)
    assert mock_config.cfg['cylc']['UTC mode'] is expected
    assert get_utc_mode() is expected
    assert len(caplog.record_tuples) == expected_warnings
    caplog.clear()
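# Illustrative invocations of the _test() helper above, in the shape the
# enclosing pytest test would use (it assumes the caplog fixture and the
# mock_glbl_cfg helper from the surrounding test module). The expected
# values are assumptions about process_utc_mode()'s precedence (the suite
# setting overrides the global default; a value stored from a previous
# run wins on restart, with a warning), not a transcript of the real data.
_test({'glbl': True, 'suite': None, 'stored': None}, True)
_test({'glbl': False, 'suite': True, 'stored': None}, True)
_test({'glbl': False, 'suite': False, 'stored': True}, True,
      expected_warnings=1)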
def put_suite_params(self, schd):
    """Put various suite parameters from schd in runtime database.

    This method queues the relevant insert statements.

    Arguments:
        schd (cylc.flow.scheduler.Scheduler): scheduler object.
    """
    self.db_deletes_map[self.TABLE_SUITE_PARAMS].append({})
    if schd.config.final_point is None:
        # Store None as proper null value in database. No need to do this
        # for initial cycle point, which should never be None.
        final_point_str = None
    else:
        final_point_str = str(schd.config.final_point)
    self.db_inserts_map[self.TABLE_SUITE_PARAMS].extend([
        {"key": self.KEY_UUID_STR, "value": str(schd.uuid_str)},
        {"key": "cylc_version", "value": CYLC_VERSION},
        {"key": "UTC_mode", "value": get_utc_mode()},
    ])
    if schd.config.cfg['cylc']['cycle point format']:
        self.db_inserts_map[self.TABLE_SUITE_PARAMS].append({
            "key": "cycle_point_format",
            "value": schd.config.cfg['cylc']['cycle point format']})
    if schd.pool.is_held:
        self.db_inserts_map[self.TABLE_SUITE_PARAMS].append({
            "key": self.KEY_HOLD, "value": 1})
    for key in (
            self.KEY_INITIAL_CYCLE_POINT,
            self.KEY_FINAL_CYCLE_POINT,
            self.KEY_START_CYCLE_POINT,
            self.KEY_STOP_CYCLE_POINT,
            self.KEY_RUN_MODE,
    ):
        value = getattr(schd.options, key, None)
        if value is not None:
            self.db_inserts_map[self.TABLE_SUITE_PARAMS].append({
                "key": key, "value": value})
    if schd.options.no_auto_shutdown is not None:
        self.db_inserts_map[self.TABLE_SUITE_PARAMS].append({
            "key": self.KEY_NO_AUTO_SHUTDOWN,
            "value": int(schd.options.no_auto_shutdown)})
    for key in (self.KEY_STOP_CLOCK_TIME, self.KEY_STOP_TASK):
        value = getattr(schd, key, None)
        if value is not None:
            self.db_inserts_map[self.TABLE_SUITE_PARAMS].append({
                "key": key, "value": value})
def put_suite_params(self, schd):
    """Put various suite parameters from schd in runtime database.

    This method queues the relevant insert statements.

    Arguments:
        schd (cylc.flow.scheduler.Scheduler): scheduler object.
    """
    self.db_deletes_map[self.TABLE_SUITE_PARAMS].append({})
    self.db_inserts_map[self.TABLE_SUITE_PARAMS].extend([
        {"key": self.KEY_UUID_STR, "value": str(schd.uuid_str)},
        {"key": self.KEY_CYLC_VERSION, "value": CYLC_VERSION},
        {"key": self.KEY_UTC_MODE, "value": get_utc_mode()},
    ])
    if schd.config.cycle_point_dump_format is not None:
        self.db_inserts_map[self.TABLE_SUITE_PARAMS].append({
            "key": self.KEY_CYCLE_POINT_FORMAT,
            "value": schd.config.cycle_point_dump_format})
    if schd.pool.is_held:
        self.db_inserts_map[self.TABLE_SUITE_PARAMS].append({
            "key": self.KEY_HOLD, "value": 1})
    for key in (
            self.KEY_INITIAL_CYCLE_POINT,
            self.KEY_FINAL_CYCLE_POINT,
            self.KEY_START_CYCLE_POINT,
            self.KEY_STOP_CYCLE_POINT,
            self.KEY_RUN_MODE,
            self.KEY_CYCLE_POINT_TIME_ZONE
    ):
        value = getattr(schd.options, key, None)
        if value is not None:
            self.db_inserts_map[self.TABLE_SUITE_PARAMS].append({
                "key": key, "value": value})
    if schd.options.no_auto_shutdown is not None:
        self.db_inserts_map[self.TABLE_SUITE_PARAMS].append({
            "key": self.KEY_NO_AUTO_SHUTDOWN,
            "value": int(schd.options.no_auto_shutdown)})
    for key in (self.KEY_STOP_CLOCK_TIME, self.KEY_STOP_TASK):
        value = getattr(schd, key, None)
        if value is not None:
            self.db_inserts_map[self.TABLE_SUITE_PARAMS].append({
                "key": key, "value": value})
def submit_task_jobs(self, suite, itasks, is_simulation=False):
    """Prepare and submit task jobs.

    Submit tasks where possible. Ignore tasks that are waiting for host
    select command to complete, or tasks that are waiting for remote
    initialisation. Bad host select command, error writing to a job file
    or bad remote initialisation will cause a bad task - leading to
    submission failure.

    This method uses prep_submit_task_job() as helper.

    Return (list): list of tasks that attempted submission.
    """
    if is_simulation:
        return self._simulation_submit_task_jobs(itasks)
    # Prepare tasks for job submission
    prepared_tasks, bad_tasks = self.prep_submit_task_jobs(suite, itasks)
    # Reset consumed host selection results
    self.task_remote_mgr.remote_host_select_reset()
    if not prepared_tasks:
        return bad_tasks
    # Group task jobs by (host, owner)
    auth_itasks = {}  # {(host, owner): [itask, ...], ...}
    for itask in prepared_tasks:
        auth_itasks.setdefault((itask.task_host, itask.task_owner), [])
        auth_itasks[(itask.task_host, itask.task_owner)].append(itask)
    # Submit task jobs for each (host, owner) group
    done_tasks = bad_tasks
    for (host, owner), itasks in sorted(auth_itasks.items()):
        is_init = self.task_remote_mgr.remote_init(host, owner)
        if is_init is None:
            # Remote is waiting to be initialised
            for itask in itasks:
                itask.set_summary_message(self.REMOTE_INIT_MSG)
                self.job_pool.add_job_msg(
                    get_task_job_id(
                        itask.point, itask.tdef.name, itask.submit_num),
                    self.REMOTE_INIT_MSG)
            continue
        # Ensure that localhost background/at jobs are recorded as running
        # on the host name of the current suite host, rather than just
        # "localhost". On suite restart on a different suite host, this
        # allows the restart logic to correctly poll the status of the
        # background/at jobs that may still be running on the previous
        # suite host.
        if (self.batch_sys_mgr.is_job_local_to_host(
                itask.summary['batch_sys_name'])
                and not is_remote_host(host)):
            owner_at_host = get_host()
        else:
            owner_at_host = host
        # Persist
        if owner:
            owner_at_host = owner + '@' + owner_at_host
        now_str = get_current_time_string()
        done_tasks.extend(itasks)
        for itask in itasks:
            # Log and persist
            LOG.info(
                '[%s] -submit-num=%02d, owner@host=%s',
                itask, itask.submit_num, owner_at_host)
            self.suite_db_mgr.put_insert_task_jobs(itask, {
                'is_manual_submit': itask.is_manual_submit,
                'try_num': itask.get_try_num(),
                'time_submit': now_str,
                'user_at_host': owner_at_host,
                'batch_sys_name': itask.summary['batch_sys_name'],
            })
            itask.is_manual_submit = False
        if is_init == REMOTE_INIT_FAILED:
            # Remote has failed to initialise
            # Set submit-failed for all affected tasks
            for itask in itasks:
                itask.local_job_file_path = None  # reset for retry
                log_task_job_activity(
                    SubProcContext(
                        self.JOBS_SUBMIT,
                        '(init %s)' % owner_at_host,
                        err=REMOTE_INIT_FAILED,
                        ret_code=1),
                    suite, itask.point, itask.tdef.name)
                self.task_events_mgr.process_message(
                    itask, CRITICAL,
                    self.task_events_mgr.EVENT_SUBMIT_FAILED)
            continue
        # Build the "cylc jobs-submit" command
        cmd = ['cylc', self.JOBS_SUBMIT]
        if LOG.isEnabledFor(DEBUG):
            cmd.append('--debug')
        if get_utc_mode():
            cmd.append('--utc-mode')
        remote_mode = False
        kwargs = {}
        for key, value, test_func in [
                ('host', host, is_remote_host),
                ('user', owner, is_remote_user)]:
            if test_func(value):
                cmd.append('--%s=%s' % (key, value))
                remote_mode = True
                kwargs[key] = value
        if remote_mode:
            cmd.append('--remote-mode')
        cmd.append('--')
        cmd.append(get_remote_suite_run_job_dir(host, owner, suite))
        # Chop itasks into a series of shorter lists if it's very big
        # to prevent overloading of stdout and stderr pipes.
        itasks = sorted(itasks, key=lambda itask: itask.identity)
        chunk_size = len(itasks) // ((len(itasks) // 100) + 1) + 1
        itasks_batches = [
            itasks[i:i + chunk_size]
            for i in range(0, len(itasks), chunk_size)
        ]
        LOG.debug(
            '%s ... # will invoke in batches, sizes=%s',
            cmd, [len(b) for b in itasks_batches])
        for i, itasks_batch in enumerate(itasks_batches):
            stdin_files = []
            job_log_dirs = []
            for itask in itasks_batch:
                if remote_mode:
                    stdin_files.append(
                        get_task_job_job_log(
                            suite, itask.point, itask.tdef.name,
                            itask.submit_num))
                job_log_dirs.append(
                    get_task_job_id(
                        itask.point, itask.tdef.name, itask.submit_num))
                # The job file is now (about to be) used: reset the file
                # write flag so that subsequent manual retrigger will
                # generate a new job file.
                itask.local_job_file_path = None
                itask.state.reset(TASK_STATUS_READY)
                if itask.state.outputs.has_custom_triggers():
                    self.suite_db_mgr.put_update_task_outputs(itask)
            self.proc_pool.put_command(
                SubProcContext(
                    self.JOBS_SUBMIT, cmd + job_log_dirs,
                    stdin_files=stdin_files, job_log_dirs=job_log_dirs,
                    **kwargs),
                self._submit_task_jobs_callback, [suite, itasks_batch])
    return done_tasks
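# Worked example of the batching arithmetic used in submit_task_jobs()
# above. The formula
#     chunk_size = len(itasks) // ((len(itasks) // 100) + 1) + 1
# splits the task list into roughly equal batches of at most ~100 tasks,
# so no single "cylc jobs-submit" invocation can overload the stdout and
# stderr pipes. For 250 tasks: 250 // ((250 // 100) + 1) + 1 = 84, giving
# batches of 84, 84 and 82. The batch() helper below is a standalone
# illustration, not part of the Cylc API.
def batch(items, cap=100):
    chunk_size = len(items) // ((len(items) // cap) + 1) + 1
    return [
        items[i:i + chunk_size]
        for i in range(0, len(items), chunk_size)
    ]


assert [len(b) for b in batch(list(range(250)))] == [84, 84, 82]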
def submit_task_jobs(self, suite, itasks, is_simulation=False):
    """Prepare and submit task jobs.

    Submit tasks where possible. Ignore tasks that are waiting for host
    select command to complete, or tasks that are waiting for remote
    initialisation. Bad host select command, error writing to a job file
    or bad remote initialisation will cause a bad task - leading to
    submission failure.

    This method uses prep_submit_task_job() as helper.

    Return (list): list of tasks that attempted submission.
    """
    if is_simulation:
        return self._simulation_submit_task_jobs(itasks)
    # Prepare tasks for job submission
    prepared_tasks, bad_tasks = self.prep_submit_task_jobs(suite, itasks)
    # Reset consumed host selection results
    self.task_remote_mgr.remote_host_select_reset()
    if not prepared_tasks:
        return bad_tasks
    # Group task jobs by (host, owner)
    auth_itasks = {}  # {(host, owner): [itask, ...], ...}
    for itask in prepared_tasks:
        auth_itasks.setdefault((itask.task_host, itask.task_owner), [])
        auth_itasks[(itask.task_host, itask.task_owner)].append(itask)
    # Submit task jobs for each (host, owner) group
    done_tasks = bad_tasks
    for (host, owner), itasks in sorted(auth_itasks.items()):
        is_init = self.task_remote_mgr.remote_init(host, owner)
        if is_init is None:
            # Remote is waiting to be initialised
            for itask in itasks:
                itask.set_summary_message(self.REMOTE_INIT_MSG)
            continue
        # Ensure that localhost background/at jobs are recorded as running
        # on the host name of the current suite host, rather than just
        # "localhost". On suite restart on a different suite host, this
        # allows the restart logic to correctly poll the status of the
        # background/at jobs that may still be running on the previous
        # suite host.
        if (
            self.batch_sys_mgr.is_job_local_to_host(
                itask.summary['batch_sys_name'])
            and not is_remote_host(host)
        ):
            owner_at_host = get_host()
        else:
            owner_at_host = host
        # Persist
        if owner:
            owner_at_host = owner + '@' + owner_at_host
        now_str = get_current_time_string()
        done_tasks.extend(itasks)
        for itask in itasks:
            # Log and persist
            LOG.info(
                '[%s] -submit-num=%02d, owner@host=%s',
                itask, itask.submit_num, owner_at_host)
            self.suite_db_mgr.put_insert_task_jobs(itask, {
                'is_manual_submit': itask.is_manual_submit,
                'try_num': itask.get_try_num(),
                'time_submit': now_str,
                'user_at_host': owner_at_host,
                'batch_sys_name': itask.summary['batch_sys_name'],
            })
            itask.is_manual_submit = False
        if is_init == REMOTE_INIT_FAILED:
            # Remote has failed to initialise
            # Set submit-failed for all affected tasks
            for itask in itasks:
                itask.local_job_file_path = None  # reset for retry
                log_task_job_activity(
                    SubProcContext(
                        self.JOBS_SUBMIT,
                        '(init %s)' % owner_at_host,
                        err=REMOTE_INIT_FAILED, ret_code=1),
                    suite, itask.point, itask.tdef.name)
                self.task_events_mgr.process_message(
                    itask, CRITICAL,
                    self.task_events_mgr.EVENT_SUBMIT_FAILED)
            continue
        # Build the "cylc jobs-submit" command
        cmd = ['cylc', self.JOBS_SUBMIT]
        if LOG.isEnabledFor(DEBUG):
            cmd.append('--debug')
        if get_utc_mode():
            cmd.append('--utc-mode')
        remote_mode = False
        kwargs = {}
        for key, value, test_func in [
                ('host', host, is_remote_host),
                ('user', owner, is_remote_user)]:
            if test_func(value):
                cmd.append('--%s=%s' % (key, value))
                remote_mode = True
                kwargs[key] = value
        if remote_mode:
            cmd.append('--remote-mode')
        cmd.append('--')
        cmd.append(glbl_cfg().get_derived_host_item(
            suite, 'suite job log directory', host, owner))
        # Chop itasks into a series of shorter lists if it's very big
        # to prevent overloading of stdout and stderr pipes.
        itasks = sorted(itasks, key=lambda itask: itask.identity)
        chunk_size = len(itasks) // ((len(itasks) // 100) + 1) + 1
        itasks_batches = [
            itasks[i:i + chunk_size]
            for i in range(0, len(itasks), chunk_size)]
        LOG.debug(
            '%s ... # will invoke in batches, sizes=%s',
            cmd, [len(b) for b in itasks_batches])
        for i, itasks_batch in enumerate(itasks_batches):
            stdin_files = []
            job_log_dirs = []
            for itask in itasks_batch:
                if remote_mode:
                    stdin_files.append(
                        get_task_job_job_log(
                            suite, itask.point, itask.tdef.name,
                            itask.submit_num))
                job_log_dirs.append(get_task_job_id(
                    itask.point, itask.tdef.name, itask.submit_num))
                # The job file is now (about to be) used: reset the file
                # write flag so that subsequent manual retrigger will
                # generate a new job file.
                itask.local_job_file_path = None
                itask.state.reset_state(TASK_STATUS_READY)
                if itask.state.outputs.has_custom_triggers():
                    self.suite_db_mgr.put_update_task_outputs(itask)
            self.proc_pool.put_command(
                SubProcContext(
                    self.JOBS_SUBMIT,
                    cmd + job_log_dirs,
                    stdin_files=stdin_files,
                    job_log_dirs=job_log_dirs,
                    **kwargs
                ),
                self._submit_task_jobs_callback, [suite, itasks_batch])
    return done_tasks
def generate_definition_elements(self):
    """Generate static definition data elements.

    Populates the tasks, families, and workflow elements
    with data from and/or derived from the workflow definition.
    """
    config = self.schd.config
    update_time = time()
    tasks = self.added[TASKS]
    families = self.added[FAMILIES]
    workflow = self.added[WORKFLOW]
    workflow.id = self.workflow_id
    workflow.last_updated = update_time
    workflow.stamp = f'{workflow.id}@{workflow.last_updated}'

    graph = workflow.edges
    graph.leaves[:] = config.leaves
    graph.feet[:] = config.feet
    for key, info in config.suite_polling_tasks.items():
        graph.workflow_polling_tasks.add(
            local_proxy=key,
            workflow=info[0],
            remote_proxy=info[1],
            req_state=info[2],
            graph_string=info[3],
        )

    ancestors = config.get_first_parent_ancestors()
    descendants = config.get_first_parent_descendants()
    parents = config.get_parent_lists()

    # Create definition elements for graphed tasks.
    for name, tdef in config.taskdefs.items():
        t_id = f'{self.workflow_id}{ID_DELIM}{name}'
        t_stamp = f'{t_id}@{update_time}'
        task = PbTask(
            stamp=t_stamp,
            id=t_id,
            name=name,
            depth=len(ancestors[name]) - 1,
        )
        task.namespace[:] = tdef.namespace_hierarchy
        task.first_parent = (
            f'{self.workflow_id}{ID_DELIM}{ancestors[name][1]}')
        user_defined_meta = {}
        for key, val in dict(tdef.describe()).items():
            if key in ['title', 'description', 'URL']:
                setattr(task.meta, key, val)
            else:
                user_defined_meta[key] = val
        task.meta.user_defined = json.dumps(user_defined_meta)
        elapsed_time = task_mean_elapsed_time(tdef)
        if elapsed_time:
            task.mean_elapsed_time = elapsed_time
        task.parents.extend([
            f'{self.workflow_id}{ID_DELIM}{p_name}'
            for p_name in parents[name]
        ])
        tasks[t_id] = task

    # Create family definition elements for first parent
    # ancestors of graphed tasks.
    for key, names in ancestors.items():
        for name in names:
            if (
                    key == name
                    or name in families
            ):
                continue
            f_id = f'{self.workflow_id}{ID_DELIM}{name}'
            f_stamp = f'{f_id}@{update_time}'
            family = PbFamily(
                stamp=f_stamp,
                id=f_id,
                name=name,
                depth=len(ancestors[name]) - 1,
            )
            famcfg = config.cfg['runtime'][name]
            user_defined_meta = {}
            for key, val in famcfg.get('meta', {}).items():
                if key in ['title', 'description', 'URL']:
                    setattr(family.meta, key, val)
                else:
                    user_defined_meta[key] = val
            family.meta.user_defined = json.dumps(user_defined_meta)
            family.parents.extend([
                f'{self.workflow_id}{ID_DELIM}{p_name}'
                for p_name in parents[name]
            ])
            try:
                family.first_parent = (
                    f'{self.workflow_id}{ID_DELIM}{ancestors[name][1]}')
            except IndexError:
                pass
            families[f_id] = family

    for name, parent_list in parents.items():
        if not parent_list:
            continue
        fam = parent_list[0]
        f_id = f'{self.workflow_id}{ID_DELIM}{fam}'
        if f_id in families:
            ch_id = f'{self.workflow_id}{ID_DELIM}{name}'
            if name in config.taskdefs:
                families[f_id].child_tasks.append(ch_id)
            else:
                families[f_id].child_families.append(ch_id)

    # Populate static fields of workflow
    workflow.api_version = API
    workflow.cylc_version = CYLC_VERSION
    workflow.name = self.schd.suite
    workflow.owner = self.schd.owner
    workflow.host = self.schd.host
    workflow.port = self.schd.port or -1
    workflow.pub_port = self.schd.pub_port or -1
    user_defined_meta = {}
    for key, val in config.cfg['meta'].items():
        if key in ['title', 'description', 'URL']:
            setattr(workflow.meta, key, val)
        else:
            user_defined_meta[key] = val
    workflow.meta.user_defined = json.dumps(user_defined_meta)
    workflow.tree_depth = max([
        len(val)
        for val in config.get_first_parent_ancestors(pruned=True).values()
    ]) - 1

    if get_utc_mode():
        time_zone_info = TIME_ZONE_UTC_INFO
    else:
        time_zone_info = TIME_ZONE_LOCAL_INFO
    for key, val in time_zone_info.items():
        setattr(workflow.time_zone_info, key, val)

    workflow.run_mode = config.run_mode()
    workflow.cycling_mode = config.cfg['scheduling']['cycling mode']
    workflow.workflow_log_dir = self.schd.suite_log_dir
    workflow.job_log_names.extend(list(JOB_LOG_OPTS.values()))
    workflow.ns_def_order.extend(config.ns_defn_order)
    workflow.broadcasts = json.dumps(self.schd.broadcast_mgr.broadcasts)

    workflow.tasks.extend(list(tasks))
    workflow.families.extend(list(families))

    self.ancestors = ancestors
    self.descendants = descendants
    self.parents = parents
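# Illustrative sketch of the identifier scheme used by
# generate_definition_elements() above: each task or family definition
# element gets an id of the form <workflow_id><ID_DELIM><name>, plus a
# "stamp" of <id>@<update_time> so consumers can detect stale elements by
# comparing stamps. The delimiter and names below are illustrative values,
# not taken from a live scheduler.
ID_DELIM = '|'
workflow_id = f'some_user{ID_DELIM}my_flow'
update_time = 1234567890.0
t_id = f'{workflow_id}{ID_DELIM}my_task'  # 'some_user|my_flow|my_task'
t_stamp = f'{t_id}@{update_time}'         # '...|my_task@1234567890.0'
print(t_id, t_stamp)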
def update(self, schd):
    """Update."""
    self.update_time = time()
    global_summary = {}
    family_summary = {}

    task_summary, task_states = self._get_tasks_info(schd)

    all_states = []
    ancestors_dict = schd.config.get_first_parent_ancestors()

    # Compute state_counts (total, and per cycle).
    state_count_totals = {}
    state_count_cycles = {}

    for point_string, c_task_states in task_states.items():
        # For each cycle point, construct a family state tree
        # based on the first-parent single-inheritance tree
        c_fam_task_states = {}

        count = {}

        for key in c_task_states:
            state = c_task_states[key]
            if state is None:
                continue
            try:
                count[state] += 1
            except KeyError:
                count[state] = 1

            all_states.append(state)
            for parent in ancestors_dict.get(key, []):
                if parent == key:
                    continue
                c_fam_task_states.setdefault(parent, set([]))
                c_fam_task_states[parent].add(state)

        state_count_cycles[point_string] = count

        for fam, child_states in c_fam_task_states.items():
            f_id = TaskID.get(fam, point_string)
            state = extract_group_state(child_states)
            if state is None:
                continue
            try:
                famcfg = schd.config.cfg['runtime'][fam]['meta']
            except KeyError:
                famcfg = {}
            description = famcfg.get('description')
            title = famcfg.get('title')
            family_summary[f_id] = {'name': fam,
                                    'description': description,
                                    'title': title,
                                    'label': point_string,
                                    'state': state}

    state_count_totals = {}
    for point_string, count in list(state_count_cycles.items()):
        for state, state_count in count.items():
            state_count_totals.setdefault(state, 0)
            state_count_totals[state] += state_count

    all_states.sort()

    for key, value in (
            ('oldest cycle point string', schd.pool.get_min_point()),
            ('newest cycle point string', schd.pool.get_max_point()),
            ('newest runahead cycle point string',
             schd.pool.get_max_point_runahead())):
        if value:
            global_summary[key] = str(value)
        else:
            global_summary[key] = None
    if get_utc_mode():
        global_summary['time zone info'] = TIME_ZONE_UTC_INFO
    else:
        global_summary['time zone info'] = TIME_ZONE_LOCAL_INFO
    global_summary['last_updated'] = self.update_time
    global_summary['run_mode'] = schd.config.run_mode()
    global_summary['states'] = all_states
    global_summary['namespace definition order'] = (
        schd.config.ns_defn_order)
    global_summary['reloading'] = schd.pool.do_reload
    global_summary['state totals'] = state_count_totals
    # Extract suite and task URLs from config.
    global_summary['suite_urls'] = dict(
        (i, j['meta']['URL'])
        for (i, j) in schd.config.cfg['runtime'].items())
    global_summary['suite_urls']['suite'] = schd.config.cfg['meta']['URL']

    # Construct a suite status string for use by monitoring clients.
    status, status_msg = get_suite_status(schd)
    global_summary['status'] = str(status)
    global_summary['status_string'] = status_msg

    # Replace the originals (atomic update, for access from other threads).
    self.task_summary = task_summary
    self.global_summary = global_summary
    self.family_summary = family_summary
    self.state_count_totals = state_count_totals
    self.state_count_cycles = state_count_cycles
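# A minimal sketch of the family-state roll-up performed in update() above:
# each family's displayed state is derived from the set of its children's
# states by taking the highest-priority state present. The precedence list
# below is illustrative only, not Cylc's actual extract_group_state()
# ordering.
def extract_group_state_sketch(child_states):
    for state in ('failed', 'submit-failed', 'running', 'submitted',
                  'queued', 'waiting', 'succeeded'):
        if state in child_states:
            return state
    return None


assert extract_group_state_sketch({'succeeded', 'running'}) == 'running'
assert extract_group_state_sketch({'succeeded'}) == 'succeeded'
assert extract_group_state_sketch(set()) is None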
def generate_definition_elements(self):
    """Generate static definition data elements"""
    config = self.schd.config
    update_time = time()
    tasks = {}
    families = {}
    workflow = PbWorkflow(
        checksum=f"{self.workflow_id}@{update_time}",
        id=self.workflow_id,
    )

    ancestors = config.get_first_parent_ancestors()
    descendants = config.get_first_parent_descendants()
    parents = config.get_parent_lists()

    # Create task definition data objects
    for name, tdef in config.taskdefs.items():
        t_id = f"{self.workflow_id}/{name}"
        t_check = f"{name}@{update_time}"
        task = PbTask(
            checksum=t_check,
            id=t_id,
            name=name,
            depth=len(ancestors[name]) - 1,
        )
        task.namespace[:] = tdef.namespace_hierarchy
        for key, val in dict(tdef.describe()).items():
            if key in ['title', 'description', 'url']:
                setattr(task.meta, key, val)
            else:
                task.meta.user_defined.append(f"{key}={val}")
        ntimes = len(tdef.elapsed_times)
        if ntimes:
            task.mean_elapsed_time = sum(tdef.elapsed_times) / ntimes
        elif tdef.rtconfig['job']['execution time limit']:
            task.mean_elapsed_time = \
                tdef.rtconfig['job']['execution time limit']
        tasks[name] = task

    # Create family definition data objects
    for name in ancestors.keys():
        if name in config.taskdefs.keys():
            continue
        f_id = f"{self.workflow_id}/{name}"
        f_check = f"{name}@{update_time}"
        family = PbFamily(
            checksum=f_check,
            id=f_id,
            name=name,
            depth=len(ancestors[name]) - 1,
        )
        famcfg = config.cfg['runtime'][name]
        for key, val in famcfg.get('meta', {}).items():
            if key in ['title', 'description', 'url']:
                setattr(family.meta, key, val)
            else:
                family.meta.user_defined.append(f"{key}={val}")
        family.parents.extend(
            [f"{self.workflow_id}/{p_name}"
             for p_name in parents[name]])
        families[name] = family

    for name, parent_list in parents.items():
        if parent_list and parent_list[0] in families:
            ch_id = f"{self.workflow_id}/{name}"
            if name in config.taskdefs:
                families[parent_list[0]].child_tasks.append(ch_id)
            else:
                families[parent_list[0]].child_families.append(ch_id)

    workflow.api_version = self.schd.server.API
    workflow.cylc_version = CYLC_VERSION
    workflow.name = self.schd.suite
    workflow.owner = self.schd.owner
    workflow.host = self.schd.host
    workflow.port = self.schd.port
    for key, val in config.cfg['meta'].items():
        if key in ['title', 'description', 'URL']:
            setattr(workflow.meta, key, val)
        else:
            workflow.meta.user_defined.append(f"{key}={val}")
    workflow.tree_depth = max(
        [len(val) for key, val in ancestors.items()]) - 1

    if get_utc_mode():
        time_zone_info = TIME_ZONE_UTC_INFO
    else:
        time_zone_info = TIME_ZONE_LOCAL_INFO
    for key, val in time_zone_info.items():
        setattr(workflow.time_zone_info, key, val)

    workflow.last_updated = update_time
    workflow.run_mode = config.run_mode()
    workflow.cycling_mode = config.cfg['scheduling']['cycling mode']
    workflow.workflow_log_dir = self.schd.suite_log_dir
    workflow.job_log_names.extend(list(JOB_LOG_OPTS.values()))
    workflow.ns_defn_order.extend(config.ns_defn_order)

    workflow.tasks.extend([t.id for t in tasks.values()])
    workflow.families.extend([f.id for f in families.values()])

    # Replace the originals (atomic update, for access from other threads).
    self.ancestors = ancestors
    self.descendants = descendants
    self.parents = parents
    self.tasks = tasks
    self.families = families
    self.workflow = workflow
def update(self, schd):
    """Update."""
    self.update_time = time()
    global_summary = {}
    family_summary = {}

    task_summary, task_states = self._get_tasks_info(schd)

    all_states = []
    ancestors_dict = schd.config.get_first_parent_ancestors()

    # Compute state_counts (total, and per cycle).
    state_count_totals = {}
    state_count_cycles = {}

    for point_string, c_task_states in task_states.items():
        # For each cycle point, construct a family state tree
        # based on the first-parent single-inheritance tree
        c_fam_task_states = {}

        count = {}

        for key in c_task_states:
            state = c_task_states[key]
            if state is None:
                continue
            try:
                count[state] += 1
            except KeyError:
                count[state] = 1

            all_states.append(state)
            for parent in ancestors_dict.get(key, []):
                if parent == key:
                    continue
                c_fam_task_states.setdefault(parent, set([]))
                c_fam_task_states[parent].add(state)

        state_count_cycles[point_string] = count

        for fam, child_states in c_fam_task_states.items():
            f_id = TaskID.get(fam, point_string)
            state = extract_group_state(child_states)
            if state is None:
                continue
            try:
                famcfg = schd.config.cfg['runtime'][fam]['meta']
            except KeyError:
                famcfg = {}
            description = famcfg.get('description')
            title = famcfg.get('title')
            family_summary[f_id] = {'name': fam,
                                    'description': description,
                                    'title': title,
                                    'label': point_string,
                                    'state': state}

    state_count_totals = {}
    for point_string, count in list(state_count_cycles.items()):
        for state, state_count in count.items():
            state_count_totals.setdefault(state, 0)
            state_count_totals[state] += state_count

    all_states.sort()

    for key, value in (
            ('oldest cycle point string', schd.pool.get_min_point()),
            ('newest cycle point string', schd.pool.get_max_point()),
            ('newest runahead cycle point string',
             schd.pool.get_max_point_runahead())):
        if value:
            global_summary[key] = str(value)
        else:
            global_summary[key] = None
    if get_utc_mode():
        global_summary['time zone info'] = TIME_ZONE_UTC_INFO
    else:
        global_summary['time zone info'] = TIME_ZONE_LOCAL_INFO
    global_summary['last_updated'] = self.update_time
    global_summary['run_mode'] = schd.run_mode
    global_summary['states'] = all_states
    global_summary['namespace definition order'] = (
        schd.config.ns_defn_order)
    global_summary['reloading'] = schd.pool.do_reload
    global_summary['state totals'] = state_count_totals
    # Extract suite and task URLs from config.
    global_summary['suite_urls'] = dict(
        (i, j['meta']['URL'])
        for (i, j) in schd.config.cfg['runtime'].items())
    global_summary['suite_urls']['suite'] = schd.config.cfg['meta']['URL']

    # Construct a suite status string for use by monitoring clients.
    if schd.pool.is_held:
        global_summary['status_string'] = SUITE_STATUS_HELD
    elif schd.stop_mode is not None:
        global_summary['status_string'] = SUITE_STATUS_STOPPING
    elif schd.pool.hold_point:
        global_summary['status_string'] = (
            SUITE_STATUS_RUNNING_TO_HOLD % schd.pool.hold_point)
    elif schd.stop_point:
        global_summary['status_string'] = (
            SUITE_STATUS_RUNNING_TO_STOP % schd.stop_point)
    elif schd.stop_clock_time is not None:
        global_summary['status_string'] = (
            SUITE_STATUS_RUNNING_TO_STOP % schd.stop_clock_time_string)
    elif schd.stop_task:
        global_summary['status_string'] = (
            SUITE_STATUS_RUNNING_TO_STOP % schd.stop_task)
    elif schd.final_point:
        global_summary['status_string'] = (
            SUITE_STATUS_RUNNING_TO_STOP % schd.final_point)
    else:
        global_summary['status_string'] = SUITE_STATUS_RUNNING

    # Replace the originals (atomic update, for access from other threads).
    self.task_summary = task_summary
    self.global_summary = global_summary
    self.family_summary = family_summary
    self.state_count_totals = state_count_totals
    self.state_count_cycles = state_count_cycles
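# Condensed sketch of the status-string precedence in the update() variant
# above: "held" beats "stopping", which beats the various "running to X"
# forms, falling back to plain "running". The function and strings below
# are illustrative stand-ins for the SUITE_STATUS_* constants.
def status_string_sketch(is_held, stop_mode, hold_point, stop_point):
    if is_held:
        return 'held'
    if stop_mode is not None:
        return 'stopping'
    if hold_point:
        return f'running to hold at {hold_point}'
    if stop_point:
        return f'running to stop at {stop_point}'
    return 'running'


assert status_string_sketch(False, None, None, '2030') == (
    'running to stop at 2030')
assert status_string_sketch(True, 'clean', None, '2030') == 'held'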
def generate_definition_elements(self):
    """Generate static definition data elements.

    Populates the tasks, families, and workflow elements
    with data from and/or derived from the workflow definition.
    """
    config = self.schd.config
    update_time = time()
    tasks = {}
    families = {}
    workflow = PbWorkflow(
        stamp=f'{self.workflow_id}@{update_time}',
        id=self.workflow_id,
    )

    graph = self.data[self.workflow_id][GRAPH]
    graph.leaves[:] = config.leaves
    graph.feet[:] = config.feet
    for key, info in config.suite_polling_tasks.items():
        graph.workflow_polling_tasks.add(
            local_proxy=key,
            workflow=info[0],
            remote_proxy=info[1],
            req_state=info[2],
            graph_string=info[3],
        )

    ancestors = config.get_first_parent_ancestors()
    descendants = config.get_first_parent_descendants()
    parents = config.get_parent_lists()

    # Create task definition elements.
    for name, tdef in config.taskdefs.items():
        t_id = f'{self.workflow_id}{ID_DELIM}{name}'
        t_check = f'{t_id}@{update_time}'
        task = PbTask(
            stamp=t_check,
            id=t_id,
            name=name,
            depth=len(ancestors[name]) - 1,
        )
        task.namespace[:] = tdef.namespace_hierarchy
        for key, val in dict(tdef.describe()).items():
            if key in ['title', 'description', 'url']:
                setattr(task.meta, key, val)
            else:
                task.meta.user_defined.append(f'{key}={val}')
        elapsed_time = task_mean_elapsed_time(tdef)
        if elapsed_time:
            task.mean_elapsed_time = elapsed_time
        tasks[t_id] = task

    # Create family definition elements.
    for name in ancestors.keys():
        if name in config.taskdefs.keys():
            continue
        f_id = f'{self.workflow_id}{ID_DELIM}{name}'
        f_check = f'{f_id}@{update_time}'
        family = PbFamily(
            stamp=f_check,
            id=f_id,
            name=name,
            depth=len(ancestors[name]) - 1,
        )
        famcfg = config.cfg['runtime'][name]
        for key, val in famcfg.get('meta', {}).items():
            if key in ['title', 'description', 'url']:
                setattr(family.meta, key, val)
            else:
                family.meta.user_defined.append(f'{key}={val}')
        family.parents.extend([
            f'{self.workflow_id}{ID_DELIM}{p_name}'
            for p_name in parents[name]
        ])
        families[f_id] = family

    for name, parent_list in parents.items():
        if not parent_list:
            continue
        fam = parent_list[0]
        f_id = f'{self.workflow_id}{ID_DELIM}{fam}'
        if f_id in families:
            ch_id = f'{self.workflow_id}{ID_DELIM}{name}'
            if name in config.taskdefs:
                families[f_id].child_tasks.append(ch_id)
            else:
                families[f_id].child_families.append(ch_id)

    # Populate static fields of workflow
    workflow.api_version = self.schd.server.API
    workflow.cylc_version = CYLC_VERSION
    workflow.name = self.schd.suite
    workflow.owner = self.schd.owner
    workflow.host = self.schd.host
    workflow.port = self.schd.port
    for key, val in config.cfg['meta'].items():
        if key in ['title', 'description', 'URL']:
            setattr(workflow.meta, key, val)
        else:
            workflow.meta.user_defined.append(f'{key}={val}')
    workflow.tree_depth = max(
        [len(val) for key, val in ancestors.items()]) - 1

    if get_utc_mode():
        time_zone_info = TIME_ZONE_UTC_INFO
    else:
        time_zone_info = TIME_ZONE_LOCAL_INFO
    for key, val in time_zone_info.items():
        setattr(workflow.time_zone_info, key, val)

    workflow.last_updated = update_time
    workflow.run_mode = config.run_mode()
    workflow.cycling_mode = config.cfg['scheduling']['cycling mode']
    workflow.workflow_log_dir = self.schd.suite_log_dir
    workflow.job_log_names.extend(list(JOB_LOG_OPTS.values()))
    workflow.ns_defn_order.extend(config.ns_defn_order)

    workflow.tasks.extend(list(tasks))
    workflow.families.extend(list(families))

    # Replace the originals (atomic update, for access from other threads).
    self.ancestors = ancestors
    self.descendants = descendants
    self.parents = parents
    self.data[self.workflow_id][TASKS] = tasks
    self.data[self.workflow_id][FAMILIES] = families
    self.data[self.workflow_id][WORKFLOW] = workflow
def submit_task_jobs(self, suite, itasks, curve_auth,
                     client_pub_key_dir, is_simulation=False):
    """Prepare for job submission and submit task jobs.

    Preparation (host selection, remote host init, and remote install)
    is done asynchronously. Newly released tasks may be sent here
    several times until these init subprocesses have returned. Failure
    during preparation is considered to be job submission failure.

    Once preparation has completed or failed, reset .waiting_on_job_prep
    in task instances so the scheduler knows to stop sending them back
    here.

    This method uses prep_submit_task_job() as helper.

    Return (list): list of tasks that attempted submission.
    """
    if is_simulation:
        return self._simulation_submit_task_jobs(itasks)
    # Prepare tasks for job submission
    prepared_tasks, bad_tasks = self.prep_submit_task_jobs(suite, itasks)
    # Reset consumed host selection results
    self.task_remote_mgr.subshell_eval_reset()

    if not prepared_tasks:
        return bad_tasks
    auth_itasks = {}  # {platform: [itask, ...], ...}
    for itask in prepared_tasks:
        platform_name = itask.platform['name']
        auth_itasks.setdefault(platform_name, [])
        auth_itasks[platform_name].append(itask)
    # Submit task jobs for each platform
    done_tasks = bad_tasks
    for platform_name, itasks in sorted(auth_itasks.items()):
        platform = itasks[0].platform
        install_target = get_install_target_from_platform(platform)
        ri_map = self.task_remote_mgr.remote_init_map

        if (ri_map.get(install_target)
                != REMOTE_FILE_INSTALL_DONE):
            if install_target == get_localhost_install_target():
                # Skip init and file install for localhost.
                LOG.debug(f"REMOTE INIT NOT REQUIRED for {install_target}")
                ri_map[install_target] = (REMOTE_FILE_INSTALL_DONE)

            elif install_target not in ri_map:
                # Remote init not in progress for target, so start it.
                self.task_remote_mgr.remote_init(
                    platform, curve_auth, client_pub_key_dir)
                for itask in itasks:
                    itask.set_summary_message(self.REMOTE_INIT_MSG)
                    self.data_store_mgr.delta_job_msg(
                        get_task_job_id(
                            itask.point, itask.tdef.name,
                            itask.submit_num),
                        self.REMOTE_INIT_MSG)
                continue

            elif (ri_map[install_target] == REMOTE_INIT_DONE):
                # Already done remote init so move on to file install
                self.task_remote_mgr.file_install(platform)
                continue

            elif (ri_map[install_target] in self.IN_PROGRESS.keys()):
                # Remote init or file install in progress.
                for itask in itasks:
                    msg = self.IN_PROGRESS[ri_map[install_target]]
                    itask.set_summary_message(msg)
                    self.data_store_mgr.delta_job_msg(
                        get_task_job_id(
                            itask.point, itask.tdef.name,
                            itask.submit_num),
                        msg)
                continue

        # Ensure that localhost background/at jobs are recorded as running
        # on the host name of the current suite host, rather than just
        # "localhost". On suite restart on a different suite host, this
        # allows the restart logic to correctly poll the status of the
        # background/at jobs that may still be running on the previous
        # suite host.
        host = get_host_from_platform(platform)
        if (self.job_runner_mgr.is_job_local_to_host(
                itask.summary['job_runner_name'])
                and not is_remote_platform(platform)):
            host = get_host()

        now_str = get_current_time_string()
        done_tasks.extend(itasks)
        for itask in itasks:
            # Log and persist
            LOG.info(
                '[%s] -submit-num=%02d, host=%s',
                itask, itask.submit_num, host)
            self.suite_db_mgr.put_insert_task_jobs(itask, {
                'is_manual_submit': itask.is_manual_submit,
                'try_num': itask.get_try_num(),
                'time_submit': now_str,
                'platform_name': itask.platform['name'],
                'job_runner_name': itask.summary['job_runner_name'],
            })
            itask.is_manual_submit = False

        if (ri_map[install_target]
                in [REMOTE_INIT_FAILED, REMOTE_FILE_INSTALL_FAILED]):
            # Remote init or install failed. Set submit-failed for all
            # affected tasks and remove target from remote init map
            # - this enables new tasks to re-initialise that target
            init_error = (ri_map[install_target])
            del ri_map[install_target]
            for itask in itasks:
                itask.waiting_on_job_prep = False
                itask.local_job_file_path = None  # reset for retry
                log_task_job_activity(
                    SubProcContext(
                        self.JOBS_SUBMIT,
                        '(init %s)' % host,
                        err=init_error,
                        ret_code=1),
                    suite, itask.point, itask.tdef.name)
                self._prep_submit_task_job_error(
                    suite, itask, '(remote init)', '')
            continue

        # Build the "cylc jobs-submit" command
        cmd = [self.JOBS_SUBMIT]
        if LOG.isEnabledFor(DEBUG):
            cmd.append('--debug')
        if get_utc_mode():
            cmd.append('--utc-mode')
        if is_remote_platform(itask.platform):
            remote_mode = True
            cmd.append('--remote-mode')
        else:
            remote_mode = False
        if itask.platform['clean job submission environment']:
            cmd.append('--clean-env')
        for var in itask.platform[
                'job submission environment pass-through']:
            cmd.append(f"--env={var}")
        for path in itask.platform[
                'job submission executable paths'] + SYSPATH:
            cmd.append(f"--path={path}")
        cmd.append('--')
        cmd.append(get_remote_suite_run_job_dir(platform, suite))
        # Chop itasks into a series of shorter lists if it's very big
        # to prevent overloading of stdout and stderr pipes.
        itasks = sorted(itasks, key=lambda itask: itask.identity)
        chunk_size = (
            len(itasks) // (
                (len(itasks) // platform['max batch submit size']) + 1
            ) + 1
        )
        itasks_batches = [
            itasks[i:i + chunk_size]
            for i in range(0, len(itasks), chunk_size)
        ]
        LOG.debug(
            '%s ... # will invoke in batches, sizes=%s',
            cmd, [len(b) for b in itasks_batches])

        if remote_mode:
            cmd = construct_ssh_cmd(cmd, platform)
        else:
            cmd = ['cylc'] + cmd

        for i, itasks_batch in enumerate(itasks_batches):
            stdin_files = []
            job_log_dirs = []
            for itask in itasks_batch:
                if remote_mode:
                    stdin_files.append(
                        os.path.expandvars(
                            get_task_job_job_log(
                                suite, itask.point, itask.tdef.name,
                                itask.submit_num)))
                job_log_dirs.append(
                    get_task_job_id(
                        itask.point, itask.tdef.name, itask.submit_num))
                # The job file is now (about to be) used: reset the file
                # write flag so that subsequent manual retrigger will
                # generate a new job file.
                itask.local_job_file_path = None
                if itask.state.outputs.has_custom_triggers():
                    self.suite_db_mgr.put_update_task_outputs(itask)
                itask.waiting_on_job_prep = False
            self.proc_pool.put_command(
                SubProcContext(
                    self.JOBS_SUBMIT,
                    cmd + job_log_dirs,
                    stdin_files=stdin_files,
                    job_log_dirs=job_log_dirs,
                ),
                self._submit_task_jobs_callback, [suite, itasks_batch])
    return done_tasks
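# Schematic sketch of the per-install-target decision logic at the top of
# the platform-aware submit_task_jobs() above. The state names mirror the
# constants used there, but next_action() and its return values are
# illustrative only, not part of the Cylc API.
REMOTE_INIT_DONE = 'remote init done'
REMOTE_FILE_INSTALL_DONE = 'file install done'
IN_PROGRESS = ('remote init in progress', 'file install in progress')


def next_action(ri_map, install_target, is_localhost):
    state = ri_map.get(install_target)
    if state == REMOTE_FILE_INSTALL_DONE:
        return 'submit'                  # ready: build the jobs-submit cmd
    if is_localhost:
        ri_map[install_target] = REMOTE_FILE_INSTALL_DONE
        return 'submit'                  # no init/install needed locally
    if state is None:
        return 'start remote init'       # kick off init, revisit later
    if state == REMOTE_INIT_DONE:
        return 'start file install'      # init done, install files next
    if state in IN_PROGRESS:
        return 'wait'                    # poll again on the next pass
    return 'fail'                        # init or file install failed


ri_map = {}
assert next_action(ri_map, 'hpc1', False) == 'start remote init'
assert next_action(ri_map, 'localhost', True) == 'submit'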