Example #1
    def put_suite_params(self, schd):
        """Put various suite parameters from schd in runtime database.

        This method queues the relevant insert statements.

        Arguments:
            schd (cylc.scheduler.Scheduler): scheduler object.
        """
        if schd.final_point is None:
            # Store None as a proper null value in the database. No need to do
            # this for the initial cycle point, which should never be None.
            final_point_str = None
        else:
            final_point_str = str(schd.final_point)
        self.db_inserts_map[self.TABLE_SUITE_PARAMS].extend([
            {"key": "uuid_str", "value": str(schd.uuid_str)},
            {"key": "run_mode", "value": schd.run_mode},
            {"key": "cylc_version", "value": CYLC_VERSION},
            {"key": "UTC_mode", "value": get_utc_mode()},
            {"key": "initial_point", "value": str(schd.initial_point)},
            {"key": "final_point", "value": final_point_str},
        ])
        if schd.config.cfg['cylc']['cycle point format']:
            self.db_inserts_map[self.TABLE_SUITE_PARAMS].append({
                "key": "cycle_point_format",
                "value": schd.config.cfg['cylc']['cycle point format']})
        if schd.pool.is_held:
            self.db_inserts_map[self.TABLE_SUITE_PARAMS].append({
                "key": "is_held", "value": 1})
        if schd.cli_start_point_string:
            self.db_inserts_map[self.TABLE_SUITE_PARAMS].append({
                "key": "start_point", "value": schd.cli_start_point_string})
Example #2
    def submit_task_jobs(self, suite, itasks, is_simulation=False):
        """Prepare and submit task jobs.

        Submit tasks where possible. Ignore tasks that are waiting for a host
        select command to complete, or that are waiting for remote
        initialisation. A bad host select command, an error writing a job
        file, or failed remote initialisation marks a task as bad, leading to
        submission failure.

        This method uses prep_submit_task_job() as a helper.

        Return (list): list of tasks that attempted submission.
        """
        if is_simulation:
            return self._simulation_submit_task_jobs(itasks)

        # Prepare tasks for job submission
        prepared_tasks, bad_tasks = self.prep_submit_task_jobs(suite, itasks)

        # Reset consumed host selection results
        self.task_remote_mgr.remote_host_select_reset()

        if not prepared_tasks:
            return bad_tasks

        # Group task jobs by (host, owner)
        auth_itasks = {}  # {(host, owner): [itask, ...], ...}
        for itask in prepared_tasks:
            auth_itasks.setdefault(
                (itask.task_host, itask.task_owner), []).append(itask)
        # Submit task jobs for each (host, owner) group
        done_tasks = bad_tasks
        for (host, owner), itasks in sorted(auth_itasks.items()):
            is_init = self.task_remote_mgr.remote_init(host, owner)
            if is_init is None:
                # Remote is waiting to be initialised
                for itask in itasks:
                    itask.set_summary_message(self.REMOTE_INIT_MSG)
                continue
            # Ensure that localhost background/at jobs are recorded as running
            # on the host name of the current suite host, rather than just
            # "localhost". On suite restart on a different suite host, this
            # allows the restart logic to correctly poll the status of the
            # background/at jobs that may still be running on the previous
            # suite host.
            # All tasks in this group share the same (host, owner), so test
            # the batch system of the first task in the group.
            if (
                self.batch_sys_mgr.is_job_local_to_host(
                    itasks[0].summary['batch_sys_name']) and
                not is_remote_host(host)
            ):
                owner_at_host = get_host()
            else:
                owner_at_host = host
            # Persist
            if owner:
                owner_at_host = owner + '@' + owner_at_host
            now_str = get_current_time_string()
            done_tasks.extend(itasks)
            for itask in itasks:
                # Log and persist
                LOG.info(
                    '[%s] -submit-num=%d, owner@host=%s',
                    itask, itask.submit_num, owner_at_host)
                self.suite_db_mgr.put_insert_task_jobs(itask, {
                    'is_manual_submit': itask.is_manual_submit,
                    'try_num': itask.get_try_num(),
                    'time_submit': now_str,
                    'user_at_host': owner_at_host,
                    'batch_sys_name': itask.summary['batch_sys_name'],
                })
                itask.is_manual_submit = False
            if is_init == REMOTE_INIT_FAILED:
                # Remote has failed to initialise
                # Set submit-failed for all affected tasks
                for itask in itasks:
                    itask.local_job_file_path = None  # reset for retry
                    log_task_job_activity(
                        SubProcContext(
                            self.JOBS_SUBMIT,
                            '(init %s)' % owner_at_host,
                            err=REMOTE_INIT_FAILED,
                            ret_code=1),
                        suite, itask.point, itask.tdef.name)
                    self.task_events_mgr.process_message(
                        itask, CRITICAL,
                        self.task_events_mgr.EVENT_SUBMIT_FAILED)
                continue
            # Build the "cylc jobs-submit" command
            cmd = ['cylc', self.JOBS_SUBMIT]
            if LOG.isEnabledFor(DEBUG):
                cmd.append('--debug')
            if get_utc_mode():
                cmd.append('--utc-mode')
            remote_mode = False
            kwargs = {}
            for key, value, test_func in [
                    ('host', host, is_remote_host),
                    ('user', owner, is_remote_user)]:
                if test_func(value):
                    cmd.append('--%s=%s' % (key, value))
                    remote_mode = True
                    kwargs[key] = value
            if remote_mode:
                cmd.append('--remote-mode')
            cmd.append('--')
            cmd.append(glbl_cfg().get_derived_host_item(
                suite, 'suite job log directory', host, owner))
            # Chop itasks into a series of shorter lists if it is very long,
            # to prevent overloading the stdout and stderr pipes.
            itasks = sorted(itasks, key=lambda itask: itask.identity)
            chunk_size = len(itasks) // ((len(itasks) // 100) + 1) + 1
            itasks_batches = [
                itasks[i:i + chunk_size] for i in range(0,
                                                        len(itasks),
                                                        chunk_size)]
            LOG.debug(
                '%s ... # will invoke in batches, sizes=%s',
                cmd, [len(b) for b in itasks_batches])
            for itasks_batch in itasks_batches:
                stdin_files = []
                job_log_dirs = []
                for itask in itasks_batch:
                    if remote_mode:
                        stdin_files.append(
                            get_task_job_job_log(
                                suite, itask.point, itask.tdef.name,
                                itask.submit_num))
                    job_log_dirs.append(get_task_job_id(
                        itask.point, itask.tdef.name, itask.submit_num))
                    # The job file is now (about to be) used: reset the file
                    # write flag so that subsequent manual retrigger will
                    # generate a new job file.
                    itask.local_job_file_path = None
                    itask.state.reset_state(TASK_STATUS_READY)
                    if itask.state.outputs.has_custom_triggers():
                        self.suite_db_mgr.put_update_task_outputs(itask)
                self.proc_pool.put_command(
                    SubProcContext(
                        self.JOBS_SUBMIT,
                        cmd + job_log_dirs,
                        stdin_files=stdin_files,
                        job_log_dirs=job_log_dirs,
                        **kwargs
                    ),
                    self._submit_task_jobs_callback, [suite, itasks_batch])
        return done_tasks
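
The chunking arithmetic near the end of the method caps each "cylc
jobs-submit" invocation at roughly 100 jobs, so very long task lists cannot
overload the stdout and stderr pipes. The sketch below replays that
arithmetic in isolation; the batch_sizes helper and its target parameter are
illustrative (the method hard-codes 100).

    def batch_sizes(n, target=100):
        """Split n items into batches of roughly `target` items each."""
        chunk_size = n // ((n // target) + 1) + 1
        return [min(chunk_size, n - i) for i in range(0, n, chunk_size)]

    # 250 tasks: chunk_size = 250 // 3 + 1 = 84, giving batches of 84, 84, 82.
    print(batch_sizes(250))  # [84, 84, 82]
    print(batch_sizes(90))   # [90] - short lists go out in a single call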
Example #3
    def update(self, schd):
        """Update."""
        self.update_time = time()
        global_summary = {}
        family_summary = {}

        task_summary, task_states = self._get_tasks_info(schd)

        all_states = []
        ancestors_dict = schd.config.get_first_parent_ancestors()

        # Compute state_counts (total, and per cycle).
        state_count_totals = {}
        state_count_cycles = {}

        for point_string, c_task_states in task_states.items():
            # For each cycle point, construct a family state tree
            # based on the first-parent single-inheritance tree

            c_fam_task_states = {}

            count = {}

            for key in c_task_states:
                state = c_task_states[key]
                if state is None:
                    continue
                try:
                    count[state] += 1
                except KeyError:
                    count[state] = 1

                all_states.append(state)
                for parent in ancestors_dict.get(key, []):
                    if parent == key:
                        continue
                    c_fam_task_states.setdefault(parent, set()).add(state)

            state_count_cycles[point_string] = count

            for fam, child_states in c_fam_task_states.items():
                f_id = TaskID.get(fam, point_string)
                state = extract_group_state(child_states)
                if state is None:
                    continue
                try:
                    famcfg = schd.config.cfg['runtime'][fam]['meta']
                except KeyError:
                    famcfg = {}
                description = famcfg.get('description')
                title = famcfg.get('title')
                family_summary[f_id] = {'name': fam,
                                        'description': description,
                                        'title': title,
                                        'label': point_string,
                                        'state': state}

        for count in state_count_cycles.values():
            for state, state_count in count.items():
                state_count_totals.setdefault(state, 0)
                state_count_totals[state] += state_count

        all_states.sort()

        for key, value in (
                ('oldest cycle point string', schd.pool.get_min_point()),
                ('newest cycle point string', schd.pool.get_max_point()),
                ('newest runahead cycle point string',
                 schd.pool.get_max_point_runahead())):
            if value:
                global_summary[key] = str(value)
            else:
                global_summary[key] = None
        if get_utc_mode():
            global_summary['time zone info'] = TIME_ZONE_UTC_INFO
        else:
            global_summary['time zone info'] = TIME_ZONE_LOCAL_INFO
        global_summary['last_updated'] = self.update_time
        global_summary['run_mode'] = schd.run_mode
        global_summary['states'] = all_states
        global_summary['namespace definition order'] = (
            schd.config.ns_defn_order)
        global_summary['reloading'] = schd.pool.do_reload
        global_summary['state totals'] = state_count_totals
        # Extract suite and task URLs from config.
        global_summary['suite_urls'] = {
            ns: ns_cfg['meta']['URL']
            for ns, ns_cfg in schd.config.cfg['runtime'].items()}
        global_summary['suite_urls']['suite'] = schd.config.cfg['meta']['URL']

        # Construct a suite status string for use by monitoring clients.
        if schd.pool.is_held:
            global_summary['status_string'] = SUITE_STATUS_HELD
        elif schd.stop_mode is not None:
            global_summary['status_string'] = SUITE_STATUS_STOPPING
        elif schd.pool.hold_point:
            global_summary['status_string'] = (
                SUITE_STATUS_RUNNING_TO_HOLD % schd.pool.hold_point)
        elif schd.stop_point:
            global_summary['status_string'] = (
                SUITE_STATUS_RUNNING_TO_STOP % schd.stop_point)
        elif schd.stop_clock_time is not None:
            global_summary['status_string'] = (
                SUITE_STATUS_RUNNING_TO_STOP % schd.stop_clock_time_string)
        elif schd.stop_task:
            global_summary['status_string'] = (
                SUITE_STATUS_RUNNING_TO_STOP % schd.stop_task)
        elif schd.final_point:
            global_summary['status_string'] = (
                SUITE_STATUS_RUNNING_TO_STOP % schd.final_point)
        else:
            global_summary['status_string'] = SUITE_STATUS_RUNNING

        # Replace the originals (atomic update, for access from other threads).
        self.task_summary = task_summary
        self.global_summary = global_summary
        self.family_summary = family_summary
        self.state_count_totals = state_count_totals
        self.state_count_cycles = state_count_cycles
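
The fold from per-cycle counts into state_count_totals uses setdefault; the
same aggregation can be written with collections.Counter, which may make the
intent clearer. The per-cycle data below is hypothetical sample input.

    from collections import Counter

    # Hypothetical per-cycle counts, shaped like state_count_cycles above.
    state_count_cycles = {
        '20200101T00': {'running': 2, 'succeeded': 5},
        '20200102T00': {'succeeded': 3, 'failed': 1},
    }
    state_count_totals = sum(
        (Counter(count) for count in state_count_cycles.values()), Counter())
    print(dict(state_count_totals))
    # {'running': 2, 'succeeded': 8, 'failed': 1}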