Example No. 1
    def get_resource_config (self, resource_key, schema=None):
        """Returns a dictionary of the requested resource config
        """

        if  resource_key in self._resource_aliases :
            logger.warning ("using alias '%s' for deprecated resource key '%s'" \
                         % (self._resource_aliases[resource_key], resource_key))
            resource_key = self._resource_aliases[resource_key]

        if  resource_key not in self._resource_configs:
            error_msg = "Resource key '%s' is not known." % resource_key
            raise PilotException(error_msg)

        resource_cfg = copy.deepcopy (self._resource_configs[resource_key])

        if  not schema :
            if 'schemas' in resource_cfg :
                schema = resource_cfg['schemas'][0]

        if  schema:
            if  schema not in resource_cfg :
                raise RuntimeError ("schema %s unknown for resource %s" \
                                  % (schema, resource_key))

            for key in resource_cfg[schema] :
                # merge schema specific resource keys into the
                # resource config
                resource_cfg[key] = resource_cfg[schema][key]


        return resource_cfg
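
A minimal usage sketch, assuming a reachable MongoDB behind the session; the resource key 'xsede.stampede', the 'ssh' schema, and the printed config key are illustrative placeholders, not guaranteed to exist in a given installation:

    import radical.pilot

    session = radical.pilot.Session()
    cfg     = session.get_resource_config('xsede.stampede', schema='ssh')

    # the loop above merged the schema-specific keys into the top level of
    # the returned dictionary, so they can be read directly (the key name
    # is an assumption about the config layout)
    print(cfg.get('job_manager_endpoint'))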
Example No. 2
    def add_pilots(self, pilots):
        """Associates one or more pilots with the unit manager.

        **Arguments:**

            * **pilots** [:class:`radical.pilot.ComputePilot` or list of
              :class:`radical.pilot.ComputePilot`]: The pilot objects that will be
              added to the unit manager.

        **Raises:**

            * :class:`radical.pilot.PilotException`
        """
        if not self._uid:
            raise IncorrectState(msg="Invalid object instance.")

        if not isinstance(pilots, list):
            pilots = [pilots]

        pilot_ids = self.list_pilots()

        for pilot in pilots :
            if  pilot.uid in pilot_ids :
                logger.warning ('adding the same pilot twice (%s)' % pilot.uid)

        self._worker.add_pilots(pilots)

        # let the scheduler know...
        for pilot in pilots :
            self._scheduler.pilot_added (pilot)

        # also keep the instances around
        for pilot in pilots :
            self._pilots.append (pilot)
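
A hedged sketch of the surrounding workflow; the scheduler constant and the resource key are illustrative, and the Session/PilotManager/UnitManager setup is assumed from the usual radical.pilot API:

    import radical.pilot as rp

    session = rp.Session()
    pmgr    = rp.PilotManager(session=session)
    umgr    = rp.UnitManager(session=session,
                             scheduler=rp.SCHED_ROUND_ROBIN)  # assumed constant

    pd          = rp.ComputePilotDescription()
    pd.resource = 'local.localhost'   # placeholder resource key
    pd.runtime  = 10                  # minutes
    pd.cores    = 1

    pilot = pmgr.submit_pilots(pd)
    umgr.add_pilots(pilot)            # a single pilot or a list both work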
Example No. 3
    def add_pilots(self, pilots):
        """Associates one or more pilots with the unit manager.

        **Arguments:**

            * **pilots** [:class:`radical.pilot.ComputePilot` or list of
              :class:`radical.pilot.ComputePilot`]: The pilot objects that will be
              added to the unit manager.

        **Raises:**

            * :class:`radical.pilot.PilotException`
        """
        if not self._uid:
            raise IncorrectState(msg="Invalid object instance.")

        if not isinstance(pilots, list):
            pilots = [pilots]

        pilot_ids = self.list_pilots()

        for pilot in pilots:
            if pilot.uid in pilot_ids:
                logger.warning('adding the same pilot twice (%s)' % pilot.uid)

        self._worker.add_pilots(pilots)

        # let the scheduler know...
        for pilot in pilots:
            self._scheduler.pilot_added(pilot)

        # also keep the instances around
        for pilot in pilots:
            self._pilots.append(pilot)
Example No. 4
    def get_resource_config(self, resource_key, schema=None):
        """Returns a dictionary of the requested resource config
        """

        if resource_key in self._resource_aliases:
            logger.warning ("using alias '%s' for deprecated resource key '%s'" \
                         % (self._resource_aliases[resource_key], resource_key))
            resource_key = self._resource_aliases[resource_key]

        if resource_key not in self._resource_configs:
            error_msg = "Resource key '%s' is not known." % resource_key
            raise PilotException(error_msg)

        resource_cfg = copy.deepcopy(self._resource_configs[resource_key])

        if not schema:
            if 'schemas' in resource_cfg:
                schema = resource_cfg['schemas'][0]

        if schema:
            if schema not in resource_cfg:
                raise RuntimeError ("schema %s unknown for resource %s" \
                                  % (schema, resource_key))

            for key in resource_cfg[schema]:
                # merge schema specific resource keys into the
                # resource config
                resource_cfg[key] = resource_cfg[schema][key]

        return resource_cfg
Example No. 5
    def close(self, cleanup=True, terminate=True, delete=None):
        """Closes the session.

        All subsequent attempts to access objects attached to the session
        will result in an error. If cleanup is set to True (default), the
        session data is removed from the database.

        **Arguments:**
            * **cleanup** (`bool`): Remove session from MongoDB (implies terminate).
            * **terminate** (`bool`): Shut down all pilots associated with the session.
            * **delete** (`bool`): Deprecated; if set, overrides both cleanup and terminate.

        **Raises:**
            * :class:`radical.pilot.IncorrectState` if the session is closed
              or doesn't exist. 
        """

        logger.debug("session %s closing" % (str(self._uid)))

        if not self._uid:
            logger.error("Session object already closed.")
            return

        # we keep 'delete' for backward compatibility.  If it was set, and the
        # other flags (cleanup, terminate) are as defaulted (True), then delete
        # will supersede them.  Delete is considered deprecated though, and
        # we'll thus issue a warning.
        if delete is not None:

            if cleanup and terminate:
                cleanup = delete
                terminate = delete
                logger.warning("'delete' flag on session is deprecated. " \
                               "Please use 'cleanup' and 'terminate' instead!")

        if cleanup:
            # cleanup implies terminate
            terminate = True

        for pmgr in self._pilot_manager_objects:
            logger.debug("session %s closes   pmgr   %s" %
                         (str(self._uid), pmgr._uid))
            pmgr.close(terminate=terminate)
            logger.debug("session %s closed   pmgr   %s" %
                         (str(self._uid), pmgr._uid))

        for umgr in self._unit_manager_objects:
            logger.debug("session %s closes   umgr   %s" %
                         (str(self._uid), umgr._uid))
            umgr.close()
            logger.debug("session %s closed   umgr   %s" %
                         (str(self._uid), umgr._uid))

        if cleanup:
            self._destroy_db_entry()

        logger.debug("session %s closed" % (str(self._uid)))
Example No. 6
    def call_unit_state_callbacks(self, unit_id, new_state):
        """Wrapper function to call all all relevant callbacks, on unit-level
        as well as manager-level.
        """

        # this is the point where, at the earliest, the application could have
        # been notified about unit state changes.  So we record that event.
        if unit_id not in self._callback_histories:
            self._callback_histories[unit_id] = list()
        self._callback_histories[unit_id].append({
            'timestamp': datetime.datetime.utcnow(),
            'state': new_state
        })

        for [cb, cb_data] in self._shared_data[unit_id]['callbacks']:
            try:

                if self._shared_data[unit_id]['facade_object']:
                    if cb_data:
                        cb(self._shared_data[unit_id]['facade_object'],
                           new_state, cb_data)
                    else:
                        cb(self._shared_data[unit_id]['facade_object'],
                           new_state)
                else:
                    logger.error("Couldn't call callback (no pilot instance)")
            except Exception as e:
                logger.exception("Couldn't call callback function %s" % e)
                raise

        # If we have any manager-level callbacks registered, we
        # call those as well!
        if UNIT_STATE not in self._manager_callbacks:
            self._manager_callbacks[UNIT_STATE] = list()

        for [cb, cb_data] in self._manager_callbacks[UNIT_STATE]:
            if not self._shared_data[unit_id]['facade_object']:
                logger.warning('skip cb for incomplete unit (%s: %s)' %
                               (unit_id, new_state))
                break

            try:
                if cb_data:
                    cb(self._shared_data[unit_id]['facade_object'], new_state,
                       cb_data)
                else:
                    cb(self._shared_data[unit_id]['facade_object'], new_state)
            except Exception as e:
                logger.exception("Couldn't call callback function %s" % e)
                raise

        # If we meet a final state, we record the object's callback history for
        # later evaluation.
        if new_state in (DONE, FAILED, CANCELED):
            self._db.publish_compute_unit_callback_history(
                unit_id, self._callback_histories[unit_id])
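
The callbacks invoked above receive the unit facade, the new state, and optionally the registered cb_data; a hedged sketch of a matching callback (the registration call is an assumption about the usual radical.pilot API and not shown in this snippet):

    def unit_state_cb(unit, state, cb_data=None):
        # 'unit' is the facade object the wrapper above passes in
        print('unit %s changed state to %s' % (unit.uid, state))

    # assumed registration point, e.g. on a unit instance
    unit.register_callback(unit_state_cb)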
Example No. 7
    def close(self, cleanup=True, terminate=True, delete=None):
        """Closes the session.

        All subsequent attempts to access objects attached to the session
        will result in an error. If cleanup is set to True (default), the
        session data is removed from the database.

        **Arguments:**
            * **cleanup** (`bool`): Remove session from MongoDB (implies terminate).
            * **terminate** (`bool`): Shut down all pilots associated with the session.
            * **delete** (`bool`): Deprecated; if set, overrides both cleanup and terminate.

        **Raises:**
            * :class:`radical.pilot.IncorrectState` if the session is closed
              or doesn't exist. 
        """

        logger.debug("session %s closing" % (str(self._uid)))

        if not self._uid:
            logger.error("Session object already closed.")
            return

        # we keep 'delete' for backward compatibility.  If it was set, and the
        # other flags (cleanup, terminate) are as defaulted (True), then delete
        # will supersede them.  Delete is considered deprecated though, and
        # we'll thus issue a warning.
        if  delete is not None:

            if  cleanup and terminate :
                cleanup   = delete
                terminate = delete
                logger.warning("'delete' flag on session is deprecated. " \
                               "Please use 'cleanup' and 'terminate' instead!")

        if  cleanup :
            # cleanup implies terminate
            terminate = True

        for pmgr in self._pilot_manager_objects:
            logger.debug("session %s closes   pmgr   %s" % (str(self._uid), pmgr._uid))
            pmgr.close (terminate=terminate)
            logger.debug("session %s closed   pmgr   %s" % (str(self._uid), pmgr._uid))

        for umgr in self._unit_manager_objects:
            logger.debug("session %s closes   umgr   %s" % (str(self._uid), umgr._uid))
            umgr.close()
            logger.debug("session %s closed   umgr   %s" % (str(self._uid), umgr._uid))

        if  cleanup :
            self._destroy_db_entry()

        logger.debug("session %s closed" % (str(self._uid)))
Example No. 8

    def call_unit_state_callbacks(self, unit_id, new_state):
        """Wrapper function to call all relevant callbacks, on unit-level
        as well as manager-level.
        """

        # this is the point where, at the earliest, the application could have
        # been notified about unit state changes.  So we record that event.
        if  unit_id not in self._callback_histories :
            self._callback_histories[unit_id] = list()
        self._callback_histories[unit_id].append (
                {'timestamp' : datetime.datetime.utcnow(), 
                 'state'     : new_state})

        for [cb, cb_data] in self._shared_data[unit_id]['callbacks']:
            try:

                if self._shared_data[unit_id]['facade_object'] :
                    if  cb_data :
                        cb(self._shared_data[unit_id]['facade_object'], new_state, cb_data)
                    else :
                        cb(self._shared_data[unit_id]['facade_object'], new_state)
                else :
                    logger.error("Couldn't call callback (no pilot instance)")
            except Exception as e:
                logger.exception(
                    "Couldn't call callback function %s" % e)
                raise

        # If we have any manager-level callbacks registered, we
        # call those as well!
        if  UNIT_STATE not in self._manager_callbacks :
            self._manager_callbacks[UNIT_STATE] = list()

        for [cb, cb_data] in self._manager_callbacks[UNIT_STATE]:
            if not self._shared_data[unit_id]['facade_object'] :
                logger.warning ('skip cb for incomplete unit (%s: %s)' % (unit_id, new_state))
                break

            try:
                if  cb_data :
                    cb(self._shared_data[unit_id]['facade_object'], new_state, cb_data)
                else :
                    cb(self._shared_data[unit_id]['facade_object'], new_state)
            except Exception as e:
                logger.exception(
                    "Couldn't call callback function %s" % e)
                raise

        # If we meet a final state, we record the object's callback history for
        # later evaluation.
        if  new_state in (DONE, FAILED, CANCELED) :
            self._db.publish_compute_unit_callback_history (unit_id, self._callback_histories[unit_id])
Example No. 9
    def close(self):
        """Shuts down the UnitManager and its background workers in a 
        coordinated fashion.
        """
        if not self._uid:
            logger.warning("UnitManager object already closed.")
            return

        if self._worker is not None:
            self._worker.stop()
            # Remove worker from registry
            self._session._process_registry.remove(self._uid)

        logger.info("Closed UnitManager %s." % str(self._uid))
        self._uid = None
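
Closing is effectively idempotent, as the guard above shows; a tiny hedged sketch:

    umgr.close()   # stops the worker, deregisters it, invalidates the uid
    umgr.close()   # only logs 'UnitManager object already closed.' and returns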
Example No. 10
    def submit_pilots(self, pilot_descriptions):
        """Submits a new :class:`radical.pilot.ComputePilot` to a resource.

        **Returns:**

            * One or more :class:`radical.pilot.ComputePilot` instances
              [`list of :class:`radical.pilot.ComputePilot`].

        **Raises:**

            * :class:`radical.pilot.PilotException`
        """
        # Check if the object instance is still valid.
        self._assert_obj_is_valid()

        # Implicit list conversion.
        return_list_type = True
        if not isinstance(pilot_descriptions, list):
            return_list_type = False
            pilot_descriptions = [pilot_descriptions]

        # Iterate over the pilot descriptions, try to create a pilot for
        # each one and append it to 'pilot_obj_list'.
        pilot_obj_list = list()

        for pilot_description in pilot_descriptions:

            if pilot_description.resource is None:
                error_msg = "ComputePilotDescription does not define mandatory attribute 'resource'."
                raise BadParameter(error_msg)

            elif pilot_description.runtime is None:
                error_msg = "ComputePilotDescription does not define mandatory attribute 'runtime'."
                raise BadParameter(error_msg)

            elif pilot_description.cores is None:
                error_msg = "ComputePilotDescription does not define mandatory attribute 'cores'."
                raise BadParameter(error_msg)

            resource_key = pilot_description.resource
            resource_cfg = self._session.get_resource_config(resource_key)

            # Check resource-specific mandatory attributes
            if "mandatory_args" in resource_cfg:
                for ma in resource_cfg["mandatory_args"]:
                    if getattr(pilot_description, ma) is None:
                        error_msg = "ComputePilotDescription does not define attribute '{0}' which is required for '{1}'.".format(
                            ma, resource_key)
                        raise BadParameter(error_msg)

            # we expand and exchange keys in the resource config, depending on
            # the selected schema so better use a deep copy...
            import copy
            resource_cfg = copy.deepcopy(resource_cfg)
            schema = pilot_description['access_schema']

            if not schema:
                if 'schemas' in resource_cfg:
                    schema = resource_cfg['schemas'][0]
            # import pprint
            # print "no schema, using %s" % schema
            # pprint.pprint (pilot_description)

            if schema not in resource_cfg:
                # import pprint
                # pprint.pprint (resource_cfg)
                logger.warning ("schema %s unknown for resource %s -- continue with defaults" \
                             % (schema, resource_key))

            else:
                for key in resource_cfg[schema]:
                    # merge schema specific resource keys into the
                    # resource config
                    resource_cfg[key] = resource_cfg[schema][key]

            # If a sandbox is specified, make sure it is rooted in one of
            # the resource's valid root directories.
            if pilot_description.sandbox is not None:
                if resource_cfg.get("valid_roots") is not None:
                    is_valid = False
                    for vr in resource_cfg["valid_roots"]:
                        if pilot_description.sandbox.startswith(vr):
                            is_valid = True
                    if not is_valid:
                        raise BadParameter(
                            "Working directory for resource '%s' defined as "
                            "'%s' but needs to be rooted in %s"
                            % (resource_key, pilot_description.sandbox,
                               resource_cfg["valid_roots"]))

            # After the sanity checks have passed, we can register a pilot
            # startup request with the worker process and create a facade
            # object.

            pilot = ComputePilot.create(pilot_description=pilot_description,
                                        pilot_manager_obj=self)

            pilot_uid = self._worker.register_start_pilot_request(
                pilot=pilot, resource_config=resource_cfg)

            pilot._uid = pilot_uid

            pilot_obj_list.append(pilot)

        # Implicit return value conversion
        if return_list_type:
            return pilot_obj_list
        else:
            return pilot_obj_list[0]
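
A hedged usage sketch; resource, runtime, and cores are the three attributes the checks above treat as mandatory (values and the resource key are placeholders):

    import radical.pilot as rp

    pd          = rp.ComputePilotDescription()
    pd.resource = 'local.localhost'     # placeholder resource key
    pd.runtime  = 15                    # minutes
    pd.cores    = 8

    pilot  = pmgr.submit_pilots(pd)     # single description in, single pilot out
    pilots = pmgr.submit_pilots([pd])   # list in, list out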
Example No. 11
    def submit_pilots(self, pilot_descriptions):
        """Submits a new :class:`radical.pilot.ComputePilot` to a resource.

        **Returns:**

            * One or more :class:`radical.pilot.ComputePilot` instances
              [`list of :class:`radical.pilot.ComputePilot`].

        **Raises:**

            * :class:`radical.pilot.PilotException`
        """
        # Check if the object instance is still valid.
        self._assert_obj_is_valid()

        # Implicit list conversion.
        return_list_type = True
        if  not isinstance(pilot_descriptions, list):
            return_list_type   = False
            pilot_descriptions = [pilot_descriptions]

        # Iterate over the pilot descriptions, try to create a pilot for
        # each one and append it to 'pilot_obj_list'.
        pilot_obj_list = list()

        for pilot_description in pilot_descriptions:

            if pilot_description.resource is None:
                error_msg = "ComputePilotDescription does not define mandatory attribute 'resource'."
                raise BadParameter(error_msg)

            elif pilot_description.runtime is None:
                error_msg = "ComputePilotDescription does not define mandatory attribute 'runtime'."
                raise BadParameter(error_msg)

            elif pilot_description.cores is None:
                error_msg = "ComputePilotDescription does not define mandatory attribute 'cores'."
                raise BadParameter(error_msg)

            resource_key = pilot_description.resource
            resource_cfg = self._session.get_resource_config(resource_key)

            # Check resource-specific mandatory attributes
            if "mandatory_args" in resource_cfg:
                for ma in resource_cfg["mandatory_args"]:
                    if getattr(pilot_description, ma) is None:
                        error_msg = "ComputePilotDescription does not define attribute '{0}' which is required for '{1}'.".format(ma, resource_key)
                        raise BadParameter(error_msg)


            # we expand and exchange keys in the resource config, depending on
            # the selected schema so better use a deep copy...
            import copy
            resource_cfg  = copy.deepcopy (resource_cfg)
            schema        = pilot_description['access_schema']

            if  not schema :
                if 'schemas' in resource_cfg :
                    schema = resource_cfg['schemas'][0]
              # import pprint
              # print "no schema, using %s" % schema
              # pprint.pprint (pilot_description)

            if  schema not in resource_cfg :
              # import pprint
              # pprint.pprint (resource_cfg)
                logger.warning ("schema %s unknown for resource %s -- continue with defaults" \
                             % (schema, resource_key))

            else :
                for key in resource_cfg[schema] :
                    # merge schema specific resource keys into the
                    # resource config
                    resource_cfg[key] = resource_cfg[schema][key]

            # If a sandbox is specified, make sure it is rooted in one of
            # the resource's valid root directories.
            if pilot_description.sandbox is not None:
                if "valid_roots" in resource_cfg and resource_cfg["valid_roots"] is not None:
                    is_valid = False
                    for vr in resource_cfg["valid_roots"]:
                        if pilot_description.sandbox.startswith(vr):
                            is_valid = True
                    if is_valid is False:
                        raise BadParameter("Working directory for resource '%s' defined as '%s' but needs to be rooted in %s " % (resource_key, pilot_description.sandbox, resource_cfg["valid_roots"]))

            # After the sanity checks have passed, we can register a pilot
            # startup request with the worker process and create a facade
            # object.

            pilot = ComputePilot.create(
                pilot_description=pilot_description,
                pilot_manager_obj=self)

            pilot_uid = self._worker.register_start_pilot_request(
                pilot=pilot,
                resource_config=resource_cfg)

            pilot._uid = pilot_uid

            pilot_obj_list.append(pilot)

        # Implicit return value conversion
        if  return_list_type :
            return pilot_obj_list
        else:
            return pilot_obj_list[0]
Example No. 12
    def _unit_state_callback (self, unit, state) :
        
        try :

            with self.lock :
            
                uid = unit.uid

                logger.info ("[SchedulerCallback]: Computeunit %s changed to %s" % (uid, state))


                found_unit = False
                if  state in [NEW, UNSCHEDULED] :

                    for pid in self.runqs :

                        if  not pid :
                            logger.warning ('cannot handle final unit %s w/o pilot information' % uid)

                        if  uid in self.runqs[pid] :

                            logger.info ('reschedule NEW unit %s from %s' % (uid, pid))

                            unit       = self.runqs[pid][uid]
                            found_unit = True

                            del self.runqs[pid][uid]
                            self.waitq[uid] = unit

                          # self._dump ('before reschedule %s' % uid)
                            self._reschedule (uid=uid)
                          # self._dump ('after  reschedule %s' % uid)

                            return

              # if  not found_unit and uid not in self.waitq :
              #     # as we cannot unregister callbacks, we simply ignore this
              #     # invocation.  It's probably from a unit we handled
              #     # previously (although this should have been final?)
              #     #
              #     # FIXME: how can I *un*register a unit callback?
              #     logger.error ("[SchedulerCallback]: cannot handle unit %s" % uid)
              #     self._dump()
              #     return

                if  state in [PENDING_OUTPUT_STAGING, STAGING_OUTPUT, DONE, FAILED, CANCELED] :
                    # the pilot which owned this CU should now have free slots available
                    # FIXME: how do I get the pilot from the CU?
                    
                    pid = unit.execution_details.get ('pilot', None)

                    if  not pid :
                        raise RuntimeError ('cannot handle final unit %s w/o pilot information' % uid)

                    if  pid not in self.pilots :
                        logger.warning ('cannot handle unit %s cb for pilot %s (pilot is gone)' % (uid, pid))

                    else :
                        if  uid in self.runqs[pid] :

                            unit = self.runqs[pid][uid]

                            del self.runqs[pid][uid]
                            self.pilots[pid]['caps'] += unit.description.cores
                            self._reschedule (target_pid=pid)
                            found_unit = True

                      #     logger.debug ('unit %s frees %s cores on %s (-> %s)' \
                      #                % (uid, unit.description.cores, pid, self.pilots[pid]['caps']))

                    if not found_unit :
                        logger.warning ('unit %s freed %s cores on %s (== %s) -- not reused'
                                     % (uid, unit.description.cores, pid, self.pilots[pid]['caps']))


        except Exception as e :
            logger.error ("error in unit callback for backfiller (%s) - ignored" % e)
Exemplo n.º 14
0
    def _reschedule (self, target_pid=None, uid=None) :

        with self.lock :

            # dig through the list of waiting CUs, and try to find a pilot for each
            # of them.  This enacts first-come-first-served, but will be unbalanced
            # if the units in the queue are of different sizes.  That problem is
            # ignored at this point.
            #
            # if any units get scheduled, we push a schedule dictionary to the UM
            # to enact it:
            #   {
            #     'units'  : { unit_1 : pilot_id_1,
            #                  unit_2 : pilot_id_2,
            #                  unit_4 : None,        # unit could not be placed
            #                  ... },
            #     'pilots' : { pilot_id : pilot_info, ... }
            #   }

            if  not self.pilots :
                # no pilots to work on, yet.
                logger.warning ("cannot schedule -- no pilots available")
                return

            if  target_pid and target_pid not in self.pilots :
                logger.warning ("cannot schedule -- invalid target pilot %s" % target_pid)
                raise RuntimeError ("Invalid pilot (%s)" % target_pid)
                

            schedule           = dict()
            schedule['units']  = dict()
            schedule['pilots'] = self.pilots

            logger.debug ("schedule (%s units waiting)" % len(self.waitq))


            units_to_schedule = list()
            if  uid :

                if  uid not in self.waitq :
                  # self._dump ()
                    logger.warning ("cannot schedule -- unknown unit %s" % uid)
                    raise RuntimeError ("Invalid unit (%s)" % uid)

                units_to_schedule.append (self.waitq[uid])

            else :
                # just copy the whole waitq
                for uid in self.waitq :
                    units_to_schedule.append (self.waitq[uid])


            for unit in units_to_schedule :

                uid = unit.uid
                ud  = unit.description

                # sanity check on unit state
                if  unit.state not in [NEW, SCHEDULING, UNSCHEDULED] :
                    raise RuntimeError ("scheduler queue should only contain NEW, "
                                        "SCHEDULING or UNSCHEDULED units (%s)" % uid)

              # logger.debug ("examine unit  %s (%s cores)" % (uid, ud.cores))

                for pid in self.pilots :

                  # logger.debug ("        pilot %s (%s caps, state %s)" \
                  #            % (pid, self.pilots[pid]['caps'], self.pilots[pid]['state']))

                    if  self.pilots[pid]['state'] in [ACTIVE] :

                        if  ud.cores <= self.pilots[pid]['caps'] :

                          # logger.debug ("        unit  %s fits on pilot %s" % (uid, pid))

                            self.pilots[pid]['caps'] -= ud.cores
                            schedule['units'][unit]   = pid

                            # scheduled units are removed from the waitq
                            del self.waitq[uid]
                            self.runqs[pid][uid] = unit
                            break

                else :
                    # the loop completed without a break, so the unit could
                    # not be placed on any pilot
                    schedule['units'][unit] = None

                # print a warning if a unit cannot possibly be scheduled, ever
                can_handle_unit = False
                for pid in self.pilots :
                    if  unit.description.cores <= self.pilots[pid]['cores'] :
                        can_handle_unit=True
                        break

                if  not can_handle_unit :
                    logger.warning ('cannot handle unit %s with current set of pilots' % uid)

          # pprint.pprint (schedule)

            # tell the UM about the schedule
            self.manager.handle_schedule (schedule)
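
Both invocation styles of _reschedule appear in the unit-state callback of the previous example; in short:

    # a unit dropped back to NEW/UNSCHEDULED: try to place that one unit again
    self._reschedule(uid=uid)

    # a unit finished and freed cores on pilot 'pid': backfill that pilot
    # from the wait queue
    self._reschedule(target_pid=pid)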
Example No. 14

    def check_pilot_states(self, pilot_col):

        pending_pilots = pilot_col.find({
            "pilotmanager": self.pilot_manager_id,
            "state": {"$in": [PENDING_ACTIVE, ACTIVE]}
        })

        for pending_pilot in pending_pilots:

            pilot_failed = False
            pilot_done = False
            reconnected = False
            pilot_id = pending_pilot["_id"]
            log_message = ""
            saga_job_id = pending_pilot["saga_job_id"]

            logger.info(
                "Performing periodic health check for %s (SAGA job id %s)" %
                (str(pilot_id), saga_job_id))

            if pilot_id not in self.missing_pilots:
                self.missing_pilots[pilot_id] = 0

            # Create a job service object:
            try:
                js_url = saga_job_id.split("]-[")[0][1:]

                if js_url in self._shared_worker_data['job_services']:
                    js = self._shared_worker_data['job_services'][js_url]
                else:
                    js = saga.job.Service(js_url, session=self._session)
                    self._shared_worker_data['job_services'][js_url] = js

                saga_job = js.get_job(saga_job_id)
                reconnected = True

                if saga_job.state in [saga.job.FAILED, saga.job.CANCELED]:
                    pilot_failed = True
                    log_message  = "SAGA job state for ComputePilot %s is %s."\
                                 % (pilot_id, saga_job.state)

                if saga_job.state in [saga.job.DONE]:
                    pilot_done = True
                    log_message  = "SAGA job state for ComputePilot %s is %s."\
                                 % (pilot_id, saga_job.state)

            except Exception as e:

                if not reconnected:
                    logger.warning(
                        'could not reconnect to pilot for state check (%s)' %
                        e)
                    self.missing_pilots[pilot_id] += 1

                    if self.missing_pilots[pilot_id] >= JOB_CHECK_MAX_MISSES:
                        logger.debug('giving up after %d attempts' %
                                     JOB_CHECK_MAX_MISSES)
                        pilot_failed = True
                        log_message  = "Could not reconnect to pilot %s "\
                                       "multiple times - giving up" % pilot_id
                else:
                    logger.warning('pilot state check failed: %s' % e)
                    pilot_failed = True
                    log_message  = "Couldn't determine job state for ComputePilot %s. " \
                                   "Assuming it has failed." % pilot_id

            if pilot_failed:
                out, err, log = self._get_pilot_logs(pilot_col, pilot_id)
                ts = datetime.datetime.utcnow()
                pilot_col.update(
                    {"_id": pilot_id, "state": {"$ne": DONE}},
                    {"$set": {"state": FAILED,
                              "stdout": out,
                              "stderr": err,
                              "logfile": log},
                     "$push": {"statehistory": {"state": FAILED,
                                                "timestamp": ts},
                               "log": {"message": log_message,
                                       "timestamp": ts}}})
                logger.debug(log_message)
                logger.warning('pilot %s declared dead' % pilot_id)

            elif pilot_done:
                # FIXME: this should only be done if the state is not yet
                # done...
                out, err, log = self._get_pilot_logs(pilot_col, pilot_id)
                ts = datetime.datetime.utcnow()
                pilot_col.update(
                    {"_id": pilot_id, "state": {"$ne": DONE}},
                    {"$set": {"state": DONE,
                              "stdout": out,
                              "stderr": err,
                              "logfile": log},
                     "$push": {"statehistory": {"state": DONE,
                                                "timestamp": ts},
                               "log": {"message": log_message,
                                       "timestamp": ts}}})
                logger.debug(log_message)
                logger.warning('pilot %s declared done' % pilot_id)

            else:
                if self.missing_pilots[pilot_id]:
                    logger.info ('pilot %s *assumed* alive and well (%s)' \
                              % (pilot_id, self.missing_pilots[pilot_id]))
                else:
                    logger.info ('pilot %s seems alive and well' \
                              % (pilot_id))
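
A hedged sketch of the loop that would drive this check periodically; the interval, the stop event, and the collection accessor are illustrative assumptions, not part of the snippet:

    import time

    JOB_CHECK_INTERVAL = 60  # seconds -- illustrative value

    while not self._stop_event.is_set():          # assumed threading.Event
        pilot_col = self._get_pilot_collection()  # hypothetical accessor
        self.check_pilot_states(pilot_col)
        time.sleep(JOB_CHECK_INTERVAL)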
Example No. 15

    def check_pilot_states(self, pilot_col):

        pending_pilots = pilot_col.find(
            {"pilotmanager": self.pilot_manager_id, "state": {"$in": [PENDING_ACTIVE, ACTIVE]}}
        )

        for pending_pilot in pending_pilots:

            pilot_failed = False
            pilot_done = False
            reconnected = False
            pilot_id = pending_pilot["_id"]
            log_message = ""
            saga_job_id = pending_pilot["saga_job_id"]

            logger.info("Performing periodical health check for %s (SAGA job id %s)" % (str(pilot_id), saga_job_id))

            if pilot_id not in self.missing_pilots:
                self.missing_pilots[pilot_id] = 0

            # Create a job service object:
            try:
                js_url = saga_job_id.split("]-[")[0][1:]

                if js_url in self._shared_worker_data["job_services"]:
                    js = self._shared_worker_data["job_services"][js_url]
                else:
                    js = saga.job.Service(js_url, session=self._session)
                    self._shared_worker_data["job_services"][js_url] = js

                saga_job = js.get_job(saga_job_id)
                reconnected = True

                if saga_job.state in [saga.job.FAILED, saga.job.CANCELED]:
                    pilot_failed = True
                    log_message = "SAGA job state for ComputePilot %s is %s." % (pilot_id, saga_job.state)

                if saga_job.state in [saga.job.DONE]:
                    pilot_done = True
                    log_message = "SAGA job state for ComputePilot %s is %s." % (pilot_id, saga_job.state)

            except Exception as e:

                if not reconnected:
                    logger.warning("could not reconnect to pilot for state check (%s)" % e)
                    self.missing_pilots[pilot_id] += 1

                    if self.missing_pilots[pilot_id] >= JOB_CHECK_MAX_MISSES:
                        logger.debug("giving up after 10 attempts")
                        pilot_failed = True
                        log_message = "Could not reconnect to pilot %s " "multiple times - giving up" % pilot_id
                else:
                    logger.warning("pilot state check failed: %s" % e)
                    pilot_failed = True
                    log_message = (
                        "Couldn't determine job state for ComputePilot %s. " "Assuming it has failed." % pilot_id
                    )

            if pilot_failed:
                out, err, log = self._get_pilot_logs(pilot_col, pilot_id)
                ts = datetime.datetime.utcnow()
                pilot_col.update(
                    {"_id": pilot_id, "state": {"$ne": DONE}},
                    {
                        "$set": {"state": FAILED, "stdout": out, "stderr": err, "logfile": log},
                        "$push": {
                            "statehistory": {"state": FAILED, "timestamp": ts},
                            "log": {"message": log_message, "timestamp": ts},
                        },
                    },
                )
                logger.debug(log_message)
                logger.warn("pilot %s declared dead" % pilot_id)

            elif pilot_done:
                # FIXME: this should only be done if the state is not yet
                # done...
                out, err, log = self._get_pilot_logs(pilot_col, pilot_id)
                ts = datetime.datetime.utcnow()
                pilot_col.update(
                    {"_id": pilot_id, "state": {"$ne": DONE}},
                    {
                        "$set": {"state": DONE, "stdout": out, "stderr": err, "logfile": log},
                        "$push": {
                            "statehistory": {"state": DONE, "timestamp": ts},
                            "log": {"message": log_message, "timestamp": ts},
                        },
                    },
                )
                logger.debug(log_message)
                logger.warn("pilot %s declared dead" % pilot_id)

            else:
                if self.missing_pilots[pilot_id]:
                    logger.info("pilot %s *assumed* alive and well (%s)" % (pilot_id, self.missing_pilots[pilot_id]))
                else:
                    logger.info("pilot %s seems alive and well" % (pilot_id))