    def add_resource_config(self, resource_config):
        """Adds a new :class:`radical.pilot.ResourceConfig` to the
        PilotManager's dictionary of known resources, or accepts a string
        which points to a configuration file.

        When a string is given, it is passed to
        ``ResourceConfig.from_file()``; every configuration found in the file
        is stored under its own key.  Otherwise the object is stored under its
        ``label``.  In both cases the stored value is the configuration's
        ``as_dict()`` result, and an existing entry with the same key is
        overwritten.

        For example::

            rc = radical.pilot.ResourceConfig(label="mycluster")
            rc.job_manager_endpoint = "ssh+pbs://mycluster"
            rc.filesystem_endpoint  = "sftp://mycluster"
            rc.default_queue        = "private"
            rc.bootstrapper         = "default_bootstrapper.sh"

            pm = radical.pilot.PilotManager(session=s)
            pm.add_resource_config(rc)

            pd = radical.pilot.ComputePilotDescription()
            pd.resource = "mycluster"
            pd.cores    = 16
            pd.runtime  = 5 # minutes

            pilot = pm.submit_pilots(pd)

        :param resource_config: a ``ResourceConfig`` instance, or the name of
            a configuration file (string).
        :raises: whatever ``ResourceConfig.from_file()`` raises; errors are
            deliberately not caught here.
        """
        if isinstance(resource_config, basestring):

            # let exceptions fall through
            rcs = ResourceConfig.from_file(resource_config)

            for rc in rcs:
                logger.info("Loaded resource configurations for %s" % rc)
                self._resource_configs[rc] = rcs[rc].as_dict()

        else:
            # Log through the module logger rather than printing to stdout:
            # library code should not write to the caller's console.
            logger.debug('add rcfg as %s' % resource_config.label)
            self._resource_configs[resource_config.label] = resource_config.as_dict()
    def __init__(self, manager, session):
        """Constructor.

        Stores the given manager and session on the instance, creates an
        empty ``pilots`` dictionary, and logs the scheduler's ``name``
        attribute.  ``name`` is not assigned here, so it must already be
        available on the instance or its class.

        :param manager: stored as ``self.manager``.
        :param session: stored as ``self.session``.
        """

        self.manager = manager
        self.session = session
        self.pilots  = dict()

        logger.info("Loaded scheduler: %s." % self.name)
    def close(self):
        """Shuts down the UnitManager and its background workers in a 
        coordinated fashion.
        if not self._uid:
            logger.warning("UnitManager object already closed.")

        if self._worker is not None:
            # Remove worker from registry

        logger.info("Closed UnitManager %s." % str(self._uid))
        self._uid = None
    def _destroy_db_entry(self):
        """Terminates the session and removes it from the database.

        All subsequent attempts to access objects attached to the session, and
        attempts to re-connect to the session via its uid, will result in
        an error.

            * :class:`radical.pilot.IncorrectState` if the session is closed
              or doesn't exist. 

        logger.info("Deleted session %s from database." % self._uid)
        self._uid = None
    def __init__(self, manager, session):
        """Initialize the scheduler state and subscribe to unit state changes.

        Logs the scheduler's ``name`` attribute, stores the manager and
        session, creates empty bookkeeping containers (``waitq``, ``runqs``,
        ``pmgrs``, ``pilots``), a re-entrant lock, and takes the database
        handle from the manager's worker.  Finally registers
        ``self._unit_state_callback`` with the manager.

        :param manager: stored as ``self.manager``; must expose
            ``_worker._db`` and ``register_callback()``.
        :param session: stored as ``self.session``.
        """
        logger.info("Loaded scheduler: %s." % self.name)

        # who we belong to
        self.manager = manager
        self.session = session

        # bookkeeping containers, all empty at start
        self.waitq   = {}
        self.runqs   = {}
        self.pmgrs   = []
        self.pilots  = {}

        # re-entrant lock and the database handle owned by the manager's worker
        self.lock    = threading.RLock()
        self._db     = self.manager._worker._db

        # make sure the UM notifies us on all unit state changes
        manager.register_callback(self._unit_state_callback)
    def _reconnect(cls, session, unit_manager_id):
        """PRIVATE: Reconnect to an existing UnitManager.
        uid_exists = UnitManagerController.uid_exists(

        if not uid_exists:
            raise BadParameter(
                "UnitManager with id '%s' not in database." % unit_manager_id)

        # The UnitManager object
        obj = cls(session=session, scheduler=None, _reconnect=True)

        # Retrieve or start a worker process for this UnitManager instance.
        worker = session._process_registry.retrieve(unit_manager_id)
        if worker is not None:
            obj._worker = worker
            obj._worker = UnitManagerController(
            session._process_registry.register(unit_manager_id, obj._worker)

        # start the worker if it's not already running
        if obj._worker.is_alive() is False:

        # Now that the worker is running (again), we can get more information
        # about the UnitManager
        um_data = obj._worker.get_unit_manager_data()

        obj._scheduler = get_scheduler(name=um_data['scheduler'], 
        # FIXME: we need to tell the scheduler about all the pilots...

        obj._uid = unit_manager_id

        logger.info("Reconnected to existing UnitManager %s." % str(obj))
        return obj
    def stage_in(self, directives):
        """Stages the content of the staging directive into the pilot's
        staging area.

        Each entry produced by ``expand_staging_directive(directives, logger)``
        is read for its ``'source'``, ``'target'`` and ``'action'`` keys.  For
        targets using the ``staging`` URL scheme, the path is re-rooted under
        ``self.sandbox`` / ``STAGING_AREA``.  The target directory is opened
        with ``CREATE_PARENTS`` and, for the ``TRANSFER`` action, the source is
        copied into it.

        :param directives: staging directive(s), handed unchanged to
            ``expand_staging_directive``.
        :raises Exception: if the pilot state is DONE, FAILED or CANCELED, or
            if a ``staging://`` target carries a hostname.
        :raises ValueError: for the LINK, COPY and MOVE actions, which are
            rejected at this level.
        """

        # Wait until we can assume the pilot directory to be created
        if self.state == NEW:
            # NOTE(review): this branch has no body in the visible source; the
            # comment above suggests a wait on the pilot state belongs here --
            # confirm against the full file.
        elif self.state in [DONE, FAILED, CANCELED]:
            raise Exception("Pilot already finished, no need to stage anymore!")

        # Iterate over all directives
        for directive in expand_staging_directive(directives, logger):

            # TODO: respect flags in directive

            src_url = saga.Url(directive['source'])
            action = directive['action']

            # Convert the target url into a SAGA Url object
            tgt_url = saga.Url(directive['target'])
            # Create a pointer to the directory object that we will use
            # (same object as tgt_url, not a copy: assigning tgt_dir_url.path
            # below changes tgt_url as well)
            tgt_dir_url = tgt_url

            if tgt_url.path.endswith('/'):
                # If the original target was a directory (ends with /),
                # we assume that the user wants the same filename as the source.
                tgt_filename = os.path.basename(src_url.path)
                # Otherwise, extract the filename and update the directory
                # NOTE(review): no `else:` is visible before the next two
                # statements, so as written they also run in the
                # trailing-slash case -- confirm against the full file.
                tgt_filename = os.path.basename(tgt_dir_url.path)
                tgt_dir_url.path = os.path.dirname(tgt_dir_url.path)

            # Handle special 'staging' scheme
            if tgt_dir_url.scheme == 'staging':

                # We expect a staging:///relative/path/file.txt URI,
                # as hostname would have unclear semantics currently.
                if tgt_dir_url.host:
                    raise Exception("hostname not supported with staging:// scheme")

                # Remove the leading slash to get a relative path from the staging area
                target = os.path.relpath(tgt_dir_url.path, '/')

                # Now base the target directory relative of the sandbox and staging prefix
                tgt_dir_url = saga.Url(os.path.join(self.sandbox, STAGING_AREA, target))

            # Define and open the staging directory for the pilot
            # We use the target dir construct here, so that we can create
            # the directory if it does not yet exist.
            # (The directory is opened before the action is checked, i.e. also
            # for actions that are rejected below.)
            target_dir = saga.filesystem.Directory(tgt_dir_url, flags=saga.filesystem.CREATE_PARENTS)

            if action == LINK:
                # TODO: Does this make sense?
                #log_message = 'Linking %s to %s' % (source, abs_target)
                #os.symlink(source, abs_target)
                logger.error("action 'LINK' not supported on pilot level staging")
                raise ValueError("action 'LINK' not supported on pilot level staging")
            elif action == COPY:
                # TODO: Does this make sense?
                #log_message = 'Copying %s to %s' % (source, abs_target)
                #shutil.copyfile(source, abs_target)
                logger.error("action 'COPY' not supported on pilot level staging")
                raise ValueError("action 'COPY' not supported on pilot level staging")
            elif action == MOVE:
                # TODO: Does this make sense?
                #log_message = 'Moving %s to %s' % (source, abs_target)
                #shutil.move(source, abs_target)
                logger.error("action 'MOVE' not supported on pilot level staging")
                raise ValueError("action 'MOVE' not supported on pilot level staging")
            elif action == TRANSFER:
                # log_message is built but not used anywhere in the visible code
                log_message = 'Transferring %s to %s' % (src_url, os.path.join(str(tgt_dir_url), tgt_filename))
                # Transfer the source file to the target staging area
                target_dir.copy(src_url, tgt_filename)
                # NOTE(review): the raise below sits inside the TRANSFER
                # branch, so as written every successful copy is followed by
                # an exception; an `else:` for unknown actions appears to be
                # missing -- confirm against the full file.
                raise Exception('Action %s not supported' % action)
    def handle_schedule(self, schedule):

        # we want to use bulk submission to the pilots, so we collect all units
        # assigned to the same set of pilots.  At the same time, we select
        # unscheduled units for later insertion into the wait queue.

        if not schedule:
            logger.debug('skipping empty unit schedule')

    # print 'handle schedule:'
    # import pprint
    # pprint.pprint (schedule)
        pilot_cu_map = dict()
        unscheduled = list()

        pilot_ids = self.list_pilots()

        for unit in schedule['units'].keys():

            pid = schedule['units'][unit]

            if None == pid:


                if pid not in pilot_ids:
                    raise RuntimeError("schedule points to unknown pilot %s" %

                if pid not in pilot_cu_map:
                    pilot_cu_map[pid] = list()


        # submit to all pilots which got something submitted to
        for pid in pilot_cu_map.keys():

            units_to_schedule = list()

            # if a kernel name is in the cu descriptions set, do kernel expansion
            for unit in pilot_cu_map[pid]:

                if not pid in schedule['pilots']:
                    # lost pilot, do not schedule unit
                    logger.warn("unschedule unit %s, lost pilot %s" %
                                (unit.uid, pid))

                unit.sandbox = schedule['pilots'][pid]['sandbox'] + "/" + str(

                ud = unit.description

                if 'kernel' in ud and ud['kernel']:

                        from radical.ensemblemd.mdkernels import MDTaskDescription
                    except Exception as ex:
                        logger.error ("Kernels are not supported in" \
                              "compute unit descriptions -- install " \
                        # FIXME: unit needs a '_set_state() method or something!
                            unit._uid, FAILED, ["kernel expansion failed"])

                    pilot_resource = schedule['pilots'][pid]['resource']

                    mdtd = MDTaskDescription()
                    mdtd.kernel = ud.kernel
                    mdtd_bound = mdtd.bind(resource=pilot_resource)
                    ud.environment = mdtd_bound.environment
                    ud.pre_exec = mdtd_bound.pre_exec
                    ud.executable = mdtd_bound.executable
                    ud.mpi = mdtd_bound.mpi


            if len(units_to_schedule):

        # report any change in wait_queue_size
        old_wait_queue_size = self.wait_queue_size

        self.wait_queue_size = len(unscheduled)
        if old_wait_queue_size != self.wait_queue_size:
            self._worker.fire_manager_callback(WAIT_QUEUE_SIZE, self,

        if len(unscheduled):

        logger.info('%s units remain unscheduled' % len(unscheduled))
                self._uid = uid

                # otherwise, we reconnect to an existing session
                self._dbs, session_info, self._connection_info = \

                self._created   = session_info["created"]
                self._connected = session_info["connected"]

                logger.info("Reconnected to existing Session %s." % str(self))

            except Exception, ex:
                raise PilotException("Couldn't re-connect to session: %s" % ex)  

    def __del__ (self) :
        """Finalizer: calls ``self.close()`` with no arguments."""
        self.close ()

    def close(self, cleanup=True, terminate=True, delete=None):
        """Closes the session.
    def __init__(self,
        """Creates a new or reconnects to an existing session.

        If called without a uid, a new Session instance is created and 
        stored in the database. If uid is set, an existing session is 
        retrieved from the database. 

            * **database_url** (`string`): The MongoDB URL.  If none is given,
              RP uses the environment variable RADICAL_PILOT_DBURL.  If that is
              not set, an error will be raised.

            * **database_name** (`string`): An alternative database name 
              (default: 'radicalpilot').

            * **uid** (`string`): If uid is set, we try 
              re-connect to an existing session instead of creating a new one.

            * **name** (`string`): An optional human readable name.

            * A new Session instance.

            * :class:`radical.pilot.DatabaseError`


        # init the base class inits

        # before doing anything else, set up the debug helper for the lifetime
        # of the session.
        self._debug_helper = ru.DebugHelper()

        # Lists holding all manager objects created during the session.
        self._pilot_manager_objects = list()
        self._unit_manager_objects = list()

        # Create a new process registry. All objects belonging to this
        # session will register their worker processes (if they have any)
        # in this registry. This makes it easier to shut down things in
        # a more coordinated fashion.
        self._process_registry = _ProcessRegistry()

        # The resource configuration dictionary associated with the session.
        self._resource_configs = {}

        self._database_url = database_url
        self._database_name = database_name

        if not self._database_url:
            self._database_url = os.getenv("RADICAL_PILOT_DBURL", None)

        if not self._database_url:
            raise PilotException("no database URL (set RADICAL_PILOT_DBURL)")

        logger.info("using database url  %s" % self._database_url)

        # if the database url contains a path element, we interpret that as
        # database name (without the leading slash)
        tmp_url = ru.Url(self._database_url)
        if  tmp_url.path            and \
            tmp_url.path[0]  == '/' and \
            len(tmp_url.path) >  1  :
            self._database_name = tmp_url.path[1:]
            logger.info("using database path %s" % self._database_name)
            logger.info("using database name %s" % self._database_name)

        # Loading all "default" resource configurations
        module_path = os.path.dirname(os.path.abspath(__file__))
        default_cfgs = "%s/configs/*.json" % module_path
        config_files = glob.glob(default_cfgs)

        for config_file in config_files:

                rcs = ResourceConfig.from_file(config_file)
            except Exception as e:
                logger.error("skip config file %s: %s" % (config_file, e))

            for rc in rcs:
                logger.info("Loaded resource configurations for %s" % rc)
                self._resource_configs[rc] = rcs[rc].as_dict()

        user_cfgs = "%s/.radical/pilot/configs/*.json" % os.environ.get('HOME')
        config_files = glob.glob(user_cfgs)

        for config_file in config_files:

                rcs = ResourceConfig.from_file(config_file)
            except Exception as e:
                logger.error("skip config file %s: %s" % (config_file, e))

            for rc in rcs:
                logger.info("Loaded resource configurations for %s" % rc)

                if rc in self._resource_configs:
                    # config exists -- merge user config into it
                    # new config -- add as is
                    self._resource_configs[rc] = rcs[rc].as_dict()

        default_aliases = "%s/configs/aliases.json" % module_path
        self._resource_aliases = ru.read_json_str(default_aliases)['aliases']

        ## CREATE A NEW SESSION ##
        if uid is None:
                self._connected = None

                if name:
                    self._name = name
                    self._uid = name
                # self._uid  = ru.generate_id ('rp.session.'+name+'.%(item_counter)06d', mode=ru.ID_CUSTOM)
                    self._uid = ru.generate_id('rp.session',
                    self._name = self._uid

                self._dbs, self._created, self._connection_info = \
                        dbSession.new(sid     = self._uid,
                                      name    = self._name,
                                      db_url  = self._database_url,
                                      db_name = database_name)

                logger.info("New Session created%s." % str(self))

            except Exception, ex:
                logger.exception('session create failed')
                raise PilotException("Couldn't create new session (database URL '%s' incorrect?): %s" \
                                % (self._database_url, ex))
    def check_pilot_states(self, pilot_col):

        pending_pilots = pilot_col.find({
            "pilotmanager": self.pilot_manager_id,
            "state": {
                "$in": [PENDING_ACTIVE, ACTIVE]

        for pending_pilot in pending_pilots:

            pilot_failed = False
            pilot_done = False
            reconnected = False
            pilot_id = pending_pilot["_id"]
            log_message = ""
            saga_job_id = pending_pilot["saga_job_id"]

                "Performing periodical health check for %s (SAGA job id %s)" %
                (str(pilot_id), saga_job_id))

            if not pilot_id in self.missing_pilots:
                self.missing_pilots[pilot_id] = 0

            # Create a job service object:
                js_url = saga_job_id.split("]-[")[0][1:]

                if js_url in self._shared_worker_data['job_services']:
                    js = self._shared_worker_data['job_services'][js_url]
                    js = saga.job.Service(js_url, session=self._session)
                    self._shared_worker_data['job_services'][js_url] = js

                saga_job = js.get_job(saga_job_id)
                reconnected = True

                if saga_job.state in [saga.job.FAILED, saga.job.CANCELED]:
                    pilot_failed = True
                    log_message  = "SAGA job state for ComputePilot %s is %s."\
                                 % (pilot_id, saga_job.state)

                if saga_job.state in [saga.job.DONE]:
                    pilot_done = True
                    log_message  = "SAGA job state for ComputePilot %s is %s."\
                                 % (pilot_id, saga_job.state)

            except Exception as e:

                if not reconnected:
                        'could not reconnect to pilot for state check (%s)' %
                    self.missing_pilots[pilot_id] += 1

                    if self.missing_pilots[pilot_id] >= JOB_CHECK_MAX_MISSES:
                        logger.debug('giving up after 10 attempts')
                        pilot_failed = True
                        log_message  = "Could not reconnect to pilot %s "\
                                       "multiple times - giving up" % pilot_id
                    logger.warning('pilot state check failed: %s' % e)
                    pilot_failed = True
                    log_message  = "Couldn't determine job state for ComputePilot %s. " \
                                   "Assuming it has failed." % pilot_id

            if pilot_failed:
                out, err, log = self._get_pilot_logs(pilot_col, pilot_id)
                ts = datetime.datetime.utcnow()
                    "_id": pilot_id,
                    "state": {
                        "$ne": DONE
                }, {
                    "$set": {
                        "state": FAILED,
                        "stdout": out,
                        "stderr": err,
                        "logfile": log
                    "$push": {
                        "statehistory": {
                            "state": FAILED,
                            "timestamp": ts
                        "log": {
                            "message": log_message,
                            "timestamp": ts
                logger.warn('pilot %s declared dead' % pilot_id)

            elif pilot_done:
                # FIXME: this should only be done if the state is not yet
                # done...
                out, err, log = self._get_pilot_logs(pilot_col, pilot_id)
                ts = datetime.datetime.utcnow()
                    "_id": pilot_id,
                    "state": {
                        "$ne": DONE
                }, {
                    "$set": {
                        "state": DONE,
                        "stdout": out,
                        "stderr": err,
                        "logfile": log
                    "$push": {
                        "statehistory": {
                            "state": DONE,
                            "timestamp": ts
                        "log": {
                            "message": log_message,
                            "timestamp": ts
                logger.warn('pilot %s declared dead' % pilot_id)

                if self.missing_pilots[pilot_id]:
                    logger.info ('pilot %s *assumed* alive and well (%s)' \
                              % (pilot_id, self.missing_pilots[pilot_id]))
                    logger.info ('pilot %s seems alive and well' \
                              % (pilot_id))
    def run(self):
        """Starts the process when Process.start() is called.

        # make sure to catch sys.exit (which raises SystemExit)
        try :

            logger.info("Starting InputFileTransferWorker")

            # Try to connect to the database and create a tailable cursor.
                connection = self.db_connection_info.get_db_handle()
                db = connection[self.db_connection_info.dbname]
                um_col = db["%s.cu" % self.db_connection_info.session_id]
                logger.debug("Connected to MongoDB. Serving requests for UnitManager %s." % self.unit_manager_id)

            except Exception as e :
                logger.exception("Connection error: %s" % e)

            try :
                while not self._stop.is_set():
                    # See if we can find a ComputeUnit that is waiting for
                    # input file transfer.
                    compute_unit = None

                    ts = datetime.datetime.utcnow()
                    compute_unit = um_col.find_and_modify(
                        query={"unitmanager": self.unit_manager_id,
                               "FTW_Input_Status": PENDING},
                        update={"$set" : {"FTW_Input_Status": EXECUTING,
                                          "state": STAGING_INPUT},
                                "$push": {"statehistory": {"state": STAGING_INPUT, "timestamp": ts}}},
                        limit=BULK_LIMIT # TODO: bulklimit is probably not the best way to ensure there is just one
                    # FIXME: AM: find_and_modify is not bulkable!
                    state = STAGING_INPUT

                    if compute_unit is None:
                        # Sleep a bit if no new units are available.

                        compute_unit_id = None
                            log_messages = []

                            # We have found a new CU. Now we can process the transfer
                            # directive(s) with SAGA.
                            compute_unit_id = str(compute_unit["_id"])
                            remote_sandbox = compute_unit["sandbox"]
                            input_staging = compute_unit["FTW_Input_Directives"]

                            # We need to create the CU's directory in case it doesn't exist yet.
                            log_msg = "Creating ComputeUnit sandbox directory %s." % remote_sandbox

                            # Creating the sandbox directory.
                                logger.debug ("saga.fs.Directory ('%s')" % remote_sandbox)

                                remote_sandbox_keyurl = saga.Url (remote_sandbox)
                                remote_sandbox_keyurl.path = '/'
                                remote_sandbox_key = str(remote_sandbox_keyurl)

                                if  remote_sandbox_key not in self._saga_dirs :
                                    self._saga_dirs[remote_sandbox_key] = \
                                            saga.filesystem.Directory (remote_sandbox_key,

                                saga_dir = self._saga_dirs[remote_sandbox_key]
                                saga_dir.make_dir (remote_sandbox, 
                            except Exception as e :
                                logger.exception('Error: %s' % e)
                                # FIXME: why is this exception ignored?  AM

                            logger.info("Processing input file transfers for ComputeUnit %s" % compute_unit_id)
                            # Loop over all transfer directives and execute them.
                            for sd in input_staging:

                                state_doc = um_col.find_one(
                                    {"_id": compute_unit_id},
                                if state_doc['state'] == CANCELED:
                                    logger.info("Compute Unit Canceled, interrupting input file transfers.")
                                    state = CANCELED

                                abs_src = os.path.abspath(sd['source'])
                                input_file_url = saga.Url("file://localhost/%s" % abs_src)
                                if not sd['target']:
                                    target = remote_sandbox
                                    target = "%s/%s" % (remote_sandbox, sd['target'])

                                log_msg = "Transferring input file %s -> %s" % (input_file_url, target)

                                # Execute the transfer.
                                logger.debug ("saga.fs.File ('%s')" % input_file_url)
                                input_file = saga.filesystem.File(

                                if CREATE_PARENTS in sd['flags']:
                                    copy_flags = saga.filesystem.CREATE_PARENTS
                                    copy_flags = 0

                                try :
                                    input_file.copy(target, flags=copy_flags)
                                except Exception as e :
                                    logger.exception (e)

                                # If all went fine, update the state of this StagingDirective to Done
                                    query={"_id" : compute_unit_id,
                                           'FTW_Input_Status': EXECUTING,
                                           'FTW_Input_Directives.state': PENDING,
                                           'FTW_Input_Directives.source': sd['source'],
                                           'FTW_Input_Directives.target': sd['target'],
                                    update={'$set': {'FTW_Input_Directives.$.state': 'Done'},
                                            '$push': {'log': {
                                                'timestamp': datetime.datetime.utcnow(), 
                                                'message'  : log_msg}}

                        except Exception as e :
                            # Update the CU's state 'FAILED'.
                            ts = datetime.datetime.utcnow()
                            logentry = {'message'  : "Input transfer failed: %s" % e,
                                        'timestamp': ts}

                            um_col.update({'_id': compute_unit_id}, {
                                '$set': {'state': FAILED},
                                '$push': {
                                    'statehistory': {'state': FAILED, 'timestamp': ts},
                                    'log': logentry


                    # Code below is only to be run by the "first" or only worker
                    if self._worker_number > 1:

                    # If the CU was canceled we can skip the remainder of this loop.
                    if state == CANCELED:

                    # Check to see if there are more pending Directives, if not, we are Done
                    cursor_w = um_col.find({"unitmanager": self.unit_manager_id,
                                            "$or": [ {"Agent_Input_Status": EXECUTING},
                                                     {"FTW_Input_Status": EXECUTING}
                    # Iterate over all the returned CUs (if any)
                    for cu in cursor_w:
                        # See if there are any FTW Input Directives still pending
                        if cu['FTW_Input_Status'] == EXECUTING and \
                                not any(d['state'] == EXECUTING or d['state'] == PENDING for d in cu['FTW_Input_Directives']):
                            # All Input Directives for this FTW are done, mark the CU accordingly
                            um_col.update({"_id": cu["_id"]},
                                          {'$set': {'FTW_Input_Status': DONE},
                                           '$push': {'log': {
                                                'timestamp': datetime.datetime.utcnow(),
                                                'message'  : 'All FTW Input Staging Directives done - %d.' % self._worker_number}}

                        # See if there are any Agent Input Directives still pending or executing,
                        # if not, mark it DONE.
                        if cu['Agent_Input_Status'] == EXECUTING and \
                                not any(d['state'] == EXECUTING or d['state'] == PENDING for d in cu['Agent_Input_Directives']):
                            # All Input Directives for this Agent are done, mark the CU accordingly
                            um_col.update({"_id": cu["_id"]},
                                           {'$set': {'Agent_Input_Status': DONE},
                                            '$push': {'log': {
                                                'timestamp': datetime.datetime.utcnow(), 
                                                'message'  : 'All Agent Input Staging Directives done - %d.' % self._worker_number}}

                    # Check for all CUs if both Agent and FTW staging is done, we can then mark the CU PendingExecution
                    ts = datetime.datetime.utcnow()
                        query={"unitmanager": self.unit_manager_id,
                               "Agent_Input_Status": { "$in": [ None, DONE ] },
                               "FTW_Input_Status": { "$in": [ None, DONE ] },
                               "state": STAGING_INPUT
                        update={"$set": {
                                    "state": PENDING_EXECUTION
                                "$push": {
                                    "statehistory": {"state": PENDING_EXECUTION, "timestamp": ts}

            except Exception as e :

                logger.exception("transfer worker error: %s" % e)
                self._session.close (cleanup=False)

        except SystemExit as e :
            logger.debug("input file transfer thread caught system exit -- forcing application shutdown")
            import thread
            thread.interrupt_main ()
        All subsequent attempts access objects attached to the session will 
    def run(self):
        """Starts the process when Process.start() is called.

        # make sure to catch sys.exit (which raises SystemExit)
            # Get directory where this module lives
            mod_dir = os.path.dirname(os.path.realpath(__file__))

            # Try to connect to the database
                connection = self.db_connection_info.get_db_handle()
                db = connection[self.db_connection_info.dbname]
                pilot_col = db["%s.p" % self.db_connection_info.session_id]
                logger.debug("Connected to MongoDB. Serving requests for PilotManager %s." % self.pilot_manager_id)

            except Exception as e:
                logger.exception("Connection error: %s" % e)

            last_job_check = time.time()

            while not self._stop.is_set():

                # Periodically, we pull up all ComputePilots that are pending
                # execution or were last seen executing and check if the corresponding
                # SAGA job is still pending in the queue. If that is not the case,
                # we assume that the job has failed for some reasons and update
                # the state of the ComputePilot accordingly.
                if last_job_check + JOB_CHECK_INTERVAL < time.time():
                    last_job_check = time.time()

                # See if we can find a ComputePilot that is waiting to be launched.
                # If we find one, we use SAGA to create a job service, a job
                # description and a job that is then send to the local or remote
                # queueing system. If this succedes, we set the ComputePilot's
                # state to pending, otherwise to failed.
                compute_pilot = None

                ts = datetime.datetime.utcnow()
                compute_pilot = pilot_col.find_and_modify(
                    query={"pilotmanager": self.pilot_manager_id, "state": PENDING_LAUNCH},
                        "$set": {"state": LAUNCHING},
                        "$push": {"statehistory": {"state": LAUNCHING, "timestamp": ts}},

                if not compute_pilot:

                        # ------------------------------------------------------
                        # LAUNCH THE PILOT AGENT VIA SAGA
                        logentries = []
                        pilot_id = str(compute_pilot["_id"])

                        logger.info("Launching ComputePilot %s" % pilot_id)

                        # ------------------------------------------------------
                        # Database connection parameters
                        session_uid = self.db_connection_info.session_id
                        database_url = self.db_connection_info.dburl
                        database_name = self.db_connection_info.dbname
                        database_auth = self.db_connection_info.dbauth

                        # ------------------------------------------------------
                        # pilot description and resource configuration
                        number_cores = compute_pilot["description"]["cores"]
                        runtime = compute_pilot["description"]["runtime"]
                        queue = compute_pilot["description"]["queue"]
                        project = compute_pilot["description"]["project"]
                        cleanup = compute_pilot["description"]["cleanup"]
                        resource_key = compute_pilot["description"]["resource"]
                        schema = compute_pilot["description"]["access_schema"]
                        memory = compute_pilot["description"]["memory"]
                        pilot_sandbox = compute_pilot["sandbox"]
                        global_sandbox = compute_pilot["global_sandbox"]

                        # we expand and exchange keys in the resource config,
                        # depending on the selected schema so better use a deep
                        # copy..
                        resource_cfg = self._session.get_resource_config(resource_key, schema)

                        # import pprint
                        # pprint.pprint (resource_cfg)

                        # ------------------------------------------------------
                        # get parameters from cfg, set defaults where needed
                        agent_mongodb_endpoint = resource_cfg.get("agent_mongodb_endpoint", database_url)
                        agent_spawner = resource_cfg.get("agent_spawner", DEFAULT_AGENT_SPAWNER)
                        agent_type = resource_cfg.get("agent_type", DEFAULT_AGENT_TYPE)
                        agent_scheduler = resource_cfg.get("agent_scheduler")
                        tunnel_bind_device = resource_cfg.get("tunnel_bind_device")
                        default_queue = resource_cfg.get("default_queue")
                        forward_tunnel_endpoint = resource_cfg.get("forward_tunnel_endpoint")
                        js_endpoint = resource_cfg.get("job_manager_endpoint")
                        lrms = resource_cfg.get("lrms")
                        mpi_launch_method = resource_cfg.get("mpi_launch_method")
                        pre_bootstrap = resource_cfg.get("pre_bootstrap")
                        python_interpreter = resource_cfg.get("python_interpreter")
                        spmd_variation = resource_cfg.get("spmd_variation")
                        task_launch_method = resource_cfg.get("task_launch_method")
                        rp_version = resource_cfg.get("rp_version", DEFAULT_RP_VERSION)
                        virtenv_mode = resource_cfg.get("virtenv_mode", DEFAULT_VIRTENV_MODE)
                        virtenv = resource_cfg.get("virtenv", DEFAULT_VIRTENV)
                        stage_cacerts = resource_cfg.get("stage_cacerts", "False")

                        if stage_cacerts.lower() == "true":
                            stage_cacerts = True
                            stage_cacerts = False

                        # expand variables in virtenv string
                        virtenv = virtenv % {
                            "pilot_sandbox": saga.Url(pilot_sandbox).path,
                            "global_sandbox": saga.Url(global_sandbox).path,

                        # Check for deprecated global_virtenv
                        global_virtenv = resource_cfg.get("global_virtenv")
                        if global_virtenv:
                            logger.warn("'global_virtenv' keyword is deprecated -- use 'virtenv' and 'virtenv_mode'")
                            virtenv = global_virtenv
                            virtenv_mode = "use"

                        # set default scheme, host, port and dbname if not set
                        db_url = saga.Url(agent_mongodb_endpoint)
                        if not db_url.scheme:
                            db_url.scheme = "mongodb"
                        if not db_url.host:
                            db_url.host = "localhost"
                        if not db_url.port:
                            db_url.port = 27017
                        if not database_name:
                            database_name = "radicalpilot"

                        # Create a host:port string for use by the bootstrapper.
                        database_hostport = "%s:%d" % (db_url.host, db_url.port)

                        # ------------------------------------------------------
                        # Copy the bootstrap shell script.  This also creates
                        # the sandbox. We use always "default_bootstrapper.sh"
                        bootstrapper = "default_bootstrapper.sh"
                        bootstrapper_path = os.path.abspath("%s/../bootstrapper/%s" % (mod_dir, bootstrapper))

                        msg = "Using bootstrapper %s" % bootstrapper_path
                        logentries.append(Logentry(msg, logger=logger.info))

                        bs_script_url = saga.Url("file://localhost/%s" % bootstrapper_path)
                        bs_script_tgt = saga.Url("%s/pilot_bootstrapper.sh" % pilot_sandbox)

                        msg = "Copying bootstrapper '%s' to agent sandbox (%s)." % (bs_script_url, bs_script_tgt)
                        logentries.append(Logentry(msg, logger=logger.debug))

                        bs_script = saga.filesystem.File(bs_script_url, session=self._session)
                        bs_script.copy(bs_script_tgt, flags=saga.filesystem.CREATE_PARENTS)

                        # ------------------------------------------------------
                        # the version of the agent is derived from
                        # rp_version, which has the following format
                        # and interpretation:
                        # case rp_version:
                        #   @<token>:
                        #   @tag/@branch/@commit: # no sdist staging
                        #       git clone $github_base radical.pilot.src
                        #       (cd radical.pilot.src && git checkout token)
                        #       pip install -t $VIRTENV/rp_install/ radical.pilot.src
                        #       rm -rf radical.pilot.src
                        #       export PYTHONPATH=$VIRTENV/rp_install:$PYTHONPATH
                        #   release: # no sdist staging
                        #       pip install -t $VIRTENV/rp_install radical.pilot
                        #       export PYTHONPATH=$VIRTENV/rp_install:$PYTHONPATH
                        #   local: # needs sdist staging
                        #       tar zxf $sdist.tgz
                        #       pip install -t $VIRTENV/rp_install $sdist/
                        #       export PYTHONPATH=$VIRTENV/rp_install:$PYTHONPATH
                        #   debug: # needs sdist staging
                        #       tar zxf $sdist.tgz
                        #       pip install -t $SANDBOX/rp_install $sdist/
                        #       export PYTHONPATH=$SANDBOX/rp_install:$PYTHONPATH
                        #   installed: # no sdist staging
                        #       true
                        # esac
                        # virtenv_mode
                        #   private : error  if ve exists, otherwise create, then use
                        #   update  : update if ve exists, otherwise create, then use
                        #   create  : use    if ve exists, otherwise create, then use
                        #   use     : use    if ve exists, otherwise error,  then exit
                        #   recreate: delete if ve exists, otherwise create, then use
                        # examples   :
                        #   [email protected]
                        #   virtenv@devel
                        #   virtenv@release
                        #   virtenv@installed
                        #   stage@local
                        #   stage@/tmp/my_agent.py
                        # Note that some combinations may be invalid,
                        # specifically in the context of virtenv_mode.  If, for
                        # example, virtenv_mode is 'use', then the 'virtenv:tag'
                        # will not make sense, as the virtenv is not updated.
                        # In those cases, the virtenv_mode is honored, and
                        # a warning is printed.
                        # Also, the 'stage' mode can only be combined with the
                        # 'local' source, or with a path to the agent (relative
                        # to mod_dir, or absolute).
                        # A rp_version which does not adhere to the
                        # above syntax is ignored, and the fallback stage@local
                        # is used.

                        if not rp_version.startswith("@") and not rp_version in ["installed", "local", "debug"]:
                            raise ValueError("invalid rp_version '%s'" % rp_version)

                        stage_sdist = True
                        if rp_version in ["installed", "release"]:
                            stage_sdist = False

                        if rp_version.startswith("@"):
                            stage_sdist = False
                            rp_version = rp_version[1:]  # strip '@'

                        # ------------------------------------------------------
                        # Copy the rp sdist if needed.  We actually also stage
                        # the sdists for radical.utils and radical.saga, so that
                        # we have the complete stack to install...
                        if stage_sdist:

                            for path in [ru.sdist_path, saga.sdist_path, sdist_path]:

                                sdist_url = saga.Url("file://localhost/%s" % path)
                                msg = "Copying sdist '%s' to sdist sandbox (%s)." % (sdist_url, pilot_sandbox)
                                logentries.append(Logentry(msg, logger=logger.debug))

                                sdist_file = saga.filesystem.File(sdist_url)
                                sdist_file.copy("%s/" % (str(pilot_sandbox)))

                        # ------------------------------------------------------
                        # some machines cannot run pip due to outdated ca certs.
                        # For those, we also stage an updated cert bundle
                        if stage_cacerts:
                            cc_path = os.path.abspath("%s/../bootstrapper/%s" % (mod_dir, "cacert.pem.gz"))

                            cc_script_url = saga.Url("file://localhost/%s" % cc_path)
                            cc_script_tgt = saga.Url("%s/cacert.pem.gz" % pilot_sandbox)

                            cc_script = saga.filesystem.File(cc_script_url, session=self._session)
                            cc_script.copy(cc_script_tgt, flags=saga.filesystem.CREATE_PARENTS)

                        # ------------------------------------------------------
                        # sanity checks
                        if not agent_spawner:
                            raise RuntimeError("missing agent spawner")
                        if not agent_scheduler:
                            raise RuntimeError("missing agent scheduler")
                        if not lrms:
                            raise RuntimeError("missing LRMS")
                        if not mpi_launch_method:
                            raise RuntimeError("missing mpi launch method")
                        if not task_launch_method:
                            raise RuntimeError("missing task launch method")

                        # massage some values
                        debug_level = os.environ.get("RADICAL_PILOT_AGENT_VERBOSE", logger.level)
                            debug_level = int(debug_level)
                        except ValueError:
                            debug_level = {
                                "CRITICAL": 1,
                                "ERROR": 2,
                                "WARNING": 3,
                                "WARN": 3,
                                "INFO": 4,
                                "DEBUG": 5,
                            }.get(debug_level, 0)

                        if not queue:
                            queue = default_queue

                        if cleanup and isinstance(cleanup, bool):
                            cleanup = "luve"  #  l : log files
                            #  u : unit work dirs
                            #  v : virtualenv
                            #  e : everything (== pilot sandbox)
                            # we never cleanup virtenvs which are not private
                            if virtenv_mode is not "private":
                                cleanup = cleanup.replace("v", "")

                        sdists = ":".join([ru.sdist_name, saga.sdist_name, sdist_name])

                        # set mandatory args
                        bootstrap_args = ""
                        bootstrap_args += " -b '%s'" % sdists
                        bootstrap_args += " -c '%s'" % number_cores
                        bootstrap_args += " -d '%s'" % debug_level
                        bootstrap_args += " -g '%s'" % virtenv
                        bootstrap_args += " -j '%s'" % task_launch_method
                        bootstrap_args += " -k '%s'" % mpi_launch_method
                        bootstrap_args += " -l '%s'" % lrms
                        bootstrap_args += " -m '%s'" % database_hostport
                        bootstrap_args += " -n '%s'" % database_name
                        bootstrap_args += " -o '%s'" % agent_spawner
                        bootstrap_args += " -p '%s'" % pilot_id
                        bootstrap_args += " -q '%s'" % agent_scheduler
                        bootstrap_args += " -r '%s'" % runtime
                        bootstrap_args += " -s '%s'" % session_uid
                        bootstrap_args += " -t '%s'" % agent_type
                        bootstrap_args += " -u '%s'" % virtenv_mode
                        bootstrap_args += " -v '%s'" % rp_version

                        # set optional args
                        if database_auth:
                            bootstrap_args += " -a '%s'" % database_auth
                        if tunnel_bind_device:
                            bootstrap_args += " -D '%s'" % tunnel_bind_device
                        if pre_bootstrap:
                            bootstrap_args += " -e '%s'" % "' -e '".join(pre_bootstrap)
                        if forward_tunnel_endpoint:
                            bootstrap_args += " -f '%s'" % forward_tunnel_endpoint
                        if python_interpreter:
                            bootstrap_args += " -i '%s'" % python_interpreter
                        if cleanup:
                            bootstrap_args += " -x '%s'" % cleanup

                        # ------------------------------------------------------
                        # now that the script is in place and we know where it is,
                        # we can launch the agent
                        js_url = saga.Url(js_endpoint)
                        logger.debug("saga.job.Service ('%s')" % js_url)
                        if js_url in self._shared_worker_data["job_services"]:
                            js = self._shared_worker_data["job_services"][js_url]
                            js = saga.job.Service(js_url, session=self._session)
                            self._shared_worker_data["job_services"][js_url] = js

                        # ------------------------------------------------------
                        # Create SAGA Job description and submit the pilot job

                        jd = saga.job.Description()

                        jd.executable = "/bin/bash"
                        jd.arguments = ["-l pilot_bootstrapper.sh", bootstrap_args]
                        jd.working_directory = saga.Url(pilot_sandbox).path
                        jd.project = project
                        jd.output = "agent.out"
                        jd.error = "agent.err"
                        jd.total_cpu_count = number_cores
                        jd.wall_time_limit = runtime
                        jd.total_physical_memory = memory
                        jd.queue = queue

                        # Set the SPMD variation only if required
                        if spmd_variation:
                            jd.spmd_variation = spmd_variation

                        if "RADICAL_PILOT_PROFILE" in os.environ:
                            jd.environment = {"RADICAL_PILOT_PROFILE": "TRUE"}

                        logger.debug("Bootstrap command line: %s %s" % (jd.executable, jd.arguments))

                        msg = "Submitting SAGA job with description: %s" % str(jd.as_dict())
                        logentries.append(Logentry(msg, logger=logger.debug))

                        pilotjob = js.create_job(jd)

                        # do a quick error check
                        if pilotjob.state == saga.FAILED:
                            raise RuntimeError("SAGA Job state is FAILED.")

                        saga_job_id = pilotjob.id
                        self._shared_worker_data["job_ids"][pilot_id] = [saga_job_id, js_url]

                        msg = "SAGA job submitted with job id %s" % str(saga_job_id)
                        logentries.append(Logentry(msg, logger=logger.debug))

                        # ------------------------------------------------------

                        log_dicts = list()
                        for le in logentries:

                        # Update the Pilot's state to 'PENDING_ACTIVE' if SAGA job submission was successful.
                        ts = datetime.datetime.utcnow()
                        ret = pilot_col.update(
                            {"_id": pilot_id, "state": "Launching"},
                                "$set": {"state": PENDING_ACTIVE, "saga_job_id": saga_job_id},
                                "$push": {"statehistory": {"state": PENDING_ACTIVE, "timestamp": ts}},
                                "$pushAll": {"log": log_dicts},

                        if ret["n"] == 0:
                            # could not update, probably because the agent is
                            # running already.  Just update state history and
                            # jobid then
                            # FIXME: make sure of the agent state!
                            ret = pilot_col.update(
                                {"_id": pilot_id},
                                    "$set": {"saga_job_id": saga_job_id},
                                    "$push": {"statehistory": {"state": PENDING_ACTIVE, "timestamp": ts}},
                                    "$pushAll": {"log": log_dicts},

                    except Exception as e:
                        # Update the Pilot's state 'FAILED'.
                        out, err, log = self._get_pilot_logs(pilot_col, pilot_id)
                        ts = datetime.datetime.utcnow()

                        # FIXME: we seem to be unable to bson/json handle saga
                        # log messages containing an '#'.  This shows up here.
                        # Until we find a clean workaround, make log shorter and
                        # rely on saga logging to reveal the problem.
                        msg = "Pilot launching failed! (%s)" % e

                        log_dicts = list()
                        log_messages = list()
                        for le in logentries:

                            {"_id": pilot_id, "state": {"$ne": FAILED}},
                                "$set": {"state": FAILED, "stdout": out, "stderr": err, "logfile": log},
                                "$push": {"statehistory": {"state": FAILED, "timestamp": ts}},
                                "$pushAll": {"log": log_dicts},

        except SystemExit as e:
            logger.exception("pilot launcher thread caught system exit -- forcing application shutdown")
            import thread

    def handle_schedule (self, schedule) :

        # we want to use bulk submission to the pilots, so we collect all units
        # assigned to the same set of pilots.  At the same time, we select
        # unscheduled units for later insertion into the wait queue.
        if  not schedule :
            logger.debug ('skipping empty unit schedule')

      # print 'handle schedule:'
      # import pprint
      # pprint.pprint (schedule)
        pilot_cu_map = dict()
        unscheduled  = list()

        pilot_ids = self.list_pilots ()

        for unit in schedule['units'].keys() :

            pid = schedule['units'][unit]

            if  None == pid :
                unscheduled.append (unit)

            else :

                if  pid not in pilot_ids :
                    raise RuntimeError ("schedule points to unknown pilot %s" % pid)

                if  pid not in pilot_cu_map :
                    pilot_cu_map[pid] = list()

                pilot_cu_map[pid].append (unit)

        # submit to all pilots which got something submitted to
        for pid in pilot_cu_map.keys():

            units_to_schedule = list()

            # if a kernel name is in the cu descriptions set, do kernel expansion
            for unit in pilot_cu_map[pid] :

                if  not pid in schedule['pilots'] :
                    # lost pilot, do not schedule unit
                    logger.warn ("unschedule unit %s, lost pilot %s" % (unit.uid, pid))

                unit.sandbox = schedule['pilots'][pid]['sandbox'] + "/" + str(unit.uid)

                ud = unit.description

                if  'kernel' in ud and ud['kernel'] :

                    try :
                        from radical.ensemblemd.mdkernels import MDTaskDescription
                    except Exception as ex :
                        logger.error ("Kernels are not supported in" \
                              "compute unit descriptions -- install " \
                        # FIXME: unit needs a '_set_state() method or something!
                        self._session._dbs.set_compute_unit_state (unit._uid, FAILED, 
                                ["kernel expansion failed"])

                    pilot_resource = schedule['pilots'][pid]['resource']

                    mdtd           = MDTaskDescription ()
                    mdtd.kernel    = ud.kernel
                    mdtd_bound     = mdtd.bind (resource=pilot_resource)
                    ud.environment = mdtd_bound.environment
                    ud.pre_exec    = mdtd_bound.pre_exec
                    ud.executable  = mdtd_bound.executable
                    ud.mpi         = mdtd_bound.mpi

                units_to_schedule.append (unit)

            if  len(units_to_schedule) :
                self._worker.schedule_compute_units (pilot_uid=pid,

        # report any change in wait_queue_size
        old_wait_queue_size = self.wait_queue_size

        self.wait_queue_size = len(unscheduled)
        if  old_wait_queue_size != self.wait_queue_size :
            self._worker.fire_manager_callback (WAIT_QUEUE_SIZE, self,

        if  len(unscheduled) :
            self._worker.unschedule_compute_units (units=unscheduled)

        logger.info ('%s units remain unscheduled' % len(unscheduled))
    # Main loop of the unit manager controller worker.
    #
    # Until self._stop is set, each cycle fetches the compute units from
    # self._db and compares every unit's current state with the state cached
    # in self._shared_data.  Units seen for the first time get a fresh cache
    # entry; for every unit whose state differs from the cached one the
    # unit-level state callbacks are invoked.  The cached unit document is
    # refreshed on every cycle.
    #
    # Takes no arguments besides self and returns nothing.  SystemExit is
    # caught at the outermost level.
    def run(self):
        """run() is called when the process is started via

        # make sure to catch sys.exit (which raises SystemExit)

                "Worker thread (ID: %s[%s]) for UnitManager %s started." %
                (self.name, self.ident, self._um_id))

            # transfer results contains the futures to the results of the
            # asynchronous transfer operations.
            transfer_results = list()

            while not self._stop.is_set():

                # =================================================================
                # Check and update units. This needs to be optimized at
                # some point, i.e., state pulling should be conditional
                # or triggered by a tailable MongoDB cursor, etc.
                unit_list = self._db.get_compute_units(
                # 'action' records whether any unit changed state in this cycle
                action = False

                for unit in unit_list:
                    unit_id = str(unit["_id"])

                    # state as currently stored in the database document
                    new_state = unit["state"]
                    if unit_id in self._shared_data:
                        # known unit: compare against the state we cached last
                        old_state = self._shared_data[unit_id]["data"]["state"]
                        # unknown unit: no previous state, create a cache entry
                        old_state = None
                        self._shared_data[unit_id] = {
                            'data': unit,
                            'callbacks': [],
                            'facade_object': None

                    # always keep the most recent unit document in the cache
                    self._shared_data[unit_id]["data"] = unit

                    if new_state != old_state:
                        # On a state change, we fire zee callbacks.
                            "RUN ComputeUnit '%s' state changed from '%s' to '%s'."
                            % (unit_id, old_state, new_state))

                        # The state of the unit has changed, We call all
                        # unit-level callbacks to propagate this.
                        self.call_unit_state_callbacks(unit_id, new_state)

                        action = True

                # After the first iteration, we are officially initialized!
                if not self._initialized.is_set():

                # sleep a little if this cycle was idle
                if not action:

        except SystemExit as e:
                "unit manager controller thread caught system exit -- forcing application shutdown"
            import thread

            # shut down the autonomous input / output transfer worker(s)
            for worker in self._input_file_transfer_worker_pool:
                logger.debug("uworker %s stops   itransfer %s" %
                             (self.name, worker.name))
                logger.debug("uworker %s stopped itransfer %s" %
                             (self.name, worker.name))

            for worker in self._output_file_transfer_worker_pool:
                logger.debug("uworker %s stops   otransfer %s" %
                             (self.name, worker.name))
                logger.debug("uworker %s stopped otransfer %s" %
                             (self.name, worker.name))
    # Schedule a list of ComputeUnits onto one ComputePilot.
    #
    # Parameters:
    #   pilot_uid -- id of the pilot the units are scheduled on; used to look
    #                up the pilot via self._db.get_pilots and in log messages
    #   units     -- iterable of unit objects; each needs a `uid` and a
    #                `description` with `input_staging` / `output_staging`
    #
    # For every unit the staging bookkeeping attributes (FTW_* / Agent_*) are
    # reset, and the staging directives are split between the file transfer
    # worker (FTW) and the agent:
    #   * LINK, COPY and MOVE are handled by the agent,
    #   * TRANSFER is handled by the agent when the remote side has a URL
    #     scheme other than 'file', and by the FTW otherwise.
    # Units which have input staging directives are moved to
    # PENDING_INPUT_STAGING; units in cu_notransfer are moved to
    # PENDING_EXECUTION.
    #
    # Exceptions are caught and logged via logger.exception.
    def schedule_compute_units(self, pilot_uid, units):
        """Request the scheduling of one or more ComputeUnits on a

            cu_transfer   = list()
            cu_notransfer = list()

            # Get some information about the pilot sandbox from the database.
            pilot_info = self._db.get_pilots(pilot_ids=pilot_uid)
            # TODO: this hack below relies on what?! That there is just one pilot?
            pilot_sandbox = pilot_info[0]['sandbox']

            # Split units into two different lists: the first list contains the CUs
            # that need file transfer and the second list contains the CUs that
            # don't. The latter is added to the pilot directly, while the former
            # is added to the transfer queue.
            for unit in units:

                # Create object for staging status tracking
                unit.FTW_Input_Status = None
                unit.FTW_Input_Directives = []
                unit.Agent_Input_Status = None
                unit.Agent_Input_Directives = []
                unit.FTW_Output_Status = None
                unit.FTW_Output_Directives = []
                unit.Agent_Output_Status = None
                unit.Agent_Output_Directives = []

                # Split the input staging directives over the transfer worker and the agent
                input_sds = unit.description.input_staging
                if not isinstance(input_sds, list):
                    # Ugly, but is a workaround for iterating on attribute interface
                    # TODO: Verify if this piece of code is actually still required
                    if input_sds:
                        input_sds = [input_sds]
                        input_sds = []

                for input_sd_entry in input_sds:
                    action = input_sd_entry['action']
                    source = Url(input_sd_entry['source'])
                    target = Url(input_sd_entry['target'])

                    # normalized copy of the directive, with URLs as strings
                    new_sd = {'action':   action,
                              'source':   str(source),
                              'target':   str(target),
                              'flags':    input_sd_entry['flags'],
                              'priority': input_sd_entry['priority'],
                              'state':    PENDING

                    if action in [LINK, COPY, MOVE]:
                        unit.Agent_Input_Status = PENDING
                    elif action in [TRANSFER]:
                        if source.scheme and source.scheme != 'file':
                            # If there is a scheme and it is different than "file",
                            # assume a remote pull from the agent
                            unit.Agent_Input_Status = PENDING
                            # Transfer from local to sandbox
                            unit.FTW_Input_Status = PENDING
                        logger.warn('Not sure if action %s makes sense for input staging' % action)

                # Split the output staging directives over the transfer worker and the agent
                output_sds = unit.description.output_staging
                if not isinstance(output_sds, list):
                    # Ugly, but is a workaround for iterating on att iface
                    # TODO: Verify if this piece of code is actually still required
                    if output_sds:
                        output_sds = [output_sds]
                        output_sds = []

                for output_sds_entry in output_sds:
                    action = output_sds_entry['action']
                    source = Url(output_sds_entry['source'])
                    target = Url(output_sds_entry['target'])

                    # normalized copy of the directive, with URLs as strings
                    new_sd = {'action':   action,
                              'source':   str(source),
                              'target':   str(target),
                              'flags':    output_sds_entry['flags'],
                              'priority': output_sds_entry['priority'],
                              'state':    PENDING

                    # same membership test as used for input staging above
                    if action in [LINK, COPY, MOVE]:
                        unit.Agent_Output_Status = NEW
                    elif action == TRANSFER:
                        if target.scheme and target.scheme != 'file':
                            # If there is a scheme and it is different than "file",
                            # assume a remote push from the agent
                            unit.Agent_Output_Status = NEW
                            # Transfer from sandbox back to local
                            unit.FTW_Output_Status = NEW
                        logger.warn('Not sure if action %s makes sense for output staging' % action)

                if unit.FTW_Input_Directives or unit.Agent_Input_Directives:
                    log = "Scheduled for data transfer to ComputePilot %s." % pilot_uid
                    self._db.set_compute_unit_state(unit.uid, PENDING_INPUT_STAGING, log)

            # Bulk-add all non-transfer units-


            for unit in cu_notransfer:
                log = "Scheduled for execution on ComputePilot %s." % pilot_uid
                self._db.set_compute_unit_state(unit.uid, PENDING_EXECUTION, log)
                #self._set_state(uid, PENDING_EXECUTION, log)

                "Scheduled ComputeUnits %s for execution on ComputePilot '%s'." %
                (cu_notransfer, pilot_uid)
        # 'except ... as' matches the other handlers in this file and is valid
        # from Python 2.6 on
        except Exception as e:
            logger.exception ('error in unit manager controller (schedule())')
    def run(self):
        """Poll the database for ComputeUnit state changes until stopped.

        Each cycle fetches the compute units registered under this
        controller's unit manager id (``self._um_id``), refreshes the local
        cache in ``self._shared_data`` and calls the unit state callbacks
        for every unit whose state differs from the cached one.  A unit
        that is seen for the first time gets a fresh cache entry and is
        reported as a change from ``None``.

        ``self._initialized`` is set once the first cycle has completed.
        When the loop ends -- regularly or through an exception -- all
        input and output file transfer workers are stopped.  A
        ``SystemExit`` raised inside the loop is logged and turned into an
        interrupt of the main thread.
        """

        # make sure to catch sys.exit (which raises SystemExit)
        try:

            logger.debug("Worker thread (ID: %s[%s]) for UnitManager %s started." %
                        (self.name, self.ident, self._um_id))

            while not self._stop.is_set():

                # =================================================================
                # Check and update units. This needs to be optimized at
                # some point, i.e., state pulling should be conditional
                # or triggered by a tailable MongoDB cursor, etc.
                unit_list = self._db.get_compute_units(unit_manager_id=self._um_id)
                action = False

                for unit in unit_list:
                    unit_id = str(unit["_id"])
                    new_state = unit["state"]

                    if unit_id in self._shared_data:
                        old_state = self._shared_data[unit_id]["data"]["state"]
                    else:
                        # First time we see this unit: no previous state, so
                        # the comparison below always reports a change.
                        old_state = None
                        self._shared_data[unit_id] = {
                            'data':          unit,
                            'callbacks':     [],
                            'facade_object': None
                        }

                    # Always keep the freshest database record in the cache.
                    self._shared_data[unit_id]["data"] = unit

                    if new_state != old_state:
                        # On a state change, we fire zee callbacks.
                        logger.info("RUN ComputeUnit '%s' state changed from '%s' to '%s'." % (unit_id, old_state, new_state))

                        # The state of the unit has changed, We call all
                        # unit-level callbacks to propagate this.
                        self.call_unit_state_callbacks(unit_id, new_state)

                        action = True

                # After the first iteration, we are officially initialized!
                if not self._initialized.is_set():
                    self._initialized.set()

                # Back off a little if this cycle was idle.  Waiting on the
                # stop event (instead of a plain sleep) lets a stop request
                # end the pause immediately.
                # NOTE(review): the idle duration was missing from this
                # block; one second is a placeholder -- confirm against the
                # module's idle-time setting.
                if not action:
                    self._stop.wait(1.0)

        except SystemExit:
            logger.exception ("unit manager controller thread caught system exit -- forcing application shutdown")
            # Python 2 'thread' module; imported lazily since it is only
            # needed on this shutdown path.
            import thread
            thread.interrupt_main ()

        finally:
            # shut down the autonomous input / output transfer worker(s)
            for worker in self._input_file_transfer_worker_pool:
                logger.debug("uworker %s stops   itransfer %s" % (self.name, worker.name))
                worker.stop ()
                logger.debug("uworker %s stopped itransfer %s" % (self.name, worker.name))

            for worker in self._output_file_transfer_worker_pool:
                logger.debug("uworker %s stops   otransfer %s" % (self.name, worker.name))
                worker.stop ()
                logger.debug("uworker %s stopped otransfer %s" % (self.name, worker.name))
    # Worker main loop: connects to the session's pilot collection, claims
    # ComputePilots that are in PENDING_LAUNCH state for this pilot manager,
    # stages the bootstrapper (and, if needed, sdists / CA certs) and submits
    # the pilot agent as a SAGA job.  On success the pilot document is moved
    # to PENDING_ACTIVE, on any exception to FAILED.
    #
    # NOTE(review): many statements in this method are truncated in this
    # revision (unterminated docstring, missing `try:` / `else:` lines,
    # unclosed calls and dict literals).  The code lines are left untouched
    # here; restore the missing pieces from upstream before relying on it.
    def run(self):
        """Starts the process when Process.start() is called.

        # make sure to catch sys.exit (which raises SystemExit)
            # Get directory where this module lives
            mod_dir = os.path.dirname(os.path.realpath(__file__))

            # Try to connect to the database.  The pilot collection is named
            # "<session_id>.p" inside the configured database.
                connection = self.db_connection_info.get_db_handle()
                db = connection[self.db_connection_info.dbname]
                pilot_col = db["%s.p" % self.db_connection_info.session_id]
                    "Connected to MongoDB. Serving requests for PilotManager %s."
                    % self.pilot_manager_id)

            except Exception as e:
                logger.exception("Connection error: %s" % e)

            last_job_check = time.time()

            while not self._stop.is_set():

                # Periodically, we pull up all ComputePilots that are pending
                # execution or were last seen executing and check if the corresponding
                # SAGA job is still pending in the queue. If that is not the case,
                # we assume that the job has failed for some reasons and update
                # the state of the ComputePilot accordingly.
                # NOTE(review): only the timestamp reset is visible here; the
                # actual job check is not part of this block as shown.
                if last_job_check + JOB_CHECK_INTERVAL < time.time():
                    last_job_check = time.time()

                # See if we can find a ComputePilot that is waiting to be launched.
                # If we find one, we use SAGA to create a job service, a job
                # description and a job that is then sent to the local or remote
                # queueing system. If this succeeds, we set the ComputePilot's
                # state to pending, otherwise to failed.
                compute_pilot = None

                # Claim one pilot: match on this pilot manager and state
                # PENDING_LAUNCH, switch it to LAUNCHING and record the
                # transition (with timestamp) in its state history.
                ts = datetime.datetime.utcnow()
                compute_pilot = pilot_col.find_and_modify(
                        "pilotmanager": self.pilot_manager_id,
                        "state": PENDING_LAUNCH
                        "$set": {
                            "state": LAUNCHING
                        "$push": {
                            "statehistory": {
                                "state": LAUNCHING,
                                "timestamp": ts

                # NOTE(review): the body of this branch and the `else:` /
                # `try:` that introduce the launch code below are missing.
                if not compute_pilot:

                        # ------------------------------------------------------
                        # LAUNCH THE PILOT AGENT VIA SAGA
                        # `logentries` collects Logentry objects which are
                        # later attached to the pilot's database record.
                        logentries = []
                        pilot_id = str(compute_pilot["_id"])

                        logger.info("Launching ComputePilot %s" % pilot_id)

                        # ------------------------------------------------------
                        # Database connection parameters
                        session_uid = self.db_connection_info.session_id
                        database_url = self.db_connection_info.dburl
                        database_name = self.db_connection_info.dbname
                        database_auth = self.db_connection_info.dbauth

                        # ------------------------------------------------------
                        # pilot description and resource configuration
                        number_cores = compute_pilot['description']['cores']
                        runtime = compute_pilot['description']['runtime']
                        queue = compute_pilot['description']['queue']
                        project = compute_pilot['description']['project']
                        cleanup = compute_pilot['description']['cleanup']
                        resource_key = compute_pilot['description']['resource']
                        schema = compute_pilot['description']['access_schema']
                        memory = compute_pilot['description']['memory']
                        pilot_sandbox = compute_pilot['sandbox']
                        global_sandbox = compute_pilot['global_sandbox']

                        # we expand and exchange keys in the resource config,
                        # depending on the selected schema so better use a deep
                        # copy..
                        resource_cfg = self._session.get_resource_config(
                            resource_key, schema)

                        # import pprint
                        # pprint.pprint (resource_cfg)

                        # ------------------------------------------------------
                        # get parameters from cfg, set defaults where needed
                        # NOTE(review): several of the multi-line `.get(...)`
                        # calls below have lost their continuation line (key
                        # and/or default) -- restore before use.
                        agent_mongodb_endpoint = resource_cfg.get(
                            'agent_mongodb_endpoint', database_url)
                        agent_spawner = resource_cfg.get(
                            'agent_spawner', DEFAULT_AGENT_SPAWNER)
                        agent_type = resource_cfg.get('agent_type',
                        agent_scheduler = resource_cfg.get('agent_scheduler')
                        tunnel_bind_device = resource_cfg.get(
                        default_queue = resource_cfg.get('default_queue')
                        forward_tunnel_endpoint = resource_cfg.get(
                        js_endpoint = resource_cfg.get('job_manager_endpoint')
                        lrms = resource_cfg.get('lrms')
                        mpi_launch_method = resource_cfg.get(
                        pre_bootstrap = resource_cfg.get('pre_bootstrap')
                        python_interpreter = resource_cfg.get(
                        spmd_variation = resource_cfg.get('spmd_variation')
                        task_launch_method = resource_cfg.get(
                        rp_version = resource_cfg.get('rp_version',
                        virtenv_mode = resource_cfg.get(
                            'virtenv_mode', DEFAULT_VIRTENV_MODE)
                        virtenv = resource_cfg.get('virtenv', DEFAULT_VIRTENV)
                        stage_cacerts = resource_cfg.get(
                            'stage_cacerts', 'False')

                        # The config value is a string; convert it to a bool.
                        # NOTE(review): the `else:` between the two
                        # assignments below is missing.
                        if stage_cacerts.lower() == 'true':
                            stage_cacerts = True
                            stage_cacerts = False

                        # expand variables in virtenv string
                        virtenv = virtenv % {
                            'pilot_sandbox': saga.Url(pilot_sandbox).path,
                            'global_sandbox': saga.Url(global_sandbox).path

                        # Check for deprecated global_virtenv: if set, it
                        # overrides `virtenv` and forces virtenv_mode 'use'.
                        global_virtenv = resource_cfg.get('global_virtenv')
                        if global_virtenv:
                                "'global_virtenv' keyword is deprecated -- use 'virtenv' and 'virtenv_mode'"
                            virtenv = global_virtenv
                            virtenv_mode = 'use'

                        # set default scheme, host, port and dbname if not set
                        db_url = saga.Url(agent_mongodb_endpoint)
                        if not db_url.scheme: db_url.scheme = 'mongodb'
                        if not db_url.host: db_url.host = 'localhost'
                        if not db_url.port: db_url.port = 27017
                        if not database_name: database_name = 'radicalpilot'

                        # Create a host:port string for use by the bootstrapper.
                        database_hostport = "%s:%d" % (db_url.host,

                        # ------------------------------------------------------
                        # Copy the bootstrap shell script.  This also creates
                        # the sandbox. We use always "default_bootstrapper.sh"
                        bootstrapper = 'default_bootstrapper.sh'
                        bootstrapper_path = os.path.abspath("%s/../bootstrapper/%s" \
                                % (mod_dir, bootstrapper))

                        msg = "Using bootstrapper %s" % bootstrapper_path
                        logentries.append(Logentry(msg, logger=logger.info))

                        bs_script_url = saga.Url("file://localhost/%s" %
                        bs_script_tgt = saga.Url("%s/pilot_bootstrapper.sh" %

                        msg = "Copying bootstrapper '%s' to agent sandbox (%s)." \
                                % (bs_script_url, bs_script_tgt)
                        logentries.append(Logentry(msg, logger=logger.debug))

                        # NOTE(review): the remaining arguments of this call
                        # and the actual copy operation are not visible here.
                        bs_script = saga.filesystem.File(bs_script_url,

                        # ------------------------------------------------------
                        # the version of the agent is derived from
                        # rp_version, which has the following format
                        # and interpretation:
                        # case rp_version:
                        #   @<token>:
                        #   @tag/@branch/@commit: # no sdist staging
                        #       git clone $github_base radical.pilot.src
                        #       (cd radical.pilot.src && git checkout token)
                        #       pip install -t $VIRTENV/rp_install/ radical.pilot.src
                        #       rm -rf radical.pilot.src
                        #       export PYTHONPATH=$VIRTENV/rp_install:$PYTHONPATH
                        #   release: # no sdist staging
                        #       pip install -t $VIRTENV/rp_install radical.pilot
                        #       export PYTHONPATH=$VIRTENV/rp_install:$PYTHONPATH
                        #   local: # needs sdist staging
                        #       tar zxf $sdist.tgz
                        #       pip install -t $VIRTENV/rp_install $sdist/
                        #       export PYTHONPATH=$VIRTENV/rp_install:$PYTHONPATH
                        #   debug: # needs sdist staging
                        #       tar zxf $sdist.tgz
                        #       pip install -t $SANDBOX/rp_install $sdist/
                        #       export PYTHONPATH=$SANDBOX/rp_install:$PYTHONPATH
                        #   installed: # no sdist staging
                        #       true
                        # esac
                        # virtenv_mode
                        #   private : error  if ve exists, otherwise create, then use
                        #   update  : update if ve exists, otherwise create, then use
                        #   create  : use    if ve exists, otherwise create, then use
                        #   use     : use    if ve exists, otherwise error,  then exit
                        #   recreate: delete if ve exists, otherwise create, then use
                        # examples   :
                        #   virtenv@<tag>
                        #   virtenv@devel
                        #   virtenv@release
                        #   virtenv@installed
                        #   stage@local
                        #   stage@/tmp/my_agent.py
                        # Note that some combinations may be invalid,
                        # specifically in the context of virtenv_mode.  If, for
                        # example, virtenv_mode is 'use', then the 'virtenv:tag'
                        # will not make sense, as the virtenv is not updated.
                        # In those cases, the virtenv_mode is honored, and
                        # a warning is printed.
                        # Also, the 'stage' mode can only be combined with the
                        # 'local' source, or with a path to the agent (relative
                        # to mod_dir, or absolute).
                        # A rp_version which does not adhere to the
                        # above syntax is ignored, and the fallback stage@local
                        # is used.

                        # Accept only '@<token>' or one of the listed keywords.
                        # NOTE(review): 'release' is tested a few lines below
                        # but is not in this accepted list, so it is rejected
                        # here first -- confirm which of the two is intended.
                        if  not rp_version.startswith('@') and \
                            not rp_version in ['installed', 'local', 'debug']:
                            raise ValueError("invalid rp_version '%s'" %

                        # sdists are staged unless the version is taken from
                        # an existing install, a release or a '@<token>'.
                        stage_sdist = True
                        if rp_version in ['installed', 'release']:
                            stage_sdist = False

                        if rp_version.startswith('@'):
                            stage_sdist = False
                            rp_version = rp_version[1:]  # strip '@'

                        # ------------------------------------------------------
                        # Copy the rp sdist if needed.  We actually also stage
                        # the sdists for radical.utils and radical.saga, so that
                        # we have the complete stack to install...
                        if stage_sdist:

                            for path in [
                                    ru.sdist_path, saga.sdist_path, sdist_path

                                sdist_url = saga.Url("file://localhost/%s" %
                                msg = "Copying sdist '%s' to sdist sandbox (%s)." % (
                                    sdist_url, pilot_sandbox)
                                    Logentry(msg, logger=logger.debug))

                                # copy the sdist into the pilot sandbox
                                sdist_file = saga.filesystem.File(sdist_url)
                                sdist_file.copy("%s/" % (str(pilot_sandbox)))

                        # ------------------------------------------------------
                        # some machines cannot run pip due to outdated ca certs.
                        # For those, we also stage an updated cert bundle
                        if stage_cacerts:
                            cc_path = os.path.abspath("%s/../bootstrapper/%s" \
                                    % (mod_dir, 'cacert.pem.gz'))

                            cc_script_url = saga.Url("file://localhost/%s" %
                            cc_script_tgt = saga.Url("%s/cacert.pem.gz" %

                            # NOTE(review): the copy call that would use
                            # cc_script / cc_script_tgt is not visible here.
                            cc_script = saga.filesystem.File(
                                cc_script_url, session=self._session)

                        # ------------------------------------------------------
                        # sanity checks
                        if not agent_spawner:
                            raise RuntimeError("missing agent spawner")
                        if not agent_scheduler:
                            raise RuntimeError("missing agent scheduler")
                        if not lrms: raise RuntimeError("missing LRMS")
                        if not mpi_launch_method:
                            raise RuntimeError("missing mpi launch method")
                        if not task_launch_method:
                            raise RuntimeError("missing task launch method")

                        # massage some values
                        # The agent debug level comes from the environment
                        # (falling back to the logger's level).  A value which
                        # is not an integer is mapped from its level name to
                        # 1..5, with 0 for unknown names.
                        # NOTE(review): the `try:` guarding the int()
                        # conversion is missing.
                        debug_level = os.environ.get(
                            'RADICAL_PILOT_AGENT_VERBOSE', logger.level)
                            debug_level = int(debug_level)
                        except ValueError:
                            debug_level = {
                                'CRITICAL': 1,
                                'ERROR': 2,
                                'WARNING': 3,
                                'WARN': 3,
                                'INFO': 4,
                                'DEBUG': 5
                            }.get(debug_level, 0)

                        # fall back to the resource's default queue
                        if not queue:
                            queue = default_queue

                        # A boolean True for `cleanup` is expanded to the full
                        # set of cleanup flags.
                        if cleanup and isinstance(cleanup, bool):
                            cleanup = 'luve'  #  l : log files
                            #  u : unit work dirs
                            #  v : virtualenv
                            #  e : everything (== pilot sandbox)
                            # we never cleanup virtenvs which are not private
                            # NOTE(review): `is not` compares object identity,
                            # not string equality -- `!=` is presumably meant.
                            if virtenv_mode is not 'private':
                                cleanup = cleanup.replace('v', '')

                        # colon-separated list of sdist names for the bootstrapper
                        sdists = ':'.join(
                            [ru.sdist_name, saga.sdist_name, sdist_name])

                        # set mandatory args
                        bootstrap_args = ""
                        bootstrap_args += " -b '%s'" % sdists
                        bootstrap_args += " -c '%s'" % number_cores
                        bootstrap_args += " -d '%s'" % debug_level
                        bootstrap_args += " -g '%s'" % virtenv
                        bootstrap_args += " -j '%s'" % task_launch_method
                        bootstrap_args += " -k '%s'" % mpi_launch_method
                        bootstrap_args += " -l '%s'" % lrms
                        bootstrap_args += " -m '%s'" % database_hostport
                        bootstrap_args += " -n '%s'" % database_name
                        bootstrap_args += " -o '%s'" % agent_spawner
                        bootstrap_args += " -p '%s'" % pilot_id
                        bootstrap_args += " -q '%s'" % agent_scheduler
                        bootstrap_args += " -r '%s'" % runtime
                        bootstrap_args += " -s '%s'" % session_uid
                        bootstrap_args += " -t '%s'" % agent_type
                        bootstrap_args += " -u '%s'" % virtenv_mode
                        bootstrap_args += " -v '%s'" % rp_version

                        # set optional args
                        if database_auth:
                            bootstrap_args += " -a '%s'" % database_auth
                        if tunnel_bind_device:
                            bootstrap_args += " -D '%s'" % tunnel_bind_device
                        if pre_bootstrap:
                            bootstrap_args += " -e '%s'" % "' -e '".join(
                        if forward_tunnel_endpoint:
                            bootstrap_args += " -f '%s'" % forward_tunnel_endpoint
                        if python_interpreter:
                            bootstrap_args += " -i '%s'" % python_interpreter
                        if cleanup:
                            bootstrap_args += " -x '%s'" % cleanup

                        # ------------------------------------------------------
                        # now that the script is in place and we know where it is,
                        # we can launch the agent
                        # Job services are looked up per endpoint URL in
                        # self._shared_worker_data['job_services'].
                        # NOTE(review): the `else:` branch that creates and
                        # stores a new service is only partially visible.
                        js_url = saga.Url(js_endpoint)
                        logger.debug("saga.job.Service ('%s')" % js_url)
                        if js_url in self._shared_worker_data['job_services']:
                            js = self._shared_worker_data['job_services'][
                            js = saga.job.Service(js_url,
                                js_url] = js

                        # ------------------------------------------------------
                        # Create SAGA Job description and submit the pilot job

                        jd = saga.job.Description()

                        # The job runs the staged bootstrapper through a login
                        # shell, inside the pilot sandbox.
                        jd.executable = "/bin/bash"
                        jd.arguments = [
                            "-l pilot_bootstrapper.sh", bootstrap_args
                        jd.working_directory = saga.Url(pilot_sandbox).path
                        jd.project = project
                        jd.output = "agent.out"
                        jd.error = "agent.err"
                        jd.total_cpu_count = number_cores
                        jd.wall_time_limit = runtime
                        jd.total_physical_memory = memory
                        jd.queue = queue

                        # Set the SPMD variation only if required
                        if spmd_variation:
                            jd.spmd_variation = spmd_variation

                        # forward the profiling switch to the agent environment
                        if 'RADICAL_PILOT_PROFILE' in os.environ:
                            jd.environment = {'RADICAL_PILOT_PROFILE': 'TRUE'}

                        logger.debug("Bootstrap command line: %s %s" %
                                     (jd.executable, jd.arguments))

                        msg = "Submitting SAGA job with description: %s" % str(
                        logentries.append(Logentry(msg, logger=logger.debug))

                        # NOTE(review): only job creation is visible here; an
                        # explicit run/submit call is not shown.
                        pilotjob = js.create_job(jd)

                        # do a quick error check
                        if pilotjob.state == saga.FAILED:
                            raise RuntimeError("SAGA Job state is FAILED.")

                        # remember the job id and its service URL per pilot
                        saga_job_id = pilotjob.id
                        self._shared_worker_data['job_ids'][pilot_id] = [
                            saga_job_id, js_url

                        msg = "SAGA job submitted with job id %s" % str(
                        logentries.append(Logentry(msg, logger=logger.debug))

                        # ------------------------------------------------------

                        # NOTE(review): the loop body which fills log_dicts
                        # from the log entries is missing.
                        log_dicts = list()
                        for le in logentries:

                        # Update the Pilot's state to 'PENDING_ACTIVE' if SAGA job submission was successful.
                        # The update only matches while the pilot is still in
                        # the launching state.
                        # NOTE(review): the filter uses the literal
                        # 'Launching' rather than the LAUNCHING constant --
                        # presumably the same value; confirm.
                        ts = datetime.datetime.utcnow()
                        ret = pilot_col.update(
                                "_id": pilot_id,
                                "state": 'Launching'
                            }, {
                                "$set": {
                                    "state": PENDING_ACTIVE,
                                    "saga_job_id": saga_job_id
                                "$push": {
                                    "statehistory": {
                                        "state": PENDING_ACTIVE,
                                        "timestamp": ts
                                "$pushAll": {
                                    "log": log_dicts

                        if ret['n'] == 0:
                            # could not update, probably because the agent is
                            # running already.  Just update state history and
                            # jobid then
                            # FIXME: make sure of the agent state!
                            ret = pilot_col.update({"_id": pilot_id}, {
                                "$set": {
                                    "saga_job_id": saga_job_id
                                "$push": {
                                    "statehistory": {
                                        "state": PENDING_ACTIVE,
                                        "timestamp": ts
                                "$pushAll": {
                                    "log": log_dicts

                    except Exception as e:
                        # Update the Pilot's state 'FAILED'.
                        # Whatever agent output is available is fetched via
                        # self._get_pilot_logs and stored with the failure.
                        out, err, log = self._get_pilot_logs(
                            pilot_col, pilot_id)
                        ts = datetime.datetime.utcnow()

                        # FIXME: we seem to be unable to bson/json handle saga
                        # log messages containing an '#'.  This shows up here.
                        # Until we find a clean workaround, make log shorter and
                        # rely on saga logging to reveal the problem.
                        msg = "Pilot launching failed! (%s)" % e

                        # NOTE(review): the loop body and the update call
                        # opening the document below are missing;
                        # log_messages is not used in the visible code.
                        log_dicts = list()
                        log_messages = list()
                        for le in logentries:

                                "_id": pilot_id,
                                "state": {
                                    "$ne": FAILED
                            }, {
                                "$set": {
                                    "state": FAILED,
                                    "stdout": out,
                                    "stderr": err,
                                    "logfile": log
                                "$push": {
                                    "statehistory": {
                                        "state": FAILED,
                                        "timestamp": ts
                                "$pushAll": {
                                    "log": log_dicts

        # NOTE(review): the logging call around the message below and the
        # statement following the import are missing in this revision.
        except SystemExit as e:
                "pilot launcher thread caught system exit -- forcing application shutdown"
            import thread
    def run(self):
        """run() is called when the process is started via

        # make sure to catch sys.exit (which raises SystemExit)
        try :

            logger.debug("Worker thread (ID: %s[%s]) for PilotManager %s started." %
                        (self.name, self.ident, self._pm_id))

            while not self._stop.is_set():

                # # Check if one or more startup requests have finished.
                # self.startup_results_lock.acquire()

                # new_startup_results = list()

                # for transfer_result in self.startup_results:
                #     if transfer_result.ready():
                #         result = transfer_result.get()

                #         self._db.update_pilot_state(
                #             pilot_uid=result["pilot_uid"],
                #             state=result["state"],
                #             sagajobid=result["saga_job_id"],
                #             pilot_sandbox=result["sandbox"],
                #             global_sandbox=result["global_sandbox"],
                #             submitted=result["submitted"],
                #             logs=result["logs"]
                #         )

                #     else:
                #         new_startup_results.append(transfer_result)

                # self.startup_results = new_startup_results

                # self.startup_results_lock.release()

                # Check and update pilots. This needs to be optimized at
                # some point, i.e., state pulling should be conditional
                # or triggered by a tailable MongoDB cursor, etc.
                pilot_list = self._db.get_pilots(pilot_manager_id=self._pm_id)
                action = False

                for pilot in pilot_list:
                    pilot_id = str(pilot["_id"])

                    new_state = pilot["state"]
                    if pilot_id in self._shared_data:
                        old_state = self._shared_data[pilot_id]["data"]["state"]
                        old_state = None
                        self._shared_data[pilot_id] = {
                            'data':          pilot,
                            'callbacks':     [],
                            'facade_object': None

                    self._shared_data[pilot_id]['data'] = pilot

                    # FIXME: *groan* what a hack...  The Canceling state is by
                    # the nature of it not recorded in the database, but only in
                    # the local cache.  So if we see it as old state, we have to
                    # avoid state transitions into non-final states in the cache
                    # at all cost -- so we catch this here specifically
                    no_cb = False
                    if  old_state == CANCELING :
                        if  new_state not in [DONE, FAILED, CANCELED] :
                            # restore old state, making the cache explicitly
                            # different than the DB recorded state
                            self._shared_data[pilot_id]["data"]["state"] = old_state 

                            # do not trigger a state cb!
                            no_cb = True

                    if new_state != old_state :
                        action = True

                        if not no_cb :
                            # On a state change, we fire zee callbacks.
                            logger.info("ComputePilot '%s' state changed from '%s' to '%s'." \
                                            % (pilot_id, old_state, new_state))

                            # The state of the pilot has changed, We call all
                            # pilot-level callbacks to propagate this.  This also
                            # includes communication to the unit scheduler which
                            # may, or may not, cancel the pilot's units.
                            self.call_callbacks(pilot_id, new_state)

                    # If the state is 'DONE', 'FAILED' or 'CANCELED', we also
                    # set the state of the compute unit accordingly (but only
                    # for non-final units)
                    if new_state in [FAILED, DONE, CANCELED]:
                        unit_ids = self._db.pilot_list_compute_units(pilot_uid=pilot_id)
                        self._db.set_compute_unit_state (
                            src_states=[ PENDING_INPUT_STAGING,
                            log="Pilot '%s' has terminated with state '%s'. CU canceled." % (pilot_id, new_state))

                # After the first iteration, we are officially initialized!
                if not self._initialized.is_set():

                # sleep a little if this cycle was idle
                if  not action :

        except SystemExit as e :
            logger.exception ("pilot manager controller thread caught system exit -- forcing application shutdown")
            import thread
            thread.interrupt_main ()

        finally :
            # shut down the autonomous pilot launcher worker(s)
            for worker in self._pilot_launcher_worker_pool:
                logger.debug("pworker %s stops   launcher %s" % (self.name, worker.name))
                worker.stop ()
                logger.debug("pworker %s stopped launcher %s" % (self.name, worker.name))
    def register_cancel_pilots_request(self, pilot_ids=None):
        """Registers one or more pilots for cancelation.

        A ``COMMAND_CANCEL_PILOT`` command is first sent to all requested
        pilots via the database.  Pilots which, according to the local cache,
        have not yet become active (``PENDING_LAUNCH``, ``LAUNCHING``,
        ``PENDING_ACTIVE``) are marked as ``CANCELING`` in the cache and their
        job is canceled right away.  Pilots in any other non-final state get
        10 seconds to react to the command before their job is canceled, too.
        Pilots already in a final state, and pilots unknown to the local
        cache, are only reported via the logger.

        Errors raised while canceling individual jobs are logged and do not
        abort the handling of the remaining pilots.

        **Arguments:**
            * **pilot_ids** (`list` of `string`): IDs of the pilots to
              cancel.  If `None`, all pilots which the database lists for
              this pilot manager are canceled.
        """

        if pilot_ids is None:

            pilot_ids = list()

            for pilot in self._db.get_pilots(pilot_manager_id=self._pm_id) :
                pilot_ids.append (str(pilot["_id"]))

        self._db.send_command_to_pilot(COMMAND_CANCEL_PILOT, pilot_ids=pilot_ids)
        logger.info("Sent 'COMMAND_CANCEL_PILOT' command to pilots %s.", pilot_ids)

        # pilots which are in ACTIVE state should now have time to react on the
        # CANCEL command sent above.  Meanwhile, we'll cancel all pending
        # pilots.  If that is done, we wait a little, say 10 seconds, to give
        # the pilot time to pick up the request and shut down -- but if it does
        # not do that, it will get killed the hard way...
        delayed_cancel = list()

        for pilot_id in pilot_ids :
            if  pilot_id in self._shared_data :

                # read state from _shared_data only once, so that it does not
                # change under us...
                old_state = str(self._shared_data[pilot_id]["data"]["state"])

                logger.warn ("actively cancel pilot %s state: %s" % (pilot_id, old_state))
                if  old_state in [DONE, FAILED, CANCELED] :
                    logger.warn ("can't actively cancel pilot %s: already in final state" % pilot_id)

                elif old_state in [PENDING_LAUNCH, LAUNCHING, PENDING_ACTIVE] :
                    if pilot_id in self._shared_worker_data['job_ids'] :

                        try :
                            job_id, js_url = self._shared_worker_data['job_ids'][pilot_id]

                            # CANCELING only exists in the local cache
                            self._shared_data[pilot_id]["data"]["state"] = CANCELING
                            logger.info ("actively cancel pilot %s (%s, %s)" % (pilot_id, job_id, js_url))

                            js = self._shared_worker_data['job_services'][js_url]
                            job = js.get_job (job_id)
                            job.cancel ()
                        except Exception :
                            logger.exception ('pilot cancelation failed')

                    else :
                        logger.warn ("can't actively cancel pilot %s: no job id known" % pilot_id)
                        logger.debug (pprint.pformat (self._shared_worker_data))

                else :
                    logger.debug ("delay to actively cancel pilot %s: state %s" % (pilot_id, old_state))
                    delayed_cancel.append (pilot_id)

            else :
                logger.warn  ("can't actively cancel pilot %s: unknown pilot" % pilot_id)
                logger.debug (pprint.pformat (self._shared_data))

        # now tend to all delayed cancellation requests (ie. active pilots) --
        # if there are any
        if  delayed_cancel :

            # grant some leeway to the unruly children...
            time.sleep (10)

            for pilot_id in delayed_cancel :

                if pilot_id in self._shared_worker_data['job_ids'] :

                    try :
                        job_id, js_url = self._shared_worker_data['job_ids'][pilot_id]
                        logger.info ("actively cancel pilot %s (delayed) (%s, %s)" % (pilot_id, job_id, js_url))

                        js = self._shared_worker_data['job_services'][js_url]
                        job = js.get_job (job_id)
                        job.cancel ()
                    except Exception :
                        # the pilot may well have shut down on its own by now
                        logger.warn ('delayed pilot cancelation failed. '
                                'This is not necessarily a problem.')

                else :
                    logger.warn ("can't actively cancel pilot %s: no job id known (delayed)" % pilot_id)
                    logger.debug (pprint.pformat (self._shared_worker_data))
    def __init__ (self, database_url=None, database_name="radicalpilot",
                  uid=None, name=None):
        """Creates a new or reconnects to an existing session.

        If called without a uid, a new Session instance is created and
        stored in the database. If uid is set, an existing session is
        retrieved from the database.

        **Arguments:**
            * **database_url** (`string`): The MongoDB URL.  If none is given,
              RP uses the environment variable RADICAL_PILOT_DBURL.  If that is
              not set, an error will be raised.

            * **database_name** (`string`): An alternative database name
              (default: 'radicalpilot').

            * **uid** (`string`): If uid is set, we try
              re-connect to an existing session instead of creating a new one.

            * **name** (`string`): An optional human readable name.

        **Returns:**
            * A new Session instance.

        **Raises:**
            * :class:`radical.pilot.DatabaseError`
            * `PilotException` if no database URL can be determined, or if
              creating the new session fails.

        NOTE(review): this block looks damaged as it stands (truncated
        `ru.dict_merge` call, no `try:` for the final `except`, and no
        visible branch handling a given `uid`) -- see inline notes.
        """

        # init the base class inits
        saga.Session.__init__ (self)
        Object.__init__ (self)

        # before doing anything else, set up the debug helper for the lifetime
        # of the session.
        self._debug_helper = ru.DebugHelper ()

        # Lists holding all manager objects created during the session.
        self._pilot_manager_objects = list()
        self._unit_manager_objects = list()

        # Create a new process registry. All objects belonging to this
        # session will register their worker processes (if they have any)
        # in this registry. This makes it easier to shut down things in
        # a more coordinated fashion.
        self._process_registry = _ProcessRegistry()

        # The resource configuration dictionary associated with the session.
        self._resource_configs = {}

        self._database_url  = database_url
        self._database_name = database_name 

        # fall back to the environment if no URL was passed in
        if  not self._database_url :
            self._database_url = os.getenv ("RADICAL_PILOT_DBURL", None)

        if  not self._database_url :
            raise PilotException ("no database URL (set RADICAL_PILOT_DBURL)")  

        logger.info("using database url  %s" % self._database_url)

        # if the database url contains a path element, we interpret that as
        # database name (without the leading slash)
        tmp_url = ru.Url (self._database_url)
        if  tmp_url.path            and \
            tmp_url.path[0]  == '/' and \
            len(tmp_url.path) >  1  :
            self._database_name = tmp_url.path[1:]
            logger.info("using database path %s" % self._database_name)
        else :
            logger.info("using database name %s" % self._database_name)

        # Loading all "default" resource configurations: every *.json file in
        # the `configs` directory next to this module
        module_path   = os.path.dirname(os.path.abspath(__file__))
        default_cfgs  = "%s/configs/*.json" % module_path
        config_files  = glob.glob(default_cfgs)

        for config_file in config_files:

            try :
                rcs = ResourceConfig.from_file(config_file)
            except Exception as e :
                logger.error ("skip config file %s: %s" % (config_file, e))
                # NOTE(review): nothing skips the loop below after a failed
                # load, so `rcs` is either unbound (first file) or still holds
                # the previous file's configs -- a `continue` seems to be
                # missing here; confirm.

            for rc in rcs:
                logger.info("Loaded resource configurations for %s" % rc)
                self._resource_configs[rc] = rcs[rc].as_dict() 

        # Now load the user's own configurations from $HOME; these are merged
        # into (or added to) the defaults loaded above.
        user_cfgs     = "%s/.radical/pilot/configs/*.json" % os.environ.get ('HOME')
        config_files  = glob.glob(user_cfgs)

        for config_file in config_files:

            try :
                rcs = ResourceConfig.from_file(config_file)
            except Exception as e :
                logger.error ("skip config file %s: %s" % (config_file, e))
                # NOTE(review): same missing-`continue` concern as above.

            for rc in rcs:
                logger.info("Loaded resource configurations for %s" % rc)

                if  rc in self._resource_configs :
                    # config exists -- merge user config into it
                    # NOTE(review): the call below is truncated -- its
                    # remaining arguments and closing parenthesis are missing.
                    ru.dict_merge (self._resource_configs[rc],
                else :
                    # new config -- add as is
                    self._resource_configs[rc] = rcs[rc].as_dict() 

        # NOTE(review): a file *path* is handed to `read_json_str` -- confirm
        # that this helper accepts paths and not only JSON text.
        default_aliases = "%s/configs/aliases.json" % module_path
        self._resource_aliases = ru.read_json_str (default_aliases)['aliases']

        ## CREATE A NEW SESSION ##
        # NOTE(review): the body below is indented one level deeper than the
        # `if`, and the `try:` matching the `except` further down is not
        # present.  No branch for `uid is not None` is visible either.
        if uid is None:
                self._connected  = None

                # a user supplied name doubles as session ID; otherwise a
                # private ID is generated and also used as name
                if name :
                    self._name = name
                    self._uid  = name
                  # self._uid  = ru.generate_id ('rp.session.'+name+'.%(item_counter)06d', mode=ru.ID_CUSTOM)
                else :
                    self._uid  = ru.generate_id ('rp.session', mode=ru.ID_PRIVATE)
                    self._name = self._uid

                # NOTE(review): `db_name` receives the `database_name`
                # argument, not `self._database_name` -- a name derived from
                # the URL path above is therefore not used here; confirm.
                self._dbs, self._created, self._connection_info = \
                        dbSession.new(sid     = self._uid,
                                      name    = self._name,
                                      db_url  = self._database_url,
                                      db_name = database_name)

                logger.info("New Session created%s." % str(self))

            except Exception, ex:
                logger.exception ('session create failed')
                raise PilotException("Couldn't create new session (database URL '%s' incorrect?): %s" \
                                % (self._database_url, ex))  
    def _unit_state_callback (self, unit, state) :
        """React to a ComputeUnit state change reported to the scheduler.

        Only the states `NEW` and `UNSCHEDULED` are acted upon.  All pilot
        run queues are searched for the unit; wherever it is found, it is
        moved back to the wait queue and rescheduled.  Afterwards the pilot
        recorded in `unit.execution_details` is looked up: if the unit is
        still in that pilot's run queue it is removed, its cores are returned
        to the pilot's capacity (`caps`) and the pilot is rescheduled.

        The callback never raises: any exception is logged and ignored.

        **Arguments:**
            * **unit**: the unit whose state changed; `uid`,
              `execution_details` and `description.cores` are read.
            * **state**: the state the unit changed to.
        """
        try :

            with self.lock :
                uid = unit.uid

                logger.info ("[SchedulerCallback]: Computeunit %s changed to %s" % (uid, state))

                found_unit = False
                if  state in [NEW, UNSCHEDULED] :

                    for pid in self.runqs :

                        if  not pid :
                            logger.warning ('cannot handle final unit %s w/o pilot information' % uid)

                        if  uid in self.runqs[pid] :

                            logger.info ('reschedule NEW unit %s from %s' % (uid, pid))

                            # from here on we work with the queued unit object
                            unit       = self.runqs[pid][uid]
                            found_unit = True

                            # move the unit from the run queue back to the
                            # wait queue, and give it another chance
                            del self.runqs[pid][uid]
                            self.waitq[uid] = unit

                            self._reschedule (uid=uid)

                    # FIXME: as callbacks cannot be unregistered, this may also
                    #        be invoked for units which were handled before.

                    # NOTE(review): the handling below talks about *final*
                    # units but is guarded by the NEW/UNSCHEDULED check above
                    # -- confirm the intended state guard.

                    # the pilot which owned this CU should now have free slots available
                    # FIXME: how do I get the pilot from the CU?
                    pid = unit.execution_details.get ('pilot', None)

                    if  not pid :
                        raise RuntimeError ('cannot handle final unit %s w/o pilot information' % uid)

                    if  pid not in self.pilots :
                        logger.warning ('cannot handle unit %s cb for pilot %s (pilot is gone)' % (uid, pid))

                    else :
                        if  uid in self.runqs[pid] :

                            unit = self.runqs[pid][uid]

                            # free the unit's cores and let the pilot pick up
                            # new work
                            del self.runqs[pid][uid]
                            self.pilots[pid]['caps'] += unit.description.cores
                            self._reschedule (target_pid=pid)
                            found_unit = True

                        # This warning reads self.pilots[pid], so it must only
                        # be emitted while the pilot is still known -- for a
                        # gone pilot it would raise a KeyError, which the
                        # handler below would report as a spurious error.
                        if  not found_unit :
                            logger.warn ('unit %s freed %s cores on %s (== %s) -- not reused'
                                      % (uid, unit.description.cores, pid, self.pilots[pid]['caps']))

        except Exception as e :
            logger.error ("error in unit callback for backfiller (%s) - ignored" % e)
    def schedule_compute_units(self, pilot_uid, units):
        """Request the scheduling of one or more ComputeUnits on a
        ComputePilot.

        For every unit, the staging bookkeeping attributes are reset and the
        unit's input and output staging directives are inspected to decide
        whether the file transfer worker (`FTW_*`) or the agent (`Agent_*`)
        is responsible for them.  Unit states are then recorded in the
        database via `set_compute_unit_state`.

        **Arguments:**
            * **pilot_uid**: ID of the pilot to schedule the units on.
            * **units**: the units to schedule; `uid`,
              `description.input_staging` and `description.output_staging`
              are read, and the staging attributes are set on each unit.

        NOTE(review): this block looks damaged as it stands -- the `try:`
        for the final `except`, several `else:` lines, the closing braces of
        the `new_sd` dicts and the heads / tails of several calls are
        missing.  Inline notes mark the visible spots.
        """

            # NOTE(review): these lines are indented for an enclosing `try:`
            # which is not present.
            cu_transfer = list()
            cu_notransfer = list()

            # Get some information about the pilot sandbox from the database.
            pilot_info = self._db.get_pilots(pilot_ids=pilot_uid)
            # TODO: this hack below relies on what?! That there is just one pilot?
            pilot_sandbox = pilot_info[0]['sandbox']

            # Split units into two different lists: the first list contains the CUs
            # that need file transfer and the second list contains the CUs that
            # don't. The latter is added to the pilot directly, while the former
            # is added to the transfer queue.
            for unit in units:

                # Create object for staging status tracking
                unit.FTW_Input_Status = None
                unit.FTW_Input_Directives = []
                unit.Agent_Input_Status = None
                unit.Agent_Input_Directives = []
                unit.FTW_Output_Status = None
                unit.FTW_Output_Directives = []
                unit.Agent_Output_Status = None
                unit.Agent_Output_Directives = []

                # Split the input staging directives over the transfer worker and the agent
                input_sds = unit.description.input_staging
                if not isinstance(input_sds, list):
                    # Ugly, but is a workaround for iterating on attribute interface
                    # TODO: Verify if this piece of code is actually still required
                    # NOTE(review): as written, both assignments run for a
                    # truthy value, leaving an empty list -- an `else:` seems
                    # to be missing between them.
                    if input_sds:
                        input_sds = [input_sds]
                        input_sds = []

                for input_sd_entry in input_sds:
                    action = input_sd_entry['action']
                    source = Url(input_sd_entry['source'])
                    target = Url(input_sd_entry['target'])

                    # normalized copy of the directive, URLs as strings
                    # NOTE(review): the closing brace of this dict is missing.
                    new_sd = {
                        'action': action,
                        'source': str(source),
                        'target': str(target),
                        'flags': input_sd_entry['flags'],
                        'priority': input_sd_entry['priority'],
                        'state': PENDING

                    # LINK / COPY / MOVE are flagged for the agent; TRANSFER
                    # depends on the source URL scheme
                    if action in [LINK, COPY, MOVE]:
                        unit.Agent_Input_Status = PENDING
                    elif action in [TRANSFER]:
                        if source.scheme and source.scheme != 'file':
                            # If there is a scheme and it is different than "file",
                            # assume a remote pull from the agent
                            unit.Agent_Input_Status = PENDING
                            # NOTE(review): an `else:` seems to be missing
                            # before the next comment; the two lines after
                            # the assignment are the tail of a call whose
                            # head is missing.
                            # Transfer from local to sandbox
                            unit.FTW_Input_Status = PENDING
                            'Not sure if action %s makes sense for input staging'
                            % action)

                # Split the output staging directives over the transfer worker and the agent
                output_sds = unit.description.output_staging
                if not isinstance(output_sds, list):
                    # Ugly, but is a workaround for iterating on att iface
                    # TODO: Verify if this piece of code is actually still required
                    # NOTE(review): same missing-`else:` concern as for the
                    # input directives above.
                    if output_sds:
                        output_sds = [output_sds]
                        output_sds = []

                for output_sds_entry in output_sds:
                    action = output_sds_entry['action']
                    source = Url(output_sds_entry['source'])
                    target = Url(output_sds_entry['target'])

                    # NOTE(review): the closing brace of this dict is missing.
                    new_sd = {
                        'action': action,
                        'source': str(source),
                        'target': str(target),
                        'flags': output_sds_entry['flags'],
                        'priority': output_sds_entry['priority'],
                        'state': PENDING

                    # output side mirrors the input side, but the status
                    # flags start out as NEW and the *target* scheme decides
                    if action == LINK or action == COPY or action == MOVE:
                        unit.Agent_Output_Status = NEW
                    elif action == TRANSFER:
                        if target.scheme and target.scheme != 'file':
                            # If there is a scheme and it is different than "file",
                            # assume a remote push from the agent
                            unit.Agent_Output_Status = NEW
                            # NOTE(review): same damage as on the input side
                            # (missing `else:`, headless call tail).
                            # Transfer from sandbox back to local
                            unit.FTW_Output_Status = NEW
                            'Not sure if action %s makes sense for output staging'
                            % action)

                # units with any input directives go through input staging
                # NOTE(review): the line after `log = ...` is the tail of a
                # call whose head is missing.
                if unit.FTW_Input_Directives or unit.Agent_Input_Directives:
                    log = "Scheduled for data transfer to ComputePilot %s." % pilot_uid
                                                    PENDING_INPUT_STAGING, log)

            # Bulk-add all non-transfer units-


            # NOTE(review): the call below is not closed, and the two string
            # lines after the loop are the tail of a call whose head is
            # missing.
            for unit in cu_notransfer:
                log = "Scheduled for execution on ComputePilot %s." % pilot_uid
                self._db.set_compute_unit_state(unit.uid, PENDING_EXECUTION,
                #self._set_state(uid, PENDING_EXECUTION, log)

                "Scheduled ComputeUnits %s for execution on ComputePilot '%s'."
                % (cu_notransfer, pilot_uid))
        except Exception, e:
            logger.exception('error in unit manager controller (schedule())')
    def run(self):
        """Starts the process when Process.start() is called.

        Main loop of the output file transfer worker.  Until `self._stop` is
        set, the worker claims a ComputeUnit whose `FTW_Output_Status` is
        `PENDING` (switching it to `EXECUTING` / `STAGING_OUTPUT`), executes
        its `FTW_Output_Directives` by copying the files from the unit's
        remote sandbox with SAGA, and records the progress of each directive
        in the database.  A failing transfer marks the unit as `FAILED`.
        The loop then checks for units whose output directives are all
        completed, and finally for units which can be marked `DONE`.

        A `SystemExit` raised in this thread is logged and turned into an
        interrupt of the main thread.

        NOTE(review): this block looks damaged as it stands -- several
        `try:` / `else:` lines, loop-control statements, call heads and
        closing brackets are missing.  Inline notes mark the visible spots.
        """

        # make sure to catch sys.exit (which raises SystemExit)
        try :

            # Try to connect to the database and create a tailable cursor.
            # NOTE(review): the `try:` matching the `except` below is missing.
                connection = self.db_connection_info.get_db_handle()
                db = connection[self.db_connection_info.dbname]
                um_col = db["%s.cu" % self.db_connection_info.session_id]
                logger.debug("Connected to MongoDB. Serving requests for UnitManager %s." % self.unit_manager_id)

            except Exception as e:
                logger.exception("Connection error: %s" % e)

            while not self._stop.is_set():
                compute_unit = None

                # See if we can find a ComputeUnit that is waiting for
                # output file transfer.  The find_and_modify claims the unit
                # by flipping its status in the same operation.
                # NOTE(review): the call below is not closed.
                ts = datetime.datetime.utcnow()
                compute_unit = um_col.find_and_modify(
                    query={"unitmanager": self.unit_manager_id,
                           "FTW_Output_Status": PENDING},
                    update={"$set" : {"FTW_Output_Status": EXECUTING,
                                      "state": STAGING_OUTPUT},
                            "$push": {"statehistory": {"state": STAGING_OUTPUT, "timestamp": ts}}},
                # FIXME: AM: find_and_modify is not bulkable!
                state = STAGING_OUTPUT

                # NOTE(review): the idle branch has no statement left, and the
                # `else:` / `try:` lines for the processing path are missing.
                #logger.info("OFTW after finding pending cus")
                if compute_unit is None:
                    #logger.info("OFTW no cus, sleep")
                    # Sleep a bit if no new units are available.
                    logger.info("OFTW cu found, progressing ...")
                    compute_unit_id = None
                        # We have found a new CU. Now we can process the transfer
                        # directive(s) with SAGA.
                        compute_unit_id = str(compute_unit["_id"])
                        remote_sandbox = compute_unit["sandbox"]
                        staging_directives = compute_unit["FTW_Output_Directives"]

                        logger.info("Processing output file transfers for ComputeUnit %s" % compute_unit_id)
                        # Loop over all staging directives and execute them.
                        for sd in staging_directives:

                            # Check if there was a cancel request
                            # NOTE(review): the `find_one` call is not closed,
                            # and nothing leaves the loop after `state` is set
                            # to CANCELED.
                            state_doc = um_col.find_one(
                                {"_id": compute_unit_id},
                            if state_doc['state'] == CANCELED:
                                logger.info("Compute Unit Canceled, interrupting output file transfers.")
                                state = CANCELED

                            action = sd['action']
                            source = sd['source']
                            target = sd['target']
                            flags  = sd['flags']

                            # Mark the beginning of transfer this StagingDirective
                            # NOTE(review): head and tail of this database
                            # call are missing.
                                query={"_id" : compute_unit_id,
                                       'FTW_Output_Status': EXECUTING,
                                       'FTW_Output_Directives.state': PENDING,
                                       'FTW_Output_Directives.source': sd['source'],
                                       'FTW_Output_Directives.target': sd['target'],
                                update={'$set': {'FTW_Output_Directives.$.state': EXECUTING},
                                        '$push': {'log': {
                                            'timestamp': datetime.datetime.utcnow(),
                                            'message'  : 'Starting transfer of %s' % source}}

                            # the source is relative to the unit's sandbox
                            abs_source = "%s/%s" % (remote_sandbox, source)

                            # NOTE(review): both assignments run as written,
                            # the second one wins -- an `else:` seems to be
                            # missing between them.
                            if os.path.basename(target) == target:
                                abs_target = "file://localhost%s" % os.path.join(os.getcwd(), target)
                                abs_target = "file://localhost%s" % os.path.abspath(target)

                            log_msg = "Transferring output file %s -> %s" % (abs_source, abs_target)

                            # NOTE(review): the File() call below is not closed.
                            logger.debug ("saga.fs.File ('%s')" % saga.Url(abs_source))
                            output_file = saga.filesystem.File(saga.Url(abs_source),

                            # NOTE(review): as written `copy_flags` always ends
                            # up as 0 -- an `else:` seems to be missing.
                            if CREATE_PARENTS in flags:
                                copy_flags = saga.filesystem.CREATE_PARENTS
                                copy_flags = 0
                            logger.debug ("saga.fs.File.copy ('%s')" % saga.Url(abs_target))
                            output_file.copy(saga.Url(abs_target), flags=copy_flags)

                            # If all went fine, update the state of this StagingDirective to Done
                            # NOTE(review): head and tail of this database
                            # call are missing.
                                query={"_id" : compute_unit_id,
                                       'FTW_Output_Status': EXECUTING,
                                       'FTW_Output_Directives.state': EXECUTING,
                                       'FTW_Output_Directives.source': sd['source'],
                                       'FTW_Output_Directives.target': sd['target'],
                                update={'$set': {'FTW_Output_Directives.$.state': DONE},
                                        '$push': {'log': {
                                            'timestamp': datetime.datetime.utcnow(),
                                            'message'  : log_msg}}

                    except Exception as e :
                        # Update the CU's state to 'FAILED'.
                        # NOTE(review): the update call below is not closed.
                        ts = datetime.datetime.utcnow()
                        log_message = "Output transfer failed: %s" % e
                        # TODO: not only mark the CU as failed, but also the specific Directive
                        um_col.update({'_id': compute_unit_id}, {
                            '$set': {'state': FAILED},
                            '$push': {
                                'statehistory': {'state': FAILED, 'timestamp': ts},
                                'log': {'message': log_message, 'timestamp': ts}
                        logger.exception (log_message)

                # Code below is only to be run by the "first" or only worker
                # NOTE(review): the bodies of the next two `if` statements
                # are missing.
                if self._worker_number > 1:

                # If the CU was canceled we can skip the remainder of this loop.
                if state == CANCELED:

                # Check to see if there are more active Directives, if not, we are Done
                # NOTE(review): the find() call below is not closed.
                cursor_w = um_col.find({"unitmanager": self.unit_manager_id,
                                        "$or": [ {"Agent_Output_Status": EXECUTING},
                                                 {"FTW_Output_Status": EXECUTING}
                # Iterate over all the returned CUs (if any)
                for cu in cursor_w:
                    # See if there are any FTW Output Directives still pending
                    if cu['FTW_Output_Status'] == EXECUTING and \
                            not any(d['state'] == EXECUTING or d['state'] == PENDING for d in cu['FTW_Output_Directives']):
                        # All Output Directives for this FTW are done, mark the CU accordingly
                        # NOTE(review): the update call below is not closed.
                        um_col.update({"_id": cu["_id"]},
                                      {'$set': {'FTW_Output_Status': DONE},
                                       '$push': {'log': {
                                           'timestamp': datetime.datetime.utcnow(),
                                           'message'  : 'All FTW output staging directives done - %d.' % self._worker_number}}

                    # See if there are any Agent Output Directives still pending
                    if cu['Agent_Output_Status'] == EXECUTING and \
                            not any(d['state'] == EXECUTING or d['state'] == PENDING for d in cu['Agent_Output_Directives']):
                        # All Output Directives for this Agent are done, mark the CU accordingly
                        # NOTE(review): the update call below is not closed.
                        um_col.update({"_id": cu["_id"]},
                                      {'$set': {'Agent_Output_Status': DONE},
                                       '$push': {'log': {
                                           'timestamp': datetime.datetime.utcnow(),
                                           'message'  : 'All Agent Output Staging Directives done-%d.' % self._worker_number}}

                # Check for all CUs if both Agent and FTW staging is done, we can then mark the CU Done
                # NOTE(review): head and tail of this database call are
                # missing.
                ts = datetime.datetime.utcnow()
                    query={"unitmanager": self.unit_manager_id,
                           # TODO: Now that our state model is linear,
                           # we probably don't need to check Agent_Output_Status anymore.
                           # Given that it is not updated by the agent currently, disable it here.
                           #"Agent_Output_Status": { "$in": [ None, DONE ] },
                           "FTW_Output_Status": { "$in": [ None, DONE ] },
                           "state": STAGING_OUTPUT
                    update={"$set": {
                        "state": DONE
                            "$push": {
                                "statehistory": {"state": DONE, "timestamp": ts}

        except SystemExit as e :
            logger.exception("output file transfer thread caught system exit -- forcing application shutdown")
            # imported here, where it is needed, to interrupt the main thread
            import thread
            thread.interrupt_main ()
    def check_pilot_states(self, pilot_col):
        """Health check for the pilots of this pilot manager.

        All pilots which the given collection lists as `PENDING_ACTIVE` or
        `ACTIVE` for `self.pilot_manager_id` are inspected: the SAGA job
        behind each pilot is looked up and its state is examined.  Pilots
        whose job failed, was canceled, or can not be checked are recorded
        as `FAILED`; pilots whose job is done are recorded as `DONE`.  In
        both cases stdout, stderr and the logfile as returned by
        `self._get_pilot_logs` are stored with the pilot.

        **Arguments:**
            * **pilot_col**: the database collection holding the pilot
              documents.

        NOTE(review): this block looks damaged as it stands -- `try:` /
        `else:` lines, call heads and closing brackets are missing.  Inline
        notes mark the visible spots.
        """

        # NOTE(review): the find() call below is not closed.
        pending_pilots = pilot_col.find(
            {"pilotmanager": self.pilot_manager_id, "state": {"$in": [PENDING_ACTIVE, ACTIVE]}}

        for pending_pilot in pending_pilots:

            pilot_failed = False
            pilot_done = False
            reconnected = False
            pilot_id = pending_pilot["_id"]
            log_message = ""
            saga_job_id = pending_pilot["saga_job_id"]

            logger.info("Performing periodical health check for %s (SAGA job id %s)" % (str(pilot_id), saga_job_id))

            # self.missing_pilots counts failed reconnect attempts per pilot
            if not pilot_id in self.missing_pilots:
                self.missing_pilots[pilot_id] = 0

            # Create a job service object:
            # NOTE(review): the `try:` matching the `except` below is missing.
                # the job service URL is the first bracketed part of the job ID
                js_url = saga_job_id.split("]-[")[0][1:]

                # NOTE(review): as written, a cached job service is always
                # replaced by a new one -- an `else:` seems to be missing.
                if js_url in self._shared_worker_data["job_services"]:
                    js = self._shared_worker_data["job_services"][js_url]
                    js = saga.job.Service(js_url, session=self._session)
                    self._shared_worker_data["job_services"][js_url] = js

                saga_job = js.get_job(saga_job_id)
                reconnected = True

                if saga_job.state in [saga.job.FAILED, saga.job.CANCELED]:
                    pilot_failed = True
                    log_message = "SAGA job state for ComputePilot %s is %s." % (pilot_id, saga_job.state)

                if saga_job.state in [saga.job.DONE]:
                    pilot_done = True
                    log_message = "SAGA job state for ComputePilot %s is %s." % (pilot_id, saga_job.state)

            except Exception as e:

                # NOTE(review): an `else:` seems to be missing before the
                # second warning below; as written, a failed reconnect always
                # ends with `pilot_failed = True`.  The final assignment is
                # not closed either.
                if not reconnected:
                    logger.warning("could not reconnect to pilot for state check (%s)" % e)
                    self.missing_pilots[pilot_id] += 1

                    # NOTE(review): the message hard-codes "10 attempts" while
                    # the limit is JOB_CHECK_MAX_MISSES -- confirm they agree.
                    if self.missing_pilots[pilot_id] >= JOB_CHECK_MAX_MISSES:
                        logger.debug("giving up after 10 attempts")
                        pilot_failed = True
                        log_message = "Could not reconnect to pilot %s " "multiple times - giving up" % pilot_id
                    logger.warning("pilot state check failed: %s" % e)
                    pilot_failed = True
                    log_message = (
                        "Couldn't determine job state for ComputePilot %s. " "Assuming it has failed." % pilot_id

            if pilot_failed:
                # NOTE(review): head and tail of the database call below are
                # missing.
                out, err, log = self._get_pilot_logs(pilot_col, pilot_id)
                ts = datetime.datetime.utcnow()
                    {"_id": pilot_id, "state": {"$ne": DONE}},
                        "$set": {"state": FAILED, "stdout": out, "stderr": err, "logfile": log},
                        "$push": {
                            "statehistory": {"state": FAILED, "timestamp": ts},
                            "log": {"message": log_message, "timestamp": ts},
                logger.warn("pilot %s declared dead" % pilot_id)

            elif pilot_done:
                # FIXME: this should only be done if the state is not yet
                # done...
                # NOTE(review): head and tail of the database call below are
                # missing.
                out, err, log = self._get_pilot_logs(pilot_col, pilot_id)
                ts = datetime.datetime.utcnow()
                    {"_id": pilot_id, "state": {"$ne": DONE}},
                        "$set": {"state": DONE, "stdout": out, "stderr": err, "logfile": log},
                        "$push": {
                            "statehistory": {"state": DONE, "timestamp": ts},
                            "log": {"message": log_message, "timestamp": ts},
                # NOTE(review): same "declared dead" warning as in the failed
                # branch, although this branch records DONE -- confirm.
                logger.warn("pilot %s declared dead" % pilot_id)

                # NOTE(review): `else:` lines seem to be missing around the
                # statements below; as written both messages are logged, and
                # only in the `pilot_done` branch.
                if self.missing_pilots[pilot_id]:
                    logger.info("pilot %s *assumed* alive and well (%s)" % (pilot_id, self.missing_pilots[pilot_id]))
                    logger.info("pilot %s seems alive and well" % (pilot_id))