Example #1
0
 def stop(self):
     """stop() signals the process to finish up and terminate.
     """
     logger.debug("otransfer %s stopping" % (self.name))
     self._stop.set()
     self.join()
     logger.debug("otransfer %s stopped" % (self.name))
Example #2
0
 def cancel_launcher(self):
     """cancel the launcher threads
     """
     for worker in self._pilot_launcher_worker_pool:
         logger.debug("pworker %s stops   launcher %s" % (self.name, worker.name))
         worker.stop ()
         worker.join ()
         logger.debug("pworker %s stopped launcher %s" % (self.name, worker.name))
Example #3
0
    def __init__(self):
        """ Le constructeur. Not meant to be called directly.
        """
        # 'static' members
        self._uid = None
        self._name = None
        self._description = None
        self._manager = None

        # handle to the manager's worker
        self._worker = None

        if os.getenv("RADICAL_PILOT_GCDEBUG", None) is not None:
            logger.debug("GCDEBUG __init__(): ComputeUnit [object id: %s]." % id(self))
Example #4
0
    def __init__(self):
        """ Le constructeur. Not meant to be called directly.
        """
        # 'static' members
        self._uid = None
        self._name = None
        self._description = None
        self._manager = None

        # handle to the manager's worker
        self._worker = None

        if os.getenv("RADICAL_PILOT_GCDEBUG", None) is not None:
            logger.debug("GCDEBUG __init__(): ComputeUnit [object id: %s]." %
                         id(self))
Example #5
0
    def _get(pilot_manager_obj, pilot_ids):
        """ PRIVATE: Get one or more pilot via their UIDs.
        """
        pilots_json = pilot_manager_obj._worker.get_compute_pilot_data(
            pilot_ids=pilot_ids)

        # create and return pilot objects
        pilots = []

        for p in pilots_json:
            pilot = ComputePilot()
            pilot._uid = str(p['_id'])
            pilot._description = p['description']
            pilot._manager = pilot_manager_obj

            pilot._worker = pilot._manager._worker

            logger.debug("Reconnected to existing ComputePilot %s" % str(pilot))
            pilots.append(pilot)

        return pilots
Example #6
0
    def cancel(self):
        """Cancel the ComputeUnit.

        **Raises:**

            * :class:`radical.pilot.PilotException`
        """
        # Check if this instance is valid
        if not self._uid:
            raise BadParameter("Invalid Compute Unit instance.")

        cu_json = self._worker.get_compute_unit_data(self.uid)
        pilot_uid = cu_json['pilot']

        if self.state in [DONE, FAILED, CANCELED]:
            # nothing to do
            logger.debug("Compute unit %s has state %s, can't cancel any longer." % (self._uid, self.state))

        elif self.state in [NEW, UNSCHEDULED, PENDING_INPUT_STAGING]:
            logger.debug("Compute unit %s has state %s, going to prevent from starting." % (self._uid, self.state))
            self._manager._session._dbs.set_compute_unit_state(self._uid, CANCELED, ["Received Cancel"])

        elif self.state == STAGING_INPUT:
            logger.debug("Compute unit %s has state %s, will cancel the transfer." % (self._uid, self.state))
            self._manager._session._dbs.set_compute_unit_state(self._uid, CANCELED, ["Received Cancel"])

        elif self.state in [PENDING_EXECUTION, SCHEDULING]:
            logger.debug("Compute unit %s has state %s, will abort start-up." % (self._uid, self.state))
            self._manager._session._dbs.set_compute_unit_state(self._uid, CANCELED, ["Received Cancel"])

        elif self.state == EXECUTING:
            logger.debug("Compute unit %s has state %s, will terminate the task." % (self._uid, self.state))
            self._manager._session._dbs.send_command_to_pilot(cmd=COMMAND_CANCEL_COMPUTE_UNIT, arg=self.uid, pilot_ids=pilot_uid)

        elif self.state == PENDING_OUTPUT_STAGING:
            logger.debug("Compute unit %s has state %s, will abort the transfer." % (self._uid, self.state))
            self._manager._session._dbs.set_compute_unit_state(self._uid, CANCELED, ["Received Cancel"])

        elif self.state == STAGING_OUTPUT:
            logger.debug("Compute unit %s has state %s, will cancel the transfer." % (self._uid, self.state))
            self._manager._session._dbs.set_compute_unit_state(self._uid, CANCELED, ["Received Cancel"])

        else:
            raise IncorrectState("Unknown Compute Unit state: %s, cannot cancel" % self.state)

        # done canceling
        return
Example #7
0
    def run(self):
        """Starts the process when Process.start() is called.
        """

        # make sure to catch sys.exit (which raises SystemExit)
        try :

            logger.info("Starting InputFileTransferWorker")

            # Try to connect to the database and create a tailable cursor.
            try:
                connection = self.db_connection_info.get_db_handle()
                db = connection[self.db_connection_info.dbname]
                um_col = db["%s.cu" % self.db_connection_info.session_id]
                logger.debug("Connected to MongoDB. Serving requests for UnitManager %s." % self.unit_manager_id)

            except Exception as e :
                logger.exception("Connection error: %s" % e)
                raise

            try :
                while not self._stop.is_set():
                    # See if we can find a ComputeUnit that is waiting for
                    # input file transfer.
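                    # (find_and_modify claims the matching document atomically,
                    # so concurrent transfer workers should not pick up the
                    # same ComputeUnit)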
                    compute_unit = None

                    ts = datetime.datetime.utcnow()
                    compute_unit = um_col.find_and_modify(
                        query={"unitmanager": self.unit_manager_id,
                               "FTW_Input_Status": PENDING},
                        update={"$set" : {"FTW_Input_Status": EXECUTING,
                                          "state": STAGING_INPUT},
                                "$push": {"statehistory": {"state": STAGING_INPUT, "timestamp": ts}}},
                        limit=BULK_LIMIT # TODO: bulklimit is probably not the best way to ensure there is just one
                    )
                    # FIXME: AM: find_and_modify is not bulkable!
                    state = STAGING_INPUT

                    if compute_unit is None:
                        # Sleep a bit if no new units are available.
                        time.sleep(IDLE_TIME) 

                    else:
                        compute_unit_id = None
                        try:
                            log_messages = []

                            # We have found a new CU. Now we can process the transfer
                            # directive(s) with SAGA.
                            compute_unit_id = str(compute_unit["_id"])
                            remote_sandbox = compute_unit["sandbox"]
                            input_staging = compute_unit["FTW_Input_Directives"]

                            # We need to create the CU's directory in case it doesn't exist yet.
                            log_msg = "Creating ComputeUnit sandbox directory %s." % remote_sandbox
                            log_messages.append(log_msg)
                            logger.info(log_msg)

                            # Creating the sandbox directory.
                            try:
                                logger.debug ("saga.fs.Directory ('%s')" % remote_sandbox)

                                remote_sandbox_keyurl = saga.Url (remote_sandbox)
                                remote_sandbox_keyurl.path = '/'
                                remote_sandbox_key = str(remote_sandbox_keyurl)

                                if  remote_sandbox_key not in self._saga_dirs :
                                    self._saga_dirs[remote_sandbox_key] = \
                                            saga.filesystem.Directory (remote_sandbox_key,
                                                    flags=saga.filesystem.CREATE_PARENTS,
                                                    session=self._session)

                                saga_dir = self._saga_dirs[remote_sandbox_key]
                                saga_dir.make_dir (remote_sandbox, 
                                                   flags=saga.filesystem.CREATE_PARENTS)
                            except Exception as e :
                                logger.exception('Error: %s' % e)
                                # FIXME: why is this exception ignored?  AM


                            logger.info("Processing input file transfers for ComputeUnit %s" % compute_unit_id)
                            # Loop over all transfer directives and execute them.
                            for sd in input_staging:

                                state_doc = um_col.find_one(
                                    {"_id": compute_unit_id},
                                    fields=["state"]
                                )
                                if state_doc['state'] == CANCELED:
                                    logger.info("Compute Unit Canceled, interrupting input file transfers.")
                                    state = CANCELED
                                    break

                                abs_src = os.path.abspath(sd['source'])
                                input_file_url = saga.Url("file://localhost/%s" % abs_src)
                                if not sd['target']:
                                    target = remote_sandbox
                                else:
                                    target = "%s/%s" % (remote_sandbox, sd['target'])

                                log_msg = "Transferring input file %s -> %s" % (input_file_url, target)
                                log_messages.append(log_msg)
                                logger.debug(log_msg)

                                # Execute the transfer.
                                logger.debug ("saga.fs.File ('%s')" % input_file_url)
                                input_file = saga.filesystem.File(
                                    input_file_url,
                                    session=self._session
                                )

                                if CREATE_PARENTS in sd['flags']:
                                    copy_flags = saga.filesystem.CREATE_PARENTS
                                else:
                                    copy_flags = 0

                                try :
                                    input_file.copy(target, flags=copy_flags)
                                except Exception as e :
                                    logger.exception (e)
                                input_file.close()

                                # If all went fine, update the state of this StagingDirective to Done
                                um_col.find_and_modify(
                                    query={"_id" : compute_unit_id,
                                           'FTW_Input_Status': EXECUTING,
                                           'FTW_Input_Directives.state': PENDING,
                                           'FTW_Input_Directives.source': sd['source'],
                                           'FTW_Input_Directives.target': sd['target'],
                                           },
                                    update={'$set': {'FTW_Input_Directives.$.state': 'Done'},
                                            '$push': {'log': {
                                                'timestamp': datetime.datetime.utcnow(), 
                                                'message'  : log_msg}}
                                    }
                                )

                        except Exception as e :
                            # Update the CU's state to 'FAILED'.
                            ts = datetime.datetime.utcnow()
                            logentry = {'message'  : "Input transfer failed: %s" % e,
                                        'timestamp': ts}

                            um_col.update({'_id': compute_unit_id}, {
                                '$set': {'state': FAILED},
                                '$push': {
                                    'statehistory': {'state': FAILED, 'timestamp': ts},
                                    'log': logentry
                                }
                            })

                            logger.exception(str(logentry))

                    # Code below is only to be run by the "first" or only worker
                    if self._worker_number > 1:
                        continue

                    # If the CU was canceled we can skip the remainder of this loop.
                    if state == CANCELED:
                        continue

                    #
                    # Check to see if there are more pending Directives, if not, we are Done
                    #
                    cursor_w = um_col.find({"unitmanager": self.unit_manager_id,
                                            "$or": [ {"Agent_Input_Status": EXECUTING},
                                                     {"FTW_Input_Status": EXECUTING}
                                                   ]
                                            }
                                           )
                    # Iterate over all the returned CUs (if any)
                    for cu in cursor_w:
                        # See if there are any FTW Input Directives still pending
                        if cu['FTW_Input_Status'] == EXECUTING and \
                                not any(d['state'] == EXECUTING or d['state'] == PENDING for d in cu['FTW_Input_Directives']):
                            # All Input Directives for this FTW are done, mark the CU accordingly
                            um_col.update({"_id": cu["_id"]},
                                          {'$set': {'FTW_Input_Status': DONE},
                                           '$push': {'log': {
                                                'timestamp': datetime.datetime.utcnow(),
                                                'message'  : 'All FTW Input Staging Directives done - %d.' % self._worker_number}}
                                           }
                            )

                        # See if there are any Agent Input Directives still pending or executing,
                        # if not, mark it DONE.
                        if cu['Agent_Input_Status'] == EXECUTING and \
                                not any(d['state'] == EXECUTING or d['state'] == PENDING for d in cu['Agent_Input_Directives']):
                            # All Input Directives for this Agent are done, mark the CU accordingly
                            um_col.update({"_id": cu["_id"]},
                                           {'$set': {'Agent_Input_Status': DONE},
                                            '$push': {'log': {
                                                'timestamp': datetime.datetime.utcnow(), 
                                                'message'  : 'All Agent Input Staging Directives done - %d.' % self._worker_number}}
                                           }
                            )

                    #
                    # Check for all CUs if both Agent and FTW staging is done, we can then mark the CU PendingExecution
                    #
                    ts = datetime.datetime.utcnow()
                    um_col.find_and_modify(
                        query={"unitmanager": self.unit_manager_id,
                               "Agent_Input_Status": { "$in": [ None, DONE ] },
                               "FTW_Input_Status": { "$in": [ None, DONE ] },
                               "state": STAGING_INPUT
                        },
                        update={"$set": {
                                    "state": PENDING_EXECUTION
                                },
                                "$push": {
                                    "statehistory": {"state": PENDING_EXECUTION, "timestamp": ts}
                                }
                        }
                    )

            except Exception as e :

                logger.exception("transfer worker error: %s" % e)
                self._session.close (cleanup=False)
                raise

        except SystemExit as e :
            logger.debug("input file transfer thread caught system exit -- forcing application shutdown")
            import thread
            thread.interrupt_main ()
Example #8
0
    def run(self):
        """Starts the process when Process.start() is called.
        """

        # make sure to catch sys.exit (which raises SystemExit)
        try:
            # Get directory where this module lives
            mod_dir = os.path.dirname(os.path.realpath(__file__))

            # Try to connect to the database
            try:
                connection = self.db_connection_info.get_db_handle()
                db = connection[self.db_connection_info.dbname]
                pilot_col = db["%s.p" % self.db_connection_info.session_id]
                logger.debug("Connected to MongoDB. Serving requests for PilotManager %s." % self.pilot_manager_id)

            except Exception as e:
                logger.exception("Connection error: %s" % e)
                return

            last_job_check = time.time()

            while not self._stop.is_set():

                # Periodically, we pull up all ComputePilots that are pending
                # execution or were last seen executing and check if the corresponding
                # SAGA job is still pending in the queue. If that is not the case,
                # we assume that the job has failed for some reason and update
                # the state of the ComputePilot accordingly.
                if last_job_check + JOB_CHECK_INTERVAL < time.time():
                    last_job_check = time.time()
                    self.check_pilot_states(pilot_col)

                # See if we can find a ComputePilot that is waiting to be launched.
                # If we find one, we use SAGA to create a job service, a job
                # description and a job that is then sent to the local or remote
                # queueing system. If this succeeds, we set the ComputePilot's
                # state to pending, otherwise to failed.
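                # (find_and_modify below claims the matching document
                # atomically, so multiple launcher workers should not pick up
                # the same ComputePilot)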
                compute_pilot = None

                ts = datetime.datetime.utcnow()
                compute_pilot = pilot_col.find_and_modify(
                    query={"pilotmanager": self.pilot_manager_id, "state": PENDING_LAUNCH},
                    update={
                        "$set": {"state": LAUNCHING},
                        "$push": {"statehistory": {"state": LAUNCHING, "timestamp": ts}},
                    },
                )

                if not compute_pilot:
                    time.sleep(IDLE_TIMER)

                else:
                    try:
                        # ------------------------------------------------------
                        #
                        # LAUNCH THE PILOT AGENT VIA SAGA
                        #
                        logentries = []
                        pilot_id = str(compute_pilot["_id"])

                        logger.info("Launching ComputePilot %s" % pilot_id)

                        # ------------------------------------------------------
                        # Database connection parameters
                        session_uid = self.db_connection_info.session_id
                        database_url = self.db_connection_info.dburl
                        database_name = self.db_connection_info.dbname
                        database_auth = self.db_connection_info.dbauth

                        # ------------------------------------------------------
                        # pilot description and resource configuration
                        number_cores = compute_pilot["description"]["cores"]
                        runtime = compute_pilot["description"]["runtime"]
                        queue = compute_pilot["description"]["queue"]
                        project = compute_pilot["description"]["project"]
                        cleanup = compute_pilot["description"]["cleanup"]
                        resource_key = compute_pilot["description"]["resource"]
                        schema = compute_pilot["description"]["access_schema"]
                        memory = compute_pilot["description"]["memory"]
                        pilot_sandbox = compute_pilot["sandbox"]
                        global_sandbox = compute_pilot["global_sandbox"]

                        # we expand and exchange keys in the resource config,
                        # depending on the selected schema so better use a deep
                        # copy..
                        resource_cfg = self._session.get_resource_config(resource_key, schema)

                        # import pprint
                        # pprint.pprint (resource_cfg)

                        # ------------------------------------------------------
                        # get parameters from cfg, set defaults where needed
                        agent_mongodb_endpoint = resource_cfg.get("agent_mongodb_endpoint", database_url)
                        agent_spawner = resource_cfg.get("agent_spawner", DEFAULT_AGENT_SPAWNER)
                        agent_type = resource_cfg.get("agent_type", DEFAULT_AGENT_TYPE)
                        agent_scheduler = resource_cfg.get("agent_scheduler")
                        tunnel_bind_device = resource_cfg.get("tunnel_bind_device")
                        default_queue = resource_cfg.get("default_queue")
                        forward_tunnel_endpoint = resource_cfg.get("forward_tunnel_endpoint")
                        js_endpoint = resource_cfg.get("job_manager_endpoint")
                        lrms = resource_cfg.get("lrms")
                        mpi_launch_method = resource_cfg.get("mpi_launch_method")
                        pre_bootstrap = resource_cfg.get("pre_bootstrap")
                        python_interpreter = resource_cfg.get("python_interpreter")
                        spmd_variation = resource_cfg.get("spmd_variation")
                        task_launch_method = resource_cfg.get("task_launch_method")
                        rp_version = resource_cfg.get("rp_version", DEFAULT_RP_VERSION)
                        virtenv_mode = resource_cfg.get("virtenv_mode", DEFAULT_VIRTENV_MODE)
                        virtenv = resource_cfg.get("virtenv", DEFAULT_VIRTENV)
                        stage_cacerts = resource_cfg.get("stage_cacerts", "False")

                        if stage_cacerts.lower() == "true":
                            stage_cacerts = True
                        else:
                            stage_cacerts = False

                        # expand variables in virtenv string
                        virtenv = virtenv % {
                            "pilot_sandbox": saga.Url(pilot_sandbox).path,
                            "global_sandbox": saga.Url(global_sandbox).path,
                        }

                        # Check for deprecated global_virtenv
                        global_virtenv = resource_cfg.get("global_virtenv")
                        if global_virtenv:
                            logger.warn("'global_virtenv' keyword is deprecated -- use 'virtenv' and 'virtenv_mode'")
                            virtenv = global_virtenv
                            virtenv_mode = "use"

                        # set default scheme, host, port and dbname if not set
                        db_url = saga.Url(agent_mongodb_endpoint)
                        if not db_url.scheme:
                            db_url.scheme = "mongodb"
                        if not db_url.host:
                            db_url.host = "localhost"
                        if not db_url.port:
                            db_url.port = 27017
                        if not database_name:
                            database_name = "radicalpilot"

                        # Create a host:port string for use by the bootstrapper.
                        database_hostport = "%s:%d" % (db_url.host, db_url.port)

                        # ------------------------------------------------------
                        # Copy the bootstrap shell script.  This also creates
                        # the sandbox. We always use "default_bootstrapper.sh".
                        bootstrapper = "default_bootstrapper.sh"
                        bootstrapper_path = os.path.abspath("%s/../bootstrapper/%s" % (mod_dir, bootstrapper))

                        msg = "Using bootstrapper %s" % bootstrapper_path
                        logentries.append(Logentry(msg, logger=logger.info))

                        bs_script_url = saga.Url("file://localhost/%s" % bootstrapper_path)
                        bs_script_tgt = saga.Url("%s/pilot_bootstrapper.sh" % pilot_sandbox)

                        msg = "Copying bootstrapper '%s' to agent sandbox (%s)." % (bs_script_url, bs_script_tgt)
                        logentries.append(Logentry(msg, logger=logger.debug))

                        bs_script = saga.filesystem.File(bs_script_url, session=self._session)
                        bs_script.copy(bs_script_tgt, flags=saga.filesystem.CREATE_PARENTS)
                        bs_script.close()

                        # ------------------------------------------------------
                        # the version of the agent is derived from
                        # rp_version, which has the following format
                        # and interpretation:
                        #
                        # case rp_version:
                        #   @<token>:
                        #   @tag/@branch/@commit: # no sdist staging
                        #       git clone $github_base radical.pilot.src
                        #       (cd radical.pilot.src && git checkout token)
                        #       pip install -t $VIRTENV/rp_install/ radical.pilot.src
                        #       rm -rf radical.pilot.src
                        #       export PYTHONPATH=$VIRTENV/rp_install:$PYTHONPATH
                        #
                        #   release: # no sdist staging
                        #       pip install -t $VIRTENV/rp_install radical.pilot
                        #       export PYTHONPATH=$VIRTENV/rp_install:$PYTHONPATH
                        #
                        #   local: # needs sdist staging
                        #       tar zxf $sdist.tgz
                        #       pip install -t $VIRTENV/rp_install $sdist/
                        #       export PYTHONPATH=$VIRTENV/rp_install:$PYTHONPATH
                        #
                        #   debug: # needs sdist staging
                        #       tar zxf $sdist.tgz
                        #       pip install -t $SANDBOX/rp_install $sdist/
                        #       export PYTHONPATH=$SANDBOX/rp_install:$PYTHONPATH
                        #
                        #   installed: # no sdist staging
                        #       true
                        # esac
                        #
                        # virtenv_mode
                        #   private : error  if ve exists, otherwise create, then use
                        #   update  : update if ve exists, otherwise create, then use
                        #   create  : use    if ve exists, otherwise create, then use
                        #   use     : use    if ve exists, otherwise error,  then exit
                        #   recreate: delete if ve exists, otherwise create, then use
                        #
                        # examples   :
                        #   [email protected]
                        #   virtenv@devel
                        #   virtenv@release
                        #   virtenv@installed
                        #   stage@local
                        #   stage@/tmp/my_agent.py
                        #
                        # Note that some combinations may be invalid,
                        # specifically in the context of virtenv_mode.  If, for
                        # example, virtenv_mode is 'use', then the 'virtenv:tag'
                        # will not make sense, as the virtenv is not updated.
                        # In those cases, the virtenv_mode is honored, and
                        # a warning is printed.
                        #
                        # Also, the 'stage' mode can only be combined with the
                        # 'local' source, or with a path to the agent (relative
                        # to mod_dir, or absolute).
                        #
                        # A rp_version which does not adhere to the
                        # above syntax is ignored, and the fallback stage@local
                        # is used.
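                        #
                        # Illustrative (hypothetical) resource-config fragment
                        # matching the semantics above -- the keys correspond
                        # to the resource_cfg.get() calls further up:
                        #
                        #   "rp_version"   : "@devel",
                        #   "virtenv"      : "%(global_sandbox)s/ve",
                        #   "virtenv_mode" : "update",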

                        if not rp_version.startswith("@") and rp_version not in ["installed", "release", "local", "debug"]:
                            raise ValueError("invalid rp_version '%s'" % rp_version)

                        stage_sdist = True
                        if rp_version in ["installed", "release"]:
                            stage_sdist = False

                        if rp_version.startswith("@"):
                            stage_sdist = False
                            rp_version = rp_version[1:]  # strip '@'

                        # ------------------------------------------------------
                        # Copy the rp sdist if needed.  We actually also stage
                        # the sdists for radical.utils and radical.saga, so that
                        # we have the complete stack to install...
                        if stage_sdist:

                            for path in [ru.sdist_path, saga.sdist_path, sdist_path]:

                                sdist_url = saga.Url("file://localhost/%s" % path)
                                msg = "Copying sdist '%s' to sdist sandbox (%s)." % (sdist_url, pilot_sandbox)
                                logentries.append(Logentry(msg, logger=logger.debug))

                                sdist_file = saga.filesystem.File(sdist_url)
                                sdist_file.copy("%s/" % (str(pilot_sandbox)))
                                sdist_file.close()

                        # ------------------------------------------------------
                        # some machines cannot run pip due to outdated ca certs.
                        # For those, we also stage an updated cert bundle
                        if stage_cacerts:
                            cc_path = os.path.abspath("%s/../bootstrapper/%s" % (mod_dir, "cacert.pem.gz"))

                            cc_script_url = saga.Url("file://localhost/%s" % cc_path)
                            cc_script_tgt = saga.Url("%s/cacert.pem.gz" % pilot_sandbox)

                            cc_script = saga.filesystem.File(cc_script_url, session=self._session)
                            cc_script.copy(cc_script_tgt, flags=saga.filesystem.CREATE_PARENTS)
                            cc_script.close()

                        # ------------------------------------------------------
                        # sanity checks
                        if not agent_spawner:
                            raise RuntimeError("missing agent spawner")
                        if not agent_scheduler:
                            raise RuntimeError("missing agent scheduler")
                        if not lrms:
                            raise RuntimeError("missing LRMS")
                        if not mpi_launch_method:
                            raise RuntimeError("missing mpi launch method")
                        if not task_launch_method:
                            raise RuntimeError("missing task launch method")

                        # massage some values
                        debug_level = os.environ.get("RADICAL_PILOT_AGENT_VERBOSE", logger.level)
                        try:
                            debug_level = int(debug_level)
                        except ValueError:
                            debug_level = {
                                "CRITICAL": 1,
                                "ERROR": 2,
                                "WARNING": 3,
                                "WARN": 3,
                                "INFO": 4,
                                "DEBUG": 5,
                            }.get(debug_level, 0)

                        if not queue:
                            queue = default_queue

                        if cleanup and isinstance(cleanup, bool):
                            # cleanup flags: l : log files
                            #                u : unit work dirs
                            #                v : virtualenv
                            #                e : everything (== pilot sandbox)
                            cleanup = "luve"

                            # we never clean up virtenvs which are not private
                            if virtenv_mode != "private":
                                cleanup = cleanup.replace("v", "")

                        sdists = ":".join([ru.sdist_name, saga.sdist_name, sdist_name])

                        # set mandatory args
                        bootstrap_args = ""
                        bootstrap_args += " -b '%s'" % sdists
                        bootstrap_args += " -c '%s'" % number_cores
                        bootstrap_args += " -d '%s'" % debug_level
                        bootstrap_args += " -g '%s'" % virtenv
                        bootstrap_args += " -j '%s'" % task_launch_method
                        bootstrap_args += " -k '%s'" % mpi_launch_method
                        bootstrap_args += " -l '%s'" % lrms
                        bootstrap_args += " -m '%s'" % database_hostport
                        bootstrap_args += " -n '%s'" % database_name
                        bootstrap_args += " -o '%s'" % agent_spawner
                        bootstrap_args += " -p '%s'" % pilot_id
                        bootstrap_args += " -q '%s'" % agent_scheduler
                        bootstrap_args += " -r '%s'" % runtime
                        bootstrap_args += " -s '%s'" % session_uid
                        bootstrap_args += " -t '%s'" % agent_type
                        bootstrap_args += " -u '%s'" % virtenv_mode
                        bootstrap_args += " -v '%s'" % rp_version

                        # set optional args
                        if database_auth:
                            bootstrap_args += " -a '%s'" % database_auth
                        if tunnel_bind_device:
                            bootstrap_args += " -D '%s'" % tunnel_bind_device
                        if pre_bootstrap:
                            bootstrap_args += " -e '%s'" % "' -e '".join(pre_bootstrap)
                        if forward_tunnel_endpoint:
                            bootstrap_args += " -f '%s'" % forward_tunnel_endpoint
                        if python_interpreter:
                            bootstrap_args += " -i '%s'" % python_interpreter
                        if cleanup:
                            bootstrap_args += " -x '%s'" % cleanup

                        # ------------------------------------------------------
                        # now that the script is in place and we know where it is,
                        # we can launch the agent
                        js_url = saga.Url(js_endpoint)
                        logger.debug("saga.job.Service ('%s')" % js_url)
                        if js_url in self._shared_worker_data["job_services"]:
                            js = self._shared_worker_data["job_services"][js_url]
                        else:
                            js = saga.job.Service(js_url, session=self._session)
                            self._shared_worker_data["job_services"][js_url] = js

                        # ------------------------------------------------------
                        # Create SAGA Job description and submit the pilot job

                        jd = saga.job.Description()

                        jd.executable = "/bin/bash"
                        jd.arguments = ["-l pilot_bootstrapper.sh", bootstrap_args]
                        jd.working_directory = saga.Url(pilot_sandbox).path
                        jd.project = project
                        jd.output = "agent.out"
                        jd.error = "agent.err"
                        jd.total_cpu_count = number_cores
                        jd.wall_time_limit = runtime
                        jd.total_physical_memory = memory
                        jd.queue = queue

                        # Set the SPMD variation only if required
                        if spmd_variation:
                            jd.spmd_variation = spmd_variation

                        if "RADICAL_PILOT_PROFILE" in os.environ:
                            jd.environment = {"RADICAL_PILOT_PROFILE": "TRUE"}

                        logger.debug("Bootstrap command line: %s %s" % (jd.executable, jd.arguments))

                        msg = "Submitting SAGA job with description: %s" % str(jd.as_dict())
                        logentries.append(Logentry(msg, logger=logger.debug))

                        pilotjob = js.create_job(jd)
                        pilotjob.run()

                        # do a quick error check
                        if pilotjob.state == saga.FAILED:
                            raise RuntimeError("SAGA Job state is FAILED.")

                        saga_job_id = pilotjob.id
                        self._shared_worker_data["job_ids"][pilot_id] = [saga_job_id, js_url]

                        msg = "SAGA job submitted with job id %s" % str(saga_job_id)
                        logentries.append(Logentry(msg, logger=logger.debug))

                        #
                        # ------------------------------------------------------

                        log_dicts = list()
                        for le in logentries:
                            log_dicts.append(le.as_dict())

                        # Update the Pilot's state to 'PENDING_ACTIVE' if SAGA job submission was successful.
                        ts = datetime.datetime.utcnow()
                        ret = pilot_col.update(
                            {"_id": pilot_id, "state": "Launching"},
                            {
                                "$set": {"state": PENDING_ACTIVE, "saga_job_id": saga_job_id},
                                "$push": {"statehistory": {"state": PENDING_ACTIVE, "timestamp": ts}},
                                "$pushAll": {"log": log_dicts},
                            },
                        )

                        if ret["n"] == 0:
                            # could not update, probably because the agent is
                            # running already.  Just update state history and
                            # jobid then
                            # FIXME: make sure of the agent state!
                            ret = pilot_col.update(
                                {"_id": pilot_id},
                                {
                                    "$set": {"saga_job_id": saga_job_id},
                                    "$push": {"statehistory": {"state": PENDING_ACTIVE, "timestamp": ts}},
                                    "$pushAll": {"log": log_dicts},
                                },
                            )

                    except Exception as e:
                        # Update the Pilot's state to 'FAILED'.
                        out, err, log = self._get_pilot_logs(pilot_col, pilot_id)
                        ts = datetime.datetime.utcnow()

                        # FIXME: we seem to be unable to bson/json handle saga
                        # log messages containing an '#'.  This shows up here.
                        # Until we find a clean workaround, make log shorter and
                        # rely on saga logging to reveal the problem.
                        msg = "Pilot launching failed! (%s)" % e
                        logentries.append(Logentry(msg))

                        log_dicts = list()
                        log_messages = list()
                        for le in logentries:
                            log_dicts.append(le.as_dict())
                            log_messages.append(le.message)

                        pilot_col.update(
                            {"_id": pilot_id, "state": {"$ne": FAILED}},
                            {
                                "$set": {"state": FAILED, "stdout": out, "stderr": err, "logfile": log},
                                "$push": {"statehistory": {"state": FAILED, "timestamp": ts}},
                                "$pushAll": {"log": log_dicts},
                            },
                        )
                        logger.exception("\n".join(log_messages))

        except SystemExit as e:
            logger.exception("pilot launcher thread caught system exit -- forcing application shutdown")
            import thread

            thread.interrupt_main()
Example #9
0
    def wait_units(self,
                   unit_ids=None,
                   state=[DONE, FAILED, CANCELED],
                   timeout=None):
        """Returns when one or more :class:`radical.pilot.ComputeUnits` reach a
        specific state.

        If `unit_ids` is `None`, `wait_units` returns when **all**
        ComputeUnits reach the state defined in `state`.

        **Example**::

            # Illustrative sketch -- assumes `umgr` is this UnitManager and
            # `cuds` is a list of ComputeUnitDescriptions:
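            units  = umgr.submit_units(cuds)
            states = umgr.wait_units(timeout=600.0)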

        **Arguments:**

            * **unit_ids** [`string` or `list of strings`]
              If unit_ids is set, only the ComputeUnits with the specified
              uids are considered. If unit_ids is `None` (default), all
              ComputeUnits are considered.

            * **state** [`string`]
              The state that ComputeUnits have to reach in order for the call
              to return.

              By default `wait_units` waits for the ComputeUnits to
              reach a terminal state, which can be one of the following:

              * :data:`radical.pilot.DONE`
              * :data:`radical.pilot.FAILED`
              * :data:`radical.pilot.CANCELED`

            * **timeout** [`float`]
              Timeout in seconds before the call returns regardless of unit
              state changes. The default value **None** waits forever.

        **Raises:**

            * :class:`radical.pilot.PilotException`
        """
        if not self._uid:
            raise IncorrectState(msg="Invalid object instance.")

        if not isinstance(state, list):
            state = [state]

        return_list_type = True
        if (not isinstance(unit_ids, list)) and (unit_ids is not None):
            return_list_type = False
            unit_ids = [unit_ids]

        units = self.get_units(unit_ids)
        start = time.time()
        all_ok = False
        states = list()

        while not all_ok:

            all_ok = True
            states = list()

            for unit in units:
                if unit.state not in state:
                    all_ok = False

                states.append(unit.state)

            # check timeout
            if (None != timeout) and (timeout <= (time.time() - start)):
                if not all_ok:
                    logger.debug("wait timed out: %s" % states)
                break

            # sleep a little if this cycle was idle
            if not all_ok:
                time.sleep(0.1)

        # done waiting
        if return_list_type:
            return states
        else:
            return states[0]
Example #10
0
    def close(self, cleanup=True, terminate=True, delete=None):
        """Closes the session.

        All subsequent attempts to access objects attached to the session will
        result in an error. If cleanup is set to True (default), the session
        data is removed from the database.

        **Arguments:**
            * **cleanup** (`bool`): Remove session from MongoDB (implies `terminate`).
            * **terminate** (`bool`): Shut down all pilots associated with the session. 
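
        **Example** (illustrative; assumes ``session`` is an open Session)::

            # shut down all pilots, but keep the session record in MongoDB
            session.close(cleanup=False, terminate=True)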

        **Raises:**
            * :class:`radical.pilot.IncorrectState` if the session is closed
              or doesn't exist. 
        """

        logger.debug("session %s closing" % (str(self._uid)))

        uid = self._uid

        if not self._uid:
            logger.error("Session object already closed.")
            return

        # we keep 'delete' for backward compatibility.  If it was set, and the
        # other flags (cleanup, terminate) are as defaulted (True), then delete
        # will supersede them.  Delete is considered deprecated though, and
        # we'll thus issue a warning.
        if  delete != None:

            if  cleanup == True and terminate == True :
                cleanup   = delete
                terminate = delete
                logger.warning("'delete' flag on session is deprecated. " \
                               "Please use 'cleanup' and 'terminate' instead!")

        if  cleanup :
            # cleanup implies terminate
            terminate = True

        for pmgr in self._pilot_manager_objects:
            logger.debug("session %s closes   pmgr   %s" % (str(self._uid), pmgr._uid))
            pmgr.close (terminate=terminate)
            logger.debug("session %s closed   pmgr   %s" % (str(self._uid), pmgr._uid))

        for umgr in self._unit_manager_objects:
            logger.debug("session %s closes   umgr   %s" % (str(self._uid), umgr._uid))
            umgr.close()
            logger.debug("session %s closed   umgr   %s" % (str(self._uid), umgr._uid))

        if  cleanup :
            self._destroy_db_entry()

        logger.debug("session %s closed" % (str(self._uid)))
Example #11
0
    def close(self, terminate=True):
        """Shuts down the PilotManager and its background workers in a 
        coordinated fashion.

        **Arguments:**

            * **terminate** [`bool`]: If set to True, all active pilots will
              get canceled (default: True).
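
        **Example** (illustrative; assumes ``pmgr`` is an active PilotManager)::

            # cancel all remaining pilots and shut down the manager's workers
            pmgr.close(terminate=True)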

        """

        logger.debug("pmgr    %s closing" % (str(self._uid)))

        # Spit out a warning in case the object was already closed.
        if not self._uid:
            logger.error("PilotManager object already closed.")
            return

        # before we terminate pilots, we have to kill the pilot launcher threads
        # -- otherwise we'll run into continuous race conditions due to the
        # ongoing state checks...
        if self._worker is not None:
            # Stop the worker process
            logger.debug("pmgr    %s cancel   worker %s" %
                         (str(self._uid), self._worker.name))
            self._worker.cancel_launcher()
            logger.debug("pmgr    %s canceled worker %s" %
                         (str(self._uid), self._worker.name))

        # If terminate is set, we cancel all pilots.
        if terminate:
            # cancel all pilots, make sure they are gone, and close the pilot
            # managers.
            for pilot in self.get_pilots():
                logger.debug("pmgr    %s cancels  pilot  %s" %
                             (str(self._uid), pilot._uid))
            self.cancel_pilots()

            # FIXME:
            #
            # wait_pilots() will wait until all pilots picked up the sent cancel
            # signal and died.  However, that can take a loooong time.  For
            # example, if a pilot is in 'PENDING_ACTIVE' state, this will have to
            # wait until the pilot is bootstrapped, started, connected to the DB,
            # and shut down again.  Or, for a pilot which just got a shitload of
            # units, it will have to wait until the pilot started all those units
            # and then checks its command queue again.  Or, if the pilot job
            # already died, wait will block until the state checker kicks in and
            # declares the pilot as dead, which takes a couple of minutes.
            #
            # Solution would be to add a CANCELING state and to wait for that one,
            # too, which basically means to wait until the cancel signal has been
            # sent.  There is not much more to do at this point anyway.  This is at
            # the moment faked in the manager controller, which sets that state
            # after sending the cancel command.  This should be converted into
            # a proper state -- that would, btw, remove the need for a cancel
            # command in the first place, as the pilot can just pull its own state
            # instead, and cancel on CANCELING...
            #
            # self.wait_pilots ()
            wait_for_cancel = True
            all_pilots = self.get_pilots()
            while wait_for_cancel:
                wait_for_cancel = False
                for pilot in all_pilots:
                    logger.debug("pmgr    %s wait for pilot  %s (%s)" %
                                 (str(self._uid), pilot._uid, pilot.state))
                    if pilot.state not in [DONE, FAILED, CANCELED, CANCELING]:
                        time.sleep(1)
                        wait_for_cancel = True
                        break
            for pilot in self.get_pilots():
                logger.debug("pmgr    %s canceled pilot  %s" %
                             (str(self._uid), pilot._uid))

        logger.debug("pmgr    %s stops    worker %s" %
                     (str(self._uid), self._worker.name))
        self._worker.stop()
        self._worker.join()
        logger.debug("pmgr    %s stopped  worker %s" %
                     (str(self._uid), self._worker.name))

        # Remove worker from registry
        self._session._process_registry.remove(self._uid)

        logger.debug("pmgr    %s closed" % (str(self._uid)))
        self._uid = None
Example #12
0
    def register_start_pilot_request(self, pilot, resource_config):
        """Register a new pilot start request with the worker.
        """

        # create a new UID for the pilot
        pilot_uid = ru.generate_id ('pilot')

        # switch endpoint type
        filesystem_endpoint = resource_config['filesystem_endpoint']

        fs = saga.Url(filesystem_endpoint)

        # get the home directory on the remote machine.
        # Note that this will only work for (gsi)ssh or shell based access
        # mechanisms (FIXME)

        import saga.utils.pty_shell as sup

        if fs.port is not None:
            url = "%s://%s:%d/" % (fs.schema, fs.host, fs.port)
        else:
            url = "%s://%s/" % (fs.schema, fs.host)

        logger.debug ("saga.utils.PTYShell ('%s')" % url)
        shell = sup.PTYShell(url, self._session, logger)

        if pilot.description.sandbox :
            workdir_raw = pilot.description.sandbox
        else :
            workdir_raw = resource_config.get ('default_remote_workdir', "$PWD")

        if '$' in workdir_raw or '`' in workdir_raw :
            ret, out, err = shell.run_sync (' echo "WORKDIR: %s"' % workdir_raw)
            if  ret == 0 and 'WORKDIR:' in out :
                workdir_expanded = out.split(":")[1].strip()
                logger.debug("Determined remote working directory for %s: '%s'" % (url, workdir_expanded))
            else :
                error_msg = "Couldn't determine remote working directory."
                logger.error(error_msg)
                raise Exception(error_msg)
        else :
            workdir_expanded = workdir_raw

        # At this point we have determined 'pwd'
        fs.path = "%s/radical.pilot.sandbox" % workdir_expanded

        # This is the base URL / 'sandbox' for the pilot!
        agent_dir_url = saga.Url("%s/%s-%s/" % (str(fs), self._session.uid, pilot_uid))

        # Create a database entry for the new pilot.
        pilot_uid, pilot_json = self._db.insert_pilot(
            pilot_uid=pilot_uid,
            pilot_manager_uid=self._pm_id,
            pilot_description=pilot.description,
            pilot_sandbox=str(agent_dir_url), 
            global_sandbox=str(fs.path)
            )

        # Create a shared data store entry
        self._shared_data[pilot_uid] = {
            'data':          pilot_json,
            'callbacks':     [],
            'facade_object': weakref.ref(pilot)
        }

        return pilot_uid
Example #13
0
    def run(self):
        """Starts the process when Process.start() is called.
        """

        # make sure to catch sys.exit (which raises SystemExit)
        try:
            # Get directory where this module lives
            mod_dir = os.path.dirname(os.path.realpath(__file__))

            # Try to connect to the database
            try:
                connection = self.db_connection_info.get_db_handle()
                db = connection[self.db_connection_info.dbname]
                pilot_col = db["%s.p" % self.db_connection_info.session_id]
                logger.debug(
                    "Connected to MongoDB. Serving requests for PilotManager %s."
                    % self.pilot_manager_id)

            except Exception as e:
                logger.exception("Connection error: %s" % e)
                return

            last_job_check = time.time()

            while not self._stop.is_set():

                # Periodically, we pull up all ComputePilots that are pending
                # execution or were last seen executing and check if the corresponding
                # SAGA job is still pending in the queue. If that is not the case,
                # we assume that the job has failed for some reason and update
                # the state of the ComputePilot accordingly.
                if last_job_check + JOB_CHECK_INTERVAL < time.time():
                    last_job_check = time.time()
                    self.check_pilot_states(pilot_col)

                # See if we can find a ComputePilot that is waiting to be launched.
                # If we find one, we use SAGA to create a job service, a job
                # description and a job that is then sent to the local or remote
                # queueing system. If this succeeds, we set the ComputePilot's
                # state to pending, otherwise to failed.
                compute_pilot = None

                ts = datetime.datetime.utcnow()
                compute_pilot = pilot_col.find_and_modify(
                    query={
                        "pilotmanager": self.pilot_manager_id,
                        "state": PENDING_LAUNCH
                    },
                    update={
                        "$set": {
                            "state": LAUNCHING
                        },
                        "$push": {
                            "statehistory": {
                                "state": LAUNCHING,
                                "timestamp": ts
                            }
                        }
                    })

                if not compute_pilot:
                    time.sleep(IDLE_TIMER)

                else:
                    try:
                        # ------------------------------------------------------
                        #
                        # LAUNCH THE PILOT AGENT VIA SAGA
                        #
                        logentries = []
                        pilot_id = str(compute_pilot["_id"])

                        logger.info("Launching ComputePilot %s" % pilot_id)

                        # ------------------------------------------------------
                        # Database connection parameters
                        session_uid = self.db_connection_info.session_id
                        database_url = self.db_connection_info.dburl
                        database_name = self.db_connection_info.dbname
                        database_auth = self.db_connection_info.dbauth

                        # ------------------------------------------------------
                        # pilot description and resource configuration
                        number_cores = compute_pilot['description']['cores']
                        runtime = compute_pilot['description']['runtime']
                        queue = compute_pilot['description']['queue']
                        project = compute_pilot['description']['project']
                        cleanup = compute_pilot['description']['cleanup']
                        resource_key = compute_pilot['description']['resource']
                        schema = compute_pilot['description']['access_schema']
                        memory = compute_pilot['description']['memory']
                        pilot_sandbox = compute_pilot['sandbox']
                        global_sandbox = compute_pilot['global_sandbox']

                        # we expand and exchange keys in the resource config,
                        # depending on the selected schema so better use a deep
                        # copy..
                        resource_cfg = self._session.get_resource_config(
                            resource_key, schema)

                        # import pprint
                        # pprint.pprint (resource_cfg)

                        # ------------------------------------------------------
                        # get parameters from cfg, set defaults where needed
                        agent_mongodb_endpoint = resource_cfg.get(
                            'agent_mongodb_endpoint', database_url)
                        agent_spawner = resource_cfg.get(
                            'agent_spawner', DEFAULT_AGENT_SPAWNER)
                        agent_type = resource_cfg.get('agent_type',
                                                      DEFAULT_AGENT_TYPE)
                        agent_scheduler = resource_cfg.get('agent_scheduler')
                        tunnel_bind_device = resource_cfg.get(
                            'tunnel_bind_device')
                        default_queue = resource_cfg.get('default_queue')
                        forward_tunnel_endpoint = resource_cfg.get(
                            'forward_tunnel_endpoint')
                        js_endpoint = resource_cfg.get('job_manager_endpoint')
                        lrms = resource_cfg.get('lrms')
                        mpi_launch_method = resource_cfg.get(
                            'mpi_launch_method')
                        pre_bootstrap = resource_cfg.get('pre_bootstrap')
                        python_interpreter = resource_cfg.get(
                            'python_interpreter')
                        spmd_variation = resource_cfg.get('spmd_variation')
                        task_launch_method = resource_cfg.get(
                            'task_launch_method')
                        rp_version = resource_cfg.get('rp_version',
                                                      DEFAULT_RP_VERSION)
                        virtenv_mode = resource_cfg.get(
                            'virtenv_mode', DEFAULT_VIRTENV_MODE)
                        virtenv = resource_cfg.get('virtenv', DEFAULT_VIRTENV)
                        stage_cacerts = resource_cfg.get(
                            'stage_cacerts', 'False')

                        # normalize to a boolean (the config value may be a
                        # string or a bool)
                        stage_cacerts = (str(stage_cacerts).lower() == 'true')

                        # expand variables in virtenv string
                        virtenv = virtenv % {
                            'pilot_sandbox': saga.Url(pilot_sandbox).path,
                            'global_sandbox': saga.Url(global_sandbox).path
                        }

                        # Check for deprecated global_virtenv
                        global_virtenv = resource_cfg.get('global_virtenv')
                        if global_virtenv:
                            logger.warn(
                                "'global_virtenv' keyword is deprecated -- use 'virtenv' and 'virtenv_mode'"
                            )
                            virtenv = global_virtenv
                            virtenv_mode = 'use'

                        # set default scheme, host, port and dbname if not set
                        db_url = saga.Url(agent_mongodb_endpoint)
                        if not db_url.scheme: db_url.scheme = 'mongodb'
                        if not db_url.host: db_url.host = 'localhost'
                        if not db_url.port: db_url.port = 27017
                        if not database_name: database_name = 'radicalpilot'

                        # Create a host:port string for use by the bootstrapper.
                        database_hostport = "%s:%d" % (db_url.host,
                                                       db_url.port)

                        # ------------------------------------------------------
                        # Copy the bootstrap shell script.  This also creates
                        # the sandbox. We always use "default_bootstrapper.sh".
                        bootstrapper = 'default_bootstrapper.sh'
                        bootstrapper_path = os.path.abspath("%s/../bootstrapper/%s" \
                                % (mod_dir, bootstrapper))

                        msg = "Using bootstrapper %s" % bootstrapper_path
                        logentries.append(Logentry(msg, logger=logger.info))

                        bs_script_url = saga.Url("file://localhost/%s" %
                                                 bootstrapper_path)
                        bs_script_tgt = saga.Url("%s/pilot_bootstrapper.sh" %
                                                 pilot_sandbox)

                        msg = "Copying bootstrapper '%s' to agent sandbox (%s)." \
                                % (bs_script_url, bs_script_tgt)
                        logentries.append(Logentry(msg, logger=logger.debug))

                        bs_script = saga.filesystem.File(bs_script_url,
                                                         session=self._session)
                        bs_script.copy(bs_script_tgt,
                                       flags=saga.filesystem.CREATE_PARENTS)
                        bs_script.close()

                        # ------------------------------------------------------
                        # the version of the agent is derived from
                        # rp_version, which has the following format
                        # and interpretation:
                        #
                        # case rp_version:
                        #   @<token>:
                        #   @tag/@branch/@commit: # no sdist staging
                        #       git clone $github_base radical.pilot.src
                        #       (cd radical.pilot.src && git checkout token)
                        #       pip install -t $VIRTENV/rp_install/ radical.pilot.src
                        #       rm -rf radical.pilot.src
                        #       export PYTHONPATH=$VIRTENV/rp_install:$PYTHONPATH
                        #
                        #   release: # no sdist staging
                        #       pip install -t $VIRTENV/rp_install radical.pilot
                        #       export PYTHONPATH=$VIRTENV/rp_install:$PYTHONPATH
                        #
                        #   local: # needs sdist staging
                        #       tar zxf $sdist.tgz
                        #       pip install -t $VIRTENV/rp_install $sdist/
                        #       export PYTHONPATH=$VIRTENV/rp_install:$PYTHONPATH
                        #
                        #   debug: # needs sdist staging
                        #       tar zxf $sdist.tgz
                        #       pip install -t $SANDBOX/rp_install $sdist/
                        #       export PYTHONPATH=$SANDBOX/rp_install:$PYTHONPATH
                        #
                        #   installed: # no sdist staging
                        #       true
                        # esac
                        #
                        # virtenv_mode
                        #   private : error  if ve exists, otherwise create, then use
                        #   update  : update if ve exists, otherwise create, then use
                        #   create  : use    if ve exists, otherwise create, then use
                        #   use     : use    if ve exists, otherwise error,  then exit
                        #   recreate: delete if ve exists, otherwise create, then use
                        #
                        # examples   :
                        #   [email protected]
                        #   virtenv@devel
                        #   virtenv@release
                        #   virtenv@installed
                        #   stage@local
                        #   stage@/tmp/my_agent.py
                        #
                        # Note that some combinations may be invalid,
                        # specifically in the context of virtenv_mode.  If, for
                        # example, virtenv_mode is 'use', then a 'virtenv@tag'
                        # will not make sense, as the virtenv is not updated.
                        # In those cases, the virtenv_mode is honored, and
                        # a warning is printed.
                        #
                        # Also, the 'stage' mode can only be combined with the
                        # 'local' source, or with a path to the agent (relative
                        # to mod_dir, or absolute).
                        #
                        # A rp_version which does not adhere to the
                        # above syntax is ignored, and the fallback stage@local
                        # is used.

                        if  not rp_version.startswith('@') and \
                            rp_version not in ['installed', 'release',
                                               'local', 'debug']:
                            raise ValueError("invalid rp_version '%s'" %
                                             rp_version)

                        stage_sdist = True
                        if rp_version in ['installed', 'release']:
                            stage_sdist = False

                        if rp_version.startswith('@'):
                            stage_sdist = False
                            rp_version = rp_version[1:]  # strip '@'

                        # ------------------------------------------------------
                        # Copy the rp sdist if needed.  We actually also stage
                        # the sdists for radical.utils and radical.saga, so that
                        # we have the complete stack to install...
                        if stage_sdist:

                            for path in [
                                    ru.sdist_path, saga.sdist_path, sdist_path
                            ]:

                                sdist_url = saga.Url("file://localhost/%s" %
                                                     path)
                                msg = "Copying sdist '%s' to sdist sandbox (%s)." % (
                                    sdist_url, pilot_sandbox)
                                logentries.append(
                                    Logentry(msg, logger=logger.debug))

                                sdist_file = saga.filesystem.File(sdist_url)
                                sdist_file.copy("%s/" % (str(pilot_sandbox)))
                                sdist_file.close()

                        # ------------------------------------------------------
                        # some machines cannot run pip due to outdated ca certs.
                        # For those, we also stage an updated cert bundle
                        if stage_cacerts:
                            cc_path = os.path.abspath("%s/../bootstrapper/%s" \
                                    % (mod_dir, 'cacert.pem.gz'))

                            cc_script_url = saga.Url("file://localhost/%s" %
                                                     cc_path)
                            cc_script_tgt = saga.Url("%s/cacert.pem.gz" %
                                                     pilot_sandbox)

                            cc_script = saga.filesystem.File(
                                cc_script_url, session=self._session)
                            cc_script.copy(
                                cc_script_tgt,
                                flags=saga.filesystem.CREATE_PARENTS)
                            cc_script.close()

                        # ------------------------------------------------------
                        # sanity checks
                        if not agent_spawner:
                            raise RuntimeError("missing agent spawner")
                        if not agent_scheduler:
                            raise RuntimeError("missing agent scheduler")
                        if not lrms: raise RuntimeError("missing LRMS")
                        if not mpi_launch_method:
                            raise RuntimeError("missing mpi launch method")
                        if not task_launch_method:
                            raise RuntimeError("missing task launch method")

                        # massage some values
                        debug_level = os.environ.get(
                            'RADICAL_PILOT_AGENT_VERBOSE', logger.level)
                        try:
                            debug_level = int(debug_level)
                        except ValueError:
                            debug_level = {
                                'CRITICAL': 1,
                                'ERROR': 2,
                                'WARNING': 3,
                                'WARN': 3,
                                'INFO': 4,
                                'DEBUG': 5
                            }.get(debug_level, 0)

                        if not queue:
                            queue = default_queue

                        if cleanup and isinstance(cleanup, bool):
                            # cleanup flags:
                            #   l : log files
                            #   u : unit work dirs
                            #   v : virtualenv
                            #   e : everything (== pilot sandbox)
                            cleanup = 'luve'

                            # we never cleanup virtenvs which are not private
                            if virtenv_mode != 'private':
                                cleanup = cleanup.replace('v', '')

                        sdists = ':'.join(
                            [ru.sdist_name, saga.sdist_name, sdist_name])

                        # set mandatory args
                        bootstrap_args = ""
                        bootstrap_args += " -b '%s'" % sdists
                        bootstrap_args += " -c '%s'" % number_cores
                        bootstrap_args += " -d '%s'" % debug_level
                        bootstrap_args += " -g '%s'" % virtenv
                        bootstrap_args += " -j '%s'" % task_launch_method
                        bootstrap_args += " -k '%s'" % mpi_launch_method
                        bootstrap_args += " -l '%s'" % lrms
                        bootstrap_args += " -m '%s'" % database_hostport
                        bootstrap_args += " -n '%s'" % database_name
                        bootstrap_args += " -o '%s'" % agent_spawner
                        bootstrap_args += " -p '%s'" % pilot_id
                        bootstrap_args += " -q '%s'" % agent_scheduler
                        bootstrap_args += " -r '%s'" % runtime
                        bootstrap_args += " -s '%s'" % session_uid
                        bootstrap_args += " -t '%s'" % agent_type
                        bootstrap_args += " -u '%s'" % virtenv_mode
                        bootstrap_args += " -v '%s'" % rp_version

                        # set optional args
                        if database_auth:
                            bootstrap_args += " -a '%s'" % database_auth
                        if tunnel_bind_device:
                            bootstrap_args += " -D '%s'" % tunnel_bind_device
                        if pre_bootstrap:
                            bootstrap_args += " -e '%s'" % "' -e '".join(
                                pre_bootstrap)
                        if forward_tunnel_endpoint:
                            bootstrap_args += " -f '%s'" % forward_tunnel_endpoint
                        if python_interpreter:
                            bootstrap_args += " -i '%s'" % python_interpreter
                        if cleanup:
                            bootstrap_args += " -x '%s'" % cleanup

                        # ------------------------------------------------------
                        # now that the script is in place and we know where it is,
                        # we can launch the agent
                        js_url = saga.Url(js_endpoint)
                        logger.debug("saga.job.Service ('%s')" % js_url)
                        if js_url in self._shared_worker_data['job_services']:
                            js = self._shared_worker_data['job_services'][
                                js_url]
                        else:
                            js = saga.job.Service(js_url,
                                                  session=self._session)
                            self._shared_worker_data['job_services'][
                                js_url] = js

                        # ------------------------------------------------------
                        # Create SAGA Job description and submit the pilot job

                        jd = saga.job.Description()

                        jd.executable = "/bin/bash"
                        jd.arguments = [
                            "-l pilot_bootstrapper.sh", bootstrap_args
                        ]
                        jd.working_directory = saga.Url(pilot_sandbox).path
                        jd.project = project
                        jd.output = "agent.out"
                        jd.error = "agent.err"
                        jd.total_cpu_count = number_cores
                        jd.wall_time_limit = runtime
                        jd.total_physical_memory = memory
                        jd.queue = queue

                        # Set the SPMD variation only if required
                        if spmd_variation:
                            jd.spmd_variation = spmd_variation

                        if 'RADICAL_PILOT_PROFILE' in os.environ:
                            jd.environment = {'RADICAL_PILOT_PROFILE': 'TRUE'}

                        logger.debug("Bootstrap command line: %s %s" %
                                     (jd.executable, jd.arguments))

                        msg = "Submitting SAGA job with description: %s" % str(
                            jd.as_dict())
                        logentries.append(Logentry(msg, logger=logger.debug))

                        pilotjob = js.create_job(jd)
                        pilotjob.run()

                        # do a quick error check
                        if pilotjob.state == saga.job.FAILED:
                            raise RuntimeError("SAGA Job state is FAILED.")

                        saga_job_id = pilotjob.id
                        self._shared_worker_data['job_ids'][pilot_id] = [
                            saga_job_id, js_url
                        ]

                        msg = "SAGA job submitted with job id %s" % str(
                            saga_job_id)
                        logentries.append(Logentry(msg, logger=logger.debug))

                        #
                        # ------------------------------------------------------

                        log_dicts = list()
                        for le in logentries:
                            log_dicts.append(le.as_dict())

                        # Update the Pilot's state to 'PENDING_ACTIVE' if SAGA job submission was successful.
                        ts = datetime.datetime.utcnow()
                        ret = pilot_col.update(
                            {
                                "_id": pilot_id,
                                "state": 'Launching'
                            }, {
                                "$set": {
                                    "state": PENDING_ACTIVE,
                                    "saga_job_id": saga_job_id
                                },
                                "$push": {
                                    "statehistory": {
                                        "state": PENDING_ACTIVE,
                                        "timestamp": ts
                                    }
                                },
                                "$pushAll": {
                                    "log": log_dicts
                                }
                            })

                        if ret['n'] == 0:
                            # could not update, probably because the agent is
                            # running already.  Just update state history and
                            # jobid then
                            # FIXME: make sure of the agent state!
                            ret = pilot_col.update({"_id": pilot_id}, {
                                "$set": {
                                    "saga_job_id": saga_job_id
                                },
                                "$push": {
                                    "statehistory": {
                                        "state": PENDING_ACTIVE,
                                        "timestamp": ts
                                    }
                                },
                                "$pushAll": {
                                    "log": log_dicts
                                }
                            })

                    except Exception as e:
                        # Update the Pilot's state 'FAILED'.
                        out, err, log = self._get_pilot_logs(
                            pilot_col, pilot_id)
                        ts = datetime.datetime.utcnow()

                        # FIXME: we seem to be unable to bson/json-encode saga
                        # log messages containing a '#'.  This shows up here.
                        # Until we find a clean workaround, keep the log short
                        # and rely on saga logging to reveal the problem.
                        msg = "Pilot launching failed! (%s)" % e
                        logentries.append(Logentry(msg))

                        log_dicts = list()
                        log_messages = list()
                        for le in logentries:
                            log_dicts.append(le.as_dict())
                            log_messages.append(le.message)

                        pilot_col.update(
                            {
                                "_id": pilot_id,
                                "state": {
                                    "$ne": FAILED
                                }
                            }, {
                                "$set": {
                                    "state": FAILED,
                                    "stdout": out,
                                    "stderr": err,
                                    "logfile": log
                                },
                                "$push": {
                                    "statehistory": {
                                        "state": FAILED,
                                        "timestamp": ts
                                    }
                                },
                                "$pushAll": {
                                    "log": log_dicts
                                }
                            })
                        logger.exception('\n'.join(log_messages))

        except SystemExit as e:
            logger.exception(
                "pilot launcher thread caught system exit -- forcing application shutdown"
            )
            import thread
            thread.interrupt_main()
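
The rp_version convention documented in the comment block above can be summarized by a small, self-contained helper. This is an illustrative sketch only; the function name and return value are not part of radical.pilot.

def parse_rp_version(rp_version):
    """Illustrative sketch (not radical.pilot API): map an rp_version string
    to its version token and the launcher's 'stage_sdist' decision."""
    valid = ['installed', 'release', 'local', 'debug']

    if rp_version.startswith('@'):
        # '@tag', '@branch' or '@commit': install from git, no sdist staging
        return rp_version[1:], False

    if rp_version not in valid:
        raise ValueError("invalid rp_version '%s'" % rp_version)

    # 'local' and 'debug' need the sdist staged into the sandbox
    return rp_version, rp_version in ['local', 'debug']

# expected behaviour under this convention:
#   parse_rp_version('@devel')    -> ('devel', False)
#   parse_rp_version('local')     -> ('local', True)
#   parse_rp_version('installed') -> ('installed', False)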
Example #14
0
    def close(self, terminate=True):
        """Shuts down the PilotManager and its background workers in a 
        coordinated fashion.

        **Arguments:**

            * **terminate** [`bool`]: If set to True, all active pilots will
              get canceled (default: True).

        """

        logger.debug("pmgr    %s closing" % (str(self._uid)))

        # Spit out a warning in case the object was already closed.
        if not self._uid:
            logger.error("PilotManager object already closed.")
            return

        # before we terminate pilots, we have to kill the pilot launcher threads
        # -- otherwise we'll run into continuous race conditions due to the
        # ongoing state checks...
        if self._worker is not None:
            # Stop the worker process
            logger.debug("pmgr    %s cancel   worker %s" % (str(self._uid), self._worker.name))
            self._worker.cancel_launcher()
            logger.debug("pmgr    %s canceled worker %s" % (str(self._uid), self._worker.name))



        # If terminate is set, we cancel all pilots. 
        if  terminate :
            # cancel all pilots, make sure they are gone, and close the pilot
            # managers.
            for pilot in self.get_pilots () :
                logger.debug("pmgr    %s cancels  pilot  %s" % (str(self._uid), pilot._uid))
            self.cancel_pilots ()

          # FIXME:
          #
          # wait_pilots() will wait until all pilots picked up the sent cancel
          # signal and died.  However, that can take a loooong time.  For
          # example, if a pilot is in 'PENDING_ACTIVE' state, this will have to
          # wait until the pilot is bootstrapped, started, connected to the DB,
          # and shut down again.  Or, for a pilot which just got a large batch of
          # units, it will have to wait until the pilot started all those units
          # and then checks its command queue again.  Or, if the pilot job
          # already died, wait will block until the state checker kicks in and
          # declares the pilot as dead, which takes a couple of minutes.
          #
          # Solution would be to add a CANCELING state and to wait for that one,
          # too, which basically means to wait until the cancel signal has been
          # sent.  There is not much more to do at this point anyway.  This is at
          # the moment faked in the manager controller, which sets that state
          # after sending the cancel command.  This should be converted into
          # a proper state -- that would, btw, remove the need for a cancel
          # command in the first place, as the pilot can just pull its own state
          # instead, and cancel on CANCELING...
          #
          # self.wait_pilots ()
            wait_for_cancel = True
            all_pilots = self.get_pilots ()
            while wait_for_cancel :
                wait_for_cancel = False
                for pilot in all_pilots :
                    logger.debug("pmgr    %s wait for pilot  %s (%s)" % (str(self._uid), pilot._uid, pilot.state))
                    if  pilot.state not in [DONE, FAILED, CANCELED, CANCELING] :
                        time.sleep (1)
                        wait_for_cancel = True
                        break
            for pilot in self.get_pilots () :
                logger.debug("pmgr    %s canceled pilot  %s" % (str(self._uid), pilot._uid))


        logger.debug("pmgr    %s stops    worker %s" % (str(self._uid), self._worker.name))
        self._worker.stop()
        self._worker.join()
        logger.debug("pmgr    %s stopped  worker %s" % (str(self._uid), self._worker.name))

        # Remove worker from registry
        self._session._process_registry.remove(self._uid)


        logger.debug("pmgr    %s closed" % (str(self._uid)))
        self._uid = None
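
A minimal teardown sketch for the close() sequence above, assuming the standard radical.pilot entry points (Session, PilotManager) and a reachable MongoDB; error handling is omitted.

import radical.pilot as rp

session = rp.Session()
pmgr    = rp.PilotManager(session=session)
try:
    pass  # ... submit pilots, attach a unit manager, run workloads ...
finally:
    # cancel any remaining pilots and stop the launcher/worker threads
    pmgr.close(terminate=True)
    session.close()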
Example #15
0
    def run(self):
        """run() is called when the process is started via
           UnitManagerController.start().
        """

        # make sure to catch sys.exit (which raises SystemExit)
        try:

            logger.debug(
                "Worker thread (ID: %s[%s]) for UnitManager %s started." %
                (self.name, self.ident, self._um_id))

            # transfer results contains the futures to the results of the
            # asynchronous transfer operations.
            transfer_results = list()

            while not self._stop.is_set():

                # =================================================================
                #
                # Check and update units. This needs to be optimized at
                # some point, i.e., state pulling should be conditional
                # or triggered by a tailable MongoDB cursor, etc.
                unit_list = self._db.get_compute_units(
                    unit_manager_id=self._um_id)
                action = False

                for unit in unit_list:
                    unit_id = str(unit["_id"])

                    new_state = unit["state"]
                    if unit_id in self._shared_data:
                        old_state = self._shared_data[unit_id]["data"]["state"]
                    else:
                        old_state = None
                        self._shared_data_lock.acquire()
                        self._shared_data[unit_id] = {
                            'data': unit,
                            'callbacks': [],
                            'facade_object': None
                        }
                        self._shared_data_lock.release()

                    self._shared_data_lock.acquire()
                    self._shared_data[unit_id]["data"] = unit
                    self._shared_data_lock.release()

                    if new_state != old_state:
                        # On a state change, we fire zee callbacks.
                        logger.info(
                            "RUN ComputeUnit '%s' state changed from '%s' to '%s'."
                            % (unit_id, old_state, new_state))

                        # The state of the unit has changed, We call all
                        # unit-level callbacks to propagate this.
                        self.call_unit_state_callbacks(unit_id, new_state)

                        action = True

                # After the first iteration, we are officially initialized!
                if not self._initialized.is_set():
                    self._initialized.set()

                # sleep a little if this cycle was idle
                if not action:
                    time.sleep(IDLE_TIME)

        except SystemExit as e:
            logger.exception(
                "unit manager controller thread caught system exit -- forcing application shutdown"
            )
            import thread
            thread.interrupt_main()

        finally:
            # shut down the autonomous input / output transfer worker(s)
            for worker in self._input_file_transfer_worker_pool:
                logger.debug("uworker %s stops   itransfer %s" %
                             (self.name, worker.name))
                worker.stop()
                logger.debug("uworker %s stopped itransfer %s" %
                             (self.name, worker.name))

            for worker in self._output_file_transfer_worker_pool:
                logger.debug("uworker %s stops   otransfer %s" %
                             (self.name, worker.name))
                worker.stop()
                logger.debug("uworker %s stopped otransfer %s" %
                             (self.name, worker.name))
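
The loop above follows a common polling-worker pattern: poll a shared store, fire callbacks on state changes, and idle briefly when nothing changed. A generic, dependency-free sketch of that pattern (names are illustrative, not radical.pilot API):

import threading
import time

IDLE_TIME = 0.1  # assumed idle sleep, mirroring the worker above

class PollingWorker(threading.Thread):
    """Generic sketch of the poll/compare/callback loop used by the worker."""

    def __init__(self, fetch_states, on_change):
        threading.Thread.__init__(self)
        self._fetch_states = fetch_states   # callable returning {uid: state}
        self._on_change    = on_change      # callable(uid, old_state, new_state)
        self._stop         = threading.Event()
        self._cache        = dict()

    def stop(self):
        self._stop.set()

    def run(self):
        while not self._stop.is_set():
            action = False
            for uid, new_state in self._fetch_states().items():
                old_state = self._cache.get(uid)
                self._cache[uid] = new_state
                if new_state != old_state:
                    self._on_change(uid, old_state, new_state)
                    action = True
            # sleep a little if this cycle was idle
            if not action:
                time.sleep(IDLE_TIME)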
Example #16
0
    def _reschedule (self, target_pid=None, uid=None) :

        with self.lock :

            # dig through the list of waiting CUs, and try to find a pilot for each
            # of them.  This enacts first-come-first-served, but will be unbalanced
            # if the units in the queue are of different sizes.  That problem is
            # ignored at this point.
            #
            # if any units get scheduled, we push a dictionary to the UM to enact
            # the schedule:
            #   { 
            #     unit_1: [pilot_id_1, pilot_resource_name]
            #     unit_2: [pilot_id_2, pilot_resource_name]
            #     unit_4: [pilot_id_2, pilot_resource_name]
            #     ...
            #   }

            if  not len(self.pilots.keys ()) :
                # no pilots to work on yet.
                logger.warning ("cannot schedule -- no pilots available")
                return 

            if  target_pid and target_pid not in self.pilots :
                logger.warning ("cannot schedule -- invalid target pilot %s" % target_pid)
                raise RuntimeError ("Invalid pilot (%s)" % target_pid)
                

            schedule           = dict()
            schedule['units']  = dict()
            schedule['pilots'] = self.pilots

            logger.debug ("schedule (%s units waiting)" % len(self.waitq))


            units_to_schedule = list()
            if  uid :

                if  uid not in self.waitq :
                  # self._dump ()
                    logger.warning ("cannot schedule -- unknown unit %s" % uid)
                    raise RuntimeError ("Invalid unit (%s)" % uid)

                units_to_schedule.append (self.waitq[uid])

            else :
                # just copy the whole waitq
                for uid in self.waitq :
                    units_to_schedule.append (self.waitq[uid])


            for unit in units_to_schedule :

                uid = unit.uid
                ud  = unit.description

                # sanity check on unit state
                if  unit.state not in [NEW, SCHEDULING, UNSCHEDULED] :
                    raise RuntimeError ("scheduler queue should only contain NEW or UNSCHEDULED units (%s)" % uid)

              # logger.debug ("examine unit  %s (%s cores)" % (uid, ud.cores))

                for pid in self.pilots :

                  # logger.debug ("        pilot %s (%s caps, state %s)" \
                  #            % (pid, self.pilots[pid]['state'], self.pilots[pid]['caps']))

                    if  self.pilots[pid]['state'] in [ACTIVE] :

                        if  ud.cores <= self.pilots[pid]['caps'] :
                    
                          # logger.debug ("        unit  %s fits on pilot %s" % (uid, pid))

                            self.pilots[pid]['caps'] -= ud.cores
                            schedule['units'][unit]   = pid

                            # scheduled units are removed from the waitq
                            del self.waitq[uid]
                            self.runqs[pid][uid] = unit
                            break


                    # unit was not scheduled...
                    schedule['units'][unit] = None

                # print a warning if a unit cannot possibly be scheduled, ever
                can_handle_unit = False
                for pid in self.pilots :
                    if  unit.description.cores <= self.pilots[pid]['cores'] :
                        can_handle_unit=True
                        break

                if  not can_handle_unit :
                    logger.warning ('cannot handle unit %s with current set of pilots' % uid)

          # pprint.pprint (schedule)

            # tell the UM about the schedule
            self.manager.handle_schedule (schedule)
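
The core decision in _reschedule() is plain first-come-first-served placement: walk the waiting units in order and assign each to the first active pilot with enough free capacity. A standalone sketch with plain data structures (not the scheduler's actual ones):

def fcfs_schedule(waiting_units, pilots):
    """waiting_units: list of (uid, cores) in arrival order;
    pilots: {pid: {'state': str, 'caps': int}}.
    Returns {uid: pid or None}; pilot capacities are decremented in place."""
    schedule = dict()
    for uid, cores in waiting_units:
        schedule[uid] = None
        for pid, pilot in pilots.items():
            if pilot['state'] == 'Active' and cores <= pilot['caps']:
                pilot['caps'] -= cores
                schedule[uid] = pid
                break
    return schedule

# example: a 4-core and a 16-core unit against one 8-core active pilot
print(fcfs_schedule([('u1', 4), ('u2', 16)],
                    {'p1': {'state': 'Active', 'caps': 8}}))
# -> {'u1': 'p1', 'u2': None}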
Example #17
0
    def _pilot_state_callback (self, pilot, state) :
        
        try :

            with self.lock :

                pid = pilot.uid
    
                if  not pid in self.pilots :
                    # as we cannot unregister callbacks, we simply ignore this
                    # invocation.  It's probably from a pilot we used previously.
                    logger.warn ("[SchedulerCallback]: ComputePilot %s changed to %s (ignored)" % (pid, state))
                    return
    
    
                self.pilots[pid]['state'] = state
                logger.debug ("[SchedulerCallback]: ComputePilot %s changed to %s" % (pid, state))
    
                if  state in [ACTIVE] :
                    # the pilot is now ready to be used
                    self._reschedule (target_pid=pid)
    
                if  state in [DONE, FAILED, CANCELED] :

                  # self._dump ('pilot is final')

                    # If the pilot state is 'DONE', 'FAILED' or 'CANCELED', we
                    # need to reschedule the units which are reschedulable --
                    # all others are marked 'FAILED' if they are already
                    # 'EXECUTING' and not restartable
                    timestamp = datetime.datetime.utcnow()
                    self._db.change_compute_units (
                        filter_dict = {"pilot"       : pid, 
                                       "state"       : {"$in": [UNSCHEDULED,
                                                                PENDING_INPUT_STAGING, 
                                                                STAGING_INPUT, 
                                                                PENDING_EXECUTION, 
                                                                SCHEDULING]}},
                        set_dict    = {"state"       : UNSCHEDULED, 
                                       "pilot"       : None},
                        push_dict   = {"statehistory": {"state"     : UNSCHEDULED, 
                                                        "timestamp" : timestamp}, 
                                       "log"         : {"message"   :  "reschedule unit", 
                                                        "timestamp" : timestamp}
                                      })

                    self._db.change_compute_units (
                        filter_dict = {"pilot"       : pid, 
                                       "restartable" : True, 
                                       "state"       : {"$in": [EXECUTING, 
                                                                PENDING_OUTPUT_STAGING, 
                                                                STAGING_OUTPUT]}},
                        set_dict    = {"state"       : UNSCHEDULED,
                                       "pilot"       : None},
                        push_dict   = {"statehistory": {"state"     : UNSCHEDULED,
                                                        "timestamp" : timestamp}, 
                                       "log"         : {"message"   :  "reschedule unit", 
                                                        "timestamp" : timestamp}
                                      })

                    self._db.change_compute_units (
                        filter_dict = {"pilot"       : pid, 
                                       "restartable" : False, 
                                       "state"       : {"$in": [EXECUTING, 
                                                                PENDING_OUTPUT_STAGING, 
                                                                STAGING_OUTPUT]}},
                        set_dict    = {"state"       : FAILED},
                        push_dict   = {"statehistory": {"state"     : FAILED, 
                                                        "timestamp" : timestamp}, 
                                       "log"         : {"message"   :  "reschedule unit", 
                                                        "timestamp" : timestamp}
                                      })

                    # make sure that restartable units got back into the
                    # wait queue
                    #
                    # FIXME AM: awkward state management: I don't have the
                    # unit state!  New state was just pushed to the DB, but
                    # I have actually no idea for which units, and the state
                    # known to the worker (i.e. the cached state) is most
                    # likely outdated.
                    #
                    # So we don't handle runq/waitq here.  Instead, we rely
                    # on the unit cb to get invoked as soon as the state
                    # propagated back to us, and then remove them from the
                    # runq.  This is slow, potentially very slow, but safe.

                    # we can't use this pilot anymore...  
                    del self.pilots[pid]
                    # FIXME: how can I *un*register a pilot callback?
                    
    
        except Exception as e :
          # import traceback
          # traceback.print_exc ()
            logger.exception ("error in pilot callback for backfiller (%s) - ignored" % e)
            raise
Example #18
0
    def run(self):
        """run() is called when the process is started via
           UnitManagerController.start().
        """

        # make sure to catch sys.exit (which raises SystemExit)
        try :

            logger.debug("Worker thread (ID: %s[%s]) for UnitManager %s started." %
                        (self.name, self.ident, self._um_id))

            # transfer results contains the futures to the results of the
            # asynchronous transfer operations.
            transfer_results = list()

            while not self._stop.is_set():

                # =================================================================
                #
                # Check and update units. This needs to be optimized at
                # some point, i.e., state pulling should be conditional
                # or triggered by a tailable MongoDB cursor, etc.
                unit_list = self._db.get_compute_units(unit_manager_id=self._um_id)
                action    = False

                for unit in unit_list:
                    unit_id = str(unit["_id"])

                    new_state = unit["state"]
                    if unit_id in self._shared_data:
                        old_state = self._shared_data[unit_id]["data"]["state"]
                    else:
                        old_state = None
                        self._shared_data_lock.acquire()
                        self._shared_data[unit_id] = {
                            'data':          unit,
                            'callbacks':     [],
                            'facade_object': None
                        }
                        self._shared_data_lock.release()

                    self._shared_data_lock.acquire()
                    self._shared_data[unit_id]["data"] = unit
                    self._shared_data_lock.release()

                    if new_state != old_state:
                        # On a state change, we fire zee callbacks.
                        logger.info("RUN ComputeUnit '%s' state changed from '%s' to '%s'." % (unit_id, old_state, new_state))

                        # The state of the unit has changed, We call all
                        # unit-level callbacks to propagate this.
                        self.call_unit_state_callbacks(unit_id, new_state)

                        action = True

                # After the first iteration, we are officially initialized!
                if not self._initialized.is_set():
                    self._initialized.set()

                # sleep a little if this cycle was idle
                if  not action :
                    time.sleep(IDLE_TIME)


        except SystemExit as e :
            logger.exception ("unit manager controller thread caught system exit -- forcing application shutdown")
            import thread
            thread.interrupt_main ()


        finally :
            # shut down the autonomous input / output transfer worker(s)
            for worker in self._input_file_transfer_worker_pool:
                logger.debug("uworker %s stops   itransfer %s" % (self.name, worker.name))
                worker.stop ()
                logger.debug("uworker %s stopped itransfer %s" % (self.name, worker.name))

            for worker in self._output_file_transfer_worker_pool:
                logger.debug("uworker %s stops   otransfer %s" % (self.name, worker.name))
                worker.stop ()
                logger.debug("uworker %s stopped otransfer %s" % (self.name, worker.name))
Example #19
0
 def __del__(self):
     """Le destructeur.
     """
     if os.getenv("RADICAL_PILOT_GCDEBUG", None) is not None:
         logger.debug("GCDEBUG __del__(): ComputeUnit [object id: %s]." % id(self))
Example #20
0
    def handle_schedule (self, schedule) :

        # we want to use bulk submission to the pilots, so we collect all units
        # assigned to the same set of pilots.  At the same time, we select
        # unscheduled units for later insertion into the wait queue.
        
        if  not schedule :
            logger.debug ('skipping empty unit schedule')
            return

      # print 'handle schedule:'
      # import pprint
      # pprint.pprint (schedule)
      #
        pilot_cu_map = dict()
        unscheduled  = list()

        pilot_ids = self.list_pilots ()

        for unit in schedule['units'].keys() :

            pid = schedule['units'][unit]

            if  None == pid :
                unscheduled.append (unit)
                continue

            else :

                if  pid not in pilot_ids :
                    raise RuntimeError ("schedule points to unknown pilot %s" % pid)

                if  pid not in pilot_cu_map :
                    pilot_cu_map[pid] = list()

                pilot_cu_map[pid].append (unit)


        # submit to all pilots which got something submitted to
        for pid in pilot_cu_map.keys():

            units_to_schedule = list()

            # if a kernel name is set in the cu description, do kernel expansion
            for unit in pilot_cu_map[pid] :

                if  not pid in schedule['pilots'] :
                    # lost pilot, do not schedule unit
                    logger.warn ("unschedule unit %s, lost pilot %s" % (unit.uid, pid))
                    continue

                unit.sandbox = schedule['pilots'][pid]['sandbox'] + "/" + str(unit.uid)

                ud = unit.description

                if  'kernel' in ud and ud['kernel'] :

                    try :
                        from radical.ensemblemd.mdkernels import MDTaskDescription
                    except Exception as ex :
                        logger.error ("Kernels are not supported in" \
                              "compute unit descriptions -- install " \
                              "radical.ensemblemd.mdkernels!")
                        # FIXME: unit needs a '_set_state() method or something!
                        self._session._dbs.set_compute_unit_state (unit._uid, FAILED, 
                                ["kernel expansion failed"])
                        continue

                    pilot_resource = schedule['pilots'][pid]['resource']

                    mdtd           = MDTaskDescription ()
                    mdtd.kernel    = ud.kernel
                    mdtd_bound     = mdtd.bind (resource=pilot_resource)
                    ud.environment = mdtd_bound.environment
                    ud.pre_exec    = mdtd_bound.pre_exec
                    ud.executable  = mdtd_bound.executable
                    ud.mpi         = mdtd_bound.mpi


                units_to_schedule.append (unit)

            if  len(units_to_schedule) :
                self._worker.schedule_compute_units (pilot_uid=pid,
                                                     units=units_to_schedule)


        # report any change in wait_queue_size
        old_wait_queue_size = self.wait_queue_size

        self.wait_queue_size = len(unscheduled)
        if  old_wait_queue_size != self.wait_queue_size :
            self._worker.fire_manager_callback (WAIT_QUEUE_SIZE, self,
                                                self.wait_queue_size)

        if  len(unscheduled) :
            self._worker.unschedule_compute_units (units=unscheduled)

        logger.info ('%s units remain unscheduled' % len(unscheduled))
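
The bulk-submission step above amounts to inverting the unit-to-pilot mapping so that each pilot receives one batch. A minimal sketch of that grouping (illustrative names only, not the manager's actual helper):

def group_units_by_pilot(schedule_units):
    """schedule_units: {unit: pid or None} -> ({pid: [units]}, [unscheduled])."""
    pilot_cu_map = dict()
    unscheduled  = list()
    for unit, pid in schedule_units.items():
        if pid is None:
            unscheduled.append(unit)
        else:
            pilot_cu_map.setdefault(pid, list()).append(unit)
    return pilot_cu_map, unscheduled

batches, waiting = group_units_by_pilot({'u1': 'p1', 'u2': 'p1', 'u3': None})
# batches -> {'p1': ['u1', 'u2']}, waiting -> ['u3']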
Example #21
0
    def wait_pilots(self, pilot_ids=None,
                    state=[DONE, FAILED, CANCELED],
                    timeout=None):
        """Returns when one or more :class:`radical.pilot.ComputePilots` reach a
        specific state or when an optional timeout is reached.

        If `pilot_ids` is `None`, `wait_pilots` returns when **all** Pilots
        reach the state defined in `state`.

        **Arguments:**

            * **pilot_ids** [`string` or `list of strings`]
              If pilot_ids is set, only the Pilots with the specified uids are
              considered. If pilot_ids is `None` (default), all Pilots are
              considered.

            * **state** [`list of strings`]
              The state(s) that Pilots have to reach in order for the call
              to return.

              By default `wait_pilots` waits for the Pilots to reach
              a **terminal** state, which can be one of the following:

              * :data:`radical.pilot.DONE`
              * :data:`radical.pilot.FAILED`
              * :data:`radical.pilot.CANCELED`

            * **timeout** [`float`]
              Optional timeout in seconds before the call returns, regardless
              of whether the Pilots have reached the desired state or not.
              If set to `None` (default), the call never times out.

        **Raises:**

            * :class:`radical.pilot.PilotException`
        """
        self._assert_obj_is_valid()

        if not isinstance(state, list):
            state = [state]

        return_list_type = True
        if (not isinstance(pilot_ids, list)) and (pilot_ids is not None):
            return_list_type = False
            pilot_ids = [pilot_ids]


        start  = time.time()
        all_ok = False
        states = list()

        while not all_ok :

            pilots = self._worker.get_compute_pilot_data(pilot_ids=pilot_ids)
            all_ok = True
            states = list()

            for pilot in pilots :
                if  pilot['state'] not in state :
                    all_ok = False

                states.append (pilot['state'])

            # check timeout
            if  (None != timeout) and (timeout <= (time.time() - start)):
                if  not all_ok :
                    logger.debug ("wait timed out: %s" % states)
                break

            # sleep a little if this cycle was idle
            if  not all_ok :
                time.sleep (0.1)

        # done waiting
        if  return_list_type :
            return states
        else :
            return states[0]
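
wait_pilots() above is an instance of a generic poll-until-state-or-timeout pattern. A dependency-free sketch of that pattern (names are illustrative, not radical.pilot API):

import time

def wait_for_states(get_states, target_states, timeout=None, poll=0.1):
    """Poll get_states() (a callable returning a list of state strings) until
    every entry is in target_states, or until `timeout` seconds have passed
    (None = never time out).  Returns the last observed list of states."""
    start = time.time()
    while True:
        states = get_states()
        if all(s in target_states for s in states):
            return states
        if timeout is not None and (time.time() - start) >= timeout:
            return states
        # sleep a little if this cycle was idle
        time.sleep(poll)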
Example #22
0
    def check_pilot_states(self, pilot_col):

        pending_pilots = pilot_col.find({
            "pilotmanager": self.pilot_manager_id,
            "state": {
                "$in": [PENDING_ACTIVE, ACTIVE]
            }
        })

        for pending_pilot in pending_pilots:

            pilot_failed = False
            pilot_done = False
            reconnected = False
            pilot_id = pending_pilot["_id"]
            log_message = ""
            saga_job_id = pending_pilot["saga_job_id"]

            logger.info(
                "Performing periodical health check for %s (SAGA job id %s)" %
                (str(pilot_id), saga_job_id))

            if pilot_id not in self.missing_pilots:
                self.missing_pilots[pilot_id] = 0

            # Create a job service object:
            try:
                js_url = saga_job_id.split("]-[")[0][1:]

                if js_url in self._shared_worker_data['job_services']:
                    js = self._shared_worker_data['job_services'][js_url]
                else:
                    js = saga.job.Service(js_url, session=self._session)
                    self._shared_worker_data['job_services'][js_url] = js

                saga_job = js.get_job(saga_job_id)
                reconnected = True

                if saga_job.state in [saga.job.FAILED, saga.job.CANCELED]:
                    pilot_failed = True
                    log_message  = "SAGA job state for ComputePilot %s is %s."\
                                 % (pilot_id, saga_job.state)

                if saga_job.state in [saga.job.DONE]:
                    pilot_done = True
                    log_message  = "SAGA job state for ComputePilot %s is %s."\
                                 % (pilot_id, saga_job.state)

            except Exception as e:

                if not reconnected:
                    logger.warning(
                        'could not reconnect to pilot for state check (%s)' %
                        e)
                    self.missing_pilots[pilot_id] += 1

                    if self.missing_pilots[pilot_id] >= JOB_CHECK_MAX_MISSES:
                        logger.debug('giving up after %d attempts'
                                     % JOB_CHECK_MAX_MISSES)
                        pilot_failed = True
                        log_message  = "Could not reconnect to pilot %s "\
                                       "multiple times - giving up" % pilot_id
                else:
                    logger.warning('pilot state check failed: %s' % e)
                    pilot_failed = True
                    log_message  = "Couldn't determine job state for ComputePilot %s. " \
                                   "Assuming it has failed." % pilot_id

            if pilot_failed:
                out, err, log = self._get_pilot_logs(pilot_col, pilot_id)
                ts = datetime.datetime.utcnow()
                pilot_col.update({
                    "_id": pilot_id,
                    "state": {
                        "$ne": DONE
                    }
                }, {
                    "$set": {
                        "state": FAILED,
                        "stdout": out,
                        "stderr": err,
                        "logfile": log
                    },
                    "$push": {
                        "statehistory": {
                            "state": FAILED,
                            "timestamp": ts
                        },
                        "log": {
                            "message": log_message,
                            "timestamp": ts
                        }
                    }
                })
                logger.debug(log_message)
                logger.warn('pilot %s declared dead' % pilot_id)

            elif pilot_done:
                # FIXME: this should only be done if the state is not yet
                # done...
                out, err, log = self._get_pilot_logs(pilot_col, pilot_id)
                ts = datetime.datetime.utcnow()
                pilot_col.update({
                    "_id": pilot_id,
                    "state": {
                        "$ne": DONE
                    }
                }, {
                    "$set": {
                        "state": DONE,
                        "stdout": out,
                        "stderr": err,
                        "logfile": log
                    },
                    "$push": {
                        "statehistory": {
                            "state": DONE,
                            "timestamp": ts
                        },
                        "log": {
                            "message": log_message,
                            "timestamp": ts
                        }
                    }
                })
                logger.debug(log_message)
                logger.warn('pilot %s declared done' % pilot_id)

            else:
                if self.missing_pilots[pilot_id]:
                    logger.info ('pilot %s *assumed* alive and well (%s)' \
                              % (pilot_id, self.missing_pilots[pilot_id]))
                else:
                    logger.info ('pilot %s seems alive and well' \
                              % (pilot_id))
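
The health check above reduces to a small decision: reconnect to the SAGA job, map its state to a pilot outcome, and count consecutive reconnect failures before giving up. A simplified, self-contained sketch of that decision (the threshold value and function name are assumptions, not radical.pilot API):

JOB_CHECK_MAX_MISSES = 3  # assumed threshold, mirroring the worker above

def classify_pilot_health(saga_state, reconnect_ok, misses):
    """Return one of 'failed', 'done', 'alive' for a pending/active pilot.

    saga_state:   the reconnected SAGA job state string, or None if unknown
    reconnect_ok: whether the job service/job could be reconnected
    misses:       consecutive failed reconnect attempts so far
    """
    if not reconnect_ok:
        # give the queueing system some slack before declaring the pilot dead
        return 'failed' if misses >= JOB_CHECK_MAX_MISSES else 'alive'
    if saga_state in ['Failed', 'Canceled']:
        return 'failed'
    if saga_state == 'Done':
        return 'done'
    return 'alive'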
Example #23
0
    def run(self):
        """run() is called when the process is started via
           PilotManagerController.start().
        """

        # make sure to catch sys.exit (which raises SystemExit)
        try :

            logger.debug("Worker thread (ID: %s[%s]) for PilotManager %s started." %
                        (self.name, self.ident, self._pm_id))

            while not self._stop.is_set():

                # # Check if one or more startup requests have finished.
                # self.startup_results_lock.acquire()

                # new_startup_results = list()

                # for transfer_result in self.startup_results:
                #     if transfer_result.ready():
                #         result = transfer_result.get()

                #         self._db.update_pilot_state(
                #             pilot_uid=result["pilot_uid"],
                #             state=result["state"],
                #             sagajobid=result["saga_job_id"],
                #             pilot_sandbox=result["sandbox"],
                #             global_sandbox=result["global_sandbox"],
                #             submitted=result["submitted"],
                #             logs=result["logs"]
                #         )

                #     else:
                #         new_startup_results.append(transfer_result)

                # self.startup_results = new_startup_results

                # self.startup_results_lock.release()

                # Check and update pilots. This needs to be optimized at
                # some point, i.e., state pulling should be conditional
                # or triggered by a tailable MongoDB cursor, etc.
                pilot_list = self._db.get_pilots(pilot_manager_id=self._pm_id)
                action = False

                for pilot in pilot_list:
                    pilot_id = str(pilot["_id"])

                    new_state = pilot["state"]
                    if pilot_id in self._shared_data:
                        old_state = self._shared_data[pilot_id]["data"]["state"]
                    else:
                        old_state = None
                        self._shared_data[pilot_id] = {
                            'data':          pilot,
                            'callbacks':     [],
                            'facade_object': None
                        }

                    self._shared_data[pilot_id]['data'] = pilot

                    # FIXME: *groan* what a hack...  The CANCELING state is,
                    # by its nature, not recorded in the database, but only in
                    # the local cache.  So if we see it as the old state, we
                    # have to avoid state transitions into non-final states in
                    # the cache at all cost -- we catch this case here
                    # specifically.
                    no_cb = False
                    if  old_state == CANCELING :
                        if  new_state not in [DONE, FAILED, CANCELED] :
                            # restore old state, making the cache explicitly
                            # different than the DB recorded state
                            self._shared_data[pilot_id]["data"]["state"] = old_state 

                            # do not trigger a state callback!
                            no_cb = True

                    if new_state != old_state :
                        action = True

                        if not no_cb :
                            # On a state change, we fire the registered callbacks.
                            logger.info("ComputePilot '%s' state changed from '%s' to '%s'." \
                                            % (pilot_id, old_state, new_state))

                            # The state of the pilot has changed, We call all
                            # pilot-level callbacks to propagate this.  This also
                            # includes communication to the unit scheduler which
                            # may, or may not, cancel the pilot's units.
                            self.call_callbacks(pilot_id, new_state)

                    # If the state is 'DONE', 'FAILED' or 'CANCELED', we also
                    # set the state of the compute unit accordingly (but only
                    # for non-final units)
                    if new_state in [FAILED, DONE, CANCELED]:
                        unit_ids = self._db.pilot_list_compute_units(pilot_uid=pilot_id)
                        self._db.set_compute_unit_state (
                            unit_ids=unit_ids, 
                            state=CANCELED,
                            src_states=[ PENDING_INPUT_STAGING,
                                         STAGING_INPUT,
                                         PENDING_EXECUTION,
                                         SCHEDULING,
                                         EXECUTING,
                                         PENDING_OUTPUT_STAGING,
                                         STAGING_OUTPUT
                                       ],
                            log="Pilot '%s' has terminated with state '%s'. CU canceled." % (pilot_id, new_state))

                # After the first iteration, we are officially initialized!
                if not self._initialized.is_set():
                    self._initialized.set()

                # sleep a little if this cycle was idle
                if  not action :
                    time.sleep(IDLE_TIME)

        except SystemExit as e :
            logger.exception ("pilot manager controller thread caught system exit -- forcing application shutdown")
            import thread
            thread.interrupt_main ()

        finally :
            # shut down the autonomous pilot launcher worker(s)
            for worker in self._pilot_launcher_worker_pool:
                logger.debug("pworker %s stops   launcher %s" % (self.name, worker.name))
                worker.stop ()
                logger.debug("pworker %s stopped launcher %s" % (self.name, worker.name))
Example #24
0
    def close(self, cleanup=True, terminate=True, delete=None):
        """Closes the session.

        All subsequent attempts to access objects attached to the session
        will result in an error. If `cleanup` is set to `True` (default),
        the session data is removed from the database.

        **Arguments:**
            * **cleanup** (`bool`): Remove the session from MongoDB (implies `terminate`).
            * **terminate** (`bool`): Shut down all pilots associated with the session.

        **Raises:**
            * :class:`radical.pilot.IncorrectState` if the session is closed
              or doesn't exist. 
        """

        logger.debug("session %s closing" % (str(self._uid)))

        uid = self._uid

        if not self._uid:
            logger.error("Session object already closed.")
            return

        # we keep 'delete' for backward compatibility.  If it was set, and the
        # other flags (cleanup, terminate) are at their defaults (True), then
        # 'delete' will supersede them.  'delete' is considered deprecated
        # though, and we thus issue a warning.
        if delete is not None:

            if cleanup and terminate:
                cleanup = delete
                terminate = delete
                logger.warning("'delete' flag on session is deprecated. " \
                               "Please use 'cleanup' and 'terminate' instead!")

        if cleanup:
            # cleanup implies terminate
            terminate = True

        for pmgr in self._pilot_manager_objects:
            logger.debug("session %s closes   pmgr   %s" %
                         (str(self._uid), pmgr._uid))
            pmgr.close(terminate=terminate)
            logger.debug("session %s closed   pmgr   %s" %
                         (str(self._uid), pmgr._uid))

        for umgr in self._unit_manager_objects:
            logger.debug("session %s closes   umgr   %s" %
                         (str(self._uid), umgr._uid))
            umgr.close()
            logger.debug("session %s closed   umgr   %s" %
                         (str(self._uid), umgr._uid))

        if cleanup:
            self._destroy_db_entry()

        logger.debug("session %s closed" % (str(self._uid)))
Example #25
0
    def register_cancel_pilots_request(self, pilot_ids=None):
        """Registers one or more pilots for cancelation.
        """

        if pilot_ids is None:

            pilot_ids = list()

            for pilot in self._db.get_pilots(pilot_manager_id=self._pm_id) :
                pilot_ids.append (str(pilot["_id"]))


        self._db.send_command_to_pilot(COMMAND_CANCEL_PILOT, pilot_ids=pilot_ids)
        logger.info("Sent 'COMMAND_CANCEL_PILOT' command to pilots %s.", pilot_ids)

        # pilots which are in ACTIVE state should now have time to react to
        # the CANCEL command sent above.  Meanwhile, we cancel all pending
        # pilots.  Once that is done, we wait a little, say 10 seconds, to
        # give the pilots time to pick up the request and shut down -- but if
        # they do not do that, they will get killed the hard way...
        delayed_cancel = list()

        for pilot_id in pilot_ids :
            if  pilot_id in self._shared_data :

                # read state from _shared_data only once, so that it does not
                # change under us...
                old_state = str(self._shared_data[pilot_id]["data"]["state"])

                logger.warn ("actively cancel pilot %s state: %s" % (pilot_id, old_state))
                if  old_state in [DONE, FAILED, CANCELED] :
                    logger.warn ("can't actively cancel pilot %s: already in final state" % pilot_id)

                elif old_state in [PENDING_LAUNCH, LAUNCHING, PENDING_ACTIVE] :
                    if pilot_id in self._shared_worker_data['job_ids'] :

                        try :
                            job_id, js_url = self._shared_worker_data['job_ids'][pilot_id]
                            self._shared_data[pilot_id]["data"]["state"] = CANCELING
                            logger.info ("actively cancel pilot %s (%s, %s)" % (pilot_id, job_id, js_url))

                            js = self._shared_worker_data['job_services'][js_url]
                            job = js.get_job (job_id)
                            job.cancel ()
                        except Exception as e :
                            logger.exception ('pilot cancelation failed')


                    else :
                        logger.warn ("can't actively cancel pilot %s: no job id known" % pilot_id)
                        logger.debug (pprint.pformat (self._shared_worker_data))

                else :
                    logger.debug ("delay to actively cancel pilot %s: state %s" % (pilot_id, old_state))
                    delayed_cancel.append (pilot_id)

            else :
                logger.warn  ("can't actively cancel pilot %s: unknown pilot" % pilot_id)
                logger.debug (pprint.pformat (self._shared_data))

        # now tend to all delayed cancellation requests (i.e., active pilots)
        # -- if there are any
        if  delayed_cancel :

            # grant some leeway to the unruly children...
            time.sleep (10)

            for pilot_id in delayed_cancel :

                if pilot_id in self._shared_worker_data['job_ids'] :

                    try :
                        job_id, js_url = self._shared_worker_data['job_ids'][pilot_id]
                        logger.info ("actively cancel pilot %s (delayed) (%s, %s)" % (pilot_id, job_id, js_url))

                        js = self._shared_worker_data['job_services'][js_url]
                        job = js.get_job (job_id)
                        job.cancel ()
                    except Exception as e :
                        logger.warn ('delayed pilot cancelation failed. '
                                'This is not necessarily a problem.')

                else :
                    logger.warn ("can't actively cancel pilot %s: no job id known (delayed)" % pilot_id)
                    logger.debug (pprint.pformat (self._shared_worker_data))
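
The "active cancel" branches above reconnect to the pilot's batch job through SAGA and cancel it directly. Below is a minimal sketch of that reconnect-and-cancel step, assuming the job service URL and SAGA job id are known (as they are in _shared_worker_data['job_ids']); cancel_pilot_job is a hypothetical helper.

import saga  # saga-python, the same library used by the worker above

def cancel_pilot_job(js_url, saga_job_id, session=None):
    # Reconnect to the job service, look up the pilot's batch job and
    # cancel it -- this mirrors the delayed-cancel branch above.
    js  = saga.job.Service(js_url, session=session)
    job = js.get_job(saga_job_id)
    job.cancel()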
Example #26
0
    def cancel(self):
        """Cancel the ComputeUnit.

        **Raises:**

            * :class:`radical.pilot.radical.pilotException`
        """
        # Check if this instance is valid
        if not self._uid:
            raise BadParameter("Invalid Compute Unit instance.")

        cu_json = self._worker.get_compute_unit_data(self.uid)
        pilot_uid = cu_json['pilot']

        if self.state in [DONE, FAILED, CANCELED]:
            # nothing to do
            logger.debug(
                "Compute unit %s has state %s, can't cancel any longer." %
                (self._uid, self.state))

        elif self.state in [NEW, UNSCHEDULED, PENDING_INPUT_STAGING]:
            logger.debug(
                "Compute unit %s has state %s, going to prevent from starting."
                % (self._uid, self.state))
            self._manager._session._dbs.set_compute_unit_state(
                self._uid, CANCELED, ["Received Cancel"])

        elif self.state == STAGING_INPUT:
            logger.debug(
                "Compute unit %s has state %s, will cancel the transfer." %
                (self._uid, self.state))
            self._manager._session._dbs.set_compute_unit_state(
                self._uid, CANCELED, ["Received Cancel"])

        elif self.state in [PENDING_EXECUTION, SCHEDULING]:
            logger.debug("Compute unit %s has state %s, will abort start-up." %
                         (self._uid, self.state))
            self._manager._session._dbs.set_compute_unit_state(
                self._uid, CANCELED, ["Received Cancel"])

        elif self.state == EXECUTING:
            logger.debug(
                "Compute unit %s has state %s, will terminate the task." %
                (self._uid, self.state))
            self._manager._session._dbs.send_command_to_pilot(
                cmd=COMMAND_CANCEL_COMPUTE_UNIT,
                arg=self.uid,
                pilot_ids=pilot_uid)

        elif self.state == PENDING_OUTPUT_STAGING:
            logger.debug(
                "Compute unit %s has state %s, will abort the transfer." %
                (self._uid, self.state))
            self._manager._session._dbs.set_compute_unit_state(
                self._uid, CANCELED, ["Received Cancel"])

        elif self.state == STAGING_OUTPUT:
            logger.debug(
                "Compute unit %s has state %s, will cancel the transfer." %
                (self._uid, self.state))
            self._manager._session._dbs.set_compute_unit_state(
                self._uid, CANCELED, ["Received Cancel"])

        else:
            raise IncorrectState(
                "Unknown Compute Unit state: %s, cannot cancel" % self.state)

        # done canceling
        return
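
A hedged usage sketch for this method: a unit submitted through an existing UnitManager ('umgr' below is an assumption) can be canceled at any point, and the branches above decide how the cancellation is carried out for the unit's current state.

import radical.pilot as rp

cud            = rp.ComputeUnitDescription()
cud.executable = "/bin/sleep"
cud.arguments  = ["600"]

# assuming the manager returns a single unit for a single description
unit = umgr.submit_units(cud)   # 'umgr' is an existing UnitManager
unit.cancel()                   # dispatched to one of the branches above
print(unit.state)               # expected to end up in CANCELED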
Example #27
0
    def wait_units(self, unit_ids=None,
                   state=[DONE, FAILED, CANCELED],
                   timeout=None):
        """Returns when one or more :class:`radical.pilot.ComputeUnits` reach a
        specific state.

        If `unit_ids` is `None`, `wait_units` returns when **all**
        ComputeUnits reach the state defined in `state`.

        **Example**::

            # TODO -- add example

        **Arguments:**

            * **unit_ids** [`string` or `list of strings`]
              If unit_ids is set, only the ComputeUnits with the specified
              uids are considered. If unit_ids is `None` (default), all
              ComputeUnits are considered.

            * **state** [`string` or `list of strings`]
              The state that ComputeUnits have to reach in order for the call
              to return.

              By default `wait_units` waits for the ComputeUnits to
              reach a terminal state, which can be one of the following:

              * :data:`radical.pilot.DONE`
              * :data:`radical.pilot.FAILED`
              * :data:`radical.pilot.CANCELED`

            * **timeout** [`float`]
              Timeout in seconds before the call returns regardless of unit
              state changes. The default value **None** waits forever.

        **Raises:**

            * :class:`radical.pilot.PilotException`
        """
        if  not self._uid:
            raise IncorrectState(msg="Invalid object instance.")

        if not isinstance(state, list):
            state = [state]

        return_list_type = True
        if (not isinstance(unit_ids, list)) and (unit_ids is not None):
            return_list_type = False
            unit_ids = [unit_ids]

        units  = self.get_units (unit_ids)
        start  = time.time()
        all_ok = False
        states = list()

        while not all_ok :

            all_ok = True
            states = list()

            for unit in units :
                if  unit.state not in state :
                    all_ok = False

                states.append (unit.state)

            # check timeout
            if  (None != timeout) and (timeout <= (time.time() - start)):
                if  not all_ok :
                    logger.debug ("wait timed out: %s" % states)
                break

            # sleep a little if this cycle was idle
            if  not all_ok :
                time.sleep (0.1)

        # done waiting
        if  return_list_type :
            return states
        else :
            return states[0]
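
A hedged usage sketch for wait_units; 'umgr', 'u1' and 'u2' are assumed to be an existing UnitManager and two units it has submitted.

import radical.pilot as rp

# wait for *all* units to reach a final state, but give up after 10 minutes
states = umgr.wait_units(timeout=600)

# wait only for two specific units, using the default final states
states = umgr.wait_units(unit_ids=[u1.uid, u2.uid],
                         state=[rp.DONE, rp.FAILED, rp.CANCELED])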
Example #28
0
 def __del__(self):
     """Le destructeur.
     """
     if os.getenv("RADICAL_PILOT_GCDEBUG", None) is not None:
         logger.debug("GCDEBUG __del__(): ComputeUnit [object id: %s]." %
                      id(self))
Example #29
0
    def run(self):
        """Starts the process when Process.start() is called.
        """

        # make sure to catch sys.exit (which raises SystemExit)
        try :

            # Try to connect to the database and create a tailable cursor.
            try:
                connection = self.db_connection_info.get_db_handle()
                db = connection[self.db_connection_info.dbname]
                um_col = db["%s.cu" % self.db_connection_info.session_id]
                logger.debug("Connected to MongoDB. Serving requests for UnitManager %s." % self.unit_manager_id)

            except Exception as e:
                logger.exception("Connection error: %s" % e)
                return

            while not self._stop.is_set():
                compute_unit = None

                # See if we can find a ComputeUnit that is waiting for
                # output file transfer.
                ts = datetime.datetime.utcnow()
                compute_unit = um_col.find_and_modify(
                    query={"unitmanager": self.unit_manager_id,
                           "FTW_Output_Status": PENDING},
                    update={"$set" : {"FTW_Output_Status": EXECUTING,
                                      "state": STAGING_OUTPUT},
                            "$push": {"statehistory": {"state": STAGING_OUTPUT, "timestamp": ts}}},
                    limit=BULK_LIMIT
                )
                # FIXME: AM: find_and_modify is not bulkable!
                state = STAGING_OUTPUT

                #logger.info("OFTW after finding pending cus")
                if compute_unit is None:
                    #logger.info("OFTW no cus, sleep")
                    # Sleep a bit if no new units are available.
                    time.sleep(IDLE_TIME)
                else:
                    logger.info("OFTW cu found, progressing ...")
                    compute_unit_id = None
                    try:
                        # We have found a new CU.  Now we can process the
                        # transfer directive(s) with SAGA.
                        compute_unit_id = str(compute_unit["_id"])
                        remote_sandbox = compute_unit["sandbox"]
                        staging_directives = compute_unit["FTW_Output_Directives"]

                        logger.info("Processing output file transfers for ComputeUnit %s" % compute_unit_id)
                        # Loop over all staging directives and execute them.
                        for sd in staging_directives:

                            # Check if there was a cancel request
                            state_doc = um_col.find_one(
                                {"_id": compute_unit_id},
                                fields=["state"]
                            )
                            if state_doc['state'] == CANCELED:
                                logger.info("Compute Unit Canceled, interrupting output file transfers.")
                                state = CANCELED
                                break

                            action = sd['action']
                            source = sd['source']
                            target = sd['target']
                            flags  = sd['flags']

                            # Mark the beginning of the transfer for this StagingDirective
                            um_col.find_and_modify(
                                query={"_id" : compute_unit_id,
                                       'FTW_Output_Status': EXECUTING,
                                       'FTW_Output_Directives.state': PENDING,
                                       'FTW_Output_Directives.source': sd['source'],
                                       'FTW_Output_Directives.target': sd['target'],
                                       },
                                update={'$set': {'FTW_Output_Directives.$.state': EXECUTING},
                                        '$push': {'log': {
                                            'timestamp': datetime.datetime.utcnow(),
                                            'message'  : 'Starting transfer of %s' % source}}
                                }
                            )

                            abs_source = "%s/%s" % (remote_sandbox, source)

                            if os.path.basename(target) == target:
                                abs_target = "file://localhost%s" % os.path.join(os.getcwd(), target)
                            else:
                                abs_target = "file://localhost%s" % os.path.abspath(target)

                            log_msg = "Transferring output file %s -> %s" % (abs_source, abs_target)
                            logger.debug(log_msg)

                            logger.debug ("saga.fs.File ('%s')" % saga.Url(abs_source))
                            output_file = saga.filesystem.File(saga.Url(abs_source),
                                session=self._session
                            )

                            if CREATE_PARENTS in flags:
                                copy_flags = saga.filesystem.CREATE_PARENTS
                            else:
                                copy_flags = 0
                            logger.debug ("saga.fs.File.copy ('%s')" % saga.Url(abs_target))
                            output_file.copy(saga.Url(abs_target), flags=copy_flags)
                            output_file.close()

                            # If all went fine, update the state of this StagingDirective to Done
                            um_col.find_and_modify(
                                query={"_id" : compute_unit_id,
                                       'FTW_Output_Status': EXECUTING,
                                       'FTW_Output_Directives.state': EXECUTING,
                                       'FTW_Output_Directives.source': sd['source'],
                                       'FTW_Output_Directives.target': sd['target'],
                                       },
                                update={'$set': {'FTW_Output_Directives.$.state': DONE},
                                        '$push': {'log': {
                                            'timestamp': datetime.datetime.utcnow(),
                                            'message'  : log_msg}}
                                }
                            )

                    except Exception as e :
                        # Update the CU's state to 'FAILED'.
                        ts = datetime.datetime.utcnow()
                        log_message = "Output transfer failed: %s" % e
                        # TODO: not only mark the CU as failed, but also the specific Directive
                        um_col.update({'_id': compute_unit_id}, {
                            '$set': {'state': FAILED},
                            '$push': {
                                'statehistory': {'state': FAILED, 'timestamp': ts},
                                'log': {'message': log_message, 'timestamp': ts}
                            }
                        })
                        logger.exception (log_message)


                # Code below is only to be run by the "first" or only worker
                if self._worker_number > 1:
                    continue

                # If the CU was canceled we can skip the remainder of this loop.
                if state == CANCELED:
                    continue

                #
                # Check to see if there are more active Directives, if not, we are Done
                #
                cursor_w = um_col.find({"unitmanager": self.unit_manager_id,
                                        "$or": [ {"Agent_Output_Status": EXECUTING},
                                                 {"FTW_Output_Status": EXECUTING}
                                        ]
                }
                )
                # Iterate over all the returned CUs (if any)
                for cu in cursor_w:
                    # See if there are any FTW Output Directives still pending
                    if cu['FTW_Output_Status'] == EXECUTING and \
                            not any(d['state'] == EXECUTING or d['state'] == PENDING for d in cu['FTW_Output_Directives']):
                        # All Output Directives for this FTW are done, mark the CU accordingly
                        um_col.update({"_id": cu["_id"]},
                                      {'$set': {'FTW_Output_Status': DONE},
                                       '$push': {'log': {
                                           'timestamp': datetime.datetime.utcnow(),
                                           'message'  : 'All FTW output staging directives done - %d.' % self._worker_number}}
                                       }
                        )

                    # See if there are any Agent Output Directives still pending
                    if cu['Agent_Output_Status'] == EXECUTING and \
                            not any(d['state'] == EXECUTING or d['state'] == PENDING for d in cu['Agent_Output_Directives']):
                        # All Output Directives for this Agent are done, mark the CU accordingly
                        um_col.update({"_id": cu["_id"]},
                                      {'$set': {'Agent_Output_Status': DONE},
                                       '$push': {'log': {
                                           'timestamp': datetime.datetime.utcnow(),
                                           'message'  : 'All Agent Output Staging Directives done-%d.' % self._worker_number}}
                                      }
                        )

                #
                # Check for all CUs if both Agent and FTW staging is done, we can then mark the CU Done
                #
                ts = datetime.datetime.utcnow()
                um_col.find_and_modify(
                    query={"unitmanager": self.unit_manager_id,
                           # TODO: Now that our state model is linear,
                           # we probably don't need to check Agent_Output_Status anymore.
                           # Given that it is not updated by the agent currently, disable it here.
                           #"Agent_Output_Status": { "$in": [ None, DONE ] },
                           "FTW_Output_Status": { "$in": [ None, DONE ] },
                           "state": STAGING_OUTPUT
                    },
                    update={"$set": {
                        "state": DONE
                    },
                            "$push": {
                                "statehistory": {"state": DONE, "timestamp": ts}
                            }
                    }
                )

        except SystemExit as e :
            logger.exception("output file transfer thread caught system exit -- forcing application shutdown")
            import thread
            thread.interrupt_main ()
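
The worker above pulls units whose FTW_Output_Status is pending and walks their FTW_Output_Directives. Below is a hedged sketch of what a single directive might look like, based only on the keys the loop reads ('action', 'source', 'target', 'flags', 'state'); the constant values are placeholders, not the library's real ones.

# placeholder values for the flag/state constants referenced above
CREATE_PARENTS = "CreateParents"
PENDING        = "Pending"

ftw_output_directive = {
    "action": "Transfer",             # staging action handled by this worker
    "source": "output.dat",           # path relative to the unit's sandbox
    "target": "results/output.dat",   # path relative to the client workdir
    "flags":  [CREATE_PARENTS],       # create missing target directories
    "state":  PENDING,                # PENDING -> EXECUTING -> DONE
}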
Example #30
0
    def handle_schedule(self, schedule):

        # we want to use bulk submission to the pilots, so we collect all units
        # assigned to the same set of pilots.  At the same time, we select
        # unscheduled units for later insertion into the wait queue.

        if not schedule:
            logger.debug('skipping empty unit schedule')
            return

        # print 'handle schedule:'
        # import pprint
        # pprint.pprint (schedule)
        #
        pilot_cu_map = dict()
        unscheduled = list()

        pilot_ids = self.list_pilots()

        for unit in schedule['units'].keys():

            pid = schedule['units'][unit]

            if None == pid:
                unscheduled.append(unit)
                continue

            else:

                if pid not in pilot_ids:
                    raise RuntimeError("schedule points to unknown pilot %s" %
                                       pid)

                if pid not in pilot_cu_map:
                    pilot_cu_map[pid] = list()

                pilot_cu_map[pid].append(unit)

        # submit to all pilots which got something submitted to
        for pid in pilot_cu_map.keys():

            units_to_schedule = list()

            # if a kernel name is set in the cu description, do kernel expansion
            for unit in pilot_cu_map[pid]:

                if pid not in schedule['pilots']:
                    # lost pilot, do not schedule unit
                    logger.warn("unschedule unit %s, lost pilot %s" %
                                (unit.uid, pid))
                    continue

                unit.sandbox = schedule['pilots'][pid]['sandbox'] + "/" + str(
                    unit.uid)

                ud = unit.description

                if 'kernel' in ud and ud['kernel']:

                    try:
                        from radical.ensemblemd.mdkernels import MDTaskDescription
                    except Exception as ex:
                        logger.error ("Kernels are not supported in" \
                              "compute unit descriptions -- install " \
                              "radical.ensemblemd.mdkernels!")
                        # FIXME: unit needs a '_set_state() method or something!
                        self._session._dbs.set_compute_unit_state(
                            unit._uid, FAILED, ["kernel expansion failed"])
                        continue

                    pilot_resource = schedule['pilots'][pid]['resource']

                    mdtd = MDTaskDescription()
                    mdtd.kernel = ud.kernel
                    mdtd_bound = mdtd.bind(resource=pilot_resource)
                    ud.environment = mdtd_bound.environment
                    ud.pre_exec = mdtd_bound.pre_exec
                    ud.executable = mdtd_bound.executable
                    ud.mpi = mdtd_bound.mpi

                units_to_schedule.append(unit)

            if len(units_to_schedule):
                self._worker.schedule_compute_units(pilot_uid=pid,
                                                    units=units_to_schedule)

        # report any change in wait_queue_size
        old_wait_queue_size = self.wait_queue_size

        self.wait_queue_size = len(unscheduled)
        if old_wait_queue_size != self.wait_queue_size:
            self._worker.fire_manager_callback(WAIT_QUEUE_SIZE, self,
                                               self.wait_queue_size)

        if len(unscheduled):
            self._worker.unschedule_compute_units(units=unscheduled)

        logger.info('%s units remain unscheduled' % len(unscheduled))
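
handle_schedule consumes a mapping from unit to target pilot plus per-pilot metadata. Below is a hedged sketch of the structure implied by the accesses above (schedule['units'][unit], schedule['pilots'][pid]['sandbox'] and ['resource']); in the real scheduler the keys of 'units' are ComputeUnit objects rather than strings, and all ids and paths are placeholders.

schedule = {
    "units": {
        "unit.000000": "pilot.0000",   # assigned to a pilot
        "unit.000001": None,           # unscheduled -> goes to the wait queue
    },
    "pilots": {
        "pilot.0000": {
            "sandbox":  "sftp://cluster.example.org/scratch/pilot.0000",
            "resource": "local.localhost",
        },
    },
}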
Example #31
0
    def wait_pilots(self,
                    pilot_ids=None,
                    state=[DONE, FAILED, CANCELED],
                    timeout=None):
        """Returns when one or more :class:`radical.pilot.ComputePilots` reach a
        specific state or when an optional timeout is reached.

        If `pilot_ids` is `None`, `wait_pilots` returns when **all** Pilots
        reach the state defined in `state`.

        **Arguments:**

            * **pilot_ids** [`string` or `list of strings`]
              If pilot_ids is set, only the Pilots with the specified uids are
              considered. If pilot_ids is `None` (default), all Pilots are
              considered.

            * **state** [`list of strings`]
              The state(s) that Pilots have to reach in order for the call
              to return.

              By default `wait_pilots` waits for the Pilots to reach
              a **terminal** state, which can be one of the following:

              * :data:`radical.pilot.DONE`
              * :data:`radical.pilot.FAILED`
              * :data:`radical.pilot.CANCELED`

            * **timeout** [`float`]
              Optional timeout in seconds before the call returns regardless
              of whether the Pilots have reached the desired state or not.
              The default value **None** waits forever.

        **Raises:**

            * :class:`radical.pilot.PilotException`
        """
        self._assert_obj_is_valid()

        if not isinstance(state, list):
            state = [state]

        return_list_type = True
        if (not isinstance(pilot_ids, list)) and (pilot_ids is not None):
            return_list_type = False
            pilot_ids = [pilot_ids]

        start = time.time()
        all_ok = False
        states = list()

        while not all_ok:

            pilots = self._worker.get_compute_pilot_data(pilot_ids=pilot_ids)
            all_ok = True
            states = list()

            for pilot in pilots:
                if pilot['state'] not in state:
                    all_ok = False

                states.append(pilot['state'])

            # check timeout
            if (None != timeout) and (timeout <= (time.time() - start)):
                if not all_ok:
                    logger.debug("wait timed out: %s" % states)
                break

            # sleep a little if this cycle was idle
            if not all_ok:
                time.sleep(0.1)

        # done waiting
        if return_list_type:
            return states
        else:
            return states[0]
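
    # Hedged usage sketch (not part of the original source): how wait_pilots
    # might be called on a PilotManager; 'pmgr' and 'pilot' are assumed to be
    # an existing manager and a submitted pilot.
    #
    #     states = pmgr.wait_pilots(timeout=600)
    #     states = pmgr.wait_pilots(pilot_ids=[pilot.uid],
    #                               state=[DONE, FAILED, CANCELED])
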
    def check_pilot_states(self, pilot_col):

        pending_pilots = pilot_col.find(
            {"pilotmanager": self.pilot_manager_id, "state": {"$in": [PENDING_ACTIVE, ACTIVE]}}
        )

        for pending_pilot in pending_pilots:

            pilot_failed = False
            pilot_done = False
            reconnected = False
            pilot_id = pending_pilot["_id"]
            log_message = ""
            saga_job_id = pending_pilot["saga_job_id"]

            logger.info("Performing periodical health check for %s (SAGA job id %s)" % (str(pilot_id), saga_job_id))

            if not pilot_id in self.missing_pilots:
                self.missing_pilots[pilot_id] = 0

            # Create a job service object:
            try:
                js_url = saga_job_id.split("]-[")[0][1:]

                if js_url in self._shared_worker_data["job_services"]:
                    js = self._shared_worker_data["job_services"][js_url]
                else:
                    js = saga.job.Service(js_url, session=self._session)
                    self._shared_worker_data["job_services"][js_url] = js

                saga_job = js.get_job(saga_job_id)
                reconnected = True

                if saga_job.state in [saga.job.FAILED, saga.job.CANCELED]:
                    pilot_failed = True
                    log_message = "SAGA job state for ComputePilot %s is %s." % (pilot_id, saga_job.state)

                if saga_job.state in [saga.job.DONE]:
                    pilot_done = True
                    log_message = "SAGA job state for ComputePilot %s is %s." % (pilot_id, saga_job.state)

            except Exception as e:

                if not reconnected:
                    logger.warning("could not reconnect to pilot for state check (%s)" % e)
                    self.missing_pilots[pilot_id] += 1

                    if self.missing_pilots[pilot_id] >= JOB_CHECK_MAX_MISSES:
                        logger.debug("giving up after 10 attempts")
                        pilot_failed = True
                        log_message = "Could not reconnect to pilot %s " "multiple times - giving up" % pilot_id
                else:
                    logger.warning("pilot state check failed: %s" % e)
                    pilot_failed = True
                    log_message = (
                        "Couldn't determine job state for ComputePilot %s. " "Assuming it has failed." % pilot_id
                    )

            if pilot_failed:
                out, err, log = self._get_pilot_logs(pilot_col, pilot_id)
                ts = datetime.datetime.utcnow()
                pilot_col.update(
                    {"_id": pilot_id, "state": {"$ne": DONE}},
                    {
                        "$set": {"state": FAILED, "stdout": out, "stderr": err, "logfile": log},
                        "$push": {
                            "statehistory": {"state": FAILED, "timestamp": ts},
                            "log": {"message": log_message, "timestamp": ts},
                        },
                    },
                )
                logger.debug(log_message)
                logger.warn("pilot %s declared dead" % pilot_id)

            elif pilot_done:
                # FIXME: this should only be done if the state is not yet
                # done...
                out, err, log = self._get_pilot_logs(pilot_col, pilot_id)
                ts = datetime.datetime.utcnow()
                pilot_col.update(
                    {"_id": pilot_id, "state": {"$ne": DONE}},
                    {
                        "$set": {"state": DONE, "stdout": out, "stderr": err, "logfile": log},
                        "$push": {
                            "statehistory": {"state": DONE, "timestamp": ts},
                            "log": {"message": log_message, "timestamp": ts},
                        },
                    },
                )
                logger.debug(log_message)
                logger.warn("pilot %s declared dead" % pilot_id)

            else:
                if self.missing_pilots[pilot_id]:
                    logger.info("pilot %s *assumed* alive and well (%s)" % (pilot_id, self.missing_pilots[pilot_id]))
                else:
                    logger.info("pilot %s seems alive and well" % (pilot_id))