def stop(self):
    """Signal the worker to finish up and wait until it has terminated.

    Sets the internal stop event and then joins, so this call only returns
    once the worker has actually ended.
    """
    logger.debug("otransfer %s stopping" % self.name)

    # Raise the stop flag first, then block until the worker is gone.
    self._stop.set()
    self.join()

    logger.debug("otransfer %s stopped" % self.name)
def cancel_launcher(self):
    """Stop all pilot launcher threads of this worker.

    Every launcher in the pool is told to stop and is then joined before
    the next one is handled.
    """
    for launcher in self._pilot_launcher_worker_pool:
        logger.debug("pworker %s stops launcher %s" % (self.name, launcher.name))

        launcher.stop()
        launcher.join()

        logger.debug("pworker %s stopped launcher %s" % (self.name, launcher.name))
def __init__(self):
    """The constructor. Not meant to be called directly.

    Initializes all members to None; they are filled in later by whoever
    creates the object.
    """
    # 'static' members
    self._uid = self._name = self._description = self._manager = None

    # handle to the manager's worker
    self._worker = None

    # Optional garbage-collection tracing, enabled via the environment.
    if "RADICAL_PILOT_GCDEBUG" in os.environ:
        logger.debug("GCDEBUG __init__(): ComputeUnit [object id: %s]." % id(self))
def _get(pilot_manager_obj, pilot_ids):
    """PRIVATE: Get one or more pilots via their UIDs.

    The pilot documents are fetched through the manager's worker and each
    one is wrapped in a ComputePilot object bound to that manager.

    Returns a list of ComputePilot objects, one per document returned by
    the worker.
    """
    records = pilot_manager_obj._worker.get_compute_pilot_data(pilot_ids=pilot_ids)

    # create and return pilot objects
    handles = []
    for record in records:
        handle = ComputePilot()
        handle._uid = str(record['_id'])
        handle._description = record['description']
        handle._manager = pilot_manager_obj
        handle._worker = handle._manager._worker

        logger.debug("Reconnected to existing ComputePilot %s" % str(handle))
        handles.append(handle)

    return handles
def cancel(self):
    """Cancel the ComputeUnit.

    Depending on the unit's current state the cancellation is either
    recorded directly in the database (the unit is not running on a pilot)
    or sent as a cancel command to the pilot that executes the unit.
    Units which already reached a final state are left untouched.

    **Raises:**
        * BadParameter if this instance has no uid.
        * IncorrectState if the unit is in a state this method does not
          know how to cancel.
    """
    # Check if this instance is valid
    if not self._uid:
        raise BadParameter("Invalid Compute Unit instance.")

    # Read the state exactly once: it may change concurrently, and
    # re-reading it for every comparison could make us skip the matching
    # branch and end up in the 'unknown state' error.
    state = self.state

    if state in [DONE, FAILED, CANCELED]:
        # nothing to do
        logger.debug("Compute unit %s has state %s, can't cancel any longer." % (self._uid, state))
        return

    if state == EXECUTING:
        logger.debug("Compute unit %s has state %s, will terminate the task." % (self._uid, state))
        # The pilot id is only needed to address the cancel command, so
        # the unit document is only fetched on this path.
        cu_json = self._worker.get_compute_unit_data(self.uid)
        self._manager._session._dbs.send_command_to_pilot(
            cmd=COMMAND_CANCEL_COMPUTE_UNIT,
            arg=self.uid,
            pilot_ids=cu_json['pilot'])
        return

    # All remaining known states are handled the same way (mark the unit
    # as CANCELED in the database); only the log message differs.
    db_cancel_messages = [
        ([NEW, UNSCHEDULED, PENDING_INPUT_STAGING],
         "Compute unit %s has state %s, going to prevent from starting."),
        ([STAGING_INPUT],
         "Compute unit %s has state %s, will cancel the transfer."),
        ([PENDING_EXECUTION, SCHEDULING],
         "Compute unit %s has state %s, will abort start-up."),
        ([PENDING_OUTPUT_STAGING],
         "Compute unit %s has state %s, will abort the transfer."),
        ([STAGING_OUTPUT],
         "Compute unit %s has state %s, will cancel the transfer."),
    ]

    for states, message in db_cancel_messages:
        if state in states:
            logger.debug(message % (self._uid, state))
            self._manager._session._dbs.set_compute_unit_state(
                self._uid, CANCELED, ["Received Cancel"])
            # done canceling
            return

    raise IncorrectState("Unknown Compute Unit state: %s, cannot cancel" % state)
def run(self):
    """Starts the process when Process.start() is called.

    Main loop of the input file transfer worker. Until the stop event is
    set, it repeatedly:

      1. claims one ComputeUnit of this unit manager whose
         'FTW_Input_Status' is PENDING and moves it to STAGING_INPUT,
      2. creates the unit sandbox and copies every input staging
         directive via SAGA, marking each directive 'Done' afterwards,
      3. (first worker only) marks FTW / Agent input staging as DONE for
         units without pending or executing directives, and advances
         fully staged units to PENDING_EXECUTION.

    A unit whose transfer raises is set to FAILED. Any other error in the
    loop closes the session (without cleanup) and is re-raised. SystemExit
    is translated into an interrupt of the main thread.
    """

    # make sure to catch sys.exit (which raises SystemExit)
    try:

        logger.info("Starting InputFileTransferWorker")

        # Try to connect to the database and get the unit collection.
        try:
            connection = self.db_connection_info.get_db_handle()
            db = connection[self.db_connection_info.dbname]
            um_col = db["%s.cu" % self.db_connection_info.session_id]
            logger.debug("Connected to MongoDB. Serving requests for UnitManager %s." % self.unit_manager_id)

        except Exception as e:
            logger.exception("Connection error: %s" % e)
            raise

        try:
            while not self._stop.is_set():

                # See if we can find a ComputeUnit that is waiting for
                # input file transfer.
                ts = datetime.datetime.utcnow()
                compute_unit = um_col.find_and_modify(
                    query={"unitmanager": self.unit_manager_id,
                           "FTW_Input_Status": PENDING},
                    update={"$set": {"FTW_Input_Status": EXECUTING,
                                     "state": STAGING_INPUT},
                            "$push": {"statehistory": {"state": STAGING_INPUT, "timestamp": ts}}},
                    limit=BULK_LIMIT  # TODO: bulklimit is probably not the best way to ensure there is just one
                )
                # FIXME: AM: find_and_modify is not bulkable!
                state = STAGING_INPUT

                if compute_unit is None:
                    # Sleep a bit if no new units are available.
                    time.sleep(IDLE_TIME)

                else:
                    compute_unit_id = None
                    try:
                        log_messages = []

                        # We have found a new CU. Now we can process the
                        # transfer directive(s) with SAGA.
                        compute_unit_id = str(compute_unit["_id"])
                        remote_sandbox = compute_unit["sandbox"]
                        input_staging = compute_unit["FTW_Input_Directives"]

                        # We need to create the CU's directory in case it
                        # doesn't exist yet.
                        log_msg = "Creating ComputeUnit sandbox directory %s." % remote_sandbox
                        log_messages.append(log_msg)
                        logger.info(log_msg)

                        # Creating the sandbox directory. One directory
                        # handle per endpoint (sandbox URL with path '/')
                        # is cached in self._saga_dirs and reused.
                        try:
                            logger.debug("saga.fs.Directory ('%s')" % remote_sandbox)

                            remote_sandbox_keyurl = saga.Url(remote_sandbox)
                            remote_sandbox_keyurl.path = '/'
                            remote_sandbox_key = str(remote_sandbox_keyurl)

                            if remote_sandbox_key not in self._saga_dirs:
                                self._saga_dirs[remote_sandbox_key] = \
                                    saga.filesystem.Directory(remote_sandbox_key,
                                                              flags=saga.filesystem.CREATE_PARENTS,
                                                              session=self._session)

                            saga_dir = self._saga_dirs[remote_sandbox_key]
                            saga_dir.make_dir(remote_sandbox,
                                              flags=saga.filesystem.CREATE_PARENTS)

                        except Exception as e:
                            logger.exception('Error: %s' % e)
                            # FIXME: why is this exception ignored? AM

                        logger.info("Processing input file transfers for ComputeUnit %s" % compute_unit_id)

                        # Loop over all transfer directives and execute them.
                        for sd in input_staging:

                            # Re-check the unit state before every transfer
                            # so that a cancel request interrupts staging.
                            state_doc = um_col.find_one(
                                {"_id": compute_unit_id},
                                fields=["state"]
                            )
                            if state_doc['state'] == CANCELED:
                                logger.info("Compute Unit Canceled, interrupting input file transfers.")
                                state = CANCELED
                                break

                            abs_src = os.path.abspath(sd['source'])
                            input_file_url = saga.Url("file://localhost/%s" % abs_src)
                            if not sd['target']:
                                target = remote_sandbox
                            else:
                                target = "%s/%s" % (remote_sandbox, sd['target'])

                            log_msg = "Transferring input file %s -> %s" % (input_file_url, target)
                            log_messages.append(log_msg)
                            logger.debug(log_msg)

                            # Execute the transfer.
                            logger.debug("saga.fs.File ('%s')" % input_file_url)
                            input_file = saga.filesystem.File(
                                input_file_url,
                                session=self._session
                            )

                            if CREATE_PARENTS in sd['flags']:
                                copy_flags = saga.filesystem.CREATE_PARENTS
                            else:
                                copy_flags = 0

                            try:
                                input_file.copy(target, flags=copy_flags)
                            except Exception as e:
                                # A failed copy must not be reported as
                                # 'Done': log it and let the per-unit
                                # handler below mark the unit FAILED.
                                logger.exception(e)
                                raise
                            finally:
                                # Release the file handle on success and
                                # on failure alike.
                                input_file.close()

                            # The copy went fine, update the state of this
                            # StagingDirective to Done
                            um_col.find_and_modify(
                                query={"_id": compute_unit_id,
                                       'FTW_Input_Status': EXECUTING,
                                       'FTW_Input_Directives.state': PENDING,
                                       'FTW_Input_Directives.source': sd['source'],
                                       'FTW_Input_Directives.target': sd['target'],
                                       },
                                update={'$set': {'FTW_Input_Directives.$.state': 'Done'},
                                        '$push': {'log': {
                                            'timestamp': datetime.datetime.utcnow(),
                                            'message': log_msg}}
                                        }
                            )

                    except Exception as e:
                        # Update the CU's state 'FAILED'.
                        ts = datetime.datetime.utcnow()
                        logentry = {'message': "Input transfer failed: %s" % e,
                                    'timestamp': ts}

                        um_col.update({'_id': compute_unit_id}, {
                            '$set': {'state': FAILED},
                            '$push': {
                                'statehistory': {'state': FAILED, 'timestamp': ts},
                                'log': logentry
                            }
                        })

                        logger.exception(str(logentry))

                # Code below is only to be run by the "first" or only worker
                if self._worker_number > 1:
                    continue

                # If the CU was canceled we can skip the remainder of this loop.
                if state == CANCELED:
                    continue

                #
                # Check to see if there are more pending Directives, if not, we are Done
                #
                cursor_w = um_col.find({"unitmanager": self.unit_manager_id,
                                        "$or": [
                                            {"Agent_Input_Status": EXECUTING},
                                            {"FTW_Input_Status": EXECUTING}
                                        ]
                                        }
                                       )

                # Iterate over all the returned CUs (if any)
                for cu in cursor_w:

                    # See if there are any FTW Input Directives still pending
                    if cu['FTW_Input_Status'] == EXECUTING and \
                            not any(d['state'] == EXECUTING or d['state'] == PENDING
                                    for d in cu['FTW_Input_Directives']):
                        # All Input Directives for this FTW are done, mark the CU accordingly
                        um_col.update({"_id": cu["_id"]},
                                      {'$set': {'FTW_Input_Status': DONE},
                                       '$push': {'log': {
                                           'timestamp': datetime.datetime.utcnow(),
                                           'message': 'All FTW Input Staging Directives done - %d.' % self._worker_number}}
                                       }
                                      )

                    # See if there are any Agent Input Directives still pending or executing,
                    # if not, mark it DONE.
                    if cu['Agent_Input_Status'] == EXECUTING and \
                            not any(d['state'] == EXECUTING or d['state'] == PENDING
                                    for d in cu['Agent_Input_Directives']):
                        # All Input Directives for this Agent are done, mark the CU accordingly
                        um_col.update({"_id": cu["_id"]},
                                      {'$set': {'Agent_Input_Status': DONE},
                                       '$push': {'log': {
                                           'timestamp': datetime.datetime.utcnow(),
                                           'message': 'All Agent Input Staging Directives done - %d.' % self._worker_number}}
                                       }
                                      )

                #
                # Check for all CUs if both Agent and FTW staging is done, we can then mark the CU PendingExecution
                #
                ts = datetime.datetime.utcnow()
                um_col.find_and_modify(
                    query={"unitmanager": self.unit_manager_id,
                           "Agent_Input_Status": {"$in": [None, DONE]},
                           "FTW_Input_Status": {"$in": [None, DONE]},
                           "state": STAGING_INPUT
                           },
                    update={"$set": {
                                "state": PENDING_EXECUTION
                            },
                            "$push": {
                                "statehistory": {"state": PENDING_EXECUTION, "timestamp": ts}
                            }
                            }
                )

        except Exception as e:

            logger.exception("transfer worker error: %s" % e)
            self._session.close(cleanup=False)
            raise

    except SystemExit as e:
        logger.debug("input file transfer thread caught system exit -- forcing application shutdown")
        import thread
        thread.interrupt_main()
def run(self):
    """Starts the process when Process.start() is called.

    Main loop of the pilot launcher worker. Until the stop event is set,
    it repeatedly:

      1. every JOB_CHECK_INTERVAL seconds calls check_pilot_states(),
      2. claims one ComputePilot of this pilot manager in state
         PENDING_LAUNCH and moves it to LAUNCHING,
      3. stages the bootstrapper (plus sdists and CA certs if required)
         into the pilot sandbox, assembles the bootstrapper command line
         and submits it as a SAGA job,
      4. records the SAGA job id and moves the pilot to PENDING_ACTIVE,
         or to FAILED (with the collected logs) if anything raised.

    If the database connection cannot be established the method logs the
    error and returns. SystemExit is translated into an interrupt of the
    main thread.
    """

    # make sure to catch sys.exit (which raises SystemExit)
    try:
        # Get directory where this module lives
        mod_dir = os.path.dirname(os.path.realpath(__file__))

        # Try to connect to the database
        try:
            connection = self.db_connection_info.get_db_handle()
            db = connection[self.db_connection_info.dbname]
            pilot_col = db["%s.p" % self.db_connection_info.session_id]
            logger.debug("Connected to MongoDB. Serving requests for PilotManager %s." % self.pilot_manager_id)

        except Exception as e:
            logger.exception("Connection error: %s" % e)
            return

        last_job_check = time.time()

        while not self._stop.is_set():

            # Periodically, we pull up all ComputePilots that are pending
            # execution or were last seen executing and check if the
            # corresponding SAGA job is still pending in the queue. If
            # that is not the case, we assume that the job has failed for
            # some reasons and update the state of the ComputePilot
            # accordingly.
            if last_job_check + JOB_CHECK_INTERVAL < time.time():
                last_job_check = time.time()
                self.check_pilot_states(pilot_col)

            # See if we can find a ComputePilot that is waiting to be
            # launched. If we find one, we use SAGA to create a job
            # service, a job description and a job that is then sent to
            # the local or remote queueing system. If this succeeds, we
            # set the ComputePilot's state to pending, otherwise to failed.
            ts = datetime.datetime.utcnow()
            compute_pilot = pilot_col.find_and_modify(
                query={"pilotmanager": self.pilot_manager_id,
                       "state": PENDING_LAUNCH},
                update={
                    "$set": {"state": LAUNCHING},
                    "$push": {"statehistory": {"state": LAUNCHING, "timestamp": ts}},
                },
            )

            if not compute_pilot:
                time.sleep(IDLE_TIMER)

            else:
                try:
                    # ------------------------------------------------------
                    #
                    # LAUNCH THE PILOT AGENT VIA SAGA
                    #
                    logentries = []
                    pilot_id = str(compute_pilot["_id"])

                    logger.info("Launching ComputePilot %s" % pilot_id)

                    # ------------------------------------------------------
                    # Database connection parameters
                    session_uid = self.db_connection_info.session_id
                    database_url = self.db_connection_info.dburl
                    database_name = self.db_connection_info.dbname
                    database_auth = self.db_connection_info.dbauth

                    # ------------------------------------------------------
                    # pilot description and resource configuration
                    number_cores = compute_pilot["description"]["cores"]
                    runtime = compute_pilot["description"]["runtime"]
                    queue = compute_pilot["description"]["queue"]
                    project = compute_pilot["description"]["project"]
                    cleanup = compute_pilot["description"]["cleanup"]
                    resource_key = compute_pilot["description"]["resource"]
                    schema = compute_pilot["description"]["access_schema"]
                    memory = compute_pilot["description"]["memory"]

                    pilot_sandbox = compute_pilot["sandbox"]
                    global_sandbox = compute_pilot["global_sandbox"]

                    # we expand and exchange keys in the resource config,
                    # depending on the selected schema so better use a deep
                    # copy..
                    resource_cfg = self._session.get_resource_config(resource_key, schema)

                    # ------------------------------------------------------
                    # get parameters from cfg, set defaults where needed
                    agent_mongodb_endpoint = resource_cfg.get("agent_mongodb_endpoint", database_url)
                    agent_spawner = resource_cfg.get("agent_spawner", DEFAULT_AGENT_SPAWNER)
                    agent_type = resource_cfg.get("agent_type", DEFAULT_AGENT_TYPE)
                    agent_scheduler = resource_cfg.get("agent_scheduler")
                    tunnel_bind_device = resource_cfg.get("tunnel_bind_device")
                    default_queue = resource_cfg.get("default_queue")
                    forward_tunnel_endpoint = resource_cfg.get("forward_tunnel_endpoint")
                    js_endpoint = resource_cfg.get("job_manager_endpoint")
                    lrms = resource_cfg.get("lrms")
                    mpi_launch_method = resource_cfg.get("mpi_launch_method")
                    pre_bootstrap = resource_cfg.get("pre_bootstrap")
                    python_interpreter = resource_cfg.get("python_interpreter")
                    spmd_variation = resource_cfg.get("spmd_variation")
                    task_launch_method = resource_cfg.get("task_launch_method")
                    rp_version = resource_cfg.get("rp_version", DEFAULT_RP_VERSION)
                    virtenv_mode = resource_cfg.get("virtenv_mode", DEFAULT_VIRTENV_MODE)
                    virtenv = resource_cfg.get("virtenv", DEFAULT_VIRTENV)
                    stage_cacerts = resource_cfg.get("stage_cacerts", "False")

                    # Accept both the string form ("True"/"False") and a
                    # real boolean from the resource config.
                    stage_cacerts = (str(stage_cacerts).lower() == "true")

                    # expand variables in virtenv string
                    virtenv = virtenv % {
                        "pilot_sandbox": saga.Url(pilot_sandbox).path,
                        "global_sandbox": saga.Url(global_sandbox).path,
                    }

                    # Check for deprecated global_virtenv
                    global_virtenv = resource_cfg.get("global_virtenv")
                    if global_virtenv:
                        logger.warning("'global_virtenv' keyword is deprecated -- use 'virtenv' and 'virtenv_mode'")
                        virtenv = global_virtenv
                        virtenv_mode = "use"

                    # set default scheme, host, port and dbname if not set
                    db_url = saga.Url(agent_mongodb_endpoint)
                    if not db_url.scheme:
                        db_url.scheme = "mongodb"
                    if not db_url.host:
                        db_url.host = "localhost"
                    if not db_url.port:
                        db_url.port = 27017
                    if not database_name:
                        database_name = "radicalpilot"

                    # Create a host:port string for use by the bootstrapper.
                    database_hostport = "%s:%d" % (db_url.host, db_url.port)

                    # ------------------------------------------------------
                    # Copy the bootstrap shell script. This also creates
                    # the sandbox. We use always "default_bootstrapper.sh"
                    bootstrapper = "default_bootstrapper.sh"
                    bootstrapper_path = os.path.abspath("%s/../bootstrapper/%s" % (mod_dir, bootstrapper))

                    msg = "Using bootstrapper %s" % bootstrapper_path
                    logentries.append(Logentry(msg, logger=logger.info))

                    bs_script_url = saga.Url("file://localhost/%s" % bootstrapper_path)
                    bs_script_tgt = saga.Url("%s/pilot_bootstrapper.sh" % pilot_sandbox)

                    msg = "Copying bootstrapper '%s' to agent sandbox (%s)." % (bs_script_url, bs_script_tgt)
                    logentries.append(Logentry(msg, logger=logger.debug))

                    bs_script = saga.filesystem.File(bs_script_url, session=self._session)
                    bs_script.copy(bs_script_tgt, flags=saga.filesystem.CREATE_PARENTS)
                    bs_script.close()

                    # ------------------------------------------------------
                    # the version of the agent is derived from
                    # rp_version, which has the following format
                    # and interpretation:
                    #
                    # case rp_version:
                    #   @<token>:
                    #   @tag/@branch/@commit: # no sdist staging
                    #       git clone $github_base radical.pilot.src
                    #       (cd radical.pilot.src && git checkout token)
                    #       pip install -t $VIRTENV/rp_install/ radical.pilot.src
                    #       rm -rf radical.pilot.src
                    #       export PYTHONPATH=$VIRTENV/rp_install:$PYTHONPATH
                    #
                    #   release: # no sdist staging
                    #       pip install -t $VIRTENV/rp_install radical.pilot
                    #       export PYTHONPATH=$VIRTENV/rp_install:$PYTHONPATH
                    #
                    #   local: # needs sdist staging
                    #       tar zxf $sdist.tgz
                    #       pip install -t $VIRTENV/rp_install $sdist/
                    #       export PYTHONPATH=$VIRTENV/rp_install:$PYTHONPATH
                    #
                    #   debug: # needs sdist staging
                    #       tar zxf $sdist.tgz
                    #       pip install -t $SANDBOX/rp_install $sdist/
                    #       export PYTHONPATH=$SANDBOX/rp_install:$PYTHONPATH
                    #
                    #   installed: # no sdist staging
                    #       true
                    # esac
                    #
                    # virtenv_mode
                    #   private : error  if ve exists, otherwise create, then use
                    #   update  : update if ve exists, otherwise create, then use
                    #   create  : use    if ve exists, otherwise create, then use
                    #   use     : use    if ve exists, otherwise error,  then exit
                    #   recreate: delete if ve exists, otherwise create, then use
                    #
                    # examples :
                    #   virtenv@<tag>
                    #   virtenv@devel
                    #   virtenv@release
                    #   virtenv@installed
                    #   stage@local
                    #   stage@/tmp/my_agent.py
                    #
                    # Note that some combinations may be invalid,
                    # specifically in the context of virtenv_mode. If, for
                    # example, virtenv_mode is 'use', then the 'virtenv:tag'
                    # will not make sense, as the virtenv is not updated.
                    # In those cases, the virtenv_mode is honored, and
                    # a warning is printed.
                    #
                    # Also, the 'stage' mode can only be combined with the
                    # 'local' source, or with a path to the agent (relative
                    # to mod_dir, or absolute).
                    #
                    # A rp_version which does not adhere to the above
                    # syntax is rejected with a ValueError.
                    #
                    # NOTE(review): 'release' is described above and handled
                    # below, but is not in the accepted list and is therefore
                    # rejected here -- confirm whether that is intended.
                    if not rp_version.startswith("@") and \
                            rp_version not in ["installed", "local", "debug"]:
                        raise ValueError("invalid rp_version '%s'" % rp_version)

                    stage_sdist = True
                    if rp_version in ["installed", "release"]:
                        stage_sdist = False

                    if rp_version.startswith("@"):
                        stage_sdist = False
                        rp_version = rp_version[1:]  # strip '@'

                    # ------------------------------------------------------
                    # Copy the rp sdist if needed. We actually also stage
                    # the sdists for radical.utils and radical.saga, so that
                    # we have the complete stack to install...
                    if stage_sdist:

                        for path in [ru.sdist_path, saga.sdist_path, sdist_path]:

                            sdist_url = saga.Url("file://localhost/%s" % path)
                            msg = "Copying sdist '%s' to sdist sandbox (%s)." % (sdist_url, pilot_sandbox)
                            logentries.append(Logentry(msg, logger=logger.debug))

                            sdist_file = saga.filesystem.File(sdist_url)
                            sdist_file.copy("%s/" % (str(pilot_sandbox)))
                            sdist_file.close()

                    # ------------------------------------------------------
                    # some machines cannot run pip due to outdated ca certs.
                    # For those, we also stage an updated cert bundle
                    if stage_cacerts:
                        cc_path = os.path.abspath("%s/../bootstrapper/%s" % (mod_dir, "cacert.pem.gz"))

                        cc_script_url = saga.Url("file://localhost/%s" % cc_path)
                        cc_script_tgt = saga.Url("%s/cacert.pem.gz" % pilot_sandbox)

                        cc_script = saga.filesystem.File(cc_script_url, session=self._session)
                        cc_script.copy(cc_script_tgt, flags=saga.filesystem.CREATE_PARENTS)
                        cc_script.close()

                    # ------------------------------------------------------
                    # sanity checks
                    if not agent_spawner:
                        raise RuntimeError("missing agent spawner")
                    if not agent_scheduler:
                        raise RuntimeError("missing agent scheduler")
                    if not lrms:
                        raise RuntimeError("missing LRMS")
                    if not mpi_launch_method:
                        raise RuntimeError("missing mpi launch method")
                    if not task_launch_method:
                        raise RuntimeError("missing task launch method")

                    # massage some values
                    debug_level = os.environ.get("RADICAL_PILOT_AGENT_VERBOSE", logger.level)
                    try:
                        debug_level = int(debug_level)
                    except ValueError:
                        # Not a number: map a level name to the numeric
                        # value passed to the bootstrapper (0 if unknown).
                        debug_level = {
                            "CRITICAL": 1,
                            "ERROR": 2,
                            "WARNING": 3,
                            "WARN": 3,
                            "INFO": 4,
                            "DEBUG": 5,
                        }.get(debug_level, 0)

                    if not queue:
                        queue = default_queue

                    if cleanup and isinstance(cleanup, bool):
                        cleanup = "luve"  # l : log files
                                          # u : unit work dirs
                                          # v : virtualenv
                                          # e : everything (== pilot sandbox)

                    # we never cleanup virtenvs which are not private.
                    # String comparison must use '!=': an identity test
                    # against a literal does not compare the value. Only
                    # touch 'cleanup' if it is set, so that a disabled
                    # cleanup does not break the launch.
                    if cleanup and virtenv_mode != "private":
                        cleanup = cleanup.replace("v", "")

                    sdists = ":".join([ru.sdist_name, saga.sdist_name, sdist_name])

                    # set mandatory args
                    bootstrap_args = ""
                    bootstrap_args += " -b '%s'" % sdists
                    bootstrap_args += " -c '%s'" % number_cores
                    bootstrap_args += " -d '%s'" % debug_level
                    bootstrap_args += " -g '%s'" % virtenv
                    bootstrap_args += " -j '%s'" % task_launch_method
                    bootstrap_args += " -k '%s'" % mpi_launch_method
                    bootstrap_args += " -l '%s'" % lrms
                    bootstrap_args += " -m '%s'" % database_hostport
                    bootstrap_args += " -n '%s'" % database_name
                    bootstrap_args += " -o '%s'" % agent_spawner
                    bootstrap_args += " -p '%s'" % pilot_id
                    bootstrap_args += " -q '%s'" % agent_scheduler
                    bootstrap_args += " -r '%s'" % runtime
                    bootstrap_args += " -s '%s'" % session_uid
                    bootstrap_args += " -t '%s'" % agent_type
                    bootstrap_args += " -u '%s'" % virtenv_mode
                    bootstrap_args += " -v '%s'" % rp_version

                    # set optional args
                    if database_auth:
                        bootstrap_args += " -a '%s'" % database_auth
                    if tunnel_bind_device:
                        bootstrap_args += " -D '%s'" % tunnel_bind_device
                    if pre_bootstrap:
                        # one '-e' option per pre_bootstrap entry
                        bootstrap_args += " -e '%s'" % "' -e '".join(pre_bootstrap)
                    if forward_tunnel_endpoint:
                        bootstrap_args += " -f '%s'" % forward_tunnel_endpoint
                    if python_interpreter:
                        bootstrap_args += " -i '%s'" % python_interpreter
                    if cleanup:
                        bootstrap_args += " -x '%s'" % cleanup

                    # ------------------------------------------------------
                    # now that the script is in place and we know where it is,
                    # we can launch the agent
                    js_url = saga.Url(js_endpoint)
                    logger.debug("saga.job.Service ('%s')" % js_url)

                    # Job services are shared between workers and reused
                    # per endpoint.
                    if js_url in self._shared_worker_data["job_services"]:
                        js = self._shared_worker_data["job_services"][js_url]
                    else:
                        js = saga.job.Service(js_url, session=self._session)
                        self._shared_worker_data["job_services"][js_url] = js

                    # ------------------------------------------------------
                    # Create SAGA Job description and submit the pilot job
                    jd = saga.job.Description()

                    jd.executable = "/bin/bash"
                    jd.arguments = ["-l pilot_bootstrapper.sh", bootstrap_args]
                    jd.working_directory = saga.Url(pilot_sandbox).path
                    jd.project = project
                    jd.output = "agent.out"
                    jd.error = "agent.err"
                    jd.total_cpu_count = number_cores
                    jd.wall_time_limit = runtime
                    jd.total_physical_memory = memory
                    jd.queue = queue

                    # Set the SPMD variation only if required
                    if spmd_variation:
                        jd.spmd_variation = spmd_variation

                    if "RADICAL_PILOT_PROFILE" in os.environ:
                        jd.environment = {"RADICAL_PILOT_PROFILE": "TRUE"}

                    logger.debug("Bootstrap command line: %s %s" % (jd.executable, jd.arguments))

                    msg = "Submitting SAGA job with description: %s" % str(jd.as_dict())
                    logentries.append(Logentry(msg, logger=logger.debug))

                    pilotjob = js.create_job(jd)
                    pilotjob.run()

                    # do a quick error check
                    if pilotjob.state == saga.FAILED:
                        raise RuntimeError("SAGA Job state is FAILED.")

                    saga_job_id = pilotjob.id
                    self._shared_worker_data["job_ids"][pilot_id] = [saga_job_id, js_url]

                    msg = "SAGA job submitted with job id %s" % str(saga_job_id)
                    logentries.append(Logentry(msg, logger=logger.debug))

                    #
                    # ------------------------------------------------------

                    log_dicts = [le.as_dict() for le in logentries]

                    # Update the Pilot's state to 'PENDING_ACTIVE' if SAGA
                    # job submission was successful. The filter uses the
                    # same LAUNCHING constant that was written when the
                    # pilot was claimed above.
                    ts = datetime.datetime.utcnow()
                    ret = pilot_col.update(
                        {"_id": pilot_id, "state": LAUNCHING},
                        {
                            "$set": {"state": PENDING_ACTIVE, "saga_job_id": saga_job_id},
                            "$push": {"statehistory": {"state": PENDING_ACTIVE, "timestamp": ts}},
                            "$pushAll": {"log": log_dicts},
                        },
                    )

                    if ret["n"] == 0:
                        # could not update, probably because the agent is
                        # running already. Just update state history and
                        # jobid then
                        # FIXME: make sure of the agent state!
                        ret = pilot_col.update(
                            {"_id": pilot_id},
                            {
                                "$set": {"saga_job_id": saga_job_id},
                                "$push": {"statehistory": {"state": PENDING_ACTIVE, "timestamp": ts}},
                                "$pushAll": {"log": log_dicts},
                            },
                        )

                except Exception as e:
                    # Update the Pilot's state 'FAILED'.
                    out, err, log = self._get_pilot_logs(pilot_col, pilot_id)
                    ts = datetime.datetime.utcnow()

                    # FIXME: we seem to be unable to bson/json handle saga
                    # log messages containing an '#'. This shows up here.
                    # Until we find a clean workaround, make log shorter and
                    # rely on saga logging to reveal the problem.
                    msg = "Pilot launching failed! (%s)" % e
                    logentries.append(Logentry(msg))

                    log_dicts = list()
                    log_messages = list()
                    for le in logentries:
                        log_dicts.append(le.as_dict())
                        log_messages.append(le.message)

                    pilot_col.update(
                        {"_id": pilot_id, "state": {"$ne": FAILED}},
                        {
                            "$set": {"state": FAILED, "stdout": out, "stderr": err, "logfile": log},
                            "$push": {"statehistory": {"state": FAILED, "timestamp": ts}},
                            "$pushAll": {"log": log_dicts},
                        },
                    )
                    logger.exception("\n".join(log_messages))

    except SystemExit as e:
        logger.exception("pilot launcher thread caught system exit -- forcing application shutdown")
        import thread
        thread.interrupt_main()
def wait_units(self, unit_ids=None, state=[DONE, FAILED, CANCELED], timeout=None):
    """Returns when one or more :class:`radical.pilot.ComputeUnits` reach a
    specific state.

    If `unit_ids` is `None`, `wait_units` returns when **all**
    ComputeUnits reach the state defined in `state`.

    **Arguments:**

        * **unit_ids** [`string` or `list of strings`]
          If unit_ids is set, only the ComputeUnits with the specified
          uids are considered. If unit_ids is `None` (default), all
          ComputeUnits are considered.

        * **state** [`string` or `list`/`tuple`/`set` of strings]
          The state(s) that ComputeUnits have to reach in order for the
          call to return.

          By default `wait_units` waits for the ComputeUnits to reach
          a terminal state, which can be one of the following:

          * :data:`radical.pilot.DONE`
          * :data:`radical.pilot.FAILED`
          * :data:`radical.pilot.CANCELED`

        * **timeout** [`float`]
          Timeout in seconds before the call returns regardless of unit
          state changes. The default value **None** waits forever.

    **Returns:**

        * The list of unit states seen in the last polling cycle, or a
          single state if `unit_ids` was given as a single (non-list)
          value.

    **Raises:**

        * IncorrectState if this object has no uid.
    """
    if not self._uid:
        raise IncorrectState(msg="Invalid object instance.")

    # The default list is never mutated below. Other collections of states
    # are converted to a list; wrapping them as a single element would
    # never match any unit state and the call would wait forever.
    if isinstance(state, (tuple, set, frozenset)):
        state = list(state)
    elif not isinstance(state, list):
        state = [state]

    # A single (non-list) id yields a single state as return value.
    return_list_type = True
    if (not isinstance(unit_ids, list)) and (unit_ids is not None):
        return_list_type = False
        unit_ids = [unit_ids]

    units = self.get_units(unit_ids)
    start = time.time()
    states = list()

    while True:
        # Read every unit's state exactly once per cycle, so that the
        # returned states are the very ones the check below was based on.
        states = [unit.state for unit in units]

        if all(current in state for current in states):
            break

        # check timeout
        if (timeout is not None) and (timeout <= (time.time() - start)):
            logger.debug("wait timed out: %s" % states)
            break

        # sleep a little if this cycle was idle
        time.sleep(0.1)

    # done waiting
    if return_list_type:
        return states
    else:
        return states[0]
def close(self, cleanup=True, terminate=True, delete=None):
    """Closes the session.

    All pilot managers and unit managers attached to the session are
    closed. If cleanup is set to True (default) the session's database
    entry is destroyed as well.

    Calling close() on a session without uid only logs an error and
    returns; no exception is raised.

    **Arguments:**
        * **cleanup** (`bool`): Remove session from MongoDB (implies
          terminate).
        * **terminate** (`bool`): Shut down all pilots associated with
          the session.
        * **delete** (`bool`): Deprecated. If set while `cleanup` and
          `terminate` both still have their default value (True), it
          overrides both.
    """
    logger.debug("session %s closing" % (str(self._uid)))

    if not self._uid:
        logger.error("Session object already closed.")
        return

    # we keep 'delete' for backward compatibility. If it was set, and the
    # other flags (cleanup, terminate) are as defaulted (True), then delete
    # will supersede them. Delete is considered deprecated though, so a
    # warning is issued whenever it is used.
    if delete is not None:
        logger.warning("'delete' flag on session is deprecated. "
                       "Please use 'cleanup' and 'terminate' instead!")
        if cleanup == True and terminate == True:
            cleanup = delete
            terminate = delete

    if cleanup:
        # cleanup implies terminate
        terminate = True

    for pmgr in self._pilot_manager_objects:
        logger.debug("session %s closes pmgr %s" % (str(self._uid), pmgr._uid))
        pmgr.close(terminate=terminate)
        logger.debug("session %s closed pmgr %s" % (str(self._uid), pmgr._uid))

    for umgr in self._unit_manager_objects:
        logger.debug("session %s closes umgr %s" % (str(self._uid), umgr._uid))
        umgr.close()
        logger.debug("session %s closed umgr %s" % (str(self._uid), umgr._uid))

    if cleanup:
        self._destroy_db_entry()

    logger.debug("session %s closed" % (str(self._uid)))
def close(self, terminate=True):
    """Shuts down the PilotManager and its background workers in a
    coordinated fashion.

    The launcher threads are stopped first; then, if requested, all pilots
    are canceled and waited for; finally the worker is stopped and joined
    and the manager is removed from the session's process registry. After
    the call the manager's uid is reset to None.

    Calling close() on an already closed manager only logs an error.

    **Arguments:**
        * **terminate** [`bool`]: If set to True, all active pilots will
          get canceled (default: True).
    """
    logger.debug("pmgr %s closing" % (str(self._uid)))

    # Spit out a warning in case the object was already closed.
    if not self._uid:
        logger.error("PilotManager object already closed.")
        return

    # before we terminate pilots, we have to kill the pilot launcher threads
    # -- otherwise we'll run into continuous race conditions due to the
    # ongoing state checks...
    if self._worker is not None:
        # Stop the worker process
        logger.debug("pmgr %s cancel worker %s" % (str(self._uid), self._worker.name))
        self._worker.cancel_launcher()
        logger.debug("pmgr %s canceled worker %s" % (str(self._uid), self._worker.name))

    # If terminate is set, we cancel all pilots.
    if terminate:
        # cancel all pilots, make sure they are gone, and close the pilot
        # managers.
        for pilot in self.get_pilots():
            logger.debug("pmgr %s cancels pilot %s" % (str(self._uid), pilot._uid))
        self.cancel_pilots()

        # FIXME:
        #
        # wait_pilots() will wait until all pilots picked up the sent cancel
        # signal and died. However, that can take a very long time. For
        # example, if a pilot is in 'PENDING_ACTIVE' state, this will have
        # to wait until the pilot is bootstrapped, started, connected to the
        # DB, and shut down again. Or, for a pilot which just got a large
        # number of units, it will have to wait until the pilot started all
        # those units and then checks its command queue again. Or, if the
        # pilot job already died, wait will block until the state checker
        # kicks in and declares the pilot as dead, which takes a couple of
        # minutes.
        #
        # Solution would be to add a CANCELING state and to wait for that
        # one, too, which basically means to wait until the cancel signal
        # has been sent. There is not much more to do at this point anyway.
        # This is at the moment faked in the manager controller, which sets
        # that state after sending the cancel command. This should be
        # converted into a proper state -- that would, btw, remove the need
        # for a cancel command in the first place, as the pilot can just
        # pull its own state instead, and cancel on CANCELING...
        #
        # self.wait_pilots ()
        wait_for_cancel = True
        all_pilots = self.get_pilots()
        while wait_for_cancel:
            wait_for_cancel = False
            for pilot in all_pilots:
                logger.debug("pmgr %s wait for pilot %s (%s)" % (str(self._uid), pilot._uid, pilot.state))
                if pilot.state not in [DONE, FAILED, CANCELED, CANCELING]:
                    # At least one pilot is not done yet: pause, then
                    # restart the scan from the first pilot.
                    time.sleep(1)
                    wait_for_cancel = True
                    break

        for pilot in self.get_pilots():
            logger.debug("pmgr %s canceled pilot %s" % (str(self._uid), pilot._uid))

    # The worker may be None (see the guard above), so it is checked here
    # as well before it is dereferenced.
    if self._worker is not None:
        logger.debug("pmgr %s stops worker %s" % (str(self._uid), self._worker.name))
        self._worker.stop()
        self._worker.join()
        logger.debug("pmgr %s stopped worker %s" % (str(self._uid), self._worker.name))

    # Remove worker from registry
    self._session._process_registry.remove(self._uid)

    logger.debug("pmgr %s closed" % (str(self._uid)))
    self._uid = None
def register_start_pilot_request(self, pilot, resource_config):
    """Register a new pilot start request with the worker.

    A new pilot uid is generated, the remote working directory is
    determined (expanding shell variables on the remote side if needed),
    the pilot is inserted into the database and an entry for it is added
    to the shared data store.

    **Arguments:**
        * **pilot**: the pilot facade object; its `description` (and
          `description.sandbox`) are read, and a weak reference to it is
          stored.
        * **resource_config**: mapping with at least the key
          'filesystem_endpoint'; 'default_remote_workdir' is optional and
          defaults to "$PWD".

    **Returns:**
        * the pilot uid as returned by the database layer.

    **Raises:**
        * Exception if the remote working directory cannot be determined.
    """
    # create a new UID for the pilot
    pilot_uid = ru.generate_id('pilot')

    # switch endpoint type
    filesystem_endpoint = resource_config['filesystem_endpoint']

    fs = saga.Url(filesystem_endpoint)

    # get the home directory on the remote machine.
    # Note that this will only work for (gsi)ssh or shell based access
    # mechanisms (FIXME)
    import saga.utils.pty_shell as sup

    if fs.port is not None:
        url = "%s://%s:%d/" % (fs.schema, fs.host, fs.port)
    else:
        url = "%s://%s/" % (fs.schema, fs.host)

    logger.debug("saga.utils.PTYShell ('%s')" % url)
    shell = sup.PTYShell(url, self._session, logger)

    if pilot.description.sandbox:
        workdir_raw = pilot.description.sandbox
    else:
        workdir_raw = resource_config.get('default_remote_workdir', "$PWD")

    if '$' in workdir_raw or '`' in workdir_raw:
        # The path contains shell variables or command substitution: let
        # the remote shell expand it and echo it back behind a marker.
        ret, out, err = shell.run_sync(' echo "WORKDIR: %s"' % workdir_raw)
        if ret == 0 and 'WORKDIR:' in out:
            # Take the first line after the marker. Splitting on the
            # marker (instead of on every ':') keeps paths which contain
            # a colon intact and ignores any output before the marker.
            after_marker = out.split("WORKDIR:", 1)[1]
            workdir_expanded = after_marker.strip().split("\n", 1)[0].strip()
            logger.debug("Determined remote working directory for %s: '%s'" % (url, workdir_expanded))
        else:
            error_msg = "Couldn't determine remote working directory."
            logger.error(error_msg)
            raise Exception(error_msg)
    else:
        workdir_expanded = workdir_raw

    # At this point we have determined 'pwd'
    fs.path = "%s/radical.pilot.sandbox" % workdir_expanded

    # This is the base URL / 'sandbox' for the pilot!
    agent_dir_url = saga.Url("%s/%s-%s/" % (str(fs), self._session.uid, pilot_uid))

    # Create a database entry for the new pilot.
    pilot_uid, pilot_json = self._db.insert_pilot(
        pilot_uid=pilot_uid,
        pilot_manager_uid=self._pm_id,
        pilot_description=pilot.description,
        pilot_sandbox=str(agent_dir_url),
        global_sandbox=str(fs.path)
    )

    # Create a shared data store entry. Only a weak reference to the
    # facade object is kept.
    self._shared_data[pilot_uid] = {
        'data': pilot_json,
        'callbacks': [],
        'facade_object': weakref.ref(pilot)
    }

    return pilot_uid
def run(self):
    """Starts the process when Process.start() is called.

    Main loop of the pilot launcher: until the stop event is set it
      * periodically calls `check_pilot_states()` on the pilot collection,
      * atomically picks one pilot in state PENDING_LAUNCH, moves it to
        LAUNCHING, stages the bootstrapper (and sdists / CA certs where
        needed) into the pilot sandbox and submits the pilot job via SAGA.

    On successful submission the pilot is moved to PENDING_ACTIVE; on any
    exception during launch the pilot is marked FAILED and the collected
    log entries are stored with it.  A SystemExit raised inside the thread
    is turned into an interrupt of the main thread.
    """
    # make sure to catch sys.exit (which raises SystemExit)
    try:
        # Get directory where this module lives
        mod_dir = os.path.dirname(os.path.realpath(__file__))

        # Try to connect to the database
        try:
            connection = self.db_connection_info.get_db_handle()
            db = connection[self.db_connection_info.dbname]
            pilot_col = db["%s.p" % self.db_connection_info.session_id]
            logger.debug(
                "Connected to MongoDB. Serving requests for PilotManager %s."
                % self.pilot_manager_id)
        except Exception as e:
            logger.exception("Connection error: %s" % e)
            return

        last_job_check = time.time()

        while not self._stop.is_set():

            # Periodically, we pull up all ComputePilots that are pending
            # execution or were last seen executing and check if the
            # corresponding SAGA job is still pending in the queue. If that
            # is not the case, we assume that the job has failed for some
            # reasons and update the state of the ComputePilot accordingly.
            if last_job_check + JOB_CHECK_INTERVAL < time.time():
                last_job_check = time.time()
                self.check_pilot_states(pilot_col)

            # See if we can find a ComputePilot that is waiting to be
            # launched. If we find one, we use SAGA to create a job service,
            # a job description and a job that is then sent to the local or
            # remote queueing system. If this succeeds, we set the
            # ComputePilot's state to pending, otherwise to failed.
            ts = datetime.datetime.utcnow()
            compute_pilot = pilot_col.find_and_modify(
                query={"pilotmanager": self.pilot_manager_id,
                       "state": PENDING_LAUNCH},
                update={"$set": {"state": LAUNCHING},
                        "$push": {"statehistory": {"state": LAUNCHING,
                                                   "timestamp": ts}}})

            if not compute_pilot:
                time.sleep(IDLE_TIMER)
                continue

            try:
                # ------------------------------------------------------
                #
                # LAUNCH THE PILOT AGENT VIA SAGA
                #
                logentries = []
                pilot_id = str(compute_pilot["_id"])

                logger.info("Launching ComputePilot %s" % pilot_id)

                # ------------------------------------------------------
                # Database connection parameters
                session_uid = self.db_connection_info.session_id
                database_url = self.db_connection_info.dburl
                database_name = self.db_connection_info.dbname
                database_auth = self.db_connection_info.dbauth

                # ------------------------------------------------------
                # pilot description and resource configuration
                description = compute_pilot['description']
                number_cores = description['cores']
                runtime = description['runtime']
                queue = description['queue']
                project = description['project']
                cleanup = description['cleanup']
                resource_key = description['resource']
                schema = description['access_schema']
                memory = description['memory']

                pilot_sandbox = compute_pilot['sandbox']
                global_sandbox = compute_pilot['global_sandbox']

                # we expand and exchange keys in the resource config,
                # depending on the selected schema so better use a deep
                # copy..
                resource_cfg = self._session.get_resource_config(
                    resource_key, schema)

                # ------------------------------------------------------
                # get parameters from cfg, set defaults where needed
                agent_mongodb_endpoint = resource_cfg.get(
                    'agent_mongodb_endpoint', database_url)
                agent_spawner = resource_cfg.get(
                    'agent_spawner', DEFAULT_AGENT_SPAWNER)
                agent_type = resource_cfg.get(
                    'agent_type', DEFAULT_AGENT_TYPE)
                agent_scheduler = resource_cfg.get('agent_scheduler')
                tunnel_bind_device = resource_cfg.get('tunnel_bind_device')
                default_queue = resource_cfg.get('default_queue')
                forward_tunnel_endpoint = resource_cfg.get(
                    'forward_tunnel_endpoint')
                js_endpoint = resource_cfg.get('job_manager_endpoint')
                lrms = resource_cfg.get('lrms')
                mpi_launch_method = resource_cfg.get('mpi_launch_method')
                pre_bootstrap = resource_cfg.get('pre_bootstrap')
                python_interpreter = resource_cfg.get('python_interpreter')
                spmd_variation = resource_cfg.get('spmd_variation')
                task_launch_method = resource_cfg.get('task_launch_method')
                rp_version = resource_cfg.get(
                    'rp_version', DEFAULT_RP_VERSION)
                virtenv_mode = resource_cfg.get(
                    'virtenv_mode', DEFAULT_VIRTENV_MODE)
                virtenv = resource_cfg.get('virtenv', DEFAULT_VIRTENV)

                # The flag is expected as a string ('True' / 'False');
                # str() keeps this working if the config holds a real bool.
                stage_cacerts = resource_cfg.get('stage_cacerts', 'False')
                stage_cacerts = (str(stage_cacerts).lower() == 'true')

                # expand variables in virtenv string
                virtenv = virtenv % {
                    'pilot_sandbox': saga.Url(pilot_sandbox).path,
                    'global_sandbox': saga.Url(global_sandbox).path
                }

                # Check for deprecated global_virtenv
                global_virtenv = resource_cfg.get('global_virtenv')
                if global_virtenv:
                    logger.warn(
                        "'global_virtenv' keyword is deprecated -- use 'virtenv' and 'virtenv_mode'"
                    )
                    virtenv = global_virtenv
                    virtenv_mode = 'use'

                # set default scheme, host, port and dbname if not set
                db_url = saga.Url(agent_mongodb_endpoint)
                if not db_url.scheme:
                    db_url.scheme = 'mongodb'
                if not db_url.host:
                    db_url.host = 'localhost'
                if not db_url.port:
                    db_url.port = 27017
                if not database_name:
                    database_name = 'radicalpilot'

                # Create a host:port string for use by the bootstrapper.
                database_hostport = "%s:%d" % (db_url.host, db_url.port)

                # ------------------------------------------------------
                # Copy the bootstrap shell script.  This also creates
                # the sandbox. We use always "default_bootstrapper.sh"
                bootstrapper = 'default_bootstrapper.sh'
                bootstrapper_path = os.path.abspath(
                    "%s/../bootstrapper/%s" % (mod_dir, bootstrapper))

                msg = "Using bootstrapper %s" % bootstrapper_path
                logentries.append(Logentry(msg, logger=logger.info))

                bs_script_url = saga.Url(
                    "file://localhost/%s" % bootstrapper_path)
                bs_script_tgt = saga.Url(
                    "%s/pilot_bootstrapper.sh" % pilot_sandbox)

                msg = "Copying bootstrapper '%s' to agent sandbox (%s)." \
                    % (bs_script_url, bs_script_tgt)
                logentries.append(Logentry(msg, logger=logger.debug))

                bs_script = saga.filesystem.File(
                    bs_script_url, session=self._session)
                bs_script.copy(
                    bs_script_tgt, flags=saga.filesystem.CREATE_PARENTS)
                bs_script.close()

                # ------------------------------------------------------
                # the version of the agent is derived from
                # rp_version, which has the following format
                # and interpretation:
                #
                # case rp_version:
                #   @<token>:
                #   @tag/@branch/@commit: # no sdist staging
                #       git clone $github_base radical.pilot.src
                #       (cd radical.pilot.src && git checkout token)
                #       pip install -t $VIRTENV/rp_install/ radical.pilot.src
                #       rm -rf radical.pilot.src
                #       export PYTHONPATH=$VIRTENV/rp_install:$PYTHONPATH
                #
                #   release: # no sdist staging
                #       pip install -t $VIRTENV/rp_install radical.pilot
                #       export PYTHONPATH=$VIRTENV/rp_install:$PYTHONPATH
                #
                #   local: # needs sdist staging
                #       tar zxf $sdist.tgz
                #       pip install -t $VIRTENV/rp_install $sdist/
                #       export PYTHONPATH=$VIRTENV/rp_install:$PYTHONPATH
                #
                #   debug: # needs sdist staging
                #       tar zxf $sdist.tgz
                #       pip install -t $SANDBOX/rp_install $sdist/
                #       export PYTHONPATH=$SANDBOX/rp_install:$PYTHONPATH
                #
                #   installed: # no sdist staging
                #       true
                # esac
                #
                # virtenv_mode
                #   private : error  if ve exists, otherwise create, then use
                #   update  : update if ve exists, otherwise create, then use
                #   create  : use    if ve exists, otherwise create, then use
                #   use     : use    if ve exists, otherwise error,  then exit
                #   recreate: delete if ve exists, otherwise create, then use
                #
                # examples :
                #   virtenv@<tag>
                #   virtenv@devel
                #   virtenv@release
                #   virtenv@installed
                #   stage@local
                #   stage@/tmp/my_agent.py
                #
                # Note that some combinations may be invalid,
                # specifically in the context of virtenv_mode.  If, for
                # example, virtenv_mode is 'use', then the 'virtenv:tag'
                # will not make sense, as the virtenv is not updated.
                # In those cases, the virtenv_mode is honored, and
                # a warning is printed.
                #
                # Also, the 'stage' mode can only be combined with the
                # 'local' source, or with a path to the agent (relative
                # to mod_dir, or absolute).
                #
                # NOTE(review): the text above lists 'release' as a valid
                # value, but the check below rejects it -- confirm which
                # one is intended.
                if not rp_version.startswith('@') and \
                        rp_version not in ['installed', 'local', 'debug']:
                    raise ValueError("invalid rp_version '%s'" % rp_version)

                stage_sdist = True
                if rp_version in ['installed', 'release']:
                    stage_sdist = False

                if rp_version.startswith('@'):
                    stage_sdist = False
                    rp_version = rp_version[1:]  # strip '@'

                # ------------------------------------------------------
                # Copy the rp sdist if needed.  We actually also stage
                # the sdists for radical.utils and radical.saga, so that
                # we have the complete stack to install...
                if stage_sdist:
                    for path in [ru.sdist_path, saga.sdist_path, sdist_path]:
                        sdist_url = saga.Url("file://localhost/%s" % path)
                        msg = "Copying sdist '%s' to sdist sandbox (%s)." % (
                            sdist_url, pilot_sandbox)
                        logentries.append(
                            Logentry(msg, logger=logger.debug))

                        sdist_file = saga.filesystem.File(sdist_url)
                        sdist_file.copy("%s/" % (str(pilot_sandbox)))
                        sdist_file.close()

                # ------------------------------------------------------
                # some machines cannot run pip due to outdated ca certs.
                # For those, we also stage an updated cert bundle
                if stage_cacerts:
                    cc_path = os.path.abspath(
                        "%s/../bootstrapper/%s" % (mod_dir, 'cacert.pem.gz'))

                    cc_script_url = saga.Url("file://localhost/%s" % cc_path)
                    cc_script_tgt = saga.Url(
                        "%s/cacert.pem.gz" % pilot_sandbox)

                    cc_script = saga.filesystem.File(
                        cc_script_url, session=self._session)
                    cc_script.copy(
                        cc_script_tgt, flags=saga.filesystem.CREATE_PARENTS)
                    cc_script.close()

                # ------------------------------------------------------
                # sanity checks
                if not agent_spawner:
                    raise RuntimeError("missing agent spawner")
                if not agent_scheduler:
                    raise RuntimeError("missing agent scheduler")
                if not lrms:
                    raise RuntimeError("missing LRMS")
                if not mpi_launch_method:
                    raise RuntimeError("missing mpi launch method")
                if not task_launch_method:
                    raise RuntimeError("missing task launch method")

                # massage some values
                debug_level = os.environ.get(
                    'RADICAL_PILOT_AGENT_VERBOSE', logger.level)
                try:
                    debug_level = int(debug_level)
                except ValueError:
                    debug_level = {
                        'CRITICAL': 1,
                        'ERROR': 2,
                        'WARNING': 3,
                        'WARN': 3,
                        'INFO': 4,
                        'DEBUG': 5
                    }.get(debug_level, 0)

                if not queue:
                    queue = default_queue

                if cleanup and isinstance(cleanup, bool):
                    cleanup = 'luve'  # l : log files
                                      # u : unit work dirs
                                      # v : virtualenv
                                      # e : everything (== pilot sandbox)

                # We never cleanup virtenvs which are not private.  The
                # mode is compared by value (not identity), and a disabled
                # cleanup (False / None) is left untouched.
                if cleanup and virtenv_mode != 'private':
                    cleanup = cleanup.replace('v', '')

                sdists = ':'.join(
                    [ru.sdist_name, saga.sdist_name, sdist_name])

                # set mandatory args
                bootstrap_args = ""
                bootstrap_args += " -b '%s'" % sdists
                bootstrap_args += " -c '%s'" % number_cores
                bootstrap_args += " -d '%s'" % debug_level
                bootstrap_args += " -g '%s'" % virtenv
                bootstrap_args += " -j '%s'" % task_launch_method
                bootstrap_args += " -k '%s'" % mpi_launch_method
                bootstrap_args += " -l '%s'" % lrms
                bootstrap_args += " -m '%s'" % database_hostport
                bootstrap_args += " -n '%s'" % database_name
                bootstrap_args += " -o '%s'" % agent_spawner
                bootstrap_args += " -p '%s'" % pilot_id
                bootstrap_args += " -q '%s'" % agent_scheduler
                bootstrap_args += " -r '%s'" % runtime
                bootstrap_args += " -s '%s'" % session_uid
                bootstrap_args += " -t '%s'" % agent_type
                bootstrap_args += " -u '%s'" % virtenv_mode
                bootstrap_args += " -v '%s'" % rp_version

                # set optional args
                if database_auth:
                    bootstrap_args += " -a '%s'" % database_auth
                if tunnel_bind_device:
                    bootstrap_args += " -D '%s'" % tunnel_bind_device
                if pre_bootstrap:
                    bootstrap_args += " -e '%s'" % "' -e '".join(
                        pre_bootstrap)
                if forward_tunnel_endpoint:
                    bootstrap_args += " -f '%s'" % forward_tunnel_endpoint
                if python_interpreter:
                    bootstrap_args += " -i '%s'" % python_interpreter
                if cleanup:
                    bootstrap_args += " -x '%s'" % cleanup

                # ------------------------------------------------------
                # now that the script is in place and we know where it is,
                # we can launch the agent
                js_url = saga.Url(js_endpoint)
                logger.debug("saga.job.Service ('%s')" % js_url)

                # job services are cached per endpoint and re-used
                job_services = self._shared_worker_data['job_services']
                if js_url in job_services:
                    js = job_services[js_url]
                else:
                    js = saga.job.Service(js_url, session=self._session)
                    job_services[js_url] = js

                # ------------------------------------------------------
                # Create SAGA Job description and submit the pilot job
                jd = saga.job.Description()

                jd.executable = "/bin/bash"
                jd.arguments = ["-l pilot_bootstrapper.sh", bootstrap_args]
                jd.working_directory = saga.Url(pilot_sandbox).path
                jd.project = project
                jd.output = "agent.out"
                jd.error = "agent.err"
                jd.total_cpu_count = number_cores
                jd.wall_time_limit = runtime
                jd.total_physical_memory = memory
                jd.queue = queue

                # Set the SPMD variation only if required
                if spmd_variation:
                    jd.spmd_variation = spmd_variation

                if 'RADICAL_PILOT_PROFILE' in os.environ:
                    jd.environment = {'RADICAL_PILOT_PROFILE': 'TRUE'}

                logger.debug("Bootstrap command line: %s %s"
                             % (jd.executable, jd.arguments))

                msg = "Submitting SAGA job with description: %s" % str(
                    jd.as_dict())
                logentries.append(Logentry(msg, logger=logger.debug))

                pilotjob = js.create_job(jd)
                pilotjob.run()

                # do a quick error check
                if pilotjob.state == saga.FAILED:
                    raise RuntimeError("SAGA Job state is FAILED.")

                saga_job_id = pilotjob.id
                self._shared_worker_data['job_ids'][pilot_id] = [
                    saga_job_id, js_url
                ]

                msg = "SAGA job submitted with job id %s" % str(saga_job_id)
                logentries.append(Logentry(msg, logger=logger.debug))

                #
                # ------------------------------------------------------

                log_dicts = [le.as_dict() for le in logentries]

                # Update the Pilot's state to 'PENDING_ACTIVE' if SAGA job
                # submission was successful.  The filter uses the same
                # LAUNCHING constant that find_and_modify() set above.
                ts = datetime.datetime.utcnow()
                ret = pilot_col.update(
                    {"_id": pilot_id,
                     "state": LAUNCHING},
                    {"$set": {"state": PENDING_ACTIVE,
                              "saga_job_id": saga_job_id},
                     "$push": {"statehistory": {"state": PENDING_ACTIVE,
                                                "timestamp": ts}},
                     "$pushAll": {"log": log_dicts}})

                if ret['n'] == 0:
                    # could not update, probably because the agent is
                    # running already.  Just update state history and
                    # jobid then
                    # FIXME: make sure of the agent state!
                    ret = pilot_col.update(
                        {"_id": pilot_id},
                        {"$set": {"saga_job_id": saga_job_id},
                         "$push": {"statehistory": {"state": PENDING_ACTIVE,
                                                    "timestamp": ts}},
                         "$pushAll": {"log": log_dicts}})

            except Exception as e:
                # Update the Pilot's state 'FAILED'.
                out, err, log = self._get_pilot_logs(pilot_col, pilot_id)
                ts = datetime.datetime.utcnow()

                # FIXME: we seem to be unable to bson/json handle saga
                # log messages containing an '#'.  This shows up here.
                # Until we find a clean workaround, make log shorter and
                # rely on saga logging to reveal the problem.
                msg = "Pilot launching failed! (%s)" % e
                logentries.append(Logentry(msg))

                log_dicts = list()
                log_messages = list()
                for le in logentries:
                    log_dicts.append(le.as_dict())
                    log_messages.append(le.message)

                pilot_col.update(
                    {"_id": pilot_id,
                     "state": {"$ne": FAILED}},
                    {"$set": {"state": FAILED,
                              "stdout": out,
                              "stderr": err,
                              "logfile": log},
                     "$push": {"statehistory": {"state": FAILED,
                                                "timestamp": ts}},
                     "$pushAll": {"log": log_dicts}})
                logger.exception('\n'.join(log_messages))

    except SystemExit:
        logger.exception(
            "pilot launcher thread caught system exit -- forcing application shutdown"
        )
        import thread
        thread.interrupt_main()
def close(self, terminate=True):
    """Shuts down the PilotManager and its background workers in a
    coordinated fashion.

    **Arguments:**
        * **terminate** [`bool`]: If set to True, all active pilots will
          get canceled (default: True).

    Calling `close()` on an already closed manager only logs an error.
    """
    logger.debug("pmgr %s closing" % (str(self._uid)))

    # Spit out a warning in case the object was already closed.
    if not self._uid:
        logger.error("PilotManager object already closed.")
        return

    # before we terminate pilots, we have to kill the pilot launcher threads
    # -- otherwise we'll run into continuous race conditions due to the
    # ongoing state checks...
    if self._worker is not None:
        # Stop the worker process
        logger.debug("pmgr %s cancel worker %s"
                     % (str(self._uid), self._worker.name))
        self._worker.cancel_launcher()
        logger.debug("pmgr %s canceled worker %s"
                     % (str(self._uid), self._worker.name))

    # If terminate is set, we cancel all pilots.
    if terminate:
        # cancel all pilots, make sure they are gone, and close the pilot
        # managers.
        for pilot in self.get_pilots():
            logger.debug("pmgr %s cancels pilot %s"
                         % (str(self._uid), pilot._uid))
        self.cancel_pilots()

        # FIXME:
        #
        # wait_pilots() will wait until all pilots picked up the sent cancel
        # signal and died.  However, that can take a very long time.  For
        # example, if a pilot is in 'PENDING_ACTIVE' state, this will have
        # to wait until the pilot is bootstrapped, started, connected to the
        # DB, and shut down again.  Or, for a pilot which just got a large
        # number of units, it will have to wait until the pilot started all
        # those units and then checks its command queue again.  Or, if the
        # pilot job already died, wait will block until the state checker
        # kicks in and declares the pilot as dead, which takes a couple of
        # minutes.
        #
        # Solution would be to add a CANCELING state and to wait for that
        # one, too, which basically means to wait until the cancel signal
        # has been sent.  There is not much more to do at this point anyway.
        # This is at the moment faked in the manager controller, which sets
        # that state after sending the cancel command.  This should be
        # converted into a proper state -- that would, btw, remove the need
        # for a cancel command in the first place, as the pilot can just
        # pull its own state instead, and cancel on CANCELING...
        #
        # self.wait_pilots ()
        wait_for_cancel = True
        all_pilots = self.get_pilots()
        while wait_for_cancel:
            wait_for_cancel = False
            for pilot in all_pilots:
                logger.debug("pmgr %s wait for pilot %s (%s)"
                             % (str(self._uid), pilot._uid, pilot.state))
                if pilot.state not in [DONE, FAILED, CANCELED, CANCELING]:
                    # at least one pilot is still alive: sleep, then
                    # restart the scan from the first pilot
                    time.sleep(1)
                    wait_for_cancel = True
                    break

        for pilot in self.get_pilots():
            logger.debug("pmgr %s canceled pilot %s"
                         % (str(self._uid), pilot._uid))

    # The worker may be unset (see the guard above), so only stop and join
    # it when it exists.
    if self._worker is not None:
        logger.debug("pmgr %s stops worker %s"
                     % (str(self._uid), self._worker.name))
        self._worker.stop()
        self._worker.join()
        logger.debug("pmgr %s stopped worker %s"
                     % (str(self._uid), self._worker.name))

    # Remove worker from registry
    self._session._process_registry.remove(self._uid)

    logger.debug("pmgr %s closed" % (str(self._uid)))
    self._uid = None
def run(self):
    """Main loop of the unit manager controller thread.

    Until the stop event is set, the loop pulls all compute units of this
    unit manager from the database, refreshes the cached copy in the
    shared data store and fires the unit state callbacks for every unit
    whose state differs from the cached one.  After the first full pass
    the `_initialized` event is set.  Idle cycles sleep for IDLE_TIME.

    On exit (regular or not) all input / output file transfer workers are
    stopped.  A SystemExit raised inside the thread is turned into an
    interrupt of the main thread.
    """
    # make sure to catch sys.exit (which raises SystemExit)
    try:
        logger.debug(
            "Worker thread (ID: %s[%s]) for UnitManager %s started."
            % (self.name, self.ident, self._um_id))

        while not self._stop.is_set():

            # =============================================================
            #
            # Check and update units. This needs to be optimized at
            # some point, i.e., state pulling should be conditional
            # or triggered by a tailable MongoDB cursor, etc.
            unit_list = self._db.get_compute_units(
                unit_manager_id=self._um_id)
            action = False

            for unit in unit_list:
                unit_id = str(unit["_id"])
                new_state = unit["state"]

                # Look up and refresh the cache entry under a single lock
                # hold; the context manager releases the lock even if an
                # exception is raised in between.
                with self._shared_data_lock:
                    if unit_id in self._shared_data:
                        old_state = \
                            self._shared_data[unit_id]["data"]["state"]
                    else:
                        old_state = None
                        self._shared_data[unit_id] = {
                            'data': unit,
                            'callbacks': [],
                            'facade_object': None
                        }
                    self._shared_data[unit_id]["data"] = unit

                if new_state != old_state:
                    # On a state change, we fire the callbacks.
                    logger.info(
                        "RUN ComputeUnit '%s' state changed from '%s' to '%s'."
                        % (unit_id, old_state, new_state))

                    # The state of the unit has changed, We call all
                    # unit-level callbacks to propagate this.
                    self.call_unit_state_callbacks(unit_id, new_state)

                    action = True

            # After the first iteration, we are officially initialized!
            if not self._initialized.is_set():
                self._initialized.set()

            # sleep a little if this cycle was idle
            if not action:
                time.sleep(IDLE_TIME)

    except SystemExit:
        logger.exception(
            "unit manager controller thread caught system exit -- forcing application shutdown"
        )
        import thread
        thread.interrupt_main()

    finally:
        # shut down the autonomous input / output transfer worker(s)
        for worker in self._input_file_transfer_worker_pool:
            logger.debug("uworker %s stops itransfer %s"
                         % (self.name, worker.name))
            worker.stop()
            logger.debug("uworker %s stopped itransfer %s"
                         % (self.name, worker.name))

        for worker in self._output_file_transfer_worker_pool:
            logger.debug("uworker %s stops otransfer %s"
                         % (self.name, worker.name))
            worker.stop()
            logger.debug("uworker %s stopped otransfer %s"
                         % (self.name, worker.name))
def _reschedule(self, target_pid=None, uid=None):
    """Try to place waiting units onto active pilots (first come, first
    served) and hand the resulting schedule to the unit manager.

    **Arguments:**
        * **target_pid**: optional pilot id; it is only validated against
          the known pilots.
        * **uid**: optional unit id; if given, only that unit from the
          wait queue is considered, otherwise the whole wait queue is.

    **Raises:**
        * `RuntimeError` for an unknown `target_pid` or `uid`, or if a
          queued unit is in an unexpected state.

    The schedule passed to `self.manager.handle_schedule()` is a dict
    with two keys: 'units' maps each examined unit to a pilot id (or
    None if it could not be placed), 'pilots' is the pilot table.
    """
    with self.lock:

        # Dig through the list of waiting CUs, and try to find a pilot for
        # each of them.  This will be unbalanced if the units in the queue
        # are of different sizes.  That problem is ignored at this point.
        if not self.pilots:
            # no pilots to work on, yet.
            logger.warning("cannot schedule -- no pilots available")
            return

        if target_pid and target_pid not in self.pilots:
            logger.warning("cannot schedule -- invalid target pilot %s" % target_pid)
            raise RuntimeError("Invalid pilot (%s)" % target_pid)

        schedule = {'units': dict(), 'pilots': self.pilots}

        logger.debug("schedule (%s units waiting)" % len(self.waitq))

        if uid:
            if uid not in self.waitq:
                logger.warning("cannot schedule -- unknown unit %s" % uid)
                raise RuntimeError("Invalid unit (%s)" % uid)
            candidates = [self.waitq[uid]]
        else:
            # snapshot of the whole waitq -- the queue itself is modified
            # while units get scheduled below
            candidates = [self.waitq[key] for key in self.waitq]

        for unit in candidates:

            uid = unit.uid
            ud = unit.description

            # sanity check on unit state
            if unit.state not in [NEW, SCHEDULING, UNSCHEDULED]:
                raise RuntimeError("scheduler queue should only contain NEW or UNSCHEDULED units (%s)" % uid)

            for pid in self.pilots:
                pilot_entry = self.pilots[pid]

                if pilot_entry['state'] in [ACTIVE] and \
                        ud.cores <= pilot_entry['caps']:
                    # the unit fits: reserve the cores and move the unit
                    # from the waitq to the pilot's runq
                    pilot_entry['caps'] -= ud.cores
                    schedule['units'][unit] = pid
                    del self.waitq[uid]
                    self.runqs[pid][uid] = unit
                    break
            else:
                # unit was not scheduled...
                schedule['units'][unit] = None

            # print a warning if a unit cannot possibly be scheduled, ever
            fits_somewhere = any(
                unit.description.cores <= self.pilots[pid]['cores']
                for pid in self.pilots)

            if not fits_somewhere:
                logger.warning('cannot handle unit %s with current set of pilots' % uid)

        # tell the UM about the schedule
        self.manager.handle_schedule(schedule)
def _pilot_state_callback(self, pilot, state):
    """React to a state change of a pilot known to this scheduler.

    **Arguments:**
        * **pilot**: the pilot object; only its `uid` is read.
        * **state**: the new pilot state.

    An ACTIVE pilot triggers a reschedule.  For a pilot in a final state
    (DONE, FAILED, CANCELED) the units assigned to it are updated in the
    database and the pilot is dropped from the pilot table.  Exceptions
    are logged and re-raised.
    """
    try:
        with self.lock:

            pid = pilot.uid

            if pid not in self.pilots:
                # as we cannot unregister callbacks, we simply ignore this
                # invocation.  It is probably from a pilot we used
                # previously.
                logger.warn("[SchedulerCallback]: ComputePilot %s changed to %s (ignored)" % (pid, state))
                return

            self.pilots[pid]['state'] = state
            logger.debug("[SchedulerCallback]: ComputePilot %s changed to %s" % (pid, state))

            if state in [ACTIVE]:
                # the pilot is now ready to be used
                self._reschedule(target_pid=pid)

            if state in [DONE, FAILED, CANCELED]:
                # The pilot is final: units which did not start executing
                # yet, and restartable units which did, go back to
                # UNSCHEDULED; non-restartable units which already
                # started are marked FAILED.
                timestamp = datetime.datetime.utcnow()

                not_started = [UNSCHEDULED, PENDING_INPUT_STAGING,
                               STAGING_INPUT, PENDING_EXECUTION, SCHEDULING]
                started = [EXECUTING, PENDING_OUTPUT_STAGING, STAGING_OUTPUT]

                # (filter, fields to set, state pushed to the history)
                updates = [
                    ({"pilot": pid,
                      "state": {"$in": not_started}},
                     {"state": UNSCHEDULED, "pilot": None},
                     UNSCHEDULED),
                    ({"pilot": pid,
                      "restartable": True,
                      "state": {"$in": list(started)}},
                     {"state": UNSCHEDULED, "pilot": None},
                     UNSCHEDULED),
                    ({"pilot": pid,
                      "restartable": False,
                      "state": {"$in": list(started)}},
                     {"state": FAILED},
                     FAILED),
                ]

                for unit_filter, unit_set, history_state in updates:
                    self._db.change_compute_units(
                        filter_dict=unit_filter,
                        set_dict=unit_set,
                        push_dict={"statehistory": {"state": history_state,
                                                    "timestamp": timestamp},
                                   "log": {"message": "reschedule unit",
                                           "timestamp": timestamp}})

                # make sure that restartable units got back into the
                # wait queue
                #
                # FIXME AM: state management: the unit state is not
                # available here.  The new state was just pushed to the
                # DB, but it is unknown for which units, and the state
                # known to the worker (i.e. the cached state) is most
                # likely outdated.
                #
                # So we don't handle runq/waitq here.  Instead, we rely
                # on the unit cb to get invoked as soon as the state
                # propagated back to us, and then remove them from the
                # runq.  This is slow, potentially very slow, but safe.

                # we can't use this pilot anymore...
                del self.pilots[pid]

                # FIXME: how can I *un*register a pilot callback?

    except Exception as e:
        logger.exception("error in pilot callback for backfiller (%s) - ignored" % e)
        raise
def run(self):
    """Main loop of the unit manager controller thread.

    Until the stop event is set, the loop pulls all compute units of this
    unit manager from the database, refreshes the cached copy in the
    shared data store and fires the unit state callbacks for every unit
    whose state differs from the cached one.  After the first full pass
    the `_initialized` event is set.  Idle cycles sleep for IDLE_TIME.

    On exit (regular or not) all input / output file transfer workers are
    stopped.  A SystemExit raised inside the thread is turned into an
    interrupt of the main thread.
    """
    # make sure to catch sys.exit (which raises SystemExit)
    try:
        logger.debug("Worker thread (ID: %s[%s]) for UnitManager %s started." %
                     (self.name, self.ident, self._um_id))

        while not self._stop.is_set():

            # =============================================================
            #
            # Check and update units. This needs to be optimized at
            # some point, i.e., state pulling should be conditional
            # or triggered by a tailable MongoDB cursor, etc.
            unit_list = self._db.get_compute_units(unit_manager_id=self._um_id)
            action = False

            for unit in unit_list:
                unit_id = str(unit["_id"])
                new_state = unit["state"]

                # Look up and refresh the cache entry under a single lock
                # hold; the context manager releases the lock even if an
                # exception is raised in between.
                with self._shared_data_lock:
                    if unit_id in self._shared_data:
                        old_state = \
                            self._shared_data[unit_id]["data"]["state"]
                    else:
                        old_state = None
                        self._shared_data[unit_id] = {
                            'data': unit,
                            'callbacks': [],
                            'facade_object': None
                        }
                    self._shared_data[unit_id]["data"] = unit

                if new_state != old_state:
                    # On a state change, we fire the callbacks.
                    logger.info("RUN ComputeUnit '%s' state changed from '%s' to '%s'." %
                                (unit_id, old_state, new_state))

                    # The state of the unit has changed, We call all
                    # unit-level callbacks to propagate this.
                    self.call_unit_state_callbacks(unit_id, new_state)

                    action = True

            # After the first iteration, we are officially initialized!
            if not self._initialized.is_set():
                self._initialized.set()

            # sleep a little if this cycle was idle
            if not action:
                time.sleep(IDLE_TIME)

    except SystemExit:
        logger.exception("unit manager controller thread caught system exit -- forcing application shutdown")
        import thread
        thread.interrupt_main()

    finally:
        # shut down the autonomous input / output transfer worker(s)
        for worker in self._input_file_transfer_worker_pool:
            logger.debug("uworker %s stops itransfer %s" % (self.name, worker.name))
            worker.stop()
            logger.debug("uworker %s stopped itransfer %s" % (self.name, worker.name))

        for worker in self._output_file_transfer_worker_pool:
            logger.debug("uworker %s stops otransfer %s" % (self.name, worker.name))
            worker.stop()
            logger.debug("uworker %s stopped otransfer %s" % (self.name, worker.name))
def __del__(self):
    """The destructor.

    Emits a debug message with the object id when the environment
    variable RADICAL_PILOT_GCDEBUG is set.
    """
    gc_debug = os.getenv("RADICAL_PILOT_GCDEBUG", None)
    if gc_debug is None:
        return

    logger.debug("GCDEBUG __del__(): ComputeUnit [object id: %s]." % id(self))
def handle_schedule(self, schedule):
    """Enact a schedule produced by the scheduler.

    **Arguments:**
        * **schedule** [`dict`]: 'units' maps each unit to a pilot id (or
          None if the unit could not be placed); 'pilots' maps pilot ids
          to pilot records, of which 'sandbox' and 'resource' are read.

    Units are submitted in bulk per pilot; units without a pilot are
    passed to the worker as unscheduled.  A change of the wait queue size
    fires the WAIT_QUEUE_SIZE manager callback.

    **Raises:**
        * `RuntimeError` if the schedule refers to a pilot this manager
          does not know.
    """
    # we want to use bulk submission to the pilots, so we collect all units
    # assigned to the same set of pilots.  At the same time, we select
    # unscheduled units for later insertion into the wait queue.
    if not schedule:
        logger.debug('skipping empty unit schedule')
        return

    pilot_cu_map = dict()
    unscheduled = list()

    pilot_ids = self.list_pilots()

    for unit in schedule['units'].keys():

        pid = schedule['units'][unit]

        if pid is None:
            unscheduled.append(unit)
            continue

        if pid not in pilot_ids:
            raise RuntimeError("schedule points to unknown pilot %s" % pid)

        pilot_cu_map.setdefault(pid, list()).append(unit)

    # submit to all pilots which got something submitted to
    for pid in pilot_cu_map.keys():

        units_to_schedule = list()

        # if a kernel name is in the cu descriptions set, do kernel expansion
        for unit in pilot_cu_map[pid]:

            if pid not in schedule['pilots']:
                # lost pilot, do not schedule unit
                logger.warn("unschedule unit %s, lost pilot %s" % (unit.uid, pid))
                continue

            unit.sandbox = schedule['pilots'][pid]['sandbox'] + "/" + str(unit.uid)

            ud = unit.description

            if 'kernel' in ud and ud['kernel']:

                try:
                    from radical.ensemblemd.mdkernels import MDTaskDescription
                except Exception:
                    logger.error("Kernels are not supported in "
                                 "compute unit descriptions -- install "
                                 "radical.ensemblemd.mdkernels!")
                    # FIXME: unit needs a '_set_state() method or something!
                    self._session._dbs.set_compute_unit_state(
                        unit._uid, FAILED, ["kernel expansion failed"])
                    continue

                pilot_resource = schedule['pilots'][pid]['resource']

                # bind the kernel to the pilot's resource and copy the
                # expanded settings into the unit description
                mdtd = MDTaskDescription()
                mdtd.kernel = ud.kernel
                mdtd_bound = mdtd.bind(resource=pilot_resource)
                ud.environment = mdtd_bound.environment
                ud.pre_exec = mdtd_bound.pre_exec
                ud.executable = mdtd_bound.executable
                ud.mpi = mdtd_bound.mpi

            units_to_schedule.append(unit)

        if units_to_schedule:
            self._worker.schedule_compute_units(pilot_uid=pid,
                                                units=units_to_schedule)

    # report any change in wait_queue_size
    old_wait_queue_size = self.wait_queue_size

    self.wait_queue_size = len(unscheduled)
    if old_wait_queue_size != self.wait_queue_size:
        self._worker.fire_manager_callback(WAIT_QUEUE_SIZE, self,
                                           self.wait_queue_size)

    if unscheduled:
        self._worker.unschedule_compute_units(units=unscheduled)

    logger.info('%s units remain unscheduled' % len(unscheduled))
def wait_pilots(self, pilot_ids=None, state=[DONE, FAILED, CANCELED],
                timeout=None):
    """Returns when one or more :class:`radical.pilot.ComputePilots` reach a
    specific state or when an optional timeout is reached.

    If `pilot_ids` is `None`, `wait_pilots` returns when **all** Pilots
    reach the state defined in `state`.

    **Arguments:**

        * **pilot_ids** [`string` or `list of strings`]
          If pilot_ids is set, only the Pilots with the specified uids are
          considered. If pilot_ids is `None` (default), all Pilots are
          considered.

        * **state** [`list of strings`]
          The state(s) that Pilots have to reach in order for the call
          to return.  A single state is accepted as well.

          By default `wait_pilots` waits for the Pilots to reach
          a **terminal** state, which can be one of the following:

          * :data:`radical.pilot.DONE`
          * :data:`radical.pilot.FAILED`
          * :data:`radical.pilot.CANCELED`

        * **timeout** [`float`]
          Optional timeout in seconds before the call returns regardless
          whether the Pilots have reached the desired state or not.
          The default value **None** never times out.

    **Returns:**

        * The list of pilot states seen in the last poll.  If `pilot_ids`
          was passed as a single (non-list) value, a single state is
          returned instead, or `None` if no pilot data was found.

    **Raises:**

        * :class:`radical.pilot.PilotException`
    """
    self._assert_obj_is_valid()

    # the default list is only read, never modified
    if not isinstance(state, list):
        state = [state]

    return_list_type = True
    if (not isinstance(pilot_ids, list)) and (pilot_ids is not None):
        return_list_type = False
        pilot_ids = [pilot_ids]

    start = time.time()
    all_ok = False
    states = list()

    while not all_ok:

        pilots = self._worker.get_compute_pilot_data(pilot_ids=pilot_ids)
        states = [pilot['state'] for pilot in pilots]
        all_ok = all(pilot_state in state for pilot_state in states)

        # check timeout
        if (timeout is not None) and (timeout <= (time.time() - start)):
            if not all_ok:
                logger.debug("wait timed out: %s" % states)
            break

        # sleep a little if this cycle was idle
        if not all_ok:
            time.sleep(0.1)

    # done waiting
    if return_list_type:
        return states

    # single pilot requested: avoid an IndexError if no data came back
    return states[0] if states else None
def check_pilot_states(self, pilot_col):
    """Health-check all PENDING_ACTIVE / ACTIVE pilots of this pilot manager.

    For every such pilot document in `pilot_col` the SAGA job behind the
    pilot is looked up (job services are cached per URL in
    `self._shared_worker_data['job_services']`).  Depending on the outcome
    the pilot document is updated:

      * SAGA job FAILED or CANCELED        -> pilot state FAILED
      * SAGA job DONE                      -> pilot state DONE
      * job state cannot be read after a
        successful reconnect               -> pilot state FAILED
      * reconnect failed and the miss
        counter reached
        JOB_CHECK_MAX_MISSES               -> pilot state FAILED
      * otherwise                          -> nothing is changed

    **Arguments:**

        * **pilot_col**
          The pilot collection; must offer `find()` and `update()` as used
          below (presumably a pymongo collection -- confirm).
    """
    pending_pilots = pilot_col.find({
        "pilotmanager": self.pilot_manager_id,
        "state": {"$in": [PENDING_ACTIVE, ACTIVE]}
    })

    for pending_pilot in pending_pilots:

        pilot_failed = False
        pilot_done = False
        reconnected = False

        pilot_id = pending_pilot["_id"]
        log_message = ""
        saga_job_id = pending_pilot["saga_job_id"]

        logger.info(
            "Performing periodical health check for %s (SAGA job id %s)"
            % (str(pilot_id), saga_job_id))

        # NOTE(review): the miss counter is never reset after a successful
        # reconnect, so misses accumulate over the pilot's lifetime --
        # confirm that this is intended.
        if pilot_id not in self.missing_pilots:
            self.missing_pilots[pilot_id] = 0

        try:
            # The job service URL is the first bracketed part of the job id,
            # i.e. the id has the form "[<js_url>]-[<...>]".
            js_url = saga_job_id.split("]-[")[0][1:]

            job_services = self._shared_worker_data['job_services']
            if js_url in job_services:
                js = job_services[js_url]
            else:
                js = saga.job.Service(js_url, session=self._session)
                job_services[js_url] = js

            saga_job = js.get_job(saga_job_id)
            reconnected = True

            # Read the job state once, so that all decisions and the log
            # message refer to the same observation.
            job_state = saga_job.state

            if job_state in [saga.job.FAILED, saga.job.CANCELED]:
                pilot_failed = True
                log_message = "SAGA job state for ComputePilot %s is %s."\
                    % (pilot_id, job_state)

            if job_state in [saga.job.DONE]:
                pilot_done = True
                log_message = "SAGA job state for ComputePilot %s is %s."\
                    % (pilot_id, job_state)

        except Exception as e:

            if not reconnected:
                # Could not reach the job at all: tolerate a limited number
                # of misses before declaring the pilot failed.
                logger.warning(
                    'could not reconnect to pilot for state check (%s)' % e)
                self.missing_pilots[pilot_id] += 1

                if self.missing_pilots[pilot_id] >= JOB_CHECK_MAX_MISSES:
                    # report the real count instead of a hard-coded number
                    logger.debug('giving up after %s attempts'
                                 % self.missing_pilots[pilot_id])
                    pilot_failed = True
                    log_message = "Could not reconnect to pilot %s "\
                                  "multiple times - giving up" % pilot_id
            else:
                # Reconnected, but the state query itself failed.
                logger.warning('pilot state check failed: %s' % e)
                pilot_failed = True
                log_message = "Couldn't determine job state for ComputePilot %s. " \
                              "Assuming it has failed." % pilot_id

        if pilot_failed:
            final_state = FAILED
        elif pilot_done:
            final_state = DONE
        else:
            final_state = None

        if final_state is None:
            if self.missing_pilots[pilot_id]:
                logger.info('pilot %s *assumed* alive and well (%s)'
                            % (pilot_id, self.missing_pilots[pilot_id]))
            else:
                logger.info('pilot %s seems alive and well'
                            % (pilot_id))
            continue

        # Record the final state, the pilot logs and a log entry.  The
        # filter never overwrites a pilot which is already DONE.
        out, err, log = self._get_pilot_logs(pilot_col, pilot_id)
        ts = datetime.datetime.utcnow()
        pilot_col.update(
            {"_id": pilot_id,
             "state": {"$ne": DONE}},
            {"$set": {
                "state": final_state,
                "stdout": out,
                "stderr": err,
                "logfile": log},
             "$push": {
                "statehistory": {
                    "state": final_state,
                    "timestamp": ts},
                "log": {
                    "message": log_message,
                    "timestamp": ts}}
             })
        logger.debug(log_message)

        if pilot_failed:
            logger.warning('pilot %s declared dead' % pilot_id)
        else:
            # a pilot whose job finished regularly is not "dead"
            logger.info('pilot %s declared done' % pilot_id)
def run(self):
    """Main loop of the pilot manager controller thread.

    Until `self._stop` is set, the loop fetches all pilots of this pilot
    manager from the database, mirrors them into `self._shared_data`, fires
    the pilot callbacks on state changes and cancels the non-final compute
    units of pilots which reached a final state.  `self._initialized` is set
    after the first complete cycle.

    A `SystemExit` raised inside the loop is logged and forwarded to the main
    thread via `thread.interrupt_main()`.  The pilot launcher workers are
    always asked to stop when the loop ends.

    NOTE(review): `thread` is the Python 2 module name (`_thread` in
    Python 3).
    """

    # make sure to catch sys.exit (which raises SystemExit)
    try :

        logger.debug("Worker thread (ID: %s[%s]) for PilotManager %s started." % (self.name, self.ident, self._pm_id))

        while not self._stop.is_set():

            # Disabled code, kept for reference:
            #
            # # Check if one or more startup requests have finished.
            # self.startup_results_lock.acquire()
            # new_startup_results = list()
            # for transfer_result in self.startup_results:
            #     if transfer_result.ready():
            #         result = transfer_result.get()
            #         self._db.update_pilot_state(
            #             pilot_uid=result["pilot_uid"],
            #             state=result["state"],
            #             sagajobid=result["saga_job_id"],
            #             pilot_sandbox=result["sandbox"],
            #             global_sandbox=result["global_sandbox"],
            #             submitted=result["submitted"],
            #             logs=result["logs"]
            #         )
            #     else:
            #         new_startup_results.append(transfer_result)
            # self.startup_results = new_startup_results
            # self.startup_results_lock.release()

            # Check and update pilots. This needs to be optimized at
            # some point, i.e., state pulling should be conditional
            # or triggered by a tailable MongoDB cursor, etc.
            pilot_list = self._db.get_pilots(pilot_manager_id=self._pm_id)

            # 'action' records whether this cycle saw any state change; if
            # not, the loop sleeps for IDLE_TIME at the end of the cycle.
            action = False

            for pilot in pilot_list:
                pilot_id = str(pilot["_id"])
                new_state = pilot["state"]

                if pilot_id in self._shared_data:
                    old_state = self._shared_data[pilot_id]["data"]["state"]
                else:
                    # first time this pilot is seen: create its cache entry
                    old_state = None
                    self._shared_data[pilot_id] = { 'data': pilot, 'callbacks': [], 'facade_object': None }

                # the cache always holds the latest database document
                self._shared_data[pilot_id]['data'] = pilot

                # FIXME: *groan* what a hack... The Canceling state is by
                # the nature of it not recorded in the database, but only in
                # the local cache. So if we see it as old state, we have to
                # avoid state transitions into non-final states in the cache
                # at all cost -- so we catch this here specifically
                no_cb = False
                if old_state == CANCELING :
                    if new_state not in [DONE, FAILED, CANCELED] :
                        # restore old state, making the cache explicitly
                        # different than the DB recorded state
                        self._shared_data[pilot_id]["data"]["state"] = old_state

                        # do not trigger a state callback!
                        no_cb = True

                # NOTE(review): while a pilot is cached as CANCELING and the
                # database still reports a non-final state, new_state and
                # old_state differ in every cycle, so 'action' stays True and
                # the idle sleep below is skipped -- confirm this is intended.
                if new_state != old_state :
                    action = True

                    if not no_cb :
                        # On a state change, we fire the callbacks.
                        logger.info("ComputePilot '%s' state changed from '%s' to '%s'." \
                                    % (pilot_id, old_state, new_state))

                        # The state of the pilot has changed, We call all
                        # pilot-level callbacks to propagate this. This also
                        # includes communication to the unit scheduler which
                        # may, or may not, cancel the pilot's units.
                        self.call_callbacks(pilot_id, new_state)

                # If the state is 'DONE', 'FAILED' or 'CANCELED', we also
                # set the state of the compute unit accordingly (but only
                # for non-final units)
                if new_state in [FAILED, DONE, CANCELED]:
                    unit_ids = self._db.pilot_list_compute_units(pilot_uid=pilot_id)
                    self._db.set_compute_unit_state ( unit_ids=unit_ids, state=CANCELED, src_states=[ PENDING_INPUT_STAGING, STAGING_INPUT, PENDING_EXECUTION, SCHEDULING, EXECUTING, PENDING_OUTPUT_STAGING, STAGING_OUTPUT ], log="Pilot '%s' has terminated with state '%s'. CU canceled." % (pilot_id, new_state))

            # After the first iteration, we are officially initialized!
            if not self._initialized.is_set():
                self._initialized.set()

            # sleep a little if this cycle was idle
            if not action :
                time.sleep(IDLE_TIME)

    except SystemExit as e :
        # sys.exit() in a thread only ends that thread; interrupt the main
        # thread so that the whole application shuts down.
        logger.exception ("pilot manager controller thread caught system exit -- forcing application shutdown")
        import thread
        thread.interrupt_main ()

    finally :
        # shut down the autonomous pilot launcher worker(s)
        for worker in self._pilot_launcher_worker_pool:
            logger.debug("pworker %s stops launcher %s" % (self.name, worker.name))
            worker.stop ()
            logger.debug("pworker %s stopped launcher %s" % (self.name, worker.name))
def close(self, cleanup=True, terminate=True, delete=None):
    """Close the session and all pilot and unit managers attached to it.

    Closing an already closed session (empty `_uid`) only logs an error and
    returns.

    **Arguments:**

        * **cleanup** (`bool`): Remove the session's database entry via
          `_destroy_db_entry()` (implies `terminate`).

        * **terminate** (`bool`): Passed on to `close()` of every attached
          pilot manager.

        * **delete** (`bool`): Deprecated.  If given while `cleanup` and
          `terminate` both still have their default value `True`, it
          replaces both of them and a deprecation warning is logged.
    """
    # Capture the uid up front: the closing log line below should name the
    # session even if cleanup invalidates `self._uid` (presumably
    # `_destroy_db_entry()` does so -- confirm).
    uid = self._uid

    logger.debug("session %s closing" % (str(uid)))

    if not uid:
        logger.error("Session object already closed.")
        return

    # we keep 'delete' for backward compatibility.  If it was set, and the
    # other flags (cleanup, terminate) are as defaulted (True), then delete
    # will supersede them.  The explicit comparison against True is kept on
    # purpose: only the exact default values are overridden.
    if delete is not None:
        if cleanup == True and terminate == True:
            cleanup = delete
            terminate = delete
            logger.warning("'delete' flag on session is deprecated. "
                           "Please use 'cleanup' and 'terminate' instead!")

    if cleanup:
        # cleanup implies terminate
        terminate = True

    for pmgr in self._pilot_manager_objects:
        logger.debug("session %s closes pmgr %s" % (str(uid), pmgr._uid))
        pmgr.close(terminate=terminate)
        logger.debug("session %s closed pmgr %s" % (str(uid), pmgr._uid))

    for umgr in self._unit_manager_objects:
        logger.debug("session %s closes umgr %s" % (str(uid), umgr._uid))
        umgr.close()
        logger.debug("session %s closed umgr %s" % (str(uid), umgr._uid))

    if cleanup:
        self._destroy_db_entry()

    logger.debug("session %s closed" % (str(uid)))
def register_cancel_pilots_request(self, pilot_ids=None):
    """Send the cancel command to pilots and cancel their jobs actively.

    With `pilot_ids=None` all pilots of this pilot manager are addressed.
    After the cancel command went out via the database, pilots which are
    not yet active get their job canceled right away.  All other non-final
    pilots get 10 seconds to shut down by themselves before their job is
    canceled as well.
    """
    if pilot_ids is None:
        pilot_ids = [str(entry["_id"])
                     for entry in self._db.get_pilots(pilot_manager_id=self._pm_id)]

    self._db.send_command_to_pilot(COMMAND_CANCEL_PILOT, pilot_ids=pilot_ids)
    logger.info("Sent 'COMMAND_CANCEL_PILOT' command to pilots %s.", pilot_ids)

    # Active pilots should get some time to react on the command sent above.
    # They are collected here and handled after the pending ones.
    postponed = list()

    for pid in pilot_ids:

        if pid not in self._shared_data:
            logger.warn("can't actively cancel pilot %s: unknown pilot" % pid)
            logger.debug(pprint.pformat(self._shared_data))
            continue

        # take a single snapshot of the cached state, so that it cannot
        # change while the decisions below are made
        cached_state = str(self._shared_data[pid]["data"]["state"])
        logger.warn("actively cancel pilot %s state: %s" % (pid, cached_state))

        if cached_state in [DONE, FAILED, CANCELED]:
            logger.warn("can't actively cancel pilot %s: already in final state" % pid)
            continue

        if cached_state not in [PENDING_LAUNCH, LAUNCHING, PENDING_ACTIVE]:
            logger.debug("delay to actively cancel pilot %s: state %s" % (pid, cached_state))
            postponed.append(pid)
            continue

        # the pilot is not active yet: cancel its job right away
        if pid not in self._shared_worker_data['job_ids']:
            logger.warn("can't actively cancel pilot %s: no job id known" % pid)
            logger.debug(pprint.pformat(self._shared_worker_data))
            continue

        try:
            job_id, js_url = self._shared_worker_data['job_ids'][pid]

            # CANCELING only exists in the local cache
            self._shared_data[pid]["data"]["state"] = CANCELING
            logger.info("actively cancel pilot %s (%s, %s)" % (pid, job_id, js_url))

            service = self._shared_worker_data['job_services'][js_url]
            service.get_job(job_id).cancel()

        except Exception:
            logger.exception('pilot cancelation failed')

    if not postponed:
        return

    # grant the active pilots some time to shut down by themselves ...
    time.sleep(10)

    # ... and cancel whatever job is still known afterwards
    for pid in postponed:

        if pid not in self._shared_worker_data['job_ids']:
            logger.warn("can't actively cancel pilot %s: no job id known (delayed)" % pid)
            logger.debug(pprint.pformat(self._shared_worker_data))
            continue

        try:
            job_id, js_url = self._shared_worker_data['job_ids'][pid]
            logger.info("actively cancel pilot %s (delayed) (%s, %s)" % (pid, job_id, js_url))

            service = self._shared_worker_data['job_services'][js_url]
            service.get_job(job_id).cancel()

        except Exception:
            logger.warn('delayed pilot cancelation failed. '
                        'This is not necessarily a problem.')
def cancel(self):
    """Cancel the ComputeUnit.

    What happens depends on the unit state at the time of the call:

      * DONE, FAILED, CANCELED: nothing, the unit is already final.
      * EXECUTING: a COMMAND_CANCEL_COMPUTE_UNIT command is sent to the
        pilot the unit is assigned to.
      * any other known state: the unit state is set to CANCELED in the
        database.

    **Raises:**

        * `BadParameter` if this instance has no uid.
        * `IncorrectState` if the unit is in an unknown state.
    """
    # Check if this instance is valid
    if not self._uid:
        raise BadParameter("Invalid Compute Unit instance.")

    cu_json = self._worker.get_compute_unit_data(self.uid)
    pilot_uid = cu_json['pilot']

    # Read the state exactly once: evaluating `self.state` anew for every
    # branch would allow the state to change between the checks, which
    # could select the wrong branch or end up in the "Unknown state" error.
    state = self.state

    if state in [DONE, FAILED, CANCELED]:
        # nothing to do
        logger.debug("Compute unit %s has state %s, can't cancel any longer."
                     % (self._uid, state))
        return

    if state == EXECUTING:
        # a running task can only be stopped by its pilot
        logger.debug("Compute unit %s has state %s, will terminate the task."
                     % (self._uid, state))
        self._manager._session._dbs.send_command_to_pilot(
            cmd=COMMAND_CANCEL_COMPUTE_UNIT, arg=self.uid, pilot_ids=pilot_uid)
        return

    # All remaining known states are canceled the same way, only the
    # wording of the log message differs.
    if state in [NEW, UNSCHEDULED, PENDING_INPUT_STAGING]:
        consequence = "going to prevent from starting"
    elif state in [STAGING_INPUT, STAGING_OUTPUT]:
        consequence = "will cancel the transfer"
    elif state in [PENDING_EXECUTION, SCHEDULING]:
        consequence = "will abort start-up"
    elif state == PENDING_OUTPUT_STAGING:
        consequence = "will abort the transfer"
    else:
        raise IncorrectState("Unknown Compute Unit state: %s, cannot cancel"
                             % state)

    logger.debug("Compute unit %s has state %s, %s."
                 % (self._uid, state, consequence))
    self._manager._session._dbs.set_compute_unit_state(
        self._uid, CANCELED, ["Received Cancel"])
def wait_units(self, unit_ids=None, state=[DONE, FAILED, CANCELED], timeout=None):
    """Block until the selected ComputeUnits reach one of the given states.

    The units are obtained once via `get_units(unit_ids)`; their states are
    then polled about every 0.1 seconds.  The call returns as soon as every
    unit is in one of the states listed in `state`, or when `timeout`
    expires, whichever happens first.

    **Arguments:**

        * **unit_ids** [`string` or `list of strings`]
          Unit id(s) handed to `get_units()`.  `None` (default) is passed
          through unchanged (presumably this selects all units of this
          manager -- confirm against `get_units()`).

        * **state** [`string` or `list of strings`]
          The state(s) the units have to reach for the call to return.
          Defaults to the terminal states:

          * :data:`radical.pilot.DONE`
          * :data:`radical.pilot.FAILED`
          * :data:`radical.pilot.CANCELED`

        * **timeout** [`float`]
          Optional timeout in seconds.  `None` (default) waits forever.
          A value `<= 0` results in exactly one poll.

    **Returns:**

        * The unit states seen in the last poll: a list if `unit_ids` was a
          list or `None`, a single state if a single id was given.

    **Raises:**

        * `IncorrectState` if this manager instance has no uid.
        * `IndexError` if a single id was given but `get_units()` returned
          no unit for it.
    """
    if not self._uid:
        raise IncorrectState(msg="Invalid object instance.")

    # The list default is never mutated (only rebound below), so sharing it
    # between calls is harmless.
    if not isinstance(state, list):
        state = [state]

    # A single id yields a single state, anything else yields a list.
    return_list_type = True
    if (unit_ids is not None) and (not isinstance(unit_ids, list)):
        return_list_type = False
        unit_ids = [unit_ids]

    units = self.get_units(unit_ids)
    start = time.time()
    states = list()

    while True:
        # Read every unit state exactly once per poll, so that the reported
        # states are the very states the decision below is based on.
        states = [unit.state for unit in units]

        if all(s in state for s in states):
            break

        # Timeout is only checked after a poll, so at least one poll always
        # happens.
        if (timeout is not None) and (timeout <= (time.time() - start)):
            logger.debug("wait timed out: %s" % states)
            break

        # sleep a little before the next poll
        time.sleep(0.1)

    if return_list_type:
        return states
    return states[0]
def run(self):
    """Main loop of the output file transfer worker.

    After obtaining the compute unit collection of the session, the loop
    runs until `self._stop` is set.  Each cycle:

      1. claims a compute unit of this unit manager whose
         'FTW_Output_Status' is PENDING and moves it to STAGING_OUTPUT,
      2. executes the unit's 'FTW_Output_Directives' one by one via SAGA
         file copies, stopping early if the unit was canceled meanwhile and
         marking the unit FAILED on any error,
      3. (only for worker numbers <= 1, and only if the unit was not
         canceled) marks finished FTW / Agent output staging as DONE and
         moves a unit in STAGING_OUTPUT whose FTW output staging is finished
         (or unset) to DONE.

    A `SystemExit` raised inside the loop is logged and forwarded to the main
    thread via `thread.interrupt_main()`.

    NOTE(review): `thread` is the Python 2 module name (`_thread` in
    Python 3).
    """

    # make sure to catch sys.exit (which raises SystemExit)
    try :

        # Try to connect to the database and look up the session's compute
        # unit collection ("<session_id>.cu").
        try:
            connection = self.db_connection_info.get_db_handle()
            db = connection[self.db_connection_info.dbname]
            um_col = db["%s.cu" % self.db_connection_info.session_id]
            logger.debug("Connected to MongoDB. Serving requests for UnitManager %s." % self.unit_manager_id)

        except Exception as e:
            # without a database connection this worker cannot do anything
            logger.exception("Connection error: %s" % e)
            return

        while not self._stop.is_set():
            compute_unit = None

            # See if we can find a ComputeUnit that is waiting for
            # output file transfer.  Claiming it and moving it to
            # STAGING_OUTPUT happens within the same find_and_modify call.
            ts = datetime.datetime.utcnow()
            compute_unit = um_col.find_and_modify(
                query={"unitmanager": self.unit_manager_id,
                       "FTW_Output_Status": PENDING},
                update={"$set" : {"FTW_Output_Status": EXECUTING,
                                  "state": STAGING_OUTPUT},
                        "$push": {"statehistory": {"state": STAGING_OUTPUT, "timestamp": ts}}},
                limit=BULK_LIMIT
            )
            # FIXME: AM: find_and_modify is not bulkable!

            # 'state' tracks the outcome of this cycle; it only changes if a
            # cancel request is detected below.
            state = STAGING_OUTPUT

            #logger.info("OFTW after finding pending cus")
            if compute_unit is None:
                #logger.info("OFTW no cus, sleep")
                # Sleep a bit if no new units are available.
                time.sleep(IDLE_TIME)
            else:
                logger.info("OFTW cu found, progressing ...")
                compute_unit_id = None
                try:
                    # We have found a new CU. Now we can process the transfer
                    # directive(s) with SAGA.
                    compute_unit_id = str(compute_unit["_id"])
                    remote_sandbox = compute_unit["sandbox"]
                    staging_directives = compute_unit["FTW_Output_Directives"]

                    logger.info("Processing output file transfers for ComputeUnit %s" % compute_unit_id)

                    # Loop over all staging directives and execute them.
                    for sd in staging_directives:

                        # Check if there was a cancel request
                        state_doc = um_col.find_one(
                            {"_id": compute_unit_id},
                            fields=["state"]
                        )
                        if state_doc['state'] == CANCELED:
                            logger.info("Compute Unit Canceled, interrupting output file transfers.")
                            state = CANCELED
                            break

                        action = sd['action']
                        source = sd['source']
                        target = sd['target']
                        flags = sd['flags']

                        # Mark the beginning of transfer this StagingDirective
                        um_col.find_and_modify(
                            query={"_id" : compute_unit_id,
                                   'FTW_Output_Status': EXECUTING,
                                   'FTW_Output_Directives.state': PENDING,
                                   'FTW_Output_Directives.source': sd['source'],
                                   'FTW_Output_Directives.target': sd['target'],
                                   },
                            update={'$set': {'FTW_Output_Directives.$.state': EXECUTING},
                                    '$push': {'log': {
                                        'timestamp': datetime.datetime.utcnow(),
                                        'message' : 'Starting transfer of %s' % source}}
                                    }
                        )

                        # The source is relative to the unit sandbox.  A bare
                        # file name as target ends up in the current working
                        # directory, any other target is made absolute.
                        abs_source = "%s/%s" % (remote_sandbox, source)

                        if os.path.basename(target) == target:
                            abs_target = "file://localhost%s" % os.path.join(os.getcwd(), target)
                        else:
                            abs_target = "file://localhost%s" % os.path.abspath(target)

                        log_msg = "Transferring output file %s -> %s" % (abs_source, abs_target)
                        logger.debug(log_msg)

                        logger.debug ("saga.fs.File ('%s')" % saga.Url(abs_source))
                        output_file = saga.filesystem.File(saga.Url(abs_source),
                            session=self._session
                        )

                        if CREATE_PARENTS in flags:
                            copy_flags = saga.filesystem.CREATE_PARENTS
                        else:
                            copy_flags = 0

                        logger.debug ("saga.fs.File.copy ('%s')" % saga.Url(abs_target))
                        output_file.copy(saga.Url(abs_target), flags=copy_flags)
                        output_file.close()

                        # If all went fine, update the state of this StagingDirective to Done
                        um_col.find_and_modify(
                            query={"_id" : compute_unit_id,
                                   'FTW_Output_Status': EXECUTING,
                                   'FTW_Output_Directives.state': EXECUTING,
                                   'FTW_Output_Directives.source': sd['source'],
                                   'FTW_Output_Directives.target': sd['target'],
                                   },
                            update={'$set': {'FTW_Output_Directives.$.state': DONE},
                                    '$push': {'log': {
                                        'timestamp': datetime.datetime.utcnow(),
                                        'message' : log_msg}}
                                    }
                        )

                except Exception as e :
                    # Update the CU's state to 'FAILED'.
                    ts = datetime.datetime.utcnow()
                    log_message = "Output transfer failed: %s" % e
                    # TODO: not only mark the CU as failed, but also the specific Directive
                    um_col.update({'_id': compute_unit_id}, {
                        '$set': {'state': FAILED},
                        '$push': {
                            'statehistory': {'state': FAILED, 'timestamp': ts},
                            'log': {'message': log_message, 'timestamp': ts}
                        }
                    })
                    logger.exception (log_message)

            # Code below is only to be run by the "first" or only worker
            if self._worker_number > 1:
                continue

            # If the CU was canceled we can skip the remainder of this loop.
            if state == CANCELED:
                continue

            #
            # Check to see if there are more active Directives, if not, we are Done
            #
            cursor_w = um_col.find({"unitmanager": self.unit_manager_id,
                                    "$or": [ {"Agent_Output_Status": EXECUTING},
                                             {"FTW_Output_Status": EXECUTING}
                                             ]
                                    }
                                   )
            # Iterate over all the returned CUs (if any)
            for cu in cursor_w:
                # See if there are any FTW Output Directives still pending
                if cu['FTW_Output_Status'] == EXECUTING and \
                        not any(d['state'] == EXECUTING or d['state'] == PENDING for d in cu['FTW_Output_Directives']):
                    # All Output Directives for this FTW are done, mark the CU accordingly
                    um_col.update({"_id": cu["_id"]},
                                  {'$set': {'FTW_Output_Status': DONE},
                                   '$push': {'log': {
                                       'timestamp': datetime.datetime.utcnow(),
                                       'message' : 'All FTW output staging directives done - %d.' % self._worker_number}}
                                   }
                                  )

                # See if there are any Agent Output Directives still pending
                if cu['Agent_Output_Status'] == EXECUTING and \
                        not any(d['state'] == EXECUTING or d['state'] == PENDING for d in cu['Agent_Output_Directives']):
                    # All Output Directives for this Agent are done, mark the CU accordingly
                    um_col.update({"_id": cu["_id"]},
                                  {'$set': {'Agent_Output_Status': DONE},
                                   '$push': {'log': {
                                       'timestamp': datetime.datetime.utcnow(),
                                       'message' : 'All Agent Output Staging Directives done-%d.' % self._worker_number}}
                                   }
                                  )

            #
            # Check for all CUs if both Agent and FTW staging is done, we can then mark the CU Done
            #
            ts = datetime.datetime.utcnow()
            um_col.find_and_modify(
                query={"unitmanager": self.unit_manager_id,
                       # TODO: Now that our state model is linear,
                       # we probably don't need to check Agent_Output_Status anymore.
                       # Given that it is not updated by the agent currently, disable it here.
                       #"Agent_Output_Status": { "$in": [ None, DONE ] },
                       "FTW_Output_Status": { "$in": [ None, DONE ] },
                       "state": STAGING_OUTPUT
                       },
                update={"$set": {
                    "state": DONE
                },
                    "$push": {
                        "statehistory": {"state": DONE, "timestamp": ts}
                    }
                }
            )

    except SystemExit as e :
        # sys.exit() in a thread only ends that thread; interrupt the main
        # thread so that the whole application shuts down.
        logger.exception("output file transfer thread caught system exit -- forcing application shutdown")
        import thread
        thread.interrupt_main ()
def handle_schedule(self, schedule):
    """Hand the units of a schedule over to their pilots.

    `schedule` is expected to provide two mappings as used below:
    `schedule['units']` maps each unit to a pilot id (or `None` if the unit
    could not be placed), and `schedule['pilots']` maps pilot ids to pilot
    information with at least the keys 'sandbox' and 'resource'.

    Units are submitted to the worker in one bulk per pilot.  Units without
    a pilot are handed back to the worker as unscheduled, and a
    WAIT_QUEUE_SIZE manager callback is fired if their number changed.
    Units which name a kernel get their description expanded via
    `radical.ensemblemd.mdkernels`; if that package cannot be imported the
    unit is set to FAILED.

    **Raises:**

        * `RuntimeError` if the schedule refers to a pilot id which is not
          returned by `list_pilots()`.
    """
    # we want to use bulk submission to the pilots, so we collect all units
    # assigned to the same set of pilots.  At the same time, we select
    # unscheduled units for later insertion into the wait queue.
    if not schedule:
        logger.debug('skipping empty unit schedule')
        return

    pilot_cu_map = dict()
    unscheduled = list()

    pilot_ids = self.list_pilots()

    for unit, pid in schedule['units'].items():

        if pid is None:
            unscheduled.append(unit)
            continue

        if pid not in pilot_ids:
            raise RuntimeError("schedule points to unknown pilot %s" % pid)

        pilot_cu_map.setdefault(pid, list()).append(unit)

    # submit to all pilots which got something submitted to
    for pid, pilot_units in pilot_cu_map.items():

        units_to_schedule = list()

        for unit in pilot_units:

            if pid not in schedule['pilots']:
                # lost pilot, do not schedule unit
                # NOTE(review): such a unit is neither scheduled nor added
                # to the unscheduled list -- confirm this is intended.
                logger.warn("unschedule unit %s, lost pilot %s" % (unit.uid, pid))
                continue

            unit.sandbox = schedule['pilots'][pid]['sandbox'] + "/" + str(unit.uid)

            # if a kernel name is in the cu descriptions set, do kernel expansion
            ud = unit.description

            if 'kernel' in ud and ud['kernel']:

                try:
                    from radical.ensemblemd.mdkernels import MDTaskDescription
                except Exception:
                    # the separating space after "in" was missing before, so
                    # the message read "supported incompute unit ..."
                    logger.error("Kernels are not supported in "
                                 "compute unit descriptions -- install "
                                 "radical.ensemblemd.mdkernels!")
                    # FIXME: unit needs a '_set_state() method or something!
                    self._session._dbs.set_compute_unit_state(
                        unit._uid, FAILED, ["kernel expansion failed"])
                    continue

                # bind the kernel to the pilot's resource and copy the
                # resulting settings into the unit description
                pilot_resource = schedule['pilots'][pid]['resource']

                mdtd = MDTaskDescription()
                mdtd.kernel = ud.kernel
                mdtd_bound = mdtd.bind(resource=pilot_resource)
                ud.environment = mdtd_bound.environment
                ud.pre_exec = mdtd_bound.pre_exec
                ud.executable = mdtd_bound.executable
                ud.mpi = mdtd_bound.mpi

            units_to_schedule.append(unit)

        if len(units_to_schedule):
            self._worker.schedule_compute_units(pilot_uid=pid,
                                                units=units_to_schedule)

    # report any change in wait_queue_size
    old_wait_queue_size = self.wait_queue_size
    self.wait_queue_size = len(unscheduled)

    if old_wait_queue_size != self.wait_queue_size:
        self._worker.fire_manager_callback(WAIT_QUEUE_SIZE, self,
                                           self.wait_queue_size)

    if len(unscheduled):
        self._worker.unschedule_compute_units(units=unscheduled)

    logger.info('%s units remain unscheduled' % len(unscheduled))
def wait_pilots(self, pilot_ids=None, state=[DONE, FAILED, CANCELED], timeout=None):
    """Wait for ComputePilots to reach one of the given states.

    The worker is asked for the pilot data about every 0.1 seconds until
    all selected pilots are in one of the states listed in `state`, or
    until `timeout` seconds have passed.

    **Arguments:**

        * **pilot_ids** [`string` or `list of strings`]
          Pilot id(s) handed to the worker's `get_compute_pilot_data()`.
          `None` (default) is passed through unchanged (presumably this
          selects all pilots of this manager -- confirm against the worker).

        * **state** [`string` or `list of strings`]
          The state(s) the pilots have to reach for the call to return.
          Defaults to the terminal states:

          * :data:`radical.pilot.DONE`
          * :data:`radical.pilot.FAILED`
          * :data:`radical.pilot.CANCELED`

        * **timeout** [`float`]
          Optional timeout in seconds.  `None` (default) never times out.
          With a value `<= 0` the pilots are polled exactly once.

    **Returns:**

        * The pilot states seen in the last poll: a list if `pilot_ids` was
          a list or `None`, a single state if a single id was given.

    **Raises:**

        * Whatever `_assert_obj_is_valid()` raises for an invalid manager
          (upstream documents :class:`radical.pilot.PilotException`).
        * `IndexError` if a single id was given but the worker returned no
          pilot data for it.
    """
    self._assert_obj_is_valid()

    # a single state is shorthand for a list with one entry; the shared
    # default list is only rebound here, never modified
    if not isinstance(state, list):
        state = [state]

    # remember the shape of the request, it determines the return type
    single_id = (pilot_ids is not None) and (not isinstance(pilot_ids, list))
    if single_id:
        pilot_ids = [pilot_ids]

    start = time.time()
    states = list()
    all_ok = False

    while not all_ok:
        pilots = self._worker.get_compute_pilot_data(pilot_ids=pilot_ids)

        states = [pilot['state'] for pilot in pilots]
        all_ok = all(s in state for s in states)

        if all_ok:
            # done waiting
            break

        # the timeout is checked after each poll, so the pilots are always
        # polled at least once
        if (timeout is not None) and (timeout <= (time.time() - start)):
            logger.debug("wait timed out: %s" % states)
            break

        # sleep a little if this cycle was idle
        time.sleep(0.1)

    if single_id:
        return states[0]
    return states
def _record_final_pilot_state(self, pilot_col, pilot_id, state, log_message):
    """Store a final pilot state, the pilot logs and a log entry in the DB."""
    out, err, log = self._get_pilot_logs(pilot_col, pilot_id)
    ts = datetime.datetime.utcnow()
    pilot_col.update(
        # never overwrite a pilot which is already DONE
        {"_id": pilot_id, "state": {"$ne": DONE}},
        {
            "$set": {"state": state, "stdout": out, "stderr": err, "logfile": log},
            "$push": {
                "statehistory": {"state": state, "timestamp": ts},
                "log": {"message": log_message, "timestamp": ts},
            },
        },
    )
    logger.debug(log_message)


def check_pilot_states(self, pilot_col):
    """Check the SAGA jobs of all PENDING_ACTIVE / ACTIVE pilots.

    For every such pilot of this pilot manager the SAGA job is looked up
    (job services are cached per URL in
    `self._shared_worker_data["job_services"]`) and the pilot document in
    `pilot_col` is updated accordingly:

      * job FAILED or CANCELED                  -> pilot FAILED
      * job DONE                                -> pilot DONE
      * job state unreadable after reconnecting -> pilot FAILED
      * reconnect failed and the miss counter
        reached JOB_CHECK_MAX_MISSES            -> pilot FAILED
      * otherwise                               -> pilot left untouched

    **Arguments:**

        * **pilot_col**
          The pilot collection; must offer `find()` and `update()` as used
          here (presumably a pymongo collection -- confirm).
    """
    pending_pilots = pilot_col.find(
        {"pilotmanager": self.pilot_manager_id, "state": {"$in": [PENDING_ACTIVE, ACTIVE]}}
    )

    for pending_pilot in pending_pilots:
        pilot_failed = False
        pilot_done = False
        reconnected = False

        pilot_id = pending_pilot["_id"]
        log_message = ""
        saga_job_id = pending_pilot["saga_job_id"]

        logger.info("Performing periodical health check for %s (SAGA job id %s)" % (str(pilot_id), saga_job_id))

        # NOTE(review): the miss counter is never reset after a successful
        # reconnect, so misses accumulate over the pilot's lifetime --
        # confirm that this is intended.
        if pilot_id not in self.missing_pilots:
            self.missing_pilots[pilot_id] = 0

        try:
            # the job id has the form "[<js_url>]-[<...>]"
            js_url = saga_job_id.split("]-[")[0][1:]

            if js_url in self._shared_worker_data["job_services"]:
                js = self._shared_worker_data["job_services"][js_url]
            else:
                js = saga.job.Service(js_url, session=self._session)
                self._shared_worker_data["job_services"][js_url] = js

            saga_job = js.get_job(saga_job_id)
            reconnected = True

            # query the job state once, so that the decision and the log
            # message are based on the same observation
            job_state = saga_job.state

            if job_state in [saga.job.FAILED, saga.job.CANCELED]:
                pilot_failed = True
                log_message = "SAGA job state for ComputePilot %s is %s." % (pilot_id, job_state)

            if job_state in [saga.job.DONE]:
                pilot_done = True
                log_message = "SAGA job state for ComputePilot %s is %s." % (pilot_id, job_state)

        except Exception as e:
            if not reconnected:
                # the job could not be reached at all: tolerate a limited
                # number of misses before giving up on the pilot
                logger.warning("could not reconnect to pilot for state check (%s)" % e)
                self.missing_pilots[pilot_id] += 1

                if self.missing_pilots[pilot_id] >= JOB_CHECK_MAX_MISSES:
                    # report the real count instead of a hard-coded number
                    logger.debug("giving up after %s attempts" % self.missing_pilots[pilot_id])
                    pilot_failed = True
                    log_message = "Could not reconnect to pilot %s " "multiple times - giving up" % pilot_id
            else:
                # reconnected, but the state query itself failed
                logger.warning("pilot state check failed: %s" % e)
                pilot_failed = True
                log_message = (
                    "Couldn't determine job state for ComputePilot %s. " "Assuming it has failed." % pilot_id
                )

        if pilot_failed:
            _record_final_pilot_state(self, pilot_col, pilot_id, FAILED, log_message)
            logger.warning("pilot %s declared dead" % pilot_id)

        elif pilot_done:
            _record_final_pilot_state(self, pilot_col, pilot_id, DONE, log_message)
            # a pilot whose job finished regularly is not "dead"
            logger.info("pilot %s declared done" % pilot_id)

        else:
            if self.missing_pilots[pilot_id]:
                logger.info("pilot %s *assumed* alive and well (%s)" % (pilot_id, self.missing_pilots[pilot_id]))
            else:
                logger.info("pilot %s seems alive and well" % (pilot_id))