def run (self) :
    """
    Daemon workload.

    Loads the cluster credentials from `self.config_file` and then loops
    forever.  Every iteration connects to the MongoDB instance at
    `self.mongodb_url`, collects the current data via `self.get_data()`, and
    upserts each non-empty config / workload / bandwidth document into the
    'config', 'workload' and 'bandwidth' collections, keyed by cluster id.
    The loop sleeps `self.idle_timeout` seconds after each iteration.

    Raises RuntimeError if `self.config_file` is not set.  Any other
    exception propagates to the caller unchanged.
    """

    # FIXME: need a logfile from daemon base class

    if not self.config_file :
        raise RuntimeError ('no bundle config file -- call run() via start_daemon()!')

    self.load_cluster_credentials (self.config_file)

    while True :

        # FIXME: make configurable via config file
        _, db, _, _, _ = ru.mongodb_connect (self.mongodb_url)

        coll_config    = db['config'   ]
        coll_workload  = db['workload' ]
        coll_bandwidth = db['bandwidth']

        data = self.get_data ()

        for cluster_ip in data['cluster_list'] :

            cluster_id = ip2id (cluster_ip)

            # look up all three documents first, so that a missing entry
            # fails before anything is written for this cluster
            candidates = [
                (coll_config   , data['cluster_config'   ][cluster_id]),
                (coll_workload , data['cluster_workload' ][cluster_id]),
                (coll_bandwidth, data['cluster_bandwidth'][cluster_id]),
            ]
            now = time.time ()

            # empty documents are neither stamped nor pushed
            pending = [(coll, doc) for coll, doc in candidates if doc]

            for _, doc in pending :
                doc['timestamp'] = now
                doc['_id']       = cluster_id

            for coll, doc in pending :
                coll.update ({'_id': cluster_id}, doc, upsert=True)

        time.sleep (self.idle_timeout)
def dump (url, mode) :
    """
    Connect to mongodb at the given location, and traverse the databases.

    url:  location passed to `ru.mongodb_connect` (with `_DEFAULT_DBURL` as
          second argument).  If the connection result names a database, only
          that database is visited, otherwise all databases of the server.
    mode: 'list' prints the database names when no database was named;
          'remove' drops the visited databases; every other case (including
          'list' with a named database) is delegated to `handle_db`.

    The connection is released when the traversal ends, also on errors.
    """

    mongo, _, dbname, cname, pname = ru.mongodb_connect (url, _DEFAULT_DBURL)

    try :

        # single-argument print calls behave the same on Python 2 and 3
        print (dbname)

        if dbname :
            dbnames = [dbname]
        else :
            dbnames = mongo.database_names ()

        for name in dbnames :

            if mode == 'list' and not dbname :
                print (" +-- db %s" % name)

            elif mode == 'remove' :

                if (not dbname) or (name == dbname) :
                    try :
                        mongo.drop_database (name)
                        print (" removed database %s" % name)
                    except Exception :
                        # best effort: ignore system tables.  Only
                        # `Exception` is caught so that KeyboardInterrupt
                        # and SystemExit still abort the traversal.
                        pass

            else :
                handle_db (mongo, mode, name, cname, pname)

    finally :
        mongo.disconnect ()
def __init__(self, db_url, db_name="AIMES_bundle"):
    """
    Connect to the MongoDB instance at `db_url`.

    If `db_name` is non-empty it replaces the path component of the URL
    before connecting.  The client, database handle, database name and
    resulting URL string are stored on the instance, as is a
    "username:password" string when the URL carries both (None otherwise).
    The session id and all collection shortcuts start out as None.
    """

    url = ru.Url(db_url)
    if db_name:
        url.path = db_name

    client, handle, name, _, _ = ru.mongodb_connect (url)

    self._client = client
    self._db     = handle
    self._dbname = name
    self._dburl  = str(url)

    has_credentials = url.username and url.password
    self._dbauth = "{}:{}".format(url.username, url.password) \
                   if has_credentials else None

    self._session_id = None

    # shortcuts to collections, unset until a session is attached:
    #   _s : db.session
    #   _r : db.session.resource
    #   _rc: db.session.resource.config
    #   _rw: db.session.resource.workload
    #   _bw: db.session.resource.bandwidth
    #   _bm: db.session.bundle_manager
    for shortcut in ('_s', '_r', '_rc', '_rw', '_bw', '_bm'):
        setattr(self, shortcut, None)
def __init__(self, db_url, db_name="radicalpilot"):
    """
    The constructor.

    Should not be called directly, but rather via the static methods
    new() or reconnect().

    Connects to the MongoDB instance at `db_url`; a non-empty `db_name`
    replaces the path component of the URL before connecting.  Stores the
    client, database handle, URL string and database name, plus a
    "username:password" string when the URL carries both (None otherwise).
    The session id and the collection shortcuts are initialized to None.
    """

    url = ru.Url (db_url)
    if db_name :
        url.path = db_name

    client, handle, name, _, _ = ru.mongodb_connect (url)

    self._client = client
    self._db     = handle
    self._dburl  = str(url)
    self._dbname = name

    has_credentials = url.username and url.password
    self._dbauth = "%s:%s" % (url.username, url.password) \
                   if has_credentials else None

    self._session_id = None

    # collection shortcuts, unset until a session is attached
    for shortcut in ('_s', '_w', '_um', '_p', '_pm'):
        setattr(self, shortcut, None)
def fetch_json(sid, dburl=None, tgt=None, skip_existing=False, session=None,
               log=None):
    '''
    Write the database documents of session `sid` to `<tgt>/<sid>.json`.

    sid:           session id
    dburl:         database URL; defaults to $RADICAL_PILOT_DBURL
    tgt:           target directory (default '.'); relative paths are
                   resolved against the current working directory
    skip_existing: do not fetch again if the target file exists and is
                   not empty
    session:       optional session, used as source for logger and reporter
    log:           optional logger, takes precedence over the session logger

    returns file name

    raises ValueError if no database URL is given or set in the environment
    '''

    # The reporter is resolved independently of the logger, so that it is
    # also defined when the caller passes `log` explicitly.
    if session:
        rep = session._rep
        if not log:
            log = session._log
    else:
        if not log:
            log = ru.Logger('radical.pilot.utils')
        rep = ru.Reporter('radical.pilot.utils')

    if not tgt:
        tgt = '.'

    if tgt.startswith('/'):
        # Assume an absolute path
        dst = os.path.join(tgt, '%s.json' % sid)
    else:
        # Assume a relative path
        dst = os.path.join(os.getcwd(), tgt, '%s.json' % sid)

    # make sure the directory which will hold `dst` exists
    dst_dir = os.path.dirname(dst)
    try:
        os.makedirs(dst_dir)
    except OSError:
        if not os.path.isdir(dst_dir):
            raise

    if skip_existing and os.path.isfile(dst) \
            and os.stat(dst).st_size > 0:

        log.info("session already in %s", dst)

    else:

        if not dburl:
            dburl = os.environ.get('RADICAL_PILOT_DBURL')

        if not dburl:
            raise ValueError('RADICAL_PILOT_DBURL is not set')

        mongo, db, _, _, _ = ru.mongodb_connect(dburl)
        try:
            json_docs = get_session_docs(db, sid)
            ru.write_json(json_docs, dst)
            log.info("session written to %s", dst)
        finally:
            # release the connection also if fetching or writing fails
            mongo.close()

    rep.ok("+ %s (json)\n" % sid)
    return dst
def __init__(self, sid, name, dburl):
    """
    Creates a new session.

    A session is a distinct collection with sub-collections in MongoDB:

      radical.pilot.<sid>     | base collection, holds metadata | self._s
      radical.pilot.<sid>.cu  | all compute units               | self._w
      radical.pilot.<sid>.um  | all unit managers               | self._um
      radical.pilot.<sid>.p   | all pilots                      | self._p
      radical.pilot.<sid>.pm  | all pilot managers              | self._pm

    All collections are created with a new session.  Since MongoDB uses
    lazy-create, they only appear in the database after the first insert.
    That's ok.

    Raises RuntimeError if the connection fails, or if the base collection
    for `sid` already contains documents.
    """

    # mongodb_connect wants a string at the moment
    mongo, db, _, _, _ = ru.mongodb_connect(str(dburl))

    if not (mongo and db):
        raise RuntimeError("Could not connect to database at %s" % dburl)

    now = time.time()

    self._client     = mongo
    self._db         = db
    self._dburl      = ru.Url(dburl)
    self._session_id = sid
    self._created    = now
    self._connected  = now
    self._closed     = None

    # make sure session doesn't exist already
    if self._db[sid].count() != 0:
        raise RuntimeError("Session '%s' already exists." % sid)

    # create the db entry
    self._s = self._db["%s" % sid]
    session_doc = {"_id"       : sid,
                   "name"      : name,
                   "created"   : self._created,
                   "connected" : self._created}
    self._s.insert(session_doc)

    # Create the collection shortcuts
    handle = self._db
    self._w, self._um, self._p, self._pm = (handle["%s.cu" % sid],
                                            handle["%s.um" % sid],
                                            handle["%s.p"  % sid],
                                            handle["%s.pm" % sid])
def fetch_json(sid, dburl=None, tgt=None, skip_existing=False):
    '''
    Write the database documents of session `sid` to `<tgt>/<sid>.json`.

    sid:           session id
    dburl:         database URL; defaults to $RADICAL_PILOT_DBURL, and then
                   to `radical.pilot.session.default_dburl` (with a warning)
    tgt:           target directory (default '.'); relative paths are
                   resolved against the current working directory
    skip_existing: do not fetch again if the target file exists and is
                   not empty

    returns file name
    '''

    if not tgt:
        tgt = '.'

    if tgt.startswith('/'):
        # Assume an absolute path
        dst = os.path.join(tgt, '%s.json' % sid)
    else:
        # Assume a relative path
        dst = os.path.join(os.getcwd(), tgt, '%s.json' % sid)

    if skip_existing and os.path.isfile(dst) \
            and os.stat(dst).st_size > 0:

        # single-argument print calls behave the same on Python 2 and 3
        print("session already in %s" % dst)

    else:

        if not dburl:
            dburl = os.environ.get('RADICAL_PILOT_DBURL')

        if not dburl:
            from radical.pilot.session import default_dburl
            logger.report.warn('using default dburl: %s' % default_dburl)
            dburl = default_dburl

        mongo, db, _, _, _ = ru.mongodb_connect(dburl)
        try:
            json_docs = get_session_docs(db, sid)
            ru.write_json(json_docs, dst)
            print("session written to %s" % dst)
        finally:
            # release the connection also if fetching or writing fails
            mongo.close()

    return dst
def initialize_child(self):
    """
    Prepare the component after it has been spawned.

    Reads session id, MongoDB URL and pilot id from the component
    configuration, connects to the database, initializes the caches and
    their lock, registers the state subscriber and the idle callback, and
    finally announces the component as alive on the command channel.
    """

    cfg = self._cfg

    self._session_id  = cfg['session_id']
    self._mongodb_url = cfg['mongodb_url']
    self._pilot_id    = cfg['pilot_id']

    # only the database handle of the connection result is kept
    _, self._mongo_db, _, _, _ = ru.mongodb_connect(self._mongodb_url)

    self._cinfo       = {}                 # collection cache
    self._lock        = threading.RLock()  # protect _cinfo
    self._state_cache = {}                 # used to preserve state ordering

    self.declare_subscriber('state', 'agent_state_pubsub', self.state_cb)
    self.declare_idle_cb(self.idle_cb, cfg.get('bulk_collection_time'))

    # all components use the command channel for control messages
    self.declare_publisher ('command', rpc.AGENT_COMMAND_PUBSUB)

    # communicate successful startup
    alive_msg = {'cmd' : 'alive',
                 'arg' : self.cname}
    self.publish('command', alive_msg)
def query_db (self) :
    """
    Load the cluster information from the database.

    Connects to the MongoDB instance at `self.mongodb_url`, reads all
    documents of the 'config', 'workload' and 'bandwidth' collections into
    `self._priv` (indexed by document '_id'), and then rebuilds

      - self.resources: dict of Resource instances, indexed by resource
                        name (one per 'config' document), and
      - self.queues   : list of the queues of all those resources.
    """

    _, db, _, _, _ = ru.mongodb_connect (self.mongodb_url)

    priv = self._priv = {'cluster_list'      : [],
                         'cluster_config'    : {},
                         'cluster_workload'  : {},
                         'cluster_bandwidth' : {}}

    # the config collection also defines the list of known clusters
    for doc in list(db['config'].find ()):
        key = doc['_id']
        priv['cluster_list'].append (key)
        priv['cluster_config'][key] = doc

    for section, collection in (('cluster_workload' , 'workload' ),
                                ('cluster_bandwidth', 'bandwidth')):
        for doc in list(db[collection].find ()):
            priv[section][doc['_id']] = doc

    # we have a dictionary of Resources instances, indexed by resource name
    self.resources = {}
    for name in priv['cluster_list']:
        config     = priv['cluster_config'   ].get (name, {})
        workload   = priv['cluster_workload' ].get (name, {})
        bandwidths = priv['cluster_bandwidth'].get (name, {})
        self.resources[name] = Resource(name, config, workload, bandwidths)

    # and a list of Queue instances, for all queues of all resources
    self.queues = []
    for name in self.resources:
        self.queues += self.resources[name].queues.values()
def initialize_child(self):
    """
    Prepare the component after it has been spawned.

    Reads session id, database URL and owner from the component
    configuration, connects to the database, opens an ordered bulk
    operation on the session collection, initializes the bookkeeping for
    bulk collection, and registers the state subscriber and the timed
    callback.
    """

    cfg = self._cfg

    self._session_id = cfg['session_id']
    self._dburl      = cfg['dburl']
    self._owner      = cfg['owner']

    # TODO: get db handle from a connected session
    _, self._mongo_db, _, _, _ = ru.mongodb_connect(self._dburl)

    self._coll = self._mongo_db[self._session_id]
    self._bulk = self._coll.initialize_ordered_bulk_op()

    self._last = time.time()        # time of last bulk push
    self._uids = []                 # list of collected uids
    self._lock = threading.RLock()  # protect _bulk

    # bulk collection limits, with module level defaults
    self._bct = cfg.get('bulk_collection_time', DEFAULT_BULK_COLLECTION_TIME)
    self._bcs = cfg.get('bulk_collection_size', DEFAULT_BULK_COLLECTION_SIZE)

    self.register_subscriber(rpc.STATE_PUBSUB, self._state_cb)
    self.register_timed_cb(self._idle_cb, timer=self._bct)
def initialize_child(self):
    """
    Prepare the component after it has been spawned.

    Reads session id and MongoDB URL from the component configuration,
    registers the idle callback (with the configured heartbeat interval)
    and the command publisher, records pilot id, runtime and start time,
    opens the pilot and unit collections of the session, and finally
    announces the component as alive on the command channel.
    """

    cfg = self._cfg

    self._session_id  = cfg['session_id']
    self._mongodb_url = cfg['mongodb_url']

    self.declare_idle_cb(self.idle_cb, cfg.get('heartbeat_interval'))

    # all components use the command channel for control messages
    self.declare_publisher ('command', rpc.AGENT_COMMAND_PUBSUB)

    self._pilot_id  = cfg['pilot_id']
    self._runtime   = cfg['runtime']
    self._starttime = time.time()

    # set up db connection
    _, mongo_db, _, _, _ = ru.mongodb_connect(cfg['mongodb_url'])

    sid = self._session_id
    self._p  = mongo_db["%s.p"  % sid]
    self._cu = mongo_db["%s.cu" % sid]

    # communicate successful startup
    alive_msg = {'cmd' : 'alive',
                 'arg' : self.cname}
    self.publish('command', alive_msg)
def fetch_logfiles (sid, dburl=None, src=None, tgt=None, access=None,
                    session=None, skip_existing=False, fetch_client=False,
                    log=None):
    '''
    Fetch the logfiles of all pilots of a session.

    sid:           session for which all logfiles are fetched
    dburl:         database URL; defaults to $RADICAL_PILOT_DBURL
    src:           dir to look for client session logfiles (default: cwd)
    tgt:           dir to store the logfile in (default: cwd); the files
                   end up in `<tgt>/<sid>/`
    access:        optional URL whose schema and host replace those of the
                   pilot sandbox URLs
    session:       optional session, passed to the file operations and used
                   as source for logger and reporter
    skip_existing: do not fetch files which exist locally and are not empty
    fetch_client:  also fetch the client logfile `<src>/<sid>.log`
    log:           optional logger, takes precedence over the session logger

    returns list of file names

    raises RuntimeError if no database URL is given or set in the
    environment.  Errors while handling an individual pilot are logged and
    reported, and the remaining pilots are still processed.
    '''

    # The reporter is resolved independently of the logger, so that it is
    # also defined when the caller passes `log` explicitly.
    if session:
        rep = session._rep
        if not log:
            log = session._log
    else:
        if not log:
            log = ru.Logger('radical.pilot.utils')
        rep = ru.Reporter('radical.pilot.utils')

    ret = list()

    if not dburl:
        # use get(): a missing variable must end in the RuntimeError below
        dburl = os.environ.get('RADICAL_PILOT_DBURL')

    if not dburl:
        raise RuntimeError ('Please set RADICAL_PILOT_DBURL')

    if not src:
        src = os.getcwd()

    if not tgt:
        tgt = os.getcwd()

    if not tgt.startswith('/') and '://' not in tgt:
        tgt = "%s/%s" % (os.getcwd(), tgt)

    # we always create a session dir as real target
    tgt_url = rs.Url("%s/%s/" % (tgt, sid))

    # Turn URLs without schema://host into file://localhost,
    # so that they dont become interpreted as relative.
    if not tgt_url.schema:
        tgt_url.schema = 'file'
    if not tgt_url.host:
        tgt_url.host = 'localhost'

    if fetch_client:
        # first fetch session logfile
        client_logfile = "%s/%s.log" % (src, sid)

        ftgt = rs.Url('%s/%s' % (tgt_url, os.path.basename(client_logfile)))
        ret.append("%s" % ftgt.path)

        if skip_existing and os.path.isfile(ftgt.path) \
                and os.stat(ftgt.path).st_size > 0:
            pass
        else:
            log_file = rs.fs.File(client_logfile, session=session)
            log_file.copy(ftgt, flags=rs.fs.CREATE_PARENTS)
            log_file.close()

    _, db, _, _, _ = ru.mongodb_connect (dburl)

    json_docs = get_session_docs(db, sid)

    pilots = json_docs['pilot']
    num_pilots = len(pilots)
    log.info("Session: %s", sid)
    log.info("Number of pilots in session: %d", num_pilots)

    for pilot in pilots:

        try:
            sandbox_url = rs.Url(pilot['pilot_sandbox'])

            if access:
                # Allow to use a different access schema than used for the
                # run.  Useful if you ran from the headnode, but would like
                # to retrieve the logfiles to your desktop (Hello Titan).
                access_url = rs.Url(access)
                sandbox_url.schema = access_url.schema
                sandbox_url.host   = access_url.host

            sandbox = rs.fs.Directory (sandbox_url, session=session)

            # Try to fetch a tarball of logfiles, so that we can get them
            # all in one (SAGA) go!
            LOGFILES_TARBALL  = '%s.log.tgz' % pilot['uid']
            tarball_available = False
            try:
                if sandbox.is_file(LOGFILES_TARBALL) and \
                   sandbox.get_size(LOGFILES_TARBALL):

                    log.info("logfiles tarball exists")
                    ftgt = rs.Url('%s/%s' % (tgt_url, LOGFILES_TARBALL))

                    if skip_existing and os.path.isfile(ftgt.path) \
                            and os.stat(ftgt.path).st_size > 0:

                        log.info("Skip fetching of '%s/%s' to '%s'.",
                                 sandbox_url, LOGFILES_TARBALL, tgt_url)
                        tarball_available = True
                    else:

                        log.info("Fetching '%s%s' to '%s'.",
                                 sandbox_url, LOGFILES_TARBALL, tgt_url)
                        log_file = rs.fs.File("%s%s" % (sandbox_url, LOGFILES_TARBALL),
                                              session=session)
                        log_file.copy(ftgt, flags=rs.fs.CREATE_PARENTS)
                        log_file.close()

                        tarball_available = True
                else:
                    log.warn("logiles tarball doesnt exists")

            except rs.DoesNotExist:
                log.warn("logfiles tarball doesnt exists")

            pilot_dir = "%s/%s" % (tgt_url.path, pilot['uid'])
            try:
                os.mkdir(pilot_dir)
            except OSError:
                pass  # dir exists, or creation is left to CREATE_PARENTS

            # We now have a local tarball
            if tarball_available:
                log.debug("Extract tarball %s to %s", ftgt.path, tgt_url.path)

                extracted = False
                try:
                    # NOTE: the tarball comes from the pilot sandbox and is
                    #       extracted without inspecting member paths
                    tarball = tarfile.open(ftgt.path)
                    try:
                        tarball.extractall(pilot_dir)
                    finally:
                        tarball.close()

                    logfiles = glob.glob("%s/*.log" % pilot_dir)
                    log.info("tarball %s extracted to '%s/%s/'.",
                             ftgt.path, tgt_url.path, pilot['uid'])
                    ret.extend(logfiles)
                    extracted = True
                    os.unlink(ftgt.path)

                except Exception as e:
                    log.warn('could not extract tarball %s [%s]', ftgt.path, e)

                # If extract succeeded, no need to fetch individual logfiles
                if extracted:
                    rep.ok("+ %s (logfiles)\n" % pilot['uid'])
                    continue

            # If we dont have a usable tarball (for whichever reason),
            # fetch individual logfiles
            logfiles = sandbox.list('*.log')

            for logfile in logfiles:

                ftgt = rs.Url('%s/%s/%s' % (tgt_url, pilot['uid'], logfile))
                ret.append("%s" % ftgt.path)

                if skip_existing and os.path.isfile(ftgt.path) \
                        and os.stat(ftgt.path).st_size > 0:
                    continue

                log_file = rs.fs.File("%s%s" % (sandbox_url, logfile), session=session)
                log_file.copy(ftgt, flags=rs.fs.CREATE_PARENTS)
                log_file.close()

            rep.ok("+ %s (logfiles)\n" % pilot['uid'])

        except Exception:
            # keep going with the next pilot, but leave a trace of the cause
            log.exception("failed to fetch logfiles for pilot %s", pilot.get('uid'))
            rep.error("- %s (logfiles)\n" % pilot['uid'])

    return ret
def fetch_logfiles(sid, dburl=None, src=None, tgt=None, access=None,
                   session=None, skip_existing=False, fetch_client=False,
                   log=None):
    '''
    Fetch the logfiles of all pilots of a session.

    sid:           session for which all logfiles are fetched
    dburl:         database URL; defaults to $RADICAL_PILOT_DBURL
    src:           dir to look for client session logfiles (default: cwd)
    tgt:           dir to store the logfile in (default: cwd); the files
                   end up in `<tgt>/<sid>/`
    access:        optional URL whose schema and host replace those of the
                   pilot sandbox URLs
    session:       optional session, passed to the file operations and used
                   as source for logger and reporter
    skip_existing: do not fetch files which exist locally and are not empty
    fetch_client:  also fetch the client logfile `<src>/<sid>.log`
    log:           optional logger, takes precedence over the session logger

    returns list of file names

    raises RuntimeError if no database URL is given or set in the
    environment.  Errors while handling an individual pilot are logged and
    reported, and the remaining pilots are still processed.
    '''

    # The reporter is resolved independently of the logger, so that it is
    # also defined when the caller passes `log` explicitly.
    if session:
        rep = session._rep
        if not log:
            log = session._log
    else:
        if not log:
            log = ru.Logger('radical.pilot.utils')
        rep = ru.Reporter('radical.pilot.utils')

    ret = list()

    if not dburl:
        # use get(): a missing variable must end in the RuntimeError below
        dburl = os.environ.get('RADICAL_PILOT_DBURL')

    if not dburl:
        raise RuntimeError('Please set RADICAL_PILOT_DBURL')

    if not src:
        src = os.getcwd()

    if not tgt:
        tgt = os.getcwd()

    if not tgt.startswith('/') and '://' not in tgt:
        tgt = "%s/%s" % (os.getcwd(), tgt)

    # we always create a session dir as real target
    tgt_url = saga.Url("%s/%s/" % (tgt, sid))

    # Turn URLs without schema://host into file://localhost,
    # so that they dont become interpreted as relative.
    if not tgt_url.schema:
        tgt_url.schema = 'file'
    if not tgt_url.host:
        tgt_url.host = 'localhost'

    if fetch_client:
        # first fetch session logfile
        client_logfile = "%s/%s.log" % (src, sid)

        ftgt = saga.Url('%s/%s' % (tgt_url, os.path.basename(client_logfile)))
        ret.append("%s" % ftgt.path)

        if skip_existing and os.path.isfile(ftgt.path) \
                and os.stat(ftgt.path).st_size > 0:
            pass
        else:
            log_file = saga.filesystem.File(client_logfile, session=session)
            log_file.copy(ftgt, flags=saga.filesystem.CREATE_PARENTS)
            log_file.close()

    _, db, _, _, _ = ru.mongodb_connect(dburl)

    json_docs = get_session_docs(db, sid)

    pilots = json_docs['pilot']
    num_pilots = len(pilots)
    log.info("Session: %s", sid)
    log.info("Number of pilots in session: %d", num_pilots)

    for pilot in pilots:

        try:
            sandbox_url = saga.Url(pilot['pilot_sandbox'])

            if access:
                # Allow to use a different access schema than used for the
                # run.  Useful if you ran from the headnode, but would like
                # to retrieve the logfiles to your desktop (Hello Titan).
                access_url = saga.Url(access)
                sandbox_url.schema = access_url.schema
                sandbox_url.host   = access_url.host

            sandbox = saga.filesystem.Directory(sandbox_url, session=session)

            # Try to fetch a tarball of logfiles, so that we can get them
            # all in one (SAGA) go!
            LOGFILES_TARBALL  = '%s.log.tgz' % pilot['uid']
            tarball_available = False
            try:
                if sandbox.is_file(LOGFILES_TARBALL) and \
                   sandbox.get_size(LOGFILES_TARBALL):

                    log.info("logfiles tarball exists")
                    ftgt = saga.Url('%s/%s' % (tgt_url, LOGFILES_TARBALL))

                    if skip_existing and os.path.isfile(ftgt.path) \
                            and os.stat(ftgt.path).st_size > 0:

                        log.info("Skip fetching of '%s/%s' to '%s'.",
                                 sandbox_url, LOGFILES_TARBALL, tgt_url)
                        tarball_available = True
                    else:

                        log.info("Fetching '%s%s' to '%s'.",
                                 sandbox_url, LOGFILES_TARBALL, tgt_url)
                        log_file = saga.filesystem.File(
                                "%s%s" % (sandbox_url, LOGFILES_TARBALL),
                                session=session)
                        log_file.copy(ftgt, flags=saga.filesystem.CREATE_PARENTS)
                        log_file.close()

                        tarball_available = True
                else:
                    log.warn("logiles tarball doesnt exists")

            except saga.DoesNotExist:
                log.warn("logfiles tarball doesnt exists")

            pilot_dir = "%s/%s" % (tgt_url.path, pilot['uid'])
            try:
                os.mkdir(pilot_dir)
            except OSError:
                pass  # dir exists, or creation is left to CREATE_PARENTS

            # We now have a local tarball
            if tarball_available:
                log.debug("Extract tarball %s to %s", ftgt.path, tgt_url.path)

                extracted = False
                try:
                    # NOTE: the tarball comes from the pilot sandbox and is
                    #       extracted without inspecting member paths
                    tarball = tarfile.open(ftgt.path)
                    try:
                        tarball.extractall(pilot_dir)
                    finally:
                        tarball.close()

                    logfiles = glob.glob("%s/*.log" % pilot_dir)
                    log.info("tarball %s extracted to '%s/%s/'.",
                             ftgt.path, tgt_url.path, pilot['uid'])
                    ret.extend(logfiles)
                    extracted = True
                    os.unlink(ftgt.path)

                except Exception as e:
                    log.warn('could not extract tarball %s [%s]', ftgt.path, e)

                # If extract succeeded, no need to fetch individual logfiles
                if extracted:
                    rep.ok("+ %s (logfiles)\n" % pilot['uid'])
                    continue

            # If we dont have a usable tarball (for whichever reason),
            # fetch individual logfiles
            logfiles = sandbox.list('*.log')

            for logfile in logfiles:

                ftgt = saga.Url('%s/%s/%s' % (tgt_url, pilot['uid'], logfile))
                ret.append("%s" % ftgt.path)

                if skip_existing and os.path.isfile(ftgt.path) \
                        and os.stat(ftgt.path).st_size > 0:
                    continue

                log_file = saga.filesystem.File("%s%s" % (sandbox_url, logfile),
                                                session=session)
                log_file.copy(ftgt, flags=saga.filesystem.CREATE_PARENTS)
                log_file.close()

            rep.ok("+ %s (logfiles)\n" % pilot['uid'])

        except Exception:
            # keep going with the next pilot, but leave a trace of the cause
            log.exception("failed to fetch logfiles for pilot %s", pilot.get('uid'))
            rep.error("- %s (logfiles)\n" % pilot['uid'])

    return ret
def initialize_child(self):
    """
    Read the configuration, set up logging and the mongodb connection,
    and bootstrap all sub-agents and components of this agent.

    The configuration (`self._cfg`) must contain the keys checked below,
    and its 'agent_layout' must have a section for `self.agent_name`.

    Raises ValueError for missing configuration keys and for an 'agent_0'
    which does not run on target 'local', and RuntimeError if the agent
    layout has no section for this agent.  Errors during startup of
    sub-agents and components are logged and re-raised.
    """

    # keep track of objects we need to stop in the finally clause
    self._sub_agents = dict()
    self._components = dict()
    self._workers    = dict()

    # sanity check on config settings -- the first missing key is reported
    required = (
        ('cores',              "Missing number of cores"),
        ('debug',              "Missing DEBUG level"),
        ('lrms',               "Missing LRMS"),
        ('mongodb_url',        "Missing MongoDB URL"),
        ('pilot_id',           "Missing pilot id"),
        ('runtime',            "Missing or zero agent runtime"),
        ('scheduler',          "Missing agent scheduler"),
        ('session_id',         "Missing session id"),
        ('spawner',            "Missing agent spawner"),
        ('task_launch_method', "Missing unit launch method"),
        ('agent_layout',       "Missing agent layout"),
    )
    for key, complaint in required:
        if key not in self._cfg:
            raise ValueError(complaint)

    # we pick the layout according to our role (name).  This check has to
    # come before the layout section is accessed, otherwise a missing
    # section surfaces as a bare KeyError.
    # NOTE: we don't do sanity checks on the agent layout (too lazy) -- but
    #       we would hiccup badly over ill-formatted or incomplete layouts...
    if self.agent_name not in self._cfg['agent_layout']:
        raise RuntimeError("no agent layout section for %s" % self.agent_name)

    self._pilot_id   = self._cfg['pilot_id']
    self._session_id = self._cfg['session_id']
    self._runtime    = self._cfg['runtime']
    self._sub_cfg    = self._cfg['agent_layout'][self.agent_name]
    self._pull_units = self._sub_cfg.get('pull_units', False)

    # this better be on a shared FS!
    self._cfg['workdir'] = os.getcwd()

    # another sanity check
    if self.agent_name == 'agent_0':
        if self._sub_cfg.get('target', 'local') != 'local':
            raise ValueError("agent_0 must run on target 'local'")

    # configure the agent logger
    self._log.setLevel(self._cfg['debug'])

    # set up db connection -- only for the master agent and for the agent
    # which pulls units (which might be the same)
    if self.agent_name == 'agent_0' or self._pull_units:
        self._log.debug('connecting to mongodb at %s for unit pull',
                        self._cfg['mongodb_url'])
        _, mongo_db, _, _, _ = ru.mongodb_connect(self._cfg['mongodb_url'])

        self._p  = mongo_db["%s.p"  % self._session_id]
        self._cu = mongo_db["%s.cu" % self._session_id]
        self._log.debug('connected to mongodb')

    # first order of business: set the start time and state of the pilot
    # Only the master agent performs this action
    if self.agent_name == 'agent_0':
        now = time.time()
        ret = self._p.update(
            {"_id": self._pilot_id},
            {"$set" : {"state"        : rps.ACTIVE,
                       "started"      : now},
             "$push": {"statehistory" : {"state"    : rps.ACTIVE,
                                         "timestamp": now}}
            })
        # TODO: Check for return value, update should be true!
        self._log.info("Database updated: %s", ret)

    # make sure we collect commands, specifically to implement the startup
    # barrier on bootstrap_4
    self.declare_publisher ('command', rpc.AGENT_COMMAND_PUBSUB)
    self.declare_subscriber('command', rpc.AGENT_COMMAND_PUBSUB, self.barrier_cb)

    # Now instantiate all communication and notification channels, and all
    # components and workers.  It will then feed a set of units to the
    # lead-in queue (staging_input).  A state notification callback will
    # then register all units which reached a final state (DONE).  Once all
    # units are accounted for, it will tear down all created objects.
    try:
        self.start_sub_agents()
        self.start_components()

        # before we declare bootstrapping-success, we wait for all
        # components, workers and sub_agents to complete startup.  For
        # that, all sub-agents will wait for ALIVE messages on the COMMAND
        # pubsub for all entities they spawned.  Only when all are alive,
        # we will continue here.
        self.alive_barrier()

    except Exception as e:
        self._log.exception("Agent setup error: %s" % e)
        raise

    self._prof.prof('Agent setup done', logger=self._log.debug, uid=self._pilot_id)

    # also watch all components (callback registered with a 10.0 timeout)
    self.declare_idle_cb(self.watcher_cb, 10.0)

    # once bootstrap_4 is done, we signal success to the parent agent
    # -- if we have any parent...
    if self.agent_name != 'agent_0':
        self.publish('command', {'cmd' : 'alive',
                                 'arg' : self.agent_name})

    # the pulling agent registers the staging_input_queue as this is what
    # we want to push to
    # FIXME: do a sanity check on the config that only one agent pulls, as
    #        this is a non-atomic operation at this point
    self._log.debug('agent will pull units: %s' % bool(self._pull_units))
    if self._pull_units:

        self.declare_output(rps.AGENT_STAGING_INPUT_PENDING, rpc.AGENT_STAGING_INPUT_QUEUE)
        self.declare_publisher('state', rpc.AGENT_STATE_PUBSUB)

        # register idle callback, to pull for units -- which is the only
        # action we have to perform, really
        self.declare_idle_cb(self.idle_cb, self._cfg['db_poll_sleeptime'])
def get_session_frames (sids, db=None, cachedir=None) :
    """
    Build pandas data frames for sessions, pilots and units.

    sids:     a session id, or a list of session ids
    db:       database handle to read from.  If not given, a connection is
              opened to $RADICAL_PILOT_DBURL, and closed again before the
              function returns.
    cachedir: passed on to `get_session_docs`

    Returns a tuple `(session_frame, pilot_frame, unit_frame)`.  All time
    stamps in the pilot and unit frames are relative to the 'created' time
    stamp of the respective session; the session's 'finished' entry is the
    latest pilot state history time stamp (0 if there is none).

    Raises RuntimeError if neither `db` nor $RADICAL_PILOT_DBURL is set.
    """

    # use like this:
    #
    # session_frame, pilot_frame, unit_frame = rpu.get_session_frames (session, db, cachedir)
    # pandas.set_option('display.width', 1000)
    # print session_frame
    # print pilot_frame
    # print unit_frame
    #
    # u_min = unit_frame.ix[unit_frame['started'].idxmin()]['started']
    # u_max = unit_frame.ix[unit_frame['finished'].idxmax()]['finished']
    # print u_min
    # print u_max
    # print u_max - u_min

    # `mongo` is only set if we open the connection ourselves
    mongo = None

    if not db:
        dburl = os.environ.get('RADICAL_PILOT_DBURL')
        if not dburl:
            raise RuntimeError ('Please set RADICAL_PILOT_DBURL')

        mongo, db, _, _, _ = ru.mongodb_connect(dburl)

    if not isinstance (sids, list) :
        sids = [sids]

    # one dict per session / pilot / unit -- these become the frame rows
    session_dicts = list()
    pilot_dicts   = list()
    unit_dicts    = list()

    for sid in sids :

        docs = get_session_docs (db, sid, cachedir=cachedir)

        session       = docs['session']
        session_start = session['created']
        session_dict  = {
            'sid'       : sid,
            'started'   : session['created'],
            'finished'  : None,
            'n_pilots'  : len(docs['pilot']),
            'n_units'   : 0
            }

        last_pilot_event = 0
        for pilot in docs['pilot'] :

            pid         = pilot['_id']
            description = pilot.get ('description', dict())
            started     = pilot.get ('started')
            finished    = pilot.get ('finished')

            cores = 0

            # prefer the actual node information over the requested cores
            if pilot['nodes'] and pilot['cores_per_node']:
                cores = len(pilot['nodes']) * pilot['cores_per_node']
            else:
                cores = description.get('cores')

            # make times relative to session start
            if started  : started  -= session_start
            if finished : finished -= session_start

            pilot_dict = {
                'sid'          : sid,
                'pid'          : pid,
                'n_units'      : len(pilot.get ('unit_ids', list())),
                'started'      : started,
                'finished'     : finished,
                'resource'     : description.get ('resource'),
                'cores'        : cores,
                'runtime'      : description.get ('runtime'),
                NEW            : None,
                PENDING_LAUNCH : None,
                LAUNCHING      : None,
                PENDING_ACTIVE : None,
                ACTIVE         : None,
                DONE           : None,
                FAILED         : None,
                CANCELED       : None
            }

            # later history entries for the same state overwrite earlier ones
            for entry in pilot.get('statehistory', list()):
                state = entry['state']
                timer = entry['timestamp'] - session_start
                pilot_dict[state] = timer
                last_pilot_event  = max(last_pilot_event, timer)

            # fall back to the next recorded state if NEW is not set
            if not pilot_dict[NEW]:
                if pilot_dict[PENDING_LAUNCH]:
                    pilot_dict[NEW] = pilot_dict[PENDING_LAUNCH]
                else:
                    pilot_dict[NEW] = pilot_dict[LAUNCHING]

            pilot_dicts.append (pilot_dict)

        for unit in docs['unit']:

            uid         = unit['_id']
            started     = unit.get ('started')
            finished    = unit.get ('finished')
            description = unit.get ('description', dict())

            # make times relative to session start
            if started  : started  -= session_start
            if finished : finished -= session_start

            session_dict['n_units'] += 1

            unit_dict = {
                'sid'                  : sid,
                'pid'                  : unit.get('pilot'),
                'uid'                  : uid,
                'started'              : started,
                'finished'             : finished,
                'cores'                : description.get ('cores'),
                'slots'                : unit.get ('slots'),
                NEW                    : None,
                UNSCHEDULED            : None,
                PENDING_INPUT_STAGING  : None,
                STAGING_INPUT          : None,
                EXECUTING_PENDING      : None,
                SCHEDULING             : None,
                ALLOCATING             : None,
                EXECUTING              : None,
                PENDING_OUTPUT_STAGING : None,
                STAGING_OUTPUT         : None,
                DONE                   : None,
                FAILED                 : None,
                CANCELED               : None
            }

            # first pass: take the state history as is.  The entries for
            # all primary states (and their pending states) are overwritten
            # by the cleanup below.
            for entry in unit.get('statehistory', list()):
                state = entry['state']
                timer = entry['timestamp'] - session_start
                unit_dict[state] = timer

            # FIXME: there is more state messup afloat: some states are missing,
            # even though we know they have happened.  For one, we see data
            # being staged w/o having a record of InputStaging states.  Or we
            # find callback history entries for states which are not in the
            # history...
            #
            # We try to clean up to some extent.  The policy is like this, for
            # any [pending_state, state] pair:
            #
            # - if both are in the hist: great
            # - if one is in the hist, and the other in the cb hist, use like
            #   that, but ensure that pending_state <= state
            # - if both are in cb_hist, use them, apply same ordering assert.
            #   Use median if ordering is wrong
            # - if only one is in cb_hist, use the same value for the other one
            # - if neither is anywhere, leave unset

            # collect all time stamps per state, from both histories
            rec_hist = dict()
            cb_hist  = dict()

            for e in unit.get('statehistory', list()):
                state = e['state']
                timer = e['timestamp'] - session_start
                if state not in rec_hist:
                    rec_hist[state] = list()
                rec_hist[state].append(timer)

            for e in unit.get('callbackhistory', list()):
                state = e['state']
                timer = e['timestamp'] - session_start
                if state not in cb_hist:
                    cb_hist[state] = list()
                cb_hist[state].append(timer)

            # maps a state to the pending state which precedes it
            statepairs = {STAGING_INPUT  : PENDING_INPUT_STAGING ,
                          STAGING_OUTPUT : PENDING_OUTPUT_STAGING}

            primary_states = [NEW               ,
                              UNSCHEDULED       ,
                              STAGING_INPUT     ,
                              EXECUTING_PENDING ,
                              SCHEDULING        ,
                              ALLOCATING        ,
                              EXECUTING         ,
                              STAGING_OUTPUT    ,
                              DONE              ,
                              CANCELED          ,
                              FAILED            ]

            for state in primary_states:

                pend    = None
                t_state = None
                t_pend  = None

                ts_rec  = rec_hist.get (state) #         state time stamp from state hist
                ts_cb   = cb_hist.get  (state) #         state time stamp from cb    hist
                tp_rec  = None                 # pending state time stamp from state hist
                tp_cb   = None                 # pending state time stamp from cb    hist

                if state in statepairs:
                    pend   = statepairs[state]
                    tp_rec = rec_hist.get (pend)
                    tp_cb  = cb_hist.get  (pend)

                # try to find a candidate for state timestamp
                if   ts_rec : t_state = ts_rec[0]
                elif ts_cb  : t_state = ts_cb [0]
                elif tp_rec : t_state = tp_rec[0]
                elif tp_cb  : t_state = tp_cb [0]

                # try to find a candidate for pending timestamp
                if   tp_rec : t_pend  = tp_rec[0]
                elif tp_cb  : t_pend  = tp_cb [0]

                # if there is no t_pend, check if there are two state times on
                # record (in the state hist), and if so, reorder
                if pend :
                    if t_state and not t_pend:
                        if ts_rec and len(ts_rec) == 2:
                            t_pend  = min (ts_rec)
                            t_state = max (ts_rec)
                        else:
                            t_pend  = t_state

                # make sure that any pending time comes before state time
                # NOTE(review): both values can still be None here (no time
                #               stamp in either history); comparing None
                #               values raises TypeError on Python 3 -- confirm
                #               the targeted interpreter version.
                if pend:
                    if t_pend > t_state:
                      # print "%s : %s" % (uid, state)
                        t_med   = (t_pend + t_state) / 2
                        t_pend  = t_med
                        t_state = t_med

                # record the times for the data frame
                unit_dict[state] = t_state

                if pend :
                    unit_dict[pend] = t_pend

            # UNSCHEDULED must not come after SCHEDULING
            if unit_dict[UNSCHEDULED] and unit_dict[SCHEDULING]:
                unit_dict[UNSCHEDULED] = min(unit_dict[UNSCHEDULED], unit_dict[SCHEDULING])

            # fall back to later states if NEW is not set -- SCHEDULING
            # wins if both UNSCHEDULED and SCHEDULING are available
            if not unit_dict[NEW]:
                if unit_dict[UNSCHEDULED]:
                    unit_dict[NEW] = unit_dict[UNSCHEDULED]
                if unit_dict[SCHEDULING]:
                    unit_dict[NEW] = unit_dict[SCHEDULING]

            unit_dicts.append (unit_dict)

        # the session ends with the last recorded pilot state transition
        session_dict['finished'] = last_pilot_event
        session_dicts.append (session_dict)

    # pandas is only imported when frames are actually requested
    import pandas
    session_frame = pandas.DataFrame (session_dicts)
    pilot_frame   = pandas.DataFrame (pilot_dicts)
    unit_frame    = pandas.DataFrame (unit_dicts)

    # only close connections we opened ourselves
    if mongo:
        mongo.close()

    return session_frame, pilot_frame, unit_frame
def fetch_logfiles (sid, dburl=None, client=None, tgt=None, access=None,
                    session=None, skip_existing=False):
    '''
    Fetch the client logfile and the logfiles of all pilots of a session.

    sid:           session for which all logfiles are fetched
    dburl:         database URL; defaults to $RADICAL_PILOT_DBURL, and then
                   to `radical.pilot.session.default_dburl` (with a warning)
    client:        dir to look for client session logfiles (default: cwd)
    tgt:           dir to store the logfile in (default: cwd)
    access:        optional URL whose schema and host replace those of the
                   pilot sandbox URLs
    session:       optional session, passed to the file operations
    skip_existing: do not fetch files which exist locally and are not empty

    returns list of file names
    '''

    # NOTE: single-argument print calls are used throughout, as they behave
    #       the same on Python 2 and 3

    ret = list()

    if not dburl:
        dburl = os.environ.get('RADICAL_PILOT_DBURL')

    if not dburl:
        from radical.pilot.session import default_dburl
        logger.report.warn('using default dburl: %s' % default_dburl)
        dburl = default_dburl

    if not client:
        client = os.getcwd()

    if not tgt:
        tgt = os.getcwd()

    if not tgt.startswith('/') and '://' not in tgt:
        tgt = "%s/%s" % (os.getcwd(), tgt)

    tgt_url = saga.Url(tgt)

    # Turn URLs without schema://host into file://localhost,
    # so that they dont become interpreted as relative.
    if not tgt_url.schema:
        tgt_url.schema = 'file'
    if not tgt_url.host:
        tgt_url.host = 'localhost'

    # first fetch session logfile
    # FIXME: should we record pwd or logfile location in db session?  Or create
    #        a sandbox like dir for storing logfiles and logs?
    client_logfile = "%s/%s.log" % (client, sid)

    ftgt = saga.Url('%s/%s' % (tgt_url, os.path.basename(client_logfile)))
    ret.append("%s" % ftgt.path)

    if skip_existing and os.path.isfile(ftgt.path) \
            and os.stat(ftgt.path).st_size > 0:

        logger.report.info("\t- %s\n" % client_logfile.split('/')[-1])

    else:

        if not os.path.isfile(client_logfile):
            print('skipping client logfile: %s does not exist' % client_logfile)

        else:
            logger.report.info("\t+ %s\n" % client_logfile.split('/')[-1])
            log_file = saga.filesystem.File(client_logfile, session=session)
            log_file.copy(ftgt, flags=saga.filesystem.CREATE_PARENTS)
            log_file.close()

    _, db, _, _, _ = ru.mongodb_connect (dburl)

    json_docs = get_session_docs(db, sid)

    pilots = json_docs['pilot']

    for pilot in pilots:

        sandbox_url = saga.Url(pilot['sandbox'])

        if access:
            # Allow to use a different access scheme than used for the run.
            # Useful if you ran from the headnode, but would like to retrieve
            # the logfiles to your desktop (Hello Titan).
            access_url = saga.Url(access)
            sandbox_url.schema = access_url.schema
            sandbox_url.host   = access_url.host

        sandbox = saga.filesystem.Directory (sandbox_url, session=session)

        # Try to fetch a tarball of logfiles, so that we can get them all in
        # one (SAGA) go!
        LOGFILES_TARBALL  = '%s.log.tgz' % pilot['_id']
        tarball_available = False
        try:
            if sandbox.is_file(LOGFILES_TARBALL):

                print("Logfiles tarball exists!")
                ftgt = saga.Url('%s/%s' % (tgt_url, LOGFILES_TARBALL))

                if skip_existing and os.path.isfile(ftgt.path) \
                        and os.stat(ftgt.path).st_size > 0:

                    print("Skipping fetching of '%s/%s' to '%s'."
                          % (sandbox_url, LOGFILES_TARBALL, tgt_url))
                    tarball_available = True
                else:

                    print("Fetching '%s%s' to '%s'."
                          % (sandbox_url, LOGFILES_TARBALL, tgt_url))
                    log_file = saga.filesystem.File("%s%s" % (sandbox_url, LOGFILES_TARBALL),
                                                    session=session)
                    log_file.copy(ftgt, flags=saga.filesystem.CREATE_PARENTS)
                    log_file.close()

                    tarball_available = True
            else:
                print("Logfiles tarball doesnt exists!")

        except saga.DoesNotExist:
            print("exception(TODO): logfiles tarball doesnt exists!")

        pilot_dir = "%s/%s" % (tgt_url.path, pilot['_id'])
        try:
            os.mkdir(pilot_dir)
        except OSError:
            pass  # dir exists, or creation is left to CREATE_PARENTS

        # We now have a local tarball
        if tarball_available:
            print("Extracting tarball %s into '%s'." % (ftgt.path, tgt_url.path))

            # NOTE: the tarball comes from the pilot sandbox and is
            #       extracted without inspecting member paths
            tarball = tarfile.open(ftgt.path)
            try:
                tarball.extractall(pilot_dir)
            finally:
                # do not leak the file handle, also on extraction errors
                tarball.close()

            logfiles = glob.glob("%s/*.log" % pilot_dir)
            print("Tarball %s extracted to '%s/%s/'."
                  % (ftgt.path, tgt_url.path, pilot['_id']))
            ret.extend(logfiles)
            os.unlink(ftgt.path)

            # If extract succeeded, no need to fetch individual logfiles
            continue

        # If we dont have a tarball (for whichever reason), fetch individual
        # logfiles
        logfiles = sandbox.list('*.log')

        for log in logfiles:

            ftgt = saga.Url('%s/%s/%s' % (tgt_url, pilot['_id'], log))
            ret.append("%s" % ftgt.path)

            if skip_existing and os.path.isfile(ftgt.path) \
                    and os.stat(ftgt.path).st_size > 0:

                logger.report.info("\t- %s\n" % str(log).split('/')[-1])
                continue

            logger.report.info("\t+ %s\n" % str(log).split('/')[-1])
            log_file = saga.filesystem.File("%s%s" % (sandbox_url, log), session=session)
            log_file.copy(ftgt, flags=saga.filesystem.CREATE_PARENTS)
            log_file.close()

    return ret
def fetch_profiles (sid, dburl=None, src=None, tgt=None, access=None,
        session=None, skip_existing=False):
    '''
    Fetch the client and pilot profiles of a session into a local directory.

    sid          : session for which all profiles are fetched
    dburl        : MongoDB URL to read the session documents from; falls back
                   to the environment variable RADICAL_PILOT_DBURL
    src          : dir to look for client session profiles ($src/$sid/*.prof),
                   defaults to the current working directory
    tgt          : dir to store the profile in
                   - $tgt/$sid/*.prof,
                   - $tgt/$sid/$pilot_id/*.prof)
                   defaults to the current working directory; relative paths
                   are resolved against the current working directory
    access       : optional URL whose schema and host replace those of the
                   pilot sandbox URLs
    session      : handed to all saga filesystem objects created here
    skip_existing: do not fetch files which already exist locally with a
                   non-zero size

    returns list of file names

    raises RuntimeError if no database URL is available, or if no client
    profiles are found in $src/$sid/
    '''

    # Resolve the database URL before doing anything else.  Use `get()` so
    # that an unset variable results in the intended RuntimeError below
    # instead of a bare KeyError.
    if not dburl:
        dburl = os.environ.get('RADICAL_PILOT_DBURL')

    if not dburl:
        raise RuntimeError ('Please set RADICAL_PILOT_DBURL')

    log = ru.get_logger('radical.pilot.utils')
    ret = list()

    if not src:
        src = os.getcwd()

    if not tgt:
        tgt = os.getcwd()

    if not tgt.startswith('/') and '://' not in tgt:
        tgt = "%s/%s" % (os.getcwd(), tgt)

    # we always create a session dir as real target
    tgt_url = saga.Url("%s/%s/" % (tgt, sid))

    # Turn URLs without schema://host into file://localhost,
    # so that they dont become interpreted as relative.
    if not tgt_url.schema:
        tgt_url.schema = 'file'
    if not tgt_url.host:
        tgt_url.host = 'localhost'

    # first fetch session profile
    # FIXME: should we record pwd or profile location in db session?  Or create
    #        a sandbox like dir for storing profiles and logs?
    client_profiles = glob.glob("%s/%s/*.prof" % (src, sid))
    if not client_profiles:
        raise RuntimeError('no client profiles in %s/%s' % (src, sid))

    # NOTE(review): `logger` is not defined in this function -- presumably
    #               a module level object; confirm.
    for client_profile in client_profiles:

        ftgt = saga.Url('%s/%s' % (tgt_url, os.path.basename(client_profile)))
        ret.append("%s" % ftgt.path)

        if skip_existing and os.path.isfile(ftgt.path) \
                and os.stat(ftgt.path).st_size > 0:
            logger.report.info("\t- %s\n" % client_profile.split('/')[-1])

        else:
            logger.report.info("\t+ %s\n" % client_profile.split('/')[-1])
            prof_file = saga.filesystem.File(client_profile, session=session)
            prof_file.copy(ftgt, flags=saga.filesystem.CREATE_PARENTS)
            prof_file.close()

    _, db, _, _, _ = ru.mongodb_connect (dburl)

    json_docs = get_session_docs(db, sid)

    pilots = json_docs['pilot']
    num_pilots = len(pilots)
    log.debug("Session: %s", sid)
    log.debug("Number of pilots in session: %d", num_pilots)

    for pilot in pilots:

        pilot['uid'] = pilot['_id']

        log.debug("processing pilot '%s'", pilot['uid'])

        sandbox_url = saga.Url(pilot['sandbox'])

        if access:
            # Allow to use a different access schema than used for the run.
            # Useful if you ran from the headnode, but would like to retrieve
            # the profiles to your desktop (Hello Titan).
            access_url = saga.Url(access)
            sandbox_url.schema = access_url.schema
            sandbox_url.host = access_url.host

        sandbox = saga.filesystem.Directory (sandbox_url, session=session)

        # all profiles of this pilot end up in this local directory
        pilot_dir = "%s/%s" % (tgt_url.path, pilot['uid'])

        # Try to fetch a tarball of profiles, so that we can get them all in
        # one (SAGA) go!
        PROFILES_TARBALL = '%s.prof.tgz' % pilot['uid']
        tarball_available = False
        try:
            if sandbox.is_file(PROFILES_TARBALL):

                log.warn("Profiles tarball exists")
                ftgt = saga.Url('%s/%s' % (tgt_url, PROFILES_TARBALL))

                if skip_existing and os.path.isfile(ftgt.path) \
                        and os.stat(ftgt.path).st_size > 0:
                    log.info("skip fetching of '%s/%s' to '%s'.",
                             sandbox_url, PROFILES_TARBALL, tgt_url)
                    tarball_available = True

                else:
                    log.info("fetch '%s%s' to '%s'.",
                             sandbox_url, PROFILES_TARBALL, tgt_url)
                    prof_file = saga.filesystem.File(
                            "%s%s" % (sandbox_url, PROFILES_TARBALL),
                            session=session)
                    prof_file.copy(ftgt, flags=saga.filesystem.CREATE_PARENTS)
                    prof_file.close()
                    tarball_available = True

            else:
                log.warn("profiles tarball doesnt exists!")

        except saga.DoesNotExist:
            log.exception("exception(TODO): profiles tarball doesnt exists!")

        # best effort: the directory may already exist
        try:
            os.mkdir(pilot_dir)
        except OSError:
            pass

        # We now have a local tarball
        if tarball_available:
            log.info("Extract tarball %s to '%s'.", ftgt.path, tgt_url.path)
            try:
                tarball = tarfile.open(ftgt.path, mode='r:gz')
                try:
                    # NOTE(review): extractall() trusts the member paths
                    #               stored in the tarball -- confirm that the
                    #               sandbox content can be trusted.
                    tarball.extractall(pilot_dir)
                finally:
                    # always release the file handle
                    tarball.close()

                profiles = glob.glob("%s/*.prof" % pilot_dir)
                ret.extend(profiles)

            except Exception as e:
                log.warn('could not extract tarball %s [%s]', ftgt.path, e)
                print('skip %s [%s]' % (ftgt.path, e))

            # The tarball was handled (extracted or skipped), so individual
            # profiles are not fetched for this pilot.
            continue

        # If we dont have a tarball (for whichever reason), fetch individual
        # profiles
        profiles = sandbox.list('*.prof')

        for prof in profiles:

            ftgt = saga.Url('%s/%s/%s' % (tgt_url, pilot['uid'], prof))
            ret.append("%s" % ftgt.path)

            if skip_existing and os.path.isfile(ftgt.path) \
                    and os.stat(ftgt.path).st_size > 0:
                logger.report.info("\t- %s\n" % str(prof).split('/')[-1])
                continue

            logger.report.info("\t+ %s\n" % str(prof).split('/')[-1])
            prof_file = saga.filesystem.File("%s%s" % (sandbox_url, prof),
                                             session=session)
            prof_file.copy(ftgt, flags=saga.filesystem.CREATE_PARENTS)
            prof_file.close()

    return ret
#!/usr/bin/env python

import os

import radical.utils as ru

# database which receives the session documents
dburl = 'mongodb://144.76.72.175/am'

# Directory of this script.  `__file__` may be a bare file name (e.g. when
# the script is started from its own directory), in which case `dirname()`
# alone would yield '' and the json file below would be searched in '/'.
pwd = os.path.dirname(os.path.abspath(__file__))


if __name__ == '__main__':

    # Create a new session collection and insert all documents found in
    # `01_publish_resources.json` (located next to this script).
    mongo, db, _, _, _ = ru.mongodb_connect(str(dburl))

    sid  = ru.generate_id('rp.session', mode=ru.ID_PRIVATE)
    coll = db[sid]
    docs = ru.read_json('%s/01_publish_resources.json' % pwd)

    print('create session %s' % sid)

    for doc in docs:

        # the session document is identified by the newly generated ID
        if doc['type'] == 'session':
            doc['uid'] = sid
            doc['_id'] = sid

        coll.insert(doc)
        print('insert %s %s' % (doc['type'], doc['uid']))

    print('inserted session %s' % sid)
def bootstrap_3():
    """
    This method continues where the bootstrapper left off, but will quickly pass
    control to the Agent class which will spawn the functional components.

    Most of bootstrap_3 applies only to agent_0, in particular all mongodb
    interactions remains excluded for other sub-agent instances.

    The agent interprets a config file, which will specify in an agent_layout
    section:
      - what nodes should be used for sub-agent startup
      - what bridges should be started
      - what components should be started
      - what are the endpoints for bridges which are not started
    bootstrap_3 will create derived config files for all sub-agents.

    The agent master (agent_0) will collect information about the nodes required
    for all instances.  That is added to the config itself, for the benefit of
    the LRMS initialisation which is expected to block those nodes from the
    scheduler.

    The agent name is read from `sys.argv[1]`, the config from
    `<cwd>/<agent_name>.cfg`.  A RuntimeError is raised if the number of
    command line arguments is wrong, or if agent_0 cannot obtain a mongodb
    collection handle.  The module level globals `lrms`, `agent` and `bridges`
    are set and reset by this method.
    """

    global lrms, agent, bridges

    # find out what agent instance name we have
    if len(sys.argv) != 2:
        raise RuntimeError("invalid number of parameters (%s)" % sys.argv)
    agent_name = sys.argv[1]

    # load the agent config, and overload the config dicts
    agent_cfg = "%s/%s.cfg" % (os.getcwd(), agent_name)
    print("startup agent %s : %s" % (agent_name, agent_cfg))

    cfg = ru.read_json_str(agent_cfg)
    cfg["agent_name"] = agent_name
    pilot_id = cfg["pilot_id"]

    # set up a logger and profiler
    prof = ru.Profiler("%s.bootstrap_3" % agent_name)
    prof.prof("sync ref", msg="agent start", uid=pilot_id)
    log = ru.get_logger("%s.bootstrap_3" % agent_name,
                        "%s.bootstrap_3.log" % agent_name, "DEBUG")  # FIXME?
    log.info("start")
    prof.prof("sync ref", msg="agent start")

    # setting the process title is optional
    try:
        import setproctitle as spt
        spt.setproctitle("radical.pilot %s" % agent_name)
    except Exception as e:
        log.debug("no setproctitle: %s", e)

    log.setLevel(cfg.get("debug", "INFO"))

    print("Agent config (%s):\n%s\n\n" % (agent_cfg, pprint.pformat(cfg)))

    # quickly set up a mongodb handle so that we can report errors.
    # FIXME: signal handlers need mongo_p, but we won't have that until later
    if agent_name == "agent_0":

        # Check for the RADICAL_PILOT_DB_HOSTPORT env var, which will hold the
        # address of the tunnelized DB endpoint.
        # If it exists, we overrule the agent config with it.
        hostport = os.environ.get("RADICAL_PILOT_DB_HOSTPORT")
        if hostport:
            dburl = ru.Url(cfg["mongodb_url"])
            dburl.host, dburl.port = hostport.split(":")
            cfg["mongodb_url"] = str(dburl)

        _, mongo_db, _, _, _ = ru.mongodb_connect(cfg["mongodb_url"])
        mongo_p = mongo_db["%s.p" % cfg["session_id"]]

        if not mongo_p:
            raise RuntimeError("could not get a mongodb handle")

    # set up signal and exit handlers
    def exit_handler():
        global lrms, agent, bridges

        print("atexit")
        if lrms:
            lrms.stop()
            lrms = None
        if bridges:
            # `bridges` maps bridge names to dicts which hold the actual
            # bridge under 'handle' (see the finally clause below) --
            # iterating over the dict itself would only yield the names.
            for b in bridges.values():
                b["handle"].stop()
            bridges = dict()
        if agent:
            agent.stop()
            agent = None
        sys.exit(1)

    # NOTE(review): the handlers below call pilot_FAILED with `msg` only,
    #               unlike the calls in the finally clause -- presumably the
    #               other parameters have defaults; confirm.
    def sigint_handler(signum, frame):
        if agent_name == "agent_0":
            pilot_FAILED(msg="Caught SIGINT. EXITING (%s)" % frame)
        print("sigint")
        prof.prof("stop", msg="sigint_handler", uid=pilot_id)
        prof.close()
        sys.exit(2)

    def sigterm_handler(signum, frame):
        if agent_name == "agent_0":
            pilot_FAILED(msg="Caught SIGTERM. EXITING (%s)" % frame)
        print("sigterm")
        prof.prof("stop", msg="sigterm_handler %s" % os.getpid(), uid=pilot_id)
        prof.close()
        sys.exit(3)

    def sigalarm_handler(signum, frame):
        if agent_name == "agent_0":
            pilot_FAILED(msg="Caught SIGALRM (Walltime limit?). EXITING (%s)" % frame)
        print("sigalrm")
        prof.prof("stop", msg="sigalarm_handler", uid=pilot_id)
        prof.close()
        sys.exit(4)

    import atexit
    atexit.register(exit_handler)
    signal.signal(signal.SIGINT, sigint_handler)
    signal.signal(signal.SIGTERM, sigterm_handler)
    signal.signal(signal.SIGALRM, sigalarm_handler)

    # if anything went wrong up to this point, we would have been unable to
    # report errors into mongodb.  From here on, any fatal error should result
    # in one of the above handlers or exit handlers being activated, thus
    # reporting the error dutifully.

    try:
        # ----------------------------------------------------------------------
        # des Pudels Kern: merge LRMS info into cfg and get the agent started

        if agent_name == "agent_0":

            # only the master agent creates LRMS and sub-agent config files.
            # The LRMS which will give us the set of agent_nodes to use for
            # sub-agent startup.  Add the remaining LRMS information to the
            # config, for the benefit of the scheduler).
            lrms = rp.agent.RM.create(name=cfg["lrms"], cfg=cfg, logger=log)
            cfg["lrms_info"] = lrms.lrms_info

            # the master agent also is the only one which starts bridges.  It
            # has to do so before creating the Agent Worker instance, as that is
            # using the bridges already.
            bridges = start_bridges(cfg, log)

            # FIXME: make sure all communication channels are in place.  This
            #        could be replaced with a proper barrier, but not sure if
            #        that is worth it...
            time.sleep(1)

            # after we started bridges, we'll add their in and out addresses
            # to the config, so that the communication channels can connect to
            # them.  At this point we also write configs for all sub-agents this
            # instance intends to spawn.
            #
            # FIXME: we should point the address to the node of the subagent
            #        which hosts the bridge, not the local IP.  Until this
            #        is fixed, bridges MUST run on agent_0 (which is what
            #        RM.hostip() below will point to).
            nodeip = rp.agent.RM.hostip(cfg.get("network_interface"), logger=log)
            write_sub_configs(cfg, bridges, nodeip, log)

            # Store some runtime information into the session
            mongo_p.update(
                {"_id": pilot_id},
                {"$set": {"lm_info":   lrms.lm_info.get("version_info"),
                          "lm_detail": lrms.lm_info.get("lm_detail")}},
            )

        # we now have correct bridge addresses added to the agent_0.cfg, and all
        # other agents will have picked that up from their config files -- we
        # can start the agent and all its components!
        agent = rp.worker.Agent(cfg)
        agent.start()

        log.debug("waiting for agent %s to join" % agent_name)
        agent.join()
        log.debug("agent %s joined" % agent_name)

        # ----------------------------------------------------------------------

    except SystemExit:
        log.exception("Exit running agent: %s" % agent_name)
        if agent and not agent.final_cause:
            agent.final_cause = "sys.exit"

    except Exception:
        log.exception("Error running agent: %s" % agent_name)
        if agent and not agent.final_cause:
            agent.final_cause = "error"

    finally:

        # in all cases, make sure we perform an orderly shutdown.  I hope python
        # does not mind doing all those things in a finally clause of
        # (essentially) main...
        if agent:
            agent.stop()
        log.debug("agent %s finalized" % agent_name)

        # agent.stop will not tear down bridges -- we do that here at last.
        # `bridges` is only assigned by agent_0 above, so guard against an
        # unset (falsy) global like exit_handler does.
        if bridges:
            for b in bridges.values():
                try:
                    log.info("closing bridge %s", b)
                    b["handle"].stop()
                except Exception as e:
                    log.exception("ignore failing bridge terminate (%s)", e)
        bridges = dict()

        # make sure the lrms release whatever it acquired
        if lrms:
            lrms.stop()
            lrms = None

        # agent_0 will also report final pilot state to the DB
        if agent_name == "agent_0":
            if agent and agent.final_cause == "timeout":
                pilot_DONE(mongo_p, pilot_id, log, "TIMEOUT received. Terminating.")
            elif agent and agent.final_cause == "cancel":
                pilot_CANCELED(mongo_p, pilot_id, log, "CANCEL received. Terminating.")
            elif agent and agent.final_cause == "sys.exit":
                pilot_CANCELED(mongo_p, pilot_id, log, "EXIT received. Terminating.")
            elif agent and agent.final_cause == "finalize":
                log.info("shutdown due to component finalization -- assuming error")
                pilot_FAILED(mongo_p, pilot_id, log, "FINALIZE received")
            elif agent:
                pilot_FAILED(mongo_p, pilot_id, log, "TERMINATE received")
            else:
                pilot_FAILED(mongo_p, pilot_id, log, "FAILED startup")

        log.info("stop")
        prof.prof("stop", msg="finally clause agent", uid=pilot_id)
        prof.close()
def fetch_profiles (sid, dburl=None, src=None, tgt=None, access=None,
        session=None, skip_existing=False, fetch_client=False, log=None):
    '''
    Fetch the profiles of a session into a local directory.

    sid          : session for which all profiles are fetched
    dburl        : MongoDB URL to read the session documents from; falls back
                   to the environment variable RADICAL_PILOT_DBURL
    src          : dir to look for client session profiles ($src/$sid/*.prof),
                   defaults to the current working directory
    tgt          : dir to store the profile in
                   - $tgt/$sid/*.prof,
                   - $tgt/$sid/$pilot_id/*.prof)
                   defaults to the current working directory; relative paths
                   are resolved against the current working directory
    access       : optional URL whose schema and host replace those of the
                   pilot sandbox URLs
    session      : handed to all filesystem objects created here; also
                   provides the reporter, and the logger if `log` is not given
    skip_existing: do not fetch files which already exist locally with a
                   non-zero size
    fetch_client : also fetch the client side profiles from $src/$sid/
    log          : logger to use

    returns list of file names

    raises ValueError if no database URL is available, and RuntimeError if
    `fetch_client` is set but client profiles are missing.  Failures for
    individual pilots are logged and reported, but not raised.
    '''

    # Resolve the database URL before doing anything else.  Use `get()` so
    # that an unset variable results in the intended ValueError below
    # instead of a bare KeyError.
    if not dburl:
        dburl = os.environ.get('RADICAL_PILOT_DBURL')

    if not dburl:
        raise ValueError('RADICAL_PILOT_DBURL is not set')

    # Logger and reporter are resolved independently of each other: the
    # reporter is needed below even if the caller passed its own `log`.
    if session:
        if not log:
            log = session._log
        rep = session._rep
    else:
        if not log:
            log = ru.Logger('radical.pilot.utils')
        rep = ru.Reporter('radical.pilot.utils')

    ret = list()

    if not src:
        src = os.getcwd()

    if not tgt:
        tgt = os.getcwd()

    if not tgt.startswith('/') and '://' not in tgt:
        tgt = "%s/%s" % (os.getcwd(), tgt)

    # we always create a session dir as real target
    tgt_url = rs.Url("%s/%s/" % (tgt, sid))

    # Turn URLs without schema://host into file://localhost,
    # so that they dont become interpreted as relative.
    if not tgt_url.schema:
        tgt_url.schema = 'file'
    if not tgt_url.host:
        tgt_url.host = 'localhost'

    # first fetch session profile
    if fetch_client:
        client_profiles = glob.glob("%s/%s/*.prof" % (src, sid))
        if not client_profiles:
            raise RuntimeError('no client profiles in %s/%s' % (src, sid))

        for client_profile in client_profiles:

            ftgt = rs.Url('%s/%s' % (tgt_url, os.path.basename(client_profile)))
            ret.append("%s" % ftgt.path)

            already_fetched = skip_existing             \
                          and os.path.isfile(ftgt.path) \
                          and os.stat(ftgt.path).st_size > 0

            if not already_fetched:
                prof_file = rs.fs.File(client_profile, session=session)
                prof_file.copy(ftgt, flags=rs.fs.CREATE_PARENTS)
                prof_file.close()

            if not os.path.isfile(client_profile):
                raise RuntimeError('client profilefile %s does not exist'
                                   % client_profile)

    _, db, _, _, _ = ru.mongodb_connect (dburl)

    json_docs = get_session_docs(db, sid)

    pilots = json_docs['pilot']
    num_pilots = len(pilots)
    log.debug("Session: %s", sid)
    log.debug("Number of pilots in session: %d", num_pilots)

    for pilot in pilots:

        # a failure for one pilot must not prevent fetching for the others
        try:
            log.debug("processing pilot '%s'", pilot['uid'])

            sandbox_url = rs.Url(pilot['pilot_sandbox'])

            if access:
                # Allow to use a different access schema than used for the
                # run.  Useful if you ran from the headnode, but would like to
                # retrieve the profiles to your desktop (Hello Titan).
                access_url = rs.Url(access)
                sandbox_url.schema = access_url.schema
                sandbox_url.host = access_url.host

            sandbox = rs.fs.Directory (sandbox_url, session=session)

            # all profiles of this pilot end up in this local directory
            pilot_dir = "%s/%s" % (tgt_url.path, pilot['uid'])

            # Try to fetch a tarball of profiles, so that we can get them all
            # in one (SAGA) go!
            PROFILES_TARBALL = '%s.prof.tgz' % pilot['uid']
            tarball_available = False
            try:
                if sandbox.is_file(PROFILES_TARBALL) and \
                   sandbox.get_size(PROFILES_TARBALL):

                    log.info("profiles tarball exists")
                    ftgt = rs.Url('%s/%s' % (tgt_url, PROFILES_TARBALL))

                    if skip_existing and os.path.isfile(ftgt.path) \
                            and os.stat(ftgt.path).st_size > 0:
                        log.info("skip fetching of '%s/%s' to '%s'.",
                                 sandbox_url, PROFILES_TARBALL, tgt_url)
                        tarball_available = True

                    else:
                        log.info("fetch '%s%s' to '%s'.",
                                 sandbox_url, PROFILES_TARBALL, tgt_url)
                        prof_file = rs.fs.File(
                                "%s%s" % (sandbox_url, PROFILES_TARBALL),
                                session=session)
                        prof_file.copy(ftgt, flags=rs.fs.CREATE_PARENTS)
                        prof_file.close()
                        tarball_available = True

                else:
                    log.warn("profiles tarball doesnt exists!")

            except rs.DoesNotExist:
                log.exception("exception(TODO): profiles tarball doesnt exists!")

            # best effort: the directory may already exist
            try:
                os.mkdir(pilot_dir)
            except OSError:
                pass

            # We now have a local tarball
            if tarball_available:
                log.info("Extract tarball %s to '%s'.", ftgt.path, tgt_url.path)
                try:
                    tarball = tarfile.open(ftgt.path, mode='r:gz')
                    try:
                        # NOTE(review): extractall() trusts the member paths
                        #               stored in the tarball -- confirm that
                        #               the sandbox content can be trusted.
                        tarball.extractall(pilot_dir)
                    finally:
                        # always release the file handle
                        tarball.close()

                    profiles = glob.glob("%s/*.prof" % pilot_dir)
                    ret.extend(profiles)
                    os.unlink(ftgt.path)

                    # If extract succeeded, no need to fetch individual
                    # profiles
                    rep.ok("+ %s (profiles)\n" % pilot['uid'])
                    continue

                except Exception as e:
                    # fall through to fetching the individual profiles
                    log.warn('could not extract tarball %s [%s]', ftgt.path, e)

            # If we dont have a tarball (for whichever reason), fetch
            # individual profiles
            profiles = sandbox.list('*.prof')
            for prof in profiles:

                ftgt = rs.Url('%s/%s/%s' % (tgt_url, pilot['uid'], prof))
                ret.append("%s" % ftgt.path)

                already_fetched = skip_existing             \
                              and os.path.isfile(ftgt.path) \
                              and os.stat(ftgt.path).st_size > 0

                if not already_fetched:
                    prof_file = rs.fs.File("%s%s" % (sandbox_url, prof),
                                           session=session)
                    prof_file.copy(ftgt, flags=rs.fs.CREATE_PARENTS)
                    prof_file.close()

            rep.ok("+ %s (profiles)\n" % pilot['uid'])

        except Exception:
            rep.error("- %s (profiles)\n" % pilot['uid'])
            log.exception('failed to fet profile for %s', pilot['uid'])

    return ret
def __init__(self, sid, dburl, cfg, log, connect=True):
    """
    Creates a new session

    A session is a MongoDB collection which contains documents of
    different types:

    session : document describing this rp.Session (singleton)
    pmgr    : document describing a rp.PilotManager
    pilots  : document describing a rp.Pilot
    umgr    : document describing a rp.UnitManager
    units   : document describing a rp.Unit

    sid     : session ID, used as name of the collection and as ID of the
              session document
    dburl   : URL of the MongoDB instance (converted via `str()`)
    cfg     : session config, a deep copy is stored in the session document
    log     : logger, stored on the instance
    connect : if False, only initialize the instance state and do not
              touch the database

    Raises RuntimeError if no database connection could be established,
    and ValueError if the collection is not empty but holds no session
    document for `sid`.
    """

    self._dburl      = dburl
    self._log        = log
    self._mongo      = None
    self._db         = None
    self._created    = time.time()
    self._connected  = None
    self._closed     = None
    self._c          = None
    self._can_remove = False

    # initialize this flag like `_can_remove`, so that it also exists on
    # instances which never connected
    self._can_delete = False

    if not connect:
        return

    # mongodb_connect wants a string at the moment
    self._mongo, self._db, _, _, _ = ru.mongodb_connect(str(dburl))

    if not self._mongo or not self._db:
        raise RuntimeError('Could not connect to database at %s' % dburl)

    self._connected = time.time()

    self._c = self._db[sid]  # creates collection (lazily)

    # If session exists, we assume this is a reconnect, otherwise we create
    # the session entry.
    # NOTE: hell will break loose if session IDs are not unique!
    if not self._c.count():

        # make 'uid', 'type' and 'state' indexes, as we frequently query
        # based on combinations of those.  Only 'uid' is unique
        pma = pymongo.ASCENDING
        self._c.create_index([('uid',   pma)], unique=True,  sparse=False)
        self._c.create_index([('type',  pma)], unique=False, sparse=False)
        self._c.create_index([('state', pma)], unique=False, sparse=False)

        # insert the session doc
        self._can_delete = True
        self._c.insert({
            'type'      : 'session',
            '_id'       : sid,
            'uid'       : sid,
            'cfg'       : copy.deepcopy(cfg),
            'created'   : self._created,
            'connected' : self._connected
        })
        self._can_remove = True

    else:
        docs = self._c.find({'type': 'session', 'uid': sid})
        if not docs.count():
            raise ValueError('cannot reconnect to session %s' % sid)

        # reconnect: pick up the original creation time of the session
        doc = docs[0]
        self._can_delete = False
        self._created    = doc['created']
        self._connected  = time.time()
def __init__(self, sid, dburl, cfg, logger, connect=True):
    """
    Creates a new session

    A session is a MongoDB collection which contains documents of
    different types:

    session : document describing this rp.Session (singleton)
    pmgr    : document describing a rp.PilotManager
    pilots  : document describing a rp.Pilot
    umgr    : document describing a rp.UnitManager
    units   : document describing a rp.Unit

    sid     : session ID, used as name of the collection and as ID of the
              session document
    dburl   : URL of the MongoDB instance (converted via `str()`)
    cfg     : session config, a deep copy is stored in the session document
    logger  : logger, stored on the instance
    connect : if False, only initialize the instance state and do not
              touch the database

    Raises RuntimeError if no database connection could be established,
    and ValueError if the collection is not empty but holds no session
    document for `sid`.
    """

    self._dburl      = dburl
    self._log        = logger
    self._mongo      = None
    self._db         = None
    self._created    = time.time()
    self._connected  = None
    self._closed     = None
    self._c          = None
    self._can_remove = False

    # `_can_delete` used to be set on the connected code path only --
    # give it a default so that unconnected instances have it, too
    self._can_delete = False

    if not connect:
        return

    # mongodb_connect wants a string at the moment
    self._mongo, self._db, _, _, _ = ru.mongodb_connect(str(dburl))

    if not self._mongo or not self._db:
        raise RuntimeError('Could not connect to database at %s' % dburl)

    self._connected = time.time()

    self._c = self._db[sid]  # creates collection (lazily)

    # If session exists, we assume this is a reconnect, otherwise we create
    # the session entry.
    # NOTE: hell will break loose if session IDs are not unique!
    if not self._c.count():

        # make 'uid', 'type' and 'state' indexes, as we frequently query
        # based on combinations of those.  Only 'uid' is unique
        self._c.create_index([('uid',   pymongo.ASCENDING)],
                             unique=True,  sparse=False)
        self._c.create_index([('type',  pymongo.ASCENDING)],
                             unique=False, sparse=False)
        self._c.create_index([('state', pymongo.ASCENDING)],
                             unique=False, sparse=False)

        # insert the session doc
        self._can_delete = True
        self._c.insert({'type'      : 'session',
                        '_id'       : sid,
                        'uid'       : sid,
                        'cfg'       : copy.deepcopy(cfg),
                        'created'   : self._created,
                        'connected' : self._connected})
        self._can_remove = True

    else:
        docs = self._c.find({'type' : 'session',
                             'uid'  : sid})
        if not docs.count():
            raise ValueError('cannot reconnect to session %s' % sid)

        # reconnect: pick up the original creation time of the session
        doc = docs[0]
        self._can_delete = False
        self._created    = doc['created']
        self._connected  = time.time()
plt.savefig(ofile) # ----------------------------------------------------------------------------- # if __name__ == '__main__': session = None q_pilots = None timing = None timings = {} #pdir = os.environ['PLOT_DIR'] cachedir = os.getcwd() dburl = 'mongodb://*****:*****@ds053838.mongolab.com:53838/hicomb' mongo, db, dbname, cname, pname = ru.mongodb_connect(str(dburl)) if len(sys.argv) <= 1: usage("insufficient arguments -- need session ID") if len(sys.argv) > 4: usage("too many arguments -- no more than 3") if len(sys.argv[1]) < 20: usage("illegal session token -- valid e.g. 54b1c5d523769c2f1b55dffd") else: session = sys.argv[1] if len(sys.argv) > 2: timing = sys.argv[2]
def fetch_profiles(sid, dburl=None, client=None, tgt=None, access=None,
                   session=None, skip_existing=False):
    '''
    Fetch the client and pilot profiles of a session into a local directory.

    sid          : session for which all profiles are fetched
    dburl        : MongoDB URL to read the session documents from; falls back
                   to the environment variable RADICAL_PILOT_DBURL
    client       : dir to look for client session profiles, defaults to the
                   current working directory
    tgt          : dir to store the profile in, defaults to the current
                   working directory; relative paths are resolved against
                   the current working directory
    access       : optional URL whose schema and host replace those of the
                   pilot sandbox URLs
    session      : handed to all saga filesystem objects created here
    skip_existing: do not fetch files which already exist locally with a
                   non-zero size

    returns list of file names

    raises RuntimeError if no database URL is available
    '''

    ret = list()

    # Use `get()` so that an unset variable results in the intended
    # RuntimeError below instead of a bare KeyError.
    if not dburl:
        dburl = os.environ.get('RADICAL_PILOT_DBURL')

    if not dburl:
        raise RuntimeError('Please set RADICAL_PILOT_DBURL')

    if not client:
        client = os.getcwd()

    if not tgt:
        tgt = os.getcwd()

    if not tgt.startswith('/') and '://' not in tgt:
        tgt = "%s/%s" % (os.getcwd(), tgt)

    # we always create a session dir as real target
    tgt_url = saga.Url("%s/%s/" % (tgt, sid))

    # Turn URLs without schema://host into file://localhost,
    # so that they dont become interpreted as relative.
    if not tgt_url.schema:
        tgt_url.schema = 'file'
    if not tgt_url.host:
        tgt_url.host = 'localhost'

    # first fetch session profile
    # FIXME: should we record pwd or profile location in db session?  Or create
    #        a sandbox like dir for storing profiles and logs?
    # NOTE(review): `logger` is not defined in this function -- presumably
    #               a module level object; confirm.
    client_profile = "%s/%s.prof" % (client, sid)

    ftgt = saga.Url('%s/%s' % (tgt_url, os.path.basename(client_profile)))
    ret.append("%s" % ftgt.path)

    if skip_existing and os.path.isfile(ftgt.path) \
            and os.stat(ftgt.path).st_size > 0:
        logger.report.info("\t- %s\n" % client_profile.split('/')[-1])

    else:
        logger.report.info("\t+ %s\n" % client_profile.split('/')[-1])
        prof_file = saga.filesystem.File(client_profile, session=session)
        prof_file.copy(ftgt, flags=saga.filesystem.CREATE_PARENTS)
        prof_file.close()

    _, db, _, _, _ = ru.mongodb_connect(dburl)

    json_docs = get_session_docs(db, sid)

    pilots = json_docs['pilot']

    for pilot in pilots:

        sandbox_url = saga.Url(pilot['sandbox'])

        if access:
            # Allow to use a different access scheme than used for the run.
            # Useful if you ran from the headnode, but would like to retrieve
            # the profiles to your desktop (Hello Titan).
            access_url = saga.Url(access)
            sandbox_url.schema = access_url.schema
            sandbox_url.host = access_url.host

        sandbox = saga.filesystem.Directory(sandbox_url, session=session)

        # all profiles of this pilot end up in this local directory
        pilot_dir = "%s/%s" % (tgt_url.path, pilot['_id'])

        # Try to fetch a tarball of profiles, so that we can get them all in
        # one (SAGA) go!
        PROFILES_TARBALL = '%s.prof.tgz' % pilot['_id']
        tarball_available = False
        try:
            if sandbox.is_file(PROFILES_TARBALL):
                print("Profiles tarball exists!")

                ftgt = saga.Url('%s/%s' % (tgt_url, PROFILES_TARBALL))

                if skip_existing and os.path.isfile(ftgt.path) \
                        and os.stat(ftgt.path).st_size > 0:
                    print("Skipping fetching of '%s/%s' to '%s'." % (
                          sandbox_url, PROFILES_TARBALL, tgt_url))
                    tarball_available = True

                else:
                    print("Fetching '%s%s' to '%s'." % (
                          sandbox_url, PROFILES_TARBALL, tgt_url))
                    prof_file = saga.filesystem.File(
                        "%s%s" % (sandbox_url, PROFILES_TARBALL),
                        session=session)
                    prof_file.copy(ftgt, flags=saga.filesystem.CREATE_PARENTS)
                    prof_file.close()
                    tarball_available = True

            else:
                print("Profiles tarball doesnt exists!")

        except saga.DoesNotExist:
            print("exception(TODO): profiles tarball doesnt exists!")

        # best effort: the directory may already exist
        try:
            os.mkdir(pilot_dir)
        except OSError:
            pass

        # We now have a local tarball
        if tarball_available:
            print("Extracting tarball %s into '%s'." % (ftgt.path, tgt_url.path))
            tarball = tarfile.open(ftgt.path)
            try:
                # NOTE(review): extractall() trusts the member paths stored
                #               in the tarball -- confirm that the sandbox
                #               content can be trusted.
                tarball.extractall(pilot_dir)
            finally:
                # always release the file handle
                tarball.close()

            # the tarball was extracted into the pilot directory, so that is
            # where the profiles need to be collected from
            profiles = glob.glob("%s/*.prof" % pilot_dir)
            print("Tarball %s extracted to '%s/%s/'." % (
                  ftgt.path, tgt_url.path, pilot['_id']))
            ret.extend(profiles)

            # If extract succeeded, no need to fetch individual profiles
            continue

        # If we dont have a tarball (for whichever reason), fetch individual
        # profiles
        profiles = sandbox.list('*.prof')

        for prof in profiles:

            ftgt = saga.Url('%s/%s/%s' % (tgt_url, pilot['_id'], prof))
            ret.append("%s" % ftgt.path)

            if skip_existing and os.path.isfile(ftgt.path) \
                    and os.stat(ftgt.path).st_size > 0:
                logger.report.info("\t- %s\n" % str(prof).split('/')[-1])
                continue

            logger.report.info("\t+ %s\n" % str(prof).split('/')[-1])
            prof_file = saga.filesystem.File("%s%s" % (sandbox_url, prof),
                                             session=session)
            prof_file.copy(ftgt, flags=saga.filesystem.CREATE_PARENTS)
            prof_file.close()

    return ret