# ------------------------------------------------------------------------------
#
# set up the connection to EC2
#
# All EC2 access information is passed in via the environment.  Complain
# early (via usage()) about every missing variable instead of failing later
# with an obscure KeyError.  The loop replaces five copy-pasted checks.
for _env_var in ['EC2_URL', 'EC2_ACCESS_KEY', 'EC2_SECRET_KEY',
                 'EC2_KEYPAIR_ID', 'EC2_KEYPAIR']:
    if _env_var not in os.environ:
        usage("no %s in environment" % _env_var)

server = saga.Url(os.environ['EC2_URL'])

# in order to connect to EC2, we need an EC2 ID and KEY
c1 = saga.Context('ec2')
c1.user_id = os.environ['EC2_ACCESS_KEY']
c1.user_key = os.environ['EC2_SECRET_KEY']
c1.server = server

# in order to access a created VM, we additionally need to point to the ssh
# key which is used for EC2 VM contextualization, i.e. as EC2 'keypair'.
# If the keypair is not yet registered on EC2, it will be registered by SAGA
# -- but then a user_key *must* be specified (only the public key is ever
# transferred to EC2).
c2 = saga.Context('ec2_keypair')
c2.token = os.environ['EC2_KEYPAIR_ID']
c2.user_cert = os.environ['EC2_KEYPAIR']
def _handle_pilot_input_staging(self, pilot, sds):
    """Execute the given pilot-level input staging directives.

    Only TRANSFER directives are supported on pilot level -- COPY, LINK and
    MOVE raise a ValueError.  On completion, each directive is marked DONE
    and the result set is published on the control pubsub.
    """

    pid = pilot['uid']

    # NOTE: no unit sandboxes defined!
    src_context = {'pwd'     : pilot['client_sandbox'],
                   'pilot'   : pilot['pilot_sandbox'],
                   'resource': pilot['resource_sandbox']}
    tgt_context = {'pwd'     : pilot['pilot_sandbox'],
                   'pilot'   : pilot['pilot_sandbox'],
                   'resource': pilot['resource_sandbox']}

    # process the staging directives one by one
    for sd in sds:

        # TODO: respect flags in directive
        action = sd['action']
        flags  = sd['flags']
        did    = sd['uid']
        src    = sd['source']
        tgt    = sd['target']

        assert (action in [COPY, LINK, MOVE, TRANSFER])

        self._prof.prof('staging_in_start', uid=pid, msg=did)

        # expand the relative URLs against the known sandbox locations
        src = complete_url(src, src_context, self._log)
        tgt = complete_url(tgt, tgt_context, self._log)

        # pilot level staging can only *transfer* -- reject anything else
        if action in [COPY, LINK, MOVE]:
            self._prof.prof('staging_in_fail', uid=pid, msg=did)
            raise ValueError("invalid action '%s' on pilot level" % action)

        self._log.info('transfer %s to %s', src, tgt)

        # FIXME: make sure that tgt URL points to the right resource
        # FIXME: honor sd flags if given (recursive...)
        flags = rsfs.CREATE_PARENTS
        if os.path.isdir(src.path):
            flags |= rsfs.RECURSIVE

        # cache one rs.filesystem.Directory handle per filesystem endpoint
        # (the sandbox URL with the path component stripped)
        fs_url      = rs.Url(pilot['pilot_sandbox'])
        fs_url.path = '/'
        cache_key   = str(fs_url)

        self._log.debug("rs.file.Directory ('%s')", cache_key)

        with self._cache_lock:
            fs_dir = self._saga_fs_cache.get(cache_key)
            if fs_dir is None:
                fs_dir = rsfs.Directory(cache_key, session=self._session)
                self._saga_fs_cache[cache_key] = fs_dir

        fs_dir.copy(src, tgt, flags=flags)

        sd['pmgr_state'] = rps.DONE

        self._prof.prof('staging_in_stop', uid=pid, msg=did)

    # report the staging result back to the pilot manager
    self.publish(rpc.CONTROL_PUBSUB, {'cmd': 'pilot_staging_input_result',
                                      'arg': {'pilot': pilot,
                                              'sds'  : sds}})
def run(self):
    """Main loop of the input-file-transfer worker process.

    Polls MongoDB for ComputeUnits in PENDING_INPUT_STAGING, atomically
    claims one (state -> STAGING_INPUT), executes its FTW input staging
    directives via SAGA, and pushes the unit on to the agent
    (AGENT_STAGING_INPUT_PENDING).  Runs until the stop/terminate events
    are set; a SystemExit from anywhere in the loop forces application
    shutdown via thread.interrupt_main().
    """

    # make sure to catch sys.exit (which raises SystemExit)
    try:
        logger.info("Starting InputFileTransferWorker")

        # Try to connect to the database and create a tailable cursor.
        try:
            db = self._session.get_db()
            um_col = db["%s.cu" % self._session.uid]
            logger.debug("Connected to MongoDB. Serving requests for UnitManager %s." % self.unit_manager_id)
        except Exception as e:
            logger.exception("Connection error: %s" % e)
            raise

        while not self._stop.is_set() and \
              not self._session._terminate.is_set():

            # See if we can find a ComputeUnit that is waiting for
            # input file transfer.  find_and_modify claims the unit
            # atomically, so concurrent workers never pick up the same CU.
            ts = timestamp()
            compute_unit = um_col.find_and_modify(
                query={"unitmanager": self.unit_manager_id,
                       "state": PENDING_INPUT_STAGING,
                       },
                update={"$set": {"state": STAGING_INPUT},
                        "$push": {"statehistory": {"state": STAGING_INPUT,
                                                   "timestamp": ts}}})

            if compute_unit is None:
                # Sleep a bit if no new units are available.
                time.sleep(IDLE_TIME)
            else:
                compute_unit_id = None
                state = STAGING_INPUT
                try:
                    log_messages = []

                    # We have found a new CU. Now we can process the transfer
                    # directive(s) with SAGA.
                    compute_unit_id = str(compute_unit["_id"])
                    logger.debug("InputStagingController: unit found: %s" % compute_unit_id)
                    self._session.prof.prof('advance', uid=compute_unit_id,
                                            msg=state, state=state)
                    remote_sandbox = compute_unit["sandbox"]
                    input_staging = compute_unit.get("FTW_Input_Directives", [])

                    # if we do staging, create the CU's directory in case
                    # it doesn't exist yet.
                    if input_staging:
                        log_msg = "InputStagingController: Creating ComputeUnit sandbox directory %s." % remote_sandbox
                        log_messages.append(log_msg)
                        logger.info(log_msg)

                        # Creating/initialising the sandbox directory.
                        try:
                            logger.debug("saga.fs.Directory ('%s')" % remote_sandbox)

                            # url used for saga
                            remote_sandbox_url = saga.Url(remote_sandbox)

                            # keyurl and key used for cache: one Directory
                            # handle is kept per filesystem endpoint (URL
                            # with the path stripped)
                            remote_sandbox_keyurl = saga.Url(remote_sandbox)
                            remote_sandbox_keyurl.path = '/'
                            remote_sandbox_key = str(remote_sandbox_keyurl)

                            if remote_sandbox_key not in self._saga_dirs:
                                self._saga_dirs[remote_sandbox_key] = \
                                    saga.filesystem.Directory(remote_sandbox_url,
                                        flags=saga.filesystem.CREATE_PARENTS,
                                        session=self._session)
                            saga_dir = self._saga_dirs[remote_sandbox_key]
                        except Exception as e:
                            logger.exception('Error: %s' % e)
                            raise

                    logger.info("InputStagingController: Processing input file transfers for ComputeUnit %s" % compute_unit_id)

                    # Loop over all transfer directives and execute them.
                    for sd in input_staging:
                        logger.debug("InputStagingController: sd: %s : %s" % (compute_unit_id, sd))

                        # Check if there was a cancel request for this CU
                        state_doc = um_col.find_one({"_id": compute_unit_id},
                                                    fields=["state"])
                        if state_doc['state'] == CANCELED:
                            self._session.prof.prof('advance', uid=compute_unit_id,
                                                    msg=CANCELED, state=CANCELED)
                            logger.info("Compute Unit Canceled, interrupting input file transfers.")
                            state = CANCELED
                            # Break out of the loop for this CU's SD's
                            break

                        abs_src = os.path.abspath(sd['source'])
                        input_file_url = saga.Url("file://localhost%s" % abs_src)
                        if not sd['target']:
                            # no target given: keep the source's basename
                            target = '%s/%s' % (remote_sandbox, os.path.basename(abs_src))
                        else:
                            target = "%s/%s" % (remote_sandbox, sd['target'])

                        log_msg = "Transferring input file %s -> %s" % (input_file_url, target)
                        log_messages.append(log_msg)
                        logger.debug(log_msg)

                        # Execute the transfer.
                        if CREATE_PARENTS in sd['flags']:
                            copy_flags = saga.filesystem.CREATE_PARENTS
                        else:
                            copy_flags = 0
                        try:
                            saga_dir.copy(input_file_url, target, flags=copy_flags)
                        except Exception as e:
                            logger.exception(e)
                            raise Exception("copy failed(%s)" % e.message)

                    # If this CU was canceled we can skip the remainder of
                    # this loop, to process more CUs.
                    if state == CANCELED:
                        continue

                    # All IFTW staging done for this CU. Push it out, by
                    # setting the state as AGENT_STAGING_INPUT_PENDING and
                    # sending it to mongodb. We mark the CU under 'umgr'
                    # control -- once the agent picks it up, it will be
                    # marked as under 'agent' control, before the
                    # agent_staging_output_component passes control back in
                    # a similar manner.
                    um_col.update({'_id': compute_unit_id},
                                  {'$set': {'state': AGENT_STAGING_INPUT_PENDING,
                                            'control': 'umgr'},
                                   '$push': {'statehistory': {'state': AGENT_STAGING_INPUT_PENDING,
                                                              'timestamp': ts},
                                             'log': {'timestamp': timestamp(),
                                                     'message': 'push unit to agent after ftw staging'}}})
                    logger.debug("InputStagingController: %s : push to agent" % compute_unit_id)
                    self._session.prof.prof('advance', uid=compute_unit_id,
                                            msg=AGENT_STAGING_INPUT_PENDING,
                                            state=AGENT_STAGING_INPUT_PENDING)

                except Exception as e:
                    # Update the CU's state to 'FAILED'.
                    ts = timestamp()
                    logentry = {'message': "Input transfer failed: %s" % e,
                                'timestamp': ts}
                    um_col.update({'_id': compute_unit_id},
                                  {'$set': {'state': FAILED},
                                   '$push': {'statehistory': {'state': FAILED,
                                                              'timestamp': ts},
                                             'log': logentry}})
                    self._session.prof.prof('advance', uid=compute_unit_id,
                                            msg=FAILED, state=FAILED)
                    logger.exception(str(logentry))
                    raise

    except SystemExit as e:
        logger.debug("input file transfer thread caught system exit -- forcing application shutdown")
        thread.interrupt_main()
def initialize(self, url, session=None, prompt=None, logger=None,
               posix=True, interactive=True):
    """Create or reuse a master PTY shell connection for the given URL.

    Masters are registered per (host, user, shell-type); an existing,
    still-alive master is reused, a dead one raises IncorrectState.
    Returns the registry 'info' dict describing the (possibly new) master.
    """

    with self.rlock:

        # make sure we have a valid url type
        url = saga.Url(url)

        if not prompt:
            # raw string: the pattern contains '\$', '\]' and '\s', which
            # are not valid Python string escapes
            prompt = r"^(.*[\$#%>\]])\s*$"

        if not logger:
            logger = self.logger

        # collect all information we have/need about the requested master
        # connection
        info = self._create_master_entry(url, session, prompt, logger,
                                         posix, interactive)

        # we got master info - register the master, and create the instance!
        type_s = str(info['shell_type'])
        user_s = str(info['user'])
        host_s = str(info['host_str'])

        # make sure the nested registry levels exist
        self.registry.setdefault(host_s, {})
        self.registry[host_s].setdefault(user_s, {})

        if type_s not in self.registry[host_s][user_s]:

            # new master: create an instance, and register it
            m_cmd = info['scripts'][info['shell_type']]['master'] % info

            logger.debug ("open master pty for [%s] [%s] %s: %s'" \
                          % (type_s, host_s, user_s, m_cmd))

            info['pty'] = supp.PTYProcess(m_cmd, logger=logger)
            if not info['pty'].alive():
                raise se.NoSuccess._log (logger, \
                      "Shell not connected to %s" % info['host_str'])

            # authorization, prompt setup, etc. Initialize as shell if not
            # explicitly marked as non-posix shell
            self._initialize_pty(info['pty'], info)

            # master was created - register it
            self.registry[host_s][user_s][type_s] = info

        else:
            # we already have a master: make sure it is alive, and restart
            # as needed
            info = self.registry[host_s][user_s][type_s]
            if not info['pty'].alive(recover=True):
                raise se.IncorrectState._log (logger, \
                      "Lost shell connection to %s" % info['host_str'])

        return info
def fetch_profiles(sid, dburl=None, client=None, tgt=None, access=None, session=None, skip_existing=False): ''' sid: session for which all profiles are fetched client: dir to look for client session profiles tgt: dir to store the profile in returns list of file names ''' ret = list() if not dburl: dburl = os.environ['RADICAL_PILOT_DBURL'] if not dburl: raise RuntimeError('Please set RADICAL_PILOT_DBURL') if not client: client = os.getcwd() if not tgt: tgt = os.getcwd() if not tgt.startswith('/') and '://' not in tgt: tgt = "%s/%s" % (os.getcwd(), tgt) # we always create a session dir as real target tgt_url = saga.Url("%s/%s/" % (tgt, sid)) # Turn URLs without schema://host into file://localhost, # so that they dont become interpreted as relative. if not tgt_url.schema: tgt_url.schema = 'file' if not tgt_url.host: tgt_url.host = 'localhost' # first fetch session profile # FIXME: should we record pwd or profile location in db session? Or create # a sandbox like dir for storing profiles and logs? client_profile = "%s/%s.prof" % (client, sid) ftgt = saga.Url('%s/%s' % (tgt_url, os.path.basename(client_profile))) ret.append("%s" % ftgt.path) if skip_existing and os.path.isfile(ftgt.path) \ and os.stat(ftgt.path).st_size > 0: logger.report.info("\t- %s\n" % client_profile.split('/')[-1]) else: logger.report.info("\t+ %s\n" % client_profile.split('/')[-1]) prof_file = saga.filesystem.File(client_profile, session=session) prof_file.copy(ftgt, flags=saga.filesystem.CREATE_PARENTS) prof_file.close() _, db, _, _, _ = ru.mongodb_connect(dburl) json_docs = get_session_docs(db, sid) pilots = json_docs['pilot'] num_pilots = len(pilots) # print "Session: %s" % sid # print "Number of pilots in session: %d" % num_pilots for pilot in pilots: # print "Processing pilot '%s'" % pilot['_id'] sandbox_url = saga.Url(pilot['sandbox']) if access: # Allow to use a different access scheme than used for the the run. 
# Useful if you ran from the headnode, but would like to retrieve # the profiles to your desktop (Hello Titan). access_url = saga.Url(access) sandbox_url.schema = access_url.schema sandbox_url.host = access_url.host # print "Overriding remote sandbox: %s" % sandbox_url sandbox = saga.filesystem.Directory(sandbox_url, session=session) # Try to fetch a tarball of profiles, so that we can get them all in one (SAGA) go! PROFILES_TARBALL = '%s.prof.tgz' % pilot['_id'] tarball_available = False try: if sandbox.is_file(PROFILES_TARBALL): print "Profiles tarball exists!" ftgt = saga.Url('%s/%s' % (tgt_url, PROFILES_TARBALL)) if skip_existing and os.path.isfile(ftgt.path) \ and os.stat(ftgt.path).st_size > 0: print "Skipping fetching of '%s/%s' to '%s'." % ( sandbox_url, PROFILES_TARBALL, tgt_url) tarball_available = True else: print "Fetching '%s%s' to '%s'." % ( sandbox_url, PROFILES_TARBALL, tgt_url) prof_file = saga.filesystem.File( "%s%s" % (sandbox_url, PROFILES_TARBALL), session=session) prof_file.copy(ftgt, flags=saga.filesystem.CREATE_PARENTS) prof_file.close() tarball_available = True else: print "Profiles tarball doesnt exists!" except saga.DoesNotExist: print "exception(TODO): profiles tarball doesnt exists!" try: os.mkdir("%s/%s" % (tgt_url.path, pilot['_id'])) except OSError: pass # We now have a local tarball if tarball_available: print "Extracting tarball %s into '%s'." % (ftgt.path, tgt_url.path) tarball = tarfile.open(ftgt.path) tarball.extractall("%s/%s" % (tgt_url.path, pilot['_id'])) profiles = glob.glob("%s/*.prof" % tgt_url.path) print "Tarball %s extracted to '%s/%s/'." 
% ( ftgt.path, tgt_url.path, pilot['_id']) ret.extend(profiles) # If extract succeeded, no need to fetch individual profiles continue # If we dont have a tarball (for whichever reason), fetch individual profiles profiles = sandbox.list('*.prof') for prof in profiles: ftgt = saga.Url('%s/%s/%s' % (tgt_url, pilot['_id'], prof)) ret.append("%s" % ftgt.path) if skip_existing and os.path.isfile(ftgt.path) \ and os.stat(ftgt.path).st_size > 0: logger.report.info("\t- %s\n" % str(prof).split('/')[-1]) continue logger.report.info("\t+ %s\n" % str(prof).split('/')[-1]) prof_file = saga.filesystem.File("%s%s" % (sandbox_url, prof), session=session) prof_file.copy(ftgt, flags=saga.filesystem.CREATE_PARENTS) prof_file.close() return ret
def fetch_logfiles(sid, dburl=None, src=None, tgt=None, access=None,
                   session=None, skip_existing=False, fetch_client=False,
                   log=None):
    '''
    sid: session for which all logfiles are fetched
    src: dir to look for client session logfiles
    tgt: dir to store the logfile in

    returns list of file names
    '''

    # BUGFIX: previously 'rep' stayed unbound when a 'log' instance was
    # passed in (the 'if not log' chain skipped both branches), causing a
    # NameError at the first rep.ok()/rep.error() call.  'rep' is now
    # always initialized.
    if session:
        if not log:
            log = session._log
        rep = session._rep
    else:
        if not log:
            log = ru.Logger('radical.pilot.utils')
        rep = ru.Reporter('radical.pilot.utils')

    ret = list()

    if not dburl:
        dburl = os.environ['RADICAL_PILOT_DBURL']

    if not dburl:
        raise RuntimeError('Please set RADICAL_PILOT_DBURL')

    if not src:
        src = os.getcwd()

    if not tgt:
        tgt = os.getcwd()

    # make a relative target path absolute (unless it is a URL with schema)
    if not tgt.startswith('/') and '://' not in tgt:
        tgt = "%s/%s" % (os.getcwd(), tgt)

    # we always create a session dir as real target
    tgt_url = saga.Url("%s/%s/" % (tgt, sid))

    # Turn URLs without schema://host into file://localhost,
    # so that they dont become interpreted as relative.
    if not tgt_url.schema:
        tgt_url.schema = 'file'
    if not tgt_url.host:
        tgt_url.host = 'localhost'

    if fetch_client:
        # first fetch session logfile
        client_logfile = "%s/%s.log" % (src, sid)

        ftgt = saga.Url('%s/%s' % (tgt_url, os.path.basename(client_logfile)))
        ret.append("%s" % ftgt.path)

        if not (skip_existing and os.path.isfile(ftgt.path)
                and os.stat(ftgt.path).st_size > 0):
            log_file = saga.filesystem.File(client_logfile, session=session)
            log_file.copy(ftgt, flags=saga.filesystem.CREATE_PARENTS)
            log_file.close()

    _, db, _, _, _ = ru.mongodb_connect(dburl)

    json_docs = get_session_docs(db, sid)
    pilots = json_docs['pilot']

    log.info("Session: %s", sid)
    log.info("Number of pilots in session: %d", len(pilots))

    for pilot in pilots:

        try:
            sandbox_url = saga.Url(pilot['pilot_sandbox'])

            if access:
                # Allow to use a different access schema than used for the
                # run.  Useful if you ran from the headnode, but would like
                # to retrieve the logfiles to your desktop (Hello Titan).
                access_url = saga.Url(access)
                sandbox_url.schema = access_url.schema
                sandbox_url.host = access_url.host

            sandbox = saga.filesystem.Directory(sandbox_url, session=session)

            # Try to fetch a tarball of logfiles, so that we can get them
            # all in one (SAGA) go!
            LOGFILES_TARBALL = '%s.log.tgz' % pilot['uid']
            tarball_available = False
            try:
                if sandbox.is_file(LOGFILES_TARBALL) and \
                        sandbox.get_size(LOGFILES_TARBALL):

                    log.info("logfiles tarball exists")
                    ftgt = saga.Url('%s/%s' % (tgt_url, LOGFILES_TARBALL))

                    if skip_existing and os.path.isfile(ftgt.path) \
                            and os.stat(ftgt.path).st_size > 0:
                        log.info("Skip fetching of '%s/%s' to '%s'.",
                                 sandbox_url, LOGFILES_TARBALL, tgt_url)
                        tarball_available = True
                    else:
                        log.info("Fetching '%s%s' to '%s'.",
                                 sandbox_url, LOGFILES_TARBALL, tgt_url)
                        log_file = saga.filesystem.File(
                            "%s%s" % (sandbox_url, LOGFILES_TARBALL),
                            session=session)
                        log_file.copy(ftgt, flags=saga.filesystem.CREATE_PARENTS)
                        log_file.close()
                        tarball_available = True
                else:
                    log.warn("logfiles tarball doesnt exist")
            except saga.DoesNotExist:
                log.warn("logfiles tarball doesnt exist")

            try:
                os.mkdir("%s/%s" % (tgt_url.path, pilot['uid']))
            except OSError:
                pass

            # We now have a local tarball
            if tarball_available:
                log.debug("Extract tarball %s to %s", ftgt.path, tgt_url.path)

                try:
                    tarball = tarfile.open(ftgt.path)
                    tarball.extractall("%s/%s" % (tgt_url.path, pilot['uid']))

                    logfiles = glob.glob("%s/%s/*.log"
                                         % (tgt_url.path, pilot['uid']))
                    log.info("tarball %s extracted to '%s/%s/'.",
                             ftgt.path, tgt_url.path, pilot['uid'])
                    ret.extend(logfiles)
                    os.unlink(ftgt.path)
                except Exception as e:
                    log.warn('could not extract tarball %s [%s]',
                             ftgt.path, e)

                # If extract succeeded, no need to fetch individual logfiles
                rep.ok("+ %s (logfiles)\n" % pilot['uid'])
                continue

            # If we dont have a tarball (for whichever reason), fetch
            # individual logfiles
            logfiles = sandbox.list('*.log')
            for logfile in logfiles:
                ftgt = saga.Url('%s/%s/%s' % (tgt_url, pilot['uid'], logfile))
                ret.append("%s" % ftgt.path)

                if skip_existing and os.path.isfile(ftgt.path) \
                        and os.stat(ftgt.path).st_size > 0:
                    continue

                log_file = saga.filesystem.File("%s%s" % (sandbox_url, logfile),
                                                session=session)
                log_file.copy(ftgt, flags=saga.filesystem.CREATE_PARENTS)
                log_file.close()

            rep.ok("+ %s (logfiles)\n" % pilot['uid'])

        except Exception as e:
            # don't swallow the error silently -- record it before reporting
            log.debug('logfile fetch failed for %s: %s', pilot['uid'], e)
            rep.error("- %s (logfiles)\n" % pilot['uid'])

    return ret
def stage_in(self, directives): """Stages the content of the staging directive into the pilot's staging area""" # Wait until we can assume the pilot directory to be created if self.state == NEW: self.wait( state=[PENDING_LAUNCH, LAUNCHING, PENDING_ACTIVE, ACTIVE]) elif self.state in [DONE, FAILED, CANCELED]: raise Exception( "Pilot already finished, no need to stage anymore!") # Iterate over all directives for directive in expand_staging_directive(directives): # TODO: respect flags in directive src_url = saga.Url(directive['source']) action = directive['action'] # Convert the target url into a SAGA Url object tgt_url = saga.Url(directive['target']) # Create a pointer to the directory object that we will use tgt_dir_url = tgt_url if tgt_url.path.endswith('/'): # If the original target was a directory (ends with /), # we assume that the user wants the same filename as the source. tgt_filename = os.path.basename(src_url.path) else: # Otherwise, extract the filename and update the directory tgt_filename = os.path.basename(tgt_dir_url.path) tgt_dir_url.path = os.path.dirname(tgt_dir_url.path) # Handle special 'staging' scheme if tgt_dir_url.scheme == 'staging': # We expect a staging:///relative/path/file.txt URI, # as hostname would have unclear semantics currently. if tgt_dir_url.host: raise Exception( "hostname not supported with staging:// scheme") # Remove the leading slash to get a relative path from the staging area rel_path = os.path.relpath(tgt_dir_url.path, '/') # Now base the target directory relative of the sandbox and staging prefix tgt_dir_url = saga.Url( os.path.join(self.sandbox, STAGING_AREA, rel_path)) # Define and open the staging directory for the pilot # We use the target dir construct here, so that we can create # the directory if it does not yet exist. target_dir = saga.filesystem.Directory( tgt_dir_url, flags=saga.filesystem.CREATE_PARENTS) if action == LINK: # TODO: Does this make sense? 
#log_message = 'Linking %s to %s' % (source, abs_target) #os.symlink(source, abs_target) logger.error( "action 'LINK' not supported on pilot level staging") raise ValueError( "action 'LINK' not supported on pilot level staging") elif action == COPY: # TODO: Does this make sense? #log_message = 'Copying %s to %s' % (source, abs_target) #shutil.copyfile(source, abs_target) logger.error( "action 'COPY' not supported on pilot level staging") raise ValueError( "action 'COPY' not supported on pilot level staging") elif action == MOVE: # TODO: Does this make sense? #log_message = 'Moving %s to %s' % (source, abs_target) #shutil.move(source, abs_target) logger.error( "action 'MOVE' not supported on pilot level staging") raise ValueError( "action 'MOVE' not supported on pilot level staging") elif action == TRANSFER: log_message = 'Transferring %s to %s' % ( src_url, os.path.join(str(tgt_dir_url), tgt_filename)) logger.info(log_message) # Transfer the source file to the target staging area target_dir.copy(src_url, tgt_filename) else: raise Exception('Action %s not supported' % action)
def _handle_unit(self, unit, actionables):
    """Execute the output staging directives for one unit.

    Copies each actionable's source to its target (recursively for
    directories), then advances the unit to its target state.
    """

    uid = unit['uid']

    # sandbox context for expanding relative source / target URLs
    src_context = {'pwd'     : unit['unit_sandbox'],        # !!!
                   'unit'    : unit['unit_sandbox'],
                   'pilot'   : unit['pilot_sandbox'],
                   'resource': unit['resource_sandbox']}
    tgt_context = {'pwd'     : os.getcwd(),                 # !!!
                   'unit'    : unit['unit_sandbox'],
                   'pilot'   : unit['pilot_sandbox'],
                   'resource': unit['resource_sandbox']}

    # one rs.filesystem.Directory handle is cached per filesystem endpoint
    # (sandbox url w/o path)
    fs_url      = rs.Url(unit["unit_sandbox"])
    fs_url.path = '/'
    cache_key   = str(fs_url)

    if cache_key not in self._cache:
        self._cache[cache_key] = rs.filesystem.Directory(
            fs_url, session=self._session)
    saga_dir = self._cache[cache_key]

    # Loop over all transfer directives and execute them.
    for sd in actionables:

        action = sd['action']
        flags  = sd['flags']
        did    = sd['uid']
        src    = sd['source']
        tgt    = sd['target']

        self._prof.prof('staging_out_start', uid=uid, msg=did)

        self._log.debug('src: %s', src)
        self._log.debug('tgt: %s', tgt)

        src = rpsd.complete_url(src, src_context, self._log)
        tgt = rpsd.complete_url(tgt, tgt_context, self._log)

        self._log.debug('src: %s', src)
        self._log.debug('tgt: %s', tgt)

        # directories need the recursive flag (unless already set)
        if saga_dir.is_dir(src.path):
            flags |= rs.filesystem.RECURSIVE

        # Always set CREATE_PARENTS
        flags |= rs.filesystem.CREATE_PARENTS

        saga_dir.copy(src, tgt, flags=flags)

        self._prof.prof('staging_out_stop', uid=uid, msg=did)

    # all staging is done -- at this point the unit is final
    unit['state'] = unit['target_state']
    self.advance(unit, publish=True, push=True)
def run(self): """Starts the process when Process.start() is called. """ # make sure to catch sys.exit (which raises SystemExit) try: # Get directory where this module lives mod_dir = os.path.dirname(os.path.realpath(__file__)) # Try to connect to the database try: connection = self.db_connection_info.get_db_handle() db = connection[self.db_connection_info.dbname] pilot_col = db["%s.p" % self.db_connection_info.session_id] logger.debug( "Connected to MongoDB. Serving requests for PilotManager %s." % self.pilot_manager_id) except Exception as e: logger.exception("Connection error: %s" % e) return last_job_check = time.time() while not self._stop.is_set(): # Periodically, we pull up all ComputePilots that are pending # execution or were last seen executing and check if the corresponding # SAGA job is still pending in the queue. If that is not the case, # we assume that the job has failed for some reasons and update # the state of the ComputePilot accordingly. if last_job_check + JOB_CHECK_INTERVAL < time.time(): last_job_check = time.time() self.check_pilot_states(pilot_col) # See if we can find a ComputePilot that is waiting to be launched. # If we find one, we use SAGA to create a job service, a job # description and a job that is then send to the local or remote # queueing system. If this succedes, we set the ComputePilot's # state to pending, otherwise to failed. 
compute_pilot = None ts = datetime.datetime.utcnow() compute_pilot = pilot_col.find_and_modify( query={ "pilotmanager": self.pilot_manager_id, "state": PENDING_LAUNCH }, update={ "$set": { "state": LAUNCHING }, "$push": { "statehistory": { "state": LAUNCHING, "timestamp": ts } } }) if not compute_pilot: time.sleep(IDLE_TIMER) else: try: # ------------------------------------------------------ # # LAUNCH THE PILOT AGENT VIA SAGA # logentries = [] pilot_id = str(compute_pilot["_id"]) logger.info("Launching ComputePilot %s" % pilot_id) # ------------------------------------------------------ # Database connection parameters session_uid = self.db_connection_info.session_id database_url = self.db_connection_info.dburl database_name = self.db_connection_info.dbname database_auth = self.db_connection_info.dbauth # ------------------------------------------------------ # pilot description and resource configuration number_cores = compute_pilot['description']['cores'] runtime = compute_pilot['description']['runtime'] queue = compute_pilot['description']['queue'] project = compute_pilot['description']['project'] cleanup = compute_pilot['description']['cleanup'] resource_key = compute_pilot['description']['resource'] schema = compute_pilot['description']['access_schema'] memory = compute_pilot['description']['memory'] pilot_sandbox = compute_pilot['sandbox'] global_sandbox = compute_pilot['global_sandbox'] # we expand and exchange keys in the resource config, # depending on the selected schema so better use a deep # copy.. 
resource_cfg = self._session.get_resource_config( resource_key, schema) # import pprint # pprint.pprint (resource_cfg) # ------------------------------------------------------ # get parameters from cfg, set defaults where needed agent_mongodb_endpoint = resource_cfg.get( 'agent_mongodb_endpoint', database_url) agent_spawner = resource_cfg.get( 'agent_spawner', DEFAULT_AGENT_SPAWNER) agent_type = resource_cfg.get('agent_type', DEFAULT_AGENT_TYPE) agent_scheduler = resource_cfg.get('agent_scheduler') tunnel_bind_device = resource_cfg.get( 'tunnel_bind_device') default_queue = resource_cfg.get('default_queue') forward_tunnel_endpoint = resource_cfg.get( 'forward_tunnel_endpoint') js_endpoint = resource_cfg.get('job_manager_endpoint') lrms = resource_cfg.get('lrms') mpi_launch_method = resource_cfg.get( 'mpi_launch_method') pre_bootstrap = resource_cfg.get('pre_bootstrap') python_interpreter = resource_cfg.get( 'python_interpreter') spmd_variation = resource_cfg.get('spmd_variation') task_launch_method = resource_cfg.get( 'task_launch_method') rp_version = resource_cfg.get('rp_version', DEFAULT_RP_VERSION) virtenv_mode = resource_cfg.get( 'virtenv_mode', DEFAULT_VIRTENV_MODE) virtenv = resource_cfg.get('virtenv', DEFAULT_VIRTENV) stage_cacerts = resource_cfg.get( 'stage_cacerts', 'False') if stage_cacerts.lower() == 'true': stage_cacerts = True else: stage_cacerts = False # expand variables in virtenv string virtenv = virtenv % { 'pilot_sandbox': saga.Url(pilot_sandbox).path, 'global_sandbox': saga.Url(global_sandbox).path } # Check for deprecated global_virtenv global_virtenv = resource_cfg.get('global_virtenv') if global_virtenv: logger.warn( "'global_virtenv' keyword is deprecated -- use 'virtenv' and 'virtenv_mode'" ) virtenv = global_virtenv virtenv_mode = 'use' # set default scheme, host, port and dbname if not set db_url = saga.Url(agent_mongodb_endpoint) if not db_url.scheme: db_url.scheme = 'mongodb' if not db_url.host: db_url.host = 'localhost' if not 
db_url.port: db_url.port = 27017 if not database_name: database_name = 'radicalpilot' # Create a host:port string for use by the bootstrapper. database_hostport = "%s:%d" % (db_url.host, db_url.port) # ------------------------------------------------------ # Copy the bootstrap shell script. This also creates # the sandbox. We use always "default_bootstrapper.sh" bootstrapper = 'default_bootstrapper.sh' bootstrapper_path = os.path.abspath("%s/../bootstrapper/%s" \ % (mod_dir, bootstrapper)) msg = "Using bootstrapper %s" % bootstrapper_path logentries.append(Logentry(msg, logger=logger.info)) bs_script_url = saga.Url("file://localhost/%s" % bootstrapper_path) bs_script_tgt = saga.Url("%s/pilot_bootstrapper.sh" % pilot_sandbox) msg = "Copying bootstrapper '%s' to agent sandbox (%s)." \ % (bs_script_url, bs_script_tgt) logentries.append(Logentry(msg, logger=logger.debug)) bs_script = saga.filesystem.File(bs_script_url, session=self._session) bs_script.copy(bs_script_tgt, flags=saga.filesystem.CREATE_PARENTS) bs_script.close() # ------------------------------------------------------ # the version of the agent is derived from # rp_version, which has the following format # and interpretation: # # case rp_version: # @<token>: # @tag/@branch/@commit: # no sdist staging # git clone $github_base radical.pilot.src # (cd radical.pilot.src && git checkout token) # pip install -t $VIRTENV/rp_install/ radical.pilot.src # rm -rf radical.pilot.src # export PYTHONPATH=$VIRTENV/rp_install:$PYTHONPATH # # release: # no sdist staging # pip install -t $VIRTENV/rp_install radical.pilot # export PYTHONPATH=$VIRTENV/rp_install:$PYTHONPATH # # local: # needs sdist staging # tar zxf $sdist.tgz # pip install -t $VIRTENV/rp_install $sdist/ # export PYTHONPATH=$VIRTENV/rp_install:$PYTHONPATH # # debug: # needs sdist staging # tar zxf $sdist.tgz # pip install -t $SANDBOX/rp_install $sdist/ # export PYTHONPATH=$SANDBOX/rp_install:$PYTHONPATH # # installed: # no sdist staging # true # esac # # 
virtenv_mode # private : error if ve exists, otherwise create, then use # update : update if ve exists, otherwise create, then use # create : use if ve exists, otherwise create, then use # use : use if ve exists, otherwise error, then exit # recreate: delete if ve exists, otherwise create, then use # # examples : # [email protected] # virtenv@devel # virtenv@release # virtenv@installed # stage@local # stage@/tmp/my_agent.py # # Note that some combinations may be invalid, # specifically in the context of virtenv_mode. If, for # example, virtenv_mode is 'use', then the 'virtenv:tag' # will not make sense, as the virtenv is not updated. # In those cases, the virtenv_mode is honored, and # a warning is printed. # # Also, the 'stage' mode can only be combined with the # 'local' source, or with a path to the agent (relative # to mod_dir, or absolute). # # A rp_version which does not adhere to the # above syntax is ignored, and the fallback stage@local # is used. if not rp_version.startswith('@') and \ not rp_version in ['installed', 'local', 'debug']: raise ValueError("invalid rp_version '%s'" % rp_version) stage_sdist = True if rp_version in ['installed', 'release']: stage_sdist = False if rp_version.startswith('@'): stage_sdist = False rp_version = rp_version[1:] # strip '@' # ------------------------------------------------------ # Copy the rp sdist if needed. We actually also stage # the sdists for radical.utils and radical.saga, so that # we have the complete stack to install... if stage_sdist: for path in [ ru.sdist_path, saga.sdist_path, sdist_path ]: sdist_url = saga.Url("file://localhost/%s" % path) msg = "Copying sdist '%s' to sdist sandbox (%s)." % ( sdist_url, pilot_sandbox) logentries.append( Logentry(msg, logger=logger.debug)) sdist_file = saga.filesystem.File(sdist_url) sdist_file.copy("%s/" % (str(pilot_sandbox))) sdist_file.close() # ------------------------------------------------------ # some machines cannot run pip due to outdated ca certs. 
# For those, we also stage an updated cert bundle if stage_cacerts: cc_path = os.path.abspath("%s/../bootstrapper/%s" \ % (mod_dir, 'cacert.pem.gz')) cc_script_url = saga.Url("file://localhost/%s" % cc_path) cc_script_tgt = saga.Url("%s/cacert.pem.gz" % pilot_sandbox) cc_script = saga.filesystem.File( cc_script_url, session=self._session) cc_script.copy( cc_script_tgt, flags=saga.filesystem.CREATE_PARENTS) cc_script.close() # ------------------------------------------------------ # sanity checks if not agent_spawner: raise RuntimeError("missing agent spawner") if not agent_scheduler: raise RuntimeError("missing agent scheduler") if not lrms: raise RuntimeError("missing LRMS") if not mpi_launch_method: raise RuntimeError("missing mpi launch method") if not task_launch_method: raise RuntimeError("missing task launch method") # massage some values debug_level = os.environ.get( 'RADICAL_PILOT_AGENT_VERBOSE', logger.level) try: debug_level = int(debug_level) except ValueError: debug_level = { 'CRITICAL': 1, 'ERROR': 2, 'WARNING': 3, 'WARN': 3, 'INFO': 4, 'DEBUG': 5 }.get(debug_level, 0) if not queue: queue = default_queue if cleanup and isinstance(cleanup, bool): cleanup = 'luve' # l : log files # u : unit work dirs # v : virtualenv # e : everything (== pilot sandbox) # # we never cleanup virtenvs which are not private if virtenv_mode is not 'private': cleanup = cleanup.replace('v', '') sdists = ':'.join( [ru.sdist_name, saga.sdist_name, sdist_name]) # set mandatory args bootstrap_args = "" bootstrap_args += " -b '%s'" % sdists bootstrap_args += " -c '%s'" % number_cores bootstrap_args += " -d '%s'" % debug_level bootstrap_args += " -g '%s'" % virtenv bootstrap_args += " -j '%s'" % task_launch_method bootstrap_args += " -k '%s'" % mpi_launch_method bootstrap_args += " -l '%s'" % lrms bootstrap_args += " -m '%s'" % database_hostport bootstrap_args += " -n '%s'" % database_name bootstrap_args += " -o '%s'" % agent_spawner bootstrap_args += " -p '%s'" % pilot_id 
bootstrap_args += " -q '%s'" % agent_scheduler bootstrap_args += " -r '%s'" % runtime bootstrap_args += " -s '%s'" % session_uid bootstrap_args += " -t '%s'" % agent_type bootstrap_args += " -u '%s'" % virtenv_mode bootstrap_args += " -v '%s'" % rp_version # set optional args if database_auth: bootstrap_args += " -a '%s'" % database_auth if tunnel_bind_device: bootstrap_args += " -D '%s'" % tunnel_bind_device if pre_bootstrap: bootstrap_args += " -e '%s'" % "' -e '".join( pre_bootstrap) if forward_tunnel_endpoint: bootstrap_args += " -f '%s'" % forward_tunnel_endpoint if python_interpreter: bootstrap_args += " -i '%s'" % python_interpreter if cleanup: bootstrap_args += " -x '%s'" % cleanup # ------------------------------------------------------ # now that the script is in place and we know where it is, # we can launch the agent js_url = saga.Url(js_endpoint) logger.debug("saga.job.Service ('%s')" % js_url) if js_url in self._shared_worker_data['job_services']: js = self._shared_worker_data['job_services'][ js_url] else: js = saga.job.Service(js_url, session=self._session) self._shared_worker_data['job_services'][ js_url] = js # ------------------------------------------------------ # Create SAGA Job description and submit the pilot job jd = saga.job.Description() jd.executable = "/bin/bash" jd.arguments = [ "-l pilot_bootstrapper.sh", bootstrap_args ] jd.working_directory = saga.Url(pilot_sandbox).path jd.project = project jd.output = "agent.out" jd.error = "agent.err" jd.total_cpu_count = number_cores jd.wall_time_limit = runtime jd.total_physical_memory = memory jd.queue = queue # Set the SPMD variation only if required if spmd_variation: jd.spmd_variation = spmd_variation if 'RADICAL_PILOT_PROFILE' in os.environ: jd.environment = {'RADICAL_PILOT_PROFILE': 'TRUE'} logger.debug("Bootstrap command line: %s %s" % (jd.executable, jd.arguments)) msg = "Submitting SAGA job with description: %s" % str( jd.as_dict()) logentries.append(Logentry(msg, logger=logger.debug)) 
pilotjob = js.create_job(jd) pilotjob.run() # do a quick error check if pilotjob.state == saga.FAILED: raise RuntimeError("SAGA Job state is FAILED.") saga_job_id = pilotjob.id self._shared_worker_data['job_ids'][pilot_id] = [ saga_job_id, js_url ] msg = "SAGA job submitted with job id %s" % str( saga_job_id) logentries.append(Logentry(msg, logger=logger.debug)) # # ------------------------------------------------------ log_dicts = list() for le in logentries: log_dicts.append(le.as_dict()) # Update the Pilot's state to 'PENDING_ACTIVE' if SAGA job submission was successful. ts = datetime.datetime.utcnow() ret = pilot_col.update( { "_id": pilot_id, "state": 'Launching' }, { "$set": { "state": PENDING_ACTIVE, "saga_job_id": saga_job_id }, "$push": { "statehistory": { "state": PENDING_ACTIVE, "timestamp": ts } }, "$pushAll": { "log": log_dicts } }) if ret['n'] == 0: # could not update, probably because the agent is # running already. Just update state history and # jobid then # FIXME: make sure of the agent state! ret = pilot_col.update({"_id": pilot_id}, { "$set": { "saga_job_id": saga_job_id }, "$push": { "statehistory": { "state": PENDING_ACTIVE, "timestamp": ts } }, "$pushAll": { "log": log_dicts } }) except Exception as e: # Update the Pilot's state 'FAILED'. out, err, log = self._get_pilot_logs( pilot_col, pilot_id) ts = datetime.datetime.utcnow() # FIXME: we seem to be unable to bson/json handle saga # log messages containing an '#'. This shows up here. # Until we find a clean workaround, make log shorter and # rely on saga logging to reveal the problem. msg = "Pilot launching failed! 
(%s)" % e logentries.append(Logentry(msg)) log_dicts = list() log_messages = list() for le in logentries: log_dicts.append(le.as_dict()) log_messages.append(le.message) pilot_col.update( { "_id": pilot_id, "state": { "$ne": FAILED } }, { "$set": { "state": FAILED, "stdout": out, "stderr": err, "logfile": log }, "$push": { "statehistory": { "state": FAILED, "timestamp": ts } }, "$pushAll": { "log": log_dicts } }) logger.exception('\n'.join(log_messages)) except SystemExit as e: logger.exception( "pilot launcher thread caught system exit -- forcing application shutdown" ) import thread thread.interrupt_main()
def _handle_unit(self, unit, actionables):
    """
    Perform client-side input staging for a single unit.

    Creates the unit sandbox on the target resource, packs all directives
    marked `rpc.TARBALL` into a single local tarball (replacing them with one
    TRANSFER directive for the tarball), executes all TRANSFER directives via
    the cached SAGA filesystem directory, and finally advances the unit to
    AGENT_STAGING_INPUT_PENDING.

    :param unit:        unit dict; must contain 'uid', 'unit_sandbox',
                        'pilot_sandbox', 'resource_sandbox' and 'description'
    :param actionables: list of staging directive dicts with keys
                        'action', 'flags', 'uid', 'source', 'target'
    """

    # FIXME: we should created unit sandboxes in a bulk

    uid = unit['uid']

    self._prof.prof("create_sandbox_start", uid=uid)

    # URL expansion contexts: sources are resolved relative to the client's
    # pwd, targets relative to the unit sandbox.
    src_context = {'pwd'     : os.getcwd(),             # !!!
                   'unit'    : unit['unit_sandbox'],
                   'pilot'   : unit['pilot_sandbox'],
                   'resource': unit['resource_sandbox']}
    tgt_context = {'pwd'     : unit['unit_sandbox'],    # !!!
                   'unit'    : unit['unit_sandbox'],
                   'pilot'   : unit['pilot_sandbox'],
                   'resource': unit['resource_sandbox']}

    # we have actionable staging directives, and thus we need a unit
    # sandbox.
    sandbox = rs.Url(unit["unit_sandbox"])
    tmp     = rs.Url(unit["unit_sandbox"])

    # url used for cache (sandbox url w/o path)
    tmp.path = '/'
    key = str(tmp)
    self._log.debug('key %s / %s', key, tmp)

    if key not in self._fs_cache:
        self._fs_cache[key] = rs.filesystem.Directory(tmp,
                                                      session=self._session)

    saga_dir = self._fs_cache[key]
    saga_dir.make_dir(sandbox, flags=rs.filesystem.CREATE_PARENTS)

    self._prof.prof("create_sandbox_stop", uid=uid)

    # Loop over all transfer directives and filter out tarball staging
    # directives.  Those files are added into a tarball, and a single
    # actionable to stage that tarball replaces the original actionables.

    # create a new actionable list during the filtering
    new_actionables = list()
    tar_file        = None

    for sd in actionables:

        # don't touch non-tar SDs
        if sd['action'] != rpc.TARBALL:
            new_actionables.append(sd)

        else:
            did = sd['uid']
            src = sd['source']
            tgt = sd['target']

            src = complete_url(src, src_context, self._log)
            tgt = complete_url(tgt, tgt_context, self._log)

            self._prof.prof('staging_in_tar_start', uid=uid, msg=did)

            # create a tarfile on the first match, and register it for
            # transfer (the TRANSFER directive below replaces all TARBALL
            # directives)
            if not tar_file:
                tmp_file = tempfile.NamedTemporaryFile(prefix='rp_usi_%s.' % uid,
                                                       suffix='.tar',
                                                       delete=False)
                tar_path = tmp_file.name
                tar_file = tarfile.open(fileobj=tmp_file, mode='w')
                tar_src  = ru.Url('file://localhost/%s' % tar_path)
                tar_tgt  = ru.Url('unit:////%s.tar' % uid)
                tar_did  = ru.generate_id('sd')
                tar_sd   = {'action': rpc.TRANSFER,
                            'flags' : rpc.DEFAULT_FLAGS,
                            'uid'   : tar_did,
                            'source': str(tar_src),
                            'target': str(tar_tgt),
                           }
                new_actionables.append(tar_sd)

            # add the src file
            tar_file.add(src.path, arcname=tgt.path)

            self._prof.prof('staging_in_tar_stop', uid=uid, msg=did)

    # make sure tarball is flushed to disk
    if tar_file:
        tar_file.close()

    # work on the filtered TRANSFER actionables
    for sd in new_actionables:

        action = sd['action']
        flags  = sd['flags']
        did    = sd['uid']
        src    = sd['source']
        tgt    = sd['target']

        if action == rpc.TRANSFER:

            # NOTE: resolve exactly once -- the original code called
            #       `complete_url()` a second time below, on values which
            #       were already resolved URL objects.
            src = complete_url(src, src_context, self._log)
            tgt = complete_url(tgt, tgt_context, self._log)

            # Check if the src is a folder, if true
            # add recursive flag if not already specified
            if os.path.isdir(src.path):
                flags |= rs.filesystem.RECURSIVE

            # Always set CREATE_PARENTS
            flags |= rs.filesystem.CREATE_PARENTS

            self._prof.prof('staging_in_start', uid=uid, msg=did)
            saga_dir.copy(src, tgt, flags=flags)
            self._prof.prof('staging_in_stop', uid=uid, msg=did)

    if tar_file:

        # some tarball staging was done.  Add a staging directive for the
        # agent to untar the tarball, and clean up the local copy.
        tar_sd['action'] = rpc.TARBALL
        unit['description']['input_staging'].append(tar_sd)
        os.remove(tar_path)

    # staging is done, we can advance the unit at last
    self.advance(unit, rps.AGENT_STAGING_INPUT_PENDING,
                 publish=True, push=True)
def main(): tmp_dir = None try: tmp_dir = tempfile.mkdtemp(prefix='saga-test-', suffix='-%s' % TEST_NAME, dir=os.path.expanduser('~/tmp')) print 'tmpdir: %s' % tmp_dir ctx = saga.Context("x509") ctx.user_proxy = '/Users/mark/proj/myproxy/xsede.x509' session = saga.Session() session.add_context(ctx) source_url = saga.Url() source_url.schema = 'go' source_url.host = SOURCE source_url.path = tmp_dir target_url = saga.Url() target_url.schema = 'go' target_url.host = TARGET target_url.path = os.path.join('~/saga-tests/', os.path.basename(tmp_dir)) print "Point to local Directory through GO ..." d = saga.filesystem.Directory(source_url) print "And check ..." assert d.is_dir() == True assert d.is_file() == False assert d.is_link() == False d.close() print "Point to remote Directory through GO ..." d = saga.filesystem.Directory(target_url, flags=saga.filesystem.CREATE_PARENTS) print "And check ..." assert d.is_dir() == True assert d.is_file() == False assert d.is_link() == False d.close() print "Point to local file through GO, before creation ..." caught = False try: saga.filesystem.File(os.path.join(str(source_url), FILE_A_level_0)) except saga.DoesNotExist: caught = True assert caught == True print "Create actual file ..." touch(tmp_dir, FILE_A_level_0) print "Try again ..." f = saga.filesystem.File(os.path.join(str(source_url), FILE_A_level_0)) assert f.is_file() == True assert f.is_dir() == False assert f.is_link() == False f.close() print "Copy local file to remote, using different filename ..." d = saga.filesystem.Directory(target_url, flags=saga.filesystem.CREATE_PARENTS) d.copy(os.path.join(str(source_url), FILE_A_level_0), FILE_A_level_0+COPIED_SUFFIX) d.close() f = saga.filesystem.File(os.path.join(str(target_url), FILE_A_level_0+COPIED_SUFFIX)) assert f.is_file() == True assert f.is_dir() == False assert f.is_link() == False f.close() print "Copy local file to remote, keeping filename in tact ..." 
d = saga.filesystem.Directory(target_url, flags=saga.filesystem.CREATE_PARENTS) d.copy(os.path.join(str(source_url), FILE_A_level_0), FILE_A_level_0) d.close() f = saga.filesystem.File(os.path.join(str(target_url), FILE_A_level_0)) assert f.is_file() == True assert f.is_dir() == False assert f.is_link() == False f.close() print 'Create file in level 1 ...' tree = LEVEL_1 os.mkdir(os.path.join(tmp_dir, tree)) touch(os.path.join(tmp_dir, tree), FILE_A_level_1) print "Test local file ..." f = saga.filesystem.File(os.path.join(str(source_url), tree, FILE_A_level_1)) assert f.is_file() == True assert f.is_dir() == False assert f.is_link() == False f.close() print "Copy local file to remote, keeping filename in tact ..." d = saga.filesystem.Directory(os.path.join(str(target_url), tree), flags=saga.filesystem.CREATE_PARENTS) d.copy(os.path.join(str(source_url), tree, FILE_A_level_1), FILE_A_level_1) d.close() print "Test file after transfer ..." f = saga.filesystem.File(os.path.join(str(target_url), tree, FILE_A_level_1)) assert f.is_file() == True assert f.is_dir() == False assert f.is_link() == False f.close() print "Copy non-existent local file to remote, keeping filename in tact ..." d = saga.filesystem.Directory(str(target_url), flags=saga.filesystem.CREATE_PARENTS) try: d.copy(os.path.join(str(source_url), NON_EXISTING_FILE), NON_EXISTING_FILE) except saga.DoesNotExist: caught = True assert caught == True print "Test file after (non-)transfer ..." 
caught = False try: saga.filesystem.File(os.path.join(str(target_url), NON_EXISTING_FILE)) except saga.DoesNotExist: caught = True assert caught == True # destination = "go://gridftp.stampede.tacc.xsede.org/~/tmp/" # #destination = "go://oasis-dm.sdsc.xsede.org/~/tmp/" # #destination = "go://ncsa#BlueWaters/~/tmp/" # #destination = "go://marksant#netbook/Users/mark/tmp/go/" # src_filename = "my_file" # dst_filename = "my_file_" # rt_filename = "my_file__" # # # open home directory on a remote machine # source_dir = saga.filesystem.Directory(source) # # # copy .bash_history to /tmp/ on the local machine # source_dir.copy(src_filename, os.path.join(destination, dst_filename)) # # # list 'm*' in local /tmp/ directory # dest_dir = saga.filesystem.Directory(destination) # for entry in dest_dir.list(pattern='%s*' % src_filename[0]): # print entry # # dest_file = saga.filesystem.File(os.path.join(destination, dst_filename)) # assert dest_file.is_file() == True # assert dest_file.is_link() == False # assert dest_file.is_dir() == False # print 'Size: %d' % dest_file.get_size() # # dest_file.copy(source) # # dest_file.copy(os.path.join(source+'broken', rt_filename)) print "Before return 0" return 0 except saga.SagaException as ex: # Catch all saga exceptions print "An exception occurred: (%s) %s " % (ex.type, (str(ex))) # Trace back the exception. That can be helpful for debugging. print " \n*** Backtrace:\n %s" % ex.traceback print "before return -1" return -1 finally: print "and finally ..." if CLEANUP and tmp_dir: shutil.rmtree(tmp_dir)
__author__ = "Andre Merzky" __copyright__ = "Copyright 2012-2013, The SAGA Project" __license__ = "MIT" import re import time import saga import saga.utils.pty_shell as sups try : shell = sups.PTYShell (saga.Url ("fork://localhost"), []) shell.run_async ("(sftp -b - localhost || (printf \"SFTP_ABORT\n\"; false)) <<EOT") shell.send ("progress\nput /home/merzky/downloads/totalview*.sh /tmp/t\nEOT\n") # pat_bof = re.compile ("(?P<perc>\d+\%).*(?P<time>--:--)\s*ETA") pat_bof = re.compile ("(?P<perc>\d+)\%\s+(?P<size>.+?)\s+(?P<perf>.+?)\s+(?P<time>--:--)\s*ETA") pat_eta = re.compile ("(?P<perc>\d+)\%\s+(?P<size>.+?)\s+(?P<perf>.+?)\s+(?P<time>\d\d:\d\d)\s*ETA") pat_eof = re.compile ("(?P<perc>\d+)\%\s+(?P<size>.+?)\s+(?P<perf>.+?)\s+(?P<time>\d\d:\d\d)\s*\n") pat_def = re.compile ("^sftp>.*\n") begin = True error = "" while True : ret, out = shell.find (['ETA$', 'SFTP_ABORT\n', '\n']) progress = None # ---------------------------------------------------------------------- # found ETA - transfer is in progress
def run(self):
    """Starts the process when Process.start() is called.

    Main worker loop of the InputFileTransferWorker: repeatedly claims one
    ComputeUnit whose FTW input staging is PENDING, creates its remote
    sandbox, executes its staging directives via SAGA, and records state
    transitions in MongoDB.  Worker #1 additionally finalizes CUs whose
    staging has completed.  Runs until `self._stop` is set; a SystemExit
    forces an application-wide shutdown.
    """

    # make sure to catch sys.exit (which raises SystemExit)
    try :
        logger.info("Starting InputFileTransferWorker")

        # Try to connect to the database and create a tailable cursor.
        try:
            connection = self.db_connection_info.get_db_handle()
            db = connection[self.db_connection_info.dbname]
            um_col = db["%s.cu" % self.db_connection_info.session_id]
            logger.debug("Connected to MongoDB. Serving requests for UnitManager %s." % self.unit_manager_id)

        except Exception as e :
            logger.exception("Connection error: %s" % e)
            raise

        try :
            while not self._stop.is_set():
                # See if we can find a ComputeUnit that is waiting for
                # input file transfer.
                compute_unit = None

                # Atomically claim one PENDING CU: flip its FTW input status
                # to EXECUTING and move it to STAGING_INPUT in one DB call.
                ts = datetime.datetime.utcnow()
                compute_unit = um_col.find_and_modify(
                    query={"unitmanager": self.unit_manager_id,
                           "FTW_Input_Status": PENDING},
                    update={"$set" : {"FTW_Input_Status": EXECUTING,
                                      "state": STAGING_INPUT},
                            "$push": {"statehistory": {"state": STAGING_INPUT, "timestamp": ts}}},
                    limit=BULK_LIMIT # TODO: bulklimit is probably not the best way to ensure there is just one
                )
                # FIXME: AM: find_and_modify is not bulkable!
                state = STAGING_INPUT

                if compute_unit is None:
                    # Sleep a bit if no new units are available.
                    time.sleep(IDLE_TIME)

                else:
                    compute_unit_id = None
                    try:
                        log_messages = []

                        # We have found a new CU. Now we can process the transfer
                        # directive(s) wit SAGA.
                        compute_unit_id = str(compute_unit["_id"])
                        remote_sandbox = compute_unit["sandbox"]
                        input_staging = compute_unit["FTW_Input_Directives"]

                        # We need to create the CU's directory in case it doesn't exist yet.
                        log_msg = "Creating ComputeUnit sandbox directory %s." % remote_sandbox
                        log_messages.append(log_msg)
                        logger.info(log_msg)

                        # Creating the sandbox directory.  Directory handles
                        # are cached per filesystem root in self._saga_dirs.
                        try:
                            logger.debug ("saga.fs.Directory ('%s')" % remote_sandbox)

                            remote_sandbox_keyurl = saga.Url (remote_sandbox)
                            remote_sandbox_keyurl.path = '/'
                            remote_sandbox_key = str(remote_sandbox_keyurl)

                            if remote_sandbox_key not in self._saga_dirs :
                                self._saga_dirs[remote_sandbox_key] = \
                                        saga.filesystem.Directory (remote_sandbox_key,
                                                flags=saga.filesystem.CREATE_PARENTS,
                                                session=self._session)

                            saga_dir = self._saga_dirs[remote_sandbox_key]
                            saga_dir.make_dir (remote_sandbox,
                                               flags=saga.filesystem.CREATE_PARENTS)
                        except Exception as e :
                            logger.exception('Error: %s' % e)
                            # FIXME: why is this exception ignored?  AM

                        logger.info("Processing input file transfers for ComputeUnit %s" % compute_unit_id)
                        # Loop over all transfer directives and execute them.
                        for sd in input_staging:

                            # Re-read the CU state before each transfer so
                            # a cancellation can interrupt the loop.
                            state_doc = um_col.find_one(
                                {"_id": compute_unit_id},
                                fields=["state"]
                            )
                            if state_doc['state'] == CANCELED:
                                logger.info("Compute Unit Canceled, interrupting input file transfers.")
                                state = CANCELED
                                break

                            abs_src = os.path.abspath(sd['source'])
                            input_file_url = saga.Url("file://localhost/%s" % abs_src)
                            # An empty target means "drop into the sandbox
                            # root under the original name".
                            if not sd['target']:
                                target = remote_sandbox
                            else:
                                target = "%s/%s" % (remote_sandbox, sd['target'])

                            log_msg = "Transferring input file %s -> %s" % (input_file_url, target)
                            log_messages.append(log_msg)
                            logger.debug(log_msg)

                            # Execute the transfer.
                            logger.debug ("saga.fs.File ('%s')" % input_file_url)
                            input_file = saga.filesystem.File(
                                input_file_url,
                                session=self._session
                            )

                            if CREATE_PARENTS in sd['flags']:
                                copy_flags = saga.filesystem.CREATE_PARENTS
                            else:
                                copy_flags = 0

                            # NOTE(review): copy failures are logged but do
                            # not abort the loop -- apparently best-effort.
                            try :
                                input_file.copy(target, flags=copy_flags)
                            except Exception as e :
                                logger.exception (e)
                            input_file.close()

                            # If all went fine, update the state of this StagingDirective to Done
                            um_col.find_and_modify(
                                query={"_id" : compute_unit_id,
                                       'FTW_Input_Status': EXECUTING,
                                       'FTW_Input_Directives.state': PENDING,
                                       'FTW_Input_Directives.source': sd['source'],
                                       'FTW_Input_Directives.target': sd['target'],
                                       },
                                update={'$set': {'FTW_Input_Directives.$.state': 'Done'},
                                        '$push': {'log': {
                                            'timestamp': datetime.datetime.utcnow(),
                                            'message' : log_msg}}
                                }
                            )

                    except Exception as e :
                        # Update the CU's state 'FAILED'.
                        ts = datetime.datetime.utcnow()
                        logentry = {'message' : "Input transfer failed: %s" % e,
                                    'timestamp': ts}

                        um_col.update({'_id': compute_unit_id}, {
                            '$set': {'state': FAILED},
                            '$push': {
                                'statehistory': {'state': FAILED, 'timestamp': ts},
                                'log': logentry
                            }
                        })

                        logger.exception(str(logentry))

                # Code below is only to be run by the "first" or only worker
                if self._worker_number > 1:
                    continue

                # If the CU was canceled we can skip the remainder of this loop.
                if state == CANCELED:
                    continue

                #
                # Check to see if there are more pending Directives, if not, we are Done
                #
                cursor_w = um_col.find({"unitmanager": self.unit_manager_id,
                                        "$or": [ {"Agent_Input_Status": EXECUTING},
                                                 {"FTW_Input_Status": EXECUTING}
                                               ]
                                        }
                                       )
                # Iterate over all the returned CUs (if any)
                for cu in cursor_w:
                    # See if there are any FTW Input Directives still pending
                    if cu['FTW_Input_Status'] == EXECUTING and \
                            not any(d['state'] == EXECUTING or d['state'] == PENDING
                                    for d in cu['FTW_Input_Directives']):
                        # All Input Directives for this FTW are done, mark the CU accordingly
                        um_col.update({"_id": cu["_id"]},
                                      {'$set': {'FTW_Input_Status': DONE},
                                       '$push': {'log': {
                                           'timestamp': datetime.datetime.utcnow(),
                                           'message' : 'All FTW Input Staging Directives done - %d.' % self._worker_number}}
                                       }
                                      )
                    # See if there are any Agent Input Directives still pending or executing,
                    # if not, mark it DONE.
                    if cu['Agent_Input_Status'] == EXECUTING and \
                            not any(d['state'] == EXECUTING or d['state'] == PENDING
                                    for d in cu['Agent_Input_Directives']):
                        # All Input Directives for this Agent are done, mark the CU accordingly
                        um_col.update({"_id": cu["_id"]},
                                      {'$set': {'Agent_Input_Status': DONE},
                                       '$push': {'log': {
                                           'timestamp': datetime.datetime.utcnow(),
                                           'message' : 'All Agent Input Staging Directives done - %d.' % self._worker_number}}
                                       }
                                      )

                #
                # Check for all CUs if both Agent and FTW staging is done, we can then mark the CU PendingExecution
                #
                ts = datetime.datetime.utcnow()
                um_col.find_and_modify(
                    query={"unitmanager": self.unit_manager_id,
                           "Agent_Input_Status": { "$in": [ None, DONE ] },
                           "FTW_Input_Status": { "$in": [ None, DONE ] },
                           "state": STAGING_INPUT
                           },
                    update={"$set": {
                        "state": PENDING_EXECUTION
                    },
                        "$push": {
                            "statehistory": {"state": PENDING_EXECUTION, "timestamp": ts}
                        }
                    }
                )

        except Exception as e :
            logger.exception("transfer worker error: %s" % e)
            self._session.close (cleanup=False)
            raise

    except SystemExit as e :
        logger.debug("input file transfer thread caught system exit -- forcing application shutdown")
        import thread
        thread.interrupt_main ()
def _start_pilot_bulk(self, resource, schema, pilots):
    """
    For each pilot, we prepare by determining what files need to be staged,
    and what job description needs to be submitted.  We expect
    `_prepare_pilot(resource, pilot)` to return a dict with:

        {
          'js' : saga.job.Description,
          'ft' : [
            { 'src' : string  # absolute source file name
              'tgt' : string  # relative target file name
              'rem' : bool    # shall we remove src?
            },
            ...
          ]
        }

    When transfering data, we'll ensure that each src is only transferred
    once (in fact, we put all src files into a tarball and unpack that on
    the target side).

    The returned dicts are expected to only contain files which actually
    need staging, ie. which have not been staged during a previous pilot
    submission.  That implies one of two things: either this component is
    stateful, and remembers what has been staged -- which makes it
    difficult to use multiple component instances; or the component
    inspects the target resource for existing files -- which involves
    additional expensive remote hops.
    FIXME: since neither is implemented at this point we won't discuss the
           tradeoffs further -- right now files are unique per pilot bulk.

    Once all dicts are collected, we create one additional file which
    contains the staging information, and then pack all src files into
    a tarball for staging.  We transfer the tarball, and *immediately*
    trigger the untaring on the target resource, which is thus *not* part
    of the bootstrapping process.
    NOTE: this is to avoid untaring race conditions for multiple pilots,
          and also to simplify bootstrapping dependencies -- the
          bootstrappers are likely within the tarball after all...
    """

    rcfg = self._session.get_resource_config(resource, schema)
    sid  = self._session.uid

    # we create a fake session_sandbox with all pilot_sandboxes in /tmp, and
    # then tar it up.  Once we untar that tarball on the target machine, we
    # should have all sandboxes and all files required to bootstrap the
    # pilots
    # FIXME: on untar, there is a race between multiple launcher components
    #        within the same session toward the same target resource.
    tmp_dir  = os.path.abspath(tempfile.mkdtemp(prefix='rp_agent_tar_dir'))
    tar_name = '%s.%s.tgz' % (sid, self.uid)
    tar_tgt  = '%s/%s'     % (tmp_dir, tar_name)
    tar_url  = rs.Url('file://localhost/%s' % tar_tgt)

    # we need the session sandbox url, but that is (at least in principle)
    # dependent on the schema to use for pilot startup.  So we confirm here
    # that the bulk is consistent wrt. to the schema.
    # FIXME: if it is not, it needs to be splitted into schema-specific
    #        sub-bulks
    schema = pilots[0]['description'].get('access_schema')
    for pilot in pilots[1:]:
        assert(schema == pilot['description'].get('access_schema')), \
                'inconsistent scheme on launch / staging'

    session_sandbox = self._session._get_session_sandbox(pilots[0]).path

    # we will create the session sandbox before we untar, so we can use that
    # as workdir, and pack all paths relative to that session sandbox.  That
    # implies that we have to recheck that all URLs in fact do point into
    # the session sandbox.

    ft_list = list()  # files to stage
    jd_list = list()  # jobs  to submit
    for pilot in pilots:
        info = self._prepare_pilot(resource, rcfg, pilot)
        ft_list += info['ft']
        jd_list.append(info['jd'])
        self._prof.prof('staging_in_start', uid=pilot['uid'])

    # populate the tarball staging dir with (symlinks to) all files,
    # re-rooted relative to the session sandbox
    for ft in ft_list:
        src     = os.path.abspath(ft['src'])
        tgt     = os.path.relpath(os.path.normpath(ft['tgt']), session_sandbox)
      # src_dir = os.path.dirname(src)
        tgt_dir = os.path.dirname(tgt)

        if tgt_dir.startswith('..'):
            raise ValueError('staging target %s outside of pilot sandbox'
                             % ft['tgt'])

        if not os.path.isdir('%s/%s' % (tmp_dir, tgt_dir)):
            os.makedirs('%s/%s' % (tmp_dir, tgt_dir))

        if src == '/dev/null':
            # we want an empty file -- touch it (tar will refuse to
            # handle a symlink to /dev/null)
            open('%s/%s' % (tmp_dir, tgt), 'a').close()
        else:
            os.symlink(src, '%s/%s' % (tmp_dir, tgt))

    # tar.  If any command fails, this will raise.
    cmd = "cd %s && tar zchf %s *" % (tmp_dir, tar_tgt)
    self._log.debug('cmd: %s', cmd)
    try:
        out = sp.check_output(["/bin/sh", "-c", cmd], stderr=sp.STDOUT)
    except sp.CalledProcessError as e:
        # BUGFIX: the original handler logged `out`, which is unbound when
        # check_output raises -- use the exception's captured output.
        self._log.exception('callout failed: %s', e.output)
        raise
    except Exception:
        self._log.exception('callout failed')
        raise
    else:
        self._log.debug('out: %s', out)

    # remove all files marked for removal-after-pack
    for ft in ft_list:
        if ft['rem']:
            os.unlink(ft['src'])

    fs_endpoint = rcfg['filesystem_endpoint']
    fs_url      = rs.Url(fs_endpoint)

    self._log.debug("rs.file.Directory ('%s')", fs_url)

    with self._cache_lock:
        if fs_url in self._saga_fs_cache:
            fs = self._saga_fs_cache[fs_url]
        else:
            fs = rsfs.Directory(fs_url, session=self._session)
            self._saga_fs_cache[fs_url] = fs

    tar_rem      = rs.Url(fs_url)
    tar_rem.path = "%s/%s" % (session_sandbox, tar_name)

    fs.copy(tar_url, tar_rem, flags=rsfs.CREATE_PARENTS)

    shutil.rmtree(tmp_dir)

    # we now need to untar on the target machine.
    js_url = ru.Url(pilots[0]['js_url'])

    # well, we actually don't need to talk to the lrms, but only need
    # a shell on the headnode.  That seems true for all LRMSs we use right
    # now.  So, lets convert the URL:
    if '+' in js_url.scheme:
        parts = js_url.scheme.split('+')
        if 'gsissh' in parts:
            js_url.scheme = 'gsissh'
        elif 'ssh' in parts:
            js_url.scheme = 'ssh'
    else:
        # In the non-combined '+' case we need to distinguish between
        # a url that was the result of a hop or a local lrms.
        if js_url.scheme not in ['ssh', 'gsissh']:
            js_url.scheme = 'fork'

    with self._cache_lock:
        if js_url in self._saga_js_cache:
            js_tmp = self._saga_js_cache[js_url]
        else:
            js_tmp = rs.job.Service(js_url, session=self._session)
            self._saga_js_cache[js_url] = js_tmp

  ## cmd = "tar zmxvf %s/%s -C / ; rm -f %s" % \
    cmd = "tar zmxvf %s/%s -C %s" % \
            (session_sandbox, tar_name, session_sandbox)
    j = js_tmp.run_job(cmd)
    j.wait()
    self._log.debug('tar cmd : %s', cmd)
    self._log.debug('tar done: %s, %s, %s', j.state, j.stdout, j.stderr)

    for pilot in pilots:
        self._prof.prof('staging_in_stop',  uid=pilot['uid'])
        self._prof.prof('submission_start', uid=pilot['uid'])

    # look up or create JS for actual pilot submission.  This might result
    # in the same js url as above, or not.
    js_ep = rcfg['job_manager_endpoint']
    with self._cache_lock:
        if js_ep in self._saga_js_cache:
            js = self._saga_js_cache[js_ep]
        else:
            js = rs.job.Service(js_ep, session=self._session)
            self._saga_js_cache[js_ep] = js

    # now that the scripts are in place and configured,
    # we can launch the agent
    jc = rs.job.Container()

    for jd in jd_list:
        self._log.debug('jd: %s', pprint.pformat(jd.as_dict()))
        jc.add(js.create_job(jd))

    jc.run()

    # we assume here that the tasks arrive in the same order as the job
    # descriptions.  For uniform sets of pilots the order does not matter
    # much though.  Either way, this needs confirming on SAGA level
    # FIXME
    for j, jd in zip(jc.get_tasks(), jd_list):

        # do a quick error check
        if j.state == rs.FAILED:
            self._log.error('%s: %s : %s : %s',
                            j.id, j.state, j.stderr, j.stdout)
            raise RuntimeError("SAGA Job state is FAILED. (%s)" % jd.name)

        pilot = None
        pid   = jd.name
        for p in pilots:
            if p['uid'] == pid:
                pilot = p
                break

        assert(pilot)

        # Update the Pilot's state to 'PMGR_ACTIVE_PENDING' if SAGA job
        # submission was successful.  Since the pilot leaves the scope of
        # the PMGR for the time being, we update the complete DB document
        pilot['$all'] = True

        # FIXME: update the right pilot
        with self._pilots_lock:
            self._pilots[pid] = dict()
            self._pilots[pid]['pilot'] = pilot
            self._pilots[pid]['job']   = j

        # make sure we watch that pilot
        with self._check_lock:
            self._checking.append(pid)

    for pilot in pilots:
        self._prof.prof('submission_stop', uid=pilot['uid'])
def _get_resource_sandbox(self, pilot):
    """
    for a given pilot dict, determine the global RP sandbox, based on the
    pilot's 'resource' attribute.
    """

    self.is_valid()

    # FIXME: this should get 'resource, schema=None' as parameters
    resource = pilot['description'].get('resource')
    schema   = pilot['description'].get('access_schema')

    if not resource:
        raise ValueError('Cannot get pilot sandbox w/o resource target')

    # the global sandbox will be the same for all pilots on any resource, so
    # we cache it
    with self._cache_lock:

        cached = self._cache['resource_sandbox'].get(resource)
        if cached is not None:
            return cached

        # cache miss -- determine sandbox and fill cache
        resource_cfg = self.get_resource_config(resource, schema)
        fs_url       = rs.Url(resource_cfg['filesystem_endpoint'])

        # Get the sandbox from either the pilot_desc or resource conf
        raw_workdir = pilot['description'].get('sandbox') \
                   or resource_cfg.get('default_remote_workdir', "$PWD")

        # If the sandbox contains expandables, we need to resolve those
        # remotely.
        # NOTE: Note that this will only work for (gsi)ssh or shell based
        #       access mechanisms
        needs_expansion = ('$' in raw_workdir or '`' in raw_workdir)

        if not needs_expansion:
            # no need to expand further
            sandbox_base = raw_workdir

        else:
            jm_url = rs.Url(resource_cfg['job_manager_endpoint'])
            parts  = jm_url.schema.split('+')

            if 'ssh' in parts:
                jm_url.schema = 'ssh'
            elif 'gsissh' in parts:
                jm_url.schema = 'gsissh'
            elif 'fork' in parts:
                jm_url.schema = 'fork'
            elif len(parts) == 1:
                # For local access to queueing systems use fork
                jm_url.schema = 'fork'
            else:
                raise Exception("unsupported access schema: %s"
                                % jm_url.schema)

            self._log.debug("rsup.PTYShell('%s')", jm_url)
            pty = rsup.PTYShell(jm_url, self)

            exit_code, stdout, _ = pty.run_sync(' echo "WORKDIR: %s"'
                                                % raw_workdir)
            if exit_code != 0 or 'WORKDIR:' not in stdout:
                raise RuntimeError("Couldn't get remote working directory.")

            sandbox_base = stdout.split(":")[1].strip()
            self._log.debug("sandbox base %s: '%s'", jm_url, sandbox_base)

        # at this point we have determined the remote 'pwd' -- the global
        # sandbox is relative to it.
        fs_url.path = "%s/radical.pilot.sandbox" % sandbox_base

        # before returning, keep the URL string in cache
        self._cache['resource_sandbox'][resource] = fs_url
        return self._cache['resource_sandbox'][resource]
def _prepare_pilot(self, resource, rcfg, pilot):
    """
    Prepare everything needed to submit one pilot to `resource`:

    - assemble the bootstrapper command line arguments,
    - build the agent configuration dict and dump it to a temp json file,
    - collect the file staging directives ('ft') for the bootstrapper,
      sdists, CA bundle and config file,
    - construct the SAGA job description ('jd') for the pilot job.

    Returns a dict {'ft': [staging directives], 'jd': rs.job.Description}.
    Raises ValueError / RuntimeError / TypeError on invalid or incomplete
    pilot descriptions and resource configurations.
    """

    pid = pilot["uid"]
    ret = {'ft': list(), 'jd': None}

    # ----------------------------------------------------------------------
    # Database connection parameters
    sid = self._session.uid
    database_url = self._session.dburl

    # some default values are determined at runtime
    default_virtenv = '%%(resource_sandbox)s/ve.%s.%s' % \
                      (resource, self._rp_version)

    # ----------------------------------------------------------------------
    # pilot description and resource configuration
    number_cores    = pilot['description']['cores']
    number_gpus     = pilot['description']['gpus']
    runtime         = pilot['description']['runtime']
    queue           = pilot['description']['queue']
    project         = pilot['description']['project']
    cleanup         = pilot['description']['cleanup']
    memory          = pilot['description']['memory']
    candidate_hosts = pilot['description']['candidate_hosts']

    # ----------------------------------------------------------------------
    # get parameters from resource cfg, set defaults where needed
    agent_launch_method     = rcfg.get('agent_launch_method')
    agent_dburl             = rcfg.get('agent_mongodb_endpoint', database_url)
    agent_spawner           = rcfg.get('agent_spawner', DEFAULT_AGENT_SPAWNER)
    rc_agent_config         = rcfg.get('agent_config', DEFAULT_AGENT_CONFIG)
    agent_scheduler         = rcfg.get('agent_scheduler')
    tunnel_bind_device      = rcfg.get('tunnel_bind_device')
    default_queue           = rcfg.get('default_queue')
    forward_tunnel_endpoint = rcfg.get('forward_tunnel_endpoint')
    lrms                    = rcfg.get('lrms')
    mpi_launch_method       = rcfg.get('mpi_launch_method', '')
    pre_bootstrap_0         = rcfg.get('pre_bootstrap_0', [])
    pre_bootstrap_1         = rcfg.get('pre_bootstrap_1', [])
    python_interpreter      = rcfg.get('python_interpreter')
    task_launch_method      = rcfg.get('task_launch_method')
    rp_version              = rcfg.get('rp_version', DEFAULT_RP_VERSION)
    virtenv_mode            = rcfg.get('virtenv_mode', DEFAULT_VIRTENV_MODE)
    virtenv                 = rcfg.get('virtenv', default_virtenv)
    cores_per_node          = rcfg.get('cores_per_node', 0)
    gpus_per_node           = rcfg.get('gpus_per_node', 0)
    lfs_path_per_node       = rcfg.get('lfs_path_per_node', None)
    lfs_size_per_node       = rcfg.get('lfs_size_per_node', 0)
    python_dist             = rcfg.get('python_dist')
    virtenv_dist            = rcfg.get('virtenv_dist', DEFAULT_VIRTENV_DIST)
    cu_tmp                  = rcfg.get('cu_tmp')
    spmd_variation          = rcfg.get('spmd_variation')
    shared_filesystem       = rcfg.get('shared_filesystem', True)
    stage_cacerts           = rcfg.get('stage_cacerts', False)
    cu_pre_exec             = rcfg.get('cu_pre_exec')
    cu_post_exec            = rcfg.get('cu_post_exec')
    export_to_cu            = rcfg.get('export_to_cu')
    mandatory_args          = rcfg.get('mandatory_args', [])
    saga_jd_supplement      = rcfg.get('saga_jd_supplement', {})

    import pprint
    self._log.debug(cores_per_node)
    self._log.debug(pprint.pformat(rcfg))

    # make sure that mandatory args are known
    for ma in mandatory_args:
        if pilot['description'].get(ma) is None:
            raise ValueError('attribute "%s" is required for "%s"'
                             % (ma, resource))

    # get pilot and global sandbox
    resource_sandbox = self._session._get_resource_sandbox(pilot).path
    session_sandbox  = self._session._get_session_sandbox(pilot).path
    pilot_sandbox    = self._session._get_pilot_sandbox(pilot).path

    pilot['resource_sandbox'] = str(self._session._get_resource_sandbox(pilot))
    pilot['pilot_sandbox']    = str(self._session._get_pilot_sandbox(pilot))
    pilot['client_sandbox']   = str(self._session._get_client_sandbox())

    # Agent configuration that is not part of the public API.
    # The agent config can either be a config dict, or
    # a string pointing to a configuration name.  If neither
    # is given, check if 'RADICAL_PILOT_AGENT_CONFIG' is
    # set.  The last fallback is 'agent_default'
    agent_config = pilot['description'].get('_config')
    if not agent_config:
        agent_config = os.environ.get('RADICAL_PILOT_AGENT_CONFIG')
    if not agent_config:
        agent_config = rc_agent_config

    if isinstance(agent_config, dict):
        # use dict as is
        agent_cfg = agent_config

    elif isinstance(agent_config, basestring):
        try:
            # interpret as a config name
            agent_cfg_file = os.path.join(self._conf_dir,
                                          "agent_%s.json" % agent_config)
            self._log.info("Read agent config file: %s", agent_cfg_file)
            agent_cfg = ru.read_json(agent_cfg_file)

            # allow for user level overload
            user_cfg_file = '%s/.radical/pilot/config/%s' \
                          % (os.environ['HOME'],
                             os.path.basename(agent_cfg_file))

            if os.path.exists(user_cfg_file):
                self._log.info("merging user config: %s" % user_cfg_file)
                user_cfg = ru.read_json(user_cfg_file)
                ru.dict_merge(agent_cfg, user_cfg, policy='overwrite')

        except Exception as e:
            self._log.exception("Error reading agent config file: %s" % e)
            raise

    else:
        # we can't handle this type
        raise TypeError('agent config must be string (config name) or dict')

    # expand variables in virtenv string
    virtenv = virtenv % {'pilot_sandbox'   : pilot_sandbox,
                         'session_sandbox' : session_sandbox,
                         'resource_sandbox': resource_sandbox}

    # Check for deprecated global_virtenv
    if 'global_virtenv' in rcfg:
        raise RuntimeError("'global_virtenv' is deprecated (%s)" % resource)

    # Create a host:port string for use by the bootstrap_0.
    db_url = rs.Url(agent_dburl)
    if db_url.port:
        db_hostport = "%s:%d" % (db_url.host, db_url.port)
    else:
        db_hostport = "%s:%d" % (db_url.host, 27017)  # mongodb default

    # ----------------------------------------------------------------------
    # the version of the agent is derived from rp_version, which has the
    # following format and interpretation:
    #
    # case rp_version:
    #   @<token>:
    #   @tag/@branch/@commit: # no sdist staging
    #       git clone $github_base radical.pilot.src
    #       (cd radical.pilot.src && git checkout token)
    #       pip install -t $VIRTENV/rp_install/ radical.pilot.src
    #       rm -rf radical.pilot.src
    #       export PYTHONPATH=$VIRTENV/rp_install:$PYTHONPATH
    #
    #   release: # no sdist staging
    #       pip install -t $VIRTENV/rp_install radical.pilot
    #       export PYTHONPATH=$VIRTENV/rp_install:$PYTHONPATH
    #
    #   local: # needs sdist staging
    #       tar zxf $sdist.tgz
    #       pip install -t $VIRTENV/rp_install $sdist/
    #       export PYTHONPATH=$VIRTENV/rp_install:$PYTHONPATH
    #
    #   debug: # needs sdist staging
    #       tar zxf $sdist.tgz
    #       pip install -t $SANDBOX/rp_install $sdist/
    #       export PYTHONPATH=$SANDBOX/rp_install:$PYTHONPATH
    #
    #   installed: # no sdist staging
    #       true
    # esac
    #
    # virtenv_mode
    #   private : error  if ve exists, otherwise create, then use
    #   update  : update if ve exists, otherwise create, then use
    #   create  : use    if ve exists, otherwise create, then use
    #   use     : use    if ve exists, otherwise error,  then exit
    #   recreate: delete if ve exists, otherwise create, then use
    #
    # examples:
    #   [email protected]
    #   virtenv@devel
    #   virtenv@release
    #   virtenv@installed
    #   stage@local
    #   stage@/tmp/my_agent.py
    #
    # Note that some combinations may be invalid, specifically in the
    # context of virtenv_mode.  If, for example, virtenv_mode is 'use', then
    # the 'virtenv:tag' will not make sense, as the virtenv is not updated.
    # In those cases, the virtenv_mode is honored, and a warning is printed.
    #
    # Also, the 'stage' mode can only be combined with the 'local' source,
    # or with a path to the agent (relative to root_dir, or absolute).
    #
    # A rp_version which does not adhere to the above syntax is ignored, and
    # the fallback stage@local is used.
    if not rp_version.startswith('@') and \
       rp_version not in ['installed', 'local', 'debug', 'release']:
        raise ValueError("invalid rp_version '%s'" % rp_version)

    if rp_version.startswith('@'):
        rp_version = rp_version[1:]  # strip '@'

    # ----------------------------------------------------------------------
    # sanity checks
    if not python_dist:
        raise RuntimeError("missing python distribution")
    if not virtenv_dist:
        raise RuntimeError("missing virtualenv distribution")
    if not agent_spawner:
        raise RuntimeError("missing agent spawner")
    if not agent_scheduler:
        raise RuntimeError("missing agent scheduler")
    if not lrms:
        raise RuntimeError("missing LRMS")
    if not agent_launch_method:
        # FIX: error message previously read 'agentlaunch method'
        raise RuntimeError("missing agent launch method")
    if not task_launch_method:
        raise RuntimeError("missing task launch method")

    # massage some values
    if not queue:
        queue = default_queue

    if cleanup and isinstance(cleanup, bool):
        # l : log files
        # u : unit work dirs
        # v : virtualenv
        # e : everything (== pilot sandbox)
        if shared_filesystem:
            cleanup = 'luve'
        else:
            # we cannot clean the sandbox from within the agent, as the hop
            # staging would then fail, and we'd get nothing back.
            # FIXME: cleanup needs to be done by the pmgr.launcher, or
            #        someone else, really, after fetching all logs and
            #        profiles.
            cleanup = 'luv'

        # we never cleanup virtenvs which are not private
        # FIX: was `virtenv_mode is not 'private'` -- string *identity*
        #      comparison, which only worked due to CPython interning
        if virtenv_mode != 'private':
            cleanup = cleanup.replace('v', '')

    # add dists to staging files, if needed
    if rp_version in ['local', 'debug']:
        sdist_names = [ru.sdist_name, rs.sdist_name, self._rp_sdist_name]
        sdist_paths = [ru.sdist_path, rs.sdist_path, self._rp_sdist_path]
    else:
        sdist_names = list()
        sdist_paths = list()

    # if cores_per_node is set (!= None), then we need to
    # allocation full nodes, and thus round up
    if cores_per_node:
        cores_per_node = int(cores_per_node)
        number_cores = int(cores_per_node
                       * math.ceil(float(number_cores) / cores_per_node))

    # if gpus_per_node is set (!= None), then we need to
    # allocation full nodes, and thus round up
    if gpus_per_node:
        gpus_per_node = int(gpus_per_node)
        number_gpus = int(gpus_per_node
                      * math.ceil(float(number_gpus) / gpus_per_node))

    # set mandatory args
    bootstrap_args  = ""
    bootstrap_args += " -d '%s'" % ':'.join(sdist_names)
    bootstrap_args += " -p '%s'" % pid
    bootstrap_args += " -s '%s'" % sid
    bootstrap_args += " -m '%s'" % virtenv_mode
    bootstrap_args += " -r '%s'" % rp_version
    bootstrap_args += " -b '%s'" % python_dist
    bootstrap_args += " -g '%s'" % virtenv_dist
    bootstrap_args += " -v '%s'" % virtenv
    bootstrap_args += " -y '%d'" % runtime

    # set optional args
    if lrms == "CCM":
        bootstrap_args += " -c"
    if forward_tunnel_endpoint:
        bootstrap_args += " -f '%s'" % forward_tunnel_endpoint
    if forward_tunnel_endpoint:
        bootstrap_args += " -h '%s'" % db_hostport
    if python_interpreter:
        bootstrap_args += " -i '%s'" % python_interpreter
    if tunnel_bind_device:
        bootstrap_args += " -t '%s'" % tunnel_bind_device
    if cleanup:
        bootstrap_args += " -x '%s'" % cleanup

    for arg in pre_bootstrap_0:
        bootstrap_args += " -e '%s'" % arg
    for arg in pre_bootstrap_1:
        bootstrap_args += " -w '%s'" % arg

    agent_cfg['owner']               = 'agent_0'
    agent_cfg['cores']               = number_cores
    agent_cfg['gpus']                = number_gpus
    agent_cfg['lrms']                = lrms
    agent_cfg['spawner']             = agent_spawner
    agent_cfg['scheduler']           = agent_scheduler
    agent_cfg['runtime']             = runtime
    agent_cfg['dburl']               = str(database_url)
    agent_cfg['session_id']          = sid
    agent_cfg['pilot_id']            = pid
    agent_cfg['logdir']              = '.'
    agent_cfg['pilot_sandbox']       = pilot_sandbox
    agent_cfg['session_sandbox']     = session_sandbox
    agent_cfg['resource_sandbox']    = resource_sandbox
    agent_cfg['agent_launch_method'] = agent_launch_method
    agent_cfg['task_launch_method']  = task_launch_method
    agent_cfg['mpi_launch_method']   = mpi_launch_method
    agent_cfg['cores_per_node']      = cores_per_node
    agent_cfg['gpus_per_node']       = gpus_per_node
    agent_cfg['lfs_path_per_node']   = lfs_path_per_node
    agent_cfg['lfs_size_per_node']   = lfs_size_per_node
    agent_cfg['cu_tmp']              = cu_tmp
    agent_cfg['export_to_cu']        = export_to_cu
    agent_cfg['cu_pre_exec']         = cu_pre_exec
    agent_cfg['cu_post_exec']        = cu_post_exec
    agent_cfg['resource_cfg']        = copy.deepcopy(rcfg)
    agent_cfg['debug']               = self._log.getEffectiveLevel()

    # we'll also push the agent config into MongoDB
    pilot['cfg'] = agent_cfg

    # ----------------------------------------------------------------------
    # Write agent config dict to a json file in pilot sandbox.
    agent_cfg_name = 'agent_0.cfg'
    cfg_tmp_handle, cfg_tmp_file = tempfile.mkstemp(prefix='rp.agent_cfg.')
    os.close(cfg_tmp_handle)  # file exists now

    # Convert dict to json file
    self._log.debug("Write agent cfg to '%s'.", cfg_tmp_file)
    self._log.debug(pprint.pformat(agent_cfg))
    ru.write_json(agent_cfg, cfg_tmp_file)

    ret['ft'].append({'src': cfg_tmp_file,
                      'tgt': '%s/%s' % (pilot_sandbox, agent_cfg_name),
                      'rem': True})  # purge the tmp file after packing

    # ----------------------------------------------------------------------
    # we also touch the log and profile tarballs in the target pilot sandbox
    ret['ft'].append({'src': '/dev/null',
                      'tgt': '%s/%s' % (pilot_sandbox, '%s.log.tgz' % pid),
                      'rem': False})  # don't remove /dev/null

    # only stage profiles if we profile
    if self._prof.enabled:
        ret['ft'].append({'src': '/dev/null',
                          'tgt': '%s/%s' % (pilot_sandbox,
                                            '%s.prof.tgz' % pid),
                          'rem': False})  # don't remove /dev/null

    # FIX: define cc_name unconditionally -- it is referenced further down
    # (file_transfer section) even when the sandbox staging below is skipped
    # because the resource sandbox is already cached.
    cc_name = 'cacert.pem.gz'

    # check if we have a sandbox cached for that resource.  If so, we have
    # nothing to do.  Otherwise we create the sandbox and stage the RP
    # stack etc.
    # NOTE: this will race when multiple pilot launcher instances are used!
    with self._cache_lock:

        if resource not in self._sandboxes:

            for sdist in sdist_paths:
                base = os.path.basename(sdist)
                ret['ft'].append({'src': sdist,
                                  'tgt': '%s/%s' % (session_sandbox, base),
                                  'rem': False})

            # Copy the bootstrap shell script.
            bootstrapper_path = os.path.abspath("%s/agent/%s"
                                % (self._root_dir, BOOTSTRAPPER_0))
            self._log.debug("use bootstrapper %s", bootstrapper_path)

            ret['ft'].append({'src': bootstrapper_path,
                              'tgt': '%s/%s' % (session_sandbox,
                                                BOOTSTRAPPER_0),
                              'rem': False})

            # Some machines cannot run pip due to outdated CA certs.
            # For those, we also stage an updated certificate bundle
            # TODO: use booleans all the way?
            if stage_cacerts:
                cc_path = os.path.abspath("%s/agent/%s"
                                          % (self._root_dir, cc_name))
                self._log.debug("use CAs %s", cc_path)

                ret['ft'].append({'src': cc_path,
                                  'tgt': '%s/%s' % (session_sandbox, cc_name),
                                  'rem': False})

            self._sandboxes[resource] = True

    # ----------------------------------------------------------------------
    # Create SAGA Job description and submit the pilot job
    jd = rs.job.Description()

    if shared_filesystem:
        bootstrap_tgt = '%s/%s' % (session_sandbox, BOOTSTRAPPER_0)
    else:
        bootstrap_tgt = '%s/%s' % ('.', BOOTSTRAPPER_0)

    jd.name                  = pid
    jd.executable            = "/bin/bash"
    jd.arguments             = ['-l %s' % bootstrap_tgt, bootstrap_args]
    jd.working_directory     = pilot_sandbox
    jd.project               = project
    jd.output                = "bootstrap_0.out"
    jd.error                 = "bootstrap_0.err"
    jd.total_cpu_count       = number_cores
    jd.total_gpu_count       = number_gpus
    jd.processes_per_host    = cores_per_node
    jd.spmd_variation        = spmd_variation
    jd.wall_time_limit       = runtime
    jd.total_physical_memory = memory
    jd.queue                 = queue
    jd.candidate_hosts       = candidate_hosts
    jd.environment           = dict()

    # we set any saga_jd_supplement keys which are not already set above
    for key, val in saga_jd_supplement.iteritems():
        if not jd[key]:
            self._log.debug('supplement %s: %s', key, val)
            jd[key] = val

    if 'RADICAL_PILOT_PROFILE' in os.environ:
        jd.environment['RADICAL_PILOT_PROFILE'] = 'TRUE'

    # for condor backends and the like which do not have shared FSs, we add
    # additional staging directives so that the backend system binds the
    # files from the session and pilot sandboxes to the pilot job.
    jd.file_transfer = list()
    if not shared_filesystem:

        jd.file_transfer.extend([
            'site:%s/%s > %s' % (session_sandbox, BOOTSTRAPPER_0,
                                 BOOTSTRAPPER_0),
            'site:%s/%s > %s' % (pilot_sandbox, agent_cfg_name,
                                 agent_cfg_name),
            'site:%s/%s.log.tgz > %s.log.tgz' % (pilot_sandbox, pid, pid),
            'site:%s/%s.log.tgz < %s.log.tgz' % (pilot_sandbox, pid, pid)
        ])

        if 'RADICAL_PILOT_PROFILE' in os.environ:
            jd.file_transfer.extend([
                'site:%s/%s.prof.tgz > %s.prof.tgz' % (pilot_sandbox, pid,
                                                       pid),
                'site:%s/%s.prof.tgz < %s.prof.tgz' % (pilot_sandbox, pid,
                                                       pid)
            ])

        for sdist in sdist_names:
            jd.file_transfer.extend([
                'site:%s/%s > %s' % (session_sandbox, sdist, sdist)])

        if stage_cacerts:
            jd.file_transfer.extend([
                'site:%s/%s > %s' % (session_sandbox, cc_name, cc_name)])

    self._log.debug("Bootstrap command line: %s %s",
                    jd.executable, jd.arguments)

    ret['jd'] = jd
    return ret
def fetch_profiles (sid, dburl=None, src=None, tgt=None, access=None,
                    session=None, skip_existing=False, fetch_client=False,
                    log=None):
    '''
    sid: session for which all profiles are fetched
    src: dir to look for client session profiles ($src/$sid/*.prof)
    tgt: dir to store the profile in
         - $tgt/$sid/*.prof,
         - $tgt/$sid/$pilot_id/*.prof)

    returns list of file names
    '''

    # FIX: the reporter `rep` was previously only assigned when `log` was
    # not passed in, causing a NameError at `rep.ok()` / `rep.error()` for
    # callers which supply their own logger.  Resolve logger and reporter
    # independently.
    if not log:
        if session:
            log = session._log
        else:
            log = ru.Logger('radical.pilot.utils')
    if session:
        rep = session._rep
    else:
        rep = ru.Reporter('radical.pilot.utils')

    ret = list()

    if not dburl:
        dburl = os.environ['RADICAL_PILOT_DBURL']

    if not dburl:
        raise ValueError('RADICAL_PILOT_DBURL is not set')

    if not src:
        src = os.getcwd()

    if not tgt:
        tgt = os.getcwd()

    if not tgt.startswith('/') and '://' not in tgt:
        tgt = "%s/%s" % (os.getcwd(), tgt)

    # we always create a session dir as real target
    tgt_url = rs.Url("%s/%s/" % (tgt, sid))

    # Turn URLs without schema://host into file://localhost,
    # so that they dont become interpreted as relative.
    if not tgt_url.schema:
        tgt_url.schema = 'file'
    if not tgt_url.host:
        tgt_url.host = 'localhost'

    # first fetch session profile
    if fetch_client:
        client_profiles = glob.glob("%s/%s/*.prof" % (src, sid))
        if not client_profiles:
            raise RuntimeError('no client profiles in %s/%s' % (src, sid))

        for client_profile in client_profiles:

            ftgt = rs.Url('%s/%s' % (tgt_url,
                                     os.path.basename(client_profile)))
            ret.append("%s" % ftgt.path)

            if skip_existing and os.path.isfile(ftgt.path) \
                    and os.stat(ftgt.path).st_size > 0:
                pass
            else:
                prof_file = rs.fs.File(client_profile, session=session)
                prof_file.copy(ftgt, flags=rs.fs.CREATE_PARENTS)
                prof_file.close()

            if not os.path.isfile(client_profile):
                # FIX: message typo ('profilefile')
                raise RuntimeError('client profile file %s does not exist'
                                   % client_profile)

    _, db, _, _, _ = ru.mongodb_connect (dburl)

    json_docs = get_session_docs(db, sid)

    pilots = json_docs['pilot']
    num_pilots = len(pilots)
    log.debug("Session: %s", sid)
    log.debug("Number of pilots in session: %d", num_pilots)

    for pilot in pilots:

        try:
            log.debug("processing pilot '%s'", pilot['uid'])

            sandbox_url = rs.Url(pilot['pilot_sandbox'])

            if access:
                # Allow to use a different access schema than used for the
                # run.  Useful if you ran from the headnode, but would like
                # to retrieve the profiles to your desktop (Hello Titan).
                access_url = rs.Url(access)
                sandbox_url.schema = access_url.schema
                sandbox_url.host   = access_url.host

            sandbox = rs.fs.Directory (sandbox_url, session=session)

            # Try to fetch a tarball of profiles, so that we can get them
            # all in one (SAGA) go!
            PROFILES_TARBALL = '%s.prof.tgz' % pilot['uid']
            tarball_available = False
            try:
                if sandbox.is_file(PROFILES_TARBALL) and \
                   sandbox.get_size(PROFILES_TARBALL):

                    log.info("profiles tarball exists")
                    ftgt = rs.Url('%s/%s' % (tgt_url, PROFILES_TARBALL))

                    if skip_existing and os.path.isfile(ftgt.path) \
                            and os.stat(ftgt.path).st_size > 0:
                        log.info("skip fetching of '%s/%s' to '%s'.",
                                 sandbox_url, PROFILES_TARBALL, tgt_url)
                        tarball_available = True
                    else:
                        log.info("fetch '%s%s' to '%s'.", sandbox_url,
                                 PROFILES_TARBALL, tgt_url)
                        prof_file = rs.fs.File(
                            "%s%s" % (sandbox_url, PROFILES_TARBALL),
                            session=session)
                        prof_file.copy(ftgt, flags=rs.fs.CREATE_PARENTS)
                        prof_file.close()
                        tarball_available = True
                else:
                    # FIX: message typo ('doesnt exists')
                    log.warn("profiles tarball does not exist!")

            except rs.DoesNotExist:
                log.exception(
                    "exception(TODO): profiles tarball does not exist!")

            try:
                os.mkdir("%s/%s" % (tgt_url.path, pilot['uid']))
            except OSError:
                pass  # dir exists already

            # We now have a local tarball
            if tarball_available:
                log.info("Extract tarball %s to '%s'.",
                         ftgt.path, tgt_url.path)
                try:
                    # FIX: use a context manager so the tarball handle is
                    # always closed (was leaked before)
                    with tarfile.open(ftgt.path, mode='r:gz') as tarball:
                        tarball.extractall("%s/%s"
                                           % (tgt_url.path, pilot['uid']))

                    profiles = glob.glob("%s/%s/*.prof"
                                         % (tgt_url.path, pilot['uid']))
                    ret.extend(profiles)
                    os.unlink(ftgt.path)

                    # If extract succeeded, no need to fetch individual
                    # profiles
                    rep.ok("+ %s (profiles)\n" % pilot['uid'])
                    continue

                except Exception as e:
                    log.warn('could not extract tarball %s [%s]',
                             ftgt.path, e)

            # If we dont have a tarball (for whichever reason), fetch
            # individual profiles
            profiles = sandbox.list('*.prof')
            for prof in profiles:

                ftgt = rs.Url('%s/%s/%s' % (tgt_url, pilot['uid'], prof))
                ret.append("%s" % ftgt.path)

                if skip_existing and os.path.isfile(ftgt.path) \
                        and os.stat(ftgt.path).st_size > 0:
                    pass
                else:
                    prof_file = rs.fs.File("%s%s" % (sandbox_url, prof),
                                           session=session)
                    prof_file.copy(ftgt, flags=rs.fs.CREATE_PARENTS)
                    prof_file.close()

            rep.ok("+ %s (profiles)\n" % pilot['uid'])

        except Exception as e:
            rep.error("- %s (profiles)\n" % pilot['uid'])
            # FIX: message typo ('fet' -> 'fetch')
            log.exception('failed to fetch profile for %s', pilot['uid'])

    return ret
def run(self):
    """Starts the process when Process.start() is called.

    Worker loop of the output-file-transfer controller: repeatedly polls
    MongoDB for a ComputeUnit of this unit manager in state
    PENDING_OUTPUT_STAGING with control 'agent', atomically claims it
    (control -> 'umgr', state -> STAGING_OUTPUT) via find_and_modify,
    executes its 'FTW_Output_Directives' with SAGA file copies, and then
    advances the unit to DONE (or FAILED on error).  Exits when the stop
    or session-terminate events are set.
    """

    # make sure to catch sys.exit (which raises SystemExit)
    try:
        # Try to connect to the database and create a tailable cursor.
        try:
            db = self._session.get_db()
            um_col = db["%s.cu" % self._session.uid]
            logger.debug(
                "Connected to MongoDB. Serving requests for UnitManager %s."
                % self.unit_manager_id)
        except Exception as e:
            # no DB -> nothing to do; end the worker silently
            logger.exception("Connection error: %s" % e)
            return

        while not self._stop.is_set() and \
                not self._session._terminate.is_set():

            # See if we can find a ComputeUnit that is waiting for client
            # output file transfer.
            # FIXME: this method is not bulkable. See agent pulling for
            #        units for an approach to split the call into two
            #        bulkable ones.
            ts = timestamp()
            # find_and_modify makes the claim atomic: only one controller
            # can flip control 'agent' -> 'umgr' for a given unit
            compute_unit = um_col.find_and_modify(
                query={"unitmanager": self.unit_manager_id,
                       "state": PENDING_OUTPUT_STAGING,
                       "control": 'agent'},
                update={"$set": {"state": STAGING_OUTPUT,
                                 "control": 'umgr'},
                        "$push": {"statehistory": {"state": STAGING_OUTPUT,
                                                   "timestamp": ts}}})

            if compute_unit is None:
                # Sleep a bit if no new units are available.
                time.sleep(IDLE_TIME)
            else:
                logger.info("OFTW CU found, progressing ...")
                state = STAGING_OUTPUT
                compute_unit_id = None
                try:
                    log_messages = []

                    # We have found a new CU. Now we can process the transfer
                    # directive(s) with SAGA.
                    compute_unit_id = str(compute_unit["_id"])
                    self._session.prof.prof('advance', uid=compute_unit_id,
                                            msg=STAGING_OUTPUT,
                                            state=STAGING_OUTPUT)
                    logger.debug(
                        "OutputStagingController: unit found: %s"
                        % compute_unit_id)
                    remote_sandbox = compute_unit["sandbox"]
                    output_staging = compute_unit.get(
                        "FTW_Output_Directives", [])

                    logger.info(
                        "OutputStagingController: Processing output file transfers for ComputeUnit %s"
                        % compute_unit_id)
                    # Loop over all staging directives and execute them.
                    for sd in output_staging:
                        logger.debug(
                            "OutputStagingController: sd: %s : %s"
                            % (compute_unit_id, sd))

                        # Check if there was a cancel request for this CU
                        # TODO: Can't these cancel requests come from a
                        #       central place?
                        state_doc = um_col.find_one(
                            {"_id": compute_unit_id}, fields=["state"])
                        if state_doc['state'] == CANCELED:
                            logger.info(
                                "Compute Unit Canceled, interrupting output file transfers."
                            )
                            self._session.prof.prof('advance',
                                                    uid=compute_unit_id,
                                                    msg=CANCELED,
                                                    state=CANCELED)
                            state = CANCELED
                            # Break out of the loop over all SD's, into the
                            # loop over CUs
                            break

                        abs_src = "%s/%s" % (remote_sandbox, sd['source'])

                        # bare filename targets land in the CWD; paths are
                        # resolved to absolute local paths
                        if os.path.basename(sd['target']) == sd['target']:
                            abs_target = "file://localhost%s" % os.path.join(
                                os.getcwd(), sd['target'])
                        else:
                            abs_target = "file://localhost%s" % os.path.abspath(
                                sd['target'])

                        log_msg = "Transferring output file %s -> %s" % (
                            abs_src, abs_target)
                        log_messages.append(log_msg)
                        logger.debug(log_msg)

                        output_file = saga.filesystem.File(
                            saga.Url(abs_src), session=self._session)

                        if CREATE_PARENTS in sd['flags']:
                            copy_flags = saga.filesystem.CREATE_PARENTS
                        else:
                            copy_flags = 0

                        try:
                            output_file.copy(saga.Url(abs_target),
                                             flags=copy_flags)
                            output_file.close()
                        except Exception as e:
                            logger.exception(e)
                            raise Exception("copy failed(%s)" % e.message)

                    # If the CU was canceled we can skip the remainder of
                    # this loop, and return to the CU loop
                    if state == CANCELED:
                        continue

                    # Update the CU's state to 'DONE'.
                    ts = timestamp()
                    log_message = "Output transfer completed."
                    um_col.update({'_id': compute_unit_id}, {
                        '$set': {'state': DONE},
                        '$push': {'statehistory': {'state': DONE,
                                                   'timestamp': ts},
                                  'log': {'message': log_message,
                                          'timestamp': ts}}})
                    self._session.prof.prof('advance', uid=compute_unit_id,
                                            msg=DONE, state=DONE)

                except Exception as e:
                    # Update the CU's state to 'FAILED'.
                    ts = timestamp()
                    log_message = "Output transfer failed: %s" % e
                    um_col.update({'_id': compute_unit_id}, {
                        '$set': {'state': FAILED},
                        '$push': {'statehistory': {'state': FAILED,
                                                   'timestamp': ts},
                                  'log': {'message': log_message,
                                          'timestamp': ts}}})
                    logger.exception(log_message)
                    self._session.prof.prof('advance', uid=compute_unit_id,
                                            msg=FAILED, state=FAILED)
                    # NOTE(review): this re-raise propagates past the outer
                    # try (which only catches SystemExit) and terminates the
                    # worker thread after the first failed unit -- confirm
                    # this is intended
                    raise

    except SystemExit as e:
        logger.exception(
            "output file transfer thread caught system exit -- forcing application shutdown"
        )
        thread.interrupt_main()
def execute_pattern(self, pattern, resource): pattern_start_time = datetime.datetime.now() def get_input_data(kernel, instance=None, iteration=None, ktype=None): # INPUT DATA: ip_list = [] #------------------------------------------------------------------------------------------------------------------ # upload_input_data data_in = [] if kernel._kernel._upload_input_data is not None: if isinstance(kernel._kernel._upload_input_data, list): pass else: kernel._kernel._upload_input_data = [ kernel._kernel._upload_input_data ] for i in range(0, len(kernel._kernel._upload_input_data)): if (ktype == 'simulation' or ktype == 'analysis'): var = resolve_placeholder_vars( working_dirs=self.working_dirs, path=kernel._kernel._upload_input_data[i], instance=instance, iteration=iteration, type=ktype) else: var = resolve_placeholder_vars( working_dirs=self.working_dirs, path=kernel._kernel._upload_input_data[i]) if len(var.split('>')) > 1: temp = { 'source': var.split('>')[0].strip(), 'target': var.split('>')[1].strip() } else: temp = { 'source': var.split('>')[0].strip(), 'target': os.path.basename(var.split('>')[0].strip()) } data_in.append(temp) if ip_list is None: ip_list = data_in else: ip_list += data_in #------------------------------------------------------------------------------------------------------------------ #------------------------------------------------------------------------------------------------------------------ # link_input_data data_in = [] if kernel._kernel._link_input_data is not None: if isinstance(kernel._kernel._link_input_data, list): pass else: kernel._kernel._link_input_data = [ kernel._kernel._link_input_data ] for i in range(0, len(kernel._kernel._link_input_data)): if (ktype == 'simulation' or ktype == 'analysis'): var = resolve_placeholder_vars( working_dirs=self.working_dirs, path=kernel._kernel._link_input_data[i], instance=instance, iteration=iteration, type=ktype) else: var = resolve_placeholder_vars( working_dirs=self.working_dirs, 
path=kernel._kernel._link_input_data[i]) if len(var.split('>')) > 1: temp = { 'source': var.split('>')[0].strip(), 'target': var.split('>')[1].strip(), 'action': radical.pilot.LINK } else: temp = { 'source': var.split('>')[0].strip(), 'target': os.path.basename(var.split('>')[0].strip()), 'action': radical.pilot.LINK } data_in.append(temp) if ip_list is None: ip_list = data_in else: ip_list += data_in #------------------------------------------------------------------------------------------------------------------ #------------------------------------------------------------------------------------------------------------------ # copy_input_data data_in = [] if kernel._kernel._copy_input_data is not None: if isinstance(kernel._kernel._copy_input_data, list): pass else: kernel._kernel._copy_input_data = [ kernel._kernel._copy_input_data ] for i in range(0, len(kernel._kernel._copy_input_data)): if (ktype == 'simulation' or ktype == 'analysis'): var = resolve_placeholder_vars( working_dirs=self.working_dirs, path=kernel._kernel._copy_input_data[i], instance=instance, iteration=iteration, type=ktype) else: var = resolve_placeholder_vars( working_dirs=self.working_dirs, path=kernel._kernel._copy_input_data[i]) if len(var.split('>')) > 1: temp = { 'source': var.split('>')[0].strip(), 'target': var.split('>')[1].strip(), 'action': radical.pilot.COPY } else: temp = { 'source': var.split('>')[0].strip(), 'target': os.path.basename(var.split('>')[0].strip()), 'action': radical.pilot.COPY } data_in.append(temp) if ip_list is None: ip_list = data_in else: ip_list += data_in #------------------------------------------------------------------------------------------------------------------ #------------------------------------------------------------------------------------------------------------------ # download input data if kernel.download_input_data is not None: data_in = kernel.download_input_data if ip_list is None: ip_list = data_in else: ip_list += data_in 
#------------------------------------------------------------------------------------------------------------------ return ip_list #------------------------------------------------------------------------------------------------------------------ def get_output_data(kernel, instance=None, iteration=None, ktype=None): # OUTPUT DATA: op_list = [] #------------------------------------------------------------------------------------------------------------------ # copy_output_data data_out = [] if kernel._kernel._copy_output_data is not None: if isinstance(kernel._kernel._copy_output_data, list): pass else: kernel._kernel._copy_output_data = [ kernel._kernel._copy_output_data ] for i in range(0, len(kernel._kernel._copy_output_data)): if (ktype == 'simulation' or ktype == 'analysis'): var = resolve_placeholder_vars( working_dirs=self.working_dirs, path=kernel._kernel._copy_output_data[i], instance=instance, iteration=iteration, type=ktype) else: var = resolve_placeholder_vars( working_dirs=self.working_dirs, path=kernel._kernel._copy_output_data[i]) if len(var.split('>')) > 1: temp = { 'source': var.split('>')[0].strip(), 'target': var.split('>')[1].strip(), 'action': radical.pilot.COPY } else: temp = { 'source': var.split('>')[0].strip(), 'target': os.path.basename(var.split('>')[0].strip()), 'action': radical.pilot.COPY } data_out.append(temp) if op_list is None: op_list = data_out else: op_list += data_out #----------------------------------------------------------------------------------------------------------------- #------------------------------------------------------------------------------------------------------------------ # download_output_data data_out = [] if kernel._kernel._download_output_data is not None: if isinstance(kernel._kernel._download_output_data, list): pass else: kernel._kernel._download_output_data = [ kernel._kernel._download_output_data ] for i in range(0, len(kernel._kernel._download_output_data)): if (ktype == 'simulation' or ktype == 
'analysis'): var = resolve_placeholder_vars( working_dirs=self.working_dirs, path=kernel._kernel._download_output_data[i], instance=instance, iteration=iteration, type=ktype) else: var = resolve_placeholder_vars( working_dirs=self.working_dirs, path=kernel._kernel._download_output_data[i]) if len(var.split('>')) > 1: temp = { 'source': var.split('>')[0].strip(), 'target': var.split('>')[1].strip() } else: temp = { 'source': var.split('>')[0].strip(), 'target': os.path.basename(var.split('>')[0].strip()) } data_out.append(temp) if op_list is None: op_list = data_out else: op_list += data_out #------------------------------------------------------------------------------------------------------------------ return op_list #------------------------------------------------------------------------------------------------------------------ #----------------------------------------------------------------------- # def unit_state_cb(unit, state): if state == radical.pilot.FAILED: self.get_logger().error( "ComputeUnit error: STDERR: {0}, STDOUT: {0}".format( unit.stderr, unit.stdout)) self.get_logger().error("Pattern execution FAILED.") sys.exit(1) #----------------------------------------------------------------------- # def create_filecheck_command(files_list): command_list = [] for f in files_list: command = 'if [ -f "{0}" ]; then exit 0; else echo "File {0} does not exist" >&2; exit 1; fi;'.format( f) command_list.append(command) return command_list self._reporter.ok('>>ok') self.get_logger().info( "Executing simulation-analysis loop with {0} iterations on {1} allocated core(s) on '{2}'" .format(pattern.iterations, resource._cores, resource._resource_key)) self._reporter.header( "Executing simulation-analysis loop with {0} iterations on {1} allocated core(s) on '{2}'" .format(pattern.iterations, resource._cores, resource._resource_key)) all_cus = [] #print resource._pilot.description['cores'] self.get_logger().info("Waiting for pilot on {0} to go Active".format( 
resource._resource_key)) self._reporter.info("Job waiting on queue...".format( resource._resource_key)) resource._pmgr.wait_pilots(resource._pilot.uid, 'Active') self._reporter.ok("\nJob is now running !".format( resource._resource_key)) profiling = int(os.environ.get('RADICAL_ENMD_PROFILING', 0)) if profiling == 1: from collections import OrderedDict as od pattern._execution_profile = [] enmd_overhead_dict = od() cu_dict = od() try: start_now = datetime.datetime.now() resource._umgr.register_callback(unit_state_cb) ######################################################################## # execute pre_loop # ################################################################ # EXECUTE PRE-LOOP if profiling == 1: probe_preloop_start = datetime.datetime.now() enmd_overhead_dict['preloop'] = od() enmd_overhead_dict['preloop'][ 'start_time'] = probe_preloop_start pre_loop = pattern.pre_loop() if pre_loop is not None: pre_loop._bind_to_resource(resource._resource_key) cud = radical.pilot.ComputeUnitDescription() cud.name = "pre_loop" cud.pre_exec = pre_loop._cu_def_pre_exec cud.executable = pre_loop._cu_def_executable cud.arguments = pre_loop.arguments cud.mpi = pre_loop.uses_mpi cud.input_staging = get_input_data(kernel=pre_loop) cud.output_staging = get_output_data(kernel=pre_loop) if pre_loop.exists_remote is not None: cud.post_exec = create_filecheck_command( pre_loop.exists_remote) self.get_logger().debug("Created pre_loop CU: {0}.".format( cud.as_dict())) self.get_logger().info( "Submitted ComputeUnit(s) for pre_loop step.") self._reporter.info("\nWaiting for pre_loop step to complete.") if profiling == 1: probe_preloop_wait = datetime.datetime.now() enmd_overhead_dict['preloop'][ 'wait_time'] = probe_preloop_wait unit = resource._umgr.submit_units(cud) all_cus.append(unit) resource._umgr.wait_units(unit.uid) if profiling == 1: probe_preloop_res = datetime.datetime.now() enmd_overhead_dict['preloop'][ 'res_time'] = probe_preloop_res self.get_logger().info("Pre_loop 
completed.") if unit.state != radical.pilot.DONE: raise EnsemblemdError( "Pre-loop CU failed with error: {0}".format( unit.stdout)) self.working_dirs["pre_loop"] = saga.Url( unit.working_directory).path # Process CU information and append it to the dictionary if profiling == 1: probe_preloop_done = datetime.datetime.now() enmd_overhead_dict['preloop'][ 'stop_time'] = probe_preloop_done cu_dict['pre_loop'] = unit self._reporter.ok('>> done') else: self.get_logger().info("No pre_loop stage.") ######################################################################## # execute simulation analysis loop # for iteration in range(1, pattern.iterations + 1): self.working_dirs['iteration_{0}'.format(iteration)] = {} ################################################################ # EXECUTE SIMULATION STEPS if profiling == 1: enmd_overhead_dict['iter_{0}'.format(iteration)] = od() cu_dict['iter_{0}'.format(iteration)] = od() if isinstance( pattern.simulation_step(iteration=iteration, instance=1), list): num_sim_kerns = len( pattern.simulation_step(iteration=iteration, instance=1)) else: num_sim_kerns = 1 #print num_sim_kerns all_sim_cus = [] if profiling == 1: enmd_overhead_dict['iter_{0}'.format( iteration)]['sim'] = od() cu_dict['iter_{0}'.format(iteration)]['sim'] = list() for kern_step in range(0, num_sim_kerns): if profiling == 1: probe_sim_start = datetime.datetime.now() enmd_overhead_dict['iter_{0}'.format(iteration)][ 'sim']['kernel_{0}'.format(kern_step)] = od() enmd_overhead_dict['iter_{0}'.format( iteration)]['sim']['kernel_{0}'.format( kern_step)]['start_time'] = probe_sim_start s_units = [] for s_instance in range(1, pattern._simulation_instances + 1): if isinstance( pattern.simulation_step(iteration=iteration, instance=s_instance), list): sim_step = pattern.simulation_step( iteration=iteration, instance=s_instance)[kern_step] else: sim_step = pattern.simulation_step( iteration=iteration, instance=s_instance) sim_step._bind_to_resource(resource._resource_key) # 
Resolve all placeholders #if sim_step.link_input_data is not None: # for i in range(len(sim_step.link_input_data)): # sim_step.link_input_data[i] = resolve_placeholder_vars(working_dirs, s_instance, iteration, pattern._simulation_instances, pattern._analysis_instances, "simulation", sim_step.link_input_data[i]) cud = radical.pilot.ComputeUnitDescription() cud.name = "sim ;{iteration} ;{instance}".format( iteration=iteration, instance=s_instance) cud.pre_exec = sim_step._cu_def_pre_exec cud.executable = sim_step._cu_def_executable cud.arguments = sim_step.arguments cud.mpi = sim_step.uses_mpi cud.input_staging = get_input_data(kernel=sim_step, instance=s_instance, iteration=iteration, ktype='simulation') cud.output_staging = get_output_data( kernel=sim_step, instance=s_instance, iteration=iteration, ktype='simulation') if sim_step.cores is not None: cud.cores = sim_step.cores if sim_step.exists_remote is not None: cud.post_exec = create_filecheck_command( sim_step.exists_remote) s_units.append(cud) if sim_step.get_instance_type == 'single': break self.get_logger().debug( "Created simulation CU: {0}.".format(cud.as_dict())) self.get_logger().info( "Submitted tasks for simulation iteration {0}.".format( iteration)) self.get_logger().info( "Waiting for {3} simulations in iteration {0}/ kernel {1}: {2} to complete." 
.format(iteration, kern_step + 1, sim_step.name, pattern._simulation_instances)) self._reporter.info( "\nIteration {0}: Waiting for {2} simulation tasks: {1} to complete" .format(iteration, sim_step.name, pattern._simulation_instances)) if profiling == 1: probe_sim_wait = datetime.datetime.now() enmd_overhead_dict['iter_{0}'.format( iteration)]['sim']['kernel_{0}'.format( kern_step)]['wait_time'] = probe_sim_wait s_cus = resource._umgr.submit_units(s_units) all_cus.extend(s_cus) all_sim_cus.extend(s_cus) uids = [cu.uid for cu in s_cus] resource._umgr.wait_units(uids) if profiling == 1: probe_sim_res = datetime.datetime.now() enmd_overhead_dict['iter_{0}'.format( iteration)]['sim']['kernel_{0}'.format( kern_step)]['res_time'] = probe_sim_res self.get_logger().info( "Simulations in iteration {0}/ kernel {1}: {2} completed." .format(iteration, kern_step + 1, sim_step.name)) failed_units = "" for unit in s_cus: if unit.state != radical.pilot.DONE: failed_units += " * Simulation task {0} failed with an error: {1}\n".format( unit.uid, unit.stderr) if profiling == 1: probe_sim_done = datetime.datetime.now() enmd_overhead_dict['iter_{0}'.format( iteration)]['sim']['kernel_{0}'.format( kern_step)]['stop_time'] = probe_sim_done self._reporter.ok('>> done') if profiling == 1: probe_post_sim_start = datetime.datetime.now() enmd_overhead_dict['iter_{0}'.format( iteration)]['sim']['post'] = od() enmd_overhead_dict['iter_{0}'.format(iteration)]['sim'][ 'post']['start_time'] = probe_post_sim_start # TODO: ensure working_dir <-> instance mapping i = 0 for cu in s_cus: i += 1 self.working_dirs['iteration_{0}'.format(iteration)][ 'simulation_{0}'.format(i)] = saga.Url( cu.working_directory).path if profiling == 1: probe_post_sim_end = datetime.datetime.now() enmd_overhead_dict['iter_{0}'.format(iteration)]['sim'][ 'post']['stop_time'] = probe_post_sim_end cu_dict['iter_{0}'.format(iteration)]['sim'] = all_sim_cus ################################################################ # 
EXECUTE ANALYSIS STEPS if isinstance( pattern.analysis_step(iteration=iteration, instance=1), list): num_ana_kerns = len( pattern.analysis_step(iteration=iteration, instance=1)) else: num_ana_kerns = 1 #print num_ana_kerns all_ana_cus = [] if profiling == 1: enmd_overhead_dict['iter_{0}'.format( iteration)]['ana'] = od() cu_dict['iter_{0}'.format(iteration)]['ana'] = list() for kern_step in range(0, num_ana_kerns): if profiling == 1: probe_ana_start = datetime.datetime.now() enmd_overhead_dict['iter_{0}'.format(iteration)][ 'ana']['kernel_{0}'.format(kern_step)] = od() enmd_overhead_dict['iter_{0}'.format( iteration)]['ana']['kernel_{0}'.format( kern_step)]['start_time'] = probe_ana_start a_units = [] for a_instance in range(1, pattern._analysis_instances + 1): if isinstance( pattern.analysis_step(iteration=iteration, instance=a_instance), list): ana_step = pattern.analysis_step( iteration=iteration, instance=a_instance)[kern_step] else: ana_step = pattern.analysis_step( iteration=iteration, instance=a_instance) ana_step._bind_to_resource(resource._resource_key) # Resolve all placeholders #if ana_step.link_input_data is not None: # for i in range(len(ana_step.link_input_data)): # ana_step.link_input_data[i] = resolve_placeholder_vars(working_dirs, a_instance, iteration, pattern._simulation_instances, pattern._analysis_instances, "analysis", ana_step.link_input_data[i]) cud = radical.pilot.ComputeUnitDescription() cud.name = "ana ; {iteration}; {instance}".format( iteration=iteration, instance=a_instance) cud.pre_exec = ana_step._cu_def_pre_exec cud.executable = ana_step._cu_def_executable cud.arguments = ana_step.arguments cud.mpi = ana_step.uses_mpi cud.input_staging = get_input_data(kernel=ana_step, instance=a_instance, iteration=iteration, ktype='analysis') cud.output_staging = get_output_data( kernel=ana_step, instance=a_instance, iteration=iteration, ktype='analysis') if ana_step.cores is not None: cud.cores = ana_step.cores if ana_step.exists_remote is not 
None: cud.post_exec = create_filecheck_command( ana_step.exists_remote) a_units.append(cud) if ana_step.get_instance_type == 'single': break self.get_logger().debug("Created analysis CU: {0}.".format( cud.as_dict())) self.get_logger().info( "Submitted tasks for analysis iteration {0}.".format( iteration)) self.get_logger().info( "Waiting for analysis tasks in iteration {0}/kernel {1}: {2} to complete." .format(iteration, kern_step + 1, ana_step.name)) self._reporter.info( "\nIteration {0}: Waiting for analysis tasks: {1} to complete" .format(iteration, ana_step.name)) if profiling == 1: probe_ana_wait = datetime.datetime.now() enmd_overhead_dict['iter_{0}'.format( iteration)]['ana']['kernel_{0}'.format( kern_step)]['wait_time'] = probe_ana_wait a_cus = resource._umgr.submit_units(a_units) all_cus.extend(a_cus) all_ana_cus.extend(a_cus) uids = [cu.uid for cu in a_cus] resource._umgr.wait_units(uids) if profiling == 1: probe_ana_res = datetime.datetime.now() enmd_overhead_dict['iter_{0}'.format( iteration)]['ana']['kernel_{0}'.format( kern_step)]['res_time'] = probe_ana_res self.get_logger().info( "Analysis in iteration {0}/kernel {1}: {2} completed.". 
format(iteration, kern_step + 1, ana_step.name)) failed_units = "" for unit in a_cus: if unit.state != radical.pilot.DONE: failed_units += " * Analysis task {0} failed with an error: {1}\n".format( unit.uid, unit.stderr) if profiling == 1: probe_ana_done = datetime.datetime.now() enmd_overhead_dict['iter_{0}'.format( iteration)]['ana']['kernel_{0}'.format( kern_step)]['stop_time'] = probe_ana_done self._reporter.ok('>> done') if profiling == 1: probe_post_ana_start = datetime.datetime.now() enmd_overhead_dict['iter_{0}'.format( iteration)]['ana']['post'] = od() enmd_overhead_dict['iter_{0}'.format(iteration)]['ana'][ 'post']['start_time'] = probe_post_ana_start if (pattern.adaptive_simulation == False): pass else: pattern._simulation_instances = pattern.get_new_simulation_instances( a_cus[0].stdout) i = 0 for cu in a_cus: i += 1 self.working_dirs['iteration_{0}'.format(iteration)][ 'analysis_{0}'.format(i)] = saga.Url( cu.working_directory).path if profiling == 1: probe_post_ana_end = datetime.datetime.now() enmd_overhead_dict['iter_{0}'.format(iteration)]['ana'][ 'post']['stop_time'] = probe_post_ana_end cu_dict['iter_{0}'.format(iteration)]['ana'] = all_ana_cus self._reporter.header('Pattern execution successfully finished') # ONLY PROFILING SECTION BELOW if profiling == 1: #Pattern overhead logging title = "iteration,step,kernel,probe,timestamp" f1 = open('enmd_pat_overhead.csv', 'w') f1.write(title + "\n\n") iter = 'None' step = 'pre_loop' kern = 'None' for key, val in enmd_overhead_dict['preloop'].items(): probe = key timestamp = val entry = '{0},{1},{2},{3},{4}\n'.format( iter, step, kern, probe, timestamp) f1.write(entry) iters = pattern.iterations for i in range(1, iters + 1): iter = 'iter_{0}'.format(i) for key1, val1 in enmd_overhead_dict[iter].items(): step = key1 for key2, val2 in val1.items(): kern = key2 for key3, val3 in val2.items(): probe = key3 timestamp = val3 entry = '{0},{1},{2},{3},{4}\n'.format( iter.split('_')[1], step, kern, probe, 
timestamp) f1.write(entry) f1.close() #CU data logging title = "uid, iter, step, Scheduling, StagingInput, AgentStagingInputPending, AgentStagingInput, AllocatingPending, Allocating, ExecutingPending, Executing, AgentStagingOutputPending, AgentStagingOutput, PendingOutputStaging, StagingOutput, Done" f2 = open( "execution_profile_{mysession}.csv".format( mysession=resource._session.uid), 'w') f2.write(title + "\n\n") iter = 'None' step = 'pre_loop' if step in cu_dict: cu = cu_dict['pre_loop'] st_data = {} for st in cu.state_history: st_dict = st.as_dict() st_data["{0}".format(st_dict["state"])] = {} st_data["{0}".format( st_dict["state"])] = st_dict["timestamp"] states = [ 'Scheduling,' 'StagingInput', 'AgentStagingInputPending', 'AgentStagingInput', 'AllocatingPending', 'Allocating', 'ExecutingPending', 'Executing', 'AgentStagingOutputPending', 'AgentStagingOutput', 'PendingOutputStaging', 'StagingOutput', 'Done' ] for state in states: if (state in st_data) is False: st_data[state] = None line = "{uid}, {iter}, {step}, {Scheduling}, {StagingInput}, {AgentStagingInputPending}, {AgentStagingInput}, {AllocatingPending}, {Allocating}, {ExecutingPending},{Executing}, {AgentStagingOutputPending}, {AgentStagingOutput}, {PendingOutputStaging}, {StagingOutput}, {Done}".format( uid=cu.uid, iter=0, step='pre_loop', Scheduling=(st_data['Scheduling']), StagingInput=(st_data['StagingInput']), AgentStagingInputPending=( st_data['AgentStagingInputPending']), AgentStagingInput=(st_data['AgentStagingInput']), AllocatingPending=(st_data['AllocatingPending']), Allocating=(st_data['Allocating']), ExecutingPending=(st_data['ExecutingPending']), Executing=(st_data['Executing']), AgentStagingOutputPending=( st_data['AgentStagingOutputPending']), AgentStagingOutput=(st_data['AgentStagingOutput']), PendingOutputStaging=(st_data['PendingOutputStaging']), StagingOutput=(st_data['StagingOutput']), Done=(st_data['Done'])) f2.write(line + '\n') else: print 'No pre_loop step in the pattern' for 
i in range(1, iters + 1): iter = 'iter_{0}'.format(i) for key, val in cu_dict[iter].items(): step = key cus = val if step == 'sim': for cu in cus: st_data = {} for st in cu.state_history: st_dict = st.as_dict() st_data["{0}".format( st_dict["state"])] = {} st_data["{0}".format( st_dict["state"] )] = st_dict["timestamp"] states = [ 'Scheduling,' 'StagingInput', 'AgentStagingInputPending', 'AgentStagingInput', 'AllocatingPending', 'Allocating', 'ExecutingPending', 'Executing', 'AgentStagingOutputPending', 'AgentStagingOutput', 'PendingOutputStaging', 'StagingOutput', 'Done' ] for state in states: if (state in st_data) is False: st_data[state] = None line = "{uid}, {iter}, {step}, {Scheduling}, {StagingInput}, {AgentStagingInputPending}, {AgentStagingInput}, {AllocatingPending}, {Allocating}, {ExecutingPending},{Executing}, {AgentStagingOutputPending}, {AgentStagingOutput}, {PendingOutputStaging}, {StagingOutput}, {Done}".format( uid=cu.uid, iter=iter.split('_')[1], step=step, Scheduling=(st_data['Scheduling']), StagingInput=(st_data['StagingInput']), AgentStagingInputPending=( st_data['AgentStagingInputPending']), AgentStagingInput=( st_data['AgentStagingInput']), AllocatingPending=( st_data['AllocatingPending']), Allocating=(st_data['Allocating']), ExecutingPending=( st_data['ExecutingPending']), Executing=(st_data['Executing']), AgentStagingOutputPending=( st_data['AgentStagingOutputPending']), AgentStagingOutput=( st_data['AgentStagingOutput']), PendingOutputStaging=( st_data['PendingOutputStaging']), StagingOutput=(st_data['StagingOutput']), Done=(st_data['Done'])) f2.write(line + '\n') elif step == 'ana': for cu in cus: st_data = {} for st in cu.state_history: st_dict = st.as_dict() st_data["{0}".format( st_dict["state"])] = {} st_data["{0}".format( st_dict["state"] )] = st_dict["timestamp"] states = [ 'Scheduling,' 'StagingInput', 'AgentStagingInputPending', 'AgentStagingInput', 'AllocatingPending', 'Allocating', 'ExecutingPending', 'Executing', 
'AgentStagingOutputPending', 'AgentStagingOutput', 'PendingOutputStaging', 'StagingOutput', 'Done' ] for state in states: if (state in st_data) is False: st_data[state] = None line = "{uid}, {iter}, {step}, {Scheduling}, {StagingInput}, {AgentStagingInputPending}, {AgentStagingInput}, {AllocatingPending}, {Allocating}, {ExecutingPending},{Executing}, {AgentStagingOutputPending}, {AgentStagingOutput}, {PendingOutputStaging}, {StagingOutput}, {Done}".format( uid=cu.uid, iter=iter.split('_')[1], step=step, Scheduling=(st_data['Scheduling']), StagingInput=(st_data['StagingInput']), AgentStagingInputPending=( st_data['AgentStagingInputPending']), AgentStagingInput=( st_data['AgentStagingInput']), AllocatingPending=( st_data['AllocatingPending']), Allocating=(st_data['Allocating']), ExecutingPending=( st_data['ExecutingPending']), Executing=(st_data['Executing']), AgentStagingOutputPending=( st_data['AgentStagingOutputPending']), AgentStagingOutput=( st_data['AgentStagingOutput']), PendingOutputStaging=( st_data['PendingOutputStaging']), StagingOutput=(st_data['StagingOutput']), Done=(st_data['Done'])) f2.write(line + '\n') f2.close() except KeyboardInterrupt: self._reporter.error('Execution interupted') traceback.print_exc()