def run(self):
    '''Broker the first job description over the known queues and submit it.

    On success the submitted ARC job object is stored in ``self.job``. If no
    target matches, or the submission does not succeed, an error is logged
    and this method returns without assigning ``self.job``.
    '''
    arc_logger = arc_utils.ARCLogger(baselogger, 0)
    log = arc_logger.log

    # Only the first job description is brokered and submitted
    description = self.jobdescs[0]
    random_broker = arc.Broker(self.userconfig, description, "Random")
    sorter = arc.ExecutionTargetSorter(random_broker)

    for candidate in self.queuelist:
        service_name = candidate.ComputingService.Name
        share_name = candidate.ComputingShare.Name
        log.debug("considering target {0}:{1}".format(service_name, share_name))
        # Adding an entity performs matchmaking and brokering
        sorter.addEntity(candidate)

    matching = sorter.getMatchingTargets()
    if len(matching) == 0:
        log.error("no clusters satisfied job description requirements")
        return

    # required to reset iterator, otherwise we get a seg fault
    sorter.reset()
    chosen = sorter.getCurrentTarget()

    # This object is filled in by the submitter with the submitted job
    submitted = arc.Job()
    job_submitter = arc.Submitter(self.userconfig)
    outcome = job_submitter.Submit(chosen, description, submitted)
    if outcome != arc.SubmissionStatus.NONE:
        log.error("Submission failed")
        return

    self.job = submitted
def acknowledge_events_files(self, workspec):
    '''Acknowledge received events/files by deleting the remote "read" copy.

    For every PandaID of the worker, the events-update file carrying the
    read suffix is deleted from the job's directory on the ARC CE. A failed
    deletion is logged as an error unless the file simply does not exist.

    Returns False when the proxy cannot be set up, otherwise None.
    '''
    # get logger
    arc_logger = arc_utils.ARCLogger(baselogger, workspec.workerID)
    log = arc_logger.log

    arc_job = workspec.workAttributes['arcjob']
    job_id = arc_job['JobID']

    # Set certificate to use for interacting with ARC CE
    cfg = arc.UserConfig(self.cred_type)
    proxy_ready = self._setup_proxy(cfg, workspec, job_id, log)
    if not proxy_ready:
        return False

    # Delete jobid/jsonEventsUpdateFileName.read
    for panda_id in workspec.pandaid_list:
        # The access point is looked up here but its value is not used below
        access_point = self.get_access_point(workspec, panda_id)
        remote_read_path = '%s/%s%s' % (job_id, jsonEventsUpdateFileName,
                                        suffixReadJson)
        outcome = self._delete_file(remote_read_path, cfg, log)
        if outcome:
            continue
        # A missing file is not an error: there was nothing to acknowledge
        if outcome.GetErrno() != errno.ENOENT:
            log.error('Failed deleting {0}: {1}'.format(
                remote_read_path, str(outcome)))

    log.debug('done')
    return
def feed_events(self, workspec, events_dict):
    '''Harvester has an event range to pass to the job.

    For one-to-one and multi-worker mappings the events are written as JSON
    under the worker's access point and uploaded to the job's directory on
    the ARC CE. After a successful upload the remote event request file is
    deleted. The local JSON file is removed at the end (best effort).

    :param workspec: worker specification
    :param events_dict: JSON-serialisable events to hand to the job
    :return: False if the proxy could not be set up, the local file could
             not be written or the upload failed; True otherwise
    '''
    # get logger
    arclog = arc_utils.ARCLogger(baselogger, workspec.workerID)
    tmpLog = arclog.log

    # Upload to jobid/jsonEventsFeedFileName, delete jobid/jsonEventsRequestFileName
    job = workspec.workAttributes['arcjob']
    arcid = job['JobID']

    # Set certificate to use for interacting with ARC CE
    usercfg = arc.UserConfig(self.cred_type)
    if not self._setup_proxy(usercfg, workspec, arcid, tmpLog):
        return False

    retVal = True
    if workspec.mapType in [WorkSpec.MT_OneToOne, WorkSpec.MT_MultiWorkers]:
        # put the json just under the access point then upload to ARC CE
        localJsonFilePath = os.path.join(workspec.get_access_point(),
                                         jsonEventsFeedFileName)
        tmpLog.debug('feeding events to {0}'.format(localJsonFilePath))
        try:
            with open(localJsonFilePath, 'w') as jsonFile:
                json.dump(events_dict, jsonFile)
        except Exception:
            core_utils.dump_error_message(tmpLog)
            retVal = False

        # Only upload when the local file was written completely. Uploading
        # after a failed dump would hand the job a truncated or stale file
        # and then delete the request it was answering.
        if retVal:
            remoteJsonFilePath = '%s/%s' % (arcid, jsonEventsFeedFileName)
            # Try to copy the file
            status = self._copy_file(localJsonFilePath, remoteJsonFilePath,
                                     usercfg, tmpLog)
            if not status:
                tmpLog.error('Failed to feed events to {0}: {1}'.format(
                    remoteJsonFilePath, str(status)))
                retVal = False
            else:
                # The request has been answered, so remove it from the CE
                remoteJsonEventsRequestFile = '%s/%s' % (
                    arcid, jsonEventsRequestFileName)
                status = self._delete_file(remoteJsonEventsRequestFile,
                                           usercfg, tmpLog)
                if not status and status.GetErrno() != errno.ENOENT:
                    tmpLog.error(
                        'Failed to delete event request file at {0}'.format(
                            remoteJsonEventsRequestFile))
    elif workspec.mapType == WorkSpec.MT_MultiJobs:
        # TOBEFIXED
        pass

    # Remove the local feed file. This is deliberately best effort: the
    # file may never have been created (e.g. for the multi-job mapping).
    try:
        jsonFilePath = os.path.join(workspec.get_access_point(),
                                    jsonEventsFeedFileName)
        os.remove(jsonFilePath)
    except Exception:
        pass

    tmpLog.debug('done')
    return retVal
def post_processing(self, workspec, jobspec_list, map_type):
    '''
    Fetch job output and process pilot info for sending in final heartbeat.
    The pilot pickle is loaded and some attributes corrected (schedulerid,
    pilotlog etc), then converted to dictionary and stored in
    workspec.workAttributes[pandaid].
    If pilot pickle cannot be used, report ARC error in pilotErrorDiag and
    fill all possible attributes using ARC information.

    :param workspec: worker specification; its workAttributes must contain
                     'arcjob', 'proxyrole' and 'logsubdir'
    :param jobspec_list: job specs of the worker; only the first is used
    :param map_type: not used by this implementation
    :return: None in all cases
    '''
    arclog = arc_utils.ARCLogger(baselogger, workspec.workerID)
    tmplog = arclog.log
    tmplog.info('Post processing ARC job {0}'.format(workspec.batchID))

    job = workspec.workAttributes['arcjob']
    proxyrole = workspec.workAttributes['proxyrole']
    arcid = job['JobID']
    tmplog.info('Job id {0}'.format(arcid))

    if 'arcdownloadfiles' not in workspec.workAttributes:
        tmplog.error('No files to download')
        return

    # Assume one-to-one mapping of workers to jobs. If jobspec_list is empty
    # it means the job was cancelled by panda or otherwise forgotten
    if not jobspec_list:
        return

    # Set certificate
    userconfig = arc.UserConfig(self.cred_type)
    try:
        userconfig.ProxyPath(str(self.certs[proxyrole]))
    except Exception:
        # job is a mapping here (it is indexed with job['JobID'] above), so
        # the id must come from arcid; attribute access would raise inside
        # this handler and hide the real problem.
        tmplog.error("Job {0}: no proxy found with role {1}".format(
            arcid, proxyrole))
        return

    queueconfigmapper = QueueConfigMapper()
    queueconfig = queueconfigmapper.get_queue(jobspec_list[0].computingSite)
    logbaseurl = queueconfig.submitter.get('logBaseURL')
    logbasedir = queueconfig.submitter.get('logDir', self.tmpdir)
    logsubdir = workspec.workAttributes['logsubdir']
    pandaid = str(jobspec_list[0].PandaID)

    # Construct log path and url
    logurl = '/'.join([logbaseurl, logsubdir, pandaid]) if logbaseurl else None
    logdir = os.path.join(logbasedir, logsubdir)

    # post_processing is only called once, so no retries are done. But keep
    # the possibility here in case it changes
    (fetched, notfetched, notfetchedretry) = self._download_outputs(
        workspec.workAttributes['arcdownloadfiles'], logdir, arcid, pandaid,
        userconfig, tmplog)
    if arcid not in fetched:
        tmplog.warning("Could not get outputs of {0}".format(arcid))

    attrkey = long(pandaid)
    # Make sure an entry exists even if extracting the pilot info raises
    workspec.workAttributes[attrkey] = {}
    workspec.workAttributes[attrkey] = self._extractAndFixPilotPickle(
        job, pandaid, (arcid in fetched), logurl, tmplog)

    tmplog.debug("pilot info for {0}: {1}".format(
        pandaid, workspec.workAttributes[attrkey]))
def kill_worker(self, workspec):
    """Ask the ARC CE to cancel the job belonging to a worker.

    Every outcome is reported as success so that the worker can be cleaned
    up, except when the job has an undefined state and was submitted less
    than one hour ago; in that case the cancellation is reported as failed.

    :param workspec: worker specification
    :type workspec: WorkSpec
    :return: A tuple of return code (True for success, False otherwise) and error dialog
    :rtype: (bool, string)
    """
    # make logger
    arc_logger = arc_utils.ARCLogger(baselogger, workspec.workerID)
    log = arc_logger.log

    arcjob, _modtime, role = arc_utils.workspec2arcjob(workspec)
    if not arcjob.JobID:
        # Job not submitted
        log.info("Job was not submitted so cannot be cancelled")
        return True, ''

    # Set certificate
    cfg = arc.UserConfig(self.cred_type)
    try:
        cfg.ProxyPath(str(self.certs[role]))
    except:
        # Log a warning and return True so that job can be cleaned
        log.warning("Job {0}: no proxy found with role {1}".format(
            arcjob.JobID, role))
        return True, ''

    supervisor = arc.JobSupervisor(cfg, [arcjob])
    supervisor.Update()
    supervisor.Cancel()
    unprocessed = supervisor.GetIDsNotProcessed()

    if arcjob.JobID not in unprocessed:
        log.info("Job cancelled successfully")
        return True, ''

    if arcjob.State == arc.JobState.UNDEFINED:
        # If longer than one hour since submission assume job never made it
        give_up_after = arcjob.SubmissionTime + arc.Period(3600)
        if give_up_after < arc.Time():
            log.warning("Assuming job is lost and marking as cancelled")
            return True, ''

        # Job has not yet reached info system
        log.warning("Job is not yet in info system so cannot be cancelled")
        return False, "Job is not yet in info system so could not be cancelled"

    # Log a warning and return True so that job can be cleaned
    log.warning("Job could not be cancelled")
    return True, ''
def sweep_worker(self, workspec):
    """Ask the ARC CE to clean the job belonging to a worker.

    Cleaning is best effort: every path through this method reports success,
    and problems are only logged.

    :param workspec: worker specification
    :type workspec: WorkSpec
    :return: A tuple of return code (True for success, False otherwise) and error dialog
    :rtype: (bool, string)
    """
    # make logger
    arc_logger = arc_utils.ARCLogger(baselogger, workspec.workerID)
    log = arc_logger.log

    arcjob, _modtime, role = arc_utils.workspec2arcjob(workspec)
    if not arcjob.JobID:
        # Job not submitted
        log.info("Job was not submitted so cannot be cleaned")
        return True, ''

    # Set certificate
    cfg = arc.UserConfig(self.cred_type)
    try:
        cfg.ProxyPath(str(self.certs[role]))
    except:
        # Log a warning and still report success
        log.warning("Job {0}: no proxy found with role {1}".format(
            arcjob.JobID, role))
        return True, ''

    supervisor = arc.JobSupervisor(cfg, [arcjob])
    supervisor.Update()
    supervisor.Clean()
    unprocessed = supervisor.GetIDsNotProcessed()

    if arcjob.JobID in unprocessed:
        # Log a warning and return True so that job can be finished
        log.warning("Job could not be cleaned")
    else:
        log.info("Job cleaned successfully")
    return True, ''
def events_requested(self, workspec):
    '''Report whether the worker has asked for events.

    The event request file is copied from the job's directory on the ARC CE
    to the worker's access point and parsed as JSON. The parsed content is
    returned; an empty dict is returned when the proxy cannot be set up, the
    file cannot be copied, or it cannot be loaded.
    '''
    # get logger
    arc_logger = arc_utils.ARCLogger(baselogger, workspec.workerID)
    log = arc_logger.log

    # Check for jobid/jsonEventsRequestFileName
    arc_job = workspec.workAttributes['arcjob']
    job_id = arc_job['JobID']

    # Set certificate to use for interacting with ARC CE
    cfg = arc.UserConfig(self.cred_type)
    proxy_ready = self._setup_proxy(cfg, workspec, job_id, log)
    if not proxy_ready:
        return {}

    remote_path = '%s/%s' % (job_id, jsonEventsRequestFileName)
    local_path = os.path.join(workspec.get_access_point(),
                              jsonEventsRequestFileName)
    log.debug('looking for event request file {0}'.format(remote_path))

    # Try to copy the file
    outcome = self._copy_file(remote_path, local_path, cfg, log)
    if not outcome:
        if outcome.GetErrno() == errno.ENOENT:
            # Not found
            log.debug('not found')
        else:
            # Some other error
            log.warning('Failed to copy {0}: {1}'.format(
                remote_path, str(outcome)))
        return {}

    try:
        with open(local_path) as handle:
            requested = json.load(handle)
        os.remove(local_path)
    except Exception:
        log.debug('failed to load json')
        return {}

    log.debug('found')
    return requested
def submit_workers(self, workspec_list):
    '''Convert the jobs of each worker to xRSL and submit them to an ARC CE.

    Queue and object store information is read from the DB cache. For every
    job spec of every worker the candidate CEs are taken from the panda
    queue info, the job is converted with ARCParser and submitted with the
    proxy matching its prodSourceLabel. On success the ARC job and the
    bookkeeping attributes are stored on the worker.

    :param workspec_list: worker specifications to submit
    :return: list of (success, error message) tuples; one entry is appended
             per job spec, so a worker without job specs adds no entry
    :raises Exception: if the panda queue or object store information is
                       not available from the DB cache
    '''
    retlist = []

    # Get queue info from DB
    pandaqueues = self.dbproxy.get_cache("panda_queues.json", None)
    if pandaqueues is None:
        raise Exception("Failed to get panda queue info from database")
    pandaqueues = pandaqueues.data

    osmap = self.dbproxy.get_cache("ddmendpoints_objectstores.json", None)
    if osmap is None:
        raise Exception("Failed to get Object Store info from database")
    osmap = osmap.data

    for workspec in workspec_list:

        arclog = arc_utils.ARCLogger(baselogger, workspec.workerID)
        tmplog = arclog.log

        # Assume for aCT that jobs are always pre-fetched (no late-binding)
        for jobspec in workspec.get_jobspec_list():

            tmplog.debug("JobSpec: {0}".format(jobspec.values_map()))

            if jobspec.computingSite not in pandaqueues:
                retlist.append(
                    (False, "No queue information for {0}".format(
                        jobspec.computingSite)))
                continue

            # Get CEs from panda queue info
            # List of (endpoint, queue) tuples
            arcces = []
            for endpoint in pandaqueues[jobspec.computingSite]['queues']:
                ce_endpoint = endpoint['ce_endpoint']
                # Endpoints given without a scheme default to gsiftp
                if not re.search('://', ce_endpoint):
                    ce_endpoint = 'gsiftp://%s' % ce_endpoint
                ce_queue = endpoint['ce_queue_name']
                arcces.append((ce_endpoint, ce_queue))

            if not arcces:
                # The message previously contained a stray '%' in front of
                # the site name
                retlist.append((False, "No CEs defined for {0}".format(
                    jobspec.computingSite)))
                continue

            # Set true pilot or not
            queueconfigmapper = QueueConfigMapper()
            queueconfig = queueconfigmapper.get_queue(jobspec.computingSite)
            pandaqueues[jobspec.computingSite]['truepilot'] = \
                'running' in queueconfig.noHeartbeat

            # Set log URL for GTAG env in job description
            logbaseurl = queueconfig.submitter.get('logBaseURL')
            logsubdir = self._set_logdir(jobspec.computingSite)
            logfileurl = '/'.join(
                [logbaseurl, logsubdir, '%d.out' % jobspec.PandaID]
            ) if logbaseurl else None

            tmplog.debug("Converting to ARC XRSL format")
            arcxrsl = ARCParser(
                jobspec.jobParams,
                jobspec.computingSite,
                pandaqueues[jobspec.computingSite],
                logfileurl,
                self.schedulerid,
                osmap,
                '/tmp',  # tmpdir, TODO common tmp dir
                None,  # jobSpec.eventranges, # TODO event ranges
                tmplog)
            arcxrsl.parse()
            xrsl = arcxrsl.getXrsl()
            tmplog.debug("ARC xrsl: {0}".format(xrsl))

            # Set the files to be downloaded at the end of the job
            downloadfiles = 'gmlog/errors'
            if 'logFile' in jobspec.jobParams:
                downloadfiles += ';%s' % jobspec.jobParams[
                    'logFile'].replace('.tgz', '')
            if not pandaqueues[jobspec.computingSite]['truepilot']:
                downloadfiles += ';jobSmallFiles.tgz'

            # Set certificate
            userconfig = arc.UserConfig(self.cred_type)
            proxyrole = ''
            if jobspec.jobParams['prodSourceLabel'] == 'user':
                userconfig.ProxyPath(str(self.certs['pilot']))
                proxyrole = 'pilot'
            else:
                userconfig.ProxyPath(str(self.certs['production']))
                proxyrole = 'production'
            tmplog.debug("Submitting using {0} proxy at {1}".format(
                proxyrole, userconfig.ProxyPath()))

            try:
                tmplog.debug("Submission targets: {0}".format(arcces))
                arcjob = self._arc_submit(xrsl, arcces, userconfig, tmplog)
                tmplog.info("ARC CE job id {0}".format(arcjob.JobID))
                arc_utils.arcjob2workspec(arcjob, workspec)
                # Remembered on the worker for use after submission
                workspec.workAttributes['arcdownloadfiles'] = downloadfiles
                workspec.workAttributes['proxyrole'] = proxyrole
                workspec.workAttributes['logsubdir'] = logsubdir
                workspec.batchID = arcjob.JobID
                tmplog.debug(workspec.workAttributes)
                result = (True, '')
            except Exception as exc:
                tmplog.error(traceback.format_exc())
                result = (False,
                          "Failed to submit ARC job: {0}".format(str(exc)))

            retlist.append(result)

    return retlist
def events_to_update(self, workspec):
    '''Report events processed for harvester to update.

    For every PandaID of the worker the events-update file is fetched from
    the job's directory on the ARC CE. A copy carrying the read suffix (not
    yet acknowledged) is tried first. Otherwise the new file is fetched,
    re-uploaded under the read-suffixed name and the original deleted. The
    JSON content is merged into the result keyed by integer PandaID.

    :param workspec: worker specification
    :return: dict mapping PandaID to its event update, or False if the
             proxy could not be set up
    :raises Exception: whatever opening or parsing the local JSON file
                       raises; the failure is logged before re-raising
    '''
    # get logger
    arclog = arc_utils.ARCLogger(baselogger, workspec.workerID)
    tmpLog = arclog.log

    job = workspec.workAttributes['arcjob']
    arcid = job['JobID']

    # Set certificate to use for interacting with ARC CE
    usercfg = arc.UserConfig(self.cred_type)
    if not self._setup_proxy(usercfg, workspec, arcid, tmpLog):
        return False

    # Check for jobid/jsonEventsUpdateFileName on CE, rename to .read
    retDict = dict()
    for pandaID in workspec.pandaid_list:

        # first look for json.read which is not yet acknowledged
        accessPoint = self.get_access_point(workspec, pandaID)
        localJsonFilePath = os.path.join(accessPoint, jsonEventsUpdateFileName)
        remoteJsonFilePathRead = '%s/%s%s' % (
            arcid, jsonEventsUpdateFileName, suffixReadJson)
        tmpLog.debug('looking for event update file {0}'.format(
            remoteJsonFilePathRead))

        status = self._copy_file(remoteJsonFilePathRead, localJsonFilePath,
                                 usercfg, tmpLog)
        if not status:
            if status.GetErrno() != errno.ENOENT:
                tmpLog.warning('Failed checking {0}: {1}'.format(
                    remoteJsonFilePathRead, str(status)))
                continue

            # Look for new json
            remoteJsonFilePath = '%s/%s' % (arcid, jsonEventsUpdateFileName)
            status = self._copy_file(remoteJsonFilePath, localJsonFilePath,
                                     usercfg, tmpLog)
            if not status:
                if status.GetErrno() != errno.ENOENT:
                    tmpLog.warning('Failed checking {0}: {1}'.format(
                        remoteJsonFilePath, str(status)))
                else:
                    # not found
                    tmpLog.debug('not found')
                continue

            # Rename to prevent from being overwritten
            # Gridftp does not support renaming so upload .read file and delete old one
            status = self._copy_file(localJsonFilePath, remoteJsonFilePathRead,
                                     usercfg, tmpLog)
            if not status:
                tmpLog.warning('Failed copying {0} to {1}: {2}'.format(
                    localJsonFilePath, remoteJsonFilePathRead, str(status)))
            # If rename fails, delete old file anyway
            status = self._delete_file(remoteJsonFilePath, usercfg, tmpLog)
            if not status:
                tmpLog.warning('Failed deleting {0}: {1}'.format(
                    remoteJsonFilePath, str(status)))

        # load json
        nData = 0
        try:
            with open(localJsonFilePath) as jsonFile:
                tmpOrigDict = json.load(jsonFile)
                # change the key from str to int
                for tmpPandaID, tmpDict in tmpOrigDict.iteritems():
                    tmpPandaID = long(tmpPandaID)
                    retDict[tmpPandaID] = tmpDict
                    nData += 1
        except Exception:
            # Log first: with the raise in front, this message was
            # unreachable. The exception still propagates as before.
            tmpLog.error('failed to load json')
            raise

        # delete local file
        try:
            os.remove(localJsonFilePath)
        except Exception:
            pass

        tmpLog.debug('got {0} events for PandaID={1}'.format(nData, pandaID))

    return retDict
def check_workers(self, workspec_list):
    '''Query ARC for the state of each worker's job and map it to a status.

    Returns a tuple ``(True, results)`` where ``results`` holds one
    ``(status, '')`` tuple for each job processed. When the ARC state has
    changed, the updated job is written back to the worker and its
    workAttributes are flagged for a forced update.
    '''
    results = []
    for workspec in workspec_list:

        # make logger
        arc_logger = arc_utils.ARCLogger(baselogger, workspec.workerID)
        log = arc_logger.log
        log.info("checking worker id {0}".format(workspec.workerID))
        known_job, modtime, role = arc_utils.workspec2arcjob(workspec)

        # Set certificate
        cfg = arc.UserConfig(self.cred_type)
        try:
            cfg.ProxyPath(str(self.certs[role]))
        except:
            # Without a proxy nothing can be queried: keep the current status
            log.error("Job {0}: no proxy found with role {1}".format(
                known_job.JobID, role))
            results.append((workspec.status, ''))
            continue

        supervisor = arc.JobSupervisor(cfg, [known_job])
        supervisor.Update()
        refreshed_jobs = supervisor.GetAllJobs()
        missing_ids = supervisor.GetIDsNotProcessed()

        for fresh in refreshed_jobs:
            if fresh.JobID in missing_ids:
                log.error("Failed to find information on {0}".format(
                    fresh.JobID))
                # If missing for too long (2 days), mark as lost
                if arc.Time() - modtime > arc.Period(172800):
                    log.error(
                        "Job {0} missing for more than 2 days, marking as lost"
                        .format(fresh.JobID))
                    outcome = workspec.ST_failed
                else:
                    outcome = workspec.status
                results.append((outcome, ''))
                continue

            # Convert arc state to WorkSpec state; anything not matched
            # below is reported as submitted
            arc_state = fresh.State
            mapped = WorkSpec.ST_submitted
            if arc_state == arc.JobState.RUNNING or \
                    arc_state == arc.JobState.FINISHING:
                mapped = WorkSpec.ST_running
            elif arc_state == arc.JobState.FINISHED:
                if fresh.ExitCode == -1:
                    # Missing exit code, but assume success
                    log.warning(
                        "Job {0} FINISHED but has missing exit code, setting to zero"
                        .format(fresh.JobID))
                    fresh.ExitCode = 0
                mapped = WorkSpec.ST_finished
            elif arc_state == arc.JobState.FAILED:
                mapped = WorkSpec.ST_failed
                log.info("Job {0} failed: {1}".format(
                    fresh.JobID,
                    ";".join([joberr for joberr in fresh.Error])))
            elif arc_state == arc.JobState.KILLED:
                mapped = WorkSpec.ST_cancelled
            elif arc_state == arc.JobState.DELETED or \
                    arc_state == arc.JobState.OTHER:
                # unexpected
                mapped = WorkSpec.ST_failed
            # Not covered: arc.JobState.HOLD. Maybe need a post-run state in
            # harvester, also to cover FINISHING

            # compare strings here to get around limitations of JobState API
            if known_job.State.GetGeneralState() == fresh.State.GetGeneralState():
                log.debug("Job {0} still in state {1}".format(
                    known_job.JobID, known_job.State.GetGeneralState()))
            else:
                log.info("Job {0}: {1} -> {2} ({3})".format(
                    known_job.JobID,
                    known_job.State.GetGeneralState(),
                    fresh.State.GetGeneralState(),
                    fresh.State.GetSpecificState()))

                arc_utils.arcjob2workspec(fresh, workspec)
                # Have to force update to change info in DB
                workspec.force_update('workAttributes')
                log.debug("batchStatus {0} -> workerStatus {1}".format(
                    arc_state.GetGeneralState(), mapped))

            results.append((mapped, ''))

    return True, results