def processToResubmit(self):
    """Cancel and clean jobs in state 'toresubmit', then reset them to 'tosubmit'.

    Jobs are selected for this cluster when one is configured, otherwise
    the jobs with an empty clusterlist are taken.  For every proxy, the
    jobs that already carry a JobID are cancelled and cleaned, and finally
    every selected job is reset in the DB with a blank arc.Job.
    """
    if self.cluster:
        selection = "arcstate='toresubmit' and cluster='" + self.cluster + "'"
    else:
        selection = "arcstate='toresubmit' and clusterlist=''"
    pending = self.db.getArcJobs(selection)

    for proxyid, rows in pending.items():
        self.uc.CredentialString(str(self.db.getProxy(proxyid)))

        # Only jobs which were actually submitted (have a JobID) need cleanup
        submitted = [row[2] for row in rows if row[2].JobID]

        if submitted:
            # All jobs are handed to Cancel(); the supervisor only cancels
            # the cancellable ones, so the remainder is cleaned through a
            # second supervisor, hence two Clean() calls.
            supervisor = arc.JobSupervisor(self.uc, submitted)
            supervisor.Update()
            self.log.info("Cancelling %i jobs" % len(submitted))
            supervisor.Cancel()

            # Both ID lists are read before Clean() is called
            cancelled_ids = supervisor.GetIDsProcessed()
            leftover_ids = supervisor.GetIDsNotProcessed()

            # Clean what was successfully cancelled
            if cancelled_ids:
                supervisor.SelectByID(cancelled_ids)
                self.log.info("Cleaning %i jobs" % len(cancelled_ids))
                if not supervisor.Clean():
                    self.log.warning("Failed to clean some jobs")

            # Fresh supervisor holding only the jobs that were not cancelled
            if leftover_ids:
                uncancellable = [
                    arcjob for arcjob in submitted
                    if arcjob.JobID in leftover_ids
                ]
                supervisor = arc.JobSupervisor(self.uc, uncancellable)
                supervisor.Update()
                self.log.info("Cleaning %i jobs" % len(uncancellable))
                if not supervisor.Clean():
                    self.log.warning("Failed to clean some jobs")

        # A blank job object is written back to reset the stored job info
        blank_job = arc.Job()
        for row in rows:
            reset_fields = {
                "arcstate": "tosubmit",
                "tarcstate": self.db.getTimeStamp(),
                "cluster": None,
            }
            self.db.updateArcJob(row[0], reset_fields, blank_job)
def processToRerun(self):
    """Resume jobs in state 'torerun' on this cluster.

    Does nothing when no cluster is set, when no job is selected, or when
    the configuration flags SRM as down.  Jobs that could not be resumed
    are marked 'failed'; the others go to 'finishing' or 'submitted' with
    a timestamp one hour in the future.
    """
    # Rerun only makes sense for jobs which have been submitted somewhere
    if not self.cluster:
        return

    selection = "arcstate='torerun' and cluster='" + self.cluster + "'"
    torerun = self.db.getArcJobs(selection)
    if not torerun:
        return

    # TODO: downtimes from CRIC
    if self.conf.get(['downtime', 'srmdown']) == 'True':
        self.log.info('SRM down, not rerunning')
        return

    total = sum(len(rows) for rows in torerun.values())
    self.log.info("Resuming %i jobs" % total)

    for proxyid, rows in torerun.items():
        self.uc.CredentialString(str(self.db.getProxy(proxyid)))
        arcjobs = [row[2] for row in rows]

        # First supervisor: renew the proxy to be safe
        renewer = arc.JobSupervisor(self.uc, arcjobs)
        renewer.Update()
        renewer.Renew()

        # Second supervisor: do the actual resume
        resumer = arc.JobSupervisor(self.uc, arcjobs)
        resumer.Update()
        resumer.Resume()
        notresumed = resumer.GetIDsNotProcessed()

        for rowid, appjobid, job, _created in rows:
            if job.JobID in notresumed:
                self.log.error("%s: Could not resume job %s" % (appjobid, job.JobID))
                self.db.updateArcJob(rowid, {
                    "arcstate": "failed",
                    "tarcstate": self.db.getTimeStamp()
                })
                continue

            # Force a wait before the next status check so the infosys can
            # update and the failed state is not picked up again
            if job.RestartState == arc.JobState.FINISHING:
                newstate = "finishing"
            else:
                newstate = 'submitted'
            self.db.updateArcJob(rowid, {
                "arcstate": newstate,
                "tarcstate": self.db.getTimeStamp(time.time() + 3600)
            })
def example():
    """Retrieve the output of one known job into /tmp and list the files."""
    out = sys.stdout.write

    # UserConfig with the user's proxy and the trusted CA certificates
    uc = arc.UserConfig()
    uc.ProxyPath("/tmp/x509up_u%s" % os.getuid())
    uc.CACertificatesDirectory("/etc/grid-security/certificates")

    # Job object for a given JobID; all three URLs point at the same endpoint
    job = arc.Job()
    job.JobID = "https://piff.hep.lu.se:443/arex/hYDLDmyxvUfn5h5iWqkutBwoABFKDmABFKDmIpHKDmYBFKDmtRy9En"
    job.Flavour = "ARC1"
    arex_url = arc.URL("https://piff.hep.lu.se:443/arex")
    job.ServiceInformationURL = arex_url
    job.JobStatusURL = arex_url
    job.JobManagementURL = arex_url

    out("Get job information from the computing element...\n")
    # Hand the job to a JobSupervisor and refresh its information
    job_supervisor = arc.JobSupervisor(uc, [job])
    job_supervisor.Update()

    out("Downloading results...\n")
    # Receives the directories of the downloaded results (one per job)
    downloadeddirectories = arc.StringList()

    # Retrieve() arguments, in order:
    #   "/tmp"  - target directory
    #   False   - usejobname: name the subdirectory after the job ID
    #   False   - force: do not overwrite existing directories
    #   list    - collects the downloaded directories
    success = job_supervisor.Retrieve("/tmp", False, False, downloadeddirectories)
    if not success:
        out("Downloading results failed.\n")

    for downloadeddirectory in downloadeddirectories:
        out("Job results were downloaded to %s\n" % str(downloadeddirectory))
        out("Contents of the directory:\n")
        for filename in os.listdir(downloadeddirectory):
            out("   %s\n" % filename)
def killJob(self, jobIDList):
    """ Kill the specified jobs

    :param jobIDList: a single job ID string or an iterable of job ID strings
    :return: S_OK() on success; the failed proxy-preparation result or
             S_ERROR(...) when the bulk cancel reports a failure
    """
    result = self._prepareProxy()
    if not result['OK']:
        gLogger.error('ARCComputingElement: failed to set up proxy', result['Message'])
        return result
    self.usercfg.ProxyPath(os.environ['X509_USER_PROXY'])

    # `basestring` only exists on Python 2; fall back to `str` elsewhere so
    # a single job ID is still recognised instead of raising NameError.
    try:
        stringTypes = basestring
    except NameError:
        stringTypes = str

    if isinstance(jobIDList, stringTypes):
        jobList = [jobIDList]
    else:
        jobList = list(jobIDList)

    gLogger.debug("Killing jobs %s" % jobIDList)
    jobs = [self.__getARCJob(jobID) for jobID in jobList]

    # JobSupervisor is able to aggregate jobs to perform bulk operations and
    # thus minimizes the communication overhead
    job_supervisor = arc.JobSupervisor(self.usercfg, jobs)
    if not job_supervisor.Cancel():
        errorString = ' - '.join(jobList).strip()
        return S_ERROR('Failed to kill at least one of these jobs: %s. CE(?) not reachable?' % errorString)

    return S_OK()
def processToClean(self):
    """Clean up to 100 jobs in state 'toclean' on this cluster.

    Every selected job is deleted from the DB afterwards; a job that the
    supervisor reports as not processed is only logged as an error.
    """
    selection = "arcstate='toclean' and cluster='" + self.cluster + "' limit 100"
    toclean = self.db.getArcJobs(selection)
    if not toclean:
        return

    total = sum(len(rows) for rows in toclean.values())
    self.log.info("Cleaning %d jobs" % total)

    for proxyid, rows in toclean.items():
        self.uc.CredentialString(str(self.db.getProxy(proxyid)))

        supervisor = arc.JobSupervisor(self.uc, [row[2] for row in rows])
        supervisor.Update()
        supervisor.Clean()
        notcleaned = supervisor.GetIDsNotProcessed()

        for rowid, appjobid, job, _created in rows:
            if job.JobID in notcleaned:
                self.log.error("%s: Could not clean job %s" % (appjobid, job.JobID))
            # The DB row is removed whether or not the clean succeeded
            self.db.deleteArcJob(rowid)
def killJob(self, jobIDList):
    """ Kill the specified jobs

    :param jobIDList: a single job ID string or an iterable of job ID strings
    :return: S_OK() on success; the failed proxy-preparation result or
             S_ERROR(...) as soon as one chunk fails to cancel
    """
    proxyResult = self._prepareProxy()
    if not proxyResult['OK']:
        self.log.error('ARCComputingElement: failed to set up proxy', proxyResult['Message'])
        return proxyResult
    self.usercfg.ProxyPath(os.environ['X509_USER_PROXY'])

    if isinstance(jobIDList, six.string_types):
        jobList = [jobIDList]
    else:
        jobList = list(jobIDList)

    self.log.debug("Killing jobs %s" % jobIDList)
    arcJobs = [self.__getARCJob(jobID) for jobID in jobList]

    # JobSupervisor aggregates jobs for bulk operations, which minimizes the
    # communication overhead.  Chunks are still needed to avoid a timeout
    # when there are too many jobs to supervise.
    for chunk in breakListIntoChunks(arcJobs, 100):
        if arc.JobSupervisor(self.usercfg, chunk).Cancel():
            continue
        errorString = ' - '.join(jobList).strip()
        return S_ERROR('Failed to kill at least one of these jobs: %s. CE(?) not reachable?' % errorString)

    return S_OK()
def example():
    """Print a job object before and after a JobSupervisor update."""
    # Creating a UserConfig object with the user's proxy
    # and the path of the trusted CA certificates
    uc = arc.UserConfig()
    uc.ProxyPath("/tmp/x509up_u%s" % os.getuid())
    uc.CACertificatesDirectory("/etc/grid-security/certificates")

    # Create a new job object with a given JobID
    job = arc.Job()
    job.JobID = "https://piff.hep.lu.se:443/arex/1QuMDmRwvUfn5h5iWqkutBwoABFKDmABFKDmIpHKDmXBFKDmIuAean"
    job.Flavour = "ARC1"
    job.JobManagementURL = arc.URL("https://piff.hep.lu.se:443/arex")
    job.JobStatusURL = arc.URL("https://piff.hep.lu.se:443/arex")

    sys.stdout.write("Job object before update:\n")
    job.SaveToStream(arc.CPyOstream(sys.stdout), True)

    job_supervisor = arc.JobSupervisor(uc, [job])

    # Update the states of jobs within this JobSupervisor
    job_supervisor.Update()

    # Get our updated job from the JobSupervisor
    jobs = job_supervisor.GetAllJobs()
    if not jobs:
        # Without this guard jobs[0] raises IndexError when the supervisor
        # holds no job
        sys.stdout.write("No jobs found\n")
        return
    job = jobs[0]

    sys.stdout.write("Job object after update:\n")
    job.SaveToStream(arc.CPyOstream(sys.stdout), True)
def test_resubmit(self):
    """Set up a two-job supervisor and resubmit through the TEST broker."""
    self.usercfg.Broker("TEST")

    # Configure the test target-information retriever plugin
    tir_control = arc.TargetInformationRetrieverPluginTESTControl
    tir_control.targets = [
        self.create_test_target("http://test2.nordugrid.org")
    ]
    tir_control.status = arc.EndpointQueryingStatus(
        arc.EndpointQueryingStatus.SUCCESSFUL)

    failed_job = self.create_test_job(
        job_id="http://test.nordugrid.org/1234567890test1",
        state=arc.JobState.FAILED)
    running_job = self.create_test_job(
        job_id="http://test.nordugrid.org/1234567890test2",
        state=arc.JobState.RUNNING)
    js = arc.JobSupervisor(self.usercfg, [failed_job, running_job])

    self.expect(js.GetAllJobs()).to_have(2).jobs()

    endpoints = [
        arc.Endpoint("http://test2.nordugrid.org",
                     arc.Endpoint.COMPUTINGINFO,
                     "org.nordugrid.tirtest")
    ]
    resubmitted = arc.JobList()
    result = js.Resubmit(0, endpoints, resubmitted)
def example():
    """List the jobs found on one computing element and print the failed ones."""
    out = sys.stdout.write
    uc = arc.UserConfig()

    # The JobSupervisor is the consumer that collects all retrieved jobs
    job_supervisor = arc.JobSupervisor(uc)

    # Ask this computing element for its job list
    endpoint = arc.Endpoint("https://piff.hep.lu.se:443/arex", arc.Endpoint.JOBLIST)
    out("Querying %s for jobs...\n" % endpoint.str())

    retriever = arc.JobListRetriever(uc)
    retriever.addConsumer(job_supervisor)
    retriever.addEndpoint(endpoint)
    retriever.wait()

    out("%s jobs found\n" % len(job_supervisor.GetAllJobs()))

    out("Getting job states...\n")
    # Refresh the state of every job
    job_supervisor.Update()

    # Report the general state of each updated job
    states = [job.State.GetGeneralState() for job in job_supervisor.GetAllJobs()]
    out("The jobs have the following states: %s\n" % (", ".join(states)))

    # Narrow the selection to failed jobs and dump them
    job_supervisor.SelectByStatus(["Failed"])
    out("The failed jobs:\n")
    for job in job_supervisor.GetSelectedJobs():
        job.SaveToStream(arc.CPyOstream(sys.stdout), True)
def get_job(self, job_id):
    """
    Look up a job on the configured ARC server by its ID.

    :param job_id: ID of the job as returned by `submit_job`
    :raises JobNotFoundError: if no job with the given ID could be found
    :return: Instance of ``arc.Job`` representing the job
    """
    user_config = self.get_user_config()

    # The supervisor acts as consumer for the job list retriever below
    supervisor = arc.JobSupervisor(user_config)

    # Fetch the job list of the configured computing element
    retriever = arc.JobListRetriever(user_config)
    retriever.addConsumer(supervisor)
    retriever.addEndpoint(
        arc.Endpoint(self.config.ARC_SERVER, arc.Endpoint.JOBLIST))
    retriever.wait()

    # Refresh job states before searching
    supervisor.Update()

    # First job whose ID matches, or None when there is no match
    match = next(
        (candidate for candidate in supervisor.GetAllJobs()
         if candidate.JobID == job_id),
        None)
    if match is None:
        raise JobNotFoundError(
            "Could not find a job with ID '{}'".format(job_id))
    return match
def example():
    """Print an EMI-ES job object before and after a JobSupervisor update."""
    out = sys.stdout.write
    emies_interface = 'org.ogf.glue.emies.activitymanagement'

    # UserConfig with the user's proxy and the trusted CA certificates
    uc = arc.UserConfig()
    uc.ProxyPath("/tmp/x509up_u%s" % os.getuid())
    uc.CACertificatesDirectory("/etc/grid-security/certificates")

    # Job object for a given JobID
    job = arc.Job()
    job.JobID = "https://piff.hep.lu.se:443/arex/w7LNDmSkEiun1ZPzno6AuCjpABFKDmABFKDmZ9LKDmUBFKDmXugZwm"
    job.IDFromEndpoint = "w7LNDmSkEiun1ZPzno6AuCjpABFKDmABFKDmZ9LKDmUBFKDmXugZwm"
    job.JobManagementURL = arc.URL("https://piff.hep.lu.se:443/arex")
    job.JobStatusURL = arc.URL("https://piff.hep.lu.se:443/arex")
    job.JobStatusInterfaceName = emies_interface
    job.JobManagementInterfaceName = emies_interface

    out("Job object before update:\n")
    job.SaveToStream(arc.CPyOstream(sys.stdout), True)

    # Refresh the job through a JobSupervisor
    job_supervisor = arc.JobSupervisor(uc, [job])
    job_supervisor.Update()

    # Fetch the updated job back; nothing to show if the list is empty
    updated = job_supervisor.GetAllJobs()
    if not updated:
        out("No jobs found\n")
        return

    job = updated[0]
    out("Job object after update:\n")
    job.SaveToStream(arc.CPyOstream(sys.stdout), True)
def fetchAll(self, jobs):
    """Retrieve the outputs of all given jobs into self.tmpdir.

    :param jobs: mapping whose values are the job objects to fetch
    :return: tuple (processed IDs, not-processed IDs), each as a list
    """
    # One supervisor handles the whole batch
    supervisor = arc.JobSupervisor(self.uc, list(jobs.values()))
    supervisor.Update()

    # Retrieve() fills this list; it is not used afterwards
    downloaded = arc.StringList()
    supervisor.Retrieve(self.tmpdir, False, False, downloaded)

    fetched = list(supervisor.GetIDsProcessed())
    notfetched = list(supervisor.GetIDsNotProcessed())
    return (fetched, notfetched)
def kill_worker(self, workspec):
    """Cancel the ARC job.

    :param workspec: worker specification
    :type workspec: WorkSpec
    :return: A tuple of return code (True for success, False otherwise) and error dialog
    :rtype: (bool, string)
    """
    # make logger
    arclog = arc_utils.ARCLogger(baselogger, workspec.workerID)
    tmplog = arclog.log

    (job, modtime, proxyrole) = arc_utils.workspec2arcjob(workspec)
    if not job.JobID:
        # Job not submitted
        tmplog.info("Job was not submitted so cannot be cancelled")
        return True, ''

    # Set certificate
    userconfig = arc.UserConfig(self.cred_type)
    try:
        userconfig.ProxyPath(str(self.certs[proxyrole]))
    except Exception:
        # Was a bare "except:", which also swallowed KeyboardInterrupt and
        # SystemExit.  Log a warning and return True so that job can be cleaned
        tmplog.warning("Job {0}: no proxy found with role {1}".format(
            job.JobID, proxyrole))
        return True, ''

    job_supervisor = arc.JobSupervisor(userconfig, [job])
    job_supervisor.Update()
    job_supervisor.Cancel()

    notcancelled = job_supervisor.GetIDsNotProcessed()

    if job.JobID in notcancelled:
        if job.State == arc.JobState.UNDEFINED:
            # If longer than one hour since submission assume job never made it
            if job.SubmissionTime + arc.Period(3600) < arc.Time():
                tmplog.warning(
                    "Assuming job is lost and marking as cancelled")
                return True, ''

            # Job has not yet reached info system
            tmplog.warning(
                "Job is not yet in info system so cannot be cancelled")
            return False, "Job is not yet in info system so could not be cancelled"

        # Log a warning and return True so that job can be cleaned
        tmplog.warning("Job could not be cancelled")
        return True, ''

    tmplog.info("Job cancelled successfully")
    return True, ''
def test_cancel(self):
    """Check Cancel() results and processed/not-processed ID bookkeeping.

    Covers cancelling the whole supervisor and cancelling a status
    selection, each with the test plugin's cancelStatus set to True and
    to False.
    """
    id1 = "http://test.nordugrid.org/1234567890test1"
    id2 = "http://test.nordugrid.org/1234567890test2"
    id3 = "http://test.nordugrid.org/1234567890test3"
    id4 = "http://test.nordugrid.org/1234567890test4"
    js = arc.JobSupervisor(self.usercfg, [
        self.create_test_job(job_id=id1, state=arc.JobState.RUNNING),
        self.create_test_job(job_id=id2, state=arc.JobState.FINISHED),
        self.create_test_job(job_id=id3, state=arc.JobState.UNDEFINED)
    ])

    # Whole supervisor, plugin reports success
    arc.JobControllerPluginTestACCControl.cancelStatus = True
    self.expect(js.Cancel()).to_be(
        True, message="Cancel was expected to return True")
    self.expect(js.GetIDsProcessed()).to_have(1).ID()
    self.expect(js.GetIDsProcessed()[0]).to_be(id1)
    self.expect(js.GetIDsNotProcessed()).to_have(2).IDs()
    self.expect(js.GetIDsNotProcessed()[0]).to_be(id2)
    self.expect(js.GetIDsNotProcessed()[1]).to_be(id3)
    js.ClearSelection()

    # Whole supervisor, plugin reports failure
    arc.JobControllerPluginTestACCControl.cancelStatus = False
    self.expect(js.Cancel()).to_be(
        False, message="Cancel was expected to return False")
    self.expect(js.GetIDsProcessed()).to_have(0).IDs()
    self.expect(js.GetIDsNotProcessed()).to_have(3).IDs()
    self.expect(js.GetIDsNotProcessed()[0]).to_be(id1)
    self.expect(js.GetIDsNotProcessed()[1]).to_be(id2)
    self.expect(js.GetIDsNotProcessed()[2]).to_be(id3)
    js.ClearSelection()

    job = self.create_test_job(job_id=id4,
                               state=arc.JobState.ACCEPTED,
                               state_text="Accepted")
    self.expect(js.AddJob(job)).to_be(
        True, message="AddJob was expected to return True")

    # Selection by status, plugin reports success.  The failure message
    # previously said "False" although the expected value is True.
    arc.JobControllerPluginTestACCControl.cancelStatus = True
    js.SelectByStatus(["Accepted"])
    self.expect(js.Cancel()).to_be(
        True, message="Cancel was expected to return True")
    self.expect(js.GetIDsProcessed()).to_have(1).ID()
    self.expect(js.GetIDsProcessed()[0]).to_be(id4)
    self.expect(js.GetIDsNotProcessed()).to_have(0).IDs()
    js.ClearSelection()

    # Selection by status, plugin reports failure
    arc.JobControllerPluginTestACCControl.cancelStatus = False
    js.SelectByStatus(["Accepted"])
    self.expect(js.Cancel()).to_be(
        False, message="Cancel was expected to return False")
    self.expect(js.GetIDsProcessed()).to_have(0).IDs()
    self.expect(js.GetIDsNotProcessed()).to_have(1).ID()
    self.expect(js.GetIDsNotProcessed()[0]).to_be(id4)
    js.ClearSelection()
def test_constructor(self):
    """A supervisor built from two jobs returns both, in the given order."""
    expected_ids = (
        "http://test.nordugrid.org/1234567890test1",
        "http://test.nordugrid.org/1234567890test2",
    )
    first = self.create_test_job(job_id=expected_ids[0])
    second = self.create_test_job(job_id=expected_ids[1])
    js = arc.JobSupervisor(self.usercfg, [first, second])

    self.expect(js.GetAllJobs()).not_to_be_empty()

    jobs = js.GetAllJobs()
    self.expect(jobs).to_have(2).jobs()
    self.expect(jobs[0].JobID).to_be(expected_ids[0])
    self.expect(jobs[1].JobID).to_be(expected_ids[1])
def test_clean(self):
    """Check Clean() results and processed/not-processed ID bookkeeping.

    Covers cleaning the whole supervisor and cleaning a status selection,
    each with the test plugin's cleanStatus set to True and to False.
    """
    control = arc.JobControllerPluginTestACCControl
    id1 = "http://test.nordugrid.org/1234567890test1"
    id2 = "http://test.nordugrid.org/1234567890test2"

    finished_job = self.create_test_job(job_id=id1,
                                        state=arc.JobState.FINISHED,
                                        state_text="Finished")
    undefined_job = self.create_test_job(job_id=id2,
                                         state=arc.JobState.UNDEFINED)
    js = arc.JobSupervisor(self.usercfg, [finished_job, undefined_job])
    self.expect(js.GetAllJobs()).to_have(2).jobs()

    # Whole supervisor, plugin reports success
    control.cleanStatus = True
    self.expect(js.Clean()).to_be(
        True, message="Clean was expected to return True")
    self.expect(js.GetIDsProcessed()).to_have(1).ID()
    self.expect(js.GetIDsProcessed()[0]).to_be(id1)
    self.expect(js.GetIDsNotProcessed()).to_have(1).ID()
    self.expect(js.GetIDsNotProcessed()[0]).to_be(id2)
    js.ClearSelection()

    # Whole supervisor, plugin reports failure
    control.cleanStatus = False
    self.expect(js.Clean()).to_be(
        False, message="Clean was expected to return False")
    self.expect(js.GetIDsProcessed()).to_have(0).IDs()
    self.expect(js.GetIDsNotProcessed()).to_have(2).IDs()
    self.expect(js.GetIDsNotProcessed()[0]).to_be(id1)
    self.expect(js.GetIDsNotProcessed()[1]).to_be(id2)
    js.ClearSelection()

    # Selection of finished jobs, plugin reports success
    control.cleanStatus = True
    js.SelectByStatus(["Finished"])
    self.expect(js.Clean()).to_be(
        True, message="Clean was expected to return True")
    self.expect(js.GetIDsProcessed()).to_have(1).ID()
    self.expect(js.GetIDsProcessed()[0]).to_be(id1)
    self.expect(js.GetIDsNotProcessed()).to_have(0).IDs()
    js.ClearSelection()

    # Selection of finished jobs, plugin reports failure
    control.cleanStatus = False
    js.SelectByStatus(["Finished"])
    self.expect(js.Clean()).to_be(
        False, message="Clean was expected to return False")
    self.expect(js.GetIDsProcessed()).to_have(0).IDs()
    self.expect(js.GetIDsNotProcessed()).to_have(1).ID()
    self.expect(js.GetIDsNotProcessed()[0]).to_be(id1)
    js.ClearSelection()
def test_add_job(self):
    """AddJob accepts a valid job and rejects empty or unknown interfaces."""
    js = arc.JobSupervisor(self.usercfg, arc.JobList())
    self.expect(js.GetAllJobs()).to_be_empty()

    # A regular test job is accepted
    job = self.create_test_job(
        job_id="http://test.nordugrid.org/1234567890test1")
    added = js.AddJob(job)
    self.expect(added).to_be(
        True, message="AddJob was expected to return True")
    self.expect(js.GetAllJobs()).not_to_be_empty()

    # An empty management interface name is rejected; job count stays at 1
    job.JobManagementInterfaceName = ""
    added = js.AddJob(job)
    self.expect(added).to_be(
        False, message="AddJob was expected to return False")
    self.expect(js.GetAllJobs()).to_have(1).job()

    # An unknown management interface name is rejected as well
    job.JobManagementInterfaceName = "non.existent.interface"
    added = js.AddJob(job)
    self.expect(added).to_be(
        False, message="AddJob was expected to return False")
    self.expect(js.GetAllJobs()).to_have(1).job()
def sweep_worker(self, workspec):
    """Clean the ARC job

    :param workspec: worker specification
    :type workspec: WorkSpec
    :return: A tuple of return code (True for success, False otherwise) and error dialog
    :rtype: (bool, string)
    """
    # make logger
    arclog = arc_utils.ARCLogger(baselogger, workspec.workerID)
    tmplog = arclog.log

    (job, modtime, proxyrole) = arc_utils.workspec2arcjob(workspec)
    if not job.JobID:
        # Job not submitted
        tmplog.info("Job was not submitted so cannot be cleaned")
        return True, ''

    # Set certificate
    userconfig = arc.UserConfig(self.cred_type)
    try:
        userconfig.ProxyPath(str(self.certs[proxyrole]))
    except Exception:
        # Was a bare "except:", which also swallowed KeyboardInterrupt and
        # SystemExit.  Log a warning and return True so that job can be cleaned
        tmplog.warning("Job {0}: no proxy found with role {1}".format(
            job.JobID, proxyrole))
        return True, ''

    job_supervisor = arc.JobSupervisor(userconfig, [job])
    job_supervisor.Update()
    job_supervisor.Clean()

    notcleaned = job_supervisor.GetIDsNotProcessed()

    if job.JobID in notcleaned:
        # Log a warning and return True so that job can be finished
        tmplog.warning("Job could not be cleaned")
        return True, ''

    tmplog.info("Job cleaned successfully")
    return True, ''
def processToCancel(self):
    """Cancel jobs in state 'tocancel' on this cluster and update the DB.

    Unsubmitted jobs, jobs assumed lost and jobs that could not be
    cancelled are marked 'cancelled'; jobs not yet visible in the info
    system are left untouched; all others become 'cancelling'.
    """
    selection = "arcstate='tocancel' and cluster='" + self.cluster + "'"
    tocancel = self.db.getArcJobs(selection)
    if not tocancel:
        return

    total = sum(len(rows) for rows in tocancel.values())
    self.log.info("Cancelling %i jobs" % total)

    def set_state(rowid, state):
        # Write the new arcstate together with a fresh timestamp
        self.db.updateArcJob(rowid, {
            "arcstate": state,
            "tarcstate": self.db.getTimeStamp()
        })

    for proxyid, rows in tocancel.items():
        self.uc.CredentialString(self.db.getProxy(proxyid))

        supervisor = arc.JobSupervisor(self.uc, [row[2] for row in rows])
        supervisor.Update()
        supervisor.Cancel()
        notcancelled = supervisor.GetIDsNotProcessed()

        for rowid, appjobid, job, _created in rows:
            if not job.JobID:
                # Job not submitted
                self.log.info("%s: Marking unsubmitted job cancelled" % appjobid)
                set_state(rowid, "cancelled")
                continue

            if job.JobID not in notcancelled:
                set_state(rowid, "cancelling")
                continue

            if job.State != arc.JobState.UNDEFINED:
                self.log.error("%s: Could not cancel job %s" % (appjobid, job.JobID))
                # Just to mark as cancelled so it can be cleaned
                set_state(rowid, "cancelled")
                continue

            # Undefined state: if more than an hour has passed since the
            # job's StartTime, assume it never made it
            if job.StartTime + arc.Period(3600) < arc.Time():
                self.log.warning(
                    "%s: Assuming job %s is lost and marking as cancelled"
                    % (appjobid, job.JobID))
                set_state(rowid, "cancelled")
            else:
                # Job has not yet reached info system
                self.log.warning(
                    "%s: Job %s is not yet in info system so cannot be cancelled"
                    % (appjobid, job.JobID))
def getJobStatus(self, jobIDList):
    """ Get the status information for the given list of jobs

    :param jobIDList: a single job ID string or an iterable of job ID strings
    :return: S_OK(dict of job ID -> status), the failed proxy-preparation
             result, or S_ERROR when no status was collected
    """
    proxyResult = self._prepareProxy()
    if not proxyResult['OK']:
        self.log.error('ARCComputingElement: failed to set up proxy', proxyResult['Message'])
        return proxyResult
    self.usercfg.ProxyPath(os.environ['X509_USER_PROXY'])

    if isinstance(jobIDList, six.string_types):
        jobTmpList = [jobIDList]
    else:
        jobTmpList = list(jobIDList)

    # Pilots are stored with a DIRAC stamp (":::XXXXX") appended; keep only
    # the part in front of it
    jobList = [stamped.split(":::")[0] for stamped in jobTmpList]
    arcJobs = [self.__getARCJob(jobID) for jobID in jobList]

    # JobSupervisor aggregates jobs for bulk operations, which minimizes the
    # communication overhead.  Chunks are still needed to avoid a timeout
    # when there are too many jobs to supervise.
    jobsUpdated = []
    for chunk in breakListIntoChunks(arcJobs, 100):
        supervisor = arc.JobSupervisor(self.usercfg, chunk)
        supervisor.Update()
        jobsUpdated.extend(supervisor.GetAllJobs())

    resultDict = {}
    jobsToRenew = []
    jobsToCancel = []
    for job in jobsUpdated:
        jobID = job.JobID
        self.log.debug("Retrieving status for job %s" % jobID)
        arcState = job.State.GetGeneralState()
        self.log.debug("ARC status for job %s is %s" % (jobID, arcState))

        if not arcState:
            # Empty state string
            resultDict[jobID] = 'Unknown'
        else:
            resultDict[jobID] = self.mapStates[arcState]
            # Renew proxy only of jobs which are running or queuing
            if arcState in ("Running", "Queuing"):
                nearExpiry = arc.Time() + arc.Period(10000)  # 2 hours, 46 minutes and 40 seconds
                if job.ProxyExpirationTime < nearExpiry:
                    # Collected here, renewed in bulk below
                    jobsToRenew.append(job)
                    self.log.debug("Renewing proxy for job %s whose proxy expires at %s" % (jobID, job.ProxyExpirationTime))
            if arcState == "Hold":
                # Collected here, cancelled in bulk below, so held jobs
                # don't sit in the queue forever
                jobsToCancel.append(job)
                self.log.debug("Killing held job %s" % jobID)

        # A "Done" job with a non-zero exit code is reported as failed
        if resultDict[jobID] == "Done":
            if int(job.ExitCode):
                resultDict[jobID] = "Failed"
        self.log.debug("DIRAC status for job %s is %s" % (jobID, resultDict[jobID]))

    # Bulk renew and bulk cancel, again in chunks of 100
    for chunk in breakListIntoChunks(jobsToRenew, 100):
        renewed = arc.JobSupervisor(self.usercfg, chunk).Renew()
        if not renewed:
            self.log.warn('At least one of the jobs failed to renew its credentials')

    for chunk in breakListIntoChunks(jobsToCancel, 100):
        cancelled = arc.JobSupervisor(self.usercfg, chunk).Cancel()
        if not cancelled:
            self.log.warn('At least one of the jobs failed to be cancelled')

    if resultDict:
        return S_OK(resultDict)
    return S_ERROR('No job statuses returned')
def check_workers(self, workspec_list):
    """Query ARC for the state of each worker and map it to a WorkSpec status.

    :param workspec_list: worker specifications to check
    :return: tuple (True, list of (status, '') tuples)
    """
    retList = []
    for workspec in workspec_list:

        # make logger
        arclog = arc_utils.ARCLogger(baselogger, workspec.workerID)
        tmplog = arclog.log
        tmplog.info("checking worker id {0}".format(workspec.workerID))
        (job, modtime, proxyrole) = arc_utils.workspec2arcjob(workspec)

        # Set certificate
        userconfig = arc.UserConfig(self.cred_type)
        try:
            userconfig.ProxyPath(str(self.certs[proxyrole]))
        except Exception:
            # Was a bare "except:", which also swallowed KeyboardInterrupt
            # and SystemExit.  Keep the worker's current status.
            tmplog.error("Job {0}: no proxy found with role {1}".format(
                job.JobID, proxyrole))
            retList.append((workspec.status, ''))
            continue

        job_supervisor = arc.JobSupervisor(userconfig, [job])
        job_supervisor.Update()

        jobsupdated = job_supervisor.GetAllJobs()
        jobsnotupdated = job_supervisor.GetIDsNotProcessed()

        for updatedjob in jobsupdated:
            if updatedjob.JobID in jobsnotupdated:
                tmplog.error("Failed to find information on {0}".format(
                    updatedjob.JobID))
                # If missing for too long (2 days), mark as lost
                if arc.Time() - modtime > arc.Period(172800):
                    tmplog.error(
                        "Job {0} missing for more than 2 days, marking as lost"
                        .format(updatedjob.JobID))
                    retList.append((workspec.ST_failed, ''))
                else:
                    retList.append((workspec.status, ''))
                continue

            # Convert arc state to WorkSpec state
            arcstatus = updatedjob.State
            newstatus = WorkSpec.ST_submitted
            if arcstatus == arc.JobState.RUNNING or \
               arcstatus == arc.JobState.FINISHING:
                newstatus = WorkSpec.ST_running
            elif arcstatus == arc.JobState.FINISHED:
                if updatedjob.ExitCode == -1:
                    # Missing exit code, but assume success
                    tmplog.warning(
                        "Job {0} FINISHED but has missing exit code, setting to zero"
                        .format(updatedjob.JobID))
                    updatedjob.ExitCode = 0
                newstatus = WorkSpec.ST_finished
            elif arcstatus == arc.JobState.FAILED:
                newstatus = WorkSpec.ST_failed
                tmplog.info("Job {0} failed: {1}".format(
                    updatedjob.JobID,
                    ";".join([joberr for joberr in updatedjob.Error])))
            elif arcstatus == arc.JobState.KILLED:
                newstatus = WorkSpec.ST_cancelled
            elif arcstatus == arc.JobState.DELETED or \
                 arcstatus == arc.JobState.OTHER:
                # unexpected
                newstatus = WorkSpec.ST_failed
            # Not covered: arc.JobState.HOLD. Maybe need a post-run state in
            # harvester, also to cover FINISHING

            # compare strings here to get around limitations of JobState API
            if job.State.GetGeneralState() == updatedjob.State.GetGeneralState():
                tmplog.debug("Job {0} still in state {1}".format(
                    job.JobID, job.State.GetGeneralState()))
                retList.append((newstatus, ''))
                continue

            tmplog.info("Job {0}: {1} -> {2} ({3})".format(
                job.JobID, job.State.GetGeneralState(),
                updatedjob.State.GetGeneralState(),
                updatedjob.State.GetSpecificState()))

            arc_utils.arcjob2workspec(updatedjob, workspec)
            # Have to force update to change info in DB
            workspec.force_update('workAttributes')
            tmplog.debug("batchStatus {0} -> workerStatus {1}".format(
                arcstatus.GetGeneralState(), newstatus))
            retList.append((newstatus, ''))

    return True, retList
def checkJobs(self):
    '''
    Query the status of active jobs on this cluster and record changes.

    Skipped when called again before the configured minimum interval.
    Jobs whose general state is unchanged only get a fresh timestamp
    (except on a few listed clusters); otherwise the mapped state and the
    updated job object are written to the DB.
    '''
    # minimum time between checks
    mininterval = int(self.conf.get(['jobs', 'checkmintime']))
    if time.time() < self.checktime + mininterval:
        self.log.debug("mininterval not reached")
        return
    self.checktime = time.time()

    # Clusters whose jobs are fully updated on every pass, even when the
    # state is unchanged (superMUC: walltime is missing for finished jobs)
    always_update = (
        'gsiftp://gar-ex-etpgrid1.garching.physik.uni-muenchen.de:2811/preempt',
        'gsiftp://arc1-it4i.farm.particle.cz/qfree',
        'gsiftp://arc2-it4i.farm.particle.cz/qfree',
    )

    def elapsed_since(when):
        # Period between now and the given creation time
        return arc.Time() - arc.Time(int(when.strftime("%s")))

    # check jobs which were last checked more than checkinterval ago
    selection = (
        "arcstate in ('submitted', 'running', 'finishing', 'cancelling', 'holding') and "
        "jobid not like '' and cluster='" + self.cluster + "' and "
        + self.db.timeStampLessThan("tarcstate", self.conf.get(['jobs', 'checkinterval']))
        + " limit 100000")
    jobstocheck = self.db.getArcJobs(selection)

    njobstocheck = sum(len(rows) for rows in jobstocheck.values())
    if not njobstocheck:
        return
    self.log.info("%d jobs to check" % njobstocheck)
    self.resetJobs(jobstocheck)

    # Loop over proxies
    for proxyid, rows in jobstocheck.items():
        self.uc.CredentialString(str(self.db.getProxy(proxyid)))

        supervisor = arc.JobSupervisor(self.uc, [row[2] for row in rows])
        supervisor.Update()
        jobsupdated = supervisor.GetAllJobs()
        jobsnotupdated = supervisor.GetIDsNotProcessed()

        for row, updatedjob in zip(rows, jobsupdated):
            rowid, appjobid, originaljob, created = row

            if updatedjob.JobID in jobsnotupdated:
                self.log.error("%s: Failed to find information on %s" %
                               (appjobid, updatedjob.JobID))
                continue
            if updatedjob.JobID != originaljob.JobID:
                # something went wrong with list order
                self.log.warning(
                    "%s: Bad job id (%s), expected %s" %
                    (appjobid, updatedjob.JobID, originaljob.JobID))
                continue

            # compare strings here to get around limitations of JobState API
            # map INLRMS:S and O to HOLD (not necessary when ARC 4.1 is used)
            if updatedjob.State.GetGeneralState() == 'Queuing' and (
                    updatedjob.State.GetSpecificState() == 'INLRMS:S'
                    or updatedjob.State.GetSpecificState() == 'INLRMS:O'):
                updatedjob.State = arc.JobState('Hold')

            unchanged = (originaljob.State.GetGeneralState()
                         == updatedjob.State.GetGeneralState())
            if unchanged and self.cluster not in always_update:
                # just update timestamp
                self.db.updateArcJob(rowid, {'tarcstate': self.db.getTimeStamp()})
                continue

            self.log.info("%s: Job %s: %s -> %s (%s)" %
                          (appjobid, originaljob.JobID,
                           originaljob.State.GetGeneralState(),
                           updatedjob.State.GetGeneralState(),
                           updatedjob.State.GetSpecificState()))

            # state changed, update whole Job object
            arcstate = 'submitted'
            if updatedjob.State == arc.JobState.FINISHED:
                if updatedjob.ExitCode == -1:
                    # Missing exit code, but assume success
                    self.log.warning(
                        "%s: Job %s FINISHED but has missing exit code, setting to zero"
                        % (appjobid, updatedjob.JobID))
                    updatedjob.ExitCode = 0
                arcstate = 'finished'
                self.log.debug(
                    '%s: reported walltime %d, cputime %d' %
                    (appjobid, updatedjob.UsedTotalWallTime.GetPeriod(),
                     updatedjob.UsedTotalCPUTime.GetPeriod()))
            elif updatedjob.State == arc.JobState.FAILED:
                # EMI-ES reports cancelled jobs as failed so check substate
                # (this is fixed in ARC 6.8)
                if 'cancel' in updatedjob.State.GetSpecificState():
                    arcstate = 'cancelled'
                else:
                    arcstate = self.processJobErrors(rowid, appjobid, updatedjob)
            elif updatedjob.State == arc.JobState.KILLED:
                arcstate = 'cancelled'
            elif updatedjob.State == arc.JobState.RUNNING:
                arcstate = 'running'
            elif updatedjob.State == arc.JobState.FINISHING:
                arcstate = 'finishing'
            elif updatedjob.State == arc.JobState.HOLD:
                arcstate = 'holding'
            elif updatedjob.State == arc.JobState.DELETED or \
                    updatedjob.State == arc.JobState.OTHER:
                # unexpected
                arcstate = 'failed'

            # Walltime reported by ARC 6 is multiplied by cores
            if arc.ARC_VERSION_MAJOR >= 6 and updatedjob.RequestedSlots > 0:
                updatedjob.UsedTotalWallTime = arc.Period(
                    updatedjob.UsedTotalWallTime.GetPeriod() // updatedjob.RequestedSlots)

            # Fix crazy wallclock and CPU times: walltime cannot exceed the
            # time elapsed since the job row was created
            if updatedjob.UsedTotalWallTime > elapsed_since(created):
                fixedwalltime = elapsed_since(created)
                self.log.warning(
                    "%s: Fixing reported walltime %d to %d" %
                    (appjobid, updatedjob.UsedTotalWallTime.GetPeriod(),
                     fixedwalltime.GetPeriod()))
                updatedjob.UsedTotalWallTime = fixedwalltime
            if updatedjob.UsedTotalCPUTime > arc.Period(10**7):
                self.log.warning(
                    "%s: Discarding reported CPUtime %d" %
                    (appjobid, updatedjob.UsedTotalCPUTime.GetPeriod()))
                updatedjob.UsedTotalCPUTime = arc.Period(-1)

            self.db.updateArcJob(
                rowid, {
                    'arcstate': arcstate,
                    'tarcstate': self.db.getTimeStamp(),
                    'tstate': self.db.getTimeStamp()
                }, updatedjob)

    self.log.info('Done')