def _getARCJob(self, jobID): """Create an ARC Job with all the needed / possible parameters defined. By the time we come here, the environment variable X509_USER_PROXY should already be set """ j = arc.Job() j.JobID = str(jobID) j.IDFromEndpoint = os.path.basename(j.JobID) if self.endpointType == "Gridftp": statURL = "ldap://%s:2135/Mds-Vo-Name=local,o=grid??sub?(nordugrid-job-globalid=%s)" % ( self.ceHost, jobID) j.JobStatusURL = arc.URL(str(statURL)) j.JobStatusInterfaceName = "org.nordugrid.ldapng" mangURL = "gsiftp://%s:2811/jobs/" % (self.ceHost) j.JobManagementURL = arc.URL(str(mangURL)) j.JobManagementInterfaceName = "org.nordugrid.gridftpjob" j.ServiceInformationURL = j.JobManagementURL j.ServiceInformationInterfaceName = "org.nordugrid.ldapng" else: commonURL = "https://%s:8443/arex" % self.ceHost j.JobStatusURL = arc.URL(str(commonURL)) j.JobStatusInterfaceName = "org.ogf.glue.emies.activitymanagement" j.JobManagementURL = arc.URL(str(commonURL)) j.JobManagementInterfaceName = "org.ogf.glue.emies.activitymanagement" j.ServiceInformationURL = arc.URL(str(commonURL)) j.ServiceInformationInterfaceName = "org.ogf.glue.emies.resourceinfo" j.PrepareHandler(self.usercfg) return j
def example():
    # Creating a UserConfig object with the user's proxy
    # and the path of the trusted CA certificates
    uc = arc.UserConfig()
    uc.ProxyPath("/tmp/x509up_u%s" % os.getuid())
    uc.CACertificatesDirectory("/etc/grid-security/certificates")

    # Create a new job object with a given JobID
    job = arc.Job()
    job.JobID = "https://piff.hep.lu.se:443/arex/w7LNDmSkEiun1ZPzno6AuCjpABFKDmABFKDmZ9LKDmUBFKDmXugZwm"
    job.IDFromEndpoint = "w7LNDmSkEiun1ZPzno6AuCjpABFKDmABFKDmZ9LKDmUBFKDmXugZwm"
    job.JobManagementURL = arc.URL("https://piff.hep.lu.se:443/arex")
    job.JobStatusURL = arc.URL("https://piff.hep.lu.se:443/arex")
    job.JobStatusInterfaceName = 'org.ogf.glue.emies.activitymanagement'
    job.JobManagementInterfaceName = 'org.ogf.glue.emies.activitymanagement'

    sys.stdout.write("Job object before update:\n")
    job.SaveToStream(arc.CPyOstream(sys.stdout), True)

    job_supervisor = arc.JobSupervisor(uc, [job])

    # Update the states of jobs within this JobSupervisor
    job_supervisor.Update()

    # Get our updated job from the JobSupervisor
    jobs = job_supervisor.GetAllJobs()
    if not jobs:
        sys.stdout.write("No jobs found\n")
        return

    job = jobs[0]
    sys.stdout.write("Job object after update:\n")
    job.SaveToStream(arc.CPyOstream(sys.stdout), True)
def example():
    # Creating a UserConfig object with the user's proxy
    # and the path of the trusted CA certificates
    uc = arc.UserConfig()
    uc.ProxyPath("/tmp/x509up_u%s" % os.getuid())
    uc.CACertificatesDirectory("/etc/grid-security/certificates")

    # Create a new job object with a given JobID
    job = arc.Job()
    job.JobID = "https://piff.hep.lu.se:443/arex/1QuMDmRwvUfn5h5iWqkutBwoABFKDmABFKDmIpHKDmXBFKDmIuAean"
    job.Flavour = "ARC1"
    job.JobManagementURL = arc.URL("https://piff.hep.lu.se:443/arex")
    job.JobStatusURL = arc.URL("https://piff.hep.lu.se:443/arex")

    sys.stdout.write("Job object before update:\n")
    job.SaveToStream(arc.CPyOstream(sys.stdout), True)

    job_supervisor = arc.JobSupervisor(uc, [job])

    # Update the states of jobs within this JobSupervisor
    job_supervisor.Update()

    # Get our updated job from the JobSupervisor
    jobs = job_supervisor.GetAllJobs()
    if not jobs:
        sys.stdout.write("No jobs found\n")
        return

    job = jobs[0]
    sys.stdout.write("Job object after update:\n")
    job.SaveToStream(arc.CPyOstream(sys.stdout), True)
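# Sketch (assumption, not part of the original ARC examples): instead of dumping
# the whole job object to a stream, the refreshed state can be read directly.
def general_state(job):
    # job.State is populated by JobSupervisor.Update(); "Undefined" usually
    # means the CE could not be queried (e.g. expired proxy or bad endpoint).
    return job.State.GetGeneralState()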
def _getARCJob(self, jobID): """Create an ARC Job with all the needed / possible parameters defined. By the time we come here, the environment variable X509_USER_PROXY should already be set """ j = arc.Job() j.JobID = str(jobID) j.IDFromEndpoint = os.path.basename(j.JobID) # Get the endpoint type (GridFTP or AREX) endpointType = j.JobID.split(":")[0] if endpointType == "gsiftp": statURL = "ldap://%s:2135/Mds-Vo-Name=local,o=grid??sub?(nordugrid-job-globalid=%s)" % (self.ceHost, jobID) j.JobStatusURL = arc.URL(str(statURL)) j.JobStatusInterfaceName = "org.nordugrid.ldapng" mangURL = os.path.dirname(j.JobID) j.JobManagementURL = arc.URL(str(mangURL)) j.JobManagementInterfaceName = "org.nordugrid.gridftpjob" j.ServiceInformationURL = j.JobManagementURL j.ServiceInformationInterfaceName = "org.nordugrid.ldapng" else: commonURL = "/".join(j.JobID.split("/")[0:4]) j.JobStatusURL = arc.URL(str(commonURL)) j.JobStatusInterfaceName = "org.nordugrid.arcrest" j.JobManagementURL = arc.URL(str(commonURL)) j.JobManagementInterfaceName = "org.nordugrid.arcrest" j.ServiceInformationURL = arc.URL(str(commonURL)) j.ServiceInformationInterfaceName = "org.nordugrid.arcrest" j.PrepareHandler(self.usercfg) return j
def example():
    # Creating a UserConfig object with the user's proxy
    # and the path of the trusted CA certificates
    uc = arc.UserConfig()
    uc.ProxyPath("/tmp/x509up_u%s" % os.getuid())
    uc.CACertificatesDirectory("/etc/grid-security/certificates")

    # Create a new job object with a given JobID
    job = arc.Job()
    job.JobID = "https://piff.hep.lu.se:443/arex/hYDLDmyxvUfn5h5iWqkutBwoABFKDmABFKDmIpHKDmYBFKDmtRy9En"
    job.Flavour = "ARC1"
    job.ServiceInformationURL = job.JobStatusURL = job.JobManagementURL = arc.URL("https://piff.hep.lu.se:443/arex")

    sys.stdout.write("Get job information from the computing element...\n")
    # Put the job into a JobSupervisor and update its information
    job_supervisor = arc.JobSupervisor(uc, [job])
    job_supervisor.Update()

    sys.stdout.write("Downloading results...\n")
    # Prepare a list for storing the directories for the downloaded job results (if there would be more jobs)
    downloadeddirectories = arc.StringList()
    # Start retrieving results of all the selected jobs
    #   into the "/tmp" directory (first argument)
    #   using the jobid and not the jobname as the name of the subdirectory (second argument, usejobname = False)
    #   do not overwrite existing directories with the same name (third argument: force = False)
    #   collect the downloaded directories into the variable "downloadeddirectories" (fourth argument)
    success = job_supervisor.Retrieve("/tmp", False, False, downloadeddirectories)
    if not success:
        sys.stdout.write("Downloading results failed.\n")
    for downloadeddirectory in downloadeddirectories:
        sys.stdout.write("Job results were downloaded to %s\n" % str(downloadeddirectory))
        sys.stdout.write("Contents of the directory:\n")
        for filename in os.listdir(downloadeddirectory):
            sys.stdout.write("  %s\n" % filename)
def test():
    from pandaharvester.harvestercore.work_spec import WorkSpec
    wspec = WorkSpec()
    jobid = "gsiftp://pcoslo5.cern.ch:2811/jobs/XkNNDmultdtn1ZPzno6AuCjpABFKDmABFKDmwqyLDmABFKDm8dOcOn"
    wspec.batchID = jobid
    workAttributes = {"arcjob": {}}
    workAttributes["arcjob"]["JobID"] = wspec.batchID
    workAttributes["arcjob"]["JobStatusURL"] = "ldap://{0}:2135/mds-vo-name=local,o=grid??sub?(nordugrid-job-globalid={1})".format(
        urlparse(jobid).netloc, jobid)
    workAttributes["arcjob"]["JobStatusInterfaceName"] = "org.nordugrid.ldapng"
    jobmanagementurl = arc.URL(wspec.batchID)
    jobmanagementurl.ChangePath("/jobs")
    workAttributes["arcjob"]["JobManagementURL"] = jobmanagementurl.str()
    workAttributes["arcjob"]["JobManagementInterfaceName"] = "org.nordugrid.gridftpjob"
    workAttributes["proxyrole"] = 'production'

    wspec.workAttributes = workAttributes
    wspec.accessPoint = '/tmp'
    wspec.mapType = WorkSpec.MT_OneToOne
    wspec.pandaid_list = [1234]
    print(wspec.workAttributes)

    messenger = ARCMessenger()
    print(messenger.events_requested(wspec))
    print(messenger.feed_events(wspec, {'event': 1234}))
    print(messenger.events_to_update(wspec))
    messenger.acknowledge_events_files(wspec)
def save_job_outputs(self, job_id):
    """
    Retrieve output files from a job and save them to a temp directory.

    The file/directory specified in `OUTPUT_FILE` will be downloaded, and
    ``stdout`` and ``stderr`` outputs are saved as ``stdout.txt`` and
    ``stderr.txt`` respectively.

    :param job_id: ID of the job as returned by `submit_job`
    :raises JobNotFoundError: if no job with the given ID could be found
    :return: Path to the directory the output files were saved in, or ``None``
             if no files were saved
    """
    job = self.get_job(job_id)
    user_config = self.get_user_config()
    temp_dir = tempfile.mkdtemp()

    # Last argument is 'force' - whether to continue if destination directory already exists
    success = job.Retrieve(user_config, arc.URL("file://{}".format(temp_dir)), True)

    # Remove temp dir and fail if no files were downloaded
    if not os.listdir(temp_dir):
        success = False
        os.rmdir(temp_dir)

    return temp_dir if success else None
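# Sketch (assumption, not from the original source): reading the stdout of a
# finished job through save_job_outputs. "client" is a hypothetical instance of
# the class the method above belongs to.
import os

def print_job_stdout(client, job_id):
    out_dir = client.save_job_outputs(job_id)
    if out_dir is None:
        print("No output files were retrieved")
        return
    # stdout is saved as stdout.txt according to the docstring above
    with open(os.path.join(out_dir, "stdout.txt")) as stdout_file:
        print(stdout_file.read())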
def __getARCJob( self, jobID ):
    """ Create an ARC Job with all the needed / possible parameters defined.
        By the time we come here, the environment variable X509_USER_PROXY should already be set
    """
    j = arc.Job()
    j.JobID = jobID
    statURL = "ldap://%s:2135/Mds-Vo-Name=local,o=grid??sub?(nordugrid-job-globalid=%s)" % ( self.ceHost, jobID )
    j.JobStatusURL = arc.URL( statURL )
    j.JobStatusInterfaceName = "org.nordugrid.ldapng"
    mangURL = "gsiftp://%s:2811/jobs/" % ( self.ceHost )
    j.JobManagementURL = arc.URL( mangURL )
    j.JobManagementInterfaceName = "org.nordugrid.gridftpjob"
    j.ServiceInformationURL = j.JobManagementURL
    j.ServiceInformationInterfaceName = "org.nordugrid.ldapng"
    j.PrepareHandler( self.usercfg )
    return j
def getJobOutput(self, jobID, localDir=None):
    """ Get the specified job standard output and error files. If the localDir is provided,
        the output is returned as file in this directory. Otherwise, the output is returned
        as strings.
    """
    result = self._prepareProxy()
    if not result['OK']:
        gLogger.error('ARCComputingElement: failed to set up proxy', result['Message'])
        return result
    self.usercfg.ProxyPath(os.environ['X509_USER_PROXY'])

    if jobID.find(':::') != -1:
        pilotRef, stamp = jobID.split(':::')
    else:
        pilotRef = jobID
        stamp = ''
    if not stamp:
        return S_ERROR('Pilot stamp not defined for %s' % pilotRef)

    job = self.__getARCJob(pilotRef)

    arcID = os.path.basename(pilotRef)
    gLogger.debug("Retrieving pilot logs for %s" % pilotRef)
    if "WorkingDirectory" in self.ceParameters:
        workingDirectory = os.path.join(self.ceParameters['WorkingDirectory'], arcID)
    else:
        workingDirectory = arcID
    outFileName = os.path.join(workingDirectory, '%s.out' % stamp)
    errFileName = os.path.join(workingDirectory, '%s.err' % stamp)
    gLogger.debug("Working directory for pilot output %s" % workingDirectory)

    isItOkay = job.Retrieve(self.usercfg, arc.URL(workingDirectory), False)
    if isItOkay:
        outFile = open(outFileName, 'r')
        output = outFile.read()
        outFile.close()
        os.unlink(outFileName)
        errFile = open(errFileName, 'r')
        error = errFile.read()
        errFile.close()
        os.unlink(errFileName)
        gLogger.debug("Pilot output = %s" % output)
        gLogger.debug("Pilot error = %s" % error)
    else:
        job.Update()
        arcState = job.State.GetGeneralState()
        if arcState != "Undefined":
            return S_ERROR('Failed to retrieve output for %s as job is not finished (maybe not started yet)' % jobID)
        gLogger.debug("Could not retrieve pilot output for %s - either permission / proxy error or could not connect to CE" % pilotRef)
        return S_ERROR('Failed to retrieve output for %s' % jobID)

    return S_OK((output, error))
def create_test_job(self,
                    job_id="http://test.nordugrid.org/testid",
                    cluster="http://test.nordugrid.org",
                    state=arc.JobState.RUNNING,
                    state_text=None,
                    job_description="non-empty"):
    job = arc.Job()
    job.JobID = job_id
    job.ServiceInformationInterfaceName = job.JobStatusInterfaceName = job.JobManagementInterfaceName = "org.nordugrid.test"
    job.ServiceInformationURL = job.JobStatusURL = job.JobManagementURL = arc.URL(cluster)
    if state_text is None:
        job.State = arc.JobStateTEST(state)
    else:
        job.State = arc.JobStateTEST(state, state_text)
    job.JobDescriptionDocument = job_description
    return job
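# Sketch (assumption, not part of the original test suite): exercising
# create_test_job inside a test method of the same test case class.
def test_default_job_fields(self):
    job = self.create_test_job(state=arc.JobState.RUNNING)
    # The TEST job state plugin maps the enum onto the usual general states
    self.assertEqual("Running", job.State.GetGeneralState())
    self.assertEqual("org.nordugrid.test", job.JobManagementInterfaceName)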
def extractOutputFilesFromMetadata(self, arcjobid):
    aj = self.dbarc.getArcJobInfo(arcjobid, columns=["JobID", "appjobid"])
    if not aj or 'JobID' not in aj or not aj['JobID']:
        self.log.error("failed to find arcjobid %s in database" % arcjobid)
        return {}

    jobid = aj['JobID']
    sessionid = jobid[jobid.rfind('/') + 1:]
    try:
        jobinfo = aCTPandaJob(filename=os.path.join(self.tmpdir, sessionid, 'heartbeat.json'))
        metadata = getattr(jobinfo, 'xml')  # travis doesn't like jobinfo.xml
    except Exception as x:
        self.log.error("%s: failed to extract metadata for arcjob %s: %s" % (aj['appjobid'], sessionid, x))
        return {}

    try:
        outputfiles = json.loads(metadata)
    except Exception as e:
        self.log.error("%s: failed to load output file info for arcjob %s: %s" % (aj['appjobid'], sessionid, str(e)))
        return {}

    surls = {}
    for attrs in outputfiles.values():
        try:
            size = attrs['fsize']
            adler32 = attrs['adler32']
            surl = attrs['surl']
            se = arc.URL(str(surl)).Host()
        except Exception as x:
            self.log.error('%s: %s' % (aj['appjobid'], x))
        else:
            checksum = "adler32:" + adler32
            if se not in surls:
                surls[se] = []
            surls[se] += [{"surl": surl, "fsize": size, "checksum": checksum, "arcjobid": arcjobid}]

    return surls
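# Sketch (assumption, illustrative values only): the dictionary returned by
# extractOutputFilesFromMetadata is keyed by storage element host, with one
# entry per output file reported by the job.
example_surls = {
    "se.example.org": [
        {
            "surl": "srm://se.example.org/path/to/output.root",
            "fsize": 123456,
            "checksum": "adler32:0a1b2c3d",
            "arcjobid": 42,
        },
    ],
}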
def rename(self, pfn, new_pfn):
    """ Allows to rename a file stored inside the connected RSE.

        :param pfn      Current physical file name
        :param new_pfn  New physical file name

        :raises DestinationNotAccessible, ServiceUnavailable, SourceNotFound
    """
    dp = DataPoint(str(pfn), self.cfg)
    if dp.h is None:
        raise ServiceUnavailable("Can't handle pfn %s" % pfn)

    url = arc.URL(str(new_pfn))
    if not url:
        raise ServiceUnavailable("Can't handle new pfn %s" % new_pfn)

    status = dp.h.Rename(url)
    if not status:
        if status.GetErrno() == errno.ENOENT:
            raise SourceNotFound()
        raise ServiceUnavailable(str(status))
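# Sketch (assumption, not from the original source): calling the rename method
# above on an already-connected protocol instance. SourceNotFound and
# ServiceUnavailable are assumed to be the same exception classes the module
# already imports; "protocol" and the PFNs are placeholders.
def safe_rename(protocol, pfn, new_pfn):
    try:
        protocol.rename(pfn, new_pfn)
    except SourceNotFound:
        print("Nothing to rename, %s does not exist" % pfn)
    except ServiceUnavailable as exc:
        print("Rename failed: %s" % exc)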
def test(jobid):
    '''Test checking status'''
    from pandaharvester.harvestercore.work_spec import WorkSpec
    wspec = WorkSpec()
    wspec.batchID = jobid  # e.g. "gsiftp://pikolit.ijs.si:2811/jobs/HtgKDmtCe7qn4J8tmqCBXHLnABFKDmABFKDmBcGKDmABFKDm4NCTCn"
    workAttributes = {"arcjob": {}}
    workAttributes["arcjob"]["JobID"] = wspec.batchID
    workAttributes["arcjob"]["JobStatusURL"] = "ldap://{0}:2135/mds-vo-name=local,o=grid??sub?(nordugrid-job-globalid={1})".format(
        urlparse.urlparse(jobid).netloc, jobid)
    workAttributes["arcjob"]["JobStatusInterfaceName"] = "org.nordugrid.ldapng"
    jobmanagementurl = arc.URL(wspec.batchID)
    jobmanagementurl.ChangePath("/jobs")
    workAttributes["arcjob"]["JobManagementURL"] = jobmanagementurl.str()
    workAttributes["arcjob"]["JobManagementInterfaceName"] = "org.nordugrid.gridftpjob"
    wspec.workAttributes = workAttributes
    print(wspec.workAttributes)

    monitor = ARCMonitor()
    print(monitor.check_workers([wspec]))
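# Sketch (assumption, not part of the original module): running the status check
# from the command line, with a gsiftp job ID passed as the first argument.
if __name__ == "__main__":
    import sys
    test(sys.argv[1])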
def copyFinishedFiles(self, arcjobid, extractmetadata):
    """
    - if extractmetadata: (normal arc jobs, not true pilot jobs)
       - extract panda_node_struct.pickle from jobSmallFiles.tgz and store it under tmp/pickle
       - extract metadata-surl.xml and update pickle. store xml under tmp/xml
    - copy .job.log file to jobs/date/cluster/jobid
    - copy gmlog dir to jobs/date/cluster/jobid
    """
    columns = ['JobID', 'appjobid', 'cluster', 'UsedTotalWallTime', 'EndTime', 'ExecutionNode', 'stdout']
    aj = self.dbarc.getArcJobInfo(arcjobid, columns=columns)
    if 'JobID' not in aj or not aj['JobID']:
        self.log.error('No JobID in arcjob %s: %s' % (str(arcjobid), str(aj)))
        return False
    jobid = aj['JobID']
    sessionid = jobid[jobid.rfind('/') + 1:]
    date = time.strftime('%Y%m%d')
    cluster = arc.URL(str(jobid)).Host()

    if extractmetadata:
        try:
            pandapickle = self._extractFromSmallFiles(aj, "panda_node_struct.pickle")
        except Exception as x:
            self.log.error("%s: failed to extract pickle for arcjob %s: %s" % (aj['appjobid'], sessionid, x))
            pandapickle = None
        try:
            metadata = self._extractFromSmallFiles(aj, "metadata-surl.xml")
        except Exception as x:
            self.log.error("%s: failed to extract metadata-surl.xml for arcjob %s: %s" % (aj['appjobid'], sessionid, x))
            metadata = None
def test(jobid):
    '''Kill a job'''
    from pandaharvester.harvestercore.work_spec import WorkSpec
    import json
    wspec = WorkSpec()
    wspec.batchID = jobid
    workAttributes = {"arcjob": {}}
    workAttributes["arcjob"]["JobID"] = wspec.batchID
    workAttributes["arcjob"]["JobStatusURL"] = "ldap://{0}:2135/mds-vo-name=local,o=grid??sub?(nordugrid-job-globalid={1})".format(
        urlparse.urlparse(jobid).netloc, wspec.batchID)
    workAttributes["arcjob"]["JobStatusInterfaceName"] = "org.nordugrid.ldapng"
    jobmanagementurl = arc.URL(wspec.batchID)
    jobmanagementurl.ChangePath("/jobs")
    workAttributes["arcjob"]["JobManagementURL"] = jobmanagementurl.str()
    workAttributes["arcjob"]["JobManagementInterfaceName"] = "org.nordugrid.gridftpjob"
    wspec.workAttributes = workAttributes
    print(wspec.workAttributes)

    sweeper = ARCSweeper()
    print(sweeper.kill_worker(wspec))
def infinite(self, url):
    logger.msg(arc.INFO, "EchoService (python) thread test starting")
    i = 0
    while True:
        try:
            i += 1
            cfg = arc.MCCConfig()
            s = arc.ClientSOAP(cfg, arc.URL(url))
            ns = arc.NS('echo', echo_ns)
            outpayload = arc.PayloadSOAP(ns)
            outpayload.NewChild('echo:echo').NewChild('echo:say').Set('hi!')
            resp, status = s.process(outpayload)
            logger.msg(arc.INFO,
                       "EchoService (python) thread test, iteration %(iteration)s %(status)s" %
                       {'iteration': i, 'status': status})
            time.sleep(3)
        except Exception as e:
            import traceback
            logger.msg(arc.DEBUG, traceback.format_exc())
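# Sketch (assumption, not from the original service code): the infinite() loop
# above is meant to run in the background; one way to start it is a daemon
# thread created at service start-up. "service" is the EchoService instance,
# and the URL matches the local endpoint used elsewhere in these examples.
import threading

def start_echo_thread(service, url="http://localhost:60000/Echo"):
    # daemon=True so the thread does not keep the process alive on shutdown
    t = threading.Thread(target=service.infinite, args=(url,), daemon=True)
    t.start()
    return t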
def submit(self): """ Main function to submit jobs. """ global queuelist # check for stopsubmission flag if self.conf.get(['downtime', 'stopsubmission']) == "true": self.log.info('Submission suspended due to downtime') return # check for any site-specific limits or status clusterstatus = self.conf.getCond(["sites", "site"], f"endpoint={self.cluster}", ["status"]) or 'online' if clusterstatus == 'offline': self.log.info('Site status is offline') return clustermaxjobs = int( self.conf.getCond(["sites", "site"], f"endpoint={self.cluster}", ["maxjobs"]) or 999999) nsubmitted = self.db.getNArcJobs(f"cluster='{self.cluster}'") if nsubmitted >= clustermaxjobs: self.log.info( f'{nsubmitted} submitted jobs is greater than or equal to max jobs {clustermaxjobs}' ) return # Get cluster host and queue: cluster/queue clusterhost = clusterqueue = None if self.cluster: cluster = self.cluster if cluster.find('://') == -1: cluster = 'gsiftp://' + cluster clusterurl = arc.URL(cluster) clusterhost = clusterurl.Host() clusterqueue = clusterurl.Path()[1:] # strip off leading slash # Apply fair-share if self.cluster: fairshares = self.db.getArcJobsInfo( "arcstate='tosubmit' and clusterlist like '%" + self.cluster + "%'", ['fairshare', 'proxyid']) else: fairshares = self.db.getArcJobsInfo( "arcstate='tosubmit' and clusterlist=''", ['fairshare', 'proxyid']) if not fairshares: self.log.info('Nothing to submit') return # split by proxy for GU queues fairshares = list( set([(p['fairshare'], p['proxyid']) for p in fairshares])) # For proxy bug - see below shuffle(fairshares) for fairshare, proxyid in fairshares: # apply maxjobs limit (check above should make sure greater than zero) # Note: relies on exit after first loop limit = min(clustermaxjobs - nsubmitted, 10) try: # catch any exceptions here to avoid leaving lock if self.cluster: # Lock row for update in case multiple clusters are specified #jobs=self.db.getArcJobsInfo("arcstate='tosubmit' and ( clusterlist like '%{0}' or clusterlist like '%{0},%' ) and fairshare='{1}' order by priority desc limit 10".format(self.cluster, fairshare), jobs = self.db.getArcJobsInfo( "arcstate='tosubmit' and ( clusterlist like '%{0}' or clusterlist like '%{0},%' ) and fairshare='{1}' and proxyid='{2}' limit {3}" .format(self.cluster, fairshare, proxyid, limit), columns=[ "id", "jobdesc", "appjobid", "priority", "proxyid", "clusterlist" ], lock=True) if jobs: self.log.debug("started lock for writing %d jobs" % len(jobs)) else: jobs = self.db.getArcJobsInfo( "arcstate='tosubmit' and clusterlist='' and fairshare='{0} and proxyid={1}' limit {2}" .format(fairshare, proxyid, limit), columns=[ "id", "jobdesc", "appjobid", "priority", "proxyid", "clusterlist" ]) # mark submitting in db jobs_taken = [] for j in jobs: jd = { 'cluster': self.cluster, 'arcstate': 'submitting', 'tarcstate': self.db.getTimeStamp() } self.db.updateArcJobLazy(j['id'], jd) jobs_taken.append(j) jobs = jobs_taken finally: if self.cluster: try: self.db.Commit(lock=True) self.log.debug("ended lock") except: self.log.warning("Failed to release DB lock") else: self.db.Commit() if len(jobs) == 0: #self.log.debug("No jobs to submit") continue self.log.info( "Submitting %d jobs for fairshare %s and proxyid %d" % (len(jobs), fairshare, proxyid)) # max waiting priority try: maxpriowaiting = max(jobs, key=lambda x: x['priority'])['priority'] except: maxpriowaiting = 0 self.log.info("Maximum priority of waiting jobs: %d" % maxpriowaiting) # Query infosys - either local or index if self.cluster: if self.cluster.find('://') != 
-1: aris = arc.URL(self.cluster) else: aris = arc.URL('gsiftp://%s' % self.cluster) if aris.Protocol() == 'https': aris.ChangePath('/arex') infoendpoints = [ arc.Endpoint(aris.str(), arc.Endpoint.COMPUTINGINFO, 'org.ogf.glue.emies.resourceinfo') ] elif aris.Protocol() == 'local': infoendpoints = [ arc.Endpoint(aris.str(), arc.Endpoint.COMPUTINGINFO, 'org.nordugrid.local') ] else: aris = 'ldap://' + aris.Host( ) + '/mds-vo-name=local,o=grid' infoendpoints = [ arc.Endpoint(aris, arc.Endpoint.COMPUTINGINFO, 'org.nordugrid.ldapng') ] else: giises = self.conf.getList(['atlasgiis', 'item']) infoendpoints = [] for g in giises: # Specify explicitly EGIIS infoendpoints.append( arc.Endpoint(str(g), arc.Endpoint.REGISTRY, "org.nordugrid.ldapegiis")) # Set UserConfig credential for querying infosys proxystring = str(self.db.getProxy(proxyid)) self.uc.CredentialString(proxystring) global usercred usercred = self.uc # retriever contains a list of CE endpoints retriever = arc.ComputingServiceRetriever(self.uc, infoendpoints) retriever.wait() # targets is the list of queues # parse target.ComputingService.ID for the CE hostname # target.ComputingShare.Name is the queue name targets = retriever.GetExecutionTargets() # Filter only sites for this process queuelist = [] for target in targets: if not target.ComputingService.ID: self.log.info( "Target %s does not have ComputingService ID defined, skipping" % target.ComputingService.Name) continue # If EMI-ES infoendpoint, force EMI-ES submission if infoendpoints[ 0].InterfaceName == 'org.ogf.glue.emies.resourceinfo' and target.ComputingEndpoint.InterfaceName != 'org.ogf.glue.emies.activitycreation': self.log.debug( "Rejecting target interface %s because not EMI-ES" % target.ComputingEndpoint.InterfaceName) continue # Check for matching host and queue targethost = re.sub( ':arex$', '', re.sub('urn:ogf:ComputingService:', '', target.ComputingService.ID)) targetqueue = target.ComputingShare.Name if clusterhost and targethost != clusterhost: self.log.debug( 'Rejecting target host %s as it does not match %s' % (targethost, clusterhost)) continue if clusterqueue and targetqueue != clusterqueue: self.log.debug( 'Rejecting target queue %s as it does not match %s' % (targetqueue, clusterqueue)) continue if targetqueue in self.conf.getList(['queuesreject', 'item']): self.log.debug( 'Rejecting target queue %s in queuesreject list' % targetqueue) continue elif targethost in self.conf.getList( ['clustersreject', 'item']): self.log.debug( 'Rejecting target host %s in clustersreject list' % targethost) continue else: # tmp hack target.ComputingShare.LocalWaitingJobs = 0 target.ComputingShare.PreLRMSWaitingJobs = 0 target.ExecutionEnvironment.CPUClockSpeed = 2000 qjobs = self.db.getArcJobsInfo( "cluster='" + str(self.cluster) + "' and arcstate='submitted' and fairshare='%s'" % fairshare, ['id', 'priority']) rjobs = self.db.getArcJobsInfo( "cluster='" + str(self.cluster) + "' and arcstate='running' and fairshare='%s'" % fairshare, ['id']) # max queued priority try: maxprioqueued = max( qjobs, key=lambda x: x['priority'])['priority'] except: maxprioqueued = 0 self.log.info("Max priority queued: %d" % maxprioqueued) # Limit number of submitted jobs using configuration or default (0.15 + 100/num of shares) # Note: assumes only a few shares are used qfraction = float(self.conf.get([ 'jobs', 'queuefraction' ])) if self.conf.get(['jobs', 'queuefraction']) else 0.15 qoffset = int(self.conf.get([ 'jobs', 'queueoffset' ])) if self.conf.get(['jobs', 'queueoffset']) else 100 jlimit = 
len(rjobs) * qfraction + qoffset / len(fairshares) self.log.debug("running %d, queued %d, queue limit %d" % (len(rjobs), len(qjobs), jlimit)) if str(self.cluster).find('arc-boinc-0') != -1: jlimit = len(rjobs) * 0.15 + 400 if str(self.cluster).find('XXXpikolit') != -1: jlimit = len(rjobs) * 0.15 + 100 if str(self.cluster).find('arc05.lcg') != -1: jlimit = len(rjobs) * 0.15 + 400 target.ComputingShare.PreLRMSWaitingJobs = len(qjobs) if len(qjobs) < jlimit or ((maxpriowaiting > maxprioqueued) and (maxpriowaiting > 10)): if maxpriowaiting > maxprioqueued: self.log.info( "Overriding limit, maxpriowaiting: %d > maxprioqueued: %d" % (maxpriowaiting, maxprioqueued)) queuelist.append(target) self.log.debug("Adding target %s:%s" % (targethost, targetqueue)) else: self.log.info( "%s/%s already at limit of submitted jobs for fairshare %s" % (targethost, targetqueue, fairshare)) # check if any queues are available, if not leave and try again next time if not queuelist: self.log.info("No free queues available") self.db.Commit() continue self.log.info("start submitting") # Just run one thread for each job in sequence. Strange things happen # when trying to create a new UserConfig object for each thread. tasks = [] for j in jobs: self.log.debug("%s: preparing submission" % j['appjobid']) jobdescstr = str( self.db.getArcJobDescription(str(j['jobdesc']))) jobdescs = arc.JobDescriptionList() if not jobdescstr or not arc.JobDescription_Parse( jobdescstr, jobdescs): self.log.error("%s: Failed to prepare job description" % j['appjobid']) continue tasks.append((j['id'], j['appjobid'], jobdescstr, proxystring, int(self.conf.get(['atlasgiis', 'timeout'])))) npools = 1 if any(s in self.cluster for s in self.conf.getList(['parallelsubmit', 'item'])): npools = int(self.conf.get(['parallelsubmit', 'npools'])) self.log.debug("Starting submitters: %s" % npools) pool = multiprocessing.Pool(npools) #results = [] #for task in tasks: # result = pool.apply_async(Submit,(task)) # results.append(result) # Submit in workers results = [pool.apply_async(Submit, (t)) for t in tasks] # timeout per submission timeout = 60 stopflag = False for result, task in zip(results, tasks): try: jdb = result.get(timeout) jconv = JobConv() job = jconv.db2job(jdb) except multiprocessing.TimeoutError: self.log.error( "%s: submission timeout: exit and try again" % task[1]) # abort submission if Submit process is stuck #pool.terminate() KillPool(pool) pool.join() stopflag = True # reduce timeout to finish quickly timeout = 0.1 continue if job is None: self.log.error("%s: no job defined for %d" % (task[1], task[0])) continue jd = {} jd['arcstate'] = 'submitted' # initial offset to 1 minute to force first status check jd['tarcstate'] = self.db.getTimeStamp( time.time() - int(self.conf.get(['jobs', 'checkinterval'])) + 120) jd['tstate'] = self.db.getTimeStamp() # extract hostname of cluster (depends on JobID being a URL) self.log.info("%s: job id %s" % (task[1], job.JobID)) jd['cluster'] = self.cluster self.db.updateArcJobLazy(task[0], jd, job) if not stopflag: pool.terminate() pool.join() else: # stop submitting, gsiftp connection problem likely raise ExceptInterrupt(15) self.log.info("threads finished") # commit transaction to release row locks self.db.Commit() # still proxy bug - exit if there are multiple proxies if len(self.db.getProxiesInfo('TRUE', ['id'])) > 1: raise ExceptInterrupt(15) self.log.info("end submitting") return
def updatePandaHeartbeat(self,pstatus): """ Heartbeat status updates. """ nthreads=int(self.conf.get(["panda","threads"])) columns = ['pandaid', 'siteName', 'startTime', 'computingElement', 'node', 'corecount', 'eventranges'] jobs=self.dbpanda.getJobs("pandastatus='"+pstatus+"' and sendhb=1 and ("+self.dbpanda.timeStampLessThan("theartbeat", self.conf.get(['panda','heartbeattime']))+" or modified > theartbeat) limit 1000", columns) if not jobs: return self.log.info("Update heartbeat for %d jobs in state %s (%s)" % (len(jobs), pstatus, ','.join([str(j['pandaid']) for j in jobs]))) changed_pstatus = False if pstatus == 'sent': pstatus = 'starting' changed_pstatus = True tlist=[] for j in jobs: # Don't send transferring heartbeat for ES jobs, they must be in running while events are updated if pstatus == 'transferring' and j['eventranges']: pstatus = 'running' jd = {} if pstatus != 'starting': jd['startTime'] = j['startTime'] if j['computingElement']: if j['computingElement'].find('://') != -1: # this if is only needed during transition period jd['computingElement'] = arc.URL(str(j['computingElement'])).Host() else: jd['computingElement'] = j['computingElement'] jd['node'] = j['node'] jd['siteName'] = j['siteName'] # For starting truepilot jobs send pilotID with expected log # location so logs are available in case of lost heartbeat if pstatus == 'starting' and not changed_pstatus and self.sites[j['siteName']]['truepilot']: date = time.strftime('%Y-%m-%d', time.gmtime()) logurl = '/'.join([self.conf.get(["joblog","urlprefix"]), date, j['siteName'], '%s.out' % j['pandaid']]) jd['pilotID'] = '%s|Unknown|Unknown|Unknown|Unknown' % logurl try: jd['jobMetrics']="coreCount=%s" % (j['corecount'] if j['corecount'] > 0 else self.sites[j['siteName']]['corecount']) except: pass t=PandaThr(self.getPanda(j['siteName']).updateStatus,j['pandaid'],pstatus,jd) tlist.append(t) aCTUtils.RunThreadsSplit(tlist,nthreads) for t in tlist: if t.result == None or 'StatusCode' not in t.result: # Strange response from panda, try later continue if t.result['StatusCode'] and t.result['StatusCode'][0] == '60': self.log.error('Failed to contact Panda, proxy may have expired') continue #self.log.debug('%s: %s' % (t.id, t.result)) if 'command' in t.result and t.result['command'][0] != "NULL": self.log.info("%s: response: %s" % (t.id,t.result) ) jd={} if changed_pstatus: jd['pandastatus']=pstatus # Make sure heartbeat is ahead of modified time so it is not picked up again if self.sites[t.args['siteName']]['truepilot'] and pstatus == 'starting': # Set theartbeat 1h in the future to allow job to start # running and avoid race conditions with heartbeats # Now heartbeat timeout is 2h so we remove the offset #jd['theartbeat'] = self.dbpanda.getTimeStamp(time.time()+3600) jd['theartbeat'] = self.dbpanda.getTimeStamp(time.time()+1) else: jd['theartbeat'] = self.dbpanda.getTimeStamp(time.time()+1) # If panda tells us to kill the job, set actpandastatus to tobekilled # and remove from heartbeats if 'command' in t.result and ( ("tobekilled" in t.result['command'][0]) or ("badattemptnr" in t.result['command'][0]) ): self.log.info('%s: cancelled by panda' % t.id) jd['actpandastatus']="tobekilled" jd['pandastatus']=None self.dbpanda.updateJob(t.id,jd) self.log.info("Threads finished")
def _arc_submit(self, xrsl, arcces, userconfig, log): '''Check the available CEs and submit''' queuelist = [] for arcce in arcces: (ce_endpoint, ce_queue) = arcce aris = arc.URL(str(ce_endpoint)) ce_host = aris.Host() if aris.Protocol() == 'https': aris.ChangePath('/arex') infoendpoints = [ arc.Endpoint(aris.str(), arc.Endpoint.COMPUTINGINFO, 'org.ogf.glue.emies.resourceinfo') ] else: aris = 'ldap://' + aris.Host() + '/mds-vo-name=local,o=grid' infoendpoints = [ arc.Endpoint(aris, arc.Endpoint.COMPUTINGINFO, 'org.nordugrid.ldapng') ] # retriever contains a list of CE endpoints retriever = arc.ComputingServiceRetriever(userconfig, infoendpoints) retriever.wait() # targets is the list of queues # parse target.ComputingService.ID for the CE hostname # target.ComputingShare.Name is the queue name targets = retriever.GetExecutionTargets() # Filter only sites for this process for target in targets: if not target.ComputingService.ID: log.info( "Target {0} does not have ComputingService ID defined, skipping" .format(target.ComputingService.Name)) continue # If EMI-ES infoendpoint, force EMI-ES submission if infoendpoints[0].InterfaceName == 'org.ogf.glue.emies.resourceinfo' \ and target.ComputingEndpoint.InterfaceName != 'org.ogf.glue.emies.activitycreation': log.debug( "Rejecting target interface {0} because not EMI-ES". format(target.ComputingEndpoint.InterfaceName)) continue # Check for matching host and queue targethost = re.sub( ':arex$', '', re.sub('urn:ogf:ComputingService:', '', target.ComputingService.ID)) targetqueue = target.ComputingShare.Name if targethost != ce_host: log.debug( 'Rejecting target host {0} as it does not match {1}'. format(targethost, ce_host)) continue if targetqueue != ce_queue: log.debug( 'Rejecting target queue {0} as it does not match {1}'. format(targetqueue, ce_queue)) continue queuelist.append(target) log.debug("Adding target {0}:{1}".format( targethost, targetqueue)) # check if any queues are available, if not leave and try again next time if not queuelist: raise Exception("No free queues available") log.debug("preparing submission") jobdescs = arc.JobDescriptionList() if not arc.JobDescription_Parse(str(xrsl), jobdescs): raise Exception("Failed to prepare job description") # Run the submission in a separate thread thr = SubmitThr(queuelist, jobdescs, userconfig) return self._run_submit(thr)
def submit(self): """ Main function to submit jobs. """ global queuelist # check for stopsubmission flag if self.conf.get(['downtime', 'stopsubmission']) == "true": self.log.info('Submission suspended due to downtime') return 0 # Get cluster host and queue: cluster/queue clusterhost = clusterqueue = None if self.cluster: cluster = self.cluster if cluster.find('://') == -1: cluster = 'gsiftp://' + cluster clusterurl = arc.URL(cluster) clusterhost = clusterurl.Host() clusterqueue = clusterurl.Path()[1:] # strip off leading slash # Apply fair-share if self.cluster: fairshares = self.db.getArcJobsInfo( "arcstate='tosubmit' and clusterlist like '%" + self.cluster + "%'", ['fairshare']) else: fairshares = self.db.getArcJobsInfo( "arcstate='tosubmit' and clusterlist=''", ['fairshare']) if not fairshares: self.log.info('Nothing to submit') return 0 fairshares = list(set([p['fairshare'] for p in fairshares])) # For EMI-ES proxy bug - see below shuffle(fairshares) count = 0 for fairshare in fairshares: try: # catch any exceptions here to avoid leaving lock if self.cluster: # Lock row for update in case multiple clusters are specified #jobs=self.db.getArcJobsInfo("arcstate='tosubmit' and ( clusterlist like '%{0}' or clusterlist like '%{0},%' ) and fairshare='{1}' order by priority desc limit 10".format(self.cluster, fairshare), jobs = self.db.getArcJobsInfo( "arcstate='tosubmit' and ( clusterlist like '%{0}' or clusterlist like '%{0},%' ) and fairshare='{1}' limit 10" .format(self.cluster, fairshare), columns=[ "id", "jobdesc", "appjobid", "priority", "proxyid" ], lock=True) if jobs: self.log.debug("started lock for writing %d jobs" % len(jobs)) else: jobs = self.db.getArcJobsInfo( "arcstate='tosubmit' and clusterlist='' and fairshare='{0}' limit 10" .format(fairshare), columns=["id", "jobdesc", "appjobid", "priority"]) # mark submitting in db jobs_taken = [] for j in jobs: jd = { 'cluster': self.cluster, 'arcstate': 'submitting', 'tarcstate': self.db.getTimeStamp() } self.db.updateArcJobLazy(j['id'], jd) jobs_taken.append(j) jobs = jobs_taken finally: if self.cluster: try: self.db.Commit(lock=True) self.log.debug("ended lock") except: self.log.warning("Failed to release DB lock") else: self.db.Commit() if len(jobs) == 0: #self.log.debug("No jobs to submit") continue self.log.info("Submitting %d jobs for fairshare %s" % (len(jobs), fairshare)) # max waiting priority try: maxpriowaiting = max(jobs, key=lambda x: x['priority'])['priority'] except: maxpriowaiting = 0 self.log.info("Maximum priority of waiting jobs: %d" % maxpriowaiting) # Query infosys - either local or index if self.cluster: if self.cluster.find('://') != -1: aris = arc.URL(self.cluster) else: aris = arc.URL('gsiftp://%s' % self.cluster) if aris.Protocol() == 'https': aris.ChangePath('/arex') infoendpoints = [ arc.Endpoint(aris.str(), arc.Endpoint.COMPUTINGINFO, 'org.ogf.glue.emies.resourceinfo') ] elif aris.Protocol() == 'local': infoendpoints = [ arc.Endpoint(aris.str(), arc.Endpoint.COMPUTINGINFO, 'org.nordugrid.local') ] else: aris = 'ldap://' + aris.Host( ) + '/mds-vo-name=local,o=grid' infoendpoints = [ arc.Endpoint(aris, arc.Endpoint.COMPUTINGINFO, 'org.nordugrid.ldapng') ] else: giises = self.conf.getList(['atlasgiis', 'item']) infoendpoints = [] for g in giises: # Specify explicitly EGIIS infoendpoints.append( arc.Endpoint(str(g), arc.Endpoint.REGISTRY, "org.nordugrid.ldapegiis")) # Set UserConfig credential for each proxy. 
Assumes that any proxy # in the fairshare can query the CE infosys self.uc.CredentialString(self.db.getProxy(jobs[0]['proxyid'])) # retriever contains a list of CE endpoints retriever = arc.ComputingServiceRetriever(self.uc, infoendpoints) retriever.wait() # targets is the list of queues # parse target.ComputingService.ID for the CE hostname # target.ComputingShare.Name is the queue name targets = retriever.GetExecutionTargets() # Filter only sites for this process queuelist = [] for target in targets: if not target.ComputingService.ID: self.log.info( "Target %s does not have ComputingService ID defined, skipping" % target.ComputingService.Name) continue # If EMI-ES infoendpoint, force EMI-ES submission if infoendpoints[ 0].InterfaceName == 'org.ogf.glue.emies.resourceinfo' and target.ComputingEndpoint.InterfaceName != 'org.ogf.glue.emies.activitycreation': self.log.debug( "Rejecting target interface %s because not EMI-ES" % target.ComputingEndpoint.InterfaceName) continue # Check for matching host and queue targethost = re.sub( ':arex$', '', re.sub('urn:ogf:ComputingService:', '', target.ComputingService.ID)) targetqueue = target.ComputingShare.Name if clusterhost and targethost != clusterhost: self.log.debug( 'Rejecting target host %s as it does not match %s' % (targethost, clusterhost)) continue if clusterqueue and targetqueue != clusterqueue: self.log.debug( 'Rejecting target queue %s as it does not match %s' % (targetqueue, clusterqueue)) continue if targetqueue in self.conf.getList(['queuesreject', 'item']): self.log.debug( 'Rejecting target queue %s in queuesreject list' % targetqueue) continue elif targethost in self.conf.getList( ['clustersreject', 'item']): self.log.debug( 'Rejecting target host %s in clustersreject list' % targethost) continue else: # tmp hack target.ComputingShare.LocalWaitingJobs = 0 target.ComputingShare.PreLRMSWaitingJobs = 0 target.ExecutionEnvironment.CPUClockSpeed = 2000 qjobs = self.db.getArcJobsInfo( "cluster='" + str(self.cluster) + "' and arcstate='submitted' and fairshare='%s'" % fairshare, ['id', 'priority']) rjobs = self.db.getArcJobsInfo( "cluster='" + str(self.cluster) + "' and arcstate='running' and fairshare='%s'" % fairshare, ['id']) # max queued priority try: maxprioqueued = max( qjobs, key=lambda x: x['priority'])['priority'] except: maxprioqueued = 0 self.log.info("Max priority queued: %d" % maxprioqueued) # Set number of submitted jobs to running * 0.15 + 400/num of shares # Note: assumes only a few shares are used jlimit = len(rjobs) * 0.15 + 100 / len(fairshares) if str(self.cluster).find('arc-boinc-0') != -1: jlimit = len(rjobs) * 0.15 + 400 if str(self.cluster).find('XXXpikolit') != -1: jlimit = len(rjobs) * 0.15 + 100 if str(self.cluster).find('arc05.lcg') != -1: jlimit = len(rjobs) * 0.15 + 400 target.ComputingShare.PreLRMSWaitingJobs = len(qjobs) if len(qjobs) < jlimit or ((maxpriowaiting > maxprioqueued) and (maxpriowaiting > 10)): if maxpriowaiting > maxprioqueued: self.log.info( "Overriding limit, maxpriowaiting: %d > maxprioqueued: %d" % (maxpriowaiting, maxprioqueued)) queuelist.append(target) self.log.debug("Adding target %s:%s" % (targethost, targetqueue)) else: self.log.info( "%s/%s already at limit of submitted jobs for fairshare %s" % (targethost, targetqueue, fairshare)) # check if any queues are available, if not leave and try again next time if not queuelist: self.log.info("No free queues available") self.db.Commit() # EMI-ES proxy problem - see bug 3685 if self.cluster and self.cluster.startswith('https://'): raise 
ExceptInterrupt(15) continue self.log.info("start submitting") # Just run one thread for each job in sequence. Strange things happen # when trying to create a new UserConfig object for each thread. for j in jobs: self.log.debug("%s: preparing submission" % j['appjobid']) jobdescstr = str( self.db.getArcJobDescription(str(j['jobdesc']))) jobdescs = arc.JobDescriptionList() if not jobdescstr or not arc.JobDescription_Parse( jobdescstr, jobdescs): self.log.error("%s: Failed to prepare job description" % j['appjobid']) continue # TODO: might not work if proxies are different within a share # since same uc object is shared among threads self.uc.CredentialString(self.db.getProxy(j['proxyid'])) t = SubmitThr(Submit, j['id'], j['appjobid'], jobdescs, self.uc, self.log) self.RunThreadsSplit([t], 1) count = count + 1 self.log.info("threads finished") # commit transaction to release row locks self.db.Commit() # EMI-ES proxy problem - see bug 3685 if self.cluster and self.cluster.startswith('https://'): raise ExceptInterrupt(15) self.log.info("end submitting") return count
def updatePandaHeartbeatBulk(self,pstatus): """ Heartbeat status updates in bulk. """ columns = ['pandaid', 'siteName', 'startTime', 'computingElement', 'node', 'corecount', 'eventranges'] jobs=self.dbpanda.getJobs("pandastatus='"+pstatus+"' and sendhb=1 and ("+self.dbpanda.timeStampLessThan("theartbeat", self.conf.get(['panda','heartbeattime']))+" or modified > theartbeat) limit 1000", columns) #jobs=self.dbpanda.getJobs("pandastatus='"+pstatus+"' and sendhb=1 and ("+self.dbpanda.timeStampLessThan("theartbeat", 60)+" or modified > theartbeat) limit 1000", columns) if not jobs: return self.log.info("Update heartbeat for %d jobs in state %s (%s)" % (len(jobs), pstatus, ','.join([str(j['pandaid']) for j in jobs]))) changed_pstatus = False if pstatus == 'sent': pstatus = 'starting' changed_pstatus = True tlist=[] jobsbyproxy = {} for j in jobs: # Don't send transferring heartbeat for ES jobs, they must be in running while events are updated if pstatus == 'transferring' and j['eventranges']: pstatus = 'running' jd = {'jobId': j['pandaid'], 'state': pstatus} if pstatus != 'starting': jd['startTime'] = j['startTime'] if j['computingElement']: if j['computingElement'].find('://') != -1: # this if is only needed during transition period jd['computingElement'] = arc.URL(str(j['computingElement'])).Host() else: jd['computingElement'] = j['computingElement'] jd['node'] = j['node'] jd['siteName'] = j['siteName'] # For starting truepilot jobs send pilotID with expected log # location so logs are available in case of lost heartbeat if pstatus == 'starting' and not changed_pstatus and self.sites[j['siteName']]['truepilot']: date = time.strftime('%Y-%m-%d', time.gmtime()) logurl = '/'.join([self.conf.get(["joblog","urlprefix"]), date, j['siteName'], '%s.out' % j['pandaid']]) jd['pilotID'] = '%s|Unknown|Unknown|Unknown|Unknown' % logurl try: corecount = int(j['corecount']) if j['corecount'] > 0 else self.sites[j['siteName']]['corecount'] jd['jobMetrics'] = "coreCount=%d" % corecount jd['coreCount'] = corecount except: self.log.warning('%s: no corecount available' % j['pandaid']) try: jobsbyproxy[self.sites[j['siteName']]['type']].append(jd) except: jobsbyproxy[self.sites[j['siteName']]['type']] = [jd] for sitetype, jobs in jobsbyproxy.items(): t = PandaBulkThr(self.pandas.get(sitetype, self.pandas.get('production')).updateStatuses, [j['jobId'] for j in jobs], jobs) tlist.append(t) aCTUtils.RunThreadsSplit(tlist, self.nthreads) for t in tlist: if not t or not t.result or not t.result[0]: # Strange response from panda, try later continue for pandaid, response in zip(t.ids, t.result[1]): try: result = cgi.parse_qs(response) except Exception: self.log.error('Could not parse result from panda: %s' % response) continue if not result.get('StatusCode'): # Strange response from panda, try later continue if result['StatusCode'][0] == '60': self.log.error('Failed to contact Panda, proxy may have expired') continue if result.get('command', [''])[0] not in ['', "NULL"]: self.log.info("%s: response: %s" % (pandaid, result)) jd = {} if changed_pstatus: jd['pandastatus'] = pstatus # Make sure heartbeat is ahead of modified time so it is not picked up again jd['theartbeat'] = self.dbpanda.getTimeStamp(time.time()+1) # If panda tells us to kill the job, set actpandastatus to tobekilled # and remove from heartbeats if result.get('command', [''])[0] in ["tobekilled", "badattemptnr", "alreadydone"]: self.log.info('%s: cancelled by panda' % pandaid) jd['actpandastatus'] = "tobekilled" jd['pandastatus'] = None 
self.dbpanda.updateJob(pandaid, jd) self.log.info("Threads finished")
def _download_outputs(self, files, logdir, jobid, pandaid, userconfig, log): '''Download the output files specified in downloadfiles''' # construct datapoint object, initialising connection. Use the same # object until base URL changes. TODO group by base URL. datapoint = arc_utils.DataPoint(str(jobid), userconfig) dp = datapoint.h dm = arc.DataMover() dm.retry(False) dm.passive(True) dm.secure(False) fetched = [] notfetched = [] notfetchedretry = [] # create required local log dirs try: os.makedirs(logdir, 0755) except OSError as e: if e.errno != errno.EEXIST or not os.path.isdir(logdir): log.warning('Failed to create directory {0}: {1}'.format( logdir, os.strerror(e.errno))) notfetched.append(jobid) return (fetched, notfetched, notfetchedretry) tmpdldir = os.path.join(self.tmpdir, pandaid) try: os.makedirs(tmpdldir, 0755) except OSError as e: if e.errno != errno.EEXIST or not os.path.isdir(tmpdldir): log.warning('Failed to create directory {0}: {1}'.format( tmpdldir, os.strerror(e.errno))) notfetched.append(jobid) return (fetched, notfetched, notfetchedretry) filelist = files.split(';') if re.search(r'[\*\[\]\?]', files): # found wildcard, need to get sessiondir list remotefiles = self.listUrlRecursive(jobid, log) expandedfiles = [] for wcf in filelist: if re.search(r'[\*\[\]\?]', wcf): # only match wildcards in matching dirs expandedfiles += [ rf for rf in remotefiles if fnmatch.fnmatch(rf, wcf) and os.path.dirname(rf) == os.path.dirname(wcf) ] else: expandedfiles.append(wcf) # remove duplicates from wildcard matching through set filelist = list(set(expandedfiles)) for f in filelist: if f == 'gmlog/errors': localfile = os.path.join(logdir, '%s.log' % pandaid) elif f.find('.log') != -1: localfile = os.path.join(logdir, '%s.out' % pandaid) else: localfile = os.path.join(tmpdldir, f) remotefile = arc.URL(str(jobid + '/' + f)) dp.SetURL(remotefile) localdp = arc_utils.DataPoint(str(localfile), userconfig) # do the copy status = dm.Transfer(dp, localdp.h, arc.FileCache(), arc.URLMap()) if not status and str(status).find( 'File unavailable' ) == -1: # tmp fix for globus error which is always retried if status.Retryable(): log.warning( 'Failed to download but will retry {0}: {1}'.format( dp.GetURL().str(), str(status))) notfetchedretry.append(jobid) else: log.error( 'Failed to download with permanent failure {0}: {1}'. format(dp.GetURL().str(), str(status))) notfetched.append(jobid) else: os.chmod(localfile, 0644) log.info('Downloaded {0}'.format(dp.GetURL().str())) if jobid not in notfetched and jobid not in notfetchedretry: fetched.append(jobid) return (fetched, notfetched, notfetchedretry)
def copyFinishedFiles(self, arcjobid, extractmetadata): """ - if extractmetadata: (normal arc jobs, not true pilot jobs) - store heartbeat file under tmp/pickle or under harvester access point if specified - copy .job.log file to jobs/date/pandaqueue/pandaid.out - copy gmlog errors to jobs/date/pandaqueue/pandaid.log """ columns = ['JobID', 'appjobid', 'cluster', 'UsedTotalWallTime', 'arcjobs.EndTime', 'ExecutionNode', 'stdout', 'fairshare', 'pandajobs.created', 'metadata'] select = "arcjobs.id=%d AND arcjobs.id=pandajobs.arcjobid" % arcjobid aj = self.dbarc.getArcJobsInfo(select, columns=columns, tables='arcjobs,pandajobs') if not aj or 'JobID' not in aj[0] or not aj[0]['JobID']: self.log.error('No JobID in arcjob %s: %s'%(str(arcjobid), str(aj))) return False aj = aj[0] jobid = aj['JobID'] sessionid = jobid[jobid.rfind('/')+1:] date = aj['created'].strftime('%Y-%m-%d') if extractmetadata: try: jobinfo = aCTPandaJob(filename=os.path.join(self.tmpdir, sessionid, 'heartbeat.json')) except Exception as x: self.log.error("%s: failed to load heartbeat file for arcjob %s: %s" %(aj['appjobid'], jobid, x)) jobinfo = aCTPandaJob(jobinfo={'jobId': aj['appjobid'], 'state': 'finished'}) # update heartbeat and dump to tmp/heartbeats jobinfo.computingElement = arc.URL(str(aj['cluster'])).Host() if hasattr(jobinfo, 'startTime') and hasattr(jobinfo, 'endTime'): # take values from the pilot jobinfo.startTime = datetime.datetime.utcfromtimestamp(jobinfo.startTime).strftime('%Y-%m-%d %H:%M:%S') jobinfo.endTime = datetime.datetime.utcfromtimestamp(jobinfo.endTime).strftime('%Y-%m-%d %H:%M:%S') else: # Use ARC values if aj['EndTime']: # datetime cannot be serialised to json so use string (for harvester) jobinfo.startTime = (aj['EndTime'] - datetime.timedelta(0, aj['UsedTotalWallTime'])).strftime('%Y-%m-%d %H:%M:%S') jobinfo.endTime = aj['EndTime'].strftime('%Y-%m-%d %H:%M:%S') # Sanity check for efficiency > 100% cputimepercore = getattr(jobinfo, 'cpuConsumptionTime', 0) / getattr(jobinfo, 'coreCount', 1) if aj['UsedTotalWallTime'] < cputimepercore: self.log.warning('%s: Adjusting reported walltime %d to CPU time %d' % (aj['appjobid'], aj['UsedTotalWallTime'], cputimepercore)) jobinfo.startTime = (aj['EndTime'] - datetime.timedelta(0, cputimepercore)).strftime('%Y-%m-%d %H:%M:%S') else: self.log.warning('%s: no endtime found' % aj['appjobid']) if len(aj["ExecutionNode"]) > 255: jobinfo.node = aj["ExecutionNode"][:254] self.log.warning("%s: Truncating wn hostname from %s to %s" % (aj['appjobid'], aj['ExecutionNode'], jobinfo.node)) else: jobinfo.node = aj["ExecutionNode"] try: smeta = json.loads(aj['metadata'].decode()) except: smeta = None if smeta and smeta.get('harvesteraccesspoint'): # de-serialise the metadata to json try: jobinfo.metaData = json.loads(jobinfo.metaData) except Exception as e: self.log.warning("%s: no metaData in pilot metadata: %s" % (aj['appjobid'], str(e))) jobinfo.writeToFile(os.path.join(smeta['harvesteraccesspoint'], 'jobReport.json')) else: jobinfo.writeToFile(os.path.join(self.tmpdir, "heartbeats", "%s.json" % aj['appjobid'])) # copy to joblog dir files downloaded for the job: gmlog errors and pilot log outd = os.path.join(self.conf.get(['joblog','dir']), date, aj['fairshare']) try: os.makedirs(outd, 0o755) except: pass localdir = os.path.join(self.tmpdir, sessionid) gmlogerrors = os.path.join(localdir, "gmlog", "errors") arcjoblog = os.path.join(outd, "%s.log" % aj['appjobid']) if not os.path.exists(arcjoblog): try: shutil.move(gmlogerrors, arcjoblog) os.chmod(arcjoblog, 0o644) 
except: self.log.error("Failed to copy %s" % gmlogerrors) pilotlog = aj['stdout'] if not pilotlog and os.path.exists(localdir): pilotlogs = [f for f in os.listdir(localdir)] for f in pilotlogs: if f.find('.log'): pilotlog = f if pilotlog: try: shutil.move(os.path.join(localdir, pilotlog), os.path.join(outd, '%s.out' % aj['appjobid'])) os.chmod(os.path.join(outd, '%s.out' % aj['appjobid']), 0o644) except Exception as e: self.log.error("Failed to copy file %s: %s" % (os.path.join(localdir,pilotlog), str(e))) return False return True
def process(self, inmsg, outmsg): logger.msg(arc.DEBUG, "EchoService (python) 'Process' called") # time.sleep(10) # get the payload from the message inpayload = inmsg.Payload() logger.msg( arc.VERBOSE, 'inmsg.Auth().Export(arc.SecAttr.ARCAuth) = %s' % inmsg.Auth().Export(arc.SecAttr.ARCAuth).GetXML()) logger.msg( arc.VERBOSE, 'inmsg.Attributes().getAll() = %s ' % inmsg.Attributes().getAll()) logger.msg(arc.INFO, "EchoService (python) got: %s " % inpayload.GetXML()) # the first child of the payload should be the name of the request request_node = inpayload.Child() # get the namespace request_namespace = request_node.Namespace() logger.msg( arc.DEBUG, "EchoService (python) request_namespace: %s" % request_namespace) if request_namespace != echo_ns: if request_namespace == wsrf_rp_ns: outpayload = arc.PayloadSOAP(arc.NS({'wsrf-rp': wsrf_rp_ns})) outpayload.NewChild( 'wsrf-rp:GetResourcePropertyDocumentResponse').NewChild( self.GetLocalInformation()) outmsg.Payload(outpayload) logger.msg(arc.DEBUG, "outpayload %s" % outpayload.GetXML()) return arc.MCC_Status(arc.STATUS_OK) raise Exception('wrong namespace. expected: %s' % echo_ns) # get the name of the request without the namespace prefix # this is the name of the Body node's first child request_name = request_node.Name() # create an answer payload ns = arc.NS({'echo': echo_ns}) outpayload = arc.PayloadSOAP(ns) # here we defined that 'echo' prefix will be the namespace prefix of 'http://www.nordugrid.org/schemas/echo' # get the message say = str(request_node.Get('say')) # put it between the response-prefix and the response-suffix hear = self.prefix + say + self.suffix if request_name == 'double': # if the name of the request is 'double' # we create a new echo message which we send to http://localhost:60000/Echo using the ClientSOAP object cfg = arc.MCCConfig() ssl = False if self.ssl_config: cfg.AddCertificate(self.ssl_config.get('cert_file', None)) cfg.AddPrivateKey(self.ssl_config.get('key_file', None)) if 'ca_file' in self.ssl_config: cfg.AddCAFile(self.ssl_config.get('ca_file', None)) else: cfg.AddCADir(self.ssl_config.get('ca_dir', None)) ssl = True if ssl: url = arc.URL('https://localhost:60000/Echo') logger.msg( arc.DEBUG, 'Calling https://localhost:60000/Echo using ClientSOAP') else: url = arc.URL('http://localhost:60000/Echo') logger.msg( arc.DEBUG, 'Calling http://localhost:60000/Echo using ClientSOAP') # creating the ClientSOAP object s = arc.ClientSOAP(cfg, url) new_payload = arc.PayloadSOAP(ns) # creating the message new_payload.NewChild('echo:echo').NewChild('echo:say').Set(hear) logger.msg(arc.DEBUG, 'new_payload %s' % new_payload.GetXML()) # sending the message resp, status = s.process(new_payload) # get the response hear = str(resp.Get('echoResponse').Get('hear')) elif request_name == 'httplib': # if the name of the request is 'httplib' # we create a new echo message which we send to http://localhost:60000/echo using python's built-in http client try: import http.client as httplib except ImportError: import httplib logger.msg(arc.DEBUG, 'Calling http://localhost:60000/Echo using httplib') # create the connection h = httplib.HTTPConnection('localhost', 60000) new_payload = arc.PayloadSOAP(ns) # create the message new_payload.NewChild('echo:echo').NewChild('echo:say').Set(hear) logger.msg(arc.DEBUG, 'new_payload %s' % new_payload.GetXML()) # send the message h.request('POST', '/Echo', new_payload.GetXML()) r = h.getresponse() response = r.read() logger.msg(arc.DEBUG, response) resp = arc.XMLNode(response) # get the response hear 
= str(resp.Child().Get('echoResponse').Get('hear')) elif request_name == 'wait': logger.msg(arc.DEBUG, 'Start waiting 10 sec...') time.sleep(10) logger.msg(arc.DEBUG, 'Waiting ends.') # we create a node at '/echo:echoResponse/echo:hear' and put the string in it outpayload.NewChild('echo:echoResponse').NewChild('echo:hear').Set( hear) outmsg.Payload(outpayload) logger.msg(arc.DEBUG, "outpayload %s" % outpayload.GetXML()) # return with STATUS_OK return arc.MCC_Status(arc.STATUS_OK)
def getJobOutput(self, jobID, workingDirectory=None): """Get the specified job standard output and error files. Standard output and error are returned as strings. If further outputs are retrieved, they are stored in workingDirectory. """ result = self._prepareProxy() if not result["OK"]: self.log.error("ARCComputingElement: failed to set up proxy", result["Message"]) return result self.usercfg.ProxyPath(os.environ["X509_USER_PROXY"]) if jobID.find(":::") != -1: pilotRef, stamp = jobID.split(":::") else: pilotRef = jobID stamp = "" if not stamp: return S_ERROR("Pilot stamp not defined for %s" % pilotRef) job = self._getARCJob(pilotRef) arcID = os.path.basename(pilotRef) self.log.debug("Retrieving pilot logs for %s" % pilotRef) if not workingDirectory: if "WorkingDirectory" in self.ceParameters: workingDirectory = os.path.join( self.ceParameters["WorkingDirectory"], arcID) else: workingDirectory = arcID outFileName = os.path.join(workingDirectory, "%s.out" % stamp) errFileName = os.path.join(workingDirectory, "%s.err" % stamp) self.log.debug("Working directory for pilot output %s" % workingDirectory) # Retrieve the job output: # last parameter allows downloading the outputs even if workingDirectory already exists isItOkay = job.Retrieve(self.usercfg, arc.URL(str(workingDirectory)), True) if isItOkay: output = None error = None try: with open(outFileName, "r") as outFile: output = outFile.read() os.unlink(outFileName) with open(errFileName, "r") as errFile: error = errFile.read() os.unlink(errFileName) except IOError as e: self.log.error("Error downloading outputs", repr(e).replace(",)", ")")) return S_ERROR("Error downloading outputs") self.log.debug("Pilot output = %s" % output) self.log.debug("Pilot error = %s" % error) else: job.Update() arcState = job.State.GetGeneralState() if arcState != "Undefined": return S_ERROR( "Failed to retrieve output for %s as job is not finished (maybe not started yet)" % jobID) self.log.debug( "Could not retrieve pilot output for %s - either permission / proxy error or could not connect to CE" % pilotRef) return S_ERROR("Failed to retrieve output for %s" % jobID) return S_OK((output, error))
#! /usr/bin/env python
from __future__ import print_function

import arc
import sys

root_logger = arc.Logger_getRootLogger()
root_logger.addDestination(arc.LogStream(sys.stdout))
root_logger.setThreshold(arc.ERROR)

if len(sys.argv) < 2:
    print("Usage: echo_client.py URL [message]")
    print("  echo_client gets the credentials from the default user config file")
    sys.exit(-1)

url = arc.URL(sys.argv[1])

try:
    message = sys.argv[2]
except IndexError:
    message = 'hi!'

cfg = arc.MCCConfig()
uc = arc.UserConfig('')
uc.ApplyToConfig(cfg)

s = arc.ClientSOAP(cfg, url)

outpayload = arc.PayloadSOAP(arc.NS('echo', 'http://www.nordugrid.org/schemas/echo'))
outpayload.NewChild('echo:echo').NewChild('echo:say').Set(message)

resp, status = s.process(outpayload)

print(resp.GetXML(True))
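If only the echoed string is wanted rather than the full SOAP answer, the response can be unpacked the same way the service's 'double' branch does it; a small sketch that could be appended to the script above, assuming the echoResponse/hear structure produced by the echo service:

# Sketch: pull just the echoed string out of 'resp' from the script above.
hear = str(resp.Get('echoResponse').Get('hear'))
print(hear)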
def fetchSome(self, jobs, downloadfiles):
    # Get specified files for the jobs in downloadfiles
    # jobs: id: Job object
    # downloadfiles: id: list of files relative to session dir, with wildcards
    if not jobs or not downloadfiles:
        return ([], [], [])

    # construct datapoint object, initialising connection. Use the same
    # object until base URL changes. TODO group by base URL.
    datapoint = aCTUtils.DataPoint(list(jobs.values())[0].JobID, self.uc)
    dp = datapoint.h
    dm = arc.DataMover()
    dm.retry(False)
    dm.passive(True)
    dm.secure(False)
    fetched = []
    notfetched = []
    notfetchedretry = []

    for (id, job) in jobs.items():
        if id not in downloadfiles:
            continue
        jobid = job.JobID

        # If connection URL is different reconnect
        if arc.URL(jobid).ConnectionURL() != dp:
            datapoint = aCTUtils.DataPoint(jobid, self.uc)
            dp = datapoint.h
        localdir = str(self.conf.get(['tmp', 'dir'])) + jobid[jobid.rfind('/'):] + '/'

        files = downloadfiles[id].split(';')
        if re.search(r'[\*\[\]\?]', downloadfiles[id]):
            # found wildcard, need to get sessiondir list
            remotefiles = self.listUrlRecursive(jobid)
            expandedfiles = []
            for wcf in files:
                if re.search(r'[\*\[\]\?]', wcf):
                    # only match wildcards in matching dirs
                    expandedfiles += [rf for rf in remotefiles
                                      if fnmatch.fnmatch(rf, wcf)
                                      and os.path.dirname(rf) == os.path.dirname(wcf)]
                else:
                    expandedfiles.append(wcf)
            # remove duplicates from wildcard matching through set
            files = list(set(expandedfiles))

        for f in files:
            localfile = str(localdir + f)
            localfiledir = localfile[:localfile.rfind('/')]
            # create required local dirs
            try:
                os.makedirs(localfiledir, 0o755)
            except OSError as e:
                if e.errno != errno.EEXIST or not os.path.isdir(localfiledir):
                    self.log.warning('Failed to create directory %s: %s',
                                     localfiledir, os.strerror(e.errno))
                    notfetched.append(jobid)
                    break
            remotefile = arc.URL(str(jobid + '/' + f))
            dp.SetURL(remotefile)
            localdp = aCTUtils.DataPoint(localfile, self.uc)
            # do the copy
            status = dm.Transfer(dp, localdp.h, arc.FileCache(), arc.URLMap())
            if not status and str(status).find('File unavailable') == -1:  # tmp fix for globus error which is always retried
                if status.Retryable():
                    self.log.warning('Failed to download but will retry %s: %s',
                                     dp.GetURL().str(), str(status))
                    notfetchedretry.append(jobid)
                else:
                    self.log.error('Failed to download with permanent failure %s: %s',
                                   dp.GetURL().str(), str(status))
                    notfetched.append(jobid)
                break
            self.log.info('Downloaded %s', dp.GetURL().str())
        if jobid not in notfetched and jobid not in notfetchedretry:
            fetched.append(jobid)

    return (fetched, notfetched, notfetchedretry)
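A hedged sketch of the inputs fetchSome expects (the ids, job object and file patterns below are illustrative assumptions): both dictionaries are keyed by the internal aCT job id, and the download list is a ';'-separated string of paths relative to the session directory, possibly containing wildcards.

# Hypothetical inputs for fetchSome; the job ID and file list are made up.
import arc

job = arc.Job()
job.JobID = "https://ce.example.org:443/arex/abcdef123456"
jobs = {42: job}
downloadfiles = {42: "stdout;gmlog/errors;*.root"}
# With an aCT fetcher instance at hand (assumed, not shown here):
# fetched, notfetched, notfetchedretry = fetcher.fetchSome(jobs, downloadfiles)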
def processFailed(self, arcjobs):
    """
    Process jobs failed for other reasons than athena
    (log_extracts was not created by pilot)
    """
    if not arcjobs:
        return

    self.log.info("processing %d failed jobs" % len(arcjobs))
    for aj in arcjobs:
        jobid = aj['JobID']
        if not jobid:
            # Job was not even submitted, there is no more information
            self.log.warning("%s: Job has not been submitted yet so no information to report",
                             aj['appjobid'])
            continue
        cluster = arc.URL(str(jobid)).Host()
        sessionid = jobid[jobid.rfind('/') + 1:]
        date = time.strftime('%Y%m%d')
        outd = os.path.join(self.conf.get(['joblog', 'dir']), date, cluster, sessionid)
        # Make sure the path up to outd exists
        try:
            os.makedirs(os.path.dirname(outd), 0o755)
        except OSError:
            pass
        try:
            shutil.rmtree(outd)
        except OSError:
            pass
        # copy from tmp to outd. tmp dir will be cleaned in validator
        localdir = os.path.join(self.arcconf.get(['tmp', 'dir']), sessionid)
        try:
            shutil.copytree(localdir, outd)
        except (OSError, shutil.Error) as e:
            self.log.warning("%s: Failed to copy job output for %s: %s" %
                             (aj['appjobid'], jobid, str(e)))
            # Sometimes fetcher fails to get output, so just make empty dir
            try:
                os.makedirs(outd, 0o755)
            except OSError as e:
                self.log.warning("%s: Failed to create %s: %s. Job logs will be missing" %
                                 (aj['appjobid'], outd, str(e)))

        # set right permissions
        aCTUtils.setFilePermissionsRecursive(outd)

        # set update, pickle from pilot is not available
        # some values might not be properly set
        # TODO synchronize error codes with the rest of production
        pupdate = aCTPandaJob()
        pupdate.jobId = aj['appjobid']
        pupdate.state = 'failed'
        pupdate.siteName = aj['siteName']
        pupdate.computingElement = cluster
        pupdate.schedulerID = self.conf.get(['panda', 'schedulerid'])
        pupdate.pilotID = self.conf.get(["joblog", "urlprefix"]) + "/" + date + "/" + cluster + '/' + sessionid + "|Unknown|Unknown|Unknown|Unknown"
        if len(aj["ExecutionNode"]) > 255:
            pupdate.node = aj["ExecutionNode"][:254]
            self.log.warning("%s: Truncating wn hostname from %s to %s" %
                             (aj['pandaid'], aj['ExecutionNode'], pupdate.node))
        else:
            pupdate.node = aj["ExecutionNode"]
        pupdate.pilotLog = self.createPilotLog(outd, aj['pandaid'])
        pupdate.cpuConsumptionTime = aj['UsedTotalCPUTime']
        pupdate.cpuConsumptionUnit = 'seconds'
        pupdate.cpuConversionFactor = 1
        pupdate.pilotTiming = "0|0|%s|0" % aj['UsedTotalWallTime']
        pupdate.exeErrorCode = aj['ExitCode']
        pupdate.exeErrorDiag = aj['Error']
        pupdate.pilotErrorCode = 1008
        codes = []
        codes.append("Job timeout")
        codes.append("qmaster enforced h_rt limit")
        codes.append("job killed: wall")
        codes.append("Job exceeded time limit")
        if [errcode for errcode in codes if re.search(errcode, aj['Error'])]:
            pupdate.pilotErrorCode = 1213
        codes = []
        codes.append("Job probably exceeded memory limit")
        codes.append("job killed: vmem")
        codes.append("pvmem exceeded")
        if [errcode for errcode in codes if re.search(errcode, aj['Error'])]:
            pupdate.pilotErrorCode = 1212
        pupdate.pilotErrorDiag = aj['Error']
        # set start/endtime
        pupdate.startTime = self.getStartTime(aj['EndTime'], aj['UsedTotalWallTime'])
        pupdate.endTime = aj['EndTime']
        # save the pickle file to be used by aCTAutopilot panda update
        try:
            picklefile = os.path.join(self.arcconf.get(['tmp', 'dir']), "pickle",
                                      str(aj['pandaid']) + ".pickle")
            pupdate.writeToFile(picklefile)
        except Exception as e:
            self.log.warning("%s: Failed to write file %s: %s" %
                             (aj['appjobid'], picklefile, str(e)))
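The error-code mapping above is easy to lose in the surrounding bookkeeping. A minimal sketch of the same classification, using exactly the message fragments and codes that appear above (1213 for walltime-related failures, 1212 for memory-related ones, 1008 otherwise):

import re

# Sketch of the pilot error classification performed in processFailed above.
def classify_error(message):
    walltime = ["Job timeout", "qmaster enforced h_rt limit",
                "job killed: wall", "Job exceeded time limit"]
    memory = ["Job probably exceeded memory limit", "job killed: vmem",
              "pvmem exceeded"]
    if any(re.search(code, message) for code in walltime):
        return 1213
    if any(re.search(code, message) for code in memory):
        return 1212
    return 1008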
            #lfn = f.getElementsByTagName("logical")[0].getElementsByTagName("lfn")[0].getAttribute("name")
            #guid = str(file.getAttribute('ID'))
            size = ""
            adler32 = ""
            surl = ""
            se = ""
            for m in f.getElementsByTagName("metadata"):
                v = m.getAttribute("att_value")
                if m.getAttribute("att_name") == "fsize":
                    size = v
                if m.getAttribute("att_name") == "adler32":
                    adler32 = v
                # rewrite surl in xml
                if m.getAttribute("att_name") == "surl":
                    surl = v
                    se = arc.URL(str(surl)).Host()
    except Exception as x:
        self.log.error('%s: %s' % (aj['appjobid'], x))
        outp = False

    if outp:
        checksum = "adler32:" + adler32
        if se not in surls:
            surls[se] = []
        surls[se] += [{"surl": surl, "fsize": size, "checksum": checksum,
                       "arcjobid": arcjobid}]