def events_to_update(self, workspec):
    '''Report events processed for harvester to update'''

    # get logger
    arclog = arc_utils.ARCLogger(baselogger, workspec.workerID)
    tmpLog = arclog.log

    job = workspec.workAttributes['arcjob']
    arcid = job['JobID']

    # Set certificate to use for interacting with ARC CE
    usercfg = arc.UserConfig(self.cred_type)
    if not self._setup_proxy(usercfg, workspec, arcid, tmpLog):
        return False

    # Check for jobid/jsonEventsUpdateFileName on CE, rename to .read
    retDict = dict()
    for pandaID in workspec.pandaid_list:

        # first look for json.read which is not yet acknowledged
        accessPoint = self.get_access_point(workspec, pandaID)
        localJsonFilePath = os.path.join(accessPoint, jsonEventsUpdateFileName)
        remoteJsonFilePathRead = '%s/%s%s' % (arcid, jsonEventsUpdateFileName, suffixReadJson)
        tmpLog.debug('looking for event update file {0}'.format(remoteJsonFilePathRead))

        status = self._copy_file(remoteJsonFilePathRead, localJsonFilePath, usercfg, tmpLog)
        if not status:
            if status.GetErrno() != errno.ENOENT:
                tmpLog.warning('Failed checking {0}: {1}'.format(remoteJsonFilePathRead, str(status)))
                continue

            # Look for new json
            remoteJsonFilePath = '%s/%s' % (arcid, jsonEventsUpdateFileName)
            status = self._copy_file(remoteJsonFilePath, localJsonFilePath, usercfg, tmpLog)
            if not status:
                if status.GetErrno() != errno.ENOENT:
                    tmpLog.warning('Failed checking {0}: {1}'.format(remoteJsonFilePath, str(status)))
                else:
                    # not found
                    tmpLog.debug('not found')
                continue

            # Rename to prevent from being overwritten
            # Gridftp does not support renaming so upload .read file and delete old one
            status = self._copy_file(localJsonFilePath, remoteJsonFilePathRead, usercfg, tmpLog)
            if not status:
                tmpLog.warning('Failed copying {0} to {1}: {2}'.format(localJsonFilePath, remoteJsonFilePathRead, str(status)))

            # If rename fails, delete old file anyway
            status = self._delete_file(remoteJsonFilePath, usercfg, tmpLog)
            if not status:
                tmpLog.warning('Failed deleting {0}: {1}'.format(remoteJsonFilePath, str(status)))

        # load json
        nData = 0
        try:
            with open(localJsonFilePath) as jsonFile:
                tmpOrigDict = json.load(jsonFile)
                # change the key from str to int
                for tmpPandaID, tmpDict in tmpOrigDict.iteritems():
                    tmpPandaID = long(tmpPandaID)
                    retDict[tmpPandaID] = tmpDict
                    nData += 1
        except Exception:
            # log before re-raising so the failure is visible in the worker log
            tmpLog.error('failed to load json')
            raise

        # delete local file
        try:
            os.remove(localJsonFilePath)
        except Exception:
            pass

        tmpLog.debug('got {0} events for PandaID={1}'.format(nData, pandaID))

    return retDict
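For reference, the events file handled above is a JSON object keyed by string PandaIDs. Below is a small standalone illustration of the key-conversion step; the payload fields inside the list are illustrative only, not taken from the source.

import json

# Illustrative payload: real content comes from the pilot-produced
# jsonEventsUpdateFileName; the fields inside the list are examples only.
example = '{"4294967296": [{"eventRangeID": "123-1-1-1-1", "eventStatus": "finished"}]}'

retDict = {}
for tmpPandaID, tmpDict in json.loads(example).items():
    # harvester expects integer PandaIDs as dictionary keys
    retDict[int(tmpPandaID)] = tmpDict

print(retDict)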
import arc
import sys

# Set up logging to stdout with level VERBOSE (a lot of output will be shown)
logstdout = arc.LogStream(sys.stdout)
logstdout.setFormat(arc.ShortFormat)
arc.Logger_getRootLogger().addDestination(logstdout)
arc.Logger_getRootLogger().setThreshold(arc.VERBOSE)
logger = arc.Logger(arc.Logger_getRootLogger(), "jobsubmit")

# UserConfig contains information on credentials and default services to use.
# This form of the constructor is necessary to initialise the local job list.
usercfg = arc.UserConfig("", "")

# Simple job description which outputs hostname to stdout
jobdescstring = "&(executable=/bin/hostname)(stdout=stdout)"

# Parse job description
jobdescs = arc.JobDescriptionList()
if not arc.JobDescription_Parse(jobdescstring, jobdescs):
    logger.msg(arc.ERROR, "Invalid job description")
    sys.exit(1)

# Use 'arc.JobDescription_ParseFromFile("helloworld.xrsl", jobdescs)'
# to parse job description from file.

# Use top-level NorduGrid information index to find resources
index = arc.Endpoint("ldap://index1.nordugrid.org:2135/Mds-Vo-name=NorduGrid,o=grid",
                     arc.Endpoint.REGISTRY,
                     "org.nordugrid.ldapegiis")
services = arc.EndpointList(1, index)
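The excerpt above stops after assembling the registry endpoint list. A minimal sketch of the remaining brokered-submission step, assuming the arc.Submitter and arc.SubmissionStatus classes of the ARC client Python bindings (not shown in the excerpt), could look like this:

# Sketch only: assumes arc.Submitter/BrokeredSubmit and arc.SubmissionStatus
# from the ARC client Python bindings.
jobs = arc.JobList()
submitter = arc.Submitter(usercfg)
# Discover compute services behind the registry, broker between them and submit
if submitter.BrokeredSubmit(services, jobdescs, jobs) != arc.SubmissionStatus.NONE:
    logger.msg(arc.ERROR, "Failed to submit job")
    sys.exit(1)
for job in jobs:
    print("Submitted job with ID: %s" % job.JobID)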
def post_processing(self, workspec, jobspec_list, map_type):
    '''
    Fetch job output and process pilot info for sending in final heartbeat.
    The pilot pickle is loaded and some attributes corrected (schedulerid,
    pilotlog etc), then converted to dictionary and stored in
    workspec.workAttributes[pandaid]. If pilot pickle cannot be used, report
    ARC error in pilotErrorDiag and fill all possible attributes using ARC
    information.
    '''

    arclog = arc_utils.ARCLogger(baselogger, workspec.workerID)
    tmplog = arclog.log
    tmplog.info('Post processing ARC job {0}'.format(workspec.batchID))

    job = workspec.workAttributes['arcjob']
    arcid = job['JobID']
    tmplog.info('Job id {0}'.format(arcid))

    if 'arcdownloadfiles' not in workspec.workAttributes:
        tmplog.error('No files to download')
        return

    # Assume one-to-one mapping of workers to jobs. If jobspec_list is empty
    # it means the job was cancelled by panda or otherwise forgotten
    if not jobspec_list:
        return

    # Set certificate to use for interacting with ARC CE
    userconfig = arc.UserConfig(self.cred_type)
    if not self._setup_proxy(userconfig, workspec, arcid, tmplog):
        return

    queueconfigmapper = QueueConfigMapper()
    queueconfig = queueconfigmapper.get_queue(jobspec_list[0].computingSite)
    logbaseurl = queueconfig.submitter.get('logBaseURL')
    logbasedir = queueconfig.submitter.get('logDir', self.tmpdir)
    logsubdir = workspec.workAttributes['logsubdir']
    pandaid = str(jobspec_list[0].PandaID)

    # Construct log path and url
    logurl = '/'.join([logbaseurl, logsubdir, str(pandaid)]) if logbaseurl else None
    logdir = os.path.join(logbasedir, logsubdir)

    # post_processing is only called once, so no retries are done. But keep
    # the possibility here in case it changes
    (fetched, notfetched, notfetchedretry) = self._download_outputs(
        workspec.workAttributes['arcdownloadfiles'], logdir, arcid, pandaid,
        userconfig, tmplog)
    if arcid not in fetched:
        tmplog.warning("Could not get outputs of {0}".format(arcid))

    workspec.workAttributes[long(pandaid)] = {}
    workspec.workAttributes[long(pandaid)] = self._extractAndFixPilotPickle(
        job, pandaid, (arcid in fetched), logurl, tmplog)

    tmplog.debug("pilot info for {0}: {1}".format(
        pandaid, workspec.workAttributes[long(pandaid)]))
#! /usr/bin/env python
from __future__ import print_function

import arc
import sys

root_logger = arc.Logger_getRootLogger()
root_logger.addDestination(arc.LogStream(sys.stdout))
root_logger.setThreshold(arc.ERROR)

if len(sys.argv) < 2:
    print("Usage: echo_client.py URL [message]")
    print("  echo_client gets the credentials from the default user config file")
    sys.exit(-1)

url = arc.URL(sys.argv[1])

try:
    message = sys.argv[2]
except IndexError:
    message = 'hi!'

cfg = arc.MCCConfig()
uc = arc.UserConfig('')
uc.ApplyToConfig(cfg)

s = arc.ClientSOAP(cfg, url)

outpayload = arc.PayloadSOAP(arc.NS('echo', 'http://www.nordugrid.org/schemas/echo'))
outpayload.NewChild('echo:echo').NewChild('echo:say').Set(message)

resp, status = s.process(outpayload)

print(resp.GetXML(True))
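Assuming an ARC HED with the echo service enabled is running on the target host (the endpoint below is illustrative), the client would be invoked as, for example, python echo_client.py https://localhost:60000/Echo "hello"; the SOAP response XML is printed to stdout.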
def check_workers(self, workspec_list):
    retList = []
    for workspec in workspec_list:

        # make logger
        arclog = arc_utils.ARCLogger(baselogger, workspec.workerID)
        tmplog = arclog.log
        tmplog.info("checking worker id {0}".format(workspec.workerID))

        (job, modtime, proxyrole) = arc_utils.workspec2arcjob(workspec)

        # Set certificate
        userconfig = arc.UserConfig(self.cred_type)
        try:
            userconfig.ProxyPath(str(self.certs[proxyrole]))
        except:
            tmplog.error("Job {0}: no proxy found with role {1}".format(job.JobID, proxyrole))
            retList.append((workspec.status, ''))
            continue

        job_supervisor = arc.JobSupervisor(userconfig, [job])
        job_supervisor.Update()
        jobsupdated = job_supervisor.GetAllJobs()
        jobsnotupdated = job_supervisor.GetIDsNotProcessed()

        for updatedjob in jobsupdated:
            if updatedjob.JobID in jobsnotupdated:
                tmplog.error("Failed to find information on {0}".format(updatedjob.JobID))
                # If missing for too long (2 days), mark as lost
                if arc.Time() - modtime > arc.Period(172800):
                    tmplog.error("Job {0} missing for more than 2 days, marking as lost".format(updatedjob.JobID))
                    retList.append((workspec.ST_failed, ''))
                else:
                    retList.append((workspec.status, ''))
                continue

            # Convert arc state to WorkSpec state
            arcstatus = updatedjob.State
            newstatus = WorkSpec.ST_submitted
            if arcstatus == arc.JobState.RUNNING or \
               arcstatus == arc.JobState.FINISHING:
                newstatus = WorkSpec.ST_running
            elif arcstatus == arc.JobState.FINISHED:
                if updatedjob.ExitCode == -1:
                    # Missing exit code, but assume success
                    tmplog.warning("Job {0} FINISHED but has missing exit code, setting to zero".format(updatedjob.JobID))
                    updatedjob.ExitCode = 0
                newstatus = WorkSpec.ST_finished
            elif arcstatus == arc.JobState.FAILED:
                newstatus = WorkSpec.ST_failed
                tmplog.info("Job {0} failed: {1}".format(
                    updatedjob.JobID,
                    ";".join([joberr for joberr in updatedjob.Error])))
            elif arcstatus == arc.JobState.KILLED:
                newstatus = WorkSpec.ST_cancelled
            elif arcstatus == arc.JobState.DELETED or \
                 arcstatus == arc.JobState.OTHER:
                # unexpected
                newstatus = WorkSpec.ST_failed

            # Not covered: arc.JobState.HOLD. Maybe need a post-run state in
            # harvester, also to cover FINISHING

            # compare strings here to get around limitations of JobState API
            if job.State.GetGeneralState() == updatedjob.State.GetGeneralState():
                tmplog.debug("Job {0} still in state {1}".format(job.JobID, job.State.GetGeneralState()))
                retList.append((newstatus, ''))
                continue

            tmplog.info("Job {0}: {1} -> {2} ({3})".format(
                job.JobID,
                job.State.GetGeneralState(),
                updatedjob.State.GetGeneralState(),
                updatedjob.State.GetSpecificState()))

            arc_utils.arcjob2workspec(updatedjob, workspec)
            # Have to force update to change info in DB
            workspec.force_update('workAttributes')

            tmplog.debug("batchStatus {0} -> workerStatus {1}".format(
                arcstatus.GetGeneralState(), newstatus))
            retList.append((newstatus, ''))

    return True, retList
import arc
import sys

if len(sys.argv) != 2:
    sys.stdout.write("Usage: python partial_copy.py filename\n")
    sys.exit(1)

desired_size = 512
usercfg = arc.UserConfig()
url = arc.URL(sys.argv[1])
handle = arc.DataHandle(url, usercfg)
point = handle.__ref__()
point.SetSecure(False)  # GridFTP servers generally do not have encrypted data channel

info = arc.FileInfo("")
point.Stat(info)
sys.stdout.write("Name: %s\n" % str(info.GetName()))
fsize = info.GetSize()
if fsize > desired_size:
    point.Range(fsize - desired_size, fsize - 1)

databuffer = arc.DataBuffer()
point.StartReading(databuffer)

while True:
    n = 0
    length = 0
    offset = 0
    (r, n, length, offset, buf) = databuffer.for_write(True)
    if not r:
        break
    sys.stdout.write("BUFFER: %d : %d : %s\n" % (offset, length, str(buf)))
    databuffer.is_written(n)

point.StopReading()
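For example (hypothetical storage URL), python partial_copy.py gsiftp://se.example.org/data/logfile prints the file name and then the last 512 bytes of the file, delivered in buffer-sized chunks.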
def setUp(self):
    self.usercfg = arc.UserConfig(
        arc.initializeCredentialsType(
            arc.initializeCredentialsType.SkipCredentials))
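For context, a minimal hypothetical unittest.TestCase built around this fixture is sketched below; only the UserConfig construction is taken from the snippet above, and the test body simply reuses the XRSL parsing shown in the job submission example.

import unittest
import arc


class JobDescriptionTest(unittest.TestCase):
    '''Hypothetical example: the credential-less UserConfig lets unit tests
    use the ARC libraries without a proxy or host certificate.'''

    def setUp(self):
        self.usercfg = arc.UserConfig(
            arc.initializeCredentialsType(
                arc.initializeCredentialsType.SkipCredentials))

    def test_parse_xrsl(self):
        jobdescs = arc.JobDescriptionList()
        self.assertTrue(
            arc.JobDescription_Parse("&(executable=/bin/hostname)(stdout=stdout)",
                                     jobdescs))


if __name__ == '__main__':
    unittest.main()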
def submit_workers(self, workspec_list):
    retlist = []

    # Get queue info from DB
    pandaqueues = self.dbproxy.get_cache("panda_queues.json", None)
    if pandaqueues is None:
        raise Exception("Failed to get panda queue info from database")
    pandaqueues = pandaqueues.data

    osmap = self.dbproxy.get_cache("ddmendpoints_objectstores.json", None)
    if osmap is None:
        raise Exception("Failed to get Object Store info from database")
    osmap = osmap.data

    for workspec in workspec_list:

        arclog = arc_utils.ARCLogger(baselogger, workspec.workerID)
        tmplog = arclog.log

        # Assume for aCT that jobs are always pre-fetched (no late-binding)
        for jobspec in workspec.get_jobspec_list():

            tmplog.debug("JobSpec: {0}".format(jobspec.values_map()))

            if jobspec.computingSite not in pandaqueues:
                retlist.append((False, "No queue information for {0}".format(jobspec.computingSite)))
                continue

            # Get CEs from panda queue info
            # List of (endpoint, queue) tuples
            arcces = []
            for endpoint in pandaqueues[jobspec.computingSite]['queues']:
                ce_endpoint = endpoint['ce_endpoint']
                if not re.search('://', ce_endpoint):
                    ce_endpoint = 'gsiftp://%s' % ce_endpoint
                ce_queue = endpoint['ce_queue_name']
                arcces.append((ce_endpoint, ce_queue))

            if not arcces:
                retlist.append((False, "No CEs defined for {0}".format(jobspec.computingSite)))
                continue

            # Set true pilot or not
            queueconfigmapper = QueueConfigMapper()
            queueconfig = queueconfigmapper.get_queue(jobspec.computingSite)
            pandaqueues[jobspec.computingSite]['truepilot'] = queueconfig.truePilot

            # Set log URL for GTAG env in job description
            logbaseurl = queueconfig.submitter.get('logBaseURL')
            logsubdir = self._set_logdir(jobspec.computingSite)
            logfileurl = '/'.join([logbaseurl, logsubdir, '%d.out' % jobspec.PandaID]) if logbaseurl else None

            tmplog.debug("Converting to ARC XRSL format")
            arcxrsl = ARCParser(jobspec.jobParams,
                                jobspec.computingSite,
                                pandaqueues[jobspec.computingSite],
                                logfileurl,
                                self.schedulerid,
                                osmap,
                                '/tmp',  # tmpdir, TODO common tmp dir
                                None,    # jobSpec.eventranges, TODO event ranges
                                tmplog)
            arcxrsl.parse()
            xrsl = arcxrsl.getXrsl()
            tmplog.debug("ARC xrsl: {0}".format(xrsl))

            # Set the files to be downloaded at the end of the job
            downloadfiles = 'gmlog/errors'
            if 'logFile' in jobspec.jobParams:
                downloadfiles += ';%s' % jobspec.jobParams['logFile'].replace('.tgz', '')
            if not pandaqueues[jobspec.computingSite]['truepilot']:
                downloadfiles += ';jobSmallFiles.tgz'

            # Set certificate
            userconfig = arc.UserConfig(self.cred_type)
            proxyrole = ''
            if jobspec.jobParams['prodSourceLabel'] == 'user':
                userconfig.ProxyPath(str(self.certs['pilot']))
                proxyrole = 'pilot'
            else:
                userconfig.ProxyPath(str(self.certs['production']))
                proxyrole = 'production'
            tmplog.debug("Submitting using {0} proxy at {1}".format(proxyrole, userconfig.ProxyPath()))

            try:
                tmplog.debug("Submission targets: {0}".format(arcces))
                arcjob = self._arc_submit(xrsl, arcces, userconfig, tmplog)
                tmplog.info("ARC CE job id {0}".format(arcjob.JobID))
                arc_utils.arcjob2workspec(arcjob, workspec)
                workspec.workAttributes['arcdownloadfiles'] = downloadfiles
                workspec.workAttributes['proxyrole'] = proxyrole
                workspec.workAttributes['logsubdir'] = logsubdir
                workspec.batchID = arcjob.JobID
                tmplog.debug(workspec.workAttributes)
                result = (True, '')
            except Exception as exc:
                tmplog.error(traceback.format_exc())
                result = (False, "Failed to submit ARC job: {0}".format(str(exc)))

            retlist.append(result)

    return retlist