def getStatus(self, strIDs, timeout):
    """Return status and attempt number for a list of jobs.

    :param strIDs: space-separated string of PandaIDs
    :param timeout: max seconds allowed for the taskBuffer call
    :return: encoded Protocol.Response; on success it carries 'status' and
             'attemptNr' nodes, each a '+'-separated list aligned with the
             input IDs ('notfound'/'0' for unknown jobs)
    """
    # convert str to list
    ids = strIDs.split()
    # peek jobs under a time limit so the caller is never blocked forever
    tmpWrapper = _TimedMethod(self.taskBuffer.peekJobs, timeout)
    tmpWrapper.run(ids, False, True, True, False)
    # make response
    if tmpWrapper.result == Protocol.TimeOutToken:
        # timeout
        response = Protocol.Response(Protocol.SC_TimeOut)
    elif isinstance(tmpWrapper.result, list):
        # succeed
        response = Protocol.Response(Protocol.SC_Success)
        # build '+'-separated return strings; a None entry means the job
        # was not found by peekJobs
        statusList = []
        attemptList = []
        for job in tmpWrapper.result:
            if job is None:
                statusList.append('notfound')
                attemptList.append('0')
            else:
                statusList.append('%s' % job.jobStatus)
                attemptList.append('%s' % job.attemptNr)
        response.appendNode('status', '+'.join(statusList))
        response.appendNode('attemptNr', '+'.join(attemptList))
    else:
        # failed
        response = Protocol.Response(Protocol.SC_Failed)
    _logger.debug("getStatus : %s ret -> %s" % (strIDs, response.encode()))
    return response.encode()
def getKeyPair(self, realDN, publicKeyName, privateKeyName):
    """Return a public/private key pair to an authorized user.

    The caller must have a DN (HTTPS) and its compact form must be in the
    'allowKey' list of specialDispatchParams; otherwise SC_Perms is
    returned. If either key file is not cached, SC_MissKey is returned.

    :param realDN: the caller's distinguished name, or None if unavailable
    :param publicKeyName: name of the requested public key
    :param privateKeyName: name of the requested private key
    :return: encoded Protocol.Response
    """
    tmpMsg = "getKeyPair {0}/{1} : ".format(publicKeyName, privateKeyName)
    if realDN is None:
        # cannot extract DN
        tmpMsg += "failed since DN cannot be extracted"
        _logger.debug(tmpMsg)
        response = Protocol.Response(
            Protocol.SC_Perms, 'Cannot extract DN from proxy. not HTTPS?')
    else:
        # get compact DN
        compactDN = self.taskBuffer.cleanUserID(realDN)
        # check permission against the cached dispatch parameters
        self.specialDispatchParams.update()
        if 'allowKey' not in self.specialDispatchParams:
            allowKey = []
        else:
            allowKey = self.specialDispatchParams['allowKey']
        if compactDN not in allowKey:
            # permission denied
            tmpMsg += "failed since '{0}' not in the authorized user list who have 'k' in {1}.USERS.GRIDPREF".format(
                compactDN, panda_config.schemaMETA)
            _logger.debug(tmpMsg)
            response = Protocol.Response(Protocol.SC_Perms, tmpMsg)
        else:
            # look for key pair
            if 'keyPair' not in self.specialDispatchParams:
                keyPair = {}
            else:
                keyPair = self.specialDispatchParams['keyPair']
            notFound = False
            if publicKeyName not in keyPair:
                # public key is missing
                notFound = True
                tmpMsg += "failed for '{2}' since {0} is missing on {1}".format(
                    publicKeyName, socket.getfqdn(), compactDN)
            elif privateKeyName not in keyPair:
                # private key is missing
                notFound = True
                tmpMsg += "failed for '{2}' since {0} is missing on {1}".format(
                    privateKeyName, socket.getfqdn(), compactDN)
            if notFound:
                # private or public key is missing
                _logger.debug(tmpMsg)
                response = Protocol.Response(Protocol.SC_MissKey, tmpMsg)
            else:
                # key pair is available
                response = Protocol.Response(Protocol.SC_Success)
                response.appendNode('publicKey', keyPair[publicKeyName])
                response.appendNode('privateKey', keyPair[privateKeyName])
                tmpMsg += "sent key-pair to '{0}'".format(compactDN)
                _logger.debug(tmpMsg)
    # return
    return response.encode()
def updateEventRanges(self, eventRanges, timeout, acceptJson, version):
    """Update a set of event ranges and relay the outcome to the caller."""
    # run the taskBuffer update under a time limit
    timedCall = _TimedMethod(self.taskBuffer.updateEventRanges, timeout)
    timedCall.run(eventRanges, version)
    if timedCall.result == Protocol.TimeOutToken:
        # the DB call did not complete in time
        response = Protocol.Response(Protocol.SC_TimeOut)
    else:
        # completed; propagate the return value and any pilot command
        response = Protocol.Response(Protocol.SC_Success)
        retVal, command = timedCall.result[0], timedCall.result[1]
        response.appendNode('Returns', retVal)
        response.appendNode('Command', command)
    _logger.debug("updateEventRanges : ret -> %s" % (response.encode(acceptJson)))
    return response.encode(acceptJson)
def ackCommands(self, command_ids, timeout, accept_json):
    """Acknowledge the commands identified by a list of command IDs."""
    _logger.debug("command_ids : {0}".format(command_ids))
    # acknowledge under a time limit
    timed_call = _TimedMethod(self.taskBuffer.ackCommands, timeout)
    timed_call.run(command_ids)
    if timed_call.result == Protocol.TimeOutToken:
        # the DB call timed out
        response = Protocol.Response(Protocol.SC_TimeOut)
    else:
        # acknowledged; report the return value
        response = Protocol.Response(Protocol.SC_Success)
        response.appendNode('Returns', timed_call.result)
    _logger.debug("ackCommands : ret -> %s" % (response.encode(accept_json)))
    return response.encode(accept_json)
def getCommands(self, harvester_id, n_commands, timeout, accept_json):
    """Retrieve up to n_commands commands for one harvester instance."""
    # fetch commands under a time limit
    timed_call = _TimedMethod(self.taskBuffer.getCommands, timeout)
    timed_call.run(harvester_id, n_commands)
    if timed_call.result == Protocol.TimeOutToken:
        # the DB call timed out
        response = Protocol.Response(Protocol.SC_TimeOut)
    else:
        # fetched; return value and the command list travel separately
        response = Protocol.Response(Protocol.SC_Success)
        retVal, commands = timed_call.result[0], timed_call.result[1]
        response.appendNode('Returns', retVal)
        response.appendNode('Commands', commands)
    _logger.debug("getCommands : ret -> %s" % (response.encode(accept_json)))
    return response.encode(accept_json)
def getEventRanges(self, pandaID, jobsetID, jediTaskID, nRanges, timeout, acceptJson):
    """Get up to nRanges event ranges for an event-service job.

    :param pandaID: PandaID of the requesting job
    :param jobsetID: jobset the job belongs to
    :param jediTaskID: owning JEDI task
    :param nRanges: max number of ranges to hand out
    :param timeout: max seconds for the taskBuffer call
    :param acceptJson: whether the caller accepts a JSON-encoded response
    :return: encoded Protocol.Response with an 'eventRanges' node on success
    """
    # fetch event ranges under a time limit
    tmpWrapper = _TimedMethod(self.taskBuffer.getEventRanges, timeout)
    tmpWrapper.run(pandaID, jobsetID, jediTaskID, nRanges, acceptJson)
    # make response
    if tmpWrapper.result == Protocol.TimeOutToken:
        # timeout
        response = Protocol.Response(Protocol.SC_TimeOut)
    elif tmpWrapper.result is not None:
        # succeed
        response = Protocol.Response(Protocol.SC_Success)
        response.appendNode('eventRanges', tmpWrapper.result)
    else:
        # failed
        response = Protocol.Response(Protocol.SC_Failed)
    _logger.debug("getEventRanges : %s ret -> %s" % (pandaID, response.encode(acceptJson)))
    return response.encode(acceptJson)
def updateEventRange(self, eventRangeID, eventStatus, coreCount, cpuConsumptionTime, objstoreID, timeout):
    """Update a single event range and return the encoded server response."""
    # run the update under a time limit
    timedCall = _TimedMethod(self.taskBuffer.updateEventRange, timeout)
    timedCall.run(eventRangeID, eventStatus, coreCount, cpuConsumptionTime, objstoreID)
    _logger.debug(str(timedCall.result))
    if timedCall.result == Protocol.TimeOutToken:
        # the DB call timed out
        response = Protocol.Response(Protocol.SC_TimeOut)
    elif timedCall.result[0] == True:
        # updated; forward any command for the pilot
        response = Protocol.Response(Protocol.SC_Success)
        response.appendNode('Command', timedCall.result[1])
    else:
        # the update failed
        response = Protocol.Response(Protocol.SC_Failed)
    _logger.debug("updateEventRange : %s ret -> %s" % (eventRangeID, response.encode()))
    return response.encode()
def getProxy(self, realDN, role):
    """Return a user proxy to an authorized caller.

    The caller must present a DN (HTTPS) and its compact form must be in
    the 'allowProxy' list of specialDispatchParams; otherwise SC_Perms is
    returned. On proxy-retrieval failure the response carries
    SC_ProxyError in its 'StatusCode' node.

    :param realDN: the caller's distinguished name, or None if unavailable
    :param role: VOMS role to attach to the proxy
    :return: encoded (JSON) Protocol.Response
    """
    tmpMsg = "getProxy DN={0} role={1} : ".format(realDN, role)
    if realDN is None:
        # cannot extract DN
        tmpMsg += "failed since DN cannot be extracted"
        _logger.debug(tmpMsg)
        response = Protocol.Response(
            Protocol.SC_Perms, 'Cannot extract DN from proxy. not HTTPS?')
    else:
        # get compact DN
        compactDN = self.taskBuffer.cleanUserID(realDN)
        # check permission against the cached dispatch parameters
        self.specialDispatchParams.update()
        if 'allowProxy' not in self.specialDispatchParams:
            allowProxy = []
        else:
            allowProxy = self.specialDispatchParams['allowProxy']
        if compactDN not in allowProxy:
            # permission denied
            tmpMsg += "failed since '{0}' not in the authorized user list who have 'p' in {1}.USERS.GRIDPREF ".format(
                compactDN, panda_config.schemaMETA)
            tmpMsg += "to get proxy"
            _logger.debug(tmpMsg)
            response = Protocol.Response(Protocol.SC_Perms, tmpMsg)
        else:
            # get proxy
            response = Protocol.Response(Protocol.SC_Success, '')
            tmpStat, tmpMsg = self.setUserProxy(response, realDN, role)
            if not tmpStat:
                # proxy retrieval failed; flag it in the response
                _logger.debug(tmpMsg)
                response.appendNode('StatusCode', Protocol.SC_ProxyError)
            else:
                tmpMsg = "sent proxy"
                _logger.debug(tmpMsg)
    # return; proxies are always JSON-encoded
    return response.encode(True)
def updateJob(req, jobId, state, token=None, transExitCode=None, pilotErrorCode=None, pilotErrorDiag=None, timestamp=None, timeout=60, xml='', node=None, workdir=None, cpuConsumptionTime=None, cpuConsumptionUnit=None, remainingSpace=None, schedulerID=None, pilotID=None, siteName=None, messageLevel=None, pilotLog='', metaData='', cpuConversionFactor=None, exeErrorCode=None, exeErrorDiag=None, pilotTiming=None, computingElement=None, startTime=None, endTime=None, nEvents=None, nInputFiles=None, batchID=None, attemptNr=None, jobMetrics=None, stdout='', jobSubStatus=None, coreCount=None, maxRSS=None, maxVMEM=None, maxSWAP=None, maxPSS=None, avgRSS=None, avgVMEM=None, avgSWAP=None, avgPSS=None):
    """Web entry point for a pilot's job-heartbeat / final status update.

    Validates the caller (production role + token), optionally forwards
    the pilot log to the HTTP logger, collects the non-None keyword
    arguments into a parameter map, and delegates the actual DB update to
    jobDispatcher.updateJob. Returns an encoded Protocol.Response string.
    All parameters arrive as strings from the HTTP layer; numeric fields
    are converted only where the code below does so explicitly.
    """
    tmpLog = LogWrapper(
        _logger, 'updateJob PandaID={0} PID={1}'.format(jobId, os.getpid()))
    tmpLog.debug('start')
    # get DN
    realDN = _getDN(req)
    # get FQANs
    fqans = _getFQAN(req)
    # check production role
    prodManager = _checkRole(fqans, realDN, jobDispatcher, site=siteName, hostname=req.get_remote_host())
    # check token
    validToken = _checkToken(token, jobDispatcher)
    # accept json
    acceptJson = req.acceptJson()
    _logger.debug(
        "updateJob(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,attemptNr:%s,jobSubStatus:%s,core:%s,DN:%s,role:%s,token:%s,val:%s,FQAN:%s,maxRSS=%s,maxVMEM=%s,maxSWAP=%s,maxPSS=%s,avgRSS=%s,avgVMEM=%s,avgSWAP=%s,avgPSS=%s\n==XML==\n%s\n==LOG==\n%s\n==Meta==\n%s\n==Metrics==\n%s\n==stdout==\n%s)"
        % (jobId, state, transExitCode, pilotErrorCode, pilotErrorDiag, node,
           workdir, cpuConsumptionTime, cpuConsumptionUnit, remainingSpace,
           schedulerID, pilotID, siteName, messageLevel, nEvents, nInputFiles,
           cpuConversionFactor, exeErrorCode, exeErrorDiag, pilotTiming,
           computingElement, startTime, endTime, batchID, attemptNr,
           jobSubStatus, coreCount, realDN, prodManager, token, validToken,
           str(fqans), maxRSS, maxVMEM, maxSWAP, maxPSS, avgRSS, avgVMEM,
           avgSWAP, avgPSS, xml, pilotLog, metaData, jobMetrics, stdout))
    _pilotReqLogger.info('method=updateJob,site=%s,node=%s,type=None' % (siteName, node))
    # invalid role
    if not prodManager:
        _logger.warning("updateJob(%s) : invalid role" % jobId)
        return Protocol.Response(Protocol.SC_Role).encode(acceptJson)
    # invalid token
    if not validToken:
        _logger.warning("updateJob(%s) : invalid token" % jobId)
        return Protocol.Response(Protocol.SC_Invalid).encode(acceptJson)
    # aborting message
    if jobId == 'NULL':
        return Protocol.Response(Protocol.SC_Success).encode(acceptJson)
    # check status; unknown states are acknowledged as success but ignored
    if not state in [
            'running', 'failed', 'finished', 'holding', 'starting',
            'transferring'
    ]:
        _logger.warning("invalid state=%s for updateJob" % state)
        return Protocol.Response(Protocol.SC_Success).encode(acceptJson)
    # pilot log: best-effort forwarding to the HTTP logger; failures are
    # logged and swallowed so the job update itself is never blocked
    tmpLog.debug('sending log')
    if pilotLog != '':
        try:
            # make message
            message = pilotLog
            # get logger
            _pandaLogger = PandaLogger()
            _pandaLogger.lock()
            _pandaLogger.setParams({'Type': 'pilotLog', 'PandaID': int(jobId)})
            logger = _pandaLogger.getHttpLogger(panda_config.loggername)
            # add message
            logger.info(message)
        except:
            tmpLog.debug('failed to send log')
        finally:
            tmpLog.debug('release lock')
            try:
                # release HTTP handler
                _pandaLogger.release()
            except:
                pass
    tmpLog.debug('done log')
    # create parameter map; only parameters actually supplied by the pilot
    # are propagated, and free-text fields are truncated to DB column sizes
    param = {}
    if cpuConsumptionTime != None:
        param['cpuConsumptionTime'] = cpuConsumptionTime
    if cpuConsumptionUnit != None:
        param['cpuConsumptionUnit'] = cpuConsumptionUnit
    if node != None:
        param['modificationHost'] = node[:128]
    if transExitCode != None:
        param['transExitCode'] = transExitCode
    if pilotErrorCode != None:
        param['pilotErrorCode'] = pilotErrorCode
    if pilotErrorDiag != None:
        param['pilotErrorDiag'] = pilotErrorDiag[:500]
    if jobMetrics != None:
        param['jobMetrics'] = jobMetrics[:500]
    if schedulerID != None:
        param['schedulerID'] = schedulerID
    if pilotID != None:
        param['pilotID'] = pilotID[:200]
    if batchID != None:
        param['batchID'] = batchID[:80]
    if exeErrorCode != None:
        param['exeErrorCode'] = exeErrorCode
    if exeErrorDiag != None:
        param['exeErrorDiag'] = exeErrorDiag[:500]
    if cpuConversionFactor != None:
        param['cpuConversion'] = cpuConversionFactor
    if pilotTiming != None:
        param['pilotTiming'] = pilotTiming
    if computingElement != None:
        param['computingElement'] = computingElement
    if nEvents != None:
        param['nEvents'] = nEvents
    if nInputFiles != None:
        param['nInputFiles'] = nInputFiles
    if not jobSubStatus in [None, '']:
        param['jobSubStatus'] = jobSubStatus
    if not coreCount in [None, '']:
        param['actualCoreCount'] = coreCount
    if maxRSS != None:
        param['maxRSS'] = maxRSS
    if maxVMEM != None:
        param['maxVMEM'] = maxVMEM
    if maxSWAP != None:
        param['maxSWAP'] = maxSWAP
    if maxPSS != None:
        param['maxPSS'] = maxPSS
    if avgRSS != None:
        param['avgRSS'] = avgRSS
    if avgVMEM != None:
        param['avgVMEM'] = avgVMEM
    if avgSWAP != None:
        param['avgSWAP'] = avgSWAP
    if avgPSS != None:
        param['avgPSS'] = avgPSS
    # timestamps arrive as 'YYYY-MM-DD HH:MM:SS'; unparsable values are
    # silently dropped (best-effort)
    if startTime != None:
        try:
            param['startTime'] = datetime.datetime(
                *time.strptime(startTime, '%Y-%m-%d %H:%M:%S')[:6])
        except:
            pass
    if endTime != None:
        try:
            param['endTime'] = datetime.datetime(
                *time.strptime(endTime, '%Y-%m-%d %H:%M:%S')[:6])
        except:
            pass
    if attemptNr != None:
        try:
            attemptNr = int(attemptNr)
        except:
            attemptNr = None
    if stdout != '':
        stdout = stdout[:2048]
    # invoke JD
    tmpLog.debug('executing')
    return jobDispatcher.updateJob(int(jobId), state, int(timeout), xml,
                                   siteName, param, metaData, attemptNr,
                                   stdout, acceptJson)
def getJob(req, siteName, token=None, timeout=60, cpu=None, mem=None, diskSpace=None, prodSourceLabel=None, node=None, computingElement=None, AtlasRelease=None, prodUserID=None, getProxyKey=None, countryGroup=None, workingGroup=None, allowOtherCountry=None, taskID=None, nJobs=None):
    """Web entry point for a pilot's job request.

    Validates the caller, sanitizes the numeric resource parameters, and
    delegates to jobDispatcher.getJob. Returns an encoded
    Protocol.Response string (SC_Role/SC_Invalid on auth failure).
    All parameters arrive as strings from the HTTP layer; getProxyKey is
    the literal string 'True' when the pilot requests a proxy key.
    """

    def _toNonNegInt(val):
        # pilots may send None or malformed numbers; fall back to 0
        try:
            ival = int(float(val))
        except Exception:
            return 0
        return max(ival, 0)

    _logger.debug("getJob(%s)" % siteName)
    # get DN
    realDN = _getDN(req)
    # get FQANs
    fqans = _getFQAN(req)
    # check production role
    if getProxyKey == 'True':
        # don't use /atlas to prevent normal proxy getting credname
        prodManager = _checkRole(fqans, realDN, jobDispatcher, False, site=siteName)
    else:
        prodManager = _checkRole(fqans, realDN, jobDispatcher, site=siteName, hostname=req.get_remote_host())
    # check token
    validToken = _checkToken(token, jobDispatcher)
    # set DN for non-production user
    if not prodManager:
        prodUserID = realDN
    # allow getProxyKey for production role only
    getProxyKey = (getProxyKey == 'True' and prodManager)
    # convert mem and diskSpace to non-negative integers
    mem = _toNonNegInt(mem)
    diskSpace = _toNonNegInt(diskSpace)
    _logger.debug("getJob(%s,nJobs=%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,taskID=%s,DN:%s,role:%s,token:%s,val:%s,FQAN:%s,json:%s)" \
                  % (siteName,nJobs,cpu,mem,diskSpace,prodSourceLabel,node, computingElement,AtlasRelease,prodUserID,getProxyKey,countryGroup,workingGroup, allowOtherCountry,taskID,realDN,prodManager,token,validToken,str(fqans),req.acceptJson()))
    _pilotReqLogger.info('method=getJob,site=%s,node=%s,type=%s' % (siteName, node, prodSourceLabel))
    # invalid role: non-production callers may only request analysis jobs
    if (not prodManager) and (prodSourceLabel not in ['user']):
        _logger.warning("getJob(%s) : invalid role" % siteName)
        return Protocol.Response(Protocol.SC_Role).encode(req.acceptJson())
    # invalid token
    if not validToken:
        _logger.warning("getJob(%s) : invalid token" % siteName)
        return Protocol.Response(Protocol.SC_Invalid).encode(req.acceptJson())
    # invoke JD
    return jobDispatcher.getJob(siteName, prodSourceLabel, cpu, mem, diskSpace,
                                node, int(timeout), computingElement,
                                AtlasRelease, prodUserID, getProxyKey,
                                countryGroup, workingGroup, allowOtherCountry,
                                realDN, taskID, nJobs, req.acceptJson())
def updateJob(self, jobID, jobStatus, timeout, xml, siteName, param, metadata, attemptNr=None, stdout='', acceptJson=False):
    """Apply a pilot's status update for one job to the database.

    Handles automatic retry of recoverable failures (returning early on a
    successful retry), stores metadata/stdout, maps terminal pilot states
    to 'holding' for asynchronous post-processing, and finally updates
    the job status via taskBuffer under a time limit. Returns an encoded
    Protocol.Response string.
    """
    # recoverable error for ES merge
    recoverableEsMerge = False
    if 'pilotErrorCode' in param and param['pilotErrorCode'] in [
            '1099', '1137', '1151', '1152', '1221', '1224', '1225'
    ]:
        recoverableEsMerge = True
    # retry failed analysis job and ddm job
    if jobStatus=='failed' \
       and ((param.has_key('pilotErrorCode') and (param['pilotErrorCode'] in ['1200','1201'] \
       or param['pilotErrorCode'].startswith('-') \
       or recoverableEsMerge)) \
       or (siteName != None and siteName.find('DDM') != -1)):
        # retry
        if param.has_key('pilotErrorCode') and (param['pilotErrorCode'].startswith('-') or \
           recoverableEsMerge):
            # pilot retry with new PandaID. Negative codes or ESMERGERECOVERABLE
            ret = self.taskBuffer.retryJob(
                jobID, param, getNewPandaID=True, attemptNr=attemptNr,
                recoverableEsMerge=recoverableEsMerge)
        else:
            # old style
            ret = self.taskBuffer.retryJob(jobID, param, attemptNr=attemptNr)
        if ret:
            # retry succeeded: reply success without updating the old job
            response = Protocol.Response(Protocol.SC_Success)
            return response.encode(acceptJson)
    # add metadata
    if metadata != '':
        ret = self.taskBuffer.addMetadata([jobID], [metadata], [jobStatus])
        if len(ret) > 0 and not ret[0]:
            # metadata insertion failed; still acknowledged as success so the
            # pilot does not keep resending
            _logger.debug("updateJob : %s failed to add metadata" % jobID)
            # return succeed
            response = Protocol.Response(Protocol.SC_Success)
            return response.encode(acceptJson)
    # add stdout
    if stdout != '':
        self.taskBuffer.addStdOut(jobID, stdout)
    # update
    tmpStatus = jobStatus
    updateStateChange = False
    if jobStatus == 'failed' or jobStatus == 'finished':
        # terminal states go to 'holding' first; Adder finalizes them later
        tmpStatus = 'holding'
        # update stateChangeTime to prevent Watcher from finding this job
        updateStateChange = True
        param['jobDispatcherErrorDiag'] = None
    elif jobStatus in ['holding', 'transferring']:
        param[
            'jobDispatcherErrorDiag'] = 'set to {0} by the pilot at {1}'.format(
                jobStatus, datetime.datetime.utcnow().strftime('%Y-%m-%d %H:%M:%S'))
    # no time limit for 'holding' updates since they must not be lost
    if tmpStatus == 'holding':
        tmpWrapper = _TimedMethod(self.taskBuffer.updateJobStatus, None)
    else:
        tmpWrapper = _TimedMethod(self.taskBuffer.updateJobStatus, timeout)
    tmpWrapper.run(jobID, tmpStatus, param, updateStateChange, attemptNr)
    # make response
    if tmpWrapper.result == Protocol.TimeOutToken:
        # timeout
        response = Protocol.Response(Protocol.SC_TimeOut)
    else:
        if tmpWrapper.result:
            # succeed
            response = Protocol.Response(Protocol.SC_Success)
            # set command: a string result carries a command for the pilot
            if isinstance(tmpWrapper.result, types.StringType):
                response.appendNode('command', tmpWrapper.result)
            else:
                response.appendNode('command', 'NULL')
            # add output to dataset
            if not tmpWrapper.result in ["badattemptnr", "alreadydone"
                                         ] and (jobStatus == 'failed'
                                                or jobStatus == 'finished'):
                Adder(self.taskBuffer, jobID, xml, jobStatus,
                      attemptNr=attemptNr).start()
        else:
            # failed
            response = Protocol.Response(Protocol.SC_Failed)
    _logger.debug("updateJob : %s ret -> %s" % (jobID, response.encode(acceptJson)))
    return response.encode(acceptJson)
def getJob(self, siteName, prodSourceLabel, cpu, mem, diskSpace, node, timeout, computingElement, atlasRelease, prodUserID, getProxyKey, countryGroup, workingGroup, allowOtherCountry, realDN, taskID, nJobs, acceptJson):
    """Dispatch one or more jobs to a pilot for the given site.

    Fetches jobs from taskBuffer under a time limit, then per job builds a
    Protocol.Response and optionally attaches: proxy key, glexec/proxy-cache
    user proxy, and a panda-proxy secret key for event-service jobs. When
    nJobs was supplied, the per-job payloads are wrapped into one bulk
    response. Returns an encoded Protocol.Response string (SC_TimeOut or
    SC_NoJobs when nothing was dispatched).
    """
    jobs = []
    useGLEXEC = False
    useProxyCache = False
    # nJobs arrives as a string; default to a single job on bad input
    try:
        tmpNumJobs = int(nJobs)
    except:
        tmpNumJobs = None
    if tmpNumJobs == None:
        tmpNumJobs = 1
    # wrapper function for timeout
    if hasattr(panda_config, 'global_shares') and panda_config.global_shares == True:
        tmpWrapper = _TimedMethod(self.taskBuffer.getJobsGShare, timeout)
    else:
        tmpWrapper = _TimedMethod(self.taskBuffer.getJobs, timeout)
    tmpWrapper.run(tmpNumJobs, siteName, prodSourceLabel, cpu, mem, diskSpace,
                   node, timeout, computingElement, atlasRelease, prodUserID,
                   getProxyKey, countryGroup, workingGroup, allowOtherCountry,
                   taskID)
    if isinstance(tmpWrapper.result, types.ListType):
        jobs = jobs + tmpWrapper.result
    # make response; the result list carries proxyKey and nSent as its
    # last two elements, followed by the actual job specs
    if len(jobs) > 0:
        proxyKey = jobs[-1]
        nSent = jobs[-2]
        jobs = jobs[:-2]
    if len(jobs) != 0:
        # succeed
        self.siteMapperCache.update()
        responseList = []
        # append Jobs
        for tmpJob in jobs:
            response = Protocol.Response(Protocol.SC_Success)
            response.appendJob(tmpJob, self.siteMapperCache)
            # append nSent
            response.appendNode('nSent', nSent)
            # set proxy key
            if getProxyKey:
                response.setProxyKey(proxyKey)
            # check if glexec or proxy cache is used
            if hasattr(panda_config, 'useProxyCache'
                       ) and panda_config.useProxyCache == True:
                self.specialDispatchParams.update()
                if not 'glexecSites' in self.specialDispatchParams:
                    glexecSites = {}
                else:
                    glexecSites = self.specialDispatchParams['glexecSites']
                if siteName in glexecSites:
                    if glexecSites[siteName] == 'True':
                        useGLEXEC = True
                    elif glexecSites[siteName] == 'test' and \
                         (prodSourceLabel in ['test','prod_test'] or \
                          (tmpJob.processingType in ['gangarobot'])):
                        useGLEXEC = True
                if not 'proxyCacheSites' in self.specialDispatchParams:
                    proxyCacheSites = {}
                else:
                    proxyCacheSites = self.specialDispatchParams[
                        'proxyCacheSites']
                if siteName in proxyCacheSites:
                    useProxyCache = True
            # set proxy; best-effort, failures are only logged
            if useGLEXEC or useProxyCache:
                try:
                    # get compact
                    compactDN = self.taskBuffer.cleanUserID(realDN)
                    # check permission
                    self.specialDispatchParams.update()
                    if not 'allowProxy' in self.specialDispatchParams:
                        allowProxy = []
                    else:
                        allowProxy = self.specialDispatchParams[
                            'allowProxy']
                    if not compactDN in allowProxy:
                        _logger.warning(
                            "getJob : %s %s '%s' no permission to retrive user proxy"
                            % (siteName, node, compactDN))
                    else:
                        if useProxyCache:
                            tmpStat, tmpOut = response.setUserProxy(
                                proxyCacheSites[siteName]['dn'],
                                proxyCacheSites[siteName]['role'])
                        else:
                            tmpStat, tmpOut = response.setUserProxy()
                        if not tmpStat:
                            _logger.warning(
                                "getJob : %s %s failed to get user proxy : %s"
                                % (siteName, node, tmpOut))
                except:
                    errtype, errvalue = sys.exc_info()[:2]
                    _logger.warning(
                        "getJob : %s %s failed to get user proxy with %s:%s"
                        % (siteName, node, errtype.__name__, errvalue))
            # panda proxy secret key for event-service jobs
            if 'pandaProxySites' in self.specialDispatchParams and siteName in self.specialDispatchParams['pandaProxySites'] \
               and (EventServiceUtils.isEventServiceJob(tmpJob) or EventServiceUtils.isEventServiceMerge(tmpJob)):
                # get secret key
                tmpSecretKey, tmpErrMsg = DispatcherUtils.getSecretKey(
                    tmpJob.PandaID)
                if tmpSecretKey == None:
                    _logger.warning(
                        "getJob : PandaID=%s site=%s failed to get panda proxy secret key : %s"
                        % (tmpJob.PandaID, siteName, tmpErrMsg))
                else:
                    # set secret key
                    _logger.debug("getJob : PandaID=%s key=%s" %
                                  (tmpJob.PandaID, tmpSecretKey))
                    response.setPandaProxySecretKey(tmpSecretKey)
            # add
            responseList.append(response.data)
        # make response for bulk
        if nJobs != None:
            response = Protocol.Response(Protocol.SC_Success)
            if not acceptJson:
                response.appendNode('jobs', json.dumps(responseList))
            else:
                response.appendNode('jobs', responseList)
    else:
        if tmpWrapper.result == Protocol.TimeOutToken:
            # timeout
            response = Protocol.Response(Protocol.SC_TimeOut)
        else:
            # no available jobs
            response = Protocol.Response(Protocol.SC_NoJobs)
            _pilotReqLogger.info('method=noJob,site=%s,node=%s,type=%s' % (siteName, node, prodSourceLabel))
    # return
    _logger.debug("getJob : %s %s useGLEXEC=%s ret -> %s" % (siteName, node, useGLEXEC, response.encode(acceptJson)))
    return response.encode(acceptJson)
def updateJob(req, jobId, state, token=None, transExitCode=None, pilotErrorCode=None, pilotErrorDiag=None, timestamp=None, timeout=60, xml='', node=None, workdir=None, cpuConsumptionTime=None, cpuConsumptionUnit=None, remainingSpace=None, schedulerID=None, pilotID=None, siteName=None, messageLevel=None, pilotLog='', metaData='', cpuConversionFactor=None, exeErrorCode=None, exeErrorDiag=None, pilotTiming=None, computingElement=None, startTime=None, endTime=None, nEvents=None, nInputFiles=None, batchID=None, attemptNr=None, jobMetrics=None, stdout='', jobSubStatus=None, coreCount=None, maxRSS=None, maxVMEM=None, maxSWAP=None, maxPSS=None, avgRSS=None, avgVMEM=None, avgSWAP=None, avgPSS=None, totRCHAR=None, totWCHAR=None, totRBYTES=None, totWBYTES=None, rateRCHAR=None, rateWCHAR=None, rateRBYTES=None, rateWBYTES=None):
    """Web entry point for a pilot's job update (variant with I/O metrics).

    Like the older updateJob entry point, but additionally accepts I/O
    counters (totRCHAR/.../rateWBYTES), which are converted to kByte and
    clamped to 10 digits before being stored. Validates the caller,
    collects the non-None keyword arguments into a parameter map and
    delegates to jobDispatcher.updateJob (passing pilotLog through).
    Returns an encoded Protocol.Response string.
    """
    tmpLog = LogWrapper(
        _logger, 'updateJob PandaID={0} PID={1}'.format(jobId, os.getpid()))
    tmpLog.debug('start')
    # get DN
    realDN = _getDN(req)
    # get FQANs
    fqans = _getFQAN(req)
    # check production role
    prodManager = _checkRole(fqans, realDN, jobDispatcher, site=siteName, hostname=req.get_remote_host())
    # check token
    validToken = _checkToken(token, jobDispatcher)
    # accept json
    acceptJson = req.acceptJson()
    _logger.debug(
        "updateJob(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,attemptNr:%s,jobSubStatus:%s,core:%s,DN:%s,role:%s,token:%s,val:%s,FQAN:%s,maxRSS=%s,maxVMEM=%s,maxSWAP=%s,maxPSS=%s,avgRSS=%s,avgVMEM=%s,avgSWAP=%s,avgPSS=%s,totRCHAR=%s,totWCHAR=%s,totRBYTES=%s,totWBYTES=%s,rateRCHAR=%s,rateWCHAR=%s,rateRBYTES=%s,rateWBYTES=%s\n==XML==\n%s\n==LOG==\n%s\n==Meta==\n%s\n==Metrics==\n%s\n==stdout==\n%s)"
        % (jobId, state, transExitCode, pilotErrorCode, pilotErrorDiag, node,
           workdir, cpuConsumptionTime, cpuConsumptionUnit, remainingSpace,
           schedulerID, pilotID, siteName, messageLevel, nEvents, nInputFiles,
           cpuConversionFactor, exeErrorCode, exeErrorDiag, pilotTiming,
           computingElement, startTime, endTime, batchID, attemptNr,
           jobSubStatus, coreCount, realDN, prodManager, token, validToken,
           str(fqans), maxRSS, maxVMEM, maxSWAP, maxPSS, avgRSS, avgVMEM,
           avgSWAP, avgPSS, totRCHAR, totWCHAR, totRBYTES, totWBYTES,
           rateRCHAR, rateWCHAR, rateRBYTES, rateWBYTES, xml, pilotLog[:1024],
           metaData[:1024], jobMetrics, stdout))
    _pilotReqLogger.debug('method=updateJob,site=%s,node=%s,type=None' % (siteName, node))
    # invalid role; give a human-readable reason only to JSON-aware pilots
    if not prodManager:
        _logger.warning("updateJob(%s) : invalid role" % jobId)
        if acceptJson:
            tmpMsg = 'no production/pilot role in VOMS FQANs or non pilot owner'
        else:
            tmpMsg = None
        return Protocol.Response(Protocol.SC_Role, tmpMsg).encode(acceptJson)
    # invalid token
    if not validToken:
        _logger.warning("updateJob(%s) : invalid token" % jobId)
        return Protocol.Response(Protocol.SC_Invalid).encode(acceptJson)
    # aborting message
    if jobId == 'NULL':
        return Protocol.Response(Protocol.SC_Success).encode(acceptJson)
    # check status; unknown states are acknowledged as success but ignored
    if not state in [
            'running', 'failed', 'finished', 'holding', 'starting',
            'transferring'
    ]:
        _logger.warning("invalid state=%s for updateJob" % state)
        return Protocol.Response(Protocol.SC_Success).encode(acceptJson)
    # create parameter map; only parameters actually supplied by the pilot
    # are propagated, and free-text fields are truncated to DB column sizes
    param = {}
    if cpuConsumptionTime != None:
        param['cpuConsumptionTime'] = cpuConsumptionTime
    if cpuConsumptionUnit != None:
        param['cpuConsumptionUnit'] = cpuConsumptionUnit
    if node != None:
        param['modificationHost'] = node[:128]
    if transExitCode != None:
        param['transExitCode'] = transExitCode
    if pilotErrorCode != None:
        param['pilotErrorCode'] = pilotErrorCode
    if pilotErrorDiag != None:
        param['pilotErrorDiag'] = pilotErrorDiag[:500]
    if jobMetrics != None:
        param['jobMetrics'] = jobMetrics[:500]
    if schedulerID != None:
        param['schedulerID'] = schedulerID
    if pilotID != None:
        param['pilotID'] = pilotID[:200]
    if batchID != None:
        param['batchID'] = batchID[:80]
    if exeErrorCode != None:
        param['exeErrorCode'] = exeErrorCode
    if exeErrorDiag != None:
        param['exeErrorDiag'] = exeErrorDiag[:500]
    if cpuConversionFactor != None:
        param['cpuConversion'] = cpuConversionFactor
    if pilotTiming != None:
        param['pilotTiming'] = pilotTiming
    if computingElement != None:
        param['computingElement'] = computingElement
    if nEvents != None:
        param['nEvents'] = nEvents
    if nInputFiles != None:
        param['nInputFiles'] = nInputFiles
    if not jobSubStatus in [None, '']:
        param['jobSubStatus'] = jobSubStatus
    if not coreCount in [None, '']:
        param['actualCoreCount'] = coreCount
    if maxRSS != None:
        param['maxRSS'] = maxRSS
    if maxVMEM != None:
        param['maxVMEM'] = maxVMEM
    if maxSWAP != None:
        param['maxSWAP'] = maxSWAP
    if maxPSS != None:
        param['maxPSS'] = maxPSS
    if avgRSS != None:
        param['avgRSS'] = avgRSS
    if avgVMEM != None:
        param['avgVMEM'] = avgVMEM
    if avgSWAP != None:
        param['avgSWAP'] = avgSWAP
    if avgPSS != None:
        param['avgPSS'] = avgPSS
    # I/O counters: bytes -> kByte, clamped to fit 10-digit DB columns
    if totRCHAR is not None:
        totRCHAR = int(totRCHAR) / 1024  # convert to kByte
        totRCHAR = min(10**10 - 1, totRCHAR)  # limit to 10 digit
        param['totRCHAR'] = totRCHAR
    if totWCHAR is not None:
        totWCHAR = int(totWCHAR) / 1024  # convert to kByte
        totWCHAR = min(10**10 - 1, totWCHAR)  # limit to 10 digit
        param['totWCHAR'] = totWCHAR
    if totRBYTES is not None:
        totRBYTES = int(totRBYTES) / 1024  # convert to kByte
        totRBYTES = min(10**10 - 1, totRBYTES)  # limit to 10 digit
        param['totRBYTES'] = totRBYTES
    if totWBYTES is not None:
        totWBYTES = int(totWBYTES) / 1024  # convert to kByte
        totWBYTES = min(10**10 - 1, totWBYTES)  # limit to 10 digit
        param['totWBYTES'] = totWBYTES
    if rateRCHAR is not None:
        rateRCHAR = min(10**10 - 1, int(rateRCHAR))  # limit to 10 digit
        param['rateRCHAR'] = rateRCHAR
    if rateWCHAR is not None:
        rateWCHAR = min(10**10 - 1, int(rateWCHAR))  # limit to 10 digit
        param['rateWCHAR'] = rateWCHAR
    if rateRBYTES is not None:
        rateRBYTES = min(10**10 - 1, int(rateRBYTES))  # limit to 10 digit
        param['rateRBYTES'] = rateRBYTES
    if rateWBYTES is not None:
        rateWBYTES = min(10**10 - 1, int(rateWBYTES))  # limit to 10 digit
        param['rateWBYTES'] = rateWBYTES
    # timestamps arrive as 'YYYY-MM-DD HH:MM:SS'; unparsable values are
    # silently dropped (best-effort)
    if startTime != None:
        try:
            param['startTime'] = datetime.datetime(
                *time.strptime(startTime, '%Y-%m-%d %H:%M:%S')[:6])
        except:
            pass
    if endTime != None:
        try:
            param['endTime'] = datetime.datetime(
                *time.strptime(endTime, '%Y-%m-%d %H:%M:%S')[:6])
        except:
            pass
    if attemptNr != None:
        try:
            attemptNr = int(attemptNr)
        except:
            attemptNr = None
    if stdout != '':
        stdout = stdout[:2048]
    # invoke JD
    tmpLog.debug('executing')
    return jobDispatcher.updateJob(int(jobId), state, int(timeout), xml,
                                   siteName, param, metaData, pilotLog,
                                   attemptNr, stdout, acceptJson)