def _query(self, cmd, conn=False): """ Make queries to MPIJob DB """ print "DB3" start = Time.time() ret = DB._query(self, cmd, conn) if DEBUG: print >> debugFile, Time.time() - start, cmd.replace('\n', '') debugFile.flush() print ret return ret
def _update( self, cmd, conn=False ): """ Update MPIJob Database """ print "DB4" start = Time.time() ret = DB._update( self, cmd, conn ) if DEBUG: print >> debugFile, Time.time() - start, cmd.replace('\n','') debugFile.flush() print ret return ret
def _update(self, cmd, conn=False): """ Update MPIJob Database """ print "DB4" start = Time.time() ret = DB._update(self, cmd, conn) if DEBUG: print >> debugFile, Time.time() - start, cmd.replace('\n', '') debugFile.flush() print ret return ret
def _query( self, cmd, conn=False ): """ Make queries to MPIJob DB """ print "DB3" start = Time.time() ret = DB._query( self, cmd, conn ) if DEBUG: print >> debugFile, Time.time() - start, cmd.replace('\n','') debugFile.flush() print ret return ret
def addLoggingRecord(self, jobID, status='idem', minor='idem', application='idem', date='', source='Unknown'):
    """ Add a new entry to the JobLoggingDB table (LoggingInfo).

        One, two or all three status components can be specified.
        Optionally the time stamp of the status can be provided as a string
        in the format '%Y-%m-%d %H:%M:%S' or as a datetime.datetime object.
        If the time stamp is not provided, or cannot be interpreted, the
        value of Time.dateTime() at call time is used instead.

        :param jobID: job identifier, converted with int()
        :param status: major status
        :param minor: minor status
        :param application: application status
        :param date: optional time stamp (string or datetime object)
        :param source: originator of the record
        :return: result of self._update for the INSERT statement
    """

    def _timeOrder(stamp, withMicroseconds=True):
        # Seconds relative to MAGIC_EPOC_NUMBER, rounded to milliseconds.
        seconds = time.mktime(stamp.timetuple())
        if withMicroseconds:
            seconds += stamp.microsecond / 1000000.
        return round(seconds - MAGIC_EPOC_NUMBER, 3)

    event = 'status/minor/app=%s/%s/%s' % (status, minor, application)
    self.gLogger.info("Adding record for job " + str(jobID) + ": '" + event + "' from " + source)

    if not date:
        # No time stamp given: use the current time
        _date = Time.dateTime()
        time_order = _timeOrder(_date)
    else:
        try:
            if isinstance(date, StringTypes):
                # The date is provided as a string
                _date = Time.fromString(date)
                time_order = _timeOrder(_date)
            elif isinstance(date, Time._dateTimeType):
                _date = date
                time_order = _timeOrder(_date)
            else:
                self.gLogger.error('Incorrect date for the logging record')
                # Fallback keeps whole-second resolution, as before
                _date = Time.dateTime()
                time_order = _timeOrder(_date, withMicroseconds=False)
        except Exception:
            # Any failure while interpreting the supplied date falls back to
            # the current time; KeyboardInterrupt/SystemExit now propagate.
            self.gLogger.exception('Exception while date evaluation')
            _date = Time.dateTime()
            time_order = _timeOrder(_date, withMicroseconds=False)

    # NOTE(review): values are interpolated into the SQL text unescaped; a
    # quote inside status/minor/application/source breaks the statement.
    # Escape them with the DB layer's own helper if one is available.
    cmd = "INSERT INTO LoggingInfo (JobId, Status, MinorStatus, ApplicationStatus, " + \
          "StatusTime, StatusTimeOrder, StatusSource) VALUES (%d,'%s','%s','%s','%s',%f,'%s')" % \
          (int(jobID), status, minor, application, str(_date), time_order, source)

    return self._update(cmd)
def addLoggingRecord(self, jobID, status='idem', minor='idem', application='idem', date='', source='Unknown'):
    """ Add a new entry to the JobLoggingDB table (LoggingInfo).

        One, two or all three status components can be specified.
        Optionally the time stamp of the status can be provided as a string
        in the format '%Y-%m-%d %H:%M:%S' or as a datetime.datetime object.
        If the time stamp is not provided, or cannot be interpreted, the
        value of Time.dateTime() at call time is used instead.

        :param jobID: job identifier, converted with int()
        :param status: major status
        :param minor: minor status
        :param application: application status
        :param date: optional time stamp (string or datetime object)
        :param source: originator of the record
        :return: result of self._update for the INSERT statement
    """

    def _timeOrder(stamp, withMicroseconds=True):
        # Seconds relative to MAGIC_EPOC_NUMBER, rounded to milliseconds.
        seconds = time.mktime(stamp.timetuple())
        if withMicroseconds:
            seconds += stamp.microsecond / 1000000.
        return round(seconds - MAGIC_EPOC_NUMBER, 3)

    event = 'status/minor/app=%s/%s/%s' % (status, minor, application)
    self.gLogger.info("Adding record for job " + str(jobID) + ": '" + event + "' from " + source)

    if not date:
        # No time stamp given: use the current time
        _date = Time.dateTime()
        time_order = _timeOrder(_date)
    else:
        try:
            if isinstance(date, StringTypes):
                # The date is provided as a string
                _date = Time.fromString(date)
                time_order = _timeOrder(_date)
            elif isinstance(date, Time._dateTimeType):
                _date = date
                time_order = _timeOrder(_date)
            else:
                self.gLogger.error('Incorrect date for the logging record')
                # Fallback keeps whole-second resolution, as before
                _date = Time.dateTime()
                time_order = _timeOrder(_date, withMicroseconds=False)
        except Exception:
            # Any failure while interpreting the supplied date falls back to
            # the current time; KeyboardInterrupt/SystemExit now propagate.
            self.gLogger.exception('Exception while date evaluation')
            _date = Time.dateTime()
            time_order = _timeOrder(_date, withMicroseconds=False)

    # NOTE(review): values are interpolated into the SQL text unescaped; a
    # quote inside status/minor/application/source breaks the statement.
    # Escape them with the DB layer's own helper if one is available.
    cmd = "INSERT INTO LoggingInfo (JobId, Status, MinorStatus, ApplicationStatus, " + \
          "StatusTime, StatusTimeOrder, StatusSource) VALUES (%d,'%s','%s','%s','%s',%f,'%s')" % \
          (int(jobID), status, minor, application, str(_date), time_order, source)

    return self._update( cmd )
def initialize( self, request ):
    """ Fill in the request attributes according to what *request* is:

        - None: every attribute defaults to 'Unknown', then CreationTime,
          Status, the owner data taken from getProxyInfo() (when available)
          and DIRACSetup are set;
        - an old-style class instance: attributes are copied from it and its
          sub-requests are deep-copied;
        - a string: attributes default to 'Unknown' and the string is handed
          to self.parseRequest().

        Any other type leaves the object untouched.
    """
    requestType = type( request )

    if requestType == types.NoneType:
        # Set some defaults
        for name in self.attributeNames:
            self.attributes[name] = 'Unknown'
        self.attributes['CreationTime'] = str( Time.dateTime() )
        self.attributes['Status'] = "New"
        proxyInfo = getProxyInfo()
        if proxyInfo['OK']:
            proxyDict = proxyInfo[ 'Value' ]
            self.attributes['OwnerDN'] = proxyDict[ 'identity' ]
            if 'group' in proxyDict:
                self.attributes['OwnerGroup'] = proxyDict[ 'group' ]
        self.attributes['DIRACSetup'] = gConfig.getValue( '/DIRAC/Setup', 'Unknown' )

    elif requestType == types.InstanceType:
        # Initialize request from another request
        for name in self.attributeNames:
            self.attributes[name] = request.attributes[name]
        # NOTE(review): reads 'subrequests' but stores 'subRequests' - confirm
        # the attribute name on the source object.
        self.subRequests = copy.deepcopy( request.subrequests )

    elif requestType in types.StringTypes:
        # Initialize request from an XML string
        for name in self.attributeNames:
            self.attributes[name] = 'Unknown'
        self.parseRequest( request )
def setLastUpdate(self, time=''):
    """ Store the last update time.

        An empty/false *time* is replaced by str(Time.dateTime()).
        Always returns S_OK().
    """
    self.LastUpdate = time if time else str(Time.dateTime())
    return S_OK()
def setCreationTime(self, time=''):
    """ Store the creation time.

        An empty/false *time* is replaced by str(Time.dateTime()).
        Always returns S_OK().
    """
    self.CreationTime = time if time else str(Time.dateTime())
    return S_OK()
def __init__(self):
    """ Create a sub-request with its default attribute values. """
    # Identification and state
    self.RequestType = ''
    self.SubRequestID = 0
    self.Status = 'Waiting'
    self.Error = ''

    # What to do and where
    self.Operation = ''
    self.Arguments = ''
    self.SourceSE = ''
    self.TargetSE = ''
    self.Catalog = ''

    # Time stamps, each taken from its own Time.dateTime() call
    self.CreationTime = str(Time.dateTime())
    self.SubmissionTime = str(Time.dateTime())
    self.LastUpdate = str(Time.dateTime())

    # Payload
    self.Files = []
    self.Datasets = []
def setLastUpdate(self,time=''):
    """ Record the last update time; when *time* is empty the current value
        of str(Time.dateTime()) is stored. Returns S_OK().
    """
    stamp = time
    if not stamp:
        stamp = str(Time.dateTime())
    self.LastUpdate = stamp
    return S_OK()
def updateRing(self,updDict): """ Update Ring port and status attributes after master of MPICH2 starts Inputs: {Port, RingID, JobID} Output: {RingID, Status, JobID} """ print "DB15" port = updDict['Port'] ringID = updDict['RingID'] jobID = updDict['JobID'] status = 'RingInit' timeUpd = Time.time() req = "UPDATE Rings SET Port=%s, LastTimeUpdate=UTC_TIMESTAMP(), Status=\'%s\' WHERE RingID=%s AND JobID=%s" % (port,status,ringID,jobID) result = self._query(req) if not result['OK']: print "DB16" self.log.info ('UPDATE PORT ERROR') return S_OK(result) dict = {'RingID': ringID, 'JobID': jobID} result = self.selectRings(dict) values = result['Value'] result ={} keys = ['RingID', 'Status', 'JobID'] for x,y,t in values: z = int(str(x).strip('L')) v = int(str(t).strip('L')) result.setdefault('RingID',z) result.setdefault('Status',y) result.setdefault('JobID',v) print result return S_OK(result)
def updateProcessors(self, updDict): """ Update number of ring processors than are part of particular ring. Input: {RingID, JobID} Output:{RingID} """ print "DB23" ringID = updDict['RingID'] jobID = updDict['JobID'] req = ('SELECT NumberOfProcessorsRing, NumberOfProcessorsJob FROM Rings WHERE RingID=%s AND JobID=%s') % (ringID,jobID) result = self._query(req) if not result['OK']: print "DB24" return S_OK(result) value ={} temp = result['Value'] for x,y in temp: v = temp[0] z = int(str(x).strip('L')) value.setdefault('numProce',z) value.setdefault('numProceJ',y) numProc=int(value['numProce'])+1 timeUpd = Time.time() cmd = 'UPDATE Rings SET NumberOfProcessorsRing=%s, LastTimeUpdate=UTC_TIMESTAMP() WHERE RingID=%s AND JobID=%s' % (numProc, ringID,jobID) result = self._update(cmd) print "RESULT SELF UPDATE", result if not result['OK']: print "Result no OK", result print "DB25" return S_ERROR(result['Message']) matchDict = {'RingID':ringID} result = self.selectRing(matchDict) #result = ringID print "VH >>>>>>>>>>>>> ELIMINE", result return S_OK(result)
def updateRing(self, updDict): """ Update Ring port and status attributes after master of MPICH2 starts Inputs: {Port, RingID, JobID} Output: {RingID, Status, JobID} """ print "DB15" port = updDict['Port'] ringID = updDict['RingID'] jobID = updDict['JobID'] status = 'RingInit' timeUpd = Time.time() req = "UPDATE Rings SET Port=%s, LastTimeUpdate=UTC_TIMESTAMP(), Status=\'%s\' WHERE RingID=%s AND JobID=%s" % ( port, status, ringID, jobID) result = self._query(req) if not result['OK']: print "DB16" self.log.info('UPDATE PORT ERROR') return S_OK(result) dict = {'RingID': ringID, 'JobID': jobID} result = self.selectRings(dict) values = result['Value'] result = {} keys = ['RingID', 'Status', 'JobID'] for x, y, t in values: z = int(str(x).strip('L')) v = int(str(t).strip('L')) result.setdefault('RingID', z) result.setdefault('Status', y) result.setdefault('JobID', v) print result return S_OK(result)
def setCreationTime(self,time=''):
    """ Record the creation time; when *time* is empty the current value of
        str(Time.dateTime()) is stored. Returns S_OK().
    """
    stamp = time
    if not stamp:
        stamp = str(Time.dateTime())
    self.CreationTime = stamp
    return S_OK()
def setApplicationStatus(self, appStatus, sendFlag=True):
    """ Queue an application status record for the job and, when *sendFlag*
        is true, send the stored status information right away.

        Single quotes are removed from *appStatus*. Without a jobID nothing
        is stored and S_OK with an explanatory message is returned.
    """
    if not self.jobID:
        return S_OK('Local execution, jobID is null.')

    stamp = Time.toString()
    cleaned = appStatus.replace("'", '')
    self.appStatusInfo.append((cleaned, stamp))

    if not sendFlag:
        return S_OK()
    return self.sendStoredStatusInfo()
def setJobParameter(self, par_name, par_value, sendFlag=True):
    """ Store a job parameter with the current time stamp and, when
        *sendFlag* is true, send the stored parameters right away.

        Without a jobID nothing is stored and S_OK with an explanatory
        message is returned.
    """
    if not self.jobID:
        return S_OK('Local execution, jobID is null.')

    stamp = Time.toString()
    self.jobParameters[par_name] = (par_value, stamp)

    if not sendFlag:
        return S_OK()
    return self.sendStoredJobParameters()
def setJobParameter( self, par_name, par_value, sendFlag = True ):
    """ Remember (value, time stamp) for job parameter *par_name*; with
        *sendFlag* set the stored parameters are sent immediately and that
        call's result is returned, otherwise S_OK().

        A null jobID means local execution: nothing is recorded.
    """
    if not self.jobID:
        return S_OK( 'Local execution, jobID is null.' )

    record = ( par_value, Time.toString() )
    self.jobParameters[par_name] = record

    if sendFlag:
        result = self.sendStoredJobParameters()
    else:
        result = S_OK()
    return result
def setApplicationStatus( self, appStatus, sendFlag = True ):
    """ Remember (status, time stamp) for the application status, with
        single quotes stripped from the status; with *sendFlag* set the
        stored status information is sent immediately and that call's result
        is returned, otherwise S_OK().

        A null jobID means local execution: nothing is recorded.
    """
    if not self.jobID:
        return S_OK( 'Local execution, jobID is null.' )

    timeStamp = Time.toString()
    self.appStatusInfo.append( ( appStatus.replace( "'", '' ), timeStamp ) )

    if sendFlag:
        result = self.sendStoredStatusInfo()
    else:
        result = S_OK()
    return result
def setJobStatus(self, status="", minor="", application="", sendFlag=True):
    """ Queue a job status record (status, minor status, time stamp) and,
        when *application* is given, an application status record with the
        same time stamp. Single quotes are removed from all three texts.

        With *sendFlag* set the stored status information is sent right away
        and that call's result is returned; otherwise S_OK(). Without a
        jobID nothing is stored.
    """
    if not self.jobID:
        return S_OK("Local execution, jobID is null.")

    stamp = Time.toString()
    statusRecord = (status.replace("'", ""), minor.replace("'", ""), stamp)
    self.jobStatusInfo.append(statusRecord)
    if application:
        self.appStatusInfo.append((application.replace("'", ""), stamp))

    if not sendFlag:
        return S_OK()
    return self.sendStoredStatusInfo()
def setJobParameters(self, parameters, sendFlag=True):
    """ Store several job parameters, all with the same time stamp.

        *parameters* is iterated as (name, value) pairs. With *sendFlag* set
        the stored parameters are sent right away and that call's result is
        returned; otherwise S_OK(). Without a jobID nothing is stored.
    """
    if not self.jobID:
        return S_OK("Local execution, jobID is null.")

    stamp = Time.toString()
    for name, value in parameters:
        self.jobParameters[name] = (value, stamp)

    if not sendFlag:
        return S_OK()
    return self.sendStoredJobParameters()
def __init__(self,rpcStub= None,executionOrder=0):
    """ Build the sub-request attribute dictionary with its initial values.

        Every known attribute starts as "Unknown"; Status, SubRequestID,
        CreationTime and ExecutionOrder are then set. When *rpcStub* is
        given, its encoded form becomes 'Arguments' and its second element
        becomes 'Operation'.
    """
    self.subAttributeNames = ['Status','SubRequestID','Operation','ExecutionOrder','CreationTime','LastUpdate','Arguments']
    self.subAttributes = dict.fromkeys(self.subAttributeNames, "Unknown")

    # Some initial values
    attributes = self.subAttributes
    attributes['Status'] = "Waiting"
    attributes['SubRequestID'] = makeGuid()
    attributes['CreationTime'] = Time.toString()
    attributes['ExecutionOrder'] = executionOrder

    if not rpcStub:
        return
    attributes['Arguments'] = DEncode.encode(rpcStub)
    attributes['Operation'] = rpcStub[1]
def updateProcessors(self, updDict): """ Update number of ring processors than are part of particular ring. Input: {RingID, JobID} Output:{RingID} """ print "DB23" ringID = updDict['RingID'] jobID = updDict['JobID'] req = ( 'SELECT NumberOfProcessorsRing, NumberOfProcessorsJob FROM Rings WHERE RingID=%s AND JobID=%s' ) % (ringID, jobID) result = self._query(req) if not result['OK']: print "DB24" return S_OK(result) value = {} temp = result['Value'] for x, y in temp: v = temp[0] z = int(str(x).strip('L')) value.setdefault('numProce', z) value.setdefault('numProceJ', y) numProc = int(value['numProce']) + 1 timeUpd = Time.time() cmd = 'UPDATE Rings SET NumberOfProcessorsRing=%s, LastTimeUpdate=UTC_TIMESTAMP() WHERE RingID=%s AND JobID=%s' % ( numProc, ringID, jobID) result = self._update(cmd) print "RESULT SELF UPDATE", result if not result['OK']: print "Result no OK", result print "DB25" return S_ERROR(result['Message']) matchDict = {'RingID': ringID} result = self.selectRing(matchDict) #result = ringID print "VH >>>>>>>>>>>>> ELIMINE", result return S_OK(result)
def _submitPilots( self, workDir, taskQueueDict, pilotOptions, pilotsToSubmit,
                   ceMask, submitPrivatePilot, privateTQ, proxy, pilotsPerJob ):
    """ This method does the actual pilot submission to the Grid RB.

        The logic is as follows:
        - If there are no available RB it returns an error
        - If there is no VOMS extension in the proxy, it returns an error
        - It creates a temporary working directory
        - It prepares a JDL, which has a part common to gLite and LCG (the
          payload description) and a part specific to each middleware
        - Optionally it checks (with caching) that some CE matches
        - It submits, registers the pilot references in pilotAgentsDB, and
          recurses with a fresh proxy token while pilots remain to be
          submitted

        :return: S_OK( number of submitted pilots ) or S_ERROR; errors coming
                 from the token request or from the recursive call also carry
                 'Value' with the number of pilots submitted so far
    """
    taskQueueID = taskQueueDict['TaskQueueID']
    credDict = proxy.getCredentials()['Value']
    ownerDN = credDict['identity']
    ownerGroup = credDict[ 'group' ]

    if not self.resourceBrokers:
        # Since we can exclude RBs from the list, it may become empty
        return S_ERROR( ERROR_RB )

    # A VOMS extension is needed for the later interactions with WMS
    ret = gProxyManager.getVOMSAttributes( proxy )
    if not ret['OK']:
        self.log.error( ERROR_VOMS, ret['Message'] )
        return S_ERROR( ERROR_VOMS )
    if not ret['Value']:
        return S_ERROR( ERROR_VOMS )

    workingDirectory = tempfile.mkdtemp( prefix = 'TQ_%s_' % taskQueueID, dir = workDir )
    self.log.verbose( 'Using working Directory:', workingDirectory )

    # Write JDL
    retDict = self._prepareJDL( taskQueueDict, workingDirectory, pilotOptions, pilotsPerJob,
                                ceMask, submitPrivatePilot, privateTQ )
    jdl = retDict['JDL']
    pilotRequirements = retDict['Requirements']
    rb = retDict['RB']
    if not jdl:
        # Best-effort clean-up of the scratch directory
        shutil.rmtree( workingDirectory, ignore_errors = True )
        return S_ERROR( ERROR_JDL )

    # Check that there are available queues for the Job:
    if self.enableListMatch:
        now = Time.dateTime()
        availableCEs = self.listMatchCache.get( pilotRequirements )
        # '== False' is deliberate: an empty (but cached) answer must not
        # trigger a new list-match
        if availableCEs == False:
            availableCEs = self._listMatch( proxy, jdl, taskQueueID, rb )
            if availableCEs != False:
                self.log.verbose( 'LastListMatch', now )
                self.log.verbose( 'AvailableCEs ', availableCEs )
                # listMatchDelay is given in minutes
                self.listMatchCache.add( pilotRequirements, self.listMatchDelay * 60,
                                         value = availableCEs )
        if not availableCEs:
            shutil.rmtree( workingDirectory, ignore_errors = True )
            return S_ERROR( ERROR_CE + ' TQ: %d' % taskQueueID )

    # Now we are ready for the actual submission, so
    self.log.verbose( 'Submitting Pilots for TaskQueue', taskQueueID )
    submitRet = self._submitPilot( proxy, pilotsPerJob, jdl, taskQueueID, rb )
    shutil.rmtree( workingDirectory, ignore_errors = True )
    if not submitRet:
        return S_ERROR( 'Pilot Submission Failed for TQ %d ' % taskQueueID )

    submittedPilots = 0
    # Parametric jobs are used when fewer references than pilots come back
    parametric = pilotsPerJob != 1 and len( submitRet ) != pilotsPerJob
    for pilotReference, resourceBroker in submitRet:
        if parametric:
            pilotReference = self._getChildrenReferences( proxy, pilotReference, taskQueueID )
        else:
            pilotReference = [pilotReference]
        submittedPilots += len( pilotReference )
        pilotAgentsDB.addPilotTQReference( pilotReference, taskQueueID, ownerDN,
                                           ownerGroup, resourceBroker, self.gridMiddleware,
                                           pilotRequirements )

    # add some sleep here
    time.sleep( 0.1 * submittedPilots )

    if pilotsToSubmit > pilotsPerJob:
        # Additional submissions are necessary, need to get a new token and iterate.
        pilotsToSubmit -= pilotsPerJob
        result = gProxyManager.requestToken( ownerDN, ownerGroup,
                                             max( pilotsToSubmit, self.maxJobsInFillMode ) )
        if not result[ 'OK' ]:
            self.log.error( ERROR_TOKEN, result['Message'] )
            result = S_ERROR( ERROR_TOKEN )
            result['Value'] = submittedPilots
            return result
        ( token, numberOfUses ) = result[ 'Value' ]
        # Drop every stale token option. The list is rebuilt in place because
        # removing items while iterating over it skips elements, and the
        # caller's list object must keep being the one that is updated.
        pilotOptions[:] = [ option for option in pilotOptions
                            if not option.startswith( '-o /Security/ProxyToken=' ) ]
        pilotOptions.append( '-o /Security/ProxyToken=%s' % token )
        pilotsPerJob = max( 1, min( pilotsPerJob, int( numberOfUses / self.maxJobsInFillMode ) ) )
        result = self._submitPilots( workDir, taskQueueDict, pilotOptions,
                                     pilotsToSubmit, ceMask,
                                     submitPrivatePilot, privateTQ,
                                     proxy, pilotsPerJob )
        if not result['OK']:
            if 'Value' not in result:
                result['Value'] = 0
            result['Value'] += submittedPilots
            return result
        submittedPilots += result['Value']

    return S_OK( submittedPilots )
def execute( self ):
    """ Main Agent code:
        1.- Query TaskQueueDB for existing TQs
        2.- Add up their Priorities and waiting Jobs
        3.- Submit pilots for each TaskQueue
    """
    self.__checkSubmitPools()

    self.directorDict = getResourceDict()
    # Add all submit pools
    self.directorDict[ 'SubmitPool' ] = self.am_getOption( "SubmitPools" )
    # Add all DIRAC platforms if not specified otherwise
    if 'Platform' not in self.directorDict:
        osCompatibility = gConfig.getOptionsDict( '/Resources/Computing/OSCompatibility' )
        if osCompatibility['OK']:
            self.directorDict['Platform'] = osCompatibility['Value'].keys()

    matcher = RPCClient( "WorkloadManagement/Matcher" )
    matched = matcher.getMatchingTaskQueues( self.directorDict )
    if not matched['OK']:
        self.log.error( 'Could not retrieve TaskQueues from TaskQueueDB', matched['Message'] )
        return matched
    taskQueueDict = matched['Value']

    self.log.info( 'Found %s TaskQueues' % len( taskQueueDict ) )
    if not taskQueueDict:
        self.log.info( 'No TaskQueue to Process' )
        return S_OK()

    prioritySum = 0
    waitingJobs = 0
    for taskQueueID in taskQueueDict:
        taskQueue = taskQueueDict[taskQueueID]
        taskQueue['TaskQueueID'] = taskQueueID
        prioritySum += taskQueue['Priority']
        waitingJobs += taskQueue['Jobs']

    self.log.info( 'Sum of Priorities %s' % prioritySum )

    if waitingJobs == 0:
        self.log.info( 'No waiting Jobs' )
        return S_OK( 'No waiting Jobs' )
    if prioritySum <= 0:
        return S_ERROR( 'Wrong TaskQueue Priorities' )

    self.pilotsPerPriority = self.am_getOption( 'pilotsPerIteration' ) / prioritySum
    self.pilotsPerJob = self.am_getOption( 'pilotsPerIteration' ) / waitingJobs

    # The counter is shared with the submission callbacks
    self.callBackLock.acquire()
    self.submittedPilots = 0
    self.callBackLock.release()
    self.toSubmitPilots = 0

    waitingStatusList = ['Submitted', 'Ready', 'Scheduled', 'Waiting']
    now = Time.dateTime()
    waitingWindow = Time.hour * self.am_getOption( "maxPilotWaitingHours" )
    timeLimitToConsider = Time.toString( now - waitingWindow )

    for taskQueueID in taskQueueDict:
        self.log.verbose( 'Processing TaskQueue', taskQueueID )

        selection = { 'TaskQueueID': taskQueueID, 'Status': waitingStatusList}
        counted = pilotAgentsDB.countPilots( selection, None, timeLimitToConsider )
        if counted['OK']:
            waitingPilots = counted['Value']
            self.log.verbose( 'Waiting Pilots for TaskQueue %s:' % taskQueueID, waitingPilots )
        else:
            self.log.error( 'Fail to get Number of Waiting pilots', counted['Message'] )
            waitingPilots = 0

        submission = self.submitPilotsForTaskQueue( taskQueueDict[taskQueueID], waitingPilots )
        if submission['OK']:
            self.toSubmitPilots += submission['Value']

    self.log.info( 'Number of pilots to be Submitted %s' % self.toSubmitPilots )

    # Now wait until all Jobs in the Default ThreadPool are processed;
    # only the "Default" pool is waited for
    if 'Default' in self.pools:
        self.pools['Default'].processAllResults()

    self.log.info( 'Number of pilots Submitted %s' % self.submittedPilots )

    return S_OK()
def export_setJobStatusBulk(self, jobID, statusDict):
    """ Set various status fields for the job specified by its JobId.

        Only the last status is set in the JobDB, while all the status
        logging information is added to the JobLoggingDB.

        :param jobID: job identifier, converted with int()
        :param statusDict: dictionary keyed by time stamp whose values are
            status dictionaries read with the keys 'Status', 'MinorStatus',
            'ApplicationStatus', 'Source' and, optionally,
            'ApplicationCounter'
        :return: S_OK() on success; S_ERROR( 'No Matching Job' ) when the job
            is unknown; otherwise the first failing result of the DB calls
            that are checked
    """
    status = ""
    minor = ""
    application = ""
    appCounter = ""
    endDate = ''
    startDate = ''
    startFlag = ''
    jobID = int(jobID)

    result = jobDB.getJobAttributes(jobID, ['Status'])
    if not result['OK']:
        return result

    if not result['Value']:
        # if there is no matching Job it returns an empty dictionary
        return S_ERROR('No Matching Job')

    new_status = result['Value']['Status']
    if new_status == "Stalled":
        # A Stalled job that reports again starts from 'Running' unless the
        # incoming records say otherwise
        status = 'Running'

    # Get the latest WN time stamps of status updates
    result = logDB.getWMSTimeStamps(int(jobID))
    if not result['OK']:
        return result
    # Latest of the recorded time stamps, ignoring the 'LastTime' entry.
    # NOTE(review): max() raises ValueError if no other entry exists.
    lastTime = max(
        [float(t) for s, t in result['Value'].items() if s != 'LastTime'])
    from DIRAC import Time
    lastTime = Time.toString(Time.fromEpoch(lastTime))

    # Get the last status values
    dates = sorted(statusDict)
    # We should only update the status if its time stamp is more recent than the last update
    # NOTE(review): keys are compared with the string made by Time.toString;
    # presumably they are strings in the same format - confirm.
    for date in [date for date in dates if date >= lastTime]:
        sDict = statusDict[date]
        if sDict['Status']:
            status = sDict['Status']
            if status in JOB_FINAL_STATES:
                endDate = date
            if status == "Running":
                startFlag = 'Running'
        if sDict['MinorStatus']:
            minor = sDict['MinorStatus']
            # Execution start: first 'Application' minor status seen after a
            # 'Running' status
            if minor == "Application" and startFlag == 'Running':
                startDate = date
        if sDict['ApplicationStatus']:
            application = sDict['ApplicationStatus']
        counter = sDict.get('ApplicationCounter')
        if counter:
            appCounter = counter
    # Only the non-empty components are written to the JobDB
    attrNames = []
    attrValues = []
    if status:
        attrNames.append('Status')
        attrValues.append(status)
    if minor:
        attrNames.append('MinorStatus')
        attrValues.append(minor)
    if application:
        attrNames.append('ApplicationStatus')
        attrValues.append(application)
    if appCounter:
        attrNames.append('ApplicationCounter')
        attrValues.append(appCounter)
    result = jobDB.setJobAttributes(jobID, attrNames, attrValues, update=True)
    if not result['OK']:
        return result

    # The results of the two calls below are not checked
    if endDate:
        result = jobDB.setEndExecTime(jobID, endDate)
    if startDate:
        result = jobDB.setStartExecTime(jobID, startDate)

    # Update the JobLoggingDB records
    # Every record is logged, including those older than lastTime; the
    # status/minor/application names are reused as per-record values here.
    for date in dates:
        sDict = statusDict[date]
        status = sDict['Status']
        if not status:
            status = 'idem'
        minor = sDict['MinorStatus']
        if not minor:
            minor = 'idem'
        application = sDict['ApplicationStatus']
        if not application:
            application = 'idem'
        else:
            # A record carrying an application status is logged as
            # Running/Application
            status = "Running"
            minor = "Application"
        source = sDict['Source']
        result = logDB.addLoggingRecord(jobID, status, minor, application, date, source)
        if not result['OK']:
            return result

    return S_OK()
def _submitPilots( self, workDir, taskQueueDict, pilotOptions, pilotsToSubmit,
                   ceMask, submitPrivatePilot, privateTQ, proxy, pilotsPerJob ):
    """ This method does the actual pilot submission to the Grid RB.

        The logic is as follows:
        - If there are no available RB it returns an error
        - If there is no VOMS extension in the proxy, it returns an error
        - It creates a temporary working directory
        - It prepares a JDL, which has a part common to gLite and LCG (the
          payload description) and a part specific to each middleware
        - Optionally it checks (with caching) that some CE matches
        - It submits, registers the pilot references in pilotAgentsDB, and
          recurses with a fresh proxy token (requested for the generic pilot
          identity) while pilots remain to be submitted

        :return: S_OK( number of submitted pilots ) or S_ERROR; an error
                 coming from the recursive call also carries 'Value' with the
                 number of pilots submitted at this level
    """
    taskQueueID = taskQueueDict['TaskQueueID']
    ownerDN = proxy.getCredentials()['Value']['identity']

    if not self.resourceBrokers:
        # Since we can exclude RBs from the list, it may become empty
        return S_ERROR( ERROR_RB )

    # Need to get VOMS extension for the later interactions with WMS
    ret = gProxyManager.getVOMSAttributes( proxy )
    if not ret['OK']:
        self.log.error( ERROR_VOMS, ret['Message'] )
        return S_ERROR( ERROR_VOMS )
    if not ret['Value']:
        return S_ERROR( ERROR_VOMS )
    vomsGroup = ret['Value'][0]

    workingDirectory = tempfile.mkdtemp( prefix = 'TQ_%s_' % taskQueueID, dir = workDir )
    self.log.verbose( 'Using working Directory:', workingDirectory )

    # Write JDL
    retDict = self._prepareJDL( taskQueueDict, workingDirectory, pilotOptions, pilotsPerJob,
                                ceMask, submitPrivatePilot, privateTQ )
    jdl = retDict['JDL']
    pilotRequirements = retDict['Requirements']
    rb = retDict['RB']
    if not jdl:
        # Best-effort clean-up of the scratch directory
        shutil.rmtree( workingDirectory, ignore_errors = True )
        return S_ERROR( ERROR_JDL )

    # Check that there are available queues for the Job:
    if self.enableListMatch:
        now = Time.dateTime()
        availableCEs = self.listMatchCache.get( pilotRequirements )
        # '== False' is deliberate: an empty (but cached) answer must not
        # trigger a new list-match
        if availableCEs == False:
            availableCEs = self._listMatch( proxy, jdl, taskQueueID, rb )
            if availableCEs != False:
                self.log.verbose( 'LastListMatch', now )
                self.log.verbose( 'AvailableCEs ', availableCEs )
                # listMatchDelay is given in minutes
                self.listMatchCache.add( pilotRequirements, self.listMatchDelay * 60,
                                         value = availableCEs )
        if not availableCEs:
            shutil.rmtree( workingDirectory, ignore_errors = True )
            return S_ERROR( ERROR_CE + ' TQ: %d' % taskQueueID )

    # Now we are ready for the actual submission, so
    self.log.verbose( 'Submitting Pilots for TaskQueue', taskQueueID )
    submitRet = self._submitPilot( proxy, pilotsPerJob, jdl, taskQueueID, rb )
    shutil.rmtree( workingDirectory, ignore_errors = True )
    if not submitRet:
        return S_ERROR( 'Pilot Submission Failed for TQ %d ' % taskQueueID )

    submittedPilots = 0
    if pilotsPerJob != 1 and len( submitRet ) != pilotsPerJob:
        # Parametric jobs are used
        for pilotReference, resourceBroker in submitRet:
            pilotReference = self._getChildrenReferences( proxy, pilotReference, taskQueueID )
            submittedPilots += len( pilotReference )
            pilotAgentsDB.addPilotTQReference( pilotReference, taskQueueID, ownerDN,
                                               vomsGroup, resourceBroker, self.gridMiddleware,
                                               pilotRequirements )
    else:
        for pilotReference, resourceBroker in submitRet:
            pilotReference = [pilotReference]
            submittedPilots += len( pilotReference )
            pilotAgentsDB.addPilotTQReference( pilotReference, taskQueueID, ownerDN,
                                               vomsGroup, broker = resourceBroker,
                                               gridType = self.gridMiddleware,
                                               requirements = pilotRequirements )

    # add some sleep here
    time.sleep( 0.1 * submittedPilots )

    if pilotsToSubmit > pilotsPerJob:
        # Additional submissions are necessary, need to get a new token and iterate.
        pilotsToSubmit -= pilotsPerJob
        ownerDN = self.genericPilotDN
        ownerGroup = self.genericPilotGroup
        result = gProxyManager.requestToken( ownerDN, ownerGroup,
                                             max( pilotsToSubmit, self.maxJobsInFillMode ) )
        if not result[ 'OK' ]:
            self.log.error( ERROR_TOKEN, result['Message'] )
            return S_ERROR( ERROR_TOKEN )
        ( token, numberOfUses ) = result[ 'Value' ]
        # Drop every stale token option. The list is rebuilt in place because
        # removing items while iterating over it skips elements, and the
        # caller's list object must keep being the one that is updated.
        pilotOptions[:] = [ option for option in pilotOptions
                            if not option.startswith( '-o /Security/ProxyToken=' ) ]
        pilotOptions.append( '-o /Security/ProxyToken=%s' % token )
        # Never go below one pilot per job: with zero, the recursion would
        # not reduce pilotsToSubmit any more
        pilotsPerJob = max( 1, min( pilotsPerJob, int( numberOfUses / self.maxJobsInFillMode ) ) )
        result = self._submitPilots( workDir, taskQueueDict, pilotOptions,
                                     pilotsToSubmit, ceMask,
                                     submitPrivatePilot, privateTQ,
                                     proxy, pilotsPerJob )
        if not result['OK']:
            result['Value'] = submittedPilots
            return result
        submittedPilots += result['Value']

    return S_OK( submittedPilots )
def execute(self):
    """ Main Agent code:
        1.- Query TaskQueueDB for existing TQs
        2.- Add up their Priorities and waiting Jobs
        3.- Submit pilots for each TaskQueue
    """
    self.__checkSubmitPools()

    self.directorDict = getResourceDict()
    # Add all submit pools
    self.directorDict['SubmitPool'] = self.am_getOption("SubmitPools")

    matcher = RPCClient("WorkloadManagement/Matcher")
    matched = matcher.getMatchingTaskQueues(self.directorDict)
    if not matched['OK']:
        self.log.error('Could not retrieve TaskQueues from TaskQueueDB',
                       matched['Message'])
        return matched
    taskQueueDict = matched['Value']

    self.log.info('Found %s TaskQueues' % len(taskQueueDict))
    if not taskQueueDict:
        self.log.info('No TaskQueue to Process')
        return S_OK()

    prioritySum = 0
    waitingJobs = 0
    for taskQueueID in taskQueueDict:
        taskQueue = taskQueueDict[taskQueueID]
        taskQueue['TaskQueueID'] = taskQueueID
        prioritySum += taskQueue['Priority']
        waitingJobs += taskQueue['Jobs']

    self.log.info('Sum of Priorities %s' % prioritySum)

    if waitingJobs == 0:
        self.log.info('No waiting Jobs')
        return S_OK('No waiting Jobs')
    if prioritySum <= 0:
        return S_ERROR('Wrong TaskQueue Priorities')

    self.pilotsPerPriority = self.am_getOption('pilotsPerIteration') / prioritySum
    self.pilotsPerJob = self.am_getOption('pilotsPerIteration') / waitingJobs

    # The counter is shared with the submission callbacks
    self.callBackLock.acquire()
    self.submittedPilots = 0
    self.callBackLock.release()
    self.toSubmitPilots = 0

    waitingStatusList = ['Submitted', 'Ready', 'Scheduled', 'Waiting']
    now = Time.dateTime()
    waitingWindow = Time.hour * self.am_getOption("maxPilotWaitingHours")
    timeLimitToConsider = Time.toString(now - waitingWindow)

    for taskQueueID in taskQueueDict:
        self.log.verbose('Processing TaskQueue', taskQueueID)

        selection = {'TaskQueueID': taskQueueID, 'Status': waitingStatusList}
        counted = pilotAgentsDB.countPilots(selection, None, timeLimitToConsider)
        if counted['OK']:
            waitingPilots = counted['Value']
            self.log.verbose('Waiting Pilots for TaskQueue %s:' % taskQueueID,
                             waitingPilots)
        else:
            self.log.error('Fail to get Number of Waiting pilots',
                           counted['Message'])
            waitingPilots = 0

        submission = self.submitPilotsForTaskQueue(taskQueueDict[taskQueueID],
                                                   waitingPilots)
        if submission['OK']:
            self.toSubmitPilots += submission['Value']

    self.log.info('Number of pilots to be Submitted %s' % self.toSubmitPilots)

    # Now wait until all Jobs in the Default ThreadPool are processed;
    # only the "Default" pool is waited for
    if 'Default' in self.pools:
        self.pools['Default'].processAllResults()

    self.log.info('Number of pilots Submitted %s' % self.submittedPilots)

    return S_OK()
def export_setJobStatusBulk( self, jobID, statusDict ):
    """ Set various status fields for the job specified by its JobId.

        Only the last status is set in the JobDB, while all the status
        logging information is added to the JobLoggingDB.

        :param jobID: job identifier, converted with int()
        :param statusDict: dictionary keyed by time stamp whose values are
            status dictionaries read with the keys 'Status', 'MinorStatus',
            'ApplicationStatus', 'Source' and, optionally,
            'ApplicationCounter'
        :return: S_OK() on success; S_ERROR( 'No Matching Job' ) when the job
            is unknown; otherwise the first failing result of the DB calls
            that are checked
    """
    status = ""
    minor = ""
    application = ""
    appCounter = ""
    endDate = ''
    startDate = ''
    startFlag = ''
    jobID = int( jobID )

    result = jobDB.getJobAttributes( jobID, ['Status'] )
    if not result['OK']:
        return result

    if not result['Value']:
        # if there is no matching Job it returns an empty dictionary
        return S_ERROR( 'No Matching Job' )

    new_status = result['Value']['Status']
    if new_status == "Stalled":
        # A Stalled job that reports again starts from 'Running' unless the
        # incoming records say otherwise
        status = 'Running'

    # Get the latest WN time stamps of status updates
    result = logDB.getWMSTimeStamps( int( jobID ) )
    if not result['OK']:
        return result
    # Latest of the recorded time stamps, ignoring the 'LastTime' entry.
    # NOTE(review): max() raises ValueError if no other entry exists.
    lastTime = max( [float( t ) for s, t in result['Value'].items() if s != 'LastTime'] )
    from DIRAC import Time
    lastTime = Time.toString( Time.fromEpoch( lastTime ) )

    # Get the last status values
    dates = sorted( statusDict )
    # We should only update the status if its time stamp is more recent than the last update
    # NOTE(review): keys are compared with the string made by Time.toString;
    # presumably they are strings in the same format - confirm.
    for date in [date for date in dates if date >= lastTime]:
        sDict = statusDict[date]
        if sDict['Status']:
            status = sDict['Status']
            if status in JOB_FINAL_STATES:
                endDate = date
            if status == "Running":
                startFlag = 'Running'
        if sDict['MinorStatus']:
            minor = sDict['MinorStatus']
            # Execution start: first 'Application' minor status seen after a
            # 'Running' status
            if minor == "Application" and startFlag == 'Running':
                startDate = date
        if sDict['ApplicationStatus']:
            application = sDict['ApplicationStatus']
        counter = sDict.get( 'ApplicationCounter' )
        if counter:
            appCounter = counter
    # Only the non-empty components are written to the JobDB
    attrNames = []
    attrValues = []
    if status:
        attrNames.append( 'Status' )
        attrValues.append( status )
    if minor:
        attrNames.append( 'MinorStatus' )
        attrValues.append( minor )
    if application:
        attrNames.append( 'ApplicationStatus' )
        attrValues.append( application )
    if appCounter:
        attrNames.append( 'ApplicationCounter' )
        attrValues.append( appCounter )
    result = jobDB.setJobAttributes( jobID, attrNames, attrValues, update = True )
    if not result['OK']:
        return result

    # The results of the two calls below are not checked
    if endDate:
        result = jobDB.setEndExecTime( jobID, endDate )
    if startDate:
        result = jobDB.setStartExecTime( jobID, startDate )

    # Update the JobLoggingDB records
    # Every record is logged, including those older than lastTime; the
    # status/minor/application names are reused as per-record values here.
    for date in dates:
        sDict = statusDict[date]
        status = sDict['Status']
        if not status:
            status = 'idem'
        minor = sDict['MinorStatus']
        if not minor:
            minor = 'idem'
        application = sDict['ApplicationStatus']
        if not application:
            application = 'idem'
        else:
            # A record carrying an application status is logged as
            # Running/Application
            status = "Running"
            minor = "Application"
        source = sDict['Source']
        result = logDB.addLoggingRecord( jobID, status, minor, application, date, source )
        if not result['OK']:
            return result

    return S_OK()
def execute(self):
    """Main Agent code:
    1.- Query TaskQueueDB for existing TQs
    2.- Add up their Priorities and waiting Jobs
    3.- Submit pilots for each TaskQueue
    """
    self.__checkSubmitPools()

    self.directorDict = getResourceDict()
    # Add all submit pools
    self.directorDict["SubmitPool"] = self.am_getOption("SubmitPools")
    # Add all DIRAC platforms if not specified otherwise
    if "Platform" not in self.directorDict:
        platforms = getDIRACPlatforms()
        if platforms["OK"]:
            self.directorDict["Platform"] = platforms["Value"]

    matcher = RPCClient("WorkloadManagement/Matcher")
    matched = matcher.getMatchingTaskQueues(self.directorDict)
    if not matched["OK"]:
        self.log.error("Could not retrieve TaskQueues from TaskQueueDB", matched["Message"])
        return matched
    taskQueueDict = matched["Value"]

    self.log.info("Found %s TaskQueues" % len(taskQueueDict))
    if not taskQueueDict:
        self.log.info("No TaskQueue to Process")
        return S_OK()

    prioritySum = 0
    waitingJobs = 0
    for taskQueueID in taskQueueDict:
        taskQueue = taskQueueDict[taskQueueID]
        taskQueue["TaskQueueID"] = taskQueueID
        prioritySum += taskQueue["Priority"]
        waitingJobs += taskQueue["Jobs"]

    self.log.info("Sum of Priorities %s" % prioritySum)

    if waitingJobs == 0:
        self.log.info("No waiting Jobs")
        return S_OK("No waiting Jobs")
    if prioritySum <= 0:
        return S_ERROR("Wrong TaskQueue Priorities")

    self.pilotsPerPriority = self.am_getOption("pilotsPerIteration") / prioritySum
    self.pilotsPerJob = self.am_getOption("pilotsPerIteration") / waitingJobs

    # The counter is shared with the submission callbacks
    self.callBackLock.acquire()
    self.submittedPilots = 0
    self.callBackLock.release()
    self.toSubmitPilots = 0

    waitingStatusList = ["Submitted", "Ready", "Scheduled", "Waiting"]
    now = Time.dateTime()
    waitingWindow = Time.hour * self.am_getOption("maxPilotWaitingHours")
    timeLimitToConsider = Time.toString(now - waitingWindow)

    for taskQueueID in taskQueueDict:
        self.log.verbose("Processing TaskQueue", taskQueueID)

        selection = {"TaskQueueID": taskQueueID, "Status": waitingStatusList}
        counted = pilotAgentsDB.countPilots(selection, None, timeLimitToConsider)
        if counted["OK"]:
            waitingPilots = counted["Value"]
            self.log.verbose("Waiting Pilots for TaskQueue %s:" % taskQueueID, waitingPilots)
        else:
            self.log.error("Fail to get Number of Waiting pilots", counted["Message"])
            waitingPilots = 0

        submission = self.submitPilotsForTaskQueue(taskQueueDict[taskQueueID], waitingPilots)
        if submission["OK"]:
            self.toSubmitPilots += submission["Value"]

    self.log.info("Number of pilots to be Submitted %s" % self.toSubmitPilots)

    # Now wait until all Jobs in the Default ThreadPool are processed;
    # only the "Default" pool is waited for
    if "Default" in self.pools:
        self.pools["Default"].processAllResults()

    self.log.info("Number of pilots Submitted %s" % self.submittedPilots)

    return S_OK()