def getQueueSlots(self, queue, manyWaitingPilotsFlag):
    """ Return the number of pilot slots that may be used in the given queue.

    The slot count is cached in self.queueSlots[queue]['AvailableSlots'].  When
    the cached count is zero, the CE is asked for its availability, but only on
    calls where the per-queue counter 'AvailableSlotsCount' is a multiple of 10.

    :param queue: key of the queue in self.queueDict
    :param manyWaitingPilotsFlag: when true, 0 is returned as long as pilots or
                                  CE jobs are still reported as waiting
    :return: number of slots available for submission (0 if none)
    """
    queueInfo = self.queueDict[queue]
    computingElement = queueInfo['CE']
    ceName = queueInfo['CEName']
    queueName = queueInfo['QueueName']

    slotCache = self.queueSlots.setdefault(queue, {})
    totalSlots = slotCache.get('AvailableSlots', 0)

    # Cached slots exist: allow submission only if no pilot is waiting for this queue
    if totalSlots and manyWaitingPilotsFlag:
        waitingQuery = pilotAgentsDB.selectPilots({'DestinationSite': ceName,
                                                   'Queue': queueName,
                                                   'Status': WAITING_PILOT_STATUS})
        if waitingQuery['OK'] and not waitingQuery['Value']:
            return totalSlots
        return 0

    pollCounter = slotCache.setdefault('AvailableSlotsCount', 0)
    # Assume waiting jobs until the CE reports otherwise
    waitingJobs = 1
    if totalSlots == 0:
        if pollCounter % 10 == 0:
            # Collect the pilots already known for this queue (None if the query fails)
            knownPilots = None
            pilotQuery = pilotAgentsDB.selectPilots({'DestinationSite': ceName,
                                                     'Queue': queueName,
                                                     'Status': TRANSIENT_PILOT_STATUS})
            if pilotQuery['OK']:
                knownPilots = pilotQuery['Value']

            availability = computingElement.available(knownPilots)
            if availability['OK']:
                ceInfoDict = availability['CEInfoDict']
                reportValues = (ceName, queueName,
                                ceInfoDict['WaitingJobs'], ceInfoDict['RunningJobs'],
                                ceInfoDict['SubmittedJobs'], ceInfoDict['MaxTotalJobs'])
                self.log.info("CE queue report(%s_%s): Wait=%d, Run=%d, Submitted=%d, Max=%d" % reportValues)
                totalSlots = availability['Value']
                self.queueSlots[queue]['AvailableSlots'] = totalSlots
                waitingJobs = ceInfoDict['WaitingJobs']
            else:
                self.log.warn('Failed to check the availability of queue %s: \n%s' % (queue, availability['Message']))
                self.failedQueues[queue] += 1

    # NOTE(review): nesting was reconstructed from a whitespace-collapsed source;
    # the counter is assumed to advance on every call that reaches this point - confirm.
    self.queueSlots[queue]['AvailableSlotsCount'] += 1

    if manyWaitingPilotsFlag and waitingJobs:
        return 0
    return totalSlots
def __getQueueSlots(self, queue):
    """ Return the number of pilot slots that may be used in the given queue.

    The slot count is cached in self.queueSlots[queue]['AvailableSlots'].  When
    the cached count is zero, the CE is asked for its availability, but only on
    calls where the per-queue counter 'AvailableSlotsCount' is a multiple of 10.

    :param queue: key of the queue in self.queueDict
    :return: number of slots available for submission (0 if none)
    """
    queueInfo = self.queueDict[queue]
    computingElement = queueInfo['CE']
    ceName = queueInfo['CEName']
    queueName = queueInfo['QueueName']

    slotCache = self.queueSlots.setdefault(queue, {})
    totalSlots = slotCache.get('AvailableSlots', 0)
    pollCounter = slotCache.setdefault('AvailableSlotsCount', 0)

    if totalSlots == 0:
        if pollCounter % 10 == 0:
            # Collect the pilots already known for this queue (None if the query fails)
            knownPilots = None
            pilotQuery = pilotAgentsDB.selectPilots({'DestinationSite': ceName,
                                                     'Queue': queueName,
                                                     'Status': ['Running', 'Submitted', 'Scheduled']})
            if pilotQuery['OK']:
                knownPilots = pilotQuery['Value']

            availability = computingElement.available(knownPilots)
            if availability['OK']:
                ceInfoDict = availability['CEInfoDict']
                reportValues = (ceName, queueName,
                                ceInfoDict['WaitingJobs'], ceInfoDict['RunningJobs'],
                                ceInfoDict['SubmittedJobs'], ceInfoDict['MaxTotalJobs'])
                self.log.info("CE queue report(%s_%s): Wait=%d, Run=%d, Submitted=%d, Max=%d" % reportValues)
                totalSlots = availability['Value']
                self.queueSlots[queue]['AvailableSlots'] = totalSlots
            else:
                self.log.warn('Failed to check the availability of queue %s: \n%s' % (queue, availability['Message']))
                self.failedQueues[queue] += 1

    # NOTE(review): nesting was reconstructed from a whitespace-collapsed source;
    # the counter is assumed to advance on every call - confirm.
    self.queueSlots[queue]['AvailableSlotsCount'] += 1
    return totalSlots
def updatePilotStatus(self):
    """ Update the status of pilots in transient states, then collect outputs and accounting.

    First pass (per queue): pilots whose DB status is in TRANSIENT_PILOT_STATUS
    and which belong to self.pilotDN / self.pilotGroup are compared with the
    status reported by ce.getJobStatus().  Changed statuses are written back
    with pilotAgentsDB.setPilotStatus().  A pilot reported as "Unknown" keeps
    that status for up to one hour since its last update and is then set to
    'Aborted'.  For pilots that reach a status in FINAL_PILOT_STATUS the output
    is retrieved when self.getOutput is set.  A queue with at least one newly
    aborted pilot gets its self.failedQueues counter incremented.

    Second pass (per queue): pilots in a final status with OutputReady 'False'
    get their output retrieved (if self.getOutput), and pilot accounting is
    sent (if self.sendAccounting) for final pilots with AccountingSent 'False'.

    :return: S_OK() when both passes complete; the failed result of
             gProxyManager.getPilotProxyFromDIRACGroup() if a proxy cannot be obtained
    """
    for queue in self.queueDict:
        ce = self.queueDict[queue]['CE']
        ceName = self.queueDict[queue]['CEName']
        queueName = self.queueDict[queue]['QueueName']
        ceType = self.queueDict[queue]['CEType']
        siteName = self.queueDict[queue]['Site']
        abortedPilots = 0

        result = pilotAgentsDB.selectPilots({'DestinationSite': ceName,
                                             'Queue': queueName,
                                             'GridType': ceType,
                                             'GridSite': siteName,
                                             'Status': TRANSIENT_PILOT_STATUS,
                                             'OwnerDN': self.pilotDN,
                                             'OwnerGroup': self.pilotGroup})
        if not result['OK']:
            self.log.error('Failed to select pilots: %s' % result['Message'])
            continue
        pilotRefs = result['Value']
        if not pilotRefs:
            continue

        result = pilotAgentsDB.getPilotInfo(pilotRefs)
        if not result['OK']:
            self.log.error('Failed to get pilots info from DB', result['Message'])
            continue
        pilotDict = result['Value']

        # Use "<ref>:::<stamp>" references only if every pilot has a stamp;
        # otherwise fall back to the plain references for all of them
        stampedPilotRefs = []
        for pRef in pilotDict:
            if pilotDict[pRef]['PilotStamp']:
                stampedPilotRefs.append(pRef + ":::" + pilotDict[pRef]['PilotStamp'])
            else:
                stampedPilotRefs = list(pilotRefs)
                break

        result = ce.isProxyValid()
        if not result['OK']:
            result = gProxyManager.getPilotProxyFromDIRACGroup(self.pilotDN, self.pilotGroup, 23400)
            if not result['OK']:
                return result
            self.proxy = result['Value']
            ce.setProxy(self.proxy, 23300)

        result = ce.getJobStatus(stampedPilotRefs)
        if not result['OK']:
            self.log.error('Failed to get pilots status from CE', '%s: %s' % (ceName, result['Message']))
            continue
        pilotCEDict = result['Value']

        for pRef in pilotRefs:
            newStatus = ''
            oldStatus = pilotDict[pRef]['Status']
            ceStatus = pilotCEDict[pRef]
            lastUpdateTime = pilotDict[pRef]['LastUpdateTime']
            sinceLastUpdate = dateTime() - lastUpdateTime

            if oldStatus == ceStatus and ceStatus != "Unknown":
                # Normal status did not change, nothing to do
                continue
            elif ceStatus == "Unknown" and oldStatus == "Unknown":
                if sinceLastUpdate < 3600 * second:
                    # Allow 1 hour of Unknown status assuming temporary problems on the CE
                    continue
                else:
                    newStatus = 'Aborted'
            elif ceStatus == "Unknown" and oldStatus not in FINAL_PILOT_STATUS:
                # Possible problems on the CE, let's keep the Unknown status for a while
                newStatus = 'Unknown'
            elif ceStatus != 'Unknown':
                # Update the pilot status to the new value
                newStatus = ceStatus

            if newStatus:
                self.log.info('Updating status to %s for pilot %s' % (newStatus, pRef))
                result = pilotAgentsDB.setPilotStatus(pRef, newStatus, '', 'Updated by SiteDirector')
                if newStatus == "Aborted":
                    abortedPilots += 1

            # Retrieve the pilot output now
            if newStatus in FINAL_PILOT_STATUS:
                if pilotDict[pRef]['OutputReady'].lower() == 'false' and self.getOutput:
                    self.log.info('Retrieving output for pilot %s' % pRef)
                    pilotStamp = pilotDict[pRef]['PilotStamp']
                    pRefStamp = pRef
                    if pilotStamp:
                        pRefStamp = pRef + ':::' + pilotStamp
                    result = ce.getJobOutput(pRefStamp)
                    if not result['OK']:
                        self.log.error('Failed to get pilot output', '%s: %s' % (ceName, result['Message']))
                    else:
                        output, error = result['Value']
                        if output:
                            result = pilotAgentsDB.storePilotOutput(pRef, output, error)
                            if not result['OK']:
                                self.log.error('Failed to store pilot output', result['Message'])
                        else:
                            self.log.warn('Empty pilot output not stored to PilotDB')

        # If something wrong in the queue, make a pause for the job submission
        if abortedPilots:
            self.failedQueues[queue] += 1

    # The pilot can be in Done state set by the job agent: check if the output is retrieved
    for queue in self.queueDict:
        ce = self.queueDict[queue]['CE']

        # isProxyValid() returns a result dict (the first loop reads its 'OK' key).
        # A non-empty dict is always truthy, so the 'OK' flag must be tested explicitly.
        if not ce.isProxyValid(120)['OK']:
            result = gProxyManager.getPilotProxyFromDIRACGroup(self.pilotDN, self.pilotGroup, 1000)
            if not result['OK']:
                return result
            # Keep the renewed proxy; otherwise the stale one would be set again
            self.proxy = result['Value']
            ce.setProxy(self.proxy, 940)

        ceName = self.queueDict[queue]['CEName']
        queueName = self.queueDict[queue]['QueueName']
        ceType = self.queueDict[queue]['CEType']
        siteName = self.queueDict[queue]['Site']

        result = pilotAgentsDB.selectPilots({'DestinationSite': ceName,
                                             'Queue': queueName,
                                             'GridType': ceType,
                                             'GridSite': siteName,
                                             'OutputReady': 'False',
                                             'Status': FINAL_PILOT_STATUS})
        if not result['OK']:
            self.log.error('Failed to select pilots', result['Message'])
            continue
        pilotRefs = result['Value']
        if not pilotRefs:
            continue

        result = pilotAgentsDB.getPilotInfo(pilotRefs)
        if not result['OK']:
            self.log.error('Failed to get pilots info from DB', result['Message'])
            continue
        pilotDict = result['Value']

        if self.getOutput:
            for pRef in pilotRefs:
                self.log.info('Retrieving output for pilot %s' % pRef)
                pilotStamp = pilotDict[pRef]['PilotStamp']
                pRefStamp = pRef
                if pilotStamp:
                    pRefStamp = pRef + ':::' + pilotStamp
                result = ce.getJobOutput(pRefStamp)
                if not result['OK']:
                    self.log.error('Failed to get pilot output', '%s: %s' % (ceName, result['Message']))
                else:
                    output, error = result['Value']
                    result = pilotAgentsDB.storePilotOutput(pRef, output, error)
                    if not result['OK']:
                        self.log.error('Failed to store pilot output', result['Message'])

        # Check if the accounting is to be sent
        if self.sendAccounting:
            result = pilotAgentsDB.selectPilots({'DestinationSite': ceName,
                                                 'Queue': queueName,
                                                 'GridType': ceType,
                                                 'GridSite': siteName,
                                                 'AccountingSent': 'False',
                                                 'Status': FINAL_PILOT_STATUS})
            if not result['OK']:
                self.log.error('Failed to select pilots', result['Message'])
                continue
            pilotRefs = result['Value']
            if not pilotRefs:
                continue

            result = pilotAgentsDB.getPilotInfo(pilotRefs)
            if not result['OK']:
                self.log.error('Failed to get pilots info from DB', result['Message'])
                continue
            pilotDict = result['Value']

            result = self.sendPilotAccounting(pilotDict)
            if not result['OK']:
                self.log.error('Failed to send pilot agent accounting')

    return S_OK()
def updatePilotStatus(self):
    """ Update the status of pilots in transient states, then collect outputs and accounting.

    First pass (per queue): pilots whose DB status is in TRANSIENT_PILOT_STATUS
    and which belong to self.pilotDN / self.pilotGroup are compared with the
    status reported by ce.getJobStatus().  Changed statuses are written back
    with pilotAgentsDB.setPilotStatus(); a pilot reported as "Unknown" by the
    CE while not in a final status is set to 'Aborted'.  For pilots that reach
    a status in FINAL_PILOT_STATUS the output is retrieved when self.getOutput
    is set.

    Second pass (per queue): pilots in a final status with OutputReady 'False'
    get their output retrieved (if self.getOutput), and pilot accounting is
    sent (if self.sendAccounting) for final pilots with AccountingSent 'False'.

    :return: S_OK() when both passes complete; the failed result of
             gProxyManager.getPilotProxyFromDIRACGroup() if a proxy cannot be obtained
    """
    for queue in self.queueDict:
        ce = self.queueDict[queue]['CE']
        ceName = self.queueDict[queue]['CEName']
        queueName = self.queueDict[queue]['QueueName']
        ceType = self.queueDict[queue]['CEType']
        siteName = self.queueDict[queue]['Site']

        result = pilotAgentsDB.selectPilots({
            'DestinationSite': ceName,
            'Queue': queueName,
            'GridType': ceType,
            'GridSite': siteName,
            'Status': TRANSIENT_PILOT_STATUS,
            'OwnerDN': self.pilotDN,
            'OwnerGroup': self.pilotGroup
        })
        if not result['OK']:
            self.log.error('Failed to select pilots: %s' % result['Message'])
            continue
        pilotRefs = result['Value']
        if not pilotRefs:
            continue

        result = pilotAgentsDB.getPilotInfo(pilotRefs)
        if not result['OK']:
            self.log.error('Failed to get pilots info from DB', result['Message'])
            continue
        pilotDict = result['Value']

        # Use "<ref>:::<stamp>" references only if every pilot has a stamp;
        # otherwise fall back to the plain references for all of them
        stampedPilotRefs = []
        for pRef in pilotDict:
            if pilotDict[pRef]['PilotStamp']:
                stampedPilotRefs.append(pRef + ":::" + pilotDict[pRef]['PilotStamp'])
            else:
                stampedPilotRefs = list(pilotRefs)
                break

        result = ce.isProxyValid()
        if not result['OK']:
            result = gProxyManager.getPilotProxyFromDIRACGroup(
                self.pilotDN, self.pilotGroup, 600)
            if not result['OK']:
                return result
            self.proxy = result['Value']
            ce.setProxy(self.proxy, 500)

        result = ce.getJobStatus(stampedPilotRefs)
        if not result['OK']:
            self.log.error('Failed to get pilots status from CE',
                           '%s: %s' % (ceName, result['Message']))
            continue
        pilotCEDict = result['Value']

        for pRef in pilotRefs:
            newStatus = ''
            oldStatus = pilotDict[pRef]['Status']
            ceStatus = pilotCEDict[pRef]

            if oldStatus == ceStatus:
                # Status did not change, nothing to do
                continue
            elif ceStatus == "Unknown" and oldStatus not in FINAL_PILOT_STATUS:
                # Pilot finished without reporting, consider it Aborted
                newStatus = 'Aborted'
            elif ceStatus != 'Unknown':
                # Update the pilot status to the new value
                newStatus = ceStatus

            if newStatus:
                self.log.info('Updating status to %s for pilot %s' % (newStatus, pRef))
                result = pilotAgentsDB.setPilotStatus(
                    pRef, newStatus, '', 'Updated by SiteDirector')

            # Retrieve the pilot output now
            if newStatus in FINAL_PILOT_STATUS:
                if pilotDict[pRef]['OutputReady'].lower() == 'false' and self.getOutput:
                    self.log.info('Retrieving output for pilot %s' % pRef)
                    pilotStamp = pilotDict[pRef]['PilotStamp']
                    pRefStamp = pRef
                    if pilotStamp:
                        pRefStamp = pRef + ':::' + pilotStamp
                    result = ce.getJobOutput(pRefStamp)
                    if not result['OK']:
                        self.log.error('Failed to get pilot output',
                                       '%s: %s' % (ceName, result['Message']))
                    else:
                        output, error = result['Value']
                        if output:
                            result = pilotAgentsDB.storePilotOutput(pRef, output, error)
                            if not result['OK']:
                                self.log.error('Failed to store pilot output',
                                               result['Message'])
                        else:
                            self.log.warn('Empty pilot output not stored to PilotDB')

    # The pilot can be in Done state set by the job agent: check if the output is retrieved
    for queue in self.queueDict:
        ce = self.queueDict[queue]['CE']

        # isProxyValid() returns a result dict (the first loop reads its 'OK' key).
        # A non-empty dict is always truthy, so the 'OK' flag must be tested explicitly.
        if not ce.isProxyValid(120)['OK']:
            result = gProxyManager.getPilotProxyFromDIRACGroup(
                self.pilotDN, self.pilotGroup, 1000)
            if not result['OK']:
                return result
            # Keep the renewed proxy; otherwise the stale one would be set again
            self.proxy = result['Value']
            ce.setProxy(self.proxy, 940)

        ceName = self.queueDict[queue]['CEName']
        queueName = self.queueDict[queue]['QueueName']
        ceType = self.queueDict[queue]['CEType']
        siteName = self.queueDict[queue]['Site']

        result = pilotAgentsDB.selectPilots({
            'DestinationSite': ceName,
            'Queue': queueName,
            'GridType': ceType,
            'GridSite': siteName,
            'OutputReady': 'False',
            'Status': FINAL_PILOT_STATUS
        })
        if not result['OK']:
            self.log.error('Failed to select pilots', result['Message'])
            continue
        pilotRefs = result['Value']
        if not pilotRefs:
            continue

        result = pilotAgentsDB.getPilotInfo(pilotRefs)
        if not result['OK']:
            self.log.error('Failed to get pilots info from DB', result['Message'])
            continue
        pilotDict = result['Value']

        if self.getOutput:
            for pRef in pilotRefs:
                self.log.info('Retrieving output for pilot %s' % pRef)
                pilotStamp = pilotDict[pRef]['PilotStamp']
                pRefStamp = pRef
                if pilotStamp:
                    pRefStamp = pRef + ':::' + pilotStamp
                result = ce.getJobOutput(pRefStamp)
                if not result['OK']:
                    self.log.error('Failed to get pilot output',
                                   '%s: %s' % (ceName, result['Message']))
                else:
                    output, error = result['Value']
                    result = pilotAgentsDB.storePilotOutput(pRef, output, error)
                    if not result['OK']:
                        self.log.error('Failed to store pilot output', result['Message'])

        # Check if the accounting is to be sent
        if self.sendAccounting:
            result = pilotAgentsDB.selectPilots({
                'DestinationSite': ceName,
                'Queue': queueName,
                'GridType': ceType,
                'GridSite': siteName,
                'AccountingSent': 'False',
                'Status': FINAL_PILOT_STATUS
            })
            if not result['OK']:
                self.log.error('Failed to select pilots', result['Message'])
                continue
            pilotRefs = result['Value']
            if not pilotRefs:
                continue

            result = pilotAgentsDB.getPilotInfo(pilotRefs)
            if not result['OK']:
                self.log.error('Failed to get pilots info from DB', result['Message'])
                continue
            pilotDict = result['Value']

            result = self.sendPilotAccounting(pilotDict)
            if not result['OK']:
                self.log.error('Failed to send pilot agent accounting')

    return S_OK()
def updatePilotStatus(self):
    """ Update the status of pilots in transient states, then collect outputs and accounting.

    First pass (per queue): pilots whose DB status is in TRANSIENT_PILOT_STATUS
    and which belong to self.pilotDN / self.pilotGroup are compared with the
    status reported by ce.getJobStatus().  Changed statuses are written back
    with pilotAgentsDB.setPilotStatus(); a pilot reported as "Unknown" by the
    CE while not in a final status is set to 'Aborted'.  For pilots that reach
    a status in FINAL_PILOT_STATUS the output is retrieved when self.getOutput
    is set.

    Second pass (per queue): pilots in a final status with OutputReady 'False'
    get their output retrieved (if self.getOutput), and pilot accounting is
    sent (if self.sendAccounting) for final pilots with AccountingSent 'False'.

    :return: S_OK() when both passes complete; the failed result of
             gProxyManager.getPilotProxyFromDIRACGroup() if a proxy cannot be obtained
    """
    for queue in self.queueDict:
        ce = self.queueDict[queue]['CE']
        ceName = self.queueDict[queue]['CEName']
        queueName = self.queueDict[queue]['QueueName']
        ceType = self.queueDict[queue]['CEType']
        siteName = self.queueDict[queue]['Site']

        result = pilotAgentsDB.selectPilots({'DestinationSite': ceName,
                                             'Queue': queueName,
                                             'GridType': ceType,
                                             'GridSite': siteName,
                                             'Status': TRANSIENT_PILOT_STATUS,
                                             'OwnerDN': self.pilotDN,
                                             'OwnerGroup': self.pilotGroup})
        if not result['OK']:
            self.log.error('Failed to select pilots: %s' % result['Message'])
            continue
        pilotRefs = result['Value']
        if not pilotRefs:
            continue

        result = pilotAgentsDB.getPilotInfo(pilotRefs)
        if not result['OK']:
            self.log.error('Failed to get pilots info from DB', result['Message'])
            continue
        pilotDict = result['Value']

        # Use "<ref>:::<stamp>" references only if every pilot has a stamp;
        # otherwise fall back to the plain references for all of them
        stampedPilotRefs = []
        for pRef in pilotDict:
            if pilotDict[pRef]['PilotStamp']:
                stampedPilotRefs.append(pRef + ":::" + pilotDict[pRef]['PilotStamp'])
            else:
                stampedPilotRefs = list(pilotRefs)
                break

        result = ce.isProxyValid()
        if not result['OK']:
            result = gProxyManager.getPilotProxyFromDIRACGroup(self.pilotDN, self.pilotGroup, 600)
            if not result['OK']:
                return result
            self.proxy = result['Value']
            ce.setProxy(self.proxy, 500)

        result = ce.getJobStatus(stampedPilotRefs)
        if not result['OK']:
            self.log.error('Failed to get pilots status from CE', '%s: %s' % (ceName, result['Message']))
            continue
        pilotCEDict = result['Value']

        for pRef in pilotRefs:
            newStatus = ''
            oldStatus = pilotDict[pRef]['Status']
            ceStatus = pilotCEDict[pRef]

            if oldStatus == ceStatus:
                # Status did not change, nothing to do
                continue
            elif ceStatus == "Unknown" and oldStatus not in FINAL_PILOT_STATUS:
                # Pilot finished without reporting, consider it Aborted
                newStatus = 'Aborted'
            elif ceStatus != 'Unknown':
                # Update the pilot status to the new value
                newStatus = ceStatus

            if newStatus:
                self.log.info('Updating status to %s for pilot %s' % (newStatus, pRef))
                result = pilotAgentsDB.setPilotStatus(pRef, newStatus, '', 'Updated by SiteDirector')

            # Retrieve the pilot output now
            if newStatus in FINAL_PILOT_STATUS:
                if pilotDict[pRef]['OutputReady'].lower() == 'false' and self.getOutput:
                    self.log.info('Retrieving output for pilot %s' % pRef)
                    pilotStamp = pilotDict[pRef]['PilotStamp']
                    pRefStamp = pRef
                    if pilotStamp:
                        pRefStamp = pRef + ':::' + pilotStamp
                    result = ce.getJobOutput(pRefStamp)
                    if not result['OK']:
                        self.log.error('Failed to get pilot output', '%s: %s' % (ceName, result['Message']))
                    else:
                        output, error = result['Value']
                        if output:
                            result = pilotAgentsDB.storePilotOutput(pRef, output, error)
                            if not result['OK']:
                                self.log.error('Failed to store pilot output', result['Message'])
                        else:
                            self.log.warn('Empty pilot output not stored to PilotDB')

    # The pilot can be in Done state set by the job agent: check if the output is retrieved
    for queue in self.queueDict:
        ce = self.queueDict[queue]['CE']

        # isProxyValid() returns a result dict (the first loop reads its 'OK' key).
        # A non-empty dict is always truthy, so the 'OK' flag must be tested explicitly.
        if not ce.isProxyValid(120)['OK']:
            result = gProxyManager.getPilotProxyFromDIRACGroup(self.pilotDN, self.pilotGroup, 1000)
            if not result['OK']:
                return result
            # Keep the renewed proxy; otherwise the stale one would be set again
            self.proxy = result['Value']
            ce.setProxy(self.proxy, 940)

        ceName = self.queueDict[queue]['CEName']
        queueName = self.queueDict[queue]['QueueName']
        ceType = self.queueDict[queue]['CEType']
        siteName = self.queueDict[queue]['Site']

        result = pilotAgentsDB.selectPilots({'DestinationSite': ceName,
                                             'Queue': queueName,
                                             'GridType': ceType,
                                             'GridSite': siteName,
                                             'OutputReady': 'False',
                                             'Status': FINAL_PILOT_STATUS})
        if not result['OK']:
            self.log.error('Failed to select pilots', result['Message'])
            continue
        pilotRefs = result['Value']
        if not pilotRefs:
            continue

        result = pilotAgentsDB.getPilotInfo(pilotRefs)
        if not result['OK']:
            self.log.error('Failed to get pilots info from DB', result['Message'])
            continue
        pilotDict = result['Value']

        if self.getOutput:
            for pRef in pilotRefs:
                self.log.info('Retrieving output for pilot %s' % pRef)
                pilotStamp = pilotDict[pRef]['PilotStamp']
                pRefStamp = pRef
                if pilotStamp:
                    pRefStamp = pRef + ':::' + pilotStamp
                result = ce.getJobOutput(pRefStamp)
                if not result['OK']:
                    self.log.error('Failed to get pilot output', '%s: %s' % (ceName, result['Message']))
                else:
                    output, error = result['Value']
                    result = pilotAgentsDB.storePilotOutput(pRef, output, error)
                    if not result['OK']:
                        self.log.error('Failed to store pilot output', result['Message'])

        # Check if the accounting is to be sent
        if self.sendAccounting:
            result = pilotAgentsDB.selectPilots({'DestinationSite': ceName,
                                                 'Queue': queueName,
                                                 'GridType': ceType,
                                                 'GridSite': siteName,
                                                 'AccountingSent': 'False',
                                                 'Status': FINAL_PILOT_STATUS})
            if not result['OK']:
                self.log.error('Failed to select pilots', result['Message'])
                continue
            pilotRefs = result['Value']
            if not pilotRefs:
                continue

            result = pilotAgentsDB.getPilotInfo(pilotRefs)
            if not result['OK']:
                self.log.error('Failed to get pilots info from DB', result['Message'])
                continue
            pilotDict = result['Value']

            result = self.sendPilotAccounting(pilotDict)
            if not result['OK']:
                self.log.error('Failed to send pilot agent accounting')

    return S_OK()
def updatePilotStatus(self):
    """ Update the status of pilots in transient states, then collect outputs and accounting.

    First pass (per queue): pilots whose DB status is in TRANSIENT_PILOT_STATUS
    and which belong to self.pilotDN / self.pilotGroup are compared with the
    status reported by ce.getJobStatus().  Changed statuses are written back
    with pilotAgentsDB.setPilotStatus().  A pilot reported as "Unknown" keeps
    that status for up to one hour since its last update and is then set to
    'Aborted'.  For pilots that reach a status in FINAL_PILOT_STATUS the output
    is retrieved when self.getOutput is set.  A queue with at least one newly
    aborted pilot gets its self.failedQueues counter incremented.

    Second pass (per queue): pilots in a final status with OutputReady 'False'
    get their output retrieved (if self.getOutput), and pilot accounting is
    sent (if self.sendAccounting) for final pilots with AccountingSent 'False'.

    :return: S_OK() when both passes complete; the failed result of
             gProxyManager.getPilotProxyFromDIRACGroup() if a proxy cannot be obtained
    """
    for queue in self.queueDict:
        ce = self.queueDict[queue]['CE']
        ceName = self.queueDict[queue]['CEName']
        queueName = self.queueDict[queue]['QueueName']
        ceType = self.queueDict[queue]['CEType']
        siteName = self.queueDict[queue]['Site']
        abortedPilots = 0

        result = pilotAgentsDB.selectPilots({'DestinationSite': ceName,
                                             'Queue': queueName,
                                             'GridType': ceType,
                                             'GridSite': siteName,
                                             'Status': TRANSIENT_PILOT_STATUS,
                                             'OwnerDN': self.pilotDN,
                                             'OwnerGroup': self.pilotGroup})
        if not result['OK']:
            self.log.error('Failed to select pilots: %s' % result['Message'])
            continue
        pilotRefs = result['Value']
        if not pilotRefs:
            continue

        result = pilotAgentsDB.getPilotInfo(pilotRefs)
        if not result['OK']:
            self.log.error('Failed to get pilots info from DB', result['Message'])
            continue
        pilotDict = result['Value']

        # Use "<ref>:::<stamp>" references only if every pilot has a stamp;
        # otherwise fall back to the plain references for all of them
        stampedPilotRefs = []
        for pRef in pilotDict:
            if pilotDict[pRef]['PilotStamp']:
                stampedPilotRefs.append(pRef + ":::" + pilotDict[pRef]['PilotStamp'])
            else:
                stampedPilotRefs = list(pilotRefs)
                break

        # This proxy is used for checking the pilot status and renewals.
        # We really need at least a few hours, otherwise the renewed
        # proxy may expire before we check again...
        result = ce.isProxyValid(3 * 3600)
        if not result['OK']:
            result = gProxyManager.getPilotProxyFromDIRACGroup(self.pilotDN, self.pilotGroup, 23400)
            if not result['OK']:
                return result
            self.proxy = result['Value']
            ce.setProxy(self.proxy, 23300)

        result = ce.getJobStatus(stampedPilotRefs)
        if not result['OK']:
            self.log.error('Failed to get pilots status from CE', '%s: %s' % (ceName, result['Message']))
            continue
        pilotCEDict = result['Value']

        for pRef in pilotRefs:
            newStatus = ''
            oldStatus = pilotDict[pRef]['Status']
            ceStatus = pilotCEDict[pRef]
            lastUpdateTime = pilotDict[pRef]['LastUpdateTime']
            sinceLastUpdate = dateTime() - lastUpdateTime

            if oldStatus == ceStatus and ceStatus != "Unknown":
                # Normal status did not change, nothing to do
                continue
            elif ceStatus == "Unknown" and oldStatus == "Unknown":
                if sinceLastUpdate < 3600 * second:
                    # Allow 1 hour of Unknown status assuming temporary problems on the CE
                    continue
                else:
                    newStatus = 'Aborted'
            elif ceStatus == "Unknown" and oldStatus not in FINAL_PILOT_STATUS:
                # Possible problems on the CE, let's keep the Unknown status for a while
                newStatus = 'Unknown'
            elif ceStatus != 'Unknown':
                # Update the pilot status to the new value
                newStatus = ceStatus

            if newStatus:
                self.log.info('Updating status to %s for pilot %s' % (newStatus, pRef))
                result = pilotAgentsDB.setPilotStatus(pRef, newStatus, '', 'Updated by SiteDirector')
                if newStatus == "Aborted":
                    abortedPilots += 1

            # Retrieve the pilot output now
            if newStatus in FINAL_PILOT_STATUS:
                if pilotDict[pRef]['OutputReady'].lower() == 'false' and self.getOutput:
                    self.log.info('Retrieving output for pilot %s' % pRef)
                    pilotStamp = pilotDict[pRef]['PilotStamp']
                    pRefStamp = pRef
                    if pilotStamp:
                        pRefStamp = pRef + ':::' + pilotStamp
                    result = ce.getJobOutput(pRefStamp)
                    if not result['OK']:
                        self.log.error('Failed to get pilot output', '%s: %s' % (ceName, result['Message']))
                    else:
                        output, error = result['Value']
                        if output:
                            result = pilotAgentsDB.storePilotOutput(pRef, output, error)
                            if not result['OK']:
                                self.log.error('Failed to store pilot output', result['Message'])
                        else:
                            self.log.warn('Empty pilot output not stored to PilotDB')

        # If something wrong in the queue, make a pause for the job submission
        if abortedPilots:
            self.failedQueues[queue] += 1

    # The pilot can be in Done state set by the job agent: check if the output is retrieved
    for queue in self.queueDict:
        ce = self.queueDict[queue]['CE']

        # isProxyValid() returns a result dict (the first loop reads its 'OK' key).
        # A non-empty dict is always truthy, so the 'OK' flag must be tested explicitly.
        if not ce.isProxyValid(120)['OK']:
            result = gProxyManager.getPilotProxyFromDIRACGroup(self.pilotDN, self.pilotGroup, 1000)
            if not result['OK']:
                return result
            # Keep the renewed proxy; otherwise the stale one would be set again
            self.proxy = result['Value']
            ce.setProxy(self.proxy, 940)

        ceName = self.queueDict[queue]['CEName']
        queueName = self.queueDict[queue]['QueueName']
        ceType = self.queueDict[queue]['CEType']
        siteName = self.queueDict[queue]['Site']

        result = pilotAgentsDB.selectPilots({'DestinationSite': ceName,
                                             'Queue': queueName,
                                             'GridType': ceType,
                                             'GridSite': siteName,
                                             'OutputReady': 'False',
                                             'Status': FINAL_PILOT_STATUS})
        if not result['OK']:
            self.log.error('Failed to select pilots', result['Message'])
            continue
        pilotRefs = result['Value']
        if not pilotRefs:
            continue

        result = pilotAgentsDB.getPilotInfo(pilotRefs)
        if not result['OK']:
            self.log.error('Failed to get pilots info from DB', result['Message'])
            continue
        pilotDict = result['Value']

        if self.getOutput:
            for pRef in pilotRefs:
                self.log.info('Retrieving output for pilot %s' % pRef)
                pilotStamp = pilotDict[pRef]['PilotStamp']
                pRefStamp = pRef
                if pilotStamp:
                    pRefStamp = pRef + ':::' + pilotStamp
                result = ce.getJobOutput(pRefStamp)
                if not result['OK']:
                    self.log.error('Failed to get pilot output', '%s: %s' % (ceName, result['Message']))
                else:
                    output, error = result['Value']
                    result = pilotAgentsDB.storePilotOutput(pRef, output, error)
                    if not result['OK']:
                        self.log.error('Failed to store pilot output', result['Message'])

        # Check if the accounting is to be sent
        if self.sendAccounting:
            result = pilotAgentsDB.selectPilots({'DestinationSite': ceName,
                                                 'Queue': queueName,
                                                 'GridType': ceType,
                                                 'GridSite': siteName,
                                                 'AccountingSent': 'False',
                                                 'Status': FINAL_PILOT_STATUS})
            if not result['OK']:
                self.log.error('Failed to select pilots', result['Message'])
                continue
            pilotRefs = result['Value']
            if not pilotRefs:
                continue

            result = pilotAgentsDB.getPilotInfo(pilotRefs)
            if not result['OK']:
                self.log.error('Failed to get pilots info from DB', result['Message'])
                continue
            pilotDict = result['Value']

            result = self.sendPilotAccounting(pilotDict)
            if not result['OK']:
                self.log.error('Failed to send pilot agent accounting')

    return S_OK()
def updatePilotStatus(self):
    """Update the status of pilots in transient states, then collect outputs and accounting.

    First pass (per queue): pilots whose DB status is in TRANSIENT_PILOT_STATUS
    and which belong to self.pilotDN / self.pilotGroup are compared with the
    status reported by ce.getJobStatus().  Changed statuses are written back
    with pilotAgentsDB.setPilotStatus(); a pilot reported as "Unknown" by the
    CE while not in a final status is set to "Aborted".  For pilots that reach
    a status in FINAL_PILOT_STATUS the output is retrieved when self.getOutput
    is set.

    Second pass (per queue): pilots in a final status with OutputReady "False"
    get their output retrieved (if self.getOutput), and pilot accounting is
    sent (if self.sendAccounting) for final pilots with AccountingSent "False".

    :return: S_OK() when both passes complete; the failed result of
             gProxyManager.getPilotProxyFromDIRACGroup() if a proxy cannot be obtained
    """
    for queue in self.queueDict:
        ce = self.queueDict[queue]["CE"]
        ceName = self.queueDict[queue]["CEName"]
        queueName = self.queueDict[queue]["QueueName"]
        ceType = self.queueDict[queue]["CEType"]
        siteName = self.queueDict[queue]["Site"]

        result = pilotAgentsDB.selectPilots(
            {
                "DestinationSite": ceName,
                "Queue": queueName,
                "GridType": ceType,
                "GridSite": siteName,
                "Status": TRANSIENT_PILOT_STATUS,
                "OwnerDN": self.pilotDN,
                "OwnerGroup": self.pilotGroup,
            }
        )
        if not result["OK"]:
            self.log.error("Failed to select pilots: %s" % result["Message"])
            continue
        pilotRefs = result["Value"]
        if not pilotRefs:
            continue

        result = pilotAgentsDB.getPilotInfo(pilotRefs)
        if not result["OK"]:
            self.log.error("Failed to get pilots info from DB", result["Message"])
            continue
        pilotDict = result["Value"]

        # Use "<ref>:::<stamp>" references only if every pilot has a stamp;
        # otherwise fall back to the plain references for all of them
        stampedPilotRefs = []
        for pRef in pilotDict:
            if pilotDict[pRef]["PilotStamp"]:
                stampedPilotRefs.append(pRef + ":::" + pilotDict[pRef]["PilotStamp"])
            else:
                stampedPilotRefs = list(pilotRefs)
                break

        result = ce.isProxyValid()
        if not result["OK"]:
            result = gProxyManager.getPilotProxyFromDIRACGroup(self.pilotDN, self.pilotGroup, 600)
            if not result["OK"]:
                return result
            self.proxy = result["Value"]
            ce.setProxy(self.proxy, 500)

        result = ce.getJobStatus(stampedPilotRefs)
        if not result["OK"]:
            self.log.error("Failed to get pilots status from CE", "%s: %s" % (ceName, result["Message"]))
            continue
        pilotCEDict = result["Value"]

        for pRef in pilotRefs:
            newStatus = ""
            oldStatus = pilotDict[pRef]["Status"]
            ceStatus = pilotCEDict[pRef]

            if oldStatus == ceStatus:
                # Status did not change, nothing to do
                continue
            elif ceStatus == "Unknown" and oldStatus not in FINAL_PILOT_STATUS:
                # Pilot finished without reporting, consider it Aborted
                newStatus = "Aborted"
            elif ceStatus != "Unknown":
                # Update the pilot status to the new value
                newStatus = ceStatus

            if newStatus:
                self.log.info("Updating status to %s for pilot %s" % (newStatus, pRef))
                result = pilotAgentsDB.setPilotStatus(pRef, newStatus, "", "Updated by SiteDirector")

            # Retrieve the pilot output now
            if newStatus in FINAL_PILOT_STATUS:
                if pilotDict[pRef]["OutputReady"].lower() == "false" and self.getOutput:
                    self.log.info("Retrieving output for pilot %s" % pRef)
                    pilotStamp = pilotDict[pRef]["PilotStamp"]
                    pRefStamp = pRef
                    if pilotStamp:
                        pRefStamp = pRef + ":::" + pilotStamp
                    result = ce.getJobOutput(pRefStamp)
                    if not result["OK"]:
                        self.log.error("Failed to get pilot output", "%s: %s" % (ceName, result["Message"]))
                    else:
                        output, error = result["Value"]
                        if output:
                            result = pilotAgentsDB.storePilotOutput(pRef, output, error)
                            if not result["OK"]:
                                self.log.error("Failed to store pilot output", result["Message"])
                        else:
                            self.log.warn("Empty pilot output not stored to PilotDB")

    # The pilot can be in Done state set by the job agent: check if the output is retrieved
    for queue in self.queueDict:
        ce = self.queueDict[queue]["CE"]

        # isProxyValid() returns a result dict (the first loop reads its "OK" key).
        # A non-empty dict is always truthy, so the "OK" flag must be tested explicitly.
        if not ce.isProxyValid(120)["OK"]:
            result = gProxyManager.getPilotProxyFromDIRACGroup(self.pilotDN, self.pilotGroup, 1000)
            if not result["OK"]:
                return result
            # Keep the renewed proxy; otherwise the stale one would be set again
            self.proxy = result["Value"]
            ce.setProxy(self.proxy, 940)

        ceName = self.queueDict[queue]["CEName"]
        queueName = self.queueDict[queue]["QueueName"]
        ceType = self.queueDict[queue]["CEType"]
        siteName = self.queueDict[queue]["Site"]

        result = pilotAgentsDB.selectPilots(
            {
                "DestinationSite": ceName,
                "Queue": queueName,
                "GridType": ceType,
                "GridSite": siteName,
                "OutputReady": "False",
                "Status": FINAL_PILOT_STATUS,
            }
        )
        if not result["OK"]:
            self.log.error("Failed to select pilots", result["Message"])
            continue
        pilotRefs = result["Value"]
        if not pilotRefs:
            continue

        result = pilotAgentsDB.getPilotInfo(pilotRefs)
        if not result["OK"]:
            self.log.error("Failed to get pilots info from DB", result["Message"])
            continue
        pilotDict = result["Value"]

        if self.getOutput:
            for pRef in pilotRefs:
                self.log.info("Retrieving output for pilot %s" % pRef)
                pilotStamp = pilotDict[pRef]["PilotStamp"]
                pRefStamp = pRef
                if pilotStamp:
                    pRefStamp = pRef + ":::" + pilotStamp
                result = ce.getJobOutput(pRefStamp)
                if not result["OK"]:
                    self.log.error("Failed to get pilot output", "%s: %s" % (ceName, result["Message"]))
                else:
                    output, error = result["Value"]
                    result = pilotAgentsDB.storePilotOutput(pRef, output, error)
                    if not result["OK"]:
                        self.log.error("Failed to store pilot output", result["Message"])

        # Check if the accounting is to be sent
        if self.sendAccounting:
            result = pilotAgentsDB.selectPilots(
                {
                    "DestinationSite": ceName,
                    "Queue": queueName,
                    "GridType": ceType,
                    "GridSite": siteName,
                    "AccountingSent": "False",
                    "Status": FINAL_PILOT_STATUS,
                }
            )
            if not result["OK"]:
                self.log.error("Failed to select pilots", result["Message"])
                continue
            pilotRefs = result["Value"]
            if not pilotRefs:
                continue

            result = pilotAgentsDB.getPilotInfo(pilotRefs)
            if not result["OK"]:
                self.log.error("Failed to get pilots info from DB", result["Message"])
                continue
            pilotDict = result["Value"]

            result = self.sendPilotAccounting(pilotDict)
            if not result["OK"]:
                self.log.error("Failed to send pilot agent accounting")

    return S_OK()