def cleanTaskQueues():
    tqDB = TaskQueueDB()
    jobDB = JobDB()
    logDB = JobLoggingDB()

    result = tqDB.enableAllTaskQueues()
    if not result['OK']:
        return result
    result = tqDB.findOrphanJobs()
    if not result['OK']:
        return result
    for jid in result['Value']:
        result = tqDB.deleteJob(jid)
        if not result['OK']:
            gLogger.error("Cannot delete from TQ job %s" % jid, result['Message'])
            continue
        result = jobDB.rescheduleJob(jid)
        if not result['OK']:
            gLogger.error("Cannot reschedule in JobDB job %s" % jid, result['Message'])
            continue
        result = logDB.addLoggingRecord(jid, JobStatus.RECEIVED, "", "", source="JobState")
        if not result['OK']:
            gLogger.error("Cannot add logging record in JobLoggingDB %s" % jid, result['Message'])
            continue
    return S_OK()

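# A minimal, hypothetical driver for cleanTaskQueues() above, added to show the
# DIRAC S_OK/S_ERROR convention these snippets all rely on: every call returns a
# dict with an 'OK' boolean, plus 'Value' on success or 'Message' on failure.
result = cleanTaskQueues()
if not result['OK']:
    gLogger.error("Task queue cleanup failed", result['Message'])
else:
    gLogger.info("Task queue cleanup finished without errors")
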
def initializeMatcherHandler(serviceInfo):
    """ Matcher Service initialization
    """

    global gJobDB
    global gTaskQueueDB
    global jlDB
    global pilotAgentsDB

    gJobDB = JobDB()
    gTaskQueueDB = TaskQueueDB()
    jlDB = JobLoggingDB()
    pilotAgentsDB = PilotAgentsDB()

    gMonitor.registerActivity('matchTime', "Job matching time", 'Matching', "secs", gMonitor.OP_MEAN, 300)
    gMonitor.registerActivity('matchesDone', "Job Match Request", 'Matching', "matches", gMonitor.OP_RATE, 300)
    gMonitor.registerActivity('matchesOK', "Matched jobs", 'Matching', "matches", gMonitor.OP_RATE, 300)
    gMonitor.registerActivity('numTQs', "Number of Task Queues", 'Matching', "task queues", gMonitor.OP_MEAN, 300)

    gTaskQueueDB.recalculateTQSharesForAll()
    gThreadScheduler.addPeriodicTask(120, gTaskQueueDB.recalculateTQSharesForAll)
    gThreadScheduler.addPeriodicTask(60, sendNumTaskQueues)

    sendNumTaskQueues()

    return S_OK()

def initializeMatcherHandler(serviceInfo):
    """ Matcher Service initialization
    """

    global jobDB
    global jobLoggingDB
    global taskQueueDB

    jobDB = JobDB()
    jobLoggingDB = JobLoggingDB()
    taskQueueDB = TaskQueueDB()

    gMonitor.registerActivity("matchTime", "Job matching time", "Matching", "secs", gMonitor.OP_MEAN, 300)
    gMonitor.registerActivity(
        "matchTaskQueues", "Task queues checked per job", "Matching", "task queues", gMonitor.OP_MEAN, 300
    )
    gMonitor.registerActivity("matchesDone", "Job Matches", "Matching", "matches", gMonitor.OP_MEAN, 300)
    gMonitor.registerActivity("numTQs", "Number of Task Queues", "Matching", "task queues", gMonitor.OP_MEAN, 300)

    taskQueueDB.recalculateTQSharesForAll()
    gThreadScheduler.addPeriodicTask(120, taskQueueDB.recalculateTQSharesForAll)
    gThreadScheduler.addPeriodicTask(120, sendNumTaskQueues)

    sendNumTaskQueues()

    return S_OK()

def initializeMatcherHandler( serviceInfo ):
  """ Matcher Service initialization
  """

  global gJobDB
  global gJobLoggingDB
  global gTaskQueueDB
  global gPilotAgentsDB

  gJobDB = JobDB()
  gJobLoggingDB = JobLoggingDB()
  gTaskQueueDB = TaskQueueDB()
  gPilotAgentsDB = PilotAgentsDB()

  gMonitor.registerActivity( 'matchTime', "Job matching time", 'Matching', "secs", gMonitor.OP_MEAN, 300 )
  gMonitor.registerActivity( 'matchesDone', "Job Match Request", 'Matching', "matches", gMonitor.OP_RATE, 300 )
  gMonitor.registerActivity( 'matchesOK', "Matched jobs", 'Matching', "matches", gMonitor.OP_RATE, 300 )
  gMonitor.registerActivity( 'numTQs', "Number of Task Queues", 'Matching', "task queues", gMonitor.OP_MEAN, 300 )

  gTaskQueueDB.recalculateTQSharesForAll()
  gThreadScheduler.addPeriodicTask( 120, gTaskQueueDB.recalculateTQSharesForAll )
  gThreadScheduler.addPeriodicTask( 60, sendNumTaskQueues )

  sendNumTaskQueues()

  return S_OK()

def test_matcher( self ):
  # insert a proper DN to run the test
  resourceDescription = {'OwnerGroup': 'prod',
                         'OwnerDN': '/C=ch/O=DIRAC/OU=DIRAC CI/CN=ciuser/[email protected]',
                         'DIRACVersion': 'pippo',
                         'ReleaseVersion': 'blabla',
                         'VirtualOrganization': 'LHCB',
                         'PilotInfoReportedFlag': 'True',
                         'PilotBenchmark': 'anotherPilot',
                         'LHCbPlatform': 'CERTO',
                         'Site': 'DIRAC.Jenkins.org',
                         'CPUTime': 86400}
  matcher = RPCClient( 'WorkloadManagement/Matcher' )
  JobStateUpdate = RPCClient( 'WorkloadManagement/JobStateUpdate' )
  wmsClient = WMSClient()

  job = helloWorldJob()
  job.setDestination( 'DIRAC.Jenkins.org' )
  job.setInputData( '/a/bbb' )
  job.setType( 'User' )
  jobDescription = createFile( job )
  res = wmsClient.submitJob( job._toJDL( xmlFile = jobDescription ) )
  self.assert_( res['OK'] )
  jobID = res['Value']

  res = JobStateUpdate.setJobStatus( jobID, 'Waiting', 'matching', 'source' )
  self.assert_( res['OK'] )

  tqDB = TaskQueueDB()
  tqDefDict = {'OwnerDN': '/C=ch/O=DIRAC/OU=DIRAC CI/CN=ciuser/[email protected]',
               'OwnerGroup': 'prod', 'Setup': 'JenkinsSetup', 'CPUTime': 86400}
  res = tqDB.insertJob( jobID, tqDefDict, 10 )
  self.assert_( res['OK'] )

  res = matcher.requestJob( resourceDescription )
  print res
  self.assert_( res['OK'] )
  wmsClient.deleteJob( jobID )

def initialize(self):
    """ Sets defaults
    """
    self.am_setOption("PollingTime", 120)
    self.jobDB = JobDB()
    self.taskQueueDB = TaskQueueDB()
    self.jobLoggingDB = JobLoggingDB()
    # self.sandboxDB = SandboxDB( 'SandboxDB' )
    agentTSTypes = self.am_getOption('ProductionTypes', [])
    if agentTSTypes:
        self.prod_types = agentTSTypes
    else:
        self.prod_types = Operations().getValue('Transformations/DataProcessing', ['MCSimulation', 'Merge'])
    gLogger.info("Will exclude the following Production types from cleaning %s" % (', '.join(self.prod_types)))
    self.maxJobsAtOnce = self.am_getOption('MaxJobsAtOnce', 500)
    self.jobByJob = self.am_getOption('JobByJob', False)
    self.throttlingPeriod = self.am_getOption('ThrottlingPeriod', 0.)

    self.removeStatusDelay['Done'] = self.am_getOption('RemoveStatusDelay/Done', 7)
    self.removeStatusDelay['Killed'] = self.am_getOption('RemoveStatusDelay/Killed', 7)
    self.removeStatusDelay['Failed'] = self.am_getOption('RemoveStatusDelay/Failed', 7)
    self.removeStatusDelay['Any'] = self.am_getOption('RemoveStatusDelay/Any', -1)

    return S_OK()

def test_matcher( self ):
  # insert a proper DN to run the test
  resourceDescription = {'OwnerGroup': 'prod',
                         'OwnerDN': '/C=ch/O=DIRAC/OU=DIRAC CI/CN=ciuser/[email protected]',
                         'DIRACVersion': 'pippo',
                         'ReleaseVersion': 'blabla',
                         'VirtualOrganization': 'LHCB',
                         'PilotInfoReportedFlag': 'True',
                         'PilotBenchmark': 'anotherPilot',
                         'LHCbPlatform': 'CERTO',
                         'Site': 'DIRAC.Jenkins.org',
                         'CPUTime': 86400}
  matcher = RPCClient( 'WorkloadManagement/Matcher' )
  JobStateUpdate = RPCClient( 'WorkloadManagement/JobStateUpdate' )
  wmsClient = WMSClient()

  job = helloWorldJob()
  job.setDestination( 'DIRAC.Jenkins.org' )
  job.setInputData( '/a/bbb' )
  job.setType( 'User' )
  jobDescription = createFile( job )
  res = wmsClient.submitJob( job._toJDL( xmlFile = jobDescription ) )
  self.assert_( res['OK'] )
  jobID = res['Value']

  res = JobStateUpdate.setJobStatus( jobID, 'Waiting', 'matching', 'source' )
  self.assert_( res['OK'] )

  tqDB = TaskQueueDB()
  tqDefDict = {'OwnerDN': '/C=ch/O=DIRAC/OU=DIRAC CI/CN=ciuser/[email protected]',
               'OwnerGroup': 'prod', 'Setup': 'dirac-JenkinsSetup', 'CPUTime': 86400}
  res = tqDB.insertJob( jobID, tqDefDict, 10 )
  self.assert_( res['OK'] )

  res = matcher.requestJob( resourceDescription )
  print res
  self.assert_( res['OK'] )
  wmsClient.deleteJob( jobID )

def __init__(self, pilotAgentsDB=None, jobDB=None, tqDB=None, jlDB=None, opsHelper=None):
    """ c'tor
    """
    if pilotAgentsDB:
        self.pilotAgentsDB = pilotAgentsDB
    else:
        self.pilotAgentsDB = PilotAgentsDB()
    if jobDB:
        self.jobDB = jobDB
    else:
        self.jobDB = JobDB()
    if tqDB:
        self.tqDB = tqDB
    else:
        self.tqDB = TaskQueueDB()
    if jlDB:
        self.jlDB = jlDB
    else:
        self.jlDB = JobLoggingDB()
    if opsHelper:
        self.opsHelper = opsHelper
    else:
        self.opsHelper = Operations()

    self.log = gLogger.getSubLogger("Matcher")

    self.limiter = Limiter(jobDB=self.jobDB, opsHelper=self.opsHelper)

    self.siteClient = SiteStatus()

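# Hedged usage sketch for the constructor above: the keyword arguments exist for
# dependency injection, so tests can pass stand-ins instead of live database
# connections. FakeJobDB and FakeTaskQueueDB are hypothetical test doubles, not
# classes from this codebase.
fakeJobDB = FakeJobDB()        # hypothetical stub exposing the JobDB interface
fakeTQDB = FakeTaskQueueDB()   # hypothetical stub exposing the TaskQueueDB interface
matcher = Matcher(jobDB=fakeJobDB, tqDB=fakeTQDB)
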
def __init__( self, pilotAgentsDB = None, jobDB = None, tqDB = None, jlDB = None, opsHelper = None ):
  """ c'tor
  """
  if pilotAgentsDB:
    self.pilotAgentsDB = pilotAgentsDB
  else:
    self.pilotAgentsDB = PilotAgentsDB()
  if jobDB:
    self.jobDB = jobDB
  else:
    self.jobDB = JobDB()
  if tqDB:
    self.tqDB = tqDB
  else:
    self.tqDB = TaskQueueDB()
  if jlDB:
    self.jlDB = jlDB
  else:
    self.jlDB = JobLoggingDB()
  if opsHelper:
    self.opsHelper = opsHelper
  else:
    self.opsHelper = Operations()

  self.log = gLogger.getSubLogger( "Matcher" )

  self.limiter = Limiter( jobDB = self.jobDB, opsHelper = self.opsHelper )

def initializeOptimizer(self):
    """Initialize specific parameters for TaskQueueAgent.
    """
    self.waitingStatus = self.am_getOption('WaitingStatus', 'Waiting')
    self.waitingMinorStatus = self.am_getOption('WaitingMinorStatus', 'Pilot Agent Submission')
    try:
        self.taskQueueDB = TaskQueueDB()
        result = self.taskQueueDB.enableAllTaskQueues()
        if not result['OK']:
            raise Exception("Can't enable TaskQueues: %s" % result['Message'])
    except Exception as e:
        self.log.exception()
        return S_ERROR("Cannot initialize taskqueueDB: %s" % str(e))

def initializeJobMonitoringHandler(serviceInfo):

    global jobDB, jobLoggingDB, taskQueueDB
    jobDB = JobDB()
    jobLoggingDB = JobLoggingDB()
    taskQueueDB = TaskQueueDB()
    return S_OK()

def checkDBAccess(cls):
    # Init DB if there
    if not JobState.__db.checked:
        JobState.__db.jobDB = JobDB()
        JobState.__db.logDB = JobLoggingDB()
        JobState.__db.tqDB = TaskQueueDB()
        JobState.__db.checked = True

def export_getPilots(cls, jobID):
    """ Get pilot references and their states for:
        - those pilots submitted for the TQ where job is sitting
        - (or) the pilots executing/having executed the Job
    """
    pilots = []
    result = pilotDB.getPilotsForJobID(int(jobID))
    if not result['OK']:
        if result['Message'].find('not found') == -1:
            return S_ERROR('Failed to get pilot: ' + result['Message'])
    else:
        pilots += result['Value']
    if not pilots:
        # Pilots were not found, try to look in the Task Queue
        taskQueueID = 0
        result = TaskQueueDB().getTaskQueueForJob(int(jobID))
        if result['OK'] and result['Value']:
            taskQueueID = result['Value']
        if taskQueueID:
            result = pilotDB.getPilotsForTaskQueue(taskQueueID, limit=10)
            if not result['OK']:
                return S_ERROR('Failed to get pilot: ' + result['Message'])
            pilots += result['Value']
    if not pilots:
        return S_ERROR('Failed to get pilot for Job %d' % int(jobID))

    return pilotDB.getPilotInfo(pilotID=pilots)

def initializeJobMonitoringHandler(serviceInfo):

    global gJobDB, gJobLoggingDB, gTaskQueueDB
    gJobDB = JobDB()
    gJobLoggingDB = JobLoggingDB()
    gTaskQueueDB = TaskQueueDB()
    return S_OK()

def initialize( self ):
  """ Sets defaults
  """
  self.am_setOption( "PollingTime", 120 )
  self.jobDB = JobDB()
  self.taskQueueDB = TaskQueueDB()
  self.jobLoggingDB = JobLoggingDB()
  # self.sandboxDB = SandboxDB( 'SandboxDB' )
  agentTSTypes = self.am_getOption( 'ProductionTypes', [] )
  if agentTSTypes:
    self.prod_types = agentTSTypes
  else:
    self.prod_types = Operations().getValue( 'Transformations/DataProcessing', ['MCSimulation', 'Merge'] )
  gLogger.info( "Will exclude the following Production types from cleaning %s" % ( ', '.join( self.prod_types ) ) )
  self.maxJobsAtOnce = self.am_getOption( 'MaxJobsAtOnce', 500 )
  self.jobByJob = self.am_getOption( 'JobByJob', False )
  self.throttlingPeriod = self.am_getOption( 'ThrottlingPeriod', 0. )

  self.removeStatusDelay['Done'] = self.am_getOption( 'RemoveStatusDelay/Done', 7 )
  self.removeStatusDelay['Killed'] = self.am_getOption( 'RemoveStatusDelay/Killed', 7 )
  self.removeStatusDelay['Failed'] = self.am_getOption( 'RemoveStatusDelay/Failed', 7 )
  self.removeStatusDelay['Any'] = self.am_getOption( 'RemoveStatusDelay/Any', -1 )

  return S_OK()

def initializeJobManagerHandler(serviceInfo):

    global gJobDB, gJobLoggingDB, gtaskQueueDB
    gJobDB = JobDB()
    gJobLoggingDB = JobLoggingDB()
    gtaskQueueDB = TaskQueueDB()
    return S_OK()

class TaskQueueAgent(OptimizerModule):
    """
    The specific Optimizer must provide the following methods:
      - initializeOptimizer() before each execution cycle
      - checkJob() - the main method called for each job
    """

    #############################################################################
    def initializeOptimizer(self):
        """Initialize specific parameters for TaskQueueAgent.
        """
        self.waitingStatus = self.am_getOption('WaitingStatus', 'Waiting')
        self.waitingMinorStatus = self.am_getOption('WaitingMinorStatus', 'Pilot Agent Submission')
        try:
            self.taskQueueDB = TaskQueueDB()
            result = self.taskQueueDB.enableAllTaskQueues()
            if not result['OK']:
                raise Exception("Can't enable TaskQueues: %s" % result['Message'])
        except Exception as e:
            self.log.exception()
            return S_ERROR("Cannot initialize taskqueueDB: %s" % str(e))
        return S_OK()

def test_matcher(self):
    # insert a proper DN to run the test
    resourceDescription = {
        "OwnerGroup": "prod",
        "OwnerDN": "/C=ch/O=DIRAC/OU=DIRAC CI/CN=ciuser",
        "DIRACVersion": "pippo",
        "GridCE": "some.grid.ce.org",
        "ReleaseVersion": "blabla",
        "VirtualOrganization": "LHCb",
        "PilotInfoReportedFlag": "True",
        "PilotBenchmark": "anotherPilot",
        "Site": "DIRAC.Jenkins.ch",
        "CPUTime": 86400,
    }
    wmsClient = WMSClient()

    job = helloWorldJob()
    job.setDestination("DIRAC.Jenkins.ch")
    job.setInputData("/a/bbb")
    job.setType("User")
    jobDescription = createFile(job)
    res = wmsClient.submitJob(job._toJDL(xmlFile=jobDescription))
    self.assertTrue(res["OK"], res.get("Message"))
    jobID = res["Value"]

    # forcing the update
    res = JobStateUpdateClient().setJobStatus(jobID, JobStatus.WAITING, "matching", "source", None, True)
    self.assertTrue(res["OK"], res.get("Message"))

    tqDB = TaskQueueDB()
    tqDefDict = {
        "OwnerDN": "/C=ch/O=DIRAC/OU=DIRAC CI/CN=ciuser",
        "OwnerGroup": "prod",
        "Setup": "dirac-JenkinsSetup",
        "CPUTime": 86400,
    }
    res = tqDB.insertJob(jobID, tqDefDict, 10)
    self.assertTrue(res["OK"], res.get("Message"))

    res = MatcherClient().requestJob(resourceDescription)
    print(res)
    self.assertTrue(res["OK"], res.get("Message"))
    wmsClient.deleteJob(jobID)

def initializeMatcherHandler(serviceInfo):
    """ Matcher Service initialization
    """

    global gJobDB
    global gJobLoggingDB
    global gTaskQueueDB
    global gPilotAgentsDB

    # Create JobDB object and initialize its tables.
    gJobDB = JobDB()
    res = gJobDB._checkTable()
    if not res['OK']:
        return res

    # Create JobLoggingDB object and initialize its tables.
    gJobLoggingDB = JobLoggingDB()
    res = gJobLoggingDB._checkTable()
    if not res['OK']:
        return res

    gTaskQueueDB = TaskQueueDB()

    # Create PilotAgentsDB object and initialize its tables.
    gPilotAgentsDB = PilotAgentsDB()
    res = gPilotAgentsDB._checkTable()
    if not res['OK']:
        return res

    gMonitor.registerActivity('matchTime', "Job matching time", 'Matching', "secs", gMonitor.OP_MEAN, 300)
    gMonitor.registerActivity('matchesDone', "Job Match Request", 'Matching', "matches", gMonitor.OP_RATE, 300)
    gMonitor.registerActivity('matchesOK', "Matched jobs", 'Matching', "matches", gMonitor.OP_RATE, 300)
    gMonitor.registerActivity('numTQs', "Number of Task Queues", 'Matching', "task queues", gMonitor.OP_MEAN, 300)

    gTaskQueueDB.recalculateTQSharesForAll()
    gThreadScheduler.addPeriodicTask(120, gTaskQueueDB.recalculateTQSharesForAll)
    gThreadScheduler.addPeriodicTask(60, sendNumTaskQueues)

    sendNumTaskQueues()

    return S_OK()

def initialize( self ):
  """Sets defaults
  """
  self.am_setOption( "PollingTime", 60 )
  self.jobDB = JobDB()
  self.taskQueueDB = TaskQueueDB()
  self.jobLoggingDB = JobLoggingDB()
  # self.sandboxDB = SandboxDB( 'SandboxDB' )
  agentTSTypes = self.am_getOption( 'ProductionTypes', [] )
  if agentTSTypes:
    self.prod_types = agentTSTypes
  else:
    self.prod_types = Operations().getValue( 'Transformations/DataProcessing', ['MCSimulation', 'Merge'] )
  gLogger.info( 'Will exclude the following Production types from cleaning %s' % ( string.join( self.prod_types, ', ' ) ) )
  self.maxJobsAtOnce = self.am_getOption( 'MaxJobsAtOnce', 200 )
  self.jobByJob = self.am_getOption( 'JobByJob', True )
  self.throttlingPeriod = self.am_getOption( 'ThrottlingPeriod', 0. )
  return S_OK()

def initialize(self):
    """Sets defaults
    """
    self.am_setOption("PollingTime", 60)
    self.jobDB = JobDB()
    self.taskQueueDB = TaskQueueDB()
    # self.sandboxDB = SandboxDB( 'SandboxDB' )
    self.prod_types = self.am_getOption(
        'ProductionTypes', ['DataReconstruction', 'DataStripping', 'MCSimulation', 'Merge', 'production'])
    gLogger.info(
        'Will exclude the following Production types from cleaning %s' % (string.join(self.prod_types, ', ')))
    self.maxJobsAtOnce = self.am_getOption('MaxJobsAtOnce', 200)
    self.jobByJob = self.am_getOption('JobByJob', True)
    self.throttlingPeriod = self.am_getOption('ThrottlingPeriod', 0.)
    return S_OK()

def initializeHandler(cls, svcInfoDict):
    cls.gJobDB = JobDB()
    cls.gJobLoggingDB = JobLoggingDB()
    cls.gTaskQueueDB = TaskQueueDB()

    cls.gElasticJobParametersDB = None
    useESForJobParametersFlag = Operations().getValue('/Services/JobMonitoring/useESForJobParametersFlag', False)
    if useESForJobParametersFlag:
        cls.gElasticJobParametersDB = ElasticJobParametersDB()
    return S_OK()

def initializeWMSAdministratorHandler(serviceInfo):
    """ WMS AdministratorService initialization
    """
    global jobDB
    global taskQueueDB

    jobDB = JobDB()
    taskQueueDB = TaskQueueDB()
    return S_OK()

def initializeHandler(cls, svcInfoDict):
    """ WMS AdministratorService initialization
    """
    cls.jobDB = JobDB()
    cls.taskQueueDB = TaskQueueDB()

    cls.elasticJobParametersDB = None
    useESForJobParametersFlag = Operations().getValue('/Services/JobMonitoring/useESForJobParametersFlag', False)
    if useESForJobParametersFlag:
        cls.elasticJobParametersDB = ElasticJobParametersDB()
    return S_OK()

def initializeOptimizer( self ):
  """Initialize specific parameters for TaskQueueAgent.
  """
  self.waitingStatus = self.am_getOption( 'WaitingStatus', 'Waiting' )
  self.waitingMinorStatus = self.am_getOption( 'WaitingMinorStatus', 'Pilot Agent Submission' )
  try:
    self.taskQueueDB = TaskQueueDB()
    result = self.taskQueueDB.enableAllTaskQueues()
    if not result[ 'OK' ]:
      raise Exception( "Can't enable TaskQueues: %s" % result[ 'Message' ] )
  except Exception as e:
    self.log.exception()
    return S_ERROR( "Cannot initialize taskqueueDB: %s" % str( e ) )

def initializeWMSAdministratorHandler(serviceInfo):
    """ WMS AdministratorService initialization

    :param dict serviceInfo: service information dictionary

    :return: S_OK()/S_ERROR()
    """
    global jobDB
    global taskQueueDB

    jobDB = JobDB()
    taskQueueDB = TaskQueueDB()
    return S_OK()

def initializeMatcherHandler( serviceInfo ):
  """ Matcher Service initialization
  """

  global jobDB
  global jobLoggingDB
  global taskQueueDB

  jobDB = JobDB()
  jobLoggingDB = JobLoggingDB()
  taskQueueDB = TaskQueueDB()

  gMonitor.registerActivity( 'matchTime', "Job matching time", 'Matching', "secs", gMonitor.OP_MEAN, 300 )
  gMonitor.registerActivity( 'matchTaskQueues', "Task queues checked per job", 'Matching', "task queues", gMonitor.OP_MEAN, 300 )
  gMonitor.registerActivity( 'matchesDone', "Job Matches", 'Matching', "matches", gMonitor.OP_MEAN, 300 )
  gMonitor.registerActivity( 'numTQs', "Number of Task Queues", 'Matching', "task queues", gMonitor.OP_MEAN, 300 )

  taskQueueDB.recalculateTQSharesForAll()
  gThreadScheduler.addPeriodicTask( 120, taskQueueDB.recalculateTQSharesForAll )
  gThreadScheduler.addPeriodicTask( 120, sendNumTaskQueues )

  sendNumTaskQueues()

  return S_OK()

def initialize( self ):
  """Sets defaults
  """
  self.am_setOption( "PollingTime", 60 )
  self.jobDB = JobDB()
  self.taskQueueDB = TaskQueueDB()
  self.jobLoggingDB = JobLoggingDB()
  # self.sandboxDB = SandboxDB( 'SandboxDB' )
  self.prod_types = self.am_getOption( 'ProductionTypes', ['DataReconstruction', 'DataStripping',
                                                           'MCSimulation', 'Merge', 'production'] )
  gLogger.info( 'Will exclude the following Production types from cleaning %s' % ( string.join( self.prod_types, ', ' ) ) )
  self.maxJobsAtOnce = self.am_getOption( 'MaxJobsAtOnce', 200 )
  self.jobByJob = self.am_getOption( 'JobByJob', True )
  self.throttlingPeriod = self.am_getOption( 'ThrottlingPeriod', 0. )
  return S_OK()

def initializeJobManagerHandler(serviceInfo):

    global gJobDB, gJobLoggingDB, gtaskQueueDB, enablePilotsLogging, gPilotAgentsDB, gPilotsLoggingDB

    gJobDB = JobDB()
    gJobLoggingDB = JobLoggingDB()
    gtaskQueueDB = TaskQueueDB()
    gPilotAgentsDB = PilotAgentsDB()

    # there is a problem with accessing CS with shorter paths, so full path is extracted from serviceInfo dict
    enablePilotsLogging = gConfig.getValue(
        serviceInfo['serviceSectionPath'].replace('JobManager', 'PilotsLogging') + '/Enable',
        'False').lower() in ('yes', 'true')

    if enablePilotsLogging:
        gPilotsLoggingDB = PilotsLoggingDB()
    return S_OK()

def initializeHandler(cls, serviceInfoDict):
    """ Initialization of DB objects and OptimizationMind
    """
    cls.jobDB = JobDB()
    cls.jobLoggingDB = JobLoggingDB()
    cls.taskQueueDB = TaskQueueDB()
    cls.pilotAgentsDB = PilotAgentsDB()

    cls.pilotsLoggingDB = None
    enablePilotsLogging = Operations().getValue('/Services/JobMonitoring/usePilotsLoggingFlag', False)
    if enablePilotsLogging:
        cls.pilotsLoggingDB = PilotsLoggingDB()

    cls.msgClient = MessageClient("WorkloadManagement/OptimizationMind")
    cls.__connectToOptMind()
    gThreadScheduler.addPeriodicTask(60, cls.__connectToOptMind)
    return S_OK()

def initialize(self):
    """Sets defaults
    """
    self.am_setOption("PollingTime", 60)
    self.jobDB = JobDB()
    self.taskQueueDB = TaskQueueDB()
    # self.sandboxDB = SandboxDB( 'SandboxDB' )
    self.prod_types = self.am_getOption(
        "ProductionTypes", ["DataReconstruction", "DataStripping", "MCSimulation", "Merge", "production"]
    )
    gLogger.info(
        "Will exclude the following Production types from cleaning %s" % (string.join(self.prod_types, ", "))
    )
    self.maxJobsAtOnce = self.am_getOption("MaxJobsAtOnce", 200)
    self.jobByJob = self.am_getOption("JobByJob", True)
    self.throttlingPeriod = self.am_getOption("ThrottlingPeriod", 0.0)
    return S_OK()

def initialize( self ):
  """Sets defaults
  """
  self.am_setOption( "PollingTime", 60 )
  self.jobDB = JobDB()
  self.taskQueueDB = TaskQueueDB()
  self.jobLoggingDB = JobLoggingDB()
  # self.sandboxDB = SandboxDB( 'SandboxDB' )
  agentTSTypes = self.am_getOption( 'ProductionTypes', [] )
  if agentTSTypes:
    self.prod_types = agentTSTypes
  else:
    self.prod_types = Operations().getValue( 'Transformations/DataProcessing', ['MCSimulation', 'Merge'] )
  gLogger.info( 'Will exclude the following Production types from cleaning %s' % ( string.join( self.prod_types, ', ' ) ) )
  self.maxJobsAtOnce = self.am_getOption( 'MaxJobsAtOnce', 100 )
  self.jobByJob = self.am_getOption( 'JobByJob', True )
  self.throttlingPeriod = self.am_getOption( 'ThrottlingPeriod', 0. )
  return S_OK()

class TaskQueuesAgent(AgentModule):
    """Agent for recalculating TQ shares"""

    def __init__(self, *args, **kwargs):
        """c'tor"""
        AgentModule.__init__(self, *args, **kwargs)

        # clients
        self.tqDB = None

    def initialize(self):
        """just initialize TQDB"""
        self.tqDB = TaskQueueDB()
        return S_OK()

    def execute(self):
        """calls TQDB.recalculateTQSharesForAll"""
        res = self.tqDB.recalculateTQSharesForAll()
        if not res["OK"]:
            self.log.error("Error recalculating TQ shares", res["Message"])
        return S_OK()

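# The recalculation this agent automates is also wired into service startup
# elsewhere in these snippets (see the initializeMatcherHandler variants); a
# minimal sketch of doing both by hand, using only calls shown in this file:
tqDB = TaskQueueDB()
res = tqDB.recalculateTQSharesForAll()  # one immediate pass
if not res["OK"]:
    gLogger.error("Error recalculating TQ shares", res["Message"])
gThreadScheduler.addPeriodicTask(120, tqDB.recalculateTQSharesForAll)  # then every 120 s
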
class TaskQueueAgent( OptimizerModule ):
  """
  The specific Optimizer must provide the following methods:
    - initializeOptimizer() before each execution cycle
    - checkJob() - the main method called for each job
  """

  #############################################################################
  def initializeOptimizer( self ):
    """Initialize specific parameters for TaskQueueAgent.
    """
    self.waitingStatus = self.am_getOption( 'WaitingStatus', 'Waiting' )
    self.waitingMinorStatus = self.am_getOption( 'WaitingMinorStatus', 'Pilot Agent Submission' )
    try:
      self.taskQueueDB = TaskQueueDB()
      result = self.taskQueueDB.enableAllTaskQueues()
      if not result[ 'OK' ]:
        raise Exception( "Can't enable TaskQueues: %s" % result[ 'Message' ] )
    except Exception as e:
      self.log.exception()
      return S_ERROR( "Cannot initialize taskqueueDB: %s" % str( e ) )
    return S_OK()

def initializeHandler(cls, serviceInfoDict):
    cls.jobDB = JobDB()
    cls.jobLoggingDB = JobLoggingDB()
    cls.taskQueueDB = TaskQueueDB()
    cls.pilotAgentsDB = PilotAgentsDB()
    cls.limiter = Limiter(jobDB=cls.jobDB)

    cls.taskQueueDB.recalculateTQSharesForAll()

    gMonitor.registerActivity('matchTime', "Job matching time", 'Matching', "secs", gMonitor.OP_MEAN, 300)
    gMonitor.registerActivity('matchesDone', "Job Match Request", 'Matching', "matches", gMonitor.OP_RATE, 300)
    gMonitor.registerActivity('matchesOK', "Matched jobs", 'Matching', "matches", gMonitor.OP_RATE, 300)
    gMonitor.registerActivity('numTQs', "Number of Task Queues", 'Matching', "task queues", gMonitor.OP_MEAN, 300)

    gThreadScheduler.addPeriodicTask(120, cls.taskQueueDB.recalculateTQSharesForAll)
    gThreadScheduler.addPeriodicTask(60, cls.sendNumTaskQueues)

    cls.sendNumTaskQueues()
    return S_OK()

def initializeWMSAdministratorHandler(serviceInfo):
    """ WMS AdministratorService initialization
    """
    global jobDB
    global pilotDB
    global taskQueueDB
    global enablePilotsLogging

    # there is a problem with accessing CS with shorter paths, so full path is extracted from serviceInfo dict
    enablePilotsLogging = gConfig.getValue(
        serviceInfo['serviceSectionPath'].replace('WMSAdministrator', 'PilotsLogging') + '/Enable',
        'False').lower() in ('yes', 'true')

    jobDB = JobDB()
    pilotDB = PilotAgentsDB()
    taskQueueDB = TaskQueueDB()
    if enablePilotsLogging:
        pilotsLoggingDB = PilotsLoggingDB()
    return S_OK()

# File :   dirac-admin-submit-pilot-for-job
# Author : Ricardo Graciani
########################################################################
__RCSID__ = "cd6b25c (2010-12-04 11:45:50 +0000) Ricardo Graciani <*****@*****.**>"
import sys
import DIRAC
from DIRAC.Core.Base import Script

Script.parseCommandLine( ignoreErrors = True )
args = Script.getPositionalArgs()

from DIRAC.WorkloadManagementSystem.DB.TaskQueueDB import TaskQueueDB
from DIRAC.WorkloadManagementSystem.DB.JobDB import JobDB

jobdb = JobDB()
tqdb = TaskQueueDB()

result = jobdb.selectJobs( { 'Status' : [ 'Received', 'Checking', 'Waiting' ] } )
if not result[ 'OK' ]:
  print result[ 'Message' ]
  sys.exit( 1 )
jobList = result[ 'Value' ]

print tqdb.forceRecreationOfTables()

for job in jobList:
  result = jobdb.getJobAttribute( job, 'RescheduleCounter' )
  if not result[ 'OK' ]:
    print "Cannot get reschedule counter for job %s" % job
    rC = 0
  else:
    rC = result[ 'Value' ]
  if rC >= jobdb.maxRescheduling:
    jobdb.setJobAttribute( job, "RescheduleCounter", "0" )

class JobCleaningAgent( AgentModule ):
  """
  The specific agents must provide the following methods:
    - initialize() for initial settings
    - beginExecution()
    - execute() - the main method called in the agent cycle
    - endExecution()
    - finalize() - the graceful exit of the method, this one is usually used for the agent restart
  """

  #############################################################################
  def initialize( self ):
    """Sets defaults
    """
    self.am_setOption( "PollingTime", 60 )
    self.jobDB = JobDB()
    self.taskQueueDB = TaskQueueDB()
    self.jobLoggingDB = JobLoggingDB()
    # self.sandboxDB = SandboxDB( 'SandboxDB' )
    agentTSTypes = self.am_getOption( 'ProductionTypes', [] )
    if agentTSTypes:
      self.prod_types = agentTSTypes
    else:
      self.prod_types = Operations().getValue( 'Transformations/DataProcessing', ['MCSimulation', 'Merge'] )
    gLogger.info( 'Will exclude the following Production types from cleaning %s' % ( string.join( self.prod_types, ', ' ) ) )
    self.maxJobsAtOnce = self.am_getOption( 'MaxJobsAtOnce', 100 )
    self.jobByJob = self.am_getOption( 'JobByJob', True )
    self.throttlingPeriod = self.am_getOption( 'ThrottlingPeriod', 0. )
    return S_OK()

  def __getAllowedJobTypes( self ):
    # Get valid jobTypes
    result = self.jobDB.getDistinctJobAttributes( 'JobType' )
    if not result[ 'OK' ]:
      return result
    cleanJobTypes = []
    for jobType in result[ 'Value' ]:
      if jobType not in self.prod_types:
        cleanJobTypes.append( jobType )
    self.log.notice( "JobTypes to clean %s" % cleanJobTypes )
    return S_OK( cleanJobTypes )

  #############################################################################
  def execute( self ):
    """The PilotAgent execution method.
    """
    # Delete jobs in "Deleted" state
    result = self.removeJobsByStatus( { 'Status' : 'Deleted' } )
    if not result[ 'OK' ]:
      return result
    # Get all the Job types that can be cleaned
    result = self.__getAllowedJobTypes()
    if not result[ 'OK' ]:
      return result
    baseCond = { 'JobType' : result[ 'Value' ] }
    # Remove jobs with final status
    for status in REMOVE_STATUS_DELAY:
      delay = REMOVE_STATUS_DELAY[ status ]
      condDict = dict( baseCond )
      condDict[ 'Status' ] = status
      delTime = str( Time.dateTime() - delay * Time.day )
      result = self.removeJobsByStatus( condDict, delTime )
      if not result['OK']:
        gLogger.warn( 'Failed to remove jobs in status %s' % status )
    return S_OK()

  def removeJobsByStatus( self, condDict, delay = False ):
    """ Remove deleted jobs
    """
    if delay:
      gLogger.verbose( "Removing jobs with %s and older than %s" % ( condDict, delay ) )
      result = self.jobDB.selectJobs( condDict, older = delay, limit = self.maxJobsAtOnce )
    else:
      gLogger.verbose( "Removing jobs with %s " % condDict )
      result = self.jobDB.selectJobs( condDict, limit = self.maxJobsAtOnce )
    if not result['OK']:
      return result

    jobList = result['Value']
    if len( jobList ) > self.maxJobsAtOnce:
      jobList = jobList[:self.maxJobsAtOnce]
    if not jobList:
      return S_OK()

    self.log.notice( "Deleting %s jobs for %s" % ( len( jobList ), condDict ) )

    count = 0
    error_count = 0
    result = SandboxStoreClient( useCertificates = True ).unassignJobs( jobList )
    if not result[ 'OK' ]:
      gLogger.warn( "Cannot unassign jobs to sandboxes", result[ 'Message' ] )

    result = self.deleteJobOversizedSandbox( jobList )
    if not result[ 'OK' ]:
      gLogger.warn( "Cannot schedule removal of oversized sandboxes", result[ 'Message' ] )
      return result

    failedJobs = result['Value']['Failed']
    for job in failedJobs:
      jobList.pop( jobList.index( job ) )

    if self.jobByJob:
      for jobID in jobList:
        resultJobDB = self.jobDB.removeJobFromDB( jobID )
        resultTQ = self.taskQueueDB.deleteJob( jobID )
        resultLogDB = self.jobLoggingDB.deleteJob( jobID )
        errorFlag = False
        if not resultJobDB['OK']:
          gLogger.warn( 'Failed to remove job %d from JobDB' % jobID, result['Message'] )
          errorFlag = True
        if not resultTQ['OK']:
          gLogger.warn( 'Failed to remove job %d from TaskQueueDB' % jobID, result['Message'] )
          errorFlag = True
        if not resultLogDB['OK']:
          gLogger.warn( 'Failed to remove job %d from JobLoggingDB' % jobID, result['Message'] )
          errorFlag = True
        if errorFlag:
          error_count += 1
        else:
          count += 1
        if self.throttlingPeriod:
          time.sleep( self.throttlingPeriod )
    else:
      result = self.jobDB.removeJobFromDB( jobList )
      if not result['OK']:
        gLogger.error( 'Failed to delete %d jobs from JobDB' % len( jobList ) )
      else:
        gLogger.info( 'Deleted %d jobs from JobDB' % len( jobList ) )

      for jobID in jobList:
        resultTQ = self.taskQueueDB.deleteJob( jobID )
        if not resultTQ['OK']:
          gLogger.warn( 'Failed to remove job %d from TaskQueueDB' % jobID, resultTQ['Message'] )
          error_count += 1
        else:
          count += 1

      result = self.jobLoggingDB.deleteJob( jobList )
      if not result['OK']:
        gLogger.error( 'Failed to delete %d jobs from JobLoggingDB' % len( jobList ) )
      else:
        gLogger.info( 'Deleted %d jobs from JobLoggingDB' % len( jobList ) )

    if count > 0 or error_count > 0:
      gLogger.info( 'Deleted %d jobs from JobDB, %d errors' % ( count, error_count ) )
    return S_OK()

  def deleteJobOversizedSandbox( self, jobIDList ):
    """ Delete the job oversized sandbox files from storage elements
    """
    failed = {}
    successful = {}
    lfnDict = {}
    for jobID in jobIDList:
      result = self.jobDB.getJobParameter( jobID, 'OutputSandboxLFN' )
      if result['OK']:
        lfn = result['Value']
        if lfn:
          lfnDict[lfn] = jobID
        else:
          successful[jobID] = 'No oversized sandbox found'
      else:
        gLogger.warn( 'Error interrogating JobDB: %s' % result['Message'] )
    if not lfnDict:
      return S_OK( {'Successful':successful, 'Failed':failed} )

    # Schedule removal of the LFNs now
    for lfn, jobID in lfnDict.items():
      result = self.jobDB.getJobAttributes( jobID, ['OwnerDN', 'OwnerGroup'] )
      if not result['OK']:
        failed[jobID] = lfn
        continue
      if not result['Value']:
        failed[jobID] = lfn
        continue
      ownerDN = result['Value']['OwnerDN']
      ownerGroup = result['Value']['OwnerGroup']
      result = self.__setRemovalRequest( lfn, ownerDN, ownerGroup )
      if not result['OK']:
        failed[jobID] = lfn
      else:
        successful[jobID] = lfn

    result = {'Successful':successful, 'Failed':failed}
    return S_OK( result )

  def __setRemovalRequest( self, lfn, ownerDN, ownerGroup ):
    """ Set removal request with the given credentials
    """
    oRequest = Request()
    oRequest.OwnerDN = ownerDN
    oRequest.OwnerGroup = ownerGroup
    oRequest.RequestName = os.path.basename( lfn ).strip() + '_removal_request.xml'
    oRequest.SourceComponent = 'JobCleaningAgent'

    removeFile = Operation()
    removeFile.Type = 'RemoveFile'

    removedFile = File()
    removedFile.LFN = lfn
    removeFile.addFile( removedFile )

    oRequest.addOperation( removeFile )

    return ReqClient().putRequest( oRequest )

Suggestion: for local testing, run this with::

    python -m pytest -c ../pytest.ini -vv tests/Integration/WorkloadManagementSystem/Test_TaskQueueDB.py
"""

from DIRAC import gLogger
from DIRAC.Core.Base.Script import parseCommandLine
parseCommandLine()

from DIRAC.WorkloadManagementSystem.DB.TaskQueueDB import TaskQueueDB

gLogger.setLevel('DEBUG')
tqDB = TaskQueueDB()


def test_basicChain():
  """ a basic put - remove """
  tqDefDict = {'OwnerDN': '/my/DN', 'OwnerGroup': 'myGroup', 'Setup': 'aSetup', 'CPUTime': 50000}
  result = tqDB.insertJob(123, tqDefDict, 10)
  assert result['OK'] is True
  result = tqDB.getTaskQueueForJobs([123])
  assert result['OK'] is True

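# A complementary teardown sketch in the same style as test_basicChain above,
# assuming the module-level tqDB; deleteJob() is the same call the agents in
# the neighbouring snippets use to drop a job from the task queues.
def test_basicChainCleanup():
  """ remove the job inserted by test_basicChain """
  result = tqDB.deleteJob(123)
  assert result['OK'] is True
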
class JobCleaningAgent( AgentModule ):
  """
  The specific agents must provide the following methods:
    - initialize() for initial settings
    - beginExecution()
    - execute() - the main method called in the agent cycle
    - endExecution()
    - finalize() - the graceful exit of the method, this one is usually used for the agent restart
  """

  def __init__( self, *args, **kwargs ):
    """ c'tor
    """
    AgentModule.__init__( self, *args, **kwargs )

    # clients
    # FIXME: shouldn't we avoid using the DBs directly, and instead go through the service?
    self.jobDB = None
    self.taskQueueDB = None
    self.jobLoggingDB = None

    self.maxJobsAtOnce = 100
    self.jobByJob = False
    self.throttlingPeriod = 0.

    self.removeStatusDelay = {'Done':7,
                              'Killed':1,
                              'Failed':7 }

  #############################################################################
  def initialize( self ):
    """ Sets defaults
    """
    self.am_setOption( "PollingTime", 120 )
    self.jobDB = JobDB()
    self.taskQueueDB = TaskQueueDB()
    self.jobLoggingDB = JobLoggingDB()
    # self.sandboxDB = SandboxDB( 'SandboxDB' )
    agentTSTypes = self.am_getOption( 'ProductionTypes', [] )
    if agentTSTypes:
      self.prod_types = agentTSTypes
    else:
      self.prod_types = Operations().getValue( 'Transformations/DataProcessing', ['MCSimulation', 'Merge'] )
    gLogger.info( "Will exclude the following Production types from cleaning %s" % ( ', '.join( self.prod_types ) ) )
    self.maxJobsAtOnce = self.am_getOption( 'MaxJobsAtOnce', 500 )
    self.jobByJob = self.am_getOption( 'JobByJob', False )
    self.throttlingPeriod = self.am_getOption( 'ThrottlingPeriod', 0. )

    self.removeStatusDelay['Done'] = self.am_getOption( 'RemoveStatusDelay/Done', 7 )
    self.removeStatusDelay['Killed'] = self.am_getOption( 'RemoveStatusDelay/Killed', 7 )
    self.removeStatusDelay['Failed'] = self.am_getOption( 'RemoveStatusDelay/Failed', 7 )

    return S_OK()

  def __getAllowedJobTypes( self ):
    """ Get valid jobTypes
    """
    result = self.jobDB.getDistinctJobAttributes( 'JobType' )
    if not result[ 'OK' ]:
      return result
    cleanJobTypes = []
    for jobType in result[ 'Value' ]:
      if jobType not in self.prod_types:
        cleanJobTypes.append( jobType )
    self.log.notice( "JobTypes to clean %s" % cleanJobTypes )
    return S_OK( cleanJobTypes )

  #############################################################################
  def execute( self ):
    """ Remove jobs in various status
    """
    # Delete jobs in "Deleted" state
    result = self.removeJobsByStatus( { 'Status' : 'Deleted' } )
    if not result[ 'OK' ]:
      return result
    # Get all the Job types that can be cleaned
    result = self.__getAllowedJobTypes()
    if not result[ 'OK' ]:
      return result

    # No jobs in the system subject to removal
    if not result['Value']:
      return S_OK()

    baseCond = { 'JobType' : result[ 'Value' ] }
    # Remove jobs with final status
    for status in self.removeStatusDelay:
      delay = self.removeStatusDelay[ status ]
      condDict = dict( baseCond )
      condDict[ 'Status' ] = status
      delTime = str( Time.dateTime() - delay * Time.day )
      result = self.removeJobsByStatus( condDict, delTime )
      if not result['OK']:
        gLogger.warn( 'Failed to remove jobs in status %s' % status )
    return S_OK()

  def removeJobsByStatus( self, condDict, delay = False ):
    """ Remove deleted jobs
    """
    if delay:
      gLogger.verbose( "Removing jobs with %s and older than %s" % ( condDict, delay ) )
      result = self.jobDB.selectJobs( condDict, older = delay, limit = self.maxJobsAtOnce )
    else:
      gLogger.verbose( "Removing jobs with %s " % condDict )
      result = self.jobDB.selectJobs( condDict, limit = self.maxJobsAtOnce )
    if not result['OK']:
      return result

    jobList = result['Value']
    if len( jobList ) > self.maxJobsAtOnce:
      jobList = jobList[:self.maxJobsAtOnce]
    if not jobList:
      return S_OK()

    self.log.notice( "Deleting %s jobs for %s" % ( len( jobList ), condDict ) )

    count = 0
    error_count = 0
    result = SandboxStoreClient( useCertificates = True ).unassignJobs( jobList )
    if not result[ 'OK' ]:
      gLogger.warn( "Cannot unassign jobs to sandboxes", result[ 'Message' ] )

    result = self.deleteJobOversizedSandbox( jobList )
    if not result[ 'OK' ]:
      gLogger.warn( "Cannot schedule removal of oversized sandboxes", result[ 'Message' ] )
      return result

    failedJobs = result['Value']['Failed']
    for job in failedJobs:
      jobList.pop( jobList.index( job ) )

    # TODO: we should not remove a job if it still has requests in the RequestManager.
    # But this logic should go in the client or in the service, and right now no service expose jobDB.removeJobFromDB

    if self.jobByJob:
      for jobID in jobList:
        resultJobDB = self.jobDB.removeJobFromDB( jobID )
        resultTQ = self.taskQueueDB.deleteJob( jobID )
        resultLogDB = self.jobLoggingDB.deleteJob( jobID )
        errorFlag = False
        if not resultJobDB['OK']:
          gLogger.warn( 'Failed to remove job %d from JobDB' % jobID, result['Message'] )
          errorFlag = True
        if not resultTQ['OK']:
          gLogger.warn( 'Failed to remove job %d from TaskQueueDB' % jobID, result['Message'] )
          errorFlag = True
        if not resultLogDB['OK']:
          gLogger.warn( 'Failed to remove job %d from JobLoggingDB' % jobID, result['Message'] )
          errorFlag = True
        if errorFlag:
          error_count += 1
        else:
          count += 1
        if self.throttlingPeriod:
          time.sleep( self.throttlingPeriod )
    else:
      result = self.jobDB.removeJobFromDB( jobList )
      if not result['OK']:
        gLogger.error( 'Failed to delete %d jobs from JobDB' % len( jobList ) )
      else:
        gLogger.info( 'Deleted %d jobs from JobDB' % len( jobList ) )

      for jobID in jobList:
        resultTQ = self.taskQueueDB.deleteJob( jobID )
        if not resultTQ['OK']:
          gLogger.warn( 'Failed to remove job %d from TaskQueueDB' % jobID, resultTQ['Message'] )
          error_count += 1
        else:
          count += 1

      result = self.jobLoggingDB.deleteJob( jobList )
      if not result['OK']:
        gLogger.error( 'Failed to delete %d jobs from JobLoggingDB' % len( jobList ) )
      else:
        gLogger.info( 'Deleted %d jobs from JobLoggingDB' % len( jobList ) )

    if count > 0 or error_count > 0:
      gLogger.info( 'Deleted %d jobs from JobDB, %d errors' % ( count, error_count ) )
    return S_OK()

  def deleteJobOversizedSandbox( self, jobIDList ):
    """ Delete the job oversized sandbox files from storage elements
    """
    failed = {}
    successful = {}
    lfnDict = {}
    for jobID in jobIDList:
      result = self.jobDB.getJobParameter( jobID, 'OutputSandboxLFN' )
      if result['OK']:
        lfn = result['Value']
        if lfn:
          lfnDict[lfn] = jobID
        else:
          successful[jobID] = 'No oversized sandbox found'
      else:
        gLogger.warn( 'Error interrogating JobDB: %s' % result['Message'] )
    if not lfnDict:
      return S_OK( {'Successful':successful, 'Failed':failed} )

    # Schedule removal of the LFNs now
    for lfn, jobID in lfnDict.items():
      result = self.jobDB.getJobAttributes( jobID, ['OwnerDN', 'OwnerGroup'] )
      if not result['OK']:
        failed[jobID] = lfn
        continue
      if not result['Value']:
        failed[jobID] = lfn
        continue
      ownerDN = result['Value']['OwnerDN']
      ownerGroup = result['Value']['OwnerGroup']
      result = self.__setRemovalRequest( lfn, ownerDN, ownerGroup )
      if not result['OK']:
        failed[jobID] = lfn
      else:
        successful[jobID] = lfn

    result = {'Successful':successful, 'Failed':failed}
    return S_OK( result )

  def __setRemovalRequest( self, lfn, ownerDN, ownerGroup ):
    """ Set removal request with the given credentials
    """
    oRequest = Request()
    oRequest.OwnerDN = ownerDN
    oRequest.OwnerGroup = ownerGroup
    oRequest.RequestName = os.path.basename( lfn ).strip() + '_removal_request.xml'
    oRequest.SourceComponent = 'JobCleaningAgent'

    removeFile = Operation()
    removeFile.Type = 'RemoveFile'

    removedFile = File()
    removedFile.LFN = lfn
    removeFile.addFile( removedFile )

    oRequest.addOperation( removeFile )

    return ReqClient().putRequest( oRequest )

class JobCleaningAgent( AgentModule ):
  """
  The specific agents must provide the following methods:
    - initialize() for initial settings
    - beginExecution()
    - execute() - the main method called in the agent cycle
    - endExecution()
    - finalize() - the graceful exit of the method, this one is usually used for the agent restart
  """

  #############################################################################
  def initialize( self ):
    """Sets defaults
    """
    self.am_setOption( "PollingTime", 60 )
    self.jobDB = JobDB()
    self.taskQueueDB = TaskQueueDB()
    self.jobLoggingDB = JobLoggingDB()
    # self.sandboxDB = SandboxDB( 'SandboxDB' )
    agentTSTypes = self.am_getOption( 'ProductionTypes', [] )
    if agentTSTypes:
      self.prod_types = agentTSTypes
    else:
      self.prod_types = Operations().getValue( 'Transformations/DataProcessing', ['MCSimulation', 'Merge'] )
    gLogger.info( 'Will exclude the following Production types from cleaning %s' % ( string.join( self.prod_types, ', ' ) ) )
    self.maxJobsAtOnce = self.am_getOption( 'MaxJobsAtOnce', 200 )
    self.jobByJob = self.am_getOption( 'JobByJob', True )
    self.throttlingPeriod = self.am_getOption( 'ThrottlingPeriod', 0. )
    return S_OK()

  def __getAllowedJobTypes( self ):
    # Get valid jobTypes
    result = self.jobDB.getDistinctJobAttributes( 'JobType' )
    if not result[ 'OK' ]:
      return result
    cleanJobTypes = []
    for jobType in result[ 'Value' ]:
      if jobType not in self.prod_types:
        cleanJobTypes.append( jobType )
    self.log.notice( "JobTypes to clean %s" % cleanJobTypes )
    return S_OK( cleanJobTypes )

  #############################################################################
  def execute( self ):
    """The PilotAgent execution method.
    """
    # Delete jobs in "Deleted" state
    result = self.removeJobsByStatus( { 'Status' : 'Deleted' } )
    if not result[ 'OK' ]:
      return result
    # Get all the Job types that can be cleaned
    result = self.__getAllowedJobTypes()
    if not result[ 'OK' ]:
      return result
    baseCond = { 'JobType' : result[ 'Value' ] }
    # Remove jobs with final status
    for status in REMOVE_STATUS_DELAY:
      delay = REMOVE_STATUS_DELAY[ status ]
      condDict = dict( baseCond )
      condDict[ 'Status' ] = status
      delTime = str( Time.dateTime() - delay * Time.day )
      result = self.removeJobsByStatus( condDict, delTime )
      if not result['OK']:
        gLogger.warn( 'Failed to remove jobs in status %s' % status )
    return S_OK()

  def removeJobsByStatus( self, condDict, delay = False ):
    """ Remove deleted jobs
    """
    if delay:
      gLogger.verbose( "Removing jobs with %s and older than %s" % ( condDict, delay ) )
      result = self.jobDB.selectJobs( condDict, older = delay, limit = self.maxJobsAtOnce )
    else:
      gLogger.verbose( "Removing jobs with %s " % condDict )
      result = self.jobDB.selectJobs( condDict, limit = self.maxJobsAtOnce )
    if not result['OK']:
      return result

    jobList = result['Value']
    if len( jobList ) > self.maxJobsAtOnce:
      jobList = jobList[:self.maxJobsAtOnce]
    if not jobList:
      return S_OK()

    self.log.notice( "Deleting %s jobs for %s" % ( len( jobList ), condDict ) )

    count = 0
    error_count = 0
    result = SandboxStoreClient( useCertificates = True ).unassignJobs( jobList )
    if not result[ 'OK' ]:
      gLogger.warn( "Cannot unassign jobs to sandboxes", result[ 'Message' ] )

    if self.jobByJob:
      for jobID in jobList:
        resultJobDB = self.jobDB.removeJobFromDB( jobID )
        resultTQ = self.taskQueueDB.deleteJob( jobID )
        resultLogDB = self.jobLoggingDB.deleteJob( jobID )
        errorFlag = False
        if not resultJobDB['OK']:
          gLogger.warn( 'Failed to remove job %d from JobDB' % jobID, result['Message'] )
          errorFlag = True
        if not resultTQ['OK']:
          gLogger.warn( 'Failed to remove job %d from TaskQueueDB' % jobID, result['Message'] )
          errorFlag = True
        if not resultLogDB['OK']:
          gLogger.warn( 'Failed to remove job %d from JobLoggingDB' % jobID, result['Message'] )
          errorFlag = True
        if errorFlag:
          error_count += 1
        else:
          count += 1
        if self.throttlingPeriod:
          time.sleep( self.throttlingPeriod )
    else:
      result = self.jobDB.removeJobFromDB( jobList )
      if not result['OK']:
        gLogger.error( 'Failed to delete %d jobs from JobDB' % len( jobList ) )
      else:
        gLogger.info( 'Deleted %d jobs from JobDB' % len( jobList ) )

      for jobID in jobList:
        resultTQ = self.taskQueueDB.deleteJob( jobID )
        if not resultTQ['OK']:
          gLogger.warn( 'Failed to remove job %d from TaskQueueDB' % jobID, resultTQ['Message'] )
          error_count += 1
        else:
          count += 1

      result = self.jobLoggingDB.deleteJob( jobList )
      if not result['OK']:
        gLogger.error( 'Failed to delete %d jobs from JobLoggingDB' % len( jobList ) )
      else:
        gLogger.info( 'Deleted %d jobs from JobLoggingDB' % len( jobList ) )

    if count > 0 or error_count > 0:
      gLogger.info( 'Deleted %d jobs from JobDB, %d errors' % ( count, error_count ) )
    return S_OK()

def setUp(self):
    gLogger.setLevel('DEBUG')
    self.tqDB = TaskQueueDB()

class Matcher( object ):
  """ Logic for matching
  """

  def __init__( self, pilotAgentsDB = None, jobDB = None, tqDB = None, jlDB = None, opsHelper = None ):
    """ c'tor
    """
    if pilotAgentsDB:
      self.pilotAgentsDB = pilotAgentsDB
    else:
      self.pilotAgentsDB = PilotAgentsDB()
    if jobDB:
      self.jobDB = jobDB
    else:
      self.jobDB = JobDB()
    if tqDB:
      self.tqDB = tqDB
    else:
      self.tqDB = TaskQueueDB()
    if jlDB:
      self.jlDB = jlDB
    else:
      self.jlDB = JobLoggingDB()
    if opsHelper:
      self.opsHelper = opsHelper
    else:
      self.opsHelper = Operations()

    self.log = gLogger.getSubLogger( "Matcher" )

    self.limiter = Limiter( jobDB = self.jobDB, opsHelper = self.opsHelper )

  def selectJob( self, resourceDescription, credDict ):
    """ Main job selection function to find the highest priority job matching the resource capacity
    """
    startTime = time.time()

    resourceDict = self._getResourceDict( resourceDescription, credDict )

    negativeCond = self.limiter.getNegativeCondForSite( resourceDict['Site'] )
    result = self.tqDB.matchAndGetJob( resourceDict, negativeCond = negativeCond )

    if not result['OK']:
      return result
    result = result['Value']
    if not result['matchFound']:
      self.log.info( "No match found" )
      raise RuntimeError( "No match found" )

    jobID = result['jobId']
    resAtt = self.jobDB.getJobAttributes( jobID, ['OwnerDN', 'OwnerGroup', 'Status'] )
    if not resAtt['OK']:
      raise RuntimeError( 'Could not retrieve job attributes' )
    if not resAtt['Value']:
      raise RuntimeError( "No attributes returned for job" )
    if not resAtt['Value']['Status'] == 'Waiting':
      self.log.error( 'Job matched by the TQ is not in Waiting state', str( jobID ) )
      result = self.tqDB.deleteJob( jobID )
      if not result[ 'OK' ]:
        return result
      raise RuntimeError( "Job %s is not in Waiting state" % str( jobID ) )

    self._reportStatus( resourceDict, jobID )

    result = self.jobDB.getJobJDL( jobID )
    if not result['OK']:
      raise RuntimeError( "Failed to get the job JDL" )

    resultDict = {}
    resultDict['JDL'] = result['Value']
    resultDict['JobID'] = jobID

    matchTime = time.time() - startTime
    self.log.info( "Match time: [%s]" % str( matchTime ) )
    gMonitor.addMark( "matchTime", matchTime )

    # Get some extra stuff into the response returned
    resOpt = self.jobDB.getJobOptParameters( jobID )
    if resOpt['OK']:
      for key, value in resOpt['Value'].items():
        resultDict[key] = value
    resAtt = self.jobDB.getJobAttributes( jobID, ['OwnerDN', 'OwnerGroup'] )
    if not resAtt['OK']:
      raise RuntimeError( 'Could not retrieve job attributes' )
    if not resAtt['Value']:
      raise RuntimeError( 'No attributes returned for job' )

    if self.opsHelper.getValue( "JobScheduling/CheckMatchingDelay", True ):
      self.limiter.updateDelayCounters( resourceDict['Site'], jobID )

    pilotInfoReportedFlag = resourceDict.get( 'PilotInfoReportedFlag', False )
    if not pilotInfoReportedFlag:
      self._updatePilotInfo( resourceDict )
    self._updatePilotJobMapping( resourceDict, jobID )

    resultDict['DN'] = resAtt['Value']['OwnerDN']
    resultDict['Group'] = resAtt['Value']['OwnerGroup']
    resultDict['PilotInfoReportedFlag'] = True

    return resultDict

  def _getResourceDict( self, resourceDescription, credDict ):
    """ from resourceDescription to resourceDict (just various mods)
    """
    resourceDict = self._processResourceDescription( resourceDescription )
    resourceDict = self._checkCredentials( resourceDict, credDict )
    self._checkPilotVersion( resourceDict )
    if not self._checkMask( resourceDict ):
      # Banned destinations can only take Test jobs
      resourceDict['JobType'] = 'Test'

    self.log.verbose( "Resource description:" )
    for key in resourceDict:
      self.log.verbose( "%s : %s" % ( key.rjust( 20 ), resourceDict[ key ] ) )

    return resourceDict

  def _processResourceDescription( self, resourceDescription ):
    """ Check and form the resource description dictionary

        resourceDescription is a ceDict coming from a JobAgent, for example.
    """
    resourceDict = {}
    if isinstance( resourceDescription, basestring ):
      classAdAgent = ClassAd( resourceDescription )
      if not classAdAgent.isOK():
        raise ValueError( 'Illegal Resource JDL' )
      self.log.verbose( classAdAgent.asJDL() )

      for name in singleValueDefFields:
        if classAdAgent.lookupAttribute( name ):
          if name == 'CPUTime':
            resourceDict[name] = classAdAgent.getAttributeInt( name )
          else:
            resourceDict[name] = classAdAgent.getAttributeString( name )

      for name in multiValueMatchFields:
        if classAdAgent.lookupAttribute( name ):
          if name == 'SubmitPool':
            resourceDict[name] = classAdAgent.getListFromExpression( name )
          else:
            resourceDict[name] = classAdAgent.getAttributeString( name )

      # Check if a JobID is requested
      if classAdAgent.lookupAttribute( 'JobID' ):
        resourceDict['JobID'] = classAdAgent.getAttributeInt( 'JobID' )

      for k in ( 'DIRACVersion', 'ReleaseVersion', 'ReleaseProject', 'VirtualOrganization' ):
        if classAdAgent.lookupAttribute( k ):
          resourceDict[ k ] = classAdAgent.getAttributeString( k )

    else:
      for name in singleValueDefFields:
        if resourceDescription.has_key( name ):
          resourceDict[name] = resourceDescription[name]

      for name in multiValueMatchFields:
        if resourceDescription.has_key( name ):
          resourceDict[name] = resourceDescription[name]

      if resourceDescription.has_key( 'JobID' ):
        resourceDict['JobID'] = resourceDescription['JobID']

      for k in ( 'DIRACVersion', 'ReleaseVersion', 'ReleaseProject', 'VirtualOrganization',
                 'PilotReference', 'PilotBenchmark', 'PilotInfoReportedFlag' ):
        if k in resourceDescription:
          resourceDict[ k ] = resourceDescription[ k ]

    return resourceDict

  def _reportStatus( self, resourceDict, jobID ):
    """ Reports the status of the matched job in jobDB and jobLoggingDB

        Do not fail if errors happen here
    """
    attNames = ['Status', 'MinorStatus', 'ApplicationStatus', 'Site']
    attValues = ['Matched', 'Assigned', 'Unknown', resourceDict['Site']]
    result = self.jobDB.setJobAttributes( jobID, attNames, attValues )
    if not result['OK']:
      self.log.error( "Problem reporting job status",
                      "setJobAttributes, jobID = %s: %s" % ( jobID, result['Message'] ) )
    else:
      self.log.verbose( "Set job attributes for jobID %s" % jobID )

    result = self.jlDB.addLoggingRecord( jobID, status = 'Matched', minor = 'Assigned', source = 'Matcher' )
    if not result['OK']:
      self.log.error( "Problem reporting job status",
                      "addLoggingRecord, jobID = %s: %s" % ( jobID, result['Message'] ) )
    else:
      self.log.verbose( "Added logging record for jobID %s" % jobID )

  def _checkMask( self, resourceDict ):
    """ Check the mask: are we allowed to run normal jobs?

        FIXME: should we move to site OR SE?
    """
    if not 'Site' in resourceDict:
      self.log.error( "Missing Site Name in Resource JDL" )
      raise RuntimeError( "Missing Site Name in Resource JDL" )

    # Get common site mask and check the agent site
    result = self.jobDB.getSiteMask( siteState = 'Active' )
    if not result['OK']:
      self.log.error( "Internal error", "getSiteMask: %s" % result['Message'] )
      raise RuntimeError( "Internal error" )
    maskList = result['Value']

    if resourceDict['Site'] not in maskList:
      return False

    return True

  def _updatePilotInfo( self, resourceDict ):
    """ Update pilot information - do not fail if we don't manage to do it
    """
    pilotReference = resourceDict.get( 'PilotReference', '' )
    if pilotReference:
      gridCE = resourceDict.get( 'GridCE', 'Unknown' )
      site = resourceDict.get( 'Site', 'Unknown' )
      benchmark = resourceDict.get( 'PilotBenchmark', 0.0 )
      self.log.verbose( 'Reporting pilot info for %s: gridCE=%s, site=%s, benchmark=%f' % ( pilotReference,
                                                                                            gridCE, site, benchmark ) )

      result = self.pilotAgentsDB.setPilotStatus( pilotReference, status = 'Running', gridSite = site,
                                                  destination = gridCE, benchmark = benchmark )
      if not result['OK']:
        self.log.error( "Problem updating pilot information",
                        "; setPilotStatus. pilotReference: %s; %s" % ( pilotReference, result['Message'] ) )

  def _updatePilotJobMapping( self, resourceDict, jobID ):
    """ Update pilot to job mapping information
    """
    pilotReference = resourceDict.get( 'PilotReference', '' )
    if pilotReference:
      result = self.pilotAgentsDB.setCurrentJobID( pilotReference, jobID )
      if not result['OK']:
        self.log.error( "Problem updating pilot information",
                        ";setCurrentJobID. pilotReference: %s; %s" % ( pilotReference, result['Message'] ) )

      result = self.pilotAgentsDB.setJobForPilot( jobID, pilotReference, updateStatus = False )
      if not result['OK']:
        self.log.error( "Problem updating pilot information",
                        "; setJobForPilot. pilotReference: %s; %s" % ( pilotReference, result['Message'] ) )

  def _checkCredentials( self, resourceDict, credDict ):
    """ Check if we can get a job given the passed credentials
    """
    if Properties.GENERIC_PILOT in credDict[ 'properties' ]:
      # You can only match groups in the same VO
      if credDict[ 'group' ] == "hosts":
        # for the host case the VirtualOrganization parameter
        # is mandatory in resourceDict
        vo = resourceDict.get( 'VirtualOrganization', '' )
      else:
        vo = Registry.getVOForGroup( credDict[ 'group' ] )
      result = Registry.getGroupsForVO( vo )
      if result[ 'OK' ]:
        resourceDict[ 'OwnerGroup' ] = result[ 'Value' ]
      else:
        raise RuntimeError( result['Message'] )
    else:
      # If it's a private pilot, the DN has to be the same
      if Properties.PILOT in credDict[ 'properties' ]:
        self.log.notice( "Setting the resource DN to the credentials DN" )
        resourceDict[ 'OwnerDN' ] = credDict[ 'DN' ]
      # If it's a job sharing. The group has to be the same and just check that the DN (if any)
      # belongs to the same group
      elif Properties.JOB_SHARING in credDict[ 'properties' ]:
        resourceDict[ 'OwnerGroup' ] = credDict[ 'group' ]
        self.log.notice( "Setting the resource group to the credentials group" )
        if 'OwnerDN' in resourceDict and resourceDict[ 'OwnerDN' ] != credDict[ 'DN' ]:
          ownerDN = resourceDict[ 'OwnerDN' ]
          result = Registry.getGroupsForDN( resourceDict[ 'OwnerDN' ] )
          if not result[ 'OK' ]:
            raise RuntimeError( result['Message'] )
          if credDict[ 'group' ] not in result[ 'Value' ]:
            # DN is not in the same group! bad boy.
            self.log.notice( "You cannot request jobs from DN %s. It does not belong to your group!" % ownerDN )
            resourceDict[ 'OwnerDN' ] = credDict[ 'DN' ]
      # Nothing special, group and DN have to be the same
      else:
        resourceDict[ 'OwnerDN' ] = credDict[ 'DN' ]
        resourceDict[ 'OwnerGroup' ] = credDict[ 'group' ]

    return resourceDict

  def _checkPilotVersion( self, resourceDict ):
    """ Check the pilot DIRAC version
    """
    if self.opsHelper.getValue( "Pilot/CheckVersion", True ):
      if 'ReleaseVersion' not in resourceDict:
        if not 'DIRACVersion' in resourceDict:
          raise RuntimeError( 'Version check requested and not provided by Pilot' )
        else:
          pilotVersion = resourceDict['DIRACVersion']
      else:
        pilotVersion = resourceDict['ReleaseVersion']

      validVersions = self.opsHelper.getValue( "Pilot/Version", [] )
      if validVersions and pilotVersion not in validVersions:
        raise RuntimeError( 'Pilot version does not match the production version %s not in ( %s )' %
                            ( pilotVersion, ",".join( validVersions ) ) )
      # Check project if requested
      validProject = self.opsHelper.getValue( "Pilot/Project", "" )
      if validProject:
        if 'ReleaseProject' not in resourceDict:
          raise RuntimeError( "Version check requested but expected project %s not received" % validProject )
        if resourceDict[ 'ReleaseProject' ] != validProject:
          raise RuntimeError( "Version check requested but expected project %s != received %s" %
                              ( validProject, resourceDict[ 'ReleaseProject' ] ) )

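# Hedged sketch of driving Matcher.selectJob() from the class above: on success
# it returns a plain dict carrying the JDL and JobID, and it raises RuntimeError
# when no job matches; the resourceDescription and credDict contents here are
# illustrative only.
matcher = Matcher()
try:
  matchDict = matcher.selectJob( resourceDescription, credDict )
  gLogger.info( "Matched job %s" % matchDict['JobID'] )
except RuntimeError as e:
  gLogger.info( "No job handed out: %s" % str( e ) )
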
class JobCleaningAgent(AgentModule):
    """
    The specific agents must provide the following methods:
      - initialize() for initial settings
      - beginExecution()
      - execute() - the main method called in the agent cycle
      - endExecution()
      - finalize() - the graceful exit of the method, this one is usually used for the agent restart
    """

    #############################################################################
    def initialize(self):
        """Sets defaults"""
        self.am_setOption("PollingTime", 60)
        self.jobDB = JobDB()
        self.taskQueueDB = TaskQueueDB()
        # self.sandboxDB = SandboxDB( 'SandboxDB' )
        self.prod_types = self.am_getOption(
            "ProductionTypes", ["DataReconstruction", "DataStripping", "MCSimulation", "Merge", "production"]
        )
        gLogger.info("Will exclude the following Production types from cleaning %s" % ", ".join(self.prod_types))
        self.maxJobsAtOnce = self.am_getOption("MaxJobsAtOnce", 200)
        self.jobByJob = self.am_getOption("JobByJob", True)
        self.throttlingPeriod = self.am_getOption("ThrottlingPeriod", 0.0)
        return S_OK()

    def __getAllowedJobTypes(self):
        """Get the list of jobTypes subject to cleaning"""
        result = self.jobDB.getDistinctJobAttributes("JobType")
        if not result["OK"]:
            return result
        cleanJobTypes = []
        for jobType in result["Value"]:
            if jobType not in self.prod_types:
                cleanJobTypes.append(jobType)
        self.log.notice("JobTypes to clean %s" % cleanJobTypes)
        return S_OK(cleanJobTypes)

    #############################################################################
    def execute(self):
        """The JobCleaningAgent execution method."""
        # Delete jobs in "Deleted" state
        result = self.removeJobsByStatus({"Status": "Deleted"})
        if not result["OK"]:
            return result
        # Get all the Job types that can be cleaned
        result = self.__getAllowedJobTypes()
        if not result["OK"]:
            return result
        baseCond = {"JobType": result["Value"]}
        # Remove jobs with final status
        for status in REMOVE_STATUS_DELAY:
            delay = REMOVE_STATUS_DELAY[status]
            condDict = dict(baseCond)
            condDict["Status"] = status
            delTime = str(Time.dateTime() - delay * Time.day)
            result = self.removeJobsByStatus(condDict, delTime)
            if not result["OK"]:
                gLogger.warn("Failed to remove jobs in status %s" % status)
        return S_OK()

    def removeJobsByStatus(self, condDict, delay=False):
        """Remove jobs matching the given condition dictionary, optionally only those older than delay"""
        if delay:
            gLogger.verbose("Removing jobs with %s and older than %s" % (condDict, delay))
            result = self.jobDB.selectJobs(condDict, older=delay)
        else:
            gLogger.verbose("Removing jobs with %s " % condDict)
            result = self.jobDB.selectJobs(condDict)
        if not result["OK"]:
            return result
        jobList = result["Value"]
        if len(jobList) > self.maxJobsAtOnce:
            jobList = jobList[: self.maxJobsAtOnce]
        if not jobList:
            return S_OK()
        self.log.notice("Deleting %s jobs for %s" % (len(jobList), condDict))
        count = 0
        error_count = 0
        result = SandboxStoreClient(useCertificates=True).unassignJobs(jobList)
        if not result["OK"]:
            gLogger.warn("Cannot unassign jobs to sandboxes", result["Message"])
        if self.jobByJob:
            for jobID in jobList:
                resultJobDB = self.jobDB.removeJobFromDB(jobID)
                resultTQ = self.taskQueueDB.deleteJob(jobID)
                if not resultJobDB["OK"]:
                    gLogger.warn("Failed to remove job %d from JobDB" % jobID, resultJobDB["Message"])
                    error_count += 1
                elif not resultTQ["OK"]:
                    gLogger.warn("Failed to remove job %d from TaskQueueDB" % jobID, resultTQ["Message"])
                    error_count += 1
                else:
                    count += 1
                if self.throttlingPeriod:
                    time.sleep(self.throttlingPeriod)
        else:
            result = self.jobDB.removeJobFromDB(jobList)
            if not result["OK"]:
                gLogger.error("Failed to delete %d jobs from JobDB" % len(jobList))
            else:
                gLogger.info("Deleted %d jobs from JobDB" % len(jobList))
            for jobID in jobList:
                resultTQ = self.taskQueueDB.deleteJob(jobID)
                if not resultTQ["OK"]:
                    gLogger.warn("Failed to remove job %d from TaskQueueDB" % jobID, resultTQ["Message"])
                    error_count += 1
                else:
                    count += 1
        if count > 0 or error_count > 0:
            gLogger.info("Deleted %d jobs from JobDB, %d errors" % (count, error_count))
        return S_OK()
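# A minimal standalone sketch (not part of the agent) of how the status-to-delay mapping
# above translates into per-status selection conditions. The REMOVE_STATUS_DELAY values
# here are illustrative assumptions, and stdlib datetime merely stands in for the DIRAC
# Time utilities used by the agent itself.
import datetime

REMOVE_STATUS_DELAY = {'Done': 7, 'Killed': 7, 'Failed': 14}  # days; hypothetical values

def buildCleaningConditions(baseCond):
    """Yield (condDict, cutoff) pairs, one per final status to be cleaned."""
    for status, delay in REMOVE_STATUS_DELAY.items():
        condDict = dict(baseCond)
        condDict['Status'] = status
        # Jobs matching condDict and last updated before `cutoff` are candidates for removal
        cutoff = str(datetime.datetime.utcnow() - datetime.timedelta(days=delay))
        yield condDict, cutoff

# Usage example:
for cond, cutoff in buildCleaningConditions({'JobType': ['User', 'test']}):
    print cond, cutoff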
class JobCleaningAgent( AgentModule ):
  """
  The specific agents must provide the following methods:

    * initialize() for initial settings
    * beginExecution()
    * execute() - the main method called in the agent cycle
    * endExecution()
    * finalize() - the graceful exit of the method, this one is usually used for the agent restart
  """

  def __init__( self, *args, **kwargs ):
    """ c'tor
    """
    AgentModule.__init__( self, *args, **kwargs )

    # clients
    # FIXME: shouldn't we avoid using the DBs directly, and instead go through the service?
    self.jobDB = None
    self.taskQueueDB = None
    self.jobLoggingDB = None

    self.maxJobsAtOnce = 100
    self.jobByJob = False
    self.throttlingPeriod = 0.

    self.prodTypes = []

    self.removeStatusDelay = {}

  #############################################################################
  def initialize( self ):
    """ Sets defaults
    """
    self.am_setOption( "PollingTime", 120 )
    self.jobDB = JobDB()
    self.taskQueueDB = TaskQueueDB()
    self.jobLoggingDB = JobLoggingDB()
    # self.sandboxDB = SandboxDB( 'SandboxDB' )
    agentTSTypes = self.am_getOption( 'ProductionTypes', [] )
    if agentTSTypes:
      self.prodTypes = agentTSTypes
    else:
      self.prodTypes = Operations().getValue( 'Transformations/DataProcessing', ['MCSimulation', 'Merge'] )
    gLogger.info( "Will exclude the following Production types from cleaning %s" % ( ', '.join( self.prodTypes ) ) )
    self.maxJobsAtOnce = self.am_getOption( 'MaxJobsAtOnce', 500 )
    self.jobByJob = self.am_getOption( 'JobByJob', False )
    self.throttlingPeriod = self.am_getOption( 'ThrottlingPeriod', 0. )

    self.removeStatusDelay['Done'] = self.am_getOption( 'RemoveStatusDelay/Done', 7 )
    self.removeStatusDelay['Killed'] = self.am_getOption( 'RemoveStatusDelay/Killed', 7 )
    self.removeStatusDelay['Failed'] = self.am_getOption( 'RemoveStatusDelay/Failed', 7 )
    self.removeStatusDelay['Any'] = self.am_getOption( 'RemoveStatusDelay/Any', -1 )

    return S_OK()

  def __getAllowedJobTypes( self ):
    """ Get the list of jobTypes subject to cleaning
    """
    result = self.jobDB.getDistinctJobAttributes( 'JobType' )
    if not result['OK']:
      return result
    cleanJobTypes = []
    for jobType in result['Value']:
      if jobType not in self.prodTypes:
        cleanJobTypes.append( jobType )
    self.log.notice( "JobTypes to clean %s" % cleanJobTypes )
    return S_OK( cleanJobTypes )

  #############################################################################
  def execute( self ):
    """ Remove jobs in various statuses
    """
    # Delete jobs in "Deleted" state
    result = self.removeJobsByStatus( { 'Status' : 'Deleted' } )
    if not result['OK']:
      return result
    # Get all the Job types that can be cleaned
    result = self.__getAllowedJobTypes()
    if not result['OK']:
      return result

    # No jobs in the system subject to removal
    if not result['Value']:
      return S_OK()

    baseCond = { 'JobType' : result['Value'] }
    # Remove jobs with final status
    for status in self.removeStatusDelay:
      delay = self.removeStatusDelay[ status ]
      if delay < 0:
        # Negative delay means don't delete anything...
        continue
      condDict = dict( baseCond )
      if status != 'Any':
        condDict['Status'] = status
      delTime = str( Time.dateTime() - delay * Time.day )
      result = self.removeJobsByStatus( condDict, delTime )
      if not result['OK']:
        gLogger.warn( 'Failed to remove jobs in status %s' % status )
    return S_OK()

  def removeJobsByStatus( self, condDict, delay = False ):
    """ Remove jobs matching the given condition dictionary, optionally only those older than delay
    """
    if delay:
      gLogger.verbose( "Removing jobs with %s and older than %s day(s)" % ( condDict, delay ) )
      result = self.jobDB.selectJobs( condDict, older = delay, limit = self.maxJobsAtOnce )
    else:
      gLogger.verbose( "Removing jobs with %s " % condDict )
      result = self.jobDB.selectJobs( condDict, limit = self.maxJobsAtOnce )
    if not result['OK']:
      return result

    jobList = result['Value']
    if len( jobList ) > self.maxJobsAtOnce:
      jobList = jobList[:self.maxJobsAtOnce]
    if not jobList:
      return S_OK()

    self.log.notice( "Deleting %s jobs for %s" % ( len( jobList ), condDict ) )

    count = 0
    error_count = 0

    result = SandboxStoreClient( useCertificates = True ).unassignJobs( jobList )
    if not result['OK']:
      gLogger.error( "Cannot unassign jobs to sandboxes", result['Message'] )
      return result

    result = self.deleteJobOversizedSandbox( jobList )
    if not result['OK']:
      gLogger.error( "Cannot schedule removal of oversized sandboxes", result['Message'] )
      return result

    failedJobs = result['Value']['Failed']
    for job in failedJobs:
      jobList.pop( jobList.index( job ) )

    # TODO: we should not remove a job if it still has requests in the RequestManager.
    # But this logic should go in the client or in the service, and right now no service exposes jobDB.removeJobFromDB
    if self.jobByJob:
      for jobID in jobList:
        resultJobDB = self.jobDB.removeJobFromDB( jobID )
        resultTQ = self.taskQueueDB.deleteJob( jobID )
        resultLogDB = self.jobLoggingDB.deleteJob( jobID )
        errorFlag = False
        if not resultJobDB['OK']:
          gLogger.warn( 'Failed to remove job %d from JobDB' % jobID, resultJobDB['Message'] )
          errorFlag = True
        if not resultTQ['OK']:
          gLogger.warn( 'Failed to remove job %d from TaskQueueDB' % jobID, resultTQ['Message'] )
          errorFlag = True
        if not resultLogDB['OK']:
          gLogger.warn( 'Failed to remove job %d from JobLoggingDB' % jobID, resultLogDB['Message'] )
          errorFlag = True
        if errorFlag:
          error_count += 1
        else:
          count += 1
        if self.throttlingPeriod:
          time.sleep( self.throttlingPeriod )
    else:
      result = self.jobDB.removeJobFromDB( jobList )
      if not result['OK']:
        gLogger.error( 'Failed to delete %d jobs from JobDB' % len( jobList ) )
      else:
        gLogger.info( 'Deleted %d jobs from JobDB' % len( jobList ) )

      for jobID in jobList:
        resultTQ = self.taskQueueDB.deleteJob( jobID )
        if not resultTQ['OK']:
          gLogger.warn( 'Failed to remove job %d from TaskQueueDB' % jobID, resultTQ['Message'] )
          error_count += 1
        else:
          count += 1

      result = self.jobLoggingDB.deleteJob( jobList )
      if not result['OK']:
        gLogger.error( 'Failed to delete %d jobs from JobLoggingDB' % len( jobList ) )
      else:
        gLogger.info( 'Deleted %d jobs from JobLoggingDB' % len( jobList ) )

    if count > 0 or error_count > 0:
      gLogger.info( 'Deleted %d jobs from JobDB, %d errors' % ( count, error_count ) )
    return S_OK()

  def deleteJobOversizedSandbox( self, jobIDList ):
    """ Delete the job oversized sandbox files from storage elements
    """
    failed = {}
    successful = {}

    lfnDict = {}
    for jobID in jobIDList:
      result = self.jobDB.getJobParameter( jobID, 'OutputSandboxLFN' )
      if result['OK']:
        lfn = result['Value']
        if lfn:
          lfnDict[lfn] = jobID
        else:
          successful[jobID] = 'No oversized sandbox found'
      else:
        gLogger.warn( 'Error interrogating JobDB: %s' % result['Message'] )
    if not lfnDict:
      return S_OK( {'Successful': successful, 'Failed': failed} )

    # Schedule removal of the LFNs now
    for lfn, jobID in lfnDict.items():
      result = self.jobDB.getJobAttributes( jobID, ['OwnerDN', 'OwnerGroup'] )
      if not result['OK']:
        failed[jobID] = lfn
        continue
      if not result['Value']:
        failed[jobID] = lfn
        continue
      ownerDN = result['Value']['OwnerDN']
      ownerGroup = result['Value']['OwnerGroup']
      result = self.__setRemovalRequest( lfn, ownerDN, ownerGroup )
      if not result['OK']:
        failed[jobID] = lfn
      else:
        successful[jobID] = lfn

    result = {'Successful': successful, 'Failed': failed}
    return S_OK( result )

  def __setRemovalRequest( self, lfn, ownerDN, ownerGroup ):
    """ Set removal request with the given credentials
    """
    oRequest = Request()
    oRequest.OwnerDN = ownerDN
    oRequest.OwnerGroup = ownerGroup
    oRequest.RequestName = os.path.basename( lfn ).strip() + '_removal_request.xml'
    oRequest.SourceComponent = 'JobCleaningAgent'

    removeFile = Operation()
    removeFile.Type = 'RemoveFile'

    removedFile = File()
    removedFile.LFN = lfn
    removeFile.addFile( removedFile )

    oRequest.addOperation( removeFile )

    return ReqClient().putRequest( oRequest )
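# For context, a standalone sketch of the same request construction performed by
# __setRemovalRequest above, using the RequestManagementSystem client API that the
# agent relies on. The LFN is hypothetical, and putRequest is left commented out
# since it needs a configured ReqClient and valid credentials.
import os
from DIRAC.RequestManagementSystem.Client.Request import Request
from DIRAC.RequestManagementSystem.Client.Operation import Operation
from DIRAC.RequestManagementSystem.Client.File import File
from DIRAC.RequestManagementSystem.Client.ReqClient import ReqClient

lfn = '/some/vo/user/oversized_sandbox.tar.bz2'  # hypothetical LFN

# One Request wrapping one RemoveFile Operation, which carries one File
oRequest = Request()
oRequest.RequestName = os.path.basename( lfn ).strip() + '_removal_request.xml'
oRequest.SourceComponent = 'JobCleaningAgent'

removeFile = Operation()
removeFile.Type = 'RemoveFile'

removedFile = File()
removedFile.LFN = lfn
removeFile.addFile( removedFile )

oRequest.addOperation( removeFile )

# ReqClient().putRequest( oRequest )  # would schedule the asynchronous removal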
# NOTE: `db` (the instance DB handle, e.g. a VirtualMachineDB) and `validStates`
# are assumed to be defined earlier in the original test script.
# print db.checkImageStatus( 'name', 'flavor'*10, 'requirements' )
ret = db.insertInstance( 'Image3', 'instance' )
print "insertInstance          ", ret
ret = db.insertInstance( 'Image2', 'instance' )
print "insertInstance          ", ret
if not ret['OK']:
  DIRAC.exit()
print type( ret['Value'] )
print "declareInstanceSubmitted", db.declareInstanceSubmitted( ret['Value'] )

id1 = DIRAC.Time.toString()
print "declareInstanceRunning  ", db.declareInstanceRunning( 'Image3', id1, 'IP', 'ip' )
id2 = DIRAC.Time.toString()
print "declareInstanceRunning  ", db.declareInstanceRunning( 'Image2', id2, 'IP', 'ip' )
print "instanceIDHeartBeat     ", db.instanceIDHeartBeat( id2, 1.0 )

for status in validStates:
  print "get%10sInstances " % status, db.getInstancesByStatus( status )

print "declareInstanceHalting  ", db.declareInstanceHalting( id1, 0.0 )
print "declareInstanceHalting  ", db.declareInstanceHalting( id2, 0.0 )
print "declareStalledInstances ", db.declareStalledInstances()
print "declareStalledInstances ", db.declareStalledInstances()

from DIRAC.WorkloadManagementSystem.DB.TaskQueueDB import TaskQueueDB
tq = TaskQueueDB()
print tq.retrieveTaskQueues()
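# All of the DB calls above follow DIRAC's S_OK/S_ERROR convention: each returns a dict
# with an 'OK' boolean and either a 'Value' (on success) or a 'Message' (on failure).
# A small, purely illustrative helper for unwrapping such results when scripting
# against these DBs; the function name is an assumption, not a DIRAC API.
def unwrap( result, default = None ):
  """Return result['Value'] on success, otherwise print the error and return default."""
  if result['OK']:
    return result['Value']
  print "call failed:", result['Message']
  return default

# Usage example (reusing `tq` from above):
# queues = unwrap( tq.retrieveTaskQueues(), {} )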