def test_matcher(self):
    """Submit a job, queue it by hand and check that the Matcher serves it.

    A hello-world job is submitted through the WMS client, forced into the
    ``Waiting`` status and inserted into the task queue DB.  A resource
    description is then sent to the Matcher service, whose answer must be
    OK.  The job is deleted at the end of the test.
    """
    # insert a proper DN to run the test
    ownerDN = '/C=ch/O=DIRAC/OU=DIRAC CI/CN=ciuser/[email protected]'
    resourceDescription = {'OwnerGroup': 'prod',
                           'OwnerDN': ownerDN,
                           'DIRACVersion': 'pippo',
                           'ReleaseVersion': 'blabla',
                           'VirtualOrganization': 'LHCB',
                           'PilotInfoReportedFlag': 'True',
                           'PilotBenchmark': 'anotherPilot',
                           'LHCbPlatform': 'CERTO',
                           'Site': 'DIRAC.Jenkins.org',
                           'CPUTime': 86400}
    matcher = RPCClient('WorkloadManagement/Matcher')
    jobStateUpdate = RPCClient('WorkloadManagement/JobStateUpdate')
    wmsClient = WMSClient()

    job = helloWorldJob()
    job.setDestination('DIRAC.Jenkins.org')
    job.setInputData('/a/bbb')
    job.setType('User')
    jobDescription = createFile(job)
    res = wmsClient.submitJob(job._toJDL(xmlFile=jobDescription))
    # assertTrue replaces the deprecated assert_ alias
    self.assertTrue(res['OK'])

    jobID = res['Value']

    res = jobStateUpdate.setJobStatus(jobID, 'Waiting', 'matching', 'source')
    self.assertTrue(res['OK'])

    tqDB = TaskQueueDB()
    tqDefDict = {'OwnerDN': ownerDN,
                 'OwnerGroup': 'prod',
                 'Setup': 'JenkinsSetup',
                 'CPUTime': 86400}
    res = tqDB.insertJob(jobID, tqDefDict, 10)
    self.assertTrue(res['OK'])

    res = matcher.requestJob(resourceDescription)
    # print() with a single argument behaves the same on Python 2 and 3
    print(res)
    self.assertTrue(res['OK'])
    wmsClient.deleteJob(jobID)
def test_ParametricChain(self):
    """Submit one parametric job and follow the three jobs it expands into.

    Checks the number and the names of the generated jobs, sets each of
    them to ``Done``, deletes them all and verifies that every job is then
    reported as ``Deleted``.
    """
    client = WMSClient()
    stateUpdater = RPCClient('WorkloadManagement/JobStateUpdate')
    monitor = JobMonitoringClient()

    # build the parametric job and submit it
    parametric = parametricJob()
    descriptionFile = createFile(parametric)
    submission = client.submitJob(parametric._toJDL(xmlFile=descriptionFile))
    self.assertTrue(submission['OK'])
    jobIDList = submission['Value']
    self.assertEqual(len(jobIDList), 3)

    # the generated jobs must carry the expected names
    parameters = monitor.getJobsParameters(jobIDList, ['JobName'])
    self.assertTrue(parameters['OK'])
    foundNames = set()
    for jobID in parameters['Value']:
        foundNames.add(parameters['Value'][jobID]['JobName'])
    expectedNames = set()
    for nJob in range(3):
        expectedNames.add('parametric_helloWorld_%s' % nJob)
    self.assertEqual(foundNames, expectedNames)

    for jobID in jobIDList:
        updated = stateUpdater.setJobStatus(jobID, 'Done', 'matching', 'source')
        self.assertTrue(updated['OK'])

    deletion = client.deleteJob(jobIDList)
    self.assertTrue(deletion['OK'])

    for jobID in jobIDList:
        status = monitor.getJobStatus(jobID)
        self.assertTrue(status['OK'])
        self.assertEqual(status['Value'], 'Deleted')
def __sendKillCommand(self, job):
    """Ask the WMS to kill a job, acting on behalf of the job owner.

    The owner DN and group are read from the job DB; when either lookup
    fails an error is logged and no kill command is sent.

    :param int job: ID of job to send kill command
    """
    dnResult = self.jobDB.getJobAttribute(job, 'OwnerDN')
    groupResult = self.jobDB.getJobAttribute(job, 'OwnerGroup')

    # Without both owner attributes the client cannot be delegated
    if not (dnResult['OK'] and groupResult['OK']):
        self.log.error("Failed to get ownerDN or Group for job:",
                       "%s: %s, %s" % (job,
                                       dnResult.get('Message', ''),
                                       groupResult.get('Message', '')))
        return

    delegatedClient = WMSClient(useCertificates=True,
                                delegatedDN=dnResult['Value'],
                                delegatedGroup=groupResult['Value'])
    killResult = delegatedClient.killJob(job)
    if not killResult['OK']:
        self.log.error("Failed to send kill command to job",
                       "%s: %s" % (job, killResult['Message']))
def __init__(self, *args, **kwargs):
    """Construct the agent, its service clients and its option placeholders.

    All positional and keyword arguments are forwarded to ``AgentModule``.
    """
    AgentModule.__init__(self, *args, **kwargs)

    # Clients used by the agent: replica manager, transformation, WMS,
    # request and file catalog.
    self.replicaManager = ReplicaManager()
    self.transClient = TransformationClient()
    self.wmsClient = WMSClient()
    self.requestClient = RequestClient()
    self.metadataClient = FileCatalogClient()

    # Placeholders for CS options, all starting out as None:
    # transformation types, directory locations, transformation metadata
    # tag, archive period in days, active SEs, transformation log SE and
    # the enable/disable execution flag.
    for optionName in ('transformationTypes',
                       'directoryLocations',
                       'transfidmeta',
                       'archiveAfter',
                       'activeStorages',
                       'logSE',
                       'enableFlag'):
        setattr(self, optionName, None)
def initialize(self):
    """Create the service clients and load the agent options.

    Every option is read with ``am_getOption`` (falling back to the default
    given here) and its effective value is logged.

    :return: S_OK()
    """
    self.replicaManager = ReplicaManager()
    self.transClient = TransformationClient()
    self.wmsClient = WMSClient()
    self.requestClient = RequestClient()
    self.metadataClient = FileCatalogClient()
    self.storageUsageClient = StorageUsageClient()

    # This sets the Default Proxy to used as that defined under
    # /Operations/Shifter/DataManager
    # the shifterProxy option in the Configuration can be used to change this default.
    self.am_setOption('shifterProxy', 'DataManager')

    defaultTypes = ['MCSimulation', 'DataReconstruction', 'DataStripping',
                    'MCStripping', 'Merge', 'Replication']
    configuredTypes = self.am_getOption('TransformationTypes', defaultTypes)
    self.transformationTypes = sortList(configuredTypes)
    gLogger.info("Will consider the following transformation types: %s"
                 % str(self.transformationTypes))

    defaultLocations = ['TransformationDB', 'StorageUsage', 'MetadataCatalog']
    configuredLocations = self.am_getOption('DirectoryLocations', defaultLocations)
    self.directoryLocations = sortList(configuredLocations)
    gLogger.info("Will search for directories in the following locations: %s"
                 % str(self.directoryLocations))

    self.transfidmeta = self.am_getOption('TransfIDMeta', "TransformationID")
    gLogger.info("Will use %s as metadata tag name for TransformationID"
                 % self.transfidmeta)

    # archive period, in days
    self.archiveAfter = self.am_getOption('ArchiveAfter', 7)
    gLogger.info("Will archive Completed transformations after %d days"
                 % self.archiveAfter)

    configuredSEs = self.am_getOption('ActiveSEs', [])
    self.activeStorages = sortList(configuredSEs)
    gLogger.info("Will check the following storage elements: %s"
                 % str(self.activeStorages))

    self.logSE = self.am_getOption('TransformationLogSE', 'LogSE')
    gLogger.info("Will remove logs found on storage element: %s" % self.logSE)

    return S_OK()
def __init__(self, transClient=None, logger=None, submissionClient=None,
             jobMonitoringClient=None, outputDataModule=None, jobClass=None,
             opsH=None, destinationPlugin=None):
    """Set up the collaborators, building a default for each one not supplied.

    Any falsy argument is replaced by a default object or by a value read
    from the Operations helper.  ``jobClass`` defaults to ``Job``; VOs can
    pass in their own job class extension instead.
    """
    super(WorkflowTasks, self).__init__(
        transClient, logger or gLogger.getSubLogger('WorkflowTasks'))

    self.submissionClient = submissionClient or WMSClient()
    self.jobMonitoringClient = jobMonitoringClient or JobMonitoringClient()
    self.jobClass = jobClass or Job
    # opsH has to be set before the two lookups below, which go through it
    self.opsH = opsH or Operations()
    self.outputDataModule = outputDataModule or self.opsH.getValue(
        "Transformations/OutputDataModule", "")
    self.destinationPlugin = destinationPlugin or self.opsH.getValue(
        'Transformations/DestinationPlugin', 'BySE')

    # plugin objects, not created here
    self.destinationPlugin_o = None
    self.outputDataModule_o = None
def __sendKillCommand(self, job):
    """Send a kill command for a job through a client delegated to its owner.

    :param int job: ID of job to send kill command
    """
    jobDB = self.jobDB
    ownerDN = jobDB.getJobAttribute(job, 'OwnerDN')
    ownerGroup = jobDB.getJobAttribute(job, 'OwnerGroup')
    ownerKnown = ownerDN['OK'] and ownerGroup['OK']

    if ownerKnown:
        # Act with the identity of the job owner
        credentials = {'useCertificates': True,
                       'delegatedDN': ownerDN['Value'],
                       'delegatedGroup': ownerGroup['Value']}
        resKill = WMSClient(**credentials).killJob(job)
        if not resKill['OK']:
            details = "%s: %s" % (job, resKill['Message'])
            self.log.error("Failed to send kill command to job", details)
    else:
        details = "%s: %s, %s" % (job,
                                  ownerDN.get('Message', ''),
                                  ownerGroup.get('Message', ''))
        self.log.error("Failed to get ownerDN or Group for job:", details)
def test_matcher(self):
    """Submit a job, queue it by hand and check that the Matcher serves it.

    A hello-world job is submitted through the WMS client, forced into the
    ``Waiting`` status and inserted into the task queue DB.  A resource
    description is then sent to the Matcher service, whose answer must be
    OK.  The job is deleted at the end of the test.
    """
    # insert a proper DN to run the test
    ownerDN = '/C=ch/O=DIRAC/OU=DIRAC CI/CN=ciuser/[email protected]'
    resourceDescription = {
        'OwnerGroup': 'prod',
        'OwnerDN': ownerDN,
        'DIRACVersion': 'pippo',
        'ReleaseVersion': 'blabla',
        'VirtualOrganization': 'LHCb',
        'PilotInfoReportedFlag': 'True',
        'PilotBenchmark': 'anotherPilot',
        'Site': 'DIRAC.Jenkins.ch',
        'CPUTime': 86400}
    matcher = RPCClient('WorkloadManagement/Matcher')
    jobStateUpdate = RPCClient('WorkloadManagement/JobStateUpdate')
    wmsClient = WMSClient()

    job = helloWorldJob()
    job.setDestination('DIRAC.Jenkins.ch')
    job.setInputData('/a/bbb')
    job.setType('User')
    jobDescription = createFile(job)
    res = wmsClient.submitJob(job._toJDL(xmlFile=jobDescription))
    # Pass the service message along so a failure explains itself
    self.assertTrue(res['OK'], res.get('Message'))

    jobID = res['Value']

    res = jobStateUpdate.setJobStatus(jobID, 'Waiting', 'matching', 'source')
    self.assertTrue(res['OK'], res.get('Message'))

    tqDB = TaskQueueDB()
    tqDefDict = {'OwnerDN': ownerDN,
                 'OwnerGroup': 'prod',
                 'Setup': 'dirac-JenkinsSetup',
                 'CPUTime': 86400}
    res = tqDB.insertJob(jobID, tqDefDict, 10)
    self.assertTrue(res['OK'], res.get('Message'))

    res = matcher.requestJob(resourceDescription)
    # print() with a single argument behaves the same on Python 2 and 3
    print(res)
    self.assertTrue(res['OK'], res.get('Message'))
    wmsClient.deleteJob(jobID)
def test_ParametricChain(self):
    """Exercise the chain of a parametric job expanding into 3 actual jobs.

    The jobs are submitted, checked for their names, set to ``Done``,
    deleted, and finally expected to be in the ``Deleted`` status.
    """
    wms = WMSClient()
    stateUpdate = JobStateUpdateClient()
    monitoring = JobMonitoringClient()

    # create the job and submit it
    parametric = parametricJob()
    jdl = parametric._toJDL(xmlFile=createFile(parametric))
    submitted = wms.submitJob(jdl)
    self.assertTrue(submitted['OK'])
    jobIDList = submitted['Value']
    self.assertEqual(len(jobIDList), 3)

    # compare the job names as sets: the order does not matter
    namesResult = monitoring.getJobsParameters(jobIDList, ['JobName'])
    self.assertTrue(namesResult['OK'])
    namesPerJob = namesResult['Value']
    observed = set(namesPerJob[jobID]['JobName'] for jobID in namesPerJob)
    wanted = set('parametric_helloWorld_%s' % nJob for nJob in range(3))
    self.assertEqual(observed, wanted)

    for jobID in jobIDList:
        outcome = stateUpdate.setJobStatus(jobID, 'Done', 'matching', 'source')
        self.assertTrue(outcome['OK'])

    outcome = wms.deleteJob(jobIDList)
    self.assertTrue(outcome['OK'])

    for jobID in jobIDList:
        outcome = monitoring.getJobStatus(jobID)
        self.assertTrue(outcome['OK'])
        self.assertEqual(outcome['Value'], 'Deleted')
def test_ParametricChain(self):
    """Submit a parametric job and check the 3 jobs generated from it.

    After submission the job names are verified, each job is moved to the
    checking status, all jobs are deleted and their final status must be
    the deleted one.
    """
    wmsClient = WMSClient()
    jobStateUpdate = JobStateUpdateClient()
    jobMonitor = JobMonitoringClient()

    # create and submit the job
    job = parametricJob()
    jdl = job._toJDL(xmlFile=createFile(job))
    submitRes = wmsClient.submitJob(jdl)
    self.assertTrue(submitRes["OK"], submitRes.get("Message"))
    jobIDList = submitRes["Value"]
    self.assertEqual(len(jobIDList), 3, msg="Got %s" % str(jobIDList))

    namesRes = jobMonitor.getJobsParameters(jobIDList, ["JobName"])
    self.assertTrue(namesRes["OK"], namesRes.get("Message"))
    actualNames = {namesRes["Value"][jobID]["JobName"] for jobID in namesRes["Value"]}
    expectedNames = {"parametric_helloWorld_%s" % nJob for nJob in range(3)}
    self.assertEqual(actualNames, expectedNames)

    for jobID in jobIDList:
        statusRes = jobStateUpdate.setJobStatus(jobID, JobStatus.CHECKING, "checking", "source")
        self.assertTrue(statusRes["OK"], statusRes.get("Message"))

    deleteRes = wmsClient.deleteJob(jobIDList)
    self.assertTrue(deleteRes["OK"], deleteRes.get("Message"))
    print(deleteRes)

    for jobID in jobIDList:
        statusRes = jobMonitor.getJobsStatus(jobID)
        self.assertTrue(statusRes["OK"], statusRes.get("Message"))
        reported = statusRes["Value"][jobID]["Status"]
        self.assertEqual(reported, JobStatus.DELETED, msg="Got %s" % str(statusRes["Value"]))
def __init__(self, agentName, baseAgentName=False, properties=None):
    """ c'tor

    :param self: self reference
    :param str agentName: name of agent
    :param bool baseAgentName: forwarded unchanged to ``AgentModule``
    :param dict properties: forwarded to ``AgentModule``; a fresh empty
        dict is used when omitted
    """
    # A ``dict()`` default would be created once and shared by every call
    # that omits the argument, so build a new one per instance instead.
    if properties is None:
        properties = {}
    AgentModule.__init__(self, agentName, baseAgentName, properties)
    ## replica manager
    self.replicaManager = ReplicaManager()
    ## transformation client
    self.transClient = TransformationClient()
    ## wms client
    self.wmsClient = WMSClient()
    ## request client
    self.requestClient = RequestClient()
    ## file catalog client
    self.metadataClient = FileCatalogClient()
    ## storage usage agent
    self.storageUsageClient = StorageUsageClient()

    ## placeholders for CS options

    ## transformations types
    self.transformationTypes = None
    ## directory locations
    self.directoryLocations = None
    ## transformation metadata
    self.transfidmeta = None
    ## archive period in days
    self.archiveAfter = None
    ## active SEs
    self.activeStorages = None
    ## transformation log SEs
    self.logSE = None
    ## enable/disable execution
    self.enableFlag = None
def main():
    """Reset the failed jobs of a user group, optionally limited to one site.

    Command line: ``resetjobs.py <user group> [<site>]``.  The jobs of the
    group in status ``Failed`` are looked up through the JobMonitoring
    service and reset one by one; at most ``maxJobsPerRun`` jobs are
    handled per invocation.  Exits with status 1 when the arguments are
    missing or the job list cannot be retrieved.
    """
    # Upper bound on the number of jobs reset in a single run
    maxJobsPerRun = 500

    if len(sys.argv) < 2:
        print("At least one parameter (user group, e.g. dune_user) expected, got %s !"
              % (len(sys.argv) - 1))
        print("Usage: resetjobs.py <user group> -or- resetjobs.py <user group> <site>")
        print("Example: ./resetjobs.py dune_user LCG.UKI-LT2-IC-HEP.uk")
        print("Only available to dirac_admin.")
        sys.exit(1)

    # selection of the jobs to reset
    jobFilter = {'OwnerGroup': str(sys.argv[1]),
                 'Status': 'Failed'}
    if len(sys.argv) == 3:
        jobFilter['Site'] = str(sys.argv[2])

    print(jobFilter)

    rpcClient = RPCClient("WorkloadManagement/JobMonitoring")
    jobs = rpcClient.getJobs(jobFilter)
    if not jobs["OK"]:
        print("Could not retrieve jobs.")
        sys.exit(1)

    job_ids = jobs["Value"]
    print("%s matching jobs found." % len(job_ids))

    if len(job_ids) > maxJobsPerRun:
        # The script resets jobs, it does not delete them
        print("Will reset the first %d jobs, please rerun script to reset more."
              % maxJobsPerRun)

    wmsClient = WMSClient()

    for jobid in job_ids[:maxJobsPerRun]:
        res = wmsClient.resetJob(int(jobid))
        if not res['OK']:
            print("Could not reset job %s" % jobid)
def initialize(self):
    """Read the agent configuration and create the service clients.

    Options are read from the agent section (``am_getOption``) or from the
    Operations helper; the current attribute values act as defaults.

    :param self: self reference
    :return: S_OK()
    """
    log = self.log

    # shifter proxy
    # See cleanCatalogContents method: this proxy will be used ALSO when the file catalog used
    # is the DIRAC File Catalog (DFC).
    # This is possible because of unset of the "UseServerCertificate" option
    self.shifterProxy = self.am_getOption('shifterProxy', None)

    # transformation types: the agent option wins over the Operations values
    self.dataProcTTypes = Operations().getValue('Transformations/DataProcessing',
                                                self.dataProcTTypes)
    self.dataManipTTypes = Operations().getValue('Transformations/DataManipulation',
                                                 self.dataManipTTypes)
    agentTSTypes = self.am_getOption('TransformationTypes', [])
    self.transformationTypes = sorted(
        agentTSTypes or (self.dataProcTTypes + self.dataManipTTypes))
    log.info("Will consider the following transformation types: %s"
             % str(self.transformationTypes))

    # directory locations
    locations = self.am_getOption('DirectoryLocations', self.directoryLocations)
    self.directoryLocations = sorted(locations)
    log.info("Will search for directories in the following locations: %s"
             % str(self.directoryLocations))

    # transformation metadata
    self.transfidmeta = self.am_getOption('TransfIDMeta', self.transfidmeta)
    log.info("Will use %s as metadata tag name for TransformationID" % self.transfidmeta)

    # archive period in days
    self.archiveAfter = self.am_getOption('ArchiveAfter', self.archiveAfter)
    log.info("Will archive Completed transformations after %d days" % self.archiveAfter)

    # active SEs, logged only when some are configured
    storages = self.am_getOption('ActiveSEs', self.activeStorages)
    self.activeStorages = sorted(storages)
    if self.activeStorages:
        log.info("Will check the following storage elements: %s"
                 % str(self.activeStorages))

    # transformation log SEs
    self.logSE = Operations().getValue('/LogStorage/LogSE', self.logSE)
    log.info("Will remove logs found on storage element: %s" % self.logSE)

    # clients: transformation, WMS, request and file catalog
    self.transClient = TransformationClient()
    self.wmsClient = WMSClient()
    self.reqClient = ReqClient()
    self.metadataClient = FileCatalogClient()

    return S_OK()
def initialize(self):
    """Read the agent configuration and create the service clients.

    Options come from the agent section (``am_getOption``) or from the
    Operations helper, with the defaults spelled out below.

    :param self: self reference
    :return: S_OK()
    """
    info = self.log.info

    # shifter proxy
    self.am_setOption('shifterProxy', 'DataManager')

    # transformation types: the agent option wins over the Operations values
    self.dataProcTTypes = Operations().getValue('Transformations/DataProcessing',
                                                ['MCSimulation', 'Merge'])
    self.dataManipTTypes = Operations().getValue('Transformations/DataManipulation',
                                                 ['Replication', 'Removal'])
    agentTSTypes = self.am_getOption('TransformationTypes', [])
    self.transformationTypes = sorted(
        agentTSTypes or (self.dataProcTTypes + self.dataManipTTypes))
    info("Will consider the following transformation types: %s"
         % str(self.transformationTypes))

    # directory locations
    locations = self.am_getOption('DirectoryLocations',
                                  ['TransformationDB', 'MetadataCatalog'])
    self.directoryLocations = sorted(locations)
    info("Will search for directories in the following locations: %s"
         % str(self.directoryLocations))

    # transformation metadata
    self.transfidmeta = self.am_getOption('TransfIDMeta', "TransformationID")
    info("Will use %s as metadata tag name for TransformationID" % self.transfidmeta)

    # archive period in days
    self.archiveAfter = self.am_getOption('ArchiveAfter', 7)
    info("Will archive Completed transformations after %d days" % self.archiveAfter)

    # active SEs
    storages = self.am_getOption('ActiveSEs', [])
    self.activeStorages = sorted(storages)
    info("Will check the following storage elements: %s" % str(self.activeStorages))

    # transformation log SEs
    self.logSE = Operations().getValue('/LogStorage/LogSE', 'LogSE')
    info("Will remove logs found on storage element: %s" % self.logSE)

    # enable/disable execution, should be using CS option Status?? with default value as 'Active'??
    self.enableFlag = self.am_getOption('EnableFlag', 'True')

    # clients: transformation, WMS, request and file catalog
    self.transClient = TransformationClient()
    self.wmsClient = WMSClient()
    self.reqClient = ReqClient()
    self.metadataClient = FileCatalogClient()

    return S_OK()
def __init__(self, transClient=None, logger=None, submissionClient=None,
             jobMonitoringClient=None, outputDataModule=None, jobClass=None,
             opsH=None, destinationPlugin=None, ownerDN=None, ownerGroup=None):
    """Set up the collaborators, building a default for each one not supplied.

    Any falsy argument is replaced by a default object or by a value read
    from the Operations helper.  ``jobClass`` defaults to ``Job``; VOs can
    pass in their own job class extension instead.  The default submission
    client is created with ``ownerDN``/``ownerGroup`` as delegated
    identity, and uses certificates only when both are given.
    """
    super(WorkflowTasks, self).__init__(
        transClient, logger or gLogger.getSubLogger('WorkflowTasks'))

    # certificates are only used when a complete owner identity is provided
    useCertificates = bool(ownerDN) and bool(ownerGroup)

    self.submissionClient = submissionClient or WMSClient(
        useCertificates=useCertificates,
        delegatedDN=ownerDN,
        delegatedGroup=ownerGroup)
    self.jobMonitoringClient = jobMonitoringClient or JobMonitoringClient()
    self.jobClass = jobClass or Job
    # opsH has to be set before the two lookups below, which go through it
    self.opsH = opsH or Operations()
    self.outputDataModule = outputDataModule or self.opsH.getValue(
        "Transformations/OutputDataModule", "")
    self.destinationPlugin = destinationPlugin or self.opsH.getValue(
        'Transformations/DestinationPlugin', 'BySE')

    # plugin objects, not created here
    self.destinationPlugin_o = None
    self.outputDataModule_o = None
def __init__(self, *args, **kwargs):
    """Construct the agent with a WMS-backed ``WorkflowTasks`` task manager.

    All arguments are forwarded to ``TaskManagerAgentBase``.  The
    transformation types come from the ``TransType`` agent option or, when
    that is empty, from ``Transformations/DataProcessing`` in Operations.
    """
    TaskManagerAgentBase.__init__(self, *args, **kwargs)

    wmsClient = WMSClient()
    self.submissionClient = wmsClient
    self.taskManager = WorkflowTasks(transClient=self.transClient,
                                     submissionClient=wmsClient)
    self.shifterProxy = 'ProductionManager'

    configuredTypes = self.am_getOption('TransType', [])
    # Operations is only consulted when the agent option is empty
    self.transType = configuredTypes or Operations().getValue(
        'Transformations/DataProcessing', ['MCSimulation', 'Merge'])
def __init__(self, transClient=None, logger=None, submissionClient=None,
             jobMonitoringClient=None, outputDataModule=None, jobClass=None,
             opsH=None):
    """Set up the collaborators, building a default for each one not supplied.

    The default classes are imported only when they are actually needed.
    ``jobClass`` defaults to ``DIRAC.Interfaces.API.Job.Job``; VOs can pass
    in their own job class extension instead.
    """
    super(WorkflowTasks, self).__init__(
        transClient, logger or gLogger.getSubLogger('WorkflowTasks'))

    if submissionClient:
        self.submissionClient = submissionClient
    else:
        from DIRAC.WorkloadManagementSystem.Client.WMSClient import WMSClient
        self.submissionClient = WMSClient()

    if jobMonitoringClient:
        self.jobMonitoringClient = jobMonitoringClient
    else:
        from DIRAC.WorkloadManagementSystem.Client.JobMonitoringClient import JobMonitoringClient
        self.jobMonitoringClient = JobMonitoringClient()

    if outputDataModule:
        self.outputDataModule = outputDataModule
    else:
        self.outputDataModule = gConfig.getValue("/DIRAC/VOPolicy/OutputDataModule", "")

    if jobClass:
        self.jobClass = jobClass
    else:
        from DIRAC.Interfaces.API.Job import Job
        self.jobClass = Job

    if opsH:
        self.opsH = opsH
    else:
        from DIRAC.ConfigurationSystem.Client.Helpers.Operations import Operations
        self.opsH = Operations()
def __init__(self, agentName, baseAgentName=False, properties=None):
    """ c'tor

    :param self: self reference
    :param str agentName: name of agent
    :param bool baseAgentName: forwarded unchanged to ``AgentModule``
    :param dict properties: forwarded to ``AgentModule``; a fresh empty
        dict is used when omitted
    """
    # A ``dict()`` default would be created once and shared by every call
    # that omits the argument, so build a new one per instance instead.
    if properties is None:
        properties = {}
    AgentModule.__init__(self, agentName, baseAgentName, properties)
    ## replica manager
    self.replicaManager = ReplicaManager()
    ## transformation client
    self.transClient = TransformationClient()
    ## wms client
    self.wmsClient = WMSClient()
    ## request client
    self.requestClient = RequestClient()
    ## file catalog client
    self.metadataClient = FileCatalogClient()
    ## storage usage agent
    self.storageUsageClient = StorageUsageClient()

    ## placeholders for CS options

    ## transformations types
    self.transformationTypes = None
    ## directory locations
    self.directoryLocations = None
    ## transformation metadata
    self.transfidmeta = None
    ## archive period in days
    self.archiveAfter = None
    ## active SEs
    self.activeStorages = None
    ## transformation log SEs
    self.logSE = None
    ## enable/disable execution
    self.enableFlag = None
def removeDeletedJobs(self):
    """Fully remove the jobs in status "DELETED" that have no pending requests.

    The sandboxes of the deleted jobs are unassigned first.  Jobs whose
    request is not in a final state, or whose request could not be
    deleted, are kept.  The remaining jobs are removed owner by owner with
    a client delegated to that owner.

    :returns: S_OK/S_ERROR
    """
    deletedJobs = self._getJobsList({"Status": JobStatus.DELETED})
    if not deletedJobs["OK"]:
        return deletedJobs
    jobList = deletedJobs["Value"]
    if not jobList:
        self.log.info("No jobs to remove")
        return S_OK()

    self.log.info("Unassigning sandboxes from soon to be deleted jobs", "(%d)" % len(jobList))
    unassigned = SandboxStoreClient(useCertificates=True).unassignJobs(jobList)
    if not unassigned["OK"]:
        self.log.error("Cannot unassign jobs to sandboxes", unassigned["Message"])
        return unassigned

    self.log.info("Attempting to remove deleted jobs", "(%d)" % len(jobList))

    # remove from jobList those that have still Operations to do in RMS
    reqClient = ReqClient()
    requestIDs = reqClient.getRequestIDsForJobs(jobList)
    if not requestIDs["OK"]:
        return requestIDs
    jobsWithRequest = requestIDs["Value"]["Successful"]
    if jobsWithRequest:
        notFinal = set()
        for job, reqID in jobsWithRequest.items():
            requestStatus = reqClient.getRequestStatus(reqID).get("Value")
            if requestStatus in Request.FINAL_STATES:
                # The request is over: remove it, and keep the job if that fails
                if not reqClient.deleteRequest(reqID)["OK"]:
                    notFinal.add(job)
            else:
                # The request is still active: keep that job
                notFinal.add(job)
        if notFinal:
            self.log.info(
                "Some jobs won't be removed, as still having Requests not in final status",
                "(n=%d)" % len(notFinal))
            jobList = list(set(jobList) - notFinal)
    if not jobList:
        return S_OK()

    failed = False
    for owner, jobsList in self._getOwnerJobsDict(jobList).items():
        ownerFields = owner.split(";")
        ownerDN = ownerFields[0]
        ownerGroup = ownerFields[1]
        self.log.verbose(
            "Attempting to remove jobs",
            "(n=%d) for %s : %s" % (len(jobsList), ownerDN, ownerGroup))
        delegatedClient = WMSClient(useCertificates=True,
                                    delegatedDN=ownerDN,
                                    delegatedGroup=ownerGroup)
        removal = delegatedClient.removeJob(jobsList)
        if not removal["OK"]:
            self.log.error(
                "Could not remove jobs",
                "for %s : %s (n=%d) : %s" % (ownerDN, ownerGroup, len(jobsList), removal["Message"]),
            )
            failed = True

    return S_ERROR() if failed else S_OK()
class JobLaunchpadHandler(WebHandler):
    """Web handler of the job launchpad.

    Exposes the proxy status of the logged-in user, the launchpad form
    options read from the configuration, and the job submission itself.
    """

    AUTH_PROPS = "authenticated"

    def __init__(self, *args, **kwargs):
        """Store the user name, group and VO taken from the session data."""
        super(JobLaunchpadHandler, self).__init__(*args, **kwargs)
        sessionData = self.getSessionData()
        self.user = sessionData['user'].get('username', '')
        self.group = sessionData['user'].get('group', '')
        self.vo = getVOForGroup(self.group)

    def web_getProxyStatus(self):
        """Write the proxy status of the current user to the response."""
        self.write(self.__getProxyStatus())

    def __getProxyStatus(self, secondsOverride=None):
        """Tell whether the current user has a long enough proxy uploaded.

        :param secondsOverride: accepted for interface compatibility.
            NOTE(review): the value is not used; the required lifetime is
            always read from /Registry/DefaultProxyLifeTime (default
            24h + 1min) -- confirm whether callers expect it to apply.
        :return: dict with "success" ("true"/"false") and either "result"
            ("true"/"false") or "error"
        """
        from DIRAC.FrameworkSystem.Client.ProxyManagerClient import ProxyManagerClient

        proxyManager = ProxyManagerClient()
        userData = self.getSessionData()

        group = str(userData["user"]["group"])
        if group == "visitor":
            return {"success": "false",
                    "error": "User is anonymous or is not registered in the system"}

        userDN = str(userData["user"]["DN"])
        defaultSeconds = 24 * 3600 + 60  # 24H + 1min
        validSeconds = gConfig.getValue("/Registry/DefaultProxyLifeTime", defaultSeconds)

        gLogger.info("\033[0;31m userHasProxy(%s, %s, %s) \033[0m" % (userDN, group, validSeconds))
        result = proxyManager.userHasProxy(userDN, group, validSeconds)
        # This log line used to sit after the return statements and never ran
        gLogger.info("\033[0;31m PROXY: \033[0m", str(result))

        if not result["OK"]:
            return {"success": "false", "error": "false"}
        if result["Value"]:
            return {"success": "true", "result": "true"}
        return {"success": "true", "result": "false"}

    def __getPlatform(self):
        """Return the platform names under /Resources/Computing/OSCompatibility.

        :return: list of option names, or False when the section cannot be read
        """
        gLogger.info("start __getPlatform")

        path = "/Resources/Computing/OSCompatibility"
        result = gConfig.getOptionsDict(path)
        gLogger.debug(result)
        if not result["OK"]:
            return False

        platform = list(result["Value"])
        gLogger.debug("platform: %s" % platform)
        gLogger.info("end __getPlatform")
        return platform

    def __getOptionsFromCS(self, path="/WebApp/Launchpad/Options", delimiter=","):
        """Read the options below ``path``, descending into its sections.

        Every option value is split on ``delimiter`` into a list; every
        sub-section is stored under its name as the result of the same
        lookup on that section.

        :return: dict of options, or an empty list when ``path`` cannot be read
        """
        gLogger.info("start __getOptionsFromCS")

        result = gConfig.getOptionsDict(path)
        gLogger.always(result)
        if not result["OK"]:
            return []

        options = result["Value"]
        for name in list(options):
            options[name] = options[name].split(delimiter)

        result = gConfig.getSections(path)
        if result["OK"]:
            for section in result["Value"]:
                options[section] = self.__getOptionsFromCS(path + '/' + section, delimiter)

        gLogger.always("options: %s" % options)
        gLogger.info("end __getOptionsFromCS")
        return options

    def web_getLaunchpadOpts(self):
        """Write the launchpad defaults and the predefined parameter sets.

        The built-in defaults are overridden by the first value of each
        option found in the configuration; the predefined sets are the
        sections below ``Launchpad`` in the Operations of the user's VO.
        """
        defaultParams = {"JobName": [1, 'DIRAC'],
                         "Executable": [1, "/bin/ls"],
                         "Arguments": [1, "-ltrA"],
                         "OutputSandbox": [1, "std.out, std.err"],
                         "JobGroup": [0, "Unknown"],
                         "InputData": [0, ""],
                         "OutputData": [0, ""],
                         "OutputSE": [0, "DIRAC-USER"],
                         "OutputPath": [0, ""],
                         "CPUTime": [0, "86400"],
                         "Site": [0, ""],
                         "BannedSite": [0, ""],
                         "Platform": [0, "Linux_x86_64_glibc-2.12"],
                         "Priority": [0, "5"],
                         "StdError": [0, "std.err"],
                         "StdOutput": [0, "std.out"],
                         "Parameters": [0, "0"],
                         "ParameterStart": [0, "0"],
                         "ParameterStep": [0, "1"],
                         "ParameterFactor": [0, "0"]}

        delimiter = gConfig.getValue("/WebApp/Launchpad/ListSeparator", ',')
        options = self.__getOptionsFromCS(delimiter=delimiter)

        gLogger.debug("Combined options from CS: %s" % options)
        gLogger.info("end __getLaunchpadOpts")

        # Updating the default values from OptionsOverride configuration branch
        for key in options:
            values = options[key]
            # Sub-sections come back as dicts and have no "first value"
            if not isinstance(values, list) or not values:
                continue
            if key not in defaultParams:
                defaultParams[key] = [0, ""]
            defaultParams[key][1] = values[0]

        # Reading of the predefined sets of launchpad parameters values
        obj = Operations(vo=self.vo)
        predefinedSets = {}

        launchpadSections = obj.getSections("Launchpad")
        if launchpadSections['OK']:
            for section in launchpadSections["Value"]:
                predefinedSets[section] = {}
                sectionOptions = obj.getOptionsDict("Launchpad/" + section)
                gLogger.debug("Launchpad/%s: %s" % (section, sectionOptions))
                if sectionOptions['OK']:
                    predefinedSets[section] = sectionOptions["Value"]

        self.write({"success": "true", "result": defaultParams, "predefinedSets": predefinedSets})

    def __canRunJobs(self):
        """Return True when the session user holds the NormalUser property."""
        userData = self.getSessionData()["user"]
        return "properties" in userData and "NormalUser" in userData["properties"]

    def __paramsToJDL(self, params):
        """Turn the submitted form parameters into a JDL fragment.

        ``OutputSandbox`` is written as a list, ``Parameters`` as a quoted
        integer or a ``{...}`` vector, and anything else as a quoted string.

        :return: ``(jdl, None)`` on success, ``(None, errorMessage)`` otherwise
        """
        jdl = ""
        for item in params:
            value = str(params[item])
            if item == "OutputSandbox":
                jdl += str(item) + " = {" + value + "};"
            elif item == "Parameters":
                try:
                    jdl += str(item) + " = \"" + str(int(params[item])) + "\";"
                except (TypeError, ValueError):
                    if value.find("{") < 0 or value.find("}") < 0:
                        return None, "Parameters must be an integer or a vector. Example: 4 or {1,2,3,4}"
                    vector = value.rstrip("}").lstrip("{")
                    if len(vector) == 0:
                        return None, "Parameters vector has zero length"
                    jdl += str(item) + " = {" + vector + "};"
            else:
                jdl += str(item) + " = \"" + value + "\";"
        return jdl, None

    @asyncGen
    def web_jobSubmit(self):
        """Build a JDL from the request and submit it to the WMS.

        Uploaded files are saved to a temporary directory and added,
        together with the ``lfnField*`` arguments, to the input sandbox.
        The temporary directory is removed once the submission is over and
        the outcome is sent back as the response.
        """
        import os
        import shutil
        import tempfile

        if not self.__canRunJobs():
            self.finish({"success": "false", "error": "You are not allowed to run the jobs"})
            return
        proxy = yield self.threadTask(self.__getProxyStatus, 86460)
        if proxy["success"] == "false" or proxy["result"] == "false":
            self.finish({"success": "false",
                         "error": "You can not run a job: your proxy is valid less then 24 hours"})
            return

        # Split the request arguments into LFNs and plain job parameters
        params = {}
        lfns = []
        for name in self.request.arguments:
            try:
                value = self.request.arguments[name][0]
                if len(value) == 0:
                    continue
                if name[:8] == "lfnField":
                    if len(value.strip()) > 0:
                        lfns.append("LFN:" + value)
                else:
                    params[name] = value
            except Exception as excp:  # best effort: skip what cannot be read
                gLogger.debug("Skipping request argument %s: %s" % (name, excp))

        jdl, error = self.__paramsToJDL(params)
        if error:
            self.finish({"success": "false", "error": error})
            return

        # Uploaded files that actually carry a file name
        store = []
        for key in self.request.files:
            try:
                uploaded = self.request.files[key][0]
                if uploaded.filename:
                    gLogger.info("\033[0;31m file - %s \033[0m " % uploaded.filename)
                    store.append(uploaded)
            except Exception as excp:  # best effort: skip what cannot be read
                gLogger.debug("Skipping uploaded file %s: %s" % (key, excp))

        gLogger.info("\033[0;31m *** %s \033[0m " % params)

        storePath = None
        fileNameList = []
        callback = {}
        try:
            if store:  # there are files for the sandbox
                storePath = tempfile.mkdtemp(prefix='DIRAC_')
                try:
                    for fileObj in store:
                        # Only the base name is kept, so that a crafted file
                        # name cannot point outside the temporary directory
                        name = os.path.join(storePath, os.path.basename(fileObj.filename))
                        with open(name, 'wb') as tFile:
                            tFile.write(fileObj.body)
                        fileNameList.append(name)
                except Exception as x:
                    callback = {"success": "false",
                                "error": "An EXCEPTION happens during saving your sandbox file(s): %s" % str(x)}

            if not callback:
                if fileNameList or lfns:
                    jdl = jdl + "InputSandbox = {\"" + "\",\"".join(fileNameList + lfns) + "\"};"

                from DIRAC.WorkloadManagementSystem.Client.WMSClient import WMSClient

                jobManager = WMSClient(useCertificates=True, timeout=1800)
                jdl = str(jdl)
                gLogger.info("J D L : ", jdl)
                try:
                    result = yield self.threadTask(jobManager.submitJob, jdl)
                    if result["OK"]:
                        callback = {"success": "true", "result": result["Value"]}
                    else:
                        callback = {"success": "false", "error": result["Message"]}
                except Exception as x:
                    callback = {"success": "false",
                                "error": "An EXCEPTION happens during job submittion: %s" % str(x)}
        finally:
            # The saved sandbox files are not needed after the submission
            if storePath:
                shutil.rmtree(storePath, ignore_errors=True)

        self.finish(callback)
class TransformationCleaningAgent(AgentModule):
    """Agent cleaning, removing the output of, and archiving transformations.

    .. class:: TransformationCleaningAgent

    :param ~DIRAC.DataManagementSystem.Client.DataManager.DataManager dm: DataManager instance
    :param ~TransformationClient.TransformationClient transClient: TransformationClient instance
    :param ~FileCatalogClient.FileCatalogClient metadataClient: FileCatalogClient instance
    """

    def __init__(self, *args, **kwargs):
        """c'tor"""
        AgentModule.__init__(self, *args, **kwargs)

        self.shifterProxy = None

        # clients, all instantiated in initialize()
        self.transClient = None
        self.wmsClient = None
        self.reqClient = None
        self.metadataClient = None
        self.jobMonitoringClient = None

        # transformation types considered by the agent, resolved in initialize()
        self.transformationTypes = None
        # directory locations
        self.directoryLocations = ["TransformationDB", "MetadataCatalog"]
        # transformation metadata
        self.transfidmeta = "TransformationID"
        # archive period in days
        self.archiveAfter = 7
        # transformation log SEs
        self.logSE = "LogSE"
        # enable/disable execution (compared as the string "True" in execute())
        self.enableFlag = "True"

        self.dataProcTTypes = ["MCSimulation", "Merge"]
        self.dataManipTTypes = ["Replication", "Removal"]

    def initialize(self):
        """agent initialisation

        reading and setting config opts

        :param self: self reference
        :return: S_OK()
        """
        # shifter proxy
        # See cleanContent method: this proxy will be used ALSO when the file catalog used
        # is the DIRAC File Catalog (DFC).
        # This is possible because of unset of the "UseServerCertificate" option
        self.shifterProxy = self.am_getOption("shifterProxy", self.shifterProxy)

        # transformations types
        self.dataProcTTypes = Operations().getValue("Transformations/DataProcessing", self.dataProcTTypes)
        self.dataManipTTypes = Operations().getValue("Transformations/DataManipulation", self.dataManipTTypes)
        agentTSTypes = self.am_getOption("TransformationTypes", [])
        if agentTSTypes:
            self.transformationTypes = sorted(agentTSTypes)
        else:
            self.transformationTypes = sorted(self.dataProcTTypes + self.dataManipTTypes)
        self.log.info("Will consider the following transformation types: %s" % str(self.transformationTypes))

        # directory locations
        self.directoryLocations = sorted(self.am_getOption("DirectoryLocations", self.directoryLocations))
        self.log.info("Will search for directories in the following locations: %s" % str(self.directoryLocations))

        # transformation metadata
        self.transfidmeta = self.am_getOption("TransfIDMeta", self.transfidmeta)
        self.log.info("Will use %s as metadata tag name for TransformationID" % self.transfidmeta)

        # archive period in days
        self.archiveAfter = self.am_getOption("ArchiveAfter", self.archiveAfter)  # days
        self.log.info("Will archive Completed transformations after %d days" % self.archiveAfter)

        # transformation log SEs
        self.logSE = Operations().getValue("/LogStorage/LogSE", self.logSE)
        self.log.info("Will remove logs found on storage element: %s" % self.logSE)

        # clients
        self.transClient = TransformationClient()
        self.wmsClient = WMSClient()
        self.reqClient = ReqClient()
        self.metadataClient = FileCatalogClient()
        self.jobMonitoringClient = JobMonitoringClient()

        return S_OK()

    #############################################################################

    def _runWithProxy(self, method, transDict, action):
        """Run ``method(transDict)`` with the shifter proxy, or else with the author's proxy.

        When no shifter proxy is configured the call is wrapped in
        ``executeWithUserProxy`` using the transformation's AuthorDN/AuthorGroup.
        The result of ``method`` is not looked at, the ``_execute*`` methods log
        their own problems.

        :param callable method: one of the ``_execute*`` bound methods
        :param dict transDict: transformation dictionary; ``TransformationID``, ``AuthorDN``
                               and ``AuthorGroup`` are read when no shifter proxy is set
        :param str action: beginning of the info message logged in the user-proxy case
        """
        if self.shifterProxy:
            method(transDict)
            return
        self.log.info((action + " %(TransformationID)s with %(AuthorDN)s, %(AuthorGroup)s") % transDict)
        executeWithUserProxy(method)(
            transDict, proxyUserDN=transDict["AuthorDN"], proxyUserGroup=transDict["AuthorGroup"])

    def execute(self):
        """execution in one agent's cycle

        Handles, in this order, the transformations in status Cleaning, RemovingFiles
        and Completed (the latter only if not updated for ``archiveAfter`` days).
        A failing query is logged and does not prevent the next one.

        :param self: self reference
        :return: S_OK()
        """
        self.enableFlag = self.am_getOption("EnableFlag", self.enableFlag)
        if self.enableFlag != "True":
            self.log.info("TransformationCleaningAgent is disabled by configuration option EnableFlag")
            return S_OK("Disabled via CS flag")

        # Obtain the transformations in Cleaning status and remove any mention of the jobs/files
        res = self.transClient.getTransformations({"Status": "Cleaning", "Type": self.transformationTypes})
        if res["OK"]:
            for transDict in res["Value"]:
                self._runWithProxy(self._executeClean, transDict, "Cleaning transformation")
        else:
            self.log.error("Failed to get transformations", res["Message"])

        # Obtain the transformations in RemovingFiles status and removes the output files
        res = self.transClient.getTransformations({"Status": "RemovingFiles", "Type": self.transformationTypes})
        if res["OK"]:
            for transDict in res["Value"]:
                self._runWithProxy(self._executeRemoval, transDict, "Removing files for transformation")
        else:
            self.log.error("Could not get the transformations", res["Message"])

        # Obtain the transformations in Completed status and archive if inactive for X days
        olderThanTime = datetime.utcnow() - timedelta(days=self.archiveAfter)
        res = self.transClient.getTransformations(
            {"Status": "Completed", "Type": self.transformationTypes},
            older=olderThanTime,
            timeStamp="LastUpdate")
        if res["OK"]:
            for transDict in res["Value"]:
                self._runWithProxy(self._executeArchive, transDict, "Archiving files for transformation")
        else:
            self.log.error("Could not get the transformations", res["Message"])

        return S_OK()

    def finalize(self):
        """Only at finalization: will clean ancient transformations (remnants)

        1) get the transformation IDs of jobs that are older than 1 year
        2) find the status of those transformations. Those "Cleaned" and "Archived" will be
           cleaned and archived (again)

        Why doing this here? Basically, it's a race:

        1) the production manager submits a transformation
        2) the TransformationAgent, and a bit later the WorkflowTaskAgent, put such transformation
           in their internal queue, so eventually during their (long-ish) cycle they'll work on it.
        3) 1 minute after creating the transformation, the production manager cleans it (by hand,
           for whatever reason). So, the status is changed to "Cleaning"
        4) the TransformationCleaningAgent cleans what has been created (maybe, nothing),
           then sets the transformation status to "Cleaned" or "Archived"
        5) a bit later the TransformationAgent, and later the WorkflowTaskAgent, kick in,
           creating tasks and jobs for a production that's effectively cleaned (but these 2 agents
           don't know yet).

        Of course, one could make one final check in TransformationAgent or WorkflowTaskAgent,
        but these 2 agents are already doing a lot of stuff, and are pretty heavy.
        So, we should just clean from time to time.
        What I added here is done only when the agent finalize, and it's quite light-ish operation anyway.

        :return: S_OK(), or the S_ERROR of the first failing query/removal
        """
        res = self.jobMonitoringClient.getJobGroups(None, datetime.utcnow() - timedelta(days=365))
        if not res["OK"]:
            self.log.error("Failed to get job groups", res["Message"])
            return res
        transformationIDs = res["Value"]
        if not transformationIDs:
            return S_OK()

        res = self.transClient.getTransformations({"TransformationID": transformationIDs})
        if not res["OK"]:
            self.log.error("Failed to get transformations", res["Message"])
            return res
        transformations = res["Value"]
        toClean = [transDict for transDict in transformations if transDict["Status"] == "Cleaned"]
        toArchive = [transDict for transDict in transformations if transDict["Status"] == "Archived"]

        for transDict in toClean:
            self._runWithProxy(self._executeClean, transDict, "Cleaning transformation")
        for transDict in toArchive:
            self._runWithProxy(self._executeArchive, transDict, "Archiving files for transformation")

        # Remove JobIDs that were unknown to the TransformationSystem
        jobGroupsToCheck = [str(transDict["TransformationID"]).zfill(8) for transDict in toClean + toArchive]
        if not jobGroupsToCheck:
            # Nothing was cleaned or archived: do not query the WMS with an empty JobGroup
            # condition, whose selection semantics cannot be told from here.
            return S_OK()
        res = self.jobMonitoringClient.getJobs({"JobGroup": jobGroupsToCheck})
        if not res["OK"]:
            return res
        jobIDsToRemove = [int(jobID) for jobID in res["Value"]]
        res = self.__removeWMSTasks(jobIDsToRemove)
        if not res["OK"]:
            return res

        return S_OK()

    def _executeClean(self, transDict):
        """Clean transformation.

        :param dict transDict: transformation dictionary, ``Type`` and ``TransformationID`` are read
        """
        transID = transDict["TransformationID"]
        # if transformation is of type `Replication` or `Removal`, there is nothing to clean.
        # We just archive
        if transDict["Type"] in self.dataManipTTypes:
            res = self.archiveTransformation(transID)
            if not res["OK"]:
                self.log.error("Problems archiving transformation", "%s: %s" % (transID, res["Message"]))
        else:
            res = self.cleanTransformation(transID)
            if not res["OK"]:
                self.log.error("Problems cleaning transformation", "%s: %s" % (transID, res["Message"]))

    def _executeRemoval(self, transDict):
        """Remove files from given transformation.

        :param dict transDict: transformation dictionary, ``TransformationID`` is read
        """
        res = self.removeTransformationOutput(transDict["TransformationID"])
        if not res["OK"]:
            self.log.error("Problems removing transformation",
                           "%s: %s" % (transDict["TransformationID"], res["Message"]))

    def _executeArchive(self, transDict):
        """Archive the given transformation.

        :param dict transDict: transformation dictionary, ``TransformationID`` is read
        :return: S_OK(), problems are only logged
        """
        res = self.archiveTransformation(transDict["TransformationID"])
        if not res["OK"]:
            self.log.error("Problems archiving transformation",
                           "%s: %s" % (transDict["TransformationID"], res["Message"]))
        return S_OK()

    #############################################################################
    #
    # Get the transformation directories for checking
    #

    def getTransformationDirectories(self, transID):
        """get the directories for the supplied transformation from the transformation system.

        These directories are used by removeTransformationOutput and cleanTransformation
        for removing output.

        :param self: self reference
        :param int transID: transformation ID
        :return: S_OK(sorted list of directories) or the S_ERROR of the failing lookup
        """
        self.log.verbose("Cleaning Transformation directories of transformation %d" % transID)
        directories = []

        if "TransformationDB" in self.directoryLocations:
            res = self.transClient.getTransformationParameters(transID, ["OutputDirectories"])
            if not res["OK"]:
                self.log.error("Failed to obtain transformation directories", res["Message"])
                return res
            transDirectories = []
            if res["Value"]:
                if not isinstance(res["Value"], list):
                    try:
                        transDirectories = ast.literal_eval(res["Value"])
                    except Exception:
                        # It can happen if the res['Value'] is '/a/b/c' instead of '["/a/b/c"]'
                        transDirectories.append(res["Value"])
                else:
                    transDirectories = res["Value"]
            directories = self._addDirs(transID, transDirectories, directories)

        if "MetadataCatalog" in self.directoryLocations:
            res = self.metadataClient.findDirectoriesByMetadata({self.transfidmeta: transID})
            if not res["OK"]:
                self.log.error("Failed to obtain metadata catalog directories", res["Message"])
                return res
            transDirectories = res["Value"]
            directories = self._addDirs(transID, transDirectories, directories)

        if not directories:
            self.log.info("No output directories found")
        directories = sorted(directories)
        return S_OK(directories)

    @classmethod
    def _addDirs(cls, transID, newDirs, existingDirs):
        """append unique :newDirs: list to :existingDirs: list

        Only the paths containing the 8-digit zero-padded transformation ID are kept.
        Paths are normalised BEFORE the uniqueness check, so that e.g. ``/a/b/`` and
        ``/a/b`` are not both added.

        :param int transID: transformationID
        :param list newDirs: src list of paths
        :param list existingDirs: dest list of paths, modified in place
        :return: existingDirs
        """
        transStr = str(transID).zfill(8)
        for folder in newDirs:
            if re.search(transStr, str(folder)):
                normFolder = os.path.normpath(folder)
                if normFolder not in existingDirs:
                    existingDirs.append(normFolder)
        return existingDirs

    #############################################################################
    #
    # These are the methods for performing the cleaning of catalogs and storage
    #

    def __removeFilesAsShifter(self, lfns):
        """Remove files with the "UseServerCertificate" option switched off.

        The option is set back to "true" even if the removal raises, so that a
        failure here cannot leave the process configuration altered.

        :param list lfns: LFNs to remove
        :return: result of ``DataManager().removeFile(lfns, force=True)``
        """
        # Executing with shifter proxy
        gConfigurationData.setOptionInCFG("/DIRAC/Security/UseServerCertificate", "false")
        try:
            return DataManager().removeFile(lfns, force=True)
        finally:
            gConfigurationData.setOptionInCFG("/DIRAC/Security/UseServerCertificate", "true")

    def cleanContent(self, directory):
        """wipe out everything from catalog under folder :directory:

        :param self: self reference
        :param str directory: folder name
        :return: S_OK() or S_ERROR; a file reported as not existing is not counted as a failure
        """
        self.log.verbose("Cleaning Catalog contents")
        res = self.__getCatalogDirectoryContents([directory])
        if not res["OK"]:
            return res
        filesFound = res["Value"]
        if not filesFound:
            self.log.info("No files are registered in the catalog directory %s" % directory)
            return S_OK()
        self.log.info("Attempting to remove possible remnants from the catalog and storage",
                      "(n=%d)" % len(filesFound))

        res = self.__removeFilesAsShifter(filesFound)
        if not res["OK"]:
            return res

        realFailure = False
        for lfn, reason in res["Value"]["Failed"].items():
            if "File does not exist" in str(reason):
                self.log.warn("File %s not found in some catalog: " % (lfn))
            else:
                self.log.error("Failed to remove file found in the catalog", "%s %s" % (lfn, reason))
                realFailure = True
        if realFailure:
            return S_ERROR("Failed to remove all files found in the catalog")
        return S_OK()

    def __getCatalogDirectoryContents(self, directories):
        """get catalog contents under paths :directories:

        Sub-directories are visited too. Directories that cannot be listed are
        logged and skipped.

        :param self: self reference
        :param list directories: list of paths in catalog, left untouched
        :return: S_OK(list of the files found)
        """
        self.log.info("Obtaining the catalog contents for %d directories:" % len(directories))
        for directory in directories:
            self.log.info(directory)

        # work on a copy: the queue is consumed and must not empty the caller's list
        activeDirs = list(directories)
        allFiles = {}
        fc = FileCatalog()
        while activeDirs:
            currentDir = activeDirs.pop(0)
            res = returnSingleResult(fc.listDirectory(currentDir))
            if res["OK"]:
                dirContents = res["Value"]
                activeDirs.extend(dirContents["SubDirs"])
                allFiles.update(dirContents["Files"])
            elif "Directory does not exist" in res["Message"]:  # FIXME: DFC should return errno
                self.log.info("The supplied directory %s does not exist" % currentDir)
            elif "No such file or directory" in res["Message"]:
                self.log.info("%s: %s" % (currentDir, res["Message"]))
            else:
                self.log.error("Failed to get directory %s content" % currentDir, res["Message"])

        self.log.info("", "Found %d files" % len(allFiles))
        return S_OK(list(allFiles))

    def cleanTransformationLogFiles(self, directory):
        """clean up transformation logs from directory :directory:

        :param self: self reference
        :param str directory: folder name
        :return: S_OK() (also when the directory does not exist) or the S_ERROR of the removal
        """
        self.log.verbose("Removing log files found in the directory", directory)
        res = returnSingleResult(StorageElement(self.logSE).removeDirectory(directory, recursive=True))
        if not res["OK"]:
            if cmpError(res, errno.ENOENT):  # No such file or directory
                self.log.warn("Transformation log directory does not exist", directory)
                return S_OK()
            self.log.error("Failed to remove log files", res["Message"])
            return res
        self.log.info("Successfully removed transformation log directory")
        return S_OK()

    #############################################################################
    #
    # These are the functional methods for archiving and cleaning transformations
    #

    def removeTransformationOutput(self, transID):
        """This just removes any mention of the output data from the catalog and storage

        On success the transformation status is set to "RemovedFiles".

        :param int transID: transformation ID
        :return: S_OK() or S_ERROR; failing to obtain the directories is logged and gives S_OK()
        """
        self.log.info("Removing output data for transformation %s" % transID)
        res = self.getTransformationDirectories(transID)
        if not res["OK"]:
            self.log.error("Problem obtaining directories for transformation",
                           "%s with result '%s'" % (transID, res))
            return S_OK()
        directories = res["Value"]
        for directory in directories:
            # log directories are left alone here
            if not re.search("/LOG/", directory):
                res = self.cleanContent(directory)
                if not res["OK"]:
                    return res

        self.log.info("Removed %d directories from the catalog "
                      "and its files from the storage for transformation %s" % (len(directories), transID))

        # Clean ALL the possible remnants found in the metadata catalog
        res = self.cleanMetadataCatalogFiles(transID)
        if not res["OK"]:
            return res
        self.log.info("Successfully removed output of transformation", transID)

        # Change the status of the transformation to RemovedFiles
        res = self.transClient.setTransformationParameter(transID, "Status", "RemovedFiles")
        if not res["OK"]:
            self.log.error("Failed to update status of transformation %s to RemovedFiles" % (transID),
                           res["Message"])
            return res
        self.log.info("Updated status of transformation %s to RemovedFiles" % (transID))
        return S_OK()

    def archiveTransformation(self, transID):
        """This just removes job from the jobDB and the transformation DB

        On success the transformation status is set to "Archived".

        :param self: self reference
        :param int transID: transformation ID
        :return: S_OK() or the S_ERROR of the first failing step
        """
        self.log.info("Archiving transformation %s" % transID)
        # Clean the jobs in the WMS and any failover requests found
        res = self.cleanTransformationTasks(transID)
        if not res["OK"]:
            return res
        # Clean the transformation DB of the files and job information
        res = self.transClient.cleanTransformation(transID)
        if not res["OK"]:
            return res
        self.log.info("Successfully archived transformation %d" % transID)

        # Change the status of the transformation to archived
        res = self.transClient.setTransformationParameter(transID, "Status", "Archived")
        if not res["OK"]:
            self.log.error("Failed to update status of transformation %s to Archived" % (transID),
                           res["Message"])
            return res
        self.log.info("Updated status of transformation %s to Archived" % (transID))
        return S_OK()

    def cleanTransformation(self, transID):
        """This removes what was produced by the supplied transformation,
        leaving only some info and log in the transformation DB.

        On success the transformation status is set to "Cleaned".

        :param int transID: transformation ID
        :return: S_OK() or S_ERROR; failing to obtain the directories is logged and gives S_OK()
        """
        self.log.info("Cleaning transformation", transID)
        res = self.getTransformationDirectories(transID)
        if not res["OK"]:
            self.log.error("Problem obtaining directories for transformation",
                           "%s with result '%s'" % (transID, res["Message"]))
            return S_OK()
        directories = res["Value"]

        # Clean the jobs in the WMS and any failover requests found
        res = self.cleanTransformationTasks(transID)
        if not res["OK"]:
            return res

        # Clean the log files for the jobs
        for directory in directories:
            if re.search("/LOG/", directory):
                res = self.cleanTransformationLogFiles(directory)
                if not res["OK"]:
                    return res
            res = self.cleanContent(directory)
            if not res["OK"]:
                return res

        # Clean ALL the possible remnants found
        res = self.cleanMetadataCatalogFiles(transID)
        if not res["OK"]:
            return res

        # Clean the transformation DB of the files and job information
        res = self.transClient.cleanTransformation(transID)
        if not res["OK"]:
            return res
        self.log.info("Successfully cleaned transformation", transID)

        res = self.transClient.setTransformationParameter(transID, "Status", "Cleaned")
        if not res["OK"]:
            self.log.error("Failed to update status of transformation %s to Cleaned" % (transID),
                           res["Message"])
            return res
        self.log.info("Updated status of transformation", "%s to Cleaned" % (transID))
        return S_OK()

    def cleanMetadataCatalogFiles(self, transID):
        """wipe out files from catalog

        The files are looked up by the transformation ID metadata tag.

        :param int transID: transformation ID
        :return: S_OK(), or S_ERROR if the lookup fails or any file could not be removed
        """
        res = self.metadataClient.findFilesByMetadata({self.transfidmeta: transID})
        if not res["OK"]:
            return res
        fileToRemove = res["Value"]
        if not fileToRemove:
            self.log.info("No files found for transID", transID)
            return S_OK()

        res = self.__removeFilesAsShifter(fileToRemove)
        if not res["OK"]:
            return res

        for lfn, reason in res["Value"]["Failed"].items():
            self.log.error("Failed to remove file found in metadata catalog", "%s %s" % (lfn, reason))
        if res["Value"]["Failed"]:
            return S_ERROR("Failed to remove all files found in the metadata catalog")
        self.log.info("Successfully removed all files found in the DFC")
        return S_OK()

    #############################################################################
    #
    # These are the methods for removing the jobs from the WMS and transformation DB
    #

    def cleanTransformationTasks(self, transID):
        """clean tasks from WMS, or from the RMS if it is a DataManipulation transformation

        :param int transID: transformation ID
        :return: S_OK() or the S_ERROR of the first failing step
        """
        self.log.verbose("Cleaning Transformation tasks of transformation", transID)
        res = self.__getTransformationExternalIDs(transID)
        if not res["OK"]:
            return res
        externalIDs = res["Value"]
        if not externalIDs:
            return S_OK()

        res = self.transClient.getTransformationParameters(transID, ["Type"])
        if not res["OK"]:
            self.log.error("Failed to determine transformation type")
            return res
        transType = res["Value"]
        if transType in self.dataProcTTypes:
            res = self.__removeWMSTasks(externalIDs)
        else:
            res = self.__removeRequests(externalIDs)
        if not res["OK"]:
            return res
        return S_OK()

    def __getTransformationExternalIDs(self, transID):
        """collect all ExternalIDs for transformation :transID:

        :param self: self reference
        :param int transID: transformation ID
        :return: S_OK(list of ExternalIDs) or the S_ERROR of the task lookup
        """
        res = self.transClient.getTransformationTasks(condDict={"TransformationID": transID})
        if not res["OK"]:
            self.log.error("Failed to get externalIDs for transformation %d" % transID, res["Message"])
            return res
        externalIDs = [taskDict["ExternalID"] for taskDict in res["Value"]]
        self.log.info("Found %d tasks for transformation" % len(externalIDs))
        return S_OK(externalIDs)

    def __removeRequests(self, requestIDs):
        """This will remove requests from the RMS system

        Best effort: a request that cannot be cancelled is logged, the result stays S_OK.

        :param list requestIDs: request IDs, zero IDs are skipped
        :return: S_OK()
        """
        rIDs = [int(j) for j in requestIDs if int(j)]
        for reqID in rIDs:
            res = self.reqClient.cancelRequest(reqID)
            if not res["OK"]:
                self.log.error("Failed to remove request from RequestDB", res["Message"])
        return S_OK()

    def __isRealWMSFailure(self, res, action):
        """Log a non-OK killJob/deleteJob result and tell if it is a real failure.

        Jobs unknown to the WMS are not a failure; anything else is, including an
        error carrying none of the job ID lists.

        :param dict res: the non-OK result of the WMS call
        :param str action: "kill" or "delete", only used in the messages
        :return: True if the jobs could not all be handled
        """
        if "NonauthorizedJobIDs" in res:
            self.log.error("Failed to %s jobs because not authorized" % action,
                           "(n=%d)" % len(res["NonauthorizedJobIDs"]))
            return True
        if "FailedJobIDs" in res:
            self.log.error("Failed to %s jobs" % action, "(n=%d)" % len(res["FailedJobIDs"]))
            return True
        if "InvalidJobIDs" in res:
            self.log.info("Found jobs which did not exist in the WMS", "(n=%d)" % len(res["InvalidJobIDs"]))
            return False
        # e.g. the call itself failed: must not be reported as a successful removal
        self.log.error("Failed to %s jobs" % action, res.get("Message", ""))
        return True

    def __removeWMSTasks(self, transJobIDs):
        """delete jobs (mark their status as "JobStatus.DELETED") and their requests from the system

        :param self: self reference
        :param list transJobIDs: job IDs
        :return: S_OK() or S_ERROR if some job or request could not be removed
        """
        # Prevent 0 job IDs
        jobIDs = [int(j) for j in transJobIDs if int(j)]
        allRemove = True
        for jobList in breakListIntoChunks(jobIDs, 500):
            res = self.wmsClient.killJob(jobList)
            if res["OK"]:
                self.log.info("Successfully killed %d jobs from WMS" % len(jobList))
            elif self.__isRealWMSFailure(res, "kill"):
                allRemove = False

            res = self.wmsClient.deleteJob(jobList)
            if res["OK"]:
                self.log.info("Successfully deleted jobs from WMS", "(n=%d)" % len(jobList))
            elif self.__isRealWMSFailure(res, "delete"):
                allRemove = False

        if not allRemove:
            return S_ERROR("Failed to delete all remnants from WMS")
        self.log.info("Successfully deleted all tasks from the WMS")

        if not jobIDs:
            self.log.info("JobIDs not present, unable to delete associated requests.")
            return S_OK()

        res = self.reqClient.getRequestIDsForJobs(jobIDs)
        if not res["OK"]:
            self.log.error("Failed to get requestID for jobs.", res["Message"])
            return res
        failoverRequests = dict(res["Value"]["Successful"])
        if not failoverRequests:
            return S_OK()

        failed = 0
        for jobID, requestID in failoverRequests.items():
            # Put this check just in case, tasks must have associated jobs
            if jobID == 0 or jobID == "0":
                continue
            res = self.reqClient.cancelRequest(requestID)
            if not res["OK"]:
                self.log.error("Failed to remove request from RequestDB", res["Message"])
                failed += 1
            else:
                # %s and not %d: the check above shows that jobID may also be a string
                self.log.verbose("Removed request %s associated to job %s." % (requestID, jobID))

        if failed:
            self.log.info("Successfully removed requests", "(n=%d)" % (len(failoverRequests) - failed))
            self.log.info("Failed to remove requests", "(n=%d)" % failed)
            return S_ERROR("Failed to remove all the request from RequestDB")
        self.log.info("Successfully removed all the associated failover requests")
        return S_OK()
class WorkflowTasks(TaskBase):
    """Handles jobs"""

    def __init__(
        self,
        transClient=None,
        logger=None,
        submissionClient=None,
        jobMonitoringClient=None,
        outputDataModule=None,
        jobClass=None,
        opsH=None,
        destinationPlugin=None,
        ownerDN=None,
        ownerGroup=None,
    ):
        """Generates some default objects.

        jobClass is by default "DIRAC.Interfaces.API.Job.Job". An extension of it also works:
        VOs can pass in their job class extension, if present

        Every argument left empty is replaced by a default: a new client object, or,
        for outputDataModule and destinationPlugin, the value found in the Operations
        section "Transformations".
        """
        if not logger:
            logger = gLogger.getSubLogger(self.__class__.__name__)

        super(WorkflowTasks, self).__init__(transClient, logger)

        # certificates (with delegation) are used only when both DN and group are given
        useCertificates = bool(ownerDN) and bool(ownerGroup)
        self.submissionClient = submissionClient or WMSClient(
            useCertificates=useCertificates, delegatedDN=ownerDN, delegatedGroup=ownerGroup
        )
        self.jobMonitoringClient = jobMonitoringClient or JobMonitoringClient()
        self.jobClass = jobClass or Job
        self.opsH = opsH or Operations()
        self.outputDataModule = outputDataModule or self.opsH.getValue("Transformations/OutputDataModule", "")
        self.destinationPlugin = destinationPlugin or self.opsH.getValue(
            "Transformations/DestinationPlugin", "BySE"
        )

        # plugin/module objects, the destination one is created on first use
        self.destinationPlugin_o = None
        self.outputDataModule_o = None

    def prepareTransformationTasks(
        self, transBody, taskDict, owner="", ownerGroup="", ownerDN="", bulkSubmissionFlag=False
    ):
        """Prepare tasks, given a taskDict, that is created (with some manipulation) by the DB
        jobClass is by default "DIRAC.Interfaces.API.Job.Job". An extension of it also works.

        A missing owner or group is taken from the current proxy, a missing DN from
        the owner's username.

        :param str transBody: transformation job template
        :param dict taskDict: dictionary of per task parameters
        :param str owner: owner of the transformation
        :param str ownerGroup: group of the owner of the transformation
        :param str ownerDN: DN of the owner of the transformation
        :param bool bulkSubmissionFlag: flag for using bulk submission or not

        :return: S_OK/S_ERROR with updated taskDict
        """
        if (not owner) or (not ownerGroup):
            res = getProxyInfo(False, False)
            if not res["OK"]:
                return res
            proxyInfo = res["Value"]
            owner = proxyInfo["username"]
            ownerGroup = proxyInfo["group"]

        if not ownerDN:
            res = getDNForUsername(owner)
            if not res["OK"]:
                return res
            ownerDN = res["Value"][0]

        if bulkSubmissionFlag:
            return self.__prepareTasksBulk(transBody, taskDict, owner, ownerGroup, ownerDN)
        # not a bulk submission
        return self.__prepareTasks(transBody, taskDict, owner, ownerGroup, ownerDN)

    def __prepareTasksBulk(self, transBody, taskDict, owner, ownerGroup, ownerDN):
        """Prepare transformation tasks with a single job object for bulk submission

        :param str transBody: transformation job template
        :param dict taskDict: dictionary of per task parameters
        :param str owner: owner of the transformation
        :param str ownerGroup: group of the owner of the transformation
        :param str ownerDN: DN of the owner of the transformation

        :return: S_OK/S_ERROR with updated taskDict (the job is added under "BulkJobObject")
        """
        if not taskDict:
            return S_OK({})
        transID = list(taskDict.values())[0]["TransformationID"]

        method = "__prepareTasksBulk"
        startTime = time.time()

        # Prepare the bulk Job object with common parameters
        oJob = self.jobClass(transBody)
        self._logVerbose("Setting job owner:group to %s:%s" % (owner, ownerGroup), transID=transID, method=method)
        oJob.setOwner(owner)
        oJob.setOwnerGroup(ownerGroup)
        oJob.setOwnerDN(ownerDN)

        try:
            site = oJob.workflow.findParameter("Site").getValue()
        except AttributeError:
            # no "Site" parameter in the workflow
            site = None
        jobType = oJob.workflow.findParameter("JobType").getValue()
        transGroup = str(transID).zfill(8)

        # Verify that the JOB_ID parameter is added to the workflow
        if not oJob.workflow.findParameter("JOB_ID"):
            oJob._addParameter(  # pylint: disable=protected-access
                oJob.workflow, "JOB_ID", "string", "00000000", "Initial JOB_ID"
            )

        if oJob.workflow.findParameter("PRODUCTION_ID"):
            oJob._setParamValue("PRODUCTION_ID", str(transID).zfill(8))  # pylint: disable=protected-access
        else:
            oJob._addParameter(  # pylint: disable=protected-access
                oJob.workflow,
                "PRODUCTION_ID",
                "string",
                str(transID).zfill(8),
                "Production ID",
            )
        oJob.setType(jobType)
        self._logVerbose("Adding default transformation group of %s" % (transGroup), transID=transID, method=method)
        oJob.setJobGroup(transGroup)

        clinicPath = self._checkSickTransformations(transID)
        if clinicPath:
            self._handleHospital(oJob, clinicPath)

        # Collect per job parameters sequences
        paramSeqDict = {}
        # Names of the output parameters of ALL the tasks: they were previously reset at
        # each task, so that only those of the last task reached the workflow below.
        outputParameterNames = set()
        # tasks must be sorted because we use bulk submission and we must find the correspondance
        for taskID in sorted(taskDict):
            paramsDict = taskDict[taskID]
            seqDict = {}

            if site is not None:
                paramsDict["Site"] = site
            paramsDict["JobType"] = jobType

            # Handle destination site
            sites = self._handleDestination(paramsDict)
            if not sites:
                self._logError("Could not get a list a sites", transID=transID, method=method)
                return S_ERROR(ETSUKN, "Can not evaluate destination site")
            self._logVerbose("Setting Site: ", str(sites), transID=transID, method=method)
            seqDict["Site"] = sites

            seqDict["JobName"] = self._transTaskName(transID, taskID)
            seqDict["JOB_ID"] = str(taskID).zfill(8)

            self._logDebug(
                "TransID: %s, TaskID: %s, paramsDict: %s" % (transID, taskID, str(paramsDict)),
                transID=transID,
                method=method,
            )

            # Handle Input Data
            inputData = paramsDict.get("InputData")
            if inputData:
                if isinstance(inputData, six.string_types):
                    inputData = inputData.replace(" ", "").split(";")
                self._logVerbose("Setting input data to %s" % inputData, transID=transID, method=method)
                seqDict["InputData"] = inputData
            elif paramSeqDict.get("InputData") is not None:
                # a previous task had input data, this one has none
                self._logError("Invalid mixture of jobs with and without input data")
                return S_ERROR(ETSDATA, "Invalid mixture of jobs with and without input data")

            for paramName, paramValue in paramsDict.items():
                if paramName not in ("InputData", "Site", "TargetSE"):
                    if paramValue:
                        self._logVerbose(
                            "Setting %s to %s" % (paramName, paramValue), transID=transID, method=method
                        )
                        seqDict[paramName] = paramValue

            if self.outputDataModule:
                res = self.getOutputData(
                    {
                        "Job": oJob._toXML(),  # pylint: disable=protected-access
                        "TransformationID": transID,
                        "TaskID": taskID,
                        "InputData": inputData,
                    }
                )
                if not res["OK"]:
                    self._logError(
                        "Failed to generate output data", res["Message"], transID=transID, method=method
                    )
                    # NOTE(review): the task is skipped, so the sequences get one element
                    # less than there are tasks -- confirm the consumer copes with that.
                    continue
                for name, output in res["Value"].items():
                    seqDict[name] = output
                    outputParameterNames.add(name)
                    if oJob.workflow.findParameter(name):
                        oJob._setParamValue(name, "%%(%s)s" % name)  # pylint: disable=protected-access
                    else:
                        oJob._addParameter(  # pylint: disable=protected-access
                            oJob.workflow, name, "JDL", "%%(%s)s" % name, name
                        )

            for pName, seq in seqDict.items():
                paramSeqDict.setdefault(pName, []).append(seq)

        workflowParams = set(["JOB_ID", "PRODUCTION_ID", "InputData"]) | outputParameterNames
        for paramName, paramSeq in paramSeqDict.items():
            if paramName in workflowParams:
                res = oJob.setParameterSequence(paramName, paramSeq, addToWorkflow=paramName)
            else:
                res = oJob.setParameterSequence(paramName, paramSeq)
            if not res["OK"]:
                return res

        if taskDict:
            self._logInfo("Prepared %d tasks" % len(taskDict), transID=transID, method=method, reftime=startTime)

        taskDict["BulkJobObject"] = oJob
        return S_OK(taskDict)

    def __prepareTasks(self, transBody, taskDict, owner, ownerGroup, ownerDN):
        """Prepare transformation tasks with a job object per task

        The job is stored in each task dictionary under "TaskObject", which is set
        to "" for the tasks that could not be prepared.

        :param str transBody: transformation job template
        :param dict taskDict: dictionary of per task parameters
        :param owner: owner of the transformation
        :param str ownerGroup: group of the owner of the transformation
        :param str ownerDN: DN of the owner of the transformation

        :return: S_OK/S_ERROR with updated taskDict
        """
        if not taskDict:
            return S_OK({})
        transID = list(taskDict.values())[0]["TransformationID"]

        method = "__prepareTasks"
        startTime = time.time()

        oJobTemplate = self.jobClass(transBody)
        oJobTemplate.setOwner(owner)
        oJobTemplate.setOwnerGroup(ownerGroup)
        oJobTemplate.setOwnerDN(ownerDN)

        try:
            site = oJobTemplate.workflow.findParameter("Site").getValue()
        except AttributeError:
            # no "Site" parameter in the workflow
            site = None
        jobType = oJobTemplate.workflow.findParameter("JobType").getValue()
        templateOK = False
        getOutputDataTiming = 0.0
        for taskID, paramsDict in taskDict.items():
            # Create a job for each task and add it to the taskDict
            if not templateOK:
                templateOK = True
                # Update the template with common information
                self._logVerbose("Job owner:group to %s:%s" % (owner, ownerGroup), transID=transID, method=method)
                transGroup = str(transID).zfill(8)
                self._logVerbose(
                    "Adding default transformation group of %s" % (transGroup), transID=transID, method=method
                )
                oJobTemplate.setJobGroup(transGroup)
                if oJobTemplate.workflow.findParameter("PRODUCTION_ID"):
                    oJobTemplate._setParamValue("PRODUCTION_ID", str(transID).zfill(8))
                else:
                    oJobTemplate._addParameter(
                        oJobTemplate.workflow, "PRODUCTION_ID", "string", str(transID).zfill(8), "Production ID"
                    )
                if not oJobTemplate.workflow.findParameter("JOB_ID"):
                    oJobTemplate._addParameter(
                        oJobTemplate.workflow, "JOB_ID", "string", "00000000", "Initial JOB_ID"
                    )

            if site is not None:
                paramsDict["Site"] = site
            paramsDict["JobType"] = jobType

            # Now create the job from the template
            oJob = copy.deepcopy(oJobTemplate)
            constructedName = self._transTaskName(transID, taskID)
            self._logVerbose("Setting task name to %s" % constructedName, transID=transID, method=method)
            oJob.setName(constructedName)
            oJob._setParamValue("JOB_ID", str(taskID).zfill(8))
            # NOTE(review): getOutputData() below always gets InputData=None, while the bulk
            # preparation passes the task's input data -- confirm this is intended.
            inputData = None

            self._logDebug(
                "TransID: %s, TaskID: %s, paramsDict: %s" % (transID, taskID, str(paramsDict)),
                transID=transID,
                method=method,
            )

            # These helper functions do the real job
            sites = self._handleDestination(paramsDict)
            if not sites:
                self._logError("Could not get a list a sites", transID=transID, method=method)
                paramsDict["TaskObject"] = ""
                continue
            self._logDebug("Setting Site: ", str(sites), transID=transID, method=method)
            res = oJob.setDestination(sites)
            if not res["OK"]:
                self._logError("Could not set the site: %s" % res["Message"], transID=transID, method=method)
                paramsDict["TaskObject"] = ""
                continue

            self._handleInputs(oJob, paramsDict)
            self._handleRest(oJob, paramsDict)

            clinicPath = self._checkSickTransformations(transID)
            if clinicPath:
                self._handleHospital(oJob, clinicPath)

            # stays empty if the output data cannot be generated
            paramsDict["TaskObject"] = ""
            if self.outputDataModule:
                getOutputDataTiming -= time.time()
                res = self.getOutputData(
                    {"Job": oJob._toXML(), "TransformationID": transID, "TaskID": taskID, "InputData": inputData}
                )
                getOutputDataTiming += time.time()
                if not res["OK"]:
                    self._logError(
                        "Failed to generate output data", res["Message"], transID=transID, method=method
                    )
                    continue
                for name, output in res["Value"].items():
                    oJob._addJDLParameter(name, ";".join(output))
            paramsDict["TaskObject"] = oJob

        if taskDict:
            self._logVerbose(
                "Average getOutputData time: %.1f per task" % (getOutputDataTiming / len(taskDict)),
                transID=transID,
                method=method,
            )
            self._logInfo("Prepared %d tasks" % len(taskDict), transID=transID, method=method, reftime=startTime)
        return S_OK(taskDict)

    #############################################################################

    def _handleDestination(self, paramsDict):
        """Handle Sites and TargetSE in the parameters

        :param dict paramsDict: task parameters, "Site" is read here and the whole
                                dictionary is handed to the destination plugin
        :return: list of sites; empty if the destination plugin cannot be created or if no
                 site is both given by the plugin and requested in "Site"
        """
        sites = ["ANY"]
        try:
            if paramsDict["Site"]:
                # 'Site' comes from the XML and therefore is ; separated
                sites = fromChar(paramsDict["Site"], sepChar=";")
        except KeyError:
            pass

        if not self.destinationPlugin_o:
            res = self.__generatePluginObject(self.destinationPlugin)
            if not res["OK"]:
                self._logFatal("Could not generate a destination plugin object")
                # The callers use the returned value as a list of sites and only test its
                # truth value: the (truthy) S_ERROR dictionary must not be handed back.
                return []
            self.destinationPlugin_o = res["Value"]
        destinationPlugin_o = self.destinationPlugin_o

        destinationPlugin_o.setParameters(paramsDict)
        destSites = destinationPlugin_o.run()
        if not destSites:
            return sites

        # Now we need to make the AND with the sites, if defined
        if sites != ["ANY"]:
            # Need to get the AND
            destSites &= set(sites)

        return list(destSites)

    def _handleInputs(self, oJob, paramsDict):
        """set job inputs (+ metadata)

        :param oJob: job object, its setInputData() is called
        :param dict paramsDict: task parameters, "InputData" and "TransformationID" are read
        """
        inputData = paramsDict.get("InputData")
        transID = paramsDict["TransformationID"]
        if not inputData:
            return
        self._logVerbose("Setting input data to %s" % inputData, transID=transID, method="_handleInputs")
        res = oJob.setInputData(inputData)
        if not res["OK"]:
            self._logError(
                "Could not set the inputs: %s" % res["Message"], transID=transID, method="_handleInputs"
            )

    def _handleRest(self, oJob, paramsDict):
        """add as JDL parameters all the other parameters that are not for inputs or destination

        :param oJob: job object, its _addJDLParameter() is called
        :param dict paramsDict: task parameters, the empty values are skipped
        """
        transID = paramsDict["TransformationID"]
        for paramName, paramValue in paramsDict.items():
            if paramName in ("InputData", "Site", "TargetSE") or not paramValue:
                continue
            self._logDebug("Setting %s to %s" % (paramName, paramValue), transID=transID, method="_handleRest")
            oJob._addJDLParameter(paramName, paramValue)

    def _checkSickTransformations(self, transID):
        """Check if the transformation is in the transformations to be processed at Hospital or Clinic

        :param transID: transformation ID, converted with int()
        :return: the Operations path ("Hospital" or "Hospital/Clinics/<clinic>") whose
                 "Transformations" option lists the transformation, None if there is none
        """
        transID = int(transID)
        clinicPath = "Hospital"
        if transID in set(int(x) for x in self.opsH.getValue(os.path.join(clinicPath, "Transformations"), [])):
            return clinicPath
        if "Clinics" in self.opsH.getSections("Hospital").get("Value", []):
            basePath = os.path.join("Hospital", "Clinics")
            clinics = self.opsH.getSections(basePath)["Value"]
            for clinic in clinics:
                clinicPath = os.path.join(basePath, clinic)
                if transID in set(
                    int(x) for x in self.opsH.getValue(os.path.join(clinicPath, "Transformations"), [])
                ):
                    return clinicPath
        return None

    def _handleHospital(self, oJob, clinicPath):
        """Optional handle of hospital/clinic jobs"""
        if not clinicPath:
            return
        oJob.setInputDataPolicy("download", dataScheduling=False)
        # Check
first for a clinic, if not it must be the general hospital hospitalSite = self.opsH.getValue( os.path.join(clinicPath, "ClinicSite"), "") hospitalCEs = self.opsH.getValue(os.path.join(clinicPath, "ClinicCE"), []) # If not found, get the hospital parameters if not hospitalSite: hospitalSite = self.opsH.getValue("Hospital/HospitalSite", "DIRAC.JobDebugger.ch") if not hospitalCEs: hospitalCEs = self.opsH.getValue("Hospital/HospitalCEs", []) oJob.setDestination(hospitalSite) if hospitalCEs: oJob._addJDLParameter("GridCE", hospitalCEs) def __generatePluginObject(self, plugin): """This simply instantiates the TaskManagerPlugin class with the relevant plugin name""" method = "__generatePluginObject" try: plugModule = __import__(self.pluginLocation, globals(), locals(), ["TaskManagerPlugin"]) except ImportError as e: self._logException("Failed to import 'TaskManagerPlugin' %s: %s" % (plugin, e), method=method) return S_ERROR() try: plugin_o = getattr(plugModule, "TaskManagerPlugin")("%s" % plugin, operationsHelper=self.opsH) return S_OK(plugin_o) except AttributeError as e: self._logException("Failed to create %s(): %s." 
% (plugin, e), method=method) return S_ERROR() ############################################################################# def getOutputData(self, paramDict): """Get the list of job output LFNs from the provided plugin""" if not self.outputDataModule_o: # Create the module object moduleFactory = ModuleFactory() moduleInstance = moduleFactory.getModule(self.outputDataModule, None) if not moduleInstance["OK"]: return moduleInstance self.outputDataModule_o = moduleInstance["Value"] # This is the "argument" to the module, set it and then execute self.outputDataModule_o.paramDict = paramDict return self.outputDataModule_o.execute() def submitTransformationTasks(self, taskDict): """Submit the tasks""" if "BulkJobObject" in taskDict: return self.__submitTransformationTasksBulk(taskDict) return self.__submitTransformationTasks(taskDict) def __submitTransformationTasksBulk(self, taskDict): """Submit jobs in one go with one parametric job""" if not taskDict: return S_OK(taskDict) startTime = time.time() method = "__submitTransformationTasksBulk" oJob = taskDict.pop("BulkJobObject") # we can only do this, once the job has been popped, or we _might_ crash transID = list(taskDict.values())[0]["TransformationID"] if oJob is None: self._logError("no bulk Job object found", transID=transID, method=method) return S_ERROR(ETSUKN, "No bulk job object provided for submission") result = self.submitTaskToExternal(oJob) if not result["OK"]: self._logError("Failed to submit tasks to external", transID=transID, method=method) return result jobIDList = result["Value"] if len(jobIDList) != len(taskDict): for task in taskDict.values(): task["Success"] = False return S_ERROR( ETSUKN, "Submitted less number of jobs than requested tasks") # Get back correspondence with tasks sorted by ID for jobID, taskID in zip(jobIDList, sorted(taskDict)): taskDict[taskID]["ExternalID"] = jobID taskDict[taskID]["Success"] = True submitted = len(jobIDList) self._logInfo( "Submitted %d tasks to WMS in %.1f 
seconds" % (submitted, time.time() - startTime), transID=transID, method=method, ) return S_OK(taskDict) def __submitTransformationTasks(self, taskDict): """Submit jobs one by one""" method = "__submitTransformationTasks" submitted = 0 failed = 0 startTime = time.time() for task in taskDict.values(): transID = task["TransformationID"] if not task["TaskObject"]: task["Success"] = False failed += 1 continue res = self.submitTaskToExternal(task["TaskObject"]) if res["OK"]: task["ExternalID"] = res["Value"] task["Success"] = True submitted += 1 else: self._logError("Failed to submit task to WMS", res["Message"], transID=transID, method=method) task["Success"] = False failed += 1 if submitted: self._logInfo( "Submitted %d tasks to WMS in %.1f seconds" % (submitted, time.time() - startTime), transID=transID, method=method, ) if failed: self._logError("Failed to submit %d tasks to WMS." % (failed), transID=transID, method=method) return S_OK(taskDict) def submitTaskToExternal(self, job): """Submits a single job (which can be a bulk one) to the WMS.""" if isinstance(job, six.string_types): try: oJob = self.jobClass(job) except Exception as x: # pylint: disable=broad-except self._logException("Failed to create job object", "", x) return S_ERROR("Failed to create job object") elif isinstance(job, self.jobClass): oJob = job else: self._logError("No valid job description found") return S_ERROR("No valid job description found") workflowFileObject = StringIO(oJob._toXML()) jdl = oJob._toJDL(jobDescriptionObject=workflowFileObject) return self.submissionClient.submitJob(jdl, workflowFileObject) def updateTransformationReservedTasks(self, taskDicts): transID = None jobNames = [ self._transTaskName(taskDict["TransformationID"], taskDict["TaskID"]) for taskDict in taskDicts ] res = self.jobMonitoringClient.getJobs({"JobName": jobNames}) if not res["OK"]: self._logError( "Failed to get task from WMS", res["Message"], transID=transID, method="updateTransformationReservedTasks", ) 
return res jobNameIDs = {} for wmsID in res["Value"]: res = self.jobMonitoringClient.getJobSummary(int(wmsID)) if not res["OK"]: self._logWarn( "Failed to get task summary from WMS", res["Message"], transID=transID, method="updateTransformationReservedTasks", ) else: jobNameIDs[res["Value"]["JobName"]] = int(wmsID) noTask = list(set(jobNames) - set(jobNameIDs)) return S_OK({"NoTasks": noTask, "TaskNameIDs": jobNameIDs}) def getSubmittedTaskStatus(self, taskDicts): """ Check the status of a list of tasks and return lists of taskIDs for each new status """ method = "getSubmittedTaskStatus" if taskDicts: wmsIDs = [ int(taskDict["ExternalID"]) for taskDict in taskDicts if int(taskDict["ExternalID"]) ] transID = taskDicts[0]["TransformationID"] else: return S_OK({}) res = self.jobMonitoringClient.getJobsStatus(wmsIDs) if not res["OK"]: self._logWarn("Failed to get job status from the WMS system", transID=transID, method=method) return res statusDict = res["Value"] updateDict = {} for taskDict in taskDicts: taskID = taskDict["TaskID"] wmsID = int(taskDict["ExternalID"]) if not wmsID: continue oldStatus = taskDict["ExternalStatus"] newStatus = statusDict.get(wmsID, {}).get("Status", "Removed") if oldStatus != newStatus: if newStatus == "Removed": self._logVerbose( "Production/Job %d/%d removed from WMS while it is in %s status" % (transID, taskID, oldStatus), transID=transID, method=method, ) newStatus = "Failed" self._logVerbose( "Setting job status for Production/Job %d/%d to %s" % (transID, taskID, newStatus), transID=transID, method=method, ) updateDict.setdefault(newStatus, []).append(taskID) return S_OK(updateDict) def getSubmittedFileStatus(self, fileDicts): """ Check the status of a list of files and return the new status of each LFN """ if not fileDicts: return S_OK({}) method = "getSubmittedFileStatus" # All files are from the same transformation transID = fileDicts[0]["TransformationID"] taskFiles = {} for fileDict in fileDicts: jobName = 
self._transTaskName(transID, fileDict["TaskID"]) taskFiles.setdefault(jobName, {})[fileDict["LFN"]] = fileDict["Status"] res = self.updateTransformationReservedTasks(fileDicts) if not res["OK"]: self._logWarn("Failed to obtain taskIDs for files", transID=transID, method=method) return res noTasks = res["Value"]["NoTasks"] taskNameIDs = res["Value"]["TaskNameIDs"] updateDict = {} for jobName in noTasks: for lfn, oldStatus in taskFiles[jobName].items(): if oldStatus != TransformationFilesStatus.UNUSED: updateDict[lfn] = TransformationFilesStatus.UNUSED res = self.jobMonitoringClient.getJobsStatus(list( taskNameIDs.values())) if not res["OK"]: self._logWarn("Failed to get job status from the WMS system", transID=transID, method=method) return res statusDict = res["Value"] for jobName, wmsID in taskNameIDs.items(): jobStatus = statusDict.get(wmsID, {}).get("Status") newFileStatus = { "Done": TransformationFilesStatus.PROCESSED, "Completed": TransformationFilesStatus.PROCESSED, "Failed": TransformationFilesStatus.UNUSED, }.get(jobStatus) if newFileStatus: for lfn, oldStatus in taskFiles[jobName].items(): if newFileStatus != oldStatus: updateDict[lfn] = newFileStatus return S_OK(updateDict)
def test_JobStateUpdateAndJobMonitoringMultuple(self):
    """Submit a batch of jobs and exercise the job monitoring / state update services.

    One hello-world job is submitted for every combination of destination
    site, input-data list and job type. The monitoring client is then queried
    for the distinct attribute values and counters, and the answers are
    checked against what was submitted. Finally the status and a parameter
    of the last submitted job are updated and all the jobs are deleted.
    """
    wmsClient = WMSClient()
    jobMonitor = JobMonitoringClient()
    jobStateUpdate = RPCClient('WorkloadManagement/JobStateUpdate')

    jobIDs = []
    dests = ['DIRAC.site1.org', 'DIRAC.site2.org']
    lfnss = [['/a/1.txt', '/a/2.txt'], ['/a/1.txt', '/a/3.txt', '/a/4.txt'], []]
    types = ['User', 'Test']
    for dest in dests:
        for lfns in lfnss:
            for jobType in types:
                job = helloWorldJob()
                job.setDestination(dest)
                job.setInputData(lfns)
                job.setType(jobType)
                jobDescription = createFile(job)
                res = wmsClient.submitJob(job._toJDL(xmlFile=jobDescription))
                self.assertTrue(res['OK'], res.get('Message'))
                jobID = res['Value']
                jobIDs.append(jobID)

    res = jobMonitor.getSites()
    self.assertTrue(res['OK'], res.get('Message'))
    self.assertTrue(set(res['Value']) <= set(dests + ['ANY', 'DIRAC.Jenkins.org']))

    res = jobMonitor.getJobTypes()
    self.assertTrue(res['OK'], res.get('Message'))
    self.assertEqual(sorted(res['Value']), sorted(types))

    res = jobMonitor.getApplicationStates()
    self.assertTrue(res['OK'], res.get('Message'))
    self.assertEqual(sorted(res['Value']), sorted(['Unknown']))

    # For these queries only the success of the call is checked
    res = jobMonitor.getOwners()
    self.assertTrue(res['OK'], res.get('Message'))
    res = jobMonitor.getOwnerGroup()
    self.assertTrue(res['OK'], res.get('Message'))
    res = jobMonitor.getProductionIds()
    self.assertTrue(res['OK'], res.get('Message'))
    res = jobMonitor.getJobGroups()
    self.assertTrue(res['OK'], res.get('Message'))

    res = jobMonitor.getStates()
    self.assertTrue(res['OK'], res.get('Message'))
    self.assertTrue(sorted(res['Value']) in [['Received'], sorted(['Received', 'Waiting'])])

    res = jobMonitor.getMinorStates()
    self.assertTrue(res['OK'], res.get('Message'))
    self.assertTrue(sorted(res['Value']) in [['Job accepted'], sorted(['Job accepted', 'matching'])])

    res = jobMonitor.getJobs()
    self.assertTrue(res['OK'], res.get('Message'))
    self.assertTrue(set(str(x) for x in jobIDs) <= set(res['Value']))

    # res = jobMonitor.getCounters(attrList)
    # self.assertTrue(res['OK'], res.get('Message'))

    res = jobMonitor.getCurrentJobCounters()
    self.assertTrue(res['OK'], res.get('Message'))
    try:
        self.assertTrue(res['Value'].get('Received') + res['Value'].get('Waiting') >=
                        len(dests) * len(lfnss) * len(types))
    except TypeError:
        # One of the two counters is absent (.get() returned None), so the sum
        # cannot be computed: the check is best-effort and is skipped
        pass

    res = jobMonitor.getJobsSummary(jobIDs)
    self.assertTrue(res['OK'], res.get('Message'))
    res = jobMonitor.getJobPageSummaryWeb({}, [], 0, 100)
    self.assertTrue(res['OK'], res.get('Message'))

    # jobID is the last job submitted in the loop above
    res = jobStateUpdate.setJobStatusBulk(jobID,
                                          {str(datetime.datetime.utcnow()): {'Status': 'Running',
                                                                             'MinorStatus': 'MinorStatus',
                                                                             'ApplicationStatus': 'ApplicationStatus',
                                                                             'Source': 'Unknown'}})
    self.assertTrue(res['OK'], res.get('Message'))
    res = jobStateUpdate.setJobsParameter({jobID: ['Status', 'Running']})
    self.assertTrue(res['OK'], res.get('Message'))

    # delete the jobs - this will just set its status to "deleted"
    wmsClient.deleteJob(jobIDs)
def test_JobStateUpdateAndJobMonitoringMultuple(self):
    """Submit a batch of jobs and exercise the job monitoring / state update clients.

    One hello-world job is submitted for every combination of input-data list
    and job type, all with the same destination site. The monitoring client is
    then queried for the distinct attribute values and counters, and the
    answers are checked against what was submitted. Finally the status and a
    parameter of the last submitted job are updated and all the jobs are
    deleted.
    """
    wmsClient = WMSClient()
    jobMonitor = JobMonitoringClient()
    jobStateUpdate = JobStateUpdateClient()

    jobIDs = []
    lfnss = [['/a/1.txt', '/a/2.txt'], ['/a/1.txt', '/a/3.txt', '/a/4.txt'], []]
    types = ['User', 'Test']
    for lfns in lfnss:
        for jobType in types:
            job = helloWorldJob()
            job.setDestination('DIRAC.Jenkins.ch')
            job.setInputData(lfns)
            job.setType(jobType)
            jobDescription = createFile(job)
            res = wmsClient.submitJob(job._toJDL(xmlFile=jobDescription))
            self.assertTrue(res['OK'], res.get('Message'))
            jobID = res['Value']
            jobIDs.append(jobID)

    res = jobMonitor.getSites()
    self.assertTrue(res['OK'], res.get('Message'))
    # str() keeps the message formatting safe whatever sequence type is returned
    self.assertTrue(set(res['Value']) <= {'ANY', 'DIRAC.Jenkins.ch'}, msg="Got %s" % str(res['Value']))

    res = jobMonitor.getJobTypes()
    self.assertTrue(res['OK'], res.get('Message'))
    self.assertEqual(sorted(res['Value']), sorted(types), msg="Got %s" % str(sorted(res['Value'])))

    res = jobMonitor.getApplicationStates()
    self.assertTrue(res['OK'], res.get('Message'))
    # Report the sorted list itself (not the sorted characters of its repr)
    self.assertEqual(sorted(res['Value']), sorted(['Unknown']), msg="Got %s" % str(sorted(res['Value'])))

    # For these queries only the success of the call is checked
    res = jobMonitor.getOwners()
    self.assertTrue(res['OK'], res.get('Message'))
    res = jobMonitor.getOwnerGroup()
    self.assertTrue(res['OK'], res.get('Message'))
    res = jobMonitor.getProductionIds()
    self.assertTrue(res['OK'], res.get('Message'))

    # Job groups: no cut-off date must give the same answer as a cut-off at "now"
    res = jobMonitor.getJobGroups()
    self.assertTrue(res['OK'], res.get('Message'))
    resJG_empty = res['Value']
    res = jobMonitor.getJobGroups(None, datetime.datetime.utcnow())
    self.assertTrue(res['OK'], res.get('Message'))
    resJG_olderThanNow = res['Value']
    self.assertEqual(resJG_empty, resJG_olderThanNow)
    # ... and a cut-off one year in the past can only select a subset of those
    res = jobMonitor.getJobGroups(None, datetime.datetime.utcnow() - datetime.timedelta(days=365))
    self.assertTrue(res['OK'], res.get('Message'))
    resJG_olderThanOneYear = res['Value']
    self.assertTrue(set(resJG_olderThanOneYear).issubset(set(resJG_olderThanNow)))

    res = jobMonitor.getStates()
    self.assertTrue(res['OK'], res.get('Message'))
    self.assertTrue(sorted(res['Value']) in [['Received'], sorted(['Received', 'Waiting'])])

    res = jobMonitor.getMinorStates()
    self.assertTrue(res['OK'], res.get('Message'))
    self.assertTrue(sorted(res['Value']) in [['Job accepted'], sorted(['Job accepted', 'Job Rescheduled'])])

    res = jobMonitor.getJobs()
    self.assertTrue(res['OK'], res.get('Message'))
    self.assertTrue({str(x) for x in jobIDs} <= set(res['Value']))

    # res = jobMonitor.getCounters(attrList)
    # self.assertTrue(res['OK'], res.get('Message'))

    res = jobMonitor.getCurrentJobCounters()
    self.assertTrue(res['OK'], res.get('Message'))
    try:
        self.assertTrue(res['Value'].get('Received') + res['Value'].get('Waiting') >= len(lfnss) * len(types))
    except TypeError:
        # One of the two counters is absent (.get() returned None), so the sum
        # cannot be computed: the check is best-effort and is skipped
        pass

    res = jobMonitor.getJobsSummary(jobIDs)
    self.assertTrue(res['OK'], res.get('Message'))
    res = jobMonitor.getJobPageSummaryWeb({}, [], 0, 100)
    self.assertTrue(res['OK'], res.get('Message'))

    # jobID is the last job submitted in the loop above
    res = jobStateUpdate.setJobStatusBulk(jobID,
                                          {str(datetime.datetime.utcnow()): {'Status': 'Running',
                                                                             'MinorStatus': 'MinorStatus',
                                                                             'ApplicationStatus': 'ApplicationStatus',
                                                                             'Source': 'Unknown'}})
    self.assertTrue(res['OK'], res.get('Message'))
    res = jobStateUpdate.setJobsParameter({jobID: ['Status', 'Running']})
    self.assertTrue(res['OK'], res.get('Message'))

    # delete the jobs - this will just set its status to "deleted"
    wmsClient.deleteJob(jobIDs)
class WorkflowTasks(TaskBase): """ Handles jobs """ def __init__(self, transClient=None, logger=None, submissionClient=None, jobMonitoringClient=None, outputDataModule=None, jobClass=None, opsH=None, destinationPlugin=None, ownerDN=None, ownerGroup=None): """ Generates some default objects. jobClass is by default "DIRAC.Interfaces.API.Job.Job". An extension of it also works: VOs can pass in their job class extension, if present """ if not logger: logger = gLogger.getSubLogger('WorkflowTasks') super(WorkflowTasks, self).__init__(transClient, logger) useCertificates = True if (bool(ownerDN) and bool(ownerGroup)) else False if not submissionClient: self.submissionClient = WMSClient(useCertificates=useCertificates, delegatedDN=ownerDN, delegatedGroup=ownerGroup) else: self.submissionClient = submissionClient if not jobMonitoringClient: self.jobMonitoringClient = JobMonitoringClient() else: self.jobMonitoringClient = jobMonitoringClient if not jobClass: self.jobClass = Job else: self.jobClass = jobClass if not opsH: self.opsH = Operations() else: self.opsH = opsH if not outputDataModule: self.outputDataModule = self.opsH.getValue("Transformations/OutputDataModule", "") else: self.outputDataModule = outputDataModule if not destinationPlugin: self.destinationPlugin = self.opsH.getValue('Transformations/DestinationPlugin', 'BySE') else: self.destinationPlugin = destinationPlugin self.destinationPlugin_o = None self.outputDataModule_o = None def prepareTransformationTasks(self, transBody, taskDict, owner='', ownerGroup='', ownerDN='', bulkSubmissionFlag=False): """ Prepare tasks, given a taskDict, that is created (with some manipulation) by the DB jobClass is by default "DIRAC.Interfaces.API.Job.Job". An extension of it also works. 
:param transBody: transformation job template :param taskDict: dictionary of per task parameters :param owner: owner of the transformation :param ownerGroup: group of the owner of the transformation :param ownerDN: DN of the owner of the transformation :return: S_OK/S_ERROR with updated taskDict """ if (not owner) or (not ownerGroup): res = getProxyInfo(False, False) if not res['OK']: return res proxyInfo = res['Value'] owner = proxyInfo['username'] ownerGroup = proxyInfo['group'] if not ownerDN: res = getDNForUsername(owner) if not res['OK']: return res ownerDN = res['Value'][0] if bulkSubmissionFlag: return self.__prepareTasksBulk(transBody, taskDict, owner, ownerGroup, ownerDN) return self.__prepareTasks(transBody, taskDict, owner, ownerGroup, ownerDN) def __prepareTasksBulk(self, transBody, taskDict, owner, ownerGroup, ownerDN): """ Prepare transformation tasks with a single job object for bulk submission """ if taskDict: transID = taskDict.values()[0]['TransformationID'] else: return S_OK({}) # Prepare the bulk Job object with common parameters oJob = self.jobClass(transBody) method = 'prepareTransformationTasksBulk' self._logVerbose('Setting job owner:group to %s:%s' % (owner, ownerGroup), transID=transID, method=method) oJob.setOwner(owner) oJob.setOwnerGroup(ownerGroup) oJob.setOwnerDN(ownerDN) jobType = oJob.workflow.findParameter('JobType').getValue() transGroup = str(transID).zfill(8) # Verify that the JOB_ID parameter is added to the workflow if not oJob.workflow.findParameter('JOB_ID'): oJob._addParameter(oJob.workflow, 'JOB_ID', 'string', '00000000', "Initial JOB_ID") if oJob.workflow.findParameter('PRODUCTION_ID'): oJob._setParamValue('PRODUCTION_ID', str(transID).zfill(8)) # pylint: disable=protected-access else: oJob._addParameter(oJob.workflow, # pylint: disable=protected-access 'PRODUCTION_ID', 'string', str(transID).zfill(8), "Production ID") oJob.setType(jobType) self._logVerbose('Adding default transformation group of %s' % (transGroup), 
transID=transID, method=method) oJob.setJobGroup(transGroup) if int(transID) in [int(x) for x in self.opsH.getValue("Hospital/Transformations", [])]: self._handleHospital(oJob) # Collect per job parameters sequences paramSeqDict = {} # tasks must be sorted because we use bulk submission and we must find the correspondance for taskID in sorted(taskDict): paramsDict = taskDict[taskID] seqDict = {} # Handle destination site sites = self._handleDestination(paramsDict) if not sites: self._logError('Could not get a list a sites', transID=transID) return S_ERROR(ETSUKN, "Can not evaluate destination site") else: self._logVerbose('Setting Site: ', str(sites), transID=transID) seqDict['Site'] = sites seqDict['JobName'] = self._transTaskName(transID, taskID) seqDict['JOB_ID'] = str(taskID).zfill(8) self._logDebug('TransID: %s, TaskID: %s, paramsDict: %s' % (transID, taskID, str(paramsDict)), transID=transID, method=method) # Handle Input Data inputData = paramsDict.get('InputData') if inputData: self._logVerbose('Setting input data to %s' % inputData, transID=transID, method=method) seqDict['InputData'] = inputData elif paramSeqDict.get('InputData') is not None: self._logError("Invalid mixture of jobs with and without input data") return S_ERROR(ETSDATA, "Invalid mixture of jobs with and without input data") for paramName, paramValue in paramsDict.iteritems(): if paramName not in ('InputData', 'Site', 'TargetSE'): if paramValue: self._logVerbose('Setting %s to %s' % (paramName, paramValue), transID=transID, method=method) seqDict[paramName] = paramValue outputParameterList = [] if self.outputDataModule: res = self.getOutputData({'Job': oJob._toXML(), 'TransformationID': transID, # pylint: disable=protected-access 'TaskID': taskID, 'InputData': inputData}) if not res['OK']: self._logError("Failed to generate output data", res['Message'], transID=transID, method=method) continue for name, output in res['Value'].iteritems(): seqDict[name] = output 
outputParameterList.append(name) if oJob.workflow.findParameter(name): oJob._setParamValue(name, "%%(%s)s" % name) # pylint: disable=protected-access else: oJob._addParameter(oJob.workflow, # pylint: disable=protected-access name, 'JDL', "%%(%s)s" % name, name) for pName, seq in seqDict.iteritems(): paramSeqDict.setdefault(pName, []).append(seq) for paramName, paramSeq in paramSeqDict.iteritems(): if paramName in ['JOB_ID', 'PRODUCTION_ID', 'InputData'] + outputParameterList: oJob.setParameterSequence(paramName, paramSeq, addToWorkflow=paramName) else: oJob.setParameterSequence(paramName, paramSeq) taskDict['BulkJobObject'] = oJob return S_OK(taskDict) def __prepareTasks(self, transBody, taskDict, owner, ownerGroup, ownerDN): """ Prepare transformation tasks with a job object per task """ method = '__prepareTasks' startTime = time.time() oJobTemplate = self.jobClass(transBody) oJobTemplate.setOwner(owner) oJobTemplate.setOwnerGroup(ownerGroup) oJobTemplate.setOwnerDN(ownerDN) site = oJobTemplate.workflow.findParameter('Site').getValue() jobType = oJobTemplate.workflow.findParameter('JobType').getValue() templateOK = False getOutputDataTiming = 0. 
for taskID, paramsDict in taskDict.iteritems(): # Create a job for each task and add it to the taskDict if not templateOK: templateOK = True # Update the template with common information transID = paramsDict['TransformationID'] self._logVerbose('Job owner:group to %s:%s' % (owner, ownerGroup), transID=transID, method=method) transGroup = str(transID).zfill(8) self._logVerbose('Adding default transformation group of %s' % (transGroup), transID=transID, method=method) oJobTemplate.setJobGroup(transGroup) if oJobTemplate.workflow.findParameter('PRODUCTION_ID'): oJobTemplate._setParamValue('PRODUCTION_ID', str(transID).zfill(8)) else: oJobTemplate._addParameter(oJobTemplate.workflow, 'PRODUCTION_ID', 'string', str(transID).zfill(8), "Production ID") if not oJobTemplate.workflow.findParameter('JOB_ID'): oJobTemplate._addParameter(oJobTemplate.workflow, 'JOB_ID', 'string', '00000000', "Initial JOB_ID") paramsDict['Site'] = site paramsDict['JobType'] = jobType # Now create the job from the template oJob = copy.deepcopy(oJobTemplate) constructedName = self._transTaskName(transID, taskID) self._logVerbose('Setting task name to %s' % constructedName, transID=transID, method=method) oJob.setName(constructedName) oJob._setParamValue('JOB_ID', str(taskID).zfill(8)) inputData = None self._logDebug('TransID: %s, TaskID: %s, paramsDict: %s' % (transID, taskID, str(paramsDict)), transID=transID, method=method) # These helper functions do the real job sites = self._handleDestination(paramsDict) if not sites: self._logError('Could not get a list a sites', transID=transID, method=method) paramsDict['TaskObject'] = '' continue else: self._logDebug('Setting Site: ', str(sites), transID=transID, method=method) res = oJob.setDestination(sites) if not res['OK']: self._logError('Could not set the site: %s' % res['Message'], transID=transID, method=method) continue self._handleInputs(oJob, paramsDict) self._handleRest(oJob, paramsDict) hospitalTrans = [int(x) for x in 
self.opsH.getValue("Hospital/Transformations", [])] if int(transID) in hospitalTrans: self._handleHospital(oJob) paramsDict['TaskObject'] = '' if self.outputDataModule: getOutputDataTiming -= time.time() res = self.getOutputData({'Job': oJob._toXML(), 'TransformationID': transID, 'TaskID': taskID, 'InputData': inputData}) getOutputDataTiming += time.time() if not res['OK']: self._logError("Failed to generate output data", res['Message'], transID=transID, method=method) continue for name, output in res['Value'].iteritems(): oJob._addJDLParameter(name, ';'.join(output)) paramsDict['TaskObject'] = oJob if taskDict: self._logVerbose('Average getOutputData time: %.1f per task' % (getOutputDataTiming / len(taskDict)), transID=transID, method=method) self._logInfo('Prepared %d tasks' % len(taskDict), transID=transID, method=method, reftime=startTime) return S_OK(taskDict) ############################################################################# def _handleDestination(self, paramsDict): """ Handle Sites and TargetSE in the parameters """ try: sites = ['ANY'] if paramsDict['Site']: # 'Site' comes from the XML and therefore is ; separated sites = fromChar(paramsDict['Site'], sepChar=';') except KeyError: pass if self.destinationPlugin_o: destinationPlugin_o = self.destinationPlugin_o else: res = self.__generatePluginObject(self.destinationPlugin) if not res['OK']: self._logFatal("Could not generate a destination plugin object") return res destinationPlugin_o = res['Value'] self.destinationPlugin_o = destinationPlugin_o destinationPlugin_o.setParameters(paramsDict) destSites = destinationPlugin_o.run() if not destSites: return sites # Now we need to make the AND with the sites, if defined if sites != ['ANY']: # Need to get the AND destSites &= set(sites) return list(destSites) def _handleInputs(self, oJob, paramsDict): """ set job inputs (+ metadata) """ inputData = paramsDict.get('InputData') transID = paramsDict['TransformationID'] if inputData: 
self._logVerbose('Setting input data to %s' % inputData, transID=transID, method='handleInputs') oJob.setInputData(inputData) def _handleRest(self, oJob, paramsDict): """ add as JDL parameters all the other parameters that are not for inputs or destination """ transID = paramsDict['TransformationID'] for paramName, paramValue in paramsDict.iteritems(): if paramName not in ('InputData', 'Site', 'TargetSE'): if paramValue: self._logDebug('Setting %s to %s' % (paramName, paramValue), transID=transID, method='handleRest') oJob._addJDLParameter(paramName, paramValue) def _handleHospital(self, oJob): """ Optional handle of hospital jobs """ oJob.setType('Hospital') oJob.setInputDataPolicy('download', dataScheduling=False) hospitalSite = self.opsH.getValue("Hospital/HospitalSite", 'DIRAC.JobDebugger.ch') oJob.setDestination(hospitalSite) hospitalCEs = self.opsH.getValue("Hospital/HospitalCEs", []) if hospitalCEs: oJob._addJDLParameter('GridCE', hospitalCEs) def __generatePluginObject(self, plugin): """ This simply instantiates the TaskManagerPlugin class with the relevant plugin name """ try: plugModule = __import__(self.pluginLocation, globals(), locals(), ['TaskManagerPlugin']) except ImportError as e: self._logException("Failed to import 'TaskManagerPlugin' %s: %s" % (plugin, e)) return S_ERROR() try: plugin_o = getattr(plugModule, 'TaskManagerPlugin')('%s' % plugin, operationsHelper=self.opsH) return S_OK(plugin_o) except AttributeError as e: self._logException("Failed to create %s(): %s." 
% (plugin, e)) return S_ERROR() ############################################################################# def getOutputData(self, paramDict): """ Get the list of job output LFNs from the provided plugin """ if not self.outputDataModule_o: # Create the module object moduleFactory = ModuleFactory() moduleInstance = moduleFactory.getModule(self.outputDataModule, None) if not moduleInstance['OK']: return moduleInstance self.outputDataModule_o = moduleInstance['Value'] # This is the "argument" to the module, set it and then execute self.outputDataModule_o.paramDict = paramDict return self.outputDataModule_o.execute() def submitTransformationTasks(self, taskDict): """ Submit the tasks """ if 'BulkJobObject' in taskDict: return self.__submitTransformationTasksBulk(taskDict) return self.__submitTransformationTasks(taskDict) def __submitTransformationTasksBulk(self, taskDict): """ Submit jobs in one go with one parametric job """ if not taskDict: return S_OK(taskDict) startTime = time.time() oJob = taskDict.pop('BulkJobObject') # we can only do this, once the job has been popped, or we _might_ crash transID = taskDict.values()[0]['TransformationID'] if oJob is None: self._logError('no bulk Job object found', transID=transID, method='submitTransformationTasksBulk') return S_ERROR(ETSUKN, 'No bulk job object provided for submission') result = self.submitTaskToExternal(oJob) if not result['OK']: return result jobIDList = result['Value'] if len(jobIDList) != len(taskDict): for task in taskDict.values(): task['Success'] = False return S_ERROR(ETSUKN, 'Submitted less number of jobs than requested tasks') # Get back correspondance with tasks sorted by ID for jobID, taskID in zip(jobIDList, sorted(taskDict)): taskDict[taskID]['ExternalID'] = jobID taskDict[taskID]['Success'] = True submitted = len(jobIDList) self._logInfo('Submitted %d tasks to WMS in %.1f seconds' % (submitted, time.time() - startTime), transID=transID, method='submitTransformationTasksBulk') return 
S_OK(taskDict) def __submitTransformationTasks(self, taskDict): """ Submit jobs one by one """ method = 'submitTransformationTasks' submitted = 0 failed = 0 startTime = time.time() for task in taskDict.itervalues(): transID = task['TransformationID'] if not task['TaskObject']: task['Success'] = False failed += 1 continue res = self.submitTaskToExternal(task['TaskObject']) if res['OK']: task['ExternalID'] = res['Value'] task['Success'] = True submitted += 1 else: self._logError("Failed to submit task to WMS", res['Message'], transID=transID, method=method) task['Success'] = False failed += 1 if submitted: self._logInfo('Submitted %d tasks to WMS in %.1f seconds' % (submitted, time.time() - startTime), transID=transID, method=method) if failed: self._logError('Failed to submit %d tasks to WMS.' % (failed), transID=transID, method=method) return S_OK(taskDict) def submitTaskToExternal(self, job): """ Submits a single job to the WMS. """ if isinstance(job, basestring): try: oJob = self.jobClass(job) except Exception as x: # pylint: disable=broad-except self._logException("Failed to create job object", '', x) return S_ERROR("Failed to create job object") elif isinstance(job, self.jobClass): oJob = job else: self._logError("No valid job description found") return S_ERROR("No valid job description found") workflowFileObject = StringIO.StringIO(oJob._toXML()) jdl = oJob._toJDL(jobDescriptionObject=workflowFileObject) return self.submissionClient.submitJob(jdl, workflowFileObject) def updateTransformationReservedTasks(self, taskDicts): transID = None jobNames = [self._transTaskName(taskDict['TransformationID'], taskDict['TaskID']) for taskDict in taskDicts] res = self.jobMonitoringClient.getJobs({'JobName': jobNames}) if not res['OK']: self._logError("Failed to get task from WMS", res['Message'], transID=transID, method='updateTransformationReservedTasks') return res jobNameIDs = {} for wmsID in res['Value']: res = self.jobMonitoringClient.getJobPrimarySummary(int(wmsID)) 
if not res['OK']: self._logWarn("Failed to get task summary from WMS", res['Message'], transID=transID, method='updateTransformationReservedTasks') else: jobNameIDs[res['Value']['JobName']] = int(wmsID) noTask = list(set(jobNames) - set(jobNameIDs)) return S_OK({'NoTasks': noTask, 'TaskNameIDs': jobNameIDs}) def getSubmittedTaskStatus(self, taskDicts): """ Check the status of a list of tasks and return lists of taskIDs for each new status """ if taskDicts: wmsIDs = [int(taskDict['ExternalID']) for taskDict in taskDicts if int(taskDict['ExternalID'])] transID = taskDicts[0]['TransformationID'] else: return S_OK({}) res = self.jobMonitoringClient.getJobsStatus(wmsIDs) if not res['OK']: self._logWarn("Failed to get job status from the WMS system", transID=transID) return res statusDict = res['Value'] updateDict = {} for taskDict in taskDicts: taskID = taskDict['TaskID'] wmsID = int(taskDict['ExternalID']) if not wmsID: continue oldStatus = taskDict['ExternalStatus'] newStatus = statusDict.get(wmsID, {}).get('Status', 'Removed') if oldStatus != newStatus: if newStatus == "Removed": self._logVerbose('Production/Job %d/%d removed from WMS while it is in %s status' % (transID, taskID, oldStatus), transID=transID) newStatus = "Failed" self._logVerbose('Setting job status for Production/Job %d/%d to %s' % (transID, taskID, newStatus), transID=transID) updateDict.setdefault(newStatus, []).append(taskID) return S_OK(updateDict) def getSubmittedFileStatus(self, fileDicts): """ Check the status of a list of files and return the new status of each LFN """ if not fileDicts: return S_OK({}) # All files are from the same transformation transID = fileDicts[0]['TransformationID'] taskFiles = {} for fileDict in fileDicts: jobName = self._transTaskName(transID, fileDict['TaskID']) taskFiles.setdefault(jobName, {})[fileDict['LFN']] = fileDict['Status'] res = self.updateTransformationReservedTasks(fileDicts) if not res['OK']: self._logWarn("Failed to obtain taskIDs for files", 
transID=transID) return res noTasks = res['Value']['NoTasks'] taskNameIDs = res['Value']['TaskNameIDs'] updateDict = {} for jobName in noTasks: for lfn, oldStatus in taskFiles[jobName].iteritems(): if oldStatus != 'Unused': updateDict[lfn] = 'Unused' res = self.jobMonitoringClient.getJobsStatus(taskNameIDs.values()) if not res['OK']: self._logWarn("Failed to get job status from the WMS system", transID=transID) return res statusDict = res['Value'] for jobName, wmsID in taskNameIDs.iteritems(): jobStatus = statusDict.get(wmsID, {}).get('Status') newFileStatus = {'Done': 'Processed', 'Completed': 'Processed', 'Failed': 'Unused'}.get(jobStatus) if newFileStatus: for lfn, oldStatus in taskFiles[jobName].iteritems(): if newFileStatus != oldStatus: updateDict[lfn] = newFileStatus return S_OK(updateDict)
def test_FullChain(self):
  """ Drive a single job through its life cycle using the WMS clients.

      The test:

      - submits a hello-world job through WMSClient
      - forces status changes through the JobStateUpdate service
      - resets, reschedules, kills and finally deletes the job through WMSClient
      - checks after each step, through JobMonitoringClient, that the job status is the expected one
  """
  wmsClient = WMSClient()
  jobMonitor = JobMonitoringClient()
  jobStateUpdate = RPCClient('WorkloadManagement/JobStateUpdate')

  # create the job
  job = helloWorldJob()
  jobDescription = createFile(job)

  # submit the job
  res = wmsClient.submitJob(job._toJDL(xmlFile=jobDescription))
  self.assertTrue(res['OK'])
  jobID = res['Value']

  # updating the status
  jobStateUpdate.setJobStatus(jobID, 'Running', 'Executing Minchiapp', 'source')

  # reset the job
  res = wmsClient.resetJob(jobID)
  self.assertTrue(res['OK'])

  # reschedule the job: it must go back to 'Received'
  res = wmsClient.rescheduleJob(jobID)
  self.assertTrue(res['OK'])
  res = jobMonitor.getJobStatus(jobID)
  self.assertTrue(res['OK'])
  self.assertEqual(res['Value'], 'Received')

  # updating the status again
  jobStateUpdate.setJobStatus(jobID, 'Matched', 'matching', 'source')

  # kill the job
  res = wmsClient.killJob(jobID)
  self.assertTrue(res['OK'])
  res = jobMonitor.getJobStatus(jobID)
  self.assertTrue(res['OK'])
  self.assertEqual(res['Value'], 'Killed')

  # updating the status once more
  jobStateUpdate.setJobStatus(jobID, 'Done', 'matching', 'source')

  # kill the job: this time it won't kill... it's done!
  res = wmsClient.killJob(jobID)
  self.assertTrue(res['OK'])
  res = jobMonitor.getJobStatus(jobID)
  self.assertTrue(res['OK'])
  self.assertEqual(res['Value'], 'Done')

  # delete the job - this will just set its status to "deleted"
  res = wmsClient.deleteJob(jobID)
  self.assertTrue(res['OK'])
  res = jobMonitor.getJobStatus(jobID)
  self.assertTrue(res['OK'])
  self.assertEqual(res['Value'], 'Deleted')
def test_JobStateUpdateAndJobMonitoringMultuple(self):
  """ Submit a batch of jobs and query the JobMonitoring service about them.

      One job is submitted for each combination of destination site, input data list and job type.
      The monitoring "distinct values" calls, the counters and the summaries are then checked,
      the status of the last submitted job is updated through the JobStateUpdate service,
      and finally all the jobs are deleted.
  """
  wmsClient = WMSClient()
  jobMonitor = JobMonitoringClient()
  jobStateUpdate = RPCClient('WorkloadManagement/JobStateUpdate')

  jobIDs = []
  dests = ['DIRAC.site1.org', 'DIRAC.site2.org']
  lfnss = [['/a/1.txt', '/a/2.txt'], ['/a/1.txt', '/a/3.txt', '/a/4.txt'], []]
  types = ['User', 'Test']
  for dest in dests:
    for lfns in lfnss:
      for jobType in types:
        job = helloWorldJob()
        job.setDestination(dest)
        job.setInputData(lfns)
        job.setType(jobType)
        jobDescription = createFile(job)
        res = wmsClient.submitJob(job._toJDL(xmlFile=jobDescription))
        self.assertTrue(res['OK'])
        jobID = res['Value']
        jobIDs.append(jobID)

  res = jobMonitor.getSites()
  self.assertTrue(res['OK'])
  self.assertTrue(set(res['Value']) <= set(dests + ['ANY', 'DIRAC.Jenkins.ch']))
  res = jobMonitor.getJobTypes()
  self.assertTrue(res['OK'])
  self.assertEqual(sorted(res['Value']), sorted(types))
  res = jobMonitor.getApplicationStates()
  self.assertTrue(res['OK'])
  self.assertEqual(sorted(res['Value']), sorted(['Unknown']))

  res = jobMonitor.getOwners()
  self.assertTrue(res['OK'])
  res = jobMonitor.getOwnerGroup()
  self.assertTrue(res['OK'])
  res = jobMonitor.getProductionIds()
  self.assertTrue(res['OK'])
  res = jobMonitor.getJobGroups()
  self.assertTrue(res['OK'])
  res = jobMonitor.getStates()
  self.assertTrue(res['OK'])
  self.assertTrue(sorted(res['Value']) in [['Received'], sorted(['Received', 'Waiting'])])
  res = jobMonitor.getMinorStates()
  self.assertTrue(res['OK'])
  self.assertTrue(sorted(res['Value']) in [['Job accepted'], sorted(['Job accepted', 'matching'])])

  res = jobMonitor.getJobs()
  self.assertTrue(res['OK'])
  self.assertTrue(set([str(x) for x in jobIDs]) <= set(res['Value']))

  res = jobMonitor.getCurrentJobCounters()
  self.assertTrue(res['OK'])
  counters = res['Value']
  try:
    pendingJobs = counters.get('Received') + counters.get('Waiting')
  except TypeError:
    # One of the two counters is not reported (None): the check is deliberately skipped
    pendingJobs = None
  if pendingJobs is not None:
    self.assertTrue(pendingJobs >= len(dests) * len(lfnss) * len(types))

  res = jobMonitor.getJobsSummary(jobIDs)
  self.assertTrue(res['OK'])
  res = jobMonitor.getJobPageSummaryWeb({}, [], 0, 100)
  self.assertTrue(res['OK'])

  # From here on, jobID is the ID of the last job submitted in the loop above
  res = jobStateUpdate.setJobStatusBulk(jobID,
                                        {str(datetime.datetime.utcnow()): {'Status': 'Running',
                                                                           'MinorStatus': 'MinorStatus',
                                                                           'ApplicationStatus': 'ApplicationStatus',
                                                                           'Source': 'Unknown'}})
  self.assertTrue(res['OK'])
  res = jobStateUpdate.setJobsParameter({jobID: ['Status', 'Running']})
  self.assertTrue(res['OK'])

  # delete the jobs - this will just set its status to "deleted"
  wmsClient.deleteJob(jobIDs)
def __failStalledJobs(self, failedTime):
  """ Changes the Stalled status to Failed for jobs long in the Stalled status

      A Stalled job is set to Failed when its pilot is not in "Running" status, or when its
      latest update is older than failedTime. Accounting is then sent for the job.
      A second pass sends the accounting for jobs that are already Failed with one of the
      "stalled" minor statuses but still have AccountedFlag set to 'False'.

      :param failedTime: number of seconds without update after which a Stalled job is failed
                         (compared with toEpoch() minus the latest update time)
      :return: S_OK(number of jobs set to Failed), or the error returned by jobDB.selectJobs
  """
  result = self.jobDB.selectJobs({'Status': 'Stalled'})
  if not result['OK']:
    return result
  jobs = result['Value']

  failedCounter = 0
  # The two minor statuses double as the "reason" stored with the Failed status
  minorStalledStatuses = ("Job stalled: pilot not running", 'Stalling for more than %d sec' % failedTime)

  if jobs:
    self.log.info('%s Stalled jobs will be checked for failure' % (len(jobs)))

    for job in jobs:
      # False, or the minor status to set when the job has to be failed
      setFailed = False
      # Check if the job pilot is lost
      result = self.__getJobPilotStatus(job)
      if not result['OK']:
        self.log.error('Failed to get pilot status', result['Message'])
        continue
      pilotStatus = result['Value']
      if pilotStatus != "Running":
        setFailed = minorStalledStatuses[0]
      else:
        # The pilot is still running: fail the job only if it has been silent for too long
        result = self.__getLatestUpdateTime(job)
        if not result['OK']:
          self.log.error('Failed to get job update time', result['Message'])
          continue
        elapsedTime = toEpoch() - result['Value']
        if elapsedTime > failedTime:
          setFailed = minorStalledStatuses[1]

      # Set the jobs Failed, send them a kill signal in case they are not really dead and send accounting info
      if setFailed:
        # Send a kill signal to the job such that it cannot continue running
        WMSClient().killJob(job)
        self.__updateJobStatus(job, 'Failed', setFailed)
        failedCounter += 1
        result = self.__sendAccounting(job)
        if not result['OK']:
          self.log.error('Failed to send accounting', result['Message'])

  recoverCounter = 0

  # Second pass: jobs already Failed for a "stalled" reason but not yet accounted
  for minor in minorStalledStatuses:
    result = self.jobDB.selectJobs({
        'Status': 'Failed',
        'MinorStatus': minor,
        'AccountedFlag': 'False'
    })
    if not result['OK']:
      return result
    if result['Value']:
      jobs = result['Value']
      self.log.info('%s Stalled jobs will be Accounted' % (len(jobs)))
      for job in jobs:
        result = self.__sendAccounting(job)
        if not result['OK']:
          self.log.error('Failed to send accounting', result['Message'])
          continue

        recoverCounter += 1
    # Here result is either the successful selection or the accounting result of the last job:
    # the second minor status is skipped when that last accounting call failed
    if not result['OK']:
      break

  if failedCounter:
    self.log.info('%d jobs set to Failed' % failedCounter)
  if recoverCounter:
    self.log.info('%d jobs properly Accounted' % recoverCounter)
  return S_OK(failedCounter)
class WorkflowTasks(TaskBase):
  """ Handles jobs

      Prepares job objects out of transformation tasks, submits them to the WMS through the
      submission client and follows up the status of the submitted tasks and of their files
      through the job monitoring client.
  """

  def __init__(self, transClient=None, logger=None, submissionClient=None, jobMonitoringClient=None,
               outputDataModule=None, jobClass=None, opsH=None, destinationPlugin=None,
               ownerDN=None, ownerGroup=None):
    """ Generates some default objects.

        jobClass is by default "DIRAC.Interfaces.API.Job.Job". An extension of it also works:
        VOs can pass in their job class extension, if present

        :param transClient: passed to the TaskBase constructor
        :param logger: logger to use, a 'WorkflowTasks' sub-logger of gLogger by default
        :param submissionClient: client used for submitting jobs, a WMSClient by default
        :param jobMonitoringClient: client used for monitoring jobs, a JobMonitoringClient by default
        :param outputDataModule: output data module name, by default the value of the
                                 "Transformations/OutputDataModule" operations option
        :param jobClass: class of the job objects, Job by default
        :param opsH: operations helper, Operations() by default
        :param destinationPlugin: destination plugin name, by default the value of the
                                  'Transformations/DestinationPlugin' operations option ('BySE' if not set)
        :param ownerDN: DN on behalf of which the default WMSClient submits
        :param ownerGroup: group on behalf of which the default WMSClient submits
    """
    if not logger:
      logger = gLogger.getSubLogger('WorkflowTasks')

    super(WorkflowTasks, self).__init__(transClient, logger)

    # Certificates are used only when both the DN and the group to delegate to are known
    useCertificates = bool(ownerDN and ownerGroup)

    self.submissionClient = submissionClient or WMSClient(useCertificates=useCertificates,
                                                          delegatedDN=ownerDN,
                                                          delegatedGroup=ownerGroup)
    self.jobMonitoringClient = jobMonitoringClient or JobMonitoringClient()
    self.jobClass = jobClass or Job
    # opsH must be set before the two options below, which are read through it
    self.opsH = opsH or Operations()
    self.outputDataModule = outputDataModule or self.opsH.getValue("Transformations/OutputDataModule", "")
    self.destinationPlugin = destinationPlugin or self.opsH.getValue('Transformations/DestinationPlugin', 'BySE')

    # Plugin and module objects are created on first use and then cached
    self.destinationPlugin_o = None
    self.outputDataModule_o = None

  def prepareTransformationTasks(self, transBody, taskDict, owner='', ownerGroup='', ownerDN='',
                                 bulkSubmissionFlag=False):
    """ Prepare tasks, given a taskDict, that is created (with some manipulation) by the DB
        jobClass is by default "DIRAC.Interfaces.API.Job.Job". An extension of it also works.

        When owner or ownerGroup is missing both are taken from the current proxy;
        when ownerDN is missing it is looked up from the owner name.

        :param str transBody: transformation job template
        :param dict taskDict: dictionary of per task parameters
        :param str owner: owner of the transformation
        :param str ownerGroup: group of the owner of the transformation
        :param str ownerDN: DN of the owner of the transformation
        :param bool bulkSubmissionFlag: flag for using bulk submission or not

        :return: S_OK/S_ERROR with updated taskDict
    """
    if not (owner and ownerGroup):
      res = getProxyInfo(False, False)
      if not res['OK']:
        return res
      proxyInfo = res['Value']
      owner = proxyInfo['username']
      ownerGroup = proxyInfo['group']

    if not ownerDN:
      res = getDNForUsername(owner)
      if not res['OK']:
        return res
      # several DNs may be returned: the first one is used
      ownerDN = res['Value'][0]

    if bulkSubmissionFlag:
      return self.__prepareTasksBulk(transBody, taskDict, owner, ownerGroup, ownerDN)
    # not a bulk submission
    return self.__prepareTasks(transBody, taskDict, owner, ownerGroup, ownerDN)

  def __prepareTasksBulk(self, transBody, taskDict, owner, ownerGroup, ownerDN):
    """ Prepare transformation tasks with a single job object for bulk submission

        The job object is stored in the returned taskDict under the 'BulkJobObject' key.

        :param str transBody: transformation job template
        :param dict taskDict: dictionary of per task parameters
        :param str owner: owner of the transformation
        :param str ownerGroup: group of the owner of the transformation
        :param str ownerDN: DN of the owner of the transformation

        :return: S_OK/S_ERROR with updated taskDict
    """
    if not taskDict:
      return S_OK({})
    transID = taskDict.values()[0]['TransformationID']

    method = '__prepareTasksBulk'
    startTime = time.time()

    # Prepare the bulk Job object with common parameters
    oJob = self.jobClass(transBody)
    self._logVerbose('Setting job owner:group to %s:%s' % (owner, ownerGroup),
                     transID=transID, method=method)
    oJob.setOwner(owner)
    oJob.setOwnerGroup(ownerGroup)
    oJob.setOwnerDN(ownerDN)

    jobType = oJob.workflow.findParameter('JobType').getValue()
    transGroup = str(transID).zfill(8)

    # Verify that the JOB_ID parameter is added to the workflow
    if not oJob.workflow.findParameter('JOB_ID'):
      oJob._addParameter(oJob.workflow, 'JOB_ID', 'string', '00000000', "Initial JOB_ID")

    if oJob.workflow.findParameter('PRODUCTION_ID'):
      oJob._setParamValue('PRODUCTION_ID', str(transID).zfill(8))  # pylint: disable=protected-access
    else:
      oJob._addParameter(oJob.workflow,  # pylint: disable=protected-access
                         'PRODUCTION_ID',
                         'string',
                         str(transID).zfill(8),
                         "Production ID")
    oJob.setType(jobType)
    self._logVerbose('Adding default transformation group of %s' % (transGroup),
                     transID=transID, method=method)
    oJob.setJobGroup(transGroup)

    if int(transID) in [int(x) for x in self.opsH.getValue("Hospital/Transformations", [])]:
      self._handleHospital(oJob)

    # Collect per job parameters sequences
    paramSeqDict = {}
    # Names of the output parameters produced by the output data module, collected over ALL tasks
    # (not only the last one) since they decide which sequences are added to the workflow
    outputParameterNames = set()
    # tasks must be sorted because we use bulk submission and we must find the correspondance
    for taskID in sorted(taskDict):
      paramsDict = taskDict[taskID]
      seqDict = {}

      paramsDict['JobType'] = jobType

      # Handle destination site
      sites = self._handleDestination(paramsDict)
      if not sites:
        self._logError('Could not get a list a sites', transID=transID, method=method)
        return S_ERROR(ETSUKN, "Can not evaluate destination site")
      self._logVerbose('Setting Site: ', str(sites), transID=transID, method=method)
      seqDict['Site'] = sites

      seqDict['JobName'] = self._transTaskName(transID, taskID)
      seqDict['JOB_ID'] = str(taskID).zfill(8)

      self._logDebug('TransID: %s, TaskID: %s, paramsDict: %s' % (transID, taskID, str(paramsDict)),
                     transID=transID, method=method)

      # Handle Input Data
      inputData = paramsDict.get('InputData')
      if inputData:
        if isinstance(inputData, basestring):
          inputData = inputData.replace(' ', '').split(';')
        self._logVerbose('Setting input data to %s' % inputData,
                         transID=transID, method=method)
        seqDict['InputData'] = inputData
      elif paramSeqDict.get('InputData') is not None:
        # A previous task had input data while this one has none
        self._logError("Invalid mixture of jobs with and without input data")
        return S_ERROR(ETSDATA, "Invalid mixture of jobs with and without input data")

      # All the other (non empty) parameters become parameter sequences as well
      for paramName, paramValue in paramsDict.iteritems():
        if paramName not in ('InputData', 'Site', 'TargetSE') and paramValue:
          self._logVerbose('Setting %s to %s' % (paramName, paramValue),
                           transID=transID, method=method)
          seqDict[paramName] = paramValue

      if self.outputDataModule:
        res = self.getOutputData({'Job': oJob._toXML(),  # pylint: disable=protected-access
                                  'TransformationID': transID,
                                  'TaskID': taskID,
                                  'InputData': inputData})
        if not res['OK']:
          self._logError("Failed to generate output data", res['Message'],
                         transID=transID, method=method)
          # NOTE(review): the task is left out of the parameter sequences, so fewer jobs than
          # tasks will be submitted - confirm that this is the intended handling
          continue
        for name, output in res['Value'].iteritems():
          seqDict[name] = output
          outputParameterNames.add(name)
          # The workflow parameter is a placeholder, to be filled from the parameter sequence
          if oJob.workflow.findParameter(name):
            oJob._setParamValue(name, "%%(%s)s" % name)  # pylint: disable=protected-access
          else:
            oJob._addParameter(oJob.workflow,  # pylint: disable=protected-access
                               name,
                               'JDL',
                               "%%(%s)s" % name,
                               name)

      for pName, seq in seqDict.iteritems():
        paramSeqDict.setdefault(pName, []).append(seq)

    # These sequences are also added to the workflow, the others are set on the job only
    workflowParamNames = set(['JOB_ID', 'PRODUCTION_ID', 'InputData']) | outputParameterNames
    for paramName, paramSeq in paramSeqDict.iteritems():
      if paramName in workflowParamNames:
        res = oJob.setParameterSequence(paramName, paramSeq, addToWorkflow=paramName)
      else:
        res = oJob.setParameterSequence(paramName, paramSeq)
      if not res['OK']:
        return res

    self._logInfo('Prepared %d tasks' % len(taskDict),
                  transID=transID, method=method, reftime=startTime)

    taskDict['BulkJobObject'] = oJob
    return S_OK(taskDict)

  def __prepareTasks(self, transBody, taskDict, owner, ownerGroup, ownerDN):
    """ Prepare transformation tasks with a job object per task

        The job object of each task is stored under its 'TaskObject' key;
        the value is an empty string for the tasks that could not be prepared.

        :param str transBody: transformation job template
        :param dict taskDict: dictionary of per task parameters
        :param owner: owner of the transformation
        :param str ownerGroup: group of the owner of the transformation
        :param str ownerDN: DN of the owner of the transformation

        :return: S_OK/S_ERROR with updated taskDict
    """
    if not taskDict:
      return S_OK({})
    transID = taskDict.values()[0]['TransformationID']

    method = '__prepareTasks'
    startTime = time.time()
    oJobTemplate = self.jobClass(transBody)
    oJobTemplate.setOwner(owner)
    oJobTemplate.setOwnerGroup(ownerGroup)
    oJobTemplate.setOwnerDN(ownerDN)
    try:
      site = oJobTemplate.workflow.findParameter('Site').getValue()
    except AttributeError:
      # no 'Site' parameter in the workflow
      site = None
    jobType = oJobTemplate.workflow.findParameter('JobType').getValue()

    # Update the template with common information (taskDict is not empty here)
    self._logVerbose('Job owner:group to %s:%s' % (owner, ownerGroup),
                     transID=transID, method=method)
    transGroup = str(transID).zfill(8)
    self._logVerbose('Adding default transformation group of %s' % (transGroup),
                     transID=transID, method=method)
    oJobTemplate.setJobGroup(transGroup)
    if oJobTemplate.workflow.findParameter('PRODUCTION_ID'):
      oJobTemplate._setParamValue('PRODUCTION_ID', str(transID).zfill(8))  # pylint: disable=protected-access
    else:
      oJobTemplate._addParameter(oJobTemplate.workflow,  # pylint: disable=protected-access
                                 'PRODUCTION_ID',
                                 'string',
                                 str(transID).zfill(8),
                                 "Production ID")
    if not oJobTemplate.workflow.findParameter('JOB_ID'):
      oJobTemplate._addParameter(oJobTemplate.workflow,  # pylint: disable=protected-access
                                 'JOB_ID',
                                 'string',
                                 '00000000',
                                 "Initial JOB_ID")

    getOutputDataTiming = 0.
    for taskID, paramsDict in taskDict.iteritems():
      # Create a job for each task and add it to the taskDict
      if site is not None:
        paramsDict['Site'] = site
      paramsDict['JobType'] = jobType
      # Now create the job from the template
      oJob = copy.deepcopy(oJobTemplate)
      constructedName = self._transTaskName(transID, taskID)
      self._logVerbose('Setting task name to %s' % constructedName,
                       transID=transID, method=method)
      oJob.setName(constructedName)
      oJob._setParamValue('JOB_ID', str(taskID).zfill(8))  # pylint: disable=protected-access
      # NOTE(review): always None here, while the bulk preparation hands the task input data
      # to the output data module - confirm whether paramsDict['InputData'] should be used
      inputData = None

      self._logDebug('TransID: %s, TaskID: %s, paramsDict: %s' % (transID, taskID, str(paramsDict)),
                     transID=transID, method=method)

      # These helper functions do the real job
      sites = self._handleDestination(paramsDict)
      if not sites:
        self._logError('Could not get a list a sites',
                       transID=transID, method=method)
        paramsDict['TaskObject'] = ''
        continue

      self._logDebug('Setting Site: ', str(sites),
                     transID=transID, method=method)
      res = oJob.setDestination(sites)
      if not res['OK']:
        self._logError('Could not set the site: %s' % res['Message'],
                       transID=transID, method=method)
        paramsDict['TaskObject'] = ''
        continue

      self._handleInputs(oJob, paramsDict)
      self._handleRest(oJob, paramsDict)

      hospitalTrans = [int(x) for x in self.opsH.getValue("Hospital/Transformations", [])]
      if int(transID) in hospitalTrans:
        self._handleHospital(oJob)

      # Empty until the job is fully prepared
      paramsDict['TaskObject'] = ''
      if self.outputDataModule:
        getOutputDataTiming -= time.time()
        res = self.getOutputData({'Job': oJob._toXML(),  # pylint: disable=protected-access
                                  'TransformationID': transID,
                                  'TaskID': taskID,
                                  'InputData': inputData})
        getOutputDataTiming += time.time()
        if not res['OK']:
          self._logError("Failed to generate output data", res['Message'],
                         transID=transID, method=method)
          continue
        for name, output in res['Value'].iteritems():
          oJob._addJDLParameter(name, ';'.join(output))  # pylint: disable=protected-access
      paramsDict['TaskObject'] = oJob

    self._logVerbose('Average getOutputData time: %.1f per task' % (getOutputDataTiming / len(taskDict)),
                     transID=transID, method=method)
    self._logInfo('Prepared %d tasks' % len(taskDict),
                  transID=transID, method=method, reftime=startTime)
    return S_OK(taskDict)

  #############################################################################

  def _handleDestination(self, paramsDict):
    """ Handle Sites and TargetSE in the parameters

        :param dict paramsDict: task parameters, handed over to the destination plugin

        :return: list of destination sites. The list is empty when the destination plugin object
                 cannot be created, or when the sites proposed by the plugin and the sites
                 requested in paramsDict have nothing in common.
    """
    # 'Site' comes from the XML and therefore is ; separated
    requestedSites = paramsDict.get('Site')
    sites = fromChar(requestedSites, sepChar=';') if requestedSites else ['ANY']

    if not self.destinationPlugin_o:
      res = self.__generatePluginObject(self.destinationPlugin)
      if not res['OK']:
        self._logFatal("Could not generate a destination plugin object")
        # An empty list (and not the error structure, which is "true") so that
        # the callers' "if not sites" check detects the failure
        return []
      self.destinationPlugin_o = res['Value']
    destinationPlugin_o = self.destinationPlugin_o

    destinationPlugin_o.setParameters(paramsDict)
    destSites = destinationPlugin_o.run()
    if not destSites:
      return sites

    # Now we need to make the AND with the sites, if defined
    if sites != ['ANY']:
      # Need to get the AND
      destSites &= set(sites)

    return list(destSites)

  def _handleInputs(self, oJob, paramsDict):
    """ set job inputs (+ metadata)

        :param oJob: job object on which the input data is set
        :param dict paramsDict: task parameters, 'InputData' is used if present and not empty
    """
    inputData = paramsDict.get('InputData')
    transID = paramsDict['TransformationID']
    if not inputData:
      return
    self._logVerbose('Setting input data to %s' % inputData,
                     transID=transID, method='_handleInputs')
    res = oJob.setInputData(inputData)
    if not res['OK']:
      self._logError("Could not set the inputs: %s" % res['Message'],
                     transID=transID, method='_handleInputs')

  def _handleRest(self, oJob, paramsDict):
    """ add as JDL parameters all the other parameters that are not for inputs or destination

        :param oJob: job object to which the JDL parameters are added
        :param dict paramsDict: task parameters, the empty ones are skipped
    """
    transID = paramsDict['TransformationID']
    for paramName, paramValue in paramsDict.iteritems():
      if paramName not in ('InputData', 'Site', 'TargetSE') and paramValue:
        self._logDebug('Setting %s to %s' % (paramName, paramValue),
                       transID=transID, method='_handleRest')
        oJob._addJDLParameter(paramName, paramValue)  # pylint: disable=protected-access

  def _handleHospital(self, oJob):
    """ Optional handle of hospital jobs

        The job type is set to 'Hospital', the input data are downloaded, and the destination is
        the site (and optionally the CEs) found in the "Hospital" section of the operations options.
    """
    oJob.setType('Hospital')
    oJob.setInputDataPolicy('download', dataScheduling=False)
    hospitalSite = self.opsH.getValue("Hospital/HospitalSite", 'DIRAC.JobDebugger.ch')
    oJob.setDestination(hospitalSite)
    hospitalCEs = self.opsH.getValue("Hospital/HospitalCEs", [])
    if hospitalCEs:
      oJob._addJDLParameter('GridCE', hospitalCEs)  # pylint: disable=protected-access

  def __generatePluginObject(self, plugin):
    """ This simply instantiates the TaskManagerPlugin class with the relevant plugin name

        :param str plugin: name of the plugin
        :return: S_OK(plugin object), or S_ERROR() if the module cannot be imported
                 or does not contain the TaskManagerPlugin class
    """
    method = '__generatePluginObject'
    try:
      plugModule = __import__(self.pluginLocation, globals(), locals(), ['TaskManagerPlugin'])
    except ImportError as e:
      self._logException("Failed to import 'TaskManagerPlugin' %s: %s" % (plugin, e),
                         method=method)
      return S_ERROR()
    try:
      plugin_o = getattr(plugModule, 'TaskManagerPlugin')('%s' % plugin, operationsHelper=self.opsH)
      return S_OK(plugin_o)
    except AttributeError as e:
      self._logException("Failed to create %s(): %s." % (plugin, e), method=method)
      return S_ERROR()

  #############################################################################

  def getOutputData(self, paramDict):
    """ Get the list of job output LFNs from the provided plugin

        :param dict paramDict: the "argument" of the output data module
        :return: what the execute() method of the module returns,
                 or the error of the module factory if the module cannot be created
    """
    if not self.outputDataModule_o:
      # Create the module object
      moduleFactory = ModuleFactory()

      moduleInstance = moduleFactory.getModule(self.outputDataModule, None)
      if not moduleInstance['OK']:
        return moduleInstance
      self.outputDataModule_o = moduleInstance['Value']
    # This is the "argument" to the module, set it and then execute
    self.outputDataModule_o.paramDict = paramDict
    return self.outputDataModule_o.execute()

  def submitTransformationTasks(self, taskDict):
    """ Submit the tasks

        Bulk submission is used when taskDict contains the 'BulkJobObject' key.

        :param dict taskDict: tasks, as returned by prepareTransformationTasks
        :return: S_OK/S_ERROR with updated taskDict
    """
    if 'BulkJobObject' in taskDict:
      return self.__submitTransformationTasksBulk(taskDict)
    return self.__submitTransformationTasks(taskDict)

  def __submitTransformationTasksBulk(self, taskDict):
    """ Submit jobs in one go with one parametric job

        The 'BulkJobObject' entry is removed from taskDict; on success each task gets
        its 'ExternalID' and 'Success' keys set.

        :param dict taskDict: tasks plus the 'BulkJobObject' entry
        :return: S_OK(taskDict), or S_ERROR if the submission fails or if the number of
                 submitted jobs differs from the number of tasks
    """
    if not taskDict:
      return S_OK(taskDict)
    startTime = time.time()

    method = '__submitTransformationTasksBulk'

    oJob = taskDict.pop('BulkJobObject')
    if not taskDict:
      # Only the job object was there: there is no task to submit
      return S_OK(taskDict)
    # we can only do this, once the job has been popped, or we _might_ crash
    transID = taskDict.values()[0]['TransformationID']
    if oJob is None:
      self._logError('no bulk Job object found', transID=transID, method=method)
      return S_ERROR(ETSUKN, 'No bulk job object provided for submission')

    result = self.submitTaskToExternal(oJob)
    if not result['OK']:
      self._logError('Failed to submit tasks to external', transID=transID, method=method)
      return result

    jobIDList = result['Value']
    if len(jobIDList) != len(taskDict):
      for task in taskDict.values():
        task['Success'] = False
      return S_ERROR(ETSUKN, 'Submitted less number of jobs than requested tasks')
    # Get back correspondance with tasks sorted by ID
    for jobID, taskID in zip(jobIDList, sorted(taskDict)):
      taskDict[taskID]['ExternalID'] = jobID
      taskDict[taskID]['Success'] = True

    submitted = len(jobIDList)
    self._logInfo('Submitted %d tasks to WMS in %.1f seconds' % (submitted, time.time() - startTime),
                  transID=transID, method=method)
    return S_OK(taskDict)

  def __submitTransformationTasks(self, taskDict):
    """ Submit jobs one by one

        Each task gets its 'Success' key set, and its 'ExternalID' key when the submission succeeded.

        :param dict taskDict: tasks, each with its 'TaskObject'
        :return: S_OK(taskDict)
    """
    method = '__submitTransformationTasks'

    submitted = 0
    failed = 0
    startTime = time.time()
    for task in taskDict.itervalues():
      transID = task['TransformationID']
      if not task['TaskObject']:
        # the task could not be prepared
        task['Success'] = False
        failed += 1
        continue
      res = self.submitTaskToExternal(task['TaskObject'])
      if res['OK']:
        task['ExternalID'] = res['Value']
        task['Success'] = True
        submitted += 1
      else:
        self._logError("Failed to submit task to WMS", res['Message'],
                       transID=transID, method=method)
        task['Success'] = False
        failed += 1
    # transID is defined below: a non zero counter means that the loop ran at least once
    if submitted:
      self._logInfo('Submitted %d tasks to WMS in %.1f seconds' % (submitted, time.time() - startTime),
                    transID=transID, method=method)
    if failed:
      self._logError('Failed to submit %d tasks to WMS.' % (failed),
                     transID=transID, method=method)
    return S_OK(taskDict)

  def submitTaskToExternal(self, job):
    """ Submits a single job (which can be a bulk one) to the WMS.

        :param job: a job object (instance of jobClass), or a string from which it can be created
        :return: result of the submitJob call of the submission client,
                 or S_ERROR if no job object could be obtained
    """
    if isinstance(job, basestring):
      try:
        oJob = self.jobClass(job)
      except Exception as x:  # pylint: disable=broad-except
        self._logException("Failed to create job object", '', x)
        return S_ERROR("Failed to create job object")
    elif isinstance(job, self.jobClass):
      oJob = job
    else:
      self._logError("No valid job description found")
      return S_ERROR("No valid job description found")

    workflowFileObject = StringIO.StringIO(oJob._toXML())  # pylint: disable=protected-access
    jdl = oJob._toJDL(jobDescriptionObject=workflowFileObject)  # pylint: disable=protected-access
    return self.submissionClient.submitJob(jdl, workflowFileObject)

  def updateTransformationReservedTasks(self, taskDicts):
    """ Look for the WMS jobs whose name corresponds to the given tasks

        :param list taskDicts: dictionaries with (at least) the 'TransformationID' and 'TaskID' keys
        :return: S_OK({'NoTasks': list of job names with no job found,
                       'TaskNameIDs': dictionary job name -> WMS job ID}), or the error of the getJobs call
    """
    transID = None
    jobNames = [self._transTaskName(taskDict['TransformationID'], taskDict['TaskID'])
                for taskDict in taskDicts]
    res = self.jobMonitoringClient.getJobs({'JobName': jobNames})
    if not res['OK']:
      self._logError("Failed to get task from WMS", res['Message'],
                     transID=transID, method='updateTransformationReservedTasks')
      return res
    jobNameIDs = {}
    for wmsID in res['Value']:
      res = self.jobMonitoringClient.getJobPrimarySummary(int(wmsID))
      if not res['OK']:
        # this job is skipped: its task ends up in the 'NoTasks' list
        self._logWarn("Failed to get task summary from WMS", res['Message'],
                      transID=transID, method='updateTransformationReservedTasks')
      else:
        jobNameIDs[res['Value']['JobName']] = int(wmsID)

    noTask = list(set(jobNames) - set(jobNameIDs))
    return S_OK({'NoTasks': noTask, 'TaskNameIDs': jobNameIDs})

  def getSubmittedTaskStatus(self, taskDicts):
    """ Check the status of a list of tasks and return lists of taskIDs for each new status

        A task whose job is not known to the WMS is given the "Failed" status.

        :param list taskDicts: dictionaries with the 'TaskID', 'ExternalID', 'ExternalStatus'
                               and 'TransformationID' keys
        :return: S_OK(dictionary new status -> list of task IDs), or the error of the getJobsStatus call
    """
    method = 'getSubmittedTaskStatus'

    if not taskDicts:
      return S_OK({})
    wmsIDs = [int(taskDict['ExternalID']) for taskDict in taskDicts if int(taskDict['ExternalID'])]
    transID = taskDicts[0]['TransformationID']

    res = self.jobMonitoringClient.getJobsStatus(wmsIDs)
    if not res['OK']:
      self._logWarn("Failed to get job status from the WMS system",
                    transID=transID, method=method)
      return res
    statusDict = res['Value']
    updateDict = {}
    for taskDict in taskDicts:
      taskID = taskDict['TaskID']
      wmsID = int(taskDict['ExternalID'])
      if not wmsID:
        continue
      oldStatus = taskDict['ExternalStatus']
      newStatus = statusDict.get(wmsID, {}).get('Status', 'Removed')
      if oldStatus != newStatus:
        if newStatus == "Removed":
          self._logVerbose('Production/Job %d/%d removed from WMS while it is in %s status' %
                           (transID, taskID, oldStatus),
                           transID=transID, method=method)
          newStatus = "Failed"
        self._logVerbose('Setting job status for Production/Job %d/%d to %s' % (transID, taskID, newStatus),
                         transID=transID, method=method)
        updateDict.setdefault(newStatus, []).append(taskID)
    return S_OK(updateDict)

  def getSubmittedFileStatus(self, fileDicts):
    """ Check the status of a list of files and return the new status of each LFN

        Files of tasks with no job in the WMS, or whose job is 'Failed', become 'Unused';
        files of 'Done' or 'Completed' jobs become 'Processed'.

        :param list fileDicts: dictionaries with the 'TransformationID', 'TaskID', 'LFN' and 'Status' keys
        :return: S_OK(dictionary LFN -> new status, only for the files whose status changes),
                 or the error of the failing WMS call
    """
    if not fileDicts:
      return S_OK({})

    method = 'getSubmittedFileStatus'

    # All files are from the same transformation
    transID = fileDicts[0]['TransformationID']
    taskFiles = {}
    for fileDict in fileDicts:
      jobName = self._transTaskName(transID, fileDict['TaskID'])
      taskFiles.setdefault(jobName, {})[fileDict['LFN']] = fileDict['Status']

    res = self.updateTransformationReservedTasks(fileDicts)
    if not res['OK']:
      self._logWarn("Failed to obtain taskIDs for files",
                    transID=transID, method=method)
      return res
    noTasks = res['Value']['NoTasks']
    taskNameIDs = res['Value']['TaskNameIDs']

    updateDict = {}
    # No job in the WMS for these tasks: their files can be used again
    for jobName in noTasks:
      for lfn, oldStatus in taskFiles[jobName].iteritems():
        if oldStatus != 'Unused':
          updateDict[lfn] = 'Unused'

    res = self.jobMonitoringClient.getJobsStatus(taskNameIDs.values())
    if not res['OK']:
      self._logWarn("Failed to get job status from the WMS system",
                    transID=transID, method=method)
      return res
    statusDict = res['Value']
    # Job statuses with no entry here leave the file status untouched
    fileStatusForJobStatus = {'Done': 'Processed',
                              'Completed': 'Processed',
                              'Failed': 'Unused'}
    for jobName, wmsID in taskNameIDs.iteritems():
      jobStatus = statusDict.get(wmsID, {}).get('Status')
      newFileStatus = fileStatusForJobStatus.get(jobStatus)
      if newFileStatus:
        for lfn, oldStatus in taskFiles[jobName].iteritems():
          if newFileStatus != oldStatus:
            updateDict[lfn] = newFileStatus
    return S_OK(updateDict)
class JobLaunchpadHandler(WebHandler):
  """Web handler behind the job Launchpad.

  Exposes the proxy status of the session user, the default and predefined
  Launchpad options read from the configuration, and job submission to the WMS.
  """

  AUTH_PROPS = "authenticated"

  def web_getProxyStatus(self):
    """Write the proxy status dictionary of the session user to the response."""
    self.write(self.__getProxyStatus())

  def __getProxyStatus(self, secondsOverride=None):
    """Check that the session user has an uploaded proxy which is valid long enough.

    :param secondsOverride: required remaining lifetime in seconds. When it is None
                            or cannot be converted to an integer, the value of
                            /Registry/DefaultProxyLifeTime is used (24h + 1min if unset).
    :return: dict with "success" ("true"/"false") and either "result"
             ("true"/"false") or "error"
    """
    from DIRAC.FrameworkSystem.Client.ProxyManagerClient import ProxyManagerClient

    userData = self.getSessionData()
    group = str(userData["user"]["group"])
    if group == "visitor":
      return {
          "success": "false",
          "error": "User is anonymous or is not registered in the system"
      }
    userDN = str(userData["user"]["DN"])

    # The override used to be silently ignored; honour it when it is a usable number.
    validSeconds = None
    if secondsOverride is not None:
      try:
        validSeconds = int(secondsOverride)
      except (TypeError, ValueError):
        validSeconds = None
    if validSeconds is None:
      defaultSeconds = 24 * 3600 + 60  # 24H + 1min
      validSeconds = gConfig.getValue("/Registry/DefaultProxyLifeTime", defaultSeconds)

    gLogger.info("\033[0;31m userHasProxy(%s, %s, %s) \033[0m" % (userDN, group, validSeconds))
    result = ProxyManagerClient().userHasProxy(userDN, group, validSeconds)
    # This log line was placed after the return statements and could never run.
    gLogger.info("\033[0;31m PROXY: \033[0m", str(result))
    if not result["OK"]:
      return {"success": "false", "error": "false"}
    return {"success": "true", "result": "true" if result["Value"] else "false"}

  def __getPlatform(self):
    """Return the platform names listed under /Resources/Computing/OSCompatibility.

    :return: list of option names, or False when the section cannot be read
    """
    gLogger.info("start __getPlatform")
    result = gConfig.getOptionsDict("/Resources/Computing/OSCompatibility")
    gLogger.debug(result)
    if not result["OK"]:
      return False
    platform = list(result["Value"])
    gLogger.debug("platform: %s" % platform)
    gLogger.info("end __getPlatform")
    return platform

  def __getOptionsFromCS(self, path="/Website/Launchpad/Options", delimiter=","):
    """Read the Launchpad options below ``path``, recursing into sub-sections.

    :param str path: configuration section to read
    :param str delimiter: separator used to split each option value into a list
    :return: dict mapping option name to list of values and sub-section name to a
             nested result of this method; an empty list when ``path`` cannot be read
    """
    gLogger.info("start __getOptionsFromCS")
    result = gConfig.getOptionsDict(path)
    gLogger.always(result)
    if not result["OK"]:
      return []
    options = result["Value"]
    for name in list(options):
      options[name] = options[name].split(delimiter)

    result = gConfig.getSections(path)
    if result["OK"]:
      for section in result["Value"]:
        options[section] = self.__getOptionsFromCS(path + '/' + section, delimiter)
    gLogger.always("options: %s" % options)
    gLogger.info("end __getOptionsFromCS")
    return options

  def __baseLaunchpadParams(self):
    """Return a fresh dict of the default Launchpad parameters.

    Each value is a two-element list ``[flag, defaultValue]`` exactly as sent to
    the client.
    """
    return {
        "JobName": [1, 'DIRAC'],
        "Executable": [1, "/bin/ls"],
        "Arguments": [1, "-ltrA"],
        "OutputSandbox": [1, "std.out, std.err"],
        "InputData": [0, ""],
        "OutputData": [0, ""],
        "OutputSE": [0, "DIRAC-USER"],
        "OutputPath": [0, ""],
        "CPUTime": [0, "86400"],
        "Site": [0, ""],
        "BannedSite": [0, ""],
        "Platform": [0, "Linux_x86_64_glibc-2.5"],
        "Priority": [0, "5"],
        "StdError": [0, "std.err"],
        "StdOutput": [0, "std.out"],
        "Parameters": [0, "0"],
        "ParameterStart": [0, "0"],
        "ParameterStep": [0, "1"]
    }

  def __applyCSOptions(self, defaultParams, options):
    """Overwrite the default value of each parameter with the first value found in the CS.

    Entries of ``options`` that are not non-empty lists (nested sections come back
    as dicts) are skipped; indexing them with ``[0]`` used to raise.
    """
    for key in options:
      values = options[key]
      if not isinstance(values, list) or not values:
        continue
      if key not in defaultParams:
        defaultParams[key] = [0, ""]
      defaultParams[key][1] = values[0]

  def __getPredefinedSets(self):
    """Read the predefined sets of Launchpad parameter values from the Operations section."""
    operations = Operations()
    predefinedSets = {}
    launchpadSections = operations.getSections("Launchpad")
    if launchpadSections['OK']:
      for section in launchpadSections["Value"]:
        predefinedSets[section] = {}
        sectionOptions = operations.getOptionsDict("Launchpad/" + section)
        gLogger.debug("Launchpad/%s options: %s" % (section, sectionOptions))
        if sectionOptions['OK']:
          predefinedSets[section] = sectionOptions["Value"]
    return predefinedSets

  @asyncGen
  def web_getLaunchpadSetupWithLFNs(self):
    """Build a Launchpad setup for Eiscat with pre-selected LFNs as input data.

    The comma-separated LFNs arrive in the ``path`` request argument. For every
    distinct experiment folder among them, files whose name contains ``rtg_def.m``
    are looked up in the file catalog and appended to the input data. The calling
    JS client uses the returned setup to open a new Launchpad.
    """
    # on the fly file catalog for advanced launchpad
    if not hasattr(self, 'fc'):
      userData = self.getSessionData()
      group = str(userData["user"]["group"])
      vo = getVOForGroup(group)
      self.fc = FileCatalog(vo=vo)

    self.set_header('Content-type', 'text/plain')
    arguments = self.request.arguments
    gLogger.always("submit: incoming arguments %s to getLaunchpadSetupWithLFNs" % arguments)
    lfnStr = str(arguments['path'][0])
    selectedLFNs = lfnStr.split(',')
    lfnList = list(selectedLFNs)

    # checks if the experiments folder in lfn list has a rtg_def.m file at some subfolder
    gLogger.always("submit: checking if some rtg_def.m")
    processed = []
    metaDict = {'type': 'info'}
    for lfn in selectedLFNs:
      # The experiment folder is everything before the fifth "/" of the LFN.
      # NOTE(review): with fewer than five slashes str.find returns -1 and the search
      # wraps around; that historical behaviour is kept as it was.
      pos_relative = -1
      for _ in range(5):
        pos_relative = lfn.find("/", pos_relative + 1)
      experiment_lfn = lfn[0:pos_relative]
      if experiment_lfn in processed:
        continue
      processed.append(experiment_lfn)
      gLogger.always("checking rtg_def.m in %s" % experiment_lfn)
      result = self.fc.findFilesByMetadata(metaDict, path=str(experiment_lfn))
      gLogger.debug("findFilesByMetadata result: %s" % result)
      if not result['OK'] or not result['Value']:
        # The format string had "$s" instead of "%s" and "Message" is absent when the
        # query succeeded with an empty result: both made this line raise.
        gLogger.error("Failed to get type info from %s, %s" %
                      (experiment_lfn, result.get("Message", "no matching files found")))
        continue
      for candidate_lfn in result['Value']:
        if candidate_lfn.find('rtg_def.m') > 0:
          lfnList.append(candidate_lfn)

    defaultParams = self.__baseLaunchpadParams()
    defaultParams.update({
        "JobName": [1, 'Eiscat'],
        "InputData": [1, ', '.join(lfnList)],
        "OutputSE": [1, "EISCAT-disk"]
    })

    delimiter = gConfig.getValue("/Website/Launchpad/ListSeparator", ',')
    options = self.__getOptionsFromCS(delimiter=delimiter)
    gLogger.debug("Options from CS: %s" % options)
    gLogger.info("end __getLaunchpadOpts")

    # Updating the default values from OptionsOverride configuration branch
    self.__applyCSOptions(defaultParams, options)
    gLogger.info("Default params + override from /Website/Launchpad/OptionsOverride -> %s" % defaultParams)

    self.write({
        "success": "true",
        "result": defaultParams,
        "predefinedSets": self.__getPredefinedSets()
    })

  def web_getLaunchpadOpts(self):
    """Write the default Launchpad parameters, updated from the CS, and the predefined sets."""
    defaultParams = self.__baseLaunchpadParams()

    delimiter = gConfig.getValue("/Website/Launchpad/ListSeparator", ',')
    options = self.__getOptionsFromCS(delimiter=delimiter)
    gLogger.debug("Combined options from CS: %s" % options)
    gLogger.info("end __getLaunchpadOpts")

    # Updating the default values from OptionsOverride configuration branch
    self.__applyCSOptions(defaultParams, options)

    self.write({
        "success": "true",
        "result": defaultParams,
        "predefinedSets": self.__getPredefinedSets()
    })

  def __canRunJobs(self):
    """Tell whether the session user holds the NormalUser property."""
    user = self.getSessionData()["user"]
    return "properties" in user and "NormalUser" in user["properties"]

  @asyncGen
  def web_jobSubmit(self):
    """Build a JDL from the request arguments and uploaded files and submit it to the WMS.

    Request arguments named ``lfnField*`` become ``LFN:`` entries of the input
    sandbox, and every other non-empty argument becomes a JDL attribute. Uploaded
    files are stored in a temporary directory for the duration of the submission.
    The request is always finished with a dict holding "success" and either
    "result" or "error".
    """
    # self.set_header('Content-type', "text/html")  # Otherwise the browser would offer you to download a JobSubmit file
    if not self.__canRunJobs():
      self.finish({"success": "false", "error": "You are not allowed to run the jobs"})
      return
    proxy = yield self.threadTask(self.__getProxyStatus, 86460)
    if proxy["success"] == "false" or proxy["result"] == "false":
      self.finish({
          "success": "false",
          "error": "You can not run a job: your proxy is valid less then 24 hours"
      })
      return

    params = {}
    lfns = []
    for name in self.request.arguments:
      try:
        value = self.request.arguments[name][0]
        if len(value) > 0:
          if name[:8] == "lfnField":
            if len(value.strip()) > 0:
              lfns.append("LFN:" + value)
          else:
            params[name] = value
      except Exception as excp:  # best effort: a malformed argument is skipped, not fatal
        gLogger.debug("Skipping request argument %s: %s" % (name, excp))

    jdl = ""
    for item in params:
      if item == "OutputSandbox":
        # Must be exclusive with the generic branch below, otherwise the attribute is
        # written twice (once as a list, once as a quoted string).
        jdl = jdl + str(item) + " = {" + str(params[item]) + "};"
      elif item == "Parameters":
        try:
          parameters = int(params[item])
          jdl = jdl + str(item) + " = \"" + str(parameters) + "\";"
        except (TypeError, ValueError):
          parameters = str(params[item])
          if parameters.find("{") >= 0 and parameters.find("}") >= 0:
            parameters = parameters.rstrip("}")
            parameters = parameters.lstrip("{")
            if len(parameters) > 0:
              jdl = jdl + str(item) + " = {" + parameters + "};"
            else:
              self.finish({"success": "false", "error": "Parameters vector has zero length"})
              return
          else:
            self.finish({
                "success": "false",
                "error": "Parameters must be an integer or a vector. Example: 4 or {1,2,3,4}"
            })
            return
      else:
        jdl = jdl + str(item) + " = \"" + str(params[item]) + "\";"

    store = []
    for key in self.request.files:
      try:
        uploaded = self.request.files[key][0]
        if uploaded.filename:
          gLogger.info("\033[0;31m file - %s \033[0m " % uploaded.filename)
          store.append(uploaded)
      except Exception as excp:  # best effort: a malformed upload entry is skipped
        gLogger.debug("Skipping uploaded file entry %s: %s" % (key, excp))

    gLogger.info("\033[0;31m *** %s \033[0m " % params)

    import os
    import shutil

    storePath = None  # temporary sandbox directory, removed once the submission is over
    fileNameList = []
    saveFailed = False
    callback = {}
    try:
      if store:  # If there is a file(s) in sandbox
        storePath = tempfile.mkdtemp(prefix='DIRAC_')
        try:
          for fileObj in store:
            # The file name comes from the client: keep only its last component so it
            # cannot point outside of the temporary directory.
            name = os.path.join(storePath, os.path.basename(fileObj.filename))
            with open(name, 'wb') as tFile:
              tFile.write(fileObj.body)
            fileNameList.append(name)
        except Exception as x:
          saveFailed = True
          callback = {
              "success": "false",
              "error": "An EXCEPTION happens during saving your sandbox file(s): %s" % str(x)
          }

      if not saveFailed:
        if fileNameList or lfns:
          jdl = jdl + "InputSandbox = {\"" + "\",\"".join(fileNameList + lfns) + "\"};"
        from DIRAC.WorkloadManagementSystem.Client.WMSClient import WMSClient
        jobManager = WMSClient(useCertificates=True, timeout=1800)
        jdl = str(jdl)
        gLogger.info("J D L : ", jdl)
        try:
          result = yield self.threadTask(jobManager.submitJob, jdl)
          if result["OK"]:
            callback = {"success": "true", "result": result["Value"]}
          else:
            callback = {"success": "false", "error": result["Message"]}
        except Exception as x:
          callback = {
              "success": "false",
              "error": "An EXCEPTION happens during job submittion: %s" % str(x)
          }
    finally:
      # The temporary directory used to be leaked on every submission with uploads.
      if storePath is not None:
        shutil.rmtree(storePath, ignore_errors=True)

    # The outcome was computed but never sent back: finish the request with it.
    self.finish(callback)
def __init__(self):
  """Initialise the task base class and create the WMS clients used by the tasks.

  ``submissionClient`` is used to talk to the WMS for job submission and
  ``jobMonitoringClient`` to query the status of the submitted jobs.
  """
  TaskBase.__init__(self)

  # one client per WMS service used by this class
  self.submissionClient = WMSClient()
  self.jobMonitoringClient = JobMonitoringClient()
def initialize(self):
  """Read the agent configuration options and create the service clients.

  :param self: self reference
  :return: S_OK()
  """
  # shifter proxy
  self.am_setOption('shifterProxy', 'DataManager')

  # transformation types: the agent option wins, otherwise the union of the
  # data processing and data manipulation types from the Operations section
  self.dataProcTTypes = Operations().getValue('Transformations/DataProcessing',
                                              ['MCSimulation', 'Merge'])
  self.dataManipTTypes = Operations().getValue('Transformations/DataManipulation',
                                               ['Replication', 'Removal'])
  configuredTypes = self.am_getOption('TransformationTypes', [])
  self.transformationTypes = sorted(
      configuredTypes if configuredTypes else self.dataProcTTypes + self.dataManipTTypes)
  self.log.info("Will consider the following transformation types: %s" %
                str(self.transformationTypes))

  # directory locations
  locations = self.am_getOption('DirectoryLocations', ['TransformationDB', 'MetadataCatalog'])
  self.directoryLocations = sorted(locations)
  self.log.info("Will search for directories in the following locations: %s" %
                str(self.directoryLocations))

  # transformation metadata
  self.transfidmeta = self.am_getOption('TransfIDMeta', "TransformationID")
  self.log.info("Will use %s as metadata tag name for TransformationID" % self.transfidmeta)

  # archive period in days
  self.archiveAfter = self.am_getOption('ArchiveAfter', 7)
  self.log.info("Will archive Completed transformations after %d days" % self.archiveAfter)

  # active SEs
  self.activeStorages = sorted(self.am_getOption('ActiveSEs', []))
  self.log.info("Will check the following storage elements: %s" % str(self.activeStorages))

  # transformation log SEs
  self.logSE = self.am_getOption('TransformationLogSE', 'LogSE')
  self.log.info("Will remove logs found on storage element: %s" % self.logSE)

  # enable/disable execution, should be using CS option Status?? with default value as 'Active'??
  self.enableFlag = self.am_getOption('EnableFlag', 'True')

  # clients: transformation, WMS, request management and file catalog
  self.transClient = TransformationClient()
  self.wmsClient = WMSClient()
  self.reqClient = ReqClient()
  self.metadataClient = FileCatalogClient()

  return S_OK()
class TransformationCleaningAgent(AgentModule):
  """
  .. class:: TransformationCleaningAgent

  Agent that cleans, removes the output of, and archives transformations
  depending on their status (Cleaning, RemovingFiles, Completed).

  :param ~TransformationClient.TransformationClient transClient: TransformationClient instance
  :param ~FileCatalogClient.FileCatalogClient metadataClient: FileCatalogClient instance
  """

  def __init__(self, *args, **kwargs):
    """ c'tor: only sets defaults, the real values are read in :meth:`initialize` """
    AgentModule.__init__(self, *args, **kwargs)

    self.shifterProxy = None

    # clients, created in initialize()
    self.transClient = None
    self.wmsClient = None
    self.reqClient = None
    self.metadataClient = None

    # transformation types
    self.transformationTypes = None
    # directory locations
    self.directoryLocations = ['TransformationDB', 'MetadataCatalog']
    # transformation metadata
    self.transfidmeta = 'TransformationID'
    # archive period in days
    self.archiveAfter = 7
    # transformation log SEs
    self.logSE = 'LogSE'
    # enable/disable execution
    self.enableFlag = 'True'

    self.dataProcTTypes = ['MCSimulation', 'Merge']
    self.dataManipTTypes = ['Replication', 'Removal']

  def initialize(self):
    """ agent initialisation

    reading and setting config opts

    :param self: self reference
    """
    # shifter proxy
    # See cleanContent method: this proxy will be used ALSO when the file catalog used
    # is the DIRAC File Catalog (DFC).
    # This is possible because of unset of the "UseServerCertificate" option
    self.shifterProxy = self.am_getOption('shifterProxy', self.shifterProxy)

    # transformations types
    self.dataProcTTypes = Operations().getValue('Transformations/DataProcessing', self.dataProcTTypes)
    self.dataManipTTypes = Operations().getValue('Transformations/DataManipulation', self.dataManipTTypes)
    agentTSTypes = self.am_getOption('TransformationTypes', [])
    if agentTSTypes:
      self.transformationTypes = sorted(agentTSTypes)
    else:
      self.transformationTypes = sorted(self.dataProcTTypes + self.dataManipTTypes)
    self.log.info("Will consider the following transformation types: %s" % str(self.transformationTypes))

    # directory locations
    self.directoryLocations = sorted(self.am_getOption('DirectoryLocations', self.directoryLocations))
    self.log.info("Will search for directories in the following locations: %s" % str(self.directoryLocations))

    # transformation metadata
    self.transfidmeta = self.am_getOption('TransfIDMeta', self.transfidmeta)
    self.log.info("Will use %s as metadata tag name for TransformationID" % self.transfidmeta)

    # archive period in days
    self.archiveAfter = self.am_getOption('ArchiveAfter', self.archiveAfter)  # days
    self.log.info("Will archive Completed transformations after %d days" % self.archiveAfter)

    # transformation log SEs
    self.logSE = Operations().getValue('/LogStorage/LogSE', self.logSE)
    self.log.info("Will remove logs found on storage element: %s" % self.logSE)

    # transformation client
    self.transClient = TransformationClient()
    # wms client
    self.wmsClient = WMSClient()
    # request client
    self.reqClient = ReqClient()
    # file catalog client
    self.metadataClient = FileCatalogClient()

    return S_OK()

  #############################################################################
  def execute(self):
    """ execution in one agent's cycle

    Cleans the transformations in Cleaning status, removes the output of those in
    RemovingFiles status and archives the Completed ones that were not updated
    for ``archiveAfter`` days.

    :param self: self reference
    """
    self.enableFlag = self.am_getOption('EnableFlag', self.enableFlag)
    if self.enableFlag != 'True':
      self.log.info('TransformationCleaningAgent is disabled by configuration option EnableFlag')
      return S_OK('Disabled via CS flag')

    # Obtain the transformations in Cleaning status and remove any mention of the jobs/files
    res = self.transClient.getTransformations({'Status': 'Cleaning',
                                               'Type': self.transformationTypes})
    if res['OK']:
      for transDict in res['Value']:
        self._runWithProxy(self._executeClean, transDict, "Cleaning transformation")
    else:
      self.log.error("Failed to get transformations", res['Message'])

    # Obtain the transformations in RemovingFiles status and removes the output files
    res = self.transClient.getTransformations({'Status': 'RemovingFiles',
                                               'Type': self.transformationTypes})
    if res['OK']:
      for transDict in res['Value']:
        self._runWithProxy(self._executeRemoval, transDict, "Removing files for transformation")
    else:
      self.log.error("Could not get the transformations", res['Message'])

    # Obtain the transformations in Completed status and archive if inactive for X days
    olderThanTime = datetime.utcnow() - timedelta(days=self.archiveAfter)
    res = self.transClient.getTransformations({'Status': 'Completed',
                                               'Type': self.transformationTypes},
                                              older=olderThanTime,
                                              timeStamp='LastUpdate')
    if res['OK']:
      for transDict in res['Value']:
        self._runWithProxy(self._executeArchive, transDict, "Archiving files for transformation")
    else:
      self.log.error("Could not get the transformations", res['Message'])
    return S_OK()

  def _runWithProxy(self, action, transDict, logPrefix):
    """Run ``action(transDict)`` directly when a shifter proxy is configured,
    otherwise through executeWithUserProxy with the transformation author's DN and group.

    :param callable action: one of the ``_execute*`` methods
    :param dict transDict: transformation dictionary (TransformationID, AuthorDN, AuthorGroup, ...)
    :param str logPrefix: beginning of the info message logged in the user-proxy case
    """
    if self.shifterProxy:
      action(transDict)
      return
    self.log.info((logPrefix + " %(TransformationID)s with %(AuthorDN)s, %(AuthorGroup)s") % transDict)
    executeWithUserProxy(action)(transDict,
                                 proxyUserDN=transDict['AuthorDN'],
                                 proxyUserGroup=transDict['AuthorGroup'])

  def _executeClean(self, transDict):
    """Clean transformation."""
    # if transformation is of type `Replication` or `Removal`, there is nothing to clean.
    # We just archive
    if transDict['Type'] in self.dataManipTTypes:
      res = self.archiveTransformation(transDict['TransformationID'])
      if not res['OK']:
        self.log.error("Problems archiving transformation %s: %s" % (transDict['TransformationID'],
                                                                     res['Message']))
    else:
      res = self.cleanTransformation(transDict['TransformationID'])
      if not res['OK']:
        self.log.error("Problems cleaning transformation %s: %s" % (transDict['TransformationID'],
                                                                    res['Message']))

  def _executeRemoval(self, transDict):
    """Remove files from given transformation."""
    res = self.removeTransformationOutput(transDict['TransformationID'])
    if not res['OK']:
      self.log.error("Problems removing transformation %s: %s" % (transDict['TransformationID'],
                                                                  res['Message']))

  def _executeArchive(self, transDict):
    """Archive the given transformation."""
    res = self.archiveTransformation(transDict['TransformationID'])
    if not res['OK']:
      self.log.error("Problems archiving transformation %s: %s" % (transDict['TransformationID'],
                                                                   res['Message']))
    return S_OK()

  #############################################################################
  #
  # Get the transformation directories for checking
  #

  def getTransformationDirectories(self, transID):
    """ get the directories for the supplied transformation from the transformation system.
        These directories are used by removeTransformationOutput and cleanTransformation
        for removing output.

    :param self: self reference
    :param int transID: transformation ID
    :return: S_OK(sorted list of directories) or the failing result
    """
    self.log.verbose("Cleaning Transformation directories of transformation %d" % transID)
    directories = []
    if 'TransformationDB' in self.directoryLocations:
      res = self.transClient.getTransformationParameters(transID, ['OutputDirectories'])
      if not res['OK']:
        self.log.error("Failed to obtain transformation directories", res['Message'])
        return res
      transDirectories = []
      if res['Value']:
        if not isinstance(res['Value'], list):
          try:
            transDirectories = ast.literal_eval(res['Value'])
          except Exception:
            # It can happen if the res['Value'] is '/a/b/c' instead of '["/a/b/c"]'.
            # Exception rather than BaseException: KeyboardInterrupt/SystemExit must propagate.
            transDirectories.append(res['Value'])
        else:
          transDirectories = res['Value']
      directories = self._addDirs(transID, transDirectories, directories)

    if 'MetadataCatalog' in self.directoryLocations:
      res = self.metadataClient.findDirectoriesByMetadata({self.transfidmeta: transID})
      if not res['OK']:
        self.log.error("Failed to obtain metadata catalog directories", res['Message'])
        return res
      transDirectories = res['Value']
      directories = self._addDirs(transID, transDirectories, directories)

    if not directories:
      self.log.info("No output directories found")
    directories = sorted(directories)
    return S_OK(directories)

  @classmethod
  def _addDirs(cls, transID, newDirs, existingDirs):
    """ append unique :newDirs: list to :existingDirs: list

    Only folders whose path contains the zero-padded (8 digits) transformation ID
    are considered. Paths are normalised before being compared and stored.

    :param int transID: transformationID
    :param list newDirs: src list of paths
    :param list existingDirs: dest list of paths (modified in place and returned)
    """
    transStr = str(transID).zfill(8)
    for folder in newDirs:
      if not re.search(transStr, str(folder)):
        continue
      # Compare the normalised form: the list holds normalised paths, so testing the
      # raw path let duplicates through (e.g. '/a/b/' after '/a/b').
      normalised = os.path.normpath(folder)
      if normalised not in existingDirs:
        existingDirs.append(normalised)
    return existingDirs

  #############################################################################
  #
  # These are the methods for performing the cleaning of catalogs and storage
  #

  def _removeFilesWithProxy(self, lfns):
    """Call DataManager().removeFile(lfns, force=True) with UseServerCertificate switched off.

    The option is switched back on in a ``finally`` clause so that an exception in
    the removal cannot leave the process-wide setting flipped.

    :param list lfns: LFNs to remove
    :return: result of DataManager.removeFile
    """
    # Executing with shifter proxy
    gConfigurationData.setOptionInCFG('/DIRAC/Security/UseServerCertificate', 'false')
    try:
      return DataManager().removeFile(lfns, force=True)
    finally:
      gConfigurationData.setOptionInCFG('/DIRAC/Security/UseServerCertificate', 'true')

  def cleanContent(self, directory):
    """ wipe out everything from catalog under folder :directory:

    :param self: self reference
    :params str directory: folder name
    :return: S_OK() or S_ERROR when a file could not be removed for a reason other
             than "File does not exist"
    """
    self.log.verbose("Cleaning Catalog contents")
    res = self.__getCatalogDirectoryContents([directory])
    if not res['OK']:
      return res
    filesFound = res['Value']
    if not filesFound:
      self.log.info("No files are registered in the catalog directory %s" % directory)
      return S_OK()
    self.log.info("Attempting to remove %d possible remnants from the catalog and storage" % len(filesFound))

    res = self._removeFilesWithProxy(filesFound)
    if not res['OK']:
      return res
    realFailure = False
    for lfn, reason in res['Value']['Failed'].items():
      if "File does not exist" in str(reason):
        self.log.warn("File %s not found in some catalog: " % (lfn))
      else:
        self.log.error("Failed to remove file found in the catalog", "%s %s" % (lfn, reason))
        realFailure = True
    if realFailure:
      return S_ERROR("Failed to remove all files found in the catalog")
    return S_OK()

  def __getCatalogDirectoryContents(self, directories):
    """ get catalog contents under paths :directories:

    The directory tree is walked breadth first. Directories that do not exist are
    logged and skipped.

    :param self: self reference
    :param list directories: list of paths in catalog (left untouched)
    :return: S_OK(list of LFNs)
    """
    self.log.info('Obtaining the catalog contents for %d directories:' % len(directories))
    for directory in directories:
      self.log.info(directory)
    # Work on a copy: the queue is consumed and extended below and the caller's
    # list must not be emptied as a side effect.
    activeDirs = list(directories)
    allFiles = {}
    fc = FileCatalog()
    while activeDirs:
      currentDir = activeDirs.pop(0)
      res = returnSingleResult(fc.listDirectory(currentDir))
      if not res['OK'] and 'Directory does not exist' in res['Message']:  # FIXME: DFC should return errno
        self.log.info("The supplied directory %s does not exist" % currentDir)
      elif not res['OK']:
        if "No such file or directory" in res['Message']:
          self.log.info("%s: %s" % (currentDir, res['Message']))
        else:
          self.log.error("Failed to get directory %s content: %s" % (currentDir, res['Message']))
      else:
        dirContents = res['Value']
        activeDirs.extend(dirContents['SubDirs'])
        allFiles.update(dirContents['Files'])
    self.log.info("Found %d files" % len(allFiles))
    return S_OK(list(allFiles))

  def cleanTransformationLogFiles(self, directory):
    """ clean up transformation logs from directory :directory:

    :param self: self reference
    :param str directory: folder name
    :return: S_OK() also when the directory does not exist, otherwise the failing result
    """
    self.log.verbose("Removing log files found in the directory %s" % directory)
    res = returnSingleResult(StorageElement(self.logSE).removeDirectory(directory, recursive=True))
    if not res['OK']:
      if cmpError(res, errno.ENOENT):  # No such file or directory
        self.log.warn("Transformation log directory does not exist", directory)
        return S_OK()
      self.log.error("Failed to remove log files", res['Message'])
      return res
    self.log.info("Successfully removed transformation log directory")
    return S_OK()

  #############################################################################
  #
  # These are the functional methods for archiving and cleaning transformations
  #

  def removeTransformationOutput(self, transID):
    """ This just removes any mention of the output data from the catalog and storage

    On success the transformation status is set to RemovedFiles.

    :param int transID: transformation ID
    """
    self.log.info("Removing output data for transformation %s" % transID)
    res = self.getTransformationDirectories(transID)
    if not res['OK']:
      # Deliberately not an error for the caller: the status is left unchanged.
      self.log.error('Problem obtaining directories for transformation %s with result "%s"' % (transID, res))
      return S_OK()
    directories = res['Value']
    for directory in directories:
      if not re.search('/LOG/', directory):
        res = self.cleanContent(directory)
        if not res['OK']:
          return res

    self.log.info("Removed %d directories from the catalog and its files from the storage for transformation %s" %
                  (len(directories), transID))
    # Clean ALL the possible remnants found in the metadata catalog
    res = self.cleanMetadataCatalogFiles(transID)
    if not res['OK']:
      return res
    self.log.info("Successfully removed output of transformation %d" % transID)
    # Change the status of the transformation to RemovedFiles
    res = self.transClient.setTransformationParameter(transID, 'Status', 'RemovedFiles')
    if not res['OK']:
      self.log.error("Failed to update status of transformation %s to RemovedFiles" % (transID), res['Message'])
      return res
    self.log.info("Updated status of transformation %s to RemovedFiles" % (transID))
    return S_OK()

  def archiveTransformation(self, transID):
    """ This just removes job from the jobDB and the transformation DB

    On success the transformation status is set to Archived.

    :param self: self reference
    :param int transID: transformation ID
    """
    self.log.info("Archiving transformation %s" % transID)
    # Clean the jobs in the WMS and any failover requests found
    res = self.cleanTransformationTasks(transID)
    if not res['OK']:
      return res
    # Clean the transformation DB of the files and job information
    res = self.transClient.cleanTransformation(transID)
    if not res['OK']:
      return res
    self.log.info("Successfully archived transformation %d" % transID)
    # Change the status of the transformation to archived
    res = self.transClient.setTransformationParameter(transID, 'Status', 'Archived')
    if not res['OK']:
      self.log.error("Failed to update status of transformation %s to Archived" % (transID), res['Message'])
      return res
    self.log.info("Updated status of transformation %s to Archived" % (transID))
    return S_OK()

  def cleanTransformation(self, transID):
    """ This removes what was produced by the supplied transformation,
        leaving only some info and log in the transformation DB.

    On success the transformation status is set to Cleaned.

    :param int transID: transformation ID
    """
    self.log.info("Cleaning transformation %s" % transID)
    res = self.getTransformationDirectories(transID)
    if not res['OK']:
      # Deliberately not an error for the caller: the status is left unchanged.
      self.log.error('Problem obtaining directories for transformation %s with result "%s"' % (transID, res))
      return S_OK()
    directories = res['Value']
    # Clean the jobs in the WMS and any failover requests found
    res = self.cleanTransformationTasks(transID)
    if not res['OK']:
      return res
    # Clean the log files for the jobs
    for directory in directories:
      if re.search('/LOG/', directory):
        res = self.cleanTransformationLogFiles(directory)
        if not res['OK']:
          return res
      res = self.cleanContent(directory)
      if not res['OK']:
        return res

    # Clean ALL the possible remnants found
    res = self.cleanMetadataCatalogFiles(transID)
    if not res['OK']:
      return res
    # Clean the transformation DB of the files and job information
    res = self.transClient.cleanTransformation(transID)
    if not res['OK']:
      return res
    self.log.info("Successfully cleaned transformation %d" % transID)
    res = self.transClient.setTransformationParameter(transID, 'Status', 'Cleaned')
    if not res['OK']:
      self.log.error("Failed to update status of transformation %s to Cleaned" % (transID), res['Message'])
      return res
    self.log.info("Updated status of transformation %s to Cleaned" % (transID))
    return S_OK()

  def cleanMetadataCatalogFiles(self, transID):
    """ wipe out files from catalog

    Removes every file that the metadata catalog associates with the transformation.

    :param int transID: transformation ID
    :return: S_OK() or S_ERROR when at least one file could not be removed
    """
    res = self.metadataClient.findFilesByMetadata({self.transfidmeta: transID})
    if not res['OK']:
      return res
    fileToRemove = res['Value']
    if not fileToRemove:
      self.log.info('No files found for transID %s' % transID)
      return S_OK()

    res = self._removeFilesWithProxy(fileToRemove)
    if not res['OK']:
      return res
    for lfn, reason in res['Value']['Failed'].items():
      self.log.error("Failed to remove file found in metadata catalog", "%s %s" % (lfn, reason))
    if res['Value']['Failed']:
      return S_ERROR("Failed to remove all files found in the metadata catalog")
    self.log.info("Successfully removed all files found in the BK")
    return S_OK()

  #############################################################################
  #
  # These are the methods for removing the jobs from the WMS and transformation DB
  #

  def cleanTransformationTasks(self, transID):
    """ clean tasks from WMS, or from the RMS if it is a DataManipulation transformation

    :param int transID: transformation ID
    """
    self.log.verbose("Cleaning Transformation tasks of transformation %d" % transID)
    res = self.__getTransformationExternalIDs(transID)
    if not res['OK']:
      return res
    externalIDs = res['Value']
    if externalIDs:
      res = self.transClient.getTransformationParameters(transID, ['Type'])
      if not res['OK']:
        self.log.error("Failed to determine transformation type")
        return res
      transType = res['Value']
      if transType in self.dataProcTTypes:
        res = self.__removeWMSTasks(externalIDs)
      else:
        res = self.__removeRequests(externalIDs)
      if not res['OK']:
        return res
    return S_OK()

  def __getTransformationExternalIDs(self, transID):
    """ collect all ExternalIDs for transformation :transID:

    :param self: self reference
    :param int transID: transformation ID
    :return: S_OK(list of ExternalID values) or the failing result
    """
    res = self.transClient.getTransformationTasks(condDict={'TransformationID': transID})
    if not res['OK']:
      self.log.error("Failed to get externalIDs for transformation %d" % transID, res['Message'])
      return res
    externalIDs = [taskDict['ExternalID'] for taskDict in res["Value"]]
    self.log.info("Found %d tasks for transformation" % len(externalIDs))
    return S_OK(externalIDs)

  def __removeRequests(self, requestIDs):
    """ This will remove requests from the RMS system

    Zero IDs are skipped. A failed cancellation is logged but does not change the
    returned value, which is always S_OK().

    :param list requestIDs: request IDs (numbers or numeric strings)
    """
    # int() already yields arbitrary precision integers, no need for long()
    rIDs = [int(j) for j in requestIDs if int(j)]
    for reqID in rIDs:
      res = self.reqClient.cancelRequest(reqID)
      if not res['OK']:
        self.log.error("Failed to cancel request %s" % reqID, res['Message'])
    return S_OK()

  def __checkWMSResult(self, res, jobList, doneVerb, failVerb):
    """Log the outcome of a WMS kill/delete call.

    :param dict res: result of killJob / deleteJob
    :param list jobList: job IDs the call was made for
    :param str doneVerb: past form used in the success message ("killed", "removed")
    :param str failVerb: infinitive used in the failure messages ("kill", "remove")
    :return: False when jobs were reported as NonauthorizedJobIDs or FailedJobIDs, True otherwise
    """
    if res['OK']:
      self.log.info("Successfully %s %d jobs from WMS" % (doneVerb, len(jobList)))
    elif ("InvalidJobIDs" in res) and ("NonauthorizedJobIDs" not in res) and ("FailedJobIDs" not in res):
      self.log.info("Found %s jobs which did not exist in the WMS" % len(res['InvalidJobIDs']))
    elif "NonauthorizedJobIDs" in res:
      self.log.error("Failed to %s %s jobs because not authorized" % (failVerb, len(res['NonauthorizedJobIDs'])))
      return False
    elif "FailedJobIDs" in res:
      self.log.error("Failed to %s %s jobs" % (failVerb, len(res['FailedJobIDs'])))
      return False
    else:
      # Any other failure used to pass completely unnoticed; the outcome is unchanged
      # but the error is now visible in the logs.
      self.log.warn("Unexpected error while trying to %s jobs" % failVerb, res.get('Message', ''))
    return True

  def __removeWMSTasks(self, transJobIDs):
    """ wipe out jobs and their requests from the system

    :param self: self reference
    :param list transJobIDs: job IDs
    :return: S_OK() or S_ERROR when jobs or failover requests could not be removed
    """
    # Prevent 0 job IDs
    jobIDs = [int(j) for j in transJobIDs if int(j)]
    allRemove = True
    for jobList in breakListIntoChunks(jobIDs, 500):
      if not self.__checkWMSResult(self.wmsClient.killJob(jobList), jobList, "killed", "kill"):
        allRemove = False
      if not self.__checkWMSResult(self.wmsClient.deleteJob(jobList), jobList, "removed", "remove"):
        allRemove = False

    if not allRemove:
      return S_ERROR("Failed to remove all remnants from WMS")
    self.log.info("Successfully removed all tasks from the WMS")

    if not jobIDs:
      self.log.info("JobIDs not present, unable to remove asociated requests.")
      return S_OK()

    res = self.reqClient.getRequestIDsForJobs(jobIDs)
    if not res['OK']:
      self.log.error("Failed to get requestID for jobs.", res['Message'])
      return res
    failoverRequests = dict(res['Value']['Successful'])
    if not failoverRequests:
      return S_OK()

    failed = 0
    for jobID, requestID in failoverRequests.items():
      # Put this check just in case, tasks must have associated jobs
      if jobID == 0 or jobID == '0':
        continue
      res = self.reqClient.cancelRequest(requestID)
      if not res['OK']:
        self.log.error("Failed to remove request from RequestDB", res['Message'])
        failed += 1
      else:
        # %s, not %d: the job ID may be a string (see the '0' check above)
        self.log.verbose("Removed request %s associated to job %s." % (requestID, jobID))

    if failed:
      self.log.info("Successfully removed %s requests" % (len(failoverRequests) - failed))
      self.log.info("Failed to remove %s requests" % failed)
      return S_ERROR("Failed to remove all the request from RequestDB")
    self.log.info("Successfully removed all the associated failover requests")
    return S_OK()
def test_JobStateUpdateAndJobMonitoring(self):
    """Verify the JobStateUpdate and JobMonitoring functions on a single job.

    The test submits a hello-world job, updates its status, parameters,
    application status and site through the JobStateUpdate service, and then
    reads the same information back through the JobMonitoring client.
    Finally the job is deleted through the WMS client.
    """
    wmsClient = WMSClient()
    jobMonitor = JobMonitoringClient()
    jobStateUpdate = RPCClient('WorkloadManagement/JobStateUpdate')

    # create a job and check stuff
    job = helloWorldJob()
    jobDescription = createFile(job)

    # submitting the job. Checking few stuff
    res = wmsClient.submitJob(job._toJDL(xmlFile=jobDescription))
    self.assertTrue(res['OK'], res.get('Message'))
    jobID = int(res['Value'])
    res = jobMonitor.getJobJDL(jobID, True)
    self.assertTrue(res['OK'], res.get('Message'))
    res = jobMonitor.getJobJDL(jobID, False)
    self.assertTrue(res['OK'], res.get('Message'))

    # Adding stuff
    res = jobStateUpdate.setJobStatus(jobID, 'Matched', 'matching', 'source')
    self.assertTrue(res['OK'], res.get('Message'))
    res = jobStateUpdate.setJobParameters(jobID, [('par1', 'par1Value'), ('par2', 'par2Value')])
    self.assertTrue(res['OK'], res.get('Message'))
    res = jobStateUpdate.setJobApplicationStatus(jobID, 'app status', 'source')
    self.assertTrue(res['OK'], res.get('Message'))
    # NOTE: setJobFlag(), unsetJobFlag() and traceJobParameter() are not exercised by this test.
    res = jobStateUpdate.setJobSite(jobID, 'Site')
    self.assertTrue(res['OK'], res.get('Message'))

    # now checking few things
    res = jobMonitor.getJobStatus(jobID)
    self.assertTrue(res['OK'], res.get('Message'))
    self.assertEqual(res['Value'], 'Running')
    res = jobMonitor.getJobParameter(jobID, 'par1')
    self.assertTrue(res['OK'], res.get('Message'))
    self.assertEqual(res['Value'], {'par1': 'par1Value'})
    res = jobMonitor.getJobParameters(jobID)
    self.assertTrue(res['OK'], res.get('Message'))
    self.assertEqual(res['Value'], {'par1': 'par1Value', 'par2': 'par2Value'})
    res = jobMonitor.getJobAttribute(jobID, 'Site')
    self.assertTrue(res['OK'], res.get('Message'))
    self.assertEqual(res['Value'], 'Site')
    res = jobMonitor.getJobAttributes(jobID)
    self.assertTrue(res['OK'], res.get('Message'))
    self.assertEqual(res['Value']['ApplicationStatus'], 'app status')
    self.assertEqual(res['Value']['JobName'], 'helloWorld')
    res = jobMonitor.getJobSummary(jobID)
    self.assertTrue(res['OK'], res.get('Message'))
    self.assertEqual(res['Value']['ApplicationStatus'], 'app status')
    self.assertEqual(res['Value']['Status'], 'Running')
    res = jobMonitor.getJobHeartBeatData(jobID)
    self.assertTrue(res['OK'], res.get('Message'))
    self.assertEqual(res['Value'], [])
    res = jobMonitor.getInputData(jobID)
    self.assertTrue(res['OK'], res.get('Message'))
    self.assertEqual(res['Value'], [])
    res = jobMonitor.getJobPrimarySummary(jobID)
    self.assertTrue(res['OK'], res.get('Message'))
    res = jobMonitor.getAtticJobParameters(jobID)
    self.assertTrue(res['OK'], res.get('Message'))

    # bulk status update, then check that only Status/MinorStatus changed
    res = jobStateUpdate.setJobsStatus([jobID], 'Done', 'MinorStatus', 'Unknown')
    self.assertTrue(res['OK'], res.get('Message'))
    res = jobMonitor.getJobSummary(jobID)
    self.assertTrue(res['OK'], res.get('Message'))
    self.assertEqual(res['Value']['Status'], 'Done')
    self.assertEqual(res['Value']['MinorStatus'], 'MinorStatus')
    self.assertEqual(res['Value']['ApplicationStatus'], 'app status')

    res = jobStateUpdate.sendHeartBeat(jobID, {'bih': 'bih'}, {'boh': 'boh'})
    self.assertTrue(res['OK'], res.get('Message'))

    # delete the job - this will just set its status to "deleted"
    wmsClient.deleteJob(jobID)
def test_FullChain(self):
    """Drive one job through its whole life cycle via the WMS clients.

    The test
      - calls the WMSClient methods (submit, reset, reschedule, kill, delete)
      - pushes status changes through the JobStateUpdate client
      - uses the JobMonitoring client to verify the resulting job states

    Deleting the job at the end only sets its status to "deleted".
    """
    wmsClient = WMSClient()
    jobMonitor = JobMonitoringClient()
    jobStateUpdate = JobStateUpdateClient()

    def expectOK(result):
        """Assert that a DIRAC result structure is OK and hand it back."""
        self.assertTrue(result["OK"], result.get("Message"))
        return result

    def expectJobStatus(expected):
        """Assert the major status reported by the monitoring client."""
        result = expectOK(jobMonitor.getJobsStatus(jobID))
        self.assertEqual(result["Value"][jobID]["Status"], expected, msg="Got %s" % str(result["Value"]))

    # create the job
    job = helloWorldJob()
    description = createFile(job)

    # submit the job
    submission = expectOK(wmsClient.submitJob(job._toJDL(xmlFile=description)))
    self.assertTrue(isinstance(submission["Value"], int), msg="Got %s" % type(submission["Value"]))
    self.assertEqual(
        submission["Value"],
        submission["JobID"],
        msg="Got %s, expected %s" % (str(submission["Value"]), submission["JobID"]),
    )
    jobID = submission["Value"]

    # updating the status
    expectOK(jobStateUpdate.setJobStatus(jobID, JobStatus.RUNNING, "Executing Minchiapp", "source"))

    # reset the job
    expectOK(wmsClient.resetJob(jobID))

    # reschedule the job
    expectOK(wmsClient.rescheduleJob(jobID))
    expectJobStatus(JobStatus.RECEIVED)

    minor = expectOK(jobMonitor.getJobsMinorStatus([jobID]))
    expectedMinor = {jobID: {"MinorStatus": "Job Rescheduled"}}
    self.assertEqual(minor["Value"], expectedMinor, msg="Got %s" % str(minor["Value"]))

    application = expectOK(jobMonitor.getJobsApplicationStatus([jobID]))
    expectedApplication = {jobID: {"ApplicationStatus": "Unknown"}}
    self.assertEqual(application["Value"], expectedApplication, msg="Got %s" % str(application["Value"]))

    states = expectOK(jobMonitor.getJobsStates([jobID]))
    expectedStates = {
        jobID: {
            "Status": JobStatus.RECEIVED,
            "MinorStatus": "Job Rescheduled",
            "ApplicationStatus": "Unknown",
        }
    }
    self.assertEqual(states["Value"], expectedStates, msg="Got %s" % str(states["Value"]))

    # updating the status again, one transition at a time
    transitions = (
        (JobStatus.CHECKING, "checking"),
        (JobStatus.WAITING, "waiting"),
        (JobStatus.MATCHED, "matched"),
    )
    for majorStatus, minorStatus in transitions:
        expectOK(jobStateUpdate.setJobStatus(jobID, majorStatus, minorStatus, "source"))

    # kill the job
    expectOK(wmsClient.killJob(jobID))
    expectJobStatus(JobStatus.KILLED)

    # delete the job - this will just set its status to "deleted"
    expectOK(wmsClient.deleteJob(jobID))
    expectJobStatus(JobStatus.DELETED)
class TransformationCleaningAgent(AgentModule):
    """
    .. class:: TransformationCleaningAgent

    Agent that, on every cycle, cleans transformations in ``Cleaning`` status,
    removes the output of those in ``RemovingFiles`` status and archives the
    ``Completed`` ones that were not updated for ``archiveAfter`` days.

    :param DataManager dm: DataManager attribute (declared, never assigned here;
                           DataManager instances are created on demand)
    :param TransformationClient transClient: TransformationClient instance
    :param FileCatalogClient metadataClient: FileCatalogClient instance
    """

    def __init__(self, *args, **kwargs):
        """ c'tor: declare every attribute, real values are set in initialize() """
        AgentModule.__init__(self, *args, **kwargs)

        # # data manager
        self.dm = None
        # # transformation client
        self.transClient = None
        # # wms client
        self.wmsClient = None
        # # request client
        self.reqClient = None
        # # file catalog client
        self.metadataClient = None

        # # transformations types
        self.transformationTypes = None
        # # directory locations
        self.directoryLocations = None
        # # transformation metadata
        self.transfidmeta = None
        # # archive period in days
        self.archiveAfter = None
        # # active SEs
        self.activeStorages = None
        # # transformation log SEs
        self.logSE = None
        # # enable/disable execution
        self.enableFlag = None

    def initialize(self):
        """ agent initialisation

        reading and setting config opts

        :param self: self reference
        :return: S_OK()
        """
        # # shifter proxy
        self.am_setOption('shifterProxy', 'DataManager')
        # # transformations types
        self.dataProcTTypes = Operations().getValue('Transformations/DataProcessing', ['MCSimulation', 'Merge'])
        self.dataManipTTypes = Operations().getValue('Transformations/DataManipulation', ['Replication', 'Removal'])
        agentTSTypes = self.am_getOption('TransformationTypes', [])
        if agentTSTypes:
            self.transformationTypes = sorted(agentTSTypes)
        else:
            self.transformationTypes = sorted(self.dataProcTTypes + self.dataManipTTypes)
        self.log.info("Will consider the following transformation types: %s" % str(self.transformationTypes))
        # # directory locations
        self.directoryLocations = sorted(self.am_getOption('DirectoryLocations', ['TransformationDB',
                                                                                  'MetadataCatalog']))
        self.log.info("Will search for directories in the following locations: %s" % str(self.directoryLocations))
        # # transformation metadata
        self.transfidmeta = self.am_getOption('TransfIDMeta', "TransformationID")
        self.log.info("Will use %s as metadata tag name for TransformationID" % self.transfidmeta)
        # # archive period in days
        self.archiveAfter = self.am_getOption('ArchiveAfter', 7)  # days
        self.log.info("Will archive Completed transformations after %d days" % self.archiveAfter)
        # # active SEs
        self.activeStorages = sorted(self.am_getOption('ActiveSEs', []))
        self.log.info("Will check the following storage elements: %s" % str(self.activeStorages))
        # # transformation log SEs
        self.logSE = self.am_getOption('TransformationLogSE', 'LogSE')
        self.log.info("Will remove logs found on storage element: %s" % self.logSE)
        # # enable/disable execution, should be using CS option Status?? with default value as 'Active'??
        self.enableFlag = self.am_getOption('EnableFlag', 'True')

        # # self.dm is left as None: DataManager() is instantiated where it is needed
        # # transformation client
        self.transClient = TransformationClient()
        # # wms client
        self.wmsClient = WMSClient()
        # # request client
        self.reqClient = ReqClient()
        # # file catalog client
        self.metadataClient = FileCatalogClient()

        return S_OK()

    #############################################################################
    def execute(self):
        """ execution in one agent's cycle

        Failures on single transformations, or on the queries themselves, are
        logged and do not stop the cycle.

        :param self: self reference
        :return: S_OK() (S_OK( 'Disabled via CS flag' ) when EnableFlag is not 'True')
        """
        self.enableFlag = self.am_getOption('EnableFlag', 'True')
        if self.enableFlag != 'True':
            self.log.info('TransformationCleaningAgent is disabled by configuration option EnableFlag')
            return S_OK('Disabled via CS flag')

        # # Obtain the transformations in Cleaning status and remove any mention of the jobs/files
        res = self.transClient.getTransformations({'Status': 'Cleaning',
                                                   'Type': self.transformationTypes})
        if res['OK']:
            for transDict in res['Value']:
                transID = transDict['TransformationID']
                # # if transformation is of type `Replication` or `Removal`, there is nothing to clean.
                # # We just archive
                if transDict['Type'] in self.dataManipTTypes:
                    result = self.archiveTransformation(transID)
                    if not result['OK']:
                        self.log.error("Problems archiving transformation %s: %s" % (transID, result['Message']))
                else:
                    result = self.cleanTransformation(transID)
                    if not result['OK']:
                        self.log.error("Problems cleaning transformation %s: %s" % (transID, result['Message']))
        else:
            # previously this failure was ignored without any trace
            self.log.error("Could not get the transformations in Cleaning status", res['Message'])

        # # Obtain the transformations in RemovingFiles status and (wait for it) removes the output files
        res = self.transClient.getTransformations({'Status': 'RemovingFiles',
                                                   'Type': self.transformationTypes})
        if res['OK']:
            for transDict in res['Value']:
                transID = transDict['TransformationID']
                result = self.removeTransformationOutput(transID)
                if not result['OK']:
                    self.log.error("Problems removing transformation %s: %s" % (transID, result['Message']))
        else:
            # previously this failure was ignored without any trace
            self.log.error("Could not get the transformations in RemovingFiles status", res['Message'])

        # # Obtain the transformations in Completed status and archive if inactive for X days
        olderThanTime = datetime.utcnow() - timedelta(days=self.archiveAfter)
        res = self.transClient.getTransformations({'Status': 'Completed',
                                                   'Type': self.transformationTypes},
                                                  older=olderThanTime,
                                                  timeStamp='LastUpdate')
        if res['OK']:
            for transDict in res['Value']:
                transID = transDict['TransformationID']
                result = self.archiveTransformation(transID)
                if not result['OK']:
                    self.log.error("Problems archiving transformation %s: %s" % (transID, result['Message']))
        else:
            self.log.error("Could not get the transformations")

        return S_OK()

    #############################################################################
    #
    # Get the transformation directories for checking
    #

    def getTransformationDirectories(self, transID):
        """ get the directories for the supplied transformation from the transformation system
            and/or the metadata catalog, depending on self.directoryLocations

        :param self: self reference
        :param int transID: transformation ID
        :return: S_OK( sorted list of directories ) or the failing S_ERROR
        """
        directories = []
        if 'TransformationDB' in self.directoryLocations:
            res = self.transClient.getTransformationParameters(transID, ['OutputDirectories'])
            if not res['OK']:
                self.log.error("Failed to obtain transformation directories", res['Message'])
                return res
            transDirectories = res['Value'].splitlines()
            directories = self._addDirs(transID, transDirectories, directories)

        if 'MetadataCatalog' in self.directoryLocations:
            res = self.metadataClient.findDirectoriesByMetadata({self.transfidmeta: transID})
            if not res['OK']:
                self.log.error("Failed to obtain metadata catalog directories", res['Message'])
                return res
            transDirectories = res['Value']
            directories = self._addDirs(transID, transDirectories, directories)

        if not directories:
            self.log.info("No output directories found")
        directories = sorted(directories)
        return S_OK(directories)

    @classmethod
    def _addDirs(cls, transID, newDirs, existingDirs):
        """ append unique :newDirs: entries to :existingDirs: list

        Only the paths containing the transformation ID, zero-padded to
        8 characters, are taken into account. :existingDirs: is modified in place.

        :param cls: class reference
        :param int transID: transformationID
        :param list newDirs: src list of paths
        :param list existingDirs: dest list of paths
        :return: :existingDirs:
        """
        # loop invariant: the padded ID does not depend on the folder
        transStr = str(transID).zfill(8)
        for folder in newDirs:
            if re.search(transStr, str(folder)) and folder not in existingDirs:
                existingDirs.append(folder)
        return existingDirs

    #############################################################################
    #
    # These are the methods for performing the cleaning of catalogs and storage
    #

    def cleanStorageContents(self, directory):
        """ delete lfn dir from all active SE, stopping at the first failure

        :param self: self reference
        :param str directory: folder name
        """
        for storageElement in self.activeStorages:
            res = self.__removeStorageDirectory(directory, storageElement)
            if not res['OK']:
                return res
        return S_OK()

    def __removeStorageDirectory(self, directory, storageElement):
        """ wipe out all contents from :directory: at :storageElement:

        :param self: self reference
        :param str directory: path
        :param str storageElement: SE name
        """
        self.log.info('Removing the contents of %s at %s' % (directory, storageElement))

        se = StorageElement(storageElement)

        res = se.getPfnForLfn([directory])
        if not res['OK']:
            self.log.error("Failed to get PFN for directory", res['Message'])
            return res
        if directory in res['Value']['Failed']:
            self.log.verbose('Failed to obtain directory PFN from LFN',
                             '%s %s' % (directory, res['Value']['Failed'][directory]))
            return S_ERROR('Failed to obtain directory PFN from LFNs')
        storageDirectory = res['Value']['Successful'][directory]

        res = returnSingleResult(se.exists(storageDirectory))
        if not res['OK']:
            self.log.error("Failed to obtain existance of directory", res['Message'])
            return res
        exists = res['Value']
        if not exists:
            self.log.info("The directory %s does not exist at %s " % (directory, storageElement))
            return S_OK()

        res = returnSingleResult(se.removeDirectory(storageDirectory, recursive=True))
        if not res['OK']:
            self.log.error("Failed to remove storage directory", res['Message'])
            return res
        self.log.info("Successfully removed %d files from %s at %s" % (res['Value']['FilesRemoved'],
                                                                       directory,
                                                                       storageElement))
        return S_OK()

    def _removeFilesWithShifterProxy(self, lfns):
        """ remove :lfns: with DataManager().removeFile( lfns, force = True ) while
            '/DIRAC/Security/UseServerCertificate' is switched to 'false'

        The option is set back to 'true' in a `finally` clause, so it is
        restored even if the removal raises.

        :param self: self reference
        :param lfns: LFNs to remove
        :return: whatever DataManager().removeFile returns
        """
        # Executing with shifter proxy
        gConfigurationData.setOptionInCFG('/DIRAC/Security/UseServerCertificate', 'false')
        try:
            return DataManager().removeFile(lfns, force=True)
        finally:
            gConfigurationData.setOptionInCFG('/DIRAC/Security/UseServerCertificate', 'true')

    def cleanCatalogContents(self, directory):
        """ wipe out everything from catalog under folder :directory:

        Files reported as "File does not exist" only produce a warning, any
        other removal failure makes the method return S_ERROR.

        :param self: self reference
        :param str directory: folder name
        """
        res = self.__getCatalogDirectoryContents([directory])
        if not res['OK']:
            return res
        filesFound = res['Value']
        if not filesFound:
            self.log.info("No files are registered in the catalog directory %s" % directory)
            return S_OK()
        self.log.info("Attempting to remove %d possible remnants from the catalog and storage" % len(filesFound))

        res = self._removeFilesWithShifterProxy(filesFound)
        if not res['OK']:
            return res

        realFailure = False
        for lfn, reason in res['Value']['Failed'].items():
            if "File does not exist" in str(reason):
                self.log.warn("File %s not found in some catalog: " % (lfn))
            else:
                self.log.error("Failed to remove file found in the catalog", "%s %s" % (lfn, reason))
                realFailure = True
        if realFailure:
            return S_ERROR("Failed to remove all files found in the catalog")
        return S_OK()

    def __getCatalogDirectoryContents(self, directories):
        """ get catalog contents under paths :directories:, descending into sub-directories

        Directories that cannot be listed are logged and skipped.

        :param self: self reference
        :param list directories: list of paths in catalog (left untouched)
        :return: S_OK( list of the files found )
        """
        self.log.info('Obtaining the catalog contents for %d directories:' % len(directories))
        for directory in directories:
            self.log.info(directory)

        # work on a copy: the queue is consumed below and must not empty the caller's list
        activeDirs = list(directories)
        allFiles = {}
        fc = FileCatalog()
        while activeDirs:
            currentDir = activeDirs.pop(0)
            res = returnSingleResult(fc.listDirectory(currentDir))
            if not res['OK'] and res['Message'].endswith('The supplied path does not exist'):
                self.log.info("The supplied directory %s does not exist" % currentDir)
            elif not res['OK']:
                if "No such file or directory" in res['Message']:
                    self.log.info("%s: %s" % (currentDir, res['Message']))
                else:
                    self.log.error("Failed to get directory %s content: %s" % (currentDir, res['Message']))
            else:
                dirContents = res['Value']
                activeDirs.extend(dirContents['SubDirs'])
                allFiles.update(dirContents['Files'])
        self.log.info("Found %d files" % len(allFiles))
        return S_OK(list(allFiles))

    def cleanTransformationLogFiles(self, directory):
        """ clean up transformation logs from directory :directory: at the log SE

        :param self: self reference
        :param str directory: folder name
        """
        self.log.info("Removing log files found in the directory %s" % directory)
        res = returnSingleResult(StorageElement(self.logSE).removeDirectory(directory))
        if not res['OK']:
            self.log.error("Failed to remove log files", res['Message'])
            return res
        self.log.info("Successfully removed transformation log directory")
        return S_OK()

    #############################################################################
    #
    # These are the functional methods for archiving and cleaning transformations
    #

    def removeTransformationOutput(self, transID):
        """ This just removes any mention of the output data from the catalog and storage

        Directories matching '/LOG/' are left alone. On success the
        transformation status is set to RemovedFiles.

        :param self: self reference
        :param int transID: transformation ID
        """
        self.log.info("Removing output data for transformation %s" % transID)
        res = self.getTransformationDirectories(transID)
        if not res['OK']:
            self.log.error('Problem obtaining directories for transformation %s with result "%s"' % (transID, res))
            # the status is not changed, so the transformation is seen again on the next cycle
            return S_OK()
        directories = res['Value']
        for directory in directories:
            if not re.search('/LOG/', directory):
                res = self.cleanCatalogContents(directory)
                if not res['OK']:
                    return res
                res = self.cleanStorageContents(directory)
                if not res['OK']:
                    return res
        self.log.info("Removed directories in the catalog and storage for transformation")
        # Clean ALL the possible remnants found in the metadata catalog
        res = self.cleanMetadataCatalogFiles(transID)
        if not res['OK']:
            return res
        self.log.info("Successfully removed output of transformation %d" % transID)
        # Change the status of the transformation to RemovedFiles
        res = self.transClient.setTransformationParameter(transID, 'Status', 'RemovedFiles')
        if not res['OK']:
            self.log.error("Failed to update status of transformation %s to RemovedFiles" % (transID), res['Message'])
            return res
        self.log.info("Updated status of transformation %s to RemovedFiles" % (transID))
        return S_OK()

    def archiveTransformation(self, transID):
        """ This just removes job from the jobDB and the transformation DB

        On success the transformation status is set to Archived.

        :param self: self reference
        :param int transID: transformation ID
        """
        self.log.info("Archiving transformation %s" % transID)
        # Clean the jobs in the WMS and any failover requests found
        res = self.cleanTransformationTasks(transID)
        if not res['OK']:
            return res
        # Clean the transformation DB of the files and job information
        res = self.transClient.cleanTransformation(transID)
        if not res['OK']:
            return res
        self.log.info("Successfully archived transformation %d" % transID)
        # Change the status of the transformation to archived
        res = self.transClient.setTransformationParameter(transID, 'Status', 'Archived')
        if not res['OK']:
            self.log.error("Failed to update status of transformation %s to Archived" % (transID), res['Message'])
            return res
        self.log.info("Updated status of transformation %s to Archived" % (transID))
        return S_OK()

    def cleanTransformation(self, transID):
        """ This removes what was produced by the supplied transformation,
            leaving only some info and log in the transformation DB.

        On success the transformation status is set to Cleaned.

        :param self: self reference
        :param int transID: transformation ID
        """
        self.log.info("Cleaning transformation %s" % transID)
        res = self.getTransformationDirectories(transID)
        if not res['OK']:
            self.log.error('Problem obtaining directories for transformation %s with result "%s"' % (transID, res))
            # the status is not changed, so the transformation is seen again on the next cycle
            return S_OK()
        directories = res['Value']
        # Clean the jobs in the WMS and any failover requests found
        res = self.cleanTransformationTasks(transID)
        if not res['OK']:
            return res
        # Clean the log files for the jobs, then the catalog and storage content of every directory
        for directory in directories:
            if re.search('/LOG/', directory):
                res = self.cleanTransformationLogFiles(directory)
                if not res['OK']:
                    return res
            res = self.cleanCatalogContents(directory)
            if not res['OK']:
                return res
            res = self.cleanStorageContents(directory)
            if not res['OK']:
                return res
        # Clean ALL the possible remnants found in the BK
        res = self.cleanMetadataCatalogFiles(transID)
        if not res['OK']:
            return res
        # Clean the transformation DB of the files and job information
        res = self.transClient.cleanTransformation(transID)
        if not res['OK']:
            return res
        self.log.info("Successfully cleaned transformation %d" % transID)
        res = self.transClient.setTransformationParameter(transID, 'Status', 'Cleaned')
        if not res['OK']:
            self.log.error("Failed to update status of transformation %s to Cleaned" % (transID), res['Message'])
            return res
        self.log.info("Updated status of transformation %s to Cleaned" % (transID))
        return S_OK()

    def cleanMetadataCatalogFiles(self, transID):
        """ wipe out the files found in the metadata catalog for :transID:

        :param self: self reference
        :param int transID: transformation ID
        """
        res = self.metadataClient.findFilesByMetadata({self.transfidmeta: transID})
        if not res['OK']:
            return res
        fileToRemove = res['Value']
        if not fileToRemove:
            self.log.info('No files found for transID %s' % transID)
            return S_OK()

        res = self._removeFilesWithShifterProxy(fileToRemove)
        if not res['OK']:
            return res
        for lfn, reason in res['Value']['Failed'].items():
            self.log.error("Failed to remove file found in metadata catalog", "%s %s" % (lfn, reason))
        if res['Value']['Failed']:
            return S_ERROR("Failed to remove all files found in the metadata catalog")
        self.log.info("Successfully removed all files found in the BK")
        return S_OK()

    #############################################################################
    #
    # These are the methods for removing the jobs from the WMS and transformation DB
    #

    def cleanTransformationTasks(self, transID):
        """ clean tasks from WMS, or from the RMS if it is a DataManipulation transformation

        :param self: self reference
        :param int transID: transformation ID
        """
        res = self.__getTransformationExternalIDs(transID)
        if not res['OK']:
            return res
        externalIDs = res['Value']
        if externalIDs:
            res = self.transClient.getTransformationParameters(transID, ['Type'])
            if not res['OK']:
                self.log.error("Failed to determine transformation type")
                return res
            transType = res['Value']
            if transType in self.dataProcTTypes:
                res = self.__removeWMSTasks(externalIDs)
            else:
                res = self.__removeRequests(externalIDs)
            if not res['OK']:
                return res
        return S_OK()

    def __getTransformationExternalIDs(self, transID):
        """ collect all ExternalIDs for transformation :transID:

        :param self: self reference
        :param int transID: transformation ID
        """
        res = self.transClient.getTransformationTasks(condDict={'TransformationID': transID})
        if not res['OK']:
            self.log.error("Failed to get externalIDs for transformation %d" % transID, res['Message'])
            return res
        externalIDs = [taskDict['ExternalID'] for taskDict in res["Value"]]
        self.log.info("Found %d tasks for transformation" % len(externalIDs))
        return S_OK(externalIDs)

    def __removeRequests(self, requestIDs):
        """ This will remove requests from the (new) RMS system -

        #FIXME: if the old system is still installed, it won't remove anything!!!
        (we don't want to risk removing from the new RMS what is instead in the old)

        Removal is best effort: a failed deletion is logged and S_OK() is returned anyway.

        :param self: self reference
        :param list requestIDs: request IDs, zero values are skipped
        """
        # FIXME: checking if the old system is still installed!
        from DIRAC.ConfigurationSystem.Client import PathFinder
        if PathFinder.getServiceURL("RequestManagement/RequestManager"):
            self.log.warn("NOT removing requests!!")
            return S_OK()

        # `long` is the Python 2 builtin, kept as in the rest of this module
        rIDs = [int(long(j)) for j in requestIDs if long(j)]
        for requestName in rIDs:
            res = self.reqClient.deleteRequest(requestName)
            if not res['OK']:
                self.log.error("Failed to remove request from RequestDB", res['Message'])

        return S_OK()

    def __checkWMSResult(self, res, jobList, verb, done):
        """ log the outcome of a bulk WMS kill/delete call

        :param self: self reference
        :param dict res: result of wmsClient.killJob or wmsClient.deleteJob
        :param list jobList: job IDs the call was issued for
        :param str verb: action name used in error messages ('kill' or 'remove')
        :param str done: past form used in the success message ('killed' or 'removed')
        :return: False if the result has to be counted as a failure, True otherwise
        """
        if res['OK']:
            self.log.info("Successfully %s %d jobs from WMS" % (done, len(jobList)))
            return True
        if ("InvalidJobIDs" in res) and ("NonauthorizedJobIDs" not in res) and ("FailedJobIDs" not in res):
            # jobs unknown to the WMS are as good as removed
            self.log.info("Found %s jobs which did not exist in the WMS" % len(res['InvalidJobIDs']))
            return True
        if "NonauthorizedJobIDs" in res:
            self.log.error("Failed to %s %s jobs because not authorized" % (verb, len(res['NonauthorizedJobIDs'])))
            return False
        if "FailedJobIDs" in res:
            self.log.error("Failed to %s %s jobs" % (verb, len(res['FailedJobIDs'])))
            return False
        # any other error used to be silently counted as a success
        self.log.error("Failed to %s jobs" % verb, res.get('Message', 'Unknown error'))
        return False

    def __removeWMSTasks(self, transJobIDs):
        """ wipe out jobs and their requests from the system

        TODO: should check request status, maybe FTS files as well ???

        Jobs are killed and deleted in chunks of 500; then the failover requests
        associated to the jobs are removed from the old RMS (when its client is
        usable) and from the new one.

        :param self: self reference
        :param list transJobIDs: job IDs
        """
        # Prevent 0 job IDs
        jobIDs = [int(j) for j in transJobIDs if int(j)]
        allRemove = True
        for jobList in breakListIntoChunks(jobIDs, 500):
            if not self.__checkWMSResult(self.wmsClient.killJob(jobList), jobList, 'kill', 'killed'):
                allRemove = False
            if not self.__checkWMSResult(self.wmsClient.deleteJob(jobList), jobList, 'remove', 'removed'):
                allRemove = False

        if not allRemove:
            return S_ERROR("Failed to remove all remnants from WMS")
        self.log.info("Successfully removed all tasks from the WMS")

        if not jobIDs:
            self.log.info("JobIDs not present, unable to remove asociated requests.")
            return S_OK()

        failed = 0
        failoverRequests = {}
        # FIXME: double request client: old/new -> only the new will survive sooner or later
        # this is the old
        try:
            res = RequestClient().getRequestForJobs(jobIDs)
            if not res['OK']:
                self.log.error("Failed to get requestID for jobs.", res['Message'])
                return res
            failoverRequests = dict(res['Value'])
            self.log.info("Found %d jobs with associated failover requests (in the old RMS)" % len(failoverRequests))
            # No early return when nothing is found here: the new RMS below still has to be checked
            for jobID, requestName in failoverRequests.items():
                # Put this check just in case, tasks must have associated jobs
                if jobID == 0 or jobID == '0':
                    continue
                res = RequestClient().deleteRequest(requestName)
                if not res['OK']:
                    self.log.error("Failed to remove request from RequestDB", res['Message'])
                    failed += 1
                else:
                    # %s and not %d: the '0' check above shows job IDs may be strings
                    self.log.verbose("Removed request %s associated to job %s." % (requestName, jobID))
        except RuntimeError:
            # old RMS client not usable: only the new RMS is considered
            failoverRequests = {}

        # FIXME: and this is the new
        res = self.reqClient.getRequestNamesForJobs(jobIDs)
        if not res['OK']:
            self.log.error("Failed to get requestID for jobs.", res['Message'])
            return res
        newRequests = res['Value']['Successful']
        failoverRequests.update(newRequests)
        if not failoverRequests:
            return S_OK()
        for jobID, requestName in newRequests.items():
            # Put this check just in case, tasks must have associated jobs
            if jobID == 0 or jobID == '0':
                continue
            res = self.reqClient.deleteRequest(requestName)
            if not res['OK']:
                self.log.error("Failed to remove request from RequestDB", res['Message'])
                failed += 1
            else:
                self.log.verbose("Removed request %s associated to job %s." % (requestName, jobID))

        if failed:
            self.log.info("Successfully removed %s requests" % (len(failoverRequests) - failed))
            self.log.info("Failed to remove %s requests" % failed)
            return S_ERROR("Failed to remove all the request from RequestDB")
        self.log.info("Successfully removed all the associated failover requests")
        return S_OK()
def test_FullChain(self):
    """Drive one job through its whole life cycle via the WMS clients.

    The test
      - calls the WMSClient methods (submit, reset, reschedule, kill, delete)
      - pushes status changes through the JobStateUpdate client
      - uses the JobMonitoring client to verify the resulting job states

    Deleting the job at the end only sets its status to "deleted".
    """
    wmsClient = WMSClient()
    jobMonitor = JobMonitoringClient()
    jobStateUpdate = JobStateUpdateClient()

    def expectOK(result):
        """Assert that a DIRAC result structure is OK and hand it back."""
        self.assertTrue(result['OK'], result.get('Message'))
        return result

    def expectJobStatus(expected):
        """Assert the major status reported by the monitoring client."""
        result = expectOK(jobMonitor.getJobStatus(jobID))
        self.assertEqual(result['Value'], expected, msg="Got %s" % str(result['Value']))

    # create the job
    job = helloWorldJob()
    description = createFile(job)

    # submit the job
    submission = expectOK(wmsClient.submitJob(job._toJDL(xmlFile=description)))
    self.assertTrue(isinstance(submission['Value'], int), msg="Got %s" % type(submission['Value']))
    self.assertEqual(
        submission['Value'],
        submission['JobID'],
        msg="Got %s, expected %s" % (str(submission['Value']), submission['JobID']))
    jobID = submission['Value']

    # updating the status
    expectOK(jobStateUpdate.setJobStatus(jobID, 'Running', 'Executing Minchiapp', 'source'))

    # reset the job
    expectOK(wmsClient.resetJob(jobID))

    # reschedule the job
    expectOK(wmsClient.rescheduleJob(jobID))
    expectJobStatus('Received')

    minor = expectOK(jobMonitor.getJobsMinorStatus([jobID]))
    expectedMinor = {jobID: {'MinorStatus': 'Job Rescheduled', 'JobID': jobID}}
    self.assertEqual(minor['Value'], expectedMinor, msg="Got %s" % str(minor['Value']))

    application = expectOK(jobMonitor.getJobsApplicationStatus([jobID]))
    expectedApplication = {jobID: {'ApplicationStatus': 'Unknown', 'JobID': jobID}}
    self.assertEqual(application['Value'], expectedApplication, msg="Got %s" % str(application['Value']))

    # updating the status again
    expectOK(jobStateUpdate.setJobStatus(jobID, 'Matched', 'matching', 'source'))

    # kill the job
    expectOK(wmsClient.killJob(jobID))
    expectJobStatus('Killed')

    # updating the status aaaagain
    expectOK(jobStateUpdate.setJobStatus(jobID, 'Done', 'matching', 'source'))

    # kill the job
    expectOK(wmsClient.killJob(jobID))
    # this time it won't kill... it's done!
    expectJobStatus('Done')

    # delete the job - this will just set its status to "deleted"
    expectOK(wmsClient.deleteJob(jobID))
    expectJobStatus('Deleted')
def initialize(self):
  """ Set up the agent from its configuration options and create its clients.

  Every option falls back to the value currently stored on the instance,
  except the shifter proxy (default None) and the explicit list of
  transformation types (default empty list).

  :param self: self reference
  :return: S_OK()
  """
  # Shifter proxy.
  # See cleanCatalogContents method: this proxy will be used ALSO when the file catalog used
  # is the DIRAC File Catalog (DFC).
  # This is possible because of unset of the "UseServerCertificate" option
  self.shifterProxy = self.am_getOption('shifterProxy', None)

  # Transformation types: an explicit agent option wins, otherwise the union of the
  # data processing and data manipulation types from the Operations section is used.
  self.dataProcTTypes = Operations().getValue('Transformations/DataProcessing', self.dataProcTTypes)
  self.dataManipTTypes = Operations().getValue('Transformations/DataManipulation', self.dataManipTTypes)
  configuredTypes = self.am_getOption('TransformationTypes', [])
  self.transformationTypes = sorted(configuredTypes or (self.dataProcTTypes + self.dataManipTTypes))
  typesMessage = "Will consider the following transformation types: %s" % str(self.transformationTypes)
  self.log.info(typesMessage)

  # Where to look for the transformation directories
  configuredLocations = self.am_getOption('DirectoryLocations', self.directoryLocations)
  self.directoryLocations = sorted(configuredLocations)
  locationsMessage = "Will search for directories in the following locations: %s" % str(self.directoryLocations)
  self.log.info(locationsMessage)

  # Name of the metadata tag holding the transformation ID
  self.transfidmeta = self.am_getOption('TransfIDMeta', self.transfidmeta)
  self.log.info("Will use %s as metadata tag name for TransformationID" % self.transfidmeta)

  # Archive period, in days
  self.archiveAfter = self.am_getOption('ArchiveAfter', self.archiveAfter)
  self.log.info("Will archive Completed transformations after %d days" % self.archiveAfter)

  # Active storage elements (only reported when at least one is configured)
  configuredStorages = self.am_getOption('ActiveSEs', self.activeStorages)
  self.activeStorages = sorted(configuredStorages)
  if self.activeStorages:
    self.log.info("Will check the following storage elements: %s" % str(self.activeStorages))

  # Storage element holding the transformation logs
  self.logSE = Operations().getValue('/LogStorage/LogSE', self.logSE)
  self.log.info("Will remove logs found on storage element: %s" % self.logSE)

  # Service clients: transformation, WMS, request and file catalog
  self.transClient = TransformationClient()
  self.wmsClient = WMSClient()
  self.reqClient = ReqClient()
  self.metadataClient = FileCatalogClient()

  return S_OK()
class TransformationCleaningAgent(AgentModule):
  """
  .. class:: TransformationCleaningAgent

  Agent acting on transformations found in 'Cleaning', 'RemovingFiles' and
  (old enough) 'Completed' status: it cleans them, removes their output or
  archives them respectively.

  :param ~DIRAC.DataManagementSystem.Client.DataManager.DataManager dm: DataManager instance
  :param ~TransformationClient.TransformationClient transClient: TransformationClient instance
  :param ~FileCatalogClient.FileCatalogClient metadataClient: FileCatalogClient instance
  """

  def __init__(self, *args, **kwargs):
    """ c'tor: only sets defaults, the clients are created in initialize()
    """
    AgentModule.__init__(self, *args, **kwargs)

    self.shifterProxy = None

    # # transformation client
    self.transClient = None
    # # wms client
    self.wmsClient = None
    # # request client
    self.reqClient = None
    # # file catalog client
    self.metadataClient = None

    # # transformations types
    self.transformationTypes = None
    # # directory locations
    self.directoryLocations = ['TransformationDB', 'MetadataCatalog']
    # # transformation metadata
    self.transfidmeta = 'TransformationID'
    # # archive period in days
    self.archiveAfter = 7
    # # transformation log SEs
    self.logSE = 'LogSE'
    # # enable/disable execution
    self.enableFlag = 'True'

    self.dataProcTTypes = ['MCSimulation', 'Merge']
    self.dataManipTTypes = ['Replication', 'Removal']

  def initialize(self):
    """ agent initialisation

    reading and setting config opts, then creating the service clients

    :param self: self reference
    :return: S_OK()
    """
    # # shifter proxy
    # See cleanContent method: this proxy will be used ALSO when the file catalog used
    # is the DIRAC File Catalog (DFC).
    # This is possible because of unset of the "UseServerCertificate" option
    self.shifterProxy = self.am_getOption('shifterProxy', self.shifterProxy)

    # # transformations types
    self.dataProcTTypes = Operations().getValue('Transformations/DataProcessing', self.dataProcTTypes)
    self.dataManipTTypes = Operations().getValue('Transformations/DataManipulation', self.dataManipTTypes)
    agentTSTypes = self.am_getOption('TransformationTypes', [])
    if agentTSTypes:
      self.transformationTypes = sorted(agentTSTypes)
    else:
      self.transformationTypes = sorted(self.dataProcTTypes + self.dataManipTTypes)
    self.log.info("Will consider the following transformation types: %s" % str(self.transformationTypes))
    # # directory locations
    self.directoryLocations = sorted(self.am_getOption('DirectoryLocations', self.directoryLocations))
    self.log.info("Will search for directories in the following locations: %s" % str(self.directoryLocations))
    # # transformation metadata
    self.transfidmeta = self.am_getOption('TransfIDMeta', self.transfidmeta)
    self.log.info("Will use %s as metadata tag name for TransformationID" % self.transfidmeta)
    # # archive period in days
    self.archiveAfter = self.am_getOption('ArchiveAfter', self.archiveAfter)  # days
    self.log.info("Will archive Completed transformations after %d days" % self.archiveAfter)
    # # transformation log SEs
    self.logSE = Operations().getValue('/LogStorage/LogSE', self.logSE)
    self.log.info("Will remove logs found on storage element: %s" % self.logSE)

    # # transformation client
    self.transClient = TransformationClient()
    # # wms client
    self.wmsClient = WMSClient()
    # # request client
    self.reqClient = ReqClient()
    # # file catalog client
    self.metadataClient = FileCatalogClient()

    return S_OK()

  #############################################################################
  def execute(self):
    """ execution in one agent's cycle

    Handles, in this order, the transformations in 'Cleaning', 'RemovingFiles'
    and 'Completed' (not updated for `archiveAfter` days) status. When no shifter
    proxy is configured each action is executed with the proxy of the
    transformation author.

    :param self: self reference
    :return: S_OK(), also when a query or a single transformation fails
    """

    self.enableFlag = self.am_getOption('EnableFlag', self.enableFlag)
    if self.enableFlag != 'True':
      self.log.info('TransformationCleaningAgent is disabled by configuration option EnableFlag')
      return S_OK('Disabled via CS flag')

    # Obtain the transformations in Cleaning status and remove any mention of the jobs/files
    res = self.transClient.getTransformations({'Status': 'Cleaning',
                                               'Type': self.transformationTypes})
    if res['OK']:
      for transDict in res['Value']:
        if self.shifterProxy:
          self._executeClean(transDict)
        else:
          self.log.info("Cleaning transformation %(TransformationID)s with %(AuthorDN)s, %(AuthorGroup)s" %
                        transDict)
          executeWithUserProxy(self._executeClean)(transDict,
                                                   proxyUserDN=transDict['AuthorDN'],
                                                   proxyUserGroup=transDict['AuthorGroup'])
    else:
      self.log.error("Failed to get transformations", res['Message'])

    # Obtain the transformations in RemovingFiles status and removes the output files
    res = self.transClient.getTransformations({'Status': 'RemovingFiles',
                                               'Type': self.transformationTypes})
    if res['OK']:
      for transDict in res['Value']:
        if self.shifterProxy:
          self._executeRemoval(transDict)
        else:
          self.log.info("Removing files for transformation %(TransformationID)s with %(AuthorDN)s, %(AuthorGroup)s" %
                        transDict)
          executeWithUserProxy(self._executeRemoval)(transDict,
                                                     proxyUserDN=transDict['AuthorDN'],
                                                     proxyUserGroup=transDict['AuthorGroup'])
    else:
      self.log.error("Could not get the transformations", res['Message'])

    # Obtain the transformations in Completed status and archive if inactive for X days
    olderThanTime = datetime.utcnow() - timedelta(days=self.archiveAfter)
    res = self.transClient.getTransformations({'Status': 'Completed',
                                               'Type': self.transformationTypes},
                                              older=olderThanTime,
                                              timeStamp='LastUpdate')
    if res['OK']:
      for transDict in res['Value']:
        if self.shifterProxy:
          self._executeArchive(transDict)
        else:
          self.log.info("Archiving files for transformation %(TransformationID)s with %(AuthorDN)s, %(AuthorGroup)s" %
                        transDict)
          executeWithUserProxy(self._executeArchive)(transDict,
                                                     proxyUserDN=transDict['AuthorDN'],
                                                     proxyUserGroup=transDict['AuthorGroup'])
    else:
      self.log.error("Could not get the transformations", res['Message'])
    return S_OK()

  def _executeClean(self, transDict):
    """Clean transformation.

    :param dict transDict: transformation description, 'Type' and 'TransformationID' are used
    """
    # if transformation is of type `Replication` or `Removal`, there is nothing to clean.
    # We just archive
    if transDict['Type'] in self.dataManipTTypes:
      res = self.archiveTransformation(transDict['TransformationID'])
      if not res['OK']:
        self.log.error("Problems archiving transformation %s: %s" % (transDict['TransformationID'],
                                                                     res['Message']))
    else:
      res = self.cleanTransformation(transDict['TransformationID'])
      if not res['OK']:
        self.log.error("Problems cleaning transformation %s: %s" % (transDict['TransformationID'],
                                                                    res['Message']))

  def _executeRemoval(self, transDict):
    """Remove files from given transformation.

    :param dict transDict: transformation description, 'TransformationID' is used
    """
    res = self.removeTransformationOutput(transDict['TransformationID'])
    if not res['OK']:
      self.log.error("Problems removing transformation %s: %s" % (transDict['TransformationID'],
                                                                  res['Message']))

  def _executeArchive(self, transDict):
    """Archive the given transformation.

    :param dict transDict: transformation description, 'TransformationID' is used
    :return: S_OK(), failures are only logged
    """
    res = self.archiveTransformation(transDict['TransformationID'])
    if not res['OK']:
      self.log.error("Problems archiving transformation %s: %s" % (transDict['TransformationID'],
                                                                   res['Message']))

    return S_OK()

  #############################################################################
  #
  # Get the transformation directories for checking
  #

  def getTransformationDirectories(self, transID):
    """ get the directories for the supplied transformation from the transformation system.
        These directories are used by removeTransformationOutput and cleanTransformation for removing output.

    :param self: self reference
    :param int transID: transformation ID
    :return: S_OK(sorted list of directories) or the failing result of the queried service
    """
    self.log.verbose("Cleaning Transformation directories of transformation %d" % transID)
    directories = []
    if 'TransformationDB' in self.directoryLocations:
      res = self.transClient.getTransformationParameters(transID, ['OutputDirectories'])
      if not res['OK']:
        self.log.error("Failed to obtain transformation directories", res['Message'])
        return res
      transDirectories = []
      if res['Value']:
        if not isinstance(res['Value'], list):
          try:
            transDirectories = ast.literal_eval(res['Value'])
          except Exception:
            # It can happen if the res['Value'] is '/a/b/c' instead of '["/a/b/c"]'.
            # Exception (not BaseException) so that KeyboardInterrupt/SystemExit still propagate.
            transDirectories.append(res['Value'])
        else:
          transDirectories = res['Value']
      directories = self._addDirs(transID, transDirectories, directories)

    if 'MetadataCatalog' in self.directoryLocations:
      res = self.metadataClient.findDirectoriesByMetadata({self.transfidmeta: transID})
      if not res['OK']:
        self.log.error("Failed to obtain metadata catalog directories", res['Message'])
        return res
      transDirectories = res['Value']
      directories = self._addDirs(transID, transDirectories, directories)

    if not directories:
      self.log.info("No output directories found")
    directories = sorted(directories)
    return S_OK(directories)

  @classmethod
  def _addDirs(cls, transID, newDirs, existingDirs):
    """ append unique :newDirs: list to :existingDirs: list

    Only the paths containing the transformation ID, zero-padded to 8 digits, are
    considered. Paths are normalised BEFORE the uniqueness check, so that e.g.
    '/a/00000001' and '/a/00000001/' are stored only once.

    :param cls: class reference
    :param int transID: transformationID
    :param list newDirs: src list of paths
    :param list existingDirs: dest list of paths, modified in place
    :return: existingDirs
    """
    transStr = str(transID).zfill(8)
    for folder in newDirs:
      if re.search(transStr, str(folder)):
        normFolder = os.path.normpath(folder)
        if normFolder not in existingDirs:
          existingDirs.append(normFolder)
    return existingDirs

  #############################################################################
  #
  # These are the methods for performing the cleaning of catalogs and storage
  #

  def cleanContent(self, directory):
    """ wipe out everything from catalog under folder :directory:

    :param self: self reference
    :params str directory: folder name
    :return: S_OK() or S_ERROR when a file could not be removed for a reason
             other than "File does not exist"
    """
    self.log.verbose("Cleaning Catalog contents")
    res = self.__getCatalogDirectoryContents([directory])
    if not res['OK']:
      return res
    filesFound = res['Value']
    if not filesFound:
      self.log.info("No files are registered in the catalog directory %s" % directory)
      return S_OK()
    self.log.info("Attempting to remove %d possible remnants from the catalog and storage" % len(filesFound))

    # Executing with shifter proxy
    gConfigurationData.setOptionInCFG('/DIRAC/Security/UseServerCertificate', 'false')
    try:
      res = DataManager().removeFile(filesFound, force=True)
    finally:
      # always restore the option, also when removeFile raises
      gConfigurationData.setOptionInCFG('/DIRAC/Security/UseServerCertificate', 'true')

    if not res['OK']:
      return res
    realFailure = False
    for lfn, reason in res['Value']['Failed'].items():
      if "File does not exist" in str(reason):
        self.log.warn("File %s not found in some catalog: " % (lfn))
      else:
        self.log.error("Failed to remove file found in the catalog", "%s %s" % (lfn, reason))
        realFailure = True
    if realFailure:
      return S_ERROR("Failed to remove all files found in the catalog")
    return S_OK()

  def __getCatalogDirectoryContents(self, directories):
    """ get catalog contents under paths :directories:

    The sub-directories reported by the catalog are visited too. Directories that
    cannot be listed are logged and skipped.

    :param self: self reference
    :param list directories: list of paths in catalog, left untouched
    :return: S_OK(list of the files found)
    """
    self.log.info('Obtaining the catalog contents for %d directories:' % len(directories))
    for directory in directories:
      self.log.info(directory)
    # work on a copy: the queue is consumed and extended while walking the tree
    activeDirs = list(directories)
    allFiles = {}
    fc = FileCatalog()
    while activeDirs:
      currentDir = activeDirs.pop(0)
      res = returnSingleResult(fc.listDirectory(currentDir))
      if not res['OK'] and 'Directory does not exist' in res['Message']:  # FIXME: DFC should return errno
        self.log.info("The supplied directory %s does not exist" % currentDir)
      elif not res['OK']:
        if "No such file or directory" in res['Message']:
          self.log.info("%s: %s" % (currentDir, res['Message']))
        else:
          self.log.error("Failed to get directory %s content: %s" % (currentDir, res['Message']))
      else:
        dirContents = res['Value']
        activeDirs.extend(dirContents['SubDirs'])
        allFiles.update(dirContents['Files'])
    self.log.info("Found %d files" % len(allFiles))
    return S_OK(list(allFiles))

  def cleanTransformationLogFiles(self, directory):
    """ clean up transformation logs from directory :directory:

    :param self: self reference
    :param str directory: folder name
    :return: S_OK(), also when the directory does not exist, or the failing result
    """
    self.log.verbose("Removing log files found in the directory %s" % directory)
    res = returnSingleResult(StorageElement(self.logSE).removeDirectory(directory, recursive=True))
    if not res['OK']:
      if cmpError(res, errno.ENOENT):  # No such file or directory
        self.log.warn("Transformation log directory does not exist", directory)
        return S_OK()
      self.log.error("Failed to remove log files", res['Message'])
      return res
    self.log.info("Successfully removed transformation log directory")
    return S_OK()

  #############################################################################
  #
  # These are the functional methods for archiving and cleaning transformations
  #

  def removeTransformationOutput(self, transID):
    """ This just removes any mention of the output data from the catalog and storage

    :param self: self reference
    :param int transID: transformation ID
    :return: S_OK() (also when the directories cannot be obtained) or the first failing result
    """
    self.log.info("Removing output data for transformation %s" % transID)
    res = self.getTransformationDirectories(transID)
    if not res['OK']:
      self.log.error('Problem obtaining directories for transformation %s with result "%s"' % (transID, res))
      return S_OK()
    directories = res['Value']
    for directory in directories:
      # the log directories are not touched here
      if not re.search('/LOG/', directory):
        res = self.cleanContent(directory)
        if not res['OK']:
          return res

    self.log.info("Removed %d directories from the catalog "
                  "and its files from the storage for transformation %s" % (len(directories), transID))
    # Clean ALL the possible remnants found in the metadata catalog
    res = self.cleanMetadataCatalogFiles(transID)
    if not res['OK']:
      return res
    self.log.info("Successfully removed output of transformation %d" % transID)
    # Change the status of the transformation to RemovedFiles
    res = self.transClient.setTransformationParameter(transID, 'Status', 'RemovedFiles')
    if not res['OK']:
      self.log.error("Failed to update status of transformation %s to RemovedFiles" % (transID), res['Message'])
      return res
    self.log.info("Updated status of transformation %s to RemovedFiles" % (transID))
    return S_OK()

  def archiveTransformation(self, transID):
    """ This just removes job from the jobDB and the transformation DB

    :param self: self reference
    :param int transID: transformation ID
    :return: S_OK() or the first failing result
    """
    self.log.info("Archiving transformation %s" % transID)
    # Clean the jobs in the WMS and any failover requests found
    res = self.cleanTransformationTasks(transID)
    if not res['OK']:
      return res
    # Clean the transformation DB of the files and job information
    res = self.transClient.cleanTransformation(transID)
    if not res['OK']:
      return res
    self.log.info("Successfully archived transformation %d" % transID)
    # Change the status of the transformation to archived
    res = self.transClient.setTransformationParameter(transID, 'Status', 'Archived')
    if not res['OK']:
      self.log.error("Failed to update status of transformation %s to Archived" % (transID), res['Message'])
      return res
    self.log.info("Updated status of transformation %s to Archived" % (transID))
    return S_OK()

  def cleanTransformation(self, transID):
    """ This removes what was produced by the supplied transformation,
        leaving only some info and log in the transformation DB.

    :param self: self reference
    :param int transID: transformation ID
    :return: S_OK() (also when the directories cannot be obtained) or the first failing result
    """
    self.log.info("Cleaning transformation %s" % transID)
    res = self.getTransformationDirectories(transID)
    if not res['OK']:
      self.log.error('Problem obtaining directories for transformation %s with result "%s"' % (transID, res))
      return S_OK()
    directories = res['Value']
    # Clean the jobs in the WMS and any failover requests found
    res = self.cleanTransformationTasks(transID)
    if not res['OK']:
      return res
    # Clean the log files for the jobs
    for directory in directories:
      if re.search('/LOG/', directory):
        res = self.cleanTransformationLogFiles(directory)
        if not res['OK']:
          return res
      res = self.cleanContent(directory)
      if not res['OK']:
        return res

    # Clean ALL the possible remnants found
    res = self.cleanMetadataCatalogFiles(transID)
    if not res['OK']:
      return res
    # Clean the transformation DB of the files and job information
    res = self.transClient.cleanTransformation(transID)
    if not res['OK']:
      return res
    self.log.info("Successfully cleaned transformation %d" % transID)
    res = self.transClient.setTransformationParameter(transID, 'Status', 'Cleaned')
    if not res['OK']:
      self.log.error("Failed to update status of transformation %s to Cleaned" % (transID), res['Message'])
      return res
    self.log.info("Updated status of transformation %s to Cleaned" % (transID))
    return S_OK()

  def cleanMetadataCatalogFiles(self, transID):
    """ wipe out files from catalog

    :param self: self reference
    :param int transID: transformation ID, searched in the :transfidmeta: metadata tag
    :return: S_OK() or S_ERROR when at least one file could not be removed
    """
    res = self.metadataClient.findFilesByMetadata({self.transfidmeta: transID})
    if not res['OK']:
      return res
    fileToRemove = res['Value']
    if not fileToRemove:
      self.log.info('No files found for transID %s' % transID)
      return S_OK()

    # Executing with shifter proxy
    gConfigurationData.setOptionInCFG('/DIRAC/Security/UseServerCertificate', 'false')
    try:
      res = DataManager().removeFile(fileToRemove, force=True)
    finally:
      # always restore the option, also when removeFile raises
      gConfigurationData.setOptionInCFG('/DIRAC/Security/UseServerCertificate', 'true')

    if not res['OK']:
      return res
    for lfn, reason in res['Value']['Failed'].items():
      self.log.error("Failed to remove file found in metadata catalog", "%s %s" % (lfn, reason))
    if res['Value']['Failed']:
      return S_ERROR("Failed to remove all files found in the metadata catalog")
    self.log.info("Successfully removed all files found in the BK")
    return S_OK()

  #############################################################################
  #
  # These are the methods for removing the jobs from the WMS and transformation DB
  #

  def cleanTransformationTasks(self, transID):
    """ clean tasks from WMS, or from the RMS if it is a DataManipulation transformation

    :param self: self reference
    :param int transID: transformation ID
    :return: S_OK() or the first failing result
    """
    self.log.verbose("Cleaning Transformation tasks of transformation %d" % transID)
    res = self.__getTransformationExternalIDs(transID)
    if not res['OK']:
      return res
    externalIDs = res['Value']
    if externalIDs:
      res = self.transClient.getTransformationParameters(transID, ['Type'])
      if not res['OK']:
        self.log.error("Failed to determine transformation type")
        return res
      transType = res['Value']
      if transType in self.dataProcTTypes:
        res = self.__removeWMSTasks(externalIDs)
      else:
        res = self.__removeRequests(externalIDs)
      if not res['OK']:
        return res
    return S_OK()

  def __getTransformationExternalIDs(self, transID):
    """ collect all ExternalIDs for transformation :transID:

    :param self: self reference
    :param int transID: transformation ID
    :return: S_OK(list of ExternalIDs) or the failing result
    """
    res = self.transClient.getTransformationTasks(condDict={'TransformationID': transID})
    if not res['OK']:
      self.log.error("Failed to get externalIDs for transformation %d" % transID, res['Message'])
      return res
    externalIDs = [taskDict['ExternalID'] for taskDict in res["Value"]]
    self.log.info("Found %d tasks for transformation" % len(externalIDs))
    return S_OK(externalIDs)

  def __removeRequests(self, requestIDs):
    """ This will remove requests from the RMS system

    Best effort: a request that cannot be cancelled is logged, the others are
    still processed.

    :param self: self reference
    :param list requestIDs: request IDs, the ones evaluating to 0 are skipped
    :return: S_OK()
    """
    # Prevent 0 request IDs
    rIDs = [int(j) for j in requestIDs if int(j)]
    for reqID in rIDs:
      res = self.reqClient.cancelRequest(reqID)
      if not res['OK']:
        self.log.error("Failed to cancel request", "%s %s" % (reqID, res['Message']))

    return S_OK()

  def __removeWMSTasks(self, transJobIDs):
    """ wipe out jobs and their requests from the system

    The jobs are killed and deleted in chunks of 500, then the requests
    associated to them are cancelled.

    :param self: self reference
    :param list transJobIDs: job IDs, the ones evaluating to 0 are skipped
    :return: S_OK() or S_ERROR when some job or request could not be removed
    """
    # Prevent 0 job IDs
    jobIDs = [int(j) for j in transJobIDs if int(j)]
    allRemove = True
    for jobList in breakListIntoChunks(jobIDs, 500):

      res = self.wmsClient.killJob(jobList)
      if res['OK']:
        self.log.info("Successfully killed %d jobs from WMS" % len(jobList))
      elif ("InvalidJobIDs" in res) and ("NonauthorizedJobIDs" not in res) and ("FailedJobIDs" not in res):
        # only unknown jobs: nothing left to kill, not a failure
        self.log.info("Found %s jobs which did not exist in the WMS" % len(res['InvalidJobIDs']))
      elif "NonauthorizedJobIDs" in res:
        self.log.error("Failed to kill %s jobs because not authorized" % len(res['NonauthorizedJobIDs']))
        allRemove = False
      elif "FailedJobIDs" in res:
        self.log.error("Failed to kill %s jobs" % len(res['FailedJobIDs']))
        allRemove = False

      res = self.wmsClient.deleteJob(jobList)
      if res['OK']:
        self.log.info("Successfully removed %d jobs from WMS" % len(jobList))
      elif ("InvalidJobIDs" in res) and ("NonauthorizedJobIDs" not in res) and ("FailedJobIDs" not in res):
        # only unknown jobs: nothing left to delete, not a failure
        self.log.info("Found %s jobs which did not exist in the WMS" % len(res['InvalidJobIDs']))
      elif "NonauthorizedJobIDs" in res:
        self.log.error("Failed to remove %s jobs because not authorized" % len(res['NonauthorizedJobIDs']))
        allRemove = False
      elif "FailedJobIDs" in res:
        self.log.error("Failed to remove %s jobs" % len(res['FailedJobIDs']))
        allRemove = False

    if not allRemove:
      return S_ERROR("Failed to remove all remnants from WMS")
    self.log.info("Successfully removed all tasks from the WMS")

    if not jobIDs:
      self.log.info("JobIDs not present, unable to remove asociated requests.")
      return S_OK()

    failed = 0
    failoverRequests = {}
    res = self.reqClient.getRequestIDsForJobs(jobIDs)
    if not res['OK']:
      self.log.error("Failed to get requestID for jobs.", res['Message'])
      return res
    failoverRequests.update(res['Value']['Successful'])
    if not failoverRequests:
      return S_OK()
    # iterate over the copy: `res` is reassigned inside the loop
    for jobID, requestID in failoverRequests.items():
      # Put this check just in case, tasks must have associated jobs
      if jobID == 0 or jobID == '0':
        continue
      res = self.reqClient.cancelRequest(requestID)
      if not res['OK']:
        self.log.error("Failed to remove request from RequestDB", res['Message'])
        failed += 1
      else:
        self.log.verbose("Removed request %s associated to job %d." % (requestID, jobID))

    if failed:
      self.log.info("Successfully removed %s requests" % (len(failoverRequests) - failed))
      self.log.info("Failed to remove %s requests" % failed)
      return S_ERROR("Failed to remove all the request from RequestDB")
    self.log.info("Successfully removed all the associated failover requests")
    return S_OK()
shutil.copyfileobj(file.file, tFile) file.file.close() tFile.close() fileNameList.append(name) except Exception,x: exception_counter = 1 c.result = {"success":"false","error":"An EXCEPTION happens during saving your sandbox file(s): %s" % str(x)} if len(fileNameList) > 0 and exception_counter == 0: sndBox = "InputSandbox = {\"" + "\",\"".join(fileNameList) + "\"};" else: sndBox = "" if exception_counter == 0: jdl = jdl + sndBox from DIRAC.WorkloadManagementSystem.Client.WMSClient import WMSClient jobManager = WMSClient(getRPCClient("WorkloadManagement/JobManager"), getRPCClient("WorkloadManagement/SandboxStore"), getTransferClient("WorkloadManagement/SandboxStore")) jdl = str(jdl) gLogger.info("J D L : ",jdl) try: result = jobManager.submitJob(jdl) if result["OK"]: c.result = {"success":"true","result":result["Value"]} else: c.result = {"success":"false","error":result["Message"]} except Exception,x: c.result = {"success":"false","error":"An EXCEPTION happens during job submittion: %s" % str(x)} if clearFS: shutil.rmtree(storePath) return c.result ################################################################################
class TransformationCleaningAgent( AgentModule ): ''' .. class:: TransformationCleaningAgent :param ReplicaManger replicaManager: ReplicaManager instance :param TransfromationClient transClient: TransfromationClient instance :param RequestClient requestClient: RequestClient instance :param FileCatalogClient metadataClient: FileCatalogClient instance ''' def __init__( self, *args, **kwargs ): ''' c'tor ''' AgentModule.__init__( self, *args, **kwargs ) # # replica manager self.replicaManager = ReplicaManager() # # transformation client self.transClient = TransformationClient() # # wms client self.wmsClient = WMSClient() # # request client self.requestClient = RequestClient() # # file catalog clinet self.metadataClient = FileCatalogClient() # # placeholders for CS options # # transformations types self.transformationTypes = None # # directory locations self.directoryLocations = None # # transformation metadata self.transfidmeta = None # # archive periof in days self.archiveAfter = None # # active SEs self.activeStorages = None # # transformation log SEs self.logSE = None # # enable/disable execution self.enableFlag = None def initialize( self ): ''' agent initialisation reading and setting confing opts :param self: self reference ''' # # shifter proxy self.am_setOption( 'shifterProxy', 'DataManager' ) # # transformations types agentTSTypes = self.am_getOption( 'TransformationTypes', [] ) if agentTSTypes: self.transformationTypes = sortList( agentTSTypes ) else: dataProc = Operations().getValue( 'Transformations/DataProcessing', ['MCSimulation', 'Merge'] ) dataManip = Operations().getValue( 'Transformations/DataManipulation', ['Replication', 'Removal'] ) self.transformationTypes = sortList( dataProc + dataManip ) self.log.info( "Will consider the following transformation types: %s" % str( self.transformationTypes ) ) # # directory locations self.directoryLocations = sortList( self.am_getOption( 'DirectoryLocations', [ 'TransformationDB', 'MetadataCatalog' ] ) ) 
self.log.info( "Will search for directories in the following locations: %s" % str( self.directoryLocations ) ) # # transformation metadata self.transfidmeta = self.am_getOption( 'TransfIDMeta', "TransformationID" ) self.log.info( "Will use %s as metadata tag name for TransformationID" % self.transfidmeta ) # # archive periof in days self.archiveAfter = self.am_getOption( 'ArchiveAfter', 7 ) # days self.log.info( "Will archive Completed transformations after %d days" % self.archiveAfter ) # # active SEs self.activeStorages = sortList( self.am_getOption( 'ActiveSEs', [] ) ) self.log.info( "Will check the following storage elements: %s" % str( self.activeStorages ) ) # # transformation log SEs self.logSE = self.am_getOption( 'TransformationLogSE', 'LogSE' ) self.log.info( "Will remove logs found on storage element: %s" % self.logSE ) # # enable/disable execution, should be using CS option Status?? with default value as 'Active'?? self.enableFlag = self.am_getOption( 'EnableFlag', 'True' ) return S_OK() ############################################################################# def execute( self ): ''' execution in one agent's cycle :param self: self reference ''' self.enableFlag = self.am_getOption( 'EnableFlag', 'True' ) if not self.enableFlag == 'True': self.log.info( 'TransformationCleaningAgent is disabled by configuration option EnableFlag' ) return S_OK( 'Disabled via CS flag' ) # # Obtain the transformations in Cleaning status and remove any mention of the jobs/files res = self.transClient.getTransformations( { 'Status' : 'Cleaning', 'Type' : self.transformationTypes } ) if res['OK']: for transDict in res['Value']: # # if transformation is of type `Replication` or `Removal`, there is nothing to clean. 
# # We just archive if transDict[ 'Type' ] in [ 'Replication', 'Removal' ]: res = self.archiveTransformation( transDict['TransformationID'] ) if not res['OK']: self.log.error( "Problems archiving transformation %s: %s" % ( transDict['TransformationID'], res['Message'] ) ) else: res = self.cleanTransformation( transDict['TransformationID'] ) if not res['OK']: self.log.error( "Problems cleaning transformation %s: %s" % ( transDict['TransformationID'], res['Message'] ) ) # # Obtain the transformations in RemovingFiles status and (wait for it) removes the output files res = self.transClient.getTransformations( { 'Status' : 'RemovingFiles', 'Type' : self.transformationTypes} ) if res['OK']: for transDict in res['Value']: res = self.removeTransformationOutput( transDict['TransformationID'] ) if not res['OK']: self.log.error( "Problems removing transformation %s: %s" % ( transDict['TransformationID'], res['Message'] ) ) # # Obtain the transformations in Completed status and archive if inactive for X days olderThanTime = datetime.utcnow() - timedelta( days = self.archiveAfter ) res = self.transClient.getTransformations( { 'Status' : 'Completed', 'Type' : self.transformationTypes }, older = olderThanTime, timeStamp = 'LastUpdate' ) if res['OK']: for transDict in res['Value']: res = self.archiveTransformation( transDict['TransformationID'] ) if not res['OK']: self.log.error( "Problems archiving transformation %s: %s" % ( transDict['TransformationID'], res['Message'] ) ) else: self.log.error( "Could not get the transformations" ) return S_OK() ############################################################################# # # Get the transformation directories for checking # def getTransformationDirectories( self, transID ): ''' get the directories for the supplied transformation from the transformation system :param self: self reference :param int transID: transformation ID ''' directories = [] if 'TransformationDB' in self.directoryLocations: res = 
self.transClient.getTransformationParameters( transID, ['OutputDirectories'] ) if not res['OK']: self.log.error( "Failed to obtain transformation directories", res['Message'] ) return res transDirectories = res['Value'].splitlines() directories = self._addDirs( transID, transDirectories, directories ) if 'MetadataCatalog' in self.directoryLocations: res = self.metadataClient.findDirectoriesByMetadata( {self.transfidmeta:transID} ) if not res['OK']: self.log.error( "Failed to obtain metadata catalog directories", res['Message'] ) return res transDirectories = res['Value'] directories = self._addDirs( transID, transDirectories, directories ) if not directories: self.log.info( "No output directories found" ) directories = sortList( directories ) return S_OK( directories ) @classmethod def _addDirs( self, transID, newDirs, existingDirs ): ''' append uniqe :newDirs: list to :existingDirs: list :param self: self reference :param int transID: transformationID :param list newDirs: src list of paths :param list existingDirs: dest list of paths ''' for folder in newDirs: transStr = str( transID ).zfill( 8 ) if re.search( transStr, str( folder ) ): if not folder in existingDirs: existingDirs.append( folder ) return existingDirs ############################################################################# # # These are the methods for performing the cleaning of catalogs and storage # def cleanStorageContents( self, directory ): ''' delete lfn dir from all active SE :param self: self reference :param sre directory: folder name ''' for storageElement in self.activeStorages: res = self.__removeStorageDirectory( directory, storageElement ) if not res['OK']: return res return S_OK() def __removeStorageDirectory( self, directory, storageElement ): ''' wipe out all contents from :directory: at :storageElement: :param self: self reference :param str directory: path :param str storageElement: SE name ''' self.log.info( 'Removing the contents of %s at %s' % ( directory, storageElement ) 
) res = self.replicaManager.getPfnForLfn( [directory], storageElement ) if not res['OK']: self.log.error( "Failed to get PFN for directory", res['Message'] ) return res for directory, error in res['Value']['Failed'].items(): self.log.error( 'Failed to obtain directory PFN from LFN', '%s %s' % ( directory, error ) ) if res['Value']['Failed']: return S_ERROR( 'Failed to obtain directory PFN from LFNs' ) storageDirectory = res['Value']['Successful'].values()[0] res = self.replicaManager.getStorageFileExists( storageDirectory, storageElement, singleFile = True ) if not res['OK']: self.log.error( "Failed to obtain existance of directory", res['Message'] ) return res exists = res['Value'] if not exists: self.log.info( "The directory %s does not exist at %s " % ( directory, storageElement ) ) return S_OK() res = self.replicaManager.removeStorageDirectory( storageDirectory, storageElement, recursive = True, singleDirectory = True ) if not res['OK']: self.log.error( "Failed to remove storage directory", res['Message'] ) return res self.log.info( "Successfully removed %d files from %s at %s" % ( res['Value']['FilesRemoved'], directory, storageElement ) ) return S_OK() def cleanCatalogContents( self, directory ): ''' wipe out everything from catalog under folder :directory: :param self: self reference :params str directory: folder name ''' res = self.__getCatalogDirectoryContents( [directory] ) if not res['OK']: return res filesFound = res['Value'] if not filesFound: return S_OK() self.log.info( "Attempting to remove %d possible remnants from the catalog and storage" % len( filesFound ) ) res = self.replicaManager.removeFile( filesFound, force = True ) if not res['OK']: return res for lfn, reason in res['Value']['Failed'].items(): self.log.error( "Failed to remove file found in the catalog", "%s %s" % ( lfn, reason ) ) if res['Value']['Failed']: return S_ERROR( "Failed to remove all files found in the catalog" ) return S_OK() def __getCatalogDirectoryContents( self, 
directories ): ''' get catalog contents under paths :directories: :param self: self reference :param list directories: list of paths in catalog ''' self.log.info( 'Obtaining the catalog contents for %d directories:' % len( directories ) ) for directory in directories: self.log.info( directory ) activeDirs = directories allFiles = {} while len( activeDirs ) > 0: currentDir = activeDirs[0] res = self.replicaManager.getCatalogListDirectory( currentDir, singleFile = True ) activeDirs.remove( currentDir ) if not res['OK'] and res['Message'].endswith( 'The supplied path does not exist' ): self.log.info( "The supplied directory %s does not exist" % currentDir ) elif not res['OK']: self.log.error( 'Failed to get directory contents', '%s %s' % ( currentDir, res['Message'] ) ) else: dirContents = res['Value'] activeDirs.extend( dirContents['SubDirs'] ) allFiles.update( dirContents['Files'] ) self.log.info( "Found %d files" % len( allFiles ) ) return S_OK( allFiles.keys() ) def cleanTransformationLogFiles( self, directory ): ''' clean up transformation logs from directory :directory: :param self: self reference :param str directory: folder name ''' self.log.info( "Removing log files found in the directory %s" % directory ) res = self.replicaManager.removeStorageDirectory( directory, self.logSE, singleDirectory = True ) if not res['OK']: self.log.error( "Failed to remove log files", res['Message'] ) return res self.log.info( "Successfully removed transformation log directory" ) return S_OK() ############################################################################# # # These are the functional methods for archiving and cleaning transformations # def removeTransformationOutput( self, transID ): ''' This just removes any mention of the output data from the catalog and storage ''' self.log.info( "Removing output data for transformation %s" % transID ) res = self.getTransformationDirectories( transID ) if not res['OK']: self.log.error( 'Problem obtaining directories for 
transformation %s with result "%s"' % ( transID, res ) ) return S_OK() directories = res['Value'] for directory in directories: if not re.search( '/LOG/', directory ): res = self.cleanCatalogContents( directory ) if not res['OK']: return res res = self.cleanStorageContents( directory ) if not res['OK']: return res self.log.info( "Removed directories in the catalog and storage for transformation" ) # Clean ALL the possible remnants found in the metadata catalog res = self.cleanMetadataCatalogFiles( transID ) if not res['OK']: return res self.log.info( "Successfully removed output of transformation %d" % transID ) # Change the status of the transformation to RemovedFiles res = self.transClient.setTransformationParameter( transID, 'Status', 'RemovedFiles' ) if not res['OK']: self.log.error( "Failed to update status of transformation %s to RemovedFiles" % ( transID ), res['Message'] ) return res self.log.info( "Updated status of transformation %s to RemovedFiles" % ( transID ) ) return S_OK() def archiveTransformation( self, transID ): ''' This just removes job from the jobDB and the transformation DB :param self: self reference :param int transID: transformation ID ''' self.log.info( "Archiving transformation %s" % transID ) # Clean the jobs in the WMS and any failover requests found res = self.cleanTransformationTasks( transID ) if not res['OK']: return res # Clean the transformation DB of the files and job information res = self.transClient.cleanTransformation( transID ) if not res['OK']: return res self.log.info( "Successfully archived transformation %d" % transID ) # Change the status of the transformation to archived res = self.transClient.setTransformationParameter( transID, 'Status', 'Archived' ) if not res['OK']: self.log.error( "Failed to update status of transformation %s to Archived" % ( transID ), res['Message'] ) return res self.log.info( "Updated status of transformation %s to Archived" % ( transID ) ) return S_OK() def cleanTransformation( self, transID 
): ''' This removes any mention of the supplied transformation ''' self.log.info( "Cleaning transformation %s" % transID ) res = self.getTransformationDirectories( transID ) if not res['OK']: self.log.error( 'Problem obtaining directories for transformation %s with result "%s"' % ( transID, res ) ) return S_OK() directories = res['Value'] # Clean the jobs in the WMS and any failover requests found res = self.cleanTransformationTasks( transID ) if not res['OK']: return res # Clean the log files for the jobs for directory in directories: if re.search( '/LOG/', directory ): res = self.cleanTransformationLogFiles( directory ) if not res['OK']: return res res = self.cleanCatalogContents( directory ) if not res['OK']: return res res = self.cleanStorageContents( directory ) if not res['OK']: return res # Clean ALL the possible remnants found in the BK res = self.cleanMetadataCatalogFiles( transID ) if not res['OK']: return res # Clean the transformation DB of the files and job information res = self.transClient.cleanTransformation( transID ) if not res['OK']: return res self.log.info( "Successfully cleaned transformation %d" % transID ) # Change the status of the transformation to deleted res = self.transClient.setTransformationParameter( transID, 'Status', 'Deleted' ) if not res['OK']: self.log.error( "Failed to update status of transformation %s to Deleted" % ( transID ), res['Message'] ) return res self.log.info( "Updated status of transformation %s to Deleted" % ( transID ) ) return S_OK() def cleanMetadataCatalogFiles( self, transID ): ''' wipe out files from catalog ''' res = self.metadataClient.findFilesByMetadata( { self.transfidmeta : transID } ) if not res['OK']: return res fileToRemove = res['Value'] if not fileToRemove: self.log.info( 'No files found for transID %s' % transID ) return S_OK() res = self.replicaManager.removeFile( fileToRemove, force = True ) if not res['OK']: return res for lfn, reason in res['Value']['Failed'].items(): self.log.error( "Failed 
to remove file found in metadata catalog", "%s %s" % ( lfn, reason ) ) if res['Value']['Failed']: return S_ERROR( "Failed to remove all files found in the metadata catalog" ) self.log.info( "Successfully removed all files found in the BK" ) return S_OK() ############################################################################# # # These are the methods for removing the jobs from the WMS and transformation DB # def cleanTransformationTasks( self, transID ): ''' clean tasks from WMS ''' res = self.__getTransformationExternalIDs( transID ) if not res['OK']: return res externalIDs = res['Value'] if externalIDs: res = self.transClient.getTransformationParameters( transID, ['Type'] ) if not res['OK']: self.log.error( "Failed to determine transformation type" ) return res transType = res['Value'] if transType in [ 'Replication', 'Removal' ]: res = self.__removeRequests( externalIDs ) else: res = self.__removeWMSTasks( externalIDs ) if not res['OK']: return res return S_OK() def __getTransformationExternalIDs( self, transID ): ''' collect all ExternalIDs for transformation :transID: :param self: self reference :param int transID: transforamtion ID ''' res = self.transClient.getTransformationTasks( condDict = { 'TransformationID' : transID } ) if not res['OK']: self.log.error( "Failed to get externalIDs for transformation %d" % transID, res['Message'] ) return res externalIDs = [ taskDict['ExternalID'] for taskDict in res["Value"] ] self.log.info( "Found %d tasks for transformation" % len( externalIDs ) ) return S_OK( externalIDs ) def __removeRequests( self, requestIDs ): ''' dummy method ''' self.log.error( "Not removing requests but should do" ) return S_OK() def __removeWMSTasks( self, transJobIDs ): ''' wipe out jobs and their requests from the system TODO: should check request status, maybe FTS files as well ??? 
:param self: self reference :param list trasnJobIDs: job IDs ''' # Prevent 0 job IDs jobIDs = [ int( j ) for j in transJobIDs if int( j ) ] allRemove = True for jobList in breakListIntoChunks( jobIDs, 500 ): res = self.wmsClient.killJob( jobList ) if res['OK']: self.log.info( "Successfully killed %d jobs from WMS" % len( jobList ) ) elif ( "InvalidJobIDs" in res ) and ( "NonauthorizedJobIDs" not in res ) and ( "FailedJobIDs" not in res ): self.log.info( "Found %s jobs which did not exist in the WMS" % len( res['InvalidJobIDs'] ) ) elif "NonauthorizedJobIDs" in res: self.log.error( "Failed to kill %s jobs because not authorized" % len( res['NonauthorizedJobIDs'] ) ) allRemove = False elif "FailedJobIDs" in res: self.log.error( "Failed to kill %s jobs" % len( res['FailedJobIDs'] ) ) allRemove = False res = self.wmsClient.deleteJob( jobList ) if res['OK']: self.log.info( "Successfully removed %d jobs from WMS" % len( jobList ) ) elif ( "InvalidJobIDs" in res ) and ( "NonauthorizedJobIDs" not in res ) and ( "FailedJobIDs" not in res ): self.log.info( "Found %s jobs which did not exist in the WMS" % len( res['InvalidJobIDs'] ) ) elif "NonauthorizedJobIDs" in res: self.log.error( "Failed to remove %s jobs because not authorized" % len( res['NonauthorizedJobIDs'] ) ) allRemove = False elif "FailedJobIDs" in res: self.log.error( "Failed to remove %s jobs" % len( res['FailedJobIDs'] ) ) allRemove = False if not allRemove: return S_ERROR( "Failed to remove all remnants from WMS" ) self.log.info( "Successfully removed all tasks from the WMS" ) if not jobIDs: self.log.info( "JobIDs not present, unable to remove asociated requests." 
) return S_OK() res = self.requestClient.getRequestForJobs( jobIDs ) if not res['OK']: self.log.error( "Failed to get requestID for jobs.", res['Message'] ) return res failoverRequests = res['Value'] self.log.info( "Found %d jobs with associated failover requests" % len( failoverRequests ) ) if not failoverRequests: return S_OK() failed = 0 for jobID, requestName in failoverRequests.items(): # Put this check just in case, tasks must have associated jobs if jobID == 0 or jobID == '0': continue res = self.requestClient.deleteRequest( requestName ) if not res['OK']: self.log.error( "Failed to remove request from RequestDB", res['Message'] ) failed += 1 else: self.log.verbose( "Removed request %s associated to job %d." % ( requestName, jobID ) ) if failed: self.log.info( "Successfully removed %s requests" % ( len( failoverRequests ) - failed ) ) self.log.info( "Failed to remove %s requests" % failed ) return S_ERROR( "Failed to remove all the request from RequestDB" ) self.log.info( "Successfully removed all the associated failover requests" ) return S_OK()
class TransformationCleaningAgent( AgentModule ):
  '''
  .. class:: TransformationCleaningAgent

  Agent that cleans, archives and removes the output of transformations,
  depending on the status they are found in.

  :param ReplicaManager replicaManager: ReplicaManager instance
  :param TransformationClient transClient: TransformationClient instance
  :param WMSClient wmsClient: WMSClient instance
  :param RequestClient requestClient: RequestClient instance
  :param FileCatalogClient metadataClient: FileCatalogClient instance
  '''

  def __init__( self, *args, **kwargs ):
    ''' c'tor: create the clients and the placeholders for the CS options '''
    AgentModule.__init__( self, *args, **kwargs )
    # # replica manager
    self.replicaManager = ReplicaManager()
    # # transformation client
    self.transClient = TransformationClient()
    # # wms client
    self.wmsClient = WMSClient()
    # # request client
    self.requestClient = RequestClient()
    # # file catalog client
    self.metadataClient = FileCatalogClient()

    # # placeholders for CS options, all of them are filled in :initialize:
    # # transformations types
    self.transformationTypes = None
    # # directory locations
    self.directoryLocations = None
    # # transformation metadata
    self.transfidmeta = None
    # # archive period in days
    self.archiveAfter = None
    # # active SEs
    self.activeStorages = None
    # # transformation log SEs
    self.logSE = None
    # # enable/disable execution
    self.enableFlag = None

  def initialize( self ):
    ''' agent initialisation

    Reads the agent options and stores them on the instance.

    :param self: self reference
    :return: S_OK()
    '''
    # # shifter proxy
    self.am_setOption( 'shifterProxy', 'DataManager' )
    # # transformations types: agent option first, Operations defaults otherwise
    agentTSTypes = self.am_getOption( 'TransformationTypes', [] )
    if agentTSTypes:
      self.transformationTypes = sortList( agentTSTypes )
    else:
      dataProc = Operations().getValue( 'Transformations/DataProcessing', ['MCSimulation', 'Merge'] )
      dataManip = Operations().getValue( 'Transformations/DataManipulation', ['Replication', 'Removal'] )
      self.transformationTypes = sortList( dataProc + dataManip )
    self.log.info( "Will consider the following transformation types: %s" % str( self.transformationTypes ) )
    # # directory locations
    self.directoryLocations = sortList( self.am_getOption( 'DirectoryLocations', [ 'TransformationDB',
                                                                                   'MetadataCatalog' ] ) )
    self.log.info( "Will search for directories in the following locations: %s" % str( self.directoryLocations ) )
    # # transformation metadata
    self.transfidmeta = self.am_getOption( 'TransfIDMeta', "TransformationID" )
    self.log.info( "Will use %s as metadata tag name for TransformationID" % self.transfidmeta )
    # # archive period in days
    self.archiveAfter = self.am_getOption( 'ArchiveAfter', 7 )  # days
    self.log.info( "Will archive Completed transformations after %d days" % self.archiveAfter )
    # # active SEs
    self.activeStorages = sortList( self.am_getOption( 'ActiveSEs', [] ) )
    self.log.info( "Will check the following storage elements: %s" % str( self.activeStorages ) )
    # # transformation log SEs
    self.logSE = self.am_getOption( 'TransformationLogSE', 'LogSE' )
    self.log.info( "Will remove logs found on storage element: %s" % self.logSE )
    # # enable/disable execution, should be using CS option Status?? with default value as 'Active'??
    self.enableFlag = self.am_getOption( 'EnableFlag', 'True' )
    return S_OK()

  #############################################################################
  def execute( self ):
    ''' execution in one agent's cycle

    Processes, in this order, the transformations found in status
    'Cleaning', 'RemovingFiles' and 'Completed' (the latter only when not
    updated for :archiveAfter: days). Problems with a single transformation
    are logged and do not stop the cycle.

    :param self: self reference
    :return: S_OK() in all cases
    '''
    self.enableFlag = self.am_getOption( 'EnableFlag', 'True' )
    if self.enableFlag != 'True':
      self.log.info( 'TransformationCleaningAgent is disabled by configuration option EnableFlag' )
      return S_OK( 'Disabled via CS flag' )

    # # Obtain the transformations in Cleaning status and remove any mention of the jobs/files
    res = self.transClient.getTransformations( { 'Status' : 'Cleaning',
                                                 'Type' : self.transformationTypes } )
    if res['OK']:
      for transDict in res['Value']:
        transID = transDict['TransformationID']
        # # if transformation is of type `Replication` or `Removal`, there is nothing to clean.
        # # We just archive
        if transDict[ 'Type' ] in [ 'Replication', 'Removal' ]:
          result = self.archiveTransformation( transID )
          if not result['OK']:
            self.log.error( "Problems archiving transformation %s: %s" % ( transID, result['Message'] ) )
        else:
          result = self.cleanTransformation( transID )
          if not result['OK']:
            self.log.error( "Problems cleaning transformation %s: %s" % ( transID, result['Message'] ) )
    else:
      # a failed query used to be dropped silently
      self.log.error( "Could not get the transformations in Cleaning status", res['Message'] )

    # # Obtain the transformations in RemovingFiles status and (wait for it) removes the output files
    res = self.transClient.getTransformations( { 'Status' : 'RemovingFiles',
                                                 'Type' : self.transformationTypes } )
    if res['OK']:
      for transDict in res['Value']:
        transID = transDict['TransformationID']
        result = self.removeTransformationOutput( transID )
        if not result['OK']:
          self.log.error( "Problems removing transformation %s: %s" % ( transID, result['Message'] ) )
    else:
      # a failed query used to be dropped silently
      self.log.error( "Could not get the transformations in RemovingFiles status", res['Message'] )

    # # Obtain the transformations in Completed status and archive if inactive for X days
    olderThanTime = datetime.utcnow() - timedelta( days = self.archiveAfter )
    res = self.transClient.getTransformations( { 'Status' : 'Completed',
                                                 'Type' : self.transformationTypes },
                                               older = olderThanTime,
                                               timeStamp = 'LastUpdate' )
    if res['OK']:
      for transDict in res['Value']:
        transID = transDict['TransformationID']
        result = self.archiveTransformation( transID )
        if not result['OK']:
          self.log.error( "Problems archiving transformation %s: %s" % ( transID, result['Message'] ) )
    else:
      self.log.error( "Could not get the transformations" )

    return S_OK()

  #############################################################################
  #
  # Get the transformation directories for checking
  #

  def getTransformationDirectories( self, transID ):
    ''' get the directories for the supplied transformation from the
    locations listed in :directoryLocations:

    :param self: self reference
    :param int transID: transformation ID
    :return: S_OK( sorted list of directories ) or the failed client result
    '''
    directories = []
    if 'TransformationDB' in self.directoryLocations:
      res = self.transClient.getTransformationParameters( transID, ['OutputDirectories'] )
      if not res['OK']:
        self.log.error( "Failed to obtain transformation directories", res['Message'] )
        return res
      # the parameter value holds one directory per line
      transDirectories = res['Value'].splitlines()
      directories = self._addDirs( transID, transDirectories, directories )

    if 'MetadataCatalog' in self.directoryLocations:
      res = self.metadataClient.findDirectoriesByMetadata( { self.transfidmeta : transID } )
      if not res['OK']:
        self.log.error( "Failed to obtain metadata catalog directories", res['Message'] )
        return res
      transDirectories = res['Value']
      directories = self._addDirs( transID, transDirectories, directories )

    if not directories:
      self.log.info( "No output directories found" )
    directories = sortList( directories )
    return S_OK( directories )

  @classmethod
  def _addDirs( cls, transID, newDirs, existingDirs ):
    ''' append unique :newDirs: entries to the :existingDirs: list

    Only paths containing the transformation ID, zero-padded to 8 characters,
    are considered. :existingDirs: is modified in place and returned.

    :param cls: class reference
    :param int transID: transformationID
    :param list newDirs: src list of paths
    :param list existingDirs: dest list of paths
    :return: :existingDirs:
    '''
    # loop invariant, computed once
    transStr = str( transID ).zfill( 8 )
    for folder in newDirs:
      if re.search( transStr, str( folder ) ) and folder not in existingDirs:
        existingDirs.append( folder )
    return existingDirs

  #############################################################################
  #
  # These are the methods for performing the cleaning of catalogs and storage
  #

  def cleanStorageContents( self, directory ):
    ''' delete lfn dir from all active SE

    Stops at the first storage element for which the removal fails.

    :param self: self reference
    :param str directory: folder name
    :return: S_OK() or the first failed result
    '''
    for storageElement in self.activeStorages:
      res = self.__removeStorageDirectory( directory, storageElement )
      if not res['OK']:
        return res
    return S_OK()

  def __removeStorageDirectory( self, directory, storageElement ):
    ''' wipe out all contents from :directory: at :storageElement:

    :param self: self reference
    :param str directory: path
    :param str storageElement: SE name
    :return: S_OK() when removed or not existing, S_ERROR / failed result otherwise
    '''
    self.log.info( 'Removing the contents of %s at %s' % ( directory, storageElement ) )
    res = self.replicaManager.getPfnForLfn( [directory], storageElement )
    if not res['OK']:
      self.log.error( "Failed to get PFN for directory", res['Message'] )
      return res
    # separate loop names, so the :directory: argument is not shadowed
    for failedDir, error in res['Value']['Failed'].items():
      self.log.error( 'Failed to obtain directory PFN from LFN', '%s %s' % ( failedDir, error ) )
    if res['Value']['Failed']:
      return S_ERROR( 'Failed to obtain directory PFN from LFNs' )
    # list() keeps the indexing valid where dict.values() is a view
    storagePaths = list( res['Value']['Successful'].values() )
    if not storagePaths:
      # neither failed nor successful: report it instead of raising IndexError
      self.log.error( 'Failed to obtain directory PFN from LFN', directory )
      return S_ERROR( 'Failed to obtain directory PFN from LFNs' )
    storageDirectory = storagePaths[0]

    res = self.replicaManager.getStorageFileExists( storageDirectory, storageElement, singleFile = True )
    if not res['OK']:
      self.log.error( "Failed to obtain existance of directory", res['Message'] )
      return res
    exists = res['Value']
    if not exists:
      self.log.info( "The directory %s does not exist at %s " % ( directory, storageElement ) )
      return S_OK()

    res = self.replicaManager.removeStorageDirectory( storageDirectory,
                                                      storageElement,
                                                      recursive = True,
                                                      singleDirectory = True )
    if not res['OK']:
      self.log.error( "Failed to remove storage directory", res['Message'] )
      return res
    self.log.info( "Successfully removed %d files from %s at %s" % ( res['Value']['FilesRemoved'],
                                                                     directory,
                                                                     storageElement ) )
    return S_OK()

  def cleanCatalogContents( self, directory ):
    ''' wipe out everything from catalog under folder :directory:

    :param self: self reference
    :param str directory: folder name
    :return: S_OK() when nothing is left to remove, S_ERROR / failed result otherwise
    '''
    res = self.__getCatalogDirectoryContents( [directory] )
    if not res['OK']:
      return res
    filesFound = res['Value']
    if not filesFound:
      return S_OK()
    self.log.info( "Attempting to remove %d possible remnants from the catalog and storage" % len( filesFound ) )
    res = self.replicaManager.removeFile( filesFound )
    if not res['OK']:
      return res
    for lfn, reason in res['Value']['Failed'].items():
      self.log.error( "Failed to remove file found in the catalog", "%s %s" % ( lfn, reason ) )
    if res['Value']['Failed']:
      return S_ERROR( "Failed to remove all files found in the catalog" )
    return S_OK()

  def __getCatalogDirectoryContents( self, directories ):
    ''' get catalog contents under paths :directories:

    Sub-directories reported by the catalog are visited as well. A directory
    which cannot be listed is logged and skipped.

    :param self: self reference
    :param list directories: list of paths in catalog
    :return: S_OK( list of files found )
    '''
    self.log.info( 'Obtaining the catalog contents for %d directories:' % len( directories ) )
    for directory in directories:
      self.log.info( directory )
    # work on a copy, the caller's list must not be emptied by the traversal
    activeDirs = list( directories )
    allFiles = {}
    while activeDirs:
      currentDir = activeDirs.pop( 0 )
      res = self.replicaManager.getCatalogListDirectory( currentDir, singleFile = True )
      if not res['OK'] and res['Message'].endswith( 'The supplied path does not exist' ):
        self.log.info( "The supplied directory %s does not exist" % currentDir )
      elif not res['OK']:
        self.log.error( 'Failed to get directory contents', '%s %s' % ( currentDir, res['Message'] ) )
      else:
        dirContents = res['Value']
        activeDirs.extend( dirContents['SubDirs'] )
        allFiles.update( dirContents['Files'] )
    self.log.info( "Found %d files" % len( allFiles ) )
    return S_OK( list( allFiles ) )

  def cleanTransformationLogFiles( self, directory ):
    ''' clean up transformation logs from directory :directory: at :logSE:

    :param self: self reference
    :param str directory: folder name
    :return: S_OK() or the failed removal result
    '''
    self.log.info( "Removing log files found in the directory %s" % directory )
    res = self.replicaManager.removeStorageDirectory( directory, self.logSE, singleDirectory = True )
    if not res['OK']:
      self.log.error( "Failed to remove log files", res['Message'] )
      return res
    self.log.info( "Successfully removed transformation log directory" )
    return S_OK()

  #############################################################################
  #
  # These are the functional methods for archiving and cleaning transformations
  #

  def removeTransformationOutput( self, transID ):
    ''' This just removes any mention of the output data from the catalog and storage

    Directories matching '/LOG/' are left untouched. On success the
    transformation status is set to 'RemovedFiles'.

    :param self: self reference
    :param int transID: transformation ID
    :return: S_OK() or the first failed result
    '''
    self.log.info( "Removing output data for transformation %s" % transID )
    res = self.getTransformationDirectories( transID )
    if not res['OK']:
      self.log.error( 'Problem obtaining directories for transformation %s with result "%s"' % ( transID, res ) )
      # NOTE(review): the failure is only logged and S_OK is returned - confirm this is intended
      return S_OK()
    directories = res['Value']
    for directory in directories:
      if not re.search( '/LOG/', directory ):
        res = self.cleanCatalogContents( directory )
        if not res['OK']:
          return res
        res = self.cleanStorageContents( directory )
        if not res['OK']:
          return res
    self.log.info( "Removed directories in the catalog and storage for transformation" )
    # Clean ALL the possible remnants found in the metadata catalog
    res = self.cleanMetadataCatalogFiles( transID )
    if not res['OK']:
      return res
    self.log.info( "Successfully removed output of transformation %s" % transID )
    # Change the status of the transformation to RemovedFiles
    res = self.transClient.setTransformationParameter( transID, 'Status', 'RemovedFiles' )
    if not res['OK']:
      self.log.error( "Failed to update status of transformation %s to RemovedFiles" % ( transID ), res['Message'] )
      return res
    self.log.info( "Updated status of transformation %s to RemovedFiles" % ( transID ) )
    return S_OK()

  def archiveTransformation( self, transID ):
    ''' This just removes job from the jobDB and the transformation DB

    On success the transformation status is set to 'Archived'.

    :param self: self reference
    :param int transID: transformation ID
    :return: S_OK() or the first failed result
    '''
    self.log.info( "Archiving transformation %s" % transID )
    # Clean the jobs in the WMS and any failover requests found
    res = self.cleanTransformationTasks( transID )
    if not res['OK']:
      return res
    # Clean the transformation DB of the files and job information
    res = self.transClient.cleanTransformation( transID )
    if not res['OK']:
      return res
    self.log.info( "Successfully archived transformation %s" % transID )
    # Change the status of the transformation to archived
    res = self.transClient.setTransformationParameter( transID, 'Status', 'Archived' )
    if not res['OK']:
      self.log.error( "Failed to update status of transformation %s to Archived" % ( transID ), res['Message'] )
      return res
    self.log.info( "Updated status of transformation %s to Archived" % ( transID ) )
    return S_OK()

  def cleanTransformation( self, transID ):
    ''' This removes any mention of the supplied transformation

    Tasks, log files, catalog and storage contents, metadata catalog files
    and the transformation DB entries are cleaned. On success the
    transformation status is set to 'Deleted'.

    :param self: self reference
    :param int transID: transformation ID
    :return: S_OK() or the first failed result
    '''
    self.log.info( "Cleaning transformation %s" % transID )
    res = self.getTransformationDirectories( transID )
    if not res['OK']:
      self.log.error( 'Problem obtaining directories for transformation %s with result "%s"' % ( transID, res ) )
      # NOTE(review): the failure is only logged and S_OK is returned - confirm this is intended
      return S_OK()
    directories = res['Value']
    # Clean the jobs in the WMS and any failover requests found
    res = self.cleanTransformationTasks( transID )
    if not res['OK']:
      return res
    # Clean the log files for the jobs, then the catalog and storage contents
    for directory in directories:
      if re.search( '/LOG/', directory ):
        res = self.cleanTransformationLogFiles( directory )
        if not res['OK']:
          return res
      res = self.cleanCatalogContents( directory )
      if not res['OK']:
        return res
      res = self.cleanStorageContents( directory )
      if not res['OK']:
        return res
    # Clean ALL the possible remnants found in the metadata catalog
    res = self.cleanMetadataCatalogFiles( transID )
    if not res['OK']:
      return res
    # Clean the transformation DB of the files and job information
    res = self.transClient.cleanTransformation( transID )
    if not res['OK']:
      return res
    self.log.info( "Successfully cleaned transformation %s" % transID )
    # Change the status of the transformation to deleted
    res = self.transClient.setTransformationParameter( transID, 'Status', 'Deleted' )
    if not res['OK']:
      self.log.error( "Failed to update status of transformation %s to Deleted" % ( transID ), res['Message'] )
      return res
    self.log.info( "Updated status of transformation %s to Deleted" % ( transID ) )
    return S_OK()

  def cleanMetadataCatalogFiles( self, transID, directories = None ):
    ''' wipe out files from catalog

    The files are looked up by the :transfidmeta: metadata tag.

    :param self: self reference
    :param int transID: transformation ID
    :param list directories: not used; accepted so that callers passing the
                             directories list as second argument keep working
    :return: S_OK() or S_ERROR / failed result
    '''
    res = self.metadataClient.findFilesByMetadata( { self.transfidmeta : transID } )
    if not res['OK']:
      return res
    fileToRemove = res['Value']
    if not fileToRemove:
      self.log.info( 'No files found for transID %s' % transID )
      return S_OK()
    res = self.replicaManager.removeFile( fileToRemove )
    if not res['OK']:
      return res
    for lfn, reason in res['Value']['Failed'].items():
      self.log.error( "Failed to remove file found in metadata catalog", "%s %s" % ( lfn, reason ) )
    if res['Value']['Failed']:
      return S_ERROR( "Failed to remove all files found in the metadata catalog" )
    self.log.info( "Successfully removed all files found in the BK" )
    return S_OK()

  #############################################################################
  #
  # These are the methods for removing the jobs from the WMS and transformation DB
  #

  def cleanTransformationTasks( self, transID ):
    ''' clean tasks from WMS

    Nothing is done when the transformation has no tasks. Otherwise the
    external IDs are handed to the request or to the WMS removal, depending
    on the transformation type.

    :param self: self reference
    :param int transID: transformation ID
    :return: S_OK() or the first failed result
    '''
    res = self.__getTransformationExternalIDs( transID )
    if not res['OK']:
      return res
    externalIDs = res['Value']
    if externalIDs:
      res = self.transClient.getTransformationParameters( transID, ['Type'] )
      if not res['OK']:
        self.log.error( "Failed to determine transformation type" )
        return res
      transType = res['Value']
      if transType in [ 'Replication', 'Removal' ]:
        res = self.__removeRequests( externalIDs )
      else:
        res = self.__removeWMSTasks( externalIDs )
      if not res['OK']:
        return res
    return S_OK()

  def __getTransformationExternalIDs( self, transID ):
    ''' collect all ExternalIDs for transformation :transID:

    :param self: self reference
    :param int transID: transformation ID
    :return: S_OK( list of ExternalIDs ) or the failed client result
    '''
    res = self.transClient.getTransformationTasks( condDict = { 'TransformationID' : transID } )
    if not res['OK']:
      self.log.error( "Failed to get externalIDs for transformation %s" % transID, res['Message'] )
      return res
    externalIDs = [ taskDict['ExternalID'] for taskDict in res["Value"] ]
    self.log.info( "Found %d tasks for transformation" % len( externalIDs ) )
    return S_OK( externalIDs )

  def __removeRequests( self, requestIDs ):
    ''' dummy method: nothing is removed, an error is logged instead '''
    self.log.error( "Not removing requests but should do" )
    return S_OK()

  def __checkWMSResult( self, res, nJobs, verb, verbPast ):
    ''' log the outcome of one WMS bulk operation

    :param self: self reference
    :param dict res: result of the WMS call
    :param int nJobs: number of jobs the call was issued for
    :param str verb: operation name for the failure messages ('kill', 'remove')
    :param str verbPast: operation name for the success message ('killed', 'removed')
    :return: False when the result has to be counted as a failure, True otherwise
    '''
    if res['OK']:
      self.log.info( "Successfully %s %d jobs from WMS" % ( verbPast, nJobs ) )
      return True
    if ( "InvalidJobIDs" in res ) and ( "NonauthorizedJobIDs" not in res ) and ( "FailedJobIDs" not in res ):
      # only unknown jobs were reported: there is nothing left to act on
      self.log.info( "Found %s jobs which did not exist in the WMS" % len( res['InvalidJobIDs'] ) )
      return True
    if "NonauthorizedJobIDs" in res:
      self.log.error( "Failed to %s %s jobs because not authorized" % ( verb, len( res['NonauthorizedJobIDs'] ) ) )
      return False
    if "FailedJobIDs" in res:
      self.log.error( "Failed to %s %s jobs" % ( verb, len( res['FailedJobIDs'] ) ) )
      return False
    # failure without any of the job ID lists: must not be taken for a success
    self.log.error( "Failed to %s jobs" % verb, res.get( 'Message', '' ) )
    return False

  def __removeWMSTasks( self, transJobIDs ):
    ''' wipe out jobs and their requests from the system

    Jobs are killed and deleted in chunks of 500, then the requests
    associated to them are deleted.

    TODO: should check request status, maybe FTS files as well ???

    :param self: self reference
    :param list transJobIDs: job IDs
    :return: S_OK() or S_ERROR when some job or request could not be removed
    '''
    # Prevent 0 job IDs
    jobIDs = [ jobID for jobID in ( int( j ) for j in transJobIDs ) if jobID ]
    allRemove = True
    for jobList in breakListIntoChunks( jobIDs, 500 ):
      # both operations are always attempted for every chunk
      if not self.__checkWMSResult( self.wmsClient.killJob( jobList ), len( jobList ), 'kill', 'killed' ):
        allRemove = False
      if not self.__checkWMSResult( self.wmsClient.deleteJob( jobList ), len( jobList ), 'remove', 'removed' ):
        allRemove = False

    if not allRemove:
      return S_ERROR( "Failed to remove all remnants from WMS" )
    self.log.info( "Successfully removed all tasks from the WMS" )

    if not jobIDs:
      self.log.info( "JobIDs not present, unable to remove asociated requests." )
      return S_OK()

    res = self.requestClient.getRequestForJobs( jobIDs )
    if not res['OK']:
      self.log.error( "Failed to get requestID for jobs.", res['Message'] )
      return res
    failoverRequests = res['Value']
    self.log.info( "Found %d jobs with associated failover requests" % len( failoverRequests ) )
    if not failoverRequests:
      return S_OK()

    failed = 0
    for jobID, requestName in failoverRequests.items():
      # Put this check just in case, tasks must have associated jobs
      if jobID == 0 or jobID == '0':
        continue
      res = self.requestClient.deleteRequest( requestName )
      if not res['OK']:
        self.log.error( "Failed to remove request from RequestDB", res['Message'] )
        failed += 1
      else:
        # %s: the job ID may as well be a string, see the check above
        self.log.verbose( "Removed request %s associated to job %s." % ( requestName, jobID ) )

    if failed:
      self.log.info( "Successfully removed %s requests" % ( len( failoverRequests ) - failed ) )
      self.log.info( "Failed to remove %s requests" % failed )
      return S_ERROR( "Failed to remove all the request from RequestDB" )
    self.log.info( "Successfully removed all the associated failover requests" )
    return S_OK()
def __init__(self, transClient=None, logger=None, submissionClient=None, jobMonitoringClient=None,
             outputDataModule=None, jobClass=None, opsH=None, destinationPlugin=None,
             ownerDN=None, ownerGroup=None):
  """Store the collaborators, creating a default for each one not supplied.

  Defaults used for arguments that are not given (or are falsy):

  * logger: a sub-logger of gLogger named after the class
  * submissionClient: a WMSClient, using certificates and the delegated
    DN/group only when both ownerDN and ownerGroup are set
  * jobMonitoringClient: a JobMonitoringClient
  * jobClass: Job; VOs can pass in their own job class extension instead
  * opsH: an Operations helper
  * outputDataModule: value of "Transformations/OutputDataModule" ("" if unset)
  * destinationPlugin: value of "Transformations/DestinationPlugin" ("BySE" if unset)
  """
  effectiveLogger = logger or gLogger.getSubLogger(self.__class__.__name__)
  super(WorkflowTasks, self).__init__(transClient, effectiveLogger)

  # certificates are only used when a complete owner identity was given
  useCertificates = bool(ownerDN) and bool(ownerGroup)

  self.submissionClient = submissionClient or WMSClient(useCertificates=useCertificates,
                                                        delegatedDN=ownerDN,
                                                        delegatedGroup=ownerGroup)
  self.jobMonitoringClient = jobMonitoringClient or JobMonitoringClient()
  self.jobClass = jobClass or Job

  # opsH has to be in place first: the two lookups below read from it
  self.opsH = opsH or Operations()
  self.outputDataModule = outputDataModule or self.opsH.getValue("Transformations/OutputDataModule", "")
  self.destinationPlugin = destinationPlugin or self.opsH.getValue("Transformations/DestinationPlugin", "BySE")

  # initialised to None here; nothing in this method fills them in
  self.destinationPlugin_o = None
  self.outputDataModule_o = None
def test_JobStateUpdateAndJobMonitoring(self):
    """Verify the JobStateUpdate and JobMonitoring functions on one job.

    Submits a hello-world job, pushes status, parameters, application status
    and site through JobStateUpdate, reads each of them back through
    JobMonitoringClient, and finally deletes the job.
    """
    wmsClient = WMSClient()
    jobMonitor = JobMonitoringClient()
    jobStateUpdate = RPCClient('WorkloadManagement/JobStateUpdate')

    # create a job and check stuff
    job = helloWorldJob()
    jobDescription = createFile(job)

    # submitting the job. Checking few stuff
    res = wmsClient.submitJob(job._toJDL(xmlFile=jobDescription))
    self.assertTrue(res['OK'])
    jobID = int(res['Value'])
    # jobID = res['JobID']
    res = jobMonitor.getJobJDL(jobID, True)
    self.assertTrue(res['OK'])
    res = jobMonitor.getJobJDL(jobID, False)
    self.assertTrue(res['OK'])
    res = jobMonitor.getJobsParameters([jobID], [])
    self.assertTrue(res['OK'])
    self.assertEqual(res['Value'], {})
    res = jobMonitor.getJobsParameters([jobID], ['Owner'])
    self.assertTrue(res['OK'])

    # Adding stuff
    res = jobStateUpdate.setJobStatus(jobID, 'Matched', 'matching', 'source')
    self.assertTrue(res['OK'])
    res = jobStateUpdate.setJobParameters(jobID, [('par1', 'par1Value'), ('par2', 'par2Value')])
    self.assertTrue(res['OK'])
    res = jobStateUpdate.setJobApplicationStatus(jobID, 'app status', 'source')
    self.assertTrue(res['OK'])
    # Not exercised here: jobStateUpdate.setJobFlag() / unsetJobFlag()
    res = jobStateUpdate.setJobSite(jobID, 'Site')
    self.assertTrue(res['OK'])
    # Not exercised here: jobMonitor.traceJobParameter('Site', 1, 'Status')

    # now checking few things
    res = jobMonitor.getJobStatus(jobID)
    self.assertTrue(res['OK'])
    self.assertEqual(res['Value'], 'Running')
    res = jobMonitor.getJobParameter(jobID, 'par1')
    self.assertTrue(res['OK'])
    self.assertEqual(res['Value'], {'par1': 'par1Value'})
    res = jobMonitor.getJobParameters(jobID)
    self.assertTrue(res['OK'])
    self.assertEqual(res['Value'], {'par1': 'par1Value', 'par2': 'par2Value'})
    res = jobMonitor.getJobAttribute(jobID, 'Site')
    self.assertTrue(res['OK'])
    self.assertEqual(res['Value'], 'Site')
    res = jobMonitor.getJobAttributes(jobID)
    self.assertTrue(res['OK'])
    self.assertEqual(res['Value']['ApplicationStatus'], 'app status')
    self.assertEqual(res['Value']['JobName'], 'helloWorld')
    res = jobMonitor.getJobSummary(jobID)
    self.assertTrue(res['OK'])
    self.assertEqual(res['Value']['ApplicationStatus'], 'app status')
    self.assertEqual(res['Value']['Status'], 'Running')
    res = jobMonitor.getJobHeartBeatData(jobID)
    self.assertTrue(res['OK'])
    self.assertEqual(res['Value'], [])
    res = jobMonitor.getInputData(jobID)
    self.assertTrue(res['OK'])
    self.assertEqual(res['Value'], [])
    res = jobMonitor.getJobPrimarySummary(jobID)
    self.assertTrue(res['OK'])
    res = jobMonitor.getAtticJobParameters(jobID)
    self.assertTrue(res['OK'])

    # bulk status update, then check that only Status/MinorStatus changed
    res = jobStateUpdate.setJobsStatus([jobID], 'Done', 'MinorStatus', 'Unknown')
    self.assertTrue(res['OK'])
    res = jobMonitor.getJobSummary(jobID)
    self.assertTrue(res['OK'])
    self.assertEqual(res['Value']['Status'], 'Done')
    self.assertEqual(res['Value']['MinorStatus'], 'MinorStatus')
    self.assertEqual(res['Value']['ApplicationStatus'], 'app status')

    res = jobStateUpdate.sendHeartBeat(jobID, {'bih': 'bih'}, {'boh': 'boh'})
    self.assertTrue(res['OK'])

    # delete the job - this will just set its status to "deleted"
    wmsClient.deleteJob(jobID)
class TransformationCleaningAgent(AgentModule):
    """Agent that cleans, empties and archives transformations.

    On every cycle it looks at transformations in three states:

    * ``Cleaning``      -> :meth:`cleanTransformation`, final status ``Deleted``
    * ``RemovingFiles`` -> :meth:`removeTransformationOutput`, final status ``RemovedFiles``
    * ``Completed`` and older than ``ArchiveAfter`` days
                        -> :meth:`archiveTransformation`, final status ``Archived``
    """

    #############################################################################
    def initialize(self):
        """Create the service clients and read the agent configuration options.

        :return: S_OK()
        """
        self.replicaManager = ReplicaManager()
        self.transClient = TransformationClient()
        self.wmsClient = WMSClient()
        self.requestClient = RequestClient()
        self.metadataClient = FileCatalogClient()
        self.storageUsageClient = StorageUsageClient()

        # This sets the default proxy to be used to the one defined under
        # /Operations/Shifter/DataManager; the shifterProxy option in the
        # Configuration can be used to change this default.
        self.am_setOption('shifterProxy', 'DataManager')

        self.transformationTypes = sortList(
            self.am_getOption('TransformationTypes',
                              ['MCSimulation', 'DataReconstruction', 'DataStripping',
                               'MCStripping', 'Merge', 'Replication']))
        gLogger.info("Will consider the following transformation types: %s" % str(self.transformationTypes))

        self.directoryLocations = sortList(
            self.am_getOption('DirectoryLocations',
                              ['TransformationDB', 'StorageUsage', 'MetadataCatalog']))
        gLogger.info("Will search for directories in the following locations: %s" % str(self.directoryLocations))

        self.transfidmeta = self.am_getOption('TransfIDMeta', "TransformationID")
        gLogger.info("Will use %s as metadata tag name for TransformationID" % self.transfidmeta)

        self.archiveAfter = self.am_getOption('ArchiveAfter', 7)  # days
        gLogger.info("Will archive Completed transformations after %d days" % self.archiveAfter)

        self.activeStorages = sortList(self.am_getOption('ActiveSEs', []))
        gLogger.info("Will check the following storage elements: %s" % str(self.activeStorages))

        self.logSE = self.am_getOption('TransformationLogSE', 'LogSE')
        gLogger.info("Will remove logs found on storage element: %s" % self.logSE)
        return S_OK()

    #############################################################################
    def execute(self):
        """Run one agent cycle over the Cleaning, RemovingFiles and Completed transformations.

        A failure on one transformation is logged and does not stop the
        processing of the others.

        :return: S_OK() (with a message when disabled through the EnableFlag option)
        """
        self.enableFlag = self.am_getOption('EnableFlag', 'True')
        if self.enableFlag != 'True':
            self.log.info('TransformationCleaningAgent is disabled by configuration option %s/EnableFlag'
                          % (self.section))
            return S_OK('Disabled via CS flag')

        # Obtain the transformations in Cleaning status and remove any mention of the jobs/files
        res = self.transClient.getTransformations({'Status': 'Cleaning',
                                                   'Type': self.transformationTypes})
        if res['OK']:
            for transDict in res['Value']:
                transID = transDict['TransformationID']
                result = self.cleanTransformation(transID)
                if not result['OK']:
                    gLogger.error("Problems cleaning transformation %s: %s" % (transID, result['Message']))

        # Obtain the transformations in RemovingFiles status and remove the output files
        res = self.transClient.getTransformations({'Status': 'RemovingFiles',
                                                   'Type': self.transformationTypes})
        if res['OK']:
            for transDict in res['Value']:
                transID = transDict['TransformationID']
                result = self.removeTransformationOutput(transID)
                if not result['OK']:
                    gLogger.error("Problems removing transformation %s: %s" % (transID, result['Message']))

        # Obtain the transformations in Completed status and archive if inactive for X days
        olderThanTime = datetime.utcnow() - timedelta(days=self.archiveAfter)
        res = self.transClient.getTransformations({'Status': 'Completed',
                                                   'Type': self.transformationTypes},
                                                  older=olderThanTime)
        if res['OK']:
            for transDict in res['Value']:
                transID = transDict['TransformationID']
                result = self.archiveTransformation(transID)
                if not result['OK']:
                    gLogger.error("Problems archiving transformation %s: %s" % (transID, result['Message']))

        return S_OK()

    #############################################################################
    #
    # Get the transformation directories for checking
    #

    def getTransformationDirectories(self, transID):
        """Collect the output directories of a transformation.

        Each source named in the ``DirectoryLocations`` option is queried
        (TransformationDB, StorageUsage, MetadataCatalog) and the results are
        merged; only paths containing the zero-padded transformation ID are kept.

        :param transID: transformation ID
        :return: S_OK(sorted list of directories), or the failed client result
        """
        directories = []
        if 'TransformationDB' in self.directoryLocations:
            res = self.transClient.getTransformationParameters(transID, ['OutputDirectories'])
            if not res['OK']:
                gLogger.error("Failed to obtain transformation directories", res['Message'])
                return res
            # The value is split on newlines, one directory per line.
            transDirectories = res['Value'].splitlines()
            directories = self.__addDirs(transID, transDirectories, directories)

        if 'StorageUsage' in self.directoryLocations:
            res = self.storageUsageClient.getStorageDirectories('', '', transID, [])
            if not res['OK']:
                gLogger.error("Failed to obtain storage usage directories", res['Message'])
                return res
            transDirectories = res['Value']
            directories = self.__addDirs(transID, transDirectories, directories)

        if 'MetadataCatalog' in self.directoryLocations:
            res = self.metadataClient.findDirectoriesByMetadata({self.transfidmeta: transID})
            if not res['OK']:
                gLogger.error("Failed to obtain metadata catalog directories", res['Message'])
                return res
            transDirectories = res['Value']
            directories = self.__addDirs(transID, transDirectories, directories)

        if not directories:
            gLogger.info("No output directories found")
        directories = sortList(directories)
        return S_OK(directories)

    def __addDirs(self, transID, newDirs, existingDirs):
        """Append to existingDirs the new paths that belong to the transformation.

        A path is kept when it contains the transformation ID zero-padded to
        8 digits and is not already present. existingDirs is modified in place.

        :param transID: transformation ID
        :param newDirs: iterable of candidate paths
        :param list existingDirs: list to extend
        :return: existingDirs
        """
        transStr = str(transID).zfill(8)
        for folder in newDirs:
            if re.search(transStr, folder) and folder not in existingDirs:
                existingDirs.append(folder)
        return existingDirs

    #############################################################################
    #
    # These are the methods for performing the cleaning of catalogs and storage
    #

    def cleanStorageContents(self, directory):
        """Remove a directory from every storage element listed in ActiveSEs.

        Stops at the first storage element that fails.

        :param str directory: LFN of the directory
        :return: S_OK(), or the first failed result
        """
        for storageElement in self.activeStorages:
            res = self.__removeStorageDirectory(directory, storageElement)
            if not res['OK']:
                return res
        return S_OK()

    def __removeStorageDirectory(self, directory, storageElement):
        """Recursively remove one directory from one storage element.

        :param str directory: LFN of the directory
        :param str storageElement: storage element name
        :return: S_OK() when removed or not present, S_ERROR / failed result otherwise
        """
        gLogger.info('Removing the contents of %s at %s' % (directory, storageElement))
        res = self.replicaManager.getPfnForLfn([directory], storageElement)
        if not res['OK']:
            gLogger.error("Failed to get PFN for directory", res['Message'])
            return res
        failed = res['Value']['Failed']
        # A separate loop variable keeps the `directory` argument intact for the messages below.
        for failedDir, error in failed.items():
            gLogger.error('Failed to obtain directory PFN from LFN', '%s %s' % (failedDir, error))
        if failed:
            return S_ERROR('Failed to obtain directory PFN from LFNs')
        # Only one LFN was requested, so the first successful value is its PFN.
        storageDirectory = list(res['Value']['Successful'].values())[0]

        res = self.replicaManager.getStorageFileExists(storageDirectory, storageElement, singleFile=True)
        if not res['OK']:
            gLogger.error("Failed to obtain existance of directory", res['Message'])
            return res
        exists = res['Value']
        if not exists:
            gLogger.info("The directory %s does not exist at %s " % (directory, storageElement))
            return S_OK()

        res = self.replicaManager.removeStorageDirectory(storageDirectory, storageElement,
                                                         recursive=True, singleDirectory=True)
        if not res['OK']:
            gLogger.error("Failed to remove storage directory", res['Message'])
            return res
        gLogger.info("Successfully removed %d files from %s at %s"
                     % (res['Value']['FilesRemoved'], directory, storageElement))
        return S_OK()

    def cleanCatalogContents(self, directory):
        """Remove every file registered in the catalog under a directory.

        :param str directory: catalog directory
        :return: S_OK() when nothing is left, S_ERROR / failed result otherwise
        """
        res = self.__getCatalogDirectoryContents([directory])
        if not res['OK']:
            return res
        filesFound = res['Value']
        if not filesFound:
            return S_OK()
        gLogger.info("Attempting to remove %d possible remnants from the catalog and storage" % len(filesFound))
        res = self.replicaManager.removeFile(filesFound)
        if not res['OK']:
            return res
        failed = res['Value']['Failed']
        for lfn, reason in failed.items():
            gLogger.error("Failed to remove file found in the catalog", "%s %s" % (lfn, reason))
        if failed:
            return S_ERROR("Failed to remove all files found in the catalog")
        return S_OK()

    def __getCatalogDirectoryContents(self, directories):
        """List all files found in the catalog under the given directories.

        Sub-directories are walked breadth-first. Directories that cannot be
        listed are logged and skipped.

        :param list directories: catalog paths to start from (left untouched)
        :return: S_OK(list of LFNs)
        """
        gLogger.info('Obtaining the catalog contents for %d directories:' % len(directories))
        for directory in directories:
            gLogger.info(directory)

        # Work on a copy so that the caller's list is not emptied.
        activeDirs = list(directories)
        allFiles = {}
        while activeDirs:
            currentDir = activeDirs.pop(0)
            res = self.replicaManager.getCatalogListDirectory(currentDir, singleFile=True)
            if res['OK']:
                dirContents = res['Value']
                activeDirs.extend(dirContents['SubDirs'])
                allFiles.update(dirContents['Files'])
            elif res['Message'].endswith('The supplied path does not exist'):
                gLogger.info("The supplied directory %s does not exist" % currentDir)
            else:
                gLogger.error('Failed to get directory contents', '%s %s' % (currentDir, res['Message']))
        gLogger.info("Found %d files" % len(allFiles))
        return S_OK(list(allFiles.keys()))

    def cleanTransformationLogFiles(self, directory):
        """Remove a log directory from the storage element named by TransformationLogSE.

        :param str directory: log directory
        :return: S_OK(), or the failed result
        """
        gLogger.info("Removing log files found in the directory %s" % directory)
        res = self.replicaManager.removeStorageDirectory(directory, self.logSE, singleDirectory=True)
        if not res['OK']:
            gLogger.error("Failed to remove log files", res['Message'])
            return res
        gLogger.info("Successfully removed transformation log directory")
        return S_OK()

    #############################################################################
    #
    # These are the functional methods for archiving and cleaning transformations
    #

    def removeTransformationOutput(self, transID):
        """Remove the output data of a transformation from the catalog and storage.

        Directories containing ``/LOG/`` are left alone. On success the
        transformation status is set to ``RemovedFiles``.

        :param transID: transformation ID
        :return: S_OK(), or the first failed result
        """
        gLogger.info("Removing output data for transformation %s" % transID)
        res = self.getTransformationDirectories(transID)
        if not res['OK']:
            # Deliberately not an error for the caller: only logged.
            gLogger.error('Problem obtaining directories for transformation %s with result "%s"' % (transID, res))
            return S_OK()
        directories = res['Value']
        for directory in directories:
            if not re.search('/LOG/', directory):
                res = self.cleanCatalogContents(directory)
                if not res['OK']:
                    return res
                res = self.cleanStorageContents(directory)
                if not res['OK']:
                    return res
        gLogger.info("Removed directories in the catalog and storage for transformation")
        # Clean ALL the possible remnants found in the metadata catalog
        res = self.cleanMetadataCatalogFiles(transID, directories)
        if not res['OK']:
            return res
        gLogger.info("Successfully removed output of transformation %d" % transID)
        # Change the status of the transformation to RemovedFiles
        res = self.transClient.setTransformationParameter(transID, 'Status', 'RemovedFiles')
        if not res['OK']:
            gLogger.error("Failed to update status of transformation %s to RemovedFiles" % (transID), res['Message'])
            return res
        gLogger.info("Updated status of transformation %s to RemovedFiles" % (transID))
        return S_OK()

    def archiveTransformation(self, transID):
        """Remove the tasks of a transformation from the WMS and the transformation DB.

        On success the transformation status is set to ``Archived``.

        :param transID: transformation ID
        :return: S_OK(), or the first failed result
        """
        gLogger.info("Archiving transformation %s" % transID)
        # Clean the jobs in the WMS and any failover requests found
        res = self.cleanTransformationTasks(transID)
        if not res['OK']:
            return res
        # Clean the transformation DB of the files and job information
        res = self.transClient.cleanTransformation(transID)
        if not res['OK']:
            return res
        gLogger.info("Successfully archived transformation %d" % transID)
        # Change the status of the transformation to archived
        res = self.transClient.setTransformationParameter(transID, 'Status', 'Archived')
        if not res['OK']:
            gLogger.error("Failed to update status of transformation %s to Archived" % (transID), res['Message'])
            return res
        gLogger.info("Updated status of transformation %s to Archived" % (transID))
        return S_OK()

    def cleanTransformation(self, transID):
        """Remove any mention of a transformation: tasks, logs, files and DB entries.

        On success the transformation status is set to ``Deleted``.

        :param transID: transformation ID
        :return: S_OK(), or the first failed result
        """
        gLogger.info("Cleaning transformation %s" % transID)
        res = self.getTransformationDirectories(transID)
        if not res['OK']:
            # Deliberately not an error for the caller: only logged.
            gLogger.error('Problem obtaining directories for transformation %s with result "%s"' % (transID, res))
            return S_OK()
        directories = res['Value']
        # Clean the jobs in the WMS and any failover requests found
        res = self.cleanTransformationTasks(transID)
        if not res['OK']:
            return res
        # Clean the log files for the jobs, then the catalog and storage contents
        for directory in directories:
            if re.search('/LOG/', directory):
                res = self.cleanTransformationLogFiles(directory)
                if not res['OK']:
                    return res
            res = self.cleanCatalogContents(directory)
            if not res['OK']:
                return res
            res = self.cleanStorageContents(directory)
            if not res['OK']:
                return res
        # Clean ALL the possible remnants found in the BK
        res = self.cleanMetadataCatalogFiles(transID, directories)
        if not res['OK']:
            return res
        # Clean the transformation DB of the files and job information
        res = self.transClient.cleanTransformation(transID)
        if not res['OK']:
            return res
        gLogger.info("Successfully cleaned transformation %d" % transID)
        # Change the status of the transformation to deleted
        res = self.transClient.setTransformationParameter(transID, 'Status', 'Deleted')
        if not res['OK']:
            gLogger.error("Failed to update status of transformation %s to Deleted" % (transID), res['Message'])
            return res
        gLogger.info("Updated status of transformation %s to Deleted" % (transID))
        return S_OK()

    def cleanMetadataCatalogFiles(self, transID, directories):
        """Remove every file tagged with the transformation ID in the metadata catalog.

        :param transID: transformation ID
        :param directories: not used; kept for interface compatibility
        :return: S_OK(), S_ERROR when some files could not be removed, or the failed result
        """
        res = self.metadataClient.findFilesByMetadata({self.transfidmeta: transID})
        if not res['OK']:
            return res
        fileToRemove = res['Value']
        if not fileToRemove:
            gLogger.info('No files found for transID %s' % transID)
            return S_OK()
        res = self.replicaManager.removeFile(fileToRemove)
        if not res['OK']:
            return res
        failed = res['Value']['Failed']
        for lfn, reason in failed.items():
            gLogger.error("Failed to remove file found in metadata catalog", "%s %s" % (lfn, reason))
        if failed:
            return S_ERROR("Failed to remove all files found in the metadata catalog")
        gLogger.info("Successfully removed all files found in the BK")
        return S_OK()

    #############################################################################
    #
    # These are the methods for removing the jobs from the WMS and transformation DB
    #

    def cleanTransformationTasks(self, transID):
        """Remove the external tasks (requests or WMS jobs) of a transformation.

        ``Replication`` transformations go through the request removal path,
        everything else through the WMS.

        :param transID: transformation ID
        :return: S_OK(), or the first failed result
        """
        res = self.__getTransformationExternalIDs(transID)
        if not res['OK']:
            return res
        externalIDs = res['Value']
        if externalIDs:
            res = self.transClient.getTransformationParameters(transID, ['Type'])
            if not res['OK']:
                gLogger.error("Failed to determine transformation type")
                return res
            transType = res['Value']
            if transType == 'Replication':
                res = self.__removeRequests(externalIDs)
            else:
                res = self.__removeWMSTasks(externalIDs)
            if not res['OK']:
                return res
        return S_OK()

    def __getTransformationExternalIDs(self, transID):
        """Get the ExternalID of every task of a transformation.

        :param transID: transformation ID
        :return: S_OK(list of external IDs), or the failed result
        """
        res = self.transClient.getTransformationTasks(condDict={'TransformationID': transID})
        if not res['OK']:
            gLogger.error("Failed to get externalIDs for transformation %d" % transID, res['Message'])
            return res
        externalIDs = [taskDict['ExternalID'] for taskDict in res['Value']]
        gLogger.info("Found %d tasks for transformation" % len(externalIDs))
        return S_OK(externalIDs)

    def __removeRequests(self, requestIDs):
        """Placeholder: request removal is not implemented, only an error is logged.

        :param requestIDs: not used
        :return: S_OK()
        """
        gLogger.error("Not removing requests but should do")
        return S_OK()

    def __removeWMSTasks(self, jobIDs):
        """Delete jobs from the WMS, then their failover requests from the RequestDB.

        Jobs are deleted in chunks of 500. Job IDs reported as invalid are
        tolerated; any other failure makes the whole call fail before the
        failover requests are touched.

        :param list jobIDs: WMS job IDs
        :return: S_OK(), S_ERROR, or the failed request client result
        """
        allRemove = True
        for jobList in breakListIntoChunks(jobIDs, 500):
            res = self.wmsClient.deleteJob(jobList)
            if res['OK']:
                gLogger.info("Successfully removed %d jobs from WMS" % len(jobList))
            elif 'InvalidJobIDs' in res and 'NonauthorizedJobIDs' not in res and 'FailedJobIDs' not in res:
                gLogger.info("Found %s jobs which did not exist in the WMS" % len(res['InvalidJobIDs']))
            elif 'NonauthorizedJobIDs' in res:
                gLogger.error("Failed to remove %s jobs because not authorized" % len(res['NonauthorizedJobIDs']))
                allRemove = False
            elif 'FailedJobIDs' in res:
                gLogger.error("Failed to remove %s jobs" % len(res['FailedJobIDs']))
                allRemove = False
            else:
                # Failure without any job ID detail (e.g. the call itself failed):
                # it must not be mistaken for a successful removal.
                gLogger.error("Failed to remove jobs from WMS", res.get('Message', 'Unknown error'))
                allRemove = False
        if not allRemove:
            return S_ERROR("Failed to remove all remnants from WMS")
        gLogger.info("Successfully removed all tasks from the WMS")

        res = self.requestClient.getRequestForJobs(jobIDs)
        if not res['OK']:
            gLogger.error("Failed to get requestID for jobs.", res['Message'])
            return res
        failoverRequests = res['Value']
        gLogger.info("Found %d jobs with associated failover requests" % len(failoverRequests))
        if not failoverRequests:
            return S_OK()

        failed = 0
        for jobID, requestName in failoverRequests.items():
            res = self.requestClient.deleteRequest(requestName)
            if not res['OK']:
                gLogger.error("Failed to remove request from RequestDB", res['Message'])
                failed += 1
            else:
                gLogger.verbose("Removed request %s associated to job %d." % (requestName, jobID))
        if failed:
            gLogger.info("Successfully removed %s requests" % (len(failoverRequests) - failed))
            gLogger.info("Failed to remove %s requests" % failed)
            return S_ERROR("Failed to remove all the request from RequestDB")
        gLogger.info("Successfully removed all the associated failover requests")
        return S_OK()
def test_FullChain(self):
    """Exercise the WMSClient job life-cycle calls on one job.

    This test will

    - call all the WMSClient methods that will end up calling all the
      JobManager service methods
    - use the JobMonitoring to verify few properties
    - call the JobCleaningAgent to eliminate job entries from the DBs
    """
    wmsClient = WMSClient()
    jobMonitor = JobMonitoringClient()
    jobStateUpdate = RPCClient('WorkloadManagement/JobStateUpdate')

    # create the job
    job = helloWorldJob()
    jobDescription = createFile(job)

    # submit the job
    res = wmsClient.submitJob(job._toJDL(xmlFile=jobDescription))
    self.assertTrue(res['OK'])
    # self.assertEqual( type( res['Value'] ), int )
    # self.assertEqual( res['Value'], res['JobID'] )
    # jobID = res['JobID']
    jobID = res['Value']

    # updating the status
    jobStateUpdate.setJobStatus(jobID, 'Running', 'Executing Minchiapp', 'source')

    # reset the job
    res = wmsClient.resetJob(jobID)
    self.assertTrue(res['OK'])

    # reschedule the job
    res = wmsClient.rescheduleJob(jobID)
    self.assertTrue(res['OK'])
    res = jobMonitor.getJobStatus(jobID)
    self.assertTrue(res['OK'])
    self.assertEqual(res['Value'], 'Received')

    # updating the status again
    jobStateUpdate.setJobStatus(jobID, 'Matched', 'matching', 'source')

    # kill the job
    res = wmsClient.killJob(jobID)
    self.assertTrue(res['OK'])
    res = jobMonitor.getJobStatus(jobID)
    self.assertTrue(res['OK'])
    self.assertEqual(res['Value'], 'Killed')

    # updating the status aaaagain
    jobStateUpdate.setJobStatus(jobID, 'Done', 'matching', 'source')

    # kill the job
    res = wmsClient.killJob(jobID)
    self.assertTrue(res['OK'])
    res = jobMonitor.getJobStatus(jobID)
    self.assertTrue(res['OK'])
    self.assertEqual(res['Value'], 'Done')  # this time it won't kill... it's done!

    # delete the job - this will just set its status to "deleted"
    res = wmsClient.deleteJob(jobID)
    self.assertTrue(res['OK'])
    res = jobMonitor.getJobStatus(jobID)
    self.assertTrue(res['OK'])
    self.assertEqual(res['Value'], 'Deleted')
class TransformationCleaningAgent( AgentModule ): """ .. class:: TransformationCleaningAgent :param DataManger dm: DataManager instance :param TransfromationClient transClient: TransfromationClient instance :param FileCatalogClient metadataClient: FileCatalogClient instance """ def __init__( self, *args, **kwargs ): """ c'tor """ AgentModule.__init__( self, *args, **kwargs ) # # data manager self.dm = None # # transformation client self.transClient = None # # wms client self.wmsClient = None # # request client self.reqClient = None # # file catalog client self.metadataClient = None # # transformations types self.transformationTypes = None # # directory locations self.directoryLocations = None # # transformation metadata self.transfidmeta = None # # archive periof in days self.archiveAfter = None # # active SEs self.activeStorages = None # # transformation log SEs self.logSE = None # # enable/disable execution self.enableFlag = None def initialize( self ): """ agent initialisation reading and setting confing opts :param self: self reference """ # # shifter proxy self.am_setOption( 'shifterProxy', 'DataManager' ) # # transformations types self.dataProcTTypes = Operations().getValue( 'Transformations/DataProcessing', ['MCSimulation', 'Merge'] ) self.dataManipTTypes = Operations().getValue( 'Transformations/DataManipulation', ['Replication', 'Removal'] ) agentTSTypes = self.am_getOption( 'TransformationTypes', [] ) if agentTSTypes: self.transformationTypes = sorted( agentTSTypes ) else: self.transformationTypes = sorted( self.dataProcTTypes + self.dataManipTTypes ) self.log.info( "Will consider the following transformation types: %s" % str( self.transformationTypes ) ) # # directory locations self.directoryLocations = sorted( self.am_getOption( 'DirectoryLocations', [ 'TransformationDB', 'MetadataCatalog' ] ) ) self.log.info( "Will search for directories in the following locations: %s" % str( self.directoryLocations ) ) # # transformation metadata self.transfidmeta = 
self.am_getOption( 'TransfIDMeta', "TransformationID" ) self.log.info( "Will use %s as metadata tag name for TransformationID" % self.transfidmeta ) # # archive periof in days self.archiveAfter = self.am_getOption( 'ArchiveAfter', 7 ) # days self.log.info( "Will archive Completed transformations after %d days" % self.archiveAfter ) # # active SEs self.activeStorages = sorted( self.am_getOption( 'ActiveSEs', [] ) ) self.log.info( "Will check the following storage elements: %s" % str( self.activeStorages ) ) # # transformation log SEs self.logSE = self.am_getOption( 'TransformationLogSE', 'LogSE' ) self.log.info( "Will remove logs found on storage element: %s" % self.logSE ) # # enable/disable execution, should be using CS option Status?? with default value as 'Active'?? self.enableFlag = self.am_getOption( 'EnableFlag', 'True' ) # # data manager # self.dm = DataManager() # # transformation client self.transClient = TransformationClient() # # wms client self.wmsClient = WMSClient() # # request client self.reqClient = ReqClient() # # file catalog client self.metadataClient = FileCatalogClient() return S_OK() ############################################################################# def execute( self ): """ execution in one agent's cycle :param self: self reference """ self.enableFlag = self.am_getOption( 'EnableFlag', 'True' ) if not self.enableFlag == 'True': self.log.info( 'TransformationCleaningAgent is disabled by configuration option EnableFlag' ) return S_OK( 'Disabled via CS flag' ) # # Obtain the transformations in Cleaning status and remove any mention of the jobs/files res = self.transClient.getTransformations( { 'Status' : 'Cleaning', 'Type' : self.transformationTypes } ) if res['OK']: for transDict in res['Value']: # # if transformation is of type `Replication` or `Removal`, there is nothing to clean. 
# # We just archive if transDict[ 'Type' ] in self.dataManipTTypes: res = self.archiveTransformation( transDict['TransformationID'] ) if not res['OK']: self.log.error( "Problems archiving transformation %s: %s" % ( transDict['TransformationID'], res['Message'] ) ) else: res = self.cleanTransformation( transDict['TransformationID'] ) if not res['OK']: self.log.error( "Problems cleaning transformation %s: %s" % ( transDict['TransformationID'], res['Message'] ) ) # # Obtain the transformations in RemovingFiles status and (wait for it) removes the output files res = self.transClient.getTransformations( { 'Status' : 'RemovingFiles', 'Type' : self.transformationTypes} ) if res['OK']: for transDict in res['Value']: res = self.removeTransformationOutput( transDict['TransformationID'] ) if not res['OK']: self.log.error( "Problems removing transformation %s: %s" % ( transDict['TransformationID'], res['Message'] ) ) # # Obtain the transformations in Completed status and archive if inactive for X days olderThanTime = datetime.utcnow() - timedelta( days = self.archiveAfter ) res = self.transClient.getTransformations( { 'Status' : 'Completed', 'Type' : self.transformationTypes }, older = olderThanTime, timeStamp = 'LastUpdate' ) if res['OK']: for transDict in res['Value']: res = self.archiveTransformation( transDict['TransformationID'] ) if not res['OK']: self.log.error( "Problems archiving transformation %s: %s" % ( transDict['TransformationID'], res['Message'] ) ) else: self.log.error( "Could not get the transformations" ) return S_OK() ############################################################################# # # Get the transformation directories for checking # def getTransformationDirectories( self, transID ): """ get the directories for the supplied transformation from the transformation system :param self: self reference :param int transID: transformation ID """ directories = [] if 'TransformationDB' in self.directoryLocations: res = 
self.transClient.getTransformationParameters( transID, ['OutputDirectories'] ) if not res['OK']: self.log.error( "Failed to obtain transformation directories", res['Message'] ) return res if type( res['Value'] ) != type( [] ): transDirectories = ast.literal_eval( res['Value'] ) else: transDirectories = res['Value'] directories = self._addDirs( transID, transDirectories, directories ) if 'MetadataCatalog' in self.directoryLocations: res = self.metadataClient.findDirectoriesByMetadata( {self.transfidmeta:transID} ) if not res['OK']: self.log.error( "Failed to obtain metadata catalog directories", res['Message'] ) return res transDirectories = res['Value'] directories = self._addDirs( transID, transDirectories, directories ) if not directories: self.log.info( "No output directories found" ) directories = sorted( directories ) return S_OK( directories ) # FIXME If a classmethod, should it not have cls instead of self? @classmethod def _addDirs( self, transID, newDirs, existingDirs ): """ append uniqe :newDirs: list to :existingDirs: list :param self: self reference :param int transID: transformationID :param list newDirs: src list of paths :param list existingDirs: dest list of paths """ for folder in newDirs: transStr = str( transID ).zfill( 8 ) if re.search( transStr, str( folder ) ): if not folder in existingDirs: existingDirs.append( folder ) return existingDirs ############################################################################# # # These are the methods for performing the cleaning of catalogs and storage # def cleanStorageContents( self, directory ): """ delete lfn dir from all active SE :param self: self reference :param sre directory: folder name """ for storageElement in self.activeStorages: res = self.__removeStorageDirectory( directory, storageElement ) if not res['OK']: return res return S_OK() def __removeStorageDirectory( self, directory, storageElement ): """ wipe out all contents from :directory: at :storageElement: :param self: self reference 
:param str directory: path :param str storageElement: SE name """ self.log.info( 'Removing the contents of %s at %s' % ( directory, storageElement ) ) se = StorageElement( storageElement ) res = se.getPfnForLfn( [directory] ) if not res['OK']: self.log.error( "Failed to get PFN for directory", res['Message'] ) return res if directory in res['Value']['Failed']: self.log.verbose( 'Failed to obtain directory PFN from LFN', '%s %s' % ( directory, res['Value']['Failed'][directory] ) ) return S_ERROR( 'Failed to obtain directory PFN from LFNs' ) storageDirectory = res['Value']['Successful'][directory] res = returnSingleResult( se.exists( storageDirectory ) ) if not res['OK']: self.log.error( "Failed to obtain existance of directory", res['Message'] ) return res exists = res['Value'] if not exists: self.log.info( "The directory %s does not exist at %s " % ( directory, storageElement ) ) return S_OK() res = returnSingleResult( se.removeDirectory( storageDirectory, recursive = True ) ) if not res['OK']: self.log.error( "Failed to remove storage directory", res['Message'] ) return res self.log.info( "Successfully removed %d files from %s at %s" % ( res['Value']['FilesRemoved'], directory, storageElement ) ) return S_OK() def cleanCatalogContents( self, directory ): """ wipe out everything from catalog under folder :directory: :param self: self reference :params str directory: folder name """ res = self.__getCatalogDirectoryContents( [directory] ) if not res['OK']: return res filesFound = res['Value'] if not filesFound: self.log.info( "No files are registered in the catalog directory %s" % directory ) return S_OK() self.log.info( "Attempting to remove %d possible remnants from the catalog and storage" % len( filesFound ) ) # Executing with shifter proxy gConfigurationData.setOptionInCFG( '/DIRAC/Security/UseServerCertificate', 'false' ) res = DataManager().removeFile( filesFound, force = True ) gConfigurationData.setOptionInCFG( '/DIRAC/Security/UseServerCertificate', 'true' 
) if not res['OK']: return res realFailure = False for lfn, reason in res['Value']['Failed'].items(): if "File does not exist" in str( reason ): self.log.warn( "File %s not found in some catalog: " % ( lfn ) ) else: self.log.error( "Failed to remove file found in the catalog", "%s %s" % ( lfn, reason ) ) realFailure = True if realFailure: return S_ERROR( "Failed to remove all files found in the catalog" ) return S_OK() def __getCatalogDirectoryContents( self, directories ): """ get catalog contents under paths :directories: :param self: self reference :param list directories: list of paths in catalog """ self.log.info( 'Obtaining the catalog contents for %d directories:' % len( directories ) ) for directory in directories: self.log.info( directory ) activeDirs = directories allFiles = {} fc = FileCatalog() while len( activeDirs ) > 0: currentDir = activeDirs[0] res = returnSingleResult( fc.listDirectory( currentDir ) ) activeDirs.remove( currentDir ) if not res['OK'] and res['Message'].endswith( 'The supplied path does not exist' ): self.log.info( "The supplied directory %s does not exist" % currentDir ) elif not res['OK']: if "No such file or directory" in res['Message']: self.log.info( "%s: %s" % ( currentDir, res['Message'] ) ) else: self.log.error( "Failed to get directory %s content: %s" % ( currentDir, res['Message'] ) ) else: dirContents = res['Value'] activeDirs.extend( dirContents['SubDirs'] ) allFiles.update( dirContents['Files'] ) self.log.info( "Found %d files" % len( allFiles ) ) return S_OK( allFiles.keys() ) def cleanTransformationLogFiles( self, directory ): """ clean up transformation logs from directory :directory: :param self: self reference :param str directory: folder name """ self.log.info( "Removing log files found in the directory %s" % directory ) res = returnSingleResult( StorageElement( self.logSE ).removeDirectory( directory ) ) if not res['OK']: self.log.error( "Failed to remove log files", res['Message'] ) return res self.log.info( 
"Successfully removed transformation log directory" ) return S_OK() ############################################################################# # # These are the functional methods for archiving and cleaning transformations # def removeTransformationOutput( self, transID ): """ This just removes any mention of the output data from the catalog and storage """ self.log.info( "Removing output data for transformation %s" % transID ) res = self.getTransformationDirectories( transID ) if not res['OK']: self.log.error( 'Problem obtaining directories for transformation %s with result "%s"' % ( transID, res ) ) return S_OK() directories = res['Value'] for directory in directories: if not re.search( '/LOG/', directory ): res = self.cleanCatalogContents( directory ) if not res['OK']: return res res = self.cleanStorageContents( directory ) if not res['OK']: return res self.log.info( "Removed directories in the catalog and storage for transformation" ) # Clean ALL the possible remnants found in the metadata catalog res = self.cleanMetadataCatalogFiles( transID ) if not res['OK']: return res self.log.info( "Successfully removed output of transformation %d" % transID ) # Change the status of the transformation to RemovedFiles res = self.transClient.setTransformationParameter( transID, 'Status', 'RemovedFiles' ) if not res['OK']: self.log.error( "Failed to update status of transformation %s to RemovedFiles" % ( transID ), res['Message'] ) return res self.log.info( "Updated status of transformation %s to RemovedFiles" % ( transID ) ) return S_OK() def archiveTransformation( self, transID ): """ This just removes job from the jobDB and the transformation DB :param self: self reference :param int transID: transformation ID """ self.log.info( "Archiving transformation %s" % transID ) # Clean the jobs in the WMS and any failover requests found res = self.cleanTransformationTasks( transID ) if not res['OK']: return res # Clean the transformation DB of the files and job information res 
= self.transClient.cleanTransformation( transID ) if not res['OK']: return res self.log.info( "Successfully archived transformation %d" % transID ) # Change the status of the transformation to archived res = self.transClient.setTransformationParameter( transID, 'Status', 'Archived' ) if not res['OK']: self.log.error( "Failed to update status of transformation %s to Archived" % ( transID ), res['Message'] ) return res self.log.info( "Updated status of transformation %s to Archived" % ( transID ) ) return S_OK() def cleanTransformation( self, transID ): """ This removes what was produced by the supplied transformation, leaving only some info and log in the transformation DB. """ self.log.info( "Cleaning transformation %s" % transID ) res = self.getTransformationDirectories( transID ) if not res['OK']: self.log.error( 'Problem obtaining directories for transformation %s with result "%s"' % ( transID, res ) ) return S_OK() directories = res['Value'] # Clean the jobs in the WMS and any failover requests found res = self.cleanTransformationTasks( transID ) if not res['OK']: return res # Clean the log files for the jobs for directory in directories: if re.search( '/LOG/', directory ): res = self.cleanTransformationLogFiles( directory ) if not res['OK']: return res res = self.cleanCatalogContents( directory ) if not res['OK']: return res res = self.cleanStorageContents( directory ) if not res['OK']: return res # Clean ALL the possible remnants found in the BK res = self.cleanMetadataCatalogFiles( transID ) if not res['OK']: return res # Clean the transformation DB of the files and job information res = self.transClient.cleanTransformation( transID ) if not res['OK']: return res self.log.info( "Successfully cleaned transformation %d" % transID ) res = self.transClient.setTransformationParameter( transID, 'Status', 'Cleaned' ) if not res['OK']: self.log.error( "Failed to update status of transformation %s to Cleaned" % ( transID ), res['Message'] ) return res self.log.info( 
"Updated status of transformation %s to Cleaned" % ( transID ) ) return S_OK() def cleanMetadataCatalogFiles( self, transID ): """ wipe out files from catalog """ res = self.metadataClient.findFilesByMetadata( { self.transfidmeta : transID } ) if not res['OK']: return res fileToRemove = res['Value'] if not fileToRemove: self.log.info( 'No files found for transID %s' % transID ) return S_OK() # Executing with shifter proxy gConfigurationData.setOptionInCFG( '/DIRAC/Security/UseServerCertificate', 'false' ) res = DataManager().removeFile( fileToRemove, force = True ) gConfigurationData.setOptionInCFG( '/DIRAC/Security/UseServerCertificate', 'true' ) if not res['OK']: return res for lfn, reason in res['Value']['Failed'].items(): self.log.error( "Failed to remove file found in metadata catalog", "%s %s" % ( lfn, reason ) ) if res['Value']['Failed']: return S_ERROR( "Failed to remove all files found in the metadata catalog" ) self.log.info( "Successfully removed all files found in the BK" ) return S_OK() ############################################################################# # # These are the methods for removing the jobs from the WMS and transformation DB # def cleanTransformationTasks( self, transID ): """ clean tasks from WMS, or from the RMS if it is a DataManipulation transformation """ res = self.__getTransformationExternalIDs( transID ) if not res['OK']: return res externalIDs = res['Value'] if externalIDs: res = self.transClient.getTransformationParameters( transID, ['Type'] ) if not res['OK']: self.log.error( "Failed to determine transformation type" ) return res transType = res['Value'] if transType in self.dataProcTTypes: res = self.__removeWMSTasks( externalIDs ) else: res = self.__removeRequests( externalIDs ) if not res['OK']: return res return S_OK() def __getTransformationExternalIDs( self, transID ): """ collect all ExternalIDs for transformation :transID: :param self: self reference :param int transID: transforamtion ID """ res = 
self.transClient.getTransformationTasks( condDict = { 'TransformationID' : transID } ) if not res['OK']: self.log.error( "Failed to get externalIDs for transformation %d" % transID, res['Message'] ) return res externalIDs = [ taskDict['ExternalID'] for taskDict in res["Value"] ] self.log.info( "Found %d tasks for transformation" % len( externalIDs ) ) return S_OK( externalIDs ) def __removeRequests( self, requestIDs ): """ This will remove requests from the (new) RMS system - #FIXME: if the old system is still installed, it won't remove anything!!! (we don't want to risk removing from the new RMS what is instead in the old) """ # FIXME: checking if the old system is still installed! from DIRAC.ConfigurationSystem.Client import PathFinder if PathFinder.getServiceURL( "RequestManagement/RequestManager" ): self.log.warn( "NOT removing requests!!" ) return S_OK() rIDs = [ int( long( j ) ) for j in requestIDs if long( j ) ] for requestName in rIDs: self.reqClient.deleteRequest( requestName ) return S_OK() def __removeWMSTasks( self, transJobIDs ): """ wipe out jobs and their requests from the system TODO: should check request status, maybe FTS files as well ??? 
:param self: self reference :param list trasnJobIDs: job IDs """ # Prevent 0 job IDs jobIDs = [ int( j ) for j in transJobIDs if int( j ) ] allRemove = True for jobList in breakListIntoChunks( jobIDs, 500 ): res = self.wmsClient.killJob( jobList ) if res['OK']: self.log.info( "Successfully killed %d jobs from WMS" % len( jobList ) ) elif ( "InvalidJobIDs" in res ) and ( "NonauthorizedJobIDs" not in res ) and ( "FailedJobIDs" not in res ): self.log.info( "Found %s jobs which did not exist in the WMS" % len( res['InvalidJobIDs'] ) ) elif "NonauthorizedJobIDs" in res: self.log.error( "Failed to kill %s jobs because not authorized" % len( res['NonauthorizedJobIDs'] ) ) allRemove = False elif "FailedJobIDs" in res: self.log.error( "Failed to kill %s jobs" % len( res['FailedJobIDs'] ) ) allRemove = False res = self.wmsClient.deleteJob( jobList ) if res['OK']: self.log.info( "Successfully removed %d jobs from WMS" % len( jobList ) ) elif ( "InvalidJobIDs" in res ) and ( "NonauthorizedJobIDs" not in res ) and ( "FailedJobIDs" not in res ): self.log.info( "Found %s jobs which did not exist in the WMS" % len( res['InvalidJobIDs'] ) ) elif "NonauthorizedJobIDs" in res: self.log.error( "Failed to remove %s jobs because not authorized" % len( res['NonauthorizedJobIDs'] ) ) allRemove = False elif "FailedJobIDs" in res: self.log.error( "Failed to remove %s jobs" % len( res['FailedJobIDs'] ) ) allRemove = False if not allRemove: return S_ERROR( "Failed to remove all remnants from WMS" ) self.log.info( "Successfully removed all tasks from the WMS" ) if not jobIDs: self.log.info( "JobIDs not present, unable to remove asociated requests." 
) return S_OK() failed = 0 # FIXME: double request client: old/new -> only the new will survive sooner or later # this is the old try: res = RequestClient().getRequestForJobs( jobIDs ) if not res['OK']: self.log.error( "Failed to get requestID for jobs.", res['Message'] ) return res failoverRequests = res['Value'] self.log.info( "Found %d jobs with associated failover requests (in the old RMS)" % len( failoverRequests ) ) if not failoverRequests: return S_OK() for jobID, requestName in failoverRequests.items(): # Put this check just in case, tasks must have associated jobs if jobID == 0 or jobID == '0': continue res = RequestClient().deleteRequest( requestName ) if not res['OK']: self.log.error( "Failed to remove request from RequestDB", res['Message'] ) failed += 1 else: self.log.verbose( "Removed request %s associated to job %d." % ( requestName, jobID ) ) except RuntimeError: failoverRequests = {} pass # FIXME: and this is the new res = self.reqClient.getRequestNamesForJobs( jobIDs ) if not res['OK']: self.log.error( "Failed to get requestID for jobs.", res['Message'] ) return res failoverRequests.update( res['Value']['Successful'] ) if not failoverRequests: return S_OK() for jobID, requestName in res['Value']['Successful'].items(): # Put this check just in case, tasks must have associated jobs if jobID == 0 or jobID == '0': continue res = self.reqClient.deleteRequest( requestName ) if not res['OK']: self.log.error( "Failed to remove request from RequestDB", res['Message'] ) failed += 1 else: self.log.verbose( "Removed request %s associated to job %d." % ( requestName, jobID ) ) if failed: self.log.info( "Successfully removed %s requests" % ( len( failoverRequests ) - failed ) ) self.log.info( "Failed to remove %s requests" % failed ) return S_ERROR( "Failed to remove all the request from RequestDB" ) self.log.info( "Successfully removed all the associated failover requests" ) return S_OK()