def putRequest( self, userName, userDN, userGroup, sourceSE, targetSE1, targetSE2 ): """ test case for user """ req = self.buildRequest( userName, userGroup, sourceSE, targetSE1, targetSE2 ) req.RequestName = "test%s-%s" % ( userName, userGroup ) req.OwnerDN = userDN req.OwnerGroup = userGroup gLogger.always( "putRequest: request '%s'" % req.RequestName ) for op in req: gLogger.always( "putRequest: => %s %s %s" % ( op.Order, op.Type, op.TargetSE ) ) for f in op: gLogger.always( "putRequest: ===> file %s" % f.LFN ) reqClient = ReqClient() delete = reqClient.deleteRequest( req.RequestName ) if not delete["OK"]: gLogger.error( "putRequest: %s" % delete["Message"] ) return delete put = reqClient.putRequest( req ) if not put["OK"]: gLogger.error( "putRequest: %s" % put["Message"] ) return put
def myRequest(): """Create a request and put it to the db""" request = Request() request.RequestName = 'myAwesomeRemovalRequest.xml' request.JobID = 0 request.SourceComponent = "myScript" remove = Operation() remove.Type = "RemoveFile" lfn = "/ilc/user/s/sailer/test.txt" rmFile = File() rmFile.LFN = lfn remove.addFile( rmFile ) request.addOperation( remove ) isValid = RequestValidator().validate( request ) if not isValid['OK']: raise RuntimeError( "Failover request is not valid: %s" % isValid['Message'] ) else: print("It is a GOGOGO") requestClient = ReqClient() result = requestClient.putRequest( request ) print(result)
def commitRequest( self ): """ Send request to the Request Management Service """ if self.request.isEmpty(): return S_OK() isValid = RequestValidator().validate( self.request ) if not isValid["OK"]: return S_ERROR( "Failover request is not valid: %s" % isValid["Message"] ) else: requestClient = ReqClient() result = requestClient.putRequest( self.request ) return result
def __init__(self, transClient=None, logger=None, requestClient=None, requestClass=None, requestValidator=None, ownerDN=None, ownerGroup=None): """ c'tor the requestClass is by default Request. If extensions want to use an extended type, they can pass it as a parameter. This is the same behavior as WorfkloTasks and jobClass """ if not logger: logger = gLogger.getSubLogger('RequestTasks') super(RequestTasks, self).__init__(transClient, logger) useCertificates = True if (bool(ownerDN) and bool(ownerGroup)) else False if not requestClient: self.requestClient = ReqClient(useCertificates=useCertificates, delegatedDN=ownerDN, delegatedGroup=ownerGroup) else: self.requestClient = requestClient if not requestClass: self.requestClass = Request else: self.requestClass = requestClass if not requestValidator: self.requestValidator = RequestValidator() else: self.requestValidator = requestValidator
def __init__( self, *args, **kwargs ): """ c'tor """ AgentModule.__init__( self, *args, **kwargs ) # # replica manager self.replicaManager = ReplicaManager() # # transformation client self.transClient = TransformationClient() # # wms client self.wmsClient = WMSClient() # # request client # FIXME: double client: only ReqClient will survive in the end self.requestClient = RequestClient() self.reqClient = ReqClient() # # file catalog clinet self.metadataClient = FileCatalogClient() # # placeholders for CS options # # transformations types self.transformationTypes = None # # directory locations self.directoryLocations = None # # transformation metadata self.transfidmeta = None # # archive periof in days self.archiveAfter = None # # active SEs self.activeStorages = None # # transformation log SEs self.logSE = None # # enable/disable execution self.enableFlag = None
def __init__( self, transClient = None, logger = None, requestClient = None, requestClass = None, requestValidator = None ): """ c'tor the requestClass is by default Request. If extensions want to use an extended type, they can pass it as a parameter. This is the same behavior as WorfkloTasks and jobClass """ if not logger: logger = gLogger.getSubLogger( 'RequestTasks' ) super( RequestTasks, self ).__init__( transClient, logger ) if not requestClient: self.requestClient = ReqClient() else: self.requestClient = requestClient if not requestClass: self.requestClass = Request else: self.requestClass = requestClass if not requestValidator: self.requestValidator = RequestValidator() else: self.requestValidator = requestValidator
def callback(self): """ Trigger the callback once all the FTS interactions are done and update the status of the Operation to 'Finished' if successful """ self.reqClient = ReqClient() res = self._callback() if res['OK']: self.status = 'Finished' return res
def __init__( self, requestJSON, handlersDict, csPath, agentName, standalone=False, requestClient=None): """c'tor :param self: self reference :param str requestJSON: request serialized to JSON :param dict opHandlers: operation handlers """ self.request = Request(requestJSON) # # csPath self.csPath = csPath # # agent name self.agentName = agentName # # standalone flag self.standalone = standalone # # handlers dict self.handlersDict = handlersDict # # handlers class def self.handlers = {} # # own sublogger self.log = gLogger.getSubLogger("pid_%s/%s" % (os.getpid(), self.request.RequestName)) # # get shifters info self.__managersDict = {} shifterProxies = self.__setupManagerProxies() if not shifterProxies["OK"]: self.log.error(shifterProxies["Message"]) # # initialize gMonitor gMonitor.setComponentType(gMonitor.COMPONENT_AGENT) gMonitor.setComponentName(self.agentName) gMonitor.initialize() # # own gMonitor activities gMonitor.registerActivity("RequestAtt", "Requests processed", "RequestExecutingAgent", "Requests/min", gMonitor.OP_SUM) gMonitor.registerActivity("RequestFail", "Requests failed", "RequestExecutingAgent", "Requests/min", gMonitor.OP_SUM) gMonitor.registerActivity("RequestOK", "Requests done", "RequestExecutingAgent", "Requests/min", gMonitor.OP_SUM) if requestClient is None: self.requestClient = ReqClient() else: self.requestClient = requestClient
def initialize( self ): """ agent initialisation reading and setting confing opts :param self: self reference """ # # shifter proxy self.am_setOption( 'shifterProxy', 'DataManager' ) # # transformations types self.dataProcTTypes = Operations().getValue( 'Transformations/DataProcessing', ['MCSimulation', 'Merge'] ) self.dataManipTTypes = Operations().getValue( 'Transformations/DataManipulation', ['Replication', 'Removal'] ) agentTSTypes = self.am_getOption( 'TransformationTypes', [] ) if agentTSTypes: self.transformationTypes = sorted( agentTSTypes ) else: self.transformationTypes = sorted( self.dataProcTTypes + self.dataManipTTypes ) self.log.info( "Will consider the following transformation types: %s" % str( self.transformationTypes ) ) # # directory locations self.directoryLocations = sorted( self.am_getOption( 'DirectoryLocations', [ 'TransformationDB', 'MetadataCatalog' ] ) ) self.log.info( "Will search for directories in the following locations: %s" % str( self.directoryLocations ) ) # # transformation metadata self.transfidmeta = self.am_getOption( 'TransfIDMeta', "TransformationID" ) self.log.info( "Will use %s as metadata tag name for TransformationID" % self.transfidmeta ) # # archive periof in days self.archiveAfter = self.am_getOption( 'ArchiveAfter', 7 ) # days self.log.info( "Will archive Completed transformations after %d days" % self.archiveAfter ) # # active SEs self.activeStorages = sorted( self.am_getOption( 'ActiveSEs', [] ) ) self.log.info( "Will check the following storage elements: %s" % str( self.activeStorages ) ) # # transformation log SEs self.logSE = Operations().getValue( '/LogStorage/LogSE', 'LogSE' ) self.log.info( "Will remove logs found on storage element: %s" % self.logSE ) # # enable/disable execution, should be using CS option Status?? with default value as 'Active'?? self.enableFlag = self.am_getOption( 'EnableFlag', 'True' ) # # data manager # self.dm = DataManager() # # transformation client self.transClient = TransformationClient() # # wms client self.wmsClient = WMSClient() # # request client self.reqClient = ReqClient() # # file catalog client self.metadataClient = FileCatalogClient() return S_OK()
def initialize(self): """ agent initialisation reading and setting confing opts :param self: self reference """ # # shifter proxy # See cleanCatalogContents method: this proxy will be used ALSO when the file catalog used # is the DIRAC File Catalog (DFC). # This is possible because of unset of the "UseServerCertificate" option self.shifterProxy = self.am_getOption('shifterProxy', None) # # transformations types self.dataProcTTypes = Operations().getValue('Transformations/DataProcessing', self.dataProcTTypes) self.dataManipTTypes = Operations().getValue('Transformations/DataManipulation', self.dataManipTTypes) agentTSTypes = self.am_getOption('TransformationTypes', []) if agentTSTypes: self.transformationTypes = sorted(agentTSTypes) else: self.transformationTypes = sorted(self.dataProcTTypes + self.dataManipTTypes) self.log.info("Will consider the following transformation types: %s" % str(self.transformationTypes)) # # directory locations self.directoryLocations = sorted(self.am_getOption('DirectoryLocations', self.directoryLocations)) self.log.info("Will search for directories in the following locations: %s" % str(self.directoryLocations)) # # transformation metadata self.transfidmeta = self.am_getOption('TransfIDMeta', self.transfidmeta) self.log.info("Will use %s as metadata tag name for TransformationID" % self.transfidmeta) # # archive periof in days self.archiveAfter = self.am_getOption('ArchiveAfter', self.archiveAfter) # days self.log.info("Will archive Completed transformations after %d days" % self.archiveAfter) # # active SEs self.activeStorages = sorted(self.am_getOption('ActiveSEs', self.activeStorages)) if self.activeStorages: self.log.info("Will check the following storage elements: %s" % str(self.activeStorages)) # # transformation log SEs self.logSE = Operations().getValue('/LogStorage/LogSE', self.logSE) self.log.info("Will remove logs found on storage element: %s" % self.logSE) # # transformation client self.transClient = TransformationClient() # # wms client self.wmsClient = WMSClient() # # request client self.reqClient = ReqClient() # # file catalog client self.metadataClient = FileCatalogClient() return S_OK()
def __setRemovalRequest(self, lfn, ownerDN, ownerGroup): """ Set removal request with the given credentials """ oRequest = Request() oRequest.OwnerDN = ownerDN oRequest.OwnerGroup = ownerGroup oRequest.RequestName = os.path.basename( lfn).strip() + '_removal_request.xml' oRequest.SourceComponent = 'JobCleaningAgent' removeFile = Operation() removeFile.Type = 'RemoveFile' removedFile = File() removedFile.LFN = lfn removeFile.addFile(removedFile) oRequest.addOperation(removeFile) return ReqClient().putRequest(oRequest)
def __init__(self, *args, **kwargs): AgentModule.__init__(self, *args, **kwargs) self.name = 'FileStatusTransformationAgent' self.enabled = False self.shifterProxy = 'DataManager' self.transformationTypes = ["Replication"] self.transformationStatuses = ["Active"] self.transformationFileStatuses = ["Assigned", "Problematic", "Processed", "Unused"] self.addressTo = ["*****@*****.**"] self.addressFrom = "*****@*****.**" self.emailSubject = "FileStatusTransformationAgent" self.accounting = defaultdict(list) self.errors = [] self.fcClient = FileCatalogClient() self.tClient = TransformationClient() self.reqClient = ReqClient() self.nClient = NotificationClient()
def __init__(self, requestJSON, handlersDict, csPath, agentName): """c'tor :param self: self reference :param str requestJSON: request serialized to JSON :param dict opHandlers: operation handlers """ self.request = Request(requestJSON) # # csPath self.csPath = csPath # # agent name self.agentName = agentName # # handlers dict self.handlersDict = handlersDict # # handlers class def self.handlers = {} # # own sublogger self.log = gLogger.getSubLogger(self.request.RequestName) # # get shifters info self.__managersDict = {} shifterProxies = self.__setupManagerProxies() if not shifterProxies["OK"]: self.log.error(shifterProxies["Message"]) # # initialize gMonitor gMonitor.setComponentType(gMonitor.COMPONENT_AGENT) gMonitor.setComponentName(self.agentName) gMonitor.initialize() # # own gMonitor activities gMonitor.registerActivity("RequestAtt", "Requests processed", "RequestExecutingAgent", "Requests/min", gMonitor.OP_SUM) gMonitor.registerActivity("RequestFail", "Requests failed", "RequestExecutingAgent", "Requests/min", gMonitor.OP_SUM) gMonitor.registerActivity("RequestOK", "Requests done", "RequestExecutingAgent", "Requests/min", gMonitor.OP_SUM) self.requestClient = ReqClient()
def __deleteSandboxFromExternalBackend(self, SEName, SEPFN): if self.getCSOption("DelayedExternalDeletion", True): gLogger.info("Setting deletion request") try: # We need the hostDN used in order to pass these credentials to the # SandboxStoreDB.. hostCertLocation, _ = Locations.getHostCertificateAndKeyLocation() hostCert = X509Certificate.X509Certificate() hostCert.loadFromFile(hostCertLocation) hostDN = hostCert.getSubjectDN().get("Value") # use the host authentication to fetch the data result = self.sandboxDB.getSandboxOwner(SEName, SEPFN, hostDN, "hosts") if not result["OK"]: return result _owner, ownerDN, ownerGroup = result["Value"] request = Request() request.RequestName = "RemoteSBDeletion:%s|%s:%s" % (SEName, SEPFN, time.time()) request.OwnerDN = ownerDN request.OwnerGroup = ownerGroup physicalRemoval = Operation() physicalRemoval.Type = "PhysicalRemoval" physicalRemoval.TargetSE = SEName fileToRemove = File() fileToRemove.PFN = SEPFN physicalRemoval.addFile(fileToRemove) request.addOperation(physicalRemoval) return ReqClient().putRequest(request) except Exception as e: gLogger.exception("Exception while setting deletion request") return S_ERROR("Cannot set deletion request: %s" % str(e)) else: gLogger.info("Deleting external Sandbox") try: return StorageElement(SEName).removeFile(SEPFN) except Exception: gLogger.exception("RM raised an exception while trying to delete a remote sandbox") return S_ERROR("RM raised an exception while trying to delete a remote sandbox")
def __setRemovalRequest(self, lfn, ownerDN, ownerGroup): """Set removal request with the given credentials""" oRequest = Request() oRequest.OwnerDN = ownerDN oRequest.OwnerGroup = ownerGroup oRequest.RequestName = os.path.basename( lfn).strip() + "_removal_request.xml" oRequest.SourceComponent = "JobCleaningAgent" removeFile = Operation() removeFile.Type = "RemoveFile" removedFile = File() removedFile.LFN = lfn removeFile.addFile(removedFile) oRequest.addOperation(removeFile) # put the request with the owner certificate to make sure it's still a valid DN return ReqClient(useCertificates=True, delegatedDN=ownerDN, delegatedGroup=ownerGroup).putRequest(oRequest)
def setUp(self): """ test case set up """ gLogger.setLevel('INFO') self.file = File() self.file.LFN = "/lhcb/user/c/cibak/testFile" self.file.Checksum = "123456" self.file.ChecksumType = "ADLER32" self.file2 = File() self.file2.LFN = "/lhcb/user/f/fstagni/testFile" self.file2.Checksum = "654321" self.file2.ChecksumType = "ADLER32" self.operation = Operation() self.operation.Type = "ReplicateAndRegister" self.operation.TargetSE = "CERN-USER" self.operation.addFile(self.file) self.operation.addFile(self.file2) proxyInfo = getProxyInfo()['Value'] self.request = Request() self.request.RequestName = "RequestManagerHandlerTests" self.request.OwnerDN = proxyInfo['identity'] self.request.OwnerGroup = proxyInfo['group'] self.request.JobID = 123 self.request.addOperation(self.operation) # # JSON representation of a whole request self.jsonStr = self.request.toJSON()['Value'] # # request client self.requestClient = ReqClient() self.stressRequests = 1000 self.bulkRequest = 1000
def main(): # Registering arguments will automatically add their description to the help menu Script.registerArgument(" SE: StorageElement|All") Script.registerArgument(["LFN: LFN or file containing a List of LFNs"]) Script.parseCommandLine(ignoreErrors=False) # parseCommandLine show help when mandatory arguments are not specified or incorrect argument args = Script.getPositionalArgs() targetSE = args.pop(0) lfns = [] for inputFileName in args: if os.path.exists(inputFileName): with open(inputFileName, "r") as inputFile: string = inputFile.read() lfns.extend([lfn.strip() for lfn in string.splitlines()]) else: lfns.append(inputFileName) from DIRAC.Resources.Storage.StorageElement import StorageElement import DIRAC # Check is provided SE is OK if targetSE != "All": se = StorageElement(targetSE) if not se.valid: print(se.errorReason) print() Script.showHelp() from DIRAC.RequestManagementSystem.Client.Request import Request from DIRAC.RequestManagementSystem.Client.Operation import Operation from DIRAC.RequestManagementSystem.Client.File import File from DIRAC.RequestManagementSystem.Client.ReqClient import ReqClient from DIRAC.RequestManagementSystem.private.RequestValidator import RequestValidator from DIRAC.Resources.Catalog.FileCatalog import FileCatalog reqClient = ReqClient() fc = FileCatalog() requestOperation = "RemoveReplica" if targetSE == "All": requestOperation = "RemoveFile" for lfnList in breakListIntoChunks(lfns, 100): oRequest = Request() requestName = "%s_%s" % ( md5(repr(time.time()).encode()).hexdigest()[:16], md5(repr(time.time()).encode()).hexdigest()[:16], ) oRequest.RequestName = requestName oOperation = Operation() oOperation.Type = requestOperation oOperation.TargetSE = targetSE res = fc.getFileMetadata(lfnList) if not res["OK"]: print("Can't get file metadata: %s" % res["Message"]) DIRAC.exit(1) if res["Value"]["Failed"]: print( "Could not get the file metadata of the following, so skipping them:" ) for fFile in res["Value"]["Failed"]: print(fFile) lfnMetadata = res["Value"]["Successful"] for lfn in lfnMetadata: rarFile = File() rarFile.LFN = lfn rarFile.Size = lfnMetadata[lfn]["Size"] rarFile.Checksum = lfnMetadata[lfn]["Checksum"] rarFile.GUID = lfnMetadata[lfn]["GUID"] rarFile.ChecksumType = "ADLER32" oOperation.addFile(rarFile) oRequest.addOperation(oOperation) isValid = RequestValidator().validate(oRequest) if not isValid["OK"]: print("Request is not valid: ", isValid["Message"]) DIRAC.exit(1) result = reqClient.putRequest(oRequest) if result["OK"]: print("Request %d Submitted" % result["Value"]) else: print("Failed to submit Request: ", result["Message"])
targetSE = set(switch[1].split(',')) if reset and not force: status = 'Failed' if fixJob: status = 'Done' if terse: verbose = True if status: if not until: until = datetime.datetime.utcnow() if not since: since = until - datetime.timedelta(hours=24) from DIRAC.RequestManagementSystem.Client.ReqClient import ReqClient from DIRAC.RequestManagementSystem.Client.ReqClient import printRequest, recoverableRequest reqClient = ReqClient() if transID: if not taskIDs: gLogger.fatal("If Transformation is set, a list of Tasks should also be set") Script.showHelp(exitCode=2) # In principle, the task name is unique, so the request name should be unique as well # If ever this would not work anymore, we would need to use the transformationClient # to fetch the ExternalID requests = ['%08d_%08d' % (transID, task) for task in taskIDs] allR = True elif not jobs: requests = [] # Get full list of arguments, with and without comma for arg in [x.strip() for arg in Script.getPositionalArgs() for x in arg.split(',')]: if os.path.exists(arg):
def initialize(self): """ agent initialisation reading and setting confing opts :param self: self reference """ # # shifter proxy # See cleanCatalogContents method: this proxy will be used ALSO when the file catalog used # is the DIRAC File Catalog (DFC). # This is possible because of unset of the "UseServerCertificate" option self.shifterProxy = self.am_getOption('shifterProxy', None) # # transformations types self.dataProcTTypes = Operations().getValue( 'Transformations/DataProcessing', self.dataProcTTypes) self.dataManipTTypes = Operations().getValue( 'Transformations/DataManipulation', self.dataManipTTypes) agentTSTypes = self.am_getOption('TransformationTypes', []) if agentTSTypes: self.transformationTypes = sorted(agentTSTypes) else: self.transformationTypes = sorted(self.dataProcTTypes + self.dataManipTTypes) self.log.info("Will consider the following transformation types: %s" % str(self.transformationTypes)) # # directory locations self.directoryLocations = sorted( self.am_getOption('DirectoryLocations', self.directoryLocations)) self.log.info( "Will search for directories in the following locations: %s" % str(self.directoryLocations)) # # transformation metadata self.transfidmeta = self.am_getOption('TransfIDMeta', self.transfidmeta) self.log.info("Will use %s as metadata tag name for TransformationID" % self.transfidmeta) # # archive periof in days self.archiveAfter = self.am_getOption('ArchiveAfter', self.archiveAfter) # days self.log.info("Will archive Completed transformations after %d days" % self.archiveAfter) # # active SEs self.activeStorages = sorted( self.am_getOption('ActiveSEs', self.activeStorages)) if self.activeStorages: self.log.info("Will check the following storage elements: %s" % str(self.activeStorages)) # # transformation log SEs self.logSE = Operations().getValue('/LogStorage/LogSE', self.logSE) self.log.info("Will remove logs found on storage element: %s" % self.logSE) # # transformation client self.transClient = TransformationClient() # # wms client self.wmsClient = WMSClient() # # request client self.reqClient = ReqClient() # # file catalog client self.metadataClient = FileCatalogClient() return S_OK()
since = convertDate( switch[1] ) elif switch[0] == 'Until': until = convertDate( switch[1] ) if reset: status = 'Failed' if terse: verbose = True if status: if not until: until = datetime.datetime.utcnow() if not since: since = until - datetime.timedelta( hours = 24 ) from DIRAC.RequestManagementSystem.Client.ReqClient import ReqClient from DIRAC.RequestManagementSystem.Client.ReqClient import printRequest, recoverableRequest reqClient = ReqClient() if transID: if not taskIDs: gLogger.fatal( "If Transformation is set, a list of Tasks should also be set" ) Script.showHelp() DIRAC.exit( 2 ) # In principle, the task name is unique, so the request name should be unique as well # If ever this would not work anymore, we would need to use the transformationClient # to fetch the ExternalID requests = ['%08d_%08d' % ( transID, task ) for task in taskIDs] allR = True elif not jobs: args = Script.getPositionalArgs() if len( args ) == 1: allR = True
targetSE = set(switch[1].split(',')) if reset and not force: status = 'Failed' if fixJob: status = 'Done' if terse: verbose = True if status: if not until: until = datetime.datetime.utcnow() if not since: since = until - datetime.timedelta(hours=24) from DIRAC.RequestManagementSystem.Client.ReqClient import ReqClient from DIRAC.RequestManagementSystem.Client.ReqClient import printRequest, recoverableRequest reqClient = ReqClient() if transID: if not taskIDs: gLogger.fatal("If Transformation is set, a list of Tasks should also be set") Script.showHelp() DIRAC.exit(2) # In principle, the task name is unique, so the request name should be unique as well # If ever this would not work anymore, we would need to use the transformationClient # to fetch the ExternalID requests = ['%08d_%08d' % (transID, task) for task in taskIDs] allR = True elif not jobs: requests = [] # Get full list of arguments, with and without comma for arg in [x.strip() for arg in Script.getPositionalArgs() for x in arg.split(',')]:
def requestClient(cls): """ request client getter """ if not cls.__requestClient: cls.__requestClient = ReqClient() return cls.__requestClient
class RequestTasks(TaskBase): """ Class for handling tasks for the RMS """ def __init__(self, transClient=None, logger=None, requestClient=None, requestClass=None, requestValidator=None, ownerDN=None, ownerGroup=None): """ c'tor the requestClass is by default Request. If extensions want to use an extended type, they can pass it as a parameter. This is the same behavior as WorfkloTasks and jobClass """ if not logger: logger = gLogger.getSubLogger('RequestTasks') super(RequestTasks, self).__init__(transClient, logger) useCertificates = True if (bool(ownerDN) and bool(ownerGroup)) else False if not requestClient: self.requestClient = ReqClient(useCertificates=useCertificates, delegatedDN=ownerDN, delegatedGroup=ownerGroup) else: self.requestClient = requestClient if not requestClass: self.requestClass = Request else: self.requestClass = requestClass if not requestValidator: self.requestValidator = RequestValidator() else: self.requestValidator = requestValidator def prepareTransformationTasks(self, transBody, taskDict, owner='', ownerGroup='', ownerDN='', bulkSubmissionFlag=False): """ Prepare tasks, given a taskDict, that is created (with some manipulation) by the DB """ if not taskDict: return S_OK({}) if (not owner) or (not ownerGroup): res = getProxyInfo(False, False) if not res['OK']: return res proxyInfo = res['Value'] owner = proxyInfo['username'] ownerGroup = proxyInfo['group'] if not ownerDN: res = getDNForUsername(owner) if not res['OK']: return res ownerDN = res['Value'][0] try: transJson = json.loads(transBody) self._multiOperationsBody(transJson, taskDict, ownerDN, ownerGroup) except ValueError: # #json couldn't load self._singleOperationsBody(transBody, taskDict, ownerDN, ownerGroup) return S_OK(taskDict) def _multiOperationsBody(self, transJson, taskDict, ownerDN, ownerGroup): """ deal with a Request that has multiple operations :param transJson: list of lists of string and dictionaries, e.g.: .. code :: python body = [ ( "ReplicateAndRegister", { "SourceSE":"FOO-SRM", "TargetSE":"BAR-SRM" }), ( "RemoveReplica", { "TargetSE":"FOO-SRM" } ), ] :param dict taskDict: dictionary of tasks, modified in this function :param str ownerDN: certificate DN used for the requests :param str onwerGroup: dirac group used for the requests :returns: None """ failedTasks = [] for taskID, task in taskDict.items(): transID = task['TransformationID'] if not task.get('InputData'): self._logError("Error creating request for task", "%s, No input data" % taskID, transID=transID) taskDict.pop(taskID) continue files = [] oRequest = Request() if isinstance(task['InputData'], list): files = task['InputData'] elif isinstance(task['InputData'], basestring): files = task['InputData'].split(';') # create the operations from the json structure for operationTuple in transJson: op = Operation() op.Type = operationTuple[0] for parameter, value in operationTuple[1].iteritems(): setattr(op, parameter, value) for lfn in files: opFile = File() opFile.LFN = lfn op.addFile(opFile) oRequest.addOperation(op) result = self._assignRequestToTask(oRequest, taskDict, transID, taskID, ownerDN, ownerGroup) if not result['OK']: failedTasks.append(taskID) # Remove failed tasks for taskID in failedTasks: taskDict.pop(taskID) def _singleOperationsBody(self, transBody, taskDict, ownerDN, ownerGroup): """ deal with a Request that has just one operation, as it was sofar :param transBody: string, can be an empty string :param dict taskDict: dictionary of tasks, modified in this function :param str ownerDN: certificate DN used for the requests :param str onwerGroup: dirac group used for the requests :returns: None """ requestOperation = 'ReplicateAndRegister' if transBody: try: _requestType, requestOperation = transBody.split(';') except AttributeError: pass failedTasks = [] # Do not remove sorted, we might pop elements in the loop for taskID, task in taskDict.iteritems(): transID = task['TransformationID'] oRequest = Request() transfer = Operation() transfer.Type = requestOperation transfer.TargetSE = task['TargetSE'] # If there are input files if task.get('InputData'): if isinstance(task['InputData'], list): files = task['InputData'] elif isinstance(task['InputData'], basestring): files = task['InputData'].split(';') for lfn in files: trFile = File() trFile.LFN = lfn transfer.addFile(trFile) oRequest.addOperation(transfer) result = self._assignRequestToTask(oRequest, taskDict, transID, taskID, ownerDN, ownerGroup) if not result['OK']: failedTasks.append(taskID) # Remove failed tasks for taskID in failedTasks: taskDict.pop(taskID) def _assignRequestToTask(self, oRequest, taskDict, transID, taskID, ownerDN, ownerGroup): """set ownerDN and group to request, and add the request to taskDict if it is valid, otherwise remove the task from the taskDict :param oRequest: Request :param dict taskDict: dictionary of tasks, modified in this function :param int transID: Transformation ID :param int taskID: Task ID :param str ownerDN: certificate DN used for the requests :param str onwerGroup: dirac group used for the requests :returns: None """ oRequest.RequestName = self._transTaskName(transID, taskID) oRequest.OwnerDN = ownerDN oRequest.OwnerGroup = ownerGroup isValid = self.requestValidator.validate(oRequest) if not isValid['OK']: self._logError("Error creating request for task", "%s %s" % (taskID, isValid), transID=transID) return S_ERROR('Error creating request') taskDict[taskID]['TaskObject'] = oRequest return S_OK() def submitTransformationTasks(self, taskDict): """ Submit requests one by one """ submitted = 0 failed = 0 startTime = time.time() method = 'submitTransformationTasks' for task in taskDict.itervalues(): # transID is the same for all tasks, so pick it up every time here transID = task['TransformationID'] if not task['TaskObject']: task['Success'] = False failed += 1 continue res = self.submitTaskToExternal(task['TaskObject']) if res['OK']: task['ExternalID'] = res['Value'] task['Success'] = True submitted += 1 else: self._logError("Failed to submit task to RMS", res['Message'], transID=transID) task['Success'] = False failed += 1 if submitted: self._logInfo('Submitted %d tasks to RMS in %.1f seconds' % (submitted, time.time() - startTime), transID=transID, method=method) if failed: self._logWarn('Failed to submit %d tasks to RMS.' % (failed), transID=transID, method=method) return S_OK(taskDict) def submitTaskToExternal(self, oRequest): """ Submits a request to RMS """ if isinstance(oRequest, self.requestClass): return self.requestClient.putRequest(oRequest, useFailoverProxy=False, retryMainService=2) return S_ERROR("Request should be a Request object") def updateTransformationReservedTasks(self, taskDicts): requestNameIDs = {} noTasks = [] for taskDict in taskDicts: requestName = self._transTaskName(taskDict['TransformationID'], taskDict['TaskID']) reqID = taskDict['ExternalID'] if reqID: requestNameIDs[requestName] = reqID else: noTasks.append(requestName) return S_OK({'NoTasks': noTasks, 'TaskNameIDs': requestNameIDs}) def getSubmittedTaskStatus(self, taskDicts): """ Check if tasks changed status, and return a list of tasks per new status """ updateDict = {} badRequestID = 0 for taskDict in taskDicts: oldStatus = taskDict['ExternalStatus'] # ExternalID is normally a string if taskDict['ExternalID'] and int(taskDict['ExternalID']): newStatus = self.requestClient.getRequestStatus( taskDict['ExternalID']) if not newStatus['OK']: log = self._logVerbose if 'not exist' in newStatus[ 'Message'] else self._logWarn log("getSubmittedTaskStatus: Failed to get requestID for request", newStatus['Message'], transID=taskDict['TransformationID']) else: newStatus = newStatus['Value'] # We don't care updating the tasks to Assigned while the request is being processed if newStatus != oldStatus and newStatus != 'Assigned': updateDict.setdefault(newStatus, []).append(taskDict['TaskID']) else: badRequestID += 1 if badRequestID: self._logWarn("%d requests have identifier 0" % badRequestID) return S_OK(updateDict) def getSubmittedFileStatus(self, fileDicts): """ Check if transformation files changed status, and return a list of taskIDs per new status """ # Don't try and get status of not submitted tasks! transID = None taskFiles = {} for fileDict in fileDicts: # There is only one transformation involved, get however the transID in the loop transID = fileDict['TransformationID'] taskID = int(fileDict['TaskID']) taskFiles.setdefault(taskID, []).append(fileDict['LFN']) # Should not happen, but just in case there are no files, return if transID is None: return S_OK({}) res = self.transClient.getTransformationTasks({ 'TransformationID': transID, 'TaskID': taskFiles.keys() }) if not res['OK']: return res requestFiles = {} for taskDict in res['Value']: taskID = taskDict['TaskID'] externalID = taskDict['ExternalID'] # Only consider tasks that are submitted, ExternalID is a string if taskDict['ExternalStatus'] != 'Created' and externalID and int( externalID): requestFiles[externalID] = taskFiles[taskID] updateDict = {} for requestID, lfnList in requestFiles.iteritems(): statusDict = self.requestClient.getRequestFileStatus( requestID, lfnList) if not statusDict['OK']: log = self._logVerbose if 'not exist' in statusDict[ 'Message'] else self._logWarn log("Failed to get files status for request", statusDict['Message'], transID=transID, method='getSubmittedFileStatus') else: for lfn, newStatus in statusDict['Value'].iteritems(): if newStatus == 'Done': updateDict[lfn] = 'Processed' elif newStatus == 'Failed': updateDict[lfn] = 'Problematic' return S_OK(updateDict)
def main(): # Registering arguments will automatically add their description to the help menu Script.registerArgument(" sourceSE: source SE") Script.registerArgument(" LFN: LFN or file containing a List of LFNs") Script.registerArgument(["targetSE: target SEs"]) Script.parseCommandLine() import DIRAC from DIRAC import gLogger # parseCommandLine show help when mandatory arguments are not specified or incorrect argument args = Script.getPositionalArgs() sourceSE = args[0] lfnList = getLFNList(args[1]) targetSEs = list(set([se for targetSE in args[2:] for se in targetSE.split(",")])) gLogger.info( "Will create request with 'MoveReplica' " "operation using %s lfns and %s target SEs" % (len(lfnList), len(targetSEs)) ) from DIRAC.RequestManagementSystem.Client.ReqClient import ReqClient from DIRAC.RequestManagementSystem.Client.Request import Request from DIRAC.RequestManagementSystem.Client.Operation import Operation from DIRAC.RequestManagementSystem.Client.File import File from DIRAC.Resources.Catalog.FileCatalog import FileCatalog from DIRAC.Core.Utilities.List import breakListIntoChunks lfnChunks = breakListIntoChunks(lfnList, 100) multiRequests = len(lfnChunks) > 1 error = 0 count = 0 reqClient = ReqClient() fc = FileCatalog() for lfnChunk in lfnChunks: metaDatas = fc.getFileMetadata(lfnChunk) if not metaDatas["OK"]: gLogger.error("unable to read metadata for lfns: %s" % metaDatas["Message"]) error = -1 continue metaDatas = metaDatas["Value"] for failedLFN, reason in metaDatas["Failed"].items(): gLogger.error("skipping %s: %s" % (failedLFN, reason)) lfnChunk = set(metaDatas["Successful"]) if not lfnChunk: gLogger.error("LFN list is empty!!!") error = -1 continue if len(lfnChunk) > Operation.MAX_FILES: gLogger.error("too many LFNs, max number of files per operation is %s" % Operation.MAX_FILES) error = -1 continue count += 1 request = Request() request.RequestName = "%s_%s" % ( md5(repr(time.time()).encode()).hexdigest()[:16], md5(repr(time.time()).encode()).hexdigest()[:16], ) moveReplica = Operation() moveReplica.Type = "MoveReplica" moveReplica.SourceSE = sourceSE moveReplica.TargetSE = ",".join(targetSEs) for lfn in lfnChunk: metaDict = metaDatas["Successful"][lfn] opFile = File() opFile.LFN = lfn opFile.Size = metaDict["Size"] if "Checksum" in metaDict: # # should check checksum type, now assuming Adler32 (metaDict["ChecksumType"] = 'AD' opFile.Checksum = metaDict["Checksum"] opFile.ChecksumType = "ADLER32" moveReplica.addFile(opFile) request.addOperation(moveReplica) result = reqClient.putRequest(request) if not result["OK"]: gLogger.error("Failed to submit Request: %s" % (result["Message"])) error = -1 continue if not multiRequests: gLogger.always("Request %d submitted successfully" % result["Value"]) if multiRequests: gLogger.always("%d requests have been submitted" % (count)) DIRAC.exit(error)
from DIRAC.Core.Base import Script Script.setUsageMessage('\n'.join([ __doc__, 'Usage:', ' %s [option|cfgfile] <Request list>' % Script.scriptName ])) if __name__ == "__main__": from DIRAC.Core.Base.Script import parseCommandLine parseCommandLine() import DIRAC requests = [] from DIRAC.RequestManagementSystem.Client.ReqClient import ReqClient reqClient = ReqClient() args = Script.getPositionalArgs() if len(args) == 1: requests = [reqName for reqName in args[0].split(',') if reqName] if not requests: DIRAC.gLogger.fatal("Need at least one request name") Script.showHelp() DIRAC.exit(1) for reqName in requests: reqName = reqName.strip() res = reqClient.cancelRequest(reqName) if res['OK']: DIRAC.gLogger.always("Request %s canceled" % reqName)
def main(): """ Main executive code """ Script.registerSwitch("", "Job=", " JobID[,jobID2,...]") Script.registerSwitch("", "Transformation=", " transformation ID") Script.registerSwitch("", "Tasks=", " Associated to --Transformation, list of taskIDs") Script.registerSwitch("", "Verbose", " Print more information") Script.registerSwitch("", "Terse", " Only print request status") Script.registerSwitch("", "Full", " Print full request content") Script.registerSwitch("", "Status=", " Select all requests in a given status") Script.registerSwitch( "", "Since=", " Associated to --Status, start date yyyy-mm-dd or nb of days (default= -one day" ) Script.registerSwitch("", "Until=", " Associated to --Status, end date (default= now") Script.registerSwitch("", "Maximum=", " Associated to --Status, max number of requests ") Script.registerSwitch("", "Reset", " Reset Failed files to Waiting if any") Script.registerSwitch("", "Force", " Force reset even if not Failed") Script.registerSwitch( "", "All", " (if --Status Failed) all requests, otherwise exclude irrecoverable failures" ) Script.registerSwitch("", "FixJob", " Set job Done if the request is Done") Script.registerSwitch("", "Cancel", " Cancel the request") Script.registerSwitch("", "ListJobs", " List the corresponding jobs") Script.registerSwitch("", "TargetSE=", " Select request only if that SE is in the targetSEs") # Registering arguments will automatically add their description to the help menu Script.registerArgument( ( "file: a file containing a list of requests (Comma-separated on each line)", "request: a request ID or a unique request name", ), mandatory=False, ) Script.registerArgument(["request: a request ID or a unique request name"], mandatory=False) Script.parseCommandLine() import DIRAC from DIRAC import gLogger jobs = [] requestID = 0 transID = None taskIDs = None tasks = None requests = [] full = False verbose = False status = None until = None since = None terse = False allR = False reset = False fixJob = False maxRequests = 999999999999 cancel = False listJobs = False force = False targetSE = set() for switch in Script.getUnprocessedSwitches(): if switch[0] == "Job": jobs = [] job = "Unknown" try: for arg in switch[1].split(","): if os.path.exists(arg): with open(arg, "r") as fp: lines = fp.readlines() for line in lines: for job in line.split(","): jobs += [int(job.strip())] gLogger.notice("Found %d jobs in file %s" % (len(jobs), arg)) else: jobs.append(int(arg)) except TypeError: gLogger.fatal("Invalid jobID", job) elif switch[0] == "Transformation": try: transID = int(switch[1]) except Exception: gLogger.fatal("Invalid transID", switch[1]) elif switch[0] == "Tasks": try: taskIDs = [int(task) for task in switch[1].split(",")] except Exception: gLogger.fatal("Invalid tasks", switch[1]) elif switch[0] == "Full": full = True elif switch[0] == "Verbose": verbose = True elif switch[0] == "Terse": terse = True elif switch[0] == "All": allR = True elif switch[0] == "Reset": reset = True elif switch[0] == "Force": force = True elif switch[0] == "Status": status = switch[1].capitalize() elif switch[0] == "Since": since = convertDate(switch[1]) elif switch[0] == "Until": until = convertDate(switch[1]) elif switch[0] == "FixJob": fixJob = True elif switch[0] == "Cancel": cancel = True elif switch[0] == "ListJobs": listJobs = True elif switch[0] == "Maximum": try: maxRequests = int(switch[1]) except Exception: pass elif switch[0] == "TargetSE": targetSE = set(switch[1].split(",")) if reset and not force: status = "Failed" if fixJob: status = "Done" if terse: verbose = True if status: if not until: until = datetime.datetime.utcnow() if not since: since = until - datetime.timedelta(hours=24) from DIRAC.RequestManagementSystem.Client.ReqClient import ReqClient from DIRAC.RequestManagementSystem.Client.ReqClient import printRequest, recoverableRequest reqClient = ReqClient() if transID: if not taskIDs: gLogger.fatal("If Transformation is set, a list of Tasks should also be set") Script.showHelp(exitCode=2) # In principle, the task name is unique, so the request name should be unique as well # If ever this would not work anymore, we would need to use the transformationClient # to fetch the ExternalID requests = ["%08d_%08d" % (transID, task) for task in taskIDs] allR = True elif not jobs: requests = [] # Get full list of arguments, with and without comma for arg in [x.strip() for ar in Script.getPositionalArgs() for x in ar.split(",")]: if os.path.exists(arg): lines = open(arg, "r").readlines() requests += [reqID.strip() for line in lines for reqID in line.split(",")] gLogger.notice("Found %d requests in file" % len(requests)) else: requests.append(arg) allR = True else: res = reqClient.getRequestIDsForJobs(jobs) if not res["OK"]: gLogger.fatal("Error getting request for jobs", res["Message"]) DIRAC.exit(2) if res["Value"]["Failed"]: gLogger.error("No request found for jobs %s" % ",".join(sorted(str(job) for job in res["Value"]["Failed"]))) requests = sorted(res["Value"]["Successful"].values()) if requests: allR = True else: DIRAC.exit(0) if status and not requests: allR = allR or status != "Failed" res = reqClient.getRequestIDsList([status], limit=maxRequests, since=since, until=until) if not res["OK"]: gLogger.error("Error getting requests:", res["Message"]) DIRAC.exit(2) requests = [reqID for reqID, _st, updTime in res["Value"] if updTime > since and updTime <= until and reqID] gLogger.notice("Obtained %d requests %s between %s and %s" % (len(requests), status, since, until)) if not requests: gLogger.notice("No request selected....") Script.showHelp(exitCode=2) okRequests = [] jobIDList = [] for reqID in requests: # We allow reqID to be the requestName if it is unique try: # PEP-515 allows for underscore in numerical literals # So a request name 00123_00456 # is interpreted as a requestID 12300456 # Using an exception here for non-string is not an option if isinstance(reqID, str) and not reqID.isdigit(): raise ValueError() requestID = int(reqID) except (ValueError, TypeError): requestID = reqClient.getRequestIDForName(reqID) if not requestID["OK"]: gLogger.notice(requestID["Message"]) continue requestID = requestID["Value"] request = reqClient.peekRequest(requestID) if not request["OK"]: gLogger.error(request["Message"]) DIRAC.exit(-1) request = request["Value"] if not request: gLogger.error("no such request %s" % requestID) continue # If no operation as the targetSE, skip if targetSE: found = False for op in request: if op.TargetSE and targetSE.intersection(op.TargetSE.split(",")): found = True break if not found: continue # keep a list of jobIDs if requested if request.JobID and listJobs: jobIDList.append(request.JobID) if status and request.Status != status: gLogger.notice( "Request %s is not in requested status %s%s" % (reqID, status, " (cannot be reset)" if reset else "") ) continue if fixJob and request.Status == "Done" and request.JobID: # The request is for a job and is Done, verify that the job is in the proper status result = reqClient.finalizeRequest(request.RequestID, request.JobID, useCertificates=False) if not result["OK"]: gLogger.error("Error finalizing job", result["Message"]) else: gLogger.notice("Job %d updated to %s" % (request.JobID, result["Value"])) continue if cancel: if request.Status not in ("Done", "Failed"): ret = reqClient.cancelRequest(requestID) if not ret["OK"]: gLogger.error("Error canceling request %s" % reqID, ret["Message"]) else: gLogger.notice("Request %s cancelled" % reqID) else: gLogger.notice("Request %s is in status %s, not cancelled" % (reqID, request.Status)) elif allR or recoverableRequest(request): okRequests.append(str(requestID)) if reset: gLogger.notice("============ Request %s =============" % requestID) ret = reqClient.resetFailedRequest(requestID, allR=allR) if not ret["OK"]: gLogger.error("Error resetting request %s" % requestID, ret["Message"]) else: if len(requests) > 1: gLogger.notice("\n===================================") dbStatus = reqClient.getRequestStatus(requestID).get("Value", "Unknown") printRequest(request, status=dbStatus, full=full, verbose=verbose, terse=terse) if listJobs: gLogger.notice("List of %d jobs:\n" % len(jobIDList), ",".join(str(jobID) for jobID in jobIDList)) if status and okRequests: from DIRAC.Core.Utilities.List import breakListIntoChunks gLogger.notice("\nList of %d selected requests:" % len(okRequests)) for reqs in breakListIntoChunks(okRequests, 100): gLogger.notice(",".join(reqs))
class RequestTask(object): """ .. class:: RequestTask request's processing task """ def __init__( self, requestJSON, handlersDict, csPath, agentName, standalone=False, requestClient=None): """c'tor :param self: self reference :param str requestJSON: request serialized to JSON :param dict opHandlers: operation handlers """ self.request = Request(requestJSON) # # csPath self.csPath = csPath # # agent name self.agentName = agentName # # standalone flag self.standalone = standalone # # handlers dict self.handlersDict = handlersDict # # handlers class def self.handlers = {} # # own sublogger self.log = gLogger.getSubLogger("pid_%s/%s" % (os.getpid(), self.request.RequestName)) # # get shifters info self.__managersDict = {} shifterProxies = self.__setupManagerProxies() if not shifterProxies["OK"]: self.log.error(shifterProxies["Message"]) # # initialize gMonitor gMonitor.setComponentType(gMonitor.COMPONENT_AGENT) gMonitor.setComponentName(self.agentName) gMonitor.initialize() # # own gMonitor activities gMonitor.registerActivity("RequestAtt", "Requests processed", "RequestExecutingAgent", "Requests/min", gMonitor.OP_SUM) gMonitor.registerActivity("RequestFail", "Requests failed", "RequestExecutingAgent", "Requests/min", gMonitor.OP_SUM) gMonitor.registerActivity("RequestOK", "Requests done", "RequestExecutingAgent", "Requests/min", gMonitor.OP_SUM) if requestClient is None: self.requestClient = ReqClient() else: self.requestClient = requestClient def __setupManagerProxies(self): """ setup grid proxy for all defined managers """ oHelper = Operations() shifters = oHelper.getSections("Shifter") if not shifters["OK"]: self.log.error(shifters["Message"]) return shifters shifters = shifters["Value"] for shifter in shifters: shifterDict = oHelper.getOptionsDict("Shifter/%s" % shifter) if not shifterDict["OK"]: self.log.error(shifterDict["Message"]) continue userName = shifterDict["Value"].get("User", "") userGroup = shifterDict["Value"].get("Group", "") userDN = CS.getDNForUsername(userName) if not userDN["OK"]: self.log.error(userDN["Message"]) continue userDN = userDN["Value"][0] vomsAttr = CS.getVOMSAttributeForGroup(userGroup) if vomsAttr: self.log.debug("getting VOMS [%s] proxy for shifter %s@%s (%s)" % (vomsAttr, userName, userGroup, userDN)) getProxy = gProxyManager.downloadVOMSProxyToFile(userDN, userGroup, requiredTimeLeft=1200, cacheTime=4 * 43200) else: self.log.debug("getting proxy for shifter %s@%s (%s)" % (userName, userGroup, userDN)) getProxy = gProxyManager.downloadProxyToFile(userDN, userGroup, requiredTimeLeft=1200, cacheTime=4 * 43200) if not getProxy["OK"]: self.log.error(getProxy["Message"]) return S_ERROR("unable to setup shifter proxy for %s: %s" % (shifter, getProxy["Message"])) chain = getProxy["chain"] fileName = getProxy["Value"] self.log.debug("got %s: %s %s" % (shifter, userName, userGroup)) self.__managersDict[shifter] = {"ShifterDN": userDN, "ShifterName": userName, "ShifterGroup": userGroup, "Chain": chain, "ProxyFile": fileName} return S_OK() def setupProxy(self): """ download and dump request owner proxy to file and env :return: S_OK with name of newly created owner proxy file and shifter name if any """ self.__managersDict = {} shifterProxies = self.__setupManagerProxies() if not shifterProxies["OK"]: self.log.error(shifterProxies["Message"]) ownerDN = self.request.OwnerDN ownerGroup = self.request.OwnerGroup isShifter = [] for shifter, creds in self.__managersDict.items(): if creds["ShifterDN"] == ownerDN and creds["ShifterGroup"] == ownerGroup: isShifter.append(shifter) if isShifter: proxyFile = self.__managersDict[isShifter[0]]["ProxyFile"] os.environ["X509_USER_PROXY"] = proxyFile return S_OK({"Shifter": isShifter, "ProxyFile": proxyFile}) # # if we're here owner is not a shifter at all ownerProxyFile = gProxyManager.downloadVOMSProxyToFile(ownerDN, ownerGroup) if not ownerProxyFile["OK"] or not ownerProxyFile["Value"]: reason = ownerProxyFile.get("Message", "No valid proxy found in ProxyManager.") return S_ERROR("Change proxy error for '%s'@'%s': %s" % (ownerDN, ownerGroup, reason)) ownerProxyFile = ownerProxyFile["Value"] os.environ["X509_USER_PROXY"] = ownerProxyFile return S_OK({"Shifter": isShifter, "ProxyFile": ownerProxyFile}) @staticmethod def getPluginName(pluginPath): if not pluginPath: return '' if "/" in pluginPath: pluginPath = ".".join([chunk for chunk in pluginPath.split("/") if chunk]) return pluginPath.split(".")[-1] @staticmethod def loadHandler(pluginPath): """ Create an instance of requested plugin class, loading and importing it when needed. This function could raise ImportError when plugin cannot be find or TypeError when loaded class object isn't inherited from BaseOperation class. :param str pluginName: dotted path to plugin, specified as in import statement, i.e. "DIRAC.CheesShopSystem.private.Cheddar" or alternatively in 'normal' path format "DIRAC/CheesShopSystem/private/Cheddar" :return: object instance This function try to load and instantiate an object from given path. It is assumed that: * `pluginPath` is pointing to module directory "importable" by python interpreter, i.e.: it's package's top level directory is in $PYTHONPATH env variable, * the module should consist a class definition following module name, * the class itself is inherited from DIRAC.RequestManagementSystem.private.BaseOperation.BaseOperation If above conditions aren't meet, function is throwing exceptions: :raises ImportError: when class cannot be imported :raises TypeError: when class isn't inherited from OperationHandlerBase """ if "/" in pluginPath: pluginPath = ".".join([chunk for chunk in pluginPath.split("/") if chunk]) pluginName = pluginPath.split(".")[-1] if pluginName not in globals(): mod = __import__(pluginPath, globals(), fromlist=[pluginName]) pluginClassObj = getattr(mod, pluginName) else: pluginClassObj = globals()[pluginName] if not issubclass(pluginClassObj, OperationHandlerBase): raise TypeError( "operation handler '%s' isn't inherited from OperationHandlerBase class" % pluginName) for key, status in (("Att", "Attempted"), ("OK", "Successful"), ("Fail", "Failed")): gMonitor.registerActivity( "%s%s" % (pluginName, key), "%s operations %s" % (pluginName, status), "RequestExecutingAgent", "Operations/min", gMonitor.OP_SUM) # # return an instance return pluginClassObj def getHandler(self, operation): """ return instance of a handler for a given operation type on demand all created handlers are kept in self.handlers dict for further use :param ~Operation.Operation operation: Operation instance """ if operation.Type not in self.handlersDict: return S_ERROR("handler for operation '%s' not set" % operation.Type) handler = self.handlers.get(operation.Type, None) if not handler: try: handlerCls = self.loadHandler(self.handlersDict[operation.Type]) self.handlers[operation.Type] = handlerCls( csPath="%s/OperationHandlers/%s" % (self.csPath, operation.Type)) handler = self.handlers[operation.Type] except (ImportError, TypeError) as error: self.log.exception("getHandler: %s" % str(error), lException=error) return S_ERROR(str(error)) # # set operation for this handler handler.setOperation(operation) # # and return return S_OK(handler) def updateRequest(self): """ put back request to the RequestDB """ updateRequest = self.requestClient.putRequest( self.request, useFailoverProxy=False, retryMainService=2) if not updateRequest["OK"]: self.log.error(updateRequest["Message"]) return updateRequest def __call__(self): """ request processing """ self.log.debug("about to execute request") gMonitor.addMark("RequestAtt", 1) # # setup proxy for request owner setupProxy = self.setupProxy() if not setupProxy["OK"]: self.request.Error = setupProxy["Message"] if 'has no proxy registered' in setupProxy["Message"]: self.log.error('Request set to Failed:', setupProxy["Message"]) # If user is no longer registered, fail the request for operation in self.request: for opFile in operation: opFile.Status = 'Failed' operation.Status = 'Failed' else: self.log.error(setupProxy["Message"]) return S_OK(self.request) shifter = setupProxy["Value"]["Shifter"] proxyFile = setupProxy["Value"]["ProxyFile"] error = None while self.request.Status == "Waiting": # # get waiting operation operation = self.request.getWaiting() if not operation["OK"]: self.log.error(operation["Message"]) return operation operation = operation["Value"] self.log.info("executing operation #%s '%s'" % (operation.Order, operation.Type)) # # and handler for it handler = self.getHandler(operation) if not handler["OK"]: self.log.error("unable to process operation %s: %s" % (operation.Type, handler["Message"])) # gMonitor.addMark( "%s%s" % ( operation.Type, "Fail" ), 1 ) operation.Error = handler["Message"] break handler = handler["Value"] # # set shifters list in the handler handler.shifter = shifter # # and execute pluginName = self.getPluginName(self.handlersDict.get(operation.Type)) if self.standalone: useServerCertificate = gConfig.useServerCertificate() else: # Always use server certificates if executed within an agent useServerCertificate = True try: if pluginName: gMonitor.addMark("%s%s" % (pluginName, "Att"), 1) # Always use request owner proxy if useServerCertificate: gConfigurationData.setOptionInCFG('/DIRAC/Security/UseServerCertificate', 'false') exe = handler() if useServerCertificate: gConfigurationData.setOptionInCFG('/DIRAC/Security/UseServerCertificate', 'true') if not exe["OK"]: self.log.error("unable to process operation %s: %s" % (operation.Type, exe["Message"])) if pluginName: gMonitor.addMark("%s%s" % (pluginName, "Fail"), 1) gMonitor.addMark("RequestFail", 1) if self.request.JobID: # Check if the job exists monitorServer = RPCClient("WorkloadManagement/JobMonitoring", useCertificates=True) res = monitorServer.getJobPrimarySummary(int(self.request.JobID)) if not res["OK"]: self.log.error("RequestTask: Failed to get job %d status" % self.request.JobID) elif not res['Value']: self.log.warn( "RequestTask: job %d does not exist (anymore): failed request" % self.request.JobID) for opFile in operation: opFile.Status = 'Failed' if operation.Status != 'Failed': operation.Status = 'Failed' self.request.Error = 'Job no longer exists' except Exception as error: self.log.exception("hit by exception: %s" % str(error)) if pluginName: gMonitor.addMark("%s%s" % (pluginName, "Fail"), 1) gMonitor.addMark("RequestFail", 1) if useServerCertificate: gConfigurationData.setOptionInCFG('/DIRAC/Security/UseServerCertificate', 'true') break # # operation status check if operation.Status == "Done" and pluginName: gMonitor.addMark("%s%s" % (pluginName, "OK"), 1) elif operation.Status == "Failed" and pluginName: gMonitor.addMark("%s%s" % (pluginName, "Fail"), 1) elif operation.Status in ("Waiting", "Scheduled"): # # no update for waiting or all files scheduled break gMonitor.flush() if error: return S_ERROR(error) # # request done? if self.request.Status == "Done": # # update request to the RequestDB self.log.info('updating request with status %s' % self.request.Status) update = self.updateRequest() if not update["OK"]: self.log.error(update["Message"]) return update self.log.info("request '%s' is done" % self.request.RequestName) gMonitor.addMark("RequestOK", 1) # # and there is a job waiting for it? finalize! if self.request.JobID: attempts = 0 while True: finalizeRequest = self.requestClient.finalizeRequest( self.request.RequestID, self.request.JobID) # pylint: disable=no-member if not finalizeRequest["OK"]: if not attempts: self.log.error( "unable to finalize request %s: %s, will retry" % (self.request.RequestName, finalizeRequest["Message"])) self.log.verbose("Waiting 10 seconds") attempts += 1 if attempts == 10: self.log.error("giving up finalize request after %d attempts" % attempts) return S_ERROR('Could not finalize request') time.sleep(10) else: self.log.info( "request '%s' is finalized%s" % (self.request.RequestName, (' after %d attempts' % attempts) if attempts else '')) break # Request will be updated by the callBack method self.log.verbose("RequestTasks exiting, request %s" % self.request.Status) return S_OK(self.request)
class TransformationCleaningAgent( AgentModule ): """ .. class:: TransformationCleaningAgent :param DataManger dm: DataManager instance :param TransfromationClient transClient: TransfromationClient instance :param FileCatalogClient metadataClient: FileCatalogClient instance """ def __init__( self, *args, **kwargs ): """ c'tor """ AgentModule.__init__( self, *args, **kwargs ) # # data manager self.dm = None # # transformation client self.transClient = None # # wms client self.wmsClient = None # # request client self.reqClient = None # # file catalog client self.metadataClient = None # # transformations types self.transformationTypes = None # # directory locations self.directoryLocations = None # # transformation metadata self.transfidmeta = None # # archive periof in days self.archiveAfter = None # # active SEs self.activeStorages = None # # transformation log SEs self.logSE = None # # enable/disable execution self.enableFlag = None def initialize( self ): """ agent initialisation reading and setting confing opts :param self: self reference """ # # shifter proxy self.am_setOption( 'shifterProxy', 'DataManager' ) # # transformations types self.dataProcTTypes = Operations().getValue( 'Transformations/DataProcessing', ['MCSimulation', 'Merge'] ) self.dataManipTTypes = Operations().getValue( 'Transformations/DataManipulation', ['Replication', 'Removal'] ) agentTSTypes = self.am_getOption( 'TransformationTypes', [] ) if agentTSTypes: self.transformationTypes = sorted( agentTSTypes ) else: self.transformationTypes = sorted( self.dataProcTTypes + self.dataManipTTypes ) self.log.info( "Will consider the following transformation types: %s" % str( self.transformationTypes ) ) # # directory locations self.directoryLocations = sorted( self.am_getOption( 'DirectoryLocations', [ 'TransformationDB', 'MetadataCatalog' ] ) ) self.log.info( "Will search for directories in the following locations: %s" % str( self.directoryLocations ) ) # # transformation metadata self.transfidmeta = self.am_getOption( 'TransfIDMeta', "TransformationID" ) self.log.info( "Will use %s as metadata tag name for TransformationID" % self.transfidmeta ) # # archive periof in days self.archiveAfter = self.am_getOption( 'ArchiveAfter', 7 ) # days self.log.info( "Will archive Completed transformations after %d days" % self.archiveAfter ) # # active SEs self.activeStorages = sorted( self.am_getOption( 'ActiveSEs', [] ) ) self.log.info( "Will check the following storage elements: %s" % str( self.activeStorages ) ) # # transformation log SEs self.logSE = self.am_getOption( 'TransformationLogSE', 'LogSE' ) self.log.info( "Will remove logs found on storage element: %s" % self.logSE ) # # enable/disable execution, should be using CS option Status?? with default value as 'Active'?? self.enableFlag = self.am_getOption( 'EnableFlag', 'True' ) # # data manager self.dm = DataManager() # # transformation client self.transClient = TransformationClient() # # wms client self.wmsClient = WMSClient() # # request client self.reqClient = ReqClient() # # file catalog client self.metadataClient = FileCatalogClient() return S_OK() ############################################################################# def execute( self ): """ execution in one agent's cycle :param self: self reference """ self.enableFlag = self.am_getOption( 'EnableFlag', 'True' ) if not self.enableFlag == 'True': self.log.info( 'TransformationCleaningAgent is disabled by configuration option EnableFlag' ) return S_OK( 'Disabled via CS flag' ) # # Obtain the transformations in Cleaning status and remove any mention of the jobs/files res = self.transClient.getTransformations( { 'Status' : 'Cleaning', 'Type' : self.transformationTypes } ) if res['OK']: for transDict in res['Value']: # # if transformation is of type `Replication` or `Removal`, there is nothing to clean. # # We just archive if transDict[ 'Type' ] in self.dataManipTTypes: res = self.archiveTransformation( transDict['TransformationID'] ) if not res['OK']: self.log.error( "Problems archiving transformation %s: %s" % ( transDict['TransformationID'], res['Message'] ) ) else: res = self.cleanTransformation( transDict['TransformationID'] ) if not res['OK']: self.log.error( "Problems cleaning transformation %s: %s" % ( transDict['TransformationID'], res['Message'] ) ) # # Obtain the transformations in RemovingFiles status and (wait for it) removes the output files res = self.transClient.getTransformations( { 'Status' : 'RemovingFiles', 'Type' : self.transformationTypes} ) if res['OK']: for transDict in res['Value']: res = self.removeTransformationOutput( transDict['TransformationID'] ) if not res['OK']: self.log.error( "Problems removing transformation %s: %s" % ( transDict['TransformationID'], res['Message'] ) ) # # Obtain the transformations in Completed status and archive if inactive for X days olderThanTime = datetime.utcnow() - timedelta( days = self.archiveAfter ) res = self.transClient.getTransformations( { 'Status' : 'Completed', 'Type' : self.transformationTypes }, older = olderThanTime, timeStamp = 'LastUpdate' ) if res['OK']: for transDict in res['Value']: res = self.archiveTransformation( transDict['TransformationID'] ) if not res['OK']: self.log.error( "Problems archiving transformation %s: %s" % ( transDict['TransformationID'], res['Message'] ) ) else: self.log.error( "Could not get the transformations" ) return S_OK() ############################################################################# # # Get the transformation directories for checking # def getTransformationDirectories( self, transID ): """ get the directories for the supplied transformation from the transformation system :param self: self reference :param int transID: transformation ID """ directories = [] if 'TransformationDB' in self.directoryLocations: res = self.transClient.getTransformationParameters( transID, ['OutputDirectories'] ) if not res['OK']: self.log.error( "Failed to obtain transformation directories", res['Message'] ) return res transDirectories = res['Value'].splitlines() directories = self._addDirs( transID, transDirectories, directories ) if 'MetadataCatalog' in self.directoryLocations: res = self.metadataClient.findDirectoriesByMetadata( {self.transfidmeta:transID} ) if not res['OK']: self.log.error( "Failed to obtain metadata catalog directories", res['Message'] ) return res transDirectories = res['Value'] directories = self._addDirs( transID, transDirectories, directories ) if not directories: self.log.info( "No output directories found" ) directories = sorted( directories ) return S_OK( directories ) # FIXME If a classmethod, should it not have cls instead of self? @classmethod def _addDirs( self, transID, newDirs, existingDirs ): """ append uniqe :newDirs: list to :existingDirs: list :param self: self reference :param int transID: transformationID :param list newDirs: src list of paths :param list existingDirs: dest list of paths """ for folder in newDirs: transStr = str( transID ).zfill( 8 ) if re.search( transStr, str( folder ) ): if not folder in existingDirs: existingDirs.append( folder ) return existingDirs ############################################################################# # # These are the methods for performing the cleaning of catalogs and storage # def cleanStorageContents( self, directory ): """ delete lfn dir from all active SE :param self: self reference :param sre directory: folder name """ for storageElement in self.activeStorages: res = self.__removeStorageDirectory( directory, storageElement ) if not res['OK']: return res return S_OK() def __removeStorageDirectory( self, directory, storageElement ): """ wipe out all contents from :directory: at :storageElement: :param self: self reference :param str directory: path :param str storageElement: SE name """ self.log.info( 'Removing the contents of %s at %s' % ( directory, storageElement ) ) se = StorageElement( storageElement ) res = se.getPfnForLfn( [directory] ) if not res['OK']: self.log.error( "Failed to get PFN for directory", res['Message'] ) return res if directory in res['Value']['Failed']: self.log.verbose( 'Failed to obtain directory PFN from LFN', '%s %s' % ( directory, res['Value']['Failed'][directory] ) ) return S_ERROR( 'Failed to obtain directory PFN from LFNs' ) storageDirectory = res['Value']['Successful'][directory] res = returnSingleResult( se.exists( storageDirectory ) ) if not res['OK']: self.log.error( "Failed to obtain existance of directory", res['Message'] ) return res exists = res['Value'] if not exists: self.log.info( "The directory %s does not exist at %s " % ( directory, storageElement ) ) return S_OK() res = returnSingleResult( se.removeDirectory( storageDirectory, recursive = True ) ) if not res['OK']: self.log.error( "Failed to remove storage directory", res['Message'] ) return res self.log.info( "Successfully removed %d files from %s at %s" % ( res['Value']['FilesRemoved'], directory, storageElement ) ) return S_OK() def cleanCatalogContents( self, directory ): """ wipe out everything from catalog under folder :directory: :param self: self reference :params str directory: folder name """ res = self.__getCatalogDirectoryContents( [directory] ) if not res['OK']: return res filesFound = res['Value'] if not filesFound: self.log.info( "No files are registered in the catalog directory %s" % directory ) return S_OK() self.log.info( "Attempting to remove %d possible remnants from the catalog and storage" % len( filesFound ) ) res = self.dm.removeFile( filesFound, force = True ) if not res['OK']: return res realFailure = False for lfn, reason in res['Value']['Failed'].items(): if "File does not exist" in str( reason ): self.log.warn( "File %s not found in some catalog: " % ( lfn ) ) else: self.log.error( "Failed to remove file found in the catalog", "%s %s" % ( lfn, reason ) ) realFailure = True if realFailure: return S_ERROR( "Failed to remove all files found in the catalog" ) return S_OK() def __getCatalogDirectoryContents( self, directories ): """ get catalog contents under paths :directories: :param self: self reference :param list directories: list of paths in catalog """ self.log.info( 'Obtaining the catalog contents for %d directories:' % len( directories ) ) for directory in directories: self.log.info( directory ) activeDirs = directories allFiles = {} fc = FileCatalog() while len( activeDirs ) > 0: currentDir = activeDirs[0] res = returnSingleResult( fc.listDirectory( currentDir ) ) activeDirs.remove( currentDir ) if not res['OK'] and res['Message'].endswith( 'The supplied path does not exist' ): self.log.info( "The supplied directory %s does not exist" % currentDir ) elif not res['OK']: if "No such file or directory" in res['Message']: self.log.info( "%s: %s" % ( currentDir, res['Message'] ) ) else: self.log.error( "Failed to get directory %s content: %s" % ( currentDir, res['Message'] ) ) else: dirContents = res['Value'] activeDirs.extend( dirContents['SubDirs'] ) allFiles.update( dirContents['Files'] ) self.log.info( "Found %d files" % len( allFiles ) ) return S_OK( allFiles.keys() ) def cleanTransformationLogFiles( self, directory ): """ clean up transformation logs from directory :directory: :param self: self reference :param str directory: folder name """ self.log.info( "Removing log files found in the directory %s" % directory ) res = returnSingleResult( StorageElement( self.logSE ).removeDirectory( directory ) ) if not res['OK']: self.log.error( "Failed to remove log files", res['Message'] ) return res self.log.info( "Successfully removed transformation log directory" ) return S_OK() ############################################################################# # # These are the functional methods for archiving and cleaning transformations # def removeTransformationOutput( self, transID ): """ This just removes any mention of the output data from the catalog and storage """ self.log.info( "Removing output data for transformation %s" % transID ) res = self.getTransformationDirectories( transID ) if not res['OK']: self.log.error( 'Problem obtaining directories for transformation %s with result "%s"' % ( transID, res ) ) return S_OK() directories = res['Value'] for directory in directories: if not re.search( '/LOG/', directory ): res = self.cleanCatalogContents( directory ) if not res['OK']: return res res = self.cleanStorageContents( directory ) if not res['OK']: return res self.log.info( "Removed directories in the catalog and storage for transformation" ) # Clean ALL the possible remnants found in the metadata catalog res = self.cleanMetadataCatalogFiles( transID ) if not res['OK']: return res self.log.info( "Successfully removed output of transformation %d" % transID ) # Change the status of the transformation to RemovedFiles res = self.transClient.setTransformationParameter( transID, 'Status', 'RemovedFiles' ) if not res['OK']: self.log.error( "Failed to update status of transformation %s to RemovedFiles" % ( transID ), res['Message'] ) return res self.log.info( "Updated status of transformation %s to RemovedFiles" % ( transID ) ) return S_OK() def archiveTransformation( self, transID ): """ This just removes job from the jobDB and the transformation DB :param self: self reference :param int transID: transformation ID """ self.log.info( "Archiving transformation %s" % transID ) # Clean the jobs in the WMS and any failover requests found res = self.cleanTransformationTasks( transID ) if not res['OK']: return res # Clean the transformation DB of the files and job information res = self.transClient.cleanTransformation( transID ) if not res['OK']: return res self.log.info( "Successfully archived transformation %d" % transID ) # Change the status of the transformation to archived res = self.transClient.setTransformationParameter( transID, 'Status', 'Archived' ) if not res['OK']: self.log.error( "Failed to update status of transformation %s to Archived" % ( transID ), res['Message'] ) return res self.log.info( "Updated status of transformation %s to Archived" % ( transID ) ) return S_OK() def cleanTransformation( self, transID ): """ This removes what was produced by the supplied transformation, leaving only some info and log in the transformation DB. """ self.log.info( "Cleaning transformation %s" % transID ) res = self.getTransformationDirectories( transID ) if not res['OK']: self.log.error( 'Problem obtaining directories for transformation %s with result "%s"' % ( transID, res ) ) return S_OK() directories = res['Value'] # Clean the jobs in the WMS and any failover requests found res = self.cleanTransformationTasks( transID ) if not res['OK']: return res # Clean the log files for the jobs for directory in directories: if re.search( '/LOG/', directory ): res = self.cleanTransformationLogFiles( directory ) if not res['OK']: return res res = self.cleanCatalogContents( directory ) if not res['OK']: return res res = self.cleanStorageContents( directory ) if not res['OK']: return res # Clean ALL the possible remnants found in the BK res = self.cleanMetadataCatalogFiles( transID ) if not res['OK']: return res # Clean the transformation DB of the files and job information res = self.transClient.cleanTransformation( transID ) if not res['OK']: return res self.log.info( "Successfully cleaned transformation %d" % transID ) res = self.transClient.setTransformationParameter( transID, 'Status', 'Cleaned' ) if not res['OK']: self.log.error( "Failed to update status of transformation %s to Cleaned" % ( transID ), res['Message'] ) return res self.log.info( "Updated status of transformation %s to Cleaned" % ( transID ) ) return S_OK() def cleanMetadataCatalogFiles( self, transID ): """ wipe out files from catalog """ res = self.metadataClient.findFilesByMetadata( { self.transfidmeta : transID } ) if not res['OK']: return res fileToRemove = res['Value'] if not fileToRemove: self.log.info( 'No files found for transID %s' % transID ) return S_OK() res = self.dm.removeFile( fileToRemove, force = True ) if not res['OK']: return res for lfn, reason in res['Value']['Failed'].items(): self.log.error( "Failed to remove file found in metadata catalog", "%s %s" % ( lfn, reason ) ) if res['Value']['Failed']: return S_ERROR( "Failed to remove all files found in the metadata catalog" ) self.log.info( "Successfully removed all files found in the BK" ) return S_OK() ############################################################################# # # These are the methods for removing the jobs from the WMS and transformation DB # def cleanTransformationTasks( self, transID ): """ clean tasks from WMS, or from the RMS if it is a DataManipulation transformation """ res = self.__getTransformationExternalIDs( transID ) if not res['OK']: return res externalIDs = res['Value'] if externalIDs: res = self.transClient.getTransformationParameters( transID, ['Type'] ) if not res['OK']: self.log.error( "Failed to determine transformation type" ) return res transType = res['Value'] if transType in self.dataProcTTypes: res = self.__removeWMSTasks( externalIDs ) else: res = self.__removeRequests( externalIDs ) if not res['OK']: return res return S_OK() def __getTransformationExternalIDs( self, transID ): """ collect all ExternalIDs for transformation :transID: :param self: self reference :param int transID: transforamtion ID """ res = self.transClient.getTransformationTasks( condDict = { 'TransformationID' : transID } ) if not res['OK']: self.log.error( "Failed to get externalIDs for transformation %d" % transID, res['Message'] ) return res externalIDs = [ taskDict['ExternalID'] for taskDict in res["Value"] ] self.log.info( "Found %d tasks for transformation" % len( externalIDs ) ) return S_OK( externalIDs ) def __removeRequests( self, requestIDs ): """ This will remove requests from the (new) RMS system - #FIXME: if the old system is still installed, it won't remove anything!!! (we don't want to risk removing from the new RMS what is instead in the old) """ # FIXME: checking if the old system is still installed! from DIRAC.ConfigurationSystem.Client import PathFinder if PathFinder.getServiceURL( "RequestManagement/RequestManager" ): self.log.warn( "NOT removing requests!!" ) return S_OK() rIDs = [ int( long( j ) ) for j in requestIDs if long( j ) ] for requestName in rIDs: self.reqClient.deleteRequest( requestName ) return S_OK() def __removeWMSTasks( self, transJobIDs ): """ wipe out jobs and their requests from the system TODO: should check request status, maybe FTS files as well ??? :param self: self reference :param list trasnJobIDs: job IDs """ # Prevent 0 job IDs jobIDs = [ int( j ) for j in transJobIDs if int( j ) ] allRemove = True for jobList in breakListIntoChunks( jobIDs, 500 ): res = self.wmsClient.killJob( jobList ) if res['OK']: self.log.info( "Successfully killed %d jobs from WMS" % len( jobList ) ) elif ( "InvalidJobIDs" in res ) and ( "NonauthorizedJobIDs" not in res ) and ( "FailedJobIDs" not in res ): self.log.info( "Found %s jobs which did not exist in the WMS" % len( res['InvalidJobIDs'] ) ) elif "NonauthorizedJobIDs" in res: self.log.error( "Failed to kill %s jobs because not authorized" % len( res['NonauthorizedJobIDs'] ) ) allRemove = False elif "FailedJobIDs" in res: self.log.error( "Failed to kill %s jobs" % len( res['FailedJobIDs'] ) ) allRemove = False res = self.wmsClient.deleteJob( jobList ) if res['OK']: self.log.info( "Successfully removed %d jobs from WMS" % len( jobList ) ) elif ( "InvalidJobIDs" in res ) and ( "NonauthorizedJobIDs" not in res ) and ( "FailedJobIDs" not in res ): self.log.info( "Found %s jobs which did not exist in the WMS" % len( res['InvalidJobIDs'] ) ) elif "NonauthorizedJobIDs" in res: self.log.error( "Failed to remove %s jobs because not authorized" % len( res['NonauthorizedJobIDs'] ) ) allRemove = False elif "FailedJobIDs" in res: self.log.error( "Failed to remove %s jobs" % len( res['FailedJobIDs'] ) ) allRemove = False if not allRemove: return S_ERROR( "Failed to remove all remnants from WMS" ) self.log.info( "Successfully removed all tasks from the WMS" ) if not jobIDs: self.log.info( "JobIDs not present, unable to remove asociated requests." ) return S_OK() failed = 0 # FIXME: double request client: old/new -> only the new will survive sooner or later # this is the old try: res = RequestClient().getRequestForJobs( jobIDs ) if not res['OK']: self.log.error( "Failed to get requestID for jobs.", res['Message'] ) return res failoverRequests = res['Value'] self.log.info( "Found %d jobs with associated failover requests (in the old RMS)" % len( failoverRequests ) ) if not failoverRequests: return S_OK() for jobID, requestName in failoverRequests.items(): # Put this check just in case, tasks must have associated jobs if jobID == 0 or jobID == '0': continue res = RequestClient().deleteRequest( requestName ) if not res['OK']: self.log.error( "Failed to remove request from RequestDB", res['Message'] ) failed += 1 else: self.log.verbose( "Removed request %s associated to job %d." % ( requestName, jobID ) ) except RuntimeError: failoverRequests = {} pass # FIXME: and this is the new res = self.reqClient.getRequestNamesForJobs( jobIDs ) if not res['OK']: self.log.error( "Failed to get requestID for jobs.", res['Message'] ) return res failoverRequests.update( res['Value']['Successful'] ) if not failoverRequests: return S_OK() for jobID, requestName in res['Value']['Successful'].items(): # Put this check just in case, tasks must have associated jobs if jobID == 0 or jobID == '0': continue res = self.reqClient.deleteRequest( requestName ) if not res['OK']: self.log.error( "Failed to remove request from RequestDB", res['Message'] ) failed += 1 else: self.log.verbose( "Removed request %s associated to job %d." % ( requestName, jobID ) ) if failed: self.log.info( "Successfully removed %s requests" % ( len( failoverRequests ) - failed ) ) self.log.info( "Failed to remove %s requests" % failed ) return S_ERROR( "Failed to remove all the request from RequestDB" ) self.log.info( "Successfully removed all the associated failover requests" ) return S_OK()
else: lfns.append( inputFileName ) from DIRAC.Resources.Storage.StorageElement import StorageElement import DIRAC # Check is provided SE is OK se = StorageElement( targetSE ) if not se.valid: print se.errorReason print Script.showHelp() from DIRAC.RequestManagementSystem.Client.RequestContainer import RequestContainer from DIRAC.RequestManagementSystem.Client.ReqClient import ReqClient reqClient = ReqClient() requestType = 'transfer' requestOperation = 'replicateAndRegister' for lfnList in breakListIntoChunks( lfns, 100 ): oRequest = RequestContainer() subRequestIndex = oRequest.initiateSubRequest( requestType )['Value'] attributeDict = {'Operation':requestOperation, 'TargetSE':targetSE} oRequest.setSubRequestAttributes( subRequestIndex, requestType, attributeDict ) files = [] for lfn in lfnList: files.append( {'LFN':lfn} ) oRequest.setSubRequestFiles( subRequestIndex, requestType, files ) requestName = "%s_%s" % ( md5( repr( time.time() ) ).hexdigest()[:16], md5( repr( time.time() ) ).hexdigest()[:16] ) oRequest.setRequestAttributes( {'RequestName':requestName} )
class RequestTasks( TaskBase ): def __init__( self, transClient = None, logger = None, requestClient = None, requestClass = None, requestValidator = None ): """ c'tor the requestClass is by default Request. If extensions want to use an extended type, they can pass it as a parameter. This is the same behavior as WorfkloTasks and jobClass """ if not logger: logger = gLogger.getSubLogger( 'RequestTasks' ) super( RequestTasks, self ).__init__( transClient, logger ) if not requestClient: self.requestClient = ReqClient() else: self.requestClient = requestClient if not requestClass: self.requestClass = Request else: self.requestClass = requestClass if not requestValidator: self.requestValidator = RequestValidator() else: self.requestValidator = requestValidator def prepareTransformationTasks( self, transBody, taskDict, owner = '', ownerGroup = '', ownerDN = '' ): """ Prepare tasks, given a taskDict, that is created (with some manipulation) by the DB """ if ( not owner ) or ( not ownerGroup ): res = getProxyInfo( False, False ) if not res['OK']: return res proxyInfo = res['Value'] owner = proxyInfo['username'] ownerGroup = proxyInfo['group'] if not ownerDN: res = getDNForUsername( owner ) if not res['OK']: return res ownerDN = res['Value'][0] requestOperation = 'ReplicateAndRegister' if transBody: try: _requestType, requestOperation = transBody.split( ';' ) except AttributeError: pass for taskID in sorted( taskDict ): paramDict = taskDict[taskID] if paramDict['InputData']: transID = paramDict['TransformationID'] oRequest = Request() transfer = Operation() transfer.Type = requestOperation transfer.TargetSE = paramDict['TargetSE'] if isinstance( paramDict['InputData'], list ): files = paramDict['InputData'] elif isinstance( paramDict['InputData'], basestring ): files = paramDict['InputData'].split( ';' ) for lfn in files: trFile = File() trFile.LFN = lfn transfer.addFile( trFile ) oRequest.addOperation( transfer ) oRequest.RequestName = _requestName( transID, taskID ) oRequest.OwnerDN = ownerDN oRequest.OwnerGroup = ownerGroup isValid = self.requestValidator.validate( oRequest ) if not isValid['OK']: return isValid taskDict[taskID]['TaskObject'] = oRequest return S_OK( taskDict ) def submitTransformationTasks( self, taskDict ): """ Submit requests one by one """ submitted = 0 failed = 0 startTime = time.time() for taskID in sorted( taskDict ): if not taskDict[taskID]['TaskObject']: taskDict[taskID]['Success'] = False failed += 1 continue res = self.submitTaskToExternal( taskDict[taskID]['TaskObject'] ) if res['OK']: taskDict[taskID]['ExternalID'] = res['Value'] taskDict[taskID]['Success'] = True submitted += 1 else: self._logError( "Failed to submit task to RMS", res['Message'] ) taskDict[taskID]['Success'] = False failed += 1 self._logInfo( 'submitTasks: Submitted %d tasks to RMS in %.1f seconds' % ( submitted, time.time() - startTime ) ) if failed: self._logWarn( 'submitTasks: But at the same time failed to submit %d tasks to RMS.' % ( failed ) ) return S_OK( taskDict ) def submitTaskToExternal( self, oRequest ): """ Submits a request using ReqClient """ if isinstance( oRequest, self.requestClass ): return self.requestClient.putRequest( oRequest ) else: return S_ERROR( "Request should be a Request object" ) def updateTransformationReservedTasks( self, taskDicts ): requestNameIDs = {} noTasks = [] for taskDict in taskDicts: requestName = _requestName( taskDict['TransformationID'], taskDict['TaskID'] ) reqID = taskDict['ExternalID'] if reqID: requestNameIDs[requestName] = reqID else: noTasks.append( requestName ) return S_OK( {'NoTasks':noTasks, 'TaskNameIDs':requestNameIDs} ) def getSubmittedTaskStatus( self, taskDicts ): updateDict = {} for taskDict in taskDicts: oldStatus = taskDict['ExternalStatus'] newStatus = self.requestClient.getRequestStatus( taskDict['ExternalID'] ) if not newStatus['OK']: log = self._logVerbose if 'not exist' in newStatus['Message'] else self.log.warn log( "getSubmittedTaskStatus: Failed to get requestID for request", '%s' % newStatus['Message'] ) else: newStatus = newStatus['Value'] if newStatus != oldStatus: updateDict.setdefault( newStatus, [] ).append( taskDict['TaskID'] ) return S_OK( updateDict ) def getSubmittedFileStatus( self, fileDicts ): taskFiles = {} submittedTasks = {} externalIds = {} # Don't try and get status of not submitted tasks! for fileDict in fileDicts: submittedTasks.setdefault( fileDict['TransformationID'], set() ).add( int( fileDict['TaskID'] ) ) for transID in submittedTasks: res = self.transClient.getTransformationTasks( { 'TransformationID':transID, 'TaskID': list( submittedTasks[transID] )} ) if not res['OK']: return res for taskDict in res['Value']: taskID = taskDict['TaskID'] externalIds[taskID] = taskDict['ExternalID'] if taskDict['ExternalStatus'] == 'Created': submittedTasks[transID].remove( taskID ) for fileDict in fileDicts: transID = fileDict['TransformationID'] taskID = int( fileDict['TaskID'] ) if taskID in submittedTasks[transID]: requestID = externalIds[taskID] taskFiles.setdefault( requestID, {} )[fileDict['LFN']] = fileDict['Status'] updateDict = {} for requestID in sorted( taskFiles ): lfnDict = taskFiles[requestID] statusDict = self.requestClient.getRequestFileStatus( requestID, lfnDict.keys() ) if not statusDict['OK']: log = self._logVerbose if 'not exist' in statusDict['Message'] else self.log.warn log( "getSubmittedFileStatus: Failed to get files status for request", '%s' % statusDict['Message'] ) continue statusDict = statusDict['Value'] for lfn, newStatus in statusDict.items(): if newStatus == lfnDict[lfn]: pass elif newStatus == 'Done': updateDict[lfn] = 'Processed' elif newStatus == 'Failed': updateDict[lfn] = 'Problematic' return S_OK( updateDict )
"operation using %s lfns and %s target SEs" % (requestName, len(lfnList), len(targetSEs))) from DIRAC.RequestManagementSystem.Client.Request import Request from DIRAC.RequestManagementSystem.Client.Operation import Operation from DIRAC.RequestManagementSystem.Client.File import File from DIRAC.RequestManagementSystem.Client.ReqClient import ReqClient from DIRAC.Resources.Catalog.FileCatalog import FileCatalog from DIRAC.Core.Utilities.List import breakListIntoChunks lfnChunks = breakListIntoChunks(lfnList, 100) multiRequests = len(lfnChunks) > 1 error = 0 count = 0 reqClient = ReqClient() fc = FileCatalog() requestIDs = [] for lfnChunk in lfnChunks: metaDatas = fc.getFileMetadata(lfnChunk) if not metaDatas["OK"]: gLogger.error("unable to read metadata for lfns: %s" % metaDatas["Message"]) error = -1 continue metaDatas = metaDatas["Value"] for failedLFN, reason in metaDatas["Failed"].items(): gLogger.error("skipping %s: %s" % (failedLFN, reason)) lfnChunk = set(metaDatas["Successful"]) if not lfnChunk:
class RequestTasks(TaskBase): """ Class for handling tasks for the RMS """ def __init__(self, transClient=None, logger=None, requestClient=None, requestClass=None, requestValidator=None, ownerDN=None, ownerGroup=None): """ c'tor the requestClass is by default Request. If extensions want to use an extended type, they can pass it as a parameter. This is the same behavior as WorfkloTasks and jobClass """ if not logger: logger = gLogger.getSubLogger('RequestTasks') super(RequestTasks, self).__init__(transClient, logger) useCertificates = True if (bool(ownerDN) and bool(ownerGroup)) else False if not requestClient: self.requestClient = ReqClient(useCertificates=useCertificates, delegatedDN=ownerDN, delegatedGroup=ownerGroup) else: self.requestClient = requestClient if not requestClass: self.requestClass = Request else: self.requestClass = requestClass if not requestValidator: self.requestValidator = RequestValidator() else: self.requestValidator = requestValidator def prepareTransformationTasks(self, transBody, taskDict, owner='', ownerGroup='', ownerDN='', bulkSubmissionFlag=False): """ Prepare tasks, given a taskDict, that is created (with some manipulation) by the DB """ if not taskDict: return S_OK({}) if (not owner) or (not ownerGroup): res = getProxyInfo(False, False) if not res['OK']: return res proxyInfo = res['Value'] owner = proxyInfo['username'] ownerGroup = proxyInfo['group'] if not ownerDN: res = getDNForUsername(owner) if not res['OK']: return res ownerDN = res['Value'][0] try: transJson = json.loads(transBody) self._multiOperationsBody(transJson, taskDict, ownerDN, ownerGroup) except ValueError: # #json couldn't load self._singleOperationsBody(transBody, taskDict, ownerDN, ownerGroup) return S_OK(taskDict) def _multiOperationsBody(self, transJson, taskDict, ownerDN, ownerGroup): """ deal with a Request that has multiple operations :param transJson: list of lists of string and dictionaries, e.g.: .. code :: python body = [ ( "ReplicateAndRegister", { "SourceSE":"FOO-SRM", "TargetSE":"BAR-SRM" }), ( "RemoveReplica", { "TargetSE":"FOO-SRM" } ), ] :param dict taskDict: dictionary of tasks, modified in this function :param str ownerDN: certificate DN used for the requests :param str onwerGroup: dirac group used for the requests :returns: None """ failedTasks = [] for taskID, task in taskDict.items(): transID = task['TransformationID'] if not task.get('InputData'): self._logError("Error creating request for task", "%s, No input data" % taskID, transID=transID) taskDict.pop(taskID) continue files = [] oRequest = Request() if isinstance(task['InputData'], list): files = task['InputData'] elif isinstance(task['InputData'], basestring): files = task['InputData'].split(';') # create the operations from the json structure for operationTuple in transJson: op = Operation() op.Type = operationTuple[0] for parameter, value in operationTuple[1].iteritems(): setattr(op, parameter, value) for lfn in files: opFile = File() opFile.LFN = lfn op.addFile(opFile) oRequest.addOperation(op) result = self._assignRequestToTask(oRequest, taskDict, transID, taskID, ownerDN, ownerGroup) if not result['OK']: failedTasks.append(taskID) # Remove failed tasks for taskID in failedTasks: taskDict.pop(taskID) def _singleOperationsBody(self, transBody, taskDict, ownerDN, ownerGroup): """ deal with a Request that has just one operation, as it was sofar :param transBody: string, can be an empty string :param dict taskDict: dictionary of tasks, modified in this function :param str ownerDN: certificate DN used for the requests :param str onwerGroup: dirac group used for the requests :returns: None """ requestOperation = 'ReplicateAndRegister' if transBody: try: _requestType, requestOperation = transBody.split(';') except AttributeError: pass failedTasks = [] # Do not remove sorted, we might pop elements in the loop for taskID, task in taskDict.iteritems(): transID = task['TransformationID'] oRequest = Request() transfer = Operation() transfer.Type = requestOperation transfer.TargetSE = task['TargetSE'] # If there are input files if task.get('InputData'): if isinstance(task['InputData'], list): files = task['InputData'] elif isinstance(task['InputData'], basestring): files = task['InputData'].split(';') for lfn in files: trFile = File() trFile.LFN = lfn transfer.addFile(trFile) oRequest.addOperation(transfer) result = self._assignRequestToTask(oRequest, taskDict, transID, taskID, ownerDN, ownerGroup) if not result['OK']: failedTasks.append(taskID) # Remove failed tasks for taskID in failedTasks: taskDict.pop(taskID) def _assignRequestToTask(self, oRequest, taskDict, transID, taskID, ownerDN, ownerGroup): """set ownerDN and group to request, and add the request to taskDict if it is valid, otherwise remove the task from the taskDict :param oRequest: Request :param dict taskDict: dictionary of tasks, modified in this function :param int transID: Transformation ID :param int taskID: Task ID :param str ownerDN: certificate DN used for the requests :param str onwerGroup: dirac group used for the requests :returns: None """ oRequest.RequestName = self._transTaskName(transID, taskID) oRequest.OwnerDN = ownerDN oRequest.OwnerGroup = ownerGroup isValid = self.requestValidator.validate(oRequest) if not isValid['OK']: self._logError("Error creating request for task", "%s %s" % (taskID, isValid), transID=transID) return S_ERROR('Error creating request') taskDict[taskID]['TaskObject'] = oRequest return S_OK() def submitTransformationTasks(self, taskDict): """ Submit requests one by one """ submitted = 0 failed = 0 startTime = time.time() method = 'submitTransformationTasks' for task in taskDict.itervalues(): # transID is the same for all tasks, so pick it up every time here transID = task['TransformationID'] if not task['TaskObject']: task['Success'] = False failed += 1 continue res = self.submitTaskToExternal(task['TaskObject']) if res['OK']: task['ExternalID'] = res['Value'] task['Success'] = True submitted += 1 else: self._logError("Failed to submit task to RMS", res['Message'], transID=transID) task['Success'] = False failed += 1 if submitted: self._logInfo('Submitted %d tasks to RMS in %.1f seconds' % (submitted, time.time() - startTime), transID=transID, method=method) if failed: self._logWarn('Failed to submit %d tasks to RMS.' % (failed), transID=transID, method=method) return S_OK(taskDict) def submitTaskToExternal(self, oRequest): """ Submits a request to RMS """ if isinstance(oRequest, self.requestClass): return self.requestClient.putRequest(oRequest, useFailoverProxy=False, retryMainService=2) return S_ERROR("Request should be a Request object") def updateTransformationReservedTasks(self, taskDicts): requestNameIDs = {} noTasks = [] for taskDict in taskDicts: requestName = self._transTaskName(taskDict['TransformationID'], taskDict['TaskID']) reqID = taskDict['ExternalID'] if reqID: requestNameIDs[requestName] = reqID else: noTasks.append(requestName) return S_OK({'NoTasks': noTasks, 'TaskNameIDs': requestNameIDs}) def getSubmittedTaskStatus(self, taskDicts): """ Check if tasks changed status, and return a list of tasks per new status """ updateDict = {} badRequestID = 0 for taskDict in taskDicts: oldStatus = taskDict['ExternalStatus'] # ExternalID is normally a string if taskDict['ExternalID'] and int(taskDict['ExternalID']): newStatus = self.requestClient.getRequestStatus(taskDict['ExternalID']) if not newStatus['OK']: log = self._logVerbose if 'not exist' in newStatus['Message'] else self._logWarn log("getSubmittedTaskStatus: Failed to get requestID for request", newStatus['Message'], transID=taskDict['TransformationID']) else: newStatus = newStatus['Value'] # We don't care updating the tasks to Assigned while the request is being processed if newStatus != oldStatus and newStatus != 'Assigned': updateDict.setdefault(newStatus, []).append(taskDict['TaskID']) else: badRequestID += 1 if badRequestID: self._logWarn("%d requests have identifier 0" % badRequestID) return S_OK(updateDict) def getSubmittedFileStatus(self, fileDicts): """ Check if transformation files changed status, and return a list of taskIDs per new status """ # Don't try and get status of not submitted tasks! transID = None taskFiles = {} for fileDict in fileDicts: # There is only one transformation involved, get however the transID in the loop transID = fileDict['TransformationID'] taskID = int(fileDict['TaskID']) taskFiles.setdefault(taskID, []).append(fileDict['LFN']) # Should not happen, but just in case there are no files, return if transID is None: return S_OK({}) res = self.transClient.getTransformationTasks({'TransformationID': transID, 'TaskID': taskFiles.keys()}) if not res['OK']: return res requestFiles = {} for taskDict in res['Value']: taskID = taskDict['TaskID'] externalID = taskDict['ExternalID'] # Only consider tasks that are submitted, ExternalID is a string if taskDict['ExternalStatus'] != 'Created' and externalID and int(externalID): requestFiles[externalID] = taskFiles[taskID] updateDict = {} for requestID, lfnList in requestFiles.iteritems(): statusDict = self.requestClient.getRequestFileStatus(requestID, lfnList) if not statusDict['OK']: log = self._logVerbose if 'not exist' in statusDict['Message'] else self._logWarn log("Failed to get files status for request", statusDict['Message'], transID=transID, method='getSubmittedFileStatus') else: for lfn, newStatus in statusDict['Value'].iteritems(): if newStatus == 'Done': updateDict[lfn] = 'Processed' elif newStatus == 'Failed': updateDict[lfn] = 'Problematic' return S_OK(updateDict)
from DIRAC.Core.Base import Script Script.setUsageMessage('\n'.join( [__doc__, 'Usage:', ' %s [option|cfgfile]' % Script.scriptName])) if __name__ == "__main__": from DIRAC.Core.Base.Script import parseCommandLine parseCommandLine() import DIRAC from DIRAC.RequestManagementSystem.Client.ReqClient import ReqClient reqClient = ReqClient() dbSummary = reqClient.getDBSummary() if not dbSummary["OK"]: DIRAC.gLogger.error(dbSummary["Message"]) DIRAC.exit(-1) dbSummary = dbSummary["Value"] if not dbSummary: DIRAC.gLogger.info("ReqDB is empty!") DIRAC.exit(0) reqs = dbSummary.get("Request", {}) ops = dbSummary.get("Operation", {}) fs = dbSummary.get("File", {})
def __init__(self, *args, **kwargs): AgentModule.__init__(self, *args, **kwargs) self.name = 'DataRecoveryAgent' self.enabled = False self.getJobInfoFromJDLOnly = False self.__getCSOptions() self.jobStatus = [ 'Failed', 'Done' ] # This needs to be both otherwise we cannot account for all cases self.jobMon = JobMonitoringClient() self.fcClient = FileCatalogClient() self.tClient = TransformationClient() self.reqClient = ReqClient() self.diracAPI = Dirac() self.inputFilesProcessed = set() self.todo = {'NoInputFiles': [dict(Message="NoInputFiles: OutputExists: Job 'Done'", ShortMessage="NoInputFiles: job 'Done' ", Counter=0, Check=lambda job: job.allFilesExist() and job.status == 'Failed', Actions=lambda job, tInfo: [job.setJobDone(tInfo)], ), dict(Message="NoInputFiles: OutputMissing: Job 'Failed'", ShortMessage="NoInputFiles: job 'Failed' ", Counter=0, Check=lambda job: job.allFilesMissing() and job.status == 'Done', Actions=lambda job, tInfo: [job.setJobFailed(tInfo)], ), ], 'InputFiles': [ \ # must always be first! dict(Message="One of many Successful: clean others", ShortMessage="Other Tasks --> Keep", Counter=0, Check=lambda job: job.allFilesExist() and job.otherTasks and \ not set(job.inputFiles).issubset(self.inputFilesProcessed), Actions=lambda job, tInfo: [self.inputFilesProcessed.update(job.inputFiles), job.setJobDone(tInfo), job.setInputProcessed(tInfo)] ), dict(Message="Other Task processed Input, no Output: Fail", ShortMessage="Other Tasks --> Fail", Counter=0, Check=lambda job: set(job.inputFiles).issubset(self.inputFilesProcessed) and \ job.allFilesMissing() and job.status != 'Failed', Actions=lambda job, tInfo: [job.setJobFailed(tInfo)] ), dict(Message="Other Task processed Input: Fail and clean", ShortMessage="Other Tasks --> Cleanup", Counter=0, Check=lambda job: set(job.inputFiles).issubset( self.inputFilesProcessed) and not job.allFilesMissing(), Actions=lambda job, tInfo: [job.setJobFailed(tInfo), job.cleanOutputs(tInfo)] ), dict(Message="InputFile(s) missing: mark job 'Failed', mark input 'Deleted', clean", ShortMessage="Input Missing --> Job 'Failed, Input 'Deleted', Cleanup", Counter=0, Check=lambda job: job.inputFiles and job.allInputFilesMissing() and \ not job.allTransFilesDeleted(), Actions=lambda job, tInfo: [job.cleanOutputs(tInfo), job.setJobFailed(tInfo), job.setInputDeleted(tInfo)], ), dict(Message="InputFile(s) Deleted, output Exists: mark job 'Failed', clean", ShortMessage="Input Deleted --> Job 'Failed, Cleanup", Counter=0, Check=lambda job: job.inputFiles and job.allInputFilesMissing() and \ job.allTransFilesDeleted() and not job.allFilesMissing(), Actions=lambda job, tInfo: [job.cleanOutputs(tInfo), job.setJobFailed(tInfo)], ), # All Output Exists dict(Message="Output Exists, job Failed, input not Processed --> Job Done, Input Processed", ShortMessage="Output Exists --> Job Done, Input Processed", Counter=0, Check=lambda job: job.allFilesExist() and \ not job.otherTasks and \ job.status == 'Failed' and \ not job.allFilesProcessed() and \ job.allInputFilesExist(), Actions=lambda job, tInfo: [job.setJobDone(tInfo), job.setInputProcessed(tInfo)] ), dict(Message="Output Exists, job Failed, input Processed --> Job Done", ShortMessage="Output Exists --> Job Done", Counter=0, Check=lambda job: job.allFilesExist() and \ not job.otherTasks and \ job.status == 'Failed' and \ job.allFilesProcessed() and \ job.allInputFilesExist(), Actions=lambda job, tInfo: [job.setJobDone(tInfo)] ), dict(Message="Output Exists, job Done, input not Processed --> Input Processed", ShortMessage="Output Exists --> Input Processed", Counter=0, Check=lambda job: job.allFilesExist() and \ not job.otherTasks and \ job.status == 'Done' and \ not job.allFilesProcessed() and \ job.allInputFilesExist(), Actions=lambda job, tInfo: [job.setInputProcessed(tInfo)] ), # outputmissing dict(Message="Output Missing, job Failed, input Assigned, MaxError --> Input MaxReset", ShortMessage="Max ErrorCount --> Input MaxReset", Counter=0, Check=lambda job: job.allFilesMissing() and \ not job.otherTasks and \ job.status == 'Failed' and \ job.allFilesAssigned() and \ not set(job.inputFiles).issubset(self.inputFilesProcessed) and \ job.allInputFilesExist() and \ job.checkErrorCount(), Actions=lambda job, tInfo: [job.setInputMaxReset(tInfo)] ), dict(Message="Output Missing, job Failed, input Assigned --> Input Unused", ShortMessage="Output Missing --> Input Unused", Counter=0, Check=lambda job: job.allFilesMissing() and \ not job.otherTasks and \ job.status == 'Failed' and \ job.allFilesAssigned() and \ not set(job.inputFiles).issubset(self.inputFilesProcessed) and \ job.allInputFilesExist(), Actions=lambda job, tInfo: [job.setInputUnused(tInfo)] ), dict(Message="Output Missing, job Done, input Assigned --> Job Failed, Input Unused", ShortMessage="Output Missing --> Job Failed, Input Unused", Counter=0, Check=lambda job: job.allFilesMissing() and \ not job.otherTasks and \ job.status == 'Done' and \ job.allFilesAssigned() and \ not set(job.inputFiles).issubset(self.inputFilesProcessed) and \ job.allInputFilesExist(), Actions=lambda job, tInfo: [job.setInputUnused(tInfo), job.setJobFailed(tInfo)] ), # some files missing, needing cleanup. Only checking for # assigned, because processed could mean an earlier job was # succesful and this one is just the duplicate that needed # to be removed! But we check for other tasks earlier, so # this should not happen dict(Message="Some missing, job Failed, input Assigned --> cleanup, Input 'Unused'", ShortMessage="Output Missing --> Cleanup, Input Unused", Counter=0, Check=lambda job: job.someFilesMissing() and \ not job.otherTasks and \ job.status == 'Failed' and \ job.allFilesAssigned() and \ job.allInputFilesExist(), Actions=lambda job, tInfo: [job.cleanOutputs(tInfo), job.setInputUnused(tInfo)] ), dict(Message="Some missing, job Done, input Assigned --> cleanup, job Failed, Input 'Unused'", ShortMessage="Output Missing --> Cleanup, Job Failed, Input Unused", Counter=0, Check=lambda job: job.someFilesMissing() and \ not job.otherTasks and \ job.status == 'Done' and \ job.allFilesAssigned() and \ job.allInputFilesExist(), Actions=lambda job, tInfo: [ job.cleanOutputs(tInfo), job.setInputUnused(tInfo), job.setJobFailed(tInfo)] ), dict(Message="Some missing, job Done --> job Failed", ShortMessage="Output Missing, Done --> Job Failed", Counter=0, Check=lambda job: not job.allFilesExist() and job.status == 'Done', Actions=lambda job, tInfo: [job.setJobFailed(tInfo)] ), dict(Message="Something Strange", ShortMessage="Strange", Counter=0, Check=lambda job: job.status not in ("Failed", "Done"), Actions=lambda job, tInfo: [] ), # should always be the last one! dict(Message="Failed Hard", ShortMessage="Failed Hard", Counter=0, Check=lambda job: False, # never Actions=lambda job, tInfo: [] ), ] } self.jobCache = defaultdict(lambda: (0, 0)) # Notification options self.notesToSend = "" self.subject = "DataRecoveryAgent" self.startTime = time.time()
gLogger.info( "Will create request '%s' with 'ReplicateAndRegister' "\ "operation using %s lfns and %s target SEs" % ( requestName, len( lfnList ), len( targetSEs ) ) ) from DIRAC.RequestManagementSystem.Client.Request import Request from DIRAC.RequestManagementSystem.Client.Operation import Operation from DIRAC.RequestManagementSystem.Client.File import File from DIRAC.RequestManagementSystem.Client.ReqClient import ReqClient from DIRAC.Resources.Catalog.FileCatalog import FileCatalog from DIRAC.Core.Utilities.List import breakListIntoChunks lfnChunks = breakListIntoChunks( lfnList, 100 ) multiRequests = len( lfnChunks ) > 1 error = 0 count = 0 reqClient = ReqClient() fc = FileCatalog() requestIDs = [] for lfnChunk in lfnChunks: metaDatas = fc.getFileMetadata( lfnChunk ) if not metaDatas["OK"]: gLogger.error( "unable to read metadata for lfns: %s" % metaDatas["Message"] ) error = -1 continue metaDatas = metaDatas["Value"] for failedLFN, reason in metaDatas["Failed"].items(): gLogger.error( "skipping %s: %s" % ( failedLFN, reason ) ) lfnChunk = set( metaDatas["Successful"] ) if not lfnChunk: gLogger.error( "LFN list is empty!!!" )
class TransformationCleaningAgent(AgentModule): """ .. class:: TransformationCleaningAgent :param ~DIRAC.DataManagementSystem.Client.DataManager.DataManager dm: DataManager instance :param ~TransformationClient.TransformationClient transClient: TransformationClient instance :param ~FileCatalogClient.FileCatalogClient metadataClient: FileCatalogClient instance """ def __init__(self, *args, **kwargs): """ c'tor """ AgentModule.__init__(self, *args, **kwargs) self.shifterProxy = None # # transformation client self.transClient = None # # wms client self.wmsClient = None # # request client self.reqClient = None # # file catalog client self.metadataClient = None # # transformations types self.transformationTypes = None # # directory locations self.directoryLocations = ['TransformationDB', 'MetadataCatalog'] # # transformation metadata self.transfidmeta = 'TransformationID' # # archive periof in days self.archiveAfter = 7 # # transformation log SEs self.logSE = 'LogSE' # # enable/disable execution self.enableFlag = 'True' self.dataProcTTypes = ['MCSimulation', 'Merge'] self.dataManipTTypes = ['Replication', 'Removal'] def initialize(self): """ agent initialisation reading and setting confing opts :param self: self reference """ # # shifter proxy # See cleanContent method: this proxy will be used ALSO when the file catalog used # is the DIRAC File Catalog (DFC). # This is possible because of unset of the "UseServerCertificate" option self.shifterProxy = self.am_getOption('shifterProxy', self.shifterProxy) # # transformations types self.dataProcTTypes = Operations().getValue('Transformations/DataProcessing', self.dataProcTTypes) self.dataManipTTypes = Operations().getValue('Transformations/DataManipulation', self.dataManipTTypes) agentTSTypes = self.am_getOption('TransformationTypes', []) if agentTSTypes: self.transformationTypes = sorted(agentTSTypes) else: self.transformationTypes = sorted(self.dataProcTTypes + self.dataManipTTypes) self.log.info("Will consider the following transformation types: %s" % str(self.transformationTypes)) # # directory locations self.directoryLocations = sorted(self.am_getOption('DirectoryLocations', self.directoryLocations)) self.log.info("Will search for directories in the following locations: %s" % str(self.directoryLocations)) # # transformation metadata self.transfidmeta = self.am_getOption('TransfIDMeta', self.transfidmeta) self.log.info("Will use %s as metadata tag name for TransformationID" % self.transfidmeta) # # archive periof in days self.archiveAfter = self.am_getOption('ArchiveAfter', self.archiveAfter) # days self.log.info("Will archive Completed transformations after %d days" % self.archiveAfter) # # transformation log SEs self.logSE = Operations().getValue('/LogStorage/LogSE', self.logSE) self.log.info("Will remove logs found on storage element: %s" % self.logSE) # # transformation client self.transClient = TransformationClient() # # wms client self.wmsClient = WMSClient() # # request client self.reqClient = ReqClient() # # file catalog client self.metadataClient = FileCatalogClient() return S_OK() ############################################################################# def execute(self): """ execution in one agent's cycle :param self: self reference """ self.enableFlag = self.am_getOption('EnableFlag', self.enableFlag) if self.enableFlag != 'True': self.log.info('TransformationCleaningAgent is disabled by configuration option EnableFlag') return S_OK('Disabled via CS flag') # Obtain the transformations in Cleaning status and remove any mention of the jobs/files res = self.transClient.getTransformations({'Status': 'Cleaning', 'Type': self.transformationTypes}) if res['OK']: for transDict in res['Value']: if self.shifterProxy: self._executeClean(transDict) else: self.log.info("Cleaning transformation %(TransformationID)s with %(AuthorDN)s, %(AuthorGroup)s" % transDict) executeWithUserProxy(self._executeClean)(transDict, proxyUserDN=transDict['AuthorDN'], proxyUserGroup=transDict['AuthorGroup']) else: self.log.error("Failed to get transformations", res['Message']) # Obtain the transformations in RemovingFiles status and removes the output files res = self.transClient.getTransformations({'Status': 'RemovingFiles', 'Type': self.transformationTypes}) if res['OK']: for transDict in res['Value']: if self.shifterProxy: self._executeRemoval(transDict) else: self.log.info("Removing files for transformation %(TransformationID)s with %(AuthorDN)s, %(AuthorGroup)s" % transDict) executeWithUserProxy(self._executeRemoval)(transDict, proxyUserDN=transDict['AuthorDN'], proxyUserGroup=transDict['AuthorGroup']) else: self.log.error("Could not get the transformations", res['Message']) # Obtain the transformations in Completed status and archive if inactive for X days olderThanTime = datetime.utcnow() - timedelta(days=self.archiveAfter) res = self.transClient.getTransformations({'Status': 'Completed', 'Type': self.transformationTypes}, older=olderThanTime, timeStamp='LastUpdate') if res['OK']: for transDict in res['Value']: if self.shifterProxy: self._executeArchive(transDict) else: self.log.info("Archiving files for transformation %(TransformationID)s with %(AuthorDN)s, %(AuthorGroup)s" % transDict) executeWithUserProxy(self._executeArchive)(transDict, proxyUserDN=transDict['AuthorDN'], proxyUserGroup=transDict['AuthorGroup']) else: self.log.error("Could not get the transformations", res['Message']) return S_OK() def _executeClean(self, transDict): """Clean transformation.""" # if transformation is of type `Replication` or `Removal`, there is nothing to clean. # We just archive if transDict['Type'] in self.dataManipTTypes: res = self.archiveTransformation(transDict['TransformationID']) if not res['OK']: self.log.error("Problems archiving transformation %s: %s" % (transDict['TransformationID'], res['Message'])) else: res = self.cleanTransformation(transDict['TransformationID']) if not res['OK']: self.log.error("Problems cleaning transformation %s: %s" % (transDict['TransformationID'], res['Message'])) def _executeRemoval(self, transDict): """Remove files from given transformation.""" res = self.removeTransformationOutput(transDict['TransformationID']) if not res['OK']: self.log.error("Problems removing transformation %s: %s" % (transDict['TransformationID'], res['Message'])) def _executeArchive(self, transDict): """Archive the given transformation.""" res = self.archiveTransformation(transDict['TransformationID']) if not res['OK']: self.log.error("Problems archiving transformation %s: %s" % (transDict['TransformationID'], res['Message'])) return S_OK() ############################################################################# # # Get the transformation directories for checking # def getTransformationDirectories(self, transID): """ get the directories for the supplied transformation from the transformation system. These directories are used by removeTransformationOutput and cleanTransformation for removing output. :param self: self reference :param int transID: transformation ID """ self.log.verbose("Cleaning Transformation directories of transformation %d" % transID) directories = [] if 'TransformationDB' in self.directoryLocations: res = self.transClient.getTransformationParameters(transID, ['OutputDirectories']) if not res['OK']: self.log.error("Failed to obtain transformation directories", res['Message']) return res transDirectories = [] if res['Value']: if not isinstance(res['Value'], list): try: transDirectories = ast.literal_eval(res['Value']) except BaseException: # It can happen if the res['Value'] is '/a/b/c' instead of '["/a/b/c"]' transDirectories.append(res['Value']) else: transDirectories = res['Value'] directories = self._addDirs(transID, transDirectories, directories) if 'MetadataCatalog' in self.directoryLocations: res = self.metadataClient.findDirectoriesByMetadata({self.transfidmeta: transID}) if not res['OK']: self.log.error("Failed to obtain metadata catalog directories", res['Message']) return res transDirectories = res['Value'] directories = self._addDirs(transID, transDirectories, directories) if not directories: self.log.info("No output directories found") directories = sorted(directories) return S_OK(directories) @classmethod def _addDirs(cls, transID, newDirs, existingDirs): """ append unique :newDirs: list to :existingDirs: list :param self: self reference :param int transID: transformationID :param list newDirs: src list of paths :param list existingDirs: dest list of paths """ for folder in newDirs: transStr = str(transID).zfill(8) if re.search(transStr, str(folder)): if folder not in existingDirs: existingDirs.append(os.path.normpath(folder)) return existingDirs ############################################################################# # # These are the methods for performing the cleaning of catalogs and storage # def cleanContent(self, directory): """ wipe out everything from catalog under folder :directory: :param self: self reference :params str directory: folder name """ self.log.verbose("Cleaning Catalog contents") res = self.__getCatalogDirectoryContents([directory]) if not res['OK']: return res filesFound = res['Value'] if not filesFound: self.log.info("No files are registered in the catalog directory %s" % directory) return S_OK() self.log.info("Attempting to remove %d possible remnants from the catalog and storage" % len(filesFound)) # Executing with shifter proxy gConfigurationData.setOptionInCFG('/DIRAC/Security/UseServerCertificate', 'false') res = DataManager().removeFile(filesFound, force=True) gConfigurationData.setOptionInCFG('/DIRAC/Security/UseServerCertificate', 'true') if not res['OK']: return res realFailure = False for lfn, reason in res['Value']['Failed'].items(): if "File does not exist" in str(reason): self.log.warn("File %s not found in some catalog: " % (lfn)) else: self.log.error("Failed to remove file found in the catalog", "%s %s" % (lfn, reason)) realFailure = True if realFailure: return S_ERROR("Failed to remove all files found in the catalog") return S_OK() def __getCatalogDirectoryContents(self, directories): """ get catalog contents under paths :directories: :param self: self reference :param list directories: list of paths in catalog """ self.log.info('Obtaining the catalog contents for %d directories:' % len(directories)) for directory in directories: self.log.info(directory) activeDirs = directories allFiles = {} fc = FileCatalog() while activeDirs: currentDir = activeDirs[0] res = returnSingleResult(fc.listDirectory(currentDir)) activeDirs.remove(currentDir) if not res['OK'] and 'Directory does not exist' in res['Message']: # FIXME: DFC should return errno self.log.info("The supplied directory %s does not exist" % currentDir) elif not res['OK']: if "No such file or directory" in res['Message']: self.log.info("%s: %s" % (currentDir, res['Message'])) else: self.log.error("Failed to get directory %s content: %s" % (currentDir, res['Message'])) else: dirContents = res['Value'] activeDirs.extend(dirContents['SubDirs']) allFiles.update(dirContents['Files']) self.log.info("Found %d files" % len(allFiles)) return S_OK(allFiles.keys()) def cleanTransformationLogFiles(self, directory): """ clean up transformation logs from directory :directory: :param self: self reference :param str directory: folder name """ self.log.verbose("Removing log files found in the directory %s" % directory) res = returnSingleResult(StorageElement(self.logSE).removeDirectory(directory, recursive=True)) if not res['OK']: if cmpError(res, errno.ENOENT): # No such file or directory self.log.warn("Transformation log directory does not exist", directory) return S_OK() self.log.error("Failed to remove log files", res['Message']) return res self.log.info("Successfully removed transformation log directory") return S_OK() ############################################################################# # # These are the functional methods for archiving and cleaning transformations # def removeTransformationOutput(self, transID): """ This just removes any mention of the output data from the catalog and storage """ self.log.info("Removing output data for transformation %s" % transID) res = self.getTransformationDirectories(transID) if not res['OK']: self.log.error('Problem obtaining directories for transformation %s with result "%s"' % (transID, res)) return S_OK() directories = res['Value'] for directory in directories: if not re.search('/LOG/', directory): res = self.cleanContent(directory) if not res['OK']: return res self.log.info("Removed %d directories from the catalog \ and its files from the storage for transformation %s" % (len(directories), transID)) # Clean ALL the possible remnants found in the metadata catalog res = self.cleanMetadataCatalogFiles(transID) if not res['OK']: return res self.log.info("Successfully removed output of transformation %d" % transID) # Change the status of the transformation to RemovedFiles res = self.transClient.setTransformationParameter(transID, 'Status', 'RemovedFiles') if not res['OK']: self.log.error("Failed to update status of transformation %s to RemovedFiles" % (transID), res['Message']) return res self.log.info("Updated status of transformation %s to RemovedFiles" % (transID)) return S_OK() def archiveTransformation(self, transID): """ This just removes job from the jobDB and the transformation DB :param self: self reference :param int transID: transformation ID """ self.log.info("Archiving transformation %s" % transID) # Clean the jobs in the WMS and any failover requests found res = self.cleanTransformationTasks(transID) if not res['OK']: return res # Clean the transformation DB of the files and job information res = self.transClient.cleanTransformation(transID) if not res['OK']: return res self.log.info("Successfully archived transformation %d" % transID) # Change the status of the transformation to archived res = self.transClient.setTransformationParameter(transID, 'Status', 'Archived') if not res['OK']: self.log.error("Failed to update status of transformation %s to Archived" % (transID), res['Message']) return res self.log.info("Updated status of transformation %s to Archived" % (transID)) return S_OK() def cleanTransformation(self, transID): """ This removes what was produced by the supplied transformation, leaving only some info and log in the transformation DB. """ self.log.info("Cleaning transformation %s" % transID) res = self.getTransformationDirectories(transID) if not res['OK']: self.log.error('Problem obtaining directories for transformation %s with result "%s"' % (transID, res)) return S_OK() directories = res['Value'] # Clean the jobs in the WMS and any failover requests found res = self.cleanTransformationTasks(transID) if not res['OK']: return res # Clean the log files for the jobs for directory in directories: if re.search('/LOG/', directory): res = self.cleanTransformationLogFiles(directory) if not res['OK']: return res res = self.cleanContent(directory) if not res['OK']: return res # Clean ALL the possible remnants found res = self.cleanMetadataCatalogFiles(transID) if not res['OK']: return res # Clean the transformation DB of the files and job information res = self.transClient.cleanTransformation(transID) if not res['OK']: return res self.log.info("Successfully cleaned transformation %d" % transID) res = self.transClient.setTransformationParameter(transID, 'Status', 'Cleaned') if not res['OK']: self.log.error("Failed to update status of transformation %s to Cleaned" % (transID), res['Message']) return res self.log.info("Updated status of transformation %s to Cleaned" % (transID)) return S_OK() def cleanMetadataCatalogFiles(self, transID): """ wipe out files from catalog """ res = self.metadataClient.findFilesByMetadata({self.transfidmeta: transID}) if not res['OK']: return res fileToRemove = res['Value'] if not fileToRemove: self.log.info('No files found for transID %s' % transID) return S_OK() # Executing with shifter proxy gConfigurationData.setOptionInCFG('/DIRAC/Security/UseServerCertificate', 'false') res = DataManager().removeFile(fileToRemove, force=True) gConfigurationData.setOptionInCFG('/DIRAC/Security/UseServerCertificate', 'true') if not res['OK']: return res for lfn, reason in res['Value']['Failed'].items(): self.log.error("Failed to remove file found in metadata catalog", "%s %s" % (lfn, reason)) if res['Value']['Failed']: return S_ERROR("Failed to remove all files found in the metadata catalog") self.log.info("Successfully removed all files found in the BK") return S_OK() ############################################################################# # # These are the methods for removing the jobs from the WMS and transformation DB # def cleanTransformationTasks(self, transID): """ clean tasks from WMS, or from the RMS if it is a DataManipulation transformation """ self.log.verbose("Cleaning Transformation tasks of transformation %d" % transID) res = self.__getTransformationExternalIDs(transID) if not res['OK']: return res externalIDs = res['Value'] if externalIDs: res = self.transClient.getTransformationParameters(transID, ['Type']) if not res['OK']: self.log.error("Failed to determine transformation type") return res transType = res['Value'] if transType in self.dataProcTTypes: res = self.__removeWMSTasks(externalIDs) else: res = self.__removeRequests(externalIDs) if not res['OK']: return res return S_OK() def __getTransformationExternalIDs(self, transID): """ collect all ExternalIDs for transformation :transID: :param self: self reference :param int transID: transforamtion ID """ res = self.transClient.getTransformationTasks(condDict={'TransformationID': transID}) if not res['OK']: self.log.error("Failed to get externalIDs for transformation %d" % transID, res['Message']) return res externalIDs = [taskDict['ExternalID'] for taskDict in res["Value"]] self.log.info("Found %d tasks for transformation" % len(externalIDs)) return S_OK(externalIDs) def __removeRequests(self, requestIDs): """ This will remove requests from the RMS system - """ rIDs = [int(long(j)) for j in requestIDs if long(j)] for reqID in rIDs: self.reqClient.cancelRequest(reqID) return S_OK() def __removeWMSTasks(self, transJobIDs): """ wipe out jobs and their requests from the system :param self: self reference :param list trasnJobIDs: job IDs """ # Prevent 0 job IDs jobIDs = [int(j) for j in transJobIDs if int(j)] allRemove = True for jobList in breakListIntoChunks(jobIDs, 500): res = self.wmsClient.killJob(jobList) if res['OK']: self.log.info("Successfully killed %d jobs from WMS" % len(jobList)) elif ("InvalidJobIDs" in res) and ("NonauthorizedJobIDs" not in res) and ("FailedJobIDs" not in res): self.log.info("Found %s jobs which did not exist in the WMS" % len(res['InvalidJobIDs'])) elif "NonauthorizedJobIDs" in res: self.log.error("Failed to kill %s jobs because not authorized" % len(res['NonauthorizedJobIDs'])) allRemove = False elif "FailedJobIDs" in res: self.log.error("Failed to kill %s jobs" % len(res['FailedJobIDs'])) allRemove = False res = self.wmsClient.deleteJob(jobList) if res['OK']: self.log.info("Successfully removed %d jobs from WMS" % len(jobList)) elif ("InvalidJobIDs" in res) and ("NonauthorizedJobIDs" not in res) and ("FailedJobIDs" not in res): self.log.info("Found %s jobs which did not exist in the WMS" % len(res['InvalidJobIDs'])) elif "NonauthorizedJobIDs" in res: self.log.error("Failed to remove %s jobs because not authorized" % len(res['NonauthorizedJobIDs'])) allRemove = False elif "FailedJobIDs" in res: self.log.error("Failed to remove %s jobs" % len(res['FailedJobIDs'])) allRemove = False if not allRemove: return S_ERROR("Failed to remove all remnants from WMS") self.log.info("Successfully removed all tasks from the WMS") if not jobIDs: self.log.info("JobIDs not present, unable to remove asociated requests.") return S_OK() failed = 0 failoverRequests = {} res = self.reqClient.getRequestIDsForJobs(jobIDs) if not res['OK']: self.log.error("Failed to get requestID for jobs.", res['Message']) return res failoverRequests.update(res['Value']['Successful']) if not failoverRequests: return S_OK() for jobID, requestID in res['Value']['Successful'].items(): # Put this check just in case, tasks must have associated jobs if jobID == 0 or jobID == '0': continue res = self.reqClient.cancelRequest(requestID) if not res['OK']: self.log.error("Failed to remove request from RequestDB", res['Message']) failed += 1 else: self.log.verbose("Removed request %s associated to job %d." % (requestID, jobID)) if failed: self.log.info("Successfully removed %s requests" % (len(failoverRequests) - failed)) self.log.info("Failed to remove %s requests" % failed) return S_ERROR("Failed to remove all the request from RequestDB") self.log.info("Successfully removed all the associated failover requests") return S_OK()
since = convertDate( switch[1] ) elif switch[0] == 'Until': until = convertDate( switch[1] ) if reset: status = 'Failed' if terse: verbose = True if status: if not until: until = datetime.datetime.utcnow() if not since: since = until - datetime.timedelta( hours = 24 ) from DIRAC.RequestManagementSystem.Client.ReqClient import ReqClient from DIRAC.RequestManagementSystem.Client.ReqClient import printRequest, recoverableRequest reqClient = ReqClient() if transID: if not taskIDs: gLogger.fatal( "If Transformation is set, a list of Tasks should also be set" ) Script.showHelp() DIRAC.exit( 2 ) requests = ['%08d_%08d' % ( transID, task ) for task in taskIDs] elif not jobs: args = Script.getPositionalArgs() if len( args ) == 1: all = True requests = [reqName for reqName in args[0].split( ',' ) if reqName] else: res = reqClient.getRequestNamesForJobs( jobs ) if not res['OK']:
def main(): from DIRAC.Core.Base import Script Script.registerSwitch('', 'Job=', ' JobID[,jobID2,...]') Script.registerSwitch('', 'Transformation=', ' transformation ID') Script.registerSwitch( '', 'Tasks=', ' Associated to --Transformation, list of taskIDs') Script.registerSwitch('', 'Verbose', ' Print more information') Script.registerSwitch('', 'Terse', ' Only print request status') Script.registerSwitch('', 'Full', ' Print full request content') Script.registerSwitch('', 'Status=', ' Select all requests in a given status') Script.registerSwitch( '', 'Since=', ' Associated to --Status, start date yyyy-mm-dd or nb of days (default= -one day' ) Script.registerSwitch( '', 'Until=', ' Associated to --Status, end date (default= now') Script.registerSwitch( '', 'Maximum=', ' Associated to --Status, max number of requests ') Script.registerSwitch('', 'Reset', ' Reset Failed files to Waiting if any') Script.registerSwitch('', 'Force', ' Force reset even if not Failed') Script.registerSwitch( '', 'All', ' (if --Status Failed) all requests, otherwise exclude irrecoverable failures' ) Script.registerSwitch('', 'FixJob', ' Set job Done if the request is Done') Script.registerSwitch('', 'Cancel', ' Cancel the request') Script.registerSwitch('', 'ListJobs', ' List the corresponding jobs') Script.registerSwitch( '', 'TargetSE=', ' Select request only if that SE is in the targetSEs') from DIRAC.Core.Base.Script import parseCommandLine parseCommandLine() import DIRAC from DIRAC import gLogger jobs = [] requestID = 0 transID = None taskIDs = None tasks = None requests = [] full = False verbose = False status = None until = None since = None terse = False allR = False reset = False fixJob = False maxRequests = 999999999999 cancel = False listJobs = False force = False targetSE = set() for switch in Script.getUnprocessedSwitches(): if switch[0] == 'Job': jobs = [] job = "Unknown" try: for arg in switch[1].split(','): if os.path.exists(arg): with open(arg, 'r') as fp: lines = fp.readlines() for line in lines: for job in line.split(','): jobs += [int(job.strip())] gLogger.notice("Found %d jobs in file %s" % (len(jobs), arg)) else: jobs.append(int(arg)) except TypeError: gLogger.fatal("Invalid jobID", job) elif switch[0] == 'Transformation': try: transID = int(switch[1]) except Exception: gLogger.fatal('Invalid transID', switch[1]) elif switch[0] == 'Tasks': try: taskIDs = [int(task) for task in switch[1].split(',')] except Exception: gLogger.fatal('Invalid tasks', switch[1]) elif switch[0] == 'Full': full = True elif switch[0] == 'Verbose': verbose = True elif switch[0] == 'Terse': terse = True elif switch[0] == 'All': allR = True elif switch[0] == 'Reset': reset = True elif switch[0] == 'Force': force = True elif switch[0] == 'Status': status = switch[1].capitalize() elif switch[0] == 'Since': since = convertDate(switch[1]) elif switch[0] == 'Until': until = convertDate(switch[1]) elif switch[0] == 'FixJob': fixJob = True elif switch[0] == 'Cancel': cancel = True elif switch[0] == 'ListJobs': listJobs = True elif switch[0] == 'Maximum': try: maxRequests = int(switch[1]) except Exception: pass elif switch[0] == 'TargetSE': targetSE = set(switch[1].split(',')) if reset and not force: status = 'Failed' if fixJob: status = 'Done' if terse: verbose = True if status: if not until: until = datetime.datetime.utcnow() if not since: since = until - datetime.timedelta(hours=24) from DIRAC.RequestManagementSystem.Client.ReqClient import ReqClient from DIRAC.RequestManagementSystem.Client.ReqClient import printRequest, recoverableRequest reqClient = ReqClient() if transID: if not taskIDs: gLogger.fatal( "If Transformation is set, a list of Tasks should also be set") Script.showHelp(exitCode=2) # In principle, the task name is unique, so the request name should be unique as well # If ever this would not work anymore, we would need to use the transformationClient # to fetch the ExternalID requests = ['%08d_%08d' % (transID, task) for task in taskIDs] allR = True elif not jobs: requests = [] # Get full list of arguments, with and without comma for arg in [ x.strip() for arg in Script.getPositionalArgs() for x in arg.split(',') ]: if os.path.exists(arg): lines = open(arg, 'r').readlines() requests += [ reqID.strip() for line in lines for reqID in line.split(',') ] gLogger.notice("Found %d requests in file" % len(requests)) else: requests.append(arg) allR = True else: res = reqClient.getRequestIDsForJobs(jobs) if not res['OK']: gLogger.fatal("Error getting request for jobs", res['Message']) DIRAC.exit(2) if res['Value']['Failed']: gLogger.error( "No request found for jobs %s" % ','.join(sorted(str(job) for job in res['Value']['Failed']))) requests = sorted(res['Value']['Successful'].values()) if requests: allR = True else: DIRAC.exit(0) if status and not requests: allR = allR or status != 'Failed' res = reqClient.getRequestIDsList([status], limit=maxRequests, since=since, until=until) if not res['OK']: gLogger.error("Error getting requests:", res['Message']) DIRAC.exit(2) requests = [ reqID for reqID, _st, updTime in res['Value'] if updTime > since and updTime <= until and reqID ] gLogger.notice('Obtained %d requests %s between %s and %s' % (len(requests), status, since, until)) if not requests: gLogger.notice('No request selected....') Script.showHelp(exitCode=2) okRequests = [] warningPrinted = False jobIDList = [] for reqID in requests: # We allow reqID to be the requestName if it is unique try: requestID = int(reqID) except ValueError: requestID = reqClient.getRequestIDForName(reqID) if not requestID['OK']: gLogger.notice(requestID['Message']) continue requestID = requestID['Value'] request = reqClient.peekRequest(requestID) if not request["OK"]: gLogger.error(request["Message"]) DIRAC.exit(-1) request = request["Value"] if not request: gLogger.error("no such request %s" % requestID) continue # If no operation as the targetSE, skip if targetSE: found = False for op in request: if op.TargetSE and targetSE.intersection( op.TargetSE.split(',')): found = True break if not found: continue # keep a list of jobIDs if requested if request.JobID and listJobs: jobIDList.append(request.JobID) if status and request.Status != status: gLogger.notice( "Request %s is not in requested status %s%s" % (reqID, status, ' (cannot be reset)' if reset else '')) continue if fixJob and request.Status == 'Done' and request.JobID: # The request is for a job and is Done, verify that the job is in the proper status result = reqClient.finalizeRequest(request.RequestID, request.JobID, useCertificates=False) if not result['OK']: gLogger.error("Error finalizing job", result['Message']) else: gLogger.notice("Job %d updated to %s" % (request.JobID, result['Value'])) continue if cancel: if request.Status not in ('Done', 'Failed'): ret = reqClient.cancelRequest(requestID) if not ret['OK']: gLogger.error("Error canceling request %s" % reqID, ret['Message']) else: gLogger.notice("Request %s cancelled" % reqID) else: gLogger.notice("Request %s is in status %s, not cancelled" % (reqID, request.Status)) elif allR or recoverableRequest(request): okRequests.append(str(requestID)) if reset: gLogger.notice('============ Request %s =============' % requestID) ret = reqClient.resetFailedRequest(requestID, allR=allR) if not ret['OK']: gLogger.error("Error resetting request %s" % requestID, ret['Message']) else: if len(requests) > 1: gLogger.notice('\n===================================') dbStatus = reqClient.getRequestStatus(requestID).get( 'Value', 'Unknown') printRequest(request, status=dbStatus, full=full, verbose=verbose, terse=terse) if listJobs: gLogger.notice("List of %d jobs:\n" % len(jobIDList), ','.join(str(jobID) for jobID in jobIDList)) if status and okRequests: from DIRAC.Core.Utilities.List import breakListIntoChunks gLogger.notice('\nList of %d selected requests:' % len(okRequests)) for reqs in breakListIntoChunks(okRequests, 100): gLogger.notice(','.join(reqs))
class FTS3Operation(FTS3Serializable): """ Abstract class to represent an operation to be executed by FTS. It is a container for FTSFiles, as well as for FTSJobs. There can be a mapping between one FTS3Operation and one RMS Operation. The FTS3Operation takes care of generating the appropriate FTSJobs, and to perform a callback when the work with FTS is over. The actual generation and callback depends on the subclass. This class should not be instantiated directly, but rather one of its subclass """ ALL_STATES = ['Active', # Default state until FTS has done everything 'Processed', # Interactions with FTS done, but callback not done 'Finished', # Everything was done 'Canceled', # Canceled by the user 'Failed', # I don't know yet ] FINAL_STATES = ['Finished', 'Canceled', 'Failed'] INIT_STATE = 'Active' _attrToSerialize = ['operationID', 'username', 'userGroup', 'rmsReqID', 'rmsOpID', 'sourceSEs', 'ftsFiles', 'activity', 'priority', 'ftsJobs', 'creationTime', 'lastUpdate', 'error', 'status'] def __init__(self, ftsFiles=None, username=None, userGroup=None, rmsReqID=-1, rmsOpID=0, sourceSEs=None, activity=None, priority=None): """ :param ftsFiles: list of FTS3Files object that belongs to the operation :param username: username whose proxy should be used :param userGroup: group that should be used with username :param rmsReqID: ID of the Request in the RMS system :param rmsOpID: ID of the Operation in the RMS system :param sourceSEs: list of SE to be used as source (if applicable) :param activity: FTS activity to use :param priority: FTS priority to use """ ############################ # persistent attributes self.username = username self.userGroup = userGroup self.rmsReqID = rmsReqID self.rmsOpID = rmsOpID if isinstance(sourceSEs, list): sourceSEs = ','.join(sourceSEs) self.sourceSEs = sourceSEs self.ftsFiles = ftsFiles if ftsFiles else [] self.activity = activity self.priority = priority self.ftsJobs = [] now = datetime.datetime.utcnow().replace(microsecond=0) self.creationTime = now self.lastUpdate = now self.error = None self.status = FTS3Operation.INIT_STATE ######################## self.reqClient = None self.dManager = None self._log = None self.init_on_load() @orm.reconstructor def init_on_load(self): """ This method initializes some attributes. It is called by sqlalchemy (which does not call __init__) """ self._vo = None self.dManager = DataManager() self.rssClient = ResourceStatus() opID = getattr(self, 'operationID', None) loggerName = '%s/' % opID if opID else '' loggerName += 'req_%s/op_%s' % (self.rmsReqID, self.rmsOpID) self._log = gLogger.getSubLogger(loggerName, True) @property def vo(self): """:returns: return vo of the usergroup """ if self._vo: return self._vo if self.userGroup: self._vo = getVOForGroup(self.userGroup) return self._vo def isTotallyProcessed(self): """ Returns True if and only if there is nothing else to be done by FTS for this operation. All files are successful or definitely failed """ if self.status == 'Processed': return True fileStatuses = set([f.status for f in self.ftsFiles]) # If all the files are in a final state if fileStatuses <= set(FTS3File.FINAL_STATES): self.status = 'Processed' return True return False def _getFilesToSubmit(self, maxAttemptsPerFile=10): """ Return the list of FTS3files that can be submitted Either because they never were submitted, or because we can make more attempts :param maxAttemptsPerFile: the maximum number of attempts to be tried for a file :return List of FTS3File to submit """ toSubmit = [] for ftsFile in self.ftsFiles: if ftsFile.attempt >= maxAttemptsPerFile: ftsFile.status = 'Defunct' # The file was never submitted or # The file failed from the point of view of FTS # but no more than the maxAttemptsPerFile elif ftsFile.status in ('New', 'Failed'): toSubmit.append(ftsFile) return toSubmit @staticmethod def _checkSEAccess(seName, accessType, vo=None): """Check the Status of a storage element :param seName: name of the StorageElement :param accessType ReadAccess, WriteAccess,CheckAccess,RemoveAccess :return S_ERROR if not allowed or error, S_OK() otherwise """ # Check that the target is writable # access = self.rssClient.getStorageElementStatus( seName, accessType ) # if not access["OK"]: # return access # if access["Value"][seName][accessType] not in ( "Active", "Degraded" ): # return S_ERROR( "%s does not have %s in Active or Degraded" % ( seName, accessType ) ) status = StorageElement(seName, vo=vo).getStatus() if not status['OK']: return status status = status['Value'] accessType = accessType.replace('Access', '') if not status[accessType]: return S_ERROR("%s does not have %s in Active or Degraded" % (seName, accessType)) return S_OK() def _createNewJob(self, jobType, ftsFiles, targetSE, sourceSE=None): """ Create a new FTS3Job object :param jobType: type of job to create (Transfer, Staging, Removal) :param ftsFiles: list of FTS3File objects the job has to work on :param targetSE: SE on which to operate :param sourceSE: source SE, only useful for Transfer jobs :return FTS3Job object """ newJob = FTS3Job() newJob.type = jobType newJob.sourceSE = sourceSE newJob.targetSE = targetSE newJob.activity = self.activity newJob.priority = self.priority newJob.username = self.username newJob.userGroup = self.userGroup newJob.vo = self.vo newJob.filesToSubmit = ftsFiles newJob.operationID = getattr(self, 'operationID') return newJob def _callback(self): """Actually performs the callback """ raise NotImplementedError("You should not be using the base class") def callback(self): """ Trigger the callback once all the FTS interactions are done and update the status of the Operation to 'Finished' if successful """ self.reqClient = ReqClient() res = self._callback() if res['OK']: self.status = 'Finished' return res def prepareNewJobs(self, maxFilesPerJob=100, maxAttemptsPerFile=10): """ Prepare the new jobs that have to be submitted :param maxFilesPerJob: maximum number of files assigned to a job :param maxAttemptsPerFile: maximum number of retry after an fts failure :return list of jobs """ raise NotImplementedError("You should not be using the base class") def _updateRmsOperationStatus(self): """ Update the status of the Files in the rms operation :return: S_OK with a dict: * request: rms Request object * operation: rms Operation object * ftsFilesByTarget: dict {SE: [ftsFiles that were successful]} """ log = self._log.getSubLogger("_updateRmsOperationStatus/%s/%s" % (getattr(self, 'operationID'), self.rmsReqID), child=True) res = self.reqClient.getRequest(self.rmsReqID) if not res['OK']: return res request = res['Value'] res = request.getWaiting() if not res["OK"]: log.error("Unable to find 'Scheduled' operation in request") res = self.reqClient.putRequest(request, useFailoverProxy=False, retryMainService=3) if not res['OK']: log.error("Could not put back the request !", res['Message']) return S_ERROR("Could not find scheduled operation") operation = res['Value'] # We index the files of the operation by their IDs rmsFileIDs = {} for opFile in operation: rmsFileIDs[opFile.FileID] = opFile # Files that failed to transfer defunctRmsFileIDs = set() # { SE : [FTS3Files] } ftsFilesByTarget = {} for ftsFile in self.ftsFiles: if ftsFile.status == 'Defunct': log.info( "File failed to transfer, setting it to failed in RMS", "%s %s" % (ftsFile.lfn, ftsFile.targetSE)) defunctRmsFileIDs.add(ftsFile.rmsFileID) continue if ftsFile.status == 'Canceled': log.info( "File canceled, setting it Failed in RMS", "%s %s" % (ftsFile.lfn, ftsFile.targetSE)) defunctRmsFileIDs.add(ftsFile.rmsFileID) continue # SHOULD NEVER HAPPEN ! if ftsFile.status != 'Finished': log.error( "Callback called with file in non terminal state", "%s %s" % (ftsFile.lfn, ftsFile.targetSE)) res = self.reqClient.putRequest(request, useFailoverProxy=False, retryMainService=3) if not res['OK']: log.error("Could not put back the request !", res['Message']) return S_ERROR("Callback called with file in non terminal state") ftsFilesByTarget.setdefault(ftsFile.targetSE, []).append(ftsFile) # Now, we set the rmsFile as done in the operation, providing # that they are not in the defunctFiles. # We cannot do this in the previous list because in the FTS system, # each destination is a separate line in the DB but not in the RMS for ftsFile in self.ftsFiles: opFile = rmsFileIDs[ftsFile.rmsFileID] opFile.Status = 'Failed' if ftsFile.rmsFileID in defunctRmsFileIDs else 'Done' return S_OK({'request': request, 'operation': operation, 'ftsFilesByTarget': ftsFilesByTarget}) @classmethod def fromRMSObjects(cls, rmsReq, rmsOp, username): """ Construct an FTS3Operation object from the RMS Request and Operation corresponding. The attributes taken are the OwnerGroup, Request and Operation IDS, sourceSE, and activity and priority if they are defined in the Argument field of the operation :param rmsReq: RMS Request object :param rmsOp: RMS Operation object :param username: username to which associate the FTS3Operation (normally comes from the Req OwnerDN) :returns: FTS3Operation object """ ftsOp = cls() ftsOp.username = username ftsOp.userGroup = rmsReq.OwnerGroup ftsOp.rmsReqID = rmsReq.RequestID ftsOp.rmsOpID = rmsOp.OperationID ftsOp.sourceSEs = rmsOp.SourceSE try: argumentDic = json.loads(rmsOp.Arguments) ftsOp.activity = argumentDic['activity'] ftsOp.priority = argumentDic['priority'] except Exception as _e: pass return ftsOp
import DIRAC # Check is provided SE is OK se = StorageElement(targetSE) if not se.valid: print se.errorReason print Script.showHelp() from DIRAC.RequestManagementSystem.Client.ReqClient import ReqClient from DIRAC.RequestManagementSystem.Client.Request import Request from DIRAC.RequestManagementSystem.Client.Operation import Operation from DIRAC.RequestManagementSystem.Client.File import File from DIRAC.RequestManagementSystem.private.RequestValidator import gRequestValidator from DIRAC.DataManagementSystem.Client.ReplicaManager import ReplicaManager reqClient = ReqClient() rm = ReplicaManager() for lfnList in breakListIntoChunks(lfns, 100): oRequest = Request() oRequest.RequestName = "%s_%s" % (md5(repr(time.time())).hexdigest()[:16], md5(repr(time.time())).hexdigest()[:16]) replicateAndRegister = Operation() replicateAndRegister.Type = 'ReplicateAndRegister' replicateAndRegister.TargetSE = targetSE res = rm.getCatalogFileMetadata(lfnList) if not res['OK']: print "Can't get file metadata: %s" % res['Message']
class TransformationCleaningAgent(AgentModule): """ .. class:: TransformationCleaningAgent :param ~DIRAC.DataManagementSystem.Client.DataManager.DataManager dm: DataManager instance :param ~TransformationClient.TransformationClient transClient: TransformationClient instance :param ~FileCatalogClient.FileCatalogClient metadataClient: FileCatalogClient instance """ def __init__(self, *args, **kwargs): """ c'tor """ AgentModule.__init__(self, *args, **kwargs) self.shifterProxy = None # # transformation client self.transClient = None # # wms client self.wmsClient = None # # request client self.reqClient = None # # file catalog client self.metadataClient = None # # transformations types self.transformationTypes = None # # directory locations self.directoryLocations = ['TransformationDB', 'MetadataCatalog'] # # transformation metadata self.transfidmeta = 'TransformationID' # # archive periof in days self.archiveAfter = 7 # # transformation log SEs self.logSE = 'LogSE' # # enable/disable execution self.enableFlag = 'True' self.dataProcTTypes = ['MCSimulation', 'Merge'] self.dataManipTTypes = ['Replication', 'Removal'] def initialize(self): """ agent initialisation reading and setting confing opts :param self: self reference """ # # shifter proxy # See cleanContent method: this proxy will be used ALSO when the file catalog used # is the DIRAC File Catalog (DFC). # This is possible because of unset of the "UseServerCertificate" option self.shifterProxy = self.am_getOption('shifterProxy', self.shifterProxy) # # transformations types self.dataProcTTypes = Operations().getValue( 'Transformations/DataProcessing', self.dataProcTTypes) self.dataManipTTypes = Operations().getValue( 'Transformations/DataManipulation', self.dataManipTTypes) agentTSTypes = self.am_getOption('TransformationTypes', []) if agentTSTypes: self.transformationTypes = sorted(agentTSTypes) else: self.transformationTypes = sorted(self.dataProcTTypes + self.dataManipTTypes) self.log.info("Will consider the following transformation types: %s" % str(self.transformationTypes)) # # directory locations self.directoryLocations = sorted( self.am_getOption('DirectoryLocations', self.directoryLocations)) self.log.info( "Will search for directories in the following locations: %s" % str(self.directoryLocations)) # # transformation metadata self.transfidmeta = self.am_getOption('TransfIDMeta', self.transfidmeta) self.log.info("Will use %s as metadata tag name for TransformationID" % self.transfidmeta) # # archive periof in days self.archiveAfter = self.am_getOption('ArchiveAfter', self.archiveAfter) # days self.log.info("Will archive Completed transformations after %d days" % self.archiveAfter) # # transformation log SEs self.logSE = Operations().getValue('/LogStorage/LogSE', self.logSE) self.log.info("Will remove logs found on storage element: %s" % self.logSE) # # transformation client self.transClient = TransformationClient() # # wms client self.wmsClient = WMSClient() # # request client self.reqClient = ReqClient() # # file catalog client self.metadataClient = FileCatalogClient() return S_OK() ############################################################################# def execute(self): """ execution in one agent's cycle :param self: self reference """ self.enableFlag = self.am_getOption('EnableFlag', self.enableFlag) if self.enableFlag != 'True': self.log.info( 'TransformationCleaningAgent is disabled by configuration option EnableFlag' ) return S_OK('Disabled via CS flag') # Obtain the transformations in Cleaning status and remove any mention of the jobs/files res = self.transClient.getTransformations({ 'Status': 'Cleaning', 'Type': self.transformationTypes }) if res['OK']: for transDict in res['Value']: if self.shifterProxy: self._executeClean(transDict) else: self.log.info( "Cleaning transformation %(TransformationID)s with %(AuthorDN)s, %(AuthorGroup)s" % transDict) executeWithUserProxy(self._executeClean)( transDict, proxyUserDN=transDict['AuthorDN'], proxyUserGroup=transDict['AuthorGroup']) else: self.log.error("Failed to get transformations", res['Message']) # Obtain the transformations in RemovingFiles status and removes the output files res = self.transClient.getTransformations({ 'Status': 'RemovingFiles', 'Type': self.transformationTypes }) if res['OK']: for transDict in res['Value']: if self.shifterProxy: self._executeRemoval(transDict) else: self.log.info( "Removing files for transformation %(TransformationID)s with %(AuthorDN)s, %(AuthorGroup)s" % transDict) executeWithUserProxy(self._executeRemoval)( transDict, proxyUserDN=transDict['AuthorDN'], proxyUserGroup=transDict['AuthorGroup']) else: self.log.error("Could not get the transformations", res['Message']) # Obtain the transformations in Completed status and archive if inactive for X days olderThanTime = datetime.utcnow() - timedelta(days=self.archiveAfter) res = self.transClient.getTransformations( { 'Status': 'Completed', 'Type': self.transformationTypes }, older=olderThanTime, timeStamp='LastUpdate') if res['OK']: for transDict in res['Value']: if self.shifterProxy: self._executeArchive(transDict) else: self.log.info( "Archiving files for transformation %(TransformationID)s with %(AuthorDN)s, %(AuthorGroup)s" % transDict) executeWithUserProxy(self._executeArchive)( transDict, proxyUserDN=transDict['AuthorDN'], proxyUserGroup=transDict['AuthorGroup']) else: self.log.error("Could not get the transformations", res['Message']) return S_OK() def _executeClean(self, transDict): """Clean transformation.""" # if transformation is of type `Replication` or `Removal`, there is nothing to clean. # We just archive if transDict['Type'] in self.dataManipTTypes: res = self.archiveTransformation(transDict['TransformationID']) if not res['OK']: self.log.error("Problems archiving transformation %s: %s" % (transDict['TransformationID'], res['Message'])) else: res = self.cleanTransformation(transDict['TransformationID']) if not res['OK']: self.log.error("Problems cleaning transformation %s: %s" % (transDict['TransformationID'], res['Message'])) def _executeRemoval(self, transDict): """Remove files from given transformation.""" res = self.removeTransformationOutput(transDict['TransformationID']) if not res['OK']: self.log.error("Problems removing transformation %s: %s" % (transDict['TransformationID'], res['Message'])) def _executeArchive(self, transDict): """Archive the given transformation.""" res = self.archiveTransformation(transDict['TransformationID']) if not res['OK']: self.log.error("Problems archiving transformation %s: %s" % (transDict['TransformationID'], res['Message'])) return S_OK() ############################################################################# # # Get the transformation directories for checking # def getTransformationDirectories(self, transID): """ get the directories for the supplied transformation from the transformation system. These directories are used by removeTransformationOutput and cleanTransformation for removing output. :param self: self reference :param int transID: transformation ID """ self.log.verbose( "Cleaning Transformation directories of transformation %d" % transID) directories = [] if 'TransformationDB' in self.directoryLocations: res = self.transClient.getTransformationParameters( transID, ['OutputDirectories']) if not res['OK']: self.log.error("Failed to obtain transformation directories", res['Message']) return res transDirectories = [] if res['Value']: if not isinstance(res['Value'], list): try: transDirectories = ast.literal_eval(res['Value']) except BaseException: # It can happen if the res['Value'] is '/a/b/c' instead of '["/a/b/c"]' transDirectories.append(res['Value']) else: transDirectories = res['Value'] directories = self._addDirs(transID, transDirectories, directories) if 'MetadataCatalog' in self.directoryLocations: res = self.metadataClient.findDirectoriesByMetadata( {self.transfidmeta: transID}) if not res['OK']: self.log.error("Failed to obtain metadata catalog directories", res['Message']) return res transDirectories = res['Value'] directories = self._addDirs(transID, transDirectories, directories) if not directories: self.log.info("No output directories found") directories = sorted(directories) return S_OK(directories) @classmethod def _addDirs(cls, transID, newDirs, existingDirs): """ append unique :newDirs: list to :existingDirs: list :param self: self reference :param int transID: transformationID :param list newDirs: src list of paths :param list existingDirs: dest list of paths """ for folder in newDirs: transStr = str(transID).zfill(8) if re.search(transStr, str(folder)): if folder not in existingDirs: existingDirs.append(os.path.normpath(folder)) return existingDirs ############################################################################# # # These are the methods for performing the cleaning of catalogs and storage # def cleanContent(self, directory): """ wipe out everything from catalog under folder :directory: :param self: self reference :params str directory: folder name """ self.log.verbose("Cleaning Catalog contents") res = self.__getCatalogDirectoryContents([directory]) if not res['OK']: return res filesFound = res['Value'] if not filesFound: self.log.info( "No files are registered in the catalog directory %s" % directory) return S_OK() self.log.info( "Attempting to remove %d possible remnants from the catalog and storage" % len(filesFound)) # Executing with shifter proxy gConfigurationData.setOptionInCFG( '/DIRAC/Security/UseServerCertificate', 'false') res = DataManager().removeFile(filesFound, force=True) gConfigurationData.setOptionInCFG( '/DIRAC/Security/UseServerCertificate', 'true') if not res['OK']: return res realFailure = False for lfn, reason in res['Value']['Failed'].items(): if "File does not exist" in str(reason): self.log.warn("File %s not found in some catalog: " % (lfn)) else: self.log.error("Failed to remove file found in the catalog", "%s %s" % (lfn, reason)) realFailure = True if realFailure: return S_ERROR("Failed to remove all files found in the catalog") return S_OK() def __getCatalogDirectoryContents(self, directories): """ get catalog contents under paths :directories: :param self: self reference :param list directories: list of paths in catalog """ self.log.info('Obtaining the catalog contents for %d directories:' % len(directories)) for directory in directories: self.log.info(directory) activeDirs = directories allFiles = {} fc = FileCatalog() while activeDirs: currentDir = activeDirs[0] res = returnSingleResult(fc.listDirectory(currentDir)) activeDirs.remove(currentDir) if not res['OK'] and 'Directory does not exist' in res[ 'Message']: # FIXME: DFC should return errno self.log.info("The supplied directory %s does not exist" % currentDir) elif not res['OK']: if "No such file or directory" in res['Message']: self.log.info("%s: %s" % (currentDir, res['Message'])) else: self.log.error("Failed to get directory %s content: %s" % (currentDir, res['Message'])) else: dirContents = res['Value'] activeDirs.extend(dirContents['SubDirs']) allFiles.update(dirContents['Files']) self.log.info("Found %d files" % len(allFiles)) return S_OK(allFiles.keys()) def cleanTransformationLogFiles(self, directory): """ clean up transformation logs from directory :directory: :param self: self reference :param str directory: folder name """ self.log.verbose("Removing log files found in the directory %s" % directory) res = returnSingleResult( StorageElement(self.logSE).removeDirectory(directory, recursive=True)) if not res['OK']: if cmpError(res, errno.ENOENT): # No such file or directory self.log.warn("Transformation log directory does not exist", directory) return S_OK() self.log.error("Failed to remove log files", res['Message']) return res self.log.info("Successfully removed transformation log directory") return S_OK() ############################################################################# # # These are the functional methods for archiving and cleaning transformations # def removeTransformationOutput(self, transID): """ This just removes any mention of the output data from the catalog and storage """ self.log.info("Removing output data for transformation %s" % transID) res = self.getTransformationDirectories(transID) if not res['OK']: self.log.error( 'Problem obtaining directories for transformation %s with result "%s"' % (transID, res)) return S_OK() directories = res['Value'] for directory in directories: if not re.search('/LOG/', directory): res = self.cleanContent(directory) if not res['OK']: return res self.log.info("Removed %d directories from the catalog \ and its files from the storage for transformation %s" % (len(directories), transID)) # Clean ALL the possible remnants found in the metadata catalog res = self.cleanMetadataCatalogFiles(transID) if not res['OK']: return res self.log.info("Successfully removed output of transformation %d" % transID) # Change the status of the transformation to RemovedFiles res = self.transClient.setTransformationParameter( transID, 'Status', 'RemovedFiles') if not res['OK']: self.log.error( "Failed to update status of transformation %s to RemovedFiles" % (transID), res['Message']) return res self.log.info("Updated status of transformation %s to RemovedFiles" % (transID)) return S_OK() def archiveTransformation(self, transID): """ This just removes job from the jobDB and the transformation DB :param self: self reference :param int transID: transformation ID """ self.log.info("Archiving transformation %s" % transID) # Clean the jobs in the WMS and any failover requests found res = self.cleanTransformationTasks(transID) if not res['OK']: return res # Clean the transformation DB of the files and job information res = self.transClient.cleanTransformation(transID) if not res['OK']: return res self.log.info("Successfully archived transformation %d" % transID) # Change the status of the transformation to archived res = self.transClient.setTransformationParameter( transID, 'Status', 'Archived') if not res['OK']: self.log.error( "Failed to update status of transformation %s to Archived" % (transID), res['Message']) return res self.log.info("Updated status of transformation %s to Archived" % (transID)) return S_OK() def cleanTransformation(self, transID): """ This removes what was produced by the supplied transformation, leaving only some info and log in the transformation DB. """ self.log.info("Cleaning transformation %s" % transID) res = self.getTransformationDirectories(transID) if not res['OK']: self.log.error( 'Problem obtaining directories for transformation %s with result "%s"' % (transID, res)) return S_OK() directories = res['Value'] # Clean the jobs in the WMS and any failover requests found res = self.cleanTransformationTasks(transID) if not res['OK']: return res # Clean the log files for the jobs for directory in directories: if re.search('/LOG/', directory): res = self.cleanTransformationLogFiles(directory) if not res['OK']: return res res = self.cleanContent(directory) if not res['OK']: return res # Clean ALL the possible remnants found res = self.cleanMetadataCatalogFiles(transID) if not res['OK']: return res # Clean the transformation DB of the files and job information res = self.transClient.cleanTransformation(transID) if not res['OK']: return res self.log.info("Successfully cleaned transformation %d" % transID) res = self.transClient.setTransformationParameter( transID, 'Status', 'Cleaned') if not res['OK']: self.log.error( "Failed to update status of transformation %s to Cleaned" % (transID), res['Message']) return res self.log.info("Updated status of transformation %s to Cleaned" % (transID)) return S_OK() def cleanMetadataCatalogFiles(self, transID): """ wipe out files from catalog """ res = self.metadataClient.findFilesByMetadata( {self.transfidmeta: transID}) if not res['OK']: return res fileToRemove = res['Value'] if not fileToRemove: self.log.info('No files found for transID %s' % transID) return S_OK() # Executing with shifter proxy gConfigurationData.setOptionInCFG( '/DIRAC/Security/UseServerCertificate', 'false') res = DataManager().removeFile(fileToRemove, force=True) gConfigurationData.setOptionInCFG( '/DIRAC/Security/UseServerCertificate', 'true') if not res['OK']: return res for lfn, reason in res['Value']['Failed'].items(): self.log.error("Failed to remove file found in metadata catalog", "%s %s" % (lfn, reason)) if res['Value']['Failed']: return S_ERROR( "Failed to remove all files found in the metadata catalog") self.log.info("Successfully removed all files found in the BK") return S_OK() ############################################################################# # # These are the methods for removing the jobs from the WMS and transformation DB # def cleanTransformationTasks(self, transID): """ clean tasks from WMS, or from the RMS if it is a DataManipulation transformation """ self.log.verbose("Cleaning Transformation tasks of transformation %d" % transID) res = self.__getTransformationExternalIDs(transID) if not res['OK']: return res externalIDs = res['Value'] if externalIDs: res = self.transClient.getTransformationParameters( transID, ['Type']) if not res['OK']: self.log.error("Failed to determine transformation type") return res transType = res['Value'] if transType in self.dataProcTTypes: res = self.__removeWMSTasks(externalIDs) else: res = self.__removeRequests(externalIDs) if not res['OK']: return res return S_OK() def __getTransformationExternalIDs(self, transID): """ collect all ExternalIDs for transformation :transID: :param self: self reference :param int transID: transforamtion ID """ res = self.transClient.getTransformationTasks( condDict={'TransformationID': transID}) if not res['OK']: self.log.error( "Failed to get externalIDs for transformation %d" % transID, res['Message']) return res externalIDs = [taskDict['ExternalID'] for taskDict in res["Value"]] self.log.info("Found %d tasks for transformation" % len(externalIDs)) return S_OK(externalIDs) def __removeRequests(self, requestIDs): """ This will remove requests from the RMS system - """ rIDs = [int(long(j)) for j in requestIDs if long(j)] for reqID in rIDs: self.reqClient.cancelRequest(reqID) return S_OK() def __removeWMSTasks(self, transJobIDs): """ wipe out jobs and their requests from the system :param self: self reference :param list trasnJobIDs: job IDs """ # Prevent 0 job IDs jobIDs = [int(j) for j in transJobIDs if int(j)] allRemove = True for jobList in breakListIntoChunks(jobIDs, 500): res = self.wmsClient.killJob(jobList) if res['OK']: self.log.info("Successfully killed %d jobs from WMS" % len(jobList)) elif ("InvalidJobIDs" in res) and ("NonauthorizedJobIDs" not in res) and ("FailedJobIDs" not in res): self.log.info("Found %s jobs which did not exist in the WMS" % len(res['InvalidJobIDs'])) elif "NonauthorizedJobIDs" in res: self.log.error( "Failed to kill %s jobs because not authorized" % len(res['NonauthorizedJobIDs'])) allRemove = False elif "FailedJobIDs" in res: self.log.error("Failed to kill %s jobs" % len(res['FailedJobIDs'])) allRemove = False res = self.wmsClient.deleteJob(jobList) if res['OK']: self.log.info("Successfully removed %d jobs from WMS" % len(jobList)) elif ("InvalidJobIDs" in res) and ("NonauthorizedJobIDs" not in res) and ("FailedJobIDs" not in res): self.log.info("Found %s jobs which did not exist in the WMS" % len(res['InvalidJobIDs'])) elif "NonauthorizedJobIDs" in res: self.log.error( "Failed to remove %s jobs because not authorized" % len(res['NonauthorizedJobIDs'])) allRemove = False elif "FailedJobIDs" in res: self.log.error("Failed to remove %s jobs" % len(res['FailedJobIDs'])) allRemove = False if not allRemove: return S_ERROR("Failed to remove all remnants from WMS") self.log.info("Successfully removed all tasks from the WMS") if not jobIDs: self.log.info( "JobIDs not present, unable to remove asociated requests.") return S_OK() failed = 0 failoverRequests = {} res = self.reqClient.getRequestIDsForJobs(jobIDs) if not res['OK']: self.log.error("Failed to get requestID for jobs.", res['Message']) return res failoverRequests.update(res['Value']['Successful']) if not failoverRequests: return S_OK() for jobID, requestID in res['Value']['Successful'].items(): # Put this check just in case, tasks must have associated jobs if jobID == 0 or jobID == '0': continue res = self.reqClient.cancelRequest(requestID) if not res['OK']: self.log.error("Failed to remove request from RequestDB", res['Message']) failed += 1 else: self.log.verbose("Removed request %s associated to job %d." % (requestID, jobID)) if failed: self.log.info("Successfully removed %s requests" % (len(failoverRequests) - failed)) self.log.info("Failed to remove %s requests" % failed) return S_ERROR("Failed to remove all the request from RequestDB") self.log.info( "Successfully removed all the associated failover requests") return S_OK()
def removeDeletedJobs(self): """Fully remove jobs that are already in status "DELETED", unless there are still requests. :returns: S_OK/S_ERROR """ res = self._getJobsList({"Status": JobStatus.DELETED}) if not res["OK"]: return res jobList = res["Value"] if not jobList: self.log.info("No jobs to remove") return S_OK() self.log.info("Unassigning sandboxes from soon to be deleted jobs", "(%d)" % len(jobList)) result = SandboxStoreClient(useCertificates=True).unassignJobs(jobList) if not result["OK"]: self.log.error("Cannot unassign jobs to sandboxes", result["Message"]) return result self.log.info("Attempting to remove deleted jobs", "(%d)" % len(jobList)) # remove from jobList those that have still Operations to do in RMS reqClient = ReqClient() res = reqClient.getRequestIDsForJobs(jobList) if not res["OK"]: return res if res["Value"]["Successful"]: notFinal = set() # Check whether these requests are in a final status for job, reqID in res["Value"]["Successful"].items(): # If not, remove job from list to remove if reqClient.getRequestStatus(reqID).get( "Value") not in Request.FINAL_STATES: # Keep that job notFinal.add(job) else: # Remove the request, if failed, keep the job res1 = reqClient.deleteRequest(reqID) if not res1["OK"]: notFinal.add(job) if notFinal: self.log.info( "Some jobs won't be removed, as still having Requests not in final status", "(n=%d)" % len(notFinal)) jobList = list(set(jobList) - notFinal) if not jobList: return S_OK() ownerJobsDict = self._getOwnerJobsDict(jobList) fail = False for owner, jobsList in ownerJobsDict.items(): ownerDN = owner.split(";")[0] ownerGroup = owner.split(";")[1] self.log.verbose( "Attempting to remove jobs", "(n=%d) for %s : %s" % (len(jobsList), ownerDN, ownerGroup)) wmsClient = WMSClient(useCertificates=True, delegatedDN=ownerDN, delegatedGroup=ownerGroup) result = wmsClient.removeJob(jobsList) if not result["OK"]: self.log.error( "Could not remove jobs", "for %s : %s (n=%d) : %s" % (ownerDN, ownerGroup, len(jobsList), result["Message"]), ) fail = True if fail: return S_ERROR() return S_OK()
since = convertDate(switch[1]) elif switch[0] == 'Until': until = convertDate(switch[1]) if reset: status = 'Failed' if terse: verbose = True if status: if not until: until = datetime.datetime.utcnow() if not since: since = until - datetime.timedelta(hours=24) from DIRAC.RequestManagementSystem.Client.ReqClient import ReqClient from DIRAC.RequestManagementSystem.Client.ReqClient import printRequest, recoverableRequest reqClient = ReqClient() if transID: if not taskIDs: gLogger.fatal( "If Transformation is set, a list of Tasks should also be set") Script.showHelp() DIRAC.exit(2) # In principle, the task name is unique, so the request name should be unique as well # If ever this would not work anymore, we would need to use the transformationClient # to fetch the ExternalID requests = ['%08d_%08d' % (transID, task) for task in taskIDs] allR = True elif not jobs: args = Script.getPositionalArgs() if len(args) == 1:
import DIRAC # Check is provided SE is OK se = StorageElement( targetSE ) if not se.valid: print se.errorReason print Script.showHelp() from DIRAC.RequestManagementSystem.Client.ReqClient import ReqClient from DIRAC.RequestManagementSystem.Client.Request import Request from DIRAC.RequestManagementSystem.Client.Operation import Operation from DIRAC.RequestManagementSystem.Client.File import File from DIRAC.RequestManagementSystem.private.RequestValidator import RequestValidator from DIRAC.Resources.Catalog.FileCatalog import FileCatalog reqClient = ReqClient() fc = FileCatalog() for lfnList in breakListIntoChunks( lfns, 100 ): oRequest = Request() oRequest.RequestName = "%s_%s" % ( md5( repr( time.time() ) ).hexdigest()[:16], md5( repr( time.time() ) ).hexdigest()[:16] ) replicateAndRegister = Operation() replicateAndRegister.Type = 'ReplicateAndRegister' replicateAndRegister.TargetSE = targetSE res = fc.getFileMetadata( lfnList ) if not res['OK']: print "Can't get file metadata: %s" % res['Message'] DIRAC.exit( 1 )
def reqClient(self): """Return RequestClient.""" if not self._reqClient: from DIRAC.RequestManagementSystem.Client.ReqClient import ReqClient self._reqClient = ReqClient() return self._reqClient
class RequestTasks( TaskBase ): def __init__( self, transClient = None, logger = None, requestClient = None, requestClass = None, ): """ c'tor the requestClass is by default Request. If extensions want to use an extended type, they can pass it as a parameter. This is the same behavior as WorfkloTasks and jobClass """ if not logger: logger = gLogger.getSubLogger( 'RequestTasks' ) super( RequestTasks, self ).__init__( transClient, logger ) if not requestClient: self.requestClient = ReqClient() else: self.requestClient = requestClient if not requestClass: self.requestClass = Request else: self.requestClass = requestClass def prepareTransformationTasks( self, transBody, taskDict, owner = '', ownerGroup = '' ): """ Prepare tasks, given a taskDict, that is created (with some manipulation) by the DB """ requestOperation = 'ReplicateAndRegister' if transBody: try: _requestType, requestOperation = transBody.split( ';' ) except AttributeError: pass for taskID in sorted( taskDict ): paramDict = taskDict[taskID] if paramDict['InputData']: transID = paramDict['TransformationID'] oRequest = Request() transfer = Operation() transfer.Type = requestOperation transfer.TargetSE = paramDict['TargetSE'] if type( paramDict['InputData'] ) == type( [] ): files = paramDict['InputData'] elif type( paramDict['InputData'] ) == type( '' ): files = paramDict['InputData'].split( ';' ) for lfn in files: trFile = File() trFile.LFN = lfn transfer.addFile( trFile ) oRequest.addOperation( transfer ) oRequest.RequestName = str( transID ).zfill( 8 ) + '_' + str( taskID ).zfill( 8 ) oRequest.OwnerDN = owner oRequest.OwnerGroup = ownerGroup isValid = gRequestValidator.validate( oRequest ) if not isValid['OK']: return isValid taskDict[taskID]['TaskObject'] = oRequest return S_OK( taskDict ) def submitTransformationTasks( self, taskDict ): """ Submit requests one by one """ submitted = 0 failed = 0 startTime = time.time() for taskID in sorted( taskDict ): if not taskDict[taskID]['TaskObject']: taskDict[taskID]['Success'] = False failed += 1 continue res = self.submitTaskToExternal( taskDict[taskID]['TaskObject'] ) if res['OK']: taskDict[taskID]['ExternalID'] = res['Value'] taskDict[taskID]['Success'] = True submitted += 1 else: self.log.error( "Failed to submit task to RMS", res['Message'] ) taskDict[taskID]['Success'] = False failed += 1 self.log.info( 'submitTasks: Submitted %d tasks to RMS in %.1f seconds' % ( submitted, time.time() - startTime ) ) if failed: self.log.info( 'submitTasks: Failed to submit %d tasks to RMS.' % ( failed ) ) return S_OK( taskDict ) def submitTaskToExternal( self, oRequest ): """ Submits a request using ReqClient """ if isinstance( oRequest, self.requestClass ): return self.requestClient.putRequest( oRequest ) else: return S_ERROR( "Request should be a Request object" ) def updateTransformationReservedTasks( self, taskDicts ): taskNameIDs = {} noTasks = [] for taskDict in taskDicts: transID = taskDict['TransformationID'] taskID = taskDict['TaskID'] taskName = str( transID ).zfill( 8 ) + '_' + str( taskID ).zfill( 8 ) res = self.requestClient.getRequestInfo( taskName ) if res['OK']: taskNameIDs[taskName] = res['Value'][0] elif re.search( "Failed to retrieve RequestID for Request", res['Message'] ): noTasks.append( taskName ) else: self.log.warn( "Failed to get requestID for request", res['Message'] ) return S_OK( {'NoTasks':noTasks, 'TaskNameIDs':taskNameIDs} ) def getSubmittedTaskStatus( self, taskDicts ): updateDict = {} for taskDict in taskDicts: transID = taskDict['TransformationID'] taskID = taskDict['TaskID'] oldStatus = taskDict['ExternalStatus'] taskName = str( transID ).zfill( 8 ) + '_' + str( taskID ).zfill( 8 ) res = self.requestClient.getRequestStatus( taskName ) newStatus = '' if res['OK']: # FIXME: for compatibility between old and new RMS try: # old newStatus = res['Value']['RequestStatus'] except TypeError: # new newStatus = res['Value'] elif re.search( "Failed to retrieve RequestID for Request", res['Message'] ): newStatus = 'Failed' else: self.log.info( "getSubmittedTaskStatus: Failed to get requestID for request", res['Message'] ) if newStatus and ( newStatus != oldStatus ): if newStatus not in updateDict: updateDict[newStatus] = [] updateDict[newStatus].append( taskID ) return S_OK( updateDict ) def getSubmittedFileStatus( self, fileDicts ): taskFiles = {} for fileDict in fileDicts: transID = fileDict['TransformationID'] taskID = fileDict['TaskID'] taskName = str( transID ).zfill( 8 ) + '_' + str( taskID ).zfill( 8 ) if taskName not in taskFiles: taskFiles[taskName] = {} taskFiles[taskName][fileDict['LFN']] = fileDict['Status'] updateDict = {} for taskName in sorted( taskFiles ): lfnDict = taskFiles[taskName] res = self.requestClient.getRequestFileStatus( taskName, lfnDict.keys() ) if not res['OK']: self.log.warn( "getSubmittedFileStatus: Failed to get files status for request", res['Message'] ) continue for lfn, newStatus in res['Value'].items(): if newStatus == lfnDict[lfn]: pass elif newStatus == 'Done': updateDict[lfn] = 'Processed' elif newStatus == 'Failed': updateDict[lfn] = 'Problematic' return S_OK( updateDict )
try: shutil.copy(appTar,"%s%s" % (final_path, os.path.basename(appTar))) except EnvironmentError, x: gLogger.error("Could not copy because %s" % x) return S_ERROR("Could not copy because %s" % x) elif path.find("http://") > -1: gLogger.error("Path %s was not foreseen!" % path) gLogger.error("Location not known, upload to location yourself, and publish in CS manually") return S_ERROR() else: lfnpath = "%s%s" % (path, os.path.basename(appTar)) res = datMan.putAndRegister(lfnpath, appTar, ops.getValue('Software/BaseStorageElement', "CERN-SRM")) if not res['OK']: return res request = Request() requestClient = ReqClient() request.RequestName = 'copy_%s' % os.path.basename(appTar).replace(".tgz", "").replace(".tar.gz", "") request.SourceComponent = 'ReplicateILCSoft' copies_at = ops.getValue('Software/CopiesAt', []) for copies in copies_at: transfer = Operation() transfer.Type = "ReplicateAndRegister" transfer.TargetSE = copies trFile = File() trFile.LFN = lfnpath trFile.GUID = "" transfer.addFile(trFile) request.addOperation(transfer) res = RequestValidator().validate(request) if not res['OK']:
def _treatOperation(self, operation): """ Treat one operation: * does the callback if the operation is finished * generate new jobs and submits them :param operation: the operation to treat :return: operation, S_OK()/S_ERROR() """ try: threadID = current_process().name log = gLogger.getSubLogger("treatOperation/%s" % operation.operationID, child=True) # If the operation is totally processed # we perform the callback if operation.isTotallyProcessed(): log.debug("FTS3Operation %s is totally processed" % operation.operationID) res = operation.callback() if not res['OK']: log.error("Error performing the callback", res) log.info("Putting back the operation") dbRes = self.fts3db.persistOperation(operation) if not dbRes['OK']: log.error("Could not persist operation", dbRes) return operation, res else: log.debug("FTS3Operation %s is not totally processed yet" % operation.operationID) # This flag is set to False if we want to stop the ongoing processing # of an operation, typically when the matching RMS Request has been # canceled (see below) continueOperationProcessing = True # Check the status of the associated RMS Request. # If it is canceled then we will not create new FTS3Jobs, and mark # this as FTS3Operation canceled. if operation.rmsReqID: res = ReqClient().getRequestStatus(operation.rmsReqID) if not res['OK']: log.error("Could not get request status", res) return operation, res rmsReqStatus = res['Value'] if rmsReqStatus == 'Canceled': log.info( "The RMS Request is canceled, canceling the FTS3Operation", "rmsReqID: %s, FTS3OperationID: %s" % (operation.rmsReqID, operation.operationID)) operation.status = 'Canceled' continueOperationProcessing = False if continueOperationProcessing: res = operation.prepareNewJobs( maxFilesPerJob=self.maxFilesPerJob, maxAttemptsPerFile=self.maxAttemptsPerFile) if not res['OK']: log.error( "Cannot prepare new Jobs", "FTS3Operation %s : %s" % (operation.operationID, res)) return operation, res newJobs = res['Value'] log.debug("FTS3Operation %s: %s new jobs to be submitted" % (operation.operationID, len(newJobs))) for ftsJob in newJobs: res = self._serverPolicy.chooseFTS3Server() if not res['OK']: log.error(res) continue ftsServer = res['Value'] log.debug("Use %s server" % ftsServer) ftsJob.ftsServer = ftsServer res = self.getFTS3Context(ftsJob.username, ftsJob.userGroup, ftsServer, threadID=threadID) if not res['OK']: log.error("Could not get context", res) continue context = res['Value'] res = ftsJob.submit(context=context, protocols=self.thirdPartyProtocols) if not res['OK']: log.error( "Could not submit FTS3Job", "FTS3Operation %s : %s" % (operation.operationID, res)) continue operation.ftsJobs.append(ftsJob) submittedFileIds = res['Value'] log.info( "FTS3Operation %s: Submitted job for %s transfers" % (operation.operationID, len(submittedFileIds))) # new jobs are put in the DB at the same time res = self.fts3db.persistOperation(operation) if not res['OK']: log.error("Could not persist operation", res) return operation, res except Exception as e: log.exception('Exception in the thread', repr(e)) return operation, S_ERROR("Exception %s" % repr(e))
since = convertDate(switch[1]) elif switch[0] == 'Until': until = convertDate(switch[1]) if reset: status = 'Failed' if terse: verbose = True if status: if not until: until = datetime.datetime.utcnow() if not since: since = until - datetime.timedelta(hours=24) from DIRAC.RequestManagementSystem.Client.ReqClient import ReqClient from DIRAC.RequestManagementSystem.Client.ReqClient import printRequest, recoverableRequest reqClient = ReqClient() if transID: if not taskIDs: gLogger.fatal( "If Transformation is set, a list of Tasks should also be set") Script.showHelp() DIRAC.exit(2) requests = ['%08d_%08d' % (transID, task) for task in taskIDs] all = True elif not jobs: args = Script.getPositionalArgs() if len(args) == 1: all = True requests = [reqName for reqName in args[0].split(',') if reqName] else:
class DataRecoveryAgent(AgentModule): """Data Recovery Agent""" def __init__(self, *args, **kwargs): AgentModule.__init__(self, *args, **kwargs) self.name = 'DataRecoveryAgent' self.enabled = False self.getJobInfoFromJDLOnly = False self.__getCSOptions() self.jobStatus = [ 'Failed', 'Done' ] # This needs to be both otherwise we cannot account for all cases self.jobMon = JobMonitoringClient() self.fcClient = FileCatalogClient() self.tClient = TransformationClient() self.reqClient = ReqClient() self.diracAPI = Dirac() self.inputFilesProcessed = set() self.todo = {'NoInputFiles': [dict(Message="NoInputFiles: OutputExists: Job 'Done'", ShortMessage="NoInputFiles: job 'Done' ", Counter=0, Check=lambda job: job.allFilesExist() and job.status == 'Failed', Actions=lambda job, tInfo: [job.setJobDone(tInfo)], ), dict(Message="NoInputFiles: OutputMissing: Job 'Failed'", ShortMessage="NoInputFiles: job 'Failed' ", Counter=0, Check=lambda job: job.allFilesMissing() and job.status == 'Done', Actions=lambda job, tInfo: [job.setJobFailed(tInfo)], ), ], 'InputFiles': [ \ # must always be first! dict(Message="One of many Successful: clean others", ShortMessage="Other Tasks --> Keep", Counter=0, Check=lambda job: job.allFilesExist() and job.otherTasks and \ not set(job.inputFiles).issubset(self.inputFilesProcessed), Actions=lambda job, tInfo: [self.inputFilesProcessed.update(job.inputFiles), job.setJobDone(tInfo), job.setInputProcessed(tInfo)] ), dict(Message="Other Task processed Input, no Output: Fail", ShortMessage="Other Tasks --> Fail", Counter=0, Check=lambda job: set(job.inputFiles).issubset(self.inputFilesProcessed) and \ job.allFilesMissing() and job.status != 'Failed', Actions=lambda job, tInfo: [job.setJobFailed(tInfo)] ), dict(Message="Other Task processed Input: Fail and clean", ShortMessage="Other Tasks --> Cleanup", Counter=0, Check=lambda job: set(job.inputFiles).issubset( self.inputFilesProcessed) and not job.allFilesMissing(), Actions=lambda job, tInfo: [job.setJobFailed(tInfo), job.cleanOutputs(tInfo)] ), dict(Message="InputFile(s) missing: mark job 'Failed', mark input 'Deleted', clean", ShortMessage="Input Missing --> Job 'Failed, Input 'Deleted', Cleanup", Counter=0, Check=lambda job: job.inputFiles and job.allInputFilesMissing() and \ not job.allTransFilesDeleted(), Actions=lambda job, tInfo: [job.cleanOutputs(tInfo), job.setJobFailed(tInfo), job.setInputDeleted(tInfo)], ), dict(Message="InputFile(s) Deleted, output Exists: mark job 'Failed', clean", ShortMessage="Input Deleted --> Job 'Failed, Cleanup", Counter=0, Check=lambda job: job.inputFiles and job.allInputFilesMissing() and \ job.allTransFilesDeleted() and not job.allFilesMissing(), Actions=lambda job, tInfo: [job.cleanOutputs(tInfo), job.setJobFailed(tInfo)], ), # All Output Exists dict(Message="Output Exists, job Failed, input not Processed --> Job Done, Input Processed", ShortMessage="Output Exists --> Job Done, Input Processed", Counter=0, Check=lambda job: job.allFilesExist() and \ not job.otherTasks and \ job.status == 'Failed' and \ not job.allFilesProcessed() and \ job.allInputFilesExist(), Actions=lambda job, tInfo: [job.setJobDone(tInfo), job.setInputProcessed(tInfo)] ), dict(Message="Output Exists, job Failed, input Processed --> Job Done", ShortMessage="Output Exists --> Job Done", Counter=0, Check=lambda job: job.allFilesExist() and \ not job.otherTasks and \ job.status == 'Failed' and \ job.allFilesProcessed() and \ job.allInputFilesExist(), Actions=lambda job, tInfo: [job.setJobDone(tInfo)] ), dict(Message="Output Exists, job Done, input not Processed --> Input Processed", ShortMessage="Output Exists --> Input Processed", Counter=0, Check=lambda job: job.allFilesExist() and \ not job.otherTasks and \ job.status == 'Done' and \ not job.allFilesProcessed() and \ job.allInputFilesExist(), Actions=lambda job, tInfo: [job.setInputProcessed(tInfo)] ), # outputmissing dict(Message="Output Missing, job Failed, input Assigned, MaxError --> Input MaxReset", ShortMessage="Max ErrorCount --> Input MaxReset", Counter=0, Check=lambda job: job.allFilesMissing() and \ not job.otherTasks and \ job.status == 'Failed' and \ job.allFilesAssigned() and \ not set(job.inputFiles).issubset(self.inputFilesProcessed) and \ job.allInputFilesExist() and \ job.checkErrorCount(), Actions=lambda job, tInfo: [job.setInputMaxReset(tInfo)] ), dict(Message="Output Missing, job Failed, input Assigned --> Input Unused", ShortMessage="Output Missing --> Input Unused", Counter=0, Check=lambda job: job.allFilesMissing() and \ not job.otherTasks and \ job.status == 'Failed' and \ job.allFilesAssigned() and \ not set(job.inputFiles).issubset(self.inputFilesProcessed) and \ job.allInputFilesExist(), Actions=lambda job, tInfo: [job.setInputUnused(tInfo)] ), dict(Message="Output Missing, job Done, input Assigned --> Job Failed, Input Unused", ShortMessage="Output Missing --> Job Failed, Input Unused", Counter=0, Check=lambda job: job.allFilesMissing() and \ not job.otherTasks and \ job.status == 'Done' and \ job.allFilesAssigned() and \ not set(job.inputFiles).issubset(self.inputFilesProcessed) and \ job.allInputFilesExist(), Actions=lambda job, tInfo: [job.setInputUnused(tInfo), job.setJobFailed(tInfo)] ), # some files missing, needing cleanup. Only checking for # assigned, because processed could mean an earlier job was # succesful and this one is just the duplicate that needed # to be removed! But we check for other tasks earlier, so # this should not happen dict(Message="Some missing, job Failed, input Assigned --> cleanup, Input 'Unused'", ShortMessage="Output Missing --> Cleanup, Input Unused", Counter=0, Check=lambda job: job.someFilesMissing() and \ not job.otherTasks and \ job.status == 'Failed' and \ job.allFilesAssigned() and \ job.allInputFilesExist(), Actions=lambda job, tInfo: [job.cleanOutputs(tInfo), job.setInputUnused(tInfo)] ), dict(Message="Some missing, job Done, input Assigned --> cleanup, job Failed, Input 'Unused'", ShortMessage="Output Missing --> Cleanup, Job Failed, Input Unused", Counter=0, Check=lambda job: job.someFilesMissing() and \ not job.otherTasks and \ job.status == 'Done' and \ job.allFilesAssigned() and \ job.allInputFilesExist(), Actions=lambda job, tInfo: [ job.cleanOutputs(tInfo), job.setInputUnused(tInfo), job.setJobFailed(tInfo)] ), dict(Message="Some missing, job Done --> job Failed", ShortMessage="Output Missing, Done --> Job Failed", Counter=0, Check=lambda job: not job.allFilesExist() and job.status == 'Done', Actions=lambda job, tInfo: [job.setJobFailed(tInfo)] ), dict(Message="Something Strange", ShortMessage="Strange", Counter=0, Check=lambda job: job.status not in ("Failed", "Done"), Actions=lambda job, tInfo: [] ), # should always be the last one! dict(Message="Failed Hard", ShortMessage="Failed Hard", Counter=0, Check=lambda job: False, # never Actions=lambda job, tInfo: [] ), ] } self.jobCache = defaultdict(lambda: (0, 0)) # Notification options self.notesToSend = "" self.subject = "DataRecoveryAgent" self.startTime = time.time() ############################################################################# def beginExecution(self): """Resets defaults after one cycle.""" self.__getCSOptions() return S_OK() def __getCSOptions(self): """Get agent options from the CS.""" self.enabled = self.am_getOption('EnableFlag', False) self.transformationsToIgnore = self.am_getOption( 'TransformationsToIgnore', []) self.getJobInfoFromJDLOnly = self.am_getOption('JobInfoFromJDLOnly', False) self.transformationStatus = self.am_getOption('TransformationStatus', ['Active', 'Completing']) ops = Operations() extendableTTypes = set( ops.getValue('Transformations/ExtendableTransfTypes', ['MCSimulation'])) dataProcessing = set(ops.getValue('Transformations/DataProcessing', [])) self.transNoInput = self.am_getOption('TransformationsNoInput', list(extendableTTypes)) self.transWithInput = self.am_getOption( 'TransformationsWithInput', list(dataProcessing - extendableTTypes)) self.transformationTypes = self.transWithInput + self.transNoInput self.log.notice('Will treat transformations without input files', self.transNoInput) self.log.notice('Will treat transformations with input files', self.transWithInput) self.addressTo = self.am_getOption('MailTo', []) self.addressFrom = self.am_getOption('MailFrom', '') self.printEveryNJobs = self.am_getOption('PrintEvery', 200) def execute(self): """ The main execution method. """ self.log.notice("Will ignore the following transformations: %s" % self.transformationsToIgnore) self.log.notice(" Job Cache: %s " % self.jobCache) transformations = self.getEligibleTransformations( self.transformationStatus, self.transformationTypes) if not transformations['OK']: self.log.error("Failure to get transformations", transformations['Message']) return S_ERROR("Failure to get transformations") for transID, transInfoDict in transformations['Value'].iteritems(): if transID in self.transformationsToIgnore: self.log.notice('Ignoring Transformation: %s' % transID) continue self.__resetCounters() self.inputFilesProcessed = set() self.log.notice('Running over Transformation: %s' % transID) self.treatTransformation(int(transID), transInfoDict) self.sendNotification(transID, transInfoDict) return S_OK() def getEligibleTransformations(self, status, typeList): """ Select transformations of given status and type. """ res = self.tClient.getTransformations(condDict={ 'Status': status, 'Type': typeList }) if not res['OK']: return res transformations = {} for prod in res['Value']: transID = prod['TransformationID'] transformations[str(transID)] = prod return S_OK(transformations) def treatTransformation(self, transID, transInfoDict): """Run this thing for given transformation.""" tInfo = TransformationInfo(transID, transInfoDict, self.enabled, self.tClient, self.fcClient, self.jobMon) jobs, nDone, nFailed = tInfo.getJobs(statusList=self.jobStatus) if self.jobCache[transID][0] == nDone and self.jobCache[transID][ 1] == nFailed: self.log.notice( 'Skipping transformation %s because nothing changed' % transID) return self.jobCache[transID] = (nDone, nFailed) tasksDict = None lfnTaskDict = None self.startTime = time.time() if transInfoDict['Type'] in self.transWithInput: self.log.notice('Getting tasks...') tasksDict = tInfo.checkTasksStatus() lfnTaskDict = dict([(taskDict['LFN'], taskID) for taskID, taskDicts in tasksDict.items() for taskDict in taskDicts]) self.checkAllJobs(jobs, tInfo, tasksDict, lfnTaskDict) self.printSummary() def checkJob(self, job, tInfo): """Deal with the job.""" checks = self.todo[ 'NoInputFiles'] if job.tType in self.transNoInput else self.todo[ 'InputFiles'] for do in checks: self.log.verbose('Testing: ', do['Message']) if do['Check'](job): do['Counter'] += 1 self.log.notice(do['Message']) self.log.notice(job) self.notesToSend += do['Message'] + '\n' self.notesToSend += str(job) + '\n' do['Actions'](job, tInfo) return def getLFNStatus(self, jobs): """Get all the LFNs for the jobs and get their status.""" self.log.notice('Collecting LFNs...') lfnExistence = {} lfnCache = [] counter = 0 jobInfoStart = time.time() for counter, job in enumerate(jobs.values()): if counter % self.printEveryNJobs == 0: self.log.notice( 'Getting JobInfo: %d/%d: %3.1fs' % (counter, len(jobs), float(time.time() - jobInfoStart))) while True: try: job.getJobInformation(self.diracAPI, self.jobMon, jdlOnly=self.getJobInfoFromJDLOnly) lfnCache.extend(job.inputFiles) lfnCache.extend(job.outputFiles) break except RuntimeError as e: # try again self.log.error('+++++ Failure for job:', job.jobID) self.log.error('+++++ Exception: ', str(e)) timeSpent = float(time.time() - jobInfoStart) self.log.notice('Getting JobInfo Done: %3.1fs (%3.3fs per job)' % (timeSpent, timeSpent / counter)) counter = 0 fileInfoStart = time.time() for lfnChunk in breakListIntoChunks(list(lfnCache), 200): counter += 200 if counter % 1000 == 0: self.log.notice('Getting FileInfo: %d/%d: %3.1fs' % (counter, len(lfnCache), float(time.time() - fileInfoStart))) while True: try: reps = self.fcClient.exists(lfnChunk) if not reps['OK']: self.log.error( 'Failed to check file existence, try again...', reps['Message']) raise RuntimeError('Try again') statuses = reps['Value'] lfnExistence.update(statuses['Successful']) break except RuntimeError: # try again pass self.log.notice('Getting FileInfo Done: %3.1fs' % (float(time.time() - fileInfoStart))) return lfnExistence def setPendingRequests(self, jobs): """Loop over all the jobs and get requests, if any.""" for jobChunk in breakListIntoChunks(jobs.values(), 1000): jobIDs = [job.jobID for job in jobChunk] while True: result = self.reqClient.readRequestsForJobs(jobIDs) if result['OK']: break self.log.error('Failed to read requests', result['Message']) # repeat for jobID in result['Value']['Successful']: request = result['Value']['Successful'][jobID] requestID = request.RequestID dbStatus = self.reqClient.getRequestStatus(requestID).get( 'Value', 'Unknown') for job in jobChunk: if job.jobID == jobID: job.pendingRequest = dbStatus not in ('Done', 'Canceled') self.log.notice( 'Found %s request for job %d' % ('pending' if job.pendingRequest else 'finished', jobID)) break def checkAllJobs(self, jobs, tInfo, tasksDict=None, lfnTaskDict=None): """run over all jobs and do checks""" fileJobDict = defaultdict(list) counter = 0 nJobs = len(jobs) self.setPendingRequests(jobs) lfnExistence = self.getLFNStatus(jobs) self.log.notice('Running over all the jobs') jobCheckStart = time.time() for counter, job in enumerate(jobs.values()): if counter % self.printEveryNJobs == 0: self.log.notice( 'Checking Jobs %d/%d: %3.1fs' % (counter, nJobs, float(time.time() - jobCheckStart))) while True: try: if job.pendingRequest: self.log.warn('Job has Pending requests:\n%s' % job) break job.checkFileExistence(lfnExistence) if tasksDict and lfnTaskDict: try: job.getTaskInfo(tasksDict, lfnTaskDict, self.transWithInput) except TaskInfoException as e: self.log.error( " Skip Task, due to TaskInfoException: %s" % e) if not job.inputFiles and job.tType in self.transWithInput: self.__failJobHard(job, tInfo) break for inputFile in job.inputFiles: fileJobDict[inputFile].append(job.jobID) self.checkJob(job, tInfo) break # get out of the while loop except RuntimeError as e: self.log.error("+++++ Failure for job: %d " % job.jobID) self.log.error("+++++ Exception: ", str(e)) # run these again because of RuntimeError self.log.notice('Checking Jobs Done: %d/%d: %3.1fs' % (counter, nJobs, float(time.time() - jobCheckStart))) def printSummary(self): """print summary of changes""" self.log.notice("Summary:") for do in itertools.chain.from_iterable(self.todo.values()): message = "%s: %s" % (do['ShortMessage'].ljust(56), str(do['Counter']).rjust(5)) self.log.notice(message) if self.notesToSend: self.notesToSend = str(message) + '\n' + self.notesToSend def __resetCounters(self): """ reset counters for modified jobs """ for _name, checks in self.todo.iteritems(): for do in checks: do['Counter'] = 0 def __failJobHard(self, job, tInfo): """ set job to failed and remove output files if there are any """ if job.inputFiles: return if job.status in ("Failed",) \ and job.allFilesMissing(): return self.log.notice("Failing job hard %s" % job) self.notesToSend += "Failing job %s: no input file?\n" % job.jobID self.notesToSend += str(job) + '\n' self.todo['InputFiles'][-1]['Counter'] += 1 job.cleanOutputs(tInfo) job.setJobFailed(tInfo) # if job.inputFile is not None: # job.setInputDeleted(tInfo) def __notOnlyKeepers(self, transType): """check of we only have 'Keep' messages in this case we do not have to send report email or run again next time """ if transType in self.transNoInput: return True checks = self.todo['InputFiles'] totalCount = 0 for check in checks[1:]: totalCount += check['Counter'] return totalCount > 0 def sendNotification(self, transID, transInfoDict): """Send notification email if something was modified for a transformation. :param int transID: ID of given transformation :param transInfoDict: """ if not self.addressTo or not self.addressFrom or not self.notesToSend: return if not self.__notOnlyKeepers(transInfoDict['Type']): # purge notes self.notesToSend = "" return # remove from the jobCache because something happened self.jobCache.pop(int(transID), None) # send the email to recipients for address in self.addressTo: result = NotificationClient().sendMail(address, "%s: %s" % (self.subject, transID), self.notesToSend, self.addressFrom, localAttempt=False) if not result['OK']: self.log.error('Cannot send notification mail', result['Message']) # purge notes self.notesToSend = ""
from DIRAC.Core.Base import Script Script.setUsageMessage( '\n'.join( [ __doc__, 'Usage:', ' %s [option|cfgfile] <Request list>' % Script.scriptName ] ) ) if __name__ == "__main__": from DIRAC.Core.Base.Script import parseCommandLine parseCommandLine() import DIRAC requests = [] from DIRAC.RequestManagementSystem.Client.ReqClient import ReqClient reqClient = ReqClient() args = Script.getPositionalArgs() if len( args ) == 1: requests = [reqName for reqName in args[0].split( ',' ) if reqName] if not requests: DIRAC.gLogger.fatal( "Need at least one request name" ) Script.showHelp() DIRAC.exit( 1 ) for reqName in requests: reqName = reqName.strip() res = reqClient.cancelRequest( reqName ) if res['OK']:
class FileStatusTransformationAgent(AgentModule): """ FileStatusTransformationAgent """ def __init__(self, *args, **kwargs): AgentModule.__init__(self, *args, **kwargs) self.name = 'FileStatusTransformationAgent' self.enabled = False self.shifterProxy = 'DataManager' self.transformationTypes = ["Replication"] self.transformationStatuses = ["Active"] self.transformationFileStatuses = ["Assigned", "Problematic", "Processed", "Unused"] self.addressTo = ["*****@*****.**"] self.addressFrom = "*****@*****.**" self.emailSubject = "FileStatusTransformationAgent" self.accounting = defaultdict(list) self.errors = [] self.fcClient = FileCatalogClient() self.tClient = TransformationClient() self.reqClient = ReqClient() self.nClient = NotificationClient() def checkFileStatusFuncExists(self, status): """ returns True/False if a function to check transformation files with a given status exists or not """ checkFileStatusFuncName = "check_%s_files" % (status.lower()) if not (hasattr(self, checkFileStatusFuncName) and callable(getattr(self, checkFileStatusFuncName))): self.log.warn("Unable to process transformation files with status ", status) return False return True def beginExecution(self): """ Reload the configurations before every cycle """ self.enabled = self.am_getOption('EnableFlag', False) self.shifterProxy = self.am_setOption('shifterProxy', 'DataManager') self.transformationTypes = self.am_getOption('TransformationTypes', ["Replication"]) self.transformationStatuses = self.am_getOption('TransformationStatuses', ["Active"]) self.transformationFileStatuses = self.am_getOption( 'TransformationFileStatuses', ["Assigned", "Problematic", "Processed", "Unused"]) self.addressTo = self.am_getOption('MailTo', ["*****@*****.**"]) self.addressFrom = self.am_getOption('MailFrom', "*****@*****.**") self.transformationFileStatuses = filter(self.checkFileStatusFuncExists, self.transformationFileStatuses) self.accounting.clear() return S_OK() def sendNotification(self, transID, transType=None, sourceSEs=None, targetSEs=None): """ sends email notification about accounting information of a transformation """ if not(self.errors or self.accounting): return S_OK() emailBody = "Transformation ID: %s\n" % transID if transType: emailBody += "Transformation Type: %s\n" % transType if sourceSEs: emailBody += "Source SE: %s\n" % (" ".join(str(source) for source in sourceSEs)) if targetSEs: emailBody += "Target SE: %s\n\n" % (" ".join(str(target) for target in targetSEs)) rows = [] for action, transFiles in self.accounting.iteritems(): emailBody += "Total number of files with action %s: %s\n" % (action, len(transFiles)) for transFile in transFiles: rows.append([[transFile['LFN']], [str(transFile['AvailableOnSource'])], [str(transFile['AvailableOnTarget'])], [transFile['Status']], [action]]) if rows: columns = ["LFN", "Source", "Target", "Old Status", "Action"] emailBody += printTable(columns, rows, printOut=False, numbering=False, columnSeparator=' | ') if self.errors: emailBody += "\n\nErrors:" emailBody += "\n".join(self.errors) self.log.notice(emailBody) subject = "%s: %s" % (self.emailSubject, transID) for address in self.addressTo: res = self.nClient.sendMail(address, subject, emailBody, self.addressFrom, localAttempt=False) if not res['OK']: self.log.error("Failure to send Email notification to ", address) continue self.errors = [] self.accounting.clear() return S_OK() def logError(self, errStr, varMsg=''): self.log.error(errStr, varMsg) self.errors.append(errStr + varMsg) def execute(self): """ main execution loop of Agent """ res = self.getTransformations() if not res['OK']: self.log.error('Failure to get transformations', res['Message']) return S_ERROR("Failure to get transformations") transformations = res['Value'] if not transformations: self.log.notice('No transformations found with Status %s and Type %s ' % (self.transformationStatuses, self.transformationTypes)) return S_OK() self.log.notice('Will treat %d transformations' % len(transformations)) self.log.notice('Transformations: %s' % ",".join([str(transformation['TransformationID']) for transformation in transformations])) for trans in transformations: transID = trans['TransformationID'] if 'SourceSE' not in trans or not trans['SourceSE']: self.logError("SourceSE not set for transformation, skip processing, transID: ", "%s" % transID) self.sendNotification(transID) continue if 'TargetSE' not in trans or not trans['TargetSE']: self.logError("TargetSE not set for transformation, skip processing, transID: ", "%s" % transID) self.sendNotification(transID, sourceSEs=trans['SourceSE']) continue if 'DataTransType' not in trans: self.logError("Transformation Type not set for transformation, skip processing, transID: ", "%s" % transID) self.sendNotification(transID, sourceSEs=trans['SourceSE'], targetSEs=trans['TargetSE']) continue res = self.processTransformation(transID, trans['SourceSE'], trans['TargetSE'], trans['DataTransType']) if not res['OK']: self.log.error('Failure to process transformation with ID:', transID) continue return S_OK() def getTransformations(self, transID=None): """ returns transformations of a given type and status """ res = None if transID: res = self.tClient.getTransformations( condDict={'TransformationID': transID, 'Status': self.transformationStatuses, 'Type': self.transformationTypes}) else: res = self.tClient.getTransformations( condDict={'Status': self.transformationStatuses, 'Type': self.transformationTypes}) if not res['OK']: return res result = res['Value'] for trans in result: res = self.tClient.getTransformationParameters(trans['TransformationID'], ['SourceSE', 'TargetSE']) if not res['OK']: self.log.error('Failure to get SourceSE and TargetSE parameters for Transformation ID:', trans['TransformationID']) continue trans['SourceSE'] = eval(res['Value']['SourceSE']) trans['TargetSE'] = eval(res['Value']['TargetSE']) res = self.getDataTransformationType(trans['TransformationID']) if not res['OK']: self.log.error('Failure to determine Data Transformation Type', "%s: %s" % (trans['TransformationID'], res['Message'])) continue trans['DataTransType'] = res['Value'] return S_OK(result) def getRequestStatus(self, transID, taskIDs): """ returns request statuses for a given list of task IDs """ res = self.tClient.getTransformationTasks(condDict={'TransformationID': transID, 'TaskID': taskIDs}) if not res['OK']: self.log.error('Failure to get Transformation Tasks for Transformation ID:', transID) return res result = res['Value'] requestStatus = {} for task in result: requestStatus[task['TaskID']] = {'RequestStatus': task['ExternalStatus'], 'RequestID': long(task['ExternalID'])} return S_OK(requestStatus) def getDataTransformationType(self, transID): """ returns transformation types Replication/Moving/Unknown for a given transformation """ res = self.tClient.getTransformationParameters(transID, 'Body') if not res['OK']: return res # if body is empty then we assume that it is a replication transformation if not res['Value']: return S_OK(REPLICATION_TRANS) replication = False rmReplica = False try: body = json.loads(res['Value']) for operation in body: if 'ReplicateAndRegister' in operation: replication = True if 'RemoveReplica' in operation: rmReplica = True except ValueError: if 'ReplicateAndRegister' in res['Value']: replication = True if 'RemoveReplica' in res['Value']: rmReplica = True if rmReplica and replication: return S_OK(MOVING_TRANS) if replication: return S_OK(REPLICATION_TRANS) return S_ERROR("Unknown Transformation Type '%r'" % res['Value']) def setFileStatus(self, transID, transFiles, status): """ sets transformation file status """ lfns = [transFile['LFN'] for transFile in transFiles] lfnStatuses = {lfn: status for lfn in lfns} if lfnStatuses: if self.enabled: res = self.tClient.setFileStatusForTransformation(transID, newLFNsStatus=lfnStatuses, force=True) if not res['OK']: self.logError('Failed to set statuses for LFNs ', "%s" % res['Message']) return res for transFile in transFiles: self.accounting[status].append({'LFN': transFile['LFN'], 'Status': transFile['Status'], 'AvailableOnSource': transFile['AvailableOnSource'], 'AvailableOnTarget': transFile['AvailableOnTarget']}) return S_OK() def selectFailedRequests(self, transFile): """ returns True if transformation file has a failed request otherwise returns False """ res = self.getRequestStatus(transFile['TransformationID'], transFile['TaskID']) if not res['OK']: self.log.error('Failure to get Request Status for Assigned File') return False result = res['Value'] if result[transFile['TaskID']]['RequestStatus'] == 'Failed': return True return False def retryStrategyForFiles(self, transID, transFiles): """ returns retryStrategy Reset Request if a request is found in RMS, otherwise returns set file status to unused""" taskIDs = [transFile['TaskID'] for transFile in transFiles] res = self.getRequestStatus(transID, taskIDs) if not res['OK']: return res result = res['Value'] retryStrategy = defaultdict(dict) for taskID in taskIDs: if taskID is None: self.log.error("Task ID is None", "Transformation: %s\n Files: %r " % (transID, transFiles)) retryStrategy[None]['Strategy'] = SET_UNUSED continue res = self.reqClient.getRequest(requestID=result[taskID]['RequestID']) if not res['OK']: self.log.notice('Request %s does not exist setting file status to unused' % result[taskID]['RequestID']) retryStrategy[taskID]['Strategy'] = SET_UNUSED else: retryStrategy[taskID]['Strategy'] = SET_UNUSED # RESET_REQUEST retryStrategy[taskID]['RequestID'] = result[taskID]['RequestID'] return S_OK(retryStrategy) def check_assigned_files(self, actions, transFiles, transType): """ treatment for transformation files with assigned status """ for transFile in transFiles: if transFile['AvailableOnSource'] and transFile['AvailableOnTarget']: if transType == REPLICATION_TRANS: actions[SET_PROCESSED].append(transFile) if transType == MOVING_TRANS: actions[RETRY].append(transFile) elif transFile['AvailableOnSource'] and not transFile['AvailableOnTarget']: actions[RETRY].append(transFile) elif not transFile['AvailableOnSource'] and transFile['AvailableOnTarget']: actions[SET_PROCESSED].append(transFile) else: # not on src and target actions[SET_DELETED].append(transFile) def check_unused_files(self, actions, transFiles, transType): """ treatment for transformation files with unused status """ for transFile in transFiles: if not transFile['AvailableOnSource'] and transFile['AvailableOnTarget']: actions[SET_PROCESSED].append(transFile) if not transFile['AvailableOnSource'] and not transFile['AvailableOnTarget']: actions[SET_DELETED].append(transFile) def check_processed_files(self, actions, transFiles, transType): """ treatment for transformation files with processed status """ for transFile in transFiles: if transFile['AvailableOnSource'] and transFile['AvailableOnTarget'] and transType == MOVING_TRANS: actions[RETRY].append(transFile) if transFile['AvailableOnSource'] and not transFile['AvailableOnTarget']: actions[RETRY].append(transFile) if not transFile['AvailableOnSource'] and not transFile['AvailableOnTarget']: actions[SET_DELETED].append(transFile) def check_problematic_files(self, actions, transFiles, transType): """ treatment for transformation files with problematic status """ for transFile in transFiles: if transFile['AvailableOnSource'] and transFile['AvailableOnTarget']: if transType == REPLICATION_TRANS: actions[SET_PROCESSED].append(transFile) if transType == MOVING_TRANS: actions[RETRY].append(transFile) elif transFile['AvailableOnSource'] and not transFile['AvailableOnTarget']: actions[RETRY].append(transFile) elif not transFile['AvailableOnSource'] and transFile['AvailableOnTarget']: actions[SET_PROCESSED].append(transFile) else: # not available on source and target actions[SET_DELETED].append(transFile) def retryFiles(self, transID, transFiles): """ resubmits request or sets file status to unused based on the retry strategy of transformation file """ setFilesUnused = [] setFilesAssigned = [] res = self.retryStrategyForFiles(transID, transFiles) if not res['OK']: self.logError('Failure to determine retry strategy (unused / reset request) for files ', "%s" % res['Message']) return res retryStrategy = res['Value'] for transFile in transFiles: if retryStrategy[transFile['TaskID']]['Strategy'] != RESET_REQUEST: setFilesUnused.append(transFile) continue requestID = retryStrategy[transFile['TaskID']]['RequestID'] if self.enabled: res = self.reqClient.resetFailedRequest(requestID, allR=True) if not res['OK']: self.logError('Failed to reset request ', 'ReqID: %s Error: %s' % (requestID, res['Message'])) continue if res['Value'] == "Not reset": self.logError('Failed to reset request ', 'ReqID: %s is non-recoverable' % requestID) continue setFilesAssigned.append(transFile) res = self.tClient.setTaskStatus(transID, transFile['TaskID'], 'Waiting') if not res['OK']: self.logError('Failure to set Waiting status for Task ID: ', "%s %s" % (transFile['TaskID'], res['Message'])) continue self.accounting[RESET_REQUEST].append({'LFN': transFile['LFN'], 'Status': transFile['Status'], 'AvailableOnSource': transFile['AvailableOnSource'], 'AvailableOnTarget': transFile['AvailableOnTarget']}) if setFilesUnused: self.setFileStatus(transID, setFilesUnused, 'Unused') if setFilesAssigned: self.setFileStatus(transID, setFilesAssigned, 'Assigned') return S_OK() def applyActions(self, transID, actions): """ sets new file statuses and resets requests """ for action, transFiles in actions.iteritems(): if action == SET_PROCESSED and transFiles: self.setFileStatus(transID, transFiles, 'Processed') if action == SET_DELETED and transFiles: self.setFileStatus(transID, transFiles, 'Deleted') if action == RETRY and transFiles: # if there is a request in RMS then reset request otherwise set file status unused self.retryFiles(transID, transFiles) def existsInFC(self, storageElements, lfns): """ checks if files have replicas registered in File Catalog for all given storageElements """ res = self.fcClient.getReplicas(lfns) if not res['OK']: return res result = {} result['Successful'] = {} result['Failed'] = {} setOfSEs = set(storageElements) for lfn, msg in res['Value']['Failed'].iteritems(): if msg == 'No such file or directory': result['Successful'][lfn] = False else: result['Failed'][lfn] = msg # check if all replicas are registered in FC filesFoundInFC = res['Value']['Successful'] for lfn, replicas in filesFoundInFC.iteritems(): result['Successful'][lfn] = setOfSEs.issubset(replicas.keys()) return S_OK(result) def existsOnSE(self, storageElements, lfns): """ checks if the given files exist physically on a list of storage elements""" result = {} result['Failed'] = {} result['Successful'] = {} if not lfns: return S_OK(result) voName = lfns[0].split('/')[1] for se in storageElements: res = StorageElement(se, vo=voName).exists(lfns) if not res['OK']: return res for lfn, status in res['Value']['Successful'].iteritems(): if lfn not in result['Successful']: result['Successful'][lfn] = status if not status: result['Successful'][lfn] = False result['Failed'][se] = res['Value']['Failed'] return S_OK(result) def exists(self, storageElements, lfns): """ checks if files exists on both file catalog and storage elements """ fcRes = self.existsInFC(storageElements, lfns) if not fcRes['OK']: self.logError('Failure to determine if files exists in File Catalog ', "%s" % fcRes['Message']) return fcRes if fcRes['Value']['Failed']: self.logError("Failed FileCatalog Response ", "%s" % fcRes['Value']['Failed']) # check if files found in file catalog also exist on SE checkLFNsOnStorage = [lfn for lfn in fcRes['Value']['Successful'] if fcRes['Value']['Successful'][lfn]] # no files were found in FC, return the result instead of verifying them on SE if not checkLFNsOnStorage: return fcRes seRes = self.existsOnSE(storageElements, checkLFNsOnStorage) if not seRes['OK']: self.logError('Failure to determine if files exist on SE ', "%s" % seRes['Message']) return seRes for se in storageElements: if seRes['Value']['Failed'][se]: self.logError('Failed to determine if files exist on SE ', "%s %s" % (se, seRes['Value']['Failed'][se])) return S_ERROR() fcResult = fcRes['Value']['Successful'] seResult = seRes['Value']['Successful'] for lfn in fcResult: if fcResult[lfn] and not seResult[lfn]: fcRes['Value']['Successful'][lfn] = False return fcRes def processTransformation(self, transID, sourceSE, targetSEs, transType): """ process transformation for a given transformation ID """ actions = {} actions[SET_PROCESSED] = [] actions[RETRY] = [] actions[SET_DELETED] = [] for status in self.transformationFileStatuses: res = self.tClient.getTransformationFiles(condDict={'TransformationID': transID, 'Status': status}) if not res['OK']: errStr = 'Failure to get Transformation Files, Status: %s Transformation ID: %s Message: %s' % (status, transID, res['Message']) self.logError(errStr) continue transFiles = res['Value'] if not transFiles: self.log.notice("No Transformation Files found with status %s for Transformation ID %d" % (status, transID)) continue self.log.notice("Processing Transformation Files with status %s for TransformationID %d " % (status, transID)) if status == 'Assigned': transFiles = filter(self.selectFailedRequests, transFiles) lfns = [transFile['LFN'] for transFile in transFiles] if not lfns: continue res = self.exists(sourceSE, lfns) if not res['OK']: continue resultSourceSe = res['Value']['Successful'] res = self.exists(targetSEs, lfns) if not res['OK']: continue resultTargetSEs = res['Value']['Successful'] for transFile in transFiles: lfn = transFile['LFN'] transFile['AvailableOnSource'] = resultSourceSe[lfn] transFile['AvailableOnTarget'] = resultTargetSEs[lfn] checkFilesFuncName = "check_%s_files" % status.lower() checkFiles = getattr(self, checkFilesFuncName) checkFiles(actions, transFiles, transType) self.applyActions(transID, actions) self.sendNotification(transID, transType, sourceSE, targetSEs) return S_OK()
def requestClient(self): """ request client getter """ if not self.__requestClient: self.__requestClient = ReqClient() return self.__requestClient
Script.registerSwitch('', 'Full', ' Print full list of requests') Script.setUsageMessage('\n'.join([__doc__, 'Usage:', ' %s [option|cfgfile]' % Script.scriptName])) parseCommandLine() from DIRAC.RequestManagementSystem.Client.ReqClient import ReqClient if __name__ == "__main__": fullPrint = False for switch in Script.getUnprocessedSwitches(): if switch[0] == 'Full': fullPrint = True reqClient = ReqClient() for server, rpcClient in reqClient.requestProxies().iteritems(): DIRAC.gLogger.always("Checking request cache at %s" % server) reqCache = rpcClient.listCacheDir() if not reqCache['OK']: DIRAC.gLogger.error("Cannot list request cache", reqCache) continue reqCache = reqCache['Value'] if fullPrint: DIRAC.gLogger.always("List of requests", reqCache) else: DIRAC.gLogger.always("Number of requests in the cache", len(reqCache)) DIRAC.exit(0)
class RequestTasks(TaskBase): def __init__(self, transClient=None, logger=None, requestClient=None, requestClass=None, requestValidator=None): """ c'tor the requestClass is by default Request. If extensions want to use an extended type, they can pass it as a parameter. This is the same behavior as WorfkloTasks and jobClass """ if not logger: logger = gLogger.getSubLogger('RequestTasks') super(RequestTasks, self).__init__(transClient, logger) if not requestClient: self.requestClient = ReqClient() else: self.requestClient = requestClient if not requestClass: self.requestClass = Request else: self.requestClass = requestClass if not requestValidator: self.requestValidator = RequestValidator() else: self.requestValidator = requestValidator def prepareTransformationTasks(self, transBody, taskDict, owner='', ownerGroup='', ownerDN=''): """ Prepare tasks, given a taskDict, that is created (with some manipulation) by the DB """ if (not owner) or (not ownerGroup): res = getProxyInfo(False, False) if not res['OK']: return res proxyInfo = res['Value'] owner = proxyInfo['username'] ownerGroup = proxyInfo['group'] if not ownerDN: res = getDNForUsername(owner) if not res['OK']: return res ownerDN = res['Value'][0] requestOperation = 'ReplicateAndRegister' if transBody: try: _requestType, requestOperation = transBody.split(';') except AttributeError: pass for taskID in sorted(taskDict): paramDict = taskDict[taskID] if paramDict['InputData']: transID = paramDict['TransformationID'] oRequest = Request() transfer = Operation() transfer.Type = requestOperation transfer.TargetSE = paramDict['TargetSE'] if isinstance(paramDict['InputData'], list): files = paramDict['InputData'] elif isinstance(paramDict['InputData'], basestring): files = paramDict['InputData'].split(';') for lfn in files: trFile = File() trFile.LFN = lfn transfer.addFile(trFile) oRequest.addOperation(transfer) oRequest.RequestName = _requestName(transID, taskID) oRequest.OwnerDN = ownerDN oRequest.OwnerGroup = ownerGroup isValid = self.requestValidator.validate(oRequest) if not isValid['OK']: return isValid taskDict[taskID]['TaskObject'] = oRequest return S_OK(taskDict) def submitTransformationTasks(self, taskDict): """ Submit requests one by one """ submitted = 0 failed = 0 startTime = time.time() for taskID in sorted(taskDict): if not taskDict[taskID]['TaskObject']: taskDict[taskID]['Success'] = False failed += 1 continue res = self.submitTaskToExternal(taskDict[taskID]['TaskObject']) if res['OK']: taskDict[taskID]['ExternalID'] = res['Value'] taskDict[taskID]['Success'] = True submitted += 1 else: self._logError("Failed to submit task to RMS", res['Message']) taskDict[taskID]['Success'] = False failed += 1 self._logInfo( 'submitTasks: Submitted %d tasks to RMS in %.1f seconds' % (submitted, time.time() - startTime)) if failed: self._logWarn( 'submitTasks: But at the same time failed to submit %d tasks to RMS.' % (failed)) return S_OK(taskDict) def submitTaskToExternal(self, oRequest): """ Submits a request using ReqClient """ if isinstance(oRequest, self.requestClass): return self.requestClient.putRequest(oRequest) else: return S_ERROR("Request should be a Request object") def updateTransformationReservedTasks(self, taskDicts): requestNameIDs = {} noTasks = [] for taskDict in taskDicts: requestName = _requestName(taskDict['TransformationID'], taskDict['TaskID']) reqID = taskDict['ExternalID'] if reqID: requestNameIDs[requestName] = reqID else: noTasks.append(requestName) return S_OK({'NoTasks': noTasks, 'TaskNameIDs': requestNameIDs}) def getSubmittedTaskStatus(self, taskDicts): updateDict = {} for taskDict in taskDicts: oldStatus = taskDict['ExternalStatus'] newStatus = self.requestClient.getRequestStatus( taskDict['ExternalID']) if not newStatus['OK']: log = self._logVerbose if 'not exist' in newStatus[ 'Message'] else self.log.warn log( "getSubmittedTaskStatus: Failed to get requestID for request", '%s' % newStatus['Message']) else: newStatus = newStatus['Value'] if newStatus != oldStatus: updateDict.setdefault(newStatus, []).append(taskDict['TaskID']) return S_OK(updateDict) def getSubmittedFileStatus(self, fileDicts): taskFiles = {} submittedTasks = {} externalIds = {} # Don't try and get status of not submitted tasks! for fileDict in fileDicts: submittedTasks.setdefault(fileDict['TransformationID'], set()).add(int(fileDict['TaskID'])) for transID in submittedTasks: res = self.transClient.getTransformationTasks({ 'TransformationID': transID, 'TaskID': list(submittedTasks[transID]) }) if not res['OK']: return res for taskDict in res['Value']: taskID = taskDict['TaskID'] externalIds[taskID] = taskDict['ExternalID'] if taskDict['ExternalStatus'] == 'Created': submittedTasks[transID].remove(taskID) for fileDict in fileDicts: transID = fileDict['TransformationID'] taskID = int(fileDict['TaskID']) if taskID in submittedTasks[transID]: requestID = externalIds[taskID] taskFiles.setdefault(requestID, {})[fileDict['LFN']] = fileDict['Status'] updateDict = {} for requestID in sorted(taskFiles): lfnDict = taskFiles[requestID] statusDict = self.requestClient.getRequestFileStatus( requestID, lfnDict.keys()) if not statusDict['OK']: log = self._logVerbose if 'not exist' in statusDict[ 'Message'] else self.log.warn log( "getSubmittedFileStatus: Failed to get files status for request", '%s' % statusDict['Message']) continue statusDict = statusDict['Value'] for lfn, newStatus in statusDict.items(): if newStatus == lfnDict[lfn]: pass elif newStatus == 'Done': updateDict[lfn] = 'Processed' elif newStatus == 'Failed': updateDict[lfn] = 'Problematic' return S_OK(updateDict)
class FTS3Operation(FTS3Serializable): """ Abstract class to represent an operation to be executed by FTS. It is a container for FTSFiles, as well as for FTSJobs. There can be a mapping between one FTS3Operation and one RMS Operation. The FTS3Operation takes care of generating the appropriate FTSJobs, and to perform a callback when the work with FTS is over. The actual generation and callback depends on the subclass. This class should not be instantiated directly, but rather one of its subclass """ ALL_STATES = [ 'Active', # Default state until FTS has done everything 'Processed', # Interactions with FTS done, but callback not done 'Finished', # Everything was done 'Canceled', # Canceled by the user 'Failed', # I don't know yet ] FINAL_STATES = ['Finished', 'Canceled', 'Failed'] INIT_STATE = 'Active' _attrToSerialize = [ 'operationID', 'username', 'userGroup', 'rmsReqID', 'rmsOpID', 'sourceSEs', 'ftsFiles', 'activity', 'priority', 'ftsJobs', 'creationTime', 'lastUpdate', 'error', 'status' ] def __init__(self, ftsFiles=None, username=None, userGroup=None, rmsReqID=-1, rmsOpID=0, sourceSEs=None, activity=None, priority=None): """ :param ftsFiles: list of FTS3Files object that belongs to the operation :param username: username whose proxy should be used :param userGroup: group that should be used with username :param rmsReqID: ID of the Request in the RMS system :param rmsOpID: ID of the Operation in the RMS system :param sourceSEs: list of SE to be used as source (if applicable) :param activity: FTS activity to use :param priority: FTS priority to use """ ############################ # persistent attributes self.username = username self.userGroup = userGroup self.rmsReqID = rmsReqID self.rmsOpID = rmsOpID if isinstance(sourceSEs, list): sourceSEs = ','.join(sourceSEs) self.sourceSEs = sourceSEs self.ftsFiles = ftsFiles if ftsFiles else [] self.activity = activity self.priority = priority self.ftsJobs = [] now = datetime.datetime.utcnow().replace(microsecond=0) self.creationTime = now self.lastUpdate = now self.error = None self.status = FTS3Operation.INIT_STATE ######################## self.reqClient = None self.dManager = None self._log = None self.init_on_load() @orm.reconstructor def init_on_load(self): """ This method initializes some attributes. It is called by sqlalchemy (which does not call __init__) """ self._vo = None self.dManager = DataManager() self.rssClient = ResourceStatus() opID = getattr(self, 'operationID', None) loggerName = '%s/' % opID if opID else '' loggerName += 'req_%s/op_%s' % (self.rmsReqID, self.rmsOpID) self._log = gLogger.getSubLogger(loggerName, True) @property def vo(self): """:returns: return vo of the usergroup """ if self._vo: return self._vo if self.userGroup: self._vo = getVOForGroup(self.userGroup) return self._vo def isTotallyProcessed(self): """ Returns True if and only if there is nothing else to be done by FTS for this operation. All files are successful or definitely failed """ if self.status == 'Processed': return True fileStatuses = set([f.status for f in self.ftsFiles]) # If all the files are in a final state if fileStatuses <= set(FTS3File.FINAL_STATES): self.status = 'Processed' return True return False def _getFilesToSubmit(self, maxAttemptsPerFile=10): """ Return the list of FTS3files that can be submitted Either because they never were submitted, or because we can make more attempts :param maxAttemptsPerFile: the maximum number of attempts to be tried for a file :return: List of FTS3File to submit """ toSubmit = [] for ftsFile in self.ftsFiles: if ftsFile.attempt >= maxAttemptsPerFile: ftsFile.status = 'Defunct' # The file was never submitted or # The file failed from the point of view of FTS # but no more than the maxAttemptsPerFile elif ftsFile.status in [FTS3File.INIT_STATE ] + FTS3File.FTS_FAILED_STATES: toSubmit.append(ftsFile) return toSubmit @staticmethod def _checkSEAccess(seName, accessType, vo=None): """Check the Status of a storage element :param seName: name of the StorageElement :param accessType ReadAccess, WriteAccess,CheckAccess,RemoveAccess :return: S_ERROR if not allowed or error, S_OK() otherwise """ # Check that the target is writable # access = self.rssClient.getStorageElementStatus( seName, accessType ) # if not access["OK"]: # return access # if access["Value"][seName][accessType] not in ( "Active", "Degraded" ): # return S_ERROR( "%s does not have %s in Active or Degraded" % ( seName, accessType ) ) status = StorageElement(seName, vo=vo).getStatus() if not status['OK']: return status status = status['Value'] accessType = accessType.replace('Access', '') if not status[accessType]: return S_ERROR( errno.EACCES, "%s does not have %s in Active or Degraded" % (seName, accessType)) return S_OK() def _createNewJob(self, jobType, ftsFiles, targetSE, sourceSE=None): """ Create a new FTS3Job object :param jobType: type of job to create (Transfer, Staging, Removal) :param ftsFiles: list of FTS3File objects the job has to work on :param targetSE: SE on which to operate :param sourceSE: source SE, only useful for Transfer jobs :return: FTS3Job object """ newJob = FTS3Job() newJob.type = jobType newJob.sourceSE = sourceSE newJob.targetSE = targetSE newJob.activity = self.activity newJob.priority = self.priority newJob.username = self.username newJob.userGroup = self.userGroup newJob.vo = self.vo newJob.filesToSubmit = ftsFiles newJob.operationID = getattr(self, 'operationID') return newJob def _callback(self): """Actually performs the callback """ raise NotImplementedError("You should not be using the base class") def callback(self): """ Trigger the callback once all the FTS interactions are done and update the status of the Operation to 'Finished' if successful """ self.reqClient = ReqClient() res = self._callback() if res['OK']: self.status = 'Finished' return res def prepareNewJobs(self, maxFilesPerJob=100, maxAttemptsPerFile=10): """ Prepare the new jobs that have to be submitted :param maxFilesPerJob: maximum number of files assigned to a job :param maxAttemptsPerFile: maximum number of retry after an fts failure :return: list of jobs """ raise NotImplementedError("You should not be using the base class") def _updateRmsOperationStatus(self): """ Update the status of the Files in the rms operation :return: S_OK with a dict: * request: rms Request object * operation: rms Operation object * ftsFilesByTarget: dict {SE: [ftsFiles that were successful]} """ log = self._log.getSubLogger( "_updateRmsOperationStatus/%s/%s" % (getattr(self, 'operationID'), self.rmsReqID), child=True) res = self.reqClient.getRequest(self.rmsReqID) if not res['OK']: return res request = res['Value'] res = request.getWaiting() if not res["OK"]: log.error("Unable to find 'Scheduled' operation in request") res = self.reqClient.putRequest(request, useFailoverProxy=False, retryMainService=3) if not res['OK']: log.error("Could not put back the request !", res['Message']) return S_ERROR("Could not find scheduled operation") operation = res['Value'] # We index the files of the operation by their IDs rmsFileIDs = {} for opFile in operation: rmsFileIDs[opFile.FileID] = opFile # Files that failed to transfer defunctRmsFileIDs = set() # { SE : [FTS3Files] } ftsFilesByTarget = {} for ftsFile in self.ftsFiles: if ftsFile.status == 'Defunct': log.info( "File failed to transfer, setting it to failed in RMS", "%s %s" % (ftsFile.lfn, ftsFile.targetSE)) defunctRmsFileIDs.add(ftsFile.rmsFileID) continue if ftsFile.status == 'Canceled': log.info("File canceled, setting it Failed in RMS", "%s %s" % (ftsFile.lfn, ftsFile.targetSE)) defunctRmsFileIDs.add(ftsFile.rmsFileID) continue # SHOULD NEVER HAPPEN ! if ftsFile.status != 'Finished': log.error("Callback called with file in non terminal state", "%s %s" % (ftsFile.lfn, ftsFile.targetSE)) res = self.reqClient.putRequest(request, useFailoverProxy=False, retryMainService=3) if not res['OK']: log.error("Could not put back the request !", res['Message']) return S_ERROR( "Callback called with file in non terminal state") ftsFilesByTarget.setdefault(ftsFile.targetSE, []).append(ftsFile) # Now, we set the rmsFile as done in the operation, providing # that they are not in the defunctFiles. # We cannot do this in the previous list because in the FTS system, # each destination is a separate line in the DB but not in the RMS for ftsFile in self.ftsFiles: opFile = rmsFileIDs[ftsFile.rmsFileID] opFile.Status = 'Failed' if ftsFile.rmsFileID in defunctRmsFileIDs else 'Done' return S_OK({ 'request': request, 'operation': operation, 'ftsFilesByTarget': ftsFilesByTarget }) @classmethod def fromRMSObjects(cls, rmsReq, rmsOp, username): """ Construct an FTS3Operation object from the RMS Request and Operation corresponding. The attributes taken are the OwnerGroup, Request and Operation IDS, sourceSE, and activity and priority if they are defined in the Argument field of the operation :param rmsReq: RMS Request object :param rmsOp: RMS Operation object :param username: username to which associate the FTS3Operation (normally comes from the Req OwnerDN) :returns: FTS3Operation object """ ftsOp = cls() ftsOp.username = username ftsOp.userGroup = rmsReq.OwnerGroup ftsOp.rmsReqID = rmsReq.RequestID ftsOp.rmsOpID = rmsOp.OperationID ftsOp.sourceSEs = rmsOp.SourceSE try: argumentDic = json.loads(rmsOp.Arguments) ftsOp.activity = argumentDic['activity'] ftsOp.priority = argumentDic['priority'] except Exception as _e: pass return ftsOp
'Arguments:', ' requestName: a request name' ] ) ) # # execution if __name__ == "__main__": from DIRAC.Core.Base.Script import parseCommandLine parseCommandLine() import DIRAC from DIRAC import gLogger resetFailed = False requests = [] jobs = [] all = False from DIRAC.RequestManagementSystem.Client.ReqClient import ReqClient reqClient = ReqClient() for switch in Script.getUnprocessedSwitches(): if switch[0] == 'Failed': resetFailed = True elif switch[0] == 'All': all = True elif switch[0] == 'Maximum': try: maxReset = int( switch[1] ) except: pass elif switch[0] == 'Job': try: jobs = [int( job ) for job in switch[1].split( ',' )] except: print "Invalid jobID", switch[1]
class RequestTask(object): """ .. class:: RequestTask request's processing task """ def __init__(self, requestJSON, handlersDict, csPath, agentName, standalone=False, requestClient=None): """c'tor :param self: self reference :param str requestJSON: request serialized to JSON :param dict opHandlers: operation handlers """ self.request = Request(requestJSON) # # csPath self.csPath = csPath # # agent name self.agentName = agentName # # standalone flag self.standalone = standalone # # handlers dict self.handlersDict = handlersDict # # handlers class def self.handlers = {} # # own sublogger self.log = gLogger.getSubLogger( "pid_%s/%s" % (os.getpid(), self.request.RequestName)) # # get shifters info self.__managersDict = {} shifterProxies = self.__setupManagerProxies() if not shifterProxies["OK"]: self.log.error(shifterProxies["Message"]) # # initialize gMonitor gMonitor.setComponentType(gMonitor.COMPONENT_AGENT) gMonitor.setComponentName(self.agentName) gMonitor.initialize() # # own gMonitor activities gMonitor.registerActivity("RequestAtt", "Requests processed", "RequestExecutingAgent", "Requests/min", gMonitor.OP_SUM) gMonitor.registerActivity("RequestFail", "Requests failed", "RequestExecutingAgent", "Requests/min", gMonitor.OP_SUM) gMonitor.registerActivity("RequestOK", "Requests done", "RequestExecutingAgent", "Requests/min", gMonitor.OP_SUM) if requestClient is None: self.requestClient = ReqClient() else: self.requestClient = requestClient def __setupManagerProxies(self): """ setup grid proxy for all defined managers """ oHelper = Operations() shifters = oHelper.getSections("Shifter") if not shifters["OK"]: self.log.error(shifters["Message"]) return shifters shifters = shifters["Value"] for shifter in shifters: shifterDict = oHelper.getOptionsDict("Shifter/%s" % shifter) if not shifterDict["OK"]: self.log.error(shifterDict["Message"]) continue userName = shifterDict["Value"].get("User", "") userGroup = shifterDict["Value"].get("Group", "") userDN = CS.getDNForUsername(userName) if not userDN["OK"]: self.log.error(userDN["Message"]) continue userDN = userDN["Value"][0] vomsAttr = CS.getVOMSAttributeForGroup(userGroup) if vomsAttr: self.log.debug( "getting VOMS [%s] proxy for shifter %s@%s (%s)" % (vomsAttr, userName, userGroup, userDN)) getProxy = gProxyManager.downloadVOMSProxyToFile( userDN, userGroup, requiredTimeLeft=1200, cacheTime=4 * 43200) else: self.log.debug("getting proxy for shifter %s@%s (%s)" % (userName, userGroup, userDN)) getProxy = gProxyManager.downloadProxyToFile( userDN, userGroup, requiredTimeLeft=1200, cacheTime=4 * 43200) if not getProxy["OK"]: self.log.error(getProxy["Message"]) return S_ERROR("unable to setup shifter proxy for %s: %s" % (shifter, getProxy["Message"])) chain = getProxy["chain"] fileName = getProxy["Value"] self.log.debug("got %s: %s %s" % (shifter, userName, userGroup)) self.__managersDict[shifter] = { "ShifterDN": userDN, "ShifterName": userName, "ShifterGroup": userGroup, "Chain": chain, "ProxyFile": fileName } return S_OK() def setupProxy(self): """ download and dump request owner proxy to file and env :return: S_OK with name of newly created owner proxy file and shifter name if any """ self.__managersDict = {} shifterProxies = self.__setupManagerProxies() if not shifterProxies["OK"]: self.log.error(shifterProxies["Message"]) ownerDN = self.request.OwnerDN ownerGroup = self.request.OwnerGroup isShifter = [] for shifter, creds in self.__managersDict.items(): if creds["ShifterDN"] == ownerDN and creds[ "ShifterGroup"] == ownerGroup: isShifter.append(shifter) if isShifter: proxyFile = self.__managersDict[isShifter[0]]["ProxyFile"] os.environ["X509_USER_PROXY"] = proxyFile return S_OK({"Shifter": isShifter, "ProxyFile": proxyFile}) # # if we're here owner is not a shifter at all ownerProxyFile = gProxyManager.downloadVOMSProxyToFile( ownerDN, ownerGroup) if not ownerProxyFile["OK"] or not ownerProxyFile["Value"]: reason = ownerProxyFile.get( "Message", "No valid proxy found in ProxyManager.") return S_ERROR("Change proxy error for '%s'@'%s': %s" % (ownerDN, ownerGroup, reason)) ownerProxyFile = ownerProxyFile["Value"] os.environ["X509_USER_PROXY"] = ownerProxyFile return S_OK({"Shifter": isShifter, "ProxyFile": ownerProxyFile}) @staticmethod def getPluginName(pluginPath): if not pluginPath: return '' if "/" in pluginPath: pluginPath = ".".join( [chunk for chunk in pluginPath.split("/") if chunk]) return pluginPath.split(".")[-1] @staticmethod def loadHandler(pluginPath): """ Create an instance of requested plugin class, loading and importing it when needed. This function could raise ImportError when plugin cannot be find or TypeError when loaded class object isn't inherited from BaseOperation class. :param str pluginName: dotted path to plugin, specified as in import statement, i.e. "DIRAC.CheesShopSystem.private.Cheddar" or alternatively in 'normal' path format "DIRAC/CheesShopSystem/private/Cheddar" :return: object instance This function try to load and instantiate an object from given path. It is assumed that: * `pluginPath` is pointing to module directory "importable" by python interpreter, i.e.: it's package's top level directory is in $PYTHONPATH env variable, * the module should consist a class definition following module name, * the class itself is inherited from DIRAC.RequestManagementSystem.private.BaseOperation.BaseOperation If above conditions aren't meet, function is throwing exceptions: :raises ImportError: when class cannot be imported :raises TypeError: when class isn't inherited from OperationHandlerBase """ if "/" in pluginPath: pluginPath = ".".join( [chunk for chunk in pluginPath.split("/") if chunk]) pluginName = pluginPath.split(".")[-1] if pluginName not in globals(): mod = __import__(pluginPath, globals(), fromlist=[pluginName]) pluginClassObj = getattr(mod, pluginName) else: pluginClassObj = globals()[pluginName] if not issubclass(pluginClassObj, OperationHandlerBase): raise TypeError( "operation handler '%s' isn't inherited from OperationHandlerBase class" % pluginName) for key, status in (("Att", "Attempted"), ("OK", "Successful"), ("Fail", "Failed")): gMonitor.registerActivity( "%s%s" % (pluginName, key), "%s operations %s" % (pluginName, status), "RequestExecutingAgent", "Operations/min", gMonitor.OP_SUM) # # return an instance return pluginClassObj def getHandler(self, operation): """ return instance of a handler for a given operation type on demand all created handlers are kept in self.handlers dict for further use :param ~Operation.Operation operation: Operation instance """ if operation.Type not in self.handlersDict: return S_ERROR("handler for operation '%s' not set" % operation.Type) handler = self.handlers.get(operation.Type, None) if not handler: try: handlerCls = self.loadHandler( self.handlersDict[operation.Type]) self.handlers[operation.Type] = handlerCls( csPath="%s/OperationHandlers/%s" % (self.csPath, operation.Type)) handler = self.handlers[operation.Type] except (ImportError, TypeError) as error: self.log.exception("getHandler: %s" % str(error), lException=error) return S_ERROR(str(error)) # # set operation for this handler handler.setOperation(operation) # # and return return S_OK(handler) def updateRequest(self): """ put back request to the RequestDB """ updateRequest = self.requestClient.putRequest(self.request, useFailoverProxy=False, retryMainService=2) if not updateRequest["OK"]: self.log.error(updateRequest["Message"]) return updateRequest def __call__(self): """ request processing """ self.log.debug("about to execute request") gMonitor.addMark("RequestAtt", 1) # # setup proxy for request owner setupProxy = self.setupProxy() if not setupProxy["OK"]: self.request.Error = setupProxy["Message"] if 'has no proxy registered' in setupProxy["Message"]: self.log.error('Request set to Failed:', setupProxy["Message"]) # If user is no longer registered, fail the request for operation in self.request: for opFile in operation: opFile.Status = 'Failed' operation.Status = 'Failed' else: self.log.error(setupProxy["Message"]) return S_OK(self.request) shifter = setupProxy["Value"]["Shifter"] proxyFile = setupProxy["Value"]["ProxyFile"] error = None while self.request.Status == "Waiting": # # get waiting operation operation = self.request.getWaiting() if not operation["OK"]: self.log.error(operation["Message"]) return operation operation = operation["Value"] self.log.info("executing operation #%s '%s'" % (operation.Order, operation.Type)) # # and handler for it handler = self.getHandler(operation) if not handler["OK"]: self.log.error("unable to process operation %s: %s" % (operation.Type, handler["Message"])) # gMonitor.addMark( "%s%s" % ( operation.Type, "Fail" ), 1 ) operation.Error = handler["Message"] break handler = handler["Value"] # # set shifters list in the handler handler.shifter = shifter # # and execute pluginName = self.getPluginName( self.handlersDict.get(operation.Type)) if self.standalone: useServerCertificate = gConfig.useServerCertificate() else: # Always use server certificates if executed within an agent useServerCertificate = True try: if pluginName: gMonitor.addMark("%s%s" % (pluginName, "Att"), 1) # Always use request owner proxy if useServerCertificate: gConfigurationData.setOptionInCFG( '/DIRAC/Security/UseServerCertificate', 'false') exe = handler() if useServerCertificate: gConfigurationData.setOptionInCFG( '/DIRAC/Security/UseServerCertificate', 'true') if not exe["OK"]: self.log.error("unable to process operation %s: %s" % (operation.Type, exe["Message"])) if pluginName: gMonitor.addMark("%s%s" % (pluginName, "Fail"), 1) gMonitor.addMark("RequestFail", 1) if self.request.JobID: # Check if the job exists monitorServer = RPCClient( "WorkloadManagement/JobMonitoring", useCertificates=True) res = monitorServer.getJobPrimarySummary( int(self.request.JobID)) if not res["OK"]: self.log.error( "RequestTask: Failed to get job %d status" % self.request.JobID) elif not res['Value']: self.log.warn( "RequestTask: job %d does not exist (anymore): failed request" % self.request.JobID) for opFile in operation: opFile.Status = 'Failed' if operation.Status != 'Failed': operation.Status = 'Failed' self.request.Error = 'Job no longer exists' except Exception as error: self.log.exception("hit by exception: %s" % str(error)) if pluginName: gMonitor.addMark("%s%s" % (pluginName, "Fail"), 1) gMonitor.addMark("RequestFail", 1) if useServerCertificate: gConfigurationData.setOptionInCFG( '/DIRAC/Security/UseServerCertificate', 'true') break # # operation status check if operation.Status == "Done" and pluginName: gMonitor.addMark("%s%s" % (pluginName, "OK"), 1) elif operation.Status == "Failed" and pluginName: gMonitor.addMark("%s%s" % (pluginName, "Fail"), 1) elif operation.Status in ("Waiting", "Scheduled"): # # no update for waiting or all files scheduled break gMonitor.flush() if error: return S_ERROR(error) # # request done? if self.request.Status == "Done": # # update request to the RequestDB self.log.info('updating request with status %s' % self.request.Status) update = self.updateRequest() if not update["OK"]: self.log.error(update["Message"]) return update self.log.info("request '%s' is done" % self.request.RequestName) gMonitor.addMark("RequestOK", 1) # # and there is a job waiting for it? finalize! if self.request.JobID: attempts = 0 while True: finalizeRequest = self.requestClient.finalizeRequest( self.request.RequestID, self.request.JobID) # pylint: disable=no-member if not finalizeRequest["OK"]: if not attempts: self.log.error( "unable to finalize request %s: %s, will retry" % (self.request.RequestName, finalizeRequest["Message"])) self.log.verbose("Waiting 10 seconds") attempts += 1 if attempts == 10: self.log.error( "giving up finalize request after %d attempts" % attempts) return S_ERROR('Could not finalize request') time.sleep(10) else: self.log.info("request '%s' is finalized%s" % (self.request.RequestName, (' after %d attempts' % attempts) if attempts else '')) break # Request will be updated by the callBack method self.log.verbose("RequestTasks exiting, request %s" % self.request.Status) return S_OK(self.request)
class RequestTasks(TaskBase): """ Class for handling tasks for the RMS """ def __init__( self, transClient=None, logger=None, requestClient=None, requestClass=None, requestValidator=None, ownerDN=None, ownerGroup=None, ): """c'tor the requestClass is by default Request. If extensions want to use an extended type, they can pass it as a parameter. This is the same behavior as WorfkloTasks and jobClass """ if not logger: logger = gLogger.getSubLogger(self.__class__.__name__) super(RequestTasks, self).__init__(transClient, logger) useCertificates = True if (bool(ownerDN) and bool(ownerGroup)) else False if not requestClient: self.requestClient = ReqClient(useCertificates=useCertificates, delegatedDN=ownerDN, delegatedGroup=ownerGroup) else: self.requestClient = requestClient if not requestClass: self.requestClass = Request else: self.requestClass = requestClass if not requestValidator: self.requestValidator = RequestValidator() else: self.requestValidator = requestValidator def prepareTransformationTasks(self, transBody, taskDict, owner="", ownerGroup="", ownerDN="", bulkSubmissionFlag=False): """Prepare tasks, given a taskDict, that is created (with some manipulation) by the DB""" if not taskDict: return S_OK({}) if (not owner) or (not ownerGroup): res = getProxyInfo(False, False) if not res["OK"]: return res proxyInfo = res["Value"] owner = proxyInfo["username"] ownerGroup = proxyInfo["group"] if not ownerDN: res = getDNForUsername(owner) if not res["OK"]: return res ownerDN = res["Value"][0] try: transJson, _decLen = decode(transBody) if isinstance(transJson, BaseBody): self._bodyPlugins(transJson, taskDict, ownerDN, ownerGroup) else: self._multiOperationsBody(transJson, taskDict, ownerDN, ownerGroup) except ValueError: # #json couldn't load self._singleOperationsBody(transBody, taskDict, ownerDN, ownerGroup) return S_OK(taskDict) def _multiOperationsBody(self, transJson, taskDict, ownerDN, ownerGroup): """Deal with a Request that has multiple operations :param transJson: list of lists of string and dictionaries, e.g.: .. code :: python body = [ ( "ReplicateAndRegister", { "SourceSE":"FOO-SRM", "TargetSE":"TASK:TargetSE" }), ( "RemoveReplica", { "TargetSE":"FOO-SRM" } ), ] If a value of an operation parameter in the body starts with ``TASK:``, we take it from the taskDict. For example ``TASK:TargetSE`` is replaced with ``task['TargetSE']`` :param dict taskDict: dictionary of tasks, modified in this function :param str ownerDN: certificate DN used for the requests :param str onwerGroup: dirac group used for the requests :returns: None """ for taskID, task in list(taskDict.items()): try: transID = task["TransformationID"] if not task.get("InputData"): raise StopTaskIteration("No input data") files = [] oRequest = Request() if isinstance(task["InputData"], list): files = task["InputData"] elif isinstance(task["InputData"], six.string_types): files = task["InputData"].split(";") # create the operations from the json structure for operationTuple in transJson: op = Operation() op.Type = operationTuple[0] for parameter, value in operationTuple[1].items(): # Here we massage a bit the body to replace some parameters # with what we have in the task. try: taskKey = value.split("TASK:")[1] value = task[taskKey] # Either the attribute is not a string (AttributeError) # or it does not start with 'TASK:' (IndexError) except (AttributeError, IndexError): pass # That happens when the requested substitution is not # a key in the task, and that's a problem except KeyError: raise StopTaskIteration( "Parameter %s does not exist in taskDict" % taskKey) setattr(op, parameter, value) for lfn in files: opFile = File() opFile.LFN = lfn op.addFile(opFile) oRequest.addOperation(op) result = self._assignRequestToTask(oRequest, taskDict, transID, taskID, ownerDN, ownerGroup) if not result["OK"]: raise StopTaskIteration( "Could not assign request to task: %s" % result["Message"]) except StopTaskIteration as e: self._logError("Error creating request for task", "%s, %s" % (taskID, e), transID=transID) taskDict.pop(taskID) def _singleOperationsBody(self, transBody, taskDict, ownerDN, ownerGroup): """deal with a Request that has just one operation, as it was sofar :param transBody: string, can be an empty string :param dict taskDict: dictionary of tasks, modified in this function :param str ownerDN: certificate DN used for the requests :param str onwerGroup: dirac group used for the requests :returns: None """ requestOperation = "ReplicateAndRegister" if transBody: try: _requestType, requestOperation = transBody.split(";") except AttributeError: pass failedTasks = [] # Do not remove sorted, we might pop elements in the loop for taskID, task in taskDict.items(): transID = task["TransformationID"] oRequest = Request() transfer = Operation() transfer.Type = requestOperation transfer.TargetSE = task["TargetSE"] # If there are input files if task.get("InputData"): if isinstance(task["InputData"], list): files = task["InputData"] elif isinstance(task["InputData"], six.string_types): files = task["InputData"].split(";") for lfn in files: trFile = File() trFile.LFN = lfn transfer.addFile(trFile) oRequest.addOperation(transfer) result = self._assignRequestToTask(oRequest, taskDict, transID, taskID, ownerDN, ownerGroup) if not result["OK"]: failedTasks.append(taskID) # Remove failed tasks for taskID in failedTasks: taskDict.pop(taskID) def _bodyPlugins(self, bodyObj, taskDict, ownerDN, ownerGroup): """Deal with complex body object""" for taskID, task in list(taskDict.items()): try: transID = task["TransformationID"] if not task.get("InputData"): raise StopTaskIteration("No input data") oRequest = bodyObj.taskToRequest(taskID, task, transID) result = self._assignRequestToTask(oRequest, taskDict, transID, taskID, ownerDN, ownerGroup) if not result["OK"]: raise StopTaskIteration( "Could not assign request to task: %s" % result["Message"]) except StopTaskIteration as e: self._logError("Error creating request for task", "%s, %s" % (taskID, e), transID=transID) taskDict.pop(taskID) def _assignRequestToTask(self, oRequest, taskDict, transID, taskID, ownerDN, ownerGroup): """set ownerDN and group to request, and add the request to taskDict if it is valid, otherwise remove the task from the taskDict :param oRequest: Request :param dict taskDict: dictionary of tasks, modified in this function :param int transID: Transformation ID :param int taskID: Task ID :param str ownerDN: certificate DN used for the requests :param str onwerGroup: dirac group used for the requests :returns: None """ oRequest.RequestName = self._transTaskName(transID, taskID) oRequest.OwnerDN = ownerDN oRequest.OwnerGroup = ownerGroup isValid = self.requestValidator.validate(oRequest) if not isValid["OK"]: self._logError("Error creating request for task", "%s %s" % (taskID, isValid), transID=transID) return S_ERROR("Error creating request") taskDict[taskID]["TaskObject"] = oRequest return S_OK() def submitTransformationTasks(self, taskDict): """Submit requests one by one""" submitted = 0 failed = 0 startTime = time.time() method = "submitTransformationTasks" for task in taskDict.values(): # transID is the same for all tasks, so pick it up every time here transID = task["TransformationID"] if not task["TaskObject"]: task["Success"] = False failed += 1 continue res = self.submitTaskToExternal(task["TaskObject"]) if res["OK"]: task["ExternalID"] = res["Value"] task["Success"] = True submitted += 1 else: self._logError("Failed to submit task to RMS", res["Message"], transID=transID) task["Success"] = False failed += 1 if submitted: self._logInfo( "Submitted %d tasks to RMS in %.1f seconds" % (submitted, time.time() - startTime), transID=transID, method=method, ) if failed: self._logWarn("Failed to submit %d tasks to RMS." % (failed), transID=transID, method=method) return S_OK(taskDict) def submitTaskToExternal(self, oRequest): """ Submits a request to RMS """ if isinstance(oRequest, self.requestClass): return self.requestClient.putRequest(oRequest, useFailoverProxy=False, retryMainService=2) return S_ERROR("Request should be a Request object") def updateTransformationReservedTasks(self, taskDicts): requestNameIDs = {} noTasks = [] for taskDict in taskDicts: requestName = self._transTaskName(taskDict["TransformationID"], taskDict["TaskID"]) reqID = taskDict["ExternalID"] if reqID and int(reqID): requestNameIDs[requestName] = reqID else: noTasks.append(requestName) return S_OK({"NoTasks": noTasks, "TaskNameIDs": requestNameIDs}) def getSubmittedTaskStatus(self, taskDicts): """ Check if tasks changed status, and return a list of tasks per new status """ updateDict = {} badRequestID = 0 for taskDict in taskDicts: oldStatus = taskDict["ExternalStatus"] # ExternalID is normally a string if taskDict["ExternalID"] and int(taskDict["ExternalID"]): newStatus = self.requestClient.getRequestStatus( taskDict["ExternalID"]) if not newStatus["OK"]: log = self._logVerbose if "not exist" in newStatus[ "Message"] else self._logWarn log( "getSubmittedTaskStatus: Failed to get requestID for request", newStatus["Message"], transID=taskDict["TransformationID"], ) else: newStatus = newStatus["Value"] # We don't care updating the tasks to Assigned while the request is being processed if newStatus != oldStatus and newStatus != "Assigned": updateDict.setdefault(newStatus, []).append(taskDict["TaskID"]) else: badRequestID += 1 if badRequestID: self._logWarn("%d requests have identifier 0" % badRequestID) return S_OK(updateDict) def getSubmittedFileStatus(self, fileDicts): """ Check if transformation files changed status, and return a list of taskIDs per new status """ # Don't try and get status of not submitted tasks! transID = None taskFiles = {} for fileDict in fileDicts: # There is only one transformation involved, get however the transID in the loop transID = fileDict["TransformationID"] taskID = int(fileDict["TaskID"]) taskFiles.setdefault(taskID, []).append(fileDict["LFN"]) # Should not happen, but just in case there are no files, return if transID is None: return S_OK({}) res = self.transClient.getTransformationTasks({ "TransformationID": transID, "TaskID": list(taskFiles) }) if not res["OK"]: return res requestFiles = {} for taskDict in res["Value"]: taskID = taskDict["TaskID"] externalID = taskDict["ExternalID"] # Only consider tasks that are submitted, ExternalID is a string if taskDict["ExternalStatus"] != "Created" and externalID and int( externalID): requestFiles[externalID] = taskFiles[taskID] updateDict = {} for requestID, lfnList in requestFiles.items(): statusDict = self.requestClient.getRequestFileStatus( requestID, lfnList) if not statusDict["OK"]: log = self._logVerbose if "not exist" in statusDict[ "Message"] else self._logWarn log( "Failed to get files status for request", statusDict["Message"], transID=transID, method="getSubmittedFileStatus", ) else: for lfn, newStatus in statusDict["Value"].items(): if newStatus == "Done": updateDict[lfn] = TransformationFilesStatus.PROCESSED elif newStatus == "Failed": updateDict[lfn] = TransformationFilesStatus.PROBLEMATIC return S_OK(updateDict)
'Usage:', ' %s [option|cfgfile] [requestName|requestID]' % Script.scriptName, 'Arguments:', ' requestName: a request name' ] ) ) # # execution if __name__ == "__main__": from DIRAC.Core.Base.Script import parseCommandLine parseCommandLine() import DIRAC resetFailed = False requestName = '' job = None from DIRAC.RequestManagementSystem.Client.ReqClient import ReqClient reqClient = ReqClient() for switch in Script.getUnprocessedSwitches(): if switch[0] == 'Failed': resetFailed = True elif switch[0] == 'Maximum': try: maxReset = int( switch[1] ) except: pass elif switch[0] == 'Job': try: job = int( switch[1] ) except: print "Invalid jobID", switch[1] if not job: