class ReplicateAndRegister( DMSRequestOperationsBase ):
  """
  .. class:: ReplicateAndRegister

  ReplicateAndRegister operation handler.

  Replicates the files of an Operation to its target SEs and registers the new
  replicas in the file catalog. Transfers go either through the FTS system
  (:meth:`ftsTransfer`, when the handler is configured with FTSMode) or directly
  via the DataManager (:meth:`rmTransfer`).
  """

  def __init__( self, operation = None, csPath = None ):
    """c'tor

    :param self: self reference
    :param Operation operation: Operation instance
    :param str csPath: CS path for this handler
    """
    super( ReplicateAndRegister, self ).__init__( operation, csPath )
    # # own gMonitor stuff for files
    gMonitor.registerActivity( "ReplicateAndRegisterAtt", "Replicate and register attempted",
                               "RequestExecutingAgent", "Files/min", gMonitor.OP_SUM )
    gMonitor.registerActivity( "ReplicateOK", "Replications successful",
                               "RequestExecutingAgent", "Files/min", gMonitor.OP_SUM )
    gMonitor.registerActivity( "ReplicateFail", "Replications failed",
                               "RequestExecutingAgent", "Files/min", gMonitor.OP_SUM )
    gMonitor.registerActivity( "RegisterOK", "Registrations successful",
                               "RequestExecutingAgent", "Files/min", gMonitor.OP_SUM )
    gMonitor.registerActivity( "RegisterFail", "Registrations failed",
                               "RequestExecutingAgent", "Files/min", gMonitor.OP_SUM )
    # # for FTS
    gMonitor.registerActivity( "FTSScheduleAtt", "Files schedule attempted",
                               "RequestExecutingAgent", "Files/min", gMonitor.OP_SUM )
    gMonitor.registerActivity( "FTSScheduleOK", "File schedule successful",
                               "RequestExecutingAgent", "Files/min", gMonitor.OP_SUM )
    gMonitor.registerActivity( "FTSScheduleFail", "File schedule failed",
                               "RequestExecutingAgent", "Files/min", gMonitor.OP_SUM )
    # # SE cache
    # shared cache handed to filterReplicas() in _filterReplicas(), so repeated
    # SE checks within one handler instance are not re-done
    self.seCache = {}

    # Clients
    self.fc = FileCatalog()
    self.ftsClient = FTSClient()

  def __call__( self ):
    """ call me maybe

    Entry point of the handler: first reconciles file states against the
    catalog, then dispatches to FTS-based or DataManager-based transfer.
    """
    # # check replicas first
    checkReplicas = self.__checkReplicas()
    if not checkReplicas["OK"]:
      # non-fatal: log and carry on with the transfer attempt
      self.log.error( checkReplicas["Message"] )

    # FTSMode / FTSBannedGroups are optional attributes set from configuration
    # elsewhere (not visible in this class) — hence the hasattr/getattr guards
    if hasattr( self, "FTSMode" ) and getattr( self, "FTSMode" ):
      bannedGroups = getattr( self, "FTSBannedGroups" ) if hasattr( self, "FTSBannedGroups" ) else ()
      if self.request.OwnerGroup in bannedGroups:
        self.log.info( "usage of FTS system is banned for request's owner" )
        return self.rmTransfer()
      return self.ftsTransfer()
    return self.rmTransfer()

  def __checkReplicas( self ):
    """ check done replicas and update file states

    Marks a file "Done" when the catalog already shows replicas at every
    target SE, and "Failed" when the catalog reports it missing.

    :return: S_OK() or the S_ERROR from the catalog lookup
    """
    waitingFiles = dict( [ ( opFile.LFN, opFile ) for opFile in self.operation
                           if opFile.Status in ( "Waiting", "Scheduled" ) ] )
    targetSESet = set( self.operation.targetSEList )

    replicas = self.fc.getReplicas( waitingFiles.keys() )
    if not replicas["OK"]:
      self.log.error( replicas["Message"] )
      return replicas

    # "no such file or directory" in the per-file error means the LFN is not
    # in the catalog at all -> fail the file outright
    reMissing = re.compile( "no such file or directory" )
    for failedLFN, errStr in replicas["Value"]["Failed"].items():
      waitingFiles[failedLFN].Error = errStr
      if reMissing.search( errStr.lower() ):
        self.log.error( "file %s does not exists" % failedLFN )
        # one failure mark per target SE the file will never reach
        gMonitor.addMark( "ReplicateFail", len( targetSESet ) )
        waitingFiles[failedLFN].Status = "Failed"

    for successfulLFN, reps in replicas["Value"]["Successful"].items():
      # already replicated everywhere it needs to be -> nothing left to do
      if targetSESet.issubset( set( reps ) ):
        self.log.info( "file %s has been replicated to all targets" % successfulLFN )
        waitingFiles[successfulLFN].Status = "Done"

    return S_OK()

  def _addMetadataToFiles( self, toSchedule ):
    """ Add metadata to those files that need to be scheduled through FTS

        toSchedule is a dictionary:
        {'lfn1': [opFile, validReplicas, validTargets],
         'lfn2': [opFile, validReplicas, validTargets]}

    :return: S_OK( list of ( serialized opFile, validReplicas, validTargets )
             tuples ready for FTSClient.ftsSchedule ), or S_ERROR from the
             catalog; S_OK() with no value when toSchedule is empty
    """
    if toSchedule:
      self.log.info( "found %s files to schedule, getting metadata from FC" % len( toSchedule ) )
      lfns = toSchedule.keys()
    else:
      self.log.info( "No files to schedule" )
      return S_OK()

    res = self.fc.getFileMetadata( lfns )
    if not res['OK']:
      return res
    else:
      if res['Value']['Failed']:
        # files without metadata are simply dropped from scheduling
        self.log.warn( "Can't schedule %d files: problems getting the metadata: %s" % ( len( res['Value']['Failed'] ),
                                                                                        ', '.join( res['Value']['Failed'] ) ) )
      metadata = res['Value']['Successful']

    filesToScheduleList = []

    for lfnsToSchedule, lfnMetadata in metadata.items():
      opFileToSchedule = toSchedule[lfnsToSchedule][0]
      opFileToSchedule.GUID = lfnMetadata['GUID']
      # NOTE(review): key is 'CheckSumType' (capital S) here but 'Checksum'
      # below — presumably this matches the FileCatalog metadata keys; confirm
      opFileToSchedule.Checksum = metadata[lfnsToSchedule]['Checksum']
      opFileToSchedule.ChecksumType = metadata[lfnsToSchedule]['CheckSumType']
      opFileToSchedule.Size = metadata[lfnsToSchedule]['Size']

      # ship the opFile in its JSON form alongside its sources and targets
      filesToScheduleList.append( ( opFileToSchedule.toJSON()['Value'],
                                    toSchedule[lfnsToSchedule][1],
                                    toSchedule[lfnsToSchedule][2] ) )

    return S_OK( filesToScheduleList )

  def _filterReplicas( self, opFile ):
    """ filter out banned/invalid source SEs

    Thin wrapper around the module-level filterReplicas(), sharing this
    handler's SE cache.
    """
    return filterReplicas( opFile, logger = self.log, dataManager = self.dm, seCache = self.seCache )

  def ftsTransfer( self ):
    """ replicate and register using FTS

    Builds the list of (file, valid sources, missing targets) triplets,
    decorates them with catalog metadata, and hands them to the FTSClient.
    Files that could not be scheduled fall through to rmTransfer() at the end.
    """

    self.log.info( "scheduling files in FTS..." )

    # RSS check of target SEs; an S_ERROR here aborts the whole operation
    bannedTargets = self.checkSEsRSS()
    if not bannedTargets['OK']:
      gMonitor.addMark( "FTSScheduleAtt" )
      gMonitor.addMark( "FTSScheduleFail" )
      return bannedTargets

    if bannedTargets['Value']:
      return S_OK( "%s targets are banned for writing" % ",".join( bannedTargets['Value'] ) )

    # Can continue now
    self.log.verbose( "No targets banned for writing" )

    toSchedule = {}

    for opFile in self.getWaitingFilesList():
      opFile.Error = ''
      gMonitor.addMark( "FTSScheduleAtt" )
      # # check replicas
      replicas = self._filterReplicas( opFile )
      if not replicas["OK"]:
        continue
      replicas = replicas["Value"]

      validReplicas = replicas["Valid"]
      bannedReplicas = replicas["Banned"]
      noReplicas = replicas['NoReplicas']
      badReplicas = replicas['Bad']
      noPFN = replicas['NoPFN']

      if not validReplicas:
        gMonitor.addMark( "FTSScheduleFail" )
        # only "no replicas" and "bad checksum" are terminal for the file;
        # banned-SE and no-PFN cases are left Waiting for a later retry
        if bannedReplicas:
          self.log.warn( "unable to schedule '%s', replicas only at banned SEs" % opFile.LFN )
        elif noReplicas:
          self.log.error( "unable to schedule %s, file doesn't exist" % opFile.LFN )
          opFile.Error = 'No replicas found'
          opFile.Status = 'Failed'
        elif badReplicas:
          self.log.error( "unable to schedule %s, all replicas have a bad checksum" % opFile.LFN )
          opFile.Error = 'All replicas have a bad checksum'
          opFile.Status = 'Failed'
        elif noPFN:
          self.log.warn( "unable to schedule %s, could not get a PFN" % opFile.LFN )
      else:
        # only transfer to targets that do not already hold a valid replica
        validTargets = list( set( self.operation.targetSEList ) - set( validReplicas ) )
        if not validTargets:
          self.log.info( "file %s is already present at all targets" % opFile.LFN )
          opFile.Status = "Done"
        else:
          toSchedule[opFile.LFN] = [ opFile, validReplicas, validTargets ]

    res = self._addMetadataToFiles( toSchedule )
    if not res['OK']:
      return res
    else:
      filesToScheduleList = res['Value']

    if filesToScheduleList:

      ftsSchedule = self.ftsClient.ftsSchedule( self.request.RequestID,
                                                self.operation.OperationID,
                                                filesToScheduleList )
      if not ftsSchedule["OK"]:
        self.log.error( ftsSchedule["Message"] )
        return ftsSchedule

      # might have nothing to schedule
      ftsSchedule = ftsSchedule["Value"]
      if not ftsSchedule:
        return S_OK()

      # ftsSchedule maps FileIDs to outcome; match them back to opFiles by ID
      for fileID in ftsSchedule["Successful"]:
        gMonitor.addMark( "FTSScheduleOK", 1 )
        for opFile in self.operation:
          if fileID == opFile.FileID:
            opFile.Status = "Scheduled"
            self.log.debug( "%s has been scheduled for FTS" % opFile.LFN )
      self.log.info( "%d files have been scheduled to FTS" % len( ftsSchedule['Successful'] ) )

      for fileID in ftsSchedule["Failed"]:
        gMonitor.addMark( "FTSScheduleFail", 1 )
        for opFile in self.operation:
          if fileID == opFile.FileID:
            opFile.Error = ftsSchedule["Failed"][fileID]
            if 'sourceSURL equals to targetSURL' in opFile.Error:
              # In this case there is no need to continue
              opFile.Status = 'Failed'
            self.log.warn( "unable to schedule %s for FTS: %s" % ( opFile.LFN, opFile.Error ) )
    else:
      self.log.info( "No files to schedule after metadata checks" )

    # Just in case some transfers could not be scheduled, try them with RM
    return self.rmTransfer( fromFTS = True )

  def rmTransfer( self, fromFTS = False ):
    """ replicate and register using dataManager

    :param bool fromFTS: True when called as the fallback from ftsTransfer()
                         (only changes the log message)
    """
    # # get waiting files. If none just return
    waitingFiles = self.getWaitingFilesList()
    if not waitingFiles:
      return S_OK()

    if fromFTS:
      self.log.info( "Trying transfer using replica manager as FTS failed" )
    else:
      self.log.info( "Transferring files using Data manager..." )

    # # source SE
    sourceSE = self.operation.SourceSE if self.operation.SourceSE else None
    if sourceSE:
      # # check source se for read
      sourceRead = self.rssSEStatus( sourceSE, "ReadAccess" )
      if not sourceRead["OK"]:
        self.log.info( sourceRead["Message"] )
        for opFile in self.operation:
          opFile.Error = sourceRead["Message"]
          # NOTE(review): operation.Error is re-set on every iteration of this
          # loop — harmless but redundant; confirm it was not meant to sit
          # outside the loop
          self.operation.Error = sourceRead["Message"]
        gMonitor.addMark( "ReplicateAndRegisterAtt", len( self.operation ) )
        gMonitor.addMark( "ReplicateFail", len( self.operation ) )
        return sourceRead

      if not sourceRead["Value"]:
        # banned source is not an error: return S_OK so the request waits
        self.operation.Error = "SourceSE %s is banned for reading" % sourceSE
        self.log.info( self.operation.Error )
        return S_OK( self.operation.Error )

    # # check targetSEs for write
    bannedTargets = self.checkSEsRSS()
    if not bannedTargets['OK']:
      gMonitor.addMark( "ReplicateAndRegisterAtt", len( self.operation ) )
      gMonitor.addMark( "ReplicateFail", len( self.operation ) )
      return bannedTargets

    if bannedTargets['Value']:
      return S_OK( "%s targets are banned for writing" % ",".join( bannedTargets['Value'] ) )

    # Can continue now
    self.log.verbose( "No targets banned for writing" )

    # # loop over files
    for opFile in waitingFiles:

      gMonitor.addMark( "ReplicateAndRegisterAtt", 1 )
      opFile.Error = ''
      lfn = opFile.LFN

      # Check if replica is at the specified source
      replicas = self._filterReplicas( opFile )
      if not replicas["OK"]:
        self.log.error( replicas["Message"] )
        continue
      replicas = replicas["Value"]
      if not replicas["Valid"]:
        self.log.warn( "unable to find valid replicas for %s" % lfn )
        continue
      # # get the first one in the list
      if sourceSE not in replicas['Valid']:
        if sourceSE:
          self.log.warn( "%s is not at specified sourceSE %s, changed to %s" % ( lfn, sourceSE, replicas["Valid"][0] ) )
        # NOTE(review): sourceSE is reassigned here and the new value persists
        # for the remaining files of the loop — confirm this is intended
        sourceSE = replicas["Valid"][0]

      # # loop over targetSE
      catalog = self.operation.Catalog
      for targetSE in self.operation.targetSEList:

        # # call DataManager
        if targetSE == sourceSE:
          self.log.warn( "Request to replicate %s to the source SE: %s" % ( lfn, sourceSE ) )
          continue

        res = self.dm.replicateAndRegister( lfn, targetSE, sourceSE = sourceSE, catalog = catalog )

        if res["OK"]:

          if lfn in res["Value"]["Successful"]:

            if "replicate" in res["Value"]["Successful"][lfn]:

              repTime = res["Value"]["Successful"][lfn]["replicate"]
              prString = "file %s replicated at %s in %s s." % ( lfn, targetSE, repTime )

              gMonitor.addMark( "ReplicateOK", 1 )

              if "register" in res["Value"]["Successful"][lfn]:

                gMonitor.addMark( "RegisterOK", 1 )
                regTime = res["Value"]["Successful"][lfn]["register"]
                prString += ' and registered in %s s.' % regTime
                self.log.info( prString )
              else:

                # replica exists but catalog registration failed: queue a
                # dedicated RegisterReplica operation right after this one
                gMonitor.addMark( "RegisterFail", 1 )
                prString += " but failed to register"
                self.log.warn( prString )

                opFile.Error = "Failed to register"
                # # add register replica operation
                registerOperation = self.getRegisterOperation( opFile, targetSE )
                self.request.insertAfter( registerOperation, self.operation )

            else:

              self.log.error( "failed to replicate %s to %s." % ( lfn, targetSE ) )
              gMonitor.addMark( "ReplicateFail", 1 )
              opFile.Error = "Failed to replicate"

          else:

            gMonitor.addMark( "ReplicateFail", 1 )
            reason = res["Value"]["Failed"][lfn]
            self.log.error( "failed to replicate and register file %s at %s:" % ( lfn, targetSE ), reason )
            opFile.Error = reason

        else:

          gMonitor.addMark( "ReplicateFail", 1 )
          opFile.Error = "DataManager error: %s" % res["Message"]
          self.log.error( opFile.Error )

      if not opFile.Error:
        # no error recorded for any target -> file fully handled
        if len( self.operation.targetSEList ) > 1:
          self.log.info( "file %s has been replicated to all targetSEs" % lfn )
        opFile.Status = "Done"

    return S_OK()
class FTSRequest( object ): """ .. class:: FTSRequest Helper class for FTS job submission and monitoring. """ # # default checksum type __defaultCksmType = "ADLER32" # # flag to disablr/enable checksum test, default: disabled __cksmTest = False def __init__( self ): """c'tor :param self: self reference """ self.log = gLogger.getSubLogger( self.__class__.__name__, True ) # # final states tuple self.finalStates = ( 'Canceled', 'Failed', 'Hold', 'Finished', 'FinishedDirty' ) # # failed states tuple self.failedStates = ( 'Canceled', 'Failed', 'Hold', 'FinishedDirty' ) # # successful states tuple self.successfulStates = ( 'Finished', 'Done' ) # # all file states tuple self.fileStates = ( 'Done', 'Active', 'Pending', 'Ready', 'Canceled', 'Failed', 'Finishing', 'Finished', 'Submitted', 'Hold', 'Waiting' ) self.statusSummary = {} # # request status self.requestStatus = 'Unknown' # # dict for FTS job files self.fileDict = {} # # dict for replicas information self.catalogReplicas = {} # # dict for metadata information self.catalogMetadata = {} # # dict for files that failed to register self.failedRegistrations = {} # # placehoder for FileCatalog reference self.oCatalog = None # # submit timestamp self.submitTime = '' # # placeholder FTS job GUID self.ftsGUID = '' # # placeholder for FTS server URL self.ftsServer = '' # # flag marking FTS job completness self.isTerminal = False # # completness percentage self.percentageComplete = 0.0 # # source SE name self.sourceSE = '' # # flag marking source SE validity self.sourceValid = False # # source space token self.sourceToken = '' # # target SE name self.targetSE = '' # # flag marking target SE validity self.targetValid = False # # target space token self.targetToken = '' # # placeholder for target StorageElement self.oTargetSE = None # # placeholder for source StorageElement self.oSourceSE = None # # checksum type, set it to default self.__cksmType = self.__defaultCksmType # # disable checksum test by default self.__cksmTest = 
False # # statuses that prevent submitting to FTS self.noSubmitStatus = ( 'Failed', 'Done', 'Staging' ) # # were sources resolved? self.sourceResolved = False # # Number of file transfers actually submitted self.submittedFiles = 0 self.transferTime = 0 self.submitCommand = Operations().getValue( 'DataManagement/FTSPlacement/FTS2/SubmitCommand', 'glite-transfer-submit' ) self.monitorCommand = Operations().getValue( 'DataManagement/FTSPlacement/FTS2/MonitorCommand', 'glite-transfer-status' ) self.ftsVersion = Operations().getValue( 'DataManagement/FTSVersion', 'FTS2' ) self.ftsJob = None self.ftsFiles = [] #################################################################### # # Methods for setting/getting/checking the SEs # def setSourceSE( self, se ): """ set SE for source :param self: self reference :param str se: source SE name """ if se == self.targetSE: return S_ERROR( "SourceSE is TargetSE" ) self.sourceSE = se self.oSourceSE = StorageElement( self.sourceSE ) return self.__checkSourceSE() def __checkSourceSE( self ): """ check source SE availability :param self: self reference """ if not self.sourceSE: return S_ERROR( "SourceSE not set" ) res = self.oSourceSE.isValid( 'Read' ) if not res['OK']: return S_ERROR( "SourceSE not available for reading" ) res = self.__getSESpaceToken( self.oSourceSE ) if not res['OK']: self.log.error( "FTSRequest failed to get SRM Space Token for SourceSE", res['Message'] ) return S_ERROR( "SourceSE does not support FTS transfers" ) if self.__cksmTest: cksmType = self.oSourceSE.checksumType() if cksmType in ( "NONE", "NULL" ): self.log.warn( "Checksum type set to %s at SourceSE %s, disabling checksum test" % ( cksmType, self.sourceSE ) ) self.__cksmTest = False elif cksmType != self.__cksmType: self.log.warn( "Checksum type mismatch, disabling checksum test" ) self.__cksmTest = False self.sourceToken = res['Value'] self.sourceValid = True return S_OK() def setTargetSE( self, se ): """ set target SE :param self: self reference :param 
str se: target SE name """ if se == self.sourceSE: return S_ERROR( "TargetSE is SourceSE" ) self.targetSE = se self.oTargetSE = StorageElement( self.targetSE ) return self.__checkTargetSE() def setTargetToken( self, token ): """ target space token setter :param self: self reference :param str token: target space token """ self.targetToken = token return S_OK() def __checkTargetSE( self ): """ check target SE availability :param self: self reference """ if not self.targetSE: return S_ERROR( "TargetSE not set" ) res = self.oTargetSE.isValid( 'Write' ) if not res['OK']: return S_ERROR( "TargetSE not available for writing" ) res = self.__getSESpaceToken( self.oTargetSE ) if not res['OK']: self.log.error( "FTSRequest failed to get SRM Space Token for TargetSE", res['Message'] ) return S_ERROR( "TargetSE does not support FTS transfers" ) # # check checksum types if self.__cksmTest: cksmType = self.oTargetSE.checksumType() if cksmType in ( "NONE", "NULL" ): self.log.warn( "Checksum type set to %s at TargetSE %s, disabling checksum test" % ( cksmType, self.targetSE ) ) self.__cksmTest = False elif cksmType != self.__cksmType: self.log.warn( "Checksum type mismatch, disabling checksum test" ) self.__cksmTest = False self.targetToken = res['Value'] self.targetValid = True return S_OK() @staticmethod def __getSESpaceToken( oSE ): """ get space token from StorageElement instance :param self: self reference :param StorageElement oSE: StorageElement instance """ res = oSE.getStorageParameters( protocol = 'srm' ) if not res['OK']: return res return S_OK( res['Value'].get( 'SpaceToken' ) ) #################################################################### # # Methods for setting/getting FTS request parameters # def setFTSGUID( self, guid ): """ FTS job GUID setter :param self: self reference :param str guid: string containg GUID """ if not checkGuid( guid ): return S_ERROR( "Incorrect GUID format" ) self.ftsGUID = guid return S_OK() def setFTSServer( self, server ): """ FTS 
server setter :param self: self reference :param str server: FTS server URL """ self.ftsServer = server return S_OK() def isRequestTerminal( self ): """ check if FTS job has terminated :param self: self reference """ if self.requestStatus in self.finalStates: self.isTerminal = True return S_OK( self.isTerminal ) def setCksmTest( self, cksmTest = False ): """ set cksm test :param self: self reference :param bool cksmTest: flag to enable/disable checksum test """ self.__cksmTest = bool( cksmTest ) return S_OK( self.__cksmTest ) #################################################################### # # Methods for setting/getting/checking files and their metadata # def setLFN( self, lfn ): """ add LFN :lfn: to :fileDict: :param self: self reference :param str lfn: LFN to add to """ self.fileDict.setdefault( lfn, {'Status':'Waiting'} ) return S_OK() def setSourceSURL( self, lfn, surl ): """ source SURL setter :param self: self reference :param str lfn: LFN :param str surl: source SURL """ target = self.fileDict[lfn].get( 'Target' ) if target == surl: return S_ERROR( "Source and target the same" ) return self.__setFileParameter( lfn, 'Source', surl ) def getSourceSURL( self, lfn ): """ get source SURL for LFN :lfn: :param self: self reference :param str lfn: LFN """ return self.__getFileParameter( lfn, 'Source' ) def setTargetSURL( self, lfn, surl ): """ set target SURL for LFN :lfn: :param self: self reference :param str lfn: LFN :param str surl: target SURL """ source = self.fileDict[lfn].get( 'Source' ) if source == surl: return S_ERROR( "Source and target the same" ) return self.__setFileParameter( lfn, 'Target', surl ) def getFailReason( self, lfn ): """ get fail reason for file :lfn: :param self: self reference :param str lfn: LFN """ return self.__getFileParameter( lfn, 'Reason' ) def getRetries( self, lfn ): """ get number of attepmts made to transfer file :lfn: :param self: self reference :param str lfn: LFN """ return self.__getFileParameter( lfn, 'Retries' ) 
def getTransferTime( self, lfn ): """ get duration of transfer for file :lfn: :param self: self reference :param str lfn: LFN """ return self.__getFileParameter( lfn, 'Duration' ) def getFailed( self ): """ get list of wrongly transferred LFNs :param self: self reference """ return S_OK( [ lfn for lfn in self.fileDict if self.fileDict[lfn].get( 'Status', '' ) in self.failedStates ] ) def getStaging( self ): """ get files set for prestaging """ return S_OK( [lfn for lfn in self.fileDict if self.fileDict[lfn].get( 'Status', '' ) == 'Staging'] ) def getDone( self ): """ get list of succesfully transferred LFNs :param self: self reference """ return S_OK( [ lfn for lfn in self.fileDict if self.fileDict[lfn].get( 'Status', '' ) in self.successfulStates ] ) def __setFileParameter( self, lfn, paramName, paramValue ): """ set :paramName: to :paramValue: for :lfn: file :param self: self reference :param str lfn: LFN :param str paramName: parameter name :param mixed paramValue: a new parameter value """ self.setLFN( lfn ) self.fileDict[lfn][paramName] = paramValue return S_OK() def __getFileParameter( self, lfn, paramName ): """ get value of :paramName: for file :lfn: :param self: self reference :param str lfn: LFN :param str paramName: parameter name """ if lfn not in self.fileDict: return S_ERROR( "Supplied file not set" ) if paramName not in self.fileDict[lfn]: return S_ERROR( "%s not set for file" % paramName ) return S_OK( self.fileDict[lfn][paramName] ) #################################################################### # # Methods for submission # def submit( self, monitor = False, printOutput = True ): """ submit FTS job :param self: self reference :param bool monitor: flag to monitor progress of FTS job :param bool printOutput: flag to print output of execution to stdout """ res = self.__prepareForSubmission() if not res['OK']: return res res = self.__submitFTSTransfer() if not res['OK']: return res resDict = { 'ftsGUID' : self.ftsGUID, 'ftsServer' : 
self.ftsServer, 'submittedFiles' : self.submittedFiles } if monitor or printOutput: gLogger.always( "Submitted %s@%s" % ( self.ftsGUID, self.ftsServer ) ) if monitor: self.monitor( untilTerminal = True, printOutput = printOutput, full = False ) return S_OK( resDict ) def __prepareForSubmission( self ): """ check validity of job before submission :param self: self reference """ if not self.fileDict: return S_ERROR( "No files set" ) if not self.sourceValid: return S_ERROR( "SourceSE not valid" ) if not self.targetValid: return S_ERROR( "TargetSE not valid" ) if not self.ftsServer: res = self.__resolveFTSServer() if not res['OK']: return S_ERROR( "FTSServer not valid" ) self.resolveSource() self.resolveTarget() res = self.__filesToSubmit() if not res['OK']: return S_ERROR( "No files to submit" ) return S_OK() def __getCatalogObject( self ): """ CatalogInterface instance facade :param self: self reference """ try: if not self.oCatalog: self.oCatalog = FileCatalog() return S_OK() except: return S_ERROR() def __updateReplicaCache( self, lfns = None, overwrite = False ): """ update replica cache for list of :lfns: :param self: self reference :param mixed lfns: list of LFNs :param bool overwrite: flag to trigger cache clearing and updating """ if not lfns: lfns = self.fileDict.keys() toUpdate = [ lfn for lfn in lfns if ( lfn not in self.catalogReplicas ) or overwrite ] if not toUpdate: return S_OK() res = self.__getCatalogObject() if not res['OK']: return res res = self.oCatalog.getReplicas( toUpdate ) if not res['OK']: return S_ERROR( "Failed to update replica cache: %s" % res['Message'] ) for lfn, error in res['Value']['Failed'].items(): self.__setFileParameter( lfn, 'Reason', error ) self.__setFileParameter( lfn, 'Status', 'Failed' ) for lfn, replicas in res['Value']['Successful'].items(): self.catalogReplicas[lfn] = replicas return S_OK() def __updateMetadataCache( self, lfns = None ): """ update metadata cache for list of LFNs :param self: self reference :param list 
lnfs: list of LFNs """ if not lfns: lfns = self.fileDict.keys() toUpdate = [ lfn for lfn in lfns if lfn not in self.catalogMetadata ] if not toUpdate: return S_OK() res = self.__getCatalogObject() if not res['OK']: return res res = self.oCatalog.getFileMetadata( toUpdate ) if not res['OK']: return S_ERROR( "Failed to get source catalog metadata: %s" % res['Message'] ) for lfn, error in res['Value']['Failed'].items(): self.__setFileParameter( lfn, 'Reason', error ) self.__setFileParameter( lfn, 'Status', 'Failed' ) for lfn, metadata in res['Value']['Successful'].items(): self.catalogMetadata[lfn] = metadata return S_OK() def resolveSource( self ): """ resolve source SE eligible for submission :param self: self reference """ # Avoid resolving sources twice if self.sourceResolved: return S_OK() # Only resolve files that need a transfer toResolve = [ lfn for lfn in self.fileDict if self.fileDict[lfn].get( "Status", "" ) != "Failed" ] if not toResolve: return S_OK() res = self.__updateMetadataCache( toResolve ) if not res['OK']: return res res = self.__updateReplicaCache( toResolve ) if not res['OK']: return res # Define the source URLs for lfn in toResolve: replicas = self.catalogReplicas.get( lfn, {} ) if self.sourceSE not in replicas: gLogger.warn( "resolveSource: skipping %s - not replicas at SourceSE %s" % ( lfn, self.sourceSE ) ) self.__setFileParameter( lfn, 'Reason', "No replica at SourceSE" ) self.__setFileParameter( lfn, 'Status', 'Failed' ) continue res = returnSingleResult( self.oSourceSE.getURL( lfn, protocol = 'srm' ) ) if not res['OK']: gLogger.warn( "resolveSource: skipping %s - %s" % ( lfn, res["Message"] ) ) self.__setFileParameter( lfn, 'Reason', res['Message'] ) self.__setFileParameter( lfn, 'Status', 'Failed' ) continue res = self.setSourceSURL( lfn, res['Value'] ) if not res['OK']: gLogger.warn( "resolveSource: skipping %s - %s" % ( lfn, res["Message"] ) ) self.__setFileParameter( lfn, 'Reason', res['Message'] ) self.__setFileParameter( lfn, 
'Status', 'Failed' ) continue toResolve = [] for lfn in self.fileDict: if "Source" in self.fileDict[lfn]: toResolve.append( lfn ) if not toResolve: return S_ERROR( "No eligible Source files" ) # Get metadata of the sources, to check for existance, availability and caching res = self.oSourceSE.getFileMetadata( toResolve ) if not res['OK']: return S_ERROR( "Failed to check source file metadata" ) for lfn, error in res['Value']['Failed'].items(): if re.search( 'File does not exist', error ): gLogger.warn( "resolveSource: skipping %s - source file does not exists" % lfn ) self.__setFileParameter( lfn, 'Reason', "Source file does not exist" ) self.__setFileParameter( lfn, 'Status', 'Failed' ) else: gLogger.warn( "resolveSource: skipping %s - failed to get source metadata" % lfn ) self.__setFileParameter( lfn, 'Reason', "Failed to get Source metadata" ) self.__setFileParameter( lfn, 'Status', 'Failed' ) toStage = [] nbStagedFiles = 0 for lfn, metadata in res['Value']['Successful'].items(): lfnStatus = self.fileDict.get( lfn, {} ).get( 'Status' ) if metadata.get( 'Unavailable', False ): gLogger.warn( "resolveSource: skipping %s - source file unavailable" % lfn ) self.__setFileParameter( lfn, 'Reason', "Source file Unavailable" ) self.__setFileParameter( lfn, 'Status', 'Failed' ) elif metadata.get( 'Lost', False ): gLogger.warn( "resolveSource: skipping %s - source file lost" % lfn ) self.__setFileParameter( lfn, 'Reason', "Source file Lost" ) self.__setFileParameter( lfn, 'Status', 'Failed' ) elif not metadata.get( 'Cached', metadata['Accessible'] ): if lfnStatus != 'Staging': toStage.append( lfn ) elif metadata['Size'] != self.catalogMetadata[lfn]['Size']: gLogger.warn( "resolveSource: skipping %s - source file size mismatch" % lfn ) self.__setFileParameter( lfn, 'Reason', "Source size mismatch" ) self.__setFileParameter( lfn, 'Status', 'Failed' ) elif self.catalogMetadata[lfn]['Checksum'] and metadata['Checksum'] and \ not compareAdler( metadata['Checksum'], 
self.catalogMetadata[lfn]['Checksum'] ): gLogger.warn( "resolveSource: skipping %s - source file checksum mismatch" % lfn ) self.__setFileParameter( lfn, 'Reason', "Source checksum mismatch" ) self.__setFileParameter( lfn, 'Status', 'Failed' ) elif lfnStatus == 'Staging': # file that was staging is now cached self.__setFileParameter( lfn, 'Status', 'Waiting' ) nbStagedFiles += 1 # Some files were being staged if nbStagedFiles: self.log.info( 'resolveSource: %d files have been staged' % nbStagedFiles ) # Launching staging of files not in cache if toStage: gLogger.warn( "resolveSource: %s source files not cached, prestaging..." % len( toStage ) ) stage = self.oSourceSE.prestageFile( toStage ) if not stage["OK"]: gLogger.error( "resolveSource: error is prestaging", stage["Message"] ) for lfn in toStage: self.__setFileParameter( lfn, 'Reason', stage["Message"] ) self.__setFileParameter( lfn, 'Status', 'Failed' ) else: for lfn in toStage: if lfn in stage['Value']['Successful']: self.__setFileParameter( lfn, 'Status', 'Staging' ) elif lfn in stage['Value']['Failed']: self.__setFileParameter( lfn, 'Reason', stage['Value']['Failed'][lfn] ) self.__setFileParameter( lfn, 'Status', 'Failed' ) self.sourceResolved = True return S_OK() def resolveTarget( self ): """ find target SE eligible for submission :param self: self reference """ toResolve = [ lfn for lfn in self.fileDict if self.fileDict[lfn].get( 'Status' ) not in self.noSubmitStatus ] if not toResolve: return S_OK() res = self.__updateReplicaCache( toResolve ) if not res['OK']: return res for lfn in toResolve: res = returnSingleResult( self.oTargetSE.getURL( lfn, protocol = 'srm' ) ) if not res['OK']: reason = res.get( 'Message', res['Message'] ) gLogger.warn( "resolveTarget: skipping %s - %s" % ( lfn, reason ) ) self.__setFileParameter( lfn, 'Reason', reason ) self.__setFileParameter( lfn, 'Status', 'Failed' ) continue res = self.setTargetSURL( lfn, res['Value'] ) if not res['OK']: gLogger.warn( "resolveTarget: 
# NOTE(review): the tail of resolveTarget() that preceded this point is
# truncated in this chunk and is not reproduced here.

def __filesToSubmit(self):
    """Check if there is at least one file eligible for submission.

    A file is eligible when it has both a Source and a Target and its
    status is not in ``self.noSubmitStatus``.

    :return: S_OK if at least one eligible file is present, S_ERROR otherwise
    """
    for lfn in self.fileDict:
        lfnStatus = self.fileDict[lfn].get('Status')
        source = self.fileDict[lfn].get('Source')
        target = self.fileDict[lfn].get('Target')
        if lfnStatus not in self.noSubmitStatus and source and target:
            return S_OK()
    return S_ERROR()

def __createFTSFiles(self):
    """Create FTSFile instances for every file eligible for transfer.

    One FTSFile per LFN whose status is not in ``self.noSubmitStatus``;
    the optional checksum string is only attached when a checksum test and
    type are configured, otherwise FTS decides by itself.

    :param self: self reference
    """
    self.__updateMetadataCache()
    for lfn in self.fileDict:
        lfnStatus = self.fileDict[lfn].get('Status')
        if lfnStatus not in self.noSubmitStatus:
            cksmStr = ""
            # add cksmType:cksm only if cksmType is specified, else let FTS decide by itself
            if self.__cksmTest and self.__cksmType:
                checkSum = self.catalogMetadata.get(lfn, {}).get('Checksum')
                if checkSum:
                    cksmStr = " %s:%s" % (self.__cksmType, intAdlerToHex(hexAdlerToInt(checkSum)))
            ftsFile = FTSFile()
            ftsFile.LFN = lfn
            ftsFile.SourceSURL = self.fileDict[lfn].get('Source')
            ftsFile.TargetSURL = self.fileDict[lfn].get('Target')
            ftsFile.SourceSE = self.sourceSE
            ftsFile.TargetSE = self.targetSE
            ftsFile.Status = self.fileDict[lfn].get('Status')
            ftsFile.Checksum = cksmStr
            ftsFile.Size = self.catalogMetadata.get(lfn, {}).get('Size')
            self.ftsFiles.append(ftsFile)
            self.submittedFiles += 1
    return S_OK()

def __createFTSJob(self, guid=None):
    """Build an FTSJob from the eligible files and store it on ``self.ftsJob``.

    :param str guid: optional FTS GUID to reuse for the job
    """
    self.__createFTSFiles()
    ftsJob = FTSJob()
    ftsJob.RequestID = 0
    ftsJob.OperationID = 0
    ftsJob.SourceSE = self.sourceSE
    ftsJob.TargetSE = self.targetSE
    ftsJob.SourceToken = self.sourceToken
    ftsJob.TargetToken = self.targetToken
    ftsJob.FTSServer = self.ftsServer
    if guid:
        ftsJob.FTSGUID = guid
    for ftsFile in self.ftsFiles:
        ftsFile.Attempt += 1
        ftsFile.Error = ""
        ftsJob.addFile(ftsFile)
    self.ftsJob = ftsJob

def __submitFTSTransfer(self):
    """Create and submit the FTS job.

    :param self: self reference
    :return: S_OK on successful submission, the submit error otherwise
    """
    log = gLogger.getSubLogger('Submit')
    self.__createFTSJob()
    submit = self.ftsJob.submitFTS(self.ftsVersion, command=self.submitCommand)
    if not submit["OK"]:
        log.error("unable to submit FTSJob: %s" % submit["Message"])
        return submit
    # BUGFIX: this message used to be logged twice (before and after the
    # status-update loop); log it once.
    log.info("FTSJob '%s'@'%s' has been submitted" % (self.ftsJob.FTSGUID, self.ftsJob.FTSServer))
    # update statuses for job files
    # NOTE(review): Attempt is also incremented in __createFTSJob, so each
    # submission bumps it twice — confirm whether this double count is intended.
    for ftsFile in self.ftsJob:
        ftsFile.FTSGUID = self.ftsJob.FTSGUID
        ftsFile.Status = "Submitted"
        ftsFile.Attempt += 1
    self.ftsGUID = self.ftsJob.FTSGUID
    return S_OK()

def __resolveFTSServer(self):
    """Resolve the FTS server to use; it should be the closest one to the target SE.

    :param self: self reference
    :return: S_OK(serverURL) or S_ERROR
    """
    if self.ftsVersion.upper() == 'FTS2':
        from DIRAC.ConfigurationSystem.Client.Helpers.Resources import getFTS2ServersForSites
        if not self.targetSE:
            return S_ERROR("Target SE not set")
        res = getSitesForSE(self.targetSE)
        if not res['OK'] or not res['Value']:
            return S_ERROR("Could not determine target site")
        targetSites = res['Value']
        targetSite = ''
        for targetSite in targetSites:
            targetFTS = getFTS2ServersForSites([targetSite])
            if targetFTS['OK']:
                ftsTarget = targetFTS['Value'][targetSite]
                if ftsTarget:
                    self.ftsServer = ftsTarget
                    return S_OK(self.ftsServer)
            else:
                return targetFTS
        # BUGFIX: this error belongs to the FTS2 search loop; it was placed
        # after the version dispatch where it was unreachable for FTS2 and
        # raised a NameError (undefined targetSite) on the FTS3 path.
        return S_ERROR('No FTS server found for %s' % targetSite)
    elif self.ftsVersion.upper() == 'FTS3':
        from DIRAC.ConfigurationSystem.Client.Helpers.Resources import getFTS3Servers
        res = getFTS3Servers()
        if not res['OK']:
            return res
        ftsServerList = res['Value']
        if ftsServerList:
            # Here we take the first one, regardless of the policy...
            # Unclean but all this will disappear after refactoring the fts code
            self.ftsServer = ftsServerList[0]
            return S_OK(self.ftsServer)
        return S_ERROR('No FTS server found for %s' % self.targetSE)
    else:
        return S_ERROR('Unknown FTS version %s' % self.ftsVersion)

####################################################################
#
#  Methods for monitoring
#

def summary(self, untilTerminal=False, printOutput=False):
    """Summary of FTS job.

    :param self: self reference
    :param bool untilTerminal: flag to monitor FTS job to its final state
    :param bool printOutput: flag to print out monitoring information to the stdout
    """
    res = self.__isSummaryValid()
    if not res['OK']:
        return res
    while not self.isTerminal:
        res = self.__parseOutput(full=True)
        if not res['OK']:
            return res
        if untilTerminal:
            self.__print()
        self.isRequestTerminal()
        if res['Value'] or (not untilTerminal):
            break
        time.sleep(1)
    if untilTerminal:
        print("")
    if printOutput and (not untilTerminal):
        return self.dumpSummary(printOutput=printOutput)
    return S_OK()

def monitor(self, untilTerminal=False, printOutput=False, full=True):
    """Monitor FTS job.

    :param self: self reference
    :param bool untilTerminal: flag to monitor FTS job to its final state
    :param bool printOutput: flag to print out monitoring information to the stdout
    :param bool full: collect per-file information as well
    """
    if not self.ftsJob:
        self.resolveSource()
        self.__createFTSJob(self.ftsGUID)
    res = self.__isSummaryValid()
    if not res['OK']:
        return res
    if untilTerminal:
        res = self.summary(untilTerminal=untilTerminal, printOutput=printOutput)
        if not res['OK']:
            return res
    res = self.__parseOutput(full=full)
    if not res['OK']:
        return res
    if untilTerminal:
        self.finalize()
    if printOutput:
        self.dump()
    return res

def dumpSummary(self, printOutput=False):
    """Get FTS job summary as str.

    :param self: self reference
    :param bool printOutput: print summary to stdout
    """
    outStr = ''
    for status in sorted(self.statusSummary):
        if self.statusSummary[status]:
            outStr = '%s\t%-10s : %-10s\n' % (outStr, status, str(self.statusSummary[status]))
    outStr = outStr.rstrip('\n')
    if printOutput:
        print(outStr)
    return S_OK(outStr)

def __print(self):
    """Print progress bar of FTS job completeness to stdout.

    :param self: self reference
    """
    width = 100
    bits = int((width * self.percentageComplete) / 100)
    outStr = "|%s>%s| %.1f%s %s %s" % ("=" * bits, " " * (width - bits),
                                       self.percentageComplete, "%",
                                       self.requestStatus, " " * 10)
    sys.stdout.write("%s\r" % (outStr))
    sys.stdout.flush()

def dump(self):
    """Print FTS job parameters and files to stdout.

    :param self: self reference
    """
    print("%-10s : %-10s" % ("Status", self.requestStatus))
    print("%-10s : %-10s" % ("Source", self.sourceSE))
    print("%-10s : %-10s" % ("Target", self.targetSE))
    print("%-10s : %-128s" % ("Server", self.ftsServer))
    print("%-10s : %-128s" % ("GUID", self.ftsGUID))
    for lfn in sorted(self.fileDict):
        print("\n %-15s : %-128s" % ('LFN', lfn))
        for key in ['Source', 'Target', 'Status', 'Reason', 'Duration']:
            print(" %-15s : %-128s" % (key, str(self.fileDict[lfn].get(key))))
    return S_OK()

def __isSummaryValid(self):
    """Check validity of FTS job summary report.

    :param self: self reference
    """
    if not self.ftsServer:
        return S_ERROR("FTSServer not set")
    if not self.ftsGUID:
        return S_ERROR("FTSGUID not set")
    return S_OK()

def __parseOutput(self, full=False):
    """Monitor the FTS job and propagate its state into the file dict.

    :param self: self reference
    :param bool full: verbosity level; when set, collect per-file information as well
    """
    monitor = self.ftsJob.monitorFTS(self.ftsVersion, command=self.monitorCommand, full=full)
    if not monitor['OK']:
        return monitor
    self.percentageComplete = self.ftsJob.Completeness
    self.requestStatus = self.ftsJob.Status
    self.submitTime = self.ftsJob.SubmitTime
    statusSummary = monitor['Value']
    if statusSummary:
        for state in statusSummary:
            self.statusSummary[state] = statusSummary[state]
    self.transferTime = 0
    for ftsFile in self.ftsJob:
        lfn = ftsFile.LFN
        self.__setFileParameter(lfn, 'Status', ftsFile.Status)
        self.__setFileParameter(lfn, 'Reason', ftsFile.Error)
        self.__setFileParameter(lfn, 'Duration', ftsFile._duration)
        targetURL = self.__getFileParameter(lfn, 'Target')
        if not targetURL['OK']:
            self.__setFileParameter(lfn, 'Target', ftsFile.TargetSURL)
        sourceURL = self.__getFileParameter(lfn, 'Source')
        if not sourceURL['OK']:
            self.__setFileParameter(lfn, 'Source', ftsFile.SourceSURL)
        self.transferTime += int(ftsFile._duration)
    return S_OK()

####################################################################
#
#  Methods for finalization
#

def finalize(self):
    """Finalize FTS job: register transferred files and send accounting.

    :param self: self reference
    """
    self.__updateMetadataCache()
    transEndTime = dateTime()
    regStartTime = time.time()
    res = self.getTransferStatistics()
    transDict = res['Value']
    res = self.__registerSuccessful(transDict['transLFNs'])
    regSuc, regTotal = res['Value']
    regTime = time.time() - regStartTime
    if self.sourceSE and self.targetSE:
        self.__sendAccounting(regSuc, regTotal, regTime, transEndTime, transDict)
    return S_OK()

def getTransferStatistics(self):
    """Collect information on transfers that can be used by Accounting.

    :param self: self reference
    """
    transDict = {'transTotal': len(self.fileDict),
                 'transLFNs': [],
                 'transOK': 0,
                 'transSize': 0}
    for lfn in self.fileDict:
        if self.fileDict[lfn].get('Status') in self.successfulStates:
            if self.fileDict[lfn].get('Duration', 0):
                transDict['transLFNs'].append(lfn)
                transDict['transOK'] += 1
                if lfn in self.catalogMetadata:
                    transDict['transSize'] += self.catalogMetadata[lfn].get('Size', 0)
    return S_OK(transDict)

def getFailedRegistrations(self):
    """Get failed registrations dict.

    :param self: self reference
    """
    return S_OK(self.failedRegistrations)

def __registerSuccessful(self, transLFNs):
    """Register successfully transferred files to the catalogs.

    Fills ``self.failedRegistrations`` for files that failed to register.

    :param self: self reference
    :param list transLFNs: LFNs in FTS job
    :return: S_OK((registered, attempted))
    """
    self.failedRegistrations = {}
    toRegister = {}
    for lfn in transLFNs:
        res = returnSingleResult(self.oTargetSE.getURL(self.fileDict[lfn].get('Target'), protocol='srm'))
        if not res['OK']:
            self.__setFileParameter(lfn, 'Reason', res['Message'])
            self.__setFileParameter(lfn, 'Status', 'Failed')
        else:
            toRegister[lfn] = {'PFN': res['Value'], 'SE': self.targetSE}
    if not toRegister:
        return S_OK((0, 0))
    res = self.__getCatalogObject()
    if not res['OK']:
        # BUGFIX: previous code re-assigned this dict once per LFN in a
        # pointless loop; a single assignment is equivalent.
        self.failedRegistrations = toRegister
        self.log.error('Failed to get Catalog Object', res['Message'])
        return S_OK((0, len(toRegister)))
    res = self.oCatalog.addReplica(toRegister)
    if not res['OK']:
        self.failedRegistrations = toRegister
        # BUGFIX: message was copy-pasted from the branch above and wrongly
        # said 'Failed to get Catalog Object'.
        self.log.error('Failed to add replicas to Catalog', res['Message'])
        return S_OK((0, len(toRegister)))
    for lfn, error in res['Value']['Failed'].items():
        self.failedRegistrations[lfn] = toRegister[lfn]
        self.log.error('Registration of Replica failed', '%s : %s' % (lfn, str(error)))
    return S_OK((len(res['Value']['Successful']), len(toRegister)))

def __sendAccounting(self, regSuc, regTotal, regTime, transEndTime, transDict):
    """Send accounting record.

    :param self: self reference
    :param regSuc: number of files successfully registered
    :param regTotal: number of files attempted to register
    :param regTime: duration of the registration step
    :param transEndTime: time stamp at the end of FTS job
    :param dict transDict: dict holding counters for files being transferred, their sizes and successful transfers
    """
    oAccounting = DataOperation()
    oAccounting.setEndTime(transEndTime)
    oAccounting.setStartTime(self.submitTime)
    accountingDict = {}
    accountingDict['OperationType'] = 'replicateAndRegister'
    result = getProxyInfo()
    if not result['OK']:
        userName = '******'
    else:
        userName = result['Value'].get('username', 'unknown')
    accountingDict['User'] = userName
    accountingDict['Protocol'] = 'FTS' if 'fts3' not in self.ftsServer else 'FTS3'
    accountingDict['RegistrationTime'] = regTime
    accountingDict['RegistrationOK'] = regSuc
    accountingDict['RegistrationTotal'] = regTotal
    accountingDict['TransferOK'] = transDict['transOK']
    accountingDict['TransferTotal'] = transDict['transTotal']
    accountingDict['TransferSize'] = transDict['transSize']
    accountingDict['FinalStatus'] = self.requestStatus
    accountingDict['Source'] = self.sourceSE
    accountingDict['Destination'] = self.targetSE
    accountingDict['TransferTime'] = self.transferTime
    oAccounting.setValuesFromDict(accountingDict)
    self.log.verbose("Attempting to commit accounting message...")
    oAccounting.commit()
    self.log.verbose("...committed.")
    return S_OK()
def main():
    """Create 'ReplicateAndRegister' request(s) for the supplied LFNs and target SEs."""
    catalog = None
    Script.registerSwitch("C:", "Catalog=", "Catalog to use")
    # Registered arguments are automatically described in the help menu.
    Script.registerArgument(" requestName: a request name")
    Script.registerArgument(" LFNs: single LFN or file with LFNs")
    Script.registerArgument(["targetSE: target SE"])
    Script.parseCommandLine()

    for switchName, switchValue in Script.getUnprocessedSwitches():
        if switchName == "C" or switchName.lower() == "catalog":
            catalog = switchValue

    args = Script.getPositionalArgs()
    requestName = None
    targetSEs = None
    if len(args) < 3:
        Script.showHelp(exitCode=1)
    requestName = args[0]
    lfnList = getLFNList(args[1])
    # Flatten possibly comma-separated SE arguments and drop duplicates.
    targetSEs = list({se for seGroup in args[2:] for se in seGroup.split(",")})

    gLogger.info("Will create request '%s' with 'ReplicateAndRegister' "
                 "operation using %s lfns and %s target SEs"
                 % (requestName, len(lfnList), len(targetSEs)))

    from DIRAC.RequestManagementSystem.Client.Request import Request
    from DIRAC.RequestManagementSystem.Client.Operation import Operation
    from DIRAC.RequestManagementSystem.Client.File import File
    from DIRAC.RequestManagementSystem.Client.ReqClient import ReqClient
    from DIRAC.Resources.Catalog.FileCatalog import FileCatalog
    from DIRAC.Core.Utilities.List import breakListIntoChunks

    lfnChunks = breakListIntoChunks(lfnList, 100)
    multiRequests = len(lfnChunks) > 1

    exitCode = 0
    chunkNumber = 0
    reqClient = ReqClient()
    fc = FileCatalog()
    requestIDs = []

    for lfnChunk in lfnChunks:
        metaDatas = fc.getFileMetadata(lfnChunk)
        if not metaDatas["OK"]:
            gLogger.error("unable to read metadata for lfns: %s" % metaDatas["Message"])
            exitCode = -1
            continue
        metaDatas = metaDatas["Value"]
        for failedLFN, reason in metaDatas["Failed"].items():
            gLogger.error("skipping %s: %s" % (failedLFN, reason))
        lfnChunk = set(metaDatas["Successful"])
        if not lfnChunk:
            gLogger.error("LFN list is empty!!!")
            exitCode = -1
            continue
        if len(lfnChunk) > Operation.MAX_FILES:
            gLogger.error("too many LFNs, max number of files per operation is %s" % Operation.MAX_FILES)
            exitCode = -1
            continue

        chunkNumber += 1
        request = Request()
        request.RequestName = requestName if not multiRequests else "%s_%d" % (requestName, chunkNumber)

        replicateAndRegister = Operation()
        replicateAndRegister.Type = "ReplicateAndRegister"
        replicateAndRegister.TargetSE = ",".join(targetSEs)
        if catalog is not None:
            replicateAndRegister.Catalog = catalog

        for lfn in lfnChunk:
            metaDict = metaDatas["Successful"][lfn]
            opFile = File()
            opFile.LFN = lfn
            opFile.Size = metaDict["Size"]
            if "Checksum" in metaDict:
                # Checksum type should really be checked; Adler32 is assumed here.
                opFile.Checksum = metaDict["Checksum"]
                opFile.ChecksumType = "ADLER32"
            replicateAndRegister.addFile(opFile)

        request.addOperation(replicateAndRegister)

        putRequest = reqClient.putRequest(request)
        if not putRequest["OK"]:
            gLogger.error("unable to put request '%s': %s" % (request.RequestName, putRequest["Message"]))
            exitCode = -1
            continue
        requestIDs.append(str(putRequest["Value"]))
        if not multiRequests:
            gLogger.always("Request '%s' has been put to ReqDB for execution." % request.RequestName)

    if multiRequests:
        gLogger.always("%d requests have been put to ReqDB for execution, with name %s_<num>"
                       % (chunkNumber, requestName))
    if requestIDs:
        gLogger.always("RequestID(s): %s" % " ".join(requestIDs))
    gLogger.always("You can monitor requests' status using command: 'dirac-rms-request <requestName/ID>'")
    DIRAC.exit(exitCode)
def main():
    """Create 'MoveReplica' request(s) for the supplied LFNs."""
    # Registered arguments are automatically described in the help menu.
    Script.registerArgument(" sourceSE: source SE")
    Script.registerArgument(" LFN: LFN or file containing a List of LFNs")
    Script.registerArgument(["targetSE: target SEs"])
    Script.parseCommandLine()

    import DIRAC
    from DIRAC import gLogger

    # parseCommandLine already shows help when mandatory arguments are missing or wrong.
    args = Script.getPositionalArgs()
    sourceSE = args[0]
    lfnList = getLFNList(args[1])
    # Flatten possibly comma-separated SE arguments and drop duplicates.
    targetSEs = list({se for seGroup in args[2:] for se in seGroup.split(",")})

    gLogger.info(
        "Will create request with 'MoveReplica' "
        "operation using %s lfns and %s target SEs" % (len(lfnList), len(targetSEs))
    )

    from DIRAC.RequestManagementSystem.Client.ReqClient import ReqClient
    from DIRAC.RequestManagementSystem.Client.Request import Request
    from DIRAC.RequestManagementSystem.Client.Operation import Operation
    from DIRAC.RequestManagementSystem.Client.File import File
    from DIRAC.Resources.Catalog.FileCatalog import FileCatalog
    from DIRAC.Core.Utilities.List import breakListIntoChunks

    lfnChunks = breakListIntoChunks(lfnList, 100)
    multiRequests = len(lfnChunks) > 1

    exitCode = 0
    submittedCount = 0
    reqClient = ReqClient()
    fc = FileCatalog()

    for lfnChunk in lfnChunks:
        metaDatas = fc.getFileMetadata(lfnChunk)
        if not metaDatas["OK"]:
            gLogger.error("unable to read metadata for lfns: %s" % metaDatas["Message"])
            exitCode = -1
            continue
        metaDatas = metaDatas["Value"]
        for failedLFN, reason in metaDatas["Failed"].items():
            gLogger.error("skipping %s: %s" % (failedLFN, reason))
        lfnChunk = set(metaDatas["Successful"])
        if not lfnChunk:
            gLogger.error("LFN list is empty!!!")
            exitCode = -1
            continue
        if len(lfnChunk) > Operation.MAX_FILES:
            gLogger.error("too many LFNs, max number of files per operation is %s" % Operation.MAX_FILES)
            exitCode = -1
            continue

        submittedCount += 1
        request = Request()
        # Random-looking, time-derived request name.
        request.RequestName = "%s_%s" % (
            md5(repr(time.time()).encode()).hexdigest()[:16],
            md5(repr(time.time()).encode()).hexdigest()[:16],
        )

        moveReplica = Operation()
        moveReplica.Type = "MoveReplica"
        moveReplica.SourceSE = sourceSE
        moveReplica.TargetSE = ",".join(targetSEs)

        for lfn in lfnChunk:
            metaDict = metaDatas["Successful"][lfn]
            opFile = File()
            opFile.LFN = lfn
            opFile.Size = metaDict["Size"]
            if "Checksum" in metaDict:
                # Checksum type should really be checked; Adler32 is assumed here.
                opFile.Checksum = metaDict["Checksum"]
                opFile.ChecksumType = "ADLER32"
            moveReplica.addFile(opFile)

        request.addOperation(moveReplica)

        result = reqClient.putRequest(request)
        if not result["OK"]:
            gLogger.error("Failed to submit Request: %s" % (result["Message"]))
            exitCode = -1
            continue
        if not multiRequests:
            gLogger.always("Request %d submitted successfully" % result["Value"])

    if multiRequests:
        gLogger.always("%d requests have been submitted" % (submittedCount))
    DIRAC.exit(exitCode)
class DataIntegrityClient(Client):

  """
  The following methods are supported in the service but are not mentioned explicitly here:

          getProblematic()
             Obtains a problematic file from the IntegrityDB based on the LastUpdate time

          getPrognosisProblematics(prognosis)
            Obtains all the problematics of a particular prognosis from the integrityDB

          getProblematicsSummary()
            Obtains a count of the number of problematics for each prognosis found

          getDistinctPrognosis()
            Obtains the distinct prognosis found in the integrityDB

          getTransformationProblematics(prodID)
            Obtains the problematics for a given production

          incrementProblematicRetry(fileID)
            Increments the retry count for the supplied file ID

          changeProblematicPrognosis(fileID,newPrognosis)
            Changes the prognosis of the supplied file to the new prognosis

          setProblematicStatus(fileID,status)
            Updates the status of a problematic in the integrityDB

          removeProblematic(self,fileID)
            This removes the specified file ID from the integrity DB

          insertProblematic(sourceComponent,fileMetadata)
            Inserts file with supplied metadata into the integrity DB
  """

  def __init__(self, **kwargs):
    """Construct the client and point it at the DataIntegrity service."""
    Client.__init__(self, **kwargs)
    self.setServer('DataManagement/DataIntegrity')
    self.dm = DataManager()
    self.fc = FileCatalog()

  ##########################################################################
  #
  # This section contains the specific methods for LFC->SE checks
  #

  def catalogDirectoryToSE(self, lfnDir):
    """Obtain replica and metadata information from the catalog for the
    supplied directory and check it against the storage elements.
    """
    gLogger.info("-" * 40)
    gLogger.info("Performing the LFC->SE check")
    gLogger.info("-" * 40)
    if type(lfnDir) in types.StringTypes:
      lfnDir = [lfnDir]
    res = self.__getCatalogDirectoryContents(lfnDir)
    if not res['OK']:
      return res
    replicas = res['Value']['Replicas']
    catalogMetadata = res['Value']['Metadata']
    res = self.__checkPhysicalFiles(replicas, catalogMetadata)
    if not res['OK']:
      return res
    resDict = {'CatalogMetadata': catalogMetadata, 'CatalogReplicas': replicas}
    return S_OK(resDict)

  def catalogFileToSE(self, lfns):
    """Obtain replica and metadata information from the catalog for the
    supplied LFNs and check it against the storage elements.
    """
    gLogger.info("-" * 40)
    gLogger.info("Performing the LFC->SE check")
    gLogger.info("-" * 40)
    if type(lfns) in types.StringTypes:
      lfns = [lfns]
    res = self.__getCatalogMetadata(lfns)
    if not res['OK']:
      return res
    catalogMetadata = res['Value']
    res = self.__getCatalogReplicas(catalogMetadata.keys())
    if not res['OK']:
      return res
    replicas = res['Value']
    res = self.__checkPhysicalFiles(replicas, catalogMetadata)
    if not res['OK']:
      return res
    resDict = {'CatalogMetadata': catalogMetadata, 'CatalogReplicas': replicas}
    return S_OK(resDict)

  def checkPhysicalFiles(self, replicas, catalogMetadata, ses=None):
    """Take the supplied replica and metadata information obtained from the
    catalog and check it against the storage elements.

    :param ses: optional list of SE names to restrict the check to
    """
    # BUGFIX: 'ses' used a mutable default argument ([]); use None instead.
    gLogger.info("-" * 40)
    gLogger.info("Performing the LFC->SE check")
    gLogger.info("-" * 40)
    return self.__checkPhysicalFiles(replicas, catalogMetadata, ses=ses)

  def __checkPhysicalFiles(self, replicas, catalogMetadata, ses=None):
    """Obtain the physical file metadata and check it against the catalog entries."""
    seLfns = {}
    for lfn, replicaDict in replicas.items():
      for se, _url in replicaDict.items():
        # Restrict to the requested SEs, when given.
        if ses and (se not in ses):
          continue
        seLfns.setdefault(se, []).append(lfn)
    gLogger.info('%s %s' % ('Storage Element'.ljust(20), 'Replicas'.rjust(20)))
    for se in sortList(seLfns):
      files = len(seLfns[se])
      gLogger.info('%s %s' % (se.ljust(20), str(files).rjust(20)))
      lfns = seLfns[se]
      sizeMismatch = []
      res = self.__checkPhysicalFileMetadata(lfns, se)
      if not res['OK']:
        gLogger.error('Failed to get physical file metadata.', res['Message'])
        return res
      for lfn, metadata in res['Value'].items():
        if lfn in catalogMetadata:
          # A zero storage size is treated as "unknown", not as a mismatch.
          if (metadata['Size'] != catalogMetadata[lfn]['Size']) and (metadata['Size'] != 0):
            sizeMismatch.append((lfn, 'deprecatedUrl', se, 'CatalogPFNSizeMismatch'))
      if sizeMismatch:
        self.__reportProblematicReplicas(sizeMismatch, se, 'CatalogPFNSizeMismatch')
    return S_OK()

  def __checkPhysicalFileMetadata(self, lfns, se):
    """Obtain the physical file metadata and check the files are available."""
    gLogger.info('Checking the integrity of %s physical files at %s' % (len(lfns), se))
    res = StorageElement(se).getFileMetadata(lfns)
    if not res['OK']:
      gLogger.error('Failed to get metadata for lfns.', res['Message'])
      return res
    lfnMetadataDict = res['Value']['Successful']
    # If the replicas are completely missing
    missingReplicas = []
    for lfn, reason in res['Value']['Failed'].items():
      if re.search('File does not exist', reason):
        missingReplicas.append((lfn, 'deprecatedUrl', se, 'PFNMissing'))
    if missingReplicas:
      self.__reportProblematicReplicas(missingReplicas, se, 'PFNMissing')
    lostReplicas = []
    unavailableReplicas = []
    zeroSizeReplicas = []
    # If the files are not accessible
    for lfn, lfnMetadata in lfnMetadataDict.items():
      if lfnMetadata['Lost']:
        lostReplicas.append((lfn, 'deprecatedUrl', se, 'PFNLost'))
      if lfnMetadata['Unavailable']:
        unavailableReplicas.append((lfn, 'deprecatedUrl', se, 'PFNUnavailable'))
      if lfnMetadata['Size'] == 0:
        zeroSizeReplicas.append((lfn, 'deprecatedUrl', se, 'PFNZeroSize'))
    if lostReplicas:
      self.__reportProblematicReplicas(lostReplicas, se, 'PFNLost')
    if unavailableReplicas:
      self.__reportProblematicReplicas(unavailableReplicas, se, 'PFNUnavailable')
    if zeroSizeReplicas:
      self.__reportProblematicReplicas(zeroSizeReplicas, se, 'PFNZeroSize')
    gLogger.info('Checking the integrity of physical files at %s complete' % se)
    return S_OK(lfnMetadataDict)

  ##########################################################################
  #
  # This section contains the specific methods for SE->LFC checks
  #

  def storageDirectoryToCatalog(self, lfnDir, storageElement):
    """Obtain the files found on the storage element in the supplied
    directories, determine whether they exist in the catalog and check
    their metadata elements.
    """
    gLogger.info("-" * 40)
    gLogger.info("Performing the SE->LFC check at %s" % storageElement)
    gLogger.info("-" * 40)
    if type(lfnDir) in types.StringTypes:
      lfnDir = [lfnDir]
    res = self.__getStorageDirectoryContents(lfnDir, storageElement)
    if not res['OK']:
      return res
    storageFileMetadata = res['Value']
    if storageFileMetadata:
      return self.__checkCatalogForSEFiles(storageFileMetadata, storageElement)
    return S_OK({'CatalogMetadata': {}, 'StorageMetadata': {}})

  def __checkCatalogForSEFiles(self, storageMetadata, storageElement):
    """Verify that the supplied storage files are registered in the catalog
    and that their catalog metadata matches the storage metadata.
    """
    gLogger.info('Checking %s storage files exist in the catalog' % len(storageMetadata))
    res = self.fc.getReplicas(storageMetadata)
    if not res['OK']:
      gLogger.error("Failed to get replicas for LFN", res['Message'])
      return res
    failedLfns = res['Value']['Failed']
    successfulLfns = res['Value']['Successful']
    notRegisteredLfns = []
    for lfn in storageMetadata:
      if lfn in failedLfns:
        if 'No such file or directory' in failedLfns[lfn]:
          notRegisteredLfns.append((lfn, 'deprecatedUrl', storageElement, 'LFNNotRegistered'))
          failedLfns.pop(lfn)
      elif storageElement not in successfulLfns[lfn]:
        notRegisteredLfns.append((lfn, 'deprecatedUrl', storageElement, 'LFNNotRegistered'))
    if notRegisteredLfns:
      self.__reportProblematicReplicas(notRegisteredLfns, storageElement, 'LFNNotRegistered')
    if failedLfns:
      return S_ERROR('Failed to obtain replicas')
    # For the LFNs found to be registered obtain the file metadata from the
    # catalog and verify against the storage metadata
    res = self.__getCatalogMetadata(storageMetadata)
    if not res['OK']:
      return res
    catalogMetadata = res['Value']
    sizeMismatch = []
    for lfn, lfnCatalogMetadata in catalogMetadata.items():
      lfnStorageMetadata = storageMetadata[lfn]
      if (lfnStorageMetadata['Size'] != lfnCatalogMetadata['Size']) and (lfnStorageMetadata['Size'] != 0):
        sizeMismatch.append((lfn, 'deprecatedUrl', storageElement, 'CatalogPFNSizeMismatch'))
    if sizeMismatch:
      self.__reportProblematicReplicas(sizeMismatch, storageElement, 'CatalogPFNSizeMismatch')
    gLogger.info('Checking storage files exist in the catalog complete')
    resDict = {'CatalogMetadata': catalogMetadata, 'StorageMetadata': storageMetadata}
    return S_OK(resDict)

  def getStorageDirectoryContents(self, lfnDir, storageElement):
    """Take the supplied LFN directories and recursively obtain the files
    in the supplied storage element.
    """
    return self.__getStorageDirectoryContents(lfnDir, storageElement)

  def __getStorageDirectoryContents(self, lfnDir, storageElement):
    """Obtain the contents of the supplied directory on the storage."""
    gLogger.info('Obtaining the contents for %s directories at %s' % (len(lfnDir), storageElement))
    se = StorageElement(storageElement)
    res = se.exists(lfnDir)
    if not res['OK']:
      gLogger.error("Failed to obtain existance of directories", res['Message'])
      return res
    for directory, error in res['Value']['Failed'].items():
      gLogger.error('Failed to determine existance of directory', '%s %s' % (directory, error))
    if res['Value']['Failed']:
      return S_ERROR('Failed to determine existance of directory')
    directoryExists = res['Value']['Successful']
    activeDirs = []
    for directory in sorted(directoryExists):
      exists = directoryExists[directory]
      if exists:
        activeDirs.append(directory)
    allFiles = {}
    # Breadth-first walk over the storage directory tree.
    while len(activeDirs) > 0:
      currentDir = activeDirs[0]
      res = se.listDirectory(currentDir)
      activeDirs.remove(currentDir)
      if not res['OK']:
        gLogger.error('Failed to get directory contents', res['Message'])
        return res
      elif currentDir in res['Value']['Failed']:
        gLogger.error('Failed to get directory contents',
                      '%s %s' % (currentDir, res['Value']['Failed'][currentDir]))
        return S_ERROR(res['Value']['Failed'][currentDir])
      else:
        dirContents = res['Value']['Successful'][currentDir]
        activeDirs.extend(se.getLFNFromURL(dirContents['SubDirs']).get('Value', {}).get('Successful', []))
        fileURLMetadata = dirContents['Files']
        fileMetadata = {}
        res = se.getLFNFromURL(fileURLMetadata)
        if not res['OK']:
          gLogger.error('Failed to get directory content LFNs', res['Message'])
          return res
        for url, error in res['Value']['Failed'].items():
          gLogger.error("Failed to get LFN for URL", "%s %s" % (url, error))
        if res['Value']['Failed']:
          return S_ERROR("Failed to get LFNs for PFNs")
        urlLfns = res['Value']['Successful']
        for urlLfn, lfn in urlLfns.items():
          fileMetadata[lfn] = fileURLMetadata[urlLfn]
        allFiles.update(fileMetadata)
    zeroSizeFiles = []
    # sorted() makes a copy, so popping from allFiles while iterating is safe.
    for lfn in sorted(allFiles):
      if os.path.basename(lfn) == 'dirac_directory':
        allFiles.pop(lfn)
      else:
        metadata = allFiles[lfn]
        if metadata['Size'] == 0:
          zeroSizeFiles.append((lfn, 'deprecatedUrl', storageElement, 'PFNZeroSize'))
    if zeroSizeFiles:
      self.__reportProblematicReplicas(zeroSizeFiles, storageElement, 'PFNZeroSize')
    gLogger.info('Obtained at total of %s files for directories at %s' % (len(allFiles), storageElement))
    return S_OK(allFiles)

  def __getStoragePathExists(self, lfnPaths, storageElement):
    """Return a dict of the supplied paths that exist on the storage element."""
    gLogger.info('Determining the existance of %d files at %s' % (len(lfnPaths), storageElement))
    se = StorageElement(storageElement)
    res = se.exists(lfnPaths)
    if not res['OK']:
      gLogger.error("Failed to obtain existance of paths", res['Message'])
      return res
    for lfnPath, error in res['Value']['Failed'].items():
      gLogger.error('Failed to determine existance of path', '%s %s' % (lfnPath, error))
    if res['Value']['Failed']:
      return S_ERROR('Failed to determine existance of paths')
    pathExists = res['Value']['Successful']
    resDict = {}
    for lfn, exists in pathExists.items():
      if exists:
        resDict[lfn] = True
    return S_OK(resDict)

  ##########################################################################
  #
  # This section contains the specific methods for obtaining replica and metadata information from the catalog
  #

  def __getCatalogDirectoryContents(self, lfnDir):
    """Obtain the contents of the supplied directory from the catalog."""
    gLogger.info('Obtaining the catalog contents for %s directories' % len(lfnDir))
    activeDirs = lfnDir
    allFiles = {}
    while len(activeDirs) > 0:
      currentDir = activeDirs[0]
      res = self.fc.listDirectory(currentDir)
      activeDirs.remove(currentDir)
      if not res['OK']:
        gLogger.error('Failed to get directory contents', res['Message'])
        return res
      # BUGFIX: dict.has_key() was removed in Python 3; use the 'in' operator.
      elif currentDir in res['Value']['Failed']:
        gLogger.error('Failed to get directory contents',
                      '%s %s' % (currentDir, res['Value']['Failed'][currentDir]))
      else:
        dirContents = res['Value']['Successful'][currentDir]
        activeDirs.extend(dirContents['SubDirs'])
        allFiles.update(dirContents['Files'])
    zeroReplicaFiles = []
    zeroSizeFiles = []
    allReplicaDict = {}
    allMetadataDict = {}
    for lfn, lfnDict in allFiles.items():
      lfnReplicas = {}
      for se, replicaDict in lfnDict['Replicas'].items():
        lfnReplicas[se] = replicaDict['PFN']
      if not lfnReplicas:
        zeroReplicaFiles.append(lfn)
      allReplicaDict[lfn] = lfnReplicas
      allMetadataDict[lfn] = lfnDict['MetaData']
      if lfnDict['MetaData']['Size'] == 0:
        zeroSizeFiles.append(lfn)
    if zeroReplicaFiles:
      self.__reportProblematicFiles(zeroReplicaFiles, 'LFNZeroReplicas')
    if zeroSizeFiles:
      self.__reportProblematicFiles(zeroSizeFiles, 'LFNZeroSize')
    gLogger.info('Obtained at total of %s files for the supplied directories' % len(allMetadataDict))
    resDict = {'Metadata': allMetadataDict, 'Replicas': allReplicaDict}
    return S_OK(resDict)

  def __getCatalogReplicas(self, lfns):
    """Obtain the file replicas from the catalog while checking that there are replicas."""
    gLogger.info('Obtaining the replicas for %s files' % len(lfns))
    zeroReplicaFiles = []
    res = self.fc.getReplicas(lfns, allStatus=True)
    if not res['OK']:
      gLogger.error('Failed to get catalog replicas', res['Message'])
      return res
    allReplicas = res['Value']['Successful']
    for lfn, error in res['Value']['Failed'].items():
      if re.search('File has zero replicas', error):
        zeroReplicaFiles.append(lfn)
    if zeroReplicaFiles:
      self.__reportProblematicFiles(zeroReplicaFiles, 'LFNZeroReplicas')
    gLogger.info('Obtaining the replicas for files complete')
    return S_OK(allReplicas)

  def __getCatalogMetadata(self, lfns):
    """Obtain the file metadata from the catalog while checking the files exist."""
    if not lfns:
      return S_OK({})
    gLogger.info('Obtaining the catalog metadata for %s files' % len(lfns))
    missingCatalogFiles = []
    zeroSizeFiles = []
    res = self.fc.getFileMetadata(lfns)
    if not res['OK']:
      gLogger.error('Failed to get catalog metadata', res['Message'])
      return res
    allMetadata = res['Value']['Successful']
    for lfn, error in res['Value']['Failed'].items():
      if re.search('No such file or directory', error):
        missingCatalogFiles.append(lfn)
    if missingCatalogFiles:
      self.__reportProblematicFiles(missingCatalogFiles, 'LFNCatalogMissing')
    for lfn, metadata in allMetadata.items():
      if metadata['Size'] == 0:
        zeroSizeFiles.append(lfn)
    if zeroSizeFiles:
      self.__reportProblematicFiles(zeroSizeFiles, 'LFNZeroSize')
    gLogger.info('Obtaining the catalog metadata complete')
    return S_OK(allMetadata)

  ##########################################################################
  #
  # This section contains the methods for inserting problematic files into the integrity DB
  #

  def __reportProblematicFiles(self, lfns, reason):
    """Simple wrapper function around setFileProblematic."""
    gLogger.info('The following %s files were found with %s' % (len(lfns), reason))
    for lfn in sortList(lfns):
      gLogger.info(lfn)
    res = self.setFileProblematic(lfns, reason, sourceComponent='DataIntegrityClient')
    if not res['OK']:
      gLogger.info('Failed to update integrity DB with files', res['Message'])
    else:
      gLogger.info('Successfully updated integrity DB with files')

  def setFileProblematic(self, lfn, reason, sourceComponent=''):
    """Update the status of the file in the FileCatalog and the IntegrityDB.

    :param lfn: the lfn (or list of lfns) of the file(s)
    :param reason: given to the integrity DB; should reflect the problem observed with the file
    :param sourceComponent: the component issuing the request
    """
    if type(lfn) == types.ListType:
      lfns = lfn
    elif type(lfn) == types.StringType:
      lfns = [lfn]
    else:
      errStr = "DataIntegrityClient.setFileProblematic: Supplied file info must be list or a single LFN."
      gLogger.error(errStr)
      return S_ERROR(errStr)
    gLogger.info("DataIntegrityClient.setFileProblematic: Attempting to update %s files." % len(lfns))
    fileMetadata = {}
    for lfn in lfns:
      fileMetadata[lfn] = {'Prognosis': reason, 'LFN': lfn, 'PFN': '', 'SE': ''}
    res = self.insertProblematic(sourceComponent, fileMetadata)
    if not res['OK']:
      gLogger.error("DataIntegrityClient.setReplicaProblematic: Failed to insert problematics to integrity DB")
    return res

  def __reportProblematicReplicas(self, replicaTuple, se, reason):
    """Simple wrapper function around setReplicaProblematic."""
    gLogger.info('The following %s files had %s at %s' % (len(replicaTuple), reason, se))
    for lfn, _pfn, se, reason in sortList(replicaTuple):
      if lfn:
        gLogger.info(lfn)
    res = self.setReplicaProblematic(replicaTuple, sourceComponent='DataIntegrityClient')
    if not res['OK']:
      gLogger.info('Failed to update integrity DB with replicas', res['Message'])
    else:
      gLogger.info('Successfully updated integrity DB with replicas')

  def setReplicaProblematic(self, replicaTuple, sourceComponent=''):
    """Update the status of the replica in the FileCatalog and the IntegrityDB.

    The supplied replica tuples should be of the form (lfn, pfn, se, prognosis):

    :param replicaTuple: one tuple, or a list of tuples, where
                         lfn is the lfn of the file,
                         pfn is the pfn if available (otherwise ''),
                         se is the storage element of the problematic replica (otherwise ''),
                         prognosis is given to the integrity DB and should reflect the problem
    :param sourceComponent: the component issuing the request
    """
    if type(replicaTuple) == types.TupleType:
      replicaTuple = [replicaTuple]
    elif type(replicaTuple) == types.ListType:
      pass
    else:
      errStr = "DataIntegrityClient.setReplicaProblematic: Supplied replica info must be a tuple or list of tuples."
      gLogger.error(errStr)
      return S_ERROR(errStr)
    gLogger.info("DataIntegrityClient.setReplicaProblematic: Attempting to update %s replicas." % len(replicaTuple))
    replicaDict = {}
    for lfn, pfn, se, reason in replicaTuple:
      replicaDict[lfn] = {'Prognosis': reason, 'LFN': lfn, 'PFN': pfn, 'SE': se}
    res = self.insertProblematic(sourceComponent, replicaDict)
    if not res['OK']:
      gLogger.error("DataIntegrityClient.setReplicaProblematic: Failed to insert problematic to integrity DB")
      return res
    for lfn in replicaDict.keys():
      replicaDict[lfn]['Status'] = 'Problematic'
    res = self.fc.setReplicaStatus(replicaDict)
    if not res['OK']:
      errStr = "DataIntegrityClient.setReplicaProblematic: Completely failed to update replicas."
      gLogger.error(errStr, res['Message'])
      return res
    failed = res['Value']['Failed']
    successful = res['Value']['Successful']
    resDict = {'Successful': successful, 'Failed': failed}
    return S_OK(resDict)

  ##########################################################################
  #
  # This section contains the resolution methods for various prognoses
  #

  def __updateCompletedFiles(self, prognosis, fileID):
    """Mark a problematic file as resolved in the integrity DB."""
    gLogger.info("%s file (%d) is resolved" % (prognosis, fileID))
    return self.setProblematicStatus(fileID, 'Resolved')

  def __returnProblematicError(self, fileID, res):
    """Increment the retry count for the file and propagate the error result."""
    self.incrementProblematicRetry(fileID)
    gLogger.error('DataIntegrityClient failure', res['Message'])
    return res
__updateReplicaToChecked(self, problematicDict): lfn = problematicDict['LFN'] fileID = problematicDict['FileID'] prognosis = problematicDict['Prognosis'] problematicDict['Status'] = 'Checked' res = returnSingleResult( self.fc.setReplicaStatus({lfn: problematicDict})) if not res['OK']: return self.__returnProblematicError(fileID, res) gLogger.info("%s replica (%d) is updated to Checked status" % (prognosis, fileID)) return self.__updateCompletedFiles(prognosis, fileID) def resolveCatalogPFNSizeMismatch(self, problematicDict): """ This takes the problematic dictionary returned by the integrity DB and resolved the CatalogPFNSizeMismatch prognosis """ lfn = problematicDict['LFN'] se = problematicDict['SE'] fileID = problematicDict['FileID'] res = returnSingleResult(self.fc.getFileSize(lfn)) if not res['OK']: return self.__returnProblematicError(fileID, res) catalogSize = res['Value'] res = returnSingleResult(StorageElement(se).getFileSize(lfn)) if not res['OK']: return self.__returnProblematicError(fileID, res) storageSize = res['Value'] bkKCatalog = FileCatalog(['BookkeepingDB']) res = returnSingleResult(bkKCatalog.getFileSize(lfn)) if not res['OK']: return self.__returnProblematicError(fileID, res) bookkeepingSize = res['Value'] if bookkeepingSize == catalogSize == storageSize: gLogger.info( "CatalogPFNSizeMismatch replica (%d) matched all registered sizes." % fileID) return self.__updateReplicaToChecked(problematicDict) if (catalogSize == bookkeepingSize): gLogger.info( "CatalogPFNSizeMismatch replica (%d) found to mismatch the bookkeeping also" % fileID) res = returnSingleResult(self.fc.getReplicas(lfn)) if not res['OK']: return self.__returnProblematicError(fileID, res) if len(res['Value']) <= 1: gLogger.info( "CatalogPFNSizeMismatch replica (%d) has no other replicas." % fileID) return S_ERROR( "Not removing catalog file mismatch since the only replica" ) else: gLogger.info( "CatalogPFNSizeMismatch replica (%d) has other replicas. Removing..." 
% fileID) res = self.dm.removeReplica(se, lfn) if not res['OK']: return self.__returnProblematicError(fileID, res) return self.__updateCompletedFiles('CatalogPFNSizeMismatch', fileID) if (catalogSize != bookkeepingSize) and (bookkeepingSize == storageSize): gLogger.info( "CatalogPFNSizeMismatch replica (%d) found to match the bookkeeping size" % fileID) res = self.__updateReplicaToChecked(problematicDict) if not res['OK']: return self.__returnProblematicError(fileID, res) return self.changeProblematicPrognosis(fileID, 'BKCatalogSizeMismatch') gLogger.info( "CatalogPFNSizeMismatch replica (%d) all sizes found mismatch. Updating retry count" % fileID) return self.incrementProblematicRetry(fileID) def resolvePFNNotRegistered(self, problematicDict): """ This takes the problematic dictionary returned by the integrity DB and resolved the PFNNotRegistered prognosis """ lfn = problematicDict['LFN'] seName = problematicDict['SE'] fileID = problematicDict['FileID'] se = StorageElement(seName) res = returnSingleResult(self.fc.exists(lfn)) if not res['OK']: return self.__returnProblematicError(fileID, res) if not res['Value']: # The file does not exist in the catalog res = returnSingleResult(se.removeFile(lfn)) if not res['OK']: return self.__returnProblematicError(fileID, res) return self.__updateCompletedFiles('PFNNotRegistered', fileID) res = returnSingleResult(se.getFileMetadata(lfn)) if (not res['OK']) and (re.search('File does not exist', res['Message'])): gLogger.info("PFNNotRegistered replica (%d) found to be missing." % fileID) return self.__updateCompletedFiles('PFNNotRegistered', fileID) elif not res['OK']: return self.__returnProblematicError(fileID, res) storageMetadata = res['Value'] if storageMetadata['Lost']: gLogger.info( "PFNNotRegistered replica (%d) found to be Lost. Updating prognosis" % fileID) return self.changeProblematicPrognosis(fileID, 'PFNLost') if storageMetadata['Unavailable']: gLogger.info( "PFNNotRegistered replica (%d) found to be Unavailable. 
Updating retry count" % fileID) return self.incrementProblematicRetry(fileID) # HACK until we can obtain the space token descriptions through GFAL site = seName.split('_')[0].split('-')[0] if not storageMetadata['Cached']: if lfn.endswith('.raw'): seName = '%s-RAW' % site else: seName = '%s-RDST' % site elif storageMetadata['Migrated']: if lfn.startswith('/lhcb/data'): seName = '%s_M-DST' % site else: seName = '%s_MC_M-DST' % site else: if lfn.startswith('/lhcb/data'): seName = '%s-DST' % site else: seName = '%s_MC-DST' % site problematicDict['SE'] = seName res = returnSingleResult(se.getURL(lfn)) if not res['OK']: return self.__returnProblematicError(fileID, res) problematicDict['PFN'] = res['Value'] res = returnSingleResult(self.fc.addReplica({lfn: problematicDict})) if not res['OK']: return self.__returnProblematicError(fileID, res) res = returnSingleResult(self.fc.getFileMetadata(lfn)) if not res['OK']: return self.__returnProblematicError(fileID, res) if res['Value']['Size'] != storageMetadata['Size']: gLogger.info( "PFNNotRegistered replica (%d) found with catalog size mismatch. 
Updating prognosis" % fileID) return self.changeProblematicPrognosis(fileID, 'CatalogPFNSizeMismatch') return self.__updateCompletedFiles('PFNNotRegistered', fileID) def resolveLFNCatalogMissing(self, problematicDict): """ This takes the problematic dictionary returned by the integrity DB and resolved the LFNCatalogMissing prognosis """ lfn = problematicDict['LFN'] fileID = problematicDict['FileID'] res = returnSingleResult(self.fc.exists(lfn)) if not res['OK']: return self.__returnProblematicError(fileID, res) if res['Value']: return self.__updateCompletedFiles('LFNCatalogMissing', fileID) # Remove the file from all catalogs # RF_NOTE : here I can do it because it's a single file, but otherwise I would need to sort the path res = returnSingleResult(self.fc.removeFile(lfn)) if not res['OK']: return self.__returnProblematicError(fileID, res) return self.__updateCompletedFiles('LFNCatalogMissing', fileID) def resolvePFNMissing(self, problematicDict): """ This takes the problematic dictionary returned by the integrity DB and resolved the PFNMissing prognosis """ se = problematicDict['SE'] lfn = problematicDict['LFN'] fileID = problematicDict['FileID'] res = returnSingleResult(self.fc.exists(lfn)) if not res['OK']: return self.__returnProblematicError(fileID, res) if not res['Value']: gLogger.info("PFNMissing file (%d) no longer exists in catalog" % fileID) return self.__updateCompletedFiles('PFNMissing', fileID) res = returnSingleResult(StorageElement(se).exists(lfn)) if not res['OK']: return self.__returnProblematicError(fileID, res) if res['Value']: gLogger.info("PFNMissing replica (%d) is no longer missing" % fileID) return self.__updateReplicaToChecked(problematicDict) gLogger.info("PFNMissing replica (%d) does not exist" % fileID) res = returnSingleResult(self.fc.getReplicas(lfn, allStatus=True)) if not res['OK']: return self.__returnProblematicError(fileID, res) replicas = res['Value'] seSite = se.split('_')[0].split('-')[0] found = False print replicas for 
replicaSE in replicas.keys(): if re.search(seSite, replicaSE): found = True problematicDict['SE'] = replicaSE se = replicaSE if not found: gLogger.info( "PFNMissing replica (%d) is no longer registered at SE. Resolved." % fileID) return self.__updateCompletedFiles('PFNMissing', fileID) gLogger.info( "PFNMissing replica (%d) does not exist. Removing from catalog..." % fileID) res = returnSingleResult(self.fc.removeReplica({lfn: problematicDict})) if not res['OK']: return self.__returnProblematicError(fileID, res) if len(replicas) == 1: gLogger.info( "PFNMissing replica (%d) had a single replica. Updating prognosis" % fileID) return self.changeProblematicPrognosis(fileID, 'LFNZeroReplicas') res = self.dm.replicateAndRegister(problematicDict['LFN'], se) if not res['OK']: return self.__returnProblematicError(fileID, res) # If we get here the problem is solved so we can update the integrityDB return self.__updateCompletedFiles('PFNMissing', fileID) def resolvePFNUnavailable(self, problematicDict): """ This takes the problematic dictionary returned by the integrity DB and resolved the PFNUnavailable prognosis """ lfn = problematicDict['LFN'] se = problematicDict['SE'] fileID = problematicDict['FileID'] res = returnSingleResult(StorageElement(se).getFileMetadata(lfn)) if (not res['OK']) and (re.search('File does not exist', res['Message'])): # The file is no longer Unavailable but has now dissapeared completely gLogger.info( "PFNUnavailable replica (%d) found to be missing. Updating prognosis" % fileID) return self.changeProblematicPrognosis(fileID, 'PFNMissing') if (not res['OK']) or res['Value']['Unavailable']: gLogger.info( "PFNUnavailable replica (%d) found to still be Unavailable" % fileID) return self.incrementProblematicRetry(fileID) if res['Value']['Lost']: gLogger.info( "PFNUnavailable replica (%d) is now found to be Lost. 
Updating prognosis" % fileID) return self.changeProblematicPrognosis(fileID, 'PFNLost') gLogger.info("PFNUnavailable replica (%d) is no longer Unavailable" % fileID) # Need to make the replica okay in the Catalog return self.__updateReplicaToChecked(problematicDict) def resolvePFNZeroSize(self, problematicDict): """ This takes the problematic dictionary returned by the integrity DB and resolves the PFNZeroSize prognosis """ lfn = problematicDict['LFN'] seName = problematicDict['SE'] fileID = problematicDict['FileID'] se = StorageElement(seName) res = returnSingleResult(se.getFileSize(lfn)) if (not res['OK']) and (re.search('File does not exist', res['Message'])): gLogger.info( "PFNZeroSize replica (%d) found to be missing. Updating prognosis" % problematicDict['FileID']) return self.changeProblematicPrognosis(fileID, 'PFNMissing') storageSize = res['Value'] if storageSize == 0: res = returnSingleResult(se.removeFile(lfn)) if not res['OK']: return self.__returnProblematicError(fileID, res) gLogger.info( "PFNZeroSize replica (%d) removed. Updating prognosis" % problematicDict['FileID']) return self.changeProblematicPrognosis(fileID, 'PFNMissing') res = returnSingleResult(self.fc.getReplicas(lfn)) if not res['OK']: return self.__returnProblematicError(fileID, res) if seName not in res['Value']: gLogger.info( "PFNZeroSize replica (%d) not registered in catalog. Updating prognosis" % problematicDict['FileID']) return self.changeProblematicPrognosis(fileID, 'PFNNotRegistered') res = returnSingleResult(self.fc.getFileMetadata(lfn)) if not res['OK']: return self.__returnProblematicError(fileID, res) catalogSize = res['Value']['Size'] if catalogSize != storageSize: gLogger.info( "PFNZeroSize replica (%d) size found to differ from registered metadata. 
Updating prognosis" % problematicDict['FileID']) return self.changeProblematicPrognosis(fileID, 'CatalogPFNSizeMismatch') return self.__updateCompletedFiles('PFNZeroSize', fileID) ############################################################################################ def resolveLFNZeroReplicas(self, problematicDict): """ This takes the problematic dictionary returned by the integrity DB and resolves the LFNZeroReplicas prognosis """ lfn = problematicDict['LFN'] fileID = problematicDict['FileID'] res = returnSingleResult(self.fc.getReplicas(lfn, allStatus=True)) if res['OK'] and res['Value']: gLogger.info("LFNZeroReplicas file (%d) found to have replicas" % fileID) else: gLogger.info( "LFNZeroReplicas file (%d) does not have replicas. Checking storage..." % fileID) pfnsFound = False for storageElementName in sorted( gConfig.getValue( 'Resources/StorageElementGroups/Tier1_MC_M-DST', [])): res = self.__getStoragePathExists([lfn], storageElementName) if lfn in res['Value']: gLogger.info( "LFNZeroReplicas file (%d) found storage file at %s" % (fileID, storageElementName)) self.__reportProblematicReplicas( [(lfn, 'deprecatedUrl', storageElementName, 'PFNNotRegistered')], storageElementName, 'PFNNotRegistered') pfnsFound = True if not pfnsFound: gLogger.info( "LFNZeroReplicas file (%d) did not have storage files. Removing..." % fileID) res = returnSingleResult(self.fc.removeFile(lfn)) if not res['OK']: gLogger.error('DataIntegrityClient: failed to remove file', res['Message']) # Increment the number of retries for this file self.server.incrementProblematicRetry(fileID) return res gLogger.info("LFNZeroReplicas file (%d) removed from catalog" % fileID) # If we get here the problem is solved so we can update the integrityDB return self.__updateCompletedFiles('LFNZeroReplicas', fileID)
from DIRAC.RequestManagementSystem.private.RequestValidator import RequestValidator
from DIRAC.Resources.Catalog.FileCatalog import FileCatalog

# Build ReplicateAndRegister requests for the supplied LFNs in chunks of 100.
# NOTE(review): 'lfns' and 'targetSE' are defined earlier in this script,
# outside the visible fragment.
reqClient = ReqClient()
fc = FileCatalog()
for lfnList in breakListIntoChunks( lfns, 100 ):

  oRequest = Request()
  # Pseudo-random request name derived from the current time.
  oRequest.RequestName = "%s_%s" % ( md5( repr( time.time() ) ).hexdigest()[:16],
                                     md5( repr( time.time() ) ).hexdigest()[:16] )

  replicateAndRegister = Operation()
  replicateAndRegister.Type = 'ReplicateAndRegister'
  replicateAndRegister.TargetSE = targetSE

  res = fc.getFileMetadata( lfnList )
  if not res['OK']:
    print "Can't get file metadata: %s" % res['Message']
    DIRAC.exit( 1 )
  if res['Value']['Failed']:
    print "Could not get the file metadata of the following, so skipping them:"
    for fFile in res['Value']['Failed']:
      print fFile

  lfnMetadata = res['Value']['Successful']

  # Attach one File per LFN, carrying size and checksum from the catalog.
  for lfn in lfnMetadata:
    rarFile = File()
    rarFile.LFN = lfn
    rarFile.Size = lfnMetadata[lfn]['Size']
    rarFile.Checksum = lfnMetadata[lfn]['Checksum']
    # NOTE(review): fragment is truncated here; the operation/request
    # assembly continues outside the visible source.
# Build removal requests: RemoveReplica for a specific SE, or RemoveFile
# when the special targetSE value 'All' is given.
requestOperation = 'RemoveReplica'
if targetSE == 'All':
    requestOperation = 'RemoveFile'

for lfnList in breakListIntoChunks(lfns, 100):

    oRequest = Request()
    # Pseudo-random request name derived from the current time.
    requestName = "%s_%s" % (md5(repr(time.time())).hexdigest()[:16],
                             md5(repr(time.time())).hexdigest()[:16])
    oRequest.RequestName = requestName

    oOperation = Operation()
    oOperation.Type = requestOperation
    oOperation.TargetSE = targetSE

    res = fc.getFileMetadata(lfnList)
    if not res['OK']:
        print("Can't get file metadata: %s" % res['Message'])
        DIRAC.exit(1)
    if res['Value']['Failed']:
        print(
            "Could not get the file metadata of the following, so skipping them:"
        )
        for fFile in res['Value']['Failed']:
            print(fFile)

    lfnMetadata = res['Value']['Successful']

    for lfn in lfnMetadata:
        rarFile = File()
        rarFile.LFN = lfn
        # NOTE(review): fragment is truncated here; the file/operation
        # assembly continues outside the visible source.
from DIRAC.RequestManagementSystem.Client.Operation import Operation
from DIRAC.RequestManagementSystem.Client.File import File
from DIRAC.RequestManagementSystem.Client.ReqClient import ReqClient
from DIRAC.Resources.Catalog.FileCatalog import FileCatalog
from DIRAC.Core.Utilities.List import breakListIntoChunks

# Split the LFN list into chunks of 100; each chunk becomes one request.
lfnChunks = breakListIntoChunks(lfnList, 100)
multiRequests = len(lfnChunks) > 1

error = 0
count = 0
reqClient = ReqClient()
fc = FileCatalog()
requestIDs = []
for lfnChunk in lfnChunks:
    # Catalog metadata is needed to fill the File objects of the request.
    metaDatas = fc.getFileMetadata(lfnChunk)
    if not metaDatas["OK"]:
        gLogger.error("unable to read metadata for lfns: %s" %
                      metaDatas["Message"])
        error = -1
        continue
    metaDatas = metaDatas["Value"]
    for failedLFN, reason in metaDatas["Failed"].items():
        gLogger.error("skipping %s: %s" % (failedLFN, reason))
    # Keep only LFNs whose metadata could be read.
    lfnChunk = set(metaDatas["Successful"])
    if not lfnChunk:
        gLogger.error("LFN list is empty!!!")
        error = -1
        continue
        # NOTE(review): fragment is truncated here; request assembly for the
        # chunk continues outside the visible source.
class DataIntegrityClient( Client ):
  """ Client for the DataManagement/DataIntegrity service.

  The following methods are supported in the service but are not mentioned explicitly here:

          getProblematic()
             Obtains a problematic file from the IntegrityDB based on the LastUpdate time

          getPrognosisProblematics(prognosis)
            Obtains all the problematics of a particular prognosis from the integrityDB

          getProblematicsSummary()
            Obtains a count of the number of problematics for each prognosis found

          getDistinctPrognosis()
            Obtains the distinct prognosis found in the integrityDB

          getTransformationProblematics(prodID)
            Obtains the problematics for a given production

          incrementProblematicRetry(fileID)
            Increments the retry count for the supplied file ID

          changeProblematicPrognosis(fileID,newPrognosis)
            Changes the prognosis of the supplied file to the new prognosis

          setProblematicStatus(fileID,status)
            Updates the status of a problematic in the integrityDB

          removeProblematic(self,fileID)
            This removes the specified file ID from the integrity DB

          insertProblematic(sourceComponent,fileMetadata)
            Inserts file with supplied metadata into the integrity DB
  """

  def __init__( self, **kwargs ):
    """ Constructor: points the client at the DataIntegrity service and
        creates the DataManager and FileCatalog helpers. """
    Client.__init__( self, **kwargs )
    self.setServer( 'DataManagement/DataIntegrity' )
    self.dm = DataManager()
    self.fc = FileCatalog()

  ##########################################################################
  #
  # This section contains the specific methods for LFC->SE checks
  #

  def catalogDirectoryToSE( self, lfnDir ):
    """ This obtains the replica and metadata information from the catalog
        for the supplied directory and checks against the storage elements.

        :param lfnDir: a directory path or list of directory paths
    """
    gLogger.info( "-" * 40 )
    gLogger.info( "Performing the LFC->SE check" )
    gLogger.info( "-" * 40 )
    if type( lfnDir ) in types.StringTypes:
      lfnDir = [lfnDir]
    res = self.__getCatalogDirectoryContents( lfnDir )
    if not res['OK']:
      return res
    replicas = res['Value']['Replicas']
    catalogMetadata = res['Value']['Metadata']
    res = self.__checkPhysicalFiles( replicas, catalogMetadata )
    if not res['OK']:
      return res
    resDict = {'CatalogMetadata':catalogMetadata, 'CatalogReplicas':replicas}
    return S_OK( resDict )

  def catalogFileToSE( self, lfns ):
    """ This obtains the replica and metadata information from the catalog
        and checks against the storage elements.

        :param lfns: an LFN or list of LFNs
    """
    gLogger.info( "-" * 40 )
    gLogger.info( "Performing the LFC->SE check" )
    gLogger.info( "-" * 40 )
    if type( lfns ) in types.StringTypes:
      lfns = [lfns]
    res = self.__getCatalogMetadata( lfns )
    if not res['OK']:
      return res
    catalogMetadata = res['Value']
    res = self.__getCatalogReplicas( catalogMetadata.keys() )
    if not res['OK']:
      return res
    replicas = res['Value']
    res = self.__checkPhysicalFiles( replicas, catalogMetadata )
    if not res['OK']:
      return res
    resDict = {'CatalogMetadata':catalogMetadata, 'CatalogReplicas':replicas}
    return S_OK( resDict )

  def checkPhysicalFiles( self, replicas, catalogMetadata, ses = [] ):
    """ This obtains takes the supplied replica and metadata information
        obtained from the catalog and checks against the storage elements.

        :param dict replicas: {lfn: {se: pfn}}
        :param dict catalogMetadata: {lfn: metadata dict}
        :param list ses: optional restriction to these storage elements
    """
    gLogger.info( "-" * 40 )
    gLogger.info( "Performing the LFC->SE check" )
    gLogger.info( "-" * 40 )
    return self.__checkPhysicalFiles( replicas, catalogMetadata, ses = ses )

  def __checkPhysicalFiles( self, replicas, catalogMetadata, ses = [] ):
    """ This obtains the physical file metadata and checks the metadata
        against the catalog entries
    """
    # Group the PFNs by storage element, honouring the optional SE filter.
    sePfns = {}
    pfnLfns = {}
    for lfn, replicaDict in replicas.items():
      for se, pfn in replicaDict.items():
        if ( ses ) and ( se not in ses ):
          continue
        if not sePfns.has_key( se ):
          sePfns[se] = []
        sePfns[se].append( pfn )
        pfnLfns[pfn] = lfn
    gLogger.info( '%s %s' % ( 'Storage Element'.ljust( 20 ), 'Replicas'.rjust( 20 ) ) )
    for site in sortList( sePfns.keys() ):
      files = len( sePfns[site] )
      gLogger.info( '%s %s' % ( site.ljust( 20 ), str( files ).rjust( 20 ) ) )

    for se in sortList( sePfns.keys() ):
      pfns = sePfns[se]
      pfnDict = {}
      for pfn in pfns:
        pfnDict[pfn] = pfnLfns[pfn]
      sizeMismatch = []
      res = self.__checkPhysicalFileMetadata( pfnDict, se )
      if not res['OK']:
        gLogger.error( 'Failed to get physical file metadata.', res['Message'] )
        return res
      for pfn, metadata in res['Value'].items():
        if catalogMetadata.has_key( pfnLfns[pfn] ):
          # Zero-size storage files are reported separately, not as mismatch.
          if ( metadata['Size'] != catalogMetadata[pfnLfns[pfn]]['Size'] ) and ( metadata['Size'] != 0 ):
            sizeMismatch.append( ( pfnLfns[pfn], pfn, se, 'CatalogPFNSizeMismatch' ) )
      if sizeMismatch:
        self.__reportProblematicReplicas( sizeMismatch, se, 'CatalogPFNSizeMismatch' )
    return S_OK()

  def __checkPhysicalFileMetadata( self, pfnLfns, se ):
    """ Check obtain the physical file metadata and check the files are available
    """
    gLogger.info( 'Checking the integrity of %s physical files at %s' % ( len( pfnLfns ), se ) )
    res = StorageElement( se ).getFileMetadata( pfnLfns.keys() )
    if not res['OK']:
      gLogger.error( 'Failed to get metadata for pfns.', res['Message'] )
      return res
    pfnMetadataDict = res['Value']['Successful']
    # If the replicas are completely missing
    missingReplicas = []
    for pfn, reason in res['Value']['Failed'].items():
      if re.search( 'File does not exist', reason ):
        missingReplicas.append( ( pfnLfns[pfn], pfn, se, 'PFNMissing' ) )
    if missingReplicas:
      self.__reportProblematicReplicas( missingReplicas, se, 'PFNMissing' )
    lostReplicas = []
    unavailableReplicas = []
    zeroSizeReplicas = []
    # If the files are not accessible
    for pfn, pfnMetadata in pfnMetadataDict.items():
      if pfnMetadata['Lost']:
        lostReplicas.append( ( pfnLfns[pfn], pfn, se, 'PFNLost' ) )
      if pfnMetadata['Unavailable']:
        unavailableReplicas.append( ( pfnLfns[pfn], pfn, se, 'PFNUnavailable' ) )
      if pfnMetadata['Size'] == 0:
        zeroSizeReplicas.append( ( pfnLfns[pfn], pfn, se, 'PFNZeroSize' ) )
    if lostReplicas:
      self.__reportProblematicReplicas( lostReplicas, se, 'PFNLost' )
    if unavailableReplicas:
      self.__reportProblematicReplicas( unavailableReplicas, se, 'PFNUnavailable' )
    if zeroSizeReplicas:
      self.__reportProblematicReplicas( zeroSizeReplicas, se, 'PFNZeroSize' )
    gLogger.info( 'Checking the integrity of physical files at %s complete' % se )
    return S_OK( pfnMetadataDict )

  ##########################################################################
  #
  # This section contains the specific methods for SE->LFC checks
  #

  def storageDirectoryToCatalog( self, lfnDir, storageElement ):
    """ This obtains the file found on the storage element in the supplied
        directories and determines whether they exist in the catalog and
        checks their metadata elements
    """
    gLogger.info( "-" * 40 )
    gLogger.info( "Performing the SE->LFC check at %s" % storageElement )
    gLogger.info( "-" * 40 )
    if type( lfnDir ) in types.StringTypes:
      lfnDir = [lfnDir]
    res = self.__getStorageDirectoryContents( lfnDir, storageElement )
    if not res['OK']:
      return res
    storageFileMetadata = res['Value']
    if storageFileMetadata:
      return self.__checkCatalogForSEFiles( storageFileMetadata, storageElement )
    return S_OK( {'CatalogMetadata':{}, 'StorageMetadata':{}} )

  def __checkCatalogForSEFiles( self, storageMetadata, storageElement ):
    """ Verify that files found on storage are registered in the catalog and
        that the registered sizes agree with the storage sizes. """
    gLogger.info( 'Checking %s storage files exist in the catalog' % len( storageMetadata ) )

    # RF_NOTE : this comment is completely wrong
    # First get all the PFNs as they should be registered in the catalog
    res = StorageElement( storageElement ).getPfnForProtocol( storageMetadata.keys(), withPort = False )
    if not res['OK']:
      gLogger.error( "Failed to get registered PFNs for physical files", res['Message'] )
      return res
    for pfn, error in res['Value']['Failed'].items():
      gLogger.error( 'Failed to obtain registered PFN for physical file', '%s %s' % ( pfn, error ) )
    if res['Value']['Failed']:
      return S_ERROR( 'Failed to obtain registered PFNs from physical file' )
    # Re-key the metadata dict by the registered form of the PFN.
    for original, registered in res['Value']['Successful'].items():
      storageMetadata[registered] = storageMetadata.pop( original )

    # Determine whether these PFNs are registered and if so obtain the LFN
    res = self.fc.getLFNForPFN( storageMetadata.keys() )
    if not res['OK']:
      gLogger.error( "Failed to get registered LFNs for PFNs", res['Message'] )
      return res
    failedPfns = res['Value']['Failed']
    notRegisteredPfns = []
    for pfn, error in failedPfns.items():
      if re.search( 'No such file or directory', error ):
        notRegisteredPfns.append( ( storageMetadata[pfn]['LFN'], pfn, storageElement, 'PFNNotRegistered' ) )
        failedPfns.pop( pfn )
    if notRegisteredPfns:
      self.__reportProblematicReplicas( notRegisteredPfns, storageElement, 'PFNNotRegistered' )
    if failedPfns:
      return S_ERROR( 'Failed to obtain LFNs for PFNs' )

    pfnLfns = res['Value']['Successful']
    # Re-key the metadata dict by LFN, remembering the PFN for each entry.
    for pfn in storageMetadata.keys():
      pfnMetadata = storageMetadata.pop( pfn )
      if pfn in pfnLfns.keys():
        lfn = pfnLfns[pfn]
        storageMetadata[lfn] = pfnMetadata
        storageMetadata[lfn]['PFN'] = pfn

    # For the LFNs found to be registered obtain the file metadata from the catalog and verify against the storage metadata
    res = self.__getCatalogMetadata( storageMetadata.keys() )
    if not res['OK']:
      return res
    catalogMetadata = res['Value']
    sizeMismatch = []
    for lfn, lfnCatalogMetadata in catalogMetadata.items():
      lfnStorageMetadata = storageMetadata[lfn]
      if ( lfnStorageMetadata['Size'] != lfnCatalogMetadata['Size'] ) and ( lfnStorageMetadata['Size'] != 0 ):
        sizeMismatch.append( ( lfn, storageMetadata[lfn]['PFN'], storageElement, 'CatalogPFNSizeMismatch' ) )
    if sizeMismatch:
      self.__reportProblematicReplicas( sizeMismatch, storageElement, 'CatalogPFNSizeMismatch' )
    gLogger.info( 'Checking storage files exist in the catalog complete' )
    resDict = {'CatalogMetadata':catalogMetadata, 'StorageMetadata':storageMetadata}
    return S_OK( resDict )

  def getStorageDirectoryContents( self, lfnDir, storageElement ):
    """ This obtains takes the supplied lfn directories and recursively
        obtains the files in the supplied storage element """
    return self.__getStorageDirectoryContents( lfnDir, storageElement )

  def __getStorageDirectoryContents( self, lfnDir, storageElement ):
    """ Obtains the contents of the supplied directory on the storage """
    gLogger.info( 'Obtaining the contents for %s directories at %s' % ( len( lfnDir ), storageElement ) )

    se = StorageElement( storageElement )

    res = se.getPfnForLfn( lfnDir )
    if not res['OK']:
      gLogger.error( "Failed to get PFNs for directories", res['Message'] )
      return res
    for directory, error in res['Value']['Failed'].items():
      gLogger.error( 'Failed to obtain directory PFN from LFNs', '%s %s' % ( directory, error ) )
    if res['Value']['Failed']:
      return S_ERROR( 'Failed to obtain directory PFN from LFNs' )
    storageDirectories = res['Value']['Successful'].values()

    res = se.exists( storageDirectories )
    if not res['OK']:
      gLogger.error( "Failed to obtain existance of directories", res['Message'] )
      return res
    for directory, error in res['Value']['Failed'].items():
      gLogger.error( 'Failed to determine existance of directory', '%s %s' % ( directory, error ) )
    if res['Value']['Failed']:
      return S_ERROR( 'Failed to determine existance of directory' )
    directoryExists = res['Value']['Successful']
    activeDirs = []
    for directory in sortList( directoryExists.keys() ):
      exists = directoryExists[directory]
      if exists:
        activeDirs.append( directory )

    # Breadth-first walk over the storage directory tree.
    allFiles = {}
    while len( activeDirs ) > 0:
      currentDir = activeDirs[0]
      res = se.listDirectory( currentDir )
      activeDirs.remove( currentDir )
      if not res['OK']:
        gLogger.error( 'Failed to get directory contents', res['Message'] )
        return res
      elif res['Value']['Failed'].has_key( currentDir ):
        gLogger.error( 'Failed to get directory contents', '%s %s' % ( currentDir, res['Value']['Failed'][currentDir] ) )
        return S_ERROR( res['Value']['Failed'][currentDir] )
      else:
        dirContents = res['Value']['Successful'][currentDir]
        activeDirs.extend( dirContents['SubDirs'] )
        fileMetadata = dirContents['Files']

        # RF_NOTE This ugly trick is needed because se.getPfnPath does not follow the Successful/Failed convention
        # res = { "Successful" : {}, "Failed" : {} }
        # for pfn in fileMetadata:
        #   inRes = se.getPfnPath( pfn )
        #   if inRes["OK"]:
        #     res["Successful"][pfn] = inRes["Value"]
        #   else:
        #     res["Failed"][pfn] = inRes["Message"]
        res = se.getLfnForPfn( fileMetadata.keys() )
        if not res['OK']:
          gLogger.error( 'Failed to get directory content LFNs', res['Message'] )
          return res

        for pfn, error in res['Value']['Failed'].items():
          gLogger.error( "Failed to get LFN for PFN", "%s %s" % ( pfn, error ) )
        if res['Value']['Failed']:
          return S_ERROR( "Failed to get LFNs for PFNs" )
        pfnLfns = res['Value']['Successful']
        for pfn, lfn in pfnLfns.items():
          fileMetadata[pfn]['LFN'] = lfn
        allFiles.update( fileMetadata )

    zeroSizeFiles = []
    lostFiles = []
    unavailableFiles = []
    for pfn in sortList( allFiles.keys() ):
      # Skip the internal 'dirac_directory' placeholder entries.
      if os.path.basename( pfn ) == 'dirac_directory':
        allFiles.pop( pfn )
      else:
        metadata = allFiles[pfn]
        if metadata['Size'] == 0:
          zeroSizeFiles.append( ( metadata['LFN'], pfn, storageElement, 'PFNZeroSize' ) )
        # if metadata['Lost']:
        #   lostFiles.append((metadata['LFN'],pfn,storageElement,'PFNLost'))
        # if metadata['Unavailable']:
        #   unavailableFiles.append((metadata['LFN'],pfn,storageElement,'PFNUnavailable'))
    if zeroSizeFiles:
      self.__reportProblematicReplicas( zeroSizeFiles, storageElement, 'PFNZeroSize' )
    if lostFiles:
      self.__reportProblematicReplicas( lostFiles, storageElement, 'PFNLost' )
    if unavailableFiles:
      self.__reportProblematicReplicas( unavailableFiles, storageElement, 'PFNUnavailable' )
    gLogger.info( 'Obtained at total of %s files for directories at %s' % ( len( allFiles ), storageElement ) )
    return S_OK( allFiles )

  def __getStoragePathExists( self, lfnPaths, storageElement ):
    """ Determine which of the supplied LFN paths exist on the storage;
        returns S_OK({lfn: pfn}) for the existing ones. """
    gLogger.info( 'Determining the existance of %d files at %s' % ( len( lfnPaths ), storageElement ) )

    se = StorageElement( storageElement )

    res = se.getPfnForLfn( lfnPaths )
    if not res['OK']:
      gLogger.error( "Failed to get PFNs for LFNs", res['Message'] )
      return res
    for lfnPath, error in res['Value']['Failed'].items():
      gLogger.error( 'Failed to obtain PFN from LFN', '%s %s' % ( lfnPath, error ) )
    if res['Value']['Failed']:
      return S_ERROR( 'Failed to obtain PFNs from LFNs' )
    lfnPfns = res['Value']['Successful']
    pfnLfns = {}
    for lfn, pfn in lfnPfns.items():
      pfnLfns[pfn] = lfn

    res = se.exists( pfnLfns )
    if not res['OK']:
      gLogger.error( "Failed to obtain existance of paths", res['Message'] )
      return res
    for lfnPath, error in res['Value']['Failed'].items():
      gLogger.error( 'Failed to determine existance of path', '%s %s' % ( lfnPath, error ) )
    if res['Value']['Failed']:
      return S_ERROR( 'Failed to determine existance of paths' )
    pathExists = res['Value']['Successful']
    resDict = {}
    for pfn, exists in pathExists.items():
      if exists:
        resDict[pfnLfns[pfn]] = pfn
    return S_OK( resDict )

  ##########################################################################
  #
  # This section contains the specific methods for obtaining replica and metadata information from the catalog
  #

  def __getCatalogDirectoryContents( self, lfnDir ):
    """ Obtain the contents of the supplied directory """
    gLogger.info( 'Obtaining the catalog contents for %s directories' % len( lfnDir ) )

    # Breadth-first walk over the catalog directory tree.
    activeDirs = lfnDir
    allFiles = {}
    while len( activeDirs ) > 0:
      currentDir = activeDirs[0]
      res = self.fc.listDirectory( currentDir )
      activeDirs.remove( currentDir )
      if not res['OK']:
        gLogger.error( 'Failed to get directory contents', res['Message'] )
        return res
      elif res['Value']['Failed'].has_key( currentDir ):
        gLogger.error( 'Failed to get directory contents', '%s %s' % ( currentDir, res['Value']['Failed'][currentDir] ) )
      else:
        dirContents = res['Value']['Successful'][currentDir]
        activeDirs.extend( dirContents['SubDirs'] )
        allFiles.update( dirContents['Files'] )

    zeroReplicaFiles = []
    zeroSizeFiles = []
    allReplicaDict = {}
    allMetadataDict = {}
    for lfn, lfnDict in allFiles.items():
      lfnReplicas = {}
      for se, replicaDict in lfnDict['Replicas'].items():
        lfnReplicas[se] = replicaDict['PFN']
      if not lfnReplicas:
        zeroReplicaFiles.append( lfn )
      allReplicaDict[lfn] = lfnReplicas
      allMetadataDict[lfn] = lfnDict['MetaData']
      if lfnDict['MetaData']['Size'] == 0:
        zeroSizeFiles.append( lfn )
    if zeroReplicaFiles:
      self.__reportProblematicFiles( zeroReplicaFiles, 'LFNZeroReplicas' )
    if zeroSizeFiles:
      self.__reportProblematicFiles( zeroSizeFiles, 'LFNZeroSize' )
    gLogger.info( 'Obtained at total of %s files for the supplied directories' % len( allMetadataDict ) )
    resDict = {'Metadata':allMetadataDict, 'Replicas':allReplicaDict}
    return S_OK( resDict )

  def __getCatalogReplicas( self, lfns ):
    """ Obtain the file replicas from the catalog while checking that there are replicas """
    gLogger.info( 'Obtaining the replicas for %s files' % len( lfns ) )

    zeroReplicaFiles = []
    res = self.fc.getReplicas( lfns, allStatus = True )
    if not res['OK']:
      gLogger.error( 'Failed to get catalog replicas', res['Message'] )
      return res
    allReplicas = res['Value']['Successful']
    for lfn, error in res['Value']['Failed'].items():
      if re.search( 'File has zero replicas', error ):
        zeroReplicaFiles.append( lfn )
    if zeroReplicaFiles:
      self.__reportProblematicFiles( zeroReplicaFiles, 'LFNZeroReplicas' )
    gLogger.info( 'Obtaining the replicas for files complete' )
    return S_OK( allReplicas )
def __getCatalogMetadata( self, lfns ):
  """ Obtain the file metadata from the catalog while checking they exist

      Files missing from the catalog are reported with the 'LFNCatalogMissing'
      prognosis and zero-size files with 'LFNZeroSize'.

      :param lfns: LFNs to look up
      :return: S_OK( { lfn : metadata } ) for the files found, or S_ERROR
  """
  if not lfns:
    return S_OK( {} )
  gLogger.info( 'Obtaining the catalog metadata for %s files' % len( lfns ) )
  missingCatalogFiles = []
  zeroSizeFiles = []
  res = self.fc.getFileMetadata( lfns )
  if not res['OK']:
    gLogger.error( 'Failed to get catalog metadata', res['Message'] )
    return res
  allMetadata = res['Value']['Successful']
  for lfn, error in res['Value']['Failed'].items():
    # Missing files are reported per-LFN in the Failed dictionary
    if re.search( 'No such file or directory', error ):
      missingCatalogFiles.append( lfn )
  if missingCatalogFiles:
    self.__reportProblematicFiles( missingCatalogFiles, 'LFNCatalogMissing' )
  for lfn, metadata in allMetadata.items():
    if metadata['Size'] == 0:
      zeroSizeFiles.append( lfn )
  if zeroSizeFiles:
    self.__reportProblematicFiles( zeroSizeFiles, 'LFNZeroSize' )
  gLogger.info( 'Obtaining the catalog metadata complete' )
  return S_OK( allMetadata )

##########################################################################
#
# This section contains the methods for inserting problematic files into the integrity DB
#

def __reportProblematicFiles( self, lfns, reason ):
  """ Simple wrapper function around setFileProblematic

      :param lfns: LFNs found to be problematic
      :param reason: prognosis recorded in the integrity DB
  """
  gLogger.info( 'The following %s files were found with %s' % ( len( lfns ), reason ) )
  # sorted() instead of the project sortList() helper, consistent with the
  # modernized client in this module
  for lfn in sorted( lfns ):
    gLogger.info( lfn )
  res = self.setFileProblematic( lfns, reason, sourceComponent = 'DataIntegrityClient' )
  if not res['OK']:
    gLogger.info( 'Failed to update integrity DB with files', res['Message'] )
  else:
    gLogger.info( 'Successfully updated integrity DB with files' )

def setFileProblematic( self, lfn, reason, sourceComponent = '' ):
  """ This method updates the status of the file in the FileCatalog and the IntegrityDB

      :param lfn: the lfn of the file (a single LFN or a list of LFNs)
      :param reason: this is given to the integrity DB and should reflect the problem observed with the file
      :param sourceComponent: the component issuing the request
      :return: result of insertProblematic(), or S_ERROR on bad input
  """
  # isinstance() replaces the Python-2-only types.* comparisons; basestring
  # also accepts unicode LFNs (consistent with the modernized client)
  if isinstance( lfn, list ):
    lfns = lfn
  elif isinstance( lfn, basestring ):
    lfns = [lfn]
  else:
    errStr = "DataIntegrityClient.setFileProblematic: Supplied file info must be list or a single LFN."
    gLogger.error( errStr )
    return S_ERROR( errStr )
  gLogger.info( "DataIntegrityClient.setFileProblematic: Attempting to update %s files." % len( lfns ) )
  fileMetadata = {}
  for lfn in lfns:
    fileMetadata[lfn] = {'Prognosis':reason, 'LFN':lfn, 'PFN':'', 'SE':''}
  res = self.insertProblematic( sourceComponent, fileMetadata )
  if not res['OK']:
    gLogger.error( "DataIntegrityClient.setReplicaProblematic: Failed to insert problematics to integrity DB" )
  return res

def __reportProblematicReplicas( self, replicaTuple, se, reason ):
  """ Simple wrapper function around setReplicaProblematic

      :param replicaTuple: list of ( lfn, pfn, se, reason ) tuples
      :param se: storage element name (for logging only)
      :param reason: prognosis recorded in the integrity DB
  """
  gLogger.info( 'The following %s files had %s at %s' % ( len( replicaTuple ), reason, se ) )
  # NOTE: the loop deliberately rebinds se/reason per tuple for logging
  for lfn, pfn, se, reason in sorted( replicaTuple ):
    if lfn:
      gLogger.info( lfn )
    else:
      gLogger.info( pfn )
  res = self.setReplicaProblematic( replicaTuple, sourceComponent = 'DataIntegrityClient' )
  if not res['OK']:
    gLogger.info( 'Failed to update integrity DB with replicas', res['Message'] )
  else:
    gLogger.info( 'Successfully updated integrity DB with replicas' )

def setReplicaProblematic( self, replicaTuple, sourceComponent = '' ):
  """ This method updates the status of the replica in the FileCatalog and the IntegrityDB

      The supplied replica info is one tuple, or a list of tuples, of the form
      ( lfn, pfn, se, prognosis ) where:

      lfn - the lfn of the file
      pfn - the pfn if available (otherwise '')
      se - the storage element of the problematic replica (otherwise '')
      prognosis - this is given to the integrity DB and should reflect the problem observed with the file

      :param sourceComponent: the component issuing the request
      :return: S_OK( {'Successful':..., 'Failed':...} ) from the catalog update, or S_ERROR
  """
  if isinstance( replicaTuple, tuple ):
    replicaTuple = [replicaTuple]
  elif isinstance( replicaTuple, list ):
    pass
  else:
    errStr = "DataIntegrityClient.setReplicaProblematic: Supplied replica info must be a tuple or list of tuples."
    gLogger.error( errStr )
    return S_ERROR( errStr )
  gLogger.info( "DataIntegrityClient.setReplicaProblematic: Attempting to update %s replicas." % len( replicaTuple ) )
  replicaDict = {}
  for lfn, pfn, se, reason in replicaTuple:
    replicaDict[lfn] = {'Prognosis':reason, 'LFN':lfn, 'PFN':pfn, 'SE':se}
  res = self.insertProblematic( sourceComponent, replicaDict )
  if not res['OK']:
    gLogger.error( "DataIntegrityClient.setReplicaProblematic: Failed to insert problematic to integrity DB" )
    return res
  # Mark the same replicas 'Problematic' in the file catalog
  for lfn in replicaDict.keys():
    replicaDict[lfn]['Status'] = 'Problematic'
  res = self.fc.setReplicaStatus( replicaDict )
  if not res['OK']:
    errStr = "DataIntegrityClient.setReplicaProblematic: Completely failed to update replicas."
    gLogger.error( errStr, res['Message'] )
    return res
  failed = res['Value']['Failed']
  successful = res['Value']['Successful']
  resDict = {'Successful':successful, 'Failed':failed}
  return S_OK( resDict )

##########################################################################
#
# This section contains the resolution methods for various prognoses
#

def __updateCompletedFiles( self, prognosis, fileID ):
  """ Mark the problematic as 'Resolved' in the integrity DB """
  gLogger.info( "%s file (%d) is resolved" % ( prognosis, fileID ) )
  return self.setProblematicStatus( fileID, 'Resolved' )

def __returnProblematicError( self, fileID, res ):
  """ Increment the retry counter for the problematic and propagate the error """
  self.incrementProblematicRetry( fileID )
  gLogger.error( res['Message'] )
  return res

def __getRegisteredPFNLFN( self, pfn, storageElement ):
  """ Resolve the LFN registered in the catalog for a physical file.

      :return: S_OK( lfn ), S_OK( False ) when the PFN is not registered,
               or S_ERROR
  """
  res = StorageElement( storageElement ).getPfnForProtocol( pfn, withPort = False )
  if not res['OK']:
    gLogger.error( "Failed to get registered PFN for physical files", res['Message'] )
    return res
  for pfn, error in res['Value']['Failed'].items():
    gLogger.error( 'Failed to obtain registered PFN for physical file', '%s %s' % ( pfn, error ) )
    return S_ERROR( 'Failed to obtain registered PFNs from physical file' )
  registeredPFN = res['Value']['Successful'][pfn]
  res = Utils.executeSingleFileOrDirWrapper( self.fc.getLFNForPFN( registeredPFN ) )
  if not res['OK']:
    if re.search( 'No such file or directory', res['Message'] ):
      # The PFN is simply not registered in the catalog
      return S_OK( False )
    # Fix: previously fell through to res['Value'] on any other error,
    # raising KeyError on a failed result
    return res
  return S_OK( res['Value'] )

def __updateReplicaToChecked( self, problematicDict ):
  """ Set the replica status to 'Checked' in the catalog and resolve the problematic """
  lfn = problematicDict['LFN']
  fileID = problematicDict['FileID']
  prognosis = problematicDict['Prognosis']
  problematicDict['Status'] = 'Checked'
  res = Utils.executeSingleFileOrDirWrapper( self.fc.setReplicaStatus( {lfn:problematicDict} ) )
  if not res['OK']:
    return self.__returnProblematicError( fileID, res )
  gLogger.info( "%s replica (%d) is updated to Checked status" % ( prognosis, fileID ) )
  return self.__updateCompletedFiles( prognosis, fileID )

def resolveCatalogPFNSizeMismatch( self, problematicDict ):
  """ This takes the problematic dictionary returned by the integrity DB and resolved the CatalogPFNSizeMismatch prognosis """
  lfn = problematicDict['LFN']
  pfn = problematicDict['PFN']
  se = problematicDict['SE']
  fileID = problematicDict['FileID']
  # Gather the three sizes: file catalog, storage, bookkeeping catalog
  res = Utils.executeSingleFileOrDirWrapper( self.fc.getFileSize( lfn ) )
  if not res['OK']:
    return self.__returnProblematicError( fileID, res )
  catalogSize = res['Value']
  res = Utils.executeSingleFileOrDirWrapper( StorageElement( se ).getFileSize( pfn ) )
  if not res['OK']:
    return self.__returnProblematicError( fileID, res )
  storageSize = res['Value']
  bkKCatalog = FileCatalog( ['BookkeepingDB'] )
  res = Utils.executeSingleFileOrDirWrapper( bkKCatalog.getFileSize( lfn ) )
  if not res['OK']:
    return self.__returnProblematicError( fileID, res )
  bookkeepingSize = res['Value']
  if bookkeepingSize == catalogSize == storageSize:
    # All three agree: nothing wrong any more
    gLogger.info( "CatalogPFNSizeMismatch replica (%d) matched all registered sizes." % fileID )
    return self.__updateReplicaToChecked( problematicDict )
  if catalogSize == bookkeepingSize:
    # The storage copy disagrees with both catalogs: drop it if another replica exists
    gLogger.info( "CatalogPFNSizeMismatch replica (%d) found to mismatch the bookkeeping also" % fileID )
    res = Utils.executeSingleFileOrDirWrapper( self.fc.getReplicas( lfn ) )
    if not res['OK']:
      return self.__returnProblematicError( fileID, res )
    if len( res['Value'] ) <= 1:
      gLogger.info( "CatalogPFNSizeMismatch replica (%d) has no other replicas." % fileID )
      return S_ERROR( "Not removing catalog file mismatch since the only replica" )
    else:
      gLogger.info( "CatalogPFNSizeMismatch replica (%d) has other replicas. Removing..." % fileID )
      res = self.dm.removeReplica( se, lfn )
      if not res['OK']:
        return self.__returnProblematicError( fileID, res )
      return self.__updateCompletedFiles( 'CatalogPFNSizeMismatch', fileID )
  if ( catalogSize != bookkeepingSize ) and ( bookkeepingSize == storageSize ):
    # The file catalog is the odd one out: accept the replica, re-flag as BK mismatch
    gLogger.info( "CatalogPFNSizeMismatch replica (%d) found to match the bookkeeping size" % fileID )
    res = self.__updateReplicaToChecked( problematicDict )
    if not res['OK']:
      return self.__returnProblematicError( fileID, res )
    return self.changeProblematicPrognosis( fileID, 'BKCatalogSizeMismatch' )
  gLogger.info( "CatalogPFNSizeMismatch replica (%d) all sizes found mismatch. Updating retry count" % fileID )
  return self.incrementProblematicRetry( fileID )

def resolvePFNNotRegistered( self, problematicDict ):
  """ This takes the problematic dictionary returned by the integrity DB and resolved the PFNNotRegistered prognosis """
  lfn = problematicDict['LFN']
  pfn = problematicDict['PFN']
  seName = problematicDict['SE']
  fileID = problematicDict['FileID']
  se = StorageElement( seName )
  res = Utils.executeSingleFileOrDirWrapper( self.fc.exists( lfn ) )
  if not res['OK']:
    return self.__returnProblematicError( fileID, res )
  if not res['Value']:
    # The file does not exist in the catalog: remove the orphan storage copy
    res = Utils.executeSingleFileOrDirWrapper( se.removeFile( pfn ) )
    if not res['OK']:
      return self.__returnProblematicError( fileID, res )
    return self.__updateCompletedFiles( 'PFNNotRegistered', fileID )
  res = Utils.executeSingleFileOrDirWrapper( se.getFileMetadata( pfn ) )
  if ( not res['OK'] ) and ( re.search( 'File does not exist', res['Message'] ) ):
    gLogger.info( "PFNNotRegistered replica (%d) found to be missing." % fileID )
    return self.__updateCompletedFiles( 'PFNNotRegistered', fileID )
  elif not res['OK']:
    return self.__returnProblematicError( fileID, res )
  storageMetadata = res['Value']
  if storageMetadata['Lost']:
    gLogger.info( "PFNNotRegistered replica (%d) found to be Lost. Updating prognosis" % fileID )
    return self.changeProblematicPrognosis( fileID, 'PFNLost' )
  if storageMetadata['Unavailable']:
    gLogger.info( "PFNNotRegistered replica (%d) found to be Unavailable. Updating retry count" % fileID )
    return self.incrementProblematicRetry( fileID )
  # HACK until we can obtain the space token descriptions through GFAL
  site = seName.split( '_' )[0].split( '-' )[0]
  if not storageMetadata['Cached']:
    if lfn.endswith( '.raw' ):
      seName = '%s-RAW' % site
    else:
      seName = '%s-RDST' % site
  elif storageMetadata['Migrated']:
    if lfn.startswith( '/lhcb/data' ):
      seName = '%s_M-DST' % site
    else:
      seName = '%s_MC_M-DST' % site
  else:
    if lfn.startswith( '/lhcb/data' ):
      seName = '%s-DST' % site
    else:
      seName = '%s_MC-DST' % site
  problematicDict['SE'] = seName
  res = se.getPfnForProtocol( pfn, withPort = False )
  if not res['OK']:
    return self.__returnProblematicError( fileID, res )
  for pfn, error in res['Value']['Failed'].items():
    gLogger.error( 'Failed to obtain registered PFN for physical file', '%s %s' % ( pfn, error ) )
    return S_ERROR( 'Failed to obtain registered PFNs from physical file' )
  problematicDict['PFN'] = res['Value']['Successful'][pfn]
  res = Utils.executeSingleFileOrDirWrapper( self.fc.addReplica( {lfn:problematicDict} ) )
  if not res['OK']:
    return self.__returnProblematicError( fileID, res )
  res = Utils.executeSingleFileOrDirWrapper( self.fc.getFileMetadata( lfn ) )
  if not res['OK']:
    return self.__returnProblematicError( fileID, res )
  if res['Value']['Size'] != storageMetadata['Size']:
    gLogger.info( "PFNNotRegistered replica (%d) found with catalog size mismatch. Updating prognosis" % fileID )
    return self.changeProblematicPrognosis( fileID, 'CatalogPFNSizeMismatch' )
  return self.__updateCompletedFiles( 'PFNNotRegistered', fileID )

def resolveLFNCatalogMissing( self, problematicDict ):
  """ This takes the problematic dictionary returned by the integrity DB and resolved the LFNCatalogMissing prognosis """
  lfn = problematicDict['LFN']
  fileID = problematicDict['FileID']
  res = Utils.executeSingleFileOrDirWrapper( self.fc.exists( lfn ) )
  if not res['OK']:
    return self.__returnProblematicError( fileID, res )
  if res['Value']:
    return self.__updateCompletedFiles( 'LFNCatalogMissing', fileID )
  # Remove the file from all catalogs
  # RF_NOTE : here I can do it because it's a single file, but otherwise I would need to sort the path
  res = Utils.executeSingleFileOrDirWrapper( self.fc.removeFile( lfn ) )
  if not res['OK']:
    return self.__returnProblematicError( fileID, res )
  return self.__updateCompletedFiles( 'LFNCatalogMissing', fileID )

def resolvePFNMissing( self, problematicDict ):
  """ This takes the problematic dictionary returned by the integrity DB and resolved the PFNMissing prognosis """
  pfn = problematicDict['PFN']
  se = problematicDict['SE']
  lfn = problematicDict['LFN']
  fileID = problematicDict['FileID']
  res = Utils.executeSingleFileOrDirWrapper( self.fc.exists( lfn ) )
  if not res['OK']:
    return self.__returnProblematicError( fileID, res )
  if not res['Value']:
    gLogger.info( "PFNMissing file (%d) no longer exists in catalog" % fileID )
    return self.__updateCompletedFiles( 'PFNMissing', fileID )
  res = Utils.executeSingleFileOrDirWrapper( StorageElement( se ).exists( pfn ) )
  if not res['OK']:
    return self.__returnProblematicError( fileID, res )
  if res['Value']:
    gLogger.info( "PFNMissing replica (%d) is no longer missing" % fileID )
    return self.__updateReplicaToChecked( problematicDict )
  gLogger.info( "PFNMissing replica (%d) does not exist" % fileID )
  res = Utils.executeSingleFileOrDirWrapper( self.fc.getReplicas( lfn, allStatus = True ) )
  if not res['OK']:
    return self.__returnProblematicError( fileID, res )
  replicas = res['Value']
  # Look for a replica registered at the same site as the missing one
  seSite = se.split( '_' )[0].split( '-' )[0]
  found = False
  # (stray debug "print replicas" removed)
  for replicaSE in replicas.keys():
    if re.search( seSite, replicaSE ):
      found = True
      problematicDict['SE'] = replicaSE
      se = replicaSE
  if not found:
    gLogger.info( "PFNMissing replica (%d) is no longer registered at SE. Resolved." % fileID )
    return self.__updateCompletedFiles( 'PFNMissing', fileID )
  gLogger.info( "PFNMissing replica (%d) does not exist. Removing from catalog..." % fileID )
  res = Utils.executeSingleFileOrDirWrapper( self.fc.removeReplica( {lfn:problematicDict} ) )
  if not res['OK']:
    return self.__returnProblematicError( fileID, res )
  if len( replicas ) == 1:
    gLogger.info( "PFNMissing replica (%d) had a single replica. Updating prognosis" % fileID )
    return self.changeProblematicPrognosis( fileID, 'LFNZeroReplicas' )
  res = self.dm.replicateAndRegister( problematicDict['LFN'], se )
  if not res['OK']:
    return self.__returnProblematicError( fileID, res )
  # If we get here the problem is solved so we can update the integrityDB
  return self.__updateCompletedFiles( 'PFNMissing', fileID )

def resolvePFNUnavailable( self, problematicDict ):
  """ This takes the problematic dictionary returned by the integrity DB and resolved the PFNUnavailable prognosis """
  pfn = problematicDict['PFN']
  se = problematicDict['SE']
  fileID = problematicDict['FileID']
  res = Utils.executeSingleFileOrDirWrapper( StorageElement( se ).getFileMetadata( pfn ) )
  if ( not res['OK'] ) and ( re.search( 'File does not exist', res['Message'] ) ):
    # The file is no longer Unavailable but has now dissapeared completely
    gLogger.info( "PFNUnavailable replica (%d) found to be missing. Updating prognosis" % fileID )
    return self.changeProblematicPrognosis( fileID, 'PFNMissing' )
  if ( not res['OK'] ) or res['Value']['Unavailable']:
    gLogger.info( "PFNUnavailable replica (%d) found to still be Unavailable" % fileID )
    return self.incrementProblematicRetry( fileID )
  if res['Value']['Lost']:
    gLogger.info( "PFNUnavailable replica (%d) is now found to be Lost. Updating prognosis" % fileID )
    return self.changeProblematicPrognosis( fileID, 'PFNLost' )
  gLogger.info( "PFNUnavailable replica (%d) is no longer Unavailable" % fileID )
  # Need to make the replica okay in the Catalog
  return self.__updateReplicaToChecked( problematicDict )

def resolvePFNZeroSize( self, problematicDict ):
  """ This takes the problematic dictionary returned by the integrity DB and resolves the PFNZeroSize prognosis """
  pfn = problematicDict['PFN']
  seName = problematicDict['SE']
  fileID = problematicDict['FileID']
  se = StorageElement( seName )
  res = Utils.executeSingleFileOrDirWrapper( se.getFileSize( pfn ) )
  if ( not res['OK'] ) and ( re.search( 'File does not exist', res['Message'] ) ):
    gLogger.info( "PFNZeroSize replica (%d) found to be missing. Updating prognosis" % fileID )
    return self.changeProblematicPrognosis( fileID, 'PFNMissing' )
  elif not res['OK']:
    # Fix: previously indexed res['Value'] on a failed result (KeyError)
    return self.__returnProblematicError( fileID, res )
  storageSize = res['Value']
  if storageSize == 0:
    res = Utils.executeSingleFileOrDirWrapper( se.removeFile( pfn ) )
    if not res['OK']:
      return self.__returnProblematicError( fileID, res )
    gLogger.info( "PFNZeroSize replica (%d) removed. Updating prognosis" % fileID )
    return self.changeProblematicPrognosis( fileID, 'PFNMissing' )
  res = self.__getRegisteredPFNLFN( pfn, seName )
  if not res['OK']:
    return self.__returnProblematicError( fileID, res )
  lfn = res['Value']
  if not lfn:
    gLogger.info( "PFNZeroSize replica (%d) not registered in catalog. Updating prognosis" % fileID )
    return self.changeProblematicPrognosis( fileID, 'PFNNotRegistered' )
  res = Utils.executeSingleFileOrDirWrapper( self.fc.getFileMetadata( lfn ) )
  if not res['OK']:
    return self.__returnProblematicError( fileID, res )
  catalogSize = res['Value']['Size']
  if catalogSize != storageSize:
    gLogger.info( "PFNZeroSize replica (%d) size found to differ from registered metadata. Updating prognosis" % fileID )
    return self.changeProblematicPrognosis( fileID, 'CatalogPFNSizeMismatch' )
  return self.__updateCompletedFiles( 'PFNZeroSize', fileID )

############################################################################################

def resolveLFNZeroReplicas( self, problematicDict ):
  """ This takes the problematic dictionary returned by the integrity DB and resolves the LFNZeroReplicas prognosis """
  lfn = problematicDict['LFN']
  fileID = problematicDict['FileID']
  res = Utils.executeSingleFileOrDirWrapper( self.fc.getReplicas( lfn, allStatus = True ) )
  if res['OK'] and res['Value']:
    gLogger.info( "LFNZeroReplicas file (%d) found to have replicas" % fileID )
  else:
    gLogger.info( "LFNZeroReplicas file (%d) does not have replicas. Checking storage..." % fileID )
    pfnsFound = False
    # Probe the Tier1 storage elements for a dark (unregistered) copy
    for storageElementName in sorted( gConfig.getValue( 'Resources/StorageElementGroups/Tier1_MC_M-DST', [] ) ):
      res = self.__getStoragePathExists( [lfn], storageElementName )
      if lfn in res['Value']:
        gLogger.info( "LFNZeroReplicas file (%d) found storage file at %s" % ( fileID, storageElementName ) )
        pfn = res['Value'][lfn]
        self.__reportProblematicReplicas( [( lfn, pfn, storageElementName, 'PFNNotRegistered' )], storageElementName, 'PFNNotRegistered' )
        pfnsFound = True
    if not pfnsFound:
      gLogger.info( "LFNZeroReplicas file (%d) did not have storage files. Removing..." % fileID )
      res = Utils.executeSingleFileOrDirWrapper( self.fc.removeFile( lfn ) )
      if not res['OK']:
        gLogger.error( res['Message'] )
        # Increment the number of retries for this file
        # Fix: was self.server.incrementProblematicRetry - every other call
        # site in this class invokes the method on self directly
        self.incrementProblematicRetry( fileID )
        return res
      gLogger.info( "LFNZeroReplicas file (%d) removed from catalog" % fileID )
  # If we get here the problem is solved so we can update the integrityDB
  return self.__updateCompletedFiles( 'LFNZeroReplicas', fileID )
class DataIntegrityClient(Client): """ The following methods are supported in the service but are not mentioned explicitly here: getProblematic() Obtains a problematic file from the IntegrityDB based on the LastUpdate time getPrognosisProblematics(prognosis) Obtains all the problematics of a particular prognosis from the integrityDB getProblematicsSummary() Obtains a count of the number of problematics for each prognosis found getDistinctPrognosis() Obtains the distinct prognosis found in the integrityDB getTransformationProblematics(prodID) Obtains the problematics for a given production incrementProblematicRetry(fileID) Increments the retry count for the supplied file ID changeProblematicPrognosis(fileID,newPrognosis) Changes the prognosis of the supplied file to the new prognosis setProblematicStatus(fileID,status) Updates the status of a problematic in the integrityDB removeProblematic(self,fileID) This removes the specified file ID from the integrity DB insertProblematic(sourceComponent,fileMetadata) Inserts file with supplied metadata into the integrity DB """ def __init__(self, **kwargs): super(DataIntegrityClient, self).__init__(**kwargs) self.setServer('DataManagement/DataIntegrity') self.dm = DataManager() self.fc = FileCatalog() def setFileProblematic(self, lfn, reason, sourceComponent=''): """ This method updates the status of the file in the FileCatalog and the IntegrityDB lfn - the lfn of the file reason - this is given to the integrity DB and should reflect the problem observed with the file sourceComponent is the component issuing the request. """ if isinstance(lfn, list): lfns = lfn elif isinstance(lfn, basestring): lfns = [lfn] else: errStr = "DataIntegrityClient.setFileProblematic: Supplied file info must be list or a single LFN." gLogger.error(errStr) return S_ERROR(errStr) gLogger.info( "DataIntegrityClient.setFileProblematic: Attempting to update %s files." 
% len(lfns)) fileMetadata = {} for lfn in lfns: fileMetadata[lfn] = { 'Prognosis': reason, 'LFN': lfn, 'PFN': '', 'SE': '' } res = self.insertProblematic(sourceComponent, fileMetadata) if not res['OK']: gLogger.error( "DataIntegrityClient.setReplicaProblematic: Failed to insert problematics to integrity DB" ) return res def reportProblematicReplicas(self, replicaTuple, se, reason): """ Simple wrapper function around setReplicaProblematic """ gLogger.info('The following %s files had %s at %s' % (len(replicaTuple), reason, se)) for lfn, _pfn, se, reason in sorted(replicaTuple): if lfn: gLogger.info(lfn) res = self.setReplicaProblematic(replicaTuple, sourceComponent='DataIntegrityClient') if not res['OK']: gLogger.info('Failed to update integrity DB with replicas', res['Message']) else: gLogger.info('Successfully updated integrity DB with replicas') def setReplicaProblematic(self, replicaTuple, sourceComponent=''): """ This method updates the status of the replica in the FileCatalog and the IntegrityDB The supplied replicaDict should be of the form {lfn :{'PFN':pfn,'SE':se,'Prognosis':prognosis} lfn - the lfn of the file pfn - the pfn if available (otherwise '') se - the storage element of the problematic replica (otherwise '') prognosis - this is given to the integrity DB and should reflect the problem observed with the file sourceComponent is the component issuing the request. """ if isinstance(replicaTuple, tuple): replicaTuple = [replicaTuple] elif isinstance(replicaTuple, list): pass else: errStr = "DataIntegrityClient.setReplicaProblematic: Supplied replica info must be a tuple or list of tuples." gLogger.error(errStr) return S_ERROR(errStr) gLogger.info( "DataIntegrityClient.setReplicaProblematic: Attempting to update %s replicas." 
% len(replicaTuple)) replicaDict = {} for lfn, pfn, se, reason in replicaTuple: replicaDict[lfn] = { 'Prognosis': reason, 'LFN': lfn, 'PFN': pfn, 'SE': se } res = self.insertProblematic(sourceComponent, replicaDict) if not res['OK']: gLogger.error( "DataIntegrityClient.setReplicaProblematic: Failed to insert problematic to integrity DB" ) return res for lfn in replicaDict.keys(): replicaDict[lfn]['Status'] = 'Problematic' res = self.fc.setReplicaStatus(replicaDict) if not res['OK']: errStr = "DataIntegrityClient.setReplicaProblematic: Completely failed to update replicas." gLogger.error(errStr, res['Message']) return res failed = res['Value']['Failed'] successful = res['Value']['Successful'] resDict = {'Successful': successful, 'Failed': failed} return S_OK(resDict) ########################################################################## # # This section contains the resolution methods for various prognoses # def __updateCompletedFiles(self, prognosis, fileID): gLogger.info("%s file (%d) is resolved" % (prognosis, fileID)) return self.setProblematicStatus(fileID, 'Resolved') def __returnProblematicError(self, fileID, res): self.incrementProblematicRetry(fileID) gLogger.error('DataIntegrityClient failure', res['Message']) return res def __updateReplicaToChecked(self, problematicDict): lfn = problematicDict['LFN'] fileID = problematicDict['FileID'] prognosis = problematicDict['Prognosis'] problematicDict['Status'] = 'Checked' res = returnSingleResult( self.fc.setReplicaStatus({lfn: problematicDict})) if not res['OK']: return self.__returnProblematicError(fileID, res) gLogger.info("%s replica (%d) is updated to Checked status" % (prognosis, fileID)) return self.__updateCompletedFiles(prognosis, fileID) def resolveCatalogPFNSizeMismatch(self, problematicDict): """ This takes the problematic dictionary returned by the integrity DB and resolved the CatalogPFNSizeMismatch prognosis """ lfn = problematicDict['LFN'] se = problematicDict['SE'] fileID = 
problematicDict['FileID'] res = returnSingleResult(self.fc.getFileSize(lfn)) if not res['OK']: return self.__returnProblematicError(fileID, res) catalogSize = res['Value'] res = returnSingleResult(StorageElement(se).getFileSize(lfn)) if not res['OK']: return self.__returnProblematicError(fileID, res) storageSize = res['Value'] bkKCatalog = FileCatalog(['BookkeepingDB']) res = returnSingleResult(bkKCatalog.getFileSize(lfn)) if not res['OK']: return self.__returnProblematicError(fileID, res) bookkeepingSize = res['Value'] if bookkeepingSize == catalogSize == storageSize: gLogger.info( "CatalogPFNSizeMismatch replica (%d) matched all registered sizes." % fileID) return self.__updateReplicaToChecked(problematicDict) if catalogSize == bookkeepingSize: gLogger.info( "CatalogPFNSizeMismatch replica (%d) found to mismatch the bookkeeping also" % fileID) res = returnSingleResult(self.fc.getReplicas(lfn)) if not res['OK']: return self.__returnProblematicError(fileID, res) if len(res['Value']) <= 1: gLogger.info( "CatalogPFNSizeMismatch replica (%d) has no other replicas." % fileID) return S_ERROR( "Not removing catalog file mismatch since the only replica" ) else: gLogger.info( "CatalogPFNSizeMismatch replica (%d) has other replicas. Removing..." % fileID) res = self.dm.removeReplica(se, lfn) if not res['OK']: return self.__returnProblematicError(fileID, res) return self.__updateCompletedFiles('CatalogPFNSizeMismatch', fileID) if (catalogSize != bookkeepingSize) and (bookkeepingSize == storageSize): gLogger.info( "CatalogPFNSizeMismatch replica (%d) found to match the bookkeeping size" % fileID) res = self.__updateReplicaToChecked(problematicDict) if not res['OK']: return self.__returnProblematicError(fileID, res) return self.changeProblematicPrognosis(fileID, 'BKCatalogSizeMismatch') gLogger.info( "CatalogPFNSizeMismatch replica (%d) all sizes found mismatch. Updating retry count" % fileID) return self.incrementProblematicRetry(fileID) #FIXME: Unused? 
def resolvePFNNotRegistered(self, problematicDict): """ This takes the problematic dictionary returned by the integrity DB and resolved the PFNNotRegistered prognosis """ lfn = problematicDict['LFN'] seName = problematicDict['SE'] fileID = problematicDict['FileID'] se = StorageElement(seName) res = returnSingleResult(self.fc.exists(lfn)) if not res['OK']: return self.__returnProblematicError(fileID, res) if not res['Value']: # The file does not exist in the catalog res = returnSingleResult(se.removeFile(lfn)) if not res['OK']: return self.__returnProblematicError(fileID, res) return self.__updateCompletedFiles('PFNNotRegistered', fileID) res = returnSingleResult(se.getFileMetadata(lfn)) if (not res['OK']) and (re.search('File does not exist', res['Message'])): gLogger.info("PFNNotRegistered replica (%d) found to be missing." % fileID) return self.__updateCompletedFiles('PFNNotRegistered', fileID) elif not res['OK']: return self.__returnProblematicError(fileID, res) storageMetadata = res['Value'] if storageMetadata['Lost']: gLogger.info( "PFNNotRegistered replica (%d) found to be Lost. Updating prognosis" % fileID) return self.changeProblematicPrognosis(fileID, 'PFNLost') if storageMetadata['Unavailable']: gLogger.info( "PFNNotRegistered replica (%d) found to be Unavailable. 
Updating retry count" % fileID) return self.incrementProblematicRetry(fileID) # HACK until we can obtain the space token descriptions through GFAL site = seName.split('_')[0].split('-')[0] if not storageMetadata['Cached']: if lfn.endswith('.raw'): seName = '%s-RAW' % site else: seName = '%s-RDST' % site elif storageMetadata['Migrated']: if lfn.startswith('/lhcb/data'): seName = '%s_M-DST' % site else: seName = '%s_MC_M-DST' % site else: if lfn.startswith('/lhcb/data'): seName = '%s-DST' % site else: seName = '%s_MC-DST' % site problematicDict['SE'] = seName res = returnSingleResult(se.getURL(lfn)) if not res['OK']: return self.__returnProblematicError(fileID, res) problematicDict['PFN'] = res['Value'] res = returnSingleResult(self.fc.addReplica({lfn: problematicDict})) if not res['OK']: return self.__returnProblematicError(fileID, res) res = returnSingleResult(self.fc.getFileMetadata(lfn)) if not res['OK']: return self.__returnProblematicError(fileID, res) if res['Value']['Size'] != storageMetadata['Size']: gLogger.info( "PFNNotRegistered replica (%d) found with catalog size mismatch. Updating prognosis" % fileID) return self.changeProblematicPrognosis(fileID, 'CatalogPFNSizeMismatch') return self.__updateCompletedFiles('PFNNotRegistered', fileID) #FIXME: Unused? 
def resolveLFNCatalogMissing(self, problematicDict): """ This takes the problematic dictionary returned by the integrity DB and resolved the LFNCatalogMissing prognosis """ lfn = problematicDict['LFN'] fileID = problematicDict['FileID'] res = returnSingleResult(self.fc.exists(lfn)) if not res['OK']: return self.__returnProblematicError(fileID, res) if res['Value']: return self.__updateCompletedFiles('LFNCatalogMissing', fileID) # Remove the file from all catalogs # RF_NOTE : here I can do it because it's a single file, but otherwise I would need to sort the path res = returnSingleResult(self.fc.removeFile(lfn)) if not res['OK']: return self.__returnProblematicError(fileID, res) return self.__updateCompletedFiles('LFNCatalogMissing', fileID) #FIXME: Unused? def resolvePFNMissing(self, problematicDict): """ This takes the problematic dictionary returned by the integrity DB and resolved the PFNMissing prognosis """ se = problematicDict['SE'] lfn = problematicDict['LFN'] fileID = problematicDict['FileID'] res = returnSingleResult(self.fc.exists(lfn)) if not res['OK']: return self.__returnProblematicError(fileID, res) if not res['Value']: gLogger.info("PFNMissing file (%d) no longer exists in catalog" % fileID) return self.__updateCompletedFiles('PFNMissing', fileID) res = returnSingleResult(StorageElement(se).exists(lfn)) if not res['OK']: return self.__returnProblematicError(fileID, res) if res['Value']: gLogger.info("PFNMissing replica (%d) is no longer missing" % fileID) return self.__updateReplicaToChecked(problematicDict) gLogger.info("PFNMissing replica (%d) does not exist" % fileID) res = returnSingleResult(self.fc.getReplicas(lfn, allStatus=True)) if not res['OK']: return self.__returnProblematicError(fileID, res) replicas = res['Value'] seSite = se.split('_')[0].split('-')[0] found = False print replicas for replicaSE in replicas.keys(): if re.search(seSite, replicaSE): found = True problematicDict['SE'] = replicaSE se = replicaSE if not found: gLogger.info( 
"PFNMissing replica (%d) is no longer registered at SE. Resolved." % fileID) return self.__updateCompletedFiles('PFNMissing', fileID) gLogger.info( "PFNMissing replica (%d) does not exist. Removing from catalog..." % fileID) res = returnSingleResult(self.fc.removeReplica({lfn: problematicDict})) if not res['OK']: return self.__returnProblematicError(fileID, res) if len(replicas) == 1: gLogger.info( "PFNMissing replica (%d) had a single replica. Updating prognosis" % fileID) return self.changeProblematicPrognosis(fileID, 'LFNZeroReplicas') res = self.dm.replicateAndRegister(problematicDict['LFN'], se) if not res['OK']: return self.__returnProblematicError(fileID, res) # If we get here the problem is solved so we can update the integrityDB return self.__updateCompletedFiles('PFNMissing', fileID) #FIXME: Unused? def resolvePFNUnavailable(self, problematicDict): """ This takes the problematic dictionary returned by the integrity DB and resolved the PFNUnavailable prognosis """ lfn = problematicDict['LFN'] se = problematicDict['SE'] fileID = problematicDict['FileID'] res = returnSingleResult(StorageElement(se).getFileMetadata(lfn)) if (not res['OK']) and (re.search('File does not exist', res['Message'])): # The file is no longer Unavailable but has now dissapeared completely gLogger.info( "PFNUnavailable replica (%d) found to be missing. Updating prognosis" % fileID) return self.changeProblematicPrognosis(fileID, 'PFNMissing') if (not res['OK']) or res['Value']['Unavailable']: gLogger.info( "PFNUnavailable replica (%d) found to still be Unavailable" % fileID) return self.incrementProblematicRetry(fileID) if res['Value']['Lost']: gLogger.info( "PFNUnavailable replica (%d) is now found to be Lost. 
Updating prognosis" % fileID) return self.changeProblematicPrognosis(fileID, 'PFNLost') gLogger.info("PFNUnavailable replica (%d) is no longer Unavailable" % fileID) # Need to make the replica okay in the Catalog return self.__updateReplicaToChecked(problematicDict) #FIXME: Unused? def resolvePFNZeroSize(self, problematicDict): """ This takes the problematic dictionary returned by the integrity DB and resolves the PFNZeroSize prognosis """ lfn = problematicDict['LFN'] seName = problematicDict['SE'] fileID = problematicDict['FileID'] se = StorageElement(seName) res = returnSingleResult(se.getFileSize(lfn)) if (not res['OK']) and (re.search('File does not exist', res['Message'])): gLogger.info( "PFNZeroSize replica (%d) found to be missing. Updating prognosis" % problematicDict['FileID']) return self.changeProblematicPrognosis(fileID, 'PFNMissing') storageSize = res['Value'] if storageSize == 0: res = returnSingleResult(se.removeFile(lfn)) if not res['OK']: return self.__returnProblematicError(fileID, res) gLogger.info( "PFNZeroSize replica (%d) removed. Updating prognosis" % problematicDict['FileID']) return self.changeProblematicPrognosis(fileID, 'PFNMissing') res = returnSingleResult(self.fc.getReplicas(lfn)) if not res['OK']: return self.__returnProblematicError(fileID, res) if seName not in res['Value']: gLogger.info( "PFNZeroSize replica (%d) not registered in catalog. Updating prognosis" % problematicDict['FileID']) return self.changeProblematicPrognosis(fileID, 'PFNNotRegistered') res = returnSingleResult(self.fc.getFileMetadata(lfn)) if not res['OK']: return self.__returnProblematicError(fileID, res) catalogSize = res['Value']['Size'] if catalogSize != storageSize: gLogger.info( "PFNZeroSize replica (%d) size found to differ from registered metadata. 
Updating prognosis" % problematicDict['FileID']) return self.changeProblematicPrognosis(fileID, 'CatalogPFNSizeMismatch') return self.__updateCompletedFiles('PFNZeroSize', fileID) ############################################################################################ #FIXME: Unused? def resolveLFNZeroReplicas(self, problematicDict): """ This takes the problematic dictionary returned by the integrity DB and resolves the LFNZeroReplicas prognosis """ lfn = problematicDict['LFN'] fileID = problematicDict['FileID'] res = returnSingleResult(self.fc.getReplicas(lfn, allStatus=True)) if res['OK'] and res['Value']: gLogger.info("LFNZeroReplicas file (%d) found to have replicas" % fileID) else: gLogger.info( "LFNZeroReplicas file (%d) does not have replicas. Checking storage..." % fileID) pfnsFound = False for storageElementName in sorted( gConfig.getValue( 'Resources/StorageElementGroups/Tier1_MC_M-DST', [])): res = self.__getStoragePathExists([lfn], storageElementName) if lfn in res['Value']: gLogger.info( "LFNZeroReplicas file (%d) found storage file at %s" % (fileID, storageElementName)) self.reportProblematicReplicas( [(lfn, 'deprecatedUrl', storageElementName, 'PFNNotRegistered')], storageElementName, 'PFNNotRegistered') pfnsFound = True if not pfnsFound: gLogger.info( "LFNZeroReplicas file (%d) did not have storage files. Removing..." 
% fileID) res = returnSingleResult(self.fc.removeFile(lfn)) if not res['OK']: gLogger.error('DataIntegrityClient: failed to remove file', res['Message']) # Increment the number of retries for this file self.server.incrementProblematicRetry(fileID) return res gLogger.info("LFNZeroReplicas file (%d) removed from catalog" % fileID) # If we get here the problem is solved so we can update the integrityDB return self.__updateCompletedFiles('LFNZeroReplicas', fileID) def _reportProblematicFiles(self, lfns, reason): """ Simple wrapper function around setFileProblematic """ gLogger.info('The following %s files were found with %s' % (len(lfns), reason)) for lfn in sorted(lfns): gLogger.info(lfn) res = self.setFileProblematic(lfns, reason, sourceComponent='DataIntegrityClient') if not res['OK']: gLogger.info('Failed to update integrity DB with files', res['Message']) else: gLogger.info('Successfully updated integrity DB with files')
class InputDataAgent(OptimizerModule):
    """
        The specific Optimizer must provide the following methods:
        - initializeOptimizer() before each execution cycle
        - checkJob() - the main method called for each job
    """

    #############################################################################
    def initializeOptimizer(self):
        """Initialize specific parameters for JobSanityAgent.
        """
        self.failedMinorStatus = self.am_getOption('/FailedJobStatus', 'Input Data Not Available')
        #this will ignore failover SE files
        self.checkFileMetadata = self.am_getOption('CheckFileMetadata', True)

        self.dataManager = DataManager()
        self.resourceStatus = ResourceStatus()
        self.fc = FileCatalog()

        # SE -> sites cache, flushed every self.cacheLength seconds
        self.seToSiteMapping = {}
        self.lastCScheck = 0
        self.cacheLength = 600

        return S_OK()

    #############################################################################
    def checkJob(self, job, classAdJob):
        """
        This method does the optimization corresponding to this Agent,
        it is call for each job by the Optimizer framework

        :param job: job id
        :param classAdJob: job description (unused here)
        :return: S_OK/S_ERROR from the next optimizer hand-off
        """
        result = self.jobDB.getInputData(job)
        if not result['OK']:
            self.log.warn('Failed to get input data from JobdB for %s' % (job))
            self.log.warn(result['Message'])
            return result
        if not result['Value']:
            self.log.verbose('Job %s has no input data requirement' % (job))
            return self.setNextOptimizer(job)

        #Check if we already executed this Optimizer and the input data is resolved
        res = self.getOptimizerJobInfo(job, self.am_getModuleParam('optimizerName'))
        if res['OK'] and len(res['Value']):
            pass
        else:
            self.log.verbose('Job %s has an input data requirement and will be processed' % (job))
            inputData = result['Value']
            result = self.__resolveInputData(job, inputData)
            if not result['OK']:
                self.log.warn(result['Message'])
                return result

        return self.setNextOptimizer(job)

    #############################################################################
    def __resolveInputData(self, job, inputData):
        """This method checks the file catalog for replica information.

        :param job: job id
        :param list inputData: LFNs (possibly prefixed with 'LFN:')
        :return: S_OK(resolvedData) or S_ERROR
        """
        lfns = [fname.replace('LFN:', '') for fname in inputData]

        start = time.time()
        # In order to place jobs on Hold if a certain SE is banned we need first to check first if
        # if the replicas are really available
        replicas = self.dataManager.getActiveReplicas(lfns)
        timing = time.time() - start
        self.log.verbose('Catalog Replicas Lookup Time: %.2f seconds ' % (timing))
        if not replicas['OK']:
            self.log.warn(replicas['Message'])
            return replicas

        replicaDict = replicas['Value']

        siteCandidates = self.__checkReplicas(job, replicaDict)

        if not siteCandidates['OK']:
            self.log.warn(siteCandidates['Message'])
            return siteCandidates

        if self.checkFileMetadata:
            guids = True
            start = time.time()
            guidDict = self.fc.getFileMetadata(lfns)
            timing = time.time() - start
            self.log.info('Catalog Metadata Lookup Time: %.2f seconds ' % (timing))

            if not guidDict['OK']:
                self.log.warn(guidDict['Message'])
                guids = False
            else:
                # BUGFIX: only read guidDict['Value'] on success - previously a failed
                # metadata lookup raised KeyError here instead of degrading gracefully.
                failed = guidDict['Value']['Failed']
                if failed:
                    self.log.warn('Failed to establish some GUIDs')
                    self.log.warn(failed)
                    guids = False

            if guids:
                # Merge replica information into the metadata result per LFN.
                for lfn, reps in replicaDict['Successful'].items():
                    guidDict['Value']['Successful'][lfn].update(reps)
                replicas = guidDict

        resolvedData = {}
        resolvedData['Value'] = replicas
        resolvedData['SiteCandidates'] = siteCandidates['Value']
        result = self.setOptimizerJobInfo(
            job, self.am_getModuleParam('optimizerName'), resolvedData)
        if not result['OK']:
            self.log.warn(result['Message'])
            return result
        return S_OK(resolvedData)

    #############################################################################
    def __checkReplicas(self, job, replicaDict):
        """Check that all input lfns have valid replicas and can all be found at least in one single site.

        :param job: job id
        :param dict replicaDict: {'Successful': {lfn: reps}, 'Failed': {lfn: cause}}
        :return: S_OK(siteCandidates) or S_ERROR
        """
        badLFNs = []

        # dict.has_key() replaced by 'in': has_key is deprecated in Python 2
        # and removed in Python 3.
        if 'Successful' in replicaDict:
            for lfn, reps in replicaDict['Successful'].items():
                if not reps:
                    badLFNs.append('LFN:%s Problem: No replicas available' % (lfn))
        else:
            return S_ERROR('No replica Info available')

        if 'Failed' in replicaDict:
            for lfn, cause in replicaDict['Failed'].items():
                badLFNs.append('LFN:%s Problem: %s' % (lfn, cause))

        if badLFNs:
            self.log.info('Found %s problematic LFN(s) for job %s' % (len(badLFNs), job))
            param = '\n'.join(badLFNs)
            self.log.info(param)
            result = self.setJobParam(job, self.am_getModuleParam('optimizerName'), param)
            if not result['OK']:
                self.log.error(result['Message'])
            return S_ERROR('Input Data Not Available')

        return self.__getSiteCandidates(replicaDict['Successful'])

    #############################################################################
    # FIXME: right now this is unused...
    def __checkActiveSEs(self, job, replicaDict):
        """ Check active SE and replicas and identify possible Site candidates for
            the execution of the job

        :param job: job id
        :param dict replicaDict: replica information per LFN
        :return: S_OK(resolvedData) or S_ERROR (job put On Hold on banned SEs)
        """
        # Now let's check if some replicas might not be available due to banned SE's
        activeReplicas = self.dataManager.checkActiveReplicas(replicaDict)
        if not activeReplicas['OK']:
            # due to banned SE's input data might no be available
            msg = "On Hold: Missing replicas due to banned SE"
            self.log.info(msg)
            self.log.warn(activeReplicas['Message'])
            return S_ERROR(msg)

        activeReplicaDict = activeReplicas['Value']

        siteCandidates = self.__checkReplicas(job, activeReplicaDict)

        if not siteCandidates['OK']:
            # due to a banned SE's input data is not available at a single site
            msg = "On Hold: Input data not Available due to banned SE"
            self.log.info(msg)
            self.log.warn(siteCandidates['Message'])
            return S_ERROR(msg)

        resolvedData = {}
        resolvedData['Value'] = activeReplicas
        resolvedData['SiteCandidates'] = siteCandidates['Value']
        result = self.setOptimizerJobInfo(
            job, self.am_getModuleParam('optimizerName'), resolvedData)
        if not result['OK']:
            self.log.warn(result['Message'])
            return result
        return S_OK(resolvedData)

    #############################################################################
    def __getSitesForSE(self, se):
        """ Returns a list of sites having the given SE as a local one.
            Uses the local cache of the site-se information

        :param str se: storage element name
        :return: S_OK(list of sites) or the S_ERROR from the CS lookup
        """
        # Empty the cache if too old
        if (time.time() - self.lastCScheck) > self.cacheLength:
            self.log.verbose('Resetting the SE to site mapping cache')
            self.seToSiteMapping = {}
            self.lastCScheck = time.time()

        if se not in self.seToSiteMapping:
            sites = getSitesForSE(se)
            if sites['OK']:
                self.seToSiteMapping[se] = list(sites['Value'])
            return sites
        else:
            return S_OK(self.seToSiteMapping[se])

    #############################################################################
    def __getSiteCandidates(self, inputData):
        """This method returns a list of possible site candidates based on the
        job input data requirement.  For each site candidate, the number of files
        on disk and tape is resolved.

        :param dict inputData: {lfn: {se: replica info}}
        :return: S_OK({site: {'disk': nFiles, 'tape': nFiles}}) or S_ERROR
        """
        fileSEs = {}
        for lfn, replicas in inputData.items():
            siteList = []
            for se in replicas.keys():
                sites = self.__getSitesForSE(se)
                if sites['OK']:
                    siteList += sites['Value']
            fileSEs[lfn] = uniqueElements(siteList)

        # Intersect the per-file site lists: a candidate site must host all files.
        siteCandidates = []
        i = 0
        for _fileName, sites in fileSEs.items():
            if not i:
                siteCandidates = sites
            else:
                tempSite = []
                for site in siteCandidates:
                    if site in sites:
                        tempSite.append(site)
                siteCandidates = tempSite
            i += 1

        if not len(siteCandidates):
            return S_ERROR('No candidate sites available')

        #In addition, check number of files on tape and disk for each site
        #for optimizations during scheduling
        siteResult = {}
        for site in siteCandidates:
            siteResult[site] = {'disk': [], 'tape': []}

        seDict = {}
        for lfn, replicas in inputData.items():
            for se in replicas.keys():
                if se not in seDict:
                    sites = self.__getSitesForSE(se)
                    if not sites['OK']:
                        continue
                    try:
                        #storageElement = StorageElement( se )
                        result = self.resourceStatus.getStorageElementStatus(
                            se, statusType='ReadAccess')
                        if not result['OK']:
                            continue
                        seDict[se] = {'Sites': sites['Value'],
                                      'SEParams': result['Value'][se]}
                        result = getStorageElementOptions(se)
                        if not result['OK']:
                            continue
                        seDict[se]['SEParams'].update(result['Value'])
                    except Exception:
                        self.log.exception('Failed to instantiate StorageElement( %s )' % se)
                        continue
                for site in seDict[se]['Sites']:
                    if site in siteCandidates:
                        # A file counts as on disk if any readable disk SE holds it;
                        # moving it to 'disk' removes a previous 'tape' entry.
                        if seDict[se]['SEParams']['ReadAccess'] and seDict[se]['SEParams']['DiskSE']:
                            if lfn not in siteResult[site]['disk']:
                                siteResult[site]['disk'].append(lfn)
                                if lfn in siteResult[site]['tape']:
                                    siteResult[site]['tape'].remove(lfn)
                        if seDict[se]['SEParams']['ReadAccess'] and seDict[se]['SEParams']['TapeSE']:
                            if lfn not in siteResult[site]['tape'] and lfn not in siteResult[site]['disk']:
                                siteResult[site]['tape'].append(lfn)

        # Collapse the LFN lists into counts for the scheduler.
        for site in siteResult:
            siteResult[site]['disk'] = len(siteResult[site]['disk'])
            siteResult[site]['tape'] = len(siteResult[site]['tape'])
        return S_OK(siteResult)
class File:
  """ In-memory representation of a catalog file: an LFN with its status, size,
      GUID and checksum, plus the list of known catalog replicas.

      Setters validate types and return S_OK/S_ERROR; getters lazily populate
      missing metadata from the file catalog via __populateMetadata().
  """

  def __init__(self, lfn='', status='', size=0, guid='', checksum=''):
    # These are the possible attributes for a file
    if not type(lfn) in types.StringTypes:
      raise AttributeError, "lfn should be string type"
    self.lfn = str(lfn)
    if not type(status) in types.StringTypes:
      raise AttributeError, "status should be string type"
    self.status = str(status)
    try:
      self.size = int(size)
    except:
      raise AttributeError, "size should be integer type"
    if not type(guid) in types.StringTypes:
      raise AttributeError, "guid should be string type"
    self.guid = str(guid)
    if not type(checksum) in types.StringTypes:
      raise AttributeError, "checksum should be string type"
    self.checksum = str(checksum)
    # List of CatalogReplica objects known for this file
    self.catalogReplicas = []
    # Catalog client used by the lazy metadata/replica lookups
    self.fc = FileCatalog()

  def setLFN(self,lfn):
    """ Set the LFN after type validation. """
    if not type(lfn) in types.StringTypes:
      return S_ERROR("LFN should be %s and not %s" % (types.StringType,type(lfn)))
    self.lfn = str(lfn)
    return S_OK()

  def setStatus(self,status):
    """ Set the status string after type validation. """
    if not type(status) in types.StringTypes:
      return S_ERROR("Status should be %s and not %s" % (types.StringType,type(status)))
    self.status = str(status)
    return S_OK()

  def setSize(self,size):
    """ Set the size (coerced to int). """
    try:
      self.size = int(size)
      return S_OK()
    except:
      return S_ERROR("Size should be %s and not %s" % (types.IntType,type(size)))

  def setGUID(self,guid):
    """ Set the GUID after type validation. """
    if not type(guid) in types.StringTypes:
      return S_ERROR("GUID should be %s and not %s" % (types.StringType,type(guid)))
    self.guid = str(guid)
    return S_OK()

  def setChecksum(self,checksum):
    """ Set the checksum after type validation. """
    if not type(checksum) in types.StringTypes:
      return S_ERROR("Checksum should be %s and not %s" % (types.StringType,type(checksum)))
    self.checksum = str(checksum)
    return S_OK()

  def addCatalogReplica(self,se,pfn,status='U'):
    """ Add a catalog replica unless the same (se, pfn) pair is already present. """
    for replica in self.catalogReplicas:
      if (replica.pfn == pfn) and (replica.se == se):
        return S_OK()
    oCatalogReplica = CatalogReplica(pfn=pfn,storageElement=se,status=status)
    self.catalogReplicas.append(oCatalogReplica)
    return S_OK()

  def getLFN(self):
    """ Return the LFN. """
    return S_OK(self.lfn)

  def getStatus(self):
    """ Return the status, fetching metadata from the catalog if not yet known. """
    if self.status:
      return S_OK(self.status)
    if not self.lfn:
      return S_ERROR('No LFN is known')
    res = self.__populateMetadata()
    if not res['OK']:
      return res
    return S_OK(self.status)

  def getSize(self):
    """ Return the size, fetching metadata from the catalog if not yet known. """
    if self.size:
      return S_OK(self.size)
    if not self.lfn:
      return S_ERROR('No LFN is known')
    res = self.__populateMetadata()
    if not res['OK']:
      return res
    return S_OK(self.size)

  def getGUID(self):
    """ Return the GUID, fetching metadata from the catalog if not yet known. """
    if self.guid:
      return S_OK(self.guid)
    if not self.lfn:
      return S_ERROR('No LFN is known')
    res = self.__populateMetadata()
    if not res['OK']:
      return res
    return S_OK(self.guid)

  def getChecksum(self):
    """ Return the checksum, fetching metadata from the catalog if not yet known. """
    if self.checksum:
      return S_OK(self.checksum)
    if not self.lfn:
      return S_ERROR('No LFN is known')
    res = self.__populateMetadata()
    if not res['OK']:
      return res
    return S_OK(self.checksum)

  def __populateMetadata(self):
    """ Fill checksum/GUID/size/status from the catalog metadata of self.lfn. """
    # NOTE(review): Utils.executeSingleFileOrDirWrapper is presumably the single-file
    # unwrapper (like returnSingleResult) - confirm it is still available.
    res = Utils.executeSingleFileOrDirWrapper( self.fc.getFileMetadata( self.lfn ) )
    if not res['OK']:
      return res
    metadata = res['Value']
    self.setChecksum(metadata['Checksum'])
    self.setGUID(metadata['GUID'])
    self.setSize(metadata['Size'])
    self.setStatus(metadata['Status'])
    return S_OK()

  def hasCatalogReplicas(self):
    """ Return S_OK(True/False) depending on whether replicas are cached locally. """
    if self.catalogReplicas:
      return S_OK(True)
    return S_OK(False)

  def clearCatalogReplicas(self):
    """ Drop the locally cached replica list. """
    self.catalogReplicas = []
    return S_OK()

  def getReplicas(self):
    """ Return {se: pfn} replicas, from the local cache or from the catalog. """
    if not self.lfn:
      return S_ERROR('No LFN is known')
    if self.catalogReplicas:
      replicas = {}
      for replica in self.catalogReplicas:
        replicas[replica.se] = replica.pfn
      return S_OK(replicas)
    # NOTE(review): FileCatalog usually exposes getReplicas(); confirm that
    # getCatalogReplicas() exists on this client.
    res = Utils.executeSingleFileOrDirWrapper( self.fc.getCatalogReplicas( self.lfn ) )
    if not res['OK']:
      return res
    replicas = res['Value']
    for se,pfn in replicas.items():
      oCatalogReplica = CatalogReplica(pfn=pfn,storageElement=se,status='U')
      self.catalogReplicas.append(oCatalogReplica)
    return S_OK(replicas)

  def digest(self):
    """ Get short description string of file attributes """
    return S_OK("%s:%s:%d:%s:%s" % (self.lfn,self.status,self.size,self.guid,self.checksum))

  def toCFG(self):
    """ Get the full description of the file in CFG format """
    oCFG = CFG()
    # '/' is not allowed in CFG section names, so it is escaped with '&&'
    strippedLFN = self.lfn.replace('/','&&')
    oCFG.createNewSection(strippedLFN)
    oCFG.setOption('%s/Status' % (strippedLFN), self.status)
    oCFG.setOption('%s/Size' % (strippedLFN), self.size)
    oCFG.setOption('%s/GUID' % (strippedLFN), self.guid)
    oCFG.setOption('%s/Checksum' % (strippedLFN), self.checksum)
    #TODO: still have to include the CFG from the replica objects
    if self.catalogReplicas:
      oCFG.createNewSection('%s/CatalogReplicas' % strippedLFN)
      for replica in self.catalogReplicas:
        # Placeholder: replica serialization not implemented yet (see TODO above)
        pass
        # rCFG.mergeWith(CFG().loadFromBuffer(replica.toCFG()['Value']))
    return S_OK(str(oCFG))
class DataIntegrityClient( Client ):
  """ Client for the DataManagement/DataIntegrity service: reports problematic
  files/replicas and resolves problematics according to their prognosis.

  The following methods are supported in the service but are not mentioned explicitly here:

          getProblematic()
             Obtains a problematic file from the IntegrityDB based on the LastUpdate time

          getPrognosisProblematics(prognosis)
            Obtains all the problematics of a particular prognosis from the integrityDB

          getProblematicsSummary()
            Obtains a count of the number of problematics for each prognosis found

          getDistinctPrognosis()
            Obtains the distinct prognosis found in the integrityDB

          getTransformationProblematics(prodID)
            Obtains the problematics for a given production

          incrementProblematicRetry(fileID)
            Increments the retry count for the supplied file ID

          changeProblematicPrognosis(fileID,newPrognosis)
            Changes the prognosis of the supplied file to the new prognosis

          setProblematicStatus(fileID,status)
            Updates the status of a problematic in the integrityDB

          removeProblematic(self,fileID)
            This removes the specified file ID from the integrity DB

          insertProblematic(sourceComponent,fileMetadata)
            Inserts file with supplied metadata into the integrity DB
  """

  def __init__( self, **kwargs ):
    """ c'tor: point the client at the DataIntegrity service and create the
        DataManager / FileCatalog helpers used by the resolution methods.
    """
    super(DataIntegrityClient, self).__init__( **kwargs )
    self.setServer( 'DataManagement/DataIntegrity' )
    self.dm = DataManager()
    self.fc = FileCatalog()

  def setFileProblematic( self, lfn, reason, sourceComponent = '' ):
    """ This method updates the status of the file in the FileCatalog and the IntegrityDB

        lfn - the lfn of the file
        reason - this is given to the integrity DB and should reflect the problem observed with the file

        sourceComponent is the component issuing the request.
    """
    if isinstance( lfn, list ):
      lfns = lfn
    elif isinstance( lfn, basestring ):
      lfns = [lfn]
    else:
      errStr = "DataIntegrityClient.setFileProblematic: Supplied file info must be list or a single LFN."
      gLogger.error( errStr )
      return S_ERROR( errStr )
    gLogger.info( "DataIntegrityClient.setFileProblematic: Attempting to update %s files." % len( lfns ) )
    fileMetadata = {}
    for lfn in lfns:
      # PFN and SE are unknown at this level - the record is file-wide
      fileMetadata[lfn] = {'Prognosis':reason, 'LFN':lfn, 'PFN':'', 'SE':''}
    res = self.insertProblematic( sourceComponent, fileMetadata )
    if not res['OK']:
      gLogger.error( "DataIntegrityClient.setReplicaProblematic: Failed to insert problematics to integrity DB" )
    return res

  def reportProblematicReplicas( self, replicaTuple, se, reason ):
    """ Simple wrapper function around setReplicaProblematic """
    gLogger.info( 'The following %s files had %s at %s' % ( len( replicaTuple ), reason, se ) )
    for lfn, _pfn, se, reason in sorted( replicaTuple ):
      if lfn:
        gLogger.info( lfn )
    res = self.setReplicaProblematic( replicaTuple, sourceComponent = 'DataIntegrityClient' )
    if not res['OK']:
      gLogger.info( 'Failed to update integrity DB with replicas', res['Message'] )
    else:
      gLogger.info( 'Successfully updated integrity DB with replicas' )

  def setReplicaProblematic( self, replicaTuple, sourceComponent = '' ):
    """ This method updates the status of the replica in the FileCatalog and the IntegrityDB
        The supplied replicaDict should be of the form {lfn :{'PFN':pfn,'SE':se,'Prognosis':prognosis}

        lfn - the lfn of the file
        pfn - the pfn if available (otherwise '')
        se - the storage element of the problematic replica (otherwise '')
        prognosis - this is given to the integrity DB and should reflect the problem observed with the file

        sourceComponent is the component issuing the request.
    """
    if isinstance( replicaTuple, tuple ):
      replicaTuple = [replicaTuple]
    elif isinstance( replicaTuple, list ):
      pass
    else:
      errStr = "DataIntegrityClient.setReplicaProblematic: Supplied replica info must be a tuple or list of tuples."
      gLogger.error( errStr )
      return S_ERROR( errStr )
    gLogger.info( "DataIntegrityClient.setReplicaProblematic: Attempting to update %s replicas." % len( replicaTuple ) )
    replicaDict = {}
    for lfn, pfn, se, reason in replicaTuple:
      replicaDict[lfn] = {'Prognosis':reason, 'LFN':lfn, 'PFN':pfn, 'SE':se}
    res = self.insertProblematic( sourceComponent, replicaDict )
    if not res['OK']:
      gLogger.error( "DataIntegrityClient.setReplicaProblematic: Failed to insert problematic to integrity DB" )
      return res
    # Mark the replicas 'Problematic' in the catalog as well
    for lfn in replicaDict.keys():
      replicaDict[lfn]['Status'] = 'Problematic'

    res = self.fc.setReplicaStatus( replicaDict )
    if not res['OK']:
      errStr = "DataIntegrityClient.setReplicaProblematic: Completely failed to update replicas."
      gLogger.error( errStr, res['Message'] )
      return res
    failed = res['Value']['Failed']
    successful = res['Value']['Successful']
    resDict = {'Successful':successful, 'Failed':failed}
    return S_OK( resDict )

  ##########################################################################
  #
  # This section contains the resolution methods for various prognoses
  #

  def __updateCompletedFiles( self, prognosis, fileID ):
    # Mark the problematic as fully resolved in the integrity DB.
    gLogger.info( "%s file (%d) is resolved" % ( prognosis, fileID ) )
    return self.setProblematicStatus( fileID, 'Resolved' )

  def __returnProblematicError( self, fileID, res ):
    # Log the failure, bump the retry counter and hand the error back to the caller.
    self.incrementProblematicRetry( fileID )
    gLogger.error( 'DataIntegrityClient failure', res['Message'] )
    return res

  def __updateReplicaToChecked( self, problematicDict ):
    # Flip the replica status to 'Checked' in the catalog, then resolve the problematic.
    lfn = problematicDict['LFN']
    fileID = problematicDict['FileID']
    prognosis = problematicDict['Prognosis']
    problematicDict['Status'] = 'Checked'

    res = returnSingleResult( self.fc.setReplicaStatus( {lfn:problematicDict} ) )
    if not res['OK']:
      return self.__returnProblematicError( fileID, res )

    gLogger.info( "%s replica (%d) is updated to Checked status" % ( prognosis, fileID ) )
    return self.__updateCompletedFiles( prognosis, fileID )

  def resolveCatalogPFNSizeMismatch( self, problematicDict ):
    """ This takes the problematic dictionary returned by the integrity DB and resolved the CatalogPFNSizeMismatch prognosis

        Compares catalog, storage and bookkeeping sizes and either validates the
        replica, removes a bad replica, or re-classifies the problematic.
    """
    lfn = problematicDict['LFN']
    se = problematicDict['SE']
    fileID = problematicDict['FileID']

    res = returnSingleResult( self.fc.getFileSize( lfn ) )
    if not res['OK']:
      return self.__returnProblematicError( fileID, res )
    catalogSize = res['Value']
    res = returnSingleResult( StorageElement( se ).getFileSize( lfn ) )
    if not res['OK']:
      return self.__returnProblematicError( fileID, res )
    storageSize = res['Value']
    bkKCatalog = FileCatalog( ['BookkeepingDB'] )
    res = returnSingleResult( bkKCatalog.getFileSize( lfn ) )
    if not res['OK']:
      return self.__returnProblematicError( fileID, res )
    bookkeepingSize = res['Value']
    if bookkeepingSize == catalogSize == storageSize:
      # All three agree - the replica is actually fine.
      gLogger.info( "CatalogPFNSizeMismatch replica (%d) matched all registered sizes." % fileID )
      return self.__updateReplicaToChecked( problematicDict )
    if catalogSize == bookkeepingSize:
      # Storage copy disagrees with both catalogs - drop it if other replicas exist.
      gLogger.info( "CatalogPFNSizeMismatch replica (%d) found to mismatch the bookkeeping also" % fileID )
      res = returnSingleResult( self.fc.getReplicas( lfn ) )
      if not res['OK']:
        return self.__returnProblematicError( fileID, res )
      if len( res['Value'] ) <= 1:
        gLogger.info( "CatalogPFNSizeMismatch replica (%d) has no other replicas." % fileID )
        return S_ERROR( "Not removing catalog file mismatch since the only replica" )
      else:
        gLogger.info( "CatalogPFNSizeMismatch replica (%d) has other replicas. Removing..." % fileID )
        res = self.dm.removeReplica( se, lfn )
        if not res['OK']:
          return self.__returnProblematicError( fileID, res )
        return self.__updateCompletedFiles( 'CatalogPFNSizeMismatch', fileID )
    if ( catalogSize != bookkeepingSize ) and ( bookkeepingSize == storageSize ):
      # Catalog is the odd one out - re-classify against the bookkeeping.
      gLogger.info( "CatalogPFNSizeMismatch replica (%d) found to match the bookkeeping size" % fileID )
      res = self.__updateReplicaToChecked( problematicDict )
      if not res['OK']:
        return self.__returnProblematicError( fileID, res )
      return self.changeProblematicPrognosis( fileID, 'BKCatalogSizeMismatch' )
    # All three sizes differ - retry later.
    gLogger.info( "CatalogPFNSizeMismatch replica (%d) all sizes found mismatch. Updating retry count" % fileID )
    return self.incrementProblematicRetry( fileID )

  #FIXME: Unused?
  def resolvePFNNotRegistered( self, problematicDict ):
    """ This takes the problematic dictionary returned by the integrity DB and resolved the PFNNotRegistered prognosis

        A storage copy exists that is not registered in the catalog: either
        remove the orphan or register it (with an LHCb-specific SE-name hack).
    """
    lfn = problematicDict['LFN']
    seName = problematicDict['SE']
    fileID = problematicDict['FileID']

    se = StorageElement( seName )
    res = returnSingleResult( self.fc.exists( lfn ) )
    if not res['OK']:
      return self.__returnProblematicError( fileID, res )
    if not res['Value']:
      # The file does not exist in the catalog
      res = returnSingleResult( se.removeFile( lfn ) )
      if not res['OK']:
        return self.__returnProblematicError( fileID, res )
      return self.__updateCompletedFiles( 'PFNNotRegistered', fileID )
    res = returnSingleResult( se.getFileMetadata( lfn ) )
    if ( not res['OK'] ) and ( re.search( 'File does not exist', res['Message'] ) ):
      gLogger.info( "PFNNotRegistered replica (%d) found to be missing." % fileID )
      return self.__updateCompletedFiles( 'PFNNotRegistered', fileID )
    elif not res['OK']:
      return self.__returnProblematicError( fileID, res )
    storageMetadata = res['Value']
    if storageMetadata['Lost']:
      gLogger.info( "PFNNotRegistered replica (%d) found to be Lost. Updating prognosis" % fileID )
      return self.changeProblematicPrognosis( fileID, 'PFNLost' )
    if storageMetadata['Unavailable']:
      # Temporary condition - retry later.
      gLogger.info( "PFNNotRegistered replica (%d) found to be Unavailable. Updating retry count" % fileID )
      return self.incrementProblematicRetry( fileID )

    # HACK until we can obtain the space token descriptions through GFAL
    site = seName.split( '_' )[0].split( '-' )[0]
    if not storageMetadata['Cached']:
      if lfn.endswith( '.raw' ):
        seName = '%s-RAW' % site
      else:
        seName = '%s-RDST' % site
    elif storageMetadata['Migrated']:
      if lfn.startswith( '/lhcb/data' ):
        seName = '%s_M-DST' % site
      else:
        seName = '%s_MC_M-DST' % site
    else:
      if lfn.startswith( '/lhcb/data' ):
        seName = '%s-DST' % site
      else:
        seName = '%s_MC-DST' % site
    problematicDict['SE'] = seName
    res = returnSingleResult( se.getURL( lfn ) )
    if not res['OK']:
      return self.__returnProblematicError( fileID, res )

    # Register the replica and cross-check the catalog size against storage.
    problematicDict['PFN'] = res['Value']
    res = returnSingleResult( self.fc.addReplica( {lfn:problematicDict} ) )
    if not res['OK']:
      return self.__returnProblematicError( fileID, res )
    res = returnSingleResult( self.fc.getFileMetadata( lfn ) )
    if not res['OK']:
      return self.__returnProblematicError( fileID, res )
    if res['Value']['Size'] != storageMetadata['Size']:
      gLogger.info( "PFNNotRegistered replica (%d) found with catalog size mismatch. Updating prognosis" % fileID )
      return self.changeProblematicPrognosis( fileID, 'CatalogPFNSizeMismatch' )
    return self.__updateCompletedFiles( 'PFNNotRegistered', fileID )

  #FIXME: Unused?
def resolveLFNCatalogMissing( self, problematicDict ): """ This takes the problematic dictionary returned by the integrity DB and resolved the LFNCatalogMissing prognosis """ lfn = problematicDict['LFN'] fileID = problematicDict['FileID'] res = returnSingleResult( self.fc.exists( lfn ) ) if not res['OK']: return self.__returnProblematicError( fileID, res ) if res['Value']: return self.__updateCompletedFiles( 'LFNCatalogMissing', fileID ) # Remove the file from all catalogs # RF_NOTE : here I can do it because it's a single file, but otherwise I would need to sort the path res = returnSingleResult( self.fc.removeFile( lfn ) ) if not res['OK']: return self.__returnProblematicError( fileID, res ) return self.__updateCompletedFiles( 'LFNCatalogMissing', fileID ) #FIXME: Unused? def resolvePFNMissing( self, problematicDict ): """ This takes the problematic dictionary returned by the integrity DB and resolved the PFNMissing prognosis """ se = problematicDict['SE'] lfn = problematicDict['LFN'] fileID = problematicDict['FileID'] res = returnSingleResult( self.fc.exists( lfn ) ) if not res['OK']: return self.__returnProblematicError( fileID, res ) if not res['Value']: gLogger.info( "PFNMissing file (%d) no longer exists in catalog" % fileID ) return self.__updateCompletedFiles( 'PFNMissing', fileID ) res = returnSingleResult( StorageElement( se ).exists( lfn ) ) if not res['OK']: return self.__returnProblematicError( fileID, res ) if res['Value']: gLogger.info( "PFNMissing replica (%d) is no longer missing" % fileID ) return self.__updateReplicaToChecked( problematicDict ) gLogger.info( "PFNMissing replica (%d) does not exist" % fileID ) res = returnSingleResult( self.fc.getReplicas( lfn, allStatus = True ) ) if not res['OK']: return self.__returnProblematicError( fileID, res ) replicas = res['Value'] seSite = se.split( '_' )[0].split( '-' )[0] found = False print replicas for replicaSE in replicas.keys(): if re.search( seSite, replicaSE ): found = True 
problematicDict['SE'] = replicaSE se = replicaSE if not found: gLogger.info( "PFNMissing replica (%d) is no longer registered at SE. Resolved." % fileID ) return self.__updateCompletedFiles( 'PFNMissing', fileID ) gLogger.info( "PFNMissing replica (%d) does not exist. Removing from catalog..." % fileID ) res = returnSingleResult( self.fc.removeReplica( {lfn:problematicDict} ) ) if not res['OK']: return self.__returnProblematicError( fileID, res ) if len( replicas ) == 1: gLogger.info( "PFNMissing replica (%d) had a single replica. Updating prognosis" % fileID ) return self.changeProblematicPrognosis( fileID, 'LFNZeroReplicas' ) res = self.dm.replicateAndRegister( problematicDict['LFN'], se ) if not res['OK']: return self.__returnProblematicError( fileID, res ) # If we get here the problem is solved so we can update the integrityDB return self.__updateCompletedFiles( 'PFNMissing', fileID ) #FIXME: Unused? def resolvePFNUnavailable( self, problematicDict ): """ This takes the problematic dictionary returned by the integrity DB and resolved the PFNUnavailable prognosis """ lfn = problematicDict['LFN'] se = problematicDict['SE'] fileID = problematicDict['FileID'] res = returnSingleResult( StorageElement( se ).getFileMetadata( lfn ) ) if ( not res['OK'] ) and ( re.search( 'File does not exist', res['Message'] ) ): # The file is no longer Unavailable but has now dissapeared completely gLogger.info( "PFNUnavailable replica (%d) found to be missing. Updating prognosis" % fileID ) return self.changeProblematicPrognosis( fileID, 'PFNMissing' ) if ( not res['OK'] ) or res['Value']['Unavailable']: gLogger.info( "PFNUnavailable replica (%d) found to still be Unavailable" % fileID ) return self.incrementProblematicRetry( fileID ) if res['Value']['Lost']: gLogger.info( "PFNUnavailable replica (%d) is now found to be Lost. 
Updating prognosis" % fileID ) return self.changeProblematicPrognosis( fileID, 'PFNLost' ) gLogger.info( "PFNUnavailable replica (%d) is no longer Unavailable" % fileID ) # Need to make the replica okay in the Catalog return self.__updateReplicaToChecked( problematicDict ) #FIXME: Unused? def resolvePFNZeroSize( self, problematicDict ): """ This takes the problematic dictionary returned by the integrity DB and resolves the PFNZeroSize prognosis """ lfn = problematicDict['LFN'] seName = problematicDict['SE'] fileID = problematicDict['FileID'] se = StorageElement( seName ) res = returnSingleResult( se.getFileSize( lfn ) ) if ( not res['OK'] ) and ( re.search( 'File does not exist', res['Message'] ) ): gLogger.info( "PFNZeroSize replica (%d) found to be missing. Updating prognosis" % problematicDict['FileID'] ) return self.changeProblematicPrognosis( fileID, 'PFNMissing' ) storageSize = res['Value'] if storageSize == 0: res = returnSingleResult( se.removeFile( lfn ) ) if not res['OK']: return self.__returnProblematicError( fileID, res ) gLogger.info( "PFNZeroSize replica (%d) removed. Updating prognosis" % problematicDict['FileID'] ) return self.changeProblematicPrognosis( fileID, 'PFNMissing' ) res = returnSingleResult( self.fc.getReplicas( lfn ) ) if not res['OK']: return self.__returnProblematicError( fileID, res ) if seName not in res['Value']: gLogger.info( "PFNZeroSize replica (%d) not registered in catalog. Updating prognosis" % problematicDict['FileID'] ) return self.changeProblematicPrognosis( fileID, 'PFNNotRegistered' ) res = returnSingleResult( self.fc.getFileMetadata( lfn ) ) if not res['OK']: return self.__returnProblematicError( fileID, res ) catalogSize = res['Value']['Size'] if catalogSize != storageSize: gLogger.info( "PFNZeroSize replica (%d) size found to differ from registered metadata. 
Updating prognosis" % problematicDict['FileID'] ) return self.changeProblematicPrognosis( fileID, 'CatalogPFNSizeMismatch' ) return self.__updateCompletedFiles( 'PFNZeroSize', fileID ) ############################################################################################ #FIXME: Unused? def resolveLFNZeroReplicas( self, problematicDict ): """ This takes the problematic dictionary returned by the integrity DB and resolves the LFNZeroReplicas prognosis """ lfn = problematicDict['LFN'] fileID = problematicDict['FileID'] res = returnSingleResult( self.fc.getReplicas( lfn, allStatus = True ) ) if res['OK'] and res['Value']: gLogger.info( "LFNZeroReplicas file (%d) found to have replicas" % fileID ) else: gLogger.info( "LFNZeroReplicas file (%d) does not have replicas. Checking storage..." % fileID ) pfnsFound = False for storageElementName in sorted( gConfig.getValue( 'Resources/StorageElementGroups/Tier1_MC_M-DST', [] ) ): res = self.__getStoragePathExists( [lfn], storageElementName ) if lfn in res['Value']: gLogger.info( "LFNZeroReplicas file (%d) found storage file at %s" % ( fileID, storageElementName ) ) self.reportProblematicReplicas( [( lfn, 'deprecatedUrl', storageElementName, 'PFNNotRegistered' )], storageElementName, 'PFNNotRegistered' ) pfnsFound = True if not pfnsFound: gLogger.info( "LFNZeroReplicas file (%d) did not have storage files. Removing..." 
% fileID ) res = returnSingleResult( self.fc.removeFile( lfn ) ) if not res['OK']: gLogger.error( 'DataIntegrityClient: failed to remove file', res['Message'] ) # Increment the number of retries for this file self.server.incrementProblematicRetry( fileID ) return res gLogger.info( "LFNZeroReplicas file (%d) removed from catalog" % fileID ) # If we get here the problem is solved so we can update the integrityDB return self.__updateCompletedFiles( 'LFNZeroReplicas', fileID ) def _reportProblematicFiles( self, lfns, reason ): """ Simple wrapper function around setFileProblematic """ gLogger.info( 'The following %s files were found with %s' % ( len( lfns ), reason ) ) for lfn in sorted( lfns ): gLogger.info( lfn ) res = self.setFileProblematic( lfns, reason, sourceComponent = 'DataIntegrityClient' ) if not res['OK']: gLogger.info( 'Failed to update integrity DB with files', res['Message'] ) else: gLogger.info( 'Successfully updated integrity DB with files' )
class ReplicateAndRegister(DMSRequestOperationsBase):
    """
    .. class:: ReplicateAndRegister

    ReplicateAndRegister operation handler

    Replicates each waiting file of the RMS Operation to its target SEs,
    either by scheduling FTS/FTS3 transfers or by calling the DataManager
    directly, and registers the new replicas.
    """

    def __init__(self, operation=None, csPath=None):
        """c'tor

        :param self: self reference
        :param Operation operation: Operation instance
        :param str csPath: CS path for this handler
        """
        super(ReplicateAndRegister, self).__init__(operation, csPath)
        # # own gMonitor stuff for files
        gMonitor.registerActivity("ReplicateAndRegisterAtt",
                                  "Replicate and register attempted",
                                  "RequestExecutingAgent", "Files/min", gMonitor.OP_SUM)
        gMonitor.registerActivity("ReplicateOK", "Replications successful",
                                  "RequestExecutingAgent", "Files/min", gMonitor.OP_SUM)
        gMonitor.registerActivity("ReplicateFail", "Replications failed",
                                  "RequestExecutingAgent", "Files/min", gMonitor.OP_SUM)
        gMonitor.registerActivity("RegisterOK", "Registrations successful",
                                  "RequestExecutingAgent", "Files/min", gMonitor.OP_SUM)
        gMonitor.registerActivity("RegisterFail", "Registrations failed",
                                  "RequestExecutingAgent", "Files/min", gMonitor.OP_SUM)
        # # for FTS
        gMonitor.registerActivity("FTSScheduleAtt", "Files schedule attempted",
                                  "RequestExecutingAgent", "Files/min", gMonitor.OP_SUM)
        gMonitor.registerActivity("FTSScheduleOK", "File schedule successful",
                                  "RequestExecutingAgent", "Files/min", gMonitor.OP_SUM)
        gMonitor.registerActivity("FTSScheduleFail", "File schedule failed",
                                  "RequestExecutingAgent", "Files/min", gMonitor.OP_SUM)
        # # SE cache
        # Clients
        self.fc = FileCatalog()

    def __call__(self):
        """ call me maybe """
        # # check replicas first
        checkReplicas = self.__checkReplicas()
        if not checkReplicas["OK"]:
            self.log.error('Failed to check replicas', checkReplicas["Message"])

        # FTSMode/FTSBannedGroups/UseNewFTS3 are CS-driven attributes set on the
        # handler by the agent (not visible in this class).
        if hasattr(self, "FTSMode") and getattr(self, "FTSMode"):
            bannedGroups = getattr(self, "FTSBannedGroups") if hasattr(self, "FTSBannedGroups") else ()
            if self.request.OwnerGroup in bannedGroups:
                self.log.verbose("usage of FTS system is banned for request's owner")
                return self.dmTransfer()

            if getattr(self, 'UseNewFTS3', False):
                return self.fts3Transfer()
            else:
                return self.ftsTransfer()

        return self.dmTransfer()

    def __checkReplicas(self):
        """ check done replicas and update file states

        Marks files "Done" when all target SEs already hold a replica, and
        "Failed" when the catalog reports the LFN as missing.
        """
        waitingFiles = dict([(opFile.LFN, opFile) for opFile in self.operation
                             if opFile.Status in ("Waiting", "Scheduled")])
        targetSESet = set(self.operation.targetSEList)

        replicas = self.fc.getReplicas(waitingFiles.keys())
        if not replicas["OK"]:
            self.log.error('Failed to get replicas', replicas["Message"])
            return replicas

        reMissing = re.compile(r".*such file.*")
        for failedLFN, errStr in replicas["Value"]["Failed"].iteritems():
            waitingFiles[failedLFN].Error = errStr
            if reMissing.search(errStr.lower()):
                # NOTE(review): message typo kept as-is ("does not exists") —
                # it is a runtime log string.
                self.log.error("File does not exists", failedLFN)
                gMonitor.addMark("ReplicateFail", len(targetSESet))
                waitingFiles[failedLFN].Status = "Failed"

        for successfulLFN, reps in replicas["Value"]["Successful"].iteritems():
            if targetSESet.issubset(set(reps)):
                self.log.info("file %s has been replicated to all targets" % successfulLFN)
                waitingFiles[successfulLFN].Status = "Done"

        return S_OK()

    def _addMetadataToFiles(self, toSchedule):
        """ Add metadata to those files that need to be scheduled through FTS

        toSchedule is a dictionary:
        {'lfn1': opFile, 'lfn2': opFile}

        :returns: S_OK({lfn: opFile}) with GUID/Checksum/Size filled in from
            the FileCatalog, or an error structure
        """
        if toSchedule:
            self.log.info("found %s files to schedule, getting metadata from FC" % len(toSchedule))
        else:
            self.log.verbose("No files to schedule")
            return S_OK([])

        res = self.fc.getFileMetadata(toSchedule.keys())
        if not res['OK']:
            return res
        else:
            if res['Value']['Failed']:
                self.log.warn("Can't schedule %d files: problems getting the metadata: %s" %
                              (len(res['Value']['Failed']), ', '.join(res['Value']['Failed'])))
            metadata = res['Value']['Successful']

        filesToSchedule = {}

        for lfn, lfnMetadata in metadata.iteritems():
            # toSchedule values are lists whose first element is the File object
            # (callers append validReplicas/validTargets after it).
            opFileToSchedule = toSchedule[lfn][0]
            opFileToSchedule.GUID = lfnMetadata['GUID']
            # In principle this is defined already in filterReplicas()
            if not opFileToSchedule.Checksum:
                opFileToSchedule.Checksum = metadata[lfn]['Checksum']
                opFileToSchedule.ChecksumType = metadata[lfn]['ChecksumType']
            opFileToSchedule.Size = metadata[lfn]['Size']

            filesToSchedule[opFileToSchedule.LFN] = opFileToSchedule

        return S_OK(filesToSchedule)

    def _filterReplicas(self, opFile):
        """ filter out banned/invalid source SEs """
        return filterReplicas(opFile, logger=self.log, dataManager=self.dm)

    def ftsTransfer(self):
        """ replicate and register using FTS

        Builds the list of schedulable files (with valid source replicas and
        outstanding targets), submits them through FTSClient, and finally
        retries anything unscheduled via dmTransfer().
        """

        self.log.info("scheduling files in FTS...")

        bannedTargets = self.checkSEsRSS()
        if not bannedTargets['OK']:
            gMonitor.addMark("FTSScheduleAtt")
            gMonitor.addMark("FTSScheduleFail")
            return bannedTargets

        if bannedTargets['Value']:
            return S_OK("%s targets are banned for writing" % ",".join(bannedTargets['Value']))

        # Can continue now
        self.log.verbose("No targets banned for writing")

        toSchedule = {}

        delayExecution = 0
        errors = defaultdict(int)
        for opFile in self.getWaitingFilesList():
            opFile.Error = ''
            gMonitor.addMark("FTSScheduleAtt")
            # # check replicas
            replicas = self._filterReplicas(opFile)
            if not replicas["OK"]:
                continue
            replicas = replicas["Value"]

            validReplicas = replicas.get("Valid")
            noMetaReplicas = replicas.get("NoMetadata")
            noReplicas = replicas.get('NoReplicas')
            badReplicas = replicas.get('Bad')
            noActiveReplicas = replicas.get('NoActiveReplicas')

            if validReplicas:
                validTargets = list(set(self.operation.targetSEList) - set(validReplicas))
                if not validTargets:
                    self.log.info("file %s is already present at all targets" % opFile.LFN)
                    opFile.Status = "Done"
                else:
                    toSchedule[opFile.LFN] = [opFile, validReplicas, validTargets]
            else:
                gMonitor.addMark("FTSScheduleFail")
                if noMetaReplicas:
                    err = "Couldn't get metadata"
                    errors[err] += 1
                    self.log.verbose("unable to schedule '%s', %s at %s" %
                                     (opFile.LFN, err, ','.join(noMetaReplicas)))
                    opFile.Error = err
                elif noReplicas:
                    err = "File doesn't exist"
                    errors[err] += 1
                    self.log.error("Unable to schedule transfer",
                                   "%s %s at %s" % (opFile.LFN, err, ','.join(noReplicas)))
                    opFile.Error = err
                    opFile.Status = 'Failed'
                elif badReplicas:
                    err = "All replicas have a bad checksum"
                    errors[err] += 1
                    self.log.error("Unable to schedule transfer",
                                   "%s, %s at %s" % (opFile.LFN, err, ','.join(badReplicas)))
                    opFile.Error = err
                    opFile.Status = 'Failed'
                elif noActiveReplicas:
                    err = "No active replica found"
                    errors[err] += 1
                    self.log.verbose("Unable to schedule transfer",
                                     "%s, %s at %s" % (opFile.LFN, err, ','.join(noActiveReplicas)))
                    opFile.Error = err
                    # All source SEs are banned, delay execution by 1 hour
                    delayExecution = 60

        if delayExecution:
            self.log.info("Delay execution of the request by %d minutes" % delayExecution)
            self.request.delayNextExecution(delayExecution)
        # Log error counts
        for error, count in errors.iteritems():
            self.log.error(error, 'for %d files' % count)

        filesToScheduleList = []
        res = self._addMetadataToFiles(toSchedule)
        if not res['OK']:
            return res
        else:
            filesToSchedule = res['Value']

            for lfn in filesToSchedule:
                # NOTE(review): _addMetadataToFiles() maps lfn -> opFile, yet
                # here [0] is applied before toJSON(), while fts3Transfer()
                # uses filesToSchedule[lfn] directly — verify which contract
                # is intended.
                filesToScheduleList.append((filesToSchedule[lfn][0].toJSON()['Value'],
                                            toSchedule[lfn][1],
                                            toSchedule[lfn][2]))

        if filesToScheduleList:

            ftsSchedule = FTSClient().ftsSchedule(self.request.RequestID,
                                                  self.operation.OperationID,
                                                  filesToScheduleList)
            if not ftsSchedule["OK"]:
                self.log.error("Completely failed to schedule to FTS:", ftsSchedule["Message"])
                return ftsSchedule

            # might have nothing to schedule
            ftsSchedule = ftsSchedule["Value"]
            if not ftsSchedule:
                return S_OK()

            self.log.info("%d files have been scheduled to FTS" % len(ftsSchedule['Successful']))
            for opFile in self.operation:
                fileID = opFile.FileID
                if fileID in ftsSchedule["Successful"]:
                    gMonitor.addMark("FTSScheduleOK", 1)
                    opFile.Status = "Scheduled"
                    self.log.debug("%s has been scheduled for FTS" % opFile.LFN)
                elif fileID in ftsSchedule["Failed"]:
                    gMonitor.addMark("FTSScheduleFail", 1)
                    opFile.Error = ftsSchedule["Failed"][fileID]
                    if 'sourceSURL equals to targetSURL' in opFile.Error:
                        # In this case there is no need to continue
                        opFile.Status = 'Failed'
                    self.log.warn("unable to schedule %s for FTS: %s" % (opFile.LFN, opFile.Error))
        else:
            self.log.info("No files to schedule after metadata checks")

        # Just in case some transfers could not be scheduled, try them with RM
        return self.dmTransfer(fromFTS=True)

    def _checkExistingFTS3Operations(self):
        """
        Check if there are ongoing FTS3Operation for the current RMS Operation

        Under some conditions, we can be trying to schedule files while
        there is still an FTS transfer going on. This typically happens
        when the REA hangs. To prevent further race condition, we check
        if there are FTS3Operations in a non Final state matching the
        current operation ID. If so, we put the corresponding files in
        scheduled mode. We will then wait till the FTS3 Operation performs
        the callback

        :returns: S_OK with True if we can go on, False if we should stop the processing
        """

        res = FTS3Client().getOperationsFromRMSOpID(self.operation.OperationID)

        if not res['OK']:
            self.log.debug("Could not get FTS3Operations matching OperationID",
                           self.operation.OperationID)
            return res

        existingFTSOperations = res['Value']
        # It is ok to have FTS Operations in a final state, so we
        # care only about the others
        unfinishedFTSOperations = [ops for ops in existingFTSOperations
                                   if ops.status not in FTS3TransferOperation.FINAL_STATES]

        if not unfinishedFTSOperations:
            self.log.debug("No ongoing FTS3Operations, all good")
            return S_OK(True)

        self.log.warn("Some FTS3Operations already exist for the RMS Operation:",
                      [op.operationID for op in unfinishedFTSOperations])

        # This would really be a screwed up situation !
        if len(unfinishedFTSOperations) > 1:
            self.log.warn("That's a serious problem !!")

        # We take the rmsFileID of the files in the Operations,
        # find the corresponding File object, and set them scheduled
        rmsFileIDsToSetScheduled = set([ftsFile.rmsFileID
                                        for ftsOp in unfinishedFTSOperations
                                        for ftsFile in ftsOp.ftsFiles])

        for opFile in self.operation:
            # If it is in the DB, it has a FileID
            opFileID = opFile.FileID
            if opFileID in rmsFileIDsToSetScheduled:
                self.log.warn("Setting RMSFile as already scheduled", opFileID)
                opFile.Status = "Scheduled"

        # We return here such that the Request is set back to Scheduled in the DB
        # With no further modification
        return S_OK(False)

    def fts3Transfer(self):
        """ replicate and register using FTS3

        Same flow as ftsTransfer() but persists an FTS3TransferOperation
        through FTS3Client; bails out early when unfinished FTS3 operations
        already cover this RMS Operation.
        """

        self.log.info("scheduling files in FTS3...")

        # Check first if we do not have ongoing transfers
        res = self._checkExistingFTS3Operations()
        if not res['OK']:
            return res

        # if res['Value'] is False
        # it means that there are ongoing transfers
        # and we should stop here
        if res['Value'] is False:
            # return S_OK such that the request is put back
            return S_OK()

        fts3Files = []
        toSchedule = {}

        # Dict which maps the FileID to the object
        rmsFilesIds = {}

        for opFile in self.getWaitingFilesList():
            rmsFilesIds[opFile.FileID] = opFile

            opFile.Error = ''
            gMonitor.addMark("FTSScheduleAtt")
            # # check replicas
            replicas = self._filterReplicas(opFile)
            if not replicas["OK"]:
                continue
            replicas = replicas["Value"]

            validReplicas = replicas["Valid"]
            noMetaReplicas = replicas["NoMetadata"]
            noReplicas = replicas['NoReplicas']
            badReplicas = replicas['Bad']
            noPFN = replicas['NoPFN']

            if validReplicas:
                validTargets = list(set(self.operation.targetSEList) - set(validReplicas))
                if not validTargets:
                    self.log.info("file %s is already present at all targets" % opFile.LFN)
                    opFile.Status = "Done"
                else:
                    toSchedule[opFile.LFN] = [opFile, validTargets]
            else:
                gMonitor.addMark("FTSScheduleFail")
                if noMetaReplicas:
                    self.log.warn("unable to schedule '%s', couldn't get metadata at %s" %
                                  (opFile.LFN, ','.join(noMetaReplicas)))
                    opFile.Error = "Couldn't get metadata"
                elif noReplicas:
                    self.log.error("Unable to schedule transfer",
                                   "File %s doesn't exist at %s" % (opFile.LFN, ','.join(noReplicas)))
                    opFile.Error = 'No replicas found'
                    opFile.Status = 'Failed'
                elif badReplicas:
                    self.log.error("Unable to schedule transfer",
                                   "File %s, all replicas have a bad checksum at %s" %
                                   (opFile.LFN, ','.join(badReplicas)))
                    opFile.Error = 'All replicas have a bad checksum'
                    opFile.Status = 'Failed'
                elif noPFN:
                    self.log.warn("unable to schedule %s, could not get a PFN at %s" %
                                  (opFile.LFN, ','.join(noPFN)))

        res = self._addMetadataToFiles(toSchedule)
        if not res['OK']:
            return res
        else:
            filesToSchedule = res['Value']

            for lfn in filesToSchedule:
                opFile = filesToSchedule[lfn]
                validTargets = toSchedule[lfn][1]
                # One FTS3File per (file, target SE) pair.
                for targetSE in validTargets:
                    ftsFile = FTS3File.fromRMSFile(opFile, targetSE)
                    fts3Files.append(ftsFile)

        if fts3Files:
            res = Registry.getUsernameForDN(self.request.OwnerDN)
            if not res['OK']:
                self.log.error("Cannot get username for DN",
                               "%s %s" % (self.request.OwnerDN, res['Message']))
                return res

            username = res['Value']
            fts3Operation = FTS3TransferOperation.fromRMSObjects(self.request, self.operation, username)
            fts3Operation.ftsFiles = fts3Files

            ftsSchedule = FTS3Client().persistOperation(fts3Operation)
            if not ftsSchedule["OK"]:
                self.log.error("Completely failed to schedule to FTS3:", ftsSchedule["Message"])
                return ftsSchedule

            # might have nothing to schedule
            ftsSchedule = ftsSchedule["Value"]
            self.log.info("Scheduled with FTS3Operation id %s" % ftsSchedule)

            self.log.info("%d files have been scheduled to FTS3" % len(fts3Files))

            for ftsFile in fts3Files:
                opFile = rmsFilesIds[ftsFile.rmsFileID]
                gMonitor.addMark("FTSScheduleOK", 1)
                opFile.Status = "Scheduled"
                self.log.debug("%s has been scheduled for FTS" % opFile.LFN)
        else:
            self.log.info("No files to schedule after metadata checks")

        # Just in case some transfers could not be scheduled, try them with RM
        return self.dmTransfer(fromFTS=True)

    def dmTransfer(self, fromFTS=False):
        """ replicate and register using dataManager

        :param bool fromFTS: True when called as a fallback after FTS
            scheduling, only affects logging
        """
        # # get waiting files. If none just return
        # # source SE
        sourceSE = self.operation.SourceSE if self.operation.SourceSE else None
        if sourceSE:
            # # check source se for read
            bannedSource = self.checkSEsRSS(sourceSE, 'ReadAccess')
            if not bannedSource["OK"]:
                gMonitor.addMark("ReplicateAndRegisterAtt", len(self.operation))
                gMonitor.addMark("ReplicateFail", len(self.operation))
                return bannedSource

            if bannedSource["Value"]:
                self.operation.Error = "SourceSE %s is banned for reading" % sourceSE
                self.log.info(self.operation.Error)
                return S_OK(self.operation.Error)

        # # check targetSEs for write
        bannedTargets = self.checkSEsRSS()
        if not bannedTargets['OK']:
            gMonitor.addMark("ReplicateAndRegisterAtt", len(self.operation))
            gMonitor.addMark("ReplicateFail", len(self.operation))
            return bannedTargets

        if bannedTargets['Value']:
            self.operation.Error = "%s targets are banned for writing" % ",".join(bannedTargets['Value'])
            return S_OK(self.operation.Error)

        # Can continue now
        self.log.verbose("No targets banned for writing")

        waitingFiles = self.getWaitingFilesList()
        if not waitingFiles:
            return S_OK()
        # # loop over files
        if fromFTS:
            self.log.info("Trying transfer using replica manager as FTS failed")
        else:
            self.log.info("Transferring files using Data manager...")
        errors = defaultdict(int)
        delayExecution = 0

        for opFile in waitingFiles:
            # NOTE(review): this only counts files already in error; the file
            # is still processed below — confirm that is intended.
            if opFile.Error in ("Couldn't get metadata",
                                "File doesn't exist",
                                'No active replica found',
                                "All replicas have a bad checksum",):
                err = "File already in error status"
                errors[err] += 1

            gMonitor.addMark("ReplicateAndRegisterAtt", 1)
            opFile.Error = ''
            lfn = opFile.LFN

            # Check if replica is at the specified source
            replicas = self._filterReplicas(opFile)
            if not replicas["OK"]:
                self.log.error('Failed to check replicas', replicas["Message"])
                continue
            replicas = replicas["Value"]
            validReplicas = replicas.get("Valid")
            noMetaReplicas = replicas.get("NoMetadata")
            noReplicas = replicas.get('NoReplicas')
            badReplicas = replicas.get('Bad')
            noActiveReplicas = replicas.get('NoActiveReplicas')

            if not validReplicas:
                gMonitor.addMark("ReplicateFail")
                if noMetaReplicas:
                    err = "Couldn't get metadata"
                    errors[err] += 1
                    self.log.verbose("unable to replicate '%s', couldn't get metadata at %s" %
                                     (opFile.LFN, ','.join(noMetaReplicas)))
                    opFile.Error = err
                elif noReplicas:
                    err = "File doesn't exist"
                    errors[err] += 1
                    self.log.verbose("Unable to replicate",
                                     "File %s doesn't exist at %s" % (opFile.LFN, ','.join(noReplicas)))
                    opFile.Error = err
                    opFile.Status = 'Failed'
                elif badReplicas:
                    err = "All replicas have a bad checksum"
                    errors[err] += 1
                    self.log.error("Unable to replicate",
                                   "%s, all replicas have a bad checksum at %s" %
                                   (opFile.LFN, ','.join(badReplicas)))
                    opFile.Error = err
                    opFile.Status = 'Failed'
                elif noActiveReplicas:
                    err = "No active replica found"
                    errors[err] += 1
                    self.log.verbose("Unable to schedule transfer",
                                     "%s, %s at %s" % (opFile.LFN, err, ','.join(noActiveReplicas)))
                    opFile.Error = err
                    # All source SEs are banned, delay execution by 1 hour
                    delayExecution = 60
                continue
            # # get the first one in the list
            if sourceSE not in validReplicas:
                if sourceSE:
                    err = "File not at specified source"
                    errors[err] += 1
                    self.log.warn("%s is not at specified sourceSE %s, changed to %s" %
                                  (lfn, sourceSE, validReplicas[0]))
                sourceSE = validReplicas[0]

            # # loop over targetSE
            catalogs = self.operation.Catalog
            if catalogs:
                catalogs = [cat.strip() for cat in catalogs.split(',')]

            for targetSE in self.operation.targetSEList:

                # # call DataManager
                if targetSE in validReplicas:
                    self.log.warn("Request to replicate %s to an existing location: %s" % (lfn, targetSE))
                    continue
                res = self.dm.replicateAndRegister(lfn, targetSE, sourceSE=sourceSE, catalog=catalogs)
                if res["OK"]:

                    if lfn in res["Value"]["Successful"]:

                        if "replicate" in res["Value"]["Successful"][lfn]:

                            repTime = res["Value"]["Successful"][lfn]["replicate"]
                            prString = "file %s replicated at %s in %s s." % (lfn, targetSE, repTime)

                            gMonitor.addMark("ReplicateOK", 1)

                            if "register" in res["Value"]["Successful"][lfn]:

                                gMonitor.addMark("RegisterOK", 1)
                                regTime = res["Value"]["Successful"][lfn]["register"]
                                prString += ' and registered in %s s.' % regTime
                                self.log.info(prString)
                            else:

                                gMonitor.addMark("RegisterFail", 1)
                                prString += " but failed to register"
                                self.log.warn(prString)

                                opFile.Error = "Failed to register"
                                # # add register replica operation
                                registerOperation = self.getRegisterOperation(opFile, targetSE, type='RegisterReplica')
                                self.request.insertAfter(registerOperation, self.operation)
                        else:
                            self.log.error("Failed to replicate", "%s to %s" % (lfn, targetSE))
                            gMonitor.addMark("ReplicateFail", 1)
                            opFile.Error = "Failed to replicate"
                    else:
                        gMonitor.addMark("ReplicateFail", 1)
                        reason = res["Value"]["Failed"][lfn]
                        self.log.error("Failed to replicate and register",
                                       "File %s at %s:" % (lfn, targetSE), reason)
                        opFile.Error = reason
                else:
                    gMonitor.addMark("ReplicateFail", 1)
                    opFile.Error = "DataManager error: %s" % res["Message"]
                    self.log.error("DataManager error", res["Message"])

            if not opFile.Error:
                if len(self.operation.targetSEList) > 1:
                    self.log.info("file %s has been replicated to all targetSEs" % lfn)
                opFile.Status = "Done"

        # Log error counts
        if delayExecution:
            self.log.info("Delay execution of the request by %d minutes" % delayExecution)
            self.request.delayNextExecution(delayExecution)
        for error, count in errors.iteritems():
            self.log.error(error, 'for %d files' % count)

        return S_OK()
def main():
    """Build and submit RemoveReplica (or RemoveFile for targetSE 'All')
    requests for the given LFNs, in chunks of 100 per request.

    Positional arguments: targetSE followed by LFNs and/or files listing LFNs.
    """
    Script.parseCommandLine(ignoreErrors=False)
    args = Script.getPositionalArgs()
    if len(args) < 2:
        Script.showHelp()
    targetSE = args.pop(0)
    lfns = []
    for inputFileName in args:
        if os.path.exists(inputFileName):
            # FIX: use a context manager so the file is closed even if read() raises
            with open(inputFileName, 'r') as inputFile:
                string = inputFile.read()
            lfns.extend([lfn.strip() for lfn in string.splitlines()])
        else:
            # Not a local file: treat the argument as a single LFN
            lfns.append(inputFileName)

    from DIRAC.Resources.Storage.StorageElement import StorageElement
    import DIRAC
    # Check is provided SE is OK
    if targetSE != 'All':
        se = StorageElement(targetSE)
        if not se.valid:
            print(se.errorReason)
            print()
            Script.showHelp()

    from DIRAC.RequestManagementSystem.Client.Request import Request
    from DIRAC.RequestManagementSystem.Client.Operation import Operation
    from DIRAC.RequestManagementSystem.Client.File import File
    from DIRAC.RequestManagementSystem.Client.ReqClient import ReqClient
    from DIRAC.RequestManagementSystem.private.RequestValidator import RequestValidator
    from DIRAC.Resources.Catalog.FileCatalog import FileCatalog

    reqClient = ReqClient()
    fc = FileCatalog()

    requestOperation = 'RemoveReplica'
    if targetSE == 'All':
        requestOperation = 'RemoveFile'

    # One request per chunk of 100 LFNs
    for lfnList in breakListIntoChunks(lfns, 100):

        oRequest = Request()
        # Pseudo-unique name from two time-based md5 digests
        requestName = "%s_%s" % (
            md5(repr(time.time()).encode()).hexdigest()[:16],
            md5(repr(time.time()).encode()).hexdigest()[:16],
        )
        oRequest.RequestName = requestName

        oOperation = Operation()
        oOperation.Type = requestOperation
        oOperation.TargetSE = targetSE

        res = fc.getFileMetadata(lfnList)
        if not res['OK']:
            print("Can't get file metadata: %s" % res['Message'])
            DIRAC.exit(1)
        if res['Value']['Failed']:
            print("Could not get the file metadata of the following, so skipping them:")
            for fFile in res['Value']['Failed']:
                print(fFile)

        lfnMetadata = res['Value']['Successful']

        for lfn in lfnMetadata:
            rarFile = File()
            rarFile.LFN = lfn
            rarFile.Size = lfnMetadata[lfn]['Size']
            rarFile.Checksum = lfnMetadata[lfn]['Checksum']
            rarFile.GUID = lfnMetadata[lfn]['GUID']
            rarFile.ChecksumType = 'ADLER32'
            oOperation.addFile(rarFile)

        oRequest.addOperation(oOperation)
        isValid = RequestValidator().validate(oRequest)
        if not isValid['OK']:
            print("Request is not valid: ", isValid['Message'])
            DIRAC.exit(1)

        result = reqClient.putRequest(oRequest)
        if result['OK']:
            print('Request %d Submitted' % result['Value'])
        else:
            print('Failed to submit Request: ', result['Message'])
def main():
    """Submit RemoveReplica requests (RemoveFile when the SE argument is
    'All') for the given LFNs, batching 100 LFNs per request."""
    # Registering arguments will automatically add their description to the help menu
    Script.registerArgument(" SE: StorageElement|All")
    Script.registerArgument(["LFN: LFN or file containing a List of LFNs"])
    # parseCommandLine shows the help when mandatory arguments are missing or invalid
    Script.parseCommandLine(ignoreErrors=False)

    args = Script.getPositionalArgs()
    targetSE = args.pop(0)

    # Remaining arguments are LFNs, or local files holding one LFN per line.
    lfns = []
    for arg in args:
        if not os.path.exists(arg):
            lfns.append(arg)
        else:
            with open(arg, "r") as listFile:
                contents = listFile.read()
            lfns.extend(line.strip() for line in contents.splitlines())

    from DIRAC.Resources.Storage.StorageElement import StorageElement
    import DIRAC

    # Validate the storage element unless removing from everywhere.
    if targetSE != "All":
        se = StorageElement(targetSE)
        if not se.valid:
            print(se.errorReason)
            print()
            Script.showHelp()

    from DIRAC.RequestManagementSystem.Client.Request import Request
    from DIRAC.RequestManagementSystem.Client.Operation import Operation
    from DIRAC.RequestManagementSystem.Client.File import File
    from DIRAC.RequestManagementSystem.Client.ReqClient import ReqClient
    from DIRAC.RequestManagementSystem.private.RequestValidator import RequestValidator
    from DIRAC.Resources.Catalog.FileCatalog import FileCatalog

    reqClient = ReqClient()
    fc = FileCatalog()
    requestOperation = "RemoveFile" if targetSE == "All" else "RemoveReplica"

    for chunk in breakListIntoChunks(lfns, 100):
        oRequest = Request()
        # Pseudo-unique request name built from two time-based digests.
        oRequest.RequestName = "%s_%s" % (
            md5(repr(time.time()).encode()).hexdigest()[:16],
            md5(repr(time.time()).encode()).hexdigest()[:16],
        )

        oOperation = Operation()
        oOperation.Type = requestOperation
        oOperation.TargetSE = targetSE

        res = fc.getFileMetadata(chunk)
        if not res["OK"]:
            print("Can't get file metadata: %s" % res["Message"])
            DIRAC.exit(1)
        if res["Value"]["Failed"]:
            print("Could not get the file metadata of the following, so skipping them:")
            for failedLFN in res["Value"]["Failed"]:
                print(failedLFN)

        for lfn, meta in res["Value"]["Successful"].items():
            rarFile = File()
            rarFile.LFN = lfn
            rarFile.Size = meta["Size"]
            rarFile.Checksum = meta["Checksum"]
            rarFile.GUID = meta["GUID"]
            rarFile.ChecksumType = "ADLER32"
            oOperation.addFile(rarFile)

        oRequest.addOperation(oOperation)

        isValid = RequestValidator().validate(oRequest)
        if not isValid["OK"]:
            print("Request is not valid: ", isValid["Message"])
            DIRAC.exit(1)

        result = reqClient.putRequest(oRequest)
        if result["OK"]:
            print("Request %d Submitted" % result["Value"])
        else:
            print("Failed to submit Request: ", result["Message"])
class FTSRequest( object ):
  """
  .. class:: FTSRequest

  Helper class for FTS job submission and monitoring.

  Collects LFNs together with their source/target SURLs in :fileDict:,
  writes them to a SURL pair file and drives the glite-transfer-* CLI
  tools to submit the transfer job.
  """

  # # default checksum type
  __defaultCksmType = "ADLER32"
  # # flag to disable/enable checksum test, default: disabled
  __cksmTest = False

  def __init__( self ):
    """c'tor

    :param self: self reference
    """
    self.log = gLogger.getSubLogger( self.__class__.__name__, True )
    # # final states tuple
    self.finalStates = ( 'Canceled', 'Failed', 'Hold', 'Finished', 'FinishedDirty' )
    # # failed states tuple
    self.failedStates = ( 'Canceled', 'Failed', 'Hold', 'FinishedDirty' )
    # # successful states tuple
    self.successfulStates = ( 'Finished', 'Done' )
    # # all file states tuple
    self.fileStates = ( 'Done', 'Active', 'Pending', 'Ready', 'Canceled', 'Failed',
                        'Finishing', 'Finished', 'Submitted', 'Hold', 'Waiting' )

    self.newlyCompletedFiles = []
    self.newlyFailedFiles = []
    self.statusSummary = {}
    # # request status
    self.requestStatus = 'Unknown'
    # # dict for FTS job files
    self.fileDict = {}
    # # dict for replicas information
    self.catalogReplicas = {}
    # # dict for metadata information
    self.catalogMetadata = {}
    # # dict for files that failed to register
    self.failedRegistrations = {}
    # # placeholder for FileCatalog reference
    self.oCatalog = None
    # # submit timestamp
    self.submitTime = ''
    # # placeholder FTS job GUID
    self.ftsGUID = ''
    # # placeholder for FTS server URL
    self.ftsServer = ''
    # # not used
    self.priority = 3
    # # flag marking FTS job completeness
    self.isTerminal = False
    # # completeness percentage
    self.percentageComplete = 0.0
    # # source SE name
    self.sourceSE = ''
    # # flag marking source SE validity
    self.sourceValid = False
    # # source space token
    self.sourceToken = ''
    # # target SE name
    self.targetSE = ''
    # # flag marking target SE validity
    self.targetValid = False
    # # target space token
    self.targetToken = ''
    # # whatever
    self.dumpStr = ''
    # # placeholder for surl file
    self.surlFile = None
    # # placeholder for target StorageElement
    self.oTargetSE = None
    # # placeholder for source StorageElement
    self.oSourceSE = None
    # # checksum type, set it to default
    self.__cksmType = self.__defaultCksmType
    # # disable checksum test by default
    self.__cksmTest = False
    # # statuses that prevent submitting to FTS
    self.noSubmitStatus = ( 'Failed', 'Done', 'Staging' )
    # # were sources resolved?
    self.sourceResolved = False
    # # Number of file transfers actually submitted
    self.submittedFiles = 0

  ####################################################################
  #
  #  Methods for setting/getting/checking the SEs
  #

  def setSourceSE( self, se ):
    """ set SE for source

    :param self: self reference
    :param str se: source SE name
    """
    if se == self.targetSE:
      return S_ERROR( "SourceSE is TargetSE" )
    self.sourceSE = se
    self.oSourceSE = StorageElement( self.sourceSE )
    return self.__checkSourceSE()

  def getSourceSE( self ):
    """ source SE getter

    :param self: self reference
    """
    if not self.sourceSE:
      return S_ERROR( "Source SE not defined" )
    return S_OK( self.sourceSE )

  def setSourceToken( self, token ):
    """ set source space token

    :param self: self reference
    :param str token: source space token
    """
    self.sourceToken = token
    return S_OK()

  def getSourceToken( self ):
    """ source space token getter

    :param self: self reference
    """
    if not self.sourceToken:
      return S_ERROR( "Source token not defined" )
    return S_OK( self.sourceToken )

  def __checkSourceSE( self ):
    """ check source SE availability and cache its space token

    On success sets self.sourceToken and self.sourceValid; may disable
    the checksum test when the SE cannot support it.

    :param self: self reference
    """
    if not self.sourceSE:
      return S_ERROR( "SourceSE not set" )
    res = self.oSourceSE.isValid( 'Read' )
    if not res['OK']:
      return S_ERROR( "SourceSE not available for reading" )
    res = self.__getSESpaceToken( self.oSourceSE )
    if not res['OK']:
      self.log.error( "FTSRequest failed to get SRM Space Token for SourceSE", res['Message'] )
      return S_ERROR( "SourceSE does not support FTS transfers" )
    # Keep the space token now: previously `res` was clobbered by the
    # getChecksumType() call below, so sourceToken could end up holding
    # the checksum type instead of the space token.
    self.sourceToken = res['Value']
    if self.__cksmTest:
      res = self.oSourceSE.getChecksumType()
      if not res["OK"]:
        self.log.error( "Unable to get checksum type for SourceSE %s: %s" % ( self.sourceSE,
                                                                              res["Message"] ) )
        # without a known checksum type the test cannot be performed
        self.__cksmTest = False
      else:
        cksmType = res["Value"]
        if cksmType in ( "NONE", "NULL" ):
          self.log.warn( "Checksum type set to %s at SourceSE %s, disabling checksum test" % ( cksmType,
                                                                                              self.sourceSE ) )
          self.__cksmTest = False
        elif cksmType != self.__cksmType:
          self.log.warn( "Checksum type mismatch, disabling checksum test" )
          self.__cksmTest = False
    self.sourceValid = True
    return S_OK()

  def setTargetSE( self, se ):
    """ set target SE

    :param self: self reference
    :param str se: target SE name
    """
    if se == self.sourceSE:
      return S_ERROR( "TargetSE is SourceSE" )
    self.targetSE = se
    self.oTargetSE = StorageElement( self.targetSE )
    return self.__checkTargetSE()

  def getTargetSE( self ):
    """ target SE getter

    :param self: self reference
    """
    if not self.targetSE:
      return S_ERROR( "Target SE not defined" )
    return S_OK( self.targetSE )

  def setTargetToken( self, token ):
    """ target space token setter

    :param self: self reference
    :param str token: target space token
    """
    self.targetToken = token
    return S_OK()

  def getTargetToken( self ):
    """ target space token getter

    :param self: self reference
    """
    if not self.targetToken:
      return S_ERROR( "Target token not defined" )
    return S_OK( self.targetToken )

  def __checkTargetSE( self ):
    """ check target SE availability and cache its space token

    On success sets self.targetToken and self.targetValid; may disable
    the checksum test when the SE cannot support it.

    :param self: self reference
    """
    if not self.targetSE:
      return S_ERROR( "TargetSE not set" )
    res = self.oTargetSE.isValid( 'Write' )
    if not res['OK']:
      return S_ERROR( "TargetSE not available for writing" )
    res = self.__getSESpaceToken( self.oTargetSE )
    if not res['OK']:
      self.log.error( "FTSRequest failed to get SRM Space Token for TargetSE", res['Message'] )
      return S_ERROR( "TargetSE does not support FTS transfers" )
    # Keep the space token before `res` is reused for the checksum query
    # (same clobbering bug as in __checkSourceSE).
    self.targetToken = res['Value']
    # # check checksum types
    if self.__cksmTest:
      res = self.oTargetSE.getChecksumType()
      if not res["OK"]:
        self.log.error( "Unable to get checksum type for TargetSE %s: %s" % ( self.targetSE,
                                                                              res["Message"] ) )
        self.__cksmTest = False
      else:
        cksmType = res["Value"]
        if cksmType in ( "NONE", "NULL" ):
          self.log.warn( "Checksum type set to %s at TargetSE %s, disabling checksum test" % ( cksmType,
                                                                                              self.targetSE ) )
          self.__cksmTest = False
        elif cksmType != self.__cksmType:
          self.log.warn( "Checksum type mismatch, disabling checksum test" )
          self.__cksmTest = False
    self.targetValid = True
    return S_OK()

  @staticmethod
  def __getSESpaceToken( oSE ):
    """ get space token from StorageElement instance

    :param StorageElement oSE: StorageElement instance
    """
    res = oSE.getStorageParameters( "SRM2" )
    if not res['OK']:
      return res
    return S_OK( res['Value'].get( 'SpaceToken' ) )

  ####################################################################
  #
  #  Methods for setting/getting FTS request parameters
  #

  def setFTSGUID( self, guid ):
    """ FTS job GUID setter

    :param self: self reference
    :param str guid: string containing GUID
    """
    if not checkGuid( guid ):
      return S_ERROR( "Incorrect GUID format" )
    self.ftsGUID = guid
    return S_OK()

  def getFTSGUID( self ):
    """ FTS job GUID getter

    :param self: self reference
    """
    if not self.ftsGUID:
      return S_ERROR( "FTSGUID not set" )
    return S_OK( self.ftsGUID )

  def setFTSServer( self, server ):
    """ FTS server setter

    :param self: self reference
    :param str server: FTS server URL
    """
    self.ftsServer = server
    return S_OK()

  def getFTSServer( self ):
    """ FTS server getter

    :param self: self reference
    """
    if not self.ftsServer:
      return S_ERROR( "FTSServer not set" )
    return S_OK( self.ftsServer )

  def setPriority( self, priority ):
    """ set priority for FTS job, clipped to the [0, 5] range

    :param self: self reference
    :param int priority: a new priority
    """
    if not isinstance( priority, ( IntType, LongType ) ):
      return S_ERROR( "Priority must be integer" )
    self.priority = max( 0, min( priority, 5 ) )
    return S_OK( self.priority )

  def getPriority( self ):
    """ FTS job priority getter

    :param self: self reference
    """
    return S_OK( self.priority )

  def getPercentageComplete( self ):
    """ get completeness percentage

    Guards against an empty status summary, which previously caused a
    ZeroDivisionError.

    :param self: self reference
    """
    completedFiles = 0
    totalFiles = 0
    for state in self.statusSummary:
      if state in self.successfulStates:
        completedFiles += self.statusSummary[state]
      totalFiles += self.statusSummary[state]
    if totalFiles:
      self.percentageComplete = ( float( completedFiles ) * 100.0 ) / float( totalFiles )
    return S_OK( self.percentageComplete )

  def isRequestTerminal( self ):
    """ check if FTS job has terminated

    :param self: self reference
    """
    if self.requestStatus in self.finalStates:
      self.isTerminal = True
    return S_OK( self.isTerminal )

  def getStatus( self ):
    """ get FTS job status

    :param self: self reference
    """
    return S_OK( self.requestStatus )

  def setCksmType( self, cksm = None ):
    """ set checksum type to use

    :param self: self reference
    :param mixed cksm: checksum type, should be one of 'Adler32', 'md5', 'sha1', None
    """
    if str( cksm ).upper() not in ( "ADLER32", "MD5", "SHA1", "NONE" ):
      return S_ERROR( "Not supported checksum type: %s" % str( cksm ) )
    if not cksm:
      self.__cksmType = None
      return S_OK( False )
    self.__cksmType = str( cksm ).upper()
    return S_OK( True )

  def getCksmType( self ):
    """ get checksum type

    :param self: self reference
    """
    return S_OK( self.__cksmType )

  def setCksmTest( self, cksmTest = False ):
    """ set cksm test

    :param self: self reference
    :param bool cksmTest: flag to enable/disable checksum test
    """
    self.__cksmTest = bool( cksmTest )
    return S_OK( self.__cksmTest )

  def getCksmTest( self ):
    """ get cksm test flag

    :param self: self reference
    """
    return S_OK( self.__cksmTest )

  ####################################################################
  #
  #  Methods for setting/getting/checking files and their metadata
  #

  def setLFN( self, lfn ):
    """ add LFN :lfn: to :fileDict:

    :param self: self reference
    :param str lfn: LFN to add to
    """
    self.fileDict.setdefault( lfn, { 'Status' : 'Waiting' } )
    return S_OK()

  def setStatus( self, lfn, status ):
    """ set status of a file """
    return( self.__setFileParameter( lfn, 'Status', status ) )

  def setSourceSURL( self, lfn, surl ):
    """ source SURL setter

    :param self: self reference
    :param str lfn: LFN
    :param str surl: source SURL
    """
    target = self.fileDict[lfn].get( 'Target' )
    if target == surl:
      return S_ERROR( "Source and target the same" )
    return( self.__setFileParameter( lfn, 'Source', surl ) )

  def getSourceSURL( self, lfn ):
    """ get source SURL for LFN :lfn:

    :param self: self reference
    :param str lfn: LFN
    """
    return self.__getFileParameter( lfn, 'Source' )

  def setTargetSURL( self, lfn, surl ):
    """ set target SURL for LFN :lfn:

    :param self: self reference
    :param str lfn: LFN
    :param str surl: target SURL
    """
    source = self.fileDict[lfn].get( 'Source' )
    if source == surl:
      return S_ERROR( "Source and target the same" )
    return( self.__setFileParameter( lfn, 'Target', surl ) )

  def getTargetSURL( self, lfn ):
    """ target SURL getter

    :param self: self reference
    :param str lfn: LFN
    """
    return self.__getFileParameter( lfn, 'Target' )

  def getFailReason( self, lfn ):
    """ get fail reason for file :lfn:

    :param self: self reference
    :param str lfn: LFN
    """
    return self.__getFileParameter( lfn, 'Reason' )

  def getRetries( self, lfn ):
    """ get number of attempts made to transfer file :lfn:

    :param self: self reference
    :param str lfn: LFN
    """
    return self.__getFileParameter( lfn, 'Retries' )

  def getTransferTime( self, lfn ):
    """ get duration of transfer for file :lfn:

    :param self: self reference
    :param str lfn: LFN
    """
    return self.__getFileParameter( lfn, 'Duration' )

  def getFailed( self ):
    """ get list of wrongly transferred LFNs

    :param self: self reference
    """
    return S_OK( [ lfn for lfn in self.fileDict
                   if self.fileDict[lfn].get( 'Status', '' ) in self.failedStates ] )

  def getStaging( self ):
    """ get files set for prestaging """
    return S_OK( [ lfn for lfn in self.fileDict
                   if self.fileDict[lfn].get( 'Status', '' ) == 'Staging' ] )

  def getDone( self ):
    """ get list of successfully transferred LFNs

    :param self: self reference
    """
    return S_OK( [ lfn for lfn in self.fileDict
                   if self.fileDict[lfn].get( 'Status', '' ) in self.successfulStates ] )

  def __setFileParameter( self, lfn, paramName, paramValue ):
    """ set :paramName: to :paramValue: for :lfn: file

    :param self: self reference
    :param str lfn: LFN
    :param str paramName: parameter name
    :param mixed paramValue: a new parameter value
    """
    self.setLFN( lfn )
    self.fileDict[lfn][paramName] = paramValue
    return S_OK()

  def __getFileParameter( self, lfn, paramName ):
    """ get value of :paramName: for file :lfn:

    :param self: self reference
    :param str lfn: LFN
    :param str paramName: parameter name
    """
    if lfn not in self.fileDict:
      return S_ERROR( "Supplied file not set" )
    if paramName not in self.fileDict[lfn]:
      return S_ERROR( "%s not set for file" % paramName )
    return S_OK( self.fileDict[lfn][paramName] )

  ####################################################################
  #
  #  Methods for submission
  #

  def submit( self, monitor = False, printOutput = True ):
    """ submit FTS job

    :param self: self reference
    :param bool monitor: flag to monitor progress of FTS job
    :param bool printOutput: flag to print output of execution to stdout
    """
    res = self.__isSubmissionValid()
    if not res['OK']:
      return res
    res = self.__createSURLPairFile()
    if not res['OK']:
      return res
    res = self.__submitFTSTransfer()
    if not res['OK']:
      return res
    resDict = { 'ftsGUID' : self.ftsGUID,
                'ftsServer' : self.ftsServer,
                'submittedFiles' : self.submittedFiles }
    if monitor or printOutput:
      gLogger.always( "Submitted %s@%s" % ( self.ftsGUID, self.ftsServer ) )
      if monitor:
        self.monitor( untilTerminal = True, printOutput = printOutput )
    return S_OK( resDict )

  def __isSubmissionValid( self ):
    """ check validity of job before submission

    :param self: self reference
    """
    if not self.fileDict:
      return S_ERROR( "No files set" )
    if not self.sourceValid:
      return S_ERROR( "SourceSE not valid" )
    if not self.targetValid:
      return S_ERROR( "TargetSE not valid" )
    if not self.ftsServer:
      # NOTE(review): __resolveFTSServer is defined elsewhere in this class
      # (outside this chunk) — confirm it exists before refactoring.
      res = self.__resolveFTSServer()
      if not res['OK']:
        return S_ERROR( "FTSServer not valid" )
    self.resolveSource()
    self.resolveTarget()
    res = self.__filesToSubmit()
    if not res['OK']:
      return S_ERROR( "No files to submit" )
    return S_OK()

  def __getCatalogObject( self ):
    """ CatalogInterface instance facade, created lazily

    :param self: self reference
    """
    try:
      if not self.oCatalog:
        self.oCatalog = FileCatalog()
      return S_OK()
    except Exception:
      # narrowed from a bare `except:` which also swallowed SystemExit/KeyboardInterrupt
      return S_ERROR()

  def __updateReplicaCache( self, lfns = None, overwrite = False ):
    """ update replica cache for list of :lfns:

    :param self: self reference
    :param mixed lfns: list of LFNs
    :param bool overwrite: flag to trigger cache clearing and updating
    """
    if not lfns:
      lfns = self.fileDict.keys()
    toUpdate = [ lfn for lfn in lfns if ( lfn not in self.catalogReplicas ) or overwrite ]
    if not toUpdate:
      return S_OK()
    res = self.__getCatalogObject()
    if not res['OK']:
      return res
    res = self.oCatalog.getReplicas( toUpdate )
    if not res['OK']:
      return S_ERROR( "Failed to update replica cache: %s" % res['Message'] )
    for lfn, error in res['Value']['Failed'].items():
      self.__setFileParameter( lfn, 'Reason', error )
      self.__setFileParameter( lfn, 'Status', 'Failed' )
    for lfn, replicas in res['Value']['Successful'].items():
      self.catalogReplicas[lfn] = replicas
    return S_OK()

  def __updateMetadataCache( self, lfns = None, overwrite = False ):
    """ update metadata cache for list of LFNs

    :param self: self reference
    :param list lfns: list of LFNs
    :param bool overwrite: flag to trigger cache clearing and updating
    """
    if not lfns:
      lfns = self.fileDict.keys()
    toUpdate = [ lfn for lfn in lfns if ( lfn not in self.catalogMetadata ) or overwrite ]
    if not toUpdate:
      return S_OK()
    res = self.__getCatalogObject()
    if not res['OK']:
      return res
    res = self.oCatalog.getFileMetadata( toUpdate )
    if not res['OK']:
      return S_ERROR( "Failed to get source catalog metadata: %s" % res['Message'] )
    for lfn, error in res['Value']['Failed'].items():
      self.__setFileParameter( lfn, 'Reason', error )
      self.__setFileParameter( lfn, 'Status', 'Failed' )
    for lfn, metadata in res['Value']['Successful'].items():
      self.catalogMetadata[lfn] = metadata
    return S_OK()

  def resolveSource( self ):
    """ resolve source SE eligible for submission

    Builds the source SURL for every non-failed file, verifies the source
    replica (existence, availability, size, checksum) and triggers
    prestaging for files not yet cached.

    :param self: self reference
    """
    # Avoid resolving sources twice
    if self.sourceResolved:
      return S_OK()
    # Only resolve files that need a transfer
    toResolve = [ lfn for lfn in self.fileDict if self.fileDict[lfn].get( "Status", "" ) != "Failed" ]
    if not toResolve:
      return S_OK()
    res = self.__updateMetadataCache( toResolve )
    if not res['OK']:
      return res
    res = self.__updateReplicaCache( toResolve )
    if not res['OK']:
      return res
    # Define the source URLs
    for lfn in toResolve:
      replicas = self.catalogReplicas.get( lfn, {} )
      if self.sourceSE not in replicas:
        gLogger.warn( "resolveSource: skipping %s - not replicas at SourceSE %s" % ( lfn, self.sourceSE ) )
        self.__setFileParameter( lfn, 'Reason', "No replica at SourceSE" )
        self.__setFileParameter( lfn, 'Status', 'Failed' )
        continue
      # Fix first the PFN
      pfn = self.oSourceSE.getPfnForLfn( lfn ).get( 'Value', {} ).get( 'Successful', {} ).get( lfn, replicas[self.sourceSE] )
      res = Utils.executeSingleFileOrDirWrapper( self.oSourceSE.getPfnForProtocol( pfn, protocol = 'SRM2', withPort = True ) )
      if not res['OK']:
        gLogger.warn( "resolveSource: skipping %s - %s" % ( lfn, res["Message"] ) )
        self.__setFileParameter( lfn, 'Reason', res['Message'] )
        self.__setFileParameter( lfn, 'Status', 'Failed' )
        continue
      res = self.setSourceSURL( lfn, res['Value'] )
      if not res['OK']:
        gLogger.warn( "resolveSource: skipping %s - %s" % ( lfn, res["Message"] ) )
        self.__setFileParameter( lfn, 'Reason', res['Message'] )
        self.__setFileParameter( lfn, 'Status', 'Failed' )
        continue
    # Map source SURL -> LFN for the files that got a source defined
    toResolve = {}
    for lfn in self.fileDict:
      if "Source" in self.fileDict[lfn]:
        toResolve[self.fileDict[lfn]['Source']] = lfn
    if not toResolve:
      return S_ERROR( "No eligible Source files" )
    # Get metadata of the sources, to check for existence, availability and caching
    res = self.oSourceSE.getFileMetadata( toResolve.keys() )
    if not res['OK']:
      return S_ERROR( "Failed to check source file metadata" )
    for pfn, error in res['Value']['Failed'].items():
      lfn = toResolve[pfn]
      if re.search( 'File does not exist', error ):
        gLogger.warn( "resolveSource: skipping %s - source file does not exists" % lfn )
        self.__setFileParameter( lfn, 'Reason', "Source file does not exist" )
        self.__setFileParameter( lfn, 'Status', 'Failed' )
      else:
        gLogger.warn( "resolveSource: skipping %s - failed to get source metadata" % lfn )
        self.__setFileParameter( lfn, 'Reason', "Failed to get Source metadata" )
        self.__setFileParameter( lfn, 'Status', 'Failed' )
    toStage = []
    nbStagedFiles = 0
    for pfn, metadata in res['Value']['Successful'].items():
      lfn = toResolve[pfn]
      lfnStatus = self.fileDict.get( lfn, {} ).get( 'Status' )
      if metadata['Unavailable']:
        gLogger.warn( "resolveSource: skipping %s - source file unavailable" % lfn )
        self.__setFileParameter( lfn, 'Reason', "Source file Unavailable" )
        self.__setFileParameter( lfn, 'Status', 'Failed' )
      elif metadata['Lost']:
        gLogger.warn( "resolveSource: skipping %s - source file lost" % lfn )
        self.__setFileParameter( lfn, 'Reason', "Source file Lost" )
        self.__setFileParameter( lfn, 'Status', 'Failed' )
      elif not metadata['Cached']:
        if lfnStatus != 'Staging':
          toStage.append( pfn )
      elif metadata['Size'] != self.catalogMetadata[lfn]['Size']:
        gLogger.warn( "resolveSource: skipping %s - source file size mismatch" % lfn )
        self.__setFileParameter( lfn, 'Reason', "Source size mismatch" )
        self.__setFileParameter( lfn, 'Status', 'Failed' )
      elif self.catalogMetadata[lfn]['Checksum'] and metadata['Checksum'] and \
            not ( compareAdler( metadata['Checksum'], self.catalogMetadata[lfn]['Checksum'] ) ):
        gLogger.warn( "resolveSource: skipping %s - source file checksum mismatch" % lfn )
        self.__setFileParameter( lfn, 'Reason', "Source checksum mismatch" )
        self.__setFileParameter( lfn, 'Status', 'Failed' )
      elif lfnStatus == 'Staging':
        # file that was staging is now cached
        self.__setFileParameter( lfn, 'Status', 'Waiting' )
        nbStagedFiles += 1
    # Some files were being staged
    if nbStagedFiles:
      self.log.info( 'resolveSource: %d files have been staged' % nbStagedFiles )
    # Launching staging of files not in cache
    if toStage:
      gLogger.warn( "resolveSource: %s source files not cached, prestaging..." % len( toStage ) )
      stage = self.oSourceSE.prestageFile( toStage )
      if not stage["OK"]:
        gLogger.error( "resolveSource: error is prestaging - %s" % stage["Message"] )
        for pfn in toStage:
          lfn = toResolve[pfn]
          self.__setFileParameter( lfn, 'Reason', stage["Message"] )
          self.__setFileParameter( lfn, 'Status', 'Failed' )
      else:
        for pfn in toStage:
          lfn = toResolve[pfn]
          if pfn in stage['Value']['Successful']:
            self.__setFileParameter( lfn, 'Status', 'Staging' )
          elif pfn in stage['Value']['Failed']:
            self.__setFileParameter( lfn, 'Reason', stage['Value']['Failed'][pfn] )
            self.__setFileParameter( lfn, 'Status', 'Failed' )
    self.sourceResolved = True
    return S_OK()

  def resolveTarget( self ):
    """ find target SE eligible for submission

    Builds target SURLs and removes pre-existing target files that do
    not clash with the source.

    :param self: self reference
    """
    toResolve = [ lfn for lfn in self.fileDict
                  if self.fileDict[lfn].get( 'Status' ) not in self.noSubmitStatus ]
    if not toResolve:
      return S_OK()
    res = self.__updateReplicaCache( toResolve )
    if not res['OK']:
      return res
    for lfn in toResolve:
      res = self.oTargetSE.getPfnForLfn( lfn )
      if not res['OK'] or lfn not in res['Value']['Successful']:
        gLogger.warn( "resolveTarget: skipping %s - failed to create target pfn" % lfn )
        self.__setFileParameter( lfn, 'Reason', "Failed to create Target" )
        self.__setFileParameter( lfn, 'Status', 'Failed' )
        continue
      pfn = res['Value']['Successful'][lfn]
      res = self.oTargetSE.getPfnForProtocol( pfn, protocol = 'SRM2', withPort = True )
      if not res['OK'] or pfn not in res['Value']['Successful']:
        reason = res.get( 'Message', res.get( 'Value', {} ).get( 'Failed', {} ).get( pfn ) )
        gLogger.warn( "resolveTarget: skipping %s - %s" % ( lfn, reason ) )
        self.__setFileParameter( lfn, 'Reason', reason )
        self.__setFileParameter( lfn, 'Status', 'Failed' )
        continue
      pfn = res['Value']['Successful'][pfn]
      res = self.setTargetSURL( lfn, pfn )
      if not res['OK']:
        gLogger.warn( "resolveTarget: skipping %s - %s" % ( lfn, res["Message"] ) )
        self.__setFileParameter( lfn, 'Reason', res['Message'] )
        self.__setFileParameter( lfn, 'Status', 'Failed' )
        continue
    # Map target SURL -> LFN for the files that got a target defined
    toResolve = {}
    for lfn in self.fileDict:
      if "Target" in self.fileDict[lfn]:
        toResolve[self.fileDict[lfn]['Target']] = lfn
    if not toResolve:
      return S_ERROR( "No eligible Target files" )
    res = self.oTargetSE.exists( toResolve.keys() )
    if not res['OK']:
      return S_ERROR( "Failed to check target existence" )
    for pfn, error in res['Value']['Failed'].items():
      lfn = toResolve[pfn]
      self.__setFileParameter( lfn, 'Reason', error )
      self.__setFileParameter( lfn, 'Status', 'Failed' )
    toRemove = []
    for pfn, exists in res['Value']['Successful'].items():
      if exists:
        lfn = toResolve[pfn]
        res = self.getSourceSURL( lfn )
        if not res['OK']:
          gLogger.warn( "resolveTarget: skipping %s - target exists" % lfn )
          self.__setFileParameter( lfn, 'Reason', "Target exists" )
          self.__setFileParameter( lfn, 'Status', 'Failed' )
        elif res['Value'] == pfn:
          gLogger.warn( "resolveTarget: skipping %s - source and target pfns are the same" % lfn )
          self.__setFileParameter( lfn, 'Reason', "Source and Target the same" )
          self.__setFileParameter( lfn, 'Status', 'Failed' )
        else:
          toRemove.append( pfn )
    if toRemove:
      self.oTargetSE.removeFile( toRemove )
    return S_OK()

  def __filesToSubmit( self ):
    """ check if there is at least one file to submit

    :return: S_OK if at least one file is present, S_ERROR otherwise
    """
    for lfn in self.fileDict:
      lfnStatus = self.fileDict[lfn].get( 'Status' )
      source = self.fileDict[lfn].get( 'Source' )
      target = self.fileDict[lfn].get( 'Target' )
      if lfnStatus not in self.noSubmitStatus and source and target:
        return S_OK()
    return S_ERROR()

  def __createSURLPairFile( self ):
    """ create LFNs file for glite-transfer-submit command

    This file consists of one line per file to be transferred:

    sourceSURL targetSURL [CHECKSUMTYPE:CHECKSUM]

    :param self: self reference
    """
    fd, fileName = tempfile.mkstemp()
    # `with` guarantees the descriptor is closed even if a write fails
    with os.fdopen( fd, 'w' ) as surlFile:
      for lfn in self.fileDict:
        lfnStatus = self.fileDict[lfn].get( 'Status' )
        source = self.fileDict[lfn].get( 'Source' )
        target = self.fileDict[lfn].get( 'Target' )
        if lfnStatus not in self.noSubmitStatus and source and target:
          cksmStr = ""
          # # add cksmType:cksm only if cksmType is specified, else let FTS decide by itself
          if self.__cksmTest and self.__cksmType:
            checkSum = self.catalogMetadata.get( lfn, {} ).get( 'Checksum' )
            if checkSum:
              cksmStr = " %s:%s" % ( self.__cksmType, intAdlerToHex( hexAdlerToInt( checkSum ) ) )
          surlFile.write( "%s %s%s\n" % ( source, target, cksmStr ) )
          self.submittedFiles += 1
    self.surlFile = fileName
    return S_OK()

  def __submitFTSTransfer( self ):
    """ create and execute glite-transfer-submit CLI command

    :param self: self reference
    """
    comm = [ 'glite-transfer-submit', '-s', self.ftsServer, '-f', self.surlFile, '-o' ]
    if self.targetToken:
      comm += [ '-t', self.targetToken ]
    if self.sourceToken:
      comm += [ '-S', self.sourceToken ]
    if self.__cksmTest:
      comm.append( "--compare-checksums" )
    gLogger.verbose( 'Executing %s' % ' '.join( comm ) )
    res = executeGridCommand( '', comm )
    os.remove( self.surlFile )
    if not res['OK']:
      return res
    returnCode, output, errStr = res['Value']
    if returnCode != 0:
      return S_ERROR( errStr )
    guid = output.replace( '\n', '' )
    if not checkGuid( guid ):
      return S_ERROR( 'Wrong GUID format returned' )
    self.ftsGUID = guid
    return res

  def __getFTSServer( self, site ):
    """ resolve the FTS endpoint URL for :site: from the CS

    :param self: self reference
    :param str site: site name
    """
    try:
      configPath = '/Resources/FTSEndpoints/%s' % site
      endpointURL = gConfig.getValue( configPath )
      if not endpointURL:
        errStr = "FTSRequest.__getFTSServer: Failed to find FTS endpoint, check CS entry for '%s'." % site
        return S_ERROR( errStr )
      return S_OK( endpointURL )
    except Exception:
      # `except Exception, x` was Python-3-invalid syntax; the bound name was unused
      return S_ERROR( 'FTSRequest.__getFTSServer: Failed to obtain endpoint details from CS' )
from DIRAC.RequestManagementSystem.Client.Operation import Operation from DIRAC.RequestManagementSystem.Client.File import File from DIRAC.RequestManagementSystem.Client.ReqClient import ReqClient from DIRAC.Resources.Catalog.FileCatalog import FileCatalog from DIRAC.Core.Utilities.List import breakListIntoChunks lfnChunks = breakListIntoChunks( lfnList, 100 ) multiRequests = len( lfnChunks ) > 1 error = 0 count = 0 reqClient = ReqClient() fc = FileCatalog() requestIDs = [] for lfnChunk in lfnChunks: metaDatas = fc.getFileMetadata( lfnChunk ) if not metaDatas["OK"]: gLogger.error( "unable to read metadata for lfns: %s" % metaDatas["Message"] ) error = -1 continue metaDatas = metaDatas["Value"] for failedLFN, reason in metaDatas["Failed"].items(): gLogger.error( "skipping %s: %s" % ( failedLFN, reason ) ) lfnChunk = set( metaDatas["Successful"] ) if not lfnChunk: gLogger.error( "LFN list is empty!!!" ) error = -1 continue if len( lfnChunk ) > Operation.MAX_FILES:
class InputDataAgent( OptimizerModule ):
  """
      The specific Optimizer must provide the following methods:
      - initializeOptimizer() before each execution cycle
      - checkJob() - the main method called for each job
  """

  #############################################################################
  def initializeOptimizer( self ):
    """Initialize specific parameters for InputDataAgent.
    """
    self.failedMinorStatus = self.am_getOption( '/FailedJobStatus', 'Input Data Not Available' )
    # this will ignore failover SE files
    self.checkFileMetadata = self.am_getOption( 'CheckFileMetadata', True )

    self.dataManager = DataManager()
    self.resourceStatus = ResourceStatus()
    self.fc = FileCatalog()

    # SE -> sites mapping cache, refreshed every self.cacheLength seconds
    self.seToSiteMapping = {}
    self.lastCScheck = 0
    self.cacheLength = 600

    return S_OK()

  #############################################################################
  def checkJob( self, job, classAdJob ):
    """
    This method does the optimization corresponding to this Agent,
    it is called for each job by the Optimizer framework
    """
    result = self.jobDB.getInputData( job )
    if not result['OK']:
      self.log.warn( 'Failed to get input data from JobdB for %s' % ( job ) )
      self.log.warn( result['Message'] )
      return result
    if not result['Value']:
      # jobs without input data skip straight to the next optimizer
      self.log.verbose( 'Job %s has no input data requirement' % ( job ) )
      return self.setNextOptimizer( job )

    # Check if we already executed this Optimizer and the input data is resolved
    res = self.getOptimizerJobInfo( job, self.am_getModuleParam( 'optimizerName' ) )
    if not ( res['OK'] and len( res['Value'] ) ):
      self.log.verbose( 'Job %s has an input data requirement and will be processed' % ( job ) )
      inputData = result['Value']
      result = self.__resolveInputData( job, inputData )
      if not result['OK']:
        self.log.warn( result['Message'] )
        return result

    return self.setNextOptimizer( job )

  #############################################################################
  def __resolveInputData( self, job, inputData ):
    """This method checks the file catalog for replica information.

    Stores the resolved replica/metadata info and the site candidates as
    optimizer job info; returns S_OK( resolvedData ) on success.
    """
    lfns = [ fname.replace( 'LFN:', '' ) for fname in inputData ]

    start = time.time()
    # In order to place jobs on Hold if a certain SE is banned we need first to check
    # if the replicas are really available
    replicas = self.dataManager.getActiveReplicas( lfns )
    timing = time.time() - start
    self.log.verbose( 'Catalog Replicas Lookup Time: %.2f seconds ' % ( timing ) )
    if not replicas['OK']:
      self.log.warn( replicas['Message'] )
      return replicas

    replicaDict = replicas['Value']

    siteCandidates = self.__checkReplicas( job, replicaDict )

    if not siteCandidates['OK']:
      self.log.warn( siteCandidates['Message'] )
      return siteCandidates

    if self.checkFileMetadata:
      guids = True
      start = time.time()
      guidDict = self.fc.getFileMetadata( lfns )
      timing = time.time() - start
      self.log.info( 'Catalog Metadata Lookup Time: %.2f seconds ' % ( timing ) )

      if not guidDict['OK']:
        self.log.warn( guidDict['Message'] )
        guids = False
      else:
        # only inspect 'Value' on success: an error result has no 'Value'
        # key and previously raised KeyError here
        failed = guidDict['Value']['Failed']
        if failed:
          self.log.warn( 'Failed to establish some GUIDs' )
          self.log.warn( failed )
          guids = False

      if guids:
        # merge replica info into the metadata result so downstream gets both
        for lfn, reps in replicaDict['Successful'].items():
          guidDict['Value']['Successful'][lfn].update( reps )
        replicas = guidDict

    resolvedData = {}
    resolvedData['Value'] = replicas
    resolvedData['SiteCandidates'] = siteCandidates['Value']
    result = self.setOptimizerJobInfo( job, self.am_getModuleParam( 'optimizerName' ), resolvedData )
    if not result['OK']:
      self.log.warn( result['Message'] )
      return result
    return S_OK( resolvedData )

  #############################################################################
  def __checkReplicas( self, job, replicaDict ):
    """Check that all input lfns have valid replicas and can all be found
       at least in one single site.
    """
    badLFNs = []

    if 'Successful' in replicaDict:
      for lfn, reps in replicaDict['Successful'].items():
        if not reps:
          badLFNs.append( 'LFN:%s Problem: No replicas available' % ( lfn ) )
    else:
      return S_ERROR( 'No replica Info available' )

    if 'Failed' in replicaDict:
      for lfn, cause in replicaDict['Failed'].items():
        badLFNs.append( 'LFN:%s Problem: %s' % ( lfn, cause ) )

    if badLFNs:
      self.log.info( 'Found %s problematic LFN(s) for job %s' % ( len( badLFNs ), job ) )
      param = '\n'.join( badLFNs )
      self.log.info( param )
      result = self.setJobParam( job, self.am_getModuleParam( 'optimizerName' ), param )
      if not result['OK']:
        self.log.error( result['Message'] )
      return S_ERROR( 'Input Data Not Available' )

    return self.__getSiteCandidates( replicaDict['Successful'] )

  #############################################################################
  # FIXME: right now this is unused...
  def __checkActiveSEs( self, job, replicaDict ):
    """
    Check active SE and replicas and identify possible Site candidates for
    the execution of the job
    """
    # Now let's check if some replicas might not be available due to banned SE's
    activeReplicas = self.dataManager.checkActiveReplicas( replicaDict )
    if not activeReplicas['OK']:
      # due to banned SE's input data might no be available
      msg = "On Hold: Missing replicas due to banned SE"
      self.log.info( msg )
      self.log.warn( activeReplicas['Message'] )
      return S_ERROR( msg )

    activeReplicaDict = activeReplicas['Value']

    siteCandidates = self.__checkReplicas( job, activeReplicaDict )

    if not siteCandidates['OK']:
      # due to a banned SE's input data is not available at a single site
      msg = "On Hold: Input data not Available due to banned SE"
      self.log.info( msg )
      self.log.warn( siteCandidates['Message'] )
      return S_ERROR( msg )

    resolvedData = {}
    resolvedData['Value'] = activeReplicas
    resolvedData['SiteCandidates'] = siteCandidates['Value']
    result = self.setOptimizerJobInfo( job, self.am_getModuleParam( 'optimizerName' ), resolvedData )
    if not result['OK']:
      self.log.warn( result['Message'] )
      return result
    return S_OK( resolvedData )

  #############################################################################
  def __getSitesForSE( self, se ):
    """ Returns a list of sites having the given SE as a local one.
        Uses the local cache of the site-se information
    """
    # Empty the cache if too old
    if ( time.time() - self.lastCScheck ) > self.cacheLength:
      self.log.verbose( 'Resetting the SE to site mapping cache' )
      self.seToSiteMapping = {}
      self.lastCScheck = time.time()

    if se not in self.seToSiteMapping:
      sites = getSitesForSE( se )
      if sites['OK']:
        self.seToSiteMapping[se] = list( sites['Value'] )
      return sites
    else:
      return S_OK( self.seToSiteMapping[se] )

  #############################################################################
  def __getSiteCandidates( self, inputData ):
    """This method returns a list of possible site candidates based on the
       job input data requirement.  For each site candidate, the number of files
       on disk and tape is resolved.
    """
    # For every LFN collect the sites that can serve at least one replica
    fileSEs = {}
    for lfn, replicas in inputData.items():
      siteList = []
      for se in replicas.keys():
        sites = self.__getSitesForSE( se )
        if sites['OK']:
          siteList += sites['Value']
      fileSEs[lfn] = uniqueElements( siteList )

    # Candidate sites are those that can serve every file (intersection)
    siteCandidates = []
    for i, sites in enumerate( fileSEs.values() ):
      if not i:
        siteCandidates = sites
      else:
        siteCandidates = [ site for site in siteCandidates if site in sites ]

    if not siteCandidates:
      return S_ERROR( 'No candidate sites available' )

    # In addition, check number of files on tape and disk for each site
    # for optimizations during scheduling
    siteResult = {}
    for site in siteCandidates:
      siteResult[site] = { 'disk': [], 'tape': [] }

    seDict = {}
    for lfn, replicas in inputData.items():
      for se in replicas.keys():
        if se not in seDict:
          sites = self.__getSitesForSE( se )
          if not sites['OK']:
            continue
          try:
            # storageElement = StorageElement( se )
            result = self.resourceStatus.getStorageElementStatus( se, statusType = 'ReadAccess' )
            if not result['OK']:
              continue
            seDict[se] = { 'Sites': sites['Value'], 'SEParams': result['Value'][se] }
            result = getStorageElementOptions( se )
            if not result['OK']:
              # drop the partial entry: keeping it made a later lookup of
              # 'DiskSE'/'TapeSE' raise KeyError for the same SE
              seDict.pop( se, None )
              continue
            seDict[se]['SEParams'].update( result['Value'] )
          except Exception:
            self.log.exception( 'Failed to instantiate StorageElement( %s )' % se )
            # same partial-entry cleanup on unexpected errors
            seDict.pop( se, None )
            continue
        for site in seDict[se]['Sites']:
          if site in siteCandidates:
            # a disk replica supersedes any tape replica for the same LFN
            if seDict[se]['SEParams']['ReadAccess'] and seDict[se]['SEParams']['DiskSE']:
              if lfn not in siteResult[site]['disk']:
                siteResult[site]['disk'].append( lfn )
                if lfn in siteResult[site]['tape']:
                  siteResult[site]['tape'].remove( lfn )
            if seDict[se]['SEParams']['ReadAccess'] and seDict[se]['SEParams']['TapeSE']:
              if lfn not in siteResult[site]['tape'] and lfn not in siteResult[site]['disk']:
                siteResult[site]['tape'].append( lfn )

    # Report counts, not the LFN lists themselves
    for site in siteResult:
      siteResult[site]['disk'] = len( siteResult[site]['disk'] )
      siteResult[site]['tape'] = len( siteResult[site]['tape'] )
    return S_OK( siteResult )
class FTSRequest( object ):
  """
  .. class:: FTSRequest

  Helper class for FTS job submission and monitoring.

  Wraps the legacy gLite FTS2 command-line tools (glite-transfer-submit /
  glite-transfer-status) for a single source SE -> target SE transfer of a
  set of LFNs, keeping per-file state in :attr:`fileDict` and registering
  successful transfers in the file catalog afterwards.
  """

  # # default checksum type
  __defaultCksmType = "ADLER32"
  # # flag to disable/enable checksum test, default: disabled
  __cksmTest = False

  def __init__( self ):
    """c'tor

    :param self: self reference
    """
    self.log = gLogger.getSubLogger( self.__class__.__name__, True )
    # # final states tuple
    self.finalStates = ( 'Canceled', 'Failed', 'Hold', 'Finished', 'FinishedDirty' )
    # # failed states tuple
    self.failedStates = ( 'Canceled', 'Failed', 'Hold', 'FinishedDirty' )
    # # successful states tuple
    self.successfulStates = ( 'Finished', 'Done' )
    # # all file states tuple
    self.fileStates = ( 'Done', 'Active', 'Pending', 'Ready', 'Canceled', 'Failed', 'Finishing',
                        'Finished', 'Submitted', 'Hold', 'Waiting' )
    # per-state file counters filled by __parseOutput()
    self.statusSummary = {}
    # # request status
    self.requestStatus = 'Unknown'
    # # dict for FTS job files
    self.fileDict = {}
    # # dict for replicas information
    self.catalogReplicas = {}
    # # dict for metadata information
    self.catalogMetadata = {}
    # # dict for files that failed to register
    self.failedRegistrations = {}
    # # placeholder for FileCatalog reference
    self.oCatalog = None
    # # submit timestamp
    self.submitTime = ''
    # # placeholder FTS job GUID
    self.ftsGUID = ''
    # # placeholder for FTS server URL
    self.ftsServer = ''
    # # flag marking FTS job completeness
    self.isTerminal = False
    # # completeness percentage
    self.percentageComplete = 0.0
    # # source SE name
    self.sourceSE = ''
    # # flag marking source SE validity
    self.sourceValid = False
    # # source space token
    self.sourceToken = ''
    # # target SE name
    self.targetSE = ''
    # # flag marking target SE validity
    self.targetValid = False
    # # target space token
    self.targetToken = ''
    # # placeholder for target StorageElement
    self.oTargetSE = None
    # # placeholder for source StorageElement
    self.oSourceSE = None
    # # checksum type, set it to default
    self.__cksmType = self.__defaultCksmType
    # # disable checksum test by default
    self.__cksmTest = False
    # # statuses that prevent submitting to FTS
    self.noSubmitStatus = ( 'Failed', 'Done', 'Staging' )
    # # were sources resolved?
    self.sourceResolved = False
    # # Number of file transfers actually submitted
    self.submittedFiles = 0
    # accumulated per-file transfer duration, filled by __parseOutput()
    self.transferTime = 0
    # CLI commands used to talk to FTS2, overridable via the CS
    self.submitCommand = Operations().getValue( 'DataManagement/FTSPlacement/FTS2/SubmitCommand', 'glite-transfer-submit' )
    self.monitorCommand = Operations().getValue( 'DataManagement/FTSPlacement/FTS2/MonitorCommand', 'glite-transfer-status' )
    self.ftsJob = None
    self.ftsFiles = []

  ####################################################################
  #
  #  Methods for setting/getting/checking the SEs
  #

  def setSourceSE( self, se ):
    """ set SE for source

    :param self: self reference
    :param str se: source SE name
    """
    if se == self.targetSE:
      return S_ERROR( "SourceSE is TargetSE" )
    self.sourceSE = se
    self.oSourceSE = StorageElement( self.sourceSE )
    return self.__checkSourceSE()

  def __checkSourceSE( self ):
    """ check source SE availability

    :param self: self reference
    """
    if not self.sourceSE:
      return S_ERROR( "SourceSE not set" )
    res = self.oSourceSE.isValid( 'Read' )
    if not res['OK']:
      return S_ERROR( "SourceSE not available for reading" )
    res = self.__getSESpaceToken( self.oSourceSE )
    if not res['OK']:
      self.log.error( "FTSRequest failed to get SRM Space Token for SourceSE", res['Message'] )
      return S_ERROR( "SourceSE does not support FTS transfers" )

    if self.__cksmTest:
      # NOTE(review): `res` is reassigned here, so the space token stored below
      # comes from getChecksumType(), not from __getSESpaceToken() — and
      # res["Value"] is read even when the call failed. Looks like an upstream
      # quirk; confirm before relying on sourceToken when checksum test is on.
      res = self.oSourceSE.getChecksumType()
      if not res["OK"]:
        self.log.error( "Unable to get checksum type for SourceSE %s: %s" % ( self.sourceSE,
                                                                              res["Message"] ) )
      cksmType = res["Value"]
      if cksmType in ( "NONE", "NULL" ):
        self.log.warn( "Checksum type set to %s at SourceSE %s, disabling checksum test" % ( cksmType,
                                                                                             self.sourceSE ) )
        self.__cksmTest = False
      elif cksmType != self.__cksmType:
        self.log.warn( "Checksum type mismatch, disabling checksum test" )
        self.__cksmTest = False

    self.sourceToken = res['Value']
    self.sourceValid = True
    return S_OK()

  def setTargetSE( self, se ):
    """ set target SE

    :param self: self reference
    :param str se: target SE name
    """
    if se == self.sourceSE:
      return S_ERROR( "TargetSE is SourceSE" )
    self.targetSE = se
    self.oTargetSE = StorageElement( self.targetSE )
    return self.__checkTargetSE()

  def setTargetToken( self, token ):
    """ target space token setter

    :param self: self reference
    :param str token: target space token
    """
    self.targetToken = token
    return S_OK()

  def __checkTargetSE( self ):
    """ check target SE availability

    :param self: self reference
    """
    if not self.targetSE:
      return S_ERROR( "TargetSE not set" )
    res = self.oTargetSE.isValid( 'Write' )
    if not res['OK']:
      return S_ERROR( "TargetSE not available for writing" )
    res = self.__getSESpaceToken( self.oTargetSE )
    if not res['OK']:
      self.log.error( "FTSRequest failed to get SRM Space Token for TargetSE", res['Message'] )
      return S_ERROR( "TargetSE does not support FTS transfers" )

    # # check checksum types
    if self.__cksmTest:
      # NOTE(review): same `res` reuse pattern as in __checkSourceSE — the
      # targetToken assigned below comes from getChecksumType() when the
      # checksum test is enabled; confirm against upstream intent.
      res = self.oTargetSE.getChecksumType()
      if not res["OK"]:
        self.log.error( "Unable to get checksum type for TargetSE %s: %s" % ( self.targetSE,
                                                                              res["Message"] ) )
      cksmType = res["Value"]
      if cksmType in ( "NONE", "NULL" ):
        self.log.warn( "Checksum type set to %s at TargetSE %s, disabling checksum test" % ( cksmType,
                                                                                             self.targetSE ) )
        self.__cksmTest = False
      elif cksmType != self.__cksmType:
        self.log.warn( "Checksum type mismatch, disabling checksum test" )
        self.__cksmTest = False

    self.targetToken = res['Value']
    self.targetValid = True
    return S_OK()

  @staticmethod
  def __getSESpaceToken( oSE ):
    """ get space token from StorageElement instance

    :param StorageElement oSE: StorageElement instance
    :return: S_OK( space token or None ) or the S_ERROR from getStorageParameters
    """
    res = oSE.getStorageParameters( "SRM2" )
    if not res['OK']:
      return res
    return S_OK( res['Value'].get( 'SpaceToken' ) )

  ####################################################################
  #
  #  Methods for setting/getting FTS request parameters
  #

  def setFTSGUID( self, guid ):
    """ FTS job GUID setter

    :param self: self reference
    :param str guid: string containing GUID
    """
    if not checkGuid( guid ):
      return S_ERROR( "Incorrect GUID format" )
    self.ftsGUID = guid
    return S_OK()

  def setFTSServer( self, server ):
    """ FTS server setter

    :param self: self reference
    :param str server: FTS server URL
    """
    self.ftsServer = server
    return S_OK()

  def isRequestTerminal( self ):
    """ check if FTS job has terminated

    :param self: self reference
    """
    if self.requestStatus in self.finalStates:
      self.isTerminal = True
    return S_OK( self.isTerminal )

  def setCksmTest( self, cksmTest = False ):
    """ set cksm test

    :param self: self reference
    :param bool cksmTest: flag to enable/disable checksum test
    """
    self.__cksmTest = bool( cksmTest )
    return S_OK( self.__cksmTest )

  ####################################################################
  #
  #  Methods for setting/getting/checking files and their metadata
  #

  def setLFN( self, lfn ):
    """ add LFN :lfn: to :fileDict:

    :param self: self reference
    :param str lfn: LFN to add to
    """
    # setdefault: never overwrites state of an already-registered file
    self.fileDict.setdefault( lfn, {'Status':'Waiting'} )
    return S_OK()

  def setSourceSURL( self, lfn, surl ):
    """ source SURL setter

    :param self: self reference
    :param str lfn: LFN
    :param str surl: source SURL
    """
    target = self.fileDict[lfn].get( 'Target' )
    if target == surl:
      return S_ERROR( "Source and target the same" )
    return self.__setFileParameter( lfn, 'Source', surl )

  def getSourceSURL( self, lfn ):
    """ get source SURL for LFN :lfn:

    :param self: self reference
    :param str lfn: LFN
    """
    return self.__getFileParameter( lfn, 'Source' )

  def setTargetSURL( self, lfn, surl ):
    """ set target SURL for LFN :lfn:

    :param self: self reference
    :param str lfn: LFN
    :param str surl: target SURL
    """
    source = self.fileDict[lfn].get( 'Source' )
    if source == surl:
      return S_ERROR( "Source and target the same" )
    return self.__setFileParameter( lfn, 'Target', surl )

  def getFailReason( self, lfn ):
    """ get fail reason for file :lfn:

    :param self: self reference
    :param str lfn: LFN
    """
    return self.__getFileParameter( lfn, 'Reason' )

  def getRetries( self, lfn ):
    """ get number of attempts made to transfer file :lfn:

    :param self: self reference
    :param str lfn: LFN
    """
    return self.__getFileParameter( lfn, 'Retries' )

  def getTransferTime( self, lfn ):
    """ get duration of transfer for file :lfn:

    :param self: self reference
    :param str lfn: LFN
    """
    return self.__getFileParameter( lfn, 'Duration' )

  def getFailed( self ):
    """ get list of wrongly transferred LFNs

    :param self: self reference
    """
    return S_OK( [ lfn for lfn in self.fileDict
                   if self.fileDict[lfn].get( 'Status', '' ) in self.failedStates ] )

  def getStaging( self ):
    """ get files set for prestaging """
    return S_OK( [lfn for lfn in self.fileDict
                  if self.fileDict[lfn].get( 'Status', '' ) == 'Staging'] )

  def getDone( self ):
    """ get list of succesfully transferred LFNs

    :param self: self reference
    """
    return S_OK( [ lfn for lfn in self.fileDict
                   if self.fileDict[lfn].get( 'Status', '' ) in self.successfulStates ] )

  def __setFileParameter( self, lfn, paramName, paramValue ):
    """ set :paramName: to :paramValue: for :lfn: file

    :param self: self reference
    :param str lfn: LFN
    :param str paramName: parameter name
    :param mixed paramValue: a new parameter value
    """
    # ensure the file entry exists before writing the parameter
    self.setLFN( lfn )
    self.fileDict[lfn][paramName] = paramValue
    return S_OK()

  def __getFileParameter( self, lfn, paramName ):
    """ get value of :paramName: for file :lfn:

    :param self: self reference
    :param str lfn: LFN
    :param str paramName: parameter name
    """
    if lfn not in self.fileDict:
      return S_ERROR( "Supplied file not set" )
    if paramName not in self.fileDict[lfn]:
      return S_ERROR( "%s not set for file" % paramName )
    return S_OK( self.fileDict[lfn][paramName] )

  ####################################################################
  #
  #  Methods for submission
  #

  def submit( self, monitor = False, printOutput = True ):
    """ submit FTS job

    :param self: self reference
    :param bool monitor: flag to monitor progress of FTS job
    :param bool printOutput: flag to print output of execution to stdout
    """
    res = self.__prepareForSubmission()
    if not res['OK']:
      return res
    res = self.__submitFTSTransfer()
    if not res['OK']:
      return res
    resDict = { 'ftsGUID' : self.ftsGUID, 'ftsServer' : self.ftsServer, 'submittedFiles' : self.submittedFiles }
    if monitor or printOutput:
      gLogger.always( "Submitted %s@%s" % ( self.ftsGUID, self.ftsServer ) )
      if monitor:
        # blocks until the FTS job reaches a terminal state
        self.monitor( untilTerminal = True, printOutput = printOutput, full = False )
    return S_OK( resDict )

  def __prepareForSubmission( self ):
    """ check validity of job before submission

    :param self: self reference
    """
    if not self.fileDict:
      return S_ERROR( "No files set" )
    if not self.sourceValid:
      return S_ERROR( "SourceSE not valid" )
    if not self.targetValid:
      return S_ERROR( "TargetSE not valid" )
    if not self.ftsServer:
      res = self.__resolveFTSServer()
      if not res['OK']:
        return S_ERROR( "FTSServer not valid" )
    self.resolveSource()
    self.resolveTarget()
    res = self.__filesToSubmit()
    if not res['OK']:
      return S_ERROR( "No files to submit" )
    return S_OK()

  def __getCatalogObject( self ):
    """ CatalogInterface instance facade

    Lazily instantiates the FileCatalog and caches it in self.oCatalog.

    :param self: self reference
    """
    try:
      if not self.oCatalog:
        self.oCatalog = FileCatalog()
      return S_OK()
    except:
      # NOTE(review): bare except with an empty S_ERROR() swallows the actual
      # instantiation failure — callers only see an unexplained error.
      return S_ERROR()

  def __updateReplicaCache( self, lfns = None, overwrite = False ):
    """ update replica cache for list of :lfns:

    :param self: self reference
    :param mixed lfns: list of LFNs
    :param bool overwrite: flag to trigger cache clearing and updating
    """
    if not lfns:
      lfns = self.fileDict.keys()
    toUpdate = [ lfn for lfn in lfns if ( lfn not in self.catalogReplicas ) or overwrite ]
    if not toUpdate:
      return S_OK()
    res = self.__getCatalogObject()
    if not res['OK']:
      return res
    res = self.oCatalog.getReplicas( toUpdate )
    if not res['OK']:
      return S_ERROR( "Failed to update replica cache: %s" % res['Message'] )
    # per-file catalog failures mark the file Failed but do not abort the call
    for lfn, error in res['Value']['Failed'].items():
      self.__setFileParameter( lfn, 'Reason', error )
      self.__setFileParameter( lfn, 'Status', 'Failed' )
    for lfn, replicas in res['Value']['Successful'].items():
      self.catalogReplicas[lfn] = replicas
    return S_OK()

  def __updateMetadataCache( self, lfns = None ):
    """ update metadata cache for list of LFNs

    :param self: self reference
    :param list lfns: list of LFNs
    """
    if not lfns:
      lfns = self.fileDict.keys()
    toUpdate = [ lfn for lfn in lfns if lfn not in self.catalogMetadata ]
    if not toUpdate:
      return S_OK()
    res = self.__getCatalogObject()
    if not res['OK']:
      return res
    res = self.oCatalog.getFileMetadata( toUpdate )
    if not res['OK']:
      return S_ERROR( "Failed to get source catalog metadata: %s" % res['Message'] )
    # per-file catalog failures mark the file Failed but do not abort the call
    for lfn, error in res['Value']['Failed'].items():
      self.__setFileParameter( lfn, 'Reason', error )
      self.__setFileParameter( lfn, 'Status', 'Failed' )
    for lfn, metadata in res['Value']['Successful'].items():
      self.catalogMetadata[lfn] = metadata
    return S_OK()

  def resolveSource( self ):
    """ resolve source SE eligible for submission

    :param self: self reference
    """

    # Avoid resolving sources twice
    if self.sourceResolved:
      return S_OK()
    # Only resolve files that need a transfer
    toResolve = [ lfn for lfn in self.fileDict if self.fileDict[lfn].get( "Status", "" ) != "Failed" ]
    if not toResolve:
      return S_OK()
    res = self.__updateMetadataCache( toResolve )
    if not res['OK']:
      return res
    res = self.__updateReplicaCache( toResolve )
    if not res['OK']:
      return res

    # Define the source URLs
    for lfn in toResolve:
      replicas = self.catalogReplicas.get( lfn, {} )
      if self.sourceSE not in replicas:
        gLogger.warn( "resolveSource: skipping %s - not replicas at SourceSE %s" % ( lfn, self.sourceSE ) )
        self.__setFileParameter( lfn, 'Reason', "No replica at SourceSE" )
        self.__setFileParameter( lfn, 'Status', 'Failed' )
        continue
      # Fix first the PFN; fall back to the catalog replica when the SE
      # cannot build a PFN for the LFN
      pfn = self.oSourceSE.getPfnForLfn( lfn ).get( 'Value', {} ).get( 'Successful', {} ).get( lfn, replicas[self.sourceSE] )
      res = returnSingleResult( self.oSourceSE.getPfnForProtocol( pfn, protocol = 'SRM2', withPort = True ) )
      if not res['OK']:
        gLogger.warn( "resolveSource: skipping %s - %s" % ( lfn, res["Message"] ) )
        self.__setFileParameter( lfn, 'Reason', res['Message'] )
        self.__setFileParameter( lfn, 'Status', 'Failed' )
        continue
      res = self.setSourceSURL( lfn, res['Value'] )
      if not res['OK']:
        gLogger.warn( "resolveSource: skipping %s - %s" % ( lfn, res["Message"] ) )
        self.__setFileParameter( lfn, 'Reason', res['Message'] )
        self.__setFileParameter( lfn, 'Status', 'Failed' )
        continue

    # Invert to a SURL -> LFN map for the SE metadata queries below
    toResolve = {}
    for lfn in self.fileDict:
      if "Source" in self.fileDict[lfn]:
        toResolve[self.fileDict[lfn]['Source']] = lfn
    if not toResolve:
      return S_ERROR( "No eligible Source files" )

    # Get metadata of the sources, to check for existance, availability and caching
    res = self.oSourceSE.getFileMetadata( toResolve.keys() )
    if not res['OK']:
      return S_ERROR( "Failed to check source file metadata" )

    for pfn, error in res['Value']['Failed'].items():
      lfn = toResolve[pfn]
      if re.search( 'File does not exist', error ):
        gLogger.warn( "resolveSource: skipping %s - source file does not exists" % lfn )
        self.__setFileParameter( lfn, 'Reason', "Source file does not exist" )
        self.__setFileParameter( lfn, 'Status', 'Failed' )
      else:
        gLogger.warn( "resolveSource: skipping %s - failed to get source metadata" % lfn )
        self.__setFileParameter( lfn, 'Reason', "Failed to get Source metadata" )
        self.__setFileParameter( lfn, 'Status', 'Failed' )
    toStage = []

    nbStagedFiles = 0
    for pfn, metadata in res['Value']['Successful'].items():
      lfn = toResolve[pfn]
      lfnStatus = self.fileDict.get( lfn, {} ).get( 'Status' )
      if metadata['Unavailable']:
        gLogger.warn( "resolveSource: skipping %s - source file unavailable" % lfn )
        self.__setFileParameter( lfn, 'Reason', "Source file Unavailable" )
        self.__setFileParameter( lfn, 'Status', 'Failed' )
      elif metadata['Lost']:
        gLogger.warn( "resolveSource: skipping %s - source file lost" % lfn )
        self.__setFileParameter( lfn, 'Reason', "Source file Lost" )
        self.__setFileParameter( lfn, 'Status', 'Failed' )
      elif not metadata['Cached']:
        # not on disk cache yet: request prestaging unless already staging
        if lfnStatus != 'Staging':
          toStage.append( pfn )
      elif metadata['Size'] != self.catalogMetadata[lfn]['Size']:
        gLogger.warn( "resolveSource: skipping %s - source file size mismatch" % lfn )
        self.__setFileParameter( lfn, 'Reason', "Source size mismatch" )
        self.__setFileParameter( lfn, 'Status', 'Failed' )
      elif self.catalogMetadata[lfn]['Checksum'] and metadata['Checksum'] and \
            not compareAdler( metadata['Checksum'], self.catalogMetadata[lfn]['Checksum'] ):
        gLogger.warn( "resolveSource: skipping %s - source file checksum mismatch" % lfn )
        self.__setFileParameter( lfn, 'Reason', "Source checksum mismatch" )
        self.__setFileParameter( lfn, 'Status', 'Failed' )
      elif lfnStatus == 'Staging':
        # file that was staging is now cached
        self.__setFileParameter( lfn, 'Status', 'Waiting' )
        nbStagedFiles += 1
    # Some files were being staged
    if nbStagedFiles:
      self.log.info( 'resolveSource: %d files have been staged' % nbStagedFiles )

    # Launching staging of files not in cache
    if toStage:
      gLogger.warn( "resolveSource: %s source files not cached, prestaging..." % len( toStage ) )
      stage = self.oSourceSE.prestageFile( toStage )
      if not stage["OK"]:
        gLogger.error( "resolveSource: error is prestaging - %s" % stage["Message"] )
        for pfn in toStage:
          lfn = toResolve[pfn]
          self.__setFileParameter( lfn, 'Reason', stage["Message"] )
          self.__setFileParameter( lfn, 'Status', 'Failed' )
      else:
        for pfn in toStage:
          lfn = toResolve[pfn]
          if pfn in stage['Value']['Successful']:
            self.__setFileParameter( lfn, 'Status', 'Staging' )
          elif pfn in stage['Value']['Failed']:
            self.__setFileParameter( lfn, 'Reason', stage['Value']['Failed'][pfn] )
            self.__setFileParameter( lfn, 'Status', 'Failed' )

    self.sourceResolved = True
    return S_OK()

  def resolveTarget( self ):
    """ find target SE eligible for submission

    :param self: self reference
    """
    toResolve = [ lfn for lfn in self.fileDict
                 if self.fileDict[lfn].get( 'Status' ) not in self.noSubmitStatus ]
    if not toResolve:
      return S_OK()
    res = self.__updateReplicaCache( toResolve )
    if not res['OK']:
      return res
    for lfn in toResolve:
      res = self.oTargetSE.getPfnForLfn( lfn )
      if not res['OK'] or lfn not in res['Value']['Successful']:
        gLogger.warn( "resolveTarget: skipping %s - failed to create target pfn" % lfn )
        self.__setFileParameter( lfn, 'Reason', "Failed to create Target" )
        self.__setFileParameter( lfn, 'Status', 'Failed' )
        continue
      pfn = res['Value']['Successful'][lfn]
      res = self.oTargetSE.getPfnForProtocol( pfn, protocol = 'SRM2', withPort = True )
      if not res['OK'] or pfn not in res['Value']['Successful']:
        # prefer the overall error message, otherwise the per-pfn failure
        reason = res.get( 'Message', res.get( 'Value', {} ).get( 'Failed', {} ).get( pfn ) )
        gLogger.warn( "resolveTarget: skipping %s - %s" % ( lfn, reason ) )
        self.__setFileParameter( lfn, 'Reason', reason )
        self.__setFileParameter( lfn, 'Status', 'Failed' )
        continue
      pfn = res['Value']['Successful'][pfn]
      res = self.setTargetSURL( lfn, pfn )
      if not res['OK']:
        gLogger.warn( "resolveTarget: skipping %s - %s" % ( lfn, res["Message"] ) )
        self.__setFileParameter( lfn, 'Reason', res['Message'] )
        self.__setFileParameter( lfn, 'Status', 'Failed' )
        continue
    # Invert to a target SURL -> LFN map for the existence check below
    toResolve = {}
    for lfn in self.fileDict:
      if "Target" in self.fileDict[lfn]:
        toResolve[self.fileDict[lfn]['Target']] = lfn
    if not toResolve:
      return S_ERROR( "No eligible Target files" )
    res = self.oTargetSE.exists( toResolve.keys() )
    if not res['OK']:
      return S_ERROR( "Failed to check target existence" )
    for pfn, error in res['Value']['Failed'].items():
      lfn = toResolve[pfn]
      self.__setFileParameter( lfn, 'Reason', error )
      self.__setFileParameter( lfn, 'Status', 'Failed' )
    toRemove = []
    for pfn, exists in res['Value']['Successful'].items():
      if exists:
        lfn = toResolve[pfn]
        res = self.getSourceSURL( lfn )
        if not res['OK']:
          # no source SURL resolved: cannot safely overwrite the existing target
          gLogger.warn( "resolveTarget: skipping %s - target exists" % lfn )
          self.__setFileParameter( lfn, 'Reason', "Target exists" )
          self.__setFileParameter( lfn, 'Status', 'Failed' )
        elif res['Value'] == pfn:
          gLogger.warn( "resolveTarget: skipping %s - source and target pfns are the same" % lfn )
          self.__setFileParameter( lfn, 'Reason', "Source and Target the same" )
          self.__setFileParameter( lfn, 'Status', 'Failed' )
        else:
          # stale target copy: schedule removal so FTS can write a fresh one
          toRemove.append( pfn )
    if toRemove:
      self.oTargetSE.removeFile( toRemove )
    return S_OK()

  def __filesToSubmit( self ):
    """
    check if there is at least one file to submit

    :return: S_OK if at least one file is present, S_ERROR otherwise
    """
    for lfn in self.fileDict:
      lfnStatus = self.fileDict[lfn].get( 'Status' )
      source = self.fileDict[lfn].get( 'Source' )
      target = self.fileDict[lfn].get( 'Target' )
      if lfnStatus not in self.noSubmitStatus and source and target:
        return S_OK()
    return S_ERROR()

  def __createFTSFiles( self ):
    """ create FTSFile objects for the glite-transfer-submit command

    One entry per file to be transferred:

    sourceSURL targetSURL [CHECKSUMTYPE:CHECKSUM]

    :param self: self reference
    """
    self.__updateMetadataCache()
    for lfn in self.fileDict:
      lfnStatus = self.fileDict[lfn].get( 'Status' )
      if lfnStatus not in self.noSubmitStatus:
        cksmStr = ""
        # # add cksmType:cksm only if cksmType is specified, else let FTS decide by itself
        if self.__cksmTest and self.__cksmType:
          checkSum = self.catalogMetadata.get( lfn, {} ).get( 'Checksum' )
          if checkSum:
            cksmStr = " %s:%s" % ( self.__cksmType, intAdlerToHex( hexAdlerToInt( checkSum ) ) )
        ftsFile = FTSFile()
        ftsFile.LFN = lfn
        ftsFile.SourceSURL = self.fileDict[lfn].get( 'Source' )
        ftsFile.TargetSURL = self.fileDict[lfn].get( 'Target' )
        ftsFile.SourceSE = self.sourceSE
        ftsFile.TargetSE = self.targetSE
        ftsFile.Status = self.fileDict[lfn].get( 'Status' )
        ftsFile.Checksum = cksmStr
        ftsFile.Size = self.catalogMetadata.get( lfn, {} ).get( 'Size' )
        self.ftsFiles.append( ftsFile )
        self.submittedFiles += 1
    return S_OK()

  def __createFTSJob( self, guid = None ):
    """ build the FTSJob object from the collected FTSFiles and store it in self.ftsJob

    :param str guid: optional FTS job GUID to attach (used when re-monitoring)
    """
    self.__createFTSFiles()
    ftsJob = FTSJob()
    ftsJob.RequestID = 0
    ftsJob.OperationID = 0
    ftsJob.SourceSE = self.sourceSE
    ftsJob.TargetSE = self.targetSE
    ftsJob.SourceToken = self.sourceToken
    ftsJob.TargetToken = self.targetToken
    ftsJob.FTSServer = self.ftsServer
    if guid:
      ftsJob.FTSGUID = guid
    for ftsFile in self.ftsFiles:
      ftsFile.Attempt += 1
      ftsFile.Error = ""
      ftsJob.addFile( ftsFile )
    self.ftsJob = ftsJob

  def __submitFTSTransfer( self ):
    """ create and execute glite-transfer-submit CLI command

    :param self: self reference
    """
    log = gLogger.getSubLogger( 'Submit' )
    self.__createFTSJob()
    submit = self.ftsJob.submitFTS2( command = self.submitCommand )
    if not submit["OK"]:
      log.error( "unable to submit FTSJob: %s" % submit["Message"] )
      return submit
    log.info( "FTSJob '%s'@'%s' has been submitted" % ( self.ftsJob.FTSGUID, self.ftsJob.FTSServer ) )
    # # update statuses for job files
    for ftsFile in self.ftsJob:
      ftsFile.FTSGUID = self.ftsJob.FTSGUID
      ftsFile.Status = "Submitted"
      ftsFile.Attempt += 1
    # NOTE(review): this repeats the log line emitted above — apparently redundant.
    log.info( "FTSJob '%s'@'%s' has been submitted" % ( self.ftsJob.FTSGUID, self.ftsJob.FTSServer ) )
    self.ftsGUID = self.ftsJob.FTSGUID
    return S_OK()

  def __resolveFTSServer( self ):
    """
    resolve FTS server to use, it should be the closest one from target SE

    :param self: self reference
    """
    from DIRAC.ConfigurationSystem.Client.Helpers.Resources import getFTSServersForSites
    if not self.targetSE:
      return S_ERROR( "Target SE not set" )
    res = getSitesForSE( self.targetSE )
    if not res['OK'] or not res['Value']:
      return S_ERROR( "Could not determine target site" )
    targetSites = res['Value']

    targetSite = ''
    for targetSite in targetSites:
      targetFTS = getFTSServersForSites( [targetSite] )
      if targetFTS['OK']:
        ftsTarget = targetFTS['Value'][targetSite]
        if ftsTarget:
          # first site with a configured FTS server wins
          self.ftsServer = ftsTarget
          return S_OK( self.ftsServer )
      else:
        return targetFTS
    return S_ERROR( 'No FTS server found for %s' % targetSite )

  ####################################################################
  #
  #  Methods for monitoring
  #

  def summary( self, untilTerminal = False, printOutput = False ):
    """ summary of FTS job

    :param self: self reference
    :param bool untilTerminal: flag to monitor FTS job to its final state
    :param bool printOutput: flag to print out monitoring information to the stdout
    """
    res = self.__isSummaryValid()
    if not res['OK']:
      return res
    while not self.isTerminal:
      res = self.__parseOutput( full = True )
      if not res['OK']:
        return res
      if untilTerminal:
        self.__print()
      self.isRequestTerminal()
      if res['Value'] or ( not untilTerminal ):
        break
      # poll once per second until the job reaches a final state
      time.sleep( 1 )
    if untilTerminal:
      print ""
    if printOutput and ( not untilTerminal ):
      return self.dumpSummary( printOutput = printOutput )
    return S_OK()

  def monitor( self, untilTerminal = False, printOutput = False, full = True ):
    """ monitor FTS job

    :param self: self reference
    :param bool untilTerminal: flag to monitor FTS job to its final state
    :param bool printOutput: flag to print out monitoring information to the stdout
    """
    if not self.ftsJob:
      # rebuild the FTSJob object when monitoring an already-submitted job
      self.resolveSource()
      self.__createFTSJob( self.ftsGUID )
    res = self.__isSummaryValid()
    if not res['OK']:
      return res
    if untilTerminal:
      res = self.summary( untilTerminal = untilTerminal, printOutput = printOutput )
      if not res['OK']:
        return res
    res = self.__parseOutput( full = full )
    if not res['OK']:
      return res
    if untilTerminal:
      self.finalize()
    if printOutput:
      self.dump()
    return res

  def dumpSummary( self, printOutput = False ):
    """ get FTS job summary as str

    :param self: self reference
    :param bool printOutput: print summary to stdout
    """

    outStr = ''
    for status in sorted( self.statusSummary ):
      if self.statusSummary[status]:
        outStr = '%s\t%-10s : %-10s\n' % ( outStr, status, str( self.statusSummary[status] ) )
    outStr = outStr.rstrip( '\n' )
    if printOutput:
      print outStr
    return S_OK( outStr )

  def __print( self ):
    """ print progress bar of FTS job completeness to stdout

    :param self: self reference
    """
    width = 100
    bits = int( ( width * self.percentageComplete ) / 100 )
    outStr = "|%s>%s| %.1f%s %s %s" % ( "="*bits, " "*( width - bits ),
                                        self.percentageComplete, "%",
                                        self.requestStatus, " "*10 )
    # carriage return (no newline) so the bar redraws in place
    sys.stdout.write( "%s\r" % ( outStr ) )
    sys.stdout.flush()

  def dump( self ):
    """ print FTS job parameters and files to stdout

    :param self: self reference
    """
    print "%-10s : %-10s" % ( "Status", self.requestStatus )
    print "%-10s : %-10s" % ( "Source", self.sourceSE )
    print "%-10s : %-10s" % ( "Target", self.targetSE )
    print "%-10s : %-128s" % ( "Server", self.ftsServer )
    print "%-10s : %-128s" % ( "GUID", self.ftsGUID )
    for lfn in sorted( self.fileDict ):
      print "\n %-15s : %-128s" % ( 'LFN', lfn )
      for key in ['Source', 'Target', 'Status', 'Reason', 'Duration']:
        print " %-15s : %-128s" % ( key, str( self.fileDict[lfn].get( key ) ) )
    return S_OK()

  def __isSummaryValid( self ):
    """ check validity of FTS job summary report

    :param self: self reference
    """
    if not self.ftsServer:
      return S_ERROR( "FTSServer not set" )
    if not self.ftsGUID:
      return S_ERROR( "FTSGUID not set" )
    return S_OK()

  def __parseOutput( self, full = False ):
    """ execute glite-transfer-status command and parse its output

    :param self: self reference
    :param bool full: glite-transfer-status verbosity level, when set, collect information of files as well
    """
    monitor = self.ftsJob.monitorFTS2( command = self.monitorCommand, full = full )
    if not monitor['OK']:
      return monitor
    self.percentageComplete = self.ftsJob.Completeness
    self.requestStatus = self.ftsJob.Status
    self.submitTime = self.ftsJob.SubmitTime

    statusSummary = monitor['Value']
    if statusSummary:
      for state in statusSummary:
        self.statusSummary[state] = statusSummary[state]

    self.transferTime = 0
    for ftsFile in self.ftsJob:
      lfn = ftsFile.LFN
      self.__setFileParameter( lfn, 'Status', ftsFile.Status )
      self.__setFileParameter( lfn, 'Reason', ftsFile.Error )
      self.__setFileParameter( lfn, 'Duration', ftsFile._duration )
      targetURL = self.__getFileParameter( lfn, 'Target' )
      if not targetURL['OK']:
        # backfill the target SURL from the FTS report when not set locally
        self.__setFileParameter( lfn, 'Target', ftsFile.TargetSURL )
      self.transferTime += int( ftsFile._duration )
    return S_OK()

  ####################################################################
  #
  #  Methods for finalization
  #

  def finalize( self ):
    """ finalize FTS job

    Registers successfully transferred replicas in the catalog and sends
    the accounting record.

    :param self: self reference
    """
    self.__updateMetadataCache()
    transEndTime = dateTime()
    regStartTime = time.time()
    res = self.getTransferStatistics()
    transDict = res['Value']

    res = self.__registerSuccessful( transDict['transLFNs'] )

    regSuc, regTotal = res['Value']
    regTime = time.time() - regStartTime
    if self.sourceSE and self.targetSE:
      self.__sendAccounting( regSuc, regTotal, regTime, transEndTime, transDict )
    return S_OK()

  def getTransferStatistics( self ):
    """ collect information of Transfers that can be used by Accounting

    :param self: self reference
    """
    transDict = { 'transTotal': len( self.fileDict ),
                  'transLFNs': [],
                  'transOK': 0,
                  'transSize': 0 }

    for lfn in self.fileDict:
      if self.fileDict[lfn].get( 'Status' ) in self.successfulStates:
        if self.fileDict[lfn].get( 'Duration', 0 ):
          transDict['transLFNs'].append( lfn )
          transDict['transOK'] += 1
          if lfn in self.catalogMetadata:
            transDict['transSize'] += self.catalogMetadata[lfn].get( 'Size', 0 )

    return S_OK( transDict )

  def getFailedRegistrations( self ):
    """ get failed registrations dict

    :param self: self reference
    """
    return S_OK( self.failedRegistrations )

  def __registerSuccessful( self, transLFNs ):
    """ register successfully transferred files to the catalogs,
    fill failedRegistrations dict for files that failed to register

    :param self: self reference
    :param list transLFNs: LFNs in FTS job
    :return: S_OK( ( nRegistered, nAttempted ) ) — never S_ERROR; failures
             are recorded in self.failedRegistrations instead
    """
    self.failedRegistrations = {}
    toRegister = {}
    for lfn in transLFNs:
      res = returnSingleResult( self.oTargetSE.getPfnForProtocol( self.fileDict[lfn].get( 'Target' ), protocol = 'SRM2', withPort = False ) )
      if not res['OK']:
        self.__setFileParameter( lfn, 'Reason', res['Message'] )
        self.__setFileParameter( lfn, 'Status', 'Failed' )
      else:
        toRegister[lfn] = { 'PFN' : res['Value'], 'SE' : self.targetSE }
    if not toRegister:
      return S_OK( ( 0, 0 ) )
    res = self.__getCatalogObject()
    if not res['OK']:
      # NOTE(review): the loop body reassigns the whole dict and logs the same
      # error once per LFN — a single assignment outside the loop would behave
      # the same; kept as-is.
      for lfn in toRegister:
        self.failedRegistrations = toRegister
        self.log.error( 'Failed to get Catalog Object', res['Message'] )
      return S_OK( ( 0, len( toRegister ) ) )
    res = self.oCatalog.addReplica( toRegister )
    if not res['OK']:
      self.failedRegistrations = toRegister
      self.log.error( 'Failed to get Catalog Object', res['Message'] )
      return S_OK( ( 0, len( toRegister ) ) )
    for lfn, error in res['Value']['Failed'].items():
      self.failedRegistrations[lfn] = toRegister[lfn]
      self.log.error( 'Registration of Replica failed', '%s : %s' % ( lfn, str( error ) ) )
    return S_OK( ( len( res['Value']['Successful'] ), len( toRegister ) ) )

  def __sendAccounting( self, regSuc, regTotal, regTime, transEndTime, transDict ):
    """ send accounting record

    :param self: self reference
    :param regSuc: number of files successfully registered
    :param regTotal: number of files attempted to register
    :param regTime: time spent on registration
    :param transEndTime: time stamp at the end of FTS job
    :param dict transDict: dict holding counters for files being transferred, their sizes and successful transfers
    """
    oAccounting = DataOperation()
    oAccounting.setEndTime( transEndTime )
    oAccounting.setStartTime( self.submitTime )

    accountingDict = {}
    accountingDict['OperationType'] = 'replicateAndRegister'
    result = getProxyInfo()
    if not result['OK']:
      userName = '******'
    else:
      userName = result['Value'].get( 'username', 'unknown' )
    accountingDict['User'] = userName
    accountingDict['Protocol'] = 'FTS' if 'fts3' not in self.ftsServer else 'FTS3'
    accountingDict['RegistrationTime'] = regTime
    accountingDict['RegistrationOK'] = regSuc
    accountingDict['RegistrationTotal'] = regTotal
    accountingDict['TransferOK'] = transDict['transOK']
    accountingDict['TransferTotal'] = transDict['transTotal']
    accountingDict['TransferSize'] = transDict['transSize']
    accountingDict['FinalStatus'] = self.requestStatus
    accountingDict['Source'] = self.sourceSE
    accountingDict['Destination'] = self.targetSE
    accountingDict['TransferTime'] = self.transferTime
    oAccounting.setValuesFromDict( accountingDict )
    self.log.verbose( "Attempting to commit accounting message..." )
    oAccounting.commit()
    self.log.verbose( "...committed." )
    return S_OK()
class ReplicateAndRegister(DMSRequestOperationsBase):
    """
    .. class:: ReplicateAndRegister

    ReplicateAndRegister operation handler: replicates files to their target
    SEs either directly through the DataManager or by scheduling FTS/FTS3
    transfers, and registers the new replicas in the file catalog.
    """

    def __init__(self, operation=None, csPath=None):
        """c'tor

        :param self: self reference
        :param Operation operation: Operation instance
        :param str csPath: CS path for this handler
        """
        super(ReplicateAndRegister, self).__init__(operation, csPath)
        # gMonitor counters for direct (DataManager) replication
        gMonitor.registerActivity("ReplicateAndRegisterAtt", "Replicate and register attempted",
                                  "RequestExecutingAgent", "Files/min", gMonitor.OP_SUM)
        gMonitor.registerActivity("ReplicateOK", "Replications successful",
                                  "RequestExecutingAgent", "Files/min", gMonitor.OP_SUM)
        gMonitor.registerActivity("ReplicateFail", "Replications failed",
                                  "RequestExecutingAgent", "Files/min", gMonitor.OP_SUM)
        gMonitor.registerActivity("RegisterOK", "Registrations successful",
                                  "RequestExecutingAgent", "Files/min", gMonitor.OP_SUM)
        gMonitor.registerActivity("RegisterFail", "Registrations failed",
                                  "RequestExecutingAgent", "Files/min", gMonitor.OP_SUM)
        # gMonitor counters for FTS scheduling
        gMonitor.registerActivity("FTSScheduleAtt", "Files schedule attempted",
                                  "RequestExecutingAgent", "Files/min", gMonitor.OP_SUM)
        gMonitor.registerActivity("FTSScheduleOK", "File schedule successful",
                                  "RequestExecutingAgent", "Files/min", gMonitor.OP_SUM)
        gMonitor.registerActivity("FTSScheduleFail", "File schedule failed",
                                  "RequestExecutingAgent", "Files/min", gMonitor.OP_SUM)
        # Clients
        self.fc = FileCatalog()

    def __call__(self):
        """ call me maybe """
        # check done replicas first and flip already-replicated files to Done
        checkReplicas = self.__checkReplicas()
        if not checkReplicas["OK"]:
            self.log.error('Failed to check replicas', checkReplicas["Message"])

        if hasattr(self, "FTSMode") and getattr(self, "FTSMode"):
            bannedGroups = getattr(self, "FTSBannedGroups") if hasattr(self, "FTSBannedGroups") else ()
            if self.request.OwnerGroup in bannedGroups:
                self.log.verbose("usage of FTS system is banned for request's owner")
                # banned owners fall back to direct DataManager transfers
                return self.dmTransfer()

            if getattr(self, 'UseNewFTS3', False):
                return self.fts3Transfer()
            else:
                return self.ftsTransfer()

        return self.dmTransfer()

    def __checkReplicas(self):
        """ check done replicas and update file states

        Marks files Failed when the catalog says they do not exist, and Done
        when all target SEs already hold a replica.
        """
        waitingFiles = dict([(opFile.LFN, opFile) for opFile in self.operation
                             if opFile.Status in ("Waiting", "Scheduled")])
        targetSESet = set(self.operation.targetSEList)

        replicas = self.fc.getReplicas(waitingFiles.keys())
        if not replicas["OK"]:
            self.log.error('Failed to get replicas', replicas["Message"])
            return replicas

        reMissing = re.compile(r".*such file.*")
        for failedLFN, errStr in replicas["Value"]["Failed"].iteritems():
            waitingFiles[failedLFN].Error = errStr
            if reMissing.search(errStr.lower()):
                self.log.error("File does not exists", failedLFN)
                gMonitor.addMark("ReplicateFail", len(targetSESet))
                waitingFiles[failedLFN].Status = "Failed"

        for successfulLFN, reps in replicas["Value"]["Successful"].iteritems():
            if targetSESet.issubset(set(reps)):
                self.log.info("file %s has been replicated to all targets" % successfulLFN)
                waitingFiles[successfulLFN].Status = "Done"

        return S_OK()

    def _addMetadataToFiles(self, toSchedule):
        """ Add metadata to those files that need to be scheduled through FTS

            toSchedule is a dictionary whose values are lists headed by the
            file object:
            {'lfn1': [opFile, ...], 'lfn2': [opFile, ...]}

        :returns: S_OK({lfn: opFile}) — a plain LFN -> opFile mapping
        """
        if toSchedule:
            self.log.info("found %s files to schedule, getting metadata from FC" % len(toSchedule))
        else:
            self.log.verbose("No files to schedule")
            return S_OK([])

        res = self.fc.getFileMetadata(toSchedule.keys())
        if not res['OK']:
            return res
        else:
            if res['Value']['Failed']:
                self.log.warn("Can't schedule %d files: problems getting the metadata: %s" %
                              (len(res['Value']['Failed']), ', '.join(res['Value']['Failed'])))
            metadata = res['Value']['Successful']

            filesToSchedule = {}

            for lfn, lfnMetadata in metadata.iteritems():
                opFileToSchedule = toSchedule[lfn][0]
                opFileToSchedule.GUID = lfnMetadata['GUID']
                # In principle this is defined already in filterReplicas()
                if not opFileToSchedule.Checksum:
                    opFileToSchedule.Checksum = metadata[lfn]['Checksum']
                    opFileToSchedule.ChecksumType = metadata[lfn]['ChecksumType']
                opFileToSchedule.Size = metadata[lfn]['Size']

                filesToSchedule[opFileToSchedule.LFN] = opFileToSchedule

        return S_OK(filesToSchedule)

    def _filterReplicas(self, opFile):
        """ filter out banned/invalid source SEs """
        return filterReplicas(opFile, logger=self.log, dataManager=self.dm)

    def ftsTransfer(self):
        """ replicate and register using FTS """

        self.log.info("scheduling files in FTS...")

        bannedTargets = self.checkSEsRSS()
        if not bannedTargets['OK']:
            gMonitor.addMark("FTSScheduleAtt")
            gMonitor.addMark("FTSScheduleFail")
            return bannedTargets

        if bannedTargets['Value']:
            return S_OK("%s targets are banned for writing" % ",".join(bannedTargets['Value']))

        # Can continue now
        self.log.verbose("No targets banned for writing")

        toSchedule = {}
        delayExecution = 0
        errors = defaultdict(int)
        for opFile in self.getWaitingFilesList():
            opFile.Error = ''
            gMonitor.addMark("FTSScheduleAtt")
            # check replicas
            replicas = self._filterReplicas(opFile)
            if not replicas["OK"]:
                continue
            replicas = replicas["Value"]

            validReplicas = replicas.get("Valid")
            noMetaReplicas = replicas.get("NoMetadata")
            noReplicas = replicas.get('NoReplicas')
            badReplicas = replicas.get('Bad')
            noActiveReplicas = replicas.get('NoActiveReplicas')

            if validReplicas:
                validTargets = list(set(self.operation.targetSEList) - set(validReplicas))
                if not validTargets:
                    self.log.info("file %s is already present at all targets" % opFile.LFN)
                    opFile.Status = "Done"
                else:
                    toSchedule[opFile.LFN] = [opFile, validReplicas, validTargets]
            else:
                gMonitor.addMark("FTSScheduleFail")
                if noMetaReplicas:
                    err = "Couldn't get metadata"
                    errors[err] += 1
                    self.log.verbose(
                        "unable to schedule '%s', %s at %s" %
                        (opFile.LFN, err, ','.join(noMetaReplicas)))
                    opFile.Error = err
                elif noReplicas:
                    err = "File doesn't exist"
                    errors[err] += 1
                    self.log.error("Unable to schedule transfer",
                                   "%s %s at %s" % (opFile.LFN, err, ','.join(noReplicas)))
                    opFile.Error = err
                    opFile.Status = 'Failed'
                elif badReplicas:
                    err = "All replicas have a bad checksum"
                    errors[err] += 1
                    self.log.error("Unable to schedule transfer",
                                   "%s, %s at %s" % (opFile.LFN, err, ','.join(badReplicas)))
                    opFile.Error = err
                    opFile.Status = 'Failed'
                elif noActiveReplicas:
                    err = "No active replica found"
                    errors[err] += 1
                    self.log.verbose("Unable to schedule transfer",
                                     "%s, %s at %s" % (opFile.LFN, err, ','.join(noActiveReplicas)))
                    opFile.Error = err
                    # All source SEs are banned, delay execution by 1 hour
                    delayExecution = 60

        if delayExecution:
            self.log.info("Delay execution of the request by %d minutes" % delayExecution)
            self.request.delayNextExecution(delayExecution)
        # Log error counts
        for error, count in errors.iteritems():
            self.log.error(error, 'for %d files' % count)

        filesToScheduleList = []
        res = self._addMetadataToFiles(toSchedule)
        if not res['OK']:
            return res
        else:
            filesToSchedule = res['Value']

            for lfn in filesToSchedule:
                # FIX: _addMetadataToFiles returns a plain {lfn: opFile} mapping
                # (see fts3Transfer, which uses it directly); the original
                # indexed it with [0] as if the values were lists, which would
                # fail at schedule time
                filesToScheduleList.append((filesToSchedule[lfn].toJSON()['Value'],
                                            toSchedule[lfn][1],
                                            toSchedule[lfn][2]))

        if filesToScheduleList:

            ftsSchedule = FTSClient().ftsSchedule(self.request.RequestID,
                                                  self.operation.OperationID,
                                                  filesToScheduleList)
            if not ftsSchedule["OK"]:
                self.log.error("Completely failed to schedule to FTS:", ftsSchedule["Message"])
                return ftsSchedule

            # might have nothing to schedule
            ftsSchedule = ftsSchedule["Value"]
            if not ftsSchedule:
                return S_OK()

            self.log.info("%d files have been scheduled to FTS" % len(ftsSchedule['Successful']))
            for opFile in self.operation:
                fileID = opFile.FileID
                if fileID in ftsSchedule["Successful"]:
                    gMonitor.addMark("FTSScheduleOK", 1)
                    opFile.Status = "Scheduled"
                    self.log.debug("%s has been scheduled for FTS" % opFile.LFN)
                elif fileID in ftsSchedule["Failed"]:
                    gMonitor.addMark("FTSScheduleFail", 1)
                    opFile.Error = ftsSchedule["Failed"][fileID]
                    if 'sourceSURL equals to targetSURL' in opFile.Error:
                        # In this case there is no need to continue
                        opFile.Status = 'Failed'
                    self.log.warn("unable to schedule %s for FTS: %s" % (opFile.LFN, opFile.Error))
        else:
            self.log.info("No files to schedule after metadata checks")

        # Just in case some transfers could not be scheduled, try them with RM
        return self.dmTransfer(fromFTS=True)

    def _checkExistingFTS3Operations(self):
        """
        Check if there are ongoing FTS3Operation for the current RMS Operation

        Under some conditions, we can be trying to schedule files while there
        is still an FTS transfer going on. This typically happens when the REA
        hangs. To prevent further race condition, we check if there are
        FTS3Operations in a non Final state matching the current operation ID.
        If so, we put the corresponding files in scheduled mode. We will then
        wait till the FTS3 Operation performs the callback

        :returns: S_OK with True if we can go on, False if we should stop the processing
        """
        res = FTS3Client().getOperationsFromRMSOpID(self.operation.OperationID)
        if not res['OK']:
            self.log.debug("Could not get FTS3Operations matching OperationID",
                           self.operation.OperationID)
            return res

        existingFTSOperations = res['Value']
        # It is ok to have FTS Operations in a final state, so we
        # care only about the others
        unfinishedFTSOperations = [ops for ops in existingFTSOperations
                                   if ops.status not in FTS3TransferOperation.FINAL_STATES]

        if not unfinishedFTSOperations:
            self.log.debug("No ongoing FTS3Operations, all good")
            return S_OK(True)

        self.log.warn("Some FTS3Operations already exist for the RMS Operation:",
                      [op.operationID for op in unfinishedFTSOperations])

        # This would really be a screwed up situation !
        if len(unfinishedFTSOperations) > 1:
            self.log.warn("That's a serious problem !!")

        # We take the rmsFileID of the files in the Operations,
        # find the corresponding File object, and set them scheduled
        rmsFileIDsToSetScheduled = set(
            [ftsFile.rmsFileID for ftsOp in unfinishedFTSOperations for ftsFile in ftsOp.ftsFiles])

        for opFile in self.operation:
            # If it is in the DB, it has a FileID
            opFileID = opFile.FileID
            if opFileID in rmsFileIDsToSetScheduled:
                self.log.warn("Setting RMSFile as already scheduled", opFileID)
                opFile.Status = "Scheduled"

        # We return here such that the Request is set back to Scheduled in the DB
        # With no further modification
        return S_OK(False)

    def fts3Transfer(self):
        """ replicate and register using FTS3 """

        self.log.info("scheduling files in FTS3...")

        # Check first if we do not have ongoing transfers
        res = self._checkExistingFTS3Operations()
        if not res['OK']:
            return res

        # if res['Value'] is False
        # it means that there are ongoing transfers
        # and we should stop here
        if res['Value'] is False:
            # return S_OK such that the request is put back
            return S_OK()

        fts3Files = []
        toSchedule = {}

        # Dict which maps the FileID to the object
        rmsFilesIds = {}

        for opFile in self.getWaitingFilesList():
            rmsFilesIds[opFile.FileID] = opFile

            opFile.Error = ''
            gMonitor.addMark("FTSScheduleAtt")
            # check replicas
            replicas = self._filterReplicas(opFile)
            if not replicas["OK"]:
                continue
            replicas = replicas["Value"]

            validReplicas = replicas["Valid"]
            noMetaReplicas = replicas["NoMetadata"]
            noReplicas = replicas['NoReplicas']
            badReplicas = replicas['Bad']
            noPFN = replicas['NoPFN']

            if validReplicas:
                validTargets = list(set(self.operation.targetSEList) - set(validReplicas))
                if not validTargets:
                    self.log.info("file %s is already present at all targets" % opFile.LFN)
                    opFile.Status = "Done"
                else:
                    toSchedule[opFile.LFN] = [opFile, validTargets]
            else:
                gMonitor.addMark("FTSScheduleFail")
                if noMetaReplicas:
                    self.log.warn("unable to schedule '%s', couldn't get metadata at %s" %
                                  (opFile.LFN, ','.join(noMetaReplicas)))
                    opFile.Error = "Couldn't get metadata"
                elif noReplicas:
                    self.log.error("Unable to schedule transfer",
                                   "File %s doesn't exist at %s" % (opFile.LFN, ','.join(noReplicas)))
                    opFile.Error = 'No replicas found'
                    opFile.Status = 'Failed'
                elif badReplicas:
                    self.log.error("Unable to schedule transfer",
                                   "File %s, all replicas have a bad checksum at %s" %
                                   (opFile.LFN, ','.join(badReplicas)))
                    opFile.Error = 'All replicas have a bad checksum'
                    opFile.Status = 'Failed'
                elif noPFN:
                    self.log.warn("unable to schedule %s, could not get a PFN at %s" %
                                  (opFile.LFN, ','.join(noPFN)))

        res = self._addMetadataToFiles(toSchedule)
        if not res['OK']:
            return res
        else:
            filesToSchedule = res['Value']

            # one FTS3File per (file, target SE) pair
            for lfn in filesToSchedule:
                opFile = filesToSchedule[lfn]
                validTargets = toSchedule[lfn][1]
                for targetSE in validTargets:
                    ftsFile = FTS3File.fromRMSFile(opFile, targetSE)
                    fts3Files.append(ftsFile)

        if fts3Files:
            res = Registry.getUsernameForDN(self.request.OwnerDN)
            if not res['OK']:
                self.log.error("Cannot get username for DN",
                               "%s %s" % (self.request.OwnerDN, res['Message']))
                return res

            username = res['Value']
            fts3Operation = FTS3TransferOperation.fromRMSObjects(self.request, self.operation, username)
            fts3Operation.ftsFiles = fts3Files

            ftsSchedule = FTS3Client().persistOperation(fts3Operation)
            if not ftsSchedule["OK"]:
                self.log.error("Completely failed to schedule to FTS3:", ftsSchedule["Message"])
                return ftsSchedule

            # might have nothing to schedule
            ftsSchedule = ftsSchedule["Value"]
            self.log.info("Scheduled with FTS3Operation id %s" % ftsSchedule)

            self.log.info("%d files have been scheduled to FTS3" % len(fts3Files))

            for ftsFile in fts3Files:
                opFile = rmsFilesIds[ftsFile.rmsFileID]
                gMonitor.addMark("FTSScheduleOK", 1)
                opFile.Status = "Scheduled"
                self.log.debug("%s has been scheduled for FTS" % opFile.LFN)
        else:
            self.log.info("No files to schedule after metadata checks")

        # Just in case some transfers could not be scheduled, try them with RM
        return self.dmTransfer(fromFTS=True)

    def dmTransfer(self, fromFTS=False):
        """ replicate and register using dataManager

        :param bool fromFTS: True when called as the fallback after FTS
                             scheduling, only changes the log message
        """
        # source SE
        sourceSE = self.operation.SourceSE if self.operation.SourceSE else None
        if sourceSE:
            # check source se for read
            bannedSource = self.checkSEsRSS(sourceSE, 'ReadAccess')
            if not bannedSource["OK"]:
                gMonitor.addMark("ReplicateAndRegisterAtt", len(self.operation))
                gMonitor.addMark("ReplicateFail", len(self.operation))
                return bannedSource

            if bannedSource["Value"]:
                self.operation.Error = "SourceSE %s is banned for reading" % sourceSE
                self.log.info(self.operation.Error)
                return S_OK(self.operation.Error)

        # check targetSEs for write
        bannedTargets = self.checkSEsRSS()
        if not bannedTargets['OK']:
            gMonitor.addMark("ReplicateAndRegisterAtt", len(self.operation))
            gMonitor.addMark("ReplicateFail", len(self.operation))
            return bannedTargets

        if bannedTargets['Value']:
            self.operation.Error = "%s targets are banned for writing" % ",".join(bannedTargets['Value'])
            return S_OK(self.operation.Error)

        # Can continue now
        self.log.verbose("No targets banned for writing")

        waitingFiles = self.getWaitingFilesList()
        if not waitingFiles:
            return S_OK()
        # loop over files
        if fromFTS:
            self.log.info("Trying transfer using replica manager as FTS failed")
        else:
            self.log.info("Transferring files using Data manager...")
        errors = defaultdict(int)
        delayExecution = 0
        for opFile in waitingFiles:
            # count files that already carry one of the known error statuses;
            # they are still retried below (their Error is reset first)
            if opFile.Error in ("Couldn't get metadata",
                                "File doesn't exist",
                                'No active replica found',
                                "All replicas have a bad checksum",):
                err = "File already in error status"
                errors[err] += 1

            gMonitor.addMark("ReplicateAndRegisterAtt", 1)
            opFile.Error = ''
            lfn = opFile.LFN

            # Check if replica is at the specified source
            replicas = self._filterReplicas(opFile)
            if not replicas["OK"]:
                self.log.error('Failed to check replicas', replicas["Message"])
                continue
            replicas = replicas["Value"]
            validReplicas = replicas.get("Valid")
            noMetaReplicas = replicas.get("NoMetadata")
            noReplicas = replicas.get('NoReplicas')
            badReplicas = replicas.get('Bad')
            noActiveReplicas = replicas.get('NoActiveReplicas')

            if not validReplicas:
                gMonitor.addMark("ReplicateFail")
                if noMetaReplicas:
                    err = "Couldn't get metadata"
                    errors[err] += 1
                    self.log.verbose(
                        "unable to replicate '%s', couldn't get metadata at %s" %
                        (opFile.LFN, ','.join(noMetaReplicas)))
                    opFile.Error = err
                elif noReplicas:
                    err = "File doesn't exist"
                    errors[err] += 1
                    self.log.verbose("Unable to replicate",
                                     "File %s doesn't exist at %s" % (opFile.LFN, ','.join(noReplicas)))
                    opFile.Error = err
                    opFile.Status = 'Failed'
                elif badReplicas:
                    err = "All replicas have a bad checksum"
                    errors[err] += 1
                    self.log.error("Unable to replicate",
                                   "%s, all replicas have a bad checksum at %s" %
                                   (opFile.LFN, ','.join(badReplicas)))
                    opFile.Error = err
                    opFile.Status = 'Failed'
                elif noActiveReplicas:
                    err = "No active replica found"
                    errors[err] += 1
                    self.log.verbose("Unable to schedule transfer",
                                     "%s, %s at %s" % (opFile.LFN, err, ','.join(noActiveReplicas)))
                    opFile.Error = err
                    # All source SEs are banned, delay execution by 1 hour
                    delayExecution = 60
                continue
            # get the first one in the list
            if sourceSE not in validReplicas:
                if sourceSE:
                    err = "File not at specified source"
                    errors[err] += 1
                    self.log.warn("%s is not at specified sourceSE %s, changed to %s" %
                                  (lfn, sourceSE, validReplicas[0]))
                # NOTE(review): this reassignment persists for the remaining
                # files of the loop — presumably intentional, but verify
                sourceSE = validReplicas[0]

            # loop over targetSE
            catalogs = self.operation.Catalog
            if catalogs:
                catalogs = [cat.strip() for cat in catalogs.split(',')]

            for targetSE in self.operation.targetSEList:
                # call DataManager
                if targetSE in validReplicas:
                    self.log.warn("Request to replicate %s to an existing location: %s" % (lfn, targetSE))
                    opFile.Status = 'Done'
                    continue
                res = self.dm.replicateAndRegister(lfn, targetSE, sourceSE=sourceSE, catalog=catalogs)
                if res["OK"]:
                    if lfn in res["Value"]["Successful"]:
                        if "replicate" in res["Value"]["Successful"][lfn]:
                            repTime = res["Value"]["Successful"][lfn]["replicate"]
                            prString = "file %s replicated at %s in %s s." % (lfn, targetSE, repTime)

                            gMonitor.addMark("ReplicateOK", 1)

                            if "register" in res["Value"]["Successful"][lfn]:
                                gMonitor.addMark("RegisterOK", 1)
                                regTime = res["Value"]["Successful"][lfn]["register"]
                                prString += ' and registered in %s s.' % regTime
                                self.log.info(prString)
                            else:
                                gMonitor.addMark("RegisterFail", 1)
                                prString += " but failed to register"
                                self.log.warn(prString)
                                opFile.Error = "Failed to register"
                                # add register replica operation
                                registerOperation = self.getRegisterOperation(
                                    opFile, targetSE, type='RegisterReplica')
                                self.request.insertAfter(registerOperation, self.operation)
                        else:
                            self.log.error("Failed to replicate", "%s to %s" % (lfn, targetSE))
                            gMonitor.addMark("ReplicateFail", 1)
                            opFile.Error = "Failed to replicate"
                    else:
                        gMonitor.addMark("ReplicateFail", 1)
                        reason = res["Value"]["Failed"][lfn]
                        self.log.error("Failed to replicate and register",
                                       "File %s at %s:" % (lfn, targetSE), reason)
                        opFile.Error = reason
                else:
                    gMonitor.addMark("ReplicateFail", 1)
                    opFile.Error = "DataManager error: %s" % res["Message"]
                    self.log.error("DataManager error", res["Message"])

            if not opFile.Error:
                if len(self.operation.targetSEList) > 1:
                    self.log.info("file %s has been replicated to all targetSEs" % lfn)
                opFile.Status = "Done"

        # Log error counts
        if delayExecution:
            self.log.info("Delay execution of the request by %d minutes" % delayExecution)
            self.request.delayNextExecution(delayExecution)
        for error, count in errors.iteritems():
            self.log.error(error, 'for %d files' % count)
        return S_OK()
# NOTE(review): this file contains more than one definition of
# ReplicateAndRegister (an older style here, a newer one above); at import
# time the definition that appears last in the file shadows the others.
class ReplicateAndRegister( DMSRequestOperationsBase ):
  """
  .. class:: ReplicateAndRegister

  ReplicateAndRegister operation handler
  """

  def __init__( self, operation = None, csPath = None ):
    """c'tor

    :param self: self reference
    :param Operation operation: Operation instance
    :param str csPath: CS path for this handler
    """
    super( ReplicateAndRegister, self ).__init__( operation, csPath )
    # # own gMonitor stuff for files
    gMonitor.registerActivity( "ReplicateAndRegisterAtt", "Replicate and register attempted",
                               "RequestExecutingAgent", "Files/min", gMonitor.OP_SUM )
    gMonitor.registerActivity( "ReplicateOK", "Replications successful",
                               "RequestExecutingAgent", "Files/min", gMonitor.OP_SUM )
    gMonitor.registerActivity( "ReplicateFail", "Replications failed",
                               "RequestExecutingAgent", "Files/min", gMonitor.OP_SUM )
    gMonitor.registerActivity( "RegisterOK", "Registrations successful",
                               "RequestExecutingAgent", "Files/min", gMonitor.OP_SUM )
    gMonitor.registerActivity( "RegisterFail", "Registrations failed",
                               "RequestExecutingAgent", "Files/min", gMonitor.OP_SUM )
    # # for FTS
    gMonitor.registerActivity( "FTSScheduleAtt", "Files schedule attempted",
                               "RequestExecutingAgent", "Files/min", gMonitor.OP_SUM )
    gMonitor.registerActivity( "FTSScheduleOK", "File schedule successful",
                               "RequestExecutingAgent", "Files/min", gMonitor.OP_SUM )
    gMonitor.registerActivity( "FTSScheduleFail", "File schedule failed",
                               "RequestExecutingAgent", "Files/min", gMonitor.OP_SUM )
    # # SE cache
    # Clients
    self.fc = FileCatalog()
    # FTS client is only created when FTS mode is configured for this handler
    if hasattr( self, "FTSMode" ) and getattr( self, "FTSMode" ):
      from DIRAC.DataManagementSystem.Client.FTSClient import FTSClient
      self.ftsClient = FTSClient()

  def __call__( self ):
    """ call me maybe """
    # # check replicas first
    checkReplicas = self.__checkReplicas()
    if not checkReplicas["OK"]:
      self.log.error( 'Failed to check replicas', checkReplicas["Message"] )
    if hasattr( self, "FTSMode" ) and getattr( self, "FTSMode" ):
      bannedGroups = getattr( self, "FTSBannedGroups" ) if hasattr( self, "FTSBannedGroups" ) else ()
      if self.request.OwnerGroup in bannedGroups:
        self.log.verbose( "usage of FTS system is banned for request's owner" )
        # banned owners fall back to direct DataManager transfers
        return self.dmTransfer()
      return self.ftsTransfer()
    return self.dmTransfer()

  def __checkReplicas( self ):
    """ check done replicas and update file states

    Marks Waiting/Scheduled files Failed when the catalog says they do not
    exist, and Done when every target SE already holds a replica.
    """
    waitingFiles = dict( [ ( opFile.LFN, opFile ) for opFile in self.operation
                           if opFile.Status in ( "Waiting", "Scheduled" ) ] )
    targetSESet = set( self.operation.targetSEList )

    replicas = self.fc.getReplicas( waitingFiles.keys() )
    if not replicas["OK"]:
      self.log.error( 'Failed to get replicas', replicas["Message"] )
      return replicas

    reMissing = re.compile( r".*such file.*" )
    for failedLFN, errStr in replicas["Value"]["Failed"].items():
      waitingFiles[failedLFN].Error = errStr
      if reMissing.search( errStr.lower() ):
        self.log.error( "File does not exists", failedLFN )
        gMonitor.addMark( "ReplicateFail", len( targetSESet ) )
        waitingFiles[failedLFN].Status = "Failed"

    for successfulLFN, reps in replicas["Value"]["Successful"].items():
      if targetSESet.issubset( set( reps ) ):
        self.log.info( "file %s has been replicated to all targets" % successfulLFN )
        waitingFiles[successfulLFN].Status = "Done"

    return S_OK()

  def _addMetadataToFiles( self, toSchedule ):
    """ Add metadata to those files that need to be scheduled through FTS

        toSchedule is a dictionary:
        {'lfn1': [opFile, validReplicas, validTargets],
         'lfn2': [opFile, validReplicas, validTargets]}

    :returns: S_OK( list of ( opFile JSON, validReplicas, validTargets ) tuples ),
              or S_OK() with no value when there is nothing to schedule
    """
    if toSchedule:
      self.log.info( "found %s files to schedule, getting metadata from FC" % len( toSchedule ) )
      lfns = toSchedule.keys()
    else:
      self.log.info( "No files to schedule" )
      return S_OK()

    res = self.fc.getFileMetadata( lfns )
    if not res['OK']:
      return res
    else:
      if res['Value']['Failed']:
        self.log.warn( "Can't schedule %d files: problems getting the metadata: %s" % ( len( res['Value']['Failed'] ),
                                                                                        ', '.join( res['Value']['Failed'] ) ) )
      metadata = res['Value']['Successful']

      filesToScheduleList = []

      for lfnsToSchedule, lfnMetadata in metadata.items():
        opFileToSchedule = toSchedule[lfnsToSchedule][0]
        opFileToSchedule.GUID = lfnMetadata['GUID']
        opFileToSchedule.Checksum = metadata[lfnsToSchedule]['Checksum']
        opFileToSchedule.ChecksumType = metadata[lfnsToSchedule]['ChecksumType']
        opFileToSchedule.Size = metadata[lfnsToSchedule]['Size']

        filesToScheduleList.append( ( opFileToSchedule.toJSON()['Value'],
                                      toSchedule[lfnsToSchedule][1],
                                      toSchedule[lfnsToSchedule][2] ) )

    return S_OK( filesToScheduleList )

  def _filterReplicas( self, opFile ):
    """ filter out banned/invalid source SEs """
    return filterReplicas( opFile, logger = self.log, dataManager = self.dm )

  def ftsTransfer( self ):
    """ replicate and register using FTS """

    self.log.info( "scheduling files in FTS..." )

    bannedTargets = self.checkSEsRSS()
    if not bannedTargets['OK']:
      gMonitor.addMark( "FTSScheduleAtt" )
      gMonitor.addMark( "FTSScheduleFail" )
      return bannedTargets

    if bannedTargets['Value']:
      return S_OK( "%s targets are banned for writing" % ",".join( bannedTargets['Value'] ) )

    # Can continue now
    self.log.verbose( "No targets banned for writing" )

    toSchedule = {}

    for opFile in self.getWaitingFilesList():
      opFile.Error = ''
      gMonitor.addMark( "FTSScheduleAtt" )
      # # check replicas
      replicas = self._filterReplicas( opFile )
      if not replicas["OK"]:
        continue
      replicas = replicas["Value"]

      validReplicas = replicas["Valid"]
      noMetaReplicas = replicas["NoMetadata"]
      noReplicas = replicas['NoReplicas']
      badReplicas = replicas['Bad']
      noPFN = replicas['NoPFN']

      if validReplicas:
        validTargets = list( set( self.operation.targetSEList ) - set( validReplicas ) )
        if not validTargets:
          self.log.info( "file %s is already present at all targets" % opFile.LFN )
          opFile.Status = "Done"
        else:
          toSchedule[opFile.LFN] = [ opFile, validReplicas, validTargets ]
      else:
        gMonitor.addMark( "FTSScheduleFail" )
        if noMetaReplicas:
          self.log.warn( "unable to schedule '%s', couldn't get metadata at %s" % ( opFile.LFN, ','.join( noMetaReplicas ) ) )
          opFile.Error = "Couldn't get metadata"
        elif noReplicas:
          self.log.error( "Unable to schedule transfer", "File %s doesn't exist at %s" % ( opFile.LFN, ','.join( noReplicas ) ) )
          opFile.Error = 'No replicas found'
          opFile.Status = 'Failed'
        elif badReplicas:
          self.log.error( "Unable to schedule transfer", "File %s, all replicas have a bad checksum at %s" % ( opFile.LFN, ','.join( badReplicas ) ) )
          opFile.Error = 'All replicas have a bad checksum'
          opFile.Status = 'Failed'
        elif noPFN:
          self.log.warn( "unable to schedule %s, could not get a PFN at %s" % ( opFile.LFN, ','.join( noPFN ) ) )

    res = self._addMetadataToFiles( toSchedule )
    if not res['OK']:
      return res
    else:
      filesToScheduleList = res['Value']

    if filesToScheduleList:

      ftsSchedule = self.ftsClient.ftsSchedule( self.request.RequestID,
                                                self.operation.OperationID,
                                                filesToScheduleList )
      if not ftsSchedule["OK"]:
        self.log.error( "Completely failed to schedule to FTS:", ftsSchedule["Message"] )
        return ftsSchedule

      # might have nothing to schedule
      ftsSchedule = ftsSchedule["Value"]
      if not ftsSchedule:
        return S_OK()

      self.log.info( "%d files have been scheduled to FTS" % len( ftsSchedule['Successful'] ) )
      for opFile in self.operation:
        fileID = opFile.FileID
        if fileID in ftsSchedule["Successful"]:
          gMonitor.addMark( "FTSScheduleOK", 1 )
          opFile.Status = "Scheduled"
          self.log.debug( "%s has been scheduled for FTS" % opFile.LFN )
        elif fileID in ftsSchedule["Failed"]:
          gMonitor.addMark( "FTSScheduleFail", 1 )
          opFile.Error = ftsSchedule["Failed"][fileID]
          if 'sourceSURL equals to targetSURL' in opFile.Error:
            # In this case there is no need to continue
            opFile.Status = 'Failed'
          self.log.warn( "unable to schedule %s for FTS: %s" % ( opFile.LFN, opFile.Error ) )
    else:
      self.log.info( "No files to schedule after metadata checks" )

    # Just in case some transfers could not be scheduled, try them with RM
    return self.dmTransfer( fromFTS = True )

  def dmTransfer( self, fromFTS = False ):
    """ replicate and register using dataManager

    :param bool fromFTS: True when called as the fallback after FTS
                         scheduling, only changes the log message
    """
    # # get waiting files. If none just return
    # # source SE
    sourceSE = self.operation.SourceSE if self.operation.SourceSE else None
    if sourceSE:
      # # check source se for read
      bannedSource = self.checkSEsRSS( sourceSE, 'ReadAccess' )
      if not bannedSource["OK"]:
        gMonitor.addMark( "ReplicateAndRegisterAtt", len( self.operation ) )
        gMonitor.addMark( "ReplicateFail", len( self.operation ) )
        return bannedSource

      if bannedSource["Value"]:
        self.operation.Error = "SourceSE %s is banned for reading" % sourceSE
        self.log.info( self.operation.Error )
        return S_OK( self.operation.Error )

    # # check targetSEs for write
    bannedTargets = self.checkSEsRSS()
    if not bannedTargets['OK']:
      gMonitor.addMark( "ReplicateAndRegisterAtt", len( self.operation ) )
      gMonitor.addMark( "ReplicateFail", len( self.operation ) )
      return bannedTargets

    if bannedTargets['Value']:
      self.operation.Error = "%s targets are banned for writing" % ",".join( bannedTargets['Value'] )
      return S_OK( self.operation.Error )

    # Can continue now
    self.log.verbose( "No targets banned for writing" )

    waitingFiles = self.getWaitingFilesList()
    if not waitingFiles:
      return S_OK()
    # # loop over files
    if fromFTS:
      self.log.info( "Trying transfer using replica manager as FTS failed" )
    else:
      self.log.info( "Transferring files using Data manager..." )
    for opFile in waitingFiles:

      gMonitor.addMark( "ReplicateAndRegisterAtt", 1 )
      opFile.Error = ''
      lfn = opFile.LFN

      # Check if replica is at the specified source
      replicas = self._filterReplicas( opFile )
      if not replicas["OK"]:
        self.log.error( 'Failed to check replicas', replicas["Message"] )
        continue
      replicas = replicas["Value"]
      validReplicas = replicas["Valid"]
      noMetaReplicas = replicas["NoMetadata"]
      noReplicas = replicas['NoReplicas']
      badReplicas = replicas['Bad']
      noPFN = replicas['NoPFN']

      if not validReplicas:
        gMonitor.addMark( "ReplicateFail" )
        if noMetaReplicas:
          self.log.warn( "unable to replicate '%s', couldn't get metadata at %s" % ( opFile.LFN, ','.join( noMetaReplicas ) ) )
          opFile.Error = "Couldn't get metadata"
        elif noReplicas:
          self.log.error( "Unable to replicate", "File %s doesn't exist at %s" % ( opFile.LFN, ','.join( noReplicas ) ) )
          opFile.Error = 'No replicas found'
          opFile.Status = 'Failed'
        elif badReplicas:
          self.log.error( "Unable to replicate", "%s, all replicas have a bad checksum at %s" % ( opFile.LFN, ','.join( badReplicas ) ) )
          opFile.Error = 'All replicas have a bad checksum'
          opFile.Status = 'Failed'
        elif noPFN:
          self.log.warn( "unable to replicate %s, could not get a PFN" % opFile.LFN )
        continue
      # # get the first one in the list
      if sourceSE not in validReplicas:
        if sourceSE:
          self.log.warn( "%s is not at specified sourceSE %s, changed to %s" % ( lfn, sourceSE, validReplicas[0] ) )
        # NOTE(review): this reassignment persists for subsequent files in the
        # loop — presumably intentional, but verify
        sourceSE = validReplicas[0]

      # # loop over targetSE
      catalogs = self.operation.Catalog
      if catalogs:
        catalogs = [ cat.strip() for cat in catalogs.split( ',' ) ]

      for targetSE in self.operation.targetSEList:

        # # call DataManager
        if targetSE in validReplicas:
          self.log.warn( "Request to replicate %s to an existing location: %s" % ( lfn, targetSE ) )
          opFile.Status = 'Done'
          continue
        res = self.dm.replicateAndRegister( lfn, targetSE, sourceSE = sourceSE, catalog = catalogs )

        if res["OK"]:

          if lfn in res["Value"]["Successful"]:

            if "replicate" in res["Value"]["Successful"][lfn]:
              repTime = res["Value"]["Successful"][lfn]["replicate"]
              prString = "file %s replicated at %s in %s s." % ( lfn, targetSE, repTime )

              gMonitor.addMark( "ReplicateOK", 1 )

              if "register" in res["Value"]["Successful"][lfn]:

                gMonitor.addMark( "RegisterOK", 1 )

                regTime = res["Value"]["Successful"][lfn]["register"]
                prString += ' and registered in %s s.' % regTime
                self.log.info( prString )
              else:

                gMonitor.addMark( "RegisterFail", 1 )

                prString += " but failed to register"
                self.log.warn( prString )

                opFile.Error = "Failed to register"

                # # add register replica operation
                registerOperation = self.getRegisterOperation( opFile, targetSE, type = 'RegisterReplica' )
                self.request.insertAfter( registerOperation, self.operation )

            else:

              self.log.error( "Failed to replicate", "%s to %s" % ( lfn, targetSE ) )
              gMonitor.addMark( "ReplicateFail", 1 )
              opFile.Error = "Failed to replicate"

          else:

            gMonitor.addMark( "ReplicateFail", 1 )
            reason = res["Value"]["Failed"][lfn]
            self.log.error( "Failed to replicate and register", "File %s at %s:" % ( lfn, targetSE ), reason )
            opFile.Error = reason

        else:

          gMonitor.addMark( "ReplicateFail", 1 )
          opFile.Error = "DataManager error: %s" % res["Message"]
          self.log.error( "DataManager error", res["Message"] )

      if not opFile.Error:
        if len( self.operation.targetSEList ) > 1:
          self.log.info( "file %s has been replicated to all targetSEs" % lfn )
        opFile.Status = "Done"

    return S_OK()
class DIRACBackend(GridBackend):
    """Grid backend using the DIRAC API and the GFAL command line tools `gfal-*`.

    Logical paths (``lurl``) are resolved through the DIRAC File Catalog;
    physical replicas (``surl``) are accessed via baked `sh` wrappers around
    the ``gfal-*`` utilities and two ``dirac-dms-*`` scripts.
    """

    def __init__(self, **kwargs):
        """Initialise the DIRAC clients and bake the external commands.

        :raises BackendException: if no valid user proxy is available.
        """
        GridBackend.__init__(self, catalogue_prefix='', **kwargs)

        # DIRAC must be initialised via its Script machinery before any client
        # can be created; import lazily so this module can be imported on
        # systems without a DIRAC installation.
        from DIRAC.Core.Base import Script
        Script.initialize()

        from DIRAC.FrameworkSystem.Client.ProxyManagerClient import ProxyManagerClient
        self.pm = ProxyManagerClient()

        # Fail early if there is no usable proxy.
        proxy = self.pm.getUserProxiesInfo()
        if not proxy['OK']:
            raise BackendException("Proxy error.")

        from DIRAC.Interfaces.API.Dirac import Dirac
        self.dirac = Dirac()
        from DIRAC.Resources.Catalog.FileCatalog import FileCatalog
        self.fc = FileCatalog()
        from DIRAC.DataManagementSystem.Client.DataManager import DataManager
        self.dm = DataManager()

        # Pre-baked shell commands (`_tty_out=False`: plain pipe output).
        self._xattr_cmd = sh.Command('gfal-xattr').bake(_tty_out=False)
        self._replica_checksum_cmd = sh.Command('gfal-sum').bake(_tty_out=False)
        self._bringonline_cmd = sh.Command('gfal-legacy-bringonline').bake(_tty_out=False)
        self._cp_cmd = sh.Command('gfal-copy').bake(_tty_out=False)
        self._ls_se_cmd = sh.Command('gfal-ls').bake(color='never', _tty_out=False)
        self._move_cmd = sh.Command('gfal-rename').bake(_tty_out=False)
        self._mkdir_cmd = sh.Command('gfal-mkdir').bake(_tty_out=False)

        self._replicate_cmd = sh.Command('dirac-dms-replicate-lfn').bake(_tty_out=False)
        self._add_cmd = sh.Command('dirac-dms-add-file').bake(_tty_out=False)

    @staticmethod
    def _check_return_value(ret):
        """Raise the appropriate exception if a DIRAC call did not succeed.

        :raises DoesNotExistException: when the per-path error says the
            file/directory is missing.
        :raises BackendException: for any other failure.
        """
        if not ret['OK']:
            # BUGFIX: the message must be %-formatted; previously the format
            # string and the argument were passed as two exception arguments.
            raise BackendException("Failed: %s" % (ret['Message'],))
        for path, error in ret['Value']['Failed'].items():
            if ('No such' in error) or ('Directory does not' in error):
                raise DoesNotExistException("No such file or directory.")
            else:
                raise BackendException(error)

    def _is_dir(self, lurl):
        """Return True if the logical path is a directory in the catalogue."""
        isdir = self.fc.isDirectory(lurl)
        self._check_return_value(isdir)
        return isdir['Value']['Successful'][lurl]

    def _is_file(self, lurl):
        """Return True if the logical path is a file in the catalogue."""
        isfile = self.fc.isFile(lurl)
        self._check_return_value(isfile)
        return isfile['Value']['Successful'][lurl]

    def _get_dir_entry(self, lurl, infodict=None):
        """Take a lurl and return a DirEntry."""
        # If no dictionary with the information is specified, get it from the
        # catalogue. `infodict[...]` raises TypeError when infodict is None.
        try:
            md = infodict['MetaData']
        except TypeError:
            md = self.fc.getFileMetadata(lurl)
            if not md['OK']:
                # BUGFIX: %-format the message (was passed as extra args).
                raise BackendException("Failed to list path '%s': %s" % (lurl, md['Message']))
            for path, error in md['Value']['Failed'].items():
                if 'No such file' in error:
                    # File does not exist, maybe a directory?
                    md = self.fc.getDirectoryMetadata(lurl)
                    for path, error in md['Value']['Failed'].items():
                        raise DoesNotExistException("No such file or directory.")
                else:
                    raise BackendException(md['Value']['Failed'][lurl])
            md = md['Value']['Successful'][lurl]
        return DirEntry(posixpath.basename(lurl),
                        mode=oct(md.get('Mode', -1)),
                        links=md.get('links', -1),
                        gid=md['OwnerGroup'],
                        uid=md['Owner'],
                        size=md.get('Size', -1),
                        modified=str(md.get('ModificationDate', '?')))

    def _iter_directory(self, lurl):
        """Iterate over entries in a directory."""
        ret = self.fc.listDirectory(lurl)
        if not ret['OK']:
            # BUGFIX: %-format the message (was passed as extra args).
            raise BackendException("Failed to list path '%s': %s" % (lurl, ret['Message']))
        for path, error in ret['Value']['Failed'].items():
            if 'Directory does not' in error:
                # Dir does not exist, maybe a File?
                # BUGFIX: `self.fc.isFile(lurl)` returned the (always truthy)
                # result dict; use the boolean helper instead.
                if self._is_file(lurl):
                    lst = [(lurl, None)]
                    break
                else:
                    raise DoesNotExistException("No such file or Directory.")
            else:
                raise BackendException(ret['Value']['Failed'][lurl])
        else:
            # Sort items by keys, i.e. paths. (list() around .items() keeps
            # the concatenation valid on both Python 2 and 3.)
            successful = ret['Value']['Successful'][lurl]
            lst = sorted(list(successful['Files'].items())
                         + list(successful['SubDirs'].items()))
        for item in lst:
            yield item  # = path, dict

    def _ls(self, lurl, **kwargs):
        """Yield DirEntry objects for a logical path."""
        # Translate keyword arguments
        d = kwargs.pop('directory', False)
        if d:
            # Just the requested entry itself
            yield self._get_dir_entry(lurl)
            return
        for path, info in self._iter_directory(lurl):
            yield self._get_dir_entry(path, info)

    def _ls_se(self, surl, **kwargs):
        """Yield DirEntry objects for a physical path, via `gfal-ls -l`."""
        # Translate keyword arguments
        d = kwargs.pop('directory', False)
        args = []
        # BUGFIX: was `if -d:`, which only worked by accident of int truthiness.
        if d:
            args.append('-d')
        args.append('-l')
        args.append(surl)
        try:
            output = self._ls_se_cmd(*args, **kwargs)
        except sh.ErrorReturnCode as e:
            if 'No such file' in e.stderr:
                raise DoesNotExistException("No such file or Directory.")
            else:
                raise BackendException(e.stderr)
        for line in output:
            fields = line.split()
            mode, links, gid, uid, size = fields[:5]
            name = fields[-1]
            # Everything between size and name is the modification date.
            modified = ' '.join(fields[5:-1])
            yield DirEntry(name, mode=mode, links=int(links), gid=gid, uid=uid,
                           size=int(size), modified=modified)

    def _replicas(self, lurl, **kwargs):
        """Return the replica surls of a logical file."""
        # Check the lurl actually exists.
        # BUGFIX: `_ls` is a generator; it must be consumed for the check to
        # run (previously the generator was created but never iterated).
        list(self._ls(lurl, directory=True))
        rep = self.dirac.getReplicas(lurl)
        self._check_return_value(rep)
        rep = rep['Value']['Successful'][lurl]
        return rep.values()

    def _exists(self, surl, **kwargs):
        """Return True if the surl refers to an existing (non-directory) entry."""
        try:
            ret = self._ls_se_cmd(surl, '-d', '-l', **kwargs).strip()
        except sh.ErrorReturnCode as e:
            if 'No such file' in e.stderr:
                return False
            else:
                if len(e.stderr) == 0:
                    raise BackendException(e.stdout)
                else:
                    raise BackendException(e.stderr)
        else:
            return ret[0] != 'd'  # Return `False` for directories

    def _register(self, surl, lurl, verbose=False, **kwargs):
        """Register an existing physical copy in the file catalogue."""
        se = storage.get_SE(surl).name
        # See if file already exists in DFC
        ret = self.fc.getFileMetadata(lurl)
        try:
            self._check_return_value(ret)
        except DoesNotExistException:
            # Add new file
            size = next(self._ls_se(surl, directory=True)).size
            checksum = self.checksum(surl)
            # The guid does not seem to be important. Make it unique if possible.
            guid = str(uuid.uuid4())
            ret = self.dm.registerFile((lurl, surl, size, se, guid, checksum))
        else:
            # Add new replica
            ret = self.dm.registerReplica((lurl, surl, se))
        self._check_return_value(ret)
        if verbose:
            print_("Successfully registered replica %s of %s from %s."%(surl, lurl, se))
        return True

    def _deregister(self, surl, lurl, verbose=False, **kwargs):
        """Remove a replica entry from the catalogue (file stays on the SE)."""
        # DIRAC only needs to know the SE name to deregister a replica
        se = storage.get_SE(surl).name
        ret = self.dm.removeReplicaFromCatalog(se, [lurl])
        self._check_return_value(ret)
        if verbose:
            print_("Successfully deregistered replica of %s from %s."%(lurl, se))
        return True

    def _state(self, surl, **kwargs):
        """Return the replica state attribute (e.g. online/nearline), '?' if unknown."""
        try:
            state = self._xattr_cmd(surl, 'user.status', **kwargs).strip()
        except sh.ErrorReturnCode as e:
            if "No such file" in e.stderr:
                raise DoesNotExistException("No such file or Directory.")
            state = '?'
        except sh.SignalException_SIGSEGV:
            # gfal-xattr occasionally segfaults; treat as unknown state.
            state = '?'
        return state

    def _checksum(self, surl, **kwargs):
        """Return the ADLER32 checksum of a replica, or '?' on any failure."""
        try:
            checksum = self._replica_checksum_cmd(surl, 'ADLER32', **kwargs).split()[1]
        except sh.ErrorReturnCode:
            checksum = '?'
        except sh.SignalException_SIGSEGV:
            checksum = '?'
        except IndexError:
            # Output did not contain the expected "<path> <checksum>" pair.
            checksum = '?'
        return checksum

    def _bringonline(self, surl, timeout, verbose=False, **kwargs):
        """Request a tape replica online and poll until online or `timeout` s."""
        # (De-duplicated: the verbose/out selection appeared twice before.)
        out = sys.stdout if verbose else None
        # gfal does not notice when files come online, it seems
        # Just send a single short request, then check regularly
        end = time.time() + timeout
        try:
            self._bringonline_cmd('-t', 10, surl, _out=out, **kwargs)
        except sh.ErrorReturnCode as e:
            # The command fails if the file is not online
            # To be expected after 10 seconds
            if "No such file" in e.stderr:
                # Except when the file does not actually exist on the tape storage
                raise DoesNotExistException("No such file or Directory.")

        # Exponential back-off while polling, capped by the remaining time.
        wait = 5
        while True:
            if verbose:
                print_("Checking replica state...")
            if self.is_online(surl):
                if verbose:
                    print_("Replica brought online.")
                return True
            time_left = end - time.time()
            if time_left <= 0:
                if verbose:
                    print_("Could not bring replica online.")
                return False
            wait *= 2
            if time_left < wait:
                wait = time_left
            if verbose:
                print_("Timeout remaining: %d s"%(time_left))
                print_("Checking again in: %d s"%(wait))
            time.sleep(wait)

    def _replicate(self, source_surl, destination_surl, lurl, verbose=False, **kwargs):
        """Replicate `lurl` from the source SE to the destination SE."""
        out = sys.stdout if verbose else None
        source = storage.get_SE(source_surl).name
        destination = storage.get_SE(destination_surl).name
        try:
            self._replicate_cmd(lurl, destination, source, _out=out, **kwargs)
        except sh.ErrorReturnCode as e:
            if 'No such file' in e.stderr:
                raise DoesNotExistException("No such file or directory.")
            else:
                if len(e.stderr) == 0:
                    raise BackendException(e.stdout)
                else:
                    raise BackendException(e.stderr)
        return True

    def _get(self, surl, localpath, verbose=False, **kwargs):
        """Download a replica to `localpath`, verifying the ADLER32 checksum."""
        out = sys.stdout if verbose else None
        try:
            self._cp_cmd('-f', '--checksum', 'ADLER32', surl, localpath, _out=out, **kwargs)
        except sh.ErrorReturnCode as e:
            if 'No such file' in e.stderr:
                raise DoesNotExistException("No such file or directory.")
            else:
                if len(e.stderr) == 0:
                    raise BackendException(e.stdout)
                else:
                    raise BackendException(e.stderr)
        return os.path.isfile(localpath)

    def _put(self, localpath, surl, lurl, verbose=False, **kwargs):
        """Upload a local file to the SE of `surl` and register it as `lurl`."""
        out = sys.stdout if verbose else None
        se = storage.get_SE(surl).name
        try:
            self._add_cmd(lurl, localpath, se, _out=out, **kwargs)
        except sh.ErrorReturnCode as e:
            if 'No such file' in e.stderr:
                raise DoesNotExistException("No such file or directory.")
            else:
                if len(e.stderr) == 0:
                    raise BackendException(e.stdout)
                else:
                    raise BackendException(e.stderr)
        return True

    def _remove(self, surl, lurl, last=False, verbose=False, **kwargs):
        """Remove one replica, or the whole logical file when `last` is True."""
        se = storage.get_SE(surl).name
        if last:
            # Delete lfn
            if verbose:
                print_("Removing all replicas of %s."%(lurl,))
            ret = self.dm.removeFile([lurl])
        else:
            if verbose:
                print_("Removing replica of %s from %s."%(lurl, se))
            ret = self.dm.removeReplica(se, [lurl])
        if not ret['OK']:
            raise BackendException('Failed: %s'%(ret['Message']))
        for path, error in ret['Value']['Failed'].items():
            if 'No such file' in error:
                raise DoesNotExistException("No such file or directory.")
            else:
                raise BackendException(error)
        return True

    def _rmdir(self, lurl, verbose=False):
        """Remove an empty directory from the catalogue."""
        rep = self.fc.removeDirectory(lurl)
        self._check_return_value(rep)
        return True

    def _move_replica(self, surl, new_surl, verbose=False, **kwargs):
        """Physically move a replica on its SE (mkdir -p target, then rename)."""
        out = sys.stdout if verbose else None
        try:
            folder = posixpath.dirname(new_surl)
            self._mkdir_cmd(folder, '-p', _out=out, **kwargs)
            self._move_cmd(surl, new_surl, _out=out, **kwargs)
        except sh.ErrorReturnCode as e:
            if 'No such file' in e.stderr:
                raise DoesNotExistException("No such file or directory.")
            else:
                if len(e.stderr) == 0:
                    raise BackendException(e.stdout)
                else:
                    raise BackendException(e.stderr)
        return True