def initialize(self):
    """Set agent defaults: shifter proxy, service clients and the list of
    transformation types this agent will consider.
    """
    # The agent acts with the production manager shifter proxy
    self.am_setOption('shifterProxy', 'ProductionManager')
    self.transClient = TransformationClient()
    self.reqClient = ReqClient()
    self.consChecks = ConsistencyChecks(interactive=False, transClient=self.transClient)
    # Data-processing transformation types, excluding the extendable ones
    # (default exclusion: MCSimulation)
    dataProcessingTypes = Operations().getValue('Transformations/DataProcessing', [])
    extendableTypes = Operations().getValue('Transformations/ExtendableTransfTypes', ['MCSimulation'])
    self.transformationTypes = list(set(dataProcessingTypes) - set(extendableTypes))
    return S_OK()
def setUp(self):
    """Build a ConsistencyChecks instance wired to mocked clients."""
    # Canned data-manager reply: one successful replica lookup
    replicasReply = {
        'OK': True,
        'Value': {'Successful': {'bb.raw': 'metadataPippo'},
                  'Failed': {}},
    }
    self.dmMock = Mock()
    self.dmMock.getReplicas.return_value = replicasReply
    self.cc = ConsistencyChecks(transClient=Mock(), dm=self.dmMock, bkClient=bkc_mock)
    self.cc.fileTypesExcluded = ['LOG']
    self.cc.prod = 0
    # Fixture file-type lists reused across the test cases
    self.fileTypes = [
        ['SEMILEPTONIC.DST', 'LOG', 'RAW'],
        ['SEMILEPTONIC.DST', 'LOG', 'RAW'],
        ['SEMILEPTONIC.DST'],
        ['SEMILEPTONIC.DST'],
    ]
    # Show full diffs on assertion failures
    self.maxDiff = None
# Script body: checks consistency of files at given SEs against the FC,
# optionally fixing what is found (--FixIt).
Script.registerSwitch('', 'FixIt', '   Take action to fix the catalogs')
Script.setUsageMessage('\n'.join([__doc__, 'Usage:',
                                  '  %s [option|cfgfile] [values]' % Script.scriptName, ]))
dmScript = DMScript()
dmScript.registerDMSwitches()  # Directory
Script.parseCommandLine(ignoreErrors=True)

fixIt = False
for opt, val in Script.getUnprocessedSwitches():
    if opt == 'FixIt':
        fixIt = True

# imports (delayed until after command-line parsing, as is usual in DIRAC scripts)
from DIRAC import gLogger
from LHCbDIRAC.DataManagementSystem.Client.ConsistencyChecks import ConsistencyChecks
cc = ConsistencyChecks()
cc.directories = dmScript.getOption('Directory', [])
# LFNs may come from the --LFNs switch or as (comma-separated) positional arguments
cc.lfns = dmScript.getOption('LFNs', []) + [lfn for arg in Script.getPositionalArgs() for lfn in arg.split(',')]
bkQuery = dmScript.getBKQuery(visible='All')
# Only attach the BK query if the user actually constrained it
if bkQuery.getQueryDict() != {'Visible': 'All'}:
    bkQuery.setOption('ReplicaFlag', 'All')
    cc.bkQuery = bkQuery
seList = dmScript.getOption('SEs', [])
if not seList:
    # Default: check the Tier1 archive SEs
    dmScript.setSEs('Tier1-Archive')
    seList = dmScript.getOption('SEs', [])

from LHCbDIRAC.DataManagementSystem.Client.CheckExecutors import doCheckSE
doCheckSE(cc, seList, fixIt)
def __init__(self):
    """ Extending DIRAC's DIRACDataIntegrityClient init """
    super(DataIntegrityClient, self).__init__()
    # LHCb-specific consistency-checking helper (catalog/BK metadata accessors)
    self.cc = ConsistencyChecks()
class DataIntegrityClient(DIRACDataIntegrityClient):
    """LHCb extension of the DIRAC DataIntegrityClient.

    Adds Bookkeeping (BK) <-> File Catalog (FC) cross-checks, FC <-> SE
    physical-file checks, and the resolution methods for the corresponding
    integrity-DB prognoses.

    NOTE(review): calls such as ``self.__returnProblematicError`` and
    ``self.__updateCompletedFiles`` rely on the base class also being named
    ``DataIntegrityClient`` so that name mangling resolves — confirm against
    the DIRAC base class.
    """

    def __init__(self):
        """ Extending DIRAC's DIRACDataIntegrityClient init """
        super(DataIntegrityClient, self).__init__()
        # LHCb-specific consistency-checking helper
        self.cc = ConsistencyChecks()

    ##########################################################################
    #
    # This section contains the specific methods for BK->FC checks
    #

    def productionToCatalog(self, productionID):
        """Obtain the file information from the BK for a production and check
        that those files are present in the FC.

        :param productionID: production to check
        :returns: S_OK with {'CatalogMetadata': ..., 'CatalogReplicas': ...}
        """
        gLogger.info("-" * 40)
        gLogger.info("Performing the BK->FC check")
        gLogger.info("-" * 40)
        res = self.__getProductionFiles(productionID)
        if not res['OK']:
            return res
        noReplicaFiles = res['Value']['GotReplicaNo']
        yesReplicaFiles = res['Value']['GotReplicaYes']
        # For the files marked as existing we perform the catalog check
        res = self.cc._getCatalogMetadata(yesReplicaFiles)
        if not res['OK']:
            return res
        catalogMetadata, missingCatalogFiles, zeroSizeFiles = res['Value']
        if missingCatalogFiles:
            self._reportProblematicFiles(missingCatalogFiles, 'LFNCatalogMissing')
        if zeroSizeFiles:
            self._reportProblematicFiles(zeroSizeFiles, 'LFNZeroSize')
        # Try and get the metadata for files that shouldn't exist in the catalog
        if noReplicaFiles:
            res = self.__checkCatalogForBKNoReplicas(noReplicaFiles)
            if not res['OK']:
                return res
            catalogMetadata.update(res['Value'])
        # Get the replicas for the files found to exist in the catalog
        res = self.cc._getCatalogReplicas(catalogMetadata.keys())
        if not res['OK']:
            return res
        replicas, zeroReplicaFiles = res['Value']
        if zeroReplicaFiles:
            self._reportProblematicFiles(zeroReplicaFiles, 'LFNZeroReplicas')
        resDict = {'CatalogMetadata': catalogMetadata, 'CatalogReplicas': replicas}
        return S_OK(resDict)

    def __checkCatalogForBKNoReplicas(self, lfns):
        """Check the catalog existence for files flagged 'GotReplica = No' in the BK.

        Files that DO exist in the catalog are reported as 'BKReplicaNo'.
        """
        gLogger.info('Checking the catalog existence of %s files' % len(lfns))
        res = self.fc.getFileMetadata(lfns)
        if not res['OK']:
            gLogger.error('Failed to get catalog metadata', res['Message'])
            return res
        allMetadata = res['Value']['Successful']
        existingCatalogFiles = allMetadata.keys()
        if existingCatalogFiles:
            self._reportProblematicFiles(existingCatalogFiles, 'BKReplicaNo')
        gLogger.info('Checking the catalog existence of files complete')
        return S_OK(allMetadata)

    def __getProductionFiles(self, productionID):
        """Query the bookkeeping and obtain the file metadata for the given
        production, classifying files by their BK replica flag and reporting
        inconsistent BK entries (bad replica flag, missing size or GUID).
        """
        from DIRAC.Core.DISET.RPCClient import RPCClient
        gLogger.info("Attempting to get files for production %s" % productionID)
        bk = RPCClient('Bookkeeping/BookkeepingManager')
        res = bk.getProductionFiles(productionID, 'ALL')
        if not res['OK']:
            return res
        yesReplicaFiles = []
        noReplicaFiles = []
        badReplicaFiles = []
        badBKFileSize = []
        badBKGUID = []
        allMetadata = res['Value']
        gLogger.info("Obtained at total of %s files" % len(allMetadata.keys()))
        totalSize = 0
        for lfn, bkMetadata in allMetadata.iteritems():
            # LOG files are not considered for the integrity checks
            if bkMetadata['FileType'] != 'LOG':
                if bkMetadata['GotReplica'] == 'Yes':
                    yesReplicaFiles.append(lfn)
                    if bkMetadata['FileSize']:
                        totalSize += long(bkMetadata['FileSize'])
                elif bkMetadata['GotReplica'] == 'No':
                    noReplicaFiles.append(lfn)
                else:
                    # Replica flag is neither Yes nor No: a bad BK entry
                    badReplicaFiles.append(lfn)
                if not bkMetadata['FileSize']:
                    badBKFileSize.append(lfn)
                if not bkMetadata['GUID']:
                    badBKGUID.append(lfn)
        if badReplicaFiles:
            self._reportProblematicFiles(badReplicaFiles, 'BKReplicaBad')
        if badBKFileSize:
            self._reportProblematicFiles(badBKFileSize, 'BKSizeBad')
        if badBKGUID:
            self._reportProblematicFiles(badBKGUID, 'BKGUIDBad')
        gLogger.info("%s files marked with replicas with total size %s bytes" %
                     (len(yesReplicaFiles), totalSize))
        gLogger.info("%s files marked without replicas" % len(noReplicaFiles))
        resDict = {'BKMetadata': allMetadata,
                   'GotReplicaYes': yesReplicaFiles,
                   'GotReplicaNo': noReplicaFiles}
        return S_OK(resDict)

    ##########################################################################
    #
    # This section contains the specific methods for FC->BK checks
    #

    def catalogDirectoryToBK(self, lfnDir):
        """Obtain the replica and metadata information from the catalog for the
        supplied directory (or list of directories) and check against the BK.
        """
        gLogger.info("-" * 40)
        gLogger.info("Performing the FC->BK check")
        gLogger.info("-" * 40)
        if isinstance(lfnDir, basestring):
            lfnDir = [lfnDir]
        res = self.__getCatalogDirectoryContents(lfnDir)
        if not res['OK']:
            return res
        replicas = res['Value']['Replicas']
        catalogMetadata = res['Value']['Metadata']
        resDict = {'CatalogMetadata': catalogMetadata, 'CatalogReplicas': replicas}
        if not catalogMetadata:
            gLogger.warn('No files found in directory %s' % lfnDir)
            return S_OK(resDict)
        # Idiom fix: was an explicit append loop over the dict keys
        lfns = list(replicas)
        missingLFNs, noFlagLFNs, _okLFNs = self.cc._getBKMetadata(lfns)
        if missingLFNs:
            self._reportProblematicFiles(missingLFNs, 'LFNBKMissing')
        if noFlagLFNs:
            self._reportProblematicFiles(noFlagLFNs, 'BKReplicaNo')
        return S_OK(resDict)

    def catalogFileToBK(self, lfns):
        """Obtain the replica and metadata information from the catalog for the
        supplied LFN(s) and check against the BK.
        """
        gLogger.info("-" * 40)
        gLogger.info("Performing the FC->BK check")
        gLogger.info("-" * 40)
        # Consistency fix: same string-type test as catalogDirectoryToBK
        # (was: type(lfns) in types.StringTypes)
        if isinstance(lfns, basestring):
            lfns = [lfns]
        res = self.cc._getCatalogMetadata(lfns)
        if not res['OK']:
            return res
        catalogMetadata, missingCatalogFiles, zeroSizeFiles = res['Value']
        if missingCatalogFiles:
            self._reportProblematicFiles(missingCatalogFiles, 'LFNCatalogMissing')
        if zeroSizeFiles:
            self._reportProblematicFiles(zeroSizeFiles, 'LFNZeroSize')
        res = self.cc._getCatalogReplicas(catalogMetadata.keys())
        if not res['OK']:
            return res
        replicas, _zeroReplicaFiles = res['Value']
        lfns = list(replicas)
        missingLFNs, noFlagLFNs, _okLFNs = self.cc._getBKMetadata(lfns)
        if missingLFNs:
            self._reportProblematicFiles(missingLFNs, 'LFNBKMissing')
        if noFlagLFNs:
            self._reportProblematicFiles(noFlagLFNs, 'BKReplicaNo')
        resDict = {'CatalogMetadata': catalogMetadata, 'CatalogReplicas': replicas}
        return S_OK(resDict)

    ##########################################################################
    #
    # This section contains the resolution methods for various prognoses
    #

    def resolveBKReplicaYes(self, problematicDict):
        """Resolve the BKReplicaYes prognosis from the integrity DB: if the
        file does not exist in the catalog (or has no replicas), remove it
        from the BK.
        """
        lfn = problematicDict['LFN']
        fileID = problematicDict['FileID']
        res = returnSingleResult(self.fc.exists(lfn))
        if not res['OK']:
            return self.__returnProblematicError(fileID, res)
        removeBKFile = False
        # If the file does not exist in the catalog
        if not res['Value']:
            gLogger.info("BKReplicaYes file (%d) does not exist in the catalog. Removing..." % fileID)
            removeBKFile = True
        else:
            gLogger.info("BKReplicaYes file (%d) found to exist in the catalog" % fileID)
            # If the file has no replicas in the catalog
            res = returnSingleResult(self.fc.getReplicas(lfn))
            if (not res['OK']) and (res['Message'] == 'File has zero replicas'):
                gLogger.info("BKReplicaYes file (%d) found to exist without replicas. Removing..." % fileID)
                removeBKFile = True
        if removeBKFile:
            # Remove the file from the BK because it does not exist
            res = returnSingleResult(FileCatalog(catalogs=['BookkeepingDB']).removeFile(lfn))
            if not res['OK']:
                return self.__returnProblematicError(fileID, res)
            gLogger.info("BKReplicaYes file (%d) removed from bookkeeping" % fileID)
        return self.__updateCompletedFiles('BKReplicaYes', fileID)

    def resolveBKReplicaNo(self, problematicDict):
        """Resolve the BKReplicaNo prognosis from the integrity DB: if the file
        exists in the catalog and has replicas, set its BK replica flag.
        """
        lfn = problematicDict['LFN']
        fileID = problematicDict['FileID']
        res = returnSingleResult(self.fc.exists(lfn))
        if not res['OK']:
            return self.__returnProblematicError(fileID, res)
        # If the file does not exist in the catalog the prognosis is resolved
        if not res['Value']:
            return self.__updateCompletedFiles('BKReplicaNo', fileID)
        gLogger.info("BKReplicaNo file (%d) found to exist in the catalog" % fileID)
        # ... and has available replicas
        # Consistency fix: was self.fc.getCatalogReplicas(lfn); every other
        # method in this class uses FileCatalog.getReplicas()
        res = returnSingleResult(self.fc.getReplicas(lfn))
        if not res['OK']:
            return self.__returnProblematicError(fileID, res)
        if not res['Value']:
            gLogger.info("BKReplicaNo file (%d) found to have no replicas" % fileID)
            return self.changeProblematicPrognosis(fileID, 'LFNZeroReplicas')
        gLogger.info("BKReplicaNo file (%d) found to have replicas" % fileID)
        res = returnSingleResult(FileCatalog(catalogs=['BookkeepingDB']).addFile(lfn))
        if not res['OK']:
            return self.__returnProblematicError(fileID, res)
        return self.__updateCompletedFiles('BKReplicaNo', fileID)

    def checkPhysicalFiles(self, replicas, catalogMetadata, ses=None, fixIt=False):
        """Take the supplied replica and metadata information obtained from the
        catalog and check it against the storage elements.

        :param replicas: dict {lfn: {se: pfn}}
        :param catalogMetadata: dict {lfn: metadata} with 'Size' and 'Checksum'
        :param ses: optional list of SEs to restrict the check to
        :param fixIt: whether to register problems in the integrity DB
        """
        # Fix: mutable default argument (was ses=[])
        if ses is None:
            ses = []
        gLogger.info("-" * 40)
        gLogger.info("Performing the FC->SE check")
        gLogger.info("-" * 40)
        seLfns = {}
        for lfn, replicaDict in replicas.iteritems():
            for se in replicaDict:
                if ses and se not in ses:
                    continue
                seLfns.setdefault(se, []).append(lfn)
        gLogger.info('%s %s' % ('Storage Element'.ljust(20), 'Replicas'.rjust(20)))
        for se in sorted(seLfns):
            lfns = seLfns[se]
            sizeMismatch = []
            checksumMismatch = []
            checksumBadInFC = []
            res = self.__checkPhysicalFileMetadata(lfns, se)
            if not res['OK']:
                gLogger.error('Failed to get physical file metadata.', res['Message'])
                return res
            for lfn, metadata in res['Value'].iteritems():
                if lfn in catalogMetadata:
                    # Size 0 on storage is reported separately (PFNZeroSize)
                    if (metadata['Size'] != catalogMetadata[lfn]['Size']) and (metadata['Size'] != 0):
                        sizeMismatch.append((lfn, 'deprecatedUrl', se, 'CatalogPFNSizeMismatch'))
                    if metadata['Checksum'] != catalogMetadata[lfn]['Checksum']:
                        # Checksums that differ only by 'x' vs '0' are assumed
                        # fixable in the FC rather than a real mismatch
                        if metadata['Checksum'].replace('x', '0') == \
                                catalogMetadata[lfn]['Checksum'].replace('x', '0'):
                            checksumBadInFC.append((lfn, 'deprecatedUrl', se, "%s %s" %
                                                    (metadata['Checksum'], catalogMetadata[lfn]['Checksum'])))
                        else:
                            checksumMismatch.append((lfn, 'deprecatedUrl', se, "%s %s" %
                                                     (metadata['Checksum'], catalogMetadata[lfn]['Checksum'])))
            if sizeMismatch:
                self.reportProblematicReplicas(sizeMismatch, se, 'CatalogPFNSizeMismatch', fixIt=fixIt)
            if checksumMismatch:
                self.reportProblematicReplicas(checksumMismatch, se, 'CatalogChecksumMismatch', fixIt=fixIt)
            if checksumBadInFC:
                self.reportProblematicReplicas(checksumBadInFC, se, 'CatalogChecksumToBeFixed', fixIt=fixIt)
        return S_OK()

    def __checkPhysicalFileMetadata(self, lfns, se):
        """Obtain the physical file metadata from the SE and check that the
        files are available (not missing, lost, unavailable or zero-size).
        """
        gLogger.info('Checking the integrity of %s physical files at %s' % (len(lfns), se))
        res = StorageElement(se).getFileMetadata(lfns)
        if not res['OK']:
            gLogger.error('Failed to get metadata for lfns.', res['Message'])
            return res
        pfnMetadataDict = res['Value']['Successful']
        # If the replicas are completely missing
        missingReplicas = []
        for lfn, reason in res['Value']['Failed'].iteritems():
            if re.search('File does not exist', reason):
                missingReplicas.append((lfn, 'deprecatedUrl', se, 'PFNMissing'))
        if missingReplicas:
            self.reportProblematicReplicas(missingReplicas, se, 'PFNMissing')
        lostReplicas = []
        unavailableReplicas = []
        zeroSizeReplicas = []
        # If the files are not accessible
        for lfn, metadata in pfnMetadataDict.iteritems():
            if metadata.get('Lost', False):
                # BUGFIX: was a 3-tuple (lfn, se, 'PFNLost'), but
                # reportProblematicReplicas unpacks 4-tuples like all the others
                lostReplicas.append((lfn, 'deprecatedUrl', se, 'PFNLost'))
            if metadata.get('Unavailable', not metadata['Accessible']):
                unavailableReplicas.append((lfn, 'deprecatedUrl', se, 'PFNUnavailable'))
            if metadata['Size'] == 0:
                zeroSizeReplicas.append((lfn, 'deprecatedUrl', se, 'PFNZeroSize'))
        if lostReplicas:
            self.reportProblematicReplicas(lostReplicas, se, 'PFNLost')
        if unavailableReplicas:
            self.reportProblematicReplicas(unavailableReplicas, se, 'PFNUnavailable')
        if zeroSizeReplicas:
            self.reportProblematicReplicas(zeroSizeReplicas, se, 'PFNZeroSize')
        gLogger.info('Checking the integrity of physical files at %s complete' % se)
        return S_OK(pfnMetadataDict)

    def reportProblematicReplicas(self, replicaTuple, se, reason, fixIt=False):
        """Simple wrapper function around setReplicaProblematic.

        :param replicaTuple: list of (lfn, pfn, se, reason) 4-tuples
        """
        gLogger.info('The following %s files had %s at %s' % (len(replicaTuple), reason, se))
        # Loop variable renamed (was 'se') to avoid shadowing the parameter
        for lfn, pfn, _se, reason1 in sorted(replicaTuple):
            if reason1 == reason:
                reason1 = ''
            if lfn:
                gLogger.info(lfn, reason1)
            else:
                gLogger.info(pfn, reason1)
        if fixIt:
            res = self.setReplicaProblematic(replicaTuple, sourceComponent='DataIntegrityClient')
            if not res['OK']:
                gLogger.info('Failed to update integrity DB with replicas', res['Message'])
            else:
                gLogger.info('Successfully updated integrity DB with replicas')

    ##########################################################################
    #
    # This section contains the specific methods for obtaining replica and
    # metadata information from the catalog
    #

    def __getCatalogDirectoryContents(self, lfnDir):
        """Obtain the contents (recursively) of the supplied directories,
        returning per-LFN replica and metadata dictionaries and reporting
        zero-replica / zero-size files.
        """
        gLogger.info('Obtaining the catalog contents for %s directories' % len(lfnDir))
        activeDirs = list(lfnDir)
        allFiles = {}
        # Breadth-first walk of the directory tree
        while len(activeDirs) > 0:
            currentDir = activeDirs[0]
            res = self.fc.listDirectory(currentDir, verbose=True)
            activeDirs.remove(currentDir)
            if not res['OK']:
                gLogger.error('Failed to get directory contents', res['Message'])
                return res
            elif currentDir in res['Value']['Failed']:  # fix: has_key() is deprecated
                gLogger.error('Failed to get directory contents',
                              '%s %s' % (currentDir, res['Value']['Failed'][currentDir]))
            else:
                dirContents = res['Value']['Successful'][currentDir]
                activeDirs.extend(dirContents['SubDirs'])
                allFiles.update(dirContents['Files'])
        zeroReplicaFiles = []
        zeroSizeFiles = []
        allReplicaDict = {}
        allMetadataDict = {}
        for lfn, lfnDict in allFiles.iteritems():
            lfnReplicas = {}
            for se, replicaDict in lfnDict['Replicas'].iteritems():
                lfnReplicas[se] = replicaDict['PFN']
            if not lfnReplicas:
                zeroReplicaFiles.append(lfn)
            allReplicaDict[lfn] = lfnReplicas
            allMetadataDict[lfn] = lfnDict['MetaData']
            if lfnDict['MetaData']['Size'] == 0:
                zeroSizeFiles.append(lfn)
        if zeroReplicaFiles:
            self._reportProblematicFiles(zeroReplicaFiles, 'LFNZeroReplicas')
        if zeroSizeFiles:
            self._reportProblematicFiles(zeroSizeFiles, 'LFNZeroSize')
        gLogger.info('Obtained at total of %s files for the supplied directories' % len(allMetadataDict))
        resDict = {'Metadata': allMetadataDict, 'Replicas': allReplicaDict}
        return S_OK(resDict)
def doCheckFC2BK(cc, fixFC=False, fixBK=False, listAffectedRuns=False):
    """ Method actually calling for the check using the ConsistencyChecks module.
        It prints out results and calls corrective actions if required.

        :param cc: a ConsistencyChecks instance (its prod/lfns already set)
        :param fixFC: remove offending files from the FC and storage
        :param fixBK: set the replica flag in the BK instead
        :param listAffectedRuns: also print the list of affected run numbers
    """
    cc.checkFC2BK()
    maxFiles = 10
    # Find a non-existing output file name CheckFC2BK[-prod][-n].txt
    suffix = ''
    nb = 0
    baseName = 'CheckFC2BK' + ('-%s' % cc.prod if cc.prod else '')
    while True:
        fileName = baseName + '%s.txt' % suffix
        if not os.path.exists(fileName):
            break
        nb += 1
        suffix = '-%d' % nb
    fp = None
    if cc.existLFNsBKRepNo:
        gLogger.notice('>>>>')
        # existLFNsBKRepNo maps lfn -> run number at this point
        affectedRuns = list(set(str(run) for run in cc.existLFNsBKRepNo.itervalues()))
        gLogger.error("%d files are in the FC but have replica = NO in BK" % len(cc.existLFNsBKRepNo))
        from LHCbDIRAC.DataManagementSystem.Client.ConsistencyChecks import ConsistencyChecks
        ccAux = ConsistencyChecks()
        gLogger.notice("====== Now checking %d files from FC to SE ======" % len(cc.existLFNsBKRepNo))
        ccAux.lfns = cc.existLFNsBKRepNo.keys()
        doCheckFC2SE(ccAux, bkCheck=False, fixIt=fixFC, fixOption='FixFC')
        # Keep only the files that really exist at an SE and are healthy
        cc.existLFNsBKRepNo = sorted(set(cc.existLFNsBKRepNo) -
                                     set(ccAux.existLFNsNoSE) -
                                     set(ccAux.existLFNsNotExisting) -
                                     set(ccAux.existLFNsBadFiles))
        if cc.existLFNsBKRepNo:
            gLogger.notice("====== Completed, %d files are in the FC and SE but have replica = NO in BK ======" %
                           len(cc.existLFNsBKRepNo))
            if fp is None:
                fp = open(fileName, 'w')
            fp.write('\nInFCButBKNo '.join([''] + sorted(cc.existLFNsBKRepNo)))
            res = cc.bkClient.getFileMetadata(cc.existLFNsBKRepNo)
            if not res['OK']:
                gLogger.fatal("Unable to get file metadata", res['Message'])
                return
            if res['Value']['Failed']:
                gLogger.error("No metadata found for some files", '%d files' % len(res['Value']['Failed']))
            success = res['Value']['Successful']
            # VisibilityFlag is 'Y'/'N' (only the first character is checked)
            filesInvisible = set(lfn for lfn, meta in success.iteritems()
                                 if meta['VisibilityFlag'][0].upper() == 'N')
            filesVisible = set(success) - filesInvisible
            gLogger.notice('%d files are visible, %d files are invisible' %
                           (len(filesVisible), len(filesInvisible)))
            # Try and print the whole as INFO (in case --Verbose was used).
            # If nothing printed, print a limited number of files as ERROR
            if not gLogger.info('\n'.join('%s : Visi %s' %
                                          (lfn, success.get(lfn, {}).get('VisibilityFlag', '?'))
                                          for lfn in sorted(cc.existLFNsBKRepNo))):
                if len(cc.existLFNsBKRepNo) > maxFiles:
                    gLogger.notice('First %d files:' % maxFiles)
                gLogger.error('\n'.join('%s : Visi %s' %
                                        (lfn, success.get(lfn, {}).get('VisibilityFlag', '?'))
                                        for lfn in sorted(cc.existLFNsBKRepNo)[0:maxFiles]))
            if listAffectedRuns:
                gLogger.notice('Affected runs: %s' % ','.join(affectedRuns))
            gLogger.notice("Full list of files:    grep InFCButBKNo %s" % fileName)
            if fixBK:
                gLogger.notice("Going to fix them, setting the replica flag")
                res = cc.bkClient.addFiles(list(success))
                if res['OK']:
                    gLogger.notice("\tSuccessfully added replica flag to %d files" % len(success))
                else:
                    gLogger.error('Failed to set the replica flag', res['Message'])
            elif fixFC:
                gLogger.notice("Going to fix them, by removing from the FC and storage")
                __removeFile(success)
            else:
                gLogger.notice("Use --FixBK to fix it (set the replica flag) or --FixFC (for removing from FC and storage)")
        else:
            gLogger.notice("====== Completed, no files in the FC with replica = NO in BK ======")
        gLogger.notice('<<<<')
    else:
        gLogger.notice("No files in FC with replica = NO in BK -> OK!")
    if cc.existLFNsNotInBK:
        gLogger.notice('>>>>')
        gLogger.error("%d files are in the FC but are NOT in BK:" % len(cc.existLFNsNotInBK))
        if fp is None:
            fp = open(fileName, 'w')
        fp.write('\nInFCNotInBK '.join([''] + sorted(cc.existLFNsNotInBK)))
        # Same INFO-then-ERROR printing strategy as above
        if not gLogger.info('\n'.join(sorted(cc.existLFNsNotInBK))):
            if len(cc.existLFNsNotInBK) > maxFiles:
                gLogger.notice('First %d files:' % maxFiles)
            gLogger.error('\n'.join(sorted(cc.existLFNsNotInBK[0:maxFiles])))
        gLogger.notice("Full list of files:    grep InFCNotInBK %s" % fileName)
        if fixFC:
            gLogger.notice("Going to fix them, by removing from the FC and storage")
            __removeFile(cc.existLFNsNotInBK)
        else:
            gLogger.notice("Use --FixFC to fix it (remove from FC and storage)")
        gLogger.notice('<<<<')
    else:
        gLogger.notice("No files in FC not in BK -> OK!")
    if fp is not None:
        fp.close()
class DataRecoveryAgent(AgentModule):
    """ Standard DIRAC agent class

    Recovers transformation files stuck in Assigned/MaxReset by checking,
    via the BK, whether the corresponding WMS jobs produced descendants.
    """

    def __init__(self, *args, **kwargs):
        """ c'tor """
        AgentModule.__init__(self, *args, **kwargs)
        self.transClient = None        # TransformationSystem client, set in initialize()
        self.reqClient = None          # RequestManagement client, set in initialize()
        self.consChecks = None         # ConsistencyChecks helper, set in initialize()
        self.enableFlag = True         # when False, no status update is performed
        self.transformationTypes = []  # transformation types to consider
        self.transLogger = self.log    # per-transformation sub-logger, rebound in execute()

    #############################################################################
    def initialize(self):
        """Sets defaults """
        self.am_setOption('shifterProxy', 'ProductionManager')
        self.transClient = TransformationClient()
        self.reqClient = ReqClient()
        self.consChecks = ConsistencyChecks(interactive=False, transClient=self.transClient)
        # Consider data-processing types, except the extendable ones
        transformationTypes = Operations().getValue('Transformations/DataProcessing', [])
        extendableTTypes = Operations().getValue('Transformations/ExtendableTransfTypes', ['MCSimulation'])
        self.transformationTypes = list(set(transformationTypes) - set(extendableTTypes))
        return S_OK()

    #############################################################################
    def execute(self):
        """ The main execution method.

        Selects eligible transformations, finds their stuck files and the
        corresponding failed WMS jobs, and resets files whose jobs produced
        no BK descendants.
        """
        # Configuration settings
        self.enableFlag = self.am_getOption('EnableFlag', True)
        self.log.verbose('Enable flag is %s' % self.enableFlag)
        if not self.transformationTypes:
            self.log.warn("No transformation types to look for... aborting")
            return S_OK()
        transformationStatus = self.am_getOption('TransformationStatus', ['Active', 'Completing'])
        fileSelectionStatus = self.am_getOption('FileSelectionStatus', ['Assigned', 'MaxReset'])
        unrecoverableStatus = self.am_getOption('UnrecoverableStatus', ['MaxReset'])
        updateStatus = self.am_getOption('FileUpdateStatus', 'Unused')
        wmsStatusList = self.am_getOption('WMSStatus', ['Failed'])
        # only worry about files > 12hrs since last update
        selectDelay = self.am_getOption('SelectionDelay', 1)  # hours

        transformationDict = {}
        for transStatus in transformationStatus:
            result = self.__getEligibleTransformations(transStatus, self.transformationTypes)
            if not result['OK']:
                self.log.error("Could not obtain eligible transformations",
                               "Status '%s': %s" % (transStatus, result['Message']))
                return result
            if not result['Value']:
                self.log.info('No "%s" transformations of types %s to process.' %
                              (transStatus, ', '.join(self.transformationTypes)))
                continue
            transformationDict.update(result['Value'])

        self.log.info('Selected %d transformations of types %s' %
                      (len(transformationDict), ', '.join(self.transformationTypes)))
        self.log.verbose('Transformations selected:\n%s' % (', '.join(transformationDict)))
        for transformation, typeName in transformationDict.iteritems():
            # Dedicated sub-logger so messages are tagged per transformation
            self.transLogger = self.log.getSubLogger('Trans-%s' % transformation)
            result = self.__selectTransformationFiles(transformation, fileSelectionStatus)
            if not result['OK']:
                self.transLogger.error('Could not select files for transformation',
                                       '%s: %s' % (transformation, result['Message']))
                continue
            fileDict = result['Value']
            if not fileDict:
                self.transLogger.verbose('No files in status %s selected for transformation %s' %
                                         (', '.join(fileSelectionStatus), transformation))
                continue
            title = 'Looking at transformation %s, type %s ' % (transformation, typeName)
            self.transLogger.info('=' * len(title))
            self.transLogger.info(title)
            self.transLogger.info('Selected %d files with status %s' %
                                  (len(fileDict), ','.join(fileSelectionStatus)))
            result = self.__obtainWMSJobIDs(transformation, fileDict, selectDelay, wmsStatusList)
            if not result['OK']:
                self.transLogger.error("Could not obtain jobs for files of transformation",
                                       result['Message'])
                continue
            jobFileDict = result['Value']
            if not jobFileDict:
                self.transLogger.info('No %s jobs found for selected files' %
                                      ' or '.join(wmsStatusList))
                continue
            self.transLogger.verbose("Looking at WMS jobs %s" %
                                     ','.join(str(jobID) for jobID in jobFileDict))
            fileCount = sum(len(lfnList) for lfnList in jobFileDict.itervalues())
            self.transLogger.verbose('%s files are selected after examining WMS jobs' %
                                     (str(fileCount) if fileCount else 'No'))
            if not fileCount:
                continue
            result = self.__removePendingRequestsJobs(jobFileDict)
            if not result['OK']:
                self.transLogger.error("Error while removing jobs with pending requests",
                                       result['Message'])
                continue
            # This method modifies the input dictionary
            if not jobFileDict:
                self.transLogger.info('No WMS jobs without pending requests to process.')
                continue
            fileCount = sum(len(lfnList) for lfnList in jobFileDict.itervalues())
            self.transLogger.info('%s files are selected in %d jobs after removing any job with pending requests' %
                                  (str(fileCount) if fileCount else 'No', len(jobFileDict)))
            if not fileCount:
                continue
            jobsThatDidntProduceOutputs, jobsThatProducedOutputs = \
                self.__checkdescendants(transformation, jobFileDict)
            title = '======== Transformation %s: results ========' % transformation
            self.transLogger.info(title)
            self.transLogger.info('\tTotal jobs that can be updated now: %d' %
                                  len(jobsThatDidntProduceOutputs))
            if jobsThatProducedOutputs:
                self.transLogger.info('\t%d jobs have descendants' % len(jobsThatProducedOutputs))
            else:
                self.transLogger.info('\tNo jobs have descendants')

            filesToUpdate = []
            filesMaxReset = []
            filesWithDescendants = []
            for job, fileList in jobFileDict.iteritems():
                if job in jobsThatDidntProduceOutputs:
                    # Files in an unrecoverable status (e.g. MaxReset) are only reported
                    recoverableFiles = set(lfn for lfn in fileList
                                           if fileDict[lfn][1] not in unrecoverableStatus)
                    filesToUpdate += list(recoverableFiles)
                    filesMaxReset += list(set(fileList) - recoverableFiles)
                elif job in jobsThatProducedOutputs:
                    filesWithDescendants += fileList
            if filesToUpdate:
                self.transLogger.info("\tUpdating %d files to '%s'" %
                                      (len(filesToUpdate), updateStatus))
                result = self.__updateFileStatus(transformation, filesToUpdate, updateStatus)
                if not result['OK']:
                    self.transLogger.error('\tRecoverable files were not updated',
                                           result['Message'])
            if filesMaxReset:
                self.transLogger.info('\t%d files are in %s status and have no descendants' %
                                      (len(filesMaxReset), ','.join(unrecoverableStatus)))
            if filesWithDescendants:
                # FIXME: we should mark these files with another status such that they are not considered again and again
                # In addition a notification should be sent to the production managers
                self.transLogger.warn('\t!!!!!!!! Transformation has descendants for files that are not marked as processed !!!!!!!!')
                self.transLogger.warn('\tFiles with descendants:', ','.join(filesWithDescendants))
        return S_OK()

    #############################################################################
    def __getEligibleTransformations(self, status, typeList):
        """ Select transformations of given status and type.

        :returns: S_OK with dict {transformationID (str): type}
        """
        res = self.transClient.getTransformations(condDict={'Status': status, 'Type': typeList})
        if not res['OK']:
            return res
        transformations = dict((str(prod['TransformationID']), prod['Type'])
                               for prod in res['Value'])
        return S_OK(transformations)

    #############################################################################
    def __selectTransformationFiles(self, transformation, statusList):
        """ Select files, production jobIDs in specified file status for a given transformation.

        :returns: S_OK with dict {lfn: (taskID, status)}
        """
        # Until a query for files with timestamp can be obtained must rely on the
        # WMS job last update
        res = self.transClient.getTransformationFiles(condDict={'TransformationID': transformation,
                                                                'Status': statusList})
        if not res['OK']:
            return res
        resDict = {}
        mandatoryKeys = {'LFN', 'TaskID', 'LastUpdate'}
        for fileDict in res['Value']:
            missingKeys = mandatoryKeys - set(fileDict)
            if missingKeys:
                for key in missingKeys:
                    self.transLogger.warn('%s is mandatory, but missing for:\n\t%s' %
                                          (key, str(fileDict)))
            else:
                resDict[fileDict['LFN']] = (fileDict['TaskID'], fileDict['Status'])
        return S_OK(resDict)

    #############################################################################
    def __obtainWMSJobIDs(self, transformation, fileDict, selectDelay, wmsStatusList):
        """ Group files by the corresponding WMS jobIDs, check the corresponding
            jobs have not been updated for the delay time.  Can't get into any
            mess because we start from files only in MaxReset / Assigned and
            check corresponding jobs.  Mixtures of files for jobs in MaxReset
            and Assigned statuses only possibly include some files in Unused
            status (not Processed for example) that will not be touched.

        :returns: S_OK with dict {wmsID: [lfns]}
        """
        taskIDList = sorted(set(taskID for taskID, _status in fileDict.values()))
        self.transLogger.verbose("The following %d task IDs correspond to the selected files:\n%s" %
                                 (len(taskIDList), ', '.join(str(taskID) for taskID in taskIDList)))
        jobFileDict = {}
        # Only consider tasks not updated within the last selectDelay hours
        olderThan = dateTime() - datetime.timedelta(hours=selectDelay)
        res = self.transClient.getTransformationTasks(condDict={'TransformationID': transformation,
                                                                'TaskID': taskIDList},
                                                      older=olderThan,
                                                      timeStamp='LastUpdateTime')
        if not res['OK']:
            self.transLogger.error("getTransformationTasks returned an error",
                                   '%s' % res['Message'])
            return res
        mandatoryKeys = {'TaskID', 'ExternalID', 'LastUpdateTime', 'ExternalStatus'}
        for taskDict in res['Value']:
            missingKey = mandatoryKeys - set(taskDict)
            if missingKey:
                for key in missingKey:
                    self.transLogger.warn('Missing key %s for job dictionary:\n\t%s' %
                                          (key, str(taskDict)))
                continue
            taskID = taskDict['TaskID']
            wmsID = taskDict['ExternalID']
            wmsStatus = taskDict['ExternalStatus']
            # A zero/empty external ID means no WMS job was ever submitted
            if not int(wmsID):
                self.transLogger.verbose('TaskID %s: status is %s (jobID = %s) so will not recheck with WMS' %
                                         (taskID, wmsStatus, wmsID))
                continue
            # Exclude jobs not having appropriate WMS status - have to trust that production management status is correct
            if wmsStatus not in wmsStatusList:
                self.transLogger.verbose('Job %s is in status %s, not in %s so will be ignored' %
                                         (wmsID, wmsStatus, ', '.join(wmsStatusList)))
                continue
            # Must map unique files -> jobs in expected state
            jobFileDict[wmsID] = [lfn for lfn, (tID, _st) in fileDict.iteritems()
                                  if int(tID) == int(taskID)]
            self.transLogger.info('Found %d files for taskID %s, jobID %s (%s), last update %s' %
                                  (len(jobFileDict[wmsID]), taskID, wmsID, wmsStatus,
                                   taskDict['LastUpdateTime']))
        return S_OK(jobFileDict)

    #############################################################################
    def __removePendingRequestsJobs(self, jobFileDict):
        """ Before doing anything check that no outstanding requests are pending
            for the set of WMS jobIDs.  Jobs with a not-yet-Done request are
            removed from jobFileDict IN PLACE.
        """
        jobs = jobFileDict.keys()
        # Temporarily raise the request client's log level: absent requests are expected
        level = self.reqClient.log.getLevel()
        self.reqClient.log.setLevel('ERROR')
        result = self.reqClient.getRequestIDsForJobs(jobs)
        self.reqClient.log.setLevel(level)
        if not result['OK']:
            return result
        if not result['Value']['Successful']:
            self.transLogger.verbose('None of the jobs have pending requests')
            return S_OK()
        for jobID, requestID in result['Value']['Successful'].iteritems():
            res = self.reqClient.getRequestStatus(requestID)
            if not res['OK']:
                # NOTE(review): the comment below says a status-retrieval failure should
                # also remove the job, but this branch only logs - confirm intent.
                self.transLogger.error('Failed to get Status for Request',
                                       '%s:%s' % (requestID, res['Message']))
            elif res['Value'] != 'Done':
                # If we fail to get the Status or it is not Done, we must wait, so remove the job from the list.
                del jobFileDict[str(jobID)]
                self.transLogger.verbose('Removing jobID %s from consideration until requests are completed' %
                                         (jobID))
        return S_OK()

    #############################################################################
    def __checkdescendants(self, transformation, jobFileDict):
        """ Check BK descendants for input files, prepare list of actions to be
            taken for recovery.

        :returns: plain tuple (jobsWithoutDescendants, jobsWithDescendants)
                  - NOT an S_OK structure
        """
        jobsThatDidntProduceOutputs = []
        jobsThatProducedOutputs = []
        self.consChecks.prod = transformation
        for job, fileList in jobFileDict.iteritems():
            result = self.consChecks.getDescendants(fileList)
            filesWithDesc = result[0]
            filesWithMultipleDesc = result[2]
            if filesWithDesc or filesWithMultipleDesc:
                jobsThatProducedOutputs.append(job)
            else:
                jobsThatDidntProduceOutputs.append(job)
        return jobsThatDidntProduceOutputs, jobsThatProducedOutputs

    ############################################################################
    def __updateFileStatus(self, transformation, fileList, fileStatus):
        """ Update file list to specified status (no-op when EnableFlag is False). """
        if not self.enableFlag:
            self.transLogger.info("\tEnable flag is False, would have updated %d files to '%s' status for %s" %
                                  (len(fileList), fileStatus, transformation))
            return S_OK()
        return self.transClient.setFileStatusForTransformation(int(transformation),
                                                               fileStatus, fileList,
                                                               force=False)
# In case the user asked for specific LFNs if not status: lfnList = dmScript.getOption('LFNs', []) if not status and not lfnList and not runsList and not fromProd and not force: gLogger.fatal("You are about to check descendants for all files in a production") gLogger.fatal("If you really want to do so, use --Force") DIRAC.exit(0) from LHCbDIRAC.DataManagementSystem.Client.ConsistencyChecks import ConsistencyChecks from LHCbDIRAC.BookkeepingSystem.Client.BKQuery import BKQuery from DIRAC.TransformationSystem.Client.TransformationClient import TransformationClient tr = TransformationClient() for prod in prodList: startTime = time.time() cc = ConsistencyChecks() # Setting the prod also sets its type try: cc.prod = prod except RuntimeError as e: gLogger.exception(lException=e) continue if fileType and cc.transType in ('Merge', 'MCMerge'): gLogger.notice("It is not allowed to select file type for merging transformation", prod) continue cc.verbose = verbose cc.noFC = noFC cc.descendantsDepth = depth if prod != prodList[0]: gLogger.notice("====================") gLogger.notice("Processing %s production %d" % (cc.transType, cc.prod))
' Consider also files with replica flag NO')
Script.parseCommandLine(ignoreErrors=True)

fixIt = False
checkAll = False
production = 0
for opt, val in Script.getUnprocessedSwitches():
    if opt == 'FixIt':
        fixIt = True
    elif opt == 'CheckAllFlags':
        checkAll = True

# imports (delayed until after command-line parsing, as is usual in DIRAC scripts)
from LHCbDIRAC.DataManagementSystem.Client.ConsistencyChecks import ConsistencyChecks
gLogger.setLevel('INFO')
cc = ConsistencyChecks()
bkQuery = dmScript.getBKQuery(visible='All')
cc.bkQuery = bkQuery
cc.lfns = dmScript.getOption('LFNs', [])
productions = dmScript.getOption('Productions', [])
from LHCbDIRAC.DataManagementSystem.Client.CheckExecutors import doCheckBK2FC
if productions:
    # Run the BK->FC check once per requested production
    for prod in productions:
        cc.prod = prod
        gLogger.always("Processing production %d" % cc.prod)
        doCheckBK2FC(cc, checkAll, fixIt)
        gLogger.always("Processed production %d" % cc.prod)
else:
    doCheckBK2FC(cc, checkAll, fixIt)