Пример #1
0
    def initialize(self):
        """Instantiate the clients the agent relies on and select the shifter proxy.

        :return: S_OK()
        """
        # File catalog, data manager, stager and data-integrity clients.
        self.fileCatalog = FileCatalog()
        self.dm = DataManager()
        self.stagerClient = StorageManagerClient()
        self.dataIntegrityClient = DataIntegrityClient()

        # Default proxy: the one defined under /Operations/Shifter/DataManager.
        # The shifterProxy option in the Configuration can be used to change it.
        shifterProxy = "DataManager"
        self.am_setOption("shifterProxy", shifterProxy)
        return S_OK()
Пример #2
0
  def initialize( self ):
    """ Instantiate the ReplicaManager, DataIntegrityClient and StorageManagementDB
        objects used by the agent and select the shifter proxy.

        :return: S_OK()
    """
    self.replicaManager = ReplicaManager()
    self.dataIntegrityClient = DataIntegrityClient()
    self.storageDB = StorageManagementDB()

    # Default proxy: the one defined under /Operations/Shifter/DataManager.
    # The shifterProxy option in the Configuration can be used to change it.
    proxyName = 'DataManager'
    self.am_setOption( 'shifterProxy', proxyName )
    return S_OK()
Пример #3
0
    def initialize(self):
        """Set up the helper objects, the pin lifetime and the shifter proxy.

        :return: S_OK()
        """
        self.replicaManager = ReplicaManager()
        self.dataIntegrityClient = DataIntegrityClient()
        self.storageDB = StorageManagementDB()

        # 'PinLifetime' option; THROTTLING_TIME is passed as the default value.
        pinLifetime = self.am_getOption('PinLifetime', THROTTLING_TIME)
        self.pinLifetime = pinLifetime

        # Resources helper
        self.resources = Resources()

        # Default proxy: the one defined under /Operations/Shifter/DataManager.
        # The shifterProxy option in the Configuration can be used to change it.
        self.am_setOption('shifterProxy', 'DataManager')
        return S_OK()
Пример #4
0
    def initialize(self):
        """Create the stager and data-integrity clients and select the shifter proxy.

        :return: S_OK()
        """
        self.stagerClient = StorageManagerClient()
        self.dataIntegrityClient = DataIntegrityClient()

        # Default proxy: the one defined under /Operations/Shifter/DataManager.
        # The shifterProxy option in the Configuration can be used to change it.
        optionName, proxyName = "shifterProxy", "DataManager"
        self.am_setOption(optionName, proxyName)
        return S_OK()
Пример #5
0
    def __init__(self,
                 interactive=True,
                 transClient=None,
                 dm=None,
                 fc=None,
                 dic=None):
        """ c'tor

        One object for every production/directoriesList...

        :param interactive: Data Manager (True) or DIRAC Agent (False)
        :param transClient: client to use; a TransformationClient() is created if None
        :param dm: data manager to use; a DataManager() is created if None
        :param fc: file catalog to use; a FileCatalog() is created if None
        :param dic: integrity client to use; a DataIntegrityClient() is created if None
        """
        self.interactive = interactive
        # Only an explicit None triggers the default: an injected object that
        # happens to evaluate as false must not be silently replaced.
        self.transClient = TransformationClient() if transClient is None else transClient
        self.dm = DataManager() if dm is None else dm
        self.fc = FileCatalog() if fc is None else fc
        self.dic = DataIntegrityClient() if dic is None else dic
        self.dirac = Dirac()

        # Base elements from which to start the consistency checks
        self._prod = 0
        self._bkQuery = None
        self._fileType = []
        self._fileTypesExcluded = []
        self._lfns = []
        self.noLFC = False
        self.directories = []

        # Accessory elements
        self.runsList = []
        self.runStatus = None
        self.fromProd = None
        self.transType = ''
        self.cachedReplicas = {}

        # Result containers, all starting empty
        self.prcdWithDesc = []
        self.prcdWithoutDesc = []
        self.prcdWithMultDesc = []
        self.nonPrcdWithDesc = []
        self.nonPrcdWithoutDesc = []
        self.nonPrcdWithMultDesc = []
        self.descForPrcdLFNs = []
        self.descForNonPrcdLFNs = []
        self.removedFiles = []

        self.absentLFNsInFC = []
        self.existLFNsNoSE = {}
        self.existLFNsBadReplicas = {}
        self.existLFNsBadFiles = {}
        self.existLFNsNotExisting = {}
        self.commonAncestors = {}
        self.multipleDescendants = {}
        self.ancestors = {}

        self._verbose = False
  def initialize( self ):
    """ Instantiate the FileCatalog, DataIntegrityClient and StorageManagementDB
        objects used by the agent and select the shifter proxy.

        :return: S_OK()
    """
    self.fileCatalog = FileCatalog()
    self.dataIntegrityClient = DataIntegrityClient()
    self.storageDB = StorageManagementDB()

    # Default proxy: the one defined under /Operations/Shifter/DataManager.
    # The shifterProxy option in the Configuration can be used to change it.
    proxyName = 'DataManager'
    self.am_setOption( 'shifterProxy', proxyName )
    return S_OK()
Пример #7
0
  def __init__( self, *args, **kwargs ):
    """ c'tor

        Forwards all arguments to AgentModule.__init__, creates the clients and
        reads the agent options.
    """
    AgentModule.__init__( self, *args, **kwargs )

    self.integrityClient = DataIntegrityClient()
    self.replicaManager = ReplicaManager()
    self.transClient = TransformationClient()
    self.fileCatalogClient = FileCatalogClient()

    # Agent option first; when it is empty use the Operations value instead.
    transTypes = self.am_getOption( 'TransformationTypes', [] )
    if not transTypes:
      transTypes = Operations().getValue( 'Transformations/DataProcessing', ['MCSimulation', 'Merge'] )
    self.transformationTypes = transTypes

    defaultLocations = ['TransformationDB', 'MetadataCatalog']
    self.directoryLocations = sortList( self.am_getOption( 'DirectoryLocations', defaultLocations ) )
    self.activeStorages = sortList( self.am_getOption( 'ActiveSEs', [] ) )
    self.transfidmeta = self.am_getOption( 'TransfIDMeta', "TransformationID" )
    self.enableFlag = True
Пример #8
0
  def initialize( self ):
    """ Create the stager and data-integrity clients, read the pin lifetime
        and select the shifter proxy.

        :return: S_OK()
    """
    self.stagerClient = StorageManagerClient()
    self.dataIntegrityClient = DataIntegrityClient()

    # 'PinLifetime' option; THROTTLING_TIME is passed as the default value.
    pinLifetime = self.am_getOption( 'PinLifetime', THROTTLING_TIME )
    self.pinLifetime = pinLifetime

    # Default proxy: the one defined under /Operations/Shifter/DataManager.
    # The shifterProxy option in the Configuration can be used to change it.
    self.am_setOption( 'shifterProxy', 'DataManager' )
    return S_OK()
Пример #9
0
    def __init__(self, *args, **kwargs):
        """c'tor

        Forwards all arguments to AgentModule.__init__, creates the helper
        objects and reads the agent options.
        """
        AgentModule.__init__(self, *args, **kwargs)

        self.consistencyInspector = ConsistencyInspector()
        self.integrityClient = DataIntegrityClient()
        self.fc = FileCatalog()
        self.transClient = TransformationClient()
        self.fileCatalogClient = FileCatalogClient()

        # Agent option first; when it is empty use the Operations value instead.
        transTypes = self.am_getOption("TransformationTypes", [])
        if not transTypes:
            transTypes = Operations().getValue("Transformations/DataProcessing", ["MCSimulation", "Merge"])
        self.transformationTypes = transTypes

        defaultLocations = ["TransformationDB", "MetadataCatalog"]
        self.directoryLocations = sorted(self.am_getOption("DirectoryLocations", defaultLocations))
        self.transfidmeta = self.am_getOption("TransfIDMeta", "TransformationID")
        self.enableFlag = True
Пример #10
0
  def initialize( self ):
    """ Sets defaults: creates the clients, selects the shifter proxy and
        reads (and logs) the agent options.

        :return: S_OK()
    """
    self.integrityClient = DataIntegrityClient()
    self.replicaManager = ReplicaManager()
    self.transClient = TransformationClient()
    self.storageUsageClient = StorageUsageClient()
    self.fileCatalogClient = FileCatalogClient()

    # Default proxy: the one defined under /Operations/Shifter/DataManager.
    # The shifterProxy option in the Configuration can be used to change it.
    self.am_setOption( 'shifterProxy', 'DataManager' )

    defaultTypes = ['MCSimulation', 'DataReconstruction', 'DataStripping', 'MCStripping', 'Merge']
    self.transformationTypes = sortList( self.am_getOption( 'TransformationTypes', defaultTypes ) )
    gLogger.info( "Will treat the following transformation types: %s" % str( self.transformationTypes ) )

    defaultLocations = ['TransformationDB', 'StorageUsage', 'MetadataCatalog']
    self.directoryLocations = sortList( self.am_getOption( 'DirectoryLocations', defaultLocations ) )
    gLogger.info( "Will search for directories in the following locations: %s" % str( self.directoryLocations ) )

    self.activeStorages = sortList( self.am_getOption( 'ActiveSEs', [] ) )
    gLogger.info( "Will check the following storage elements: %s" % str( self.activeStorages ) )

    self.transfidmeta = self.am_getOption( 'TransfIDMeta', "TransformationID" )
    gLogger.info( "Will use %s as metadata tag name for TransformationID" % self.transfidmeta )

    return S_OK()
Пример #11
0
  def __init__( self, *args, **kwargs ):
    """ c'tor

        Forwards all arguments to AgentModule.__init__, creates the clients and
        reads the agent options.
    """
    AgentModule.__init__( self, *args, **kwargs )

    self.integrityClient = DataIntegrityClient()
    self.replicaManager = ReplicaManager()
    self.transClient = TransformationClient()
    self.fileCatalogClient = FileCatalogClient()

    # Agent option first; when it is empty use the Operations value instead.
    transTypes = self.am_getOption( 'TransformationTypes', [] )
    if not transTypes:
      transTypes = Operations().getValue( 'Transformations/DataProcessing', ['MCSimulation', 'Merge'] )
    self.transformationTypes = transTypes

    defaultLocations = ['TransformationDB', 'MetadataCatalog']
    self.directoryLocations = sortList( self.am_getOption( 'DirectoryLocations', defaultLocations ) )
    self.activeStorages = sortList( self.am_getOption( 'ActiveSEs', [] ) )
    self.transfidmeta = self.am_getOption( 'TransfIDMeta', "TransformationID" )
Пример #12
0
  def __init__(self, *args, **kwargs):
    """ c'tor

        Forwards all arguments to AgentModule.__init__, creates the helper
        objects and reads the agent options.
    """
    AgentModule.__init__(self, *args, **kwargs)

    self.consistencyInspector = ConsistencyInspector()
    self.integrityClient = DataIntegrityClient()
    self.fc = FileCatalog()
    self.transClient = TransformationClient()
    self.fileCatalogClient = FileCatalogClient()

    # Agent option first; when it is empty use the Operations value instead.
    transTypes = self.am_getOption('TransformationTypes', [])
    if not transTypes:
      transTypes = Operations().getValue('Transformations/DataProcessing', ['MCSimulation', 'Merge'])
    self.transformationTypes = transTypes

    defaultLocations = ['TransformationDB', 'MetadataCatalog']
    self.directoryLocations = sorted(self.am_getOption('DirectoryLocations', defaultLocations))
    self.transfidmeta = self.am_getOption('TransfIDMeta', "TransformationID")
    self.enableFlag = True
Пример #13
0
  def initialize( self ):
    """ Sets defaults: creates the clients, selects the shifter proxy and
        reads (and logs) the agent options.

        :return: S_OK()
    """
    self.integrityClient = DataIntegrityClient()
    self.replicaManager = ReplicaManager()
    self.transClient = TransformationClient()
    self.fileCatalogClient = FileCatalogClient()

    # Default proxy: the one defined under /Operations/Shifter/DataManager.
    # The shifterProxy option in the Configuration can be used to change it.
    self.am_setOption( 'shifterProxy', 'DataManager' )

    defaultTypes = ['MCSimulation', 'DataReconstruction', 'DataStripping', 'MCStripping', 'Merge']
    self.transformationTypes = sortList( self.am_getOption( 'TransformationTypes', defaultTypes ) )
    gLogger.info( "Will treat the following transformation types: %s" % str( self.transformationTypes ) )

    defaultLocations = ['TransformationDB', 'MetadataCatalog']
    self.directoryLocations = sortList( self.am_getOption( 'DirectoryLocations', defaultLocations ) )
    gLogger.info( "Will search for directories in the following locations: %s" % str( self.directoryLocations ) )

    self.activeStorages = sortList( self.am_getOption( 'ActiveSEs', [] ) )
    gLogger.info( "Will check the following storage elements: %s" % str( self.activeStorages ) )

    self.transfidmeta = self.am_getOption( 'TransfIDMeta', "TransformationID" )
    gLogger.info( "Will use %s as metadata tag name for TransformationID" % self.transfidmeta )

    return S_OK()
  def initialize( self ):
    """ Create the clients, pick the migration monitoring backend, select the
        shifter proxy and register the monitoring activities.

        The 'DirectDB' option chooses between MigrationMonitoringDB and
        MigrationMonitoringClient. One set of activities is registered for
        every SE listed in the 'StorageElements' option.

        :return: S_OK()
    """
    self.ReplicaManager = ReplicaManager()
    self.DataLog = DataLoggingClient()
    self.DataIntegrityClient = DataIntegrityClient()
    if self.am_getOption( 'DirectDB', False ):
      from DIRAC.StorageManagementSystem.DB.MigrationMonitoringDB import MigrationMonitoringDB as monitoringBackend
    else:
      from DIRAC.StorageManagementSystem.Client.MigrationMonitoringClient import MigrationMonitoringClient as monitoringBackend
    self.MigrationMonitoringDB = monitoringBackend()

    # Default proxy: the one defined under /Operations/Shifter/DataManager.
    # The shifterProxy option in the Configuration can be used to change it.
    self.am_setOption( 'shifterProxy', 'DataManager' )

    self.userName = '******'
    self.storageElements = self.am_getOption( 'StorageElements', ['CERN-RAW'] )
    self.lastMonitors = {}

    component = "MigrationMonitoringAgent"
    gMonitor.registerActivity( "Iteration", "Agent Loops/min", component, "Loops", gMonitor.OP_SUM )
    if not self.storageElements:
      return S_OK()

    gLogger.info( "Agent will be initialised to monitor the following SEs:" )
    # ( name format, description, unit, operation ), registered in this order for each SE
    perSEActivities = ( 
      ( "Iteration%s", "Agent Loops/min", "Loops", gMonitor.OP_SUM ),
      ( "MigratingFiles%s", "Files waiting for migration", "Files", gMonitor.OP_MEAN ),
      ( "MigratedFiles%s", "Newly migrated files", "Files", gMonitor.OP_SUM ),
      ( "TotalMigratedFiles%s", "Total migrated files", "Files", gMonitor.OP_ACUM ),
      ( "TotalMigratedSize%s", "Total migrated file size", "GB", gMonitor.OP_ACUM ),
      ( "ChecksumMatches%s", "Successfully migrated files", "Files", gMonitor.OP_SUM ),
      ( "TotalChecksumMatches%s", "Total successfully migrated files", "Files", gMonitor.OP_ACUM ),
      ( "ChecksumMismatches%s", "Erroneously migrated files", "Files", gMonitor.OP_SUM ),
      ( "TotalChecksumMismatches%s", "Total erroneously migrated files", "Files", gMonitor.OP_ACUM ),
      ( "MigrationTime%s", "Average migration time", "Seconds", gMonitor.OP_MEAN ),
    )
    for se in self.storageElements:
      gLogger.info( se )
      # Start every SE at the epoch
      self.lastMonitors[se] = datetime.datetime.utcfromtimestamp( 0.0 )
      for nameFormat, description, unit, operation in perSEActivities:
        gMonitor.registerActivity( nameFormat % se, description, component, unit, operation )
    return S_OK()
Пример #15
0
    def __init__(self, *args, **kwargs):
        """c'tor

        Forwards all arguments to AgentModule.__init__, creates the clients and
        reads the agent options.
        """
        AgentModule.__init__(self, *args, **kwargs)

        self.integrityClient = DataIntegrityClient()
        self.replicaManager = ReplicaManager()
        self.transClient = TransformationClient()
        self.fileCatalogClient = FileCatalogClient()

        # Agent option first; when it is empty use the Operations value instead.
        transTypes = self.am_getOption("TransformationTypes", [])
        if not transTypes:
            transTypes = Operations().getValue("Transformations/DataProcessing", ["MCSimulation", "Merge"])
        self.transformationTypes = transTypes

        defaultLocations = ["TransformationDB", "MetadataCatalog"]
        self.directoryLocations = sortList(self.am_getOption("DirectoryLocations", defaultLocations))
        self.activeStorages = sortList(self.am_getOption("ActiveSEs", []))
        self.transfidmeta = self.am_getOption("TransfIDMeta", "TransformationID")
        self.enableFlag = True
Пример #16
0
class StageRequestAgent( AgentModule ):

  def initialize( self ):
    """ Create the replica manager, stager and data-integrity clients, read
        the pin lifetime and select the shifter proxy.

        :return: S_OK()
    """
    self.replicaManager = ReplicaManager()
    self.stagerClient = StorageManagerClient()
    self.dataIntegrityClient = DataIntegrityClient()

    # 'PinLifetime' option; THROTTLING_TIME is passed as the default value.
    pinLifetime = self.am_getOption( 'PinLifetime', THROTTLING_TIME )
    self.pinLifetime = pinLifetime

    # Default proxy: the one defined under /Operations/Shifter/DataManager.
    # The shifterProxy option in the Configuration can be used to change it.
    self.am_setOption( 'shifterProxy', 'DataManager' )
    return S_OK()

  def execute( self ):
    """ Refresh the per-SE stage/pin usage, then submit the stage requests.

        :return: the failed getStorageUsage() result, otherwise whatever
                 submitStageRequests() returns
    """
    # Current submitted stage space and pinned space for each storage element
    usage = self.getStorageUsage()
    if usage['OK']:
      return self.submitStageRequests()
    return usage

  def getStorageUsage( self ):
    """ Fill the current Status of the SE Caches from the DB

        Resets self.storageElementCache and stores the submitted stage/pin
        usage in self.storageElementUsage; each entry's 'TotalSize' is
        converted in place to GB.

        :return: S_OK(), or the failed result of getSubmittedStagePins()
    """
    self.storageElementCache = {}

    pins = self.stagerClient.getSubmittedStagePins()
    if not pins['OK']:
      gLogger.fatal( "StageRequest.getStorageUsage: Failed to obtain submitted requests from StorageManagementDB.", pins['Message'] )
      return pins

    self.storageElementUsage = pins['Value']
    if not self.storageElementUsage:
      gLogger.info( "StageRequest.getStorageUsage: No active stage/pin requests found." )
      return S_OK()

    gLogger.info( "StageRequest.getStorageUsage: Active stage/pin requests found at the following sites:" )
    bytesPerGB = 1000 * 1000 * 1000.0
    for seName in sortList( self.storageElementUsage.keys() ):
      usage = self.storageElementUsage[seName]
      # Convert to GB for printout
      usage['TotalSize'] = usage['TotalSize'] / bytesPerGB
      gLogger.info( "StageRequest.getStorageUsage: %s: %s replicas with a size of %.3f GB." %
                    ( seName.ljust( 15 ), str( usage['Replicas'] ).rjust( 6 ), usage['TotalSize'] ) )

    return S_OK()


  def submitStageRequests( self ):
    """ This manages the following transitions of the Replicas
        * Waiting -> Offline (if the file is not found Cached)
        * Waiting -> StageSubmitted (if the file is found Cached)
        * Offline -> StageSubmitted (if there are not more Waiting replicas)

        :return: S_OK(), or the failed result of the replica query that broke
    """
    def issueAll( replicasBySE, replicaInfo ):
      """ Issue the prestage requests SE by SE """
      for seName, replicaIDs in replicasBySE.items():
        gLogger.debug( 'Staging at %s:' % seName, replicaIDs )
        self._issuePrestageRequests( seName, replicaIDs, replicaInfo )

    # Retry Replicas that have not been Staged in a previous attempt
    missing = self._getMissingReplicas()
    if not missing['OK']:
      gLogger.fatal( "StageRequest.submitStageRequests: Failed to get replicas from StorageManagementDB.", missing['Message'] )
      return missing
    missingBySE = missing['Value']['SEReplicas']
    missingInfo = missing['Value']['AllReplicaInfo']

    if missingBySE:
      gLogger.info( "StageRequest.submitStageRequests: Completing partially Staged Tasks" )
    issueAll( missingBySE, missingInfo )

    # Check Waiting Replicas and select those found Online and all other Replicas from the same Tasks
    online = self._getOnlineReplicas()
    if not online['OK']:
      gLogger.fatal( "StageRequest.submitStageRequests: Failed to get replicas from StorageManagementDB.", online['Message'] )
      return online
    seReplicas = online['Value']['SEReplicas']
    allReplicaInfo = online['Value']['AllReplicaInfo']

    # Check Offline Replicas that fit in the Cache and all other Replicas from the same Tasks
    offline = self._getOfflineReplicas()
    if not offline['OK']:
      gLogger.fatal( "StageRequest.submitStageRequests: Failed to get replicas from StorageManagementDB.", offline['Message'] )
      return offline

    # Merge info from both results: new SEs are taken over as they are,
    # for known SEs only the replica IDs not yet listed are appended
    for seName, offlineIDs in offline['Value']['SEReplicas'].items():
      if seName in seReplicas:
        knownIDs = seReplicas[seName]
        for replicaID in offlineIDs:
          if replicaID not in knownIDs:
            knownIDs.append( replicaID )
      else:
        seReplicas[seName] = offlineIDs
    allReplicaInfo.update( offline['Value']['AllReplicaInfo'] )

    gLogger.info( "StageRequest.submitStageRequests: Obtained %s replicas for staging." % len( allReplicaInfo ) )
    issueAll( seReplicas, allReplicaInfo )
    return S_OK()

  def _getMissingReplicas( self ):
    """ This recovers Replicas that were not Staged on a previous attempt (the stage request failed or timed out),
        while other Replicas of the same task are already Staged. If left behind they can produce a deadlock.
        All SEs are considered, even if their Cache is full

        :return: the result of __addAssociatedReplicas(), or the failed result
                 of __getStagedReplicas()
    """
    # Get Replicas that are in Staged/StageSubmitted
    gLogger.info( 'StageRequest._getMissingReplicas: Checking Staged Replicas' )

    staged = self.__getStagedReplicas()
    if not staged['OK']:
      gLogger.fatal( "StageRequest._getMissingReplicas: Failed to get replicas from StorageManagementDB.", staged['Message'] )
      return staged

    seReplicas = {}
    allReplicaInfo = staged['Value']['AllReplicaInfo']

    # Consider all SEs: flatten the per-SE ID lists into one list
    replicasToStage = []
    for seReplicaIDs in staged['Value']['SEReplicas'].values():
      replicasToStage.extend( seReplicaIDs )

    # Get Replicas from the same Tasks as those selected
    associated = self.__addAssociatedReplicas( replicasToStage, seReplicas, allReplicaInfo )
    if not associated['OK']:
      gLogger.fatal( "StageRequest._getMissingReplicas: Failed to get associated Replicas.", associated['Message'] )

    return associated

  def _getOnlineReplicas( self ):
    """ This manages the transition
        * Waiting -> Offline (if the file is not found Cached)
        and returns the list of Cached Replicas for which the pin time has to be extended
        SEs for which the cache is currently full are not considered

        :return: the result of __addAssociatedReplicas(); the result of
                 __getWaitingReplicas() when that call failed or found nothing
    """
    # Get all Replicas in Waiting Status associated to Staging Tasks
    gLogger.verbose( 'StageRequest._getOnlineReplicas: Checking Online Replicas to be handled' )

    waiting = self.__getWaitingReplicas()
    if not waiting['OK']:
      gLogger.fatal( "StageRequest._getOnlineReplicas: Failed to get replicas from StorageManagementDB.", waiting['Message'] )
      return waiting

    allReplicaInfo = waiting['Value']['AllReplicaInfo']
    if len( allReplicaInfo ) == 0:
      gLogger.info( "StageRequest._getOnlineReplicas: There were no Waiting replicas found" )
      return waiting
    gLogger.info( "StageRequest._getOnlineReplicas: Obtained %s replicas Waiting for staging." % len( allReplicaInfo ) )

    seReplicas = {}
    replicasToStage = []
    for storageElement, seReplicaIDs in waiting['Value']['SEReplicas'].items():
      if self.__usage( storageElement ) < self.__cache( storageElement ):
        # Check if the Replica Metadata is OK and find out if they are Online or Offline
        check = self.__checkIntegrity( storageElement, seReplicaIDs, allReplicaInfo )
        if check['OK']:
          # keep only Online Replicas
          onlineIDs = check['Value']['Online']
          seReplicas[storageElement] = onlineIDs
          replicasToStage.extend( onlineIDs )
        else:
          gLogger.error( 'StageRequest._getOnlineReplicas: Failed to check Replica Metadata', '(%s): %s' % ( storageElement, check['Message'] ) )
      else:
        # Do not consider those SE that have the Cache full
        gLogger.info( 'StageRequest._getOnlineReplicas: Skipping %s, current usage above limit ( %s GB )' % ( storageElement, self.__cache( storageElement ) ) )

    # Get Replicas from the same Tasks as those selected
    associated = self.__addAssociatedReplicas( replicasToStage, seReplicas, allReplicaInfo )
    if not associated['OK']:
      gLogger.fatal( "StageRequest._getOnlineReplicas: Failed to get associated Replicas.", associated['Message'] )

    return associated

  def _getOfflineReplicas( self ):
    """ This checks Replicas in Offline status
        and returns the list of Replicas to be Staged
        SEs for which the cache is currently full are not considered

        :return: the result of __addAssociatedReplicas(); the result of
                 __getOfflineReplicas() when that call failed or found nothing
    """
    # Get all Replicas in Offline Status associated to Staging Tasks
    gLogger.verbose( 'StageRequest._getOfflineReplicas: Checking Offline Replicas to be handled' )

    offline = self.__getOfflineReplicas()
    if not offline['OK']:
      gLogger.fatal( "StageRequest._getOfflineReplicas: Failed to get replicas from StorageManagementDB.", offline['Message'] )
      return offline

    allReplicaInfo = offline['Value']['AllReplicaInfo']
    if len( allReplicaInfo ) == 0:
      gLogger.info( "StageRequest._getOfflineReplicas: There were no Offline replicas found" )
      return offline
    gLogger.info( "StageRequest._getOfflineReplicas: Obtained %s replicas Offline for staging." % len( allReplicaInfo ) )

    seReplicas = {}
    replicasToStage = []
    for storageElement, seReplicaIDs in offline['Value']['SEReplicas'].items():
      if not self.__usage( storageElement ) < self.__cache( storageElement ):
        gLogger.info( 'StageRequest._getOfflineReplicas: Skipping %s, current usage above limit ( %s GB )' % ( storageElement, self.__cache( storageElement ) ) )
        # Do not consider those SE that have the Cache full
        continue

      selectedIDs = []
      seReplicas[storageElement] = selectedIDs
      for replicaID in sorted( seReplicaIDs ):
        selectedIDs.append( replicaID )
        replicasToStage.append( replicaID )
        # Account for the replica size in the usage of this SE
        self.__add( storageElement, allReplicaInfo[replicaID]['Size'] )
        if not self.__usage( storageElement ) < self.__cache( storageElement ):
          # Stop adding Replicas when the cache is full
          break

    # Get Replicas from the same Tasks as those selected
    associated = self.__addAssociatedReplicas( replicasToStage, seReplicas, allReplicaInfo )
    if not associated['OK']:
      gLogger.fatal( "StageRequest._getOfflineReplicas: Failed to get associated Replicas.", associated['Message'] )

    return associated

  def __usage( self, storageElement ):
    """ Retrieve current usage of SE

        An SE without an entry yet gets one with a 'TotalSize' of 0.
    """
    seUsage = self.storageElementUsage.setdefault( storageElement, {'TotalSize': 0.} )
    return seUsage['TotalSize']

  def __cache( self, storageElement ):
    """ Retrieve cache size for SE

        The value is computed from the DiskCacheTB option of the SE on first
        use and then kept in self.storageElementCache.
    """
    knownSizes = self.storageElementCache
    if storageElement in knownSizes:
      return knownSizes[storageElement]

    optionPath = "/Resources/StorageElements/%s/DiskCacheTB" % storageElement
    knownSizes[storageElement] = gConfig.getValue( optionPath, 1. ) * 1000. / THROTTLING_STEPS
    return knownSizes[storageElement]

  def __add( self, storageElement, size ):
    """ Add size (in bytes) to current usage of storageElement (in GB)

        :return: the added size, in GB
    """
    # An SE without an entry yet starts from a 'TotalSize' of 0.
    seUsage = self.storageElementUsage.setdefault( storageElement, {'TotalSize': 0.} )
    sizeGB = size / ( 1000 * 1000 * 1000.0 )
    seUsage['TotalSize'] += sizeGB
    return sizeGB

  def _issuePrestageRequests( self, storageElement, seReplicaIDs, allReplicaInfo ):
    """ Make the request to the SE and update the DB

        :param storageElement: name of the SE the prestage request is sent to
        :param seReplicaIDs: iterable of the replica IDs to stage at that SE
        :param allReplicaInfo: dict replicaID -> info dict, read here for its 'PFN'
        :return: the failed result of insertStageRequest() if that call fails,
                 None in every other case
    """
    pfnRepIDs = {}
    for replicaID in seReplicaIDs:
      pfnRepIDs[allReplicaInfo[replicaID]['PFN']] = replicaID
    if not pfnRepIDs:
      return

    # Now issue the prestage requests for the remaining replicas
    gLogger.info( "StageRequest._issuePrestageRequests: Submitting %s stage requests for %s." % ( len( pfnRepIDs ), storageElement ) )
    res = self.replicaManager.prestageStorageFile( list( pfnRepIDs ), storageElement, lifetime = self.pinLifetime )
    gLogger.debug( "StageRequest._issuePrestageRequests: replicaManager.prestageStorageFile: res=", res )
    if not res['OK']:
      gLogger.error( "StageRequest._issuePrestageRequests: Completely failed to submit stage requests for replicas.", res['Message'] )
      return

    # The ReplicaManager can answer OK while every file sits in 'Failed'
    # (e.g. "User not able to access specified space token"), so the per-file
    # failures are reported here instead of being dropped silently.
    for pfn, reason in res['Value'].get( 'Failed', {} ).items():
      gLogger.error( "StageRequest._issuePrestageRequests: Failed to submit stage request.", "%s: %s" % ( pfn, reason ) )

    # Group the replica IDs by the request ID returned for their PFN
    stageRequestMetadata = {}
    updatedPfnIDs = []
    for pfn, requestID in res['Value']['Successful'].items():
      stageRequestMetadata.setdefault( requestID, [] ).append( pfnRepIDs[pfn] )
      updatedPfnIDs.append( pfnRepIDs[pfn] )
    if not stageRequestMetadata:
      return

    gLogger.info( "StageRequest._issuePrestageRequests: %s stage request metadata to be updated." % len( stageRequestMetadata ) )
    res = self.stagerClient.insertStageRequest( stageRequestMetadata, self.pinLifetime )
    if not res['OK']:
      gLogger.error( "StageRequest._issuePrestageRequests: Failed to insert stage request metadata.", res['Message'] )
      return res
    res = self.stagerClient.updateReplicaStatus( updatedPfnIDs, 'StageSubmitted' )
    if not res['OK']:
      gLogger.error( "StageRequest._issuePrestageRequests: Failed to update replica status.", res['Message'] )
    return

  def __sortBySE( self, replicaDict ):
    """ Group the replicas of replicaDict by storage element

        :param replicaDict: dict replicaID -> info dict with the keys
                            'LFN', 'SE', 'Size' and 'PFN'
        :return: S_OK( {'SEReplicas': {SE: [replicaID, ...]},
                        'AllReplicaInfo': {replicaID: {'LFN', 'PFN', 'Size', 'StorageElement'}}} )
    """
    seReplicas = {}
    replicaIDs = {}
    for replicaID, info in replicaDict.items():
      lfn = info['LFN']
      storageElement = info['SE']
      size = info['Size']
      pfn = info['PFN']
      replicaIDs[replicaID] = {'LFN':lfn, 'PFN':pfn, 'Size':size, 'StorageElement':storageElement}
      # setdefault instead of dict.has_key, which no longer exists in Python 3
      seReplicas.setdefault( storageElement, [] ).append( replicaID )
    return S_OK( {'SEReplicas':seReplicas, 'AllReplicaInfo':replicaIDs} )

  def __getStagedReplicas( self ):
    """ This obtains the Staged replicas from the Replicas table and for each LFN the requested storage element

        :return: the result of __sortBySE() on the replicas found, or the
                 failed result of stagerClient.getStagedReplicas()
    """
    # First obtain the Staged replicas from the Replicas table
    res = self.stagerClient.getStagedReplicas()
    if not res['OK']:
      gLogger.error( "StageRequest.__getStagedReplicas: Failed to get replicas with Staged status.", res['Message'] )
      return res
    if not res['Value']:
      gLogger.debug( "StageRequest.__getStagedReplicas: No Staged replicas found to process." )
    else:
      gLogger.debug( "StageRequest.__getStagedReplicas: Obtained %s Staged replicas(s) to process." % len( res['Value'] ) )

    return self.__sortBySE( res['Value'] )

  def __getWaitingReplicas( self ):
    """ This obtains the Waiting replicas from the Replicas table and for each LFN the requested storage element

        :return: the result of __sortBySE() on the replicas found, or the
                 failed result of stagerClient.getWaitingReplicas()
    """
    # First obtain the Waiting replicas from the Replicas table
    waiting = self.stagerClient.getWaitingReplicas()
    if not waiting['OK']:
      gLogger.error( "StageRequest.__getWaitingReplicas: Failed to get replicas with Waiting status.", waiting['Message'] )
      return waiting

    replicas = waiting['Value']
    if replicas:
      gLogger.debug( "StageRequest.__getWaitingReplicas: Obtained %s Waiting replicas(s) to process." % len( replicas ) )
    else:
      gLogger.debug( "StageRequest.__getWaitingReplicas: No Waiting replicas found to process." )

    return self.__sortBySE( replicas )

  def __getOfflineReplicas( self ):
    """ This obtains the Offline replicas from the Replicas table and for each LFN the requested storage element

        :return: the result of __sortBySE() on the replicas found, or the
                 failed result of stagerClient.getOfflineReplicas()
    """
    # First obtain the Offline replicas from the Replicas table
    res = self.stagerClient.getOfflineReplicas()
    if not res['OK']:
      gLogger.error( "StageRequest.__getOfflineReplicas: Failed to get replicas with Offline status.", res['Message'] )
      return res
    if not res['Value']:
      gLogger.debug( "StageRequest.__getOfflineReplicas: No Offline replicas found to process." )
    else:
      gLogger.debug( "StageRequest.__getOfflineReplicas: Obtained %s Offline replicas(s) to process." % len( res['Value'] ) )

    return self.__sortBySE( res['Value'] )

  def __addAssociatedReplicas( self, replicasToStage, seReplicas, allReplicaInfo ):
    """ Add to the current selection the Replicas that belong to the same Tasks

        :param replicasToStage: list of already selected replica IDs; extended in place
        :param seReplicas: dict { storageElement : [ replicaID ] }; extended in place
        :param allReplicaInfo: dict { replicaID : { 'LFN', 'PFN', 'Size', 'StorageElement' } };
                               updated with the associated replicas and then pruned in place
                               to the replicas that end up in replicasToStage
        :return: the failed result of getAssociatedReplicas, or
                 S_OK( { 'SEReplicas' : seReplicas, 'AllReplicaInfo' : allReplicaInfo } )
    """
    res = self.stagerClient.getAssociatedReplicas( replicasToStage )
    if not res['OK']:
      gLogger.fatal( "StageRequest.__addAssociatedReplicas: Failed to get associated Replicas.", res['Message'] )
      return res

    # Only associated replicas in Waiting or Offline status are considered
    addReplicas = {'Offline': {}, 'Waiting': {}}
    newReplicaInfo = {}
    for replicaID, info in res['Value'].items():
      storageElement = info['SE']
      replicaInfo = {'LFN':info['LFN'], 'PFN':info['PFN'], 'Size':info['Size'], 'StorageElement':storageElement }
      status = info['Status']
      if status not in ( 'Waiting', 'Offline' ):
        continue
      newReplicaInfo[replicaID] = replicaInfo
      addReplicas[status].setdefault( storageElement, [] ).append( replicaID )
    allReplicaInfo.update( newReplicaInfo )

    # Mirror of replicasToStage used for constant-time membership tests; kept in sync below
    selected = set( replicasToStage )

    # First handle Waiting Replicas for which metadata is to be checked
    for storageElement, seReplicaIDs in addReplicas['Waiting'].items():
      toCheck = [ replicaID for replicaID in seReplicaIDs if replicaID not in selected ]
      res = self.__checkIntegrity( storageElement, toCheck, allReplicaInfo )
      if not res['OK']:
        gLogger.error( 'StageRequest.__addAssociatedReplicas: Failed to check Replica Metadata', '(%s): %s' % ( storageElement, res['Message'] ) )
        continue
      # keep all Replicas (Online and Offline)
      checked = res['Value']['Online'] + res['Value']['Offline']
      seReplicas.setdefault( storageElement, [] ).extend( checked )
      replicasToStage.extend( checked )
      selected.update( checked )

    # Then handle Offline Replicas for which metadata is already checked
    for storageElement, seReplicaIDs in addReplicas['Offline'].items():
      newIDs = [ replicaID for replicaID in seReplicaIDs if replicaID not in selected ]
      seReplicas.setdefault( storageElement, [] ).extend( newIDs )
      replicasToStage.extend( newIDs )
      selected.update( newIDs )

    # Drop the info of replicas that were not selected; iterate over a copy of the keys
    # because entries are deleted while looping
    for replicaID in list( allReplicaInfo ):
      if replicaID not in selected:
        del allReplicaInfo[replicaID]

    totalSize = 0
    for storageElement in sorted( seReplicas.keys() ):
      size = 0
      for replicaID in seReplicas[storageElement]:
        # __add also accounts the size on the usage of the storage element
        size += self.__add( storageElement, allReplicaInfo[replicaID]['Size'] )

      gLogger.info( 'StageRequest.__addAssociatedReplicas:  Considering %s GB to be staged at %s' % ( size, storageElement ) )
      totalSize += size

    gLogger.info( "StageRequest.__addAssociatedReplicas: Obtained %s GB for staging." % totalSize )

    return S_OK( {'SEReplicas':seReplicas, 'AllReplicaInfo':allReplicaInfo} )

  def __checkIntegrity( self, storageElement, seReplicaIDs, allReplicaInfo ):
    """ Check the storage metadata of the replicas and classify them as Online or Offline

        Replicas with a size mismatch, declared Lost or Unavailable, or whose PFN does not
        exist are reported through updateReplicaFailure and left out of the result.
        Replicas that are not Cached get their status updated to 'Offline'.

        :param storageElement: storage element holding the replicas
        :param seReplicaIDs: replica IDs to check
        :param allReplicaInfo: dict { replicaID : info } providing 'PFN' and 'Size'
        :return: the failed result of getStorageFileMetadata, or
                 S_OK( { 'Online' : [ replicaID ], 'Offline' : [ replicaID ] } )
    """
    if not seReplicaIDs:
      return S_OK( {'Online': [], 'Offline': []} )

    pfnRepIDs = {}
    for replicaID in seReplicaIDs:
      pfnRepIDs[allReplicaInfo[replicaID]['PFN']] = replicaID

    gLogger.info( "StageRequest.__checkIntegrity: Checking the integrity of %s replicas at %s." % ( len( pfnRepIDs ), storageElement ) )
    res = self.replicaManager.getStorageFileMetadata( list( pfnRepIDs ), storageElement )
    if not res['OK']:
      gLogger.error( "StageRequest.__checkIntegrity: Completely failed to obtain metadata for replicas.", res['Message'] )
      return res

    terminalReplicaIDs = {}
    onlineReplicaIDs = []
    offlineReplicaIDs = []
    for pfn, metadata in res['Value']['Successful'].items():
      replicaID = pfnRepIDs[pfn]
      if metadata['Size'] != allReplicaInfo[replicaID]['Size']:
        gLogger.error( "StageRequest.__checkIntegrity: PFN StorageElement size does not match FileCatalog", pfn )
        terminalReplicaIDs[replicaID] = 'PFN StorageElement size does not match FileCatalog'
      elif metadata['Lost']:
        gLogger.error( "StageRequest.__checkIntegrity: PFN has been Lost by the StorageElement", pfn )
        terminalReplicaIDs[replicaID] = 'PFN has been Lost by the StorageElement'
      elif metadata['Unavailable']:
        gLogger.error( "StageRequest.__checkIntegrity: PFN is declared Unavailable by the StorageElement", pfn )
        terminalReplicaIDs[replicaID] = 'PFN is declared Unavailable by the StorageElement'
      elif metadata['Cached']:
        gLogger.verbose( "StageRequest.__checkIntegrity: Cache hit for file." )
        onlineReplicaIDs.append( replicaID )
      else:
        offlineReplicaIDs.append( replicaID )

    # Any other failure reason is not terminal: the replica is simply left out of this pass
    for pfn, reason in res['Value']['Failed'].items():
      if re.search( 'File does not exist', reason ):
        gLogger.error( "StageRequest.__checkIntegrity: PFN does not exist in the StorageElement", pfn )
        terminalReplicaIDs[pfnRepIDs[pfn]] = 'PFN does not exist in the StorageElement'

    # Update the states of the replicas in the database #TODO Sent status to integrity DB
    if terminalReplicaIDs:
      gLogger.info( "StageRequest.__checkIntegrity: %s replicas are terminally failed." % len( terminalReplicaIDs ) )
      res = self.stagerClient.updateReplicaFailure( terminalReplicaIDs )
      if not res['OK']:
        gLogger.error( "StageRequest.__checkIntegrity: Failed to update replica failures.", res['Message'] )
    if onlineReplicaIDs:
      gLogger.info( "StageRequest.__checkIntegrity: %s replicas found Online." % len( onlineReplicaIDs ) )
    if offlineReplicaIDs:
      gLogger.info( "StageRequest.__checkIntegrity: %s replicas found Offline." % len( offlineReplicaIDs ) )
      res = self.stagerClient.updateReplicaStatus( offlineReplicaIDs, 'Offline' )
      # Do not swallow a failed status update silently; the classification is still returned
      if not res['OK']:
        gLogger.error( "StageRequest.__checkIntegrity: Failed to update replica status.", res['Message'] )
    return S_OK( {'Online': onlineReplicaIDs, 'Offline': offlineReplicaIDs} )

  def __reportProblematicFiles( self, lfns, reason ):
    """ Report files as problematic to the data integrity client (currently short-circuited)

        The unconditional return on the first line means the method always returns S_OK()
        and never contacts the data integrity client: every statement after it is unreachable.
        NOTE(review): presumably the reporting was disabled on purpose - confirm before
        removing the early return or the dormant code.

        :param lfns: files to report (unused while the method is short-circuited)
        :param reason: reason to report (unused while the method is short-circuited)
    """
    # Early exit: nothing below this line is executed
    return S_OK()
    res = self.dataIntegrityClient.setFileProblematic( lfns, reason, sourceComponent = 'StageRequestAgent' )
    if not res['OK']:
      gLogger.error( "RequestPreparation.__reportProblematicFiles: Failed to report missing files.", res['Message'] )
      return res
    if res['Value']['Successful']:
      gLogger.info( "RequestPreparation.__reportProblematicFiles: Successfully reported %s missing files." % len( res['Value']['Successful'] ) )
    if res['Value']['Failed']:
      gLogger.info( "RequestPreparation.__reportProblematicFiles: Failed to report %s problematic files." % len( res['Value']['Failed'] ) )
    return res
Пример #17
0
class StageRequestAgent(AgentModule):
    def initialize(self):
        """Create the helper objects used by the agent and set its default options.

        Instantiates the replica manager, the data integrity client and the
        storage management DB handle, reads the 'PinLifetime' option and sets
        the 'shifterProxy' option. Always returns S_OK().
        """
        self.replicaManager = ReplicaManager()
        #self.stagerClient = StorageManagerClient()
        self.dataIntegrityClient = DataIntegrityClient()
        self.storageDB = StorageManagementDB()
        # pin lifetime = 1 day
        # (THROTTLING_TIME is passed as the default for the 'PinLifetime' option)
        self.pinLifetime = self.am_getOption('PinLifetime', THROTTLING_TIME)

        # This sets the Default Proxy to used as that defined under
        # /Operations/Shifter/DataManager
        # the shifterProxy option in the Configuration can be used to change this default.
        self.am_setOption('shifterProxy', 'DataManager')

        return S_OK()

    def execute(self):

        # Get the current submitted stage space and the amount of pinned space for each storage element
        res = self.getStorageUsage()
        if not res['OK']:
            return res

        return self.submitStageRequests()

    def getStorageUsage(self):
        """Reset the SE cache map and load the submitted stage/pin usage from the DB.

        The 'TotalSize' of every storage element entry is divided by 1e9 in
        place (reported as GB) before being logged. Returns the failed DB
        result on error, S_OK() otherwise.
        """
        self.storageElementCache = {}

        result = self.storageDB.getSubmittedStagePins()
        if not result['OK']:
            gLogger.fatal(
                "StageRequest.getStorageUsage: Failed to obtain submitted requests from StorageManagementDB.",
                result['Message'])
            return result

        self.storageElementUsage = result['Value']
        if not self.storageElementUsage:
            gLogger.info(
                "StageRequest.getStorageUsage: No active stage/pin requests found."
            )
            return S_OK()

        gLogger.info(
            "StageRequest.getStorageUsage: Active stage/pin requests found at the following sites:"
        )
        bytesPerGB = 1000 * 1000 * 1000.0
        for seName in sortList(self.storageElementUsage.keys()):
            usage = self.storageElementUsage[seName]
            # Convert to GB; the converted value is stored back, not only printed
            usage['TotalSize'] = usage['TotalSize'] / bytesPerGB
            replicaCount = str(usage['Replicas']).rjust(6)
            gLogger.info(
                "StageRequest.getStorageUsage: %s: %s replicas with a size of %.3f GB."
                % (seName.ljust(15), replicaCount, usage['TotalSize']))

        return S_OK()

    def submitStageRequests(self):
        """Select the Replicas to stage and issue the prestage requests.

        Manages the following transitions of the Replicas:
          * Waiting -> Offline (if the file is not found Cached)
          * Waiting -> StageSubmitted (if the file is found Cached)
          * Offline -> StageSubmitted (if there are not more Waiting replicas)

        Returns the first failed selection result, or S_OK() once all the
        prestage requests have been issued.
        """
        fetchFailed = "StageRequest.submitStageRequests: Failed to get replicas from StorageManagementDB."

        # Retry Replicas that have not been Staged in a previous attempt
        missing = self._getMissingReplicas()
        if not missing['OK']:
            gLogger.fatal(fetchFailed, missing['Message'])
            return missing
        seReplicas = missing['Value']['SEReplicas']
        allReplicaInfo = missing['Value']['AllReplicaInfo']

        if seReplicas:
            gLogger.info(
                "StageRequest.submitStageRequests: Completing partially Staged Tasks"
            )
        for seName, replicaIDs in seReplicas.items():
            gLogger.debug('Staging at %s:' % seName, replicaIDs)
            self._issuePrestageRequests(seName, replicaIDs, allReplicaInfo)

        # Waiting Replicas found Online plus all other Replicas from the same Tasks
        online = self._getOnlineReplicas()
        if not online['OK']:
            gLogger.fatal(fetchFailed, online['Message'])
            return online
        seReplicas = online['Value']['SEReplicas']
        allReplicaInfo = online['Value']['AllReplicaInfo']

        # Offline Replicas that fit in the Cache plus all other Replicas from the same Tasks
        offline = self._getOfflineReplicas()
        if not offline['OK']:
            gLogger.fatal(fetchFailed, offline['Message'])
            return offline

        # Merge the Offline selection into the Online one, without duplicating IDs
        for seName, replicaIDs in offline['Value']['SEReplicas'].items():
            if seName in seReplicas:
                known = seReplicas[seName]
                for replicaID in replicaIDs:
                    if replicaID not in known:
                        known.append(replicaID)
            else:
                seReplicas[seName] = replicaIDs
        allReplicaInfo.update(offline['Value']['AllReplicaInfo'])

        gLogger.info(
            "StageRequest.submitStageRequests: Obtained %s replicas for staging."
            % len(allReplicaInfo))
        for seName, replicaIDs in seReplicas.items():
            gLogger.debug('Staging at %s:' % seName, replicaIDs)
            self._issuePrestageRequests(seName, replicaIDs, allReplicaInfo)
        return S_OK()

    def _getMissingReplicas(self):
        """Recover Replicas left behind by a previous staging attempt.

        Takes the Replicas returned by __getStagedReplicas for every SE (even
        if its Cache is full) and asks __addAssociatedReplicas for the other
        Replicas of the same Tasks. If left behind, those Replicas can produce
        a deadlock. Returns the result of the first failing step or of
        __addAssociatedReplicas.
        """
        gLogger.info(
            'StageRequest._getMissingReplicas: Checking Staged Replicas')

        staged = self.__getStagedReplicas()
        if not staged['OK']:
            gLogger.fatal(
                "StageRequest._getMissingReplicas: Failed to get replicas from StorageManagementDB.",
                staged['Message'])
            return staged

        allReplicaInfo = staged['Value']['AllReplicaInfo']
        seReplicas = {}
        # Consider the Replicas of all SEs
        replicasToStage = []
        for stagedIDs in staged['Value']['SEReplicas'].values():
            replicasToStage.extend(stagedIDs)

        # Get Replicas from the same Tasks as those selected
        associated = self.__addAssociatedReplicas(replicasToStage, seReplicas,
                                                  allReplicaInfo)
        if not associated['OK']:
            gLogger.fatal(
                "StageRequest._getMissingReplicas: Failed to get associated Replicas.",
                associated['Message'])

        return associated

    def _getOnlineReplicas(self):
        """Select the Waiting Replicas that are found Online.

        Manages the transition Waiting -> Offline (if the file is not found
        Cached) through __checkIntegrity and keeps the Cached Replicas, then
        adds the other Replicas of the same Tasks. SEs whose current usage is
        not below their cache limit are not considered.
        """
        gLogger.verbose(
            'StageRequest._getOnlineReplicas: Checking Online Replicas to be handled'
        )

        waiting = self.__getWaitingReplicas()
        if not waiting['OK']:
            gLogger.fatal(
                "StageRequest._getOnlineReplicas: Failed to get replicas from StorageManagementDB.",
                waiting['Message'])
            return waiting

        allReplicaInfo = waiting['Value']['AllReplicaInfo']
        if not allReplicaInfo:
            gLogger.info(
                "StageRequest._getOnlineReplicas: There were no Waiting replicas found"
            )
            return waiting
        gLogger.info(
            "StageRequest._getOnlineReplicas: Obtained %s replicas Waiting for staging."
            % len(allReplicaInfo))

        seReplicas = {}
        replicasToStage = []
        for seName, waitingIDs in waiting['Value']['SEReplicas'].items():
            currentUsage = self.__usage(seName)
            cacheLimit = self.__cache(seName)
            if not currentUsage < cacheLimit:
                # Do not consider those SE that have the Cache full
                gLogger.info(
                    'StageRequest._getOnlineReplicas: Skipping %s, current usage above limit ( %s GB )'
                    % (seName, cacheLimit))
                continue

            # Check the Replica Metadata and find out which ones are Online
            checked = self.__checkIntegrity(seName, waitingIDs, allReplicaInfo)
            if checked['OK']:
                # keep only Online Replicas
                onlineIDs = checked['Value']['Online']
                seReplicas[seName] = onlineIDs
                replicasToStage.extend(onlineIDs)
            else:
                gLogger.error(
                    'StageRequest._getOnlineReplicas: Failed to check Replica Metadata',
                    '(%s): %s' % (seName, checked['Message']))

        # Get Replicas from the same Tasks as those selected
        associated = self.__addAssociatedReplicas(replicasToStage, seReplicas,
                                                  allReplicaInfo)
        if not associated['OK']:
            gLogger.fatal(
                "StageRequest._getOnlineReplicas: Failed to get associated Replicas.",
                associated['Message'])

        return associated

    def _getOfflineReplicas(self):
        """Select the Offline Replicas to be Staged.

        For every SE whose usage is below its cache limit, Replicas are taken
        in sorted ID order and their size is added to the SE usage until the
        limit is reached; the other Replicas of the same Tasks are then added.

        NOTE(review): __addAssociatedReplicas calls __add again for every
        Replica in seReplicas, so the sizes added here look like they are
        counted twice on the SE usage - confirm whether that is intended.
        """
        gLogger.verbose(
            'StageRequest._getOfflineReplicas: Checking Offline Replicas to be handled'
        )

        offline = self.__getOfflineReplicas()
        if not offline['OK']:
            gLogger.fatal(
                "StageRequest._getOfflineReplicas: Failed to get replicas from StorageManagementDB.",
                offline['Message'])
            return offline

        allReplicaInfo = offline['Value']['AllReplicaInfo']
        if not allReplicaInfo:
            gLogger.info(
                "StageRequest._getOfflineReplicas: There were no Offline replicas found"
            )
            return offline
        gLogger.info(
            "StageRequest._getOfflineReplicas: Obtained %s replicas Offline for staging."
            % len(allReplicaInfo))

        seReplicas = {}
        replicasToStage = []
        for seName, offlineIDs in offline['Value']['SEReplicas'].items():
            currentUsage = self.__usage(seName)
            cacheLimit = self.__cache(seName)
            if not currentUsage < cacheLimit:
                # Do not consider those SE that have the Cache full
                gLogger.info(
                    'StageRequest._getOfflineReplicas: Skipping %s, current usage above limit ( %s GB )'
                    % (seName, cacheLimit))
                continue

            chosen = seReplicas[seName] = []
            for replicaID in sorted(offlineIDs):
                chosen.append(replicaID)
                replicasToStage.append(replicaID)
                self.__add(seName, allReplicaInfo[replicaID]['Size'])
                if not self.__usage(seName) < cacheLimit:
                    # Stop adding Replicas when the cache is full
                    break

        # Get Replicas from the same Tasks as those selected
        associated = self.__addAssociatedReplicas(replicasToStage, seReplicas,
                                                  allReplicaInfo)
        if not associated['OK']:
            gLogger.fatal(
                "StageRequest._getOfflineReplicas: Failed to get associated Replicas.",
                associated['Message'])

        return associated

    def __usage(self, storageElement):
        """ Retrieve current usage of SE
    """
        if not storageElement in self.storageElementUsage:
            self.storageElementUsage[storageElement] = {'TotalSize': 0.}
        return self.storageElementUsage[storageElement]['TotalSize']

    def __cache(self, storageElement):
        """ Retrieve cache size for SE
    """
        if not storageElement in self.storageElementCache:
            self.storageElementCache[storageElement] = gConfig.getValue(
                "/Resources/StorageElements/%s/DiskCacheTB" % storageElement,
                1.) * 1000. / THROTTLING_STEPS
        return self.storageElementCache[storageElement]

    def __add(self, storageElement, size):
        """ Add size (in bytes) to current usage of storageElement (in GB)
    """
        if not storageElement in self.storageElementUsage:
            self.storageElementUsage[storageElement] = {'TotalSize': 0.}
        size = size / (1000 * 1000 * 1000.0)
        self.storageElementUsage[storageElement]['TotalSize'] += size
        return size

    def _issuePrestageRequests(self, storageElement, seReplicaIDs,
                               allReplicaInfo):
        """Make the prestage request to the SE and record it in the DB.

        :param storageElement: storage element the replicas are staged at
        :param seReplicaIDs: replica IDs to stage at that storage element
        :param allReplicaInfo: dict { replicaID : info } providing the 'PFN'
        :return: the failed result of insertStageRequest when that call
                 fails, None in every other case
        """
        pfnRepIDs = {}
        for replicaID in seReplicaIDs:
            pfnRepIDs[allReplicaInfo[replicaID]['PFN']] = replicaID
        if not pfnRepIDs:
            return

        # Now issue the prestage requests for the replicas
        gLogger.info(
            "StageRequest._issuePrestageRequests: Submitting %s stage requests for %s."
            % (len(pfnRepIDs), storageElement))
        res = self.replicaManager.prestageStorageFile(
            list(pfnRepIDs), storageElement, lifetime=self.pinLifetime)
        gLogger.debug(
            "StageRequest._issuePrestageRequests: replicaManager.prestageStorageFile: res=",
            res)
        if not res['OK']:
            gLogger.error(
                "StageRequest._issuePrestageRequests: Completely failed to submit stage requests for replicas.",
                res['Message'])
            return

        # Group the successfully submitted replicas by the request ID they got
        stageRequestMetadata = {}
        updatedPfnIDs = []
        for pfn, requestID in res['Value']['Successful'].items():
            replicaID = pfnRepIDs[pfn]
            stageRequestMetadata.setdefault(requestID, []).append(replicaID)
            updatedPfnIDs.append(replicaID)

        # The original author noted that the call can return OK while PFNs are
        # reported under 'Failed' (e.g. space token not accessible): make
        # those failures visible instead of dropping them silently
        for pfn, reason in res['Value'].get('Failed', {}).items():
            gLogger.error(
                "StageRequest._issuePrestageRequests: Failed to submit stage request.",
                "%s: %s" % (pfn, reason))

        if not stageRequestMetadata:
            return

        gLogger.info(
            "StageRequest._issuePrestageRequests: %s stage request metadata to be updated."
            % len(stageRequestMetadata))
        res = self.storageDB.insertStageRequest(stageRequestMetadata,
                                                self.pinLifetime)
        if not res['OK']:
            gLogger.error(
                "StageRequest._issuePrestageRequests: Failed to insert stage request metadata.",
                res['Message'])
            return res
        res = self.storageDB.updateReplicaStatus(updatedPfnIDs,
                                                 'StageSubmitted')
        if not res['OK']:
            gLogger.error(
                "StageRequest._issuePrestageRequests: Failed to update replica status.",
                res['Message'])
        return

    def __sortBySE(self, replicaDict):
        """Group the replica IDs by storage element.

        :param replicaDict: dict { replicaID : info } where info provides
                            'LFN', 'SE', 'Size' and 'PFN'; an empty or None
                            value yields empty results
        :return: S_OK( { 'SEReplicas' : { storageElement : [ replicaID ] },
                 'AllReplicaInfo' : { replicaID : { 'LFN', 'PFN', 'Size',
                 'StorageElement' } } } )
        """
        seReplicas = {}
        replicaIDs = {}
        # Callers pass the DB value through even when it is empty
        for replicaID, info in (replicaDict or {}).items():
            storageElement = info['SE']
            replicaIDs[replicaID] = {
                'LFN': info['LFN'],
                'PFN': info['PFN'],
                'Size': info['Size'],
                'StorageElement': storageElement
            }
            seReplicas.setdefault(storageElement, []).append(replicaID)
        return S_OK({'SEReplicas': seReplicas, 'AllReplicaInfo': replicaIDs})

    def __getStagedReplicas(self):
        """Obtain the Staged replicas from the DB, grouped by storage element.

        :return: the failed result of storageDB.getStagedReplicas() if it
                 did not succeed, otherwise the result of __sortBySE
        """
        # Obtain the Staged replicas from the Replicas table
        res = self.storageDB.getStagedReplicas()
        if not res['OK']:
            gLogger.error(
                "StageRequest.__getStagedReplicas: Failed to get replicas with Staged status.",
                res['Message'])
            return res
        # The messages name the Staged status: no Waiting replicas are handled here
        if not res['Value']:
            gLogger.debug(
                "StageRequest.__getStagedReplicas: No Staged replicas found to process."
            )
        else:
            gLogger.debug(
                "StageRequest.__getStagedReplicas: Obtained %s Staged replicas(s) to process."
                % len(res['Value']))

        return self.__sortBySE(res['Value'])

    def __getWaitingReplicas(self):
        """Fetch the Waiting replicas from the DB, grouped by storage element.

        A failed DB result is logged and returned unchanged; otherwise the
        obtained replicas are passed to __sortBySE and its result is returned.
        """
        result = self.storageDB.getWaitingReplicas()
        if not result['OK']:
            gLogger.error(
                "StageRequest.__getWaitingReplicas: Failed to get replicas with Waiting status.",
                result['Message'])
            return result

        waiting = result['Value']
        if waiting:
            gLogger.debug(
                "StageRequest.__getWaitingReplicas: Obtained %s Waiting replicas(s) to process."
                % len(waiting))
        else:
            gLogger.debug(
                "StageRequest.__getWaitingReplicas: No Waiting replicas found to process."
            )
        return self.__sortBySE(waiting)

    def __getOfflineReplicas(self):
        """Obtain the Offline replicas from the DB, grouped by storage element.

        :return: the failed result of storageDB.getOfflineReplicas() if it
                 did not succeed, otherwise the result of __sortBySE
        """
        # Obtain the Offline replicas from the Replicas table
        res = self.storageDB.getOfflineReplicas()
        if not res['OK']:
            gLogger.error(
                "StageRequest.__getOfflineReplicas: Failed to get replicas with Offline status.",
                res['Message'])
            return res
        # The messages name the Offline status: no Waiting replicas are handled here
        if not res['Value']:
            gLogger.debug(
                "StageRequest.__getOfflineReplicas: No Offline replicas found to process."
            )
        else:
            gLogger.debug(
                "StageRequest.__getOfflineReplicas: Obtained %s Offline replicas(s) to process."
                % len(res['Value']))

        return self.__sortBySE(res['Value'])

    def __addAssociatedReplicas(self, replicasToStage, seReplicas,
                                allReplicaInfo):
        """Add to the selection the Replicas that belong to the same Tasks.

        :param replicasToStage: list of already selected replica IDs;
                                extended in place
        :param seReplicas: dict { storageElement : [ replicaID ] }; extended
                           in place
        :param allReplicaInfo: dict { replicaID : { 'LFN', 'PFN', 'Size',
                               'StorageElement' } }; updated with the
                               associated replicas and then pruned in place to
                               the replicas that end up in replicasToStage
        :return: the failed result of getAssociatedReplicas, or S_OK( {
                 'SEReplicas' : seReplicas, 'AllReplicaInfo' : allReplicaInfo } )
        """
        res = self.storageDB.getAssociatedReplicas(replicasToStage)
        if not res['OK']:
            gLogger.fatal(
                "StageRequest.__addAssociatedReplicas: Failed to get associated Replicas.",
                res['Message'])
            return res

        # Only associated replicas in Waiting or Offline status are considered
        addReplicas = {'Offline': {}, 'Waiting': {}}
        newReplicaInfo = {}
        for replicaID, info in res['Value'].items():
            storageElement = info['SE']
            replicaInfo = {
                'LFN': info['LFN'],
                'PFN': info['PFN'],
                'Size': info['Size'],
                'StorageElement': storageElement
            }
            status = info['Status']
            if status not in ('Waiting', 'Offline'):
                continue
            newReplicaInfo[replicaID] = replicaInfo
            addReplicas[status].setdefault(storageElement,
                                           []).append(replicaID)
        allReplicaInfo.update(newReplicaInfo)

        # Mirror of replicasToStage used for constant-time membership tests;
        # kept in sync with the list below
        selected = set(replicasToStage)

        # First handle Waiting Replicas for which metadata is to be checked
        for storageElement, seReplicaIDs in addReplicas['Waiting'].items():
            toCheck = [
                replicaID for replicaID in seReplicaIDs
                if replicaID not in selected
            ]
            res = self.__checkIntegrity(storageElement, toCheck,
                                        allReplicaInfo)
            if not res['OK']:
                gLogger.error(
                    'StageRequest.__addAssociatedReplicas: Failed to check Replica Metadata',
                    '(%s): %s' % (storageElement, res['Message']))
                continue
            # keep all Replicas (Online and Offline)
            checked = res['Value']['Online'] + res['Value']['Offline']
            seReplicas.setdefault(storageElement, []).extend(checked)
            replicasToStage.extend(checked)
            selected.update(checked)

        # Then handle Offline Replicas for which metadata is already checked
        for storageElement, seReplicaIDs in addReplicas['Offline'].items():
            newIDs = [
                replicaID for replicaID in seReplicaIDs
                if replicaID not in selected
            ]
            seReplicas.setdefault(storageElement, []).extend(newIDs)
            replicasToStage.extend(newIDs)
            selected.update(newIDs)

        # Drop the info of replicas that were not selected; iterate over a
        # copy of the keys because entries are deleted while looping
        for replicaID in list(allReplicaInfo):
            if replicaID not in selected:
                del allReplicaInfo[replicaID]

        totalSize = 0
        for storageElement in sorted(seReplicas.keys()):
            size = 0
            for replicaID in seReplicas[storageElement]:
                # __add also accounts the size on the usage of the SE
                size += self.__add(storageElement,
                                   allReplicaInfo[replicaID]['Size'])

            gLogger.info(
                'StageRequest.__addAssociatedReplicas:  Considering %s GB to be staged at %s'
                % (size, storageElement))
            totalSize += size

        gLogger.info(
            "StageRequest.__addAssociatedReplicas: Obtained %s GB for staging."
            % totalSize)

        return S_OK({
            'SEReplicas': seReplicas,
            'AllReplicaInfo': allReplicaInfo
        })

    def __checkIntegrity(self, storageElement, seReplicaIDs, allReplicaInfo):
        """Check the storage metadata of the replicas and classify them.

        Replicas with a size mismatch, declared Lost or Unavailable, or whose
        PFN does not exist are reported through updateReplicaFailure and left
        out of the result. Replicas that are not Cached get their status
        updated to 'Offline' for a later pass.

        :param storageElement: storage element holding the replicas
        :param seReplicaIDs: replica IDs to check
        :param allReplicaInfo: dict { replicaID : info } providing 'PFN' and
                               'Size'
        :return: the failed result of getStorageFileMetadata, or S_OK( {
                 'Online' : [ replicaID ], 'Offline' : [ replicaID ] } )
        """
        if not seReplicaIDs:
            return S_OK({'Online': [], 'Offline': []})

        pfnRepIDs = {}
        for replicaID in seReplicaIDs:
            pfnRepIDs[allReplicaInfo[replicaID]['PFN']] = replicaID

        gLogger.info(
            "StageRequest.__checkIntegrity: Checking the integrity of %s replicas at %s."
            % (len(pfnRepIDs), storageElement))
        res = self.replicaManager.getStorageFileMetadata(
            list(pfnRepIDs), storageElement)
        if not res['OK']:
            gLogger.error(
                "StageRequest.__checkIntegrity: Completely failed to obtain metadata for replicas.",
                res['Message'])
            return res

        terminalReplicaIDs = {}
        onlineReplicaIDs = []
        offlineReplicaIDs = []
        for pfn, metadata in res['Value']['Successful'].items():
            replicaID = pfnRepIDs[pfn]
            if metadata['Size'] != allReplicaInfo[replicaID]['Size']:
                gLogger.error(
                    "StageRequest.__checkIntegrity: PFN StorageElement size does not match FileCatalog",
                    pfn)
                terminalReplicaIDs[
                    replicaID] = 'PFN StorageElement size does not match FileCatalog'
            elif metadata['Lost']:
                gLogger.error(
                    "StageRequest.__checkIntegrity: PFN has been Lost by the StorageElement",
                    pfn)
                terminalReplicaIDs[
                    replicaID] = 'PFN has been Lost by the StorageElement'
            elif metadata['Unavailable']:
                gLogger.error(
                    "StageRequest.__checkIntegrity: PFN is declared Unavailable by the StorageElement",
                    pfn)
                terminalReplicaIDs[
                    replicaID] = 'PFN is declared Unavailable by the StorageElement'
            elif metadata['Cached']:
                gLogger.verbose(
                    "StageRequest.__checkIntegrity: Cache hit for file.")
                onlineReplicaIDs.append(replicaID)
            else:
                offlineReplicaIDs.append(replicaID)

        # Any other failure reason is not terminal: the replica is simply
        # left out of this pass
        for pfn, reason in res['Value']['Failed'].items():
            if re.search('File does not exist', reason):
                gLogger.error(
                    "StageRequest.__checkIntegrity: PFN does not exist in the StorageElement",
                    pfn)
                terminalReplicaIDs[
                    pfnRepIDs[pfn]] = 'PFN does not exist in the StorageElement'

        # Update the states of the replicas in the database #TODO Sent status to integrity DB
        if terminalReplicaIDs:
            gLogger.info(
                "StageRequest.__checkIntegrity: %s replicas are terminally failed."
                % len(terminalReplicaIDs))
            res = self.storageDB.updateReplicaFailure(terminalReplicaIDs)
            if not res['OK']:
                gLogger.error(
                    "StageRequest.__checkIntegrity: Failed to update replica failures.",
                    res['Message'])
        if onlineReplicaIDs:
            gLogger.info(
                "StageRequest.__checkIntegrity: %s replicas found Online." %
                len(onlineReplicaIDs))
        if offlineReplicaIDs:
            gLogger.info(
                "StageRequest.__checkIntegrity: %s replicas found Offline." %
                len(offlineReplicaIDs))
            res = self.storageDB.updateReplicaStatus(offlineReplicaIDs,
                                                     'Offline')
            # Do not swallow a failed status update silently; the
            # classification is still returned
            if not res['OK']:
                gLogger.error(
                    "StageRequest.__checkIntegrity: Failed to update replica status.",
                    res['Message'])
        return S_OK({'Online': onlineReplicaIDs, 'Offline': offlineReplicaIDs})

    def __reportProblematicFiles(self, lfns, reason):
        """Report files as problematic to the data integrity client (short-circuited).

        The unconditional return on the first line means the method always
        returns S_OK() and never contacts the data integrity client: every
        statement after it is unreachable.
        NOTE(review): presumably the reporting was disabled on purpose -
        confirm before removing the early return or the dormant code.

        :param lfns: files to report (unused while short-circuited)
        :param reason: reason to report (unused while short-circuited)
        """
        # Early exit: nothing below this line is executed
        return S_OK()
        res = self.dataIntegrityClient.setFileProblematic(
            lfns, reason, self.name)
        if not res['OK']:
            gLogger.error(
                "RequestPreparation.__reportProblematicFiles: Failed to report missing files.",
                res['Message'])
            return res
        if res['Value']['Successful']:
            gLogger.info(
                "RequestPreparation.__reportProblematicFiles: Successfully reported %s missing files."
                % len(res['Value']['Successful']))
        if res['Value']['Failed']:
            gLogger.info(
                "RequestPreparation.__reportProblematicFiles: Failed to report %s problematic files."
                % len(res['Value']['Failed']))
        return res
class RequestPreparationAgent( AgentModule ):

  def initialize( self ):
    """ Create the clients used by the agent and select the default shifter proxy.

    :return: S_OK()
    """
    self.fileCatalog = FileCatalog()
    self.dm = DataManager()
    self.stagerClient = StorageManagerClient()
    self.dataIntegrityClient = DataIntegrityClient()

    # The default proxy is the one defined under /Operations/Shifter/DataManager;
    # the shifterProxy option in the Configuration can be used to change it.
    self.am_setOption( "shifterProxy", "DataManager" )
    return S_OK()

  def execute( self ):
    """ Manage the New->Waiting transition of the Replicas (first logical task to be executed).

    The New replicas are checked against the FileCatalog: the LFN must exist, have a
    non-zero size, have registered replicas and be registered at the requested SE.
    Replicas failing one of these checks are passed to updateReplicaFailure, the
    metadata of the remaining ones is passed to updateReplicaInformation.

    :return: S_OK(), or the failed result of the lookup that could not be performed
    """
    res = self.__getNewReplicas()
    if not res['OK']:
      gLogger.fatal( "RequestPreparation.prepareNewReplicas: Failed to get replicas from StagerDB.", res['Message'] )
      return res
    if not res['Value']:
      gLogger.info( "There were no New replicas found" )
      return res
    replicas = res['Value']['Replicas']
    replicaIDs = res['Value']['ReplicaIDs']
    gLogger.info( "RequestPreparation.prepareNewReplicas: Obtained %s New replicas for preparation." % len( replicaIDs ) )

    # replicaID -> reason for every replica that can never be staged
    terminalReplicaIDs = {}

    def flagTerminal( terminalFiles ):
      """ Record all replicas of the given LFNs as terminal and drop the LFNs from replicas """
      for lfn, reason in terminalFiles.items():
        for replicaID in replicas.pop( lfn ).values():
          terminalReplicaIDs[replicaID] = reason

    # Check if the files exist in the FileCatalog
    res = self.__getExistingFiles( replicas )
    if not res['OK']:
      return res
    exist = res['Value']['Exist']
    terminal = res['Value']['Missing']
    failed = res['Value']['Failed']
    if not exist:
      gLogger.error( 'RequestPreparation.prepareNewReplicas: Failed to determine the existence of any file' )
      return S_OK()
    flagTerminal( terminal )
    gLogger.info( "RequestPreparation.prepareNewReplicas: %s files exist in the FileCatalog." % len( exist ) )
    if terminal:
      gLogger.info( "RequestPreparation.prepareNewReplicas: %s files do not exist in the FileCatalog." % len( terminal ) )

    # Obtain the file sizes from the FileCatalog
    res = self.__getFileSize( exist )
    if not res['OK']:
      return res
    failed.update( res['Value']['Failed'] )
    terminal = res['Value']['ZeroSize']
    fileSizes = res['Value']['FileSizes']
    if not fileSizes:
      gLogger.error( 'RequestPreparation.prepareNewReplicas: Failed determine sizes of any files' )
      return S_OK()
    flagTerminal( terminal )
    gLogger.info( "RequestPreparation.prepareNewReplicas: Obtained %s file sizes from the FileCatalog." % len( fileSizes ) )
    if terminal:
      gLogger.info( "RequestPreparation.prepareNewReplicas: %s files registered with zero size in the FileCatalog." % len( terminal ) )

    # Obtain the replicas from the FileCatalog (a real list is passed, not a dict view)
    res = self.__getFileReplicas( list( fileSizes ) )
    if not res['OK']:
      return res
    failed.update( res['Value']['Failed'] )
    terminal = res['Value']['ZeroReplicas']
    fileReplicas = res['Value']['Replicas']
    if not fileReplicas:
      gLogger.error( 'RequestPreparation.prepareNewReplicas: Failed determine replicas for any files' )
      return S_OK()
    flagTerminal( terminal )
    gLogger.info( "RequestPreparation.prepareNewReplicas: Obtained replica information for %s file from the FileCatalog." % len( fileReplicas ) )
    if terminal:
      gLogger.info( "RequestPreparation.prepareNewReplicas: %s files registered with zero replicas in the FileCatalog." % len( terminal ) )

    # Lookups that failed are neither terminal nor prepared in this pass; make them visible
    if failed:
      gLogger.warn( "RequestPreparation.prepareNewReplicas: %s files could not be checked against the FileCatalog." % len( failed ) )

    # Check the replicas exist at the requested site
    replicaMetadata = []
    for lfn, requestedSEs in replicas.items():
      lfnReplicas = fileReplicas.get( lfn )

      # This should not happen in principle, but it was seen
      # after a corrupted staging request has entered the DB
      if not lfnReplicas:
        gLogger.error( "Missing replicas information", "%s %s" % ( lfn, requestedSEs ) )
        continue

      # Iterate over a snapshot: entries are removed from requestedSEs inside the loop,
      # which would raise "dictionary changed size during iteration" on a live view
      for requestedSE, replicaID in list( requestedSEs.items() ):
        if requestedSE not in lfnReplicas:
          terminalReplicaIDs[replicaID] = "LFN not registered at requested SE"
          requestedSEs.pop( requestedSE )
        else:
          replicaMetadata.append( ( replicaID, lfnReplicas[requestedSE], fileSizes[lfn] ) )

    # Update the states of the files in the database
    if terminalReplicaIDs:
      gLogger.info( "RequestPreparation.prepareNewReplicas: %s replicas are terminally failed." % len( terminalReplicaIDs ) )
      res = self.stagerClient.updateReplicaFailure( terminalReplicaIDs )
      if not res['OK']:
        gLogger.error( "RequestPreparation.prepareNewReplicas: Failed to update replica failures.", res['Message'] )
    if replicaMetadata:
      gLogger.info( "RequestPreparation.prepareNewReplicas: %s replica metadata to be updated." % len( replicaMetadata ) )
      # Sets the Status='Waiting' of CacheReplicas records that are OK with catalogue checks
      res = self.stagerClient.updateReplicaInformation( replicaMetadata )
      if not res['OK']:
        gLogger.error( "RequestPreparation.prepareNewReplicas: Failed to update replica metadata.", res['Message'] )
    return S_OK()

  def __getNewReplicas( self ):
    """ Fetch the CacheReplicas records in New status and index them by LFN and by replica ID.

    :return: S_OK( {'Replicas': {lfn: {se: replicaID}}, 'ReplicaIDs': {replicaID: (lfn, se)}} ),
             S_OK() without value when no New replica is found, or the failed query result
    """
    query = self.stagerClient.getCacheReplicas( {'Status':'New'} )
    if not query['OK']:
      gLogger.error( "RequestPreparation.__getNewReplicas: Failed to get replicas with New status.", query['Message'] )
      return query
    newReplicas = query['Value']
    if not newReplicas:
      gLogger.debug( "RequestPreparation.__getNewReplicas: No New replicas found to process." )
      return S_OK()
    gLogger.debug( "RequestPreparation.__getNewReplicas: Obtained %s New replicas(s) to process." % len( newReplicas ) )

    replicaIDsBySE = {}
    requestByID = {}
    for replicaID, record in newReplicas.items():
      lfn = record['LFN']
      se = record['SE']
      if lfn not in replicaIDsBySE:
        replicaIDsBySE[lfn] = {}
      replicaIDsBySE[lfn][se] = replicaID
      requestByID[replicaID] = ( lfn, se )
    return S_OK( {'Replicas':replicaIDsBySE, 'ReplicaIDs':requestByID} )

  def __getExistingFiles( self, lfns ):
    """ Check which of the supplied LFNs exist in the FileCatalog.

    :param lfns: iterable of LFNs (duplicates are removed before the catalog is queried)
    :return: S_OK( {'Exist': [lfn, ...], 'Missing': {lfn: reason}, 'Failed': ...} ) where 'Failed'
             is passed through from the catalog reply, or the failed catalog result
    """
    uniqueLFNs = list( set( lfns ) )
    res = self.fileCatalog.exists( uniqueLFNs )
    if not res['OK']:
      gLogger.error( "RequestPreparation.__getExistingFiles: Failed to determine whether files exist.", res['Message'] )
      return res
    failed = res['Value']['Failed']
    success = res['Value']['Successful']

    exist = []
    for lfn, exists in success.items():
      if exists:
        exist.append( lfn )

    # Everything the catalog answered for, but reported as not existing
    notRegistered = list( set( success ) - set( exist ) )
    missing = {}
    if notRegistered:
      reason = 'LFN not registered in the FC'
      gLogger.warn( "RequestPreparation.__getExistingFiles: %s" % reason, '\n'.join( [''] + notRegistered ) )
      self.__reportProblematicFiles( notRegistered, 'LFN-LFC-DoesntExist' )
      missing = dict.fromkeys( notRegistered, reason )
    return S_OK( {'Exist':exist, 'Missing':missing, 'Failed':failed} )

  def __getFileSize( self, lfns ):
    """ Get the size of the supplied LFNs from the FileCatalog and single out zero-size files.

    :param lfns: LFNs to look up
    :return: S_OK( {'FileSizes': {lfn: size}, 'ZeroSize': {lfn: reason}, 'Failed': ...} ) where
             'Failed' is passed through from the catalog reply, or the failed catalog result
    """
    res = self.fileCatalog.getFileSize( lfns )
    if not res['OK']:
      gLogger.error( "RequestPreparation.__getFileSize: Failed to get sizes for files.", res['Message'] )
      return res
    failed = res['Value']['Failed']

    zeroSizeReason = "LFN registered with zero size in the FileCatalog"
    fileSizes = {}
    zeroSize = {}
    for lfn, size in res['Value']['Successful'].items():
      if size == 0:
        zeroSize[lfn] = zeroSizeReason
        continue
      fileSizes[lfn] = size

    if zeroSize:
      for lfn in zeroSize:
        gLogger.warn( "RequestPreparation.__getFileSize: %s" % zeroSizeReason, lfn )
      self.__reportProblematicFiles( zeroSize.keys(), 'LFN-LFC-ZeroSize' )
    return S_OK( {'FileSizes':fileSizes, 'ZeroSize':zeroSize, 'Failed':failed} )

  def __getFileReplicas( self, lfns ):
    """ Get the active replicas of the supplied LFNs and single out files without any replica.

    :param lfns: LFNs to look up
    :return: S_OK( {'Replicas': {lfn: replicas}, 'ZeroReplicas': {lfn: reason}, 'Failed': ...} ) where
             'Failed' is passed through from the getActiveReplicas reply, or the failed result
    """
    res = self.dm.getActiveReplicas( lfns )
    if not res['OK']:
      gLogger.error( "RequestPreparation.__getFileReplicas: Failed to obtain file replicas.", res['Message'] )
      return res
    failed = res['Value']['Failed']

    noReplicasReason = "LFN registered with zero replicas in the FileCatalog"
    replicas = {}
    noReplicas = {}
    for lfn, lfnReplicas in res['Value']['Successful'].items():
      if len( lfnReplicas.keys() ) > 0:
        replicas[lfn] = lfnReplicas
      else:
        noReplicas[lfn] = noReplicasReason

    if noReplicas:
      for lfn in noReplicas:
        gLogger.warn( "RequestPreparation.__getFileReplicas: %s" % noReplicasReason, lfn )
      self.__reportProblematicFiles( noReplicas.keys(), 'LFN-LFC-NoReplicas' )
    return S_OK( {'Replicas':replicas, 'ZeroReplicas':noReplicas, 'Failed':failed} )

  def __reportProblematicFiles( self, lfns, reason ):
    """ Report problematic LFNs through the data integrity client.

    NOTE: reporting is switched off here. The method returns S_OK() on its first
    statement, so the reporting code below it is never executed (presumably kept
    so that reporting can be re-enabled later -- confirm).

    :param lfns: LFNs to declare problematic
    :param reason: prognosis handed to setFileProblematic
    :return: S_OK()
    """
    return S_OK()
    # Unreachable while the early return above is in place.
    report = self.dataIntegrityClient.setFileProblematic( lfns, reason, sourceComponent = 'RequestPreparationAgent' )
    if not report['OK']:
      gLogger.error( "RequestPreparation.__reportProblematicFiles: Failed to report missing files.", report['Message'] )
      return report
    reported = report['Value']['Successful']
    if reported:
      gLogger.info( "RequestPreparation.__reportProblematicFiles: Successfully reported %s missing files." % len( reported ) )
    notReported = report['Value']['Failed']
    if notReported:
      gLogger.info( "RequestPreparation.__reportProblematicFiles: Failed to report %s problematic files." % len( notReported ) )
    return report
Пример #19
0
class ValidateOutputDataAgent(AgentModule):
    def __init__(self, *args, **kwargs):
        """ Constructor: create the clients and read the agent options.

        The transformation types come from the 'TransformationTypes' agent option and,
        when that is empty, from 'Transformations/DataProcessing' in the Operations section.
        """
        AgentModule.__init__(self, *args, **kwargs)

        self.consistencyInspector = ConsistencyInspector()
        self.integrityClient = DataIntegrityClient()
        self.fc = FileCatalog()
        self.transClient = TransformationClient()
        self.fileCatalogClient = FileCatalogClient()

        selectedTypes = self.am_getOption('TransformationTypes', [])
        if not selectedTypes:
            selectedTypes = Operations().getValue(
                'Transformations/DataProcessing', ['MCSimulation', 'Merge'])
        self.transformationTypes = selectedTypes

        locations = self.am_getOption('DirectoryLocations',
                                      ['TransformationDB', 'MetadataCatalog'])
        self.directoryLocations = sorted(locations)
        self.transfidmeta = self.am_getOption('TransfIDMeta', "TransformationID")
        self.enableFlag = True

    #############################################################################

    def initialize(self):
        """ Set the default shifter proxy and log the configuration the agent runs with.

        :return: S_OK()
        """
        # The default proxy is the one defined under /Operations/Shifter/DataManager;
        # the shifterProxy option in the Configuration can be used to change it.
        self.am_setOption('shifterProxy', 'DataManager')

        typesMessage = "Will treat the following transformation types: %s" % str(
            self.transformationTypes)
        gLogger.info(typesMessage)
        locationsMessage = "Will search for directories in the following locations: %s" % str(
            self.directoryLocations)
        gLogger.info(locationsMessage)
        metaTagMessage = "Will use %s as metadata tag name for TransformationID" % self.transfidmeta
        gLogger.info(metaTagMessage)
        return S_OK()

    #############################################################################

    def execute(self):
        """ The VerifyOutputData execution method.

        Unless disabled through the 'EnableFlag' option, first updates the transformations
        in 'WaitingIntegrity' status, then runs the integrity check on every transformation
        in 'ValidatingOutput' status (restricted to the configured transformation types)
        and finalizes those whose check succeeded.

        :return: S_OK(), or the failed result of the transformation query
        """
        self.enableFlag = self.am_getOption('EnableFlag', 'True')
        if not self.enableFlag == 'True':
            self.log.info(
                "VerifyOutputData is disabled by configuration option 'EnableFlag'"
            )
            return S_OK('Disabled via CS flag')

        gLogger.info("-" * 40)
        self.updateWaitingIntegrity()
        gLogger.info("-" * 40)

        res = self.transClient.getTransformations({
            'Status': 'ValidatingOutput',
            'Type': self.transformationTypes
        })
        if not res['OK']:
            gLogger.error("Failed to get ValidatingOutput transformations",
                          res['Message'])
            return res
        transDicts = res['Value']
        if not transDicts:
            gLogger.info("No transformations found in ValidatingOutput status")
            return S_OK()
        gLogger.info("Found %s transformations in ValidatingOutput status" %
                     len(transDicts))
        for transDict in transDicts:
            transID = transDict['TransformationID']
            res = self.checkTransformationIntegrity(int(transID))
            if not res['OK']:
                # %s rather than %d: the ID is not guaranteed to be an int here (it is
                # converted with int() above), and the failure reason is reported as well
                gLogger.error(
                    "Failed to perform full integrity check for transformation %s"
                    % transID, res['Message'])
            else:
                self.finalizeCheck(transID)
                gLogger.info("-" * 40)
        return S_OK()

    def updateWaitingIntegrity(self):
        """ Get 'WaitingIntegrity' transformations, update to 'ValidatedOutput'.

        A transformation is moved to 'ValidatedOutput' when the integrity client
        reports no problematic files for it; otherwise it is left untouched.

        :return: S_OK(), or the failed result of the transformation query
        """
        gLogger.info(
            "Looking for transformations in the WaitingIntegrity status to update"
        )
        res = self.transClient.getTransformations(
            {'Status': 'WaitingIntegrity'})
        if not res['OK']:
            gLogger.error("Failed to get WaitingIntegrity transformations",
                          res['Message'])
            return res
        transDicts = res['Value']
        if not transDicts:
            gLogger.info("No transformations found in WaitingIntegrity status")
            return S_OK()
        gLogger.info("Found %s transformations in WaitingIntegrity status" %
                     len(transDicts))
        for transDict in transDicts:
            transID = transDict['TransformationID']
            gLogger.info("-" * 40)
            res = self.integrityClient.getTransformationProblematics(
                int(transID))
            if not res['OK']:
                gLogger.error(
                    "Failed to determine waiting problematics for transformation",
                    res['Message'])
            elif not res['Value']:
                res = self.transClient.setTransformationParameter(
                    transID, 'Status', 'ValidatedOutput')
                if not res['OK']:
                    gLogger.error(
                        "Failed to update status of transformation %s to ValidatedOutput"
                        % (transID), res['Message'])
                else:
                    gLogger.info(
                        "Updated status of transformation %s to ValidatedOutput"
                        % (transID))
            else:
                gLogger.info(
                    "%d problematic files for transformation %s were found" %
                    (len(res['Value']), transID))
        # Always hand back a result structure, like the other exit points of this method
        return S_OK()

    #############################################################################
    #
    # Get the transformation directories for checking
    #

    def getTransformationDirectories(self, transID):
        """ Collect the output directories of the supplied transformation.

        Depending on the configured directory locations, the directories are taken from
        the 'OutputDirectories' transformation parameter and/or from a metadata query on
        the file catalog. Only directories accepted by _addDirs are kept.

        :param transID: transformation ID
        :return: S_OK(sorted list of directories), or the failed result of a query
        """
        found = []
        if 'TransformationDB' in self.directoryLocations:
            res = self.transClient.getTransformationParameters(
                transID, ['OutputDirectories'])
            if not res['OK']:
                gLogger.error("Failed to obtain transformation directories",
                              res['Message'])
                return res
            outputDirs = res['Value']
            # A value that is not already a list is parsed as a Python literal
            if not isinstance(outputDirs, list):
                outputDirs = ast.literal_eval(outputDirs)
            found = self._addDirs(transID, outputDirs, found)

        if 'MetadataCatalog' in self.directoryLocations:
            res = self.fileCatalogClient.findDirectoriesByMetadata(
                {self.transfidmeta: transID})
            if not res['OK']:
                gLogger.error("Failed to obtain metadata catalog directories",
                              res['Message'])
                return res
            found = self._addDirs(transID, res['Value'], found)

        if not found:
            gLogger.info("No output directories found")
        return S_OK(sorted(found))

    @staticmethod
    def _addDirs(transID, newDirs, existingDirs):
        for nDir in newDirs:
            transStr = str(transID).zfill(8)
            if re.search(transStr, nDir):
                if nDir not in existingDirs:
                    existingDirs.append(nDir)
        return existingDirs

    #############################################################################
    def checkTransformationIntegrity(self, transID):
        """ This method contains the real work.

        Looks up the output directories of the transformation, checks with the file
        catalog which of them exist and runs the Catalog->SE consistency check on every
        existing directory.

        :param transID: transformation ID
        :return: S_OK() when all checks could be performed (or there was nothing to check),
                 otherwise the failed result / S_ERROR of the step that failed
        """
        gLogger.info("-" * 40)
        gLogger.info("Checking the integrity of transformation %s" % transID)
        gLogger.info("-" * 40)

        res = self.getTransformationDirectories(transID)
        if not res['OK']:
            return res
        directories = res['Value']
        if not directories:
            return S_OK()

        ######################################################
        #
        # This check performs Catalog->SE for possible output directories
        #
        res = self.fc.exists(directories)
        if not res['OK']:
            gLogger.error('Failed to check directory existence',
                          res['Message'])
            return res
        failedDirectories = res['Value']['Failed']
        # 'Failed' maps directory -> error; iterate the pairs, not the bare keys
        for directory, error in failedDirectories.items():
            gLogger.error('Failed to determine existance of directory',
                          '%s %s' % (directory, error))
        if failedDirectories:
            return S_ERROR("Failed to determine the existance of directories")
        directoryExists = res['Value']['Successful']
        for directory in sorted(directoryExists):
            if not directoryExists[directory]:
                continue
            iRes = self.consistencyInspector.catalogDirectoryToSE(directory)
            if not iRes['OK']:
                gLogger.error(iRes['Message'])
                return iRes

        gLogger.info("-" * 40)
        gLogger.info("Completed integrity check for transformation %s" %
                     transID)
        return S_OK()

    def finalizeCheck(self, transID):
        """ Move the transformation to 'WaitingIntegrity' or 'ValidatedOutput'.

        'WaitingIntegrity' is chosen when the integrity client reports problematic files
        for the transformation, 'ValidatedOutput' when it reports none. When the
        problematics cannot be determined the status is left unchanged.

        :param transID: transformation ID
        :return: S_OK()
        """
        res = self.integrityClient.getTransformationProblematics(int(transID))

        if not res['OK']:
            gLogger.error(
                "Failed to determine whether there were associated problematic files",
                res['Message'])
            gLogger.info("-" * 40)
            return S_OK()

        problematics = res['Value']
        if problematics:
            gLogger.info(
                "%d problematic files for transformation %s were found" %
                (len(problematics), transID))
            newStatus = "WaitingIntegrity"
        else:
            gLogger.info("No problematics were found for transformation %s" %
                         transID)
            newStatus = "ValidatedOutput"

        update = self.transClient.setTransformationParameter(
            transID, 'Status', newStatus)
        if update['OK']:
            gLogger.info("Updated status of transformation %s to %s" %
                         (transID, newStatus))
        else:
            gLogger.error(
                "Failed to update status of transformation %s to %s" %
                (transID, newStatus))
        gLogger.info("-" * 40)
        return S_OK()
Пример #20
0
class MigrationMonitoringAgent(AgentModule):
    def initialize(self):
        """ Create the clients, read the agent options and register the monitoring activities.

        One global "Iteration" activity is registered, plus a fixed set of activities
        for every storage element listed in the 'StorageElements' option.

        :return: S_OK()
        """
        self.ReplicaManager = ReplicaManager()
        self.DataLog = DataLoggingClient()
        self.DataIntegrityClient = DataIntegrityClient()
        # The 'DirectDB' option selects the DB class instead of the client class
        if not self.am_getOption('DirectDB', False):
            from DIRAC.StorageManagementSystem.Client.MigrationMonitoringClient import MigrationMonitoringClient
            self.MigrationMonitoringDB = MigrationMonitoringClient()
        else:
            from DIRAC.StorageManagementSystem.DB.MigrationMonitoringDB import MigrationMonitoringDB
            self.MigrationMonitoringDB = MigrationMonitoringDB()

        # The default proxy is the one defined under /Operations/Shifter/DataManager;
        # the shifterProxy option in the Configuration can be used to change it.
        self.am_setOption('shifterProxy', 'DataManager')

        self.userName = '******'
        self.storageElements = self.am_getOption('StorageElements',
                                                 ['CERN-RAW'])
        self.lastMonitors = {}

        component = "MigrationMonitoringAgent"
        gMonitor.registerActivity("Iteration", "Agent Loops/min", component,
                                  "Loops", gMonitor.OP_SUM)
        if self.storageElements:
            # (name template, description, unit, operation), registered in this order per SE
            perSEActivities = (
                ("Iteration%s", "Agent Loops/min", "Loops", gMonitor.OP_SUM),
                ("MigratingFiles%s", "Files waiting for migration", "Files", gMonitor.OP_MEAN),
                ("MigratedFiles%s", "Newly migrated files", "Files", gMonitor.OP_SUM),
                ("TotalMigratedFiles%s", "Total migrated files", "Files", gMonitor.OP_ACUM),
                ("TotalMigratedSize%s", "Total migrated file size", "GB", gMonitor.OP_ACUM),
                ("ChecksumMatches%s", "Successfully migrated files", "Files", gMonitor.OP_SUM),
                ("TotalChecksumMatches%s", "Total successfully migrated files", "Files", gMonitor.OP_ACUM),
                ("ChecksumMismatches%s", "Erroneously migrated files", "Files", gMonitor.OP_SUM),
                ("TotalChecksumMismatches%s", "Total erroneously migrated files", "Files", gMonitor.OP_ACUM),
                ("MigrationTime%s", "Average migration time", "Seconds", gMonitor.OP_MEAN),
            )
            gLogger.info(
                "Agent will be initialised to monitor the following SEs:")
            for se in self.storageElements:
                gLogger.info(se)
                # Start from the epoch so that the SE counts as never monitored yet
                self.lastMonitors[se] = datetime.datetime.utcfromtimestamp(0.0)
                for nameTemplate, description, unit, operation in perSEActivities:
                    gMonitor.registerActivity(nameTemplate % se, description,
                                              component, unit, operation)
        return S_OK()

    def execute(self):
        """ Run one agent cycle: treat the 'New' files, then monitor every storage element.

        The cycle is skipped when the 'EnableFlag' option is not 'True'.

        :return: S_OK()
        """
        self.enableFlag = self.am_getOption('EnableFlag', 'True')
        enabled = self.enableFlag == 'True'
        if not enabled:
            disabledMessage = 'MigrationMonitoringAgent is disabled by configuration option %s/EnableFlag' % (
                self.section)
            self.log.info(disabledMessage)
            return S_OK('Disabled via CS flag')

        gMonitor.addMark("Iteration", 1)
        self.NewToMigrating()
        for storageElement in self.storageElements:
            gMonitor.addMark("Iteration%s" % storageElement, 1)
            self.MigratingToMigrated(storageElement)
        return S_OK()

    #########################################################################################################
    #
    # Includes the file size and checksum information for replicas which do not have it
    #

    def NewToMigrating(self):
        """ Obtain the 'New' files from the migration monitoring db and (where necessary)
        add the size and checksum information.

        :return: S_OK(), or the failed result of the file or metadata query
        """
        gLogger.info("NewToMigrating: Attempting to obtain 'New' files.")
        filesResult = self.__getFiles('', 'New')
        if not filesResult['OK']:
            gLogger.error("NewToMigrating: Failed to get 'New' files.",
                          filesResult['Message'])
            return filesResult
        newFiles = filesResult['Value']['Files']
        if not newFiles:
            gLogger.info("NewToMigrating: Found no 'New' files.")
            return S_OK()

        # Catalog metadata for the files which do not have size or checksum
        metadataResult = self.__getCatalogFileMetadata(newFiles)
        if not metadataResult['OK']:
            gLogger.error("NewToMigrating: Failed to get metadata for files",
                          metadataResult['Message'])
            return metadataResult

        # Hand the metadata over for the update of the migration monitoring DB;
        # the outcome of that call is not inspected
        self.__updateNewMigrating(metadataResult['Value'])
        return S_OK()

    def __updateNewMigrating(self, fileMetadata):
        """ Placeholder for the metadata update: it only logs and changes nothing.

        :param fileMetadata: metadata of the files, only its length is used
        :return: S_OK()
        """
        fileCount = len(fileMetadata)
        gLogger.info(
            "__updateNewMigrating: Updating metadata for %s files" % fileCount)
        gLogger.info("PUT THE CODE HERE TO UPDATE THE METDATA")

        # Not done yet:
        #self.__setMigratingReplicaStatus(fileMetadata.keys(),'Migrating')
        return S_OK()

    #########################################################################################################
    #
    # Monitors the migration of files
    #

    def MigratingToMigrated(self, se):
        """ Obtain the 'Migrating' files of a storage element from the migration
        monitoring db and check their status.

        Terminally failed files are passed to __reportProblematicReplicas and set to
        'Failed' in the migration monitoring db. For migrated files the checksums are
        validated and the accounting is updated.

        :param se: storage element to monitor
        :return: S_OK(), or the failed result of the file or metadata query
        """
        # First get the migrating files from the database
        gLogger.info(
            "[%s] MigratingToMigrated: Attempting to obtain 'Migrating' files."
            % se)
        res = self.__getFiles(se, 'Migrating')
        if not res['OK']:
            gLogger.error(
                "[%s] MigratingToMigrated: Failed to get 'Migrating' files." %
                se, res['Message'])
            return res
        pfnIDs = res['Value']['PFNIDs']
        if not pfnIDs:
            gLogger.info(
                "[%s] MigratingToMigrated: Found no 'Migrating' files." % se)
            return S_OK()
        migratingFiles = res['Value']['Files']
        gLogger.info("[%s] MigratingToMigrated: Found %d 'Migrating' files." %
                     (se, len(pfnIDs)))
        gMonitor.addMark("MigratingFiles%s" % se, len(pfnIDs))
        gLogger.info(
            "[%s] MigratingToMigrated: Obtaining physical file metadata for 'Migrating' files."
            % se)
        startTime = datetime.datetime.utcnow()
        res = self.__getMigratedFiles(se, list(pfnIDs))
        if not res['OK']:
            gLogger.error(
                "[%s] MigratingToMigrated: Failed to get 'Migrating' file metadata."
                % se, res['Message'])
            return res
        # Assumed that the files are found migrated midway through obtaining the metadata
        endTime = datetime.datetime.utcnow()
        assumedEndTime = endTime - (endTime - startTime) / 2
        previousMonitorTime = self.lastMonitors[se]
        self.lastMonitors[se] = datetime.datetime.utcnow()
        terminal = res['Value']['Terminal']
        migrated = res['Value']['Migrated']

        # Update the problematic files in the integrity DB and update the MigrationMonitoringDB
        gLogger.info(
            "[%s] MigratingToMigrated: Found %d terminally failed files." %
            (se, len(terminal)))
        if terminal:
            replicaTuples = []
            terminalFileIDs = []
            for pfn, prognosis in terminal.items():
                fileID = pfnIDs[pfn]
                terminalFileIDs.append(fileID)
                fileInfo = migratingFiles[fileID]
                # Separate name: the 'se' argument must stay intact for the code below
                replicaTuples.append(
                    (fileInfo['LFN'], pfn, fileInfo['SE'], prognosis))
            self.__reportProblematicReplicas(replicaTuples)
            res = self.MigrationMonitoringDB.setMigratingReplicaStatus(
                terminalFileIDs, 'Failed')
            if not res['OK']:
                gLogger.error(
                    "[%s] MigratingToMigrated: Failed to update terminal files."
                    % se, res['Message'])

        # Update the migrated files and send accounting
        gLogger.info("[%s] MigratingToMigrated: Found %d migrated files." %
                     (se, len(migrated)))
        if migrated:
            migratedFileIDs = {}
            for pfn, checksum in migrated.items():
                migratedFileIDs[pfnIDs[pfn]] = checksum
            # NOTE: the status of the migrated files is not set to 'Migrated' here
            #res = self.MigrationMonitoringDB.setMigratingReplicaStatus(migratedFileIDs.keys(),'Migrated')
            #if not res['OK']:
            #  gLogger.error("[%s] MigratingToMigrated: Failed to update migrated files." % se, res['Message'])
            # Check the checksums of the migrated files
            res = self.__validateChecksums(se, migratedFileIDs, migratingFiles)
            if not res['OK']:
                gLogger.error(
                    "[%s] MigratingToMigrated: Failed to perform checksum matching."
                    % se, res['Message'])
                matchingFiles = []
                mismatchingFiles = []
            else:
                matchingFiles = res['Value']['MatchingFiles']
                mismatchingFiles = res['Value']['MismatchFiles']
            # Create and send the accounting messages
            res = self.__updateMigrationAccounting(se, migratingFiles,
                                                   matchingFiles,
                                                   mismatchingFiles,
                                                   assumedEndTime,
                                                   previousMonitorTime)
            if not res['OK']:
                gLogger.error(
                    "[%s] MigratingToMigrated: Failed to send accounting for migrated files."
                    % se, res['Message'])
        return S_OK()

    def __getMigratedFiles(self, se, pfns):
        """ Get the storage metadata of the PFNs and sort them into migrated and terminal ones.

        A PFN is terminal when the storage reports that the file does not exist, or
        flags it as Lost or Unavailable. PFNs that are neither migrated nor terminal,
        or whose metadata could not be obtained for another reason, are left out.

        :param se: storage element holding the PFNs
        :param pfns: PFNs to check
        :return: S_OK({'Terminal': {pfn: prognosis}, 'Migrated': {pfn: checksum}}) where the
                 checksum is '' when the metadata has none, or the failed metadata result
        """
        res = self.ReplicaManager.getStorageFileMetadata(pfns, se)
        if not res['OK']:
            return res
        migrated = {}
        terminal = {}
        for pfn, error in res['Value']['Failed'].items():
            if re.search("File does not exist", error):
                gLogger.error(
                    "[%s] __getMigratedFiles: PFN does not exist at StorageElement."
                    % se, "%s %s" % (pfn, error))
                terminal[pfn] = 'PFNMissing'
            else:
                gLogger.warn(
                    "[%s] __getMigratedFiles: Failed to obtain physical file metadata."
                    % se, "%s %s" % (pfn, error))
        storageMetadata = res['Value']['Successful']
        for pfn, metadata in storageMetadata.items():
            if metadata['Migrated']:
                # dict.get instead of has_key, which no longer exists in Python 3
                migrated[pfn] = metadata.get('Checksum', '')
            elif metadata['Lost']:
                gLogger.error(
                    "[%s] __getMigratedFiles: PFN has been Lost by the StorageElement."
                    % se, "%s" % (pfn))
                terminal[pfn] = 'PFNLost'
            elif metadata['Unavailable']:
                gLogger.error(
                    "[%s] __getMigratedFiles: PFN declared Unavailable by StorageElement."
                    % se, "%s" % (pfn))
                terminal[pfn] = 'PFNUnavailable'
        resDict = {'Terminal': terminal, 'Migrated': migrated}
        return S_OK(resDict)

    def __validateChecksums(self, se, migratedFileIDs, migratingFiles):
        """Compare the storage checksum of each migrated file with its catalog checksum.

        Catalog checksums that are not yet present in ``migratingFiles`` are fetched
        from the catalog and written back into ``migratingFiles``. Every compared
        file gets a data-logging record, mismatches are reported as problematic
        replicas, and the replica status is set to 'ChecksumMatch' or 'ChecksumFail'
        in the migration monitoring DB.

        :param se: name of the storage element, used to tag the log messages
        :param migratedFileIDs: dictionary fileID -> checksum reported by the storage
        :param migratingFiles: dictionary fileID -> metadata dictionary; the 'LFN',
                               'PFN', 'SE' and 'Checksum' entries are read
        :return: the failed result of the catalog lookup, or S_OK wrapping
                 {'MatchingFiles': [fileID, ...], 'MismatchFiles': [fileID, ...]}.
                 Files without a storage checksum are logged and put in neither list.
        """
        lfnFileID = {}
        checksumToObtain = []
        for fileID in migratedFileIDs:
            if not migratingFiles[fileID]['Checksum']:
                lfn = migratingFiles[fileID]['LFN']
                checksumToObtain.append(lfn)
                lfnFileID[lfn] = fileID
        if checksumToObtain:
            res = self.ReplicaManager.getCatalogFileMetadata(checksumToObtain)
            if not res['OK']:
                gLogger.error(
                    "[%s] __validateChecksums: Failed to obtain file checksums"
                    % se)
                return res
            for lfn, error in res['Value']['Failed'].items():
                gLogger.error(
                    "[%s] __validateChecksums: Failed to get file checksum" %
                    se, "%s %s" % (lfn, error))
            for lfn, metadata in res['Value']['Successful'].items():
                migratingFiles[
                    lfnFileID[lfn]]['Checksum'] = metadata['CheckSumValue']
        mismatchFiles = []
        matchFiles = []
        checksumMismatches = []
        fileRecords = []
        for fileID, seChecksum in migratedFileIDs.items():
            lfn = migratingFiles[fileID]['LFN']
            pfn = migratingFiles[fileID]['PFN']
            catalogChecksum = migratingFiles[fileID]['Checksum']
            if not seChecksum:
                gLogger.error(
                    "[%s] __validateChecksums: Storage checksum not available"
                    % se, pfn)
            elif not compareAdler(seChecksum, catalogChecksum):
                gLogger.error(
                    "[%s] __validateChecksums: Storage and catalog checksum mismatch"
                    % se, "%s '%s' '%s'" % (pfn, seChecksum, catalogChecksum))
                mismatchFiles.append(fileID)
                # Use a separate name so the 'se' argument is not overwritten
                # for the log messages and records that follow.
                replicaSE = migratingFiles[fileID]['SE']
                checksumMismatches.append(
                    (lfn, pfn, replicaSE, 'CatalogPFNChecksumMismatch'))
                fileRecords.append(
                    (lfn, 'Checksum mismatch',
                     '%s@%s' % (seChecksum, replicaSE), '',
                     'MigrationMonitoringAgent'))
            else:
                fileRecords.append(
                    (lfn, 'Checksum match', '%s@%s' % (seChecksum, se), '',
                     'MigrationMonitoringAgent'))
                matchFiles.append(fileID)
        # Add the data logging records
        self.DataLog.addFileRecords(fileRecords)
        if checksumMismatches:
            # Report the mismatches to the integrity DB and flag them in the migration monitoring DB
            self.__reportProblematicReplicas(checksumMismatches)
            res = self.MigrationMonitoringDB.setMigratingReplicaStatus(
                mismatchFiles, 'ChecksumFail')
            if not res['OK']:
                gLogger.error(
                    "[%s] __validateChecksums: Failed to update checksum mismatching files."
                    % se, res['Message'])
        if matchFiles:
            res = self.MigrationMonitoringDB.setMigratingReplicaStatus(
                matchFiles, 'ChecksumMatch')
            if not res['OK']:
                gLogger.error(
                    "[%s] __validateChecksums: Failed to update checksum matching files."
                    % se, res['Message'])
        resDict = {'MatchingFiles': matchFiles, 'MismatchFiles': mismatchFiles}
        return S_OK(resDict)

    def __updateMigrationAccounting(self, se, migratingFiles, matchingFiles,
                                    mismatchingFiles, assumedEndTime,
                                    previousMonitorTime):
        """Publish monitoring marks and accounting records for newly migrated files.

        File sizes missing from ``migratingFiles`` are fetched from the catalog and
        written back into it (0 when the lookup fails for a file). For every
        migrated file three accounting records are registered ('MigrationTime',
        'MigrationThroughput' and 'MigrationSuccess'); the last one is flagged as
        failed for files with a checksum mismatch.

        :param se: name of the storage element
        :param migratingFiles: dictionary fileID -> metadata dictionary; the 'LFN',
                               'Size' and 'SubmitTime' entries are read
        :param matchingFiles: file IDs whose checksums matched
        :param mismatchingFiles: file IDs whose checksums did not match
        :param assumedEndTime: datetime taken as the end of the migration
        :param previousMonitorTime: datetime used as start time of the throughput record
        :return: the failed result of the size lookup, the result of the
                 accounting commit when at least one file migrated, S_OK() otherwise
        """
        allMigrated = matchingFiles + mismatchingFiles
        gMonitor.addMark("MigratedFiles%s" % se, len(allMigrated))
        gMonitor.addMark("TotalMigratedFiles%s" % se, len(allMigrated))
        lfnFileID = {}
        sizesToObtain = []
        for fileID in allMigrated:
            if not migratingFiles[fileID]['Size']:
                lfn = migratingFiles[fileID]['LFN']
                sizesToObtain.append(lfn)
                lfnFileID[lfn] = fileID
        if sizesToObtain:
            res = self.ReplicaManager.getCatalogFileSize(sizesToObtain)
            if not res['OK']:
                gLogger.error(
                    "[%s] __updateMigrationAccounting: Failed to obtain file sizes"
                    % se)
                return res
            for lfn, error in res['Value']['Failed'].items():
                gLogger.error(
                    "[%s] __updateMigrationAccounting: Failed to get file size"
                    % se, "%s %s" % (lfn, error))
                migratingFiles[lfnFileID[lfn]]['Size'] = 0
            for lfn, size in res['Value']['Successful'].items():
                migratingFiles[lfnFileID[lfn]]['Size'] = size
        totalSize = 0
        for fileID in allMigrated:
            size = migratingFiles[fileID]['Size']
            totalSize += size
            submitTime = migratingFiles[fileID]['SubmitTime']
            # Elapsed time runs from submission to the assumed end of the migration,
            # the same direction as the accounting records built below.
            timeDiff = assumedEndTime - submitTime
            migrationTime = (timeDiff.days * 86400) + (timeDiff.seconds) + (
                timeDiff.microseconds / 1000000.0)
            gMonitor.addMark("MigrationTime%s" % se, migrationTime)
            gDataStoreClient.addRegister(
                self.__initialiseAccountingObject('MigrationTime', se,
                                                  submitTime, assumedEndTime,
                                                  size))
            gDataStoreClient.addRegister(
                self.__initialiseAccountingObject('MigrationThroughput', se,
                                                  previousMonitorTime,
                                                  assumedEndTime, size))
            oDataOperation = self.__initialiseAccountingObject(
                'MigrationSuccess', se, submitTime, assumedEndTime, size)
            if fileID in mismatchingFiles:
                # A checksum mismatch counts as a failed transfer in the accounting
                oDataOperation.setValueByKey('TransferOK', 0)
                oDataOperation.setValueByKey('FinalStatus', 'Failed')
            gDataStoreClient.addRegister(oDataOperation)
        # NOTE(review): the 'TotalMigratedSize' activity is registered with unit
        # 'GB'; confirm that the sizes summed here are expressed in that unit.
        gMonitor.addMark("TotalMigratedSize%s" % se, totalSize)
        gMonitor.addMark("ChecksumMismatches%s" % se, len(mismatchingFiles))
        gMonitor.addMark("TotalChecksumMismatches%s" % se,
                         len(mismatchingFiles))
        gMonitor.addMark("ChecksumMatches%s" % se, len(matchingFiles))
        gMonitor.addMark("TotalChecksumMatches%s" % se, len(matchingFiles))
        if allMigrated:
            gLogger.info(
                '[%s] __updateMigrationAccounting: Attempting to send accounting message...'
                % se)
            return gDataStoreClient.commit()
        return S_OK()

    #########################################################################################################
    #
    # Utility methods used by all methods
    #

    def __getFiles(self, se, status):
        """Fetch the replicas recorded for a storage element in a given status.

        :param se: name of the storage element
        :param status: replica status to select
        :return: the failed result of the database call, or S_OK wrapping
                 {'PFNIDs': {pfn: fileID}, 'Files': <value returned by the database>}
        """
        result = self.MigrationMonitoringDB.getMigratingReplicas(se, status)
        if not result['OK']:
            return result
        replicaInfo = result['Value']
        # Reverse index: look up the file ID from the PFN
        idByPfn = {}
        for replicaID, info in replicaInfo.items():
            idByPfn[info['PFN']] = replicaID
        return S_OK({'PFNIDs': idByPfn, 'Files': replicaInfo})

    def __getCatalogFileMetadata(self, files):
        """Fetch size and checksum from the catalog for files lacking either of them.

        Files the catalog reports as non-existent are reported to the integrity
        DB as 'LFNCatalogMissing' and their replica status is set to 'Failed'.

        :param files: dictionary fileID -> metadata dictionary; the 'LFN', 'Size'
                      and 'Checksum' entries are read
        :return: the failed result of the catalog lookup, or S_OK wrapping a
                 dictionary fileID -> {'Size': ..., 'Checksum': ...} for the files
                 whose metadata was obtained (empty when nothing had to be fetched)
        """
        lfnFileID = {}
        metadataToObtain = []
        for fileID, metadata in files.items():
            if not (metadata['Size'] and metadata['Checksum']):
                lfn = metadata['LFN']
                metadataToObtain.append(lfn)
                lfnFileID[lfn] = fileID
        if not metadataToObtain:
            # Return an empty mapping so the value has the same type on every success path
            return S_OK({})
        res = self.ReplicaManager.getCatalogFileMetadata(metadataToObtain)
        if not res['OK']:
            gLogger.error(
                "__getCatalogFileMetadata: Failed to obtain file metadata",
                res['Message'])
            return res
        successful = res['Value']['Successful']
        failed = res['Value']['Failed']
        terminalIDs = []
        problematicFiles = []
        for lfn, error in failed.items():
            gLogger.error(
                "__getCatalogFileMetadata: Failed to get file metadata",
                "%s %s" % (lfn, error))
            # Only a file missing from the catalog is terminal; other errors are just logged
            if re.search("No such file or directory", error):
                problematicFiles.append(lfn)
                terminalIDs.append(lfnFileID[lfn])
        if terminalIDs:
            self.__reportProblematicFiles(problematicFiles,
                                          'LFNCatalogMissing')
            self.__setMigratingReplicaStatus(terminalIDs, 'Failed')
        fileMetadata = {}
        for lfn, metadata in successful.items():
            fileMetadata[lfnFileID[lfn]] = {
                'Size': metadata['Size'],
                'Checksum': metadata['CheckSumValue']
            }
        return S_OK(fileMetadata)

    def __setMigratingReplicaStatus(self, fileIDs, status):
        """Set the status of the given replicas in the migration monitoring DB.

        A failed update is logged as an error and does not raise.

        :param fileIDs: IDs of the replicas to update
        :param status: status string to assign
        :return: the result structure returned by the database call
        """
        gLogger.info(
            "__setMigratingReplicaStatus: Attempting to update %s files to '%s'"
            % (len(fileIDs), status))
        res = self.MigrationMonitoringDB.setMigratingReplicaStatus(
            fileIDs, status)
        if not res['OK']:
            # Failures are errors, as for the other database updates in this agent
            gLogger.error(
                "__setMigratingReplicaStatus: Failed to update status of files",
                res['Message'])
        else:
            gLogger.info(
                "__setMigratingReplicaStatus: Successfully updated status of files"
            )
        return res

    def __reportProblematicFiles(self, lfns, reason):
        """Report LFNs as problematic to the data integrity service.

        The LFNs are listed in the log first. A failed report is logged as an
        error and does not raise.

        :param lfns: LFNs to report
        :param reason: problem label passed on with the files
        """
        gLogger.info(
            '__reportProblematicFiles: The following %s files were found with %s'
            % (len(lfns), reason))
        for lfn in sortList(lfns):
            gLogger.info(lfn)
        res = self.DataIntegrityClient.setFileProblematic(
            lfns, reason, sourceComponent='MigrationMonitoringAgent')
        if not res['OK']:
            # Failures are errors, as for the other failed service calls in this agent
            gLogger.error(
                '__reportProblematicFiles: Failed to update integrity DB with files',
                res['Message'])
        else:
            gLogger.info(
                '__reportProblematicFiles: Successfully updated integrity DB with files'
            )

    def __reportProblematicReplicas(self, replicaTuples):
        """Report replicas as problematic to the data integrity service.

        Each replica is listed in the log by its LFN, or by its PFN when the LFN
        is empty. A failed report is logged as an error and does not raise.

        :param replicaTuples: list of (lfn, pfn, se, reason) tuples
        """
        gLogger.info(
            '__reportProblematicReplicas: The following %s files being reported to integrity DB:'
            % (len(replicaTuples)))
        for lfn, pfn, se, reason in sortList(replicaTuples):
            if lfn:
                gLogger.info(lfn)
            else:
                gLogger.info(pfn)
        res = self.DataIntegrityClient.setReplicaProblematic(
            replicaTuples, sourceComponent='MigrationMonitoringAgent')
        if not res['OK']:
            # Failures are errors, as for the other failed service calls in this agent
            gLogger.error(
                '__reportProblematicReplicas: Failed to update integrity DB with replicas',
                res['Message'])
        else:
            gLogger.info(
                '__reportProblematicReplicas: Successfully updated integrity DB with replicas'
            )

    def __initialiseAccountingObject(self, operation, se, startTime, endTime,
                                     size):
        """Build a DataOperation accounting record for one transferred file.

        The record describes a single successful transfer from the local site
        to ``se``; callers may overwrite individual values afterwards.

        :param operation: value stored as 'OperationType'
        :param se: storage element stored as 'Destination'
        :param startTime: datetime marking the start of the operation
        :param endTime: datetime marking the end of the operation
        :param size: value stored as 'TransferSize'
        :return: the populated DataOperation object
        """
        owner = self.userName
        elapsed = endTime - startTime
        # Duration in seconds, including the fractional part
        elapsedSeconds = (elapsed.days * 86400) + (elapsed.seconds) + (
            elapsed.microseconds / 1000000.0)
        values = {
            'OperationType': operation,
            'User': owner,
            'Protocol': 'SRM',
            'RegistrationTime': 0.0,
            'RegistrationOK': 0,
            'RegistrationTotal': 0,
            'TransferTotal': 1,
            'TransferOK': 1,
            'TransferSize': size,
            'TransferTime': elapsedSeconds,
            'FinalStatus': 'Successful',
            'Source': siteName(),
            'Destination': se,
        }
        record = DataOperation()
        record.setEndTime(endTime)
        record.setStartTime(startTime)
        record.setValuesFromDict(values)
        return record
Пример #21
0
    def initialize(self):
        """Create the clients, read the agent options and register the monitoring activities.

        :return: S_OK()
        """
        self.ReplicaManager = ReplicaManager()
        self.DataLog = DataLoggingClient()
        self.DataIntegrityClient = DataIntegrityClient()
        # The 'DirectDB' option selects the database class; otherwise the client class is used
        directAccess = self.am_getOption('DirectDB', False)
        if not directAccess:
            from DIRAC.StorageManagementSystem.Client.MigrationMonitoringClient import MigrationMonitoringClient
            self.MigrationMonitoringDB = MigrationMonitoringClient()
        else:
            from DIRAC.StorageManagementSystem.DB.MigrationMonitoringDB import MigrationMonitoringDB
            self.MigrationMonitoringDB = MigrationMonitoringDB()

        # Default to the proxy defined under /Operations/Shifter/DataManager;
        # the shifterProxy option in the Configuration can be used to change this default.
        self.am_setOption('shifterProxy', 'DataManager')

        self.userName = '******'
        self.storageElements = self.am_getOption('StorageElements',
                                                 ['CERN-RAW'])
        self.lastMonitors = {}

        component = "MigrationMonitoringAgent"
        gMonitor.registerActivity("Iteration", "Agent Loops/min", component,
                                  "Loops", gMonitor.OP_SUM)
        if not self.storageElements:
            return S_OK()

        gLogger.info("Agent will be initialised to monitor the following SEs:")
        # (name template, description, unit, operation) of the activities registered per SE
        perSEActivities = (
            ("Iteration%s", "Agent Loops/min", "Loops", gMonitor.OP_SUM),
            ("MigratingFiles%s", "Files waiting for migration", "Files",
             gMonitor.OP_MEAN),
            ("MigratedFiles%s", "Newly migrated files", "Files",
             gMonitor.OP_SUM),
            ("TotalMigratedFiles%s", "Total migrated files", "Files",
             gMonitor.OP_ACUM),
            ("TotalMigratedSize%s", "Total migrated file size", "GB",
             gMonitor.OP_ACUM),
            ("ChecksumMatches%s", "Successfully migrated files", "Files",
             gMonitor.OP_SUM),
            ("TotalChecksumMatches%s", "Total successfully migrated files",
             "Files", gMonitor.OP_ACUM),
            ("ChecksumMismatches%s", "Erroneously migrated files", "Files",
             gMonitor.OP_SUM),
            ("TotalChecksumMismatches%s", "Total erroneously migrated files",
             "Files", gMonitor.OP_ACUM),
            ("MigrationTime%s", "Average migration time", "Seconds",
             gMonitor.OP_MEAN),
        )
        for se in self.storageElements:
            gLogger.info(se)
            # Start every SE at the epoch as its last monitoring time
            self.lastMonitors[se] = datetime.datetime.utcfromtimestamp(0.0)
            for nameTemplate, description, unit, operation in perSEActivities:
                gMonitor.registerActivity(nameTemplate % se, description,
                                          component, unit, operation)
        return S_OK()
Пример #22
0
class RequestPreparationAgent(AgentModule):
    """Agent managing the New->Waiting transition of the cache replicas.

    Each cycle takes the replicas in 'New' status, checks them against the
    FileCatalog (existence, size, registered replicas) and then either marks
    them as failed or updates their metadata through the StorageManagerClient.
    """

    def initialize(self):
        """Create the clients used by the agent and select the shifter proxy.

        :return: S_OK()
        """
        self.fileCatalog = FileCatalog()
        self.dm = DataManager()
        self.stagerClient = StorageManagerClient()
        self.dataIntegrityClient = DataIntegrityClient()
        # This sets the Default Proxy to used as that defined under
        # /Operations/Shifter/DataManager
        # the shifterProxy option in the Configuration can be used to change this default.
        self.am_setOption("shifterProxy", "DataManager")

        return S_OK()

    def execute(self):
        """This is the first logical task to be executed and manages the New->Waiting transition of the Replicas

        :return: the failed result of the step that could not be performed, the
                 (empty) result of the replica query when there is nothing to
                 do, or S_OK() once the cycle has been processed
        """
        res = self.__getNewReplicas()
        if not res["OK"]:
            gLogger.fatal(
                "RequestPreparation.prepareNewReplicas: Failed to get replicas from StagerDB.", res["Message"]
            )
            return res
        if not res["Value"]:
            gLogger.info("There were no New replicas found")
            return res
        replicas = res["Value"]["Replicas"]
        replicaIDs = res["Value"]["ReplicaIDs"]
        gLogger.info(
            "RequestPreparation.prepareNewReplicas: Obtained %s New replicas for preparation." % len(replicaIDs)
        )

        # Check if the files exist in the FileCatalog
        res = self.__getExistingFiles(replicas)
        if not res["OK"]:
            return res
        exist = res["Value"]["Exist"]
        terminal = res["Value"]["Missing"]
        # Per-LFN lookup failures are accumulated here; nothing below consumes them
        failed = res["Value"]["Failed"]
        if not exist:
            gLogger.error("RequestPreparation.prepareNewReplicas: Failed to determine the existence of any file")
            return S_OK()
        # replicaID -> reason, for every replica that cannot be prepared
        terminalReplicaIDs = {}
        self.__flagTerminalReplicas(replicas, terminal, terminalReplicaIDs)
        gLogger.info("RequestPreparation.prepareNewReplicas: %s files exist in the FileCatalog." % len(exist))
        if terminal:
            gLogger.info(
                "RequestPreparation.prepareNewReplicas: %s files do not exist in the FileCatalog." % len(terminal)
            )

        # Obtain the file sizes from the FileCatalog
        res = self.__getFileSize(exist)
        if not res["OK"]:
            return res
        failed.update(res["Value"]["Failed"])
        terminal = res["Value"]["ZeroSize"]
        fileSizes = res["Value"]["FileSizes"]
        if not fileSizes:
            gLogger.error("RequestPreparation.prepareNewReplicas: Failed determine sizes of any files")
            return S_OK()
        self.__flagTerminalReplicas(replicas, terminal, terminalReplicaIDs)
        gLogger.info(
            "RequestPreparation.prepareNewReplicas: Obtained %s file sizes from the FileCatalog." % len(fileSizes)
        )
        if terminal:
            gLogger.info(
                "RequestPreparation.prepareNewReplicas: %s files registered with zero size in the FileCatalog."
                % len(terminal)
            )

        # Obtain the replicas from the FileCatalog
        res = self.__getFileReplicas(list(fileSizes))
        if not res["OK"]:
            return res
        failed.update(res["Value"]["Failed"])
        terminal = res["Value"]["ZeroReplicas"]
        fileReplicas = res["Value"]["Replicas"]
        if not fileReplicas:
            gLogger.error("RequestPreparation.prepareNewReplicas: Failed determine replicas for any files")
            return S_OK()
        self.__flagTerminalReplicas(replicas, terminal, terminalReplicaIDs)
        gLogger.info(
            "RequestPreparation.prepareNewReplicas: Obtained replica information for %s file from the FileCatalog."
            % len(fileReplicas)
        )
        if terminal:
            gLogger.info(
                "RequestPreparation.prepareNewReplicas: %s files registered with zero replicas in the FileCatalog."
                % len(terminal)
            )

        # Check the replicas exist at the requested site
        replicaMetadata = []
        for lfn, requestedSEs in replicas.items():
            lfnReplicas = fileReplicas.get(lfn)

            # This should not happen in principle, but it was seen
            # after a corrupted staging request has entered the DB
            if not lfnReplicas:
                gLogger.error("Missing replicas information", "%s %s" % (lfn, requestedSEs))
                continue

            # Iterate over a snapshot: entries are removed from requestedSEs inside
            # the loop, and mutating a dict while iterating it raises RuntimeError.
            for requestedSE, replicaID in list(requestedSEs.items()):
                if requestedSE not in lfnReplicas:
                    terminalReplicaIDs[replicaID] = "LFN not registered at requested SE"
                    requestedSEs.pop(requestedSE)
                else:
                    replicaMetadata.append((replicaID, lfnReplicas[requestedSE], fileSizes[lfn]))

        # Update the states of the files in the database
        if terminalReplicaIDs:
            gLogger.info(
                "RequestPreparation.prepareNewReplicas: %s replicas are terminally failed." % len(terminalReplicaIDs)
            )
            res = self.stagerClient.updateReplicaFailure(terminalReplicaIDs)
            if not res["OK"]:
                gLogger.error(
                    "RequestPreparation.prepareNewReplicas: Failed to update replica failures.", res["Message"]
                )
        if replicaMetadata:
            gLogger.info(
                "RequestPreparation.prepareNewReplicas: %s replica metadata to be updated." % len(replicaMetadata)
            )
            # Sets the Status='Waiting' of CacheReplicas records that are OK with catalogue checks
            res = self.stagerClient.updateReplicaInformation(replicaMetadata)
            if not res["OK"]:
                gLogger.error(
                    "RequestPreparation.prepareNewReplicas: Failed to update replica metadata.", res["Message"]
                )
        return S_OK()

    def __flagTerminalReplicas(self, replicas, terminal, terminalReplicaIDs):
        """Record the failure reason of every replica of the terminal LFNs and drop those LFNs from ``replicas``.

        :param replicas: dictionary lfn -> {storage element: replicaID}, modified in place
        :param terminal: dictionary lfn -> reason
        :param terminalReplicaIDs: dictionary replicaID -> reason, filled in place
        """
        for lfn, reason in terminal.items():
            for replicaID in replicas[lfn].values():
                terminalReplicaIDs[replicaID] = reason
            replicas.pop(lfn)

    def __getNewReplicas(self):
        """This obtains the New replicas from the Replicas table and for each LFN the requested storage element

        :return: the failed result of the query, S_OK() when no New replica is
                 found, otherwise S_OK wrapping {'Replicas': {lfn: {se: replicaID}},
                 'ReplicaIDs': {replicaID: (lfn, se)}}
        """
        # First obtain the New replicas from the CacheReplicas table
        res = self.stagerClient.getCacheReplicas({"Status": "New"})
        if not res["OK"]:
            gLogger.error(
                "RequestPreparation.__getNewReplicas: Failed to get replicas with New status.", res["Message"]
            )
            return res
        if not res["Value"]:
            gLogger.debug("RequestPreparation.__getNewReplicas: No New replicas found to process.")
            return S_OK()
        gLogger.debug(
            "RequestPreparation.__getNewReplicas: Obtained %s New replicas(s) to process." % len(res["Value"])
        )
        replicas = {}
        replicaIDs = {}
        for replicaID, info in res["Value"].items():
            lfn = info["LFN"]
            storageElement = info["SE"]
            replicas.setdefault(lfn, {})[storageElement] = replicaID
            replicaIDs[replicaID] = (lfn, storageElement)
        return S_OK({"Replicas": replicas, "ReplicaIDs": replicaIDs})

    def __getExistingFiles(self, lfns):
        """This checks that the files exist in the FileCatalog.

        :param lfns: iterable of LFNs to check
        :return: the failed result of the catalog call, or S_OK wrapping
                 {'Exist': [lfn, ...], 'Missing': {lfn: reason}, 'Failed': <per-LFN failures>}
        """
        res = self.fileCatalog.exists(list(set(lfns)))
        if not res["OK"]:
            gLogger.error(
                "RequestPreparation.__getExistingFiles: Failed to determine whether files exist.", res["Message"]
            )
            return res
        failed = res["Value"]["Failed"]
        success = res["Value"]["Successful"]
        exist = [lfn for lfn, exists in success.items() if exists]
        missing = list(set(success) - set(exist))
        if missing:
            reason = "LFN not registered in the FC"
            gLogger.warn("RequestPreparation.__getExistingFiles: %s" % reason, "\n".join([""] + missing))
            self.__reportProblematicFiles(missing, "LFN-LFC-DoesntExist")
            missing = dict.fromkeys(missing, reason)
        else:
            missing = {}
        return S_OK({"Exist": exist, "Missing": missing, "Failed": failed})

    def __getFileSize(self, lfns):
        """This obtains the file size from the FileCatalog.

        :param lfns: LFNs to look up
        :return: the failed result of the catalog call, or S_OK wrapping
                 {'FileSizes': {lfn: size}, 'ZeroSize': {lfn: reason}, 'Failed': <per-LFN failures>}
        """
        fileSizes = {}
        zeroSize = {}
        res = self.fileCatalog.getFileSize(lfns)
        if not res["OK"]:
            gLogger.error("RequestPreparation.__getFileSize: Failed to get sizes for files.", res["Message"])
            return res
        failed = res["Value"]["Failed"]
        for lfn, size in res["Value"]["Successful"].items():
            if size == 0:
                zeroSize[lfn] = "LFN registered with zero size in the FileCatalog"
            else:
                fileSizes[lfn] = size
        if zeroSize:
            for lfn, reason in zeroSize.items():
                gLogger.warn("RequestPreparation.__getFileSize: %s" % reason, lfn)
            # Pass a real list (not a dict view), as the other callers do
            self.__reportProblematicFiles(list(zeroSize), "LFN-LFC-ZeroSize")
        return S_OK({"FileSizes": fileSizes, "ZeroSize": zeroSize, "Failed": failed})

    def __getFileReplicas(self, lfns):
        """This obtains the replicas from the FileCatalog.

        :param lfns: LFNs to look up
        :return: the failed result of the lookup, or S_OK wrapping
                 {'Replicas': {lfn: replicas}, 'ZeroReplicas': {lfn: reason}, 'Failed': <per-LFN failures>}
        """
        replicas = {}
        noReplicas = {}
        res = self.dm.getActiveReplicas(lfns)
        if not res["OK"]:
            gLogger.error("RequestPreparation.__getFileReplicas: Failed to obtain file replicas.", res["Message"])
            return res
        failed = res["Value"]["Failed"]
        for lfn, lfnReplicas in res["Value"]["Successful"].items():
            if len(lfnReplicas) == 0:
                noReplicas[lfn] = "LFN registered with zero replicas in the FileCatalog"
            else:
                replicas[lfn] = lfnReplicas
        if noReplicas:
            for lfn, reason in noReplicas.items():
                gLogger.warn("RequestPreparation.__getFileReplicas: %s" % reason, lfn)
            self.__reportProblematicFiles(list(noReplicas), "LFN-LFC-NoReplicas")
        return S_OK({"Replicas": replicas, "ZeroReplicas": noReplicas, "Failed": failed})

    def __reportProblematicFiles(self, lfns, reason):
        """Report problematic LFNs to the data integrity service (currently disabled).

        :param lfns: LFNs to report
        :param reason: problem label passed on with the files
        :return: S_OK() while reporting is disabled
        """
        # NOTE(review): reporting is switched off by this early return, so the code
        # below never runs. Presumably it is kept so that reporting can be
        # re-enabled later; confirm, or delete the dead code.
        return S_OK()
        res = self.dataIntegrityClient.setFileProblematic(lfns, reason, sourceComponent="RequestPreparationAgent")
        if not res["OK"]:
            gLogger.error(
                "RequestPreparation.__reportProblematicFiles: Failed to report missing files.", res["Message"]
            )
            return res
        if res["Value"]["Successful"]:
            gLogger.info(
                "RequestPreparation.__reportProblematicFiles: Successfully reported %s missing files."
                % len(res["Value"]["Successful"])
            )
        if res["Value"]["Failed"]:
            gLogger.info(
                "RequestPreparation.__reportProblematicFiles: Failed to report %s problematic files."
                % len(res["Value"]["Failed"])
            )
        return res
class MigrationMonitoringAgent( AgentModule ):
  """ Agent following the replicas recorded in the migration monitoring DB.

      'New' replicas get their catalog metadata looked up; 'Migrating' replicas are
      checked at each configured storage element, their storage checksum is compared
      with the catalog one and monitoring marks / accounting records are produced.
  """

  def initialize( self ):
    """ Create the clients, read the agent options and register the monitoring activities.

        :return: S_OK()
    """
    self.ReplicaManager = ReplicaManager()
    self.DataLog = DataLoggingClient()
    self.DataIntegrityClient = DataIntegrityClient()
    # Either talk to the DB directly or go through the service client
    if self.am_getOption( 'DirectDB', False ):
      from DIRAC.StorageManagementSystem.DB.MigrationMonitoringDB import MigrationMonitoringDB
      self.MigrationMonitoringDB = MigrationMonitoringDB()
    else:
      from DIRAC.StorageManagementSystem.Client.MigrationMonitoringClient import MigrationMonitoringClient
      self.MigrationMonitoringDB = MigrationMonitoringClient()

    # This sets the Default Proxy to used as that defined under
    # /Operations/Shifter/DataManager
    # the shifterProxy option in the Configuration can be used to change this default.
    self.am_setOption( 'shifterProxy', 'DataManager' )

    self.userName = '******'
    self.storageElements = self.am_getOption( 'StorageElements', ['CERN-RAW'] )
    # Per-SE time of the previous monitoring pass, used as start of the throughput window
    self.lastMonitors = {}

    gMonitor.registerActivity( "Iteration", "Agent Loops/min", "MigrationMonitoringAgent", "Loops", gMonitor.OP_SUM )
    if self.storageElements:
      gLogger.info( "Agent will be initialised to monitor the following SEs:" )
      for se in self.storageElements:
        gLogger.info( se )
        self.lastMonitors[se] = datetime.datetime.utcfromtimestamp( 0.0 )
        gMonitor.registerActivity( "Iteration%s" % se, "Agent Loops/min", "MigrationMonitoringAgent", "Loops", gMonitor.OP_SUM )
        gMonitor.registerActivity( "MigratingFiles%s" % se, "Files waiting for migration", "MigrationMonitoringAgent", "Files", gMonitor.OP_MEAN )
        gMonitor.registerActivity( "MigratedFiles%s" % se, "Newly migrated files", "MigrationMonitoringAgent", "Files", gMonitor.OP_SUM )
        gMonitor.registerActivity( "TotalMigratedFiles%s" % se, "Total migrated files", "MigrationMonitoringAgent", "Files", gMonitor.OP_ACUM )
        gMonitor.registerActivity( "TotalMigratedSize%s" % se, "Total migrated file size", "MigrationMonitoringAgent", "GB", gMonitor.OP_ACUM )
        gMonitor.registerActivity( "ChecksumMatches%s" % se, "Successfully migrated files", "MigrationMonitoringAgent", "Files", gMonitor.OP_SUM )
        gMonitor.registerActivity( "TotalChecksumMatches%s" % se, "Total successfully migrated files", "MigrationMonitoringAgent", "Files", gMonitor.OP_ACUM )
        gMonitor.registerActivity( "ChecksumMismatches%s" % se, "Erroneously migrated files", "MigrationMonitoringAgent", "Files", gMonitor.OP_SUM )
        gMonitor.registerActivity( "TotalChecksumMismatches%s" % se, "Total erroneously migrated files", "MigrationMonitoringAgent", "Files", gMonitor.OP_ACUM )
        gMonitor.registerActivity( "MigrationTime%s" % se, "Average migration time", "MigrationMonitoringAgent", "Seconds", gMonitor.OP_MEAN )
    return S_OK()

  def execute( self ):
    """ Run one agent cycle: treat the 'New' files, then monitor every configured SE.

        :return: S_OK(); the value is 'Disabled via CS flag' when the EnableFlag option is not 'True'
    """
    self.enableFlag = self.am_getOption( 'EnableFlag', 'True' )
    if not self.enableFlag == 'True':
      self.log.info( 'MigrationMonitoringAgent is disabled by configuration option %s/EnableFlag' % ( self.section ) )
      return S_OK( 'Disabled via CS flag' )
    gMonitor.addMark( "Iteration", 1 )
    self.NewToMigrating()
    for se in self.storageElements:
      gMonitor.addMark( "Iteration%s" % se, 1 )
      self.MigratingToMigrated( se )
    return S_OK()

  #########################################################################################################
  #
  # Includes the file size and checksum information for replicas which do not have it
  #

  def NewToMigrating( self ):
    """ Obtain the new files from the migration monitoring db and (where necessary) add the size and checksum information

        :return: the failing intermediate result, S_OK() when there are no 'New' files,
                 otherwise the result of the metadata update
    """
    # First get the new files from the database
    gLogger.info( "NewToMigrating: Attempting to obtain 'New' files." )
    res = self.__getFiles( '', 'New' )
    if not res['OK']:
      gLogger.error( "NewToMigrating: Failed to get 'New' files.", res['Message'] )
      return res
    newFiles = res['Value']['Files']
    if not newFiles:
      gLogger.info( "NewToMigrating: Found no 'New' files." )
      return S_OK()
    # Get the metadata from the catalog for which do not have size or checksum
    res = self.__getCatalogFileMetadata( newFiles )
    if not res['OK']:
      gLogger.error( "NewToMigrating: Failed to get metadata for files", res['Message'] )
      return res
    metadata = res['Value']
    # Add the metadata to the migration monitoring DB.
    return self.__updateNewMigrating( metadata )

  def __updateNewMigrating( self, fileMetadata ):
    """ Placeholder for storing the looked-up metadata; currently it only logs.

        :param fileMetadata: dict keyed by file ID, as built by __getCatalogFileMetadata
        :return: S_OK()
    """
    gLogger.info( "__updateNewMigrating: Updating metadata for %s files" % len( fileMetadata ) )
    gLogger.info( "PUT THE CODE HERE TO UPDATE THE METDATA" )

    #self.__setMigratingReplicaStatus(fileMetadata.keys(),'Migrating')
    return S_OK()

  #########################################################################################################
  #
  # Monitors the migration of files
  #

  def MigratingToMigrated( self, se ):
    """ Obtain the active files from the migration monitoring db and check their status

        Terminally failed replicas are reported to the integrity DB and set to 'Failed';
        migrated replicas have their checksums validated and accounting sent.

        :param se: name of the storage element being monitored
        :return: the failing lookup result, otherwise S_OK()
    """
    # First get the migrating files from the database
    gLogger.info( "[%s] MigratingToMigrated: Attempting to obtain 'Migrating' files." % se )
    res = self.__getFiles( se, 'Migrating' )
    if not res['OK']:
      gLogger.error( "[%s] MigratingToMigrated: Failed to get 'Migrating' files." % se, res['Message'] )
      return res
    pfnIDs = res['Value']['PFNIDs']
    if not pfnIDs:
      gLogger.info( "[%s] MigratingToMigrated: Found no 'Migrating' files." % se )
      return S_OK()
    migratingFiles = res['Value']['Files']
    gLogger.info( "[%s] MigratingToMigrated: Found %d 'Migrating' files." % ( se, len( pfnIDs ) ) )
    gMonitor.addMark( "MigratingFiles%s" % se, len( pfnIDs ) )
    gLogger.info( "[%s] MigratingToMigrated: Obtaining physical file metadata for 'Migrating' files." % se )
    startTime = datetime.datetime.utcnow()
    res = self.__getMigratedFiles( se, list( pfnIDs ) )
    if not res['OK']:
      gLogger.error( "[%s] MigratingToMigrated: Failed to get 'Migrating' file metadata." % se, res['Message'] )
      return res
    # Assumed that the files are found migrated midway through obtaining the metadata
    now = datetime.datetime.utcnow()
    assumedEndTime = now - ( ( now - startTime ) / 2 )
    previousMonitorTime = self.lastMonitors[se]
    self.lastMonitors[se] = datetime.datetime.utcnow()
    terminal = res['Value']['Terminal']
    migrated = res['Value']['Migrated']

    # Update the problematic files in the integrity DB and update the MigrationMonitoringDB
    gLogger.info( "[%s] MigratingToMigrated: Found %d terminally failed files." % ( se, len( terminal ) ) )
    if terminal:
      replicaTuples = []
      terminalFileIDs = []
      for pfn, prognosis in terminal.items():
        fileID = pfnIDs[pfn]
        terminalFileIDs.append( fileID )
        lfn = migratingFiles[fileID]['LFN']
        # Use a separate name so the 'se' argument keeps labelling the logs and marks below
        replicaSE = migratingFiles[fileID]['SE']
        replicaTuples.append( ( lfn, pfn, replicaSE, prognosis ) )
      self.__reportProblematicReplicas( replicaTuples )
      res = self.MigrationMonitoringDB.setMigratingReplicaStatus( terminalFileIDs, 'Failed' )
      if not res['OK']:
        gLogger.error( "[%s] MigratingToMigrated: Failed to update terminal files." % se, res['Message'] )

    # Update the migrated files and send accounting
    gLogger.info( "[%s] MigratingToMigrated: Found %d migrated files." % ( se, len( migrated ) ) )
    if migrated:
      migratedFileIDs = {}
      for pfn, checksum in migrated.items():
        migratedFileIDs[pfnIDs[pfn]] = checksum
      #res = self.MigrationMonitoringDB.setMigratingReplicaStatus(migratedFileIDs.keys(),'Migrated')
      #if not res['OK']:
      #  gLogger.error("[%s] MigratingToMigrated: Failed to update migrated files." % se, res['Message'])
      # Check the checksums of the migrated files
      res = self.__validateChecksums( se, migratedFileIDs, migratingFiles )
      if not res['OK']:
        gLogger.error( "[%s] MigratingToMigrated: Failed to perform checksum matching." % se, res['Message'] )
        matchingFiles = []
        mismatchingFiles = []
      else:
        matchingFiles = res['Value']['MatchingFiles']
        mismatchingFiles = res['Value']['MismatchFiles']
      # Create and send the accounting messages
      res = self.__updateMigrationAccounting( se, migratingFiles, matchingFiles, mismatchingFiles, assumedEndTime, previousMonitorTime )
      if not res['OK']:
        gLogger.error( "[%s] MigratingToMigrated: Failed to send accounting for migrated files." % se, res['Message'] )
    return S_OK()

  def __getMigratedFiles( self, se, pfns ):
    """ Query the storage element for the physical metadata of the given PFNs.

        :param se: storage element name
        :param pfns: PFNs to check
        :return: the failing storage query result, otherwise S_OK with
                 'Terminal' (PFN -> prognosis) and 'Migrated' (PFN -> storage checksum, '' if absent)
    """
    migrated = {}
    terminal = {}
    res = self.ReplicaManager.getStorageFileMetadata( pfns, se )
    if not res['OK']:
      return res
    for pfn, error in res['Value']['Failed'].items():
      if re.search( "File does not exist", error ):
        gLogger.error( "[%s] __getStorageMetadata: PFN does not exist at StorageElement." % se, "%s %s" % ( pfn, error ) )
        terminal[pfn] = 'PFNMissing'
      else:
        gLogger.warn( "[%s] __getMigratedFiles: Failed to obtain physical file metadata." % se, "%s %s" % ( pfn, error ) )
    storageMetadata = res['Value']['Successful']
    for pfn, metadata in storageMetadata.items():
      if metadata['Migrated']:
        # dict.get replaces the has_key lookup, which does not exist on Python 3 dicts
        migrated[pfn] = metadata.get( 'Checksum', '' )
      elif metadata['Lost']:
        gLogger.error( "[%s] __getMigratedFiles: PFN has been Lost by the StorageElement." % se, "%s" % ( pfn ) )
        terminal[pfn] = 'PFNLost'
      elif metadata['Unavailable']:
        gLogger.error( "[%s] __getMigratedFiles: PFN declared Unavailable by StorageElement." % se, "%s" % ( pfn ) )
        terminal[pfn] = 'PFNUnavailable'
    resDict = {'Terminal':terminal, 'Migrated':migrated}
    return S_OK( resDict )

  def __validateChecksums( self, se, migratedFileIDs, migratingFiles ):
    """ Obtain the checksums in the catalog if not present and check against the checksum from the storage

        Files without a storage checksum are only logged and end up in neither list.

        :param se: storage element name, used to label logs and data logging records
        :param migratedFileIDs: dict file ID -> checksum reported by the storage
        :param migratingFiles: dict file ID -> replica record; missing 'Checksum' entries are filled in place
        :return: the failing catalog lookup result, otherwise S_OK with 'MatchingFiles' and 'MismatchFiles' (lists of file IDs)
    """
    lfnFileID = {}
    checksumToObtain = []
    for fileID in migratedFileIDs:
      if not migratingFiles[fileID]['Checksum']:
        lfn = migratingFiles[fileID]['LFN']
        checksumToObtain.append( lfn )
        lfnFileID[lfn] = fileID
    if checksumToObtain:
      res = self.ReplicaManager.getCatalogFileMetadata( checksumToObtain )
      if not res['OK']:
        gLogger.error( "[%s] __validateChecksums: Failed to obtain file checksums" % se, res['Message'] )
        return res
      for lfn, error in res['Value']['Failed'].items():
        gLogger.error( "[%s] __validateChecksums: Failed to get file checksum" % se, "%s %s" % ( lfn, error ) )
      for lfn, metadata in res['Value']['Successful'].items():
        migratingFiles[lfnFileID[lfn]]['Checksum'] = metadata['CheckSumValue']
    mismatchFiles = []
    matchFiles = []
    checksumMismatches = []
    fileRecords = []
    for fileID, seChecksum in migratedFileIDs.items():
      lfn = migratingFiles[fileID]['LFN']
      catalogChecksum = migratingFiles[fileID]['Checksum']
      if not seChecksum:
        gLogger.error( "[%s] __validateChecksums: Storage checksum not available" % se, migratingFiles[fileID]['PFN'] )
      elif not compareAdler( seChecksum, catalogChecksum ):
        gLogger.error( "[%s] __validateChecksums: Storage and catalog checksum mismatch" % se, "%s '%s' '%s'" % ( migratingFiles[fileID]['PFN'], seChecksum, catalogChecksum ) )
        mismatchFiles.append( fileID )
        pfn = migratingFiles[fileID]['PFN']
        # Separate name: the 'se' argument must keep labelling the later records and logs
        replicaSE = migratingFiles[fileID]['SE']
        checksumMismatches.append( ( lfn, pfn, replicaSE, 'CatalogPFNChecksumMismatch' ) )
        fileRecords.append( ( lfn, 'Checksum mismatch', '%s@%s' % ( seChecksum, replicaSE ), '', 'MigrationMonitoringAgent' ) )
      else:
        fileRecords.append( ( lfn, 'Checksum match', '%s@%s' % ( seChecksum, se ), '', 'MigrationMonitoringAgent' ) )
        matchFiles.append( fileID )
    # Add the data logging records
    self.DataLog.addFileRecords( fileRecords )
    if checksumMismatches:
      # Update the (mis)matching checksums (in the integrityDB and) in the migration monitoring db
      self.__reportProblematicReplicas( checksumMismatches )
      res = self.MigrationMonitoringDB.setMigratingReplicaStatus( mismatchFiles, 'ChecksumFail' )
      if not res['OK']:
        gLogger.error( "[%s] __validateChecksums: Failed to update checksum mismatching files." % se, res['Message'] )
    if matchFiles:
      res = self.MigrationMonitoringDB.setMigratingReplicaStatus( matchFiles, 'ChecksumMatch' )
      if not res['OK']:
        gLogger.error( "[%s] __validateChecksums: Failed to update checksum matching files." % se, res['Message'] )
    resDict = {'MatchingFiles':matchFiles, 'MismatchFiles':mismatchFiles}
    return S_OK( resDict )

  def __updateMigrationAccounting( self, se, migratingFiles, matchingFiles, mismatchingFiles, assumedEndTime, previousMonitorTime ):
    """ Create accounting messages for the overall throughput observed and the total migration time for the files

        :param se: storage element name
        :param migratingFiles: dict file ID -> replica record; missing 'Size' entries are filled in place
        :param matchingFiles: file IDs whose checksum matched
        :param mismatchingFiles: file IDs whose checksum did not match
        :param assumedEndTime: datetime taken as the end of the migration
        :param previousMonitorTime: datetime of the previous monitoring pass for this SE
        :return: the failing size lookup result, the accounting commit result when files
                 were migrated, otherwise S_OK()
    """
    allMigrated = matchingFiles + mismatchingFiles
    gMonitor.addMark( "MigratedFiles%s" % se, len( allMigrated ) )
    gMonitor.addMark( "TotalMigratedFiles%s" % se, len( allMigrated ) )
    lfnFileID = {}
    sizesToObtain = []
    for fileID in allMigrated:
      if not migratingFiles[fileID]['Size']:
        lfn = migratingFiles[fileID]['LFN']
        sizesToObtain.append( lfn )
        lfnFileID[lfn] = fileID
    if sizesToObtain:
      res = self.ReplicaManager.getCatalogFileSize( sizesToObtain )
      if not res['OK']:
        gLogger.error( "[%s] __updateMigrationAccounting: Failed to obtain file sizes" % se )
        return res
      for lfn, error in res['Value']['Failed'].items():
        gLogger.error( "[%s] __updateAccounting: Failed to get file size" % se, "%s %s" % ( lfn, error ) )
        migratingFiles[lfnFileID[lfn]]['Size'] = 0
      for lfn, size in res['Value']['Successful'].items():
        migratingFiles[lfnFileID[lfn]]['Size'] = size
    totalSize = 0
    for fileID in allMigrated:
      size = migratingFiles[fileID]['Size']
      totalSize += size
      submitTime = migratingFiles[fileID]['SubmitTime']
      # Elapsed time is end minus start, as in __initialiseAccountingObject
      timeDiff = assumedEndTime - submitTime
      migrationTime = ( timeDiff.days * 86400 ) + ( timeDiff.seconds ) + ( timeDiff.microseconds / 1000000.0 )
      gMonitor.addMark( "MigrationTime%s" % se, migrationTime )
      gDataStoreClient.addRegister( self.__initialiseAccountingObject( 'MigrationTime', se, submitTime, assumedEndTime, size ) )
      gDataStoreClient.addRegister( self.__initialiseAccountingObject( 'MigrationThroughput', se, previousMonitorTime, assumedEndTime, size ) )
      oDataOperation = self.__initialiseAccountingObject( 'MigrationSuccess', se, submitTime, assumedEndTime, size )
      if fileID in mismatchingFiles:
        oDataOperation.setValueByKey( 'TransferOK', 0 )
        oDataOperation.setValueByKey( 'FinalStatus', 'Failed' )
      gDataStoreClient.addRegister( oDataOperation )
    gMonitor.addMark( "TotalMigratedSize%s" % se, totalSize )
    gMonitor.addMark( "ChecksumMismatches%s" % se, len( mismatchingFiles ) )
    gMonitor.addMark( "TotalChecksumMismatches%s" % se, len( mismatchingFiles ) )
    gMonitor.addMark( "ChecksumMatches%s" % se, len( matchingFiles ) )
    gMonitor.addMark( "TotalChecksumMatches%s" % se, len( matchingFiles ) )
    if allMigrated:
      gLogger.info( '[%s] __updateMigrationAccounting: Attempting to send accounting message...' % se )
      return gDataStoreClient.commit()
    return S_OK()

  #########################################################################################################
  #
  # Utility methods used by all methods
  #

  def __getFiles( self, se, status ):
    """ Get files with the given status and se from the database

        :param se: storage element name passed to the DB query
        :param status: replica status to select
        :return: the failing DB result, otherwise S_OK with 'PFNIDs' (PFN -> file ID)
                 and 'Files' (the DB result value, keyed by file ID)
    """
    res = self.MigrationMonitoringDB.getMigratingReplicas( se, status )
    if not res['OK']:
      return res
    files = res['Value']
    pfnIDs = {}
    for fileID, metadataDict in files.items():
      pfnIDs[metadataDict['PFN']] = fileID
    return S_OK( {'PFNIDs':pfnIDs, 'Files':files} )

  def __getCatalogFileMetadata( self, files ):
    """ Look up size and checksum in the catalog for the files lacking either of them.

        LFNs the catalog reports as non-existent are reported as problematic and their
        replicas set to 'Failed'.

        :param files: dict file ID -> replica record with 'Size', 'Checksum' and 'LFN'
        :return: the failing catalog result, otherwise S_OK with a dict
                 file ID -> {'Size', 'Checksum'} (empty when nothing had to be looked up)
    """
    lfnFileID = {}
    metadataToObtain = []
    for fileID, metadata in files.items():
      if not ( metadata['Size'] and metadata['Checksum'] ):
        lfn = metadata['LFN']
        metadataToObtain.append( lfn )
        lfnFileID[lfn] = fileID
    if not metadataToObtain:
      # Always hand back a dict: the caller takes len() of the value
      return S_OK( {} )
    res = self.ReplicaManager.getCatalogFileMetadata( metadataToObtain )
    if not res['OK']:
      gLogger.error( "__getCatalogFileMetadata: Failed to obtain file metadata", res['Message'] )
      return res
    successful = res['Value']['Successful']
    failed = res['Value']['Failed']
    terminalIDs = []
    problematicFiles = []
    for lfn, error in failed.items():
      gLogger.error( "__getCatalogFileMetadata: Failed to get file metadata", "%s %s" % ( lfn, error ) )
      if re.search( "No such file or directory", error ):
        problematicFiles.append( lfn )
        terminalIDs.append( lfnFileID[lfn] )
    if terminalIDs:
      self.__reportProblematicFiles( problematicFiles, 'LFNCatalogMissing' )
      self.__setMigratingReplicaStatus( terminalIDs, 'Failed' )
    fileMetadata = {}
    for lfn, metadata in successful.items():
      size = metadata['Size']
      checksum = metadata['CheckSumValue']
      fileMetadata[lfnFileID[lfn]] = {'Size':size, 'Checksum':checksum}
    return S_OK( fileMetadata )

  def __setMigratingReplicaStatus( self, fileIDs, status ):
    """ Set the status of the given replicas in the migration monitoring DB and log the outcome.

        :param fileIDs: file IDs to update
        :param status: new status string
    """
    gLogger.info( "__setMigratingReplicaStatus: Attempting to update %s files to '%s'" % ( len( fileIDs ), status ) )
    res = self.MigrationMonitoringDB.setMigratingReplicaStatus( fileIDs, status )
    if not res['OK']:
      gLogger.info( "__setMigratingReplicaStatus: Failed to update status of files", res['Message'] )
    else:
      gLogger.info( "__setMigratingReplicaStatus: Successfully updated status of files" )

  def __reportProblematicFiles( self, lfns, reason ):
    """ Log the given LFNs and flag them as problematic in the integrity DB.

        :param lfns: LFNs to report
        :param reason: prognosis string sent with the report
    """
    gLogger.info( '__reportProblematicFiles: The following %s files were found with %s' % ( len( lfns ), reason ) )
    for lfn in sortList( lfns ):
      gLogger.info( lfn )
    res = self.DataIntegrityClient.setFileProblematic( lfns, reason, sourceComponent = 'MigrationMonitoringAgent' )
    if not res['OK']:
      gLogger.info( '__reportProblematicFiles: Failed to update integrity DB with files', res['Message'] )
    else:
      gLogger.info( '__reportProblematicFiles: Successfully updated integrity DB with files' )

  def __reportProblematicReplicas( self, replicaTuples ):
    """ Log the given replicas and flag them as problematic in the integrity DB.

        :param replicaTuples: list of ( lfn, pfn, se, reason ) tuples
    """
    gLogger.info( '__reportProblematicReplicas: The following %s files being reported to integrity DB:' % ( len( replicaTuples ) ) )
    for lfn, pfn, se, reason in sortList( replicaTuples ):
      # Fall back to the PFN when the replica has no LFN
      if lfn:
        gLogger.info( lfn )
      else:
        gLogger.info( pfn )
    res = self.DataIntegrityClient.setReplicaProblematic( replicaTuples, sourceComponent = 'MigrationMonitoringAgent' )
    if not res['OK']:
      gLogger.info( '__reportProblematicReplicas: Failed to update integrity DB with replicas', res['Message'] )
    else:
      gLogger.info( '__reportProblematicReplicas: Successfully updated integrity DB with replicas' )

  def __initialiseAccountingObject( self, operation, se, startTime, endTime, size ):
    """ Build a DataOperation accounting record describing one successful transfer.

        :param operation: value stored as 'OperationType'
        :param se: storage element stored as 'Destination'
        :param startTime: datetime marking the start of the operation
        :param endTime: datetime marking the end of the operation
        :param size: value stored as 'TransferSize'
        :return: the populated DataOperation object
    """
    accountingDict = {}
    accountingDict['OperationType'] = operation
    accountingDict['User'] = self.userName
    accountingDict['Protocol'] = 'SRM'
    accountingDict['RegistrationTime'] = 0.0
    accountingDict['RegistrationOK'] = 0
    accountingDict['RegistrationTotal'] = 0
    accountingDict['TransferTotal'] = 1
    accountingDict['TransferOK'] = 1
    accountingDict['TransferSize'] = size
    timeDiff = endTime - startTime
    transferTime = ( timeDiff.days * 86400 ) + ( timeDiff.seconds ) + ( timeDiff.microseconds / 1000000.0 )
    accountingDict['TransferTime'] = transferTime
    accountingDict['FinalStatus'] = 'Successful'
    accountingDict['Source'] = siteName()
    accountingDict['Destination'] = se
    oDataOperation = DataOperation()
    oDataOperation.setEndTime( endTime )
    oDataOperation.setStartTime( startTime )
    oDataOperation.setValuesFromDict( accountingDict )
    return oDataOperation
Пример #24
0
class StageMonitorAgent( AgentModule ):
  """ Agent handling the StageSubmitted->Staged transition of the cache replicas
      held in the StorageManagementDB.
  """

  def initialize( self ):
    """ Create the clients used by the agent and set the shifter proxy option.

        :return: S_OK()
    """
    self.replicaManager = ReplicaManager()
    #self.stagerClient = StorageManagerClient()
    self.dataIntegrityClient = DataIntegrityClient()
    self.storageDB = StorageManagementDB()
    # This sets the Default Proxy to used as that defined under
    # /Operations/Shifter/DataManager
    # the shifterProxy option in the Configuration can be used to change this default.
    self.am_setOption( 'shifterProxy', 'DataManager' )

    return S_OK()

  def execute( self ):
    """ Run one agent cycle.

        :return: the result of monitorStageRequests
    """
    res = self.monitorStageRequests()
    return res

  def monitorStageRequests( self ):
    """ This is the third logical task manages the StageSubmitted->Staged transition of the Replicas

        :return: the failing or empty lookup result, otherwise S_OK()
    """
    res = self.__getStageSubmittedReplicas()
    if not res['OK']:
      gLogger.fatal( "StageMonitor.monitorStageRequests: Failed to get replicas from StorageManagementDB.", res['Message'] )
      return res
    if not res['Value']:
      gLogger.info( "StageMonitor.monitorStageRequests: There were no StageSubmitted replicas found" )
      return res
    seReplicas = res['Value']['SEReplicas']
    replicaIDs = res['Value']['ReplicaIDs']
    gLogger.info( "StageMonitor.monitorStageRequests: Obtained %s StageSubmitted replicas for monitoring." % len( replicaIDs ) )
    for storageElement, seReplicaIDs in seReplicas.items():
      self.__monitorStorageElementStageRequests( storageElement, seReplicaIDs, replicaIDs )
    return S_OK()

  def __monitorStorageElementStageRequests( self, storageElement, seReplicaIDs, replicaIDs ):
    """ Check the stage requests of one storage element and update the replica states.

        Only the PFNs that carry a RequestID are queried at the storage element.

        :param storageElement: storage element name
        :param seReplicaIDs: replica IDs located at that storage element
        :param replicaIDs: dict replica ID -> replica record with 'PFN' and optionally 'RequestID'
    """
    terminalReplicaIDs = {}
    stagedReplicas = []
    pfnRepIDs = {}
    pfnReqIDs = {}
    for replicaID in seReplicaIDs:
      pfn = replicaIDs[replicaID]['PFN']
      pfnRepIDs[pfn] = replicaID
      requestID = replicaIDs[replicaID].get( 'RequestID', None )
      if requestID:
        pfnReqIDs[pfn] = requestID

    gLogger.info( "StageMonitor.__monitorStorageElementStageRequests: Monitoring %s stage requests for %s." % ( len( pfnRepIDs ), storageElement ) )
    # list() gives a real list on Python 3 too, where keys() is only a view
    res = self.replicaManager.getStorageFileMetadata( list( pfnReqIDs ), storageElement )
    if not res['OK']:
      gLogger.error( "StageMonitor.__monitorStorageElementStageRequests: Completely failed to monitor stage requests for replicas.", res['Message'] )
      return
    prestageStatus = res['Value']
    for pfn, reason in prestageStatus['Failed'].items():
      if re.search( 'File does not exist', reason ):
        gLogger.error( "StageMonitor.__monitorStorageElementStageRequests: PFN did not exist in the StorageElement", pfn )
        terminalReplicaIDs[pfnRepIDs[pfn]] = 'PFN did not exist in the StorageElement'
    for pfn, staged in prestageStatus['Successful'].items():
      if staged and 'Cached' in staged and staged['Cached']:
        stagedReplicas.append( pfnRepIDs[pfn] )

    # Update the states of the replicas in the database
    if terminalReplicaIDs:
      gLogger.info( "StageMonitor.__monitorStorageElementStageRequests: %s replicas are terminally failed." % len( terminalReplicaIDs ) )
      res = self.storageDB.updateReplicaFailure( terminalReplicaIDs )
      if not res['OK']:
        gLogger.error( "StageMonitor.__monitorStorageElementStageRequests: Failed to update replica failures.", res['Message'] )
    if stagedReplicas:
      gLogger.info( "StageMonitor.__monitorStorageElementStageRequests: %s staged replicas to be updated." % len( stagedReplicas ) )
      res = self.storageDB.setStageComplete( stagedReplicas )
      if not res['OK']:
        gLogger.error( "StageMonitor.__monitorStorageElementStageRequests: Failed to updated staged replicas.", res['Message'] )
      res = self.storageDB.updateReplicaStatus( stagedReplicas, 'Staged' )
      if not res['OK']:
        gLogger.error( "StageRequest.__monitorStorageElementStageRequests: Failed to insert replica status.", res['Message'] )
    return

  def __getStageSubmittedReplicas( self ):
    """ This obtains the StageSubmitted replicas from the Replicas table and the RequestID from the StageRequests table

        :return: the failing DB result, S_OK() when no replica is found, S_ERROR when no
                 stage request matches, otherwise S_OK with 'SEReplicas' (SE -> replica IDs)
                 and 'ReplicaIDs' (replica ID -> record completed with 'RequestID')
    """
    res = self.storageDB.getCacheReplicas( {'Status':'StageSubmitted'} )
    if not res['OK']:
      gLogger.error( "StageRequest.__getStageSubmittedReplicas: Failed to get replicas with StageSubmitted status.", res['Message'] )
      return res
    if not res['Value']:
      gLogger.debug( "StageRequest.__getStageSubmittedReplicas: No StageSubmitted replicas found to process." )
      return S_OK()
    else:
      gLogger.debug( "StageRequest.__getStageSubmittedReplicas: Obtained %s StageSubmitted replicas(s) to process." % len( res['Value'] ) )

    seReplicas = {}
    replicaIDs = res['Value']
    for replicaID, info in replicaIDs.items():
      # setdefault replaces the has_key test, which does not exist on Python 3 dicts
      seReplicas.setdefault( info['SE'], [] ).append( replicaID )

    # RequestID was missing from replicaIDs dictionary BUGGY?
    res = self.storageDB.getStageRequests( {'ReplicaID':list( replicaIDs )} )
    if not res['OK']:
      return res
    if not res['Value']:
      return S_ERROR( 'Could not obtain request IDs for replicas %s from StageRequests table' % ( list( replicaIDs ) ) )

    for replicaID, info in res['Value'].items():
      reqID = info['RequestID']
      replicaIDs[replicaID]['RequestID'] = reqID

    return S_OK( {'SEReplicas':seReplicas, 'ReplicaIDs':replicaIDs} )

  def __reportProblematicFiles( self, lfns, reason ):
    """ Report problematic LFNs to the data integrity client (currently a no-op).

        The unconditional return below short-circuits the method, so the
        setFileProblematic call and the logging that follow never execute.

        :param lfns: LFNs that would be flagged as problematic
        :param reason: prognosis string that would accompany the report
        :return: S_OK() without a value
    """
    # NOTE(review): reporting looks deliberately switched off - confirm before removing this return
    return S_OK()
    res = self.dataIntegrityClient.setFileProblematic( lfns, reason, self.name )
    if not res['OK']:
      gLogger.error( "RequestPreparation.__reportProblematicFiles: Failed to report missing files.", res['Message'] )
      return res
    if res['Value']['Successful']:
      gLogger.info( "RequestPreparation.__reportProblematicFiles: Successfully reported %s missing files." % len( res['Value']['Successful'] ) )
    if res['Value']['Failed']:
      gLogger.info( "RequestPreparation.__reportProblematicFiles: Failed to report %s problematic files." % len( res['Value']['Failed'] ) )
    return res
Пример #25
0
class ValidateOutputDataAgent( AgentModule ):
  """ Agent running integrity checks on the output directories of transformations in
      'ValidatingOutput' status and promoting 'WaitingIntegrity' transformations once
      no problematic files remain.
  """

  def __init__( self, *args, **kwargs ):
    """ c'tor: create the clients and read the agent options
    """
    AgentModule.__init__( self, *args, **kwargs )

    self.integrityClient = DataIntegrityClient()
    self.fc = FileCatalog()
    self.transClient = TransformationClient()
    self.fileCatalogClient = FileCatalogClient()

    # The agent option takes precedence over the Operations default
    agentTSTypes = self.am_getOption( 'TransformationTypes', [] )
    if agentTSTypes:
      self.transformationTypes = agentTSTypes
    else:
      self.transformationTypes = Operations().getValue( 'Transformations/DataProcessing', ['MCSimulation', 'Merge'] )

    self.directoryLocations = sorted( self.am_getOption( 'DirectoryLocations', ['TransformationDB',
                                                                                  'MetadataCatalog'] ) )
    self.activeStorages = sorted( self.am_getOption( 'ActiveSEs', [] ) )
    self.transfidmeta = self.am_getOption( 'TransfIDMeta', "TransformationID" )
    self.enableFlag = True

  #############################################################################

  def initialize( self ):
    """ Sets defaults

        :return: S_OK()
    """
    # This sets the Default Proxy to used as that defined under
    # /Operations/Shifter/DataManager
    # the shifterProxy option in the Configuration can be used to change this default.
    self.am_setOption( 'shifterProxy', 'DataManager' )

    gLogger.info( "Will treat the following transformation types: %s" % str( self.transformationTypes ) )
    gLogger.info( "Will search for directories in the following locations: %s" % str( self.directoryLocations ) )
    gLogger.info( "Will check the following storage elements: %s" % str( self.activeStorages ) )
    gLogger.info( "Will use %s as metadata tag name for TransformationID" % self.transfidmeta )
    return S_OK()

  #############################################################################

  def execute( self ):
    """ The VerifyOutputData execution method

        :return: the failing transformation query result, otherwise S_OK(); the value is
                 'Disabled via CS flag' when the EnableFlag option is not 'True'
    """
    self.enableFlag = self.am_getOption( 'EnableFlag', 'True' )
    if not self.enableFlag == 'True':
      self.log.info( "VerifyOutputData is disabled by configuration option 'EnableFlag'" )
      return S_OK( 'Disabled via CS flag' )

    gLogger.info( "-" * 40 )
    self.updateWaitingIntegrity()
    gLogger.info( "-" * 40 )

    res = self.transClient.getTransformations( {'Status':'ValidatingOutput', 'Type':self.transformationTypes} )
    if not res['OK']:
      gLogger.error( "Failed to get ValidatingOutput transformations", res['Message'] )
      return res
    transDicts = res['Value']
    if not transDicts:
      gLogger.info( "No transformations found in ValidatingOutput status" )
      return S_OK()
    gLogger.info( "Found %s transformations in ValidatingOutput status" % len( transDicts ) )
    for transDict in transDicts:
      transID = transDict['TransformationID']
      res = self.checkTransformationIntegrity( int( transID ) )
      if not res['OK']:
        # %s rather than %d: transID is not guaranteed to be an integer (it is cast with int() above)
        gLogger.error( "Failed to perform full integrity check for transformation %s" % transID )
      else:
        self.finalizeCheck( transID )
        gLogger.info( "-" * 40 )
    return S_OK()

  def updateWaitingIntegrity( self ):
    """ Get 'WaitingIntegrity' transformations, update to 'ValidatedOutput'

        :return: the failing transformation query result, otherwise S_OK()
    """
    gLogger.info( "Looking for transformations in the WaitingIntegrity status to update" )
    res = self.transClient.getTransformations( {'Status':'WaitingIntegrity'} )
    if not res['OK']:
      gLogger.error( "Failed to get WaitingIntegrity transformations", res['Message'] )
      return res
    transDicts = res['Value']
    if not transDicts:
      gLogger.info( "No transformations found in WaitingIntegrity status" )
      return S_OK()
    gLogger.info( "Found %s transformations in WaitingIntegrity status" % len( transDicts ) )
    for transDict in transDicts:
      transID = transDict['TransformationID']
      gLogger.info( "-" * 40 )
      res = self.integrityClient.getTransformationProblematics( int( transID ) )
      if not res['OK']:
        gLogger.error( "Failed to determine waiting problematics for transformation", res['Message'] )
      elif not res['Value']:
        # No problematic files left: the transformation can be promoted
        res = self.transClient.setTransformationParameter( transID, 'Status', 'ValidatedOutput' )
        if not res['OK']:
          gLogger.error( "Failed to update status of transformation %s to ValidatedOutput" % ( transID ) )
        else:
          gLogger.info( "Updated status of transformation %s to ValidatedOutput" % ( transID ) )
      else:
        gLogger.info( "%d problematic files for transformation %s were found" % ( len( res['Value'] ), transID ) )
    # Same result structure as the other exits of this method
    return S_OK()

  #############################################################################
  #
  # Get the transformation directories for checking
  #

  def getTransformationDirectories( self, transID ):
    """ Get the directories for the supplied transformation from the transformation system

        Directories are collected from the sources enabled in self.directoryLocations
        ('TransformationDB' and/or 'MetadataCatalog').

        :param transID: transformation ID
        :return: the failing lookup result, otherwise S_OK with the sorted list of directories
    """
    directories = []
    if 'TransformationDB' in self.directoryLocations:
      res = self.transClient.getTransformationParameters( transID, ['OutputDirectories'] )
      if not res['OK']:
        gLogger.error( "Failed to obtain transformation directories", res['Message'] )
        return res
      transDirectories = res['Value'].splitlines()
      directories = self._addDirs( transID, transDirectories, directories )

    if 'MetadataCatalog' in self.directoryLocations:
      res = self.fileCatalogClient.findDirectoriesByMetadata( {self.transfidmeta:transID} )
      if not res['OK']:
        gLogger.error( "Failed to obtain metadata catalog directories", res['Message'] )
        return res
      transDirectories = res['Value']
      directories = self._addDirs( transID, transDirectories, directories )
    if not directories:
      gLogger.info( "No output directories found" )
    directories = sorted( directories )
    return S_OK( directories )

  @staticmethod
  def _addDirs( transID, newDirs, existingDirs ):
    """ Append to existingDirs the new directories whose path contains the
        transformation ID zero-padded to 8 characters, skipping duplicates.

        :param transID: transformation ID
        :param newDirs: candidate directories
        :param existingDirs: list extended in place
        :return: existingDirs
    """
    # The padded ID does not depend on the directory, so build it once
    transStr = str( transID ).zfill( 8 )
    for nDir in newDirs:
      if re.search( transStr, nDir ) and nDir not in existingDirs:
        existingDirs.append( nDir )
    return existingDirs

  #############################################################################
  def checkTransformationIntegrity( self, transID ):
    """ This method contains the real work

        Runs the Catalog->SE check on every existing output directory, then the
        SE->Catalog check for every active storage element.

        :param transID: transformation ID
        :return: the first failing result or S_ERROR, otherwise S_OK()
    """
    gLogger.info( "-" * 40 )
    gLogger.info( "Checking the integrity of transformation %s" % transID )
    gLogger.info( "-" * 40 )

    res = self.getTransformationDirectories( transID )
    if not res['OK']:
      return res
    directories = res['Value']
    if not directories:
      return S_OK()

    ######################################################
    #
    # This check performs Catalog->SE for possible output directories
    #
    res = self.fc.exists( directories )
    if not res['OK']:
      gLogger.error( res['Message'] )
      return res
    # 'Failed' maps directory -> error: .items() is required to unpack both
    for directory, error in res['Value']['Failed'].items():
      gLogger.error( 'Failed to determine existance of directory', '%s %s' % ( directory, error ) )
    if res['Value']['Failed']:
      return S_ERROR( "Failed to determine the existance of directories" )
    directoryExists = res['Value']['Successful']
    for directory in sorted( directoryExists.keys() ):
      if not directoryExists[directory]:
        continue
      iRes = self.integrityClient.catalogDirectoryToSE( directory )
      if not iRes['OK']:
        gLogger.error( iRes['Message'] )
        return iRes

    ######################################################
    #
    # This check performs SE->Catalog for possible output directories
    #
    for storageElementName in sorted( self.activeStorages ):
      res = self.integrityClient.storageDirectoryToCatalog( directories, storageElementName )
      if not res['OK']:
        gLogger.error( res['Message'] )
        return res

    gLogger.info( "-" * 40 )
    gLogger.info( "Completed integrity check for transformation %s" % transID )
    return S_OK()

  def finalizeCheck( self, transID ):
    """ Move to 'WaitingIntegrity' or 'ValidatedOutput'

        The status is left untouched when the problematic files cannot be determined.

        :param transID: transformation ID
        :return: S_OK()
    """
    res = self.integrityClient.getTransformationProblematics( int( transID ) )
    if not res['OK']:
      gLogger.error( "Failed to determine whether there were associated problematic files", res['Message'] )
      newStatus = ''
    elif res['Value']:
      gLogger.info( "%d problematic files for transformation %s were found" % ( len( res['Value'] ), transID ) )
      newStatus = "WaitingIntegrity"
    else:
      gLogger.info( "No problematics were found for transformation %s" % transID )
      newStatus = "ValidatedOutput"
    if newStatus:
      res = self.transClient.setTransformationParameter( transID, 'Status', newStatus )
      if not res['OK']:
        gLogger.error( "Failed to update status of transformation %s to %s" % ( transID, newStatus ) )
      else:
        gLogger.info( "Updated status of transformation %s to %s" % ( transID, newStatus ) )
    gLogger.info( "-" * 40 )
    return S_OK()
Пример #26
0
class StageMonitorAgent( AgentModule ):

  def initialize( self ):
    """ Create the clients used by the agent and select the proxy it runs with.

    :return: S_OK()
    """
    self.replicaManager = ReplicaManager()
    self.stagerClient = StorageManagerClient()
    self.dataIntegrityClient = DataIntegrityClient()

    # The agent runs with the shifter proxy defined under /Operations/Shifter/DataManager.
    # Setting the shifterProxy option in the Configuration overrides this default.
    self.am_setOption( "shifterProxy", "DataManager" )
    return S_OK()

  def execute( self ):
    """ Run one agent cycle: read the proxy information, then monitor the stage requests.

    :return: the failed getProxyInfo() result, otherwise the result of monitorStageRequests()
    """
    proxyInfo = getProxyInfo( disableVOMS = True )
    if proxyInfo['OK']:
      # Kept on the instance because the accounting records need the user name
      self.proxyInfoDict = proxyInfo['Value']
      return self.monitorStageRequests()
    return proxyInfo

  def monitorStageRequests( self ):
    """ Third logical task: handle the StageSubmitted->Staged transition of the Replicas.

    The StageSubmitted replicas are monitored storage element by storage element and the
    accounting records collected on the way are committed at the end.

    :return: the lookup result when it failed or was empty, S_OK() otherwise
    """
    submitted = self.__getStageSubmittedReplicas()
    if not submitted['OK']:
      gLogger.fatal( "StageMonitor.monitorStageRequests: Failed to get replicas from StorageManagementDB.", submitted['Message'] )
      return submitted
    found = submitted['Value']
    if not found:
      gLogger.info( "StageMonitor.monitorStageRequests: There were no StageSubmitted replicas found" )
      return submitted

    seReplicas = found['SEReplicas']
    replicaIDs = found['ReplicaIDs']
    gLogger.info( "StageMonitor.monitorStageRequests: Obtained %s StageSubmitted replicas for monitoring." % len( replicaIDs ) )
    for storageElement in seReplicas:
      self.__monitorStorageElementStageRequests( storageElement, seReplicas[storageElement], replicaIDs )

    gDataStoreClient.commit()
    return S_OK()

  def __monitorStorageElementStageRequests( self, storageElement, seReplicaIDs, replicaIDs ):
    """ Query one storage element for the state of its stage requests and record the outcome.

    Only replicas that carry a 'RequestID' are queried. Depending on the returned metadata a
    replica is reported as terminally failed (file does not exist), as staged ('Cached' is
    true) or handed to __wakeupOldRequests ('Cached' is false). An accounting record is
    registered for the operation.

    :param storageElement: name of the storage element to query
    :param seReplicaIDs: IDs of the replicas located at this storage element
    :param replicaIDs: dictionary replicaID -> replica information ('PFN', optional 'RequestID')
    :return: None
    """
    failedReplicas = {}
    cachedReplicas = []
    pendingReplicas = []
    pfnRepIDs = {}
    pfnReqIDs = {}
    for replicaID in seReplicaIDs:
      replicaInfo = replicaIDs[replicaID]
      pfn = replicaInfo['PFN']
      pfnRepIDs[pfn] = replicaID
      requestID = replicaInfo.get( 'RequestID', None )
      if requestID:
        pfnReqIDs[pfn] = requestID

    gLogger.info( "StageMonitor.__monitorStorageElementStageRequests: Monitoring %s stage requests for %s." % ( len( pfnRepIDs ), storageElement ) )
    oAccounting = DataOperation()
    oAccounting.setStartTime()

    res = self.replicaManager.getStorageFileMetadata( pfnReqIDs.keys(), storageElement )
    if not res['OK']:
      gLogger.error( "StageMonitor.__monitorStorageElementStageRequests: Completely failed to monitor stage requests for replicas.", res['Message'] )
      return
    prestageStatus = res['Value']

    accountingDict = self.__newAccountingDict( storageElement )

    for pfn, reason in prestageStatus['Failed'].items():
      accountingDict['TransferTotal'] += 1
      if re.search( 'File does not exist', reason ):
        gLogger.error( "StageMonitor.__monitorStorageElementStageRequests: PFN did not exist in the StorageElement", pfn )
        failedReplicas[pfnRepIDs[pfn]] = 'PFN did not exist in the StorageElement'

    for pfn, staged in prestageStatus['Successful'].items():
      # Metadata without a 'Cached' entry gives no information about the staging state
      if not staged or 'Cached' not in staged:
        continue
      if staged['Cached']:
        accountingDict['TransferTotal'] += 1
        accountingDict['TransferOK'] += 1
        accountingDict['TransferSize'] += staged['Size']
        cachedReplicas.append( pfnRepIDs[pfn] )
      else:
        # Not on disk yet: only the ReplicaID is kept for the retry
        pendingReplicas.append( pfnRepIDs[pfn] )

    oAccounting.setValuesFromDict( accountingDict )
    oAccounting.setEndTime()
    gDataStoreClient.addRegister( oAccounting )

    # Update the states of the replicas in the database
    if failedReplicas:
      gLogger.info( "StageMonitor.__monitorStorageElementStageRequests: %s replicas are terminally failed." % len( failedReplicas ) )
      res = self.stagerClient.updateReplicaFailure( failedReplicas )
      if not res['OK']:
        gLogger.error( "StageMonitor.__monitorStorageElementStageRequests: Failed to update replica failures.", res['Message'] )
    if cachedReplicas:
      gLogger.info( "StageMonitor.__monitorStorageElementStageRequests: %s staged replicas to be updated." % len( cachedReplicas ) )
      res = self.stagerClient.setStageComplete( cachedReplicas )
      if not res['OK']:
        gLogger.error( "StageMonitor.__monitorStorageElementStageRequests: Failed to updated staged replicas.", res['Message'] )
      res = self.stagerClient.updateReplicaStatus( cachedReplicas, 'Staged' )
      if not res['OK']:
        gLogger.error( "StageMonitor.__monitorStorageElementStageRequests: Failed to insert replica status.", res['Message'] )
    if pendingReplicas:
      gLogger.info( "StageMonitor.__monitorStorageElementStageRequests: %s old requests will be retried." % len( pendingReplicas ) )
      res = self.__wakeupOldRequests( pendingReplicas )
      if not res['OK']:
        gLogger.error( "StageMonitor.__monitorStorageElementStageRequests: Failed to wakeup old requests.", res['Message'] )

  def __newAccountingDict( self, storageElement ):
    """ Build the initial accounting record of a staging operation at one storage element.

    All counters start at zero. The user is taken from the proxy information stored by
    execute() and the transfer time is set to the polling time of the agent.

    :param storageElement: storage element used both as 'Source' and as 'Destination'
    :return: dictionary with the accounting fields
    """
    return { 'OperationType' : 'Stage',
             'User' : self.proxyInfoDict['username'],
             'Protocol' : 'Stager',
             'RegistrationTime' : 0.0,
             'RegistrationOK' : 0,
             'RegistrationTotal' : 0,
             'FinalStatus' : 'Successful',
             'Source' : storageElement,
             'Destination' : storageElement,
             'ExecutionSite' : siteName(),
             'TransferTotal' : 0,
             'TransferOK' : 0,
             'TransferSize' : 0,
             'TransferTime' : self.am_getPollingTime() }

  def __getStageSubmittedReplicas( self ):
    """ Obtain the StageSubmitted replicas from the Replicas table and the RequestID from the
    StageRequests table.

    :return: S_OK() without value when there is nothing to process, otherwise
             S_OK( {'SEReplicas': {storageElement: [replicaID, ...]},
                    'ReplicaIDs': {replicaID: replica information including 'RequestID'}} );
             the failed client result or S_ERROR when the request IDs cannot be obtained
    """
    res = self.stagerClient.getCacheReplicas( {'Status':'StageSubmitted'} )
    if not res['OK']:
      gLogger.error( "StageMonitor.__getStageSubmittedReplicas: Failed to get replicas with StageSubmitted status.", res['Message'] )
      return res
    if not res['Value']:
      gLogger.debug( "StageMonitor.__getStageSubmittedReplicas: No StageSubmitted replicas found to process." )
      return S_OK()
    gLogger.debug( "StageMonitor.__getStageSubmittedReplicas: Obtained %s StageSubmitted replicas(s) to process." % len( res['Value'] ) )

    # Group the replica IDs by the storage element hosting them
    replicaIDs = res['Value']
    seReplicas = {}
    for replicaID, info in replicaIDs.items():
      seReplicas.setdefault( info['SE'], [] ).append( replicaID )

    # The replica information does not carry the RequestID, it has to be added from the
    # StageRequests table. A real list is passed so that the call does not depend on
    # dict.keys() returning a list.
    requestedIDs = list( replicaIDs )
    res = self.stagerClient.getStageRequests( {'ReplicaID':requestedIDs} )
    if not res['OK']:
      return res
    if not res['Value']:
      return S_ERROR( 'Could not obtain request IDs for replicas %s from StageRequests table' % requestedIDs )

    for replicaID, info in res['Value'].items():
      replicaIDs[replicaID]['RequestID'] = info['RequestID']

    return S_OK( {'SEReplicas':seReplicas, 'ReplicaIDs':replicaIDs} )

  def __reportProblematicFiles( self, lfns, reason ):
    """ Placeholder for reporting problematic files to the data integrity service.

    Reporting is switched off: the method returns immediately and does not contact any
    service. To enable it, forward the arguments with
    self.dataIntegrityClient.setFileProblematic( lfns, reason, sourceComponent = 'StageMonitorAgent' )
    and return that result.

    :param lfns: LFNs to report (unused while reporting is disabled)
    :param reason: problem description (unused while reporting is disabled)
    :return: S_OK()
    """
    return S_OK()

  def __wakeupOldRequests( self, oldRequests ):
    """ Ask the stager client to wake up stage requests that have not completed yet.

    The retry interval is read from the 'RetryIntervalHour' option (default 2).

    :param oldRequests: list of ReplicaIDs whose requests should be retried
    :return: S_OK() on success, the failed client result otherwise
    """
    gLogger.info( "StageMonitor.__wakeupOldRequests: Attempting..." )
    intervalHours = self.am_getOption( 'RetryIntervalHour', 2 )
    wakeup = self.stagerClient.wakeupOldRequests( oldRequests, intervalHours )
    if wakeup['OK']:
      return S_OK()
    gLogger.error( "StageMonitor.__wakeupOldRequests: Failed to resubmit old requests.", wakeup['Message'] )
    return wakeup
Пример #27
0
class StageRequestAgent(AgentModule):
    def initialize(self):
        """ Create the clients used by the agent, read the pin lifetime and select the proxy.

        :return: S_OK()
        """
        self.replicaManager = ReplicaManager()
        self.dataIntegrityClient = DataIntegrityClient()
        self.storageDB = StorageManagementDB()

        # Pin lifetime in seconds, one day unless the PinLifetime option says otherwise
        oneDay = 60 * 60 * 24
        self.pinLifetime = self.am_getOption('PinLifetime', oneDay)

        # The agent runs with the shifter proxy defined under /Operations/Shifter/DataManager.
        # Setting the shifterProxy option in the Configuration overrides this default.
        self.am_setOption("shifterProxy", "DataManager")
        return S_OK()

    def execute(self):
        """ Run one agent cycle: log the current cache usage, then submit new stage requests.

        The submitted stage space and pinned space of each storage element is stored in
        self.storageElementUsage for the space selection done while submitting.

        :return: the failed database result, otherwise the result of submitStageRequests()
        """
        pins = self.storageDB.getSubmittedStagePins()
        if not pins['OK']:
            gLogger.fatal(
                "StageRequest.submitStageRequests: Failed to obtain submitted requests from StorageManagementDB.",
                pins['Message'])
            return pins

        self.storageElementUsage = pins['Value']
        if self.storageElementUsage:
            gLogger.info(
                "StageRequest.execute: Active stage/pin requests found at the following sites:"
            )
            # Sizes are reported in GB using 1024-based units
            bytesPerGB = 1024 * 1024 * 1024.0
            for seName in sortList(self.storageElementUsage.keys()):
                usage = self.storageElementUsage[seName]
                paddedName = seName.ljust(15)
                replicaCount = str(usage['Replicas']).rjust(6)
                sizeGB = usage['TotalSize'] / bytesPerGB
                gLogger.info(
                    "StageRequest.execute: %s: %s replicas with a size of %.3f GB."
                    % (paddedName, replicaCount, sizeGB))
        else:
            gLogger.info(
                "StageRequest.execute: No active stage/pin requests found.")
        return self.submitStageRequests()

    def submitStageRequests(self):
        """ Handle the Waiting->StageSubmitted transition of the Replicas.

        The Waiting replicas are processed storage element by storage element.

        :return: the lookup result when it failed or was empty, S_OK() otherwise
        """
        waiting = self.__getWaitingReplicas()
        if not waiting['OK']:
            gLogger.fatal(
                "StageRequest.submitStageRequests: Failed to get replicas from StorageManagementDB.",
                waiting['Message'])
            return waiting
        found = waiting['Value']
        if not found:
            gLogger.info(
                "StageRequest.submitStageRequests: There were no Waiting replicas found"
            )
            return waiting

        seReplicas = found['SEReplicas']
        allReplicaInfo = found['ReplicaIDs']
        gLogger.info(
            "StageRequest.submitStageRequests: Obtained %s replicas Waiting for staging."
            % len(allReplicaInfo))
        for storageElement in seReplicas:
            self.__issuePrestageRequests(storageElement,
                                         seReplicas[storageElement],
                                         allReplicaInfo)
        return S_OK()

    def __issuePrestageRequests(self, storageElement, seReplicaIDs,
                                allReplicaInfo):
        """ Select the replicas that fit in the cache of a storage element and request their staging.

        The cache size is read from /Resources/StorageElements/<SE>/CacheSize; a missing or
        zero value means no space restriction. The selected replicas go through
        __checkIntegrity, a prestage request is issued for the remaining ones, and the
        returned request IDs are stored before the replicas are set to 'StageSubmitted'.

        :param storageElement: name of the storage element
        :param seReplicaIDs: IDs of the Waiting replicas at this storage element
        :param allReplicaInfo: dictionary replicaID -> replica information ('PFN', 'Size', ...)
        :return: None, or the failed result of the integrity check / of the stage request insertion
        """
        bytesPerGB = 1024 * 1024 * 1024.0

        # First select which files can be eligible for prestaging based on the available space
        usedSpace = 0
        if storageElement in self.storageElementUsage:
            usedSpace = self.storageElementUsage[storageElement]['TotalSize']
        totalSpace = gConfig.getValue(
            "/Resources/StorageElements/%s/CacheSize" % storageElement, 0)
        if not totalSpace:
            gLogger.info(
                "StageRequest__issuePrestageRequests: No space restriction at %s"
                % (storageElement))
            selectedReplicaIDs = seReplicaIDs
        elif totalSpace > usedSpace:
            gLogger.debug(
                "StageRequest__issuePrestageRequests: total space = %s, used space = %s"
                % (totalSpace, usedSpace))
            gLogger.info(
                "StageRequest__issuePrestageRequests: %.4f GB available at %s"
                % ((totalSpace - usedSpace) / bytesPerGB, storageElement))
            selectedReplicaIDs = []
            # Every candidate is compared with the space that is still free, so that even the
            # first selected file cannot be larger than the available space
            for replicaID in seReplicaIDs:
                replicaSize = allReplicaInfo[replicaID]['Size']
                if (totalSpace - usedSpace) > replicaSize:
                    usedSpace += replicaSize
                    selectedReplicaIDs.append(replicaID)
        else:
            # The limit uses the same two-decimal format as the used space
            gLogger.info(
                "StageRequest__issuePrestageRequests: %.2f GB used at %s (limit %.2f GB)"
                % (usedSpace / bytesPerGB, storageElement,
                   totalSpace / bytesPerGB))
            return
        gLogger.info(
            "StageRequest__issuePrestageRequests: Selected %s files eligible for staging at %s."
            % (len(selectedReplicaIDs), storageElement))

        # Now check that the integrity of the eligible files
        pfnRepIDs = {}
        for replicaID in selectedReplicaIDs:
            pfnRepIDs[allReplicaInfo[replicaID]['PFN']] = replicaID
        res = self.__checkIntegrity(storageElement, pfnRepIDs, allReplicaInfo)
        if not res['OK']:
            return res
        pfnRepIDs = res['Value']

        # Now issue the prestage requests for the remaining replicas
        stageRequestMetadata = {}
        updatedPfnIDs = []
        if pfnRepIDs:
            gLogger.info(
                "StageRequest.__issuePrestageRequests: Submitting %s stage requests for %s."
                % (len(pfnRepIDs), storageElement))
            res = self.replicaManager.prestageStorageFile(
                list(pfnRepIDs), storageElement, lifetime=self.pinLifetime)
            gLogger.debug(
                "StageRequest.__issuePrestageRequests: replicaManager.prestageStorageFile: res=",
                res)
            # On success res['Value'] holds {'Successful': {pfn: requestID}, 'Failed': {pfn: error}}
            if not res['OK']:
                gLogger.error(
                    "StageRequest.__issuePrestageRequests: Completely failed to sumbmit stage requests for replicas.",
                    res['Message'])
            else:
                for pfn, requestID in res['Value']['Successful'].items():
                    stageRequestMetadata.setdefault(requestID, []).append(
                        pfnRepIDs[pfn])
                    updatedPfnIDs.append(pfnRepIDs[pfn])

        if stageRequestMetadata:
            gLogger.info(
                "StageRequest.__issuePrestageRequests: %s stage request metadata to be updated."
                % len(stageRequestMetadata))
            res = self.storageDB.insertStageRequest(stageRequestMetadata,
                                                    self.pinLifetime)
            if not res['OK']:
                gLogger.error(
                    "StageRequest.__issuePrestageRequests: Failed to insert stage request metadata.",
                    res['Message'])
                # Without a stored request ID the replicas must not become StageSubmitted,
                # they are left untouched so that the next cycle picks them up again
                return res
            res = self.storageDB.updateReplicaStatus(updatedPfnIDs,
                                                     'StageSubmitted')
            if not res['OK']:
                gLogger.error(
                    "StageRequest.__issuePrestageRequests: Failed to insert replica status.",
                    res['Message'])
        return

    def __getWaitingReplicas(self):
        """ Obtain the Waiting replicas from the Replicas table, grouped by requested storage element.

        :return: S_OK() without value when there is nothing to process, otherwise
                 S_OK({'SEReplicas': {storageElement: [replicaID, ...]},
                       'ReplicaIDs': {replicaID: {'LFN', 'PFN', 'Size', 'StorageElement'}}});
                 the failed database result on error
        """
        # First obtain the Waiting replicas from the Replicas table
        res = self.storageDB.getWaitingReplicas()
        if not res['OK']:
            gLogger.error(
                "StageRequest.__getWaitingReplicas: Failed to get replicas with Waiting status.",
                res['Message'])
            return res
        if not res['Value']:
            gLogger.debug(
                "StageRequest.__getWaitingReplicas: No Waiting replicas found to process."
            )
            return S_OK()
        gLogger.debug(
            "StageRequest.__getWaitingReplicas: Obtained %s Waiting replicas(s) to process."
            % len(res['Value']))

        seReplicas = {}
        replicaIDs = {}
        for replicaID, info in res['Value'].items():
            storageElement = info['SE']
            replicaIDs[replicaID] = {
                'LFN': info['LFN'],
                'PFN': info['PFN'],
                'Size': info['Size'],
                'StorageElement': storageElement
            }
            # setdefault replaces the has_key() test, which does not exist in Python 3
            seReplicas.setdefault(storageElement, []).append(replicaID)
        return S_OK({'SEReplicas': seReplicas, 'ReplicaIDs': replicaIDs})

    def __checkIntegrity(self, storageElement, pfnRepIDs, replicaIDs):
        """ Check with the storage element that the files are available for staging.

        Replicas whose size differs from the recorded one, or which are flagged Lost or
        Unavailable, or which do not exist, are removed from pfnRepIDs and reported through
        storageDB.updateReplicaFailure. Any other PFN for which no metadata could be
        obtained is removed as well, without being reported. pfnRepIDs is modified in place.

        :param storageElement: name of the storage element
        :param pfnRepIDs: dictionary PFN -> replicaID of the candidate replicas
        :param replicaIDs: dictionary replicaID -> replica information (with 'Size')
        :return: S_OK(pfnRepIDs) with the remaining candidates, or the failed metadata result
        """
        gLogger.info(
            "StageRequest.__checkIntegrity: Checking the integrity of %s replicas at %s."
            % (len(pfnRepIDs), storageElement))
        metadataRes = self.replicaManager.getStorageFileMetadata(
            pfnRepIDs.keys(), storageElement)
        if not metadataRes['OK']:
            gLogger.error(
                "StageRequest.__checkIntegrity: Completely failed to obtain metadata for replicas.",
                metadataRes['Message'])
            return metadataRes

        rejected = {}

        def discard(pfn, logMessage, reason):
            # Log the problem, drop the PFN from the candidates and remember why
            gLogger.error(logMessage, pfn)
            rejected[pfnRepIDs.pop(pfn)] = reason

        for pfn, metadata in metadataRes['Value']['Successful'].items():
            if metadata['Cached']:
                gLogger.info(
                    "StageRequest.__checkIntegrity: Cache hit for file.")
            if metadata['Size'] != replicaIDs[pfnRepIDs[pfn]]['Size']:
                discard(
                    pfn,
                    "StageRequest.__checkIntegrity: PFN StorageElement size does not match FileCatalog",
                    'PFN StorageElement size does not match FileCatalog')
            elif metadata['Lost']:
                discard(
                    pfn,
                    "StageRequest.__checkIntegrity: PFN has been Lost by the StorageElement",
                    'PFN has been Lost by the StorageElement')
            elif metadata['Unavailable']:
                discard(
                    pfn,
                    "StageRequest.__checkIntegrity: PFN is declared Unavailable by the StorageElement",
                    'PFN is declared Unavailable by the StorageElement')

        for pfn, reason in metadataRes['Value']['Failed'].items():
            # Only a missing file is terminal, other failures just drop the PFN from this round
            if re.search('File does not exist', reason):
                gLogger.error(
                    "StageRequest.__checkIntegrity: PFN does not exist in the StorageElement",
                    pfn)
                rejected[pfnRepIDs[
                    pfn]] = 'PFN does not exist in the StorageElement'
            del pfnRepIDs[pfn]

        # Update the states of the replicas in the database #TODO Sent status to integrity DB
        if rejected:
            gLogger.info(
                "StageRequest.__checkIntegrity: %s replicas are terminally failed."
                % len(rejected))
            updateRes = self.storageDB.updateReplicaFailure(rejected)
            if not updateRes['OK']:
                gLogger.error(
                    "StageRequest.__checkIntegrity: Failed to update replica failures.",
                    updateRes['Message'])
        return S_OK(pfnRepIDs)

    def __reportProblematicFiles(self, lfns, reason):
        """ Placeholder for reporting problematic files to the data integrity service.

        Reporting is switched off: the method returns immediately and does not contact any
        service. To enable it, forward lfns and reason to
        self.dataIntegrityClient.setFileProblematic() and return that result.

        :param lfns: LFNs to report (unused while reporting is disabled)
        :param reason: problem description (unused while reporting is disabled)
        :return: S_OK()
        """
        return S_OK()
Пример #28
0
class RequestPreparationAgent(AgentModule):
    def initialize(self):
        """ Create the catalog, data management and stager clients and select the proxy.

        :return: S_OK()
        """
        self.fileCatalog = FileCatalog()
        self.dm = DataManager()
        self.stagerClient = StorageManagerClient()
        self.dataIntegrityClient = DataIntegrityClient()

        # The agent runs with the shifter proxy defined under /Operations/Shifter/DataManager.
        # Setting the shifterProxy option in the Configuration overrides this default.
        self.am_setOption("shifterProxy", "DataManager")
        return S_OK()

    def execute(self):
        """ Run one agent cycle, which consists of preparing the New replicas.

        :return: the result of prepareNewReplicas()
        """
        return self.prepareNewReplicas()

    def prepareNewReplicas(self):
        """ First logical task: handle the New->Waiting transition of the Replicas.

        The New replicas are checked against the FileCatalog (existence, size, replicas,
        presence at the requested storage element). Replicas failing a check are reported
        through stagerClient.updateReplicaFailure, the others get their catalog information
        stored with stagerClient.updateReplicaInformation. Files for which a catalog lookup
        failed are neither failed nor updated.

        :return: S_OK(), or the failed result of the step that could not be executed
        """
        res = self.__getNewReplicas()
        if not res['OK']:
            gLogger.fatal(
                "RequestPreparation.prepareNewReplicas: Failed to get replicas from StagerDB.",
                res['Message'])
            return res
        if not res['Value']:
            gLogger.info("There were no New replicas found")
            return res
        replicas = res['Value']['Replicas']
        replicaIDs = res['Value']['ReplicaIDs']
        gLogger.info(
            "RequestPreparation.prepareNewReplicas: Obtained %s New replicas for preparation."
            % len(replicaIDs))

        terminalReplicaIDs = {}

        def markTerminal(terminalLFNs):
            # All requested replicas of a terminally failed LFN fail with the same reason
            for lfn, reason in terminalLFNs.items():
                for replicaID in replicas.pop(lfn).values():
                    terminalReplicaIDs[replicaID] = reason

        # Check that the files exist in the FileCatalog
        res = self.__getExistingFiles(list(replicas))
        if not res['OK']:
            return res
        exist = res['Value']['Exist']
        terminal = res['Value']['Missing']
        failed = res['Value']['Failed']
        if not exist:
            gLogger.error(
                'RequestPreparation.prepareNewReplicas: Failed determine existance of any files'
            )
            return S_OK()
        markTerminal(terminal)
        gLogger.info(
            "RequestPreparation.prepareNewReplicas: %s files exist in the FileCatalog."
            % len(exist))
        if terminal:
            gLogger.info(
                "RequestPreparation.prepareNewReplicas: %s files do not exist in the FileCatalog."
                % len(terminal))

        # Obtain the file sizes from the FileCatalog
        res = self.__getFileSize(exist)
        if not res['OK']:
            return res
        failed.update(res['Value']['Failed'])
        terminal = res['Value']['ZeroSize']
        fileSizes = res['Value']['FileSizes']
        if not fileSizes:
            gLogger.error(
                'RequestPreparation.prepareNewReplicas: Failed determine sizes of any files'
            )
            return S_OK()
        markTerminal(terminal)
        gLogger.info(
            "RequestPreparation.prepareNewReplicas: Obtained %s file sizes from the FileCatalog."
            % len(fileSizes))
        if terminal:
            gLogger.info(
                "RequestPreparation.prepareNewReplicas: %s files registered with zero size in the FileCatalog."
                % len(terminal))

        # Obtain the replicas from the FileCatalog
        res = self.__getFileReplicas(list(fileSizes))
        if not res['OK']:
            return res
        failed.update(res['Value']['Failed'])
        terminal = res['Value']['ZeroReplicas']
        fileReplicas = res['Value']['Replicas']
        if not fileReplicas:
            gLogger.error(
                'RequestPreparation.prepareNewReplicas: Failed determine replicas for any files'
            )
            return S_OK()
        markTerminal(terminal)
        gLogger.info(
            "RequestPreparation.prepareNewReplicas: Obtained replica information for %s file from the FileCatalog."
            % len(fileReplicas))
        if terminal:
            gLogger.info(
                "RequestPreparation.prepareNewReplicas: %s files registered with zero replicas in the FileCatalog."
                % len(terminal))

        # Check the replicas exist at the requested site
        replicaMetadata = []
        for lfn, requestedSEs in list(replicas.items()):
            lfnReplicas = fileReplicas.get(lfn)
            if not lfnReplicas:
                # A catalog lookup failed for this file, so nothing is known about it:
                # skip it instead of failing the whole cycle with a KeyError
                gLogger.warn(
                    "RequestPreparation.prepareNewReplicas: Skipping file without replica information.",
                    "%s %s" % (lfn, failed.get(lfn, '')))
                continue
            # Iterate over a copy because entries are removed on the way
            for requestedSE, replicaID in list(requestedSEs.items()):
                if requestedSE not in lfnReplicas:
                    terminalReplicaIDs[
                        replicaID] = "LFN not registered at requested SE"
                    requestedSEs.pop(requestedSE)
                else:
                    replicaMetadata.append(
                        (replicaID, lfnReplicas[requestedSE], fileSizes[lfn]))

        # Update the states of the files in the database
        if terminalReplicaIDs:
            gLogger.info(
                "RequestPreparation.prepareNewReplicas: %s replicas are terminally failed."
                % len(terminalReplicaIDs))
            res = self.stagerClient.updateReplicaFailure(terminalReplicaIDs)
            if not res['OK']:
                gLogger.error(
                    "RequestPreparation.prepareNewReplicas: Failed to update replica failures.",
                    res['Message'])
        if replicaMetadata:
            gLogger.info(
                "RequestPreparation.prepareNewReplicas: %s replica metadata to be updated."
                % len(replicaMetadata))
            # Sets the Status='Waiting' of CacheReplicas records that are OK with catalogue checks
            res = self.stagerClient.updateReplicaInformation(replicaMetadata)
            if not res['OK']:
                gLogger.error(
                    "RequestPreparation.prepareNewReplicas: Failed to update replica metadata.",
                    res['Message'])
        return S_OK()

    def __getNewReplicas(self):
        """ Obtain the New replicas from the CacheReplicas table and, for each LFN, the requested storage elements.

        :return: S_OK() without value when there is nothing to process, otherwise
                 S_OK({'Replicas': {lfn: {storageElement: replicaID}},
                       'ReplicaIDs': {replicaID: (lfn, storageElement)}});
                 the failed client result on error
        """
        # First obtain the New replicas from the CacheReplicas table
        res = self.stagerClient.getCacheReplicas({'Status': 'New'})
        if not res['OK']:
            gLogger.error(
                "RequestPreparation.__getNewReplicas: Failed to get replicas with New status.",
                res['Message'])
            return res
        if not res['Value']:
            gLogger.debug(
                "RequestPreparation.__getNewReplicas: No New replicas found to process."
            )
            return S_OK()
        gLogger.debug(
            "RequestPreparation.__getNewReplicas: Obtained %s New replicas(s) to process."
            % len(res['Value']))

        replicas = {}
        replicaIDs = {}
        for replicaID, info in res['Value'].items():
            lfn = info['LFN']
            storageElement = info['SE']
            # setdefault replaces the has_key() test, which does not exist in Python 3
            replicas.setdefault(lfn, {})[storageElement] = replicaID
            replicaIDs[replicaID] = (lfn, storageElement)
        return S_OK({'Replicas': replicas, 'ReplicaIDs': replicaIDs})

    def __getExistingFiles(self, lfns):
        """ Split the LFNs into those registered in the FileCatalog and those that are not.

        :param lfns: LFNs to look up
        :return: S_OK({'Exist': [lfn, ...], 'Missing': {lfn: reason}, 'Failed': failed lookups})
                 or the failed catalog result
        """
        lookup = self.fileCatalog.exists(lfns)
        if not lookup['OK']:
            gLogger.error(
                "RequestPreparation.__getExistingFiles: Failed to determine whether files exist.",
                lookup['Message'])
            return lookup

        present = []
        absent = {}
        for lfn, found in lookup['Value']['Successful'].items():
            if not found:
                absent[lfn] = 'LFN not registered in the FileCatalog'
                continue
            present.append(lfn)

        if absent:
            for lfn in absent:
                gLogger.warn(
                    "RequestPreparation.__getExistingFiles: %s" % absent[lfn],
                    lfn)
            self.__reportProblematicFiles(absent.keys(),
                                          'LFN-LFC-DoesntExist')
        outcome = {
            'Exist': present,
            'Missing': absent,
            'Failed': lookup['Value']['Failed']
        }
        return S_OK(outcome)

    def __getFileSize(self, lfns):
        """ Fetch the file sizes from the FileCatalog and single out the zero-size files.

        :param lfns: LFNs to look up
        :return: S_OK({'FileSizes': {lfn: size}, 'ZeroSize': {lfn: reason}, 'Failed': failed lookups})
                 or the failed catalog result
        """
        lookup = self.fileCatalog.getFileSize(lfns)
        if not lookup['OK']:
            gLogger.error(
                "RequestPreparation.__getFileSize: Failed to get sizes for files.",
                lookup['Message'])
            return lookup

        sized = {}
        empty = {}
        for lfn, size in lookup['Value']['Successful'].items():
            if size == 0:
                empty[lfn] = "LFN registered with zero size in the FileCatalog"
                continue
            sized[lfn] = size

        if empty:
            for lfn in empty:
                gLogger.warn(
                    "RequestPreparation.__getFileSize: %s" % empty[lfn], lfn)
            self.__reportProblematicFiles(empty.keys(), 'LFN-LFC-ZeroSize')
        outcome = {
            'FileSizes': sized,
            'ZeroSize': empty,
            'Failed': lookup['Value']['Failed']
        }
        return S_OK(outcome)

    def __getFileReplicas(self, lfns):
        """ Fetch the active replicas of the files and single out the files without any replica.

        :param lfns: LFNs to look up
        :return: S_OK({'Replicas': {lfn: replicas}, 'ZeroReplicas': {lfn: reason}, 'Failed': failed lookups})
                 or the failed result of the replica lookup
        """
        lookup = self.dm.getActiveReplicas(lfns)
        if not lookup['OK']:
            gLogger.error(
                "RequestPreparation.__getFileReplicas: Failed to obtain file replicas.",
                lookup['Message'])
            return lookup

        withReplicas = {}
        withoutReplicas = {}
        for lfn, lfnReplicas in lookup['Value']['Successful'].items():
            if lfnReplicas.keys():
                withReplicas[lfn] = lfnReplicas
            else:
                withoutReplicas[
                    lfn] = "LFN registered with zero replicas in the FileCatalog"

        if withoutReplicas:
            for lfn in withoutReplicas:
                gLogger.warn(
                    "RequestPreparation.__getFileReplicas: %s" %
                    withoutReplicas[lfn], lfn)
            self.__reportProblematicFiles(withoutReplicas.keys(),
                                          'LFN-LFC-NoReplicas')
        outcome = {
            'Replicas': withReplicas,
            'ZeroReplicas': withoutReplicas,
            'Failed': lookup['Value']['Failed']
        }
        return S_OK(outcome)

    def __reportProblematicFiles(self, lfns, reason):
        """ Placeholder for reporting problematic files to the data integrity service.

        Reporting is switched off: the method returns immediately and does not contact any
        service. To enable it, forward the arguments with
        self.dataIntegrityClient.setFileProblematic(lfns, reason, sourceComponent='RequestPreparationAgent')
        and return that result.

        :param lfns: LFNs to report (unused while reporting is disabled)
        :param reason: problem tag such as 'LFN-LFC-ZeroSize' (unused while reporting is disabled)
        :return: S_OK()
        """
        return S_OK()
Пример #29
0
class StageRequestAgent( AgentModule ):

  def initialize( self ):
    """ Create the helper objects used by the agent and set its default options.

    Instantiates the ReplicaManager, DataIntegrityClient and StorageManagementDB,
    reads the 'PinLifetime' option (default 60 * 60 * 24, i.e. one day) and sets
    the 'shifterProxy' option to 'DataManager'.

    :return: S_OK()
    """
    oneDay = 24 * 60 * 60

    self.replicaManager = ReplicaManager()
    # NOTE: no StorageManagerClient is created here (that line was commented out);
    # the agent works on the StorageManagementDB object instead.
    self.dataIntegrityClient = DataIntegrityClient()
    self.storageDB = StorageManagementDB()
    self.pinLifetime = self.am_getOption( 'PinLifetime', oneDay )

    # The default proxy is the one defined under /Operations/Shifter/DataManager;
    # the shifterProxy option in the Configuration can be used to change this default.
    self.am_setOption( 'shifterProxy', 'DataManager' )

    return S_OK()

  def execute( self ):
    """ Log the current stage/pin usage per storage element, then submit stage requests.

    The usage comes from storageDB.getSubmittedStagePins() and is stored in
    self.storageElementUsage; for each storage element its 'Replicas' and
    'TotalSize' entries are logged.

    :return: the failed result of getSubmittedStagePins, otherwise whatever
             submitStageRequests returns
    """
    usage = self.storageDB.getSubmittedStagePins()
    if not usage['OK']:
      gLogger.fatal( "StageRequest.submitStageRequests: Failed to obtain submitted requests from StorageManagementDB.", usage['Message'] )
      return usage

    self.storageElementUsage = usage['Value']
    if not self.storageElementUsage:
      gLogger.info( "StageRequest.execute: No active stage/pin requests found." )
    else:
      gLogger.info( "StageRequest.execute: Active stage/pin requests found at the following sites:" )
      # Sizes are reported in GB using a 1024-based conversion
      bytesPerGB = 1024 * 1024 * 1024.0
      for seName in sortList( self.storageElementUsage.keys() ):
        seUsage = self.storageElementUsage[seName]
        paddedName = seName.ljust( 15 )
        paddedCount = str( seUsage['Replicas'] ).rjust( 6 )
        sizeInGB = seUsage['TotalSize'] / bytesPerGB
        gLogger.info( "StageRequest.execute: %s: %s replicas with a size of %.3f GB." % ( paddedName, paddedCount, sizeInGB ) )

    return self.submitStageRequests()

  def submitStageRequests( self ):
    """ Manage the Waiting->StageSubmitted transition of the Replicas.

    Fetches the Waiting replicas with __getWaitingReplicas and calls
    __issuePrestageRequests once per storage element. The return value of
    __issuePrestageRequests is not inspected.

    :return: the result of __getWaitingReplicas when it failed or carried no
             value, otherwise S_OK()
    """
    waiting = self.__getWaitingReplicas()
    if not waiting['OK']:
      gLogger.fatal( "StageRequest.submitStageRequests: Failed to get replicas from StorageManagementDB.", waiting['Message'] )
      return waiting

    replicaData = waiting['Value']
    if not replicaData:
      gLogger.info( "StageRequest.submitStageRequests: There were no Waiting replicas found" )
      return waiting

    replicasBySE = replicaData['SEReplicas']
    replicaInfo = replicaData['ReplicaIDs']
    gLogger.info( "StageRequest.submitStageRequests: Obtained %s replicas Waiting for staging." % len( replicaInfo ) )

    for seName in replicasBySE:
      self.__issuePrestageRequests( seName, replicasBySE[seName], replicaInfo )
    return S_OK()

  def __issuePrestageRequests( self, storageElement, seReplicaIDs, allReplicaInfo ):
    """ Issue prestage requests for the Waiting replicas of one storage element.

    Steps:
      1. select the replicas that fit into the space left under the CacheSize
         configured for the storage element (no restriction if CacheSize is unset or 0);
      2. drop the replicas rejected by __checkIntegrity;
      3. submit the remainder with replicaManager.prestageStorageFile;
      4. record the accepted requests with storageDB.insertStageRequest and move
         their replicas to 'StageSubmitted'.

    :param storageElement: name of the storage element
    :param seReplicaIDs: IDs of the replicas Waiting at this storage element
    :param allReplicaInfo: dict replicaID -> info dict; the 'Size' and 'PFN' entries are used here
    :return: None, except that the failed result dictionary is returned when the
             integrity check or the insertion of the stage request metadata fails
    """
    bytesPerGB = 1024 * 1024 * 1024.0
    # First select which files can be eligible for prestaging based on the available space
    usedSpace = 0
    if storageElement in self.storageElementUsage:
      usedSpace = self.storageElementUsage[storageElement]['TotalSize']
    totalSpace = gConfig.getValue( "/Resources/StorageElements/%s/CacheSize" % storageElement, 0 )
    if not totalSpace:
      gLogger.info( "StageRequest__issuePrestageRequests: No space restriction at %s" % ( storageElement ) )
      selectedReplicaIDs = seReplicaIDs
    elif totalSpace > usedSpace:
      gLogger.debug( "StageRequest__issuePrestageRequests: total space = %s, used space = %s" % ( totalSpace, usedSpace ) )
      gLogger.info( "StageRequest__issuePrestageRequests: %.4f GB available at %s" % ( ( totalSpace - usedSpace ) / bytesPerGB, storageElement ) )
      selectedReplicaIDs = []
      # Every candidate, including the first one, must fit into the space still available
      for replicaID in seReplicaIDs:
        replicaSize = allReplicaInfo[replicaID]['Size']
        if ( totalSpace - usedSpace ) > replicaSize:
          usedSpace += replicaSize
          selectedReplicaIDs.append( replicaID )
    else:
      # The cache is already full: nothing can be staged at this storage element now
      gLogger.info( "StageRequest__issuePrestageRequests: %.2f GB used at %s (limit %.2f GB)" % ( usedSpace / bytesPerGB, storageElement, totalSpace / bytesPerGB ) )
      return
    gLogger.info( "StageRequest__issuePrestageRequests: Selected %s files eligible for staging at %s." % ( len( selectedReplicaIDs ), storageElement ) )
    # Now check that the integrity of the eligible files
    pfnRepIDs = {}
    for replicaID in selectedReplicaIDs:
      pfnRepIDs[allReplicaInfo[replicaID]['PFN']] = replicaID
    res = self.__checkIntegrity( storageElement, pfnRepIDs, allReplicaInfo )
    if not res['OK']:
      return res
    pfnRepIDs = res['Value']
    # Now issue the prestage requests for the remaining replicas
    stageRequestMetadata = {}
    updatedPfnIDs = []
    if pfnRepIDs:
      gLogger.info( "StageRequest.__issuePrestageRequests: Submitting %s stage requests for %s." % ( len( pfnRepIDs ), storageElement ) )
      res = self.replicaManager.prestageStorageFile( pfnRepIDs.keys(), storageElement, lifetime = self.pinLifetime )
      gLogger.debug( "StageRequest.__issuePrestageRequests: replicaManager.prestageStorageFile: res=", res )
      # Example results:
      #   {'OK': True, 'Value': {'Successful': {}, 'Failed': {<pfn>: <error message>}}}
      #   {'OK': True, 'Value': {'Successful': {<pfn>: '-2083846379'}, 'Failed': {}}}
      if not res['OK']:
        gLogger.error( "StageRequest.__issuePrestageRequests: Completely failed to sumbmit stage requests for replicas.", res['Message'] )
      else:
        # Group the replica IDs by the request ID returned for their PFN
        for pfn, requestID in res['Value']['Successful'].items():
          stageRequestMetadata.setdefault( requestID, [] ).append( pfnRepIDs[pfn] )
          updatedPfnIDs.append( pfnRepIDs[pfn] )
    if stageRequestMetadata:
      gLogger.info( "StageRequest.__issuePrestageRequests: %s stage request metadata to be updated." % len( stageRequestMetadata ) )
      res = self.storageDB.insertStageRequest( stageRequestMetadata, self.pinLifetime )
      if not res['OK']:
        gLogger.error( "StageRequest.__issuePrestageRequests: Failed to insert stage request metadata.", res['Message'] )
        # Do not mark the replicas StageSubmitted when no stage request was recorded for them
        return res
      res = self.storageDB.updateReplicaStatus( updatedPfnIDs, 'StageSubmitted' )
      if not res['OK']:
        gLogger.error( "StageRequest.__issuePrestageRequests: Failed to insert replica status.", res['Message'] )
    return

  def __getWaitingReplicas( self ):
    """ Obtain the Waiting replicas from the StorageManagementDB, grouped by storage element.

    :return: the failed result of storageDB.getWaitingReplicas as-is;
             S_OK() without a value when there is nothing Waiting;
             otherwise S_OK with a dictionary holding
               'SEReplicas': storage element -> list of replica IDs
               'ReplicaIDs': replica ID -> {'LFN', 'PFN', 'Size', 'StorageElement'}
    """
    # First obtain the Waiting replicas from the Replicas table
    res = self.storageDB.getWaitingReplicas()
    if not res['OK']:
      gLogger.error( "StageRequest.__getWaitingReplicas: Failed to get replicas with Waiting status.", res['Message'] )
      return res
    if not res['Value']:
      gLogger.debug( "StageRequest.__getWaitingReplicas: No Waiting replicas found to process." )
      return S_OK()
    gLogger.debug( "StageRequest.__getWaitingReplicas: Obtained %s Waiting replicas(s) to process." % len( res['Value'] ) )
    seReplicas = {}
    replicaIDs = {}
    for replicaID, info in res['Value'].items():
      lfn = info['LFN']
      storageElement = info['SE']
      size = info['Size']
      pfn = info['PFN']
      replicaIDs[replicaID] = {'LFN':lfn, 'PFN':pfn, 'Size':size, 'StorageElement':storageElement}
      # setdefault replaces the dict.has_key() test (deprecated in Python 2, removed in Python 3)
      seReplicas.setdefault( storageElement, [] ).append( replicaID )
    return S_OK( {'SEReplicas':seReplicas, 'ReplicaIDs':replicaIDs} )

  def __checkIntegrity( self, storageElement, pfnRepIDs, replicaIDs ):
    """ Check that the given replicas are usable at the storage element.

    Metadata is requested with replicaManager.getStorageFileMetadata. A replica is
    recorded as terminally failed when its reported size differs from the expected
    one, when it is flagged Lost or Unavailable, or when the metadata lookup failed
    with 'File does not exist'. Terminal failures are passed to
    storageDB.updateReplicaFailure. Every PFN with a terminal failure or a failed
    lookup is removed from pfnRepIDs; the dictionary is modified in place.

    :param storageElement: name of the storage element
    :param pfnRepIDs: dict PFN -> replica ID
    :param replicaIDs: dict replica ID -> info dict; the 'Size' entry is compared here
    :return: the failed result of getStorageFileMetadata, otherwise S_OK( pfnRepIDs )
             holding the PFNs that survived the checks
    """
    logPrefix = "StageRequest.__checkIntegrity: "
    terminallyFailed = {}
    gLogger.info( "StageRequest.__checkIntegrity: Checking the integrity of %s replicas at %s." % ( len( pfnRepIDs ), storageElement ) )
    metadataRes = self.replicaManager.getStorageFileMetadata( pfnRepIDs.keys(), storageElement )
    if not metadataRes['OK']:
      gLogger.error( "StageRequest.__checkIntegrity: Completely failed to obtain metadata for replicas.", metadataRes['Message'] )
      return metadataRes

    for pfn, metadata in metadataRes['Value']['Successful'].items():
      if metadata['Cached']:
        gLogger.info( "StageRequest.__checkIntegrity: Cache hit for file." )
      # Only the first matching problem is recorded, in this order of precedence
      if metadata['Size'] != replicaIDs[pfnRepIDs[pfn]]['Size']:
        problem = 'PFN StorageElement size does not match FileCatalog'
      elif metadata['Lost']:
        problem = 'PFN has been Lost by the StorageElement'
      elif metadata['Unavailable']:
        problem = 'PFN is declared Unavailable by the StorageElement'
      else:
        continue
      gLogger.error( logPrefix + problem, pfn )
      terminallyFailed[pfnRepIDs[pfn]] = problem
      pfnRepIDs.pop( pfn )

    for pfn, failure in metadataRes['Value']['Failed'].items():
      # Only a missing file is terminal; other lookup failures just drop the PFN from this pass
      if re.search( 'File does not exist', failure ):
        problem = 'PFN does not exist in the StorageElement'
        gLogger.error( logPrefix + problem, pfn )
        terminallyFailed[pfnRepIDs[pfn]] = problem
      pfnRepIDs.pop( pfn )

    # Update the states of the replicas in the database #TODO Sent status to integrity DB
    if terminallyFailed:
      gLogger.info( "StageRequest.__checkIntegrity: %s replicas are terminally failed." % len( terminallyFailed ) )
      updateRes = self.storageDB.updateReplicaFailure( terminallyFailed )
      if not updateRes['OK']:
        gLogger.error( "StageRequest.__checkIntegrity: Failed to update replica failures.", updateRes['Message'] )
    return S_OK( pfnRepIDs )

  def __reportProblematicFiles( self, lfns, reason ):
    """ Report LFNs as problematic through the data integrity client.

    Reporting is currently switched off: the first statement returns S_OK()
    unconditionally, so nothing after it is executed. The code that would call
    dataIntegrityClient.setFileProblematic is kept below it.

    NOTE(review): the log messages in the unreachable part carry the prefix
    'RequestPreparation' although this method is part of StageRequestAgent.

    :param lfns: LFNs to report
    :param reason: reason string to attach to the report
    :return: S_OK() (always, while the early return is in place)
    """
    return S_OK()
    # Unreachable while the return above is in place.
    report = self.dataIntegrityClient.setFileProblematic( lfns, reason, self.name )
    if not report['OK']:
      gLogger.error( "RequestPreparation.__reportProblematicFiles: Failed to report missing files.", report['Message'] )
      return report
    outcome = report['Value']
    if outcome['Successful']:
      gLogger.info( "RequestPreparation.__reportProblematicFiles: Successfully reported %s missing files." % len( outcome['Successful'] ) )
    if outcome['Failed']:
      gLogger.info( "RequestPreparation.__reportProblematicFiles: Failed to report %s problematic files." % len( outcome['Failed'] ) )
    return report