Example #1
def main():
  Script.parseCommandLine(ignoreErrors=True)
  args = Script.getPositionalArgs()
  if len(args) < 2:
    Script.showHelp()

  from DIRAC import exit as DIRACExit, gLogger

  lfn = args[0]
  se = args[1]

  from DIRAC.StorageManagementSystem.Client.StorageManagerClient import StorageManagerClient
  client = StorageManagerClient()
  res = client.getCacheReplicas({'LFN': lfn, 'SE': se})
  if not res['OK']:
    gLogger.error(res['Message'])
    DIRACExit(2)
  cacheReplicaInfo = res['Value']
  if cacheReplicaInfo:
    replicaID = list(cacheReplicaInfo)[0]
    outStr = "\n--------------------"
    outStr += "\n%s: %s" % ('LFN'.ljust(8), cacheReplicaInfo[replicaID]['LFN'].ljust(100))
    outStr += "\n%s: %s" % ('SE'.ljust(8), cacheReplicaInfo[replicaID]['SE'].ljust(100))
    outStr += "\n%s: %s" % ('PFN'.ljust(8), cacheReplicaInfo[replicaID]['PFN'].ljust(100))
    outStr += "\n%s: %s" % ('Status'.ljust(8), cacheReplicaInfo[replicaID]['Status'].ljust(100))
    outStr += "\n%s: %s" % ('LastUpdate'.ljust(8), str(cacheReplicaInfo[replicaID]['LastUpdate']).ljust(100))
    outStr += "\n%s: %s" % ('Reason'.ljust(8), str(cacheReplicaInfo[replicaID]['Reason']).ljust(100))

    resTasks = client.getTasks({'ReplicaID': replicaID})

    if resTasks['OK']:
      # print resTasks['Message']
      outStr += '\nJob IDs requesting this file to be staged:'.ljust(8)
      tasks = resTasks['Value']
      for tid in tasks.keys():
        outStr += ' %s ' % (tasks[tid]['SourceTaskID'])

    resStageRequests = client.getStageRequests({'ReplicaID': replicaID})

    if not resStageRequests['OK']:
      gLogger.error(resStageRequests['Message'])
      DIRACExit(2)

    if resStageRequests['Records']:
      stageRequests = resStageRequests['Value']
      outStr += "\n------SRM staging request info--------------"
      for info in stageRequests.values():
        outStr += "\n%s: %s" % ('SRM RequestID'.ljust(8), info['RequestID'].ljust(100))
        outStr += "\n%s: %s" % ('SRM StageStatus'.ljust(8), info['StageStatus'].ljust(100))
        outStr += "\n%s: %s" % ('SRM StageRequestSubmitTime'.ljust(8), str(info['StageRequestSubmitTime']).ljust(100))
        outStr += "\n%s: %s" % ('SRM StageRequestCompletedTime'.ljust(8),
                                str(info['StageRequestCompletedTime']).ljust(100))
        outStr += "\n%s: %s" % ('SRM PinExpiryTime'.ljust(8), str(info['PinExpiryTime']).ljust(100))
        outStr += "\n%s: %s sec" % ('SRM PinLength'.ljust(8), str(info['PinLength']).ljust(100))
    else:
      outStr += '\nThere are no staging requests submitted to the site yet.'.ljust(8)
  else:
    outStr = "\nThere is no such file requested for staging. Check for typo's!"
    # Script.showHelp()
  gLogger.notice(outStr)

  DIRACExit(0)
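
Every StorageManagerClient call in these examples returns DIRAC's standard result dictionary: {'OK': True, 'Value': ...} on success and {'OK': False, 'Message': ...} on failure. The snippet below is only an illustrative sketch of handling that convention; the unwrap helper is not part of DIRAC:

def unwrap(result):
  """Return the 'Value' of a DIRAC result dictionary, or raise with its error message."""
  if not result['OK']:
    raise RuntimeError(result['Message'])
  return result['Value']

# e.g. replicas = unwrap(client.getCacheReplicas({'LFN': lfn, 'SE': se}))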
Example #2
  def __requestStaging( self, jobState, stageLFNs ):
    """ Actual request for staging LFNs through the StorageManagerClient
    """
    self.jobLog.verbose( "Stage request will be \n\t%s" % "\n\t".join( [ "%s:%s" % ( lfn, stageLFNs[ lfn ] ) for lfn in stageLFNs ] ) )

    stagerClient = StorageManagerClient()
    result = jobState.setStatus( self.ex_getOption( 'StagingStatus', 'Staging' ),
                                 self.ex_getOption( 'StagingMinorStatus', 'Request To Be Sent' ),
                                 appStatus = "",
                                 source = self.ex_optimizerName() )
    if not result[ 'OK' ]:
      return result

    result = stagerClient.setRequest( stageLFNs, 'WorkloadManagement',
                                      'updateJobFromStager@WorkloadManagement/JobStateUpdate',
                                      int( jobState.jid ) )
    if not result[ 'OK' ]:
      self.jobLog.error( "Could not send stage request: %s" % result[ 'Message' ] )
      return S_ERROR( "Problem sending staging request" )

    rid = str( result[ 'Value' ] )
    self.jobLog.info( "Stage request %s sent" % rid )
    jobState.setParameter( "StageRequest", rid )

    result = jobState.setStatus( self.ex_getOption( 'StagingStatus', 'Staging' ),
                                 self.ex_getOption( 'StagingMinorStatus', 'Request Sent' ),
                                 appStatus = "",
                                 source = self.ex_optimizerName() )
    if not result['OK']:
      return result

    return S_OK( stageLFNs )
Example #3
def main():
    Script.parseCommandLine(ignoreErrors=False)

    args = Script.getPositionalArgs()

    if len(args) < 1:
        Script.showHelp()

    from DIRAC import exit as DIRACExit, gLogger

    try:
        jobIDs = [int(arg) for arg in args]
    except Exception:
        gLogger.fatal('DIRAC Job IDs must be integers')
        DIRACExit(2)

    from DIRAC.StorageManagementSystem.Client.StorageManagerClient import StorageManagerClient
    client = StorageManagerClient()

    outStr = "\n"
    for jobID in jobIDs:
        res = client.getTaskSummary(jobID)
        if not res['OK']:
            gLogger.error(res['Message'])
            continue
        if not res['Value']:
            gLogger.notice(
                'No info for job %s, probably gone from the stager...' % jobID)
            continue
        taskInfo = res['Value']['TaskInfo']
        replicaInfo = res['Value']['ReplicaInfo']
        outStr = "%s: %s" % ('JobID'.ljust(20), jobID)
        outStr += "\n%s: %s" % ('Status'.ljust(20),
                                taskInfo[str(jobID)]['Status'])
        outStr += "\n%s: %s" % ('SubmitTime'.ljust(20),
                                taskInfo[str(jobID)]['SubmitTime'])
        outStr += "\n%s: %s" % ('CompleteTime'.ljust(20),
                                taskInfo[str(jobID)]['CompleteTime'])
        outStr += "\nStaging files for this job:"
        if not res['Value']['ReplicaInfo']:
            gLogger.notice('No info on files for the job = %s, that is odd' %
                           jobID)
            continue
        else:
            for lfn, metadata in replicaInfo.items():
                outStr += "\n\t--------------------"
                outStr += "\n\t%s: %s" % ('LFN'.ljust(8), lfn.ljust(100))
                outStr += "\n\t%s: %s" % (
                    'SE'.ljust(8), metadata['StorageElement'].ljust(100))
                outStr += "\n\t%s: %s" % ('PFN'.ljust(8), str(
                    metadata['PFN']).ljust(100))
                outStr += "\n\t%s: %s" % ('Status'.ljust(8),
                                          metadata['Status'].ljust(100))
                outStr += "\n\t%s: %s" % ('Reason'.ljust(8),
                                          str(metadata['Reason']).ljust(100))
                outStr += "\n%s: %s" % ('LastUpdate'.ljust(8),
                                        str(metadata['LastUpdate']).ljust(100))
            outStr += "\n----------------------"
        gLogger.notice(outStr)
    DIRACExit(0)
Example #4
  def __kill_delete_jobs( self, jobIDList, right ):
    """  Kill or delete jobs as necessary
    """

    jobList = self.__get_job_list( jobIDList )
    if not jobList:
      return S_ERROR( 'Invalid job specification: ' + str( jobIDList ) )

    validJobList, invalidJobList, nonauthJobList, ownerJobList = self.jobPolicy.evaluateJobRights( jobList, right )

    # Get job status to see what is to be killed or deleted
    result = gJobDB.getAttributesForJobList( validJobList, ['Status'] )
    if not result['OK']:
      return result
    killJobList = []
    deleteJobList = []
    stagingJobList = []
    for jobID, sDict in result['Value'].items():
      if sDict['Status'] in ['Running','Matched','Stalled']:
        killJobList.append( jobID )
      elif sDict['Status'] in ['Done','Failed']:
        if not right == RIGHT_KILL:
          deleteJobList.append( jobID )
      else:
        deleteJobList.append( jobID )
      if sDict['Status'] in ['Staging']:
        stagingJobList.append( jobID )
    
    bad_ids = []
    for jobID in killJobList:
      result = self.__killJob( jobID )
      if not result['OK']:
        bad_ids.append( jobID )

    for jobID in deleteJobList:
      result = self.__deleteJob( jobID )
      if not result['OK']:
        bad_ids.append( jobID )

    if stagingJobList:
      stagerClient = StorageManagerClient()
      gLogger.info('Going to send killing signal to stager as well!')
      result = stagerClient.killTasksBySourceTaskID(stagingJobList)
      if not result['OK']:
        gLogger.warn( 'Failed to kill some Stager tasks: %s' % result['Message'] )
             
    if invalidJobList or nonauthJobList or bad_ids:
      result = S_ERROR( 'Some jobs failed deletion' )
      if invalidJobList:
        result['InvalidJobIDs'] = invalidJobList
      if nonauthJobList:
        result['NonauthorizedJobIDs'] = nonauthJobList
      if bad_ids:
        result['FailedJobIDs'] = bad_ids
      return result

    result = S_OK( validJobList )
    result[ 'requireProxyUpload' ] = len( ownerJobList ) > 0 and self.__checkIfProxyUploadIsRequired()
    return result
Example #5
    def initialize(self):
        self.stagerClient = StorageManagerClient()
        # This sets the Default Proxy to be used as that defined under
        # /Operations/Shifter/DataManager
        # the shifterProxy option in the Configuration can be used to change this default.
        self.am_setOption('shifterProxy', 'DataManager')

        return S_OK()
Example #6
    def initialize(self):
        self.stagerClient = StorageManagerClient()
        # This sets the Default Proxy to be used as that defined under
        # /Operations/Shifter/DataManager
        # the shifterProxy option in the Configuration can be used to change this default.
        self.am_setOption("shifterProxy", "DataManager")
        self.storagePlugins = self.am_getOption("StoragePlugins", [])
        self.dataOpSender = DataOperationSender()

        return S_OK()
Example #7
def main():
    # Registering arguments will automatically add their description to the help menu
    Script.registerArgument(
        "Request:  ID of the Stage request in the StorageManager")
    Script.parseCommandLine(ignoreErrors=False)

    args = Script.getPositionalArgs()

    if not len(args) == 1:
        Script.showHelp()

    from DIRAC import exit as DIRACExit, gLogger

    try:
        taskID = int(args[0])
    except Exception:
        gLogger.fatal("Stage requestID must be an integer")
        DIRACExit(2)

    from DIRAC.StorageManagementSystem.Client.StorageManagerClient import StorageManagerClient

    client = StorageManagerClient()

    res = client.getTaskSummary(taskID)
    if not res["OK"]:
        gLogger.error(res["Message"])
        DIRACExit(2)
    taskInfo = res["Value"]["TaskInfo"]
    replicaInfo = res["Value"]["ReplicaInfo"]
    outStr = "%s: %s" % ("TaskID".ljust(20), taskID)
    outStr += "\n%s: %s" % ("Status".ljust(20), taskInfo[taskID]["Status"])
    outStr += "\n%s: %s" % ("Source".ljust(20), taskInfo[taskID]["Source"])
    outStr += "\n%s: %s" % ("SourceTaskID".ljust(20),
                            taskInfo[taskID]["SourceTaskID"])
    outStr += "\n%s: %s" % ("CallBackMethod".ljust(20),
                            taskInfo[taskID]["CallBackMethod"])
    outStr += "\n%s: %s" % ("SubmitTime".ljust(20),
                            taskInfo[taskID]["SubmitTime"])
    outStr += "\n%s: %s" % ("CompleteTime".ljust(20),
                            taskInfo[taskID]["CompleteTime"])
    for lfn, metadata in replicaInfo.items():
        outStr += "\n"
        outStr += "\n\t%s: %s" % ("LFN".ljust(8), lfn.ljust(100))
        outStr += "\n\t%s: %s" % ("SE".ljust(8),
                                  metadata["StorageElement"].ljust(100))
        outStr += "\n\t%s: %s" % ("PFN".ljust(8), str(
            metadata["PFN"]).ljust(100))
        outStr += "\n\t%s: %s" % ("Size".ljust(8), str(
            metadata["FileSize"]).ljust(100))
        outStr += "\n\t%s: %s" % ("Status".ljust(8),
                                  metadata["Status"].ljust(100))
        outStr += "\n\t%s: %s" % ("Reason".ljust(8), str(
            metadata["Reason"]).ljust(100))
    gLogger.notice(outStr)
Example #8
    def initialize(self):
        self.fileCatalog = FileCatalog()
        self.dm = DataManager()
        self.stagerClient = StorageManagerClient()
        self.dataIntegrityClient = DataIntegrityClient()
        # This sets the Default Proxy to be used as that defined under
        # /Operations/Shifter/DataManager
        # the shifterProxy option in the Configuration can be used to change this default.
        self.am_setOption("shifterProxy", "DataManager")

        return S_OK()
Example #9
def main():
    # Registering arguments will automatically add their description to the help menu
    Script.registerArgument(["JobID:    DIRAC Job ID"])
    Script.parseCommandLine(ignoreErrors=False)

    args = Script.getPositionalArgs()

    if len(args) < 1:
        Script.showHelp()

    from DIRAC import exit as DIRACExit, gLogger

    try:
        jobIDs = [int(arg) for arg in args]
    except Exception:
        gLogger.fatal("DIRAC Job IDs must be integers")
        DIRACExit(2)

    from DIRAC.StorageManagementSystem.Client.StorageManagerClient import StorageManagerClient

    client = StorageManagerClient()

    outStr = "\n"
    for jobID in jobIDs:
        res = client.getTaskSummary(jobID)
        if not res["OK"]:
            gLogger.error(res["Message"])
            continue
        if not res["Value"]:
            gLogger.notice("No info for job %s, probably gone from the stager..." % jobID)
            continue
        taskInfo = res["Value"]["TaskInfo"]
        replicaInfo = res["Value"]["ReplicaInfo"]
        outStr = "%s: %s" % ("JobID".ljust(20), jobID)
        outStr += "\n%s: %s" % ("Status".ljust(20), taskInfo[str(jobID)]["Status"])
        outStr += "\n%s: %s" % ("SubmitTime".ljust(20), taskInfo[str(jobID)]["SubmitTime"])
        outStr += "\n%s: %s" % ("CompleteTime".ljust(20), taskInfo[str(jobID)]["CompleteTime"])
        outStr += "\nStaging files for this job:"
        if not res["Value"]["ReplicaInfo"]:
            gLogger.notice("No info on files for the job = %s, that is odd" % jobID)
            continue
        else:
            for lfn, metadata in replicaInfo.items():
                outStr += "\n\t--------------------"
                outStr += "\n\t%s: %s" % ("LFN".ljust(8), lfn.ljust(100))
                outStr += "\n\t%s: %s" % ("SE".ljust(8), metadata["StorageElement"].ljust(100))
                outStr += "\n\t%s: %s" % ("PFN".ljust(8), str(metadata["PFN"]).ljust(100))
                outStr += "\n\t%s: %s" % ("Status".ljust(8), metadata["Status"].ljust(100))
                outStr += "\n\t%s: %s" % ("Reason".ljust(8), str(metadata["Reason"]).ljust(100))
                outStr += "\n%s: %s" % ("LastUpdate".ljust(8), str(metadata["LastUpdate"]).ljust(100))
            outStr += "\n----------------------"
        gLogger.notice(outStr)
    DIRACExit(0)
Example #10
    def initialize(self):
        self.stagerClient = StorageManagerClient()
        # self.storageDB = StorageManagementDB()
        # pin lifetime = 1 day
        self.pinLifetime = self.am_getOption("PinLifetime", THROTTLING_TIME)

        # This sets the Default Proxy to be used as that defined under
        # /Operations/Shifter/DataManager
        # the shifterProxy option in the Configuration can be used to change this default.
        self.am_setOption("shifterProxy", "DataManager")

        return S_OK()
Example #11
def main():
    Script.parseCommandLine(ignoreErrors=False)

    args = Script.getPositionalArgs()

    if not len(args) == 1:
        Script.showHelp()

    from DIRAC import exit as DIRACExit, gLogger

    try:
        taskID = int(args[0])
    except Exception:
        gLogger.fatal('Stage requestID must be an integer')
        DIRACExit(2)

    from DIRAC.StorageManagementSystem.Client.StorageManagerClient import StorageManagerClient
    client = StorageManagerClient()

    res = client.getTaskSummary(taskID)
    if not res['OK']:
        gLogger.error(res['Message'])
        DIRACExit(2)
    taskInfo = res['Value']['TaskInfo']
    replicaInfo = res['Value']['ReplicaInfo']
    outStr = "%s: %s" % ('TaskID'.ljust(20), taskID)
    outStr += "\n%s: %s" % ('Status'.ljust(20), taskInfo[taskID]['Status'])
    outStr += "\n%s: %s" % ('Source'.ljust(20), taskInfo[taskID]['Source'])
    outStr += "\n%s: %s" % ('SourceTaskID'.ljust(20),
                            taskInfo[taskID]['SourceTaskID'])
    outStr += "\n%s: %s" % ('CallBackMethod'.ljust(20),
                            taskInfo[taskID]['CallBackMethod'])
    outStr += "\n%s: %s" % ('SubmitTime'.ljust(20),
                            taskInfo[taskID]['SubmitTime'])
    outStr += "\n%s: %s" % ('CompleteTime'.ljust(20),
                            taskInfo[taskID]['CompleteTime'])
    for lfn, metadata in replicaInfo.items():
        outStr += "\n"
        outStr += "\n\t%s: %s" % ('LFN'.ljust(8), lfn.ljust(100))
        outStr += "\n\t%s: %s" % ('SE'.ljust(8),
                                  metadata['StorageElement'].ljust(100))
        outStr += "\n\t%s: %s" % ('PFN'.ljust(8), str(
            metadata['PFN']).ljust(100))
        outStr += "\n\t%s: %s" % ('Size'.ljust(8), str(
            metadata['FileSize']).ljust(100))
        outStr += "\n\t%s: %s" % ('Status'.ljust(8),
                                  metadata['Status'].ljust(100))
        outStr += "\n\t%s: %s" % ('Reason'.ljust(8), str(
            metadata['Reason']).ljust(100))
    gLogger.notice(outStr)
Example #12
    def initialize(self):
        self.replicaManager = ReplicaManager()
        self.stagerClient = StorageManagerClient()
        self.dataIntegrityClient = DataIntegrityClient()
        #self.storageDB = StorageManagementDB()
        # pin lifetime = 1 day
        self.pinLifetime = self.am_getOption('PinLifetime', THROTTLING_TIME)

        # This sets the Default Proxy to be used as that defined under
        # /Operations/Shifter/DataManager
        # the shifterProxy option in the Configuration can be used to change this default.
        self.am_setOption('shifterProxy', 'DataManager')

        return S_OK()
Example #13
def main():
    Script.parseCommandLine(ignoreErrors=True)

    args = Script.getPositionalArgs()

    if len(args) < 2:
        Script.showHelp()

    seName = args[1]
    fileName = args[0]

    import os
    from DIRAC import exit as DIRACExit, gLogger
    from DIRAC.Interfaces.API.Dirac import Dirac
    from DIRAC.StorageManagementSystem.Client.StorageManagerClient import StorageManagerClient

    stageLfns = {}

    if os.path.exists(fileName):
        try:
            lfnFile = open(fileName)
            lfns = [k.strip() for k in lfnFile.readlines()]
            lfnFile.close()
        except Exception:
            gLogger.exception('Can not open file', fileName)
            DIRACExit(-1)
    else:
        lfns = args[:len(args) - 1]

    stageLfns[seName] = lfns
    stagerClient = StorageManagerClient()

    res = stagerClient.setRequest(
        stageLfns, 'WorkloadManagement',
        'updateJobFromStager@WorkloadManagement/JobStateUpdate',
        0)  # fake JobID = 0
    if not res['OK']:
        gLogger.error(res['Message'])
        DIRACExit(-1)
    else:
        gLogger.notice("Stage request submitted for LFNs:\n %s" % lfns)
        gLogger.notice("SE= %s" % seName)
        gLogger.notice(
            "You can check their status and progress with dirac-stager-monitor-file <LFN> <SE>"
        )

    DIRACExit()
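
The same request that this script submits can also be issued from an interactive Python session. This is a hedged sketch assuming a working DIRAC installation and a valid proxy; the SE name and LFN are placeholders, and the fake JobID 0 mirrors the call above:

from DIRAC.Core.Base import Script
Script.parseCommandLine()

from DIRAC.StorageManagementSystem.Client.StorageManagerClient import StorageManagerClient

stageLfns = {'CERN-RAW': ['/lhcb/data/example/file.raw']}  # SE -> list of LFNs (placeholder values)
res = StorageManagerClient().setRequest(
    stageLfns, 'WorkloadManagement',
    'updateJobFromStager@WorkloadManagement/JobStateUpdate',
    0)  # fake JobID = 0, as in the script above
print(res)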
Example #14
def main():
    Script.parseCommandLine(ignoreErrors=False)
    from DIRAC.StorageManagementSystem.Client.StorageManagerClient import StorageManagerClient

    client = StorageManagerClient()

    res = client.getCacheReplicasSummary()
    if not res["OK"]:
        gLogger.fatal(res["Message"])
        DIRACExit(2)
    stagerInfo = res["Value"]
    outStr = "\n"
    outStr += "  %s" % ("Status".ljust(20))
    outStr += "  %s" % ("SE".ljust(20))
    outStr += "  %s" % ("NumberOfFiles".ljust(20))
    outStr += "  %s" % ("Size(GB)".ljust(20))
    outStr += " \n--------------------------------------------------------------------------\n"
    if stagerInfo:
        for info in stagerInfo.values():
            outStr += "  %s" % (info["Status"].ljust(20))
            outStr += "  %s" % (info["SE"].ljust(20))
            outStr += "  %s" % (str(info["NumFiles"]).ljust(20))
            outStr += "  %s\n" % (str(info["SumFiles"]).ljust(20))
    else:
        outStr += "  %s" % ("Nothing to see here...Bye")
    outStr += "  \nWARNING: the Size for files with Status=New is not yet determined at the point of selection!\n"
    outStr += "--------------------- current status of the SE Caches from the DB-----------"
    res = client.getSubmittedStagePins()
    if not res["OK"]:
        gLogger.fatal(res["Message"])
        DIRACExit(2)
    storageElementUsage = res["Value"]
    if storageElementUsage:
        for storageElement in storageElementUsage.keys():
            seDict = storageElementUsage[storageElement]
            seDict["TotalSize"] = int(seDict["TotalSize"] /
                                      (1000 * 1000 * 1000.0))
            outStr += " \n %s: %s replicas with a size of %.3f GB." % (
                storageElement.ljust(15),
                str(seDict["Replicas"]).rjust(6),
                seDict["TotalSize"],
            )
    else:
        outStr += "  %s" % "\nStageRequest.getStorageUsage: No active stage/pin requests found."
    gLogger.notice(outStr)
    DIRACExit(0)
Example #15
  def initialize( self ):

    # This sets the Default Proxy to be used as that defined under
    # /Operations/Shifter/DataManager
    # the shifterProxy option in the Configuration can be used to change this default.
    self.am_setOption( 'shifterProxy', 'DataManager' )
    self.stagerClient = StorageManagerClient()
    return S_OK()
Example #16
    def initialize(self):
        self.stagerClient = StorageManagerClient()
        self.dataIntegrityClient = DataIntegrityClient()
        # This sets the Default Proxy to be used as that defined under
        # /Operations/Shifter/DataManager
        # the shifterProxy option in the Configuration can be used to change this default.
        self.am_setOption("shifterProxy", "DataManager")

        return S_OK()
Example #17
  def initialize( self ):
    self.replicaManager = ReplicaManager()
    self.stagerClient = StorageManagerClient()
    self.pinLifeTime = 60 * 60 * 24 * 7 # 7 days

    # This sets the Default Proxy to be used as that defined under
    # /Operations/Shifter/DataManager
    # the shifterProxy option in the Configuration can be used to change this default.
    self.am_setOption( 'shifterProxy', 'DataManager' )

    return S_OK()
Example #18
  def initialize( self ):
    self.fileCatalog = FileCatalog()
    self.rm = ReplicaManager()
    self.stagerClient = StorageManagerClient()
    self.dataIntegrityClient = DataIntegrityClient()
    # self.storageDB = StorageManagementDB()
    # This sets the Default Proxy to be used as that defined under
    # /Operations/Shifter/DataManager
    # the shifterProxy option in the Configuration can be used to change this default.
    self.am_setOption( 'shifterProxy', 'DataManager' )

    return S_OK()
Example #19
  def initialize( self ):
    self.stagerClient = StorageManagerClient()
    #self.storageDB = StorageManagementDB()
    # pin lifetime = 1 day
    self.pinLifetime = self.am_getOption( 'PinLifetime', THROTTLING_TIME )

    # This sets the Default Proxy to be used as that defined under
    # /Operations/Shifter/DataManager
    # the shifterProxy option in the Configuration can be used to change this default.
    self.am_setOption( 'shifterProxy', 'DataManager' )

    return S_OK()
Example #20
    def __requestStaging(self, jobState, stageLFNs):
        """Actual request for staging LFNs through the StorageManagerClient"""
        self.jobLog.debug(
            "Stage request will be \n\t%s" %
            "\n\t".join(["%s:%s" % (lfn, stageLFNs[lfn])
                         for lfn in stageLFNs]))

        stagerClient = StorageManagerClient()
        result = jobState.setStatus(
            JobStatus.STAGING,
            self.ex_getOption("StagingMinorStatus", "Request To Be Sent"),
            appStatus="",
            source=self.ex_optimizerName(),
        )
        if not result["OK"]:
            return result

        result = stagerClient.setRequest(
            stageLFNs, "WorkloadManagement",
            "updateJobFromStager@WorkloadManagement/JobStateUpdate",
            int(jobState.jid))
        if not result["OK"]:
            self.jobLog.error("Could not send stage request",
                              ": %s" % result["Message"])
            return result

        rid = str(result["Value"])
        self.jobLog.info("Stage request sent", "(%s)" % rid)
        self.storeOptimizerParam("StageRequest", rid)

        result = jobState.setStatus(
            JobStatus.STAGING,
            self.ex_getOption("StagingMinorStatus", "Request Sent"),
            appStatus="",
            source=self.ex_optimizerName(),
        )
        if not result["OK"]:
            return result

        return S_OK(stageLFNs)
Example #21
class PinRequestAgent( AgentModule ):

  def initialize( self ):
    self.replicaManager = ReplicaManager()
    self.stagerClient = StorageManagerClient()
    self.pinLifeTime = 60 * 60 * 24 * 7 # 7 days

    # This sets the Default Proxy to be used as that defined under
    # /Operations/Shifter/DataManager
    # the shifterProxy option in the Configuration can be used to change this default.
    self.am_setOption( 'shifterProxy', 'DataManager' )

    return S_OK()

  def execute( self ):

    res = self.submitPinRequests()
    return res

  def submitPinRequests( self ):
    """ This manages the Staged->Pinned transition of the Replicas
    """
    res = self.__getStagedReplicas()
    if not res['OK']:
      gLogger.fatal( "PinRequest.submitPinRequests: Failed to get replicas from StagerDB.", res['Message'] )
      return res
    if not res['Value']:
      gLogger.info( "PinRequest.submitPinRequests: There were no Staged replicas found" )
      return res
    seReplicas = res['Value']
    for storageElement, requestIDs in seReplicas.items():
      gLogger.info( "PinRequest.submitPinRequests: Obtained Staged replicas for pinning at %s." % storageElement )
      for requestID, replicas in requestIDs.items():
        self.__issuePinRequests( storageElement, requestID, replicas )
    return S_OK()

  def __getStagedReplicas( self ):
    """ This obtains the Staged replicas from the Replicas table and for each LFN the requested storage element """
    # First obtain the Staged replicas from the Replicas table
    res = self.stagerClient.getStagedReplicas()
    if not res['OK']:
      gLogger.error( "PinRequest.__getStagedReplicas: Failed to get replicas with Staged status.", res['Message'] )
      return res
    if not res['Value']:
      gLogger.debug( "PinRequest.__getStagedReplicas: No Staged replicas found to process." )
      return S_OK()
    else:
      gLogger.debug( "PinRequest.__getStagedReplicas: Obtained %s Staged replicas(s) to process." % len( res['Value'] ) )
    seReplicas = {}
    for replicaID, info in res['Value'].items():
      lfn, storageElement, size, pfn, requestID = info
      if storageElement not in seReplicas:
        seReplicas[storageElement] = {}
      if requestID not in seReplicas[storageElement]:
        seReplicas[storageElement][requestID] = {}
      seReplicas[storageElement][requestID][pfn] = replicaID
    return S_OK( seReplicas )

  def __issuePinRequests( self, storageElement, requestID, replicas ):
    pinRequestMetadata = {}
    # Now issue the pin requests for the remaining replicas
    if replicas:
      gLogger.info( "PinRequest.submitPinRequests: Submitting %s pin requests for request %s at %s." % ( len( replicas ), requestID, storageElement ) )
      pfnsToPin = dict.fromkeys( replicas, requestID )
      res = self.replicaManager.pinStorageFile( pfnsToPin, storageElement, lifetime = self.pinLifeTime )
      if not res['OK']:
        gLogger.error( "PinRequest.submitPinRequests: Completely failed to sumbmit pin requests for replicas.", res['Message'] )
      else:
        for pfn in res['Value']['Successful'].keys():
          if requestID not in pinRequestMetadata:
            pinRequestMetadata[requestID] = []
          pinRequestMetadata[requestID].append( replicas[pfn] )
    # Update the states of the replicas in the database
    if pinRequestMetadata:
      gLogger.info( "PinRequest.submitPinRequest: %s pin request metadata to be updated." % len( pinRequestMetadata ) )
      res = self.stagerClient.insertPinRequest( pinRequestMetadata, self.pinLifeTime )
      if not res['OK']:
        gLogger.error( "PinRequest.submitPinRequest: Failed to insert pin request metadata.", res['Message'] )
    return
Example #22
    def __requestStaging(self, jobState, stageSite, opData):
        result = getSEsForSite(stageSite)
        if not result['OK']:
            return S_ERROR('Could not determine SEs for site %s' % stageSite)
        siteSEs = result['Value']

        tapeSEs = []
        diskSEs = []
        for seName in siteSEs:
            se = StorageElement(seName)
            result = se.getStatus()
            if not result['OK']:
                self.jobLog.error("Cannot retrieve SE %s status: %s" %
                                  (seName, result['Message']))
                return S_ERROR("Cannot retrieve SE status")
            seStatus = result['Value']
            if seStatus['Read'] and seStatus['TapeSE']:
                tapeSEs.append(seName)
            if seStatus['Read'] and seStatus['DiskSE']:
                diskSEs.append(seName)

        if not tapeSEs:
            return S_ERROR("No Local SEs for site %s" % stageSite)

        self.jobLog.verbose("Tape SEs are %s" % (", ".join(tapeSEs)))

        # I swear this is horrible DM code it's not mine.
        # Eternity of hell to the inventor of the Value of Value of Success of...
        inputData = opData['Value']['Value']['Successful']
        stageLFNs = {}
        lfnToStage = []
        for lfn in inputData:
            replicas = inputData[lfn]
            # Check SEs
            seStage = []
            for seName in replicas:
                _surl = replicas[seName]
                if seName in diskSEs:
                    # This lfn is in disk. Skip it
                    seStage = []
                    break
                if seName not in tapeSEs:
                    # This lfn is not in this tape SE. Check next SE
                    continue
                seStage.append(seName)
            for seName in seStage:
                if seName not in stageLFNs:
                    stageLFNs[seName] = []
                stageLFNs[seName].append(lfn)
                if lfn not in lfnToStage:
                    lfnToStage.append(lfn)

        if not stageLFNs:
            return S_ERROR("Cannot find tape replicas")

        # Check if any LFN is in more than one SE
        # If that's the case, try to stage from the SE that has more LFNs to stage to group the request
        # 1.- Get the SEs ordered by ascending replicas
        # Build a list (not a one-shot iterator), ordered by descending number of LFNs to stage,
        # so it can be traversed once per LFN below.
        sortedSEs = sorted(
            [(len(stageLFNs[seName]), seName) for seName in stageLFNs],
            reverse=True)
        for lfn in lfnToStage:
            found = False
            # 2.- Traverse the SEs
            for _stageCount, seName in sortedSEs:
                if seName not in stageLFNs:
                    # SE was already emptied and removed below
                    continue
                if lfn in stageLFNs[seName]:
                    # 3.- If first time found, just mark as found. Next time delete the replica from the request
                    if found:
                        stageLFNs[seName].remove(lfn)
                    else:
                        found = True
                # 4.-If empty SE, remove
                if len(stageLFNs[seName]) == 0:
                    stageLFNs.pop(seName)

        self.jobLog.verbose(
            "Stage request will be \n\t%s" %
            "\n\t".join(["%s:%s" % (lfn, stageLFNs[lfn])
                         for lfn in stageLFNs]))

        stagerClient = StorageManagerClient()
        result = stagerClient.setRequest(
            stageLFNs, 'WorkloadManagement',
            'updateJobFromStager@WorkloadManagement/JobStateUpdate',
            int(jobState.jid))
        if not result['OK']:
            self.jobLog.error("Could not send stage request: %s" %
                              result['Message'])
            return S_ERROR("Problem sending staging request")

        rid = str(result['Value'])
        self.jobLog.info("Stage request %s sent" % rid)
        jobState.setParameter("StageRequest", rid)
        result = jobState.setStatus(self.ex_getOption('StagingStatus',
                                                      'Staging'),
                                    self.ex_getOption('StagingMinorStatus',
                                                      'Request Sent'),
                                    appStatus="",
                                    source=self.ex_optimizerName())
        if not result['OK']:
            return result
        return S_OK(stageLFNs)
Example #23
    def __kill_delete_jobs(self, jobIDList, right):
        """ Kill or delete jobs as necessary

        :param list jobIDList: job IDs
        :param str right: right

        :return: S_OK()/S_ERROR()
    """
        jobList = self.__getJobList(jobIDList)
        if not jobList:
            return S_ERROR('Invalid job specification: ' + str(jobIDList))

        validJobList, invalidJobList, nonauthJobList, ownerJobList = self.jobPolicy.evaluateJobRights(
            jobList, right)

        # Get job status to see what is to be killed or deleted
        result = self.jobDB.getAttributesForJobList(validJobList, ['Status'])
        if not result['OK']:
            return result
        killJobList = []
        deleteJobList = []
        markKilledJobList = []
        stagingJobList = []
        for jobID, sDict in result['Value'].items():  # can be an iterator
            if sDict['Status'] in (JobStatus.RUNNING, JobStatus.MATCHED,
                                   JobStatus.STALLED):
                killJobList.append(jobID)
            elif sDict['Status'] in (JobStatus.DONE, JobStatus.FAILED,
                                     JobStatus.KILLED):
                if not right == RIGHT_KILL:
                    deleteJobList.append(jobID)
            else:
                markKilledJobList.append(jobID)
            if sDict['Status'] in ['Staging']:
                stagingJobList.append(jobID)

        badIDs = []
        for jobID in markKilledJobList:
            result = self.__killJob(jobID, sendKillCommand=False)
            if not result['OK']:
                badIDs.append(jobID)

        for jobID in killJobList:
            result = self.__killJob(jobID)
            if not result['OK']:
                badIDs.append(jobID)

        for jobID in deleteJobList:
            result = self.__deleteJob(jobID)
            if not result['OK']:
                badIDs.append(jobID)

        if stagingJobList:
            stagerClient = StorageManagerClient()
            self.log.info('Going to send killing signal to stager as well!')
            result = stagerClient.killTasksBySourceTaskID(stagingJobList)
            if not result['OK']:
                self.log.warn('Failed to kill some Stager tasks',
                              result['Message'])

        if nonauthJobList or badIDs:
            result = S_ERROR('Some jobs failed deletion')
            if nonauthJobList:
                self.log.warn("Non-authorized JobIDs won't be deleted",
                              str(nonauthJobList))
                result['NonauthorizedJobIDs'] = nonauthJobList
            if badIDs:
                self.log.warn("JobIDs failed to be deleted", str(badIDs))
                result['FailedJobIDs'] = badIDs
            return result

        result = S_OK(validJobList)
        result['requireProxyUpload'] = len(
            ownerJobList) > 0 and self.__checkIfProxyUploadIsRequired()

        if invalidJobList:
            result['InvalidJobIDs'] = invalidJobList

        return result
Example #24
  def __requestStaging( self, jobState, stageSite, opData ):
    result = getSEsForSite( stageSite )
    if not result['OK']:
      return S_ERROR( 'Could not determine SEs for site %s' % stageSite )
    siteSEs = result['Value']

    tapeSEs = []
    diskSEs = []
    for seName in siteSEs:
      se = StorageElement( seName )
      result = se.getStatus()
      if not result[ 'OK' ]:
        self.jobLog.error( "Cannot retrieve SE %s status: %s" % ( seName, result[ 'Message' ] ) )
        return S_ERROR( "Cannot retrieve SE status" )
      seStatus = result[ 'Value' ]
      if seStatus[ 'Read' ] and seStatus[ 'TapeSE' ]:
        tapeSEs.append( seName )
      if seStatus[ 'Read' ] and seStatus[ 'DiskSE' ]:
        diskSEs.append( seName )

    if not tapeSEs:
      return S_ERROR( "No Local SEs for site %s" % stageSite )

    self.jobLog.verbose( "Tape SEs are %s" % ( ", ".join( tapeSEs ) ) )

    #I swear this is horrible DM code it's not mine.
    #Eternity of hell to the inventor of the Value of Value of Success of...
    inputData = opData['Value']['Value']['Successful']
    stageLFNs = {}
    lfnToStage = []
    for lfn in inputData:
      replicas = inputData[ lfn ]
      #Check SEs
      seStage = []
      for seName in replicas:
        surl = replicas[ seName ]
        if seName in diskSEs:
          #This lfn is in disk. Skip it
          seStage = []
          break
        if seName not in tapeSEs:
          #This lfn is not in this tape SE. Check next SE
          continue
        seStage.append( seName )
      for seName in seStage:
        if seName not in stageLFNs:
          stageLFNs[ seName ] = []
        stageLFNs[ seName ].append( lfn )
        if lfn not in lfnToStage:
          lfnToStage.append( lfn )

    if not stageLFNs:
      return S_ERROR( "Cannot find tape replicas" )

    #Check if any LFN is in more than one SE
    #If that's the case, try to stage from the SE that has more LFNs to stage to group the request
    #1.- Get the SEs ordered by ascending replicas
    # Build a list (not a one-shot iterator), ordered by descending number of LFNs to stage,
    # so it can be traversed once per LFN below.
    sortedSEs = sorted( [ ( len( stageLFNs[ seName ] ), seName ) for seName in stageLFNs ], reverse = True )
    for lfn in lfnToStage:
      found = False
      #2.- Traverse the SEs
      for stageCount, seName in sortedSEs:
        if seName not in stageLFNs:
          # SE was already emptied and removed below
          continue
        if lfn in stageLFNs[ seName ]:
          #3.- If first time found, just mark as found. Next time delete the replica from the request
          if found:
            stageLFNs[ seName ].remove( lfn )
          else:
            found = True
        #4.-If empty SE, remove
        if len( stageLFNs[ seName ] ) == 0:
          stageLFNs.pop( seName )

    self.jobLog.verbose( "Stage request will be \n\t%s" % "\n\t".join( [ "%s:%s" % ( lfn, stageLFNs[ lfn ] ) for lfn in stageLFNs ] ) )

    stagerClient = StorageManagerClient()
    result = jobState.setStatus( self.ex_getOption( 'StagingStatus', 'Staging' ),
                                 self.ex_getOption( 'StagingMinorStatus', 'Request To Be Sent' ),
                                 appStatus = "",
                                 source = self.ex_optimizerName() )
    if not result[ 'OK' ]:
      return result

    result = stagerClient.setRequest( stageLFNs, 'WorkloadManagement',
                                      'updateJobFromStager@WorkloadManagement/JobStateUpdate',
                                      int( jobState.jid ) )
    if not result[ 'OK' ]:
      self.jobLog.error( "Could not send stage request: %s" %  result[ 'Message' ] )
      return S_ERROR( "Problem sending staging request" )

    rid = str( result[ 'Value' ] )
    self.jobLog.info( "Stage request %s sent" % rid )
    jobState.setParameter( "StageRequest", rid )

    result = jobState.setStatus( self.ex_getOption( 'StagingStatus', 'Staging' ),
                                 self.ex_getOption( 'StagingMinorStatus', 'Request Sent' ),
                                 appStatus = "",
                                 source = self.ex_optimizerName() )
    if not result[ 'OK' ]:
      return result

    return S_OK( stageLFNs )
Example #25
    '  %s  LFN SE ...' % Script.scriptName, 'Arguments:',
    '  LFN: LFN of the staging file',
    '  SE: Storage Element for the staging file'
]))
Script.parseCommandLine(ignoreErrors=True)
args = Script.getPositionalArgs()
if len(args) < 2:
    Script.showHelp()

from DIRAC import exit as DIRACExit, gLogger

lfn = args[0]
se = args[1]

from DIRAC.StorageManagementSystem.Client.StorageManagerClient import StorageManagerClient
client = StorageManagerClient()
res = client.getCacheReplicas({'LFN': lfn, 'SE': se})
if not res['OK']:
    gLogger.error(res['Message'])
cacheReplicaInfo = res['Value']
if cacheReplicaInfo:
    replicaID = list(cacheReplicaInfo)[0]
    outStr = "\n--------------------"
    outStr += "\n%s: %s" % ('LFN'.ljust(8),
                            cacheReplicaInfo[replicaID]['LFN'].ljust(100))
    outStr += "\n%s: %s" % ('SE'.ljust(8),
                            cacheReplicaInfo[replicaID]['SE'].ljust(100))
    outStr += "\n%s: %s" % ('PFN'.ljust(8),
                            cacheReplicaInfo[replicaID]['PFN'].ljust(100))
    outStr += "\n%s: %s" % ('Status'.ljust(8),
                            cacheReplicaInfo[replicaID]['Status'].ljust(100))
Example #26
                                     'Usage:',
                                     '  %s  LFN SE ...' % Script.scriptName,
                                     'Arguments:',
                                     '  LFN: LFN of the staging file \n',
                                     '  SE: Storage Element for the staging file \n'
                                       ] ) )
Script.parseCommandLine( ignoreErrors = True )
args = Script.getPositionalArgs()
if len( args ) < 2:
  Script.showHelp()

lfn = args[0]
se = args[1]

from DIRAC.StorageManagementSystem.Client.StorageManagerClient import StorageManagerClient
client = StorageManagerClient()
res = client.getCacheReplicas( {'LFN':lfn,'SE':se} )
if not res['OK']:
  print( res['Message'] )
cacheReplicaInfo = res['Value']
if cacheReplicaInfo:
  replicaID = list( cacheReplicaInfo )[0]
  outStr = "\n--------------------"
  outStr = "%s\n%s: %s" % ( outStr, 'LFN'.ljust( 8 ), cacheReplicaInfo[replicaID]['LFN'].ljust( 100 ) )
  outStr = "%s\n%s: %s" % ( outStr, 'SE'.ljust( 8 ), cacheReplicaInfo[replicaID]['SE'].ljust( 100 ) )
  outStr = "%s\n%s: %s" % ( outStr, 'PFN'.ljust( 8 ), cacheReplicaInfo[replicaID]['PFN'].ljust( 100 ) )
  outStr = "%s\n%s: %s" % ( outStr, 'Status'.ljust( 8 ), cacheReplicaInfo[replicaID]['Status'].ljust( 100 ) )
  outStr = "%s\n%s: %s" % ( outStr, 'LastUpdate'.ljust( 8 ), str(cacheReplicaInfo[replicaID]['LastUpdate']).ljust( 100 ) )
  outStr = "%s\n%s: %s" % ( outStr, 'Reason'.ljust( 8 ), str( cacheReplicaInfo[replicaID]['Reason']).ljust( 100 ) )
  
  resTasks = client.getTasks({'ReplicaID':replicaID})
Example #27
stageLfns = {}

if os.path.exists(fileName):
    try:
        lfnFile = open(fileName)
        lfns = [k.strip() for k in lfnFile.readlines()]
        lfnFile.close()
    except Exception:
        gLogger.exception("Can not open file", fileName)
        DIRACExit(-1)
else:
    lfns = args[1:]

stageLfns[seName] = lfns
stagerClient = StorageManagerClient()

res = stagerClient.setRequest(
    stageLfns, "WorkloadManagement", "updateJobFromStager@WorkloadManagement/JobStateUpdate", 0
)  # fake JobID = 0
if not res["OK"]:
    gLogger.error(res["Message"])
    DIRACExit(-1)
else:
    print "Stage request submitted for LFNs:\n %s" % lfns
    print "SE= %s" % seName
    print "You can check their status and progress with dirac-stager-monitor-file <LFN> <SE>"

"""Example1:
dirac-stager-stage-files.py GRIDKA-RDST filesToStage.txt 
Stage request submitted for LFNs:
Example #28
class StageMonitorAgent(AgentModule):
    def initialize(self):
        self.stagerClient = StorageManagerClient()
        # This sets the Default Proxy to be used as that defined under
        # /Operations/Shifter/DataManager
        # the shifterProxy option in the Configuration can be used to change this default.
        self.am_setOption("shifterProxy", "DataManager")
        self.storagePlugins = self.am_getOption("StoragePlugins", [])
        self.dataOpSender = DataOperationSender()

        return S_OK()

    def execute(self):

        res = getProxyInfo(disableVOMS=True)
        if not res["OK"]:
            return res
        self.proxyInfoDict = res["Value"]

        return self.monitorStageRequests()

    def monitorStageRequests(self):
        """This is the third logical task manages the StageSubmitted->Staged transition of the Replicas"""
        res = self.__getStageSubmittedReplicas()
        if not res["OK"]:
            gLogger.fatal(
                "StageMonitor.monitorStageRequests: Failed to get replicas from StorageManagementDB.",
                res["Message"])
            return res
        if not res["Value"]:
            gLogger.info(
                "StageMonitor.monitorStageRequests: There were no StageSubmitted replicas found"
            )
            return res
        seReplicas = res["Value"]["SEReplicas"]
        replicaIDs = res["Value"]["ReplicaIDs"]
        gLogger.info(
            "StageMonitor.monitorStageRequests: Obtained %s StageSubmitted replicas for monitoring."
            % len(replicaIDs))
        for storageElement, seReplicaIDs in seReplicas.items():
            self.__monitorStorageElementStageRequests(storageElement,
                                                      seReplicaIDs, replicaIDs)

        return self.dataOpSender.concludeSending()

    def __monitorStorageElementStageRequests(self, storageElement,
                                             seReplicaIDs, replicaIDs):
        terminalReplicaIDs = {}
        oldRequests = []
        stagedReplicas = []

        # Since we are in a given SE, the LFN is a unique key
        lfnRepIDs = {}
        for replicaID in seReplicaIDs:
            lfn = replicaIDs[replicaID]["LFN"]
            lfnRepIDs[lfn] = replicaID

        if lfnRepIDs:
            gLogger.info(
                "StageMonitor.__monitorStorageElementStageRequests: Monitoring %s stage requests for %s."
                % (len(lfnRepIDs), storageElement))
        else:
            gLogger.warn(
                "StageMonitor.__monitorStorageElementStageRequests: No requests to monitor for %s."
                % storageElement)
            return
        startTime = datetime.datetime.utcnow()
        res = StorageElement(
            storageElement,
            plugins=self.storagePlugins).getFileMetadata(lfnRepIDs)
        if not res["OK"]:
            gLogger.error(
                "StageMonitor.__monitorStorageElementStageRequests: Completely failed to monitor stage requests for replicas",
                res["Message"],
            )
            return
        prestageStatus = res["Value"]

        accountingDict = self.__newAccountingDict(storageElement)

        for lfn, reason in prestageStatus["Failed"].items():
            accountingDict["TransferTotal"] += 1
            if re.search("File does not exist", reason):
                gLogger.error(
                    "StageMonitor.__monitorStorageElementStageRequests: LFN did not exist in the StorageElement",
                    lfn)
                terminalReplicaIDs[
                    lfnRepIDs[lfn]] = "LFN did not exist in the StorageElement"
        for lfn, metadata in prestageStatus["Successful"].items():
            if not metadata:
                continue
            staged = metadata.get("Cached", metadata["Accessible"])
            if staged:
                accountingDict["TransferTotal"] += 1
                accountingDict["TransferOK"] += 1
                accountingDict["TransferSize"] += metadata["Size"]
                stagedReplicas.append(lfnRepIDs[lfn])
            elif staged is not None:
                oldRequests.append(lfnRepIDs[lfn])  # only ReplicaIDs

        # Check if sending data operation to Monitoring
        self.dataOpSender.sendData(accountingDict,
                                   startTime=startTime,
                                   endTime=datetime.datetime.utcnow())
        # Update the states of the replicas in the database
        if terminalReplicaIDs:
            gLogger.info(
                "StageMonitor.__monitorStorageElementStageRequests: %s replicas are terminally failed."
                % len(terminalReplicaIDs))
            res = self.stagerClient.updateReplicaFailure(terminalReplicaIDs)
            if not res["OK"]:
                gLogger.error(
                    "StageMonitor.__monitorStorageElementStageRequests: Failed to update replica failures.",
                    res["Message"],
                )
        if stagedReplicas:
            gLogger.info(
                "StageMonitor.__monitorStorageElementStageRequests: %s staged replicas to be updated."
                % len(stagedReplicas))
            res = self.stagerClient.setStageComplete(stagedReplicas)
            if not res["OK"]:
                gLogger.error(
                    "StageMonitor.__monitorStorageElementStageRequests: Failed to updated staged replicas.",
                    res["Message"],
                )
            res = self.stagerClient.updateReplicaStatus(
                stagedReplicas, "Staged")
            if not res["OK"]:
                gLogger.error(
                    "StageMonitor.__monitorStorageElementStageRequests: Failed to insert replica status.",
                    res["Message"],
                )
        if oldRequests:
            gLogger.info(
                "StageMonitor.__monitorStorageElementStageRequests: %s old requests will be retried."
                % len(oldRequests))
            res = self.__wakeupOldRequests(oldRequests)
            if not res["OK"]:
                gLogger.error(
                    "StageMonitor.__monitorStorageElementStageRequests: Failed to wakeup old requests.",
                    res["Message"])
        return

    def __newAccountingDict(self, storageElement):
        """Generate a new accounting Dict"""

        accountingDict = {}
        accountingDict["OperationType"] = "Stage"
        accountingDict["User"] = self.proxyInfoDict["username"]
        accountingDict["Protocol"] = "Stager"
        accountingDict["RegistrationTime"] = 0.0
        accountingDict["RegistrationOK"] = 0
        accountingDict["RegistrationTotal"] = 0
        accountingDict["FinalStatus"] = "Successful"
        accountingDict["Source"] = storageElement
        accountingDict["Destination"] = storageElement
        accountingDict["ExecutionSite"] = siteName()
        accountingDict["TransferTotal"] = 0
        accountingDict["TransferOK"] = 0
        accountingDict["TransferSize"] = 0
        accountingDict["TransferTime"] = self.am_getPollingTime()

        return accountingDict

    def __getStageSubmittedReplicas(self):
        """This obtains the StageSubmitted replicas from the Replicas table and the RequestID
        from the StageRequests table
        """
        res = self.stagerClient.getCacheReplicas({"Status": "StageSubmitted"})
        if not res["OK"]:
            gLogger.error(
                "StageMonitor.__getStageSubmittedReplicas: Failed to get replicas with StageSubmitted status.",
                res["Message"],
            )
            return res
        if not res["Value"]:
            gLogger.debug(
                "StageMonitor.__getStageSubmittedReplicas: No StageSubmitted replicas found to process."
            )
            return S_OK()
        else:
            gLogger.debug(
                "StageMonitor.__getStageSubmittedReplicas: Obtained %s StageSubmitted replicas(s) to process."
                % len(res["Value"]))

        seReplicas = {}
        replicaIDs = res["Value"]
        for replicaID, info in replicaIDs.items():
            storageElement = info["SE"]
            seReplicas.setdefault(storageElement, []).append(replicaID)

        # RequestID was missing from replicaIDs dictionary BUGGY?
        res = self.stagerClient.getStageRequests(
            {"ReplicaID": list(replicaIDs)})
        if not res["OK"]:
            return res
        if not res["Value"]:
            return S_ERROR(
                "Could not obtain request IDs for replicas %s from StageRequests table"
                % list(replicaIDs))

        for replicaID, info in res["Value"].items():
            replicaIDs[replicaID]["RequestID"] = info["RequestID"]

        return S_OK({"SEReplicas": seReplicas, "ReplicaIDs": replicaIDs})

    def __wakeupOldRequests(self, oldRequests):
        gLogger.info("StageMonitor.__wakeupOldRequests: Attempting...")
        retryInterval = self.am_getOption("RetryIntervalHour", 2)
        res = self.stagerClient.wakeupOldRequests(oldRequests, retryInterval)
        if not res["OK"]:
            gLogger.error(
                "StageMonitor.__wakeupOldRequests: Failed to resubmit old requests.",
                res["Message"])
            return res
        return S_OK()
Example #29
            "  LFN: LFN of the staging file \n",
            "  SE: Storage Element for the staging file \n",
        ]
    )
)
Script.parseCommandLine(ignoreErrors=True)
args = Script.getPositionalArgs()
if len(args) < 2:
    Script.showHelp()

lfn = args[0]
se = args[1]

from DIRAC.StorageManagementSystem.Client.StorageManagerClient import StorageManagerClient

client = StorageManagerClient()
res = client.getCacheReplicas({"LFN": lfn, "SE": se})
if not res["OK"]:
    print res["Message"]
cacheReplicaInfo = res["Value"]
if cacheReplicaInfo:
    replicaID = list(cacheReplicaInfo)[0]
    outStr = "\n--------------------"
    outStr = "%s\n%s: %s" % (outStr, "LFN".ljust(8), cacheReplicaInfo[replicaID]["LFN"].ljust(100))
    outStr = "%s\n%s: %s" % (outStr, "SE".ljust(8), cacheReplicaInfo[replicaID]["SE"].ljust(100))
    outStr = "%s\n%s: %s" % (outStr, "PFN".ljust(8), cacheReplicaInfo[replicaID]["PFN"].ljust(100))
    outStr = "%s\n%s: %s" % (outStr, "Status".ljust(8), cacheReplicaInfo[replicaID]["Status"].ljust(100))
    outStr = "%s\n%s: %s" % (outStr, "LastUpdate".ljust(8), str(cacheReplicaInfo[replicaID]["LastUpdate"]).ljust(100))
    outStr = "%s\n%s: %s" % (outStr, "Reason".ljust(8), str(cacheReplicaInfo[replicaID]["Reason"]).ljust(100))

    resTasks = client.getTasks({"ReplicaID": replicaID})
Example #30
    def run():
        global subLogger

        from DIRAC.StorageManagementSystem.Client.StorageManagerClient import StorageManagerClient

        client = StorageManagerClient()
        queryDict = {}

        if "status" in switchDict:
            queryDict["Status"] = str(switchDict["status"])

        if "se" in switchDict:
            queryDict["SE"] = str(switchDict["se"])

        # weird: if there are no switches (dictionary is empty), then the --limit is ignored!!
        # must FIX that in StorageManagementDB.py!
        # ugly fix:
        newer = "1903-08-02 06:24:38"  # select newer than
        if "limit" in switchDict:
            gLogger.notice("Query limited to %s entries" % switchDict["limit"])
            res = client.getCacheReplicas(queryDict, None, newer, None, None,
                                          int(switchDict["limit"]))
        else:
            res = client.getCacheReplicas(queryDict)

        if not res["OK"]:
            gLogger.error(res["Message"])
        outStr = "\n"
        if res["Records"]:
            replicas = res["Value"]
            outStr += " %s" % ("Status".ljust(15))
            outStr += " %s" % ("LastUpdate".ljust(20))
            outStr += " %s" % ("LFN".ljust(80))
            outStr += " %s" % ("SE".ljust(10))
            outStr += " %s" % ("Reason".ljust(10))
            if "showJobs" in switchDict:
                outStr += " %s" % ("Jobs".ljust(10))
            outStr += " %s" % ("PinExpiryTime".ljust(15))
            outStr += " %s" % ("PinLength(sec)".ljust(15))
            outStr += "\n"

            for crid, info in replicas.items():
                outStr += " %s" % (info["Status"].ljust(15))
                outStr += " %s" % (str(info["LastUpdate"]).ljust(20))
                outStr += " %s" % (info["LFN"].ljust(30))
                outStr += " %s" % (info["SE"].ljust(15))
                outStr += " %s" % (str(info["Reason"]).ljust(10))

                # Task info
                if "showJobs" in switchDict:
                    resTasks = client.getTasks({"ReplicaID": crid})
                    if resTasks["OK"]:
                        if resTasks["Value"]:
                            tasks = resTasks["Value"]
                            jobs = []
                            for tid in tasks:
                                jobs.append(tasks[tid]["SourceTaskID"])
                            outStr += " %s " % (str(jobs).ljust(10))
                    else:
                        outStr += " %s " % (" --- ".ljust(10))
                # Stage request info
                # what if there's no request to the site yet?
                resStageRequests = client.getStageRequests({"ReplicaID": crid})
                if not resStageRequests["OK"]:
                    gLogger.error(resStageRequests["Message"])
                if resStageRequests["Records"]:
                    stageRequests = resStageRequests["Value"]
                    for info in stageRequests.values():
                        outStr += " %s" % (str(
                            info["PinExpiryTime"]).ljust(20))
                        outStr += " %s" % (str(info["PinLength"]).ljust(10))
                outStr += "\n"

            gLogger.notice(outStr)
        else:
            gLogger.notice("No entries")
Example #31
0
# File :    dirac-stager-show-stats
# Author :  Daniela Remenska
########################################################################
"""
Reports breakdown of file(s) number/size in different staging states across Storage Elements.
Currently used Cache per SE is also reported. (active pins)
"""

__RCSID__ = "$Id$"
from DIRAC.Core.Base import Script
from DIRAC import gConfig, gLogger, exit as DIRACExit, S_OK, version

Script.parseCommandLine(ignoreErrors=False)
from DIRAC.StorageManagementSystem.Client.StorageManagerClient import StorageManagerClient

client = StorageManagerClient()

res = client.getCacheReplicasSummary()
if not res['OK']:
    gLogger.fatal(res['Message'])
    DIRACExit(2)
stagerInfo = res['Value']
outStr = "\n"
outStr += "  %s" % ("Status".ljust(20))
outStr += "  %s" % ("SE".ljust(20))
outStr += "  %s" % ("NumberOfFiles".ljust(20))
outStr += "  %s" % ("Size(GB)".ljust(20))
outStr += " \n--------------------------------------------------------------------------\n" % outStr
if stagerInfo:
    for info in stagerInfo.values():
        outStr += "  %s" % (info['Status'].ljust(20))
Example #32
0
class StageMonitorAgent( AgentModule ):

  def initialize( self ):
    self.replicaManager = ReplicaManager()
    self.stagerClient = StorageManagerClient()
    self.dataIntegrityClient = DataIntegrityClient()
    #self.storageDB = StorageManagementDB()
    # This sets the Default Proxy to used as that defined under
    # /Operations/Shifter/DataManager
    # the shifterProxy option in the Configuration can be used to change this default.
    self.am_setOption( 'shifterProxy', 'DataManager' )

    return S_OK()

  def execute( self ):

    res = getProxyInfo( disableVOMS = True )
    if not res['OK']:
      return res
    self.proxyInfoDict = res['Value']

    res = self.monitorStageRequests()

    return res

  def monitorStageRequests( self ):
    """ This is the third logical task manages the StageSubmitted->Staged transition of the Replicas
    """
    res = self.__getStageSubmittedReplicas()
    if not res['OK']:
      gLogger.fatal( "StageMonitor.monitorStageRequests: Failed to get replicas from StorageManagementDB.", res['Message'] )
      return res
    if not res['Value']:
      gLogger.info( "StageMonitor.monitorStageRequests: There were no StageSubmitted replicas found" )
      return res
    seReplicas = res['Value']['SEReplicas']
    replicaIDs = res['Value']['ReplicaIDs']
    gLogger.info( "StageMonitor.monitorStageRequests: Obtained %s StageSubmitted replicas for monitoring." % len( replicaIDs ) )
    for storageElement, seReplicaIDs in seReplicas.items():
      self.__monitorStorageElementStageRequests( storageElement, seReplicaIDs, replicaIDs )

    gDataStoreClient.commit()

    return S_OK()

  def __monitorStorageElementStageRequests( self, storageElement, seReplicaIDs, replicaIDs ):
    terminalReplicaIDs = {}
    oldRequests = []
    stagedReplicas = []
    pfnRepIDs = {}
    pfnReqIDs = {}
    for replicaID in seReplicaIDs:
      pfn = replicaIDs[replicaID]['PFN']
      pfnRepIDs[pfn] = replicaID
      requestID = replicaIDs[replicaID].get( 'RequestID', None )
      if requestID:
        pfnReqIDs[pfn] = replicaIDs[replicaID]['RequestID']

    gLogger.info( "StageMonitor.__monitorStorageElementStageRequests: Monitoring %s stage requests for %s." % ( len( pfnRepIDs ), storageElement ) )
    oAccounting = DataOperation()
    oAccounting.setStartTime()

    res = self.replicaManager.getStorageFileMetadata( pfnReqIDs.keys(), storageElement )
    if not res['OK']:
      gLogger.error( "StageMonitor.__monitorStorageElementStageRequests: Completely failed to monitor stage requests for replicas.", res['Message'] )
      return
    prestageStatus = res['Value']

    accountingDict = self.__newAccountingDict( storageElement )

    for pfn, reason in prestageStatus['Failed'].items():
      accountingDict['TransferTotal'] += 1
      if re.search( 'File does not exist', reason ):
        gLogger.error( "StageMonitor.__monitorStorageElementStageRequests: PFN did not exist in the StorageElement", pfn )
        terminalReplicaIDs[pfnRepIDs[pfn]] = 'PFN did not exist in the StorageElement'
    for pfn, staged in prestageStatus['Successful'].items():
      if staged and 'Cached' in staged and staged['Cached']:
        accountingDict['TransferTotal'] += 1
        accountingDict['TransferOK'] += 1
        accountingDict['TransferSize'] += staged['Size']
        stagedReplicas.append( pfnRepIDs[pfn] )
      if staged and 'Cached' in staged and not staged['Cached']:
        oldRequests.append( pfnRepIDs[pfn] )  # only ReplicaIDs

    oAccounting.setValuesFromDict( accountingDict )
    oAccounting.setEndTime()
    gDataStoreClient.addRegister( oAccounting )

    # Update the states of the replicas in the database
    if terminalReplicaIDs:
      gLogger.info( "StageMonitor.__monitorStorageElementStageRequests: %s replicas are terminally failed." % len( terminalReplicaIDs ) )
      res = self.stagerClient.updateReplicaFailure( terminalReplicaIDs )
      if not res['OK']:
        gLogger.error( "StageMonitor.__monitorStorageElementStageRequests: Failed to update replica failures.", res['Message'] )
    if stagedReplicas:
      gLogger.info( "StageMonitor.__monitorStorageElementStageRequests: %s staged replicas to be updated." % len( stagedReplicas ) )
      res = self.stagerClient.setStageComplete( stagedReplicas )
      if not res['OK']:
        gLogger.error( "StageMonitor.__monitorStorageElementStageRequests: Failed to updated staged replicas.", res['Message'] )
      res = self.stagerClient.updateReplicaStatus( stagedReplicas, 'Staged' )
      if not res['OK']:
        gLogger.error( "StageMonitor.__monitorStorageElementStageRequests: Failed to insert replica status.", res['Message'] )
    if oldRequests:
      gLogger.info( "StageMonitor.__monitorStorageElementStageRequests: %s old requests will be retried." % len( oldRequests ) )
      res = self.__wakeupOldRequests( oldRequests )
      if not res['OK']:
        gLogger.error( "StageMonitor.__monitorStorageElementStageRequests: Failed to wakeup old requests.", res['Message'] )
    return

  def __newAccountingDict( self, storageElement ):
    """ Generate a new accounting Dict """

    accountingDict = {}
    accountingDict['OperationType'] = 'Stage'
    accountingDict['User'] = self.proxyInfoDict['username']
    accountingDict['Protocol'] = 'Stager'
    accountingDict['RegistrationTime'] = 0.0
    accountingDict['RegistrationOK'] = 0
    accountingDict['RegistrationTotal'] = 0
    accountingDict['FinalStatus'] = 'Successful'
    accountingDict['Source'] = storageElement
    accountingDict['Destination'] = storageElement
    accountingDict['ExecutionSite'] = siteName()
    accountingDict['TransferTotal'] = 0
    accountingDict['TransferOK'] = 0
    accountingDict['TransferSize'] = 0
    accountingDict['TransferTime'] = self.am_getPollingTime()

    return accountingDict

  def __getStageSubmittedReplicas( self ):
    """ This obtains the StageSubmitted replicas from the Replicas table and the RequestID from the StageRequests table """
    res = self.stagerClient.getCacheReplicas( {'Status':'StageSubmitted'} )
    if not res['OK']:
      gLogger.error( "StageMonitor.__getStageSubmittedReplicas: Failed to get replicas with StageSubmitted status.", res['Message'] )
      return res
    if not res['Value']:
      gLogger.debug( "StageMonitor.__getStageSubmittedReplicas: No StageSubmitted replicas found to process." )
      return S_OK()
    else:
      gLogger.debug( "StageMonitor.__getStageSubmittedReplicas: Obtained %s StageSubmitted replicas(s) to process." % len( res['Value'] ) )

    seReplicas = {}
    replicaIDs = res['Value']
    for replicaID, info in replicaIDs.items():
      storageElement = info['SE']
      if storageElement not in seReplicas:
        seReplicas[storageElement] = []
      seReplicas[storageElement].append( replicaID )

    # RequestID was missing from replicaIDs dictionary BUGGY?
    res = self.stagerClient.getStageRequests( {'ReplicaID':replicaIDs.keys()} )
    if not res['OK']:
      return res
    if not res['Value']:
      return S_ERROR( 'Could not obtain request IDs for replicas %s from StageRequests table' % ( replicaIDs.keys() ) )

    for replicaID, info in res['Value'].items():
      reqID = info['RequestID']
      replicaIDs[replicaID]['RequestID'] = reqID

    return S_OK( {'SEReplicas':seReplicas, 'ReplicaIDs':replicaIDs} )

  def __reportProblematicFiles( self, lfns, reason ):
    return S_OK()
    res = self.dataIntegrityClient.setFileProblematic( lfns, reason,  sourceComponent = 'StageMonitorAgent'  )
    if not res['OK']:
      gLogger.error( "StageMonitor.__reportProblematicFiles: Failed to report missing files.", res['Message'] )
      return res
    if res['Value']['Successful']:
      gLogger.info( "StageMonitor.__reportProblematicFiles: Successfully reported %s missing files." % len( res['Value']['Successful'] ) )
    if res['Value']['Failed']:
      gLogger.info( "StageMonitor.__reportProblematicFiles: Failed to report %s problematic files." % len( res['Value']['Failed'] ) )
    return res

  def __wakeupOldRequests( self, oldRequests ):
    gLogger.info( "StageMonitor.__wakeupOldRequests: Attempting..." )
    retryInterval = self.am_getOption( 'RetryIntervalHour', 2 )
    res = self.stagerClient.wakeupOldRequests( oldRequests, retryInterval )
    if not res['OK']:
      gLogger.error( "StageMonitor.__wakeupOldRequests: Failed to resubmit old requests.", res['Message'] )
      return res
    return S_OK()
def run():
  
  from DIRAC.StorageManagementSystem.Client.StorageManagerClient import StorageManagerClient
  client = StorageManagerClient()
  queryDict = {}

  dictKeys = switchDict.keys()
  
  if 'status' in dictKeys:
    queryDict['Status'] = str(switchDict['status']) 
  
  
  if 'se' in dictKeys:
    queryDict['SE'] = str(switchDict['se'])
  
  # weird: if there are no switches (dictionary is empty), then the --limit is ignored!!
  # must FIX that in StorageManagementDB.py!
  # ugly fix:
  newer = '1903-08-02 06:24:38' # select newer than 
  if 'limit' in dictKeys:
    print "Query limited to %s entries" %switchDict['limit']   
    res = client.getCacheReplicas(queryDict, None, newer, None, None, int(switchDict['limit']))
  else:
    res = client.getCacheReplicas(queryDict)
  
  if not res['OK']:
    print(res['Message'])
  outStr ="\n"
  if res['Records']:
    replicas = res['Value']
    outStr = "%s %s" %(outStr, "Status".ljust(15)) 
    outStr = "%s %s" %(outStr, "LastUpdate".ljust(20))  
    outStr = "%s %s" %(outStr, "LFN".ljust(80))   
    outStr = "%s %s" %(outStr, "SE".ljust(10))  
    outStr = "%s %s" %(outStr, "Reason".ljust(10))
    if 'showJobs' in dictKeys:  
      outStr = "%s %s" %(outStr, "Jobs".ljust(10))  
    outStr = "%s %s" %(outStr, "PinExpiryTime".ljust(15))  
    outStr = "%s %s" %(outStr, "PinLength(sec)".ljust(15))  
    outStr = "%s\n" % outStr  
    
    for crid in replicas.keys():
      outStr = "%s %s" %(outStr, replicas[crid]['Status'].ljust( 15 ))
      outStr = "%s %s" %(outStr, str(replicas[crid]['LastUpdate']).ljust( 20 ))
      outStr = "%s %s" %(outStr, replicas[crid]['LFN'].ljust( 30 ))
      outStr = "%s %s" %(outStr, replicas[crid]['SE'].ljust( 15 ))              
      outStr = "%s %s" %(outStr, str(replicas[crid]['Reason']).ljust( 10 ))
 
      # Task info
      if 'showJobs' in dictKeys:
        resTasks = client.getTasks({'ReplicaID':crid})
        if resTasks['OK']:
          if resTasks['Value']:
            tasks = resTasks['Value']
            jobs = []
            for tid in tasks.keys():
              jobs.append(tasks[tid]['SourceTaskID'])      
            outStr = '%s %s ' % (outStr, str(jobs).ljust(10))
        else:
          outStr = '%s %s ' % (outStr, " --- ".ljust(10))     
      # Stage request info
      # what if there's no request to the site yet?
      resStageRequests = client.getStageRequests({'ReplicaID':crid})
      if not resStageRequests['OK']:
        print(resStageRequests['Message'])
      if resStageRequests['Records']:
        stageRequests = resStageRequests['Value']        
        for srid in stageRequests.keys():
          outStr = "%s %s" %(outStr, str(stageRequests[srid]['PinExpiryTime']).ljust( 20 ))
          outStr = "%s %s" %(outStr, str(stageRequests[srid]['PinLength']).ljust( 10 ))
           
 
      outStr = "%s\n" % outStr  
    print(outStr)
  else:
    print("No entries")
Example #34
0
    lfnFile.close()
  except Exception:
    DIRAC.gLogger.exception( 'Can not open file', fileName )
    DIRAC.exit( -1 )

else:
  lfns = args[1:]

dirac = Dirac()
res = dirac.getReplicas( lfns[0:], active = True, printOutput = False )

if not res['OK']:
  DIRAC.gLogger.error( res['Message'] )
  DIRAC.exit( -1 )

stagerClient = StorageManagerClient()
stageLfns = []

for lfn, replicas in res['Value']['Successful'].items():
  if seName in replicas:
    stageLfns.append( lfn )
    if len( stageLfns ) >= 10:
      # Use a fake JobID = 0
      request = stagerClient.setRequest( { seName : stageLfns }, 'WorkloadManagement',
                                         'updateJobFromStager@WorkloadManagement/JobStateUpdate', 0 )
      if request['OK']:
        DIRAC.gLogger.notice( 'Stage Request submitted for %s replicas:' % len( stageLfns ), request['Value'] )
        stageLfns = []
      else:
        DIRAC.gLogger.error( 'Failed to submit Stage Request' )
        DIRAC.gLogger.error( request['Message'] )
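The excerpt above submits a request each time ten LFNs have accumulated; any trailing partial batch (not shown in the excerpt) would still need a final setRequest call. A compact helper capturing the same chunked submission, assuming the fake JobID = 0 convention used above; the helper name and batch size are illustrative:

from DIRAC import S_OK

def submitStageRequestsInBatches(stagerClient, seName, lfns, batchSize=10):
    """Sketch: submit stage requests for lfns at seName in chunks of batchSize."""
    for i in range(0, len(lfns), batchSize):
        chunk = lfns[i:i + batchSize]
        res = stagerClient.setRequest({seName: chunk}, 'WorkloadManagement',
                                      'updateJobFromStager@WorkloadManagement/JobStateUpdate', 0)
        if not res['OK']:
            return res
    return S_OK()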
Example #35
0
class RequestFinalizationAgent(AgentModule):
    def initialize(self):

        # This sets the Default Proxy to used as that defined under
        # /Operations/Shifter/DataManager
        # the shifterProxy option in the Configuration can be used to change this default.
        self.am_setOption('shifterProxy', 'DataManager')
        #self.storageDB = StorageManagementDB()
        self.stagerClient = StorageManagerClient()
        return S_OK()

    def execute(self):
        res = self.clearFailedTasks()
        res = self.callbackStagedTasks()
        res = self.removeUnlinkedReplicas()
        res = self.setOldTasksAsFailed(self.am_getOption('FailIntervalDay', 3))
        return res

    def clearFailedTasks(self):
        """ This obtains the tasks which are marked as Failed and remove all the associated records
    """
        res = self.stagerClient.getTasksWithStatus('Failed')
        if not res['OK']:
            gLogger.fatal(
                "RequestFinalization.clearFailedTasks: Failed to get Failed Tasks from StagerDB.",
                res['Message'])
            return res
        failedTasks = res['Value']
        gLogger.info(
            "RequestFinalization.clearFailedTasks: Obtained %s tasks in the 'Failed' status."
            % len(failedTasks))
        for taskID, (source, callback, sourceTask) in failedTasks.items():
            if (callback and sourceTask):
                res = self.__performCallback('Failed', callback, sourceTask)
                if not res['OK']:
                    failedTasks.pop(taskID)
        if not failedTasks:
            gLogger.info(
                "RequestFinalization.clearFailedTasks: No tasks to remove.")
            return S_OK()
        gLogger.info(
            "RequestFinalization.clearFailedTasks: Removing %s tasks..." %
            len(failedTasks))
        res = self.stagerClient.removeTasks(failedTasks.keys())
        if not res['OK']:
            gLogger.error(
                "RequestFinalization.clearFailedTasks: Failed to remove tasks.",
                res['Message'])
            return res
        gLogger.info("RequestFinalization.clearFailedTasks: ...removed.")
        return S_OK()

    def callbackDoneTasks(self):
        """ This issues the call back message for the Tasks with a State='Done'
    """
        res = self.stagerClient.getTasksWithStatus('Done')
        if not res['OK']:
            gLogger.fatal(
                "RequestFinalization.callbackDoneTasks: Failed to get Done Tasks from StorageManagementDB.",
                res['Message'])
            return res
        doneTasks = res['Value']
        gLogger.info(
            "RequestFinalization.callbackDoneTasks: Obtained %s tasks in the 'Done' status."
            % len(doneTasks))
        for taskID, (source, callback, sourceTask) in doneTasks.items():
            if (callback and sourceTask):
                res = self.__performCallback('Done', callback, sourceTask)
                if not res['OK']:
                    doneTasks.pop(taskID)
        if not doneTasks:
            gLogger.info(
                "RequestFinalization.callbackDoneTasks: No tasks to update to Done."
            )
            return S_OK()
        res = self.stagerClient.removeTasks(doneTasks.keys())
        if not res['OK']:
            gLogger.fatal(
                "RequestFinalization.callbackDoneTasks: Failed to remove Done tasks.",
                res['Message'])
        return res

    def callbackStagedTasks(self):
        """ This updates the status of the Tasks to Done then issues the call back message
    """
        res = self.stagerClient.getTasksWithStatus('Staged')
        if not res['OK']:
            gLogger.fatal(
                "RequestFinalization.callbackStagedTasks: Failed to get Staged Tasks from StagerDB.",
                res['Message'])
            return res
        stagedTasks = res['Value']
        gLogger.info(
            "RequestFinalization.callbackStagedTasks: Obtained %s tasks in the 'Staged' status."
            % len(stagedTasks))
        for taskID, (source, callback, sourceTask) in stagedTasks.items():
            if (callback and sourceTask):
                res = self.__performCallback('Done', callback, sourceTask)
                if not res['OK']:
                    stagedTasks.pop(taskID)
                else:
                    gLogger.info(
                        "RequestFinalization.callbackStagedTasks, Task = %s: %s"
                        % (sourceTask, res['Value']))

        if not stagedTasks:
            gLogger.info(
                "RequestFinalization.callbackStagedTasks: No tasks to update to Done."
            )
            return S_OK()
        # Daniela: Why is the line below commented out?
        #res = self.stagerClient.setTasksDone(stagedTasks.keys())
        res = self.stagerClient.removeTasks(stagedTasks.keys())
        if not res['OK']:
            gLogger.fatal(
                "RequestFinalization.callbackStagedTasks: Failed to remove staged Tasks.",
                res['Message'])
        return res

    def __performCallback(self, status, callback, sourceTask):
        method, service = callback.split('@')
        gLogger.debug(
            "RequestFinalization.__performCallback: Attempting to perform call back for %s with %s status"
            % (sourceTask, status))
        client = RPCClient(service)
        gLogger.debug(
            "RequestFinalization.__performCallback: Created RPCClient to %s" %
            service)
        gLogger.debug(
            "RequestFinalization.__performCallback: Attempting to invoke %s service method"
            % method)
        res = getattr(client, method)(sourceTask, status)
        if not res['OK']:
            gLogger.error(
                "RequestFinalization.__performCallback: Failed to perform callback",
                res['Message'])
        else:
            gLogger.info(
                "RequestFinalization.__performCallback: Successfully issued callback to %s for %s with %s status"
                % (callback, sourceTask, status))
        return res

    def removeUnlinkedReplicas(self):
        gLogger.info(
            "RequestFinalization.removeUnlinkedReplicas: Attempting to cleanup unlinked Replicas."
        )
        res = self.stagerClient.removeUnlinkedReplicas()
        if not res['OK']:
            gLogger.error(
                "RequestFinalization.removeUnlinkedReplicas: Failed to cleanup unlinked Replicas.",
                res['Message'])
        else:
            gLogger.info(
                "RequestFinalization.removeUnlinkedReplicas: Successfully removed unlinked Replicas."
            )
        return res

    def clearReleasedTasks(self):
        # TODO: issue release of the pins associated to this task
        res = self.stagerClient.getTasksWithStatus('Released')
        if not res['OK']:
            gLogger.fatal(
                "RequestFinalization.clearReleasedTasks: Failed to get Released Tasks from StagerDB.",
                res['Message'])
            return res
        stagedTasks = res['Value']
        gLogger.info(
            "RequestFinalization.clearReleasedTasks: Removing %s tasks..." %
            len(stagedTasks))
        res = self.stagerClient.removeTasks(stagedTasks.keys())
        if not res['OK']:
            gLogger.error(
                "RequestFinalization.clearReleasedTasks: Failed to remove tasks.",
                res['Message'])
            return res
        gLogger.info("RequestFinalization.clearReleasedTasks: ...removed.")
        return S_OK()

    def setOldTasksAsFailed(self, daysOld):
        gLogger.debug(
            "RequestFinalization.setOldTasksAsFailed: Attempting....")
        res = self.stagerClient.setOldTasksAsFailed(daysOld)
        if not res['OK']:
            gLogger.error(
                "RequestFinalization.setOldTasksAsFailed: Failed to set old tasks to a Failed state.",
                res['Message'])
            return res
        return S_OK()
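The callback strings used throughout these examples, such as 'updateJobFromStager@WorkloadManagement/JobStateUpdate', encode a method name and a service location separated by '@'. A stripped-down sketch of the resolution performed in __performCallback above; the RPCClient import path is an assumption:

from DIRAC.Core.DISET.RPCClient import RPCClient  # assumed import path

def performCallback(status, callback, sourceTask):
    # 'method@System/Service' -> call Service.method(sourceTask, status)
    method, service = callback.split('@')
    client = RPCClient(service)
    return getattr(client, method)(sourceTask, status)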
Example #36
0
args = Script.getPositionalArgs()

if len( args ) < 1:
  Script.showHelp()

from DIRAC import exit as DIRACExit, gLogger

try:
  jobIDs = [int( arg ) for arg in args]
except:
  gLogger.fatal( 'DIRAC Job IDs must be integers' )
  DIRACExit( 2 )

from DIRAC.StorageManagementSystem.Client.StorageManagerClient import StorageManagerClient
client = StorageManagerClient()

outStr = "\n"
for jobID in jobIDs:
  res = client.getTaskSummary( jobID )
  if not res['OK']:
    gLogger.error( res['Message'] )
    continue
  if not res['Value']:
    gLogger.notice( 'No info for job %s, probably gone from the stager...' % jobID )
    continue
  taskInfo = res['Value']['TaskInfo']
  replicaInfo = res['Value']['ReplicaInfo']
  outStr = "%s: %s" % ( 'JobID'.ljust( 20 ), jobID )
  outStr += "\n%s: %s" % ( 'Status'.ljust( 20 ), taskInfo[str( jobID )]['Status'] )
  outStr += "\n%s: %s" % ( 'SubmitTime'.ljust( 20 ), taskInfo[str( jobID )]['SubmitTime'] )
stageLfns = {}

if os.path.exists( fileName ):
  try:
    lfnFile = open( fileName )
    lfns = [ k.strip() for k in lfnFile.readlines() ]
    lfnFile.close()
  except Exception:
    DIRAC.gLogger.exception( 'Can not open file', fileName )
    DIRAC.exit( -1 )
else:
  lfns = args[1:]

stageLfns[seName] = lfns
stagerClient = StorageManagerClient()

res = stagerClient.setRequest( stageLfns, 'WorkloadManagement',
                                      'updateJobFromStager@WorkloadManagement/JobStateUpdate',
                                      0 ) # fake JobID = 0
if not res['OK']:
  DIRAC.gLogger.error( res['Message'] )
  DIRAC.exit( -1 )
else:
  print "Stage request submitted for LFNs:\n %s" %lfns
  print "SE= %s" %seName
  print "You can check their status and progress with dirac-stager-monitor-file <LFN> <SE>"

'''Example1:
dirac-stager-stage-files.py GRIDKA-RDST filesToStage.txt 
Stage request submitted for LFNs:
Example #38
0
stageLfns = {}

if os.path.exists(fileName):
  try:
    lfnFile = open(fileName)
    lfns = [k.strip() for k in lfnFile.readlines()]
    lfnFile.close()
  except Exception:
    gLogger.exception('Can not open file', fileName)
    DIRACExit(-1)
else:
  lfns = args[:len(args) - 1]

stageLfns[seName] = lfns
stagerClient = StorageManagerClient()

res = stagerClient.setRequest(stageLfns, 'WorkloadManagement',
                              'updateJobFromStager@WorkloadManagement/JobStateUpdate',
                              0)  # fake JobID = 0
if not res['OK']:
  gLogger.error(res['Message'])
  DIRACExit(-1)
else:
  gLogger.notice("Stage request submitted for LFNs:\n %s" % lfns)
  gLogger.notice("SE= %s" % seName)
  gLogger.notice("You can check their status and progress with dirac-stager-monitor-file <LFN> <SE>")

'''Example1:
dirac-stager-stage-files.py filesToStage.txt GRIDKA-RDST
Stage request submitted for LFNs:
Example #39
0
class RequestPreparationAgent(AgentModule):
    def initialize(self):
        self.fileCatalog = FileCatalog()
        self.dm = DataManager()
        self.stagerClient = StorageManagerClient()
        self.dataIntegrityClient = DataIntegrityClient()
        # This sets the Default Proxy to used as that defined under
        # /Operations/Shifter/DataManager
        # the shifterProxy option in the Configuration can be used to change this default.
        self.am_setOption("shifterProxy", "DataManager")

        return S_OK()

    def execute(self):
        """This is the first logical task to be executed and manages the New->Waiting transition of the Replicas"""
        res = self.__getNewReplicas()
        if not res["OK"]:
            gLogger.fatal(
                "RequestPreparation.prepareNewReplicas: Failed to get replicas from StagerDB.", res["Message"]
            )
            return res
        if not res["Value"]:
            gLogger.info("There were no New replicas found")
            return res
        replicas = res["Value"]["Replicas"]
        replicaIDs = res["Value"]["ReplicaIDs"]
        gLogger.info(
            "RequestPreparation.prepareNewReplicas: Obtained %s New replicas for preparation." % len(replicaIDs)
        )

        # Check if the files exist in the FileCatalog
        res = self.__getExistingFiles(replicas)
        if not res["OK"]:
            return res
        exist = res["Value"]["Exist"]
        terminal = res["Value"]["Missing"]
        failed = res["Value"]["Failed"]
        if not exist:
            gLogger.error("RequestPreparation.prepareNewReplicas: Failed to determine the existence of any file")
            return S_OK()
        terminalReplicaIDs = {}
        for lfn, reason in terminal.items():
            for replicaID in replicas[lfn].values():
                terminalReplicaIDs[replicaID] = reason
            replicas.pop(lfn)
        gLogger.info("RequestPreparation.prepareNewReplicas: %s files exist in the FileCatalog." % len(exist))
        if terminal:
            gLogger.info(
                "RequestPreparation.prepareNewReplicas: %s files do not exist in the FileCatalog." % len(terminal)
            )

        # Obtain the file sizes from the FileCatalog
        res = self.__getFileSize(exist)
        if not res["OK"]:
            return res
        failed.update(res["Value"]["Failed"])
        terminal = res["Value"]["ZeroSize"]
        fileSizes = res["Value"]["FileSizes"]
        if not fileSizes:
            gLogger.error("RequestPreparation.prepareNewReplicas: Failed determine sizes of any files")
            return S_OK()
        for lfn, reason in terminal.items():
            for _se, replicaID in replicas[lfn].items():
                terminalReplicaIDs[replicaID] = reason
            replicas.pop(lfn)
        gLogger.info(
            "RequestPreparation.prepareNewReplicas: Obtained %s file sizes from the FileCatalog." % len(fileSizes)
        )
        if terminal:
            gLogger.info(
                "RequestPreparation.prepareNewReplicas: %s files registered with zero size in the FileCatalog."
                % len(terminal)
            )

        # Obtain the replicas from the FileCatalog
        res = self.__getFileReplicas(list(fileSizes))
        if not res["OK"]:
            return res
        failed.update(res["Value"]["Failed"])
        terminal = res["Value"]["ZeroReplicas"]
        fileReplicas = res["Value"]["Replicas"]
        if not fileReplicas:
            gLogger.error("RequestPreparation.prepareNewReplicas: Failed determine replicas for any files")
            return S_OK()
        for lfn, reason in terminal.items():
            for _se, replicaID in replicas[lfn].items():
                terminalReplicaIDs[replicaID] = reason
            replicas.pop(lfn)
        gLogger.info(
            "RequestPreparation.prepareNewReplicas: Obtained replica information for %s file from the FileCatalog."
            % len(fileReplicas)
        )
        if terminal:
            gLogger.info(
                "RequestPreparation.prepareNewReplicas: %s files registered with zero replicas in the FileCatalog."
                % len(terminal)
            )

        # Check the replicas exist at the requested site
        replicaMetadata = []
        for lfn, requestedSEs in replicas.items():
            lfnReplicas = fileReplicas.get(lfn)

            # This should not happen in principle, but it was seen
            # after a corrupted staging request has entered the DB
            if not lfnReplicas:
                gLogger.error("Missing replicas information", "%s %s" % (lfn, requestedSEs))
                continue

            for requestedSE, replicaID in requestedSEs.items():
                if requestedSE not in lfnReplicas.keys():
                    terminalReplicaIDs[replicaID] = "LFN not registered at requested SE"
                    replicas[lfn].pop(requestedSE)
                else:
                    replicaMetadata.append((replicaID, lfnReplicas[requestedSE], fileSizes[lfn]))

        # Update the states of the files in the database
        if terminalReplicaIDs:
            gLogger.info(
                "RequestPreparation.prepareNewReplicas: %s replicas are terminally failed." % len(terminalReplicaIDs)
            )
            # res = self.stagerClient.updateReplicaFailure( terminalReplicaIDs )
            res = self.stagerClient.updateReplicaFailure(terminalReplicaIDs)
            if not res["OK"]:
                gLogger.error(
                    "RequestPreparation.prepareNewReplicas: Failed to update replica failures.", res["Message"]
                )
        if replicaMetadata:
            gLogger.info(
                "RequestPreparation.prepareNewReplicas: %s replica metadata to be updated." % len(replicaMetadata)
            )
            # Sets the Status='Waiting' of CacheReplicas records that are OK with catalogue checks
            res = self.stagerClient.updateReplicaInformation(replicaMetadata)
            if not res["OK"]:
                gLogger.error(
                    "RequestPreparation.prepareNewReplicas: Failed to update replica metadata.", res["Message"]
                )
        return S_OK()

    def __getNewReplicas(self):
        """This obtains the New replicas from the Replicas table and for each LFN the requested storage element"""
        # First obtain the New replicas from the CacheReplicas table
        res = self.stagerClient.getCacheReplicas({"Status": "New"})
        if not res["OK"]:
            gLogger.error(
                "RequestPreparation.__getNewReplicas: Failed to get replicas with New status.", res["Message"]
            )
            return res
        if not res["Value"]:
            gLogger.debug("RequestPreparation.__getNewReplicas: No New replicas found to process.")
            return S_OK()
        else:
            gLogger.debug(
                "RequestPreparation.__getNewReplicas: Obtained %s New replicas(s) to process." % len(res["Value"])
            )
        replicas = {}
        replicaIDs = {}
        for replicaID, info in res["Value"].items():
            lfn = info["LFN"]
            storageElement = info["SE"]
            replicas.setdefault(lfn, {})[storageElement] = replicaID
            replicaIDs[replicaID] = (lfn, storageElement)
        return S_OK({"Replicas": replicas, "ReplicaIDs": replicaIDs})

    def __getExistingFiles(self, lfns):
        """This checks that the files exist in the FileCatalog."""
        res = self.fileCatalog.exists(list(set(lfns)))
        if not res["OK"]:
            gLogger.error(
                "RequestPreparation.__getExistingFiles: Failed to determine whether files exist.", res["Message"]
            )
            return res
        failed = res["Value"]["Failed"]
        success = res["Value"]["Successful"]
        exist = [lfn for lfn, exists in success.items() if exists]
        missing = list(set(success) - set(exist))
        if missing:
            reason = "LFN not registered in the FC"
            gLogger.warn("RequestPreparation.__getExistingFiles: %s" % reason, "\n".join([""] + missing))
            self.__reportProblematicFiles(missing, "LFN-LFC-DoesntExist")
            missing = dict.fromkeys(missing, reason)
        else:
            missing = {}
        return S_OK({"Exist": exist, "Missing": missing, "Failed": failed})

    def __getFileSize(self, lfns):
        """This obtains the file size from the FileCatalog."""
        fileSizes = {}
        zeroSize = {}
        res = self.fileCatalog.getFileSize(lfns)
        if not res["OK"]:
            gLogger.error("RequestPreparation.__getFileSize: Failed to get sizes for files.", res["Message"])
            return res
        failed = res["Value"]["Failed"]
        for lfn, size in res["Value"]["Successful"].items():
            if size == 0:
                zeroSize[lfn] = "LFN registered with zero size in the FileCatalog"
            else:
                fileSizes[lfn] = size
        if zeroSize:
            for lfn, reason in zeroSize.items():
                gLogger.warn("RequestPreparation.__getFileSize: %s" % reason, lfn)
            self.__reportProblematicFiles(zeroSize.keys(), "LFN-LFC-ZeroSize")
        return S_OK({"FileSizes": fileSizes, "ZeroSize": zeroSize, "Failed": failed})

    def __getFileReplicas(self, lfns):
        """This obtains the replicas from the FileCatalog."""
        replicas = {}
        noReplicas = {}
        res = self.dm.getActiveReplicas(lfns)
        if not res["OK"]:
            gLogger.error("RequestPreparation.__getFileReplicas: Failed to obtain file replicas.", res["Message"])
            return res
        failed = res["Value"]["Failed"]
        for lfn, lfnReplicas in res["Value"]["Successful"].items():
            if len(lfnReplicas) == 0:
                noReplicas[lfn] = "LFN registered with zero replicas in the FileCatalog"
            else:
                replicas[lfn] = lfnReplicas
        if noReplicas:
            for lfn, reason in noReplicas.items():
                gLogger.warn("RequestPreparation.__getFileReplicas: %s" % reason, lfn)
            self.__reportProblematicFiles(list(noReplicas), "LFN-LFC-NoReplicas")
        return S_OK({"Replicas": replicas, "ZeroReplicas": noReplicas, "Failed": failed})

    def __reportProblematicFiles(self, lfns, reason):
        return S_OK()
        res = self.dataIntegrityClient.setFileProblematic(lfns, reason, sourceComponent="RequestPreparationAgent")
        if not res["OK"]:
            gLogger.error(
                "RequestPreparation.__reportProblematicFiles: Failed to report missing files.", res["Message"]
            )
            return res
        if res["Value"]["Successful"]:
            gLogger.info(
                "RequestPreparation.__reportProblematicFiles: Successfully reported %s missing files."
                % len(res["Value"]["Successful"])
            )
        if res["Value"]["Failed"]:
            gLogger.info(
                "RequestPreparation.__reportProblematicFiles: Failed to report %s problematic files."
                % len(res["Value"]["Failed"])
            )
        return res
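The dictionaries built by __getNewReplicas drive the rest of the preparation pass: 'Replicas' maps each LFN to its requested SEs and replica IDs, while 'ReplicaIDs' maps each replica ID back to its (LFN, SE) pair. A tiny sketch with hypothetical values:

# hypothetical shapes of the structures returned by __getNewReplicas()
replicas = {
    "/vo/data/file1": {"CERN-RAW": 101, "GRIDKA-RAW": 102},
}
replicaIDs = {
    101: ("/vo/data/file1", "CERN-RAW"),
    102: ("/vo/data/file1", "GRIDKA-RAW"),
}
# the catalogue checks prune 'replicas'; the surviving (replicaID, catalogReplica, size)
# tuples are then passed to stagerClient.updateReplicaInformation()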
Example #40
0
  def requestStage( self, jobState, candidates, lfnData ):
    #Any site is as good as any so random time!
    stageSite = random.sample( candidates, 1 )[0]
    self.jobLog.info( "Site selected %s for staging" % stageSite )
    result = getSEsForSite( stageSite )
    if not result['OK']:
      return S_ERROR( 'Could not determine SEs for site %s' % stageSite )
    siteSEs = result['Value']

    tapeSEs = []
    diskSEs = []
    for seName in siteSEs:
      result = self.__getSEStatus( seName )
      if not result[ 'OK' ]:
        self.jobLog.error( "Cannot retrieve SE %s status: %s" % ( seName, result[ 'Message' ] ) )
        return S_ERROR( "Cannot retrieve SE status" )
      seStatus = result[ 'Value' ]
      if seStatus[ 'Read' ] and seStatus[ 'TapeSE' ]:
        tapeSEs.append( seName )
      if seStatus[ 'Read' ] and seStatus[ 'DiskSE' ]:
        diskSEs.append( seName )

    if not tapeSEs:
      return S_ERROR( "No Local SEs for site %s" % stageSite )

    self.jobLog.verbose( "Tape SEs are %s" % ( ", ".join( tapeSEs ) ) )

    stageLFNs = {}
    lfnToStage = []
    for lfn in lfnData:
      replicas = lfnData[ lfn ][ 'Replicas' ]
      # Check SEs
      seStage = []
      for seName in replicas:
        _surl = replicas[ seName ][ 'SURL' ]
        if seName in diskSEs:
          # This lfn is in disk. Skip it
          seStage = []
          break
        if seName not in tapeSEs:
          # This lfn is not in this tape SE. Check next SE
          continue
        seStage.append( seName )
      for seName in seStage:
        if seName not in stageLFNs:
          stageLFNs[ seName ] = []
        stageLFNs[ seName ].append( lfn )
        if lfn not in lfnToStage:
          lfnToStage.append( lfn )

    if not stageLFNs:
      return S_ERROR( "Cannot find tape replicas" )

    # Check if any LFN is in more than one SE
    # If that's the case, try to stage from the SE that has more LFNs to stage to group the request
    # 1.- Get the SEs ordered by descending number of LFNs to stage (a list, so it can be traversed once per LFN)
    sortedSEs = sorted( [ ( len( stageLFNs[ seName ] ), seName ) for seName in stageLFNs ], reverse = True )
    for lfn in lfnToStage:
      found = False
      # 2.- Traverse the SEs
      for _stageCount, seName in sortedSEs:
        if lfn in stageLFNs[ seName ]:
          # 3.- If first time found, just mark as found. Next time delete the replica from the request
          if found:
            stageLFNs[ seName ].remove( lfn )
          else:
            found = True
        # 4.-If empty SE, remove
        if len( stageLFNs[ seName ] ) == 0:
          stageLFNs.pop( seName )

    self.jobLog.info( "Stage request will be \n\t%s" % "\n\t".join( [ "%s:%s" % ( lfn, stageLFNs[ lfn ] ) for lfn in stageLFNs ] ) )

    stagerClient = StorageManagerClient()
    result = stagerClient.setRequest( stageLFNs, 'WorkloadManagement',
                                      'stageCallback@WorkloadManagement/OptimizationMind',
                                      int( jobState.jid ) )
    if not result[ 'OK' ]:
      self.jobLog.error( "Could not send stage request: %s" % result[ 'Message' ] )
      return S_ERROR( "Problem sending staging request" )

    rid = str( result[ 'Value' ] )
    self.jobLog.info( "Stage request %s sent" % rid )
    jobState.setParameter( "StageRequest", rid )
    result = jobState.setStatus( self.ex_getOption( 'StagingStatus', 'Staging' ),
                                 self.ex_getOption( 'StagingMinorStatus', 'Request Sent' ),
                                 appStatus = "",
                                 source = self.ex_optimizerName() )
    if not result[ 'OK' ]:
      return result

    stageCandidates = []
    for seName in stageLFNs:
      result = self.__getSitesForSE( seName )
      if result[ 'OK' ]:
        stageCandidates.append( result[ 'Value' ] )

    stageCandidates = candidates.intersection( *[ sC for sC in stageCandidates ] ).union( [ stageSite ] )
    return S_OK( stageCandidates )
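The grouping step above keeps each LFN in a single SE's request, preferring the SE that already has the most files to stage. A toy illustration of the same idea with hypothetical data:

stageLFNs = {"SE-A": ["/lfn/1"], "SE-B": ["/lfn/1", "/lfn/2"]}  # hypothetical input
sortedSEs = sorted(((len(lfns), se) for se, lfns in stageLFNs.items()), reverse=True)
alreadyAssigned = set()
for _count, se in sortedSEs:
    stageLFNs[se] = [lfn for lfn in stageLFNs[se] if lfn not in alreadyAssigned]
    alreadyAssigned.update(stageLFNs[se])
stageLFNs = {se: lfns for se, lfns in stageLFNs.items() if lfns}
print(stageLFNs)  # {'SE-B': ['/lfn/1', '/lfn/2']}; '/lfn/1' is requested only once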
Example #41
0
def run():

    from DIRAC.StorageManagementSystem.Client.StorageManagerClient import StorageManagerClient
    client = StorageManagerClient()
    queryDict = {}

    if 'status' in switchDict:
        queryDict['Status'] = str(switchDict['status'])

    if 'se' in switchDict:
        queryDict['SE'] = str(switchDict['se'])

    # weird: if there are no switches (dictionary is empty), then the --limit is ignored!!
    # must FIX that in StorageManagementDB.py!
    # ugly fix:
    newer = '1903-08-02 06:24:38'  # select newer than
    if 'limit' in switchDict:
        gLogger.notice("Query limited to %s entries" % switchDict['limit'])
        res = client.getCacheReplicas(queryDict, None, newer, None, None,
                                      int(switchDict['limit']))
    else:
        res = client.getCacheReplicas(queryDict)

    if not res['OK']:
        gLogger.error(res['Message'])
    outStr = "\n"
    if res['Records']:
        replicas = res['Value']
        outStr += " %s" % ("Status".ljust(15))
        outStr += " %s" % ("LastUpdate".ljust(20))
        outStr += " %s" % ("LFN".ljust(80))
        outStr += " %s" % ("SE".ljust(10))
        outStr += " %s" % ("Reason".ljust(10))
        if 'showJobs' in switchDict:
            outStr += " %s" % ("Jobs".ljust(10))
        outStr += " %s" % ("PinExpiryTime".ljust(15))
        outStr += " %s" % ("PinLength(sec)".ljust(15))
        outStr += "\n"

        for crid, info in replicas.items():
            outStr += " %s" % (info['Status'].ljust(15))
            outStr += " %s" % (str(info['LastUpdate']).ljust(20))
            outStr += " %s" % (info['LFN'].ljust(30))
            outStr += " %s" % (info['SE'].ljust(15))
            outStr += " %s" % (str(info['Reason']).ljust(10))

            # Task info
            if 'showJobs' in switchDict:
                resTasks = client.getTasks({'ReplicaID': crid})
                if resTasks['OK']:
                    if resTasks['Value']:
                        tasks = resTasks['Value']
                        jobs = []
                        for tid in tasks:
                            jobs.append(tasks[tid]['SourceTaskID'])
                        outStr += ' %s ' % (str(jobs).ljust(10))
                else:
                    outStr += ' %s ' % (" --- ".ljust(10))
            # Stage request info
            # what if there's no request to the site yet?
            resStageRequests = client.getStageRequests({'ReplicaID': crid})
            if not resStageRequests['OK']:
                gLogger.error(resStageRequests['Message'])
            if resStageRequests['Records']:
                stageRequests = resStageRequests['Value']
                for info in stageRequests.values():
                    outStr += " %s" % (str(info['PinExpiryTime']).ljust(20))
                    outStr += " %s" % (str(info['PinLength']).ljust(10))
            outStr += "\n"

        gLogger.notice(outStr)
    else:
        gLogger.notice("No entries")
Example #42
0
    def __kill_delete_jobs(self, jobIDList, right):
        """  Kill or delete jobs as necessary
    """

        jobList = self.__get_job_list(jobIDList)
        if not jobList:
            return S_ERROR('Invalid job specification: ' + str(jobIDList))

        validJobList, invalidJobList, nonauthJobList, ownerJobList = self.jobPolicy.evaluateJobRights(
            jobList, right)

        # Get job status to see what is to be killed or deleted
        result = gJobDB.getAttributesForJobList(validJobList, ['Status'])
        if not result['OK']:
            return result
        killJobList = []
        deleteJobList = []
        markKilledJobList = []
        stagingJobList = []
        for jobID, sDict in result['Value'].items():
            if sDict['Status'] in ['Running', 'Matched', 'Stalled']:
                killJobList.append(jobID)
            elif sDict['Status'] in ['Done', 'Failed']:
                if not right == RIGHT_KILL:
                    deleteJobList.append(jobID)
            else:
                markKilledJobList.append(jobID)
            if sDict['Status'] in ['Staging']:
                stagingJobList.append(jobID)

        bad_ids = []
        for jobID in markKilledJobList:
            result = self.__killJob(jobID, sendKillCommand=False)
            if not result['OK']:
                bad_ids.append(jobID)

        for jobID in killJobList:
            result = self.__killJob(jobID)
            if not result['OK']:
                bad_ids.append(jobID)

        for jobID in deleteJobList:
            result = self.__deleteJob(jobID)
            if not result['OK']:
                bad_ids.append(jobID)

        if stagingJobList:
            stagerClient = StorageManagerClient()
            gLogger.info('Going to send killing signal to stager as well!')
            result = stagerClient.killTasksBySourceTaskID(stagingJobList)
            if not result['OK']:
                gLogger.warn('Failed to kill some Stager tasks: %s' %
                             result['Message'])

        if nonauthJobList or bad_ids:
            result = S_ERROR('Some jobs failed deletion')
            if nonauthJobList:
                result['NonauthorizedJobIDs'] = nonauthJobList
            if bad_ids:
                result['FailedJobIDs'] = bad_ids
            return result

        result = S_OK(validJobList)
        result['requireProxyUpload'] = len(
            ownerJobList) > 0 and self.__checkIfProxyUploadIsRequired()

        if invalidJobList:
            result['InvalidJobIDs'] = invalidJobList

        return result
Example #43
0
    def __setStagingRequest(self, job, destination, inputDataDict):
        """A Staging request is formulated and saved as a job optimizer parameter.
    """

        self.log.verbose("Destination site %s" % (destination))
        self.log.verbose("Input Data: %s" % (inputDataDict))

        destinationSEs = getSEsForSite(destination)
        if not destinationSEs["OK"]:
            return S_ERROR("Could not determine SEs for site %s" % destination)
        destinationSEs = destinationSEs["Value"]

        siteTapeSEs = []
        siteDiskSEs = []
        for se in destinationSEs:
            storageElement = StorageElement(se)
            seStatus = storageElement.getStatus()["Value"]
            if seStatus["Read"] and seStatus["TapeSE"]:
                siteTapeSEs.append(se)
            if seStatus["Read"] and seStatus["DiskSE"]:
                siteDiskSEs.append(se)

        if not siteTapeSEs:
            return S_ERROR("No LocalSEs For Site")

        self.log.verbose("Site tape SEs: %s" % (", ".join(siteTapeSEs)))
        stageSURLs = {}  # OLD WAY
        stageLfns = {}  # NEW WAY

        inputData = inputDataDict["Value"]["Value"]["Successful"]
        for lfn, reps in inputData.items():
            for se, surl in reps.items():
                if se in siteDiskSEs:
                    # this File is on Disk, we can ignore it
                    break
                if lfn not in stageSURLs:
                    stageSURLs[lfn] = {}
                    stageSURLs[lfn].update({se: surl})
                    if se not in stageLfns:  # NEW WAY
                        stageLfns[se] = []  # NEW WAY
                    stageLfns[se].append(lfn)  # NEW WAY

        # Now we need to check is any LFN is in more than one SE
        if len(stageLfns) > 1:
            stageSEs = sorted([(len(stageLfns[se]), se) for se in stageLfns.keys()])
            for lfn in stageSURLs:
                lfnFound = False
                for (numberOfLfns, se) in reversed(stageSEs):
                    if lfnFound and lfn in stageLfns[se]:
                        stageLfns[se].remove(lfn)
                    if lfn in stageLfns[se]:
                        lfnFound = True

        stagerClient = StorageManagerClient()
        request = stagerClient.setRequest(
            stageLfns, "WorkloadManagement", "updateJobFromStager@WorkloadManagement/JobStateUpdate", job
        )
        if request["OK"]:
            self.jobDB.setJobParameter(int(job), "StageRequest", str(request["Value"]))

        if not request["OK"]:
            self.log.error("Problem sending Staging request:")
            self.log.error(request)
            return S_ERROR("Error Sending Staging Request")
        else:
            self.log.info("Staging request successfully sent")

        result = self.updateJobStatus(job, self.stagingStatus, self.stagingMinorStatus)
        if not result["OK"]:
            return result
        return S_OK(stageLfns)
Example #44
0
    def __kill_delete_jobs(self, jobIDList, right):
        """  Kill or delete jobs as necessary
    """

        jobList = self.__get_job_list(jobIDList)
        if not jobList:
            return S_ERROR("Invalid job specification: " + str(jobIDList))

        validJobList, invalidJobList, nonauthJobList, ownerJobList = self.jobPolicy.evaluateJobRights(jobList, right)

        # Get job status to see what is to be killed or deleted
        result = gJobDB.getAttributesForJobList(validJobList, ["Status"])
        if not result["OK"]:
            return result
        killJobList = []
        deleteJobList = []
        markKilledJobList = []
        stagingJobList = []
        for jobID, sDict in result["Value"].items():
            if sDict["Status"] in ["Running", "Matched", "Stalled"]:
                killJobList.append(jobID)
            elif sDict["Status"] in ["Done", "Failed"]:
                if not right == RIGHT_KILL:
                    deleteJobList.append(jobID)
            else:
                markKilledJobList.append(jobID)
            if sDict["Status"] in ["Staging"]:
                stagingJobList.append(jobID)

        bad_ids = []
        for jobID in markKilledJobList:
            result = self.__killJob(jobID, sendKillCommand=False)
            if not result["OK"]:
                bad_ids.append(jobID)

        for jobID in killJobList:
            result = self.__killJob(jobID)
            if not result["OK"]:
                bad_ids.append(jobID)

        for jobID in deleteJobList:
            result = self.__deleteJob(jobID)
            if not result["OK"]:
                bad_ids.append(jobID)

        if stagingJobList:
            stagerClient = StorageManagerClient()
            gLogger.info("Going to send killing signal to stager as well!")
            result = stagerClient.killTasksBySourceTaskID(stagingJobList)
            if not result["OK"]:
                gLogger.warn("Failed to kill some Stager tasks: %s" % result["Message"])

        if nonauthJobList or bad_ids:
            result = S_ERROR("Some jobs failed deletion")
            if nonauthJobList:
                result["NonauthorizedJobIDs"] = nonauthJobList
            if bad_ids:
                result["FailedJobIDs"] = bad_ids
            return result

        result = S_OK(validJobList)
        result["requireProxyUpload"] = len(ownerJobList) > 0 and self.__checkIfProxyUploadIsRequired()

        if invalidJobList:
            result["InvalidJobIDs"] = invalidJobList

        return result
Example #45
0
class StageMonitorAgent(AgentModule):
    def initialize(self):
        self.stagerClient = StorageManagerClient()
        # This sets the Default Proxy to used as that defined under
        # /Operations/Shifter/DataManager
        # the shifterProxy option in the Configuration can be used to change this default.
        self.am_setOption('shifterProxy', 'DataManager')

        return S_OK()

    def execute(self):

        res = getProxyInfo(disableVOMS=True)
        if not res['OK']:
            return res
        self.proxyInfoDict = res['Value']

        res = self.monitorStageRequests()

        return res

    def monitorStageRequests(self):
        """ This is the third logical task manages the StageSubmitted->Staged transition of the Replicas
    """
        res = self.__getStageSubmittedReplicas()
        if not res['OK']:
            gLogger.fatal(
                "StageMonitor.monitorStageRequests: Failed to get replicas from StorageManagementDB.",
                res['Message'])
            return res
        if not res['Value']:
            gLogger.info(
                "StageMonitor.monitorStageRequests: There were no StageSubmitted replicas found"
            )
            return res
        seReplicas = res['Value']['SEReplicas']
        replicaIDs = res['Value']['ReplicaIDs']
        gLogger.info(
            "StageMonitor.monitorStageRequests: Obtained %s StageSubmitted replicas for monitoring."
            % len(replicaIDs))
        for storageElement, seReplicaIDs in seReplicas.items():
            self.__monitorStorageElementStageRequests(storageElement,
                                                      seReplicaIDs, replicaIDs)

        gDataStoreClient.commit()

        return S_OK()

    def __monitorStorageElementStageRequests(self, storageElement,
                                             seReplicaIDs, replicaIDs):
        terminalReplicaIDs = {}
        oldRequests = []
        stagedReplicas = []

        # Since we are in a given SE, the LFN is a unique key
        lfnRepIDs = {}
        lfnReqIDs = {}
        for replicaID in seReplicaIDs:
            lfn = replicaIDs[replicaID]['LFN']
            lfnRepIDs[lfn] = replicaID
            requestID = replicaIDs[replicaID].get('RequestID', None)
            if requestID:
                lfnReqIDs[lfn] = replicaIDs[replicaID]['RequestID']

        gLogger.info(
            "StageMonitor.__monitorStorageElementStageRequests: Monitoring %s stage requests for %s."
            % (len(lfnRepIDs), storageElement))
        oAccounting = DataOperation()
        oAccounting.setStartTime()

        res = StorageElement(storageElement).getFileMetadata(lfnReqIDs)
        if not res['OK']:
            gLogger.error(
                "StageMonitor.__monitorStorageElementStageRequests: Completely failed to monitor stage requests for replicas.",
                res['Message'])
            return
        prestageStatus = res['Value']

        accountingDict = self.__newAccountingDict(storageElement)

        for lfn, reason in prestageStatus['Failed'].items():
            accountingDict['TransferTotal'] += 1
            if re.search('File does not exist', reason):
                gLogger.error(
                    "StageMonitor.__monitorStorageElementStageRequests: LFN did not exist in the StorageElement",
                    lfn)
                terminalReplicaIDs[
                    lfnRepIDs[lfn]] = 'LFN did not exist in the StorageElement'
        for lfn, staged in prestageStatus['Successful'].items():
            if staged and 'Cached' in staged and staged['Cached']:
                accountingDict['TransferTotal'] += 1
                accountingDict['TransferOK'] += 1
                accountingDict['TransferSize'] += staged['Size']
                stagedReplicas.append(lfnRepIDs[lfn])
            if staged and 'Cached' in staged and not staged['Cached']:
                oldRequests.append(lfnRepIDs[lfn])
                # only ReplicaIDs

        oAccounting.setValuesFromDict(accountingDict)
        oAccounting.setEndTime()
        gDataStoreClient.addRegister(oAccounting)

        # Update the states of the replicas in the database
        if terminalReplicaIDs:
            gLogger.info(
                "StageMonitor.__monitorStorageElementStageRequests: %s replicas are terminally failed."
                % len(terminalReplicaIDs))
            res = self.stagerClient.updateReplicaFailure(terminalReplicaIDs)
            if not res['OK']:
                gLogger.error(
                    "StageMonitor.__monitorStorageElementStageRequests: Failed to update replica failures.",
                    res['Message'])
        if stagedReplicas:
            gLogger.info(
                "StageMonitor.__monitorStorageElementStageRequests: %s staged replicas to be updated."
                % len(stagedReplicas))
            res = self.stagerClient.setStageComplete(stagedReplicas)
            if not res['OK']:
                gLogger.error(
                    "StageMonitor.__monitorStorageElementStageRequests: Failed to updated staged replicas.",
                    res['Message'])
            res = self.stagerClient.updateReplicaStatus(
                stagedReplicas, 'Staged')
            if not res['OK']:
                gLogger.error(
                    "StageMonitor.__monitorStorageElementStageRequests: Failed to insert replica status.",
                    res['Message'])
        if oldRequests:
            gLogger.info(
                "StageMonitor.__monitorStorageElementStageRequests: %s old requests will be retried."
                % len(oldRequests))
            res = self.__wakeupOldRequests(oldRequests)
            if not res['OK']:
                gLogger.error(
                    "StageMonitor.__monitorStorageElementStageRequests: Failed to wakeup old requests.",
                    res['Message'])
        return

    def __newAccountingDict(self, storageElement):
        """ Generate a new accounting Dict """

        accountingDict = {}
        accountingDict['OperationType'] = 'Stage'
        accountingDict['User'] = self.proxyInfoDict['username']
        accountingDict['Protocol'] = 'Stager'
        accountingDict['RegistrationTime'] = 0.0
        accountingDict['RegistrationOK'] = 0
        accountingDict['RegistrationTotal'] = 0
        accountingDict['FinalStatus'] = 'Successful'
        accountingDict['Source'] = storageElement
        accountingDict['Destination'] = storageElement
        accountingDict['ExecutionSite'] = siteName()
        accountingDict['TransferTotal'] = 0
        accountingDict['TransferOK'] = 0
        accountingDict['TransferSize'] = 0
        accountingDict['TransferTime'] = self.am_getPollingTime()

        return accountingDict

    def __getStageSubmittedReplicas(self):
        """ This obtains the StageSubmitted replicas from the Replicas table and the RequestID from the StageRequests table """
        res = self.stagerClient.getCacheReplicas({'Status': 'StageSubmitted'})
        if not res['OK']:
            gLogger.error(
                "StageMonitor.__getStageSubmittedReplicas: Failed to get replicas with StageSubmitted status.",
                res['Message'])
            return res
        if not res['Value']:
            gLogger.debug(
                "StageMonitor.__getStageSubmittedReplicas: No StageSubmitted replicas found to process."
            )
            return S_OK()
        else:
            gLogger.debug(
                "StageMonitor.__getStageSubmittedReplicas: Obtained %s StageSubmitted replicas(s) to process."
                % len(res['Value']))

        seReplicas = {}
        replicaIDs = res['Value']
        for replicaID, info in replicaIDs.items():
            storageElement = info['SE']
            seReplicas.setdefault(storageElement, []).append(replicaID)

        # The RequestID is missing from the replicaIDs dictionary, so fetch it from the StageRequests table
        res = self.stagerClient.getStageRequests(
            {'ReplicaID': replicaIDs.keys()})
        if not res['OK']:
            return res
        if not res['Value']:
            return S_ERROR(
                'Could not obtain request IDs for replicas %s from StageRequests table'
                % (replicaIDs.keys()))

        for replicaID, info in res['Value'].items():
            reqID = info['RequestID']
            replicaIDs[replicaID]['RequestID'] = reqID

        return S_OK({'SEReplicas': seReplicas, 'ReplicaIDs': replicaIDs})

    def __wakeupOldRequests(self, oldRequests):
        gLogger.info("StageMonitor.__wakeupOldRequests: Attempting...")
        retryInterval = self.am_getOption('RetryIntervalHour', 2)
        res = self.stagerClient.wakeupOldRequests(oldRequests, retryInterval)
        if not res['OK']:
            gLogger.error(
                "StageMonitor.__wakeupOldRequests: Failed to resubmit old requests.",
                res['Message'])
            return res
        return S_OK()
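All of the agent methods above follow DIRAC's result-dictionary convention: every call returns a dict with an 'OK' flag and either a 'Value' or a 'Message' key. A minimal, self-contained sketch of that convention (the function below is hypothetical and only illustrates the pattern):

from DIRAC import S_OK, S_ERROR, gLogger

def exampleLookup(succeed=True):
    # Hypothetical call illustrating the S_OK / S_ERROR convention
    if succeed:
        return S_OK({'ReplicaID': 1, 'Status': 'Staged'})
    return S_ERROR('lookup failed')

res = exampleLookup()
if not res['OK']:
    gLogger.error("Example call failed", res['Message'])
else:
    gLogger.info("Example call succeeded: %s" % res['Value'])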
class RequestFinalizationAgent( AgentModule ):

  def initialize( self ):

    # This sets the Default Proxy to be used as the one defined under
    # /Operations/Shifter/DataManager
    # the shifterProxy option in the Configuration can be used to change this default.
    self.am_setOption( 'shifterProxy', 'DataManager' )
    self.stagerClient = StorageManagerClient()
    return S_OK()

  def execute( self ):
    res = self.clearFailedTasks()
    if not res['OK']:
      return res
    res = self.callbackStagedTasks()
    if not res['OK']:
      return res
    res = self.removeUnlinkedReplicas()
    if not res['OK']:
      return res
    res = self.setOldTasksAsFailed( self.am_getOption( 'FailIntervalDay', 3 ) )
    return res

  def clearFailedTasks( self ):
    """ This obtains the tasks which are marked as Failed and remove all the associated records
    """
    res = self.stagerClient.getTasksWithStatus( 'Failed' )
    if not res['OK']:
      gLogger.fatal( "RequestFinalization.clearFailedTasks: Failed to get Failed Tasks from StagerDB.", res['Message'] )
      return res
    failedTasks = res['Value']
    gLogger.info( "RequestFinalization.clearFailedTasks: Obtained %s tasks in the 'Failed' status." % len( failedTasks ) )
    for taskID, ( _source, callback, sourceTask ) in failedTasks.items():
      if ( callback and sourceTask ):
        res = self.__performCallback( 'Failed', callback, sourceTask )
        if not res['OK']:
          failedTasks.pop( taskID )
    if not failedTasks:
      gLogger.info( "RequestFinalization.clearFailedTasks: No tasks to remove." )
      return S_OK()
    gLogger.info( "RequestFinalization.clearFailedTasks: Removing %s tasks..." % len( failedTasks ) )
    res = self.stagerClient.removeTasks( failedTasks.keys() )
    if not res['OK']:
      gLogger.error( "RequestFinalization.clearFailedTasks: Failed to remove tasks.", res['Message'] )
      return res
    gLogger.info( "RequestFinalization.clearFailedTasks: ...removed." )
    return S_OK()

  def callbackDoneTasks( self ):
    """ This issues the call back message for the Tasks with a State='Done'
    """
    res = self.stagerClient.getTasksWithStatus( 'Done' )
    if not res['OK']:
      gLogger.fatal( "RequestFinalization.callbackDoneTasks: Failed to get Done Tasks from StorageManagementDB.", res['Message'] )
      return res
    doneTasks = res['Value']
    gLogger.info( "RequestFinalization.callbackDoneTasks: Obtained %s tasks in the 'Done' status." % len( doneTasks ) )
    for taskID, ( _source, callback, sourceTask ) in doneTasks.items():
      if ( callback and sourceTask ):
        res = self.__performCallback( 'Done', callback, sourceTask )
        if not res['OK']:
          doneTasks.pop( taskID )
    if not doneTasks:
      gLogger.info( "RequestFinalization.callbackDoneTasks: No tasks to update to Done." )
      return S_OK()
    res = self.stagerClient.removeTasks( doneTasks.keys() )
    if not res['OK']:
      gLogger.fatal( "RequestFinalization.callbackDoneTasks: Failed to remove Done tasks.", res['Message'] )
    return res

  def callbackStagedTasks( self ):
    """ This updates the status of the Tasks to Done then issues the call back message
    """
    res = self.stagerClient.getTasksWithStatus( 'Staged' )
    if not res['OK']:
      gLogger.fatal( "RequestFinalization.callbackStagedTasks: Failed to get Staged Tasks from StagerDB.", res['Message'] )
      return res
    stagedTasks = res['Value']
    gLogger.info( "RequestFinalization.callbackStagedTasks: Obtained %s tasks in the 'Staged' status." % len( stagedTasks ) )
    for taskID, ( _source, callback, sourceTask ) in stagedTasks.items():
      if ( callback and sourceTask ):
        res = self.__performCallback( 'Done', callback, sourceTask )
        if not res['OK']:
          stagedTasks.pop( taskID )
        else:
          gLogger.info( "RequestFinalization.callbackStagedTasks, Task = %s: %s" % ( sourceTask, res['Value'] ) )

    if not stagedTasks:
      gLogger.info( "RequestFinalization.callbackStagedTasks: No tasks to update to Done." )
      return S_OK()
    # Daniela: Why is the line below commented out?
    # res = self.stagerClient.setTasksDone(stagedTasks.keys())
    res = self.stagerClient.removeTasks( stagedTasks.keys() )
    if not res['OK']:
      gLogger.fatal( "RequestFinalization.callbackStagedTasks: Failed to remove staged Tasks.", res['Message'] )
    return res

  def __performCallback( self, status, callback, sourceTask ):
    method, service = callback.split( '@' )
    gLogger.debug( "RequestFinalization.__performCallback: Attempting to perform call back for %s with %s status" % ( sourceTask, status ) )
    client = RPCClient( service )
    gLogger.debug( "RequestFinalization.__performCallback: Created RPCClient to %s" % service )
    gLogger.debug( "RequestFinalization.__performCallback: Attempting to invoke %s service method" % method )
    res = getattr( client, method )( sourceTask, status )
    if not res['OK']:
      gLogger.error( "RequestFinalization.__performCallback: Failed to perform callback", res['Message'] )
    else:
      gLogger.info( "RequestFinalization.__performCallback: Successfully issued callback to %s for %s with %s status" % ( callback, sourceTask, status ) )
    return res

  def removeUnlinkedReplicas( self ):
    gLogger.info( "RequestFinalization.removeUnlinkedReplicas: Attempting to cleanup unlinked Replicas." )
    res = self.stagerClient.removeUnlinkedReplicas()
    if not res['OK']:
      gLogger.error( "RequestFinalization.removeUnlinkedReplicas: Failed to cleanup unlinked Replicas.", res['Message'] )
    else:
      gLogger.info( "RequestFinalization.removeUnlinkedReplicas: Successfully removed unlinked Replicas." )
    return res

  def clearReleasedTasks( self ):
    # TODO: issue release of the pins associated to this task
    res = self.stagerClient.getTasksWithStatus( 'Released' )
    if not res['OK']:
      gLogger.fatal( "RequestFinalization.clearReleasedTasks: Failed to get Released Tasks from StagerDB.", res['Message'] )
      return res
    stagedTasks = res['Value']
    gLogger.info( "RequestFinalization.clearReleasedTasks: Removing %s tasks..." % len( stagedTasks ) )
    res = self.stagerClient.removeTasks( stagedTasks.keys() )
    if not res['OK']:
      gLogger.error( "RequestFinalization.clearReleasedTasks: Failed to remove tasks.", res['Message'] )
      return res
    gLogger.info( "RequestFinalization.clearReleasedTasks: ...removed." )
    return S_OK()

  def setOldTasksAsFailed( self, daysOld ):
    gLogger.debug( "RequestFinalization.setOldTasksAsFailed: Attempting...." )
    res = self.stagerClient.setOldTasksAsFailed( daysOld )
    if not res['OK']:
      gLogger.error( "RequestFinalization.setOldTasksAsFailed: Failed to set old tasks to a Failed state.", res['Message'] )
      return res
    return S_OK()
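__performCallback above splits the stored callback string into a service and a method name and invokes it over DISET. A rough, hedged usage sketch of the same call done by hand (the task ID is invented; the callback string is the one used throughout these snippets):

from DIRAC.Core.DISET.RPCClient import RPCClient

callback = 'updateJobFromStager@WorkloadManagement/JobStateUpdate'
method, service = callback.split( '@' )
client = RPCClient( service )
res = getattr( client, method )( 12345, 'Done' )  # ( sourceTaskID, status ), both illustrative
if not res['OK']:
  print res['Message']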
import DIRAC
from DIRAC.Core.Base import Script

Script.setUsageMessage( '\n'.join( [ 'Usage:',
                                     '  %s [option|cfgfile] ... Request' % Script.scriptName,
                                     'Arguments:',
                                     '  Request:  ID of the Stage request in the StorageManager' ] ) )
Script.parseCommandLine( ignoreErrors = False )

args = Script.getPositionalArgs()

if len( args ) != 1:
  Script.showHelp()

try:
  taskID = int( args[0] )
except ValueError:
  print 'Stage requestID must be an integer'
  DIRAC.exit( 2 )

from DIRAC.StorageManagementSystem.Client.StorageManagerClient import StorageManagerClient
client = StorageManagerClient()

res = client.getTaskSummary( taskID )
if not res['OK']:
  print res['Message']
  DIRAC.exit( 2 )
taskInfo = res['Value']['TaskInfo']
replicaInfo = res['Value']['ReplicaInfo']
outStr = "%s: %s" % ( 'TaskID'.ljust( 20 ), taskID )
outStr = "%s\n%s: %s" % ( outStr, 'Status'.ljust( 20 ), taskInfo[taskID]['Status'] )
outStr = "%s\n%s: %s" % ( outStr, 'Source'.ljust( 20 ), taskInfo[taskID]['Source'] )
outStr = "%s\n%s: %s" % ( outStr, 'SourceTaskID'.ljust( 20 ), taskInfo[taskID]['SourceTaskID'] )
outStr = "%s\n%s: %s" % ( outStr, 'CallBackMethod'.ljust( 20 ), taskInfo[taskID]['CallBackMethod'] )
outStr = "%s\n%s: %s" % ( outStr, 'SubmitTime'.ljust( 20 ), taskInfo[taskID]['SubmitTime'] )
outStr = "%s\n%s: %s" % ( outStr, 'CompleteTime'.ljust( 20 ), taskInfo[taskID]['CompleteTime'] )
for lfn, metadata in replicaInfo.items():
Example #48
0
# Author :  Daniela Remenska
########################################################################
"""
Reports a breakdown of the number and size of files in the different staging states across Storage Elements.
The currently used cache per SE (active pins) is also reported.
"""

__RCSID__ = "6f34186 (2013-06-13 15:12:35 +0200) Daniela <*****@*****.**>"
import DIRAC
from DIRAC.Core.Base import Script
from DIRAC import gConfig, gLogger, exit as DIRACExit, S_OK, version

Script.parseCommandLine(ignoreErrors=False)
from DIRAC.StorageManagementSystem.Client.StorageManagerClient import StorageManagerClient

client = StorageManagerClient()

res = client.getCacheReplicasSummary()
if not res["OK"]:
    print res["Message"]
    DIRACExit(2)
stagerInfo = res["Value"]
outStr = "\n"
outStr = "%s %s" % (outStr, "Status".ljust(20))
outStr = "%s %s" % (outStr, "SE".ljust(20))
outStr = "%s %s" % (outStr, "NumberOfFiles".ljust(20))
outStr = "%s %s" % (outStr, "Size(GB)".ljust(20))
outStr = "%s\n--------------------------------------------------------------------------\n" % outStr
if stagerInfo:
    for sid in stagerInfo:
        outStr = "%s %s" % (outStr, stagerInfo[sid]["Status"].ljust(20))
Example #49
0
  def __setStagingRequest( self, job, destination, inputDataDict ):
    """A Staging request is formulated and saved as a job optimizer parameter.
    """

    self.log.verbose( 'Destination site %s' % ( destination ) )
    self.log.verbose( 'Input Data: %s' % ( inputDataDict ) )

    destinationSEs = getSEsForSite( destination )
    if not destinationSEs['OK']:
      return S_ERROR( 'Could not determine SEs for site %s' % destination )
    destinationSEs = destinationSEs['Value']

    siteTapeSEs = []
    siteDiskSEs = []
    for se in destinationSEs:
      storageElement = StorageElement( se )
      seStatus = storageElement.getStatus()['Value']
      if seStatus['Read'] and seStatus['TapeSE']:
        siteTapeSEs.append( se )
      if seStatus['Read'] and seStatus['DiskSE']:
        siteDiskSEs.append( se )

    if not siteTapeSEs:
      return S_ERROR( 'No LocalSEs For Site' )

    self.log.verbose( 'Site tape SEs: %s' % ( ', '.join( siteTapeSEs ) ) )
    stageSURLs = {} # OLD WAY
    stageLfns = {} # NEW WAY

    inputData = inputDataDict['Value']['Value']['Successful']
    for lfn, reps in inputData.items():
      for se, surl in reps.items():
        if se in siteDiskSEs:
          # this File is on Disk, we can ignore it
          break
        if se not in siteTapeSEs:
          # this File is not being staged
          continue
        if lfn not in stageSURLs:
          stageSURLs[lfn] = {}
          stageSURLs[lfn].update( {se:surl} )
          if se not in stageLfns: # NEW WAY
            stageLfns[se] = []          # NEW WAY
          stageLfns[se].append( lfn )     # NEW WAY

    # Now we need to check if any LFN is in more than one SE
    if len( stageLfns ) > 1:
      stageSEs = sorted( [ ( len( stageLfns[se] ), se ) for se in stageLfns.keys() ] )
      for lfn in stageSURLs:
        lfnFound = False
        for se in [ item[1] for item in reversed( stageSEs ) ]:
        # for ( numberOfLfns, se ) in reversed( stageSEs ):
          if lfnFound and lfn in stageLfns[se]:
            stageLfns[se].remove( lfn )
          if lfn in stageLfns[se]:
            lfnFound = True

    stagerClient = StorageManagerClient()
    request = stagerClient.setRequest( stageLfns, 'WorkloadManagement',
                                       'updateJobFromStager@WorkloadManagement/JobStateUpdate', job )
    if request['OK']:
      self.jobDB.setJobParameter( int( job ), 'StageRequest', str( request['Value'] ) )

    if not request['OK']:
      self.log.error( 'Problem sending Staging request:' )
      self.log.error( request )
      return S_ERROR( 'Error Sending Staging Request' )
    else:
      self.log.info( 'Staging request successfully sent' )

    result = self.updateJobStatus( job, self.stagingStatus, self.stagingMinorStatus, "Unknown" )
    if not result['OK']:
      return result
    return S_OK( stageLfns )
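The stageLfns dictionary built above maps each tape SE to the LFNs that must be brought online. As a minimal, hedged sketch of submitting such a request directly with the client (SE name, LFN and job ID are illustrative values, not taken from the snippet):

from DIRAC.StorageManagementSystem.Client.StorageManagerClient import StorageManagerClient

stageLfns = { 'SOME-TAPE-SE': [ '/vo/data/run123/file001.raw' ] }  # SE -> list of LFNs, illustrative
stagerClient = StorageManagerClient()
request = stagerClient.setRequest( stageLfns, 'WorkloadManagement',
                                   'updateJobFromStager@WorkloadManagement/JobStateUpdate', 1234 )
if not request['OK']:
  print 'Failed to send staging request: %s' % request['Message']
else:
  print 'Staging request ID: %s' % request['Value']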
Example #50
0
class StageRequestAgent(AgentModule):
    def initialize(self):
        self.stagerClient = StorageManagerClient()
        # self.storageDB = StorageManagementDB()
        # pin lifetime = 1 day
        self.pinLifetime = self.am_getOption("PinLifetime", THROTTLING_TIME)

        # This sets the Default Proxy to be used as the one defined under
        # /Operations/Shifter/DataManager
        # the shifterProxy option in the Configuration can be used to change this default.
        self.am_setOption("shifterProxy", "DataManager")

        return S_OK()

    def execute(self):

        # Get the current submitted stage space and the amount of pinned space for each storage element
        res = self.getStorageUsage()
        if not res["OK"]:
            return res

        return self.submitStageRequests()

    def getStorageUsage(self):
        """Fill the current Status of the SE Caches from the DB"""
        self.storageElementCache = {}

        res = self.stagerClient.getSubmittedStagePins()
        if not res["OK"]:
            gLogger.fatal(
                "StageRequest.getStorageUsage: Failed to obtain submitted requests from StorageManagementDB.",
                res["Message"],
            )
            return res
        self.storageElementUsage = res["Value"]
        if self.storageElementUsage:
            gLogger.info(
                "StageRequest.getStorageUsage: Active stage/pin requests found at the following sites:"
            )
            for storageElement in sorted(self.storageElementUsage.keys()):
                seDict = self.storageElementUsage[storageElement]
                # Convert to GB for printout
                seDict["TotalSize"] = seDict["TotalSize"] / (1000 * 1000 *
                                                             1000.0)
                gLogger.info(
                    "StageRequest.getStorageUsage: %s: %s replicas with a size of %.3f GB."
                    % (storageElement.ljust(15), str(
                        seDict["Replicas"]).rjust(6), seDict["TotalSize"]))
        if not self.storageElementUsage:
            gLogger.info(
                "StageRequest.getStorageUsage: No active stage/pin requests found."
            )

        return S_OK()

    def submitStageRequests(self):
        """This manages the following transitions of the Replicas
        * Waiting -> Offline (if the file is not found Cached)
        * Waiting -> StageSubmitted (if the file is found Cached)
        * Offline -> StageSubmitted (if there are no more Waiting replicas)
        """
        # Retry Replicas that have not been Staged in a previous attempt
        res = self._getMissingReplicas()
        if not res["OK"]:
            gLogger.fatal(
                "StageRequest.submitStageRequests: Failed to get replicas from StorageManagementDB.",
                res["Message"])
            return res
        seReplicas = res["Value"]["SEReplicas"]
        allReplicaInfo = res["Value"]["AllReplicaInfo"]

        if seReplicas:
            gLogger.info(
                "StageRequest.submitStageRequests: Completing partially Staged Tasks"
            )
        for storageElement, seReplicaIDs in seReplicas.items():
            gLogger.debug("Staging at %s:" % storageElement, seReplicaIDs)
            self._issuePrestageRequests(storageElement, seReplicaIDs,
                                        allReplicaInfo)

        # Check Waiting Replicas and select those found Online and all other Replicas from the same Tasks
        res = self._getOnlineReplicas()
        if not res["OK"]:
            gLogger.fatal(
                "StageRequest.submitStageRequests: Failed to get replicas from StorageManagementDB.",
                res["Message"])
            return res
        seReplicas = res["Value"]["SEReplicas"]
        allReplicaInfo = res["Value"]["AllReplicaInfo"]

        # Check Offline Replicas that fit in the Cache and all other Replicas from the same Tasks
        res = self._getOfflineReplicas()

        if not res["OK"]:
            gLogger.fatal(
                "StageRequest.submitStageRequests: Failed to get replicas from StorageManagementDB.",
                res["Message"])
            return res

        # Merge info from both results
        for storageElement, seReplicaIDs in res["Value"]["SEReplicas"].items():
            seReplicas.setdefault(storageElement, []).extend(seReplicaIDs)
        allReplicaInfo.update(res["Value"]["AllReplicaInfo"])

        gLogger.info(
            "StageRequest.submitStageRequests: Obtained %s replicas for staging."
            % len(allReplicaInfo))
        for storageElement, seReplicaIDs in seReplicas.items():
            gLogger.debug("Staging at %s:" % storageElement, seReplicaIDs)
            self._issuePrestageRequests(storageElement, seReplicaIDs,
                                        allReplicaInfo)
        return S_OK()

    def _getMissingReplicas(self):
        """This recovers Replicas that were not Staged on a previous attempt (the stage request failed or timed out),
        while other Replicas of the same task are already Staged. If left behind they can produce a deadlock.
        All SEs are considered, even if their Cache is full
        """
        # Get Replicas that are in Staged/StageSubmitted
        gLogger.info(
            "StageRequest._getMissingReplicas: Checking Staged Replicas")

        res = self.__getStagedReplicas()
        if not res["OK"]:
            gLogger.fatal(
                "StageRequest._getMissingReplicas: Failed to get replicas from StorageManagementDB.",
                res["Message"])
            return res
        seReplicas = {}

        allReplicaInfo = res["Value"]["AllReplicaInfo"]
        replicasToStage = []
        for seReplicaIDs in res["Value"]["SEReplicas"].values():
            # Consider all SEs
            replicasToStage += seReplicaIDs

        # Get Replicas from the same Tasks as those selected
        res = self.__addAssociatedReplicas(replicasToStage, seReplicas,
                                           allReplicaInfo)
        if not res["OK"]:
            gLogger.fatal(
                "StageRequest._getMissingReplicas: Failed to get associated Replicas.",
                res["Message"])

        return res

    def _getOnlineReplicas(self):
        """This manages the transition
        * Waiting -> Offline (if the file is not found Cached)
        and returns the list of Cached Replicas for which the pin time has to be extended
        SEs for which the cache is currently full are not considered
        """
        # Get all Replicas in Waiting Status associated to Staging Tasks
        gLogger.verbose(
            "StageRequest._getOnlineReplicas: Checking Online Replicas to be handled"
        )

        res = self.__getWaitingReplicas()
        if not res["OK"]:
            gLogger.fatal(
                "StageRequest._getOnlineReplicas: Failed to get replicas from StorageManagementDB.",
                res["Message"])
            return res
        seReplicas = {}
        allReplicaInfo = res["Value"]["AllReplicaInfo"]
        if not len(allReplicaInfo):
            gLogger.info(
                "StageRequest._getOnlineReplicas: There were no Waiting replicas found"
            )
            return res
        gLogger.info(
            "StageRequest._getOnlineReplicas: Obtained %s replicas Waiting for staging."
            % len(allReplicaInfo))
        replicasToStage = []
        for storageElement, seReplicaIDs in res["Value"]["SEReplicas"].items():
            if not self.__usage(storageElement) < self.__cache(storageElement):
                gLogger.info(
                    "StageRequest._getOnlineReplicas: Skipping %s, current usage above limit ( %s GB )"
                    % (storageElement, self.__cache(storageElement)))
                # Do not consider those SE that have the Cache full
                continue
            # Check if the Replica Metadata is OK and find out if they are Online or Offline
            res = self.__checkIntegrity(storageElement, seReplicaIDs,
                                        allReplicaInfo)
            if not res["OK"]:
                gLogger.error(
                    "StageRequest._getOnlineReplicas: Failed to check Replica Metadata",
                    "(%s): %s" % (storageElement, res["Message"]),
                )
            else:
                # keep only Online Replicas
                seReplicas[storageElement] = res["Value"]["Online"]
                replicasToStage += res["Value"]["Online"]

        # Get Replicas from the same Tasks as those selected
        res = self.__addAssociatedReplicas(replicasToStage, seReplicas,
                                           allReplicaInfo)
        if not res["OK"]:
            gLogger.fatal(
                "StageRequest._getOnlineReplicas: Failed to get associated Replicas.",
                res["Message"])

        return res

    def _getOfflineReplicas(self):
        """This checks Replicas in Offline status
        and returns the list of Replicas to be Staged
        SEs for which the cache is currently full are not considered
        """
        # Get all Replicas in Waiting Status associated to Staging Tasks
        gLogger.verbose(
            "StageRequest._getOfflineReplicas: Checking Offline Replicas to be handled"
        )

        res = self.__getOfflineReplicas()
        if not res["OK"]:
            gLogger.fatal(
                "StageRequest._getOfflineReplicas: Failed to get replicas from StorageManagementDB.",
                res["Message"])
            return res
        seReplicas = {}
        allReplicaInfo = res["Value"]["AllReplicaInfo"]
        if not len(allReplicaInfo):
            gLogger.info(
                "StageRequest._getOfflineReplicas: There were no Offline replicas found"
            )
            return res
        gLogger.info(
            "StageRequest._getOfflineReplicas: Obtained %s replicas Offline for staging."
            % len(allReplicaInfo))
        replicasToStage = []
        for storageElement, seReplicaIDs in res["Value"]["SEReplicas"].items():
            if not self.__usage(storageElement) < self.__cache(storageElement):
                gLogger.info(
                    "StageRequest._getOfflineReplicas: Skipping %s, current usage above limit ( %s GB )"
                    % (storageElement, self.__cache(storageElement)))
                # Do not consider those SE that have the Cache full
                continue
            seReplicas[storageElement] = []
            for replicaID in sorted(seReplicaIDs):
                seReplicas[storageElement].append(replicaID)
                replicasToStage.append(replicaID)
                self.__add(storageElement, allReplicaInfo[replicaID]["Size"])
                if not self.__usage(storageElement) < self.__cache(
                        storageElement):
                    # Stop adding Replicas when the cache is full
                    break

        # Get Replicas from the same Tasks as those selected
        res = self.__addAssociatedReplicas(replicasToStage, seReplicas,
                                           allReplicaInfo)
        if not res["OK"]:
            gLogger.fatal(
                "StageRequest._getOfflineReplicas: Failed to get associated Replicas.",
                res["Message"])

        return res

    def __usage(self, storageElement):
        """Retrieve current usage of SE"""
        # Set it if not yet done
        self.storageElementUsage.setdefault(storageElement, {"TotalSize": 0.0})
        return self.storageElementUsage[storageElement]["TotalSize"]

    def __cache(self, storageElement):
        """Retrieve cache size for SE"""
        if storageElement not in self.storageElementCache:
            diskCacheTB = float(
                StorageElement(storageElement).options.get("DiskCacheTB", 1.0))
            self.storageElementCache[
                storageElement] = diskCacheTB * 1000.0 / THROTTLING_STEPS
        return self.storageElementCache[storageElement]

    def __add(self, storageElement, size):
        """Add size (in bytes) to current usage of storageElement (in GB)"""
        self.storageElementUsage.setdefault(storageElement, {"TotalSize": 0.0})
        size /= 1000.0 * 1000.0 * 1000.0
        self.storageElementUsage[storageElement]["TotalSize"] += size
        return size

    def _issuePrestageRequests(self, storageElement, seReplicaIDs,
                               allReplicaInfo):
        """Make the request to the SE and update the DB"""
        # Since we are in a given SE, the LFN is a unique key
        lfnRepIDs = {}
        for replicaID in seReplicaIDs:
            lfn = allReplicaInfo[replicaID]["LFN"]
            lfnRepIDs[lfn] = replicaID

        # Now issue the prestage requests for the remaining replicas
        stageRequestMetadata = {}
        updatedLfnIDs = []
        if lfnRepIDs:
            gLogger.info(
                "StageRequest._issuePrestageRequests: Submitting %s stage requests for %s."
                % (len(lfnRepIDs), storageElement))
            res = StorageElement(storageElement).prestageFile(
                lfnRepIDs, lifetime=self.pinLifetime)
            gLogger.debug(
                "StageRequest._issuePrestageRequests: StorageElement.prestageStorageFile: res=",
                res)
            # Daniela: fishy result from ReplicaManager!!! Should NOT return OK
            # res= {'OK': True, 'Value': {'Successful': {}, 'Failed': {'srm://srm-lhcb.cern.ch/castor/cern.ch/grid/lhcb/data/2010/RAW/EXPRESS/LHCb/COLLISION10/71476/071476_0000000241.raw': ' SRM2Storage.__gfal_exec: Failed to perform gfal_prestage.[SE][BringOnline][SRM_INVALID_REQUEST] httpg://srm-lhcb.cern.ch:8443/srm/managerv2: User not able to access specified space token\n'}}}
            # res= {'OK': True, 'Value': {'Successful': {'srm://gridka-dCache.fzk.de/pnfs/gridka.de/lhcb/data/2009/RAW/FULL/LHCb/COLLISION09/63495/063495_0000000001.raw': '-2083846379'}, 'Failed': {}}}

            if not res["OK"]:
                gLogger.error(
                    "StageRequest._issuePrestageRequests: Completely failed to submit stage requests for replicas.",
                    res["Message"],
                )
            else:
                for lfn, requestID in res["Value"]["Successful"].items():
                    stageRequestMetadata.setdefault(requestID,
                                                    []).append(lfnRepIDs[lfn])
                    updatedLfnIDs.append(lfnRepIDs[lfn])
        if stageRequestMetadata:
            gLogger.info(
                "StageRequest._issuePrestageRequests: %s stage request metadata to be updated."
                % len(stageRequestMetadata))
            res = self.stagerClient.insertStageRequest(stageRequestMetadata,
                                                       self.pinLifetime)
            if not res["OK"]:
                gLogger.error(
                    "StageRequest._issuePrestageRequests: Failed to insert stage request metadata.",
                    res["Message"])
                return res
            res = self.stagerClient.updateReplicaStatus(
                updatedLfnIDs, "StageSubmitted")
            if not res["OK"]:
                gLogger.error(
                    "StageRequest._issuePrestageRequests: Failed to insert replica status.",
                    res["Message"])
        return

    def __sortBySE(self, replicaDict):

        seReplicas = {}
        replicaIDs = {}
        for replicaID, info in replicaDict.items():
            lfn = info["LFN"]
            storageElement = info["SE"]
            size = info["Size"]
            pfn = info["PFN"]
            replicaIDs[replicaID] = {
                "LFN": lfn,
                "PFN": pfn,
                "Size": size,
                "StorageElement": storageElement
            }
            seReplicas.setdefault(storageElement, []).append(replicaID)
        return S_OK({"SEReplicas": seReplicas, "AllReplicaInfo": replicaIDs})

    def __getStagedReplicas(self):
        """This obtains the Staged replicas from the Replicas table and for each LFN the requested storage element"""
        # First obtain the Staged replicas from the Replicas table
        res = self.stagerClient.getStagedReplicas()
        if not res["OK"]:
            gLogger.error(
                "StageRequest.__getStagedReplicas: Failed to get replicas with Waiting status.",
                res["Message"])
            return res
        if not res["Value"]:
            gLogger.debug(
                "StageRequest.__getStagedReplicas: No Waiting replicas found to process."
            )
        else:
            gLogger.debug(
                "StageRequest.__getStagedReplicas: Obtained %s Waiting replicas(s) to process."
                % len(res["Value"]))

        return self.__sortBySE(res["Value"])

    def __getWaitingReplicas(self):
        """This obtains the Waiting replicas from the Replicas table and for each LFN the requested storage element"""
        # First obtain the Waiting replicas from the Replicas table
        res = self.stagerClient.getWaitingReplicas()
        if not res["OK"]:
            gLogger.error(
                "StageRequest.__getWaitingReplicas: Failed to get replicas with Waiting status.",
                res["Message"])
            return res
        if not res["Value"]:
            gLogger.debug(
                "StageRequest.__getWaitingReplicas: No Waiting replicas found to process."
            )
        else:
            gLogger.debug(
                "StageRequest.__getWaitingReplicas: Obtained %s Waiting replicas(s) to process."
                % len(res["Value"]))

        return self.__sortBySE(res["Value"])

    def __getOfflineReplicas(self):
        """This obtains the Offline replicas from the Replicas table and for each LFN the requested storage element"""
        # First obtain the Offline replicas from the Replicas table
        res = self.stagerClient.getOfflineReplicas()
        if not res["OK"]:
            gLogger.error(
                "StageRequest.__getOfflineReplicas: Failed to get replicas with Waiting status.",
                res["Message"])
            return res
        if not res["Value"]:
            gLogger.debug(
                "StageRequest.__getOfflineReplicas: No Waiting replicas found to process."
            )
        else:
            gLogger.debug(
                "StageRequest.__getOfflineReplicas: Obtained %s Waiting replicas(s) to process."
                % len(res["Value"]))

        return self.__sortBySE(res["Value"])

    def __addAssociatedReplicas(self, replicasToStage, seReplicas,
                                allReplicaInfo):
        """Retrieve the list of Replicas that belong to the same Tasks as the provided list"""
        res = self.stagerClient.getAssociatedReplicas(replicasToStage)
        if not res["OK"]:
            gLogger.fatal(
                "StageRequest.__addAssociatedReplicas: Failed to get associated Replicas.",
                res["Message"])
            return res
        addReplicas = {"Offline": {}, "Waiting": {}}
        replicaIDs = {}
        for replicaID, info in res["Value"].items():
            lfn = info["LFN"]
            storageElement = info["SE"]
            size = info["Size"]
            pfn = info["PFN"]
            status = info["Status"]
            if status in ["Waiting", "Offline"]:
                replicaIDs[replicaID] = {
                    "LFN": lfn,
                    "PFN": pfn,
                    "Size": size,
                    "StorageElement": storageElement
                }
                addReplicas[status].setdefault(storageElement,
                                               []).append(replicaID)

        waitingReplicas = addReplicas["Waiting"]
        offlineReplicas = addReplicas["Offline"]
        newReplicaInfo = replicaIDs
        allReplicaInfo.update(newReplicaInfo)

        # First handle Waiting Replicas for which metadata is to be checked
        for storageElement, seReplicaIDs in waitingReplicas.items():
            for replicaID in list(seReplicaIDs):
                if replicaID in replicasToStage:
                    seReplicaIDs.remove(replicaID)
            res = self.__checkIntegrity(storageElement, seReplicaIDs,
                                        allReplicaInfo)
            if not res["OK"]:
                gLogger.error(
                    "StageRequest.__addAssociatedReplicas: Failed to check Replica Metadata",
                    "(%s): %s" % (storageElement, res["Message"]),
                )
            else:
                # keep all Replicas (Online and Offline)
                seReplicas.setdefault(storageElement,
                                      []).extend(res["Value"]["Online"])
                replicasToStage.extend(res["Value"]["Online"])
                seReplicas[storageElement].extend(res["Value"]["Offline"])
                replicasToStage.extend(res["Value"]["Offline"])

        # Then handle Offline Replicas for which metadata is already checked
        for storageElement, seReplicaIDs in offlineReplicas.items():
            for replicaID in sorted(seReplicaIDs):
                if replicaID in replicasToStage:
                    seReplicaIDs.remove(replicaID)
            seReplicas.setdefault(storageElement, []).extend(seReplicaIDs)
            replicasToStage.extend(seReplicaIDs)

        for replicaID in list(allReplicaInfo):
            if replicaID not in replicasToStage:
                del allReplicaInfo[replicaID]

        totalSize = 0
        for storageElement in sorted(seReplicas.keys()):
            replicaIDs = seReplicas[storageElement]
            size = 0
            for replicaID in replicaIDs:
                size += self.__add(storageElement,
                                   allReplicaInfo[replicaID]["Size"])

            gLogger.info(
                "StageRequest.__addAssociatedReplicas:  Considering %s GB to be staged at %s"
                % (size, storageElement))
            totalSize += size

        gLogger.info(
            "StageRequest.__addAssociatedReplicas: Obtained %s GB for staging."
            % totalSize)

        return S_OK({
            "SEReplicas": seReplicas,
            "AllReplicaInfo": allReplicaInfo
        })

    def __checkIntegrity(self, storageElement, seReplicaIDs, allReplicaInfo):
        """Check the integrity of the files to ensure they are available
        Updates status of Offline Replicas for a later pass
        Returns the list of Online replicas to be Staged
        """
        if not seReplicaIDs:
            return S_OK({"Online": [], "Offline": []})

        # Since we are with a given SE, the LFN is a unique key
        lfnRepIDs = {}
        for replicaID in seReplicaIDs:
            lfn = allReplicaInfo[replicaID]["LFN"]
            lfnRepIDs[lfn] = replicaID

        gLogger.info(
            "StageRequest.__checkIntegrity: Checking the integrity of %s replicas at %s."
            % (len(lfnRepIDs), storageElement))
        res = StorageElement(storageElement).getFileMetadata(lfnRepIDs)
        if not res["OK"]:
            gLogger.error(
                "StageRequest.__checkIntegrity: Completely failed to obtain metadata for replicas.",
                res["Message"])
            return res

        terminalReplicaIDs = {}
        onlineReplicaIDs = []
        offlineReplicaIDs = []
        for lfn, metadata in res["Value"]["Successful"].items():

            if metadata["Size"] != allReplicaInfo[lfnRepIDs[lfn]]["Size"]:
                gLogger.error(
                    "StageRequest.__checkIntegrity: LFN StorageElement size does not match FileCatalog",
                    lfn)
                terminalReplicaIDs[lfnRepIDs[
                    lfn]] = "LFN StorageElement size does not match FileCatalog"
                lfnRepIDs.pop(lfn)
            elif metadata.get("Lost", False):
                gLogger.error(
                    "StageRequest.__checkIntegrity: LFN has been Lost by the StorageElement",
                    lfn)
                terminalReplicaIDs[
                    lfnRepIDs[lfn]] = "LFN has been Lost by the StorageElement"
                lfnRepIDs.pop(lfn)
            elif metadata.get("Unavailable", False):
                gLogger.error(
                    "StageRequest.__checkIntegrity: LFN is declared Unavailable by the StorageElement",
                    lfn)
                terminalReplicaIDs[lfnRepIDs[
                    lfn]] = "LFN is declared Unavailable by the StorageElement"
                lfnRepIDs.pop(lfn)
            elif metadata.get("Cached", metadata["Accessible"]):
                gLogger.verbose(
                    "StageRequest.__checkIntegrity: Cache hit for file.")
                onlineReplicaIDs.append(lfnRepIDs[lfn])
            else:
                offlineReplicaIDs.append(lfnRepIDs[lfn])

        for lfn, reason in res["Value"]["Failed"].items():
            if re.search("File does not exist", reason):
                gLogger.error(
                    "StageRequest.__checkIntegrity: LFN does not exist in the StorageElement",
                    lfn)
                terminalReplicaIDs[lfnRepIDs[
                    lfn]] = "LFN does not exist in the StorageElement"
            lfnRepIDs.pop(lfn)

        # Update the states of the replicas in the database #TODO Sent status to integrity DB
        if terminalReplicaIDs:
            gLogger.info(
                "StageRequest.__checkIntegrity: %s replicas are terminally failed."
                % len(terminalReplicaIDs))
            res = self.stagerClient.updateReplicaFailure(terminalReplicaIDs)
            if not res["OK"]:
                gLogger.error(
                    "StageRequest.__checkIntegrity: Failed to update replica failures.",
                    res["Message"])
        if onlineReplicaIDs:
            gLogger.info(
                "StageRequest.__checkIntegrity: %s replicas found Online." %
                len(onlineReplicaIDs))
        if offlineReplicaIDs:
            gLogger.info(
                "StageRequest.__checkIntegrity: %s replicas found Offline." %
                len(offlineReplicaIDs))
            res = self.stagerClient.updateReplicaStatus(
                offlineReplicaIDs, "Offline")
        return S_OK({"Online": onlineReplicaIDs, "Offline": offlineReplicaIDs})
class RequestPreparationAgent( AgentModule ):

  def initialize( self ):
    self.fileCatalog = FileCatalog()
    self.dm = DataManager()
    self.stagerClient = StorageManagerClient()
    self.dataIntegrityClient = DataIntegrityClient()
    # This sets the Default Proxy to be used as the one defined under
    # /Operations/Shifter/DataManager
    # the shifterProxy option in the Configuration can be used to change this default.
    self.am_setOption( 'shifterProxy', 'DataManager' )

    return S_OK()

  def execute( self ):
    """ This is the first logical task to be executed and manages the New->Waiting transition of the Replicas
    """
    res = self.__getNewReplicas()
    if not res['OK']:
      gLogger.fatal( "RequestPreparation.prepareNewReplicas: Failed to get replicas from StagerDB.", res['Message'] )
      return res
    if not res['Value']:
      gLogger.info( "There were no New replicas found" )
      return res
    replicas = res['Value']['Replicas']
    replicaIDs = res['Value']['ReplicaIDs']
    gLogger.info( "RequestPreparation.prepareNewReplicas: Obtained %s New replicas for preparation." % len( replicaIDs ) )

    # Check if the files exist in the FileCatalog
    res = self.__getExistingFiles( replicas )
    if not res['OK']:
      return res
    exist = res['Value']['Exist']
    terminal = res['Value']['Missing']
    failed = res['Value']['Failed']
    if not exist:
      gLogger.error( 'RequestPreparation.prepareNewReplicas: Failed to determine the existence of any file' )
      return S_OK()
    terminalReplicaIDs = {}
    for lfn, reason in terminal.items():
      for replicaID in replicas[lfn].values():
        terminalReplicaIDs[replicaID] = reason
      replicas.pop( lfn )
    gLogger.info( "RequestPreparation.prepareNewReplicas: %s files exist in the FileCatalog." % len( exist ) )
    if terminal:
      gLogger.info( "RequestPreparation.prepareNewReplicas: %s files do not exist in the FileCatalog." % len( terminal ) )

    # Obtain the file sizes from the FileCatalog
    res = self.__getFileSize( exist )
    if not res['OK']:
      return res
    failed.update( res['Value']['Failed'] )
    terminal = res['Value']['ZeroSize']
    fileSizes = res['Value']['FileSizes']
    if not fileSizes:
      gLogger.error( 'RequestPreparation.prepareNewReplicas: Failed to determine sizes of any files' )
      return S_OK()
    for lfn, reason in terminal.items():
      for _se, replicaID in replicas[lfn].items():
        terminalReplicaIDs[replicaID] = reason
      replicas.pop( lfn )
    gLogger.info( "RequestPreparation.prepareNewReplicas: Obtained %s file sizes from the FileCatalog." % len( fileSizes ) )
    if terminal:
      gLogger.info( "RequestPreparation.prepareNewReplicas: %s files registered with zero size in the FileCatalog." % len( terminal ) )

    # Obtain the replicas from the FileCatalog
    res = self.__getFileReplicas( fileSizes.keys() )
    if not res['OK']:
      return res
    failed.update( res['Value']['Failed'] )
    terminal = res['Value']['ZeroReplicas']
    fileReplicas = res['Value']['Replicas']
    if not fileReplicas:
      gLogger.error( 'RequestPreparation.prepareNewReplicas: Failed to determine replicas for any files' )
      return S_OK()
    for lfn, reason in terminal.items():
      for _se, replicaID in replicas[lfn].items():
        terminalReplicaIDs[replicaID] = reason
      replicas.pop( lfn )
    gLogger.info( "RequestPreparation.prepareNewReplicas: Obtained replica information for %s file from the FileCatalog." % len( fileReplicas ) )
    if terminal:
      gLogger.info( "RequestPreparation.prepareNewReplicas: %s files registered with zero replicas in the FileCatalog." % len( terminal ) )

    # Check the replicas exist at the requested site
    replicaMetadata = []
    for lfn, requestedSEs in replicas.items():
      lfnReplicas = fileReplicas.get( lfn )

      # This should not happen in principle, but it was seen
      # after a corrupted staging request has entered the DB
      if not lfnReplicas:
        gLogger.error( "Missing replicas information", "%s %s" % ( lfn, requestedSEs ) )
        continue

      for requestedSE, replicaID in requestedSEs.items():
        if requestedSE not in lfnReplicas:
          terminalReplicaIDs[replicaID] = "LFN not registered at requested SE"
          replicas[lfn].pop( requestedSE )
        else:
          replicaMetadata.append( ( replicaID, lfnReplicas[requestedSE], fileSizes[lfn] ) )

    # Update the states of the files in the database
    if terminalReplicaIDs:
      gLogger.info( "RequestPreparation.prepareNewReplicas: %s replicas are terminally failed." % len( terminalReplicaIDs ) )
      res = self.stagerClient.updateReplicaFailure( terminalReplicaIDs )
      if not res['OK']:
        gLogger.error( "RequestPreparation.prepareNewReplicas: Failed to update replica failures.", res['Message'] )
    if replicaMetadata:
      gLogger.info( "RequestPreparation.prepareNewReplicas: %s replica metadata to be updated." % len( replicaMetadata ) )
      # Sets the Status='Waiting' of CacheReplicas records that are OK with catalogue checks
      res = self.stagerClient.updateReplicaInformation( replicaMetadata )
      if not res['OK']:
        gLogger.error( "RequestPreparation.prepareNewReplicas: Failed to update replica metadata.", res['Message'] )
    return S_OK()

  def __getNewReplicas( self ):
    """ This obtains the New replicas from the Replicas table and for each LFN the requested storage element """
    # First obtain the New replicas from the CacheReplicas table
    res = self.stagerClient.getCacheReplicas( {'Status':'New'} )
    if not res['OK']:
      gLogger.error( "RequestPreparation.__getNewReplicas: Failed to get replicas with New status.", res['Message'] )
      return res
    if not res['Value']:
      gLogger.debug( "RequestPreparation.__getNewReplicas: No New replicas found to process." )
      return S_OK()
    else:
      gLogger.debug( "RequestPreparation.__getNewReplicas: Obtained %s New replicas(s) to process." % len( res['Value'] ) )
    replicas = {}
    replicaIDs = {}
    for replicaID, info in res['Value'].items():
      lfn = info['LFN']
      storageElement = info['SE']
      replicas.setdefault( lfn, {} )[storageElement] = replicaID
      replicaIDs[replicaID] = ( lfn, storageElement )
    return S_OK( {'Replicas':replicas, 'ReplicaIDs':replicaIDs} )

  def __getExistingFiles( self, lfns ):
    """ This checks that the files exist in the FileCatalog. """
    res = self.fileCatalog.exists( list( set( lfns ) ) )
    if not res['OK']:
      gLogger.error( "RequestPreparation.__getExistingFiles: Failed to determine whether files exist.", res['Message'] )
      return res
    failed = res['Value']['Failed']
    success = res['Value']['Successful']
    exist = [lfn for lfn, exists in success.items() if exists]
    missing = list( set( success ) - set( exist ) )
    if missing:
      reason = 'LFN not registered in the FC'
      gLogger.warn( "RequestPreparation.__getExistingFiles: %s" % reason, '\n'.join( [''] + missing ) )
      self.__reportProblematicFiles( missing, 'LFN-LFC-DoesntExist' )
      missing = dict.fromkeys( missing, reason )
    else:
      missing = {}
    return S_OK( {'Exist':exist, 'Missing':missing, 'Failed':failed} )

  def __getFileSize( self, lfns ):
    """ This obtains the file size from the FileCatalog. """
    fileSizes = {}
    zeroSize = {}
    res = self.fileCatalog.getFileSize( lfns )
    if not res['OK']:
      gLogger.error( "RequestPreparation.__getFileSize: Failed to get sizes for files.", res['Message'] )
      return res
    failed = res['Value']['Failed']
    for lfn, size in res['Value']['Successful'].items():
      if size == 0:
        zeroSize[lfn] = "LFN registered with zero size in the FileCatalog"
      else:
        fileSizes[lfn] = size
    if zeroSize:
      for lfn, reason in zeroSize.items():
        gLogger.warn( "RequestPreparation.__getFileSize: %s" % reason, lfn )
      self.__reportProblematicFiles( zeroSize.keys(), 'LFN-LFC-ZeroSize' )
    return S_OK( {'FileSizes':fileSizes, 'ZeroSize':zeroSize, 'Failed':failed} )

  def __getFileReplicas( self, lfns ):
    """ This obtains the replicas from the FileCatalog. """
    replicas = {}
    noReplicas = {}
    res = self.dm.getActiveReplicas( lfns )
    if not res['OK']:
      gLogger.error( "RequestPreparation.__getFileReplicas: Failed to obtain file replicas.", res['Message'] )
      return res
    failed = res['Value']['Failed']
    for lfn, lfnReplicas in res['Value']['Successful'].items():
      if len( lfnReplicas.keys() ) == 0:
        noReplicas[lfn] = "LFN registered with zero replicas in the FileCatalog"
      else:
        replicas[lfn] = lfnReplicas
    if noReplicas:
      for lfn, reason in noReplicas.items():
        gLogger.warn( "RequestPreparation.__getFileReplicas: %s" % reason, lfn )
      self.__reportProblematicFiles( noReplicas.keys(), 'LFN-LFC-NoReplicas' )
    return S_OK( {'Replicas':replicas, 'ZeroReplicas':noReplicas, 'Failed':failed} )

  def __reportProblematicFiles( self, lfns, reason ):
    # NOTE: reporting of problematic files is short-circuited here; the code below this return is never executed.
    return S_OK()
    res = self.dataIntegrityClient.setFileProblematic( lfns, reason, sourceComponent = 'RequestPreparationAgent' )
    if not res['OK']:
      gLogger.error( "RequestPreparation.__reportProblematicFiles: Failed to report missing files.", res['Message'] )
      return res
    if res['Value']['Successful']:
      gLogger.info( "RequestPreparation.__reportProblematicFiles: Successfully reported %s missing files." % len( res['Value']['Successful'] ) )
    if res['Value']['Failed']:
      gLogger.info( "RequestPreparation.__reportProblematicFiles: Failed to report %s problematic files." % len( res['Value']['Failed'] ) )
    return res
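__getExistingFiles and __getFileSize above wrap plain FileCatalog calls. A minimal, hedged sketch of the same checks done by hand (the LFN is made up for illustration):

from DIRAC.Resources.Catalog.FileCatalog import FileCatalog

fc = FileCatalog()
lfns = [ '/vo/data/run123/file001.raw' ]  # illustrative LFN
res = fc.exists( lfns )
if res['OK']:
  for lfn, exists in res['Value']['Successful'].items():
    print '%s exists in the catalogue: %s' % ( lfn, exists )
res = fc.getFileSize( lfns )
if res['OK']:
  for lfn, size in res['Value']['Successful'].items():
    print '%s size: %s bytes' % ( lfn, size )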
Example #52
0
Script.parseCommandLine(ignoreErrors=False)

args = Script.getPositionalArgs()

if len(args) != 1:
    Script.showHelp()

try:
    taskID = int(args[0])
except ValueError:
    print 'Stage requestID must be an integer'
    DIRAC.exit(2)

from DIRAC.StorageManagementSystem.Client.StorageManagerClient import StorageManagerClient

client = StorageManagerClient()

res = client.getTaskSummary(taskID)
if not res['OK']:
    print res['Message']
    DIRAC.exit(2)
taskInfo = res['Value']['TaskInfo']
replicaInfo = res['Value']['ReplicaInfo']
outStr = "%s: %s" % ('TaskID'.ljust(20), taskID)
outStr = "%s\n%s: %s" % (outStr, 'Status'.ljust(20),
                         taskInfo[taskID]['Status'])
outStr = "%s\n%s: %s" % (outStr, 'Source'.ljust(20),
                         taskInfo[taskID]['Source'])
outStr = "%s\n%s: %s" % (outStr, 'SourceTaskID'.ljust(20),
                         taskInfo[taskID]['SourceTaskID'])
outStr = "%s\n%s: %s" % (outStr, 'CallBackMethod'.ljust(20),
Пример #53
0
class StageRequestAgent( AgentModule ):

  def initialize( self ):
    self.replicaManager = ReplicaManager()
    self.stagerClient = StorageManagerClient()
    self.dataIntegrityClient = DataIntegrityClient()
    #self.storageDB = StorageManagementDB()
    # pin lifetime = 1 day
    self.pinLifetime = self.am_getOption( 'PinLifetime', THROTTLING_TIME )

    # This sets the Default Proxy to be used as the one defined under
    # /Operations/Shifter/DataManager
    # the shifterProxy option in the Configuration can be used to change this default.
    self.am_setOption( 'shifterProxy', 'DataManager' )

    return S_OK()

  def execute( self ):

    # Get the current submitted stage space and the amount of pinned space for each storage element
    res = self.getStorageUsage()
    if not res['OK']:
      return res

    return self.submitStageRequests()

  def getStorageUsage( self ):
    """ Fill the current Status of the SE Caches from the DB
    """
    self.storageElementCache = {}

    res = self.stagerClient.getSubmittedStagePins()
    if not res['OK']:
      gLogger.fatal( "StageRequest.getStorageUsage: Failed to obtain submitted requests from StorageManagementDB.", res['Message'] )
      return res
    self.storageElementUsage = res['Value']
    if self.storageElementUsage:
      gLogger.info( "StageRequest.getStorageUsage: Active stage/pin requests found at the following sites:" )
      for storageElement in sorted( self.storageElementUsage.keys() ):
        seDict = self.storageElementUsage[storageElement]
        # Convert to GB for printout
        seDict['TotalSize'] = seDict['TotalSize'] / ( 1000 * 1000 * 1000.0 )
        gLogger.info( "StageRequest.getStorageUsage: %s: %s replicas with a size of %.3f GB." %
                      ( storageElement.ljust( 15 ), str( seDict['Replicas'] ).rjust( 6 ), seDict['TotalSize'] ) )
    if not self.storageElementUsage:
      gLogger.info( "StageRequest.getStorageUsage: No active stage/pin requests found." )

    return S_OK()


  def submitStageRequests( self ):
    """ This manages the following transitions of the Replicas
        * Waiting -> Offline (if the file is not found Cached)
        * Waiting -> StageSubmitted (if the file is found Cached)
        * Offline -> StageSubmitted (if there are no more Waiting replicas)
    """
    # Retry Replicas that have not been Staged in a previous attempt 
    res = self._getMissingReplicas()
    if not res['OK']:
      gLogger.fatal( "StageRequest.submitStageRequests: Failed to get replicas from StorageManagementDB.", res['Message'] )
      return res
    seReplicas = res['Value']['SEReplicas']
    allReplicaInfo = res['Value']['AllReplicaInfo']

    if seReplicas:
      gLogger.info( "StageRequest.submitStageRequests: Completing partially Staged Tasks" )
    for storageElement, seReplicaIDs in seReplicas.items():
      gLogger.debug( 'Staging at %s:' % storageElement, seReplicaIDs )
      self._issuePrestageRequests( storageElement, seReplicaIDs, allReplicaInfo )

    # Check Waiting Replicas and select those found Online and all other Replicas from the same Tasks
    res = self._getOnlineReplicas()
    if not res['OK']:
      gLogger.fatal( "StageRequest.submitStageRequests: Failed to get replicas from StorageManagementDB.", res['Message'] )
      return res
    seReplicas = res['Value']['SEReplicas']
    allReplicaInfo = res['Value']['AllReplicaInfo']

    # Check Offline Replicas that fit in the Cache and all other Replicas from the same Tasks
    res = self._getOfflineReplicas()

    if not res['OK']:
      gLogger.fatal( "StageRequest.submitStageRequests: Failed to get replicas from StorageManagementDB.", res['Message'] )
      return res

    # Merge info from both results
    for storageElement, seReplicaIDs in res['Value']['SEReplicas'].items():
      if storageElement not in seReplicas:
        seReplicas[storageElement] = seReplicaIDs
      else:
        for replicaID in seReplicaIDs:
          if replicaID not in seReplicas[storageElement]:
            seReplicas[storageElement].append( replicaID )
    allReplicaInfo.update( res['Value']['AllReplicaInfo'] )

    gLogger.info( "StageRequest.submitStageRequests: Obtained %s replicas for staging." % len( allReplicaInfo ) )
    for storageElement, seReplicaIDs in seReplicas.items():
      gLogger.debug( 'Staging at %s:' % storageElement, seReplicaIDs )
      self._issuePrestageRequests( storageElement, seReplicaIDs, allReplicaInfo )
    return S_OK()

  def _getMissingReplicas( self ):
    """ This recovers Replicas that were not Staged on a previous attempt (the stage request failed or timed out),
        while other Replicas of the same task are already Staged. If left behind they can produce a deadlock.
        All SEs are considered, even if their Cache is full
    """
    # Get Replicas that are in Staged/StageSubmitted 
    gLogger.info( 'StageRequest._getMissingReplicas: Checking Staged Replicas' )

    res = self.__getStagedReplicas()
    if not res['OK']:
      gLogger.fatal( "StageRequest._getMissingReplicas: Failed to get replicas from StorageManagementDB.", res['Message'] )
      return res
    seReplicas = {}

    allReplicaInfo = res['Value']['AllReplicaInfo']
    replicasToStage = []
    for storageElement, seReplicaIDs in res['Value']['SEReplicas'].items():
      # Consider all SEs
      replicasToStage.extend( seReplicaIDs )

    # Get Replicas from the same Tasks as those selected
    res = self.__addAssociatedReplicas( replicasToStage, seReplicas, allReplicaInfo )
    if not res['OK']:
      gLogger.fatal( "StageRequest._getMissingReplicas: Failed to get associated Replicas.", res['Message'] )

    return res

  def _getOnlineReplicas( self ):
    """ This manages the transition
        * Waiting -> Offline (if the file is not found Cached)
        and returns the list of Cached Replicas for which the pin time has to be extended
        SEs for which the cache is currently full are not considered
    """
    # Get all Replicas in Waiting Status associated to Staging Tasks
    gLogger.verbose( 'StageRequest._getOnlineReplicas: Checking Online Replicas to be handled' )

    res = self.__getWaitingReplicas()
    if not res['OK']:
      gLogger.fatal( "StageRequest._getOnlineReplicas: Failed to get replicas from StorageManagementDB.", res['Message'] )
      return res
    seReplicas = {}
    allReplicaInfo = res['Value']['AllReplicaInfo']
    if not len( allReplicaInfo ):
      gLogger.info( "StageRequest._getOnlineReplicas: There were no Waiting replicas found" )
      return res
    gLogger.info( "StageRequest._getOnlineReplicas: Obtained %s replicas Waiting for staging." % len( allReplicaInfo ) )
    replicasToStage = []
    for storageElement, seReplicaIDs in res['Value']['SEReplicas'].items():
      if not self.__usage( storageElement ) < self.__cache( storageElement ):
        gLogger.info( 'StageRequest._getOnlineReplicas: Skipping %s, current usage above limit ( %s GB )' % ( storageElement, self.__cache( storageElement ) ) )
        # Do not consider those SE that have the Cache full
        continue
      # Check if the Replica Metadata is OK and find out if they are Online or Offline
      res = self.__checkIntegrity( storageElement, seReplicaIDs, allReplicaInfo )
      if not res['OK']:
        gLogger.error( 'StageRequest._getOnlineReplicas: Failed to check Replica Metadata', '(%s): %s' % ( storageElement, res['Message'] ) )
      else:
        # keep only Online Replicas
        seReplicas[storageElement] = res['Value']['Online']
        replicasToStage.extend( res['Value']['Online'] )

    # Get Replicas from the same Tasks as those selected
    res = self.__addAssociatedReplicas( replicasToStage, seReplicas, allReplicaInfo )
    if not res['OK']:
      gLogger.fatal( "StageRequest._getOnlineReplicas: Failed to get associated Replicas.", res['Message'] )

    return res

  def _getOfflineReplicas( self ):
    """ This checks Replicas in Offline status
        and returns the list of Replicas to be Staged
        SEs for which the cache is currently full are not considered
    """
    # Get all Replicas in Waiting Status associated to Staging Tasks
    gLogger.verbose( 'StageRequest._getOfflineReplicas: Checking Offline Replicas to be handled' )

    res = self.__getOfflineReplicas()
    if not res['OK']:
      gLogger.fatal( "StageRequest._getOfflineReplicas: Failed to get replicas from StorageManagementDB.", res['Message'] )
      return res
    seReplicas = {}
    allReplicaInfo = res['Value']['AllReplicaInfo']
    if not len( allReplicaInfo ):
      gLogger.info( "StageRequest._getOfflineReplicas: There were no Offline replicas found" )
      return res
    gLogger.info( "StageRequest._getOfflineReplicas: Obtained %s replicas Offline for staging." % len( allReplicaInfo ) )
    replicasToStage = []

    for storageElement, seReplicaIDs in res['Value']['SEReplicas'].items():
      if not self.__usage( storageElement ) < self.__cache( storageElement ):
        gLogger.info( 'StageRequest._getOfflineReplicas: Skipping %s, current usage above limit ( %s GB )' % ( storageElement, self.__cache( storageElement ) ) )
        # Do not consider those SE that have the Cache full
        continue
      seReplicas[storageElement] = []
      for replicaID in sorted( seReplicaIDs ):
        seReplicas[storageElement].append( replicaID )
        replicasToStage.append( replicaID )
        self.__add( storageElement, allReplicaInfo[replicaID]['Size'] )
        if not self.__usage( storageElement ) < self.__cache( storageElement ):
          # Stop adding Replicas when the cache is full
          break

    # Get Replicas from the same Tasks as those selected
    res = self.__addAssociatedReplicas( replicasToStage, seReplicas, allReplicaInfo )
    if not res['OK']:
      gLogger.fatal( "StageRequest._getOfflineReplicas: Failed to get associated Replicas.", res['Message'] )

    return res

  def __usage( self, storageElement ):
    """ Retrieve current usage of SE
    """
    if not storageElement in self.storageElementUsage:
      self.storageElementUsage[storageElement] = {'TotalSize': 0.}
    return self.storageElementUsage[storageElement]['TotalSize']

  def __cache( self, storageElement ):
    """ Retrieve cache size for SE
    """
    if not storageElement in self.storageElementCache:
      self.storageElementCache[storageElement] = gConfig.getValue( "/Resources/StorageElements/%s/DiskCacheTB" % storageElement, 1. ) * 1000. / THROTTLING_STEPS
    return self.storageElementCache[storageElement]

  def __add( self, storageElement, size ):
    """ Add size (in bytes) to current usage of storageElement (in GB)
    """
    if not storageElement in self.storageElementUsage:
      self.storageElementUsage[storageElement] = {'TotalSize': 0.}
    size = size / ( 1000 * 1000 * 1000.0 )
    self.storageElementUsage[storageElement]['TotalSize'] += size
    return size

  def _issuePrestageRequests( self, storageElement, seReplicaIDs, allReplicaInfo ):
    """ Make the request to the SE and update the DB
    """
    pfnRepIDs = {}
    for replicaID in seReplicaIDs:
      pfn = allReplicaInfo[replicaID]['PFN']
      pfnRepIDs[pfn] = replicaID

    # Now issue the prestage requests for the remaining replicas
    stageRequestMetadata = {}
    updatedPfnIDs = []
    if pfnRepIDs:
      gLogger.info( "StageRequest._issuePrestageRequests: Submitting %s stage requests for %s." % ( len( pfnRepIDs ), storageElement ) )
      res = self.replicaManager.prestageStorageFile( pfnRepIDs.keys(), storageElement, lifetime = self.pinLifetime )
      gLogger.debug( "StageRequest._issuePrestageRequests: replicaManager.prestageStorageFile: res=", res )
      #Daniela: fishy result from ReplicaManager!!! Should NOT return OK
      #res= {'OK': True, 'Value': {'Successful': {}, 'Failed': {'srm://srm-lhcb.cern.ch/castor/cern.ch/grid/lhcb/data/2010/RAW/EXPRESS/LHCb/COLLISION10/71476/071476_0000000241.raw': ' SRM2Storage.__gfal_exec: Failed to perform gfal_prestage.[SE][BringOnline][SRM_INVALID_REQUEST] httpg://srm-lhcb.cern.ch:8443/srm/managerv2: User not able to access specified space token\n'}}}
      #res= {'OK': True, 'Value': {'Successful': {'srm://gridka-dCache.fzk.de/pnfs/gridka.de/lhcb/data/2009/RAW/FULL/LHCb/COLLISION09/63495/063495_0000000001.raw': '-2083846379'}, 'Failed': {}}}

      if not res['OK']:
        gLogger.error( "StageRequest._issuePrestageRequests: Completely failed to submit stage requests for replicas.", res['Message'] )
      else:
        for pfn, requestID in res['Value']['Successful'].items():
          if not stageRequestMetadata.has_key( requestID ):
            stageRequestMetadata[requestID] = []
          stageRequestMetadata[requestID].append( pfnRepIDs[pfn] )
          updatedPfnIDs.append( pfnRepIDs[pfn] )
    if stageRequestMetadata:
      gLogger.info( "StageRequest._issuePrestageRequests: %s stage request metadata to be updated." % len( stageRequestMetadata ) )
      res = self.stagerClient.insertStageRequest( stageRequestMetadata, self.pinLifetime )
      if not res['OK']:
        gLogger.error( "StageRequest._issuePrestageRequests: Failed to insert stage request metadata.", res['Message'] )
        return res
      res = self.stagerClient.updateReplicaStatus( updatedPfnIDs, 'StageSubmitted' )
      if not res['OK']:
        gLogger.error( "StageRequest._issuePrestageRequests: Failed to insert replica status.", res['Message'] )
    return

  def __sortBySE( self, replicaDict ):

    seReplicas = {}
    replicaIDs = {}
    for replicaID, info in replicaDict.items():
      lfn = info['LFN']
      storageElement = info['SE']
      size = info['Size']
      pfn = info['PFN']
      replicaIDs[replicaID] = {'LFN':lfn, 'PFN':pfn, 'Size':size, 'StorageElement':storageElement}
      if not seReplicas.has_key( storageElement ):
        seReplicas[storageElement] = []
      seReplicas[storageElement].append( replicaID )
    return S_OK( {'SEReplicas':seReplicas, 'AllReplicaInfo':replicaIDs} )

  def __getStagedReplicas( self ):
    """ This obtains the Staged replicas from the Replicas table and for each LFN the requested storage element """
    # First obtain the Staged replicas from the Replicas table
    res = self.stagerClient.getStagedReplicas()
    if not res['OK']:
      gLogger.error( "StageRequest.__getStagedReplicas: Failed to get replicas with Staged status.", res['Message'] )
      return res
    if not res['Value']:
      gLogger.debug( "StageRequest.__getStagedReplicas: No Staged replicas found to process." )
    else:
      gLogger.debug( "StageRequest.__getStagedReplicas: Obtained %s Staged replica(s) to process." % len( res['Value'] ) )

    return self.__sortBySE( res['Value'] )

  def __getWaitingReplicas( self ):
    """ This obtains the Waiting replicas from the Replicas table and for each LFN the requested storage element """
    # First obtain the Waiting replicas from the Replicas table
    res = self.stagerClient.getWaitingReplicas()
    if not res['OK']:
      gLogger.error( "StageRequest.__getWaitingReplicas: Failed to get replicas with Waiting status.", res['Message'] )
      return res
    if not res['Value']:
      gLogger.debug( "StageRequest.__getWaitingReplicas: No Waiting replicas found to process." )
    else:
      gLogger.debug( "StageRequest.__getWaitingReplicas: Obtained %s Waiting replicas(s) to process." % len( res['Value'] ) )

    return self.__sortBySE( res['Value'] )

  def __getOfflineReplicas( self ):
    """ This obtains the Offline replicas from the Replicas table and for each LFN the requested storage element """
    # First obtain the Offline replicas from the Replicas table
    res = self.stagerClient.getOfflineReplicas()
    if not res['OK']:
      gLogger.error( "StageRequest.__getOfflineReplicas: Failed to get replicas with Offline status.", res['Message'] )
      return res
    if not res['Value']:
      gLogger.debug( "StageRequest.__getOfflineReplicas: No Offline replicas found to process." )
    else:
      gLogger.debug( "StageRequest.__getOfflineReplicas: Obtained %s Offline replica(s) to process." % len( res['Value'] ) )

    return self.__sortBySE( res['Value'] )

  def __addAssociatedReplicas( self, replicasToStage, seReplicas, allReplicaInfo ):
    """ Retrieve the list of Replicas that belong to the same Tasks as the provided list
    """
    res = self.stagerClient.getAssociatedReplicas( replicasToStage )
    if not res['OK']:
      gLogger.fatal( "StageRequest.__addAssociatedReplicas: Failed to get associated Replicas.", res['Message'] )
      return res
    addReplicas = {'Offline': {}, 'Waiting': {}}
    replicaIDs = {}
    for replicaID, info in res['Value'].items():
      lfn = info['LFN']
      storageElement = info['SE']
      size = info['Size']
      pfn = info['PFN']
      status = info['Status']
      if status not in ['Waiting', 'Offline']:
        continue
      if not addReplicas[status].has_key( storageElement ):
        addReplicas[status][storageElement] = []
      replicaIDs[replicaID] = {'LFN':lfn, 'PFN':pfn, 'Size':size, 'StorageElement':storageElement }
      addReplicas[status][storageElement].append( replicaID )

    waitingReplicas = addReplicas['Waiting']
    offlineReplicas = addReplicas['Offline']
    newReplicaInfo = replicaIDs
    allReplicaInfo.update( newReplicaInfo )

    # First handle Waiting Replicas for which metadata is to be checked
    for storageElement, seReplicaIDs in waitingReplicas.items():
      for replicaID in list( seReplicaIDs ):
        if replicaID in replicasToStage:
          seReplicaIDs.remove( replicaID )
      res = self.__checkIntegrity( storageElement, seReplicaIDs, allReplicaInfo )
      if not res['OK']:
        gLogger.error( 'StageRequest.__addAssociatedReplicas: Failed to check Replica Metadata', '(%s): %s' % ( storageElement, res['Message'] ) )
      else:
        # keep all Replicas (Online and Offline)
        if not storageElement in seReplicas:
          seReplicas[storageElement] = []
        seReplicas[storageElement].extend( res['Value']['Online'] )
        replicasToStage.extend( res['Value']['Online'] )
        seReplicas[storageElement].extend( res['Value']['Offline'] )
        replicasToStage.extend( res['Value']['Offline'] )

    # Then handle Offline Replicas for which metadata is already checked
    for storageElement, seReplicaIDs in offlineReplicas.items():
      if not storageElement in seReplicas:
        seReplicas[storageElement] = []
      for replicaID in sorted( seReplicaIDs ):
        if replicaID in replicasToStage:
          seReplicaIDs.remove( replicaID )
      seReplicas[storageElement].extend( seReplicaIDs )
      replicasToStage.extend( seReplicaIDs )

    for replicaID in allReplicaInfo.keys():
      if replicaID not in replicasToStage:
        del allReplicaInfo[replicaID]

    totalSize = 0
    for storageElement in sorted( seReplicas.keys() ):
      replicaIDs = seReplicas[storageElement]
      size = 0
      for replicaID in replicaIDs:
        size += self.__add( storageElement, allReplicaInfo[replicaID]['Size'] )

      gLogger.info( 'StageRequest.__addAssociatedReplicas:  Considering %s GB to be staged at %s' % ( size, storageElement ) )
      totalSize += size

    gLogger.info( "StageRequest.__addAssociatedReplicas: Obtained %s GB for staging." % totalSize )

    return S_OK( {'SEReplicas':seReplicas, 'AllReplicaInfo':allReplicaInfo} )

  def __checkIntegrity( self, storageElement, seReplicaIDs, allReplicaInfo ):
    """ Check the integrity of the files to ensure they are available
        Updates status of Offline Replicas for a later pass
        Returns the list of Online replicas to be Staged
    """
    if not seReplicaIDs:
      return S_OK( {'Online': [], 'Offline': []} )

    pfnRepIDs = {}
    for replicaID in seReplicaIDs:
      pfn = allReplicaInfo[replicaID]['PFN']
      pfnRepIDs[pfn] = replicaID

    gLogger.info( "StageRequest.__checkIntegrity: Checking the integrity of %s replicas at %s." % ( len( pfnRepIDs ), storageElement ) )
    res = self.replicaManager.getStorageFileMetadata( pfnRepIDs.keys(), storageElement )
    if not res['OK']:
      gLogger.error( "StageRequest.__checkIntegrity: Completely failed to obtain metadata for replicas.", res['Message'] )
      return res

    terminalReplicaIDs = {}
    onlineReplicaIDs = []
    offlineReplicaIDs = []
    for pfn, metadata in res['Value']['Successful'].items():

      if metadata['Size'] != allReplicaInfo[pfnRepIDs[pfn]]['Size']:
        gLogger.error( "StageRequest.__checkIntegrity: PFN StorageElement size does not match FileCatalog", pfn )
        terminalReplicaIDs[pfnRepIDs[pfn]] = 'PFN StorageElement size does not match FileCatalog'
        pfnRepIDs.pop( pfn )
      elif metadata['Lost']:
        gLogger.error( "StageRequest.__checkIntegrity: PFN has been Lost by the StorageElement", pfn )
        terminalReplicaIDs[pfnRepIDs[pfn]] = 'PFN has been Lost by the StorageElement'
        pfnRepIDs.pop( pfn )
      elif metadata['Unavailable']:
        gLogger.error( "StageRequest.__checkIntegrity: PFN is declared Unavailable by the StorageElement", pfn )
        terminalReplicaIDs[pfnRepIDs[pfn]] = 'PFN is declared Unavailable by the StorageElement'
        pfnRepIDs.pop( pfn )
      else:
        if metadata['Cached']:
          gLogger.verbose( "StageRequest.__checkIntegrity: Cache hit for file." )
          onlineReplicaIDs.append( pfnRepIDs[pfn] )
        else:
          offlineReplicaIDs.append( pfnRepIDs[pfn] )

    for pfn, reason in res['Value']['Failed'].items():
      if re.search( 'File does not exist', reason ):
        gLogger.error( "StageRequest.__checkIntegrity: PFN does not exist in the StorageElement", pfn )
        terminalReplicaIDs[pfnRepIDs[pfn]] = 'PFN does not exist in the StorageElement'
      pfnRepIDs.pop( pfn )

    # Update the states of the replicas in the database #TODO Sent status to integrity DB
    if terminalReplicaIDs:
      gLogger.info( "StageRequest.__checkIntegrity: %s replicas are terminally failed." % len( terminalReplicaIDs ) )
      res = self.stagerClient.updateReplicaFailure( terminalReplicaIDs )
      if not res['OK']:
        gLogger.error( "StageRequest.__checkIntegrity: Failed to update replica failures.", res['Message'] )
    if onlineReplicaIDs:
      gLogger.info( "StageRequest.__checkIntegrity: %s replicas found Online." % len( onlineReplicaIDs ) )
    if offlineReplicaIDs:
      gLogger.info( "StageRequest.__checkIntegrity: %s replicas found Offline." % len( offlineReplicaIDs ) )
      res = self.stagerClient.updateReplicaStatus( offlineReplicaIDs, 'Offline' )
    return S_OK( {'Online': onlineReplicaIDs, 'Offline': offlineReplicaIDs} )

  def __reportProblematicFiles( self, lfns, reason ):
    # NOTE: reporting is currently disabled; the early return below skips the DataIntegrityClient call
    return S_OK()
    res = self.dataIntegrityClient.setFileProblematic( lfns, reason, sourceComponent = 'StageRequestAgent' )
    if not res['OK']:
      gLogger.error( "RequestPreparation.__reportProblematicFiles: Failed to report missing files.", res['Message'] )
      return res
    if res['Value']['Successful']:
      gLogger.info( "RequestPreparation.__reportProblematicFiles: Successfully reported %s missing files." % len( res['Value']['Successful'] ) )
    if res['Value']['Failed']:
      gLogger.info( "RequestPreparation.__reportProblematicFiles: Failed to report %s problematic files." % len( res['Value']['Failed'] ) )
    return res
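
For reference, here is a minimal standalone sketch of the per-SE cache bookkeeping implemented by __usage, __cache and __add above: the disk cache size is read in TB from the configuration, converted to GB and divided into throttling steps, while the pinned volume is accumulated in GB from the replica sizes in bytes. The class name, the THROTTLING_STEPS value and the 1 TB default below are illustrative assumptions, not values taken from DIRAC.

# Minimal sketch (assumed names/values) of the cache accounting used when selecting Offline replicas.
THROTTLING_STEPS = 10  # assumed value; in the agent this is a module-level constant

class SECacheBookkeeping( object ):

  def __init__( self, diskCacheTB = 1. ):
    # diskCacheTB would come from /Resources/StorageElements/<SE>/DiskCacheTB in the CS
    self.diskCacheTB = diskCacheTB
    self.usedGB = 0.

  def cacheGB( self ):
    # TB -> GB, divided into throttling steps, as in __cache()
    return self.diskCacheTB * 1000. / THROTTLING_STEPS

  def add( self, sizeBytes ):
    # bytes -> GB, as in __add()
    sizeGB = sizeBytes / ( 1000 * 1000 * 1000.0 )
    self.usedGB += sizeGB
    return sizeGB

  def hasRoom( self ):
    # same comparison as "__usage( se ) < __cache( se )"
    return self.usedGB < self.cacheGB()

# Usage: stop selecting replicas for an SE once its cache step is full
bookkeeping = SECacheBookkeeping( diskCacheTB = 1. )
for sizeBytes in [ 50e9, 40e9, 30e9 ]:
  if not bookkeeping.hasRoom():
    break
  bookkeeping.add( sizeBytes )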
Example #54
0
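    # Method of a JobScheduling-type executor class (not shown here); besides the
    # executor base class it would need, among others, "import random",
    # getSEsForSite from DIRAC.Core.Utilities.SiteSEMapping, S_OK/S_ERROR from DIRAC,
    # and StorageManagerClient as imported in the earlier examples.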
    def requestStage(self, jobState, candidates, lfnData):
        # Any site is as good as any, so pick one at random
        stageSite = random.sample(candidates, 1)[0]
        self.jobLog.info("Site selected %s for staging" % stageSite)
        result = getSEsForSite(stageSite)
        if not result["OK"]:
            return S_ERROR("Could not determine SEs for site %s" % stageSite)
        siteSEs = result["Value"]

        tapeSEs = []
        diskSEs = []
        for seName in siteSEs:
            result = self.__getSEStatus(seName)
            if not result["OK"]:
                self.jobLog.error("Cannot retrieve SE %s status: %s" % (seName, result["Message"]))
                return S_ERROR("Cannot retrieve SE status")
            seStatus = result["Value"]
            if seStatus["Read"] and seStatus["TapeSE"]:
                tapeSEs.append(seName)
            if seStatus["Read"] and seStatus["DiskSE"]:
                diskSEs.append(seName)

        if not tapeSEs:
            return S_ERROR("No Local SEs for site %s" % stageSite)

        self.jobLog.verbose("Tape SEs are %s" % (", ".join(tapeSEs)))

        stageLFNs = {}
        lfnToStage = []
        for lfn in lfnData:
            replicas = lfnData[lfn]["Replicas"]
            # Check SEs
            seStage = []
            for seName in replicas:
                _surl = replicas[seName]["SURL"]
                if seName in diskSEs:
                    # This lfn is in disk. Skip it
                    seStage = []
                    break
                if seName not in tapeSEs:
                    # This lfn is not in this tape SE. Check next SE
                    continue
                seStage.append(seName)
            for seName in seStage:
                if seName not in stageLFNs:
                    stageLFNs[seName] = []
                stageLFNs[seName].append(lfn)
                if lfn not in lfnToStage:
                    lfnToStage.append(lfn)

        if not stageLFNs:
            return S_ERROR("Cannot find tape replicas")

        # Check if any LFN is in more than one SE
        # If that's the case, stage it from the SE that already has the most LFNs to stage, so the request is grouped per SE
        # 1.- Get the SEs ordered by descending number of LFNs to stage
        sortedSEs = sorted([(len(stageLFNs[seName]), seName) for seName in stageLFNs], reverse=True)
        for lfn in lfnToStage:
            found = False
            # 2.- Traverse the SEs
            for _stageCount, seName in sortedSEs:
                if seName not in stageLFNs:
                    # This SE was already emptied and removed
                    continue
                if lfn in stageLFNs[seName]:
                    # 3.- If first time found, just mark as found. Next time delete the replica from the request
                    if found:
                        stageLFNs[seName].remove(lfn)
                    else:
                        found = True
                # 4.- If the SE has no LFNs left, remove it
                if not stageLFNs[seName]:
                    stageLFNs.pop(seName)

        self.jobLog.info(
            "Stage request will be \n\t%s" % "\n\t".join(["%s:%s" % (lfn, stageLFNs[lfn]) for lfn in stageLFNs])
        )

        stagerClient = StorageManagerClient()
        result = stagerClient.setRequest(
            stageLFNs, "WorkloadManagement", "stageCallback@WorkloadManagement/OptimizationMind", int(jobState.jid)
        )
        if not result["OK"]:
            self.jobLog.error("Could not send stage request: %s" % result["Message"])
            return S_ERROR("Problem sending staging request")

        rid = str(result["Value"])
        self.jobLog.info("Stage request %s sent" % rid)
        jobState.setParameter("StageRequest", rid)
        result = jobState.setStatus(
            self.ex_getOption("StagingStatus", "Staging"),
            self.ex_getOption("StagingMinorStatus", "Request Sent"),
            appStatus="",
            source=self.ex_optimizerName(),
        )
        if not result["OK"]:
            return result

        stageCandidates = []
        for seName in stageLFNs:
            result = self.__getSitesForSE(seName)
            if result["OK"]:
                stageCandidates.append(result["Value"])

        stageCandidates = candidates.intersection(*[sC for sC in stageCandidates]).union([stageSite])
        return S_OK(stageCandidates)
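
For illustration, below is a minimal standalone sketch (with made-up names and sample data) of the deduplication step performed above: when an LFN has tape replicas at several SEs, it is kept only at the SE that already has the most LFNs to stage, so that stage requests are grouped per SE.

# Minimal sketch (assumed function name and sample data) of the per-SE grouping
# done in requestStage(): keep each LFN only at the SE with the most LFNs to stage.
def dedupStageRequest(stageLFNs):
    # stageLFNs: {seName: [lfn, ...]}, possibly listing the same LFN under several SEs
    allLFNs = set(lfn for lfns in stageLFNs.values() for lfn in lfns)
    sortedSEs = sorted(stageLFNs, key=lambda se: len(stageLFNs[se]), reverse=True)
    for lfn in allLFNs:
        found = False
        for seName in sortedSEs:
            if seName not in stageLFNs or lfn not in stageLFNs[seName]:
                continue
            if found:
                # Already kept at a better-populated SE: drop this replica from the request
                stageLFNs[seName].remove(lfn)
                if not stageLFNs[seName]:
                    stageLFNs.pop(seName)
            else:
                found = True
    return stageLFNs

# Usage:
# dedupStageRequest({"SE-A": ["/lfn/1", "/lfn/2"], "SE-B": ["/lfn/2"]})
# -> {"SE-A": ["/lfn/1", "/lfn/2"]}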