Example #1
class RegisterFile(OperationHandlerBase):
    """
  .. class:: RegisterOperation

  RegisterFile operation handler

  :param self: self reference
  :param ~DIRAC.RequestManagementSystem.Client.Operation.Operation operation: Operation instance
  :param str csPath: CS path for this handler

  """
    def __init__(self, operation=None, csPath=None):
        """c'tor

    """
        OperationHandlerBase.__init__(self, operation, csPath)

    def __call__(self):
        """ call me maybe """

        # The flag  'rmsMonitoring' is set by the RequestTask and is False by default.
        # Here we use 'createRMSRecord' to create the ES record which is defined inside OperationHandlerBase.
        if self.rmsMonitoring:
            self.rmsMonitoringReporter = MonitoringReporter(
                monitoringType="RMSMonitoring")
        else:
            # # RegisterFile specific monitor info
            gMonitor.registerActivity("RegisterAtt",
                                      "Attempted file registrations",
                                      "RequestExecutingAgent", "Files/min",
                                      gMonitor.OP_SUM)
            gMonitor.registerActivity("RegisterOK",
                                      "Successful file registrations",
                                      "RequestExecutingAgent", "Files/min",
                                      gMonitor.OP_SUM)
            gMonitor.registerActivity("RegisterFail",
                                      "Failed file registrations",
                                      "RequestExecutingAgent", "Files/min",
                                      gMonitor.OP_SUM)

        # # counter for failed files
        failedFiles = 0
        # # catalog(s) to use
        catalogs = self.operation.Catalog
        if catalogs:
            catalogs = [cat.strip() for cat in catalogs.split(',')]
        dm = DataManager(catalogs=catalogs)
        # # get waiting files
        waitingFiles = self.getWaitingFilesList()

        if self.rmsMonitoring:
            self.rmsMonitoringReporter.addRecord(
                self.createRMSRecord("Attempted", len(waitingFiles)))

        # # loop over files
        for opFile in waitingFiles:

            if not self.rmsMonitoring:
                gMonitor.addMark("RegisterAtt", 1)

            # # get LFN
            lfn = opFile.LFN
            # # and others
            fileTuple = (lfn, opFile.PFN, opFile.Size,
                         self.operation.targetSEList[0], opFile.GUID,
                         opFile.Checksum)
            # # call DataManager
            registerFile = dm.registerFile(fileTuple)
            # # check results
            if not registerFile["OK"] or lfn in registerFile["Value"]["Failed"]:

                if self.rmsMonitoring:
                    self.rmsMonitoringReporter.addRecord(
                        self.createRMSRecord("Failed", 1))
                else:
                    gMonitor.addMark("RegisterFail", 1)
                # self.dataLoggingClient().addFileRecord(
                #     lfn, "RegisterFail", ','.join(catalogs) if catalogs else "all catalogs", "", "RegisterFile")

                reason = str(
                    registerFile.get(
                        "Message",
                        registerFile.get("Value", {}).get("Failed", {}).get(lfn, 'Unknown')))
                errorStr = "failed to register LFN"
                opFile.Error = "%s: %s" % (errorStr, reason)
                if 'GUID already registered' in reason:
                    opFile.Status = 'Failed'
                    self.log.error(errorStr, "%s: %s" % (lfn, reason))
                elif 'File already registered with no replicas' in reason:
                    self.log.warn(
                        errorStr,
                        "%s: %s, will remove it and retry" % (lfn, reason))
                    dm.removeFile(lfn)
                else:
                    self.log.warn(errorStr, "%s: %s" % (lfn, reason))
                failedFiles += 1

            else:

                if self.rmsMonitoring:
                    self.rmsMonitoringReporter.addRecord(
                        self.createRMSRecord("Successful", 1))
                else:
                    gMonitor.addMark("RegisterOK", 1)
                # self.dataLoggingClient().addFileRecord(
                #     lfn, "Register", ','.join(catalogs) if catalogs else "all catalogs", "", "RegisterFile")

                self.log.verbose(
                    "file %s has been registered at %s" %
                    (lfn, ','.join(catalogs) if catalogs else "all catalogs"))
                opFile.Status = "Done"

        if self.rmsMonitoring:
            self.rmsMonitoringReporter.commit()

        # # final check
        if failedFiles:
            self.log.warn("all files processed, %s files failed to register" %
                          failedFiles)
            self.operation.Error = "some files failed to register"
            return S_ERROR(self.operation.Error)

        return S_OK()
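
The result checks above rely on DIRAC's standard return-value convention: every call returns an S_OK/S_ERROR structure, and bulk operations nest per-LFN outcomes under "Successful"/"Failed" inside "Value". A minimal sketch of that convention; the stand-in registration function and the sample LFN are hypothetical, for illustration only:

# Minimal sketch of the S_OK/S_ERROR convention used by the checks above.
# S_OK is the real DIRAC helper; fakeRegisterFile() and the LFN are made up.
from DIRAC import S_OK

def fakeRegisterFile(fileTuple):
    """Hypothetical stand-in for DataManager.registerFile()."""
    lfn = fileTuple[0]
    # Bulk calls report per-LFN outcomes under 'Successful'/'Failed'
    return S_OK({"Successful": {}, "Failed": {lfn: "GUID already registered"}})

lfn = "/vo/user/some.file"
result = fakeRegisterFile((lfn, "PFN", 0, "SomeSE", "GUID", "checksum"))
if not result["OK"] or lfn in result["Value"]["Failed"]:
    print("registration failed: %s" % result["Value"]["Failed"].get(lfn))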
Example #2
class StatesMonitoringAgent( AgentModule ):
  """
      The specific agents must provide the following methods:
      - initialize() for initial settings
      - beginExecution()
      - execute() - the main method called in the agent cycle
      - endExecution()
      - finalize() - graceful exit, usually used before an agent restart
  """

  __summaryKeyFieldsMapping = [ 'Status',
                                'Site',
                                'User',
                                'UserGroup',
                                'JobGroup',
                                'JobType',
                                'ApplicationStatus',
                                'MinorStatus']
  __summaryDefinedFields = [ ( 'ApplicationStatus', 'unset' ), ( 'MinorStatus', 'unset' ) ]
  __summaryValueFieldsMapping = [ 'Jobs',
                                  'Reschedules']
  __renameFieldsMapping = { 'JobType' : 'JobSplitType' }

  __jobDBFields = []
  
  jobDB = None
  monitoringReporter = None
  reportPeriod = None
    
  def initialize( self ):
    """ Standard constructor
    """
    
    self.jobDB = JobDB()
    
    self.reportPeriod = 120
    self.am_setOption( "PollingTime", self.reportPeriod )
    
    self.monitoringReporter = MonitoringReporter( monitoringType = "WMSHistory" )
    
    for field in self.__summaryKeyFieldsMapping:
      if field == 'User':
        field = 'Owner'
      elif field == 'UserGroup':
        field = 'OwnerGroup'
      self.__jobDBFields.append( field )
    
    return S_OK()
   
  def execute( self ):
    """ Main execution method
    """
    result = gConfig.getSections( "/DIRAC/Setups" )
    if not result[ 'OK' ]:
      return result
    validSetups = result[ 'Value' ]
    self.log.info( "Valid setups for this cycle are %s" % ", ".join( validSetups ) )
    # Get the WMS Snapshot!
    result = self.jobDB.getSummarySnapshot( self.__jobDBFields )
    now = Time.dateTime()
    if not result[ 'OK' ]:
      self.log.error( "Can't get the jobdb summary", result[ 'Message' ] )
    else:
      values = result[ 'Value' ][1]
      self.log.info( "Start sending records!" )
      for record in values:
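        # Each snapshot row is (setup, <key field values...>, <aggregated counters...>),
        # ordered as __jobDBFields followed by __summaryValueFieldsMapping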
        recordSetup = record[0]
        if recordSetup not in validSetups:
          self.log.error( "Setup %s is not valid" % recordSetup )
          continue
        record = record[1:]
        rD = {}
        for fV in self.__summaryDefinedFields:
          rD[ fV[0] ] = fV[1]
        for iP in range( len( self.__summaryKeyFieldsMapping ) ):
          fieldName = self.__summaryKeyFieldsMapping[iP]
          rD[ self.__renameFieldsMapping.get( fieldName, fieldName ) ] = record[iP]
        record = record[ len( self.__summaryKeyFieldsMapping ): ]
        for iP in range( len( self.__summaryValueFieldsMapping ) ):
          rD[ self.__summaryValueFieldsMapping[iP] ] = int( record[iP] )
        rD['timestamp'] = int( Time.toEpoch( now ) )       
        self.monitoringReporter.addRecord( rD )
      retVal = self.monitoringReporter.commit()
      if retVal['OK']:
        self.log.info( "The records were successfully sent to the Store!" )
      else:
        self.log.warn( "Failed to insert the records! It will be retried in the next iteration", retVal['Message'] )
        
    return S_OK()
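
The field-flattening loop in execute() turns one snapshot row into a flat record dict keyed by the mapped field names. A standalone sketch of the same logic, with the field lists copied from the class attributes above and a made-up snapshot row (the setup column already stripped):

# Standalone sketch of the record-flattening logic in execute(); the row values are made up.
keyFields = ['Status', 'Site', 'User', 'UserGroup', 'JobGroup', 'JobType',
             'ApplicationStatus', 'MinorStatus']
definedFields = [('ApplicationStatus', 'unset'), ('MinorStatus', 'unset')]
valueFields = ['Jobs', 'Reschedules']
renameFields = {'JobType': 'JobSplitType'}

record = ('Running', 'LCG.CERN.ch', 'jdoe', 'some_group', 'none', 'User',
          'unset', 'unset', 42, 3)
rD = dict(definedFields)              # defaults, overwritten if present in the row
for iP, fieldName in enumerate(keyFields):
    rD[renameFields.get(fieldName, fieldName)] = record[iP]
for iP, fieldName in enumerate(valueFields):
    rD[fieldName] = int(record[len(keyFields) + iP])
print(rD['JobSplitType'], rD['Jobs'], rD['Reschedules'])  # User 42 3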
Example #3
class TornadoServer(object):
    """
    Tornado webserver

    Initialize and run an HTTPS Server for DIRAC services.
    By default it loads all HTTPS services defined in the CS,
    but you can also give an explicit list.

    The listening port is either:

    * Given as parameter
    * Loaded from the CS ``/Systems/Tornado/<instance>/Port``
    * Defaults to 8443


    Example 1: Easy way to start tornado::

      # Initialize server and load services
      serverToLaunch = TornadoServer()

      # Start listening when ready
      serverToLaunch.startTornado()

    Example 2: We want to debug service1 and service2 only, and use another port for that::

      services = ['component/service1:port1', 'component/service2']
      endpoints = ['component/endpoint1', 'component/endpoint2']
      serverToLaunch = TornadoServer(services=services, endpoints=endpoints, port=1234)
      serverToLaunch.startTornado()

    """
    def __init__(self, services=True, endpoints=False, port=None):
        """C'r

        :param list services: (default True) List of service handlers to load.
            If ``True``, loads all described in the CS.
            If ``False``, do not load services.
        :param list endpoints: (default False) List of endpoint handlers to load.
            If ``True``, loads all described in the CS.
            If ``False``, do not load endpoints.
        :param int port: Port to listen to.
            If ``None``, the port is resolved following the logic described in the class documentation
        """
        self.__startTime = time.time()
        # Application metadata, routes and settings, keyed by port
        self.__appsSettings = {}
        # Default port, if another one is not discovered in the CS
        if port is None:
            port = gConfig.getValue(
                "/Systems/Tornado/%s/Port" %
                PathFinder.getSystemInstance("Tornado"), 8443)
        self.port = port

        # Handler manager initialization with default settings
        self.handlerManager = HandlerManager(services, endpoints)

        # temp value for computation, used by the monitoring
        self.__report = None
        # Last update time stamp
        self.__monitorLastStatsUpdate = None
        self.__monitoringLoopDelay = 60  # In secs

        self.activityMonitoring = False
        if "Monitoring" in Operations().getMonitoringBackends(
                monitoringType="ServiceMonitoring"):
            self.activityMonitoring = True
        # If services are defined, load only those (useful for debugging or for specific services)
        retVal = self.handlerManager.loadServicesHandlers()
        if not retVal["OK"]:
            sLog.error(retVal["Message"])
            raise ImportError(
                "Some services can't be loaded, check the service names and configuration."
            )
        # Time spent loading the services; reported to the monitoring as response time
        self.__elapsedTime = time.time() - self.__startTime
        retVal = self.handlerManager.loadEndpointsHandlers()
        if not retVal["OK"]:
            sLog.error(retVal["Message"])
            raise ImportError(
                "Some endpoints can't be loaded, check the endpoint names and configuration."
            )

    def __calculateAppSettings(self):
        """Calculate application information mapping on the ports"""
        # if no service list is given, load services from configuration
        handlerDict = self.handlerManager.getHandlersDict()
        for data in handlerDict.values():
            port = data.get("Port") or self.port
            for hURL in data["URLs"]:
                if port not in self.__appsSettings:
                    self.__appsSettings[port] = {"routes": [], "settings": {}}
                if hURL not in self.__appsSettings[port]["routes"]:
                    self.__appsSettings[port]["routes"].append(hURL)
        return bool(self.__appsSettings)

    def loadServices(self, services):
        """Load a services

        :param services: List of service handlers to load. Default value set at initialization
            If ``True``, loads all services from CS
        :type services: bool or list

        :return: S_OK()/S_ERROR()
        """
        return self.handlerManager.loadServicesHandlers(services)

    def loadEndpoints(self, endpoints):
        """Load a endpoints

        :param endpoints: List of service handlers to load. Default value set at initialization
            If ``True``, loads all endpoints from CS
        :type endpoints: bool or list

        :return: S_OK()/S_ERROR()
        """
        return self.handlerManager.loadEndpointsHandlers(endpoints)

    def addHandlers(self, routes, settings=None, port=None):
        """Add new routes

        :param list routes: routes
        :param dict settings: application settings
        :param int port: port
        """
        port = port or self.port
        if port not in self.__appsSettings:
            self.__appsSettings[port] = {"routes": [], "settings": {}}
        if settings:
            self.__appsSettings[port]["settings"].update(settings)
        for route in routes:
            if route not in self.__appsSettings[port]["routes"]:
                self.__appsSettings[port]["routes"].append(route)

        return S_OK()

    def startTornado(self):
        """
        Starts the tornado server when ready.
        This method never returns.
        """
        # If no services are loaded:
        if not self.__calculateAppSettings():
            raise Exception(
                "No services are loaded, please check your configuration")

        sLog.debug("Starting Tornado")

        # Prepare SSL settings
        certs = Locations.getHostCertificateAndKeyLocation()
        if certs is False:
            sLog.fatal("Host certificates not found! Can't start the server")
            raise ImportError("Unable to load certificates")
        ca = Locations.getCAsLocation()
        ssl_options = {
            "certfile": certs[0],
            "keyfile": certs[1],
            "cert_reqs": M2Crypto.SSL.verify_peer,
            "ca_certs": ca,
            "sslDebug":
            DEBUG_M2CRYPTO,  # Set to true if you want to see the TLS debug messages
        }

        # Init monitoring
        if self.activityMonitoring:
            from DIRAC.MonitoringSystem.Client.MonitoringReporter import MonitoringReporter

            self.activityMonitoringReporter = MonitoringReporter(
                monitoringType="ServiceMonitoring")
            self.__monitorLastStatsUpdate = time.time()
            self.__report = self.__startReportToMonitoringLoop()
            # Response time
            # Starting monitoring, IOLoop waiting time in ms, __monitoringLoopDelay is defined in seconds
            tornado.ioloop.PeriodicCallback(
                lambda: self.__reportToMonitoring(self.__elapsedTime),
                self.__monitoringLoopDelay * 1000).start()

            # If we are running with python3, Tornado will use asyncio,
            # and we have to convince it to let us run in a different thread
            # Doing this ensures a consistent behavior between py2 and py3
            asyncio.set_event_loop_policy(
                tornado.platform.asyncio.AnyThreadEventLoopPolicy())

        for port, app in self.__appsSettings.items():
            sLog.debug(" - %s" % "\n - ".join(
                ["%s = %s" % (k, ssl_options[k]) for k in ssl_options]))

            # Default server configuration
            settings = dict(compress_response=True, cookie_secret="secret")

            # Merge application settings
            settings.update(app["settings"])
            # Start server
            router = Application(app["routes"],
                                 default_handler_class=NotFoundHandler,
                                 **settings)
            server = HTTPServer(router,
                                ssl_options=ssl_options,
                                decompress_request=True)
            try:
                server.listen(int(port))
            except Exception as e:  # pylint: disable=broad-except
                sLog.exception("Exception starting HTTPServer", e)
                raise
            sLog.always("Listening on port %s" % port)

        tornado.ioloop.IOLoop.current().start()

    def __reportToMonitoring(self, responseTime):
        """
        Periodically reports to Monitoring
        """

        # Calculate CPU usage by comparing realtime and cpu time since last report
        percentage = self.__endReportToMonitoringLoop(self.__report[0],
                                                      self.__report[1])
        # Send record to Monitoring
        self.activityMonitoringReporter.addRecord({
            "timestamp": int(TimeUtilities.toEpoch()),
            "Host": Network.getFQDN(),
            "ServiceName": "Tornado",
            "MemoryUsage": self.__report[2],
            "CpuPercentage": percentage,
            "ResponseTime": responseTime,
        })
        self.activityMonitoringReporter.commit()
        # Save memory usage and save realtime/CPU time for next call
        self.__report = self.__startReportToMonitoringLoop()

    def __startReportToMonitoringLoop(self):
        """
        Snapshot of resources to be taken at the beginning
        of a monitoring cycle.
        Also sends memory snapshot to the monitoring.

        This is basically a copy/paste of Service.py

        :returns: tuple ( time.time(), cpuTime, mem )

        """
        now = time.time()  # Used to calculate a delta
        stats = os.times()
        cpuTime = stats[0] + stats[2]
        if now - self.__monitorLastStatsUpdate < 0:
            return (now, cpuTime)
        self.__monitorLastStatsUpdate = now
        # Memory consumption mark (defaults to 0.0 if VmRSS cannot be read)
        mem = 0.0
        membytes = MemStat.VmB("VmRSS:")
        if membytes:
            mem = membytes / (1024.0 * 1024.0)
        return (now, cpuTime, mem)

    def __endReportToMonitoringLoop(self, initialWallTime, initialCPUTime):
        """
        Snapshot of resources to be taken at the end
        of a monitoring cycle.

        This is basically a copy/paste of Service.py

        Determines CPU usage by comparing walltime and cputime, and returns the percentage
        """
        wallTime = time.time() - initialWallTime
        stats = os.times()
        cpuTime = stats[0] + stats[2] - initialCPUTime
        percentage = cpuTime / wallTime * 100.0
        return percentage
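
The two monitoring-loop helpers above boil down to a standard walltime-versus-CPU-time comparison. A self-contained illustration of the same computation, using only the standard library:

# Self-contained sketch of the CPU-percentage computation used by
# __startReportToMonitoringLoop / __endReportToMonitoringLoop.
import os
import time

startWall = time.time()
stats = os.times()
startCPU = stats[0] + stats[2]  # user CPU time of the process + its children

sum(i * i for i in range(10 ** 6))  # some work to measure

wallTime = time.time() - startWall
stats = os.times()
cpuTime = stats[0] + stats[2] - startCPU
print("CPU usage: %.1f%%" % (cpuTime / wallTime * 100.0))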
Example #4
class RequestExecutingAgent(AgentModule):
    """
    .. class:: RequestExecutingAgent

    request processing agent using ProcessPool, Operation handlers and RequestTask
    """

    # # process pool
    __processPool = None
    # # request cache
    __requestCache = {}
    # # requests/cycle
    __requestsPerCycle = 100
    # # minimal nb of subprocess running
    __minProcess = 20
    # # maximal nb of subprocess executed same time
    __maxProcess = 20
    # # ProcessPool queue size
    __queueSize = 20
    # # file timeout
    __fileTimeout = 300
    # # operation timeout
    __operationTimeout = 300
    # # ProcessPool finalization timeout
    __poolTimeout = 900
    # # ProcessPool sleep time
    __poolSleep = 5
    # # placeholder for RequestClient instance
    __requestClient = None
    # # Size of the bulk when using getRequests. If 0, use getRequest
    __bulkRequest = 0
    # # Send the monitoring data to ES rather than the Framework/Monitoring
    __rmsMonitoring = False

    def __init__(self, *args, **kwargs):
        """c'tor"""
        # # call base class ctor
        AgentModule.__init__(self, *args, **kwargs)
        # # ProcessPool related stuff
        self.__requestsPerCycle = self.am_getOption("RequestsPerCycle", self.__requestsPerCycle)
        self.log.info("Requests/cycle = %d" % self.__requestsPerCycle)
        self.__minProcess = self.am_getOption("MinProcess", self.__minProcess)
        self.log.info("ProcessPool min process = %d" % self.__minProcess)
        self.__maxProcess = self.am_getOption("MaxProcess", self.__maxProcess)
        self.log.info("ProcessPool max process = %d" % self.__maxProcess)
        self.__queueSize = self.am_getOption("ProcessPoolQueueSize", self.__queueSize)
        self.log.info("ProcessPool queue size = %d" % self.__queueSize)
        self.__poolTimeout = int(self.am_getOption("ProcessPoolTimeout", self.__poolTimeout))
        self.log.info("ProcessPool timeout = %d seconds" % self.__poolTimeout)
        self.__poolSleep = int(self.am_getOption("ProcessPoolSleep", self.__poolSleep))
        self.log.info("ProcessPool sleep time = %d seconds" % self.__poolSleep)
        self.__bulkRequest = self.am_getOption("BulkRequest", self.__bulkRequest)
        self.log.info("Bulk request size = %d" % self.__bulkRequest)
        self.__rmsMonitoring = self.am_getOption("EnableRMSMonitoring", self.__rmsMonitoring)
        self.log.info("Enable ES RMS Monitoring = %s" % self.__rmsMonitoring)

        # # keep config path and agent name
        self.agentName = self.am_getModuleParam("fullName")
        self.__configPath = PathFinder.getAgentSection(self.agentName)

        # # operation handlers over here
        opHandlersPath = "%s/%s" % (self.__configPath, "OperationHandlers")
        opHandlers = gConfig.getSections(opHandlersPath)
        if not opHandlers["OK"]:
            self.log.error(opHandlers["Message"])
            raise AgentConfigError("OperationHandlers section not found in CS under %s" % self.__configPath)
        opHandlers = opHandlers["Value"]

        self.timeOuts = dict()

        # # handlers dict
        self.handlersDict = dict()
        for opHandler in opHandlers:
            opHandlerPath = "%s/%s/Location" % (opHandlersPath, opHandler)
            opLocation = gConfig.getValue(opHandlerPath, "")
            if not opLocation:
                self.log.error("%s not set for %s operation handler" % (opHandlerPath, opHandler))
                continue
            self.timeOuts[opHandler] = {"PerFile": self.__fileTimeout, "PerOperation": self.__operationTimeout}

            opTimeout = gConfig.getValue("%s/%s/TimeOut" % (opHandlersPath, opHandler), 0)
            if opTimeout:
                self.timeOuts[opHandler]["PerOperation"] = opTimeout
            fileTimeout = gConfig.getValue("%s/%s/TimeOutPerFile" % (opHandlersPath, opHandler), 0)
            if fileTimeout:
                self.timeOuts[opHandler]["PerFile"] = fileTimeout

            self.handlersDict[opHandler] = opLocation

        self.log.info("Operation handlers:")
        for i, (opHandler, opLocation) in enumerate(self.handlersDict.items()):
            self.log.info(
                "[%s] %s: %s (timeout: %d s + %d s per file)"
                % (
                    i,
                    opHandler,
                    opLocation,
                    self.timeOuts[opHandler]["PerOperation"],
                    self.timeOuts[opHandler]["PerFile"],
                )
            )

        if self.__rmsMonitoring:
            self.rmsMonitoringReporter = MonitoringReporter(monitoringType="RMSMonitoring")
            gThreadScheduler.addPeriodicTask(100, self.__rmsMonitoringReporting)
        else:
            # # common monitor activity
            gMonitor.registerActivity("Iteration", "Agent Loops", "RequestExecutingAgent", "Loops/min", gMonitor.OP_SUM)
            gMonitor.registerActivity(
                "Processed", "Request Processed", "RequestExecutingAgent", "Requests/min", gMonitor.OP_SUM
            )
            gMonitor.registerActivity(
                "Done", "Request Completed", "RequestExecutingAgent", "Requests/min", gMonitor.OP_SUM
            )

        # # create request dict
        self.__requestCache = dict()

        # ?? Probably should be removed
        self.FTSMode = self.am_getOption("FTSMode", False)

    def processPool(self):
        """facade for ProcessPool"""
        if not self.__processPool:
            minProcess = max(1, self.__minProcess)
            maxProcess = max(self.__minProcess, self.__maxProcess)
            queueSize = abs(self.__queueSize)
            self.log.info(
                "REA ProcessPool configuration",
                "minProcess = %d maxProcess = %d queueSize = %d" % (minProcess, maxProcess, queueSize),
            )
            self.__processPool = ProcessPool(
                minProcess,
                maxProcess,
                queueSize,
                poolCallback=self.resultCallback,
                poolExceptionCallback=self.exceptionCallback,
            )
            self.__processPool.daemonize()
        return self.__processPool

    def requestClient(self):
        """RequestClient getter"""
        if not self.__requestClient:
            self.__requestClient = ReqClient()
        return self.__requestClient

    def cacheRequest(self, request):
        """put request into requestCache

        :param ~Request.Request request: Request instance
        """
        maxProcess = max(self.__minProcess, self.__maxProcess)
        if len(self.__requestCache) > maxProcess + 50:
            # For the time being we just print a warning... If the ProcessPool is working well, this is not needed
            # We don't know how much is acceptable as it depends on many factors
            self.log.warn("Too many requests in cache", ": %d" % len(self.__requestCache))
        #      return S_ERROR( "Too many requests in cache" )
        if request.RequestID in self.__requestCache:
            # We don't call  putRequest as we have got back the request that is still being executed. Better keep it
            # The main reason for this is that it lasted longer than the kick time of CleanReqAgent
            self.log.warn(
                "Duplicate request, keep it but don't execute", ": %d/%s" % (request.RequestID, request.RequestName)
            )
            return S_ERROR(errno.EALREADY, "Request already in cache")
        self.__requestCache[request.RequestID] = request
        return S_OK()

    def putRequest(self, requestID, taskResult=None):
        """put back :requestID: to RequestClient

        :param str requestID: request's id
        """
        if requestID in self.__requestCache:
            request = self.__requestCache.pop(requestID)
            if taskResult:
                if taskResult["OK"]:
                    request = taskResult["Value"]
                    # The RequestTask is putting back the Done tasks, no need to redo it
                    if request.Status == "Done":
                        return S_OK()
                # In case of timeout, we need to increment ourselves all the attempts
                elif cmpError(taskResult, errno.ETIME):
                    waitingOp = request.getWaiting()
                    for rmsFile in waitingOp.get("Value", []):
                        rmsFile.Attempt += 1

            reset = self.requestClient().putRequest(request, useFailoverProxy=False, retryMainService=2)
            if not reset["OK"]:
                return S_ERROR("putRequest: unable to reset request %s: %s" % (requestID, reset["Message"]))
        else:
            return S_ERROR("Not in cache")
        return S_OK()

    def putAllRequests(self):
        """put back all requests without callback called into requestClient

        :param self: self reference
        """
        self.log.info("putAllRequests: will put back requests", "%s" % len(self.__requestCache))
        for requestID in self.__requestCache.keys():
            reset = self.putRequest(requestID)
            if not reset["OK"]:
                self.log.error("Failed to put request", reset["Message"])
            else:
                self.log.debug("putAllRequests: request %s has been put back with its initial state" % requestID)
        return S_OK()

    def initialize(self):
        """initialize agent"""
        return S_OK()

    def execute(self):
        """read requests from RequestClient and enqueue them into ProcessPool"""
        if not self.__rmsMonitoring:
            gMonitor.addMark("Iteration", 1)
        # # requests (and so tasks) counter
        taskCounter = 0
        while taskCounter < self.__requestsPerCycle:
            self.log.debug("execute: executing %d requests in this cycle" % taskCounter)

            requestsToExecute = []

            if not self.__bulkRequest:
                self.log.info("execute: ask for a single request")
                getRequest = self.requestClient().getRequest()
                if not getRequest["OK"]:
                    self.log.error("execute:", "%s" % getRequest["Message"])
                    break
                if not getRequest["Value"]:
                    self.log.info("execute: no more 'Waiting' requests to process")
                    break
                requestsToExecute = [getRequest["Value"]]
            else:
                numberOfRequest = min(self.__bulkRequest, self.__requestsPerCycle - taskCounter)
                self.log.info("execute: ask for requests", "%s" % numberOfRequest)
                getRequests = self.requestClient().getBulkRequests(numberOfRequest)
                if not getRequests["OK"]:
                    self.log.error("execute:", "%s" % getRequests["Message"])
                    break
                if not getRequests["Value"]:
                    self.log.info("execute: no more 'Waiting' requests to process")
                    break
                for rId in getRequests["Value"]["Failed"]:
                    self.log.error("execute:", "%s" % getRequests["Value"]["Failed"][rId])

                requestsToExecute = list(getRequests["Value"]["Successful"].values())

            self.log.info("execute: will execute requests ", "%s" % len(requestsToExecute))

            for request in requestsToExecute:
                # # set task id
                taskID = request.RequestID

                self.log.info(
                    "processPool status",
                    "tasks idle = %s working = %s"
                    % (self.processPool().getNumIdleProcesses(), self.processPool().getNumWorkingProcesses()),
                )

                looping = 0
                while True:
                    if not self.processPool().getFreeSlots():
                        if not looping:
                            self.log.info(
                                "No free slots available in processPool",
                                "will wait %d seconds to proceed" % self.__poolSleep,
                            )
                        time.sleep(self.__poolSleep)
                        looping += 1
                    else:
                        if looping:
                            self.log.info("Free slot found", "after %d seconds" % looping * self.__poolSleep)
                        looping = 0
                        # # save current request in cache
                        res = self.cacheRequest(request)
                        if not res["OK"]:
                            if cmpError(res, errno.EALREADY):
                                # The request is already in the cache; break out of the while loop to get the next request
                                break
                            # There are too many requests in the cache: put everything back and abort the cycle
                            self.log.error(
                                "Too many requests in cache",
                                "(%d requests): put back all requests and exit cycle. Error %s"
                                % (len(self.__requestCache), res["Message"]),
                            )
                            self.putAllRequests()
                            return res
                        # # serialize to JSON
                        result = request.toJSON()
                        if not result["OK"]:
                            continue
                        requestJSON = result["Value"]
                        self.log.info("spawning task for request", "'%s/%s'" % (request.RequestID, request.RequestName))
                        timeOut = self.getTimeout(request)
                        enqueue = self.processPool().createAndQueueTask(
                            RequestTask,
                            kwargs={
                                "requestJSON": requestJSON,
                                "handlersDict": self.handlersDict,
                                "csPath": self.__configPath,
                                "agentName": self.agentName,
                                "rmsMonitoring": self.__rmsMonitoring,
                            },
                            taskID=taskID,
                            blocking=True,
                            usePoolCallbacks=True,
                            timeOut=timeOut,
                        )
                        if not enqueue["OK"]:
                            self.log.error("Could not enqueue task", enqueue["Message"])
                        else:
                            self.log.debug("successfully enqueued task", "'%s'" % taskID)
                            # # update monitor
                            if self.__rmsMonitoring:
                                self.rmsMonitoringReporter.addRecord(
                                    {
                                        "timestamp": int(Time.toEpoch()),
                                        "host": Network.getFQDN(),
                                        "objectType": "Request",
                                        "status": "Attempted",
                                        "objectID": request.RequestID,
                                        "nbObject": 1,
                                    }
                                )
                            else:
                                gMonitor.addMark("Processed", 1)

                            # # update request counter
                            taskCounter += 1
                            # # task created, a little time kick to proceed
                            time.sleep(0.1)
                            break

        self.log.info("Flushing callbacks", "(%d requests still in cache)" % len(self.__requestCache))
        processed = self.processPool().processResults()
        # This happens when the result queue is screwed up.
        # Returning S_ERROR proved not to be sufficient,
        # and when in this situation, there is nothing we can do.
        # So we just exit. runit will restart from scratch.
        if processed < 0:
            self.log.fatal("Results queue is screwed up")
            sys.exit(1)
        # # clean return
        return S_OK()

    def getTimeout(self, request):
        """get timeout for request"""
        timeout = 0
        for op in request:
            if op.Status not in ("Waiting", "Scheduled", "Queued"):
                continue
            if op.Type not in self.timeOuts:
                timeout += self.__operationTimeout
            else:
                perOp = self.timeOuts[op.Type].get("PerOperation", self.__operationTimeout)
                perFiles = self.timeOuts[op.Type].get("PerFile", self.__fileTimeout) * len(op)
                timeout += perOp + perFiles
        self.log.info(
            "estimated timeOut for request", "(%s/%s) is %s" % (request.RequestID, request.RequestName, timeout)
        )
        return timeout

    def finalize(self):
        """agent finalization"""
        if self.__processPool:
            self.processPool().finalize(timeout=self.__poolTimeout)
        self.putAllRequests()
        return S_OK()

    def resultCallback(self, taskID, taskResult):
        """definition of request callback function

        :param str taskID: Request.RequestID
        :param dict taskResult: task result S_OK(Request)/S_ERROR(Message)
        """
        # # clean cache
        res = self.putRequest(taskID, taskResult)
        self.log.info(
            "callback:",
            "%s result is %s(%s), put %s(%s)"
            % (
                taskID,
                "S_OK" if taskResult["OK"] else "S_ERROR",
                taskResult["Value"].Status if taskResult["OK"] else taskResult["Message"],
                "S_OK" if res["OK"] else "S_ERROR",
                "" if res["OK"] else res["Message"],
            ),
        )

    def exceptionCallback(self, taskID, taskException):
        """definition of exception callback function

        :param str taskID: Request.RequestID
        :param Exception taskException: Exception instance
        """
        self.log.error("exceptionCallback:", "%s was hit by exception %s" % (taskID, taskException))
        self.putRequest(taskID)

    def __rmsMonitoringReporting(self):
        """This method is called by the ThreadScheduler as a periodic task in order to commit the collected data which
        is done by the MonitoringReporter and is send to the 'RMSMonitoring' type.
        :return: True / False
        """
        result = self.rmsMonitoringReporter.commit()
        return result["OK"]
Example #5
class Service(object):

    SVC_VALID_ACTIONS = {
        "RPC": "export",
        "FileTransfer": "transfer",
        "Message": "msg",
        "Connection": "Message"
    }
    SVC_SECLOG_CLIENT = SecurityLogClient()

    def __init__(self, serviceData):
        """
        Init the variables for the service

        :param serviceData: dict with modName, standalone, loadName, moduleObj, classObj. e.g.:
          {'modName': 'Framework/serviceName',
          'standalone': True,
          'loadName': 'Framework/serviceName',
          'moduleObj': <module 'serviceNameHandler' from '/home/DIRAC/FrameworkSystem/Service/serviceNameHandler.pyo'>,
          'classObj': <class 'serviceNameHandler.serviceHandler'>}

        """
        self._svcData = serviceData
        self._name = serviceData["modName"]
        self._startTime = datetime.datetime.utcnow()
        self._validNames = [serviceData["modName"]]
        if serviceData["loadName"] not in self._validNames:
            self._validNames.append(serviceData["loadName"])
        self._cfg = ServiceConfiguration(list(self._validNames))
        self._standalone = serviceData["standalone"]
        self.__monitorLastStatsUpdate = time.time()
        self._stats = {"queries": 0, "connections": 0}
        self._authMgr = AuthManager(
            "%s/Authorization" %
            PathFinder.getServiceSection(serviceData["loadName"]))
        self._transportPool = getGlobalTransportPool()
        self.__cloneId = 0
        self.__maxFD = 0
        self.activityMonitoring = False
        # Check if monitoring is enabled
        if "Monitoring" in Operations().getMonitoringBackends(
                monitoringType="ServiceMonitoring"):
            self.activityMonitoring = True

    def setCloneProcessId(self, cloneId):
        self.__cloneId = cloneId

    def _isMetaAction(self, action):
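        """Return the action a meta action refers to (e.g. ``Connection`` -> ``Message``),
        or False if ``action`` is not a meta action."""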
        referedAction = Service.SVC_VALID_ACTIONS[action]
        if referedAction in Service.SVC_VALID_ACTIONS:
            return referedAction
        return False

    def initialize(self):
        # Build the URLs
        self._url = self._cfg.getURL()
        if not self._url:
            return S_ERROR("Could not build service URL for %s" % self._name)
        gLogger.verbose("Service URL is %s" % self._url)
        # Load handler
        result = self._loadHandlerInit()
        if not result["OK"]:
            return result
        self._handler = result["Value"]
        # Initialize lock manager
        self._lockManager = LockManager(self._cfg.getMaxWaitingPetitions())
        self._threadPool = ThreadPoolExecutor(max(0,
                                                  self._cfg.getMaxThreads()))
        self._msgBroker = MessageBroker("%sMSB" % self._name,
                                        threadPool=self._threadPool)
        # Create static dict
        self._serviceInfoDict = {
            "serviceName": self._name,
            "serviceSectionPath": PathFinder.getServiceSection(self._name),
            "URL": self._cfg.getURL(),
            "messageSender": MessageSender(self._name, self._msgBroker),
            "validNames": self._validNames,
            "csPaths": [PathFinder.getServiceSection(svcName) for svcName in self._validNames],
        }
        self.securityLogging = Operations().getValue(
            "EnableSecurityLogging", True) and getServiceOption(
                self._serviceInfoDict, "EnableSecurityLogging", True)
        # Initialize Monitoring
        # The import needs to be here because the CS must be initialized before importing
        # this class (see https://github.com/DIRACGrid/DIRAC/issues/4793)
        from DIRAC.MonitoringSystem.Client.MonitoringReporter import MonitoringReporter

        self.activityMonitoringReporter = MonitoringReporter(
            monitoringType="ServiceMonitoring")

        self._initMonitoring()
        # Call static initialization function
        try:
            self._handler["class"]._rh__initializeClass(
                dict(self._serviceInfoDict), self._lockManager,
                self._msgBroker, self.activityMonitoringReporter)
            if self._handler["init"]:
                for initFunc in self._handler["init"]:
                    gLogger.verbose("Executing initialization function")
                    try:
                        result = initFunc(dict(self._serviceInfoDict))
                    except Exception as excp:
                        gLogger.exception(
                            "Exception while calling initialization function",
                            lException=excp)
                        return S_ERROR(
                            "Exception while calling initialization function: %s"
                            % str(excp))
                    if not isReturnStructure(result):
                        return S_ERROR(
                            "Service initialization function %s must return S_OK/S_ERROR"
                            % initFunc)
                    if not result["OK"]:
                        return S_ERROR("Error while initializing %s: %s" %
                                       (self._name, result["Message"]))
        except Exception as e:
            errMsg = "Exception while initializing %s" % self._name
            gLogger.exception(e)
            gLogger.exception(errMsg)
            return S_ERROR(errMsg)
        if self.activityMonitoring:
            gThreadScheduler.addPeriodicTask(30, self.__reportActivity)
            gThreadScheduler.addPeriodicTask(
                100, self.__activityMonitoringReporting)

        # Load actions after the handler has initialized itself
        result = self._loadActions()
        if not result["OK"]:
            return result
        self._actions = result["Value"]

        return S_OK()

    def __searchInitFunctions(self, handlerClass, currentClass=None):
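        """Recursively collect the ``initializeHandler`` methods of the handler
        class and its ancestors, ordered so that ancestor initializers come first."""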
        if not currentClass:
            currentClass = handlerClass
        initFuncs = []
        ancestorHasInit = False
        for ancestor in currentClass.__bases__:
            initFuncs += self.__searchInitFunctions(handlerClass, ancestor)
            if "initializeHandler" in dir(ancestor):
                ancestorHasInit = True
        if ancestorHasInit:
            initFuncs.append(
                super(currentClass, handlerClass).initializeHandler)
        if currentClass == handlerClass and "initializeHandler" in dir(
                handlerClass):
            initFuncs.append(handlerClass.initializeHandler)
        return initFuncs

    def _loadHandlerInit(self):
        handlerClass = self._svcData["classObj"]
        handlerName = handlerClass.__name__
        handlerInitMethods = self.__searchInitFunctions(handlerClass)
        try:
            handlerInitMethods.append(
                getattr(self._svcData["moduleObj"],
                        "initialize%s" % handlerName))
        except AttributeError:
            gLogger.verbose(
                "No global initialization function found for the service")

        if handlerInitMethods:
            gLogger.info("Found %s initialization methods" %
                         len(handlerInitMethods))

        handlerInfo = {}
        handlerInfo["name"] = handlerName
        handlerInfo["module"] = self._svcData["moduleObj"]
        handlerInfo["class"] = handlerClass
        handlerInfo["init"] = handlerInitMethods

        return S_OK(handlerInfo)

    def _loadActions(self):

        handlerClass = self._handler["class"]

        authRules = {}
        typeCheck = {}
        methodsList = {}
        for actionType in Service.SVC_VALID_ACTIONS:
            if self._isMetaAction(actionType):
                continue
            authRules[actionType] = {}
            typeCheck[actionType] = {}
            methodsList[actionType] = []
        handlerAttributeList = dir(handlerClass)
        for actionType in Service.SVC_VALID_ACTIONS:
            if self._isMetaAction(actionType):
                continue
            methodPrefix = "%s_" % Service.SVC_VALID_ACTIONS[actionType]
            for attribute in handlerAttributeList:
                if attribute.find(methodPrefix) != 0:
                    continue
                exportedName = attribute[len(methodPrefix):]
                methodsList[actionType].append(exportedName)
                gLogger.verbose("+ Found %s method %s" %
                                (actionType, exportedName))
                # Create lock for method
                self._lockManager.createLock(
                    "%s/%s" % (actionType, exportedName),
                    self._cfg.getMaxThreadsForMethod(actionType, exportedName))
                # Look for type and auth rules
                if actionType == "RPC":
                    typeAttr = "types_%s" % exportedName
                    authAttr = "auth_%s" % exportedName
                else:
                    typeAttr = "types_%s_%s" % (
                        Service.SVC_VALID_ACTIONS[actionType], exportedName)
                    authAttr = "auth_%s_%s" % (
                        Service.SVC_VALID_ACTIONS[actionType], exportedName)
                if typeAttr in handlerAttributeList:
                    obj = getattr(handlerClass, typeAttr)
                    gLogger.verbose("|- Found type definition %s: %s" %
                                    (typeAttr, str(obj)))
                    typeCheck[actionType][exportedName] = obj
                if authAttr in handlerAttributeList:
                    obj = getattr(handlerClass, authAttr)
                    gLogger.verbose("|- Found auth rules %s: %s" %
                                    (authAttr, str(obj)))
                    authRules[actionType][exportedName] = obj

        for actionType in Service.SVC_VALID_ACTIONS:
            referedAction = self._isMetaAction(actionType)
            if not referedAction:
                continue
            gLogger.verbose("Action %s is a meta action for %s" %
                            (actionType, referedAction))
            authRules[actionType] = []
            for method in authRules[referedAction]:
                for prop in authRules[referedAction][method]:
                    if prop not in authRules[actionType]:
                        authRules[actionType].append(prop)
            gLogger.verbose("Meta action %s props are %s" %
                            (actionType, authRules[actionType]))

        return S_OK({
            "methods": methodsList,
            "auth": authRules,
            "types": typeCheck
        })

    def _initMonitoring(self):
        props = [("__doc__", "description")]
        for prop in props:
            try:
                value = getattr(self._handler["module"], prop[0])
            except Exception as e:
                gLogger.exception(e)
                gLogger.error("Missing property", prop[0])
                value = "unset"

        for secondaryName in self._cfg.registerAlsoAs():
            gLogger.info("Registering %s also as %s" %
                         (self._name, secondaryName))
            self._validNames.append(secondaryName)
        return S_OK()

    def __reportActivity(self):
        initialWallTime, initialCPUTime, mem = self.__startReportToMonitoring()
        pendingQueries = self._threadPool._work_queue.qsize()
        activeQueries = len(self._threadPool._threads)
        percentage = self.__endReportToMonitoring(initialWallTime,
                                                  initialCPUTime)
        self.activityMonitoringReporter.addRecord({
            "timestamp": int(TimeUtilities.toEpoch()),
            "Host": Network.getFQDN(),
            "ServiceName": "_".join(self._name.split("/")),
            "Location": self._cfg.getURL(),
            "MemoryUsage": mem,
            "CpuPercentage": percentage,
            "PendingQueries": pendingQueries,
            "ActiveQueries": activeQueries,
            "RunningThreads": threading.activeCount(),
            "MaxFD": self.__maxFD,
        })
        self.__maxFD = 0

    def getConfig(self):
        return self._cfg

    # End of initialization functions

    def handleConnection(self, clientTransport):
        """
        This method may be called by ServiceReactor.
        The method stacks opened connections in a queue; another thread
        reads this queue and handles the connections.

        :param clientTransport: Object which describes the opened connection (PlainTransport or SSLTransport)
        """
        if not self.activityMonitoring:
            self._stats["connections"] += 1
        self._threadPool.submit(self._processInThread, clientTransport)

    @property
    def wantsThrottle(self):
        """Boolean property for if the service wants requests to stop being accepted"""
        nQueued = self._threadPool._work_queue.qsize()
        return nQueued > self._cfg.getMaxWaitingPetitions()

    # Threaded process function
    def _processInThread(self, clientTransport):
        """
        This method handles an RPC, FileTransfer or Connection.
        Connections may be opened via ServiceReactor.__acceptIncomingConnection


        - Do the SSL/TLS handshake (if DIPS is used) and extract credentials
        - Get the action called by the client
        - Check if the client is authorized to perform the action
          - If not, the connection is closed
        - Instantiate the RequestHandler (the RequestHandler contains all callable methods)

        (The following is not directly in this method but it describes what happens at
        #Execute the action)
        - Notify the client we're ready to execute the action (via _processProposal)
          and call RequestHandler._rh_executeAction()
        - Receive arguments/file/something else (depending on the action) in the RequestHandler
        - Execute the action asked by the client

        :param clientTransport: Object which describes the opened connection (SSLTransport or PlainTransport)

        :return: S_OK with "closeTransport", a boolean indicating whether the connection has to be closed,
                e.g. after an RPC, closeTransport=True

        """
        self.__maxFD = max(self.__maxFD, clientTransport.oSocket.fileno())
        self._lockManager.lockGlobal()
        try:
            monReport = self.__startReportToMonitoring()
        except Exception:
            monReport = False
        try:
            # Handshake
            try:
                result = clientTransport.handshake()
                if not result["OK"]:
                    clientTransport.close()
                    return
            except Exception:
                return
            # Add to the transport pool
            trid = self._transportPool.add(clientTransport)
            if not trid:
                return
            # Receive and check proposal
            result = self._receiveAndCheckProposal(trid)
            if not result["OK"]:
                self._transportPool.sendAndClose(trid, result)
                return
            proposalTuple = result["Value"]
            # Instantiate handler
            result = self._instantiateHandler(trid, proposalTuple)
            if not result["OK"]:
                self._transportPool.sendAndClose(trid, result)
                return
            handlerObj = result["Value"]
            # Execute the action
            result = self._processProposal(trid, proposalTuple, handlerObj)
            # Close the connection if required
            if result["closeTransport"] or not result["OK"]:
                if not result["OK"]:
                    gLogger.error("Error processing proposal",
                                  result["Message"])
                self._transportPool.close(trid)
            return result
        finally:
            self._lockManager.unlockGlobal()
            if monReport:
                self.__endReportToMonitoring(monReport[0], monReport[1])

    @staticmethod
    def _createIdentityString(credDict, clientTransport=None):
        if "username" in credDict:
            if "group" in credDict:
                identity = "[%s:%s]" % (credDict["username"],
                                        credDict["group"])
            else:
                identity = "[%s:unknown]" % credDict["username"]
        else:
            identity = "unknown"
        if clientTransport:
            addr = clientTransport.getRemoteAddress()
            if addr:
                addr = "{%s:%s}" % (addr[0], addr[1])
        if "DN" in credDict:
            identity += "(%s)" % credDict["DN"]
        return identity

    @staticmethod
    def _deserializeProposalTuple(serializedProposal):
        """We receive the proposalTuple as a list.
        Turn it into a tuple again
        """
        proposalTuple = tuple(
            tuple(x) if isinstance(x, list) else x for x in serializedProposal)
        return proposalTuple

    def _receiveAndCheckProposal(self, trid):
        clientTransport = self._transportPool.get(trid)
        # Get the peer credentials
        credDict = clientTransport.getConnectingCredentials()
        # Receive the action proposal
        retVal = clientTransport.receiveData(1024)
        if not retVal["OK"]:
            gLogger.error(
                "Invalid action proposal",
                "%s %s" % (self._createIdentityString(
                    credDict, clientTransport), retVal["Message"]),
            )
            return S_ERROR("Invalid action proposal")
        proposalTuple = Service._deserializeProposalTuple(retVal["Value"])
        gLogger.debug("Received action from client",
                      "/".join(list(proposalTuple[1])))
        # Check if there are extra credentials
        if proposalTuple[2]:
            clientTransport.setExtraCredentials(proposalTuple[2])
        # Check if this is the requested service
        requestedService = proposalTuple[0][0]
        if requestedService not in self._validNames:
            return S_ERROR("%s is not up in this server" % requestedService)
        # Check if the action is valid
        requestedActionType = proposalTuple[1][0]
        if requestedActionType not in Service.SVC_VALID_ACTIONS:
            return S_ERROR("%s is not a known action type" %
                           requestedActionType)
        # Check if it's authorized
        result = self._authorizeProposal(proposalTuple[1], trid, credDict)
        if not result["OK"]:
            return result
        # Proposal is OK
        return S_OK(proposalTuple)

    def _authorizeProposal(self, actionTuple, trid, credDict):
        # Find CS path for the Auth rules
        referedAction = self._isMetaAction(actionTuple[0])
        if referedAction:
            csAuthPath = "%s/Default" % actionTuple[0]
            hardcodedMethodAuth = self._actions["auth"][actionTuple[0]]
        else:
            if actionTuple[0] == "RPC":
                csAuthPath = actionTuple[1]
            else:
                csAuthPath = "/".join(actionTuple)
            # Find if there are hardcoded auth rules in the code
            hardcodedMethodAuth = False
            if actionTuple[0] in self._actions["auth"]:
                hardcodedRulesByType = self._actions["auth"][actionTuple[0]]
                if actionTuple[0] == "FileTransfer":
                    methodName = actionTuple[1][0].lower() + actionTuple[1][1:]
                else:
                    methodName = actionTuple[1]

                if methodName in hardcodedRulesByType:
                    hardcodedMethodAuth = hardcodedRulesByType[methodName]
        # Auth time!
        if not self._authMgr.authQuery(csAuthPath, credDict,
                                       hardcodedMethodAuth):
            # Get the identity string
            identity = self._createIdentityString(credDict)
            fromHost = "unknown host"
            tr = self._transportPool.get(trid)
            if tr:
                fromHost = "/".join(
                    [str(item) for item in tr.getRemoteAddress()])
            gLogger.warn(
                "Unauthorized query", "to %s:%s by %s from %s" %
                (self._name, "/".join(actionTuple), identity, fromHost))
            result = S_ERROR(ENOAUTH, "Unauthorized query")
        else:
            result = S_OK()

        # Security log
        tr = self._transportPool.get(trid)
        if not tr:
            return S_ERROR("Client disconnected")
        sourceAddress = tr.getRemoteAddress()
        identity = self._createIdentityString(credDict)
        if self.securityLogging:
            Service.SVC_SECLOG_CLIENT.addMessage(
                result["OK"],
                sourceAddress[0],
                sourceAddress[1],
                identity,
                self._cfg.getHostname(),
                self._cfg.getPort(),
                self._name,
                "/".join(actionTuple),
            )
        return result

    def _instantiateHandler(self, trid, proposalTuple=None):
        """
        Generate an instance of the handler for a given service

        :param int trid: transport ID
        :param tuple proposalTuple: tuple describing the proposed action

        :return: S_OK/S_ERROR, Value is the handler object
        """
        # Generate the client params
        clientParams = {"serviceStartTime": self._startTime}
        if proposalTuple:
            # The 4th element is the client version
            clientParams["clientVersion"] = proposalTuple[3] if len(
                proposalTuple) > 3 else None
            clientParams["clientSetup"] = proposalTuple[0][1]
            if len(proposalTuple[0]) < 3:
                clientParams["clientVO"] = gConfig.getValue(
                    "/DIRAC/VirtualOrganization", "unknown")
            else:
                clientParams["clientVO"] = proposalTuple[0][2]
        clientTransport = self._transportPool.get(trid)
        if clientTransport:
            clientParams["clientAddress"] = clientTransport.getRemoteAddress()
        # Generate handler dict with per client info
        handlerInitDict = dict(self._serviceInfoDict)
        for key in clientParams:
            handlerInitDict[key] = clientParams[key]
        # Instantiate and initialize
        try:
            handlerInstance = self._handler["class"](handlerInitDict, trid)
            handlerInstance.initialize()
        except Exception as e:
            gLogger.exception("Server error while loading handler: %s" %
                              str(e))
            return S_ERROR("Server error while loading handler")
        return S_OK(handlerInstance)

    def _processProposal(self, trid, proposalTuple, handlerObj):
        # Notify the client we're ready to execute the action
        retVal = self._transportPool.send(trid, S_OK())
        if not retVal["OK"]:
            return retVal

        messageConnection = False
        if proposalTuple[1] == ("Connection", "new"):
            messageConnection = True

        if messageConnection:

            if self._msgBroker.getNumConnections() > self._cfg.getMaxMessagingConnections():
                result = S_ERROR("Maximum number of connections reached. Try later")
                result["closeTransport"] = True
                return result

            # This is a stable connection
            self._msgBroker.addTransportId(
                trid,
                self._name,
                receiveMessageCallback=self._mbReceivedMsg,
                disconnectCallback=self._mbDisconnect,
                listenToConnection=False,
            )

        result = self._executeAction(trid, proposalTuple, handlerObj)
        if result["OK"] and messageConnection:
            self._msgBroker.listenToTransport(trid)
            result = self._mbConnect(trid, handlerObj)
            if not result["OK"]:
                self._msgBroker.removeTransport(trid)

        result["closeTransport"] = not messageConnection or not result["OK"]
        return result

    def _mbConnect(self, trid, handlerObj=None):
        if not handlerObj:
            result = self._instantiateHandler(trid)
            if not result["OK"]:
                return result
            handlerObj = result["Value"]
        return handlerObj._rh_executeConnectionCallback("connected")

    def _executeAction(self, trid, proposalTuple, handlerObj):
        try:
            response = handlerObj._rh_executeAction(proposalTuple)
            if not response["OK"]:
                return response
            if self.activityMonitoring:
                self.activityMonitoringReporter.addRecord({
                    "timestamp": int(TimeUtilities.toEpoch()),
                    "Host": Network.getFQDN(),
                    "ServiceName": "_".join(self._name.split("/")),
                    "Location": self._cfg.getURL(),
                    "ResponseTime": response["Value"][1],
                })
            return response["Value"][0]
        except Exception as e:
            gLogger.exception("Exception while executing handler action")
            return S_ERROR("Server error while executing action: %s" % str(e))

    def _mbReceivedMsg(self, trid, msgObj):
        result = self._authorizeProposal(
            ("Message", msgObj.getName()), trid,
            self._transportPool.get(trid).getConnectingCredentials())
        if not result["OK"]:
            return result
        result = self._instantiateHandler(trid)
        if not result["OK"]:
            return result
        handlerObj = result["Value"]
        response = handlerObj._rh_executeMessageCallback(msgObj)
        if self.activityMonitoring and response["OK"]:
            self.activityMonitoringReporter.addRecord({
                "timestamp":
                int(TimeUtilities.toEpoch()),
                "Host":
                Network.getFQDN(),
                "ServiceName":
                "_".join(self._name.split("/")),
                "Location":
                self._cfg.getURL(),
                "ResponseTime":
                response["Value"][1],
            })
        if response["OK"]:
            return response["Value"][0]
        else:
            return response

    def _mbDisconnect(self, trid):
        result = self._instantiateHandler(trid)
        if not result["OK"]:
            return result
        handlerObj = result["Value"]
        return handlerObj._rh_executeConnectionCallback("drop")

    def __activityMonitoringReporting(self):
        """This method is called by the ThreadScheduler as a periodic task in order to commit the collected data which
        is done by the MonitoringReporter and is send to the 'ComponentMonitoring' type.

        :return: True / False
        """
        return self.activityMonitoringReporter.commit()

    def __startReportToMonitoring(self):
        now = time.time()
        stats = os.times()
        cpuTime = stats[0] + stats[2]
        mem = None
        # Refresh the (relatively expensive) memory statistics at most once every 10 s
        if now - self.__monitorLastStatsUpdate < 10:
            return (now, cpuTime, mem)
        self.__monitorLastStatsUpdate = now
        membytes = MemStat.VmB("VmRSS:")
        if membytes:
            mem = membytes / (1024.0 * 1024.0)
        return (now, cpuTime, mem)

    def __endReportToMonitoring(self, initialWallTime, initialCPUTime):
        wallTime = time.time() - initialWallTime
        stats = os.times()
        cpuTime = stats[0] + stats[2] - initialCPUTime
        # Guard against a zero wall-clock interval from coarse timers
        if not wallTime:
            return 0.0
        return cpuTime / wallTime * 100.0
Example #6
class RemoveReplica(DMSRequestOperationsBase):
    """
    .. class:: RemoveReplica

    """

    def __init__(self, operation=None, csPath=None):
        """c'tor

        :param self: self reference
        :param Operation operation: operation to execute
        :param str csPath: CS path for this handler
        """
        # # base class ctor
        DMSRequestOperationsBase.__init__(self, operation, csPath)

    def __call__(self):
        """remove replicas"""

        # The flag  'rmsMonitoring' is set by the RequestTask and is False by default.
        # Here we use 'createRMSRecord' to create the ES record which is defined inside OperationHandlerBase.
        if self.rmsMonitoring:
            self.rmsMonitoringReporter = MonitoringReporter(monitoringType="RMSMonitoring")
        else:
            # # gMonitor stuff
            gMonitor.registerActivity(
                "RemoveReplicaAtt", "Replica removals attempted", "RequestExecutingAgent", "Files/min", gMonitor.OP_SUM
            )
            gMonitor.registerActivity(
                "RemoveReplicaOK", "Successful replica removals", "RequestExecutingAgent", "Files/min", gMonitor.OP_SUM
            )
            gMonitor.registerActivity(
                "RemoveReplicaFail", "Failed replica removals", "RequestExecutingAgent", "Files/min", gMonitor.OP_SUM
            )

        # # prepare list of targetSEs
        targetSEs = self.operation.targetSEList
        # # check targetSEs for removal
        bannedTargets = self.checkSEsRSS(targetSEs, access="RemoveAccess")
        if not bannedTargets["OK"]:
            if self.rmsMonitoring:
                for status in ["Attempted", "Failed"]:
                    self.rmsMonitoringReporter.addRecord(self.createRMSRecord(status, len(self.operation)))
                self.rmsMonitoringReporter.commit()
            else:
                gMonitor.addMark("RemoveReplicaAtt")
                gMonitor.addMark("RemoveReplicaFail")
            return bannedTargets

        if bannedTargets["Value"]:
            return S_OK("%s targets are banned for removal" % ",".join(bannedTargets["Value"]))

        # # get waiting files
        waitingFiles = self.getWaitingFilesList()
        # # and prepare dict
        toRemoveDict = dict((opFile.LFN, opFile) for opFile in waitingFiles)

        self.log.info("Todo: %s replicas to delete from %s SEs" % (len(toRemoveDict), len(targetSEs)))

        if self.rmsMonitoring:
            self.rmsMonitoringReporter.addRecord(self.createRMSRecord("Attempted", len(toRemoveDict)))
        else:
            gMonitor.addMark("RemoveReplicaAtt", len(toRemoveDict) * len(targetSEs))

        # # keep status for each targetSE
        removalStatus = dict.fromkeys(toRemoveDict, None)
        for lfn in removalStatus:
            removalStatus[lfn] = dict.fromkeys(targetSEs, None)

        # # loop over targetSEs
        for targetSE in targetSEs:

            self.log.info("Removing replicas at %s" % targetSE)

            # # 1st step - bulk removal
            bulkRemoval = self._bulkRemoval(toRemoveDict, targetSE)
            if not bulkRemoval["OK"]:
                self.log.error("Bulk replica removal failed", bulkRemoval["Message"])

                if self.rmsMonitoring:
                    self.rmsMonitoringReporter.commit()

                return bulkRemoval

            # # report removal status for successful files
            if self.rmsMonitoring:
                self.rmsMonitoringReporter.addRecord(
                    self.createRMSRecord(
                        "Successful", len(([opFile for opFile in toRemoveDict.values() if not opFile.Error]))
                    )
                )
            else:
                gMonitor.addMark(
                    "RemoveReplicaOK", len([opFile for opFile in toRemoveDict.values() if not opFile.Error])
                )

            # # 2nd step - process the rest again
            toRetry = dict((lfn, opFile) for lfn, opFile in toRemoveDict.items() if opFile.Error)
            for lfn, opFile in toRetry.items():
                self._removeWithOwnerProxy(opFile, targetSE)
                if opFile.Error:
                    if self.rmsMonitoring:
                        self.rmsMonitoringReporter.addRecord(self.createRMSRecord("Failed", 1))
                    else:
                        gMonitor.addMark("RemoveReplicaFail", 1)
                    removalStatus[lfn][targetSE] = opFile.Error
                else:
                    if self.rmsMonitoring:
                        self.rmsMonitoringReporter.addRecord(self.createRMSRecord("Successful", 1))
                    else:
                        gMonitor.addMark("RemoveReplicaOK", 1)

        # # update file status for waiting files
        failed = 0
        for opFile in self.operation:
            if opFile.Status == "Waiting":
                errors = list(set(error for error in removalStatus[opFile.LFN].values() if error))
                if errors:
                    opFile.Error = "\n".join(errors)
                    # This seems to be the only unrecoverable error
                    if "Write access not permitted for this credential" in opFile.Error:
                        failed += 1
                        opFile.Status = "Failed"
                else:
                    opFile.Status = "Done"

        if failed:
            self.operation.Error = "failed to remove %s replicas" % failed

        if self.rmsMonitoring:
            self.rmsMonitoringReporter.commit()

        return S_OK()

    def _bulkRemoval(self, toRemoveDict, targetSE):
        """remove replicas :toRemoveDict: at :targetSE:

        :param dict toRemoveDict: { lfn: opFile, ... }
        :param str targetSE: target SE name
        """
        # Clear the error
        for opFile in toRemoveDict.values():
            opFile.Error = ""
        removeReplicas = self.dm.removeReplica(targetSE, list(toRemoveDict))
        if not removeReplicas["OK"]:
            for opFile in toRemoveDict.values():
                opFile.Error = removeReplicas["Message"]
            return removeReplicas
        removeReplicas = removeReplicas["Value"]
        # # filter out failed
        for lfn, opFile in toRemoveDict.items():
            if lfn in removeReplicas["Failed"]:
                errorReason = str(removeReplicas["Failed"][lfn])
                # If the reason is that the file does not exist,
                # we consider the removal successful
                # TODO: use cmpError once the FC returns the proper error msg corresponding to ENOENT
                if "No such file" not in errorReason:
                    opFile.Error = errorReason
                    self.log.error("Failed removing lfn", "%s:%s" % (lfn, opFile.Error))

        return S_OK()

    def _removeWithOwnerProxy(self, opFile, targetSE):
        """remove opFile replica from targetSE using owner proxy

        :param File opFile: File instance
        :param str targetSE: target SE name
        """
        if "Write access not permitted for this credential" in opFile.Error:
            proxyFile = None
            if "DataManager" in self.shifter:
                # #  you're a data manager - save current proxy and get a new one for LFN and retry
                saveProxy = os.environ["X509_USER_PROXY"]
                try:
                    fileProxy = self.getProxyForLFN(opFile.LFN)
                    if not fileProxy["OK"]:
                        opFile.Error = fileProxy["Message"]
                    else:
                        proxyFile = fileProxy["Value"]
                        removeReplica = self.dm.removeReplica(targetSE, opFile.LFN)
                        if not removeReplica["OK"]:
                            opFile.Error = removeReplica["Message"]
                        else:
                            # Set or reset the error if all OK
                            opFile.Error = removeReplica["Value"]["Failed"].get(opFile.LFN, "")
                finally:
                    if proxyFile:
                        os.unlink(proxyFile)
                    # # put back request owner proxy to env
                    os.environ["X509_USER_PROXY"] = saveProxy
Example #7
class StatesMonitoringAgent(AgentModule):
    """
  """

    __summaryKeyFieldsMapping = [
        'Status', 'Site', 'User', 'UserGroup', 'JobGroup', 'JobType',
        'ApplicationStatus', 'MinorStatus'
    ]
    __summaryDefinedFields = [('ApplicationStatus', 'unset'),
                              ('MinorStatus', 'unset')]
    __summaryValueFieldsMapping = ['Jobs', 'Reschedules']
    __renameFieldsMapping = {'JobType': 'JobSplitType'}

    __jobDBFields = []

    jobDB = None
    monitoringReporter = None

    def initialize(self):
        """ Standard initialization
    """

        self.jobDB = JobDB()

        self.am_setOption("PollingTime", 900)
        self.messageQueue = self.am_getOption('MessageQueue',
                                              'dirac.wmshistory')

        self.monitoringReporter = MonitoringReporter(
            monitoringType="WMSHistory", failoverQueueName=self.messageQueue)

        for field in self.__summaryKeyFieldsMapping:
            if field == 'User':
                field = 'Owner'
            elif field == 'UserGroup':
                field = 'OwnerGroup'
            self.__jobDBFields.append(field)

        return S_OK()

    def execute(self):
        """ Main execution method
    """
        # Get the WMS Snapshot!
        result = self.jobDB.getSummarySnapshot(self.__jobDBFields)
        now = Time.dateTime()
        if not result['OK']:
            self.log.error(
                "Can't get the JobDB summary",
                "%s: won't commit at this cycle" % result['Message'])
        else:
            values = result['Value'][1]

            self.log.info("Start sending records")
            for record in values:
                record = record[1:]
                rD = {}
                for fV in self.__summaryDefinedFields:
                    rD[fV[0]] = fV[1]
                for iP in range(len(self.__summaryKeyFieldsMapping)):
                    fieldName = self.__summaryKeyFieldsMapping[iP]
                    rD[self.__renameFieldsMapping.get(fieldName,
                                                      fieldName)] = record[iP]
                record = record[len(self.__summaryKeyFieldsMapping):]
                for iP in range(len(self.__summaryValueFieldsMapping)):
                    rD[self.__summaryValueFieldsMapping[iP]] = int(record[iP])
                rD['timestamp'] = int(Time.toEpoch(now))
                self.monitoringReporter.addRecord(rD)

            retVal = self.monitoringReporter.commit()
            if retVal['OK']:
                self.log.info("Records sent", "(%s)" % retVal['Value'])
            else:
                self.log.error(
                    "Failed to insert the records, it will be retried in the next iteration",
                    retVal['Message'])

        return S_OK()
Example #8
class RemoveFile(DMSRequestOperationsBase):
    """
    .. class:: RemoveFile

    remove file operation handler
    """
    def __init__(self, operation=None, csPath=None):
        """c'tor

        :param self: self reference
        :param Operation operation: Operation to execute
        :param str csPath: CS path for this handler
        """
        # # call base class ctor
        DMSRequestOperationsBase.__init__(self, operation, csPath)

        # # re pattern for not existing files
        self.reNotExisting = re.compile(r"(no|not) such file.*", re.IGNORECASE)

    def __call__(self):
        """action for 'removeFile' operation"""

        # The flag  'rmsMonitoring' is set by the RequestTask and is False by default.
        # Here we use 'createRMSRecord' to create the ES record which is defined inside OperationHandlerBase.
        if self.rmsMonitoring:
            self.rmsMonitoringReporter = MonitoringReporter(
                monitoringType="RMSMonitoring")

        # # get waiting files
        waitingFiles = self.getWaitingFilesList()
        fc = FileCatalog(self.operation.catalogList)

        res = fc.getReplicas([wf.LFN for wf in waitingFiles])
        if not res["OK"]:
            if self.rmsMonitoring:
                for status in ["Attempted", "Failed"]:
                    self.rmsMonitoringReporter.addRecord(
                        self.createRMSRecord(status, len(waitingFiles)))
                self.rmsMonitoringReporter.commit()
            return res

        # We check the SE status only for the LFNs that were resolved successfully;
        # there is nothing sensible to do with the failed ones at this point
        replicas = res["Value"]["Successful"]
        targetSEs = set([se for lfn in replicas for se in replicas[lfn]])

        if targetSEs:
            # Check whether the SEs allow removal, but do not fail the operation yet
            # if some SEs are permanently ("always") banned
            bannedTargets = self.checkSEsRSS(targetSEs,
                                             access="RemoveAccess",
                                             failIfBanned=False)
            if not bannedTargets["OK"]:
                if self.rmsMonitoring:
                    for status in ["Attempted", "Failed"]:
                        self.rmsMonitoringReporter.addRecord(
                            self.createRMSRecord(status, len(replicas)))
                    self.rmsMonitoringReporter.commit()
                return bannedTargets
            bannedTargets = set(bannedTargets["Value"])
        else:
            bannedTargets = set()

        # # prepare waiting file dict
        # # keep only files with no replica at any banned SE; files with no catalog
        # # replica at all have nothing blocking their removal, so they are kept too
        toRemoveDict = dict(
            ((opFile.LFN, opFile) for opFile in waitingFiles
             if not bannedTargets
             or not bannedTargets.intersection(replicas.get(opFile.LFN, []))))
        # If some SEs are always banned, set Failed the files that cannot be removed
        if bannedTargets and "always banned" in self.operation.Error:
            for opFile in waitingFiles:
                if opFile.LFN not in toRemoveDict:
                    # Set the files that cannot be removed Failed
                    opFile.Error = self.operation.Error
                    opFile.Status = "Failed"

            if self.rmsMonitoring:
                self.rmsMonitoringReporter.addRecord(
                    self.createRMSRecord("Failed",
                                         len(waitingFiles) -
                                         len(toRemoveDict)))
                self.rmsMonitoringReporter.commit()

            if not toRemoveDict:
                # If there are no files that can be removed, exit, else try once to remove them anyway
                return S_OK("%s targets are always banned for removal" %
                            ",".join(sorted(bannedTargets)))

        if toRemoveDict:
            if self.rmsMonitoring:
                self.rmsMonitoringReporter.addRecord(
                    self.createRMSRecord("Attempted", len(toRemoveDict)))
            # # 1st step - bulk removal
            self.log.debug("bulk removal of %s files" % len(toRemoveDict))
            bulkRemoval = self.bulkRemoval(toRemoveDict)
            if not bulkRemoval["OK"]:
                self.log.error("Bulk file removal failed",
                               bulkRemoval["Message"])
            else:
                if self.rmsMonitoring:
                    self.rmsMonitoringReporter.addRecord(
                        self.createRMSRecord(
                            "Successful",
                            len(toRemoveDict) - len(bulkRemoval["Value"])))

            # # 2nd step - single file removal
            for lfn, opFile in toRemoveDict.items():
                self.log.info("removing single file %s" % lfn)
                singleRemoval = self.singleRemoval(opFile)
                if not singleRemoval["OK"]:
                    self.log.error("Error removing single file",
                                   singleRemoval["Message"])
                    if self.rmsMonitoring:
                        self.rmsMonitoringReporter.addRecord(
                            self.createRMSRecord("Failed", 1))
                else:
                    self.log.info("file %s has been removed" % lfn)
                    if self.rmsMonitoring:
                        self.rmsMonitoringReporter.addRecord(
                            self.createRMSRecord("Successful", 1))

            # # collect the files that could not be removed
            failedFiles = [(lfn, opFile)
                           for (lfn, opFile) in toRemoveDict.items()
                           if opFile.Status in ("Failed", "Waiting")]
            if failedFiles:
                self.operation.Error = "failed to remove %d files" % len(
                    failedFiles)

        if self.rmsMonitoring:
            self.rmsMonitoringReporter.commit()

        if bannedTargets:
            return S_OK("%s targets are banned for removal" %
                        ",".join(sorted(bannedTargets)))
        return S_OK()

    def bulkRemoval(self, toRemoveDict):
        """bulk removal using request owner DN

        :param dict toRemoveDict: { lfn: opFile, ... }
        :return: S_ERROR or S_OK( { lfn: opFile, ... } ) -- dict with files still waiting to be removed
        """
        bulkRemoval = self.dm.removeFile(list(toRemoveDict), force=True)
        if not bulkRemoval["OK"]:
            error = bulkRemoval["Message"]
            self.log.error("Bulk file removal failed", error)
            self.operation.Error = error
            for opFile in self.operation:
                opFile.Error = error
            return bulkRemoval
        bulkRemoval = bulkRemoval["Value"]
        # # filter results
        for lfn, opFile in toRemoveDict.items():
            if lfn in bulkRemoval["Successful"]:
                opFile.Status = "Done"
            elif lfn in bulkRemoval["Failed"]:

                error = bulkRemoval["Failed"][lfn]
                if isinstance(error, dict):
                    error = ";".join(
                        ["%s-%s" % (k, v) for k, v in error.items()])
                opFile.Error = error
                if self.reNotExisting.search(opFile.Error):
                    opFile.Status = "Done"

        # # return files still waiting
        toRemoveDict = dict((lfn, opFile)
                            for lfn, opFile in toRemoveDict.items()
                            if opFile.Status == "Waiting")
        return S_OK(toRemoveDict)

    def singleRemoval(self, opFile):
        """remove single file

        :param opFile: File instance
        """
        # # try to remove with owner proxy
        proxyFile = None
        if "Write access not permitted for this credential" in opFile.Error:
            if "DataManager" in self.shifter:
                # #  you're a data manager - get proxy for LFN and retry
                saveProxy = os.environ["X509_USER_PROXY"]
                try:
                    fileProxy = self.getProxyForLFN(opFile.LFN)
                    if not fileProxy["OK"]:
                        opFile.Error = "Error getting owner's proxy : %s" % fileProxy[
                            "Message"]
                    else:
                        proxyFile = fileProxy["Value"]
                        self.log.info(
                            "Trying to remove file with owner's proxy (file %s)"
                            % proxyFile)

                        removeFile = self.dm.removeFile(opFile.LFN, force=True)
                        self.log.always(str(removeFile))

                        if not removeFile["OK"]:
                            opFile.Error = str(removeFile["Message"])
                            if self.reNotExisting.search(
                                    str(removeFile["Message"]).lower()):
                                opFile.Status = "Done"
                        else:
                            removeFile = removeFile["Value"]
                            if opFile.LFN in removeFile["Failed"]:
                                error = removeFile["Failed"][opFile.LFN]
                                if isinstance(error, dict):
                                    error = ";".join([
                                        "%s-%s" % (k, v)
                                        for k, v in error.items()
                                    ])
                                if self.reNotExisting.search(error):
                                    # This should never happen due to the "force" flag
                                    opFile.Status = "Done"
                                else:
                                    opFile.Error = error
                            else:
                                opFile.Status = "Done"
                finally:
                    if proxyFile:
                        os.unlink(proxyFile)
                    # # put back request owner proxy to env
                    os.environ["X509_USER_PROXY"] = saveProxy
        # # file removed? update its status to 'Done'
        if opFile.Status == "Done":
            return S_OK()
        return S_ERROR(opFile.Error)
Example #9
class MoveReplica(DMSRequestOperationsBase):
    """
  .. class:: MoveReplica

  MoveReplica operation handler
  """
    def __init__(self, operation=None, csPath=None):
        """c'tor

    :param self: self reference
    :param ~Operation.Operation operation: Operation instance
    :param str csPath: CS path for this handler
    """
        super(MoveReplica, self).__init__(operation, csPath)

        # Init ConsistencyInspector: used to check replicas
        self.ci = ConsistencyInspector()

    def __call__(self):
        """ call me maybe """

        # The flag  'rmsMonitoring' is set by the RequestTask and is False by default.
        # Here we use 'createRMSRecord' to create the ES record which is defined inside OperationHandlerBase.
        if self.rmsMonitoring:
            self.rmsMonitoringReporter = MonitoringReporter(
                monitoringType="RMSMonitoring")
        else:
            # # own gMonitor stuff for files
            gMonitor.registerActivity("ReplicateAndRegisterAtt",
                                      "Replicate and register attempted",
                                      "RequestExecutingAgent", "Files/min",
                                      gMonitor.OP_SUM)
            gMonitor.registerActivity("ReplicateOK", "Replications successful",
                                      "RequestExecutingAgent", "Files/min",
                                      gMonitor.OP_SUM)
            gMonitor.registerActivity("ReplicateFail", "Replications failed",
                                      "RequestExecutingAgent", "Files/min",
                                      gMonitor.OP_SUM)
            gMonitor.registerActivity("RegisterOK", "Registrations successful",
                                      "RequestExecutingAgent", "Files/min",
                                      gMonitor.OP_SUM)
            gMonitor.registerActivity("RegisterFail", "Registrations failed",
                                      "RequestExecutingAgent", "Files/min",
                                      gMonitor.OP_SUM)
            gMonitor.registerActivity("RemoveReplicaAtt",
                                      "Replica removals attempted",
                                      "RequestExecutingAgent", "Files/min",
                                      gMonitor.OP_SUM)
            gMonitor.registerActivity("RemoveReplicaOK",
                                      "Successful replica removals",
                                      "RequestExecutingAgent", "Files/min",
                                      gMonitor.OP_SUM)
            gMonitor.registerActivity("RemoveReplicaFail",
                                      "Failed replica removals",
                                      "RequestExecutingAgent", "Files/min",
                                      gMonitor.OP_SUM)

        # # check replicas first
        res = self.__checkReplicas()
        if not res["OK"]:
            self.log.error('Failed to check replicas', res["Message"])

        sourceSE = self.operation.SourceSE if self.operation.SourceSE else None
        if sourceSE:
            # # check source se for read
            bannedSource = self.checkSEsRSS(sourceSE, 'ReadAccess')
            if not bannedSource["OK"]:
                if self.rmsMonitoring:
                    for status in ["Attempted", "Failed"]:
                        self.rmsMonitoringReporter.addRecord(
                            self.createRMSRecord(status, len(self.operation)))
                    self.rmsMonitoringReporter.commit()
                else:
                    gMonitor.addMark("ReplicateAndRegisterAtt",
                                     len(self.operation))
                    gMonitor.addMark("ReplicateFail", len(self.operation))
                return bannedSource

            if bannedSource["Value"]:
                self.operation.Error = "SourceSE %s is banned for reading" % sourceSE
                self.log.info(self.operation.Error)
                return S_OK(self.operation.Error)

        # # check targetSEs for write
        bannedTargets = self.checkSEsRSS()
        if not bannedTargets['OK']:
            if self.rmsMonitoring:
                for status in ["Attempted", "Failed"]:
                    self.rmsMonitoringReporter.addRecord(
                        self.createRMSRecord(status, len(self.operation)))
                self.rmsMonitoringReporter.commit()
            else:
                gMonitor.addMark("ReplicateAndRegisterAtt",
                                 len(self.operation))
                gMonitor.addMark("ReplicateFail", len(self.operation))
            return bannedTargets

        if bannedTargets['Value']:
            self.operation.Error = "%s targets are banned for writing" % ",".join(
                bannedTargets['Value'])
            return S_OK(self.operation.Error)

        # Can continue now
        self.log.verbose("No targets banned for writing")

        # # check sourceSEs for removal
        # # for removal the targetSEs are the sourceSEs of the replication
        targetSEs = self.operation.sourceSEList
        bannedTargets = self.checkSEsRSS(targetSEs, access='RemoveAccess')
        if not bannedTargets['OK']:
            if self.rmsMonitoring:
                for status in ["Attempted", "Failed"]:
                    self.rmsMonitoringReporter.addRecord(
                        self.createRMSRecord(status, len(self.operation)))
                self.rmsMonitoringReporter.commit()
            else:
                gMonitor.addMark("RemoveReplicaAtt")
                gMonitor.addMark("RemoveReplicaFail")
            return bannedTargets

        if bannedTargets['Value']:
            return S_OK("%s targets are banned for removal" %
                        ",".join(bannedTargets['Value']))

        # Can continue now
        self.log.verbose("No targets banned for removal")

        # Do the transfer
        # # get waiting files. If none just return
        waitingFiles = self.getWaitingFilesList()
        if not waitingFiles:
            return S_OK()

        # # loop over files
        self.log.info("Transferring files using Data manager...")
        for opFile in waitingFiles:
            res = self.dmTransfer(opFile)
            if not res["OK"]:
                continue
            else:
                # Do the replica removal
                self.log.info("Removing files using Data manager...")
                toRemoveDict = dict([(opFile.LFN, opFile)
                                     for opFile in waitingFiles])
                self.log.info("todo: %s replicas to delete from %s sites" %
                              (len(toRemoveDict), len(targetSEs)))
                self.dmRemoval(toRemoveDict, targetSEs)

        return S_OK()

    def __checkReplicas(self):
        """ check done replicas and update file states  """
        waitingFiles = dict([(opFile.LFN, opFile) for opFile in self.operation
                             if opFile.Status in ("Waiting", "Scheduled")])
        targetSESet = set(self.operation.targetSEList)

        # Check replicas
        res = self.ci._getCatalogReplicas(list(waitingFiles))

        if not res["OK"]:
            self.log.error('Failed to get catalog replicas', res["Message"])
            return S_ERROR()

        allReplicas = res['Value'][0]

        replicas = self.ci.compareChecksum(list(waitingFiles))

        if not replicas["OK"]:
            self.log.error('Failed to check replicas', replicas["Message"])
            return S_ERROR()

        replicas = replicas["Value"]
        noReplicas = replicas['NoReplicas']

        if noReplicas:
            if self.rmsMonitoring:
                self.rmsMonitoringReporter.addRecord(
                    self.createRMSRecord("Failed", len(noReplicas)))
                self.rmsMonitoringReporter.commit()
            for lfn in noReplicas.keys():
                self.log.error("File %s doesn't exist" % lfn)
                if not self.rmsMonitoring:
                    gMonitor.addMark("ReplicateFail", len(targetSESet))
                waitingFiles[lfn].Status = "Failed"

        for lfn, reps in allReplicas.items():
            if targetSESet.issubset(set(reps)):
                self.log.info("file %s has been replicated to all targets" %
                              lfn)
                waitingFiles[lfn].Status = "Done"

        return S_OK()

    def dmRemoval(self, toRemoveDict, targetSEs):

        if self.rmsMonitoring:
            self.rmsMonitoringReporter.addRecord(
                self.createRMSRecord("Attempted", len(toRemoveDict)))
            self.rmsMonitoringReporter.commit()
        else:
            gMonitor.addMark("RemoveReplicaAtt",
                             len(toRemoveDict) * len(targetSEs))
        # # keep status for each targetSE
        removalStatus = dict.fromkeys(toRemoveDict.keys(), None)
        for lfn in removalStatus:
            removalStatus[lfn] = dict.fromkeys(targetSEs, None)

        # # loop over targetSEs
        for targetSE in targetSEs:
            self.log.info("removing replicas at %s" % targetSE)

            # # 1st step - bulk removal
            bulkRemoval = self.bulkRemoval(toRemoveDict, targetSE)
            if not bulkRemoval["OK"]:
                self.log.error('Bulk replica removal failed',
                               bulkRemoval["Message"])
                return bulkRemoval
            bulkRemoval = bulkRemoval["Value"]

            # # update removal status for successful files
            removalOK = [
                opFile for opFile in bulkRemoval.values() if not opFile.Error
            ]

            for opFile in removalOK:
                removalStatus[opFile.LFN][targetSE] = ""

            if self.rmsMonitoring:
                self.rmsMonitoringReporter.addRecord(
                    self.createRMSRecord("Successful", len(removalOK)))
            else:
                gMonitor.addMark("RemoveReplicaOK", len(removalOK))

            # # 2nd step - process the rest again
            toRetry = dict([(lfn, opFile)
                            for lfn, opFile in bulkRemoval.items()
                            if opFile.Error])
            for lfn, opFile in toRetry.items():
                self.singleRemoval(opFile, targetSE)
                if not opFile.Error:
                    if self.rmsMonitoring:
                        self.rmsMonitoringReporter.addRecord(
                            self.createRMSRecord("Successful", 1))
                    else:
                        gMonitor.addMark("RemoveReplicaOK", 1)
                    removalStatus[lfn][targetSE] = ""
                else:
                    if self.rmsMonitoring:
                        self.rmsMonitoringReporter.addRecord(
                            self.createRMSRecord("Failed", 1))
                    else:
                        gMonitor.addMark("RemoveReplicaFail", 1)
                    removalStatus[lfn][targetSE] = opFile.Error

        # # update file status for waiting files
        failed = 0
        for opFile in self.operation:
            if opFile.Status == "Waiting":
                errors = list(
                    set(error
                        for error in removalStatus.get(opFile.LFN, {}).values()
                        if error))
                if errors:
                    opFile.Error = ",".join(errors)
                    # This seems to be the only offending error
                    if "Write access not permitted for this credential" in opFile.Error:
                        failed += 1
                        continue
                opFile.Status = "Done"

        if failed:
            self.operation.Error = "failed to remove %s replicas" % failed

        if self.rmsMonitoring:
            self.rmsMonitoringReporter.commit()

        return S_OK(removalStatus)

    def dmTransfer(self, opFile):
        """ replicate and register using dataManager  """
        # # get waiting files. If none just return
        # # source SE
        sourceSE = self.operation.SourceSE if self.operation.SourceSE else None

        if self.rmsMonitoring:
            self.rmsMonitoringReporter.addRecord(
                self.createRMSRecord("Attempted", 1))
            self.rmsMonitoringReporter.commit()
        else:
            gMonitor.addMark("ReplicateAndRegisterAtt", 1)

        opFile.Error = ''
        lfn = opFile.LFN

        # Check replicas
        res = self.ci._getCatalogReplicas([lfn])

        if not res["OK"]:
            self.log.error('Failed to get catalog replicas', res["Message"])
            return S_ERROR()

        allReplicas = res['Value'][0]
        replicas = self.ci.compareChecksum([lfn])

        if not replicas["OK"]:
            self.log.error('Failed to check replicas', replicas["Message"])
            return S_ERROR()

        replicas = replicas["Value"]

        validReplicas = []
        noReplicas = replicas['NoReplicas']
        missingAllReplicas = replicas["MissingAllReplicas"]
        missingReplica = replicas["MissingReplica"]
        someReplicasCorrupted = replicas["SomeReplicasCorrupted"]
        allReplicasCorrupted = replicas["AllReplicasCorrupted"]

        if noReplicas:
            self.log.error("Unable to replicate",
                           "File %s doesn't exist" % (lfn))
            opFile.Error = 'No replicas found'
            opFile.Status = 'Failed'
        elif missingAllReplicas:
            self.log.error("Unable to replicate",
                           "%s, all replicas are missing" % (lfn))
            opFile.Error = 'Missing all replicas'
            opFile.Status = 'Failed'
        elif allReplicasCorrupted:
            self.log.error("Unable to replicate",
                           "%s, all replicas are corrupted" % (lfn))
            opFile.Error = 'All replicas corrupted'
            opFile.Status = 'Failed'
        elif someReplicasCorrupted:
            self.log.error(
                "Unable to replicate", "%s, replicas corrupted at %s" %
                (lfn, someReplicasCorrupted[lfn]))
            opFile.Error = 'At least one replica corrupted'
            opFile.Status = 'Failed'
        elif missingReplica:
            self.log.error(
                "Unable to replicate",
                "%s, missing replicas at %s" % (lfn, missingReplica[lfn]))
            opFile.Error = 'At least one missing replica'
            opFile.Status = 'Failed'

        if opFile.Error:
            if self.rmsMonitoring:
                self.rmsMonitoringReporter.addRecord(
                    self.createRMSRecord("Failed", 1))
                self.rmsMonitoringReporter.commit()
            else:
                gMonitor.addMark("ReplicateFail")
            return S_ERROR()

        # Check if replica is at the specified source
        for repSEName in allReplicas[lfn]:
            validReplicas.append(repSEName)

        # # get the first one in the list
        if sourceSE not in validReplicas:
            if sourceSE:
                self.log.warn(
                    "%s is not at specified sourceSE %s, changed to %s" %
                    (lfn, sourceSE, validReplicas[0]))
            sourceSE = validReplicas[0]

        # # loop over targetSE
        catalogs = self.operation.Catalog
        if catalogs:
            catalogs = [cat.strip() for cat in catalogs.split(',')]

        for targetSE in self.operation.targetSEList:
            # # call DataManager
            if targetSE in validReplicas:
                self.log.warn(
                    "Request to replicate %s to an existing location: %s" %
                    (lfn, targetSE))
                continue

            res = self.dm.replicateAndRegister(lfn,
                                               targetSE,
                                               sourceSE=sourceSE,
                                               catalog=catalogs)

            if res["OK"]:
                if lfn in res["Value"]["Successful"]:
                    if "replicate" in res["Value"]["Successful"][lfn]:
                        repTime = res["Value"]["Successful"][lfn]["replicate"]
                        prString = "file %s replicated at %s in %s s." % (
                            lfn, targetSE, repTime)

                        if not self.rmsMonitoring:
                            gMonitor.addMark("ReplicateOK", 1)

                        if "register" in res["Value"]["Successful"][lfn]:

                            if not self.rmsMonitoring:
                                gMonitor.addMark("RegisterOK", 1)

                            regTime = res["Value"]["Successful"][lfn][
                                "register"]
                            prString += ' and registered in %s s.' % regTime
                            self.log.info(prString)
                        else:

                            if not self.rmsMonitoring:
                                gMonitor.addMark("RegisterFail", 1)

                            prString += " but failed to register"
                            self.log.warn(prString)

                            opFile.Error = "Failed to register"
                            # # add register replica operation
                            registerOperation = self.getRegisterOperation(
                                opFile, targetSE, type='RegisterReplica')
                            self.request.insertAfter(registerOperation,
                                                     self.operation)
                    else:
                        self.log.error("Failed to replicate",
                                       "%s to %s" % (lfn, targetSE))

                        if not self.rmsMonitoring:
                            gMonitor.addMark("ReplicateFail", 1)

                        opFile.Error = "Failed to replicate"
                else:

                    if not self.rmsMonitoring:
                        gMonitor.addMark("ReplicateFail", 1)

                    reason = res["Value"]["Failed"][lfn]
                    self.log.error(
                        "Failed to replicate and register",
                        "File %s at %s: %s" % (lfn, targetSE, reason))
                    opFile.Error = reason
            else:

                if not self.rmsMonitoring:
                    gMonitor.addMark("ReplicateFail", 1)

                opFile.Error = "DataManager error: %s" % res["Message"]
                self.log.error("DataManager error", res["Message"])

        if not opFile.Error:
            if self.rmsMonitoring:
                self.rmsMonitoringReporter.addRecord(
                    self.createRMSRecord("Successful", 1))
            if len(self.operation.targetSEList) > 1:
                self.log.info("file %s has been replicated to all targetSEs" %
                              lfn)
        else:
            if self.rmsMonitoring:
                self.rmsMonitoringReporter.addRecord(
                    self.createRMSRecord("Failed", 1))
                self.rmsMonitoringReporter.commit()
            return S_ERROR("dmTransfer failed")

        if self.rmsMonitoring:
            self.rmsMonitoringReporter.commit()

        return S_OK()

    def bulkRemoval(self, toRemoveDict, targetSE):
        """ remove replicas :toRemoveDict: at :targetSE:

    :param dict toRemoveDict: { lfn: opFile, ... }
    :param str targetSE: target SE name
    :return: toRemoveDict with updated errors
    """
        removeReplicas = self.dm.removeReplica(targetSE, list(toRemoveDict))

        if not removeReplicas["OK"]:
            for opFile in toRemoveDict.values():
                opFile.Error = removeReplicas["Message"]
            return S_ERROR(removeReplicas["Message"])
        removeReplicas = removeReplicas["Value"]
        # # filter out failed
        for lfn, opFile in toRemoveDict.items():
            if lfn in removeReplicas["Failed"]:
                opFile.Error = str(removeReplicas["Failed"][lfn])
        return S_OK(toRemoveDict)

    def singleRemoval(self, opFile, targetSE):
        """ remove opFile replica from targetSE

    :param ~DIRAC.RequestManagementSystem.Client.File.File opFile: File instance
    :param str targetSE: target SE name
    """
        proxyFile = None
        if "Write access not permitted for this credential" in opFile.Error:
            # # only a DataManager shifter may retry with the file owner's proxy
            if "DataManager" in self.shifter:
                # #  you're a data manager - save current proxy and get a new one for LFN and retry
                saveProxy = os.environ["X509_USER_PROXY"]
                try:
                    fileProxy = self.getProxyForLFN(opFile.LFN)
                    if not fileProxy["OK"]:
                        opFile.Error = fileProxy["Message"]
                    else:
                        proxyFile = fileProxy["Value"]
                        removeReplica = self.dm.removeReplica(
                            targetSE, opFile.LFN)
                        if not removeReplica["OK"]:
                            opFile.Error = removeReplica["Message"]
                        else:
                            removeReplica = removeReplica["Value"]
                            if opFile.LFN in removeReplica["Failed"]:
                                opFile.Error = removeReplica["Failed"][
                                    opFile.LFN]
                            else:
                                # # reset error - replica has been removed this time
                                opFile.Error = ""
                finally:
                    if proxyFile:
                        os.unlink(proxyFile)
                    # # put back request owner proxy to env
                    os.environ["X509_USER_PROXY"] = saveProxy
        return S_OK(opFile)
Example #10
class PutAndRegister(DMSRequestOperationsBase):
    """
    .. class:: PutAndRegister

    PutAndRegister operation handler.

    This takes a local file, puts it on a StorageElement, and then registers it in a catalog

    """
    def __init__(self, operation=None, csPath=None):
        """c'tor

        :param self: self reference
        :param Operation operation: Operation instance
        :param str csPath: CS path for this handler
        """
        # # base classes ctor
        super(PutAndRegister, self).__init__(operation, csPath)

        self.dm = DataManager()

    def __call__(self):
        """PutAndRegister operation processing"""

        # The flag  'rmsMonitoring' is set by the RequestTask and is False by default.
        # Here we use 'createRMSRecord' to create the ES record which is defined inside OperationHandlerBase.
        if self.rmsMonitoring:
            self.rmsMonitoringReporter = MonitoringReporter(
                monitoringType="RMSMonitoring")
        else:
            # # gMonitor stuff
            gMonitor.registerActivity("PutAtt", "File put attempts",
                                      "RequestExecutingAgent", "Files/min",
                                      gMonitor.OP_SUM)
            gMonitor.registerActivity("PutFail", "Failed file puts",
                                      "RequestExecutingAgent", "Files/min",
                                      gMonitor.OP_SUM)
            gMonitor.registerActivity("PutOK", "Successful file puts",
                                      "RequestExecutingAgent", "Files/min",
                                      gMonitor.OP_SUM)
            gMonitor.registerActivity("RegisterOK",
                                      "Successful file registrations",
                                      "RequestExecutingAgent", "Files/min",
                                      gMonitor.OP_SUM)
            gMonitor.registerActivity("RegisterFail",
                                      "Failed file registrations",
                                      "RequestExecutingAgent", "Files/min",
                                      gMonitor.OP_SUM)

        # # list of targetSEs
        targetSEs = self.operation.targetSEList

        if len(targetSEs) != 1:
            self.log.error(
                "Wrong value for TargetSE list, should contain only one target!",
                "%s" % targetSEs)
            self.operation.Error = "Wrong parameters: TargetSE should contain only one targetSE"
            for opFile in self.operation:

                opFile.Status = "Failed"
                opFile.Error = "Wrong parameters: TargetSE should contain only one targetSE"

                if not self.rmsMonitoring:
                    gMonitor.addMark("PutAtt", 1)
                    gMonitor.addMark("PutFail", 1)

            if self.rmsMonitoring:
                for status in ["Attempted", "Failed"]:
                    self.rmsMonitoringReporter.addRecord(
                        self.createRMSRecord(status, len(self.operation)))
                    self.rmsMonitoringReporter.commit()

            return S_ERROR("TargetSE should contain only one target, got %s" %
                           targetSEs)

        targetSE = targetSEs[0]
        bannedTargets = self.checkSEsRSS(targetSE)
        if not bannedTargets["OK"]:
            if self.rmsMonitoring:
                for status in ["Attempted", "Failed"]:
                    self.rmsMonitoringReporter.addRecord(
                        self.createRMSRecord(status, len(self.operation)))
                self.rmsMonitoringReporter.commit()
            else:
                gMonitor.addMark("PutAtt")
                gMonitor.addMark("PutFail")
            return bannedTargets

        if bannedTargets["Value"]:
            return S_OK("%s targets are banned for writing" %
                        ",".join(bannedTargets["Value"]))

        # # get waiting files
        waitingFiles = self.getWaitingFilesList()

        if self.rmsMonitoring:
            self.rmsMonitoringReporter.addRecord(
                self.createRMSRecord("Attempted", len(waitingFiles)))

        # # loop over files
        for opFile in waitingFiles:
            # # get LFN
            lfn = opFile.LFN
            self.log.info("processing file %s" % lfn)

            if not self.rmsMonitoring:
                gMonitor.addMark("PutAtt", 1)

            pfn = opFile.PFN
            guid = opFile.GUID
            checksum = opFile.Checksum

            # # call DataManager passing a list of requested catalogs
            catalogs = self.operation.Catalog
            if catalogs:
                catalogs = [cat.strip() for cat in catalogs.split(",")]
            putAndRegister = DataManager(catalogs=catalogs).putAndRegister(
                lfn, pfn, targetSE, guid=guid, checksum=checksum)
            if not putAndRegister["OK"]:
                if self.rmsMonitoring:
                    self.rmsMonitoringReporter.addRecord(
                        self.createRMSRecord("Failed", 1))
                else:
                    gMonitor.addMark("PutFail", 1)
                #         self.dataLoggingClient().addFileRecord( lfn, "PutFail", targetSE, "", "PutAndRegister" )
                self.log.error("Completely failed to put and register file",
                               putAndRegister["Message"])
                opFile.Error = str(putAndRegister["Message"])
                self.operation.Error = str(putAndRegister["Message"])
                continue

            putAndRegister = putAndRegister["Value"]

            if lfn in putAndRegister["Failed"]:
                if self.rmsMonitoring:
                    self.rmsMonitoringReporter.addRecord(
                        self.createRMSRecord("Failed", 1))
                else:
                    gMonitor.addMark("PutFail", 1)
                #         self.dataLoggingClient().addFileRecord( lfn, "PutFail", targetSE, "", "PutAndRegister" )

                reason = putAndRegister["Failed"][lfn]
                self.log.error("Failed to put and register file",
                               " %s at %s: %s" % (lfn, targetSE, reason))
                opFile.Error = str(reason)
                self.operation.Error = str(reason)
                continue

            putAndRegister = putAndRegister["Successful"]
            if lfn in putAndRegister:

                if "put" not in putAndRegister[lfn]:

                    if self.rmsMonitoring:
                        self.rmsMonitoringReporter.addRecord(
                            self.createRMSRecord("Failed", 1))
                    else:
                        gMonitor.addMark("PutFail", 1)
                    #           self.dataLoggingClient().addFileRecord( lfn, "PutFail", targetSE, "", "PutAndRegister" )

                    self.log.info("failed to put %s to %s" % (lfn, targetSE))

                    opFile.Error = "put failed"
                    self.operation.Error = "put failed"
                    continue

                if "register" not in putAndRegister[lfn]:

                    if self.rmsMonitoring:
                        self.rmsMonitoringReporter.addRecord(
                            self.createRMSRecord("Failed", 1))
                    else:
                        gMonitor.addMark("PutOK", 1)
                        gMonitor.addMark("RegisterFail", 1)

                    #           self.dataLoggingClient().addFileRecord( lfn, "Put", targetSE, "", "PutAndRegister" )
                    #           self.dataLoggingClient().addFileRecord( lfn, "RegisterFail", targetSE, "", "PutAndRegister" )

                    self.log.info("put of %s to %s took %s seconds" %
                                  (lfn, targetSE, putAndRegister[lfn]["put"]))
                    self.log.error("Register of lfn to SE failed",
                                   "%s to %s" % (lfn, targetSE))

                    opFile.Error = "failed to register %s at %s" % (lfn,
                                                                    targetSE)
                    opFile.Status = "Failed"

                    self.log.info(opFile.Error)
                    registerOperation = self.getRegisterOperation(
                        opFile, targetSE)
                    self.request.insertAfter(registerOperation, self.operation)
                    continue

                if self.rmsMonitoring:
                    self.rmsMonitoringReporter.addRecord(
                        self.createRMSRecord("Successful", 1))
                else:
                    gMonitor.addMark("PutOK", 1)
                    gMonitor.addMark("RegisterOK", 1)

                #         self.dataLoggingClient().addFileRecord( lfn, "Put", targetSE, "", "PutAndRegister" )
                #         self.dataLoggingClient().addFileRecord( lfn, "Register", targetSE, "", "PutAndRegister" )

                opFile.Status = "Done"
                for op in ("put", "register"):
                    self.log.info("%s of %s to %s took %s seconds" %
                                  (op, lfn, targetSE, putAndRegister[lfn][op]))

        if self.rmsMonitoring:
            self.rmsMonitoringReporter.commit()

        return S_OK()
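
Every branch in the handler above follows DIRAC's return-structure convention: callables return an S_OK or S_ERROR dictionary instead of raising. A minimal self-contained sketch of that convention (these two functions only mimic DIRAC.S_OK / DIRAC.S_ERROR for illustration; they are not the DIRAC implementations):

def S_OK(value=None):
    # success structure: callers test result["OK"] and read result["Value"]
    return {"OK": True, "Value": value}


def S_ERROR(message=""):
    # failure structure: callers test result["OK"] and read result["Message"]
    return {"OK": False, "Message": str(message)}


result = S_OK("registered")
if result["OK"]:
    print(result["Value"])      # -> registered
else:
    print(result["Message"])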
Example #11
class AgentModule(object):
    """ Base class for all agent modules

      This class is used by the AgentReactor Class to steer the execution of
      DIRAC Agents.

      For this purpose the following methods are used:
      - am_initialize()      just after instantiated
      - am_getPollingTime()  to set the execution frequency
      - am_getMaxCycles()    to determine the number of cycles
      - am_go()              for the actual execution of one cycle

      Before each iteration, the following methods are used to determine
      if the new cycle is to be started.
      - am_getModuleParam( 'alive' )
      - am_checkStopAgentFile()
      - am_removeStopAgentFile()

      To start a new execution cycle the following methods are used:
      - am_getCyclesDone()
      - am_setOption( 'MaxCycles', maxCycles )

      At the same time it provides all Agents with a common interface.
      All Agent classes must inherit from this base class and must implement
      at least the following method:
      - execute()            main method called in the agent cycle

      Additionally they may provide:
      - initialize()         for initial settings
      - finalize()           the graceful exit

      - beginExecution()     before each execution cycle
      - endExecution()       at the end of each execution cycle

      The agent can be stopped either by a signal or by creating a 'stop_agent' file
      in the controlDirectory defined in the agent configuration

  """
    def __init__(self,
                 agentName,
                 loadName,
                 baseAgentName=False,
                 properties={}):
        """
      Common __init__ method for all Agents.
      All Agent modules must define:
      __doc__
      __RCSID__
      They are used to populate __codeProperties

      The following Options are used from the Configuration:
      - /LocalSite/InstancePath
      - /DIRAC/Setup
      - Status
      - Enabled
      - PollingTime            default = 120
      - MaxCycles              default = 500
      - WatchdogTime           default = 0 (disabled)
      - ControlDirectory       control/SystemName/AgentName
      - WorkDirectory          work/SystemName/AgentName
      - shifterProxy           ''
      - shifterProxyLocation   WorkDirectory/SystemName/AgentName/.shifterCred

      It defines the following default Options that can be set via Configuration (above):
      - MonitoringEnabled     True
      - Enabled               True if Status == Active
      - PollingTime           120
      - MaxCycles             500
      - ControlDirectory      control/SystemName/AgentName
      - WorkDirectory         work/SystemName/AgentName
      - shifterProxy          False
      - shifterProxyLocation  work/SystemName/AgentName/.shifterCred

      Different defaults can be set in the initialize() method of the Agent using am_setOption()

      In order to get a shifter proxy in the environment during execute(),
      the configuration Option 'shifterProxy' must be set; a default may be
      given in the initialize() method.
    """
        if baseAgentName and agentName == baseAgentName:
            self.log = gLogger
            standaloneModule = True
        else:
            self.log = gLogger.getSubLogger(agentName, child=False)
            standaloneModule = False

        self.__basePath = gConfig.getValue('/LocalSite/InstancePath', rootPath)
        self.__agentModule = None
        self.__codeProperties = {}
        self.__getCodeInfo()

        self.__moduleProperties = {
            'fullName': agentName,
            'loadName': loadName,
            'section': PathFinder.getAgentSection(agentName),
            'loadSection': PathFinder.getAgentSection(loadName),
            'standalone': standaloneModule,
            'cyclesDone': 0,
            'totalElapsedTime': 0,
            'setup': gConfig.getValue("/DIRAC/Setup", "Unknown"),
            'alive': True
        }
        self.__moduleProperties['system'], self.__moduleProperties[
            'agentName'] = agentName.split("/")
        self.__configDefaults = {}
        self.__configDefaults['MonitoringEnabled'] = True
        self.__configDefaults['Enabled'] = self.am_getOption(
            "Status", "Active").lower() in ('active',)
        self.__configDefaults['PollingTime'] = self.am_getOption(
            "PollingTime", 120)
        self.__configDefaults['MaxCycles'] = self.am_getOption(
            "MaxCycles", 500)
        self.__configDefaults['WatchdogTime'] = self.am_getOption(
            "WatchdogTime", 0)
        self.__configDefaults['ControlDirectory'] = os.path.join(
            self.__basePath, 'control', *agentName.split("/"))
        self.__configDefaults['WorkDirectory'] = os.path.join(
            self.__basePath, 'work', *agentName.split("/"))
        self.__configDefaults['shifterProxy'] = ''
        self.__configDefaults['shifterProxyLocation'] = os.path.join(
            self.__configDefaults['WorkDirectory'], '.shifterCred')

        if isinstance(properties, dict):
            for key in properties:
                self.__moduleProperties[key] = properties[key]
            self.__moduleProperties['executors'] = [(self.execute, ())]
            self.__moduleProperties['shifterProxy'] = False

        self.__monitorLastStatsUpdate = -1
        self.monitor = None
        self.__initializeMonitor()
        self.__initialized = False

    def __getCodeInfo(self):
        versionVar = "__RCSID__"
        docVar = "__doc__"
        try:
            self.__agentModule = __import__(self.__class__.__module__,
                                            globals(), locals(), versionVar)
        except Exception as excp:
            self.log.exception("Cannot load agent module", lException=excp)
        for prop in ((versionVar, "version"), (docVar, "description")):
            try:
                self.__codeProperties[prop[1]] = getattr(
                    self.__agentModule, prop[0])
            except Exception:
                self.log.error("Missing property", prop[0])
                self.__codeProperties[prop[1]] = 'unset'
        self.__codeProperties['DIRACVersion'] = DIRAC.version
        self.__codeProperties['platform'] = DIRAC.getPlatform()

    def am_initialize(self, *initArgs):
        """ Common initialization for all the agents.

        This is executed every time an agent (re)starts.
        This is called by the AgentReactor and should not be overridden.
    """
        agentName = self.am_getModuleParam('fullName')
        result = self.initialize(*initArgs)
        if not isReturnStructure(result):
            return S_ERROR("initialize must return S_OK/S_ERROR")
        if not result['OK']:
            return S_ERROR("Error while initializing %s: %s" %
                           (agentName, result['Message']))
        mkDir(self.am_getControlDirectory())
        workDirectory = self.am_getWorkDirectory()
        mkDir(workDirectory)
        # Set the work directory in an environment variable available to subprocesses if needed
        os.environ['AGENT_WORKDIRECTORY'] = workDirectory

        self.__moduleProperties['shifterProxy'] = self.am_getOption(
            'shifterProxy')
        if self.am_monitoringEnabled() and not self.activityMonitoring:
            self.monitor.enable()
        if len(self.__moduleProperties['executors']) < 1:
            return S_ERROR("At least one executor method has to be defined")
        if not self.am_Enabled():
            return S_ERROR("Agent is disabled via the configuration")
        self.log.notice("=" * 40)
        self.log.notice("Loaded agent module %s" %
                        self.__moduleProperties['fullName'])
        self.log.notice(" Site: %s" % DIRAC.siteName())
        self.log.notice(" Setup: %s" % gConfig.getValue("/DIRAC/Setup"))
        self.log.notice(" Base Module version: %s " % __RCSID__)
        self.log.notice(" Agent version: %s" %
                        self.__codeProperties['version'])
        self.log.notice(" DIRAC version: %s" % DIRAC.version)
        self.log.notice(" DIRAC platform: %s" % DIRAC.getPlatform())
        pollingTime = int(self.am_getOption('PollingTime'))
        if pollingTime > 3600:
            self.log.notice(" Polling time: %s hours" % (pollingTime / 3600.))
        else:
            self.log.notice(" Polling time: %s seconds" %
                            self.am_getOption('PollingTime'))
        self.log.notice(" Control dir: %s" % self.am_getControlDirectory())
        self.log.notice(" Work dir: %s" % self.am_getWorkDirectory())
        if self.am_getOption('MaxCycles') > 0:
            self.log.notice(" Cycles: %s" % self.am_getMaxCycles())
        else:
            self.log.notice(" Cycles: unlimited")
        if self.am_getWatchdogTime() > 0:
            self.log.notice(" Watchdog interval: %s" %
                            self.am_getWatchdogTime())
        else:
            self.log.notice(" Watchdog interval: disabled ")
        self.log.notice("=" * 40)
        self.__initialized = True
        return S_OK()

    def am_getControlDirectory(self):
        return os.path.join(self.__basePath,
                            str(self.am_getOption('ControlDirectory')))

    def am_getStopAgentFile(self):
        return os.path.join(self.am_getControlDirectory(), 'stop_agent')

    def am_checkStopAgentFile(self):
        return os.path.isfile(self.am_getStopAgentFile())

    def am_createStopAgentFile(self):
        try:
            with open(self.am_getStopAgentFile(), 'w') as fd:
                fd.write('Dirac site agent Stopped at %s' % Time.toString())
        except Exception:
            pass

    def am_removeStopAgentFile(self):
        try:
            os.unlink(self.am_getStopAgentFile())
        except Exception:
            pass

    def am_getBasePath(self):
        return self.__basePath

    def am_getWorkDirectory(self):
        return os.path.join(self.__basePath,
                            str(self.am_getOption('WorkDirectory')))

    def am_getShifterProxyLocation(self):
        return os.path.join(self.__basePath,
                            str(self.am_getOption('shifterProxyLocation')))

    def am_getOption(self, optionName, defaultValue=None):
        """ Gets an option from the agent's configuration section.
        The section will be a subsection of the /Systems section in the CS.
    """
        if defaultValue is None:
            if optionName in self.__configDefaults:
                defaultValue = self.__configDefaults[optionName]
        if optionName and optionName[0] == "/":
            return gConfig.getValue(optionName, defaultValue)
        for section in (self.__moduleProperties['section'],
                        self.__moduleProperties['loadSection']):
            result = gConfig.getOption("%s/%s" % (section, optionName),
                                       defaultValue)
            if result['OK']:
                return result['Value']
        return defaultValue
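
    # Illustration (comments only, not part of the original class):
    # am_getOption resolves the default from self.__configDefaults, then asks
    # the CS for <agent section>/<option> and <load section>/<option>,
    # returning the first value defined there; e.g. am_getOption("PollingTime")
    # yields 120 when the CS holds no override.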

    def am_setOption(self, optionName, value):
        self.__configDefaults[optionName] = value

    def am_getModuleParam(self, optionName):
        return self.__moduleProperties[optionName]

    def am_setModuleParam(self, optionName, value):
        self.__moduleProperties[optionName] = value

    def am_getPollingTime(self):
        return self.am_getOption("PollingTime")

    def am_getMaxCycles(self):
        return self.am_getOption("MaxCycles")

    def am_getWatchdogTime(self):
        return int(self.am_getOption("WatchdogTime"))

    def am_getCyclesDone(self):
        return self.am_getModuleParam('cyclesDone')

    def am_Enabled(self):
        return self.am_getOption("Enabled")

    def am_disableMonitoring(self):
        self.am_setOption('MonitoringEnabled', False)

    def am_monitoringEnabled(self):
        return self.am_getOption("MonitoringEnabled")

    def am_stopExecution(self):
        self.am_setModuleParam('alive', False)

    def __initializeMonitor(self):
        """
    Initialize the system monitoring.
    """
        # This flag is used to activate ES based monitoring
        # if the "EnableActivityMonitoring" flag is "yes" or "true" in the cfg file.
        self.activityMonitoring = (
            Operations().getValue("EnableActivityMonitoring", False)
            or self.am_getOption("EnableActivityMonitoring", False))
        if self.activityMonitoring:
            # The import needs to be here because the CS must be initialized before importing
            # this class (see https://github.com/DIRACGrid/DIRAC/issues/4793)
            from DIRAC.MonitoringSystem.Client.MonitoringReporter import MonitoringReporter
            self.activityMonitoringReporter = MonitoringReporter(
                monitoringType="ComponentMonitoring")
            # With the help of this periodic task we commit the data to ES at an interval of 100 seconds.
            gThreadScheduler.addPeriodicTask(
                100, self.__activityMonitoringReporting)
        else:
            if self.__moduleProperties['standalone']:
                self.monitor = gMonitor
            else:
                self.monitor = MonitoringClient()
            self.monitor.setComponentType(self.monitor.COMPONENT_AGENT)
            self.monitor.setComponentName(self.__moduleProperties['fullName'])
            self.monitor.initialize()
            self.monitor.registerActivity('CPU', "CPU Usage", 'Framework',
                                          "CPU,%", self.monitor.OP_MEAN, 600)
            self.monitor.registerActivity('MEM', "Memory Usage", 'Framework',
                                          'Memory,MB', self.monitor.OP_MEAN,
                                          600)
            # Component monitor
            for field in ('version', 'DIRACVersion', 'description',
                          'platform'):
                self.monitor.setComponentExtraParam(
                    field, self.__codeProperties[field])
            self.monitor.setComponentExtraParam('startTime', Time.dateTime())
            self.monitor.setComponentExtraParam('cycles', 0)
            self.monitor.disable()
            self.__monitorLastStatsUpdate = time.time()

    def am_secureCall(self, functor, args=(), name=False):
        if not name:
            name = str(functor)
        try:
            result = functor(*args)
            if not isReturnStructure(result):
                raise Exception(
                    "%s method for %s module has to return S_OK/S_ERROR" %
                    (name, self.__moduleProperties['fullName']))
            return result
        except Exception as e:
            self.log.exception("Agent exception while calling method %s" %
                               name,
                               lException=e)
            return S_ERROR("Exception while calling %s method: %s" %
                           (name, str(e)))

    def _setShifterProxy(self):
        if self.__moduleProperties["shifterProxy"]:
            result = setupShifterProxyInEnv(
                self.__moduleProperties["shifterProxy"],
                self.am_getShifterProxyLocation())
            if not result['OK']:
                self.log.error("Failed to set shifter proxy",
                               result['Message'])
                return result
        return S_OK()

    def am_go(self):
        # Set the shifter proxy if required
        result = self._setShifterProxy()
        if not result['OK']:
            return result
        self.log.notice("-" * 40)
        self.log.notice("Starting cycle for module %s" %
                        self.__moduleProperties['fullName'])
        mD = self.am_getMaxCycles()
        if mD > 0:
            cD = self.__moduleProperties['cyclesDone']
            self.log.notice("Remaining %s of %s cycles" % (mD - cD, mD))
        self.log.notice("-" * 40)
        # use SIGALRM as a watchdog interrupt if enabled
        watchdogInt = self.am_getWatchdogTime()
        if watchdogInt > 0:
            signal.signal(signal.SIGALRM, signal.SIG_DFL)
            signal.alarm(watchdogInt)
        elapsedTime = time.time()
        cpuStats = self._startReportToMonitoring()
        cycleResult = self.__executeModuleCycle()
        if cpuStats:
            self._endReportToMonitoring(*cpuStats)
        # Increment counters
        self.__moduleProperties['cyclesDone'] += 1
        # Show status
        elapsedTime = time.time() - elapsedTime
        self.__moduleProperties['totalElapsedTime'] += elapsedTime
        self.log.notice("-" * 40)
        self.log.notice("Agent module %s run summary" %
                        self.__moduleProperties['fullName'])
        self.log.notice(" Executed %s times previously" %
                        self.__moduleProperties['cyclesDone'])
        self.log.notice(" Cycle took %.2f seconds" % elapsedTime)
        averageElapsedTime = self.__moduleProperties[
            'totalElapsedTime'] / self.__moduleProperties['cyclesDone']
        self.log.notice(" Average execution time: %.2f seconds" %
                        (averageElapsedTime))
        elapsedPollingRate = averageElapsedTime * 100 / self.am_getOption(
            'PollingTime')
        self.log.notice(" Polling time: %s seconds" %
                        self.am_getOption('PollingTime'))
        self.log.notice(" Average execution/polling time: %.2f%%" %
                        elapsedPollingRate)
        if cycleResult['OK']:
            self.log.notice(" Cycle was successful")
            if self.activityMonitoring:
                # Here we record the cycle duration along with some basic details about
                # the component; the record is not committed to the ES backend yet.
                self.activityMonitoringReporter.addRecord({
                    'timestamp':
                    int(Time.toEpoch()),
                    'host':
                    Network.getFQDN(),
                    'componentType':
                    "agent",
                    'component':
                    "_".join(self.__moduleProperties['fullName'].split("/")),
                    'cycleDuration':
                    elapsedTime,
                    'cycles':
                    1
                })
        else:
            self.log.warn(" Cycle had an error:", cycleResult['Message'])
        self.log.notice("-" * 40)
        # Update number of cycles
        if not self.activityMonitoring:
            self.monitor.setComponentExtraParam(
                'cycles', self.__moduleProperties['cyclesDone'])
        # cycle finished successfully, cancel watchdog
        if watchdogInt > 0:
            signal.alarm(0)
        return cycleResult

    def _startReportToMonitoring(self):
        try:
            if not self.activityMonitoring:
                now = time.time()
                stats = os.times()
                cpuTime = stats[0] + stats[2]
                if now - self.__monitorLastStatsUpdate < 10:
                    return (now, cpuTime)
                # Send CPU consumption mark
                self.__monitorLastStatsUpdate = now
                # Send Memory consumption mark
                membytes = MemStat.VmB('VmRSS:')
                if membytes:
                    mem = membytes / (1024. * 1024.)
                    gMonitor.addMark('MEM', mem)
                return (now, cpuTime)
            else:
                return False
        except Exception:
            return False

    def _endReportToMonitoring(self, initialWallTime, initialCPUTime):
        wallTime = time.time() - initialWallTime
        stats = os.times()
        cpuTime = stats[0] + stats[2] - initialCPUTime
        percentage = 0
        if wallTime:
            percentage = cpuTime / wallTime * 100.
        if percentage > 0:
            gMonitor.addMark('CPU', percentage)

    def __executeModuleCycle(self):
        # Execute the beginExecution function
        result = self.am_secureCall(self.beginExecution, name="beginExecution")
        if not result['OK']:
            return result
        # Launch executor functions
        executors = self.__moduleProperties['executors']
        if len(executors) == 1:
            result = self.am_secureCall(executors[0][0], executors[0][1])
            if not result['OK']:
                return result
        else:
            exeThreads = [
                threading.Thread(target=executor[0], args=executor[1])
                for executor in executors
            ]
            for thread in exeThreads:
                thread.daemon = True
                thread.start()
            for thread in exeThreads:
                thread.join()
        # Execute the endExecution function
        return self.am_secureCall(self.endExecution, name="endExecution")

    def initialize(self, *args, **kwargs):
        """ Agents should override this method for specific initialization.
        Executed at every agent (re)start.
    """
        return S_OK()

    def beginExecution(self):
        return S_OK()

    def endExecution(self):
        return S_OK()

    def finalize(self):
        return S_OK()

    def execute(self):
        return S_ERROR("Execute method has to be overwritten by agent module")

    def __activityMonitoringReporting(self):
        """ This method is called by the ThreadScheduler as a periodic task in order to commit
        the collected data, which is done by the MonitoringReporter and sent to the 'ComponentMonitoring' type.

        :return: True / False
    """
        result = self.activityMonitoringReporter.commit()
        return result['OK']
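
Given the lifecycle described in the class docstring, a concrete agent only needs to subclass AgentModule and implement execute(). A hedged sketch (SimpleAgent and its log message are illustrative, not part of DIRAC; the imports are the usual DIRAC locations):

from DIRAC import S_OK
from DIRAC.Core.Base.AgentModule import AgentModule


class SimpleAgent(AgentModule):
    """Do-nothing agent illustrating the AgentModule contract."""

    def initialize(self):
        # runs once per (re)start; override defaults here if needed
        self.am_setOption("PollingTime", 60)
        return S_OK()

    def execute(self):
        # runs every cycle and must return S_OK/S_ERROR
        self.log.notice("cycles done so far: %s" % self.am_getCyclesDone())
        return S_OK()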
Example #12
class RequestTask(object):
    """
    .. class:: RequestTask

    request's processing task
    """
    def __init__(self,
                 requestJSON,
                 handlersDict,
                 csPath,
                 agentName,
                 standalone=False,
                 requestClient=None,
                 rmsMonitoring=False):
        """c'tor

        :param self: self reference
        :param str requestJSON: request serialized to JSON
        :param dict opHandlers: operation handlers
        """
        self.request = Request(requestJSON)
        # # csPath
        self.csPath = csPath
        # # agent name
        self.agentName = agentName
        # # standalone flag
        self.standalone = standalone
        # # handlers dict
        self.handlersDict = handlersDict
        # # handlers class def
        self.handlers = {}
        # # own sublogger
        self.log = gLogger.getSubLogger(
            "pid_%s/%s" % (os.getpid(), self.request.RequestName))
        # # get shifters info
        self.__managersDict = {}
        shifterProxies = self.__setupManagerProxies()
        if not shifterProxies["OK"]:
            self.log.error("Cannot setup shifter proxies",
                           shifterProxies["Message"])

        # This flag is set and sent from the RequestExecutingAgent and is False by default.
        self.rmsMonitoring = rmsMonitoring

        if self.rmsMonitoring:
            self.rmsMonitoringReporter = MonitoringReporter(
                monitoringType="RMSMonitoring")
        else:
            # # initialize gMonitor
            gMonitor.setComponentType(gMonitor.COMPONENT_AGENT)
            gMonitor.setComponentName(self.agentName)
            gMonitor.initialize()

            # # own gMonitor activities
            gMonitor.registerActivity("RequestAtt", "Requests processed",
                                      "RequestExecutingAgent", "Requests/min",
                                      gMonitor.OP_SUM)
            gMonitor.registerActivity("RequestFail", "Requests failed",
                                      "RequestExecutingAgent", "Requests/min",
                                      gMonitor.OP_SUM)
            gMonitor.registerActivity("RequestOK", "Requests done",
                                      "RequestExecutingAgent", "Requests/min",
                                      gMonitor.OP_SUM)

        if requestClient is None:
            self.requestClient = ReqClient()
        else:
            self.requestClient = requestClient

    def __setupManagerProxies(self):
        """setup grid proxy for all defined managers"""
        oHelper = Operations()
        shifters = oHelper.getSections("Shifter")
        if not shifters["OK"]:
            return shifters
        shifters = shifters["Value"]
        for shifter in shifters:
            shifterDict = oHelper.getOptionsDict("Shifter/%s" % shifter)
            if not shifterDict["OK"]:
                self.log.error("Cannot get options dict for shifter",
                               "%s: %s" % (shifter, shifterDict["Message"]))
                continue
            userName = shifterDict["Value"].get("User", "")
            userGroup = shifterDict["Value"].get("Group", "")

            userDN = Registry.getDNForUsername(userName)
            if not userDN["OK"]:
                self.log.error("Cannot get DN For Username",
                               "%s: %s" % (userName, userDN["Message"]))
                continue
            userDN = userDN["Value"][0]
            vomsAttr = Registry.getVOMSAttributeForGroup(userGroup)
            if vomsAttr:
                self.log.debug(
                    "getting VOMS [%s] proxy for shifter %s@%s (%s)" %
                    (vomsAttr, userName, userGroup, userDN))
                getProxy = gProxyManager.downloadVOMSProxyToFile(
                    userDN,
                    userGroup,
                    requiredTimeLeft=1200,
                    cacheTime=4 * 43200)
            else:
                self.log.debug("getting proxy for shifter %s@%s (%s)" %
                               (userName, userGroup, userDN))
                getProxy = gProxyManager.downloadProxyToFile(
                    userDN,
                    userGroup,
                    requiredTimeLeft=1200,
                    cacheTime=4 * 43200)
            if not getProxy["OK"]:
                return S_ERROR("unable to setup shifter proxy for %s: %s" %
                               (shifter, getProxy["Message"]))
            chain = getProxy["chain"]
            fileName = getProxy["Value"]
            self.log.debug("got %s: %s %s" % (shifter, userName, userGroup))
            self.__managersDict[shifter] = {
                "ShifterDN": userDN,
                "ShifterName": userName,
                "ShifterGroup": userGroup,
                "Chain": chain,
                "ProxyFile": fileName,
            }
        return S_OK()

    def setupProxy(self):
        """download and dump request owner proxy to file and env

        :return: S_OK with name of newly created owner proxy file and shifter name if any
        """
        self.__managersDict = {}
        shifterProxies = self.__setupManagerProxies()
        if not shifterProxies["OK"]:
            self.log.error(shifterProxies["Message"])

        ownerDN = self.request.OwnerDN
        ownerGroup = self.request.OwnerGroup
        isShifter = []
        for shifter, creds in self.__managersDict.items():
            if creds["ShifterDN"] == ownerDN and creds[
                    "ShifterGroup"] == ownerGroup:
                isShifter.append(shifter)
        if isShifter:
            proxyFile = self.__managersDict[isShifter[0]]["ProxyFile"]
            os.environ["X509_USER_PROXY"] = proxyFile
            return S_OK({"Shifter": isShifter, "ProxyFile": proxyFile})

        # # if we're here owner is not a shifter at all
        ownerProxyFile = gProxyManager.downloadVOMSProxyToFile(
            ownerDN, ownerGroup)
        if not ownerProxyFile["OK"] or not ownerProxyFile["Value"]:
            reason = ownerProxyFile.get(
                "Message", "No valid proxy found in ProxyManager.")
            return S_ERROR("Change proxy error for '%s'@'%s': %s" %
                           (ownerDN, ownerGroup, reason))

        ownerProxyFile = ownerProxyFile["Value"]
        os.environ["X509_USER_PROXY"] = ownerProxyFile
        return S_OK({"Shifter": isShifter, "ProxyFile": ownerProxyFile})

    @staticmethod
    def getPluginName(pluginPath):
        """return plugin name"""
        if not pluginPath:
            return ""
        if "/" in pluginPath:
            pluginPath = ".".join(
                [chunk for chunk in pluginPath.split("/") if chunk])
        return pluginPath.split(".")[-1]
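
    # Illustration (comments only, not part of the original class):
    #   getPluginName("DIRAC/CheesShopSystem/private/Cheddar") -> "Cheddar"
    #   getPluginName("DIRAC.CheesShopSystem.private.Cheddar") -> "Cheddar"
    # loadHandler below applies the same slash-to-dot normalisation before importing.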

    def loadHandler(self, pluginPath):
        """Create an instance of the requested plugin class, loading and importing it when needed.
        This function can raise ImportError when the plugin cannot be found or TypeError when the
        loaded class object isn't inherited from the BaseOperation class.

        :param str pluginPath: dotted path to the plugin, specified as in an import statement, i.e.
            "DIRAC.CheesShopSystem.private.Cheddar", or alternatively in 'normal' path format
            "DIRAC/CheesShopSystem/private/Cheddar"

        :return: object instance

        This function tries to load and instantiate an object from the given path. It is assumed that:

          * `pluginPath` points to a module "importable" by the python interpreter, i.e. its
            package's top-level directory is in the $PYTHONPATH env variable,
          * the module contains a class definition named after the module,
          * the class itself is inherited from DIRAC.RequestManagementSystem.private.BaseOperation.BaseOperation

        If the above conditions aren't met, the function raises exceptions:

        :raises ImportError: when the class cannot be imported
        :raises TypeError: when the class isn't inherited from OperationHandlerBase
        """
        if "/" in pluginPath:
            pluginPath = ".".join(
                [chunk for chunk in pluginPath.split("/") if chunk])
        pluginName = pluginPath.split(".")[-1]
        if pluginName not in globals():
            mod = __import__(pluginPath, globals(), fromlist=[pluginName])
            pluginClassObj = getattr(mod, pluginName)
        else:
            pluginClassObj = globals()[pluginName]
        if not issubclass(pluginClassObj, OperationHandlerBase):
            raise TypeError(
                "operation handler '%s' isn't inherited from OperationHandlerBase class"
                % pluginName)

        if not self.rmsMonitoring:
            for key, status in (("Att", "Attempted"), ("OK", "Successful"),
                                ("Fail", "Failed")):
                gMonitor.registerActivity(
                    "%s%s" % (pluginName, key),
                    "%s operations %s" % (pluginName, status),
                    "RequestExecutingAgent",
                    "Operations/min",
                    gMonitor.OP_SUM,
                )
        # # return an instance
        return pluginClassObj

    def getHandler(self, operation):
        """return instance of a handler for a given operation type on demand
            all created handlers are kept in self.handlers dict for further use

        :param ~Operation.Operation operation: Operation instance
        """
        if operation.Type not in self.handlersDict:
            return S_ERROR("handler for operation '%s' not set" %
                           operation.Type)
        handler = self.handlers.get(operation.Type, None)
        if not handler:
            try:
                handlerCls = self.loadHandler(
                    self.handlersDict[operation.Type])
                self.handlers[operation.Type] = handlerCls(
                    csPath="%s/OperationHandlers/%s" %
                    (self.csPath, operation.Type))
                handler = self.handlers[operation.Type]
            except (ImportError, TypeError) as error:
                self.log.exception("Error getting Handler",
                                   "%s" % error,
                                   lException=error)
                return S_ERROR(str(error))
        # # set operation for this handler
        handler.setOperation(operation)
        # # and return
        return S_OK(handler)

    def updateRequest(self):
        """put back request to the RequestDB"""
        updateRequest = self.requestClient.putRequest(self.request,
                                                      useFailoverProxy=False,
                                                      retryMainService=2)
        if not updateRequest["OK"]:
            self.log.error("Cannot updateRequest", updateRequest["Message"])
        return updateRequest

    def __call__(self):
        """request processing"""

        self.log.debug("about to execute request")
        if not self.rmsMonitoring:
            gMonitor.addMark("RequestAtt", 1)

        # # setup proxy for request owner
        setupProxy = self.setupProxy()
        if not setupProxy["OK"]:
            userSuspended = "User is currently suspended"
            self.request.Error = setupProxy["Message"]
            # In case the user does not have proxy
            if DErrno.cmpError(setupProxy, DErrno.EPROXYFIND):
                self.log.error("Error setting proxy. Request set to Failed:",
                               setupProxy["Message"])
                # If user is no longer registered, fail the request
                for operation in self.request:
                    for opFile in operation:
                        opFile.Status = "Failed"
                    operation.Status = "Failed"
            elif userSuspended in setupProxy["Message"]:
                # If user is suspended, wait for a long time
                self.request.delayNextExecution(6 * 60)
                self.request.Error = userSuspended
                self.log.error("Error setting proxy: " + userSuspended,
                               self.request.OwnerDN)
            else:
                self.log.error("Error setting proxy", setupProxy["Message"])
            return S_OK(self.request)
        shifter = setupProxy["Value"]["Shifter"]

        error = None

        while self.request.Status == "Waiting":

            # # get waiting operation
            operation = self.request.getWaiting()
            if not operation["OK"]:
                self.log.error("Cannot get waiting operation",
                               operation["Message"])
                return operation
            operation = operation["Value"]
            self.log.info("executing operation", "%s" % operation.Type)

            # # and handler for it
            handler = self.getHandler(operation)
            if not handler["OK"]:
                self.log.error("Unable to process operation",
                               "%s: %s" % (operation.Type, handler["Message"]))
                # gMonitor.addMark( "%s%s" % ( operation.Type, "Fail" ), 1 )
                operation.Error = handler["Message"]
                break

            handler = handler["Value"]
            # # set shifters list in the handler
            handler.shifter = shifter
            # set rmsMonitoring flag for the RequestOperation
            handler.rmsMonitoring = self.rmsMonitoring
            # # and execute
            pluginName = self.getPluginName(
                self.handlersDict.get(operation.Type))
            if self.standalone:
                useServerCertificate = gConfig.useServerCertificate()
            else:
                # Always use server certificates if executed within an agent
                useServerCertificate = True
            try:
                if pluginName:
                    if self.rmsMonitoring:
                        self.rmsMonitoringReporter.addRecord({
                            "timestamp":
                            int(Time.toEpoch()),
                            "host":
                            Network.getFQDN(),
                            "objectType":
                            "Operation",
                            "operationType":
                            pluginName,
                            "objectID":
                            operation.OperationID,
                            "parentID":
                            operation.RequestID,
                            "status":
                            "Attempted",
                            "nbObject":
                            1,
                        })
                    else:
                        gMonitor.addMark("%s%s" % (pluginName, "Att"), 1)
                # Always use request owner proxy
                if useServerCertificate:
                    gConfigurationData.setOptionInCFG(
                        "/DIRAC/Security/UseServerCertificate", "false")
                exe = handler()
                if useServerCertificate:
                    gConfigurationData.setOptionInCFG(
                        "/DIRAC/Security/UseServerCertificate", "true")
                if not exe["OK"]:
                    self.log.error("unable to process operation",
                                   "%s: %s" % (operation.Type, exe["Message"]))
                    if pluginName:
                        if self.rmsMonitoring:
                            self.rmsMonitoringReporter.addRecord({
                                "timestamp":
                                int(Time.toEpoch()),
                                "host":
                                Network.getFQDN(),
                                "objectType":
                                "Operation",
                                "operationType":
                                pluginName,
                                "objectID":
                                operation.OperationID,
                                "parentID":
                                operation.RequestID,
                                "status":
                                "Failed",
                                "nbObject":
                                1,
                            })
                        else:
                            gMonitor.addMark("%s%s" % (pluginName, "Fail"), 1)
                    if self.rmsMonitoring:
                        self.rmsMonitoringReporter.addRecord({
                            "timestamp":
                            int(Time.toEpoch()),
                            "host":
                            Network.getFQDN(),
                            "objectType":
                            "Request",
                            "objectID":
                            operation.RequestID,
                            "status":
                            "Failed",
                            "nbObject":
                            1,
                        })
                    else:
                        gMonitor.addMark("RequestFail", 1)

                    if self.request.JobID:
                        # Check if the job exists
                        monitorServer = JobMonitoringClient(
                            useCertificates=True)
                        res = monitorServer.getJobSummary(
                            int(self.request.JobID))
                        if not res["OK"]:
                            self.log.error(
                                "RequestTask: Failed to get job status",
                                "%d" % self.request.JobID)
                        elif not res["Value"]:
                            self.log.warn(
                                "RequestTask: job does not exist (anymore): failed request",
                                "JobID: %d" % self.request.JobID,
                            )
                            for opFile in operation:
                                opFile.Status = "Failed"
                            if operation.Status != "Failed":
                                operation.Status = "Failed"
                            self.request.Error = "Job no longer exists"
            except Exception as e:
                error = str(e)
                self.log.exception("hit by exception:", "%s" % error)
                if pluginName:
                    if self.rmsMonitoring:
                        self.rmsMonitoringReporter.addRecord({
                            "timestamp":
                            int(Time.toEpoch()),
                            "host":
                            Network.getFQDN(),
                            "objectType":
                            "Operation",
                            "operationType":
                            pluginName,
                            "objectID":
                            operation.OperationID,
                            "parentID":
                            operation.RequestID,
                            "status":
                            "Failed",
                            "nbObject":
                            1,
                        })
                    else:
                        gMonitor.addMark("%s%s" % (pluginName, "Fail"), 1)
                if self.rmsMonitoring:
                    self.rmsMonitoringReporter.addRecord({
                        "timestamp":
                        int(Time.toEpoch()),
                        "host":
                        Network.getFQDN(),
                        "objectType":
                        "Request",
                        "objectID":
                        operation.RequestID,
                        "status":
                        "Failed",
                        "nbObject":
                        1,
                    })
                else:
                    gMonitor.addMark("RequestFail", 1)

                if useServerCertificate:
                    gConfigurationData.setOptionInCFG(
                        "/DIRAC/Security/UseServerCertificate", "true")
                break

            # # operation status check
            if operation.Status == "Done" and pluginName:
                if self.rmsMonitoring:
                    self.rmsMonitoringReporter.addRecord({
                        "timestamp":
                        int(Time.toEpoch()),
                        "host":
                        Network.getFQDN(),
                        "objectType":
                        "Operation",
                        "operationType":
                        pluginName,
                        "objectID":
                        operation.OperationID,
                        "parentID":
                        operation.RequestID,
                        "status":
                        "Successful",
                        "nbObject":
                        1,
                    })
                else:
                    gMonitor.addMark("%s%s" % (pluginName, "OK"), 1)
            elif operation.Status == "Failed" and pluginName:
                if self.rmsMonitoring:
                    self.rmsMonitoringReporter.addRecord({
                        "timestamp":
                        int(Time.toEpoch()),
                        "host":
                        Network.getFQDN(),
                        "objectType":
                        "Operation",
                        "operationType":
                        pluginName,
                        "objectID":
                        operation.OperationID,
                        "parentID":
                        operation.RequestID,
                        "status":
                        "Failed",
                        "nbObject":
                        1,
                    })
                else:
                    gMonitor.addMark("%s%s" % (pluginName, "Fail"), 1)
            elif operation.Status in ("Waiting", "Scheduled"):
                # # no update for waiting or all files scheduled
                break

        if not self.rmsMonitoring:
            gMonitor.flush()

        if error:
            return S_ERROR(error)

        # # request done?
        if self.request.Status == "Done":
            # # update request to the RequestDB
            self.log.info("Updating request status:",
                          "%s" % self.request.Status)
            update = self.updateRequest()
            if not update["OK"]:
                self.log.error("Cannot update request status",
                               update["Message"])
                return update
            self.log.info("request is done", "%s" % self.request.RequestName)
            if self.rmsMonitoring:
                self.rmsMonitoringReporter.addRecord({
                    "timestamp":
                    int(Time.toEpoch()),
                    "host":
                    Network.getFQDN(),
                    "objectType":
                    "Request",
                    "objectID":
                    getattr(self.request, "RequestID", 0),
                    "status":
                    "Successful",
                    "nbObject":
                    1,
                })
            else:
                gMonitor.addMark("RequestOK", 1)
            # # and there is a job waiting for it? finalize!
            if self.request.JobID:
                attempts = 0
                while True:
                    finalizeRequest = self.requestClient.finalizeRequest(
                        self.request.RequestID,
                        self.request.JobID  # pylint: disable=no-member
                    )
                    if not finalizeRequest["OK"]:
                        if not attempts:
                            self.log.error(
                                "unable to finalize request, will retry",
                                "ReqName %s:%s" % (self.request.RequestName,
                                                   finalizeRequest["Message"]),
                            )
                        self.log.debug("Waiting 10 seconds")
                        attempts += 1
                        if attempts == 10:
                            self.log.error("Giving up finalize request")
                            return S_ERROR("Could not finalize request")

                        time.sleep(10)

                    else:
                        self.log.info(
                            "request is finalized",
                            "ReqName %s %s" % (self.request.RequestName,
                                               (" after %d attempts" %
                                                attempts) if attempts else ""),
                        )
                        break

        # Commit all the data to the ES Backend
        if self.rmsMonitoring:
            self.rmsMonitoringReporter.commit()
        # Request will be updated by the callBack method
        self.log.verbose("RequestTasks exiting",
                         "request %s" % self.request.Status)
        return S_OK(self.request)
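
RequestTask is driven entirely by handlersDict, which maps an Operation.Type string to the importable path of its handler; loadHandler accepts both the dotted and the slash-separated form. A hedged wiring sketch (the handler paths, CS path, agent name, and JSON payload are illustrative assumptions):

requestJSON = '{"RequestName": "example"}'  # placeholder payload from the caller

handlersDict = {
    # assumed layout: DIRAC/<System>/Agent/RequestOperations/<Handler>
    "RegisterFile": "DIRAC/DataManagementSystem/Agent/RequestOperations/RegisterFile",
    "ReTransfer": "DIRAC/DataManagementSystem/Agent/RequestOperations/ReTransfer",
}

task = RequestTask(
    requestJSON,
    handlersDict,
    csPath="Systems/RequestManagement/Agents/RequestExecutingAgent",  # assumed
    agentName="RequestManagement/RequestExecutingAgent",
    rmsMonitoring=False,
)
result = task()  # S_OK(request) on normal completion, S_ERROR otherwise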
Example #13
class ReTransfer(DMSRequestOperationsBase):
    """
    .. class:: ReTransfer


    online ReTransfer operation handler

    :param self: self reference
    :param ~DIRAC.RequestManagementSystem.Client.Operation.Operation operation: Operation instance
    :param str csPath: CS path for this handler

    """
    def __init__(self, operation=None, csPath=None):
        """c'tor"""
        # # base class ctor
        DMSRequestOperationsBase.__init__(self, operation, csPath)

    def __call__(self):
        """reTransfer operation execution"""

        # The flag  'rmsMonitoring' is set by the RequestTask and is False by default.
        # Here we use 'createRMSRecord' to create the ES record which is defined inside OperationHandlerBase.
        if self.rmsMonitoring:
            self.rmsMonitoringReporter = MonitoringReporter(
                monitoringType="RMSMonitoring")

        # # list of targetSEs
        targetSEs = self.operation.targetSEList
        # # check targetSEs for removal
        targetSE = targetSEs[0]
        bannedTargets = self.checkSEsRSS(targetSE)
        if not bannedTargets["OK"]:
            if self.rmsMonitoring:
                for status in ["Attempted", "Failed"]:
                    self.rmsMonitoringReporter.addRecord(
                        self.createRMSRecord(status, len(self.operation)))
                self.rmsMonitoringReporter.commit()
            return bannedTargets

        if bannedTargets["Value"]:
            return S_OK("%s targets are banned for writing" %
                        ",".join(bannedTargets["Value"]))

        # # get waiting files
        waitingFiles = self.getWaitingFilesList()
        # # prepare waiting files
        toRetransfer = {opFile.PFN: opFile for opFile in waitingFiles}

        if self.rmsMonitoring:
            self.rmsMonitoringReporter.addRecord(
                self.createRMSRecord("Attempted", len(toRetransfer)))

        if len(targetSEs) != 1:
            error = "only one TargetSE allowed, got %d" % len(targetSEs)
            for opFile in toRetransfer.values():
                opFile.Error = error
                opFile.Status = "Failed"
            self.operation.Error = error

            if self.rmsMonitoring:
                self.rmsMonitoringReporter.addRecord(
                    self.createRMSRecord("Failed", len(toRetransfer)))
                self.rmsMonitoringReporter.commit()

            return S_ERROR(error)

        se = StorageElement(targetSE)
        for opFile in toRetransfer.values():

            reTransfer = se.retransferOnlineFile(opFile.LFN)
            if not reTransfer["OK"]:
                opFile.Error = reTransfer["Message"]
                self.log.error("Retransfer failed", opFile.Error)

                if self.rmsMonitoring:
                    self.rmsMonitoringReporter.addRecord(
                        self.createRMSRecord("Failed", 1))

                continue
            reTransfer = reTransfer["Value"]
            if opFile.LFN in reTransfer["Failed"]:
                opFile.Error = reTransfer["Failed"][opFile.LFN]
                self.log.error("Retransfer failed", opFile.Error)

                if self.rmsMonitoring:
                    self.rmsMonitoringReporter.addRecord(
                        self.createRMSRecord("Failed", 1))

                continue
            opFile.Status = "Done"
            self.log.info("%s retransfer done" % opFile.LFN)

            if self.rmsMonitoring:
                self.rmsMonitoringReporter.addRecord(
                    self.createRMSRecord("Successful", 1))

        if self.rmsMonitoring:
            self.rmsMonitoringReporter.commit()

        return S_OK()
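
retransferOnlineFile returns the same bulk-result shape the other handlers unpack: an S_OK whose value carries per-LFN "Successful" and "Failed" maps. A minimal consumption sketch (the payload is invented):

reply = {"OK": True,
         "Value": {"Successful": {"/lfn/a": True},
                   "Failed": {"/lfn/b": "no such file"}}}
if reply["OK"]:
    for lfn, reason in reply["Value"]["Failed"].items():
        print("failed: %s (%s)" % (lfn, reason))
    for lfn in reply["Value"]["Successful"]:
        print("done: %s" % lfn)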
Example #14
class DataOperationSender:
    """
    class:: DataOperationSender
    It reads the MonitoringBackends option to decide whether to send and commit data operation records to Accounting, to Monitoring, or to both.
    """

    # Initialize the object so that the Reporters are created only once
    def __init__(self):
        monitoringType = "DataOperation"
        # Will use the `MonitoringBackends/Default` value as monitoring backend unless a flag for `MonitoringBackends/DataOperation` is set.
        self.monitoringOptions = Operations().getMonitoringBackends(
            monitoringType)
        if "Monitoring" in self.monitoringOptions:
            self.dataOperationReporter = MonitoringReporter(monitoringType)
        if "Accounting" in self.monitoringOptions:
            self.dataOp = DataOperation()

    def sendData(self,
                 baseDict,
                 commitFlag=False,
                 delayedCommit=False,
                 startTime=False,
                 endTime=False):
        """
        Sends the input to Monitoring or Accounting based on the monitoringOptions

        :param dict baseDict: contains a key/value pair
        :param bool commitFlag: decides whether to commit the record or not.
        :param bool delayedCommit: decides whether to commit the record with delay (only for sending to Accounting)
        :param int startTime: epoch time, start time of the plot
        :param int endTime: epoch time, end time of the plot
        """
        def sendMonitoring(self):
            baseDict["ExecutionSite"] = DIRAC.siteName()
            baseDict["Channel"] = baseDict["Source"] + "->" + baseDict[
                "Destination"]
            self.dataOperationReporter.addRecord(baseDict)
            if commitFlag:
                result = self.dataOperationReporter.commit()
                sLog.debug("Committing data operation to monitoring")
                if not result["OK"]:
                    sLog.error("Could not commit data operation to monitoring",
                               result["Message"])
                    return result
                sLog.debug("Done committing to monitoring")
            # Return a structure so the dispatch loop in sendData can check it.
            return S_OK()

        def sendAccounting(self):
            self.dataOp.setValuesFromDict(baseDict)
            if startTime:
                self.dataOp.setStartTime(startTime)
                self.dataOp.setEndTime(endTime)
            else:
                self.dataOp.setStartTime()
                self.dataOp.setEndTime()
            # Adding only to register
            if not commitFlag and not delayedCommit:
                return gDataStoreClient.addRegister(self.dataOp)

            # Adding to register and committing
            if commitFlag and not delayedCommit:
                gDataStoreClient.addRegister(self.dataOp)
                result = gDataStoreClient.commit()
                sLog.debug("Committing data operation to accounting")
                if not result["OK"]:
                    sLog.error("Could not commit data operation to accounting",
                               result["Message"])
                    return result
                sLog.debug("Done committing to accounting")
            # Only late committing
            else:
                result = self.dataOp.delayedCommit()
                if not result["OK"]:
                    sLog.error(
                        "Could not delay-commit data operation to accounting")
                    return result

            # Send data and commit prioritizing the first monitoring option in the list
            for backend in self.monitoringOptions:
                func = locals()[f"send{backend}"]
                res = func()
                if not res["OK"]:
                    return res

        return S_OK()

    # Call this method in order to commit all records added but not yet committed to Accounting and Monitoring
    def concludeSending(self):
        def commitAccounting():
            result = gDataStoreClient.commit()
            sLog.debug(
                "Concluding the sending and committing data operation to accounting"
            )
            if not result["OK"]:
                sLog.error("Could not commit data operation to accounting",
                           result["Message"])
            sLog.debug("Committing to accounting concluded")
            return result

        def commitMonitoring():
            result = self.dataOperationReporter.commit()
            sLog.debug("Committing data operation to monitoring")
            if not result["OK"]:
                sLog.error("Could not commit data operation to monitoring",
                           result["Message"])
            sLog.debug("Committing to monitoring concluded")
            return result

        # Commit data prioritizing first monitoring option in the list
        for backend in self.monitoringOptions:
            func = locals()[f"commit{backend}"]
            res = func()
            if not res["OK"]:
                return res
        return S_OK()
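
A usage sketch for DataOperationSender; the record field names below are illustrative, not a definitive schema, since the accepted fields are defined by the DataOperation accounting type:

# Sketch only: field names are illustrative, not a definitive schema.
sender = DataOperationSender()
record = {
    "OperationType": "putAndRegister",
    "Source": "SOME-SRC-SE",
    "Destination": "SOME-DST-SE",
    "TransferSize": 1024,
    "FinalStatus": "Successful",
}
res = sender.sendData(record, commitFlag=True)
if not res["OK"]:
    print(res["Message"])
# Flush anything registered but not yet committed:
sender.concludeSending()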
Example #15
class ReplicateAndRegister(DMSRequestOperationsBase):
    """
    .. class:: ReplicateAndRegister

    ReplicateAndRegister operation handler
    """
    def __init__(self, operation=None, csPath=None):
        """c'tor

        :param self: self reference
        :param Operation operation: Operation instance
        :param str csPath: CS path for this handler
        """
        super(ReplicateAndRegister, self).__init__(operation, csPath)

        # Clients
        self.fc = FileCatalog()

    def __call__(self):
        """call me maybe"""

        # The flag  'rmsMonitoring' is set by the RequestTask and is False by default.
        # Here we use 'createRMSRecord' to create the ES record which is defined inside OperationHandlerBase.
        if self.rmsMonitoring:
            self.rmsMonitoringReporter = MonitoringReporter(
                monitoringType="RMSMonitoring")
        else:
            # # own gMonitor stuff for files
            gMonitor.registerActivity(
                "ReplicateAndRegisterAtt",
                "Replicate and register attempted",
                "RequestExecutingAgent",
                "Files/min",
                gMonitor.OP_SUM,
            )
            gMonitor.registerActivity("ReplicateOK", "Replications successful",
                                      "RequestExecutingAgent", "Files/min",
                                      gMonitor.OP_SUM)
            gMonitor.registerActivity("ReplicateFail", "Replications failed",
                                      "RequestExecutingAgent", "Files/min",
                                      gMonitor.OP_SUM)
            gMonitor.registerActivity("RegisterOK", "Registrations successful",
                                      "RequestExecutingAgent", "Files/min",
                                      gMonitor.OP_SUM)
            gMonitor.registerActivity("RegisterFail", "Registrations failed",
                                      "RequestExecutingAgent", "Files/min",
                                      gMonitor.OP_SUM)
            # # for FTS
            gMonitor.registerActivity("FTSScheduleAtt",
                                      "Files schedule attempted",
                                      "RequestExecutingAgent", "Files/min",
                                      gMonitor.OP_SUM)
            gMonitor.registerActivity("FTSScheduleOK",
                                      "File schedule successful",
                                      "RequestExecutingAgent", "Files/min",
                                      gMonitor.OP_SUM)
            gMonitor.registerActivity("FTSScheduleFail",
                                      "File schedule failed",
                                      "RequestExecutingAgent", "Files/min",
                                      gMonitor.OP_SUM)

        # # check replicas first
        checkReplicas = self.__checkReplicas()
        if not checkReplicas["OK"]:
            self.log.error("Failed to check replicas",
                           checkReplicas["Message"])
        if getattr(self, "FTSMode", False):
            bannedGroups = getattr(self, "FTSBannedGroups", ())
            if self.request.OwnerGroup in bannedGroups:
                self.log.verbose(
                    "usage of FTS system is banned for request's owner")
                return self.dmTransfer()

            return self.fts3Transfer()

        return self.dmTransfer()

    def __checkReplicas(self):
        """check done replicas and update file states"""
        waitingFiles = {
            opFile.LFN: opFile
            for opFile in self.operation
            if opFile.Status in ("Waiting", "Scheduled")
        }
        targetSESet = set(self.operation.targetSEList)

        replicas = self.fc.getReplicas(list(waitingFiles))
        if not replicas["OK"]:
            self.log.error("Failed to get replicas", replicas["Message"])
            return replicas

        reMissing = re.compile(r".*such file.*")
        for failedLFN, errStr in replicas["Value"]["Failed"].items():
            waitingFiles[failedLFN].Error = errStr
            if reMissing.search(errStr.lower()):
                self.log.error("File does not exists", failedLFN)
                if self.rmsMonitoring:
                    self.rmsMonitoringReporter.addRecord(
                        self.createRMSRecord("Failed", 1))
                else:
                    gMonitor.addMark("ReplicateFail", len(targetSESet))
                waitingFiles[failedLFN].Status = "Failed"

        for successfulLFN, reps in replicas["Value"]["Successful"].items():
            if targetSESet.issubset(set(reps)):
                self.log.info("file replicated to all targets", successfulLFN)
                waitingFiles[successfulLFN].Status = "Done"

        return S_OK()

    def _addMetadataToFiles(self, toSchedule):
        """Add metadata to those files that need to be scheduled through FTS

        toSchedule is a dictionary:
        {'lfn1': [opFile1, validTargets1], 'lfn2': [opFile2, validTargets2]}
        """
        if toSchedule:
            self.log.info(
                "found %s files to schedule, getting metadata from FC" %
                len(toSchedule))
        else:
            self.log.verbose("No files to schedule")
            return S_OK([])

        res = self.fc.getFileMetadata(list(toSchedule))
        if not res["OK"]:
            return res
        else:
            if res["Value"]["Failed"]:
                self.log.warn(
                    "Can't schedule %d files: problems getting the metadata: %s"
                    % (len(res["Value"]["Failed"]), ", ".join(
                        res["Value"]["Failed"])))
            metadata = res["Value"]["Successful"]

        filesToSchedule = {}

        for lfn, lfnMetadata in metadata.items():
            opFileToSchedule = toSchedule[lfn][0]
            opFileToSchedule.GUID = lfnMetadata["GUID"]
            # In principle this is defined already in filterReplicas()
            if not opFileToSchedule.Checksum:
                opFileToSchedule.Checksum = lfnMetadata["Checksum"]
                opFileToSchedule.ChecksumType = lfnMetadata["ChecksumType"]
            opFileToSchedule.Size = lfnMetadata["Size"]

            filesToSchedule[opFileToSchedule.LFN] = opFileToSchedule

        return S_OK(filesToSchedule)

    def _filterReplicas(self, opFile):
        """filter out banned/invalid source SEs"""
        return filterReplicas(opFile,
                              logger=self.log,
                              dataManager=self.dm,
                              opSources=self.operation.sourceSEList)

    def _checkExistingFTS3Operations(self):
        """
        Check if there are ongoing FTS3Operation for the current RMS Operation

        Under some conditions, we can be trying to schedule files while
        there is still an FTS transfer going on. This typically happens
        when the REA hangs. To prevent further race condition, we check
        if there are FTS3Operations in a non Final state matching the
        current operation ID. If so, we put the corresponding files in
        scheduled mode. We will then wait till the FTS3 Operation performs
        the callback

        :returns: S_OK with True if we can go on, False if we should stop the processing
        """

        res = FTS3Client().getOperationsFromRMSOpID(self.operation.OperationID)

        if not res["OK"]:
            self.log.debug("Could not get FTS3Operations matching OperationID",
                           self.operation.OperationID)
            return res

        existingFTSOperations = res["Value"]
        # It is ok to have FTS Operations in a final state, so we
        # care only about the others
        unfinishedFTSOperations = [
            ops for ops in existingFTSOperations
            if ops.status not in FTS3TransferOperation.FINAL_STATES
        ]

        if not unfinishedFTSOperations:
            self.log.debug("No ongoing FTS3Operations, all good")
            return S_OK(True)

        self.log.warn(
            "Some FTS3Operations already exist for the RMS Operation:",
            [op.operationID for op in unfinishedFTSOperations],
        )

        # This would really be a screwed up situation !
        if len(unfinishedFTSOperations) > 1:
            self.log.warn("That's a serious problem !!")

        # We take the rmsFileID of the files in the Operations,
        # find the corresponding File object, and set them scheduled
        rmsFileIDsToSetScheduled = {
            ftsFile.rmsFileID
            for ftsOp in unfinishedFTSOperations
            for ftsFile in ftsOp.ftsFiles
        }

        for opFile in self.operation:
            # If it is in the DB, it has a FileID
            opFileID = opFile.FileID
            if opFileID in rmsFileIDsToSetScheduled:
                self.log.warn("Setting RMSFile as already scheduled", opFileID)
                opFile.Status = "Scheduled"

        # We return here such that the Request is set back to Scheduled in the DB
        # With no further modification
        return S_OK(False)

    def fts3Transfer(self):
        """replicate and register using FTS3"""

        self.log.info("scheduling files in FTS3...")

        # Check first if we do not have ongoing transfers

        res = self._checkExistingFTS3Operations()
        if not res["OK"]:
            return res

        # if res['Value'] is False
        # it means that there are ongoing transfers
        # and we should stop here
        if res["Value"] is False:
            # return S_OK such that the request is put back
            return S_OK()

        fts3Files = []
        toSchedule = {}

        # Dict which maps the FileID to the object
        rmsFilesIds = {}

        if self.rmsMonitoring:
            self.rmsMonitoringReporter.addRecord(
                self.createRMSRecord("Attempted",
                                     len(self.getWaitingFilesList())))

        for opFile in self.getWaitingFilesList():
            rmsFilesIds[opFile.FileID] = opFile

            opFile.Error = ""
            if not self.rmsMonitoring:
                gMonitor.addMark("FTSScheduleAtt")
            # # check replicas
            replicas = self._filterReplicas(opFile)
            if not replicas["OK"]:
                continue
            replicas = replicas["Value"]

            validReplicas = replicas["Valid"]
            noMetaReplicas = replicas["NoMetadata"]
            noReplicas = replicas["NoReplicas"]
            badReplicas = replicas["Bad"]
            noPFN = replicas["NoPFN"]

            if validReplicas:
                validTargets = list(
                    set(self.operation.targetSEList) - set(validReplicas))
                if not validTargets:
                    self.log.info("file %s is already present at all targets" %
                                  opFile.LFN)
                    opFile.Status = "Done"
                else:
                    toSchedule[opFile.LFN] = [opFile, validTargets]

            else:
                if self.rmsMonitoring:
                    self.rmsMonitoringReporter.addRecord(
                        self.createRMSRecord("Failed", 1))
                else:
                    gMonitor.addMark("FTSScheduleFail")
                if noMetaReplicas:
                    self.log.warn(
                        "unable to schedule file",
                        "'%s': couldn't get metadata at %s" %
                        (opFile.LFN, ",".join(noMetaReplicas)),
                    )
                    opFile.Error = "Couldn't get metadata"
                elif noReplicas:
                    self.log.error(
                        "Unable to schedule transfer",
                        "File %s doesn't exist at %s" %
                        (opFile.LFN, ",".join(noReplicas)),
                    )
                    opFile.Error = "No replicas found"
                    opFile.Status = "Failed"
                elif badReplicas:
                    self.log.error(
                        "Unable to schedule transfer",
                        "File %s, all replicas have a bad checksum at %s" %
                        (opFile.LFN, ",".join(badReplicas)),
                    )
                    opFile.Error = "All replicas have a bad checksum"
                    opFile.Status = "Failed"
                elif noPFN:
                    self.log.warn(
                        "unable to schedule %s, could not get a PFN at %s" %
                        (opFile.LFN, ",".join(noPFN)))

        if self.rmsMonitoring:
            self.rmsMonitoringReporter.commit()

        res = self._addMetadataToFiles(toSchedule)
        if not res["OK"]:
            return res
        else:
            filesToSchedule = res["Value"]

            for lfn in filesToSchedule:
                opFile = filesToSchedule[lfn]
                validTargets = toSchedule[lfn][1]
                for targetSE in validTargets:
                    ftsFile = FTS3File.fromRMSFile(opFile, targetSE)
                    fts3Files.append(ftsFile)

        if fts3Files:
            res = Registry.getUsernameForDN(self.request.OwnerDN)
            if not res["OK"]:
                self.log.error(
                    "Cannot get username for DN",
                    "%s %s" % (self.request.OwnerDN, res["Message"]))
                return res

            username = res["Value"]
            fts3Operation = FTS3TransferOperation.fromRMSObjects(
                self.request, self.operation, username)
            fts3Operation.ftsFiles = fts3Files

            try:
                if not fts3Operation.activity:
                    vo = getVOfromProxyGroup().get("Value")
                    fts3Plugin = getFTS3Plugin(vo=vo)
                    fts3Operation.activity = fts3Plugin.inferFTSActivity(
                        fts3Operation, self.request, self.operation)
            except Exception:
                pass

            ftsSchedule = FTS3Client().persistOperation(fts3Operation)
            if not ftsSchedule["OK"]:
                self.log.error("Completely failed to schedule to FTS3:",
                               ftsSchedule["Message"])
                return ftsSchedule

            # might have nothing to schedule
            ftsSchedule = ftsSchedule["Value"]
            self.log.info("Scheduled with FTS3Operation id %s" % ftsSchedule)

            self.log.info("%d files have been scheduled to FTS3" %
                          len(fts3Files))

            if self.rmsMonitoring:
                self.rmsMonitoringReporter.addRecord(
                    self.createRMSRecord("Successful", len(fts3Files)))

            for ftsFile in fts3Files:
                opFile = rmsFilesIds[ftsFile.rmsFileID]
                if not self.rmsMonitoring:
                    gMonitor.addMark("FTSScheduleOK", 1)
                opFile.Status = "Scheduled"
                self.log.debug("%s has been scheduled for FTS" % opFile.LFN)
        else:
            self.log.info("No files to schedule after metadata checks")

        if self.rmsMonitoring:
            self.rmsMonitoringReporter.commit()

        # Just in case some transfers could not be scheduled, try them with RM
        return self.dmTransfer(fromFTS=True)

    def dmTransfer(self, fromFTS=False):
        """replicate and register using dataManager"""
        # # get waiting files. If none just return
        # # source SE
        sourceSE = self.operation.SourceSE if self.operation.SourceSE else None
        if sourceSE:
            # # check source se for read
            bannedSource = self.checkSEsRSS(sourceSE, "ReadAccess")
            if not bannedSource["OK"]:
                if self.rmsMonitoring:
                    for status in ["Attempted", "Failed"]:
                        self.rmsMonitoringReporter.addRecord(
                            self.createRMSRecord(status, len(self.operation)))
                    self.rmsMonitoringReporter.commit()
                else:
                    gMonitor.addMark("ReplicateAndRegisterAtt",
                                     len(self.operation))
                    gMonitor.addMark("ReplicateFail", len(self.operation))
                return bannedSource

            if bannedSource["Value"]:
                self.operation.Error = "SourceSE %s is banned for reading" % sourceSE
                self.log.info(self.operation.Error)
                return S_OK(self.operation.Error)

        # # check targetSEs for write
        bannedTargets = self.checkSEsRSS()
        if not bannedTargets["OK"]:
            if self.rmsMonitoring:
                for status in ["Attempted", "Failed"]:
                    self.rmsMonitoringReporter.addRecord(
                        self.createRMSRecord(status, len(self.operation)))
                self.rmsMonitoringReporter.commit()
            else:
                gMonitor.addMark("ReplicateAndRegisterAtt",
                                 len(self.operation))
                gMonitor.addMark("ReplicateFail", len(self.operation))
            return bannedTargets

        if bannedTargets["Value"]:
            self.operation.Error = "%s targets are banned for writing" % ",".join(
                bannedTargets["Value"])
            return S_OK(self.operation.Error)

        # Can continue now
        self.log.verbose("No targets banned for writing")

        waitingFiles = self.getWaitingFilesList()
        if not waitingFiles:
            return S_OK()
        # # loop over files
        if fromFTS:
            self.log.info(
                "Trying transfer using replica manager as FTS failed")
        else:
            self.log.info("Transferring files using Data manager...")
        errors = defaultdict(int)
        delayExecution = 0

        if self.rmsMonitoring:
            self.rmsMonitoringReporter.addRecord(
                self.createRMSRecord("Attempted", len(waitingFiles)))

        for opFile in waitingFiles:
            if opFile.Error in (
                    "Couldn't get metadata",
                    "File doesn't exist",
                    "No active replica found",
                    "All replicas have a bad checksum",
            ):
                err = "File already in error status"
                errors[err] += 1

            if not self.rmsMonitoring:
                gMonitor.addMark("ReplicateAndRegisterAtt", 1)

            opFile.Error = ""
            lfn = opFile.LFN

            # Check if replica is at the specified source
            replicas = self._filterReplicas(opFile)
            if not replicas["OK"]:
                self.log.error("Failed to check replicas", replicas["Message"])
                continue
            replicas = replicas["Value"]
            validReplicas = replicas.get("Valid")
            noMetaReplicas = replicas.get("NoMetadata")
            noReplicas = replicas.get("NoReplicas")
            badReplicas = replicas.get("Bad")
            noActiveReplicas = replicas.get("NoActiveReplicas")

            if not validReplicas:
                if self.rmsMonitoring:
                    self.rmsMonitoringReporter.addRecord(
                        self.createRMSRecord("Failed", 1))
                else:
                    gMonitor.addMark("ReplicateFail")
                if noMetaReplicas:
                    err = "Couldn't get metadata"
                    errors[err] += 1
                    self.log.verbose(
                        "unable to replicate '%s', couldn't get metadata at %s"
                        % (opFile.LFN, ",".join(noMetaReplicas)))
                    opFile.Error = err
                elif noReplicas:
                    err = "File doesn't exist"
                    errors[err] += 1
                    self.log.verbose(
                        "Unable to replicate", "File %s doesn't exist at %s" %
                        (opFile.LFN, ",".join(noReplicas)))
                    opFile.Error = err
                    opFile.Status = "Failed"
                elif badReplicas:
                    err = "All replicas have a bad checksum"
                    errors[err] += 1
                    self.log.error(
                        "Unable to replicate",
                        "%s, all replicas have a bad checksum at %s" %
                        (opFile.LFN, ",".join(badReplicas)),
                    )
                    opFile.Error = err
                    opFile.Status = "Failed"
                elif noActiveReplicas:
                    err = "No active replica found"
                    errors[err] += 1
                    self.log.verbose(
                        "Unable to schedule transfer", "%s, %s at %s" %
                        (opFile.LFN, err, ",".join(noActiveReplicas)))
                    opFile.Error = err
                    # All source SEs are banned, delay execution by 1 hour
                    delayExecution = 60
                continue
            # # get the first one in the list
            if sourceSE not in validReplicas:
                if sourceSE:
                    err = "File not at specified source"
                    errors[err] += 1
                    self.log.warn(
                        "%s is not at specified sourceSE %s, changed to %s" %
                        (lfn, sourceSE, validReplicas[0]))
                sourceSE = validReplicas[0]

            # # loop over targetSE
            catalogs = self.operation.Catalog
            if catalogs:
                catalogs = [cat.strip() for cat in catalogs.split(",")]

            for targetSE in self.operation.targetSEList:

                # # call DataManager
                if targetSE in validReplicas:
                    self.log.warn(
                        "Request to replicate %s to an existing location: %s" %
                        (lfn, targetSE))
                    continue
                res = self.dm.replicateAndRegister(lfn,
                                                   targetSE,
                                                   sourceSE=sourceSE,
                                                   catalog=catalogs)
                if res["OK"]:

                    if lfn in res["Value"]["Successful"]:

                        if "replicate" in res["Value"]["Successful"][lfn]:

                            repTime = res["Value"]["Successful"][lfn][
                                "replicate"]
                            prString = "file %s replicated at %s in %s s." % (
                                lfn, targetSE, repTime)

                            if not self.rmsMonitoring:
                                gMonitor.addMark("ReplicateOK", 1)

                            if "register" in res["Value"]["Successful"][lfn]:

                                if not self.rmsMonitoring:
                                    gMonitor.addMark("RegisterOK", 1)

                                regTime = res["Value"]["Successful"][lfn][
                                    "register"]
                                prString += " and registered in %s s." % regTime
                                self.log.info(prString)
                            else:

                                if not self.rmsMonitoring:
                                    gMonitor.addMark("RegisterFail", 1)
                                prString += " but failed to register"
                                self.log.warn(prString)

                                opFile.Error = "Failed to register"
                                # # add register replica operation
                                registerOperation = self.getRegisterOperation(
                                    opFile, targetSE, type="RegisterReplica")
                                self.request.insertAfter(
                                    registerOperation, self.operation)

                        else:

                            self.log.error("Failed to replicate",
                                           "%s to %s" % (lfn, targetSE))
                            if not self.rmsMonitoring:
                                gMonitor.addMark("ReplicateFail", 1)
                            opFile.Error = "Failed to replicate"

                    else:

                        if not self.rmsMonitoring:
                            gMonitor.addMark("ReplicateFail", 1)
                        reason = res["Value"]["Failed"][lfn]
                        self.log.error("Failed to replicate and register",
                                       "File %s at %s:" % (lfn, targetSE),
                                       reason)
                        opFile.Error = reason

                else:

                    if not self.rmsMonitoring:
                        gMonitor.addMark("ReplicateFail", 1)
                    opFile.Error = "DataManager error: %s" % res["Message"]
                    self.log.error("DataManager error", res["Message"])

            if not opFile.Error:
                if self.rmsMonitoring:
                    self.rmsMonitoringReporter.addRecord(
                        self.createRMSRecord("Successful", 1))

                if len(self.operation.targetSEList) > 1:
                    self.log.info(
                        "file %s has been replicated to all targetSEs" % lfn)
                opFile.Status = "Done"
            elif self.rmsMonitoring:
                self.rmsMonitoringReporter.addRecord(
                    self.createRMSRecord("Failed", 1))
        # Log error counts
        if delayExecution:
            self.log.info("Delay execution of the request by %d minutes" %
                          delayExecution)
            self.request.delayNextExecution(delayExecution)
        for error, count in errors.items():
            self.log.error(error, "for %d files" % count)

        if self.rmsMonitoring:
            self.rmsMonitoringReporter.commit()

        return S_OK()
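
Both fts3Transfer and dmTransfer branch on the same buckets returned by filterReplicas. A condensed, standalone sketch of that dispatch, merging the buckets handled by the two methods above (the helper name and return shape are ours):

def classifyReplicaFailure(replicas):
    # Returns None when valid replicas exist; otherwise a tuple of
    # (error message, file is terminally Failed), as set in the handler.
    if replicas.get("Valid"):
        return None
    if replicas.get("NoMetadata"):
        return ("Couldn't get metadata", False)
    if replicas.get("NoReplicas"):
        return ("No replicas found", True)
    if replicas.get("Bad"):
        return ("All replicas have a bad checksum", True)
    if replicas.get("NoActiveReplicas"):
        return ("No active replica found", False)
    return ("Unknown replica state", False)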
Example #16
class PhysicalRemoval(DMSRequestOperationsBase):
    """
    .. class:: PhysicalRemoval

    """

    def __init__(self, operation=None, csPath=None):
        """c'tor

        :param self: self reference
        :param ~DIRAC.RequestManagementSystem.Client.Operation.Operation operation: Operation instance
        :param str csPath: cs config path
        """
        DMSRequestOperationsBase.__init__(self, operation, csPath)

    def __call__(self):
        """perform physical removal operation"""

        # The flag  'rmsMonitoring' is set by the RequestTask and is False by default.
        # Here we use 'createRMSRecord' to create the ES record which is defined inside OperationHandlerBase.
        if self.rmsMonitoring:
            self.rmsMonitoringReporter = MonitoringReporter(monitoringType="RMSMonitoring")

        bannedTargets = self.checkSEsRSS(access="RemoveAccess")
        if not bannedTargets["OK"]:
            if self.rmsMonitoring:
                for status in ["Attempted", "Failed"]:
                    self.rmsMonitoringReporter.addRecord(self.createRMSRecord(status, len(self.operation)))
                self.rmsMonitoringReporter.commit()
            return bannedTargets

        if bannedTargets["Value"]:
            return S_OK("%s targets are banned for removal" % ",".join(bannedTargets["Value"]))

        # # get waiting files
        waitingFiles = self.getWaitingFilesList()
        # # prepare lfn dict
        toRemoveDict = {opFile.LFN: opFile for opFile in waitingFiles}

        targetSEs = self.operation.targetSEList

        if self.rmsMonitoring:
            self.rmsMonitoringReporter.addRecord(self.createRMSRecord("Attempted", len(toRemoveDict)))
            self.rmsMonitoringReporter.commit()

        # # keep errors dict
        removalStatus = dict.fromkeys(toRemoveDict.keys(), None)
        for lfn in removalStatus:
            removalStatus[lfn] = dict.fromkeys(targetSEs, "")

        for targetSE in targetSEs:

            self.log.info("removing files from %s" % targetSE)

            # # 1st - bulk removal
            bulkRemoval = self.bulkRemoval(toRemoveDict, targetSE)
            if not bulkRemoval["OK"]:
                self.log.error("Failed bulk removal", bulkRemoval["Message"])
                self.operation.Error = bulkRemoval["Message"]
                return bulkRemoval

            bulkRemoval = bulkRemoval["Value"]

            for lfn, opFile in toRemoveDict.items():
                removalStatus[lfn][targetSE] = bulkRemoval["Failed"].get(lfn, "")
                opFile.Error = removalStatus[lfn][targetSE]

            # # 2nd - single file removal
            toRetry = {lfn: opFile for lfn, opFile in toRemoveDict.items() if lfn in bulkRemoval["Failed"]}
            for lfn, opFile in toRetry.items():
                self.singleRemoval(opFile, targetSE)
                if not opFile.Error:
                    removalStatus[lfn][targetSE] = ""
                else:
                    if self.rmsMonitoring:
                        self.rmsMonitoringReporter.addRecord(self.createRMSRecord("Failed", 1))
                    removalStatus[lfn][targetSE] = opFile.Error

        # # update file status for waiting files
        failed = 0
        for opFile in self.operation:
            if opFile.Status == "Waiting":
                errors = [error for error in removalStatus[opFile.LFN].values() if error.strip()]
                if errors:
                    failed += 1
                    opFile.Error = ",".join(errors)
                    if "Write access not permitted for this credential" in opFile.Error:
                        opFile.Status = "Failed"

                        if self.rmsMonitoring:
                            self.rmsMonitoringReporter.addRecord(self.createRMSRecord("Failed", 1))

                    continue

                if self.rmsMonitoring:
                    self.rmsMonitoringReporter.addRecord(self.createRMSRecord("Successful", 1))
                opFile.Status = "Done"

        if failed:
            self.operation.Error = "failed to remove %s files" % failed

        if self.rmsMonitoring:
            self.rmsMonitoringReporter.commit()

        return S_OK()

    def bulkRemoval(self, toRemoveDict, targetSE):
        """bulk removal of lfns from :targetSE:

        :param dict toRemoveDict: { lfn : opFile, ... }
        :param str targetSE: target SE name
        """

        bulkRemoval = StorageElement(targetSE).removeFile(toRemoveDict)
        return bulkRemoval

    def singleRemoval(self, opFile, targetSE):
        """remove single file from :targetSE:"""
        proxyFile = None
        if "Write access not permitted for this credential" in opFile.Error:
            # # not a DataManager? set status to failed and return
            if "DataManager" not in self.shifter:
                opFile.Status = "Failed"
            elif not opFile.LFN:
                opFile.Error = "LFN not set"
                opFile.Status = "Failed"
            else:
                # #  you're a data manager - save current proxy and get a new one for LFN and retry
                saveProxy = os.environ["X509_USER_PROXY"]
                try:
                    proxyFile = self.getProxyForLFN(opFile.LFN)
                    if not proxyFile["OK"]:
                        opFile.Error = proxyFile["Message"]
                    else:
                        proxyFile = proxyFile["Value"]
                        removeFile = StorageElement(targetSE).removeFile(opFile.LFN)
                        if not removeFile["OK"]:
                            opFile.Error = removeFile["Message"]
                        else:
                            removeFile = removeFile["Value"]
                            if opFile.LFN in removeFile["Failed"]:
                                opFile.Error = removeFile["Failed"][opFile.LFN]
                            else:
                                # # reset error - replica has been removed this time
                                opFile.Error = ""
                finally:
                    if proxyFile:
                        os.unlink(proxyFile)
                    # # put back request owner proxy to env
                    os.environ["X509_USER_PROXY"] = saveProxy
        return S_OK(opFile)
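
The save/replace/restore handling of X509_USER_PROXY in singleRemoval generalizes to a context manager. A sketch, assuming the proxy lives in that environment variable (the helper name is ours, not DIRAC API):

import os
from contextlib import contextmanager

@contextmanager
def temporaryProxy(proxyFile):
    # Swap proxyFile in for the duration of the block, then restore the
    # previous X509_USER_PROXY, as singleRemoval does by hand above.
    saved = os.environ.get("X509_USER_PROXY")
    os.environ["X509_USER_PROXY"] = proxyFile
    try:
        yield proxyFile
    finally:
        if saved is None:
            os.environ.pop("X509_USER_PROXY", None)
        else:
            os.environ["X509_USER_PROXY"] = saved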
Example #17
class StatesAccountingAgent(AgentModule):
    """Agent that every 15 minutes will report
    to the AccountingDB (MySQL) or the Monitoring DB (ElasticSearch), or both,
    a snapshot of the JobDB.
    Also sends a snapshot of PilotAgentsDB to Monitoring.
    """

    # WMSHistory fields
    __summaryKeyFieldsMapping = [
        "Status",
        "Site",
        "User",
        "UserGroup",
        "JobGroup",
        "JobType",
        "ApplicationStatus",
        "MinorStatus",
    ]
    __summaryDefinedFields = [("ApplicationStatus", "unset"),
                              ("MinorStatus", "unset")]
    __summaryValueFieldsMapping = ["Jobs", "Reschedules"]
    __renameFieldsMapping = {"JobType": "JobSplitType"}

    # PilotsHistory fields
    __pilotsMapping = [
        "TaskQueueID", "GridSite", "GridType", "Status", "NumOfPilots"
    ]

    def initialize(self):
        """Standard initialization"""
        # This agent will always loop every 15 minutes
        self.am_setOption("PollingTime", 900)

        # Check whether to send to Monitoring or Accounting or both
        self.jobMonitoringOption = Operations().getMonitoringBackends(
            monitoringType="WMSHistory")
        self.pilotMonitoringOption = Operations().getMonitoringBackends(
            monitoringType="PilotsHistory")
        messageQueue = self.am_getOption("MessageQueue", "dirac.wmshistory")
        self.datastores = {
        }  # For storing the clients to Accounting and Monitoring

        if "Accounting" in self.jobMonitoringOption:
            self.datastores["Accounting"] = DataStoreClient(retryGraceTime=900)
        if "Monitoring" in self.jobMonitoringOption:
            self.datastores["Monitoring"] = MonitoringReporter(
                monitoringType="WMSHistory", failoverQueueName=messageQueue)
        if "Monitoring" in self.pilotMonitoringOption:
            self.pilotReporter = MonitoringReporter(
                monitoringType="PilotsHistory", failoverQueueName=messageQueue)

        self.__jobDBFields = []
        for field in self.__summaryKeyFieldsMapping:
            if field == "User":
                field = "Owner"
            elif field == "UserGroup":
                field = "OwnerGroup"
            self.__jobDBFields.append(field)
        return S_OK()

    def execute(self):
        """Main execution method"""
        # PilotsHistory to Monitoring
        if "Monitoring" in self.pilotMonitoringOption:
            self.log.info("Committing PilotsHistory to Monitoring")
            result = PilotAgentsDB().getSummarySnapshot()
            now = datetime.datetime.utcnow()
            if not result["OK"]:
                self.log.error(
                    "Can't get the PilotAgentsDB summary",
                    "%s: won't commit PilotsHistory at this cycle" %
                    result["Message"],
                )
            else:
                values = result["Value"][1]
                for record in values:
                    rD = {}
                    for iP, name in enumerate(self.__pilotsMapping):
                        rD[name] = record[iP]
                    rD["timestamp"] = int(TimeUtilities.toEpoch(now))
                    self.pilotReporter.addRecord(rD)

                self.log.info("Committing to Monitoring...")
                result = self.pilotReporter.commit()
                if not result["OK"]:
                    self.log.error("Could not commit to Monitoring",
                                   result["Message"])
                self.log.verbose("Done committing PilotsHistory to Monitoring")

        # WMSHistory to Monitoring or Accounting
        self.log.info("Committing WMSHistory to %s backend" %
                      "and ".join(self.jobMonitoringOption))
        result = JobDB().getSummarySnapshot(self.__jobDBFields)
        now = datetime.datetime.utcnow()
        if not result["OK"]:
            self.log.error(
                "Can't get the JobDB summary",
                "%s: won't commit WMSHistory at this cycle" %
                result["Message"])
            return S_ERROR()

        values = result["Value"][1]

        self.log.info("Start sending WMSHistory records")
        for record in values:
            record = record[1:]
            rD = {}
            for name, value in self.__summaryDefinedFields:
                rD[name] = value
            for iP, fieldName in enumerate(self.__summaryKeyFieldsMapping):
                rD[self.__renameFieldsMapping.get(fieldName,
                                                  fieldName)] = record[iP]
            record = record[len(self.__summaryKeyFieldsMapping):]
            for iP, valueName in enumerate(self.__summaryValueFieldsMapping):
                rD[valueName] = int(record[iP])

            for backend in self.datastores:
                if backend.lower() == "monitoring":
                    rD["timestamp"] = int(TimeUtilities.toEpoch(now))
                    self.datastores["Monitoring"].addRecord(rD)

                elif backend.lower() == "accounting":
                    acWMS = WMSHistory()
                    acWMS.setStartTime(now)
                    acWMS.setEndTime(now)
                    acWMS.setValuesFromDict(rD)
                    retVal = acWMS.checkValues()
                    if not retVal["OK"]:
                        self.log.error("Invalid WMSHistory accounting record ",
                                       "%s -> %s" % (retVal["Message"], rD))
                    else:
                        self.datastores["Accounting"].addRegister(acWMS)

        for backend, datastore in self.datastores.items():
            self.log.info("Committing WMSHistory records to %s backend" %
                          backend)
            result = datastore.commit()
            if not result["OK"]:
                self.log.error("Couldn't commit WMSHistory to %s" % backend,
                               result["Message"])
                return S_ERROR()
            self.log.verbose("Done committing WMSHistory to %s backend" %
                             backend)

        return S_OK()
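
The WMSHistory loop in execute() unpacks each snapshot row into key fields followed by value fields, applying the rename map. A standalone worked example with made-up values:

# Made-up row; mirrors the unpacking in execute() above.
keyFields = ["Status", "Site", "User", "UserGroup", "JobGroup", "JobType",
             "ApplicationStatus", "MinorStatus"]
valueFields = ["Jobs", "Reschedules"]
rename = {"JobType": "JobSplitType"}

row = ("Running", "LCG.CERN.ch", "jdoe", "some_group", "none", "User",
       "unset", "unset", "42", "3")
rD = {"ApplicationStatus": "unset", "MinorStatus": "unset"}
for i, name in enumerate(keyFields):
    rD[rename.get(name, name)] = row[i]
for i, name in enumerate(valueFields):
    rD[name] = int(row[len(keyFields) + i])
# rD["JobSplitType"] == "User", rD["Jobs"] == 42, rD["Reschedules"] == 3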
Example #18
class StatesMonitoringAgent(AgentModule):
    """
      The specific agents must provide the following methods:
        - initialize() for initial settings
        - beginExecution()
        - execute() - the main method called in the agent cycle
        - endExecution()
        - finalize() - the graceful exit of the method, this one is usually used
                   for the agent restart
  """

    __summaryKeyFieldsMapping = [
        'Status', 'Site', 'User', 'UserGroup', 'JobGroup', 'JobType',
        'ApplicationStatus', 'MinorStatus'
    ]
    __summaryDefinedFields = [('ApplicationStatus', 'unset'),
                              ('MinorStatus', 'unset')]
    __summaryValueFieldsMapping = ['Jobs', 'Reschedules']
    __renameFieldsMapping = {'JobType': 'JobSplitType'}

    __jobDBFields = []

    jobDB = None
    monitoringReporter = None

    def initialize(self):
        """ Standard constructor
    """

        self.jobDB = JobDB()

        self.am_setOption("PollingTime", 900)
        self.messageQueue = self.am_getOption('MessageQueue',
                                              'dirac.wmshistory')

        self.monitoringReporter = MonitoringReporter(
            monitoringType="WMSHistory", failoverQueueName=self.messageQueue)

        for field in self.__summaryKeyFieldsMapping:
            if field == 'User':
                field = 'Owner'
            elif field == 'UserGroup':
                field = 'OwnerGroup'
            self.__jobDBFields.append(field)

        return S_OK()

    def execute(self):
        """ Main execution method
    """
        result = gConfig.getSections("/DIRAC/Setups")
        if not result['OK']:
            return result
        validSetups = result['Value']
        self.log.info("Valid setups for this cycle are %s" %
                      ", ".join(validSetups))
        # Get the WMS Snapshot!
        result = self.jobDB.getSummarySnapshot(self.__jobDBFields)
        now = Time.dateTime()
        if not result['OK']:
            self.log.error("Can't get the jobdb summary", result['Message'])
        else:
            values = result['Value'][1]
            self.log.info("Start sending records!")
            for record in values:
                recordSetup = record[0]
                if recordSetup not in validSetups:
                    self.log.error("Setup %s is not valid" % recordSetup)
                    continue
                record = record[1:]
                rD = {}
                for fV in self.__summaryDefinedFields:
                    rD[fV[0]] = fV[1]
                for iP in range(len(self.__summaryKeyFieldsMapping)):
                    fieldName = self.__summaryKeyFieldsMapping[iP]
                    rD[self.__renameFieldsMapping.get(fieldName,
                                                      fieldName)] = record[iP]
                record = record[len(self.__summaryKeyFieldsMapping):]
                for iP in range(len(self.__summaryValueFieldsMapping)):
                    rD[self.__summaryValueFieldsMapping[iP]] = int(record[iP])
                rD['timestamp'] = int(Time.toEpoch(now))
                self.monitoringReporter.addRecord(rD)
            retVal = self.monitoringReporter.commit()
            if retVal['OK']:
                self.log.info(
                    "The records are successfully sent to the Store!")
            else:
                self.log.warn(
                    "Failed to insert the records! It will be retried in the next iteration",
                    retVal['Message'])

        return S_OK()
Example #19
class RegisterReplica(DMSRequestOperationsBase):
    """
    .. class:: RegisterReplica

    RegisterReplica operation handler

    :param self: self reference
    :param ~DIRAC.RequestManagementSystem.Client.Operation.Operation operation: Operation instance
    :param str csPath: CS path for this handler
    """

    def __init__(self, operation=None, csPath=None):
        """c'tor"""
        DMSRequestOperationsBase.__init__(self, operation, csPath)

    def __call__(self):
        """call me maybe"""

        # The flag  'rmsMonitoring' is set by the RequestTask and is False by default.
        # Here we use 'createRMSRecord' to create the ES record which is defined inside OperationHandlerBase.
        if self.rmsMonitoring:
            self.rmsMonitoringReporter = MonitoringReporter(monitoringType="RMSMonitoring")
        else:
            # # RegisterReplica specific monitor info
            gMonitor.registerActivity(
                "RegisterReplicaAtt",
                "Attempted replicas registrations",
                "RequestExecutingAgent",
                "Replicas/min",
                gMonitor.OP_SUM,
            )
            gMonitor.registerActivity(
                "RegisterReplicaOK",
                "Successful replicas registrations",
                "RequestExecutingAgent",
                "Replicas/min",
                gMonitor.OP_SUM,
            )
            gMonitor.registerActivity(
                "RegisterReplicaFail",
                "Failed replicas registrations",
                "RequestExecutingAgent",
                "Replicas/min",
                gMonitor.OP_SUM,
            )

        # # counter for failed replicas
        failedReplicas = 0
        # # catalog to use
        catalogs = self.operation.Catalog
        if catalogs:
            catalogs = [cat.strip() for cat in catalogs.split(",")]
        # # get waiting files
        waitingFiles = self.getWaitingFilesList()

        if self.rmsMonitoring:
            self.rmsMonitoringReporter.addRecord(self.createRMSRecord("Attempted", len(waitingFiles)))

        # # loop over files
        registerOperations = {}
        successReplicas = 0
        for opFile in waitingFiles:

            if not self.rmsMonitoring:
                gMonitor.addMark("RegisterReplicaAtt", 1)

            # # get LFN
            lfn = opFile.LFN
            # # and others
            targetSE = self.operation.targetSEList[0]
            replicaTuple = (lfn, opFile.PFN, targetSE)
            # # call ReplicaManager
            registerReplica = self.dm.registerReplica(replicaTuple, catalogs)
            # # check results
            if not registerReplica["OK"] or lfn in registerReplica["Value"]["Failed"]:
                # There have been some errors

                if self.rmsMonitoring:
                    self.rmsMonitoringReporter.addRecord(self.createRMSRecord("Failed", 1))
                else:
                    gMonitor.addMark("RegisterReplicaFail", 1)
                #        self.dataLoggingClient().addFileRecord( lfn, "RegisterReplicaFail", ','.join( catalogs ) if catalogs else "all catalogs", "", "RegisterReplica" )

                reason = registerReplica.get(
                    "Message", registerReplica.get("Value", {}).get("Failed", {}).get(lfn, "Unknown")
                )
                errorStr = "failed to register LFN %s: %s" % (lfn, str(reason))
                # FIXME: this is incompatible with the change made in the DM that we
                # ignore failures if successful in at least one catalog
                if lfn in registerReplica.get("Value", {}).get("Successful", {}) and isinstance(reason, dict):
                    # As we managed, let's create a new operation for just the remaining registration
                    errorStr += " - adding registerReplica operations to request"
                    for failedCatalog in reason:
                        key = "%s/%s" % (targetSE, failedCatalog)
                        newOperation = self.getRegisterOperation(
                            opFile, targetSE, type="RegisterReplica", catalog=failedCatalog
                        )
                        if key not in registerOperations:
                            registerOperations[key] = newOperation
                        else:
                            registerOperations[key].addFile(newOperation[0])
                    opFile.Status = "Done"
                else:
                    opFile.Error = errorStr
                    catMaster = True
                    if isinstance(reason, dict):
                        from DIRAC.Resources.Catalog.FileCatalog import FileCatalog

                        for failedCatalog in reason:
                            catMaster = catMaster and FileCatalog()._getCatalogConfigDetails(failedCatalog).get(
                                "Value", {}
                            ).get("Master", False)
                    # If one targets explicitly a catalog and it fails or if it fails on the master catalog
                    if (catalogs or catMaster) and (
                        "file does not exist" in opFile.Error.lower() or "no such file" in opFile.Error.lower()
                    ):
                        # Check if the file really exists in SE, if not, consider this file registration as Done
                        res = self.dm.getReplicaMetadata(lfn, targetSE)
                        notExist = bool("No such file" in res.get("Value", {}).get("Failed", {}).get(lfn, ""))
                        if not notExist:
                            opFile.Status = "Failed"
                        else:
                            opFile.Status = "Done"
                    if opFile.Status != "Done":
                        failedReplicas += 1
                self.log.warn(errorStr)

            else:
                # All is OK
                if self.rmsMonitoring:
                    self.rmsMonitoringReporter.addRecord(self.createRMSRecord("Successful", 1))
                else:
                    gMonitor.addMark("RegisterReplicaOK", 1)
                successReplicas += 1
                self.log.verbose(
                    "Replica %s has been registered at %s"
                    % (lfn, ",".join(catalogs) if catalogs else "all catalogs")
                )

                opFile.Status = "Done"

        # # if we have new registration operations to add, put them at the end
        if registerOperations:
            self.log.info("adding %d operations to the request" % len(registerOperations))
        for operation in registerOperations.values():
            self.operation._parent.addOperation(operation)

        if self.rmsMonitoring:
            self.rmsMonitoringReporter.commit()

        # # final check
        infoStr = ""
        if successReplicas:
            infoStr = "%d replicas successfully registered" % successReplicas
        if failedReplicas:
            infoStr += ", %d replicas failed to register" % failedReplicas
        self.log.info("All replicas processed", infoStr)
        if failedReplicas:
            self.operation.Error = "some replicas failed to register"
            return S_ERROR(self.operation.Error)

        return S_OK()
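
The nested get() fallback used above to extract a failure reason recurs across these handlers; one way to factor it out (the helper is ours, not DIRAC API):

def failureReason(result, lfn, default="Unknown"):
    # Prefer a top-level Message (the call failed outright); otherwise take
    # the per-LFN entry from the Failed dictionary, as the handlers above do.
    return result.get(
        "Message",
        result.get("Value", {}).get("Failed", {}).get(lfn, default))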
Example #20
class AgentModule:
    """Base class for all agent modules

    This class is used by the AgentReactor Class to steer the execution of
    DIRAC Agents.

    For this purpose the following methods are used:
    - am_initialize()      just after instantiated
    - am_getPollingTime()  to set the execution frequency
    - am_getMaxCycles()    to determine the number of cycles
    - am_go()              for the actual execution of one cycle

    Before each iteration, the following methods are used to determine
    if the new cycle is to be started.
    - am_getModuleParam( 'alive' )
    - am_checkStopAgentFile()
    - am_removeStopAgentFile()

    To start new execution cycle the following methods are used
    - am_getCyclesDone()
    - am_setOption( 'MaxCycles', maxCycles )

    At the same time it provides all Agents with common interface.
    All Agent class must inherit from this base class and must implement
    at least the following method:
    - execute()            main method called in the agent cycle

    Additionally they may provide:
    - initialize()         for initial settings
    - finalize()           the graceful exit

    - beginExecution()     before each execution cycle
    - endExecution()       at the end of each execution cycle

    The agent can be stopped either by a signal or by creating a 'stop_agent' file
    in the controlDirectory defined in the agent configuration

    """
    def __init__(self,
                 agentName,
                 loadName,
                 baseAgentName=False,
                 properties={}):
        """
        Common __init__ method for all Agents.
        All Agent modules must define: __doc__

        They are used to populate __codeProperties

        The following Options are used from the Configuration:
        - /DIRAC/Setup
        - Status
        - Enabled
        - PollingTime            default = 120
        - MaxCycles              default = 500
        - WatchdogTime           default = 0 (disabled)
        - ControlDirectory       control/SystemName/AgentName
        - WorkDirectory          work/SystemName/AgentName
        - shifterProxy           ''
        - shifterProxyLocation   WorkDirectory/SystemName/AgentName/.shifterCred

        It defines the following default Options that can be set via Configuration (above):
        - MonitoringEnabled     True
        - Enabled               True if Status == Active
        - PollingTime           120
        - MaxCycles             500
        - ControlDirectory      control/SystemName/AgentName
        - WorkDirectory         work/SystemName/AgentName
        - shifterProxy          False
        - shifterProxyLocation  work/SystemName/AgentName/.shifterCred

        different defaults can be set in the initialize() method of the Agent using am_setOption()

        In order to get a shifter proxy in the environment during the execute()
        the configuration Option 'shifterProxy' must be set, a default may be given
        in the initialize() method.
        """
        self.log = gLogger.getSubLogger(agentName)

        self.__basePath = rootPath
        self.__agentModule = None
        self.agentName = agentName
        self.__codeProperties = {}
        self.__getCodeInfo()

        self.__moduleProperties = {
            "fullName": agentName,
            "loadName": loadName,
            "section": PathFinder.getAgentSection(agentName),
            "loadSection": PathFinder.getAgentSection(loadName),
            "cyclesDone": 0,
            "totalElapsedTime": 0,
            "setup": gConfig.getValue("/DIRAC/Setup", "Unknown"),
            "alive": True,
        }
        self.__moduleProperties["system"], self.__moduleProperties[
            "agentName"] = agentName.split("/")
        self.__configDefaults = {}
        self.__configDefaults["MonitoringEnabled"] = True
        self.__configDefaults["Enabled"] = self.am_getOption(
            "Status", "Active").lower() in ("active")
        self.__configDefaults["PollingTime"] = self.am_getOption(
            "PollingTime", 120)
        self.__configDefaults["MaxCycles"] = self.am_getOption(
            "MaxCycles", 500)
        self.__configDefaults["WatchdogTime"] = self.am_getOption(
            "WatchdogTime", 0)
        self.__configDefaults["ControlDirectory"] = os.path.join(
            self.__basePath, "control", *agentName.split("/"))
        self.__configDefaults["WorkDirectory"] = os.path.join(
            self.__basePath, "work", *agentName.split("/"))
        self.__configDefaults["shifterProxy"] = ""
        self.__configDefaults["shifterProxyLocation"] = os.path.join(
            self.__configDefaults["WorkDirectory"], ".shifterCred")

        # Merge caller-supplied properties (default is None, not a shared dict)
        if isinstance(properties, dict):
            self.__moduleProperties.update(properties)
        # Always define the default executor list; a single executor runs
        # inline, several run in parallel threads (see __executeModuleCycle)
        self.__moduleProperties["executors"] = [(self.execute, ())]
        self.__moduleProperties["shifterProxy"] = False

        self.__monitorLastStatsUpdate = -1

        self.activityMonitoring = False
        # Check if monitoring is enabled
        if "Monitoring" in Operations().getMonitoringBackends(
                monitoringType="AgentMonitoring"):
            self.activityMonitoring = True
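
    # Construction sketch (names are illustrative): agents are normally
    # instantiated by the AgentReactor with a "System/AgentName" pair, e.g.
    #   MyAgentClass("WorkloadManagement/MyAgent", "WorkloadManagement/MyAgent")
    # The "System/Name" form is required, since __init__ splits it on "/".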

    def __getCodeInfo(self):
        """Populate __codeProperties with the package version, the module
        docstring, the DIRAC version and the platform."""
        try:
            self.__codeProperties["version"] = importlib.metadata.version(
                inspect.getmodule(self).__package__.split(".")[0])
        except Exception:
            self.log.exception(f"Failed to find version for {self!r}")
            self.__codeProperties["version"] = "unset"
        try:
            self.__agentModule = __import__(self.__class__.__module__,
                                            globals(), locals(), "__doc__")
        except Exception as excp:
            self.log.exception("Cannot load agent module", lException=excp)
        try:
            self.__codeProperties["description"] = getattr(
                self.__agentModule, "__doc__")
        except Exception:
            self.log.error("Missing property __doc__")
            self.__codeProperties["description"] = "unset"
        self.__codeProperties["DIRACVersion"] = DIRAC.version
        self.__codeProperties["platform"] = DIRAC.getPlatform()

    def am_initialize(self, *initArgs):
        """Common initialization for all the agents.

        This is executed every time an agent (re)starts.
        This is called by the AgentReactor, should not be overridden.
        """
        agentName = self.am_getModuleParam("fullName")

        self.__initializeMonitor()

        result = self.initialize(*initArgs)
        if not isReturnStructure(result):
            return S_ERROR("initialize must return S_OK/S_ERROR")
        if not result["OK"]:
            return S_ERROR("Error while initializing %s: %s" %
                           (agentName, result["Message"]))
        mkDir(self.am_getControlDirectory())
        workDirectory = self.am_getWorkDirectory()
        mkDir(workDirectory)
        # Set the work directory in an environment variable available to subprocesses if needed
        os.environ["AGENT_WORKDIRECTORY"] = workDirectory

        self.__moduleProperties["shifterProxy"] = self.am_getOption(
            "shifterProxy")
        if len(self.__moduleProperties["executors"]) < 1:
            return S_ERROR("At least one executor method has to be defined")
        if not self.am_Enabled():
            return S_ERROR("Agent is disabled via the configuration")
        self.log.notice("=" * 40)
        self.log.notice("Loaded agent module %s" %
                        self.__moduleProperties["fullName"])
        self.log.notice(" Site: %s" % DIRAC.siteName())
        self.log.notice(" Setup: %s" % gConfig.getValue("/DIRAC/Setup"))
        self.log.notice(" Agent version: %s" %
                        self.__codeProperties["version"])
        self.log.notice(" DIRAC version: %s" % DIRAC.version)
        self.log.notice(" DIRAC platform: %s" % DIRAC.getPlatform())
        pollingTime = int(self.am_getOption("PollingTime"))
        if pollingTime > 3600:
            self.log.notice(" Polling time: %s hours" % (pollingTime / 3600.0))
        else:
            self.log.notice(" Polling time: %s seconds" % pollingTime)
        self.log.notice(" Control dir: %s" % self.am_getControlDirectory())
        self.log.notice(" Work dir: %s" % self.am_getWorkDirectory())
        if self.am_getOption("MaxCycles") > 0:
            self.log.notice(" Cycles: %s" % self.am_getMaxCycles())
        else:
            self.log.notice(" Cycles: unlimited")
        if self.am_getWatchdogTime() > 0:
            self.log.notice(" Watchdog interval: %s" %
                            self.am_getWatchdogTime())
        else:
            self.log.notice(" Watchdog interval: disabled ")
        self.log.notice("=" * 40)
        return S_OK()

    def am_getControlDirectory(self):
        return os.path.join(self.__basePath,
                            str(self.am_getOption("ControlDirectory")))

    def am_getStopAgentFile(self):
        return os.path.join(self.am_getControlDirectory(), "stop_agent")

    def am_checkStopAgentFile(self):
        return os.path.isfile(self.am_getStopAgentFile())

    def am_createStopAgentFile(self):
        try:
            with open(self.am_getStopAgentFile(), "w") as fd:
                fd.write("DIRAC site agent stopped at %s" %
                         str(datetime.datetime.utcnow()))
        except Exception:
            # Best effort: failing to create the stop file is not fatal
            pass

    def am_removeStopAgentFile(self):
        try:
            os.unlink(self.am_getStopAgentFile())
        except Exception:
            # The file may already be gone; ignore
            pass
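
    # Stop-file sketch: besides signals, an operator can request a stop by
    # creating the file "<ControlDirectory>/stop_agent", e.g. from a shell:
    #   touch control/WorkloadManagement/MyAgent/stop_agent   # path is illustrative
    # The running framework is expected to poll am_checkStopAgentFile()
    # between cycles and shut the agent down when the file appears.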

    def am_getBasePath(self):
        return self.__basePath

    def am_getWorkDirectory(self):
        return os.path.join(self.__basePath,
                            str(self.am_getOption("WorkDirectory")))

    def am_getShifterProxyLocation(self):
        return os.path.join(self.__basePath,
                            str(self.am_getOption("shifterProxyLocation")))

    def am_getOption(self, optionName, defaultValue=None):
        """Gets an option from the agent's configuration section.
        The section will be a subsection of the /Systems section in the CS.
        """
        if defaultValue is None:
            if optionName in self.__configDefaults:
                defaultValue = self.__configDefaults[optionName]
        if optionName and optionName[0] == "/":
            return gConfig.getValue(optionName, defaultValue)
        for section in (self.__moduleProperties["section"],
                        self.__moduleProperties["loadSection"]):
            result = gConfig.getOption("%s/%s" % (section, optionName),
                                       defaultValue)
            if result["OK"]:
                return result["Value"]
        return defaultValue
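
    # Usage sketch for am_getOption (option names are illustrative):
    #   self.am_getOption("PollingTime")      # falls back to __configDefaults
    #   self.am_getOption("Threshold", 10)    # explicit default takes precedence
    #   self.am_getOption("/DIRAC/Setup")     # leading "/" reads an absolute CS path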

    def am_setOption(self, optionName, value):
        self.__configDefaults[optionName] = value

    def am_getModuleParam(self, optionName):
        return self.__moduleProperties[optionName]

    def am_setModuleParam(self, optionName, value):
        self.__moduleProperties[optionName] = value

    def am_getPollingTime(self):
        return self.am_getOption("PollingTime")

    def am_getMaxCycles(self):
        return self.am_getOption("MaxCycles")

    def am_getWatchdogTime(self):
        return int(self.am_getOption("WatchdogTime"))

    def am_getCyclesDone(self):
        return self.am_getModuleParam("cyclesDone")

    def am_Enabled(self):
        return self.am_getOption("Enabled")

    def am_stopExecution(self):
        self.am_setModuleParam("alive", False)

    def __initializeMonitor(self):
        """
        Initialize the system monitoring.
        """
        # This flag is used to activate ES based monitoring
        if self.activityMonitoring:
            self.log.debug("Monitoring of the agent is enabled.")
            # The import needs to be here because the CS must be initialized
            # before importing this class (see https://github.com/DIRACGrid/DIRAC/issues/4793)
            from DIRAC.MonitoringSystem.Client.MonitoringReporter import MonitoringReporter

            self.activityMonitoringReporter = MonitoringReporter(
                monitoringType="AgentMonitoring")
            # This periodic task commits the collected data to ES every 100 seconds.
            gThreadScheduler.addPeriodicTask(
                100, self.__activityMonitoringReporting)
            self.__monitorLastStatsUpdate = time.time()

    def am_secureCall(self, functor, args=(), name=False):
        if not name:
            name = str(functor)
        try:
            result = functor(*args)
            if not isReturnStructure(result):
                raise Exception(
                    "%s method for %s module has to return S_OK/S_ERROR" %
                    (name, self.__moduleProperties["fullName"]))
            return result
        except Exception as e:
            self.log.exception("Agent exception while calling method %s" %
                               name,
                               lException=e)
            return S_ERROR("Exception while calling %s method: %s" %
                           (name, str(e)))

    def _setShifterProxy(self):
        if self.__moduleProperties["shifterProxy"]:
            result = setupShifterProxyInEnv(
                self.__moduleProperties["shifterProxy"],
                self.am_getShifterProxyLocation())
            if not result["OK"]:
                self.log.error("Failed to set shifter proxy",
                               result["Message"])
                return result
        return S_OK()
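
    # Sketch: setting the 'shifterProxy' option (e.g. to a shifter role such
    # as "DataManager") in the CS, or via am_setOption() in initialize(),
    # makes every cycle run with that proxy, cached at
    # am_getShifterProxyLocation() and exported to the environment by
    # setupShifterProxyInEnv().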

    def am_go(self):
        # Set the shifter proxy if required
        result = self._setShifterProxy()
        if not result["OK"]:
            return result
        self.log.notice("-" * 40)
        self.log.notice("Starting cycle for module %s" %
                        self.__moduleProperties["fullName"])
        mD = self.am_getMaxCycles()
        if mD > 0:
            cD = self.__moduleProperties["cyclesDone"]
            self.log.notice("Remaining %s of %s cycles" % (mD - cD, mD))
        self.log.notice("-" * 40)
        # Use SIGALRM as a watchdog interrupt if enabled; the default signal
        # action terminates the process if a cycle overruns the watchdog time
        watchdogInt = self.am_getWatchdogTime()
        if watchdogInt > 0:
            signal.signal(signal.SIGALRM, signal.SIG_DFL)
            signal.alarm(watchdogInt)
        elapsedTime = time.time()
        # Pre-set monitoring values so they are defined even when the
        # start-of-cycle snapshot is skipped
        cpuPercentage = 0
        mem = None
        initialWallTime = initialCPUTime = None
        if self.activityMonitoring:
            initialWallTime, initialCPUTime, mem = self._startReportToMonitoring()
        cycleResult = self.__executeModuleCycle()
        if self.activityMonitoring and initialWallTime and initialCPUTime:
            cpuPercentage = self._endReportToMonitoring(
                initialWallTime, initialCPUTime)
        # Increment counters
        self.__moduleProperties["cyclesDone"] += 1
        # Show status
        elapsedTime = time.time() - elapsedTime
        self.__moduleProperties["totalElapsedTime"] += elapsedTime
        self.log.notice("-" * 40)
        self.log.notice("Agent module %s run summary" %
                        self.__moduleProperties["fullName"])
        self.log.notice(" Executed %s times previously" %
                        self.__moduleProperties["cyclesDone"])
        self.log.notice(" Cycle took %.2f seconds" % elapsedTime)
        averageElapsedTime = self.__moduleProperties[
            "totalElapsedTime"] / self.__moduleProperties["cyclesDone"]
        self.log.notice(" Average execution time: %.2f seconds" %
                        (averageElapsedTime))
        elapsedPollingRate = averageElapsedTime * 100 / self.am_getOption(
            "PollingTime")
        self.log.notice(" Polling time: %s seconds" %
                        self.am_getOption("PollingTime"))
        self.log.notice(" Average execution/polling time: %.2f%%" %
                        elapsedPollingRate)
        if cycleResult["OK"]:
            self.log.notice(" Cycle was successful")
            if self.activityMonitoring:
                # Buffer a record with the cycle duration and basic details
                # about the agent; it is committed to the ES backend later by
                # the periodic __activityMonitoringReporting task.
                self.activityMonitoringReporter.addRecord({
                    "AgentName": self.agentName,
                    "timestamp": int(TimeUtilities.toEpoch()),
                    "Host": Network.getFQDN(),
                    "MemoryUsage": mem,
                    "CpuPercentage": cpuPercentage,
                    "CycleDuration": elapsedTime,
                })
        else:
            self.log.warn(" Cycle had an error:", cycleResult["Message"])
        self.log.notice("-" * 40)
        # Cycle finished; cancel the watchdog alarm
        if watchdogInt > 0:
            signal.alarm(0)
        return cycleResult

    def _startReportToMonitoring(self):
        """Snapshot wall-clock time, cumulative CPU time (user + system) and,
        at most once every 10 seconds, the resident memory of the process in MB."""
        now = time.time()
        stats = os.times()
        mem = None
        cpuTime = stats[0] + stats[2]
        # Sample memory at most once every 10 seconds; mem stays None otherwise
        if now - self.__monitorLastStatsUpdate < 10:
            return (now, cpuTime, mem)
        self.__monitorLastStatsUpdate = now
        membytes = MemStat.VmB("VmRSS:")
        if membytes:
            mem = membytes / (1024.0 * 1024.0)
        return (now, cpuTime, mem)

    def _endReportToMonitoring(self, initialWallTime, initialCPUTime):
        """Return the CPU usage over the elapsed wall time as a percentage."""
        wallTime = time.time() - initialWallTime
        stats = os.times()
        cpuTime = stats[0] + stats[2] - initialCPUTime
        percentage = 0
        if wallTime:
            percentage = cpuTime / wallTime * 100.0
        return percentage
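
    # Worked example: if a cycle spans 120 s of wall time while the process
    # accrues 30 s of CPU time (user + system, from os.times()), the method
    # returns 30 / 120 * 100 = 25.0 (percent).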

    def __executeModuleCycle(self):
        # Execute the beginExecution function
        result = self.am_secureCall(self.beginExecution, name="beginExecution")
        if not result["OK"]:
            return result
        # Launch executor functions
        executors = self.__moduleProperties["executors"]
        if len(executors) == 1:
            result = self.am_secureCall(executors[0][0], executors[0][1])
            if not result["OK"]:
                return result
        else:
            exeThreads = [
                threading.Thread(target=executor[0], args=executor[1])
                for executor in executors
            ]
            for thread in exeThreads:
                thread.daemon = True
                thread.start()
            for thread in exeThreads:
                thread.join()
        # Execute the endExecution function
        return self.am_secureCall(self.endExecution, name="endExecution")
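
    # Executor sketch: with the default single executor [(self.execute, ())]
    # the cycle runs inline; with several, e.g. set in initialize() via
    #   self.am_setModuleParam("executors", [(self.pollQueue, ()), (self.pollSites, ())])
    # (hypothetical methods), each runs in its own daemon thread and the
    # cycle waits for all of them to finish.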

    def initialize(self, *args, **kwargs):
        """Agents should override this method for specific initialization.
        Executed at every agent (re)start.
        """
        return S_OK()

    def beginExecution(self):
        """Hook executed at the start of every cycle; may be overridden."""
        return S_OK()

    def endExecution(self):
        """Hook executed at the end of every cycle; may be overridden."""
        return S_OK()

    def finalize(self):
        """Hook executed once when the agent stops; may be overridden."""
        return S_OK()

    def execute(self):
        """The method agents must override with their per-cycle work."""
        return S_ERROR("Execute method has to be overridden by the agent module")

    def __activityMonitoringReporting(self):
        """This method is called by the ThreadScheduler as a periodic task in order to commit the collected data which
        is done by the MonitoringReporter and is send to the 'ComponentMonitoring' type.

        :return: True / False
        """
        return self.activityMonitoringReporter.commit()["OK"]
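

# ---------------------------------------------------------------------------
# Minimal subclass sketch (not part of the original module): the enclosing
# class is assumed to be DIRAC's AgentModule; "ExampleAgent", "Threshold" and
# the log text are hypothetical. It shows the usual override points:
# initialize() for per-agent defaults, execute() for the per-cycle work.
# ---------------------------------------------------------------------------
class ExampleAgent(AgentModule):
    """An illustrative agent doing one unit of work per cycle."""

    def initialize(self):
        # Defaults set here can still be overridden in the agent's CS section
        self.am_setOption("PollingTime", 600)
        return S_OK()

    def execute(self):
        # Called once per cycle via am_go(); must return S_OK/S_ERROR
        threshold = self.am_getOption("Threshold", 10)  # hypothetical option
        self.log.info("Cycle %s, threshold %s" %
                      (self.am_getCyclesDone(), threshold))
        return S_OK()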