Exemplo n.º 1
0
class MCSimulationTestingAgent(AgentModule):
  """An agent to check for MCSimulation productions that have undergone the testing phase.

     Productions that have the status Idle and are also in the table StoredJobDescription have undergone testing.
     Depending on the outcome of the test tasks the production is activated (with a workflow updated with the
     measured CPU time per event) and/or a report is created from the results of the test phase and emailed
     to the Production Manager.
  """

  def __init__(self, *args, **kwargs):
    """ c'tor: only declares the attributes, the clients are instantiated in initialize()
    """
    AgentModule.__init__(self, *args, **kwargs)
    self.transClient = None
    self.bkClient = None
    self.notifyClient = None
    self.operations = None
    # report recipient: read from the "MailTo" option in initialize(), resolved in _sendReport() when empty
    self.email = ''

    # transformations whose test tasks all failed: already reported, so skipped by the following cycles
    self.failedTransIDs = []

  def initialize(self):
    """ Instantiate the clients and read the agent options

        :return: S_OK()
    """
    self.transClient = TransformationClient()
    self.bkClient = BookkeepingClient()
    self.notifyClient = NotificationClient()
    self.operations = Operations()

    self.email = self.am_getOption("MailTo", '')

    return S_OK()

  def execute(self):
    """ One agent cycle.

        Selects the Idle transformations (of the extendable types) that still have a stored job description,
        leaves out those already known to have failed, and for each of the remaining ones:

        - all tasks Done: the transformation is activated;
        - all tasks Failed: a report is mailed and the transformation is remembered as failed;
        - otherwise: the transformation is activated using the Done tasks only (if any) and a warning
          report is mailed.

        :return: S_OK(), or the error returned by the failing selection call to the TransformationClient
    """
    # get all the idle transformations
    extendableTTypes = Operations().getValue('Transformations/ExtendableTransfTypes', ['MCSimulation'])
    res = self.transClient.getTransformations(condDict={"Status": "Idle", "Type": extendableTTypes})
    if not res['OK']:
      self.log.error("Call to Transformation Client service failed", res['Message'])
      return res
    idleTransformations = [d.get("TransformationID") for d in res['Value']]
    self.log.verbose("Found %d Idle MC transformations" % len(idleTransformations))
    self.log.debug("Idle transformations found: %s" % ','.join(str(it) for it in idleTransformations))

    # get all the IDs of transformations undergoing a testing phase
    res = self.transClient.getStoredJobDescriptionIDs()
    if not res['OK']:
      self.log.error("Call to Transformation Client service failed", res['Message'])
      return res
    testingSimulations = [pair[0] for pair in res['Value']]
    self.log.verbose("Found %d MC transformations undergoing a testing phase" % len(testingSimulations))
    self.log.debug("MC transformations found undergoing a testing phase: %s" %
                   ','.join(str(ts) for ts in testingSimulations))

    # IDs that occur in both idle transformations and testing phase, without those that we know failed;
    # sorted only to get a reproducible processing (and logging) order
    idleSimulations = sorted(
        set(testingSimulations).intersection(idleTransformations).difference(self.failedTransIDs))
    self.log.info("MC transformations under considerations: %s (will loop on them)" %
                  ','.join(str(idS) for idS in idleSimulations))

    for transID in idleSimulations:
      self.log.info("Looking into %d" % transID)
      res = self.transClient.getTransformationTasks(condDict={"TransformationID": transID})
      if not res['OK']:
        self.log.error("Call to Transformation Client service failed", res['Message'])
        continue

      tasks = res['Value']
      doneTasks = [task for task in tasks if task.get("ExternalStatus") == "Done"]
      numberOfTasks = len(tasks)
      numberOfDoneTasks = len(doneTasks)
      self.log.verbose(
          "TransID = %d, numberOfTasks = %d, numberOfDoneTasks = %d" %
          (transID, numberOfTasks, numberOfDoneTasks))

      if numberOfTasks == numberOfDoneTasks:
        self.log.info("All tasks have passed so the request can be accepted and the transformation updated")
        res = self._activateTransformation(transID, tasks)
        if not res['OK']:
          self.log.error("Error Activating Production", res['Message'])
        continue

      # at least one task is not Done, hence the task list is not empty here
      self.log.warn("There are failed tasks")
      report = self.__createReport(tasks)
      numberOfFailedTasks = sum(1 for task in tasks if task.get('ExternalStatus') == 'Failed')
      if numberOfFailedTasks == numberOfTasks:
        # all tasks have failed so the request can be rejected and an email report sent
        self._sendReport(report)
        self.log.warn("Transformation " + str(transID) + " failed the testing phase")
        self.failedTransIDs.append(transID)
        continue

      # only some tasks have failed so continue but send a warn email
      self.log.warn("Transformation " + str(transID) + " failed partially the testing phase, continuing anyway")
      if not doneTasks:
        self.log.info("No tasks done for Transformation %d" % transID)
        continue
      res = self._activateTransformation(transID, doneTasks)
      if not res['OK']:
        self.log.error("Error Activating Production", res['Message'])
        continue
      report['subject'] = "MCSimulation Test Failure Report. TransformationID: " + str(transID) + \
          " - some tasks failed"
      self._sendReport(report)

    return S_OK()

  def _activateTransformation(self, transID, tasks):
    """ Calculate parameters, update the workflow, then move the production to Active

        :param transID: ID of the transformation to activate
        :param list tasks: task dictionaries used for calculating the CPU parameters
        :return: S_OK(), or the error of the first step that failed
    """
    parameters = self._calculateParameters(tasks)
    if not parameters['OK']:
      self.log.error("Error calculating parameters", parameters['Message'])
      return parameters
    parameters = parameters['Value']
    self.log.verbose("TransID = %d, Calculated Parameters: %s" % (transID, str(parameters)))

    workflow = self._updateWorkflow(transID, int(round(float(parameters['CPUe']))), parameters['MCCpu'])
    if not workflow['OK']:
      # already logged by _updateWorkflow: the failure must reach the caller, the production is NOT active
      return workflow

    res = self._updateTransformationsTable(transID, workflow['Value'])
    if not res['OK']:
      self.log.error("Error updating transformations table", res['Message'])
      return res

    self.log.info("Transformation " + str(transID) + " passed the testing phase and is now set to active")
    return S_OK()

  def __createReport(self, tasks):
    """ Creates a report from the tasks of a transformation, to email to the production manager

        :param list tasks: non-empty list of task dictionaries, all taken to belong to the transformation
                           of the first one
        :return: dictionary with the keys 'subject' (string) and 'body' (list of lines)
    """
    dateformat = '%d/%m/%Y %H:%M'
    separator = "----------------------------------------------------------------------"
    transformationID = tasks[0]["TransformationID"]
    subject = "MCSimulation Test Failure Report. TransformationID: " + str(transformationID)
    body = [subject, "", "Transformation:", separator]

    res = self.transClient.getTransformations(condDict={"TransformationID": transformationID})
    if not res['OK'] or not res['Value']:
      # a report with the tasks only is better than failing the whole agent cycle
      reason = res['Message'] if not res['OK'] else "transformation not found"
      self.log.error("Could not get the transformation details for the report", reason)
      body.append("Transformation details not available: " + reason)
    else:
      transformation = res['Value'][0]
      body.append("TransformationID: " + str(transformation["TransformationID"]))
      body.append("TransformationName: " + transformation["TransformationName"])
      body.append("LastUpdate: " + transformation["LastUpdate"].strftime(dateformat))
      body.append("Status: " + transformation["Status"])
      body.append("Description: " + transformation["Description"])
      body.append("TransformationFamily: " + str(transformation["TransformationFamily"]))
      body.append("Plugin: " + transformation["Plugin"])
      body.append("Type: " + transformation["Type"])
      body.append("AgentType: " + transformation["AgentType"])
      body.append("GroupSize: " + str(transformation["GroupSize"]))
      body.append("MaxNumberOfTasks: " + str(transformation["MaxNumberOfTasks"]))
      body.append("AuthorDN: " + transformation["AuthorDN"])
      body.append("TransformationGroup: " + transformation["TransformationGroup"])
      body.append("InheritedFrom: " + str(transformation["InheritedFrom"]))
      body.append("CreationDate: " + transformation["CreationDate"].strftime(dateformat))
      body.append("FileMask: " + transformation["FileMask"])
      body.append("EventsPerTask: " + str(transformation["EventsPerTask"]))
      body.append("AuthorGroup: " + transformation["AuthorGroup"])

    body.append("")
    body.append("Number of Tasks: " + str(len(tasks)))
    body.append("Tasks:")
    body.append(separator)
    for task in tasks:
      body.append("TaskID: " + str(task['TaskID']))
      body.append("TargetSE: " + task['TargetSE'])
      body.append("LastUpdateTime: " + task['LastUpdateTime'].strftime(dateformat))
      body.append("RunNumber: " + str(task['RunNumber']))
      body.append("CreationTime: " + task['CreationTime'].strftime(dateformat))
      body.append("ExternalID: " + str(task['ExternalID']))
      body.append("ExternalStatus: " + task['ExternalStatus'])
      body.append("")
    return {'subject': subject, 'body': body}

  def _sendReport(self, report):
    """ Sends a given report to the production manager

        When no "MailTo" address was configured, the 'Email' option of the user found in
        "Shifter/ProductionManager/User" is used (and kept for the next reports).

        :param dict report: dictionary with the keys 'subject' and 'body' (list of lines)
    """
    if not self.email:
      self.email = getUserOption(self.operations.getValue("Shifter/ProductionManager/User"), 'Email')
    body = '\n'.join(report['body'])
    # the same address is passed twice: presumably recipient and sender - confirm against sendMail's signature
    res = self.notifyClient.sendMail(
        self.email,
        report['subject'],
        body,
        self.email,
        localAttempt=False,
        avoidSpam=True)
    if not res['OK']:
      self.log.error("sendMail failed", res['Message'])
    else:
      self.log.info('Mail summary sent to production manager')

  def _calculateParameters(self, tasks):
    """ Calculates the CPU time per event for the production

        :param list tasks: task dictionaries: their 'ExternalID' is used as job ID for the Bookkeeping query
        :return: S_OK({'CPUe': <float>, 'MCCpu': <string>}) or S_ERROR
    """
    jobIds = [int(x['ExternalID']) for x in tasks]
    res = self.bkClient.bulkJobInfo({'jobId': jobIds})
    if not res['OK']:
      self.log.error("Error calling bkClient", res['Message'])
      return S_ERROR(res['Message'])
    successful = res['Value']['Successful']
    self.log.debug("Successful tasks: %s" % str(successful))
    if not successful:
      self.log.error("There are no successful tasks")
      return S_ERROR("There are no successful tasks")

    applications = ('Gauss', 'Boole', 'Moore', 'Brunel', 'DaVinci')
    # the number of events is taken from the first matching application found, and used for all the jobs
    events = 0
    CPUeJobTotal = 0.0
    jobsUsed = 0
    for job in successful.values():
      cpuJob = 0
      for bkJob in job:
        if bkJob['ApplicationName'] in applications:
          if not events:
            events = bkJob['NumberOfEvents']
          timeInSeconds = bkJob['CPUTIME']
          cpuJob += timeInSeconds * bkJob['WNCPUHS06']
      if not events:
        # nothing to divide by (yet): skip the job instead of raising ZeroDivisionError
        self.log.warn("No number of events found, job not used for calculating the CPU time per event")
        continue
      CPUeJob = cpuJob / events
      self.log.debug("CPUeJob = %d" % CPUeJob)

      CPUeJobTotal += CPUeJob
      jobsUsed += 1

    if not jobsUsed:
      self.log.error("Could not determine the number of events of the test jobs")
      return S_ERROR("Could not determine the number of events of the test jobs")

    CPUe = CPUeJobTotal / jobsUsed
    # We want to produce at least 25 events per job...
    MCCpu = str(25 * int(round(float(CPUe))))
    self.log.verbose("CPUe = %d, MCCpu = %s" % (CPUe, MCCpu))
    return S_OK({'CPUe': CPUe, 'MCCpu': MCCpu})

  def _updateWorkflow(self, transID, CPUe, MCCpu):
    """ Updates the workflow of a savedProductionDescription to reflect the calculated CPUe

        :param transID: ID of the transformation whose stored job description is updated
        :param CPUe: CPU time per event, set as the 'CPUe' workflow parameter
        :param MCCpu: value given to the job's setCPUTime()
        :return: S_OK(workflow XML) or the error of the TransformationClient call
    """
    res = self.transClient.getStoredJobDescription(transID)
    if not res['OK']:
      self.log.error("Call to Transformation Client service failed", res['Message'])
      return res

    prod = Production()
    prod.LHCbJob.workflow = fromXMLString(res['Value'][0][1])
    prod.setParameter('CPUe', 'string', str(CPUe), 'CPU time per event')
    prod.LHCbJob.setCPUTime(MCCpu)
    self.log.info("Transformation ", str(transID))
    self.log.info("Calculated CPUTime: ", str(CPUe))
    self.log.info("CpuTime: ", str(MCCpu))

    # maximum number of events to produce
    # try to get the CPU parameters from the configuration if possible
    cpuTimeAvg = Operations().getValue('Transformations/CPUTimeAvg')
    if cpuTimeAvg is None:
      self.log.info('Could not get CPUTimeAvg from config, defaulting to %d' % 200000)
      cpuTimeAvg = 200000

    try:
      CPUNormalizationFactorAvg = getCPUNormalizationFactorAvg()
    except RuntimeError:
      self.log.info('Could not get CPUNormalizationFactorAvg, defaulting to %f' % 1.0)
      CPUNormalizationFactorAvg = 1.0

    max_e = getEventsToProduce(CPUe, cpuTimeAvg, CPUNormalizationFactorAvg)
    prod.setParameter('maxNumberOfEvents', 'string', str(max_e), 'Maximum number of events to produce (Gauss)')
    return S_OK(prod.LHCbJob.workflow.toXML())

  def _updateTransformationsTable(self, transID, workflow):
    """ Puts the modified workflow from the savedProductionDescription table into the transformations table
        and removes it from the savedProductionDescription table.

        If the Body or the Status update fails, both are set back to their previous values and an error
        is returned.

        :param transID: ID of the transformation to update
        :param workflow: new Body of the transformation
        :return: S_OK() or S_ERROR
    """
    transformation = self.transClient.getTransformations(condDict={"TransformationID": transID})
    if not transformation['OK']:
      self.log.error("Call to getTransformations failed", transformation['Message'])
      return transformation

    body = self.transClient.setTransformationParameter(transID, "Body", workflow)
    status = self.transClient.setTransformationParameter(transID, "Status", "Active")
    if body['OK'] and status['OK']:
      res = self.transClient.removeStoredJobDescription(transID)
      if not res['OK']:
        self.log.error("Call to removeStoredJobDescription failed", res['Message'])
        return res
      self.log.info("Transformation %s has an updated body and Status set to active" % transID)
      return S_OK()

    self.log.error("One of the updates has failed so set them both back to the previous value to ensure atomicity")
    previous = transformation['Value'][0]
    self.log.debug(str(previous['Body']))
    for parameter in ("Body", "Status"):
      res = self.transClient.setTransformationParameter(transID, parameter, previous[parameter])
      if not res['OK']:
        self.log.error("Failure calling setTransformationParameter", res['Message'])
        return res

    # the roll-back worked but the update did not: the caller must get an error, not None
    failed = status if body['OK'] else body
    return S_ERROR("Could not update transformation %s: %s" % (transID, failed['Message']))
Exemplo n.º 2
0
class DiracProduction(DiracLHCb):
    """ class for managing productions
  """
    def __init__(self, tsClientIn=None):
        """Set up the transformation client and the default tables used by this class.

        :param tsClientIn: transformation client to use; a new TransformationClient
                           is instantiated when None is given
        """
        super(DiracProduction, self).__init__()

        self.transformationClient = TransformationClient() if tsClientIn is None else tsClientIn

        # width of a column in the printed tables
        self.prodAdj = 22

        # transformation attribute -> title of its column in the printed tables
        self.prodHeaders = {
            'AgentType': 'SubmissionMode',
            'Status': 'Status',
            'CreationDate': 'Created',
            'TransformationName': 'Name',
            'Type': 'Type'
        }

        # command name -> [first value, second value]; presumably the Status and the
        # AgentType to set for the production - confirm against the code using self.commands
        commandTable = (
            ('start', 'Active', 'Manual'),
            ('stop', 'Stopped', 'Manual'),
            ('automatic', 'Active', 'Automatic'),
            ('manual', 'Active', 'Manual'),
            ('mctestmode', 'Testing', 'Automatic'),
            ('completed', 'Completed', 'Manual'),
            ('completing', 'Completing', 'Automatic'),
            ('cleaning', 'Cleaning', 'Manual'),
            ('flush', 'Flush', 'Automatic'),
            ('deleted', 'Deleted', 'Manual'),
            ('cleaned', 'Cleaned', 'Manual'),
            ('archived', 'Archived', 'Manual'),
            ('valinput', 'ValidatingInput', 'Manual'),
            ('valoutput', 'ValidatingOutput', 'Manual'),
            ('remove', 'RemovingFiles', 'Manual'),
            ('validated', 'ValidatedOutput', 'Manual'),
            ('removed', 'RemovedFiles', 'Manual'),
        )
        self.commands = {}
        for command, first, second in commandTable:
            self.commands[command] = [first, second]

    def getProduction(self, productionID, printOutput=False):
        """Return the metadata associated with a given production ID.

        :param productionID: production ID, as int, long or numeric string
        :param printOutput: when True, the main attributes of the production are
                            also printed as a one-row table
        :return: S_OK(metadata of the production), or an error structure when the
                 ID is not valid or the transformation client call fails
        """
        if not isinstance(productionID, (int, long, str)):
            return self._errorReport(
                'Expected string, long or int for production ID')
        try:
            prodID = int(productionID)
        except ValueError:
            # e.g. a non-numeric string: report it like any other invalid ID
            return self._errorReport(
                'Expected string, long or int for production ID')

        result = self.transformationClient.getTransformation(prodID)
        if not result['OK']:
            return result

        if printOutput:
            adj = self.prodAdj
            prodInfo = result['Value']
            # One explicit column order for both the titles and the values: iterating
            # over the self.prodHeaders dictionary gives an arbitrary order, which left
            # the titles misaligned with the values printed below them.
            columns = ('Status', 'Type', 'AgentType', 'CreationDate',
                       'TransformationName')
            header = 'ProductionID'.ljust(adj)
            row = str(productionID).ljust(adj)
            for column in columns:
                header += self.prodHeaders.get(column, column).ljust(adj)
                value = prodInfo[column]
                if column == 'CreationDate':
                    value = toString(value)
                row += value.ljust(adj)
            # the '\n' after the header leaves an empty line between titles and values
            print('\n'.join([header + '\n', row]))
        return S_OK(result['Value'])

    def getProductionLoggingInfo(self, productionID, printOutput=False):
        """Return the logging information of the given production.

        The records come from the transformation client; when printed, each one
        shows its transformation ID, message, message date and the last '/'
        separated field of the author DN.

        :param productionID: production ID, as int, long or string
        :param printOutput: when True, the records are also printed as a table
        :return: the result of getTransformationLogging(), or an error structure
                 when the type of the ID is not valid
        """
        if not isinstance(productionID, (int, long, str)):
            return self._errorReport(
                'Expected string, long or int for production ID')

        result = self.transformationClient.getTransformationLogging(
            int(productionID))
        if not result['OK']:
            self.log.warn(
                'Could not get transformation logging information for productionID %s'
                % productionID)
            return result

        records = result['Value']
        if not records:
            self.log.warn(
                'No logging information found for productionID %s' % productionID)
            return result

        if printOutput:
            # column widths, all derived from the standard column width
            idWidth = int(0.5 * self.prodAdj)
            messageWidth = 3 * self.prodAdj
            dateWidth = self.prodAdj
            authorWidth = 2 * self.prodAdj

            table = ['ProdID'.ljust(idWidth) + 'Message'.ljust(messageWidth) +
                     'DateTime [UTC]'.ljust(dateWidth) + 'AuthorCN'.ljust(authorWidth)]
            for record in records:
                table.append(
                    str(record['TransformationID']).ljust(idWidth) +
                    record['Message'].ljust(messageWidth) +
                    toString(record['MessageDate']).ljust(dateWidth) +
                    record['AuthorDN'].split('/')[-1].ljust(authorWidth))

            print('\nLogging summary for productionID ' + str(productionID) +
                  '\n\n' + '\n'.join(table))

        return result

    def getProductionSummary(self, productionID=None, printOutput=False):
        """Return the summary of the productions known to the transformation client.

        :param productionID: optional production ID (int, long or numeric string);
                             when given, the result is restricted to this production
        :param printOutput: when True, the summary is also printed
        :return: S_OK({production ID: summary}), or an error structure when the ID
                 is not valid, is not found, or the client call fails
        """
        # None is the documented default ("all the productions"): only a value
        # that was actually given has to be validated
        if productionID is not None and not isinstance(productionID, (int, long, str)):
            return self._errorReport(
                'Expected string, long or int for production ID')

        result = self.transformationClient.getTransformationSummary()
        if not result['OK']:
            return result

        if productionID:
            try:
                prodID = long(productionID)
            except ValueError:
                return self._errorReport(
                    'Expected string, long or int for production ID')
            if prodID not in result['Value']:
                self.log.info(
                    'Specified productionID was not found, the list of active productions is:\n%s'
                    % ', '.join(str(pID) for pID in result['Value']))
                return S_ERROR('Production ID %s was not found' % productionID)
            result = S_OK({prodID: result['Value'][prodID]})

        if printOutput:
            self._prettyPrint(result['Value'])

        return result

    def getProductionApplicationSummary(self,
                                        productionID,
                                        status=None,
                                        minorStatus=None,
                                        printOutput=False):
        """Return an application status summary for the jobs of a production.

        The jobs are selected with getProdJobMetadata() and their application
        status is requested from the JobMonitoringClient. The summary is nested as
        summary[Status][MinorStatus][ApplicationStatus] = {'Total': <number of jobs>,
        'JobList': [<job>, ...]}, with the Status capitalized.

        :param productionID: production ID, as int, long or string
        :param status: passed to getProdJobMetadata() for selecting the jobs
        :param minorStatus: passed to getProdJobMetadata() for selecting the jobs
        :param printOutput: when True, the summary table is printed and, if neither
                            status nor minorStatus is given, the production progress
                            as well
        :return: S_OK(summary), or an error structure. If neither status nor
                 minorStatus is given the result also has a 'Totals' key with the
                 'Submitted' and 'Done' counters (plus 'Created' when printing).
        """
        if not isinstance(productionID, (int, long, str)):
            return self._errorReport(
                'Expected string, long or int for production ID')

        statusDict = self.getProdJobMetadata(productionID, status, minorStatus)
        if not statusDict['OK']:
            self.log.warn('Could not get production metadata information')
            return statusDict

        jobIDs = list(statusDict['Value'])
        if not jobIDs:
            return S_ERROR('No JobIDs with matching conditions found')

        self.log.verbose('Considering %s jobs with selected conditions' %
                         (len(jobIDs)))
        # now need to get the application status information
        result = JobMonitoringClient().getJobsApplicationStatus(jobIDs)
        if not result['OK']:
            self.log.warn('Could not get application status for jobs list')
            return result

        appStatus = result['Value']
        # Now format the result.
        summary = {}
        submittedJobs = 0
        doneJobs = 0
        for job, atts in statusDict['Value'].items():
            if 'Status' not in atts:
                # jobs without a status are not counted
                continue
            uniqueStatus = atts['Status'].capitalize()
            minor = atts['MinorStatus']
            application = appStatus[job]['ApplicationStatus']
            entry = summary.setdefault(uniqueStatus, {}).setdefault(minor, {}).setdefault(
                application, {'Total': 0, 'JobList': []})
            entry['Total'] += 1
            entry['JobList'].append(job)
            submittedJobs += 1
            if uniqueStatus == 'Done':
                doneJobs += 1

        if not printOutput:
            result = S_OK()
            if not status and not minorStatus:
                result['Totals'] = {
                    'Submitted': int(submittedJobs),
                    'Done': int(doneJobs)
                }
            result['Value'] = summary
            return result

        # If a printed summary is requested
        statAdj = int(0.5 * self.prodAdj)
        mStatAdj = int(2.0 * self.prodAdj)
        totalAdj = int(0.5 * self.prodAdj)
        exAdj = int(0.5 * self.prodAdj)
        message = '\nJob Summary for ProductionID %s considering status %s' % (
            productionID, status)
        if minorStatus:
            message += ' and MinorStatus = %s' % (minorStatus)

        message += ':\n\n'
        message += 'Status'.ljust(statAdj) + 'MinorStatus'.ljust(mStatAdj) + 'ApplicationStatus'.ljust(mStatAdj) + \
            'Total'.ljust(totalAdj) + 'Example'.ljust(exAdj) + '\n'
        for stat, metadata in summary.items():
            message += '\n'
            for minor, appInfo in metadata.items():
                message += '\n'
                for appStat, jobInfo in appInfo.items():
                    # the first job of the list is shown as an example
                    message += stat.ljust(statAdj) + minor.ljust(mStatAdj) + appStat.ljust(mStatAdj) + \
                        str(jobInfo['Total']).ljust(totalAdj) + str(jobInfo['JobList'][0]).ljust(exAdj) + '\n'

        # the table was built but never shown: print it, as the job summary does
        print(message)
        if status or minorStatus:
            return S_OK(summary)

        result = self.getProductionProgress(productionID)
        if not result['OK']:
            self.log.warn('Could not get production progress information')
            return result

        if 'Created' in result['Value']:
            createdJobs = int(result['Value']['Created']) + submittedJobs
        else:
            createdJobs = submittedJobs

        # no job at all: report 0% instead of dividing by zero
        percSub = int(100 * submittedJobs / createdJobs) if createdJobs else 0
        percDone = int(100 * doneJobs / createdJobs) if createdJobs else 0
        print('\nCurrent status of production %s:\n' % productionID)
        print('Submitted'.ljust(12) + str(percSub).ljust(3) + '%  ( ' + str(submittedJobs).ljust(7) +
              'Submitted / '.ljust(15) + str(createdJobs).ljust(7) + ' Created jobs )')
        print('Done'.ljust(12) + str(percDone).ljust(3) + '%  ( ' + str(doneJobs).ljust(7) +
              'Done / '.ljust(15) + str(createdJobs).ljust(7) + ' Created jobs )')
        result = S_OK()
        result['Totals'] = {
            'Submitted': int(submittedJobs),
            'Created': int(createdJobs),
            'Done': int(doneJobs)
        }
        result['Value'] = summary
        return result

    def getProductionJobSummary(self,
                                productionID,
                                status=None,
                                minorStatus=None,
                                printOutput=False):
        """Return a job status summary for the jobs of a production.

        The jobs are selected with getProdJobMetadata(). The summary is nested as
        summary[Status][MinorStatus] = {'Total': <number of jobs>,
        'JobList': [<job>, ...]}, with the Status capitalized.

        :param productionID: production ID, as int, long or string
        :param status: passed to getProdJobMetadata() for selecting the jobs
        :param minorStatus: passed to getProdJobMetadata() for selecting the jobs
        :param printOutput: when True, the summary table is printed and, if neither
                            status nor minorStatus is given, the production progress
                            as well
        :return: S_OK(summary), or an error structure. If neither status nor
                 minorStatus is given the result also has a 'Totals' key with the
                 'Submitted' and 'Done' counters (plus 'Created' when printing).
        """
        if not isinstance(productionID, (int, long, str)):
            return self._errorReport(
                'Expected string, long or int for production ID')

        statusDict = self.getProdJobMetadata(productionID, status, minorStatus)
        if not statusDict['OK']:
            self.log.warn('Could not get production metadata information')
            return statusDict

        # Now format the result.
        summary = {}
        submittedJobs = 0
        doneJobs = 0
        for job, atts in statusDict['Value'].items():
            if 'Status' not in atts:
                # jobs without a status are not counted
                continue
            uniqueStatus = atts['Status'].capitalize()
            entry = summary.setdefault(uniqueStatus, {}).setdefault(
                atts['MinorStatus'], {'Total': 0, 'JobList': []})
            entry['Total'] += 1
            entry['JobList'].append(job)
            submittedJobs += 1
            if uniqueStatus == 'Done':
                doneJobs += 1

        if not printOutput:
            result = S_OK()
            if not status and not minorStatus:
                result['Totals'] = {
                    'Submitted': int(submittedJobs),
                    'Done': int(doneJobs)
                }
            result['Value'] = summary
            return result

        # If a printed summary is requested
        statAdj = int(0.5 * self.prodAdj)
        mStatAdj = int(2.0 * self.prodAdj)
        totalAdj = int(0.5 * self.prodAdj)
        exAdj = int(0.5 * self.prodAdj)
        message = '\nJob Summary for ProductionID %s considering' % (
            productionID)
        if status:
            message += ' Status = %s' % (status)
        if minorStatus:
            message += ' MinorStatus = %s' % (minorStatus)
        if not status and not minorStatus:
            message += ' all status combinations'

        message += ':\n\n'
        message += 'Status'.ljust(statAdj) + 'MinorStatus'.ljust(mStatAdj) + 'Total'.ljust(totalAdj) + \
            'Example'.ljust(exAdj) + '\n'
        for stat, metadata in summary.items():
            message += '\n'
            for minor, jobInfo in metadata.items():
                # the first job of the list is shown as an example
                message += stat.ljust(statAdj) + minor.ljust(mStatAdj) + str(jobInfo['Total']).ljust(totalAdj) + \
                    str(jobInfo['JobList'][0]).ljust(exAdj) + '\n'

        print(message)
        if status or minorStatus:
            return S_OK(summary)

        result = self.getProductionProgress(productionID)
        if not result['OK']:
            return result

        if 'Created' in result['Value']:
            createdJobs = int(result['Value']['Created']) + submittedJobs
        else:
            createdJobs = submittedJobs

        # no job at all: report 0% instead of dividing by zero
        percSub = int(100 * submittedJobs / createdJobs) if createdJobs else 0
        percDone = int(100 * doneJobs / createdJobs) if createdJobs else 0
        print('\nCurrent status of production %s:\n' % productionID)
        print('Submitted'.ljust(12) + str(percSub).ljust(3) + '%  ( ' + str(submittedJobs).ljust(7) +
              'Submitted / '.ljust(15) + str(createdJobs).ljust(7) + ' Created jobs )')
        print('Done'.ljust(12) + str(percDone).ljust(3) + '%  ( ' + str(doneJobs).ljust(7) +
              'Done / '.ljust(15) + str(createdJobs).ljust(7) + ' Created jobs )')
        result = S_OK()
        result['Totals'] = {
            'Submitted': int(submittedJobs),
            'Created': int(createdJobs),
            'Done': int(doneJobs)
        }
        result['Value'] = summary
        return result

    def getProductionSiteSummary(self,
                                 productionID,
                                 site=None,
                                 printOutput=False):
        """Summarise the WMS jobs of a production per site and per status.

        The job metadata comes from getProdJobMetadata() and is grouped as
        {site: {status: {'Total': <count>, 'JobList': [<job IDs>]}}}. Status
        names are capitalised and jobs without a 'Site' attribute are ignored.

        :param productionID: production ID as int, long or str
        :param site: optional site name, forwarded to getProdJobMetadata()
        :param printOutput: when True the summary table is printed and, if no
                            site was given, the overall progress as well
        :return: result dictionary with the summary in 'Value'. 'Totals' holds
                 the 'Submitted' and 'Done' counters (plus 'Created' when
                 printOutput is set); it is absent when a site is given and
                 printOutput is False.
        """
        if not isinstance(productionID, (int, long, str)):
            return self._errorReport(
                'Expected string, long or int for production ID')

        statusDict = self.getProdJobMetadata(productionID, None, None, site)
        if not statusDict['OK']:
            self.log.warn('Could not get production metadata information')
            return statusDict

        summary = {}
        submittedJobs = 0
        doneJobs = 0
        for job, atts in statusDict['Value'].iteritems():
            if 'Site' not in atts:
                # Jobs without site information are not counted at all.
                continue
            currentStatus = atts['Status'].capitalize()
            statusInfo = summary.setdefault(atts['Site'], {}).setdefault(
                currentStatus, {'Total': 0, 'JobList': []})
            statusInfo['Total'] += 1
            statusInfo['JobList'].append(job)
            submittedJobs += 1
            if currentStatus == 'Done':
                doneJobs += 1

        if not printOutput:
            result = S_OK()
            if not site:
                # The progress result is reused as the returned dictionary;
                # its 'Value' is replaced by the site summary below.
                result = self.getProductionProgress(productionID)
                if not result['OK']:
                    return result
                result['Totals'] = {
                    'Submitted': int(submittedJobs),
                    'Done': int(doneJobs)
                }
            result['Value'] = summary
            return result

        # A printed summary is requested: build the table in one go.
        siteAdj = int(1.0 * self.prodAdj)
        statAdj = int(0.5 * self.prodAdj)
        totalAdj = int(0.5 * self.prodAdj)
        exAdj = int(0.5 * self.prodAdj)
        parts = ['\nSummary for ProductionID %s' % (productionID)]
        parts.append(' at Site %s' % (site) if site else ' at all Sites')
        parts.append(':\n\n')
        parts.append('Site'.ljust(siteAdj) + 'Status'.ljust(statAdj) +
                     'Total'.ljust(totalAdj) + 'Example'.ljust(exAdj) + '\n')
        for siteStr, metadata in summary.iteritems():
            parts.append('\n')
            for stat, jobInfo in metadata.iteritems():
                parts.append(siteStr.ljust(siteAdj) + stat.ljust(statAdj) +
                             str(jobInfo['Total']).ljust(totalAdj) +
                             str(jobInfo['JobList'][0]).ljust(exAdj) + '\n')
        print(''.join(parts))

        result = self.getProductionProgress(productionID)
        if not result['OK']:
            return result

        if 'Created' in result['Value']:
            createdJobs = int(result['Value']['Created']) + submittedJobs
        else:
            createdJobs = submittedJobs

        # A production without any job must not crash the percentage
        # computation with a division by zero.
        if createdJobs:
            percSub = 100 * submittedJobs // createdJobs
            percDone = 100 * doneJobs // createdJobs
        else:
            percSub = percDone = 0

        if not site:
            print('\nCurrent status of production %s:\n' % productionID)
            print('Submitted'.ljust(12) + str(percSub).ljust(3) + '%  ( ' +
                  str(submittedJobs).ljust(7) + 'Submitted / '.ljust(15) +
                  str(createdJobs).ljust(7) + ' Created jobs )')
            print('Done'.ljust(12) + str(percDone).ljust(3) + '%  ( ' +
                  str(doneJobs).ljust(7) + 'Done / '.ljust(15) +
                  str(createdJobs).ljust(7) + ' Created jobs )')
        result = S_OK()
        result['Totals'] = {
            'Submitted': int(submittedJobs),
            'Created': int(createdJobs),
            'Done': int(doneJobs)
        }
        result['Value'] = summary
        return result

    def getProductionProgress(self, productionID=None, printOutput=False):
        """Return the task statistics of one production, or of all known ones.

        :param productionID: production ID as int, long or str. None or a
                             value converting to 0 selects every production
                             returned by _getActiveProductions().
        :param printOutput: when True a ProductionID / Status / Count table is
                            printed to stdout
        :return: the result dictionary of the last
                 getTransformationTaskStats() call (so for a single production
                 'Value' holds its status counters), or the first failing or
                 empty result encountered.
        """
        if productionID is None:
            # The declared default means "no specific production"; it is
            # handled like 0 below instead of being rejected by the type check.
            productionID = 0
        if not isinstance(productionID, (int, long, str)):
            return self._errorReport(
                'Expected string, long or int for production ID')

        productionID = long(productionID)
        if productionID:
            productionIDs = [productionID]
        else:
            result = self._getActiveProductions()
            if not result['OK']:
                return result
            productionIDs = result['Value']

        productionIDs = [str(x) for x in productionIDs]
        self.log.verbose('Will check progress for production(s):\n%s' %
                         (', '.join(productionIDs)))
        progress = {}
        for prod in productionIDs:
            result = self.transformationClient.getTransformationTaskStats(
                int(prod))
            # Check 'OK' first: a failed call may not carry a 'Value' key.
            if not result['OK'] or not result['Value']:
                self.log.error(result)
                return result
            progress[int(prod)] = result['Value']

        if not printOutput:
            return result

        idAdj = int(self.prodAdj)
        statAdj = int(self.prodAdj)
        countAdj = int(self.prodAdj)
        message = 'ProductionID'.ljust(idAdj) + 'Status'.ljust(
            statAdj) + 'Count'.ljust(countAdj) + '\n\n'
        for prod, info in progress.iteritems():
            for status, count in info.iteritems():
                message += str(prod).ljust(idAdj) + status.ljust(
                    statAdj) + str(count).ljust(countAdj) + '\n'
            message += '\n'

        print(message)
        return result

    def _getActiveProductions(self, printOutput=False):
        """Map every known transformation ID to its agent type.

        All transformations returned by the transformation client that carry
        both 'TransformationID' and 'AgentType' are included; those whose
        agent type is 'automatic' (case-insensitive) are additionally reported
        in the verbose log.

        :param printOutput: when True the mapping is pretty-printed
        :return: S_OK({transformationID: agentType}) or the failed client result
        """
        result = self.transformationClient.getTransformations()
        if not result['OK']:
            return result

        currentProductions = {}
        for transformation in result['Value']:
            self.log.debug(transformation)
            hasAgentType = 'AgentType' in transformation
            if not (hasAgentType and 'TransformationID' in transformation):
                continue
            transID = transformation['TransformationID']
            agentType = transformation['AgentType']
            currentProductions[transID] = agentType
            if agentType.lower() == 'automatic':
                self.log.verbose(
                    'Found active production %s eligible to submit jobs' %
                    transID)

        if printOutput:
            self._prettyPrint(currentProductions)

        return S_OK(currentProductions)

    def getProductionCommands(self):
        """Describe the supported production commands.

        :return: S_OK({command: {'Status': ..., 'SubmissionMode': ...}}), built
                 from the first two items of each entry of self.commands
        """
        described = dict(
            (keyword, {'Status': pair[0], 'SubmissionMode': pair[1]})
            for keyword, pair in self.commands.iteritems())
        return S_OK(described)

    def production(self, productionID, command, disableCheck=True):
        """Apply a management command to a production.

        The command (case-insensitive) must be a key of self.commands; its
        entry provides the new transformation 'Status' and 'AgentType', e.g.:

        - start : set production status to Active, job submission possible
        - stop : set production status to Stopped, no job submissions
        - automatic: set production submission mode to Automatic, e.g. submission via Agent
        - manual: set production submission mode to manual, e.g. dirac-production-submit

        :param productionID: production ID as int, long or str
        :param command: command name, matched case-insensitively
        :param disableCheck: when False the user is prompted for confirmation
        :return: S_OK with a short message, or the error result of the failed
                 validation or transformation update
        """
        commands = self.commands

        if not isinstance(productionID, (int, long, str)):
            return self._errorReport(
                'Expected string, long or int for production ID')

        productionID = long(productionID)
        if not isinstance(command, str):
            return self._errorReport('Expected string, for command')

        # Normalise once so that validation and lookup use the same key;
        # previously 'Start' passed the check and then failed the lookup.
        command = command.lower()
        if command not in commands:
            return self._errorReport('Expected one of: %s for command string' %
                                     (', '.join(commands)))

        self.log.verbose(
            'Requested to change production %s with command "%s"' %
            (productionID, command.capitalize()))
        if not disableCheck:
            result = promptUser(
                'Do you wish to change production %s with command "%s"? ' %
                (productionID, command.capitalize()))
            if not result['OK']:
                self.log.info('Action cancelled')
                return S_OK('Action cancelled')
            if result['Value'] != 'y':
                self.log.info('Doing nothing')
                return S_OK('Doing nothing')

        actions = commands[command]
        self.log.info(
            'Setting production status to %s and submission mode to %s for productionID %s'
            % (actions[0], actions[1], productionID))
        result = self.transformationClient.setTransformationParameter(
            productionID, "Status", actions[0])
        if not result['OK']:
            self.log.warn(
                'Problem updating transformation status with result:\n%s' %
                result)
            return result
        self.log.verbose('Setting transformation status to %s successful' %
                         (actions[0]))
        result = self.transformationClient.setTransformationParameter(
            productionID, 'AgentType', actions[1])
        if not result['OK']:
            self.log.warn(
                'Problem updating transformation agent type with result:\n%s' %
                result)
            return result
        self.log.verbose('Setting transformation agent type to %s successful' %
                         (actions[1]))
        return S_OK('Production %s status updated' % productionID)

    def productionFileSummary(self,
                              productionID,
                              selectStatus=None,
                              outputFile=None,
                              orderOutput=True,
                              printSummary=False,
                              printOutput=False):
        """Inspect the input files of a production transformation.

        :param productionID: production ID, converted with int()
        :param selectStatus: optional file status to select and count
        :param outputFile: optional path of a file to write the records to; an
                           existing file is never overwritten
        :param orderOutput: order records by 'TaskID' when True, else by 'LFN'
        :param printSummary: print the number and percentage of files per status
        :param printOutput: print the individual records
        :return: the result of getTransformationFiles(), or S_ERROR when
                 selectStatus is given and no file has that status
        """
        adj = 18
        ordering = 'TaskID' if orderOutput else 'LFN'
        fileSummary = self.transformationClient.getTransformationFiles(
            condDict={'TransformationID': int(productionID)},
            orderAttribute=ordering)
        if not fileSummary['OK']:
            return fileSummary

        toWrite = []
        totalRecords = 0
        summary = {}
        selected = 0
        for lfnDict in fileSummary['Value']:
            totalRecords += 1
            record = ''.join(
                str(n) + ' = ' + str(v).ljust(adj) + ' '
                for n, v in lfnDict.items())
            recordStatus = ''
            if 'Status' in lfnDict:
                recordStatus = lfnDict['Status']
                if selectStatus == recordStatus:
                    selected += 1
                summary[recordStatus] = summary.get(recordStatus, 0) + 1

            # NOTE(review): the status selection only filters records when an
            # output file is requested; without one every record is printed.
            # Kept as found - confirm whether this is intended.
            matches = selectStatus == recordStatus
            if outputFile and selectStatus and not matches:
                continue
            if outputFile:
                toWrite.append(record + '\n')
            if printOutput:
                print(record)

        if printSummary:
            print('\nSummary for %s files in production %s\n' %
                  (totalRecords, productionID))
            print('Status'.ljust(adj) + ' ' + 'Total'.ljust(adj) +
                  'Percentage'.ljust(adj) + '\n')
            for n, v in summary.items():
                percentage = 100 * int(v) // totalRecords
                print(str(n).ljust(adj) + ' ' + str(v).ljust(adj) + ' ' +
                      str(percentage).ljust(2) + ' % ')
            print('\n')

        if selectStatus and not selected:
            return S_ERROR(
                'No files were selected for production %s and status "%s"' %
                (productionID, selectStatus))
        elif selectStatus and selected:
            print('%s / %s files (%s percent) were found for production %s in status "%s"' %
                  (selected, totalRecords,
                   100 * int(selected) // totalRecords, productionID,
                   selectStatus))

        if outputFile:
            if os.path.exists(outputFile):
                print('Requested output file %s already exists, please remove this file to continue' %
                      outputFile)
                return fileSummary

            # The context manager closes the file even if the write fails.
            with open(outputFile, 'w') as outHandle:
                outHandle.write(''.join(toWrite))
            if not selectStatus:
                print('Wrote %s lines to file %s' % (totalRecords, outputFile))
            else:
                print('Wrote %s lines to file %s for status "%s"' %
                      (selected, outputFile, selectStatus))

        return fileSummary

    def checkFilesStatus(self, lfns, productionID='', printOutput=False):
        """Query the transformation client for the status of the given LFN(s).

        A leading 'LFN:' marker is stripped from every LFN before the query.

        :param lfns: a single LFN string or a list of LFN strings
        :param productionID: production ID as int, long or str; it is
                             converted with long() before the query
        :param printOutput: when True a successful result is pretty-printed
        :return: the result of getFileSummary(), or an error report for
                 invalid arguments
        """
        # NOTE(review): the default productionID '' cannot be converted by
        # long() below, so callers presumably always pass an ID - confirm how
        # "all productions" is meant to be requested.
        if not isinstance(productionID, (int, long, str)):
            return self._errorReport(
                'Expected string, long or int for production ID')

        if isinstance(lfns, str):
            lfns = lfns.replace('LFN:', '')
        elif isinstance(lfns, list):
            try:
                lfns = [str(lfnName.replace('LFN:', '')) for lfnName in lfns]
            except Exception as x:
                return self._errorReport(str(x), 'Expected strings for LFN(s)')
        else:
            return self._errorReport(
                'Expected single string or list of strings for LFN(s)')

        fileStatus = self.transformationClient.getFileSummary(
            lfns, long(productionID))
        # Only a successful result is printed; a failed call is returned to
        # the caller untouched instead of being indexed for its 'Value'.
        if printOutput and fileStatus['OK']:
            self._prettyPrint(fileStatus['Value'])
        return fileStatus

    def getWMSProdJobID(self, jobID, printOutput=False):
        """Derive the production and production-job IDs from a WMS job name.

        The job attributes are fetched with self.attributes(); the 'JobName'
        is split on '_' and its first two fields are taken as the production
        ID and the production job ID.

        :param jobID: WMS job ID
        :param printOutput: when True the resulting dictionary is pretty-printed
        :return: S_OK({'WMSJobID', 'JobName', 'ProductionID', 'JobID'}), the
                 failed attributes result, or S_ERROR when the job name is
                 missing or does not contain both fields
        """
        result = self.attributes(jobID)
        if not result['OK']:
            return result
        if 'JobName' not in result['Value']:
            return S_ERROR(
                'Could not establish ProductionID / ProductionJobID, missing JobName'
            )

        wmsJobName = result['Value']['JobName']
        nameParts = wmsJobName.split('_')
        if len(nameParts) < 2:
            # A name without '_' would otherwise raise an IndexError below.
            return S_ERROR(
                'Could not establish ProductionID / ProductionJobID from JobName %s'
                % wmsJobName)

        info = {
            'WMSJobID': jobID,
            'JobName': wmsJobName,
            'ProductionID': nameParts[0],
            'JobID': nameParts[1]
        }
        if printOutput:
            self._prettyPrint(info)
        return S_OK(info)

    def getProdJobInfo(self, productionID, jobID, printOutput=False):
        """Fetch the task record of one production job.

        :param productionID: used as 'TransformationID' selection
        :param jobID: used as 'TaskID' selection
        :param printOutput: when True the task record is pretty-printed
        :return: S_OK(first matching task), the failed client result, or
                 S_ERROR when no task matches
        """
        selection = {'TransformationID': productionID, 'TaskID': jobID}
        res = self.transformationClient.getTransformationTasks(
            condDict=selection, inputVector=True)
        if not res['OK']:
            return res

        tasks = res['Value']
        if not tasks:
            return S_ERROR("Job %s not found for production %s" %
                           (jobID, productionID))

        firstTask = tasks[0]
        if printOutput:
            self._prettyPrint(firstTask)
        return S_OK(firstTask)

    def selectProductionJobs(self,
                             productionID,
                             status=None,
                             minorStatus=None,
                             applicationStatus=None,
                             site=None,
                             owner=None,
                             date=None):
        """Select the WMS jobs of a production through selectJobs().

        The production ID, zero-padded to 8 characters, is passed to
        selectJobs() together with the other selections. When no date is
        given, the fixed early date '2001-01-01' is used.

        :return: whatever selectJobs() returns
        """
        sinceDate = date
        if not sinceDate:
            self.log.verbose(
                'No Date supplied, setting old date for production %s' %
                productionID)
            sinceDate = '2001-01-01'

        paddedID = str(productionID).zfill(8)
        return self.selectJobs(status, minorStatus, applicationStatus, site,
                               owner, paddedID, sinceDate)

    def extendProduction(self, productionID, numberOfJobs, printOutput=False):
        """Extend a production by a number of jobs.

        Usage: extendProduction <ProductionNameOrID> nJobs

        :param productionID: production ID as int, long or str
        :param numberOfJobs: number of jobs to add; a string is converted
                             with int()
        :param printOutput: when True a confirmation line is printed
        :return: the result of extendTransformation() on success, otherwise
                 an error report
        """
        validID = isinstance(productionID, (int, long, str))
        if not validID:
            return self._errorReport(
                'Expected string, long or int for production ID')

        if isinstance(numberOfJobs, str):
            try:
                numberOfJobs = int(numberOfJobs)
            except Exception as x:
                return self._errorReport(
                    str(x),
                    'Expected integer or string for number of jobs to submit')

        result = self.transformationClient.extendTransformation(
            long(productionID), numberOfJobs)
        if result['OK']:
            if printOutput:
                print('Extended production %s by %s jobs' %
                      (productionID, numberOfJobs))
            return result

        return self._errorReport(
            result, 'Could not extend production %s by %s jobs' %
            (productionID, numberOfJobs))

    def getProdJobMetadata(self,
                           productionID,
                           status=None,
                           minorStatus=None,
                           site=None):
        """Return the WMS status information of the jobs of a production.

        The 'CreationDate' transformation parameter is read first; its first
        whitespace-separated field is used as the date for
        selectProductionJobs(). The selected jobs are then passed to
        self.status().

        :return: the result of self.status(), or the first failed result
        """
        paramResult = self.transformationClient.getTransformationParameters(
            long(productionID), ['CreationDate'])
        if not paramResult['OK']:
            self.log.warn(
                'Problem getting production metadata for ID %s:\n%s' %
                (productionID, paramResult))
            return paramResult

        dateFields = toString(paramResult['Value']).split()
        creationDate = dateFields[0]

        selection = self.selectProductionJobs(productionID,
                                              status=status,
                                              minorStatus=minorStatus,
                                              site=site,
                                              date=creationDate)
        if not selection['OK']:
            self.log.warn('Problem selecting production jobs for ID %s:\n%s' %
                          (productionID, selection))
            return selection

        return self.status(selection['Value'])

    def launchProduction(self,
                         prod,
                         publishFlag,
                         testFlag,
                         requestID,
                         extend=0,
                         tracking=0,
                         MCsimflag=False):
        """Launch a production object and return its production ID.

        :param prod: production object providing runLocal() and create()
        :param publishFlag: when exactly False together with testFlag the
                            production is only run locally; otherwise it is
                            forwarded to prod.create() as 'publish'
        :param testFlag: selects the local test run or, for a published
                         production, the 'manual' mode
        :param requestID: forwarded to prod.create()
        :param extend: number of jobs to extend a published production by
        :param tracking: forwarded to prod.create() as 'reqUsed'
        :param MCsimflag: start a published production in 'mctestmode'
        :return: S_OK() after a successful local run, S_OK(productionID) after
                 creation (the ID is 1 when not published), or an error result
        """
        if publishFlag is False and testFlag:
            gLogger.info('Test prod will be launched locally')
            result = prod.runLocal()
            if result['OK']:
                gLogger.info('Template finished successfully')
                return S_OK()
            gLogger.error(
                'Launching production: something wrong with execution!')
            return S_ERROR('Something wrong with execution!')

        result = prod.create(publish=publishFlag,
                             requestID=requestID,
                             reqUsed=tracking)
        if not result['OK']:
            gLogger.error(
                'Error during prod creation:\n%s\ncheck that the wkf name is unique.'
                % (result['Message']))
            return result

        if not publishFlag:
            prodID = 1
            gLogger.notice(
                'Production creation completed but not published (publishFlag was %s). '
                'Setting ID = %s (useless, just for the test)' %
                (publishFlag, prodID))
            return S_OK(prodID)

        prodID = result['Value']
        msg = 'Production %s successfully created ' % (prodID)

        if extend:
            # Only claim the extension in the summary when it actually worked.
            extendResult = self.extendProduction(prodID, extend,
                                                 printOutput=True)
            if extendResult['OK']:
                msg += ', extended by %s jobs' % extend
            else:
                gLogger.error(
                    'Could not extend production %s by %s jobs:\n%s' %
                    (prodID, extend, extendResult))

        if MCsimflag:
            command, started = 'mctestmode', ' and started in mctestmode.'
        elif testFlag:
            command, started = 'manual', ' and started in manual mode.'
        else:
            command, started = 'automatic', ' and started in automatic mode.'

        # The production exists at this point, so a failure to switch its
        # mode is reported but the production ID is still returned.
        startResult = self.production(prodID, command)
        if startResult['OK']:
            msg += started
        else:
            gLogger.error('Could not set production %s to %s:\n%s' %
                          (prodID, command, startResult))
        gLogger.notice(msg)

        return S_OK(prodID)
Exemplo n.º 3
0
class BookkeepingWatchAgent(AgentModule, TransformationAgentsUtilities):
  """ LHCbDIRAC only agent. A threaded agent.
  """

  def __init__(self, *args, **kwargs):
    """ c'tor: initialise both base classes and set the default attribute values
    """
    AgentModule.__init__(self, *args, **kwargs)
    TransformationAgentsUtilities.__init__(self)

    # Clients are created in initialize()
    self.transClient = None
    self.bkClient = None

    # Transformations waiting for a thread, and those queued or being processed
    self.bkQueriesToBeChecked = Queue.Queue()
    self.bkQueriesInCheck = []
    self.transInThread = {}

    # Defaults, possibly overwritten by the agent options in initialize()
    self.fullUpdatePeriod = 86400
    self.bkUpdateLatency = 7200
    self.debug = False
    self.chunkSize = 1000
    self.pickleFile = 'BookkeepingWatchAgent.pkl'
    self.pluginsWithNoRunInfo = [
        'LHCbStandard', 'ReplicateDataset', 'ArchiveDataset',
        'LHCbMCDSTBroadcastRandom', 'ReplicateToLocalSE',
        'RemoveReplicas', 'RemoveReplicasWhenProcessed',
        'RemoveReplicasWithAncestors', 'ReplicateWithAncestors',
        'ReduceReplicas', 'RemoveDatasetFromDisk',
        'DestroyDataset', 'DestroyDatasetWhenProcessed',
        'BySize', 'Standard',
    ]

    # State that is saved to and restored from the pickle file
    self.timeLog, self.fullTimeLog, self.bkQueries = {}, {}, {}

  def initialize(self):
    """ Make the necessary initializations.
        The agent options are read, the clients are created and the state saved in the pickle file is restored.
        The ThreadPool is created here, the _execute() method is what each thread will execute.

        :return: S_OK()
    """

    self.fullUpdatePeriod = self.am_getOption('FullUpdatePeriod', self.fullUpdatePeriod)
    self.bkUpdateLatency = self.am_getOption('BKUpdateLatency', self.bkUpdateLatency)
    self.debug = self.am_getOption('verbose', self.debug)

    self.pickleFile = os.path.join(self.am_getWorkDirectory(), self.pickleFile)
    self.chunkSize = self.am_getOption('maxFilesPerChunk', self.chunkSize)

    self.pluginsWithNoRunInfo = Operations().getValue('TransformationPlugins/PluginsWithNoRunInfo',
                                                      self.pluginsWithNoRunInfo)

    self._logInfo('Full Update Period: %d seconds' % self.fullUpdatePeriod)
    self._logInfo('BK update latency : %d seconds' % self.bkUpdateLatency)
    self._logInfo('Plugins with no run info: %s' % ', '.join(self.pluginsWithNoRunInfo))

    self.transClient = TransformationClient()
    self.bkClient = BookkeepingClient()

    # A missing, truncated or corrupt cache file must not prevent the agent from starting:
    # unpickling damaged data can raise more than EOFError, so the documented
    # unpickling failures are handled as well and the logs simply start empty.
    loadErrors = (EOFError, IOError, pickle.UnpicklingError, AttributeError,
                  ImportError, IndexError, KeyError, ValueError)
    try:
      with open(self.pickleFile, 'r') as pf:
        timeLog = pickle.load(pf)
        fullTimeLog = pickle.load(pf)
        bkQueries = pickle.load(pf)
    except loadErrors:
      self._logInfo("failed loading Log from", self.pickleFile, "initialize")
      self.timeLog = {}
      self.fullTimeLog = {}
      self.bkQueries = {}
    else:
      # Only replace the state when all three objects were read
      self.timeLog = timeLog
      self.fullTimeLog = fullTimeLog
      self.bkQueries = bkQueries
      self._logInfo("successfully loaded Log from", self.pickleFile, "initialize")

    maxNumberOfThreads = self.am_getOption('maxThreadsInPool', 1)
    threadPool = ThreadPool(maxNumberOfThreads, maxNumberOfThreads)

    for i in xrange(maxNumberOfThreads):
      threadPool.generateJobAndQueueIt(self._execute, [i])

    gMonitor.registerActivity("Iteration", "Agent Loops", AGENT_NAME, "Loops/min", gMonitor.OP_SUM)
    return S_OK()

  @gSynchro
  def __dumpLog(self):
    """ Save timeLog, fullTimeLog and bkQueries, in this order, into the pickle file.
        Nothing is done when no pickle file is set; failures are only logged.
    """
    if not self.pickleFile:
      return

    try:
      with open(self.pickleFile, 'w') as pf:
        for attrName in ('timeLog', 'fullTimeLog', 'bkQueries'):
          pickle.dump(getattr(self, attrName), pf)
      self._logVerbose("successfully dumped Log into %s" % self.pickleFile)
    except IOError as e:
      self._logError("fail to open %s: %s" % (self.pickleFile, e))
    except pickle.PickleError as e:
      self._logError("fail to dump %s: %s" % (self.pickleFile, e))
    except ValueError as e:
      self._logError("fail to close %s: %s" % (self.pickleFile, e))

  ################################################################################

  def execute(self):
    """ Main execution method: queue the Active/Idle transformations that have a BK query.
        Transformations already listed in bkQueriesInCheck are not queued again.
        Always returns S_OK(), failures of the transformation client are only logged.
    """

    gMonitor.addMark('Iteration', 1)

    # All the transformations that may need new files
    result = self.transClient.getTransformations(condDict={'Status': ['Active', 'Idle']})
    if not result['OK']:
      self._logError("Failed to get transformations.", result['Message'])
      return S_OK()
    allTransIDs = [long(transDict['TransformationID']) for transDict in result['Value']]

    # Keep only those having a BK query
    res = self.transClient.getTransformationsWithBkQueries(allTransIDs)
    if not res['OK']:
      self._logError("Failed to get transformations with Bk Queries.", res['Message'])
      return S_OK()

    queued = 0
    for transID in res['Value']:
      if transID not in self.bkQueriesInCheck:
        self.bkQueriesInCheck.append(transID)
        self.bkQueriesToBeChecked.put(transID)
        queued += 1

    self._logInfo("Out of %d transformations, %d put in thread queue" % (len(result['Value']), queued))

    self.__dumpLog()
    return S_OK()

  def _execute(self, threadID):
    """ Real executor. This is what is executed by the single threads - so do not return here! Just continue
    """

    while True:  # not self.bkQueriesToBeChecked.empty():

      transID = None

      try:

        transID = self.bkQueriesToBeChecked.get()
        self.transInThread[transID] = ' [Thread%d] [%s] ' % (threadID, str(transID))

        startTime = time.time()
        self._logInfo("Processing transformation %s." % transID, transID=transID)

        res = self.transClient.getTransformation(transID, extraParams=False)
        if not res['OK']:
          self._logError("Failed to get transformation", res['Message'], transID=transID)
          continue
        transPlugin = res['Value']['Plugin']

        res = self.transClient.getBookkeepingQuery(transID)
        if not res['OK']:
          self._logError("Failed to get BkQuery", res['Message'], transID=transID)
          continue
        bkQuery = res['Value']

        # Determine the correct time stamp to use for this transformation
        now = datetime.datetime.utcnow()
        self.__timeStampForTransformation(transID, bkQuery, now)

        try:
          files = self.__getFiles(transID, bkQuery, now)
        except RuntimeError as e:
          # In case we failed a full query, we should retry full query until successful
          if 'StartDate' not in bkQuery:
            self.bkQueries.pop(transID, None)
          self._logError("Failed to get response from the Bookkeeping: %s" % e, "", "__getFiles", transID)
          continue

        runDict = {}
        filesMetadata = {}
        # get the files metadata
        for lfnChunk in breakListIntoChunks(files, self.chunkSize):
          start = time.time()
          res = self.bkClient.getFileMetadata(lfnChunk)
          self._logVerbose("Got metadata from BK for %d files" % len(lfnChunk), transID=transID, reftime=start)
          if not res['OK']:
            self._logError("Failed to get BK metadata for %d files" % len(lfnChunk),
                           res['Message'], transID=transID)
            # No need to return as we only consider files that are successful...
          else:
            filesMetadata.update(res['Value']['Successful'])

        # There is no need to add the run information for a transformation that doesn't need it
        if transPlugin not in self.pluginsWithNoRunInfo:
          for lfn, metadata in filesMetadata.iteritems():
            runID = metadata.get('RunNumber', None)
            if isinstance(runID, (basestring, int, long)):
              runDict.setdefault(int(runID), []).append(lfn)
          try:
            self.__addRunsMetadata(transID, runDict.keys())
          except RuntimeError as e:
            self._logException("Failure adding runs metadata",
                               method="__addRunsMetadata",
                               lException=e,
                               transID=transID)
        else:
          runDict[None] = filesMetadata.keys()

        # Add all new files to the transformation
        for runID in sorted(runDict):
          lfnList = runDict[runID]
          # We enter all files of a run at once, otherwise do it by chunks
          lfnChunks = [lfnList] if runID else breakListIntoChunks(lfnList, self.chunkSize)
          for lfnChunk in lfnChunks:
            # Add the files to the transformation
            self._logVerbose('Adding %d lfns for transformation' % len(lfnChunk), transID=transID)
            result = self.transClient.addFilesToTransformation(transID, lfnChunk)
            if not result['OK']:
              self._logError("Failed to add %d lfns to transformation" % len(lfnChunk), result['Message'],
                             transID=transID)
              return result
            else:
              # Handle errors
              errors = {}
              for lfn, error in result['Value']['Failed'].iteritems():
                errors.setdefault(error, []).append(lfn)
              for error, lfns in errors.iteritems():
                self._logWarn("Failed to add files to transformation", error, transID=transID)
                self._logVerbose("\n\t".join([''] + lfns))
              # Add the metadata and RunNumber to the newly inserted files
              addedLfns = [lfn for (lfn, status) in result['Value']['Successful'].iteritems() if status == 'Added']
              if addedLfns:
                # Add files metadata: size and file type
                lfnDict = dict((lfn, {'Size': filesMetadata[lfn]['FileSize'],
                                      'FileType': filesMetadata[lfn]['FileType']})
                               for lfn in addedLfns)
                res = self.transClient.setParameterToTransformationFiles(transID, lfnDict)
                if not res['OK']:
                  self._logError("Failed to set transformation files metadata", res['Message'])
                  return res
                # Add run information if it exists
                if runID:
                  self._logInfo("Added %d files to transformation for run %d, now including run information"
                                % (len(addedLfns), runID), transID=transID)
                  self._logVerbose("Associating %d files to run %d" % (len(addedLfns), runID), transID=transID)
                  res = self.transClient.addTransformationRunFiles(transID, runID, addedLfns)
                  if not res['OK']:
                    self._logError("Failed to associate %d files to run %d" % (len(addedLfns), runID),
                                   res['Message'], transID=transID)
                    return res
                else:
                  self._logInfo("Added %d files to transformation" % len(addedLfns), transID=transID)

      except Exception as x:  # pylint: disable=broad-except
        self._logException('Exception while adding files to transformation',
                           lException=x,
                           method='_execute',
                           transID=transID)
      finally:
        self._logInfo("Processed transformation", transID=transID, reftime=startTime)
        if transID in self.bkQueriesInCheck:
          self.bkQueriesInCheck.remove(transID)
        self.transInThread.pop(transID, None)

    return S_OK()

  @gSynchro
  def __timeStampForTransformation(self, transID, bkQuery, now):
    """ Decide whether the BK query of a transformation can be restricted in time.

    ``bkQuery`` is stored (as a copy, taken before any modification) as the latest
    query of ``transID``. A 'StartDate' is then added to ``bkQuery`` in place when
    all of the following hold:

    * a time stamp is already logged for the transformation in ``self.timeLog``;
    * the previously stored query, once its 'StartDate' is dropped, equals ``bkQuery``;
    * the time logged in ``self.fullTimeLog`` is less than ``self.fullUpdatePeriod``
      seconds before ``now``.

    The 'StartDate' is the logged time stamp minus ``self.bkUpdateLatency`` seconds.
    If ``bkQuery`` ends up without a 'StartDate', ``now`` is logged in ``self.fullTimeLog``.

    :param transID: transformation ID
    :param dict bkQuery: BK query, possibly modified in place
    :param now: current time; a timedelta is compared with its difference to the logged time
    """
    lastFullQuery = self.fullTimeLog.setdefault(transID, now)
    previousQuery = self.bkQueries.setdefault(transID, {})

    # The start date is not part of the comparison between the old and the new query
    previousQuery.pop('StartDate', None)
    self.bkQueries[transID] = bkQuery.copy()

    # Same query as before and a recent enough full query: only ask for what is new
    canRestrictQuery = (transID in self.timeLog and
                        previousQuery == bkQuery and
                        (now - lastFullQuery) < datetime.timedelta(seconds=self.fullUpdatePeriod))
    if canRestrictQuery:
      lastQueryTime = self.timeLog[transID]
      latency = datetime.timedelta(seconds=self.bkUpdateLatency)
      bkQuery['StartDate'] = (lastQueryTime - latency).strftime('%Y-%m-%d %H:%M:%S')

    # No start date means that the query is not restricted in time: remember when it was made
    if 'StartDate' not in bkQuery:
      self.fullTimeLog[transID] = now

  def __getFiles(self, transID, bkQuery, now):
    """ Run the Bookkeeping query of a transformation and return its outcome.

    When the query succeeds, ``now`` is recorded as the time stamp of the transformation.

    :param transID: transformation ID, used for logging and for the time stamp
    :param bkQuery: query handed over to the Bookkeeping client
    :param now: time recorded as time stamp of the transformation on success
    :return: the 'Value' of the Bookkeeping reply
    :raises RuntimeError: if the Bookkeeping query fails, with the reported message
    """
    self._logInfo("Using BK query for transformation: %s" % str(bkQuery), transID=transID)
    queryStart = time.time()
    result = self.bkClient.getFiles(bkQuery)
    self._logVerbose("BK query time: %.2f seconds." % (time.time() - queryStart), transID=transID)

    if not result['OK']:
      raise RuntimeError(result['Message'])

    self.__updateTimeStamp(transID, now)
    files = result['Value']
    if files:
      self._logInfo("Obtained %d files from BK" % len(files), transID=transID)
    return files

  @gSynchro
  def __updateTimeStamp(self, transID, now):
    """ Record ``now`` as the time stamp of a transformation in ``self.timeLog``.

    :param transID: transformation ID, key of the entry in ``self.timeLog``
    :param now: value stored as the time stamp
    """
    self.timeLog[transID] = now

  def __addRunsMetadata(self, transID, runsList):
    """ Add the metadata of the runs that are not yet known to the transformation client cache.

    Two passes are made, one for TCK, CondDb and DDDB, one for the run duration.
    In each pass, only the runs of ``runsList`` that are not returned by
    ``getRunsInCache`` are looked up in the Bookkeeping and added.

    :param transID: transformation ID, only used for logging
    :param runsList: iterable of run numbers
    :raises RuntimeError: if any call to the transformation or Bookkeeping client fails
    """
    # Conditions metadata: TCK, CondDb, DDDB
    runsInCache = self.transClient.getRunsInCache({'Name': ['TCK', 'CondDb', 'DDDB']})
    if not runsInCache['OK']:
      raise RuntimeError(runsInCache['Message'])
    newRuns = list(set(runsList) - set(runsInCache['Value']))
    if newRuns:
      self._logVerbose("Associating run metadata to %d runs" % len(newRuns), transID=transID)
      res = self.bkClient.getRunInformation({'RunNumber': newRuns, 'Fields': ['TCK', 'CondDb', 'DDDB']})
      if not res['OK']:
        raise RuntimeError(res['Message'])
      for run, runMeta in res['Value'].iteritems():
        added = self.transClient.addRunsMetadata(run, runMeta)
        if not added['OK']:
          raise RuntimeError(added['Message'])

    # Add run duration to the metadata
    runsInCache = self.transClient.getRunsInCache({'Name': ['Duration']})
    if not runsInCache['OK']:
      raise RuntimeError(runsInCache['Message'])
    newRuns = list(set(runsList) - set(runsInCache['Value']))
    if newRuns:
      self._logVerbose("Associating run duration to %d runs" % len(newRuns), transID=transID)
      res = self.bkClient.getRunInformation({'RunNumber': newRuns, 'Fields': ['JobStart', 'JobEnd']})
      if not res['OK']:
        raise RuntimeError(res['Message'])
      for run, runMeta in res['Value'].iteritems():
        elapsed = runMeta['JobEnd'] - runMeta['JobStart']
        # timedelta.seconds is only the remainder below one day: the days must be added
        # as well, otherwise runs of 24 hours or more get a truncated duration
        duration = elapsed.days * 86400 + elapsed.seconds
        added = self.transClient.addRunsMetadata(run, {'Duration': duration})
        if not added['OK']:
          raise RuntimeError(added['Message'])

  def finalize(self):
    """ Graceful finalization: wait for the transformations being processed.

    Nothing is done when ``self.bkQueriesInCheck`` is empty. Otherwise it is emptied
    and the method polls every 2 seconds until ``self.transInThread`` is empty.

    :return: S_OK()
    """
    if not self.bkQueriesInCheck:
      return S_OK()

    self._logInfo("Wait for queue to get empty before terminating the agent (%d tasks)" % len(self.transInThread))
    self.bkQueriesInCheck = []
    while self.transInThread:
      time.sleep(2)
    self.log.info("Threads are empty, terminating the agent...")
    return S_OK()
Exemplo n.º 4
0
class DataRecoveryAgent(AgentModule):
    """ Agent recovering input files of data processing transformations.

    For each eligible transformation, the files in the selected statuses are
    grouped by WMS job. The files of jobs that are in one of the requested WMS
    statuses, have no pending request and have no descendants are set to the
    update status ('Unused' by default), except files in an unrecoverable status.
    """

    def __init__(self, *args, **kwargs):
        """ c'tor
        """
        AgentModule.__init__(self, *args, **kwargs)

        # Clients, created in initialize()
        self.transClient = None
        self.reqClient = None
        self.consChecks = None

        self.enableFlag = True
        self.transformationTypes = []
        # Replaced by one sub-logger per transformation in execute()
        self.transLogger = self.log

    #############################################################################

    def initialize(self):
        """ Set the defaults: shifter proxy, clients and transformation types.

        The transformation types handled are those of 'Transformations/DataProcessing'
        that are not in 'Transformations/ExtendableTransfTypes'.

        :return: S_OK()
        """
        self.am_setOption('shifterProxy', 'ProductionManager')

        self.transClient = TransformationClient()
        self.reqClient = ReqClient()
        self.consChecks = ConsistencyChecks(interactive=False,
                                            transClient=self.transClient)

        transformationTypes = Operations().getValue(
            'Transformations/DataProcessing', [])
        extendableTTypes = Operations().getValue(
            'Transformations/ExtendableTransfTypes', ['MCSimulation'])
        self.transformationTypes = list(
            set(transformationTypes) - set(extendableTTypes))

        return S_OK()

    #############################################################################
    def execute(self):
        """ The main execution method.

        Failures concerning a single transformation are logged and the
        transformation is skipped for this cycle.

        :return: S_OK(), or the failed result obtained when the eligible
                 transformations cannot be retrieved
        """
        # Configuration settings
        self.enableFlag = self.am_getOption('EnableFlag', True)
        self.log.verbose('Enable flag is %s' % self.enableFlag)
        if not self.transformationTypes:
            self.log.warn("No transformation types to look for... aborting")
            return S_OK()

        transformationStatus = self.am_getOption('TransformationStatus',
                                                 ['Active', 'Completing'])
        fileSelectionStatus = self.am_getOption('FileSelectionStatus',
                                                ['Assigned', 'MaxReset'])
        unrecoverableStatus = self.am_getOption('UnrecoverableStatus',
                                                ['MaxReset'])
        updateStatus = self.am_getOption('FileUpdateStatus', 'Unused')
        wmsStatusList = self.am_getOption('WMSStatus', ['Failed'])

        # Only tasks whose last update is older than this delay are considered
        selectDelay = self.am_getOption('SelectionDelay', 1)  # hours

        transformationDict = {}
        for transStatus in transformationStatus:
            result = self.__getEligibleTransformations(
                transStatus, self.transformationTypes)
            if not result['OK']:
                self.log.error(
                    "Could not obtain eligible transformations",
                    "Status '%s': %s" % (transStatus, result['Message']))
                return result

            if not result['Value']:
                self.log.info(
                    'No "%s" transformations of types %s to process.' %
                    (transStatus, ', '.join(self.transformationTypes)))
                continue

            transformationDict.update(result['Value'])

        self.log.info(
            'Selected %d transformations of types %s' %
            (len(transformationDict), ', '.join(self.transformationTypes)))
        self.log.verbose('Transformations selected:\n%s' %
                         (', '.join(transformationDict)))

        for transformation, typeName in transformationDict.iteritems():
            self.transLogger = self.log.getSubLogger('Trans-%s' %
                                                     transformation)

            # 1. Files of the transformation in the selected statuses
            result = self.__selectTransformationFiles(transformation,
                                                      fileSelectionStatus)
            if not result['OK']:
                self.transLogger.error(
                    'Could not select files for transformation',
                    '%s: %s' % (transformation, result['Message']))
                continue
            fileDict = result['Value']
            if not fileDict:
                self.transLogger.verbose(
                    'No files in status %s selected for transformation %s' %
                    (', '.join(fileSelectionStatus), transformation))
                continue

            title = 'Looking at transformation %s, type %s ' % (transformation,
                                                                typeName)
            self.transLogger.info('=' * len(title))
            self.transLogger.info(title)

            self.transLogger.info(
                'Selected %d files with status %s' %
                (len(fileDict), ','.join(fileSelectionStatus)))

            # 2. Group the files by WMS job
            result = self.__obtainWMSJobIDs(transformation, fileDict,
                                            selectDelay, wmsStatusList)
            if not result['OK']:
                self.transLogger.error(
                    "Could not obtain jobs for files of transformation",
                    result['Message'])
                continue
            jobFileDict = result['Value']
            if not jobFileDict:
                self.transLogger.info('No %s jobs found for selected files' %
                                      ' or '.join(wmsStatusList))
                continue

            self.transLogger.verbose(
                "Looking at WMS jobs %s" %
                ','.join(str(jobID) for jobID in jobFileDict))

            fileCount = sum(
                len(lfnList) for lfnList in jobFileDict.itervalues())
            self.transLogger.verbose(
                '%s files are selected after examining WMS jobs' %
                (str(fileCount) if fileCount else 'No'))
            if not fileCount:
                continue

            # 3. Drop the jobs that still have requests to be executed
            result = self.__removePendingRequestsJobs(jobFileDict)
            if not result['OK']:
                self.transLogger.error(
                    "Error while removing jobs with pending requests",
                    result['Message'])
                continue
            # This method modifies the input dictionary
            if not jobFileDict:
                self.transLogger.info(
                    'No WMS jobs without pending requests to process.')
                continue

            fileCount = sum(
                len(lfnList) for lfnList in jobFileDict.itervalues())
            self.transLogger.info(
                '%s files are selected in %d jobs after removing any job with pending requests'
                % (str(fileCount) if fileCount else 'No', len(jobFileDict)))
            if not fileCount:
                continue

            # 4. Separate the jobs according to the presence of descendants
            jobsThatDidntProduceOutputs, jobsThatProducedOutputs = self.__checkdescendants(
                transformation, jobFileDict)
            title = '======== Transformation %s: results ========' % transformation
            self.transLogger.info(title)
            self.transLogger.info('\tTotal jobs that can be updated now: %d' %
                                  len(jobsThatDidntProduceOutputs))
            if jobsThatProducedOutputs:
                self.transLogger.info('\t%d jobs have descendants' %
                                      len(jobsThatProducedOutputs))
            else:
                self.transLogger.info('\tNo jobs have descendants')

            # 5. Sort the files: to be updated, unrecoverable, with descendants
            filesToUpdate = []
            filesMaxReset = []
            filesWithDescendants = []
            for job, fileList in jobFileDict.iteritems():
                if job in jobsThatDidntProduceOutputs:
                    # fileDict values are (taskID, file status) tuples
                    recoverableFiles = set(
                        lfn for lfn in fileList
                        if fileDict[lfn][1] not in unrecoverableStatus)
                    filesToUpdate += list(recoverableFiles)
                    filesMaxReset += list(set(fileList) - recoverableFiles)
                elif job in jobsThatProducedOutputs:
                    filesWithDescendants += fileList

            if filesToUpdate:
                self.transLogger.info("\tUpdating %d files to '%s'" %
                                      (len(filesToUpdate), updateStatus))
                result = self.__updateFileStatus(transformation, filesToUpdate,
                                                 updateStatus)
                if not result['OK']:
                    self.transLogger.error(
                        '\tRecoverable files were not updated',
                        result['Message'])

            if filesMaxReset:
                self.transLogger.info(
                    '\t%d files are in %s status and have no descendants' %
                    (len(filesMaxReset), ','.join(unrecoverableStatus)))

            if filesWithDescendants:
                # FIXME: we should mark these files with another status such that they are not considered again and again
                # In addition a notification should be sent to the production managers
                self.transLogger.warn(
                    '\t!!!!!!!! Transformation has descendants for files that are not marked as processed !!!!!!!!'
                )
                self.transLogger.warn('\tFiles with descendants:',
                                      ','.join(filesWithDescendants))

        return S_OK()

    #############################################################################
    def __getEligibleTransformations(self, status, typeList):
        """ Select transformations of given status and type.

        :param status: transformation status used in the selection
        :param typeList: transformation types used in the selection
        :return: S_OK({str(transformationID): type}) or the failed result of the query
        """
        res = self.transClient.getTransformations(condDict={
            'Status': status,
            'Type': typeList
        })
        if not res['OK']:
            return res
        transformations = dict((str(prod['TransformationID']), prod['Type'])
                               for prod in res['Value'])
        return S_OK(transformations)

    #############################################################################
    def __selectTransformationFiles(self, transformation, statusList):
        """ Select files, production jobIDs in specified file status for a given transformation.

        Records lacking one of the mandatory keys are reported with a warning and skipped.

        :param transformation: transformation ID
        :param statusList: file statuses used in the selection
        :return: S_OK({lfn: (taskID, fileStatus)}) or the failed result of the query
        """
        # Until a query for files with timestamp can be obtained must rely on the
        # WMS job last update
        res = self.transClient.getTransformationFiles(condDict={
            'TransformationID': transformation,
            'Status': statusList
        })
        if not res['OK']:
            return res
        resDict = {}
        # 'Status' is read below, so it has to be checked like the other keys
        mandatoryKeys = {'LFN', 'TaskID', 'LastUpdate', 'Status'}
        for fileRecord in res['Value']:
            missingKeys = mandatoryKeys - set(fileRecord)
            if missingKeys:
                for key in missingKeys:
                    self.transLogger.warn(
                        '%s is mandatory, but missing for:\n\t%s' %
                        (key, str(fileRecord)))
                continue
            resDict[fileRecord['LFN']] = (fileRecord['TaskID'],
                                          fileRecord['Status'])
        return S_OK(resDict)

    #############################################################################
    def __obtainWMSJobIDs(self, transformation, fileDict, selectDelay,
                          wmsStatusList):
        """ Group files by the corresponding WMS jobIDs, check the corresponding
        jobs have not been updated for the delay time.  Can't get into any
        mess because we start from files only in MaxReset / Assigned and check
        corresponding jobs.  Mixtures of files for jobs in MaxReset and Assigned
        statuses only possibly include some files in Unused status (not Processed
        for example) that will not be touched.

        :param transformation: transformation ID
        :param dict fileDict: {lfn: (taskID, fileStatus)}
        :param selectDelay: delay in hours, subtracted from the current time to get
                            the 'older' limit on LastUpdateTime of the task query
        :param wmsStatusList: external (WMS) statuses of the tasks to keep
        :return: S_OK({wmsJobID: [lfn, ...]}) or the failed result of the task query
        """
        taskIDList = sorted(
            set(taskID for taskID, _status in fileDict.values()))
        self.transLogger.verbose(
            "The following %d task IDs correspond to the selected files:\n%s" %
            (len(taskIDList), ', '.join(str(taskID) for taskID in taskIDList)))

        jobFileDict = {}
        olderThan = dateTime() - datetime.timedelta(hours=selectDelay)

        res = self.transClient.getTransformationTasks(
            condDict={
                'TransformationID': transformation,
                'TaskID': taskIDList
            },
            older=olderThan,
            timeStamp='LastUpdateTime')
        if not res['OK']:
            self.transLogger.error("getTransformationTasks returned an error",
                                   '%s' % res['Message'])
            return res

        mandatoryKeys = {
            'TaskID', 'ExternalID', 'LastUpdateTime', 'ExternalStatus'
        }
        for taskDict in res['Value']:
            missingKey = mandatoryKeys - set(taskDict)
            if missingKey:
                for key in missingKey:
                    self.transLogger.warn(
                        'Missing key %s for job dictionary:\n\t%s' %
                        (key, str(taskDict)))
                continue

            taskID = taskDict['TaskID']
            wmsID = taskDict['ExternalID']
            wmsStatus = taskDict['ExternalStatus']

            # A null external ID is skipped: there is no WMS job to look at
            if not int(wmsID):
                self.transLogger.verbose(
                    'TaskID %s: status is %s (jobID = %s) so will not recheck with WMS'
                    % (taskID, wmsStatus, wmsID))
                continue

            # Exclude jobs not having appropriate WMS status - have to trust that production management status is correct
            if wmsStatus not in wmsStatusList:
                self.transLogger.verbose(
                    'Job %s is in status %s, not in %s so will be ignored' %
                    (wmsID, wmsStatus, ', '.join(wmsStatusList)))
                continue

            # Must map unique files -> jobs in expected state
            jobFileDict[wmsID] = [
                lfn for lfn, (tID, _st) in fileDict.iteritems()
                if int(tID) == int(taskID)
            ]

            self.transLogger.info(
                'Found %d files for taskID %s, jobID %s (%s), last update %s' %
                (len(jobFileDict[wmsID]), taskID, wmsID, wmsStatus,
                 taskDict['LastUpdateTime']))

        return S_OK(jobFileDict)

    #############################################################################

    def __removePendingRequestsJobs(self, jobFileDict):
        """ Before doing anything check that no outstanding requests are pending for the set of WMS jobIDs.

        ``jobFileDict`` is modified in place: a job is removed when the status of
        its request is not 'Done' or cannot be retrieved, such that its files are
        left alone until the request is known to be completed.

        :param dict jobFileDict: {wmsJobID: [lfn, ...]}
        :return: S_OK() or the failed result of getRequestIDsForJobs
        """
        jobs = jobFileDict.keys()

        # The request client is silenced during the query; its log level is
        # restored whatever happens
        level = self.reqClient.log.getLevel()
        self.reqClient.log.setLevel('ERROR')
        try:
            result = self.reqClient.getRequestIDsForJobs(jobs)
        finally:
            self.reqClient.log.setLevel(level)
        if not result['OK']:
            return result

        if not result['Value']['Successful']:
            self.transLogger.verbose('None of the jobs have pending requests')
            return S_OK()

        for jobID, requestID in result['Value']['Successful'].iteritems():
            res = self.reqClient.getRequestStatus(requestID)
            if not res['OK']:
                self.transLogger.error('Failed to get Status for Request',
                                       '%s:%s' % (requestID, res['Message']))
            elif res['Value'] == 'Done':
                continue
            # If we fail to get the Status or it is not Done, we must wait, so remove the job from the list.
            # pop() rather than del: a job that is not in the dictionary is not an error
            jobFileDict.pop(str(jobID), None)
            self.transLogger.verbose(
                'Removing jobID %s from consideration until requests are completed'
                % (jobID))

        return S_OK()

    #############################################################################
    def __checkdescendants(self, transformation, jobFileDict):
        """ Check BK descendants for input files, prepare list of actions to be
        taken for recovery.

        :param transformation: transformation ID, set as ``prod`` of the consistency checker
        :param dict jobFileDict: {wmsJobID: [lfn, ...]}
        :return: tuple (jobs whose files have no descendants, jobs with at least
                 one file having descendants)
        """

        jobsThatDidntProduceOutputs = []
        jobsThatProducedOutputs = []

        self.consChecks.prod = transformation
        for job, fileList in jobFileDict.iteritems():
            result = self.consChecks.getDescendants(fileList)
            # Items 0 and 2 of the result are used as the files with descendants
            # and the files with multiple descendants
            filesWithDesc = result[0]
            filesWithMultipleDesc = result[2]
            if filesWithDesc or filesWithMultipleDesc:
                jobsThatProducedOutputs.append(job)
            else:
                jobsThatDidntProduceOutputs.append(job)

        return jobsThatDidntProduceOutputs, jobsThatProducedOutputs

    ############################################################################
    def __updateFileStatus(self, transformation, fileList, fileStatus):
        """ Update file list to specified status.

        Nothing is updated when the enable flag is False: what would have been
        done is only logged.

        :param transformation: transformation ID, converted to int for the update
        :param fileList: LFNs to update
        :param fileStatus: status to set
        :return: S_OK() when disabled, otherwise the result of the update
        """
        if not self.enableFlag:
            self.transLogger.info(
                "\tEnable flag is False, would have updated %d files to '%s' status for %s"
                % (len(fileList), fileStatus, transformation))
            return S_OK()

        return self.transClient.setFileStatusForTransformation(
            int(transformation), fileStatus, fileList, force=False)