Example #1
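The listing starts at the class definition, so the module-level imports are not shown. A plausible import block is sketched below; the exact module paths are assumptions based on the usual layout of the Python 2 era DIRAC/ILCDIRAC code this class targets (old-style ReplicaManager/RequestContainer API), not something taken from the excerpt itself.

import glob
import os
import random
import tarfile

from DIRAC import S_OK, S_ERROR, gLogger, gConfig
from DIRAC.Core.Utilities.Subprocess import shellCall
from DIRAC.DataManagementSystem.Client.ReplicaManager import ReplicaManager
from DIRAC.DataManagementSystem.Client.FailoverTransfer import FailoverTransfer
from DIRAC.RequestManagementSystem.Client.RequestContainer import RequestContainer
from ILCDIRAC.Workflow.Modules.ModuleBase import ModuleBase
from ILCDIRAC.Core.Utilities.ProductionData import getLogPath

__RCSID__ = "$Id$"  # module revision marker, referenced below as self.version
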
class UploadLogFile(ModuleBase):
    """Handle log file uploads in production jobs."""

    #############################################################################
    def __init__(self):
        """Module initialization.
    """
        super(UploadLogFile, self).__init__()
        self.version = __RCSID__
        self.log = gLogger.getSubLogger("UploadLogFile")
        self.PRODUCTION_ID = None
        self.JOB_ID = None
        self.workflow_commons = None
        self.request = None
        self.logFilePath = ""
        self.logLFNPath = ""
        self.logdir = ""
        self.logSE = self.ops.getValue("/LogStorage/LogSE", "LogSE")
        self.root = gConfig.getValue("/LocalSite/Root", os.getcwd())
        self.logSizeLimit = self.ops.getValue("/LogFiles/SizeLimit", 20 * 1024 * 1024)
        self.logExtensions = []
        self.failoverSEs = gConfig.getValue("/Resources/StorageElementGroups/Tier1-Failover", [])
        self.diracLogo = self.ops.getValue(
            "/SAM/LogoURL", "https://lhcbweb.pic.es/DIRAC/images/logos/DIRAC-logo-transp.png"
        )
        self.rm = ReplicaManager()

        self.experiment = "CLIC"
        self.enable = True
        self.failoverTest = False  # if True, skip the LogSE and send log files straight to the failover SEs
        self.jobID = ""

    ######################################################################
    def applicationSpecificInputs(self):

        if self.step_commons.has_key("Enable"):
            self.enable = self.step_commons["Enable"]
            if not type(self.enable) == type(True):
                self.log.warn("Enable flag set to non-boolean value %s, setting to False" % self.enable)
                self.enable = False

        if self.step_commons.has_key("TestFailover"):
            self.enable = self.step_commons["TestFailover"]
            if not type(self.failoverTest) == type(True):
                self.log.warn("Test failover flag set to non-boolean value %s, setting to False" % self.failoverTest)
                self.failoverTest = False

        if "JOBID" in os.environ:
            self.jobID = os.environ["JOBID"]
            self.log.verbose("Found WMS JobID = %s" % self.jobID)
        else:
            self.log.info("No WMS JobID found, disabling module via control flag")
            self.enable = False

        if self.workflow_commons.has_key("LogFilePath") and self.workflow_commons.has_key("LogTargetPath"):
            self.logFilePath = self.workflow_commons["LogFilePath"]
            self.logLFNPath = self.workflow_commons["LogTargetPath"]
        else:
            self.log.info("LogFilePath parameter not found, creating on the fly")
            result = getLogPath(self.workflow_commons)
            if not result["OK"]:
                self.log.error("Could not create LogFilePath", result["Message"])
                return result
            self.logFilePath = result["Value"]["LogFilePath"][0]
            self.logLFNPath = result["Value"]["LogTargetPath"][0]

        # These parameters may come back as lists; keep only the first entry
        if not isinstance(self.logFilePath, str):
            self.logFilePath = self.logFilePath[0]
        if not isinstance(self.logLFNPath, str):
            self.logLFNPath = self.logLFNPath[0]

        example_file = self.logFilePath
        if "/ilc/prod/clic" in example_file:
            self.experiment = "CLIC"
        elif "/ilc/prod/ilc/sid" in example_file:
            self.experiment = "ILC_SID"
        elif "/ilc/prod/ilc/mc-dbd" in example_file:
            self.experiment = "ILC_ILD"
        else:
            self.log.warn("Failed to determine experiment, reverting to default: %s" % self.experiment)

        if self.workflow_commons.has_key("Request"):
            self.request = self.workflow_commons["Request"]
        else:
            self.request = RequestContainer()
            self.request.setRequestName("job_%s_request.xml" % self.jobID)
            self.request.setJobID(self.jobID)
            self.request.setSourceComponent("Job_%s" % self.jobID)

        return S_OK("Parameters resolved")

    ######################################################################
    def execute(self):
        """ Main execution method
    """
        self.log.info("Initializing %s" % self.version)
        # Resolve the workflow and step parameters into member variables
        self.resolveInputVariables()

        res = shellCall(0, "ls -al")
        if res["OK"] and res["Value"][0] == 0:
            self.log.info("The contents of the working directory...")
            self.log.info(str(res["Value"][1]))
        else:
            self.log.error("Failed to list the log directory", str(res["Value"][2]))

        self.log.info("Job root is found to be %s" % (self.root))
        self.log.info("PRODUCTION_ID = %s, JOB_ID = %s " % (self.PRODUCTION_ID, self.JOB_ID))
        self.logdir = os.path.realpath("./job/log/%s/%s" % (self.PRODUCTION_ID, self.JOB_ID))
        self.log.info("Selected log files will be temporarily stored in %s" % self.logdir)

        res = self.finalize()
        self.workflow_commons["Request"] = self.request
        return res

    #############################################################################
    def finalize(self):
        """ finalize method performs final operations after all the job
        steps were executed. Only production jobs are treated.
    """

        self.log.verbose("Starting UploadLogFile finalize")
        ##########################################
        # First determine the files which should be saved
        self.log.info("Determining the files to be saved in the logs.")
        res = self.determineRelevantFiles()
        if not res["OK"]:
            self.log.error("Completely failed to select relevant log files.", res["Message"])
            return S_OK()  # because if the logs are lost, it's not the end of the world.
        selectedFiles = res["Value"]
        self.log.info(
            "The following %s files were selected to be saved:\n%s"
            % (len(selectedFiles), "\n".join(selectedFiles))
        )

        #########################################
        # Create a temporary directory containing these files
        self.log.info("Populating a temporary directory for selected files.")
        res = self.populateLogDirectory(selectedFiles)
        if not res["OK"]:
            self.log.error("Completely failed to populate temporary log file directory.", res["Message"])
            self.setApplicationStatus("Failed To Populate Log Dir")
            return S_OK()  # because if the logs are lost, it's not the end of the world.
        self.log.info("%s populated with log files." % self.logdir)

        #########################################
        # Create a tailored index page
        # self.log.info('Creating an index page for the logs')
        # result = self.__createLogIndex(selectedFiles)
        # if not result['OK']:
        #  self.log.error('Failed to create index page for logs', res['Message'])

        if not self.enable:
            self.log.info("Module is disabled by control flag")
            return S_OK("Module is disabled by control flag")

        #########################################
        # Make sure all the files in the log directory have the correct permissions
        result = self.__setLogFilePermissions(self.logdir)
        if not result["OK"]:
            self.log.error("Could not set permissions of log files to 0755 with message:\n%s" % (result["Message"]))

        #########################################
        # Attempt to upload logs to the LogSE
        self.log.info("Transferring log files to the %s" % self.logSE)
        res = S_ERROR()
        if not self.failoverTest:
            self.log.info("PutDirectory %s %s %s" % (self.logFilePath, os.path.realpath(self.logdir), self.logSE))
            res = self.rm.putStorageDirectory(
                {self.logFilePath: os.path.realpath(self.logdir)}, self.logSE, singleDirectory=True
            )
            self.log.verbose(res)
            if res["OK"]:
                self.log.info("Successfully upload log directory to %s" % self.logSE)
                # TODO: The logURL should be constructed using the LogSE and StorageElement()
                # storageElement = StorageElement(self.logSE)
                # pfn = storageElement.getPfnForLfn(self.logFilePath)['Value']
                # logURL = getPfnForProtocol(res['Value'],'http')['Value']
                logURL = "%s" % self.logFilePath
                self.setJobParameter("Log LFN", logURL)
                self.log.info("Logs for this job may be retrieved with dirac-ilc-get-prod-log -F %s" % logURL)
                return S_OK()

        #########################################
        # Recover the logs to a failover storage element
        self.log.error(
            "Completely failed to upload log files to %s, will attempt upload to failover SE" % self.logSE,
            res["Message"],
        )

        tarFileDir = os.path.dirname(self.logdir)
        self.logLFNPath = "%s.gz" % self.logLFNPath
        tarFileName = os.path.basename(self.logLFNPath)
        start = os.getcwd()
        os.chdir(self.logdir)
        logTarFiles = os.listdir(self.logdir)
        # comm = 'tar czvf %s %s' % (tarFileName,string.join(logTarFiles,' '))
        tfile = tarfile.open(tarFileName, "w:gz")
        for item in logTarFiles:
            tfile.add(item)
        tfile.close()
        # res = shellCall(0,comm)
        # res must reflect whether the tarball was actually created, not the earlier failed upload attempt
        if os.path.exists(tarFileName):
            res = S_OK(tarFileName)
        else:
            res = S_ERROR("File was not created")
        os.chdir(start)
        if not res["OK"]:
            self.log.error("Failed to create tar file from directory", "%s %s" % (self.logdir, res["Message"]))
            self.setApplicationStatus("Failed To Create Log Tar Dir")
            return S_OK()  # because if the logs are lost, it's not the end of the world.

        # if res['Value'][0]: #i.e. non-zero status
        #  self.log.error('Failed to create tar file from directory','%s %s' % (self.logdir,res['Value']))
        #  self.setApplicationStatus('Failed To Create Log Tar Dir')
        #  return S_OK()#because if the logs are lost, it's not the end of the world.

        ############################################################
        # Instantiate the failover transfer client with the global request object
        failoverTransfer = FailoverTransfer(self.request)
        # Pick the failover SEs configured for this experiment, if any
        self.failoverSEs = self.ops.getValue("Production/%s/FailOverSE" % self.experiment, self.failoverSEs)

        random.shuffle(self.failoverSEs)
        self.log.info(
            "Attempting to store file %s to the following SE(s):\n%s"
            % (tarFileName, ", ".join(self.failoverSEs))
        )
        result = failoverTransfer.transferAndRegisterFile(
            tarFileName,
            "%s/%s" % (tarFileDir, tarFileName),
            self.logLFNPath,
            self.failoverSEs,
            fileGUID=None,
            fileCatalog=["FileCatalog", "LcgFileCatalog"],
        )
        if not result["OK"]:
            self.log.error("Failed to upload logs to all destinations")
            self.setApplicationStatus("Failed To Upload Logs")
            return S_OK()  # because if the logs are lost, it's not the end of the world.

        # Now after all operations, retrieve potentially modified request object
        result = failoverTransfer.getRequestObject()
        if not result["OK"]:
            self.log.error(result)
            return S_ERROR("Could not retrieve modified request")

        self.request = result["Value"]
        res = self.createLogUploadRequest(self.logSE, self.logLFNPath)
        if not res["OK"]:
            self.log.error("Failed to create failover request", res["Message"])
            self.setApplicationStatus("Failed To Upload Logs To Failover")
        else:
            self.log.info("Successfully created failover request")

        self.workflow_commons["Request"] = self.request
        return S_OK()

    #############################################################################
    def determineRelevantFiles(self):
        """ The files which are below a configurable size will be stored in the logs.
        This will typically pick up everything in the working directory minus the output data files.
    """
        logFileExtensions = ["*.txt", "*.log", "*.out", "*.output", "*.xml", "*.sh", "*.info", "*.err", "*.root"]
        self.logExtensions = self.ops.getValue("/LogFiles/%s/Extensions" % self.experiment, [])

        if self.logExtensions:
            self.log.info("Using list of log extensions from CS:\n%s" % (", ".join(self.logExtensions)))
            logFileExtensions = self.logExtensions
        else:
            self.log.info("Using default list of log extensions:\n%s" % (", ".join(logFileExtensions)))

        candidateFiles = []
        for ext in logFileExtensions:
            self.log.debug("Looking at log file wildcard: %s" % ext)
            globList = glob.glob(ext)
            for check in globList:
                if os.path.isfile(check):
                    self.log.debug("Found locally existing log file: %s" % check)
                    candidateFiles.append(check)

        selectedFiles = []
        try:
            for candidate in candidateFiles:
                fileSize = os.path.getsize(candidate)
                if fileSize < self.logSizeLimit:
                    selectedFiles.append(candidate)
                else:
                    self.log.error(
                        "Log file found to be greater than maximum of %s bytes" % self.logSizeLimit, candidate
                    )
            return S_OK(selectedFiles)
        except Exception as x:
            self.log.exception("Exception while determining files to save.", "", str(x))
            return S_ERROR("Could not determine log files")