Пример #1
0
        logging.Formatter.converter = time.gmtime
        logHandler.setFormatter(logFormatter)
        logging.getLogger().addHandler(logHandler)

        if ad and not "CRAB3_RUNTIME_DEBUG" in os.environ:
            startDashboardMonitoring(ad)
        print("==== CMSSW Stack Execution STARTING at %s ====" % time.asctime(time.gmtime()))
        scram = Scram(
            version = opts.cmsswVersion,
            directory = os.getcwd(),
            architecture = opts.scramArch,
            )

        print("==== SCRAM Obj CREATED at %s ====" % time.asctime(time.gmtime()))
        if scram.project() or scram.runtime(): #if any of the two commands fail...
            msg = scram.diagnostic()
            handleException("FAILED", EC_CMSMissingSoftware, 'Error setting CMSSW environment: %s' % msg)
            mintime()
            sys.exit(EC_CMSMissingSoftware)

        extractUserSandbox(opts.archiveJob, opts.cmsswVersion)

        try:
            jobExitCode = None
            if opts.scriptExe=='None':
                print("==== CMSSW JOB Execution started at %s ====" % time.asctime(time.gmtime()))
                cmssw = executeCMSSWStack(opts, scram)
                jobExitCode = cmssw.step.execution.exitStatus
            else:
                print("==== ScriptEXE Execution started at %s ====" % time.asctime(time.gmtime()))
                jobExitCode = executeScriptExe(opts, scram)
Пример #2
0
    def execute(self, emulator=None):
        """
        _execute_


        """
        stepModule = "WMTaskSpace.%s" % self.stepName
        if emulator is not None:
            return emulator.emulate(self.step, self.job)

        # write the wrapper script to a temporary location
        # I don't pass it directly through os.system because I don't
        # trust that there won't be shell-escape shenanigans with
        # arbitrary input files
        scramSetup = self.step.application.setup.softwareEnvironment
        scramCommand = self.step.application.setup.scramCommand
        scramProject = self.step.application.setup.scramProject
        scramArch = self.step.application.setup.scramArch
        cmsswVersion = self.step.application.setup.cmsswVersion
        jobReportXML = self.step.output.jobReport
        cmsswCommand = self.step.application.command.executable
        cmsswConfig = self.step.application.command.configuration
        cmsswArguments = self.step.application.command.arguments
        userTarball = ",".join(self.step.user.inputSandboxes)
        userFiles = ",".join(self.step.user.userFiles)
        logging.info("User files are %s", userFiles)
        logging.info("User sandboxes are %s", userTarball)

        multicoreSettings = self.step.application.multicore
        try:
            logging.info("CMSSW configured for %s cores", multicoreSettings.numberOfCores)
        except AttributeError:
            logging.info("No value set for multicore.numberOfCores")

        logging.info("Executing CMSSW step")

        #
        # set any global environment variables
        #
        try:
            os.environ["FRONTIER_ID"] = "wmagent_%s" % (self.report.data.workload)
        except Exception as ex:
            logging.error("Have critical error in setting FRONTIER_ID: %s", str(ex))
            logging.error("Continuing, as this is not a critical function yet.")

        #
        # scram bootstrap
        #
        scram = Scram(
            command=scramCommand,
            version=cmsswVersion,
            initialise=self.step.application.setup.softwareEnvironment,
            directory=self.step.builder.workingDir,
            architecture=scramArch,
        )

        logging.info("Runing SCRAM")
        try:
            projectOutcome = scram.project()
        except Exception as ex:
            msg = "Exception raised while running scram.\n"
            msg += str(ex)
            logging.critical("Error running SCRAM")
            logging.critical(msg)
            raise WMExecutionFailure(50513, "ScramSetupFailure", msg)

        if projectOutcome > 0:
            msg = scram.diagnostic()
            logging.critical("Error running SCRAM")
            logging.critical(msg)
            raise WMExecutionFailure(50513, "ScramSetupFailure", msg)
        runtimeOutcome = scram.runtime()
        if runtimeOutcome > 0:
            msg = scram.diagnostic()
            logging.critical("Error running SCRAM")
            logging.critical(msg)
            raise WMExecutionFailure(50513, "ScramSetupFailure", msg)

        #
        # pre scripts
        #
        logging.info("Running PRE scripts")
        for script in self.step.runtime.preScripts:
            # TODO: Exception handling and error handling & logging
            scriptProcess = subprocess.Popen(
                ["/bin/bash"],
                shell=True,
                cwd=self.step.builder.workingDir,
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
                stdin=subprocess.PIPE,
            )
            # BADPYTHON
            scriptProcess.stdin.write("export LD_LIBRARY_PATH=$LD_LIBRARY_PATH\n")
            invokeCommand = "%s -m WMCore.WMRuntime.ScriptInvoke %s %s \n" % (sys.executable, stepModule, script)
            logging.info("    Invoking command: %s", invokeCommand)
            scriptProcess.stdin.write(invokeCommand)
            stdout, stderr = scriptProcess.communicate()
            retCode = scriptProcess.returncode
            if retCode > 0:
                msg = "Error running command\n%s\n" % invokeCommand
                msg += "%s\n %s\n %s\n" % (retCode, stdout, stderr)
                logging.critical("Error running command")
                logging.critical(msg)
                raise WMExecutionFailure(50513, "PreScriptFailure", msg)

        #
        # pre scripts with scram
        #
        logging.info("RUNNING SCRAM SCRIPTS")
        for script in self.step.runtime.scramPreScripts:
            # invoke scripts with scram()
            runtimeDir = getattr(self.step.runtime, "scramPreDir", None)
            invokeCommand = (
                self.step.runtime.invokeCommand
                if hasattr(self.step.runtime, "invokeCommand")
                else "%s -m WMCore.WMRuntime.ScriptInvoke %s" % (sys.executable, stepModule)
            )
            invokeCommand += " %s \n" % script
            retCode = scram(invokeCommand, runtimeDir=runtimeDir)
            if retCode > 0:
                msg = "Error running command\n%s\n" % invokeCommand
                msg += "%s\n " % retCode
                msg += scram.diagnostic()
                logging.critical(msg)
                raise WMExecutionFailure(50513, "PreScriptScramFailure", msg)

        configPath = "%s/%s-main.sh" % (self.step.builder.workingDir, self.stepName)
        handle = open(configPath, "w")
        handle.write(CONFIG_BLOB)
        handle.close()

        # spawn this new process
        # the script looks for:
        # <SCRAM_COMMAND> <SCRAM_PROJECT> <CMSSW_VERSION> <JOB_REPORT> <EXECUTABLE> <CONFIG>
        # open the output files
        stdoutHandle = open(self.step.output.stdout, "w")
        stderrHandle = open(self.step.output.stderr, "w")
        args = [
            "/bin/bash",
            configPath,
            scramSetup,
            scramArch,
            scramCommand,
            scramProject,
            cmsswVersion,
            jobReportXML,
            cmsswCommand,
            cmsswConfig,
            userTarball,
            userFiles,
            cmsswArguments,
        ]
        logging.info("Executing CMSSW. args: %s", args)

        # at CERN override the environment to work around problem with GSI authentication plugin and EOS
        if socket.getfqdn().endswith("cern.ch"):
            returncode = subprocess.call(
                args, stdout=stdoutHandle, stderr=stderrHandle, env=dict(os.environ, XRD_LOADBALANCERTTL="86400")
            )
        else:
            returncode = subprocess.call(args, stdout=stdoutHandle, stderr=stderrHandle)

        self.setCondorChirpAttrDelayed("Chirp_WMCore_cmsRun_ExitCode", returncode)
        self.setCondorChirpAttrDelayed("Chirp_WMCore_%s_ExitCode" % self.stepName, returncode)

        stdoutHandle.close()
        stderrHandle.close()

        self.step.execution.exitStatus = returncode
        argsDump = {"arguments": args}

        if returncode != 0:
            msg = "Error running cmsRun\n%s\n" % argsDump
            msg += "Return code: %s\n" % returncode
            logging.critical(msg)
            raise WMExecutionFailure(returncode, "CmsRunFailure", msg)

        try:
            self.report.parse(jobReportXML, stepName=self.stepName)
        except Exception as ex:
            # Catch it if something goes wrong
            raise WMExecutionFailure(50115, "BadJobReportXML", str(ex))

        stepHelper = WMStepHelper(self.step)
        typeHelper = stepHelper.getTypeHelper()

        acquisitionEra = self.task.getAcquisitionEra()
        processingVer = self.task.getProcessingVersion()
        processingStr = self.task.getProcessingString()
        validStatus = self.workload.getValidStatus()
        inputPath = self.task.getInputDatasetPath()
        globalTag = typeHelper.getGlobalTag()
        prepID = self.task.getPrepID()
        cacheUrl, cacheDB, configID = stepHelper.getConfigInfo()

        self.report.setValidStatus(validStatus=validStatus)
        self.report.setGlobalTag(globalTag=globalTag)
        self.report.setPrepID(prepID)
        self.report.setInputDataset(inputPath=inputPath)
        self.report.setAcquisitionProcessing(
            acquisitionEra=acquisitionEra, processingVer=processingVer, processingStr=processingStr
        )
        self.report.setConfigURL(configURL="%s;;%s;;%s" % (cacheUrl, cacheDB, configID))

        # Attach info to files
        self.report.addInfoToOutputFilesForStep(stepName=self.stepName, step=self.step)

        self.report.checkForOutputFiles(stepName=self.stepName)
        self.report.checkForAdlerChecksum(stepName=self.stepName)
        self.report.checkForRunLumiInformation(stepName=self.stepName)

        if self.step.output.keep != True:
            self.report.killOutput()
        else:
            # Check that we only keep the desired output
            for module in stepHelper.getIgnoredOutputModules():
                self.report.deleteOutputModuleForStep(stepName=self.stepName, moduleName=module)

        # Add stageout LFN to existing TFileService files
        reportAnalysisFiles = self.report.getAnalysisFilesFromStep(self.stepName)
        for reportAnalysisFile in reportAnalysisFiles:
            newLFN = analysisFileLFN(reportAnalysisFile.fileName, self.step.user.lfnBase, self.job)
            addAttributesToFile(reportAnalysisFile, pfn=reportAnalysisFile.fileName, lfn=newLFN, validate=False)

        # Add analysis file entries for additional files listed in workflow
        for fileName in stepHelper.listAnalysisFiles():
            analysisFile = stepHelper.getAnalysisFile(fileName)
            if os.path.isfile(analysisFile.fileName):
                newLFN = analysisFileLFN(analysisFile.fileName, analysisFile.lfnBase, self.job)
                self.report.addAnalysisFile(
                    analysisFile.fileName,
                    lfn=newLFN,
                    Source="UserDefined",
                    pfn=os.path.join(os.getcwd(), analysisFile.fileName),
                    validate=False,
                )

        return
Пример #3
0
    def execute(self, emulator = None):
        """
        _execute_

        """
        scramCommand = self.step.application.setup.scramCommand
        scramArch = self.step.application.setup.scramArch
        cmsswVersion = self.step.application.setup.cmsswVersion

        # Are we using emulators again?
        if (emulator != None):
            return emulator.emulate( self.step, self.job )

        overrides = {}
        if hasattr(self.step, 'override'):
            overrides = self.step.override.dictionary_()

        # Set wait to over an hour
        waitTime = overrides.get('waitTime', 3600 + (self.step.retryDelay * self.step.retryCount))

        # Pull out StageOutMgr Overrides
        # switch between old stageOut behavior and new, fancy stage out behavior
        useNewStageOutCode = False
        if 'newStageOut' in overrides and overrides.get('newStageOut'):
            useNewStageOutCode = True

        # hardcode CERN Castor T0_CH_CERN_MSS stageout parameters
        castorStageOutParams = {}
        castorStageOutParams['command'] = overrides.get('command', "srmv2-lcg")
        castorStageOutParams['option'] = overrides.get('option', "")
        castorStageOutParams['se-name'] = overrides.get('se-name', "srm-cms.cern.ch")
        castorStageOutParams['phedex-node'] = overrides.get('phedex-node', "T2_CH_CERN")
        castorStageOutParams['lfn-prefix'] = overrides.get('lfn-prefix', "srm://srm-cms.cern.ch:8443/srm/managerv2?SFN=/castor/cern.ch/cms")

        # hardcode CERN EOS T2_CH_CERN stageout parameters
        eosStageOutParams = {}
        eosStageOutParams['command'] = overrides.get('command', "srmv2-lcg")
        eosStageOutParams['option'] = overrides.get('option', "")
        eosStageOutParams['se-name'] = overrides.get('se-name', "srm-eoscms.cern.ch")
        eosStageOutParams['phedex-node'] = overrides.get('phedex-node', "T2_CH_CERN")
        eosStageOutParams['lfn-prefix'] = overrides.get('lfn-prefix', "srm://srm-eoscms.cern.ch:8443/srm/v2/server?SFN=/eos/cms")

        # are we using the new stageout method ?
        useNewStageOutCode = False
        if getattr(self.step, 'newStageout', False) or \
            ('newStageOut' in overrides and overrides.get('newStageOut')):
            useNewStageOutCode = True

        try:
            if useNewStageOutCode:
                # is this even working ???
                #logging.info("LOGCOLLECT IS USING NEW STAGEOUT CODE")
                #stageOutMgr = StageOutMgr(retryPauseTime  = self.step.retryDelay,
                #                          numberOfRetries = self.step.retryCount,
                #                          **overrides)
                #stageInMgr = StageInMgr(retryPauseTime  = 0,
                #                        numberOfRetries = 0)
                #deleteMgr = DeleteMgr(retryPauseTime  = 0,
                #                      numberOfRetries = 0)
                castorStageOutMgr = StageOutMgr(**castorStageOutParams)
                eosStageOutMgr = StageOutMgr(**eosStageOutParams)
                stageInMgr = StageInMgr()
                deleteMgr = DeleteMgr()
            else:
                castorStageOutMgr = StageOutMgr(**castorStageOutParams)
                eosStageOutMgr = StageOutMgr(**eosStageOutParams)
                stageInMgr = StageInMgr()
                deleteMgr = DeleteMgr()
        except Exception as ex:
            msg = "Unable to load StageOut/Delete Impl: %s" % str(ex)
            logging.error(msg)
            raise WMExecutionFailure(60312, "MgrImplementationError", msg)

        # prepare output tar file
        taskName = self.report.getTaskName().split('/')[-1]
        host = socket.gethostname().split('.')[0]
        tarName = '%s-%s-%s-%i-logs.tar' % (self.report.data.workload, taskName, host , self.job["counter"])
        tarLocation = os.path.join(self.stepSpace.location, tarName)

        # check if the cmsswVersion supports edmCopyUtil (min CMSSW_8_X)
        result = re.match("CMSSW_([0-9]+)_([0-9]+)_([0-9]+).*", cmsswVersion)
        useEdmCopyUtil = False
        if result:
            try:
                if int(result.group(1)) >= 8:
                    useEdmCopyUtil = True
            except ValueError:
                pass

        # setup Scram needed to run edmCopyUtil
        if useEdmCopyUtil:
            scram = Scram(
                command=scramCommand,
                version=cmsswVersion,
                initialise=self.step.application.setup.softwareEnvironment,
                directory=self.step.builder.workingDir,
                architecture=scramArch,
                )
            logging.info("Running scram")
            try:
                projectOutcome = scram.project()
            except Exception as ex:
                msg = "Exception raised while running scram.\n"
                msg += str(ex)
                logging.critical("Error running SCRAM")
                logging.critical(msg)
                raise WMExecutionFailure(50513, "ScramSetupFailure", msg)

            if projectOutcome > 0:
                msg = scram.diagnostic()
                logging.critical("Error running SCRAM")
                logging.critical(msg)
                raise WMExecutionFailure(50513, "ScramSetupFailure", msg)
            runtimeOutcome = scram.runtime()
            if runtimeOutcome > 0:
                msg = scram.diagnostic()
                logging.critical("Error running SCRAM")
                logging.critical(msg)
                raise WMExecutionFailure(50513, "ScramSetupFailure", msg)

        # iterate through input files
        localLogs = []
        deleteLogArchives = []
        if useEdmCopyUtil:
            numberOfFilesPerCopy = 10
        else:
            numberOfFilesPerCopy = 1
        for logs in grouper(self.job["input_files"], numberOfFilesPerCopy):

            copyCommand = "env X509_USER_PROXY=%s edmCopyUtil" % os.environ.get('X509_USER_PROXY', None)
            for log in logs:
                copyCommand += " %s" % log['lfn']
            copyCommand += " %s" % self.step.builder.workingDir

            # give up after timeout of 1 minute per input file
            signal.signal(signal.SIGALRM, alarmHandler)
            signal.alarm(60 * numberOfFilesPerCopy)

            filesCopied = False
            try:
                if useEdmCopyUtil:
                    logging.info("Running edmCopyUtil")
                    retval = scram(copyCommand)
                    if retval == 0:
                        filesCopied = True
                else:
                    logging.info("Running stageIn")
                    for log in logs:
                        fileInfo = {"LFN": log['lfn']}
                        logArchive = stageInMgr(**fileInfo)
                        if logArchive:
                            filesCopied = True
            except Alarm:
                logging.error("Indefinite hang during edmCopyUtil/stageIn of logArchives")
            except StageOutFailure:
                logging.error("Unable to stageIn logArchives")
            except Exception:
                raise

            signal.alarm(0)

            if filesCopied:
                for log in logs:
                    localLogs.append(os.path.join(self.step.builder.workingDir, os.path.basename(log['lfn'])))
                    deleteLogArchives.append(log)
                    self.report.addInputFile(sourceName = "logArchives", lfn = log['lfn'])
            else:
                logging.error("Unable to copy logArchives to local disk")
                if useEdmCopyUtil:
                    with open('scramOutput.log', 'r') as f:
                        logging.error("Scram output: %s", f.read())
                for log in logs:
                    self.report.addSkippedFile(log['lfn'], None)

        # create tarfile if any logArchive copied in
        if localLogs:
            tarFile = tarfile.open(tarLocation, 'w:')
            for log in localLogs:
                path = log.split('/')
                tarFile.add(name = log,
                            arcname = os.path.join(path[-3],
                                                   path[-2],
                                                   path[-1]))
                os.remove(log)
            tarFile.close()
        else:
            msg = "Unable to copy any logArchives to local disk"
            logging.error(msg)
            raise WMExecutionFailure(60312, "LogCollectError", msg)


        # now staging out the LogCollect tarfile
        logging.info("Staging out LogCollect tarfile to Castor and EOS")
        now = datetime.datetime.now()
        lfn = "/store/logs/prod/%i/%.2i/%s/%s/%s" % (now.year, now.month, "WMAgent",
                                                     self.report.data.workload,
                                                     os.path.basename(tarLocation))

        tarInfo = {'LFN'    : lfn,
                   'PFN'    : tarLocation,
                   'SEName' : None,
                   'PNN'    : None,
                   'GUID'   : None}

        # perform mandatory stage out to CERN Castor
        signal.signal(signal.SIGALRM, alarmHandler)
        signal.alarm(waitTime)
        try:
            castorStageOutMgr(tarInfo)
        except Alarm:
            msg = "Indefinite hang during stageOut of LogCollect to Castor"
            logging.error(msg)
            raise WMExecutionFailure(60409, "LogCollectTimeout", msg)
        except Exception as ex:
            msg = "Unable to stageOut LogCollect to Castor:\n"
            msg += str(ex)
            logging.error(msg)
            raise WMExecutionFailure(60408, "LogCollectStageOutError", msg)
        signal.alarm(0)

        # add to job report
        self.report.addOutputFile(outputModule = "LogCollect", file = tarInfo)
        outputRef = getattr(self.report.data, self.stepName)
        outputRef.output.pfn = tarInfo['PFN']
        outputRef.output.location = tarInfo['PNN']
        outputRef.output.lfn = tarInfo['LFN']


        tarInfo = {'LFN'    : lfn,
                   'PFN'    : tarLocation,
                   'SEName' : None,
                   'PNN'    : None,
                   'GUID'   : None}

        # then, perform best effort stage out to CERN EOS
        signal.signal(signal.SIGALRM, alarmHandler)
        signal.alarm(waitTime)
        try:
            eosStageOutMgr(tarInfo)
        except Alarm:
            logging.error("Indefinite hang during stageOut of LogCollect to EOS")
        except Exception as ex:
            logging.error("Unable to stageOut LogCollect to EOS:\n", ex)
        signal.alarm(0)

        # we got this far, delete input
        for log in deleteLogArchives:

            # give up after timeout of 1 minutes
            signal.signal(signal.SIGALRM, alarmHandler)
            signal.alarm(60)
            try:
                fileToDelete = {'LFN': log['lfn'],
                                'PFN': None,
                                'SEName': None,
                                'PNN': None,
                                'StageOutCommand': None}
                deleteMgr(fileToDelete = fileToDelete)
            except Alarm:
                logging.error("Indefinite hang during delete of logArchive")
            except Exception as ex:
                logging.error("Unable to delete logArchive: %s", ex)
            signal.alarm(0)

        return
Пример #4
0
    def execute(self, emulator=None):
        """
        _execute_

        """
        scramCommand = self.step.application.setup.scramCommand
        scramArch = self.step.application.setup.scramArch
        cmsswVersion = self.step.application.setup.cmsswVersion

        # Are we using emulators again?
        if (emulator != None):
            return emulator.emulate(self.step, self.job)

        overrides = {}
        if hasattr(self.step, 'override'):
            overrides = self.step.override.dictionary_()

        # Set wait to over an hour
        waitTime = overrides.get(
            'waitTime', 3600 + (self.step.retryDelay * self.step.retryCount))

        # hardcode CERN Castor T0_CH_CERN_MSS stageout parameters
        castorStageOutParams = {}
        castorStageOutParams['command'] = overrides.get('command', "xrdcp")
        castorStageOutParams['option'] = overrides.get('option',
                                                       "--cerncastor")
        castorStageOutParams['phedex-node'] = overrides.get(
            'phedex-node', "T2_CH_CERN")
        castorStageOutParams['lfn-prefix'] = overrides.get(
            'lfn-prefix', "root://castorcms.cern.ch//castor/cern.ch/cms")

        # hardcode CERN EOS T2_CH_CERN stageout parameters
        eosStageOutParams = {}
        eosStageOutParams['command'] = overrides.get('command', "xrdcp")
        eosStageOutParams['option'] = overrides.get('option', "")
        eosStageOutParams['phedex-node'] = overrides.get(
            'phedex-node', "T2_CH_CERN")
        eosStageOutParams['lfn-prefix'] = overrides.get(
            'lfn-prefix', "root://eoscms.cern.ch//eos/cms")

        # are we using the new stageout method ?
        useNewStageOutCode = False
        if getattr(self.step, 'newStageout', False) or \
            ('newStageOut' in overrides and overrides.get('newStageOut')):
            useNewStageOutCode = True

        try:
            castorStageOutMgr = StageOutMgr(**castorStageOutParams)
            eosStageOutMgr = StageOutMgr(**eosStageOutParams)
            stageInMgr = StageInMgr()
            deleteMgr = DeleteMgr()
        except Exception as ex:
            msg = "Unable to load StageOut/Delete Impl: %s" % str(ex)
            logging.error(msg)
            raise WMExecutionFailure(60312, "MgrImplementationError", msg)

        # prepare output tar file
        taskName = self.report.getTaskName().split('/')[-1]
        host = socket.gethostname().split('.')[0]
        tarName = '%s-%s-%s-%i-logs.tar' % (
            self.report.data.workload, taskName, host, self.job["counter"])
        tarLocation = os.path.join(self.stepSpace.location, tarName)

        # check if the cmsswVersion supports edmCopyUtil (min CMSSW_8_X)
        result = re.match("CMSSW_([0-9]+)_([0-9]+)_([0-9]+).*", cmsswVersion)
        useEdmCopyUtil = False
        if result:
            try:
                if int(result.group(1)) >= 8:
                    useEdmCopyUtil = True
            except ValueError:
                pass

        # setup Scram needed to run edmCopyUtil
        if useEdmCopyUtil:
            scram = Scram(
                command=scramCommand,
                version=cmsswVersion,
                initialise=self.step.application.setup.softwareEnvironment,
                directory=self.step.builder.workingDir,
                architecture=scramArch,
            )
            logging.info("Running scram")
            try:
                projectOutcome = scram.project()
            except Exception as ex:
                msg = "Exception raised while running scram.\n"
                msg += str(ex)
                logging.critical("Error running SCRAM")
                logging.critical(msg)
                raise WMExecutionFailure(50513, "ScramSetupFailure", msg)

            if projectOutcome > 0:
                msg = scram.diagnostic()
                logging.critical("Error running SCRAM")
                logging.critical(msg)
                raise WMExecutionFailure(50513, "ScramSetupFailure", msg)
            runtimeOutcome = scram.runtime()
            if runtimeOutcome > 0:
                msg = scram.diagnostic()
                logging.critical("Error running SCRAM")
                logging.critical(msg)
                raise WMExecutionFailure(50513, "ScramSetupFailure", msg)

        # iterate through input files
        localLogs = []
        deleteLogArchives = []
        if useEdmCopyUtil:
            numberOfFilesPerCopy = 10
        else:
            numberOfFilesPerCopy = 1
        for logs in grouper(self.job["input_files"], numberOfFilesPerCopy):

            copyCommand = "env X509_USER_PROXY=%s edmCopyUtil" % os.environ.get(
                'X509_USER_PROXY', None)
            for log in logs:
                copyCommand += " %s" % log['lfn']
            copyCommand += " %s" % self.step.builder.workingDir

            # give up after timeout of 1 minute per input file
            signal.signal(signal.SIGALRM, alarmHandler)
            signal.alarm(60 * numberOfFilesPerCopy)

            filesCopied = False
            try:
                if useEdmCopyUtil:
                    logging.info("Running edmCopyUtil")
                    retval = scram(copyCommand)
                    if retval == 0:
                        filesCopied = True
                else:
                    logging.info("Running stageIn")
                    for log in logs:
                        fileInfo = {"LFN": log['lfn']}
                        logArchive = stageInMgr(**fileInfo)
                        if logArchive:
                            filesCopied = True
            except Alarm:
                logging.error(
                    "Indefinite hang during edmCopyUtil/stageIn of logArchives"
                )
            except StageOutFailure:
                logging.error("Unable to stageIn logArchives")
            except Exception:
                raise

            signal.alarm(0)

            if filesCopied:
                for log in logs:
                    localLogs.append(
                        os.path.join(self.step.builder.workingDir,
                                     os.path.basename(log['lfn'])))
                    deleteLogArchives.append(log)
                    self.report.addInputFile(sourceName="logArchives",
                                             lfn=log['lfn'])
            else:
                logging.error("Unable to copy logArchives to local disk")
                if useEdmCopyUtil:
                    with open('scramOutput.log', 'r') as f:
                        logging.error("Scram output: %s", f.read())
                for log in logs:
                    self.report.addSkippedFile(log['lfn'], None)

        # create tarfile if any logArchive copied in
        if localLogs:
            tarFile = tarfile.open(tarLocation, 'w:')
            for log in localLogs:
                path = log.split('/')
                tarFile.add(name=log,
                            arcname=os.path.join(path[-3], path[-2], path[-1]))
                os.remove(log)
            tarFile.close()
        else:
            msg = "Unable to copy any logArchives to local disk"
            logging.error(msg)
            raise WMExecutionFailure(60312, "LogCollectError", msg)

        # now staging out the LogCollect tarfile
        logging.info("Staging out LogCollect tarfile to Castor and EOS")
        now = datetime.datetime.now()
        lfn = "/store/logs/prod/%i/%.2i/%s/%s/%s" % (
            now.year, now.month, "WMAgent", self.report.data.workload,
            os.path.basename(tarLocation))

        tarInfo = {'LFN': lfn, 'PFN': tarLocation, 'PNN': None, 'GUID': None}

        # perform mandatory stage out to CERN Castor
        signal.signal(signal.SIGALRM, alarmHandler)
        signal.alarm(waitTime)
        try:
            castorStageOutMgr(tarInfo)
        except Alarm:
            msg = "Indefinite hang during stageOut of LogCollect to Castor"
            logging.error(msg)
            raise WMExecutionFailure(60409, "LogCollectTimeout", msg)
        except Exception as ex:
            msg = "Unable to stageOut LogCollect to Castor:\n"
            msg += str(ex)
            logging.error(msg)
            raise WMExecutionFailure(60408, "LogCollectStageOutError", msg)
        signal.alarm(0)

        # add to job report
        self.report.addOutputFile(outputModule="LogCollect", file=tarInfo)
        outputRef = getattr(self.report.data, self.stepName)
        outputRef.output.pfn = tarInfo['PFN']
        outputRef.output.location = tarInfo['PNN']
        outputRef.output.lfn = tarInfo['LFN']

        tarInfo = {'LFN': lfn, 'PFN': tarLocation, 'PNN': None, 'GUID': None}

        # then, perform best effort stage out to CERN EOS
        signal.signal(signal.SIGALRM, alarmHandler)
        signal.alarm(waitTime)
        try:
            eosStageOutMgr(tarInfo)
        except Alarm:
            logging.error(
                "Indefinite hang during stageOut of LogCollect to EOS")
        except Exception as ex:
            logging.error("Unable to stageOut LogCollect to EOS:\n", ex)
        signal.alarm(0)

        # we got this far, delete input
        for log in deleteLogArchives:

            # give up after timeout of 1 minutes
            signal.signal(signal.SIGALRM, alarmHandler)
            signal.alarm(60)
            try:
                fileToDelete = {
                    'LFN': log['lfn'],
                    'PFN': None,
                    'PNN': None,
                    'StageOutCommand': None
                }
                deleteMgr(fileToDelete=fileToDelete)
            except Alarm:
                logging.error("Indefinite hang during delete of logArchive")
            except Exception as ex:
                logging.error("Unable to delete logArchive: %s", ex)
            signal.alarm(0)

        return
Пример #5
0
    def execute(self, emulator=None):
        """
        _execute_


        """
        stepModule = "WMTaskSpace.%s" % self.stepName
        if emulator is not None:
            return emulator.emulate(self.step, self.job)

        # write the wrapper script to a temporary location
        # I don't pass it directly through os.system because I don't
        # trust that there won't be shell-escape shenanigans with
        # arbitrary input files
        scramSetup = self.step.application.setup.softwareEnvironment
        scramCommand = self.step.application.setup.scramCommand
        scramProject = self.step.application.setup.scramProject
        scramArch = self.step.application.setup.scramArch
        cmsswVersion = self.step.application.setup.cmsswVersion
        jobReportXML = self.step.output.jobReport
        cmsswCommand = self.step.application.command.executable
        cmsswConfig = self.step.application.command.configuration
        cmsswArguments = self.step.application.command.arguments
        userTarball = ','.join(self.step.user.inputSandboxes)
        userFiles = ','.join(self.step.user.userFiles)
        logging.info('User files are %s', userFiles)
        logging.info('User sandboxes are %s', userTarball)

        scramArch = getSingleScramArch(scramArch)

        multicoreSettings = self.step.application.multicore
        try:
            logging.info("CMSSW configured for %s cores and %s event streams",
                         multicoreSettings.numberOfCores,
                         multicoreSettings.eventStreams)
        except AttributeError:
            logging.info(
                "No value set for multicore numberOfCores or eventStreams")

        logging.info("Executing CMSSW step")

        #
        # set any global environment variables
        #
        try:
            os.environ['FRONTIER_ID'] = 'wmagent_%s' % (
                self.report.data.workload)
        except Exception as ex:
            logging.error('Have critical error in setting FRONTIER_ID: %s',
                          str(ex))
            logging.error(
                'Continuing, as this is not a critical function yet.')

        #
        # scram bootstrap
        #
        scram = Scram(
            command=scramCommand,
            version=cmsswVersion,
            initialise=self.step.application.setup.softwareEnvironment,
            directory=self.step.builder.workingDir,
            architecture=scramArch,
        )

        logging.info("Runing SCRAM")
        try:
            projectOutcome = scram.project()
        except Exception as ex:
            msg = "Exception raised while running scram.\n"
            msg += str(ex)
            logging.critical("Error running SCRAM")
            logging.critical(msg)
            raise WMExecutionFailure(50513, "ScramSetupFailure", msg)

        if projectOutcome > 0:
            msg = scram.diagnostic()
            logging.critical("Error running SCRAM")
            logging.critical(msg)
            raise WMExecutionFailure(50513, "ScramSetupFailure", msg)
        runtimeOutcome = scram.runtime()
        if runtimeOutcome > 0:
            msg = scram.diagnostic()
            logging.critical("Error running SCRAM")
            logging.critical(msg)
            raise WMExecutionFailure(50513, "ScramSetupFailure", msg)

        #
        # pre scripts
        #
        logging.info("Running PRE scripts")
        for script in self.step.runtime.preScripts:
            # TODO: Exception handling and error handling & logging
            scriptProcess = subprocess.Popen(
                ["/bin/bash"],
                shell=True,
                cwd=self.step.builder.workingDir,
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
                stdin=subprocess.PIPE,
            )
            # BADPYTHON
            scriptProcess.stdin.write(
                "export LD_LIBRARY_PATH=$LD_LIBRARY_PATH\n")
            invokeCommand = "%s -m WMCore.WMRuntime.ScriptInvoke %s %s \n" % (
                sys.executable, stepModule, script)
            logging.info("    Invoking command: %s", invokeCommand)
            scriptProcess.stdin.write(invokeCommand)
            stdout, stderr = scriptProcess.communicate()
            retCode = scriptProcess.returncode
            if retCode > 0:
                msg = "Error running command\n%s\n" % invokeCommand
                msg += "%s\n %s\n %s\n" % (retCode, stdout, stderr)
                logging.critical("Error running command")
                logging.critical(msg)
                raise WMExecutionFailure(50513, "PreScriptFailure", msg)

        #
        # pre scripts with scram
        #
        logging.info("RUNNING SCRAM SCRIPTS")
        for script in self.step.runtime.scramPreScripts:
            # invoke scripts with scram()
            runtimeDir = getattr(self.step.runtime, 'scramPreDir', None)
            invokeCommand = self.step.runtime.invokeCommand if hasattr(self.step.runtime, 'invokeCommand') else \
                "%s -m WMCore.WMRuntime.ScriptInvoke %s" % (sys.executable, stepModule)
            invokeCommand += " %s \n" % script
            retCode = scram(invokeCommand, runtimeDir=runtimeDir)
            if retCode > 0:
                msg = "Error running command\n%s\n" % invokeCommand
                msg += "%s\n " % retCode
                msg += scram.diagnostic()
                logging.critical(msg)
                raise WMExecutionFailure(50513, "PreScriptScramFailure", msg)

        configPath = "%s/%s-main.sh" % (self.step.builder.workingDir,
                                        self.stepName)
        handle = open(configPath, 'w')
        handle.write(CONFIG_BLOB)
        handle.close()

        # spawn this new process
        # the script looks for:
        # <SCRAM_COMMAND> <SCRAM_PROJECT> <CMSSW_VERSION> <JOB_REPORT> <EXECUTABLE> <CONFIG>
        # open the output files
        stdoutHandle = open(self.step.output.stdout, 'w')
        stderrHandle = open(self.step.output.stderr, 'w')
        args = [
            '/bin/bash', configPath, scramSetup, scramArch, scramCommand,
            scramProject, cmsswVersion, jobReportXML, cmsswCommand,
            cmsswConfig, userTarball, userFiles, cmsswArguments
        ]
        logging.info("Executing CMSSW. args: %s", args)

        # possibly needed environment overrides for CMSSW call go here
        envOverride = {}
        # work around problem with GSI authentication plugin and EOS at CERN
        if socket.getfqdn().endswith("cern.ch"):
            envOverride['XRD_LOADBALANCERTTL'] = "86400"
        # some libraries linked with CMSSW need HOME in the environment
        if 'HOME' not in os.environ:
            envOverride['HOME'] = os.environ.get('PWD', "/")

        os.environ.update(envOverride)

        returncode = subprocess.call(args,
                                     stdout=stdoutHandle,
                                     stderr=stderrHandle)

        self.setCondorChirpAttrDelayed('Chirp_WMCore_cmsRun_ExitCode',
                                       returncode)
        self.setCondorChirpAttrDelayed(
            'Chirp_WMCore_%s_ExitCode' % self.stepName, returncode)

        stdoutHandle.close()
        stderrHandle.close()

        self.step.execution.exitStatus = returncode
        argsDump = {'arguments': args}

        if returncode != 0:
            msg = "Error running cmsRun\n%s\n" % argsDump
            msg += "Return code: %s\n" % returncode
            logging.critical(msg)
            raise WMExecutionFailure(returncode, "CmsRunFailure", msg)

        try:
            self.report.parse(jobReportXML, stepName=self.stepName)
        except Exception as ex:
            # Catch it if something goes wrong
            raise WMExecutionFailure(50115, "BadJobReportXML", str(ex))

        stepHelper = WMStepHelper(self.step)
        typeHelper = stepHelper.getTypeHelper()

        acquisitionEra = self.task.getAcquisitionEra()
        processingVer = self.task.getProcessingVersion()
        processingStr = self.task.getProcessingString()
        validStatus = self.workload.getValidStatus()
        inputPath = self.task.getInputDatasetPath()
        globalTag = typeHelper.getGlobalTag()
        prepID = self.task.getPrepID()
        campaign = self.workload.getCampaign()
        cacheUrl, cacheDB, configID = stepHelper.getConfigInfo()

        self.report.setValidStatus(validStatus=validStatus)
        self.report.setGlobalTag(globalTag=globalTag)
        self.report.setCampaign(campaign)
        self.report.setPrepID(prepID)
        self.report.setInputDataset(inputPath=inputPath)
        self.report.setAcquisitionProcessing(acquisitionEra=acquisitionEra,
                                             processingVer=processingVer,
                                             processingStr=processingStr)
        self.report.setConfigURL(configURL="%s;;%s;;%s" %
                                 (cacheUrl, cacheDB, configID))

        # Attach info to files
        self.report.addInfoToOutputFilesForStep(stepName=self.stepName,
                                                step=self.step)

        self.report.checkForOutputFiles(stepName=self.stepName)
        self.report.checkForAdlerChecksum(stepName=self.stepName)
        self.report.checkForRunLumiInformation(stepName=self.stepName)

        if self.step.output.keep != True:
            self.report.killOutput()
        else:
            # Check that we only keep the desired output
            for module in stepHelper.getIgnoredOutputModules():
                self.report.deleteOutputModuleForStep(stepName=self.stepName,
                                                      moduleName=module)

        # Add stageout LFN to existing TFileService files
        reportAnalysisFiles = self.report.getAnalysisFilesFromStep(
            self.stepName)
        for reportAnalysisFile in reportAnalysisFiles:
            newLFN = analysisFileLFN(reportAnalysisFile.fileName,
                                     self.step.user.lfnBase, self.job)
            addAttributesToFile(reportAnalysisFile,
                                pfn=reportAnalysisFile.fileName,
                                lfn=newLFN,
                                validate=False)

        # Add analysis file entries for additional files listed in workflow
        for fileName in stepHelper.listAnalysisFiles():
            analysisFile = stepHelper.getAnalysisFile(fileName)
            if os.path.isfile(analysisFile.fileName):
                newLFN = analysisFileLFN(analysisFile.fileName,
                                         analysisFile.lfnBase, self.job)
                self.report.addAnalysisFile(analysisFile.fileName,
                                            lfn=newLFN,
                                            Source='UserDefined',
                                            pfn=os.path.join(
                                                os.getcwd(),
                                                analysisFile.fileName),
                                            validate=False)

        return
Пример #6
0
    def execute(self, emulator=None):
        """
        _execute_

        """
        # Are we using emulators again?
        if emulator is not None:
            return emulator.emulate(self.step, self.job)

        logging.info("Steps.Executors.%s.execute called", self.__class__.__name__)

        scramCommand = self.step.application.setup.scramCommand
        cmsswVersion = self.step.application.setup.cmsswVersion
        scramArch = getSingleScramArch(self.step.application.setup.scramArch)
        overrideCatalog = getattr(self.step.application, 'overrideCatalog', None)

        overrides = {}
        if hasattr(self.step, 'override'):
            overrides = self.step.override.dictionary_()

        # Set wait to over an hour
        waitTime = overrides.get('waitTime', 3600 + (self.step.retryDelay * self.step.retryCount))

        # hardcode CERN EOS T2_CH_CERN stageout parameters
        eosStageOutParams = {}
        eosStageOutParams['command'] = overrides.get('command', "xrdcp")
        eosStageOutParams['option'] = overrides.get('option', "--wma-disablewriterecovery")
        eosStageOutParams['phedex-node'] = overrides.get('phedex-node', "T2_CH_CERN")
        eosStageOutParams['lfn-prefix'] = overrides.get('lfn-prefix', "root://eoscms.cern.ch//eos/cms")

        try:
            eosStageOutMgr = StageOutMgr(**eosStageOutParams)
            stageInMgr = StageInMgr()
            deleteMgr = DeleteMgr()
        except Exception as ex:
            msg = "Unable to load StageOut/Delete Impl: %s" % str(ex)
            logging.error(msg)
            raise WMExecutionFailure(60312, "MgrImplementationError", msg)

        # prepare output tar file
        taskName = self.report.getTaskName().split('/')[-1]
        host = socket.gethostname().split('.')[0]
        tarName = '%s-%s-%s-%i-logs.tar' % (self.report.data.workload, taskName, host, self.job["counter"])
        tarLocation = os.path.join(self.stepSpace.location, tarName)

        # Supported by any release beyond CMSSW_8_X, however DaviX is broken until CMSSW_10_4_X
        # see: https://github.com/cms-sw/cmssw/issues/25292
        useEdmCopyUtil = True
        if isCMSSWSupported(cmsswVersion, "CMSSW_10_4_0"):
            pass
        elif scramArch.startswith('slc7_amd64_'):
            msg = "CMSSW too old or not fully functional to support edmCopyUtil, using CMSSW_10_4_0 instead"
            logging.warning(msg)
            cmsswVersion = "CMSSW_10_4_0"
            scramArch = "slc7_amd64_gcc820"
        elif scramArch.startswith('slc6_amd64_'):
            msg = "CMSSW too old or not fully functional to support edmCopyUtil, using CMSSW_10_4_0 instead"
            logging.warning(msg)
            cmsswVersion = "CMSSW_10_4_0"
            scramArch = "slc6_amd64_gcc700"
        else:
            useEdmCopyUtil = False
        logging.info("Using edmCopyUtil: %s", useEdmCopyUtil)

        # setup Scram needed to run edmCopyUtil
        if useEdmCopyUtil:
            scram = Scram(
                    command=scramCommand,
                    version=cmsswVersion,
                    initialise=self.step.application.setup.softwareEnvironment,
                    directory=self.step.builder.workingDir,
                    architecture=scramArch,
            )
            logging.info("Running scram")
            try:
                projectOutcome = scram.project()
            except Exception as ex:
                msg = "Exception raised while running scram.\n"
                msg += str(ex)
                logging.critical("Error running SCRAM")
                logging.critical(msg)
                raise WMExecutionFailure(50513, "ScramSetupFailure", msg)

            if projectOutcome > 0:
                msg = scram.diagnostic()
                logging.critical("Error running SCRAM")
                logging.critical(msg)
                raise WMExecutionFailure(50513, "ScramSetupFailure", msg)
            runtimeOutcome = scram.runtime()
            if runtimeOutcome > 0:
                msg = scram.diagnostic()
                logging.critical("Error running SCRAM")
                logging.critical(msg)
                raise WMExecutionFailure(50513, "ScramSetupFailure", msg)

        # iterate through input files
        localLogs = []
        deleteLogArchives = []
        if useEdmCopyUtil:
            numberOfFilesPerCopy = 10
        else:
            numberOfFilesPerCopy = 1
        for logs in grouper(self.job["input_files"], numberOfFilesPerCopy):

            copyCommand = "env X509_USER_PROXY=%s edmCopyUtil" % os.environ.get('X509_USER_PROXY', None)

            # specify TFC if necessary
            if overrideCatalog:
                copyCommand += " -c %s" % overrideCatalog

            for log in logs:
                copyCommand += " %s" % log['lfn']
            copyCommand += " %s" % self.step.builder.workingDir

            # give up after timeout of 1 minute per input file
            signal.signal(signal.SIGALRM, alarmHandler)
            signal.alarm(60 * numberOfFilesPerCopy)

            filesCopied = False
            try:
                if useEdmCopyUtil:
                    logging.info("Running edmCopyUtil")
                    retval = scram(copyCommand)
                    if retval == 0:
                        filesCopied = True
                else:
                    logging.info("Running stageIn")
                    for log in logs:
                        fileInfo = {"LFN": log['lfn']}
                        logArchive = stageInMgr(**fileInfo)
                        if logArchive:
                            filesCopied = True
            except Alarm:
                logging.error("Indefinite hang during edmCopyUtil/stageIn of logArchives")
            except StageOutFailure:
                logging.error("Unable to stageIn logArchives")
            except Exception:
                raise

            signal.alarm(0)

            if filesCopied:
                for log in logs:
                    localLogs.append(os.path.join(self.step.builder.workingDir, os.path.basename(log['lfn'])))
                    deleteLogArchives.append(log)
                    self.report.addInputFile(sourceName="logArchives", lfn=log['lfn'])
            else:
                logging.error("Unable to copy logArchives to local disk")
                for log in logs:
                    self.report.addSkippedFile(log['lfn'], None)

        # create tarfile if any logArchive copied in
        if localLogs:
            with tarfile.open(tarLocation, 'w:') as tarFile:
                for log in localLogs:
                    path = log.split('/')
                    tarFile.add(name=log,
                                arcname=os.path.join(path[-3], path[-2], path[-1]))
                    os.remove(log)
        else:
            msg = "Unable to copy any logArchives to local disk"
            logging.error(msg)
            raise WMExecutionFailure(60312, "LogCollectError", msg)

        # now staging out the LogCollect tarfile
        logging.info("Staging out LogCollect tarfile to EOS (skipping CASTOR)")
        now = datetime.datetime.now()
        lfn = "/store/logs/prod/%i/%.2i/%s/%s/%s" % (now.year, now.month, "WMAgent",
                                                     self.report.data.workload,
                                                     os.path.basename(tarLocation))

        tarInfo = {'LFN': lfn,
                   'PFN': tarLocation,
                   'PNN': None,
                   'GUID': None}

        # perform mandatory stage out to CERN EOS
        signal.signal(signal.SIGALRM, alarmHandler)
        signal.alarm(waitTime)
        try:
            eosStageOutMgr(tarInfo)
        except Alarm:
            msg = "Indefinite hang during stageOut of LogCollect to EOS"
            logging.error(msg)
            raise WMExecutionFailure(60409, "LogCollectTimeout", msg)
        except Exception as ex:
            msg = "Unable to stageOut LogCollect to Castor:\n"
            msg += str(ex)
            logging.error(msg)
            raise WMExecutionFailure(60408, "LogCollectStageOutError", msg)
        signal.alarm(0)

        # add to job report
        self.report.addOutputFile(outputModule="LogCollect", aFile=tarInfo)
        outputRef = getattr(self.report.data, self.stepName)
        outputRef.output.pfn = tarInfo['PFN']
        outputRef.output.location = tarInfo['PNN']
        outputRef.output.lfn = tarInfo['LFN']

        # we got this far, delete ALL input files assigned to this job
        for log in self.job["input_files"]:
            # give up after timeout of 1 minutes
            signal.signal(signal.SIGALRM, alarmHandler)
            signal.alarm(60)
            try:
                fileToDelete = {'LFN': log['lfn'],
                                'PFN': None,
                                'PNN': None,
                                'StageOutCommand': None}
                deleteMgr(fileToDelete=fileToDelete)
            except Alarm:
                logging.error("Indefinite hang during delete of logArchive")
            except Exception as ex:
                logging.error("Unable to delete logArchive: %s", ex)
            signal.alarm(0)

        return
Пример #7
0
        #logging.getLogger().addHandler(logHandler)

        if ad and not "CRAB3_RUNTIME_DEBUG" in os.environ:
            startDashboardMonitoring(ad)
        print("==== CMSSW Stack Execution STARTING at %s ====" %
              time.asctime(time.gmtime()))
        scr = Scram(
            version=options.cmsswVersion,
            directory=os.getcwd(),
            architecture=options.scramArch,
        )

        print("==== SCRAM Obj CREATED at %s ====" %
              time.asctime(time.gmtime()))
        if scr.project() or scr.runtime():  #if any of the two commands fail...
            dgn = scr.diagnostic()
            handleException("FAILED", EC_CMSMissingSoftware,
                            'Error setting CMSSW environment: %s' % dgn)
            mintime()
            sys.exit(EC_CMSMissingSoftware)

        extractUserSandbox(options.archiveJob, options.cmsswVersion)

        try:
            jobExitCode = None
            if options.scriptExe == 'None':
                print("==== CMSSW JOB Execution started at %s ====" %
                      time.asctime(time.gmtime()))
                cmsswSt = executeCMSSWStack(options, scr)
                jobExitCode = cmsswSt.step.execution.exitStatus
            else:
Пример #8
0
        if ad and not "CRAB3_RUNTIME_DEBUG" in os.environ:
            startDashboardMonitoring(ad)
        print("==== CMSSW Stack Execution STARTING at %s ====" %
              time.asctime(time.gmtime()))
        scram = Scram(
            version=opts.cmsswVersion,
            directory=os.getcwd(),
            architecture=opts.scramArch,
        )

        print("==== SCRAM Obj CREATED at %s ====" %
              time.asctime(time.gmtime()))
        if scram.project() or scram.runtime(
        ):  #if any of the two commands fail...
            msg = scram.diagnostic()
            handleException("FAILED", EC_CMSMissingSoftware,
                            'Error setting CMSSW environment: %s' % msg)
            mintime()
            sys.exit(EC_CMSMissingSoftware)

        extractUserSandbox(opts.archiveJob, opts.cmsswVersion)

        try:
            jobExitCode = None
            if opts.scriptExe == 'None':
                print("==== CMSSW JOB Execution started at %s ====" %
                      time.asctime(time.gmtime()))
                cmssw = executeCMSSWStack(opts, scram)
                jobExitCode = cmssw.step.execution.exitStatus
            else:
Пример #9
0
    try:
        setupLogging('.')

        print("==== CMSSW Stack Execution STARTING at %s ====" %
              time.asctime(time.gmtime()))
        scram = Scram(
            version=options.cmsswVersion,
            directory=os.getcwd(),
            architecture=options.scramArch,
        )

        print("==== SCRAM Obj CREATED at %s ====" %
              time.asctime(time.gmtime()))
        if scram.project() or scram.runtime(
        ):  #if any of the two commands fail...
            dgn = scram.diagnostic()
            handleException("FAILED", EC_CMSMissingSoftware,
                            'Error setting CMSSW environment: %s' % dgn)
            mintime()
            sys.exit(EC_CMSMissingSoftware)
        print("==== SCRAM Obj INITIALIZED at %s ====" %
              time.asctime(time.gmtime()))

        print("==== Extract user sandbox in CMSSW directory ====")
        extractUserSandbox(options.archiveJob, options.cmsswVersion)

        # tweaking of the PSet is needed both for CMSSWStack and ScriptEXE
        print("==== Tweak PSet at %s ====" % time.asctime(time.gmtime()))
        tweakingScriptName = 'tweakThePset.sh'
        prepareTweakingScript(options, tweakingScriptName)
        command = 'sh %s' % tweakingScriptName
Пример #10
0
    def execute(self, emulator=None):
        """
        _execute_

        """
        scramCommand = self.step.application.setup.scramCommand
        scramArch = self.step.application.setup.scramArch
        cmsswVersion = self.step.application.setup.cmsswVersion

        scramArch = getSingleScramArch(scramArch)

        # Are we using emulators again?
        if emulator is not None:
            return emulator.emulate(self.step, self.job)

        overrides = {}
        if hasattr(self.step, 'override'):
            overrides = self.step.override.dictionary_()

        # Set wait to over an hour
        waitTime = overrides.get('waitTime', 3600 + (self.step.retryDelay * self.step.retryCount))

        # hardcode CERN Castor T0_CH_CERN_MSS stageout parameters
        castorStageOutParams = {}
        castorStageOutParams['command'] = overrides.get('command', "xrdcp")
        castorStageOutParams['option'] = overrides.get('option', "--wma-cerncastor")
        castorStageOutParams['phedex-node'] = overrides.get('phedex-node', "T2_CH_CERN")
        castorStageOutParams['lfn-prefix'] = overrides.get('lfn-prefix', "root://castorcms.cern.ch//castor/cern.ch/cms")

        # hardcode CERN EOS T2_CH_CERN stageout parameters
        eosStageOutParams = {}
        eosStageOutParams['command'] = overrides.get('command', "xrdcp")
        eosStageOutParams['option'] = overrides.get('option', "--wma-disablewriterecovery")
        eosStageOutParams['phedex-node'] = overrides.get('phedex-node', "T2_CH_CERN")
        eosStageOutParams['lfn-prefix'] = overrides.get('lfn-prefix', "root://eoscms.cern.ch//eos/cms")

        try:
            castorStageOutMgr = StageOutMgr(**castorStageOutParams)
            eosStageOutMgr = StageOutMgr(**eosStageOutParams)
            stageInMgr = StageInMgr()
            deleteMgr = DeleteMgr()
        except Exception as ex:
            msg = "Unable to load StageOut/Delete Impl: %s" % str(ex)
            logging.error(msg)
            raise WMExecutionFailure(60312, "MgrImplementationError", msg)

        # prepare output tar file
        taskName = self.report.getTaskName().split('/')[-1]
        host = socket.gethostname().split('.')[0]
        tarName = '%s-%s-%s-%i-logs.tar' % (self.report.data.workload, taskName, host, self.job["counter"])
        tarLocation = os.path.join(self.stepSpace.location, tarName)

        # Supported by any release beyond CMSSW_8_X, however DaviX is broken until CMSSW_10_4_X
        # see: https://github.com/cms-sw/cmssw/issues/25292
        useEdmCopyUtil = True
        if isCMSSWSupported(cmsswVersion, "CMSSW_10_4_0"):
            pass
        elif scramArch.startswith('slc6_amd64_'):
            msg = "CMSSW too old or not fully functional to support edmCopyUtil, using CMSSW_10_4_0 instead"
            logging.warning(msg)
            cmsswVersion = "CMSSW_10_4_0"
            scramArch = "slc6_amd64_gcc700"
        else:
            useEdmCopyUtil = False
        logging.info("Using edmCopyUtil: %s", useEdmCopyUtil)

        # setup Scram needed to run edmCopyUtil
        if useEdmCopyUtil:
            scram = Scram(
                command=scramCommand,
                version=cmsswVersion,
                initialise=self.step.application.setup.softwareEnvironment,
                directory=self.step.builder.workingDir,
                architecture=scramArch,
            )
            logging.info("Running scram")
            try:
                projectOutcome = scram.project()
            except Exception as ex:
                msg = "Exception raised while running scram.\n"
                msg += str(ex)
                logging.critical("Error running SCRAM")
                logging.critical(msg)
                raise WMExecutionFailure(50513, "ScramSetupFailure", msg)

            if projectOutcome > 0:
                msg = scram.diagnostic()
                logging.critical("Error running SCRAM")
                logging.critical(msg)
                raise WMExecutionFailure(50513, "ScramSetupFailure", msg)
            runtimeOutcome = scram.runtime()
            if runtimeOutcome > 0:
                msg = scram.diagnostic()
                logging.critical("Error running SCRAM")
                logging.critical(msg)
                raise WMExecutionFailure(50513, "ScramSetupFailure", msg)

        # iterate through input files
        localLogs = []
        deleteLogArchives = []
        if useEdmCopyUtil:
            numberOfFilesPerCopy = 10
        else:
            numberOfFilesPerCopy = 1
        for logs in grouper(self.job["input_files"], numberOfFilesPerCopy):

            copyCommand = "env X509_USER_PROXY=%s edmCopyUtil" % os.environ.get('X509_USER_PROXY', None)
            for log in logs:
                copyCommand += " %s" % log['lfn']
            copyCommand += " %s" % self.step.builder.workingDir

            # give up after timeout of 1 minute per input file
            signal.signal(signal.SIGALRM, alarmHandler)
            signal.alarm(60 * numberOfFilesPerCopy)

            filesCopied = False
            try:
                if useEdmCopyUtil:
                    logging.info("Running edmCopyUtil")
                    retval = scram(copyCommand)
                    if retval == 0:
                        filesCopied = True
                else:
                    logging.info("Running stageIn")
                    for log in logs:
                        fileInfo = {"LFN": log['lfn']}
                        logArchive = stageInMgr(**fileInfo)
                        if logArchive:
                            filesCopied = True
            except Alarm:
                logging.error("Indefinite hang during edmCopyUtil/stageIn of logArchives")
            except StageOutFailure:
                logging.error("Unable to stageIn logArchives")
            except Exception:
                raise

            signal.alarm(0)

            if filesCopied:
                for log in logs:
                    localLogs.append(os.path.join(self.step.builder.workingDir, os.path.basename(log['lfn'])))
                    deleteLogArchives.append(log)
                    self.report.addInputFile(sourceName="logArchives", lfn=log['lfn'])
            else:
                logging.error("Unable to copy logArchives to local disk")
                for log in logs:
                    self.report.addSkippedFile(log['lfn'], None)

        # create tarfile if any logArchive copied in
        if localLogs:
            tarFile = tarfile.open(tarLocation, 'w:')
            for log in localLogs:
                path = log.split('/')
                tarFile.add(name=log,
                            arcname=os.path.join(path[-3],
                                                 path[-2],
                                                 path[-1]))
                os.remove(log)
            tarFile.close()
        else:
            msg = "Unable to copy any logArchives to local disk"
            logging.error(msg)
            raise WMExecutionFailure(60312, "LogCollectError", msg)

        # now staging out the LogCollect tarfile
        logging.info("Staging out LogCollect tarfile to Castor and EOS")
        now = datetime.datetime.now()
        lfn = "/store/logs/prod/%i/%.2i/%s/%s/%s" % (now.year, now.month, "WMAgent",
                                                     self.report.data.workload,
                                                     os.path.basename(tarLocation))

        tarInfo = {'LFN': lfn,
                   'PFN': tarLocation,
                   'PNN': None,
                   'GUID': None}

        # perform mandatory stage out to CERN Castor
        signal.signal(signal.SIGALRM, alarmHandler)
        signal.alarm(waitTime)
        try:
            castorStageOutMgr(tarInfo)
        except Alarm:
            msg = "Indefinite hang during stageOut of LogCollect to Castor"
            logging.error(msg)
            raise WMExecutionFailure(60409, "LogCollectTimeout", msg)
        except Exception as ex:
            msg = "Unable to stageOut LogCollect to Castor:\n"
            msg += str(ex)
            logging.error(msg)
            raise WMExecutionFailure(60408, "LogCollectStageOutError", msg)
        signal.alarm(0)

        # add to job report
        self.report.addOutputFile(outputModule="LogCollect", aFile=tarInfo)
        outputRef = getattr(self.report.data, self.stepName)
        outputRef.output.pfn = tarInfo['PFN']
        outputRef.output.location = tarInfo['PNN']
        outputRef.output.lfn = tarInfo['LFN']

        tarInfo = {'LFN': lfn,
                   'PFN': tarLocation,
                   'PNN': None,
                   'GUID': None}

        # then, perform best effort stage out to CERN EOS
        signal.signal(signal.SIGALRM, alarmHandler)
        signal.alarm(waitTime)
        try:
            eosStageOutMgr(tarInfo)
        except Alarm:
            logging.error("Indefinite hang during stageOut of LogCollect to EOS")
        except Exception as ex:
            logging.error("Unable to stageOut LogCollect to EOS: %s", ex)
        signal.alarm(0)

        # we got this far, delete input
        for log in deleteLogArchives:

            # give up after timeout of 1 minutes
            signal.signal(signal.SIGALRM, alarmHandler)
            signal.alarm(60)
            try:
                fileToDelete = {'LFN': log['lfn'],
                                'PFN': None,
                                'PNN': None,
                                'StageOutCommand': None}
                deleteMgr(fileToDelete=fileToDelete)
            except Alarm:
                logging.error("Indefinite hang during delete of logArchive")
            except Exception as ex:
                logging.error("Unable to delete logArchive: %s", ex)
            signal.alarm(0)

        return
Пример #11
0
        logging.Formatter.converter = time.gmtime
        logHandler.setFormatter(logFormatter)
        logging.getLogger().addHandler(logHandler)

        if ad and not "CRAB3_RUNTIME_DEBUG" in os.environ:
            startDashboardMonitoring(ad)
        print("==== CMSSW Stack Execution STARTING at %s ====" % time.asctime(time.gmtime()))
        scr = Scram(
            version = options.cmsswVersion,
            directory = os.getcwd(),
            architecture = options.scramArch,
            )

        print("==== SCRAM Obj CREATED at %s ====" % time.asctime(time.gmtime()))
        if scr.project() or scr.runtime(): #if any of the two commands fail...
            dgn = scr.diagnostic()
            handleException("FAILED", EC_CMSMissingSoftware, 'Error setting CMSSW environment: %s' % dgn)
            mintime()
            sys.exit(EC_CMSMissingSoftware)

        extractUserSandbox(options.archiveJob, options.cmsswVersion)

        try:
            jobExitCode = None
            if options.scriptExe=='None':
                print("==== CMSSW JOB Execution started at %s ====" % time.asctime(time.gmtime()))
                cmsswSt = executeCMSSWStack(options, scr)
                jobExitCode = cmsswSt.step.execution.exitStatus
            else:
                print("==== ScriptEXE Execution started at %s ====" % time.asctime(time.gmtime()))
                jobExitCode = executeScriptExe(options, scr)