示例#1
0
    def execute(self, emulator=None):
        """
        _execute_

        """
        scramCommand = self.step.application.setup.scramCommand
        scramArch = self.step.application.setup.scramArch
        cmsswVersion = self.step.application.setup.cmsswVersion

        # Are we using emulators again?
        if (emulator != None):
            return emulator.emulate(self.step, self.job)

        overrides = {}
        if hasattr(self.step, 'override'):
            overrides = self.step.override.dictionary_()

        # Set wait to over an hour
        waitTime = overrides.get(
            'waitTime', 3600 + (self.step.retryDelay * self.step.retryCount))

        # hardcode CERN Castor T0_CH_CERN_MSS stageout parameters
        castorStageOutParams = {}
        castorStageOutParams['command'] = overrides.get('command', "xrdcp")
        castorStageOutParams['option'] = overrides.get('option',
                                                       "--cerncastor")
        castorStageOutParams['phedex-node'] = overrides.get(
            'phedex-node', "T2_CH_CERN")
        castorStageOutParams['lfn-prefix'] = overrides.get(
            'lfn-prefix', "root://castorcms.cern.ch//castor/cern.ch/cms")

        # hardcode CERN EOS T2_CH_CERN stageout parameters
        eosStageOutParams = {}
        eosStageOutParams['command'] = overrides.get('command', "xrdcp")
        eosStageOutParams['option'] = overrides.get('option', "")
        eosStageOutParams['phedex-node'] = overrides.get(
            'phedex-node', "T2_CH_CERN")
        eosStageOutParams['lfn-prefix'] = overrides.get(
            'lfn-prefix', "root://eoscms.cern.ch//eos/cms")

        # are we using the new stageout method ?
        useNewStageOutCode = False
        if getattr(self.step, 'newStageout', False) or \
            ('newStageOut' in overrides and overrides.get('newStageOut')):
            useNewStageOutCode = True

        try:
            castorStageOutMgr = StageOutMgr(**castorStageOutParams)
            eosStageOutMgr = StageOutMgr(**eosStageOutParams)
            stageInMgr = StageInMgr()
            deleteMgr = DeleteMgr()
        except Exception as ex:
            msg = "Unable to load StageOut/Delete Impl: %s" % str(ex)
            logging.error(msg)
            raise WMExecutionFailure(60312, "MgrImplementationError", msg)

        # prepare output tar file
        taskName = self.report.getTaskName().split('/')[-1]
        host = socket.gethostname().split('.')[0]
        tarName = '%s-%s-%s-%i-logs.tar' % (
            self.report.data.workload, taskName, host, self.job["counter"])
        tarLocation = os.path.join(self.stepSpace.location, tarName)

        # check if the cmsswVersion supports edmCopyUtil (min CMSSW_8_X)
        result = re.match("CMSSW_([0-9]+)_([0-9]+)_([0-9]+).*", cmsswVersion)
        useEdmCopyUtil = False
        if result:
            try:
                if int(result.group(1)) >= 8:
                    useEdmCopyUtil = True
            except ValueError:
                pass

        # setup Scram needed to run edmCopyUtil
        if useEdmCopyUtil:
            scram = Scram(
                command=scramCommand,
                version=cmsswVersion,
                initialise=self.step.application.setup.softwareEnvironment,
                directory=self.step.builder.workingDir,
                architecture=scramArch,
            )
            logging.info("Running scram")
            try:
                projectOutcome = scram.project()
            except Exception as ex:
                msg = "Exception raised while running scram.\n"
                msg += str(ex)
                logging.critical("Error running SCRAM")
                logging.critical(msg)
                raise WMExecutionFailure(50513, "ScramSetupFailure", msg)

            if projectOutcome > 0:
                msg = scram.diagnostic()
                logging.critical("Error running SCRAM")
                logging.critical(msg)
                raise WMExecutionFailure(50513, "ScramSetupFailure", msg)
            runtimeOutcome = scram.runtime()
            if runtimeOutcome > 0:
                msg = scram.diagnostic()
                logging.critical("Error running SCRAM")
                logging.critical(msg)
                raise WMExecutionFailure(50513, "ScramSetupFailure", msg)

        # iterate through input files
        localLogs = []
        deleteLogArchives = []
        if useEdmCopyUtil:
            numberOfFilesPerCopy = 10
        else:
            numberOfFilesPerCopy = 1
        for logs in grouper(self.job["input_files"], numberOfFilesPerCopy):

            copyCommand = "env X509_USER_PROXY=%s edmCopyUtil" % os.environ.get(
                'X509_USER_PROXY', None)
            for log in logs:
                copyCommand += " %s" % log['lfn']
            copyCommand += " %s" % self.step.builder.workingDir

            # give up after timeout of 1 minute per input file
            signal.signal(signal.SIGALRM, alarmHandler)
            signal.alarm(60 * numberOfFilesPerCopy)

            filesCopied = False
            try:
                if useEdmCopyUtil:
                    logging.info("Running edmCopyUtil")
                    retval = scram(copyCommand)
                    if retval == 0:
                        filesCopied = True
                else:
                    logging.info("Running stageIn")
                    for log in logs:
                        fileInfo = {"LFN": log['lfn']}
                        logArchive = stageInMgr(**fileInfo)
                        if logArchive:
                            filesCopied = True
            except Alarm:
                logging.error(
                    "Indefinite hang during edmCopyUtil/stageIn of logArchives"
                )
            except StageOutFailure:
                logging.error("Unable to stageIn logArchives")
            except Exception:
                raise

            signal.alarm(0)

            if filesCopied:
                for log in logs:
                    localLogs.append(
                        os.path.join(self.step.builder.workingDir,
                                     os.path.basename(log['lfn'])))
                    deleteLogArchives.append(log)
                    self.report.addInputFile(sourceName="logArchives",
                                             lfn=log['lfn'])
            else:
                logging.error("Unable to copy logArchives to local disk")
                if useEdmCopyUtil:
                    with open('scramOutput.log', 'r') as f:
                        logging.error("Scram output: %s", f.read())
                for log in logs:
                    self.report.addSkippedFile(log['lfn'], None)

        # create tarfile if any logArchive copied in
        if localLogs:
            tarFile = tarfile.open(tarLocation, 'w:')
            for log in localLogs:
                path = log.split('/')
                tarFile.add(name=log,
                            arcname=os.path.join(path[-3], path[-2], path[-1]))
                os.remove(log)
            tarFile.close()
        else:
            msg = "Unable to copy any logArchives to local disk"
            logging.error(msg)
            raise WMExecutionFailure(60312, "LogCollectError", msg)

        # now staging out the LogCollect tarfile
        logging.info("Staging out LogCollect tarfile to Castor and EOS")
        now = datetime.datetime.now()
        lfn = "/store/logs/prod/%i/%.2i/%s/%s/%s" % (
            now.year, now.month, "WMAgent", self.report.data.workload,
            os.path.basename(tarLocation))

        tarInfo = {'LFN': lfn, 'PFN': tarLocation, 'PNN': None, 'GUID': None}

        # perform mandatory stage out to CERN Castor
        signal.signal(signal.SIGALRM, alarmHandler)
        signal.alarm(waitTime)
        try:
            castorStageOutMgr(tarInfo)
        except Alarm:
            msg = "Indefinite hang during stageOut of LogCollect to Castor"
            logging.error(msg)
            raise WMExecutionFailure(60409, "LogCollectTimeout", msg)
        except Exception as ex:
            msg = "Unable to stageOut LogCollect to Castor:\n"
            msg += str(ex)
            logging.error(msg)
            raise WMExecutionFailure(60408, "LogCollectStageOutError", msg)
        signal.alarm(0)

        # add to job report
        self.report.addOutputFile(outputModule="LogCollect", file=tarInfo)
        outputRef = getattr(self.report.data, self.stepName)
        outputRef.output.pfn = tarInfo['PFN']
        outputRef.output.location = tarInfo['PNN']
        outputRef.output.lfn = tarInfo['LFN']

        tarInfo = {'LFN': lfn, 'PFN': tarLocation, 'PNN': None, 'GUID': None}

        # then, perform best effort stage out to CERN EOS
        signal.signal(signal.SIGALRM, alarmHandler)
        signal.alarm(waitTime)
        try:
            eosStageOutMgr(tarInfo)
        except Alarm:
            logging.error(
                "Indefinite hang during stageOut of LogCollect to EOS")
        except Exception as ex:
            logging.error("Unable to stageOut LogCollect to EOS:\n", ex)
        signal.alarm(0)

        # we got this far, delete input
        for log in deleteLogArchives:

            # give up after timeout of 1 minutes
            signal.signal(signal.SIGALRM, alarmHandler)
            signal.alarm(60)
            try:
                fileToDelete = {
                    'LFN': log['lfn'],
                    'PFN': None,
                    'PNN': None,
                    'StageOutCommand': None
                }
                deleteMgr(fileToDelete=fileToDelete)
            except Alarm:
                logging.error("Indefinite hang during delete of logArchive")
            except Exception as ex:
                logging.error("Unable to delete logArchive: %s", ex)
            signal.alarm(0)

        return
示例#2
0
    def execute(self, emulator=None):
        """
        _execute_

        """
        # Are we using emulators again?
        if emulator is not None:
            return emulator.emulate(self.step, self.job)

        logging.info("Steps.Executors.%s.execute called", self.__class__.__name__)

        scramCommand = self.step.application.setup.scramCommand
        cmsswVersion = self.step.application.setup.cmsswVersion
        scramArch = getSingleScramArch(self.step.application.setup.scramArch)
        overrideCatalog = getattr(self.step.application, 'overrideCatalog', None)

        overrides = {}
        if hasattr(self.step, 'override'):
            overrides = self.step.override.dictionary_()

        # Set wait to over an hour
        waitTime = overrides.get('waitTime', 3600 + (self.step.retryDelay * self.step.retryCount))

        # hardcode CERN EOS T2_CH_CERN stageout parameters
        eosStageOutParams = {}
        eosStageOutParams['command'] = overrides.get('command', "xrdcp")
        eosStageOutParams['option'] = overrides.get('option', "--wma-disablewriterecovery")
        eosStageOutParams['phedex-node'] = overrides.get('phedex-node', "T2_CH_CERN")
        eosStageOutParams['lfn-prefix'] = overrides.get('lfn-prefix', "root://eoscms.cern.ch//eos/cms")

        try:
            eosStageOutMgr = StageOutMgr(**eosStageOutParams)
            stageInMgr = StageInMgr()
            deleteMgr = DeleteMgr()
        except Exception as ex:
            msg = "Unable to load StageOut/Delete Impl: %s" % str(ex)
            logging.error(msg)
            raise WMExecutionFailure(60312, "MgrImplementationError", msg)

        # prepare output tar file
        taskName = self.report.getTaskName().split('/')[-1]
        host = socket.gethostname().split('.')[0]
        tarName = '%s-%s-%s-%i-logs.tar' % (self.report.data.workload, taskName, host, self.job["counter"])
        tarLocation = os.path.join(self.stepSpace.location, tarName)

        # Supported by any release beyond CMSSW_8_X, however DaviX is broken until CMSSW_10_4_X
        # see: https://github.com/cms-sw/cmssw/issues/25292
        useEdmCopyUtil = True
        if isCMSSWSupported(cmsswVersion, "CMSSW_10_4_0"):
            pass
        elif scramArch.startswith('slc7_amd64_'):
            msg = "CMSSW too old or not fully functional to support edmCopyUtil, using CMSSW_10_4_0 instead"
            logging.warning(msg)
            cmsswVersion = "CMSSW_10_4_0"
            scramArch = "slc7_amd64_gcc820"
        elif scramArch.startswith('slc6_amd64_'):
            msg = "CMSSW too old or not fully functional to support edmCopyUtil, using CMSSW_10_4_0 instead"
            logging.warning(msg)
            cmsswVersion = "CMSSW_10_4_0"
            scramArch = "slc6_amd64_gcc700"
        else:
            useEdmCopyUtil = False
        logging.info("Using edmCopyUtil: %s", useEdmCopyUtil)

        # setup Scram needed to run edmCopyUtil
        if useEdmCopyUtil:
            scram = Scram(
                    command=scramCommand,
                    version=cmsswVersion,
                    initialise=self.step.application.setup.softwareEnvironment,
                    directory=self.step.builder.workingDir,
                    architecture=scramArch,
            )
            logging.info("Running scram")
            try:
                projectOutcome = scram.project()
            except Exception as ex:
                msg = "Exception raised while running scram.\n"
                msg += str(ex)
                logging.critical("Error running SCRAM")
                logging.critical(msg)
                raise WMExecutionFailure(50513, "ScramSetupFailure", msg)

            if projectOutcome > 0:
                msg = scram.diagnostic()
                logging.critical("Error running SCRAM")
                logging.critical(msg)
                raise WMExecutionFailure(50513, "ScramSetupFailure", msg)
            runtimeOutcome = scram.runtime()
            if runtimeOutcome > 0:
                msg = scram.diagnostic()
                logging.critical("Error running SCRAM")
                logging.critical(msg)
                raise WMExecutionFailure(50513, "ScramSetupFailure", msg)

        # iterate through input files
        localLogs = []
        deleteLogArchives = []
        if useEdmCopyUtil:
            numberOfFilesPerCopy = 10
        else:
            numberOfFilesPerCopy = 1
        for logs in grouper(self.job["input_files"], numberOfFilesPerCopy):

            copyCommand = "env X509_USER_PROXY=%s edmCopyUtil" % os.environ.get('X509_USER_PROXY', None)

            # specify TFC if necessary
            if overrideCatalog:
                copyCommand += " -c %s" % overrideCatalog

            for log in logs:
                copyCommand += " %s" % log['lfn']
            copyCommand += " %s" % self.step.builder.workingDir

            # give up after timeout of 1 minute per input file
            signal.signal(signal.SIGALRM, alarmHandler)
            signal.alarm(60 * numberOfFilesPerCopy)

            filesCopied = False
            try:
                if useEdmCopyUtil:
                    logging.info("Running edmCopyUtil")
                    retval = scram(copyCommand)
                    if retval == 0:
                        filesCopied = True
                else:
                    logging.info("Running stageIn")
                    for log in logs:
                        fileInfo = {"LFN": log['lfn']}
                        logArchive = stageInMgr(**fileInfo)
                        if logArchive:
                            filesCopied = True
            except Alarm:
                logging.error("Indefinite hang during edmCopyUtil/stageIn of logArchives")
            except StageOutFailure:
                logging.error("Unable to stageIn logArchives")
            except Exception:
                raise

            signal.alarm(0)

            if filesCopied:
                for log in logs:
                    localLogs.append(os.path.join(self.step.builder.workingDir, os.path.basename(log['lfn'])))
                    deleteLogArchives.append(log)
                    self.report.addInputFile(sourceName="logArchives", lfn=log['lfn'])
            else:
                logging.error("Unable to copy logArchives to local disk")
                for log in logs:
                    self.report.addSkippedFile(log['lfn'], None)

        # create tarfile if any logArchive copied in
        if localLogs:
            with tarfile.open(tarLocation, 'w:') as tarFile:
                for log in localLogs:
                    path = log.split('/')
                    tarFile.add(name=log,
                                arcname=os.path.join(path[-3], path[-2], path[-1]))
                    os.remove(log)
        else:
            msg = "Unable to copy any logArchives to local disk"
            logging.error(msg)
            raise WMExecutionFailure(60312, "LogCollectError", msg)

        # now staging out the LogCollect tarfile
        logging.info("Staging out LogCollect tarfile to EOS (skipping CASTOR)")
        now = datetime.datetime.now()
        lfn = "/store/logs/prod/%i/%.2i/%s/%s/%s" % (now.year, now.month, "WMAgent",
                                                     self.report.data.workload,
                                                     os.path.basename(tarLocation))

        tarInfo = {'LFN': lfn,
                   'PFN': tarLocation,
                   'PNN': None,
                   'GUID': None}

        # perform mandatory stage out to CERN EOS
        signal.signal(signal.SIGALRM, alarmHandler)
        signal.alarm(waitTime)
        try:
            eosStageOutMgr(tarInfo)
        except Alarm:
            msg = "Indefinite hang during stageOut of LogCollect to EOS"
            logging.error(msg)
            raise WMExecutionFailure(60409, "LogCollectTimeout", msg)
        except Exception as ex:
            msg = "Unable to stageOut LogCollect to Castor:\n"
            msg += str(ex)
            logging.error(msg)
            raise WMExecutionFailure(60408, "LogCollectStageOutError", msg)
        signal.alarm(0)

        # add to job report
        self.report.addOutputFile(outputModule="LogCollect", aFile=tarInfo)
        outputRef = getattr(self.report.data, self.stepName)
        outputRef.output.pfn = tarInfo['PFN']
        outputRef.output.location = tarInfo['PNN']
        outputRef.output.lfn = tarInfo['LFN']

        # we got this far, delete ALL input files assigned to this job
        for log in self.job["input_files"]:
            # give up after timeout of 1 minutes
            signal.signal(signal.SIGALRM, alarmHandler)
            signal.alarm(60)
            try:
                fileToDelete = {'LFN': log['lfn'],
                                'PFN': None,
                                'PNN': None,
                                'StageOutCommand': None}
                deleteMgr(fileToDelete=fileToDelete)
            except Alarm:
                logging.error("Indefinite hang during delete of logArchive")
            except Exception as ex:
                logging.error("Unable to delete logArchive: %s", ex)
            signal.alarm(0)

        return
示例#3
0
    def execute(self, emulator = None):
        """
        _execute_


        """
        #Are we using emulators again?
        if (emulator != None):
            return emulator.emulate( self.step, self.job )


        overrides = {}
        if hasattr(self.step, 'override'):
            overrides = self.step.override.dictionary_()

        # Set wait to two hours per retry
        # this alarm leaves a subprocess behing that may cause trouble, see #6273
        waitTime = overrides.get('waitTime', 7200 * self.step.retryCount)

        logging.info("StageOut override is: %s " % self.step)

        # Pull out StageOutMgr Overrides

        # switch between old stageOut behavior and new, fancy stage out behavior
        useNewStageOutCode = False
        if getattr(self.step, 'newStageout', False) or \
            ('newStageOut' in overrides and overrides.get('newStageOut')):
            useNewStageOutCode = True


        stageOutCall = {}
        if "command" in overrides and "option" in overrides \
               and "phedex-node" in overrides \
               and"lfn-prefix" in overrides:
            logging.critical('using override in StageOut')
            stageOutCall['command']    = overrides.get('command')
            stageOutCall['option']     = overrides.get('option')
            stageOutCall['phedex-node']= overrides.get('phedex-node')
            stageOutCall['lfn-prefix'] = overrides.get('lfn-prefix')

        # naw man, this is real
        # iterate over all the incoming files
        if not useNewStageOutCode:
            # old style
            manager = StageOutMgr(**stageOutCall)
            manager.numberOfRetries = self.step.retryCount
            manager.retryPauseTime  = self.step.retryDelay
        else:
            # new style
            logging.critical("STAGEOUT IS USING NEW STAGEOUT CODE")
            print("STAGEOUT IS USING NEW STAGEOUT CODE")
            manager = FMStageOutMgr(retryPauseTime  = self.step.retryDelay,
                                    numberOfRetries = self.step.retryCount,
                                    **stageOutCall)

        # We need to find a list of steps in our task
        # And eventually a list of jobReports for out steps

        # Search through steps for report files
        filesTransferred = []

        for step in self.stepSpace.taskSpace.stepSpaces():
            if step == self.stepName:
                #Don't try to parse your own report; it's not there yet
                continue
            stepLocation = os.path.join(self.stepSpace.taskSpace.location, step)
            logging.info("Beginning report processing for step %s" % (step))
            reportLocation = os.path.join(stepLocation, 'Report.pkl')
            if not os.path.isfile(reportLocation):
                logging.error("Cannot find report for step %s in space %s" \
                              % (step, stepLocation))
                continue
            # First, get everything from a file and 'unpersist' it
            stepReport = Report()
            stepReport.unpersist(reportLocation, step)

            # Don't stage out files from bad steps.
            if not stepReport.stepSuccessful(step):
                continue

            # Okay, time to start using stuff
            # Now I'm a bit confused about this; each report should ONLY
            # Have the results of that particular step in it,
            # So getting all the files should get ONLY the files
            # for that step; or so I hope
            files = stepReport.getAllFileRefsFromStep(step = step)
            for fileName in files:
                if not hasattr(fileName, 'lfn') and hasattr(fileName, 'pfn'):
                    # Then we're truly hosed on this file; ignore it
                    msg = "Not a file: %s" % fileName
                    logging.error(msg)
                    continue
                # Support direct-to-merge
                # This requires pulling a bunch of stuff from everywhere
                # First check if it's needed
                if hasattr(self.step.output, 'minMergeSize') and hasattr(fileName, 'size') \
                    and not getattr(fileName, 'merged', False):

                    # We need both of those to continue, and we don't
                    # direct-to-merge
                    if getattr(self.step.output, 'doNotDirectMerge', False):
                        # Then we've been told explicitly not to do direct-to-merge
                        continue
                    if fileName.size >= self.step.output.minMergeSize:
                        # Then this goes direct to merge
                        try:
                            fileName = self.handleLFNForMerge(mergefile = fileName, step = step)
                        except Exception as ex:
                            logging.error("Encountered error while handling LFN for merge due to size.\n")
                            logging.error(str(ex))
                            logging.debug(fileName)
                            logging.debug("minMergeSize: %s" % self.step.output.minMergeSize)
                            manager.cleanSuccessfulStageOuts()
                            stepReport.addError(self.stepName, 60401, "DirectToMergeFailure", str(ex))
                    elif getattr(self.step.output, 'maxMergeEvents', None) != None \
                        and getattr(fileName, 'events', None) != None and not getattr(fileName, 'merged', False):
                        # Then direct-to-merge due to events if
                        # the file is large enough:
                        if fileName.events >= self.step.output.maxMergeEvents:
                            # straight to merge
                            try:
                                fileName = self.handleLFNForMerge(mergefile = fileName, step = step)
                            except Exception as ex:
                                logging.error("Encountered error while handling LFN for merge due to events.\n")
                                logging.error(str(ex))
                                logging.debug(fileName)
                                logging.debug("maxMergeEvents: %s" % self.step.output.maxMergeEvents)
                                manager.cleanSuccessfulStageOuts()
                                stepReport.addError(self.stepName, 60402, "DirectToMergeFailure", str(ex))

                # Save the input PFN in case we need it
                # Undecided whether to move fileName.pfn to the output PFN
                fileName.InputPFN = fileName.pfn
                lfn = getattr(fileName, 'lfn')
                fileSource = getattr(fileName, 'Source', None)
                if fileSource in ['TFileService', 'UserDefined']:
                    userLfnRegEx(lfn)
                else:
                    lfnRegEx(lfn)
                fileForTransfer = {'LFN': lfn,
                                   'PFN': getattr(fileName, 'pfn'),
                                   'PNN' : None,
                                   'StageOutCommand': None,
                                   'Checksums' : getattr(fileName, 'checksums', None)}

                signal.signal(signal.SIGALRM, alarmHandler)
                signal.alarm(waitTime)
                try:
                    manager(fileForTransfer)
                    #Afterwards, the file should have updated info.
                    filesTransferred.append(fileForTransfer)
                    fileName.StageOutCommand = fileForTransfer['StageOutCommand']
                    fileName.location        = fileForTransfer['PNN']
                    fileName.OutputPFN       = fileForTransfer['PFN']
                except Alarm:
                    msg = "Indefinite hang during stageOut of logArchive"
                    logging.error(msg)
                    manager.cleanSuccessfulStageOuts()
                    stepReport.addError(self.stepName, 60403, "StageOutTimeout", msg)
                    stepReport.setStepStatus(self.stepName, 1)
                    # well, if it fails for one file, it fails for the whole job...
                    break
                except Exception as ex:
                    manager.cleanSuccessfulStageOuts()
                    stepReport.addError(self.stepName, 60307, "StageOutFailure", str(ex))
                    stepReport.setStepStatus(self.stepName, 1)
                    stepReport.persist(reportLocation)
                    raise

                signal.alarm(0)

            # Am DONE with report. Persist it
            stepReport.persist(reportLocation)

        #Done with all steps, and should have a list of
        #stagedOut files in fileForTransfer
        logging.info("Transferred %i files" %(len(filesTransferred)))
        return
示例#4
0
 def __init__(self, jobReport):
     self.report = jobReport
     self.mgr = StageOutMgr()
     self.failures = {}
示例#5
0
    def execute(self, emulator=None):
        """
        _execute_


        """
        #Are we using emulators again?
        if (emulator != None):
            return emulator.emulate(self.step, self.job)

        overrides = {}
        if hasattr(self.step, 'override'):
            overrides = self.step.override.dictionary_()

        # Set wait to two hours per retry
        # this alarm leaves a subprocess behing that may cause trouble, see #6273
        waitTime = overrides.get('waitTime', 7200 * self.step.retryCount)

        logging.info("StageOut override is: %s ", self.step)

        # Pull out StageOutMgr Overrides

        # switch between old stageOut behavior and new, fancy stage out behavior
        useNewStageOutCode = False
        if getattr(self.step, 'newStageout', False) or \
            ('newStageOut' in overrides and overrides.get('newStageOut')):
            useNewStageOutCode = True

        stageOutCall = {}
        if "command" in overrides and "option" in overrides \
               and "phedex-node" in overrides \
               and"lfn-prefix" in overrides:
            logging.critical('using override in StageOut')
            stageOutCall['command'] = overrides.get('command')
            stageOutCall['option'] = overrides.get('option')
            stageOutCall['phedex-node'] = overrides.get('phedex-node')
            stageOutCall['lfn-prefix'] = overrides.get('lfn-prefix')

        # naw man, this is real
        # iterate over all the incoming files
        if not useNewStageOutCode:
            # old style
            manager = StageOutMgr(**stageOutCall)
            manager.numberOfRetries = self.step.retryCount
            manager.retryPauseTime = self.step.retryDelay
        else:
            # new style
            logging.critical("STAGEOUT IS USING NEW STAGEOUT CODE")
            print("STAGEOUT IS USING NEW STAGEOUT CODE")
            manager = FMStageOutMgr(retryPauseTime=self.step.retryDelay,
                                    numberOfRetries=self.step.retryCount,
                                    **stageOutCall)

        # We need to find a list of steps in our task
        # And eventually a list of jobReports for out steps

        # Search through steps for report files
        filesTransferred = []

        for step in self.stepSpace.taskSpace.stepSpaces():
            if step == self.stepName:
                #Don't try to parse your own report; it's not there yet
                continue
            stepLocation = os.path.join(self.stepSpace.taskSpace.location,
                                        step)
            logging.info("Beginning report processing for step %s", step)
            reportLocation = os.path.join(stepLocation, 'Report.pkl')
            if not os.path.isfile(reportLocation):
                logging.error("Cannot find report for step %s in space %s",
                              step, stepLocation)
                continue
            # First, get everything from a file and 'unpersist' it
            stepReport = Report()
            stepReport.unpersist(reportLocation, step)

            # Don't stage out files from bad steps.
            if not stepReport.stepSuccessful(step):
                continue

            # Okay, time to start using stuff
            # Now I'm a bit confused about this; each report should ONLY
            # Have the results of that particular step in it,
            # So getting all the files should get ONLY the files
            # for that step; or so I hope
            files = stepReport.getAllFileRefsFromStep(step=step)
            for fileName in files:

                # make sure the file information is consistent
                if hasattr(fileName,
                           'pfn') and (not hasattr(fileName, 'lfn') or
                                       not hasattr(fileName, 'module_label')):
                    msg = "Not a valid file: %s" % fileName
                    logging.error(msg)
                    continue

                # Figuring out if we should do straight to merge
                #  - should we do straight to merge at all ?
                #  - is straight to merge disabled for this output ?
                #  - are we over the size threshold
                #  - are we over the event threshold ?
                straightToMerge = False
                if not getattr(fileName, 'merged', False) and hasattr(
                        self.step.output, 'minMergeSize'):
                    if fileName.module_label not in getattr(
                            self.step.output, 'forceUnmergedOutputs', []):
                        if getattr(fileName, 'size',
                                   0) >= self.step.output.minMergeSize:
                            straightToMerge = True
                        if getattr(fileName, 'events', 0) >= getattr(
                                self.step.output, 'maxMergeEvents',
                                sys.maxsize):
                            straightToMerge = True

                if straightToMerge:

                    try:
                        fileName = self.handleLFNForMerge(mergefile=fileName,
                                                          step=step)
                    except Exception as ex:
                        logging.info(
                            "minMergeSize: %s",
                            getattr(self.step.output, 'minMergeSize', None))
                        logging.info(
                            "maxMergeEvents: %s",
                            getattr(self.step.output, 'maxMergeEvents', None))
                        logging.error(
                            "Encountered error while handling LFN for merge %s",
                            fileName)
                        logging.error(str(ex))
                        manager.cleanSuccessfulStageOuts()
                        stepReport.addError(self.stepName, 60401,
                                            "DirectToMergeFailure", str(ex))

                # Save the input PFN in case we need it
                # Undecided whether to move fileName.pfn to the output PFN
                fileName.InputPFN = fileName.pfn
                lfn = getattr(fileName, 'lfn')
                fileSource = getattr(fileName, 'Source', None)
                if fileSource in ['TFileService', 'UserDefined']:
                    userLfnRegEx(lfn)
                else:
                    lfnRegEx(lfn)

                fileForTransfer = {
                    'LFN': lfn,
                    'PFN': getattr(fileName, 'pfn'),
                    'PNN': None,
                    'StageOutCommand': None,
                    'Checksums': getattr(fileName, 'checksums', None)
                }

                signal.signal(signal.SIGALRM, alarmHandler)
                signal.alarm(waitTime)
                try:
                    manager(fileForTransfer)
                    #Afterwards, the file should have updated info.
                    filesTransferred.append(fileForTransfer)
                    fileName.StageOutCommand = fileForTransfer[
                        'StageOutCommand']
                    fileName.location = fileForTransfer['PNN']
                    fileName.OutputPFN = fileForTransfer['PFN']
                except Alarm:
                    msg = "Indefinite hang during stageOut of logArchive"
                    logging.error(msg)
                    manager.cleanSuccessfulStageOuts()
                    stepReport.addError(self.stepName, 60403,
                                        "StageOutTimeout", msg)
                    stepReport.setStepStatus(self.stepName, 1)
                    # well, if it fails for one file, it fails for the whole job...
                    break
                except Exception as ex:
                    manager.cleanSuccessfulStageOuts()
                    stepReport.addError(self.stepName, 60307,
                                        "StageOutFailure", str(ex))
                    stepReport.setStepStatus(self.stepName, 1)
                    stepReport.persist(reportLocation)
                    raise

                signal.alarm(0)

            # Am DONE with report. Persist it
            stepReport.persist(reportLocation)

        #Done with all steps, and should have a list of
        #stagedOut files in fileForTransfer
        logging.info("Transferred %i files", len(filesTransferred))
        return
示例#6
0
文件: StageOut.py 项目: dmwm/WMCore
    def execute(self, emulator=None):
        """
        _execute_


        """
        # Are we using emulators again?
        if emulator is not None:
            return emulator.emulate(self.step, self.job)

        overrides = {}
        if hasattr(self.step, 'override'):
            overrides = self.step.override.dictionary_()

        # Set wait to two hours per retry
        # this alarm leaves a subprocess behing that may cause trouble, see #6273
        waitTime = overrides.get('waitTime', 7200 * self.step.retryCount)

        logging.info("StageOut override is: %s ", self.step)

        # Pull out StageOutMgr Overrides

        # switch between old stageOut behavior and new, fancy stage out behavior
        useNewStageOutCode = False
        if getattr(self.step, 'newStageout', False) or \
                ('newStageOut' in overrides and overrides.get('newStageOut')):
            useNewStageOutCode = True

        stageOutCall = {}
        if "command" in overrides and "option" in overrides \
                and "phedex-node" in overrides \
                and "lfn-prefix" in overrides:
            logging.critical('using override in StageOut')
            stageOutCall['command'] = overrides.get('command')
            stageOutCall['option'] = overrides.get('option')
            stageOutCall['phedex-node'] = overrides.get('phedex-node')
            stageOutCall['lfn-prefix'] = overrides.get('lfn-prefix')

        # naw man, this is real
        # iterate over all the incoming files
        if not useNewStageOutCode:
            # old style
            manager = StageOutMgr(**stageOutCall)
            manager.numberOfRetries = self.step.retryCount
            manager.retryPauseTime = self.step.retryDelay
        else:
            # new style
            logging.critical("STAGEOUT IS USING NEW STAGEOUT CODE")
            manager = FMStageOutMgr(retryPauseTime=self.step.retryDelay,
                                    numberOfRetries=self.step.retryCount,
                                    **stageOutCall)

        # We need to find a list of steps in our task
        # And eventually a list of jobReports for out steps

        # Search through steps for report files
        filesTransferred = []

        for step in self.stepSpace.taskSpace.stepSpaces():
            if step == self.stepName:
                # Don't try to parse your own report; it's not there yet
                continue
            stepLocation = os.path.join(self.stepSpace.taskSpace.location, step)
            logging.info("Beginning report processing for step %s", step)
            reportLocation = os.path.join(stepLocation, 'Report.pkl')
            if not os.path.isfile(reportLocation):
                logging.error("Cannot find report for step %s in space %s", step, stepLocation)
                continue
            # First, get everything from a file and 'unpersist' it
            stepReport = Report()
            stepReport.unpersist(reportLocation, step)

            # Don't stage out files from bad steps.
            if not stepReport.stepSuccessful(step):
                continue

            # Okay, time to start using stuff
            # Now I'm a bit confused about this; each report should ONLY
            # Have the results of that particular step in it,
            # So getting all the files should get ONLY the files
            # for that step; or so I hope
            files = stepReport.getAllFileRefsFromStep(step=step)
            for fileName in files:

                # make sure the file information is consistent
                if hasattr(fileName, 'pfn') and (not hasattr(fileName, 'lfn') or not hasattr(fileName, 'module_label')):
                    msg = "Not a valid file: %s" % fileName
                    logging.error(msg)
                    continue

                # Figuring out if we should do straight to merge
                #  - should we do straight to merge at all ?
                #  - is straight to merge disabled for this output ?
                #  - are we over the size threshold
                #  - are we over the event threshold ?
                straightToMerge = False
                if not getattr(fileName, 'merged', False):
                    if hasattr(fileName, 'dataset') and fileName.dataset.get('dataTier', "") in ["NANOAOD",
                                                                                                 "NANOAODSIM"]:
                        logging.info("NANOAOD and NANOAODSIM files never go straight to merge!")
                    elif fileName.module_label not in getattr(self.step.output, 'forceUnmergedOutputs', []):
                        if hasattr(self.step.output, 'minMergeSize') and getattr(fileName, 'size',
                                                                                 0) >= self.step.output.minMergeSize:
                            logging.info("Sending %s straight to merge due to minMergeSize", fileName.lfn)
                            straightToMerge = True
                        elif getattr(fileName, 'events', 0) >= getattr(self.step.output, 'maxMergeEvents', sys.maxsize):
                            logging.info("Sending %s straight to merge due to maxMergeEvents", fileName.lfn)
                            straightToMerge = True

                if straightToMerge:
                    try:
                        fileName = self.handleLFNForMerge(mergefile=fileName, step=step)
                    except Exception as ex:
                        logging.info("minMergeSize: %s", getattr(self.step.output, 'minMergeSize', None))
                        logging.info("maxMergeEvents: %s", getattr(self.step.output, 'maxMergeEvents', None))
                        logging.error("Encountered error while handling LFN for merge %s", fileName)
                        logging.error(str(ex))
                        manager.cleanSuccessfulStageOuts()
                        stepReport.addError(self.stepName, 60401, "DirectToMergeFailure", str(ex))

                # Save the input PFN in case we need it
                # Undecided whether to move fileName.pfn to the output PFN
                fileName.InputPFN = fileName.pfn
                lfn = fileName.lfn
                fileSource = getattr(fileName, 'Source', None)
                if fileSource in ['TFileService', 'UserDefined']:
                    userLfnRegEx(lfn)
                else:
                    lfnRegEx(lfn)

                fileForTransfer = {'LFN': lfn,
                                   'PFN': getattr(fileName, 'pfn'),
                                   'PNN': None,
                                   'StageOutCommand': None,
                                   'Checksums': getattr(fileName, 'checksums', None)}

                signal.signal(signal.SIGALRM, alarmHandler)
                signal.alarm(waitTime)
                try:
                    manager(fileForTransfer)
                    # Afterwards, the file should have updated info.
                    filesTransferred.append(fileForTransfer)
                    fileName.StageOutCommand = fileForTransfer['StageOutCommand']
                    fileName.location = fileForTransfer['PNN']
                    fileName.OutputPFN = fileForTransfer['PFN']
                except Alarm:
                    msg = "Indefinite hang during stageOut of logArchive"
                    logging.error(msg)
                    manager.cleanSuccessfulStageOuts()
                    stepReport.addError(self.stepName, 60403, "StageOutTimeout", msg)
                    # well, if it fails for one file, it fails for the whole job...
                    break
                except Exception as ex:
                    manager.cleanSuccessfulStageOuts()
                    stepReport.addError(self.stepName, 60307, "StageOutFailure", str(ex))
                    stepReport.persist(reportLocation)
                    raise

                signal.alarm(0)

            # Am DONE with report. Persist it
            stepReport.persist(reportLocation)

        # Done with all steps, and should have a list of
        # stagedOut files in fileForTransfer
        logging.info("Transferred %i files", len(filesTransferred))
        return