Пример #1
0
def stageIn(job, jobSite, analJob, pilot_initdir, pworkdir):
    """
    Stage in the input files of the job.

    The input file list is cleaned by RunJobUtilities.prepareInFiles(). If any
    file is left, the files are fetched with mover.get_data(), a non-zero mover
    exit code is stored in job.result[2], and the elapsed time is stored in
    job.timeStageIn.

    Note: stageinretry and inputDir are not parameters; they are read from the
    enclosing scope.

    Returns the tuple (job, ins, statusPFCTurl, usedFAXandDirectIO).
    """

    statusPFCTurl = None
    usedFAXandDirectIO = False

    # prepare the input files (remove non-valid names) if there are any
    ins, job.filesizeIn, job.checksumIn = RunJobUtilities.prepareInFiles(job.inFiles, job.filesizeIn, job.checksumIn)
    if not ins:
        # nothing to transfer
        return job, ins, statusPFCTurl, usedFAXandDirectIO

    tolog("Preparing for get command")

    # get the file access info (only useCT is needed here)
    useCT, _oldPrefix, _newPrefix, _useFileStager, _directIn = getFileAccessInfo()

    # transfer the input files; index 4 of os.times() is the elapsed real time
    t_start = os.times()
    transfer = mover.get_data(job, jobSite, ins, stageinretry, analysisJob=analJob, usect=useCT,
                              pinitdir=pilot_initdir, proxycheck=False, inputDir=inputDir, workDir=pworkdir)
    exit_code, job.pilotErrorDiag, statusPFCTurl, job.filesWithoutFAX, job.filesWithFAX, usedFAXandDirectIO = transfer
    if exit_code != 0:
        job.result[2] = exit_code
    t_stop = os.times()
    job.timeStageIn = int(round(t_stop[4] - t_start[4]))

    return job, ins, statusPFCTurl, usedFAXandDirectIO
Пример #2
0
def failJob(transExitCode, pilotExitCode, job, pilotserver, pilotport, ins=None, pilotErrorDiag=None, docleanup=True):
    """
    Put the job in the failed state, report it and optionally exit.

    The job state is set to ["failed", transExitCode, pilotExitCode] and the
    final update is sent to the local pilot TCP server. When a list of input
    files is given, they are removed from job.workdir. sysExit(job) is called
    unless docleanup is False.
    """

    job.setState(["failed", transExitCode, pilotExitCode])

    # only overwrite the diagnostics when a new message was supplied
    if pilotErrorDiag:
        job.pilotErrorDiag = pilotErrorDiag

    tolog("Will now update local pilot TCP server")
    RunJobUtilities.updatePilotServer(job, pilotserver, pilotport, final=True)

    # remove the given input files from the job work directory
    if ins:
        pUtil.removeFiles(job.workdir, ins)

    if docleanup:
        sysExit(job)
Пример #3
0
    def stageInHPCEvent(self):
        """ Flag the job as transferring, stage in its input files and fail the job if the stage-in failed """

        tolog("Setting stage-in state until all input files have been copied")
        self.__job.jobState = "transferring"
        self.__job.setState([self.__job.jobState, 0, 0])

        # publish the new state to the pilot server and to the job state file
        RunJobUtilities.updatePilotServer(self.__job, self.getPilotServer(), self.getPilotPort())
        self.__JR.updateJobStateTest(self.__job, self.__jobSite, self.__node, mode="test")

        # stage-in all input files (if necessary)
        staged = self.stageIn(self.__job, self.__jobSite, self.__analysisJob, pfc_name="PFC.xml")
        job, ins, _statusPFCTurl, _usedFAXandDirectIO = staged

        # job.result[2] carries the pilot error code of the stage-in
        stageInCode = job.result[2]
        if stageInCode != 0:
            tolog("Failing job with ec: %d" % (stageInCode))
            self.failJob(0, stageInCode, job, ins=ins, pilotErrorDiag=job.pilotErrorDiag)

        self.__job = job
        self.__job.displayJob()
Пример #4
0
def createFileMetadata(outFiles, job, outsDict, dsname, datasetDict, sitename):
    """
    Create the metadata for the output + log files.

    GUIDs are fetched for the job's output files (skipped for build jobs), file
    sizes and checksums are collected for outFiles, and a preliminary
    metadata-<jobId>.xml file is written to job.workdir with pUtil.PFCxml().

    Returns (ec, job, None) when getOutFilesGuids() reports an error.

    NOTE(review): pilotserver, pilotport, logguid, analJob and error are not
    parameters - presumably module-level names; confirm against the full file.
    NOTE(review): no return statement is visible on the success path, and
    outsDict, dsname, datasetDict and sitename are not used in the lines shown -
    the function presumably continues beyond this excerpt.
    """

    ec = 0

    # get/assign guids to the output files
    if outFiles:
        tolog("outFiles=%s"%str(outFiles))
        if not pUtil.isBuildJob(outFiles):
            # note: the guids are looked up for job.outFiles, not for the outFiles argument
            ec, job.pilotErrorDiag, job.outFilesGuids = RunJobUtilities.getOutFilesGuids(job.outFiles, job.workdir)
            if ec:
                # missing PoolFileCatalog (only error code from getOutFilesGuids)
                return ec, job, None
        else:
            tolog("Build job - do not use PoolFileCatalog to get guid (generated)")
    else:
        tolog("This job has no output files")

    # get the file sizes and checksums for the local output files
    # WARNING: any errors are lost if occur in getOutputFileInfo()
    ec, pilotErrorDiag, fsize, checksum = pUtil.getOutputFileInfo(list(outFiles), getChecksumCommand(), skiplog=True, logFile=job.logFile)
    if ec != 0:
        tolog("!!FAILED!!2999!! %s" % (pilotErrorDiag))
        failJob(job.result[1], ec, job, pilotserver, pilotport, pilotErrorDiag=pilotErrorDiag)

    # use logguid as the log file guid when it is set, otherwise the guid stored in the job
    if logguid:
        guid = logguid
    else:
        guid = job.tarFileGuid

    # create preliminary metadata (no metadata yet about log file - added later in pilot.py)
    _fname = "%s/metadata-%d.xml" % (job.workdir, job.jobId)
    try:
        _status = pUtil.PFCxml(job.experiment, _fname, list(job.outFiles), fguids=job.outFilesGuids, fntag="lfn", alog=job.logFile, alogguid=guid,\
                               fsize=fsize, checksum=checksum, analJob=analJob)
    except Exception, e:
        # any PFCxml failure is reported with the ERR_MISSINGGUID pilot error code
        pilotErrorDiag = "PFCxml failed due to problematic XML: %s" % (e)
        tolog("!!WARNING!!1113!! %s" % (pilotErrorDiag)) 
        failJob(job.result[1], error.ERR_MISSINGGUID, job, pilotserver, pilotport, pilotErrorDiag=pilotErrorDiag)
Пример #5
0
    def stageInHPCEvent(self):
        """ Set the transferring state, stage in all input files and fail the job on a stage-in error """

        tolog("Setting stage-in state until all input files have been copied")
        self.__job.jobState = "transferring"
        self.__job.setState([self.__job.jobState, 0, 0])

        # report the state change to the pilot server and the job state file
        RunJobUtilities.updatePilotServer(self.__job, self.getPilotServer(), self.getPilotPort())
        self.__JR.updateJobStateTest(self.__job, self.__jobSite, self.__node, mode="test")

        # stage-in all input files (if necessary); the last two return values are not used here
        job, ins, _pfcTurlStatus, _faxAndDirectIO = self.stageIn(
            self.__job, self.__jobSite, self.__analysisJob, pfc_name="PFC.xml")

        # a non-zero pilot error code in job.result[2] means that the stage-in failed
        pilotErrorCode = job.result[2]
        if pilotErrorCode != 0:
            tolog("Failing job with ec: %d" % (pilotErrorCode))
            self.failJob(0, pilotErrorCode, job, ins=ins, pilotErrorDiag=job.pilotErrorDiag)

        self.__job = job
        self.__job.displayJob()
Пример #6
0
def setup(job, jobSite, thisExperiment):
    """
    Prepare the payload setup and build the run command list.

    The job parameters, home packages, transformations and releases are split
    into one entry per transformation. For each entry the job object is updated
    and thisExperiment.getJobExecutionCommand() is asked for the command to run.
    The loop stops at the first failing setup. The time spent is stored in
    job.timeSetup.

    Note: pilot_initdir is not a parameter; it is read from the enclosing scope.

    Returns the tuple (ec, runCommandList, job, multi_trf).
    """

    # start setup time counter
    setupStart = time.time()

    runCommandList = []

    # split up the job parameters to be able to loop over the tasks
    jobParameterList = job.jobPars.split("\n")
    jobHomePackageList = job.homePackage.split("\n")
    jobTrfList = job.trf.split("\n")
    jobAtlasRelease = getAtlasRelease(job.atlasRelease)

    nTrfs = len(jobParameterList)
    tolog("Number of transformations to process: %s" % nTrfs)
    multi_trf = nTrfs > 1

    # verify that the multi-trf job is setup properly
    ec, job.pilotErrorDiag, jobAtlasRelease = RunJobUtilities.verifyMultiTrf(jobParameterList, jobHomePackageList, jobTrfList, jobAtlasRelease)
    if ec > 0:
        return ec, runCommandList, job, multi_trf

    os.chdir(jobSite.workdir)
    tolog("Current job workdir is %s" % os.getcwd())

    # remember the original stdout/stderr names, they are restored after the loop
    originalStdout = job.stdout
    originalStderr = job.stderr
    copysetupsUpdated = False

    # setup the trf(s); map(None, ...) pads the shorter lists with None (Python 2)
    trfSettings = map(None, jobParameterList, jobHomePackageList, jobTrfList, jobAtlasRelease)
    for index, (trfPars, trfHomePackage, trfName, trfRelease) in enumerate(trfSettings):
        trfNumber = index + 1
        tolog("Preparing setup %d/%d" % (trfNumber, nTrfs))

        # reset variables
        job.jobPars = trfPars
        job.homePackage = trfHomePackage
        job.trf = trfName
        job.atlasRelease = trfRelease
        if multi_trf:
            # give each transformation its own numbered stdout/stderr file
            numberedSuffix = "_%d.txt" % (trfNumber)
            job.stdout = originalStdout.replace(".txt", numberedSuffix)
            job.stderr = originalStderr.replace(".txt", numberedSuffix)

        # post process copysetup variable in case of directIn/useFileStager
        copysetup = readpar('copysetup')
        copysetupin = readpar('copysetupin')
        needsCopysetupUpdate = "--directIn" in job.jobPars or "--useFileStager" in job.jobPars or \
            copysetup.count('^') == 5 or copysetupin.count('^') == 5
        # only need to update the queuedata file once
        if needsCopysetupUpdate and not copysetupsUpdated:
            RunJobUtilities.updateCopysetups(job.jobPars)
            copysetupsUpdated = True

        # setup the trf
        ec, job.pilotErrorDiag, cmd, job.spsetup, job.JEM, job.cmtconfig = thisExperiment.getJobExecutionCommand(job, jobSite, pilot_initdir)
        if ec > 0:
            # setup failed
            break

        # add the setup command to the command list
        runCommandList.append(cmd)

    job.stdout = originalStdout
    job.stderr = originalStderr
    job.timeSetup = int(time.time() - setupStart)
    tolog("Total setup time: %d s" % (job.timeSetup))

    return ec, runCommandList, job, multi_trf
Пример #7
0
            tolog("Warning: Could not copy metadata-%d.xml to site work dir - ddm Adder problems will occure in case of job recovery" % \
                  (job.jobId))

        if job.result[0] == 'holding' and job.result[1] == 0:
            try:
                # create the data directory
                os.makedirs(job.datadir)
            except OSError, e:
                tolog("!!WARNING!!3000!! Could not create data directory: %s, %s" % (job.datadir, str(e)))
            else:
                # find all remaining files in case 'rf' is not empty
                remaining_files = []
                moved_files_list = []
                try:
                    if rf != None:
                        moved_files_list = RunJobUtilities.getFileNamesFromString(rf[1])
                        remaining_files = RunJobUtilities.getRemainingFiles(moved_files_list, job.outFiles) 
                except Exception, e:
                    tolog("!!WARNING!!3000!! Illegal return value from Mover: %s, %s" % (str(rf), str(e)))
                    remaining_files = job.outFiles

                # move all remaining output files to the data directory
                nr_moved = 0
                for _file in remaining_files:
                    try:
                        os.system("mv %s %s" % (_file, job.datadir))
                    except OSError, e:
                        tolog("!!WARNING!!3000!! Failed to move file %s (abort all)" % (_file))
                        break
                    else:
                        nr_moved += 1
Пример #8
0
    def executePayload(self, thisExperiment, job):
        """
        Submit the ARGO job through the message queue and poll for its status.

        The MessageInterface is configured (host, port, SSL certificate, key and
        CA files), a status queue named '<jobId>_job_status' is created, the
        serialized self.argo_job is sent, and the status queue is polled until
        one of the branches below breaks out of the loop. Any exception is
        logged and handed to self.failJob().

        NOTE(review): thisExperiment, t0 and getstatusoutput_was_interrupted are
        not read in the lines shown, and res_tuple is never returned - the
        method presumably continues beyond this excerpt.
        """
        
        t0 = os.times() 
        res_tuple = None
        
        # NOTE(review): no loop over run commands is visible below; the flag is only set in the except branch
        getstatusoutput_was_interrupted = False
        job_status = None
        tolog("About to launch ARGO job")
        # Poll MQ for Job Status
        try:
            # Initiate MQ interface and send job
            self.argo_job.job_status_routing_key = '%s_job_status' % job.jobId #'status_' + jobID
            si = SiteInformation()
            mi = MessageInterface()
            mi.host = 'atlasgridftp02.hep.anl.gov'
            mi.port = 5671
            # this first ssl_cert value is overwritten two lines below
            mi.ssl_cert = si.getSSLCertificate() #'/grid/atlas/hpc/pilot_certs/xrootdsrv-cert.pem'
            proxy_cert_path = si.getSSLCertificate()
            # default: rabbitmq-cert.pem next to the proxy certificate; X509_USER_CERT takes precedence
            mi.ssl_cert = os.path.dirname(proxy_cert_path) + "/rabbitmq-cert.pem"
            if 'X509_USER_CERT' in os.environ.keys():
                mi.ssl_cert = os.environ['X509_USER_CERT'] #'/users/hpcusers/balsam_dev/gridsecurity/jchilders/xrootdsrv-cert.pem'
            
            # this first ssl_key value is overwritten on the next line
            mi.ssl_key  = mi.ssl_cert #'/grid/atlas/hpc/pilot_certs/xrootdsrv-key.pem'
            # default: rabbitmq-key.pem next to the proxy certificate; X509_USER_KEY takes precedence
            mi.ssl_key = os.path.dirname(proxy_cert_path) + "/rabbitmq-key.pem"
            if 'X509_USER_KEY' in os.environ.keys():
                mi.ssl_key  = os.environ['X509_USER_KEY'] #'/users/hpcusers/balsam_dev/gridsecurity/jchilders/xrootdsrv-key.pem'
            
            #mi.ssl_ca_certs = os.path.dirname(proxy_cert_path) + "/rabbitmq-cacerts.pem"
            mi.ssl_ca_certs = '/grid/atlas/hpc/pilot_certs/cacerts.pem' 
            #if 'X509_CA_CERTS' in os.environ.keys():
            #    mi.ssl_ca_certs = os.environ['X509_CA_CERTS'] #'/users/hpcusers/balsam_dev/gridsecurity/jchilders/cacerts.pem'
            #tolog("CA certs: %s" % (mi.ssl_ca_certs))
            # prefer rabbitmq-cacerts.pem next to the proxy certificate when that file exists
            ca_certs = os.path.dirname(proxy_cert_path) + "/rabbitmq-cacerts.pem"
            if os.path.isfile(ca_certs): 
                mi.ssl_ca_certs = ca_certs
 
            mi.exchange_name = 'argo_users'

            #Create queue to get messages about ARGO Job status from MQ
            tolog('Opening connection with MQ')
            mi.open_blocking_connection()
            tolog('Create queue [%s]  to retrieve messages with job status' % self.argo_job.job_status_routing_key)

            # the routing key is used for both arguments of create_queue()
            mi.create_queue(self.argo_job.job_status_routing_key, self.argo_job.job_status_routing_key)

            # submit ARGO job to MQ
            
            #tolog('Opening connection with MQ')
            #mi.open_blocking_connection()
            # self.dev selects the development routing key
            routing_key = 'argo_job'
            if self.dev:
                routing_key = 'argo_job_dev'
            tolog('Sending msg with job to ARGO')
            mi.send_msg(self.argo_job.serialize(), routing_key)
            tolog(' done sending ')
            
            # Waiting till job done or failed    
            ARGO_err_msg = ''
            while True:
                # note: together with the sleep at the end of the loop body this gives 10 s per iteration
                time.sleep(5)
                message = mi.receive_msg(self.argo_job.job_status_routing_key, True)
                # message is indexed as (method, properties, body); only act when a body was received
                if message[2]:
                    tolog ("Got message from queue [%s]: method [%s], properties [%s], body [ %s ]" % (self.argo_job.job_status_routing_key, message[0], message[1], message[2]))
                    job_status = ArgoJobStatus.get_from_message(message[2])
                    job.hpcStatus = job_status.state
                    rt = RunJobUtilities.updatePilotServer(job, self.getPilotServer(), self.getPilotPort())

                    tolog("Extracted state: %s" % job_status.state)
                    if job_status.state == job_status.HISTORY:
                        res_tuple = (0, "Done")
                        break
                    elif job_status.is_failed():
                        # NOTE(review): this branch does not break, so polling continues after a failure
                        # is recorded; if is_failed() is also true for the FAILED state, the branch
                        # below is never reached - confirm against ArgoJobStatus
                        res_tuple = (1, "Failed")
                        ARGO_err_msg = ARGO_err_msg + ' ' + job_status.message
                    elif job_status.state == job_status.FAILED:
                        res_tuple = (1, "Failed")
                        ARGO_err_msg = ARGO_err_msg + ' ' + job_status.message
                        # NOTE(review): uses the name runJob (presumably a module-level instance) and not
                        # self as in the except branch below - confirm
                        runJob.failJob(1, 0, job, ins=job.inFiles, pilotErrorDiag=ARGO_err_msg)
                        break
                time.sleep(5)
                  
            # note: the connection is only closed here; an exception raised above skips this call
            mi.close()
            tolog(' closing connection to MQ')
                
            tolog("Job State: %s" % (job_status.state))
            #job.timeExe = int(fork_job.finished - fork_job.started)
                
            ####################################################
    
        except Exception, e:
            # any failure while talking to the MQ fails the job with a general error
            tolog("!!FAILED!!3000!! Failed to run command %s" % str(e))
            getstatusoutput_was_interrupted = True
            res_tuple = (1, "Failed")
            self.failJob(0, self.__error.ERR_GENERALERROR, job, pilotErrorDiag=str(e))
Пример #9
0
class RunJobHpcEvent(RunJob):

    # private data members
    __runjob = "RunJobHpcEvent"  # String defining the sub class
    __instance = None  # Boolean used by subclasses to become a Singleton

    #__error = PilotErrors()                     # PilotErrors object

    # Required methods

    def __init__(self):
        """ Default initialization """

        # e.g. self.__errorLabel = errorLabel
        pass
        self.__output_es_files = []
        self.__eventRanges = {}
        self.__failedStageOuts = []
        self._hpcManager = None

    def __new__(cls, *args, **kwargs):
        """ Override the __new__ method to make the class a singleton """

        if not cls.__instance:
            cls.__instance = super(RunJob, cls).__new__(cls, *args, **kwargs)

        return cls.__instance

    def getRunJob(self):
        """ Return a string with the experiment name """

        return self.__runjob

    def getRunJobFileName(self):
        """ Return the filename of the module """

        return super(RunJobHpcEvent, self).getRunJobFileName()

    # def argumentParser(self):  <-- see example in RunJob.py

    def allowLoopingJobKiller(self):
        """ Should the pilot search for looping jobs? """

        # The pilot has the ability to monitor the payload work directory. If there are no updated files within a certain
        # time limit, the pilot will consider the as stuck (looping) and will kill it. The looping time limits are set
        # in environment.py (see e.g. loopingLimitDefaultProd)

        return False

    def setupHPCEvent(self):
        self.__jobSite = Site.Site()
        self.__jobSite.setSiteInfo(self.argumentParser())
        ## For HPC job, we don't need to reassign the workdir
        # reassign workdir for this job
        self.__jobSite.workdir = self.__jobSite.wntmpdir
        if not os.path.exists(self.__jobSite.workdir):
            os.makedirs(self.__jobSite.workdir)

        tolog("runJobHPCEvent.getPilotLogFilename=%s" %
              self.getPilotLogFilename())
        if self.getPilotLogFilename() != "":
            pUtil.setPilotlogFilename(self.getPilotLogFilename())

        # set node info
        self.__node = Node.Node()
        self.__node.setNodeName(os.uname()[1])
        self.__node.collectWNInfo(self.__jobSite.workdir)

        # redirect stderr
        #sys.stderr = open("%s/runJobHPCEvent.stderr" % (self.__jobSite.workdir), "w")

        tolog("Current job workdir is: %s" % os.getcwd())
        tolog("Site workdir is: %s" % self.__jobSite.workdir)

        # get the experiment object
        self.__thisExperiment = getExperiment(self.getExperiment())
        tolog("runEvent will serve experiment: %s" %
              (self.__thisExperiment.getExperiment()))

    def getHPCEventJobFromPanda(self):
        pass

    def getHPCEventJobFromEnv(self):
        tolog("getHPCEventJobFromEnv")
        try:
            # always use this filename as the new jobDef module name
            import newJobDef
            job = Job.Job()
            job.setJobDef(newJobDef.job)
            job.coreCount = 0
            job.workdir = self.__jobSite.workdir
            job.experiment = self.getExperiment()
            # figure out and set payload file names
            job.setPayloadName(self.__thisExperiment.getPayloadName(job))
            # reset the default job output file list which is anyway not correct
            job.outFiles = []
        except Exception, e:
            pilotErrorDiag = "Failed to process job info: %s" % str(e)
            tolog("!!WARNING!!3000!! %s" % (pilotErrorDiag))
            self.failJob(0,
                         PilotErrors.ERR_UNKNOWN,
                         job,
                         pilotErrorDiag=pilotErrorDiag)

        self.__job = job
        # prepare for the output file data directory
        # (will only created for jobs that end up in a 'holding' state)
        self.__job.datadir = self.getParentWorkDir() + "/PandaJob_%s_data" % (
            job.jobId)

        # See if it's an analysis job or not
        trf = self.__job.trf
        self.__analysisJob = isAnalysisJob(trf.split(",")[0])

        # Setup starts here ................................................................................

        # Update the job state file
        self.__job.jobState = "starting"
        self.__job.setHpcStatus('init')

        # Send [especially] the process group back to the pilot
        self.__job.setState([self.__job.jobState, 0, 0])
        self.__job.jobState = self.__job.result
        rt = RunJobUtilities.updatePilotServer(job, self.getPilotServer(),
                                               runJob.getPilotPort())

        self.__JR = JobRecovery(pshttpurl='https://pandaserver.cern.ch',
                                pilot_initdir=self.__job.workdir)
        self.__JR.updateJobStateTest(self.__job,
                                     self.__jobSite,
                                     self.__node,
                                     mode="test")
        self.__JR.updatePandaServer(self.__job, self.__jobSite, self.__node,
                                    25443)

        # prepare the setup and get the run command list
        ec, runCommandList, job, multi_trf = self.setup(
            self.__job, self.__jobSite, self.__thisExperiment)
        if ec != 0:
            tolog("!!WARNING!!2999!! runJob setup failed: %s" %
                  (job.pilotErrorDiag))
            self.failJob(0, ec, job, pilotErrorDiag=job.pilotErrorDiag)
        tolog("Setup has finished successfully")
        self.__job = job
        self.__runCommandList = runCommandList
        self.__multi_trf = multi_trf

        # job has been updated, display it again
        self.__job.displayJob()
        tolog("RunCommandList: %s" % self.__runCommandList)
        tolog("Multi_trf: %s" % self.__multi_trf)
Пример #10
0
    def finishJob(self):
        """
        Finish the HPC job: clean up, verify the event ranges, create the
        metadata, interpret the payload result and report the final state.

        The job is failed through self.failJob() when no event ranges are
        known, when event ranges are left in the 'new' or 'stagedOut' state,
        when the output files or the metadata cannot be prepared, or when the
        payload interpretation sets an error code. Otherwise the "finished"
        state is sent to the pilot server and self.sysExit() is called.
        """

        try:
            self.__hpcManager.finishJob()
        except:
            # best effort: log the exception value and the traceback object and carry on
            _excType, excValue, excTraceback = sys.exc_info()
            tolog(excValue)
            tolog(excTraceback)

        # If payload leaves the input files, delete them explicitly
        if self.__job.inFiles:
            pUtil.removeFiles(self.__job.workdir, self.__job.inFiles)

        # error code used when failing the job below: recoverable while attemptNr is below 4
        if self.__job.attemptNr < 4:
            errorCode = PilotErrors.ERR_ESRECOVERABLE
        else:
            errorCode = PilotErrors.ERR_UNKNOWN

        if not self.__eventRanges:
            tolog("Cannot get event ranges")
            self.failJob(0, errorCode, self.__job, pilotErrorDiag="Cannot get event ranges")

        # check whether all event ranges are handled
        tolog("Total event ranges: %s" % len(self.__eventRanges))
        nNew = self.__eventRanges.values().count('new')
        tolog("Not handled events: %s" % nNew)
        nDone = self.__eventRanges.values().count('Done')
        tolog("Finished events: %s" % nDone)
        nStagedOut = self.__eventRanges.values().count('stagedOut')
        tolog("stagedOut but not updated to panda server events: %s" % nStagedOut)

        # the failure is recoverable as soon as some events were processed
        if nDone + nStagedOut:
            errorCode = PilotErrors.ERR_ESRECOVERABLE

        nLeft = nNew + nStagedOut
        if nLeft:
            tolog("Not all event ranges are handled. failed job")
            self.failJob(0, errorCode, self.__job, pilotErrorDiag="Not All events are handled(total:%s, left:%s)" % (len(self.__eventRanges), nLeft))

        dsname, datasetDict = self.getDatasets()
        tolog("dsname = %s" % (dsname))
        tolog("datasetDict = %s" % (datasetDict))

        # Create the output file dictionary needed for generating the metadata
        ec, pilotErrorDiag, _outs, outsDict = RunJobUtilities.prepareOutFiles(self.__job.outFiles, self.__job.logFile, self.__job.workdir, fullpath=True)
        if ec:
            # missing output file (only error code from prepareOutFiles)
            self.failJob(self.__job.result[1], ec, self.__job, pilotErrorDiag=pilotErrorDiag)
        tolog("outsDict: %s" % str(outsDict))

        # Create the metadata; note that an empty output file list is passed here
        ec, job, _outputFileInfo = self.createFileMetadata([], self.__job, outsDict, dsname, datasetDict, self.__jobSite.sitename)
        if ec:
            self.failJob(0, ec, job, pilotErrorDiag=job.pilotErrorDiag)

        # Rename the metadata produced by the payload
        self.moveTrfMetadata(self.__job.workdir, self.__job.jobId)

        # Check the job report for any exit code that should replace the res_tuple[0]
        # (the exit acronym is not used; the exit message fills both remaining slots)
        exitCode, _exitAcronym, exitMsg = self.getTrfExitInfo(0, self.__job.workdir)
        payloadResult = (exitCode, exitMsg, exitMsg)

        # Payload error handling
        job = ErrorDiagnosis().interpretPayload(self.__job, payloadResult, False, 0, self.__runCommandList, self.getFailureCode())
        if job.result[1] != 0 or job.result[2] != 0:
            self.failJob(job.result[1], job.result[2], job, pilotErrorDiag=job.pilotErrorDiag)
        self.__job = job

        # report the final state
        job.jobState = "finished"
        job.setState([job.jobState, 0, 0])
        job.jobState = job.result
        RunJobUtilities.updatePilotServer(job, self.getPilotServer(), self.getPilotPort(), final=True)

        tolog("Done")
        self.sysExit(self.__job)
Пример #11
0
    def runHPCEvent(self):
        """
        Run the HPC event job.

        The job is set to "running", the HPC job is prepared and handed to an
        HPCManager together with the event ranges, and the manager is polled
        until it has finished or reports the 'Complete' state. Outputs are
        staged out through a thread pool while polling and once more at the
        end; failed stage-outs are retried twice with fewer threads. The method
        returns early when the HPC job cannot be prepared (after failJob()) or
        when no event ranges are received.
        """

        def publishHpcStatus():
            # send the current job status to the pilot server and to the PanDA server
            RunJobUtilities.updatePilotServer(self.__job, self.getPilotServer(), self.getPilotPort())
            self.__JR.updatePandaServer(self.__job, self.__jobSite, self.__node, 25443)

        def queueOutputs(pool):
            # hand every output currently known to the HPC manager to the stage-out pool
            for producedOutput in hpcManager.getOutputs():
                #self.stageOutHPCEvent(producedOutput)
                pool.add_task(self.stageOutHPCEvent, producedOutput)

        def restageFailed(pool):
            # take over the list of failed stage-outs and push them through the given pool
            pendingStageOuts = self.__failedStageOuts
            self.__failedStageOuts = []
            for pendingStageOut in pendingStageOuts:
                pool.add_task(self.stageOutHPCEvent, pendingStageOut)
            pool.wait_completion()
            self.updateHPCEventRanges()

        tolog("runHPCEvent")
        self.__job.jobState = "running"
        self.__job.setState([self.__job.jobState, 0, 0])
        self.__job.pilotErrorDiag = None
        RunJobUtilities.updatePilotServer(self.__job, self.getPilotServer(), self.getPilotPort())
        self.__JR.updateJobStateTest(self.__job, self.__jobSite, self.__node, mode="test")

        defRes = self.getDefaultResources()
        self.__copyInputFiles = (defRes['copy_input_files'] == 'true')

        status, prepareOutput, hpcJob = self.prepareHPCJob()
        if status != 0:
            tolog("failed to create the Tag file")
            self.failJob(0, PilotErrors.ERR_UNKNOWN, self.__job, pilotErrorDiag=prepareOutput)
            return
        tolog("HPC Job: %s " % hpcJob)

        self.__hpcStatus = None
        self.__hpcLog = None

        # only pass a log file name to the HPC manager when the pilot has one
        pilotLogName = self.getPilotLogFilename()
        tolog("runJobHPCEvent.getPilotLogFilename=%s"% pilotLogName)
        if pilotLogName != "":
            logFileName = pilotLogName
        else:
            logFileName = None
        hpcManager = HPCManager(globalWorkingDir=self.__job.workdir, logFileName=logFileName, poolFileCatalog=self.__poolFileCatalogTemp, inputFiles=self.__inputFilesGlobal, copyInputFiles=self.__copyInputFiles)

        self.__hpcManager = hpcManager
        self.HPCMode = "HPC_" + hpcManager.getMode(defRes)
        self.__job.setMode(self.HPCMode)
        self.__job.setHpcStatus('waitingResource')
        publishHpcStatus()

        hpcManager.getFreeResources(defRes)
        self.__job.coreCount = hpcManager.getCoreCount()
        self.__job.setHpcStatus('gettingEvents')
        publishHpcStatus()

        # ask for as many event ranges as the manager needs, capped by max_events
        numRanges = hpcManager.getEventsNumber()
        tolog("HPC Manager needs events: %s, max_events: %s; use the smallest one." % (numRanges, defRes['max_events']))
        maxEvents = int(defRes['max_events'])
        if numRanges > maxEvents:
            numRanges = maxEvents
        eventRanges = self.getEventRanges(numRanges=numRanges)
        if len(eventRanges) == 0:
            tolog("Get no Event ranges. return")
            return
        for eventRange in eventRanges:
            self.__eventRanges[eventRange['eventRangeID']] = 'new'

        # setup stage out
        self.setupStageOutHPCEvent()

        hpcManager.initJob(hpcJob)
        hpcManager.initEventRanges(eventRanges)

        hpcManager.submit()
        threadpool = ThreadPool(defRes['stageout_threads'])

        # poll the HPC manager; the servers are updated on every state change and at least every 10 minutes
        lastState = None
        lastReport = time.time()
        while not hpcManager.isFinished():
            state = hpcManager.poll()
            self.__job.setHpcStatus(state)
            stateChanged = lastState is None or lastState != state
            if stateChanged or time.time() > (lastReport + 60*10):
                lastState = state
                lastReport = time.time()
                tolog("HPCManager Job stat: %s" % state)
                self.__JR.updateJobStateTest(self.__job, self.__jobSite, self.__node, mode="test")
                publishHpcStatus()

            if state == 'Complete':
                break
            queueOutputs(threadpool)

            time.sleep(30)
            self.updateHPCEventRanges()

        tolog("HPCManager Job Finished")
        self.__job.setHpcStatus('stagingOut')
        publishHpcStatus()

        # stage out whatever was produced after the last poll
        queueOutputs(threadpool)

        self.updateHPCEventRanges()
        threadpool.wait_completion()
        self.updateHPCEventRanges()

        # first retry of the failed stage-outs: half of the threads, but at least one
        if len(self.__failedStageOuts) > 0:
            tolog("HPC Stage out retry 1")
            half_stageout_threads = defRes['stageout_threads'] / 2
            if half_stageout_threads < 1:
                half_stageout_threads = 1
            restageFailed(ThreadPool(half_stageout_threads))

        # second retry: a single thread
        if len(self.__failedStageOuts) > 0:
            tolog("HPC Stage out retry 2")
            restageFailed(ThreadPool(1))

        self.__job.setHpcStatus('finished')
        self.__JR.updatePandaServer(self.__job, self.__jobSite, self.__node, 25443)
        self.__hpcStatus, self.__hpcLog = hpcManager.checkHPCJobLog()
        tolog("HPC job log status: %s, job log error: %s" % (self.__hpcStatus, self.__hpcLog))
Пример #12
0
    def executePayload(self, thisExperiment, job):
        """Submit the job to ARGO through the message queue and wait for it to end.

        The method configures a MessageInterface with SSL credentials, creates
        a queue for this job's status messages, publishes the serialized ARGO
        job and then polls the status queue.  Every status received is copied
        to job.hpcStatus and forwarded to the pilot server.  Polling stops when
        the job reaches the HISTORY state or is reported as FAILED.

        Any exception raised along the way is logged and reported through
        self.failJob() with ERR_GENERALERROR.  An opened MQ connection is
        closed on every exit path.

        :param thisExperiment: experiment object (not used in this part of the method).
        :param job: job object; job.jobId names the status queue and
            job.hpcStatus is updated from every status message.
        """

        # NOTE: t0, res_tuple and getstatusoutput_was_interrupted are set but not
        # consumed in the code visible here; they are kept for the code that follows
        t0 = os.times()
        res_tuple = None

        getstatusoutput_was_interrupted = False
        job_status = None
        mi = None
        connection_opened = False
        tolog("About to launch ARGO job")
        try:
            # Initiate MQ interface; the status queue is named after the job id
            self.argo_job.job_status_routing_key = '%s_job_status' % job.jobId
            status_key = self.argo_job.job_status_routing_key
            si = SiteInformation()
            mi = MessageInterface()
            mi.host = 'atlasgridftp02.hep.anl.gov'
            mi.port = 5671

            # default credentials sit next to the certificate returned by the
            # site information; the X509_* environment variables take precedence
            cert_dir = os.path.dirname(si.getSSLCertificate())
            mi.ssl_cert = os.environ.get('X509_USER_CERT',
                                         cert_dir + "/rabbitmq-cert.pem")
            mi.ssl_key = os.environ.get('X509_USER_KEY',
                                        cert_dir + "/rabbitmq-key.pem")

            # use the CA bundle next to the certificate when present,
            # otherwise fall back to the fixed grid location
            mi.ssl_ca_certs = '/grid/atlas/hpc/pilot_certs/cacerts.pem'
            ca_certs = cert_dir + "/rabbitmq-cacerts.pem"
            if os.path.isfile(ca_certs):
                mi.ssl_ca_certs = ca_certs

            mi.exchange_name = 'argo_users'

            # Create queue to get messages about ARGO Job status from MQ
            tolog('Opening connection with MQ')
            mi.open_blocking_connection()
            connection_opened = True
            tolog('Create queue [%s]  to retrieve messages with job status' %
                  status_key)
            mi.create_queue(status_key, status_key)

            # submit ARGO job to MQ (development instances use their own routing key)
            routing_key = 'argo_job'
            if self.dev:
                routing_key = 'argo_job_dev'
            tolog('Sending msg with job to ARGO')
            mi.send_msg(self.argo_job.serialize(), routing_key)
            tolog(' done sending ')

            # Waiting till job done or failed
            ARGO_err_msg = ''
            while True:
                # sleeps at the top and bottom of the loop: ~10 s between polls
                time.sleep(5)
                message = mi.receive_msg(status_key, True)
                if message[2]:
                    tolog(
                        "Got message from queue [%s]: method [%s], properties [%s], body [ %s ]"
                        % (status_key, message[0], message[1], message[2]))
                    job_status = ArgoJobStatus.get_from_message(message[2])
                    job.hpcStatus = job_status.state
                    rt = RunJobUtilities.updatePilotServer(
                        job, self.getPilotServer(), self.getPilotPort())

                    tolog("Extracted state: %s" % job_status.state)
                    if job_status.state == job_status.HISTORY:
                        res_tuple = (0, "Done")
                        break
                    elif job_status.is_failed():
                        # failure is recorded but polling continues
                        # NOTE(review): if is_failed() is also true for the FAILED
                        # state the branch below is never reached -- confirm
                        res_tuple = (1, "Failed")
                        ARGO_err_msg = ARGO_err_msg + ' ' + job_status.message
                    elif job_status.state == job_status.FAILED:
                        res_tuple = (1, "Failed")
                        ARGO_err_msg = ARGO_err_msg + ' ' + job_status.message
                        # NOTE(review): uses the module-level runJob object
                        # rather than self -- confirm this is intended
                        runJob.failJob(1,
                                       0,
                                       job,
                                       ins=job.inFiles,
                                       pilotErrorDiag=ARGO_err_msg)
                        break
                time.sleep(5)

            tolog("Job State: %s" % (job_status.state))

        except Exception as e:
            tolog("!!FAILED!!3000!! Failed to run command %s" % str(e))
            getstatusoutput_was_interrupted = True
            res_tuple = (1, "Failed")
            self.failJob(0,
                         self.__error.ERR_GENERALERROR,
                         job,
                         pilotErrorDiag=str(e))
        finally:
            # close the connection on every exit path (normal end, exception,
            # or an exit raised from failJob) so it is never left open
            if connection_opened:
                try:
                    mi.close()
                    tolog(' closing connection to MQ')
                except Exception as close_error:
                    tolog("!!WARNING!!2999!! Failed to close connection to MQ: %s" %
                          str(close_error))
Пример #13
0
        # Fragment: log the job type, report the "setup" state to the pilot
        # server and run the setup step for the job.
        if analysisJob:
            tolog("User analysis job")
        else:
            tolog("Production job")
        tolog("runJobArgo received a job with prodSourceLabel=%s" %
              (job.prodSourceLabel))

        # setup starts here ................................................................................

        # mark the job as being in the setup phase (the job state file update
        # below is currently disabled)
        job.jobState = "setup"
        #_retjs = JR.updateJobStateTest(job, jobSite, node, mode="test")

        # send [especially] the process group back to the pilot
        job.setState([job.jobState, 0, 0])
        rt = RunJobUtilities.updatePilotServer(job, runJob.getPilotServer(),
                                               runJob.getPilotPort())

        # run the setup; it returns an error code and the (updated) job object
        ec, job = runJob.setup(job, jobSite, thisExperiment)
        if ec != 0:
            tolog("!!WARNING!!2999!! runJob setup failed: %s" %
                  (job.pilotErrorDiag))
            # NOTE(review): the success message below is still logged unless
            # failJob() terminates the process -- confirm that it does
            runJob.failJob(0, ec, job, pilotErrorDiag=job.pilotErrorDiag)
        tolog("Setup has finished successfully")

        # job has been updated, display it again
        job.displayJob()

        # (setup ends here) ................................................................................

        tolog("Setting stage-in state until all input files have been copied")
Пример #14
0
    def finishJob(self):
        """Finalize the HPC event job and report the final state to the pilot server.

        Steps, in order:
          1. ask the HPC manager to finish the job (best effort, errors are logged),
          2. remove any input files left in the work directory,
          3. fail the job if there are no event ranges, or if some ranges are
             still 'new' or 'stagedOut',
          4. prepare the output file dictionary and create the file metadata,
          5. move the payload metadata, read the payload exit info from the job
             report and let ErrorDiagnosis interpret it,
          6. set the job to "finished", update the pilot server and call sysExit().

        Each failure is reported through self.failJob().
        """
        import traceback  # local import: only used to log a failed HPC manager shutdown

        try:
            self.__hpcManager.finishJob()
        except Exception:
            # best effort: the error is logged and finalization continues
            tolog(sys.exc_info()[1])
            tolog(traceback.format_exc())

        # If payload leaves the input files, delete them explicitly
        if self.__job.inFiles:
            pUtil.removeFiles(self.__job.workdir, self.__job.inFiles)

        # early attempts are reported as recoverable
        errorCode = PilotErrors.ERR_UNKNOWN
        if self.__job.attemptNr < 4:
            errorCode = PilotErrors.ERR_ESRECOVERABLE

        if not self.__eventRanges:
            tolog("Cannot get event ranges")
            self.failJob(0,
                         errorCode,
                         self.__job,
                         pilotErrorDiag="Cannot get event ranges")

        # check whether all event ranges are handled
        # (materialize the values once; works for list- and view-returning dicts)
        range_states = list(self.__eventRanges.values())
        tolog("Total event ranges: %s" % len(self.__eventRanges))
        not_handled_events = range_states.count('new')
        tolog("Not handled events: %s" % not_handled_events)
        done_events = range_states.count('Done')
        tolog("Finished events: %s" % done_events)
        stagedOut_events = range_states.count('stagedOut')
        tolog("stagedOut but not updated to panda server events: %s" %
              stagedOut_events)
        if done_events + stagedOut_events:
            # at least part of the work was done, so report as recoverable
            errorCode = PilotErrors.ERR_ESRECOVERABLE
        if not_handled_events + stagedOut_events:
            tolog("Not all event ranges are handled. failed job")
            self.failJob(
                0,
                errorCode,
                self.__job,
                pilotErrorDiag="Not All events are handled(total:%s, left:%s)"
                % (len(self.__eventRanges),
                   not_handled_events + stagedOut_events))

        dsname, datasetDict = self.getDatasets()
        tolog("dsname = %s" % (dsname))
        tolog("datasetDict = %s" % (datasetDict))

        # Create the output file dictionary needed for generating the metadata
        ec, pilotErrorDiag, outs, outsDict = RunJobUtilities.prepareOutFiles(
            self.__job.outFiles,
            self.__job.logFile,
            self.__job.workdir,
            fullpath=True)
        if ec:
            # missing output file (only error code from prepareOutFiles)
            self.failJob(self.__job.result[1],
                         ec,
                         self.__job,
                         pilotErrorDiag=pilotErrorDiag)
        tolog("outsDict: %s" % str(outsDict))

        # Create metadata for all successfully staged-out output files (include the log file as well, even if it has not been created yet)
        ec, job, outputFileInfo = self.createFileMetadata(
            [], self.__job, outsDict, dsname, datasetDict,
            self.__jobSite.sitename)
        if ec:
            self.failJob(0, ec, job, pilotErrorDiag=job.pilotErrorDiag)

        # Rename the metadata produced by the payload
        self.moveTrfMetadata(self.__job.workdir, self.__job.jobId)

        # Check the job report for any exit code that should replace the default (0)
        res0, exitAcronym, exitMsg = self.getTrfExitInfo(0, self.__job.workdir)
        # NOTE(review): exitMsg is passed twice and exitAcronym is unused -- confirm
        res = (res0, exitMsg, exitMsg)

        # Payload error handling
        ed = ErrorDiagnosis()
        job = ed.interpretPayload(self.__job, res, False, 0,
                                  self.__runCommandList, self.getFailureCode())
        if job.result[1] != 0 or job.result[2] != 0:
            self.failJob(job.result[1],
                         job.result[2],
                         job,
                         pilotErrorDiag=job.pilotErrorDiag)
        self.__job = job

        job.jobState = "finished"
        job.setState([job.jobState, 0, 0])
        # NOTE(review): jobState is replaced by the whole job.result object
        # before the final server update -- confirm this is intended
        job.jobState = job.result
        RunJobUtilities.updatePilotServer(job,
                                          self.getPilotServer(),
                                          self.getPilotPort(),
                                          final=True)

        tolog("Done")
        self.sysExit(self.__job)
Пример #15
0
    def runHPCEvent(self):
        """Run the event payload on the HPC resource and stage out its outputs.

        Steps, in order:
          1. set the job to "running" and report it,
          2. read the default resources and prepare the HPC job (the job is
             failed and the method returns if preparation fails),
          3. create the HPCManager, report 'waitingResource' and then
             'gettingEvents',
          4. fetch event ranges (at most defRes['max_events']); return if none
             are available,
          5. set up stage-out, submit the HPC job and poll it until finished,
             queueing every produced output for stage-out,
          6. stage out the remaining outputs, retry failed stage-outs up to two
             times with fewer threads, and finally check the HPC job log.

        Results are kept on the instance (event range states, failed stage-outs,
        HPC status and log); nothing is returned.
        """
        tolog("runHPCEvent")
        self.__job.jobState = "running"
        self.__job.setState([self.__job.jobState, 0, 0])
        self.__job.pilotErrorDiag = None
        RunJobUtilities.updatePilotServer(self.__job,
                                          self.getPilotServer(),
                                          self.getPilotPort())
        self.__JR.updateJobStateTest(self.__job,
                                     self.__jobSite,
                                     self.__node,
                                     mode="test")

        defRes = self.getDefaultResources()
        # the resource value is compared against the string 'true'
        self.__copyInputFiles = (defRes['copy_input_files'] == 'true')

        status, output, hpcJob = self.prepareHPCJob()
        if status != 0:
            tolog("failed to create the Tag file")
            self.failJob(0,
                         PilotErrors.ERR_UNKNOWN,
                         self.__job,
                         pilotErrorDiag=output)
            return
        tolog("HPC Job: %s " % hpcJob)

        self.__hpcStatus = None
        self.__hpcLog = None

        # an empty pilot log file name means "no log file" for the HPC manager
        pilotLogFilename = self.getPilotLogFilename()
        tolog("runJobHPCEvent.getPilotLogFilename=%s" % pilotLogFilename)
        logFileName = None
        if pilotLogFilename != "":
            logFileName = pilotLogFilename
        hpcManager = HPCManager(globalWorkingDir=self.__job.workdir,
                                logFileName=logFileName,
                                poolFileCatalog=self.__poolFileCatalogTemp,
                                inputFiles=self.__inputFilesGlobal,
                                copyInputFiles=self.__copyInputFiles)

        self.__hpcManager = hpcManager
        self.HPCMode = "HPC_" + hpcManager.getMode(defRes)
        self.__job.setMode(self.HPCMode)
        self.__job.setHpcStatus('waitingResource')
        RunJobUtilities.updatePilotServer(self.__job,
                                          self.getPilotServer(),
                                          self.getPilotPort())
        # NOTE(review): 25443 is presumably the PanDA server port -- confirm
        self.__JR.updatePandaServer(self.__job, self.__jobSite, self.__node,
                                    25443)

        hpcManager.getFreeResources(defRes)
        self.__job.coreCount = hpcManager.getCoreCount()
        self.__job.setHpcStatus('gettingEvents')
        RunJobUtilities.updatePilotServer(self.__job,
                                          self.getPilotServer(),
                                          self.getPilotPort())
        self.__JR.updatePandaServer(self.__job, self.__jobSite, self.__node,
                                    25443)

        numRanges = hpcManager.getEventsNumber()
        tolog(
            "HPC Manager needs events: %s, max_events: %s; use the smallest one."
            % (numRanges, defRes['max_events']))
        if numRanges > int(defRes['max_events']):
            numRanges = int(defRes['max_events'])
        eventRanges = self.getEventRanges(numRanges=numRanges)
        if not eventRanges:
            tolog("Get no Event ranges. return")
            return
        for eventRange in eventRanges:
            self.__eventRanges[eventRange['eventRangeID']] = 'new'

        # setup stage out
        self.setupStageOutHPCEvent()

        hpcManager.initJob(hpcJob)
        hpcManager.initEventRanges(eventRanges)

        hpcManager.submit()
        threadpool = ThreadPool(defRes['stageout_threads'])

        old_state = None
        time_start = time.time()
        while not hpcManager.isFinished():
            state = hpcManager.poll()
            self.__job.setHpcStatus(state)
            # report on every state change, and at least every 10 minutes
            if old_state is None or old_state != state or time.time() > (
                    time_start + 60 * 10):
                old_state = state
                time_start = time.time()
                tolog("HPCManager Job stat: %s" % state)
                self.__JR.updateJobStateTest(self.__job,
                                             self.__jobSite,
                                             self.__node,
                                             mode="test")
                RunJobUtilities.updatePilotServer(self.__job,
                                                  self.getPilotServer(),
                                                  self.getPilotPort())
                self.__JR.updatePandaServer(self.__job, self.__jobSite,
                                            self.__node, 25443)

            if state == 'Complete':
                break
            # stage out what has been produced so far, in the background
            for output in hpcManager.getOutputs():
                threadpool.add_task(self.stageOutHPCEvent, output)

            time.sleep(30)
            self.updateHPCEventRanges()

        tolog("HPCManager Job Finished")
        self.__job.setHpcStatus('stagingOut')
        RunJobUtilities.updatePilotServer(self.__job,
                                          self.getPilotServer(),
                                          self.getPilotPort())
        self.__JR.updatePandaServer(self.__job, self.__jobSite, self.__node,
                                    25443)

        # stage out whatever is left after the HPC job ended
        for output in hpcManager.getOutputs():
            threadpool.add_task(self.stageOutHPCEvent, output)

        self.updateHPCEventRanges()
        threadpool.wait_completion()
        self.updateHPCEventRanges()

        # retry failed stage-outs twice: first with half of the configured
        # threads (at least one), then with a single thread
        for attempt in (1, 2):
            if len(self.__failedStageOuts) == 0:
                break
            tolog("HPC Stage out retry %s" % attempt)
            if attempt == 1:
                # int() + floor division keep the pool size an integer even if
                # the configured value is a numeric string
                pool_size = max(int(defRes['stageout_threads']) // 2, 1)
            else:
                pool_size = 1
            threadpool = ThreadPool(pool_size)
            # take ownership of the current failures; stageOutHPCEvent may
            # record new ones while the retry is running
            failedStageOuts = self.__failedStageOuts
            self.__failedStageOuts = []
            for failedStageOut in failedStageOuts:
                threadpool.add_task(self.stageOutHPCEvent, failedStageOut)
            threadpool.wait_completion()
            self.updateHPCEventRanges()

        self.__job.setHpcStatus('finished')
        self.__JR.updatePandaServer(self.__job, self.__jobSite, self.__node,
                                    25443)
        self.__hpcStatus, self.__hpcLog = hpcManager.checkHPCJobLog()
        tolog("HPC job log status: %s, job log error: %s" %
              (self.__hpcStatus, self.__hpcLog))
Пример #16
0
        # Fragment: log the job type, report the "setup" state to the pilot
        # server and run the setup step that produces the run command list.
        if analysisJob:
            tolog("User analysis job")
        else:
            tolog("Production job")
        tolog("runJob received a job with prodSourceLabel=%s" %
              (job.prodSourceLabel))

        # setup starts here ................................................................................

        # mark the job as being in the setup phase (the job state file update
        # below is currently disabled)
        job.jobState = "setup"
        #_retjs = JR.updateJobStateTest(job, jobSite, node, mode="test")

        # send [especially] the process group back to the pilot
        job.setState([job.jobState, 0, 0])
        rt = RunJobUtilities.updatePilotServer(job, runJob.getPilotServer(),
                                               runJob.getPilotPort())

        # prepare the setup and get the run command list
        ec, runCommandList, job, multi_trf = runJob.setup(
            job, jobSite, thisExperiment)
        if ec != 0:
            tolog("!!WARNING!!2999!! runJob setup failed: %s" %
                  (job.pilotErrorDiag))
            # NOTE(review): the success message below is still logged unless
            # failJob() terminates the process -- confirm that it does
            runJob.failJob(0, ec, job, pilotErrorDiag=job.pilotErrorDiag)
        tolog("Setup has finished successfully")

        # job has been updated, display it again
        job.displayJob()

        # (setup ends here) ................................................................................
Пример #17
0
        # Fragment: classify the job, report the "setup" state to the pilot
        # server, run the setup step and switch the job to the "stagein" state.

        # the job type is decided from the first entry of the comma-separated trf list
        analysisJob = isAnalysisJob(job.trf.split(",")[0])
        if analysisJob:
            tolog("User analysis job")
        else:
            tolog("Production job")
        tolog("runJob received a job with prodSourceLabel=%s" % (job.prodSourceLabel))

        # setup starts here ................................................................................

        # mark the job as being in the setup phase (the job state file update
        # below is currently disabled)
        job.jobState = "setup"
        #_retjs = JR.updateJobStateTest(job, jobSite, node, mode="test")

        # send [especially] the process group back to the pilot
        job.setState([job.jobState, 0, 0])
        rt = RunJobUtilities.updatePilotServer(job, runJob.getPilotServer(), runJob.getPilotPort())

        # prepare the setup and get the run command list
        ec, runCommandList, job, multi_trf = runJob.setup(job, jobSite, thisExperiment)
        if ec != 0:
            tolog("!!WARNING!!2999!! runJob setup failed: %s" % (job.pilotErrorDiag))
            # NOTE(review): the success message below is still logged unless
            # failJob() terminates the process -- confirm that it does
            runJob.failJob(0, ec, job, pilotErrorDiag=job.pilotErrorDiag)
        tolog("Setup has finished successfully")

        # job has been updated, display it again
        job.displayJob()

        # (setup ends here) ................................................................................

        tolog("Setting stage-in state until all input files have been copied")
        job.setState(["stagein", 0, 0])
Пример #18
0
        analysisJob = isAnalysisJob(job.trf.split(",")[0])
        if analysisJob:
            tolog("User analysis job")
        else:
            tolog("Production job")
        tolog("runJobArgo received a job with prodSourceLabel=%s" % (job.prodSourceLabel))

        # setup starts here ................................................................................

        # update the job state file
        job.jobState = "setup"
        #_retjs = JR.updateJobStateTest(job, jobSite, node, mode="test")

        # send [especially] the process group back to the pilot
        job.setState([job.jobState, 0, 0])
        rt = RunJobUtilities.updatePilotServer(job, runJob.getPilotServer(), runJob.getPilotPort())

        # prepare the setup and get the run command list
        ec, job = runJob.setup(job, jobSite, thisExperiment)
        if ec != 0:
            tolog("!!WARNING!!2999!! runJob setup failed: %s" % (job.pilotErrorDiag))
            runJob.failJob(0, ec, job, pilotErrorDiag=job.pilotErrorDiag)
        tolog("Setup has finished successfully")

        # job has been updated, display it again
        job.displayJob()

        # (setup ends here) ................................................................................

        tolog("Setting stage-in state until all input files have been copied")
        job.setState(["stagein", 0, 0])