예제 #1
0
파일: Jedi.py 프로젝트: ganga-devs/ganga
    def master_resubmit(self,jobs):
        '''Resubmit failed Jedi tasks.

        For every job in ``jobs`` the Jedi task identified by
        ``job.backend.id`` is looked up through the Panda client and, when
        its status allows it, retried with ``Client.retryTask``. After a
        successful retry the cached backend status and job spec are cleared
        and the job is put back into the 'submitted' state.

        Returns True when every task was retried. Returns False as soon as
        one task is not in a retryable state or its retry request fails;
        tasks processed before that point remain resubmitted.

        Raises BackendError when the task details cannot be retrieved.
        '''
        from pandatools import Client

        # Jedi task states for which a retry is attempted
        retryable_status = ['failed', 'killed', 'cancelled', 'aborted', 'broken', 'finished']

        # Map Jedi task ID -> ganga job
        jobIDs = {}
        for job in jobs:
            jobIDs[job.backend.id] = job

        for jID in list(jobIDs):
            with inject_proxy(self.credential_requirements):
                status, jediTaskDict = Client.getJediTaskDetails({'jediTaskID': jID},False,True,verbose=False)
            if status != 0:
                logger.error("Failed to get task details for %s" % jID)
                raise BackendError('Jedi','Return code %d retrieving job status information.' % status)

            # Retrieve the job through the task ID reported back by the server
            job = jobIDs[jediTaskDict['jediTaskID']]

            if jediTaskDict['status'] not in retryable_status:
                logger.warning("Cannot resubmit. Jedi task %s is status %s." %(jID, jediTaskDict['status'] ))
                return False

            # submit
            with inject_proxy(self.credential_requirements):
                status,out = Client.retryTask(jID, verbose=False)
            if status != 0:
                logger.error(status)
                logger.error(out)
                logger.error("Failed to retry JobID=%s" % jID)
                return False
            # the payload is a (success flag, diagnostic message) pair
            tmpStat,tmpDiag = out
            if not tmpStat:
                logger.error(tmpDiag)
                logger.error("Failed to retry JobID=%s" % jID)
                return False
            logger.info(tmpDiag)

            # drop the cached backend state before marking the job as submitted again
            job.backend.status = None
            job.backend.jobSpec = {}
            job.updateStatus('submitted')

        logger.info('Resubmission successful')
        return True
예제 #2
0
def ELG_jediState(sample) :
    """Return the JEDI task status string recorded for ``sample``.

    The task ID is read from the sample's ``nc_jediTaskID`` meta-data field
    and looked up with ``pandatools.Client.getJediTaskDetails``.

    Returns the ``status`` entry of the task details. Returns an empty
    string (after printing a message) when the installed panda client is
    too old, the sample carries no usable task ID, or the lookup reports a
    non-zero return code.
    """
    from pandatools import PandaToolsPkgInfo
    # NOTE(review): only element [2] of release_version is inspected --
    # presumably the minor digit of a string such as "0.4.x"; confirm format.
    if int(float(PandaToolsPkgInfo.release_version[2])) < 4 :
        # single-argument print(...) behaves identically on Python 2 and 3
        print("Need prun with JEDI support, try:")
        print("    localSetupPandaClient currentJedi --noAthenaCheck")
        return ''

    jediTaskID = int(sample.getMetaDouble("nc_jediTaskID", 0))

    # IDs below 100 (including the default 0) are treated as "no task attached"
    if jediTaskID < 100 :
        print("Sample " + sample.name() + " does not have a jediTaskID")
        return ''

    from pandatools import Client

    ret = Client.getJediTaskDetails({'jediTaskID': jediTaskID}, False, True)
    # ret[0] is the return code, ret[1] the task details dictionary
    if ret[0] != 0 :
        print("Problem checking status of task %s with id %s" % (sample.name(), jediTaskID))
        return ''

    return ret[1]['status']
예제 #3
0
 def status(self,JobID,forceUpdate=False):
     """Return the local job record for JobID, refreshing it first if needed.

     The record is read from the local repository. Unless its dbStatus is
     'frozen' (and forceUpdate is false) it is refreshed, either from the
     Panda server for classic jobs or from the JEDI task details for JEDI
     jobs, and written back to the local repository.

     JobID       -- identifier passed to getJobInfo
     forceUpdate -- refresh even when the record is frozen

     Returns the (possibly refreshed) job record, or None when the job is
     unknown locally or any refresh step fails.
     """
     # get logger
     tmpLog = PLogger.getPandaLogger()
     # check proxy
     self.gridPassPhrase,self.vomsFQAN = PsubUtils.checkGridProxy(
             self.gridPassPhrase,
             False,
             self.verbose,
             useCache=True)
     # get job info from local repository
     job = self.getJobInfo(JobID)
     if job is None:
         # not found
         return None
     # frozen records are returned as they are unless an update is forced
     if job.dbStatus == 'frozen' and not forceUpdate:
         return job
     if not job.isJEDI():
         tmpLog.info("Getting status for JobID=%s ..." % JobID)
         # get status from Panda server
         status,pandaIDstatus = Client.getPandIDsWithJobID(JobID,verbose=self.verbose)
         if status != 0:
             tmpLog.error("Failed to get status for ID=%s" % JobID)
             return None
         # get one job to set computingSite which may have changed due to rebrokerage
         pandaJob = None
         if pandaIDstatus != {}:
             # sorted() works on both list and view results of keys()
             tmpPandaIDs = sorted(pandaIDstatus.keys())
             status,tmpPandaJobs = Client.getFullJobStatus(
                     tmpPandaIDs[:1],
                     verbose=self.verbose)
             if status != 0:
                 tmpLog.error("Failed to get PandaJobs for %s" % JobID)
                 return None
             pandaJob = tmpPandaJobs[0]
         # convert to local job spec
         job = PdbUtils.convertPtoD([],pandaIDstatus,job,pandaJobForSiteID=pandaJob)
         # check merge job generation
         status = self.setMergeJobStatus(job,forceUpdate)
         if not status:
             return None
     else:
         tmpLog.info("Getting status for TaskID=%s ..." % job.jediTaskID)
         # get JEDI task
         status,jediTaskDict = Client.getJediTaskDetails(
                 {'jediTaskID':job.jediTaskID},
                 False,
                 True,
                 verbose=self.verbose)
         if status != 0:
             tmpLog.error("Failed to get task details for %s" % JobID)
             return None
         # convert JEDI task
         job = PdbUtils.convertJTtoD(jediTaskDict,job)
     # update DB
     try:
         PdbUtils.updateJobDB(job,self.verbose)
     except Exception:
         # Exception (not a bare except) so Ctrl-C / SystemExit still propagate
         tmpLog.error("Failed to update local repository for JobID=%s" % JobID)
         return None
     if not job.isJEDI():
         tmpLog.info("Updated JobID=%s" % JobID)
     else:
         tmpLog.info("Updated TaskID=%s ..." % job.jediTaskID)
     # return
     return job
예제 #4
0
파일: Jedi.py 프로젝트: MannyMoo/ganga
    def master_updateMonitoringInformation(jobs):
        '''Monitor Jedi tasks and the Panda jobs attached to them.

        jobs -- ganga jobs to check. Only jobs that have a backend id, whose
                backend status is in the active list and that were submitted
                more than 120 seconds ago are queried.

        For each such job the Jedi task details are fetched, the output
        dataset list and the ganga job status are updated, and the
        associated Panda jobs are either retrieved (none attached yet) or
        refreshed (already attached).

        Raises BackendError when the status of already attached Panda jobs
        cannot be retrieved. A failed task-details lookup is only logged and
        that task is skipped.

        NOTE(review): defined without ``self``; presumably wrapped as a
        static method outside the code shown here -- confirm.
        '''
        from pandatools import Client

        # Jedi task states that are still polled (None: no backend status recorded)
        active_status = [ None, 'registered', 'waiting', 'defined', 'pending', 'assigning', 'ready', 'scouting', 'running', 'holding', 'merging', 'prepared', 'aborting', 'finishing' ]

        # Find jobs to be monitored
        jobdict = {}
        for job in jobs:
            if not job.backend.id or job.backend.status not in active_status:
                continue
            # add a delay as Panda can be a little slow in sorting out a new Task.
            # total_seconds() is needed: timedelta.seconds holds only the
            # sub-day remainder, which would wrongly skip tasks that are
            # N days plus less than two minutes old.
            age = datetime.datetime.utcnow() - job.time.timestamps["submitted"]
            if age.total_seconds() > 120:
                jobdict[job.backend.id] = job

        logger.debug("jobdict = %s" %jobdict)

        # Monitor active Jedi tasks
        pandaJobIDs = {}
        for jID in list(jobdict):
            status, jediTaskDict = Client.getJediTaskDetails({'jediTaskID': jID},False,True,verbose=False)
            if status != 0:
                # only log, so one failing lookup does not stop the other tasks
                logger.error("Failed to get task details for %s" % jID)
                continue
            # Retrieve job
            job = jobdict[jediTaskDict['jediTaskID']]
            # Store associated Panda jobs
            if job.backend.pandajobs:
                pandaJobIDs[job.backend.id] = [pj.id for pj in job.backend.pandajobs]
            else:
                pandaJobIDs[jediTaskDict['jediTaskID']] = jediTaskDict['PandaID']
            logger.debug("jID = %s, pandaJobIDs = %s" % (jID, pandaJobIDs))

            # Fill the output data dataset list
            if 'outDS' in jediTaskDict and jediTaskDict['outDS'] != '':
                for ds in jediTaskDict['outDS'].split(','):
                    if ds not in job.outputdata.datasetList:
                        job.outputdata.datasetList.append(ds)

            # Jedi job status has changed
            taskStatus = jediTaskDict['status']
            if job.backend.status != taskStatus:
                logger.debug('Job %s has changed status from %s to %s',job.getFQID('.'),job.backend.status, taskStatus)
                job.backend.status = taskStatus
                job.backend.reason = jediTaskDict['statistics']

                # Now update Jedi job status
                if taskStatus in ['registered', 'waiting', 'defined', 'pending', 'assigning', 'ready']:
                    job.updateStatus('submitted')
                elif taskStatus in ['scouting', 'running', 'holding', 'merging', 'prepared' ]:
                    job.updateStatus('running')
                elif taskStatus in ['done']:
                    job.updateStatus('completed')
                elif taskStatus in ['failed', 'finished']:
                    job.updateStatus('failed')
                elif taskStatus in [ 'aborted', 'broken', 'cancelled' ] and job.status not in ['completed','failed']:
                    job.updateStatus('killed')
                else:
                    logger.warning('Unexpected Jedi task status %s', taskStatus)

            # Check if associated Panda job exist and monitor them
            if not job.backend.pandajobs:
                jdefids = pandaJobIDs[jID]
                # skip if there are no Panda jobs yet
                if not jdefids:
                    continue

                ick,status,num_mjobs = retrievePandaJobs(job, jdefids)
                logger.debug('retrievePandaJobs returns: %s %s' % (repr(ick),status))
                if not ick:
                    logger.debug('Panda job retrival failure for Jedi task %s with PandaIds %s' % (job.backend.id, jdefids))

                logger.debug('Job %s retrieved %d Panda jobs' % (job.getFQID('.'),num_mjobs) )
            # Now monitor the already attached Panda jobs
            else:
                jdefids = [ pj.id for pj in job.backend.pandajobs ]
                rc, jobsStatus = Client.getFullJobStatus(jdefids,False)
                if rc:
                    logger.error('Return code %d retrieving job status information.',rc)
                    raise BackendError('Jedi','Return code %d retrieving job status information.' % rc)

                for status in jobsStatus:
                    if not status: continue

                    for pjob in job.backend.pandajobs:
                        # only the matching Panda job needs work, and only if its status changed
                        if pjob.id != status.PandaID or pjob.status == status.jobStatus:
                            continue
                        # Else update job record
                        pjob.jobSpec = dict(zip(status._attributes,status.values()))

                        # store everything that is not a plain str/int as its string form
                        for k in pjob.jobSpec.keys():
                            if type(pjob.jobSpec[k]) not in [type(''),type(1)]:
                                pjob.jobSpec[k]=str(pjob.jobSpec[k])

                        logger.debug('Job %s with Panda job %s has changed status from %s to %s',job.getFQID('.'),pjob.id, pjob.status,status.jobStatus)
                        pjob.status = status.jobStatus
                        pjob.exitcode = str(status.transExitCode)
                        pjob.piloterrorcode = str(status.pilotErrorCode)
                        # reason: every non-NULL *ErrorDiag field followed by the transformation exit code
                        pjob.reason = ''
                        for k in pjob.jobSpec.keys():
                            if k.endswith('ErrorDiag') and pjob.jobSpec[k]!='NULL':
                                pjob.reason += '%s: %s, '%(k,str(pjob.jobSpec[k]))
                        pjob.reason += 'transExitCode: %s'%pjob.jobSpec['transExitCode']

                        if status.jobStatus in ['defined','unknown','assigned','waiting','activated','sent']:
                            logger.debug('Panda job %s %s' % (pjob.id, status.jobStatus))
                        elif status.jobStatus in ['starting','running','holding','transferring', 'merging', 'finished']:
                            logger.debug('Panda job %s %s '% (pjob.id, status.jobStatus))
                        elif status.jobStatus == 'failed':
                            logger.debug('Panda job %s %s '% (pjob.id, status.jobStatus))
                            # check for server side retry
                            if 'taskBufferErrorDiag' in pjob.jobSpec and "PandaID=" in pjob.jobSpec['taskBufferErrorDiag']:
                                # grab the new panda ID and follow it; status None marks it as not yet polled
                                newPandaID = long(pjob.jobSpec['taskBufferErrorDiag'].split("=")[1])
                                pjob.id = newPandaID
                                pjob.status = None
                                pjob.url = 'http://panda.cern.ch/?job=%d'%newPandaID
                        # NOTE(review): pjob.status was assigned status.jobStatus above, so the
                        # second test looks always true at this point -- confirm intent (bug 67716)
                        elif status.jobStatus == 'cancelled' and pjob.status not in ['completed','failed']:
                            logger.debug('Panda job %s cancelled'%pjob.id)
                            if 'taskBufferErrorDiag' in pjob.jobSpec and "rebrokerage" in pjob.jobSpec['taskBufferErrorDiag']:
                                newPandaID = checkForRebrokerage(pjob.jobSpec['taskBufferErrorDiag'])
                                logger.warning("Subjob rebrokered by Panda server. Job %d moved to %d."%(pjob.id, newPandaID))
                                pjob.id = newPandaID
                                pjob.status = None
                        else:
                            logger.warning('Unexpected job status %s',status.jobStatus)
예제 #5
0
    def master_resubmit(self, jobs):
        '''Resubmit failed Jedi tasks.

        Each job in ``jobs`` is matched to its Jedi task through
        ``job.backend.id``. The task details are fetched and, if the task is
        in a retryable state, ``Client.retryTask`` is called for it. On
        success the job's cached backend status and job spec are cleared and
        the job goes back to 'submitted'.

        Returns True if all tasks were retried, False at the first task that
        is not retryable or whose retry fails (earlier tasks in the loop
        stay resubmitted).

        Raises BackendError if the task details cannot be retrieved.
        '''
        from pandatools import Client

        # Jedi task states for which a retry is attempted
        retryable_status = [
            'failed', 'killed', 'cancelled', 'aborted', 'broken', 'finished'
        ]

        # Map Jedi task ID -> ganga job
        jobIDs = {}
        for job in jobs:
            jobIDs[job.backend.id] = job

        for jID in list(jobIDs):
            status, jediTaskDict = Client.getJediTaskDetails(
                {'jediTaskID': jID}, False, True, verbose=False)
            if status != 0:
                logger.error("Failed to get task details for %s" % jID)
                raise BackendError(
                    'Jedi',
                    'Return code %d retrieving job status information.' %
                    status)

            # Retrieve the job through the task ID reported back by the server
            job = jobIDs[jediTaskDict['jediTaskID']]

            if jediTaskDict['status'] not in retryable_status:
                logger.warning("Cannot resubmit. Jedi task %s is status %s." %
                               (jID, jediTaskDict['status']))
                return False

            # submit
            status, out = Client.retryTask(jID, verbose=False)
            if status != 0:
                logger.error(status)
                logger.error(out)
                logger.error("Failed to retry JobID=%s" % jID)
                return False
            # the payload is a (success flag, diagnostic message) pair
            tmpStat, tmpDiag = out
            if not tmpStat:
                logger.error(tmpDiag)
                logger.error("Failed to retry JobID=%s" % jID)
                return False
            logger.info(tmpDiag)

            # drop the cached backend state before marking the job as submitted again
            job.backend.status = None
            job.backend.jobSpec = {}
            job.updateStatus('submitted')

        logger.info('Resubmission successful')
        return True
예제 #6
0
    def master_updateMonitoringInformation(jobs):
        '''Poll Jedi tasks and keep the ganga jobs and Panda jobs in sync.

        jobs -- ganga jobs to check. A job is polled only if it has a
                backend id, its backend status is in the active list and it
                was submitted more than 120 seconds ago.

        For every polled job the Jedi task details are fetched, new output
        datasets are appended, the ganga job status is updated on a task
        status change, and the Panda jobs are retrieved (when none are
        attached yet) or refreshed (when already attached).

        Raises BackendError if the status of attached Panda jobs cannot be
        retrieved. A failing task-details lookup is logged and skipped.

        NOTE(review): defined without ``self``; presumably wrapped as a
        static method outside the code shown here -- confirm.
        '''
        from pandatools import Client

        # Jedi task states that are still polled (None: no backend status recorded)
        active_status = [
            None, 'registered', 'waiting', 'defined', 'pending', 'assigning',
            'ready', 'scouting', 'running', 'holding', 'merging', 'prepared',
            'aborting', 'finishing'
        ]

        # Find jobs to be monitored
        jobdict = {}
        for job in jobs:
            if not job.backend.id or job.backend.status not in active_status:
                continue
            # add a delay as Panda can be a little slow in sorting out a new Task.
            # total_seconds() is needed: timedelta.seconds holds only the
            # sub-day remainder, which would wrongly skip tasks that are
            # N days plus less than two minutes old.
            age = (datetime.datetime.utcnow() -
                   job.time.timestamps["submitted"])
            if age.total_seconds() > 120:
                jobdict[job.backend.id] = job

        logger.debug("jobdict = %s" % jobdict)

        # Monitor active Jedi tasks
        pandaJobIDs = {}
        for jID in list(jobdict):
            status, jediTaskDict = Client.getJediTaskDetails(
                {'jediTaskID': jID}, False, True, verbose=False)
            if status != 0:
                # only log, so one failing lookup does not stop the other tasks
                logger.error("Failed to get task details for %s" % jID)
                continue
            # Retrieve job
            job = jobdict[jediTaskDict['jediTaskID']]
            # Store associated Panda jobs
            if job.backend.pandajobs:
                pandaJobIDs[job.backend.id] = [
                    pj.id for pj in job.backend.pandajobs
                ]
            else:
                pandaJobIDs[
                    jediTaskDict['jediTaskID']] = jediTaskDict['PandaID']
            logger.debug("jID = %s, pandaJobIDs = %s" % (jID, pandaJobIDs))

            # Fill the output data dataset list
            if 'outDS' in jediTaskDict and jediTaskDict['outDS'] != '':
                for ds in jediTaskDict['outDS'].split(','):
                    if ds not in job.outputdata.datasetList:
                        job.outputdata.datasetList.append(ds)

            # Jedi job status has changed
            taskStatus = jediTaskDict['status']
            if job.backend.status != taskStatus:
                logger.debug('Job %s has changed status from %s to %s',
                             job.getFQID('.'), job.backend.status,
                             taskStatus)
                job.backend.status = taskStatus
                job.backend.reason = jediTaskDict['statistics']

                # Now update Jedi job status
                if taskStatus in [
                        'registered', 'waiting', 'defined', 'pending',
                        'assigning', 'ready'
                ]:
                    job.updateStatus('submitted')
                elif taskStatus in [
                        'scouting', 'running', 'holding', 'merging', 'prepared'
                ]:
                    job.updateStatus('running')
                elif taskStatus in ['done']:
                    job.updateStatus('completed')
                elif taskStatus in ['failed', 'finished']:
                    job.updateStatus('failed')
                elif taskStatus in [
                        'aborted', 'broken', 'cancelled'
                ] and job.status not in ['completed', 'failed']:
                    job.updateStatus('killed')
                else:
                    logger.warning('Unexpected Jedi task status %s',
                                   taskStatus)

            # Check if associated Panda job exist and monitor them
            if not job.backend.pandajobs:
                jdefids = pandaJobIDs[jID]
                # skip if there are no Panda jobs yet
                if not jdefids:
                    continue

                ick, status, num_mjobs = retrievePandaJobs(job, jdefids)
                logger.debug('retrievePandaJobs returns: %s %s' %
                             (repr(ick), status))
                if not ick:
                    logger.debug(
                        'Panda job retrival failure for Jedi task %s with PandaIds %s'
                        % (job.backend.id, jdefids))

                logger.debug('Job %s retrieved %d Panda jobs' %
                             (job.getFQID('.'), num_mjobs))
            # Now monitor the already attached Panda jobs
            else:
                jdefids = [pj.id for pj in job.backend.pandajobs]
                rc, jobsStatus = Client.getFullJobStatus(jdefids, False)
                if rc:
                    logger.error(
                        'Return code %d retrieving job status information.',
                        rc)
                    raise BackendError(
                        'Jedi',
                        'Return code %d retrieving job status information.' %
                        rc)

                for status in jobsStatus:
                    if not status: continue

                    for pjob in job.backend.pandajobs:
                        # only the matching Panda job needs work, and only
                        # if its status changed
                        if (pjob.id != status.PandaID
                                or pjob.status == status.jobStatus):
                            continue
                        # Else update job record
                        pjob.jobSpec = dict(
                            zip(status._attributes, status.values()))

                        # store everything that is not a plain str/int as
                        # its string form
                        for k in pjob.jobSpec.keys():
                            if type(pjob.jobSpec[k]) not in [
                                    type(''), type(1)
                            ]:
                                pjob.jobSpec[k] = str(pjob.jobSpec[k])

                        logger.debug(
                            'Job %s with Panda job %s has changed status from %s to %s',
                            job.getFQID('.'), pjob.id, pjob.status,
                            status.jobStatus)
                        pjob.status = status.jobStatus
                        pjob.exitcode = str(status.transExitCode)
                        pjob.piloterrorcode = str(status.pilotErrorCode)
                        # reason: every non-NULL *ErrorDiag field followed
                        # by the transformation exit code
                        pjob.reason = ''
                        for k in pjob.jobSpec.keys():
                            if k.endswith(
                                    'ErrorDiag') and pjob.jobSpec[k] != 'NULL':
                                pjob.reason += '%s: %s, ' % (
                                    k, str(pjob.jobSpec[k]))
                        pjob.reason += 'transExitCode: %s' % pjob.jobSpec[
                            'transExitCode']

                        if status.jobStatus in [
                                'defined', 'unknown', 'assigned', 'waiting',
                                'activated', 'sent'
                        ]:
                            logger.debug('Panda job %s %s' %
                                         (pjob.id, status.jobStatus))
                        elif status.jobStatus in [
                                'starting', 'running', 'holding',
                                'transferring', 'merging', 'finished'
                        ]:
                            logger.debug('Panda job %s %s ' %
                                         (pjob.id, status.jobStatus))
                        elif status.jobStatus == 'failed':
                            logger.debug('Panda job %s %s ' %
                                         (pjob.id, status.jobStatus))
                            # check for server side retry
                            if ('taskBufferErrorDiag' in pjob.jobSpec
                                    and "PandaID=" in
                                    pjob.jobSpec['taskBufferErrorDiag']):
                                # grab the new panda ID and follow it;
                                # status None marks it as not yet polled
                                newPandaID = long(
                                    pjob.jobSpec['taskBufferErrorDiag'].
                                    split("=")[1])
                                pjob.id = newPandaID
                                pjob.status = None
                                pjob.url = 'http://panda.cern.ch/?job=%d' % newPandaID
                        # NOTE(review): pjob.status was assigned
                        # status.jobStatus above, so the second test looks
                        # always true at this point -- confirm intent
                        # (bug 67716)
                        elif status.jobStatus == 'cancelled' and pjob.status not in [
                                'completed', 'failed'
                        ]:
                            logger.debug('Panda job %s cancelled' % pjob.id)
                            if ('taskBufferErrorDiag' in pjob.jobSpec
                                    and "rebrokerage" in
                                    pjob.jobSpec['taskBufferErrorDiag']):
                                newPandaID = checkForRebrokerage(
                                    pjob.jobSpec['taskBufferErrorDiag'])
                                logger.warning(
                                    "Subjob rebrokered by Panda server. Job %d moved to %d."
                                    % (pjob.id, newPandaID))
                                pjob.id = newPandaID
                                pjob.status = None
                        else:
                            logger.warning('Unexpected job status %s',
                                           status.jobStatus)
예제 #7
0
    if job_info is not None:
        # if job_info.Files and len(job_info.Files) > 0:
        print(job_info)
        print(job_info.attemptNr)
        print(job_info.maxAttempt)
        print(job_info.Files)
        print(job_info.Files[0])
        for f in job_info.Files:
            # print(dir(f))
            print(f._attributes)
            print(f.values())
            print(f.type)

jediTaskID = 3885
ret = Client.getJediTaskDetails({'jediTaskID': jediTaskID},
                                True,
                                True,
                                verbose=False)
print(ret)

ret = Client.getTaskStatus(jediTaskID, verbose=False)
print(ret)
"""
sys.exit(0)

jediTaskID = 998
ret = Client.getPandaIDsWithTaskID(jediTaskID, verbose=False)
# print(ret)
jobids = ret[1]
# print(jobids)

ret = Client.getJobStatus(ids=jobids, verbose=False)