Example #1
    def updateEvents(self, jobs):
        """
        Handle event service updates for finished jobs
        TOFIX for pilot2
        """
        tlist = []
        for j in jobs:
            eventrangestoupdate = []

            if j['actpandastatus'] == 'finished' \
              and 'plugin=arc' in self.sites[j['siteName']]['catchall'] \
              and re.search('eventService=True', j['pandajob']):

                # Check if we are running in harvester mode
                try:
                    smeta = json.loads(str(j['metadata']))
                    harvesteraccesspoint = smeta.get('harvesteraccesspoint')
                except Exception:
                    harvesteraccesspoint = None

                if not harvesteraccesspoint and j['sendhb'] == 0:
                    continue

                if not j['eventranges'] or j['eventranges'] == '[]':
                    fname = os.path.join(self.tmpdir, "pickle", "%d.pickle" % j['pandaid'])
                    if not os.path.exists(fname):
                        # Jobs which were never submitted should have substatus pilot_noevents so they go to closed
                        # Assume only ARC sites (not condor) run NG-mode ES
                        if j['arcjobid'] == -1 or j['arcjobid'] is None:
                            substatus = 'pilot_noevents'
                            self.log.info('%s: Job did not run and has no eventranges to update, marking pilot_noevents' % j['pandaid'])
                        # Jobs which ran but produced no events have pilot_failed so they go to failed
                        else:
                            substatus = 'pilot_failed'
                            self.log.info('%s: Job ran but has no eventranges to update, marking failed' % j['pandaid'])
                        jobinfo = aCTPandaJob({'jobId': j['pandaid'], 'state': 'closed', 'jobSubStatus': substatus})
                        # Create the empty pickle so that heartbeat code below doesn't fail
                        if harvesteraccesspoint:
                            jobinfo.writeToFile(os.path.join(harvesteraccesspoint, 'jobReport.json'))
                        else:
                            jobinfo.writeToFile(fname)
                    continue

                # If zip is used we need to first send transferring heartbeat
                # with jobMetrics containing the zip file
                # In harvester mode harvester does this itself?
                if 'es_to_zip' in self.sites[j['siteName']]['catchall'] and not harvesteraccesspoint:
                    try:
                        # Load pickled information from pilot
                        fname = os.path.join(self.tmpdir, "pickle", "%d.pickle" % j['pandaid'])
                        jobinfo = aCTPandaJob(filename=fname)
                        jobmetrics = {'jobMetrics': getattr(jobinfo, 'jobMetrics', '')}
                        self.log.info('%s: Sending jobMetrics and transferring state: %s' % (j['pandaid'], jobmetrics))
                    except Exception as x:
                        self.log.error('%s: No pickle info found: %s' % (j['pandaid'], x))
                    else:
                        t = PandaThr(self.getPanda(j['siteName']).updateStatus, j['pandaid'], 'transferring', jobmetrics)
                        aCTUtils.RunThreadsSplit([t], self.nthreads)
                        # If update fails panda won't see the zip and events
                        # will be rescheduled to another job
                        if t.result is None or 'StatusCode' not in t.result:
                            # Strange response from panda
                            continue
                        if t.result['StatusCode'][0] == '60':
                            self.log.error('Failed to contact Panda, proxy may have expired')
                        elif t.result['StatusCode'][0] == '30':
                            self.log.error('Job was already killed')

                eventranges = j['eventranges']
                eventrangeslist = json.loads(eventranges)

                # Get object store ID used
                try:
                    objstoreID = self.sites[j['siteName']]['ddmoses']
                except KeyError:
                    self.log.warning('No ES object store defined for %s' % j['siteName'])
                    objstoreID = None

                for eventrange in eventrangeslist:
                    node = {}
                    node['eventRangeID'] = eventrange['eventRangeID']
                    # Fall back to the job-level status if the pilot did not set one
                    node['eventStatus'] = eventrange.get('status', j['actpandastatus'])
                    node['objstoreID'] = objstoreID
                    eventrangestoupdate.append(node)

                self.log.info('%s: updating %i event ranges: %s' % (j['pandaid'], len(eventrangestoupdate), eventrangestoupdate))
                if harvesteraccesspoint:
                    self.log.info('%s: Dumping processed event ranges to %s' %
                                 (j['pandaid'], os.path.join(harvesteraccesspoint, 'worker_updateevents.json')))
                    harvesterdict = {j['pandaid']: eventrangestoupdate}
                    with open(os.path.join(harvesteraccesspoint, 'worker_updateevents.json'), 'w') as f:
                        json.dump(harvesterdict, f)
                else:
                    updatenode = {'eventRanges': json.dumps(eventrangestoupdate)}
                    t = PandaEventsThr(self.getPanda(j['siteName']).updateEventRanges, j['pandaid'], updatenode)
                    tlist.append(t)

        aCTUtils.RunThreadsSplit(tlist, self.nthreads)
        for t in tlist:
            # If update fails events will be rescheduled to another job
            if t.result is None or 'StatusCode' not in t.result:
                # Strange response from panda
                continue
            if t.result['StatusCode'][0] == '60':
                self.log.error('Failed to contact Panda, proxy may have expired')
            elif t.result['StatusCode'][0] == '30':
                self.log.warning('%s: Job was already killed' % t.id)
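For reference, the per-range update nodes built above reduce to a small pure function. A minimal sketch (the helper name build_eventrange_updates is hypothetical, not part of aCT; the field names are taken from the code above):

import json

def build_eventrange_updates(eventranges_json, fallback_status, objstore_id):
    """Mirror the node-building loop in updateEvents: one dict per event range."""
    nodes = []
    for er in json.loads(eventranges_json):
        nodes.append({
            'eventRangeID': er['eventRangeID'],
            # Fall back to the job-level status if the pilot did not record one
            'eventStatus': er.get('status', fallback_status),
            'objstoreID': objstore_id,
        })
    return nodes

# Usage: one range with an explicit status, one without
ranges = '[{"eventRangeID": "1-2-3", "status": "finished"}, {"eventRangeID": "1-2-4"}]'
print(json.dumps(build_eventrange_updates(ranges, 'finished', 123)))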
Example #2
    def updatePandaHeartbeat(self, pstatus):
        """
        Heartbeat status updates.
        """
        nthreads = int(self.conf.get(["panda", "threads"]))
        columns = ['pandaid', 'siteName', 'startTime', 'computingElement', 'node', 'corecount', 'eventranges']
        jobs=self.dbpanda.getJobs("pandastatus='"+pstatus+"' and sendhb=1 and ("+self.dbpanda.timeStampLessThan("theartbeat", self.conf.get(['panda','heartbeattime']))+" or modified > theartbeat) limit 1000", columns)
        if not jobs:
            return

        self.log.info("Update heartbeat for %d jobs in state %s (%s)" % (len(jobs), pstatus, ','.join([str(j['pandaid']) for j in jobs])))

        changed_pstatus = False
        if pstatus == 'sent':
            pstatus = 'starting'
            changed_pstatus = True

        tlist = []
        for j in jobs:
            # Don't send transferring heartbeat for ES jobs, they must be in running while events are updated
            if pstatus == 'transferring' and j['eventranges']:
                pstatus = 'running'
            jd = {}
            if pstatus != 'starting':
                jd['startTime'] = j['startTime']
            if j['computingElement']:
                if j['computingElement'].find('://') != -1: # this if is only needed during transition period
                    jd['computingElement'] = arc.URL(str(j['computingElement'])).Host()
                else:
                    jd['computingElement'] = j['computingElement']
            jd['node'] = j['node']
            jd['siteName'] = j['siteName']
            # For starting truepilot jobs send pilotID with expected log
            # location so logs are available in case of lost heartbeat
            if pstatus == 'starting' and not changed_pstatus and self.sites[j['siteName']]['truepilot']:
                date = time.strftime('%Y-%m-%d', time.gmtime())
                logurl = '/'.join([self.conf.get(["joblog","urlprefix"]), date, j['siteName'], '%s.out' % j['pandaid']])
                jd['pilotID'] = '%s|Unknown|Unknown|Unknown|Unknown' % logurl
            try:
                jd['jobMetrics'] = "coreCount=%s" % (j['corecount'] if j['corecount'] > 0 else self.sites[j['siteName']]['corecount'])
            except Exception:
                # No core count known for either the job or the site
                pass
            t = PandaThr(self.getPanda(j['siteName']).updateStatus, j['pandaid'], pstatus, jd)
            tlist.append(t)
        aCTUtils.RunThreadsSplit(tlist, nthreads)

        for t in tlist:
            if t.result is None or 'StatusCode' not in t.result:
                # Strange response from panda, try later
                continue
            if t.result['StatusCode'] and t.result['StatusCode'][0] == '60':
                self.log.error('Failed to contact Panda, proxy may have expired')
                continue
            #self.log.debug('%s: %s' % (t.id, t.result))
            if 'command' in t.result and t.result['command'][0] != "NULL":
                self.log.info("%s: response: %s" % (t.id, t.result))
            jd = {}
            if changed_pstatus:
                jd['pandastatus'] = pstatus
            # Make sure heartbeat is ahead of modified time so it is not picked up again.
            # (An earlier version set theartbeat 1h ahead for starting truepilot jobs to
            # avoid races with heartbeats; with the 2h heartbeat timeout that offset was removed.)
            jd['theartbeat'] = self.dbpanda.getTimeStamp(time.time() + 1)
            # If panda tells us to kill the job, set actpandastatus to tobekilled
            # and remove from heartbeats
            if 'command' in t.result and ("tobekilled" in t.result['command'][0] or "badattemptnr" in t.result['command'][0]):
                self.log.info('%s: cancelled by panda' % t.id)
                jd['actpandastatus'] = "tobekilled"
                jd['pandastatus'] = None
            self.dbpanda.updateJob(t.id, jd)

        self.log.info("Threads finished")
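The response handling above (retry on a malformed reply, code '60' for an expired proxy, a 'command' field carrying kill requests) recurs in every update method in this file. A hedged sketch of the same checks as a standalone helper (the function and its return values are illustrative, not aCT API; the code meanings are inferred from the log messages above):

def interpret_panda_response(result):
    """Classify a parsed PanDA updateStatus response dict."""
    if result is None or 'StatusCode' not in result:
        return 'retry'          # strange response from panda, try later
    if result['StatusCode'] and result['StatusCode'][0] == '60':
        return 'proxy_expired'  # failed to contact Panda
    command = result.get('command', ['NULL'])[0]
    if 'tobekilled' in command or 'badattemptnr' in command:
        return 'kill'           # panda asked us to cancel the job
    return 'ok'

assert interpret_panda_response(None) == 'retry'
assert interpret_panda_response({'StatusCode': ['60']}) == 'proxy_expired'
assert interpret_panda_response({'StatusCode': ['0'], 'command': ['tobekilled']}) == 'kill'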
Example #3
    def updatePandaFinishedPilot(self):
        """
        Final status update for completed jobs (finished or failed in athena)
        and cancelled jobs
        """
        jobs = self.dbpanda.getJobs("actpandastatus='finished' or actpandastatus='failed' or actpandastatus='cancelled' limit 1000")

        if not jobs:
            return

        self.log.info("Updating panda for %d finished jobs (%s)" % (len(jobs), ','.join([str(j['pandaid']) for j in jobs])))

        self.updateEvents(jobs)
        tlist = []
        for j in jobs:
            # If true pilot skip heartbeat and just update DB
            if not j['sendhb']:
                jd = {}
                jd['pandastatus'] = None
                jd['actpandastatus'] = 'done'
                if j['actpandastatus'] == 'failed':
                    jd['actpandastatus'] = 'donefailed'
                if j['actpandastatus'] == 'cancelled':
                    jd['actpandastatus'] = 'donecancelled'
                if not j['startTime']:
                    jd['startTime'] = datetime.datetime.utcnow()
                if not j['endTime']:
                    jd['endTime'] = datetime.datetime.utcnow()
                self.dbpanda.updateJob(j['pandaid'], jd)
                continue

            # Cancelled jobs have no heartbeat info
            if j['actpandastatus'] == 'cancelled':
                jobinfo = aCTPandaJob(jobinfo={'jobId': j['pandaid'], 'state': 'failed'})
                jobinfo.pilotErrorCode = 1144
                jobinfo.pilotErrorDiag = "This job was killed by panda server"
                jobinfo.startTime = j['startTime'] if j['startTime'] else datetime.datetime.utcnow()
                jobinfo.endTime = j['endTime'] if j['endTime'] else datetime.datetime.utcnow()
            else:
                try:
                    # Load heartbeat information from pilot
                    fname = os.path.join(self.tmpdir, "heartbeats", "%d.json" % j['pandaid'])
                    jobinfo = aCTPandaJob(filename=fname)
                except Exception as x:
                    self.log.error('%s: %s' % (j['pandaid'], x))
                    # Send some basic info back to panda
                    info = {'jobId': j['pandaid'], 'state': j['pandastatus']}
                    jobinfo = aCTPandaJob(jobinfo=info)
                    jobinfo.errorCode = 9000
                    jobinfo.errorDiag = 'Job failed for unknown reason'
                else:
                    os.remove(fname)

            self.log.debug('%s: final heartbeat: %s' % (j['pandaid'], jobinfo.dictionary()))
            t = PandaThr(self.getPanda(j['siteName']).updateStatus, j['pandaid'], j['pandastatus'], jobinfo.dictionary())
            tlist.append(t)

        aCTUtils.RunThreadsSplit(tlist, self.nthreads)

        for t in tlist:
            if t.result is None:
                continue
            if 'StatusCode' in t.result and t.result['StatusCode'] and t.result['StatusCode'][0] != '0':
                self.log.error('Error updating panda')
                continue
            jd = {}
            jd['pandastatus'] = None
            jd['actpandastatus'] = 'done'
            if t.status == 'failed':
                jd['actpandastatus'] = 'donefailed'
            if 'pilotErrorCode' in t.args and t.args['pilotErrorCode'] == 1144:
                jd['actpandastatus'] = 'donecancelled'
            jd['theartbeat'] = self.dbpanda.getTimeStamp()
            self.dbpanda.updateJob(t.id, jd)
            # Send done message to APFMon
            self.apfmon.updateJob(t.id, 'done' if jd['actpandastatus'] == 'done' else 'fault')

        self.log.info("Threads finished")

        # Clean inputfiles, pickle and eventranges
        for j in jobs:
            pandaid = j['pandaid']
            pandainputdir = os.path.join(self.tmpdir, 'inputfiles', str(pandaid))
            picklefile = os.path.join(self.tmpdir, 'pickle', str(pandaid)+".pickle")
            eventrangesfile = os.path.join(self.tmpdir, 'eventranges', str(pandaid)+".json")
            shutil.rmtree(pandainputdir, ignore_errors=True)
            # remove pickle
            if os.path.exists(picklefile):
                os.unlink(picklefile)
            # remove eventrangesfile
            if os.path.exists(eventrangesfile):
                os.unlink(eventrangesfile)
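The cleanup loop at the end of updatePandaFinishedPilot could be factored into one helper per job. A minimal sketch using the same tmpdir layout as the code above (the helper name cleanup_job_files is hypothetical):

import os
import shutil

def cleanup_job_files(tmpdir, pandaid):
    """Remove the input files, pickle and eventranges a finished job leaves behind."""
    shutil.rmtree(os.path.join(tmpdir, 'inputfiles', str(pandaid)), ignore_errors=True)
    for path in (os.path.join(tmpdir, 'pickle', '%s.pickle' % pandaid),
                 os.path.join(tmpdir, 'eventranges', '%s.json' % pandaid)):
        try:
            os.unlink(path)
        except FileNotFoundError:
            # Nothing to clean up for this job
            pass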
Example #4
    def updatePandaHeartbeatBulk(self, pstatus):
        """
        Heartbeat status updates in bulk.
        """
        columns = ['pandaid', 'siteName', 'startTime', 'computingElement', 'node', 'corecount', 'eventranges']
        jobs=self.dbpanda.getJobs("pandastatus='"+pstatus+"' and sendhb=1 and ("+self.dbpanda.timeStampLessThan("theartbeat", self.conf.get(['panda','heartbeattime']))+" or modified > theartbeat) limit 1000", columns)
        if not jobs:
            return

        self.log.info("Update heartbeat for %d jobs in state %s (%s)" % (len(jobs), pstatus, ','.join([str(j['pandaid']) for j in jobs])))

        changed_pstatus = False
        if pstatus == 'sent':
            pstatus = 'starting'
            changed_pstatus = True

        tlist = []
        jobsbyproxy = {}
        for j in jobs:
            # Don't send transferring heartbeat for ES jobs, they must be in running while events are updated
            if pstatus == 'transferring' and j['eventranges']:
                pstatus = 'running'
            jd = {'jobId': j['pandaid'], 'state': pstatus}
            if pstatus != 'starting':
                jd['startTime'] = j['startTime']
            if j['computingElement']:
                if j['computingElement'].find('://') != -1: # this if is only needed during transition period
                    jd['computingElement'] = arc.URL(str(j['computingElement'])).Host()
                else:
                    jd['computingElement'] = j['computingElement']
            jd['node'] = j['node']
            jd['siteName'] = j['siteName']
            # For starting truepilot jobs send pilotID with expected log
            # location so logs are available in case of lost heartbeat
            if pstatus == 'starting' and not changed_pstatus and self.sites[j['siteName']]['truepilot']:
                date = time.strftime('%Y-%m-%d', time.gmtime())
                logurl = '/'.join([self.conf.get(["joblog","urlprefix"]), date, j['siteName'], '%s.out' % j['pandaid']])
                jd['pilotID'] = '%s|Unknown|Unknown|Unknown|Unknown' % logurl
            try:
                corecount = int(j['corecount']) if j['corecount'] > 0 else self.sites[j['siteName']]['corecount']
                jd['jobMetrics'] = "coreCount=%d" % corecount
                jd['coreCount'] = corecount
            except Exception:
                self.log.warning('%s: no corecount available' % j['pandaid'])

            jobsbyproxy.setdefault(self.sites[j['siteName']]['type'], []).append(jd)

        for sitetype, sitejobs in jobsbyproxy.items():
            t = PandaBulkThr(self.pandas.get(sitetype, self.pandas.get('production')).updateStatuses, [j['jobId'] for j in sitejobs], sitejobs)
            tlist.append(t)
        aCTUtils.RunThreadsSplit(tlist, self.nthreads)

        for t in tlist:
            if not t or not t.result or not t.result[0]:
                # Strange response from panda, try later
                continue

            for pandaid, response in zip(t.ids, t.result[1]):
                try:
                    result = cgi.parse_qs(response)
                except Exception:
                    self.log.error('Could not parse result from panda: %s' % response)
                    continue

                if not result.get('StatusCode'):
                    # Strange response from panda, try later
                    continue
                if result['StatusCode'][0] == '60':
                    self.log.error('Failed to contact Panda, proxy may have expired')
                    continue
                if result.get('command', [''])[0] not in ['', "NULL"]:
                    self.log.info("%s: response: %s" % (pandaid, result))
                jd = {}
                if changed_pstatus:
                    jd['pandastatus'] = pstatus
                # Make sure heartbeat is ahead of modified time so it is not picked up again
                jd['theartbeat'] = self.dbpanda.getTimeStamp(time.time()+1)
                # If panda tells us to kill the job, set actpandastatus to tobekilled
                # and remove from heartbeats
                if result.get('command', [''])[0] in ["tobekilled", "badattemptnr", "alreadydone"]:
                    self.log.info('%s: cancelled by panda' % pandaid)
                    jd['actpandastatus'] = "tobekilled"
                    jd['pandastatus'] = None
                self.dbpanda.updateJob(pandaid, jd)

        self.log.info("Threads finished")
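One portability note on the bulk method: cgi.parse_qs is a long-deprecated alias, and the cgi module itself was removed in Python 3.13; the same parser lives in urllib.parse. A small sketch of parsing one per-job response string of the shape handled above (the example string is illustrative):

from urllib.parse import parse_qs

response = 'StatusCode=0&command=NULL'
result = parse_qs(response)
# parse_qs maps each key to a list of values, which matches the
# result['StatusCode'][0] indexing used throughout these methods
assert result['StatusCode'][0] == '0'
assert result.get('command', [''])[0] == 'NULL'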
Example #5
class aCTAutopilot(aCTATLASProcess):
    """
    Main class for Panda interaction. Three major functions: init, run, finish
    """
    def __init__(self):
        aCTATLASProcess.__init__(self)

        # Get DN from configured proxy file
        uc = arc.UserConfig()
        uc.ProxyPath(str(self.arcconf.get(['voms', 'proxypath'])))
        cred = arc.Credential(uc)
        dn = cred.GetIdentityName()
        self.log.info("Running under DN %s" % dn)
        self.agisparser = aCTAGISParser(self.log)
        # Keep a panda object per proxy. The site "type" maps to a specific
        # proxy role
        self.pandas = {}
        # Map the site type to a proxy id in proxies table
        # In future for analysis the id will change once the job is picked up
        self.proxymap = {}

        actp = aCTProxy.aCTProxy(self.log)
        for role in self.arcconf.getList(['voms', 'roles', 'item']):
            attr = '/atlas/Role=' + role
            proxyid = actp.getProxyId(dn, attr)
            if not proxyid:
                raise Exception("Proxy with DN " + dn + " and attribute " +
                                attr + " was not found in proxies table")

            proxyfile = actp.path(dn, attribute=attr)
            # pilot role is mapped to analysis type
            if role == 'pilot':
                role = 'analysis'
            self.pandas[role] = aCTPanda.aCTPanda(self.log, proxyfile)
            self.proxymap[role] = proxyid

        # queue interval
        self.queuestamp = 0

        self.sites = {}

    def getEndTime(self):
        return time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime())

    def setSites(self):
        self.sites = self.agisparser.getSites()

    def getPanda(self, sitename):
        return self.pandas[self.sites[sitename]['type']]

    def updatePandaHeartbeat(self, pstatus):
        """
        Heartbeat status updates.
        """
        nthreads = int(self.conf.get(["panda", "threads"]))
        columns = [
            'pandaid', 'siteName', 'startTime', 'computingElement', 'node',
            'corecount', 'eventranges'
        ]
        jobs = self.dbpanda.getJobs(
            "pandastatus='" + pstatus + "' and sendhb=1 and (" +
            self.dbpanda.timeStampLessThan(
                "theartbeat", self.conf.get(['panda', 'heartbeattime'])) +
            " or modified > theartbeat) limit 1000", columns)
        if not jobs:
            return

        self.log.info(
            "Update heartbeat for %d jobs in state %s (%s)" %
            (len(jobs), pstatus, ','.join([str(j['pandaid']) for j in jobs])))

        changed_pstatus = False
        if pstatus == 'sent':
            pstatus = 'starting'
            changed_pstatus = True

        tlist = []
        for j in jobs:
            # Don't send transferring heartbeat for ES jobs, they must be in running while events are updated
            if pstatus == 'transferring' and j['eventranges']:
                pstatus = 'running'
            jd = {}
            if pstatus != 'starting':
                jd['startTime'] = j['startTime']
            if j['computingElement']:
                if j['computingElement'].find('://') != -1:  # this if is only needed during transition period
                    jd['computingElement'] = arc.URL(str(j['computingElement'])).Host()
                else:
                    jd['computingElement'] = j['computingElement']
            jd['node'] = j['node']
            jd['siteName'] = j['siteName']
            # For starting truepilot jobs send pilotID with expected log
            # location so logs are available in case of lost heartbeat
            if pstatus == 'starting' and not changed_pstatus and j['computingElement'] \
              and j['computingElement'].find('://') != -1 and self.sites[j['siteName']]['truepilot']:
                jobid = j['computingElement']
                date = time.strftime('%Y%m%d')
                cluster = arc.URL(str(jobid)).Host()
                sessionid = jobid[jobid.rfind('/') + 1:]
                logurl = '/'.join([self.conf.get(["joblog", "urlprefix"]), date, cluster, sessionid])
                jd['pilotID'] = '%s|Unknown|Unknown|Unknown|Unknown' % logurl

            try:
                jd['jobMetrics'] = "coreCount=%s" % (j['corecount'] if j['corecount'] > 0 else self.sites[j['siteName']]['corecount'])
            except Exception:
                pass
            t = PandaThr(self.getPanda(j['siteName']).updateStatus, j['pandaid'], pstatus, jd)
            tlist.append(t)
        aCTUtils.RunThreadsSplit(tlist, nthreads)

        for t in tlist:
            if t.result is None or 'StatusCode' not in t.result:
                # Strange response from panda, try later
                continue
            if t.result['StatusCode'] and t.result['StatusCode'][0] == '60':
                self.log.error('Failed to contact Panda, proxy may have expired')
                continue
            #self.log.debug('%s: %s' % (t.id, t.result))
            if 'command' in t.result and t.result['command'][0] != "NULL":
                self.log.info("%s: response: %s" % (t.id, t.result))
            jd = {}
            if changed_pstatus:
                jd['pandastatus'] = pstatus
            # Make sure heartbeat is ahead of modified time so it is not picked up again.
            # (An earlier version set theartbeat 1h ahead for starting truepilot jobs to
            # avoid races with heartbeats; with the 2h heartbeat timeout that offset was removed.)
            jd['theartbeat'] = self.dbpanda.getTimeStamp(time.time() + 1)
            # If panda tells us to kill the job, set actpandastatus to tobekilled
            # and remove from heartbeats
            if 'command' in t.result and ("tobekilled" in t.result['command'][0] or "badattemptnr" in t.result['command'][0]):
                self.log.info('%s: cancelled by panda' % t.id)
                jd['actpandastatus'] = "tobekilled"
                jd['pandastatus'] = None
            self.dbpanda.updateJob(t.id, jd)

        self.log.info("Threads finished")

    def updatePandaFinishedPilot(self):
        """
        Final status update for completed jobs (finished or failed in athena)
        and cancelled jobs
        """
        nthreads = int(self.conf.get(["panda", "threads"]))
        jobs = self.dbpanda.getJobs(
            "actpandastatus='finished' or actpandastatus='failed' or actpandastatus='cancelled' limit 1000"
        )

        if not jobs:
            return

        self.log.info("Updating panda for %d finished jobs (%s)" %
                      (len(jobs), ','.join([str(j['pandaid']) for j in jobs])))

        tlist = []
        # If event service update event ranges. Validator filters for the successful ones
        for j in jobs:
            eventrangestoupdate = []
            if j['actpandastatus'] == 'finished' \
              and j['sendhb'] \
              and 'plugin=arc' in self.sites[j['siteName']]['catchall'] \
              and re.search('eventService=True', j['pandajob']):

                if not j['eventranges'] or j['eventranges'] == '[]':
                    fname = self.arcconf.get(['tmp', 'dir']) + "/pickle/" + str(j['pandaid']) + ".pickle"
                    if not os.path.exists(fname):
                        # Jobs which were never submitted should have substatus pilot_noevents so they go to closed
                        if j['arcjobid'] == -1 or j['arcjobid'] is None:
                            substatus = 'pilot_noevents'
                            self.log.info(
                                '%s: Job did not run and has no eventranges to update, marking pilot_noevents'
                                % j['pandaid'])
                        # Jobs which ran but produced no events have pilot_failed so they go to failed
                        else:
                            substatus = 'pilot_failed'
                            self.log.info(
                                '%s: Job ran but has no eventranges to update, marking failed'
                                % j['pandaid'])
                        jobinfo = aCTPandaJob({
                            'jobId': j['pandaid'],
                            'state': 'failed',
                            'jobSubStatus': substatus
                        })
                        # Create the empty pickle so that heartbeat code below doesn't fail
                        jobinfo.writeToFile(fname)
                    continue

                # If zip is used we need to first send transferring heartbeat
                # with jobMetrics containing the zip file
                if 'es_to_zip' in self.sites[j['siteName']]['catchall']:
                    try:
                        # Load pickled information from pilot
                        fname = self.arcconf.get(['tmp', 'dir']) + "/pickle/" + str(j['pandaid']) + ".pickle"
                        jobinfo = aCTPandaJob(filename=fname)
                        jobmetrics = {'jobMetrics': jobinfo.jobMetrics}
                        self.log.info('%s: Sending jobMetrics and transferring state: %s' % (j['pandaid'], jobmetrics))
                    except Exception as x:
                        self.log.error('%s: No pickle info found: %s' % (j['pandaid'], x))
                    else:
                        t = PandaThr(
                            self.getPanda(j['siteName']).updateStatus,
                            j['pandaid'], 'transferring', jobmetrics)
                        aCTUtils.RunThreadsSplit([t], nthreads)
                        self.log.debug(t.result)
                        # If update fails panda won't see the zip and events
                        # will be rescheduled to another job
                        if t.result is None or 'StatusCode' not in t.result:
                            # Strange response from panda
                            continue
                        if t.result['StatusCode'][0] == '60':
                            self.log.error(
                                'Failed to contact Panda, proxy may have expired'
                            )
                        elif t.result['StatusCode'][0] == '30':
                            self.log.error('Job was already killed')

                eventranges = j['eventranges']
                eventrangeslist = json.loads(eventranges)

                # Get object store ID used
                try:
                    objstoreID = self.sites[j['siteName']]['ddmoses']
                except KeyError:
                    self.log.warning('No ES object store defined for %s' %
                                     j['siteName'])
                    objstoreID = None

                for eventrange in eventrangeslist:
                    node = {}
                    node['eventRangeID'] = eventrange['eventRangeID']
                    # Fall back to the job-level status if the pilot did not set one
                    node['eventStatus'] = eventrange.get('status', j['actpandastatus'])
                    node['objstoreID'] = objstoreID
                    eventrangestoupdate.append(node)

                self.log.info('%s: updating %i event ranges: %s' %
                              (j['pandaid'], len(eventrangestoupdate),
                               eventrangestoupdate))
                node = {'eventRanges': json.dumps(eventrangestoupdate)}
                t = PandaEventsThr(
                    self.getPanda(j['siteName']).updateEventRanges,
                    j['pandaid'], node)
                tlist.append(t)

        aCTUtils.RunThreadsSplit(tlist, nthreads)
        for t in tlist:
            self.log.debug('%s: %s' % (t.id, t.result))
            # If update fails events will be rescheduled to another job
            if t.result is None or 'StatusCode' not in t.result:
                # Strange response from panda
                continue
            if t.result['StatusCode'][0] == '60':
                self.log.error(
                    'Failed to contact Panda, proxy may have expired')
            elif t.result['StatusCode'][0] == '30':
                self.log.warning('%s: Job was already killed' % t.id)

        tlist = []
        for j in jobs:
            # If true pilot skip heartbeat and just update DB
            if not j['sendhb']:
                jd = {}
                jd['pandastatus'] = None
                jd['actpandastatus'] = 'done'
                if j['actpandastatus'] == 'failed':
                    jd['actpandastatus'] = 'donefailed'
                if j['actpandastatus'] == 'cancelled':
                    jd['actpandastatus'] = 'donecancelled'
                if not j['startTime']:
                    jd['startTime'] = datetime.datetime.utcnow()
                if not j['endTime']:
                    jd['endTime'] = datetime.datetime.utcnow()
                self.dbpanda.updateJob(j['pandaid'], jd)
                continue

            # Cancelled jobs have no pickle info
            if j['actpandastatus'] == 'cancelled':
                jobinfo = aCTPandaJob(jobinfo={
                    'jobId': j['pandaid'],
                    'state': 'failed'
                })
                jobinfo.pilotErrorCode = 1144
                jobinfo.pilotErrorDiag = "This job was killed by panda server"
                jobinfo.startTime = j['startTime'] if j['startTime'] else datetime.datetime.utcnow()
                jobinfo.endTime = j['endTime'] if j['endTime'] else datetime.datetime.utcnow()
            else:
                try:
                    # Load pickled information from pilot
                    fname = self.arcconf.get(['tmp', 'dir']) + "/pickle/" + str(j['pandaid']) + ".pickle"
                    jobinfo = aCTPandaJob(filename=fname)
                except Exception as x:
                    self.log.error('%s: %s' % (j['pandaid'], x))
                    # Send some basic info back to panda
                    info = {'jobId': j['pandaid'], 'state': j['pandastatus']}
                    jobinfo = aCTPandaJob(jobinfo=info)
                    jobinfo.pilotErrorCode = 1008
                    jobinfo.pilotErrorDiag = 'Job failed for unknown reason'
                else:
                    os.remove(fname)
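The __init__ above builds one aCTPanda client per VOMS role and folds the 'pilot' role into the 'analysis' site type, which is what getPanda() later looks up. A minimal sketch of just that mapping logic (build_panda_map and make_client are hypothetical stand-ins for the proxy and client setup):

def build_panda_map(roles, make_client):
    """Map VOMS roles to PanDA clients, aliasing 'pilot' to 'analysis'."""
    pandas = {}
    for role in roles:
        key = 'analysis' if role == 'pilot' else role
        pandas[key] = make_client(role)
    return pandas

pandas = build_panda_map(['production', 'pilot'],
                         make_client=lambda role: 'client-for-%s' % role)
assert set(pandas) == {'production', 'analysis'}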
Example #6
    def updatePandaFinishedPilot(self):
        """
        Final status update for completed jobs (finished or failed in athena)
        and cancelled jobs
        """
        nthreads = int(self.conf.get(["panda", "threads"]))
        jobs = self.dbpanda.getJobs(
            "actpandastatus='finished' or actpandastatus='failed' or actpandastatus='cancelled' limit 1000"
        )

        if not jobs:
            return

        self.log.info("Updating panda for %d finished jobs (%s)" %
                      (len(jobs), ','.join([str(j['pandaid']) for j in jobs])))

        tlist = []
        # If event service update event ranges. Validator filters for the successful ones
        for j in jobs:
            eventrangestoupdate = []
            if j['actpandastatus'] == 'finished' \
              and j['sendhb'] \
              and 'plugin=arc' in self.sites[j['siteName']]['catchall'] \
              and re.search('eventService=True', j['pandajob']):

                if not j['eventranges'] or j['eventranges'] == '[]':
                    fname = self.arcconf.get(['tmp', 'dir']) + "/pickle/" + str(j['pandaid']) + ".pickle"
                    if not os.path.exists(fname):
                        # Jobs which were never submitted should have substatus pilot_noevents so they go to closed
                        if j['arcjobid'] == -1 or j['arcjobid'] is None:
                            substatus = 'pilot_noevents'
                            self.log.info(
                                '%s: Job did not run and has no eventranges to update, marking pilot_noevents'
                                % j['pandaid'])
                        # Jobs which ran but produced no events have pilot_failed so they go to failed
                        else:
                            substatus = 'pilot_failed'
                            self.log.info(
                                '%s: Job ran but has no eventranges to update, marking failed'
                                % j['pandaid'])
                        jobinfo = aCTPandaJob({
                            'jobId': j['pandaid'],
                            'state': 'failed',
                            'jobSubStatus': substatus
                        })
                        # Create the empty pickle so that heartbeat code below doesn't fail
                        jobinfo.writeToFile(fname)
                    continue

                # If zip is used we need to first send transferring heartbeat
                # with jobMetrics containing the zip file
                if 'es_to_zip' in self.sites[j['siteName']]['catchall']:
                    try:
                        # Load pickled information from pilot
                        fname = self.arcconf.get(['tmp', 'dir']) + "/pickle/" + str(j['pandaid']) + ".pickle"
                        jobinfo = aCTPandaJob(filename=fname)
                        jobmetrics = {'jobMetrics': jobinfo.jobMetrics}
                        self.log.info('%s: Sending jobMetrics and transferring state: %s' % (j['pandaid'], jobmetrics))
                    except Exception as x:
                        self.log.error('%s: No pickle info found: %s' % (j['pandaid'], x))
                    else:
                        t = PandaThr(
                            self.getPanda(j['siteName']).updateStatus,
                            j['pandaid'], 'transferring', jobmetrics)
                        aCTUtils.RunThreadsSplit([t], nthreads)
                        self.log.debug(t.result)
                        # If update fails panda won't see the zip and events
                        # will be rescheduled to another job
                        if t.result is None or 'StatusCode' not in t.result:
                            # Strange response from panda
                            continue
                        if t.result['StatusCode'][0] == '60':
                            self.log.error(
                                'Failed to contact Panda, proxy may have expired'
                            )
                        elif t.result['StatusCode'][0] == '30':
                            self.log.error('Job was already killed')

                eventranges = j['eventranges']
                eventrangeslist = json.loads(eventranges)

                # Get object store ID used
                try:
                    objstoreID = self.sites[j['siteName']]['ddmoses']
                except KeyError:
                    self.log.warning('No ES object store defined for %s' %
                                     j['siteName'])
                    objstoreID = None

                for eventrange in eventrangeslist:
                    node = {}
                    node['eventRangeID'] = eventrange['eventRangeID']
                    # Fall back to the job-level status if the pilot did not set one
                    node['eventStatus'] = eventrange.get('status', j['actpandastatus'])
                    node['objstoreID'] = objstoreID
                    eventrangestoupdate.append(node)

                self.log.info('%s: updating %i event ranges: %s' %
                              (j['pandaid'], len(eventrangestoupdate),
                               eventrangestoupdate))
                node = {'eventRanges': json.dumps(eventrangestoupdate)}
                t = PandaEventsThr(
                    self.getPanda(j['siteName']).updateEventRanges,
                    j['pandaid'], node)
                tlist.append(t)
Example #7
                except Exception as x:
                    self.log.error('%s: %s' % (j['pandaid'], x))
                    # Send some basic info back to panda
                    info = {'jobId': j['pandaid'], 'state': j['pandastatus']}
                    jobinfo = aCTPandaJob(jobinfo=info)
                    jobinfo.pilotErrorCode = 1008
                    jobinfo.pilotErrorDiag = 'Job failed for unknown reason'
                else:
                    os.remove(fname)

            t = PandaThr(
                self.getPanda(j['siteName']).updateStatus, j['pandaid'],
                j['pandastatus'], jobinfo.dictionary())
            tlist.append(t)

        aCTUtils.RunThreadsSplit(tlist, nthreads)

        for t in tlist:
            self.log.debug('%s: %s' % (t.id, t.result))
            if t.result is None:
                continue
            if 'StatusCode' in t.result and t.result['StatusCode'] and t.result['StatusCode'][0] != '0':
                self.log.error('Error updating panda')
                continue
            jd = {}
            jd['pandastatus'] = None
            jd['actpandastatus'] = 'done'
            if t.status == 'failed':
                jd['actpandastatus'] = 'donefailed'
            if 'pilotErrorCode' in t.args and t.args['pilotErrorCode'] == 1144: