Example #1
    def run(self):
        '''Do brokering and submit'''

        arclog = arc_utils.ARCLogger(baselogger, 0)
        tmplog = arclog.log
        # Do brokering among the available queues
        jobdesc = self.jobdescs[0]
        broker = arc.Broker(self.userconfig, jobdesc, "Random")
        targetsorter = arc.ExecutionTargetSorter(broker)
        for target in self.queuelist:
            tmplog.debug("considering target {0}:{1}".format(
                target.ComputingService.Name, target.ComputingShare.Name))

            # Adding an entity performs matchmaking and brokering
            targetsorter.addEntity(target)

        if len(targetsorter.getMatchingTargets()) == 0:
            tmplog.error("no clusters satisfied job description requirements")
            return

        targetsorter.reset()  # required to reset iterator, otherwise we get a seg fault
        selectedtarget = targetsorter.getCurrentTarget()
        # Job object will contain the submitted job
        job = arc.Job()
        submitter = arc.Submitter(self.userconfig)
        if submitter.Submit(selectedtarget, jobdesc,
                            job) != arc.SubmissionStatus.NONE:
            tmplog.error("Submission failed")
            return

        self.job = job
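
The run() method assumes self.jobdescs already contains a parsed job description. A minimal sketch of producing that list from an xRSL string with the ARC Python bindings (xrsl here is a placeholder for a description string such as the one built in Example #8):

    # Sketch: parse an xRSL string into the list consumed by run() above.
    # `xrsl` is a placeholder job description string.
    jobdescs = arc.JobDescriptionList()
    if not arc.JobDescription_Parse(xrsl, jobdescs):
        raise Exception("Could not parse job description")
    self.jobdescs = jobdescs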
Example #2
    def acknowledge_events_files(self, workspec):
        '''Tell workers that harvester received events/files'''

        # get logger
        arclog = arc_utils.ARCLogger(baselogger, workspec.workerID)
        tmpLog = arclog.log

        job = workspec.workAttributes['arcjob']
        arcid = job['JobID']
        # Set certificate to use for interacting with ARC CE
        usercfg = arc.UserConfig(self.cred_type)
        if not self._setup_proxy(usercfg, workspec, arcid, tmpLog):
            return False

        # Delete jobid/jsonEventsUpdateFileName.read
        for pandaID in workspec.pandaid_list:
            accessPoint = self.get_access_point(workspec, pandaID)
            remoteJsonFilePath = '%s/%s%s' % (arcid, jsonEventsUpdateFileName,
                                              suffixReadJson)
            status = self._delete_file(remoteJsonFilePath, usercfg, tmpLog)
            if not status and status.GetErrno() != errno.ENOENT:
                tmpLog.error('Failed deleting {0}: {1}'.format(
                    remoteJsonFilePath, str(status)))

        tmpLog.debug('done')
        return True
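
The _delete_file helper used above is not among these examples. A plausible sketch with the ARC data API, matching the arc.DataStatus checks the callers perform (arc.datapoint_from_url is the convenience factory from the ARC Python bindings; treat the exact call as an assumption):

    def _delete_file(self, filename, userconfig, log):
        '''Sketch: remove a remote file and return the arc.DataStatus,
        which callers test with "if not status" and status.GetErrno()'''
        log.debug('Deleting {0}'.format(filename))
        datapoint = arc.datapoint_from_url(str(filename), userconfig)
        return datapoint.Remove()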
Example #3
    def feed_events(self, workspec, events_dict):
        '''Harvester has an event range to pass to the job'''

        # get logger
        arclog = arc_utils.ARCLogger(baselogger, workspec.workerID)
        tmpLog = arclog.log

        # Upload to jobid/jsonEventsFeedFileName, delete jobid/jsonEventsRequestFileName
        job = workspec.workAttributes['arcjob']
        arcid = job['JobID']
        # Set certificate to use for interacting with ARC CE
        usercfg = arc.UserConfig(self.cred_type)
        if not self._setup_proxy(usercfg, workspec, arcid, tmpLog):
            return False

        retVal = True
        if workspec.mapType in [
                WorkSpec.MT_OneToOne, WorkSpec.MT_MultiWorkers
        ]:
            # put the json just under the access point then upload to ARC CE
            localJsonFilePath = os.path.join(workspec.get_access_point(),
                                             jsonEventsFeedFileName)
            tmpLog.debug('feeding events to {0}'.format(localJsonFilePath))
            try:
                with open(localJsonFilePath, 'w') as jsonFile:
                    json.dump(events_dict, jsonFile)
            except Exception:
                core_utils.dump_error_message(tmpLog)
                retVal = False

            remoteJsonFilePath = '%s/%s' % (arcid, jsonEventsFeedFileName)
            # Try to copy the file
            status = self._copy_file(localJsonFilePath, remoteJsonFilePath,
                                     usercfg, tmpLog)
            if not status:
                tmpLog.error('Failed to feed events to {0}: {1}'.format(
                    remoteJsonFilePath, str(status)))
                retVal = False
            else:
                remoteJsonEventsRequestFile = '%s/%s' % (
                    arcid, jsonEventsRequestFileName)
                status = self._delete_file(remoteJsonEventsRequestFile,
                                           usercfg, tmpLog)
                if not status and status.GetErrno() != errno.ENOENT:
                    tmpLog.error(
                        'Failed to delete event request file at {0}'.format(
                            remoteJsonEventsRequestFile))

        elif workspec.mapType == WorkSpec.MT_MultiJobs:
            # TOBEFIXED
            pass
        # remove the local feed file
        try:
            jsonFilePath = os.path.join(workspec.get_access_point(),
                                        jsonEventsFeedFileName)
            os.remove(jsonFilePath)
        except Exception:
            pass
        tmpLog.debug('done')
        return retVal
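
_copy_file is likewise referenced without being shown. A rough sketch using arc.DataMover, assuming both endpoints are URLs the ARC data library can handle:

    def _copy_file(self, source, destination, userconfig, log):
        '''Sketch: copy between two URLs and return the arc.DataStatus'''
        log.debug('Copying {0} to {1}'.format(source, destination))
        source_dp = arc.datapoint_from_url(str(source), userconfig)
        destination_dp = arc.datapoint_from_url(str(destination), userconfig)
        # DataMover.Transfer requires a cache and URL map even when unused
        return arc.DataMover().Transfer(source_dp, destination_dp,
                                        arc.FileCache(), arc.URLMap())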
Example #4
    def post_processing(self, workspec, jobspec_list, map_type):
        '''
        Fetch job output and process pilot info for sending in final heartbeat.
        The pilot pickle is loaded and some attributes corrected (schedulerid,
        pilotlog etc), then converted to dictionary and stored in
        workspec.workAttributes[pandaid]. If pilot pickle cannot be used,
        report ARC error in pilotErrorDiag and fill all possible attributes
        using ARC information.
        '''

        arclog = arc_utils.ARCLogger(baselogger, workspec.workerID)
        tmplog = arclog.log
        tmplog.info('Post processing ARC job {0}'.format(workspec.batchID))
        job = workspec.workAttributes['arcjob']
        proxyrole = workspec.workAttributes['proxyrole']
        arcid = job['JobID']
        tmplog.info('Job id {0}'.format(arcid))

        if 'arcdownloadfiles' not in workspec.workAttributes:
            tmplog.error('No files to download')
            return

        # Assume one-to-one mapping of workers to jobs. If jobspec_list is empty
        # it means the job was cancelled by panda or otherwise forgotten
        if not jobspec_list:
            return

        # Set certificate
        userconfig = arc.UserConfig(self.cred_type)
        try:
            userconfig.ProxyPath(str(self.certs[proxyrole]))
        except KeyError:
            tmplog.error("Job {0}: no proxy found with role {1}".format(arcid, proxyrole))
            return

        queueconfigmapper = QueueConfigMapper()
        queueconfig = queueconfigmapper.get_queue(jobspec_list[0].computingSite)
        logbaseurl = queueconfig.submitter.get('logBaseURL')
        logbasedir = queueconfig.submitter.get('logDir', self.tmpdir)
        logsubdir = workspec.workAttributes['logsubdir']
        pandaid = str(jobspec_list[0].PandaID)

        # Construct log path and url
        logurl = '/'.join([logbaseurl, logsubdir, str(pandaid)]) if logbaseurl else None
        logdir = os.path.join(logbasedir, logsubdir)

        # post_processing is only called once, so no retries are done. But keep
        # the possibility here in case it changes
        (fetched, notfetched, notfetchedretry) = self._download_outputs(workspec.workAttributes['arcdownloadfiles'],
                                                                        logdir, arcid, pandaid, userconfig, tmplog)
        if arcid not in fetched:
            tmplog.warning("Could not get outputs of {0}".format(arcid))

        workspec.workAttributes[long(pandaid)] = self._extractAndFixPilotPickle(job, pandaid, (arcid in fetched), logurl, tmplog)

        tmplog.debug("pilot info for {0}: {1}".format(pandaid, workspec.workAttributes[long(pandaid)]))
Example #5
    def kill_worker(self, workspec):
        """Cancel the ARC job.

        :param workspec: worker specification
        :type workspec: WorkSpec
        :return: A tuple of return code (True for success, False otherwise) and error dialog
        :rtype: (bool, string)
        """

        # make logger
        arclog = arc_utils.ARCLogger(baselogger, workspec.workerID)
        tmplog = arclog.log

        (job, modtime, proxyrole) = arc_utils.workspec2arcjob(workspec)
        if not job.JobID:
            # Job not submitted
            tmplog.info("Job was not submitted so cannot be cancelled")
            return True, ''

        # Set certificate
        userconfig = arc.UserConfig(self.cred_type)
        try:
            userconfig.ProxyPath(str(self.certs[proxyrole]))
        except KeyError:
            # Log a warning and return True so that job can be cleaned
            tmplog.warning("Job {0}: no proxy found with role {1}".format(
                job.JobID, proxyrole))
            return True, ''

        job_supervisor = arc.JobSupervisor(userconfig, [job])
        job_supervisor.Update()
        job_supervisor.Cancel()

        notcancelled = job_supervisor.GetIDsNotProcessed()

        if job.JobID in notcancelled:
            if job.State == arc.JobState.UNDEFINED:
                # If longer than one hour since submission assume job never made it
                if job.SubmissionTime + arc.Period(3600) < arc.Time():
                    tmplog.warning(
                        "Assuming job is lost and marking as cancelled")
                    return True, ''

                # Job has not yet reached info system
                tmplog.warning(
                    "Job is not yet in info system so cannot be cancelled")
                return False, "Job is not yet in info system so could not be cancelled"

            # Log a warning and return True so that job can be cleaned
            tmplog.warning("Job could not be cancelled")
            return True, ''

        tmplog.info("Job cancelled successfully")
        return True, ''
Example #6
    def sweep_worker(self, workspec):
        """Clean the ARC job

        :param workspec: worker specification
        :type workspec: WorkSpec
        :return: A tuple of return code (True for success, False otherwise) and error dialog
        :rtype: (bool, string)
        """

        # make logger
        arclog = arc_utils.ARCLogger(baselogger, workspec.workerID)
        tmplog = arclog.log

        (job, modtime, proxyrole) = arc_utils.workspec2arcjob(workspec)
        if not job.JobID:
            # Job not submitted
            tmplog.info("Job was not submitted so cannot be cleaned")
            return True, ''

        # Set certificate
        userconfig = arc.UserConfig(self.cred_type)
        try:
            userconfig.ProxyPath(str(self.certs[proxyrole]))
        except KeyError:
            # Log a warning and return True so that job can be cleaned
            tmplog.warning("Job {0}: no proxy found with role {1}".format(
                job.JobID, proxyrole))
            return True, ''

        job_supervisor = arc.JobSupervisor(userconfig, [job])
        job_supervisor.Update()
        job_supervisor.Clean()

        notcleaned = job_supervisor.GetIDsNotProcessed()

        if job.JobID in notcleaned:
            # Log a warning and return True so that job can be finished
            tmplog.warning("Job could not be cleaned")
            return True, ''

        tmplog.info("Job cleaned successfully")
        return True, ''
Example #7
    def events_requested(self, workspec):
        '''Used to tell harvester that the worker requests events'''

        # get logger
        arclog = arc_utils.ARCLogger(baselogger, workspec.workerID)
        tmpLog = arclog.log

        # Check for jobid/jsonEventsRequestFileName
        job = workspec.workAttributes['arcjob']
        arcid = job['JobID']
        # Set certificate to use for interacting with ARC CE
        usercfg = arc.UserConfig(self.cred_type)
        if not self._setup_proxy(usercfg, workspec, arcid, tmpLog):
            return {}

        remoteJsonFilePath = '%s/%s' % (arcid, jsonEventsRequestFileName)
        localJsonFilePath = os.path.join(workspec.get_access_point(),
                                         jsonEventsRequestFileName)
        tmpLog.debug(
            'looking for event request file {0}'.format(remoteJsonFilePath))
        # Try to copy the file
        status = self._copy_file(remoteJsonFilePath, localJsonFilePath,
                                 usercfg, tmpLog)
        if not status:
            if status.GetErrno() == errno.ENOENT:
                # Not found
                tmpLog.debug('not found')
                return {}
            # Some other error
            tmpLog.warning('Failed to copy {0}: {1}'.format(
                remoteJsonFilePath, str(status)))
            return {}

        try:
            with open(localJsonFilePath) as jsonFile:
                retDict = json.load(jsonFile)
            os.remove(localJsonFilePath)
        except Exception:
            tmpLog.debug('failed to load json')
            return {}
        tmpLog.debug('found')
        return retDict
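
_setup_proxy appears in every messenger method here but is not part of these examples; following the proxy handling in Examples #4 to #6, it presumably looks something like this sketch:

    def _setup_proxy(self, usercfg, workspec, jobid, log):
        '''Sketch: point the UserConfig at the proxy for this worker's role'''
        proxyrole = workspec.workAttributes['proxyrole']
        try:
            usercfg.ProxyPath(str(self.certs[proxyrole]))
        except KeyError:
            log.error("Job {0}: no proxy found with role {1}".format(jobid, proxyrole))
            return False
        return True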
Example #8
    def submit_workers(self, workspec_list):
        retlist = []

        # Get queue info from DB
        pandaqueues = self.dbproxy.get_cache("panda_queues.json", None)
        if pandaqueues is None:
            raise Exception("Failed to get panda queue info from database")
        pandaqueues = pandaqueues.data

        osmap = self.dbproxy.get_cache("ddmendpoints_objectstores.json", None)
        if osmap is None:
            raise Exception("Failed to get Object Store info from database")
        osmap = osmap.data

        for workspec in workspec_list:

            arclog = arc_utils.ARCLogger(baselogger, workspec.workerID)
            tmplog = arclog.log

            # Assume for aCT that jobs are always pre-fetched (no late-binding)
            for jobspec in workspec.get_jobspec_list():

                tmplog.debug("JobSpec: {0}".format(jobspec.values_map()))

                if jobspec.computingSite not in pandaqueues:
                    retlist.append(
                        (False, "No queue information for {0}".format(
                            jobspec.computingSite)))
                    continue

                # Get CEs from panda queue info
                # List of (endpoint, queue) tuples
                arcces = []
                for endpoint in pandaqueues[jobspec.computingSite]['queues']:
                    ce_endpoint = endpoint['ce_endpoint']
                    if not re.search('://', ce_endpoint):
                        ce_endpoint = 'gsiftp://%s' % ce_endpoint
                    ce_queue = endpoint['ce_queue_name']
                    arcces.append((ce_endpoint, ce_queue))

                if not arcces:
                    retlist.append((False, "No CEs defined for {0}".format(
                        jobspec.computingSite)))
                    continue

                # Set true pilot or not
                queueconfigmapper = QueueConfigMapper()
                queueconfig = queueconfigmapper.get_queue(
                    jobspec.computingSite)
                pandaqueues[jobspec.computingSite][
                    'truepilot'] = 'running' in queueconfig.noHeartbeat

                # Set log URL for GTAG env in job description
                logbaseurl = queueconfig.submitter.get('logBaseURL')
                logsubdir = self._set_logdir(jobspec.computingSite)
                logfileurl = '/'.join(
                    [logbaseurl, logsubdir,
                     '%d.out' % jobspec.PandaID]) if logbaseurl else None

                tmplog.debug("Converting to ARC XRSL format")
                arcxrsl = ARCParser(
                    jobspec.jobParams,
                    jobspec.computingSite,
                    pandaqueues[jobspec.computingSite],
                    logfileurl,
                    self.schedulerid,
                    osmap,
                    '/tmp',  # tmpdir, TODO common tmp dir
                    None,  #jobSpec.eventranges, # TODO event ranges
                    tmplog)
                arcxrsl.parse()
                xrsl = arcxrsl.getXrsl()
                tmplog.debug("ARC xrsl: {0}".format(xrsl))

                # Set the files to be downloaded at the end of the job
                downloadfiles = 'gmlog/errors'
                if 'logFile' in jobspec.jobParams:
                    downloadfiles += ';%s' % jobspec.jobParams[
                        'logFile'].replace('.tgz', '')
                if not pandaqueues[jobspec.computingSite]['truepilot']:
                    downloadfiles += ';jobSmallFiles.tgz'

                # Set certificate
                userconfig = arc.UserConfig(self.cred_type)
                proxyrole = ''
                if jobspec.jobParams['prodSourceLabel'] == 'user':
                    userconfig.ProxyPath(str(self.certs['pilot']))
                    proxyrole = 'pilot'
                else:
                    userconfig.ProxyPath(str(self.certs['production']))
                    proxyrole = 'production'
                tmplog.debug("Submitting using {0} proxy at {1}".format(
                    proxyrole, userconfig.ProxyPath()))

                try:
                    tmplog.debug("Submission targets: {0}".format(arcces))
                    arcjob = self._arc_submit(xrsl, arcces, userconfig, tmplog)
                    tmplog.info("ARC CE job id {0}".format(arcjob.JobID))
                    arc_utils.arcjob2workspec(arcjob, workspec)
                    workspec.workAttributes['arcdownloadfiles'] = downloadfiles
                    workspec.workAttributes['proxyrole'] = proxyrole
                    workspec.workAttributes['logsubdir'] = logsubdir
                    workspec.batchID = arcjob.JobID
                    tmplog.debug(workspec.workAttributes)
                    result = (True, '')
                except Exception as exc:
                    tmplog.error(traceback.format_exc())
                    result = (False,
                              "Failed to submit ARC job: {0}".format(str(exc)))

                retlist.append(result)

        return retlist
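
_set_logdir is another helper this example assumes; a hypothetical implementation that buckets logs by date, consistent with the logsubdir path segment joined into the log URLs above (the date-based layout is an assumption):

    def _set_logdir(self, site):
        '''Hypothetical sketch: date-based log subdirectory per site'''
        # assumes `import os` and `import time` at module level
        return os.path.join(time.strftime('%Y-%m-%d'), site)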
Example #9
    def events_to_update(self, workspec):
        '''Report events processed for harvester to update'''

        # get logger
        arclog = arc_utils.ARCLogger(baselogger, workspec.workerID)
        tmpLog = arclog.log

        job = workspec.workAttributes['arcjob']
        arcid = job['JobID']
        # Set certificate to use for interacting with ARC CE
        usercfg = arc.UserConfig(self.cred_type)
        if not self._setup_proxy(usercfg, workspec, arcid, tmpLog):
            return False

        # Check for jobid/jsonEventsUpdateFileName on CE, rename to .read
        retDict = dict()
        for pandaID in workspec.pandaid_list:

            # first look for json.read which is not yet acknowledged
            accessPoint = self.get_access_point(workspec, pandaID)
            localJsonFilePath = os.path.join(accessPoint,
                                             jsonEventsUpdateFileName)
            remoteJsonFilePathRead = '%s/%s%s' % (
                arcid, jsonEventsUpdateFileName, suffixReadJson)
            tmpLog.debug('looking for event update file {0}'.format(
                remoteJsonFilePathRead))

            status = self._copy_file(remoteJsonFilePathRead, localJsonFilePath,
                                     usercfg, tmpLog)
            if not status:
                if status.GetErrno() != errno.ENOENT:
                    tmpLog.warning('Failed checking {0}: {1}'.format(
                        remoteJsonFilePathRead, str(status)))
                    continue

                # Look for new json
                remoteJsonFilePath = '%s/%s' % (arcid,
                                                jsonEventsUpdateFileName)
                status = self._copy_file(remoteJsonFilePath, localJsonFilePath,
                                         usercfg, tmpLog)
                if not status:
                    if status.GetErrno() != errno.ENOENT:
                        tmpLog.warning('Failed checking {0}: {1}'.format(
                            remoteJsonFilePath, str(status)))
                    else:
                        # not found
                        tmpLog.debug('not found')
                    continue

                # Rename to prevent from being overwritten
                # Gridftp does not support renaming so upload .read file and delete old one
                status = self._copy_file(localJsonFilePath,
                                         remoteJsonFilePathRead, usercfg,
                                         tmpLog)
                if not status:
                    tmpLog.warning('Failed copying {0} to {1}: {2}'.format(
                        localJsonFilePath, remoteJsonFilePathRead,
                        str(status)))
                # If rename fails, delete old file anyway
                status = self._delete_file(remoteJsonFilePath, usercfg, tmpLog)
                if not status:
                    tmpLog.warning('Failed deleting {0}: {1}'.format(
                        remoteJsonFilePath, str(status)))

            # load json
            nData = 0
            try:
                with open(localJsonFilePath) as jsonFile:
                    tmpOrigDict = json.load(jsonFile)
                    # change the key from str to int
                    for tmpPandaID, tmpDict in tmpOrigDict.iteritems():
                        tmpPandaID = long(tmpPandaID)
                        retDict[tmpPandaID] = tmpDict
                        nData += 1
            except Exception:
                tmpLog.error('failed to load json')
            # delete local file
            try:
                os.remove(localJsonFilePath)
            except Exception:
                pass
            tmpLog.debug('got {0} events for PandaID={1}'.format(
                nData, pandaID))
        return retDict
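
The copy-then-upload-.read-then-delete sequence above works around gridftp's lack of a rename operation. If it were needed in more than one place, it could be factored into a helper built on the same primitives, for example:

    def _rename_remote_file(self, oldpath, newpath, usercfg, log):
        '''Sketch: emulate rename over gridftp with copy + delete (not atomic)'''
        status = self._copy_file(oldpath, newpath, usercfg, log)
        if not status:
            log.warning('Failed copying {0} to {1}: {2}'.format(
                oldpath, newpath, str(status)))
        # delete the original even if the copy failed, as events_to_update does
        return self._delete_file(oldpath, usercfg, log)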
Example #10
    def check_workers(self, workspec_list):
        retList = []
        for workspec in workspec_list:

            # make logger
            arclog = arc_utils.ARCLogger(baselogger, workspec.workerID)
            tmplog = arclog.log
            tmplog.info("checking worker id {0}".format(workspec.workerID))
            (job, modtime, proxyrole) = arc_utils.workspec2arcjob(workspec)

            # Set certificate
            userconfig = arc.UserConfig(self.cred_type)
            try:
                userconfig.ProxyPath(str(self.certs[proxyrole]))
            except KeyError:
                tmplog.error("Job {0}: no proxy found with role {1}".format(
                    job.JobID, proxyrole))
                retList.append((workspec.status, ''))
                continue

            job_supervisor = arc.JobSupervisor(userconfig, [job])
            job_supervisor.Update()

            jobsupdated = job_supervisor.GetAllJobs()
            jobsnotupdated = job_supervisor.GetIDsNotProcessed()

            for updatedjob in jobsupdated:
                if updatedjob.JobID in jobsnotupdated:
                    tmplog.error("Failed to find information on {0}".format(
                        updatedjob.JobID))
                    # If missing for too long (2 days), mark as lost
                    if arc.Time() - modtime > arc.Period(172800):
                        tmplog.error(
                            "Job {0} missing for more than 2 days, marking as lost"
                            .format(updatedjob.JobID))
                        retList.append((workspec.ST_failed, ''))
                    else:
                        retList.append((workspec.status, ''))
                    continue

                # Convert arc state to WorkSpec state
                arcstatus = updatedjob.State
                newstatus = WorkSpec.ST_submitted
                if arcstatus == arc.JobState.RUNNING or \
                   arcstatus == arc.JobState.FINISHING:
                    newstatus = WorkSpec.ST_running
                elif arcstatus == arc.JobState.FINISHED:
                    if updatedjob.ExitCode == -1:
                        # Missing exit code, but assume success
                        tmplog.warning(
                            "Job {0} FINISHED but has missing exit code, setting to zero"
                            .format(updatedjob.JobID))
                        updatedjob.ExitCode = 0
                    newstatus = WorkSpec.ST_finished
                elif arcstatus == arc.JobState.FAILED:
                    newstatus = WorkSpec.ST_failed
                    tmplog.info("Job {0} failed: {1}".format(
                        updatedjob.JobID,
                        ";".join([joberr for joberr in updatedjob.Error])))
                elif arcstatus == arc.JobState.KILLED:
                    newstatus = WorkSpec.ST_cancelled
                elif arcstatus == arc.JobState.DELETED or \
                     arcstatus == arc.JobState.OTHER:
                    # unexpected
                    newstatus = WorkSpec.ST_failed
                # Not covered: arc.JobState.HOLD. Maybe need a post-run state in
                # harvester, also to cover FINISHING

                # compare strings here to get around limitations of JobState API
                if job.State.GetGeneralState() == updatedjob.State.GetGeneralState():
                    tmplog.debug("Job {0} still in state {1}".format(
                        job.JobID, job.State.GetGeneralState()))
                    retList.append((newstatus, ''))
                    continue

                tmplog.info("Job {0}: {1} -> {2} ({3})".format(
                    job.JobID, job.State.GetGeneralState(),
                    updatedjob.State.GetGeneralState(),
                    updatedjob.State.GetSpecificState()))

                arc_utils.arcjob2workspec(updatedjob, workspec)
                # Have to force update to change info in DB
                workspec.force_update('workAttributes')
                tmplog.debug("batchStatus {0} -> workerStatus {1}".format(
                    arcstatus.GetGeneralState(), newstatus))
                retList.append((newstatus, ''))

        return True, retList
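
The if/elif chain that converts ARC job states to WorkSpec states could also be written data-driven. A sketch using the same equality comparisons as the code above (HOLD is still deliberately unmapped, and FINISHED needs its exit-code check before the lookup):

    # Sketch: data-driven version of the ARC -> WorkSpec state conversion
    STATE_MAP = [
        (arc.JobState.RUNNING, WorkSpec.ST_running),
        (arc.JobState.FINISHING, WorkSpec.ST_running),
        (arc.JobState.FINISHED, WorkSpec.ST_finished),
        (arc.JobState.FAILED, WorkSpec.ST_failed),
        (arc.JobState.KILLED, WorkSpec.ST_cancelled),
        (arc.JobState.DELETED, WorkSpec.ST_failed),
        (arc.JobState.OTHER, WorkSpec.ST_failed),
    ]

    def _arcstate_to_workspec_state(arcstatus):
        '''Return the WorkSpec state for an ARC job state (default: submitted)'''
        for arcstate, wsstate in STATE_MAP:
            if arcstatus == arcstate:
                return wsstate
        return WorkSpec.ST_submitted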