Example #1
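# Build a JobSpec from a PanDA job JSON, make a worker with the queue's workerMaker plugin, and submit it via ARCSubmitter.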
def test():
    '''test submission'''
    from pandaharvester.harvestercore.job_spec import JobSpec
    from pandaharvester.harvestercore.plugin_factory import PluginFactory
    # assumed module paths for the other classes used below
    from pandaharvester.harvestercore.queue_config_mapper import QueueConfigMapper
    from pandaharvester.harvestersubmitter.arc_submitter import ARCSubmitter

    import json

    queuename = 'ARC-TEST'
    queueconfmapper = QueueConfigMapper()
    queueconf = queueconfmapper.get_queue(queuename)
    pluginfactory = PluginFactory()

    pandajob = '{"jobsetID": 11881, "logGUID": "88ee8a52-5c70-490c-a585-5eb6f48e4152", "cmtConfig": "x86_64-slc6-gcc49-opt", "prodDBlocks": "mc16_13TeV:mc16_13TeV.364168.Sherpa_221_NNPDF30NNLO_Wmunu_MAXHTPTV500_1000.merge.EVNT.e5340_e5984_tid11329621_00", "dispatchDBlockTokenForOut": "NULL,NULL", "destinationDBlockToken": "dst:CERN-PROD_DATADISK,dst:NDGF-T1_DATADISK", "destinationSE": "CERN-PROD_PRESERVATION", "realDatasets": "mc16_13TeV.364168.Sherpa_221_NNPDF30NNLO_Wmunu_MAXHTPTV500_1000.simul.HITS.e5340_e5984_s3126_tid11364822_00,mc16_13TeV.364168.Sherpa_221_NNPDF30NNLO_Wmunu_MAXHTPTV500_1000.simul.log.e5340_e5984_s3126_tid11364822_00", "prodUserID": "gingrich", "GUID": "A407D965-B139-A543-8851-A8E134A678D7", "realDatasetsIn": "mc16_13TeV:mc16_13TeV.364168.Sherpa_221_NNPDF30NNLO_Wmunu_MAXHTPTV500_1000.merge.EVNT.e5340_e5984_tid11329621_00", "nSent": 2, "cloud": "WORLD", "StatusCode": 0, "homepackage": "AtlasOffline/21.0.15", "inFiles": "EVNT.11329621._001079.pool.root.1", "processingType": "simul", "currentPriority": 900, "fsize": "129263662", "fileDestinationSE": "CERN-PROD_PRESERVATION,BOINC_MCORE", "scopeOut": "mc16_13TeV", "minRamCount": 1573, "jobDefinitionID": 0, "maxWalltime": 40638, "scopeLog": "mc16_13TeV", "transformation": "Sim_tf.py", "maxDiskCount": 485, "coreCount": 1, "prodDBlockToken": "NULL", "transferType": "NULL", "destinationDblock": "mc16_13TeV.364168.Sherpa_221_NNPDF30NNLO_Wmunu_MAXHTPTV500_1000.simul.HITS.e5340_e5984_s3126_tid11364822_00_sub0418634273,mc16_13TeV.364168.Sherpa_221_NNPDF30NNLO_Wmunu_MAXHTPTV500_1000.simul.log.e5340_e5984_s3126_tid11364822_00_sub0418634276", "dispatchDBlockToken": "NULL", "jobPars": "--inputEVNTFile=EVNT.11329621._001079.pool.root.1 --maxEvents=50 --postInclude \\"default:RecJobTransforms/UseFrontier.py\\" --preExec \\"EVNTtoHITS:simFlags.SimBarcodeOffset.set_Value_and_Lock(200000)\\" \\"EVNTtoHITS:simFlags.TRTRangeCut=30.0;simFlags.TightMuonStepping=True\\" --preInclude \\"EVNTtoHITS:SimulationJobOptions/preInclude.BeamPipeKill.py,SimulationJobOptions/preInclude.FrozenShowersFCalOnly.py\\" --skipEvents=4550 --firstEvent=5334551 --outputHITSFile=HITS.11364822._128373.pool.root.1 --physicsList=FTFP_BERT_ATL_VALIDATION --randomSeed=106692 --DBRelease=\\"all:current\\" --conditionsTag \\"default:OFLCOND-MC16-SDR-14\\" --geometryVersion=\\"default:ATLAS-R2-2016-01-00-01_VALIDATION\\" --runNumber=364168 --AMITag=s3126 --DataRunNumber=284500 --simulator=FullG4 --truthStrategy=MC15aPlus", "attemptNr": 2, "swRelease": "Atlas-21.0.15", "nucleus": "CERN-PROD", "maxCpuCount": 40638, "outFiles": "HITS.11364822._128373.pool.root.11,log.11364822._128373.job.log.tgz.11", "ddmEndPointOut": "CERN-PROD_DATADISK,NDGF-T1_DATADISK", "scopeIn": "mc16_13TeV", "PandaID": 3487584273, "sourceSite": "NULL", "dispatchDblock": "panda.11364822.07.05.GEN.0c9b1d3b-feec-411a-89e4-1cbf7347d70c_dis003487584270", "prodSourceLabel": "managed", "checksum": "ad:cd0bf10b", "jobName": "mc16_13TeV.364168.Sherpa_221_NNPDF30NNLO_Wmunu_MAXHTPTV500_1000.simul.e5340_e5984_s3126.3433643361", "ddmEndPointIn": "NDGF-T1_DATADISK", "taskID": 11364822, "logFile": "log.11364822._128373.job.log.tgz.1"}'
    pandajob = json.loads(pandajob)
    jspec = JobSpec()
    jspec.convert_job_json(pandajob)
    jspec.computingSite = queuename
    jspeclist = [jspec]

    maker = pluginfactory.get_plugin(queueconf.workerMaker)
    wspec = maker.make_worker(jspeclist, queueconf)

    wspec.hasJob = 1
    wspec.set_jobspec_list(jspeclist)

    sub = ARCSubmitter()
    print(sub.submit_workers([wspec]))
    print(wspec.batchID)
Example #2
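 # Build the FIFO plugin configuration from harvester_config.fifo and instantiate the backend through PluginFactory.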
 def __init__(self, **kwarg):
     FIFOBase.__init__(self, **kwarg)
     self.fifoName = '{0}_fifo'.format(self.titleName)
     pluginConf = {}
     pluginConf.update( {'titleName': self.titleName} )
     pluginConf.update( {'module': harvester_config.fifo.fifoModule,
                         'name': harvester_config.fifo.fifoClass,} )
     pluginFactory = PluginFactory()
     self.fifo = pluginFactory.get_plugin(pluginConf)
Example #3
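 # Initialize the agent's FIFO: decide whether it is enabled, assemble the plugin config (per-agent fifoModule/fifoClass or the global fifo section), and load the plugin.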
 def _initialize_fifo(self, force_enable=False):
     self.fifoName = '{0}_fifo'.format(self.titleName)
     self.config = getattr(harvester_config, self.titleName)
     if force_enable:
         self.enabled = True
     elif hasattr(self.config, 'fifoEnable') and self.config.fifoEnable:
         self.enabled = True
     else:
         self.enabled = False
         return
     pluginConf = vars(self.config).copy()
     pluginConf.update( {'titleName': self.titleName} )
     if hasattr(self.config, 'fifoModule') and hasattr(self.config, 'fifoClass'):
         pluginConf.update( {'module': self.config.fifoModule,
                             'name': self.config.fifoClass,} )
     else:
         if not hasattr(harvester_config, 'fifo'):
             return
         pluginConf.update( {'module': harvester_config.fifo.fifoModule,
                             'name': harvester_config.fifo.fifoClass,} )
     pluginFactory = PluginFactory()
     self.fifo = pluginFactory.get_plugin(pluginConf)
Example #4
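# WorkerMaker wraps the per-queue workerMaker plugin: it creates workers for job chunks (reusing ready workers first) and queries the plugin for limits such as jobs per worker and workers per job.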
class WorkerMaker(object):
    # constructor
    def __init__(self):
        self.pluginFactory = PluginFactory()
        self.dbProxy = DBProxy()

    # get plugin
    def get_plugin(self, queue_config):
        return self.pluginFactory.get_plugin(queue_config.workerMaker)

    # make workers
    def make_workers(self, jobchunk_list, queue_config, n_ready, resource_type, maker=None):
        tmpLog = core_utils.make_logger(_logger, 'queue={0} rtype={1}'.format(queue_config.queueName, resource_type),
                                        method_name='make_workers')
        tmpLog.debug('start')
        try:
            # get plugin
            if maker is None:
                maker = self.pluginFactory.get_plugin(queue_config.workerMaker)
            if maker is None:
                # not found
                tmpLog.error('plugin for {0} not found'.format(queue_config.queueName))
                return [], jobchunk_list
            # get ready workers
            readyWorkers = self.dbProxy.get_ready_workers(queue_config.queueName, n_ready)
            # loop over all chunks
            okChunks = []
            ngChunks = []
            for iChunk, jobChunk in enumerate(jobchunk_list):
                # make a worker
                if iChunk >= n_ready:
                    workSpec = maker.make_worker(jobChunk, queue_config, resource_type)
                else:
                    # use ready worker
                    if iChunk < len(readyWorkers):
                        workSpec = readyWorkers[iChunk]
                    else:
                        workSpec = None
                # failed
                if workSpec is None:
                    ngChunks.append(jobChunk)
                    continue
                # set workerID
                if workSpec.workerID is None:
                    workSpec.workerID = self.dbProxy.get_next_seq_number('SEQ_workerID')
                    workSpec.configID = queue_config.configID
                    workSpec.isNew = True
                okChunks.append((workSpec, jobChunk))
            # dump
            tmpLog.debug('made {0} workers while {1} chunks failed'.format(len(okChunks),
                                                                           len(ngChunks)))
            return okChunks, ngChunks
        except Exception:
            # dump error
            core_utils.dump_error_message(tmpLog)
            return [], jobchunk_list

    # get number of jobs per worker
    def get_num_jobs_per_worker(self, queue_config, n_workers, resource_type, maker=None):
        # get plugin
        if maker is None:
            maker = self.pluginFactory.get_plugin(queue_config.workerMaker)
        return maker.get_num_jobs_per_worker(n_workers)

    # get number of workers per job
    def get_num_workers_per_job(self, queue_config, n_workers, resource_type, maker=None):
        # get plugin
        if maker is None:
            maker = self.pluginFactory.get_plugin(queue_config.workerMaker)
        return maker.get_num_workers_per_job(n_workers)

    # check number of ready resources
    def num_ready_resources(self, queue_config, resource_type, maker=None):
        # get plugin
        if maker is None:
            maker = self.pluginFactory.get_plugin(queue_config.workerMaker)
        return maker.num_ready_resources()

    # get upper limit on the cumulative total of workers per job
    def get_max_workers_per_job_in_total(self, queue_config, resource_type, maker=None):
        # get plugin
        if maker is None:
            maker = self.pluginFactory.get_plugin(queue_config.workerMaker)
        return maker.get_max_workers_per_job_in_total()

    # get upper limit on the number of new workers per job in a cycle
    def get_max_workers_per_job_per_cycle(self, queue_config, resource_type, maker=None):
        # get plugin
        if maker is None:
            maker = self.pluginFactory.get_plugin(queue_config.workerMaker)
        return maker.get_max_workers_per_job_per_cycle()
Example #5
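# RPC service exposing submitter, monitor and messenger operations; each call resolves the plugin via PluginFactory(no_db=True) and forwards the arguments to it.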
class RpcBot(rpyc.Service):

    # initialization action
    def on_connect(self, conn):
        self.pluginFactory = PluginFactory(no_db=True)


    ######################
    # submitter section

    # submit workers
    def exposed_submit_workers(self, plugin_config, workspec_list):
        core = self.pluginFactory.get_plugin(plugin_config)
        return core.submit_workers(workspec_list)


    ######################
    # monitor section

    # check workers
    def exposed_check_workers(self, plugin_config, workspec_list):
        core = self.pluginFactory.get_plugin(plugin_config)
        return core.check_workers(workspec_list)


    ######################
    # messenger section

    # setup access points
    def exposed_setup_access_points(self, plugin_config, workspec_list):
        core = self.pluginFactory.get_plugin(plugin_config)
        return core.setup_access_points(workspec_list)

    # feed jobs
    def exposed_feed_jobs(self, plugin_config, workspec, jobspec_list):
        core = self.pluginFactory.get_plugin(plugin_config)
        return core.feed_jobs(workspec, jobspec_list)

    # request job
    def exposed_job_requested(self, plugin_config, workspec):
        core = self.pluginFactory.get_plugin(plugin_config)
        return core.job_requested(workspec)

    # request kill
    def exposed_kill_requested(self, plugin_config, workspec):
        core = self.pluginFactory.get_plugin(plugin_config)
        return core.kill_requested(workspec)

    # is alive
    def exposed_is_alive(self, plugin_config, workspec, worker_heartbeat_limit):
        core = self.pluginFactory.get_plugin(plugin_config)
        return core.is_alive(workspec, worker_heartbeat_limit)

    # get work attributes
    def exposed_get_work_attributes(self, plugin_config, workspec):
        core = self.pluginFactory.get_plugin(plugin_config)
        return core.get_work_attributes(workspec)

    # get output files
    def exposed_get_files_to_stage_out(self, plugin_config, workspec):
        core = self.pluginFactory.get_plugin(plugin_config)
        return core.get_files_to_stage_out(workspec)

    # feed events
    def exposed_feed_events(self, plugin_config, workspec, events_dict):
        core = self.pluginFactory.get_plugin(plugin_config)
        return core.feed_events(workspec, events_dict)

    # get events
    def exposed_events_to_update(self, plugin_config, workspec):
        core = self.pluginFactory.get_plugin(plugin_config)
        return core.events_to_update(workspec)

    # request events
    def exposed_events_requested(self, plugin_config, workspec):
        core = self.pluginFactory.get_plugin(plugin_config)
        return core.events_requested(workspec)

    # get PandaIDs
    def exposed_get_panda_ids(self, plugin_config, workspec):
        core = self.pluginFactory.get_plugin(plugin_config)
        return core.get_panda_ids(workspec)

    # post processing
    def exposed_post_processing(self, plugin_config, workspec, jobspec_list, map_type):
        core = self.pluginFactory.get_plugin(plugin_config)
        return core.post_processing(workspec, jobspec_list, map_type)

    # send ACK
    def exposed_acknowledge_events_files(self, plugin_config, workspec):
        core = self.pluginFactory.get_plugin(plugin_config)
        return core.acknowledge_events_files(workspec)
Example #6
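# Submitter agent: determines how many workers each queue/resource type needs, builds them with WorkerMaker, feeds jobs through the messenger plugin, then submits and registers the workers.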
class Submitter(AgentBase):
    # constructor
    def __init__(self, queue_config_mapper, single_mode=False):
        AgentBase.__init__(self, single_mode)
        self.queueConfigMapper = queue_config_mapper
        self.dbProxy = DBProxy()
        self.workerMaker = WorkerMaker()
        self.workerAdjuster = WorkerAdjuster(queue_config_mapper)
        self.pluginFactory = PluginFactory()

    # main loop
    def run(self):
        lockedBy = 'submitter-{0}'.format(self.ident)
        while True:
            mainLog = core_utils.make_logger(_logger,
                                             'id={0}'.format(lockedBy),
                                             method_name='run')
            mainLog.debug('getting queues to submit workers')

            # get queues associated to a site to submit workers
            curWorkers, siteName, resMap = self.dbProxy.get_queues_to_submit(
                harvester_config.submitter.nQueues,
                harvester_config.submitter.lookupTime,
                harvester_config.submitter.lockInterval)
            if siteName is not None:
                mainLog.debug('got {0} queues for site {1}'.format(
                    len(curWorkers), siteName))

                # get commands
                comStr = '{0}:{1}'.format(CommandSpec.COM_setNWorkers,
                                          siteName)
                commandSpecs = self.dbProxy.get_commands_for_receiver(
                    'submitter', comStr)
                mainLog.debug('got {0} commands'.format(len(commandSpecs)))
                for commandSpec in commandSpecs:
                    newLimits = self.dbProxy.set_queue_limit(
                        siteName, commandSpec.params)
                    for tmpResource, tmpNewVal in iteritems(newLimits):
                        # if available, overwrite new worker value with the command from panda server
                        if tmpResource in resMap:
                            tmpQueueName = resMap[tmpResource]
                            if tmpQueueName in curWorkers:
                                curWorkers[tmpQueueName][tmpResource][
                                    'nNewWorkers'] = tmpNewVal

                # define number of new workers
                if len(curWorkers) == 0:
                    n_workers_per_queue_and_rt = dict()
                else:
                    n_workers_per_queue_and_rt = self.workerAdjuster.define_num_workers(
                        curWorkers, siteName)

                if n_workers_per_queue_and_rt is None:
                    mainLog.error(
                        'WorkerAdjuster failed to define the number of workers'
                    )
                elif len(n_workers_per_queue_and_rt) == 0:
                    pass
                else:
                    # loop over all queues and resource types
                    for queueName in n_workers_per_queue_and_rt:
                        for resource_type, tmpVal in iteritems(
                                n_workers_per_queue_and_rt[queueName]):

                            tmpLog = core_utils.make_logger(
                                _logger,
                                'id={0} queue={1} resource_type={2}'.format(
                                    lockedBy, queueName, resource_type),
                                method_name='run')
                            tmpLog.debug('start')
                            nWorkers = tmpVal['nNewWorkers'] + tmpVal['nReady']
                            nReady = tmpVal['nReady']

                            # check queue
                            if not self.queueConfigMapper.has_queue(queueName):
                                tmpLog.error('config not found')
                                continue

                            # no new workers
                            if nWorkers == 0:
                                tmpLog.debug(
                                    'skipped since no new worker is needed based on current stats'
                                )
                                continue
                            # get queue
                            queueConfig = self.queueConfigMapper.get_queue(
                                queueName)

                            # actions based on mapping type
                            if queueConfig.mapType == WorkSpec.MT_NoJob:
                                # workers without jobs
                                jobChunks = []
                                for i in range(nWorkers):
                                    jobChunks.append([])
                            elif queueConfig.mapType == WorkSpec.MT_OneToOne:
                                # one worker per one job
                                jobChunks = self.dbProxy.get_job_chunks_for_workers(
                                    queueName, nWorkers, nReady, 1, None,
                                    queueConfig.useJobLateBinding,
                                    harvester_config.submitter.checkInterval,
                                    harvester_config.submitter.lockInterval,
                                    lockedBy)
                            elif queueConfig.mapType == WorkSpec.MT_MultiJobs:
                                # one worker for multiple jobs
                                nJobsPerWorker = self.workerMaker.get_num_jobs_per_worker(
                                    queueConfig, nWorkers, resource_type)
                                tmpLog.debug('nJobsPerWorker={0}'.format(
                                    nJobsPerWorker))
                                jobChunks = self.dbProxy.get_job_chunks_for_workers(
                                    queueName, nWorkers, nReady,
                                    nJobsPerWorker, None,
                                    queueConfig.useJobLateBinding,
                                    harvester_config.submitter.checkInterval,
                                    harvester_config.submitter.lockInterval,
                                    lockedBy, queueConfig.allowJobMixture)
                            elif queueConfig.mapType == WorkSpec.MT_MultiWorkers:
                                # multiple workers for one job
                                nWorkersPerJob = self.workerMaker.get_num_workers_per_job(
                                    queueConfig, nWorkers, resource_type)
                                jobChunks = self.dbProxy.get_job_chunks_for_workers(
                                    queueName, nWorkers, nReady, None,
                                    nWorkersPerJob,
                                    queueConfig.useJobLateBinding,
                                    harvester_config.submitter.checkInterval,
                                    harvester_config.submitter.lockInterval,
                                    lockedBy)
                            else:
                                tmpLog.error('unknown mapType={0}'.format(
                                    queueConfig.mapType))
                                continue

                            tmpLog.debug('got {0} job chunks'.format(
                                len(jobChunks)))
                            if len(jobChunks) == 0:
                                continue
                            # make workers
                            okChunks, ngChunks = self.workerMaker.make_workers(
                                jobChunks, queueConfig, nReady, resource_type)
                            if len(ngChunks) == 0:
                                tmpLog.debug(
                                    'successfully made {0} workers'.format(
                                        len(okChunks)))
                            else:
                                tmpLog.debug(
                                    'made {0} workers, while {1} workers failed'
                                    .format(len(okChunks), len(ngChunks)))
                            timeNow = datetime.datetime.utcnow()
                            # NG (=not good)
                            for ngJobs in ngChunks:
                                for jobSpec in ngJobs:
                                    jobSpec.status = 'failed'
                                    jobSpec.subStatus = 'failedtomake'
                                    jobSpec.stateChangeTime = timeNow
                                    jobSpec.lockedBy = None
                                    jobSpec.trigger_propagation()
                                    self.dbProxy.update_job(
                                        jobSpec, {
                                            'lockedBy': lockedBy,
                                            'subStatus': 'prepared'
                                        })
                            # OK
                            pandaIDs = set()
                            workSpecList = []
                            if len(okChunks) > 0:
                                for workSpec, okJobs in okChunks:
                                    # has job
                                    if (queueConfig.useJobLateBinding and workSpec.workerID is None) \
                                            or queueConfig.mapType == WorkSpec.MT_NoJob:
                                        workSpec.hasJob = 0
                                    else:
                                        workSpec.hasJob = 1
                                        if workSpec.nJobsToReFill in [None, 0]:
                                            workSpec.set_jobspec_list(okJobs)
                                        else:
                                            # refill free slots while the worker is running
                                            workSpec.set_jobspec_list(okJobs[:workSpec.nJobsToReFill])
                                            workSpec.nJobsToReFill = None
                                            for jobSpec in okJobs[workSpec.nJobsToReFill:]:
                                                pandaIDs.add(jobSpec.PandaID)
                                    # map type
                                    workSpec.mapType = queueConfig.mapType
                                    # queue name
                                    workSpec.computingSite = queueConfig.queueName
                                    # set access point
                                    workSpec.accessPoint = queueConfig.messenger[
                                        'accessPoint']
                                    # events
                                    if len(okJobs) > 0 and (
                                            'eventService'
                                            in okJobs[0].jobParams or
                                            'cloneJob' in okJobs[0].jobParams):
                                        workSpec.eventsRequest = WorkSpec.EV_useEvents
                                    workSpecList.append(workSpec)
                            if len(workSpecList) > 0:
                                # get plugin for submitter
                                submitterCore = self.pluginFactory.get_plugin(
                                    queueConfig.submitter)
                                if submitterCore is None:
                                    # not found
                                    tmpLog.error(
                                        'submitter plugin for {0} not found'.
                                        format(jobSpec.computingSite))
                                    continue
                                # get plugin for messenger
                                messenger = self.pluginFactory.get_plugin(
                                    queueConfig.messenger)
                                if messenger is None:
                                    # not found
                                    tmpLog.error(
                                        'messenger plugin for {0} not found'.
                                        format(jobSpec.computingSite))
                                    continue
                                # setup access points
                                messenger.setup_access_points(workSpecList)
                                # feed jobs
                                for workSpec in workSpecList:
                                    if workSpec.hasJob == 1:
                                        tmpStat = messenger.feed_jobs(
                                            workSpec,
                                            workSpec.get_jobspec_list())
                                        if tmpStat is False:
                                            tmpLog.error(
                                                'failed to send jobs to workerID={0}'
                                                .format(workSpec.workerID))
                                        else:
                                            tmpLog.debug(
                                                'sent jobs to workerID={0} with {1}'
                                                .format(
                                                    workSpec.workerID,
                                                    tmpStat))
                                # insert workers
                                self.dbProxy.insert_workers(
                                    workSpecList, lockedBy)
                                # submit
                                tmpLog.info('submitting {0} workers'.format(
                                    len(workSpecList)))
                                workSpecList, tmpRetList, tmpStrList = self.submit_workers(
                                    submitterCore, workSpecList)
                                for iWorker, (tmpRet, tmpStr) in enumerate(
                                        zip(tmpRetList, tmpStrList)):
                                    workSpec, jobList = okChunks[iWorker]
                                    # use associated job list since it can be truncated for re-filling
                                    jobList = workSpec.get_jobspec_list()
                                    # set status
                                    if not tmpRet:
                                        # failed submission
                                        tmpLog.error(
                                            'failed to submit a workerID={0} with {1}'
                                            .format(workSpec.workerID, tmpStr))
                                        workSpec.set_status(WorkSpec.ST_missed)
                                        workSpec.set_dialog_message(tmpStr)
                                        jobList = []
                                    elif queueConfig.useJobLateBinding and workSpec.hasJob == 1:
                                        # directly go to running after feeding jobs for late binding
                                        workSpec.set_status(
                                            WorkSpec.ST_running)
                                    else:
                                        # normal successful submission
                                        workSpec.set_status(
                                            WorkSpec.ST_submitted)
                                    workSpec.submitTime = timeNow
                                    workSpec.modificationTime = timeNow
                                    # prefetch events
                                    if tmpRet and workSpec.hasJob == 1 and workSpec.eventsRequest == WorkSpec.EV_useEvents:
                                        workSpec.eventsRequest = WorkSpec.EV_requestEvents
                                        eventsRequestParams = dict()
                                        for jobSpec in jobList:
                                            eventsRequestParams[jobSpec.PandaID] = {
                                                'pandaID': jobSpec.PandaID,
                                                'taskID': jobSpec.taskID,
                                                'jobsetID': jobSpec.jobParams['jobsetID'],
                                                'nRanges': jobSpec.jobParams['coreCount'],
                                            }
                                        workSpec.eventsRequestParams = eventsRequestParams
                                    # register worker
                                    tmpStat = self.dbProxy.register_worker(
                                        workSpec, jobList, lockedBy)
                                    if jobList is not None:
                                        for jobSpec in jobList:
                                            pandaIDs.add(jobSpec.PandaID)
                                            if tmpStat:
                                                tmpStr = 'submitted a workerID={0} for PandaID={1} with batchID={2}'
                                                tmpLog.info(
                                                    tmpStr.format(
                                                        workSpec.workerID,
                                                        jobSpec.PandaID,
                                                        workSpec.batchID))
                                            else:
                                                tmpStr = 'failed to register a worker for PandaID={0} with batchID={1}'
                                                tmpLog.error(
                                                    tmpStr.format(
                                                        jobSpec.PandaID,
                                                        workSpec.batchID))
                            # release jobs
                            self.dbProxy.release_jobs(pandaIDs, lockedBy)
                            tmpLog.info('done')
            mainLog.debug('done')
            # define sleep interval
            if siteName is None:
                sleepTime = harvester_config.submitter.sleepTime
            else:
                sleepTime = 0
            # check if being terminated
            if self.terminated(sleepTime):
                mainLog.debug('terminated')
                return

    # wrapper for submitWorkers to skip ready workers
    def submit_workers(self, submitter_core, workspec_list):
        retList = []
        strList = []
        newSpecList = []
        workersToSubmit = []
        for workSpec in workspec_list:
            if workSpec.status == WorkSpec.ST_ready:
                newSpecList.append(workSpec)
                retList.append(True)
                strList.append('')
            else:
                workersToSubmit.append(workSpec)
        tmpRetList = submitter_core.submit_workers(workersToSubmit)
        for tmpRet, tmpStr in tmpRetList:
            retList.append(tmpRet)
            strList.append(tmpStr)
        newSpecList += workersToSubmit
        return newSpecList, retList, strList
Example #7
if len(sys.argv) > 2:
   begin_job_id = int(sys.argv[2])
if len(sys.argv) > 3:
   end_job_id = int(sys.argv[3])
if len(sys.argv) > 4:
   globus_sleep_time = int(sys.argv[4])

queueConfigMapper = QueueConfigMapper()
queueConfig = queueConfigMapper.get_queue(queueName)
initial_queueConfig_preparator = queueConfig.preparator
queueConfig.preparator['module'] = 'pandaharvester.harvesterpreparator.go_bulk_preparator'
queueConfig.preparator['name'] = 'GlobusBulkPreparator'
modified_queueConfig_preparator = queueConfig.preparator

pluginFactory = PluginFactory()
# get stage-out plugin
preparatorCore = pluginFactory.get_plugin(queueConfig.preparator)

# logger
_logger = core_utils.setup_logger('stageInTest_go_bulk_preparator')
tmpLog = core_utils.make_logger(_logger, method_name='stageInTest_go_bulk_preparator')
tmpLog.debug('start')

for loggerName, loggerObj in logging.Logger.manager.loggerDict.items():
   #print "loggerName - {}".format(loggerName)
   if loggerName.startswith('panda.log'):
      if len(loggerObj.handlers) == 0:
         continue
      if loggerName.split('.')[-1] in ['db_proxy']:
         continue
      stdoutHandler = logging.StreamHandler(sys.stdout)
      stdoutHandler.setFormatter(loggerObj.handlers[0].formatter)
      loggerObj.addHandler(stdoutHandler)
Example #8
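# Build one credential-manager plugin per (module, class, in/out certificate, VOMS) entry from harvester_config.credmanager, then mirror panda.log output to stdout and loop over the plugins.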
vomses = get_list(harvester_config.credmanager.voms)

# logger
_logger = core_utils.setup_logger('credManagerTest')

# get plugin(s)
exeCores = []
for moduleName, className, inCertFile, outCertFile, voms in \
        zip(moduleNames, classNames, inCertFiles, outCertFiles, vomses):
    pluginPar = {}
    pluginPar['module'] = moduleName
    pluginPar['name'] = className
    pluginPar['inCertFile'] = inCertFile
    pluginPar['outCertFile'] = outCertFile
    pluginPar['voms'] = voms
    exeCore = pluginFactory.get_plugin(pluginPar)
    exeCores.append(exeCore)

# setup logger to write to screen also
for loggerName, loggerObj in iteritems(logging.Logger.manager.loggerDict):
    if loggerName.startswith('panda.log'):
        if len(loggerObj.handlers) == 0:
            continue
        if loggerName.split('.')[-1] in ['db_proxy']:
            continue
        stdoutHandler = logging.StreamHandler(sys.stdout)
        stdoutHandler.setFormatter(loggerObj.handlers[0].formatter)
        loggerObj.addHandler(stdoutHandler)

# loop over all plugins
for exeCore in exeCores:
Example #9
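# Monitor agent: checks workers either directly from the DB or via the monitor FIFO, updates jobs and workers, and re-enqueues still-active workers to the FIFO.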
class Monitor(AgentBase):
    # fifos
    monitor_fifo = MonitorFIFO()

    # constructor
    def __init__(self, queue_config_mapper, single_mode=False):
        AgentBase.__init__(self, single_mode)
        self.queueConfigMapper = queue_config_mapper
        self.dbProxy = DBProxy()
        self.pluginFactory = PluginFactory()
        self.startTimestamp = time.time()

    # main loop
    def run(self):
        lockedBy = 'monitor-{0}'.format(self.ident)
        # init messengers
        for queueConfig in self.queueConfigMapper.get_all_queues().values():
            # just import for module initialization
            self.pluginFactory.get_plugin(queueConfig.messenger)
        # main
        last_DB_cycle_timestamp = 0
        monitor_fifo = self.monitor_fifo
        while True:
            sw = core_utils.get_stopwatch()
            mainLog = self.make_logger(_logger,
                                       'id={0}'.format(lockedBy),
                                       method_name='run')

            if time.time() >= last_DB_cycle_timestamp + harvester_config.monitor.sleepTime:
                # run with workers from DB
                mainLog.debug('starting run with DB')
                mainLog.debug('getting workers to monitor')
                workSpecsPerQueue = self.dbProxy.get_workers_to_update(
                    harvester_config.monitor.maxWorkers,
                    harvester_config.monitor.checkInterval,
                    harvester_config.monitor.lockInterval, lockedBy)
                mainLog.debug('got {0} queues'.format(len(workSpecsPerQueue)))
                # loop over all workers
                for queueName, configIdWorkSpecs in iteritems(
                        workSpecsPerQueue):
                    for configID, workSpecsList in iteritems(
                            configIdWorkSpecs):
                        retVal = self.monitor_agent_core(lockedBy,
                                                         queueName,
                                                         workSpecsList,
                                                         config_id=configID)
                        if self.monitor_fifo.enabled and retVal is not None:
                            workSpecsToEnqueue, workSpecsToEnqueueToHead, timeNow_timestamp, fifoCheckInterval = retVal
                            if workSpecsToEnqueue:
                                mainLog.debug('putting workers to FIFO')
                                try:
                                    score = fifoCheckInterval + timeNow_timestamp
                                    monitor_fifo.put(
                                        (queueName, workSpecsToEnqueue), score)
                                    mainLog.info(
                                        'put workers of {0} to FIFO with score {1}'
                                        .format(queueName, score))
                                except Exception as errStr:
                                    mainLog.error(
                                        'failed to put object from FIFO: {0}'.
                                        format(errStr))
                            if workSpecsToEnqueueToHead:
                                mainLog.debug('putting workers to FIFO head')
                                try:
                                    score = fifoCheckInterval - timeNow_timestamp
                                    monitor_fifo.put(
                                        (queueName, workSpecsToEnqueueToHead),
                                        score)
                                    mainLog.info(
                                        'put workers of {0} to FIFO with score {1}'
                                        .format(queueName, score))
                                except Exception as errStr:
                                    mainLog.error(
                                        'failed to put object from FIFO head: {0}'
                                        .format(errStr))
                last_DB_cycle_timestamp = time.time()
                mainLog.debug('ended run with DB')
            elif self.monitor_fifo.enabled:
                # run with workers from FIFO
                if monitor_fifo.to_check_workers():
                    # check fifo size
                    fifo_size = monitor_fifo.size()
                    mainLog.debug('FIFO size is {0}'.format(fifo_size))
                    mainLog.debug('starting run with FIFO')
                    try:
                        obj_gotten = monitor_fifo.get(timeout=1)
                    except Exception as errStr:
                        mainLog.error(
                            'failed to get object from FIFO: {0}'.format(
                                errStr))
                    else:
                        if obj_gotten is not None:
                            queueName, workSpecsList = obj_gotten
                            mainLog.debug('got {0} workers of {1}'.format(
                                len(workSpecsList), queueName))
                            configID = workSpecsList[0][0].configID
                            for workSpecs in workSpecsList:
                                for workSpec in workSpecs:
                                    if workSpec.pandaid_list is None:
                                        _jobspec_list = workSpec.get_jobspec_list()
                                        if _jobspec_list is not None:
                                            workSpec.pandaid_list = [j.PandaID for j in workSpec.get_jobspec_list()]
                                        else:
                                            workSpec.pandaid_list = []
                                        workSpec.force_update('pandaid_list')
                            retVal = self.monitor_agent_core(
                                lockedBy,
                                queueName,
                                workSpecsList,
                                from_fifo=True,
                                config_id=configID)
                            if retVal is not None:
                                workSpecsToEnqueue, workSpecsToEnqueueToHead, timeNow_timestamp, fifoCheckInterval = retVal
                                if workSpecsToEnqueue:
                                    mainLog.debug('putting workers to FIFO')
                                    try:
                                        score = fifoCheckInterval + timeNow_timestamp
                                        monitor_fifo.put(
                                            (queueName, workSpecsToEnqueue),
                                            score)
                                        mainLog.info(
                                            'put workers of {0} to FIFO with score {1}'
                                            .format(queueName, score))
                                    except Exception as errStr:
                                        mainLog.error(
                                            'failed to put object from FIFO: {0}'
                                            .format(errStr))
                                if workSpecsToEnqueueToHead:
                                    mainLog.debug(
                                        'putting workers to FIFO head')
                                    try:
                                        score = fifoCheckInterval - timeNow_timestamp
                                        monitor_fifo.put(
                                            (queueName,
                                             workSpecsToEnqueueToHead), score)
                                        mainLog.info(
                                            'put workers of {0} to FIFO with score {1}'
                                            .format(queueName, score))
                                    except Exception as errStr:
                                        mainLog.error(
                                            'failed to put object from FIFO head: {0}'
                                            .format(errStr))
                            else:
                                mainLog.debug(
                                    'monitor_agent_core returned None. Skipped putting to FIFO'
                                )
                        else:
                            mainLog.debug('got nothing in FIFO')
                    mainLog.debug('ended run with FIFO')
                else:
                    mainLog.debug(
                        'workers in FIFO too young to check. Skipped')

            if sw.get_elapsed_time_in_sec() > harvester_config.monitor.lockInterval:
                mainLog.warning(
                    'a single cycle was longer than lockInterval ' +
                    sw.get_elapsed_time())
            else:
                mainLog.debug('done' + sw.get_elapsed_time())

            # check if being terminated
            sleepTime = (harvester_config.monitor.fifoSleepTimeMilli / 1000.0) \
                            if self.monitor_fifo.enabled else harvester_config.monitor.sleepTime
            if self.terminated(sleepTime):
                mainLog.debug('terminated')
                return

    # core of monitor agent to check workers in workSpecsList of queueName
    def monitor_agent_core(self,
                           lockedBy,
                           queueName,
                           workSpecsList,
                           from_fifo=False,
                           config_id=None):
        tmpQueLog = self.make_logger(_logger,
                                     'id={0} queue={1}'.format(
                                         lockedBy, queueName),
                                     method_name='run')
        # check queue
        if not self.queueConfigMapper.has_queue(queueName, config_id):
            tmpQueLog.error('config not found')
            return
        # get queue
        queueConfig = self.queueConfigMapper.get_queue(queueName, config_id)
        # get plugins
        monCore = self.pluginFactory.get_plugin(queueConfig.monitor)
        messenger = self.pluginFactory.get_plugin(queueConfig.messenger)
        # workspec chunk of active workers
        workSpecsToEnqueue = []
        workSpecsToEnqueueToHead = []
        timeNow_timestamp = time.time()
        # get fifoCheckInterval for PQ
        try:
            fifoCheckInterval = monCore.fifoCheckInterval
        except:
            if hasattr(harvester_config.monitor, 'fifoCheckInterval'):
                fifoCheckInterval = harvester_config.monitor.fifoCheckInterval
            else:
                fifoCheckInterval = harvester_config.monitor.checkInterval
        # check workers
        allWorkers = [item for sublist in workSpecsList for item in sublist]
        tmpQueLog.debug('checking {0} workers'.format(len(allWorkers)))
        tmpStat, tmpRetMap = self.check_workers(monCore, messenger, allWorkers,
                                                queueConfig, tmpQueLog)
        if tmpStat:
            # loop over all worker chunks
            tmpQueLog.debug('update jobs and workers')
            iWorker = 0
            for workSpecs in workSpecsList:
                jobSpecs = None
                filesToStageOut = dict()
                pandaIDsList = []
                eventsToUpdateList = []
                filesToStageOutList = []
                mapType = workSpecs[0].mapType
                # lock workers for fifo
                temRetLockWorker = None
                if from_fifo:
                    # lock workers
                    worker_id_list = [w.workerID for w in workSpecs]
                    temRetLockWorker = self.dbProxy.lock_workers(
                        worker_id_list, harvester_config.monitor.lockInterval,
                        lockedBy)
                    # skip if not locked
                    if not temRetLockWorker:
                        continue
                # loop over workSpecs
                for workSpec in workSpecs:
                    tmpLog = self.make_logger(_logger,
                                              'id={0} workerID={1}'.format(
                                                  lockedBy, workSpec.workerID),
                                              method_name='run')
                    tmpOut = tmpRetMap[workSpec.workerID]
                    newStatus = tmpOut['newStatus']
                    monStatus = tmpOut['monStatus']
                    diagMessage = tmpOut['diagMessage']
                    workAttributes = tmpOut['workAttributes']
                    eventsToUpdate = tmpOut['eventsToUpdate']
                    filesToStageOut = tmpOut['filesToStageOut']
                    eventsRequestParams = tmpOut['eventsRequestParams']
                    nJobsToReFill = tmpOut['nJobsToReFill']
                    pandaIDs = tmpOut['pandaIDs']
                    tmpStr = 'newStatus={0} monitoredStatus={1} diag={2} '
                    tmpStr += 'postProcessed={3} files={4}'
                    tmpLog.debug(
                        tmpStr.format(newStatus, monStatus, diagMessage,
                                      workSpec.is_post_processed(),
                                      str(filesToStageOut)))
                    iWorker += 1
                    if from_fifo:
                        workSpec.lockedBy = lockedBy
                        workSpec.force_update('lockedBy')
                    # check status
                    if newStatus not in WorkSpec.ST_LIST:
                        tmpLog.error('unknown status={0}'.format(newStatus))
                        return
                    # update worker
                    workSpec.set_status(newStatus)
                    workSpec.set_work_attributes(workAttributes)
                    workSpec.set_dialog_message(diagMessage)
                    if monStatus == WorkSpec.ST_failed:
                        if not workSpec.has_pilot_error():
                            workSpec.set_pilot_error(
                                PilotErrors.ERR_GENERALERROR, diagMessage)
                    elif monStatus == WorkSpec.ST_cancelled:
                        if not workSpec.has_pilot_error():
                            workSpec.set_pilot_error(PilotErrors.ERR_PANDAKILL,
                                                     diagMessage)
                    # request events
                    if eventsRequestParams != {}:
                        workSpec.eventsRequest = WorkSpec.EV_requestEvents
                        workSpec.eventsRequestParams = eventsRequestParams
                    # jobs to refill
                    if nJobsToReFill is not None:
                        workSpec.nJobsToReFill = nJobsToReFill
                    # get associated jobs for the worker chunk
                    if workSpec.hasJob == 1 and jobSpecs is None:
                        jobSpecs = self.dbProxy.get_jobs_with_worker_id(
                            workSpec.workerID, None, only_running=True)
                    # pandaIDs for push
                    pandaIDsList.append(pandaIDs)
                    if len(eventsToUpdate) > 0:
                        eventsToUpdateList.append(eventsToUpdate)
                    if len(filesToStageOut) > 0:
                        filesToStageOutList.append(filesToStageOut)
                # update jobs and workers
                if jobSpecs is not None:
                    tmpQueLog.debug(
                        'updating {0} jobs with {1} workers'.format(
                            len(jobSpecs), len(workSpecs)))
                    core_utils.update_job_attributes_with_workers(
                        mapType, jobSpecs, workSpecs, filesToStageOutList,
                        eventsToUpdateList)
                    for jobSpec in jobSpecs:
                        tmpLog = self.make_logger(_logger,
                                                  'id={0} PandaID={1}'.format(
                                                      lockedBy,
                                                      jobSpec.PandaID),
                                                  method_name='run')
                        tmpLog.debug(
                            'new status={0} subStatus={1} status_in_metadata={2}'
                            .format(jobSpec.status, jobSpec.subStatus,
                                    jobSpec.get_job_status_from_attributes()))
                # update local database
                tmpRet = self.dbProxy.update_jobs_workers(
                    jobSpecs, workSpecs, lockedBy, pandaIDsList)
                if not tmpRet:
                    for workSpec in workSpecs:
                        tmpLog = self.make_logger(_logger,
                                                  'id={0} workerID={1}'.format(
                                                      lockedBy,
                                                      workSpec.workerID),
                                                  method_name='run')
                        if from_fifo:
                            tmpLog.info(
                                'failed to update the DB. Maybe locked by other thread running with DB'
                            )
                        else:
                            tmpLog.error(
                                'failed to update the DB. lockInterval may be too short'
                            )
                        sendWarning = True
                # send ACK to workers for events and files
                if len(eventsToUpdateList) > 0 or len(filesToStageOutList) > 0:
                    for workSpec in workSpecs:
                        messenger.acknowledge_events_files(workSpec)
                # active workers for fifo
                if self.monitor_fifo.enabled and workSpecs:
                    workSpec = workSpecs[0]
                    tmpOut = tmpRetMap[workSpec.workerID]
                    newStatus = tmpOut['newStatus']
                    monStatus = tmpOut['monStatus']
                    if newStatus in [WorkSpec.ST_submitted, WorkSpec.ST_running] \
                        and workSpec.mapType != WorkSpec.MT_MultiWorkers \
                        and workSpec.workAttributes is not None:
                        forceEnqueueInterval = harvester_config.monitor.fifoForceEnqueueInterval
                        timeNow = datetime.datetime.utcnow()
                        timeNow_timestamp = time.time()
                        _bool, lastCheckAt = workSpec.get_work_params(
                            'lastCheckAt')
                        try:
                            last_check_period = timeNow_timestamp - lastCheckAt
                        except TypeError:
                            last_check_period = forceEnqueueInterval + 1.0
                        if (from_fifo and tmpRet) \
                            or (not from_fifo and timeNow_timestamp - harvester_config.monitor.sleepTime > self.startTimestamp
                                and last_check_period > forceEnqueueInterval):
                            if not from_fifo and _bool and lastCheckAt is not None \
                                and last_check_period > harvester_config.monitor.checkInterval:
                                tmpQueLog.warning(
                                    'last check period of workerID={0} is {1} sec, longer than monitor checkInterval'
                                    .format(workSpec.workerID,
                                            last_check_period))
                            workSpec.set_work_params(
                                {'lastCheckAt': timeNow_timestamp})
                            workSpec.lockedBy = None
                            workSpec.force_update('lockedBy')
                            if monStatus in [
                                    WorkSpec.ST_finished, WorkSpec.ST_failed,
                                    WorkSpec.ST_cancelled
                            ]:
                                _bool, startFifoPreemptAt = workSpec.get_work_params(
                                    'startFifoPreemptAt')
                                if not _bool or startFifoPreemptAt is None:
                                    startFifoPreemptAt = timeNow_timestamp
                                    workSpec.set_work_params({
                                        'startFifoPreemptAt':
                                        startFifoPreemptAt
                                    })
                                tmpQueLog.debug(
                                    'workerID={0} , startFifoPreemptAt: {1}'.
                                    format(workSpec.workerID,
                                           startFifoPreemptAt))
                                if timeNow_timestamp - startFifoPreemptAt < harvester_config.monitor.fifoMaxPreemptInterval:
                                    workSpecsToEnqueueToHead.append(workSpecs)
                                else:
                                    workSpec.set_work_params({
                                        'startFifoPreemptAt':
                                        timeNow_timestamp
                                    })
                                    workSpec.modificationTime = timeNow
                                    workSpec.force_update('modificationTime')
                                    workSpecsToEnqueue.append(workSpecs)
                            else:
                                workSpec.modificationTime = timeNow
                                workSpec.force_update('modificationTime')
                                workSpecsToEnqueue.append(workSpecs)
        else:
            tmpQueLog.error('failed to check workers')
        retVal = workSpecsToEnqueue, workSpecsToEnqueueToHead, timeNow_timestamp, fifoCheckInterval
        tmpQueLog.debug('done')
        return retVal

    # wrapper for checkWorkers
    def check_workers(self, mon_core, messenger, all_workers, queue_config,
                      tmp_log):
        workersToCheck = []
        retMap = dict()
        for workSpec in all_workers:
            eventsRequestParams = {}
            eventsToUpdate = []
            pandaIDs = []
            workStatus = None
            workAttributes = None
            filesToStageOut = []
            nJobsToReFill = None
            # job-level late binding
            if workSpec.hasJob == 0 and workSpec.mapType != WorkSpec.MT_NoJob:
                # check if job is requested
                jobRequested = messenger.job_requested(workSpec)
                if jobRequested:
                    # set ready when job is requested
                    workStatus = WorkSpec.ST_ready
                else:
                    workStatus = workSpec.status
            elif workSpec.nJobsToReFill in [0, None]:
                # check if job is requested to refill free slots
                jobRequested = messenger.job_requested(workSpec)
                if jobRequested:
                    nJobsToReFill = jobRequested
                workersToCheck.append(workSpec)
            else:
                workersToCheck.append(workSpec)
            # add
            retMap[workSpec.workerID] = {
                'newStatus': workStatus,
                'monStatus': workStatus,
                'workAttributes': workAttributes,
                'filesToStageOut': filesToStageOut,
                'eventsRequestParams': eventsRequestParams,
                'eventsToUpdate': eventsToUpdate,
                'diagMessage': '',
                'pandaIDs': pandaIDs,
                'nJobsToReFill': nJobsToReFill
            }
        # check workers
        tmp_log.debug('checking workers with plugin')
        try:
            tmpStat, tmpOut = mon_core.check_workers(workersToCheck)
            if not tmpStat:
                tmp_log.error(
                    'failed to check workers with: {0}'.format(tmpOut))
            else:
                tmp_log.debug('checked')
                for workSpec, (newStatus,
                               diagMessage) in zip(workersToCheck, tmpOut):
                    workerID = workSpec.workerID
                    tmp_log.debug(
                        'Going to check workerID={0}'.format(workerID))
                    pandaIDs = []
                    if workerID in retMap:
                        # request kill
                        if messenger.kill_requested(workSpec):
                            self.dbProxy.kill_worker(workSpec.workerID)

                        # expired heartbeat - only when requested in the configuration
                        try:
                            # check if the queue configuration requires checking for worker heartbeat
                            worker_heartbeat_limit = int(
                                queue_config.messenger['worker_heartbeat'])
                        except (AttributeError, KeyError):
                            worker_heartbeat_limit = None
                        tmp_log.debug(
                            'workerID={0} heartbeat limit is configured to {1}'
                            .format(workerID, worker_heartbeat_limit))
                        if worker_heartbeat_limit:
                            if messenger.is_alive(workSpec,
                                                  worker_heartbeat_limit):
                                tmp_log.debug(
                                    'heartbeat for workerID={0} is valid'.
                                    format(workerID))
                            else:
                                tmp_log.debug(
                                    'heartbeat for workerID={0} expired: sending kill request'
                                    .format(workerID))
                                self.dbProxy.kill_worker(workSpec.workerID)

                        # get work attributes
                        workAttributes = messenger.get_work_attributes(
                            workSpec)
                        retMap[workerID]['workAttributes'] = workAttributes
                        # get output files
                        filesToStageOut = messenger.get_files_to_stage_out(
                            workSpec)
                        retMap[workerID]['filesToStageOut'] = filesToStageOut
                        # get events to update
                        if workSpec.eventsRequest in [
                                WorkSpec.EV_useEvents,
                                WorkSpec.EV_requestEvents
                        ]:
                            eventsToUpdate = messenger.events_to_update(
                                workSpec)
                            retMap[workerID]['eventsToUpdate'] = eventsToUpdate
                        # request events
                        if workSpec.eventsRequest == WorkSpec.EV_useEvents:
                            eventsRequestParams = messenger.events_requested(
                                workSpec)
                            retMap[workerID][
                                'eventsRequestParams'] = eventsRequestParams
                        # get PandaIDs for pull model
                        if workSpec.mapType == WorkSpec.MT_NoJob:
                            pandaIDs = messenger.get_panda_ids(workSpec)
                        retMap[workerID]['pandaIDs'] = pandaIDs
                        # keep original new status
                        retMap[workerID]['monStatus'] = newStatus
                        # set running while there are events to update or files to stage out
                        if newStatus in [
                                WorkSpec.ST_finished, WorkSpec.ST_failed,
                                WorkSpec.ST_cancelled
                        ]:
                            if len(retMap[workerID]['filesToStageOut']) > 0 or \
                                            len(retMap[workerID]['eventsToUpdate']) > 0:
                                newStatus = WorkSpec.ST_running
                            elif not workSpec.is_post_processed():
                                if not queue_config.is_no_heartbeat_status(
                                        newStatus):
                                    # post processing unless heartbeat is suppressed
                                    jobSpecs = self.dbProxy.get_jobs_with_worker_id(
                                        workSpec.workerID,
                                        None,
                                        True,
                                        only_running=True)
                                    # post processing
                                    messenger.post_processing(
                                        workSpec, jobSpecs, workSpec.mapType)
                                workSpec.post_processed()
                                newStatus = WorkSpec.ST_running
                            # reset modification time to immediately trigger subsequent lookup
                            workSpec.trigger_next_lookup()
                        retMap[workerID]['newStatus'] = newStatus
                        retMap[workerID]['diagMessage'] = diagMessage
                    else:
                        tmp_log.debug(
                            'workerID={0} not in retMap'.format(workerID))
            return True, retMap
        except Exception:
            core_utils.dump_error_message(tmp_log)
            return False, None
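
The check_workers wrapper above keeps a nominally finished, failed, or cancelled worker in the running state for as long as it still has files to stage out or events to update, and forces one extra pass so post-processing can run. A minimal standalone sketch of that status-adjustment rule (the function name, plain-string statuses, and arguments below are illustrative only, not Harvester API):

FINAL_STATUSES = ('finished', 'failed', 'cancelled')

def adjust_status(reported_status, files_to_stage_out, events_to_update, post_processed):
    """Return the status to record, given what the messenger reported."""
    if reported_status not in FINAL_STATUSES:
        return reported_status
    if files_to_stage_out or events_to_update:
        # outputs still pending: keep the worker running so it is polled again
        return 'running'
    if not post_processed:
        # one more pass so post-processing can be triggered
        return 'running'
    return reported_status

print(adjust_status('finished', ['log.tgz'], [], False))  # -> running
print(adjust_status('finished', [], [], True))            # -> finished
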
Example #10
class Submitter(AgentBase):
    # constructor
    def __init__(self, queue_config_mapper, single_mode=False):
        AgentBase.__init__(self, single_mode)
        self.queueConfigMapper = queue_config_mapper
        self.dbProxy = DBProxy()
        self.workerMaker = WorkerMaker()
        self.workerAdjuster = WorkerAdjuster(queue_config_mapper)
        self.pluginFactory = PluginFactory()
        self.monitor_fifo = MonitorFIFO()
        self.apfmon = Apfmon(self.queueConfigMapper)

    # main loop
    def run(self):
        lockedBy = 'submitter-{0}'.format(self.get_pid())
        monitor_fifo = self.monitor_fifo
        queueLockInterval = getattr(harvester_config.submitter, 'queueLockInterval',
                                    harvester_config.submitter.lockInterval)
        while True:
            sw_main = core_utils.get_stopwatch()
            mainLog = self.make_logger(_logger, 'id={0}'.format(lockedBy), method_name='run')
            mainLog.debug('getting queues to submit workers')

            # get queues associated to a site to submit workers
            curWorkers, siteName, resMap = self.dbProxy.get_queues_to_submit(harvester_config.submitter.nQueues,
                                                                             harvester_config.submitter.lookupTime,
                                                                             harvester_config.submitter.lockInterval,
                                                                             lockedBy, queueLockInterval)
            submitted = False
            if siteName is not None:
                mainLog.debug('got {0} queues for site {1}'.format(len(curWorkers), siteName))

                # get commands
                comStr = '{0}:{1}'.format(CommandSpec.COM_setNWorkers, siteName)
                commandSpecs = self.dbProxy.get_commands_for_receiver('submitter', comStr)
                mainLog.debug('got {0} {1} commands'.format(len(commandSpecs), comStr))
                for commandSpec in commandSpecs:
                    newLimits = self.dbProxy.set_queue_limit(siteName, commandSpec.params)
                    for tmpResource, tmpNewVal in iteritems(newLimits):
                        # if available, overwrite new worker value with the command from panda server
                        if tmpResource in resMap:
                            tmpQueueName = resMap[tmpResource]
                            if tmpQueueName in curWorkers:
                                curWorkers[tmpQueueName][tmpResource]['nNewWorkers'] = tmpNewVal

                # define number of new workers
                if len(curWorkers) == 0:
                    n_workers_per_queue_and_rt = dict()
                else:
                    n_workers_per_queue_and_rt = self.workerAdjuster.define_num_workers(curWorkers, siteName)

                if n_workers_per_queue_and_rt is None:
                    mainLog.error('WorkerAdjuster failed to define the number of workers')
                elif len(n_workers_per_queue_and_rt) == 0:
                    pass
                else:
                    # loop over all queues and resource types
                    for queueName in n_workers_per_queue_and_rt:
                        for resource_type, tmpVal in iteritems(n_workers_per_queue_and_rt[queueName]):

                            tmpLog = self.make_logger(_logger, 'id={0} queue={1} rtype={2}'.format(lockedBy,
                                                                                                   queueName,
                                                                                                   resource_type),
                                                      method_name='run')
                            try:
                                tmpLog.debug('start')
                                tmpLog.debug('workers status: %s' % tmpVal)
                                nWorkers = tmpVal['nNewWorkers'] + tmpVal['nReady']
                                nReady = tmpVal['nReady']

                                # check queue
                                if not self.queueConfigMapper.has_queue(queueName):
                                    tmpLog.error('config not found')
                                    continue

                                # no new workers
                                if nWorkers == 0:
                                    tmpLog.debug('skipped since no new worker is needed based on current stats')
                                    continue
                                # get queue
                                queueConfig = self.queueConfigMapper.get_queue(queueName)
                                workerMakerCore = self.workerMaker.get_plugin(queueConfig)
                                # check if resource is ready
                                if hasattr(workerMakerCore, 'dynamicSizing') and workerMakerCore.dynamicSizing is True:
                                    numReadyResources = self.workerMaker.num_ready_resources(queueConfig,
                                                                                             resource_type,
                                                                                             workerMakerCore)
                                    tmpLog.debug('numReadyResources: %s' % numReadyResources)
                                    if not numReadyResources:
                                        if hasattr(workerMakerCore, 'staticWorkers'):
                                            nQRWorkers = tmpVal['nQueue'] + tmpVal['nRunning']
                                            tmpLog.debug('staticWorkers: %s, nQRWorkers(Queue+Running): %s' %
                                                         (workerMakerCore.staticWorkers, nQRWorkers))
                                            if nQRWorkers >= workerMakerCore.staticWorkers:
                                                tmpLog.debug('no static workers left, skip')
                                                continue
                                            else:
                                                nWorkers = min(workerMakerCore.staticWorkers - nQRWorkers, nWorkers)
                                                tmpLog.debug('staticWorkers: %s, nWorkers: %s' %
                                                             (workerMakerCore.staticWorkers, nWorkers))
                                        else:
                                            tmpLog.debug('skip since no resources are ready')
                                            continue
                                    else:
                                        nWorkers = min(nWorkers, numReadyResources)
                                # post action of worker maker
                                if hasattr(workerMakerCore, 'skipOnFail') and workerMakerCore.skipOnFail is True:
                                    skipOnFail = True
                                else:
                                    skipOnFail = False
                                # actions based on mapping type
                                if queueConfig.mapType == WorkSpec.MT_NoJob:
                                    # workers without jobs
                                    jobChunks = []
                                    for i in range(nWorkers):
                                        jobChunks.append([])
                                elif queueConfig.mapType == WorkSpec.MT_OneToOne:
                                    # one worker per one job
                                    jobChunks = self.dbProxy.get_job_chunks_for_workers(
                                        queueName,
                                        nWorkers, nReady, 1, None,
                                        queueConfig.useJobLateBinding,
                                        harvester_config.submitter.checkInterval,
                                        harvester_config.submitter.lockInterval,
                                        lockedBy)
                                elif queueConfig.mapType == WorkSpec.MT_MultiJobs:
                                    # one worker for multiple jobs
                                    nJobsPerWorker = self.workerMaker.get_num_jobs_per_worker(queueConfig,
                                                                                              nWorkers,
                                                                                              resource_type,
                                                                                              maker=workerMakerCore)
                                    tmpLog.debug('nJobsPerWorker={0}'.format(nJobsPerWorker))
                                    jobChunks = self.dbProxy.get_job_chunks_for_workers(
                                        queueName,
                                        nWorkers, nReady, nJobsPerWorker, None,
                                        queueConfig.useJobLateBinding,
                                        harvester_config.submitter.checkInterval,
                                        harvester_config.submitter.lockInterval,
                                        lockedBy,
                                        queueConfig.allowJobMixture)
                                elif queueConfig.mapType == WorkSpec.MT_MultiWorkers:
                                    # multiple workers for one job
                                    nWorkersPerJob = self.workerMaker.get_num_workers_per_job(queueConfig,
                                                                                              nWorkers,
                                                                                              resource_type,
                                                                                              maker=workerMakerCore)
                                    maxWorkersPerJob = self.workerMaker.get_max_workers_per_job_in_total(
                                        queueConfig, resource_type, maker=workerMakerCore)
                                    maxWorkersPerJobPerCycle = self.workerMaker.get_max_workers_per_job_per_cycle(
                                        queueConfig, resource_type, maker=workerMakerCore)
                                    tmpLog.debug('nWorkersPerJob={0}'.format(nWorkersPerJob))
                                    jobChunks = self.dbProxy.get_job_chunks_for_workers(
                                        queueName,
                                        nWorkers, nReady, None, nWorkersPerJob,
                                        queueConfig.useJobLateBinding,
                                        harvester_config.submitter.checkInterval,
                                        harvester_config.submitter.lockInterval,
                                        lockedBy, max_workers_per_job_in_total=maxWorkersPerJob,
                                        max_workers_per_job_per_cycle=maxWorkersPerJobPerCycle)
                                else:
                                    tmpLog.error('unknown mapType={0}'.format(queueConfig.mapType))
                                    continue

                                tmpLog.debug('got {0} job chunks'.format(len(jobChunks)))
                                if len(jobChunks) == 0:
                                    continue
                                # make workers
                                okChunks, ngChunks = self.workerMaker.make_workers(jobChunks, queueConfig,
                                                                                   nReady, resource_type,
                                                                                   maker=workerMakerCore)
                                if len(ngChunks) == 0:
                                    tmpLog.debug('successfully made {0} workers'.format(len(okChunks)))
                                else:
                                    tmpLog.debug('made {0} workers, while {1} workers failed'.format(len(okChunks),
                                                                                                     len(ngChunks)))
                                timeNow = datetime.datetime.utcnow()
                                timeNow_timestamp = time.time()
                                pandaIDs = set()
                                # NG (=not good)
                                for ngJobs in ngChunks:
                                    for jobSpec in ngJobs:
                                        if skipOnFail:
                                            # release jobs when workers are not made
                                            pandaIDs.add(jobSpec.PandaID)
                                        else:
                                            jobSpec.status = 'failed'
                                            jobSpec.subStatus = 'failed_to_make'
                                            jobSpec.stateChangeTime = timeNow
                                            jobSpec.lockedBy = None
                                            errStr = 'failed to make a worker'
                                            jobSpec.set_pilot_error(PilotErrors.ERR_SETUPFAILURE, errStr)
                                            jobSpec.trigger_propagation()
                                            self.dbProxy.update_job(jobSpec, {'lockedBy': lockedBy,
                                                                              'subStatus': 'prepared'})
                                # OK
                                workSpecList = []
                                if len(okChunks) > 0:
                                    for workSpec, okJobs in okChunks:
                                        # has job
                                        if (queueConfig.useJobLateBinding and workSpec.workerID is None) \
                                                or queueConfig.mapType == WorkSpec.MT_NoJob:
                                            workSpec.hasJob = 0
                                        else:
                                            workSpec.hasJob = 1
                                            if workSpec.nJobsToReFill in [None, 0]:
                                                workSpec.set_jobspec_list(okJobs)
                                            else:
                                                # refill free slots while the worker is running
                                                workSpec.set_jobspec_list(okJobs[:workSpec.nJobsToReFill])
                                                # release the jobs that are not fed to this worker
                                                for jobSpec in okJobs[workSpec.nJobsToReFill:]:
                                                    pandaIDs.add(jobSpec.PandaID)
                                                workSpec.nJobsToReFill = None
                                            workSpec.set_num_jobs_with_list()
                                        # map type
                                        workSpec.mapType = queueConfig.mapType
                                        # queue name
                                        workSpec.computingSite = queueConfig.queueName
                                        # set access point
                                        workSpec.accessPoint = queueConfig.messenger['accessPoint']
                                        # sync level
                                        workSpec.syncLevel = queueConfig.get_synchronization_level()
                                        # events
                                        if len(okJobs) > 0 and \
                                                ('eventService' in okJobs[0].jobParams or
                                                 'cloneJob' in okJobs[0].jobParams):
                                            workSpec.eventsRequest = WorkSpec.EV_useEvents
                                        workSpecList.append(workSpec)
                                if len(workSpecList) > 0:
                                    sw = core_utils.get_stopwatch()
                                    # get plugin for submitter
                                    submitterCore = self.pluginFactory.get_plugin(queueConfig.submitter)
                                    if submitterCore is None:
                                        # not found
                                        tmpLog.error(
                                            'submitter plugin for {0} not found'.format(queueConfig.queueName))
                                        continue
                                    # get plugin for messenger
                                    messenger = self.pluginFactory.get_plugin(queueConfig.messenger)
                                    if messenger is None:
                                        # not found
                                        tmpLog.error(
                                            'messenger plugin for {0} not found'.format(queueConfig.queueName))
                                        continue
                                    # setup access points
                                    messenger.setup_access_points(workSpecList)
                                    # feed jobs
                                    for workSpec in workSpecList:
                                        if workSpec.hasJob == 1:
                                            tmpStat = messenger.feed_jobs(workSpec, workSpec.get_jobspec_list())
                                            if tmpStat is False:
                                                tmpLog.error(
                                                    'failed to send jobs to workerID={0}'.format(workSpec.workerID))
                                            else:
                                                tmpLog.debug(
                                                    'sent jobs to workerID={0} with {1}'.format(workSpec.workerID,
                                                                                                tmpStat))
                                    # insert workers
                                    self.dbProxy.insert_workers(workSpecList, lockedBy)
                                    # submit
                                    sw.reset()
                                    tmpLog.info('submitting {0} workers'.format(len(workSpecList)))
                                    workSpecList, tmpRetList, tmpStrList = self.submit_workers(submitterCore,
                                                                                               workSpecList)
                                    tmpLog.debug('done submitting {0} workers'.format(len(workSpecList))
                                                    + sw.get_elapsed_time())
                                    # collect successful jobs
                                    okPandaIDs = set()
                                    for iWorker, (tmpRet, tmpStr) in enumerate(zip(tmpRetList, tmpStrList)):
                                        if tmpRet:
                                            workSpec, jobList = okChunks[iWorker]
                                            jobList = workSpec.get_jobspec_list()
                                            if jobList is not None:
                                                for jobSpec in jobList:
                                                    okPandaIDs.add(jobSpec.PandaID)
                                    # loop over all workers
                                    for iWorker, (tmpRet, tmpStr) in enumerate(zip(tmpRetList, tmpStrList)):
                                        workSpec, jobList = okChunks[iWorker]
                                        # set harvesterHost
                                        workSpec.harvesterHost = socket.gethostname()
                                        # use associated job list since it can be truncated for re-filling
                                        jobList = workSpec.get_jobspec_list()
                                        # set status
                                        if not tmpRet:
                                            # failed submission
                                            errStr = 'failed to submit a workerID={0} with {1}'.format(
                                                workSpec.workerID,
                                                tmpStr)
                                            tmpLog.error(errStr)
                                            workSpec.set_status(WorkSpec.ST_missed)
                                            workSpec.set_dialog_message(tmpStr)
                                            workSpec.set_pilot_error(PilotErrors.ERR_SETUPFAILURE, errStr)
                                            if jobList is not None:
                                                # increment attempt number
                                                newJobList = []
                                                for jobSpec in jobList:
                                                    # skip if successful with another worker
                                                    if jobSpec.PandaID in okPandaIDs:
                                                        continue
                                                    if jobSpec.submissionAttempts is None:
                                                        jobSpec.submissionAttempts = 0
                                                    jobSpec.submissionAttempts += 1
                                                    # max attempt or permanent error
                                                    if tmpRet is False or \
                                                            jobSpec.submissionAttempts >= \
                                                            queueConfig.maxSubmissionAttempts:
                                                        newJobList.append(jobSpec)
                                                    else:
                                                        self.dbProxy.increment_submission_attempt(
                                                            jobSpec.PandaID,
                                                            jobSpec.submissionAttempts)
                                                jobList = newJobList
                                        elif queueConfig.useJobLateBinding and workSpec.hasJob == 1:
                                            # directly go to running after feeding jobs for late binding
                                            workSpec.set_status(WorkSpec.ST_running)
                                        else:
                                            # normal successful submission
                                            workSpec.set_status(WorkSpec.ST_submitted)
                                        workSpec.submitTime = timeNow
                                        workSpec.modificationTime = timeNow
                                        workSpec.checkTime = timeNow
                                        if self.monitor_fifo.enabled:
                                            workSpec.set_work_params({'lastCheckAt': timeNow_timestamp})
                                        # prefetch events
                                        if tmpRet and workSpec.hasJob == 1 and \
                                                workSpec.eventsRequest == WorkSpec.EV_useEvents and \
                                                queueConfig.prefetchEvents:
                                            workSpec.eventsRequest = WorkSpec.EV_requestEvents
                                            eventsRequestParams = dict()
                                            for jobSpec in jobList:
                                                eventsRequestParams[jobSpec.PandaID] = \
                                                    {'pandaID': jobSpec.PandaID,
                                                     'taskID': jobSpec.taskID,
                                                     'jobsetID': jobSpec.jobParams['jobsetID'],
                                                     'nRanges': max(int(math.ceil(workSpec.nCore / len(jobList))),
                                                                    jobSpec.jobParams['coreCount']),
                                                     }
                                            workSpec.eventsRequestParams = eventsRequestParams
                                        # register worker
                                        tmpStat = self.dbProxy.register_worker(workSpec, jobList, lockedBy)
                                        if jobList is not None:
                                            for jobSpec in jobList:
                                                pandaIDs.add(jobSpec.PandaID)
                                                if tmpStat:
                                                    if tmpRet:
                                                        tmpStr = \
                                                            'submitted a workerID={0} for PandaID={1} with batchID={2}'
                                                        tmpLog.info(tmpStr.format(workSpec.workerID,
                                                                                  jobSpec.PandaID,
                                                                                  workSpec.batchID))
                                                    else:
                                                        tmpStr = 'failed to submit a workerID={0} for PandaID={1}'
                                                        tmpLog.error(tmpStr.format(workSpec.workerID,
                                                                                   jobSpec.PandaID))
                                                else:
                                                    tmpStr = \
                                                        'failed to register a worker for PandaID={0} with batchID={1}'
                                                    tmpLog.error(tmpStr.format(jobSpec.PandaID, workSpec.batchID))
                                    # enqueue to monitor fifo
                                    if self.monitor_fifo.enabled \
                                            and queueConfig.mapType != WorkSpec.MT_MultiWorkers:
                                        workSpecsToEnqueue = \
                                            [[w] for w in workSpecList if w.status
                                             in (WorkSpec.ST_submitted, WorkSpec.ST_running)]
                                        check_delay = min(
                                                        getattr(harvester_config.monitor, 'eventBasedCheckInterval',
                                                                harvester_config.monitor.checkInterval),
                                                        getattr(harvester_config.monitor, 'fifoCheckInterval',
                                                                harvester_config.monitor.checkInterval))
                                        monitor_fifo.put((queueName, workSpecsToEnqueue), time.time() + check_delay)
                                        mainLog.debug('put workers to monitor FIFO')
                                    submitted = True
                                # release jobs
                                self.dbProxy.release_jobs(pandaIDs, lockedBy)
                                tmpLog.info('done')
                            except Exception:
                                core_utils.dump_error_message(tmpLog)
                # release the site
                self.dbProxy.release_site(siteName, lockedBy)
                if sw_main.get_elapsed_time_in_sec() > queueLockInterval:
                    mainLog.warning('a submitter cycle was longer than queueLockInterval {0} sec'.format(queueLockInterval)
                                    + sw_main.get_elapsed_time())
            mainLog.debug('done')
            # define sleep interval
            if siteName is None:
                sleepTime = harvester_config.submitter.sleepTime
            else:
                sleepTime = 0
                if submitted and hasattr(harvester_config.submitter, 'minSubmissionInterval'):
                    interval = harvester_config.submitter.minSubmissionInterval
                    if interval > 0:
                        newTime = datetime.datetime.utcnow() + datetime.timedelta(seconds=interval)
                        self.dbProxy.update_panda_queue_attribute('submitTime', newTime, site_name=siteName)

            # time the cycle
            mainLog.debug('done a submitter cycle' + sw_main.get_elapsed_time())
            # check if being terminated
            if self.terminated(sleepTime):
                mainLog.debug('terminated')
                return

    # wrapper for submitWorkers to skip ready workers
    def submit_workers(self, submitter_core, workspec_list):
        retList = []
        strList = []
        newSpecList = []
        workersToSubmit = []
        for workSpec in workspec_list:
            if workSpec.status in [WorkSpec.ST_ready, WorkSpec.ST_running]:
                newSpecList.append(workSpec)
                retList.append(True)
                strList.append('')
            else:
                workersToSubmit.append(workSpec)
        tmpRetList = submitter_core.submit_workers(workersToSubmit)

        # submit the workers to the monitoring
        self.apfmon.create_workers(workersToSubmit)

        for tmpRet, tmpStr in tmpRetList:
            retList.append(tmpRet)
            strList.append(tmpStr)
        newSpecList += workersToSubmit
        return newSpecList, retList, strList
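
In Submitter.run above, the mapType branches decide how jobs are grouped into per-worker chunks before make_workers is called: MT_NoJob yields empty chunks, MT_OneToOne one job per worker, and MT_MultiJobs several jobs per worker (the real chunks come from DBProxy.get_job_chunks_for_workers, which also handles locking and late binding). A minimal, self-contained sketch of that grouping, using plain strings and a hypothetical helper for illustration only:

def chunk_jobs(map_type, jobs, n_workers, n_jobs_per_worker=1):
    """Group jobs into per-worker chunks according to the mapping type."""
    if map_type == 'NoJob':
        # workers without jobs: one empty chunk per worker
        return [[] for _ in range(n_workers)]
    if map_type == 'OneToOne':
        # one job per worker
        return [[job] for job in jobs[:n_workers]]
    if map_type == 'MultiJobs':
        # one worker takes up to n_jobs_per_worker jobs
        chunks = [jobs[i:i + n_jobs_per_worker]
                  for i in range(0, len(jobs), n_jobs_per_worker)]
        return chunks[:n_workers]
    raise ValueError('unknown mapType={0}'.format(map_type))

print(chunk_jobs('OneToOne', ['j1', 'j2', 'j3'], 2))      # [['j1'], ['j2']]
print(chunk_jobs('MultiJobs', ['j1', 'j2', 'j3'], 2, 2))  # [['j1', 'j2'], ['j3']]
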
Example #11
class Sweeper(AgentBase):
    # constructor
    def __init__(self, queue_config_mapper, single_mode=False):
        AgentBase.__init__(self, single_mode)
        self.dbProxy = DBProxy()
        self.queueConfigMapper = queue_config_mapper
        self.pluginFactory = PluginFactory()

    # main loop
    def run(self):
        lockedBy = 'sweeper-{0}'.format(self.get_pid())
        while True:
            sw_main = core_utils.get_stopwatch()
            mainLog = self.make_logger(_logger,
                                       'id={0}'.format(lockedBy),
                                       method_name='run')
            # get commands to kill
            sw_getcomm = core_utils.get_stopwatch()
            mainLog.debug('try to get commands')
            comStr = CommandSpec.COM_killWorkers
            commandSpecs = self.dbProxy.get_commands_for_receiver(
                'sweeper', comStr)
            mainLog.debug('got {0} {1} commands'.format(
                len(commandSpecs), comStr))
            for commandSpec in commandSpecs:
                n_to_kill = self.dbProxy.kill_workers_by_query(
                    commandSpec.params)
                mainLog.debug('will kill {0} workers with {1}'.format(
                    n_to_kill, commandSpec.params))
            mainLog.debug('done handling commands' +
                          sw_getcomm.get_elapsed_time())
            # killing stage
            sw_kill = core_utils.get_stopwatch()
            mainLog.debug('try to get workers to kill')
            # get workers to kill
            workersToKill = self.dbProxy.get_workers_to_kill(
                harvester_config.sweeper.maxWorkers,
                harvester_config.sweeper.checkInterval)
            mainLog.debug('got {0} queues to kill workers'.format(
                len(workersToKill)))
            # loop over all workers
            sw = core_utils.get_stopwatch()
            for queueName, configIdWorkSpecList in iteritems(workersToKill):
                for configID, workspec_list in iteritems(configIdWorkSpecList):
                    # get sweeper
                    if not self.queueConfigMapper.has_queue(
                            queueName, configID):
                        mainLog.error(
                            'queue config for {0}/{1} not found'.format(
                                queueName, configID))
                        continue
                    queueConfig = self.queueConfigMapper.get_queue(
                        queueName, configID)
                    try:
                        sweeperCore = self.pluginFactory.get_plugin(
                            queueConfig.sweeper)
                    except Exception:
                        mainLog.error(
                            'failed to launch sweeper plugin for {0}/{1}'.
                            format(queueName, configID))
                        core_utils.dump_error_message(mainLog)
                        continue
                    sw.reset()
                    n_workers = len(workspec_list)
                    try:
                        # try bulk method
                        tmpLog = self.make_logger(_logger,
                                                  'id={0}'.format(lockedBy),
                                                  method_name='run')
                        tmpLog.debug('start killing')
                        tmpList = sweeperCore.kill_workers(workspec_list)
                    except AttributeError:
                        # fall back to single-worker method
                        for workspec in workspec_list:
                            tmpLog = self.make_logger(_logger,
                                                      'workerID={0}'.format(
                                                          workspec.workerID),
                                                      method_name='run')
                            try:
                                tmpLog.debug('start killing one worker')
                                tmpStat, tmpOut = sweeperCore.kill_worker(
                                    workspec)
                                tmpLog.debug(
                                    'done killing with status={0} diag={1}'.
                                    format(tmpStat, tmpOut))
                            except Exception:
                                core_utils.dump_error_message(tmpLog)
                    except Exception:
                        core_utils.dump_error_message(mainLog)
                    else:
                        # bulk method
                        n_killed = 0
                        for workspec, (tmpStat,
                                       tmpOut) in zip(workspec_list, tmpList):
                            tmpLog.debug(
                                'done killing workerID={0} with status={1} diag={2}'
                                .format(workspec.workerID, tmpStat, tmpOut))
                            if tmpStat:
                                n_killed += 1
                        tmpLog.debug('killed {0}/{1} workers'.format(
                            n_killed, n_workers))
                    mainLog.debug(
                        'done killing {0} workers'.format(n_workers) +
                        sw.get_elapsed_time())
            mainLog.debug('done all killing' + sw_kill.get_elapsed_time())
            # cleanup stage
            sw_cleanup = core_utils.get_stopwatch()
            # timeout for missed
            try:
                keepMissed = harvester_config.sweeper.keepMissed
            except Exception:
                keepMissed = 24
            try:
                keepPending = harvester_config.sweeper.keepPending
            except Exception:
                keepPending = 24
            # get workers for cleanup
            statusTimeoutMap = {
                'finished': harvester_config.sweeper.keepFinished,
                'failed': harvester_config.sweeper.keepFailed,
                'cancelled': harvester_config.sweeper.keepCancelled,
                'missed': keepMissed,
                'pending': keepPending
            }
            workersForCleanup = self.dbProxy.get_workers_for_cleanup(
                harvester_config.sweeper.maxWorkers, statusTimeoutMap)
            mainLog.debug('got {0} queues for workers cleanup'.format(
                len(workersForCleanup)))
            sw = core_utils.get_stopwatch()
            for queueName, configIdWorkSpecList in iteritems(
                    workersForCleanup):
                for configID, workspec_list in iteritems(configIdWorkSpecList):
                    # get sweeper
                    if not self.queueConfigMapper.has_queue(
                            queueName, configID):
                        mainLog.error(
                            'queue config for {0}/{1} not found'.format(
                                queueName, configID))
                        continue
                    queueConfig = self.queueConfigMapper.get_queue(
                        queueName, configID)
                    sweeperCore = self.pluginFactory.get_plugin(
                        queueConfig.sweeper)
                    messenger = self.pluginFactory.get_plugin(
                        queueConfig.messenger)
                    sw.reset()
                    n_workers = len(workspec_list)
                    # make sure workers to clean up are all terminated
                    mainLog.debug(
                        'making sure workers to clean up are all terminated')
                    try:
                        # try bulk method
                        tmpList = sweeperCore.kill_workers(workspec_list)
                    except AttributeError:
                        # fall back to single-worker method
                        for workspec in workspec_list:
                            tmpLog = self.make_logger(_logger,
                                                      'workerID={0}'.format(
                                                          workspec.workerID),
                                                      method_name='run')
                            try:
                                tmpStat, tmpOut = sweeperCore.kill_worker(
                                    workspec)
                            except Exception:
                                core_utils.dump_error_message(tmpLog)
                    except Exception:
                        core_utils.dump_error_message(mainLog)
                    mainLog.debug(
                        'made sure workers to clean up are all terminated')
                    # start cleanup
                    for workspec in workspec_list:
                        tmpLog = self.make_logger(_logger,
                                                  'workerID={0}'.format(
                                                      workspec.workerID),
                                                  method_name='run')
                        try:
                            tmpLog.debug('start cleaning up one worker')
                            # sweep worker
                            tmpStat, tmpOut = sweeperCore.sweep_worker(
                                workspec)
                            tmpLog.debug(
                                'swept_worker with status={0} diag={1}'.format(
                                    tmpStat, tmpOut))
                            tmpLog.debug('start messenger cleanup')
                            mc_tmpStat, mc_tmpOut = messenger.clean_up(
                                workspec)
                            tmpLog.debug(
                                'messenger cleaned up with status={0} diag={1}'
                                .format(mc_tmpStat, mc_tmpOut))
                            if tmpStat:
                                self.dbProxy.delete_worker(workspec.workerID)
                        except Exception:
                            core_utils.dump_error_message(tmpLog)
                    mainLog.debug(
                        'done cleaning up {0} workers'.format(n_workers) +
                        sw.get_elapsed_time())
            mainLog.debug('done all cleanup' + sw_cleanup.get_elapsed_time())
            # old-job-deletion stage
            sw_delete = core_utils.get_stopwatch()
            mainLog.debug('delete old jobs')
            jobTimeout = max(statusTimeoutMap.values()) + 1
            self.dbProxy.delete_old_jobs(jobTimeout)
            # delete orphaned job info
            self.dbProxy.delete_orphaned_job_info()
            mainLog.debug('done deletion of old jobs' +
                          sw_delete.get_elapsed_time())
            # disk cleanup
            if hasattr(harvester_config.sweeper, 'diskCleanUpInterval') and \
                    hasattr(harvester_config.sweeper, 'diskHighWatermark'):
                locked = self.dbProxy.get_process_lock(
                    'sweeper', self.get_pid(),
                    harvester_config.sweeper.diskCleanUpInterval * 60 * 60)
                if locked:
                    try:
                        all_active_files = None
                        for item in harvester_config.sweeper.diskHighWatermark.split(','):
                            # dir name and watermark in GB
                            dir_name, watermark = item.split('|')
                            mainLog.debug(
                                'checking {0} for cleanup with watermark {1} GB'
                                .format(dir_name, watermark))
                            watermark = int(watermark) * 10**9
                            total_size = 0
                            file_dict = {}
                            # scan dir
                            for root, dirs, filenames in walk(dir_name):
                                for base_name in filenames:
                                    full_name = os.path.join(root, base_name)
                                    f_size = os.path.getsize(full_name)
                                    total_size += f_size
                                    mtime = os.path.getmtime(full_name)
                                    file_dict.setdefault(mtime, set())
                                    file_dict[mtime].add(
                                        (base_name, full_name, f_size))
                            # delete if necessary
                            if total_size < watermark:
                                mainLog.debug(
                                    'skip cleanup {0} due to total_size {1} GB < watermark {2} GB'
                                    .format(dir_name, total_size // (10**9),
                                            watermark // (10**9)))
                            else:
                                mainLog.debug(
                                    'cleanup {0} due to total_size {1} GB >= watermark {2} GB'
                                    .format(dir_name, total_size // (10**9),
                                            watermark // (10**9)))
                                # get active input files
                                if all_active_files is None:
                                    all_active_files = self.dbProxy.get_all_active_input_files()
                                deleted_size = 0
                                mtimes = sorted(file_dict.keys())
                                for mtime in mtimes:
                                    for base_name, full_name, f_size in file_dict[mtime]:
                                        # keep if active
                                        if base_name in all_active_files:
                                            continue
                                        try:
                                            os.remove(full_name)
                                        except Exception:
                                            core_utils.dump_error_message(
                                                mainLog)
                                        deleted_size += f_size
                                        if total_size - deleted_size < watermark:
                                            break
                                    if total_size - deleted_size < watermark:
                                        break
                    except Exception:
                        core_utils.dump_error_message(mainLog)
            # time the cycle
            mainLog.debug('done a sweeper cycle' + sw_main.get_elapsed_time())
            # check if being terminated
            if self.terminated(harvester_config.sweeper.sleepTime):
                mainLog.debug('terminated')
                return
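
The disk-cleanup branch of the Sweeper above deletes the oldest files first, skips files that are still active inputs, and stops as soon as the directory drops below the configured watermark. A small standalone sketch of that policy with made-up file names and sizes (the real code walks the directory tree and calls os.remove):

def plan_cleanup(files, active, watermark):
    """files: iterable of (mtime, name, size); return names to delete, oldest first."""
    total = sum(size for _, _, size in files)
    to_delete = []
    for _, name, size in sorted(files):      # oldest mtime first
        if total < watermark:
            break
        if name in active:                   # never delete active input files
            continue
        to_delete.append(name)
        total -= size
    return to_delete

files = [(1, 'old.root', 60), (2, 'active.root', 50), (3, 'new.root', 40)]
print(plan_cleanup(files, active={'active.root'}, watermark=100))  # ['old.root']
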
Example #12
class WorkerAdjuster:
    # constructor
    def __init__(self, queue_config_mapper):
        self.queueConfigMapper = queue_config_mapper
        self.pluginFactory = PluginFactory()
        self.dbProxy = DBProxy()
        self.throttlerMap = dict()

    # define number of workers to submit based on various information
    def define_num_workers(self, static_num_workers, site_name):
        tmpLog = core_utils.make_logger(_logger,
                                        'site={0}'.format(site_name),
                                        method_name='define_num_workers')
        tmpLog.debug('start')
        dyn_num_workers = copy.copy(static_num_workers)
        try:
            # get queue status
            queueStat = self.dbProxy.get_cache("panda_queues.json", None)
            if queueStat is None:
                queueStat = dict()
            else:
                queueStat = queueStat.data
            # define num of new workers
            for queueName, tmpVal in iteritems(static_num_workers):
                # set 0 to num of new workers when the queue is disabled
                if queueName in queueStat and queueStat[queueName][
                        'status'] in ['offline']:
                    dyn_num_workers[queueName]['nNewWorkers'] = 0
                    retMsg = 'set nNewWorkers=0 since status={0}'.format(
                        queueStat[queueName]['status'])
                    tmpLog.debug(retMsg)
                    continue
                # get queue
                queueConfig = self.queueConfigMapper.get_queue(queueName)
                # get throttler
                if queueName not in self.throttlerMap:
                    if hasattr(queueConfig, 'throttler'):
                        throttler = self.pluginFactory.get_plugin(
                            queueConfig.throttler)
                    else:
                        throttler = None
                    self.throttlerMap[queueName] = throttler
                # check throttler
                throttler = self.throttlerMap[queueName]
                if throttler is not None:
                    toThrottle, tmpMsg = throttler.to_be_throttled(queueConfig)
                    if toThrottle:
                        dyn_num_workers[queueName]['nNewWorkers'] = 0
                        retMsg = 'set nNewWorkers=0 by {0}:{1}'.format(
                            throttler.__class__.__name__, tmpMsg)
                        tmpLog.debug(retMsg)
                        continue
                # check stats
                nQueue = tmpVal['nQueue']
                nReady = tmpVal['nReady']
                nRunning = tmpVal['nRunning']
                nQueueLimit = queueConfig.nQueueLimitWorker
                maxWorkers = queueConfig.maxWorkers
                if queueConfig.runMode == 'slave':
                    nNewWorkersDef = tmpVal['nNewWorkers']
                    if nNewWorkersDef == 0:
                        dyn_num_workers[queueName]['nNewWorkers'] = 0
                        retMsg = 'set nNewWorkers=0 by panda in slave mode'
                        tmpLog.debug(retMsg)
                        continue
                else:
                    nNewWorkersDef = None
                # define num of new workers based on static site config
                nNewWorkers = 0
                if nQueueLimit > 0 and nQueue >= nQueueLimit:
                    # enough queued workers
                    retMsg = 'No nNewWorkers since nQueue({0})>=nQueueLimit({1})'.format(
                        nQueue, nQueueLimit)
                    tmpLog.debug(retMsg)
                    pass
                elif maxWorkers > 0 and (nQueue + nReady +
                                         nRunning) >= maxWorkers:
                    # enough workers in the system
                    retMsg = 'No nNewWorkers since nQueue({0}) + nReady({1}) + nRunning({2}) '.format(
                        nQueue, nReady, nRunning)
                    retMsg += '>= maxWorkers({0})'.format(maxWorkers)
                    tmpLog.debug(retMsg)
                    pass
                else:
                    # get max number of queued workers
                    maxQueuedWorkers = 0
                    if nQueueLimit > 0:
                        maxQueuedWorkers = nQueueLimit
                    if maxQueuedWorkers == 0:
                        if nNewWorkersDef is not None:
                            # slave mode
                            maxQueuedWorkers = nNewWorkersDef + nQueue
                        else:
                            # use default value
                            maxQueuedWorkers = 1
                    # new workers
                    nNewWorkers = max(maxQueuedWorkers - nQueue, 0)
                    if maxWorkers > 0:
                        nNewWorkers = min(
                            nNewWorkers,
                            max(maxWorkers - nQueue - nReady - nRunning, 0))
                if queueConfig.maxNewWorkersPerCycle > 0:
                    nNewWorkers = min(nNewWorkers,
                                      queueConfig.maxNewWorkersPerCycle)
                dyn_num_workers[queueName]['nNewWorkers'] = nNewWorkers
            # dump
            tmpLog.debug('defined {0}'.format(str(dyn_num_workers)))
            return dyn_num_workers
        except Exception:
            # dump error
            errMsg = core_utils.dump_error_message(tmpLog)
            return None
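
For reference, the sizing rule applied in define_num_workers above boils down to a few comparisons; the standalone sketch below (a hypothetical helper, not part of harvester) reproduces that arithmetic with plain integers.

# Hypothetical helper mirroring the define_num_workers arithmetic above (illustrative only).
def sketch_new_workers(n_queue, n_ready, n_running,
                       n_queue_limit, max_workers, max_per_cycle,
                       n_new_workers_def=None):
    # enough queued workers already
    if n_queue_limit > 0 and n_queue >= n_queue_limit:
        return 0
    # enough workers in the system overall
    if max_workers > 0 and (n_queue + n_ready + n_running) >= max_workers:
        return 0
    # target number of queued workers
    if n_queue_limit > 0:
        max_queued = n_queue_limit
    elif n_new_workers_def is not None:
        # slave mode: panda-defined number on top of what is queued
        max_queued = n_new_workers_def + n_queue
    else:
        # default value
        max_queued = 1
    n_new = max(max_queued - n_queue, 0)
    if max_workers > 0:
        n_new = min(n_new, max(max_workers - n_queue - n_ready - n_running, 0))
    if max_per_cycle > 0:
        n_new = min(n_new, max_per_cycle)
    return n_new

# e.g. 5 queued, 20 running, nQueueLimit=10, maxWorkers=50, maxNewWorkersPerCycle=8 -> 5
print(sketch_new_workers(5, 0, 20, 10, 50, 8))
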
Example #13
class CredManager(AgentBase):

    # constructor
    def __init__(self, single_mode=False):
        AgentBase.__init__(self, single_mode)
        self.pluginFactory = PluginFactory()
        self.dbProxy = DBProxy()
        # get module and class names
        moduleNames = self.get_list(harvester_config.credmanager.moduleName)
        classNames = self.get_list(harvester_config.credmanager.className)
        # file names of original certificates
        if hasattr(harvester_config.credmanager, 'inCertFile'):
            inCertFiles = self.get_list(
                harvester_config.credmanager.inCertFile)
        else:
            inCertFiles = self.get_list(harvester_config.credmanager.certFile)
        # file names of certificates to be generated
        if hasattr(harvester_config.credmanager, 'outCertFile'):
            outCertFiles = self.get_list(
                harvester_config.credmanager.outCertFile)
        else:
            # use the file name of the certificate for panda connection as output name
            outCertFiles = self.get_list(harvester_config.pandacon.cert_file)
        # VOMS
        vomses = self.get_list(harvester_config.credmanager.voms)
        # get plugin
        self.exeCores = []
        for moduleName, className, inCertFile, outCertFile, voms in \
                zip(moduleNames, classNames, inCertFiles, outCertFiles, vomses):
            pluginPar = {}
            pluginPar['module'] = moduleName
            pluginPar['name'] = className
            pluginPar['inCertFile'] = inCertFile
            pluginPar['outCertFile'] = outCertFile
            pluginPar['voms'] = voms
            exeCore = self.pluginFactory.get_plugin(pluginPar)
            self.exeCores.append(exeCore)

    # get list
    def get_list(self, data):
        if isinstance(data, list):
            return data
        else:
            return [data]

    # main loop
    def run(self):
        while True:
            # execute
            self.execute()
            # check if being terminated
            if self.terminated(harvester_config.credmanager.sleepTime,
                               randomize=False):
                return

    # main
    def execute(self):
        # get lock
        locked = self.dbProxy.get_process_lock(
            'credmanager', self.get_pid(),
            harvester_config.credmanager.sleepTime)
        if not locked:
            return
        # loop over all plugins
        for exeCore in self.exeCores:
            # do nothing
            if exeCore is None:
                continue
            # make logger
            mainLog = core_utils.make_logger(_logger,
                                             "{0} {1}".format(
                                                 exeCore.__class__.__name__,
                                                 exeCore.outCertFile),
                                             method_name='execute')
            # check credential
            mainLog.debug('check credential')
            isValid = exeCore.check_credential()
            if isValid:
                mainLog.debug('valid')
            else:
                # renew it if necessary
                mainLog.debug('invalid')
                mainLog.debug('renew credential')
                tmpStat, tmpOut = exeCore.renew_credential()
                if not tmpStat:
                    mainLog.error('failed : {0}'.format(tmpOut))
                    continue
            mainLog.debug('done')
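
The execute() method above reduces to a check-then-renew pass over the configured plugins; here is a minimal self-contained sketch of that pattern with a dummy plugin standing in for the real credential managers (all names below are illustrative, not harvester API).

# Dummy plugin exposing the two methods execute() relies on (illustrative only).
class DummyCredPlugin(object):
    outCertFile = '/tmp/dummy.proxy'  # hypothetical output path

    def check_credential(self):
        # pretend the credential has expired
        return False

    def renew_credential(self):
        # real plugins return a (status, message) tuple
        return True, ''


def run_one_cycle(plugins):
    for core in plugins:
        if core is None:
            continue
        if core.check_credential():
            continue  # still valid, nothing to do
        ok, msg = core.renew_credential()
        if not ok:
            print('renewal failed for {0}: {1}'.format(core.outCertFile, msg))


run_one_cycle([DummyCredPlugin()])
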
Example #14
class Submitter(AgentBase):
    # constructor
    def __init__(self, queue_config_mapper, single_mode=False):
        AgentBase.__init__(self, single_mode)
        self.queueConfigMapper = queue_config_mapper
        self.dbProxy = DBProxy()
        self.workerMaker = WorkerMaker()
        self.workerAdjuster = WorkerAdjuster(queue_config_mapper)
        self.pluginFactory = PluginFactory()
        self.monitor_fifo = MonitorFIFO()
        self.apfmon = Apfmon(self.queueConfigMapper)

    # main loop
    def run(self):
        lockedBy = 'submitter-{0}'.format(self.get_pid())
        monitor_fifo = self.monitor_fifo
        while True:
            sw_main = core_utils.get_stopwatch()
            mainLog = self.make_logger(_logger,
                                       'id={0}'.format(lockedBy),
                                       method_name='run')
            mainLog.debug('getting queues to submit workers')

            # get queues associated to a site to submit workers
            curWorkers, siteName, resMap = self.dbProxy.get_queues_to_submit(
                harvester_config.submitter.nQueues,
                harvester_config.submitter.lookupTime,
                harvester_config.submitter.lockInterval)
            submitted = False
            if siteName is not None:
                mainLog.debug('got {0} queues for site {1}'.format(
                    len(curWorkers), siteName))

                # get commands
                comStr = '{0}:{1}'.format(CommandSpec.COM_setNWorkers,
                                          siteName)
                commandSpecs = self.dbProxy.get_commands_for_receiver(
                    'submitter', comStr)
                mainLog.debug('got {0} {1} commands'.format(
                    len(commandSpecs), comStr))
                for commandSpec in commandSpecs:
                    newLimits = self.dbProxy.set_queue_limit(
                        siteName, commandSpec.params)
                    for tmpResource, tmpNewVal in iteritems(newLimits):
                        # if available, overwrite new worker value with the command from panda server
                        if tmpResource in resMap:
                            tmpQueueName = resMap[tmpResource]
                            if tmpQueueName in curWorkers:
                                curWorkers[tmpQueueName][tmpResource][
                                    'nNewWorkers'] = tmpNewVal

                # define number of new workers
                if len(curWorkers) == 0:
                    n_workers_per_queue_and_rt = dict()
                else:
                    n_workers_per_queue_and_rt = self.workerAdjuster.define_num_workers(
                        curWorkers, siteName)

                if n_workers_per_queue_and_rt is None:
                    mainLog.error(
                        'WorkerAdjuster failed to define the number of workers'
                    )
                elif len(n_workers_per_queue_and_rt) == 0:
                    pass
                else:
                    # loop over all queues and resource types
                    for queueName in n_workers_per_queue_and_rt:
                        for resource_type, tmpVal in iteritems(
                                n_workers_per_queue_and_rt[queueName]):

                            tmpLog = self.make_logger(
                                _logger,
                                'id={0} queue={1} rtype={2}'.format(
                                    lockedBy, queueName, resource_type),
                                method_name='run')
                            try:
                                tmpLog.debug('start')
                                tmpLog.debug('workers status: %s' % tmpVal)
                                nWorkers = tmpVal['nNewWorkers'] + tmpVal[
                                    'nReady']
                                nReady = tmpVal['nReady']

                                # check queue
                                if not self.queueConfigMapper.has_queue(
                                        queueName):
                                    tmpLog.error('config not found')
                                    continue

                                # no new workers
                                if nWorkers == 0:
                                    tmpLog.debug(
                                        'skipped since no new worker is needed based on current stats'
                                    )
                                    continue
                                # get queue
                                queueConfig = self.queueConfigMapper.get_queue(
                                    queueName)
                                workerMakerCore = self.workerMaker.get_plugin(
                                    queueConfig)
                                # check if resource is ready
                                if hasattr(workerMakerCore, 'dynamicSizing') and \
                                        workerMakerCore.dynamicSizing is True:
                                    numReadyResources = self.workerMaker.num_ready_resources(
                                        queueConfig, resource_type,
                                        workerMakerCore)
                                    tmpLog.debug('numReadyResources: %s' %
                                                 numReadyResources)
                                    if not numReadyResources:
                                        if hasattr(workerMakerCore,
                                                   'staticWorkers'):
                                            nQRWorkers = tmpVal[
                                                'nQueue'] + tmpVal['nRunning']
                                            tmpLog.debug('staticWorkers: %s, nQRWorkers(Queue+Running): %s' %
                                                         (workerMakerCore.staticWorkers, nQRWorkers))
                                            if nQRWorkers >= workerMakerCore.staticWorkers:
                                                tmpLog.debug('No static workers left, skip')
                                                continue
                                            else:
                                                nWorkers = min(workerMakerCore.staticWorkers - nQRWorkers,
                                                               nWorkers)
                                                tmpLog.debug('staticWorkers: %s, nWorkers: %s' %
                                                             (workerMakerCore.staticWorkers, nWorkers))
                                        else:
                                            tmpLog.debug(
                                                'skip since no resources are ready'
                                            )
                                            continue
                                    else:
                                        nWorkers = min(nWorkers,
                                                       numReadyResources)
                                # post action of worker maker
                                skipOnFail = hasattr(workerMakerCore, 'skipOnFail') and \
                                    workerMakerCore.skipOnFail is True
                                # actions based on mapping type
                                if queueConfig.mapType == WorkSpec.MT_NoJob:
                                    # workers without jobs
                                    jobChunks = []
                                    for i in range(nWorkers):
                                        jobChunks.append([])
                                elif queueConfig.mapType == WorkSpec.MT_OneToOne:
                                    # one worker per one job
                                    jobChunks = self.dbProxy.get_job_chunks_for_workers(
                                        queueName, nWorkers, nReady, 1, None,
                                        queueConfig.useJobLateBinding,
                                        harvester_config.submitter.checkInterval,
                                        harvester_config.submitter.lockInterval,
                                        lockedBy)
                                elif queueConfig.mapType == WorkSpec.MT_MultiJobs:
                                    # one worker for multiple jobs
                                    nJobsPerWorker = self.workerMaker.get_num_jobs_per_worker(
                                        queueConfig,
                                        nWorkers,
                                        resource_type,
                                        maker=workerMakerCore)
                                    tmpLog.debug('nJobsPerWorker={0}'.format(
                                        nJobsPerWorker))
                                    jobChunks = self.dbProxy.get_job_chunks_for_workers(
                                        queueName, nWorkers, nReady,
                                        nJobsPerWorker, None,
                                        queueConfig.useJobLateBinding,
                                        harvester_config.submitter.checkInterval,
                                        harvester_config.submitter.lockInterval,
                                        lockedBy, queueConfig.allowJobMixture)
                                elif queueConfig.mapType == WorkSpec.MT_MultiWorkers:
                                    # multiple workers for one job
                                    nWorkersPerJob = self.workerMaker.get_num_workers_per_job(
                                        queueConfig,
                                        nWorkers,
                                        resource_type,
                                        maker=workerMakerCore)
                                    maxWorkersPerJob = self.workerMaker.get_max_workers_per_job_in_total(
                                        queueConfig,
                                        resource_type,
                                        maker=workerMakerCore)
                                    maxWorkersPerJobPerCycle = self.workerMaker.get_max_workers_per_job_per_cycle(
                                        queueConfig,
                                        resource_type,
                                        maker=workerMakerCore)
                                    tmpLog.debug('nWorkersPerJob={0}'.format(
                                        nWorkersPerJob))
                                    jobChunks = self.dbProxy.get_job_chunks_for_workers(
                                        queueName, nWorkers, nReady,
                                        None, nWorkersPerJob,
                                        queueConfig.useJobLateBinding,
                                        harvester_config.submitter.checkInterval,
                                        harvester_config.submitter.lockInterval,
                                        lockedBy,
                                        max_workers_per_job_in_total=maxWorkersPerJob,
                                        max_workers_per_job_per_cycle=maxWorkersPerJobPerCycle)
                                else:
                                    tmpLog.error('unknown mapType={0}'.format(
                                        queueConfig.mapType))
                                    continue

                                tmpLog.debug('got {0} job chunks'.format(
                                    len(jobChunks)))
                                if len(jobChunks) == 0:
                                    continue
                                # make workers
                                okChunks, ngChunks = self.workerMaker.make_workers(
                                    jobChunks,
                                    queueConfig,
                                    nReady,
                                    resource_type,
                                    maker=workerMakerCore)
                                if len(ngChunks) == 0:
                                    tmpLog.debug(
                                        'successfully made {0} workers'.format(
                                            len(okChunks)))
                                else:
                                    tmpLog.debug(
                                        'made {0} workers, while {1} workers failed'
                                        .format(len(okChunks), len(ngChunks)))
                                timeNow = datetime.datetime.utcnow()
                                timeNow_timestamp = time.time()
                                pandaIDs = set()
                                # NG (=not good)
                                for ngJobs in ngChunks:
                                    for jobSpec in ngJobs:
                                        if skipOnFail:
                                            # release jobs when workers are not made
                                            pandaIDs.add(jobSpec.PandaID)
                                        else:
                                            jobSpec.status = 'failed'
                                            jobSpec.subStatus = 'failed_to_make'
                                            jobSpec.stateChangeTime = timeNow
                                            jobSpec.lockedBy = None
                                            errStr = 'failed to make a worker'
                                            jobSpec.set_pilot_error(
                                                PilotErrors.ERR_SETUPFAILURE,
                                                errStr)
                                            jobSpec.trigger_propagation()
                                            self.dbProxy.update_job(
                                                jobSpec, {
                                                    'lockedBy': lockedBy,
                                                    'subStatus': 'prepared'
                                                })
                                # OK
                                workSpecList = []
                                if len(okChunks) > 0:
                                    for workSpec, okJobs in okChunks:
                                        # has job
                                        if (queueConfig.useJobLateBinding and workSpec.workerID is None) \
                                                or queueConfig.mapType == WorkSpec.MT_NoJob:
                                            workSpec.hasJob = 0
                                        else:
                                            workSpec.hasJob = 1
                                            if workSpec.nJobsToReFill in [None, 0]:
                                                workSpec.set_jobspec_list(okJobs)
                                            else:
                                                # refill free slots while the worker is running
                                                workSpec.set_jobspec_list(okJobs[:workSpec.nJobsToReFill])
                                                # release the jobs beyond the refill slots
                                                for jobSpec in okJobs[workSpec.nJobsToReFill:]:
                                                    pandaIDs.add(jobSpec.PandaID)
                                                workSpec.nJobsToReFill = None
                                            workSpec.set_num_jobs_with_list()
                                        # map type
                                        workSpec.mapType = queueConfig.mapType
                                        # queue name
                                        workSpec.computingSite = queueConfig.queueName
                                        # set access point
                                        workSpec.accessPoint = queueConfig.messenger['accessPoint']
                                        # sync level
                                        workSpec.syncLevel = queueConfig.get_synchronization_level()
                                        # events
                                        if len(okJobs) > 0 and \
                                                ('eventService' in okJobs[0].jobParams or
                                                 'cloneJob' in okJobs[0].jobParams):
                                            workSpec.eventsRequest = WorkSpec.EV_useEvents
                                        workSpecList.append(workSpec)
                                if len(workSpecList) > 0:
                                    sw = core_utils.get_stopwatch()
                                    # get plugin for submitter
                                    submitterCore = self.pluginFactory.get_plugin(
                                        queueConfig.submitter)
                                    if submitterCore is None:
                                        # not found
                                        tmpLog.error(
                                            'submitter plugin for {0} not found'.format(queueName))
                                        continue
                                    # get plugin for messenger
                                    messenger = self.pluginFactory.get_plugin(
                                        queueConfig.messenger)
                                    if messenger is None:
                                        # not found
                                        tmpLog.error(
                                            'messenger plugin for {0} not found'.format(queueName))
                                        continue
                                    # setup access points
                                    messenger.setup_access_points(workSpecList)
                                    # feed jobs
                                    for workSpec in workSpecList:
                                        if workSpec.hasJob == 1:
                                            tmpStat = messenger.feed_jobs(
                                                workSpec,
                                                workSpec.get_jobspec_list())
                                            if tmpStat is False:
                                                tmpLog.error(
                                                    'failed to send jobs to workerID={0}'
                                                    .format(workSpec.workerID))
                                            else:
                                                tmpLog.debug(
                                                    'sent jobs to workerID={0} with {1}'
                                                    .format(
                                                        workSpec.workerID,
                                                        tmpStat))
                                    # insert workers
                                    self.dbProxy.insert_workers(
                                        workSpecList, lockedBy)
                                    # submit
                                    sw.reset()
                                    tmpLog.info(
                                        'submitting {0} workers'.format(
                                            len(workSpecList)))
                                    workSpecList, tmpRetList, tmpStrList = self.submit_workers(
                                        submitterCore, workSpecList)
                                    tmpLog.debug('done submitting {0} workers'.
                                                 format(len(workSpecList)) +
                                                 sw.get_elapsed_time())
                                    # collect successful jobs
                                    okPandaIDs = set()
                                    for iWorker, (tmpRet, tmpStr) in enumerate(
                                            zip(tmpRetList, tmpStrList)):
                                        if tmpRet:
                                            workSpec, jobList = okChunks[iWorker]
                                            jobList = workSpec.get_jobspec_list()
                                            if jobList is not None:
                                                for jobSpec in jobList:
                                                    okPandaIDs.add(
                                                        jobSpec.PandaID)
                                    # loop over all workers
                                    for iWorker, (tmpRet, tmpStr) in enumerate(
                                            zip(tmpRetList, tmpStrList)):
                                        workSpec, jobList = okChunks[iWorker]
                                        # set harvesterHost
                                        workSpec.harvesterHost = socket.gethostname()
                                        # use associated job list since it can be truncated for re-filling
                                        jobList = workSpec.get_jobspec_list()
                                        # set status
                                        if not tmpRet:
                                            # failed submission
                                            errStr = 'failed to submit a workerID={0} with {1}'.format(
                                                workSpec.workerID, tmpStr)
                                            tmpLog.error(errStr)
                                            workSpec.set_status(
                                                WorkSpec.ST_missed)
                                            workSpec.set_dialog_message(tmpStr)
                                            workSpec.set_pilot_error(
                                                PilotErrors.ERR_SETUPFAILURE,
                                                errStr)
                                            if jobList is not None:
                                                # increment attempt number
                                                newJobList = []
                                                for jobSpec in jobList:
                                                    # skip if successful with another worker
                                                    if jobSpec.PandaID in okPandaIDs:
                                                        continue
                                                    if jobSpec.submissionAttempts is None:
                                                        jobSpec.submissionAttempts = 0
                                                    jobSpec.submissionAttempts += 1
                                                    # max attempt or permanent error
                                                    if tmpRet is False or \
                                                            jobSpec.submissionAttempts >= \
                                                            queueConfig.maxSubmissionAttempts:
                                                        newJobList.append(jobSpec)
                                                    else:
                                                        self.dbProxy.increment_submission_attempt(
                                                            jobSpec.PandaID, jobSpec.submissionAttempts)
                                                jobList = newJobList
                                        elif queueConfig.useJobLateBinding and workSpec.hasJob == 1:
                                            # directly go to running after feeding jobs for late binding
                                            workSpec.set_status(
                                                WorkSpec.ST_running)
                                        else:
                                            # normal successful submission
                                            workSpec.set_status(
                                                WorkSpec.ST_submitted)
                                        workSpec.submitTime = timeNow
                                        workSpec.modificationTime = timeNow
                                        workSpec.checkTime = timeNow
                                        if self.monitor_fifo.enabled:
                                            workSpec.set_work_params({'lastCheckAt': timeNow_timestamp})
                                        # prefetch events
                                        if tmpRet and workSpec.hasJob == 1 and \
                                                workSpec.eventsRequest == WorkSpec.EV_useEvents and \
                                                queueConfig.prefetchEvents:
                                            workSpec.eventsRequest = WorkSpec.EV_requestEvents
                                            eventsRequestParams = dict()
                                            for jobSpec in jobList:
                                                eventsRequestParams[jobSpec.PandaID] = \
                                                    {'pandaID': jobSpec.PandaID,
                                                     'taskID': jobSpec.taskID,
                                                     'jobsetID': jobSpec.jobParams['jobsetID'],
                                                     'nRanges': max(int(math.ceil(workSpec.nCore / len(jobList))),
                                                                    jobSpec.jobParams['coreCount']),
                                                     }
                                            workSpec.eventsRequestParams = eventsRequestParams
                                        # register worker
                                        tmpStat = self.dbProxy.register_worker(
                                            workSpec, jobList, lockedBy)
                                        if jobList is not None:
                                            for jobSpec in jobList:
                                                pandaIDs.add(jobSpec.PandaID)
                                                if tmpStat:
                                                    if tmpRet:
                                                        tmpStr = 'submitted a workerID={0} for PandaID={1} with batchID={2}'
                                                        tmpLog.info(tmpStr.format(
                                                            workSpec.workerID, jobSpec.PandaID, workSpec.batchID))
                                                    else:
                                                        tmpStr = 'failed to submit a workerID={0} for PandaID={1}'
                                                        tmpLog.error(tmpStr.format(
                                                            workSpec.workerID, jobSpec.PandaID))
                                                else:
                                                    tmpStr = 'failed to register a worker for PandaID={0} with batchID={1}'
                                                    tmpLog.error(tmpStr.format(
                                                        jobSpec.PandaID, workSpec.batchID))
                                    # enqueue to monitor fifo
                                    if self.monitor_fifo.enabled \
                                            and queueConfig.mapType != WorkSpec.MT_MultiWorkers:
                                        workSpecsToEnqueue = \
                                            [[w] for w in workSpecList if w.status
                                             in (WorkSpec.ST_submitted, WorkSpec.ST_running)]
                                        monitor_fifo.put(
                                            (queueName, workSpecsToEnqueue),
                                            time.time() + harvester_config.monitor.fifoCheckInterval)
                                        mainLog.debug('put workers to monitor FIFO')
                                    submitted = True
                                # release jobs
                                self.dbProxy.release_jobs(pandaIDs, lockedBy)
                                tmpLog.info('done')
                            except Exception:
                                core_utils.dump_error_message(tmpLog)
            mainLog.debug('done')
            # define sleep interval
            if siteName is None:
                sleepTime = harvester_config.submitter.sleepTime
            else:
                sleepTime = 0
                if submitted and hasattr(harvester_config.submitter,
                                         'minSubmissionInterval'):
                    interval = harvester_config.submitter.minSubmissionInterval
                    if interval > 0:
                        newTime = datetime.datetime.utcnow() + \
                            datetime.timedelta(seconds=interval)
                        self.dbProxy.update_panda_queue_attribute(
                            'submitTime', newTime, site_name=siteName)

            # time the cycle
            mainLog.debug('done a submitter cycle' +
                          sw_main.get_elapsed_time())
            # check if being terminated
            if self.terminated(sleepTime):
                mainLog.debug('terminated')
                return

    # wrapper for submitWorkers to skip ready workers
    def submit_workers(self, submitter_core, workspec_list):
        retList = []
        strList = []
        newSpecList = []
        workersToSubmit = []
        for workSpec in workspec_list:
            if workSpec.status in [WorkSpec.ST_ready, WorkSpec.ST_running]:
                newSpecList.append(workSpec)
                retList.append(True)
                strList.append('')
            else:
                workersToSubmit.append(workSpec)
        tmpRetList = submitter_core.submit_workers(workersToSubmit)

        # submit the workers to the monitoring
        self.apfmon.create_workers(workersToSubmit)

        for tmpRet, tmpStr in tmpRetList:
            retList.append(tmpRet)
            strList.append(tmpStr)
        newSpecList += workersToSubmit
        return newSpecList, retList, strList
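
submit_workers() above simply reports already ready/running workers as successes and forwards the rest to the submitter plugin, keeping the result lists aligned with the reordered worker list; the stripped-down sketch below uses stand-in classes (illustrative only, not harvester API).

# Stand-ins for WorkSpec and a submitter plugin, just to show the partitioning.
class FakeWorker(object):
    def __init__(self, status):
        self.status = status


class FakeSubmitter(object):
    def submit_workers(self, workers):
        # real plugins return a (bool, message) pair per worker
        return [(True, '') for _ in workers]


def submit_skipping_ready(submitter, workers, ready_states=('ready', 'running')):
    kept, to_submit, rets, msgs = [], [], [], []
    for w in workers:
        if w.status in ready_states:
            # count as successful without touching the batch system again
            kept.append(w)
            rets.append(True)
            msgs.append('')
        else:
            to_submit.append(w)
    for ret, msg in submitter.submit_workers(to_submit):
        rets.append(ret)
        msgs.append(msg)
    # result i corresponds to worker i in the returned list
    return kept + to_submit, rets, msgs


specs, rets, msgs = submit_skipping_ready(
    FakeSubmitter(), [FakeWorker('ready'), FakeWorker('missed')])
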
Example #15
class SAGAMonitor(PluginBase):
    # constructor
    def __init__(self, **kwarg):
        PluginBase.__init__(self, **kwarg)
        self.pluginFactory = PluginFactory()
        self.queue_config_mapper = QueueConfigMapper()
        tmpLog = self.make_logger(baseLogger, method_name='__init__')
        tmpLog.info("[{0}] SAGA adaptor will be used.".format(self.adaptor))

    # check workers
    def check_workers(self, workspec_list):
        """Check status of workers. This method takes a list of WorkSpecs as input argument
        and returns a list of worker's statuses.
  
        :param workspec_list: a list of work specs instances
        :return: A tuple of return code (True for success, False otherwise) and a list of worker's statuses.
        :rtype: (bool, [string,])
        """
        try:
            job_service = saga.job.Service(self.adaptor)
        except saga.SagaException as ex:
            time.sleep(10)
            # retry once after a transient SAGA error
            return self.check_workers(workspec_list)
        sagadateformat_str = '%a %b %d %H:%M:%S %Y'
        retList = []
        for workSpec in workspec_list:
            # make logger
            errStr = ''
            tmpLog = self.make_logger(baseLogger,
                                      'workerID={0}'.format(workSpec.workerID),
                                      method_name='check_workers')
            tmpLog.debug("SAGA monitor started")
            if workSpec.batchID:
                saga_submission_id = '[{0}]-[{1}]'.format(
                    self.adaptor, workSpec.batchID)
                try:
                    worker = job_service.get_job(saga_submission_id)
                    tmpLog.debug(
                        'SAGA State for submission with batchid: {0} is: {1}'.
                        format(workSpec.batchID, worker.state))
                    harvester_job_state = SAGASubmitter.status_translator(
                        worker.state)
                    workSpec.nativeStatus = worker.state
                    workSpec.set_status(harvester_job_state)
                    tmpLog.debug(
                        'Worker state with batchid: {0} is: {1} exit code: {2}'
                        .format(workSpec.batchID, harvester_job_state,
                                worker.exit_code))
                    if worker.created:
                        tmpLog.debug("Worker created (SAGA): {0}".format(
                            worker.created))
                        workSpec.submitTime = datetime.strptime(
                            worker.created, sagadateformat_str)
                    if worker.started:
                        tmpLog.debug("Worker started (SAGA): {0}".format(
                            worker.started))
                        workSpec.startTime = datetime.strptime(
                            worker.started, sagadateformat_str)
                    if worker.finished:
                        tmpLog.debug("Worker finished (SAGA): {0}".format(
                            worker.finished))
                        workSpec.endTime = datetime.strptime(
                            worker.finished, sagadateformat_str)

                    if workSpec.is_final_status():
                        workSpec.nativeExitCode = worker.exit_code
                        tmpLog.info(
                            "Worker in final status [{0}] exit code: {1}".
                            format(workSpec.status, workSpec.nativeExitCode))
                        if workSpec.nativeExitCode != 0:  # let's try to find exit code, exit message etc...
                            tmpLog.info(
                                "Deep check to find exit code and exit status required"
                            )
                            harvester_job_state, workSpec.nativeExitCode, workSpec.nativeStatus, starttime, endtime, errStr = self.deep_checkjob(
                                workSpec.batchID, workSpec.workerID)
                            if harvester_job_state == "":
                                harvester_job_state = workSpec.ST_finished
                            if not workSpec.startTime:
                                workSpec.startTime = starttime
                            if endtime:
                                workSpec.endTime = endtime
                            workSpec.set_status(harvester_job_state)
                        tmpLog.info(
                            'Worker {2} with BatchID={0} finished with exit code {1} and state {3}'
                            .format(workSpec.batchID, worker.exit_code,
                                    workSpec.workerID, worker.state))
                        tmpLog.debug('Started: [{0}] finished: [{1}]'.format(
                            worker.started, worker.finished))

                    if worker.state == saga.job.PENDING:
                        queue_time = (datetime.now() -
                                      workSpec.submitTime).total_seconds()
                        tmpLog.info(
                            "Worker queued for {0} sec.".format(queue_time))
                        if hasattr(self, 'maxqueuetime') and queue_time > self.maxqueuetime:
                            tmpLog.info(
                                "Queue time {0} is longer than the limit {1}; the worker will be cancelled"
                                .format(queue_time, self.maxqueuetime))
                            worker.cancel()
                            worker.wait()
                            workSpec.nativeExitCode = worker.exit_code
                            cur_time = datetime.now()
                            workSpec.startTime = cur_time
                            workSpec.endTime = cur_time
                            workSpec.set_pilot_closed()
                            workSpec.set_status(workSpec.ST_cancelled)
                            harvester_job_state = workSpec.ST_cancelled
                            tmpLog.info(
                                "Worker state: {0} worker exit code: {1}".
                                format(harvester_job_state,
                                       workSpec.nativeExitCode))
                            # proper processing of jobs for worker will be required, to avoid 'fake' fails

                except saga.SagaException as ex:
                    tmpLog.info(
                        'An exception occurred while retrieving worker information {0}'
                        .format(workSpec.batchID))
                    tmpLog.info(ex.get_message())
                    # probably 'finished' is not the proper state in this case; 'undefined' looks a bit better
                    # some more work for SAGA to get proper state
                    harvester_job_state, workSpec.nativeExitCode, workSpec.nativeStatus, starttime, endtime, errStr = self.deep_checkjob(
                        workSpec.batchID, workSpec.workerID)
                    if harvester_job_state == "":
                        harvester_job_state = workSpec.ST_finished
                    if not workSpec.startTime:
                        workSpec.startTime = starttime
                    if endtime:
                        workSpec.endTime = endtime
                    workSpec.set_status(harvester_job_state)
                    tmpLog.debug('Worker state set to: {0} ({1})'.format(
                        workSpec.status, harvester_job_state))
                retList.append((harvester_job_state, errStr))
                # for compatibility with dummy monitor
                with open(os.path.join(workSpec.accessPoint, 'status.txt'), 'w') as f:
                    f.write(workSpec.status)

            else:
                tmpLog.debug(
                    "SAGA monitor found worker [{0}] without batchID".format(
                        workSpec.workerID))

        job_service.close()
        tmpLog.debug('Results: {0}'.format(retList))

        return True, retList

    def deep_checkjob(self, batchid, workerid):
        """
        Get job state, exit code and some more parameters, from resources depending sources

        :param batchid:
        :return harvester_job_state, nativeExitCode, nativeStatus, startTime, endTime, diagMessage
        """
        tmpLog = self.make_logger(baseLogger,
                                  'workerID={0}'.format(workerid),
                                  method_name='deep_checkjob')
        harvester_job_state = None
        nativeexitcode = None
        nativestatus = None
        diagmessage = ""
        starttime = None
        endtime = None
        queue_config = self.queue_config_mapper.get_queue(self.queueName)
        if hasattr(queue_config, 'resource'):
            resource_utils = self.pluginFactory.get_plugin(
                queue_config.resource)
        else:
            tmpLog.debug("Resource configuration missed for: {0}".format(
                self.queueName))
            resource_utils = None
        if resource_utils:
            batchjob_info = resource_utils.get_batchjob_info(batchid)
        if batchjob_info:
            tmpLog.info('Batch job info collected: {0}'.format(batchjob_info))
            harvester_job_state = batchjob_info['status']
            nativeexitcode = batchjob_info['nativeExitCode']
            nativestatus = batchjob_info['nativeStatus']
            diagmessage = batchjob_info['nativeExitMsg']
            if batchjob_info['start_time']:
                starttime = batchjob_info['start_time']
            if batchjob_info['finish_time']:
                endtime = batchjob_info['finish_time']

        return harvester_job_state, nativeexitcode, nativestatus, starttime, endtime, diagmessage
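
check_workers() above maps SAGA job states to harvester worker states via SAGASubmitter.status_translator and falls back to deep_checkjob when SAGA raises; below is a hedged sketch of such a translation table (the state strings are illustrative assumptions, not the real mapping).

# Illustrative translation table; the real mapping lives in SAGASubmitter.status_translator.
SAGA_TO_HARVESTER = {
    'New': 'submitted',
    'Pending': 'submitted',
    'Running': 'running',
    'Done': 'finished',
    'Failed': 'failed',
    'Canceled': 'cancelled',
}


def translate_state(saga_state):
    # fall back to 'undefined' for states we do not recognise, as the comments above suggest
    return SAGA_TO_HARVESTER.get(saga_state, 'undefined')


assert translate_state('Running') == 'running'
assert translate_state('Suspended') == 'undefined'
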
Example #16
class Monitor(AgentBase):
    # constructor
    def __init__(self, queue_config_mapper, single_mode=False):
        AgentBase.__init__(self, single_mode)
        self.queueConfigMapper = queue_config_mapper
        self.dbProxy = DBProxy()
        self.pluginFactory = PluginFactory()

    # main loop
    def run(self):
        lockedBy = 'monitor-{0}'.format(self.ident)
        # init messengers
        for queueConfig in self.queueConfigMapper.get_all_queues().values():
            # just import for module initialization
            self.pluginFactory.get_plugin(queueConfig.messenger)
        # main
        while True:
            sw = core_utils.get_stopwatch()
            mainLog = core_utils.make_logger(_logger,
                                             'id={0}'.format(lockedBy),
                                             method_name='run')
            mainLog.debug('getting workers to monitor')
            workSpecsPerQueue = self.dbProxy.get_workers_to_update(
                harvester_config.monitor.maxWorkers,
                harvester_config.monitor.checkInterval,
                harvester_config.monitor.lockInterval, lockedBy)
            mainLog.debug('got {0} queues'.format(len(workSpecsPerQueue)))
            # loop over all workers
            for queueName, workSpecsList in iteritems(workSpecsPerQueue):
                tmpQueLog = core_utils.make_logger(_logger,
                                                   'id={0} queue={1}'.format(
                                                       lockedBy, queueName),
                                                   method_name='run')
                # check queue
                if not self.queueConfigMapper.has_queue(queueName):
                    tmpQueLog.error('config not found')
                    continue
                # get queue
                queueConfig = self.queueConfigMapper.get_queue(queueName)
                # get plugins
                monCore = self.pluginFactory.get_plugin(queueConfig.monitor)
                messenger = self.pluginFactory.get_plugin(
                    queueConfig.messenger)
                # check workers
                allWorkers = [
                    item for sublist in workSpecsList for item in sublist
                ]
                tmpQueLog.debug('checking {0} workers'.format(len(allWorkers)))
                tmpRetMap = self.check_workers(monCore, messenger, allWorkers,
                                               queueConfig, tmpQueLog)
                # loop over all worker chunks
                tmpQueLog.debug('update jobs and workers')
                iWorker = 0
                for workSpecs in workSpecsList:
                    jobSpecs = None
                    filesToStageOut = dict()
                    pandaIDsList = []
                    eventsToUpdateList = []
                    filesToStageOutList = []
                    for workSpec in workSpecs:
                        tmpLog = core_utils.make_logger(_logger,
                                                        'workerID={0}'.format(
                                                            workSpec.workerID),
                                                        method_name='run')
                        tmpOut = tmpRetMap[workSpec.workerID]
                        newStatus = tmpOut['newStatus']
                        monStatus = tmpOut['monStatus']
                        diagMessage = tmpOut['diagMessage']
                        workAttributes = tmpOut['workAttributes']
                        eventsToUpdate = tmpOut['eventsToUpdate']
                        filesToStageOut = tmpOut['filesToStageOut']
                        eventsRequestParams = tmpOut['eventsRequestParams']
                        nJobsToReFill = tmpOut['nJobsToReFill']
                        pandaIDs = tmpOut['pandaIDs']
                        tmpStr = 'newStatus={0} monitoredStatus={1} diag={2} '
                        tmpStr += 'postProcessed={3} files={4}'
                        tmpLog.debug(
                            tmpStr.format(newStatus, monStatus, diagMessage,
                                          workSpec.is_post_processed(),
                                          str(filesToStageOut)))
                        iWorker += 1
                        # check status
                        if newStatus not in WorkSpec.ST_LIST:
                            tmpLog.error(
                                'unknown status={0}'.format(newStatus))
                            continue
                        # update worker
                        workSpec.set_status(newStatus)
                        workSpec.set_work_attributes(workAttributes)
                        workSpec.set_dialog_message(diagMessage)
                        # request events
                        if eventsRequestParams != {}:
                            workSpec.eventsRequest = WorkSpec.EV_requestEvents
                            workSpec.eventsRequestParams = eventsRequestParams
                        # jobs to refill
                        if nJobsToReFill is not None:
                            workSpec.nJobsToReFill = nJobsToReFill
                        # get associated jobs for the worker chunk
                        if workSpec.hasJob == 1 and jobSpecs is None:
                            jobSpecs = self.dbProxy.get_jobs_with_worker_id(
                                workSpec.workerID, None, only_running=True)
                        # pandaIDs for push
                        pandaIDsList.append(pandaIDs)
                        if len(eventsToUpdate) > 0:
                            eventsToUpdateList.append(eventsToUpdate)
                        if len(filesToStageOut) > 0:
                            filesToStageOutList.append(filesToStageOut)
                    # update jobs and workers
                    if jobSpecs is not None:
                        tmpQueLog.debug(
                            'updating {0} jobs with {1} workers'.format(
                                len(jobSpecs), len(workSpecs)))
                        core_utils.update_job_attributes_with_workers(
                            queueConfig.mapType, jobSpecs, workSpecs,
                            filesToStageOutList, eventsToUpdateList)
                        for jobSpec in jobSpecs:
                            tmpLog = core_utils.make_logger(
                                _logger,
                                'PandaID={0}'.format(jobSpec.PandaID),
                                method_name='run')
                            tmpLog.debug(
                                'new status={0} subStatus={1} status_in_metadata={2}'
                                .format(
                                    jobSpec.status, jobSpec.subStatus,
                                    jobSpec.get_job_status_from_attributes()))
                    # update local database
                    tmpRet = self.dbProxy.update_jobs_workers(
                        jobSpecs, workSpecs, lockedBy, pandaIDsList)
                    if not tmpRet:
                        for workSpec in workSpecs:
                            tmpLog = core_utils.make_logger(
                                _logger,
                                'workerID={0}'.format(workSpec.workerID),
                                method_name='run')
                            tmpLog.error(
                                'failed to update the DB. lockInterval may be too short'
                            )
                    # send ACK to workers for events and files
                    if len(eventsToUpdateList) > 0 or len(
                            filesToStageOutList) > 0:
                        for workSpec in workSpecs:
                            messenger.acknowledge_events_files(workSpec)
                tmpQueLog.debug('done')
            mainLog.debug('done' + sw.get_elapsed_time())
            # check if being terminated
            if self.terminated(harvester_config.monitor.sleepTime):
                mainLog.debug('terminated')
                return

    # wrapper for checkWorkers
    def check_workers(self, mon_core, messenger, all_workers, queue_config,
                      tmp_log):
        workersToCheck = []
        retMap = dict()
        for workSpec in all_workers:
            eventsRequestParams = {}
            eventsToUpdate = []
            pandaIDs = []
            workStatus = None
            workAttributes = None
            filesToStageOut = None
            nJobsToReFill = None
            # job-level late binding
            if workSpec.hasJob == 0 and queue_config.mapType != WorkSpec.MT_NoJob:
                # check if job is requested
                jobRequested = messenger.job_requested(workSpec)
                if jobRequested:
                    # set ready when job is requested
                    workStatus = WorkSpec.ST_ready
                else:
                    workStatus = workSpec.status
            elif workSpec.nJobsToReFill in [0, None]:
                # check if job is requested to refill free slots
                jobRequested = messenger.job_requested(workSpec)
                if jobRequested:
                    nJobsToReFill = jobRequested
                workersToCheck.append(workSpec)
            else:
                workersToCheck.append(workSpec)
            # add
            retMap[workSpec.workerID] = {
                'newStatus': workStatus,
                'monStatus': workStatus,
                'workAttributes': workAttributes,
                'filesToStageOut': filesToStageOut,
                'eventsRequestParams': eventsRequestParams,
                'eventsToUpdate': eventsToUpdate,
                'diagMessage': '',
                'pandaIDs': pandaIDs,
                'nJobsToReFill': nJobsToReFill
            }
        # check workers
        tmp_log.debug('checking workers with plugin')
        tmpStat, tmpOut = mon_core.check_workers(workersToCheck)
        if not tmpStat:
            tmp_log.error('failed to check workers with {0}'.format(tmpOut))
        else:
            tmp_log.debug('checked')
            for workSpec, (newStatus,
                           diagMessage) in zip(workersToCheck, tmpOut):
                workerID = workSpec.workerID
                pandaIDs = []
                if workerID in retMap:
                    # request kill
                    if messenger.kill_requested(workSpec):
                        self.dbProxy.kill_worker(workSpec.workerID)
                    # get output files
                    filesToStageOut = messenger.get_files_to_stage_out(
                        workSpec)
                    retMap[workerID]['filesToStageOut'] = filesToStageOut
                    # get events to update
                    if workSpec.eventsRequest in [
                            WorkSpec.EV_useEvents, WorkSpec.EV_requestEvents
                    ]:
                        eventsToUpdate = messenger.events_to_update(workSpec)
                        retMap[workerID]['eventsToUpdate'] = eventsToUpdate
                    # request events
                    if workSpec.eventsRequest == WorkSpec.EV_useEvents:
                        eventsRequestParams = messenger.events_requested(
                            workSpec)
                        retMap[workerID][
                            'eventsRequestParams'] = eventsRequestParams
                    # get PandaIDs for pull model
                    if queue_config.mapType == WorkSpec.MT_NoJob:
                        pandaIDs = messenger.get_panda_ids(workSpec)
                    retMap[workerID]['pandaIDs'] = pandaIDs
                    # keep original new status
                    retMap[workerID]['monStatus'] = newStatus
                    # set running while there are events to update or files to stage out
                    if newStatus in [
                            WorkSpec.ST_finished, WorkSpec.ST_failed,
                            WorkSpec.ST_cancelled
                    ]:
                        if len(retMap[workerID]['filesToStageOut']) > 0 or \
                                len(retMap[workerID]['eventsToUpdate']) > 0:
                            newStatus = WorkSpec.ST_running
                        elif not workSpec.is_post_processed():
                            if not queue_config.is_no_heartbeat_status(
                                    newStatus):
                                # post processing unless heartbeat is suppressed
                                jobSpecs = self.dbProxy.get_jobs_with_worker_id(
                                    workSpec.workerID,
                                    None,
                                    True,
                                    only_running=True)
                                # post processing
                                messenger.post_processing(
                                    workSpec, jobSpecs, queue_config.mapType)
                            workSpec.post_processed()
                            newStatus = WorkSpec.ST_running
                        # reset modification time to immediately trigger subsequent lookup
                        workSpec.trigger_next_lookup()
                    # get work attributes so that they can be updated in post_processing if any
                    workAttributes = messenger.get_work_attributes(workSpec)
                    retMap[workerID]['workAttributes'] = workAttributes
                    retMap[workerID]['newStatus'] = newStatus
                    retMap[workerID]['diagMessage'] = diagMessage
        return retMap
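
The key rule buried in the nesting above is the status override: a worker that the batch system reports as finished, failed or cancelled is kept 'running' while it still has files to stage out or events to update. A minimal standalone sketch of just that rule, with hypothetical helper names and no harvester imports:

FINAL_STATES = {'finished', 'failed', 'cancelled'}

def effective_status(reported_status, files_to_stage_out, events_to_update):
    """Return the status the monitor should record for a worker (sketch, not harvester code)."""
    if reported_status in FINAL_STATES and (files_to_stage_out or events_to_update):
        # postpone the final state until outputs and events have been handled
        return 'running'
    return reported_status

print(effective_status('finished', ['log.tgz'], []))  # -> running
print(effective_status('finished', [], []))           # -> finished
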
Example #17
class SAGAMonitor(PluginBase):
    # constructor
    def __init__(self, **kwarg):
        PluginBase.__init__(self, **kwarg)
        self.pluginFactory = PluginFactory()
        self.queue_config_mapper = QueueConfigMapper()
        tmpLog = self.make_logger(baseLogger, method_name='__init__')
        tmpLog.info("[{0}] SAGA adaptor will be used.".format(self.adaptor))

    # check workers
    def check_workers(self, workspec_list):
        """Check status of workers. This method takes a list of WorkSpecs as input argument
        and returns a list of worker's statuses.
  
        :param workspec_list: a list of work specs instances
        :return: A tuple of return code (True for success, False otherwise) and a list of worker's statuses.
        :rtype: (bool, [string,])
        """
        try:
            job_service = saga.job.Service(self.adaptor)
        except saga.SagaException as ex:
            # wait and retry; return the retry result so the rest of the method
            # never runs with an undefined job_service
            time.sleep(10)
            return self.check_workers(workspec_list)
        sagadateformat_str = '%a %b %d %H:%M:%S %Y'
        retList = []
        for workSpec in workspec_list:
            # make logger
            errStr = ''
            tmpLog = self.make_logger(baseLogger, 'workerID={0}'.format(workSpec.workerID),
                                      method_name='check_workers')
            tmpLog.debug("SAGA monitor started")
            if workSpec.batchID:
                saga_submission_id = '[{0}]-[{1}]'.format(self.adaptor, workSpec.batchID)
                try:
                    worker = job_service.get_job(saga_submission_id)
                    tmpLog.debug(
                        'SAGA State for submission with batchid: {0} is: {1}'.format(workSpec.batchID, worker.state))
                    harvester_job_state = SAGASubmitter.status_translator(worker.state)
                    workSpec.nativeStatus = worker.state
                    workSpec.set_status(harvester_job_state)
                    tmpLog.debug(
                        'Worker state with batchid: {0} is: {1} exit code: {2}'.format(workSpec.batchID, harvester_job_state, worker.exit_code))
                    if worker.created:
                        tmpLog.debug("Worker created (SAGA): {0}".format(worker.created))
                        workSpec.submitTime = datetime.strptime(worker.created, sagadateformat_str)
                    if worker.started:
                        tmpLog.debug("Worker started (SAGA): {0}".format(worker.started))
                        workSpec.startTime = datetime.strptime(worker.started, sagadateformat_str)
                    if worker.finished:
                        tmpLog.debug("Worker finished (SAGA): {0}".format(worker.finished))
                        workSpec.endTime = datetime.strptime(worker.finished, sagadateformat_str)

                    if workSpec.is_final_status():
                        workSpec.nativeExitCode = worker.exit_code
                        tmpLog.info("Worker in final status [{0}] exit code: {1}".format(workSpec.status, workSpec.nativeExitCode))
                        if workSpec.nativeExitCode != 0:  # let's try to find exit code, exit message etc...
                            tmpLog.info("Deep check to find exit code and exit status required")
                            harvester_job_state, workSpec.nativeExitCode, workSpec.nativeStatus, starttime, endtime, errStr = self.deep_checkjob(
                                workSpec.batchID, workSpec.workerID)
                            if harvester_job_state == "":
                                harvester_job_state = workSpec.ST_finished
                            if not workSpec.startTime:
                                workSpec.startTime = starttime
                            if endtime:
                                workSpec.endTime = endtime
                            workSpec.set_status(harvester_job_state)
                        tmpLog.info('Worker {2} with BatchID={0} finished with exit code {1} and state {3}'.format(
                            workSpec.batchID, worker.exit_code, workSpec.workerID, worker.state))
                        tmpLog.debug('Started: [{0}] finished: [{1}]'.format(worker.started, worker.finished))

                    if worker.state == saga.job.PENDING:
                        queue_time = (datetime.now() - workSpec.submitTime).total_seconds()
                        tmpLog.info("Worker queued for {0} sec.".format(queue_time))
                        if hasattr(self, 'maxqueuetime') and queue_time > self.maxqueuetime:
                            tmpLog.info(
                                "Queue time {0} is longer than the limit {1}; worker will be cancelled".format(
                                    queue_time, self.maxqueuetime))
                            worker.cancel()
                            worker.wait()
                            workSpec.nativeExitCode = worker.exit_code
                            cur_time = datetime.now()
                            workSpec.startTime = cur_time
                            workSpec.endTime = cur_time
                            workSpec.set_pilot_closed()
                            workSpec.set_status(workSpec.ST_cancelled)
                            harvester_job_state = workSpec.ST_cancelled
                            tmpLog.info("Worker state: {0} worker exit code: {1}".format(harvester_job_state,
                                                                                         workSpec.nativeExitCode))
                            # proper processing of the worker's jobs will be required to avoid 'fake' failures

                except saga.SagaException as ex:
                    tmpLog.info('An exception occurred while retrieving worker information for {0}'.format(workSpec.batchID))
                    tmpLog.info(ex.get_message())
                    # 'finished' is probably not the proper state in this case; 'undefined' would fit better,
                    # but that needs some more work on the SAGA side to get the proper state
                    harvester_job_state, workSpec.nativeExitCode, workSpec.nativeStatus, starttime, endtime, errStr = self.deep_checkjob(
                        workSpec.batchID, workSpec.workerID)
                    if harvester_job_state == "":
                        harvester_job_state = workSpec.ST_finished
                    if not workSpec.startTime:
                        workSpec.startTime = starttime
                    if endtime:
                        workSpec.endTime = endtime
                    workSpec.set_status(harvester_job_state)
                    tmpLog.debug('Worker state set to: {0} ({1})'.format(workSpec.status, harvester_job_state))
                retList.append((harvester_job_state, errStr))
                # for compatibility with dummy monitor
                with open(os.path.join(workSpec.accessPoint, 'status.txt'), 'w') as f:
                    f.write(workSpec.status)

            else:
                tmpLog.debug("SAGA monitor found worker [{0}] without batchID".format(workSpec.workerID))

        job_service.close()
        tmpLog.debug('Results: {0}'.format(retList))

        return True, retList

    def deep_checkjob(self, batchid, workerid):
        """
        Get job state, exit code and some more parameters, from resources depending sources

        :param batchid:
        :return harvester_job_state, nativeExitCode, nativeStatus, startTime, endTime, diagMessage
        """
        tmpLog = self.make_logger(baseLogger, 'workerID={0}'.format(workerid), method_name='deep_checkjob')
        harvester_job_state = None
        nativeexitcode = None
        nativestatus = None
        diagmessage = ""
        starttime = None
        endtime = None
        queue_config = self.queue_config_mapper.get_queue(self.queueName)
        if hasattr(queue_config, 'resource'):
            resource_utils = self.pluginFactory.get_plugin(queue_config.resource)
        else:
            tmpLog.debug("Resource configuration missed for: {0}".format(self.queueName))
            resource_utils = None
        # batchjob_info must be defined even when no resource plugin is configured
        batchjob_info = None
        if resource_utils:
            batchjob_info = resource_utils.get_batchjob_info(batchid)
        if batchjob_info:
            tmpLog.info('Batch job info collected: {0}'.format(batchjob_info))
            harvester_job_state = batchjob_info['status']
            nativeexitcode = batchjob_info['nativeExitCode']
            nativestatus = batchjob_info['nativeStatus']
            diagmessage = batchjob_info['nativeExitMsg']
            if batchjob_info['start_time']:
                starttime = batchjob_info['start_time']
            if batchjob_info['finish_time']:
                endtime = batchjob_info['finish_time']

        return harvester_job_state, nativeexitcode, nativestatus, starttime, endtime, diagmessage
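
SAGA reports creation/start/finish times as plain text, which check_workers converts with datetime.strptime and the format string above. A small stdlib-only illustration of that parsing step (the helper name is ours, not part of harvester):

from datetime import datetime

SAGA_DATE_FORMAT = '%a %b %d %H:%M:%S %Y'  # same format as sagadateformat_str above

def parse_saga_timestamp(value):
    """Parse a SAGA-style timestamp string; return None for empty values (sketch)."""
    if not value:
        return None
    return datetime.strptime(value, SAGA_DATE_FORMAT)

print(parse_saga_timestamp('Mon Apr 16 11:05:30 2018'))  # -> 2018-04-16 11:05:30
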
Example #18
class EventFeeder(AgentBase):
    # constructor
    def __init__(self, communicator, queue_config_mapper, single_mode=False):
        AgentBase.__init__(self, single_mode)
        self.dbProxy = DBProxy()
        self.queueConfigMapper = queue_config_mapper
        self.communicator = communicator
        self.pluginFactory = PluginFactory()

    # main loop
    def run(self):
        lockedBy = 'eventfeeder-{0}'.format(self.get_pid())
        while True:
            mainLog = self.make_logger(_logger, 'id={0}'.format(lockedBy), method_name='run')
            mainLog.debug('getting workers to feed events')
            workSpecsPerQueue = self.dbProxy.get_workers_to_feed_events(harvester_config.eventfeeder.maxWorkers,
                                                                        harvester_config.eventfeeder.lockInterval,
                                                                        lockedBy)
            mainLog.debug('got {0} queues'.format(len(workSpecsPerQueue)))
            # loop over all workers
            for queueName, workSpecList in iteritems(workSpecsPerQueue):
                tmpQueLog = self.make_logger(_logger, 'queue={0}'.format(queueName), method_name='run')
                # check queue
                if not self.queueConfigMapper.has_queue(queueName):
                    tmpQueLog.error('config not found')
                    continue
                # get queue
                queueConfig = self.queueConfigMapper.get_queue(queueName)
                if hasattr(queueConfig, 'scatteredEvents') and queueConfig.scatteredEvents:
                    scattered = True
                else:
                    scattered = False
                # get plugin
                messenger = self.pluginFactory.get_plugin(queueConfig.messenger)
                # loop over all workers
                for workSpec in workSpecList:
                    tmpLog = core_utils.make_logger(_logger, 'workerID={0}'.format(workSpec.workerID),
                                                    method_name='run')
                    # lock worker again
                    lockedFlag = self.dbProxy.lock_worker_again_to_feed_events(workSpec.workerID, lockedBy)
                    if not lockedFlag:
                        tmpLog.debug('skipped since locked by another')
                        continue
                    # get events
                    tmpLog.debug('get events')
                    tmpStat, events = self.communicator.get_event_ranges(workSpec.eventsRequestParams,
                                                                         scattered)
                    # failed
                    if tmpStat is False:
                        tmpLog.error('failed to get events with {0}'.format(events))
                        continue
                    # lock worker again
                    lockedFlag = self.dbProxy.lock_worker_again_to_feed_events(workSpec.workerID, lockedBy)
                    if not lockedFlag:
                        tmpLog.debug('skipped before feeding since locked by another')
                        continue
                    tmpStat = messenger.feed_events(workSpec, events)
                    # failed
                    if tmpStat is False:
                        tmpLog.error('failed to feed events')
                        continue
                    # dump
                    for pandaID, eventList in iteritems(events):
                        try:
                            nRanges = workSpec.eventsRequestParams[pandaID]['nRanges']
                        except Exception:
                            nRanges = None
                        tmpLog.debug('got {0} events for PandaID={1} while getting {2} events'.format(len(eventList),
                                                                                                      pandaID,
                                                                                                      nRanges))
                        # disable multi workers
                        if workSpec.mapType == WorkSpec.MT_MultiWorkers:
                            if len(eventList) == 0 or (nRanges is not None and len(eventList) < nRanges):
                                tmpStat = self.dbProxy.disable_multi_workers(pandaID)
                                if tmpStat == 1:
                                    tmpStr = 'disabled MultiWorkers for PandaID={0}'.format(pandaID)
                                    tmpLog.debug(tmpStr)
                    # update worker
                    workSpec.eventsRequest = WorkSpec.EV_useEvents
                    workSpec.eventsRequestParams = None
                    workSpec.eventFeedTime = None
                    workSpec.eventFeedLock = None
                    # update local database
                    tmpStat = self.dbProxy.update_worker(workSpec, {'eventFeedLock': lockedBy})
                    tmpLog.debug('done with {0}'.format(tmpStat))
                tmpQueLog.debug('done')
            mainLog.debug('done')
            # check if being terminated
            if self.terminated(harvester_config.eventfeeder.sleepTime):
                mainLog.debug('terminated')
                return
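
The multi-worker check inside the dump loop is easy to overlook: MultiWorkers is disabled for a PandaID as soon as it gets no events, or fewer events than the number of ranges it requested. A standalone sketch of that condition, with hypothetical names and no harvester imports:

def should_disable_multi_workers(event_list, n_ranges_requested):
    """True when running several workers per job is no longer useful (sketch)."""
    if len(event_list) == 0:
        return True
    return n_ranges_requested is not None and len(event_list) < n_ranges_requested

print(should_disable_multi_workers([], 10))           # True: no events left
print(should_disable_multi_workers(['e1', 'e2'], 5))  # True: partial fill
print(should_disable_multi_workers(['e1'], None))     # False: requested size unknown
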
Example #19
class Sweeper(AgentBase):
    # constructor
    def __init__(self, queue_config_mapper, single_mode=False):
        AgentBase.__init__(self, single_mode)
        self.dbProxy = DBProxy()
        self.queueConfigMapper = queue_config_mapper
        self.pluginFactory = PluginFactory()


    # main loop
    def run(self):
        lockedBy = 'sweeper-{0}'.format(self.get_pid())
        while True:
            sw_main = core_utils.get_stopwatch()
            mainLog = self.make_logger(_logger, 'id={0}'.format(lockedBy), method_name='run')
            # killing stage
            sw_kill = core_utils.get_stopwatch()
            mainLog.debug('try to get workers to kill')
            # get workers to kill
            workersToKill = self.dbProxy.get_workers_to_kill(harvester_config.sweeper.maxWorkers,
                                                             harvester_config.sweeper.checkInterval)
            mainLog.debug('got {0} queues to kill workers'.format(len(workersToKill)))
            # loop over all workers
            sw = core_utils.get_stopwatch()
            for queueName, configIdWorkSpecList in iteritems(workersToKill):
                for configID, workspec_list in iteritems(configIdWorkSpecList):
                    # get sweeper
                    if not self.queueConfigMapper.has_queue(queueName, configID):
                        mainLog.error('queue config for {0}/{1} not found'.format(queueName, configID))
                        continue
                    queueConfig = self.queueConfigMapper.get_queue(queueName, configID)
                    sweeperCore = self.pluginFactory.get_plugin(queueConfig.sweeper)
                    sw.reset()
                    n_workers = len(workspec_list)
                    try:
                        # try bulk method
                        tmpLog = self.make_logger(_logger, 'id={0}'.format(lockedBy), method_name='run')
                        tmpLog.debug('start killing')
                        tmpList = sweeperCore.kill_workers(workspec_list)
                    except AttributeError:
                        # fall back to single-worker method
                        for workspec in workspec_list:
                            tmpLog = self.make_logger(_logger, 'workerID={0}'.format(workspec.workerID),
                                                      method_name='run')
                            try:
                                tmpLog.debug('start killing one worker')
                                tmpStat, tmpOut = sweeperCore.kill_worker(workspec)
                                tmpLog.debug('done killing with status={0} diag={1}'.format(tmpStat, tmpOut))
                            except Exception:
                                core_utils.dump_error_message(tmpLog)
                    except Exception:
                        core_utils.dump_error_message(mainLog)
                    else:
                        # bulk method
                        n_killed = 0
                        for workspec, (tmpStat, tmpOut) in zip(workspec_list, tmpList):
                            tmpLog.debug('done killing workerID={0} with status={1} diag={2}'.format(
                                            workspec.workerID, tmpStat, tmpOut))
                            if tmpStat:
                                n_killed += 1
                        tmpLog.debug('killed {0}/{1} workers'.format(n_killed, n_workers))
                    mainLog.debug('done killing {0} workers'.format(n_workers) + sw.get_elapsed_time())
            mainLog.debug('done all killing' + sw_kill.get_elapsed_time())
            # cleanup stage
            sw_cleanup = core_utils.get_stopwatch()
            # timeout for missed
            try:
                keepMissed = harvester_config.sweeper.keepMissed
            except Exception:
                keepMissed = 24
            try:
                keepPending = harvester_config.sweeper.keepPending
            except Exception:
                keepPending = 24
            # get workers for cleanup
            statusTimeoutMap = {'finished': harvester_config.sweeper.keepFinished,
                                'failed': harvester_config.sweeper.keepFailed,
                                'cancelled': harvester_config.sweeper.keepCancelled,
                                'missed': keepMissed,
                                'pending': keepPending
                                }
            workersForCleanup = self.dbProxy.get_workers_for_cleanup(harvester_config.sweeper.maxWorkers,
                                                                     statusTimeoutMap)
            mainLog.debug('got {0} queues for workers cleanup'.format(len(workersForCleanup)))
            sw = core_utils.get_stopwatch()
            for queueName, configIdWorkSpecList in iteritems(workersForCleanup):
                for configID, workspec_list in iteritems(configIdWorkSpecList):
                    # get sweeper
                    if not self.queueConfigMapper.has_queue(queueName, configID):
                        mainLog.error('queue config for {0}/{1} not found'.format(queueName, configID))
                        continue
                    queueConfig = self.queueConfigMapper.get_queue(queueName, configID)
                    sweeperCore = self.pluginFactory.get_plugin(queueConfig.sweeper)
                    messenger = self.pluginFactory.get_plugin(queueConfig.messenger)
                    sw.reset()
                    n_workers = len(workspec_list)
                    # make sure workers to clean up are all terminated
                    mainLog.debug('making sure workers to clean up are all terminated')
                    try:
                        # try bulk method
                        tmpList = sweeperCore.kill_workers(workspec_list)
                    except AttributeError:
                        # fall back to single-worker method
                        for workspec in workspec_list:
                            tmpLog = self.make_logger(_logger, 'workerID={0}'.format(workspec.workerID),
                                                      method_name='run')
                            try:
                                tmpStat, tmpOut = sweeperCore.kill_worker(workspec)
                            except Exception:
                                core_utils.dump_error_message(tmpLog)
                    except Exception:
                        core_utils.dump_error_message(mainLog)
                    mainLog.debug('made sure workers to clean up are all terminated')
                    # start cleanup
                    for workspec in workspec_list:
                        tmpLog = self.make_logger(_logger, 'workerID={0}'.format(workspec.workerID),
                                                  method_name='run')
                        try:
                            tmpLog.debug('start cleaning up one worker')
                            # sweep worker
                            tmpStat, tmpOut = sweeperCore.sweep_worker(workspec)
                            tmpLog.debug('swept one worker with status={0} diag={1}'.format(tmpStat, tmpOut))
                            tmpLog.debug('start messenger cleanup')
                            mc_tmpStat, mc_tmpOut = messenger.clean_up(workspec)
                            tmpLog.debug('messenger cleaned up with status={0} diag={1}'.format(mc_tmpStat, mc_tmpOut))
                            if tmpStat:
                                self.dbProxy.delete_worker(workspec.workerID)
                        except Exception:
                            core_utils.dump_error_message(tmpLog)
                    mainLog.debug('done cleaning up {0} workers'.format(n_workers) + sw.get_elapsed_time())
            mainLog.debug('done all cleanup' + sw_cleanup.get_elapsed_time())
            # old-job-deletion stage
            sw_delete = core_utils.get_stopwatch()
            mainLog.debug('delete old jobs')
            jobTimeout = max(statusTimeoutMap.values()) + 1
            self.dbProxy.delete_old_jobs(jobTimeout)
            # delete orphaned job info
            self.dbProxy.delete_orphaned_job_info()
            mainLog.debug('done deletion of old jobs' + sw_delete.get_elapsed_time())
            # time the cycle
            mainLog.debug('done a sweeper cycle' + sw_main.get_elapsed_time())
            # check if being terminated
            if self.terminated(harvester_config.sweeper.sleepTime):
                mainLog.debug('terminated')
                return
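
Both the killing and the cleanup stages use the same fallback pattern: try the plugin's bulk kill_workers() and, if only the per-worker kill_worker() exists, fall back to a loop on AttributeError. A self-contained sketch of the pattern with illustrative dummy plugins (not real harvester plugins):

class BulkSweeper(object):
    def kill_workers(self, workspec_list):
        return [(True, 'killed {0}'.format(w)) for w in workspec_list]

class SingleSweeper(object):
    def kill_worker(self, workspec):
        return True, 'killed {0}'.format(workspec)

def kill_all(sweeper_core, workspec_list):
    try:
        # bulk method, if the plugin implements it
        return sweeper_core.kill_workers(workspec_list)
    except AttributeError:
        # fall back to the per-worker method
        return [sweeper_core.kill_worker(w) for w in workspec_list]

print(kill_all(BulkSweeper(), ['w1', 'w2']))
print(kill_all(SingleSweeper(), ['w1', 'w2']))
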
#   begin_job_id = int(sys.argv[2])
#if len(sys.argv) > 3:
#   end_job_id = int(sys.argv[3])
#if len(sys.argv) > 4:
#   globus_sleep_time = int(sys.argv[4])

queueConfigMapper = QueueConfigMapper()
queueConfig = queueConfigMapper.get_queue(queueName)
initial_queueConfig_stager = queueConfig.stager
queueConfig.stager['module'] = 'pandaharvester.harvesterstager.go_bulk_stager'
queueConfig.stager['name'] = 'GlobusBulkStager'
modified_queueConfig_stager = queueConfig.stager

pluginFactory = PluginFactory()
# get stage-out plugin
stagerCore = pluginFactory.get_plugin(queueConfig.stager)

# logger
_logger = core_utils.setup_logger('further_testing_go_bulk_stager')
tmpLog = core_utils.make_logger(_logger,
                                method_name='further_testing_go_bulk_stager')
tmpLog.debug('start')

for loggerName, loggerObj in logging.Logger.manager.loggerDict.items():
    #print "loggerName - {}".format(loggerName)
    if loggerName.startswith('panda.log'):
        if len(loggerObj.handlers) == 0:
            continue
        if loggerName.split('.')[-1] in ['db_proxy']:
            continue
        stdoutHandler = logging.StreamHandler(sys.stdout)
        # reuse the existing handler's formatter and mirror the logger to stdout
        stdoutHandler.setFormatter(loggerObj.handlers[0].formatter)
        loggerObj.addHandler(stdoutHandler)
if len(sys.argv) > 3:
    end_job_id = int(sys.argv[3])
if len(sys.argv) > 4:
    globus_sleep_time = int(sys.argv[4])

queueConfigMapper = QueueConfigMapper()
queueConfig = queueConfigMapper.get_queue(queueName)
initial_queueConfig_preparator = queueConfig.preparator
queueConfig.preparator[
    'module'] = 'pandaharvester.harvesterpreparator.go_bulk_preparator'
queueConfig.preparator['name'] = 'GlobusBulkPreparator'
modified_queueConfig_preparator = queueConfig.preparator

pluginFactory = PluginFactory()
# get stage-in plugin
preparatorCore = pluginFactory.get_plugin(queueConfig.preparator)

# logger
_logger = core_utils.setup_logger('stageInTest_go_bulk_preparator')
tmpLog = core_utils.make_logger(_logger,
                                method_name='stageInTest_go_bulk_preparator')
tmpLog.debug('start')

for loggerName, loggerObj in logging.Logger.manager.loggerDict.items():
    #print "loggerName - {}".format(loggerName)
    if loggerName.startswith('panda.log'):
        if len(loggerObj.handlers) == 0:
            continue
        if loggerName.split('.')[-1] in ['db_proxy']:
            continue
        stdoutHandler = logging.StreamHandler(sys.stdout)
        # reuse the existing handler's formatter and mirror the logger to stdout
        stdoutHandler.setFormatter(loggerObj.handlers[0].formatter)
        loggerObj.addHandler(stdoutHandler)
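
What the two loops above set up is simply mirroring the already-configured panda.log loggers to stdout while reusing their formatter. A minimal stdlib-only sketch of the same idea, with a made-up logger name and log file path:

import logging
import sys

logger = logging.getLogger('panda.log.example')          # hypothetical logger name
logger.setLevel(logging.DEBUG)
file_handler = logging.FileHandler('/tmp/example.log')   # stand-in for the configured file handler
file_handler.setFormatter(logging.Formatter('%(asctime)s %(name)s %(levelname)s %(message)s'))
logger.addHandler(file_handler)

stdout_handler = logging.StreamHandler(sys.stdout)
stdout_handler.setFormatter(logger.handlers[0].formatter)  # reuse the existing formatter
logger.addHandler(stdout_handler)
logger.debug('this line now goes to both the log file and stdout')
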
Example #22
                                    'computingElement', 1, None)
        if len(jobs) == 0:
            print ("Failed to get jobs at {0} due to {1}".format(queueConfig.queueName, errStr))
            sys.exit(0)

        jobSpec = JobSpec()
        jobSpec.convert_job_json(jobs[0])

        # set input file paths
        inFiles = jobSpec.get_input_file_attributes()
        for inLFN, inFile in iteritems(inFiles):
            inFile['path'] = '{0}/{1}'.format(os.getcwd(), inLFN)
        jobSpec.set_input_file_paths(inFiles)
        jobSpecList.append(jobSpec)

    maker = pluginFactory.get_plugin(queueConfig.workerMaker)
    workSpec = maker.make_worker(jobSpecList, queueConfig, 'SCORE')  # TODO: needs more thought

    workSpec.accessPoint = queueConfig.messenger['accessPoint']
    workSpec.mapType = queueConfig.mapType
    workSpec.computingSite = queueConfig.queueName

    # set job to worker if not job-level late binding
    if not queueConfig.useJobLateBinding:
        workSpec.hasJob = 1
        workSpec.set_jobspec_list(jobSpecList)

    messenger = pluginFactory.get_plugin(queueConfig.messenger)
    messenger.setup_access_points([workSpec])

    # get plugin for messenger
Example #23
class MultiNodeWorkerMaker(BaseWorkerMaker):
    # constructor
    def __init__(self, **kwarg):
        BaseWorkerMaker.__init__(self, **kwarg)
        self.pluginFactory = PluginFactory()
        self.queue_config_mapper = QueueConfigMapper()
        tmpLog = self.make_logger(baseLogger, method_name='__init__')
        tmpLog.info("Multinode workermaker: created.")
        tmpLog.debug("Queue name: {0}".format(self.queueName))
        if self.mode == "static":
            tmpLog.info("Static configuration")
        elif self.mode == "dynamic":
            tmpLog.info("Dynamic configuration")
            self.nNodes, self.walltimelimit = self.get_resources()
        self.nJobsPerWorker = self.nNodes * self.nJobsPerNode

    def _get_executable(self):
        # return a string containing the body of the script for the scheduler: specific environment setup plus the executor with its parameters
        exe_str = ""

        tmpLog = self.make_logger(baseLogger, method_name='_get_executable')

        # prepare static environment
        env_str = ""
        if self.env not in (None, "NULL"):
            env_str = "\n".join(map(lambda s: s.strip(), self.env.split(", ")))

        # prepare executor
        try:
            if self.executor == "aprun":  # "aprun -n [number of required nodes/jobs] -d [number of cpu per node/job]" - for one multicore job per node
                exe_str = self.executor + " -n {0} -d {1} ".format(
                    self.nJobsPerWorker, self.nCorePerJob)
                exe_str += self.pilot
            else:
                exe_str = self.executor + " " + self.pilot
            if self.pilot_params:
                exe_str = " ".join([exe_str, self.pilot_params])
        except Exception:
            tmpLog.error(
                "Unable to build executor command; check the configuration")
            exe_str = ""

        exe_str = "\n".join([env_str, exe_str])
        tmpLog.debug("Shell script body: \n%s" % exe_str)

        return exe_str

    # make a worker from jobs
    def make_worker(self, jobspec_list, queue_config, resource_type):
        tmpLog = core_utils.make_logger(baseLogger,
                                        'queue={0}'.format(
                                            queue_config.queueName),
                                        method_name='make_worker')

        tmpLog.info("Multi node worker preparation started.")
        tmpLog.info("Worker size: {0} jobs on {2} nodes for {1} sec.".format(
            self.nJobsPerWorker, self.walltimelimit, self.nNodes))

        workSpec = WorkSpec()
        workSpec.nCore = self.nNodes * queue_config.submitter['nCorePerNode']
        workSpec.minRamCount = 0
        workSpec.maxDiskCount = 0
        workSpec.maxWalltime = self.walltimelimit
        workSpec.workParams = self._get_executable()

        if len(jobspec_list) > 0:
            # push case: we know the job and set the parameters of the job
            for jobSpec in jobspec_list:
                try:
                    workSpec.minRamCount += jobSpec.jobParams['minRamCount']
                except Exception:
                    pass
                try:
                    workSpec.maxDiskCount += jobSpec.jobParams['maxDiskCount']
                except Exception:
                    pass
                #try:
                #    if jobSpec.jobParams['maxWalltime'] not in (None, "NULL"):
                #        workSpec.maxWalltime = max(int(queue_config.walltimeLimit), jobSpec.jobParams['maxWalltime'])
                #    else:
                #        workSpec.maxWalltime = queue_config.walltimeLimit
                #except Exception:
                #    pass
        tmpLog.info(
            "Worker defined for {0} nodes with {2} jobs and a walltime of {1} sec."
            .format(self.nNodes, workSpec.maxWalltime, self.nJobsPerWorker))

        return workSpec

    # def get_num_jobs_per_worker(self, n_workers):
    #     """
    #     Function to set 'size' of worker. Define number of jobs per worker
    #     """
    #     tmpLog = core_utils.make_logger(baseLogger, 'queue={0}'.format(self.queueName),
    #                                     method_name='get_num_jobs_per_worker')
    #     tmpLog.info("Get number of jobs per worker")
    #     self.nJobsPerWorker = 1
    #     if self.mode == "static":
    #         tmpLog.info("Static configuration")
    #         self.nJobsPerWorker = self.nNodes * self.nJobsPerNode
    #     elif self.mode == "dynamic":
    #         tmpLog.info("Dynamic configuration")
    #         self.nNodes, self.walltimelimit = self.get_resources()
    #         self.nJobsPerWorker = self.nNodes * self.nJobsPerNode
    #
    #     tmpLog.info("Get: {0} jobs to run for {1} sec.".format(self.nJobsPerWorker, self.walltimelimit))
    #     return self.nJobsPerWorker

    def get_resources(self):
        """
        Function to get resourcese and map them to number of jobs
        """
        tmpLog = core_utils.make_logger(baseLogger,
                                        'queue={0}'.format(self.queueName),
                                        method_name='get_resources')
        njobs = 0
        walltime = self.walltimelimit
        queue_config = self.queue_config_mapper.get_queue(self.queueName)
        resource_utils = self.pluginFactory.get_plugin(queue_config.resource)
        if resource_utils:
            nodes, walltime = resource_utils.get_resources()
        else:
            tmpLog.info("Resource plugin is not defined")
            nodes = self.nNodes

        return nodes, walltime
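
For the aprun case, _get_executable concatenates the environment setup and the executor command into one script body. A standalone sketch with made-up configuration values (none of these names come from a real queue config) showing the resulting string:

env = "module load rootio, export PILOT_DIR=/scratch/pilot"  # hypothetical comma-separated env setting
executor = "aprun"
pilot = "python pilot.py"
pilot_params = "--queue MY_QUEUE"                            # hypothetical
n_jobs_per_worker = 16
n_core_per_job = 4

env_str = "\n".join(s.strip() for s in env.split(", "))
exe_str = "{0} -n {1} -d {2} {3}".format(executor, n_jobs_per_worker, n_core_per_job, pilot)
exe_str = " ".join([exe_str, pilot_params])
print("\n".join([env_str, exe_str]))
# module load rootio
# export PILOT_DIR=/scratch/pilot
# aprun -n 16 -d 4 python pilot.py --queue MY_QUEUE
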
Example #24
class WorkerAdjuster(object):
    # constructor
    def __init__(self, queue_config_mapper):
        self.queueConfigMapper = queue_config_mapper
        self.pluginFactory = PluginFactory()
        self.dbProxy = DBProxy()
        self.throttlerMap = dict()
        self.apf_mon = Apfmon(self.queueConfigMapper)
        try:
            self.maxNewWorkers = harvester_config.submitter.maxNewWorkers
        except AttributeError:
            self.maxNewWorkers = None

    # define number of workers to submit based on various information
    def define_num_workers(self, static_num_workers, site_name):
        tmpLog = core_utils.make_logger(_logger, 'site={0}'.format(site_name), method_name='define_num_workers')
        tmpLog.debug('start')
        tmpLog.debug('static_num_workers: {0}'.format(static_num_workers))
        dyn_num_workers = copy.deepcopy(static_num_workers)
        try:
            # get queue status
            queueStat = self.dbProxy.get_cache("panda_queues.json", None)
            if queueStat is None:
                queueStat = dict()
            else:
                queueStat = queueStat.data

            # get job statistics
            job_stats = self.dbProxy.get_cache("job_statistics.json", None)
            if job_stats is None:
                job_stats = dict()
            else:
                job_stats = job_stats.data

            # define num of new workers
            for queueName in static_num_workers:
                # get queue
                queueConfig = self.queueConfigMapper.get_queue(queueName)
                workerLimits_dict = self.dbProxy.get_worker_limits(queueName)
                maxWorkers = workerLimits_dict.get('maxWorkers', 0)
                nQueueLimit = workerLimits_dict.get('nQueueLimitWorker', 0)
                nQueueLimitPerRT = workerLimits_dict['nQueueLimitWorkerPerRT']
                nQueue_total, nReady_total, nRunning_total = 0, 0, 0
                apf_msg = None
                apf_data = None
                for resource_type, tmpVal in iteritems(static_num_workers[queueName]):
                    tmpLog.debug('Processing queue {0} resource {1} with static_num_workers {2}'.
                                 format(queueName, resource_type, tmpVal))

                    # set 0 to num of new workers when the queue is disabled
                    if queueName in queueStat and queueStat[queueName]['status'] in ['offline', 'standby',
                                                                                     'maintenance']:
                        dyn_num_workers[queueName][resource_type]['nNewWorkers'] = 0
                        retMsg = 'set nNewWorkers=0 since status={0}'.format(queueStat[queueName]['status'])
                        tmpLog.debug(retMsg)
                        apf_msg = 'Not submitting workers since queue status = {0}'.format(queueStat[queueName]['status'])
                        continue

                    # protection against not-up-to-date queue config
                    if queueConfig is None:
                        dyn_num_workers[queueName][resource_type]['nNewWorkers'] = 0
                        retMsg = 'set nNewWorkers=0 due to missing queueConfig'
                        tmpLog.debug(retMsg)
                        apf_msg = 'Not submitting workers because of missing queueConfig'
                        continue

                    # get throttler
                    if queueName not in self.throttlerMap:
                        if hasattr(queueConfig, 'throttler'):
                            throttler = self.pluginFactory.get_plugin(queueConfig.throttler)
                        else:
                            throttler = None
                        self.throttlerMap[queueName] = throttler

                    # check throttler
                    throttler = self.throttlerMap[queueName]
                    if throttler is not None:
                        toThrottle, tmpMsg = throttler.to_be_throttled(queueConfig)
                        if toThrottle:
                            dyn_num_workers[queueName][resource_type]['nNewWorkers'] = 0
                            retMsg = 'set nNewWorkers=0 by {0}:{1}'.format(throttler.__class__.__name__, tmpMsg)
                            tmpLog.debug(retMsg)
                            continue

                    # check stats
                    nQueue = tmpVal['nQueue']
                    nReady = tmpVal['nReady']
                    nRunning = tmpVal['nRunning']
                    if resource_type != 'ANY':
                        nQueue_total += nQueue
                        nReady_total += nReady
                        nRunning_total += nRunning
                    if queueConfig.runMode == 'slave':
                        nNewWorkersDef = tmpVal['nNewWorkers']
                        if nNewWorkersDef == 0:
                            dyn_num_workers[queueName][resource_type]['nNewWorkers'] = 0
                            retMsg = 'set nNewWorkers=0 by panda in slave mode'
                            tmpLog.debug(retMsg)
                            continue
                    else:
                        nNewWorkersDef = None

                    # define num of new workers based on static site config
                    nNewWorkers = 0
                    if nQueue >= nQueueLimitPerRT > 0:
                        # enough queued workers
                        retMsg = 'No nNewWorkers since nQueue({0})>=nQueueLimitPerRT({1})'.format(nQueue, nQueueLimitPerRT)
                        tmpLog.debug(retMsg)
                        pass
                    elif (nQueue + nReady + nRunning) >= maxWorkers > 0:
                        # enough workers in the system
                        retMsg = 'No nNewWorkers since nQueue({0}) + nReady({1}) + nRunning({2}) '.format(nQueue,
                                                                                                          nReady,
                                                                                                          nRunning)
                        retMsg += '>= maxWorkers({0})'.format(maxWorkers)
                        tmpLog.debug(retMsg)
                        pass
                    else:

                        maxQueuedWorkers = None

                        if nQueueLimitPerRT > 0:  # there is a limit set for the queue
                            maxQueuedWorkers = nQueueLimitPerRT

                        # adjust maxQueuedWorkers according to the particular limits below (slave mode or pull mode)
                        if nNewWorkersDef is not None:  # don't surpass limits given centrally
                            maxQueuedWorkers_slave = nNewWorkersDef + nQueue
                            if maxQueuedWorkers is not None:
                                maxQueuedWorkers = min(maxQueuedWorkers_slave, maxQueuedWorkers)
                            else:
                                maxQueuedWorkers = maxQueuedWorkers_slave

                        elif queueConfig.mapType == 'NoJob': # for pull mode, limit to activated jobs
                            # limit the queue to the number of activated jobs to avoid empty pilots
                            try:
                                n_activated = max(job_stats[queueName]['activated'], 1) # avoid no activity queues
                                queue_limit = maxQueuedWorkers
                                maxQueuedWorkers = min(n_activated, maxQueuedWorkers)
                                tmpLog.debug('limiting maxQueuedWorkers to min(n_activated={0}, queue_limit={1})'.
                                             format(n_activated, queue_limit))
                            except KeyError:
                                tmpLog.warning('n_activated not defined, defaulting to configured queue limits')
                                pass

                        if maxQueuedWorkers is None:  # no value found, use default value
                            maxQueuedWorkers = 1

                        # new workers
                        nNewWorkers = max(maxQueuedWorkers - nQueue, 0)
                        tmpLog.debug('setting nNewWorkers to {0} in maxQueuedWorkers calculation'
                                     .format(nNewWorkers))
                        if maxWorkers > 0:
                            nNewWorkers = min(nNewWorkers, max(maxWorkers - nQueue - nReady - nRunning, 0))
                            tmpLog.debug('setting nNewWorkers to {0} to respect maxWorkers'
                                         .format(nNewWorkers))
                    if queueConfig.maxNewWorkersPerCycle > 0:
                        nNewWorkers = min(nNewWorkers, queueConfig.maxNewWorkersPerCycle)
                        tmpLog.debug('setting nNewWorkers to {0} in order to respect maxNewWorkersPerCycle'
                                     .format(nNewWorkers))
                    if self.maxNewWorkers is not None and self.maxNewWorkers > 0:
                        nNewWorkers = min(nNewWorkers, self.maxNewWorkers)
                        tmpLog.debug('setting nNewWorkers to {0} in order to respect universal maxNewWorkers'
                                     .format(nNewWorkers))
                    dyn_num_workers[queueName][resource_type]['nNewWorkers'] = nNewWorkers

                # adjust nNewWorkers for UCORE to let aggregations over RT respect nQueueLimitWorker and maxWorkers
                if queueConfig is None:
                    maxNewWorkersPerCycle = 0
                    retMsg = 'set maxNewWorkersPerCycle=0 in UCORE aggregation due to missing queueConfig'
                    tmpLog.debug(retMsg)
                else:
                    maxNewWorkersPerCycle = queueConfig.maxNewWorkersPerCycle
                if len(dyn_num_workers[queueName]) > 1:
                    total_new_workers_rts = sum( dyn_num_workers[queueName][_rt]['nNewWorkers']
                                                if _rt != 'ANY' else 0
                                                for _rt in dyn_num_workers[queueName] )
                    nNewWorkers_max_agg = min(
                                                max(nQueueLimit - nQueue_total, 0),
                                                max(maxWorkers - nQueue_total - nReady_total - nRunning_total, 0),
                                                )
                    if maxNewWorkersPerCycle >= 0:
                        nNewWorkers_max_agg = min(nNewWorkers_max_agg, maxNewWorkersPerCycle)
                    if self.maxNewWorkers is not None and self.maxNewWorkers > 0:
                        nNewWorkers_max_agg = min(nNewWorkers_max_agg, self.maxNewWorkers)
                    # exceeded max, to adjust
                    if total_new_workers_rts > nNewWorkers_max_agg:
                        if nNewWorkers_max_agg == 0:
                            for resource_type in dyn_num_workers[queueName]:
                                dyn_num_workers[queueName][resource_type]['nNewWorkers'] = 0
                            tmpLog.debug('No nNewWorkers since nNewWorkers_max_agg=0 for UCORE')
                        else:
                            tmpLog.debug('nNewWorkers_max_agg={0} for UCORE'.format(nNewWorkers_max_agg))
                            _d = dyn_num_workers[queueName].copy()
                            del _d['ANY']
                            simple_rt_nw_list = [ [_rt, _d[_rt].get('nNewWorkers', 0), 0] for _rt in _d ]
                            _countdown = nNewWorkers_max_agg
                            for _rt_list in simple_rt_nw_list:
                                resource_type, nNewWorkers_orig, _r = _rt_list
                                nNewWorkers, remainder = divmod(nNewWorkers_orig*nNewWorkers_max_agg, total_new_workers_rts)
                                dyn_num_workers[queueName][resource_type]['nNewWorkers'] = nNewWorkers
                                _rt_list[2] = remainder
                                _countdown -= nNewWorkers
                            _s_list = sorted(simple_rt_nw_list, key=(lambda x: x[1]))
                            sorted_rt_nw_list = sorted(_s_list, key=(lambda x: x[2]), reverse=True)
                            for resource_type, nNewWorkers_orig, remainder in sorted_rt_nw_list:
                                if _countdown <= 0:
                                    break
                                dyn_num_workers[queueName][resource_type]['nNewWorkers'] += 1
                                _countdown -= 1
                        for resource_type in dyn_num_workers[queueName]:
                            if resource_type == 'ANY':
                                continue
                            nNewWorkers = dyn_num_workers[queueName][resource_type]['nNewWorkers']
                            tmpLog.debug('setting nNewWorkers to {0} of type {1} in order to respect RT aggregations for UCORE'
                                         .format(nNewWorkers, resource_type))

                if not apf_msg:
                    apf_data = copy.deepcopy(dyn_num_workers[queueName])

                self.apf_mon.update_label(queueName, apf_msg, apf_data)

            # dump
            tmpLog.debug('defined {0}'.format(str(dyn_num_workers)))
            return dyn_num_workers
        except Exception:
            # dump error
            errMsg = core_utils.dump_error_message(tmpLog)
            return None
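
The UCORE aggregation above caps the summed per-resource-type requests and redistributes them with divmod, handing leftover slots to the types with the largest remainders. A self-contained sketch of that apportionment with hypothetical numbers (it ignores the secondary sort on the original request size):

def rescale_new_workers(requests, cap):
    """requests: resource type -> requested new workers; cap: aggregated maximum (sketch)."""
    total = sum(requests.values())
    if total <= cap:
        return dict(requests)
    scaled = {}
    remainders = {}
    for rtype, n_requested in requests.items():
        scaled[rtype], remainders[rtype] = divmod(n_requested * cap, total)
    leftover = cap - sum(scaled.values())
    # hand out the remaining slots to the largest remainders first
    for rtype in sorted(requests, key=lambda r: remainders[r], reverse=True):
        if leftover <= 0:
            break
        scaled[rtype] += 1
        leftover -= 1
    return scaled

print(rescale_new_workers({'SCORE': 7, 'MCORE': 5}, 8))  # -> {'SCORE': 5, 'MCORE': 3}
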
Example #25
                                    'computingElement', 1, None)
        if len(jobs) == 0:
            print("Failed to get jobs at {0} due to {1}".format(queueConfig.queueName, errStr))
            sys.exit(0)

        jobSpec = JobSpec()
        jobSpec.convert_job_json(jobs[0])

        # set input file paths
        inFiles = jobSpec.get_input_file_attributes()
        for inLFN, inFile in iteritems(inFiles):
            inFile['path'] = '{0}/{1}'.format(os.getcwd(), inLFN)
        jobSpec.set_input_file_paths(inFiles)
        jobSpecList.append(jobSpec)

    maker = pluginFactory.get_plugin(queueConfig.workerMaker)
    workSpec = maker.make_worker(jobSpecList, queueConfig, jobType, resourceType)

    workSpec.accessPoint = queueConfig.messenger['accessPoint']
    workSpec.mapType = queueConfig.mapType
    workSpec.computingSite = queueConfig.queueName

    # set job to worker if not job-level late binding
    if not queueConfig.useJobLateBinding:
        workSpec.hasJob = 1
        workSpec.set_jobspec_list(jobSpecList)

    messenger = pluginFactory.get_plugin(queueConfig.messenger)
    messenger.setup_access_points([workSpec])

    # get plugin for messenger
class MultiNodeWorkerMaker(BaseWorkerMaker):
    # constructor
    def __init__(self, **kwarg):
        BaseWorkerMaker.__init__(self, **kwarg)
        self.pluginFactory = PluginFactory()
        self.queue_config_mapper = QueueConfigMapper()
        tmpLog = self.make_logger(baseLogger, method_name='__init__')
        tmpLog.info("Multinode workermaker: created.")
        tmpLog.debug("Queue name: {0}".format(self.queueName))
        if self.mode == "static":
            tmpLog.info("Static configuration")
        elif self.mode == "dynamic":
            tmpLog.info("Dynamic configuration")
            self.nNodes, self.walltimelimit = self.get_resources()
        self.nJobsPerWorker = self.nNodes * self.nJobsPerNode

    def _get_executable(self):
        # return string which contain body of script for scheduler: specific enviroment setup, executor with parameters
        exe_str = ""

        tmpLog = self.make_logger(baseLogger, method_name='_get_executable')

        # prepare static enviroment
        env_str = ""
        if self.env not in (None, "NULL"):
            env_str = "\n".join(map(lambda s: s.strip(),  self.env.split(", ")))

        # prepare executor
        try:
            if self.executor == "aprun":  # "aprun -n [number of required nodes/jobs] -d [number of cpu per node/job]" - for one multicore job per node
                exe_str = self.executor + " -n {0} -d {1} ".format(self.nJobsPerWorker, self.nCorePerJob)
                exe_str += self.pilot
            else:
                exe_str = self.executor + " " + self.pilot
            if self.pilot_params:
                exe_str = " ".join([exe_str, self.pilot_params])
        except Exception:
            tmpLog.error("Unable to build executor command check configuration")
            exe_str = ""

        exe_str = "\n".join([env_str, exe_str])
        tmpLog.debug("Shell script body: \n%s" % exe_str)

        return exe_str

    # make a worker from jobs
    def make_worker(self, jobspec_list, queue_config, resource_type):
        tmpLog = core_utils.make_logger(baseLogger, 'queue={0}'.format(queue_config.queueName),
                                        method_name='make_worker')

        tmpLog.info("Multi node worker preparation started.")
        tmpLog.info("Worker size: {0} jobs on {2} nodes for {1} sec.".format(self.nJobsPerWorker, self.walltimelimit,
                                                                                  self.nNodes))

        workSpec = WorkSpec()
        workSpec.nCore = self.nNodes * queue_config.submitter['nCorePerNode']
        workSpec.minRamCount = 0
        workSpec.maxDiskCount = 0
        workSpec.maxWalltime = self.walltimelimit
        workSpec.workParams = self._get_executable()

        if len(jobspec_list) > 0:
            # push case: we know the job and set the parameters of the job
            for jobSpec in jobspec_list:
                try:
                    workSpec.minRamCount += jobSpec.jobParams['minRamCount']
                except Exception:
                    pass
                try:
                    workSpec.maxDiskCount += jobSpec.jobParams['maxDiskCount']
                except Exception:
                    pass
                #try:
                #    if jobSpec.jobParams['maxWalltime'] not in (None, "NULL"):
                #        workSpec.maxWalltime = max(int(queue_config.walltimeLimit), jobSpec.jobParams['maxWalltime'])
                #    else:
                #        workSpec.maxWalltime = queue_config.walltimeLimit
                #except Exception:
                #    pass
        tmpLog.info("Worker for {0} nodes with {2} jobs with walltime {1} sec. defined".format(self.nNodes,
                                                                                             workSpec.maxWalltime,
                                                                                             self.nJobsPerWorker))

        return workSpec

    # def get_num_jobs_per_worker(self, n_workers):
    #     """
    #     Function to set 'size' of worker. Define number of jobs per worker
    #     """
    #     tmpLog = core_utils.make_logger(baseLogger, 'queue={0}'.format(self.queueName),
    #                                     method_name='get_num_jobs_per_worker')
    #     tmpLog.info("Get number of jobs per worker")
    #     self.nJobsPerWorker = 1
    #     if self.mode == "static":
    #         tmpLog.info("Static configuration")
    #         self.nJobsPerWorker = self.nNodes * self.nJobsPerNode
    #     elif self.mode == "dynamic":
    #         tmpLog.info("Dynamic configuration")
    #         self.nNodes, self.walltimelimit = self.get_resources()
    #         self.nJobsPerWorker = self.nNodes * self.nJobsPerNode
    #
    #     tmpLog.info("Get: {0} jobs to run for {1} sec.".format(self.nJobsPerWorker, self.walltimelimit))
    #     return self.nJobsPerWorker

    def get_resources(self):
        """
        Function to get resourcese and map them to number of jobs
        """
        tmpLog = core_utils.make_logger(baseLogger, 'queue={0}'.format(self.queueName),
                                        method_name='get_resources')
        njobs = 0
        walltime = self.walltimelimit
        queue_config = self.queue_config_mapper.get_queue(self.queueName)
        resource_utils = self.pluginFactory.get_plugin(queue_config.resource)
        if resource_utils:
            nodes, walltime = resource_utils.get_resources()
        else:
            tmpLog.info("Resource plugin is not defined")
            nodes = self.nNodes

        return nodes, walltime
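
# The methods above belong to a worker-maker plugin that already carries
# queueName, nNodes, nJobsPerWorker, walltimelimit, a queue_config_mapper and a
# pluginFactory. A minimal driver sketch for the pull case (empty jobspec_list);
# the class name MultiNodeWorkerMaker, the queue name and the exact make_worker
# signature are assumptions, not taken from the snippet itself.
from pandaharvester.harvestercore.queue_config_mapper import QueueConfigMapper

queue_config_mapper = QueueConfigMapper()
queue_config = queue_config_mapper.get_queue('ARC-TEST')   # assumed queue name
maker = MultiNodeWorkerMaker()                             # placeholder class name
maker.queueName = 'ARC-TEST'
maker.queue_config_mapper = queue_config_mapper
# pull case: no jobs are known yet, so only the static sizing above is applied
work_spec = maker.make_worker([], queue_config)
print(work_spec.nCore, work_spec.maxWalltime)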
Пример #27
0
class Stager(AgentBase):
    # constructor
    def __init__(self, queue_config_mapper, single_mode=False):
        AgentBase.__init__(self, single_mode)
        self.dbProxy = DBProxy()
        self.queueConfigMapper = queue_config_mapper
        self.pluginFactory = PluginFactory()


    # main loop
    def run(self):
        lockedBy = 'stager-{0}'.format(self.get_pid())
        while True:
            sw = core_utils.get_stopwatch()
            mainLog = self.make_logger(_logger, 'id={0}'.format(lockedBy), method_name='run')
            mainLog.debug('try to get jobs to check')
            # get jobs to check preparation
            try:
                maxFilesPerJob = harvester_config.stager.maxFilesPerJobToCheck
            except Exception:
                maxFilesPerJob = None
            jobsToCheck = self.dbProxy.get_jobs_for_stage_out(harvester_config.stager.maxJobsToCheck,
                                                              harvester_config.stager.checkInterval,
                                                              harvester_config.stager.lockInterval,
                                                              lockedBy, 'transferring',
                                                              JobSpec.HO_hasTransfer,
                                                              max_files_per_job=maxFilesPerJob)
            mainLog.debug('got {0} jobs to check'.format(len(jobsToCheck)))
            # loop over all jobs
            for jobSpec in jobsToCheck:
                tmpLog = self.make_logger(_logger, 'PandaID={0}'.format(jobSpec.PandaID),
                                          method_name='run')
                try:
                    tmpLog.debug('start checking')
                    # configID
                    configID = jobSpec.configID
                    if not core_utils.dynamic_plugin_change():
                        configID = None
                    # get queue
                    if not self.queueConfigMapper.has_queue(jobSpec.computingSite, configID):
                        tmpLog.error('queue config for {0}/{1} not found'.format(jobSpec.computingSite,
                                                                                 configID))
                        continue
                    queueConfig = self.queueConfigMapper.get_queue(jobSpec.computingSite, configID)
                    # get plugin
                    stagerCore = self.pluginFactory.get_plugin(queueConfig.stager)
                    if stagerCore is None:
                        # not found
                        tmpLog.error('plugin for {0} not found'.format(jobSpec.computingSite))
                        continue
                    # lock job again
                    lockedAgain = self.dbProxy.lock_job_again(jobSpec.PandaID, 'stagerTime', 'stagerLock', lockedBy)
                    if not lockedAgain:
                        tmpLog.debug('skip since locked by another thread')
                        continue
                    tmpStat, tmpStr = stagerCore.check_status(jobSpec)
                    # check result
                    if tmpStat is True:
                        # succeeded
                        newSubStatus = self.dbProxy.update_job_for_stage_out(jobSpec, True, lockedBy)
                        tmpLog.debug('succeeded new subStatus={0}'.format(newSubStatus))
                    elif tmpStat is False:
                        # fatal error
                        tmpLog.debug('fatal error when checking status with {0}'.format(tmpStr))
                        # update job
                        for fileSpec in jobSpec.outFiles:
                            if fileSpec.status != 'finished':
                                fileSpec.status = 'failed'
                        errStr = 'stage-out failed with {0}'.format(tmpStr)
                        jobSpec.set_pilot_error(PilotErrors.ERR_STAGEOUTFAILED, errStr)
                        jobSpec.trigger_propagation()
                        newSubStatus = self.dbProxy.update_job_for_stage_out(jobSpec, True, lockedBy)
                        tmpLog.debug('updated new subStatus={0}'.format(newSubStatus))
                    else:
                        # on-going
                        tmpLog.debug('try to check later since {0}'.format(tmpStr))
                except Exception:
                    core_utils.dump_error_message(tmpLog)
            # get jobs to trigger stage-out
            try:
                maxFilesPerJob = harvester_config.stager.maxFilesPerJobToTrigger
            except Exception:
                maxFilesPerJob = None
            jobsToTrigger = self.dbProxy.get_jobs_for_stage_out(harvester_config.stager.maxJobsToTrigger,
                                                                harvester_config.stager.triggerInterval,
                                                                harvester_config.stager.lockInterval,
                                                                lockedBy, 'to_transfer',
                                                                JobSpec.HO_hasOutput,
                                                                JobSpec.HO_hasZipOutput,
                                                                max_files_per_job=maxFilesPerJob)
            mainLog.debug('got {0} jobs to trigger'.format(len(jobsToTrigger)))
            # loop over all jobs
            for jobSpec in jobsToTrigger:
                tmpLog = self.make_logger(_logger, 'PandaID={0}'.format(jobSpec.PandaID),
                                          method_name='run')
                try:
                    tmpLog.debug('try to trigger stage-out')
                    # configID
                    configID = jobSpec.configID
                    if not core_utils.dynamic_plugin_change():
                        configID = None
                    # get queue
                    if not self.queueConfigMapper.has_queue(jobSpec.computingSite, configID):
                        tmpLog.error('queue config for {0}/{1} not found'.format(jobSpec.computingSite,
                                                                                 configID))
                        continue
                    queueConfig = self.queueConfigMapper.get_queue(jobSpec.computingSite, configID)
                    # get plugin
                    stagerCore = self.pluginFactory.get_plugin(queueConfig.stager)
                    if stagerCore is None:
                        # not found
                        tmpLog.error('plugin for {0} not found'.format(jobSpec.computingSite))
                        continue
                    # lock job again
                    lockedAgain = self.dbProxy.lock_job_again(jobSpec.PandaID, 'stagerTime', 'stagerLock', lockedBy)
                    if not lockedAgain:
                        tmpLog.debug('skip since locked by another thread')
                        continue
                    # trigger stage-out
                    tmpStat, tmpStr = stagerCore.trigger_stage_out(jobSpec)
                    # check result
                    if tmpStat is True:
                        # succeeded
                        jobSpec.all_files_triggered_to_stage_out()
                        newSubStatus = self.dbProxy.update_job_for_stage_out(jobSpec, True, lockedBy)
                        tmpLog.debug('triggered new subStatus={0}'.format(newSubStatus))
                    elif tmpStat is False:
                        # fatal error
                        tmpLog.debug('fatal error to trigger with {0}'.format(tmpStr))
                        # update job
                        for fileSpec in jobSpec.outFiles:
                            if fileSpec.status != 'finished':
                                fileSpec.status = 'failed'
                        errStr = 'stage-out failed with {0}'.format(tmpStr)
                        jobSpec.set_pilot_error(PilotErrors.ERR_STAGEOUTFAILED, errStr)
                        jobSpec.trigger_propagation()
                        newSubStatus = self.dbProxy.update_job_for_stage_out(jobSpec, True, lockedBy)
                        tmpLog.debug('updated new subStatus={0}'.format(newSubStatus))
                    else:
                        # temporary error
                        tmpLog.debug('try to trigger later since {0}'.format(tmpStr))
                except Exception:
                    core_utils.dump_error_message(tmpLog)
            # get jobs to zip output
            try:
                maxFilesPerJob = harvester_config.stager.maxFilesPerJobToZip
            except Exception:
                maxFilesPerJob = None
            try:
                zipInterval = harvester_config.stager.zipInterval
            except Exception:
                zipInterval = harvester_config.stager.triggerInterval
            jobsToZip = self.dbProxy.get_jobs_for_stage_out(harvester_config.stager.maxJobsToZip,
                                                            zipInterval,
                                                            harvester_config.stager.lockInterval,
                                                            lockedBy, 'to_transfer',
                                                            JobSpec.HO_hasZipOutput,
                                                            JobSpec.HO_hasOutput,
                                                            max_files_per_job=maxFilesPerJob)
            mainLog.debug('got {0} jobs to zip'.format(len(jobsToZip)))
            # loop over all jobs
            for jobSpec in jobsToZip:
                tmpLog = self.make_logger(_logger, 'PandaID={0}'.format(jobSpec.PandaID),
                                          method_name='run')
                try:
                    tmpLog.debug('try to zip output')
                    # configID
                    configID = jobSpec.configID
                    if not core_utils.dynamic_plugin_change():
                        configID = None
                    # get queue
                    if not self.queueConfigMapper.has_queue(jobSpec.computingSite, configID):
                        tmpLog.error('queue config for {0}/{1} not found'.format(jobSpec.computingSite,
                                                                                 configID))
                        continue
                    queueConfig = self.queueConfigMapper.get_queue(jobSpec.computingSite, configID)
                    # get plugin
                    stagerCore = self.pluginFactory.get_plugin(queueConfig.stager)
                    if stagerCore is None:
                        # not found
                        tmpLog.error('plugin for {0} not found'.format(jobSpec.computingSite))
                        continue
                    # lock job again
                    lockedAgain = self.dbProxy.lock_job_again(jobSpec.PandaID, 'stagerTime', 'stagerLock', lockedBy)
                    if not lockedAgain:
                        tmpLog.debug('skip since locked by another thread')
                        continue
                    # trigger preparation
                    tmpStat, tmpStr = stagerCore.zip_output(jobSpec)
                    # succeeded
                    if tmpStat is True:
                        # update job
                        jobSpec.all_files_zipped()
                        newSubStatus = self.dbProxy.update_job_for_stage_out(jobSpec, False, lockedBy)
                        tmpLog.debug('zipped new subStatus={0}'.format(newSubStatus))
                    else:
                        # failed
                        tmpLog.debug('failed to zip with {0}'.format(tmpStr))
                except Exception:
                    core_utils.dump_error_message(tmpLog)
            mainLog.debug('done' + sw.get_elapsed_time())
            # check if being terminated
            if self.terminated(harvester_config.stager.sleepTime):
                mainLog.debug('terminated')
                return
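
# Since the constructor and run() are fully shown above, the agent can in
# principle be exercised on its own. A minimal sketch, assuming the usual
# QueueConfigMapper import path and that the AgentBase single_mode flag makes
# terminated() stop the loop after the first pass.
from pandaharvester.harvestercore.queue_config_mapper import QueueConfigMapper

queue_config_mapper = QueueConfigMapper()
stager = Stager(queue_config_mapper, single_mode=True)
# one pass over the check / trigger / zip phases, assuming single_mode
# causes terminated() to return True after the first cycle
stager.run()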
Пример #28
0
oFile.close()
fileSpec.add_associated_file(assFileSpec)
jobSpec = JobSpec()
jobSpec.jobParams = {'outFiles': fileSpec.lfn + ',log',
                     'scopeOut': 'panda',
                     'scopeLog': 'panda',
                     'logFile': 'log',
                     'realDatasets': 'panda.' + fileSpec.lfn,
                     'ddmEndPointOut': 'BNL-OSG2_DATADISK',
                     }
jobSpec.add_out_file(fileSpec)

pluginFactory = PluginFactory()

# get stage-out plugin
stagerCore = pluginFactory.get_plugin(queueConfig.stager)
print ("plugin={0}".format(stagerCore.__class__.__name__))

print ("testing zip")
tmpStat, tmpOut = stagerCore.zip_output(jobSpec)
if tmpStat:
    print (" OK")
else:
    print (" NG {0}".format(tmpOut))

print ()

print ("testing stage-out")
transferID = None
tmpStat, tmpOut = stagerCore.trigger_stage_out(jobSpec)
if tmpStat:
    print (" OK")
else:
    print (" NG {0}".format(tmpOut))
Пример #29
0
class Sweeper(AgentBase):
    # constructor
    def __init__(self, queue_config_mapper, single_mode=False):
        AgentBase.__init__(self, single_mode)
        self.dbProxy = DBProxy()
        self.queueConfigMapper = queue_config_mapper
        self.pluginFactory = PluginFactory()


    # main loop
    def run(self):
        lockedBy = 'sweeper-{0}'.format(self.get_pid())
        while True:
            mainLog = self.make_logger(_logger, 'id={0}'.format(lockedBy), method_name='run')
            mainLog.debug('try to get workers to kill')
            # get workers to kill
            workersToKill = self.dbProxy.get_workers_to_kill(harvester_config.sweeper.maxWorkers,
                                                             harvester_config.sweeper.checkInterval)
            mainLog.debug('got {0} queues to kill workers'.format(len(workersToKill)))
            # loop over all workers
            for queueName, configIdWorkSpecs in iteritems(workersToKill):
                for configID, workSpecs in iteritems(configIdWorkSpecs):
                    # get sweeper
                    if not self.queueConfigMapper.has_queue(queueName, configID):
                        mainLog.error('queue config for {0}/{1} not found'.format(queueName, configID))
                        continue
                    queueConfig = self.queueConfigMapper.get_queue(queueName, configID)
                    sweeperCore = self.pluginFactory.get_plugin(queueConfig.sweeper)
                    for workSpec in workSpecs:
                        tmpLog = self.make_logger(_logger, 'workerID={0}'.format(workSpec.workerID),
                                                  method_name='run')
                        try:
                            tmpLog.debug('start killing')
                            tmpStat, tmpOut = sweeperCore.kill_worker(workSpec)
                            tmpLog.debug('done with status={0} diag={1}'.format(tmpStat, tmpOut))
                        except Exception:
                            core_utils.dump_error_message(tmpLog)
            mainLog.debug('done kill')
            # timeout for missed
            try:
                keepMissed = harvester_config.sweeper.keepMissed
            except Exception:
                keepMissed = 24
            keepPending = 24
            # get workers for cleanup
            statusTimeoutMap = {'finished': harvester_config.sweeper.keepFinished,
                                'failed': harvester_config.sweeper.keepFailed,
                                'cancelled': harvester_config.sweeper.keepCancelled,
                                'missed': keepMissed,
                                'pending': keepPending
                                }
            workersForCleanup = self.dbProxy.get_workers_for_cleanup(harvester_config.sweeper.maxWorkers,
                                                                     statusTimeoutMap)
            mainLog.debug('got {0} queues for workers cleanup'.format(len(workersForCleanup)))
            for queueName, configIdWorkSpecs in iteritems(workersForCleanup):
                for configID, workSpecs in iteritems(configIdWorkSpecs):
                    # get sweeper
                    if not self.queueConfigMapper.has_queue(queueName, configID):
                        mainLog.error('queue config for {0}/{1} not found'.format(queueName, configID))
                        continue
                    queueConfig = self.queueConfigMapper.get_queue(queueName, configID)
                    sweeperCore = self.pluginFactory.get_plugin(queueConfig.sweeper)
                    for workSpec in workSpecs:
                        tmpLog = self.make_logger(_logger, 'workerID={0}'.format(workSpec.workerID),
                                                  method_name='run')
                        try:
                            tmpLog.debug('start cleanup')
                            tmpStat, tmpOut = sweeperCore.sweep_worker(workSpec)
                            tmpLog.debug('done with status={0} diag={1}'.format(tmpStat, tmpOut))
                            if tmpStat:
                                # delete from DB
                                self.dbProxy.delete_worker(workSpec.workerID)
                        except Exception:
                            core_utils.dump_error_message(tmpLog)
            # delete old jobs
            mainLog.debug('delete old jobs')
            jobTimeout = max(statusTimeoutMap.values()) + 1
            self.dbProxy.delete_old_jobs(jobTimeout)
            mainLog.debug('done cleanup')
            # check if being terminated
            if self.terminated(harvester_config.sweeper.sleepTime):
                mainLog.debug('terminated')
                return
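
# The Sweeper only relies on the plugin contract visible above:
# kill_worker(workspec) and sweep_worker(workspec), each returning a
# (status, diagnostic) pair, where a True status from sweep_worker lets the
# agent delete the worker record. A minimal no-op plugin sketch under that
# assumption; the class name is illustrative, PluginBase mirrors the usual
# Harvester plugin layout.
from pandaharvester.harvestercore.plugin_base import PluginBase

class NoOpSweeper(PluginBase):
    # constructor
    def __init__(self, **kwarg):
        PluginBase.__init__(self, **kwarg)

    # kill a worker: report success without touching any batch system
    def kill_worker(self, workspec):
        return True, ''

    # clean up a worker: returning True lets the agent delete the DB record
    def sweep_worker(self, workspec):
        return True, ''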