Exemplo n.º 1
0
class WorkerAdjuster(object):
    """Compute how many new workers to submit per queue, job type and
    resource type.

    The calculation combines the worker limits returned by the DB proxy,
    the cached queue status and job statistics, an optional throttler plugin
    per queue, and the caps ``maxNewWorkersPerCycle`` (queue config) and
    ``maxNewWorkers`` (harvester submitter config).  The outcome for each
    queue is handed to the ``Apfmon`` helper through ``update_label``.
    """

    # constructor
    def __init__(self, queue_config_mapper):
        """Keep the queue config mapper and create the helper objects.

        :param queue_config_mapper: object whose ``get_queue(name)`` returns
            the configuration of a queue; the result may be None.
        """
        self.queue_configMapper = queue_config_mapper
        self.pluginFactory = PluginFactory()
        self.dbProxy = DBProxy()
        # queue name -> throttler plugin, or None when the queue config has
        # no 'throttler' attribute
        self.throttlerMap = dict()
        self.apf_mon = Apfmon(self.queue_configMapper)
        # optional cap applied to every queue; None when the submitter
        # section of the harvester config does not define it
        try:
            self.maxNewWorkers = harvester_config.submitter.maxNewWorkers
        except AttributeError:
            self.maxNewWorkers = None

    # define number of workers to submit based on various information
    def define_num_workers(self, static_num_workers, site_name):
        """Define the number of new workers for every queue.

        :param static_num_workers: nested dict of the form
            ``{queue_name: {job_type: {resource_type: stats}}}``.  Each
            ``stats`` dict must provide ``nQueue``, ``nReady`` and
            ``nRunning``; ``nNewWorkers`` is read when the queue runs in
            ``'slave'`` mode.  The argument itself is not modified.
        :param site_name: used only to label the log messages.
        :return: a deep copy of ``static_num_workers`` whose ``nNewWorkers``
            values have been recomputed, or None when any exception was
            raised (the error is dumped to the log).
        """
        tmp_log = core_utils.make_logger(_logger,
                                         'site={0}'.format(site_name),
                                         method_name='define_num_workers')
        tmp_log.debug('start')
        tmp_log.debug('static_num_workers: {0}'.format(static_num_workers))
        dyn_num_workers = copy.deepcopy(static_num_workers)
        try:
            # get queue status
            queue_stat = self.dbProxy.get_cache("panda_queues.json", None)
            if queue_stat is None:
                queue_stat = dict()
            else:
                queue_stat = queue_stat.data

            # get job statistics
            job_stats = self.dbProxy.get_cache("job_statistics.json", None)
            if job_stats is None:
                job_stats = dict()
            else:
                job_stats = job_stats.data

            # define num of new workers
            for queue_name in static_num_workers:
                # get queue
                queue_config = self.queue_configMapper.get_queue(queue_name)
                worker_limits_dict = self.dbProxy.get_worker_limits(queue_name)
                max_workers = worker_limits_dict.get('maxWorkers', 0)
                n_queue_limit = worker_limits_dict.get('nQueueLimitWorker', 0)
                n_queue_limit_per_rt = worker_limits_dict[
                    'nQueueLimitWorkerPerRT']
                # totals accumulated over the concrete (non-'ANY') entries
                n_queue_total, n_ready_total, n_running_total = 0, 0, 0
                # apf_msg stays None unless submission is blocked for the
                # whole queue; apf_data is filled only when apf_msg is empty
                apf_msg = None
                apf_data = None
                for job_type, jt_values in iteritems(
                        static_num_workers[queue_name]):
                    for resource_type, tmp_val in iteritems(jt_values):
                        tmp_log.debug(
                            'Processing queue {0} job_type {1} resource_type {2} with static_num_workers {3}'
                            .format(queue_name, job_type, resource_type,
                                    tmp_val))

                        # set 0 to num of new workers when the queue is disabled
                        if queue_name in queue_stat and queue_stat[queue_name][
                                'status'] in [
                                    'offline', 'standby', 'maintenance'
                                ]:
                            dyn_num_workers[queue_name][job_type][
                                resource_type]['nNewWorkers'] = 0
                            ret_msg = 'set n_new_workers=0 since status={0}'.format(
                                queue_stat[queue_name]['status'])
                            tmp_log.debug(ret_msg)
                            apf_msg = 'Not submitting workers since queue status = {0}'.format(
                                queue_stat[queue_name]['status'])
                            continue

                        # protection against not-up-to-date queue config
                        if queue_config is None:
                            dyn_num_workers[queue_name][job_type][
                                resource_type]['nNewWorkers'] = 0
                            ret_msg = 'set n_new_workers=0 due to missing queue_config'
                            tmp_log.debug(ret_msg)
                            apf_msg = 'Not submitting workers because of missing queue_config'
                            continue

                        # get throttler (created once per queue and cached)
                        if queue_name not in self.throttlerMap:
                            if hasattr(queue_config, 'throttler'):
                                throttler = self.pluginFactory.get_plugin(
                                    queue_config.throttler)
                            else:
                                throttler = None
                            self.throttlerMap[queue_name] = throttler

                        # check throttler
                        throttler = self.throttlerMap[queue_name]
                        if throttler is not None:
                            to_throttle, tmp_msg = throttler.to_be_throttled(
                                queue_config)
                            if to_throttle:
                                dyn_num_workers[queue_name][job_type][
                                    resource_type]['nNewWorkers'] = 0
                                ret_msg = 'set n_new_workers=0 by {0}:{1}'.format(
                                    throttler.__class__.__name__, tmp_msg)
                                tmp_log.debug(ret_msg)
                                continue

                        # check stats
                        n_queue = tmp_val['nQueue']
                        n_ready = tmp_val['nReady']
                        n_running = tmp_val['nRunning']
                        if resource_type != 'ANY' and job_type != 'ANY' and job_type is not None:
                            n_queue_total += n_queue
                            n_ready_total += n_ready
                            n_running_total += n_running
                        if queue_config.runMode == 'slave':
                            # in slave mode the incoming nNewWorkers is an
                            # upper bound; zero means nothing to submit
                            n_new_workers_def = tmp_val['nNewWorkers']
                            if n_new_workers_def == 0:
                                dyn_num_workers[queue_name][job_type][
                                    resource_type]['nNewWorkers'] = 0
                                ret_msg = 'set n_new_workers=0 by panda in slave mode'
                                tmp_log.debug(ret_msg)
                                continue
                        else:
                            n_new_workers_def = None

                        # define num of new workers based on static site config
                        n_new_workers = 0
                        if n_queue >= n_queue_limit_per_rt > 0:
                            # enough queued workers
                            ret_msg = 'No n_new_workers since n_queue({0})>=n_queue_limit_per_rt({1})'.format(
                                n_queue, n_queue_limit_per_rt)
                            tmp_log.debug(ret_msg)
                        elif (n_queue + n_ready +
                              n_running) >= max_workers > 0:
                            # enough workers in the system
                            ret_msg = 'No n_new_workers since n_queue({0}) + n_ready({1}) + n_running({2}) '.format(
                                n_queue, n_ready, n_running)
                            ret_msg += '>= max_workers({0})'.format(
                                max_workers)
                            tmp_log.debug(ret_msg)
                        else:

                            max_queued_workers = None

                            if n_queue_limit_per_rt > 0:  # there is a limit set for the queue
                                max_queued_workers = n_queue_limit_per_rt

                            # Reset the maxQueueWorkers according to particular
                            if n_new_workers_def is not None:  # don't surpass limits given centrally
                                maxQueuedWorkers_slave = n_new_workers_def + n_queue
                                if max_queued_workers is not None:
                                    max_queued_workers = min(
                                        maxQueuedWorkers_slave,
                                        max_queued_workers)
                                else:
                                    max_queued_workers = maxQueuedWorkers_slave

                            elif queue_config.mapType == 'NoJob':  # for pull mode, limit to activated jobs
                                # limit the queue to the number of activated jobs to avoid empty pilots
                                try:
                                    n_activated = max(
                                        job_stats[queue_name]['activated'],
                                        1)  # avoid no activity queues
                                except KeyError:
                                    tmp_log.warning(
                                        'n_activated not defined, defaulting to configured queue limits'
                                    )
                                else:
                                    queue_limit = max_queued_workers
                                    # Without a configured limit there is
                                    # nothing to take the minimum with
                                    # (min(int, None) raises TypeError on
                                    # Python 3); the default of 1 applied
                                    # below never exceeds n_activated.
                                    if max_queued_workers is not None:
                                        max_queued_workers = min(
                                            n_activated, max_queued_workers)
                                    tmp_log.debug(
                                        'limiting max_queued_workers to min(n_activated={0}, queue_limit={1})'
                                        .format(n_activated, queue_limit))

                            if max_queued_workers is None:  # no value found, use default value
                                max_queued_workers = 1

                            # new workers
                            n_new_workers = max(max_queued_workers - n_queue,
                                                0)
                            tmp_log.debug(
                                'setting n_new_workers to {0} in max_queued_workers calculation'
                                .format(n_new_workers))
                            if max_workers > 0:
                                n_new_workers = min(
                                    n_new_workers,
                                    max(
                                        max_workers - n_queue - n_ready -
                                        n_running, 0))
                                tmp_log.debug(
                                    'setting n_new_workers to {0} to respect max_workers'
                                    .format(n_new_workers))
                        if queue_config.maxNewWorkersPerCycle > 0:
                            n_new_workers = min(
                                n_new_workers,
                                queue_config.maxNewWorkersPerCycle)
                            tmp_log.debug(
                                'setting n_new_workers to {0} in order to respect maxNewWorkersPerCycle'
                                .format(n_new_workers))
                        if self.maxNewWorkers is not None and self.maxNewWorkers > 0:
                            n_new_workers = min(n_new_workers,
                                                self.maxNewWorkers)
                            tmp_log.debug(
                                'setting n_new_workers to {0} in order to respect universal maxNewWorkers'
                                .format(n_new_workers))
                        dyn_num_workers[queue_name][job_type][resource_type][
                            'nNewWorkers'] = n_new_workers

                # adjust n_new_workers for UCORE to let aggregations over RT respect nQueueLimitWorker and max_workers
                if queue_config is None:
                    max_new_workers_per_cycle = 0
                    ret_msg = 'set max_new_workers_per_cycle=0 in UCORE aggregation due to missing queue_config'
                    tmp_log.debug(ret_msg)
                else:
                    max_new_workers_per_cycle = queue_config.maxNewWorkersPerCycle
                if len(dyn_num_workers[queue_name]) > 1:
                    # sum of what the concrete job/resource types ask for
                    total_new_workers_rts = 0
                    for _jt in dyn_num_workers[queue_name]:
                        for _rt in dyn_num_workers[queue_name][_jt]:
                            if _jt != 'ANY' and _rt != 'ANY':
                                total_new_workers_rts = total_new_workers_rts + dyn_num_workers[
                                    queue_name][_jt][_rt]['nNewWorkers']
                    # aggregate cap: room left under nQueueLimitWorker and
                    # under maxWorkers, whichever is smaller
                    n_new_workers_max_agg = min(
                        max(n_queue_limit - n_queue_total, 0),
                        max(
                            max_workers - n_queue_total - n_ready_total -
                            n_running_total, 0))
                    # NOTE(review): a value of 0 zeroes the aggregate here,
                    # whereas the per-type check above ignores 0 -- confirm
                    # the asymmetry is intended.
                    if max_new_workers_per_cycle >= 0:
                        n_new_workers_max_agg = min(n_new_workers_max_agg,
                                                    max_new_workers_per_cycle)
                    if self.maxNewWorkers is not None and self.maxNewWorkers > 0:
                        n_new_workers_max_agg = min(n_new_workers_max_agg,
                                                    self.maxNewWorkers)

                    # exceeded max, to adjust
                    if total_new_workers_rts > n_new_workers_max_agg:
                        if n_new_workers_max_agg == 0:
                            for job_type in dyn_num_workers[queue_name]:
                                for resource_type in dyn_num_workers[
                                        queue_name][job_type]:
                                    dyn_num_workers[queue_name][job_type][
                                        resource_type]['nNewWorkers'] = 0
                            tmp_log.debug(
                                'No n_new_workers since n_new_workers_max_agg=0 for UCORE'
                            )
                        else:
                            tmp_log.debug(
                                'n_new_workers_max_agg={0} for UCORE'.format(
                                    n_new_workers_max_agg))
                            _d = dyn_num_workers[queue_name].copy()
                            # the 'ANY' job type may be absent; a plain del
                            # would raise KeyError and abort the whole call
                            _d.pop('ANY', None)

                            # TODO: needs to be recalculated
                            # entries: [(resource_type, job_type), requested, remainder]
                            simple_rt_nw_list = []
                            for job_type in _d:  # jt: job type
                                for resource_type in _d[
                                        job_type]:  # rt: resource type
                                    simple_rt_nw_list.append([
                                        (resource_type, job_type),
                                        _d[job_type][resource_type].get(
                                            'nNewWorkers', 0), 0
                                    ])

                            # first pass: proportional share rounded down,
                            # remembering each remainder
                            _countdown = n_new_workers_max_agg
                            for _rt_list in simple_rt_nw_list:
                                (resource_type,
                                 job_type), n_new_workers_orig, _r = _rt_list
                                n_new_workers, remainder = divmod(
                                    n_new_workers_orig * n_new_workers_max_agg,
                                    total_new_workers_rts)
                                dyn_num_workers[queue_name][
                                    job_type].setdefault(
                                        resource_type, {
                                            'nReady': 0,
                                            'nRunning': 0,
                                            'nQueue': 0,
                                            'nNewWorkers': 0
                                        })
                                dyn_num_workers[queue_name][job_type][
                                    resource_type][
                                        'nNewWorkers'] = n_new_workers
                                _rt_list[2] = remainder
                                _countdown -= n_new_workers
                            # second pass: hand out the leftover slots one by
                            # one, largest remainder first; the stable sort
                            # breaks ties in favour of the smaller request
                            _s_list = sorted(simple_rt_nw_list,
                                             key=(lambda x: x[1]))
                            sorted_rt_nw_list = sorted(_s_list,
                                                       key=(lambda x: x[2]),
                                                       reverse=True)
                            for (
                                    resource_type, job_type
                            ), n_new_workers_orig, remainder in sorted_rt_nw_list:
                                if _countdown <= 0:
                                    break
                                dyn_num_workers[queue_name][job_type][
                                    resource_type]['nNewWorkers'] += 1
                                _countdown -= 1
                        for job_type in dyn_num_workers[queue_name]:
                            for resource_type in dyn_num_workers[queue_name][
                                    job_type]:
                                if job_type == 'ANY' or resource_type == 'ANY':
                                    continue
                                n_new_workers = dyn_num_workers[queue_name][
                                    job_type][resource_type]['nNewWorkers']
                                tmp_log.debug(
                                    'setting n_new_workers to {0} of job_type {1} resource_type {2} in order to respect RT aggregations for UCORE'
                                    .format(n_new_workers, job_type,
                                            resource_type))

                if not apf_msg:
                    apf_data = copy.deepcopy(dyn_num_workers[queue_name])

                self.apf_mon.update_label(queue_name, apf_msg, apf_data)

            # dump
            tmp_log.debug('defined {0}'.format(str(dyn_num_workers)))
            return dyn_num_workers
        except Exception:
            # dump error
            core_utils.dump_error_message(tmp_log)
            return None
Exemplo n.º 2
0
class WorkerAdjuster:
    """Compute how many new workers to submit per queue and resource type.

    The calculation uses the cached queue status, the limits found on the
    queue configuration (``nQueueLimitWorker``, ``maxWorkers``,
    ``maxNewWorkersPerCycle``) and an optional throttler plugin per queue.
    """

    # constructor
    def __init__(self, queue_config_mapper):
        """Keep the queue config mapper and create the helper objects.

        :param queue_config_mapper: object whose ``get_queue(name)`` returns
            the configuration of a queue.
        """
        self.queueConfigMapper = queue_config_mapper
        self.pluginFactory = PluginFactory()
        self.dbProxy = DBProxy()
        # queue name -> throttler plugin, or None when the queue config has
        # no 'throttler' attribute
        self.throttlerMap = dict()

    # define number of workers to submit based on various information
    def define_num_workers(self, static_num_workers, site_name):
        """Define the number of new workers for every queue.

        :param static_num_workers: nested dict of the form
            ``{queue_name: {resource_type: stats}}``.  Each ``stats`` dict
            must provide ``nQueue``, ``nReady`` and ``nRunning``;
            ``nNewWorkers`` is read when the queue runs in ``'slave'`` mode.
            The argument itself is not modified.
        :param site_name: used only to label the log messages.
        :return: a deep copy of ``static_num_workers`` whose ``nNewWorkers``
            values have been recomputed, or None when any exception was
            raised (the error is dumped to the log).
        """
        tmpLog = core_utils.make_logger(_logger, 'site={0}'.format(site_name), method_name='define_num_workers')
        tmpLog.debug('start')
        tmpLog.debug('static_num_workers: {0}'.format(static_num_workers))
        dyn_num_workers = copy.deepcopy(static_num_workers)
        try:
            # get queue status
            queueStat = self.dbProxy.get_cache("panda_queues.json", None)
            if queueStat is None:
                queueStat = dict()
            else:
                queueStat = queueStat.data

            # define num of new workers
            for queueName in static_num_workers:
                # get queue (same for every resource type of the queue)
                queueConfig = self.queueConfigMapper.get_queue(queueName)

                for resource_type, tmpVal in iteritems(static_num_workers[queueName]):
                    tmpLog.debug('Processing queue {0} resource {1} with static_num_workers {2}'.
                                 format(queueName, resource_type, tmpVal))

                    # set 0 to num of new workers when the queue is disabled
                    if queueName in queueStat and queueStat[queueName]['status'] in ['offline', 'standby',
                                                                                     'maintenance']:
                        dyn_num_workers[queueName][resource_type]['nNewWorkers'] = 0
                        retMsg = 'set nNewWorkers=0 since status={0}'.format(queueStat[queueName]['status'])
                        tmpLog.debug(retMsg)
                        continue

                    # protection against a missing queue config: without it
                    # the attribute access below fails and the whole site
                    # gets None; checked before the throttler lookup so that
                    # no throttler entry is cached for an unknown queue
                    if queueConfig is None:
                        dyn_num_workers[queueName][resource_type]['nNewWorkers'] = 0
                        retMsg = 'set nNewWorkers=0 due to missing queueConfig'
                        tmpLog.debug(retMsg)
                        continue

                    # get throttler (created once per queue and cached)
                    if queueName not in self.throttlerMap:
                        if hasattr(queueConfig, 'throttler'):
                            throttler = self.pluginFactory.get_plugin(queueConfig.throttler)
                        else:
                            throttler = None
                        self.throttlerMap[queueName] = throttler

                    # check throttler
                    throttler = self.throttlerMap[queueName]
                    if throttler is not None:
                        toThrottle, tmpMsg = throttler.to_be_throttled(queueConfig)
                        if toThrottle:
                            dyn_num_workers[queueName][resource_type]['nNewWorkers'] = 0
                            retMsg = 'set nNewWorkers=0 by {0}:{1}'.format(throttler.__class__.__name__, tmpMsg)
                            tmpLog.debug(retMsg)
                            continue

                    # check stats
                    nQueue = tmpVal['nQueue']
                    nReady = tmpVal['nReady']
                    nRunning = tmpVal['nRunning']
                    nQueueLimit = queueConfig.nQueueLimitWorker
                    maxWorkers = queueConfig.maxWorkers
                    if queueConfig.runMode == 'slave':
                        # in slave mode the incoming nNewWorkers is an upper
                        # bound; zero means nothing to submit
                        nNewWorkersDef = tmpVal['nNewWorkers']
                        if nNewWorkersDef == 0:
                            dyn_num_workers[queueName][resource_type]['nNewWorkers'] = 0
                            retMsg = 'set nNewWorkers=0 by panda in slave mode'
                            tmpLog.debug(retMsg)
                            continue
                    else:
                        nNewWorkersDef = None

                    # define num of new workers based on static site config
                    nNewWorkers = 0
                    if nQueue >= nQueueLimit > 0:
                        # enough queued workers
                        retMsg = 'No nNewWorkers since nQueue({0})>=nQueueLimit({1})'.format(nQueue, nQueueLimit)
                        tmpLog.debug(retMsg)
                    elif (nQueue + nReady + nRunning) >= maxWorkers > 0:
                        # enough workers in the system
                        retMsg = 'No nNewWorkers since nQueue({0}) + nReady({1}) + nRunning({2}) '.format(nQueue,
                                                                                                          nReady,
                                                                                                          nRunning)
                        retMsg += '>= maxWorkers({0})'.format(maxWorkers)
                        tmpLog.debug(retMsg)
                    else:

                        maxQueuedWorkers = None

                        if nQueueLimit > 0:  # there is a limit set for the queue
                            maxQueuedWorkers = nQueueLimit

                        if nNewWorkersDef is not None:  # don't surpass limits given centrally
                            maxQueuedWorkers_slave = nNewWorkersDef + nQueue
                            if maxQueuedWorkers is not None:
                                maxQueuedWorkers = min(maxQueuedWorkers_slave, maxQueuedWorkers)
                            else:
                                maxQueuedWorkers = maxQueuedWorkers_slave

                        if maxQueuedWorkers is None:  # no value found, use default value
                            maxQueuedWorkers = 1

                        # new workers
                        nNewWorkers = max(maxQueuedWorkers - nQueue, 0)
                        tmpLog.debug('setting nNewWorkers to {0} in maxQueuedWorkers calculation'
                                     .format(nNewWorkers))
                        if maxWorkers > 0:
                            nNewWorkers = min(nNewWorkers, max(maxWorkers - nQueue - nReady - nRunning, 0))
                            tmpLog.debug('setting nNewWorkers to {0} to respect maxWorkers'
                                         .format(nNewWorkers))
                    if queueConfig.maxNewWorkersPerCycle > 0:
                        nNewWorkers = min(nNewWorkers, queueConfig.maxNewWorkersPerCycle)
                        tmpLog.debug('setting nNewWorkers to {0} in order to respect maxNewWorkersPerCycle'
                                     .format(nNewWorkers))
                    dyn_num_workers[queueName][resource_type]['nNewWorkers'] = nNewWorkers
            # dump
            tmpLog.debug('defined {0}'.format(str(dyn_num_workers)))
            return dyn_num_workers
        except Exception:
            # dump error; KeyboardInterrupt/SystemExit are no longer swallowed
            core_utils.dump_error_message(tmpLog)
            return None
Exemplo n.º 3
0
class ARCSubmitter(PluginBase):
    '''Submitter for ARC CE'''

    def __init__(self, **kwarg):
        '''Set up DB connection and credentials'''
        PluginBase.__init__(self, **kwarg)

        self.dbproxy = DBProxy()
        self.schedulerid = harvester_config.master.harvester_id

        # Credential dictionary role: proxy file
        # (the role is the part after '=' of each credmanager.voms entry,
        # paired by position with credmanager.outCertFile)
        self.certs = dict(zip([r.split('=')[1] for r in list(harvester_config.credmanager.voms)],
                              list(harvester_config.credmanager.outCertFile)))
        self.cred_type = arc.initializeCredentialsType(arc.initializeCredentialsType.SkipCredentials)


    def _run_submit(self, thr):
        '''Run a thread to do the submission

        :param thr: thread-like object exposing start(), join(), is_alive(),
            a ``userconfig`` with a Timeout() method and a ``job`` attribute.
        :return: ``thr.job`` once the thread has finished.
        :raises Exception: if the thread is still alive after the wait
            ("Submission timeout") or finished with ``job`` set to None
            ("Submission failed").
        '''

        try:
            thr.start()
        except Exception:
            # Best effort, as before, but no longer swallowing
            # KeyboardInterrupt/SystemExit; a thread that did not start is
            # caught by the checks below.
            pass

        # Be careful to wait longer than submission timeout
        thr.join(thr.userconfig.Timeout() + 60.0)
        # is_alive() replaces isAlive(), which was removed in Python 3.9
        if thr.is_alive():
            # abort due to timeout and try again
            raise Exception("Submission timeout")
        if thr.job is None:
            raise Exception("Submission failed")

        return thr.job


    def _arc_submit(self, xrsl, arcces, userconfig, log):
        '''Check the available CEs and submit

        :param xrsl: job description; converted with str() and parsed by
            arc.JobDescription_Parse.
        :param arcces: iterable of (ce_endpoint, ce_queue) tuples.
        :param userconfig: ARC user configuration used for the information
            query and for the submission thread.
        :param log: logger for progress messages.
        :return: the job produced by the submission thread.
        :raises Exception: if no execution target matches the requested
            host/queue pairs, the job description cannot be parsed, or the
            submission thread times out or fails.
        '''

        queuelist = []

        for arcce in arcces:
            (ce_endpoint, ce_queue) = arcce
            aris = arc.URL(str(ce_endpoint))
            ce_host = aris.Host()
            if aris.Protocol() == 'https':
                aris.ChangePath('/arex')
                infoendpoints = [arc.Endpoint(aris.str(),
                                              arc.Endpoint.COMPUTINGINFO,
                                              'org.ogf.glue.emies.resourceinfo')]
            else:
                aris = 'ldap://'+aris.Host()+'/mds-vo-name=local,o=grid'
                infoendpoints = [arc.Endpoint(aris,
                                              arc.Endpoint.COMPUTINGINFO,
                                              'org.nordugrid.ldapng')]

            # retriever contains a list of CE endpoints
            retriever = arc.ComputingServiceRetriever(userconfig, infoendpoints)
            retriever.wait()
            # targets is the list of queues
            # parse target.ComputingService.ID for the CE hostname
            # target.ComputingShare.Name is the queue name
            targets = retriever.GetExecutionTargets()

            # Filter only sites for this process
            for target in targets:
                if not target.ComputingService.ID:
                    log.info("Target {0} does not have ComputingService ID defined, skipping".format(target.ComputingService.Name))
                    continue
                # If EMI-ES infoendpoint, force EMI-ES submission
                if infoendpoints[0].InterfaceName == 'org.ogf.glue.emies.resourceinfo' \
                  and target.ComputingEndpoint.InterfaceName != 'org.ogf.glue.emies.activitycreation':
                    log.debug("Rejecting target interface {0} because not EMI-ES".format(target.ComputingEndpoint.InterfaceName))
                    continue
                # Check for matching host and queue
                targethost = re.sub(':arex$', '', re.sub('urn:ogf:ComputingService:', '', target.ComputingService.ID))
                targetqueue = target.ComputingShare.Name
                if targethost != ce_host:
                    log.debug('Rejecting target host {0} as it does not match {1}'.format(targethost, ce_host))
                    continue
                if targetqueue != ce_queue:
                    log.debug('Rejecting target queue {0} as it does not match {1}'.format(targetqueue, ce_queue))
                    continue

                queuelist.append(target)
                log.debug("Adding target {0}:{1}".format(targethost, targetqueue))

        # check if any queues are available, if not leave and try again next time
        if not queuelist:
            raise Exception("No free queues available")

        log.debug("preparing submission")
        jobdescs = arc.JobDescriptionList()
        if not arc.JobDescription_Parse(str(xrsl), jobdescs):
            raise Exception("Failed to prepare job description")

        # Run the submission in a separate thread
        thr = SubmitThr(queuelist, jobdescs, userconfig)
        return self._run_submit(thr)


    def _set_logdir(self, site):
        '''Return the relative log directory "<YYYY-MM-DD>/<site>" for today (local time)'''
        date = time.strftime('%Y-%m-%d')
        return os.path.join(date, site)


    # submit workers
    def submit_workers(self, workspec_list):
        '''Submit the job of each worker to an ARC CE

        :param workspec_list: worker specs; every job spec returned by
            get_jobspec_list() of a worker is submitted.
        :return: list of (success, message) tuples.
        :raises Exception: if the panda queue or object store information is
            missing from the database cache.

        NOTE(review): one tuple is appended per job spec, not per worker, so
        the result only lines up with workspec_list when each worker carries
        exactly one job -- confirm against the caller.
        '''
        retlist = []

        # Get queue info from DB
        pandaqueues = self.dbproxy.get_cache("panda_queues.json", None)
        if pandaqueues is None:
            raise Exception("Failed to get panda queue info from database")
        pandaqueues = pandaqueues.data

        osmap = self.dbproxy.get_cache("ddmendpoints_objectstores.json", None)
        if osmap is None:
            raise Exception("Failed to get Object Store info from database")
        osmap = osmap.data

        for workspec in workspec_list:

            arclog = arc_utils.ARCLogger(baselogger, workspec.workerID)
            tmplog = arclog.log

            # Assume for aCT that jobs are always pre-fetched (no late-binding)
            for jobspec in workspec.get_jobspec_list():

                tmplog.debug("JobSpec: {0}".format(jobspec.values_map()))

                if jobspec.computingSite not in pandaqueues:
                    retlist.append((False, "No queue information for {0}".format(jobspec.computingSite)))
                    continue

                # Get CEs from panda queue info
                # List of (endpoint, queue) tuples
                arcces = []
                for endpoint in pandaqueues[jobspec.computingSite]['queues']:
                    ce_endpoint = endpoint['ce_endpoint']
                    # endpoints given without a scheme get gsiftp://
                    if not re.search('://', ce_endpoint):
                        ce_endpoint = 'gsiftp://%s' % ce_endpoint
                    ce_queue = endpoint['ce_queue_name']
                    arcces.append((ce_endpoint, ce_queue))

                if not arcces:
                    # the stray '%' that preceded the placeholder was a typo
                    retlist.append((False, "No CEs defined for {0}".format(jobspec.computingSite)))
                    continue

                # Set true pilot or not
                queueconfigmapper = QueueConfigMapper()
                queueconfig = queueconfigmapper.get_queue(jobspec.computingSite)
                pandaqueues[jobspec.computingSite]['truepilot'] = 'running' in queueconfig.noHeartbeat

                # Set log URL for GTAG env in job description
                logbaseurl = queueconfig.submitter.get('logBaseURL')
                logsubdir = self._set_logdir(jobspec.computingSite)
                logfileurl = '/'.join([logbaseurl, logsubdir, '%d.out' % jobspec.PandaID]) if logbaseurl else None

                tmplog.debug("Converting to ARC XRSL format")
                arcxrsl = ARCParser(jobspec.jobParams,
                                    jobspec.computingSite,
                                    pandaqueues[jobspec.computingSite],
                                    logfileurl,
                                    self.schedulerid,
                                    osmap,
                                    '/tmp', # tmpdir, TODO common tmp dir
                                    None, #jobSpec.eventranges, # TODO event ranges
                                    tmplog)
                arcxrsl.parse()
                xrsl = arcxrsl.getXrsl()
                tmplog.debug("ARC xrsl: {0}".format(xrsl))

                # Set the files to be downloaded at the end of the job
                downloadfiles = 'gmlog/errors'
                if 'logFile' in jobspec.jobParams:
                    downloadfiles += ';%s' %jobspec.jobParams['logFile'].replace('.tgz', '')
                if not pandaqueues[jobspec.computingSite]['truepilot']:
                    downloadfiles += ';jobSmallFiles.tgz'

                # Set certificate
                userconfig = arc.UserConfig(self.cred_type)
                proxyrole = ''
                if jobspec.jobParams['prodSourceLabel'] == 'user':
                    userconfig.ProxyPath(str(self.certs['pilot']))
                    proxyrole = 'pilot'
                else:
                    userconfig.ProxyPath(str(self.certs['production']))
                    proxyrole = 'production'
                tmplog.debug("Submitting using {0} proxy at {1}".format(proxyrole, userconfig.ProxyPath()))

                try:
                    tmplog.debug("Submission targets: {0}".format(arcces))
                    arcjob = self._arc_submit(xrsl, arcces, userconfig, tmplog)
                    tmplog.info("ARC CE job id {0}".format(arcjob.JobID))
                    arc_utils.arcjob2workspec(arcjob, workspec)
                    workspec.workAttributes['arcdownloadfiles'] = downloadfiles
                    workspec.workAttributes['proxyrole'] = proxyrole
                    workspec.workAttributes['logsubdir'] = logsubdir
                    workspec.batchID = arcjob.JobID
                    tmplog.debug(workspec.workAttributes)
                    result = (True, '')
                except Exception as exc:
                    # a failed submission is reported in the result list
                    # instead of aborting the remaining jobs
                    tmplog.error(traceback.format_exc())
                    result = (False, "Failed to submit ARC job: {0}".format(str(exc)))

                retlist.append(result)

        return retlist
Exemplo n.º 4
0
class ARCSubmitter(PluginBase):
    '''Submitter plugin that converts pre-fetched jobs to xRSL and submits them to ARC CEs'''
    def __init__(self, **kwarg):
        '''Set up DB connection and credentials

        All keyword arguments are passed on unchanged to PluginBase.
        '''
        PluginBase.__init__(self, **kwarg)

        self.dbproxy = DBProxy()
        self.schedulerid = harvester_config.master.harvester_id

        # Credential dictionary role: proxy file
        # The role is the second '='-separated field of each credmanager.voms
        # entry; it is paired positionally with credmanager.outCertFile
        self.certs = dict(
            zip([
                r.split('=')[1]
                for r in list(harvester_config.credmanager.voms)
            ], list(harvester_config.credmanager.outCertFile)))
        self.cred_type = arc.initializeCredentialsType(
            arc.initializeCredentialsType.SkipCredentials)

    def _run_submit(self, thr):
        '''Run a thread to do the submission

        :param thr: submission thread; must provide start(), join(),
                    is_alive(), a userconfig with Timeout() and a job attribute
        :return: thr.job once the thread has finished
        :raises Exception: if the thread is still alive after the wait
                           ("Submission timeout") or finished without setting
                           a job ("Submission failed")
        '''

        try:
            thr.start()
        except Exception:
            # Best effort: only ordinary errors are ignored here so that
            # KeyboardInterrupt/SystemExit still propagate. A thread that did
            # not start is presumably reported by the checks below
            pass

        # Be careful to wait longer than submission timeout
        thr.join(thr.userconfig.Timeout() + 60.0)
        # is_alive() replaces isAlive(), which no longer exists in Python >= 3.9
        if thr.is_alive():
            # abort due to timeout and try again
            raise Exception("Submission timeout")
        if thr.job is None:
            raise Exception("Submission failed")

        return thr.job

    def _arc_submit(self, xrsl, arcces, userconfig, log):
        '''Check the available CEs and submit

        :param xrsl: job description; converted with str() before parsing
        :param arcces: iterable of (ce_endpoint, ce_queue) tuples to consider
        :param userconfig: arc.UserConfig used for info retrieval and submission
        :param log: logger used for progress messages
        :return: the job object produced by the submission thread
        :raises Exception: if no execution target matches the requested
                           host/queue pairs, if the job description cannot be
                           parsed, or if the submission itself fails
        '''

        queuelist = []

        for arcce in arcces:
            (ce_endpoint, ce_queue) = arcce
            aris = arc.URL(str(ce_endpoint))
            ce_host = aris.Host()
            if aris.Protocol() == 'https':
                # https endpoints are queried through the EMI-ES resource info interface
                aris.ChangePath('/arex')
                infoendpoints = [
                    arc.Endpoint(aris.str(), arc.Endpoint.COMPUTINGINFO,
                                 'org.ogf.glue.emies.resourceinfo')
                ]
            else:
                # everything else is queried through the LDAP information system
                aris = 'ldap://' + aris.Host() + '/mds-vo-name=local,o=grid'
                infoendpoints = [
                    arc.Endpoint(aris, arc.Endpoint.COMPUTINGINFO,
                                 'org.nordugrid.ldapng')
                ]

            # retriever contains a list of CE endpoints
            retriever = arc.ComputingServiceRetriever(userconfig,
                                                      infoendpoints)
            retriever.wait()
            # targets is the list of queues
            # parse target.ComputingService.ID for the CE hostname
            # target.ComputingShare.Name is the queue name
            targets = retriever.GetExecutionTargets()

            # Filter only sites for this process
            for target in targets:
                if not target.ComputingService.ID:
                    log.info(
                        "Target {0} does not have ComputingService ID defined, skipping"
                        .format(target.ComputingService.Name))
                    continue
                # If EMI-ES infoendpoint, force EMI-ES submission
                if infoendpoints[0].InterfaceName == 'org.ogf.glue.emies.resourceinfo' \
                  and target.ComputingEndpoint.InterfaceName != 'org.ogf.glue.emies.activitycreation':
                    log.debug(
                        "Rejecting target interface {0} because not EMI-ES".
                        format(target.ComputingEndpoint.InterfaceName))
                    continue
                # Check for matching host and queue
                # The host is the service ID stripped of its URN prefix and ':arex' suffix
                targethost = re.sub(
                    ':arex$', '',
                    re.sub('urn:ogf:ComputingService:', '',
                           target.ComputingService.ID))
                targetqueue = target.ComputingShare.Name
                if targethost != ce_host:
                    log.debug(
                        'Rejecting target host {0} as it does not match {1}'.
                        format(targethost, ce_host))
                    continue
                if targetqueue != ce_queue:
                    log.debug(
                        'Rejecting target queue {0} as it does not match {1}'.
                        format(targetqueue, ce_queue))
                    continue

                queuelist.append(target)
                log.debug("Adding target {0}:{1}".format(
                    targethost, targetqueue))

        # check if any queues are available, if not leave and try again next time
        if not queuelist:
            raise Exception("No free queues available")

        log.debug("preparing submission")
        jobdescs = arc.JobDescriptionList()
        if not arc.JobDescription_Parse(str(xrsl), jobdescs):
            raise Exception("Failed to prepare job description")

        # Run the submission in a separate thread
        thr = SubmitThr(queuelist, jobdescs, userconfig)
        return self._run_submit(thr)

    def _set_logdir(self, site):
        '''Return the relative log directory "<YYYY-MM-DD>/<site>" for today's local date'''
        date = time.strftime('%Y-%m-%d')
        return os.path.join(date, site)

    # submit workers
    def submit_workers(self, workspec_list):
        '''Submit the jobs attached to each worker to an ARC CE

        :param workspec_list: worker specs; each job returned by
                              get_jobspec_list() is submitted separately
        :return: list of (succeeded, message) tuples, one appended per
                 processed job spec; the message is empty on success
        :raises Exception: if the panda queue or object store information is
                           not available from the DB cache
        '''
        retlist = []

        # Get queue info from DB
        pandaqueues = self.dbproxy.get_cache("panda_queues.json", None)
        if pandaqueues is None:
            raise Exception("Failed to get panda queue info from database")
        pandaqueues = pandaqueues.data

        osmap = self.dbproxy.get_cache("ddmendpoints_objectstores.json", None)
        if osmap is None:
            raise Exception("Failed to get Object Store info from database")
        osmap = osmap.data

        for workspec in workspec_list:

            arclog = arc_utils.ARCLogger(baselogger, workspec.workerID)
            tmplog = arclog.log

            # Assume for aCT that jobs are always pre-fetched (no late-binding)
            for jobspec in workspec.get_jobspec_list():

                tmplog.debug("JobSpec: {0}".format(jobspec.values_map()))

                if jobspec.computingSite not in pandaqueues:
                    retlist.append(
                        (False, "No queue information for {0}".format(
                            jobspec.computingSite)))
                    continue

                # Get CEs from panda queue info
                # List of (endpoint, queue) tuples
                arcces = []
                for endpoint in pandaqueues[jobspec.computingSite]['queues']:
                    ce_endpoint = endpoint['ce_endpoint']
                    # endpoints given without a scheme default to gsiftp
                    if not re.search('://', ce_endpoint):
                        ce_endpoint = 'gsiftp://%s' % ce_endpoint
                    ce_queue = endpoint['ce_queue_name']
                    arcces.append((ce_endpoint, ce_queue))

                if not arcces:
                    # the message used to contain a stray '%' before the site name
                    retlist.append((False, "No CEs defined for {0}".format(
                        jobspec.computingSite)))
                    continue

                # Set true pilot or not
                queueconfigmapper = QueueConfigMapper()
                queueconfig = queueconfigmapper.get_queue(
                    jobspec.computingSite)
                pandaqueues[jobspec.computingSite][
                    'truepilot'] = 'running' in queueconfig.noHeartbeat

                # Set log URL for GTAG env in job description
                logbaseurl = queueconfig.submitter.get('logBaseURL')
                logsubdir = self._set_logdir(jobspec.computingSite)
                logfileurl = '/'.join(
                    [logbaseurl, logsubdir,
                     '%d.out' % jobspec.PandaID]) if logbaseurl else None

                tmplog.debug("Converting to ARC XRSL format")
                arcxrsl = ARCParser(
                    jobspec.jobParams,
                    jobspec.computingSite,
                    pandaqueues[jobspec.computingSite],
                    logfileurl,
                    self.schedulerid,
                    osmap,
                    '/tmp',  # tmpdir, TODO common tmp dir
                    None,  #jobSpec.eventranges, # TODO event ranges
                    tmplog)
                arcxrsl.parse()
                xrsl = arcxrsl.getXrsl()
                tmplog.debug("ARC xrsl: {0}".format(xrsl))

                # Set the files to be downloaded at the end of the job
                # (a ';'-separated list stored in the worker attributes)
                downloadfiles = 'gmlog/errors'
                if 'logFile' in jobspec.jobParams:
                    downloadfiles += ';%s' % jobspec.jobParams[
                        'logFile'].replace('.tgz', '')
                if not pandaqueues[jobspec.computingSite]['truepilot']:
                    downloadfiles += ';jobSmallFiles.tgz'

                # Set certificate
                # User jobs use the 'pilot' proxy, everything else 'production'
                userconfig = arc.UserConfig(self.cred_type)
                proxyrole = ''
                if jobspec.jobParams['prodSourceLabel'] == 'user':
                    userconfig.ProxyPath(str(self.certs['pilot']))
                    proxyrole = 'pilot'
                else:
                    userconfig.ProxyPath(str(self.certs['production']))
                    proxyrole = 'production'
                tmplog.debug("Submitting using {0} proxy at {1}".format(
                    proxyrole, userconfig.ProxyPath()))

                try:
                    tmplog.debug("Submission targets: {0}".format(arcces))
                    arcjob = self._arc_submit(xrsl, arcces, userconfig, tmplog)
                    tmplog.info("ARC CE job id {0}".format(arcjob.JobID))
                    arc_utils.arcjob2workspec(arcjob, workspec)
                    workspec.workAttributes['arcdownloadfiles'] = downloadfiles
                    workspec.workAttributes['proxyrole'] = proxyrole
                    workspec.workAttributes['logsubdir'] = logsubdir
                    workspec.batchID = arcjob.JobID
                    tmplog.debug(workspec.workAttributes)
                    result = (True, '')
                except Exception as exc:
                    # A failed submission is reported per job instead of
                    # aborting the remaining jobs
                    tmplog.error(traceback.format_exc())
                    result = (False,
                              "Failed to submit ARC job: {0}".format(str(exc)))

                retlist.append(result)

        return retlist
Exemplo n.º 5
0
class WorkerAdjuster(object):
    """Decide how many new workers to submit for each queue and resource type."""

    # constructor
    def __init__(self, queue_config_mapper):
        """Store the queue config mapper and create the helper objects.

        :param queue_config_mapper: object whose get_queue(name) returns the
                                    queue configuration, or None if unknown
        """
        self.queueConfigMapper = queue_config_mapper
        self.pluginFactory = PluginFactory()
        self.dbProxy = DBProxy()
        # throttler plugin (or None) per queue name, filled on first use
        self.throttlerMap = dict()
        self.apf_mon = Apfmon(self.queueConfigMapper)
        # global cap on new workers; None when the option is not configured
        try:
            self.maxNewWorkers = harvester_config.submitter.maxNewWorkers
        except AttributeError:
            self.maxNewWorkers = None

    # define number of workers to submit based on various information
    def define_num_workers(self, static_num_workers, site_name):
        """Compute nNewWorkers for every queue and resource type.

        :param static_num_workers: dict keyed by queue name, then by resource
                                   type, holding dicts with the keys nQueue,
                                   nReady, nRunning and nNewWorkers; it is not
                                   modified
        :param site_name: site name, used only to label the log messages
        :return: deep copy of static_num_workers with the nNewWorkers values
                 updated, or None if any exception occurred
        """
        tmpLog = core_utils.make_logger(_logger, 'site={0}'.format(site_name), method_name='define_num_workers')
        tmpLog.debug('start')
        tmpLog.debug('static_num_workers: {0}'.format(static_num_workers))
        dyn_num_workers = copy.deepcopy(static_num_workers)
        try:
            # get queue status
            queueStat = self.dbProxy.get_cache("panda_queues.json", None)
            if queueStat is None:
                queueStat = dict()
            else:
                queueStat = queueStat.data

            # get job statistics
            job_stats = self.dbProxy.get_cache("job_statistics.json", None)
            if job_stats is None:
                job_stats = dict()
            else:
                job_stats = job_stats.data

            # define num of new workers
            for queueName in static_num_workers:
                # get queue
                queueConfig = self.queueConfigMapper.get_queue(queueName)
                workerLimits_dict = self.dbProxy.get_worker_limits(queueName)
                maxWorkers = workerLimits_dict.get('maxWorkers', 0)
                nQueueLimit = workerLimits_dict.get('nQueueLimitWorker', 0)
                nQueueLimitPerRT = workerLimits_dict['nQueueLimitWorkerPerRT']
                # totals over all resource types except 'ANY', used for the aggregation below
                nQueue_total, nReady_total, nRunning_total = 0, 0, 0
                apf_msg = None
                apf_data = None
                for resource_type, tmpVal in iteritems(static_num_workers[queueName]):
                    tmpLog.debug('Processing queue {0} resource {1} with static_num_workers {2}'.
                                 format(queueName, resource_type, tmpVal))

                    # set 0 to num of new workers when the queue is disabled
                    if queueName in queueStat and queueStat[queueName]['status'] in ['offline', 'standby',
                                                                                     'maintenance']:
                        dyn_num_workers[queueName][resource_type]['nNewWorkers'] = 0
                        retMsg = 'set nNewWorkers=0 since status={0}'.format(queueStat[queueName]['status'])
                        tmpLog.debug(retMsg)
                        apf_msg = 'Not submitting workers since queue status = {0}'.format(queueStat[queueName]['status'])
                        continue

                    # protection against not-up-to-date queue config
                    if queueConfig is None:
                        dyn_num_workers[queueName][resource_type]['nNewWorkers'] = 0
                        retMsg = 'set nNewWorkers=0 due to missing queueConfig'
                        tmpLog.debug(retMsg)
                        apf_msg = 'Not submitting workers because of missing queueConfig'
                        continue

                    # get throttler
                    if queueName not in self.throttlerMap:
                        if hasattr(queueConfig, 'throttler'):
                            throttler = self.pluginFactory.get_plugin(queueConfig.throttler)
                        else:
                            throttler = None
                        self.throttlerMap[queueName] = throttler

                    # check throttler
                    throttler = self.throttlerMap[queueName]
                    if throttler is not None:
                        toThrottle, tmpMsg = throttler.to_be_throttled(queueConfig)
                        if toThrottle:
                            dyn_num_workers[queueName][resource_type]['nNewWorkers'] = 0
                            retMsg = 'set nNewWorkers=0 by {0}:{1}'.format(throttler.__class__.__name__, tmpMsg)
                            tmpLog.debug(retMsg)
                            continue

                    # check stats
                    nQueue = tmpVal['nQueue']
                    nReady = tmpVal['nReady']
                    nRunning = tmpVal['nRunning']
                    if resource_type != 'ANY':
                        nQueue_total += nQueue
                        nReady_total += nReady
                        nRunning_total += nRunning
                    if queueConfig.runMode == 'slave':
                        # in slave mode the input nNewWorkers is used as an upper bound
                        nNewWorkersDef = tmpVal['nNewWorkers']
                        if nNewWorkersDef == 0:
                            dyn_num_workers[queueName][resource_type]['nNewWorkers'] = 0
                            retMsg = 'set nNewWorkers=0 by panda in slave mode'
                            tmpLog.debug(retMsg)
                            continue
                    else:
                        nNewWorkersDef = None

                    # define num of new workers based on static site config
                    # (a limit of 0 disables the corresponding check)
                    nNewWorkers = 0
                    if nQueue >= nQueueLimitPerRT > 0:
                        # enough queued workers
                        retMsg = 'No nNewWorkers since nQueue({0})>=nQueueLimitPerRT({1})'.format(nQueue, nQueueLimitPerRT)
                        tmpLog.debug(retMsg)
                    elif (nQueue + nReady + nRunning) >= maxWorkers > 0:
                        # enough workers in the system
                        retMsg = 'No nNewWorkers since nQueue({0}) + nReady({1}) + nRunning({2}) '.format(nQueue,
                                                                                                          nReady,
                                                                                                          nRunning)
                        retMsg += '>= maxWorkers({0})'.format(maxWorkers)
                        tmpLog.debug(retMsg)
                    else:

                        maxQueuedWorkers = None

                        if nQueueLimitPerRT > 0:  # there is a limit set for the queue
                            maxQueuedWorkers = nQueueLimitPerRT

                        # Reset maxQueuedWorkers according to the run mode or map type
                        if nNewWorkersDef is not None:  # don't surpass limits given centrally
                            maxQueuedWorkers_slave = nNewWorkersDef + nQueue
                            if maxQueuedWorkers is not None:
                                maxQueuedWorkers = min(maxQueuedWorkers_slave, maxQueuedWorkers)
                            else:
                                maxQueuedWorkers = maxQueuedWorkers_slave

                        elif queueConfig.mapType == 'NoJob': # for pull mode, limit to activated jobs
                            # limit the queue to the number of activated jobs to avoid empty pilots
                            try:
                                n_activated = max(job_stats[queueName]['activated'], 1) # avoid no activity queues
                                queue_limit = maxQueuedWorkers
                                # Without a configured limit maxQueuedWorkers is still None;
                                # min(int, None) raises TypeError on Python 3, so only cap an
                                # existing limit and otherwise fall through to the default below
                                if maxQueuedWorkers is not None:
                                    maxQueuedWorkers = min(n_activated, maxQueuedWorkers)
                                tmpLog.debug('limiting maxQueuedWorkers to min(n_activated={0}, queue_limit={1})'.
                                             format(n_activated, queue_limit))
                            except KeyError:
                                tmpLog.warning('n_activated not defined, defaulting to configured queue limits')

                        if maxQueuedWorkers is None:  # no value found, use default value
                            maxQueuedWorkers = 1

                        # new workers
                        nNewWorkers = max(maxQueuedWorkers - nQueue, 0)
                        tmpLog.debug('setting nNewWorkers to {0} in maxQueuedWorkers calculation'
                                     .format(nNewWorkers))
                        if maxWorkers > 0:
                            nNewWorkers = min(nNewWorkers, max(maxWorkers - nQueue - nReady - nRunning, 0))
                            tmpLog.debug('setting nNewWorkers to {0} to respect maxWorkers'
                                         .format(nNewWorkers))
                    if queueConfig.maxNewWorkersPerCycle > 0:
                        nNewWorkers = min(nNewWorkers, queueConfig.maxNewWorkersPerCycle)
                        tmpLog.debug('setting nNewWorkers to {0} in order to respect maxNewWorkersPerCycle'
                                     .format(nNewWorkers))
                    if self.maxNewWorkers is not None and self.maxNewWorkers > 0:
                        nNewWorkers = min(nNewWorkers, self.maxNewWorkers)
                        tmpLog.debug('setting nNewWorkers to {0} in order to respect universal maxNewWorkers'
                                     .format(nNewWorkers))
                    dyn_num_workers[queueName][resource_type]['nNewWorkers'] = nNewWorkers

                # adjust nNewWorkers for UCORE to let aggregations over RT respect nQueueLimitWorker and maxWorkers
                if queueConfig is None:
                    maxNewWorkersPerCycle = 0
                    retMsg = 'set maxNewWorkersPerCycle=0 in UCORE aggregation due to missing queueConfig'
                    tmpLog.debug(retMsg)
                else:
                    maxNewWorkersPerCycle = queueConfig.maxNewWorkersPerCycle
                if len(dyn_num_workers[queueName]) > 1:
                    # sum of the per-resource-type results, leaving out 'ANY'
                    total_new_workers_rts = sum( dyn_num_workers[queueName][_rt]['nNewWorkers']
                                                if _rt != 'ANY' else 0
                                                for _rt in dyn_num_workers[queueName] )
                    # NOTE(review): unlike the per-resource-type checks above, a limit of 0
                    # (nQueueLimit, maxWorkers or maxNewWorkersPerCycle) caps the aggregate
                    # at 0 here instead of disabling the check - confirm this is intended
                    nNewWorkers_max_agg = min(
                                                max(nQueueLimit - nQueue_total, 0),
                                                max(maxWorkers - nQueue_total - nReady_total - nRunning_total, 0),
                                                )
                    if maxNewWorkersPerCycle >= 0:
                        nNewWorkers_max_agg = min(nNewWorkers_max_agg, maxNewWorkersPerCycle)
                    if self.maxNewWorkers is not None and self.maxNewWorkers > 0:
                        nNewWorkers_max_agg = min(nNewWorkers_max_agg, self.maxNewWorkers)
                    # exceeded max, to adjust
                    if total_new_workers_rts > nNewWorkers_max_agg:
                        if nNewWorkers_max_agg == 0:
                            for resource_type in dyn_num_workers[queueName]:
                                dyn_num_workers[queueName][resource_type]['nNewWorkers'] = 0
                            tmpLog.debug('No nNewWorkers since nNewWorkers_max_agg=0 for UCORE')
                        else:
                            tmpLog.debug('nNewWorkers_max_agg={0} for UCORE'.format(nNewWorkers_max_agg))
                            _d = dyn_num_workers[queueName].copy()
                            # drop 'ANY' if present; a missing key must not abort the whole calculation
                            _d.pop('ANY', None)
                            # entries are [resource type, original nNewWorkers, remainder]
                            simple_rt_nw_list = [ [_rt, _d[_rt].get('nNewWorkers', 0), 0] for _rt in _d ]
                            _countdown = nNewWorkers_max_agg
                            # scale every resource type proportionally, rounding down
                            for _rt_list in simple_rt_nw_list:
                                resource_type, nNewWorkers_orig, _r = _rt_list
                                nNewWorkers, remainder = divmod(nNewWorkers_orig*nNewWorkers_max_agg, total_new_workers_rts)
                                dyn_num_workers[queueName][resource_type]['nNewWorkers'] = nNewWorkers
                                _rt_list[2] = remainder
                                _countdown -= nNewWorkers
                            # hand out the workers lost to rounding one by one: largest remainder
                            # first, ties broken by smaller original request (stable sort)
                            _s_list = sorted(simple_rt_nw_list, key=(lambda x: x[1]))
                            sorted_rt_nw_list = sorted(_s_list, key=(lambda x: x[2]), reverse=True)
                            for resource_type, nNewWorkers_orig, remainder in sorted_rt_nw_list:
                                if _countdown <= 0:
                                    break
                                dyn_num_workers[queueName][resource_type]['nNewWorkers'] += 1
                                _countdown -= 1
                        for resource_type in dyn_num_workers[queueName]:
                            if resource_type == 'ANY':
                                continue
                            nNewWorkers = dyn_num_workers[queueName][resource_type]['nNewWorkers']
                            tmpLog.debug('setting nNewWorkers to {0} of type {1} in order to respect RT aggregations for UCORE'
                                         .format(nNewWorkers, resource_type))

                # report the computed numbers unless a "not submitting" message was set above
                if not apf_msg:
                    apf_data = copy.deepcopy(dyn_num_workers[queueName])

                self.apf_mon.update_label(queueName, apf_msg, apf_data)

            # dump
            tmpLog.debug('defined {0}'.format(str(dyn_num_workers)))
            return dyn_num_workers
        except Exception:
            # dump error
            core_utils.dump_error_message(tmpLog)
            return None