Example #1
    def testG_monitoringDAO(self):
        """
        _monitoringDAO_

        Because I need a test for the monitoring DAO
        """
        myThread = threading.currentThread()

        config = self.getConfig()

        baAPI = BossAirAPI(config=config, insertStates=True)

        # Create some jobs
        nJobs = 10

        jobDummies = self.createDummyJobs(nJobs=nJobs)

        # Prior to building the job, each job must have a plugin
        # and user assigned
        for job in jobDummies:
            job['plugin'] = 'TestPlugin'
            job['owner'] = 'tapas'
            job['location'] = 'T2_US_UCSD'
            job.save()

        baAPI.submit(jobs=jobDummies)

        results = baAPI.monitor()
        self.assertEqual(results[0]['Pending'], nJobs)

        return
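
A note on the monitor() call above: the assertion implies it returns a list whose first entry maps job states to counts. A minimal sketch of consuming that output under that assumption (same WMCore test environment as the test itself):

    # Hypothetical follow-up to the test above; assumes monitor() returns a
    # list of state-count dictionaries, as results[0]['Pending'] implies.
    results = baAPI.monitor()
    for stateCounts in results:
        for state, count in stateCounts.items():
            print("State %s currently has %s jobs" % (state, count))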
Example #2
    def testG_monitoringDAO(self):
        """
        _monitoringDAO_

        Because I need a test for the monitoring DAO
        """

        # NOTE: this test is disabled; the early return below skips everything else
        return

        myThread = threading.currentThread()

        config = self.getConfig()

        changeState = ChangeState(config)

        baAPI = BossAirAPI(config=config)

        # Create some jobs
        nJobs = 10

        jobDummies = self.createDummyJobs(nJobs=nJobs)

        # Prior to building the job, each job must have a plugin
        # and user assigned
        for job in jobDummies:
            job['plugin'] = 'TestPlugin'
            job['owner'] = 'mnorman'
            job['location'] = 'T2_US_UCSD'
            job.save()

        baAPI.submit(jobs=jobDummies)

        results = baAPI.monitor()

        self.assertEqual(len(results), nJobs)
        for job in results:
            self.assertEqual(job['plugin'], 'CondorPlugin')

        return
Example #3
class JobSubmitterPoller(BaseWorkerThread):
    """
    _JobSubmitterPoller_

    The jobSubmitterPoller takes the jobs and organizes them into packages
    before sending them to the individual plugin submitters.
    """
    def __init__(self, config):
        BaseWorkerThread.__init__(self)
        myThread = threading.currentThread()
        self.config = config

        #DAO factory for WMBS objects
        self.daoFactory = DAOFactory(package="WMCore.WMBS",
                                     logger=logging,
                                     dbinterface=myThread.dbi)

        #Libraries
        self.resourceControl = ResourceControl()
        self.changeState = ChangeState(self.config)
        self.bossAir = BossAirAPI(config=self.config)

        self.hostName = self.config.Agent.hostName
        self.repollCount = getattr(self.config.JobSubmitter, 'repollCount',
                                   10000)
        self.maxJobsPerPoll = int(
            getattr(self.config.JobSubmitter, 'maxJobsPerPoll', 1000))
        self.maxJobsThisCycle = self.maxJobsPerPoll  # changes as per schedd limit
        self.cacheRefreshSize = int(
            getattr(self.config.JobSubmitter, 'cacheRefreshSize', 30000))
        self.skipRefreshCount = int(
            getattr(self.config.JobSubmitter, 'skipRefreshCount', 20))
        self.packageSize = getattr(self.config.JobSubmitter, 'packageSize',
                                   500)
        self.collSize = getattr(self.config.JobSubmitter, 'collectionSize',
                                self.packageSize * 1000)
        self.maxTaskPriority = getattr(self.config.BossAir, 'maxTaskPriority',
                                       1e7)
        self.condorFraction = 0.75  # update during every algorithm cycle
        self.condorOverflowFraction = 0.2
        self.ioboundTypes = ('LogCollect', 'Merge', 'Cleanup', 'Harvesting')

        # Additions for caching-based JobSubmitter
        self.cachedJobIDs = set()
        self.cachedJobs = {}
        self.jobDataCache = {}
        self.jobsToPackage = {}
        self.sandboxPackage = {}
        self.locationDict = {}
        self.taskTypePrioMap = {}
        self.drainSites = set()
        self.abortSites = set()
        self.refreshPollingCount = 0

        try:
            if not getattr(self.config.JobSubmitter, 'submitDir', None):
                self.config.JobSubmitter.submitDir = self.config.JobSubmitter.componentDir
            self.packageDir = os.path.join(self.config.JobSubmitter.submitDir,
                                           'packages')

            if not os.path.exists(self.packageDir):
                os.makedirs(self.packageDir)
        except OSError as ex:
            msg = "Error while trying to create packageDir %s\n!"
            msg += str(ex)
            logging.error(msg)
            logging.debug("PackageDir: %s", self.packageDir)
            logging.debug("Config: %s", config)
            raise JobSubmitterPollerException(msg)

        # Now the DAOs
        self.listJobsAction = self.daoFactory(
            classname="Jobs.ListForSubmitter")
        self.setLocationAction = self.daoFactory(classname="Jobs.SetLocation")
        self.locationAction = self.daoFactory(
            classname="Locations.GetSiteInfo")
        self.setFWJRPathAction = self.daoFactory(classname="Jobs.SetFWJRPath")
        self.listWorkflows = self.daoFactory(
            classname="Workflow.ListForSubmitter")

        # Keep a record of the thresholds in memory
        self.currentRcThresholds = {}

        self.useReqMgrForCompletionCheck = getattr(
            self.config.TaskArchiver, 'useReqMgrForCompletionCheck', True)

        if self.useReqMgrForCompletionCheck:
            # only set up this when reqmgr is used (not Tier0)
            self.reqmgr2Svc = ReqMgr(self.config.General.ReqMgr2ServiceURL)
            self.abortedAndForceCompleteWorkflowCache = self.reqmgr2Svc.getAbortedAndForceCompleteRequestsFromMemoryCache()
            self.reqAuxDB = ReqMgrAux(self.config.General.ReqMgr2ServiceURL)
        else:
            # Tier0 case - set to None just for clarity (this attribute shouldn't be used here)
            self.abortedAndForceCompleteWorkflowCache = None

        return

    def getPackageCollection(self, sandboxDir):
        """
        _getPackageCollection_

        Given a sandbox directory, figure out which PackageCollection
        the next job package should belong in.
        """

        rawList = os.listdir(sandboxDir)
        collections = []
        numberList = []
        for entry in rawList:
            if 'PackageCollection' in entry:
                collections.append(entry)

        # If we have no collections, return 0 (PackageCollection_0)
        if len(collections) < 1:
            return 0

        # Loop over the list of PackageCollections
        for collection in collections:
            collectionPath = os.path.join(sandboxDir, collection)
            packageList = os.listdir(collectionPath)
            collectionNum = int(collection.split('_')[1])
            if len(packageList) < self.collSize:
                return collectionNum
            else:
                numberList.append(collectionNum)

        # If we got here, then all collections are full.  We'll need
        # a new one.  Find the highest number, increment by one
        numberList.sort()
        return numberList[-1] + 1

    def addJobsToPackage(self, loadedJob):
        """
        _addJobsToPackage_

        Add a job to a job package and then return the batch directory for the job.
        Packages are only written out to disk once they reach the configured
        package size (self.packageSize).  The flushJobPackages() method must be
        called after all jobs have been added to the cache and before they are
        actually submitted, to make sure all the job packages have been written
        to disk.
        """
        if loadedJob["workflow"] not in self.jobsToPackage:
            # First, let's pull all the information from the loadedJob
            batchid = "%s-%s" % (loadedJob["id"], loadedJob["retry_count"])
            sandboxDir = os.path.dirname(loadedJob["sandbox"])

            # Second, assemble the jobPackage location
            collectionIndex = self.getPackageCollection(sandboxDir)
            collectionDir = os.path.join(
                sandboxDir, 'PackageCollection_%i' % collectionIndex,
                'batch_%s' % batchid)

            # Now create the package object
            self.jobsToPackage[loadedJob["workflow"]] = {
                "batchid": batchid,
                'id': loadedJob['id'],
                "package": JobPackage(directory=collectionDir)
            }

        jobPackage = self.jobsToPackage[loadedJob["workflow"]]["package"]
        jobPackage[loadedJob["id"]] = loadedJob.getDataStructsJob()
        batchDir = jobPackage['directory']

        if len(jobPackage.keys()) == self.packageSize:
            if not os.path.exists(batchDir):
                os.makedirs(batchDir)

            batchPath = os.path.join(batchDir, "JobPackage.pkl")
            jobPackage.save(batchPath)
            del self.jobsToPackage[loadedJob["workflow"]]

        return batchDir

    def flushJobPackages(self):
        """
        _flushJobPackages_

        Write any job packages to disk that haven't been written out already.
        """
        workflowNames = self.jobsToPackage.keys()
        for workflowName in workflowNames:
            jobPackage = self.jobsToPackage[workflowName]["package"]
            batchDir = jobPackage['directory']

            if not os.path.exists(batchDir):
                os.makedirs(batchDir)

            batchPath = os.path.join(batchDir, "JobPackage.pkl")
            jobPackage.save(batchPath)
            del self.jobsToPackage[workflowName]

        return

    def refreshCache(self):
        """
        _refreshCache_

        Query WMBS for all jobs in the 'created' state.  For all jobs returned
        from the query, check if they already exist in the cache.  If they
        don't, unpickle them and combine their site white and black lists with
        the list of locations they can run at.  Add them to the cache.

        Each cache entry is a dictionary that includes, among other fields:
          - WMBS Job ID
          - Retry count
          - Batch ID
          - Path to sandbox
          - Path to cache directory
        """
        badJobs = dict([(x, []) for x in range(71101, 71105)])
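        # Error codes registered below for submit failures (per the code paths
        # in this method): 71101 - no possible site for the job; 71102 - all
        # possible sites are aborted/down; 71103 - pickled job object missing
        # on disk; 71104 - all possible sites are draining.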
        dbJobs = set()

        logging.info("Refreshing priority cache with currently %i jobs",
                     len(self.cachedJobIDs))

        if self.cacheRefreshSize == -1 or len(self.cachedJobIDs) < self.cacheRefreshSize or \
           self.refreshPollingCount >= self.skipRefreshCount:
            newJobs = self.listJobsAction.execute()
            self.refreshPollingCount = 0

            if self.useReqMgrForCompletionCheck:
                # if reqmgr is used (not Tier0 Agent) get the aborted/forceCompleted record
                abortedAndForceCompleteRequests = self.abortedAndForceCompleteWorkflowCache.getData(
                )
            else:
                #T0Agent
                abortedAndForceCompleteRequests = []

            logging.info("Found %s new jobs to be submitted.", len(newJobs))
        else:
            self.refreshPollingCount += 1
            newJobs = []
            dbJobs = self.cachedJobIDs
            abortedAndForceCompleteRequests = []
            logging.info(
                "Skipping cache refresh this cycle (%s jobs in cache)",
                len(dbJobs))

        logging.info("Determining possible sites for new jobs...")
        jobCount = 0
        for newJob in newJobs:
            # check whether newJob belongs to an aborted or force-completed workflow, and skip it if so
            if (newJob['request_name'] in abortedAndForceCompleteRequests) and \
               (newJob['type'] not in ['LogCollect', "Cleanup"]):
                continue

            jobID = newJob['id']
            dbJobs.add(jobID)
            if jobID in self.cachedJobIDs:
                continue

            jobCount += 1
            if jobCount % 5000 == 0:
                logging.info("Processed %d/%d new jobs.", jobCount,
                             len(newJobs))

            pickledJobPath = os.path.join(newJob["cache_dir"], "job.pkl")

            if not os.path.isfile(pickledJobPath):
                # Then we have a problem - there's no file
                logging.error("Could not find pickled jobObject %s",
                              pickledJobPath)
                badJobs[71103].append(newJob)
                continue
            try:
                with open(pickledJobPath, 'rb') as jobHandle:
                    loadedJob = pickle.load(jobHandle)
            except Exception as ex:
                msg = "Error while loading pickled job object %s\n" % pickledJobPath
                msg += str(ex)
                logging.error(msg)
                raise JobSubmitterPollerException(msg)

            loadedJob['retry_count'] = newJob['retry_count']

            # figure out possible locations for job
            possibleLocations = loadedJob["possiblePSN"]

            # Create another set of locations that may change when a site goes white/black listed
            # It deliberately ignores draining and aborted sites - they may change, and that is the point
            potentialLocations = set()
            potentialLocations.update(possibleLocations)

            # now check for sites in drain and adjust the possible locations
            # also check if there is at least one site left to run the job
            if len(possibleLocations) == 0:
                newJob['name'] = loadedJob['name']
                newJob['fileLocations'] = loadedJob.get('fileLocations', [])
                newJob['siteWhitelist'] = loadedJob.get('siteWhitelist', [])
                newJob['siteBlacklist'] = loadedJob.get('siteBlacklist', [])
                badJobs[71101].append(newJob)
                continue
            else:
                nonAbortSites = [
                    x for x in possibleLocations if x not in self.abortSites
                ]
                if nonAbortSites:  # if there is at least one non-aborted/down site then run there, otherwise fail the job
                    possibleLocations = nonAbortSites
                else:
                    newJob['name'] = loadedJob['name']
                    newJob['possibleLocations'] = possibleLocations
                    badJobs[71102].append(newJob)
                    continue

            # try to remove draining sites if possible, this is needed to stop
            # jobs that could run anywhere blocking draining sites
            # if the job type is Merge, LogCollect or Cleanup this is skipped
            if newJob['type'] not in self.ioboundTypes:
                nonDrainingSites = [
                    x for x in possibleLocations if x not in self.drainSites
                ]
                if nonDrainingSites:  # if at least one viable non-draining site exists, drop the draining ones
                    possibleLocations = nonDrainingSites
                else:
                    newJob['name'] = loadedJob['name']
                    newJob['possibleLocations'] = possibleLocations
                    badJobs[71104].append(newJob)
                    continue

            # locations clear of abort and draining sites
            newJob['possibleLocations'] = possibleLocations

            batchDir = self.addJobsToPackage(loadedJob)
            self.cachedJobIDs.add(jobID)

            # calculate the final job priority such that we can order cached jobs by prio
            jobPrio = self.taskTypePrioMap.get(newJob['type'],
                                               0) + newJob['wf_priority']
            if jobPrio not in self.cachedJobs:
                self.cachedJobs[jobPrio] = {}

            # now add basic information keyed by the jobid
            self.cachedJobs[jobPrio][jobID] = newJob

            # allow job baggage to override numberOfCores
            #       => used for repacking to get more slots/disk
            numberOfCores = loadedJob.get('numberOfCores', 1)
            if numberOfCores == 1:
                baggage = loadedJob.getBaggage()
                numberOfCores = getattr(baggage, "numberOfCores", 1)
            loadedJob['numberOfCores'] = numberOfCores

            # Create a job dictionary object and put it in the cache (needs to be in sync with RunJob)
            jobInfo = {
                'id': jobID,
                'requestName': newJob['request_name'],
                'taskName': newJob['task_name'],
                'taskType': newJob['type'],
                'cache_dir': newJob["cache_dir"],
                'priority': newJob['wf_priority'],
                'taskID': newJob['task_id'],
                'retry_count': newJob["retry_count"],
                'taskPriority': None,  # update from the thresholds
                'custom': {'location': None},  # update later
                'packageDir': batchDir,
                'sandbox': loadedJob["sandbox"],  # remove before submit
                'userdn': loadedJob.get("ownerDN", None),
                'usergroup': loadedJob.get("ownerGroup", ''),
                'userrole': loadedJob.get("ownerRole", ''),
                'possibleSites': frozenset(possibleLocations),  # abort and drain sites filtered out
                'potentialSites': frozenset(potentialLocations),  # original list of sites
                'scramArch': loadedJob.get("scramArch", None),
                'swVersion': loadedJob.get("swVersion", None),
                'name': loadedJob["name"],
                'proxyPath': loadedJob.get("proxyPath", None),
                'estimatedJobTime': loadedJob.get("estimatedJobTime", None),
                'estimatedDiskUsage': loadedJob.get("estimatedDiskUsage", None),
                'estimatedMemoryUsage': loadedJob.get("estimatedMemoryUsage", None),
                'numberOfCores': loadedJob.get("numberOfCores", 1),  # may update it later
                'inputDataset': loadedJob.get('inputDataset', None),
                'inputDatasetLocations': loadedJob.get('inputDatasetLocations', None),
                'allowOpportunistic': loadedJob.get('allowOpportunistic', False)
            }

            self.jobDataCache[jobID] = jobInfo

        # Register failures in submission
        for errorCode in badJobs:
            if badJobs[errorCode]:
                logging.debug(
                    "The following jobs could not be submitted: %s, error code: %d",
                    badJobs[errorCode], errorCode)
                self._handleSubmitFailedJobs(badJobs[errorCode], errorCode)

        # If there are any leftover jobs, we want to get rid of them.
        self.flushJobPackages()

        # We need to remove any jobs from the cache that were not returned in
        # the last call to the database.
        jobIDsToPurge = self.cachedJobIDs - dbJobs
        self._purgeJobsFromCache(jobIDsToPurge)

        logging.info("Done pruning killed jobs, moving on to submit.")
        return

    def removeAbortedForceCompletedWorkflowFromCache(self):
        abortedAndForceCompleteRequests = self.abortedAndForceCompleteWorkflowCache.getData()
        jobIDsToPurge = set()
        for jobID, jobInfo in self.jobDataCache.iteritems():
            if (jobInfo['requestName'] in abortedAndForceCompleteRequests) and \
               (jobInfo['taskType'] not in ['LogCollect', "Cleanup"]):
                jobIDsToPurge.add(jobID)
        self._purgeJobsFromCache(jobIDsToPurge)
        return

    def _purgeJobsFromCache(self, jobIDsToPurge):

        if len(jobIDsToPurge) == 0:
            return

        self.cachedJobIDs -= jobIDsToPurge

        for jobid in jobIDsToPurge:
            self.jobDataCache.pop(jobid, None)
            for jobPrio in self.cachedJobs:
                if self.cachedJobs[jobPrio].pop(jobid, None):
                    # then the jobid was found, go to the next one
                    break
        return

    def _handleSubmitFailedJobs(self, badJobs, exitCode):
        """
        _handleSubmitFailedJobs_

        Form a default job report for the exitCode
        and register it in the job.  Preserve it on disk as well.
        Propagate the failure to the JobStateMachine.
        """
        fwjrBinds = []
        for job in badJobs:
            job['couch_record'] = None
            job['fwjr'] = Report()
            if exitCode in [71102, 71104]:
                job['fwjr'].addError(
                    "JobSubmit", exitCode, "SubmitFailed",
                    WM_JOB_ERROR_CODES[exitCode] +
                    ', '.join(job['possibleLocations']))
            elif exitCode in [71101]:
                # there is no possible site
                if job.get("fileLocations"):
                    job['fwjr'].addError(
                        "JobSubmit", exitCode, "SubmitFailed",
                        WM_JOB_ERROR_CODES[exitCode] + ": file locations: " +
                        ', '.join(job['fileLocations']) +
                        ": site white list: " +
                        ', '.join(job['siteWhitelist']) +
                        ": site black list: " +
                        ', '.join(job['siteBlacklist']))

            else:
                job['fwjr'].addError("JobSubmit", exitCode, "SubmitFailed",
                                     WM_JOB_ERROR_CODES[exitCode])
            fwjrPath = os.path.join(job['cache_dir'],
                                    'Report.%d.pkl' % int(job['retry_count']))
            job['fwjr'].setJobID(job['id'])
            try:
                job['fwjr'].save(fwjrPath)
                fwjrBinds.append({"jobid": job["id"], "fwjrpath": fwjrPath})
            except IOError as ioer:
                logging.error(
                    "Failed to write FWJR for submit failed job %d, message: %s",
                    job['id'], str(ioer))
        self.changeState.propagate(badJobs, "submitfailed", "created")
        self.setFWJRPathAction.execute(binds=fwjrBinds)
        return

    def getThresholds(self):
        """
        _getThresholds_

        Retrieve the submit thresholds, taking into account what is already
        pending and running at each site.
        Also update the list of draining and aborted/down sites.
        Finally, create a map between task type and its priority.
        """
        self.taskTypePrioMap = {}
        newDrainSites = set()
        newAbortSites = set()

        rcThresholds = self.resourceControl.listThresholdsForSubmit()

        for siteName in rcThresholds.keys():
            # Check the current state of the site
            state = rcThresholds[siteName]["state"]

            if state == "Draining":
                newDrainSites.add(siteName)
            if state in ["Down", "Aborted"]:
                newAbortSites.add(siteName)

            # then update the task type x task priority mapping
            if not self.taskTypePrioMap:
                for task, value in rcThresholds[siteName]['thresholds'].items():
                    self.taskTypePrioMap[task] = value.get('priority', 0) * self.maxTaskPriority
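                # Worked example of the map built above: with the default
                # maxTaskPriority of 1e7, a task priority of 2 maps to 2e7.
                # Since the final job priority is taskTypePrioMap[type] +
                # wf_priority, the task type dominates any workflow priority
                # below 1e7.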

        # When the list of drain/abort sites changes between iterations, a location
        # refresh is needed; for now this forces a full cache refresh
        if newDrainSites != self.drainSites or newAbortSites != self.abortSites:
            logging.info(
                "Draining or Aborted sites have changed, the cache will be rebuilt."
            )
            self.cachedJobIDs = set()
            self.cachedJobs = {}
            self.jobDataCache = {}

        self.currentRcThresholds = rcThresholds
        self.abortSites = newAbortSites
        self.drainSites = newDrainSites

        return

    def _getJobSubmitCondition(self, jobPrio, siteName, jobType):
        """
        Return a string describing whether a job is ready to be submitted
        or, if not, the reason it cannot be submitted.
        Only jobs with a "JobSubmitReady" return value will be added to the submit list.
        Other return values indicate why the job cannot be submitted,
        e.g. "NoPendingSlot" - the pending slots are already filled with pending jobs.
        """
        try:
            totalPendingSlots = self.currentRcThresholds[siteName]["total_pending_slots"]
            totalPendingJobs = self.currentRcThresholds[siteName]["total_pending_jobs"]
            totalRunningSlots = self.currentRcThresholds[siteName]["total_running_slots"]
            totalRunningJobs = self.currentRcThresholds[siteName]["total_running_jobs"]

            taskPendingSlots = self.currentRcThresholds[siteName]['thresholds'][jobType]["pending_slots"]
            taskPendingJobs = self.currentRcThresholds[siteName]['thresholds'][jobType]["task_pending_jobs"]
            taskRunningSlots = self.currentRcThresholds[siteName]['thresholds'][jobType]["max_slots"]
            taskRunningJobs = self.currentRcThresholds[siteName]['thresholds'][jobType]["task_running_jobs"]
            highestPriorityInJobs = self.currentRcThresholds[siteName]['thresholds'][jobType]['wf_highest_priority']

            # set the initial totalPendingJobs since it increases in every cycle when a job is submitted
            self.currentRcThresholds[siteName].setdefault(
                "init_total_pending_jobs", totalPendingJobs)

            # set the initial taskPendingJobs since it increases in every cycle when a job is submitted
            self.currentRcThresholds[siteName]['thresholds'][
                jobType].setdefault("init_task_pending_jobs", taskPendingJobs)

            initTotalPending = self.currentRcThresholds[siteName][
                "init_total_pending_jobs"]
            initTaskPending = self.currentRcThresholds[siteName]['thresholds'][
                jobType]["init_task_pending_jobs"]

        except KeyError as ex:
            msg = "Invalid key for site %s and job type %s\n" % (siteName,
                                                                 jobType)
            logging.exception(msg)
            return "NoJobType_%s_%s" % (siteName, jobType)

        if (highestPriorityInJobs is None) or (jobPrio <= highestPriorityInJobs) or (jobType in self.ioboundTypes):
            # there are no pending or running jobs in the system (None case), or
            # the priority of the job is lower or equal: don't allow overflow.
            # Also, if jobType is in ioboundTypes, don't allow overflow.
            totalPendingThreshold = totalPendingSlots
            taskPendingThreshold = taskPendingSlots
            totalJobThreshold = totalPendingSlots + totalRunningSlots
            totalTaskThreshold = taskPendingSlots + taskRunningSlots
        else:
            # The priority of the job is higher than any currently pending or running job,
            # so increase the threshold by condorOverflowFraction * original pending slots.
            totalPendingThreshold = max(totalPendingSlots, initTotalPending) + (
                totalPendingSlots * self.condorOverflowFraction)
            taskPendingThreshold = max(taskPendingSlots, initTaskPending) + (
                taskPendingSlots * self.condorOverflowFraction)
            totalJobThreshold = totalPendingThreshold + totalRunningSlots
            totalTaskThreshold = taskPendingThreshold + taskRunningSlots
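            # Worked example of this overflow branch: with totalPendingSlots=100,
            # initTotalPending=80 and condorOverflowFraction=0.2,
            #   totalPendingThreshold = max(100, 80) + 100 * 0.2 = 120
            # i.e. high-priority jobs may overflow the pending slots by 20%.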

        jobStats = [{
            "Condition": "NoPendingSlot",
            "Current": totalPendingJobs,
            "Threshold": totalPendingThreshold
        }, {
            "Condition": "NoTaskPendingSlot",
            "Current": taskPendingJobs,
            "Threshold": taskPendingThreshold
        }, {
            "Condition": "NoRunningSlot",
            "Current": totalPendingJobs + totalRunningJobs,
            "Threshold": totalJobThreshold
        }, {
            "Condition": "NoTaskRunningSlot",
            "Current": taskPendingJobs + taskRunningJobs,
            "Threshold": totalTaskTheshold
        }]
        return jobSubmitCondition(jobStats)

    def assignJobLocations(self):
        """
        _assignJobLocations_

        Loop through the submit thresholds and pull jobs out of the job cache
        as we discover open slots.  Returns a dictionary keyed by job package
        directory, where each value is the list of job dictionaries to submit
        from that package, each annotated with the site it will run at.
        """
        jobsToSubmit = {}
        jobsToUncache = []
        jobsCount = 0
        exitLoop = False
        jobSubmitLogBySites = defaultdict(Counter)
        jobSubmitLogByPriority = defaultdict(Counter)

        # iterate over jobs from the highest to the lowest prio
        for jobPrio in sorted(self.cachedJobs, reverse=True):

            # then we're completely done and have our basket full of jobs to submit
            if exitLoop:
                break

            # start working through the oldest jobs first
            for job in sorted(self.cachedJobs[jobPrio].values(),
                              key=itemgetter('timestamp')):
                jobid = job['id']
                jobType = job['type']
                possibleSites = job['possibleLocations']
                jobSubmitLogByPriority[jobPrio]['Total'] += 1
                # now look for sites with free pending slots
                for siteName in possibleSites:
                    if siteName not in self.currentRcThresholds:
                        logging.warning(
                            "Have a job for %s which is not in the resource control",
                            siteName)
                        continue

                    condition = self._getJobSubmitCondition(
                        jobPrio, siteName, jobType)

                    if condition != "JobSubmitReady":
                        jobSubmitLogBySites[siteName][condition] += 1
                        logging.debug("Found a job for %s : %s", siteName,
                                      condition)
                        continue

                    # otherwise, update the site/task thresholds and the component job counter
                    self.currentRcThresholds[siteName][
                        "total_pending_jobs"] += 1
                    self.currentRcThresholds[siteName]['thresholds'][jobType][
                        "task_pending_jobs"] += 1

                    jobsCount += 1

                    # load (and remove) the job dictionary object from jobDataCache
                    cachedJob = self.jobDataCache.pop(jobid)
                    jobsToUncache.append((jobPrio, jobid))

                    # Sort jobs by jobPackage
                    package = cachedJob['packageDir']
                    if package not in jobsToSubmit:
                        jobsToSubmit[package] = []

                    # Add the sandbox to a global list
                    self.sandboxPackage[package] = cachedJob.pop('sandbox')

                    # Now update the job dictionary object
                    cachedJob['custom'] = {'location': siteName}
                    cachedJob['taskPriority'] = self.currentRcThresholds[
                        siteName]['thresholds'][jobType]["priority"]

                    # Get this job in place to be submitted by the plugin
                    jobsToSubmit[package].append(cachedJob)

                    jobSubmitLogBySites[siteName]["submitted"] += 1
                    jobSubmitLogByPriority[jobPrio]['submitted'] += 1
                    # found a site to submit this job, so go to the next job
                    break

                # set the flag and get out of the job iteration
                if jobsCount >= self.maxJobsThisCycle:
                    logging.info(
                        "Submitter reached limit of submit slots for this cycle: %i",
                        self.maxJobsThisCycle)
                    exitLoop = True
                    break

        # jobs that are going to be submitted must be removed from all caches
        for prio, jobid in jobsToUncache:
            self.cachedJobs[prio].pop(jobid)
            self.cachedJobIDs.remove(jobid)

        logging.info("Site submission report: %s", dict(jobSubmitLogBySites))
        logging.info("Priority submission report: %s",
                     dict(jobSubmitLogByPriority))
        logging.info("Have %s packages to submit.", len(jobsToSubmit))
        logging.info("Have %s jobs to submit.", jobsCount)
        logging.info("Done assigning site locations.")
        return jobsToSubmit

    def submitJobs(self, jobsToSubmit):
        """
        _submitJobs_

        Actually do the submission of the jobs
        """

        jobList = []
        idList = []

        if len(jobsToSubmit) == 0:
            logging.debug("There are no packages to submit.")
            return

        for package in jobsToSubmit.keys():

            sandbox = self.sandboxPackage[package]
            jobs = jobsToSubmit.get(package, [])
            for job in jobs:
                job['location'], job['plugin'], job['site_cms_name'] = \
                    self.getSiteInfo(job['custom']['location'])
                job['sandbox'] = sandbox
                idList.append({
                    'jobid': job['id'],
                    'location': job['custom']['location']
                })

            #Clean out the package reference
            del self.sandboxPackage[package]

            jobList.extend(jobs)

        myThread = threading.currentThread()
        myThread.transaction.begin()

        # Run the actual underlying submit code using bossAir
        successList, failList = self.bossAir.submit(jobs=jobList)
        logging.info("Jobs that succeeded/failed submission: %d/%d.",
                     len(successList), len(failList))

        # Propagate states in the WMBS database
        logging.debug("Propagating success state to WMBS.")
        self.changeState.propagate(successList, 'executing', 'created')
        logging.debug("Propagating fail state to WMBS.")
        self.changeState.propagate(failList, 'submitfailed', 'created')

        # At the end we mark the locations of the jobs
        # This applies even to failed jobs, since the location
        # could be part of the failure reason.
        logging.debug("Updating job location...")
        self.setLocationAction.execute(bulkList=idList,
                                       conn=myThread.transaction.conn,
                                       transaction=True)
        myThread.transaction.commit()
        logging.info("Transaction cycle successfully completed.")

        return

    def getSiteInfo(self, jobSite):
        """
        _getSiteInfo_

        This is how you get the name of a CE and the plugin for a job
        """

        if jobSite not in self.locationDict:
            siteInfo = self.locationAction.execute(siteName=jobSite)
            self.locationDict[jobSite] = siteInfo[0]
        return (self.locationDict[jobSite].get('ce_name'),
                self.locationDict[jobSite].get('plugin'),
                self.locationDict[jobSite].get('cms_name'))

    @timeFunction
    def algorithm(self, parameters=None):
        """
        _algorithm_

        Try to, in order:
        1) Refresh the cache
        2) Find jobs for all the necessary sites
        3) Submit the jobs to the plugin
        """
        myThread = threading.currentThread()

        if self.useReqMgrForCompletionCheck:
            # only runs when reqmgr is used (not Tier0)
            self.removeAbortedForceCompletedWorkflowFromCache()
            agentConfig = self.reqAuxDB.getWMAgentConfig(
                self.config.Agent.hostName)
            self.condorFraction = agentConfig.get('CondorJobsFraction', 0.75)
            self.condorOverflowFraction = agentConfig.get(
                "CondorOverflowFraction", 0.2)
        else:
            # For Tier0 agent
            self.condorFraction = 1
            self.condorOverflowFraction = 0

        if not self.passSubmitConditions():
            msg = "JobSubmitter didn't pass the submit conditions. Skipping this cycle."
            logging.warning(msg)
            myThread.logdbClient.post("JobSubmitter_submitWork", msg,
                                      "warning")
            return

        try:
            myThread.logdbClient.delete("JobSubmitter_submitWork",
                                        "warning",
                                        this_thread=True)
            self.getThresholds()
            self.refreshCache()

            jobsToSubmit = self.assignJobLocations()
            self.submitJobs(jobsToSubmit=jobsToSubmit)
        except WMException:
            if getattr(myThread, 'transaction', None) is not None:
                myThread.transaction.rollback()
            raise
        except Exception as ex:
            msg = 'Fatal error in JobSubmitter:\n'
            msg += str(ex)
            #msg += str(traceback.format_exc())
            msg += '\n\n'
            logging.error(msg)
            if getattr(myThread, 'transaction', None) is not None:
                myThread.transaction.rollback()
            raise JobSubmitterPollerException(msg)

        return

    def passSubmitConditions(self):
        """
        _passSubmitConditions_

        Check whether the component is allowed to submit jobs to condor.

        Initially it has only one condition, which is the total number of
        jobs we can have in condor (pending + running) per schedd, set by
        MAX_JOBS_PER_OWNER.
        """

        myThread = threading.currentThread()
        freeSubmitSlots = availableScheddSlots(
            dbi=myThread.dbi,
            logger=logging,
            condorFraction=self.condorFraction)
        self.maxJobsThisCycle = min(freeSubmitSlots, self.maxJobsPerPoll)

        return (self.maxJobsThisCycle > 0)

    def terminate(self, params):
        """
        _terminate_

        Kill the code after one final pass when called by the master thread.
        """
        logging.debug("terminating. doing one more pass before we die")
        self.algorithm(params)
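
To make the cycle above concrete, here is a minimal driver sketch; it assumes the WMCore environment and a fully populated WMAgent config object (the `config` below is a stand-in, not a tested setup), and simply mirrors the call order inside algorithm():

    # Hypothetical one-cycle driver for JobSubmitterPoller (assumes WMCore).
    poller = JobSubmitterPoller(config)
    if poller.passSubmitConditions():          # schedd still has free slots?
        poller.getThresholds()                 # site states + task priority map
        poller.refreshCache()                  # load 'created' jobs, build packages
        jobsToSubmit = poller.assignJobLocations()  # match jobs to open slots
        poller.submitJobs(jobsToSubmit=jobsToSubmit)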
Example #4
    def testC_CondorTest(self):
        """
        _CondorTest_

        This test works on the CondorPlugin, checking all of
        its functions with a single set of jobs
        """
        nRunning = getCondorRunningJobs(self.user)
        self.assertEqual(nRunning, 0, "User currently has %i running jobs.  Test will not continue" % (nRunning))

        # Get the config and set the removal time to -10 for testing
        config = self.getConfig()
        config.BossAir.removeTime = -10.0

        nJobs = 10

        jobDummies = self.createDummyJobs(nJobs = nJobs)

        baAPI  = BossAirAPI(config = config)

        print self.testDir

        jobPackage = os.path.join(self.testDir, 'JobPackage.pkl')
        f = open(jobPackage, 'w')
        f.write(' ')
        f.close()

        sandbox = os.path.join(self.testDir, 'sandbox.box')
        f = open(sandbox, 'w')
        f.write(' ')
        f.close()

        jobList = []
        for j in jobDummies:
            tmpJob = {'id': j['id']}
            tmpJob['custom']      = {'location': 'malpaquet'}
            tmpJob['name']        = j['name']
            tmpJob['cache_dir']   = self.testDir
            tmpJob['retry_count'] = 0
            tmpJob['plugin']      = 'CondorPlugin'
            tmpJob['owner']       = 'tapas'
            tmpJob['packageDir']  = self.testDir
            tmpJob['sandbox']     = sandbox
            tmpJob['priority']    = None
            tmpJob['usergroup']   = "wheel"
            tmpJob['userrole']    = 'cmsuser'
            jobList.append(tmpJob)


        info = {}
        #info['packageDir'] = self.testDir
        info['index']      = 0
        info['sandbox']    = sandbox

        baAPI.submit(jobs = jobList, info = info)

        nRunning = getCondorRunningJobs(self.user)
        self.assertEqual(nRunning, nJobs)

        newJobs = baAPI._loadByStatus(status = 'New')
        self.assertEqual(len(newJobs), nJobs)


        baAPI.track()

        newJobs = baAPI._loadByStatus(status = 'New')
        self.assertEqual(len(newJobs), 0)

        newJobs = baAPI._loadByStatus(status = 'Idle')
        self.assertEqual(len(newJobs), nJobs)

        # Do a second time to make sure that the cache
        # doesn't die on us
        baAPI.track()

        newJobs = baAPI._loadByStatus(status = 'New')
        self.assertEqual(len(newJobs), 0)

        newJobs = baAPI._loadByStatus(status = 'Idle')
        self.assertEqual(len(newJobs), nJobs)

        baAPI.kill(jobs = jobList)

        nRunning = getCondorRunningJobs(self.user)
        self.assertEqual(nRunning, 0)

        # Try resubmission
        for j in jobList:
            j['retry_count'] = 1

        baAPI.submit(jobs = jobList, info = info)

        nRunning = getCondorRunningJobs(self.user)
        self.assertEqual(nRunning, nJobs)

        newJobs = baAPI._loadByStatus(status = 'New')
        self.assertEqual(len(newJobs), nJobs)


        # See where they are
        baAPI.track()

        newJobs = baAPI._loadByStatus(status = 'New')
        self.assertEqual(len(newJobs), 0)

        newJobs = baAPI._loadByStatus(status = 'Idle')
        self.assertEqual(len(newJobs), nJobs)

        # Now kill 'em manually
        command = ['condor_rm', self.user]
        pipe = Popen(command, stdout = PIPE, stderr = PIPE, shell = False)
        pipe.communicate()

        # See what happened
        baAPI.track()

        newJobs = baAPI._loadByStatus(status = 'Idle')
        self.assertEqual(len(newJobs), 0)

        #newJobs = baAPI._loadByStatus(status = 'Removed')
        #self.assertEqual(len(newJobs), nJobs)

        # Because removal time is -10.0, jobs should be removed immediately
        baAPI.track()

        # Assert that jobs were listed as completed
        myThread = threading.currentThread()
        newJobs = baAPI._loadByStatus(status = 'Removed', complete = '0')
        self.assertEqual(len(newJobs), nJobs)

        return
Example #5
    def testG_gLiteTest(self):
        """
        _gLiteTest_

        This test works on the gLitePlugin, checking all of
        its functions with a single set of jobs
        """

        config = self.getConfig()
        config.BossAir.gliteConf = '/afs/cern.ch/cms/LCG/LCG-2/UI/conf/glite_wms_CERN.conf'
        config.BossAir.credentialDir = '/home/crab/ALL_SETUP/credentials/'
        config.BossAir.gLiteProcesses = 2
        config.BossAir.gLitePrefixEnv = "/lib64/"
        config.BossAir.pluginNames.append("gLitePlugin")
        config.BossAir.manualProxyPath = environ['X509_USER_PROXY']

        config.Agent.serverDN = "/we/bypass/myproxy/logon"

        #config.BossAir.pluginNames = ["gLitePlugin"]
        baAPI = BossAirAPI(config=config)

        nJobs = 2
        jobDummies = self.createDummyJobs(nJobs=nJobs,
                                          location='grid-ce-01.ba.infn.it')

        jobPackage = os.path.join(self.testDir, 'JobPackage.pkl')
        f = open(jobPackage, 'w')
        f.write(' ')
        f.close()

        sandbox = os.path.join(self.testDir, 'sandbox.box')
        f = open(sandbox, 'w')
        f.write(' ')
        f.close()

        jobList = []
        userdn = executeCommand('grid-cert-info -subject -file %s' %
                                config.BossAir.manualProxyPath)
        newuser = self.daoFactory(classname="Users.New")
        newuser.execute(dn=userdn)
        for j in jobDummies:
            job = j  # {'id': j['id']}
            job['custom'] = {'location': 'grid-ce-01.ba.infn.it'}
            job['location'] = 'grid-ce-01.ba.infn.it'
            job['plugin'] = 'gLitePlugin'
            job['name'] = j['name']
            job['cache_dir'] = self.testDir
            job['retry_count'] = 0
            job['owner'] = userdn
            job['packageDir'] = self.testDir
            job['sandbox'] = sandbox
            job['priority'] = None
            jobList.append(job)

        baAPI.submit(jobs=jobList)

        # Not all jobs should still be sitting in the 'New' state
        newJobs = baAPI._loadByStatus(status='New')
        self.assertNotEqual(len(newJobs), nJobs)

        time.sleep(2)
        baAPI.track()

        # They should no longer all be marked as new
        newJobs = baAPI._loadByStatus('New', 0)
        self.assertNotEqual(len(newJobs), nJobs)

        # Killing all the jobs
        baAPI.kill(jobList)
        #time.sleep(15)
        baAPI.track()

        ## Issues running tests below due to glite delay on marking job as killed
        # Should be just running jobs
        #killedJobs = baAPI._loadByStatus('Cancelled by user', 0)
        #self.assertEqual(len(killedJobs), 0)

        # Check if they're complete
        #completeJobs = baAPI.getComplete()
        #self.assertEqual(len(completeJobs), nJobs)

        return
Example #6
    def testD_MyProxyDelegation(self):
        """
        _MyProxyDelegation_

        Test whether we can delegate a proxy via myproxy to this job

        IMPORTANT:
        If you are going to run this test you will have to set the serverCert/Key
        config options to point to your local server cert.  You will also have to
        run this job with your DN.  I don't recommend figuring out how to do this
        without knowing what you're doing with regard to proxy handling.
        """

        nRunning = getCondorRunningJobs(self.user)
        self.assertEqual(nRunning, 0, "User currently has %i running jobs.  Test will not continue" % (nRunning))

        # Get the config and set the removal time to -10 for testing
        proxyDir = os.path.join(self.testDir, 'proxyDir')
        os.mkdir(proxyDir)
        config = self.getConfig()
        config.BossAir.removeTime = -10.0
        config.BossAir.pluginNames.append('VanillaCondorPlugin')
        config.BossAir.delegatedServerCert = '/uscms/home/mnorman/.globus/cms-xen39crab3devcert.pem'
        config.BossAir.delegatedServerKey  = '/uscms/home/mnorman/.globus/cms-xen39crab3devkey.pem'
        config.BossAir.myproxyServer       = 'myproxy.cern.ch'
        config.BossAir.proxyDir            = proxyDir
        config.BossAir.delegatedServerHash = 'a6f078516a0beed5dcb31ba866868fa690069f9a'

        userDN = '/DC=org/DC=doegrids/OU=People/CN=Matthew Norman 453632'

        nJobs = 10

        jobDummies = self.createDummyJobs(nJobs = nJobs)

        baAPI  = BossAirAPI(config = config)


        jobPackage = os.path.join(self.testDir, 'JobPackage.pkl')
        f = open(jobPackage, 'w')
        f.write(' ')
        f.close()

        sandbox = os.path.join(self.testDir, 'sandbox.box')
        f = open(sandbox, 'w')
        f.write(' ')
        f.close()

        jobList = []
        for j in jobDummies:
            tmpJob = {'id': j['id']}
            tmpJob['custom']      = {'location': 'malpaquet'}
            tmpJob['name']        = j['name']
            tmpJob['cache_dir']   = self.testDir
            tmpJob['retry_count'] = 0
            tmpJob['plugin']      = 'VanillaCondorPlugin'
            tmpJob['owner']       = userDN
            tmpJob['packageDir']  = self.testDir
            tmpJob['sandbox']     = sandbox
            tmpJob['priority']    = None
            jobList.append(tmpJob)


        info = {}
        #info['packageDir'] = self.testDir
        info['index']      = 0
        info['sandbox']    = sandbox

        baAPI.submit(jobs = jobList, info = info)

        proxyFile = os.listdir(proxyDir)[0]
        stdout, stderr = SubprocessAlgos.runCommand(cmd = 'export X509_USER_PROXY=%s; voms-proxy-info' \
                                                    % os.path.join(proxyDir, proxyFile))
        self.assertEqual(stdout.split('\n')[0],
                         'subject   : %s/CN=proxy/CN=proxy/CN=proxy/CN=proxy' % userDN)

        # Now kill 'em manually
        command = ['condor_rm', self.user]
        SubprocessAlgos.runCommand(cmd = command, shell = False)

        return
Example #7
    def testB_PluginTest(self):
        """
        _PluginTest_


        Now check that these functions work when called through plugins
        instead of directly.

        There are only three plugins.
        """
        #return

        myThread = threading.currentThread()

        config = self.getConfig()

        baAPI  = BossAirAPI(config = config)


        # Create some jobs
        nJobs = 10

        jobDummies = self.createDummyJobs(nJobs = nJobs, location = 'Xanadu')
        changeState = ChangeState(config)
        changeState.propagate(jobDummies, 'created', 'new')
        changeState.propagate(jobDummies, 'executing', 'created')

        # Prior to building the job, each job must have a plugin
        # and user assigned
        for job in jobDummies:
            job['plugin']   = 'TestPlugin'
            job['owner']    = 'tapas'

        baAPI.submit(jobs = jobDummies)


        newJobs = baAPI._loadByStatus(status = 'New')
        self.assertEqual(len(newJobs), nJobs)

        # All jobs should now be listed as running
        runningJobs = baAPI._listRunJobs()
        self.assertEqual(len(runningJobs), nJobs)


        # Test Plugin should complete all jobs
        baAPI.track()

        # Should be no more running jobs
        runningJobs = baAPI._listRunJobs()
        self.assertEqual(len(runningJobs), 0)


        # Check if they're complete
        completeJobs = baAPI.getComplete()
        self.assertEqual(len(completeJobs), nJobs)


        # Check the database directly, since BossAir is specifically built
        # to keep the API from finding completed jobs
        result = myThread.dbi.processData("SELECT id FROM bl_runjob")[0].fetchall()
        self.assertEqual(len(result), nJobs)


        baAPI.removeComplete(jobs = jobDummies)


        result = myThread.dbi.processData("SELECT id FROM bl_runjob")[0].fetchall()
        self.assertEqual(len(result), 0)


        return
Example #8
    def testC_CondorTest(self):
        """
        _CondorTest_

        This test works on the SimpleCondorPlugin, checking all of
        its functions with a single set of jobs
        """
        nRunning = getCondorRunningJobs(self.user)
        self.assertEqual(
            nRunning, 0,
            "User currently has %i running jobs.  Test will not continue" %
            (nRunning))

        # Get the config and set the removal time to -10 for testing
        config = self.getConfig()
        config.BossAir.removeTime = -10.0

        nJobs = 10
        jobDummies = self.createDummyJobs(nJobs=nJobs)
        baAPI = BossAirAPI(config=config, insertStates=True)

        jobPackage = os.path.join(self.testDir, 'JobPackage.pkl')
        f = open(jobPackage, 'w')
        f.write(' ')
        f.close()

        sandbox = os.path.join(self.testDir, 'sandbox.box')
        f = open(sandbox, 'w')
        f.write(' ')
        f.close()

        jobList = []
        for j in jobDummies:
            tmpJob = {'id': j['id']}
            tmpJob['custom'] = {'location': 'malpaquet'}
            tmpJob['name'] = j['name']
            tmpJob['cache_dir'] = self.testDir
            tmpJob['retry_count'] = 0
            tmpJob['plugin'] = 'SimpleCondorPlugin'
            tmpJob['owner'] = 'tapas'
            tmpJob['packageDir'] = self.testDir
            tmpJob['sandbox'] = sandbox
            tmpJob['priority'] = None
            tmpJob['usergroup'] = "wheel"
            tmpJob['userrole'] = 'cmsuser'
            jobList.append(tmpJob)

        info = {}
        # info['packageDir'] = self.testDir
        info['index'] = 0
        info['sandbox'] = sandbox

        baAPI.submit(jobs=jobList, info=info)

        nRunning = getCondorRunningJobs(self.user)
        self.assertEqual(nRunning, nJobs)

        newJobs = baAPI._loadByStatus(status='New')
        self.assertEqual(len(newJobs), nJobs)

        baAPI.track()

        newJobs = baAPI._loadByStatus(status='New')
        self.assertEqual(len(newJobs), 0)

        newJobs = baAPI._loadByStatus(status='Idle')
        self.assertEqual(len(newJobs), nJobs)

        # Do a second time to make sure that the cache
        # doesn't die on us
        baAPI.track()

        newJobs = baAPI._loadByStatus(status='New')
        self.assertEqual(len(newJobs), 0)

        newJobs = baAPI._loadByStatus(status='Idle')
        self.assertEqual(len(newJobs), nJobs)

        baAPI.kill(jobs=jobList)

        nRunning = getCondorRunningJobs(self.user)
        self.assertEqual(nRunning, 0)

        # Try resubmission
        for j in jobList:
            j['retry_count'] = 1

        baAPI.submit(jobs=jobList, info=info)

        nRunning = getCondorRunningJobs(self.user)
        self.assertEqual(nRunning, nJobs)

        newJobs = baAPI._loadByStatus(status='New')
        self.assertEqual(len(newJobs), nJobs)

        # See where they are
        baAPI.track()

        newJobs = baAPI._loadByStatus(status='New')
        self.assertEqual(len(newJobs), 0)

        newJobs = baAPI._loadByStatus(status='Idle')
        self.assertEqual(len(newJobs), nJobs)

        # Now kill 'em manually
        command = ['condor_rm', self.user]
        pipe = Popen(command, stdout=PIPE, stderr=PIPE, shell=False)
        pipe.communicate()

        # See what happened
        baAPI.track()

        newJobs = baAPI._loadByStatus(status='Idle')
        self.assertEqual(len(newJobs), 0)

        # newJobs = baAPI._loadByStatus(status = 'Removed')
        # self.assertEqual(len(newJobs), nJobs)

        # Because removal time is -10.0, jobs should be removed immediately
        baAPI.track()

        # Assert that jobs were listed as completed
        myThread = threading.currentThread()
        newJobs = baAPI._loadByStatus(status='Removed', complete='0')
        self.assertEqual(len(newJobs), nJobs)
        return
Example #9
    def testD_MyProxyDelegation(self):
        """
        _MyProxyDelegation_

        Test whether we can delegate a proxy via myproxy to this job

        IMPORTANT:
        If you are going to run this test you will have to set the serverCert/Key
        config options to point to your local server cert.  You will also have to
        run this job with your DN.  I don't recommend figuring out how to do this
        without knowing what you're doing with regard to proxy handling.
        """

        nRunning = getCondorRunningJobs(self.user)
        self.assertEqual(
            nRunning, 0,
            "User currently has %i running jobs.  Test will not continue" %
            (nRunning))

        # Get the config and set the removal time to -10 for testing
        proxyDir = os.path.join(self.testDir, 'proxyDir')
        os.mkdir(proxyDir)
        config = self.getConfig()
        config.BossAir.removeTime = -10.0
        config.BossAir.pluginNames.append('VanillaCondorPlugin')
        config.BossAir.delegatedServerCert = '/uscms/home/mnorman/.globus/cms-xen39crab3devcert.pem'
        config.BossAir.delegatedServerKey = '/uscms/home/mnorman/.globus/cms-xen39crab3devkey.pem'
        config.BossAir.myproxyServer = 'myproxy.cern.ch'
        config.BossAir.proxyDir = proxyDir
        config.BossAir.delegatedServerHash = 'a6f078516a0beed5dcb31ba866868fa690069f9a'

        userDN = '/DC=org/DC=doegrids/OU=People/CN=Matthew Norman 453632'

        nJobs = 10

        jobDummies = self.createDummyJobs(nJobs=nJobs)

        baAPI = BossAirAPI(config=config)

        jobPackage = os.path.join(self.testDir, 'JobPackage.pkl')
        f = open(jobPackage, 'w')
        f.write(' ')
        f.close()

        sandbox = os.path.join(self.testDir, 'sandbox.box')
        f = open(sandbox, 'w')
        f.write(' ')
        f.close()

        jobList = []
        for j in jobDummies:
            tmpJob = {'id': j['id']}
            tmpJob['custom'] = {'location': 'malpaquet'}
            tmpJob['name'] = j['name']
            tmpJob['cache_dir'] = self.testDir
            tmpJob['retry_count'] = 0
            tmpJob['plugin'] = 'VanillaCondorPlugin'
            tmpJob['owner'] = userDN
            tmpJob['packageDir'] = self.testDir
            tmpJob['sandbox'] = sandbox
            tmpJob['priority'] = None
            jobList.append(tmpJob)

        info = {}
        #info['packageDir'] = self.testDir
        info['index'] = 0
        info['sandbox'] = sandbox

        baAPI.submit(jobs=jobList, info=info)

        proxyFile = os.listdir(proxyDir)[0]
        stdout, stderr = SubprocessAlgos.runCommand(cmd = 'export X509_USER_PROXY=%s; voms-proxy-info' \
                                                    % os.path.join(proxyDir, proxyFile))
        self.assertEqual(
            stdout.split('\n')[0],
            'subject   : %s/CN=proxy/CN=proxy/CN=proxy/CN=proxy' % userDN)

        # Now kill 'em manually
        command = ['condor_rm', self.user]
        SubprocessAlgos.runCommand(cmd=command, shell=False)

        return
示例#12
0
class JobSubmitterPoller(BaseWorkerThread):
    """
    _JobSubmitterPoller_

    The jobSubmitterPoller takes the jobs and organizes them into packages
    before sending them to the individual plugin submitters.
    """

    def __init__(self, config):
        BaseWorkerThread.__init__(self)
        myThread = threading.currentThread()
        self.config = config

        # DAO factory for WMBS objects
        self.daoFactory = DAOFactory(package="WMCore.WMBS", logger=logging, dbinterface=myThread.dbi)

        # Libraries
        self.resourceControl = ResourceControl()
        self.changeState = ChangeState(self.config)
        self.bossAir = BossAirAPI(config=self.config, insertStates=True)

        self.hostName = self.config.Agent.hostName
        self.repollCount = getattr(self.config.JobSubmitter, 'repollCount', 10000)
        self.maxJobsPerPoll = int(getattr(self.config.JobSubmitter, 'maxJobsPerPoll', 1000))
        self.maxJobsToCache = int(getattr(self.config.JobSubmitter, 'maxJobsToCache', 50000))
        self.maxJobsThisCycle = self.maxJobsPerPoll  # changes as per schedd limit
        self.cacheRefreshSize = int(getattr(self.config.JobSubmitter, 'cacheRefreshSize', 30000))
        self.skipRefreshCount = int(getattr(self.config.JobSubmitter, 'skipRefreshCount', 20))
        self.packageSize = getattr(self.config.JobSubmitter, 'packageSize', 500)
        self.collSize = getattr(self.config.JobSubmitter, 'collectionSize', self.packageSize * 1000)
        self.maxTaskPriority = getattr(self.config.BossAir, 'maxTaskPriority', 1e7)
        self.condorFraction = 0.75  # update during every algorithm cycle
        self.condorOverflowFraction = 0.2
        self.ioboundTypes = ('LogCollect', 'Merge', 'Cleanup', 'Harvesting')
        self.drainGracePeriod = getattr(self.config.JobSubmitter, 'drainGraceTime', 2 * 24 * 60 * 60)  # 2 days

        # Used for speed draining the agent
        self.enableAllSites = False

        # Additions for caching-based JobSubmitter
        self.jobsByPrio = {}  # key'ed by the final job priority, which contains a set of job ids
        self.jobDataCache = {}  # key'ed by the job id, containing the whole job info dict
        self.jobsToPackage = {}
        self.locationDict = {}
        self.drainSites = dict()
        self.drainSitesSet = set()
        self.abortSites = set()
        self.refreshPollingCount = 0

        try:
            if not getattr(self.config.JobSubmitter, 'submitDir', None):
                self.config.JobSubmitter.submitDir = self.config.JobSubmitter.componentDir
            self.packageDir = os.path.join(self.config.JobSubmitter.submitDir, 'packages')

            if not os.path.exists(self.packageDir):
                os.makedirs(self.packageDir)
        except OSError as ex:
            msg = "Error while trying to create packageDir %s\n!"
            msg += str(ex)
            logging.error(msg)
            logging.debug("PackageDir: %s", self.packageDir)
            logging.debug("Config: %s", config)
            raise JobSubmitterPollerException(msg)

        # Now the DAOs
        self.listJobsAction = self.daoFactory(classname="Jobs.ListForSubmitter")
        self.setLocationAction = self.daoFactory(classname="Jobs.SetLocation")
        self.locationAction = self.daoFactory(classname="Locations.GetSiteInfo")
        self.setFWJRPathAction = self.daoFactory(classname="Jobs.SetFWJRPath")
        self.listWorkflows = self.daoFactory(classname="Workflow.ListForSubmitter")

        # Keep a record of the thresholds in memory
        self.currentRcThresholds = {}

        self.useReqMgrForCompletionCheck = getattr(self.config.TaskArchiver, 'useReqMgrForCompletionCheck', True)

        if self.useReqMgrForCompletionCheck:
            # only set up this when reqmgr is used (not Tier0)
            self.reqmgr2Svc = ReqMgr(self.config.General.ReqMgr2ServiceURL)
            self.abortedAndForceCompleteWorkflowCache = self.reqmgr2Svc.getAbortedAndForceCompleteRequestsFromMemoryCache()
            self.reqAuxDB = ReqMgrAux(self.config.General.ReqMgr2ServiceURL)
        else:
            # Tier0 Case - just for the clarity (This private variable shouldn't be used
            self.abortedAndForceCompleteWorkflowCache = None

        return

    def getPackageCollection(self, sandboxDir):
        """
        _getPackageCollection_

        Given a jobID figure out which packageCollection
        it should belong in.
        """

        rawList = os.listdir(sandboxDir)
        collections = []
        numberList = []
        for entry in rawList:
            if 'PackageCollection' in entry:
                collections.append(entry)

        # If we have no collections, return 0 (PackageCollection_0)
        if len(collections) < 1:
            return 0

        # Loop over the list of PackageCollections
        for collection in collections:
            collectionPath = os.path.join(sandboxDir, collection)
            packageList = os.listdir(collectionPath)
            collectionNum = int(collection.split('_')[1])
            if len(packageList) < self.collSize:
                return collectionNum
            else:
                numberList.append(collectionNum)

        # If we got here, then all collections are full.  We'll need
        # a new one.  Find the highest number, increment by one
        numberList.sort()
        return numberList[-1] + 1

    def addJobsToPackage(self, loadedJob):
        """
        _addJobsToPackage_

        Add a job to a job package and then return the batch ID for the job.
        Packages are only written out to disk when they contain 100 jobs.  The
        flushJobsPackages() method must be called after all jobs have been added
        to the cache and before they are actually submitted to make sure all the
        job packages have been written to disk.
        """
        if loadedJob["workflow"] not in self.jobsToPackage:
            # First, let's pull all the information from the loadedJob
            batchid = "%s-%s" % (loadedJob["id"], loadedJob["retry_count"])
            sandboxDir = os.path.dirname(loadedJob["sandbox"])

            # Second, assemble the jobPackage location
            collectionIndex = self.getPackageCollection(sandboxDir)
            collectionDir = os.path.join(sandboxDir,
                                         'PackageCollection_%i' % collectionIndex,
                                         'batch_%s' % batchid)

            # Now create the package object
            self.jobsToPackage[loadedJob["workflow"]] = {"batchid": batchid,
                                                         'id': loadedJob['id'],
                                                         "package": JobPackage(directory=collectionDir)}

        jobPackage = self.jobsToPackage[loadedJob["workflow"]]["package"]
        jobPackage[loadedJob["id"]] = loadedJob.getDataStructsJob()
        batchDir = jobPackage['directory']

        if len(jobPackage.keys()) == self.packageSize:
            if not os.path.exists(batchDir):
                os.makedirs(batchDir)

            batchPath = os.path.join(batchDir, "JobPackage.pkl")
            jobPackage.save(batchPath)
            del self.jobsToPackage[loadedJob["workflow"]]

        return batchDir

    def flushJobPackages(self):
        """
        _flushJobPackages_

        Write any jobs packages to disk that haven't been written out already.
        """
        workflowNames = self.jobsToPackage.keys()
        for workflowName in workflowNames:
            jobPackage = self.jobsToPackage[workflowName]["package"]
            batchDir = jobPackage['directory']

            if not os.path.exists(batchDir):
                os.makedirs(batchDir)

            batchPath = os.path.join(batchDir, "JobPackage.pkl")
            jobPackage.save(batchPath)
            del self.jobsToPackage[workflowName]

        return

    def hasToRefreshCache(self):
        """
        _hasToRefreshCache_

        Check whether we should update the job data cache (or update it
        with new jobs in the created state) or if we just skip it.
        """
        if self.cacheRefreshSize == -1 or len(self.jobDataCache) < self.cacheRefreshSize or\
                        self.refreshPollingCount >= self.skipRefreshCount:
            self.refreshPollingCount = 0
            return True
        else:
            self.refreshPollingCount += 1
            logging.info("Skipping cache update to be submitted. (%s job in cache)", len(self.jobDataCache))
        return False

    def refreshCache(self):
        """
        _refreshCache_

        Query WMBS for all jobs in the 'created' state.  For all jobs returned
        from the query, check if they already exist in the cache.  If they
        don't, unpickle them and combine their site white and black list with
        the list of locations they can run at.  Add them to the cache.

        Each entry in the cache is a tuple with five items:
          - WMBS Job ID
          - Retry count
          - Batch ID
          - Path to sanbox
          - Path to cache directory
        """
        # make a counter for jobs pending to sites in drain mode within the grace period
        countDrainingJobs = 0
        timeNow = int(time.time())
        badJobs = dict([(x, []) for x in range(71101, 71105)])
        newJobIds = set()

        logging.info("Refreshing priority cache with currently %i jobs", len(self.jobDataCache))

        newJobs = self.listJobsAction.execute(limitRows=self.maxJobsToCache)
        if self.useReqMgrForCompletionCheck:
            # if reqmgr is used (not Tier0 Agent) get the aborted/forceCompleted record
            abortedAndForceCompleteRequests = self.abortedAndForceCompleteWorkflowCache.getData()
        else:
            abortedAndForceCompleteRequests = []

        logging.info("Found %s new jobs to be submitted.", len(newJobs))

        if self.enableAllSites:
            logging.info("Agent is in speed drain mode. Submitting jobs to all possible locations.")

        logging.info("Determining possible sites for new jobs...")
        jobCount = 0
        for newJob in newJobs:
            jobCount += 1
            if jobCount % 5000 == 0:
                logging.info("Processed %d/%d new jobs.", jobCount, len(newJobs))

            # whether newJob belongs to aborted or force-complete workflow, and skip it if it is.
            if newJob['request_name'] in abortedAndForceCompleteRequests and \
                            newJob['task_type'] not in ['LogCollect', "Cleanup"]:
                continue

            jobID = newJob['id']
            newJobIds.add(jobID)
            if jobID in self.jobDataCache:
                continue

            pickledJobPath = os.path.join(newJob["cache_dir"], "job.pkl")

            if not os.path.isfile(pickledJobPath):
                # Then we have a problem - there's no file
                logging.error("Could not find pickled jobObject %s", pickledJobPath)
                badJobs[71103].append(newJob)
                continue
            try:
                with open(pickledJobPath, 'r') as jobHandle:
                    loadedJob = pickle.load(jobHandle)
            except Exception as ex:
                msg = "Error while loading pickled job object %s\n" % pickledJobPath
                msg += str(ex)
                logging.error(msg)
                raise JobSubmitterPollerException(msg)

            # figure out possible locations for job
            possibleLocations = loadedJob["possiblePSN"]

            # Create another set of locations that may change when a site goes white/black listed
            # Does not care about the non_draining or aborted sites, they may change and that is the point
            potentialLocations = set()
            potentialLocations.update(possibleLocations)

            # check if there is at least one site left to run the job
            if len(possibleLocations) == 0:
                newJob['fileLocations'] = loadedJob.get('fileLocations', [])
                newJob['siteWhitelist'] = loadedJob.get('siteWhitelist', [])
                newJob['siteBlacklist'] = loadedJob.get('siteBlacklist', [])
                logging.warning("Input data location doesn't pass the site restrictions for job id: %s", jobID)
                badJobs[71101].append(newJob)
                continue

            # if agent is in speed drain and has hit the threshold to submit to all sites, we can skip the logic below that exclude sites
            if not self.enableAllSites:
                # check for sites in aborted state and adjust the possible locations
                nonAbortSites = [x for x in possibleLocations if x not in self.abortSites]
                if nonAbortSites:  # if there is at least a non aborted/down site then run there, otherwise fail the job
                    possibleLocations = nonAbortSites
                else:
                    newJob['possibleSites'] = possibleLocations
                    logging.warning("Job id %s can only run at a site in Aborted state", jobID)
                    badJobs[71102].append(newJob)
                    continue

                # try to remove draining sites if possible, this is needed to stop
                # jobs that could run anywhere blocking draining sites
                # if the job type is Merge, LogCollect or Cleanup this is skipped
                if newJob['task_type'] not in self.ioboundTypes:
                    nonDrainingSites = [x for x in possibleLocations if x not in self.drainSites]
                    if nonDrainingSites:  # if >1 viable non-draining site remove draining ones
                        possibleLocations = nonDrainingSites
                    elif self.failJobDrain(timeNow, possibleLocations):
                        newJob['possibleSites'] = possibleLocations
                        logging.warning("Job id %s can only run at a sites in Draining state", jobID)
                        badJobs[71104].append(newJob)
                        continue
                    else:
                        countDrainingJobs += 1
                        continue

            # Sigh...make sure the job added to the package has the proper retry_count
            loadedJob['retry_count'] = newJob['retry_count']
            batchDir = self.addJobsToPackage(loadedJob)

            # calculate the final job priority such that we can order cached jobs by prio
            jobPrio = newJob['task_prio'] * self.maxTaskPriority + newJob['wf_priority']
            self.jobsByPrio.setdefault(jobPrio, set())
            self.jobsByPrio[jobPrio].add(jobID)

            # allow job baggage to override numberOfCores
            #       => used for repacking to get more slots/disk
            numberOfCores = loadedJob.get('numberOfCores', 1)
            if numberOfCores == 1:
                baggage = loadedJob.getBaggage()
                numberOfCores = getattr(baggage, "numberOfCores", 1)
            loadedJob['numberOfCores'] = numberOfCores

            # Create a job dictionary object and put it in the cache (needs to be in sync with RunJob)
            jobInfo = {'taskPriority': newJob['task_prio'],
                       'custom': {'location': None},  # update later
                       'packageDir': batchDir,
                       'retry_count': newJob["retry_count"],
                       'sandbox': loadedJob["sandbox"],  # remove before submit
                       'userdn': loadedJob.get("ownerDN", None),
                       'usergroup': loadedJob.get("ownerGroup", ''),
                       'userrole': loadedJob.get("ownerRole", ''),
                       'possibleSites': frozenset(possibleLocations),  # abort and drain sites filtered out
                       'potentialSites': frozenset(potentialLocations),  # original list of sites
                       'scramArch': loadedJob.get("scramArch", None),
                       'swVersion': loadedJob.get("swVersion", []),
                       'proxyPath': loadedJob.get("proxyPath", None),
                       'estimatedJobTime': loadedJob.get("estimatedJobTime", None),
                       'estimatedDiskUsage': loadedJob.get("estimatedDiskUsage", None),
                       'estimatedMemoryUsage': loadedJob.get("estimatedMemoryUsage", None),
                       'numberOfCores': loadedJob.get("numberOfCores"),  # may update it later
                       'inputDataset': loadedJob.get('inputDataset', None),
                       'inputDatasetLocations': loadedJob.get('inputDatasetLocations', None),
                       'inputPileup': loadedJob.get('inputPileup', None),
                       'allowOpportunistic': loadedJob.get('allowOpportunistic', False)}
            # then update it with the info retrieved from the database
            jobInfo.update(newJob)

            self.jobDataCache[jobID] = jobInfo

        # Register failures in submission
        for errorCode in badJobs:
            if badJobs[errorCode]:
                logging.debug("The following jobs could not be submitted: %s, error code : %d", badJobs, errorCode)
                self._handleSubmitFailedJobs(badJobs[errorCode], errorCode)

        # Persist remaining job packages to disk
        self.flushJobPackages()

        # We need to remove any jobs from the cache that were not returned in
        # the last call to the database.
        jobIDsToPurge = set(self.jobDataCache.keys()) - newJobIds
        self._purgeJobsFromCache(jobIDsToPurge)

        logging.info("Found %d jobs pending to sites in drain within the grace period", countDrainingJobs)
        logging.info("Done pruning killed jobs, moving on to submit.")
        return

    def failJobDrain(self, timeNow, possibleLocations):
        """
        Check whether sites are in drain for too long such that the job
        has to be marked as failed or not.
        :param timeNow: timestamp for this cycle
        :param possibleLocations: list of possible locations where the job can run
        :return: a boolean saying whether the job has to fail or not
        """
        fail = True
        for siteName in set(possibleLocations).union(self.drainSitesSet):
            if timeNow - self.drainSites[siteName] < self.drainGracePeriod:
                # then let this job be, it's a fresh draining site
                fail = False
                break
        return fail

    def removeAbortedForceCompletedWorkflowFromCache(self):
        abortedAndForceCompleteRequests = self.abortedAndForceCompleteWorkflowCache.getData()
        jobIDsToPurge = set()
        for jobID, jobInfo in self.jobDataCache.iteritems():
            if (jobInfo['request_name'] in abortedAndForceCompleteRequests) and \
                    (jobInfo['task_type'] not in ['LogCollect', "Cleanup"]):
                jobIDsToPurge.add(jobID)
        self._purgeJobsFromCache(jobIDsToPurge)
        return

    def _purgeJobsFromCache(self, jobIDsToPurge):

        if len(jobIDsToPurge) == 0:
            return

        for jobid in jobIDsToPurge:
            self.jobDataCache.pop(jobid, None)
            for jobPrio in self.jobsByPrio:
                if jobid in self.jobsByPrio[jobPrio]:
                    # then the jobid was found, go to the next one
                    self.jobsByPrio[jobPrio].discard(jobid)
                    break
        return

    def _handleSubmitFailedJobs(self, badJobs, exitCode):
        """
        __handleSubmitFailedJobs_

        For a default job report for the exitCode
        and register in the job. Preserve it on disk as well.
        Propagate the failure to the JobStateMachine.
        """
        fwjrBinds = []
        for job in badJobs:
            job['couch_record'] = None
            job['fwjr'] = Report()
            if exitCode in [71102, 71104]:
                job['fwjr'].addError("JobSubmit", exitCode, "SubmitFailed",
                                     WM_JOB_ERROR_CODES[exitCode] + ', '.join(job['possibleSites']),
                                     ', '.join(job['possibleSites']))
            elif exitCode in [71101]:
                # there is no possible site
                if job.get("fileLocations"):
                    job['fwjr'].addError("JobSubmit", exitCode, "SubmitFailed", WM_JOB_ERROR_CODES[exitCode] +
                                         ": file locations: " + ', '.join(job['fileLocations']) +
                                         ": site white list: " + ', '.join(job['siteWhitelist']) +
                                         ": site black list: " + ', '.join(job['siteBlacklist']))
                else:
                    job['fwjr'].addError("JobSubmit", exitCode, "SubmitFailed",
                                         WM_JOB_ERROR_CODES[exitCode] + ', and empty fileLocations')

            else:
                job['fwjr'].addError("JobSubmit", exitCode, "SubmitFailed", WM_JOB_ERROR_CODES[exitCode])

            fwjrPath = os.path.join(job['cache_dir'], 'Report.%d.pkl' % int(job['retry_count']))
            job['fwjr'].setJobID(job['id'])
            try:
                job['fwjr'].save(fwjrPath)
                fwjrBinds.append({"jobid": job["id"], "fwjrpath": fwjrPath})
            except IOError as ioer:
                logging.error("Failed to write FWJR for submit failed job %d, message: %s", job['id'], str(ioer))
        self.changeState.propagate(badJobs, "submitfailed", "created")
        self.setFWJRPathAction.execute(binds=fwjrBinds)
        return

    def getThresholds(self):
        """
        _getThresholds_

        Retrieve submit thresholds, which considers what is pending and running
        for those sites.
        Also update the list of draining and abort/down sites.
        Finally, creates a map between task type and its priority.
        """
        # lets store also a timestamp for when a site joined the Drain state
        newDrainSites = dict()
        newAbortSites = set()

        rcThresholds = self.resourceControl.listThresholdsForSubmit()

        for siteName in rcThresholds.keys():
            # Add threshold if we don't have it already
            state = rcThresholds[siteName]["state"]

            if state == "Draining":
                newDrainSites.update({siteName: rcThresholds[siteName]["state_time"]})
            if state in ["Down", "Aborted"]:
                newAbortSites.add(siteName)

        # When the list of drain/abort sites change between iteration then a location
        # refresh is needed, for now it forces a full cache refresh
        if set(newDrainSites.keys()) != self.drainSitesSet or newAbortSites != self.abortSites:
            logging.info("Draining or Aborted sites have changed, the cache will be rebuilt.")
            self.jobsByPrio = {}
            self.jobDataCache = {}

        self.currentRcThresholds = rcThresholds
        self.abortSites = newAbortSites
        self.drainSites = newDrainSites
        self.drainSitesSet = set(newDrainSites.keys())

        return

    def checkZeroTaskThresholds(self, jobType, siteList):
        """
        _checkZeroTaskThresholds_

        Given a job type and a list of sites, remove sites from the list
        if that site + task has 0 pending thresholds.
        Returns a new list of sites
        """
        newSiteList = []
        for site in siteList:
            try:
                taskPendingSlots = self.currentRcThresholds[site]['thresholds'][jobType]["pending_slots"]
            except KeyError as ex:
                msg = "Invalid key for site %s and job type %s. Error: %s" % (site, jobType, str(ex))
                logging.warning(msg)
            else:
                if taskPendingSlots > 0:
                    newSiteList.append(site)

        return newSiteList

    def _getJobSubmitCondition(self, jobPrio, siteName, jobType):
        """
        returns the string describing whether a job is ready to be submitted or the reason can't be submitted
        Only jobs with "JobSubmitReady" return value will be added to submit job.
        Other return values will indicate the reason jobs cannot be submitted.
        i.e. "NoPendingSlot"  - pending slot is full with pending job
        """
        try:
            totalPendingSlots = self.currentRcThresholds[siteName]["total_pending_slots"]
            totalPendingJobs = self.currentRcThresholds[siteName]["total_pending_jobs"]
            totalRunningSlots = self.currentRcThresholds[siteName]["total_running_slots"]
            totalRunningJobs = self.currentRcThresholds[siteName]["total_running_jobs"]

            taskPendingSlots = self.currentRcThresholds[siteName]['thresholds'][jobType]["pending_slots"]
            taskPendingJobs = self.currentRcThresholds[siteName]['thresholds'][jobType]["task_pending_jobs"]
            taskRunningSlots = self.currentRcThresholds[siteName]['thresholds'][jobType]["max_slots"]
            taskRunningJobs = self.currentRcThresholds[siteName]['thresholds'][jobType]["task_running_jobs"]
            highestPriorityInJobs = self.currentRcThresholds[siteName]['thresholds'][jobType]['wf_highest_priority']

            # set the initial totalPendingJobs since it increases in every cycle when a job is submitted
            self.currentRcThresholds[siteName].setdefault("init_total_pending_jobs", totalPendingJobs)

            # set the initial taskPendingJobs since it increases in every cycle when a job is submitted
            self.currentRcThresholds[siteName]['thresholds'][jobType].setdefault("init_task_pending_jobs",
                                                                                 taskPendingJobs)

            initTotalPending = self.currentRcThresholds[siteName]["init_total_pending_jobs"]
            initTaskPending = self.currentRcThresholds[siteName]['thresholds'][jobType]["init_task_pending_jobs"]

        except KeyError as ex:
            msg = "Invalid key for site %s and job type %s\n" % (siteName, jobType)
            msg += str(ex)
            logging.exception(msg)
            return "NoJobType_%s_%s" % (siteName, jobType)

        if (highestPriorityInJobs is None) or (jobPrio <= highestPriorityInJobs) or (jobType in self.ioboundTypes):
            # there is no pending or running jobs in the system (None case) or
            # priority of the job is lower or equal don't allow overflow
            # Also if jobType is in ioboundTypes don't allow overflow
            totalPendingThreshold = totalPendingSlots
            taskPendingThreshold = taskPendingSlots
            totalJobThreshold = totalPendingSlots + totalRunningSlots
            totalTaskTheshold = taskPendingSlots + taskRunningSlots
        else:
            # In case the priority of the job is higher than any of currently pending or running jobs.
            # Then increase the threshold by condorOverflowFraction * original pending slot.
            totalPendingThreshold = max(totalPendingSlots, initTotalPending) + (
                totalPendingSlots * self.condorOverflowFraction)
            taskPendingThreshold = max(taskPendingSlots, initTaskPending) + (
                taskPendingSlots * self.condorOverflowFraction)
            totalJobThreshold = totalPendingThreshold + totalRunningSlots
            totalTaskTheshold = taskPendingThreshold + taskRunningSlots

        jobStats = [{"Condition": "NoPendingSlot",
                     "Current": totalPendingJobs,
                     "Threshold": totalPendingThreshold},
                    {"Condition": "NoTaskPendingSlot",
                     "Current": taskPendingJobs,
                     "Threshold": taskPendingThreshold},
                    {"Condition": "NoRunningSlot",
                     "Current": totalPendingJobs + totalRunningJobs,
                     "Threshold": totalJobThreshold},
                    {"Condition": "NoTaskRunningSlot",
                     "Current": taskPendingJobs + taskRunningJobs,
                     "Threshold": totalTaskTheshold}]
        return jobSubmitCondition(jobStats)

    def assignJobLocations(self):
        """
        _assignJobLocations_

        Loop through the submit thresholds and pull sites out of the job cache
        as we discover open slots.  This will return a list of tuple where each
        tuple will have six elements:
          - WMBS Job ID
          - Retry count
          - Batch ID
          - Path to sanbox
          - Path to cache directory
          - SE name of the site to run at
        """
        jobsToSubmit = {}
        jobsCount = 0
        exitLoop = False
        jobSubmitLogBySites = defaultdict(lambda: defaultdict(Counter))
        jobSubmitLogByPriority = defaultdict(lambda: defaultdict(Counter))

        # iterate over jobs from the highest to the lowest prio
        for jobPrio in sorted(self.jobsByPrio, reverse=True):

            # then we're completely done and have our basket full of jobs to submit
            if exitLoop:
                break

            # can we assume jobid=1 is older than jobid=3? I think so...
            for jobid in sorted(self.jobsByPrio[jobPrio]):
                jobType = self.jobDataCache[jobid]['task_type']
                possibleSites = self.jobDataCache[jobid]['possibleSites']
                # remove sites with 0 task thresholds
                possibleSites = self.checkZeroTaskThresholds(jobType, possibleSites)
                jobSubmitLogByPriority[jobPrio][jobType]['Total'] += 1
                # now look for sites with free pending slots
                for siteName in possibleSites:
                    condition = self._getJobSubmitCondition(jobPrio, siteName, jobType)
                    if condition != "JobSubmitReady":
                        jobSubmitLogBySites[siteName][jobType][condition] += 1
                        logging.debug("Found a job for %s : %s", siteName, condition)
                        continue

                    # pop the job dictionary object and update it
                    cachedJob = self.jobDataCache.pop(jobid)
                    cachedJob['custom'] = {'location': siteName}
                    cachedJob['possibleSites'] = possibleSites

                    # Sort jobs by jobPackage and get it in place to be submitted by the plugin
                    package = cachedJob['packageDir']
                    jobsToSubmit.setdefault(package, [])
                    jobsToSubmit[package].append(cachedJob)

                    # update site/task thresholds and the component job counter
                    self.currentRcThresholds[siteName]["total_pending_jobs"] += 1
                    self.currentRcThresholds[siteName]['thresholds'][jobType]["task_pending_jobs"] += 1
                    jobsCount += 1
                    jobSubmitLogBySites[siteName][jobType]["submitted"] += 1
                    jobSubmitLogByPriority[jobPrio][jobType]['submitted'] += 1

                    # jobs that will be submitted must leave the job data cache
                    self.jobsByPrio[jobPrio].discard(jobid)

                    # found a site to submit this job, so go to the next job
                    break

                # set the flag and get out of the job iteration
                if jobsCount >= self.maxJobsThisCycle:
                    logging.info("Submitter reached limit of submit slots for this cycle: %i", self.maxJobsThisCycle)
                    exitLoop = True
                    break

        logging.info("Site submission report ...")
        for site in jobSubmitLogBySites:
            logging.info("    %s : %s", site, json.dumps(jobSubmitLogBySites[site]))
        logging.info("Priority submission report ...")
        for prio in jobSubmitLogByPriority:
            logging.info("    %s : %s", prio, json.dumps(jobSubmitLogByPriority[prio]))
        logging.info("Have %s packages to submit.", len(jobsToSubmit))
        logging.info("Have %s jobs to submit.", jobsCount)
        logging.info("Done assigning site locations.")
        return jobsToSubmit

    def submitJobs(self, jobsToSubmit):
        """
        _submitJobs_

        Actually do the submission of the jobs
        """

        jobList = []
        idList = []

        if len(jobsToSubmit) == 0:
            logging.debug("There are no packages to submit.")
            return

        for package in jobsToSubmit.keys():
            jobs = jobsToSubmit.get(package, [])
            for job in jobs:
                job['location'], job['plugin'], job['site_cms_name'] = self.getSiteInfo(job['custom']['location'])
                idList.append({'jobid': job['id'], 'location': job['custom']['location']})

            jobList.extend(jobs)

        myThread = threading.currentThread()
        myThread.transaction.begin()

        # Run the actual underlying submit code using bossAir
        successList, failList = self.bossAir.submit(jobs=jobList)
        logging.info("Jobs that succeeded/failed submission: %d/%d.", len(successList), len(failList))

        # Propagate states in the WMBS database
        logging.debug("Propagating success state to WMBS.")
        self.changeState.propagate(successList, 'executing', 'created')
        logging.debug("Propagating fail state to WMBS.")
        self.changeState.propagate(failList, 'submitfailed', 'created')
        # At the end we mark the locations of the jobs
        # This applies even to failed jobs, since the location
        # could be part of the failure reason.
        logging.debug("Updating job location...")
        self.setLocationAction.execute(bulkList=idList, conn=myThread.transaction.conn,
                                       transaction=True)
        myThread.transaction.commit()
        logging.info("Transaction cycle successfully completed.")

        return

    def getSiteInfo(self, jobSite):
        """
        _getSiteInfo_

        This is how you get the name of a CE and the plugin for a job
        """

        if jobSite not in self.locationDict.keys():
            siteInfo = self.locationAction.execute(siteName=jobSite)
            self.locationDict[jobSite] = siteInfo[0]
        return (self.locationDict[jobSite].get('ce_name'),
                self.locationDict[jobSite].get('plugin'),
                self.locationDict[jobSite].get('cms_name'))

    @timeFunction
    def algorithm(self, parameters=None):
        """
        _algorithm_

        Try to, in order:
        1) Refresh the cache
        2) Find jobs for all the necessary sites
        3) Submit the jobs to the plugin
        """
        myThread = threading.currentThread()

        if self.useReqMgrForCompletionCheck:
            # only runs when reqmgr is used (not Tier0)
            self.removeAbortedForceCompletedWorkflowFromCache()
            agentConfig = self.reqAuxDB.getWMAgentConfig(self.config.Agent.hostName)
            if agentConfig.get("UserDrainMode") and agentConfig.get("SpeedDrainMode"):
                self.enableAllSites = agentConfig.get("SpeedDrainConfig")['EnableAllSites']['Enabled']
            else:
                self.enableAllSites = False
            self.condorFraction = agentConfig.get('CondorJobsFraction', 0.75)
            self.condorOverflowFraction = agentConfig.get("CondorOverflowFraction", 0.2)
        else:
            # For Tier0 agent
            self.condorFraction = 1
            self.condorOverflowFraction = 0

        if not self.passSubmitConditions():
            msg = "JobSubmitter didn't pass the submit conditions. Skipping this cycle."
            logging.warning(msg)
            myThread.logdbClient.post("JobSubmitter_submitWork", msg, "warning")
            return

        try:
            myThread.logdbClient.delete("JobSubmitter_submitWork", "warning", this_thread=True)
            self.getThresholds()
            if self.hasToRefreshCache():
                self.refreshCache()

            jobsToSubmit = self.assignJobLocations()
            self.submitJobs(jobsToSubmit=jobsToSubmit)
        except WMException:
            if getattr(myThread, 'transaction', None) is not None:
                myThread.transaction.rollback()
            raise
        except Exception as ex:
            msg = 'Fatal error in JobSubmitter:\n'
            msg += str(ex)
            # msg += str(traceback.format_exc())
            msg += '\n\n'
            logging.error(msg)
            if getattr(myThread, 'transaction', None) is not None:
                myThread.transaction.rollback()
            raise JobSubmitterPollerException(msg)

        return

    def passSubmitConditions(self):
        """
        _passSubmitConditions_

        Check whether the component is allowed to submit jobs to condor.

        Initially it has only one condition, which is the total number of
        jobs we can have in condor (pending + running) per schedd, set by
        MAX_JOBS_PER_OWNER.
        """

        myThread = threading.currentThread()
        freeSubmitSlots = availableScheddSlots(dbi=myThread.dbi, logger=logging,
                                               condorFraction=self.condorFraction)
        self.maxJobsThisCycle = min(freeSubmitSlots, self.maxJobsPerPoll)

        return (self.maxJobsThisCycle > 0)

    def terminate(self, params):
        """
        _terminate_

        Kill the code after one final pass when called by the master thread.
        """
        logging.debug("terminating. doing one more pass before we die")
        self.algorithm(params)
示例#13
0
class JobSubmitterPoller(BaseWorkerThread):
    """
    _JobSubmitterPoller_

    The jobSubmitterPoller takes the jobs and organizes them into packages
    before sending them to the individual plugin submitters.
    """
    def __init__(self, config):
        BaseWorkerThread.__init__(self)
        myThread = threading.currentThread()

        #DAO factory for WMBS objects
        self.daoFactory = DAOFactory(package="WMCore.WMBS", \
                                     logger=logging,
                                     dbinterface=myThread.dbi)
        self.config = config

        #Libraries
        self.resourceControl = ResourceControl()

        self.changeState = ChangeState(self.config)
        self.repollCount = getattr(self.config.JobSubmitter, 'repollCount',
                                   10000)
        self.maxJobsPerPoll = int(
            getattr(self.config.JobSubmitter, 'maxJobsPerPoll', 1000))
        self.cacheRefreshSize = int(
            getattr(self.config.JobSubmitter, 'cacheRefreshSize', 30000))
        self.skipRefreshCount = int(
            getattr(self.config.JobSubmitter, 'skipRefreshCount', 20))
        self.refreshPollingCount = 0

        # BossAir
        self.bossAir = BossAirAPI(config=self.config)

        # Additions for caching-based JobSubmitter
        self.workflowTimestamps = {}
        self.workflowPrios = {}
        self.cachedJobIDs = set()
        self.cachedJobs = {}
        self.jobDataCache = {}
        self.jobsToPackage = {}
        self.sandboxPackage = {}
        self.siteKeys = {}
        self.locationDict = {}
        self.drainSites = set()
        self.abortSites = set()
        self.sortedSites = []
        self.packageSize = getattr(self.config.JobSubmitter, 'packageSize',
                                   500)
        self.collSize = getattr(self.config.JobSubmitter, 'collectionSize',
                                self.packageSize * 1000)

        # initialize the alert framework (if available)
        self.initAlerts(compName="JobSubmitter")

        try:
            if not getattr(self.config.JobSubmitter, 'submitDir', None):
                self.config.JobSubmitter.submitDir = self.config.JobSubmitter.componentDir
            self.packageDir = os.path.join(self.config.JobSubmitter.submitDir,
                                           'packages')

            if not os.path.exists(self.packageDir):
                os.makedirs(self.packageDir)
        except Exception as ex:
            msg = "Error while trying to create packageDir %s\n!"
            msg += str(ex)
            logging.error(msg)
            self.sendAlert(6, msg=msg)
            try:
                logging.debug("PackageDir: %s", self.packageDir)
                logging.debug("Config: %s", config)
            except:
                pass
            raise JobSubmitterPollerException(msg)

        # Now the DAOs
        self.listJobsAction = self.daoFactory(
            classname="Jobs.ListForSubmitter")
        self.setLocationAction = self.daoFactory(classname="Jobs.SetLocation")
        self.locationAction = self.daoFactory(
            classname="Locations.GetSiteInfo")
        self.setFWJRPathAction = self.daoFactory(classname="Jobs.SetFWJRPath")
        self.listWorkflows = self.daoFactory(
            classname="Workflow.ListForSubmitter")

        # Keep a record of the thresholds in memory
        self.currentRcThresholds = {}

        return

    def getPackageCollection(self, sandboxDir):
        """
        _getPackageCollection_

        Given a jobID figure out which packageCollection
        it should belong in.
        """

        rawList = os.listdir(sandboxDir)
        collections = []
        numberList = []
        for entry in rawList:
            if 'PackageCollection' in entry:
                collections.append(entry)

        # If we have no collections, return 0 (PackageCollection_0)
        if len(collections) < 1:
            return 0

        # Loop over the list of PackageCollections
        for collection in collections:
            collectionPath = os.path.join(sandboxDir, collection)
            packageList = os.listdir(collectionPath)
            collectionNum = int(collection.split('_')[1])
            if len(packageList) < self.collSize:
                return collectionNum
            else:
                numberList.append(collectionNum)

        # If we got here, then all collections are full.  We'll need
        # a new one.  Find the highest number, increment by one
        numberList.sort()
        return numberList[-1] + 1

    def addJobsToPackage(self, loadedJob):
        """
        _addJobsToPackage_

        Add a job to a job package and then return the batch ID for the job.
        Packages are only written out to disk when they contain 100 jobs.  The
        flushJobsPackages() method must be called after all jobs have been added
        to the cache and before they are actually submitted to make sure all the
        job packages have been written to disk.
        """
        if loadedJob["workflow"] not in self.jobsToPackage:
            # First, let's pull all the information from the loadedJob
            batchid = "%s-%s" % (loadedJob["id"], loadedJob["retry_count"])
            sandboxDir = os.path.dirname(loadedJob["sandbox"])

            # Second, assemble the jobPackage location
            collectionIndex = self.getPackageCollection(sandboxDir)
            collectionDir = os.path.join(
                sandboxDir, 'PackageCollection_%i' % collectionIndex,
                'batch_%s' % batchid)

            # Now create the package object
            self.jobsToPackage[loadedJob["workflow"]] = {
                "batchid": batchid,
                'id': loadedJob['id'],
                "package": JobPackage(directory=collectionDir)
            }

        jobPackage = self.jobsToPackage[loadedJob["workflow"]]["package"]
        jobPackage[loadedJob["id"]] = loadedJob.getDataStructsJob()
        batchDir = jobPackage['directory']

        if len(jobPackage.keys()) == self.packageSize:
            if not os.path.exists(batchDir):
                os.makedirs(batchDir)

            batchPath = os.path.join(batchDir, "JobPackage.pkl")
            jobPackage.save(batchPath)
            del self.jobsToPackage[loadedJob["workflow"]]

        return batchDir

    def flushJobPackages(self):
        """
        _flushJobPackages_

        Write any jobs packages to disk that haven't been written out already.
        """
        workflowNames = self.jobsToPackage.keys()
        for workflowName in workflowNames:
            jobPackage = self.jobsToPackage[workflowName]["package"]
            batchDir = jobPackage['directory']

            if not os.path.exists(batchDir):
                os.makedirs(batchDir)

            batchPath = os.path.join(batchDir, "JobPackage.pkl")
            jobPackage.save(batchPath)
            del self.jobsToPackage[workflowName]

        return

    def refreshCache(self):
        """
        _refreshCache_

        Query WMBS for all jobs in the 'created' state.  For all jobs returned
        from the query, check if they already exist in the cache.  If they
        don't, unpickle them and combine their site white and black list with
        the list of locations they can run at.  Add them to the cache.

        Each entry in the cache is a tuple with five items:
          - WMBS Job ID
          - Retry count
          - Batch ID
          - Path to sanbox
          - Path to cache directory
        """
        badJobs = dict([(x, []) for x in range(71101, 71105)])
        dbJobs = set()

        logging.info("Refreshing priority cache...")
        workflows = self.listWorkflows.execute()
        workflows = filter(lambda x: x['name'] in self.workflowPrios,
                           workflows)
        for workflow in workflows:
            self.workflowPrios[workflow['name']] = workflow['priority']

        logging.info("Querying WMBS for jobs to be submitted...")
        logging.info("cachedJobIDs : %s" % len(self.cachedJobIDs))
        if self.cacheRefreshSize == -1 or len(self.cachedJobIDs) < self.cacheRefreshSize or \
           self.refreshPollingCount >= self.skipRefreshCount:
            newJobs = self.listJobsAction.execute()
            self.refreshPollingCount = 0
            logging.info("Found %s new jobs to be submitted.", len(newJobs))
        else:
            self.refreshPollingCount += 1
            newJobs = []
            dbJobs = self.cachedJobIDs
            logging.info(
                "Skipping cache update to be submitted. (%s job in cache)" %
                len(dbJobs))

        logging.info("Determining possible sites for new jobs...")
        jobCount = 0
        for newJob in newJobs:
            jobID = newJob['id']
            dbJobs.add(jobID)
            if jobID in self.cachedJobIDs:
                continue

            jobCount += 1
            if jobCount % 5000 == 0:
                logging.info("Processed %d/%d new jobs.", jobCount,
                             len(newJobs))

            pickledJobPath = os.path.join(newJob["cache_dir"], "job.pkl")

            if not os.path.isfile(pickledJobPath):
                # Then we have a problem - there's no file
                logging.error("Could not find pickled jobObject %s",
                              pickledJobPath)
                badJobs[71103].append(newJob)
                continue
            try:
                jobHandle = open(pickledJobPath, "r")
                loadedJob = cPickle.load(jobHandle)
                jobHandle.close()
            except Exception as ex:
                msg = "Error while loading pickled job object %s\n" % pickledJobPath
                msg += str(ex)
                logging.error(msg)
                self.sendAlert(6, msg=msg)
                raise JobSubmitterPollerException(msg)

            loadedJob['retry_count'] = newJob['retry_count']

            # figure out possible locations for job
            possibleLocations = loadedJob["possiblePSN"]

            # Create another set of locations that may change when a site goes white/black listed
            # Does not care about the non_draining or aborted sites, they may change and that is the point
            potentialLocations = set()
            potentialLocations.update(possibleLocations)

            # now check for sites in drain and adjust the possible locations
            # also check if there is at least one site left to run the job
            if len(possibleLocations) == 0:
                newJob['name'] = loadedJob['name']
                badJobs[71101].append(newJob)
                continue
            else:
                nonAbortSites = [
                    x for x in possibleLocations if x not in self.abortSites
                ]
                if nonAbortSites:  # if there is at least a non aborted/down site then run there, otherwise fail the job
                    possibleLocations = nonAbortSites
                else:
                    newJob['name'] = loadedJob['name']
                    newJob['possibleLocations'] = possibleLocations
                    badJobs[71102].append(newJob)
                    continue

            # try to remove draining sites if possible, this is needed to stop
            # jobs that could run anywhere blocking draining sites
            # if the job type is Merge, LogCollect or Cleanup this is skipped
            if newJob['type'] not in ('LogCollect', 'Merge', 'Cleanup',
                                      'Harvesting'):
                nonDrainingSites = [
                    x for x in possibleLocations if x not in self.drainSites
                ]
                if nonDrainingSites:  # if >1 viable non-draining site remove draining ones
                    possibleLocations = nonDrainingSites
                else:
                    newJob['name'] = loadedJob['name']
                    newJob['possibleLocations'] = possibleLocations
                    badJobs[71104].append(newJob)
                    continue

            batchDir = self.addJobsToPackage(loadedJob)
            self.cachedJobIDs.add(jobID)

            for possibleLocation in possibleLocations:
                if possibleLocation not in self.cachedJobs:
                    self.cachedJobs[possibleLocation] = {}
                if newJob["type"] not in self.cachedJobs[possibleLocation]:
                    self.cachedJobs[possibleLocation][newJob["type"]] = {}

                locTypeCache = self.cachedJobs[possibleLocation][
                    newJob["type"]]
                workflowName = newJob['workflow']
                timestamp = newJob['timestamp']
                prio = newJob['task_priority']
                if workflowName not in locTypeCache:
                    locTypeCache[workflowName] = set()
                if workflowName not in self.jobDataCache:
                    self.jobDataCache[workflowName] = {}
                if not workflowName in self.workflowTimestamps:
                    self.workflowTimestamps[workflowName] = timestamp
                if workflowName not in self.workflowPrios:
                    self.workflowPrios[workflowName] = prio

                locTypeCache[workflowName].add(jobID)

            # allow job baggage to override numberOfCores
            #       => used for repacking to get more slots/disk
            numberOfCores = loadedJob.get('numberOfCores', 1)
            if numberOfCores == 1:
                baggage = loadedJob.getBaggage()
                numberOfCores = getattr(baggage, "numberOfCores", 1)
            loadedJob['numberOfCores'] = numberOfCores

            # check if job should be submitted as highIO job
            highIOjob = False
            if newJob['type'] in ["Repack", "Merge", "Cleanup", "LogCollect"]:
                highIOjob = True

            # Now that we're out of that loop, put the job data in the cache
            jobInfo = (jobID, newJob["retry_count"], batchDir,
                       loadedJob["sandbox"], loadedJob["cache_dir"],
                       loadedJob.get("ownerDN",
                                     None), loadedJob.get("ownerGroup", ''),
                       loadedJob.get("ownerRole",
                                     ''), frozenset(possibleLocations),
                       loadedJob.get("scramArch", None),
                       loadedJob.get("swVersion", None), loadedJob["name"],
                       loadedJob.get("proxyPath",
                                     None), newJob['request_name'],
                       loadedJob.get("estimatedJobTime", None),
                       loadedJob.get("estimatedDiskUsage", None),
                       loadedJob.get("estimatedMemoryUsage",
                                     None), newJob['task_name'],
                       frozenset(potentialLocations),
                       loadedJob.get("numberOfCores", 1), newJob['task_id'],
                       loadedJob.get('inputDataset', None),
                       loadedJob.get('inputDatasetLocations', None),
                       loadedJob.get('allowOpportunistic', False), highIOjob)

            self.jobDataCache[workflowName][jobID] = jobInfo

        # Register failures in submission
        for errorCode in badJobs:
            if badJobs[errorCode]:
                logging.debug(
                    "The following jobs could not be submitted: %s, error code : %d",
                    badJobs, errorCode)
                self._handleSubmitFailedJobs(badJobs[errorCode], errorCode)

        # If there are any leftover jobs, we want to get rid of them.
        self.flushJobPackages()
        logging.info("Done with refreshCache() loop, pruning killed jobs.")

        # We need to remove any jobs from the cache that were not returned in
        # the last call to the database.
        jobIDsToPurge = self.cachedJobIDs - dbJobs
        self.cachedJobIDs -= jobIDsToPurge

        if len(jobIDsToPurge) == 0:
            return

        for siteName in self.cachedJobs.keys():
            for taskType in self.cachedJobs[siteName].keys():
                for workflow in self.cachedJobs[siteName][taskType].keys():
                    for cachedJobID in list(
                            self.cachedJobs[siteName][taskType][workflow]):
                        if cachedJobID in jobIDsToPurge:
                            self.cachedJobs[siteName][taskType][
                                workflow].remove(cachedJobID)
                            try:
                                del self.jobDataCache[workflow][cachedJobID]
                            except KeyError:
                                # Already gone
                                pass
        logging.info("Done pruning killed jobs, moving on to submit.")
        return

    def _handleSubmitFailedJobs(self, badJobs, exitCode):
        """
        __handleSubmitFailedJobs_

        Form a default job report for the exitCode
        and register it in the job. Preserve it on disk as well.
        Propagate the failure to the JobStateMachine.
        """
        fwjrBinds = []
        for job in badJobs:
            job['couch_record'] = None
            job['fwjr'] = Report()
            if exitCode in (71102, 71104):
                job['fwjr'].addError(
                    "JobSubmit", exitCode, "SubmitFailed",
                    WM_JOB_ERROR_CODES[exitCode] +
                    ', '.join(job['possibleLocations']))
            else:
                job['fwjr'].addError("JobSubmit", exitCode, "SubmitFailed",
                                     WM_JOB_ERROR_CODES[exitCode])
            fwjrPath = os.path.join(job['cache_dir'],
                                    'Report.%d.pkl' % int(job['retry_count']))
            job['fwjr'].setJobID(job['id'])
            try:
                job['fwjr'].save(fwjrPath)
                fwjrBinds.append({"jobid": job["id"], "fwjrpath": fwjrPath})
            except IOError as ioer:
                logging.error(
                    "Failed to write FWJR for submit failed job %d, message: %s",
                    job['id'], str(ioer))
        self.changeState.propagate(badJobs, "submitfailed", "created")
        self.setFWJRPathAction.execute(binds=fwjrBinds)
        return

    def getThresholds(self):
        """
        _getThresholds_

        Reformat the submit thresholds.  This will store a dictionary keyed by
        task type.  Each task type will contain a list of tuples where each
        tuple contains the site name and the number of running jobs.
        """
        rcThresholds = self.resourceControl.listThresholdsForSubmit()

        newDrainSites = set()
        newAbortSites = set()

        for siteName in rcThresholds.keys():
            # Add threshold if we don't have it already
            state = rcThresholds[siteName]["state"]

            if state == "Draining":
                newDrainSites.add(siteName)
            if state in ["Down", "Aborted"]:
                newAbortSites.add(siteName)

            for pnn in rcThresholds[siteName]["pnns"]:
                if not pnn in self.siteKeys.keys():
                    self.siteKeys[pnn] = []
                if not siteName in self.siteKeys[pnn]:
                    self.siteKeys[pnn].append(siteName)

        # When the list of drain/abort sites changes between iterations, a location
        # refresh is needed; for now this forces a full cache refresh.
        # TODO: Make it more efficient, only reshuffle locations when this happens
        if newDrainSites != self.drainSites or newAbortSites != self.abortSites:
            logging.info(
                "Draining or Aborted sites have changed, the cache will be rebuilt."
            )
            self.cachedJobIDs = set()
            self.cachedJobs = {}
            self.jobDataCache = {}

        # Sort the sites, utilizing the fact python has a stable sort function - we can simply
        # sort twice.
        # - First, fill up the smaller tiers
        # - Within a tier, fill up the smallest sites first
        # Of course, condor does not care about our submission order.  We sort the sites because
        # WMAgent will stop submitting once we hit a pending threshold.  We assume the T1s
        # will have the workflows with the most restrictive whitelist and want to hit their
        # WMAgent-calculated limit last.
        self.sortedSites = sorted(
            rcThresholds.keys(),
            key=lambda x: rcThresholds[x]["total_pending_slots"])
        #T3 sites go first, then T2, then T1
        self.sortedSites = sorted(
            self.sortedSites,
            key=lambda x: rcThresholds[x]["cms_name"][0:2],
            reverse=True)
        logging.debug('Sites will be filled in the following order: %s',
                      self.sortedSites)
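        # Worked example (slot counts hypothetical): with
        #   {'T1_US_FNAL': 5000, 'T2_US_UCSD': 800, 'T3_US_Omaha': 50}
        # the first pass gives [T3_US_Omaha, T2_US_UCSD, T1_US_FNAL]; the
        # second, stable pass on the reversed 'T1'/'T2'/'T3' prefix keeps
        # that order, so T3 sites fill first, smallest-first within a tier.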

        self.currentRcThresholds = rcThresholds
        self.abortSites = newAbortSites
        self.drainSites = newDrainSites

        return

    def assignJobLocations(self):
        """
        _assignJobLocations_

        Loop through the submit thresholds and pull jobs out of the job cache
        as we discover open slots.  This returns a dictionary keyed by job
        package path, where each value is a list of job dictionaries ready
        for submission.
        """
        jobsToSubmit = {}
        jobsToPrune = {}
        jobsCount = 0
        exitLoop = False

        for siteName in self.sortedSites:
            if exitLoop:
                break

            totalPending = None
            if siteName not in self.cachedJobs:
                logging.debug("No jobs for site %s", siteName)
                continue
            logging.debug("Have site %s", siteName)
            try:
                totalPendingSlots = self.currentRcThresholds[siteName][
                    "total_pending_slots"]
                totalRunningSlots = self.currentRcThresholds[siteName][
                    "total_running_slots"]
                totalRunning = self.currentRcThresholds[siteName][
                    "total_running_jobs"]
                totalPending = self.currentRcThresholds[siteName][
                    "total_pending_jobs"]
                state = self.currentRcThresholds[siteName]["state"]
            except KeyError as ex:
                msg = "Had invalid site info %s\n" % siteName['thresholds']
                msg += str(ex)
                logging.error(msg)
                continue
            for threshold in self.currentRcThresholds[siteName].get(
                    'thresholds', []):
                if exitLoop:
                    break
                try:
                    # Pull basic info for the threshold
                    taskType = threshold["task_type"]
                    maxSlots = threshold["max_slots"]
                    taskPendingSlots = threshold["pending_slots"]
                    taskRunning = threshold["task_running_jobs"]
                    taskPending = threshold["task_pending_jobs"]
                    taskPriority = threshold["priority"]
                except KeyError as ex:
                    msg = "Had invalid threshold %s\n" % threshold
                    msg += str(ex)
                    logging.error(msg)
                    continue

                # If the site is down, do not submit anything to it
                if state == 'Down':
                    continue

                # If the number of running jobs exceeds the site's running
                # slots, move on
                if totalRunningSlots >= 0 and totalRunning >= totalRunningSlots:
                    continue

                # If the task is running more jobs than allowed, move on
                if maxSlots >= 0 and taskRunning >= maxSlots:
                    continue

                # Ignore this threshold if we've cleaned out the site
                if siteName not in self.cachedJobs:
                    continue

                # Ignore this threshold if we have no jobs
                # for it
                if taskType not in self.cachedJobs[siteName]:
                    continue

                taskCache = self.cachedJobs[siteName][taskType]

                # Calculate number of jobs we need
                nJobsRequired = min(totalPendingSlots - totalPending,
                                    taskPendingSlots - taskPending)
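                # e.g. (numbers hypothetical) 1000 site pending slots with
                # 900 already pending leaves 100; a task cap of 50 with 30
                # pending leaves 20; min() -> submit at most 20 jobs here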
                breakLoop = False
                logging.debug("nJobsRequired for task %s: %i", taskType,
                              nJobsRequired)

                while nJobsRequired > 0:
                    # Do this until we have all the jobs for this threshold

                    # Pull a job out of the cache for the task/site.  Verify that we
                    # haven't already used this job in this polling cycle.
                    cachedJob = None
                    cachedJobWorkflow = None

                    workflows = taskCache.keys()

                    # Sorting by prio and timestamp on the subscription
                    def sortingCmp(x, y):
                        if (x not in self.workflowTimestamps) or (
                                y not in self.workflowTimestamps):
                            return -1
                        if (x not in self.workflowPrios) or (
                                y not in self.workflowPrios):
                            return -1
                        if self.workflowPrios[x] > self.workflowPrios[y]:
                            return -1
                        elif self.workflowPrios[x] == self.workflowPrios[y]:
                            return cmp(self.workflowTimestamps[x],
                                       self.workflowTimestamps[y])
                        else:
                            return 1

                    workflows.sort(cmp=sortingCmp)
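                    # Note: cmp-style sorting (and the cmp() builtin used
                    # above) exists only in Python 2; Python 3 would need a
                    # key function or functools.cmp_to_key.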

                    for workflow in workflows:
                        # Run a while loop until you get a job
                        while len(taskCache[workflow]) > 0:
                            cachedJobID = taskCache[workflow].pop()
                            try:
                                cachedJob = self.jobDataCache[workflow].get(
                                    cachedJobID, None)
                                del self.jobDataCache[workflow][cachedJobID]
                            except KeyError:
                                # Job data already removed for this workflow
                                pass

                            if cachedJobID not in jobsToPrune.get(
                                    workflow, set()):
                                cachedJobWorkflow = workflow
                                break
                            else:
                                cachedJob = None

                        # Remove the entry in the cache for the workflow if it is empty.
                        if len(self.cachedJobs[siteName][taskType]
                               [workflow]) == 0:
                            del self.cachedJobs[siteName][taskType][workflow]
                        if workflow in self.jobDataCache and len(
                                self.jobDataCache[workflow].keys()) == 0:
                            del self.jobDataCache[workflow]

                        if cachedJob:
                            # We found a job, bail out and handle it.
                            break

                    # Check to see if we need to delete this site from the cache
                    if len(self.cachedJobs[siteName][taskType].keys()) == 0:
                        del self.cachedJobs[siteName][taskType]
                        breakLoop = True
                    if len(self.cachedJobs[siteName].keys()) == 0:
                        del self.cachedJobs[siteName]
                        breakLoop = True

                    if not cachedJob:
                        # We didn't find a job, bail out.
                        # This site and task type is done
                        break

                    self.cachedJobIDs.remove(cachedJob[0])

                    if cachedJobWorkflow not in jobsToPrune:
                        jobsToPrune[cachedJobWorkflow] = set()

                    jobsToPrune[cachedJobWorkflow].add(cachedJob[0])

                    # Sort jobs by jobPackage
                    package = cachedJob[2]
                    if not package in jobsToSubmit.keys():
                        jobsToSubmit[package] = []

                    # Add the sandbox to a global list
                    self.sandboxPackage[package] = cachedJob[3]
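                    # All jobs in this package share the sandbox recorded
                    # here; submitJobs() later attaches it to each job before
                    # handing the batch to BossAir.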

                    possibleSites = cachedJob[8]
                    # We used to pick a site at random from all the possible ones and
                    # attribute the job as 'pending' there (some WMAgent components and
                    # Dashboard don't understand jobs pending at multiple sites).  HOWEVER,
                    # this caused some sites to go way over their pending job limit (for
                    # example, if the site name "Nebraska" came up 10,000 jobs in a row),
                    # which would starve acquired high-priority jobs on future rounds.
                    assignedSiteName = siteName
                    potentialSites = cachedJob[18]

                    # Create a job dictionary object
                    jobDict = {
                        'id': cachedJob[0],
                        'retry_count': cachedJob[1],
                        'custom': {
                            'location': assignedSiteName
                        },
                        'cache_dir': cachedJob[4],
                        'packageDir': package,
                        'userdn': cachedJob[5],
                        'usergroup': cachedJob[6],
                        'userrole': cachedJob[7],
                        'priority': taskPriority,
                        'taskType': taskType,
                        'possibleSites': possibleSites,
                        'scramArch': cachedJob[9],
                        'swVersion': cachedJob[10],
                        'name': cachedJob[11],
                        'proxyPath': cachedJob[12],
                        'requestName': cachedJob[13],
                        'estimatedJobTime': cachedJob[14],
                        'estimatedDiskUsage': cachedJob[15],
                        'estimatedMemoryUsage': cachedJob[16],
                        'taskPriority': self.workflowPrios[workflow],
                        'taskName': cachedJob[17],
                        'potentialSites': potentialSites,
                        'numberOfCores': cachedJob[19],
                        'taskID': cachedJob[20],
                        'inputDataset': cachedJob[21],
                        'inputDatasetLocations': cachedJob[22],
                        'allowOpportunistic': cachedJob[23],
                        'highIOjob': cachedJob[24]
                    }

                    # Add to jobsToSubmit
                    jobsToSubmit[package].append(jobDict)
                    jobsCount += 1
                    if jobsCount >= self.maxJobsPerPoll:
                        breakLoop, exitLoop = True, True

                    # Deal with accounting
                    nJobsRequired -= 1
                    totalPending += 1
                    taskPending += 1

                    if breakLoop:
                        break

        # Remove the jobs that we're going to submit from the cache.
        allWorkflows = set()
        for siteName in self.cachedJobs.keys():
            for taskType in self.cachedJobs[siteName].keys():
                for workflow in self.cachedJobs[siteName][taskType].keys():
                    allWorkflows.add(workflow)
                    if workflow in jobsToPrune.keys():
                        self.cachedJobs[siteName][taskType][
                            workflow] -= jobsToPrune[workflow]

        # Remove workflows from the timestamp dictionary which are not anymore in the cache
        workflowsWithTimestamp = self.workflowTimestamps.keys()
        for workflow in workflowsWithTimestamp:
            if workflow not in allWorkflows:
                del self.workflowTimestamps[workflow]

        workflowsWithPrios = self.workflowPrios.keys()
        for workflow in workflowsWithPrios:
            if workflow not in allWorkflows:
                del self.workflowPrios[workflow]

        logging.info("Have %s packages to submit.", len(jobsToSubmit))
        logging.info("Done assigning site locations.")
        return jobsToSubmit

    def submitJobs(self, jobsToSubmit):
        """
        _submitJobs_

        Actually do the submission of the jobs
        """

        jobList = []
        idList = []

        if len(jobsToSubmit) == 0:
            logging.debug("There are no packages to submit.")
            return

        for package in jobsToSubmit.keys():

            sandbox = self.sandboxPackage[package]
            jobs = jobsToSubmit.get(package, [])

            for job in jobs:
                job['location'], job['plugin'], job[
                    'site_cms_name'] = self.getSiteInfo(
                        job['custom']['location'])
                job['sandbox'] = sandbox
                idList.append({
                    'jobid': job['id'],
                    'location': job['custom']['location']
                })

            #Clean out the package reference
            del self.sandboxPackage[package]

            jobList.extend(jobs)

        myThread = threading.currentThread()
        myThread.transaction.begin()

        # Run the actual underlying submit code using bossAir
        successList, failList = self.bossAir.submit(jobs=jobList)
        logging.info("Jobs that succeeded/failed submission: %d/%d.",
                     len(successList), len(failList))

        # Propagate states in the WMBS database
        logging.debug("Propagating success state to WMBS.")
        self.changeState.propagate(successList, 'executing', 'created')
        logging.debug("Propagating fail state to WMBS.")
        self.changeState.propagate(failList, 'submitfailed', 'created')

        # At the end we mark the locations of the jobs
        # This applies even to failed jobs, since the location
        # could be part of the failure reason.
        logging.debug("Updating job location...")
        self.setLocationAction.execute(bulkList=idList,
                                       conn=myThread.transaction.conn,
                                       transaction=True)
        myThread.transaction.commit()
        logging.info("Transaction cycle successfully completed.")

        return

    def getSiteInfo(self, jobSite):
        """
        _getSiteInfo_

        This is how you get the name of a CE and the plugin for a job
        """

        if not jobSite in self.locationDict.keys():
            siteInfo = self.locationAction.execute(siteName=jobSite)
            self.locationDict[jobSite] = siteInfo[0]
        return (self.locationDict[jobSite].get('ce_name'),
                self.locationDict[jobSite].get('plugin'),
                self.locationDict[jobSite].get('cms_name'))
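
    # Hypothetical usage sketch (site name and return values illustrative):
    #   ceName, plugin, cmsName = poller.getSiteInfo('T2_US_UCSD')
    # Repeated lookups for the same site are served from self.locationDict
    # rather than re-running the Locations.GetSiteInfo DAO.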

    def algorithm(self, parameters=None):
        """
        _algorithm_

        Try to, in order:
        1) Refresh the cache
        2) Find jobs for all the necessary sites
        3) Submit the jobs to the plugin
        """

        try:
            myThread = threading.currentThread()
            self.getThresholds()
            self.refreshCache()
            jobsToSubmit = self.assignJobLocations()
            self.submitJobs(jobsToSubmit=jobsToSubmit)

        except WMException:
            if getattr(myThread, 'transaction', None) is not None:
                myThread.transaction.rollback()
            raise
        except Exception as ex:
            msg = 'Fatal error in JobSubmitter:\n'
            msg += str(ex)
            #msg += str(traceback.format_exc())
            msg += '\n\n'
            logging.error(msg)
            self.sendAlert(7, msg=msg)
            if getattr(myThread, 'transaction', None) is not None:
                myThread.transaction.rollback()
            raise JobSubmitterPollerException(msg)

        return

    def terminate(self, params):
        """
        _terminate_

        Kill the code after one final pass when called by the master thread.
        """
        logging.debug("terminating. doing one more pass before we die")
        self.algorithm(params)
Example #14
class JobSubmitterPoller(BaseWorkerThread):
    """
    _JobSubmitterPoller_

    The jobSubmitterPoller takes the jobs and organizes them into packages
    before sending them to the individual plugin submitters.
    """
    def __init__(self, config):
        BaseWorkerThread.__init__(self)
        myThread = threading.currentThread()

        #DAO factory for WMBS objects
        self.daoFactory = DAOFactory(package = "WMCore.WMBS", \
                                     logger = logging,
                                     dbinterface = myThread.dbi)
        self.config = config

        #Libraries
        self.resourceControl = ResourceControl()

        self.changeState = ChangeState(self.config)
        self.repollCount = getattr(self.config.JobSubmitter, 'repollCount', 10000)
        self.maxJobsPerPoll = int(getattr(self.config.JobSubmitter, 'maxJobsPerPoll', 1000))

        # BossAir
        self.bossAir = BossAirAPI(config = self.config)

        # Additions for caching-based JobSubmitter
        self.workflowTimestamps = {}
        self.workflowPrios      = {}
        self.cachedJobIDs       = set()
        self.cachedJobs         = {}
        self.jobDataCache       = {}
        self.jobsToPackage      = {}
        self.sandboxPackage     = {}
        self.siteKeys           = {}
        self.locationDict       = {}
        self.cmsNames           = {}
        self.drainSites         = set()
        self.abortSites         = set()
        self.sortedSites        = []
        self.packageSize        = getattr(self.config.JobSubmitter, 'packageSize', 500)
        self.collSize           = getattr(self.config.JobSubmitter, 'collectionSize',
                                          self.packageSize * 1000)

        # initialize the alert framework (if available)
        self.initAlerts(compName = "JobSubmitter")

        try:
            if not getattr(self.config.JobSubmitter, 'submitDir', None):
                self.config.JobSubmitter.submitDir = self.config.JobSubmitter.componentDir
            self.packageDir = os.path.join(self.config.JobSubmitter.submitDir, 'packages')

            if not os.path.exists(self.packageDir):
                os.makedirs(self.packageDir)
        except Exception as ex:
            msg =  "Error while trying to create packageDir %s\n!"
            msg += str(ex)
            logging.error(msg)
            self.sendAlert(6, msg = msg)
            try:
                logging.debug("PackageDir: %s" % self.packageDir)
                logging.debug("Config: %s" % config)
            except:
                pass
            raise JobSubmitterPollerException(msg)


        # Now the DAOs
        self.listJobsAction = self.daoFactory(classname = "Jobs.ListForSubmitter")
        self.setLocationAction = self.daoFactory(classname = "Jobs.SetLocation")
        self.locationAction = self.daoFactory(classname = "Locations.GetSiteInfo")
        self.setFWJRPathAction = self.daoFactory(classname = "Jobs.SetFWJRPath")
        self.listWorkflows = self.daoFactory(classname = "Workflow.ListForSubmitter")

        # Keep a record of the thresholds in memory
        self.currentRcThresholds = {}

        return

    def getPackageCollection(self, sandboxDir):
        """
        _getPackageCollection_

        Given a sandbox directory, figure out which PackageCollection
        new job packages should belong in.
        """

        rawList     = os.listdir(sandboxDir)
        collections = []
        numberList  = []
        for entry in rawList:
            if 'PackageCollection' in entry:
                collections.append(entry)

        # If we have no collections, return 0 (PackageCollection_0)
        if len(collections) < 1:
            return 0

        # Loop over the list of PackageCollections
        for collection in collections:
            collectionPath = os.path.join(sandboxDir, collection)
            packageList    = os.listdir(collectionPath)
            collectionNum  = int(collection.split('_')[1])
            if len(packageList) < self.collSize:
                return collectionNum
            else:
                numberList.append(collectionNum)

        # If we got here, then all collections are full.  We'll need
        # a new one.  Find the highest number, increment by one
        numberList.sort()
        return numberList[-1] + 1
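
    # Behaviour sketch (directory contents hypothetical): if
    # PackageCollection_0 is full and PackageCollection_1 holds fewer than
    # collSize packages, this returns 1; if every collection is full, it
    # returns the highest existing index plus one.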

    def addJobsToPackage(self, loadedJob):
        """
        _addJobsToPackage_

        Add a job to a job package and then return the batch ID for the job.
        Packages are only written out to disk once they contain packageSize
        jobs (500 by default).  The flushJobPackages() method must be called
        after all jobs have been added
        to the cache and before they are actually submitted to make sure all the
        job packages have been written to disk.
        """
        if loadedJob["workflow"] not in self.jobsToPackage:
            # First, let's pull all the information from the loadedJob
            batchid    = "%s-%s" % (loadedJob["id"], loadedJob["retry_count"])
            sandboxDir = os.path.dirname(loadedJob["sandbox"])

            # Second, assemble the jobPackage location
            collectionIndex = self.getPackageCollection(sandboxDir)
            collectionDir   = os.path.join(sandboxDir,
                                           'PackageCollection_%i' % collectionIndex,
                                           'batch_%s' % batchid)

            # Now create the package object
            self.jobsToPackage[loadedJob["workflow"]] = {"batchid": batchid,
                                                         'id': loadedJob['id'],
                                                         "package": JobPackage(directory = collectionDir)}

        jobPackage = self.jobsToPackage[loadedJob["workflow"]]["package"]
        jobPackage[loadedJob["id"]] = loadedJob.getDataStructsJob()
        batchDir = jobPackage['directory']

        if len(jobPackage.keys()) == self.packageSize:
            if not os.path.exists(batchDir):
                os.makedirs(batchDir)

            batchPath = os.path.join(batchDir, "JobPackage.pkl")
            jobPackage.save(batchPath)
            del self.jobsToPackage[loadedJob["workflow"]]

        return batchDir
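
    # Hypothetical example: a job with id 1234 and retry_count 0 whose
    # sandbox lives under .../MyWorkflow/ ends up with
    # batchDir = .../MyWorkflow/PackageCollection_0/batch_1234-0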

    def flushJobPackages(self):
        """
        _flushJobPackages_

        Write any jobs packages to disk that haven't been written out already.
        """
        workflowNames = self.jobsToPackage.keys()
        for workflowName in workflowNames:
            batchID    = self.jobsToPackage[workflowName]["batchid"]
            id         = self.jobsToPackage[workflowName]["id"]
            jobPackage = self.jobsToPackage[workflowName]["package"]
            batchDir   = jobPackage['directory']

            if not os.path.exists(batchDir):
                os.makedirs(batchDir)

            batchPath = os.path.join(batchDir, "JobPackage.pkl")
            jobPackage.save(batchPath)
            del self.jobsToPackage[workflowName]

        return

    def refreshCache(self):
        """
        _refreshCache_

        Query WMBS for all jobs in the 'created' state.  For all jobs returned
        from the query, check if they already exist in the cache.  If they
        don't, unpickle them and combine their site white and black list with
        the list of locations they can run at.  Add them to the cache.

        Each entry in the cache is a tuple whose first five items are:
          - WMBS Job ID
          - Retry count
          - Batch ID
          - Path to sandbox
          - Path to cache directory
        """
        badJobs = dict([(x, []) for x in range(61101,61105)])
        dbJobs = set()

        logging.info("Refreshing priority cache...")
        workflows = self.listWorkflows.execute()
        workflows = filter(lambda x: x['name'] in self.workflowPrios, workflows)
        for workflow in workflows:
            self.workflowPrios[workflow['name']] = workflow['priority']

        logging.info("Querying WMBS for jobs to be submitted...")
        newJobs = self.listJobsAction.execute()
        logging.info("Found %s new jobs to be submitted." % len(newJobs))

        logging.info("Determining possible sites for new jobs...")
        jobCount = 0
        for newJob in newJobs:
            jobID = newJob['id']
            dbJobs.add(jobID)
            if jobID in self.cachedJobIDs:
                continue

            jobCount += 1
            if jobCount % 5000 == 0:
                logging.info("Processed %d/%d new jobs." % (jobCount, len(newJobs)))

            pickledJobPath = os.path.join(newJob["cache_dir"], "job.pkl")

            if not os.path.isfile(pickledJobPath):
                # Then we have a problem - there's no file
                logging.error("Could not find pickled jobObject %s" % pickledJobPath)
                badJobs[61103].append(newJob)
                continue
            try:
                jobHandle = open(pickledJobPath, "r")
                loadedJob = cPickle.load(jobHandle)
                jobHandle.close()
            except Exception as ex:
                msg =  "Error while loading pickled job object %s\n" % pickledJobPath
                msg += str(ex)
                logging.error(msg)
                self.sendAlert(6, msg = msg)
                raise JobSubmitterPollerException(msg)

            loadedJob['retry_count'] = newJob['retry_count']

            siteWhitelist = loadedJob.get("siteWhitelist", [])
            siteBlacklist = loadedJob.get("siteBlacklist", [])
            trustSitelists = loadedJob.get("trustSitelists", False)

            # convert site lists into correct format
            if len(siteWhitelist) > 0:
                whitelist = []
                for cmsName in siteWhitelist:
                    whitelist.extend(self.cmsNames.get(cmsName, []))
                siteWhitelist = whitelist
            if len(siteBlacklist) > 0:
                blacklist = []
                for cmsName in siteBlacklist:
                    blacklist.extend(self.cmsNames.get(cmsName, []))
                siteBlacklist = blacklist

            # figure out possible locations for job
            if trustSitelists:
                possibleLocations = set(siteWhitelist) - set(siteBlacklist)
            else:
                possibleLocations = set()

                # all files in job have same location (in se names)
                rawLocations = loadedJob["input_files"][0]["locations"]

                # transform se names into site names
                for loc in rawLocations:
                    if not loc in self.siteKeys.keys():
                        # Then we have a problem
                        logging.error('Encountered unknown location %s for job %i' % (loc, jobID))
                        logging.error('Ignoring for now, but watch out for this')
                    else:
                        for siteName in self.siteKeys[loc]:
                            possibleLocations.add(siteName)

                # filter with site lists
                if len(siteWhitelist) > 0:
                    possibleLocations = possibleLocations & set(siteWhitelist)
                if len(siteBlacklist) > 0:
                    possibleLocations = possibleLocations - set(siteBlacklist)
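                # e.g. locations {'A', 'B', 'C'} with whitelist {'B', 'C'}
                # and blacklist {'C'} reduce to {'B'} (names hypothetical)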

            # Create another set of locations that may change when a site goes white/black listed
            # Does not care about the non_draining or aborted sites, they may change and that is the point
            potentialLocations = set()
            potentialLocations.update(possibleLocations)

            # now check for sites in drain and adjust the possible locations
            # also check if there is at least one site left to run the job
            if len(possibleLocations) == 0:
                newJob['name'] = loadedJob['name']
                badJobs[61101].append(newJob)
                continue
            else:
                non_abort_sites = [x for x in possibleLocations if x not in self.abortSites]
                if non_abort_sites: # if there is at least a non aborted/down site then run there, otherwise fail the job
                    possibleLocations = non_abort_sites
                else:
                    newJob['name'] = loadedJob['name']
                    newJob['possibleLocations'] = possibleLocations
                    badJobs[61102].append(newJob)
                    continue

            # try to remove draining sites if possible, this is needed to stop
            # jobs that could run anywhere blocking draining sites
            # if the job type is Merge, LogCollect or Cleanup this is skipped
            if newJob['type'] not in ('LogCollect','Merge','Cleanup','Harvesting'):
                non_draining_sites = [x for x in possibleLocations if x not in self.drainSites]
                if non_draining_sites: # if >1 viable non-draining site remove draining ones
                    possibleLocations = non_draining_sites
                else:
                    newJob['name'] = loadedJob['name']
                    newJob['possibleLocations'] = possibleLocations
                    badJobs[61104].append(newJob)
                    continue

            batchDir = self.addJobsToPackage(loadedJob)
            self.cachedJobIDs.add(jobID)

            for possibleLocation in possibleLocations:
                if possibleLocation not in self.cachedJobs:
                    self.cachedJobs[possibleLocation] = {}
                if newJob["type"] not in self.cachedJobs[possibleLocation]:
                    self.cachedJobs[possibleLocation][newJob["type"]] = {}

                locTypeCache = self.cachedJobs[possibleLocation][newJob["type"]]
                workflowName = newJob['workflow']
                timestamp    = newJob['timestamp']
                prio         = newJob['task_priority']
                if workflowName not in locTypeCache:
                    locTypeCache[workflowName] = set()
                if workflowName not in self.jobDataCache:
                    self.jobDataCache[workflowName] = {}
                if not workflowName in self.workflowTimestamps:
                    self.workflowTimestamps[workflowName] = timestamp
                if workflowName not in self.workflowPrios:
                    self.workflowPrios[workflowName] = prio

                locTypeCache[workflowName].add(jobID)

            # allow job baggage to override numberOfCores
            #       => used for repacking to get more slots/disk
            numberOfCores = loadedJob.get('numberOfCores', 1)
            if numberOfCores == 1:
                baggage = loadedJob.getBaggage()
                numberOfCores = getattr(baggage, "numberOfCores", 1)
            loadedJob['numberOfCores'] = numberOfCores

            # Now that we're out of that loop, put the job data in the cache
            jobInfo = (jobID,
                       newJob["retry_count"],
                       batchDir,
                       loadedJob["sandbox"],
                       loadedJob["cache_dir"],
                       loadedJob.get("ownerDN", None),
                       loadedJob.get("ownerGroup", ''),
                       loadedJob.get("ownerRole", ''),
                       frozenset(possibleLocations),
                       loadedJob.get("scramArch", None),
                       loadedJob.get("swVersion", None),
                       loadedJob["name"],
                       loadedJob.get("proxyPath", None),
                       newJob['request_name'],
                       loadedJob.get("estimatedJobTime", None),
                       loadedJob.get("estimatedDiskUsage", None),
                       loadedJob.get("estimatedMemoryUsage", None),
                       newJob['task_name'],
                       frozenset(potentialLocations),
                       loadedJob.get("numberOfCores", 1),
                       newJob['task_id']
                       )

            self.jobDataCache[workflowName][jobID] = jobInfo

        # Register failures in submission
        for errorCode in badJobs:
            if badJobs[errorCode]:
                logging.debug("The following jobs could not be submitted: %s, error code : %d" % (badJobs, errorCode))
                self._handleSubmitFailedJobs(badJobs[errorCode], errorCode)

        # If there are any leftover jobs, we want to get rid of them.
        self.flushJobPackages()
        logging.info("Done with refreshCache() loop, pruning killed jobs.")

        # We need to remove any jobs from the cache that were not returned in
        # the last call to the database.
        jobIDsToPurge = self.cachedJobIDs - dbJobs
        self.cachedJobIDs -= jobIDsToPurge

        if len(jobIDsToPurge) == 0:
            return

        for siteName in self.cachedJobs.keys():
            for taskType in self.cachedJobs[siteName].keys():
                for workflow in self.cachedJobs[siteName][taskType].keys():
                    for cachedJobID in list(self.cachedJobs[siteName][taskType][workflow]):
                        if cachedJobID in jobIDsToPurge:
                            self.cachedJobs[siteName][taskType][workflow].remove(cachedJobID)
                            try:
                                del self.jobDataCache[workflow][cachedJobID]
                            except KeyError:
                                # Already gone
                                pass

        logging.info("Done pruning killed jobs, moving on to submit.")
        return

    def _handleSubmitFailedJobs(self, badJobs, exitCode):
        """
        __handleSubmitFailedJobs_

        Form a default job report for the exitCode
        and register it in the job. Preserve it on disk as well.
        Propagate the failure to the JobStateMachine.
        """
        fwjrBinds = []
        for job in badJobs:
            job['couch_record'] = None
            job['fwjr'] = Report()
            if exitCode in (61102, 61104):
                job['fwjr'].addError("JobSubmit", exitCode, "SubmitFailed", WMJobErrorCodes[exitCode] + ', '.join(job['possibleLocations']))
            else:
                job['fwjr'].addError("JobSubmit", exitCode, "SubmitFailed", WMJobErrorCodes[exitCode] )
            fwjrPath = os.path.join(job['cache_dir'],
                                    'Report.%d.pkl' % int(job['retry_count']))
            job['fwjr'].setJobID(job['id'])
            try:
                job['fwjr'].save(fwjrPath)
                fwjrBinds.append({"jobid" : job["id"], "fwjrpath" : fwjrPath})
            except IOError as ioer:
                logging.error("Failed to write FWJR for submit failed job %d, message: %s" % (job['id'], str(ioer)))
        self.changeState.propagate(badJobs, "submitfailed", "created")
        self.setFWJRPathAction.execute(binds = fwjrBinds)
        return

    def getThresholds(self):
        """
        _getThresholds_

        Reformat the submit thresholds.  This will store a dictionary keyed by
        task type.  Each task type will contain a list of tuples where each
        tuple contains the site name and the number of running jobs.
        """
        rcThresholds = self.resourceControl.listThresholdsForSubmit()

        newDrainSites = set()
        newAbortSites = set()

        for siteName in rcThresholds.keys():
            # Add threshold if we don't have it already
            cmsName = rcThresholds[siteName]["cms_name"]
            state   = rcThresholds[siteName]["state"]

            if not cmsName in self.cmsNames.keys():
                self.cmsNames[cmsName] = []
            if not siteName in self.cmsNames[cmsName]:
                self.cmsNames[cmsName].append(siteName)
            if state == "Draining":
                newDrainSites.add(siteName)
            if state in ["Down", "Aborted"]:
                newAbortSites.add(siteName)

            for seName in rcThresholds[siteName]["se_names"]:
                if not seName in self.siteKeys.keys():
                    self.siteKeys[seName] = []
                if not siteName in self.siteKeys[seName]:
                    self.siteKeys[seName].append(siteName)

        # When the list of drain/abort sites changes between iterations, a location
        # refresh is needed; for now this forces a full cache refresh.
        # TODO: Make it more efficient, only reshuffle locations when this happens
        if newDrainSites != self.drainSites or newAbortSites != self.abortSites:
            logging.info("Draining or Aborted sites have changed, the cache will be rebuilt.")
            self.cachedJobIDs       = set()
            self.cachedJobs         = {}
            self.jobDataCache       = {}

        # Sort the sites using the following criteria:
        # T1 sites go first, then T2, then T3.
        # Within each tier, fill the bigger sites first.
        # Python sorting is stable, so two sort passes suffice and are fast.
        # Assume that all CMS names start with T[0-9]+, which the lexicon guarantees.
        self.sortedSites = sorted(rcThresholds.keys(),
                                  key = lambda x : rcThresholds[x]["total_pending_slots"],
                                  reverse = True)
        self.sortedSites = sorted(self.sortedSites, key = lambda x : rcThresholds[x]["cms_name"][0:2])
        logging.debug('Sites will be filled in the following order: %s' % str(self.sortedSites))
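        # Worked example (slot counts hypothetical): with pending slots
        #   {'T1_US_FNAL': 5000, 'T2_US_UCSD': 800, 'T3_US_Omaha': 50}
        # the two passes above yield [T1_US_FNAL, T2_US_UCSD, T3_US_Omaha]:
        # T1 sites first, and bigger sites first within a tier.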

        self.currentRcThresholds = rcThresholds
        self.abortSites = newAbortSites
        self.drainSites = newDrainSites

        return

    def assignJobLocations(self):
        """
        _assignJobLocations_

        Loop through the submit thresholds and pull jobs out of the job cache
        as we discover open slots.  This returns a dictionary keyed by job
        package path, where each value is a list of job dictionaries ready
        for submission.
        """
        jobsToSubmit = {}
        jobsToPrune = {}
        jobsCount = 0
        exitLoop = False 

        for siteName in self.sortedSites:
            if exitLoop:
                break

            totalPending = None
            if siteName not in self.cachedJobs:
                logging.debug("No jobs for site %s" % siteName)
                continue
            logging.debug("Have site %s" % siteName)
            try:
                totalPendingSlots   = self.currentRcThresholds[siteName]["total_pending_slots"]
                totalRunningSlots   = self.currentRcThresholds[siteName]["total_running_slots"]
                totalRunning        = self.currentRcThresholds[siteName]["total_running_jobs"]
                totalPending        = self.currentRcThresholds[siteName]["total_pending_jobs"]
                state               = self.currentRcThresholds[siteName]["state"]
            except KeyError as ex:
                msg =  "Had invalid site info %s\n" % siteName['thresholds']
                msg += str(ex)
                logging.error(msg)
                continue
            for threshold in self.currentRcThresholds[siteName].get('thresholds', []):
                if exitLoop:
                    break
                try:
                    # Pull basic info for the threshold
                    taskType            = threshold["task_type"]
                    maxSlots            = threshold["max_slots"]
                    taskPendingSlots    = threshold["pending_slots"]
                    taskRunning         = threshold["task_running_jobs"]
                    taskPending         = threshold["task_pending_jobs"]
                    taskPriority        = threshold["priority"]
                except KeyError as ex:
                    msg =  "Had invalid threshold %s\n" % threshold
                    msg += str(ex)
                    logging.error(msg)
                    continue

                # If the site is down, do not submit anything to it
                if state == 'Down':
                    continue

                # If the number of running jobs exceeds the site's running
                # slots, move on
                if totalRunningSlots >= 0 and totalRunning >= totalRunningSlots:
                    continue

                # If the task is running more jobs than allowed, move on
                if maxSlots >= 0 and taskRunning >= maxSlots:
                    continue

                # Ignore this threshold if we've cleaned out the site
                if siteName not in self.cachedJobs:
                    continue

                # Ignore this threshold if we have no jobs
                # for it
                if taskType not in self.cachedJobs[siteName]:
                    continue

                taskCache = self.cachedJobs[siteName][taskType]

                # Calculate number of jobs we need
                nJobsRequired = min(totalPendingSlots - totalPending, taskPendingSlots - taskPending)
                breakLoop = False
                logging.debug("nJobsRequired for task %s: %i" % (taskType, nJobsRequired))

                while nJobsRequired > 0:
                    # Do this until we have all the jobs for this threshold

                    # Pull a job out of the cache for the task/site.  Verify that we
                    # haven't already used this job in this polling cycle.
                    cachedJob = None
                    cachedJobWorkflow = None

                    workflows = taskCache.keys()
                    # Sorting by prio and timestamp on the subscription
                    def sortingCmp(x, y):
                        if (x not in self.workflowTimestamps) or (y not in self.workflowTimestamps):
                            return -1
                        if (x not in self.workflowPrios) or (y not in self.workflowPrios):
                            return -1
                        if self.workflowPrios[x] > self.workflowPrios[y]:
                            return -1
                        elif self.workflowPrios[x] == self.workflowPrios[y]:
                            return cmp(self.workflowTimestamps[x], self.workflowTimestamps[y])
                        else:
                            return 1
                    workflows.sort(cmp = sortingCmp)

                    for workflow in workflows:
                        # Run a while loop until you get a job
                        while len(taskCache[workflow]) > 0:
                            cachedJobID = taskCache[workflow].pop()
                            try:
                                cachedJob   = self.jobDataCache[workflow].get(cachedJobID, None)
                                del self.jobDataCache[workflow][cachedJobID]
                            except KeyError:
                                # Job data already removed for this workflow
                                pass

                            if cachedJobID not in jobsToPrune.get(workflow, set()):
                                cachedJobWorkflow = workflow
                                break
                            else:
                                cachedJob = None

                        # Remove the entry in the cache for the workflow if it is empty.
                        if len(self.cachedJobs[siteName][taskType][workflow]) == 0:
                            del self.cachedJobs[siteName][taskType][workflow]
                        if workflow in self.jobDataCache and len(self.jobDataCache[workflow].keys()) == 0:
                            del self.jobDataCache[workflow]

                        if cachedJob:
                            # We found a job, bail out and handle it.
                            break

                    # Check to see if we need to delete this site from the cache
                    if len(self.cachedJobs[siteName][taskType].keys()) == 0:
                        del self.cachedJobs[siteName][taskType]
                        breakLoop = True
                    if len(self.cachedJobs[siteName].keys()) == 0:
                        del self.cachedJobs[siteName]
                        breakLoop = True

                    if not cachedJob:
                        # We didn't find a job, bail out.
                        # This site and task type is done
                        break

                    self.cachedJobIDs.remove(cachedJob[0])

                    if cachedJobWorkflow not in jobsToPrune:
                        jobsToPrune[cachedJobWorkflow] = set()

                    jobsToPrune[cachedJobWorkflow].add(cachedJob[0])

                    # Sort jobs by jobPackage
                    package = cachedJob[2]
                    if not package in jobsToSubmit.keys():
                        jobsToSubmit[package] = []

                    # Add the sandbox to a global list
                    self.sandboxPackage[package] = cachedJob[3]

                    possibleSites = cachedJob[8]
                    possibleSiteList = list(possibleSites)
                    fakeAssignedSiteName = random.choice(possibleSiteList)
                    potentialSites = cachedJob[18]

                    # Create a job dictionary object
                    jobDict = {'id': cachedJob[0],
                               'retry_count': cachedJob[1],
                               'custom': {'location': fakeAssignedSiteName},
                               'cache_dir': cachedJob[4],
                               'packageDir': package,
                               'userdn': cachedJob[5],
                               'usergroup': cachedJob[6],
                               'userrole': cachedJob[7],
                               'priority': taskPriority,
                               'taskType': taskType,
                               'possibleSites': possibleSites,
                               'scramArch': cachedJob[9],
                               'swVersion': cachedJob[10],
                               'name': cachedJob[11],
                               'proxyPath': cachedJob[12],
                               'requestName': cachedJob[13],
                               'estimatedJobTime' : cachedJob[14],
                               'estimatedDiskUsage' : cachedJob[15],
                               'estimatedMemoryUsage' : cachedJob[16],
                               'taskPriority' : self.workflowPrios[workflow],
                               'taskName' : cachedJob[17],
                               'numberOfCores' : cachedJob[19],
                               'taskID' : cachedJob[20],
                               'potentialSites' : potentialSites}

                    # Add to jobsToSubmit
                    jobsToSubmit[package].append(jobDict)
                    jobsCount += 1
                    if jobsCount >= self.maxJobsPerPoll:
                        breakLoop, exitLoop = True, True

                    # Deal with accounting
                    if len(possibleSites) == 1:
                        nJobsRequired -= 1
                    totalPending  += 1
                    taskPending   += 1

                    if breakLoop:
                        break

        # Remove the jobs that we're going to submit from the cache.
        allWorkflows = set()
        for siteName in self.cachedJobs.keys():
            for taskType in self.cachedJobs[siteName].keys():
                for workflow in self.cachedJobs[siteName][taskType].keys():
                    allWorkflows.add(workflow)
                    if workflow in jobsToPrune.keys():
                        self.cachedJobs[siteName][taskType][workflow] -= jobsToPrune[workflow]

        # Remove workflows from the timestamp dictionary which are not anymore in the cache
        workflowsWithTimestamp = self.workflowTimestamps.keys()
        for workflow in workflowsWithTimestamp:
            if workflow not in allWorkflows:
                del self.workflowTimestamps[workflow]

        workflowsWithPrios = self.workflowPrios.keys()
        for workflow in workflowsWithPrios:
            if workflow not in allWorkflows:
                del self.workflowPrios[workflow]

        logging.info("Have %s packages to submit." % len(jobsToSubmit))
        logging.info("Done assigning site locations.")
        return jobsToSubmit


    def submitJobs(self, jobsToSubmit):
        """
        _submitJobs_

        Actually do the submission of the jobs
        """

        jobList   = []
        idList    = []

        if len(jobsToSubmit) == 0:
            logging.debug("There are no packages to submit.")
            return

        for package in jobsToSubmit.keys():

            sandbox = self.sandboxPackage[package]
            jobs    = jobsToSubmit.get(package, [])

            for job in jobs:
                job['location'], job['plugin'], job['site_cms_name'] = self.getSiteInfo(job['custom']['location'])
                job['sandbox'] = sandbox
                idList.append({'jobid': job['id'], 'location': job['custom']['location']})

            #Clean out the package reference
            del self.sandboxPackage[package]

            jobList.extend(jobs)

        myThread = threading.currentThread()
        myThread.transaction.begin()

        # Run the actual underlying submit code using bossAir
        successList, failList = self.bossAir.submit(jobs = jobList)
        logging.info("Jobs that succeeded/failed submission: %d/%d." % (len(successList),len(failList)))

        # Propagate states in the WMBS database
        logging.debug("Propagating success state to WMBS.")
        self.changeState.propagate(successList, 'executing', 'created')
        logging.debug("Propagating fail state to WMBS.")
        self.changeState.propagate(failList, 'submitfailed', 'created')

        # At the end we mark the locations of the jobs
        # This applies even to failed jobs, since the location
        # could be part of the failure reason.
        logging.debug("Updating job location...")
        self.setLocationAction.execute(bulkList = idList, conn = myThread.transaction.conn,
                                       transaction = True)
        myThread.transaction.commit()
        logging.info("Transaction cycle successfully completed.")

        return


    def getSiteInfo(self, jobSite):
        """
        _getSiteInfo_

        This is how you get the name of a CE and the plugin for a job
        """

        if not jobSite in self.locationDict.keys():
            siteInfo = self.locationAction.execute(siteName = jobSite)
            self.locationDict[jobSite] = siteInfo[0]
        return (self.locationDict[jobSite].get('ce_name'),
                self.locationDict[jobSite].get('plugin'),
                self.locationDict[jobSite].get('cms_name'))

    def algorithm(self, parameters = None):
        """
        _algorithm_

        Try to, in order:
        1) Refresh the cache
        2) Find jobs for all the necessary sites
        3) Submit the jobs to the plugin
        """

        try:
            myThread = threading.currentThread()
            self.getThresholds()
            self.refreshCache()
            jobsToSubmit = self.assignJobLocations()
            self.submitJobs(jobsToSubmit = jobsToSubmit)


        except WMException:
            if getattr(myThread, 'transaction', None) is not None:
                myThread.transaction.rollback()
            raise
        except Exception as ex:
            msg = 'Fatal error in JobSubmitter:\n'
            msg += str(ex)
            #msg += str(traceback.format_exc())
            msg += '\n\n'
            logging.error(msg)
            self.sendAlert(7, msg = msg)
            if getattr(myThread, 'transaction', None) is not None:
                myThread.transaction.rollback()
            raise JobSubmitterPollerException(msg)

        return



    def terminate(self, params):
        """
        _terminate_

        Kill the code after one final pass when called by the master thread.
        """
        logging.debug("terminating. doing one more pass before we die")
        self.algorithm(params)
Example #15
    def testG_gLiteTest(self):
        """
        _gLiteTest_

        This test works on the gLitePlugin, checking all of
        its functions with a single set of jobs
        """

        config = self.getConfig()
        config.BossAir.UISetupScript = '/afs/cern.ch/cms/LCG/LCG-2/UI/cms_ui_env.sh'
        config.BossAir.gliteConf = '/afs/cern.ch/cms/LCG/LCG-2/UI/conf/glite_wms_CERN.conf'
        config.BossAir.credentialDir = '/home/crab/ALL_SETUP/credentials/'
        config.BossAir.gLiteProcesses = 2
        config.BossAir.gLitePrefixEnv = "/lib64/"
        config.BossAir.pluginNames.append("gLitePlugin")
        config.BossAir.manualProxyPath = environ['X509_USER_PROXY']

        config.Agent.serverDN = "/we/bypass/myproxy/logon"

        #config.BossAir.pluginNames = ["gLitePlugin"]
        baAPI  = BossAirAPI(config = config)

        nJobs = 2
        jobDummies = self.createDummyJobs(nJobs = nJobs, location = 'grid-ce-01.ba.infn.it')

        jobPackage = os.path.join(self.testDir, 'JobPackage.pkl')
        f = open(jobPackage, 'w')
        f.write(' ')
        f.close()

        sandbox = os.path.join(self.testDir, 'sandbox.box')
        f = open(sandbox, 'w')
        f.write(' ')
        f.close()

        jobList = []
        userdn = executeCommand('grid-cert-info -subject -file %s' % config.BossAir.manualProxyPath)
        newuser = self.daoFactory(classname = "Users.New")
        newuser.execute(dn = userdn)
        for j in jobDummies:
            job = j # {'id': j['id']}
            job['custom']      = {'location': 'grid-ce-01.ba.infn.it'}
            job['location']    = 'grid-ce-01.ba.infn.it'
            job['plugin']      = 'gLitePlugin'
            job['name']        = j['name']
            job['cache_dir']   = self.testDir
            job['retry_count'] = 0
            job['owner']       = userdn
            job['packageDir']  = self.testDir
            job['sandbox']     = sandbox
            job['priority']    = None
            jobList.append(job)

        baAPI.submit(jobs = jobList)

        # Should be new jobs
        newJobs = baAPI._loadByStatus(status = 'New')
        self.assertNotEqual(len(newJobs), nJobs)

        time.sleep(2)
        baAPI.track()

        # Should be not anymore marked as new
        newJobs = baAPI._loadByStatus('New', 0)
        self.assertNotEqual(len(newJobs), nJobs)


        # Killing all the jobs
        baAPI.kill( jobList )
        #time.sleep(15)
        baAPI.track()

        ## Issues running tests below due to glite delay on marking job as killed
        # Should be just running jobs
        #killedJobs = baAPI._loadByStatus('Cancelled by user', 0)
        #self.assertEqual(len(killedJobs), 0)

        # Check if they're complete
        #completeJobs = baAPI.getComplete()
        #self.assertEqual(len(completeJobs), nJobs)

        return
class JobSubmitterPoller(BaseWorkerThread):
    """
    _JobSubmitterPoller_

    The jobSubmitterPoller takes the jobs and organizes them into packages
    before sending them to the individual plugin submitters.
    """
    def __init__(self, config):
        BaseWorkerThread.__init__(self)
        myThread = threading.currentThread()
        self.config = config

        #DAO factory for WMBS objects
        self.daoFactory = DAOFactory(package="WMCore.WMBS", logger=logging, dbinterface=myThread.dbi)

        #Libraries
        self.resourceControl = ResourceControl()
        self.changeState = ChangeState(self.config)
        self.bossAir = BossAirAPI(config=self.config)

        self.repollCount = getattr(self.config.JobSubmitter, 'repollCount', 10000)
        self.maxJobsPerPoll = int(getattr(self.config.JobSubmitter, 'maxJobsPerPoll', 1000))
        self.cacheRefreshSize = int(getattr(self.config.JobSubmitter, 'cacheRefreshSize', 30000))
        self.skipRefreshCount = int(getattr(self.config.JobSubmitter, 'skipRefreshCount', 20))
        self.packageSize = getattr(self.config.JobSubmitter, 'packageSize', 500)
        self.collSize = getattr(self.config.JobSubmitter, 'collectionSize', self.packageSize * 1000)
        self.maxTaskPriority = getattr(self.config.BossAir, 'maxTaskPriority', 1e7)

        # Additions for caching-based JobSubmitter
        self.cachedJobIDs = set()
        self.cachedJobs = {}
        self.jobDataCache = {}
        self.jobsToPackage = {}
        self.sandboxPackage = {}
        self.locationDict = {}
        self.taskTypePrioMap = {}
        self.drainSites = set()
        self.abortSites = set()
        self.refreshPollingCount = 0

        try:
            if not getattr(self.config.JobSubmitter, 'submitDir', None):
                self.config.JobSubmitter.submitDir = self.config.JobSubmitter.componentDir
            self.packageDir = os.path.join(self.config.JobSubmitter.submitDir, 'packages')

            if not os.path.exists(self.packageDir):
                os.makedirs(self.packageDir)
        except OSError as ex:
            msg = "Error while trying to create packageDir %s\n!"
            msg += str(ex)
            logging.error(msg)
            logging.debug("PackageDir: %s", self.packageDir)
            logging.debug("Config: %s", config)
            raise JobSubmitterPollerException(msg)


        # Now the DAOs
        self.listJobsAction = self.daoFactory(classname="Jobs.ListForSubmitter")
        self.setLocationAction = self.daoFactory(classname="Jobs.SetLocation")
        self.locationAction = self.daoFactory(classname="Locations.GetSiteInfo")
        self.setFWJRPathAction = self.daoFactory(classname="Jobs.SetFWJRPath")
        self.listWorkflows = self.daoFactory(classname="Workflow.ListForSubmitter")

        # Keep a record of the thresholds in memory
        self.currentRcThresholds = {}

        self.useReqMgrForCompletionCheck = getattr(self.config.TaskArchiver, 'useReqMgrForCompletionCheck', True)

        if self.useReqMgrForCompletionCheck:
            # only set up this when reqmgr is used (not Tier0)
            self.reqmgr2Svc = ReqMgr(self.config.TaskArchiver.ReqMgr2ServiceURL)
            self.abortedAndForceCompleteWorkflowCache = self.reqmgr2Svc.getAbortedAndForceCompleteRequestsFromMemoryCache()
        else:
            # Tier0 Case - just for the clarity (This private variable shouldn't be used
            self.abortedAndForceCompleteWorkflowCache = None

        return

    def getPackageCollection(self, sandboxDir):
        """
        _getPackageCollection_

        Given a jobID figure out which packageCollection
        it should belong in.
        """

        rawList = os.listdir(sandboxDir)
        collections = []
        numberList = []
        for entry in rawList:
            if 'PackageCollection' in entry:
                collections.append(entry)

        # If we have no collections, return 0 (PackageCollection_0)
        if len(collections) < 1:
            return 0

        # Loop over the list of PackageCollections
        for collection in collections:
            collectionPath = os.path.join(sandboxDir, collection)
            packageList = os.listdir(collectionPath)
            collectionNum = int(collection.split('_')[1])
            if len(packageList) < self.collSize:
                return collectionNum
            else:
                numberList.append(collectionNum)

        # If we got here, then all collections are full.  We'll need
        # a new one.  Find the highest number, increment by one
        numberList.sort()
        return numberList[-1] + 1

    def addJobsToPackage(self, loadedJob):
        """
        _addJobsToPackage_

        Add a job to a job package and then return the batch ID for the job.
        Packages are only written out to disk when they contain 100 jobs.  The
        flushJobsPackages() method must be called after all jobs have been added
        to the cache and before they are actually submitted to make sure all the
        job packages have been written to disk.
        """
        if loadedJob["workflow"] not in self.jobsToPackage:
            # First, let's pull all the information from the loadedJob
            batchid = "%s-%s" % (loadedJob["id"], loadedJob["retry_count"])
            sandboxDir = os.path.dirname(loadedJob["sandbox"])

            # Second, assemble the jobPackage location
            collectionIndex = self.getPackageCollection(sandboxDir)
            collectionDir = os.path.join(sandboxDir,
                                         'PackageCollection_%i' % collectionIndex,
                                         'batch_%s' % batchid)

            # Now create the package object
            self.jobsToPackage[loadedJob["workflow"]] = {"batchid": batchid,
                                                         'id': loadedJob['id'],
                                                         "package": JobPackage(directory=collectionDir)}

        jobPackage = self.jobsToPackage[loadedJob["workflow"]]["package"]
        jobPackage[loadedJob["id"]] = loadedJob.getDataStructsJob()
        batchDir = jobPackage['directory']

        if len(jobPackage.keys()) == self.packageSize:
            if not os.path.exists(batchDir):
                os.makedirs(batchDir)

            batchPath = os.path.join(batchDir, "JobPackage.pkl")
            jobPackage.save(batchPath)
            del self.jobsToPackage[loadedJob["workflow"]]

        return batchDir

    def flushJobPackages(self):
        """
        _flushJobPackages_

        Write any jobs packages to disk that haven't been written out already.
        """
        workflowNames = self.jobsToPackage.keys()
        for workflowName in workflowNames:
            jobPackage = self.jobsToPackage[workflowName]["package"]
            batchDir = jobPackage['directory']

            if not os.path.exists(batchDir):
                os.makedirs(batchDir)

            batchPath = os.path.join(batchDir, "JobPackage.pkl")
            jobPackage.save(batchPath)
            del self.jobsToPackage[workflowName]

        return

    def refreshCache(self):
        """
        _refreshCache_

        Query WMBS for all jobs in the 'created' state.  For all jobs returned
        from the query, check if they already exist in the cache.  If they
        don't, unpickle them and combine their site white and black list with
        the list of locations they can run at.  Add them to the cache.

        Each entry in the cache is a tuple with five items:
          - WMBS Job ID
          - Retry count
          - Batch ID
          - Path to sanbox
          - Path to cache directory
        """
        badJobs = dict([(x, []) for x in range(71101, 71105)])
        dbJobs = set()

        logging.info("Refreshing priority cache with currently %i jobs", len(self.cachedJobIDs))

        if self.cacheRefreshSize == -1 or len(self.cachedJobIDs) < self.cacheRefreshSize or \
           self.refreshPollingCount >= self.skipRefreshCount:
            newJobs = self.listJobsAction.execute()
            self.refreshPollingCount = 0

            if self.useReqMgrForCompletionCheck:
                # if reqmgr is used (not Tier0 Agent) get the aborted/forceCompleted record
                abortedAndForceCompleteRequests = self.abortedAndForceCompleteWorkflowCache.getData()
            else:
                #T0Agent
                abortedAndForceCompleteRequests = []

            logging.info("Found %s new jobs to be submitted.", len(newJobs))
        else:
            self.refreshPollingCount += 1
            newJobs = []
            dbJobs = self.cachedJobIDs
            abortedAndForceCompleteRequests = []
            logging.info("Skipping cache update to be submitted. (%s job in cache)", len(dbJobs))

        logging.info("Determining possible sites for new jobs...")
        jobCount = 0
        for newJob in newJobs:
            # whether newJob belongs to aborted or force-complete workflow, and skip it if it is.
            if (newJob['request_name'] in abortedAndForceCompleteRequests) and \
               (newJob['type'] not in ['LogCollect', "Cleanup"]):
                continue

            jobID = newJob['id']
            dbJobs.add(jobID)
            if jobID in self.cachedJobIDs:
                continue

            jobCount += 1
            if jobCount % 5000 == 0:
                logging.info("Processed %d/%d new jobs.", jobCount, len(newJobs))

            pickledJobPath = os.path.join(newJob["cache_dir"], "job.pkl")

            if not os.path.isfile(pickledJobPath):
                # Then we have a problem - there's no file
                logging.error("Could not find pickled jobObject %s", pickledJobPath)
                badJobs[71103].append(newJob)
                continue
            try:
                jobHandle = open(pickledJobPath, "r")
                loadedJob = pickle.load(jobHandle)
                jobHandle.close()
            except Exception as ex:
                msg = "Error while loading pickled job object %s\n" % pickledJobPath
                msg += str(ex)
                logging.error(msg)
                raise JobSubmitterPollerException(msg)

            loadedJob['retry_count'] = newJob['retry_count']

            # figure out possible locations for job
            possibleLocations = loadedJob["possiblePSN"]

            # Create another set of locations that may change when a site goes white/black listed
            # Does not care about the non_draining or aborted sites, they may change and that is the point
            potentialLocations = set()
            potentialLocations.update(possibleLocations)

            # now check for sites in drain and adjust the possible locations
            # also check if there is at least one site left to run the job
            if len(possibleLocations) == 0:
                newJob['name'] = loadedJob['name']
                newJob['fileLocations'] = loadedJob.get('fileLocations', [])
                newJob['siteWhitelist'] = loadedJob.get('siteWhitelist', [])
                newJob['siteBlacklist'] = loadedJob.get('siteBlacklist', [])
                badJobs[71101].append(newJob)
                continue
            else:
                nonAbortSites = [x for x in possibleLocations if x not in self.abortSites]
                if nonAbortSites: # if there is at least a non aborted/down site then run there, otherwise fail the job
                    possibleLocations = nonAbortSites
                else:
                    newJob['name'] = loadedJob['name']
                    newJob['possibleLocations'] = possibleLocations
                    badJobs[71102].append(newJob)
                    continue

            # try to remove draining sites if possible, this is needed to stop
            # jobs that could run anywhere blocking draining sites
            # if the job type is Merge, LogCollect or Cleanup this is skipped
            if newJob['type'] not in ('LogCollect', 'Merge', 'Cleanup', 'Harvesting'):
                nonDrainingSites = [x for x in possibleLocations if x not in self.drainSites]
                if nonDrainingSites: # if >1 viable non-draining site remove draining ones
                    possibleLocations = nonDrainingSites
                else:
                    newJob['name'] = loadedJob['name']
                    newJob['possibleLocations'] = possibleLocations
                    badJobs[71104].append(newJob)
                    continue

            # locations clear of abort and draining sites
            newJob['possibleLocations'] = possibleLocations

            batchDir = self.addJobsToPackage(loadedJob)
            self.cachedJobIDs.add(jobID)

            # calculate the final job priority such that we can order cached jobs by prio
            jobPrio = self.taskTypePrioMap.get(newJob['type'], 0) + newJob['wf_priority']
            if jobPrio not in self.cachedJobs:
                self.cachedJobs[jobPrio] = {}

            # now add basic information keyed by the jobid
            self.cachedJobs[jobPrio][jobID] = newJob

            # allow job baggage to override numberOfCores
            #       => used for repacking to get more slots/disk
            numberOfCores = loadedJob.get('numberOfCores', 1)
            if numberOfCores == 1:
                baggage = loadedJob.getBaggage()
                numberOfCores = getattr(baggage, "numberOfCores", 1)
            loadedJob['numberOfCores'] = numberOfCores

            # Create a job dictionary object and put it in the cache (needs to be in sync with RunJob)
            jobInfo = {'id': jobID,
                       'requestName': newJob['request_name'],
                       'taskName': newJob['task_name'],
                       'taskType': newJob['type'],
                       'cache_dir': newJob["cache_dir"],
                       'priority': newJob['wf_priority'],
                       'taskID': newJob['task_id'],
                       'retry_count': newJob["retry_count"],
                       'taskPriority': None,                                # update from the thresholds
                       'custom': {'location': None},                        # update later
                       'packageDir': batchDir,
                       'sandbox': loadedJob["sandbox"],                     # remove before submit
                       'userdn': loadedJob.get("ownerDN", None),
                       'usergroup': loadedJob.get("ownerGroup", ''),
                       'userrole': loadedJob.get("ownerRole", ''),
                       'possibleSites': frozenset(possibleLocations),       # abort and drain sites filtered out
                       'potentialSites': frozenset(potentialLocations),     # original list of sites
                       'scramArch': loadedJob.get("scramArch", None),
                       'swVersion': loadedJob.get("swVersion", None),
                       'name': loadedJob["name"],
                       'proxyPath': loadedJob.get("proxyPath", None),
                       'estimatedJobTime': loadedJob.get("estimatedJobTime", None),
                       'estimatedDiskUsage': loadedJob.get("estimatedDiskUsage", None),
                       'estimatedMemoryUsage': loadedJob.get("estimatedMemoryUsage", None),
                       'numberOfCores': loadedJob.get("numberOfCores", 1),  # may update it later
                       'inputDataset': loadedJob.get('inputDataset', None),
                       'inputDatasetLocations': loadedJob.get('inputDatasetLocations', None),
                       'allowOpportunistic': loadedJob.get('allowOpportunistic', False)}

            self.jobDataCache[jobID] = jobInfo

        # Register failures in submission
        for errorCode in badJobs:
            if badJobs[errorCode]:
                logging.debug("The following jobs could not be submitted: %s, error code : %d", badJobs, errorCode)
                self._handleSubmitFailedJobs(badJobs[errorCode], errorCode)

        # If there are any leftover jobs, we want to get rid of them.
        self.flushJobPackages()

        # We need to remove any jobs from the cache that were not returned in
        # the last call to the database.
        jobIDsToPurge = self.cachedJobIDs - dbJobs
        self._purgeJobsFromCache(jobIDsToPurge)

        logging.info("Done pruning killed jobs, moving on to submit.")
        return

    def removeAbortedForceCompletedWorkflowFromCache(self):
        abortedAndForceCompleteRequests = self.abortedAndForceCompleteWorkflowCache.getData()
        jobIDsToPurge = set()
        for jobID, jobInfo in self.jobDataCache.iteritems():
            if (jobInfo['requestName'] in abortedAndForceCompleteRequests) and \
               (jobInfo['taskType'] not in ['LogCollect', "Cleanup"]):
                jobIDsToPurge.add(jobID)
        self._purgeJobsFromCache(jobIDsToPurge)
        return

    def _purgeJobsFromCache(self, jobIDsToPurge):

        if len(jobIDsToPurge) == 0:
            return

        self.cachedJobIDs -= jobIDsToPurge

        for jobid in jobIDsToPurge:
            self.jobDataCache.pop(jobid, None)
            for jobPrio in self.cachedJobs:
                if self.cachedJobs[jobPrio].pop(jobid, None):
                    # then the jobid was found, go to the next one
                    break
        return

    def _handleSubmitFailedJobs(self, badJobs, exitCode):
        """
        __handleSubmitFailedJobs_

        For a default job report for the exitCode
        and register in the job. Preserve it on disk as well.
        Propagate the failure to the JobStateMachine.
        """
        fwjrBinds = []
        for job in badJobs:
            job['couch_record'] = None
            job['fwjr'] = Report()
            if exitCode in [71102, 71104]:
                job['fwjr'].addError("JobSubmit", exitCode, "SubmitFailed", WM_JOB_ERROR_CODES[exitCode] + ', '.join(job['possibleLocations']))
            elif exitCode in [71101]:
                # there is no possible site
                if job.get("fileLocations"):
                    job['fwjr'].addError("JobSubmit", exitCode, "SubmitFailed", WM_JOB_ERROR_CODES[exitCode]  +
                                         ": file locations: " + ', '.join(job['fileLocations']) +
                                         ": site white list: " + ', '.join(job['siteWhitelist']) +
                                         ": site black list: " + ', '.join(job['siteBlacklist']))
                else:
                    # This is temporary addition if this is patched for existing agent.
                    # If jobs are created before the patch is applied fileLocations is not set.
                    # TODO. remove this later for new agent
                    job['fwjr'].addError("JobSubmit", exitCode, "SubmitFailed", WM_JOB_ERROR_CODES[exitCode]  +
                                         ": Job is created before this patch. Please check this input for the jobs: %s " %
                                         job['fwjr'].getAllInputFiles())

            else:
                job['fwjr'].addError("JobSubmit", exitCode, "SubmitFailed", WM_JOB_ERROR_CODES[exitCode])
            fwjrPath = os.path.join(job['cache_dir'],
                                    'Report.%d.pkl' % int(job['retry_count']))
            job['fwjr'].setJobID(job['id'])
            try:
                job['fwjr'].save(fwjrPath)
                fwjrBinds.append({"jobid" : job["id"], "fwjrpath" : fwjrPath})
            except IOError as ioer:
                logging.error("Failed to write FWJR for submit failed job %d, message: %s", job['id'], str(ioer))
        self.changeState.propagate(badJobs, "submitfailed", "created")
        self.setFWJRPathAction.execute(binds=fwjrBinds)
        return

    def getThresholds(self):
        """
        _getThresholds_

        Retrieve submit thresholds, which considers what is pending and running
        for those sites.
        Also update the list of draining and abort/down sites.
        Finally, creates a map between task type and its priority.
        """
        self.taskTypePrioMap = {}
        newDrainSites = set()
        newAbortSites = set()

        rcThresholds = self.resourceControl.listThresholdsForSubmit()

        for siteName in rcThresholds.keys():
            # Add threshold if we don't have it already
            state = rcThresholds[siteName]["state"]

            if state == "Draining":
                newDrainSites.add(siteName)
            if state in ["Down", "Aborted"]:
                newAbortSites.add(siteName)

            # then update the task type x task priority mapping
            if not self.taskTypePrioMap:
                for task, value in rcThresholds[siteName]['thresholds'].items():
                    self.taskTypePrioMap[task] = value.get('priority', 0) * self.maxTaskPriority

        # When the list of drain/abort sites change between iteration then a location
        # refresh is needed, for now it forces a full cache refresh
        if newDrainSites != self.drainSites or  newAbortSites != self.abortSites:
            logging.info("Draining or Aborted sites have changed, the cache will be rebuilt.")
            self.cachedJobIDs = set()
            self.cachedJobs = {}
            self.jobDataCache = {}

        self.currentRcThresholds = rcThresholds
        self.abortSites = newAbortSites
        self.drainSites = newDrainSites

        return


    def assignJobLocations(self):
        """
        _assignJobLocations_

        Loop through the submit thresholds and pull sites out of the job cache
        as we discover open slots.  This will return a list of tuple where each
        tuple will have six elements:
          - WMBS Job ID
          - Retry count
          - Batch ID
          - Path to sanbox
          - Path to cache directory
          - SE name of the site to run at
        """
        jobsToSubmit = {}
        jobsToUncache = []
        jobsCount = 0
        exitLoop = False
        jobSubmitLogBySites = defaultdict(Counter)
        jobSubmitLogByPriority = defaultdict(Counter)

        # iterate over jobs from the highest to the lowest prio
        for jobPrio in sorted(self.cachedJobs, reverse=True):

            # then we're completely done and have our basket full of jobs to submit
            if exitLoop:
                break

            # start eating through the elder jobs first
            for job in sorted(self.cachedJobs[jobPrio].values(), key=itemgetter('timestamp')):
                jobid = job['id']
                jobType = job['type']
                possibleSites = job['possibleLocations']
                jobSubmitLogByPriority[jobPrio]['Total'] += 1
                # now look for sites with free pending slots
                for siteName in possibleSites:
                    if siteName not in self.currentRcThresholds:
                        logging.warn("Have a job for %s which is not in the resource control", siteName)
                        continue

                    try:
                        totalPendingSlots = self.currentRcThresholds[siteName]["total_pending_slots"]
                        totalPendingJobs = self.currentRcThresholds[siteName]["total_pending_jobs"]
                        totalRunningSlots = self.currentRcThresholds[siteName]["total_running_slots"]
                        totalRunningJobs = self.currentRcThresholds[siteName]["total_running_jobs"]

                        taskPendingSlots = self.currentRcThresholds[siteName]['thresholds'][jobType]["pending_slots"]
                        taskPendingJobs = self.currentRcThresholds[siteName]['thresholds'][jobType]["task_pending_jobs"]
                        taskRunningSlots = self.currentRcThresholds[siteName]['thresholds'][jobType]["max_slots"]
                        taskRunningJobs = self.currentRcThresholds[siteName]['thresholds'][jobType]["task_running_jobs"]
                        taskPriority = self.currentRcThresholds[siteName]['thresholds'][jobType]["priority"]
                    except KeyError as ex:
                        msg = "Invalid key for site %s and job type %s\n" % (siteName, jobType)
                        msg += str(ex)
                        logging.error(msg)
                        continue

                    # check if site has free pending slots AND free pending task slots
                    if totalPendingJobs >= totalPendingSlots or taskPendingJobs >= taskPendingSlots:
                        jobSubmitLogBySites[siteName]["NoPendingSlot"] += 1
                        logging.debug("Found a job for %s which has no free pending slots", siteName)
                        continue
                    # check if site overall thresholds have free slots
                    if totalPendingJobs + totalRunningJobs >= totalPendingSlots + totalRunningSlots:
                        jobSubmitLogBySites[siteName]["NoRunningSlot"] += 1
                        logging.debug("Found a job for %s which has no free overall slots", siteName)
                        continue
                    # finally, check whether task has free overall slots
                    if taskPendingJobs + taskRunningJobs >= taskPendingSlots + taskRunningSlots:
                        jobSubmitLogBySites[siteName]["NoTaskSlot"] += 1
                        logging.debug("Found a job for %s which has no free task slots", siteName)
                        continue

                    # otherwise, update the site/task thresholds and the component job counter
                    self.currentRcThresholds[siteName]["total_pending_jobs"] += 1
                    self.currentRcThresholds[siteName]['thresholds'][jobType]["task_pending_jobs"] += 1
                    jobsCount += 1

                    # load (and remove) the job dictionary object from jobDataCache
                    cachedJob = self.jobDataCache.pop(jobid)
                    jobsToUncache.append((jobPrio, jobid))

                    # Sort jobs by jobPackage
                    package = cachedJob['packageDir']
                    if package not in jobsToSubmit.keys():
                        jobsToSubmit[package] = []

                    # Add the sandbox to a global list
                    self.sandboxPackage[package] = cachedJob.pop('sandbox')

                    # Now update the job dictionary object
                    cachedJob['custom'] = {'location': siteName}
                    cachedJob['taskPriority'] = taskPriority

                    # Get this job in place to be submitted by the plugin
                    jobsToSubmit[package].append(cachedJob)

                    jobSubmitLogBySites[siteName]["submitted"] += 1
                    jobSubmitLogByPriority[jobPrio]['submitted'] += 1
                    # found a site to submit this job, so go to the next job
                    break

                # set the flag and get out of the job iteration
                if jobsCount >= self.maxJobsPerPoll:
                    exitLoop = True
                    break

        # jobs that are going to be submitted must be removed from all caches
        for prio, jobid in jobsToUncache:
            self.cachedJobs[prio].pop(jobid)
            self.cachedJobIDs.remove(jobid)

        logging.info("Site submission report: %s", dict(jobSubmitLogBySites))
        logging.info("Priority submission report: %s", dict(jobSubmitLogByPriority))
        logging.info("Have %s packages to submit.", len(jobsToSubmit))
        logging.info("Have %s jobs to submit.", jobsCount)
        logging.info("Done assigning site locations.")
        return jobsToSubmit


    def submitJobs(self, jobsToSubmit):
        """
        _submitJobs_

        Actually do the submission of the jobs
        """

        jobList = []
        idList = []

        if len(jobsToSubmit) == 0:
            logging.debug("There are no packages to submit.")
            return

        for package in jobsToSubmit.keys():

            sandbox = self.sandboxPackage[package]
            jobs = jobsToSubmit.get(package, [])
            for job in jobs:
                job['location'], job['plugin'], job['site_cms_name'] = self.getSiteInfo(job['custom']['location'])
                job['sandbox'] = sandbox
                idList.append({'jobid': job['id'], 'location': job['custom']['location']})

            #Clean out the package reference
            del self.sandboxPackage[package]

            jobList.extend(jobs)

        myThread = threading.currentThread()
        myThread.transaction.begin()

        # Run the actual underlying submit code using bossAir
        successList, failList = self.bossAir.submit(jobs=jobList)
        logging.info("Jobs that succeeded/failed submission: %d/%d.", len(successList), len(failList))

        # Propagate states in the WMBS database
        logging.debug("Propagating success state to WMBS.")
        self.changeState.propagate(successList, 'executing', 'created')
        logging.debug("Propagating fail state to WMBS.")
        self.changeState.propagate(failList, 'submitfailed', 'created')

        # At the end we mark the locations of the jobs
        # This applies even to failed jobs, since the location
        # could be part of the failure reason.
        logging.debug("Updating job location...")
        self.setLocationAction.execute(bulkList=idList, conn=myThread.transaction.conn,
                                       transaction=True)
        myThread.transaction.commit()
        logging.info("Transaction cycle successfully completed.")

        return


    def getSiteInfo(self, jobSite):
        """
        _getSiteInfo_

        This is how you get the name of a CE and the plugin for a job
        """

        if not jobSite in self.locationDict.keys():
            siteInfo = self.locationAction.execute(siteName=jobSite)
            self.locationDict[jobSite] = siteInfo[0]
        return (self.locationDict[jobSite].get('ce_name'),
                self.locationDict[jobSite].get('plugin'),
                self.locationDict[jobSite].get('cms_name'))

    def algorithm(self, parameters=None):
        """
        _algorithm_

        Try to, in order:
        1) Refresh the cache
        2) Find jobs for all the necessary sites
        3) Submit the jobs to the plugin
        """

        try:
            myThread = threading.currentThread()
            self.getThresholds()
            self.refreshCache()

            if self.useReqMgrForCompletionCheck:
                # only runs when reqmgr is used (not Tier0)
                self.removeAbortedForceCompletedWorkflowFromCache()

            jobsToSubmit = self.assignJobLocations()
            self.submitJobs(jobsToSubmit=jobsToSubmit)
        except WMException:
            if getattr(myThread, 'transaction', None) != None:
                myThread.transaction.rollback()
            raise
        except Exception as ex:
            msg = 'Fatal error in JobSubmitter:\n'
            msg += str(ex)
            #msg += str(traceback.format_exc())
            msg += '\n\n'
            logging.error(msg)
            if getattr(myThread, 'transaction', None) != None:
                myThread.transaction.rollback()
            raise JobSubmitterPollerException(msg)

        return



    def terminate(self, params):
        """
        _terminate_

        Kill the code after one final pass when called by the master thread.
        """
        logging.debug("terminating. doing one more pass before we die")
        self.algorithm(params)
示例#17
0
    def testB_PluginTest(self):
        """
        _PluginTest_


        Now check that these functions worked if called through plugins
        Instead of directly.

        There are only three plugin
        """
        #return

        myThread = threading.currentThread()

        config = self.getConfig()

        baAPI = BossAirAPI(config=config)

        # Create some jobs
        nJobs = 10

        jobDummies = self.createDummyJobs(nJobs=nJobs, location='Xanadu')
        changeState = ChangeState(config)
        changeState.propagate(jobDummies, 'created', 'new')
        changeState.propagate(jobDummies, 'executing', 'created')

        # Prior to building the job, each job must have a plugin
        # and user assigned
        for job in jobDummies:
            job['plugin'] = 'TestPlugin'
            job['owner'] = 'tapas'

        baAPI.submit(jobs=jobDummies)

        newJobs = baAPI._loadByStatus(status='New')
        self.assertEqual(len(newJobs), nJobs)

        # Should be no more running jobs
        runningJobs = baAPI._listRunJobs()
        self.assertEqual(len(runningJobs), nJobs)

        # Test Plugin should complete all jobs
        baAPI.track()

        # Should be no more running jobs
        runningJobs = baAPI._listRunJobs()
        self.assertEqual(len(runningJobs), 0)

        # Check if they're complete
        completeJobs = baAPI.getComplete()
        self.assertEqual(len(completeJobs), nJobs)

        # Do this test because BossAir is specifically built
        # to keep it from finding completed jobs
        result = myThread.dbi.processData(
            "SELECT id FROM bl_runjob")[0].fetchall()
        self.assertEqual(len(result), nJobs)

        baAPI.removeComplete(jobs=jobDummies)

        result = myThread.dbi.processData(
            "SELECT id FROM bl_runjob")[0].fetchall()
        self.assertEqual(len(result), 0)

        return
示例#18
0
    def testH_ARCTest(self):
        """
        _ARCTest_

        This test works on the ARCPlugin, checking all of
        its functions with a single set of jobs
        """

        nRunning = getNArcJobs()
        self.assertEqual(nRunning, 0, "User currently has %i running jobs.  Test will not continue" % (nRunning))

        config = self.getConfig()
        config.BossAir.pluginNames.append("ARCPlugin")
        #config.BossAir.pluginNames = ["ARCPlugin"]
        baAPI  = BossAirAPI(config = config)

        nJobs = 2
        jobDummies = self.createDummyJobs(nJobs = nJobs, location = 'jade-cms.hip.fi')
        #baAPI.createNewJobs(wmbsJobs = jobDummies)
        #changeState = ChangeState(config)
        #changeState.propagate(jobDummies, 'created', 'new')
        #changeState.propagate(jobDummies, 'executing', 'created')

        jobPackage = os.path.join(self.testDir, 'JobPackage.pkl')
        f = open(jobPackage, 'w')
        f.write(' ')
        f.close()

        sandbox = os.path.join(self.testDir, 'sandbox.box')
        f = open(sandbox, 'w')
        f.write(' ')
        f.close()

        jobList = []
        for j in jobDummies:
            job = j # {'id': j['id']}
            job['custom']      = {'location': 'jade-cms.hip.fi'}
            job['location'] = 'jade-cms.hip.fi'
            job['plugin'] = 'ARCPlugin'
            job['name']        = j['name']
            job['cache_dir']   = self.testDir
            job['retry_count'] = 0
            job['owner']       = 'edelmann'
            job['packageDir']  = self.testDir
            job['sandbox']     = sandbox
            job['priority']    = None
            jobList.append(job)

        baAPI.submit(jobs = jobList)

        nRunning = getNArcJobs()
        self.assertEqual(nRunning, nJobs)

        newJobs = baAPI._loadByStatus(status = 'New')
        self.assertEqual(len(newJobs), nJobs)

        baAPI.track()

        newJobs = baAPI._loadByStatus(status = 'New')
        self.assertEqual(len(newJobs), 0)

        rJobs = baAPI._listRunJobs()
        nOldJobs = 0
        for j in rJobs:
            if j['status'] != "New":
                nOldJobs += 1
        self.assertEqual(nOldJobs, nJobs)

            #if baAPI.plugins['ARCPlugin'].stateDict[j['status']] in [ "Pending", "Running" ]:

        baAPI.kill(jobs = jobList)

        nRunning = getNArcJobs()
        self.assertEqual(nRunning, 0)

        # Try resubmission
        for j in jobList:
            j['retry_count'] = 1

        succ, fail = baAPI.submit(jobs = jobList)

        time.sleep(30)

        nRunning = getNArcJobs()
        self.assertEqual(nRunning, nJobs)

        newJobs = baAPI._loadByStatus(status = 'New')
        self.assertEqual(len(newJobs), nJobs)

        # See where they are
        baAPI.track()

        newJobs = baAPI._loadByStatus(status = 'New')
        self.assertEqual(len(newJobs), 0)

        rJobs = baAPI._listRunJobs()
        nOldJobs = 0
        idStr = ""
        for j in rJobs:
            idStr += " " + j['gridid']
            if j['status'] != "New":
                nOldJobs += 1
        self.assertEqual(nOldJobs, nJobs)

        # Now kill 'em manually
        no_jobs = True
        while no_jobs:
            command = 'LD_LIBRARY_PATH=$CLEAN_LD_LIBRARY_PATH ngkill -t 180 -a'
            pipe = Popen(command, stdout = PIPE, stderr = STDOUT, shell = True)
            output = pipe.communicate()[0]
            if output.find("Job information not found") >= 0:
                # It seems the jobs hasn't reached the ARC info.sys yet.
                # Sleep a while and try again
                time.sleep(20)
                continue
            else:
                no_jobs = False

            # Just to be sure, if the jobs were already finished, do a
            # 'ngclean' too.
            command = 'LD_LIBRARY_PATH=$CLEAN_LD_LIBRARY_PATH ngclean -t 180 -a'
            pipe = Popen(command, stdout = PIPE, stderr = STDOUT, shell = True)
            output = pipe.communicate()[0]

        # Make sure the killing of the jobs reaches the info.sys.
        still_jobs = True
        while still_jobs:
            command = 'LD_LIBRARY_PATH=$CLEAN_LD_LIBRARY_PATH ngstat -t 180 ' + idStr
            pipe = Popen(command, stdout = PIPE, stderr = STDOUT, shell = True)
            output = pipe.communicate()[0]
            if output.find("Job information not found") < 0:
                # It seems the killing of the jobs hasn't reached the ARC info.sys yet.
                # Sleep a while and try again
                time.sleep(20)
                continue
            else:
                still_jobs = False

        # See what happened
        baAPI.track()

        idJobs = baAPI._loadByID(rJobs)
        nActiveJobs = 0
        nRemovedJobs = 0
        for j in idJobs:
            if j['status'] not in [ "New", "KILLING", "KILLED", "LOST" ]:
                nActiveJobs += 1
            if j['status'] in [ "KILLING", "KILLED", "LOST" ]:
                nRemovedJobs += 1
        self.assertEqual(nActiveJobs, 0)
        self.assertEqual(nRemovedJobs, nJobs)

        return