Example #1
    def testGrouper(self):
        """
        Test the grouper function (returns chunks of an iterable)
        """

        listChunks = [i for i in grouper(list(range(0, 7)), 3)]  # Want list(range) for python 3
        iterChunks = [i for i in grouper(xrange(0, 7), 3)]  # xrange becomes range in python 3

        for a, b in itertools.izip_longest(listChunks, iterChunks):
            self.assertEqual(a, b)

        self.assertEqual(listChunks[-1], [6])
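
The following is a minimal sketch of the grouper() behaviour this test exercises, assuming only what the assertions show (chunks of at most n items, with a shorter final chunk instead of padding); grouperSketch is a hypothetical stand-in, not the WMCore implementation.

import itertools

def grouperSketch(iterable, n):
    """Yield successive lists of at most n items from iterable."""
    it = iter(iterable)
    while True:
        chunk = list(itertools.islice(it, n))
        if not chunk:
            return
        yield chunk

# Matches the assertions above: the final chunk is [6], not padded with a fillvalue.
assert [c for c in grouperSketch(range(7), 3)] == [[0, 1, 2], [3, 4, 5], [6]]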
Example #2
    def testGrouper(self):
        """
        Test the grouper function (returns chunks of an iterable)
        """

        listChunks = [i for i in grouper(list(range(0, 7)), 3)]  # Want list(range) for python 3
        iterChunks = [i for i in grouper(xrange(0, 7), 3)]  # xrange becomes range in python 3

        for a, b in itertools.izip_longest(listChunks, iterChunks):
            self.assertEqual(a, b)

        self.assertEqual(listChunks[-1], [6])
Example #3
    def _getLumiList(self, blockName=None, lfns=None, validFileOnly=1):
        """
        Currently only takes one LFN, but the DBS API needs to be updated.
        """
        try:
            if blockName:
                lumiLists = self.dbs.listFileLumis(block_name=blockName, validFileOnly=validFileOnly)
            elif lfns:
                lumiLists = []
                for slfn in grouper(lfns, 50):
                    lumiLists.extend(self.dbs.listFileLumiArray(logical_file_name = slfn))
            else:
                # shouldn't call this with both blockName and lfns empty
                # but still returns empty dict for that case
                return {}
        except dbsClientException as ex:
            msg = "Error in "
            msg += "DBSReader.listFileLumiArray(%s)\n" % lfns
            msg += "%s\n" % formatEx3(ex)
            raise DBSReaderError(msg)

        lumiDict = {}
        for lumisItem in lumiLists:
            lumiDict.setdefault(lumisItem['logical_file_name'], [])
            item = {}
            item["RunNumber"] = lumisItem['run_num']
            item['LumiSectionNumber'] = lumisItem['lumi_section_num']
            lumiDict[lumisItem['logical_file_name']].append(item)
        return lumiDict
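
For illustration, a hypothetical return value of _getLumiList() built from the keys set above (the LFN and the run/lumi values are invented, and the exact value types depend on the DBS API response):

exampleLumiDict = {
    "/store/data/Run2016/Example/file.root": [
        {"RunNumber": 273158, "LumiSectionNumber": [1, 2, 3]},
    ],
}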
Example #4
    def _getLumiList(self, blockName=None, lfns=None, validFileOnly=1):
        """
        Currently only takes one LFN, but the DBS API needs to be updated.
        """
        try:
            if blockName:
                lumiLists = self.dbs.listFileLumis(block_name=blockName,
                                                   validFileOnly=validFileOnly)
            elif lfns:
                lumiLists = []
                for slfn in grouper(lfns, 50):
                    lumiLists.extend(
                        self.dbs.listFileLumiArray(logical_file_name=slfn))
            else:
                # shouldn't be called with both blockName and lfns empty,
                # but return an empty dict for that case
                return {}
        except dbsClientException as ex:
            msg = "Error in "
            msg += "DBSReader.listFileLumiArray(%s)\n" % lfns
            msg += "%s\n" % formatEx3(ex)
            raise DBSReaderError(msg)

        lumiDict = {}
        for lumisItem in lumiLists:
            lumiDict.setdefault(lumisItem['logical_file_name'], [])
            item = {}
            item["RunNumber"] = lumisItem['run_num']
            item['LumiSectionNumber'] = lumisItem['lumi_section_num']
            lumiDict[lumisItem['logical_file_name']].append(item)
        return lumiDict
Example #5
    def algorithm(self, parameters):
        """
        get information from wmbs, workqueue and local couch
        """
        try:
            data = self.fwjrAPI.getFWJRByArchiveStatus('ready', limit=self.numDocsRetrievePerPolling)['rows']
            logging.info("Found %i not archived documents from FWRJ db to upload to WMArchive.", len(data))

            for slicedData in grouper(data, self.numDocsUploadPerCall):
                jobIDs = []
                archiveDocs = []
                for job in slicedData:
                    doc = createArchiverDoc(job)
                    archiveDocs.append(doc)
                    jobIDs.append(job["id"])

                response = self.wmarchiver.archiveData(archiveDocs)

                # Partial success is not allowed: either all inserts succeed or none does
                if response[0]['status'] == "ok" and len(response[0]['ids']) == len(jobIDs):
                    archiveIDs = response[0]['ids']
                    for docID in jobIDs:
                        self.fwjrAPI.updateArchiveUploadedStatus(docID)
                    logging.info("...successfully uploaded %d docs", len(jobIDs))
                    logging.debug("JobIDs uploaded: %s", jobIDs)
                    logging.debug("Archived IDs returned: %s", archiveIDs)
                else:
                    logging.warning("Upload failed and it will be retried in the next cycle: %s: %s.",
                                    response[0]['status'], response[0]['reason'])
                    logging.debug("failed JobIds %s", jobIDs)
        except Exception as ex:
            logging.error("Error occurred, will retry later:")
            logging.error(str(ex))
            logging.error("Trace back: \n%s", traceback.format_exc())
Example #6
    def listFilesInBlockWithParents(self, fileBlockName, lumis = True, validFileOnly = 1):
        """
        _listFilesInBlockWithParents_

        Get a list of files in the named fileblock including
        the parents of that file.
        TODO: lumis can be False when lumi splitting is not required.
        However, WMBSHelper expects file['LumiList'] to get the run number,
        so for now it will always be True.

        """
        if not self.blockExists(fileBlockName):
            msg = "DBSReader.listFilesInBlockWithParents(%s): No matching data"
            raise DBSReaderError(msg % fileBlockName)

        try:
            # TODO: should we get only valid blocks for this?
            files = self.dbs.listFileParents(block_name = fileBlockName)
            fileDetails = self.listFilesInBlock(fileBlockName, lumis, validFileOnly)

        except dbsClientException as ex:
            msg = "Error in "
            msg += "DBSReader.listFilesInBlockWithParents(%s)\n" % (
                fileBlockName,)
            msg += "%s\n" % formatEx3(ex)
            raise DBSReaderError(msg)

        childByParents = defaultdict(list)
        for f in files:
            # Probably a child can have more than 1 parent file
            for fp in f['parent_logical_file_name']:
                childByParents[fp].append(f['logical_file_name'])
        parentsLFNs = childByParents.keys()

        parentFilesDetail = []
        # TODO: slicing parentLFNs until the DBS API handles that.
        # Remove the slicing once the DBS API does.
        for pLFNs in grouper(parentsLFNs, 50):
            parentFilesDetail.extend(self.dbs.listFileArray(logical_file_name = pLFNs, detail = True))

        if lumis:
            parentLumis = self._getLumiList(lfns = parentsLFNs)

        parentsByLFN = defaultdict(list)

        for pf in parentFilesDetail:
            parentLFN = pf['logical_file_name']
            dbsFile = remapDBS3Keys(pf, stringify = True)
            if lumis:
                dbsFile["LumiList"] = parentLumis[parentLFN]

            for childLFN in childByParents[parentLFN]:
                parentsByLFN[childLFN].append(dbsFile)

        for fileInfo in fileDetails:
            fileInfo["ParentList"] = parentsByLFN[fileInfo['logical_file_name']]

        return fileDetails
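
A hypothetical illustration of the two mappings built above (the LFNs are invented): childByParents maps each parent LFN to its child LFNs, and parentsByLFN inverts that so each child carries the detail dictionaries of its parents.

childByParents = {
    "/store/mc/parent_A.root": ["/store/mc/child_1.root", "/store/mc/child_2.root"],
    "/store/mc/parent_B.root": ["/store/mc/child_1.root"],
}
parentsByLFN = {
    "/store/mc/child_1.root": ["<dbsFile dict for parent_A>", "<dbsFile dict for parent_B>"],
    "/store/mc/child_2.root": ["<dbsFile dict for parent_A>"],
}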
Example #7
    def getClassAds(self):
        """
        _getClassAds_

        Grab CONDOR classAds using CONDOR-PYTHON

        This looks at the schedd running on the
        Submit-Host and edits/removes jobs
        """

        jobInfo = {}
        schedd = condor.Schedd()

        try:
            logging.debug("Start: Retrieving classAds using Condor Python XQuery")
            itobj = schedd.xquery(
                'WMAgent_JobID =!= "UNDEFINED" && WMAgent_AgentName == %s' % classad.quote(str(self.agent)),
                ["JobStatus", "EnteredCurrentStatus", "JobStartDate", "QDate", "DESIRED_Sites",
                 "ExtDESIRED_Sites", "MATCH_EXP_JOBGLIDEIN_CMSSite", "WMAgent_JobID"]
                )
            logging.debug("Finish: Retrieving classAds using Condor Python XQuery")
        except Exception:
            msg = "Query to condor schedd failed in PyCondorPlugin"
            logging.debug(msg)
            return None, None
        else:
            for slicedAds in grouper(itobj, 1000):
                for jobAd in slicedAds:
                    ### This condition ignores jobs that are Removed, but stay in the X state
                    ### For manual condor_rm removal, the job won't be in the queue
                    ### and the status of the jobs will be read from the condor log
                    if jobAd["JobStatus"] == 3:
                        continue
                    else:
                        ## For some strange race condition, schedd sometimes does not publish StartDate for a Running Job
                        ## Get the entire classad for such a job
                        ## Do not crash WMA, wait for next polling cycle to get all the info.
                        if jobAd["JobStatus"] == 2 and jobAd.get("JobStartDate") is None:
                            logging.debug("THIS SHOULD NOT HAPPEN. JobStartDate is MISSING from the CLASSAD.")
                            logging.debug("Could be caused by some race condition. Wait for the next Polling Cycle")
                            logging.debug("%s", str(jobAd))
                            continue

                        tmpDict = {}
                        tmpDict["JobStatus"] = int(jobAd.get("JobStatus", 100))
                        tmpDict["stateTime"] = int(jobAd["EnteredCurrentStatus"])
                        tmpDict["runningTime"] = int(jobAd.get("JobStartDate", 0))
                        tmpDict["submitTime"] = int(jobAd["QDate"])
                        tmpDict["DESIRED_Sites"] = jobAd["DESIRED_Sites"]
                        tmpDict["ExtDESIRED_Sites"] = jobAd["ExtDESIRED_Sites"]
                        tmpDict["runningCMSSite"] = jobAd.get("MATCH_EXP_JOBGLIDEIN_CMSSite", None)
                        tmpDict["WMAgentID"] = int(jobAd["WMAgent_JobID"])
                        jobInfo[tmpDict["WMAgentID"]] = tmpDict

            logging.info("Retrieved %i classAds", len(jobInfo))

        return jobInfo, schedd
Example #8
    def submit(self, jobs, info=None):
        """
        _submit_


        Submit jobs for one subscription
        """
        successfulJobs = []
        failedJobs = []

        if len(jobs) == 0:
            # Then we have nothing to do
            return successfulJobs, failedJobs

        schedd = htcondor.Schedd()

        # Submit the jobs
        for jobsReady in grouper(jobs, self.jobsPerSubmit):

            cluster_ad = self.getClusterAd()
            proc_ads = self.getProcAds(jobsReady)

            logging.debug(
                "Start: Submitting %d jobs using Condor Python SubmitMany" %
                len(proc_ads))
            try:
                clusterId = schedd.submitMany(cluster_ad, proc_ads)
            except Exception as ex:
                logging.error("SimpleCondorPlugin job submission failed.")
                logging.error(
                    "Moving on the the next batch of jobs and/or cycle....")
                logging.exception(ex)

                condorErrorReport = Report()
                condorErrorReport.addError("JobSubmit", 61202, "CondorError",
                                           str(ex))
                for job in jobsReady:
                    job['fwjr'] = condorErrorReport
                    failedJobs.append(job)
            else:
                logging.debug(
                    "Finish: Submitting jobs using Condor Python SubmitMany")
                for index, job in enumerate(jobsReady):
                    job['gridid'] = "%s.%s" % (clusterId, index)
                    job['status'] = '1'
                    successfulJobs.append(job)

        # We must return a list of jobs successfully submitted and a list of jobs failed
        logging.info(
            "Done submitting jobs for this cycle in SimpleCondorPlugin")
        return successfulJobs, failedJobs
Example #9
def createDirectories(dirList):
    """
    Create the directories if everything is sane

    """
    for sdirList in grouper(dirList, 500):
        cmdArgs = ['mkdir']
        cmdArgs.extend(sdirList)
        pipe = Popen(cmdArgs, stdout=PIPE, stderr=PIPE, shell=False)
        stdout, stderr = pipe.communicate()
        if not stderr == "":
            msg = "Error in making directories: %s\n" % stderr
            logging.error(msg)
            logging.debug("Executing command %s\n" % cmdArgs)
            raise CreateWorkAreaException(msg)

    return
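
Note that under Python 3, Popen.communicate() returns bytes by default, so the stderr == "" comparison above never evaluates True, even on success. A hedged variant of the same check, assuming text-mode output is acceptable (cmdArgs and RuntimeError stand in for the example's own values and exception class):

from subprocess import Popen, PIPE

cmdArgs = ['mkdir', '/tmp/example_dir']  # hypothetical argument list
pipe = Popen(cmdArgs, stdout=PIPE, stderr=PIPE, shell=False,
             universal_newlines=True)  # decode stdout/stderr to str
stdout, stderr = pipe.communicate()
if stderr:
    # stand-in for CreateWorkAreaException in the example above
    raise RuntimeError("Error in making directories: %s\n" % stderr)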
Example #10
def createDirectories(dirList):
    """
    Create the directories if everything is sane

    """
    for sdirList in grouper(dirList, 500):
        cmdArgs = ['mkdir']
        cmdArgs.extend(sdirList)
        pipe = Popen(cmdArgs, stdout = PIPE, stderr = PIPE, shell = False)
        stdout, stderr = pipe.communicate()
        if not stderr == "":
            msg = "Error in making directories: %s\n" % stderr
            logging.error(msg)
            logging.debug("Executing command %s\n" % cmdArgs)
            raise CreateWorkAreaException(msg)

    return
Example #11
    def submit(self, jobs, info=None):
        """
        _submit_


        Submit jobs for one subscription
        """
        successfulJobs = []
        failedJobs = []

        if len(jobs) == 0:
            # Then we have nothing to do
            return successfulJobs, failedJobs

        schedd = htcondor.Schedd()

        # Submit the jobs
        for jobsReady in grouper(jobs, self.jobsPerSubmit):

            cluster_ad = self.getClusterAd()
            proc_ads = self.getProcAds(jobsReady)

            logging.debug("Start: Submitting %d jobs using Condor Python SubmitMany" % len(proc_ads))
            try:
                clusterId = schedd.submitMany(cluster_ad, proc_ads)
            except Exception as ex:
                logging.error("SimpleCondorPlugin job submission failed.")
                logging.error("Moving on the the next batch of jobs and/or cycle....")
                logging.exception(ex)

                condorErrorReport = Report()
                condorErrorReport.addError("JobSubmit", 61202, "CondorError", str(ex))
                for job in jobsReady:
                    job['fwjr'] = condorErrorReport
                    failedJobs.append(job)
            else:
                logging.debug("Finish: Submitting jobs using Condor Python SubmitMany")
                for index,job in enumerate(jobsReady):
                    job['gridid'] = "%s.%s" % (clusterId, index)
                    job['status'] = 'Idle'
                    successfulJobs.append(job)

        # We must return a list of jobs successfully submitted and a list of jobs failed
        logging.info("Done submitting jobs for this cycle in SimpleCondorPlugin")
        return successfulJobs, failedJobs
Example #12
    def archiveJobs(self):
        """
        _archiveJobs_

        archiveJobs will handle the master task of looking for finished jobs,
        and running the code that cleans them out.
        """
        doneList = self.findFinishedJobs()
        logging.info("Found %i finished jobs to archive", len(doneList))

        jobCounter = 0
        for slicedList in grouper(doneList, 10000):
            self.cleanWorkArea(slicedList)

            successList = []
            failList = []
            killList = []
            for job in slicedList:
                if job["outcome"] == "success":
                    successList.append(job)
                elif job["outcome"] == "killed":
                    killList.append(job)
                else:
                    failList.append(job)

            if self.uploadPublishInfo:
                self.createAndUploadPublish(successList)

            myThread = threading.currentThread()
            myThread.transaction.begin()
            self.changeState.propagate(successList, "cleanout", "success")
            self.changeState.propagate(failList, "cleanout", "exhausted")
            self.changeState.propagate(killList, "cleanout", "killed")
            myThread.transaction.commit()

            jobCounter += len(slicedList)
            logging.info("Successfully archived %d jobs out of %d.",
                         jobCounter, len(doneList))
Example #13
 def algorithm(self, parameters):
     """
     get information from wmbs, workqueue and local couch
     """
     try:
         logging.info("Getting not archived data info from FWRJ db...")
         data = self.fwjrAPI.getFWJRByArchiveStatus('ready', limit=1000)['rows']
         
         for slicedData in grouper(data, self.numDocsUploadPerCall):
             jobIDs = []
             archiverDocs = []
             for job in slicedData:
                 doc = createArchiverDoc(job)
                 archiverDocs.append(doc)
                 jobIDs.append(job["id"])
                 
             response = self.wmarchiver.archiveData(archiverDocs)
         
             # Partial success is not allowed: either all inserts succeed or none does.
             if response[0]['status'] == "ok" and len(response[0]['ids']) == len(jobIDs):
                 archiveIDs = response[0]['ids']
                 for docID in jobIDs:
                     self.fwjrAPI.updateArchiveUploadedStatus(docID)
                 logging.info("...successfully uploaded %d docs", len(jobIDs))
                 logging.debug("JobIDs uploaded: %s", jobIDs)
                 logging.debug("Archive IDs returned: %s", response[0]['ids'])
                 
                 if len(set(archiveIDs)) != len(archiveIDs):
                     duplicateIDs = set([x for x in archiveIDs if archiveIDs.count(x) > 1])
                     logging.info("There are duplicate entries: %s", duplicateIDs)
             else:
                 logging.warning("Upload failed: %s: %s", response[0]['status'], response[0]['reason'])
                 logging.debug("failed JobIds %s", jobIDs)
     except Exception as ex:
         logging.error("Error occurred, will retry later:")
         logging.error(str(ex))
         logging.error("Trace back: \n%s" % traceback.format_exc())
Example #14
    def archiveJobs(self):
        """
        _archiveJobs_

        archiveJobs will handle the master task of looking for finished jobs,
        and running the code that cleans them out.
        """
        doneList = self.findFinishedJobs()
        logging.info("Found %i finished jobs to archive", len(doneList))

        jobCounter = 0
        for slicedList in grouper(doneList, 10000):
            self.cleanWorkArea(slicedList)

            successList = []
            failList = []
            killList = []
            for job in slicedList:
                if job["outcome"] == "success":
                    successList.append(job)
                elif job["outcome"] == "killed":
                    killList.append(job)
                else:
                    failList.append(job)

            if self.uploadPublishInfo:
                self.createAndUploadPublish(successList)

            myThread = threading.currentThread()
            myThread.transaction.begin()
            self.changeState.propagate(successList, "cleanout", "success")
            self.changeState.propagate(failList, "cleanout", "exhausted")
            self.changeState.propagate(killList, "cleanout", "killed")
            myThread.transaction.commit()

            jobCounter += len(slicedList)
            logging.info("Successfully archived %d jobs out of %d.", jobCounter, len(doneList))
Example #15
    def execute(self, emulator=None):
        """
        _execute_

        """
        scramCommand = self.step.application.setup.scramCommand
        scramArch = self.step.application.setup.scramArch
        cmsswVersion = self.step.application.setup.cmsswVersion

        # Are we using emulators again?
        if emulator is not None:
            return emulator.emulate(self.step, self.job)

        overrides = {}
        if hasattr(self.step, 'override'):
            overrides = self.step.override.dictionary_()

        # Set wait to over an hour
        waitTime = overrides.get(
            'waitTime', 3600 + (self.step.retryDelay * self.step.retryCount))

        # hardcode CERN Castor T0_CH_CERN_MSS stageout parameters
        castorStageOutParams = {}
        castorStageOutParams['command'] = overrides.get('command', "xrdcp")
        castorStageOutParams['option'] = overrides.get('option',
                                                       "--cerncastor")
        castorStageOutParams['phedex-node'] = overrides.get(
            'phedex-node', "T2_CH_CERN")
        castorStageOutParams['lfn-prefix'] = overrides.get(
            'lfn-prefix', "root://castorcms.cern.ch//castor/cern.ch/cms")

        # hardcode CERN EOS T2_CH_CERN stageout parameters
        eosStageOutParams = {}
        eosStageOutParams['command'] = overrides.get('command', "xrdcp")
        eosStageOutParams['option'] = overrides.get('option', "")
        eosStageOutParams['phedex-node'] = overrides.get(
            'phedex-node', "T2_CH_CERN")
        eosStageOutParams['lfn-prefix'] = overrides.get(
            'lfn-prefix', "root://eoscms.cern.ch//eos/cms")

        # are we using the new stageout method ?
        useNewStageOutCode = False
        if getattr(self.step, 'newStageout', False) or \
            ('newStageOut' in overrides and overrides.get('newStageOut')):
            useNewStageOutCode = True

        try:
            castorStageOutMgr = StageOutMgr(**castorStageOutParams)
            eosStageOutMgr = StageOutMgr(**eosStageOutParams)
            stageInMgr = StageInMgr()
            deleteMgr = DeleteMgr()
        except Exception as ex:
            msg = "Unable to load StageOut/Delete Impl: %s" % str(ex)
            logging.error(msg)
            raise WMExecutionFailure(60312, "MgrImplementationError", msg)

        # prepare output tar file
        taskName = self.report.getTaskName().split('/')[-1]
        host = socket.gethostname().split('.')[0]
        tarName = '%s-%s-%s-%i-logs.tar' % (
            self.report.data.workload, taskName, host, self.job["counter"])
        tarLocation = os.path.join(self.stepSpace.location, tarName)

        # check if the cmsswVersion supports edmCopyUtil (min CMSSW_8_X)
        result = re.match("CMSSW_([0-9]+)_([0-9]+)_([0-9]+).*", cmsswVersion)
        useEdmCopyUtil = False
        if result:
            try:
                if int(result.group(1)) >= 8:
                    useEdmCopyUtil = True
            except ValueError:
                pass

        # setup Scram needed to run edmCopyUtil
        if useEdmCopyUtil:
            scram = Scram(
                command=scramCommand,
                version=cmsswVersion,
                initialise=self.step.application.setup.softwareEnvironment,
                directory=self.step.builder.workingDir,
                architecture=scramArch,
            )
            logging.info("Running scram")
            try:
                projectOutcome = scram.project()
            except Exception as ex:
                msg = "Exception raised while running scram.\n"
                msg += str(ex)
                logging.critical("Error running SCRAM")
                logging.critical(msg)
                raise WMExecutionFailure(50513, "ScramSetupFailure", msg)

            if projectOutcome > 0:
                msg = scram.diagnostic()
                logging.critical("Error running SCRAM")
                logging.critical(msg)
                raise WMExecutionFailure(50513, "ScramSetupFailure", msg)
            runtimeOutcome = scram.runtime()
            if runtimeOutcome > 0:
                msg = scram.diagnostic()
                logging.critical("Error running SCRAM")
                logging.critical(msg)
                raise WMExecutionFailure(50513, "ScramSetupFailure", msg)

        # iterate through input files
        localLogs = []
        deleteLogArchives = []
        if useEdmCopyUtil:
            numberOfFilesPerCopy = 10
        else:
            numberOfFilesPerCopy = 1
        for logs in grouper(self.job["input_files"], numberOfFilesPerCopy):

            copyCommand = "env X509_USER_PROXY=%s edmCopyUtil" % os.environ.get(
                'X509_USER_PROXY', None)
            for log in logs:
                copyCommand += " %s" % log['lfn']
            copyCommand += " %s" % self.step.builder.workingDir

            # give up after timeout of 1 minute per input file
            signal.signal(signal.SIGALRM, alarmHandler)
            signal.alarm(60 * numberOfFilesPerCopy)

            filesCopied = False
            try:
                if useEdmCopyUtil:
                    logging.info("Running edmCopyUtil")
                    retval = scram(copyCommand)
                    if retval == 0:
                        filesCopied = True
                else:
                    logging.info("Running stageIn")
                    for log in logs:
                        fileInfo = {"LFN": log['lfn']}
                        logArchive = stageInMgr(**fileInfo)
                        if logArchive:
                            filesCopied = True
            except Alarm:
                logging.error(
                    "Indefinite hang during edmCopyUtil/stageIn of logArchives"
                )
            except StageOutFailure:
                logging.error("Unable to stageIn logArchives")
            except Exception:
                raise

            signal.alarm(0)

            if filesCopied:
                for log in logs:
                    localLogs.append(
                        os.path.join(self.step.builder.workingDir,
                                     os.path.basename(log['lfn'])))
                    deleteLogArchives.append(log)
                    self.report.addInputFile(sourceName="logArchives",
                                             lfn=log['lfn'])
            else:
                logging.error("Unable to copy logArchives to local disk")
                if useEdmCopyUtil:
                    with open('scramOutput.log', 'r') as f:
                        logging.error("Scram output: %s", f.read())
                for log in logs:
                    self.report.addSkippedFile(log['lfn'], None)

        # create tarfile if any logArchive copied in
        if localLogs:
            tarFile = tarfile.open(tarLocation, 'w:')
            for log in localLogs:
                path = log.split('/')
                tarFile.add(name=log,
                            arcname=os.path.join(path[-3], path[-2], path[-1]))
                os.remove(log)
            tarFile.close()
        else:
            msg = "Unable to copy any logArchives to local disk"
            logging.error(msg)
            raise WMExecutionFailure(60312, "LogCollectError", msg)

        # now staging out the LogCollect tarfile
        logging.info("Staging out LogCollect tarfile to Castor and EOS")
        now = datetime.datetime.now()
        lfn = "/store/logs/prod/%i/%.2i/%s/%s/%s" % (
            now.year, now.month, "WMAgent", self.report.data.workload,
            os.path.basename(tarLocation))

        tarInfo = {'LFN': lfn, 'PFN': tarLocation, 'PNN': None, 'GUID': None}

        # perform mandatory stage out to CERN Castor
        signal.signal(signal.SIGALRM, alarmHandler)
        signal.alarm(waitTime)
        try:
            castorStageOutMgr(tarInfo)
        except Alarm:
            msg = "Indefinite hang during stageOut of LogCollect to Castor"
            logging.error(msg)
            raise WMExecutionFailure(60409, "LogCollectTimeout", msg)
        except Exception as ex:
            msg = "Unable to stageOut LogCollect to Castor:\n"
            msg += str(ex)
            logging.error(msg)
            raise WMExecutionFailure(60408, "LogCollectStageOutError", msg)
        signal.alarm(0)

        # add to job report
        self.report.addOutputFile(outputModule="LogCollect", file=tarInfo)
        outputRef = getattr(self.report.data, self.stepName)
        outputRef.output.pfn = tarInfo['PFN']
        outputRef.output.location = tarInfo['PNN']
        outputRef.output.lfn = tarInfo['LFN']

        tarInfo = {'LFN': lfn, 'PFN': tarLocation, 'PNN': None, 'GUID': None}

        # then, perform best effort stage out to CERN EOS
        signal.signal(signal.SIGALRM, alarmHandler)
        signal.alarm(waitTime)
        try:
            eosStageOutMgr(tarInfo)
        except Alarm:
            logging.error(
                "Indefinite hang during stageOut of LogCollect to EOS")
        except Exception as ex:
            logging.error("Unable to stageOut LogCollect to EOS:\n", ex)
        signal.alarm(0)

        # we got this far, delete input
        for log in deleteLogArchives:

            # give up after a timeout of 1 minute
            signal.signal(signal.SIGALRM, alarmHandler)
            signal.alarm(60)
            try:
                fileToDelete = {
                    'LFN': log['lfn'],
                    'PFN': None,
                    'PNN': None,
                    'StageOutCommand': None
                }
                deleteMgr(fileToDelete=fileToDelete)
            except Alarm:
                logging.error("Indefinite hang during delete of logArchive")
            except Exception as ex:
                logging.error("Unable to delete logArchive: %s", ex)
            signal.alarm(0)

        return
Example #16
    def execute(self, emulator = None):
        """
        _execute_

        """
        scramCommand = self.step.application.setup.scramCommand
        scramArch = self.step.application.setup.scramArch
        cmsswVersion = self.step.application.setup.cmsswVersion

        # Are we using emulators again?
        if emulator is not None:
            return emulator.emulate(self.step, self.job)

        overrides = {}
        if hasattr(self.step, 'override'):
            overrides = self.step.override.dictionary_()

        # Set wait to over an hour
        waitTime = overrides.get('waitTime', 3600 + (self.step.retryDelay * self.step.retryCount))

        # Pull out StageOutMgr Overrides
        # switch between old stageOut behavior and new, fancy stage out behavior
        useNewStageOutCode = False
        if 'newStageOut' in overrides and overrides.get('newStageOut'):
            useNewStageOutCode = True

        # hardcode CERN Castor T0_CH_CERN_MSS stageout parameters
        castorStageOutParams = {}
        castorStageOutParams['command'] = overrides.get('command', "srmv2-lcg")
        castorStageOutParams['option'] = overrides.get('option', "")
        castorStageOutParams['se-name'] = overrides.get('se-name', "srm-cms.cern.ch")
        castorStageOutParams['phedex-node'] = overrides.get('phedex-node', "T2_CH_CERN")
        castorStageOutParams['lfn-prefix'] = overrides.get('lfn-prefix', "srm://srm-cms.cern.ch:8443/srm/managerv2?SFN=/castor/cern.ch/cms")

        # hardcode CERN EOS T2_CH_CERN stageout parameters
        eosStageOutParams = {}
        eosStageOutParams['command'] = overrides.get('command', "srmv2-lcg")
        eosStageOutParams['option'] = overrides.get('option', "")
        eosStageOutParams['se-name'] = overrides.get('se-name', "srm-eoscms.cern.ch")
        eosStageOutParams['phedex-node'] = overrides.get('phedex-node', "T2_CH_CERN")
        eosStageOutParams['lfn-prefix'] = overrides.get('lfn-prefix', "srm://srm-eoscms.cern.ch:8443/srm/v2/server?SFN=/eos/cms")

        # are we using the new stageout method ?
        useNewStageOutCode = False
        if getattr(self.step, 'newStageout', False) or \
            ('newStageOut' in overrides and overrides.get('newStageOut')):
            useNewStageOutCode = True

        try:
            if useNewStageOutCode:
                # is this even working ???
                #logging.info("LOGCOLLECT IS USING NEW STAGEOUT CODE")
                #stageOutMgr = StageOutMgr(retryPauseTime  = self.step.retryDelay,
                #                          numberOfRetries = self.step.retryCount,
                #                          **overrides)
                #stageInMgr = StageInMgr(retryPauseTime  = 0,
                #                        numberOfRetries = 0)
                #deleteMgr = DeleteMgr(retryPauseTime  = 0,
                #                      numberOfRetries = 0)
                castorStageOutMgr = StageOutMgr(**castorStageOutParams)
                eosStageOutMgr = StageOutMgr(**eosStageOutParams)
                stageInMgr = StageInMgr()
                deleteMgr = DeleteMgr()
            else:
                castorStageOutMgr = StageOutMgr(**castorStageOutParams)
                eosStageOutMgr = StageOutMgr(**eosStageOutParams)
                stageInMgr = StageInMgr()
                deleteMgr = DeleteMgr()
        except Exception as ex:
            msg = "Unable to load StageOut/Delete Impl: %s" % str(ex)
            logging.error(msg)
            raise WMExecutionFailure(60312, "MgrImplementationError", msg)

        # prepare output tar file
        taskName = self.report.getTaskName().split('/')[-1]
        host = socket.gethostname().split('.')[0]
        tarName = '%s-%s-%s-%i-logs.tar' % (self.report.data.workload, taskName, host , self.job["counter"])
        tarLocation = os.path.join(self.stepSpace.location, tarName)

        # check if the cmsswVersion supports edmCopyUtil (min CMSSW_8_X)
        result = re.match("CMSSW_([0-9]+)_([0-9]+)_([0-9]+).*", cmsswVersion)
        useEdmCopyUtil = False
        if result:
            try:
                if int(result.group(1)) >= 8:
                    useEdmCopyUtil = True
            except ValueError:
                pass

        # setup Scram needed to run edmCopyUtil
        if useEdmCopyUtil:
            scram = Scram(
                command=scramCommand,
                version=cmsswVersion,
                initialise=self.step.application.setup.softwareEnvironment,
                directory=self.step.builder.workingDir,
                architecture=scramArch,
                )
            logging.info("Running scram")
            try:
                projectOutcome = scram.project()
            except Exception as ex:
                msg = "Exception raised while running scram.\n"
                msg += str(ex)
                logging.critical("Error running SCRAM")
                logging.critical(msg)
                raise WMExecutionFailure(50513, "ScramSetupFailure", msg)

            if projectOutcome > 0:
                msg = scram.diagnostic()
                logging.critical("Error running SCRAM")
                logging.critical(msg)
                raise WMExecutionFailure(50513, "ScramSetupFailure", msg)
            runtimeOutcome = scram.runtime()
            if runtimeOutcome > 0:
                msg = scram.diagnostic()
                logging.critical("Error running SCRAM")
                logging.critical(msg)
                raise WMExecutionFailure(50513, "ScramSetupFailure", msg)

        # iterate through input files
        localLogs = []
        deleteLogArchives = []
        if useEdmCopyUtil:
            numberOfFilesPerCopy = 10
        else:
            numberOfFilesPerCopy = 1
        for logs in grouper(self.job["input_files"], numberOfFilesPerCopy):

            copyCommand = "env X509_USER_PROXY=%s edmCopyUtil" % os.environ.get('X509_USER_PROXY', None)
            for log in logs:
                copyCommand += " %s" % log['lfn']
            copyCommand += " %s" % self.step.builder.workingDir

            # give up after timeout of 1 minute per input file
            signal.signal(signal.SIGALRM, alarmHandler)
            signal.alarm(60 * numberOfFilesPerCopy)

            filesCopied = False
            try:
                if useEdmCopyUtil:
                    logging.info("Running edmCopyUtil")
                    retval = scram(copyCommand)
                    if retval == 0:
                        filesCopied = True
                else:
                    logging.info("Running stageIn")
                    for log in logs:
                        fileInfo = {"LFN": log['lfn']}
                        logArchive = stageInMgr(**fileInfo)
                        if logArchive:
                            filesCopied = True
            except Alarm:
                logging.error("Indefinite hang during edmCopyUtil/stageIn of logArchives")
            except StageOutFailure:
                logging.error("Unable to stageIn logArchives")
            except Exception:
                raise

            signal.alarm(0)

            if filesCopied:
                for log in logs:
                    localLogs.append(os.path.join(self.step.builder.workingDir, os.path.basename(log['lfn'])))
                    deleteLogArchives.append(log)
                    self.report.addInputFile(sourceName = "logArchives", lfn = log['lfn'])
            else:
                logging.error("Unable to copy logArchives to local disk")
                if useEdmCopyUtil:
                    with open('scramOutput.log', 'r') as f:
                        logging.error("Scram output: %s", f.read())
                for log in logs:
                    self.report.addSkippedFile(log['lfn'], None)

        # create tarfile if any logArchive copied in
        if localLogs:
            tarFile = tarfile.open(tarLocation, 'w:')
            for log in localLogs:
                path = log.split('/')
                tarFile.add(name = log,
                            arcname = os.path.join(path[-3],
                                                   path[-2],
                                                   path[-1]))
                os.remove(log)
            tarFile.close()
        else:
            msg = "Unable to copy any logArchives to local disk"
            logging.error(msg)
            raise WMExecutionFailure(60312, "LogCollectError", msg)


        # now staging out the LogCollect tarfile
        logging.info("Staging out LogCollect tarfile to Castor and EOS")
        now = datetime.datetime.now()
        lfn = "/store/logs/prod/%i/%.2i/%s/%s/%s" % (now.year, now.month, "WMAgent",
                                                     self.report.data.workload,
                                                     os.path.basename(tarLocation))

        tarInfo = {'LFN'    : lfn,
                   'PFN'    : tarLocation,
                   'SEName' : None,
                   'PNN'    : None,
                   'GUID'   : None}

        # perform mandatory stage out to CERN Castor
        signal.signal(signal.SIGALRM, alarmHandler)
        signal.alarm(waitTime)
        try:
            castorStageOutMgr(tarInfo)
        except Alarm:
            msg = "Indefinite hang during stageOut of LogCollect to Castor"
            logging.error(msg)
            raise WMExecutionFailure(60409, "LogCollectTimeout", msg)
        except Exception as ex:
            msg = "Unable to stageOut LogCollect to Castor:\n"
            msg += str(ex)
            logging.error(msg)
            raise WMExecutionFailure(60408, "LogCollectStageOutError", msg)
        signal.alarm(0)

        # add to job report
        self.report.addOutputFile(outputModule = "LogCollect", file = tarInfo)
        outputRef = getattr(self.report.data, self.stepName)
        outputRef.output.pfn = tarInfo['PFN']
        outputRef.output.location = tarInfo['PNN']
        outputRef.output.lfn = tarInfo['LFN']


        tarInfo = {'LFN'    : lfn,
                   'PFN'    : tarLocation,
                   'SEName' : None,
                   'PNN'    : None,
                   'GUID'   : None}

        # then, perform best effort stage out to CERN EOS
        signal.signal(signal.SIGALRM, alarmHandler)
        signal.alarm(waitTime)
        try:
            eosStageOutMgr(tarInfo)
        except Alarm:
            logging.error("Indefinite hang during stageOut of LogCollect to EOS")
        except Exception as ex:
            logging.error("Unable to stageOut LogCollect to EOS:\n", ex)
        signal.alarm(0)

        # we got this far, delete input
        for log in deleteLogArchives:

            # give up after a timeout of 1 minute
            signal.signal(signal.SIGALRM, alarmHandler)
            signal.alarm(60)
            try:
                fileToDelete = {'LFN': log['lfn'],
                                'PFN': None,
                                'SEName': None,
                                'PNN': None,
                                'StageOutCommand': None}
                deleteMgr(fileToDelete = fileToDelete)
            except Alarm:
                logging.error("Indefinite hang during delete of logArchive")
            except Exception as ex:
                logging.error("Unable to delete logArchive: %s", ex)
            signal.alarm(0)

        return
Example #17
    def submit(self, jobs, info=None):
        """
        _submit_


        Submit jobs for one subscription
        """

        # If we're here, then we have submitter components
        self.scriptFile = self.config.JobSubmitter.submitScript
        self.submitDir = self.config.JobSubmitter.submitDir
        timeout = getattr(self.config.JobSubmitter, 'getTimeout', 400)

        successfulJobs = []
        failedJobs = []
        jdlFiles = []

        if len(jobs) == 0:
            # Then we have nothing to do
            return successfulJobs, failedJobs

        if len(self.pool) == 0:
            # Starting things up
            # This is obviously a submit API
            logging.info("Starting up PyCondorPlugin worker pool")
            self.inputQueue = multiprocessing.Queue()
            self.result = multiprocessing.Queue()
            for x in range(self.nProcess):
                p = multiprocessing.Process(target=submitWorker,
                                            args=(self.inputQueue, self.result, timeout))
                p.start()
                self.pool.append(p)

        if not os.path.exists(self.submitDir):
            os.makedirs(self.submitDir)

        # Submit the jobs
        nSubmits = 0
        queueError = False
        for jobsReady in grouper(jobs, self.jobsPerWorker):

            if queueError:
                # If the queue has failed, then we must not process any more jobs this cycle.
                break

            idList = [x['id'] for x in jobsReady]
            jdlList = self.makeSubmit(jobList=jobsReady)
            if not jdlList:
                # Then we got nothing
                logging.error("No JDL file made!")
                return {'NoResult': [0]}
            jdlFile = "%s/submit_%i_%i.jdl" % (self.submitDir, os.getpid(), idList[0])

            with open(jdlFile, 'w') as handle:
                handle.writelines(jdlList)
            jdlFiles.append(jdlFile)

            # Now submit them
            logging.info("About to submit %i jobs", len(jobsReady))
            if self.glexecPath:
                command = 'CS=`which condor_submit`; '
                if self.glexecWrapScript:
                    command += 'export GLEXEC_ENV=`%s 2>/dev/null`; ' % self.glexecWrapScript
                command += 'export GLEXEC_CLIENT_CERT=%s; ' % self.glexecProxyFile
                command += 'export GLEXEC_SOURCE_PROXY=%s; ' % self.glexecProxyFile
                command += 'export X509_USER_PROXY=%s; ' % self.glexecProxyFile
                command += 'export GLEXEC_TARGET_PROXY=%s; ' % self.jdlProxyFile
                if self.glexecUnwrapScript:
                    command += '%s %s -- $CS %s' % (self.glexecPath, self.glexecUnwrapScript, jdlFile)
                else:
                    command += '%s $CS %s' % (self.glexecPath, jdlFile)
            else:
                command = "condor_submit %s" % jdlFile

            try:
                self.inputQueue.put({'command': command, 'idList': idList})
            except AssertionError as ex:
                msg = "Critical error: input pipeline probably closed.\n"
                msg += str(ex)
                msg += "Error Procedure: Something critical has happened in the worker process\n"
                msg += "We will now proceed to pull all useful data from the queue (if it exists)\n"
                msg += "Then refresh the worker pool\n"
                logging.error(msg)
                queueError = True
                break
            nSubmits += 1

        # Now we should have sent all jobs to be submitted
        # Going to do the rest of it now
        for dummy in range(nSubmits):
            try:
                res = self.result.get(block=True, timeout=timeout)
            except Queue.Empty:
                # If the queue was empty go to the next submit
                # Those jobs have vanished
                logging.error("Queue.Empty error received!")
                logging.error("This could indicate a critical condor error!")
                logging.error("However, no information of any use was obtained due to process failure.")
                logging.error("Either process failed, or process timed out after %s seconds.", timeout)
                continue
            except AssertionError as ex:
                msg = "Found Assertion error while retrieving output from worker process.\n"
                msg += str(ex)
                msg += "This indicates something critical happened to a worker process"
                msg += "We will recover what jobs we know were submitted, and resubmit the rest"
                msg += "Refreshing worker pool at end of loop"
                logging.error(msg)
                continue

            try:
                dummyOut = res['stdout']
                error = res['stderr']
                idList = res['idList']
                exitCode = res['exitCode']
            except KeyError as ex:
                msg = "Error in finding key from result pipe\n"
                msg += "Something has gone critically wrong in the worker\n"
                try:
                    msg += "Result: %s\n" % str(res)
                except:
                    pass
                msg += str(ex)
                logging.error(msg)
                continue

            if not exitCode == 0:
                logging.error("Condor returned non-zero.  Printing out command stderr")
                logging.error(error)
                errorCheck, errorMsg = parseError(error=error)
                logging.error("Processing failed jobs and proceeding to the next jobs.")
                logging.error("Do not restart component.")
            else:
                errorCheck = None

            if errorCheck:
                self.errorCount += 1
                condorErrorReport = Report()
                condorErrorReport.addError("JobSubmit", 61202, "CondorError", errorMsg)
                for jobID in idList:
                    for job in jobs:
                        if job.get('id', None) == jobID:
                            job['fwjr'] = condorErrorReport
                            failedJobs.append(job)
                            break
            else:
                if self.errorCount > 0:
                    self.errorCount -= 1
                for jobID in idList:
                    for job in jobs:
                        if job.get('id', None) == jobID:
                            successfulJobs.append(job)
                            break

            # If we get a lot of errors in a row it's probably time to
            # report this to the operators.
            if self.errorCount > self.errorThreshold:
                try:
                    msg = "Exceeded errorThreshold while submitting to condor. Check condor status."
                    logging.error(msg)
                    logging.error("Reporting to Alert system and continuing to process jobs")
                    from WMCore.Alerts import API as alertAPI
                    preAlert, sender = alertAPI.setUpAlertsMessaging(self,
                                                                     compName="BossAirPyCondorPlugin")
                    sendAlert = alertAPI.getSendAlert(sender=sender,
                                                      preAlert=preAlert)
                    sendAlert(6, msg=msg)
                    sender.unregister()
                    self.errorCount = 0
                except:
                    # There's nothing we can really do here
                    pass

        # Remove JDL files unless commanded otherwise
        if self.deleteJDLFiles:
            for f in jdlFiles:
                os.remove(f)

        # When we're finished, clean up the queue workers in order
        # to free up memory (in the midst of the process, the forked
        # memory space shouldn't be touched, so it should still be
        # shared, but after this point any action by the Submitter will
        # result in memory duplication).
        logging.info("Purging worker pool to clean up memory")
        self.close()


        # We must return a list of jobs successfully submitted,
        # and a list of jobs failed
        logging.info("Done submitting jobs for this cycle in PyCondorPlugin")
        return successfulJobs, failedJobs
Example #18
    def listFilesInBlockWithParents(self,
                                    fileBlockName,
                                    lumis=True,
                                    validFileOnly=1):
        """
        _listFilesInBlockWithParents_

        Get a list of files in the named fileblock including
        the parents of that file.
        TODO: lumis can be False when lumi splitting is not required.
        However, WMBSHelper expects file['LumiList'] to get the run number,
        so for now it will always be True.

        """
        if not self.blockExists(fileBlockName):
            msg = "DBSReader.listFilesInBlockWithParents(%s): No matching data"
            raise DBSReaderError(msg % fileBlockName)

        try:
            # TODO: should we get only valid blocks for this?
            files = self.dbs.listFileParents(block_name=fileBlockName)
            fileDetails = self.listFilesInBlock(fileBlockName, lumis,
                                                validFileOnly)

        except dbsClientException as ex:
            msg = "Error in "
            msg += "DBSReader.listFilesInBlockWithParents(%s)\n" % (
                fileBlockName, )
            msg += "%s\n" % formatEx3(ex)
            raise DBSReaderError(msg)

        childByParents = defaultdict(list)
        for f in files:
            # Probably a child can have more than 1 parent file
            for fp in f['parent_logical_file_name']:
                childByParents[fp].append(f['logical_file_name'])
        parentsLFNs = childByParents.keys()

        parentFilesDetail = []
        # TODO: slicing parentLFNs until the DBS API handles that.
        # Remove the slicing once the DBS API does.
        for pLFNs in grouper(parentsLFNs, 50):
            parentFilesDetail.extend(
                self.dbs.listFileArray(logical_file_name=pLFNs, detail=True))

        if lumis:
            parentLumis = self._getLumiList(lfns=parentsLFNs)

        parentsByLFN = defaultdict(list)

        for pf in parentFilesDetail:
            parentLFN = pf['logical_file_name']
            dbsFile = remapDBS3Keys(pf, stringify=True)
            if lumis:
                dbsFile["LumiList"] = parentLumis[parentLFN]

            for childLFN in childByParents[parentLFN]:
                parentsByLFN[childLFN].append(dbsFile)

        for fileInfo in fileDetails:
            fileInfo["ParentList"] = parentsByLFN[
                fileInfo['logical_file_name']]

        return fileDetails
Exemplo n.º 19
0
    def submit(self, jobs, info=None):
        """
        _submit_


        Submit jobs for one subscription
        """

        # If we're here, then we have submitter components
        self.scriptFile = self.config.JobSubmitter.submitScript
        self.submitDir = self.config.JobSubmitter.submitDir
        timeout = getattr(self.config.JobSubmitter, 'getTimeout', 400)

        successfulJobs = []
        failedJobs = []
        jdlFiles = []

        if len(jobs) == 0:
            # Then we have nothing to do
            return successfulJobs, failedJobs

        if len(self.pool) == 0:
            # Starting things up
            # This is obviously a submit API
            logging.info("Starting up PyCondorPlugin worker pool")
            self.inputQueue = multiprocessing.Queue()
            self.result = multiprocessing.Queue()
            for x in range(self.nProcess):
                p = multiprocessing.Process(target=submitWorker,
                                            args=(self.inputQueue, self.result,
                                                  timeout))
                p.start()
                self.pool.append(p)

        if not os.path.exists(self.submitDir):
            os.makedirs(self.submitDir)

        # Submit the jobs
        nSubmits = 0
        queueError = False
        for jobsReady in grouper(jobs, self.jobsPerWorker):

            if queueError:
                # If the queue has failed, then we must not process any more jobs this cycle.
                break

            idList = [x['id'] for x in jobsReady]
            jdlList = self.makeSubmit(jobList=jobsReady)
            if not jdlList:
                # Then we got nothing
                logging.error("No JDL file made!")
                return {'NoResult': [0]}
            jdlFile = "%s/submit_%i_%i.jdl" % (self.submitDir, os.getpid(),
                                               idList[0])

            with open(jdlFile, 'w') as handle:
                handle.writelines(jdlList)
            jdlFiles.append(jdlFile)

            # Now submit them
            logging.info("About to submit %i jobs", len(jobsReady))
            if self.glexecPath:
                command = 'CS=`which condor_submit`; '
                if self.glexecWrapScript:
                    command += 'export GLEXEC_ENV=`%s 2>/dev/null`; ' % self.glexecWrapScript
                command += 'export GLEXEC_CLIENT_CERT=%s; ' % self.glexecProxyFile
                command += 'export GLEXEC_SOURCE_PROXY=%s; ' % self.glexecProxyFile
                command += 'export X509_USER_PROXY=%s; ' % self.glexecProxyFile
                command += 'export GLEXEC_TARGET_PROXY=%s; ' % self.jdlProxyFile
                if self.glexecUnwrapScript:
                    command += '%s %s -- $CS %s' % (
                        self.glexecPath, self.glexecUnwrapScript, jdlFile)
                else:
                    command += '%s $CS %s' % (self.glexecPath, jdlFile)
            else:
                command = "condor_submit %s" % jdlFile

            try:
                self.inputQueue.put({'command': command, 'idList': idList})
            except AssertionError as ex:
                msg = "Critical error: input pipeline probably closed.\n"
                msg += str(ex)
                msg += "Error Procedure: Something critical has happened in the worker process\n"
                msg += "We will now proceed to pull all useful data from the queue (if it exists)\n"
                msg += "Then refresh the worker pool\n"
                logging.error(msg)
                queueError = True
                break
            nSubmits += 1

        # Now we should have sent all jobs to be submitted
        # Going to do the rest of it now
        for dummy in range(nSubmits):
            try:
                res = self.result.get(block=True, timeout=timeout)
            except Queue.Empty:
                # If the queue was empty go to the next submit
                # Those jobs have vanished
                logging.error("Queue.Empty error received!")
                logging.error("This could indicate a critical condor error!")
                logging.error(
                    "However, no information of any use was obtained due to process failure."
                )
                logging.error(
                    "Either process failed, or process timed out after %s seconds.",
                    timeout)
                continue
            except AssertionError as ex:
                msg = "Found Assertion error while retrieving output from worker process.\n"
                msg += str(ex)
                msg += "This indicates something critical happened to a worker process"
                msg += "We will recover what jobs we know were submitted, and resubmit the rest"
                msg += "Refreshing worker pool at end of loop"
                logging.error(msg)
                continue

            try:
                dummyOut = res['stdout']
                error = res['stderr']
                idList = res['idList']
                exitCode = res['exitCode']
            except KeyError as ex:
                msg = "Error in finding key from result pipe\n"
                msg += "Something has gone critically wrong in the worker\n"
                try:
                    msg += "Result: %s\n" % str(res)
                except:
                    pass
                msg += str(ex)
                logging.error(msg)
                continue

            if not exitCode == 0:
                logging.error(
                    "Condor returned non-zero.  Printing out command stderr")
                logging.error(error)
                errorCheck, errorMsg = parseError(error=error)
                logging.error(
                    "Processing failed jobs and proceeding to the next jobs.")
                logging.error("Do not restart component.")
            else:
                errorCheck = None

            if errorCheck:
                self.errorCount += 1
                condorErrorReport = Report()
                condorErrorReport.addError("JobSubmit", 61202, "CondorError",
                                           errorMsg)
                for jobID in idList:
                    for job in jobs:
                        if job.get('id', None) == jobID:
                            job['fwjr'] = condorErrorReport
                            failedJobs.append(job)
                            break
            else:
                if self.errorCount > 0:
                    self.errorCount -= 1
                for jobID in idList:
                    for job in jobs:
                        if job.get('id', None) == jobID:
                            successfulJobs.append(job)
                            break

            # If we get a lot of errors in a row it's probably time to
            # stop the component
            if self.errorCount > self.errorThreshold:
                msg = "Exceeded errorThreshold while submitting to condor. Check condor status."
                logging.error(msg)
                raise BossAirPluginException(msg)

        # Remove JDL files unless commanded otherwise
        if self.deleteJDLFiles:
            for f in jdlFiles:
                os.remove(f)

        # When we're finished, clean up the queue workers in order
        # to free up memory (in the midst of the process, the forked
        # memory space shouldn't be touched, so it should still be
        # shared, but after this point any action by the Submitter will
        # result in memory duplication).
        logging.info("Purging worker pool to clean up memory")
        self.close()

        # We must return a list of jobs successfully submitted,
        # and a list of jobs failed
        logging.info("Done submitting jobs for this cycle in PyCondorPlugin")
        return successfulJobs, failedJobs