Exemplo n.º 1
0
    def testGrouper(self):
        """
        Test the grouper function (returns chunk of an iterable)
        """

        listChunks = [i for i in grouper(list(range(0, 7)), 3)]  # Want list(range) for python 3
        iterChunks = [i for i in grouper(xrange(0, 7), 3)]  # xrange becomes range in python 3

        for a, b in itertools.izip_longest(listChunks, iterChunks):
            self.assertEqual(a, b)

        self.assertEqual(listChunks[-1], [6])
Exemplo n.º 2
0
    def archiveJobs(self):
        """
        _archiveJobs_

        archiveJobs will handle the master task of looking for finished jobs,
        and running the code that cleans them out.
        """
        doneList = self.findFinishedJobs()
        logging.info("Found %i finished jobs to archive", len(doneList))

        jobCounter = 0
        for slicedList in grouper(doneList, 10000):
            self.cleanWorkArea(slicedList)

            successList = []
            failList = []
            killList = []
            for job in slicedList:
                if job["outcome"] == "success":
                    successList.append(job)
                elif job["outcome"] == "killed":
                    killList.append(job)
                else:
                    failList.append(job)

            myThread = threading.currentThread()
            myThread.transaction.begin()
            self.changeState.propagate(successList, "cleanout", "success")
            self.changeState.propagate(failList, "cleanout", "exhausted")
            self.changeState.propagate(killList, "cleanout", "killed")
            myThread.transaction.commit()

            jobCounter += len(slicedList)
            logging.info("Successfully archived %d jobs out of %d.", jobCounter, len(doneList))
Exemplo n.º 3
0
def main():
    # FIXME update the workflow name here
    wf = "mcremone_task_EXO-RunIISummer15wmLHEGS-04802__v1_T_170811_181808_305"
    print("Looking for problematic inbox elements...")

    wq = WorkQueue("https://cmsweb.cern.ch/couchdb/workqueue")
    print("Workqueue config: server %s and db %s" % (wq.server.url, wq.db.name))

    nonCancelableElements = ['Done', 'Canceled', 'Failed']
    data = wq.db.loadView('WorkQueue', 'elementsDetailByWorkflowAndStatus',
                          {'startkey': [wf], 'endkey': [wf, {}], 'reduce': False})

    elements = [x['id'] for x in data.get('rows', []) if x['key'][1] not in nonCancelableElements]
    print("Found %d elements for wf %s" % (len(elements), wf))
    total = 0
    for eleSlice in grouper(elements, 100):
        try:
            wq.updateElements(*eleSlice, Status='CancelRequested')
        except Exception as ex:
            print("Exception happened, but keep going: %s" % str(ex))
        else:
            total += 100
            print("Elements updated: %s" % total)

    print("Done!")

    sys.exit(0)
Exemplo n.º 4
0
    def algorithm(self, parameters):
        """
        get information from wmbs, workqueue and local couch
        """
        try:
            data = self.fwjrAPI.getFWJRByArchiveStatus('ready', limit=self.numDocsRetrievePerPolling)['rows']
            logging.info("Found %i not archived documents from FWRJ db to upload to WMArchive.", len(data))

            for slicedData in grouper(data, self.numDocsUploadPerCall):
                jobIDs = []
                archiveDocs = []
                for job in slicedData:
                    doc = createArchiverDoc(job)
                    archiveDocs.append(doc)
                    jobIDs.append(job["id"])

                response = self.wmarchiver.archiveData(archiveDocs)

                # Partial success is not allowed either all the insert is successful or none is
                if response[0]['status'] == "ok" and len(response[0]['ids']) == len(jobIDs):
                    archiveIDs = response[0]['ids']
                    for docID in jobIDs:
                        self.fwjrAPI.updateArchiveUploadedStatus(docID)
                    logging.info("...successfully uploaded %d docs", len(jobIDs))
                    logging.debug("JobIDs uploaded: %s", jobIDs)
                    logging.debug("Archived IDs returned: %s", archiveIDs)
                else:
                    logging.warning("Upload failed and it will be retried in the next cycle: %s: %s.",
                                    response[0]['status'], response[0]['reason'])
                    logging.debug("failed JobIds %s", jobIDs)
        except Exception as ex:
            logging.error("Error occurred, will retry later:")
            logging.error(str(ex))
            logging.error("Trace back: \n%s", traceback.format_exc())
Exemplo n.º 5
0
def main():
    # FIXME update the workflow name here
    wf = "mcremone_task_EXO-RunIISummer15wmLHEGS-04802__v1_T_170811_181808_305"
    print("Looking for problematic inbox elements...")

    wq = WorkQueue("https://cmsweb.cern.ch/couchdb/workqueue")
    print("Workqueue config: server %s and db %s" %
          (wq.server.url, wq.db.name))

    nonCancelableElements = ['Done', 'Canceled', 'Failed']
    data = wq.db.loadView('WorkQueue', 'elementsDetailByWorkflowAndStatus', {
        'startkey': [wf],
        'endkey': [wf, {}],
        'reduce': False
    })

    elements = [
        x['id'] for x in data.get('rows', [])
        if x['key'][1] not in nonCancelableElements
    ]
    print("Found %d elements for wf %s" % (len(elements), wf))
    total = 0
    for eleSlice in grouper(elements, 100):
        try:
            wq.updateElements(*eleSlice, Status='CancelRequested')
        except Exception as ex:
            print("Exception happened, but keep going: %s" % str(ex))
        else:
            total += 100
            print("Elements updated: %s" % total)

    print("Done!")

    sys.exit(0)
Exemplo n.º 6
0
 def _getLatestJobInfo(self, keys):
     """
     Given a list of lists as keys, in the format of:
         [['request_name', 'agent_url'], ['request_name2', 'agent_url2'], ....]
     The result format from the latestRequest view is:
         {u'offset': 527,
          u'rows': [{u'doc': {u'_rev': u'32-6027014210',
          ...
                     u'id': u'cmsgwms-submit6.fnal.gov-cmsunified_ACDC0_task_BTV-RunIISummer19UL18wmLHEGEN-00004__v1_T_200507_162125_3670',
                     u'key': [u'cmsunified_ACDC0_task_BTV-RunIISummer19UL18wmLHEGEN-00004__v1_T_200507_162125_3670',
                              u'cmsgwms-submit6.fnal.gov'],
                     u'value': None}],
          u'total_rows': 49606}
     """
     if not keys:
         return []
     options = {}
     options["include_docs"] = True
     options["reduce"] = False
     finalResults = {}
     # magic number: 5000 keys (need to check which number is optimal)
     for sliceKeys in grouper(keys, 5000):
         self.logger.info("Querying latestRequest with %d keys",
                          len(sliceKeys))
         result = self._getCouchView("latestRequest", options, sliceKeys)
         if not finalResults and result:
             finalResults = result
         elif result.get('rows'):
             finalResults['rows'].extend(result['rows'])
     return finalResults
Exemplo n.º 7
0
 def execute(self, file=None, runs=None, conn=None, transaction=False):
     for sliceBinds in grouper(self.getBinds(file, runs), 10000):
         result = self.dbi.processData(self.sql,
                                       sliceBinds,
                                       conn=conn,
                                       transaction=transaction)
     return self.format(result)
Exemplo n.º 8
0
def getBlocksByDsetAndRun(datasetName, runList, dbsUrl):
    """
    Given a dataset name and a list of runs, find all the blocks
    :return: flat list of blocks
    """
    blocks = set()
    if isinstance(runList, set):
        runList = list(runList)

    urls = []
    for runSlice in grouper(runList, 50):
        urls.append('%s/blocks?run_num=%s&dataset=%s' %
                    (dbsUrl, str(runSlice).replace(" ", ""), datasetName))
    logging.info(
        "Executing %d requests against DBS 'blocks' API, with run_num list",
        len(urls))
    data = multi_getdata(urls, ckey(), cert())

    for row in data:
        dataset = row['url'].rsplit('=')[-1]
        if row['data'] is None:
            msg = "Failure in getBlocksByDsetAndRun for %s. Error: %s %s" % (
                dataset, row.get('code'), row.get('error'))
            raise RuntimeError(msg)
        rows = json.loads(row['data'])
        for item in rows:
            blocks.add(item['block_name'])

    return list(blocks)
Exemplo n.º 9
0
    def archiveJobs(self):
        """
        _archiveJobs_

        archiveJobs will handle the master task of looking for finished jobs,
        and running the code that cleans them out.
        """
        doneList = self.findFinishedJobs()
        logging.info("Found %i finished jobs to archive", len(doneList))

        jobCounter = 0
        for slicedList in grouper(doneList, 10000):
            self.cleanWorkArea(slicedList)

            successList = []
            failList = []
            killList = []
            for job in slicedList:
                if job["outcome"] == "success":
                    successList.append(job)
                elif job["outcome"] == "killed":
                    killList.append(job)
                else:
                    failList.append(job)

            myThread = threading.currentThread()
            myThread.transaction.begin()
            self.changeState.propagate(successList, "cleanout", "success")
            self.changeState.propagate(failList, "cleanout", "exhausted")
            self.changeState.propagate(killList, "cleanout", "killed")
            myThread.transaction.commit()

            jobCounter += len(slicedList)
            logging.info("Successfully archived %d jobs out of %d.",
                         jobCounter, len(doneList))
Exemplo n.º 10
0
    def _getLumiList(self, blockName=None, lfns=None, validFileOnly=1):
        """
        currently only take one lfn but dbs api need be updated
        """
        try:
            if blockName:
                lumiLists = self.dbs.listFileLumis(block_name=blockName,
                                                   validFileOnly=validFileOnly)
            elif lfns:
                lumiLists = []
                for slfn in grouper(lfns, 50):
                    lumiLists.extend(
                        self.dbs.listFileLumiArray(logical_file_name=slfn))
            else:
                # shouldn't call this with both blockName and lfns empty
                # but still returns empty dict for that case
                return {}
        except dbsClientException as ex:
            msg = "Error in "
            msg += "DBSReader.listFileLumiArray(%s)\n" % lfns
            msg += "%s\n" % formatEx3(ex)
            raise DBSReaderError(msg)

        lumiDict = {}
        for lumisItem in lumiLists:
            lumiDict.setdefault(lumisItem['logical_file_name'], [])
            item = {}
            item["RunNumber"] = lumisItem['run_num']
            item['LumiSectionNumber'] = lumisItem['lumi_section_num']
            if lumisItem.get('event_count', None) is not None:
                item['EventCount'] = lumisItem['event_count']
            lumiDict[lumisItem['logical_file_name']].append(item)
            # TODO: add key for lumi and event pair.
        return lumiDict
Exemplo n.º 11
0
    def _getLumiList(self, blockName=None, lfns=None, validFileOnly=1):
        """
        currently only take one lfn but dbs api need be updated
        """
        try:
            if blockName:
                lumiLists = self.dbs.listFileLumis(block_name=blockName, validFileOnly=validFileOnly)
            elif lfns:
                lumiLists = []
                for slfn in grouper(lfns, 50):
                    lumiLists.extend(self.dbs.listFileLumiArray(logical_file_name = slfn))
            else:
                # shouldn't call this with both blockName and lfns empty
                # but still returns empty dict for that case
                return {}
        except dbsClientException as ex:
            msg = "Error in "
            msg += "DBSReader.listFileLumiArray(%s)\n" % lfns
            msg += "%s\n" % formatEx3(ex)
            raise DBSReaderError(msg)

        lumiDict = {}
        for lumisItem in lumiLists:
            lumiDict.setdefault(lumisItem['logical_file_name'], [])
            item = {}
            item["RunNumber"] = lumisItem['run_num']
            item['LumiSectionNumber'] = lumisItem['lumi_section_num']
            lumiDict[lumisItem['logical_file_name']].append(item)
        return lumiDict
Exemplo n.º 12
0
    def execute(self):
        """
        Executes the whole transferor logic
        :param reqStatus: request status to that matters for this module
        :return:
        """
        if not self.prep():
            self.logger.warning(
                "Failed to fetch the latest unified config. Skipping this cycle"
            )
            return
        self.uConfig = self.uConfig[0]

        requestRecords = []
        try:
            # get requests from ReqMgr2 data-service for given status
            requests = self.reqmgr2.getRequestByStatus([self.status],
                                                       detail=True)
            if requests:
                requests = requests[0]
            self.logger.info("### transferor found %s requests in '%s' state",
                             len(requests), self.status)
            if requests:
                for _, wfData in requests.iteritems():
                    requestRecords.append(self.requestRecord(wfData))
        except Exception as err:
            self.logger.exception('### transferor error: %s', str(err))

        if not requestRecords:
            return

        try:
            reqInfo = RequestInfo(self.msConfig, self.uConfig, self.logger)
            for reqSlice in grouper(requestRecords, 50):
                reqResults = reqInfo(reqSlice)
                self.logger.info("%d requests completely processed.",
                                 len(reqResults))
                self.logger.info(
                    "Working on the data subscription and status change...")
                # process all requests
                for req in reqResults:
                    reqName = req['name']
                    # perform transfer
                    tid = self.transferRequest(req)
                    if tid:
                        # Once all transfer requests were successfully made, update: assigned -> staging
                        self.logger.debug(
                            "### transfer request for %s successfull", reqName)
                        self.change(req, 'staging', '### transferor')
                        # if there is nothing to be transferred (no input at all),
                        # then update the request status once again staging -> staged
                        # self.change(req, 'staged', '### transferor')
        except Exception as err:  # general error
            self.logger.exception('### transferor error: %s', str(err))
Exemplo n.º 13
0
    def submit(self, jobs, info=None):
        """
        _submit_

        Submits jobs to the condor queue
        """
        successfulJobs = []
        failedJobs = []

        if len(jobs) == 0:
            # Then was have nothing to do
            return successfulJobs, failedJobs

        schedd = htcondor.Schedd()

        # Submit the jobs
        for jobsReady in grouper(jobs, self.jobsPerSubmit):

            clusterAd = self.getClusterAd()
            procAds = self.getProcAds(jobsReady)

            logging.debug(
                "Start: Submitting %d jobs using Condor Python SubmitMany",
                len(procAds))
            try:
                # 4th argument has to be None otherwise HTCondor leaks the result ads
                # through it (as of 8.7.x). More info in WMCore/#8729
                clusterId = schedd.submitMany(clusterAd, procAds, False, None)
            except Exception as ex:
                logging.error("SimpleCondorPlugin job submission failed.")
                logging.exception(str(ex))
                logging.error(
                    "Moving on the the next batch of jobs and/or cycle....")

                condorErrorReport = Report()
                condorErrorReport.addError("JobSubmit", 61202, "CondorError",
                                           str(ex))
                for job in jobsReady:
                    job['fwjr'] = condorErrorReport
                    failedJobs.append(job)
            else:
                logging.debug(
                    "Job submission to condor succeeded, clusterId is %s",
                    clusterId)
                for index, job in enumerate(jobsReady):
                    job['gridid'] = "%s.%s" % (clusterId, index)
                    job['status'] = 'Idle'
                    successfulJobs.append(job)

        # We must return a list of jobs successfully submitted and a list of jobs failed
        logging.info(
            "Done submitting jobs for this cycle in SimpleCondorPlugin")
        return successfulJobs, failedJobs
Exemplo n.º 14
0
    def submit(self, jobs, info=None):
        """
        _submit_

        Submits jobs to the condor queue
        """
        successfulJobs = []
        failedJobs = []

        if len(jobs) == 0:
            # Then was have nothing to do
            return successfulJobs, failedJobs

        schedd = htcondor.Schedd()

        # Submit the jobs
        for jobsReady in grouper(jobs, self.jobsPerSubmit):

            (sub, jobParams) = self.createSubmitRequest(jobsReady)

            logging.debug(
                "Start: Submitting %d jobs using Condor Python Submit",
                len(jobParams))
            try:
                with schedd.transaction() as txn:
                    submitRes = sub.queue_with_itemdata(
                        txn, 1, iter(jobParams))
                    clusterId = submitRes.cluster()
            except Exception as ex:
                logging.error("SimpleCondorPlugin job submission failed.")
                logging.exception(str(ex))
                logging.error(
                    "Moving on the the next batch of jobs and/or cycle....")

                condorErrorReport = Report()
                condorErrorReport.addError("JobSubmit", 61202, "CondorError",
                                           str(ex))
                for job in jobsReady:
                    job['fwjr'] = condorErrorReport
                    failedJobs.append(job)
            else:
                logging.debug(
                    "Job submission to condor succeeded, clusterId is %s",
                    clusterId)
                for index, job in enumerate(jobsReady):
                    job['gridid'] = "%s.%s" % (clusterId, index)
                    job['status'] = 'Idle'
                    successfulJobs.append(job)

        # We must return a list of jobs successfully submitted and a list of jobs failed
        logging.info(
            "Done submitting jobs for this cycle in SimpleCondorPlugin")
        return successfulJobs, failedJobs
Exemplo n.º 15
0
    def handleErrors(self):
        """
        Queries DB for all watched filesets, if matching filesets become
        available, create the subscriptions
        """
        # Run over created, submitted and executed job failures
        failure_states = ['create', 'submit', 'job']
        for state in failure_states:
            idList = self.getJobs.execute(state="%sfailed" % state)
            logging.info("Found %d failed jobs in state %sfailed", len(idList), state)
            for jobSlice in grouper(idList, self.maxProcessSize):
                jobList = self.loadJobsFromList(jobSlice)
                self.handleFailedJobs(jobList, state)

        # Run over jobs done with retries
        idList = self.getJobs.execute(state='retrydone')
        logging.info("Found %d jobs done with all retries", len(idList))
        for jobSlice in grouper(idList, self.maxProcessSize):
            jobList = self.loadJobsFromList(jobSlice)
            self.handleRetryDoneJobs(jobList)

        return
Exemplo n.º 16
0
    def submit(self, jobs, info=None):
        """
        _submit_


        Submit jobs for one subscription
        """
        successfulJobs = []
        failedJobs = []

        if len(jobs) == 0:
            # Then was have nothing to do
            return successfulJobs, failedJobs

        schedd = htcondor.Schedd()

        # Submit the jobs
        for jobsReady in grouper(jobs, self.jobsPerSubmit):

            clusterAd = self.getClusterAd()
            procAds = self.getProcAds(jobsReady)

            logging.debug(
                "Start: Submitting %d jobs using Condor Python SubmitMany",
                len(procAds))
            try:
                clusterId = schedd.submitMany(clusterAd, procAds)
            except Exception as ex:
                logging.error("SimpleCondorPlugin job submission failed.")
                logging.error(
                    "Moving on the the next batch of jobs and/or cycle....")
                logging.exception(ex)

                condorErrorReport = Report()
                condorErrorReport.addError("JobSubmit", 61202, "CondorError",
                                           str(ex))
                for job in jobsReady:
                    job['fwjr'] = condorErrorReport
                    failedJobs.append(job)
            else:
                logging.debug(
                    "Finish: Submitting jobs using Condor Python SubmitMany")
                for index, job in enumerate(jobsReady):
                    job['gridid'] = "%s.%s" % (clusterId, index)
                    job['status'] = 'Idle'
                    successfulJobs.append(job)

        # We must return a list of jobs successfully submitted and a list of jobs failed
        logging.info(
            "Done submitting jobs for this cycle in SimpleCondorPlugin")
        return successfulJobs, failedJobs
Exemplo n.º 17
0
    def handleErrors(self):
        """
        Queries DB for all watched filesets, if matching filesets become
        available, create the subscriptions
        """
        # Run over created, submitted and executed job failures
        failure_states = ['create', 'submit', 'job']
        for state in failure_states:
            idList = self.getJobs.execute(state="%sfailed" % state)
            logging.info("Found %d failed jobs in state %sfailed", len(idList),
                         state)
            for jobSlice in grouper(idList, self.maxProcessSize):
                jobList = self.loadJobsFromList(jobSlice)
                self.handleFailedJobs(jobList, state)

        # Run over jobs done with retries
        idList = self.getJobs.execute(state='retrydone')
        logging.info("Found %d jobs done with all retries", len(idList))
        for jobSlice in grouper(idList, self.maxProcessSize):
            jobList = self.loadJobsFromList(jobSlice)
            self.handleRetryDoneJobs(jobList)

        return
Exemplo n.º 18
0
    def submit(self, jobs, info=None):
        """
        _submit_

        Submits jobs to the condor queue
        """
        successfulJobs = []
        failedJobs = []

        if len(jobs) == 0:
            # Then was have nothing to do
            return successfulJobs, failedJobs

        schedd = htcondor.Schedd()

        # Submit the jobs
        for jobsReady in grouper(jobs, self.jobsPerSubmit):

            clusterAd = self.getClusterAd()
            procAds = self.getProcAds(jobsReady)

            logging.debug("Start: Submitting %d jobs using Condor Python SubmitMany", len(procAds))
            try:
                # 4th argument has to be None otherwise HTCondor leaks the result ads
                # through it (as of 8.7.x). More info in WMCore/#8729
                clusterId = schedd.submitMany(clusterAd, procAds, False, None)
            except Exception as ex:
                logging.error("SimpleCondorPlugin job submission failed.")
                logging.exception(str(ex))
                logging.error("Moving on the the next batch of jobs and/or cycle....")

                condorErrorReport = Report()
                condorErrorReport.addError("JobSubmit", 61202, "CondorError", str(ex))
                for job in jobsReady:
                    job['fwjr'] = condorErrorReport
                    failedJobs.append(job)
            else:
                logging.debug("Job submission to condor succeeded, clusterId is %s", clusterId)
                for index, job in enumerate(jobsReady):
                    job['gridid'] = "%s.%s" % (clusterId, index)
                    job['status'] = 'Idle'
                    successfulJobs.append(job)

        # We must return a list of jobs successfully submitted and a list of jobs failed
        logging.info("Done submitting jobs for this cycle in SimpleCondorPlugin")
        return successfulJobs, failedJobs
Exemplo n.º 19
0
def createDirectories(dirList):
    """
    Create the directory if everything is sane

    """
    for sdirList in grouper(dirList, 500):
        cmdArgs = ['mkdir']
        cmdArgs.extend(sdirList)
        pipe = Popen(cmdArgs, stdout = PIPE, stderr = PIPE, shell = False)
        stdout, stderr = pipe.communicate()
        if not stderr == "":
            msg = "Error in making directories: %s\n" % stderr
            logging.error(msg)
            logging.debug("Executing command %s\n" % cmdArgs)
            raise CreateWorkAreaException(msg)

    return
Exemplo n.º 20
0
    def submit(self, jobs, info=None):
        """
        _submit_


        Submit jobs for one subscription
        """
        successfulJobs = []
        failedJobs = []

        if len(jobs) == 0:
            # Then was have nothing to do
            return successfulJobs, failedJobs

        schedd = htcondor.Schedd()

        # Submit the jobs
        for jobsReady in grouper(jobs, self.jobsPerSubmit):

            clusterAd = self.getClusterAd()
            procAds = self.getProcAds(jobsReady)

            logging.debug("Start: Submitting %d jobs using Condor Python SubmitMany", len(procAds))
            try:
                clusterId = schedd.submitMany(clusterAd, procAds)
            except Exception as ex:
                logging.error("SimpleCondorPlugin job submission failed.")
                logging.error("Moving on the the next batch of jobs and/or cycle....")
                logging.exception(ex)

                condorErrorReport = Report()
                condorErrorReport.addError("JobSubmit", 61202, "CondorError", str(ex))
                for job in jobsReady:
                    job['fwjr'] = condorErrorReport
                    failedJobs.append(job)
            else:
                logging.debug("Finish: Submitting jobs using Condor Python SubmitMany")
                for index, job in enumerate(jobsReady):
                    job['gridid'] = "%s.%s" % (clusterId, index)
                    job['status'] = 'Idle'
                    successfulJobs.append(job)

        # We must return a list of jobs successfully submitted and a list of jobs failed
        logging.info("Done submitting jobs for this cycle in SimpleCondorPlugin")
        return successfulJobs, failedJobs
Exemplo n.º 21
0
def createDirectories(dirList):
    """
    Create the directory if everything is sane

    """
    for sdirList in grouper(dirList, 500):
        cmdArgs = ['mkdir']
        cmdArgs.extend(sdirList)
        pipe = Popen(cmdArgs, stdout=PIPE, stderr=PIPE, shell=False)
        stdout, stderr = pipe.communicate()
        if stderr:
            if hasattr(stderr, "decode"):
                stderr = stderr.decode('utf-8', 'ignore')
            msg = "Error in creating directories: %s\n" % stderr
            logging.error(msg)
            raise CreateWorkAreaException(msg)

    return
Exemplo n.º 22
0
    def algorithm(self, parameters):
        """
        get information from wmbs, workqueue and local couch
        """
        try:
            data = self.fwjrAPI.getFWJRByArchiveStatus(
                'ready', limit=self.numDocsRetrievePerPolling)['rows']
            logging.info(
                "Found %i not archived documents from FWRJ db to upload to WMArchive.",
                len(data))

            for slicedData in grouper(data, self.numDocsUploadPerCall):
                jobIDs = []
                archiveDocs = []
                for job in slicedData:
                    doc = createArchiverDoc(job)
                    archiveDocs.append(doc)
                    jobIDs.append(job["id"])

                response = self.wmarchiver.archiveData(archiveDocs)

                # Partial success is not allowed either all the insert is successful or none is
                if response[0]['status'] == "ok" and len(
                        response[0]['ids']) == len(jobIDs):
                    archiveIDs = response[0]['ids']
                    for docID in jobIDs:
                        self.fwjrAPI.updateArchiveUploadedStatus(docID)
                    logging.info("...successfully uploaded %d docs",
                                 len(jobIDs))
                    logging.debug("JobIDs uploaded: %s", jobIDs)
                    logging.debug("Archived IDs returned: %s", archiveIDs)
                else:
                    logging.warning(
                        "Upload failed and it will be retried in the next cycle: %s: %s.",
                        response[0]['status'], response[0]['reason'])
                    logging.debug("failed JobIds %s", jobIDs)
        except Exception as ex:
            logging.error("Error occurred, will retry later:")
            logging.error(str(ex))
            logging.error("Trace back: \n%s", traceback.format_exc())
Exemplo n.º 23
0
    def updateBulkDocuments(self, doc_ids, paramsToUpdate, updateLimits=1000):

        uri = '/%s/_bulk_docs/' % self.name
        conflictDocIDs = []
        for ids in grouper(doc_ids, updateLimits):
            # get original documens
            docs = self.allDocs(options={"include_docs": True}, keys=ids)['rows']
            data = {}
            data['docs'] = []
            for j in docs:
                doc = {}
                doc.update(j['doc'])
                nestedDictUpdate(doc, paramsToUpdate)
                data['docs'].append(doc)

            if len(data['docs']) > 0:
                retval = self.post(uri, data)
                for result in retval:
                    if result.get('error', None) == 'conflict':
                        conflictDocIDs.append(result['id'])

        return conflictDocIDs
Exemplo n.º 24
0
    def algorithm(self, parameters=None):
        """
        _algorithm_

        Poll WMBS for jobs in the 'Complete' state and then pass them to the
        accountant worker.
        """
        completeJobs = self.getJobsAction.execute(state="complete")
        logging.info("Found %d completed jobs", len(completeJobs))

        if len(completeJobs) == 0:
            logging.debug("No work to do; exiting")
            return

        for jobsSlice in grouper(completeJobs, self.accountantWorkSize):
            try:
                self.accountantWorker(jobsSlice)
            except WMException:
                myThread = threading.currentThread()
                if getattr(myThread, 'transaction', None) is not None:
                    myThread.transaction.rollback()
                raise
            except CouchConnectionError as ex:
                msg = "Caught CouchConnectionError exception. Waiting until the next polling cycle.\n"
                msg += str(ex)
                logging.error(msg)
                myThread = threading.currentThread()
                if getattr(myThread, 'transaction', None) is not None:
                    myThread.transaction.rollback()
            except Exception as ex:
                myThread = threading.currentThread()
                if getattr(myThread, 'transaction', None) is not None:
                    myThread.transaction.rollback()
                msg = "Hit general exception in JobAccountantPoller while using worker.\n"
                msg += str(ex)
                logging.exception(msg)
                raise JobAccountantPollerException(msg)

        return
Exemplo n.º 25
0
    def updateBulkDocuments(self, doc_ids, paramsToUpdate, updateLimits=1000):

        uri = '/%s/_bulk_docs/' % self.name
        conflictDocIDs = []
        for ids in grouper(doc_ids, updateLimits):
            # get original documens
            docs = self.allDocs(options={"include_docs": True}, keys=ids)['rows']
            data = {}
            data['docs'] = []
            for j in docs:
                doc = {}
                doc.update(j['doc'])
                nestedDictUpdate(doc, paramsToUpdate)
                data['docs'].append(doc)

            if data['docs']:
                retval = self.post(uri, data)
                for result in retval:
                    if result.get('error', None) == 'conflict':
                        conflictDocIDs.append(result['id'])

        return conflictDocIDs
Exemplo n.º 26
0
    def algorithm(self, parameters=None):
        """
        _algorithm_

        Poll WMBS for jobs in the 'Complete' state and then pass them to the
        accountant worker.
        """
        completeJobs = self.getJobsAction.execute(state="complete")
        logging.info("Found %d completed jobs", len(completeJobs))

        if len(completeJobs) == 0:
            logging.debug("No work to do; exiting")
            return

        for jobsSlice in grouper(completeJobs, self.accountantWorkSize):
            try:
                self.accountantWorker(jobsSlice)
            except WMException:
                myThread = threading.currentThread()
                if getattr(myThread, 'transaction', None) is not None:
                    myThread.transaction.rollback()
                raise
            except CouchConnectionError as ex:
                msg = "Caught CouchConnectionError exception. Waiting until the next polling cycle.\n"
                msg += str(ex)
                logging.error(msg)
                myThread = threading.currentThread()
                if getattr(myThread, 'transaction', None) is not None:
                    myThread.transaction.rollback()
            except Exception as ex:
                myThread = threading.currentThread()
                if getattr(myThread, 'transaction', None) is not None:
                    myThread.transaction.rollback()
                msg = "Hit general exception in JobAccountantPoller while using worker.\n"
                msg += str(ex)
                logging.exception(msg)
                raise JobAccountantPollerException(msg)

        return
Exemplo n.º 27
0
    def bulkUpdateData(self, docs, existingDocs):
        """
        Update documents to WMStats in bulk, breaking down to 100 docs chunks.
        :param docs: docs to insert or update
        :param existingDocs: dict of docId: docRev of docs already existent in wmstats
        """
        if isinstance(docs, dict):
            docs = [docs]
        for chunk in grouper(docs, 100):
            for doc in chunk:
                if doc['_id'] in existingDocs:
                    revList = existingDocs[doc['_id']].split('-')
                    # update the revision number and keep the history of the revision
                    doc['_revisions'] = {"start": int(revList[0]) + 1, "ids": [str(int(revList[1]) + 1), revList[1]]}
                else:
                    # then create a random 10 digits uid for the first revision number, required by new_edits=False
                    firstId = "%10d" % random.randrange(9999999999)
                    doc['_revisions'] = {"start": 1, "ids": [firstId]}
                self.couchDB.queue(doc)

            logging.info("Committing bulk of %i docs ...", len(chunk))
            self.couchDB.commit(new_edits=False)
        return
Exemplo n.º 28
0
    def execute(self, emulator=None):
        """
        _execute_

        """
        scramCommand = self.step.application.setup.scramCommand
        scramArch = self.step.application.setup.scramArch
        cmsswVersion = self.step.application.setup.cmsswVersion

        scramArch = getSingleScramArch(scramArch)

        # Are we using emulators again?
        if emulator is not None:
            return emulator.emulate(self.step, self.job)

        overrides = {}
        if hasattr(self.step, 'override'):
            overrides = self.step.override.dictionary_()

        # Set wait to over an hour
        waitTime = overrides.get(
            'waitTime', 3600 + (self.step.retryDelay * self.step.retryCount))

        # hardcode CERN Castor T0_CH_CERN_MSS stageout parameters
        castorStageOutParams = {}
        castorStageOutParams['command'] = overrides.get('command', "xrdcp")
        castorStageOutParams['option'] = overrides.get('option',
                                                       "--cerncastor")
        castorStageOutParams['phedex-node'] = overrides.get(
            'phedex-node', "T2_CH_CERN")
        castorStageOutParams['lfn-prefix'] = overrides.get(
            'lfn-prefix', "root://castorcms.cern.ch//castor/cern.ch/cms")

        # hardcode CERN EOS T2_CH_CERN stageout parameters
        eosStageOutParams = {}
        eosStageOutParams['command'] = overrides.get('command', "xrdcp")
        eosStageOutParams['option'] = overrides.get('option', "")
        eosStageOutParams['phedex-node'] = overrides.get(
            'phedex-node', "T2_CH_CERN")
        eosStageOutParams['lfn-prefix'] = overrides.get(
            'lfn-prefix', "root://eoscms.cern.ch//eos/cms")

        try:
            castorStageOutMgr = StageOutMgr(**castorStageOutParams)
            eosStageOutMgr = StageOutMgr(**eosStageOutParams)
            stageInMgr = StageInMgr()
            deleteMgr = DeleteMgr()
        except Exception as ex:
            msg = "Unable to load StageOut/Delete Impl: %s" % str(ex)
            logging.error(msg)
            raise WMExecutionFailure(60312, "MgrImplementationError", msg)

        # prepare output tar file
        taskName = self.report.getTaskName().split('/')[-1]
        host = socket.gethostname().split('.')[0]
        tarName = '%s-%s-%s-%i-logs.tar' % (
            self.report.data.workload, taskName, host, self.job["counter"])
        tarLocation = os.path.join(self.stepSpace.location, tarName)

        # check if the cmsswVersion supports edmCopyUtil (min CMSSW_8_X)
        result = re.match("CMSSW_([0-9]+)_([0-9]+)_([0-9]+).*", cmsswVersion)
        useEdmCopyUtil = False
        if result:
            try:
                if int(result.group(1)) >= 8:
                    useEdmCopyUtil = True
            except ValueError:
                pass

        # setup Scram needed to run edmCopyUtil
        if useEdmCopyUtil:
            scram = Scram(
                command=scramCommand,
                version=cmsswVersion,
                initialise=self.step.application.setup.softwareEnvironment,
                directory=self.step.builder.workingDir,
                architecture=scramArch,
            )
            logging.info("Running scram")
            try:
                projectOutcome = scram.project()
            except Exception as ex:
                msg = "Exception raised while running scram.\n"
                msg += str(ex)
                logging.critical("Error running SCRAM")
                logging.critical(msg)
                raise WMExecutionFailure(50513, "ScramSetupFailure", msg)

            if projectOutcome > 0:
                msg = scram.diagnostic()
                logging.critical("Error running SCRAM")
                logging.critical(msg)
                raise WMExecutionFailure(50513, "ScramSetupFailure", msg)
            runtimeOutcome = scram.runtime()
            if runtimeOutcome > 0:
                msg = scram.diagnostic()
                logging.critical("Error running SCRAM")
                logging.critical(msg)
                raise WMExecutionFailure(50513, "ScramSetupFailure", msg)

        # iterate through input files
        localLogs = []
        deleteLogArchives = []
        if useEdmCopyUtil:
            numberOfFilesPerCopy = 10
        else:
            numberOfFilesPerCopy = 1
        for logs in grouper(self.job["input_files"], numberOfFilesPerCopy):

            copyCommand = "env X509_USER_PROXY=%s edmCopyUtil" % os.environ.get(
                'X509_USER_PROXY', None)
            for log in logs:
                copyCommand += " %s" % log['lfn']
            copyCommand += " %s" % self.step.builder.workingDir

            # give up after timeout of 1 minute per input file
            signal.signal(signal.SIGALRM, alarmHandler)
            signal.alarm(60 * numberOfFilesPerCopy)

            filesCopied = False
            try:
                if useEdmCopyUtil:
                    logging.info("Running edmCopyUtil")
                    retval = scram(copyCommand)
                    if retval == 0:
                        filesCopied = True
                else:
                    logging.info("Running stageIn")
                    for log in logs:
                        fileInfo = {"LFN": log['lfn']}
                        logArchive = stageInMgr(**fileInfo)
                        if logArchive:
                            filesCopied = True
            except Alarm:
                logging.error(
                    "Indefinite hang during edmCopyUtil/stageIn of logArchives"
                )
            except StageOutFailure:
                logging.error("Unable to stageIn logArchives")
            except Exception:
                raise

            signal.alarm(0)

            if filesCopied:
                for log in logs:
                    localLogs.append(
                        os.path.join(self.step.builder.workingDir,
                                     os.path.basename(log['lfn'])))
                    deleteLogArchives.append(log)
                    self.report.addInputFile(sourceName="logArchives",
                                             lfn=log['lfn'])
            else:
                logging.error("Unable to copy logArchives to local disk")
                if useEdmCopyUtil:
                    with open('scramOutput.log', 'r') as f:
                        logging.error("Scram output: %s", f.read())
                for log in logs:
                    self.report.addSkippedFile(log['lfn'], None)

        # create tarfile if any logArchive copied in
        if localLogs:
            tarFile = tarfile.open(tarLocation, 'w:')
            for log in localLogs:
                path = log.split('/')
                tarFile.add(name=log,
                            arcname=os.path.join(path[-3], path[-2], path[-1]))
                os.remove(log)
            tarFile.close()
        else:
            msg = "Unable to copy any logArchives to local disk"
            logging.error(msg)
            raise WMExecutionFailure(60312, "LogCollectError", msg)

        # now staging out the LogCollect tarfile
        logging.info("Staging out LogCollect tarfile to Castor and EOS")
        now = datetime.datetime.now()
        lfn = "/store/logs/prod/%i/%.2i/%s/%s/%s" % (
            now.year, now.month, "WMAgent", self.report.data.workload,
            os.path.basename(tarLocation))

        tarInfo = {'LFN': lfn, 'PFN': tarLocation, 'PNN': None, 'GUID': None}

        # perform mandatory stage out to CERN Castor
        signal.signal(signal.SIGALRM, alarmHandler)
        signal.alarm(waitTime)
        try:
            castorStageOutMgr(tarInfo)
        except Alarm:
            msg = "Indefinite hang during stageOut of LogCollect to Castor"
            logging.error(msg)
            raise WMExecutionFailure(60409, "LogCollectTimeout", msg)
        except Exception as ex:
            msg = "Unable to stageOut LogCollect to Castor:\n"
            msg += str(ex)
            logging.error(msg)
            raise WMExecutionFailure(60408, "LogCollectStageOutError", msg)
        signal.alarm(0)

        # add to job report
        self.report.addOutputFile(outputModule="LogCollect", aFile=tarInfo)
        outputRef = getattr(self.report.data, self.stepName)
        outputRef.output.pfn = tarInfo['PFN']
        outputRef.output.location = tarInfo['PNN']
        outputRef.output.lfn = tarInfo['LFN']

        tarInfo = {'LFN': lfn, 'PFN': tarLocation, 'PNN': None, 'GUID': None}

        # then, perform best effort stage out to CERN EOS
        signal.signal(signal.SIGALRM, alarmHandler)
        signal.alarm(waitTime)
        try:
            eosStageOutMgr(tarInfo)
        except Alarm:
            logging.error(
                "Indefinite hang during stageOut of LogCollect to EOS")
        except Exception as ex:
            logging.error("Unable to stageOut LogCollect to EOS: %s", ex)
        signal.alarm(0)

        # we got this far, delete input
        for log in deleteLogArchives:

            # give up after timeout of 1 minutes
            signal.signal(signal.SIGALRM, alarmHandler)
            signal.alarm(60)
            try:
                fileToDelete = {
                    'LFN': log['lfn'],
                    'PFN': None,
                    'PNN': None,
                    'StageOutCommand': None
                }
                deleteMgr(fileToDelete=fileToDelete)
            except Alarm:
                logging.error("Indefinite hang during delete of logArchive")
            except Exception as ex:
                logging.error("Unable to delete logArchive: %s", ex)
            signal.alarm(0)

        return
Exemplo n.º 29
0
    def execute(self, emulator=None):
        """
        _execute_

        """
        # Are we using emulators again?
        if emulator is not None:
            return emulator.emulate(self.step, self.job)

        logging.info("Steps.Executors.%s.execute called", self.__class__.__name__)

        scramCommand = self.step.application.setup.scramCommand
        cmsswVersion = self.step.application.setup.cmsswVersion
        scramArch = getSingleScramArch(self.step.application.setup.scramArch)
        overrideCatalog = getattr(self.step.application, 'overrideCatalog', None)

        overrides = {}
        if hasattr(self.step, 'override'):
            overrides = self.step.override.dictionary_()

        # Set wait to over an hour
        waitTime = overrides.get('waitTime', 3600 + (self.step.retryDelay * self.step.retryCount))

        # hardcode CERN EOS T2_CH_CERN stageout parameters
        eosStageOutParams = {}
        eosStageOutParams['command'] = overrides.get('command', "xrdcp")
        eosStageOutParams['option'] = overrides.get('option', "--wma-disablewriterecovery")
        eosStageOutParams['phedex-node'] = overrides.get('phedex-node', "T2_CH_CERN")
        eosStageOutParams['lfn-prefix'] = overrides.get('lfn-prefix', "root://eoscms.cern.ch//eos/cms")

        try:
            eosStageOutMgr = StageOutMgr(**eosStageOutParams)
            stageInMgr = StageInMgr()
            deleteMgr = DeleteMgr()
        except Exception as ex:
            msg = "Unable to load StageOut/Delete Impl: %s" % str(ex)
            logging.error(msg)
            raise WMExecutionFailure(60312, "MgrImplementationError", msg)

        # prepare output tar file
        taskName = self.report.getTaskName().split('/')[-1]
        host = socket.gethostname().split('.')[0]
        tarName = '%s-%s-%s-%i-logs.tar' % (self.report.data.workload, taskName, host, self.job["counter"])
        tarLocation = os.path.join(self.stepSpace.location, tarName)

        # Supported by any release beyond CMSSW_8_X, however DaviX is broken until CMSSW_10_4_X
        # see: https://github.com/cms-sw/cmssw/issues/25292
        useEdmCopyUtil = True
        if isCMSSWSupported(cmsswVersion, "CMSSW_10_4_0"):
            pass
        elif scramArch.startswith('slc7_amd64_'):
            msg = "CMSSW too old or not fully functional to support edmCopyUtil, using CMSSW_10_4_0 instead"
            logging.warning(msg)
            cmsswVersion = "CMSSW_10_4_0"
            scramArch = "slc7_amd64_gcc820"
        elif scramArch.startswith('slc6_amd64_'):
            msg = "CMSSW too old or not fully functional to support edmCopyUtil, using CMSSW_10_4_0 instead"
            logging.warning(msg)
            cmsswVersion = "CMSSW_10_4_0"
            scramArch = "slc6_amd64_gcc700"
        else:
            useEdmCopyUtil = False
        logging.info("Using edmCopyUtil: %s", useEdmCopyUtil)

        # setup Scram needed to run edmCopyUtil
        if useEdmCopyUtil:
            scram = Scram(
                    command=scramCommand,
                    version=cmsswVersion,
                    initialise=self.step.application.setup.softwareEnvironment,
                    directory=self.step.builder.workingDir,
                    architecture=scramArch,
            )
            logging.info("Running scram")
            try:
                projectOutcome = scram.project()
            except Exception as ex:
                msg = "Exception raised while running scram.\n"
                msg += str(ex)
                logging.critical("Error running SCRAM")
                logging.critical(msg)
                raise WMExecutionFailure(50513, "ScramSetupFailure", msg)

            if projectOutcome > 0:
                msg = scram.diagnostic()
                logging.critical("Error running SCRAM")
                logging.critical(msg)
                raise WMExecutionFailure(50513, "ScramSetupFailure", msg)
            runtimeOutcome = scram.runtime()
            if runtimeOutcome > 0:
                msg = scram.diagnostic()
                logging.critical("Error running SCRAM")
                logging.critical(msg)
                raise WMExecutionFailure(50513, "ScramSetupFailure", msg)

        # iterate through input files
        localLogs = []
        deleteLogArchives = []
        if useEdmCopyUtil:
            numberOfFilesPerCopy = 10
        else:
            numberOfFilesPerCopy = 1
        for logs in grouper(self.job["input_files"], numberOfFilesPerCopy):

            copyCommand = "env X509_USER_PROXY=%s edmCopyUtil" % os.environ.get('X509_USER_PROXY', None)

            # specify TFC if necessary
            if overrideCatalog:
                copyCommand += " -c %s" % overrideCatalog

            for log in logs:
                copyCommand += " %s" % log['lfn']
            copyCommand += " %s" % self.step.builder.workingDir

            # give up after timeout of 1 minute per input file
            signal.signal(signal.SIGALRM, alarmHandler)
            signal.alarm(60 * numberOfFilesPerCopy)

            filesCopied = False
            try:
                if useEdmCopyUtil:
                    logging.info("Running edmCopyUtil")
                    retval = scram(copyCommand)
                    if retval == 0:
                        filesCopied = True
                else:
                    logging.info("Running stageIn")
                    for log in logs:
                        fileInfo = {"LFN": log['lfn']}
                        logArchive = stageInMgr(**fileInfo)
                        if logArchive:
                            filesCopied = True
            except Alarm:
                logging.error("Indefinite hang during edmCopyUtil/stageIn of logArchives")
            except StageOutFailure:
                logging.error("Unable to stageIn logArchives")
            except Exception:
                raise

            signal.alarm(0)

            if filesCopied:
                for log in logs:
                    localLogs.append(os.path.join(self.step.builder.workingDir, os.path.basename(log['lfn'])))
                    deleteLogArchives.append(log)
                    self.report.addInputFile(sourceName="logArchives", lfn=log['lfn'])
            else:
                logging.error("Unable to copy logArchives to local disk")
                for log in logs:
                    self.report.addSkippedFile(log['lfn'], None)

        # create tarfile if any logArchive copied in
        if localLogs:
            with tarfile.open(tarLocation, 'w:') as tarFile:
                for log in localLogs:
                    path = log.split('/')
                    tarFile.add(name=log,
                                arcname=os.path.join(path[-3], path[-2], path[-1]))
                    os.remove(log)
        else:
            msg = "Unable to copy any logArchives to local disk"
            logging.error(msg)
            raise WMExecutionFailure(60312, "LogCollectError", msg)

        # now staging out the LogCollect tarfile
        logging.info("Staging out LogCollect tarfile to EOS (skipping CASTOR)")
        now = datetime.datetime.now()
        lfn = "/store/logs/prod/%i/%.2i/%s/%s/%s" % (now.year, now.month, "WMAgent",
                                                     self.report.data.workload,
                                                     os.path.basename(tarLocation))

        tarInfo = {'LFN': lfn,
                   'PFN': tarLocation,
                   'PNN': None,
                   'GUID': None}

        # perform mandatory stage out to CERN EOS
        signal.signal(signal.SIGALRM, alarmHandler)
        signal.alarm(waitTime)
        try:
            eosStageOutMgr(tarInfo)
        except Alarm:
            msg = "Indefinite hang during stageOut of LogCollect to EOS"
            logging.error(msg)
            raise WMExecutionFailure(60409, "LogCollectTimeout", msg)
        except Exception as ex:
            msg = "Unable to stageOut LogCollect to Castor:\n"
            msg += str(ex)
            logging.error(msg)
            raise WMExecutionFailure(60408, "LogCollectStageOutError", msg)
        signal.alarm(0)

        # add to job report
        self.report.addOutputFile(outputModule="LogCollect", aFile=tarInfo)
        outputRef = getattr(self.report.data, self.stepName)
        outputRef.output.pfn = tarInfo['PFN']
        outputRef.output.location = tarInfo['PNN']
        outputRef.output.lfn = tarInfo['LFN']

        # we got this far, delete ALL input files assigned to this job
        for log in self.job["input_files"]:
            # give up after timeout of 1 minutes
            signal.signal(signal.SIGALRM, alarmHandler)
            signal.alarm(60)
            try:
                fileToDelete = {'LFN': log['lfn'],
                                'PFN': None,
                                'PNN': None,
                                'StageOutCommand': None}
                deleteMgr(fileToDelete=fileToDelete)
            except Alarm:
                logging.error("Indefinite hang during delete of logArchive")
            except Exception as ex:
                logging.error("Unable to delete logArchive: %s", ex)
            signal.alarm(0)

        return
Exemplo n.º 30
0
    def submit(self, jobs, info=None):
        """
        _submit_


        Submit jobs for one subscription
        """

        # If we're here, then we have submitter components
        self.scriptFile = self.config.JobSubmitter.submitScript
        self.submitDir = self.config.JobSubmitter.submitDir
        timeout = getattr(self.config.JobSubmitter, 'getTimeout', 400)

        successfulJobs = []
        failedJobs = []
        jdlFiles = []

        if len(jobs) == 0:
            # Then was have nothing to do
            return successfulJobs, failedJobs

        if len(self.pool) == 0:
            # Starting things up
            # This is obviously a submit API
            logging.info("Starting up PyCondorPlugin worker pool")
            self.inputQueue = multiprocessing.Queue()
            self.result = multiprocessing.Queue()
            for x in range(self.nProcess):
                p = multiprocessing.Process(target=submitWorker,
                                            args=(self.inputQueue, self.result,
                                                  timeout))
                p.start()
                self.pool.append(p)

        if not os.path.exists(self.submitDir):
            os.makedirs(self.submitDir)

        # Submit the jobs
        nSubmits = 0
        queueError = False
        for jobsReady in grouper(jobs, self.jobsPerWorker):

            if queueError:
                # If the queue has failed, then we must not process any more jobs this cycle.
                break

            idList = [x['id'] for x in jobsReady]
            jdlList = self.makeSubmit(jobList=jobsReady)
            if not jdlList:
                # Then we got nothing
                logging.error("No JDL file made!")
                return {'NoResult': [0]}
            jdlFile = "%s/submit_%i_%i.jdl" % (self.submitDir, os.getpid(),
                                               idList[0])

            with open(jdlFile, 'w') as handle:
                handle.writelines(jdlList)
            jdlFiles.append(jdlFile)

            # Now submit them
            logging.info("About to submit %i jobs", len(jobsReady))
            if self.glexecPath:
                command = 'CS=`which condor_submit`; '
                if self.glexecWrapScript:
                    command += 'export GLEXEC_ENV=`%s 2>/dev/null`; ' % self.glexecWrapScript
                command += 'export GLEXEC_CLIENT_CERT=%s; ' % self.glexecProxyFile
                command += 'export GLEXEC_SOURCE_PROXY=%s; ' % self.glexecProxyFile
                command += 'export X509_USER_PROXY=%s; ' % self.glexecProxyFile
                command += 'export GLEXEC_TARGET_PROXY=%s; ' % self.jdlProxyFile
                if self.glexecUnwrapScript:
                    command += '%s %s -- $CS %s' % (
                        self.glexecPath, self.glexecUnwrapScript, jdlFile)
                else:
                    command += '%s $CS %s' % (self.glexecPath, jdlFile)
            else:
                command = "condor_submit %s" % jdlFile

            try:
                self.inputQueue.put({'command': command, 'idList': idList})
            except AssertionError as ex:
                msg = "Critical error: input pipeline probably closed.\n"
                msg += str(ex)
                msg += "Error Procedure: Something critical has happened in the worker process\n"
                msg += "We will now proceed to pull all useful data from the queue (if it exists)\n"
                msg += "Then refresh the worker pool\n"
                logging.error(msg)
                queueError = True
                break
            nSubmits += 1

        # Now we should have sent all jobs to be submitted
        # Going to do the rest of it now
        for dummy in range(nSubmits):
            try:
                res = self.result.get(block=True, timeout=timeout)
            except Queue.Empty:
                # If the queue was empty go to the next submit
                # Those jobs have vanished
                logging.error("Queue.Empty error received!")
                logging.error("This could indicate a critical condor error!")
                logging.error(
                    "However, no information of any use was obtained due to process failure."
                )
                logging.error(
                    "Either process failed, or process timed out after %s seconds.",
                    timeout)
                continue
            except AssertionError as ex:
                msg = "Found Assertion error while retrieving output from worker process.\n"
                msg += str(ex)
                msg += "This indicates something critical happened to a worker process"
                msg += "We will recover what jobs we know were submitted, and resubmit the rest"
                msg += "Refreshing worker pool at end of loop"
                logging.error(msg)
                continue

            try:
                dummyOut = res['stdout']
                error = res['stderr']
                idList = res['idList']
                exitCode = res['exitCode']
            except KeyError as ex:
                msg = "Error in finding key from result pipe\n"
                msg += "Something has gone critically wrong in the worker\n"
                try:
                    msg += "Result: %s\n" % str(res)
                except Exception:
                    pass
                msg += str(ex)
                logging.error(msg)
                continue

            if exitCode != 0:
                logging.error(
                    "Condor returned non-zero.  Printing out command stderr")
                logging.error(error)
                errorCheck, errorMsg = parseError(error=error)
                logging.error(
                    "Processing failed jobs and proceeding to the next jobs.")
                logging.error("Do not restart component.")
            else:
                errorCheck = None

            if errorCheck:
                self.errorCount += 1
                condorErrorReport = Report()
                condorErrorReport.addError("JobSubmit", 61202, "CondorError",
                                           errorMsg)
                for jobID in idList:
                    for job in jobs:
                        if job.get('id', None) == jobID:
                            job['fwjr'] = condorErrorReport
                            failedJobs.append(job)
                            break
            else:
                if self.errorCount > 0:
                    self.errorCount -= 1
                for jobID in idList:
                    for job in jobs:
                        if job.get('id', None) == jobID:
                            successfulJobs.append(job)
                            break

            # If we get a lot of errors in a row it's probably time to
            # stop the component
            if self.errorCount > self.errorThreshold:
                msg = "Exceeded errorThreshold while submitting to condor. Check condor status."
                logging.error(msg)
                raise BossAirPluginException(msg)

        # Remove JDL files unless commanded otherwise
        if self.deleteJDLFiles:
            for f in jdlFiles:
                os.remove(f)

        # When we're finished, clean up the queue workers in order
        # to free up memory (in the midst of the process, the forked
        # memory space shouldn't be touched, so it should still be
        # shared, but after this point any action by the Submitter will
        # result in memory duplication).
        logging.info("Purging worker pool to clean up memory")
        self.close()

        # We must return a list of jobs successfully submitted,
        # and a list of jobs failed
        logging.info("Done submitting jobs for this cycle in PyCondorPlugin")
        return successfulJobs, failedJobs
Exemplo n.º 31
0
    def execute(self, reqStatus):
        """
        Executes the whole transferor logic
        :param reqStatus: request status to process
        :return:
        """
        counterWorkflows = 0
        counterFailedRequests = 0
        counterSuccessRequests = 0
        summary = dict(TRANSFEROR_REPORT)
        try:
            requestRecords = self.getRequestRecords(reqStatus)
            self.updateReportDict(summary, "total_num_requests", len(requestRecords))
            msg = "  retrieved %s requests. " % len(requestRecords)
            msg += "Service set to process up to %s requests per cycle." % self.msConfig["limitRequestsPerCycle"]
            self.logger.info(msg)
        except Exception as err:  # general error
            msg = "Unknown exception while fetching requests from ReqMgr2. Error: %s", str(err)
            self.logger.exception(msg)
            self.updateReportDict(summary, "error", msg)

        try:
            self.updateCaches()
            self.updateReportDict(summary, "total_num_campaigns", len(self.campaigns))
            self.updateReportDict(summary, "nodes_out_of_space", list(self.rseQuotas.getOutOfSpaceRSEs()))
        except RuntimeWarning as ex:
            msg = "All retries exhausted! Last error was: '%s'" % str(ex)
            msg += "\nRetrying to update caches again in the next cycle."
            self.logger.error(msg)
            self.updateReportDict(summary, "error", msg)
            return summary
        except Exception as ex:
            msg = "Unknown exception updating caches. Error: %s" % str(ex)
            self.logger.exception(msg)
            self.updateReportDict(summary, "error", msg)
            return summary

        # process all requests
        for reqSlice in grouper(requestRecords, 100):
            self.logger.info("Processing workflows from %d to %d.",
                             counterWorkflows + 1, counterWorkflows + len(reqSlice))
            # get complete requests information
            # based on Unified Transferor logic
            reqResults = self.reqInfo(reqSlice)
            self.logger.info("%d requests information completely processed.", len(reqResults))

            for wflow in reqResults:
                # first check which data is already in place and filter them out
                self.checkDataLocation(wflow)

                try:
                    success, transfers = self.makeTransferRequest(wflow)
                except Exception as ex:
                    success = False
                    msg = "Unknown exception while making Transfer Request for %s " % wflow.getName()
                    msg += "\tError: %s" % str(ex)
                    self.logger.exception(msg)
                if success:
                    self.logger.info("Transfer requests successful for %s. Summary: %s",
                                     wflow.getName(), pformat(transfers))                    # then create a document in ReqMgr Aux DB
                    if self.createTransferDoc(wflow.getName(), transfers):
                        self.logger.info("Transfer document successfully created in CouchDB for: %s", wflow.getName())
                        # then move this request to staging status
                        self.change(wflow.getName(), 'staging', self.__class__.__name__)
                        counterSuccessRequests += 1
                    else:
                        counterFailedRequests += 1
                else:
                    counterFailedRequests += 1
            # it can go slightly beyond the limit. It's evaluated for every slice
            if counterSuccessRequests >= self.msConfig["limitRequestsPerCycle"]:
                msg = "Transferor succeeded acting on %d workflows in this cycle. " % counterSuccessRequests
                msg += "Which exceeds the configuration limit set to: %s" % self.msConfig["limitRequestsPerCycle"]
                self.logger.info(msg)
                break
            counterWorkflows += len(reqSlice)

        self.logger.info("There were %d failed and %d success requests in this cycle",
                         counterFailedRequests, counterSuccessRequests)
        self.logger.info("%s subscribed %d datasets and %d blocks in this cycle",
                         self.__class__.__name__, self.dsetCounter, self.blockCounter)
        self.updateReportDict(summary, "success_request_transition", counterSuccessRequests)
        self.updateReportDict(summary, "failed_request_transition", counterFailedRequests)
        self.updateReportDict(summary, "num_datasets_subscribed", self.dsetCounter)
        self.updateReportDict(summary, "num_blocks_subscribed", self.blockCounter)
        self.updateReportDict(summary, "nodes_out_of_space", list(self.rseQuotas.getOutOfSpaceRSEs()))
        return summary
Exemplo n.º 32
0
 def execute(self, file=None, runs=None, conn=None, transaction=False):
     for sliceBinds in grouper(self.getBinds(file, runs), 10000):
         result = self.dbi.processData(self.sql, sliceBinds, conn=conn,
                                       transaction=transaction)
     return self.format(result)
Exemplo n.º 33
0
    def cleanRSE(self, rse):
        """
        The method to implement the actual deletion of files for an RSE.
        :param rse: MSUnmergedRSE object to be cleaned
        :return:    The MSUnmergedRSE object
        """

        # Create the gfal2 context object:
        try:
            ctx = createGfal2Context(self.msConfig['gfalLogLevel'], self.msConfig['emulateGfal2'])
        except Exception as ex:
            msg = "RSE: %s, Failed to create gfal2 Context object. " % rse['name']
            msg += "Skipping it in the current run."
            self.logger.exception(msg)
            raise MSUnmergedPlineExit(msg)

        # Start cleaning one directory at a time:
        for dirLfn, fileLfnGen in rse['files']['toDelete'].items():
            if self.msConfig['limitFilesPerRSE'] < 0 or \
               rse['counters']['filesToDelete'] < self.msConfig['limitFilesPerRSE']:

                # First increment the dir counter:
                rse['counters']['dirsToDelete'] += 1

                # Now we consume the rse['files']['toDelete'][dirLfn] generator
                # upon that no values will be left in it. In case we need it again
                # we will have to recreate the filter as we did in self.filterUnmergedFiles()
                pfnList = []
                if not rse['pfnPrefix']:
                    # Fall back to calling Rucio on a per directory basis for
                    # resolving the lfn to pfn mapping
                    dirPfn = self.rucio.getPFN(rse['name'], dirLfn, operation='delete')[dirLfn]
                    for fileLfn in fileLfnGen:
                        fileLfnSuffix = fileLfn.split(dirLfn)[1]
                        filePfn = dirPfn + fileLfnSuffix
                        pfnList.append(filePfn)
                else:
                    # Proceed with assembling the full filePfn out of the rse['pfnPrefix'] and the fileLfn
                    dirPfn = rse['pfnPrefix'] + dirLfn
                    for fileLfn in fileLfnGen:
                        filePfn = rse['pfnPrefix'] + fileLfn
                        pfnList.append(filePfn)

                rse['counters']['filesToDelete'] += len(pfnList)
                msg = "\nRSE: %s \nDELETING: %s."
                msg += "\nPFN list with: %s entries: \n%s"
                self.logger.debug(msg, rse['name'], dirLfn, len(pfnList), twFormat(pfnList, maxLength=4))

                if self.msConfig['enableRealMode']:
                    try:
                        # execute the actual deletion in bulk - full list of files per directory

                        deletedSuccess = 0
                        for pfnSlice in list(grouper(pfnList, self.msConfig["filesToDeleteSliceSize"])):
                            delResult = ctx.unlink(pfnSlice)
                            # Count all the successfully deleted files (if a deletion was
                            # successful a value of None is put in the delResult list):
                            deletedSuccess += sum([1 for pfnStatus in delResult if pfnStatus is None])

                        self.logger.debug("RSE: %s, Dir: %s, deletedSuccess: %s",
                                          rse['name'], dirLfn, deletedSuccess)
                        rse['counters']['deletedSuccess'] = deletedSuccess

                        # Now clean the whole branch
                        self.logger.debug("Purging dirEntry: %s:\n", dirPfn)
                        purgeSuccess = self._purgeTree(ctx, dirPfn)
                        if not purgeSuccess:
                            msg = "RSE: %s Failed to purge nonEmpty directory: %s"
                            self.logger.error(msg, rse['name'], dirPfn)
                    except Exception as ex:
                        msg = "Error while cleaning RSE: %s. "
                        msg += "Will retry in the next cycle. Err: %s"
                        self.logger.warning(msg, rse['name'], str(ex))
        rse['isClean'] = self._checkClean(rse)

        return rse
Exemplo n.º 34
0
    def listFilesInBlockWithParents(self, fileBlockName, lumis=True, validFileOnly=1):
        """
        _listFilesInBlockWithParents_

        Get a list of files in the named fileblock including
        the parents of that file.
        TODO: lumis can be false when lumi splitting is not required
        However WMBSHelper expect file['LumiList'] to get the run number
        so for now it will be always true.

        """
        if not self.blockExists(fileBlockName):
            msg = "DBSReader.listFilesInBlockWithParents(%s): No matching data"
            raise DBSReaderError(msg % fileBlockName)

        try:
            # TODO: shoud we get only valid block for this?
            files = self.dbs.listFileParents(block_name=fileBlockName)
            fileDetails = self.listFilesInBlock(fileBlockName, lumis, validFileOnly)

        except dbsClientException as ex:
            msg = "Error in "
            msg += "DBSReader.listFilesInBlockWithParents(%s)\n" % (
                fileBlockName,)
            msg += "%s\n" % formatEx3(ex)
            raise DBSReaderError(msg)

        childByParents = defaultdict(list)
        for f in files:
            # Probably a child can have more than 1 parent file
            for fp in f['parent_logical_file_name']:
                childByParents[fp].append(f['logical_file_name'])
        
        parentsLFNs = childByParents.keys()
        
        if len(parentsLFNs) == 0:
            msg = "Error in "
            msg += "DBSReader.listFilesInBlockWithParents(%s)\n There is no parents files" % (
                fileBlockName)
            raise DBSReaderError(msg)

        parentFilesDetail = []
        # TODO: slicing parentLFNs util DBS api is handling that.
        # Remove slicing if DBS api handles
        for pLFNs in grouper(parentsLFNs, 50):
            parentFilesDetail.extend(self.dbs.listFileArray(logical_file_name=pLFNs, detail=True))

        if lumis:
            parentLumis = self._getLumiList(lfns=parentsLFNs)

        parentsByLFN = defaultdict(list)

        for pf in parentFilesDetail:
            parentLFN = pf['logical_file_name']
            dbsFile = remapDBS3Keys(pf, stringify=True)
            if lumis:
                dbsFile["LumiList"] = parentLumis[parentLFN]

            for childLFN in childByParents[parentLFN]:
                parentsByLFN[childLFN].append(dbsFile)

        for fileInfo in fileDetails:
            fileInfo["ParentList"] = parentsByLFN[fileInfo['logical_file_name']]

        return fileDetails
Exemplo n.º 35
0
    def _handleInputDataInfo(self, wflow, dset, blocksDict):
        """
        Applies any run/block/lumi list on the primary and parent
        blocks provided.
        It's a convoluted logic, such as:
         1) if there is no run/block/lumi list, just return the initial blocksDict
         2) if it has lumi list, filter runs from it and run block discovery
            given a dataset name and a list of runs
         3) if it has RunWhitelist, run block discovery for a given dataset name
            and a list of runs
         4) if it has only RunBlacklist, discover the run list of all initial blocks
            provided in blocksDict and remove blocks matching only the black list
         5) for the steps above, always check whether the block has replicas
         6) NOW that the block data discovery is completed (considering runs):
           * if LumiList is not enabled, just return the current list of blocks
           * else, fetch file/run/lumi information in bulk of blocks and compare it
           to the LumiList, skipping blocks without a single file that matches it.

        Note that the LumiList check is dealt with in a similar way
        as done in the WorkQueue StartPolicyInterface/getMaskedBlocks:

        :param wflow: the Workflow object
        :param dset: dataset name
        :param blocksDict: dictionary of blocks, their size and location
        :return: dictionary of block names and block size
        """
        finalBlocks = {}
        dbsUrl = wflow.getDbsUrl()
        runWhite = wflow.getRunWhitelist()
        runBlack = set(wflow.getRunBlacklist())
        lumiList = wflow.getLumilist()
        if lumiList:
            # LumiList has precedence over RunWhitelist
            runWhite = []
            for run in lumiList.getRuns():
                runWhite.append(int(run))
            runWhite = list(set(runWhite))
        if runWhite:
            # Run number 1 is not supported by DBSServer
            if int(runWhite[0]) == 1:
                finalBlocks = deepcopy(blocksDict)
            else:
                runWhite = list(set(runWhite) - runBlack)
                self.logger.info(
                    "Fetching blocks matching a list of runs for %s",
                    wflow.getName())
                try:
                    blocks = getBlocksByDsetAndRun(dset, runWhite, dbsUrl)
                except Exception as exc:
                    self.logger.error(
                        "Failed to retrieve blocks by dataset and run. Details: %s",
                        str(exc))
                    raise
                for block in blocks:
                    if block in blocksDict:
                        finalBlocks[block] = deepcopy(blocksDict[block])
                    else:
                        self.logger.info(
                            "Dropping block with no replicas in PhEDEx: %s",
                            block)
        elif runBlack:
            # only run blacklist set
            self.logger.info("Fetching runs in blocks for RunBlacklist for %s",
                             wflow.getName())
            try:
                blockRuns = getRunsInBlock(list(blocksDict), dbsUrl)
            except Exception as exc:
                self.logger.error(
                    "Failed to bulk retrieve runs per block. Details: %s",
                    str(exc))
                raise
            for block, runs in blockRuns.items():
                if not set(runs).difference(runBlack):
                    self.logger.info(
                        "Dropping block with only blacklisted runs: %s", block)
                elif block in blocksDict:
                    finalBlocks[block] = deepcopy(blocksDict[block])

        if lumiList:
            self.logger.info(
                "Fetching block/lumi information for %d blocks in %s",
                len(finalBlocks), wflow.getName())
            self.logger.debug("with the following run whitelist: %s", runWhite)
            goodBlocks = set()
            # now with a smaller set of blocks in hand, we collect their lumi
            # information and discard any blocks not matching the lumi list
            for blockSlice in grouper(finalBlocks, 10):
                try:
                    blockFileLumis = getFileLumisInBlock(blockSlice,
                                                         dbsUrl,
                                                         validFileOnly=1)
                except Exception as exc:
                    self.logger.error(
                        "Failed to bulk retrieve run/lumi per block. Details: %s",
                        str(exc))
                    raise
                for block, fileLumis in blockFileLumis.items():
                    for fileLumi in fileLumis:
                        if int(fileLumi['run_num']) not in runWhite:
                            continue
                        runNumber = str(fileLumi['run_num'])
                        lumis = fileLumi['lumi_section_num']
                        fileMask = LumiList(runsAndLumis={runNumber: lumis})
                        if lumiList & fileMask:
                            # then it has lumis that we need, keep this block and move on
                            goodBlocks.add(block)
                            break
            # last but not least, drop any blocks that are not in the good list
            for block in list(finalBlocks):
                if block not in goodBlocks:
                    self.logger.info(
                        "Dropping block not matching LumiList: %s", block)
                    finalBlocks.pop(block)

        if not finalBlocks:
            finalBlocks = blocksDict
        return finalBlocks
Exemplo n.º 36
0
    def execute(self, reqStatus):
        """
        Executes the whole transferor logic
        :param reqStatus: request status to process
        :return:
        """
        counterWorkflows = 0
        counterFailedRequests = 0
        counterProblematicRequests = 0
        counterSuccessRequests = 0
        summary = dict(TRANSFEROR_REPORT)
        try:
            requestRecords = self.getRequestRecords(reqStatus)
            self.updateReportDict(summary, "total_num_requests",
                                  len(requestRecords))
            msg = "  retrieved %s requests. " % len(requestRecords)
            msg += "Service set to process up to %s requests per cycle." % self.msConfig[
                "limitRequestsPerCycle"]
            self.logger.info(msg)
        except Exception as err:  # general error
            msg = "Unknown exception while fetching requests from ReqMgr2. Error: %s", str(
                err)
            self.logger.exception(msg)
            self.updateReportDict(summary, "error", msg)

        try:
            self.updateCaches()
            self.updateReportDict(summary, "total_num_campaigns",
                                  len(self.campaigns))
            self.updateReportDict(summary, "nodes_out_of_space",
                                  list(self.rseQuotas.getOutOfSpaceRSEs()))
        except RuntimeWarning as ex:
            msg = "All retries exhausted! Last error was: '%s'" % str(ex)
            msg += "\nRetrying to update caches again in the next cycle."
            self.logger.error(msg)
            self.updateReportDict(summary, "error", msg)
            return summary
        except Exception as ex:
            msg = "Unknown exception updating caches. Error: %s" % str(ex)
            self.logger.exception(msg)
            self.updateReportDict(summary, "error", msg)
            return summary

        # process all requests
        for reqSlice in grouper(requestRecords, 100):
            self.logger.info("Processing workflows from %d to %d.",
                             counterWorkflows + 1,
                             counterWorkflows + len(reqSlice))
            # get complete requests information
            # based on Unified Transferor logic
            reqResults = self.reqInfo(reqSlice)
            self.logger.info("%d requests information completely processed.",
                             len(reqResults))

            for wflow in reqResults:
                if not self.verifyCampaignExist(wflow):
                    counterProblematicRequests += 1
                    continue

                # first, check whether any pileup dataset is already in place
                self.checkPUDataLocation(wflow)
                if wflow.getSecondarySummary() and not wflow.getPURSElist():
                    # then we still have pileup to be transferred, but with incorrect locations
                    msg = "Workflow: %s cannot proceed due to some PU misconfiguration. Check previous logs..."
                    self.logger.critical(msg, wflow.getName())
                    # FIXME: this needs to be logged somewhere and workflow be set to failed
                    counterProblematicRequests += 1
                    continue

                # now check where input primary and parent blocks will need to go
                self.checkDataLocation(wflow)

                try:
                    success, transfers = self.makeTransferRequest(wflow)
                except Exception as ex:
                    success = False
                    msg = "Unknown exception while making Transfer Request for %s " % wflow.getName(
                    )
                    msg += "\tError: %s" % str(ex)
                    self.logger.exception(msg)
                if success:
                    self.logger.info(
                        "Transfer requests successful for %s. Summary: %s",
                        wflow.getName(), pformat(transfers)
                    )  # then create a document in ReqMgr Aux DB
                    if self.createTransferDoc(wflow.getName(), transfers):
                        self.logger.info(
                            "Transfer document successfully created in CouchDB for: %s",
                            wflow.getName())
                        # then move this request to staging status
                        self.change(wflow.getName(), 'staging',
                                    self.__class__.__name__)
                        counterSuccessRequests += 1
                    else:
                        counterFailedRequests += 1
                else:
                    counterFailedRequests += 1
            # it can go slightly beyond the limit. It's evaluated for every slice
            if counterSuccessRequests >= self.msConfig["limitRequestsPerCycle"]:
                msg = "Transferor succeeded acting on %d workflows in this cycle. " % counterSuccessRequests
                msg += "Which exceeds the configuration limit set to: %s" % self.msConfig[
                    "limitRequestsPerCycle"]
                self.logger.info(msg)
                break
            counterWorkflows += len(reqSlice)

        self.logger.info("Summary for this cycle is:")
        self.logger.info("    * there were %d problematic requests;",
                         counterProblematicRequests)
        self.logger.info("    * there were %d failed requests;",
                         counterFailedRequests)
        self.logger.info("    * there were %d successful requests;",
                         counterSuccessRequests)
        self.logger.info("    * a total of %d datasets were subscribed;",
                         self.dsetCounter)
        self.logger.info("    * a total of %d blocks were subscribed.",
                         self.blockCounter)
        self.updateReportDict(summary, "success_request_transition",
                              counterSuccessRequests)
        self.updateReportDict(summary, "failed_request_transition",
                              counterFailedRequests)
        self.updateReportDict(summary, "problematic_requests",
                              counterProblematicRequests)
        self.updateReportDict(summary, "num_datasets_subscribed",
                              self.dsetCounter)
        self.updateReportDict(summary, "num_blocks_subscribed",
                              self.blockCounter)
        self.updateReportDict(summary, "nodes_out_of_space",
                              list(self.rseQuotas.getOutOfSpaceRSEs()))
        return summary
Exemplo n.º 37
0
    def execute(self, emulator=None):
        """
        _execute_

        """
        scramCommand = self.step.application.setup.scramCommand
        scramArch = self.step.application.setup.scramArch
        cmsswVersion = self.step.application.setup.cmsswVersion

        scramArch = getSingleScramArch(scramArch)

        # Are we using emulators again?
        if emulator is not None:
            return emulator.emulate(self.step, self.job)

        overrides = {}
        if hasattr(self.step, 'override'):
            overrides = self.step.override.dictionary_()

        # Set wait to over an hour
        waitTime = overrides.get('waitTime', 3600 + (self.step.retryDelay * self.step.retryCount))

        # hardcode CERN Castor T0_CH_CERN_MSS stageout parameters
        castorStageOutParams = {}
        castorStageOutParams['command'] = overrides.get('command', "xrdcp")
        castorStageOutParams['option'] = overrides.get('option', "--wma-cerncastor")
        castorStageOutParams['phedex-node'] = overrides.get('phedex-node', "T2_CH_CERN")
        castorStageOutParams['lfn-prefix'] = overrides.get('lfn-prefix', "root://castorcms.cern.ch//castor/cern.ch/cms")

        # hardcode CERN EOS T2_CH_CERN stageout parameters
        eosStageOutParams = {}
        eosStageOutParams['command'] = overrides.get('command', "xrdcp")
        eosStageOutParams['option'] = overrides.get('option', "--wma-disablewriterecovery")
        eosStageOutParams['phedex-node'] = overrides.get('phedex-node', "T2_CH_CERN")
        eosStageOutParams['lfn-prefix'] = overrides.get('lfn-prefix', "root://eoscms.cern.ch//eos/cms")

        try:
            castorStageOutMgr = StageOutMgr(**castorStageOutParams)
            eosStageOutMgr = StageOutMgr(**eosStageOutParams)
            stageInMgr = StageInMgr()
            deleteMgr = DeleteMgr()
        except Exception as ex:
            msg = "Unable to load StageOut/Delete Impl: %s" % str(ex)
            logging.error(msg)
            raise WMExecutionFailure(60312, "MgrImplementationError", msg)

        # prepare output tar file
        taskName = self.report.getTaskName().split('/')[-1]
        host = socket.gethostname().split('.')[0]
        tarName = '%s-%s-%s-%i-logs.tar' % (self.report.data.workload, taskName, host, self.job["counter"])
        tarLocation = os.path.join(self.stepSpace.location, tarName)

        # Supported by any release beyond CMSSW_8_X, however DaviX is broken until CMSSW_10_4_X
        # see: https://github.com/cms-sw/cmssw/issues/25292
        useEdmCopyUtil = True
        if isCMSSWSupported(cmsswVersion, "CMSSW_10_4_0"):
            pass
        elif scramArch.startswith('slc6_amd64_'):
            msg = "CMSSW too old or not fully functional to support edmCopyUtil, using CMSSW_10_4_0 instead"
            logging.warning(msg)
            cmsswVersion = "CMSSW_10_4_0"
            scramArch = "slc6_amd64_gcc700"
        else:
            useEdmCopyUtil = False
        logging.info("Using edmCopyUtil: %s", useEdmCopyUtil)

        # setup Scram needed to run edmCopyUtil
        if useEdmCopyUtil:
            scram = Scram(
                command=scramCommand,
                version=cmsswVersion,
                initialise=self.step.application.setup.softwareEnvironment,
                directory=self.step.builder.workingDir,
                architecture=scramArch,
            )
            logging.info("Running scram")
            try:
                projectOutcome = scram.project()
            except Exception as ex:
                msg = "Exception raised while running scram.\n"
                msg += str(ex)
                logging.critical("Error running SCRAM")
                logging.critical(msg)
                raise WMExecutionFailure(50513, "ScramSetupFailure", msg)

            if projectOutcome > 0:
                msg = scram.diagnostic()
                logging.critical("Error running SCRAM")
                logging.critical(msg)
                raise WMExecutionFailure(50513, "ScramSetupFailure", msg)
            runtimeOutcome = scram.runtime()
            if runtimeOutcome > 0:
                msg = scram.diagnostic()
                logging.critical("Error running SCRAM")
                logging.critical(msg)
                raise WMExecutionFailure(50513, "ScramSetupFailure", msg)

        # iterate through input files
        localLogs = []
        deleteLogArchives = []
        if useEdmCopyUtil:
            numberOfFilesPerCopy = 10
        else:
            numberOfFilesPerCopy = 1
        for logs in grouper(self.job["input_files"], numberOfFilesPerCopy):

            copyCommand = "env X509_USER_PROXY=%s edmCopyUtil" % os.environ.get('X509_USER_PROXY', None)
            for log in logs:
                copyCommand += " %s" % log['lfn']
            copyCommand += " %s" % self.step.builder.workingDir

            # give up after timeout of 1 minute per input file
            signal.signal(signal.SIGALRM, alarmHandler)
            signal.alarm(60 * numberOfFilesPerCopy)

            filesCopied = False
            try:
                if useEdmCopyUtil:
                    logging.info("Running edmCopyUtil")
                    retval = scram(copyCommand)
                    if retval == 0:
                        filesCopied = True
                else:
                    logging.info("Running stageIn")
                    for log in logs:
                        fileInfo = {"LFN": log['lfn']}
                        logArchive = stageInMgr(**fileInfo)
                        if logArchive:
                            filesCopied = True
            except Alarm:
                logging.error("Indefinite hang during edmCopyUtil/stageIn of logArchives")
            except StageOutFailure:
                logging.error("Unable to stageIn logArchives")
            except Exception:
                raise

            signal.alarm(0)

            if filesCopied:
                for log in logs:
                    localLogs.append(os.path.join(self.step.builder.workingDir, os.path.basename(log['lfn'])))
                    deleteLogArchives.append(log)
                    self.report.addInputFile(sourceName="logArchives", lfn=log['lfn'])
            else:
                logging.error("Unable to copy logArchives to local disk")
                for log in logs:
                    self.report.addSkippedFile(log['lfn'], None)

        # create tarfile if any logArchive copied in
        if localLogs:
            tarFile = tarfile.open(tarLocation, 'w:')
            for log in localLogs:
                path = log.split('/')
                tarFile.add(name=log,
                            arcname=os.path.join(path[-3],
                                                 path[-2],
                                                 path[-1]))
                os.remove(log)
            tarFile.close()
        else:
            msg = "Unable to copy any logArchives to local disk"
            logging.error(msg)
            raise WMExecutionFailure(60312, "LogCollectError", msg)

        # now staging out the LogCollect tarfile
        logging.info("Staging out LogCollect tarfile to Castor and EOS")
        now = datetime.datetime.now()
        lfn = "/store/logs/prod/%i/%.2i/%s/%s/%s" % (now.year, now.month, "WMAgent",
                                                     self.report.data.workload,
                                                     os.path.basename(tarLocation))

        tarInfo = {'LFN': lfn,
                   'PFN': tarLocation,
                   'PNN': None,
                   'GUID': None}

        # perform mandatory stage out to CERN Castor
        signal.signal(signal.SIGALRM, alarmHandler)
        signal.alarm(waitTime)
        try:
            castorStageOutMgr(tarInfo)
        except Alarm:
            msg = "Indefinite hang during stageOut of LogCollect to Castor"
            logging.error(msg)
            raise WMExecutionFailure(60409, "LogCollectTimeout", msg)
        except Exception as ex:
            msg = "Unable to stageOut LogCollect to Castor:\n"
            msg += str(ex)
            logging.error(msg)
            raise WMExecutionFailure(60408, "LogCollectStageOutError", msg)
        signal.alarm(0)

        # add to job report
        self.report.addOutputFile(outputModule="LogCollect", aFile=tarInfo)
        outputRef = getattr(self.report.data, self.stepName)
        outputRef.output.pfn = tarInfo['PFN']
        outputRef.output.location = tarInfo['PNN']
        outputRef.output.lfn = tarInfo['LFN']

        tarInfo = {'LFN': lfn,
                   'PFN': tarLocation,
                   'PNN': None,
                   'GUID': None}

        # then, perform best effort stage out to CERN EOS
        signal.signal(signal.SIGALRM, alarmHandler)
        signal.alarm(waitTime)
        try:
            eosStageOutMgr(tarInfo)
        except Alarm:
            logging.error("Indefinite hang during stageOut of LogCollect to EOS")
        except Exception as ex:
            logging.error("Unable to stageOut LogCollect to EOS: %s", ex)
        signal.alarm(0)

        # we got this far, delete input
        for log in deleteLogArchives:

            # give up after timeout of 1 minutes
            signal.signal(signal.SIGALRM, alarmHandler)
            signal.alarm(60)
            try:
                fileToDelete = {'LFN': log['lfn'],
                                'PFN': None,
                                'PNN': None,
                                'StageOutCommand': None}
                deleteMgr(fileToDelete=fileToDelete)
            except Alarm:
                logging.error("Indefinite hang during delete of logArchive")
            except Exception as ex:
                logging.error("Unable to delete logArchive: %s", ex)
            signal.alarm(0)

        return
Exemplo n.º 38
0
    def processData(self,
                    sqlstmt,
                    binds={},
                    conn=None,
                    transaction=False,
                    returnCursor=False):
        """
        set conn if you already have an active connection to reuse
        set transaction = True if you already have an active transaction

        """
        connection = None
        try:
            if not conn:
                connection = self.connection()
            else:
                connection = conn

            result = []
            # Can take either a single statement or a list of statements and binds
            sqlstmt = self.makelist(sqlstmt)
            binds = self.makelist(binds)
            if len(sqlstmt) > 0 and (len(binds) == 0 or
                                     (binds[0] == {} or binds[0] == None)):
                # Should only be run by create statements
                if not transaction:
                    #WMCore.WMLogging.sqldebug("transaction created in DBInterface")
                    trans = connection.begin()

                for i in sqlstmt:
                    r = self.executebinds(i,
                                          connection=connection,
                                          returnCursor=returnCursor)
                    result.append(r)

                if not transaction:
                    trans.commit()
            elif len(binds) > len(sqlstmt) and len(sqlstmt) == 1:
                #Run single SQL statement for a list of binds - use execute_many()
                if not transaction:
                    trans = connection.begin()
                for subBinds in grouper(binds, self.maxBindsPerQuery):
                    result.extend(
                        self.executemanybinds(sqlstmt[0],
                                              subBinds,
                                              connection=connection,
                                              returnCursor=returnCursor))

                if not transaction:
                    trans.commit()
            elif len(binds) == len(sqlstmt):
                # Run a list of SQL for a list of binds
                if not transaction:
                    trans = connection.begin()

                for i, s in enumerate(sqlstmt):
                    b = binds[i]

                    r = self.executebinds(s,
                                          b,
                                          connection=connection,
                                          returnCursor=returnCursor)
                    result.append(r)

                if not transaction:
                    trans.commit()
            else:
                self.logger.exception(
                    "DBInterface.processData Nothing executed, problem with your arguments"
                )
                self.logger.exception("DBInterface.processData SQL = %s" %
                                      sqlstmt)
                WMCore.WMLogging.sqldebug(
                    'DBInterface.processData  sql is %s items long' %
                    len(sqlstmt))
                WMCore.WMLogging.sqldebug(
                    'DBInterface.processData  binds are %s items long' %
                    len(binds))
                assert_value = False
                if len(binds) == len(sqlstmt):
                    assert_value = True
                WMCore.WMLogging.sqldebug(
                    'DBInterface.processData are binds and sql same length? : %s'
                    % (assert_value))
                WMCore.WMLogging.sqldebug(
                    'sql: %s\n binds: %s\n, connection:%s\n, transaction:%s\n'
                    % (sqlstmt, binds, connection, transaction))
                WMCore.WMLogging.sqldebug(
                    'type check:\nsql: %s\n binds: %s\n, connection:%s\n, transaction:%s\n'
                    % (type(sqlstmt), type(binds), type(connection),
                       type(transaction)))
                raise Exception(
                    """DBInterface.processData Nothing executed, problem with your arguments
                Probably mismatched sizes for sql (%i) and binds (%i)""" %
                    (len(sqlstmt), len(binds)))
        finally:
            if not conn and connection != None:
                connection.close()  # Return connection to the pool
        return result
Exemplo n.º 39
0
    def processData(self, sqlstmt, binds={}, conn=None,
                    transaction=False, returnCursor=False):
        """
        set conn if you already have an active connection to reuse
        set transaction = True if you already have an active transaction

        """
        connection = None
        try:
            if not conn:
                connection = self.connection()
            else:
                connection = conn

            result = []
            # Can take either a single statement or a list of statements and binds
            sqlstmt = self.makelist(sqlstmt)
            binds = self.makelist(binds)
            if len(sqlstmt) > 0 and (len(binds) == 0 or (binds[0] == {} or binds[0] == None)):
                # Should only be run by create statements
                if not transaction:
                    #WMCore.WMLogging.sqldebug("transaction created in DBInterface")
                    trans = connection.begin()

                for i in sqlstmt:
                    r = self.executebinds(i, connection=connection,
                                          returnCursor=returnCursor)
                    result.append(r)

                if not transaction:
                    trans.commit()
            elif len(binds) > len(sqlstmt) and len(sqlstmt) == 1:
                #Run single SQL statement for a list of binds - use execute_many()
                if not transaction:
                    trans = connection.begin()
                for subBinds in grouper(binds, self.maxBindsPerQuery):
                    result.extend(self.executemanybinds(sqlstmt[0], subBinds,
                                                        connection=connection, returnCursor=returnCursor))

                if not transaction:
                    trans.commit()
            elif len(binds) == len(sqlstmt):
                # Run a list of SQL for a list of binds
                if not transaction:
                    trans = connection.begin()

                for i, s in enumerate(sqlstmt):
                    b = binds[i]

                    r = self.executebinds(s, b, connection=connection,
                                          returnCursor=returnCursor)
                    result.append(r)

                if not transaction:
                    trans.commit()
            else:
                self.logger.exception(
                    "DBInterface.processData Nothing executed, problem with your arguments")
                self.logger.exception(
                    "DBInterface.processData SQL = %s" % sqlstmt)
                WMCore.WMLogging.sqldebug('DBInterface.processData  sql is %s items long' % len(sqlstmt))
                WMCore.WMLogging.sqldebug('DBInterface.processData  binds are %s items long' % len(binds))
                assert_value = False
                if len(binds) == len(sqlstmt):
                    assert_value = True
                WMCore.WMLogging.sqldebug('DBInterface.processData are binds and sql same length? : %s' % (assert_value))
                WMCore.WMLogging.sqldebug('sql: %s\n binds: %s\n, connection:%s\n, transaction:%s\n' %
                                           (sqlstmt, binds, connection, transaction))
                WMCore.WMLogging.sqldebug('type check:\nsql: %s\n binds: %s\n, connection:%s\n, transaction:%s\n' %
                                           (type(sqlstmt), type(binds), type(connection), type(transaction)))
                raise Exception("""DBInterface.processData Nothing executed, problem with your arguments
                Probably mismatched sizes for sql (%i) and binds (%i)""" % (len(sqlstmt), len(binds)))
        finally:
            if not conn and connection != None:
                connection.close() # Return connection to the pool
        return result
Exemplo n.º 40
0
    def listFilesInBlockWithParents(self,
                                    fileBlockName,
                                    lumis=True,
                                    validFileOnly=1):
        """
        _listFilesInBlockWithParents_

        Get a list of files in the named fileblock including
        the parents of that file.
        TODO: lumis can be false when lumi splitting is not required
        However WMBSHelper expect file['LumiList'] to get the run number
        so for now it will be always true.

        """
        if not self.blockExists(fileBlockName):
            msg = "DBSReader.listFilesInBlockWithParents(%s): No matching data"
            raise DBSReaderError(msg % fileBlockName)

        try:
            # TODO: shoud we get only valid block for this?
            files = self.dbs.listFileParents(block_name=fileBlockName)
            fileDetails = self.listFilesInBlock(fileBlockName, lumis,
                                                validFileOnly)

        except dbsClientException as ex:
            msg = "Error in "
            msg += "DBSReader.listFilesInBlockWithParents(%s)\n" % (
                fileBlockName, )
            msg += "%s\n" % formatEx3(ex)
            raise DBSReaderError(msg)

        childByParents = defaultdict(list)
        for f in files:
            # Probably a child can have more than 1 parent file
            for fp in f['parent_logical_file_name']:
                childByParents[fp].append(f['logical_file_name'])

        parentsLFNs = list(childByParents)

        if len(parentsLFNs) == 0:
            msg = "Error in "
            msg += "DBSReader.listFilesInBlockWithParents(%s)\n There is no parents files" % (
                fileBlockName)
            raise DBSReaderError(msg)

        parentFilesDetail = []
        # TODO: slicing parentLFNs util DBS api is handling that.
        # Remove slicing if DBS api handles
        for pLFNs in grouper(parentsLFNs, 50):
            parentFilesDetail.extend(
                self.dbs.listFileArray(logical_file_name=pLFNs, detail=True))

        if lumis:
            parentLumis = self._getLumiList(lfns=parentsLFNs)

        parentsByLFN = defaultdict(list)

        for pf in parentFilesDetail:
            parentLFN = pf['logical_file_name']
            dbsFile = remapDBS3Keys(pf, stringify=True)
            if lumis:
                dbsFile["LumiList"] = parentLumis[parentLFN]

            for childLFN in childByParents[parentLFN]:
                parentsByLFN[childLFN].append(dbsFile)

        for fileInfo in fileDetails:
            fileInfo["ParentList"] = parentsByLFN[
                fileInfo['logical_file_name']]

        return fileDetails
Exemplo n.º 41
0
    def submit(self, jobs, info=None):
        """
        _submit_


        Submit jobs for one subscription
        """

        # If we're here, then we have submitter components
        self.scriptFile = self.config.JobSubmitter.submitScript
        self.submitDir = self.config.JobSubmitter.submitDir
        timeout = getattr(self.config.JobSubmitter, 'getTimeout', 400)

        successfulJobs = []
        failedJobs = []
        jdlFiles = []

        if len(jobs) == 0:
            # Then was have nothing to do
            return successfulJobs, failedJobs

        if len(self.pool) == 0:
            # Starting things up
            # This is obviously a submit API
            logging.info("Starting up PyCondorPlugin worker pool")
            self.inputQueue = multiprocessing.Queue()
            self.result = multiprocessing.Queue()
            for x in range(self.nProcess):
                p = multiprocessing.Process(target=submitWorker,
                                            args=(self.inputQueue, self.result, timeout))
                p.start()
                self.pool.append(p)

        if not os.path.exists(self.submitDir):
            os.makedirs(self.submitDir)

        # Submit the jobs
        nSubmits = 0
        queueError = False
        for jobsReady in grouper(jobs, self.jobsPerWorker):

            if queueError:
                # If the queue has failed, then we must not process any more jobs this cycle.
                break

            idList = [x['id'] for x in jobsReady]
            jdlList = self.makeSubmit(jobList=jobsReady)
            if not jdlList:
                # Then we got nothing
                logging.error("No JDL file made!")
                return {'NoResult': [0]}
            jdlFile = "%s/submit_%i_%i.jdl" % (self.submitDir, os.getpid(), idList[0])

            with open(jdlFile, 'w') as handle:
                handle.writelines(jdlList)
            jdlFiles.append(jdlFile)

            # Now submit them
            logging.info("About to submit %i jobs", len(jobsReady))
            if self.glexecPath:
                command = 'CS=`which condor_submit`; '
                if self.glexecWrapScript:
                    command += 'export GLEXEC_ENV=`%s 2>/dev/null`; ' % self.glexecWrapScript
                command += 'export GLEXEC_CLIENT_CERT=%s; ' % self.glexecProxyFile
                command += 'export GLEXEC_SOURCE_PROXY=%s; ' % self.glexecProxyFile
                command += 'export X509_USER_PROXY=%s; ' % self.glexecProxyFile
                command += 'export GLEXEC_TARGET_PROXY=%s; ' % self.jdlProxyFile
                if self.glexecUnwrapScript:
                    command += '%s %s -- $CS %s' % (self.glexecPath, self.glexecUnwrapScript, jdlFile)
                else:
                    command += '%s $CS %s' % (self.glexecPath, jdlFile)
            else:
                command = "condor_submit %s" % jdlFile

            try:
                self.inputQueue.put({'command': command, 'idList': idList})
            except AssertionError as ex:
                msg = "Critical error: input pipeline probably closed.\n"
                msg += str(ex)
                msg += "Error Procedure: Something critical has happened in the worker process\n"
                msg += "We will now proceed to pull all useful data from the queue (if it exists)\n"
                msg += "Then refresh the worker pool\n"
                logging.error(msg)
                queueError = True
                break
            nSubmits += 1

        # Now we should have sent all jobs to be submitted
        # Going to do the rest of it now
        for dummy in range(nSubmits):
            try:
                res = self.result.get(block=True, timeout=timeout)
            except Queue.Empty:
                # If the queue was empty go to the next submit
                # Those jobs have vanished
                logging.error("Queue.Empty error received!")
                logging.error("This could indicate a critical condor error!")
                logging.error("However, no information of any use was obtained due to process failure.")
                logging.error("Either process failed, or process timed out after %s seconds.", timeout)
                continue
            except AssertionError as ex:
                msg = "Found Assertion error while retrieving output from worker process.\n"
                msg += str(ex)
                msg += "This indicates something critical happened to a worker process"
                msg += "We will recover what jobs we know were submitted, and resubmit the rest"
                msg += "Refreshing worker pool at end of loop"
                logging.error(msg)
                continue

            try:
                dummyOut = res['stdout']
                error = res['stderr']
                idList = res['idList']
                exitCode = res['exitCode']
            except KeyError as ex:
                msg = "Error in finding key from result pipe\n"
                msg += "Something has gone critically wrong in the worker\n"
                try:
                    msg += "Result: %s\n" % str(res)
                except Exception:
                    pass
                msg += str(ex)
                logging.error(msg)
                continue

            if exitCode != 0:
                logging.error("Condor returned non-zero.  Printing out command stderr")
                logging.error(error)
                errorCheck, errorMsg = parseError(error=error)
                logging.error("Processing failed jobs and proceeding to the next jobs.")
                logging.error("Do not restart component.")
            else:
                errorCheck = None

            if errorCheck:
                self.errorCount += 1
                condorErrorReport = Report()
                condorErrorReport.addError("JobSubmit", 61202, "CondorError", errorMsg)
                for jobID in idList:
                    for job in jobs:
                        if job.get('id', None) == jobID:
                            job['fwjr'] = condorErrorReport
                            failedJobs.append(job)
                            break
            else:
                if self.errorCount > 0:
                    self.errorCount -= 1
                for jobID in idList:
                    for job in jobs:
                        if job.get('id', None) == jobID:
                            successfulJobs.append(job)
                            break

            # If we get a lot of errors in a row it's probably time to
            # stop the component
            if self.errorCount > self.errorThreshold:
                msg = "Exceeded errorThreshold while submitting to condor. Check condor status."
                logging.error(msg)
                raise BossAirPluginException(msg)

        # Remove JDL files unless commanded otherwise
        if self.deleteJDLFiles:
            for f in jdlFiles:
                os.remove(f)

        # When we're finished, clean up the queue workers in order
        # to free up memory (in the midst of the process, the forked
        # memory space shouldn't be touched, so it should still be
        # shared, but after this point any action by the Submitter will
        # result in memory duplication).
        logging.info("Purging worker pool to clean up memory")
        self.close()

        # We must return a list of jobs successfully submitted,
        # and a list of jobs failed
        logging.info("Done submitting jobs for this cycle in PyCondorPlugin")
        return successfulJobs, failedJobs