def testGrouper(self):
    """
    Test the grouper function (returns chunks of an iterable)
    """
    listChunks = [i for i in grouper(list(range(0, 7)), 3)]  # want list(range) for python 3
    iterChunks = [i for i in grouper(xrange(0, 7), 3)]  # xrange becomes range in python 3
    for a, b in itertools.izip_longest(listChunks, iterChunks):
        self.assertEqual(a, b)
    self.assertEqual(listChunks[-1], [6])
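# For reference, a minimal grouper() consistent with what the test above
# asserts (fixed-size chunks from any iterable, with a shorter final chunk).
# This is a sketch, not necessarily the exact WMCore implementation.
import itertools

def grouper(iterable, n):
    """Yield successive lists of up to n items from any iterable."""
    iterator = iter(iterable)
    while True:
        chunk = list(itertools.islice(iterator, n))
        if not chunk:
            return
        yield chunk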
def archiveJobs(self):
    """
    _archiveJobs_

    archiveJobs will handle the master task of looking for finished jobs,
    and running the code that cleans them out.
    """
    doneList = self.findFinishedJobs()
    logging.info("Found %i finished jobs to archive", len(doneList))

    jobCounter = 0
    for slicedList in grouper(doneList, 10000):
        self.cleanWorkArea(slicedList)

        successList = []
        failList = []
        killList = []
        for job in slicedList:
            if job["outcome"] == "success":
                successList.append(job)
            elif job["outcome"] == "killed":
                killList.append(job)
            else:
                failList.append(job)

        myThread = threading.currentThread()
        myThread.transaction.begin()
        self.changeState.propagate(successList, "cleanout", "success")
        self.changeState.propagate(failList, "cleanout", "exhausted")
        self.changeState.propagate(killList, "cleanout", "killed")
        myThread.transaction.commit()

        jobCounter += len(slicedList)
        logging.info("Successfully archived %d jobs out of %d.", jobCounter, len(doneList))
def main():
    # FIXME update the workflow name here
    wf = "mcremone_task_EXO-RunIISummer15wmLHEGS-04802__v1_T_170811_181808_305"

    print("Looking for problematic inbox elements...")
    wq = WorkQueue("https://cmsweb.cern.ch/couchdb/workqueue")
    print("Workqueue config: server %s and db %s" % (wq.server.url, wq.db.name))

    nonCancelableElements = ['Done', 'Canceled', 'Failed']
    data = wq.db.loadView('WorkQueue', 'elementsDetailByWorkflowAndStatus',
                          {'startkey': [wf], 'endkey': [wf, {}], 'reduce': False})
    elements = [x['id'] for x in data.get('rows', []) if x['key'][1] not in nonCancelableElements]
    print("Found %d elements for wf %s" % (len(elements), wf))

    total = 0
    for eleSlice in grouper(elements, 100):
        try:
            wq.updateElements(*eleSlice, Status='CancelRequested')
        except Exception as ex:
            print("Exception happened, but keep going: %s" % str(ex))
        else:
            # count the actual slice size; the last slice may hold fewer than 100 elements
            total += len(eleSlice)
    print("Elements updated: %s" % total)
    print("Done!")
    sys.exit(0)
def algorithm(self, parameters):
    """
    get information from wmbs, workqueue and local couch
    """
    try:
        data = self.fwjrAPI.getFWJRByArchiveStatus('ready', limit=self.numDocsRetrievePerPolling)['rows']
        logging.info("Found %i not-yet-archived documents from FWJR db to upload to WMArchive.", len(data))

        for slicedData in grouper(data, self.numDocsUploadPerCall):
            jobIDs = []
            archiveDocs = []
            for job in slicedData:
                doc = createArchiverDoc(job)
                archiveDocs.append(doc)
                jobIDs.append(job["id"])

            response = self.wmarchiver.archiveData(archiveDocs)

            # Partial success is not allowed: either all the inserts are successful or none is
            if response[0]['status'] == "ok" and len(response[0]['ids']) == len(jobIDs):
                archiveIDs = response[0]['ids']
                for docID in jobIDs:
                    self.fwjrAPI.updateArchiveUploadedStatus(docID)
                logging.info("...successfully uploaded %d docs", len(jobIDs))
                logging.debug("JobIDs uploaded: %s", jobIDs)
                logging.debug("Archived IDs returned: %s", archiveIDs)
            else:
                logging.warning("Upload failed and it will be retried in the next cycle: %s: %s.",
                                response[0]['status'], response[0]['reason'])
                logging.debug("failed JobIds %s", jobIDs)
    except Exception as ex:
        logging.error("Error occurred, will retry later:")
        logging.error(str(ex))
        logging.error("Trace back: \n%s", traceback.format_exc())
def _getLatestJobInfo(self, keys):
    """
    Given a list of lists as keys, in the format of:
        [['request_name', 'agent_url'], ['request_name2', 'agent_url2'], ...]
    The result format from the latestRequest view is:
        {u'offset': 527,
         u'rows': [{u'doc': {u'_rev': u'32-6027014210', ...
                    u'id': u'cmsgwms-submit6.fnal.gov-cmsunified_ACDC0_task_BTV-RunIISummer19UL18wmLHEGEN-00004__v1_T_200507_162125_3670',
                    u'key': [u'cmsunified_ACDC0_task_BTV-RunIISummer19UL18wmLHEGEN-00004__v1_T_200507_162125_3670',
                             u'cmsgwms-submit6.fnal.gov'],
                    u'value': None}],
         u'total_rows': 49606}
    """
    if not keys:
        return []
    options = {}
    options["include_docs"] = True
    options["reduce"] = False
    finalResults = {}
    # magic number: 5000 keys (need to check which number is optimal)
    for sliceKeys in grouper(keys, 5000):
        self.logger.info("Querying latestRequest with %d keys", len(sliceKeys))
        result = self._getCouchView("latestRequest", options, sliceKeys)
        if not finalResults and result:
            finalResults = result
        elif result.get('rows'):
            finalResults['rows'].extend(result['rows'])
    return finalResults
def execute(self, file=None, runs=None, conn=None, transaction=False):
    results = []
    for sliceBinds in grouper(self.getBinds(file, runs), 10000):
        # accumulate across slices; returning only the last processData()
        # result would silently drop everything but the final 10000 binds
        results.extend(self.dbi.processData(self.sql, sliceBinds, conn=conn,
                                            transaction=transaction))
    return self.format(results)
def getBlocksByDsetAndRun(datasetName, runList, dbsUrl):
    """
    Given a dataset name and a list of runs, find all the blocks
    :return: flat list of blocks
    """
    blocks = set()
    if isinstance(runList, set):
        runList = list(runList)

    urls = []
    for runSlice in grouper(runList, 50):
        urls.append('%s/blocks?run_num=%s&dataset=%s' % (dbsUrl, str(runSlice).replace(" ", ""), datasetName))
    logging.info("Executing %d requests against DBS 'blocks' API, with run_num list", len(urls))
    data = multi_getdata(urls, ckey(), cert())

    for row in data:
        dataset = row['url'].rsplit('=')[-1]
        if row['data'] is None:
            msg = "Failure in getBlocksByDsetAndRun for %s. Error: %s %s" % (dataset,
                                                                             row.get('code'),
                                                                             row.get('error'))
            raise RuntimeError(msg)
        rows = json.loads(row['data'])
        for item in rows:
            blocks.add(item['block_name'])
    return list(blocks)
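# Illustration only (hypothetical dbsUrl/dataset/run values): each 50-run
# slice produced by grouper() above is rendered as a compact Python list
# literal inside the query string, spaces stripped.
dbsUrl = "https://cmsweb.cern.ch/dbs/prod/global/DBSReader"
runSlice = [273158, 273302]
url = '%s/blocks?run_num=%s&dataset=%s' % (dbsUrl, str(runSlice).replace(" ", ""), "/Prim/Proc/RAW")
# -> https://cmsweb.cern.ch/dbs/prod/global/DBSReader/blocks?run_num=[273158,273302]&dataset=/Prim/Proc/RAW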
def _getLumiList(self, blockName=None, lfns=None, validFileOnly=1):
    """
    currently takes only one lfn, but the dbs api needs to be updated
    """
    try:
        if blockName:
            lumiLists = self.dbs.listFileLumis(block_name=blockName, validFileOnly=validFileOnly)
        elif lfns:
            lumiLists = []
            for slfn in grouper(lfns, 50):
                lumiLists.extend(self.dbs.listFileLumiArray(logical_file_name=slfn))
        else:
            # shouldn't call this with both blockName and lfns empty,
            # but still return an empty dict for that case
            return {}
    except dbsClientException as ex:
        msg = "Error in "
        msg += "DBSReader.listFileLumiArray(%s)\n" % lfns
        msg += "%s\n" % formatEx3(ex)
        raise DBSReaderError(msg)

    lumiDict = {}
    for lumisItem in lumiLists:
        lumiDict.setdefault(lumisItem['logical_file_name'], [])
        item = {}
        item["RunNumber"] = lumisItem['run_num']
        item['LumiSectionNumber'] = lumisItem['lumi_section_num']
        if lumisItem.get('event_count', None) is not None:
            item['EventCount'] = lumisItem['event_count']
        lumiDict[lumisItem['logical_file_name']].append(item)
    # TODO: add key for lumi and event pair.
    return lumiDict
def _getLumiList(self, blockName=None, lfns=None, validFileOnly=1):
    """
    currently takes only one lfn, but the dbs api needs to be updated
    """
    try:
        if blockName:
            lumiLists = self.dbs.listFileLumis(block_name=blockName, validFileOnly=validFileOnly)
        elif lfns:
            lumiLists = []
            for slfn in grouper(lfns, 50):
                lumiLists.extend(self.dbs.listFileLumiArray(logical_file_name=slfn))
        else:
            # shouldn't call this with both blockName and lfns empty,
            # but still return an empty dict for that case
            return {}
    except dbsClientException as ex:
        msg = "Error in "
        msg += "DBSReader.listFileLumiArray(%s)\n" % lfns
        msg += "%s\n" % formatEx3(ex)
        raise DBSReaderError(msg)

    lumiDict = {}
    for lumisItem in lumiLists:
        lumiDict.setdefault(lumisItem['logical_file_name'], [])
        item = {}
        item["RunNumber"] = lumisItem['run_num']
        item['LumiSectionNumber'] = lumisItem['lumi_section_num']
        lumiDict[lumisItem['logical_file_name']].append(item)
    return lumiDict
def execute(self):
    """
    Executes the whole transferor logic
    :param reqStatus: request status that matters for this module
    :return:
    """
    if not self.prep():
        self.logger.warning("Failed to fetch the latest unified config. Skipping this cycle")
        return
    self.uConfig = self.uConfig[0]

    requestRecords = []
    try:
        # get requests from ReqMgr2 data-service for given status
        requests = self.reqmgr2.getRequestByStatus([self.status], detail=True)
        if requests:
            requests = requests[0]
        self.logger.info("### transferor found %s requests in '%s' state", len(requests), self.status)
        if requests:
            for _, wfData in requests.iteritems():
                requestRecords.append(self.requestRecord(wfData))
    except Exception as err:
        self.logger.exception('### transferor error: %s', str(err))

    if not requestRecords:
        return

    try:
        reqInfo = RequestInfo(self.msConfig, self.uConfig, self.logger)
        for reqSlice in grouper(requestRecords, 50):
            reqResults = reqInfo(reqSlice)
            self.logger.info("%d requests completely processed.", len(reqResults))
            self.logger.info("Working on the data subscription and status change...")
            # process all requests
            for req in reqResults:
                reqName = req['name']
                # perform transfer
                tid = self.transferRequest(req)
                if tid:
                    # Once all transfer requests were successfully made, update: assigned -> staging
                    self.logger.debug("### transfer request for %s successful", reqName)
                    self.change(req, 'staging', '### transferor')
                # if there is nothing to be transferred (no input at all),
                # then update the request status once again: staging -> staged
                # self.change(req, 'staged', '### transferor')
    except Exception as err:  # general error
        self.logger.exception('### transferor error: %s', str(err))
def submit(self, jobs, info=None):
    """
    _submit_

    Submits jobs to the condor queue
    """
    successfulJobs = []
    failedJobs = []

    if len(jobs) == 0:
        # Then we have nothing to do
        return successfulJobs, failedJobs

    schedd = htcondor.Schedd()

    # Submit the jobs
    for jobsReady in grouper(jobs, self.jobsPerSubmit):
        clusterAd = self.getClusterAd()
        procAds = self.getProcAds(jobsReady)

        logging.debug("Start: Submitting %d jobs using Condor Python SubmitMany", len(procAds))
        try:
            # 4th argument has to be None otherwise HTCondor leaks the result ads
            # through it (as of 8.7.x). More info in WMCore/#8729
            clusterId = schedd.submitMany(clusterAd, procAds, False, None)
        except Exception as ex:
            logging.error("SimpleCondorPlugin job submission failed.")
            logging.exception(str(ex))
            logging.error("Moving on to the next batch of jobs and/or cycle....")

            condorErrorReport = Report()
            condorErrorReport.addError("JobSubmit", 61202, "CondorError", str(ex))
            for job in jobsReady:
                job['fwjr'] = condorErrorReport
                failedJobs.append(job)
        else:
            logging.debug("Job submission to condor succeeded, clusterId is %s", clusterId)
            for index, job in enumerate(jobsReady):
                job['gridid'] = "%s.%s" % (clusterId, index)
                job['status'] = 'Idle'
                successfulJobs.append(job)

    # We must return a list of jobs successfully submitted and a list of jobs failed
    logging.info("Done submitting jobs for this cycle in SimpleCondorPlugin")
    return successfulJobs, failedJobs
def submit(self, jobs, info=None):
    """
    _submit_

    Submits jobs to the condor queue
    """
    successfulJobs = []
    failedJobs = []

    if len(jobs) == 0:
        # Then we have nothing to do
        return successfulJobs, failedJobs

    schedd = htcondor.Schedd()

    # Submit the jobs
    for jobsReady in grouper(jobs, self.jobsPerSubmit):
        (sub, jobParams) = self.createSubmitRequest(jobsReady)

        logging.debug("Start: Submitting %d jobs using Condor Python Submit", len(jobParams))
        try:
            with schedd.transaction() as txn:
                submitRes = sub.queue_with_itemdata(txn, 1, iter(jobParams))
                clusterId = submitRes.cluster()
        except Exception as ex:
            logging.error("SimpleCondorPlugin job submission failed.")
            logging.exception(str(ex))
            logging.error("Moving on to the next batch of jobs and/or cycle....")

            condorErrorReport = Report()
            condorErrorReport.addError("JobSubmit", 61202, "CondorError", str(ex))
            for job in jobsReady:
                job['fwjr'] = condorErrorReport
                failedJobs.append(job)
        else:
            logging.debug("Job submission to condor succeeded, clusterId is %s", clusterId)
            for index, job in enumerate(jobsReady):
                job['gridid'] = "%s.%s" % (clusterId, index)
                job['status'] = 'Idle'
                successfulJobs.append(job)

    # We must return a list of jobs successfully submitted and a list of jobs failed
    logging.info("Done submitting jobs for this cycle in SimpleCondorPlugin")
    return successfulJobs, failedJobs
def handleErrors(self):
    """
    Queries DB for failed jobs in each failure state and processes them;
    also handles jobs that have exhausted all their retries.
    """
    # Run over created, submitted and executed job failures
    failure_states = ['create', 'submit', 'job']
    for state in failure_states:
        idList = self.getJobs.execute(state="%sfailed" % state)
        logging.info("Found %d failed jobs in state %sfailed", len(idList), state)
        for jobSlice in grouper(idList, self.maxProcessSize):
            jobList = self.loadJobsFromList(jobSlice)
            self.handleFailedJobs(jobList, state)

    # Run over jobs done with retries
    idList = self.getJobs.execute(state='retrydone')
    logging.info("Found %d jobs done with all retries", len(idList))
    for jobSlice in grouper(idList, self.maxProcessSize):
        jobList = self.loadJobsFromList(jobSlice)
        self.handleRetryDoneJobs(jobList)

    return
def submit(self, jobs, info=None):
    """
    _submit_

    Submit jobs for one subscription
    """
    successfulJobs = []
    failedJobs = []

    if len(jobs) == 0:
        # Then we have nothing to do
        return successfulJobs, failedJobs

    schedd = htcondor.Schedd()

    # Submit the jobs
    for jobsReady in grouper(jobs, self.jobsPerSubmit):
        clusterAd = self.getClusterAd()
        procAds = self.getProcAds(jobsReady)

        logging.debug("Start: Submitting %d jobs using Condor Python SubmitMany", len(procAds))
        try:
            clusterId = schedd.submitMany(clusterAd, procAds)
        except Exception as ex:
            logging.error("SimpleCondorPlugin job submission failed.")
            logging.error("Moving on to the next batch of jobs and/or cycle....")
            logging.exception(ex)

            condorErrorReport = Report()
            condorErrorReport.addError("JobSubmit", 61202, "CondorError", str(ex))
            for job in jobsReady:
                job['fwjr'] = condorErrorReport
                failedJobs.append(job)
        else:
            logging.debug("Finish: Submitting jobs using Condor Python SubmitMany")
            for index, job in enumerate(jobsReady):
                job['gridid'] = "%s.%s" % (clusterId, index)
                job['status'] = 'Idle'
                successfulJobs.append(job)

    # We must return a list of jobs successfully submitted and a list of jobs failed
    logging.info("Done submitting jobs for this cycle in SimpleCondorPlugin")
    return successfulJobs, failedJobs
def createDirectories(dirList):
    """
    Create the directory if everything is sane
    """
    for sdirList in grouper(dirList, 500):
        cmdArgs = ['mkdir']
        cmdArgs.extend(sdirList)
        pipe = Popen(cmdArgs, stdout=PIPE, stderr=PIPE, shell=False)
        stdout, stderr = pipe.communicate()

        if not stderr == "":
            msg = "Error in making directories: %s\n" % stderr
            logging.error(msg)
            logging.debug("Executing command %s\n" % cmdArgs)
            raise CreateWorkAreaException(msg)

    return
def createDirectories(dirList):
    """
    Create the directory if everything is sane
    """
    for sdirList in grouper(dirList, 500):
        cmdArgs = ['mkdir']
        cmdArgs.extend(sdirList)
        pipe = Popen(cmdArgs, stdout=PIPE, stderr=PIPE, shell=False)
        stdout, stderr = pipe.communicate()

        if stderr:
            if hasattr(stderr, "decode"):
                stderr = stderr.decode('utf-8', 'ignore')
            msg = "Error in creating directories: %s\n" % stderr
            logging.error(msg)
            raise CreateWorkAreaException(msg)

    return
def algorithm(self, parameters=None):
    """
    _algorithm_

    Poll WMBS for jobs in the 'Complete' state and then pass them to the
    accountant worker.
    """
    completeJobs = self.getJobsAction.execute(state="complete")
    logging.info("Found %d completed jobs", len(completeJobs))

    if len(completeJobs) == 0:
        logging.debug("No work to do; exiting")
        return

    for jobsSlice in grouper(completeJobs, self.accountantWorkSize):
        try:
            self.accountantWorker(jobsSlice)
        except WMException:
            myThread = threading.currentThread()
            if getattr(myThread, 'transaction', None) is not None:
                myThread.transaction.rollback()
            raise
        except CouchConnectionError as ex:
            msg = "Caught CouchConnectionError exception. Waiting until the next polling cycle.\n"
            msg += str(ex)
            logging.error(msg)
            myThread = threading.currentThread()
            if getattr(myThread, 'transaction', None) is not None:
                myThread.transaction.rollback()
        except Exception as ex:
            myThread = threading.currentThread()
            if getattr(myThread, 'transaction', None) is not None:
                myThread.transaction.rollback()
            msg = "Hit general exception in JobAccountantPoller while using worker.\n"
            msg += str(ex)
            logging.exception(msg)
            raise JobAccountantPollerException(msg)

    return
def updateBulkDocuments(self, doc_ids, paramsToUpdate, updateLimits=1000):
    uri = '/%s/_bulk_docs/' % self.name
    conflictDocIDs = []
    for ids in grouper(doc_ids, updateLimits):
        # get the original documents
        docs = self.allDocs(options={"include_docs": True}, keys=ids)['rows']
        data = {}
        data['docs'] = []
        for j in docs:
            doc = {}
            doc.update(j['doc'])
            nestedDictUpdate(doc, paramsToUpdate)
            data['docs'].append(doc)

        if data['docs']:
            retval = self.post(uri, data)
            for result in retval:
                if result.get('error', None) == 'conflict':
                    conflictDocIDs.append(result['id'])
    return conflictDocIDs
def bulkUpdateData(self, docs, existingDocs):
    """
    Update documents in WMStats in bulk, breaking them down into chunks of 100 docs.
    :param docs: docs to insert or update
    :param existingDocs: dict of docId: docRev of docs already existent in wmstats
    """
    if isinstance(docs, dict):
        docs = [docs]
    for chunk in grouper(docs, 100):
        for doc in chunk:
            if doc['_id'] in existingDocs:
                revList = existingDocs[doc['_id']].split('-')
                # update the revision number and keep the history of the revision
                doc['_revisions'] = {"start": int(revList[0]) + 1,
                                     "ids": [str(int(revList[1]) + 1), revList[1]]}
            else:
                # then create a random 10 digits uid for the first revision number, required by new_edits=False
                firstId = "%10d" % random.randrange(9999999999)
                doc['_revisions'] = {"start": 1, "ids": [firstId]}
            self.couchDB.queue(doc)
        logging.info("Committing bulk of %i docs ...", len(chunk))
        self.couchDB.commit(new_edits=False)
    return
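# Hypothetical usage sketch of bulkUpdateData(); 'dao' stands for whatever
# object exposes it. Docs already known to wmstats get their CouchDB revision
# bumped in place, new docs are seeded with a random first revision, so the
# new_edits=False bulk commit is accepted by the server.
existingDocs = {"wf_task_123": "7-1234567890"}            # docId -> current rev
docs = [{"_id": "wf_task_123", "status": "archived"},     # bumps rev 7 -> 8
        {"_id": "wf_task_456", "status": "new"}]          # gets a fresh first rev
dao.bulkUpdateData(docs, existingDocs)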
def execute(self, emulator=None):
    """
    _execute_
    """
    scramCommand = self.step.application.setup.scramCommand
    scramArch = self.step.application.setup.scramArch
    cmsswVersion = self.step.application.setup.cmsswVersion

    scramArch = getSingleScramArch(scramArch)

    # Are we using emulators again?
    if emulator is not None:
        return emulator.emulate(self.step, self.job)

    overrides = {}
    if hasattr(self.step, 'override'):
        overrides = self.step.override.dictionary_()

    # Set wait to over an hour
    waitTime = overrides.get('waitTime', 3600 + (self.step.retryDelay * self.step.retryCount))

    # hardcode CERN Castor T0_CH_CERN_MSS stageout parameters
    castorStageOutParams = {}
    castorStageOutParams['command'] = overrides.get('command', "xrdcp")
    castorStageOutParams['option'] = overrides.get('option', "--cerncastor")
    castorStageOutParams['phedex-node'] = overrides.get('phedex-node', "T2_CH_CERN")
    castorStageOutParams['lfn-prefix'] = overrides.get('lfn-prefix', "root://castorcms.cern.ch//castor/cern.ch/cms")

    # hardcode CERN EOS T2_CH_CERN stageout parameters
    eosStageOutParams = {}
    eosStageOutParams['command'] = overrides.get('command', "xrdcp")
    eosStageOutParams['option'] = overrides.get('option', "")
    eosStageOutParams['phedex-node'] = overrides.get('phedex-node', "T2_CH_CERN")
    eosStageOutParams['lfn-prefix'] = overrides.get('lfn-prefix', "root://eoscms.cern.ch//eos/cms")

    try:
        castorStageOutMgr = StageOutMgr(**castorStageOutParams)
        eosStageOutMgr = StageOutMgr(**eosStageOutParams)
        stageInMgr = StageInMgr()
        deleteMgr = DeleteMgr()
    except Exception as ex:
        msg = "Unable to load StageOut/Delete Impl: %s" % str(ex)
        logging.error(msg)
        raise WMExecutionFailure(60312, "MgrImplementationError", msg)

    # prepare output tar file
    taskName = self.report.getTaskName().split('/')[-1]
    host = socket.gethostname().split('.')[0]
    tarName = '%s-%s-%s-%i-logs.tar' % (self.report.data.workload, taskName, host, self.job["counter"])
    tarLocation = os.path.join(self.stepSpace.location, tarName)

    # check if the cmsswVersion supports edmCopyUtil (min CMSSW_8_X)
    result = re.match("CMSSW_([0-9]+)_([0-9]+)_([0-9]+).*", cmsswVersion)
    useEdmCopyUtil = False
    if result:
        try:
            if int(result.group(1)) >= 8:
                useEdmCopyUtil = True
        except ValueError:
            pass

    # setup Scram needed to run edmCopyUtil
    if useEdmCopyUtil:
        scram = Scram(
            command=scramCommand,
            version=cmsswVersion,
            initialise=self.step.application.setup.softwareEnvironment,
            directory=self.step.builder.workingDir,
            architecture=scramArch,
        )
        logging.info("Running scram")
        try:
            projectOutcome = scram.project()
        except Exception as ex:
            msg = "Exception raised while running scram.\n"
            msg += str(ex)
            logging.critical("Error running SCRAM")
            logging.critical(msg)
            raise WMExecutionFailure(50513, "ScramSetupFailure", msg)

        if projectOutcome > 0:
            msg = scram.diagnostic()
            logging.critical("Error running SCRAM")
            logging.critical(msg)
            raise WMExecutionFailure(50513, "ScramSetupFailure", msg)
        runtimeOutcome = scram.runtime()
        if runtimeOutcome > 0:
            msg = scram.diagnostic()
            logging.critical("Error running SCRAM")
            logging.critical(msg)
            raise WMExecutionFailure(50513, "ScramSetupFailure", msg)

    # iterate through input files
    localLogs = []
    deleteLogArchives = []
    if useEdmCopyUtil:
        numberOfFilesPerCopy = 10
    else:
        numberOfFilesPerCopy = 1
    for logs in grouper(self.job["input_files"], numberOfFilesPerCopy):

        copyCommand = "env X509_USER_PROXY=%s edmCopyUtil" % os.environ.get('X509_USER_PROXY', None)
        for log in logs:
            copyCommand += " %s" % log['lfn']
        copyCommand += " %s" % self.step.builder.workingDir

        # give up after timeout of 1 minute per input file
        signal.signal(signal.SIGALRM, alarmHandler)
        signal.alarm(60 * numberOfFilesPerCopy)

        filesCopied = False
        try:
            if useEdmCopyUtil:
                logging.info("Running edmCopyUtil")
                retval = scram(copyCommand)
                if retval == 0:
                    filesCopied = True
            else:
                logging.info("Running stageIn")
                for log in logs:
                    fileInfo = {"LFN": log['lfn']}
                    logArchive = stageInMgr(**fileInfo)
                    if logArchive:
                        filesCopied = True
        except Alarm:
            logging.error("Indefinite hang during edmCopyUtil/stageIn of logArchives")
        except StageOutFailure:
            logging.error("Unable to stageIn logArchives")
        except Exception:
            raise

        signal.alarm(0)

        if filesCopied:
            for log in logs:
                localLogs.append(os.path.join(self.step.builder.workingDir, os.path.basename(log['lfn'])))
                deleteLogArchives.append(log)
                self.report.addInputFile(sourceName="logArchives", lfn=log['lfn'])
        else:
            logging.error("Unable to copy logArchives to local disk")
            if useEdmCopyUtil:
                with open('scramOutput.log', 'r') as f:
                    logging.error("Scram output: %s", f.read())
            for log in logs:
                self.report.addSkippedFile(log['lfn'], None)

    # create tarfile if any logArchive copied in
    if localLogs:
        tarFile = tarfile.open(tarLocation, 'w:')
        for log in localLogs:
            path = log.split('/')
            tarFile.add(name=log, arcname=os.path.join(path[-3], path[-2], path[-1]))
            os.remove(log)
        tarFile.close()
    else:
        msg = "Unable to copy any logArchives to local disk"
        logging.error(msg)
        raise WMExecutionFailure(60312, "LogCollectError", msg)

    # now staging out the LogCollect tarfile
    logging.info("Staging out LogCollect tarfile to Castor and EOS")
    now = datetime.datetime.now()
    lfn = "/store/logs/prod/%i/%.2i/%s/%s/%s" % (now.year, now.month, "WMAgent",
                                                 self.report.data.workload,
                                                 os.path.basename(tarLocation))

    tarInfo = {'LFN': lfn,
               'PFN': tarLocation,
               'PNN': None,
               'GUID': None}

    # perform mandatory stage out to CERN Castor
    signal.signal(signal.SIGALRM, alarmHandler)
    signal.alarm(waitTime)
    try:
        castorStageOutMgr(tarInfo)
    except Alarm:
        msg = "Indefinite hang during stageOut of LogCollect to Castor"
        logging.error(msg)
        raise WMExecutionFailure(60409, "LogCollectTimeout", msg)
    except Exception as ex:
        msg = "Unable to stageOut LogCollect to Castor:\n"
        msg += str(ex)
        logging.error(msg)
        raise WMExecutionFailure(60408, "LogCollectStageOutError", msg)
    signal.alarm(0)

    # add to job report
    self.report.addOutputFile(outputModule="LogCollect", aFile=tarInfo)
    outputRef = getattr(self.report.data, self.stepName)
    outputRef.output.pfn = tarInfo['PFN']
    outputRef.output.location = tarInfo['PNN']
    outputRef.output.lfn = tarInfo['LFN']

    tarInfo = {'LFN': lfn,
               'PFN': tarLocation,
               'PNN': None,
               'GUID': None}

    # then, perform best effort stage out to CERN EOS
    signal.signal(signal.SIGALRM, alarmHandler)
    signal.alarm(waitTime)
    try:
        eosStageOutMgr(tarInfo)
    except Alarm:
        logging.error("Indefinite hang during stageOut of LogCollect to EOS")
    except Exception as ex:
        logging.error("Unable to stageOut LogCollect to EOS: %s", ex)
    signal.alarm(0)

    # we got this far, delete input
    for log in deleteLogArchives:

        # give up after timeout of 1 minute
        signal.signal(signal.SIGALRM, alarmHandler)
        signal.alarm(60)

        try:
            fileToDelete = {'LFN': log['lfn'],
                            'PFN': None,
                            'PNN': None,
                            'StageOutCommand': None}
            deleteMgr(fileToDelete=fileToDelete)
        except Alarm:
            logging.error("Indefinite hang during delete of logArchive")
        except Exception as ex:
            logging.error("Unable to delete logArchive: %s", ex)

        signal.alarm(0)

    return
def execute(self, emulator=None):
    """
    _execute_
    """
    # Are we using emulators again?
    if emulator is not None:
        return emulator.emulate(self.step, self.job)

    logging.info("Steps.Executors.%s.execute called", self.__class__.__name__)

    scramCommand = self.step.application.setup.scramCommand
    cmsswVersion = self.step.application.setup.cmsswVersion
    scramArch = getSingleScramArch(self.step.application.setup.scramArch)

    overrideCatalog = getattr(self.step.application, 'overrideCatalog', None)

    overrides = {}
    if hasattr(self.step, 'override'):
        overrides = self.step.override.dictionary_()

    # Set wait to over an hour
    waitTime = overrides.get('waitTime', 3600 + (self.step.retryDelay * self.step.retryCount))

    # hardcode CERN EOS T2_CH_CERN stageout parameters
    eosStageOutParams = {}
    eosStageOutParams['command'] = overrides.get('command', "xrdcp")
    eosStageOutParams['option'] = overrides.get('option', "--wma-disablewriterecovery")
    eosStageOutParams['phedex-node'] = overrides.get('phedex-node', "T2_CH_CERN")
    eosStageOutParams['lfn-prefix'] = overrides.get('lfn-prefix', "root://eoscms.cern.ch//eos/cms")

    try:
        eosStageOutMgr = StageOutMgr(**eosStageOutParams)
        stageInMgr = StageInMgr()
        deleteMgr = DeleteMgr()
    except Exception as ex:
        msg = "Unable to load StageOut/Delete Impl: %s" % str(ex)
        logging.error(msg)
        raise WMExecutionFailure(60312, "MgrImplementationError", msg)

    # prepare output tar file
    taskName = self.report.getTaskName().split('/')[-1]
    host = socket.gethostname().split('.')[0]
    tarName = '%s-%s-%s-%i-logs.tar' % (self.report.data.workload, taskName, host, self.job["counter"])
    tarLocation = os.path.join(self.stepSpace.location, tarName)

    # Supported by any release beyond CMSSW_8_X, however DaviX is broken until CMSSW_10_4_X
    # see: https://github.com/cms-sw/cmssw/issues/25292
    useEdmCopyUtil = True
    if isCMSSWSupported(cmsswVersion, "CMSSW_10_4_0"):
        pass
    elif scramArch.startswith('slc7_amd64_'):
        msg = "CMSSW too old or not fully functional to support edmCopyUtil, using CMSSW_10_4_0 instead"
        logging.warning(msg)
        cmsswVersion = "CMSSW_10_4_0"
        scramArch = "slc7_amd64_gcc820"
    elif scramArch.startswith('slc6_amd64_'):
        msg = "CMSSW too old or not fully functional to support edmCopyUtil, using CMSSW_10_4_0 instead"
        logging.warning(msg)
        cmsswVersion = "CMSSW_10_4_0"
        scramArch = "slc6_amd64_gcc700"
    else:
        useEdmCopyUtil = False
    logging.info("Using edmCopyUtil: %s", useEdmCopyUtil)

    # setup Scram needed to run edmCopyUtil
    if useEdmCopyUtil:
        scram = Scram(
            command=scramCommand,
            version=cmsswVersion,
            initialise=self.step.application.setup.softwareEnvironment,
            directory=self.step.builder.workingDir,
            architecture=scramArch,
        )
        logging.info("Running scram")
        try:
            projectOutcome = scram.project()
        except Exception as ex:
            msg = "Exception raised while running scram.\n"
            msg += str(ex)
            logging.critical("Error running SCRAM")
            logging.critical(msg)
            raise WMExecutionFailure(50513, "ScramSetupFailure", msg)

        if projectOutcome > 0:
            msg = scram.diagnostic()
            logging.critical("Error running SCRAM")
            logging.critical(msg)
            raise WMExecutionFailure(50513, "ScramSetupFailure", msg)
        runtimeOutcome = scram.runtime()
        if runtimeOutcome > 0:
            msg = scram.diagnostic()
            logging.critical("Error running SCRAM")
            logging.critical(msg)
            raise WMExecutionFailure(50513, "ScramSetupFailure", msg)

    # iterate through input files
    localLogs = []
    deleteLogArchives = []
    if useEdmCopyUtil:
        numberOfFilesPerCopy = 10
    else:
        numberOfFilesPerCopy = 1
    for logs in grouper(self.job["input_files"], numberOfFilesPerCopy):

        copyCommand = "env X509_USER_PROXY=%s edmCopyUtil" % os.environ.get('X509_USER_PROXY', None)

        # specify TFC if necessary
        if overrideCatalog:
            copyCommand += " -c %s" % overrideCatalog

        for log in logs:
            copyCommand += " %s" % log['lfn']
        copyCommand += " %s" % self.step.builder.workingDir

        # give up after timeout of 1 minute per input file
        signal.signal(signal.SIGALRM, alarmHandler)
        signal.alarm(60 * numberOfFilesPerCopy)

        filesCopied = False
        try:
            if useEdmCopyUtil:
                logging.info("Running edmCopyUtil")
                retval = scram(copyCommand)
                if retval == 0:
                    filesCopied = True
            else:
                logging.info("Running stageIn")
                for log in logs:
                    fileInfo = {"LFN": log['lfn']}
                    logArchive = stageInMgr(**fileInfo)
                    if logArchive:
                        filesCopied = True
        except Alarm:
            logging.error("Indefinite hang during edmCopyUtil/stageIn of logArchives")
        except StageOutFailure:
            logging.error("Unable to stageIn logArchives")
        except Exception:
            raise

        signal.alarm(0)

        if filesCopied:
            for log in logs:
                localLogs.append(os.path.join(self.step.builder.workingDir, os.path.basename(log['lfn'])))
                deleteLogArchives.append(log)
                self.report.addInputFile(sourceName="logArchives", lfn=log['lfn'])
        else:
            logging.error("Unable to copy logArchives to local disk")
            for log in logs:
                self.report.addSkippedFile(log['lfn'], None)

    # create tarfile if any logArchive copied in
    if localLogs:
        with tarfile.open(tarLocation, 'w:') as tarFile:
            for log in localLogs:
                path = log.split('/')
                tarFile.add(name=log, arcname=os.path.join(path[-3], path[-2], path[-1]))
                os.remove(log)
    else:
        msg = "Unable to copy any logArchives to local disk"
        logging.error(msg)
        raise WMExecutionFailure(60312, "LogCollectError", msg)

    # now staging out the LogCollect tarfile
    logging.info("Staging out LogCollect tarfile to EOS (skipping CASTOR)")
    now = datetime.datetime.now()
    lfn = "/store/logs/prod/%i/%.2i/%s/%s/%s" % (now.year, now.month, "WMAgent",
                                                 self.report.data.workload,
                                                 os.path.basename(tarLocation))

    tarInfo = {'LFN': lfn,
               'PFN': tarLocation,
               'PNN': None,
               'GUID': None}

    # perform mandatory stage out to CERN EOS
    signal.signal(signal.SIGALRM, alarmHandler)
    signal.alarm(waitTime)
    try:
        eosStageOutMgr(tarInfo)
    except Alarm:
        msg = "Indefinite hang during stageOut of LogCollect to EOS"
        logging.error(msg)
        raise WMExecutionFailure(60409, "LogCollectTimeout", msg)
    except Exception as ex:
        msg = "Unable to stageOut LogCollect to EOS:\n"
        msg += str(ex)
        logging.error(msg)
        raise WMExecutionFailure(60408, "LogCollectStageOutError", msg)
    signal.alarm(0)

    # add to job report
    self.report.addOutputFile(outputModule="LogCollect", aFile=tarInfo)
    outputRef = getattr(self.report.data, self.stepName)
    outputRef.output.pfn = tarInfo['PFN']
    outputRef.output.location = tarInfo['PNN']
    outputRef.output.lfn = tarInfo['LFN']

    # we got this far, delete ALL input files assigned to this job
    for log in self.job["input_files"]:

        # give up after timeout of 1 minute
        signal.signal(signal.SIGALRM, alarmHandler)
        signal.alarm(60)

        try:
            fileToDelete = {'LFN': log['lfn'],
                            'PFN': None,
                            'PNN': None,
                            'StageOutCommand': None}
            deleteMgr(fileToDelete=fileToDelete)
        except Alarm:
            logging.error("Indefinite hang during delete of logArchive")
        except Exception as ex:
            logging.error("Unable to delete logArchive: %s", ex)

        signal.alarm(0)

    return
def submit(self, jobs, info=None):
    """
    _submit_

    Submit jobs for one subscription
    """
    # If we're here, then we have submitter components
    self.scriptFile = self.config.JobSubmitter.submitScript
    self.submitDir = self.config.JobSubmitter.submitDir
    timeout = getattr(self.config.JobSubmitter, 'getTimeout', 400)

    successfulJobs = []
    failedJobs = []
    jdlFiles = []

    if len(jobs) == 0:
        # Then we have nothing to do
        return successfulJobs, failedJobs

    if len(self.pool) == 0:
        # Starting things up
        # This is obviously a submit API
        logging.info("Starting up PyCondorPlugin worker pool")
        self.inputQueue = multiprocessing.Queue()
        self.result = multiprocessing.Queue()
        for x in range(self.nProcess):
            p = multiprocessing.Process(target=submitWorker,
                                        args=(self.inputQueue, self.result, timeout))
            p.start()
            self.pool.append(p)

    if not os.path.exists(self.submitDir):
        os.makedirs(self.submitDir)

    # Submit the jobs
    nSubmits = 0
    queueError = False
    for jobsReady in grouper(jobs, self.jobsPerWorker):

        if queueError:
            # If the queue has failed, then we must not process any more jobs this cycle.
            break

        idList = [x['id'] for x in jobsReady]
        jdlList = self.makeSubmit(jobList=jobsReady)
        if not jdlList:
            # Then we got nothing
            logging.error("No JDL file made!")
            return {'NoResult': [0]}
        jdlFile = "%s/submit_%i_%i.jdl" % (self.submitDir, os.getpid(), idList[0])
        with open(jdlFile, 'w') as handle:
            handle.writelines(jdlList)
        jdlFiles.append(jdlFile)

        # Now submit them
        logging.info("About to submit %i jobs", len(jobsReady))
        if self.glexecPath:
            command = 'CS=`which condor_submit`; '
            if self.glexecWrapScript:
                command += 'export GLEXEC_ENV=`%s 2>/dev/null`; ' % self.glexecWrapScript
            command += 'export GLEXEC_CLIENT_CERT=%s; ' % self.glexecProxyFile
            command += 'export GLEXEC_SOURCE_PROXY=%s; ' % self.glexecProxyFile
            command += 'export X509_USER_PROXY=%s; ' % self.glexecProxyFile
            command += 'export GLEXEC_TARGET_PROXY=%s; ' % self.jdlProxyFile
            if self.glexecUnwrapScript:
                command += '%s %s -- $CS %s' % (self.glexecPath, self.glexecUnwrapScript, jdlFile)
            else:
                command += '%s $CS %s' % (self.glexecPath, jdlFile)
        else:
            command = "condor_submit %s" % jdlFile

        try:
            self.inputQueue.put({'command': command, 'idList': idList})
        except AssertionError as ex:
            msg = "Critical error: input pipeline probably closed.\n"
            msg += str(ex)
            msg += "Error Procedure: Something critical has happened in the worker process\n"
            msg += "We will now proceed to pull all useful data from the queue (if it exists)\n"
            msg += "Then refresh the worker pool\n"
            logging.error(msg)
            queueError = True
            break
        nSubmits += 1

    # Now we should have sent all jobs to be submitted
    # Going to do the rest of it now
    for dummy in range(nSubmits):
        try:
            res = self.result.get(block=True, timeout=timeout)
        except Queue.Empty:
            # If the queue was empty go to the next submit
            # Those jobs have vanished
            logging.error("Queue.Empty error received!")
            logging.error("This could indicate a critical condor error!")
            logging.error("However, no information of any use was obtained due to process failure.")
            logging.error("Either process failed, or process timed out after %s seconds.", timeout)
            continue
        except AssertionError as ex:
            msg = "Found Assertion error while retrieving output from worker process.\n"
            msg += str(ex)
            msg += "This indicates something critical happened to a worker process"
            msg += "We will recover what jobs we know were submitted, and resubmit the rest"
            msg += "Refreshing worker pool at end of loop"
            logging.error(msg)
            continue

        try:
            dummyOut = res['stdout']
            error = res['stderr']
            idList = res['idList']
            exitCode = res['exitCode']
        except KeyError as ex:
            msg = "Error in finding key from result pipe\n"
            msg += "Something has gone critically wrong in the worker\n"
            try:
                msg += "Result: %s\n" % str(res)
            except Exception:
                pass
            msg += str(ex)
            logging.error(msg)
            continue

        if exitCode != 0:
            logging.error("Condor returned non-zero. Printing out command stderr")
            logging.error(error)
            errorCheck, errorMsg = parseError(error=error)
            logging.error("Processing failed jobs and proceeding to the next jobs.")
            logging.error("Do not restart component.")
        else:
            errorCheck = None

        if errorCheck:
            self.errorCount += 1
            condorErrorReport = Report()
            condorErrorReport.addError("JobSubmit", 61202, "CondorError", errorMsg)
            for jobID in idList:
                for job in jobs:
                    if job.get('id', None) == jobID:
                        job['fwjr'] = condorErrorReport
                        failedJobs.append(job)
                        break
        else:
            if self.errorCount > 0:
                self.errorCount -= 1
            for jobID in idList:
                for job in jobs:
                    if job.get('id', None) == jobID:
                        successfulJobs.append(job)
                        break

        # If we get a lot of errors in a row it's probably time to
        # stop the component
        if self.errorCount > self.errorThreshold:
            msg = "Exceeded errorThreshold while submitting to condor. Check condor status."
            logging.error(msg)
            raise BossAirPluginException(msg)

    # Remove JDL files unless commanded otherwise
    if self.deleteJDLFiles:
        for f in jdlFiles:
            os.remove(f)

    # When we're finished, clean up the queue workers in order
    # to free up memory (in the midst of the process, the forked
    # memory space shouldn't be touched, so it should still be
    # shared, but after this point any action by the Submitter will
    # result in memory duplication).
    logging.info("Purging worker pool to clean up memory")
    self.close()

    # We must return a list of jobs successfully submitted,
    # and a list of jobs failed
    logging.info("Done submitting jobs for this cycle in PyCondorPlugin")
    return successfulJobs, failedJobs
def execute(self, reqStatus):
    """
    Executes the whole transferor logic
    :param reqStatus: request status to process
    :return:
    """
    counterWorkflows = 0
    counterFailedRequests = 0
    counterSuccessRequests = 0
    summary = dict(TRANSFEROR_REPORT)
    try:
        requestRecords = self.getRequestRecords(reqStatus)
        self.updateReportDict(summary, "total_num_requests", len(requestRecords))
        msg = " retrieved %s requests. " % len(requestRecords)
        msg += "Service set to process up to %s requests per cycle." % self.msConfig["limitRequestsPerCycle"]
        self.logger.info(msg)
    except Exception as err:  # general error
        msg = "Unknown exception while fetching requests from ReqMgr2. Error: %s" % str(err)
        self.logger.exception(msg)
        self.updateReportDict(summary, "error", msg)

    try:
        self.updateCaches()
        self.updateReportDict(summary, "total_num_campaigns", len(self.campaigns))
        self.updateReportDict(summary, "nodes_out_of_space", list(self.rseQuotas.getOutOfSpaceRSEs()))
    except RuntimeWarning as ex:
        msg = "All retries exhausted! Last error was: '%s'" % str(ex)
        msg += "\nRetrying to update caches again in the next cycle."
        self.logger.error(msg)
        self.updateReportDict(summary, "error", msg)
        return summary
    except Exception as ex:
        msg = "Unknown exception updating caches. Error: %s" % str(ex)
        self.logger.exception(msg)
        self.updateReportDict(summary, "error", msg)
        return summary

    # process all requests
    for reqSlice in grouper(requestRecords, 100):
        self.logger.info("Processing workflows from %d to %d.",
                         counterWorkflows + 1, counterWorkflows + len(reqSlice))
        # get complete requests information
        # based on Unified Transferor logic
        reqResults = self.reqInfo(reqSlice)
        self.logger.info("%d requests information completely processed.", len(reqResults))

        for wflow in reqResults:
            # first check which data is already in place and filter them out
            self.checkDataLocation(wflow)

            try:
                success, transfers = self.makeTransferRequest(wflow)
            except Exception as ex:
                success = False
                msg = "Unknown exception while making Transfer Request for %s " % wflow.getName()
                msg += "\tError: %s" % str(ex)
                self.logger.exception(msg)
            if success:
                self.logger.info("Transfer requests successful for %s. Summary: %s",
                                 wflow.getName(), pformat(transfers))
                # then create a document in ReqMgr Aux DB
                if self.createTransferDoc(wflow.getName(), transfers):
                    self.logger.info("Transfer document successfully created in CouchDB for: %s",
                                     wflow.getName())
                    # then move this request to staging status
                    self.change(wflow.getName(), 'staging', self.__class__.__name__)
                    counterSuccessRequests += 1
                else:
                    counterFailedRequests += 1
            else:
                counterFailedRequests += 1
        # it can go slightly beyond the limit. It's evaluated for every slice
        if counterSuccessRequests >= self.msConfig["limitRequestsPerCycle"]:
            msg = "Transferor succeeded acting on %d workflows in this cycle. " % counterSuccessRequests
            msg += "Which exceeds the configuration limit set to: %s" % self.msConfig["limitRequestsPerCycle"]
            self.logger.info(msg)
            break
        counterWorkflows += len(reqSlice)

    self.logger.info("There were %d failed and %d success requests in this cycle",
                     counterFailedRequests, counterSuccessRequests)
    self.logger.info("%s subscribed %d datasets and %d blocks in this cycle",
                     self.__class__.__name__, self.dsetCounter, self.blockCounter)
    self.updateReportDict(summary, "success_request_transition", counterSuccessRequests)
    self.updateReportDict(summary, "failed_request_transition", counterFailedRequests)
    self.updateReportDict(summary, "num_datasets_subscribed", self.dsetCounter)
    self.updateReportDict(summary, "num_blocks_subscribed", self.blockCounter)
    self.updateReportDict(summary, "nodes_out_of_space", list(self.rseQuotas.getOutOfSpaceRSEs()))
    return summary
def cleanRSE(self, rse):
    """
    The method to implement the actual deletion of files for an RSE.
    :param rse: MSUnmergedRSE object to be cleaned
    :return: The MSUnmergedRSE object
    """
    # Create the gfal2 context object:
    try:
        ctx = createGfal2Context(self.msConfig['gfalLogLevel'], self.msConfig['emulateGfal2'])
    except Exception as ex:
        msg = "RSE: %s, Failed to create gfal2 Context object. " % rse['name']
        msg += "Skipping it in the current run."
        self.logger.exception(msg)
        raise MSUnmergedPlineExit(msg)

    # Start cleaning one directory at a time:
    for dirLfn, fileLfnGen in rse['files']['toDelete'].items():
        if self.msConfig['limitFilesPerRSE'] < 0 or \
           rse['counters']['filesToDelete'] < self.msConfig['limitFilesPerRSE']:

            # First increment the dir counter:
            rse['counters']['dirsToDelete'] += 1

            # Now we consume the rse['files']['toDelete'][dirLfn] generator,
            # after which no values will be left in it. In case we need it again
            # we will have to recreate the filter as we did in self.filterUnmergedFiles()
            pfnList = []
            if not rse['pfnPrefix']:
                # Fall back to calling Rucio on a per directory basis for
                # resolving the lfn to pfn mapping
                dirPfn = self.rucio.getPFN(rse['name'], dirLfn, operation='delete')[dirLfn]
                for fileLfn in fileLfnGen:
                    fileLfnSuffix = fileLfn.split(dirLfn)[1]
                    filePfn = dirPfn + fileLfnSuffix
                    pfnList.append(filePfn)
            else:
                # Proceed with assembling the full filePfn out of the rse['pfnPrefix'] and the fileLfn
                dirPfn = rse['pfnPrefix'] + dirLfn
                for fileLfn in fileLfnGen:
                    filePfn = rse['pfnPrefix'] + fileLfn
                    pfnList.append(filePfn)

            rse['counters']['filesToDelete'] += len(pfnList)
            msg = "\nRSE: %s \nDELETING: %s."
            msg += "\nPFN list with: %s entries: \n%s"
            self.logger.debug(msg, rse['name'], dirLfn, len(pfnList), twFormat(pfnList, maxLength=4))

            if self.msConfig['enableRealMode']:
                try:
                    # execute the actual deletion in bulk - full list of files per directory
                    deletedSuccess = 0
                    for pfnSlice in list(grouper(pfnList, self.msConfig["filesToDeleteSliceSize"])):
                        delResult = ctx.unlink(pfnSlice)
                        # Count all the successfully deleted files (if a deletion was
                        # successful a value of None is put in the delResult list):
                        deletedSuccess += sum([1 for pfnStatus in delResult if pfnStatus is None])

                    self.logger.debug("RSE: %s, Dir: %s, deletedSuccess: %s",
                                      rse['name'], dirLfn, deletedSuccess)
                    rse['counters']['deletedSuccess'] = deletedSuccess

                    # Now clean the whole branch
                    self.logger.debug("Purging dirEntry: %s:\n", dirPfn)
                    purgeSuccess = self._purgeTree(ctx, dirPfn)
                    if not purgeSuccess:
                        msg = "RSE: %s Failed to purge nonEmpty directory: %s"
                        self.logger.error(msg, rse['name'], dirPfn)
                except Exception as ex:
                    msg = "Error while cleaning RSE: %s. "
                    msg += "Will retry in the next cycle. Err: %s"
                    self.logger.warning(msg, rse['name'], str(ex))
    rse['isClean'] = self._checkClean(rse)
    return rse
def listFilesInBlockWithParents(self, fileBlockName, lumis=True, validFileOnly=1):
    """
    _listFilesInBlockWithParents_

    Get a list of files in the named fileblock including
    the parents of that file.
    TODO: lumis can be false when lumi splitting is not required.
    However WMBSHelper expects file['LumiList'] to get the run number,
    so for now it will always be true.
    """
    if not self.blockExists(fileBlockName):
        msg = "DBSReader.listFilesInBlockWithParents(%s): No matching data"
        raise DBSReaderError(msg % fileBlockName)

    try:
        # TODO: should we get only the valid block for this?
        files = self.dbs.listFileParents(block_name=fileBlockName)
        fileDetails = self.listFilesInBlock(fileBlockName, lumis, validFileOnly)
    except dbsClientException as ex:
        msg = "Error in "
        msg += "DBSReader.listFilesInBlockWithParents(%s)\n" % (fileBlockName,)
        msg += "%s\n" % formatEx3(ex)
        raise DBSReaderError(msg)

    childByParents = defaultdict(list)
    for f in files:
        # Probably a child can have more than one parent file
        for fp in f['parent_logical_file_name']:
            childByParents[fp].append(f['logical_file_name'])

    parentsLFNs = childByParents.keys()

    if len(parentsLFNs) == 0:
        msg = "Error in "
        msg += "DBSReader.listFilesInBlockWithParents(%s)\n There are no parent files" % (fileBlockName)
        raise DBSReaderError(msg)

    parentFilesDetail = []
    # TODO: slicing parentsLFNs until the DBS api can handle it.
    # Remove the slicing if the DBS api handles it
    for pLFNs in grouper(parentsLFNs, 50):
        parentFilesDetail.extend(self.dbs.listFileArray(logical_file_name=pLFNs, detail=True))

    if lumis:
        parentLumis = self._getLumiList(lfns=parentsLFNs)

    parentsByLFN = defaultdict(list)
    for pf in parentFilesDetail:
        parentLFN = pf['logical_file_name']
        dbsFile = remapDBS3Keys(pf, stringify=True)
        if lumis:
            dbsFile["LumiList"] = parentLumis[parentLFN]

        for childLFN in childByParents[parentLFN]:
            parentsByLFN[childLFN].append(dbsFile)

    for fileInfo in fileDetails:
        fileInfo["ParentList"] = parentsByLFN[fileInfo['logical_file_name']]

    return fileDetails
def _handleInputDataInfo(self, wflow, dset, blocksDict):
    """
    Applies any run/block/lumi list to the primary and parent blocks provided.
    The logic is somewhat convoluted:
      1) if there is no run/block/lumi list, just return the initial blocksDict
      2) if it has a lumi list, filter the runs from it and run block discovery
         given a dataset name and a list of runs
      3) if it has a RunWhitelist, run block discovery for the given dataset
         name and list of runs
      4) if it has only a RunBlacklist, discover the run list of all initial
         blocks provided in blocksDict and remove blocks matching only the blacklist
      5) for the steps above, always check whether the block has replicas
      6) now that the block data discovery is complete (considering runs):
         * if LumiList is not enabled, just return the current list of blocks
         * else, fetch file/run/lumi information in bulk for the blocks and
           compare it to the LumiList, skipping blocks without a single file
           that matches it.

    Note that the LumiList check is dealt with in a similar way as in the
    WorkQueue StartPolicyInterface/getMaskedBlocks.

    :param wflow: the Workflow object
    :param dset: dataset name
    :param blocksDict: dictionary of blocks, their sizes and locations
    :return: dictionary of block names and block sizes
    """
    finalBlocks = {}
    dbsUrl = wflow.getDbsUrl()
    runWhite = wflow.getRunWhitelist()
    runBlack = set(wflow.getRunBlacklist())
    lumiList = wflow.getLumilist()
    if lumiList:
        # LumiList has precedence over RunWhitelist
        runWhite = []
        for run in lumiList.getRuns():
            runWhite.append(int(run))
        runWhite = list(set(runWhite))
    if runWhite:
        # Run number 1 is not supported by DBSServer
        if int(runWhite[0]) == 1:
            finalBlocks = deepcopy(blocksDict)
        else:
            runWhite = list(set(runWhite) - runBlack)
            self.logger.info("Fetching blocks matching a list of runs for %s", wflow.getName())
            try:
                blocks = getBlocksByDsetAndRun(dset, runWhite, dbsUrl)
            except Exception as exc:
                self.logger.error("Failed to retrieve blocks by dataset and run. Details: %s", str(exc))
                raise
            for block in blocks:
                if block in blocksDict:
                    finalBlocks[block] = deepcopy(blocksDict[block])
                else:
                    self.logger.info("Dropping block with no replicas in PhEDEx: %s", block)
    elif runBlack:
        # only the run blacklist is set
        self.logger.info("Fetching runs in blocks for RunBlacklist for %s", wflow.getName())
        try:
            blockRuns = getRunsInBlock(list(blocksDict), dbsUrl)
        except Exception as exc:
            self.logger.error("Failed to bulk retrieve runs per block. Details: %s", str(exc))
            raise
        for block, runs in blockRuns.items():
            if not set(runs).difference(runBlack):
                self.logger.info("Dropping block with only blacklisted runs: %s", block)
            elif block in blocksDict:
                finalBlocks[block] = deepcopy(blocksDict[block])

    if lumiList:
        self.logger.info("Fetching block/lumi information for %d blocks in %s",
                         len(finalBlocks), wflow.getName())
        self.logger.debug("with the following run whitelist: %s", runWhite)
        goodBlocks = set()
        # now with a smaller set of blocks in hand, we collect their lumi
        # information and discard any blocks not matching the lumi list
        for blockSlice in grouper(finalBlocks, 10):
            try:
                blockFileLumis = getFileLumisInBlock(blockSlice, dbsUrl, validFileOnly=1)
            except Exception as exc:
                self.logger.error("Failed to bulk retrieve run/lumi per block. Details: %s", str(exc))
                raise
            for block, fileLumis in blockFileLumis.items():
                for fileLumi in fileLumis:
                    if int(fileLumi['run_num']) not in runWhite:
                        continue
                    runNumber = str(fileLumi['run_num'])
                    lumis = fileLumi['lumi_section_num']
                    fileMask = LumiList(runsAndLumis={runNumber: lumis})
                    if lumiList & fileMask:
                        # then it has lumis that we need, keep this block and move on
                        goodBlocks.add(block)
                        break
        # last but not least, drop any blocks that are not in the good list
        for block in list(finalBlocks):
            if block not in goodBlocks:
                self.logger.info("Dropping block not matching LumiList: %s", block)
                finalBlocks.pop(block)

    if not finalBlocks:
        finalBlocks = blocksDict
    return finalBlocks
def execute(self, reqStatus):
    """
    Executes the whole transferor logic
    :param reqStatus: request status to process
    :return: a summary dictionary
    """
    counterWorkflows = 0
    counterFailedRequests = 0
    counterProblematicRequests = 0
    counterSuccessRequests = 0
    summary = dict(TRANSFEROR_REPORT)
    requestRecords = []
    try:
        requestRecords = self.getRequestRecords(reqStatus)
        self.updateReportDict(summary, "total_num_requests", len(requestRecords))
        msg = "Retrieved %s requests. " % len(requestRecords)
        msg += "Service set to process up to %s requests per cycle." % self.msConfig["limitRequestsPerCycle"]
        self.logger.info(msg)
    except Exception as err:  # general error
        msg = "Unknown exception while fetching requests from ReqMgr2. Error: %s" % str(err)
        self.logger.exception(msg)
        self.updateReportDict(summary, "error", msg)

    try:
        self.updateCaches()
        self.updateReportDict(summary, "total_num_campaigns", len(self.campaigns))
        self.updateReportDict(summary, "nodes_out_of_space", list(self.rseQuotas.getOutOfSpaceRSEs()))
    except RuntimeWarning as ex:
        msg = "All retries exhausted! Last error was: '%s'" % str(ex)
        msg += "\nRetrying to update caches again in the next cycle."
        self.logger.error(msg)
        self.updateReportDict(summary, "error", msg)
        return summary
    except Exception as ex:
        msg = "Unknown exception updating caches. Error: %s" % str(ex)
        self.logger.exception(msg)
        self.updateReportDict(summary, "error", msg)
        return summary

    # process all requests
    for reqSlice in grouper(requestRecords, 100):
        self.logger.info("Processing workflows from %d to %d.",
                         counterWorkflows + 1, counterWorkflows + len(reqSlice))
        # get complete requests information
        # based on Unified Transferor logic
        reqResults = self.reqInfo(reqSlice)
        self.logger.info("%d requests information completely processed.", len(reqResults))

        for wflow in reqResults:
            if not self.verifyCampaignExist(wflow):
                counterProblematicRequests += 1
                continue

            # first, check whether any pileup dataset is already in place
            self.checkPUDataLocation(wflow)
            if wflow.getSecondarySummary() and not wflow.getPURSElist():
                # then we still have pileup to be transferred, but with incorrect locations
                msg = "Workflow: %s cannot proceed due to some PU misconfiguration. Check previous logs..."
                self.logger.critical(msg, wflow.getName())
                # FIXME: this needs to be logged somewhere and the workflow set to failed
                counterProblematicRequests += 1
                continue

            # now check where input primary and parent blocks will need to go
            self.checkDataLocation(wflow)

            try:
                success, transfers = self.makeTransferRequest(wflow)
            except Exception as ex:
                success = False
                msg = "Unknown exception while making Transfer Request for %s " % wflow.getName()
                msg += "\tError: %s" % str(ex)
                self.logger.exception(msg)
            if success:
                self.logger.info("Transfer requests successful for %s. Summary: %s",
                                 wflow.getName(), pformat(transfers))
                # then create a document in ReqMgr Aux DB
                if self.createTransferDoc(wflow.getName(), transfers):
                    self.logger.info("Transfer document successfully created in CouchDB for: %s",
                                     wflow.getName())
                    # then move this request to staging status
                    self.change(wflow.getName(), 'staging', self.__class__.__name__)
                    counterSuccessRequests += 1
                else:
                    counterFailedRequests += 1
            else:
                counterFailedRequests += 1

        # it can go slightly beyond the limit, since it's evaluated once per slice
        if counterSuccessRequests >= self.msConfig["limitRequestsPerCycle"]:
            msg = "Transferor succeeded acting on %d workflows in this cycle. " % counterSuccessRequests
            msg += "This exceeds the configured limit of: %s" % self.msConfig["limitRequestsPerCycle"]
            self.logger.info(msg)
            break
        counterWorkflows += len(reqSlice)

    self.logger.info("Summary for this cycle is:")
    self.logger.info(" * there were %d problematic requests;", counterProblematicRequests)
    self.logger.info(" * there were %d failed requests;", counterFailedRequests)
    self.logger.info(" * there were %d successful requests;", counterSuccessRequests)
    self.logger.info(" * a total of %d datasets were subscribed;", self.dsetCounter)
    self.logger.info(" * a total of %d blocks were subscribed.", self.blockCounter)
    self.updateReportDict(summary, "success_request_transition", counterSuccessRequests)
    self.updateReportDict(summary, "failed_request_transition", counterFailedRequests)
    self.updateReportDict(summary, "problematic_requests", counterProblematicRequests)
    self.updateReportDict(summary, "num_datasets_subscribed", self.dsetCounter)
    self.updateReportDict(summary, "num_blocks_subscribed", self.blockCounter)
    self.updateReportDict(summary, "nodes_out_of_space", list(self.rseQuotas.getOutOfSpaceRSEs()))
    return summary
def execute(self, emulator=None):
    """
    _execute_
    """
    scramCommand = self.step.application.setup.scramCommand
    scramArch = self.step.application.setup.scramArch
    cmsswVersion = self.step.application.setup.cmsswVersion
    scramArch = getSingleScramArch(scramArch)

    # Are we using emulators again?
    if emulator is not None:
        return emulator.emulate(self.step, self.job)

    overrides = {}
    if hasattr(self.step, 'override'):
        overrides = self.step.override.dictionary_()

    # Set the wait time to over an hour
    waitTime = overrides.get('waitTime', 3600 + (self.step.retryDelay * self.step.retryCount))

    # hardcoded CERN Castor T0_CH_CERN_MSS stageout parameters
    castorStageOutParams = {}
    castorStageOutParams['command'] = overrides.get('command', "xrdcp")
    castorStageOutParams['option'] = overrides.get('option', "--wma-cerncastor")
    castorStageOutParams['phedex-node'] = overrides.get('phedex-node', "T2_CH_CERN")
    castorStageOutParams['lfn-prefix'] = overrides.get('lfn-prefix', "root://castorcms.cern.ch//castor/cern.ch/cms")

    # hardcoded CERN EOS T2_CH_CERN stageout parameters
    eosStageOutParams = {}
    eosStageOutParams['command'] = overrides.get('command', "xrdcp")
    eosStageOutParams['option'] = overrides.get('option', "--wma-disablewriterecovery")
    eosStageOutParams['phedex-node'] = overrides.get('phedex-node', "T2_CH_CERN")
    eosStageOutParams['lfn-prefix'] = overrides.get('lfn-prefix', "root://eoscms.cern.ch//eos/cms")

    try:
        castorStageOutMgr = StageOutMgr(**castorStageOutParams)
        eosStageOutMgr = StageOutMgr(**eosStageOutParams)
        stageInMgr = StageInMgr()
        deleteMgr = DeleteMgr()
    except Exception as ex:
        msg = "Unable to load StageOut/Delete Impl: %s" % str(ex)
        logging.error(msg)
        raise WMExecutionFailure(60312, "MgrImplementationError", msg)

    # prepare output tar file
    taskName = self.report.getTaskName().split('/')[-1]
    host = socket.gethostname().split('.')[0]
    tarName = '%s-%s-%s-%i-logs.tar' % (self.report.data.workload, taskName, host, self.job["counter"])
    tarLocation = os.path.join(self.stepSpace.location, tarName)

    # Supported by any release beyond CMSSW_8_X, however DaviX is broken until CMSSW_10_4_X
    # see: https://github.com/cms-sw/cmssw/issues/25292
    useEdmCopyUtil = True
    if isCMSSWSupported(cmsswVersion, "CMSSW_10_4_0"):
        pass
    elif scramArch.startswith('slc6_amd64_'):
        msg = "CMSSW too old or not fully functional to support edmCopyUtil, using CMSSW_10_4_0 instead"
        logging.warning(msg)
        cmsswVersion = "CMSSW_10_4_0"
        scramArch = "slc6_amd64_gcc700"
    else:
        useEdmCopyUtil = False
    logging.info("Using edmCopyUtil: %s", useEdmCopyUtil)

    # setup Scram, needed to run edmCopyUtil
    if useEdmCopyUtil:
        scram = Scram(
            command=scramCommand,
            version=cmsswVersion,
            initialise=self.step.application.setup.softwareEnvironment,
            directory=self.step.builder.workingDir,
            architecture=scramArch,
        )
        logging.info("Running scram")
        try:
            projectOutcome = scram.project()
        except Exception as ex:
            msg = "Exception raised while running scram.\n"
            msg += str(ex)
            logging.critical("Error running SCRAM")
            logging.critical(msg)
            raise WMExecutionFailure(50513, "ScramSetupFailure", msg)

        if projectOutcome > 0:
            msg = scram.diagnostic()
            logging.critical("Error running SCRAM")
            logging.critical(msg)
            raise WMExecutionFailure(50513, "ScramSetupFailure", msg)
        runtimeOutcome = scram.runtime()
        if runtimeOutcome > 0:
            msg = scram.diagnostic()
            logging.critical("Error running SCRAM")
            logging.critical(msg)
            raise WMExecutionFailure(50513, "ScramSetupFailure", msg)

    # iterate through input files
    localLogs = []
    deleteLogArchives = []
    if useEdmCopyUtil:
        numberOfFilesPerCopy = 10
    else:
        numberOfFilesPerCopy = 1
    for logs in grouper(self.job["input_files"], numberOfFilesPerCopy):

        copyCommand = "env X509_USER_PROXY=%s edmCopyUtil" % os.environ.get('X509_USER_PROXY', None)
        for log in logs:
            copyCommand += " %s" % log['lfn']
        copyCommand += " %s" % self.step.builder.workingDir

        # give up after a timeout of 1 minute per input file
        signal.signal(signal.SIGALRM, alarmHandler)
        signal.alarm(60 * numberOfFilesPerCopy)

        filesCopied = False
        try:
            if useEdmCopyUtil:
                logging.info("Running edmCopyUtil")
                retval = scram(copyCommand)
                if retval == 0:
                    filesCopied = True
            else:
                logging.info("Running stageIn")
                for log in logs:
                    fileInfo = {"LFN": log['lfn']}
                    logArchive = stageInMgr(**fileInfo)
                    if logArchive:
                        filesCopied = True
        except Alarm:
            logging.error("Indefinite hang during edmCopyUtil/stageIn of logArchives")
        except StageOutFailure:
            logging.error("Unable to stageIn logArchives")
        except Exception:
            # anything else is unexpected, let it propagate
            raise

        signal.alarm(0)

        if filesCopied:
            for log in logs:
                localLogs.append(os.path.join(self.step.builder.workingDir, os.path.basename(log['lfn'])))
                deleteLogArchives.append(log)
                self.report.addInputFile(sourceName="logArchives", lfn=log['lfn'])
        else:
            logging.error("Unable to copy logArchives to local disk")
            for log in logs:
                self.report.addSkippedFile(log['lfn'], None)

    # create the tarfile if any logArchive was copied in
    if localLogs:
        tarFile = tarfile.open(tarLocation, 'w:')
        for log in localLogs:
            path = log.split('/')
            tarFile.add(name=log, arcname=os.path.join(path[-3], path[-2], path[-1]))
            os.remove(log)
        tarFile.close()
    else:
        msg = "Unable to copy any logArchives to local disk"
        logging.error(msg)
        raise WMExecutionFailure(60312, "LogCollectError", msg)

    # now stage out the LogCollect tarfile
    logging.info("Staging out LogCollect tarfile to Castor and EOS")
    now = datetime.datetime.now()
    lfn = "/store/logs/prod/%i/%.2i/%s/%s/%s" % (now.year, now.month, "WMAgent",
                                                 self.report.data.workload,
                                                 os.path.basename(tarLocation))
    tarInfo = {'LFN': lfn,
               'PFN': tarLocation,
               'PNN': None,
               'GUID': None}

    # perform the mandatory stage out to CERN Castor
    signal.signal(signal.SIGALRM, alarmHandler)
    signal.alarm(waitTime)
    try:
        castorStageOutMgr(tarInfo)
    except Alarm:
        msg = "Indefinite hang during stageOut of LogCollect to Castor"
        logging.error(msg)
        raise WMExecutionFailure(60409, "LogCollectTimeout", msg)
    except Exception as ex:
        msg = "Unable to stageOut LogCollect to Castor:\n"
        msg += str(ex)
        logging.error(msg)
        raise WMExecutionFailure(60408, "LogCollectStageOutError", msg)
    signal.alarm(0)

    # add to job report
    self.report.addOutputFile(outputModule="LogCollect", aFile=tarInfo)
    outputRef = getattr(self.report.data, self.stepName)
    outputRef.output.pfn = tarInfo['PFN']
    outputRef.output.location = tarInfo['PNN']
    outputRef.output.lfn = tarInfo['LFN']

    tarInfo = {'LFN': lfn,
               'PFN': tarLocation,
               'PNN': None,
               'GUID': None}

    # then, perform a best-effort stage out to CERN EOS
    signal.signal(signal.SIGALRM, alarmHandler)
    signal.alarm(waitTime)
    try:
        eosStageOutMgr(tarInfo)
    except Alarm:
        logging.error("Indefinite hang during stageOut of LogCollect to EOS")
    except Exception as ex:
        logging.error("Unable to stageOut LogCollect to EOS: %s", ex)
    signal.alarm(0)

    # we got this far, so delete the input logArchives
    for log in deleteLogArchives:
        # give up after a timeout of 1 minute
        signal.signal(signal.SIGALRM, alarmHandler)
        signal.alarm(60)
        try:
            fileToDelete = {'LFN': log['lfn'],
                            'PFN': None,
                            'PNN': None,
                            'StageOutCommand': None}
            deleteMgr(fileToDelete=fileToDelete)
        except Alarm:
            logging.error("Indefinite hang during delete of logArchive")
        except Exception as ex:
            logging.error("Unable to delete logArchive: %s", ex)
        signal.alarm(0)

    return
def processData(self, sqlstmt, binds={}, conn=None,
                transaction=False, returnCursor=False):
    """
    set conn if you already have an active connection to reuse
    set transaction = True if you already have an active transaction
    """
    connection = None
    try:
        if not conn:
            connection = self.connection()
        else:
            connection = conn

        result = []
        # Can take either a single statement or a list of statements and binds
        sqlstmt = self.makelist(sqlstmt)
        binds = self.makelist(binds)
        if len(sqlstmt) > 0 and (len(binds) == 0 or binds[0] == {} or binds[0] is None):
            # Should only be run by create statements
            if not transaction:
                # WMCore.WMLogging.sqldebug("transaction created in DBInterface")
                trans = connection.begin()

            for i in sqlstmt:
                r = self.executebinds(i, connection=connection,
                                      returnCursor=returnCursor)
                result.append(r)

            if not transaction:
                trans.commit()
        elif len(binds) > len(sqlstmt) and len(sqlstmt) == 1:
            # Run a single SQL statement for a list of binds - use execute_many()
            if not transaction:
                trans = connection.begin()

            for subBinds in grouper(binds, self.maxBindsPerQuery):
                result.extend(self.executemanybinds(sqlstmt[0], subBinds,
                                                    connection=connection,
                                                    returnCursor=returnCursor))

            if not transaction:
                trans.commit()
        elif len(binds) == len(sqlstmt):
            # Run a list of SQL statements for a list of binds
            if not transaction:
                trans = connection.begin()

            for i, s in enumerate(sqlstmt):
                b = binds[i]
                r = self.executebinds(s, b, connection=connection,
                                      returnCursor=returnCursor)
                result.append(r)

            if not transaction:
                trans.commit()
        else:
            self.logger.exception("DBInterface.processData Nothing executed, problem with your arguments")
            self.logger.exception("DBInterface.processData SQL = %s" % sqlstmt)
            WMCore.WMLogging.sqldebug('DBInterface.processData sql is %s items long' % len(sqlstmt))
            WMCore.WMLogging.sqldebug('DBInterface.processData binds are %s items long' % len(binds))
            assert_value = False
            if len(binds) == len(sqlstmt):
                assert_value = True
            WMCore.WMLogging.sqldebug('DBInterface.processData are binds and sql the same length? : %s' % assert_value)
            WMCore.WMLogging.sqldebug('sql: %s\n binds: %s\n, connection:%s\n, transaction:%s\n' %
                                      (sqlstmt, binds, connection, transaction))
            WMCore.WMLogging.sqldebug('type check:\nsql: %s\n binds: %s\n, connection:%s\n, transaction:%s\n' %
                                      (type(sqlstmt), type(binds), type(connection), type(transaction)))
            raise Exception("""DBInterface.processData Nothing executed, problem with your arguments
            Probably mismatched sizes for sql (%i) and binds (%i)""" % (len(sqlstmt), len(binds)))
    finally:
        if not conn and connection is not None:
            connection.close()  # Return connection to the pool

    return result
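The middle branch is why grouper appears here: database drivers limit how many bind sets a single executemany() call may carry, so one statement is executed in bounded batches. A toy illustration; the maxBindsPerQuery value is an assumption, as the real one comes from configuration:

from itertools import islice

def grouper(iterable, n):  # fixed-size chunking, as sketched earlier
    it = iter(iterable)
    chunk = list(islice(it, n))
    while chunk:
        yield chunk
        chunk = list(islice(it, n))

maxBindsPerQuery = 500                     # assumption: set via configuration
binds = [{'id': i} for i in range(1200)]   # made-up bind dictionaries
for subBinds in grouper(binds, maxBindsPerQuery):
    print("executemany() with %d bind sets" % len(subBinds))
# -> one and the same SQL statement run in batches of 500, 500 and 200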
def submit(self, jobs, info=None):
    """
    _submit_

    Submit jobs for one subscription
    """
    # If we're here, then we have submitter components
    self.scriptFile = self.config.JobSubmitter.submitScript
    self.submitDir = self.config.JobSubmitter.submitDir
    timeout = getattr(self.config.JobSubmitter, 'getTimeout', 400)

    successfulJobs = []
    failedJobs = []
    jdlFiles = []

    if len(jobs) == 0:
        # Then we have nothing to do
        return successfulJobs, failedJobs

    if len(self.pool) == 0:
        # Starting things up
        # This is obviously a submit API
        logging.info("Starting up PyCondorPlugin worker pool")
        self.inputQueue = multiprocessing.Queue()
        self.result = multiprocessing.Queue()
        for x in range(self.nProcess):
            p = multiprocessing.Process(target=submitWorker,
                                        args=(self.inputQueue, self.result, timeout))
            p.start()
            self.pool.append(p)

    if not os.path.exists(self.submitDir):
        os.makedirs(self.submitDir)

    # Submit the jobs
    nSubmits = 0
    queueError = False
    for jobsReady in grouper(jobs, self.jobsPerWorker):

        if queueError:
            # If the queue has failed, then we must not process any more jobs this cycle.
            break

        idList = [x['id'] for x in jobsReady]
        jdlList = self.makeSubmit(jobList=jobsReady)
        if not jdlList:
            # Then we got nothing
            logging.error("No JDL file made!")
            return {'NoResult': [0]}
        jdlFile = "%s/submit_%i_%i.jdl" % (self.submitDir, os.getpid(), idList[0])
        with open(jdlFile, 'w') as handle:
            handle.writelines(jdlList)
        jdlFiles.append(jdlFile)

        # Now submit them
        logging.info("About to submit %i jobs", len(jobsReady))
        if self.glexecPath:
            command = 'CS=`which condor_submit`; '
            if self.glexecWrapScript:
                command += 'export GLEXEC_ENV=`%s 2>/dev/null`; ' % self.glexecWrapScript
            command += 'export GLEXEC_CLIENT_CERT=%s; ' % self.glexecProxyFile
            command += 'export GLEXEC_SOURCE_PROXY=%s; ' % self.glexecProxyFile
            command += 'export X509_USER_PROXY=%s; ' % self.glexecProxyFile
            command += 'export GLEXEC_TARGET_PROXY=%s; ' % self.jdlProxyFile
            if self.glexecUnwrapScript:
                command += '%s %s -- $CS %s' % (self.glexecPath, self.glexecUnwrapScript, jdlFile)
            else:
                command += '%s $CS %s' % (self.glexecPath, jdlFile)
        else:
            command = "condor_submit %s" % jdlFile

        try:
            self.inputQueue.put({'command': command, 'idList': idList})
        except AssertionError as ex:
            msg = "Critical error: input pipeline probably closed.\n"
            msg += str(ex)
            msg += "Error Procedure: Something critical has happened in the worker process\n"
            msg += "We will now proceed to pull all useful data from the queue (if it exists)\n"
            msg += "Then refresh the worker pool\n"
            logging.error(msg)
            queueError = True
            break
        nSubmits += 1

    # Now we should have sent all jobs to be submitted
    # Going to do the rest of it now
    for dummy in range(nSubmits):
        try:
            res = self.result.get(block=True, timeout=timeout)
        except Queue.Empty:
            # If the queue was empty, go to the next submit
            # Those jobs have vanished
            logging.error("Queue.Empty error received!")
            logging.error("This could indicate a critical condor error!")
            logging.error("However, no information of any use was obtained due to process failure.")
            logging.error("Either process failed, or process timed out after %s seconds.", timeout)
            continue
        except AssertionError as ex:
            msg = "Found Assertion error while retrieving output from worker process.\n"
            msg += str(ex)
            msg += "This indicates something critical happened to a worker process\n"
            msg += "We will recover what jobs we know were submitted, and resubmit the rest\n"
            msg += "Refreshing worker pool at end of loop\n"
            logging.error(msg)
            continue

        try:
            dummyOut = res['stdout']
            error = res['stderr']
            idList = res['idList']
            exitCode = res['exitCode']
        except KeyError as ex:
            msg = "Error in finding key from result pipe\n"
            msg += "Something has gone critically wrong in the worker\n"
            try:
                msg += "Result: %s\n" % str(res)
            except Exception:
                pass
            msg += str(ex)
            logging.error(msg)
            continue

        if exitCode != 0:
            logging.error("Condor returned non-zero. Printing out command stderr")
            logging.error(error)
            errorCheck, errorMsg = parseError(error=error)
            logging.error("Processing failed jobs and proceeding to the next jobs.")
            logging.error("Do not restart component.")
        else:
            errorCheck = None

        if errorCheck:
            self.errorCount += 1
            condorErrorReport = Report()
            condorErrorReport.addError("JobSubmit", 61202, "CondorError", errorMsg)
            for jobID in idList:
                for job in jobs:
                    if job.get('id', None) == jobID:
                        job['fwjr'] = condorErrorReport
                        failedJobs.append(job)
                        break
        else:
            if self.errorCount > 0:
                self.errorCount -= 1
            for jobID in idList:
                for job in jobs:
                    if job.get('id', None) == jobID:
                        successfulJobs.append(job)
                        break

        # If we get a lot of errors in a row, it's probably time to
        # stop the component
        if self.errorCount > self.errorThreshold:
            msg = "Exceeded errorThreshold while submitting to condor. Check condor status."
            logging.error(msg)
            raise BossAirPluginException(msg)

    # Remove JDL files unless commanded otherwise
    if self.deleteJDLFiles:
        for f in jdlFiles:
            os.remove(f)

    # When we're finished, clean up the queue workers in order
    # to free up memory (in the midst of the process, the forked
    # memory space shouldn't be touched, so it should still be
    # shared, but after this point any action by the Submitter will
    # result in memory duplication).
    logging.info("Purging worker pool to clean up memory")
    self.close()

    # We must return a list of jobs successfully submitted
    # and a list of jobs that failed
    logging.info("Done submitting jobs for this cycle in PyCondorPlugin")
    return successfulJobs, failedJobs
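The batching above pairs grouper with a multiprocessing worker pool: each slice of jobsPerWorker jobs becomes one message on the input queue, and one result is read back per submit. A stripped-down sketch of that producer/consumer shape; the toy worker below only echoes its batch, whereas the real submitWorker runs condor_submit, and the jobsPerWorker value of 10 is an assumption:

import multiprocessing
from itertools import islice

def grouper(iterable, n):  # fixed-size chunking, as sketched earlier
    it = iter(iterable)
    chunk = list(islice(it, n))
    while chunk:
        yield chunk
        chunk = list(islice(it, n))

def submitWorker(inQueue, outQueue):
    """Toy worker: echo each message's job ids with a fake exit code."""
    while True:
        work = inQueue.get()
        if work is None:          # sentinel: shut the worker down
            break
        outQueue.put({'idList': work['idList'], 'exitCode': 0})

if __name__ == '__main__':
    inputQueue = multiprocessing.Queue()
    result = multiprocessing.Queue()
    worker = multiprocessing.Process(target=submitWorker, args=(inputQueue, result))
    worker.start()

    jobs = [{'id': i} for i in range(25)]          # made-up jobs
    nSubmits = 0
    for jobsReady in grouper(jobs, 10):            # jobsPerWorker = 10 (assumption)
        inputQueue.put({'command': 'condor_submit ...',  # placeholder command
                        'idList': [j['id'] for j in jobsReady]})
        nSubmits += 1

    for _ in range(nSubmits):
        print(result.get(timeout=10))              # batches of 10, 10 and 5 ids

    inputQueue.put(None)                           # stop the worker
    worker.join()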