def loadFilesByBlock(self, blockname):
    """
    _loadFilesByBlock_

    Get all files associated with a block
    """
    dbsFiles = []

    findFiles = self.daoFactory(classname="LoadFilesByBlock")
    results = findFiles.execute(blockname=blockname, transaction=False)

    for entry in results:
        # Add loaded information
        dbsfile = DBSBufferFile(id=entry['id'])
        dbsfile.update(entry)
        dbsFiles.append(dbsfile)

    for dbsfile in dbsFiles:
        if 'runInfo' in dbsfile.keys():
            # Then we have to replace it with a real run
            for r in dbsfile['runInfo'].keys():
                run = Run(runNumber=r)
                run.extend(dbsfile['runInfo'][r])
                dbsfile.addRun(run)
            del dbsfile['runInfo']
        if 'parentLFNs' in dbsfile.keys():
            # Then we have some parents
            for lfn in dbsfile['parentLFNs']:
                newFile = DBSBufferFile(lfn=lfn)
                dbsfile['parents'].add(newFile)
            del dbsfile['parentLFNs']

    return dbsFiles
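# The 'runInfo' -> Run translation above recurs in most of the loaders in this
# code. A minimal standalone sketch of that translation, assuming a plain dict
# stands in for the DBSBufferFile and that the installed WMCore version exposes
# Run.extendLumis(); older versions provide the list-style Run.extend() used
# above instead. The run/lumi numbers are illustrative.
from WMCore.DataStructs.Run import Run

fakeDbsFile = {'runInfo': {207214: [10, 11, 12], 207215: [1]}, 'runs': []}

for runNumber, lumis in fakeDbsFile['runInfo'].items():
    run = Run(runNumber)
    run.extendLumis(lumis)
    fakeDbsFile['runs'].append(run)
del fakeDbsFile['runInfo']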
def findUploadableFilesByDAS(self, datasetpath):
    """
    _findUploadableDAS_

    Find all the uploadable files for a given DatasetPath.
    """
    dbsFiles = []

    findFiles = self.daoFactory(classname="LoadDBSFilesByDAS")
    results = findFiles.execute(datasetpath=datasetpath, transaction=False)

    for entry in results:
        # Add loaded information
        dbsfile = DBSBufferFile(id=entry['id'])
        dbsfile.update(entry)
        dbsFiles.append(dbsfile)

    for dbsfile in dbsFiles:
        if 'runInfo' in dbsfile.keys():
            # Then we have to replace it with a real run
            for r in dbsfile['runInfo'].keys():
                run = Run(runNumber=r)
                run.extend(dbsfile['runInfo'][r])
                dbsfile.addRun(run)
            del dbsfile['runInfo']
        if 'parentLFNs' in dbsfile.keys():
            # Then we have some parents
            for lfn in dbsfile['parentLFNs']:
                newFile = DBSBufferFile(lfn=lfn)
                dbsfile['parents'].add(newFile)
            del dbsfile['parentLFNs']

    return dbsFiles
def getChunkFiles(self, collectionName, filesetName, chunkOffset, chunkSize=100,
                  user="******", group="cmsdataops"):
    """
    _getChunkFiles_

    Retrieve a chunk of files from the given collection and task.
    """
    chunkFiles = []
    result = self.couchdb.loadView("ACDC", "owner_coll_fileset_files",
                                   {"startkey": [group, user, collectionName, filesetName],
                                    "endkey": [group, user, collectionName, filesetName, {}],
                                    "limit": chunkSize,
                                    "skip": chunkOffset,
                                    }, [])

    for row in result["rows"]:
        resultRow = row['value']
        newFile = File(lfn=resultRow["lfn"], size=resultRow["size"],
                       events=resultRow["events"],
                       parents=set(resultRow["parents"]),
                       locations=set(resultRow["locations"]),
                       merged=resultRow["merged"])
        for run in resultRow["runs"]:
            newRun = Run(run["run_number"])
            newRun.extend(run["lumis"])
            newFile.addRun(newRun)

        chunkFiles.append(newFile)

    return chunkFiles
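# Hypothetical caller-side sketch: paging through an ACDC collection with
# getChunkFiles(). The service object 'dcs', the collection and fileset names,
# and the chunk size are illustrative assumptions, not values taken from this
# code.
chunkOffset = 0
chunkSize = 100
allFiles = []
while True:
    chunk = dcs.getChunkFiles("ACDC_TestCollection", "/ACDC_TestCollection/Production",
                              chunkOffset, chunkSize)
    if not chunk:
        break
    allFiles.extend(chunk)
    chunkOffset += chunkSize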
def injectFilesFromDBS(inputFileset, datasetPath, runsWhiteList=[]):
    """
    _injectFilesFromDBS_

    """
    print("injecting files from %s into %s, please wait..." % (datasetPath,
                                                               inputFileset.name))
    args = {}
    args["url"] = "https://cmsweb.cern.ch/dbs/prod/global/DBSReader"
    args["version"] = "DBS_2_1_1"
    args["mode"] = "GET"
    dbsApi = DbsApi(args)
    dbsResults = dbsApi.listFileArray(path=datasetPath,
                                      retriveList=["retrive_lumi", "retrive_run"])
    print(" found %d files, inserting into wmbs..." % (len(dbsResults)))

    for dbsResult in dbsResults:
        if runsWhiteList and str(dbsResult["LumiList"][0]["RunNumber"]) not in runsWhiteList:
            continue
        myFile = File(lfn=dbsResult["LogicalFileName"], size=dbsResult["FileSize"],
                      events=dbsResult["NumberOfEvents"],
                      checksums={"cksum": dbsResult["Checksum"]},
                      locations="cmssrm.fnal.gov", merged=True)
        myRun = Run(runNumber=dbsResult["LumiList"][0]["RunNumber"])
        for lumi in dbsResult["LumiList"]:
            myRun.appendLumi(lumi["LumiSectionNumber"])
        myFile.addRun(myRun)
        myFile.create()
        inputFileset.addFile(myFile)

    if len(inputFileset) < 1:
        raise Exception("No files were selected!")

    inputFileset.commit()
    inputFileset.markOpen(False)
    return
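# Hypothetical driver for injectFilesFromDBS(); the fileset name, dataset path
# and run whitelist are placeholders, and Fileset here is the WMBS Fileset used
# elsewhere in this code.
inputFileset = Fileset(name="TestFileset")
inputFileset.create()
injectFilesFromDBS(inputFileset, "/MinimumBias/Run2012A-v1/RAW",
                   runsWhiteList=["190000"])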
def getRunLumis(self, fileBinds, fileList, conn=None, transaction=False):
    """
    _getRunLumis_

    Fetch run/lumi/events information for each file and append Run
    objects to the files information.
    """
    lumiResult = self.dbi.processData(self.runLumiSQL, fileBinds, conn=conn,
                                      transaction=transaction)
    lumiList = self.formatDict(lumiResult)

    lumiDict = {}
    for l in lumiList:
        lumiDict.setdefault(l['fileid'], [])
        lumiDict[l['fileid']].append(l)

    for f in fileList:
        # Add new runs
        f.setdefault('newRuns', [])

        fileRuns = {}
        if f['id'] in lumiDict.keys():
            for l in lumiDict[f['id']]:
                run = l['run']
                lumi = l['lumi']
                numEvents = l['num_events']
                fileRuns.setdefault(run, [])
                fileRuns[run].append((lumi, numEvents))

        for r in fileRuns.keys():
            newRun = Run(runNumber=r)
            newRun.lumis = fileRuns[r]
            f['newRuns'].append(newRun)

    return
def prepareDBSFiles(self):
    """
    _prepareDBSFiles_

    Retrieve the information from the JSON input data and create
    DBSFile objects that can be registered in the database.
    """
    timestamp = time.strftime('%m%d%y_%H%M%S')
    for fileEntry in self.inputData:
        # Get all the info out of a standard named dataset
        datasetInfo = str(fileEntry["dataset"])
        tokens = datasetInfo.split('/')
        primDs = tokens[1]
        procDs = tokens[2]
        dataTier = tokens[3]
        procDsTokens = procDs.split('-')
        acqEra = procDsTokens[0]
        procVer = procDsTokens[-1][1:]

        ckSumInfo = fileEntry["checksums"]
        for entry in ckSumInfo:
            ckSumInfo[entry] = str(ckSumInfo[entry])

        # Build the basic dbsBuffer file
        dbsFile = DBSBufferFile(lfn=str(fileEntry["lfn"]),
                                size=int(fileEntry.get("size", 0)),
                                events=int(fileEntry.get("events", 0)),
                                checksums=ckSumInfo,
                                status="NOTUPLOADED")
        dbsFile.setAlgorithm(appName="cmsRun",
                             appVer=str(fileEntry.get("cmssw", "LEGACY")),
                             appFam="Legacy",
                             psetHash="GIBBERISH",
                             configContent="None;;None;;None")
        dbsFile.setDatasetPath("/%s/%s/%s" % (primDs, procDs, dataTier))
        dbsFile.setValidStatus(validStatus="PRODUCTION")
        dbsFile.setProcessingVer(ver=procVer)
        dbsFile.setAcquisitionEra(era=acqEra)
        dbsFile.setGlobalTag(globalTag=str(fileEntry.get('globalTag', "LEGACY")))

        # Build a representative task name
        dbsFile['task'] = '/LegacyInsertionTask_%s/Insertion' % timestamp

        # Get the runs and lumis
        runsAndLumis = fileEntry.get("runsAndLumis", {})
        for run in runsAndLumis:
            newRun = Run(runNumber=int(run))
            newRun.extend([int(x) for x in runsAndLumis[run]])
            dbsFile.addRun(newRun)

        # Complete the file information with the location and queue it
        dbsFile.setLocation(se=str(fileEntry["location"]), immediateSave=False)
        self.dbsFilesToCreate.append(dbsFile)

    self.inputData = None
    return
def loadDBSBufferFilesBulk(self, fileObjs):
    """
    _loadDBSBufferFilesBulk_

    Yes, this is a stupid place to put it.
    No, there's no better place.
    """
    myThread = threading.currentThread()

    dbsFiles = []

    existingTransaction = self.beginTransaction()

    factory = DAOFactory(package="WMComponent.DBSBuffer.Database",
                         logger=myThread.logger,
                         dbinterface=myThread.dbi)

    binds = []
    for f in fileObjs:
        binds.append(f["id"])

    loadFiles = factory(classname="DBSBufferFiles.LoadBulkFilesByID")
    results = loadFiles.execute(files=binds, conn=self.getDBConn(),
                                transaction=self.existingTransaction())

    for entry in results:
        # Add loaded information
        dbsfile = DBSBufferFile(id=entry['id'])
        dbsfile.update(entry)
        dbsFiles.append(dbsfile)

    for dbsfile in dbsFiles:
        if 'runInfo' in dbsfile.keys():
            # Then we have to replace it with a real run
            for r in dbsfile['runInfo'].keys():
                run = Run(runNumber=r)
                run.extend(dbsfile['runInfo'][r])
                dbsfile.addRun(run)
            del dbsfile['runInfo']
        if 'parentLFNs' in dbsfile.keys():
            # Then we have some parents
            for lfn in dbsfile['parentLFNs']:
                newFile = DBSBufferFile(lfn=lfn)
                dbsfile['parents'].add(newFile)
            del dbsfile['parentLFNs']

    self.commitTransaction(existingTransaction)

    return dbsFiles
def addFileToDBS(self, jobReportFile, task, errorDataset=False):
    """
    _addFileToDBS_

    Add a file that was output from a job to the DBS buffer.
    """
    datasetInfo = jobReportFile["dataset"]

    dbsFile = DBSBufferFile(lfn=jobReportFile["lfn"],
                            size=jobReportFile["size"],
                            events=jobReportFile["events"],
                            checksums=jobReportFile["checksums"],
                            status="NOTUPLOADED")
    dbsFile.setAlgorithm(appName=datasetInfo["applicationName"],
                         appVer=datasetInfo["applicationVersion"],
                         appFam=jobReportFile["module_label"],
                         psetHash="GIBBERISH",
                         configContent=jobReportFile.get('configURL'))

    if errorDataset:
        dbsFile.setDatasetPath("/%s/%s/%s" % (datasetInfo["primaryDataset"] + "-Error",
                                              datasetInfo["processedDataset"],
                                              datasetInfo["dataTier"]))
    else:
        dbsFile.setDatasetPath("/%s/%s/%s" % (datasetInfo["primaryDataset"],
                                              datasetInfo["processedDataset"],
                                              datasetInfo["dataTier"]))

    dbsFile.setValidStatus(validStatus=jobReportFile.get("validStatus", None))
    dbsFile.setProcessingVer(ver=jobReportFile.get('processingVer', None))
    dbsFile.setAcquisitionEra(era=jobReportFile.get('acquisitionEra', None))
    dbsFile.setGlobalTag(globalTag=jobReportFile.get('globalTag', None))
    # TODO: need to find where to get the prep id
    dbsFile.setPrepID(prep_id=jobReportFile.get('prep_id', None))
    dbsFile['task'] = task

    for run in jobReportFile["runs"]:
        newRun = Run(runNumber=run.run)
        newRun.extend(run.lumis)
        dbsFile.addRun(newRun)

    dbsFile.setLocation(pnn=list(jobReportFile["locations"])[0], immediateSave=False)
    self.dbsFilesToCreate.append(dbsFile)
    return
def findUploadableFilesByDAS(self, das): """ _findUploadableDAS_ Find all the Dataset-Algo files available with uploadable files. """ myThread = threading.currentThread() existingTransaction = self.beginTransaction() dbsFiles = [] factory = DAOFactory(package = "WMComponent.DBSUpload.Database", logger = myThread.logger, dbinterface = myThread.dbi) findFiles = factory(classname = "LoadDBSFilesByDAS") results = findFiles.execute(das = das, conn = self.getDBConn(), transaction=self.existingTransaction()) for entry in results: # Add loaded information dbsfile = DBSBufferFile(id=entry['id']) dbsfile.update(entry) dbsFiles.append(dbsfile) for dbsfile in dbsFiles: if 'runInfo' in dbsfile.keys(): # Then we have to replace it with a real run for r in dbsfile['runInfo'].keys(): run = Run(runNumber = r) run.extend(dbsfile['runInfo'][r]) dbsfile.addRun(run) del dbsfile['runInfo'] if 'parentLFNs' in dbsfile.keys(): # Then we have some parents for lfn in dbsfile['parentLFNs']: newFile = DBSBufferFile(lfn = lfn) dbsfile['parents'].add(newFile) del dbsfile['parentLFNs'] self.commitTransaction(existingTransaction) return dbsFiles
def loadDBSBufferFilesBulk(self, fileObjs):
    """
    _loadDBSBufferFilesBulk_

    Yes, this is a stupid place to put it.
    No, there's no better place.
    """
    myThread = threading.currentThread()

    dbsFiles = []

    existingTransaction = self.beginTransaction()

    binds = []
    for f in fileObjs:
        binds.append(f["id"])

    loadFiles = self.daoFactory(classname="DBSBufferFiles.LoadBulkFilesByID")
    results = loadFiles.execute(files=binds, conn=self.getDBConn(),
                                transaction=self.existingTransaction())

    for entry in results:
        # Add loaded information
        dbsfile = DBSBufferFile(id=entry["id"])
        dbsfile.update(entry)
        dbsFiles.append(dbsfile)

    for dbsfile in dbsFiles:
        if "runInfo" in dbsfile.keys():
            # Then we have to replace it with a real run
            for r in dbsfile["runInfo"].keys():
                run = Run(runNumber=r)
                run.extend(dbsfile["runInfo"][r])
                dbsfile.addRun(run)
            del dbsfile["runInfo"]
        if "parentLFNs" in dbsfile.keys():
            # Then we have some parents
            for lfn in dbsfile["parentLFNs"]:
                newFile = DBSBufferFile(lfn=lfn)
                dbsfile["parents"].add(newFile)
            del dbsfile["parentLFNs"]

    self.commitTransaction(existingTransaction)

    return dbsFiles
def runHandler():
    """
    _runHandler_

    Sink to add run information to a file.  Given the following XML:

      <Runs>
        <Run ID="122023">
          <LumiSection NEvents="100" ID="215"/>
          <LumiSection NEvents="100" ID="216"/>
        </Run>
        <Run ID="122024">
          <LumiSection ID="1"/>
          <LumiSection ID="2"/>
        </Run>
      </Runs>

    Create a WMCore.DataStructs.Run object for each run and call the
    addRunInfoToFile() function to add the run information to the file section.
    """
    while True:
        fileSection, node = (yield)
        for subnode in node.children:
            if subnode.name == "Run":
                runId = subnode.attrs.get("ID", None)
                if runId is None:
                    continue
                lumis = []
                for lumi in subnode.children:
                    if "ID" in lumi.attrs:
                        lumiNumber = int(lumi.attrs['ID'])
                        nEvents = lumi.attrs.get("NEvents", None)
                        if nEvents is not None:
                            try:
                                nEvents = int(nEvents)
                            except ValueError:
                                nEvents = None
                        lumis.append((lumiNumber, nEvents))
                runInfo = Run(runNumber=runId)
                runInfo.extendLumis(lumis)
                Report.addRunInfoToFile(fileSection, runInfo)
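# runHandler() is a coroutine-style sink: it only reaches the (yield) after it
# has been primed. A sketch of how a parser might drive it; 'fileSection' and
# 'runsNode' are assumed to come from the framework job report XML parser.
handler = runHandler()
next(handler)                          # prime the generator up to (yield)
handler.send((fileSection, runsNode))  # deliver one <Runs> node to the sink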
def loadFilesFromBlocks(self, blockID): """ _loadFilesFromBlocks_ Load the files from all active blocks """ findFiles = self.factory(classname = "LoadFilesFromBlocks") myThread = threading.currentThread() existingTransaction = self.beginTransaction() dbsFiles = [] results = findFiles.execute(blockID = blockID, conn = self.getDBConn(), transaction=self.existingTransaction()) for entry in results: # Add loaded information dbsfile = DBSBufferFile(id=entry['id']) dbsfile.update(entry) dbsFiles.append(dbsfile) for dbsfile in dbsFiles: if 'runInfo' in dbsfile.keys(): # Then we have to replace it with a real run for r in dbsfile['runInfo'].keys(): run = Run(runNumber = r) run.extend(dbsfile['runInfo'][r]) dbsfile.addRun(run) del dbsfile['runInfo'] if 'parentLFNs' in dbsfile.keys(): # Then we have some parents for lfn in dbsfile['parentLFNs']: newFile = DBSBufferFile(lfn = lfn) dbsfile['parents'].add(newFile) del dbsfile['parentLFNs'] self.commitTransaction(existingTransaction) return dbsFiles
def findUploadableFilesByDAS(self, das): """ _findUploadableDAS_ Find all the Dataset-Algo files available with uploadable files. """ myThread = threading.currentThread() existingTransaction = self.beginTransaction() dbsFiles = [] findFiles = self.daoFactory(classname="LoadDBSFilesByDAS") results = findFiles.execute(das=das, conn=self.getDBConn(), transaction=self.existingTransaction()) for entry in results: # Add loaded information dbsfile = DBSBufferFile(id=entry["id"]) dbsfile.update(entry) dbsFiles.append(dbsfile) for dbsfile in dbsFiles: if "runInfo" in dbsfile.keys(): # Then we have to replace it with a real run for r in dbsfile["runInfo"].keys(): run = Run(runNumber=r) run.extend(dbsfile["runInfo"][r]) dbsfile.addRun(run) del dbsfile["runInfo"] if "parentLFNs" in dbsfile.keys(): # Then we have some parents for lfn in dbsfile["parentLFNs"]: newFile = DBSBufferFile(lfn=lfn) dbsfile["parents"].add(newFile) del dbsfile["parentLFNs"] self.commitTransaction(existingTransaction) return dbsFiles
def getChunkFiles(self, collectionName, filesetName, chunkOffset, chunkSize=100): """ _getChunkFiles_ Retrieve a chunk of files from the given collection and task. """ chunkFiles = [] files = self._getFilesetInfo(collectionName, filesetName, chunkOffset, chunkSize) files = mergeFakeFiles(files) for fileInfo in files: newFile = File(lfn=fileInfo["lfn"], size=fileInfo["size"], events=fileInfo["events"], parents=set(fileInfo["parents"]), locations=set(fileInfo["locations"]), merged=fileInfo["merged"]) for run in fileInfo["runs"]: newRun = Run(run["run_number"]) newRun.extend(run["lumis"]) newFile.addRun(newRun) chunkFiles.append(newFile) return chunkFiles
def getFiles(self, name, tier, nFiles=12, site="malpaquet"):
    """
    Create some quick dummy test files
    """
    files = []

    for f in range(0, nFiles):
        testFile = DBSBufferFile(lfn='%s-%s-%i' % (name, site, f), size=1024,
                                 events=20, checksums={'cksum': 1})
        testFile.setAlgorithm(appName=name, appVer="CMSSW_3_1_1",
                              appFam="RECO", psetHash="GIBBERISH",
                              configContent=self.configURL)
        testFile.setDatasetPath("/%s/%s/%s" % (name, name, tier))
        testFile.addRun(Run(1, *[f]))
        testFile.setGlobalTag("aGlobalTag")
        testFile.create()
        testFile.setLocation(site)
        files.append(testFile)

    testFileChild = DBSBufferFile(lfn='%s-%s-child' % (name, site), size=1024,
                                  events=10, checksums={'cksum': 1})
    testFileChild.setAlgorithm(appName=name, appVer="CMSSW_3_1_1",
                               appFam="RECO", psetHash="GIBBERISH",
                               configContent=self.configURL)
    testFileChild.setDatasetPath("/%s/%s_2/RECO" % (name, name))
    testFileChild.addRun(Run(1, *[45]))
    testFileChild.setGlobalTag("aGlobalTag")
    testFileChild.create()
    testFileChild.setLocation(site)

    testFileChild.addParents([x['lfn'] for x in files])

    return files
def getRunLumis(self, fileBinds, fileList, conn=None, transaction=False):
    """
    _getRunLumis_

    Fetch run/lumi/events information for each file and append Run
    objects to the files information.
    """
    if not fileBinds:
        return

    lumiResult = self.dbi.processData(self.runLumiSQL, fileBinds, conn=conn,
                                      transaction=transaction)
    lumiList = self.formatDict(lumiResult)

    lumiDict = {}
    for l in lumiList:
        lumiDict.setdefault(l['fileid'], [])
        lumiDict[l['fileid']].append(l)

    for f in fileList:
        # Add new runs
        f.setdefault('newRuns', [])

        fileRuns = {}
        if f['id'] in lumiDict:
            for l in lumiDict[f['id']]:
                run = l['run']
                lumi = l['lumi']
                numEvents = l['num_events']
                fileRuns.setdefault(run, [])
                fileRuns[run].append((lumi, numEvents))

        for r in fileRuns:
            newRun = Run(runNumber=r)
            newRun.lumis = fileRuns[r]
            f['newRuns'].append(newRun)

    return
def loadFilesByBlock(self, blockname): """ _loadFilesByBlock_ Get all files associated with a block """ dbsFiles = [] existingTransaction = self.beginTransaction() findFiles = self.daoFactory(classname="LoadFilesByBlock") results = findFiles.execute(blockname=blockname, conn=self.getDBConn(), transaction=self.existingTransaction()) for entry in results: # Add loaded information dbsfile = DBSBufferFile(id=entry["id"]) dbsfile.update(entry) dbsFiles.append(dbsfile) for dbsfile in dbsFiles: if "runInfo" in dbsfile.keys(): # Then we have to replace it with a real run for r in dbsfile["runInfo"].keys(): run = Run(runNumber=r) run.extend(dbsfile["runInfo"][r]) dbsfile.addRun(run) del dbsfile["runInfo"] if "parentLFNs" in dbsfile.keys(): # Then we have some parents for lfn in dbsfile["parentLFNs"]: newFile = DBSBufferFile(lfn=lfn) dbsfile["parents"].add(newFile) del dbsfile["parentLFNs"] self.commitTransaction(existingTransaction) return dbsFiles
def loadDBSBufferFilesBulk(self, fileObjs):
    """
    _loadDBSBufferFilesBulk_

    Yes, this is a stupid place to put it.
    No, there's no better place.
    """
    dbsFiles = []

    binds = []
    for f in fileObjs:
        binds.append(f["id"])

    loadFiles = self.daoFactory(classname="DBSBufferFiles.LoadBulkFilesByID")
    results = loadFiles.execute(files=binds, transaction=False)

    for entry in results:
        # Add loaded information
        dbsfile = DBSBufferFile(id=entry['id'])
        dbsfile.update(entry)
        dbsFiles.append(dbsfile)

    for dbsfile in dbsFiles:
        if 'runInfo' in dbsfile.keys():
            # Then we have to replace it with a real run
            for r in dbsfile['runInfo'].keys():
                run = Run(runNumber=r)
                run.extend(dbsfile['runInfo'][r])
                dbsfile.addRun(run)
            del dbsfile['runInfo']
        if 'parentLFNs' in dbsfile.keys():
            # Then we have some parents
            for lfn in dbsfile['parentLFNs']:
                newFile = DBSBufferFile(lfn=lfn)
                dbsfile['parents'].add(newFile)
            del dbsfile['parentLFNs']

    return dbsFiles
def createTestJob(self, subscriptionType="Merge"): """ _createTestJob_ Create a test job with two files as input. This will also create the appropriate workflow, jobgroup and subscription. """ testWorkflow = Workflow(spec=makeUUID(), owner="Simon", name=makeUUID(), task="Test") testWorkflow.create() testWMBSFileset = Fileset(name="TestFileset") testWMBSFileset.create() testSubscription = Subscription(fileset=testWMBSFileset, workflow=testWorkflow, type=subscriptionType) testSubscription.create() testJobGroup = JobGroup(subscription=testSubscription) testJobGroup.create() testFileA = File(lfn="/this/is/a/lfnA", size=1024, events=10) testFileA.addRun(Run(1, *[45])) testFileB = File(lfn="/this/is/a/lfnB", size=1024, events=10) testFileB.addRun(Run(1, *[46])) testFileA.create() testFileB.create() testJob = Job(name=makeUUID(), files=[testFileA, testFileB]) testJob["couch_record"] = "somecouchrecord" testJob["location"] = "test.site.ch" testJob.create(group=testJobGroup) testJob.associateFiles() return testJob
def testSetLocationTransaction(self): """ _testSetLocationTransaction_ Create a file at specific locations and commit everything to the database. Reload the file from the database and verify that the locations are correct. Rollback the database transaction and once again reload the file. Verify that the original locations are back. """ testFileA = File(lfn="/this/is/a/lfn", size=1024, events=10, checksums={'cksum': 1}) testFileA.addRun(Run(1, *[45])) testFileA.create() testFileA.setLocation(["se1.fnal.gov"]) myThread = threading.currentThread() myThread.transaction.begin() testFileA.setLocation(["se1.cern.ch"]) testFileA.setLocation(["bunkse1.fnal.gov", "bunkse1.cern.ch"], immediateSave=False) testFileB = File(id=testFileA["id"]) testFileB.loadData() goldenLocations = ["se1.fnal.gov", "se1.cern.ch"] for location in testFileB["locations"]: assert location in goldenLocations, \ "ERROR: Unknown file location" goldenLocations.remove(location) assert len(goldenLocations) == 0, \ "ERROR: Some locations are missing" myThread.transaction.rollback() testFileB.loadData() goldenLocations = ["se1.fnal.gov"] for location in testFileB["locations"]: assert location in goldenLocations, \ "ERROR: Unknown file location" goldenLocations.remove(location) assert len(goldenLocations) == 0, \ "ERROR: Some locations are missing" return
def createTestJobs(self, nJobs, cacheDir): """ _createTestJobs_ Create several jobs """ testWorkflow = Workflow(spec = "spec.xml", owner = "Simon", name = "wf001", task="Test") testWorkflow.create() testWMBSFileset = Fileset(name = "TestFileset") testWMBSFileset.create() testSubscription = Subscription(fileset = testWMBSFileset, workflow = testWorkflow, type = "Processing", split_algo = "FileBased") testSubscription.create() testJobGroup = JobGroup(subscription = testSubscription) testJobGroup.create() # Create a file testFileA = File(lfn = "/this/is/a/lfnA", size = 1024, events = 10) testFileA.addRun(Run(10, *[12312])) testFileA.setLocation('malpaquet') testFileA.create() baseName = makeUUID() # Now create a job for i in range(nJobs): testJob = Job(name = '%s-%i' % (baseName, i)) testJob.addFile(testFileA) testJob['location'] = 'malpaquet' testJob['retry_count'] = 1 testJob['retry_max'] = 10 testJob.create(testJobGroup) testJob.save() testJobGroup.add(testJob) testJobGroup.commit() # Set test job caches for job in testJobGroup.jobs: job.setCache(cacheDir) return testJobGroup
def _addDBSFileToWMBSFile(self, dbsFile, storageElements, inFileset=True):
    """
    Two assumptions are made for this method to behave properly:

    1. DBS returns only one level of ParentList. If DBS returns multiple
       levels of parentage, they will still get handled, but that might not
       be what we want; in that case, restrict it to one level.
    2. Parent files are in the same location as their child files. This is
       not true in the general case, but the workqueue should only select
       work where child and parent files are in the same location.
    """
    wmbsParents = []
    dbsFile.setdefault("ParentList", [])
    for parent in dbsFile["ParentList"]:
        wmbsParents.append(self._addDBSFileToWMBSFile(parent, storageElements,
                                                      inFileset=False))

    checksums = {}
    if dbsFile.get('Checksum'):
        checksums['cksum'] = dbsFile['Checksum']
    if dbsFile.get('Adler32'):
        checksums['adler32'] = dbsFile['Adler32']

    wmbsFile = File(lfn=dbsFile["LogicalFileName"],
                    size=dbsFile["FileSize"],
                    events=dbsFile["NumberOfEvents"],
                    checksums=checksums,
                    # TODO: need to get list of parent lfn
                    parents=wmbsParents,
                    locations=set(storageElements))

    for lumi in dbsFile['LumiList']:
        run = Run(lumi['RunNumber'], lumi['LumiSectionNumber'])
        wmbsFile.addRun(run)

    self._addToDBSBuffer(dbsFile, checksums, storageElements)

    logging.info("WMBS File: %s\n on Location: %s" % (wmbsFile['lfn'],
                                                      wmbsFile['newlocations']))

    wmbsFile['inFileset'] = bool(inFileset)

    self.wmbsFilesToCreate.append(wmbsFile)

    return wmbsFile
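# Shape of the DBS file dictionary the method above consumes; the keys match
# those accessed in the code, while the values are illustrative only.
dbsFile = {
    "LogicalFileName": "/store/data/Run2012A/MinimumBias/RAW/v1/000/190/000/abcd.root",
    "FileSize": 2048,
    "NumberOfEvents": 500,
    "Checksum": "2212831827",
    "Adler32": "ff810ec3",
    "ParentList": [],
    "LumiList": [{"RunNumber": 190000, "LumiSectionNumber": 1},
                 {"RunNumber": 190000, "LumiSectionNumber": 2}],
}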
def getInputFilesFromStep(self, stepName, inputSource=None): """ _getInputFilesFromStep_ Retrieve a list of input files from the given step. """ step = self.retrieveStep(stepName) if inputSource is None: inputSources = step.input.listSections_() else: inputSources = [inputSource] inputFiles = [] for inputSource in inputSources: source = getattr(step.input, inputSource) for fileNum in range(source.files.fileCount): fwjrFile = getattr(source.files, "file%d" % fileNum) lfn = getattr(fwjrFile, "lfn", None) pfn = getattr(fwjrFile, "pfn", None) size = getattr(fwjrFile, "size", 0) events = getattr(fwjrFile, "events", 0) branches = getattr(fwjrFile, "branches", []) catalog = getattr(fwjrFile, "catalog", None) guid = getattr(fwjrFile, "guid", None) inputSourceClass = getattr(fwjrFile, "input_source_class", None) moduleLabel = getattr(fwjrFile, "module_label", None) inputType = getattr(fwjrFile, "input_type", None) inputFile = File(lfn=lfn, size=size, events=events) inputFile["pfn"] = pfn inputFile["branches"] = branches inputFile["catalog"] = catalog inputFile["guid"] = guid inputFile["input_source_class"] = inputSourceClass inputFile["module_label"] = moduleLabel inputFile["input_type"] = inputType runSection = getattr(fwjrFile, "runs") runNumbers = runSection.listSections_() for runNumber in runNumbers: lumiTuple = getattr(runSection, str(runNumber)) inputFile.addRun(Run(int(runNumber), *lumiTuple)) inputFiles.append(inputFile) return inputFiles
def testA(self):
    """instantiation"""
    run1 = Run()
    self.assertEqual(run1.run, None)
    self.assertEqual(len(run1), 0)

    run2 = Run(1000000)
    self.assertEqual(run2.run, 1000000)
    self.assertEqual(len(run2), 0)

    run3 = Run(1000000, 1)
    self.assertEqual(run3.run, 1000000)
    self.assertEqual(len(run3), 1)
    self.assertEqual(run3[0], 1)

    run4 = Run(1000000, 1, 2, 3, 4, 5)
    self.assertEqual(run4.run, 1000000)
    self.assertEqual(len(run4), 5)
    self.assertEqual(run4[0], 1)
    self.assertEqual(run4[4], 5)
def testLocationMerging(self): """ _testLocationMerging_ Verify that files residing on different SEs are not merged together in the same job. """ self.stuffWMBS() locationAction = self.daoFactory(classname="Locations.New") locationAction.execute(siteName="s2", seName="somese2.cern.ch") fileSite2 = File(lfn="fileSite2", size=4098, events=1024, first_event=0, locations=set(["somese2.cern.ch"])) fileSite2.addRun(Run(1, *[46])) fileSite2.create() self.mergeFileset.addFile(fileSite2) self.mergeFileset.commit() splitter = SplitterFactory() jobFactory = splitter(package="WMCore.WMBS", subscription=self.mergeSubscription) result = jobFactory(min_merge_size=4097, max_merge_size=99999999, max_merge_events=999999999, merge_across_runs=False) assert len(result) == 1, \ "ERROR: More than one JobGroup returned." assert len(result[0].jobs) == 3, \ "ERROR: Three jobs should have been returned." for job in result[0].jobs: firstInputFile = job.getFiles()[0] baseLocation = list(firstInputFile["locations"])[0] for inputFile in job.getFiles(): assert len(inputFile["locations"]) == 1, \ "Error: Wrong number of locations" assert list(inputFile["locations"])[0] == baseLocation, \ "Error: Wrong location." return
def testLocationMerging(self): """ _testLocationMerging_ Verify that files residing on different SEs are not merged together in the same job. """ self.stuffWMBS() locationAction = self.daoFactory(classname="Locations.New") locationAction.execute(siteName="T1_UK_RAL", pnn="T1_UK_RAL_Disk") fileSite2 = File(lfn="fileRAL", size=4098, events=1024, first_event=0, locations=set(["T1_UK_RAL_Disk"])) fileSite2.addRun(Run(1, *[46])) fileSite2.create() self.mergeFileset.addFile(fileSite2) self.mergeFileset.commit() splitter = SplitterFactory() jobFactory = splitter(package="WMCore.WMBS", subscription=self.mergeSubscription) result = jobFactory(min_merge_size=4097, max_merge_size=99999999, max_merge_events=999999999, merge_across_runs=False) assert len(result) == 1, \ "ERROR: More than one JobGroup returned." assert len(result[0].jobs) == 3, \ "ERROR: Three jobs should have been returned." ralJobs = 0 fnalJobs = 0 for job in result[0].jobs: if job["possiblePSN"] == set(["T1_UK_RAL"]): ralJobs += 1 elif job["possiblePSN"] == set(["T1_US_FNAL"]): fnalJobs += 1 self.assertEqual(ralJobs, 1) self.assertEqual(fnalJobs, 2) return
def createFile(self, lfn, events, run, lumis, location):
    """
    _createFile_

    Create a file for testing
    """
    newFile = File(lfn=lfn, size=1000, events=events)
    lumiList = []
    for lumi in range(lumis):
        lumiList.append((run * lumis) + lumi)
    newFile.addRun(Run(run, *lumiList))
    newFile.setLocation(location)
    return newFile
def getOutputFile(self, fileName, outputModule, step): """ _getOutputFile_ Takes a fileRef object and returns a DataStructs/File object as output """ outputMod = self.getOutputModule(step=step, outputModule=outputModule) if not outputMod: return None fileRef = getattr(outputMod.files, fileName, None) newFile = File(locations=set()) #Locations newFile.setLocation(getattr(fileRef, "location", None)) #Runs runList = fileRef.runs.listSections_() for run in runList: lumis = getattr(fileRef.runs, run) newRun = Run(int(run), *lumis) newFile.addRun(newRun) newFile["lfn"] = getattr(fileRef, "lfn", None) newFile["pfn"] = getattr(fileRef, "pfn", None) newFile["events"] = int(getattr(fileRef, "events", 0)) newFile["size"] = int(getattr(fileRef, "size", 0)) newFile["branches"] = getattr(fileRef, "branches", []) newFile["input"] = getattr(fileRef, "input", []) newFile["inputpfns"] = getattr(fileRef, "inputpfns", []) newFile["branch_hash"] = getattr(fileRef, "branch_hash", None) newFile["catalog"] = getattr(fileRef, "catalog", "") newFile["guid"] = getattr(fileRef, "guid", "") newFile["module_label"] = getattr(fileRef, "module_label", "") newFile["checksums"] = getattr(fileRef, "checksums", {}) newFile["merged"] = bool(getattr(fileRef, "merged", False)) newFile["dataset"] = getattr(fileRef, "dataset", {}) newFile["acquisitionEra"] = getattr(fileRef, 'acquisitionEra', None) newFile["processingVer"] = getattr(fileRef, 'processingVer', None) newFile["validStatus"] = getattr(fileRef, 'validStatus', None) newFile["globalTag"] = getattr(fileRef, 'globalTag', None) newFile["prep_id"] = getattr(fileRef, 'prep_id', None) newFile['configURL'] = getattr(fileRef, 'configURL', None) newFile['inputPath'] = getattr(fileRef, 'inputPath', None) newFile["outputModule"] = outputModule newFile["fileRef"] = fileRef return newFile
def createSubscription(self, nFiles, lumisPerFile, twoSites=False): """ _createSubscription_ Create a subscription for testing """ baseName = makeUUID() testFileset = Fileset(name=baseName) for i in range(nFiles): newFile = File(lfn='%s_%i' % (baseName, i), size=1000, events=100) lumis = [] for lumi in range(lumisPerFile): lumis.append((i * 100) + lumi) newFile.addRun(Run(i, *lumis)) newFile.setLocation('blenheim') testFileset.addFile(newFile) if twoSites: for i in range(nFiles): newFile = File(lfn='%s_%i_2' % (baseName, i), size=1000, events=100) lumis = [] for lumi in range(lumisPerFile): lumis.append(5 + 10 * (i * 100) + lumi) #lumis should be different newFile.addRun(Run(i, *lumis)) newFile.setLocation('malpaquet') testFileset.addFile(newFile) testSubscription = Subscription(fileset=testFileset, workflow=self.testWorkflow, split_algo="LumiBased", type="Processing") return testSubscription
def injectFilesFromDBS(inputFileset, datasetPath): """ _injectFilesFromDBS_ """ print("injecting files from %s into %s, please wait..." % (datasetPath, inputFileset.name)) args={} args["url"] = "https://cmsweb.cern.ch/dbs/prod/global/DBSReader" args["mode"] = "GET" dbsApi = DbsApi(args) dbsResults = dbsApi.listFileArray(path = datasetPath, retriveList = ["retrive_lumi", "retrive_run"]) # Limiter on number of files dbsResults = dbsResults[0:20] print(" found %d files, inserting into wmbs..." % (len(dbsResults))) for dbsResult in dbsResults: myFile = File(lfn = dbsResult["LogicalFileName"], size = dbsResult["FileSize"], events = dbsResult["NumberOfEvents"], checksums = {"cksum": dbsResult["Checksum"]}, locations = "cmssrm.fnal.gov", merged = True) myRun = Run(runNumber = dbsResult["LumiList"][0]["RunNumber"]) for lumi in dbsResult["LumiList"]: myRun.appendLumi(lumi["LumiSectionNumber"]) myFile.addRun(myRun) myFile.create() inputFileset.addFile(myFile) dbsFile = DBSBufferFile(lfn = dbsResult["LogicalFileName"], size = dbsResult["FileSize"], events = dbsResult["NumberOfEvents"], checksums = {"cksum": dbsResult["Checksum"]}, locations = "cmssrm.fnal.gov", status = "NOTUPLOADED") dbsFile.setDatasetPath(datasetPath) dbsFile.setAlgorithm(appName = "cmsRun", appVer = "Unknown", appFam = "Unknown", psetHash = "Unknown", configContent = "Unknown") dbsFile.create() inputFileset.commit() inputFileset.markOpen(False) return
def testWorkUnitHashAndCompare(self): """ Test that the hash function works and that the comparisons work """ testRunLumi0 = Run(TEST_RUN_NUMBER, TEST_LUMI) testWorkUnit0 = WorkUnit(taskID=TEST_TASKID, lastUnitCount=TEST_LAST_UNIT_COUNT, fileid=TEST_FILEID, runLumi=testRunLumi0) testRunLumi1 = Run(TEST_RUN_NUMBER, TEST_LUMI) testWorkUnit1 = WorkUnit(taskID=TEST_TASKID, lastUnitCount=TEST_LAST_UNIT_COUNT, fileid=TEST_FILEID, runLumi=testRunLumi1) testRunLumi2 = Run(TEST_RUN_NUMBER, TEST_LUMI + 1) testWorkUnit2 = WorkUnit(taskID=TEST_TASKID, lastUnitCount=TEST_LAST_UNIT_COUNT, fileid=TEST_FILEID, runLumi=testRunLumi2) testRunLumi3 = Run(TEST_RUN_NUMBER + 1, TEST_LUMI) testWorkUnit3 = WorkUnit(taskID=TEST_TASKID, lastUnitCount=TEST_LAST_UNIT_COUNT, fileid=TEST_FILEID, runLumi=testRunLumi3) # Tests for hashers self.assertEqual(hash(testWorkUnit0), hash(testWorkUnit1)) self.assertNotEqual(hash(testWorkUnit1), hash(testWorkUnit2)) self.assertNotEqual(hash(testWorkUnit1), hash(testWorkUnit3)) # Tests for comparisons self.assertEqual(testWorkUnit0, testWorkUnit1) self.assertLess(testWorkUnit1, testWorkUnit2) self.assertLess(testWorkUnit2, testWorkUnit3) return
def injectFilesFromDBS(inputFileset, datasetPath): """ _injectFilesFromDBS_ """ print("injecting files from %s into %s, please wait..." % (datasetPath, inputFileset.name)) args={} args["url"] = "https://cmsweb-prod.cern.ch/dbs/prod/global/DBSReader" args["version"] = "DBS_2_0_9" args["mode"] = "GET" dbsApi = DbsApi(args) dbsResults = dbsApi.listFileArray(path = datasetPath, retriveList = ["retrive_lumi", "retrive_run"]) dbsResults = dbsResults[0:10] print(" found %d files, inserting into wmbs..." % (len(dbsResults))) for dbsResult in dbsResults: myFile = File(lfn = dbsResult["LogicalFileName"], size = dbsResult["FileSize"], events = dbsResult["NumberOfEvents"], checksums = {"cksum": dbsResult["Checksum"]}, locations = "cmssrm.fnal.gov", merged = True) myRun = Run(runNumber = dbsResult["LumiList"][0]["RunNumber"]) for lumi in dbsResult["LumiList"]: myRun.appendLumi(lumi["LumiSectionNumber"]) myFile.addRun(myRun) myFile.create() inputFileset.addFile(myFile) dbsFile = DBSBufferFile(lfn = dbsResult["LogicalFileName"], size = dbsResult["FileSize"], events = dbsResult["NumberOfEvents"], checksums = {"cksum": dbsResult["Checksum"]}, locations = "cmssrm.fnal.gov", status = "LOCAL") dbsFile.setDatasetPath(datasetPath) dbsFile.setAlgorithm(appName = "cmsRun", appVer = "Unknown", appFam = "Unknown", psetHash = "Unknown", configContent = "Unknown") dbsFile.create() inputFileset.commit() inputFileset.markOpen(False) return
def getChunkFiles(self, collectionName, filesetName, chunkOffset, chunkSize=100, user="******", group="cmsdataops"): """ _getChunkFiles_ Retrieve a chunk of files from the given collection and task. """ chunkFiles = [] result = self.couchdb.loadView( "ACDC", "owner_coll_fileset_files", { "startkey": [group, user, collectionName, filesetName], "endkey": [group, user, collectionName, filesetName, {}], "limit": chunkSize, "skip": chunkOffset, }, []) for row in result["rows"]: resultRow = row['value'] newFile = File(lfn=resultRow["lfn"], size=resultRow["size"], events=resultRow["events"], parents=set(resultRow["parents"]), locations=set(resultRow["locations"]), merged=resultRow["merged"]) for run in resultRow["runs"]: newRun = Run(run["run_number"]) newRun.extend(run["lumis"]) newFile.addRun(newRun) chunkFiles.append(newFile) return chunkFiles
def setUp(self): """ _setUp_ """ self.testInit = TestInit(__file__) self.testInit.setLogging() self.testInit.setDatabaseConnection() self.testInit.setSchema(customModules=["WMCore.WMBS"]) self.splitterFactory = SplitterFactory(package="WMCore.JobSplitting") myThread = threading.currentThread() self.myThread = myThread daoFactory = DAOFactory(package="WMCore.WMBS", logger=logging, dbinterface=myThread.dbi) self.WMBSFactory = daoFactory config = self.getConfig() self.changer = ChangeState(config) myResourceControl = ResourceControl() myResourceControl.insertSite("T1_US_FNAL", 10, 20, "T1_US_FNAL_Disk", "T1_US_FNAL") myResourceControl.insertSite("T1_US_FNAL", 10, 20, "T3_US_FNALLPC", "T1_US_FNAL") myResourceControl.insertSite("T2_CH_CERN", 10, 20, "T2_CH_CERN", "T2_CH_CERN") self.fileset1 = Fileset(name="TestFileset1") for fileNum in range(11): newFile = File("/some/file/name%d" % fileNum, size=1000, events=100) newFile.addRun(Run(1, *[1])) newFile.setLocation('T1_US_FNAL_Disk') self.fileset1.addFile(newFile) self.fileset1.create() workflow1 = Workflow(spec="spec.xml", owner="hufnagel", name="TestWorkflow1", task="Test") workflow1.create() self.subscription1 = Subscription(fileset=self.fileset1, workflow=workflow1, split_algo="Harvest", type="Harvesting") self.subscription1.create() self.configFile = EmulatorSetup.setupWMAgentConfig() return
def load(self, parentage=0):
    """
    _load_

    Load the file and all its metadata from the database.  Either the LFN
    or the file's ID must be specified before this is called.
    """
    existingTransaction = self.beginTransaction()

    if self["id"] != -1:
        action = self.daofactory(classname="DBSBufferFiles.GetByID")
        result = action.execute(self["id"], conn=self.getDBConn(),
                                transaction=self.existingTransaction())
    else:
        action = self.daofactory(classname="DBSBufferFiles.GetByLFN")
        result = action.execute(self["lfn"], conn=self.getDBConn(),
                                transaction=self.existingTransaction())

    self.update(result)

    action = self.daofactory(classname='DBSBufferFiles.GetChecksum')
    result = action.execute(fileid=self['id'], conn=self.getDBConn(),
                            transaction=self.existingTransaction())
    self["checksums"] = result

    action = self.daofactory(classname="DBSBufferFiles.GetRunLumiFile")
    runs = action.execute(self["lfn"], conn=self.getDBConn(),
                          transaction=self.existingTransaction())

    for r in runs:
        self.addRun(run=Run(r, *runs[r]))

    action = self.daofactory(classname="DBSBufferFiles.GetLocation")
    self["locations"] = action.execute(self["lfn"], conn=self.getDBConn(),
                                       transaction=self.existingTransaction())
    self["newlocations"].clear()

    self["parents"].clear()
    if parentage > 0:
        action = self.daofactory(classname="DBSBufferFiles.GetParents")
        lfns = action.execute(self["lfn"], conn=self.getDBConn(),
                              transaction=self.existingTransaction())
        for lfn in lfns:
            parentFile = DBSBufferFile(lfn=lfn)
            parentFile.load(parentage=parentage - 1)
            self["parents"].add(parentFile)

    self.commitTransaction(existingTransaction)
    return
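# Hypothetical usage of DBSBufferFile.load(): fetch a buffered file by LFN and
# pull in one level of parentage. The LFN is a placeholder.
bufFile = DBSBufferFile(lfn="/store/unmerged/some/file.root")
bufFile.load(parentage=1)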
def injectFilesFromDBS(inputFileset, datasetPath):
    """
    _injectFilesFromDBS_

    """
    print("injecting files from %s into %s, please wait..." % (datasetPath,
                                                               inputFileset.name))
    args = {}
    args["url"] = "http://cmsdbsprod.cern.ch/cms_dbs_prod_global/servlet/DBSServlet"
    args["mode"] = "GET"
    dbsApi = DbsApi(args)
    dbsResults = dbsApi.listFiles(path=datasetPath,
                                  retriveList=["retrive_lumi", "retrive_run"])

    # Limiter on number of files
    dbsResults = dbsResults[0:20]

    print(" found %d files, inserting into wmbs..." % (len(dbsResults)))

    for dbsResult in dbsResults:
        myFile = File(lfn=dbsResult["LogicalFileName"], size=dbsResult["FileSize"],
                      events=dbsResult["NumberOfEvents"],
                      checksums={"cksum": dbsResult["Checksum"]},
                      locations="cmssrm.fnal.gov", merged=True)
        myRun = Run(runNumber=dbsResult["LumiList"][0]["RunNumber"])
        for lumi in dbsResult["LumiList"]:
            myRun.lumis.append(lumi["LumiSectionNumber"])
        myFile.addRun(myRun)
        myFile.create()
        inputFileset.addFile(myFile)

        dbsFile = DBSBufferFile(lfn=dbsResult["LogicalFileName"],
                                size=dbsResult["FileSize"],
                                events=dbsResult["NumberOfEvents"],
                                checksums={"cksum": dbsResult["Checksum"]},
                                locations="cmssrm.fnal.gov",
                                status="NOTUPLOADED")
        dbsFile.setDatasetPath(datasetPath)
        dbsFile.setAlgorithm(appName="cmsRun", appVer="Unknown", appFam="Unknown",
                             psetHash="Unknown", configContent="Unknown")
        dbsFile.create()

    inputFileset.commit()
    inputFileset.markOpen(False)
    return
def testGetInfo(self): testRunLumi = Run(TEST_RUN_NUMBER, TEST_LUMI) testWorkUnit = WorkUnit(taskID=TEST_TASKID, lastUnitCount=TEST_LAST_UNIT_COUNT, fileid=TEST_FILEID, runLumi=testRunLumi) info = testWorkUnit.getInfo() # Test the things we set self.assertEqual(info[0], TEST_TASKID) self.assertEqual(info[2], TEST_LAST_UNIT_COUNT) self.assertEqual(info[7], TEST_FILEID) self.assertEqual(info[8].run, TEST_RUN_NUMBER) self.assertItemsEqual(info[8].lumis, [TEST_LUMI]) # Test the defaults we did not set self.assertEqual(info[1], 0) self.assertGreater(info[3], 0) self.assertEqual(info[4], 0) self.assertGreaterEqual(info[5], 0) self.assertGreaterEqual(info[6], 0) # Run another test by overriding defaults of things with default values testRetries = 10 testSubmitTime = 10 * 365 * 24 * 3600 testStatus = 4 testFirstEvent = 100 testLastEvent = 600 testWorkUnit = WorkUnit(taskID=TEST_TASKID, lastUnitCount=TEST_LAST_UNIT_COUNT, fileid=TEST_FILEID, runLumi=testRunLumi, retryCount=testRetries, lastSubmitTime=testSubmitTime, status=testStatus, firstEvent=testFirstEvent, lastEvent=testLastEvent) info = testWorkUnit.getInfo() # Test the defaults we overrode self.assertEqual(info[1], testRetries) self.assertEqual(info[3], testSubmitTime) self.assertEqual(info[4], testStatus) self.assertEqual(info[5], testFirstEvent) self.assertEqual(info[6], testLastEvent) return
def stuffACDCDatabase(self, numFiles=50, lumisPerFile=20, lumisPerACDCRecord=2): """ _stuffACDCDatabase_ Fill the ACDC database with ACDC records, both for processing and merge """ filesetName = '/%s/DataProcessing' % self.workflowName for i in range(numFiles): for j in range(1, lumisPerFile + 1, lumisPerACDCRecord): lfn = '/store/data/a/%d' % i acdcFile = File(lfn=lfn, size=100, events=250, locations=self.validLocations, merged=1) run = Run(i + 1, *range(j, min(j + lumisPerACDCRecord, lumisPerFile + 1))) acdcFile.addRun(run) acdcDoc = {'collection_name': self.workflowName, 'collection_type': 'ACDC.CollectionTypes.DataCollection', 'files': {lfn: acdcFile}, 'fileset_name': filesetName} self.acdcDB.queue(acdcDoc) filesetName = '/%s/DataProcessing/DataProcessingMergeRECOoutput' % self.workflowName for i in range(numFiles): for j in range(1, lumisPerFile + 1, lumisPerACDCRecord): lfn = '/store/unmerged/b/%d' % i acdcFile = File(lfn=lfn, size=100, events=250, locations=set([choice(self.validLocations)]), merged=0) run = Run(i + 1, *range(j, min(j + lumisPerACDCRecord, lumisPerFile + 1))) acdcFile.addRun(run) acdcDoc = {'collection_name': self.workflowName, 'collection_type': 'ACDC.CollectionTypes.DataCollection', 'files': {lfn: acdcFile}, 'fileset_name': filesetName} self.acdcDB.queue(acdcDoc) self.acdcDB.commit() return
def populateFilesFromWMBS(self, filesByLocation):
    """
    Load the lumi information for files from WMBS

    Args:
        filesByLocation: the files at the location currently under consideration

    Returns:
        nothing
    """
    fileLumis = self.loadRunLumi.execute(files=filesByLocation)
    for f in filesByLocation:
        lumiDict = fileLumis.get(f['id'], {})
        for run in lumiDict.keys():
            f.addRun(run=Run(run, *lumiDict[run]))
def generateFakeMCFile(self, numEvents=100, firstEvent=1, lastEvent=100,
                       firstLumi=1, lastLumi=10, existingSub=None):
    # MC comes with only one MCFakeFile
    newFile = File("MCFakeFileTest", size=1000, events=numEvents)
    newFile.setLocation('se01')
    if firstLumi == lastLumi:
        newFile.addRun(Run(1, *list(range(firstLumi, lastLumi + 1))))
    else:
        newFile.addRun(Run(1, *list(range(firstLumi, lastLumi))))
    newFile["first_event"] = firstEvent
    newFile["last_event"] = lastEvent

    if existingSub is None:
        singleMCFileset = Fileset(name="MCTestFileset")
        singleMCFileset.addFile(newFile)
        testWorkflow = Workflow()
        existingSub = Subscription(fileset=singleMCFileset,
                                   workflow=testWorkflow,
                                   split_algo="EventBased",
                                   type="Production")
    else:
        existingSub['fileset'].addFile(newFile)
    return existingSub
def createFile(lfn, events, run, lumis, location, lumiMultiplier=None):
    """
    _createFile_

    Create a file for testing
    """
    if lumiMultiplier is None:
        lumiMultiplier = run

    newFile = File(lfn=lfn, size=1000, events=events)
    lumiList = []
    for lumi in range(lumis):
        lumiList.append((lumiMultiplier * lumis) + lumi)
    newFile.addRun(Run(run, *lumiList))
    newFile.setLocation(location)
    return newFile
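# Hypothetical call of the helper above: two files in the same run whose lumi
# ranges do not overlap because lumiMultiplier is overridden per file. The LFNs
# and location are placeholders.
fileA = createFile("/store/test/a.root", events=100, run=1, lumis=5,
                   location="T2_CH_CERN", lumiMultiplier=0)
fileB = createFile("/store/test/b.root", events=100, run=1, lumis=5,
                   location="T2_CH_CERN", lumiMultiplier=1)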
def generateFakeMCFile(self, numEvents=100, firstEvent=1, lastEvent=100,
                       firstLumi=1, lastLumi=10):
    # MC comes with only one MCFakeFile
    singleMCFileset = Fileset(name="MCTestFileset")
    newFile = File("MCFakeFileTest", size=1000, events=numEvents)
    newFile.setLocation('se01')
    newFile.addRun(Run(1, *range(firstLumi, lastLumi + 1)))
    newFile["first_event"] = firstEvent
    newFile["last_event"] = lastEvent
    testWorkflow = Workflow()
    singleMCFileset.addFile(newFile)
    singleMCFileSubscription = Subscription(fileset=singleMCFileset,
                                            workflow=testWorkflow,
                                            split_algo="EventBased",
                                            type="Production")
    return singleMCFileSubscription
def createParentFiles(self, acqEra, nFiles=10, workflowName='TestWorkload', taskPath='/TestWorkload/DataTest'): """ _createParentFiles_ Create several parentless files in DBSBuffer. This simulates raw files in the T0. """ workflowId = self.injectWorkflow(workflowName=workflowName, taskPath=taskPath) parentlessFiles = [] baseLFN = "/store/data/%s/Cosmics/RAW/v1/000/143/316/" % (acqEra) for i in range(nFiles): testFile = DBSBufferFile(lfn=baseLFN + makeUUID() + ".root", size=1024, events=20, checksums={"cksum": 1}, workflowId=workflowId) testFile.setAlgorithm(appName="cmsRun", appVer="CMSSW_3_1_1", appFam="RAW", psetHash="GIBBERISH", configContent="MOREGIBBERISH") testFile.setDatasetPath("/Cosmics/%s-v1/RAW" % (acqEra)) testFile['block_close_max_wait_time'] = 1000000 testFile['block_close_max_events'] = 1000000 testFile['block_close_max_size'] = 1000000 testFile['block_close_max_files'] = 1000000 lumis = [] for j in range(10): lumis.append((i * 10) + j) testFile.addRun(Run(143316, *lumis)) testFile.setAcquisitionEra(acqEra) testFile.setProcessingVer("1") testFile.setGlobalTag("START54::All") testFile.create() testFile.setLocation("malpaquet") parentlessFiles.append(testFile) return parentlessFiles
def execute(self, *args, **kwargs): self.logger.info( "Data discovery and splitting for %s using user-provided files" % kwargs['task']['tm_taskname']) userfiles = kwargs['task']['tm_user_files'] splitting = kwargs['task']['tm_split_algo'] total_units = kwargs['task']['tm_totalunits'] if not userfiles or splitting != 'FileBased': if not userfiles: msg = "No files specified to process for task %s." % kwargs[ 'task']['tm_taskname'] if splitting != 'FileBased': msg = "Data.splitting must be set to 'FileBased' when using a custom set of files." raise TaskWorkerException(msg) if hasattr(self.config.Sites, 'available'): locations = self.config.Sites.available else: with self.config.TaskWorker.envForCMSWEB: configDict = { "cacheduration": 1, "pycurl": True } # cache duration is in hours resourceCatalog = CRIC(logger=self.logger, configDict=configDict) locations = resourceCatalog.getAllPSNs() userFileset = Fileset(name=kwargs['task']['tm_taskname']) self.logger.info("There are %d files specified by the user." % len(userfiles)) if total_units > 0: self.logger.info("Will run over the first %d files." % total_units) file_counter = 0 for userfile, idx in zip(userfiles, range(len(userfiles))): newFile = File(userfile, size=1000, events=1) newFile.setLocation(locations) newFile.addRun(Run(1, idx)) newFile["block"] = 'UserFilesFakeBlock' newFile["first_event"] = 1 newFile["last_event"] = 2 userFileset.addFile(newFile) file_counter += 1 if total_units > 0 and file_counter >= total_units: break return Result(task=kwargs['task'], result=userFileset)
def populateACDCCouch(self, numFiles=2, lumisPerJob=35, eventsPerJob=20000): """ _populateACDCCouch_ Create production files in couchDB to test the creation of ACDC jobs for the EventBased algorithm """ # Define some constants workflowName = "ACDC_TestEventBased" filesetName = "/%s/Production" % workflowName owner = "*****@*****.**" group = "unknown" lumisPerFile = lumisPerJob * 250 for i in range(numFiles): for j in range(250): lfn = "MCFakeFile-some-hash-%s" % str(i).zfill(5) acdcFile = File(lfn=lfn, size=100, events=eventsPerJob, locations=self.validLocations, merged=False, first_event=1) run = Run( 1, *range(1 + (i * lumisPerFile) + j * lumisPerJob, (j + 1) * lumisPerJob + (i * lumisPerFile) + 2)) acdcFile.addRun(run) acdcDoc = { "collection_name": workflowName, "collection_type": "ACDC.CollectionTypes.DataCollection", "files": { lfn: acdcFile }, "fileset_name": filesetName, "owner": { "user": owner, "group": group } } self.couchDB.queue(acdcDoc) self.couchDB.commit() return
def test03(self): """ _test03_ Test max input files threshold for single lumi small lumi, followed by large lumi expect 1 job for small lumi and 1 job for large """ mySplitArgs = self.splitArgs.copy() for lumi in [1, 2]: for i in range(lumi * 2): newFile = File(makeUUID(), size = 1000, events = 100) newFile.addRun(Run(1, *[lumi])) newFile.setLocation("SomePNN", immediateSave = False) newFile.create() self.fileset2.addFile(newFile) self.fileset2.commit() jobFactory = self.splitterFactory(package = "WMCore.WMBS", subscription = self.subscription2) jobGroups = jobFactory(**mySplitArgs) self.assertEqual(len(jobGroups), 0, "ERROR: JobFactory should have returned no JobGroup") mySplitArgs['maxInputFiles'] = 3 jobGroups = jobFactory(**mySplitArgs) self.assertEqual(len(jobGroups), 1, "ERROR: JobFactory didn't return one JobGroup") self.assertEqual(len(jobGroups[0].jobs), 2, "ERROR: JobFactory didn't create two jobs") job = jobGroups[0].jobs[0] self.assertEqual(len(job.getFiles()), 2, "ERROR: Job does not process 2 files") job = jobGroups[0].jobs[1] self.assertEqual(len(job.getFiles()), 4, "ERROR: Job does not process 4 files") return
#!/usr/bin/env python import random from WMCore.FwkJobReport import Report from WMCore.DataStructs.Run import Run from WMCore.Services.UUIDLib import makeUUID outputModules = ["write_A_Calo_RAW", "write_A_Cosmics_RAW", "write_A_HcalHPDNoise_RAW", "write_A_MinimumBias_RAW", "write_A_RandomTriggers_RAW", "write_A_Calibration_TestEnables_RAW", "write_HLTDEBUG_Monitor_RAW"] runInfo = Run(1) runInfo.extendLumis([11, 12, 13, 14, 15]) for i in range(100): loadTestReport = Report.Report("cmsRun1") loadTestReport.addInputSource("PoolSource") inputFile = loadTestReport.addInputFile("PoolSource", lfn = makeUUID(), events = 600000, size = 600000) Report.addRunInfoToFile(inputFile, runInfo) for outputModule in outputModules: loadTestReport.addOutputModule(outputModule) datasetInfo = {"applicationName": "cmsRun", "applicationVersion": "CMSSW_3_3_5_patch3", "primaryDataset": outputModule, "dataTier": "RAW", "processedDataset": "LoadTest10"} fileAttrs = {"lfn": makeUUID(), "location": "cmssrm.fnal.gov", "checksums": {"adler32": "ff810ec3", "cksum": "2212831827"}, "events": random.randrange(500, 5000, 50),
taskMaker = TaskMaker(workload, os.path.join(os.getcwd(), workloadName)) taskMaker.skipSubscription = True taskMaker.processWorkload() workload.save(workloadPath) myThread = threading.currentThread() myThread.transaction.begin() for workloadTask in workload.taskIterator(): inputFileset = Fileset(name = workloadTask.getPathName()) inputFileset.create() virtualFile = File(lfn = "%s-virtual-input" % workloadTask.getPathName(), size = 0, events = numEvents, locations = set(["cmssrm.fnal.gov", "storm-fe-cms.cr.cnaf.infn.it", "cmssrm-fzk.gridka.de", "srm2.grid.sinica.edu.tw", "srm-cms.gridpp.rl.ac.uk", "ccsrm.in2p3.fr", "srmcms.pic.es"]), merged = False) myRun = Run(runNumber = 1) myRun.appendLumi(1) virtualFile.addRun(myRun) virtualFile.create() inputFileset.addFile(virtualFile) inputFileset.commit() myWMBSHelper = WMBSHelper(workload) myWMBSHelper._createSubscriptionsInWMBS(workloadTask.getPathName()) myThread.transaction.commit()
#!/usr/bin/env python import random from WMCore.FwkJobReport import Report from WMCore.DataStructs.Run import Run from WMCore.Services.UUIDLib import makeUUID outputModules = ["outputModule1", "outputModule2", "outputModule3", "outputModule4", "outputModule5", "outputModule6", "outputModule7", "outputModule8", "outputModule9", "outputModule10"] runInfo = Run(1) runInfo.extendLumis([11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40]) totalReports = 25 inputFilesPerReport = 50 inputFileCounter = 0 for i in range(totalReports): loadTestReport = Report.Report("cmsRun1") loadTestReport.addInputSource("PoolSource") for j in range(inputFilesPerReport): inputFile = loadTestReport.addInputFile("PoolSource", lfn = "input%i" % inputFileCounter, events = 600000, size = 600000) inputFileCounter += 1
# JSON keys are strings, so transform the runAndLumis keys back to ints.
# For some reason the mask sometimes isn't loaded at all; there is not much we
# can do about that except fall back to an empty Mask.
try:
    for key in list(maskA['runAndLumis'].keys()):
        maskA['runAndLumis'][int(key)] = maskA['runAndLumis'][key]
        del maskA['runAndLumis'][key]
except KeyError:
    # We don't have a mask.  Not much we can do about this
    maskA = Mask()

mask = Mask()
mask.update(maskA)

runs = []
# Turn arbitrary format into real runs
for r in runsA:
    run = Run(runNumber=r['run_number'])
    run.lumis = r.get('lumis', [])
    runs.append(run)

# Get rid of runs that aren't in the mask
runs = mask.filterRunLumisByMask(runs=runs)

for err in errorCouch:
    task = err['value']['task']
    step = err['value']['step']
    errors = err['value']['error']
    logs = err['value']['logs']
    start = err['value']['start']
    stop = err['value']['stop']
    if task not in workflowData['errors']:
        workflowData['errors'][task] = {'failureTime': 0}
    if step not in workflowData['errors'][task]:
        workflowData['errors'][task][step] = {}
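# Standalone sketch of the string-key -> int-key conversion done above;
# iterating over a snapshot of the keys avoids mutating the dict while it is
# being traversed. The runAndLumis content is illustrative.
runAndLumis = {"1": [[1, 10]], "2": [[3, 7]]}
for key in list(runAndLumis):
    runAndLumis[int(key)] = runAndLumis.pop(key)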
def execute(self, jobID, conn = None, transaction = False): """ _execute_ Execute the SQL for the given job ID(s) and then format and return the result. """ if type(jobID) != list: jobID = [jobID] binds = [{"jobid": x} for x in jobID] if not binds: return [] #First load full file information with run/lumis filesResult = self.dbi.processData(self.fileSQL, binds, conn = conn, transaction = transaction) fileList = self.formatDict(filesResult) #Clear duplicates bindDict = {} for result in fileList: bindDict[result['id']] = 1 result['newRuns'] = [] fileBinds = [{'fileid' : x} for x in bindDict.keys()] #Load file information if len(fileBinds): lumiResult = self.dbi.processData(self.runLumiSQL, fileBinds, conn = conn, transaction = transaction) lumiList = self.formatDict(lumiResult) lumiDict = {} for l in lumiList: if not l['fileid'] in lumiDict.keys(): lumiDict[l['fileid']] = [] lumiDict[l['fileid']].append(l) for f in fileList: fileRuns = {} if f['id'] in lumiDict.keys(): for l in lumiDict[f['id']]: run = l['run'] lumi = l['lumi'] try: fileRuns[run].append(lumi) except KeyError: fileRuns[run] = [] fileRuns[run].append(lumi) for r in fileRuns.keys(): newRun = Run(runNumber = r) newRun.lumis = fileRuns[r] f['newRuns'].append(newRun) filesForJobs = {} for f in fileList: jobid = f['jobid'] if not jobid in filesForJobs.keys(): filesForJobs[jobid] = {} if f['id'] not in filesForJobs[jobid].keys(): wmbsFile = File(id = f['id']) wmbsFile.update(f) for r in wmbsFile['newRuns']: wmbsFile.addRun(r) filesForJobs[jobid][f['id']] = wmbsFile #Add the file information to job objects and load the masks jobList = [Job(id = x) for x in jobID] for j in jobList: if j['id'] in filesForJobs.keys(): j['input_files'] = filesForJobs[j['id']].values() j['mask'].load(j['id']) return jobList
def execute(self, jobID, fileSelection = None, conn = None, transaction = False): """ _execute_ Execute the SQL for the given job ID and then format and return the result. """ if type(jobID) == list: if len(jobID) < 1: # Nothing to do return [] binds = jobID else: binds = {"jobid": jobID} result = self.dbi.processData(self.sql, binds, conn = conn, transaction = transaction) jobList = self.formatJobs(result) filesResult = self.dbi.processData(self.fileSQL, binds, conn = conn, transaction = transaction) fileList = self.formatDict(filesResult) fileBinds = [] if fileSelection: fileList = filter(lambda x : x['lfn'] in fileSelection[x['jobid']], fileList) for x in fileList: # Add new runs x['newRuns'] = [] # Assemble unique list of binds if not {'fileid': x['id']} in fileBinds: fileBinds.append({'fileid': x['id']}) parentList = [] if len(fileBinds) > 0: parentResult = self.dbi.processData(self.parentSQL, fileBinds, conn = conn, transaction = transaction) parentList = self.formatDict(parentResult) lumiResult = self.dbi.processData(self.runLumiSQL, fileBinds, conn = conn, transaction = transaction) lumiList = self.formatDict(lumiResult) lumiDict = {} for l in lumiList: if not l['fileid'] in lumiDict.keys(): lumiDict[l['fileid']] = [] lumiDict[l['fileid']].append(l) for f in fileList: fileRuns = {} if f['id'] in lumiDict.keys(): for l in lumiDict[f['id']]: run = l['run'] lumi = l['lumi'] try: fileRuns[run].append(lumi) except KeyError: fileRuns[run] = [] fileRuns[run].append(lumi) for r in fileRuns.keys(): newRun = Run(runNumber = r) newRun.lumis = fileRuns[r] f['newRuns'].append(newRun) filesForJobs = {} for f in fileList: jobid = f['jobid'] if not jobid in filesForJobs.keys(): filesForJobs[jobid] = {} if f['id'] not in filesForJobs[jobid].keys(): wmbsFile = File(id = f['id']) wmbsFile.update(f) wmbsFile['locations'].add(f['se_name']) for r in wmbsFile['newRuns']: wmbsFile.addRun(r) for entry in parentList: if entry['id'] == f['id']: wmbsFile['parents'].add(entry['lfn']) filesForJobs[jobid][f['id']] = wmbsFile else: # If the file is there, just add the location filesForJobs[jobid][f['id']]['locations'].add(f['se_name']) for j in jobList: if j['id'] in filesForJobs.keys(): j['input_files'] = filesForJobs[j['id']].values() return jobList