def getChunkFiles(self, collectionName, filesetName, chunkOffset, chunkSize=100,
                  user="******", group="cmsdataops"):
    """
    _getChunkFiles_

    Retrieve a chunk of files from the given collection and task.

    The "owner_coll_fileset_files" view of the "ACDC" design document is
    queried for the keys belonging to (group, user, collectionName,
    filesetName), skipping ``chunkOffset`` rows and returning at most
    ``chunkSize`` rows.  Every row value is turned into a File object with
    its runs attached.

    Returns the list of File objects, in the order the view returned them.
    """
    keyPrefix = [group, user, collectionName, filesetName]
    viewOptions = {
        "startkey": keyPrefix,
        # Appending {} to the prefix gives the upper bound of the key range
        "endkey": keyPrefix + [{}],
        "limit": chunkSize,
        "skip": chunkOffset,
    }
    viewResult = self.couchdb.loadView("ACDC", "owner_coll_fileset_files",
                                       viewOptions, [])

    chunkFiles = []
    for viewRow in viewResult["rows"]:
        fileDoc = viewRow['value']
        chunkFile = File(lfn=fileDoc["lfn"],
                         size=fileDoc["size"],
                         events=fileDoc["events"],
                         parents=set(fileDoc["parents"]),
                         locations=set(fileDoc["locations"]),
                         merged=fileDoc["merged"])
        for runDoc in fileDoc["runs"]:
            fileRun = Run(runDoc["run_number"])
            fileRun.extend(runDoc["lumis"])
            chunkFile.addRun(fileRun)
        chunkFiles.append(chunkFile)

    return chunkFiles
def generateFakeMCFile(self, numEvents=100, firstEvent=1, lastEvent=100,
                       firstLumi=1, lastLumi=10, existingSub=None):
    """
    _generateFakeMCFile_

    Build a fake MC file and attach it to a subscription.

    When ``existingSub`` is None, a new "MCTestFileset" fileset holding the
    file is created together with an EventBased Production subscription on
    it.  Otherwise the file is added to the fileset of ``existingSub``.

    Returns the subscription that now contains the file.
    """
    # lastLumi is only included in the run when it equals firstLumi;
    # otherwise the lumi range stops one short of it.
    # NOTE(review): the asymmetry looks deliberate for the callers of this
    # helper - confirm before "fixing" it.
    if firstLumi == lastLumi:
        lumiStop = lastLumi + 1
    else:
        lumiStop = lastLumi

    # MC comes with only one MCFakeFile
    fakeFile = File("MCFakeFileTest", size=1000, events=numEvents)
    fakeFile.setLocation('se01')
    fakeFile.addRun(Run(1, *range(firstLumi, lumiStop)))
    fakeFile["first_event"] = firstEvent
    fakeFile["last_event"] = lastEvent

    if existingSub is not None:
        existingSub['fileset'].addFile(fakeFile)
        return existingSub

    mcFileset = Fileset(name="MCTestFileset")
    mcFileset.addFile(fakeFile)
    return Subscription(fileset=mcFileset,
                        workflow=Workflow(),
                        split_algo="EventBased",
                        type="Production")
def execute(self, *args, **kwargs):
    """
    _execute_

    Build the single-file fake fileset used for an MC task.

    One "MCFakeFile" is created holding ``tm_totalunits`` events and the
    lumis 1-10 of run 1.  Its locations come from
    ``self.config.Sites.available`` when that attribute exists, otherwise
    from all CMS names reported by SiteDBJSON.

    Returns a Result carrying the task and the fileset.
    """
    task = kwargs['task']
    eventCount = task['tm_totalunits']
    lumiRange = range(1, 10 + 1)

    # Set a default of 100 events per lumi. This is set as a task
    # property, as the splitting considers it independently of the file
    # information provided by the fake dataset.
    if not task['tm_events_per_lumi']:
        task['tm_events_per_lumi'] = 100

    # MC comes with only one MCFakeFile
    mcFileset = Fileset(name="MCFakeFileSet")
    fakeFile = File("MCFakeFile", size=1000, events=eventCount)

    if not hasattr(self.config.Sites, 'available'):
        siteDB = SiteDBJSON({"key": self.config.TaskWorker.cmskey,
                             "cert": self.config.TaskWorker.cmscert})
        fakeFile.setLocation(siteDB.getAllCMSNames())
    else:
        fakeFile.setLocation(self.config.Sites.available)

    fakeFile.addRun(Run(1, *lumiRange))
    # NOTE(review): 'MCFackBlock' looks like a misspelling of 'MCFakeBlock';
    # kept as is - confirm nothing matches on this name before renaming.
    fakeFile["block"] = 'MCFackBlock'
    fakeFile["first_event"] = 1
    fakeFile["last_event"] = eventCount
    mcFileset.addFile(fakeFile)

    return Result(task=task, result=mcFileset)
def getChunkFiles(self, collectionName, filesetName, chunkOffset, chunkSize=100,
                  user="******", group="cmsdataops"):
    """
    _getChunkFiles_

    Retrieve a chunk of files from the given collection and task.

    The file information is fetched through ``_getFilesetInfo`` and passed
    through ``mergeFakeFiles`` before each entry is converted into a File
    object with its runs attached.

    Returns the list of File objects.
    """
    fileInfos = mergeFakeFiles(
        self._getFilesetInfo(collectionName, filesetName, user, group,
                             chunkOffset, chunkSize))

    chunkFiles = []
    for info in fileInfos:
        chunkFile = File(lfn=info["lfn"],
                         size=info["size"],
                         events=info["events"],
                         parents=set(info["parents"]),
                         locations=set(info["locations"]),
                         merged=info["merged"])
        for runInfo in info["runs"]:
            fileRun = Run(runInfo["run_number"])
            fileRun.extend(runInfo["lumis"])
            chunkFile.addRun(fileRun)
        chunkFiles.append(chunkFile)

    return chunkFiles
def returnDataStructsFile(self):
    """
    _returnDataStructsFile_

    Creates a dataStruct file out of this file.

    Every entry of ``self["parents"]`` is converted into a WMFile as well,
    and the runs and locations of this file are copied onto the result.

    Returns the new WMFile.
    """
    parentFiles = {
        WMFile(lfn=parent['lfn'],
               size=parent['size'],
               events=parent['events'],
               checksums=parent['checksums'],
               parents=parent['parents'],
               merged=parent['merged'])
        for parent in self["parents"]
    }

    dsFile = WMFile(lfn=self['lfn'],
                    size=self['size'],
                    events=self['events'],
                    checksums=self['checksums'],
                    parents=parentFiles,
                    merged=self['merged'])

    for fileRun in self['runs']:
        dsFile.addRun(fileRun)
    for pnn in self['locations']:
        dsFile.setLocation(pnn=pnn)

    return dsFile
def setUp(self):
    """
    _setUp_

    Initial Setup for the Job Testcase.

    Fills ``self.inputFiles`` with 999 files that have random LFNs, sizes,
    run numbers and lumis, and wraps them in ``self.dummyJob``.
    """
    self.inputFiles = []

    for _ in range(999):
        # The random draws are kept in this exact order: LFN parts, size,
        # run number, lumi.
        firstPart = random.randint(1000, 9999)
        secondPart = random.randint(1000, 9999)
        fileSize = random.randint(1000, 2000)
        runNumber = random.randint(0, 2000)
        lumiNumber = random.randint(0, 8)

        inputFile = File(lfn="/store/data/%s/%s/file.root" % (firstPart, secondPart),
                         size=fileSize,
                         events=1000,
                         checksums={"cksum": "1"})
        inputFile.addRun(Run(runNumber, lumiNumber))
        self.inputFiles.append(inputFile)

    self.dummyJob = Job(files=self.inputFiles)
def execute(self, *args, **kwargs):  # pylint: disable=unused-argument
    """
    _execute_

    Build the single-file fake fileset used for an MC task.

    One "MCFakeFile" is created holding ``tm_totalunits`` events (cast to
    int) and the lumis 1-10 of run 1, located at the sites returned by
    ``self.getListOfSites()``.

    Returns a Result carrying the task and the fileset.
    """
    task = kwargs['task']

    # since https://github.com/dmwm/CRABServer/issues/5633 totalunits can be a float
    # but that would confuse WMCore, therefore cast to int
    eventCount = int(task['tm_totalunits'])
    lumiRange = range(1, 10 + 1)

    # Set a default of 100 events per lumi. This is set as a task
    # property, as the splitting considers it independently of the file
    # information provided by the fake dataset.
    if not task['tm_events_per_lumi']:
        task['tm_events_per_lumi'] = 100

    # MC comes with only one MCFakeFile
    mcFileset = Fileset(name="MCFakeFileSet")
    fakeFile = File("MCFakeFile", size=1000, events=eventCount)
    fakeFile.setLocation(self.getListOfSites())
    fakeFile.addRun(Run(1, *lumiRange))
    fakeFile["block"] = 'MCFakeBlock'
    fakeFile["first_event"] = 1
    fakeFile["last_event"] = eventCount
    mcFileset.addFile(fakeFile)

    return Result(task=task, result=mcFileset)
def setUp(self):
    """
    Create a dummy fileset and populate it with random files, in order to
    use it for the testcase methods.

    Logging is configured at DEBUG level into a file named after this
    module with the '.py' suffix replaced by '.log' (opened in 'w' mode).
    """
    logFileName = __file__.replace('.py', '.log')
    logging.basicConfig(level=logging.DEBUG,
                        format='%(asctime)s %(name)-12s %(levelname)-8s %(message)s',
                        datefmt='%m-%d %H:%M',
                        filename=logFileName,
                        filemode='w')
    self.logger = logging.getLogger('FilesetClassTest')

    # Setup the initial testcase environment: a fileset that already
    # contains one initial file.
    self.initialSet = set()
    self.initialSet.add(File('/tmp/lfn1', 1000, 1, 1, 1))
    self.fileset = Fileset(name='self.fileset', files=self.initialSet)

    # Populate the fileset with 999 random files.  The random draws are
    # kept in this exact order: LFN parts, size, run number, lumi.
    for _ in range(999):
        firstPart = random.randint(1000, 9999)
        secondPart = random.randint(1000, 9999)
        fileSize = random.randint(1000, 2000)
        runNumber = random.randint(0, 2000)
        lumiNumber = random.randint(0, 8)

        randomFile = File(lfn='/store/data/%s/%s/file.root' % (firstPart, secondPart),
                          size=fileSize,
                          events=1000,
                          checksums={"cksum": "1"})
        randomFile.addRun(Run(runNumber, lumiNumber))
        self.fileset.addFile(randomFile)
def execute(self, *args, **kwargs):
    """
    _execute_

    Build the single-file fake fileset used for an MC task.

    One "MCFakeFile" is created holding ``tm_totalunits`` events and the
    lumis 1-10 of run 1, located at all CMS names reported by SiteDBJSON.

    Returns a Result carrying the task and the fileset.
    """
    task = kwargs['task']
    eventCount = task['tm_totalunits']
    lumiRange = range(1, 10 + 1)

    # Set a default of 100 events per lumi. This is set as a task
    # property, as the splitting considers it independently of the file
    # information provided by the fake dataset.
    if not task['tm_events_per_lumi']:
        task['tm_events_per_lumi'] = 100

    # MC comes with only one MCFakeFile
    mcFileset = Fileset(name="MCFakeFileSet")
    fakeFile = File("MCFakeFile", size=1000, events=eventCount)

    siteDB = SiteDBJSON({"key": self.config.TaskWorker.cmskey,
                         "cert": self.config.TaskWorker.cmscert})
    fakeFile.setLocation(siteDB.getAllCMSNames())

    fakeFile.addRun(Run(1, *lumiRange))
    fakeFile["block"] = 'MCFakeBlock'
    fakeFile["first_event"] = 1
    fakeFile["last_event"] = eventCount
    mcFileset.addFile(fakeFile)

    return Result(task=task, result=mcFileset)
def testAddRun(self):
    """
    This tests the addRun() function of a DataStructs File object.

    A File is built from fixed test values, a Run is added to it and the
    run is then expected to be present in the file's 'runs' entry.
    """
    fileArguments = {
        "lfn": "lfn",
        "size": "1024",
        "events": "100",
        "checksums": "1",
        "parents": "parent",
    }
    testFile = File(**fileArguments)

    # Run number 1000000 with the single lumi 1
    testRun = Run(1000000, 1)
    testFile.addRun(testRun)

    assert testRun in testFile['runs'], \
        "Run not added properly to run in File.addRun()"
def setUp(self):
    """
    Create a dummy fileset and populate it with random files, in order to
    use it for the testcase methods.

    Logging goes, at DEBUG level, to a file named after this module with
    the '.py' suffix replaced by '.log' (opened in 'w' mode).
    """
    loggingOptions = {
        "level": logging.DEBUG,
        "format": '%(asctime)s %(name)-12s %(levelname)-8s %(message)s',
        "datefmt": '%m-%d %H:%M',
        "filename": __file__.replace('.py', '.log'),
        "filemode": 'w',
    }
    logging.basicConfig(**loggingOptions)
    self.logger = logging.getLogger('FilesetClassTest')

    # Initial testcase environment: a set holding one file, used to seed
    # the fileset under test.
    seedFile = File('/tmp/lfn1', 1000, 1, 1, 1)
    self.initialSet = set()
    self.initialSet.add(seedFile)
    self.fileset = Fileset(name='self.fileset', files=self.initialSet)

    # Add 999 random files.  Draw order matters for reproducibility with a
    # seeded generator: two LFN parts, size, run number, lumi.
    remaining = 999
    while remaining > 0:
        lfnParts = (random.randint(1000, 9999), random.randint(1000, 9999))
        fileSize = random.randint(1000, 2000)
        runNumber = random.randint(0, 2000)
        lumiNumber = random.randint(0, 8)

        randomFile = File(lfn='/store/data/%s/%s/file.root' % lfnParts,
                          size=fileSize,
                          events=1000,
                          checksums={"cksum": "1"})
        randomFile.addRun(Run(runNumber, lumiNumber))
        self.fileset.addFile(randomFile)
        remaining -= 1
def execute(self, *args, **kwargs):  # pylint: disable=unused-argument
    """
    _execute_

    Create the fake fileset an MC task is split on.

    The fileset holds a single "MCFakeFile" with ``int(tm_totalunits)``
    events and lumis 1-10 of run 1, located at ``self.getListOfSites()``.

    Returns a Result carrying the task and the fileset.
    """
    taskInfo = kwargs['task']

    # since https://github.com/dmwm/CRABServer/issues/5633 totalunits can be a float
    # but that would confuse WMCore, therefore cast to int
    totalEvents = int(taskInfo['tm_totalunits'])
    firstLumi, lastLumi = 1, 10

    # Set a default of 100 events per lumi. This is set as a task
    # property, as the splitting considers it independently of the file
    # information provided by the fake dataset.
    if not taskInfo['tm_events_per_lumi']:
        taskInfo['tm_events_per_lumi'] = 100

    # MC comes with only one MCFakeFile
    fakeFileset = Fileset(name="MCFakeFileSet")
    mcFile = File("MCFakeFile", size=1000, events=totalEvents)
    mcFile.setLocation(self.getListOfSites())
    mcFile.addRun(Run(1, *range(firstLumi, lastLumi + 1)))
    mcFile["block"] = 'MCFakeBlock'
    mcFile["first_event"] = 1
    mcFile["last_event"] = totalEvents
    fakeFileset.addFile(mcFile)

    return Result(task=taskInfo, result=fakeFileset)
def createFilesetFromDBS(self, collection, filesetName, dbsURL, dataset, mask=None):
    """
    _createFilesetFromDBS_

    Get info from DBS, apply mask (filter) and create a fileset.

    Every file DBS lists for ``dataset`` is turned into a File whose
    locations are those of its block, with one Run per run number found in
    the file's lumi list.  The files are then added to a CouchFileset
    named ``filesetName`` in ``collection``, with ``mask`` handed to the
    fileset's ``add`` call.

    Returns the tuple (fileSet, fileList), where fileList is whatever
    ``fileSet.add`` returned.
    """
    fileSet = CouchFileset(database=self.database, url=self.url, name=filesetName)
    fileSet.setCollection(collection)

    dbsReader = DBSReader(dbsURL, version="DBS_2_0_9", mode="GET")
    dbsFiles = dbsReader.dbs.listFiles(path=dataset,
                                       retriveList=["retrive_lumi", "retrive_run"])
    logging.info('Found %s files from DBS' % len(dbsFiles))

    newFiles = []
    # Block locations are looked up once per block and reused afterwards
    locationsByBlock = {}
    for dbsFile in dbsFiles:
        block = dbsFile["Block"]["Name"]
        if block not in locationsByBlock:
            locationsByBlock[block] = dbsReader.listFileBlockLocation(block)

        newFile = File(lfn=dbsFile["LogicalFileName"],
                       size=dbsFile["FileSize"],
                       merged=True,
                       events=dbsFile["NumberOfEvents"],
                       locations=locationsByBlock[block])

        # Group the lumis of this file by run number (keyed by its string form)
        runsByNumber = {}
        for lumiInfo in dbsFile["LumiList"]:
            runNumber = lumiInfo['RunNumber']
            lumiNumber = lumiInfo["LumiSectionNumber"]
            runKey = str(runNumber)
            if runKey not in runsByNumber:
                runsByNumber[runKey] = Run(runNumber, lumiNumber)
            else:
                runsByNumber[runKey].lumis.append(lumiNumber)

        for fileRun in runsByNumber.values():
            newFile.addRun(fileRun)
        newFiles.append(newFile)

    logging.info('Uploading %s files in fileset' % len(newFiles))
    fileList = fileSet.add(newFiles, mask)

    return fileSet, fileList
def jobConfig(self, wf, task, jobid, lfn):
    """
    Create a fake job dict to upload to the ACDC server.

    The job comes from ``self.getMinimalJob(wf, task)`` and gets a single
    1024-event file ``lfn`` whose run number is ``jobid`` (lumis 1 and 2).

    Returns the job.
    """
    fakeFile = File(lfn=lfn, size=1024, events=1024)
    fakeFile.setLocation(["T2_CH_CERN", "T2_CH_CERN_HLT"])
    # The job id doubles as the run number
    fakeFile.addRun(Run(jobid, 1, 2))

    fakeJob = self.getMinimalJob(wf, task)
    fakeJob.addFile(fakeFile)
    return fakeJob
def jobConfig(self, wf, task, jobid, lfn):
    """
    Create a fake job dict to upload to the ACDC server.

    A 1024-byte, 1024-event file named ``lfn`` is located at two CERN
    sites, given run ``jobid`` with lumis 1 and 2, and added to the job
    returned by ``self.getMinimalJob(wf, task)``.

    Returns that job.
    """
    siteNames = ["T2_CH_CERN", "T2_CH_CERN_HLT"]
    runForJob = Run(jobid, 1, 2)  # run = jobid

    inputFile = File(lfn=lfn, size=1024, events=1024)
    inputFile.setLocation(siteNames)
    inputFile.addRun(runForJob)

    minimalJob = self.getMinimalJob(wf, task)
    minimalJob.addFile(inputFile)
    return minimalJob
def getOutputFile(self, fileName, outputModule, step):
    """
    _getOutputFile_

    Takes a fileRef object and returns a DataStructs/File object as output.

    The file reference named ``fileName`` is looked up in the files of the
    given output module of ``step``; its location, runs and attributes are
    copied into a new File.

    Returns None when ``self.getOutputModule`` yields nothing for the
    step/module pair, otherwise the new File.
    """
    outputMod = self.getOutputModule(step=step, outputModule=outputModule)
    if not outputMod:
        return None

    fileRef = getattr(outputMod.files, fileName, None)
    newFile = File(locations=set())

    # Locations
    newFile.setLocation(getattr(fileRef, "location", None))

    # Runs
    # NOTE(review): fileRef is None when fileName is not present, in which
    # case the attribute access below raises AttributeError - confirm
    # callers only pass existing file names.
    for run in fileRef.runs.listSections_():
        lumis = getattr(fileRef.runs, run)
        lumiArgs = listitems(lumis) if isinstance(lumis, dict) else lumis
        newFile.addRun(Run(int(run), *lumiArgs))

    # (attribute name, default, converter) - the File key equals the
    # attribute name, and entries are stored in this order.
    attributeSpecs = (
        ("lfn", None, None),
        ("pfn", None, None),
        ("events", 0, int),
        ("size", 0, int),
        ("branches", [], None),
        ("input", [], None),
        ("inputpfns", [], None),
        ("branch_hash", None, None),
        ("catalog", "", None),
        ("guid", "", None),
        ("module_label", "", None),
        ("checksums", {}, None),
        ("merged", False, bool),
        ("dataset", {}, None),
        ("acquisitionEra", None, None),
        ("processingVer", None, None),
        ("validStatus", None, None),
        ("globalTag", None, None),
        ("prep_id", None, None),
        ("configURL", None, None),
        ("inputPath", None, None),
    )
    for attrName, default, convert in attributeSpecs:
        value = getattr(fileRef, attrName, default)
        newFile[attrName] = value if convert is None else convert(value)

    newFile["outputModule"] = outputModule
    newFile["fileRef"] = fileRef

    return newFile
def getOutputFile(self, fileName, outputModule, step):
    """
    _getOutputFile_

    Takes a fileRef object and returns a DataStructs/File object as output.

    Looks up ``fileName`` among the files of the given output module of
    ``step`` and copies its location, runs and attributes into a new File.

    Returns None when ``self.getOutputModule`` yields nothing for the
    step/module pair, otherwise the new File.
    """
    outputMod = self.getOutputModule(step=step, outputModule=outputModule)
    if not outputMod:
        return None

    fileRef = getattr(outputMod.files, fileName, None)
    newFile = File(locations=set())

    # Locations
    newFile.setLocation(getattr(fileRef, "location", None))

    # Runs
    # NOTE(review): fileRef is None when fileName is not present, in which
    # case the attribute access below raises AttributeError - confirm
    # callers only pass existing file names.
    for run in fileRef.runs.listSections_():
        lumis = getattr(fileRef.runs, run)
        if isinstance(lumis, dict):
            lumiArgs = lumis.items()
        else:
            lumiArgs = lumis
        newFile.addRun(Run(int(run), *lumiArgs))

    # (attribute name, default, converter) - the File key equals the
    # attribute name, and entries are stored in this order.
    attributeSpecs = (
        ("lfn", None, None),
        ("pfn", None, None),
        ("events", 0, int),
        ("size", 0, int),
        ("branches", [], None),
        ("input", [], None),
        ("inputpfns", [], None),
        ("branch_hash", None, None),
        ("catalog", "", None),
        ("guid", "", None),
        ("module_label", "", None),
        ("checksums", {}, None),
        ("merged", False, bool),
        ("dataset", {}, None),
        ("acquisitionEra", None, None),
        ("processingVer", None, None),
        ("validStatus", None, None),
        ("globalTag", None, None),
        ("prep_id", None, None),
        ("configURL", None, None),
        ("inputPath", None, None),
    )
    for attrName, default, convert in attributeSpecs:
        value = getattr(fileRef, attrName, default)
        if convert is not None:
            value = convert(value)
        newFile[attrName] = value

    newFile["outputModule"] = outputModule
    newFile["fileRef"] = fileRef

    return newFile
def execute(self, *args, **kwargs):
    """
    _execute_

    Build a fileset out of the files the user listed for the task.

    The file list is read from ``tm_arguments['userfiles']``.  When it is
    empty, or the splitting algorithm is not 'FileBased', the failure is
    logged, posted to the server as a FAILED status and StopHandler is
    raised.  Otherwise every user file becomes a fake one-event File in
    block 'UserFilesFakeBlock' with run 1 and its list index as lumi; when
    ``tm_totalunits`` is positive only that many files are used.

    Returns a Result carrying the task and the fileset.
    """
    task = kwargs['task']
    taskName = task['tm_taskname']
    self.logger.info("Data discovery and splitting for %s using user-provided files" % taskName)

    userfiles = task['tm_arguments'].get('userfiles')
    splitting = task['tm_split_algo']
    total_units = task['tm_totalunits']

    wrongSplitting = splitting != 'FileBased'
    if wrongSplitting or not userfiles:
        # The splitting complaint takes precedence when both checks fail
        if wrongSplitting:
            msg = "Data.splitting must be set to 'FileBased' when using a custom set of files."
        else:
            msg = "No files specified to process for task %s." % taskName
        self.logger.error("Setting %s as failed: %s" % (taskName, msg))
        configreq = {'workflow': taskName,
                     'status': "FAILED",
                     'subresource': 'failure',
                     'failure': b64encode(msg)}
        self.server.post(self.resturi, data=urllib.urlencode(configreq))
        raise StopHandler(msg)

    if not hasattr(self.config.Sites, 'available'):
        siteDB = SiteDBJSON({"key": self.config.TaskWorker.cmskey,
                             "cert": self.config.TaskWorker.cmscert})
        locations = siteDB.getAllCMSNames()
    else:
        locations = self.config.Sites.available

    userFileset = Fileset(name=taskName)
    self.logger.info("There are %d files specified by the user." % len(userfiles))
    if total_units > 0:
        self.logger.info("Will run over the first %d files." % total_units)

    for idx, userfile in enumerate(userfiles):
        newFile = File(userfile, size=1000, events=1)
        newFile.setLocation(locations)
        newFile.addRun(Run(1, idx))
        newFile["block"] = 'UserFilesFakeBlock'
        newFile["first_event"] = 1
        newFile["last_event"] = 2
        userFileset.addFile(newFile)
        # idx + 1 files have been added so far
        if total_units > 0 and idx + 1 >= total_units:
            break

    return Result(task=task, result=userFileset)
def createResubmitSpec(self, serverUrl, couchDB):
    """
    _createResubmitSpec_

    Create a bogus resubmit workload.

    A "TestWorkload" with a single "reco" processing task is built, one
    failed job holding two files is reported to the DataCollectionService
    at ``serverUrl``/``couchDB``, and the workload is truncated into
    "Resubmit_TestWorkload" starting at its top level task.

    Returns the truncated workload helper.
    """
    self.site = "cmssrm.fnal.gov"
    workload = WMWorkloadHelper(WMWorkload("TestWorkload"))
    recoTask = workload.newTask("reco")
    workload.setOwnerDetails(name="evansde77", group="DMWM")

    # first task uses the input dataset
    recoTask.addInputDataset(primary="PRIMARY", processed="processed-v1", tier="TIER1")
    recoTask.data.input.splitting.algorithm = "File"
    recoTask.setTaskType("Processing")
    recoStep = recoTask.makeStep("cmsRun1")
    recoStep.setStepType("CMSSW")
    recoTask.applyTemplates()

    recoStepHelper = recoStep.getTypeHelper()
    recoStepHelper.addOutputModule("outputRECO",
                                   primaryDataset="PRIMARY",
                                   processedDataset="processed-v2",
                                   dataTier="TIER2",
                                   lfnBase="/store/dunkindonuts",
                                   mergedLFNBase="/store/kfc")

    collectionService = DataCollectionService(url=serverUrl, database=couchDB)

    def getJob(workload):
        """Build a job for the reco task of the given workload."""
        job = Job()
        job["task"] = workload.getTask("reco").getPathName()
        job["workflow"] = workload.name()
        job["location"] = self.site
        job["owner"] = "evansde77"
        job["group"] = "DMWM"
        return job

    # Two files on run 1: lumis (1, 2) and lumis (3, 4)
    failedFiles = []
    for lumiPair in ((1, 2), (3, 4)):
        failedFile = WMFile(lfn=makeUUID(), size=1024, events=1024)
        failedFile.setLocation([self.site])
        failedFile.addRun(Run(1, *lumiPair))
        failedFiles.append(failedFile)

    failedJob = getJob(workload)
    for failedFile in failedFiles:
        failedJob.addFile(failedFile)

    collectionService.failedJobs([failedJob])

    topLevelTask = workload.getTopLevelTask()[0]
    workload.truncate("Resubmit_TestWorkload", topLevelTask.getPathName(),
                      serverUrl, couchDB)

    return workload
def createFile(lfn, events, run, lumis, location):
    """
    _createFile_

    Create a file for testing.

    The file has size 1000, ``events`` events and a single run ``run``
    holding ``lumis`` consecutive lumis starting at ``run * lumis``.

    Returns the new File, located at ``location``.
    """
    firstLumi = run * lumis
    lumiNumbers = [firstLumi + offset for offset in range(lumis)]

    testFile = File(lfn=lfn, size=1000, events=events)
    testFile.addRun(Run(run, *lumiNumbers))
    testFile.setLocation(location)
    return testFile
def createFile(self, lfn, events, run, lumis, location):
    """
    _createFile_

    Create a file for testing.

    The file has size 1000, ``events`` events and a single run ``run``
    holding ``lumis`` consecutive lumis starting at ``run * lumis``.

    Returns the new File, located at ``location``.
    """
    lumiNumbers = [(run * lumis) + offset for offset in range(lumis)]
    fileRun = Run(run, *lumiNumbers)

    testFile = File(lfn=lfn, size=1000, events=events)
    testFile.addRun(fileRun)
    testFile.setLocation(location)
    return testFile
def getInputFilesFromStep(self, stepName, inputSource=None):
    """
    _getInputFilesFromStep_

    Retrieve a list of input files from the given step.

    When ``inputSource`` is None every input source section of the step is
    scanned, otherwise only the named one.  Each ``file<N>`` entry of a
    source (N running up to its ``fileCount``) is converted into a File
    carrying the entry's attributes and runs.

    Returns the list of File objects; empty when the sources hold no files.
    """
    step = self.retrieveStep(stepName)

    # Identity check against None; the loop below uses its own variable so
    # the inputSource argument is not overwritten.
    if inputSource is None:
        inputSources = step.input.listSections_()
    else:
        inputSources = [inputSource]

    inputFiles = []
    for sourceName in inputSources:
        source = getattr(step.input, sourceName)
        for fileNum in range(source.files.fileCount):
            fwjrFile = getattr(source.files, "file%d" % fileNum)

            inputFile = File(lfn=getattr(fwjrFile, "lfn", None),
                             size=getattr(fwjrFile, "size", 0),
                             events=getattr(fwjrFile, "events", 0))
            inputFile["pfn"] = getattr(fwjrFile, "pfn", None)
            inputFile["branches"] = getattr(fwjrFile, "branches", [])
            inputFile["catalog"] = getattr(fwjrFile, "catalog", None)
            inputFile["guid"] = getattr(fwjrFile, "guid", None)
            inputFile["input_source_class"] = getattr(fwjrFile, "input_source_class", None)
            inputFile["module_label"] = getattr(fwjrFile, "module_label", None)
            inputFile["input_type"] = getattr(fwjrFile, "input_type", None)

            # One Run per section of the "runs" attribute; the section
            # value is unpacked as the lumis of that run.
            runSection = getattr(fwjrFile, "runs")
            for runNumber in runSection.listSections_():
                lumiTuple = getattr(runSection, str(runNumber))
                inputFile.addRun(Run(int(runNumber), *lumiTuple))

            inputFiles.append(inputFile)

    return inputFiles
def getInputFilesFromStep(self, stepName, inputSource=None):
    """
    _getInputFilesFromStep_

    Retrieve a list of input files from the given step.

    With ``inputSource`` left as None all input source sections of the
    step are read; otherwise only the named one.  Every ``file<N>`` entry
    of a source (N running up to its ``fileCount``) becomes a File with
    the entry's attributes and runs.

    Returns the list of File objects; empty when the sources hold no files.
    """
    step = self.retrieveStep(stepName)

    # Compare to None by identity, and keep the inputSource argument intact
    # by iterating with a separate name.
    if inputSource is None:
        sourceNames = step.input.listSections_()
    else:
        sourceNames = [inputSource]

    inputFiles = []
    for sourceName in sourceNames:
        source = getattr(step.input, sourceName)
        for fileNum in range(source.files.fileCount):
            fwjrFile = getattr(source.files, "file%d" % fileNum)

            lfn = getattr(fwjrFile, "lfn", None)
            size = getattr(fwjrFile, "size", 0)
            events = getattr(fwjrFile, "events", 0)
            inputFile = File(lfn=lfn, size=size, events=events)

            inputFile["pfn"] = getattr(fwjrFile, "pfn", None)
            inputFile["branches"] = getattr(fwjrFile, "branches", [])
            inputFile["catalog"] = getattr(fwjrFile, "catalog", None)
            inputFile["guid"] = getattr(fwjrFile, "guid", None)
            inputFile["input_source_class"] = getattr(fwjrFile, "input_source_class", None)
            inputFile["module_label"] = getattr(fwjrFile, "module_label", None)
            inputFile["input_type"] = getattr(fwjrFile, "input_type", None)

            # One Run per section of the "runs" attribute; the section
            # value is unpacked as the lumis of that run.
            runSection = getattr(fwjrFile, "runs")
            for runNumber in runSection.listSections_():
                lumiTuple = getattr(runSection, str(runNumber))
                inputFile.addRun(Run(int(runNumber), *lumiTuple))

            inputFiles.append(inputFile)

    return inputFiles
def generateFakeMCFile(self, numEvents=100, firstEvent=1, lastEvent=100,
                       firstLumi=1, lastLumi=10):
    """
    _generateFakeMCFile_

    Build a subscription over a fileset holding one fake MC file.

    The file "MCFakeFileTest" has ``numEvents`` events, run 1 with lumis
    ``firstLumi`` through ``lastLumi`` (inclusive) and the given first and
    last event numbers.

    Returns an EventBased Production subscription on that fileset.
    """
    # MC comes with only one MCFakeFile
    mcFileset = Fileset(name="MCTestFileset")

    fakeFile = File("MCFakeFileTest", size=1000, events=numEvents)
    fakeFile.setLocation("se01")
    lumiNumbers = range(firstLumi, lastLumi + 1)
    fakeFile.addRun(Run(1, *lumiNumbers))
    fakeFile["first_event"] = firstEvent
    fakeFile["last_event"] = lastEvent

    mcWorkflow = Workflow()
    mcFileset.addFile(fakeFile)

    return Subscription(fileset=mcFileset,
                        workflow=mcWorkflow,
                        split_algo="EventBased",
                        type="Production")
def generateFakeMCFile(self, numEvents=100, firstEvent=1, lastEvent=100,
                       firstLumi=1, lastLumi=10):
    """
    _generateFakeMCFile_

    Create the single fake MC file and a subscription to split it.

    The file "MCFakeFileTest" carries ``numEvents`` events and run 1 with
    the lumis ``firstLumi``..``lastLumi`` inclusive; it is the only file of
    the "MCTestFileset" fileset.

    Returns an EventBased Production subscription on that fileset.
    """
    # MC comes with only one MCFakeFile
    testFileset = Fileset(name="MCTestFileset")

    mcFile = File("MCFakeFileTest", size=1000, events=numEvents)
    mcFile.setLocation('se01')
    mcFile.addRun(Run(1, *range(firstLumi, lastLumi + 1)))
    mcFile["first_event"] = firstEvent
    mcFile["last_event"] = lastEvent

    subscriptionArgs = {
        "workflow": Workflow(),
        "split_algo": "EventBased",
        "type": "Production",
    }
    testFileset.addFile(mcFile)

    return Subscription(fileset=testFileset, **subscriptionArgs)
def createFilesetFromDBS(self, collection, filesetName, dbsURL, dataset, mask=None):
    """
    _createFilesetFromDBS_

    Get info from DBS, apply mask (filter) and create a fileset.

    Each file DBS reports for ``dataset`` becomes a File located where its
    block is, with one Run per run number present in its lumi list.  The
    files are added to a CouchFileset called ``filesetName`` inside
    ``collection``; ``mask`` is passed on to the fileset's ``add`` call.

    Returns the tuple (fileSet, fileList), where fileList is whatever
    ``fileSet.add`` returned.
    """
    fileSet = CouchFileset(database=self.database, url=self.url, name=filesetName)
    fileSet.setCollection(collection)

    dbsReader = DBSReader(dbsURL, version="DBS_2_0_9", mode="GET")
    dbsResults = dbsReader.dbs.listFiles(path=dataset,
                                         retriveList=["retrive_lumi", "retrive_run"])
    foundMessage = "Found %s files from DBS" % len(dbsResults)
    logging.info(foundMessage)

    collectedFiles = []
    # Cache of block name -> locations, filled on first use of a block
    blockLocations = {}
    for dbsEntry in dbsResults:
        blockName = dbsEntry["Block"]["Name"]
        if blockName not in blockLocations:
            blockLocations[blockName] = dbsReader.listFileBlockLocation(blockName)

        dbsFile = File(
            lfn=dbsEntry["LogicalFileName"],
            size=dbsEntry["FileSize"],
            merged=True,
            events=dbsEntry["NumberOfEvents"],
            locations=blockLocations[blockName],
        )

        # Collect the lumis per run; runs are keyed by the run number as string
        fileRuns = {}
        for lumiEntry in dbsEntry["LumiList"]:
            runNumber = lumiEntry["RunNumber"]
            lumiNumber = lumiEntry["LumiSectionNumber"]
            knownRun = fileRuns.get(str(runNumber))
            if knownRun is None:
                fileRuns[str(runNumber)] = Run(runNumber, lumiNumber)
            else:
                knownRun.lumis.append(lumiNumber)

        for fileRun in fileRuns.values():
            dbsFile.addRun(fileRun)
        collectedFiles.append(dbsFile)

    uploadMessage = "Uploading %s files in fileset" % len(collectedFiles)
    logging.info(uploadMessage)
    fileList = fileSet.add(collectedFiles, mask)

    return fileSet, fileList
def execute(self, *args, **kwargs):
    """
    _execute_

    Build a fileset out of the files the user listed for the task.

    The file list is read from ``tm_user_files``.  TaskWorkerException is
    raised when the list is empty or the splitting algorithm is not
    'FileBased'.  Locations come from ``self.config.Sites.available`` when
    set, otherwise from all PSNs reported by CRIC.  Every user file becomes
    a fake one-event File in block 'UserFilesFakeBlock' with run 1 and its
    list index as lumi; when ``tm_totalunits`` is positive only that many
    files are used.

    Returns a Result carrying the task and the fileset.
    """
    task = kwargs['task']
    taskName = task['tm_taskname']
    self.logger.info("Data discovery and splitting for %s using user-provided files" % taskName)

    userfiles = task['tm_user_files']
    splitting = task['tm_split_algo']
    total_units = task['tm_totalunits']

    # The splitting complaint takes precedence when both checks fail
    if splitting != 'FileBased':
        raise TaskWorkerException(
            "Data.splitting must be set to 'FileBased' when using a custom set of files.")
    if not userfiles:
        raise TaskWorkerException("No files specified to process for task %s." % taskName)

    if not hasattr(self.config.Sites, 'available'):
        with self.config.TaskWorker.envForCMSWEB:
            # cache duration is in hours
            configDict = {"cacheduration": 1, "pycurl": True}
            resourceCatalog = CRIC(logger=self.logger, configDict=configDict)
            locations = resourceCatalog.getAllPSNs()
    else:
        locations = self.config.Sites.available

    userFileset = Fileset(name=taskName)
    self.logger.info("There are %d files specified by the user." % len(userfiles))
    if total_units > 0:
        self.logger.info("Will run over the first %d files." % total_units)

    for idx, userfile in enumerate(userfiles):
        newFile = File(userfile, size=1000, events=1)
        newFile.setLocation(locations)
        newFile.addRun(Run(1, idx))
        newFile["block"] = 'UserFilesFakeBlock'
        newFile["first_event"] = 1
        newFile["last_event"] = 2
        userFileset.addFile(newFile)
        # idx + 1 files have been added so far
        if total_units > 0 and idx + 1 >= total_units:
            break

    return Result(task=task, result=userFileset)
def execute(self, *args, **kwargs):
    """
    _execute_

    Build a fileset out of the files the user listed for the task.

    The file list is read from ``tm_user_files`` when that entry is present
    and non-empty, otherwise from ``tm_arguments['userfiles']``.  When no
    files are found, or the splitting algorithm is not 'FileBased', the
    failure is logged, posted to the server as a FAILED status and
    StopHandler is raised.  Otherwise every user file becomes a fake
    one-event File in block 'UserFilesFakeBlock' with run 1 and its list
    index as lumi; when ``tm_totalunits`` is positive only that many files
    are used.

    Returns a Result carrying the task and the fileset.
    """
    task = kwargs['task']
    taskName = task['tm_taskname']
    self.logger.info("Data discovery and splitting for %s using user-provided files" % taskName)

    if 'tm_user_files' not in task or not task['tm_user_files']:
        # For backward compatibility only.
        userfiles = task['tm_arguments'].get('userfiles')
    else:
        userfiles = task['tm_user_files']
    splitting = task['tm_split_algo']
    total_units = task['tm_totalunits']

    wrongSplitting = splitting != 'FileBased'
    if wrongSplitting or not userfiles:
        # The splitting complaint takes precedence when both checks fail
        if wrongSplitting:
            msg = "Data.splitting must be set to 'FileBased' when using a custom set of files."
        else:
            msg = "No files specified to process for task %s." % taskName
        self.logger.error("Setting %s as failed: %s" % (taskName, msg))
        configreq = {'workflow': taskName,
                     'status': "FAILED",
                     'subresource': 'failure',
                     'failure': b64encode(msg)}
        self.server.post(self.resturi, data=urllib.urlencode(configreq))
        raise StopHandler(msg)

    if not hasattr(self.config.Sites, 'available'):
        siteDB = SiteDBJSON({"key": self.config.TaskWorker.cmskey,
                             "cert": self.config.TaskWorker.cmscert})
        locations = siteDB.getAllCMSNames()
    else:
        locations = self.config.Sites.available

    userFileset = Fileset(name=taskName)
    self.logger.info("There are %d files specified by the user." % len(userfiles))
    if total_units > 0:
        self.logger.info("Will run over the first %d files." % total_units)

    for idx, userfile in enumerate(userfiles):
        newFile = File(userfile, size=1000, events=1)
        newFile.setLocation(locations)
        newFile.addRun(Run(1, idx))
        newFile["block"] = 'UserFilesFakeBlock'
        newFile["first_event"] = 1
        newFile["last_event"] = 2
        userFileset.addFile(newFile)
        # idx + 1 files have been added so far
        if total_units > 0 and idx + 1 >= total_units:
            break

    return Result(task=task, result=userFileset)
def testDataStructsFile(self):
    """
    _testDataStructsFile_

    Tests our ability to create a WMBS file from a DataStructs File and
    vice versa.

    A DataStructs file is loaded into a WMBS File, created and saved; the
    file is then loaded back by LFN, its attributes are checked and its
    conversion back to a DataStructs file must equal the input.
    """
    myThread = threading.currentThread()

    expectedSize = 1024
    expectedEvents = 100
    expectedCksum = {"cksum": '1'}
    expectedSE = "se1.cern.ch"

    # The parent LFN has to exist before the child file is created
    parentFile = File(lfn="lfn2")
    parentFile.create()

    inputFile = WMFile(lfn="lfn1",
                       size=expectedSize,
                       events=expectedEvents,
                       checksums=expectedCksum,
                       parents=set(["lfn2"]))
    inputFile.addRun(Run(1, 45))
    inputFile.setLocation(se=expectedSE)

    testFile = File()
    testFile.loadFromDataStructsFile(file=inputFile)
    testFile.create()
    testFile.save()

    loadFile = File(lfn="lfn1")
    loadFile.loadData(parentage=1)

    expectedValues = (('size', expectedSize),
                      ('events', expectedEvents),
                      ('checksums', expectedCksum),
                      ('locations', set([expectedSE])))
    for key, expected in expectedValues:
        self.assertEqual(loadFile[key], expected)
    # The parent LFN check is left disabled:
    # self.assertEqual(loadFile['parents'].pop()['lfn'], 'lfn2')

    wmFile = loadFile.returnDataStructsFile()
    self.assertEqual(wmFile == inputFile, True)
def setUp(self):
    """
    _setUp_

    Create two subscriptions: One that contains a single file and one that
    contains multiple files.

    The multi-file fileset holds ten files, file ``i`` carrying run ``i``
    with lumis ``i * 100`` .. ``i * 100 + 19``.  The single-file fileset
    holds one file on run 13 with lumis 50-59 and 70-79.  Both
    subscriptions are FileBased Processing ones and share one workflow.
    """
    self.multipleFileFileset = Fileset(name="TestFileset1")
    for fileIndex in range(10):
        multiFile = File(makeUUID(), size=1000, events=100)
        for siteName in ('blenheim', 'malpaquet'):
            multiFile.setLocation(siteName)
        fileLumis = [(fileIndex * 100) + offset for offset in range(20)]
        multiFile.addRun(Run(fileIndex, *fileLumis))
        self.multipleFileFileset.addFile(multiFile)

    self.singleFileFileset = Fileset(name="TestFileset2")
    singleFile = File("/some/file/name", size=1000, events=100)
    singleFile.setLocation('blenheim')
    singleLumis = list(range(50, 60)) + list(range(70, 80))
    singleFile.addRun(Run(13, *singleLumis))
    self.singleFileFileset.addFile(singleFile)

    sharedWorkflow = Workflow()
    self.multipleFileSubscription = Subscription(fileset=self.multipleFileFileset,
                                                 workflow=sharedWorkflow,
                                                 split_algo="FileBased",
                                                 type="Processing")
    self.singleFileSubscription = Subscription(fileset=self.singleFileFileset,
                                               workflow=sharedWorkflow,
                                               split_algo="FileBased",
                                               type="Processing")

    # The subscriptions are deliberately not created here:
    # self.multipleFileSubscription.create()
    # self.singleFileSubscription.create()

    self.performanceParams = {'timePerEvent': 12,
                              'memoryRequirement': 2300,
                              'sizePerEvent': 400}
def testDataStructsFile(self):
    """
    _testDataStructsFile_

    Tests our ability to create a WMBS file from a DataStructs File and
    vice versa.

    The input DataStructs file is loaded into a WMBS File that is created
    and saved.  A second WMBS File is loaded by the same LFN, checked
    attribute by attribute and converted back; the result must compare
    equal to the input file.
    """
    myThread = threading.currentThread()

    fileAttributes = {
        "lfn": "lfn1",
        "size": 1024,
        "events": 100,
        "checksums": {"cksum": '1'},
        "parents": set(["lfn2"]),
    }
    storageElement = "se1.cern.ch"

    # Create the parent first so "lfn2" exists when the child is created
    parentFile = File(lfn="lfn2")
    parentFile.create()

    testFile = File()
    inputFile = WMFile(**fileAttributes)
    inputFile.addRun(Run(1, 45))
    inputFile.setLocation(se=storageElement)
    testFile.loadFromDataStructsFile(file=inputFile)
    testFile.create()
    testFile.save()

    loadFile = File(lfn="lfn1")
    loadFile.loadData(parentage=1)

    self.assertEqual(loadFile['size'], fileAttributes["size"])
    self.assertEqual(loadFile['events'], fileAttributes["events"])
    self.assertEqual(loadFile['checksums'], fileAttributes["checksums"])
    self.assertEqual(loadFile['locations'], set([storageElement]))
    # The parent LFN check is left disabled:
    # self.assertEqual(loadFile['parents'].pop()['lfn'], 'lfn2')

    roundTripped = loadFile.returnDataStructsFile()
    self.assertEqual(roundTripped == inputFile, True)
def stuffACDCDatabase(self, numFiles=50, lumisPerFile=20, lumisPerACDCRecord=2):
    """
    _stuffACDCDatabase_

    Queue ACDC documents for the processing fileset and then for the
    merge fileset, and commit them. Every file contributes one document
    per slice of at most lumisPerACDCRecord lumis out of lumis
    1..lumisPerFile.
    """
    owner = "*****@*****.**"
    group = "unknown"
    lumiUpperBound = lumisPerFile + 1

    # Processing records: merged files available at all valid locations.
    processingFileset = "/%s/DataProcessing" % self.workflowName
    for fileNum in range(numFiles):
        for firstLumi in range(1, lumiUpperBound, lumisPerACDCRecord):
            dataLfn = "/store/data/a/%d" % fileNum
            record = File(lfn=dataLfn, size=100, events=250,
                          locations=self.validLocations, merged=1)
            sliceEnd = min(firstLumi + lumisPerACDCRecord, lumiUpperBound)
            record.addRun(Run(fileNum + 1, *range(firstLumi, sliceEnd)))
            self.acdcDB.queue({
                "collection_name": self.workflowName,
                "collection_type": "ACDC.CollectionTypes.DataCollection",
                "files": {dataLfn: record},
                "fileset_name": processingFileset,
                "owner": {"user": owner, "group": group},
            })

    # Merge records: unmerged files, each at one randomly chosen location.
    mergeFileset = "/%s/DataProcessing/DataProcessingMergeRECOoutput" % self.workflowName
    for fileNum in range(numFiles):
        for firstLumi in range(1, lumiUpperBound, lumisPerACDCRecord):
            unmergedLfn = "/store/unmerged/b/%d" % fileNum
            record = File(lfn=unmergedLfn, size=100, events=250,
                          locations=set([choice(self.validLocations)]), merged=0)
            sliceEnd = min(firstLumi + lumisPerACDCRecord, lumiUpperBound)
            record.addRun(Run(fileNum + 1, *range(firstLumi, sliceEnd)))
            self.acdcDB.queue({
                "collection_name": self.workflowName,
                "collection_type": "ACDC.CollectionTypes.DataCollection",
                "files": {unmergedLfn: record},
                "fileset_name": mergeFileset,
                "owner": {"user": owner, "group": group},
            })

    self.acdcDB.commit()
    return
def setUp(self):
    """
    _setUp_

    Create two subscriptions: One that contains a single file and one that
    contains multiple files.

    Also defines the performance parameters dictionary stored on the
    test case.
    """
    self.multipleFileFileset = Fileset(name="TestFileset1")
    for i in range(10):
        newFile = File(makeUUID(), size=1000, events=100)
        newFile.setLocation('blenheim')
        newFile.setLocation('malpaquet')
        # Twenty consecutive lumis per file, shifted by 100 for each file.
        lumis = [(i * 100) + lumi for lumi in range(20)]
        newFile.addRun(Run(i, *lumis))
        self.multipleFileFileset.addFile(newFile)

    self.singleFileFileset = Fileset(name="TestFileset2")
    newFile = File("/some/file/name", size=1000, events=100)
    newFile.setLocation('blenheim')
    # range objects cannot be concatenated with "+" on Python 3, so build
    # real lists first; this is equivalent on Python 2.
    lumis = list(range(50, 60)) + list(range(70, 80))
    newFile.addRun(Run(13, *lumis))
    self.singleFileFileset.addFile(newFile)

    testWorkflow = Workflow()
    self.multipleFileSubscription = Subscription(fileset=self.multipleFileFileset,
                                                 workflow=testWorkflow,
                                                 split_algo="FileBased",
                                                 type="Processing")
    self.singleFileSubscription = Subscription(fileset=self.singleFileFileset,
                                               workflow=testWorkflow,
                                               split_algo="FileBased",
                                               type="Processing")

    #self.multipleFileSubscription.create()
    #self.singleFileSubscription.create()

    self.performanceParams = {'timePerEvent': 12,
                              'memoryRequirement': 2300,
                              'sizePerEvent': 400}

    return
def setUp(self):
    """
    _setUp_

    Create four filesets and a FixedDelay "Processing" subscription for
    each of them:

      - ten files, each with its own run number and lumi
      - one file with a single lumi in run 1
      - ten files in run 1 whose lumi advances every three files
      - ten files in run 1 that all carry lumi 45
    """
    self.multipleFileFileset = Fileset(name="TestFileset1")
    for i in range(10):
        newFile = File(makeUUID(), size=1000, events=100)
        newFile.addRun(Run(i, *[45 + i]))
        self.multipleFileFileset.addFile(newFile)

    self.singleFileFileset = Fileset(name="TestFileset2")
    newFile = File("/some/file/name", size=1000, events=100)
    newFile.addRun(Run(1, *[45]))
    self.singleFileFileset.addFile(newFile)

    self.multipleFileLumiset = Fileset(name="TestFileset3")
    for i in range(10):
        newFile = File(makeUUID(), size=1000, events=100)
        # Floor division keeps the lumi an integer on Python 3 as well
        # (plain "/" would produce 45.0, 45.33, ... there).
        newFile.addRun(Run(1, *[45 + i // 3]))
        self.multipleFileLumiset.addFile(newFile)

    self.singleLumiFileset = Fileset(name="TestFileset4")
    for i in range(10):
        newFile = File(makeUUID(), size=1000, events=100)
        newFile.addRun(Run(1, *[45]))
        self.singleLumiFileset.addFile(newFile)

    testWorkflow = Workflow()
    self.multipleFileSubscription = Subscription(fileset=self.multipleFileFileset,
                                                 workflow=testWorkflow,
                                                 split_algo="FixedDelay",
                                                 type="Processing")
    self.singleFileSubscription = Subscription(fileset=self.singleFileFileset,
                                               workflow=testWorkflow,
                                               split_algo="FixedDelay",
                                               type="Processing")
    self.multipleLumiSubscription = Subscription(fileset=self.multipleFileLumiset,
                                                 workflow=testWorkflow,
                                                 split_algo="FixedDelay",
                                                 type="Processing")
    self.singleLumiSubscription = Subscription(fileset=self.singleLumiFileset,
                                               workflow=testWorkflow,
                                               split_algo="FixedDelay",
                                               type="Processing")
    return
def execute(self, *args, **kwargs):
    """
    Build the single fake input file used for MC tasks.

    The number of events is read from kwargs['task']['tm_totalunits'];
    the file is placed at self.config.Sites.available and carries lumis
    1 to 10 of run 1. Returns a Result wrapping the task and the
    one-file fileset.
    """
    task = kwargs['task']
    eventCount = task['tm_totalunits']

    # MC comes with only one MCFakeFile
    mcFileset = Fileset(name="MCFakeFileSet")
    fakeFile = File("MCFakeFile", size=1000, events=eventCount)
    fakeFile.setLocation(self.config.Sites.available)
    fakeFile.addRun(Run(1, *range(1, 11)))
    fakeFile["block"] = 'MCFackBlock'
    fakeFile["first_event"] = 1
    fakeFile["last_event"] = eventCount
    mcFileset.addFile(fakeFile)

    return Result(task=task, result=mcFileset)
def createTestJob(self):
    """
    Build a job with two input files and id 1 to hand to the
    DashboardInterface.
    """
    testJob = Job(name="ThisIsASillyName")

    # Two files in run 1, one lumi each (45 and 46).
    firstInput = File(lfn="/this/is/a/lfnA", size=1024, events=10)
    firstInput.addRun(Run(1, 45))
    secondInput = File(lfn="/this/is/a/lfnB", size=1024, events=10)
    secondInput.addRun(Run(1, 46))

    for inputFile in (firstInput, secondInput):
        testJob.addFile(inputFile)

    testJob['id'] = 1
    return testJob
def stuffACDCDatabase(self, numFiles=50, lumisPerFile=20, lumisPerACDCRecord=2):
    """
    _stuffACDCDatabase_

    Queue ACDC documents owned by self.user / self.group, first for the
    processing fileset and then for the merge fileset, and commit them.
    Each file yields one document per slice of at most
    lumisPerACDCRecord lumis taken from lumis 1..lumisPerFile.
    """
    owner = self.user
    group = self.group
    collectionType = 'ACDC.CollectionTypes.DataCollection'
    stopLumi = lumisPerFile + 1

    # Processing fileset: merged files at every valid location.
    filesetName = '/%s/DataProcessing' % self.workflowName
    for fileIdx in range(numFiles):
        for startLumi in range(1, stopLumi, lumisPerACDCRecord):
            lfn = '/store/data/a/%d' % fileIdx
            acdcFile = File(lfn=lfn, size=100, events=250,
                            locations=self.validLocations, merged=1)
            lumiSlice = range(startLumi, min(startLumi + lumisPerACDCRecord, stopLumi))
            acdcFile.addRun(Run(fileIdx + 1, *lumiSlice))
            acdcDoc = dict()
            acdcDoc['collection_name'] = self.workflowName
            acdcDoc['collection_type'] = collectionType
            acdcDoc['files'] = {lfn: acdcFile}
            acdcDoc['fileset_name'] = filesetName
            acdcDoc['owner'] = {'user': owner, 'group': group}
            self.acdcDB.queue(acdcDoc)

    # Merge fileset: unmerged files, each at one randomly picked location.
    filesetName = '/%s/DataProcessing/DataProcessingMergeRECOoutput' % self.workflowName
    for fileIdx in range(numFiles):
        for startLumi in range(1, stopLumi, lumisPerACDCRecord):
            lfn = '/store/unmerged/b/%d' % fileIdx
            acdcFile = File(lfn=lfn, size=100, events=250,
                            locations=set([choice(self.validLocations)]), merged=0)
            lumiSlice = range(startLumi, min(startLumi + lumisPerACDCRecord, stopLumi))
            acdcFile.addRun(Run(fileIdx + 1, *lumiSlice))
            acdcDoc = dict()
            acdcDoc['collection_name'] = self.workflowName
            acdcDoc['collection_type'] = collectionType
            acdcDoc['files'] = {lfn: acdcFile}
            acdcDoc['fileset_name'] = filesetName
            acdcDoc['owner'] = {'user': owner, 'group': group}
            self.acdcDB.queue(acdcDoc)

    self.acdcDB.commit()
    return
def stuffACDCDatabase(self, numFiles = 50, lumisPerFile = 20, lumisPerACDCRecord = 2):
    """
    _stuffACDCDatabase_

    Populate the ACDC database with documents owned by 'unknown'/'unknown':
    one pass for the processing fileset, one pass for the merge fileset,
    followed by a commit. A document covers at most lumisPerACDCRecord
    lumis of a file whose lumis run from 1 to lumisPerFile.
    """
    owner = 'unknown'
    group = 'unknown'
    upperLumi = lumisPerFile + 1
    recordStarts = range(1, upperLumi, lumisPerACDCRecord)

    filesetName = '/%s/DataProcessing' % self.workflowName
    for fileNumber in range(numFiles):
        for recordStart in recordStarts:
            lfn = '/store/data/a/%d' % fileNumber
            # Merged file available at all of the valid locations.
            acdcFile = File(lfn = lfn, size = 100, events = 250,
                            locations = self.validLocations, merged = 1)
            recordEnd = min(recordStart + lumisPerACDCRecord, upperLumi)
            acdcFile.addRun(Run(fileNumber + 1, *range(recordStart, recordEnd)))
            self.acdcDB.queue({'collection_name' : self.workflowName,
                               'collection_type' : 'ACDC.CollectionTypes.DataCollection',
                               'files' : {lfn : acdcFile},
                               'fileset_name' : filesetName,
                               'owner' : {'user': owner, 'group' : group}})

    filesetName = '/%s/DataProcessing/DataProcessingMergeRECOoutput' % self.workflowName
    for fileNumber in range(numFiles):
        for recordStart in recordStarts:
            lfn = '/store/unmerged/b/%d' % fileNumber
            # Unmerged file located at a single, randomly chosen location.
            acdcFile = File(lfn = lfn, size = 100, events = 250,
                            locations = set([choice(self.validLocations)]), merged = 0)
            recordEnd = min(recordStart + lumisPerACDCRecord, upperLumi)
            acdcFile.addRun(Run(fileNumber + 1, *range(recordStart, recordEnd)))
            self.acdcDB.queue({'collection_name' : self.workflowName,
                               'collection_type' : 'ACDC.CollectionTypes.DataCollection',
                               'files' : {lfn : acdcFile},
                               'fileset_name' : filesetName,
                               'owner' : {'user': owner, 'group' : group}})

    self.acdcDB.commit()
    return
def setUp(self):
    """
    _setUp_

    Create four filesets, all located at "somese.cern.ch", and an
    EndOfRun "Processing" subscription for each of them:

      - ten files, each with its own run number and lumi
      - one file with a single lumi in run 1
      - ten files in run 1 whose lumi advances every three files
      - ten files in run 1 that all carry lumi 45
    """
    self.multipleFileFileset = Fileset(name = "TestFileset1")
    for i in range(10):
        newFile = File(makeUUID(), size = 1000, events = 100,
                       locations = set(["somese.cern.ch"]))
        newFile.addRun(Run(i, *[45 + i]))
        self.multipleFileFileset.addFile(newFile)

    self.singleFileFileset = Fileset(name = "TestFileset2")
    newFile = File("/some/file/name", size = 1000, events = 100,
                   locations = set(["somese.cern.ch"]))
    newFile.addRun(Run(1, *[45]))
    self.singleFileFileset.addFile(newFile)

    self.multipleFileLumiset = Fileset(name = "TestFileset3")
    for i in range(10):
        newFile = File(makeUUID(), size = 1000, events = 100,
                       locations = set(["somese.cern.ch"]))
        # Floor division keeps the lumi an integer on Python 3 as well
        # (plain "/" would produce 45.0, 45.33, ... there).
        newFile.addRun(Run(1, *[45 + i // 3]))
        self.multipleFileLumiset.addFile(newFile)

    self.singleLumiFileset = Fileset(name = "TestFileset4")
    for i in range(10):
        newFile = File(makeUUID(), size = 1000, events = 100,
                       locations = set(["somese.cern.ch"]))
        newFile.addRun(Run(1, *[45]))
        self.singleLumiFileset.addFile(newFile)

    testWorkflow = Workflow()
    self.multipleFileSubscription = Subscription(fileset = self.multipleFileFileset,
                                                 workflow = testWorkflow,
                                                 split_algo = "EndOfRun",
                                                 type = "Processing")
    self.singleFileSubscription = Subscription(fileset = self.singleFileFileset,
                                               workflow = testWorkflow,
                                               split_algo = "EndOfRun",
                                               type = "Processing")
    self.multipleLumiSubscription = Subscription(fileset = self.multipleFileLumiset,
                                                 workflow = testWorkflow,
                                                 split_algo = "EndOfRun",
                                                 type = "Processing")
    self.singleLumiSubscription = Subscription(fileset = self.singleLumiFileset,
                                               workflow = testWorkflow,
                                               split_algo = "EndOfRun",
                                               type = "Processing")
    return
def testG_LumiMask(self):
    """
    _testG_LumiMask_

    Check that a lumi-mask passed to the EventAwareLumiBased splitter
    restricts the jobs to the selected runs/lumis.
    """
    splitter = SplitterFactory()

    # Three files with 100 events per lumi:
    #  - file1: run 1 with lumis 10-17
    #  - file2: run 2 with lumis 20-21 and run 3 with lumis 30-31
    #  - file3: run 4 with lumis 40-44
    fileA = File(lfn="/this/is/file1", size=1000, events=800)
    fileB = File(lfn="/this/is/file2", size=1000, events=400)
    fileC = File(lfn="/this/is/file3", size=1000, events=500)

    runLayout = [
        (fileA, [(1, [10 + n for n in range(8)])]),
        (fileB, [(2, [20 + n for n in range(2)]),
                 (3, [30 + n for n in range(2)])]),
        (fileC, [(4, [40 + n for n in range(5)])]),
    ]
    for inputFile, runSpecs in runLayout:
        for runNumber, lumiList in runSpecs:
            inputFile.addRun(Run(runNumber, *lumiList))
        inputFile.setLocation("somese.cern.ch")

    testFileset = Fileset(name='Fileset')
    for inputFile in (fileA, fileB, fileC):
        testFileset.addFile(inputFile)

    testSubscription = Subscription(fileset=testFileset,
                                    workflow=self.testWorkflow,
                                    split_algo="EventAwareLumiBased",
                                    type="Processing")
    jobFactory = splitter(package="WMCore.DataStructs",
                          subscription=testSubscription)

    # Use a lumi-mask = {1: [[10,14]], 2: [[20,21]], 4: [[40,41]]}
    splitArgs = dict(halt_job_on_file_boundaries=False,
                     splitOnRun=False,
                     events_per_job=850,
                     runs=['1', '2', '4'],
                     lumis=['10,14', '20,21', '40,41'],
                     performance=self.performanceParams)
    jobGroups = jobFactory(**splitArgs)

    self.assertEqual(len(jobGroups), 1, "There should be only one job group")
    jobs = jobGroups[0].jobs
    self.assertEqual(len(jobs), 2, "Two jobs must be in the jobgroup")

    firstMask = jobs[0]['mask'].getRunAndLumis()
    self.assertEqual(firstMask, {1: [[10, 14]], 2: [[20, 21]], 4: [[40, 40]]})
    secondMask = jobs[1]['mask'].getRunAndLumis()
    self.assertEqual(secondMask, {4: [[41, 41]]})
def testG_LumiMask(self):
    """
    _testG_LumiMask_

    Verify that the EventAwareLumiBased splitter honours a lumi-mask
    given through the "runs" and "lumis" arguments.
    """
    splitter = SplitterFactory()

    # Create 3 files with 100 events per lumi:
    # - file1 with 1 run  of 8 lumis
    # - file2 with 2 runs of 2 lumis each
    # - file3 with 1 run  of 5 lumis
    fileA = File(lfn = "/this/is/file1", size = 1000, events = 800)
    fileB = File(lfn = "/this/is/file2", size = 1000, events = 400)
    fileC = File(lfn = "/this/is/file3", size = 1000, events = 500)

    lumiListA = [10 + offset for offset in range(8)]
    fileA.addRun(Run(1, *lumiListA))
    fileA.setLocation("somese.cern.ch")

    lumiListB1 = [20 + offset for offset in range(2)]
    lumiListB2 = [30 + offset for offset in range(2)]
    fileB.addRun(Run(2, *lumiListB1))
    fileB.addRun(Run(3, *lumiListB2))
    fileB.setLocation("somese.cern.ch")

    lumiListC = [40 + offset for offset in range(5)]
    fileC.addRun(Run(4, *lumiListC))
    fileC.setLocation("somese.cern.ch")

    testFileset = Fileset(name = 'Fileset')
    testFileset.addFile(fileA)
    testFileset.addFile(fileB)
    testFileset.addFile(fileC)

    testSubscription = Subscription(fileset = testFileset,
                                    workflow = self.testWorkflow,
                                    split_algo = "EventAwareLumiBased",
                                    type = "Processing")
    jobFactory = splitter(package = "WMCore.DataStructs",
                          subscription = testSubscription)

    # Use a lumi-mask = {1: [[10,14]], 2: [[20,21]], 4: [[40,41]]}
    jobGroups = jobFactory(halt_job_on_file_boundaries = False,
                           splitOnRun = False,
                           events_per_job = 850,
                           runs = ['1', '2', '4'],
                           lumis = ['10,14', '20,21', '40,41'],
                           performance = self.performanceParams)

    self.assertEqual(len(jobGroups), 1, "There should be only one job group")
    jobs = jobGroups[0].jobs
    self.assertEqual(len(jobs), 2, "Two jobs must be in the jobgroup")

    expectedFirst = {1: [[10, 14]], 2: [[20, 21]], 4: [[40, 40]]}
    expectedSecond = {4: [[41, 41]]}
    self.assertEqual(jobs[0]['mask'].getRunAndLumis(), expectedFirst)
    self.assertEqual(jobs[1]['mask'].getRunAndLumis(), expectedSecond)
def setUp(self):
    """
    _setUp_

    Initial setup for the Job testcase: generate 999 input files with
    random LFN components, size, run and lumi, and wrap them in a Job.
    """
    self.inputFiles = []

    for _ in range(1, 1000):
        # The random draws happen in a fixed order: the two LFN
        # components, then size, run and lumi.
        firstDir = random.randint(1000, 9999)
        secondDir = random.randint(1000, 9999)
        fileSize = random.randint(1000, 2000)
        runNumber = random.randint(0, 2000)
        lumiNumber = random.randint(0, 8)

        inputFile = File(lfn = "/store/data/%s/%s/file.root" % (firstDir, secondDir),
                         size = fileSize,
                         events = 1000,
                         checksums = {"cksum": "1"})
        inputFile.addRun(Run(runNumber, lumiNumber))
        self.inputFiles.append(inputFile)

    self.dummyJob = Job(files = self.inputFiles)
    return
def execute(self, *args, **kwargs):
    """
    Build a fileset out of the files listed by the user in the task.

    Raises TaskWorkerException when no files were given or when the
    splitting algorithm is not 'FileBased'. Every user file becomes a
    fake one-event File in run 1 whose lumi is its position in the list;
    when tm_totalunits is positive only that many files are used.
    Returns a Result wrapping the task and the fileset.
    """
    task = kwargs['task']
    self.logger.info("Data discovery and splitting for %s using user-provided files" % task['tm_taskname'])

    userfiles = task['tm_user_files']
    splitting = task['tm_split_algo']
    total_units = task['tm_totalunits']

    wrongSplitting = splitting != 'FileBased'
    if wrongSplitting or not userfiles:
        # The splitting complaint takes precedence when both problems occur.
        if wrongSplitting:
            msg = "Data.splitting must be set to 'FileBased' when using a custom set of files."
        else:
            msg = "No files specified to process for task %s." % task['tm_taskname']
        raise TaskWorkerException(msg)

    if hasattr(self.config.Sites, 'available'):
        locations = self.config.Sites.available
    else:
        with self.config.TaskWorker.envForCMSWEB:
            configDict = {"cacheduration": 1, "pycurl": True}  # cache duration is in hours
            resourceCatalog = CRIC(logger=self.logger, configDict=configDict)
            locations = resourceCatalog.getAllPSNs()

    userFileset = Fileset(name = task['tm_taskname'])
    self.logger.info("There are %d files specified by the user." % len(userfiles))
    if total_units > 0:
        self.logger.info("Will run over the first %d files." % total_units)

    for idx, userfile in enumerate(userfiles):
        fakeFile = File(userfile, size = 1000, events = 1)
        fakeFile.setLocation(locations)
        fakeFile.addRun(Run(1, idx))
        fakeFile["block"] = 'UserFilesFakeBlock'
        fakeFile["first_event"] = 1
        fakeFile["last_event"] = 2
        userFileset.addFile(fakeFile)
        # idx + 1 is the number of files added so far.
        if total_units > 0 and idx + 1 >= total_units:
            break

    return Result(task = task, result = userFileset)
def getChunkFiles(self, collectionName, filesetName, chunkOffset, chunkSize=100):
    """
    _getChunkFiles_

    Return one chunk of the given collection/fileset as a list of File
    objects. The file information comes from _getFilesetInfo and is
    passed through mergeFakeFiles before the File objects are built.
    """
    rawInfos = self._getFilesetInfo(collectionName, filesetName, chunkOffset, chunkSize)
    mergedInfos = mergeFakeFiles(rawInfos)

    chunkFiles = []
    for info in mergedInfos:
        chunkFile = File(lfn=info["lfn"],
                         size=info["size"],
                         events=info["events"],
                         parents=set(info["parents"]),
                         locations=set(info["locations"]),
                         merged=info["merged"])
        for runInfo in info["runs"]:
            runObj = Run(runInfo["run_number"])
            runObj.extend(runInfo["lumis"])
            chunkFile.addRun(runObj)
        chunkFiles.append(chunkFile)

    return chunkFiles
def testAddRun(self):
    """
    Check that File.addRun() stores the given Run in the file's 'runs'.
    """
    fileArgs = dict(lfn = "lfn",
                    size = "1024",
                    events = "100",
                    checksums = "1",
                    parents = "parent")
    runNumber = 1000000
    lumi = 1

    testFile = File(**fileArgs)
    testRun = Run(runNumber, lumi)
    testFile.addRun(testRun)

    assert testRun in testFile['runs'], "Run not added properly to run in File.addRun()"

    return
def returnDataStructsFile(self):
    """
    _returnDataStructsFile_

    Build a DataStructs file (WMFile) carrying this file's LFN, size,
    events, checksums, merged flag, runs and locations. Each entry of
    self["parents"] is converted to a WMFile as well.
    """
    parentFiles = set(
        WMFile(lfn = parentInfo['lfn'],
               size = parentInfo['size'],
               events = parentInfo['events'],
               checksums = parentInfo['checksums'],
               parents = parentInfo['parents'],
               merged = parentInfo['merged'])
        for parentInfo in self["parents"]
    )

    dsFile = WMFile(lfn = self['lfn'],
                    size = self['size'],
                    events = self['events'],
                    checksums = self['checksums'],
                    parents = parentFiles,
                    merged = self['merged'])

    for runObj in self['runs']:
        dsFile.addRun(runObj)

    for site in self['locations']:
        dsFile.setLocation(se = site)

    return dsFile
def getChunkFiles(self, collectionName, filesetName, chunkOffset, chunkSize=100,
                  user="******", group="cmsdataops"):
    """
    _getChunkFiles_

    Return one chunk of the given collection/fileset as a list of File
    objects, read from the "owner_coll_fileset_files" view of the "ACDC"
    design document. chunkOffset and chunkSize are passed to the view as
    "skip" and "limit".
    """
    keyPrefix = [group, user, collectionName, filesetName]
    viewOptions = {
        "startkey": keyPrefix,
        "endkey": keyPrefix + [{}],
        "limit": chunkSize,
        "skip": chunkOffset,
    }
    result = self.couchdb.loadView("ACDC", "owner_coll_fileset_files", viewOptions, [])

    chunkFiles = []
    for row in result["rows"]:
        fileInfo = row['value']
        chunkFile = File(lfn=fileInfo["lfn"],
                         size=fileInfo["size"],
                         events=fileInfo["events"],
                         parents=set(fileInfo["parents"]),
                         locations=set(fileInfo["locations"]),
                         merged=fileInfo["merged"])
        for runInfo in fileInfo["runs"]:
            runObj = Run(runInfo["run_number"])
            runObj.extend(runInfo["lumis"])
            chunkFile.addRun(runObj)
        chunkFiles.append(chunkFile)

    return chunkFiles
def createSubscription(self, nFiles, lumisPerFile, twoSites = False):
    """
    _createSubscription_

    Build a LumiBased "Processing" subscription over a fresh fileset of
    nFiles files at 'blenheim', each with lumisPerFile lumis in its own
    run. With twoSites set, nFiles more files with different lumis are
    added at 'malpaquet'.
    """
    baseName = makeUUID()
    testFileset = Fileset(name = baseName)

    for fileIdx in range(nFiles):
        siteFile = File(lfn = '%s_%i' % (baseName, fileIdx), size = 1000, events = 100)
        fileLumis = [(fileIdx * 100) + offset for offset in range(lumisPerFile)]
        siteFile.addRun(Run(fileIdx, *fileLumis))
        siteFile.setLocation('blenheim')
        testFileset.addFile(siteFile)

    if twoSites:
        for fileIdx in range(nFiles):
            siteFile = File(lfn = '%s_%i_2' % (baseName, fileIdx), size = 1000, events = 100)
            # lumis should be different from the first batch
            fileLumis = [5 + 10 * (fileIdx * 100) + offset for offset in range(lumisPerFile)]
            siteFile.addRun(Run(fileIdx, *fileLumis))
            siteFile.setLocation('malpaquet')
            testFileset.addFile(siteFile)

    return Subscription(fileset = testFileset,
                        workflow = self.testWorkflow,
                        split_algo = "LumiBased",
                        type = "Processing")
def createSubscription(self, nFiles, lumisPerFile, twoSites=False):
    """
    _createSubscription_

    Return a LumiBased "Processing" subscription for testing. Its fileset
    holds nFiles files located at 'blenheim' and, when twoSites is true,
    another nFiles files located at 'malpaquet' with a different set of
    lumis. File number i always belongs to run i.
    """
    baseName = makeUUID()
    testFileset = Fileset(name=baseName)

    lumiOffsets = range(lumisPerFile)

    for index in range(nFiles):
        firstSiteFile = File(lfn='%s_%i' % (baseName, index), size=1000, events=100)
        lumis = []
        for offset in lumiOffsets:
            lumis.append((index * 100) + offset)
        firstSiteFile.addRun(Run(index, *lumis))
        firstSiteFile.setLocation('blenheim')
        testFileset.addFile(firstSiteFile)

    if not twoSites:
        return Subscription(fileset=testFileset,
                            workflow=self.testWorkflow,
                            split_algo="LumiBased",
                            type="Processing")

    for index in range(nFiles):
        secondSiteFile = File(lfn='%s_%i_2' % (baseName, index), size=1000, events=100)
        lumis = []
        for offset in lumiOffsets:
            # lumis should be different
            lumis.append(5 + 10 * (index * 100) + offset)
        secondSiteFile.addRun(Run(index, *lumis))
        secondSiteFile.setLocation('malpaquet')
        testFileset.addFile(secondSiteFile)

    return Subscription(fileset=testFileset,
                        workflow=self.testWorkflow,
                        split_algo="LumiBased",
                        type="Processing")
def generateFakeMCFile(self, numEvents=100, firstEvent=1, lastEvent=100,
                       firstLumi=1, lastLumi=10, existingSub=None):
    """
    Create the fake MC input file and attach it to a subscription.

    The file is added to existingSub's fileset when a subscription is
    given; otherwise a new EventBased "Production" subscription is built
    around a fileset containing only this file. The subscription is
    returned in both cases.
    """
    # MC comes with only one MCFakeFile
    fakeFile = File("MCFakeFileTest", size=1000, events=numEvents)
    fakeFile.setLocation('se01')

    # lastLumi is only included when it is also the first lumi.
    stopLumi = lastLumi + 1 if firstLumi == lastLumi else lastLumi
    fakeFile.addRun(Run(1, *range(firstLumi, stopLumi)))

    fakeFile["first_event"] = firstEvent
    fakeFile["last_event"] = lastEvent

    if existingSub is not None:
        existingSub['fileset'].addFile(fakeFile)
        return existingSub

    singleMCFileset = Fileset(name="MCTestFileset")
    singleMCFileset.addFile(fakeFile)
    testWorkflow = Workflow()
    existingSub = Subscription(fileset=singleMCFileset,
                               workflow=testWorkflow,
                               split_algo="EventBased",
                               type="Production")
    return existingSub
def formatOutput(self, task, requestname, datasetfiles, locations):
    """Receives as input the result of the data location discovery operations
    and fill up the WMCore objects.

    For every LFN in datasetfiles a File is built with its events, size and
    checksums, tagged with its block and the request name, given the CMS
    names of the storage elements hosting its block (translated through
    SiteDB and cached per SE), and given one Run per entry of its 'Lumis'.

    NOTE(review): the visible code accumulates wmfiles, lumicounter and
    evecounter but never uses or returns them - confirm whether the tail
    of this method is missing.
    """
    self.logger.debug(" Formatting data discovery output ")
    # TEMPORARY
    secmsmap = {}
    sbj = SiteDBJSON({"key": self.config.MyProxy.serverhostkey,
                      "cert": self.config.MyProxy.serverhostcert})

    wmfiles = []
    lumicounter = evecounter = 0
    for lfn, infos in datasetfiles.items():
        wmfile = File(lfn=lfn, events=infos['NumberOfEvents'],
                      size=infos['Size'], checksums=infos['Checksums'])
        wmfile['block'] = infos['BlockName']
        wmfile['locations'] = []
        if infos['BlockName'] in locations:
            for se in locations[infos['BlockName']]:
                if se not in secmsmap:
                    self.logger.debug("Translating SE %s" % se)
                    try:
                        secmsmap[se] = sbj.seToCMSName(se)
                    except KeyError:
                        self.logger.error("Impossible translating %s to a CMS name through SiteDB" % se)
                        # Cache the failure so the lookup is not retried.
                        secmsmap[se] = ''
                # A translation may be a single name or a list of names.
                if isinstance(secmsmap[se], list):
                    wmfile['locations'].extend(secmsmap[se])
                else:
                    wmfile['locations'].append(secmsmap[se])
        wmfile['workflow'] = requestname
        evecounter += infos['NumberOfEvents']
        for run, lumis in infos['Lumis'].items():
            #self.logger.debug(' - adding run %d and lumis %s' %(run, lumis))
            wmfile.addRun(Run(run, *lumis))
            lumicounter += len(lumis)
        wmfiles.append(wmfile)
def testD_NoFileSplitNoHardLimit(self):
    """
    _testD_NoFileSplitNoHardLimit_

    In this case we don't split on file boundaries, check different combination of files
    make sure we make the most of the splitting, e.g. include many zero event files in
    a single job.

    Three scenarios are exercised with the EventAwareLumiBased algorithm:
    100 zero-event files, seven files with different lumi/event
    distributions, and a two-file fileset split with splitOnRun = True.
    """
    splitter = SplitterFactory()

    #Create 100 files with 7 lumi per file and 0 events per lumi on average.
    testSubscription = self.createSubscription(nFiles = 100, lumisPerFile = 7, twoSites = False,
                                               nEventsPerFile = 0)
    jobFactory = splitter(package = "WMCore.DataStructs",
                          subscription = testSubscription)

    #First test, the optimal settings are 360 events per job
    #As we have files with 0 events per lumi, this will configure the splitting to
    #a single job containing all files
    jobGroups = jobFactory(halt_job_on_file_boundaries = False,
                           splitOnRun = False,
                           events_per_job = 360)
    self.assertEqual(len(jobGroups), 1, "There should be only one job group")
    jobs = jobGroups[0].jobs
    self.assertEqual(len(jobs), 1, "There should be 1 job")
    self.assertEqual(len(jobs[0]['input_files']), 100, "All 100 files must be in the job")

    #Create 7 files, each one with different lumi/event distributions
    testFileset = Fileset(name = "FilesetA")
    testFileA = self.createFile("/this/is/file1", 250, 0, 5, "blenheim")
    testFileB = self.createFile("/this/is/file2", 600, 1, 1, "blenheim")
    testFileC = self.createFile("/this/is/file3", 1200, 2, 2, "blenheim")
    testFileD = self.createFile("/this/is/file4", 100, 3, 1, "blenheim")
    testFileE = self.createFile("/this/is/file5", 30, 4, 1, "blenheim")
    testFileF = self.createFile("/this/is/file6", 10, 5, 1, "blenheim")
    testFileG = self.createFile("/this/is/file7", 151, 6, 3, "blenheim")
    testFileset.addFile(testFileA)
    testFileset.addFile(testFileB)
    testFileset.addFile(testFileC)
    testFileset.addFile(testFileD)
    testFileset.addFile(testFileE)
    testFileset.addFile(testFileF)
    testFileset.addFile(testFileG)

    testSubscription = Subscription(fileset = testFileset,
                                    workflow = self.testWorkflow,
                                    split_algo = "EventAwareLumiBased",
                                    type = "Processing")
    jobFactory = splitter(package = "WMCore.DataStructs",
                          subscription = testSubscription)

    #Optimal settings are: jobs with 150 events per job
    #This means, the first file must be splitted in 3 lumis per job which would leave room
    #for another lumi in the second job, but the second file has a lumi too big for that
    #The 3rd job only contains the second file, the fourth and fifth job split the third file
    jobGroups = jobFactory(halt_job_on_file_boundaries = False,
                           splitOnRun = False,
                           events_per_job = 150)

    self.assertEqual(len(jobGroups), 1, "There should be only one job group")
    jobs = jobGroups[0].jobs
    self.assertEqual(len(jobs), 8, "Eight jobs must be in the jobgroup")
    # Plain integer literals compare equal to the long values used on
    # Python 2 and are the only form accepted by Python 3.
    self.assertEqual(jobs[0]["mask"].getRunAndLumis(), {0 : [[0, 2]]},
                     "Wrong mask for the first job")
    self.assertEqual(jobs[1]["mask"].getRunAndLumis(), {0 : [[3, 4]]},
                     "Wrong mask for the second job")
    self.assertEqual(jobs[2]["mask"].getRunAndLumis(), {1 : [[1, 1]]},
                     "Wrong mask for the third job")
    self.assertEqual(jobs[3]["mask"].getRunAndLumis(), {2 : [[4, 4]]},
                     "Wrong mask for the fourth job")
    self.assertEqual(jobs[4]["mask"].getRunAndLumis(), {2 : [[5, 5]]},
                     "Wrong mask for the fifth job")
    self.assertEqual(jobs[5]["mask"].getRunAndLumis(),
                     {3 : [[3, 3]], 4 : [[4, 4]], 5 : [[5, 5]]},
                     "Wrong mask for the sixth job")
    self.assertEqual(jobs[6]["mask"].getRunAndLumis(), {6 : [[18, 19]]},
                     "Wrong mask for the seventh job")
    self.assertEqual(jobs[7]["mask"].getRunAndLumis(), {6 : [[20, 20]]},
                     "Wrong mask for the eighth job")

    #Test interactions of this algorithm with splitOnRun = True
    #Make 2 files, one with 3 runs and a second one with the last run of the first
    fileA = File(lfn = "/this/is/file1", size = 1000, events = 2400)
    # The same eight lumis (1-8) are used for each of the three runs.
    lumiListA = [1 + lumi for lumi in range(8)]
    fileA.addRun(Run(1, *lumiListA))
    fileA.addRun(Run(2, *lumiListA))
    fileA.addRun(Run(3, *lumiListA))
    fileA.setLocation("malpaquet")

    fileB = self.createFile('/this/is/file2', 200, 3, 5, "malpaquet")

    testFileset = Fileset(name = 'FilesetB')
    testFileset.addFile(fileA)
    testFileset.addFile(fileB)
    testSubscription = Subscription(fileset = testFileset,
                                    workflow = self.testWorkflow,
                                    split_algo = "EventAwareLumiBased",
                                    type = "Processing")

    jobFactory = splitter(package = "WMCore.DataStructs",
                          subscription = testSubscription)
    #The settings for this splitting are 700 events per job
    jobGroups = jobFactory(splitOnRun = True,
                           halt_job_on_file_boundaries = False,
                           events_per_job = 700)
    self.assertEqual(len(jobGroups), 1, "There should be only one job group")
    jobs = jobGroups[0].jobs
    self.assertEqual(len(jobs), 6, "Six jobs must be in the jobgroup")
def testRunWhiteList(self):
    """
    _testRunWhiteList_

    Check that a run white list restricts the runs appearing in the job
    masks produced by the EventAwareLumiByWork splitter, with no breaks,
    with a break on runs and with a break on files.
    """
    splitter = SplitterFactory()

    # Create 3 files with 100 events per lumi:
    # - file1 with 1 run  of 8 lumis
    # - file2 with 2 runs of 2 lumis each
    # - file3 with 1 run  of 5 lumis
    fileA = File(lfn="/this/is/file1", size=1000, events=800)
    fileB = File(lfn="/this/is/file2", size=1000, events=400)
    fileC = File(lfn="/this/is/file3", size=1000, events=500)

    fileA.addRun(Run(1, *[10 + n for n in range(8)]))
    fileA.setLocation("somese.cern.ch")

    firstRunLumis = [20 + n for n in range(2)]
    secondRunLumis = [30 + n for n in range(2)]
    fileB.addRun(Run(2, *firstRunLumis))
    fileB.addRun(Run(3, *secondRunLumis))
    fileB.setLocation("somese.cern.ch")

    fileC.addRun(Run(4, *[40 + n for n in range(5)]))
    fileC.setLocation("somese.cern.ch")

    testFileset = Fileset(name='Fileset')
    for inputFile in (fileA, fileB, fileC):
        testFileset.addFile(inputFile)

    testSubscription = Subscription(fileset=testFileset,
                                    workflow=self.testWorkflow,
                                    split_algo="EventAwareLumiByWork",
                                    type="Processing")
    jobFactory = splitter(package="WMCore.DataStructs",
                          subscription=testSubscription)

    # Split with no breaks
    whitelist = [1, 4]
    jobGroups = jobFactory(halt_job_on_file_boundaries=False,
                           splitOnRun=False,
                           events_per_job=725,
                           runWhitelist=whitelist,
                           performance=self.performanceParams)
    self.assertEqual(len(jobGroups), 1)
    jobs = jobGroups[0].jobs
    self.assertEqual(len(jobs), 2)
    for job in jobs:
        for run in job['mask'].getRunAndLumis().keys():
            self.assertIn(run, [1, 4])

    # Re-split with a break on runs
    whitelist = [1, 3, 4]
    jobGroups = jobFactory(halt_job_on_file_boundaries=False,
                           splitOnRun=True,
                           events_per_job=595,
                           runWhitelist=whitelist,
                           performance=self.performanceParams)
    self.assertEqual(len(jobGroups), 1)
    jobs = jobGroups[0].jobs
    self.assertEqual(len(jobs), 4)
    self.enforceLimits(jobs=jobs, runsPerJob=1)
    for job in jobs:
        for run in job['mask'].getRunAndLumis().keys():
            self.assertIn(run, [1, 3, 4])

    # Re-split with a break on files
    whitelist = [1, 2, 3]
    jobGroups = jobFactory(halt_job_on_file_boundaries=True,
                           splitOnRun=False,
                           events_per_job=595,
                           runWhitelist=whitelist,
                           performance=self.performanceParams)
    self.assertEqual(len(jobGroups), 1)
    jobs = jobGroups[0].jobs
    self.assertEqual(len(jobs), 3)
    self.enforceLimits(jobs=jobs, filesPerJob=1)
    for job in jobs:
        for run in job['mask'].getRunAndLumis().keys():
            self.assertIn(run, [1, 2, 3])
def formatOutput(self, task, requestname, datasetfiles, locations, tempDir):
    """
    Receives as input the result of the data location discovery operations
    and fill up the WMCore objects.

    Files are skipped when they are flagged as not valid, when their block
    is missing from locations or has no locations, or when parents are
    requested (task['tm_use_parent'] == 1) and the file has none. For the
    remaining files a File object is built, its block locations are
    translated with resourceCatalog.PNNstoPSNs (any failure is logged and
    re-raised) and its runs and lumis are added.

    Two JSON files are written into tempDir: the compact lumi list of the
    input dataset ("input_dataset_lumis.json") and the compact list of its
    duplicate lumis ("input_dataset_duplicate_lumis.json").

    Returns a Result holding the task and a 'FilesToSplit' Fileset with
    the files that were kept.
    """
    self.logger.debug(" Formatting data discovery output ")

    wmfiles = []
    event_counter = 0
    lumi_counter = 0
    uniquelumis = set()
    datasetLumis = {}
    blocksWithNoLocations = set()
    ## Loop over the sorted list of files.
    configDict = {"cacheduration": 1, "pycurl": True}  # cache duration is in hours
    with tempSetLogLevel(logger=self.logger, level=logging.ERROR):
        resourceCatalog = CRIC(logger=self.logger, configDict=configDict)
    # can't affort one message from CRIC per file, unless critical !
    with tempSetLogLevel(logger=self.logger, level=logging.ERROR):
        for lfn, infos in datasetfiles.iteritems():
            ## Skip the file if it is not in VALID state.
            if not infos.get('ValidFile', True):
                self.logger.warning("Skipping invalid file %s", lfn)
                continue
            ## Skip the file if the block has not been found or has no locations.
            if not infos['BlockName'] in locations or not locations[infos['BlockName']]:
                self.logger.warning("Skipping %s because its block (%s) has no locations",
                                    lfn, infos['BlockName'])
                blocksWithNoLocations.add(infos['BlockName'])
                continue
            if task['tm_use_parent'] == 1 and len(infos['Parents']) == 0:
                # The placeholder needs the LFN; without it the literal
                # "%s" ends up in the log.
                self.logger.warning("Skipping %s because it has no parents", lfn)
                continue
            ## Create a WMCore File object.
            size = infos['FileSize']
            checksums = {'Checksum': infos['Checksum'],
                         'Adler32': infos['Adler32'],
                         'Md5': infos['Md5']}
            wmfile = File(lfn=lfn, events=infos['NumberOfEvents'], size=size,
                          checksums=checksums, parents=infos['Parents'])
            wmfile['block'] = infos['BlockName']
            try:
                wmfile['locations'] = resourceCatalog.PNNstoPSNs(locations[wmfile['block']])
            except Exception as ex:
                self.logger.error("Impossible translating %s to a CMS name through CMS Resource Catalog",
                                  locations[wmfile['block']])
                self.logger.error("got this exception:\n %s", ex)
                raise
            wmfile['workflow'] = requestname
            event_counter += infos['NumberOfEvents']
            for run, lumis in infos['Lumis'].iteritems():
                datasetLumis.setdefault(run, []).extend(lumis)
                wmfile.addRun(Run(run, *lumis))
                for lumi in lumis:
                    uniquelumis.add((run, lumi))
                lumi_counter += len(lumis)
            wmfiles.append(wmfile)

    if blocksWithNoLocations:
        msg = "%d blocks will be skipped because are not completely replicated on DISK: %s" % (
            len(blocksWithNoLocations), list(blocksWithNoLocations))
        self.logger.warning(msg)
        self.uploadWarning(msg, task['user_proxy'], task['tm_taskname'])

    # From here on only the number of distinct (run, lumi) pairs matters.
    uniquelumis = len(uniquelumis)
    self.logger.debug('Tot events found: %d', event_counter)
    self.logger.debug('Tot lumis found: %d', uniquelumis)
    self.logger.debug('Duplicate lumis found: %d', (lumi_counter - uniquelumis))
    self.logger.debug('Tot files found: %d', len(wmfiles))

    self.logger.debug("Starting to create compact lumilists for input dataset")
    datasetLumiList = LumiList(runsAndLumis=datasetLumis)
    datasetLumis = datasetLumiList.getCompactList()
    datasetDuplicateLumis = datasetLumiList.getDuplicates().getCompactList()
    self.logger.debug("Finished to create compact lumilists for input dataset")
    with open(os.path.join(tempDir, "input_dataset_lumis.json"), "w") as fd:
        json.dump(datasetLumis, fd)
    with open(os.path.join(tempDir, "input_dataset_duplicate_lumis.json"), "w") as fd:
        json.dump(datasetDuplicateLumis, fd)

    return Result(task=task, result=Fileset(name='FilesToSplit', files=set(wmfiles)))
def testD_NoFileSplitNoHardLimit(self):
    """
    _testD_NoFileSplitNoHardLimit_

    In this case we don't split on file boundaries. Check different
    combinations of files to make sure we make the most of the splitting,
    e.g. include many zero event files in a single job.

    Three scenarios are exercised, all with halt_job_on_file_boundaries=False:
      1. 100 files with no events and events_per_job=360 -> one job with
         all 100 files.
      2. 7 files with different lumi/event distributions and
         events_per_job=150 -> eight jobs with the masks listed below.
      3. splitOnRun=True on a multi-run file plus a second file and
         events_per_job=700 -> six jobs.
    """
    splitter = SplitterFactory()

    # Create 100 files with 7 lumis per file and 0 events per lumi on average.
    testSubscription = self.createSubscription(nFiles=100, lumisPerFile=7, twoSites=False,
                                               nEventsPerFile=0)
    jobFactory = splitter(package="WMCore.DataStructs",
                          subscription=testSubscription)

    # First test, the optimal settings are 360 events per job.
    # As we have files with 0 events per lumi, this will configure the splitting to
    # a single job containing all files.
    jobGroups = jobFactory(halt_job_on_file_boundaries=False,
                           splitOnRun=False,
                           events_per_job=360)
    self.assertEqual(len(jobGroups), 1, "There should be only one job group")
    jobs = jobGroups[0].jobs
    self.assertEqual(len(jobs), 1, "There should be 1 job")
    self.assertEqual(len(jobs[0]['input_files']), 100, "All 100 files must be in the job")

    # Create 7 files, each one with different lumi/event distributions.
    # NOTE(review): the positional arguments are passed straight to
    # self.createFile; judging by the expected masks below they look like
    # (lfn, events, run, lumis, location) - confirm against createFile.
    testFileset = Fileset(name="FilesetA")
    for fileArgs in (("/this/is/file1", 250, 0, 5, "blenheim"),
                     ("/this/is/file2", 600, 1, 1, "blenheim"),
                     ("/this/is/file3", 1200, 2, 2, "blenheim"),
                     ("/this/is/file4", 100, 3, 1, "blenheim"),
                     ("/this/is/file5", 30, 4, 1, "blenheim"),
                     ("/this/is/file6", 10, 5, 1, "blenheim"),
                     ("/this/is/file7", 151, 6, 3, "blenheim")):
        testFileset.addFile(self.createFile(*fileArgs))

    testSubscription = Subscription(fileset=testFileset,
                                    workflow=self.testWorkflow,
                                    split_algo="EventAwareLumiBased",
                                    type="Processing")
    jobFactory = splitter(package="WMCore.DataStructs",
                          subscription=testSubscription)

    # Optimal settings are: jobs with 150 events per job.
    # This means the first file must be split in 3 lumis per job, which would leave room
    # for another lumi in the second job, but the second file has a lumi too big for that.
    # The 3rd job only contains the second file, the fourth and fifth job split the third file.
    jobGroups = jobFactory(halt_job_on_file_boundaries=False,
                           splitOnRun=False,
                           events_per_job=150)
    self.assertEqual(len(jobGroups), 1, "There should be only one job group")
    jobs = jobGroups[0].jobs
    self.assertEqual(len(jobs), 8, "Eight jobs must be in the jobgroup")

    # Plain ints are used instead of long literals: they compare (and hash)
    # equal to longs, so the expectations are unchanged.
    expectedMasks = [
        ("first", {0: [[0, 2]]}),
        ("second", {0: [[3, 4]]}),
        ("third", {1: [[1, 1]]}),
        ("fourth", {2: [[4, 4]]}),
        ("fifth", {2: [[5, 5]]}),
        ("sixth", {3: [[3, 3]], 4: [[4, 4]], 5: [[5, 5]]}),
        ("seventh", {6: [[18, 19]]}),
        ("eighth", {6: [[20, 20]]}),
    ]
    for job, (ordinal, expectedMask) in zip(jobs, expectedMasks):
        self.assertEqual(job["mask"].getRunAndLumis(), expectedMask,
                         "Wrong mask for the %s job" % ordinal)

    # Test interactions of this algorithm with splitOnRun = True.
    # Make 2 files, one with 3 runs and a second one with the last run of the first.
    fileA = File(lfn="/this/is/file1", size=1000, events=2400)
    lumiListA = list(range(1, 9))
    fileA.addRun(Run(1, *lumiListA))
    fileA.addRun(Run(2, *lumiListA))
    fileA.addRun(Run(3, *lumiListA))
    fileA.setLocation("malpaquet")

    fileB = self.createFile('/this/is/file2', 200, 3, 5, "malpaquet")

    testFileset = Fileset(name='FilesetB')
    testFileset.addFile(fileA)
    testFileset.addFile(fileB)
    testSubscription = Subscription(fileset=testFileset,
                                    workflow=self.testWorkflow,
                                    split_algo="EventAwareLumiBased",
                                    type="Processing")
    jobFactory = splitter(package="WMCore.DataStructs",
                          subscription=testSubscription)

    # The settings for this splitting are 700 events per job.
    jobGroups = jobFactory(splitOnRun=True,
                           halt_job_on_file_boundaries=False,
                           events_per_job=700)
    self.assertEqual(len(jobGroups), 1, "There should be only one job group")
    jobs = jobGroups[0].jobs
    self.assertEqual(len(jobs), 6, "Six jobs must be in the jobgroup")
def formatOutput(self, task, requestname, datasetfiles, locations, tempDir):
    """
    Receives as input the result of the data location
    discovery operations and fills up the WMCore objects.

    :arg task: task description; 'tm_use_parent' is read from it and the
               whole object is handed back inside the Result.
    :arg requestname: value stored under the 'workflow' key of every file.
    :arg datasetfiles: mapping of LFN -> file information (block name,
                       size, checksums, parents, events, lumis).
    :arg locations: mapping of block name -> list of PNNs hosting the block.
    :arg tempDir: directory where input_dataset_lumis.json and
                  input_dataset_duplicate_lumis.json are written.
    :return: a Result whose result is the 'FilesToSplit' Fileset.
    :raises TaskWorkerException: if parents are requested for the task but
                                 a file has none.
    """
    self.logger.debug(" Formatting data discovery output ")
    # TEMPORARY
    pnn_psn_map = {}
    sbj = SiteDBJSON({"key": self.config.TaskWorker.cmskey,
                      "cert": self.config.TaskWorker.cmscert})

    wmfiles = []
    event_counter = 0
    lumi_counter = 0
    uniquelumis = set()
    datasetLumis = {}
    ## Loop over the files.
    for lfn, infos in datasetfiles.iteritems():
        blockName = infos['BlockName']
        ## Skip the file if the block has not been found or has no locations.
        if blockName not in locations or not locations[blockName]:
            self.logger.warning("Skipping %s because its block (%s) has no locations",
                                lfn, blockName)
            continue
        ## Skip the file if it is not in VALID state.
        if not infos.get('ValidFile', True):
            self.logger.warning("Skipping invalid file %s", lfn)
            continue

        if task['tm_use_parent'] == 1 and not infos['Parents']:
            raise TaskWorkerException(
                "The CRAB3 server backend refuses to submit jobs to the Grid scheduler\n" +
                "because you specified useParents=True but some of your files have no " +
                "parents.\nExample: " + lfn)

        ## Create a WMCore File object.
        try:
            size = infos['FileSize']
            checksums = {'Checksum': infos['Checksum'],
                         'Adler32': infos['Adler32'],
                         'Md5': infos['Md5']}
        except KeyError:
            # This is so that the task worker does not crash if an old version of WMCore
            # is used (the interface of an API suddenly changed). We may want to remove the
            # try/except and the following two lines eventually, but keeping them for the
            # moment so other devels won't be affected.
            # See this WMCore commit:
            # https://github.com/dmwm/WMCore/commit/2afc01ae571390f5fa009dd258be757adac89c28#diff-374b7a6640288184175057234e393e1cL204
            size = infos['Size']
            checksums = infos['Checksums']

        wmfile = File(lfn=lfn, events=infos['NumberOfEvents'], size=size,
                      checksums=checksums, parents=infos['Parents'])
        wmfile['block'] = blockName
        wmfile['locations'] = []
        for pnn in locations[blockName]:
            if not pnn:
                continue
            if pnn not in pnn_psn_map:
                self.logger.debug("Translating PNN %s", pnn)
                try:
                    pnn_psn_map[pnn] = sbj.PNNtoPSN(pnn)
                except KeyError:
                    self.logger.error("Impossible translating %s to a CMS name through SiteDB", pnn)
                    # Cache the failure so the translation is not retried for every file.
                    # NOTE(review): this makes '' end up in wmfile['locations'] below;
                    # confirm that consumers of the Fileset tolerate an empty site name.
                    pnn_psn_map[pnn] = ''
                except httplib.HTTPException as ex:
                    # Nothing is cached here, so the PNN is skipped for this file and
                    # the translation is attempted again for the next one.
                    self.logger.error("Couldn't map SE to site: %s", pnn)
                    self.logger.error("got problem: %s", ex)
                    self.logger.error("got another problem: %s", ex.__dict__)
            if pnn in pnn_psn_map:
                psn = pnn_psn_map[pnn]
                if isinstance(psn, list):
                    wmfile['locations'].extend(psn)
                else:
                    wmfile['locations'].append(psn)
        wmfile['workflow'] = requestname
        event_counter += infos['NumberOfEvents']
        for run, lumis in infos['Lumis'].iteritems():
            datasetLumis.setdefault(run, []).extend(lumis)
            wmfile.addRun(Run(run, *lumis))
            uniquelumis.update((run, lumi) for lumi in lumis)
            lumi_counter += len(lumis)
        wmfiles.append(wmfile)

    uniqueLumiCount = len(uniquelumis)
    self.logger.debug('Tot events found: %d', event_counter)
    self.logger.debug('Tot lumis found: %d', uniqueLumiCount)
    self.logger.debug('Duplicate lumis found: %d', (lumi_counter - uniqueLumiCount))
    self.logger.debug('Tot files found: %d', len(wmfiles))

    self.logger.debug("Starting to create compact lumilists for input dataset")
    datasetLumiList = LumiList(runsAndLumis=datasetLumis)
    datasetLumis = datasetLumiList.getCompactList()
    datasetDuplicateLumis = datasetLumiList.getDuplicates().getCompactList()
    self.logger.debug("Finished to create compact lumilists for input dataset")
    with open(os.path.join(tempDir, "input_dataset_lumis.json"), "w") as fd:
        json.dump(datasetLumis, fd)
    with open(os.path.join(tempDir, "input_dataset_duplicate_lumis.json"), "w") as fd:
        json.dump(datasetDuplicateLumis, fd)

    return Result(task=task, result=Fileset(name='FilesToSplit', files=set(wmfiles)))
def testProcessing(self):
    """
    _testProcessing_

    Build a processing job on top of the "DataProcessing" task, run the
    report emulator for its "cmsRun1" step and check that the input file,
    run/lumi and output file information in the resulting FWJR is sane.
    """
    cmsRunStep = self.workload.getTask("DataProcessing").getStep("cmsRun1")

    # One merged input file carrying two runs.
    inputFile = File(lfn = "/path/to/test/lfn", size = 1048576, events = 1000, merged = True)
    inputFile.addRun(Run(1, 1, 2, 3, 4, 5))
    inputFile.addRun(Run(2, 1, 2, 3, 4, 5, 6))

    processingJob = Job(name = "ProcessingJob", files = [inputFile])
    processingJob["task"] = "/Tier1ReReco/ReReco"
    processingJob["mask"].setMaxAndSkipEvents(500, 0)
    processingJob["id"] = 1
    processingJob["location"] = "cmssrm.fnal.gov"

    emulator = ReportEmu(WMStep = cmsRunStep.getTypeHelper(), Job = processingJob)
    report = emulator()

    # --- input file checks ---
    reportInputFiles = report.getInputFilesFromStep("cmsRun1")
    assert len(reportInputFiles) == 1, \
           "Error: Wrong number of input files for the job."
    reportedInput = reportInputFiles[0]
    assert reportedInput["lfn"] == inputFile["lfn"], \
           "Error: Input LFNs do not match: %s" % reportedInput["lfn"]
    assert reportedInput["size"] == inputFile["size"], \
           "Error: Input file sizes do not match."
    assert reportedInput["events"] == inputFile["events"], \
           "Error: Input file events do not match."

    # --- run/lumi checks ---
    # Every reported run that matches an expected run (same number, same
    # lumis once both are sorted) is crossed off; nothing may be left over.
    expectedRuns = [Run(1, 1, 2, 3, 4, 5), Run(2, 1, 2, 3, 4, 5, 6)]
    assert len(reportedInput["runs"]) == len(expectedRuns), \
           "Error: Wrong number of runs in input file."

    for reportedRun in reportedInput["runs"]:
        for expectedRun in expectedRuns:
            if reportedRun.run != expectedRun.run:
                continue
            expectedRun.lumis.sort()
            reportedRun.lumis.sort()
            if expectedRun.lumis == reportedRun.lumis:
                expectedRuns.remove(expectedRun)
                # Stop right after mutating the list being iterated.
                break

    assert len(expectedRuns) == 0, \
           "Error: Run information wrong on input file."

    # --- output file checks ---
    recoOutputFiles = report.getFilesFromOutputModule("cmsRun1", "outputRECORECO")
    alcaOutputFiles = report.getFilesFromOutputModule("cmsRun1", "outputALCARECOALCARECO")

    assert len(recoOutputFiles) == 1, \
           "Error: There should only be one RECO output file."
    assert len(alcaOutputFiles) == 1, \
           "Error: There should only be one ALCA output file."

    recoFile = recoOutputFiles[0]
    alcaFile = alcaOutputFiles[0]
    assert recoFile["module_label"] == "outputRECORECO", \
           "Error: RECO file has wrong output module."
    assert alcaFile["module_label"] == "outputALCARECOALCARECO", \
           "Error: ALCA file has wrong output module."

    self.verifyOutputMetaData(recoFile, processingJob)
    self.verifyOutputMetaData(alcaFile, processingJob)

    # Expected data tier for each output module label.
    dataTierMap = {"outputRECORECO": "RECO",
                   "outputALCARECOALCARECO": "ALCARECO"}
    for outputFile in (recoFile, alcaFile):
        datasetInfo = outputFile["dataset"]
        assert datasetInfo["applicationName"] == "cmsRun", \
               "Error: Application name is incorrect."
        assert datasetInfo["primaryDataset"] == self.primaryDataset, \
               "Error: Primary dataset is incorrect."
        assert datasetInfo["dataTier"] == dataTierMap[outputFile["module_label"]], \
               "Error: Data tier is incorrect."