def setUp(self):
    """
    Create a dummy fileset and populate it with random files,
    in order to use it for the testcase methods
    """
    logging.basicConfig(level=logging.DEBUG,
                        format='%(asctime)s %(name)-12s %(levelname)-8s %(message)s',
                        datefmt='%m-%d %H:%M',
                        filename=__file__.replace('.py', '.log'),
                        filemode='w')
    self.logger = logging.getLogger('FilesetClassTest')

    # Setup the initial testcase environment: a fileset seeded with one file.
    initialfile = File('/tmp/lfn1', 1000, 1, 1, 1)
    self.initialSet = set()
    self.initialSet.add(initialfile)

    # Create a Fileset containing the initial file.
    self.fileset = Fileset(name='self.fileset', files=self.initialSet)

    # Populate the fileset with 999 random files.
    for _ in range(1, 1000):
        lfn = '/store/data/%s/%s/file.root' % (random.randint(1000, 9999),
                                               random.randint(1000, 9999))
        size = random.randint(1000, 2000)
        events = 1000
        run = random.randint(0, 2000)
        lumi = random.randint(0, 8)
        # Renamed from 'file' to avoid shadowing the builtin.
        newFile = File(lfn=lfn, size=size, events=events,
                       checksums={"cksum": "1"})
        newFile.addRun(Run(run, *[lumi]))
        self.fileset.addFile(newFile)
def __call__(self, fileset):
    """
    Return a randomly sized list of files (DataStructs.File) at
    locations; files will always be new.
    """
    num_files = random.randint(0, self.max)
    for f in self.makelist(fileset):
        for i in range(0, num_files):
            # Decide where the file is: each known location is included
            # with 50% probability.
            # BUG FIX: the inner loop below used to reuse 'i' as its
            # index, clobbering the outer file index that the lfn/size/
            # events/run computations depend on; it now uses 'j'.
            locs = []
            for j in range(0, len(self.locations)):
                if random.randint(0, 1):
                    locs.append(self.locations[j])
            lfn = '/store/data/fake-feeder-files/notreal/%s.root' % uuid(i)
            size = 2000 + ((i - 5) * 50)
            events = 1000 + ((i - 3) * 150)
            run = random.randint(0, int(3.14159265 * i * self.max))
            lumi = random.randint(0, 10)
            # Renamed from 'file' to avoid shadowing the builtin; the
            # unused 'list = []' (which shadowed a builtin) was removed.
            newFile = File(lfn, size, events, run, lumi)
            newFile.setLocation(locs)
            f.addFile(newFile)
    return fileset
def execute(self, *args, **kwargs):  #pylint: disable=unused-argument
    """Build and return a fake MC fileset holding a single MCFakeFile."""
    # since https://github.com/dmwm/CRABServer/issues/5633 totalunits can be a float
    # but that would confuse WMCore, therefore cast to int
    totalevents = int(kwargs['task']['tm_totalunits'])
    firstEvent, lastEvent = 1, totalevents
    firstLumi, lastLumi = 1, 10

    # Default to 100 events per lumi; the splitter reads this task
    # property independently of the fake-dataset file information.
    if not kwargs['task']['tm_events_per_lumi']:
        kwargs['task']['tm_events_per_lumi'] = 100

    # MC comes with only one MCFakeFile
    fakeFile = File("MCFakeFile", size=1000, events=totalevents)
    fakeFile.setLocation(self.getListOfSites())
    fakeFile.addRun(Run(1, *range(firstLumi, lastLumi + 1)))
    fakeFile["block"] = 'MCFakeBlock'
    fakeFile["first_event"] = firstEvent
    fakeFile["last_event"] = lastEvent

    fakeFileset = Fileset(name="MCFakeFileSet")
    fakeFileset.addFile(fakeFile)
    return Result(task=kwargs['task'], result=fakeFileset)
def testAddFile(self):
    """
    Testcase for the addFile method of the Fileset class
    """
    # First test - add a file and check that it shows up as new.
    # BUG FIX: the original used assert(expr, msg) -- asserting a
    # non-empty tuple, which always passes -- so none of these checks
    # ever ran.  Rewritten with self.assertTrue.
    testfile = File('/tmp/lfntest', 9999, 9, 9)
    self.fileset.addFile(testfile)
    self.assertTrue(testfile in self.fileset.listNewFiles(),
                    "Couldn't add file to fileset - fileset.addfile "
                    "method not working")

    # Second test - add a file that is already in Fileset.files and
    # check that its location gets updated.
    testFileSame = File('/tmp/lfntest', 9999, 9, 9)
    # BUG FIX: set('dummyse.dummy.com') built a set of individual
    # characters; the location string must be wrapped in a list.
    testFileSame.setLocation(set(['dummyse.dummy.com']))
    self.fileset.addFile(testFileSame)
    self.assertTrue(testFileSame in self.fileset.getFiles(),
                    "Same file copy failed - fileset.addFile not updating "
                    "location of already existing files")
    self.assertTrue(testfile in self.fileset.getFiles(),
                    "Same file copy failed - fileset.addFile unable to "
                    "remove previous file from list")

    # Third test - the duplicate must also appear in newfiles.
    self.assertTrue(testFileSame in self.fileset.listNewFiles(),
                    "Same file copy failed - fileset.addFile not adding "
                    "file to fileset.newFiles")
def getChunkFiles(self, collectionName, filesetName, chunkOffset, chunkSize = 100,
                  user = "******", group = "cmsdataops"):
    """
    _getChunkFiles_

    Retrieve a chunk of files from the given collection and task.
    """
    viewOptions = {"startkey": [group, user, collectionName, filesetName],
                   "endkey": [group, user, collectionName, filesetName, {}],
                   "limit": chunkSize,
                   "skip": chunkOffset}
    result = self.couchdb.loadView("ACDC", "owner_coll_fileset_files",
                                   viewOptions, [])

    chunkFiles = []
    for row in result["rows"]:
        fileInfo = row['value']
        chunkFile = File(lfn=fileInfo["lfn"], size=fileInfo["size"],
                         events=fileInfo["events"],
                         parents=set(fileInfo["parents"]),
                         locations=set(fileInfo["locations"]),
                         merged=fileInfo["merged"])
        # Rebuild each Run object from its run number and lumi list.
        for runInfo in fileInfo["runs"]:
            runObj = Run(runInfo["run_number"])
            runObj.extend(runInfo["lumis"])
            chunkFile.addRun(runObj)
        chunkFiles.append(chunkFile)

    return chunkFiles
def doBlock(self, entity, fileset):
    # Query PhEDEx for one block and add its files (with run/lumi and
    # replica locations) to the given fileset.  Python 2 code.
    connection = urlopen(self.nodeURL + "&block=%s" % quote(entity))
    aString = connection.read()
    connection.close()

    # Sanity-check the reply: a good payload looks like "{'phedex': ...}".
    # NOTE(review): on a bad reply this only prints a warning and still
    # falls through to eval() below -- presumably it should abort; confirm.
    if aString[2:8] != "phedex":
        print "PhEDExNotifier: bad string from server follows."
        print "%s" % aString

    # HACK: the JSON-ish payload is parsed with eval() after mapping
    # "null" -> None.  eval() on remote data is unsafe even with empty
    # globals/locals; a real JSON parser would be preferable.
    phedex = eval(aString.replace("null", "None"), {}, {})

    blocks = phedex["phedex"]["block"]
    if len(blocks) != 1:
        print "PhEDExNotifier: Found %d blocks, expected 1, will only consider first block" % len(blocks)

    files = blocks[0]["file"]
    for file in files:
        lfn = file["name"]
        events = self.getEvents(lfn)
        (runs, lumis) = self.getRunLumi(lfn)
        # Only the first run/lumi pair is attached to the File.
        fileToAdd = File(lfn, file["bytes"], events, runs[0], lumis[0])
        replicas = file["replica"]
        if len(replicas) > 0:
            locations = []
            for replica in replicas:
                locations.append(replica["node"])
            fileToAdd.setLocation(locations)
        fileset.addFile(fileToAdd)
def execute(self, *args, **kwargs):
    """Build a one-file fake MC fileset, located via SiteDB."""
    totalevents = kwargs['task']['tm_totalunits']
    firstEvent, lastEvent = 1, totalevents
    firstLumi, lastLumi = 1, 10

    # Default to 100 events per lumi; the splitter reads this task
    # property independently of the fake-dataset file information.
    if not kwargs['task']['tm_events_per_lumi']:
        kwargs['task']['tm_events_per_lumi'] = 100

    # MC comes with only one MCFakeFile
    fakeFile = File("MCFakeFile", size=1000, events=totalevents)
    sbj = SiteDBJSON({"key": self.config.TaskWorker.cmskey,
                      "cert": self.config.TaskWorker.cmscert})
    fakeFile.setLocation(sbj.getAllCMSNames())
    fakeFile.addRun(Run(1, *range(firstLumi, lastLumi + 1)))
    fakeFile["block"] = 'MCFakeBlock'
    fakeFile["first_event"] = firstEvent
    fakeFile["last_event"] = lastEvent

    mcFileset = Fileset(name="MCFakeFileSet")
    mcFileset.addFile(fakeFile)
    return Result(task=kwargs['task'], result=mcFileset)
def __init__(self, lfn=None, id=-1, size=None, events=None, checksums=None,
             parents=None, locations=None, status="NOTUPLOADED"):
    """
    Initialise a DBSBuffer file on top of the WMCore File structure.

    :param checksums: dict of checksum name -> value (defaults to empty)
    :param locations: iterable of locations, stored under 'newlocations'
    :param status: upload status flag, defaults to NOTUPLOADED
    """
    # BUG FIX: checksums previously defaulted to a *shared* mutable
    # dict ({}); default to None and create a fresh dict per instance.
    checksums = checksums or {}
    WMBSBase.__init__(self)
    WMFile.__init__(self, lfn=lfn, size=size, events=events,
                    checksums=checksums, parents=parents, merged=True)
    self.setdefault("status", status)
    self.setdefault("id", id)

    # Parameters for the algorithm
    self.setdefault("appName", None)
    self.setdefault("appVer", None)
    self.setdefault("appFam", None)
    self.setdefault("psetHash", None)
    self.setdefault("configContent", None)
    self.setdefault("datasetPath", None)
    self.setdefault("valid_status", None)

    if locations is None:
        self.setdefault("newlocations", set())
    else:
        self.setdefault("newlocations", self.makeset(locations))

    # The WMBS base class creates a DAO factory for WMBS; overwrite it
    # so we can use the factory for DBSBuffer objects instead.
    self.daoFactory = DAOFactory(package="WMComponent.DBSBuffer.Database",
                                 logger=self.logger,
                                 dbinterface=self.dbi)
    # Remove reference to WMBS daofactory to prevent confusion
    self.daofactory = self.daoFactory
    return
def setUp(self):
    """
    _setUp_

    Create three subscriptions: single-file, multiple-file, and an
    empty (zero-event) fileset, all split EventBased.
    """
    self.multipleFileFileset = Fileset(name="TestFileset1")
    for i in range(10):
        newFile = File(makeUUID(), size=1000, events=100)
        newFile.setLocation("se01")
        self.multipleFileFileset.addFile(newFile)

    self.singleFileFileset = Fileset(name="TestFileset2")
    newFile = File("/some/file/name", size=1000, events=100)
    newFile.setLocation("se02")
    self.singleFileFileset.addFile(newFile)

    self.emptyFileFileset = Fileset(name="TestFileset3")
    newFile = File("/some/file/name", size=1000, events=0)
    # BUG FIX: this used newFile.setdefault("se03"), which merely set a
    # dict key "se03" to None instead of recording the file location.
    newFile.setLocation("se03")
    self.emptyFileFileset.addFile(newFile)

    testWorkflow = Workflow()
    self.multipleFileSubscription = Subscription(
        fileset=self.multipleFileFileset,
        workflow=testWorkflow,
        split_algo="EventBased",
        type="Processing"
    )
    self.singleFileSubscription = Subscription(
        fileset=self.singleFileFileset,
        workflow=testWorkflow,
        split_algo="EventBased",
        type="Processing"
    )
    self.emptyFileSubscription = Subscription(
        fileset=self.emptyFileFileset,
        workflow=testWorkflow,
        split_algo="EventBased",
        type="Processing"
    )
    return
def jobConfig(self, wf, task, jobid, lfn):
    """
    Create a fake job dict to upload to the ACDC server
    """
    fakeFile = File(lfn=lfn, size=1024, events=1024)
    fakeFile.setLocation(["T2_CH_CERN", "T2_CH_CERN_HLT"])
    fakeFile.addRun(Run(jobid, 1, 2))  # the run number doubles as jobid
    fakeJob = self.getMinimalJob(wf, task)
    fakeJob.addFile(fakeFile)
    return fakeJob
def getOutputFile(self, fileName, outputModule, step):
    """
    _getOutputFile_

    Takes a fileRef object and returns a DataStructs/File object as output

    :param fileName: attribute name of the file inside the output module
    :param outputModule: name of the output module to search
    :param step: step containing the output module
    :returns: a populated File, or None when the output module is missing
    """
    outputMod = self.getOutputModule(step=step, outputModule=outputModule)
    if not outputMod:
        return None
    fileRef = getattr(outputMod.files, fileName, None)
    newFile = File(locations=set())

    # Locations
    newFile.setLocation(getattr(fileRef, "location", None))

    # Runs: each section under fileRef.runs is one run whose payload is
    # either a dict or a plain lumi sequence.
    runList = fileRef.runs.listSections_()
    for run in runList:
        lumis = getattr(fileRef.runs, run)
        if isinstance(lumis, dict):
            newRun = Run(int(run), *lumis.items())
        else:
            newRun = Run(int(run), *lumis)
        newFile.addRun(newRun)

    # Copy the flat attributes off the report, with defaults for any
    # that the framework job report did not carry.
    newFile["lfn"] = getattr(fileRef, "lfn", None)
    newFile["pfn"] = getattr(fileRef, "pfn", None)
    newFile["events"] = int(getattr(fileRef, "events", 0))
    newFile["size"] = int(getattr(fileRef, "size", 0))
    newFile["branches"] = getattr(fileRef, "branches", [])
    newFile["input"] = getattr(fileRef, "input", [])
    newFile["inputpfns"] = getattr(fileRef, "inputpfns", [])
    newFile["branch_hash"] = getattr(fileRef, "branch_hash", None)
    newFile["catalog"] = getattr(fileRef, "catalog", "")
    newFile["guid"] = getattr(fileRef, "guid", "")
    newFile["module_label"] = getattr(fileRef, "module_label", "")
    newFile["checksums"] = getattr(fileRef, "checksums", {})
    newFile["merged"] = bool(getattr(fileRef, "merged", False))
    newFile["dataset"] = getattr(fileRef, "dataset", {})
    newFile["acquisitionEra"] = getattr(fileRef, 'acquisitionEra', None)
    newFile["processingVer"] = getattr(fileRef, 'processingVer', None)
    newFile["validStatus"] = getattr(fileRef, 'validStatus', None)
    newFile["globalTag"] = getattr(fileRef, 'globalTag', None)
    newFile["prep_id"] = getattr(fileRef, 'prep_id', None)
    newFile['configURL'] = getattr(fileRef, 'configURL', None)
    newFile['inputPath'] = getattr(fileRef, 'inputPath', None)
    newFile["outputModule"] = outputModule
    # Keep a handle on the raw report section for downstream consumers.
    newFile["fileRef"] = fileRef
    return newFile
def createResubmitSpec(self, serverUrl, couchDB):
    """
    _createResubmitSpec_

    Create a bogus resubmit workload: build a single 'reco' Processing
    task over an input dataset, register one failed two-file job with
    the ACDC DataCollectionService, then truncate the workload into a
    Resubmit spec rooted at the top-level task.
    """
    self.site = "cmssrm.fnal.gov"
    workload = WMWorkloadHelper(WMWorkload("TestWorkload"))
    reco = workload.newTask("reco")
    workload.setOwnerDetails(name = "evansde77", group = "DMWM")

    # first task uses the input dataset
    reco.addInputDataset(primary = "PRIMARY",
                         processed = "processed-v1",
                         tier = "TIER1")
    reco.data.input.splitting.algorithm = "File"
    reco.setTaskType("Processing")
    cmsRunReco = reco.makeStep("cmsRun1")
    cmsRunReco.setStepType("CMSSW")
    reco.applyTemplates()
    cmsRunRecoHelper = cmsRunReco.getTypeHelper()
    cmsRunRecoHelper.addOutputModule("outputRECO",
                                     primaryDataset = "PRIMARY",
                                     processedDataset = "processed-v2",
                                     dataTier = "TIER2",
                                     lfnBase = "/store/dunkindonuts",
                                     mergedLFNBase = "/store/kfc")
    dcs = DataCollectionService(url = serverUrl, database = couchDB)

    def getJob(workload):
        # Minimal job dict pointing at the reco task of this workload.
        job = Job()
        job["task"] = workload.getTask("reco").getPathName()
        job["workflow"] = workload.name()
        job["location"] = self.site
        job["owner"] = "evansde77"
        job["group"] = "DMWM"
        return job

    # Two files covering run 1, lumis 1-2 and 3-4 respectively.
    testFileA = WMFile(lfn = makeUUID(), size = 1024, events = 1024)
    testFileA.setLocation([self.site])
    testFileA.addRun(Run(1, 1, 2))
    testFileB = WMFile(lfn = makeUUID(), size = 1024, events = 1024)
    testFileB.setLocation([self.site])
    testFileB.addRun(Run(1, 3, 4))
    testJobA = getJob(workload)
    testJobA.addFile(testFileA)
    testJobA.addFile(testFileB)

    # Register the failed job so ACDC has something to resubmit.
    dcs.failedJobs([testJobA])
    topLevelTask = workload.getTopLevelTask()[0]
    workload.truncate("Resubmit_TestWorkload", topLevelTask.getPathName(),
                      serverUrl, couchDB)

    return workload
def createFile(lfn, events, run, lumis, location):
    """
    _createFile_

    Create a file for testing
    """
    newFile = File(lfn=lfn, size=1000, events=events)
    # Lumi numbers are offset by run * lumis so they are unique per run.
    lumiList = [(run * lumis) + offset for offset in range(lumis)]
    newFile.addRun(Run(run, *lumiList))
    newFile.setLocation(location)
    return newFile
def getInputFilesFromStep(self, stepName, inputSource = None):
    """
    _getInputFilesFromStep_

    Retrieve a list of input files from the given step.

    :param stepName: name of the step to inspect
    :param inputSource: restrict to one input-source section; when None,
        every source under step.input is read
    :returns: list of File objects populated from the job report
    """
    step = self.retrieveStep(stepName)

    inputSources = []
    if inputSource == None:
        inputSources = step.input.listSections_()
    else:
        inputSources = [inputSource]

    inputFiles = []
    for inputSource in inputSources:
        source = getattr(step.input, inputSource)
        # Files are stored as attributes file0..file<N-1>, with the
        # count kept in source.files.fileCount.
        for fileNum in range(source.files.fileCount):
            fwjrFile = getattr(source.files, "file%d" % fileNum)

            # Pull the flat attributes off the report section, defaulting
            # the ones a report may legitimately omit.
            lfn = getattr(fwjrFile, "lfn", None)
            pfn = getattr(fwjrFile, "pfn", None)
            size = getattr(fwjrFile, "size", 0)
            events = getattr(fwjrFile, "events", 0)
            branches = getattr(fwjrFile, "branches", [])
            catalog = getattr(fwjrFile, "catalog", None)
            guid = getattr(fwjrFile, "guid", None)
            inputSourceClass = getattr(fwjrFile, "input_source_class", None)
            moduleLabel = getattr(fwjrFile, "module_label", None)
            inputType = getattr(fwjrFile, "input_type", None)

            inputFile = File(lfn = lfn, size = size, events = events)
            inputFile["pfn"] = pfn
            inputFile["branches"] = branches
            inputFile["catalog"] = catalog
            inputFile["guid"] = guid
            inputFile["input_source_class"] = inputSourceClass
            inputFile["module_label"] = moduleLabel
            inputFile["input_type"] = inputType

            # Each run section is named by its run number and holds the
            # lumi list for that run.
            runSection = getattr(fwjrFile, "runs")
            runNumbers = runSection.listSections_()
            for runNumber in runNumbers:
                lumiTuple = getattr(runSection, str(runNumber))
                inputFile.addRun(Run(int(runNumber), *lumiTuple))

            inputFiles.append(inputFile)

    return inputFiles
def generateFakeMCFile(self, numEvents=100, firstEvent=1, lastEvent=100,
                       firstLumi=1, lastLumi=10):
    """Build a Production subscription around a single fake MC file."""
    # MC comes with only one MCFakeFile
    mcFile = File("MCFakeFileTest", size=1000, events=numEvents)
    mcFile.setLocation("se01")
    mcFile.addRun(Run(1, *range(firstLumi, lastLumi + 1)))
    mcFile["first_event"] = firstEvent
    mcFile["last_event"] = lastEvent

    mcFileset = Fileset(name="MCTestFileset")
    mcFileset.addFile(mcFile)

    testWorkflow = Workflow()
    return Subscription(fileset=mcFileset,
                        workflow=testWorkflow,
                        split_algo="EventBased",
                        type="Production")
def setUp(self):
    """
    _setUp_

    Create two subscriptions: one containing a single file and one
    containing multiple files, both split FileBased.
    """
    self.multipleFileFileset = Fileset(name="TestFileset1")
    for _ in range(10):
        multiFile = File(makeUUID(), size=1000, events=100)
        multiFile.setLocation('blenheim')
        multiFile.setLocation('malpaquet')
        self.multipleFileFileset.addFile(multiFile)

    self.singleFileFileset = Fileset(name="TestFileset2")
    singleFile = File("/some/file/name", size=1000, events=100)
    singleFile.setLocation('blenheim')
    self.singleFileFileset.addFile(singleFile)

    testWorkflow = Workflow()
    self.multipleFileSubscription = Subscription(fileset=self.multipleFileFileset,
                                                 workflow=testWorkflow,
                                                 split_algo="FileBased",
                                                 type="Processing")
    self.singleFileSubscription = Subscription(fileset=self.singleFileFileset,
                                               workflow=testWorkflow,
                                               split_algo="FileBased",
                                               type="Processing")
    return
def execute(self, *args, **kwargs):
    """
    Build a fileset from a user-supplied list of files; mark the task
    FAILED on the server and raise StopHandler when no files are given
    or splitting is not FileBased.
    """
    self.logger.info("Data discovery and splitting for %s using user-provided files"
                     % kwargs['task']['tm_taskname'])

    if 'tm_user_files' in kwargs['task'] and kwargs['task']['tm_user_files']:
        userfiles = kwargs['task']['tm_user_files']
    else:  ## For backward compatibility only.
        userfiles = kwargs['task']['tm_arguments'].get('userfiles')
    splitting = kwargs['task']['tm_split_algo']
    total_units = kwargs['task']['tm_totalunits']
    if not userfiles or splitting != 'FileBased':
        if not userfiles:
            msg = "No files specified to process for task %s." % kwargs['task']['tm_taskname']
        if splitting != 'FileBased':
            msg = "Data.splitting must be set to 'FileBased' when using a custom set of files."
        self.logger.error("Setting %s as failed: %s" % (kwargs['task']['tm_taskname'], msg))
        configreq = {'workflow': kwargs['task']['tm_taskname'],
                     'status': "FAILED",
                     'subresource': 'failure',
                     'failure': b64encode(msg)}
        self.server.post(self.resturi, data=urllib.urlencode(configreq))
        raise StopHandler(msg)

    # Use the statically configured site list when present, otherwise
    # fall back to all sites known to SiteDB.
    if hasattr(self.config.Sites, 'available'):
        locations = self.config.Sites.available
    else:
        sbj = SiteDBJSON({"key": self.config.TaskWorker.cmskey,
                          "cert": self.config.TaskWorker.cmscert})
        locations = sbj.getAllCMSNames()

    userFileset = Fileset(name=kwargs['task']['tm_taskname'])
    self.logger.info("There are %d files specified by the user." % len(userfiles))
    if total_units > 0:
        self.logger.info("Will run over the first %d files." % total_units)
    file_counter = 0
    # IDIOM: enumerate() replaces the old zip(userfiles, range(len(userfiles))).
    for idx, userfile in enumerate(userfiles):
        newFile = File(userfile, size=1000, events=1)
        newFile.setLocation(locations)
        newFile.addRun(Run(1, idx))
        newFile["block"] = 'UserFilesFakeBlock'
        newFile["first_event"] = 1
        newFile["last_event"] = 2
        userFileset.addFile(newFile)
        file_counter += 1
        if total_units > 0 and file_counter >= total_units:
            break

    return Result(task=kwargs['task'], result=userFileset)
def createFilesetFromDBS(self, collection, filesetName, dbsURL, dataset, mask=None):
    """
    _createFilesetFromDBS_

    Get info from DBS, apply mask (filter) and create a fileset
    """
    fileSet = CouchFileset(database=self.database, url=self.url,
                           name=filesetName)
    fileSet.setCollection(collection)

    files = []
    blockLocations = {}

    dbsReader = DBSReader(dbsURL, version="DBS_2_0_9", mode="GET")

    dbsResults = dbsReader.dbs.listFiles(path=dataset,
                                         retriveList=["retrive_lumi", "retrive_run"])
    logging.info("Found %s files from DBS" % len(dbsResults))

    for dbsResult in dbsResults:
        blockName = dbsResult["Block"]["Name"]
        # Cache block locations so each block is looked up only once.
        # IDIOM FIX: 'not x in y' rewritten as 'x not in y'.
        if blockName not in blockLocations:
            blockLocations[blockName] = dbsReader.listFileBlockLocation(blockName)

        # Renamed from 'file' to avoid shadowing the builtin.
        dbsFile = File(lfn=dbsResult["LogicalFileName"],
                       size=dbsResult["FileSize"],
                       merged=True,
                       events=dbsResult["NumberOfEvents"],
                       locations=blockLocations[blockName])
        # Group the flat lumi list into Run objects keyed by run number.
        runs = {}
        for lumi in dbsResult["LumiList"]:
            runNumber = lumi["RunNumber"]
            runString = str(runNumber)
            lumiNumber = lumi["LumiSectionNumber"]
            if runString in runs:
                runs[runString].lumis.append(lumiNumber)
            else:
                runs[runString] = Run(runNumber, lumiNumber)
        for run in runs.values():
            dbsFile.addRun(run)
        files.append(dbsFile)

    logging.info("Uploading %s files in fileset" % len(files))
    fileList = fileSet.add(files, mask)

    return fileSet, fileList
def getFileset(self):
    """
    Get a fileset based on the task
    """
    # NOTE(review): 'type' here resolves to the *builtin* type -- no
    # local or argument of that name is visible in this scope -- so the
    # fileset is literally named "Merge<type 'type'>".  A task-type
    # string was probably intended; confirm against callers.
    fileset = Fileset(name='Merge%s' % (type))

    # Populate with 15-25 random files under the test directory.
    for i in range(0, random.randint(15, 25)):
        # Use the testDir to generate a random lfn
        inpFile = File(lfn="%s/%s.root" % (self.testDir, makeUUID()),
                       size=random.randint(200000, 1000000),
                       events=random.randint(1000, 2000))
        inpFile.setLocation('Megiddo')
        fileset.addFile(inpFile)

    return fileset
def testDataStructsFile(self):
    """
    _testDataStructsFile_

    Tests our ability to create a WMBS file from a DataStructs File
    and vice versa
    """
    myThread = threading.currentThread()

    # Reference values for the round trip.
    testLFN = "lfn1"
    testSize = 1024
    testEvents = 100
    testCksum = {"cksum": '1'}
    testParents = set(["lfn2"])
    testRun = Run( 1, *[45])
    testSE = "se1.cern.ch"

    # The parent must exist in WMBS before the child can reference it.
    parentFile = File(lfn= "lfn2")
    parentFile.create()

    testFile = File()

    inputFile = WMFile(lfn = testLFN, size = testSize, events = testEvents,
                       checksums = testCksum, parents = testParents)
    inputFile.addRun(testRun)
    inputFile.setLocation(se = testSE)

    # DataStructs -> WMBS, then persist.
    testFile.loadFromDataStructsFile(file = inputFile)
    testFile.create()
    testFile.save()

    # Reload from the database and compare against the references.
    loadFile = File(lfn = "lfn1")
    loadFile.loadData(parentage = 1)

    self.assertEqual(loadFile['size'], testSize)
    self.assertEqual(loadFile['events'], testEvents)
    self.assertEqual(loadFile['checksums'], testCksum)
    self.assertEqual(loadFile['locations'], set([testSE]))
    #self.assertEqual(loadFile['parents'].pop()['lfn'], 'lfn2')

    # WMBS -> DataStructs: the round trip must be lossless.
    wmFile = loadFile.returnDataStructsFile()
    self.assertEqual(wmFile == inputFile, True)

    return
def execute(self, *args, **kwargs):
    """Build the single-file fake MC fileset for the task."""
    totalevents = kwargs['task']['tm_totalunits']
    firstEvent = 1
    lastEvent = totalevents
    firstLumi = 1
    lastLumi = 10

    # MC comes with only one MCFakeFile
    singleMCFileset = Fileset(name="MCFakeFileSet")
    newFile = File("MCFakeFile", size=1000, events=totalevents)
    newFile.setLocation(self.config.Sites.available)
    newFile.addRun(Run(1, *range(firstLumi, lastLumi + 1)))
    # BUG FIX: the block name was misspelled 'MCFackBlock'; every
    # sibling implementation in this codebase uses 'MCFakeBlock'.
    newFile["block"] = 'MCFakeBlock'
    newFile["first_event"] = firstEvent
    newFile["last_event"] = lastEvent
    singleMCFileset.addFile(newFile)

    return Result(task=kwargs['task'], result=singleMCFileset)
def __init__(self, lfn="", id=-1, size=0, events=0, checksums=None,
             parents=None, locations=None, first_event=0, last_event=0,
             merged=True):
    """
    WMBS File constructor.

    :param checksums: dict of checksum name -> value (defaults to empty)
    :param locations: a single location string, or an iterable of
        locations; stored under 'newlocations'
    """
    # BUG FIX: checksums previously defaulted to a *shared* mutable
    # dict ({}); default to None and create a fresh dict per instance.
    checksums = checksums or {}
    WMBSBase.__init__(self)
    WMFile.__init__(self, lfn=lfn, size=size, events=events,
                    checksums=checksums, parents=parents, merged=merged)

    if locations is None:
        self.setdefault("newlocations", set())
    else:
        # IDIOM FIX: isinstance() instead of 'type(x) == str'.
        if isinstance(locations, str):
            # A bare string is wrapped in a set rather than iterated
            # character by character.
            self.setdefault("newlocations", set())
            self['newlocations'].add(locations)
        else:
            self.setdefault("newlocations", locations)

    # overwrite the default value set from the WMFile
    self["first_event"] = first_event
    self["last_event"] = last_event
    self.setdefault("id", id)

    self['locations'] = set()
def execute(self, *args, **kwargs):
    """
    Build a fileset from a user-supplied list of files; raise
    TaskWorkerException when no files are given or splitting is not
    FileBased.
    """
    self.logger.info("Data discovery and splitting for %s using user-provided files"
                     % kwargs['task']['tm_taskname'])

    userfiles = kwargs['task']['tm_user_files']
    splitting = kwargs['task']['tm_split_algo']
    total_units = kwargs['task']['tm_totalunits']
    if not userfiles or splitting != 'FileBased':
        if not userfiles:
            msg = "No files specified to process for task %s." % kwargs['task']['tm_taskname']
        if splitting != 'FileBased':
            msg = "Data.splitting must be set to 'FileBased' when using a custom set of files."
        raise TaskWorkerException(msg)

    # Use the statically configured site list when present, otherwise
    # ask CRIC for the full list of processing site names.
    if hasattr(self.config.Sites, 'available'):
        locations = self.config.Sites.available
    else:
        with self.config.TaskWorker.envForCMSWEB:
            configDict = {"cacheduration": 1, "pycurl": True}  # cache duration is in hours
            resourceCatalog = CRIC(logger=self.logger, configDict=configDict)
            locations = resourceCatalog.getAllPSNs()

    userFileset = Fileset(name=kwargs['task']['tm_taskname'])
    self.logger.info("There are %d files specified by the user." % len(userfiles))
    if total_units > 0:
        self.logger.info("Will run over the first %d files." % total_units)
    file_counter = 0
    # IDIOM: enumerate() replaces the old zip(userfiles, range(len(userfiles))).
    for idx, userfile in enumerate(userfiles):
        newFile = File(userfile, size=1000, events=1)
        newFile.setLocation(locations)
        newFile.addRun(Run(1, idx))
        newFile["block"] = 'UserFilesFakeBlock'
        newFile["first_event"] = 1
        newFile["last_event"] = 2
        userFileset.addFile(newFile)
        file_counter += 1
        if total_units > 0 and file_counter >= total_units:
            break

    return Result(task=kwargs['task'], result=userFileset)
def __init__(self, lfn=None, id=-1, size=None, events=None, checksums=None,
             parents=None, locations=None, status="NOTUPLOADED",
             inPhedex=0, workflowId=None, prep_id=None):
    """
    Initialise a DBS3Buffer file on top of the WMCore File structure.

    :param checksums: dict of checksum name -> value (defaults to empty)
    :param locations: iterable of locations, stored under 'newlocations'
    :param status: upload status flag
    :param inPhedex: whether the file is known to PhEDEx (0/1)
    :param workflowId: id of the owning workflow
    :param prep_id: PREP id to attach to the file
    """
    checksums = checksums or {}
    WMBSBase.__init__(self)
    WMFile.__init__(self, lfn=lfn, size=size, events=events,
                    checksums=checksums, parents=parents, merged=True)
    self.setdefault("status", status)
    self.setdefault("in_phedex", inPhedex)
    self.setdefault("id", id)
    self.setdefault("workflowId", workflowId)

    # Parameters for the algorithm
    self.setdefault("appName", None)
    self.setdefault("appVer", None)
    self.setdefault("appFam", None)
    self.setdefault("psetHash", None)
    self.setdefault("configContent", None)
    self.setdefault("datasetPath", None)
    self.setdefault("processingVer", None)
    self.setdefault("acquisitionEra", None)
    self.setdefault("validStatus", None)
    self.setdefault("globalTag", None)
    self.setdefault("datasetParent", None)
    # BUG FIX: the prep_id argument was accepted but silently ignored
    # (the default was hard-coded to None instead of using prep_id).
    self.setdefault("prep_id", prep_id)

    if locations is None:
        self.setdefault("newlocations", set())
    else:
        self.setdefault("newlocations", self.makeset(locations))

    # The WMBS base class creates a DAO factory for WMBS, we'll need to
    # overwrite that so we can use the factory for DBSBuffer objects.
    self.daofactory = DAOFactory(package="WMComponent.DBS3Buffer",
                                 logger=self.logger,
                                 dbinterface=self.dbi)
    return
def getChunkFiles(self, collectionName, filesetName, chunkOffset, chunkSize=100):
    """
    _getChunkFiles_

    Retrieve a chunk of files from the given collection and task.
    """
    rawFiles = self._getFilesetInfo(collectionName, filesetName,
                                    chunkOffset, chunkSize)

    chunkFiles = []
    for info in mergeFakeFiles(rawFiles):
        acdcFile = File(lfn=info["lfn"], size=info["size"],
                        events=info["events"],
                        parents=set(info["parents"]),
                        locations=set(info["locations"]),
                        merged=info["merged"])
        # Rebuild each Run object from its run number and lumi list.
        for runInfo in info["runs"]:
            runObj = Run(runInfo["run_number"])
            runObj.extend(runInfo["lumis"])
            acdcFile.addRun(runObj)
        chunkFiles.append(acdcFile)

    return chunkFiles
def setUp(self):
    """
    _setUp_

    Initial Setup for the Job Testcase
    """
    self.inputFiles = []

    # Build 999 random input files for the dummy job.
    for i in range(1, 1000):
        lfn = "/store/data/%s/%s/file.root" % (random.randint(1000, 9999),
                                               random.randint(1000, 9999))
        size = random.randint(1000, 2000)
        events = 1000
        run = random.randint(0, 2000)
        lumi = random.randint(0, 8)

        # Renamed from 'file' to avoid shadowing the builtin.
        inputFile = File(lfn=lfn, size=size, events=events,
                         checksums={"cksum": "1"})
        inputFile.addRun(Run(run, *[lumi]))
        self.inputFiles.append(inputFile)

    self.dummyJob = Job(files=self.inputFiles)
    return
def addOutputFilesToReport(self, report):
    """
    _addOutputFilesToReport_

    Add output files to every output module in the step.  Scale the
    size and number of events in the output files appropriately.
    """
    (outputSize, outputEvents) = self.determineOutputSize()

    # Create a small dummy file on disk so the emulated output file has
    # a readable pfn to point at.
    if not os.path.exists('ReportEmuTestFile.txt'):
        f = open('ReportEmuTestFile.txt', 'w')
        f.write('A Shubbery')
        f.close()

    for outputModuleName in self.step.listOutputModules():
        outputModuleSection = self.step.getOutputModule(outputModuleName)
        outputModuleSection.fixedLFN = False
        outputModuleSection.disableGUID = False

        # Each output module gets one file with a fresh UUID-based LFN.
        outputLFN = "%s/%s.root" % (outputModuleSection.lfnBase,
                                    str(makeUUID()))
        outputFile = File(lfn = outputLFN, size = outputSize,
                          events = outputEvents, merged = False)
        outputFile.setLocation(self.job["location"])
        outputFile['pfn'] = "ReportEmuTestFile.txt"
        outputFile['guid'] = "ThisIsGUID"
        outputFile["checksums"] = {"adler32": "1234", "cksum": "5678"}
        outputFile["dataset"] = {"primaryDataset": outputModuleSection.primaryDataset,
                                 "processedDataset": outputModuleSection.processedDataset,
                                 "dataTier": outputModuleSection.dataTier,
                                 "applicationName": "cmsRun",
                                 "applicationVersion": self.step.getCMSSWVersion()}
        outputFile["module_label"] = outputModuleName

        outputFileSection = report.addOutputFile(outputModuleName, outputFile)
        # Propagate the run/lumi information of every input file onto
        # the emulated output file.
        for inputFile in self.job["input_files"]:
            Report.addRunInfoToFile(outputFileSection, inputFile["runs"])

    return
def setUp(self):
    """
    _setUp_

    Create three subscriptions: single-file, multiple-file, and an
    empty (zero-event) fileset, all split EventBased.
    """
    self.multipleFileFileset = Fileset(name="TestFileset1")
    for _ in range(10):
        multiFile = File(makeUUID(), size=1000, events=100)
        multiFile.setLocation('se01')
        self.multipleFileFileset.addFile(multiFile)

    self.singleFileFileset = Fileset(name="TestFileset2")
    singleFile = File("/some/file/name", size=1000, events=100)
    singleFile.setLocation('se02')
    self.singleFileFileset.addFile(singleFile)

    self.emptyFileFileset = Fileset(name="TestFileset3")
    emptyFile = File("/some/file/name", size=1000, events=0)
    emptyFile.setLocation('se03')
    self.emptyFileFileset.addFile(emptyFile)

    testWorkflow = Workflow()

    def makeSubscription(fileset):
        # All three subscriptions share the workflow and splitting.
        return Subscription(fileset=fileset,
                            workflow=testWorkflow,
                            split_algo="EventBased",
                            type="Processing")

    self.multipleFileSubscription = makeSubscription(self.multipleFileFileset)
    self.singleFileSubscription = makeSubscription(self.singleFileFileset)
    self.emptyFileSubscription = makeSubscription(self.emptyFileFileset)

    self.eventsPerJob = 100
    self.performanceParams = {'timePerEvent': None,
                              'memoryRequirement': 2300,
                              'sizePerEvent': 400}
    return
def testAddRun(self):
    """
    This tests the addRun() function of a DataStructs File object
    """
    testLumi = 1
    testRunNumber = 1000000

    # Build a file with throwaway metadata; only the run handling is
    # under test here.
    testFile = File(lfn="lfn", size="1024", events="100",
                    checksums="1", parents="parent")
    testRun = Run(testRunNumber, testLumi)

    testFile.addRun(testRun)
    assert testRun in testFile['runs'], \
        "Run not added properly to run in File.addRun()"

    return
def returnDataStructsFile(self):
    """
    _returnDataStructsFile_

    Creates a dataStruct file out of this file
    """
    # Convert each parent entry into a DataStructs file as well.
    parents = set()
    for parent in self["parents"]:
        parents.add(WMFile(lfn=parent['lfn'], size=parent['size'],
                           events=parent['events'],
                           checksums=parent['checksums'],
                           parents=parent['parents'],
                           merged=parent['merged']))

    # Renamed from 'file' to avoid shadowing the builtin.
    dsFile = WMFile(lfn=self['lfn'], size=self['size'],
                    events=self['events'], checksums=self['checksums'],
                    parents=parents, merged=self['merged'])

    for run in self['runs']:
        dsFile.addRun(run)
    for location in self['locations']:
        dsFile.setLocation(se=location)

    return dsFile
def setUp(self):
    """
    _setUp_

    Create subscriptions over single-file, multiple-file, and
    multiple-site filesets.
    """
    self.multipleFileFileset = Fileset(name="TestFileset1")
    for _ in range(10):
        self.multipleFileFileset.addFile(
            File(makeUUID(), size=1000, events=100,
                 locations=set(["somese.cern.ch"])))

    self.singleFileFileset = Fileset(name="TestFileset2")
    self.singleFileFileset.addFile(
        File("/some/file/name", size=1000, events=100,
             locations=set(["somese.cern.ch"])))

    self.multipleSiteFileset = Fileset(name="TestFileset3")
    # Five files held at a single site...
    for _ in range(5):
        oneSiteFile = File(makeUUID(), size=1000, events=100,
                           locations=set(["somese.cern.ch"]))
        oneSiteFile.setLocation("somese.cern.ch")
        self.multipleSiteFileset.addFile(oneSiteFile)
    # ...and five files available at two sites.
    for _ in range(5):
        twoSiteFile = File(makeUUID(), size=1000, events=100)
        twoSiteFile.setLocation(["somese.cern.ch", "otherse.cern.ch"])
        self.multipleSiteFileset.addFile(twoSiteFile)

    testWorkflow = Workflow()
    self.multipleFileSubscription = Subscription(fileset=self.multipleFileFileset,
                                                 workflow=testWorkflow,
                                                 split_algo="SizeBased",
                                                 type="Processing")
    self.singleFileSubscription = Subscription(fileset=self.singleFileFileset,
                                               workflow=testWorkflow,
                                               split_algo="SizeBased",
                                               type="Processing")
    self.multipleSiteSubscription = Subscription(fileset=self.multipleSiteFileset,
                                                 workflow=testWorkflow,
                                                 split_algo="EventBased",
                                                 type="Processing")
    return
def testDropCount(self):
    """
    _testDropCount_

    Verify that dropping a fileset and counting the files in a
    fileset works correctly.
    """
    # Two collections sharing one owner; "StruckThunder" gets two
    # filesets so that one of them can be dropped.
    testCollectionA = CouchCollection(database=self.testInit.couchDbName,
                                      url=self.testInit.couchUrl,
                                      name="Thunderstruck")
    testCollectionB = CouchCollection(database=self.testInit.couchDbName,
                                      url=self.testInit.couchUrl,
                                      name="StruckThunder")
    testCollectionA.setOwner(self.owner)
    testCollectionB.setOwner(self.owner)

    # Five random files shared by all three filesets.
    testFiles = []
    for i in range(5):
        testFile = File(lfn=makeUUID(), size=random.randint(1024, 4096),
                        events=random.randint(1024, 4096))
        testFiles.append(testFile)

    testFilesetA = CouchFileset(database=self.testInit.couchDbName,
                                url=self.testInit.couchUrl,
                                name="TestFilesetA")
    testFilesetB = CouchFileset(database=self.testInit.couchDbName,
                                url=self.testInit.couchUrl,
                                name="TestFilesetB")
    testFilesetC = CouchFileset(database=self.testInit.couchDbName,
                                url=self.testInit.couchUrl,
                                name="TestFilesetC")
    testCollectionA.addFileset(testFilesetA)
    testCollectionB.addFileset(testFilesetB)
    testCollectionB.addFileset(testFilesetC)
    testFilesetA.add(testFiles)
    testFilesetB.add(testFiles)
    testFilesetC.add(testFiles)

    # Drop fileset C; "StruckThunder" must be left with exactly one.
    testFilesetC.drop()

    testCollectionC = CouchCollection(database=self.testInit.couchDbName,
                                      url=self.testInit.couchUrl,
                                      name="StruckThunder")
    testCollectionC.setOwner(self.owner)
    testCollectionC.populate()

    self.assertEqual(
        len(testCollectionC["filesets"]), 1,
        "Error: There should be one fileset in this collection.")
    self.assertEqual(testCollectionC["filesets"][0].fileCount(), 5,
                     "Error: Wrong number of files in fileset.")

    # "Thunderstruck" was untouched and must still hold its fileset.
    testCollectionD = CouchCollection(database=self.testInit.couchDbName,
                                      url=self.testInit.couchUrl,
                                      name="Thunderstruck")
    testCollectionD.setOwner(self.owner)
    testCollectionD.populate()

    self.assertEqual(
        len(testCollectionD["filesets"]), 1,
        "Error: There should be one fileset in this collection.")
    self.assertEqual(testCollectionD["filesets"][0].fileCount(), 5,
                     "Error: Wrong number of files in fileset.")
    return
def testD_NoFileSplitNoHardLimit(self):
    """
    _testD_NoFileSplitNoHardLimit_

    In this case we don't split on file boundaries, check different
    combination of files make sure we make the most of the splitting,
    e.g. include many zero event files in a single job.
    """
    splitter = SplitterFactory()

    # Create 100 files with 7 lumi per file and 0 events per lumi on average.
    testSubscription = self.createSubscription(nFiles=100, lumisPerFile=7,
                                               twoSites=False, nEventsPerFile=0)
    jobFactory = splitter(package="WMCore.DataStructs",
                          subscription=testSubscription)

    # First test, the optimal settings are 360 events per job.
    # As we have files with 0 events per lumi, this will configure the
    # splitting to a single job containing all files.
    jobGroups = jobFactory(halt_job_on_file_boundaries=False, splitOnRun=False,
                           events_per_job=360,
                           performance=self.performanceParams)
    self.assertEqual(len(jobGroups), 1, "There should be only one job group")
    jobs = jobGroups[0].jobs
    self.assertEqual(len(jobs), 1, "There should be 1 job")
    self.assertEqual(len(jobs[0]['input_files']), 100,
                     "All 100 files must be in the job")

    # Create 7 files, each one with different lumi/event distributions.
    # The trailing comments give the expected job assignment per file.
    testFileset = Fileset(name="FilesetA")
    testFileA = self.createFile("/this/is/file1", 250, 0, 5, "blenheim")   # job1, job2
    testFileB = self.createFile("/this/is/file2", 600, 1, 1, "blenheim")   # job3
    testFileC = self.createFile("/this/is/file3", 1200, 2, 2, "blenheim")  # job4, job5
    testFileD = self.createFile("/this/is/file4", 100, 3, 1, "blenheim")   # job6
    testFileE = self.createFile("/this/is/file5", 30, 4, 1, "blenheim")    # job6
    testFileF = self.createFile("/this/is/file6", 10, 5, 1, "blenheim")    # job6
    testFileG = self.createFile("/this/is/file7", 151, 6, 3, "blenheim")   # job7
    for testFile in (testFileA, testFileB, testFileC, testFileD,
                     testFileE, testFileF, testFileG):
        testFileset.addFile(testFile)

    testSubscription = Subscription(fileset=testFileset,
                                    workflow=self.testWorkflow,
                                    split_algo="EventAwareLumiBased",
                                    type="Processing")
    jobFactory = splitter(package="WMCore.DataStructs",
                          subscription=testSubscription)

    # Optimal settings are: jobs with 150 events per job
    # This means, the first file must be splitted in 3 lumis per job which
    # would leave room for another lumi in the second job, but the second
    # file has a lumi too big for that. The 3rd job only contains the second
    # file, the fourth and fifth job split the third file.
    jobGroups = jobFactory(halt_job_on_file_boundaries=False, splitOnRun=False,
                           events_per_job=150,
                           performance=self.performanceParams)
    self.assertEqual(len(jobGroups), 1, "There should be only one job group")
    jobs = jobGroups[0].jobs
    self.assertEqual(len(jobs), 7, "Seven jobs must be in the jobgroup")
    self.assertEqual(jobs[0]["mask"].getRunAndLumis(), {0: [[0, 2]]},
                     "Wrong mask for the first job")
    self.assertEqual(jobs[1]["mask"].getRunAndLumis(), {0: [[3, 4]]},
                     "Wrong mask for the second job")
    self.assertEqual(jobs[2]["mask"].getRunAndLumis(), {1: [[1, 1]]},
                     "Wrong mask for the third job")
    self.assertEqual(jobs[3]["mask"].getRunAndLumis(), {2: [[4, 4]]},
                     "Wrong mask for the fourth job")
    self.assertEqual(jobs[4]["mask"].getRunAndLumis(), {2: [[5, 5]]},
                     "Wrong mask for the fifth job")
    self.assertEqual(jobs[5]["mask"].getRunAndLumis(),
                     {3: [[3, 3]], 4: [[4, 4]], 5: [[5, 5]]},
                     "Wrong mask for the sixth job")
    self.assertEqual(jobs[6]["mask"].getRunAndLumis(), {6: [[18, 20]]},
                     "Wrong mask for the seventh job")

    # Test interactions of this algorithm with splitOnRun = True
    # Make 2 files, one with 3 runs and a second one with the last run of the first
    fileA = File(lfn="/this/is/file1", size=1000, events=2400)
    # Fix: the original code also built lumiListB and lumiListC with identical
    # contents but never used them; only this list is passed to addRun().
    lumiListA = list(range(1, 9))
    fileA.addRun(Run(1, *lumiListA))
    fileA.addRun(Run(2, *lumiListA))
    fileA.addRun(Run(3, *lumiListA))
    fileA.setLocation("malpaquet")
    fileB = self.createFile('/this/is/file2', 200, 3, 5, "malpaquet")

    testFileset = Fileset(name='FilesetB')
    testFileset.addFile(fileA)
    testFileset.addFile(fileB)
    testSubscription = Subscription(fileset=testFileset,
                                    workflow=self.testWorkflow,
                                    split_algo="EventAwareLumiBased",
                                    type="Processing")
    jobFactory = splitter(package="WMCore.DataStructs",
                          subscription=testSubscription)

    # The settings for this splitting are 700 events per job
    jobGroups = jobFactory(splitOnRun=True, halt_job_on_file_boundaries=False,
                           events_per_job=700,
                           performance=self.performanceParams)
    self.assertEqual(len(jobGroups), 1, "There should be only one job group")
    jobs = jobGroups[0].jobs
    self.assertEqual(len(jobs), 6, "Six jobs must be in the jobgroup")
def createSubscription(self, nFiles, lumisPerFile, twoSites=False):
    """
    _createSubscription_

    Create a LumiBased Processing subscription for testing.

    Builds `nFiles` files located at 'blenheim', each with `lumisPerFile`
    consecutive lumis in its own run, and — when `twoSites` is True — a
    second, identical batch located at 'malpaquet' (LFNs suffixed '_2').
    """
    baseName = makeUUID()
    testFileset = Fileset(name=baseName)

    def _addFiles(site, lfnSuffix):
        # One file per index i: run i with lumis [i*100, i*100 + lumisPerFile).
        # Deduplicates the two previously copy-pasted loops.
        for i in range(nFiles):
            newFile = File(lfn='%s_%i%s' % (baseName, i, lfnSuffix),
                           size=1000, events=100)
            lumis = [(i * 100) + lumi for lumi in range(lumisPerFile)]
            newFile.addRun(Run(i, *lumis))
            newFile.setLocation(site)
            testFileset.addFile(newFile)

    _addFiles('blenheim', '')
    if twoSites:
        _addFiles('malpaquet', '_2')

    testSubscription = Subscription(fileset=testFileset,
                                    workflow=self.testWorkflow,
                                    split_algo="LumiBased",
                                    type="Processing")
    return testSubscription
def setUp(self):
    """
    _setUp_

    Prepare one fileset of ten two-site files (20 lumis each) and one
    fileset holding a single file with two disjoint lumi ranges, then
    build FileBased Processing subscriptions around both.
    """
    self.multipleFileFileset = Fileset(name="TestFileset1")
    for idx in range(10):
        multiFile = File(makeUUID(), size=1000, events=100)
        multiFile.setLocation('blenheim')
        multiFile.setLocation('malpaquet')
        # 20 consecutive lumis per file, offset by 100 per file index.
        multiFile.addRun(Run(idx, *[(idx * 100) + l for l in range(20)]))
        self.multipleFileFileset.addFile(multiFile)

    self.singleFileFileset = Fileset(name="TestFileset2")
    loneFile = File("/some/file/name", size=1000, events=100)
    loneFile.setLocation('blenheim')
    # Run 13 with two disjoint lumi ranges: [50, 60) and [70, 80).
    loneFile.addRun(Run(13, *(list(range(50, 60)) + list(range(70, 80)))))
    self.singleFileFileset.addFile(loneFile)

    testWorkflow = Workflow()
    self.multipleFileSubscription = Subscription(fileset=self.multipleFileFileset,
                                                 workflow=testWorkflow,
                                                 split_algo="FileBased",
                                                 type="Processing")
    self.singleFileSubscription = Subscription(fileset=self.singleFileFileset,
                                               workflow=testWorkflow,
                                               split_algo="FileBased",
                                               type="Processing")

    # Job-splitting performance knobs shared by the tests in this class.
    self.performanceParams = {'timePerEvent': 12,
                              'memoryRequirement': 2300,
                              'sizePerEvent': 400}
    return
def testChunking(self):
    """
    _testChunking_

    Insert a workload and files that have several distinct sets of locations.
    Verify that the chunks are created correctly and that they only groups
    files that have the same set of locations.  Also verify that the chunks
    are pulled out of ACDC correctly.
    """
    dcs = DataCollectionService(url=self.testInit.couchUrl,
                                database="wmcore-acdc-datacollectionsvc")

    # Job A: three files at two sites (run 1, 2 lumis each).
    testFileA = File(lfn=makeUUID(), size=1024, events=1024)
    testFileA.setLocation(["cmssrm.fnal.gov", "castor.cern.ch"])
    testFileA.addRun(Run(1, 1, 2))
    testFileB = File(lfn=makeUUID(), size=1024, events=1024)
    testFileB.setLocation(["cmssrm.fnal.gov", "castor.cern.ch"])
    testFileB.addRun(Run(1, 3, 4))
    testFileC = File(lfn=makeUUID(), size=1024, events=1024)
    testFileC.setLocation(["cmssrm.fnal.gov", "castor.cern.ch"])
    testFileC.addRun(Run(1, 5, 6))
    testJobA = self.getMinimalJob()
    testJobA.addFile(testFileA)
    testJobA.addFile(testFileB)
    testJobA.addFile(testFileC)

    # Job B: two files at a single site (run 2).
    testFileD = File(lfn=makeUUID(), size=1024, events=1024)
    testFileD.setLocation(["cmssrm.fnal.gov"])
    testFileD.addRun(Run(2, 1, 2))
    testFileE = File(lfn=makeUUID(), size=1024, events=1024)
    testFileE.setLocation(["cmssrm.fnal.gov"])
    testFileE.addRun(Run(2, 3, 4))
    testJobB = self.getMinimalJob()
    testJobB.addFile(testFileD)
    testJobB.addFile(testFileE)

    # Job C: three files at three sites (run 3), each with a parent LFN.
    testFileF = File(lfn=makeUUID(), size=1024, events=1024,
                     parents={"/some/parent/F"})
    testFileF.setLocation(["cmssrm.fnal.gov", "castor.cern.ch", "srm.ral.uk"])
    testFileF.addRun(Run(3, 1, 2))
    testFileG = File(lfn=makeUUID(), size=1024, events=1024,
                     parents={"/some/parent/G"})
    testFileG.setLocation(["cmssrm.fnal.gov", "castor.cern.ch", "srm.ral.uk"])
    testFileG.addRun(Run(3, 3, 4))
    testFileH = File(lfn=makeUUID(), size=1024, events=1024,
                     parents={"/some/parent/H"})
    testFileH.setLocation(["cmssrm.fnal.gov", "castor.cern.ch", "srm.ral.uk"])
    testFileH.addRun(Run(3, 5, 6))
    testJobC = self.getMinimalJob()
    testJobC.addFile(testFileF)
    testJobC.addFile(testFileG)
    testJobC.addFile(testFileH)

    # Job D: three merged files at the same two sites as job A (run 4), so
    # its files share a location set with A's and chunk together with them.
    testFileI = File(lfn=makeUUID(), size=1024, events=1024, merged=True)
    testFileI.setLocation(["cmssrm.fnal.gov", "castor.cern.ch"])
    testFileI.addRun(Run(4, 1, 2))
    testFileJ = File(lfn=makeUUID(), size=1024, events=1024, merged=True)
    testFileJ.setLocation(["cmssrm.fnal.gov", "castor.cern.ch"])
    testFileJ.addRun(Run(4, 3, 4))
    testFileK = File(lfn=makeUUID(), size=1024, events=1024, merged=True)
    testFileK.setLocation(["cmssrm.fnal.gov", "castor.cern.ch"])
    testFileK.addRun(Run(4, 5, 6))
    testJobD = self.getMinimalJob()
    testJobD.addFile(testFileI)
    testJobD.addFile(testFileJ)
    testJobD.addFile(testFileK)

    dcs.failedJobs([testJobA, testJobB, testJobC, testJobD])
    # chunkSize=5: the six two-site files split into one chunk of 5 and
    # one chunk of 1; the other two location sets give one chunk each.
    chunks = dcs.chunkFileset("ACDCTest", "/ACDCTest/reco", chunkSize=5)

    self.assertEqual(len(chunks), 4,
                     "Error: There should be four chunks: %s" % len(chunks))

    # Expected per-chunk metadata, keyed by the chunk's file count.
    goldenMetaData = {1: {"lumis": 2,
                          "locations": ["castor.cern.ch", "cmssrm.fnal.gov"],
                          "events": 1024},
                      2: {"lumis": 4,
                          "locations": ["cmssrm.fnal.gov"],
                          "events": 2048},
                      3: {"lumis": 6,
                          "locations": ["castor.cern.ch", "cmssrm.fnal.gov",
                                        "srm.ral.uk"],
                          "events": 3072},
                      5: {"lumis": 10,
                          "locations": ["castor.cern.ch", "cmssrm.fnal.gov"],
                          "events": 5120}}

    # The two-site files are chunked in LFN order, so the lexicographically
    # largest LFN ends up alone in the 1-file chunk.
    testFiles = [testFileA, testFileB, testFileC, testFileI, testFileJ,
                 testFileK]
    lastFile = testFileA
    for testFile in testFiles:
        if lastFile["lfn"] < testFile["lfn"]:
            lastFile = testFile
    testFiles.remove(lastFile)

    # Expected chunk contents, keyed by the chunk's file count.
    goldenFiles = {1: [lastFile],
                   2: [testFileD, testFileE],
                   3: [testFileF, testFileG, testFileH],
                   5: testFiles}

    for chunk in chunks:
        # getChunkInfo must agree with the summary returned by chunkFileset.
        chunkMetaData = dcs.getChunkInfo("ACDCTest", "/ACDCTest/reco",
                                         chunk["offset"], chunk["files"])
        self.assertEqual(chunkMetaData["files"], chunk["files"])
        self.assertEqual(chunkMetaData["lumis"], chunk["lumis"])
        self.assertEqual(chunkMetaData["events"], chunk["events"])
        self.assertEqual(chunkMetaData["locations"], chunk["locations"])

        self.assertTrue(chunk["files"] in goldenMetaData,
                        "Error: Extra chunk found.")
        self.assertEqual(chunk["lumis"],
                         goldenMetaData[chunk["files"]]["lumis"],
                         "Error: Lumis in chunk is wrong.")
        self.assertEqual(chunk["locations"],
                         goldenMetaData[chunk["files"]]["locations"],
                         "Error: Locations in chunk is wrong.")
        self.assertEqual(chunk["events"],
                         goldenMetaData[chunk["files"]]["events"],
                         "Error: Events in chunk is wrong.")
        # Each golden entry may match at most one chunk.
        del goldenMetaData[chunk["files"]]

        chunkFiles = dcs.getChunkFiles("ACDCTest", "/ACDCTest/reco",
                                       chunk["offset"], chunk["files"])
        self.assertTrue(chunk["files"] in goldenFiles,
                        "Error: Extra chunk found.")
        goldenChunkFiles = goldenFiles[chunk["files"]]
        self.assertEqual(len(chunkFiles), len(goldenChunkFiles))

        for chunkFile in chunkFiles:
            # Match the retrieved file to its golden counterpart by LFN,
            # then compare every attribute ACDC is expected to round-trip.
            foundFile = None
            for goldenChunkFile in goldenChunkFiles:
                if chunkFile["lfn"] == goldenChunkFile["lfn"]:
                    foundFile = goldenChunkFile
                    break
            self.assertIsNotNone(foundFile,
                                 "Error: Missing chunk file: %s, %s" %
                                 (chunkFiles, goldenChunkFiles))
            self.assertEqual(set(foundFile["parents"]), chunkFile["parents"],
                             "Error: File parents should match.")
            self.assertEqual(foundFile["merged"], chunkFile["merged"],
                             "Error: File merged status should match.")
            self.assertEqual(foundFile["locations"], chunkFile["locations"],
                             "Error: File locations should match.")
            self.assertEqual(foundFile["events"], chunkFile["events"])
            self.assertEqual(foundFile["size"], chunkFile["size"])
            self.assertEqual(len(foundFile["runs"]), len(chunkFile["runs"]),
                             "Error: Wrong number of runs.")
            for run in foundFile["runs"]:
                runMatch = False
                for chunkRun in chunkFile["runs"]:
                    if chunkRun.run == run.run and chunkRun.lumis == run.lumis:
                        runMatch = True
                        break
                self.assertTrue(runMatch, "Error: Run information is wrong.")

        del goldenFiles[chunk["files"]]

    # A single chunk covering the whole fileset: 11 files, 22 lumis,
    # 11 * 1024 = 11264 events, union of all three location sets.
    singleChunk = dcs.singleChunkFileset("ACDCTest", "/ACDCTest/reco")
    self.assertEqual(singleChunk, {"offset": 0,
                                   "files": 11,
                                   "events": 11264,
                                   "lumis": 22,
                                   "locations": {"castor.cern.ch",
                                                 "cmssrm.fnal.gov",
                                                 "srm.ral.uk"}},
                     "Error: Single chunk metadata is wrong")
    return
def testFailFiles(self):
    """
    Testcase for the failFiles method of the Subscription Class
    """
    # Cleaning possible files already occupying the available set
    self.dummySubscription.failFiles([])

    # First test - Test if initial file (on available set) is inserted in the
    # failed set - no arguments
    dummyFile2 = File('/tmp/dummyfile2,8888', 1, 1, 1)
    # Insert dummyFile2 into the available files Set at dummySubscription
    self.dummySubscription.available.addFile(dummyFile2)
    S = self.dummySubscription.availableFiles()
    # Fail all files
    self.dummySubscription.failFiles(S)
    assert len(self.dummySubscription.availableFiles()) == 0, \
        "failed subscription still has %s files, what's up with that?" % \
        len(self.dummySubscription.availableFiles())

    # Second test - Test if target files are inserted at the failed set
    dummyFileList = []
    # Populating the dummy List with a random number of files
    # (between 99 and 999 files, one random run/lumi pair each)
    for i in range(1, random.randint(100, 1000)):
        lfn = '/store/data/%s/%s/file.root' % (random.randint(1000, 9999),
                                               random.randint(1000, 9999))
        size = random.randint(1000, 2000)
        events = 1000
        run = random.randint(0, 2000)
        lumi = random.randint(0, 8)

        file = File(lfn=lfn, size=size, events=events,
                    checksums={"cksum": "1"})
        file.addRun(Run(run, *[lumi]))
        dummyFileList.append(file)
    # Add the new files
    # NOTE(review): the whole list is passed as a single addFile() argument
    # here (not added file-by-file) — confirm Fileset.addFile accepts an
    # iterable in this DataStructs implementation.
    self.dummySubscription.available.addFile(dummyFileList)
    # and fail them
    self.dummySubscription.failFiles(files=dummyFileList)
    # Check there are no files available - everything should be failed
    assert len(self.dummySubscription.availableFiles()) == 0, \
        "failed subscription still has %s files, what's up with that?" % \
        len(self.dummySubscription.availableFiles())

    # Check if all files were inserted at subscription's failed files Set
    # NOTE(review): the failure message reads x.dict['lfn'] — verify File
    # exposes a 'dict' attribute; it is only evaluated when the assert fires.
    for x in dummyFileList:
        assert x in self.dummySubscription.failed.getFiles(type='set'), \
            'Couldn\'t make file failed %s' % x.dict['lfn']

    # Third test - Test if a replicate file is erased from the other Sets,
    # when a file is considered failed
    dummyFile3 = File('/tmp/dummyfile3,5555', 1, 1, 1)
    dummyFileList = []
    dummyFileList.append(dummyFile3)
    # Inserting dummyFile3 to be used as an argument, into each of the other
    # file sets
    self.dummySubscription.acquired.addFile(dummyFile3)
    self.dummySubscription.available.addFile(dummyFile3)
    self.dummySubscription.completed.addFile(dummyFile3)
    # Run the method failFiles
    self.dummySubscription.failFiles(files=dummyFileList)
    # Check if dummyFile3 was inserted at the failed Set
    assert dummyFile3 in self.dummySubscription.failed.getFiles(type='set'), \
        'Replicated file could\'nt be inserted at failed Set'
    # Check if dummyFile3 was erased from all the other Sets
    assert dummyFile3 not in self.dummySubscription.acquired.getFiles(type='set'), \
        'Failed file still present at acquired Set'
    assert dummyFile3 not in self.dummySubscription.completed.getFiles(type='set'), \
        'Failed file still present at completed Set'
    assert dummyFile3 not in self.dummySubscription.available.getFiles(type='set'), \
        'Failed file still present at available Set'
def formatOutput(self, task, requestname, datasetfiles, locations):
    """
    Receives as input the result of the data location discovery operations
    and fill up the WMCore objects.

    NOTE(review): this is Python 2 code (`except KeyError, ke`, `print`
    statements, `iteritems()`); it will not parse under Python 3.  A later
    variant of this method exists elsewhere in this file — this body also
    appears to stop after the location mapping loop, before the lumi/event
    accounting done by that sibling; confirm against the full file.
    """
    self.logger.debug(" Formatting data discovery output ")
    # TEMPORARY
    # Cache of PNN -> PSN translations so SiteDB is queried once per PNN.
    pnn_psn_map = {}
    sbj = SiteDBJSON({
        "key": self.config.TaskWorker.cmskey,
        "cert": self.config.TaskWorker.cmscert
    })

    wmfiles = []
    event_counter = 0
    lumi_counter = 0
    file_counter = 0
    uniquelumis = set()
    ## Loop over the sorted list of files.
    for lfn, infos in datasetfiles.iteritems():
        ## Skip the file if the block has not been found or has no locations.
        if not infos['BlockName'] in locations or not locations[
                infos['BlockName']]:
            self.logger.warning(
                "Skipping %s because its block (%s) has no locations" %
                (lfn, infos['BlockName']))
            continue
        ## Skip the file if it is not in VALID state.
        if not infos.get('ValidFile', True):
            self.logger.warning("Skipping invalid file %s" % lfn)
            continue

        # useParents requires every input file to have at least one parent.
        if task['tm_use_parent'] == 1 and len(infos['Parents']) == 0:
            raise TaskWorkerException(
                "The CRAB3 server backend refuses to submit jobs to the Grid scheduler\n" +
                "because you specified useParents=True but some your files have no" +
                "parents.\nExample: " + lfn)
        ## Createa a WMCore File object.
        wmfile = File(lfn=lfn,
                      events=infos['NumberOfEvents'],
                      size=infos['Size'],
                      checksums=infos['Checksums'],
                      parents=infos['Parents'])
        wmfile['block'] = infos['BlockName']
        wmfile['locations'] = []
        for pnn in locations[infos['BlockName']]:
            # Translate each not-yet-seen PNN through SiteDB; failures are
            # logged and the PNN is mapped to '' (KeyError) or left unmapped
            # (HTTPException), so the file simply gets no location for it.
            if pnn and pnn not in pnn_psn_map:
                self.logger.debug("Translating PNN %s" % pnn)
                try:
                    pnn_psn_map[pnn] = sbj.PNNtoPSN(pnn)
                except KeyError, ke:
                    self.logger.error(
                        "Impossible translating %s to a CMS name through SiteDB"
                        % pnn)
                    pnn_psn_map[pnn] = ''
                except httplib.HTTPException, ex:
                    self.logger.error("Couldn't map SE to site: %s" % pnn)
                    print "Couldn't map SE to site: %s" % pnn
                    print "got problem: %s" % ex
                    print "got another problem: %s" % ex.__dict__
            if pnn and pnn in pnn_psn_map:
                # A PNN may map to one PSN or to a list of PSNs.
                if type(pnn_psn_map[pnn]) == list:
                    wmfile['locations'].extend(pnn_psn_map[pnn])
                else:
                    wmfile['locations'].append(pnn_psn_map[pnn])
def formatOutput(self, task, requestname, datasetfiles, locations):
    """
    Receives as input the result of the data location discovery operations
    and fill up the WMCore objects.

    NOTE(review): uses dict.iteritems(), so this body is Python 2 only even
    though it already uses `except ... as` and print() calls.
    """
    self.logger.debug(" Formatting data discovery output ")
    # TEMPORARY
    # Cache of PNN -> PSN translations so SiteDB is queried once per PNN.
    pnn_psn_map = {}
    sbj = SiteDBJSON({"key": self.config.TaskWorker.cmskey,
                      "cert": self.config.TaskWorker.cmscert})

    wmfiles = []
    event_counter = 0
    lumi_counter = 0
    file_counter = 0       # incremented per accepted file; wmfiles length is what gets logged
    uniquelumis = set()
    ## Loop over the sorted list of files.
    for lfn, infos in datasetfiles.iteritems():
        ## Skip the file if the block has not been found or has no locations.
        if not infos['BlockName'] in locations or not locations[infos['BlockName']]:
            self.logger.warning("Skipping %s because its block (%s) has no locations" %
                                (lfn, infos['BlockName']))
            continue
        ## Skip the file if it is not in VALID state.
        if not infos.get('ValidFile', True):
            self.logger.warning("Skipping invalid file %s" % lfn)
            continue

        # useParents requires every input file to have at least one parent.
        if task['tm_use_parent'] == 1 and len(infos['Parents']) == 0:
            raise TaskWorkerException(
                "The CRAB3 server backend refuses to submit jobs to the Grid scheduler\n" +
                "because you specified useParents=True but some your files have no" +
                "parents.\nExample: " + lfn)
        ## Createa a WMCore File object.
        try:
            size = infos['FileSize']
            checksums = {'Checksum': infos['Checksum'],
                         'Adler32': infos['Adler32'],
                         'Md5': infos['Md5']}
        except:
            #This is so that the task worker does not crash if an old version of WMCore is used (the interface of an API suddenly changed).
            # We may want to remove the try/except and the following two lines eventually, but keeping them for the moment so other devels won't be affected
            #See this WMCore commit: https://github.com/dmwm/WMCore/commit/2afc01ae571390f5fa009dd258be757adac89c28#diff-374b7a6640288184175057234e393e1cL204
            size = infos['Size']
            checksums = infos['Checksums']
        wmfile = File(lfn = lfn, events = infos['NumberOfEvents'], size = size,
                      checksums = checksums, parents = infos['Parents'])
        wmfile['block'] = infos['BlockName']
        wmfile['locations'] = []
        for pnn in locations[infos['BlockName']]:
            # Translate each not-yet-seen PNN through SiteDB; failures are
            # logged and the PNN is mapped to '' (KeyError) or left unmapped
            # (HTTPException), so the file simply gets no location for it.
            if pnn and pnn not in pnn_psn_map:
                self.logger.debug("Translating PNN %s" %pnn)
                try:
                    pnn_psn_map[pnn] = sbj.PNNtoPSN(pnn)
                except KeyError as ke:
                    self.logger.error("Impossible translating %s to a CMS name through SiteDB" %pnn)
                    pnn_psn_map[pnn] = ''
                except httplib.HTTPException as ex:
                    self.logger.error("Couldn't map SE to site: %s" % pnn)
                    print("Couldn't map SE to site: %s" % pnn)
                    print("got problem: %s" % ex)
                    print("got another problem: %s" % ex.__dict__)
            if pnn and pnn in pnn_psn_map:
                # A PNN may map to one PSN or to a list of PSNs.
                if isinstance(pnn_psn_map[pnn], list):
                    wmfile['locations'].extend(pnn_psn_map[pnn])
                else:
                    wmfile['locations'].append(pnn_psn_map[pnn])
        wmfile['workflow'] = requestname
        event_counter += infos['NumberOfEvents']
        # Register every (run, lumi) pair; uniquelumis deduplicates across
        # files while lumi_counter counts them with multiplicity.
        for run, lumis in infos['Lumis'].iteritems():
            wmfile.addRun(Run(run, *lumis))
            for lumi in lumis:
                uniquelumis.add((run, lumi))
            lumi_counter += len(lumis)
        wmfiles.append(wmfile)
        file_counter += 1

    # Rebind the set to its cardinality for the summary logging below.
    uniquelumis = len(uniquelumis)
    self.logger.debug('Tot events found: %d' % event_counter)
    self.logger.debug('Tot lumis found: %d' % uniquelumis)
    self.logger.debug('Duplicate lumis found: %d' % (lumi_counter - uniquelumis))
    self.logger.debug('Tot files found: %d' % len(wmfiles))

    return Result(task = task, result = Fileset(name = 'FilesToSplit', files = set(wmfiles)))
def testGetLumiWhitelist(self):
    """
    _testGetLumiWhitelist_

    Verify that the ACDC whitelist generation code works correctly.  We'll
    add jobs with the following lumi info:
      # Run 1, lumis [1, 2, 3], [4, 6], [7], [9], [11, 12]
      # Run 2, lumis [5, 6, 7], [10, 11, 12], [15]
      # Run 3, lumis [20]

    And should get out a whitelist that looks like this:
      {"1": [[1, 4], [6, 7], [9, 9], [11, 12]],
       "2": [[5, 7], [10, 12], [15, 15]],
       "3": [[20, 20]]}
    """
    dcs = DataCollectionService(url = self.testInit.couchUrl,
                                database = "wmcore-acdc-datacollectionsvc")

    def getJob():
        # Minimal failed-job skeleton shared by all the files below.
        job = Job()
        job["task"] = "/ACDCTest/reco"
        job["workflow"] = "ACDCTest"
        job["location"] = "cmssrm.fnal.gov"
        job["owner"] = "cmsdataops"
        job["group"] = "cmsdataops"
        return job

    # Run 1: lumis [1, 2] and [3] split over two files of the same job,
    # so they should merge into a single whitelist range with [4, 6] below.
    testFileA = File(lfn = makeUUID(), size = 1024, events = 1024)
    testFileA.addRun(Run(1, 1, 2))
    testFileB = File(lfn = makeUUID(), size = 1024, events = 1024)
    testFileB.addRun(Run(1, 3))
    testJobA = getJob()
    testJobA.addFile(testFileA)
    testJobA.addFile(testFileB)

    testFileC = File(lfn = makeUUID(), size = 1024, events = 1024)
    testFileC.addRun(Run(1, 4, 6))
    testJobB = getJob()
    testJobB.addFile(testFileC)

    testFileD = File(lfn = makeUUID(), size = 1024, events = 1024)
    testFileD.addRun(Run(1, 7))
    testJobC = getJob()
    testJobC.addFile(testFileD)

    testFileE = File(lfn = makeUUID(), size = 1024, events = 1024)
    testFileE.addRun(Run(1, 11, 12))
    testJobD = getJob()
    testJobD.addFile(testFileE)

    # Run 2 lumis.
    testFileF = File(lfn = makeUUID(), size = 1024, events = 1024)
    testFileF.addRun(Run(2, 5, 6, 7))
    testJobE = getJob()
    testJobE.addFile(testFileF)

    testFileG = File(lfn = makeUUID(), size = 1024, events = 1024)
    testFileG.addRun(Run(2, 10, 11, 12))
    testJobF = getJob()
    testJobF.addFile(testFileG)

    testFileH = File(lfn = makeUUID(), size = 1024, events = 1024)
    testFileH.addRun(Run(2, 15))
    testJobG = getJob()
    testJobG.addFile(testFileH)

    # Run 3: single lumi.
    testFileI = File(lfn = makeUUID(), size = 1024, events = 1024)
    testFileI.addRun(Run(3, 20))
    testJobH = getJob()
    testJobH.addFile(testFileI)

    # Run 1 lumi 9: isolated, must come out as the degenerate range [9, 9].
    testFileJ = File(lfn = makeUUID(), size = 1024, events = 1024)
    testFileJ.addRun(Run(1, 9))
    testJobI = getJob()
    testJobI.addFile(testFileJ)

    dcs.failedJobs([testJobA, testJobB, testJobC, testJobD, testJobE,
                    testJobF, testJobG, testJobH, testJobI])
    whiteList = dcs.getLumiWhitelist("ACDCTest", "/ACDCTest/reco")

    # Runs come back as string keys; adjacent/overlapping lumis are merged
    # into inclusive [first, last] ranges.
    self.assertEqual(len(whiteList.keys()), 3,
                     "Error: There should be 3 runs.")
    self.assertEqual(whiteList["1"], [[1, 4], [6, 7], [9, 9], [11, 12]],
                     "Error: Whitelist for run 1 is wrong.")
    self.assertEqual(whiteList["2"], [[5, 7], [10, 12], [15, 15]],
                     "Error: Whitelist for run 2 is wrong.")
    self.assertEqual(whiteList["3"], [[20, 20]],
                     "Error: Whitelist for run 3 is wrong.")
    return
def setUp(self):
    """
    _setUp_

    Build the four filesets exercised by the FixedDelay splitting tests
    (multi-file, single-file, multi-lumi and single-lumi) and wrap each
    one in a Processing subscription using the FixedDelay algorithm.
    """
    def stampedFile(lfn, run, lumi):
        # One 1000-byte / 100-event file carrying a single (run, lumi) pair.
        newFile = File(lfn, size=1000, events=100)
        newFile.addRun(Run(run, lumi))
        return newFile

    def fixedDelaySub(fileset):
        # Every subscription in this fixture shares workflow and algorithm.
        return Subscription(fileset=fileset,
                            workflow=testWorkflow,
                            split_algo="FixedDelay",
                            type="Processing")

    self.multipleFileFileset = Fileset(name="TestFileset1")
    for idx in range(10):
        self.multipleFileFileset.addFile(stampedFile(makeUUID(), idx, 45 + idx))

    self.singleFileFileset = Fileset(name="TestFileset2")
    self.singleFileFileset.addFile(stampedFile("/some/file/name", 1, 45))

    self.multipleFileLumiset = Fileset(name="TestFileset3")
    for idx in range(10):
        self.multipleFileLumiset.addFile(stampedFile(makeUUID(), 1, 45 + idx // 3))

    self.singleLumiFileset = Fileset(name="TestFileset4")
    for idx in range(10):
        self.singleLumiFileset.addFile(stampedFile(makeUUID(), 1, 45))

    testWorkflow = Workflow()
    self.multipleFileSubscription = fixedDelaySub(self.multipleFileFileset)
    self.singleFileSubscription = fixedDelaySub(self.singleFileFileset)
    self.multipleLumiSubscription = fixedDelaySub(self.multipleFileLumiset)
    self.singleLumiSubscription = fixedDelaySub(self.singleLumiFileset)
    return
def testNoFileSplitNoHardLimit(self):
    """
    _testNoFileSplitNoHardLimit_

    In this case we don't split on file boundaries, check different
    combination of files make sure we make the most of the splitting,
    e.g. include many zero event files in a single job.
    """
    splitter = SplitterFactory()

    # Create 100 files with 7 lumi per file and 0 events per lumi on average.
    testSubscription = self.createSubscription(nFiles=100, lumisPerFile=7,
                                               twoSites=False, nEventsPerFile=0)
    jobFactory = splitter(package="WMCore.DataStructs",
                          subscription=testSubscription)

    # First test, the optimal settings are 360 events per job. As we have
    # files with 0 events per lumi, this will configure the splitting to a
    # single job containing all files.
    jobGroups = jobFactory(halt_job_on_file_boundaries=False, splitOnRun=False,
                           events_per_job=360,
                           performance=self.performanceParams)

    # One job in one job group with 100 files
    self.assertEqual(len(jobGroups), 1)
    jobs = jobGroups[0].jobs
    self.assertEqual(len(jobs), 1)
    self.assertEqual(len(jobs[0]['input_files']), 100)

    # Create 7 files, each one with different lumi/event distributions
    testFileset = Fileset(name="FilesetA")
    testFileA = self.createFile("/this/is/file1", 250, 0, 5, "blenheim")
    testFileB = self.createFile("/this/is/file2", 600, 1, 1, "blenheim")
    testFileC = self.createFile("/this/is/file3", 1200, 2, 2, "blenheim")
    testFileD = self.createFile("/this/is/file4", 100, 3, 1, "blenheim")
    testFileE = self.createFile("/this/is/file5", 30, 4, 1, "blenheim")
    testFileF = self.createFile("/this/is/file6", 10, 5, 1, "blenheim")
    testFileG = self.createFile("/this/is/file7", 153, 6, 3, "blenheim")
    for testFile in (testFileA, testFileB, testFileC, testFileD,
                     testFileE, testFileF, testFileG):
        testFileset.addFile(testFile)

    testSubscription = Subscription(fileset=testFileset,
                                    workflow=self.testWorkflow,
                                    split_algo="EventAwareLumiByWork",
                                    type="Processing")
    jobFactory = splitter(package="WMCore.DataStructs",
                          subscription=testSubscription)

    # Split the work targeting 150 events per job
    jobGroups = jobFactory(halt_job_on_file_boundaries=False, splitOnRun=False,
                           events_per_job=150,
                           performance=self.performanceParams)
    self.assertEqual(len(jobGroups), 1)
    jobs = jobGroups[0].jobs
    self.assertEqual(len(jobs), 7)

    # Test interactions of this algorithm with splitOnRun = True
    # Make 2 files, one with 3 runs and a second one with the last run of the first
    fileA = File(lfn="/this/is/file1", size=1000, events=2400)
    # Fix: the original code also built lumiListB and lumiListC with identical
    # contents but never used them; only this list is passed to addRun().
    lumiListA = list(range(1, 9))
    fileA.addRun(Run(1, *lumiListA))
    fileA.addRun(Run(2, *lumiListA))
    fileA.addRun(Run(3, *lumiListA))
    fileA.setLocation("malpaquet")
    fileB = self.createFile('/this/is/file2', 200, 3, 5, "malpaquet")

    testFileset = Fileset(name='FilesetB')
    testFileset.addFile(fileA)
    testFileset.addFile(fileB)
    testSubscription = Subscription(fileset=testFileset,
                                    workflow=self.testWorkflow,
                                    split_algo="EventAwareLumiByWork",
                                    type="Processing")
    jobFactory = splitter(package="WMCore.DataStructs",
                          subscription=testSubscription)

    # The settings for this splitting are 700 events per job
    jobGroups = jobFactory(splitOnRun=True, halt_job_on_file_boundaries=False,
                           events_per_job=700,
                           performance=self.performanceParams)
    self.assertEqual(len(jobGroups), 1)
    jobs = jobGroups[0].jobs
    self.assertEqual(len(jobs), 6)

    # Make sure each job has one run
    for job in jobs:
        self.assertEqual(len(job['mask'].getRunAndLumis()), 1)
def processDataset(self):
    """
    _processDataset_

    Import the Dataset contents and create a set of jobs from it.

    For every non-empty block of the input dataset, build a fileset of its
    files (tagged with the block's storage locations), run the MergeBySize
    splitter over it and convert each resulting job into a JobDefinition.

    Returns a list of JobDefinition objects.
    Raises SyntaxError if the splitter produces no job groups for a block
    (type kept for backward compatibility with existing callers).
    """
    # //
    # // Now create the job definitions
    #//
    logging.debug("MergeSize = %s" % self.mergeSize)
    logging.debug("AllowedSites = %s" % self.allowedSites)
    logging.debug("Connection to DBS at: %s" % self.dbsUrl)

    reader = DBSReader(self.dbsUrl)
    blockList = reader.dbs.listBlocks(dataset=self.inputDataset())
    jobDefs = []

    for block in blockList:
        blockName = block['Name']
        logging.debug("Getting files for block %s" % blockName)
        locations = reader.listFileBlockLocation(blockName)
        fileList = reader.dbs.listFiles(blockName=blockName)
        if not fileList:  # Skip empty blocks
            continue

        thefiles = Fileset(name='FilesToSplit')
        for f in fileList:
            f['Block']['StorageElementList'].extend(locations)
            wmbsFile = File(f['LogicalFileName'])
            # Plain loop instead of a side-effect list comprehension (PEP 8).
            for location in locations:
                wmbsFile['locations'].add(location)
            wmbsFile['block'] = blockName
            wmbsFile['size'] = f['FileSize']
            thefiles.addFile(wmbsFile)

        work = Workflow()
        subs = Subscription(fileset=thefiles,
                            workflow=work,
                            split_algo='MergeBySize',
                            type="Merge")
        logging.debug("Info for Subscription %s" % subs)
        splitter = SplitterFactory()
        jobfactory = splitter(subs)

        jobGroups = jobfactory(
            merge_size=self.mergeSize,  # min in Bytes
            all_files=True              # merge all files
        )
        if not jobGroups:
            # Same exception type as before (`raise (SyntaxError)`), but now
            # with a message instead of a bare, argument-less raise.
            raise SyntaxError(
                "No job groups created for block %s" % blockName)

        for jobGroup in jobGroups:
            for job in jobGroup.getJobs():
                jobDef = JobDefinition()
                jobDef['LFNS'].extend(job.getFiles(type='lfn'))
                jobDef['SkipEvents'] = 0
                jobDef['MaxEvents'] = -1
                # Plain loop instead of a side-effect list comprehension.
                for jobFile in job.getFiles():
                    jobDef['SENames'].extend(list(jobFile['locations']))
                jobDefs.append(jobDef)

    return jobDefs
def testCreatePopulateDrop(self):
    """
    _testCreatePopulateDrop_

    Test creating, populating and dropping a collection.
    """
    testCollectionA = CouchCollection(database=self.testInit.couchDbName,
                                      url=self.testInit.couchUrl,
                                      name="Thunderstruck")
    testCollectionB = CouchCollection(database=self.testInit.couchDbName,
                                      url=self.testInit.couchUrl,
                                      name="StruckThunder")
    testCollectionA.create()
    testCollectionB.create()

    # There should be nothing in couch.  Documents are only added for
    # filesets and files.
    testFilesA = []
    for i in range(5):
        testFile = File(lfn=makeUUID(), size=random.randint(1024, 4096),
                        events=random.randint(1024, 4096))
        testFilesA.append(testFile)

    testFilesB = []
    for i in range(10):
        testFile = File(lfn=makeUUID(), size=random.randint(1024, 4096),
                        events=random.randint(1024, 4096))
        testFilesB.append(testFile)

    testFilesetA = CouchFileset(database=self.testInit.couchDbName,
                                url=self.testInit.couchUrl,
                                name="TestFilesetA")
    testFilesetB = CouchFileset(database=self.testInit.couchDbName,
                                url=self.testInit.couchUrl,
                                name="TestFilesetB")
    testFilesetC = CouchFileset(database=self.testInit.couchDbName,
                                url=self.testInit.couchUrl,
                                name="TestFilesetC")
    testCollectionA.addFileset(testFilesetA)
    testCollectionB.addFileset(testFilesetB)
    testCollectionB.addFileset(testFilesetC)
    testFilesetA.add(testFilesA)
    testFilesetB.add(testFilesA)
    testFilesetC.add(testFilesA)
    testFilesetC.add(testFilesB)

    # Drop testCollectionA
    testCollectionA.drop()

    # Try to populate testFilesetA
    testCollectionC = CouchCollection(database=self.testInit.couchDbName,
                                      url=self.testInit.couchUrl,
                                      name="ThunderStruck")
    testCollectionC.populate()
    self.assertEqual(len(testCollectionC["filesets"]), 0,
                     "Error: There should be no filesets in this collect.")

    # Try to populate testFilesetB
    testCollectionD = CouchCollection(database=self.testInit.couchDbName,
                                      url=self.testInit.couchUrl,
                                      name="StruckThunder")
    testCollectionD.populate()
    for fileset in testCollectionD["filesets"]:
        # Bug fix: copy testFilesA before conditionally extending it.
        # The original code aliased testFiles to testFilesA, so the
        # extend() performed for TestFilesetC mutated testFilesA and
        # corrupted the expected file list for the other fileset,
        # depending on the (unordered) iteration order of the filesets.
        testFiles = list(testFilesA)
        if fileset["name"] == "TestFilesetC":
            testFiles.extend(testFilesB)

        self.assertEqual(len(testFiles), len(fileset.files.keys()),
                         "Error: Wrong number of files in fileset.")
        for testFile in testFiles:
            self.assertTrue(testFile["lfn"] in fileset.files.keys(),
                            "Error: File is missing.")
            self.assertEqual(testFile["events"],
                             fileset.files[testFile["lfn"]]["events"],
                             "Error: Wrong number of events.")
            self.assertEqual(testFile["size"],
                             fileset.files[testFile["lfn"]]["size"],
                             "Error: Wrong file size.")
    return
def testRunWhiteList(self):
    """
    _testRunWhiteList_

    Test that we can use a run white list to filter good runs/lumis.
    """
    splitter = SplitterFactory()

    # Three files with 100 events per lumi:
    #  - file1: one run (1) with 8 lumis (10-17)
    #  - file2: two runs (2, 3) with 2 lumis each (20-21, 30-31)
    #  - file3: one run (4) with 5 lumis (40-44)
    fileA = File(lfn="/this/is/file1", size=1000, events=800)
    fileB = File(lfn="/this/is/file2", size=1000, events=400)
    fileC = File(lfn="/this/is/file3", size=1000, events=500)

    fileA.addRun(Run(1, *range(10, 18)))
    fileA.setLocation("somese.cern.ch")
    fileB.addRun(Run(2, *range(20, 22)))
    fileB.addRun(Run(3, *range(30, 32)))
    fileB.setLocation("somese.cern.ch")
    fileC.addRun(Run(4, *range(40, 45)))
    fileC.setLocation("somese.cern.ch")

    testFileset = Fileset(name='Fileset')
    for inputFile in (fileA, fileB, fileC):
        testFileset.addFile(inputFile)

    testSubscription = Subscription(fileset=testFileset,
                                    workflow=self.testWorkflow,
                                    split_algo="EventAwareLumiByWork",
                                    type="Processing")
    jobFactory = splitter(package="WMCore.DataStructs",
                          subscription=testSubscription)

    # Split with no breaks
    jobGroups = jobFactory(halt_job_on_file_boundaries=False,
                           splitOnRun=False,
                           events_per_job=725,
                           runWhitelist=[1, 4],
                           performance=self.performanceParams)
    self.assertEqual(len(jobGroups), 1)
    jobs = jobGroups[0].jobs
    self.assertEqual(len(jobs), 2)
    for job in jobs:
        for run in job['mask'].getRunAndLumis():
            self.assertIn(run, [1, 4])

    # Re-split with a break on runs
    jobGroups = jobFactory(halt_job_on_file_boundaries=False,
                           splitOnRun=True,
                           events_per_job=595,
                           runWhitelist=[1, 3, 4],
                           performance=self.performanceParams)
    self.assertEqual(len(jobGroups), 1)
    jobs = jobGroups[0].jobs
    self.assertEqual(len(jobs), 4)
    self.enforceLimits(jobs=jobs, runsPerJob=1)
    for job in jobs:
        for run in job['mask'].getRunAndLumis():
            self.assertIn(run, [1, 3, 4])

    # Re-split with a break on files
    jobGroups = jobFactory(halt_job_on_file_boundaries=True,
                           splitOnRun=False,
                           events_per_job=595,
                           runWhitelist=[1, 2, 3],
                           performance=self.performanceParams)
    self.assertEqual(len(jobGroups), 1)
    jobs = jobGroups[0].jobs
    self.assertEqual(len(jobs), 3)
    self.enforceLimits(jobs=jobs, filesPerJob=1)
    for job in jobs:
        for run in job['mask'].getRunAndLumis():
            self.assertIn(run, [1, 2, 3])
def testAcquireFiles(self):
    """
    Testcase for the acquireFiles method of the Subscription Class
    """
    # Flush whatever is already sitting in the available set
    self.dummySubscription.acquireFiles()

    # First test - no arguments: the file placed in the available set
    # must be what acquireFiles() returns
    dummyFile2 = File('/tmp/dummyfile2,8888', 1, 1, 1)
    self.dummySubscription.available.addFile(dummyFile2)
    availableNew = self.dummySubscription.available.listNewFiles()
    assert availableNew == self.dummySubscription.acquireFiles(), \
        'Couldn\'t acquire file using method acquireFiles - (no arguments test)'

    # Second test - explicit file list: every file handed in must land
    # in the acquired set
    dummyFileList = set()
    # Populate with a random number of random files
    for _ in range(1, random.randint(100, 1000)):
        lfn = '/store/data/%s/%s/file.root' % (random.randint(1000, 9999),
                                               random.randint(1000, 9999))
        newFile = File(lfn=lfn,
                       size=random.randint(1000, 2000),
                       events=1000,
                       checksums={"cksum": "1"})
        newFile.addRun(Run(random.randint(0, 2000), *[random.randint(0, 8)]))
        dummyFileList.add(newFile)

    # Return value must be the set of files we asked for
    acqFiles = self.dummySubscription.acquireFiles(files=dummyFileList)
    assert acqFiles == dummyFileList, \
        'Return value for acquireFiles method not the acquired files'
    # And each file must now be in the acquired fileset
    for x in dummyFileList:
        assert x in self.dummySubscription.acquired.getFiles(type='set'), \
            'Couldn\'t acquire File %s' % x.dict['lfn']

    # Third test - a file replicated into the other sets must be removed
    # from them when it is acquired
    dummyFile3 = File('/tmp/dummyfile3,5555', 1, 1, 1)
    dummyFileList = [dummyFile3]
    for fileset in (self.dummySubscription.available,
                    self.dummySubscription.failed,
                    self.dummySubscription.completed):
        fileset.addFile(dummyFile3)
    self.dummySubscription.acquireFiles(files=dummyFileList, size=1)
    assert dummyFile3 in self.dummySubscription.acquired.getFiles(type='set'), \
        'Replicated file could\'nt be inserted at acquired Set'
    assert dummyFile3 not in self.dummySubscription.available.getFiles(type='set'), \
        'Acquired file still present at available Set'
    assert dummyFile3 not in self.dummySubscription.failed.getFiles(type='set'), \
        'Acquired file still present at failed Set'
    assert dummyFile3 not in self.dummySubscription.completed.getFiles(type='set'), \
        'Acquired file still present at completed Set'

    # Fourth test - bogus size argument
    # Case 1: size smaller than the number of files handed in
    dummyFileList = [File('/tmp/dummyfile' + str(i), 7656, 1, 1, 1)
                     for i in range(90, 100)]
    self.dummySubscription.acquireFiles(files=dummyFileList, size=1)
    for x in dummyFileList:
        assert x in self.dummySubscription.acquired.getFiles(type='set'), \
            'File wasn\'t acquired (lower Size argument test)'

    # Case 2: size of zero
    self.dummySubscription.acquireFiles(files=dummyFileList, size=0)
    for x in dummyFileList:
        assert x in self.dummySubscription.acquired.getFiles(type='set'), \
            'File wasn\'t acquired (zero size argument test)'
def testProcessing(self):
    """
    _testProcessing_

    Setup a processing workflow and job and verify that the FWJR produced
    by the emulator is reasonable.
    """
    rerecoTask = self.workload.getTask("DataProcessing")
    cmsRunStep = rerecoTask.getStep("cmsRun1")

    inputFile = File(lfn="/path/to/test/lfn", size=1048576, events=1000,
                     merged=True)
    inputFile.addRun(Run(1, 1, 2, 3, 4, 5))
    inputFile.addRun(Run(2, 1, 2, 3, 4, 5, 6))

    processingJob = Job(name="ProcessingJob", files=[inputFile])
    processingJob["task"] = "/Tier1ReReco/ReReco"
    processingJob["mask"].setMaxAndSkipEvents(500, 0)
    processingJob["id"] = 1
    processingJob["location"] = "cmssrm.fnal.gov"

    emu = ReportEmu(WMStep=cmsRunStep.getTypeHelper(), Job=processingJob)
    report = emu()

    reportInputFiles = report.getInputFilesFromStep("cmsRun1")
    assert len(reportInputFiles) == 1, \
        "Error: Wrong number of input files for the job."
    reportedFile = reportInputFiles[0]
    assert reportedFile["lfn"] == inputFile["lfn"], \
        "Error: Input LFNs do not match: %s" % reportedFile["lfn"]
    assert reportedFile["size"] == inputFile["size"], \
        "Error: Input file sizes do not match."
    assert reportedFile["events"] == inputFile["events"], \
        "Error: Input file events do not match."

    goldenRuns = [Run(1, 1, 2, 3, 4, 5), Run(2, 1, 2, 3, 4, 5, 6)]
    assert len(reportedFile["runs"]) == len(goldenRuns), \
        "Error: Wrong number of runs in input file."
    # Tick off every golden run whose run number and sorted lumi list
    # both appear in the report; nothing may remain unmatched.
    for inputRun in reportedFile["runs"]:
        for goldenRun in goldenRuns:
            if inputRun.run == goldenRun.run:
                goldenRun.lumis.sort()
                inputRun.lumis.sort()
                if goldenRun.lumis == inputRun.lumis:
                    goldenRuns.remove(goldenRun)
                    break
    assert len(goldenRuns) == 0, \
        "Error: Run information wrong on input file."

    recoOutputFiles = report.getFilesFromOutputModule("cmsRun1", "RECOoutput")
    alcaOutputFiles = report.getFilesFromOutputModule("cmsRun1", "ALCARECOoutput")
    assert len(recoOutputFiles) == 1, \
        "Error: There should only be one RECO output file."
    assert len(alcaOutputFiles) == 1, \
        "Error: There should only be one ALCA output file."
    assert recoOutputFiles[0]["module_label"] == "RECOoutput", \
        "Error: RECO file has wrong output module."
    assert alcaOutputFiles[0]["module_label"] == "ALCARECOoutput", \
        "Error: ALCA file has wrong output module."

    self.verifyOutputMetaData(recoOutputFiles[0], processingJob)
    self.verifyOutputMetaData(alcaOutputFiles[0], processingJob)

    dataTierMap = {"RECOoutput": "RECO", "ALCARECOoutput": "ALCARECO"}
    for outputFile in (recoOutputFiles[0], alcaOutputFiles[0]):
        assert outputFile["dataset"]["applicationName"] == "cmsRun", \
            "Error: Application name is incorrect."
        assert outputFile["dataset"]["primaryDataset"] == self.primaryDataset, \
            "Error: Primary dataset is incorrect."
        assert outputFile["dataset"]["dataTier"] == dataTierMap[outputFile["module_label"]], \
            "Error: Data tier is incorrect."
    return
def formatOutput(self, task, requestname, datasetfiles, locations, tempDir):
    """
    Receives as input the result of the data location discovery operations
    and fill up the WMCore objects.

    :param task: task dictionary; 'tm_use_parent' == 1 requires every
                 accepted file to carry parentage information.
    :param requestname: workflow name, recorded on each WMCore File.
    :param datasetfiles: dict mapping LFN -> file info dict.
    :param locations: dict mapping block name -> list of locations (PNNs).
    :param tempDir: directory where the input-dataset lumi JSON summaries
                    are written.
    :returns: a Result wrapping a Fileset ('FilesToSplit') of the accepted files.
    :raises TaskWorkerException: when useParents is requested but a file
                                 has no parents.
    """
    self.logger.debug(" Formatting data discovery output ")

    wmfiles = []
    event_counter = 0
    lumi_counter = 0
    uniquelumis = set()
    datasetLumis = {}
    ## Loop over the sorted list of files.
    # can't afford one message from CRIC per file, unless critical!
    previousLogLevel = self.logger.getEffectiveLevel()
    resourceCatalog = CRIC(logger=self.logger)
    self.logger.setLevel(logging.ERROR)
    try:
        for lfn, infos in datasetfiles.iteritems():
            ## Skip the file if the block has not been found or has no locations.
            if not infos['BlockName'] in locations or not locations[infos['BlockName']]:
                self.logger.warning("Skipping %s because its block (%s) has no locations" %
                                    (lfn, infos['BlockName']))
                continue
            ## Skip the file if it is not in VALID state.
            if not infos.get('ValidFile', True):
                self.logger.warning("Skipping invalid file %s" % lfn)
                continue
            if task['tm_use_parent'] == 1 and len(infos['Parents']) == 0:
                # Message fix: the original concatenation produced
                # "...have noparents." (missing space at the join).
                raise TaskWorkerException(
                    "The CRAB3 server backend refuses to submit jobs to the Grid scheduler\n" +
                    "because you specified useParents=True but some of your files have no " +
                    "parents.\nExample: " + lfn)
            ## Create a WMCore File object.
            try:
                size = infos['FileSize']
                checksums = {'Checksum': infos['Checksum'],
                             'Adler32': infos['Adler32'],
                             'Md5': infos['Md5']}
            except KeyError:
                # Narrowed from a bare except: only the missing-key case of an
                # old WMCore version (where the API interface changed) should
                # trigger the fallback. We may want to remove the try/except
                # and the following two lines eventually, but keeping them for
                # the moment so other devels won't be affected.
                # See this WMCore commit: https://github.com/dmwm/WMCore/commit/2afc01ae571390f5fa009dd258be757adac89c28#diff-374b7a6640288184175057234e393e1cL204
                size = infos['Size']
                checksums = infos['Checksums']
            wmfile = File(lfn=lfn, events=infos['NumberOfEvents'], size=size,
                          checksums=checksums, parents=infos['Parents'])
            wmfile['block'] = infos['BlockName']
            try:
                wmfile['locations'] = resourceCatalog.PNNstoPSNs(locations[wmfile['block']])
            except Exception as ex:
                self.logger.error("Impossible translating %s to a CMS name through CMS Resource Catalog" %
                                  locations[wmfile['block']])
                self.logger.error("got this exception:\n %s" % ex)
                raise
            wmfile['workflow'] = requestname
            event_counter += infos['NumberOfEvents']
            for run, lumis in infos['Lumis'].iteritems():
                datasetLumis.setdefault(run, []).extend(lumis)
                wmfile.addRun(Run(run, *lumis))
                for lumi in lumis:
                    uniquelumis.add((run, lumi))
                lumi_counter += len(lumis)
            wmfiles.append(wmfile)
    finally:
        # Always restore the caller's log level, even when we bail out with
        # TaskWorkerException or a CRIC lookup failure above (previously an
        # exception left the logger stuck at ERROR).
        self.logger.setLevel(previousLogLevel)

    uniquelumis = len(uniquelumis)
    self.logger.debug('Tot events found: %d' % event_counter)
    self.logger.debug('Tot lumis found: %d' % uniquelumis)
    self.logger.debug('Duplicate lumis found: %d' % (lumi_counter - uniquelumis))
    self.logger.debug('Tot files found: %d' % len(wmfiles))

    self.logger.debug("Starting to create compact lumilists for input dataset")
    datasetLumiList = LumiList(runsAndLumis=datasetLumis)
    datasetLumis = datasetLumiList.getCompactList()
    datasetDuplicateLumis = datasetLumiList.getDuplicates().getCompactList()
    self.logger.debug("Finished to create compact lumilists for input dataset")
    with open(os.path.join(tempDir, "input_dataset_lumis.json"), "w") as fd:
        json.dump(datasetLumis, fd)
    with open(os.path.join(tempDir, "input_dataset_duplicate_lumis.json"), "w") as fd:
        json.dump(datasetDuplicateLumis, fd)

    return Result(task=task,
                  result=Fileset(name='FilesToSplit', files=set(wmfiles)))
def setUp(self):
    """
    _setUp_

    Create two subscriptions: One that contains a single file and one that
    contains multiple files.
    """
    self.multipleFileFileset = Fileset(name="TestFileset1")
    for i in range(10):
        newFile = File(makeUUID(), size=1000, events=100)
        newFile.setLocation('se01')
        self.multipleFileFileset.addFile(newFile)

    self.singleFileFileset = Fileset(name="TestFileset2")
    newFile = File("/some/file/name", size=1000, events=100)
    newFile.setLocation('se02')
    self.singleFileFileset.addFile(newFile)

    self.emptyFileFileset = Fileset(name="TestFileset3")
    newFile = File("/some/file/name", size=1000, events=0)
    # Bug fix: this previously called newFile.setdefault('se03'), which on
    # the dict-based File object merely creates a 'se03' key with value
    # None instead of placing the file at the se03 location like the two
    # sibling filesets above.
    newFile.setLocation('se03')
    self.emptyFileFileset.addFile(newFile)

    testWorkflow = Workflow()
    self.multipleFileSubscription = Subscription(fileset=self.multipleFileFileset,
                                                 workflow=testWorkflow,
                                                 split_algo="EventBased",
                                                 type="Processing")
    self.singleFileSubscription = Subscription(fileset=self.singleFileFileset,
                                               workflow=testWorkflow,
                                               split_algo="EventBased",
                                               type="Processing")
    self.emptyFileSubscription = Subscription(fileset=self.emptyFileFileset,
                                              workflow=testWorkflow,
                                              split_algo="EventBased",
                                              type="Processing")

    # NOTE(review): timePerEvent of None looks odd next to the numeric
    # values — confirm the splitter tolerates it (kept as-is).
    self.performanceParams = {'timePerEvent': None,
                              'memoryRequirement': 2300,
                              'sizePerEvent': 400}
    return
def testG_LumiMask(self):
    """
    _testG_LumiMask_

    Test that we can use a lumi-mask to filter good runs/lumis.
    """
    splitter = SplitterFactory()

    # Three files with 100 events per lumi:
    #  - file1: one run (1) with 8 lumis (10-17)
    #  - file2: two runs (2, 3) with 2 lumis each (20-21, 30-31)
    #  - file3: one run (4) with 5 lumis (40-44)
    fileA = File(lfn="/this/is/file1", size=1000, events=800)
    fileB = File(lfn="/this/is/file2", size=1000, events=400)
    fileC = File(lfn="/this/is/file3", size=1000, events=500)

    fileA.addRun(Run(1, *range(10, 18)))
    fileA.setLocation("somese.cern.ch")
    fileB.addRun(Run(2, *range(20, 22)))
    fileB.addRun(Run(3, *range(30, 32)))
    fileB.setLocation("somese.cern.ch")
    fileC.addRun(Run(4, *range(40, 45)))
    fileC.setLocation("somese.cern.ch")

    testFileset = Fileset(name='Fileset')
    for inputFile in (fileA, fileB, fileC):
        testFileset.addFile(inputFile)

    testSubscription = Subscription(fileset=testFileset,
                                    workflow=self.testWorkflow,
                                    split_algo="EventAwareLumiBased",
                                    type="Processing")
    jobFactory = splitter(package="WMCore.DataStructs",
                          subscription=testSubscription)

    # Use a lumi-mask = {1: [[10,14]], 2: [[20,21]], 4: [[40,41]]}
    jobGroups = jobFactory(halt_job_on_file_boundaries=False,
                           splitOnRun=False,
                           events_per_job=850,
                           runs=['1', '2', '4'],
                           lumis=['10,14', '20,21', '40,41'],
                           performance=self.performanceParams)

    self.assertEqual(len(jobGroups), 1, "There should be only one job group")
    jobs = jobGroups[0].jobs
    self.assertEqual(len(jobs), 2, "Two jobs must be in the jobgroup")
    self.assertEqual(jobs[0]['mask'].getRunAndLumis(),
                     {1: [[10, 14]], 2: [[20, 21]], 4: [[40, 40]]})
    self.assertEqual(jobs[1]['mask'].getRunAndLumis(), {4: [[41, 41]]})
def testListCollectionsFilesets(self):
    """
    _testListCollectionsFilesets_

    Verify that collections and filesets in ACDC can be listed.
    """
    svc = CouchService(url=self.testInit.couchUrl,
                       database=self.testInit.couchDbName)

    ownerA = svc.newOwner("somegroup", "someuserA")
    ownerB = svc.newOwner("somegroup", "someuserB")

    def makeCollection(name, owner):
        # Collection bound to the test couch instance, with its owner set
        collection = CouchCollection(database=self.testInit.couchDbName,
                                     url=self.testInit.couchUrl,
                                     name=name)
        collection.setOwner(owner)
        return collection

    def makeFileset(name, collection):
        # Fileset registered into the given collection
        fileset = CouchFileset(database=self.testInit.couchDbName,
                               url=self.testInit.couchUrl,
                               name=name)
        collection.addFileset(fileset)
        return fileset

    testCollectionA = makeCollection("Thunderstruck", ownerA)
    testCollectionB = makeCollection("Struckthunder", ownerA)
    testCollectionC = makeCollection("Thunderstruck", ownerB)
    testCollectionD = makeCollection("Thunderstruck", ownerB)

    testFilesetA = makeFileset("TestFilesetA", testCollectionA)
    testFilesetB = makeFileset("TestFilesetB", testCollectionB)
    testFilesetC = makeFileset("TestFilesetC", testCollectionC)
    # Note: FilesetD deliberately goes into collection C as well
    testFilesetD = makeFileset("TestFilesetD", testCollectionC)

    testFiles = [File(lfn=makeUUID(), size=random.randint(1024, 4096),
                      events=random.randint(1024, 4096))
                 for _ in range(5)]
    for fileset in (testFilesetA, testFilesetB, testFilesetC, testFilesetD):
        fileset.add(testFiles)

    goldenCollectionNames = ["Thunderstruck", "Struckthunder"]
    for collection in svc.listCollections(ownerA):
        self.assertTrue(collection["name"] in goldenCollectionNames,
                        "Error: Missing collection name.")
        goldenCollectionNames.remove(collection["name"])
    self.assertEqual(len(goldenCollectionNames), 0,
                     "Error: Missing collections.")

    goldenFilesetNames = ["TestFilesetC", "TestFilesetD"]
    for fileset in svc.listFilesets(testCollectionD):
        self.assertTrue(fileset["name"] in goldenFilesetNames,
                        "Error: Missing fileset.")
        goldenFilesetNames.remove(fileset["name"])
    self.assertEqual(len(goldenFilesetNames), 0, "Error: Missing filesets.")
    return
def testGetLumiWhitelist(self):
    """
    _testGetLumiWhitelist_

    Verify that the ACDC whitelist generation code works correctly.  We'll
    add jobs with the following lumi info:
      # Run 1, lumis [1, 2, 3], [4, 6], [7], [9], [11, 12]
      # Run 2, lumis [5, 6, 7], [10, 11, 12], [15]
      # Run 3, lumis [20]

    And should get out a whitelist that looks like this:
      {"1": [[1, 4], [6, 7], [9, 9], [11, 12]],
       "2": [[5, 7], [10, 12], [15, 15]],
       "3": [[20, 20]]}
    """
    dcs = DataCollectionService(url=self.testInit.couchUrl,
                                database="wmcore-acdc-datacollectionsvc")

    # Each entry is one failed job; each (run, lumis) pair inside it
    # becomes a separate input file of that job.
    jobLumiSpecs = [
        [(1, (1, 2)), (1, (3,))],
        [(1, (4, 6))],
        [(1, (7,))],
        [(1, (11, 12))],
        [(2, (5, 6, 7))],
        [(2, (10, 11, 12))],
        [(2, (15,))],
        [(3, (20,))],
        [(1, (9,))],
    ]

    failedJobs = []
    for spec in jobLumiSpecs:
        job = self.getMinimalJob()
        for runNumber, lumis in spec:
            inputFile = File(lfn=makeUUID(), size=1024, events=1024)
            inputFile.addRun(Run(runNumber, *lumis))
            job.addFile(inputFile)
        failedJobs.append(job)

    dcs.failedJobs(failedJobs)

    whiteList = dcs.getLumiWhitelist("ACDCTest", "/ACDCTest/reco")
    self.assertEqual(len(whiteList), 3, "Error: There should be 3 runs.")
    self.assertEqual(whiteList["1"], [[1, 4], [6, 7], [9, 9], [11, 12]],
                     "Error: Whitelist for run 1 is wrong.")
    self.assertEqual(whiteList["2"], [[5, 7], [10, 12], [15, 15]],
                     "Error: Whitelist for run 2 is wrong.")
    self.assertEqual(whiteList["3"], [[20, 20]],
                     "Error: Whitelist for run 3 is wrong.")

    correctLumiList = LumiList(compactList={"1": [[1, 4], [6, 7], [9, 9], [11, 12]],
                                            "2": [[5, 7], [10, 12], [15, 15]],
                                            "3": [[20, 20]]})
    testLumiList = dcs.getLumilistWhitelist("ACDCTest", "/ACDCTest/reco")
    self.assertEqual(correctLumiList.getCMSSWString(),
                     testLumiList.getCMSSWString())
    return
def stuffACDCDatabase(self, numFiles=50, lumisPerFile=20, lumisPerACDCRecord=2):
    """
    _stuffACDCDatabase_

    Fill the ACDC database with ACDC records, both for processing and merge
    """
    owner = 'unknown'
    group = 'unknown'

    def queueRecords(filesetName, lfnBase, pickLocations, merged):
        # Queue one ACDC record per (file, lumi range) pair.
        # pickLocations is called per record so the merge case can draw a
        # fresh random location each time.
        for i in range(numFiles):
            for j in range(1, lumisPerFile + 1, lumisPerACDCRecord):
                lfn = lfnBase % i
                acdcFile = File(lfn=lfn, size=100, events=250,
                                locations=pickLocations(), merged=merged)
                acdcFile.addRun(
                    Run(i + 1, *range(j, min(j + lumisPerACDCRecord,
                                             lumisPerFile + 1))))
                self.acdcDB.queue({'collection_name': self.workflowName,
                                   'collection_type': 'ACDC.CollectionTypes.DataCollection',
                                   'files': {lfn: acdcFile},
                                   'fileset_name': filesetName,
                                   'owner': {'user': owner, 'group': group}})

    # Processing records: merged files available at every valid location
    queueRecords('/%s/DataProcessing' % self.workflowName,
                 '/store/data/a/%d',
                 lambda: self.validLocations,
                 1)
    # Merge records: unmerged files, each at one randomly chosen location
    queueRecords('/%s/DataProcessing/DataProcessingMergeRECOoutput' % self.workflowName,
                 '/store/unmerged/b/%d',
                 lambda: set([choice(self.validLocations)]),
                 0)
    self.acdcDB.commit()
    return