def testAddFile(self):
    """
    _testAddFile_

    Testcase for the addFile method of the Fileset class.

    Adds a new file and asserts it is listed among the new files, then adds
    a second File object built with the same LFN (plus a location) and
    asserts that both objects are found via getFiles() and listNewFiles().
    """
    # First test - add a file and check that it is there
    testfile = File('/tmp/lfntest', 9999, 9, 9)
    self.fileset.addFile(testfile)
    self.assertTrue(
        testfile in self.fileset.listNewFiles(),
        "Couldn't add file to fileset - fileset.addfile method not working")

    # Second test - add a file that is already known to the fileset and
    # check that it gets updated
    testFileSame = File('/tmp/lfntest', 9999, 9, 9)
    # Wrap the hostname in a list: set() applied to a bare string would
    # produce a set of single characters instead of one location name.
    testFileSame.setLocation(set(['dummyse.dummy.com']))
    self.fileset.addFile(testFileSame)
    self.assertTrue(
        testFileSame in self.fileset.getFiles(),
        'Same file copy failed - fileset.addFile not updating location of already existing files')
    self.assertTrue(
        testfile in self.fileset.getFiles(),
        'Same file copy failed - fileset.addFile unable to remove previous file from list')

    # Third test - the re-added file must also be reported as a new file
    self.assertTrue(
        testFileSame in self.fileset.listNewFiles(),
        'Same file copy failed - fileset.addFile not adding file to fileset.newFiles')
def testG_LumiMask(self):
    """
    _testG_LumiMask_

    Check that a run/lumi mask handed to the job factory restricts the
    resulting jobs to the selected runs and lumis.
    """
    siteName = "somese.cern.ch"

    # Three input files with 100 events per lumi:
    #   file1: run 1 with 8 lumis (10..17)
    #   file2: run 2 with lumis 20..21 and run 3 with lumis 30..31
    #   file3: run 4 with 5 lumis (40..44)
    fileA = File(lfn="/this/is/file1", size=1000, events=800)
    fileB = File(lfn="/this/is/file2", size=1000, events=400)
    fileC = File(lfn="/this/is/file3", size=1000, events=500)

    fileA.addRun(Run(1, *range(10, 18)))
    fileA.setLocation(siteName)

    fileB.addRun(Run(2, *range(20, 22)))
    fileB.addRun(Run(3, *range(30, 32)))
    fileB.setLocation(siteName)

    fileC.addRun(Run(4, *range(40, 45)))
    fileC.setLocation(siteName)

    testFileset = Fileset(name='Fileset')
    for inputFile in (fileA, fileB, fileC):
        testFileset.addFile(inputFile)

    testSubscription = Subscription(fileset=testFileset,
                                    workflow=self.testWorkflow,
                                    split_algo="EventAwareLumiBased",
                                    type="Processing")
    jobFactory = SplitterFactory()(package="WMCore.DataStructs",
                                   subscription=testSubscription)

    # Lumi mask in use: {1: [[10,14]], 2: [[20,21]], 4: [[40,41]]}
    jobGroups = jobFactory(halt_job_on_file_boundaries=False,
                           splitOnRun=False,
                           events_per_job=850,
                           runs=['1', '2', '4'],
                           lumis=['10,14', '20,21', '40,41'],
                           performance=self.performanceParams)

    self.assertEqual(len(jobGroups), 1, "There should be only one job group")
    jobs = jobGroups[0].jobs
    self.assertEqual(len(jobs), 2, "Two jobs must be in the jobgroup")

    firstMask = jobs[0]['mask'].getRunAndLumis()
    secondMask = jobs[1]['mask'].getRunAndLumis()
    self.assertEqual(firstMask, {1: [[10, 14]], 2: [[20, 21]], 4: [[40, 40]]})
    self.assertEqual(secondMask, {4: [[41, 41]]})
def setUp(self):
    """
    Build the fixture used by the Fileset test methods: configure logging
    to a file next to this module, then create a fileset holding one known
    file plus a batch of randomly generated files.
    """
    logging.basicConfig(
        level=logging.DEBUG,
        format='%(asctime)s %(name)-12s %(levelname)-8s %(message)s',
        datefmt='%m-%d %H:%M',
        filename=__file__.replace('.py','.log'),
        filemode='w')
    self.logger = logging.getLogger('FilesetClassTest')

    # Initial environment: a set with a single, well-known file
    self.initialSet = set([File('/tmp/lfn1', 1000, 1, 1, 1)])

    # The fileset under test starts out containing that initial file
    self.fileset = Fileset(name = 'self.fileset', files = self.initialSet)

    # Fill the fileset with randomly generated files
    for _ in range(1, 1000):
        lfn = '/store/data/%s/%s/file.root' % (random.randint(1000, 9999),
                                               random.randint(1000, 9999))
        size = random.randint(1000, 2000)
        runNumber = random.randint(0, 2000)
        lumiNumber = random.randint(0, 8)

        randomFile = File(lfn=lfn, size=size, events=1000,
                          checksums = {"cksum": "1"})
        randomFile.addRun(Run(runNumber, lumiNumber))
        self.fileset.addFile(randomFile)
def testComparison(self):
    """
    testComparison

    Exercise equality and inequality of File objects against other File
    objects and against plain strings, in both operand orders.

    Note carried over from the original test: File.__eq__() is said to
    look only at self['lfn'] and to ignore every other key/property.
    """
    original = File(lfn="lfn")
    twin = File(lfn="lfn")

    # Same LFN: equal, but still two distinct objects
    self.assertEqual(original, twin)
    self.assertTrue(original == twin)
    self.assertTrue(original is original)
    self.assertTrue(original is not twin)

    # A File compares equal to the bare LFN string, from either side
    self.assertEqual(original, "lfn")
    self.assertTrue(original == "lfn")
    self.assertEqual("lfn", original)
    self.assertTrue("lfn" == original)

    # Different LFN: unequal to the File and to its bare string
    other = File(lfn="lfn-2")
    self.assertNotEqual(original, other)
    self.assertTrue(original != other)
    self.assertNotEqual(original, "lfn-2")
    self.assertTrue(original != "lfn-2")
    self.assertNotEqual("lfn-2", original)
    self.assertTrue("lfn-2" != original)

    # Ordering comparisons (original > other, original < other) are
    # deliberately not tested: per the original note they worked in py2
    # through the __cmp__ inherited from dict, but in py3 they fail with
    # TypeError: '<' not supported between instances of 'dict' and 'dict'
def setUp(self):
    """
    _setUp_

    Prepare two FileBased processing subscriptions sharing one workflow:
    one over a fileset with ten files and one over a fileset with a
    single file.
    """
    # Ten uniquely named files, each given two locations
    self.multipleFileFileset = Fileset(name="TestFileset1")
    for _ in range(10):
        manyFile = File(makeUUID(), size=1000, events=100)
        for site in ('blenheim', 'malpaquet'):
            manyFile.setLocation(site)
        self.multipleFileFileset.addFile(manyFile)

    # A single file with one location
    self.singleFileFileset = Fileset(name="TestFileset2")
    loneFile = File("/some/file/name", size=1000, events=100)
    loneFile.setLocation('blenheim')
    self.singleFileFileset.addFile(loneFile)

    sharedWorkflow = Workflow()
    self.multipleFileSubscription = Subscription(fileset=self.multipleFileFileset,
                                                 workflow=sharedWorkflow,
                                                 split_algo="FileBased",
                                                 type="Processing")
    self.singleFileSubscription = Subscription(fileset=self.singleFileFileset,
                                               workflow=sharedWorkflow,
                                               split_algo="FileBased",
                                               type="Processing")
def setUp(self):
    """
    _setUp_

    Create four filesets and one FixedDelay processing subscription for
    each of them:

    - multipleFileFileset: ten files, file i carrying run i with lumi 45 + i
    - singleFileFileset:   one file with run 1, lumi 45
    - multipleFileLumiset: ten files in run 1, three consecutive files per lumi
    - singleLumiFileset:   ten files, all with run 1, lumi 45
    """
    self.multipleFileFileset = Fileset(name="TestFileset1")
    for i in range(10):
        newFile = File(makeUUID(), size=1000, events=100)
        newFile.addRun(Run(i, *[45 + i]))
        self.multipleFileFileset.addFile(newFile)

    self.singleFileFileset = Fileset(name="TestFileset2")
    newFile = File("/some/file/name", size=1000, events=100)
    newFile.addRun(Run(1, *[45]))
    self.singleFileFileset.addFile(newFile)

    self.multipleFileLumiset = Fileset(name="TestFileset3")
    for i in range(10):
        newFile = File(makeUUID(), size=1000, events=100)
        # Floor division keeps the lumi number an integer under both
        # Python 2 and Python 3; "/" would give 45.33... on Python 3.
        newFile.addRun(Run(1, *[45 + i // 3]))
        self.multipleFileLumiset.addFile(newFile)

    self.singleLumiFileset = Fileset(name="TestFileset4")
    for i in range(10):
        newFile = File(makeUUID(), size=1000, events=100)
        newFile.addRun(Run(1, *[45]))
        self.singleLumiFileset.addFile(newFile)

    testWorkflow = Workflow()
    self.multipleFileSubscription = Subscription(fileset=self.multipleFileFileset,
                                                 workflow=testWorkflow,
                                                 split_algo="FixedDelay",
                                                 type="Processing")
    self.singleFileSubscription = Subscription(fileset=self.singleFileFileset,
                                               workflow=testWorkflow,
                                               split_algo="FixedDelay",
                                               type="Processing")
    self.multipleLumiSubscription = Subscription(fileset=self.multipleFileLumiset,
                                                 workflow=testWorkflow,
                                                 split_algo="FixedDelay",
                                                 type="Processing")
    self.singleLumiSubscription = Subscription(fileset=self.singleLumiFileset,
                                               workflow=testWorkflow,
                                               split_algo="FixedDelay",
                                               type="Processing")
    return
def setUp(self):
    """
    _setUp_

    Prepare three filesets and their processing subscriptions: a ten-file
    fileset and a single-file fileset (both split with SizeBased), plus a
    ten-file fileset mixing single-site and two-site files (split with
    EventBased).
    """
    # Ten files located at one site
    self.multipleFileFileset = Fileset(name="TestFileset1")
    for _ in range(10):
        self.multipleFileFileset.addFile(
            File(makeUUID(), size=1000, events=100,
                 locations=set(["somese.cern.ch"])))

    # One file located at one site
    self.singleFileFileset = Fileset(name="TestFileset2")
    self.singleFileFileset.addFile(
        File("/some/file/name", size=1000, events=100,
             locations=set(["somese.cern.ch"])))

    # Five single-site files followed by five files available at two sites
    self.multipleSiteFileset = Fileset(name="TestFileset3")
    for _ in range(5):
        oneSiteFile = File(makeUUID(), size=1000, events=100,
                           locations=set(["somese.cern.ch"]))
        oneSiteFile.setLocation("somese.cern.ch")
        self.multipleSiteFileset.addFile(oneSiteFile)
    for _ in range(5):
        twoSiteFile = File(makeUUID(), size=1000, events=100)
        twoSiteFile.setLocation(["somese.cern.ch", "otherse.cern.ch"])
        self.multipleSiteFileset.addFile(twoSiteFile)

    sharedWorkflow = Workflow()
    self.multipleFileSubscription = Subscription(fileset=self.multipleFileFileset,
                                                 workflow=sharedWorkflow,
                                                 split_algo="SizeBased",
                                                 type="Processing")
    self.singleFileSubscription = Subscription(fileset=self.singleFileFileset,
                                               workflow=sharedWorkflow,
                                               split_algo="SizeBased",
                                               type="Processing")
    self.multipleSiteSubscription = Subscription(fileset=self.multipleSiteFileset,
                                                 workflow=sharedWorkflow,
                                                 split_algo="EventBased",
                                                 type="Processing")
def createTestJob(self):
    """
    _createTestJob_

    Return a job named "TestJob" with two input files, each of which has
    exactly one parent file.
    """
    firstParent = File(lfn = "/some/parent/one")
    firstInput = File(lfn = "/some/file/one", parents = set([firstParent]))

    secondParent = File(lfn = "/some/parent/two")
    secondInput = File(lfn = "/some/file/two", parents = set([secondParent]))

    testJob = Job(name = "TestJob")
    testJob.addFile(firstInput)
    testJob.addFile(secondInput)
    return testJob
def createFile():
    """
    _createFile_

    Create a file with some random metadata: a UUID as LFN, a random size
    and event count, one parent file, and a UUID passed as its location.
    The first/last event markers are set to 0 and the id to 1.
    """
    # Values are drawn in the same order as the keyword arguments below
    lfn = makeUUID()
    size = random.randrange(1024, 1048576, 1024)
    events = random.randrange(10, 100000, 50)
    parentFile = File(lfn=makeUUID())
    location = makeUUID()

    randomFile = File(lfn=lfn, size=size, events=events,
                      parents=[parentFile], locations=location)
    for key, value in (("first_event", 0), ("last_event", 0), ("id", 1)):
        randomFile[key] = value
    return randomFile
def setUp(self):
    """
    _setUp_

    Prepare three EventBased processing subscriptions sharing one
    workflow: one over ten files, one over a single file, and one over a
    single file that holds zero events. Also store the default
    events-per-job and performance parameters used by the tests.
    """
    # Ten uniquely named files at 'se01'
    self.multipleFileFileset = Fileset(name="TestFileset1")
    for _ in range(10):
        manyFile = File(makeUUID(), size=1000, events=100)
        manyFile.setLocation('se01')
        self.multipleFileFileset.addFile(manyFile)

    # One regular file at 'se02'
    self.singleFileFileset = Fileset(name="TestFileset2")
    loneFile = File("/some/file/name", size=1000, events=100)
    loneFile.setLocation('se02')
    self.singleFileFileset.addFile(loneFile)

    # One file without any events at 'se03'
    self.emptyFileFileset = Fileset(name="TestFileset3")
    emptyFile = File("/some/file/name", size=1000, events=0)
    emptyFile.setLocation('se03')
    self.emptyFileFileset.addFile(emptyFile)

    sharedWorkflow = Workflow()
    self.multipleFileSubscription = Subscription(fileset=self.multipleFileFileset,
                                                 workflow=sharedWorkflow,
                                                 split_algo="EventBased",
                                                 type="Processing")
    self.singleFileSubscription = Subscription(fileset=self.singleFileFileset,
                                               workflow=sharedWorkflow,
                                               split_algo="EventBased",
                                               type="Processing")
    self.emptyFileSubscription = Subscription(fileset=self.emptyFileFileset,
                                              workflow=sharedWorkflow,
                                              split_algo="EventBased",
                                              type="Processing")

    self.eventsPerJob = 100
    self.performanceParams = {'timePerEvent': None,
                              'memoryRequirement': 2300,
                              'sizePerEvent': 400}
def testCommit(self):
    """
    Testcase for the commit method of the Fileset class.

    Adds one file to a fresh fileset, checks that it is counted and sits
    in the newfiles attribute, then commits and checks that the file is
    present in the files attribute and that newfiles is empty.
    """
    localTestFileSet = Fileset('LocalTestFileset', self.initialSet)
    fsSize = len(localTestFileSet.getFiles(type="lfn"))

    # Dummy file to test with
    fileTestCommit = File('/tmp/filetestcommit', 0, 1, 1)

    # The file is expected to land in the newfiles attribute
    localTestFileSet.addFile(fileTestCommit)
    self.assertEqual(fsSize,
                     len(localTestFileSet.getFiles(type="lfn")) - 1,
                     'file not added correctly to test fileset')

    # Take a snapshot rather than a reference: if commit() emptied the set
    # in place, an alias would make the subset check below pass trivially.
    newfilestemp = set(localTestFileSet.newfiles)
    self.assertTrue(fileTestCommit in newfilestemp,
                    'test file not in the new files list')

    # After commit, the dummy file is supposed to move from newfiles to files
    localTestFileSet.commit()

    # First, the new file must be present in the files attribute
    self.assertTrue(newfilestemp.issubset(localTestFileSet.files),
                    'Test file not present at fileset.files - fileset.commit '
                    'not working properly')

    # Second, the newfiles attribute must be empty
    self.assertEqual(localTestFileSet.newfiles, set(),
                     'Test file not present at fileset.newfiles '
                     '- fileset.commit not working properly')
def populateCouchDB(self):
    """
    _populateCouchDB_

    Populate the ACDC records: two owners, four collections, four filesets
    and five random files that are added to every fileset, with pauses
    between the additions.
    """
    couchArgs = {"database": self.testInit.couchDbName,
                 "url": self.testInit.couchUrl}

    svc = CouchService(**couchArgs)
    ownerA = svc.newOwner("somegroup", "someuserA")
    ownerB = svc.newOwner("somegroup", "someuserB")

    # Collections A and B belong to owner A, collections C and D to owner B
    testCollectionA = CouchCollection(name="Thunderstruck", **couchArgs)
    testCollectionA.setOwner(ownerA)
    testCollectionB = CouchCollection(name="Struckthunder", **couchArgs)
    testCollectionB.setOwner(ownerA)
    testCollectionC = CouchCollection(name="Thunderstruck", **couchArgs)
    testCollectionC.setOwner(ownerB)
    testCollectionD = CouchCollection(name="Thunderstruck", **couchArgs)
    testCollectionD.setOwner(ownerB)

    # Filesets C and D are both attached to collection C
    testFilesetA = CouchFileset(name="TestFilesetA", **couchArgs)
    testCollectionA.addFileset(testFilesetA)
    testFilesetB = CouchFileset(name="TestFilesetB", **couchArgs)
    testCollectionB.addFileset(testFilesetB)
    testFilesetC = CouchFileset(name="TestFilesetC", **couchArgs)
    testCollectionC.addFileset(testFilesetC)
    testFilesetD = CouchFileset(name="TestFilesetD", **couchArgs)
    testCollectionC.addFileset(testFilesetD)

    testFiles = [File(lfn=makeUUID(),
                      size=random.randint(1024, 4096),
                      events=random.randint(1024, 4096))
                 for _ in range(5)]

    # Add the same files to each fileset, sleeping between additions
    testFilesetA.add(testFiles)
    time.sleep(1)
    testFilesetB.add(testFiles)
    time.sleep(1)
    testFilesetC.add(testFiles)
    time.sleep(2)
    testFilesetD.add(testFiles)
def testFileset(self):
    """
    _testFileset_

    Check that an ACDC fileset converted through fileset() exposes the
    same LFNs, event counts and sizes as the files that were added.
    """
    couchArgs = {"database": self.testInit.couchDbName,
                 "url": self.testInit.couchUrl}
    testCollection = CouchCollection(name="Thunderstruck", **couchArgs)
    testFileset = CouchFileset(name="TestFileset", **couchArgs)
    testCollection.addFileset(testFileset)

    # Add five random files one at a time, remembering them by LFN
    expectedFiles = {}
    for _ in range(5):
        lfn = makeUUID()
        addedFile = File(lfn=lfn,
                         size=random.randint(1024, 4096),
                         events=random.randint(1024, 4096))
        expectedFiles[lfn] = addedFile
        testFileset.add([addedFile])

    for storedFile in testFileset.fileset().files:
        self.assertTrue(storedFile["lfn"] in expectedFiles, "Error: File missing.")
        expected = expectedFiles[storedFile["lfn"]]
        self.assertEqual(storedFile["events"], expected["events"],
                         "Error: Wrong number of events.")
        self.assertEqual(storedFile["size"], expected["size"],
                         "Error: Wrong file size.")
def getChunkFiles(self, collectionName, filesetName, chunkOffset, chunkSize=100, user="******", group="cmsdataops"):
    """
    _getChunkFiles_

    Retrieve a chunk of files from the given collection and fileset and
    return them as a list of File objects with their runs attached.

    The raw records come from self._getFilesetInfo() and are passed
    through mergeFakeFiles() before being converted.
    """
    rawFiles = self._getFilesetInfo(collectionName, filesetName, user,
                                    group, chunkOffset, chunkSize)

    chunkFiles = []
    for info in mergeFakeFiles(rawFiles):
        chunkFile = File(lfn=info["lfn"],
                         size=info["size"],
                         events=info["events"],
                         parents=set(info["parents"]),
                         locations=set(info["locations"]),
                         merged=info["merged"])
        # Rebuild each run from its number and its list of lumis
        for runInfo in info["runs"]:
            rebuiltRun = Run(runInfo["run_number"])
            rebuiltRun.extend(runInfo["lumis"])
            chunkFile.addRun(rebuiltRun)
        chunkFiles.append(chunkFile)

    return chunkFiles
def setUp(self):
    """
    _setUp_

    Initial setup for the Job testcase: generate a list of random input
    files, each with one run/lumi, and build a dummy job from them.
    """
    self.inputFiles = []
    for _ in range(1, 1000):
        lfn = "/store/data/%s/%s/file.root" % (random.randint(1000, 9999),
                                               random.randint(1000, 9999))
        size = random.randint(1000, 2000)
        runNumber = random.randint(0, 2000)
        lumiNumber = random.randint(0, 8)

        inputFile = File(lfn=lfn, size=size, events=1000,
                         checksums={"cksum": "1"})
        inputFile.addRun(Run(runNumber, lumiNumber))
        self.inputFiles.append(inputFile)

    self.dummyJob = Job(files=self.inputFiles)
def testAddRun(self):
    """
    This tests the addRun() function of a DataStructs File object:
    a Run added to a File must afterwards be found in the file's 'runs'.
    """
    testLFN = "lfn"
    testSize = "1024"
    testEvents = "100"
    # Checksums are given as an {algorithm: value} dict, matching the other
    # File constructions in these tests (previously a bare string).
    testCksum = {"cksum": "1"}
    testParents = "parent"
    testLumi = 1
    testRunNumber = 1000000

    testFile = File(lfn=testLFN, size=testSize, events=testEvents,
                    checksums=testCksum, parents=testParents)
    testRun = Run(testRunNumber, testLumi)
    testFile.addRun(testRun)

    # Use the unittest assertion: a bare assert is stripped under "python -O"
    self.assertTrue(testRun in testFile['runs'],
                    "Run not added properly to run in File.addRun()")
    return
def testListCollectionsFilesets(self):
    """
    _testListCollectionsFilesets_

    Check that the filesets of a collection in ACDC can be listed: listing
    a collection named "Thunderstruck" must return exactly the filesets
    TestFilesetA, TestFilesetC and TestFilesetD.
    """
    couchArgs = {"database": self.testInit.couchDbName,
                 "url": self.testInit.couchUrl}
    svc = CouchService(**couchArgs)

    testCollectionA = CouchCollection(name="Thunderstruck", **couchArgs)
    testCollectionB = CouchCollection(name="Struckthunder", **couchArgs)
    testCollectionC = CouchCollection(name="Thunderstruck", **couchArgs)
    testCollectionD = CouchCollection(name="Thunderstruck", **couchArgs)

    # Filesets C and D are both attached to collection C
    testFilesetA = CouchFileset(name="TestFilesetA", **couchArgs)
    testCollectionA.addFileset(testFilesetA)
    testFilesetB = CouchFileset(name="TestFilesetB", **couchArgs)
    testCollectionB.addFileset(testFilesetB)
    testFilesetC = CouchFileset(name="TestFilesetC", **couchArgs)
    testCollectionC.addFileset(testFilesetC)
    testFilesetD = CouchFileset(name="TestFilesetD", **couchArgs)
    testCollectionC.addFileset(testFilesetD)

    testFiles = [File(lfn=makeUUID(),
                      size=random.randint(1024, 4096),
                      events=random.randint(1024, 4096))
                 for _ in range(5)]

    for couchFileset in (testFilesetA, testFilesetB, testFilesetC, testFilesetD):
        couchFileset.add(testFiles)

    # Tick off every listed fileset; none may be left over at the end
    remainingNames = ["TestFilesetA", "TestFilesetC", "TestFilesetD"]
    for listed in svc.listFilesets(testCollectionD):
        self.assertTrue(listed["name"] in remainingNames,
                        "Error: Missing fileset.")
        remainingNames.remove(listed["name"])

    self.assertEqual(len(remainingNames), 0, "Error: Missing filesets.")
def setupACDCDatabase(self, collectionName, taskPath, user, group):
    """
    _setupACDCDatabase_

    Populate an ACDC database with bogus records: one collection named
    collectionName owned by user/group, holding one fileset named after
    taskPath with five random files.
    """
    acdcServer = CouchService(url = self.testInit.couchUrl,
                              database = "%s_acdc" % self.couchDBName)
    owner = acdcServer.newOwner(group, user)

    couchArgs = {"database": self.testInit.couchDbName,
                 "url": self.testInit.couchUrl}
    testCollection = CouchCollection(name = collectionName, **couchArgs)
    testCollection.setOwner(owner)

    testFileset = CouchFileset(name = taskPath, **couchArgs)
    testCollection.addFileset(testFileset)

    bogusFiles = [File(lfn = makeUUID(),
                       size = random.randint(1024, 4096),
                       events = random.randint(1024, 4096))
                  for _ in range(5)]
    testFileset.add(bogusFiles)
def execute(self, *args, **kwargs):
    """
    Build the fake input fileset for a Monte Carlo task.

    Reads kwargs['task'] and creates a single "MCFakeFile" that spans all
    requested events over lumis 1..10 of run 1. The file is located at the
    configured available sites when self.config.Sites has an 'available'
    attribute, otherwise at every CMS site name returned by SiteDB.

    Side effect: when the task has no (or a falsy) 'tm_events_per_lumi',
    it is set to 100 on the task itself.

    Returns a Result wrapping the task and the one-file Fileset.
    """
    task = kwargs['task']

    # The total units may not arrive as an integer (e.g. a float); cast it
    # so the event count and event boundaries handed on stay integral.
    totalevents = int(task['tm_totalunits'])
    firstEvent = 1
    lastEvent = totalevents
    firstLumi = 1
    lastLumi = 10

    # Set a default of 100 events per lumi. This is set as a task
    # property, as the splitting considers it independently of the file
    # information provided by the fake dataset.
    if not task['tm_events_per_lumi']:
        task['tm_events_per_lumi'] = 100

    # MC comes with only one MCFakeFile
    singleMCFileset = Fileset(name = "MCFakeFileSet")
    newFile = File("MCFakeFile", size = 1000, events = totalevents)
    if hasattr(self.config.Sites, 'available'):
        newFile.setLocation(self.config.Sites.available)
    else:
        sbj = SiteDBJSON({"key":self.config.TaskWorker.cmskey,
                          "cert":self.config.TaskWorker.cmscert})
        newFile.setLocation(sbj.getAllCMSNames())
    newFile.addRun(Run(1, *range(firstLumi, lastLumi + 1)))
    # NOTE(review): 'MCFackBlock' looks like a misspelling of 'MCFakeBlock';
    # left unchanged because other code may match on this exact name.
    newFile["block"] = 'MCFackBlock'
    newFile["first_event"] = firstEvent
    newFile["last_event"] = lastEvent
    singleMCFileset.addFile(newFile)

    return Result(task=task, result=singleMCFileset)
def execute(self, *args, **kwargs): #pylint: disable=unused-argument
    """
    Build the fake input fileset for a Monte Carlo task: one "MCFakeFile"
    covering all requested events over lumis 1..10 of run 1, located at
    the sites returned by self.getListOfSites().

    Side effect: a missing/falsy 'tm_events_per_lumi' on the task is set
    to 100. Returns a Result wrapping the task and the Fileset.
    """
    task = kwargs['task']

    # since https://github.com/dmwm/CRABServer/issues/5633 totalunits can be a float
    # but that would confuse WMCore, therefore cast to int
    totalevents = int(task['tm_totalunits'])
    firstEvent, lastEvent = 1, totalevents
    firstLumi, lastLumi = 1, 10

    # Default to 100 events per lumi. This lives on the task because the
    # splitting considers it independently of the file information
    # provided by the fake dataset.
    if not task['tm_events_per_lumi']:
        task['tm_events_per_lumi'] = 100

    # MC comes with only one MCFakeFile
    fakeFile = File("MCFakeFile", size=1000, events=totalevents)
    fakeFile.setLocation(self.getListOfSites())
    fakeFile.addRun(Run(1, *range(firstLumi, lastLumi + 1)))
    fakeFile["block"] = 'MCFakeBlock'
    fakeFile["first_event"] = firstEvent
    fakeFile["last_event"] = lastEvent

    singleMCFileset = Fileset(name="MCFakeFileSet")
    singleMCFileset.addFile(fakeFile)

    return Result(task=task, result=singleMCFileset)
def testDefinition(self):
    """
    This tests the definition of a DataStructs File object: every value
    passed to the constructor must be readable back under its key.
    """
    # (key, value) pairs in the order in which they are verified
    expected = [('lfn', "lfn"),
                ('size', "1024"),
                ('events', "100"),
                ('checksums', {"cksum": "1"}),
                ('parents', "parent")]

    testFile = File(**dict(expected))

    for key, value in expected:
        self.assertEqual(testFile[key], value)
def testListFiles(self):
    """
    _testListFiles_

    Check that listFiles() iterates over the files that were added to the
    fileset, with matching LFNs, event counts and sizes.
    """
    couchArgs = {"database": self.testInit.couchDbName,
                 "url": self.testInit.couchUrl}
    testCollection = CouchCollection(name="Thunderstruck", **couchArgs)
    testCollection.setOwner(self.owner)
    testFileset = CouchFileset(name="TestFileset", **couchArgs)
    testCollection.addFileset(testFileset)

    # Add five random files one at a time, remembering them by LFN
    expectedFiles = {}
    for _ in range(5):
        lfn = makeUUID()
        addedFile = File(lfn=lfn,
                         size=random.randint(1024, 4096),
                         events=random.randint(1024, 4096))
        expectedFiles[lfn] = addedFile
        testFileset.add([addedFile])

    for listedFile in testFileset.listFiles():
        self.assertTrue(listedFile["lfn"] in expectedFiles,
                        "Error: File missing.")
        expected = expectedFiles[listedFile["lfn"]]
        self.assertEqual(listedFile["events"], expected["events"],
                         "Error: Wrong number of events.")
        self.assertEqual(listedFile["size"], expected["size"],
                         "Error: Wrong file size.")
def doBlock( self, entity, fileset ):
    """
    Query the node URL for the given block and add its files to fileset.

    entity  -- block name; it is URL-quoted and appended to self.nodeURL
    fileset -- object whose addFile() receives one File per reported file

    For every file of the first block in the reply, the event count and
    the run/lumi lists are looked up through self.getEvents() and
    self.getRunLumi(), and the replica nodes (if any) are set as the
    file's locations. Diagnostics are printed when the reply does not look
    like a PhEDEx answer or does not contain exactly one block; an empty
    block list adds nothing.
    """
    connection = urlopen( self.nodeURL + "&block=%s" % quote( entity ) )
    try:
        aString = connection.read()
    finally:
        # Release the connection even when reading fails
        connection.close()

    if aString[2:8] != "phedex":
        print( "PhEDExNotifier: bad string from server follows." )
        print( "%s" % aString )

    # NOTE(review): eval() on data received from a remote server is unsafe
    # even with empty globals/locals. The "null" -> "None" substitution
    # suggests the payload is JSON; consider json.loads() after confirming
    # that callers cope with the resulting string types.
    phedex = eval( aString.replace( "null", "None" ), {}, {} )

    blocks = phedex[ 'phedex' ][ 'block' ]
    if len( blocks ) != 1:
        print( "PhEDExNotifier: Found %d blocks, expected 1, will only consider first block" % len( blocks ) )
    if not blocks:
        # Nothing to consider; indexing blocks[0] would raise IndexError
        return

    for fileInfo in blocks[0][ 'file' ]:
        lfn = fileInfo[ 'name' ]
        events = self.getEvents( lfn )
        (runs, lumis) = self.getRunLumi( lfn )
        fileToAdd = File( lfn, fileInfo[ 'bytes' ], events, runs[0], lumis[0] )
        replicas = fileInfo[ 'replica' ]
        if len( replicas ) > 0:
            fileToAdd.setLocation( [ replica[ 'node' ] for replica in replicas ] )
        fileset.addFile( fileToAdd )
def generateFakeMCFile(self, numEvents=100, firstEvent=1, lastEvent=100, firstLumi=1, lastLumi=10, existingSub=None):
    """
    Create one fake MC file and attach it to a subscription.

    The file gets run 1 with lumis range(firstLumi, lastLumi), or the
    single lumi firstLumi when firstLumi equals lastLumi. Without
    existingSub, a new EventBased "Production" subscription over a fresh
    fileset is created; otherwise the file is added to the fileset of
    existingSub. The subscription is returned in both cases.
    """
    # The upper bound is exclusive, except when only one lumi is requested
    lumiStop = lastLumi + 1 if firstLumi == lastLumi else lastLumi

    # MC comes with only one MCFakeFile
    fakeFile = File("MCFakeFileTest", size=1000, events=numEvents)
    fakeFile.setLocation('se01')
    fakeFile.addRun(Run(1, *range(firstLumi, lumiStop)))
    fakeFile["first_event"] = firstEvent
    fakeFile["last_event"] = lastEvent

    if existingSub is not None:
        existingSub['fileset'].addFile(fakeFile)
        return existingSub

    singleMCFileset = Fileset(name="MCTestFileset")
    singleMCFileset.addFile(fakeFile)
    return Subscription(fileset=singleMCFileset,
                        workflow=Workflow(),
                        split_algo="EventBased",
                        type="Production")
def setUp(self):
    """
    _setUp_

    Prepare two FileBased processing subscriptions sharing one workflow:
    one over ten files (file i holds run i with 20 lumis starting at
    i * 100) and one over a single file (run 13, lumis 50-59 and 70-79).
    Also store the performance parameters used by the tests.
    """
    self.multipleFileFileset = Fileset(name="TestFileset1")
    for i in range(10):
        manyFile = File(makeUUID(), size=1000, events=100)
        manyFile.setLocation('blenheim')
        manyFile.setLocation('malpaquet')
        fileLumis = [(i * 100) + offset for offset in range(20)]
        manyFile.addRun(Run(i, *fileLumis))
        self.multipleFileFileset.addFile(manyFile)

    self.singleFileFileset = Fileset(name="TestFileset2")
    loneFile = File("/some/file/name", size=1000, events=100)
    loneFile.setLocation('blenheim')
    loneLumis = list(range(50, 60)) + list(range(70, 80))
    loneFile.addRun(Run(13, *loneLumis))
    self.singleFileFileset.addFile(loneFile)

    sharedWorkflow = Workflow()
    self.multipleFileSubscription = Subscription(fileset=self.multipleFileFileset,
                                                 workflow=sharedWorkflow,
                                                 split_algo="FileBased",
                                                 type="Processing")
    self.singleFileSubscription = Subscription(fileset=self.singleFileFileset,
                                               workflow=sharedWorkflow,
                                               split_algo="FileBased",
                                               type="Processing")

    self.performanceParams = {'timePerEvent': 12,
                              'memoryRequirement': 2300,
                              'sizePerEvent': 400}
def testDropCount(self):
    """
    _testDropCount_

    Check that dropping a fileset removes it from its collection and that
    fileCount() reports the number of files in a fileset.
    """
    couchArgs = {"database": self.testInit.couchDbName,
                 "url": self.testInit.couchUrl}

    testCollectionA = CouchCollection(name="Thunderstruck", **couchArgs)
    testCollectionB = CouchCollection(name="StruckThunder", **couchArgs)

    testFiles = [File(lfn=makeUUID(),
                      size=random.randint(1024, 4096),
                      events=random.randint(1024, 4096))
                 for _ in range(5)]

    testFilesetA = CouchFileset(name="TestFilesetA", **couchArgs)
    testFilesetB = CouchFileset(name="TestFilesetB", **couchArgs)
    testFilesetC = CouchFileset(name="TestFilesetC", **couchArgs)

    # Collection A holds fileset A; collection B holds filesets B and C
    testCollectionA.addFileset(testFilesetA)
    testCollectionB.addFileset(testFilesetB)
    testCollectionB.addFileset(testFilesetC)

    for couchFileset in (testFilesetA, testFilesetB, testFilesetC):
        couchFileset.add(testFiles)

    testFilesetC.drop()

    # Reload each collection by name and inspect what is left in it
    for collectionName in ("StruckThunder", "Thunderstruck"):
        reloaded = CouchCollection(name=collectionName, **couchArgs)
        reloaded.populate()
        self.assertEqual(len(reloaded["filesets"]), 1,
                         "Error: There should be one fileset in this collection.")
        self.assertEqual(reloaded["filesets"][0].fileCount(), 5,
                         "Error: Wrong number of files in fileset.")
def stuffACDCDatabase(self, numFiles = 50, lumisPerFile = 20, lumisPerACDCRecord = 2):
    """
    _stuffACDCDatabase_

    Fill the ACDC database with ACDC records, both for processing and
    merge. Each file gets one record per group of lumisPerACDCRecord
    lumis; all records are queued and committed once at the end.
    """
    def queueRecord(lfn, runNumber, firstLumi, locations, merged, filesetName):
        # Queue one ACDC document describing a single file and lumi range
        lastLumi = min(firstLumi + lumisPerACDCRecord, lumisPerFile + 1)
        acdcFile = File(lfn = lfn, size = 100, events = 250,
                        locations = locations, merged = merged)
        acdcFile.addRun(Run(runNumber, *range(firstLumi, lastLumi)))
        self.acdcDB.queue({'collection_name' : self.workflowName,
                           'collection_type' : 'ACDC.CollectionTypes.DataCollection',
                           'files' : {lfn : acdcFile},
                           'fileset_name' : filesetName,
                           'owner' : {'user': 'unknown', 'group' : 'unknown'}})

    lumiStarts = range(1, lumisPerFile + 1, lumisPerACDCRecord)

    # Processing records: merged files available at all valid locations
    processingFileset = '/%s/DataProcessing' % self.workflowName
    for i in range(numFiles):
        for j in lumiStarts:
            queueRecord('/store/data/a/%d' % i, i + 1, j,
                        self.validLocations, 1, processingFileset)

    # Merge records: unmerged files, each at one randomly chosen location
    mergeFileset = '/%s/DataProcessing/DataProcessingMergeRECOoutput' % self.workflowName
    for i in range(numFiles):
        for j in lumiStarts:
            queueRecord('/store/unmerged/b/%d' % i, i + 1, j,
                        set([choice(self.validLocations)]), 0, mergeFileset)

    self.acdcDB.commit()
def createTestJob(self):
    """
    Create a test job to pass to the DashboardInterface: a job with id 1
    and two input files, each carrying one lumi of run 1.
    """
    testJob = Job(name = "ThisIsASillyName")

    # (lfn, lumi) for each input file; both belong to run 1
    for lfn, lumi in (("/this/is/a/lfnA", 45), ("/this/is/a/lfnB", 46)):
        inputFile = File(lfn = lfn, size = 1024, events = 10)
        inputFile.addRun(Run(1, lumi))
        testJob.addFile(inputFile)

    testJob['id'] = 1
    return testJob
def testDefinition(self):
    """
    This tests the definition of a DataStructs File object: first the
    values of a File built without arguments, then the values of a File
    built with every supported keyword.
    """
    # A File built without arguments
    emptyFile = File()
    self.assertEqual(emptyFile['lfn'], "")
    self.assertEqual(emptyFile['size'], 0)
    self.assertEqual(emptyFile['events'], 0)
    self.assertEqual(emptyFile['checksums'], {})
    self.assertItemsEqual(emptyFile['parents'], {})
    self.assertItemsEqual(emptyFile['locations'], {})
    self.assertFalse(emptyFile['merged'])

    # A File built with every keyword given explicitly
    param = {"lfn": "my_lfn",
             "size": 1024,
             "events": 100,
             "checksums": {'adler32': 'BLAH', 'cksum': '12345'},
             "parents": "my_parent",
             "locations": {"PNN_Location"},
             "merged": True}
    fullFile = File(**param)

    for key in ('lfn', 'size', 'events'):
        self.assertEqual(fullFile[key], param[key])
    self.assertItemsEqual(fullFile['checksums'], param['checksums'])
    self.assertEqual(fullFile['parents'], param['parents'])
    self.assertItemsEqual(fullFile['locations'], param['locations'])
    self.assertTrue(fullFile['merged'])
def processDataset(self):
    """
    _processDataset_

    Import the dataset contents and create a set of job definitions
    from it.

    Lists the files of self.inputDataset() through DBS, records each
    file's block and that block's locations, splits the files with the
    FileBased algorithm using self.splitSize files per job, and returns
    one JobDefinition per resulting job (LFNs, storage element names,
    SkipEvents = 0, MaxEvents = -1).
    """
    logging.debug("SplitSize = %s", self.splitSize)
    logging.debug("AllowedSites = %s", self.allowedSites)

    thefiles = Fileset(name='FilesToSplit')
    reader = DBSReader(self.dbsUrl)
    fileList = reader.dbs.listFiles(analysisDataset=self.inputDataset(),
                                    retriveList=['retrive_block', 'retrive_run'])

    # Block name -> locations, so every block is looked up only once
    blocks = {}
    for f in fileList:
        block = f['Block']['Name']
        if block not in blocks:
            blocks[block] = reader.listFileBlockLocation(block)
        f['Block']['StorageElementList'].extend(blocks[block])

        wmbsFile = File(f['LogicalFileName'])
        for location in blocks[block]:
            wmbsFile['locations'].add(location)
        wmbsFile['block'] = block
        thefiles.addFile(wmbsFile)

    work = Workflow()
    subs = Subscription(fileset=thefiles,
                        workflow=work,
                        split_algo='FileBased',
                        type="Processing")
    splitter = SplitterFactory()
    jobfactory = splitter(subs)
    jobs = jobfactory(files_per_job=self.splitSize)

    jobDefs = []
    for job in jobs.jobs:
        jobDef = JobDefinition()
        jobDef['LFNS'].extend(job.listLFNs())
        jobDef['SkipEvents'] = 0
        jobDef['MaxEvents'] = -1
        for jobFile in job.listFiles():
            jobDef['SENames'].extend(list(jobFile['locations']))
        jobDefs.append(jobDef)

    return jobDefs