Example #1
    def doBlock( self, entity, fileset ):

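        # Query the PhEDEx data service for the requested block name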
        connection = urlopen( self.nodeURL + "&block=%s" % quote( entity ) )
        aString = connection.read()
        connection.close()

        if aString[2:8] != "phedex":
            print "PhEDExNotifier: bad string from server follows."
            print "%s" % aString

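        # The reply is JSON-like text; swap 'null' for None so it can be evaluated as a Python literal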
        phedex = eval( aString.replace( "null", "None" ), {}, {} )

        blocks = phedex[ 'phedex' ][ 'block' ]
        if len( blocks ) != 1:
            print "PhEDExNotifier: Found %d blocks, expected 1, will only consider first block" % len( blocks)

        files = blocks[0][ 'file' ]
        for file in files:
            lfn = file[ 'name' ]
            events = self.getEvents( lfn )
            (runs,lumis) = self.getRunLumi( lfn )
            fileToAdd = File( lfn, file[ 'bytes'], events, runs[0], lumis[0] )
            replicas = file[ 'replica' ]
            if len( replicas ) > 0:
                locations = []
                for replica in replicas:
                    locations.append( replica[ 'node' ] )
                fileToAdd.setLocation( locations )
                fileset.addFile( fileToAdd )
Example #2
    def doBlock(self, entity, fileset):

        connection = urlopen(self.nodeURL + "&block=%s" % quote(entity))
        aString = connection.read()
        connection.close()

        if aString[2:8] != "phedex":
            print "PhEDExNotifier: bad string from server follows."
            print "%s" % aString

        phedex = eval(aString.replace("null", "None"), {}, {})

        blocks = phedex["phedex"]["block"]
        if len(blocks) != 1:
            print "PhEDExNotifier: Found %d blocks, expected 1, will only consider first block" % len(blocks)

        files = blocks[0]["file"]
        for file in files:
            lfn = file["name"]
            events = self.getEvents(lfn)
            (runs, lumis) = self.getRunLumi(lfn)
            fileToAdd = File(lfn, file["bytes"], events, runs[0], lumis[0])
            replicas = file["replica"]
            if len(replicas) > 0:
                locations = []
                for replica in replicas:
                    locations.append(replica["node"])
                fileToAdd.setLocation(locations)
                fileset.addFile(fileToAdd)
Example #3
    def execute(self, *args, **kwargs):  #pylint: disable=unused-argument

        # since https://github.com/dmwm/CRABServer/issues/5633 totalunits can be a float
        # but that would confuse WMCore, therefore cast to int
        totalevents = int(kwargs['task']['tm_totalunits'])
        firstEvent = 1
        lastEvent = totalevents
        firstLumi = 1
        lastLumi = 10

        # Set a default of 100 events per lumi.  This is set as a task
        # property, as the splitting considers it independently of the file
        # information provided by the fake dataset.
        if not kwargs['task']['tm_events_per_lumi']:
            kwargs['task']['tm_events_per_lumi'] = 100

        #MC comes with only one MCFakeFile
        singleMCFileset = Fileset(name="MCFakeFileSet")
        newFile = File("MCFakeFile", size=1000, events=totalevents)
        newFile.setLocation(self.getListOfSites())
        newFile.addRun(Run(1, *range(firstLumi, lastLumi + 1)))
        newFile["block"] = 'MCFakeBlock'
        newFile["first_event"] = firstEvent
        newFile["last_event"] = lastEvent
        singleMCFileset.addFile(newFile)

        return Result(task=kwargs['task'], result=singleMCFileset)
Example #4
    def execute(self, *args, **kwargs):

        totalevents = kwargs['task']['tm_totalunits']
        firstEvent = 1
        lastEvent = totalevents
        firstLumi = 1
        lastLumi = 10

        # Set a default of 100 events per lumi.  This is set as a task
        # property, as the splitting considers it independently of the file
        # information provided by the fake dataset.
        if not kwargs['task']['tm_events_per_lumi']:
            kwargs['task']['tm_events_per_lumi'] = 100

        #MC comes with only one MCFakeFile
        singleMCFileset = Fileset(name = "MCFakeFileSet")
        newFile = File("MCFakeFile", size = 1000, events = totalevents)
        sbj = SiteDBJSON({"key":self.config.TaskWorker.cmskey,
                          "cert":self.config.TaskWorker.cmscert})
        newFile.setLocation(sbj.getAllCMSNames())
        newFile.addRun(Run(1, *range(firstLumi, lastLumi + 1)))
        newFile["block"] = 'MCFakeBlock'
        newFile["first_event"] = firstEvent
        newFile["last_event"] = lastEvent
        singleMCFileset.addFile(newFile)

        return Result(task=kwargs['task'], result=singleMCFileset)
Example #5
    def execute(self, *args, **kwargs): #pylint: disable=unused-argument

        # since https://github.com/dmwm/CRABServer/issues/5633 totalunits can be a float
        # but that would confuse WMCore, therefore cast to int
        totalevents = int(kwargs['task']['tm_totalunits'])
        firstEvent = 1
        lastEvent = totalevents
        firstLumi = 1
        lastLumi = 10

        # Set a default of 100 events per lumi.  This is set as a task
        # property, as the splitting considers it independently of the file
        # information provided by the fake dataset.
        if not kwargs['task']['tm_events_per_lumi']:
            kwargs['task']['tm_events_per_lumi'] = 100

        #MC comes with only one MCFakeFile
        singleMCFileset = Fileset(name = "MCFakeFileSet")
        newFile = File("MCFakeFile", size = 1000, events = totalevents)
        newFile.setLocation(self.getListOfSites())
        newFile.addRun(Run(1, *range(firstLumi, lastLumi + 1)))
        newFile["block"] = 'MCFakeBlock'
        newFile["first_event"] = firstEvent
        newFile["last_event"] = lastEvent
        singleMCFileset.addFile(newFile)

        return Result(task=kwargs['task'], result=singleMCFileset)
Example #6
    def execute(self, *args, **kwargs):

        totalevents = kwargs['task']['tm_totalunits']
        firstEvent = 1
        lastEvent = totalevents
        firstLumi = 1
        lastLumi = 10

        # Set a default of 100 events per lumi.  This is set as a task
        # property, as the splitting considers it independently of the file
        # information provided by the fake dataset.
        if not kwargs['task']['tm_events_per_lumi']:
            kwargs['task']['tm_events_per_lumi'] = 100

        #MC comes with only one MCFakeFile
        singleMCFileset = Fileset(name = "MCFakeFileSet")
        newFile = File("MCFakeFile", size = 1000, events = totalevents)
        if hasattr(self.config.Sites, 'available'):
            newFile.setLocation(self.config.Sites.available)
        else:
            sbj = SiteDBJSON({"key":self.config.TaskWorker.cmskey,
                              "cert":self.config.TaskWorker.cmscert})
            newFile.setLocation(sbj.getAllCMSNames())
        newFile.addRun(Run(1, *range(firstLumi, lastLumi + 1)))
        newFile["block"] = 'MCFackBlock'
        newFile["first_event"] = firstEvent
        newFile["last_event"] = lastEvent
        singleMCFileset.addFile(newFile)

        return Result(task=kwargs['task'], result=singleMCFileset)
Example #7
    def setUp(self):
        """
        _setUp_

        Create two subscriptions: One that contains a single file and one that
        contains multiple files.
        """
        self.multipleFileFileset = Fileset(name="TestFileset1")
        for i in range(10):
            newFile = File(makeUUID(), size=1000, events=100)
            newFile.setLocation("se01")
            self.multipleFileFileset.addFile(newFile)

        self.singleFileFileset = Fileset(name="TestFileset2")
        newFile = File("/some/file/name", size=1000, events=100)
        newFile.setLocation("se02")
        self.singleFileFileset.addFile(newFile)

        self.emptyFileFileset = Fileset(name="TestFileset3")
        newFile = File("/some/file/name", size=1000, events=0)
        newFile.setdefault("se03")
        self.emptyFileFileset.addFile(newFile)

        testWorkflow = Workflow()
        self.multipleFileSubscription = Subscription(
            fileset=self.multipleFileFileset, workflow=testWorkflow, split_algo="EventBased", type="Processing"
        )
        self.singleFileSubscription = Subscription(
            fileset=self.singleFileFileset, workflow=testWorkflow, split_algo="EventBased", type="Processing"
        )
        self.emptyFileSubscription = Subscription(
            fileset=self.emptyFileFileset, workflow=testWorkflow, split_algo="EventBased", type="Processing"
        )

        return
Example #8
    def generateFakeMCFile(self,
                           numEvents=100,
                           firstEvent=1,
                           lastEvent=100,
                           firstLumi=1,
                           lastLumi=10,
                           existingSub=None):
        # MC comes with only one MCFakeFile
        newFile = File("MCFakeFileTest", size=1000, events=numEvents)
        newFile.setLocation('se01')
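        # Note: when firstLumi != lastLumi, the range below stops one lumi short of lastLumi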
        if firstLumi == lastLumi:
            newFile.addRun(Run(1, *range(firstLumi, lastLumi + 1)))
        else:
            newFile.addRun(Run(1, *range(firstLumi, lastLumi)))
        newFile["first_event"] = firstEvent
        newFile["last_event"] = lastEvent

        if existingSub is None:
            singleMCFileset = Fileset(name="MCTestFileset")
            singleMCFileset.addFile(newFile)
            testWorkflow = Workflow()
            existingSub = Subscription(fileset=singleMCFileset,
                                       workflow=testWorkflow,
                                       split_algo="EventBased",
                                       type="Production")
        else:
            existingSub['fileset'].addFile(newFile)

        return existingSub
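A short usage sketch for the helper above (values are hypothetical): the first call builds a fresh subscription around one fake MC file, and passing that subscription back via existingSub adds further fake files to the same fileset.

    # Hypothetical usage of generateFakeMCFile; numbers chosen for illustration only
    sub = self.generateFakeMCFile(numEvents=100, firstLumi=1, lastLumi=10)
    sub = self.generateFakeMCFile(numEvents=200, firstLumi=11, lastLumi=20, existingSub=sub)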
Example #9
 def testAddFile(self):
     """
         Testcase for the addFile method of the Fileset class
         
     """
     #First test - Add file and check if its there
     testfile = File('/tmp/lfntest',9999,9,9)
     self.fileset.addFile(testfile)
     assert(testfile in self.fileset.listNewFiles(), 'Couldn\'t add file ' +
             'to fileset - fileset.addfile method not working')
     #Second test - Add file that was already at Fileset.files , 
     # and check if it gets updated
     testFileSame = File('/tmp/lfntest',9999,9,9)
     testFileSame.setLocation(set('dummyse.dummy.com'))
     self.fileset.addFile(testFileSame)
     assert(testFileSame in  self.fileset.getFiles(),'Same file copy ' +
            'failed - fileset.addFile not updating location of already ' +
            'existing files' )
     assert(testfile in self.fileset.getFiles(),'Same file copy ' +
            'failed - fileset.addFile unable to remove previous file ' +
            'from list')
     #Third test - Add file that was already at Fileset.newfiles , 
     #and check if it gets updated
     assert(testFileSame in  self.fileset.listNewFiles(),'Same file copy ' +
            'failed - fileset.addFile not adding file to fileset.newFiles')
Example #10
    def returnDataStructsFile(self):
        """
        _returnDataStructsFile_

        Creates a dataStruct file out of this file
        """
        parents = set()
        for parent in self["parents"]:
            parents.add(
                WMFile(lfn=parent['lfn'],
                       size=parent['size'],
                       events=parent['events'],
                       checksums=parent['checksums'],
                       parents=parent['parents'],
                       merged=parent['merged']))

        file = WMFile(lfn=self['lfn'],
                      size=self['size'],
                      events=self['events'],
                      checksums=self['checksums'],
                      parents=parents,
                      merged=self['merged'])

        for run in self['runs']:
            file.addRun(run)

        for location in self['locations']:
            file.setLocation(pnn=location)

        return file
Example #11
    def testAddFile(self):
        """
        _testAddFile_

        Testcase for the addFile method of the Fileset class

        """

        # First test - Add file and check if it's there
        testfile = File('/tmp/lfntest', 9999, 9, 9)
        self.fileset.addFile(testfile)
        self.assertTrue(
            testfile in self.fileset.listNewFiles(),
            "Couldn't add file to fileset - fileset.addfile method not working"
        )

        # Second test - Add file that was already at Fileset.files, and check if it gets updated
        testFileSame = File('/tmp/lfntest', 9999, 9, 9)
        testFileSame.setLocation(set('dummyse.dummy.com'))
        self.fileset.addFile(testFileSame)
        self.assertTrue(
            testFileSame in self.fileset.getFiles(),
            'Same file copy failed - fileset.addFile not updating location of already existing files'
        )
        self.assertTrue(
            testfile in self.fileset.getFiles(),
            'Same file copy failed - fileset.addFile unable to remove previous file from list'
        )

        # Third test - Add file that was already at Fileset.newfiles, and check if it gets updated
        self.assertTrue(
            testFileSame in self.fileset.listNewFiles(),
            'Same file copy failed - fileset.addFile not adding file to fileset.newFiles'
        )
Example #12
 def __call__(self, fileset):
     """
     Return a randomly sized list of new files (DataStructs.File) placed at
     random locations; the files will always be new.
     """
     num_files = random.randint(0, self.max)
     for f in self.makelist(fileset):
         for i in range(0, num_files):
             # Decide where the file is: pick a random subset of the known locations
             locs = []
             for j in range(0, len(self.locations)):
                 if random.randint(0, 1):
                     locs.append(self.locations[j])
             lfn = '/store/data/fake-feeder-files/notreal/%s.root' % uuid(i)
             size = 2000 + ((i - 5) * 50)
             events = 1000 + ((i - 3) * 150)
             run = random.randint(0, int(3.14159265 * i * self.max))
             lumi = random.randint(0, 10)
             file = File(lfn, size, events, run, lumi)
             file.setLocation(locs)
             f.addFile(file)

     return fileset
Example #13
    def jobConfig(self, wf, task, jobid, lfn):
        """
        Create a fake job dict to upload to the ACDC server
        """
        testFile = File(lfn=lfn, size=1024, events=1024)
        testFile.setLocation(["T2_CH_CERN", "T2_CH_CERN_HLT"])
        testFile.addRun(Run(jobid, 1, 2))  # run = jobid
        testJob = self.getMinimalJob(wf, task)
        testJob.addFile(testFile)

        return testJob
Example #14
    def jobConfig(self, wf, task, jobid, lfn):
        """
        Create a fake job dict to upload to the ACDC server
        """
        testFile = File(lfn=lfn, size=1024, events=1024)
        testFile.setLocation(["T2_CH_CERN", "T2_CH_CERN_HLT"])
        testFile.addRun(Run(jobid, 1, 2))  # run = jobid
        testJob = self.getMinimalJob(wf, task)
        testJob.addFile(testFile)

        return testJob
Example #15
    def execute(self, *args, **kwargs):
        self.logger.info(
            "Data discovery and splitting for %s using user-provided files" %
            kwargs['task']['tm_taskname'])

        userfiles = kwargs['task']['tm_arguments'].get('userfiles')
        splitting = kwargs['task']['tm_split_algo']
        total_units = kwargs['task']['tm_totalunits']
        if not userfiles or splitting != 'FileBased':
            if not userfiles:
                msg = "No files specified to process for task %s." % kwargs[
                    'task']['tm_taskname']
            if splitting != 'FileBased':
                msg = "Data.splitting must be set to 'FileBased' when using a custom set of files."
            self.logger.error("Setting %s as failed: %s" %
                              (kwargs['task']['tm_taskname'], msg))
            configreq = {
                'workflow': kwargs['task']['tm_taskname'],
                'status': "FAILED",
                'subresource': 'failure',
                'failure': b64encode(msg)
            }
            self.server.post(self.resturi, data=urllib.urlencode(configreq))
            raise StopHandler(msg)

        if hasattr(self.config.Sites, 'available'):
            locations = self.config.Sites.available
        else:
            sbj = SiteDBJSON({
                "key": self.config.TaskWorker.cmskey,
                "cert": self.config.TaskWorker.cmscert
            })
            locations = sbj.getAllCMSNames()

        userFileset = Fileset(name=kwargs['task']['tm_taskname'])
        self.logger.info("There are %d files specified by the user." %
                         len(userfiles))
        if total_units > 0:
            self.logger.info("Will run over the first %d files." % total_units)
        file_counter = 0
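        # idx (the file's position in the list) is used as a fake lumi number, so each file gets a distinct (run 1, lumi) pair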
        for userfile, idx in zip(userfiles, range(len(userfiles))):
            newFile = File(userfile, size=1000, events=1)
            newFile.setLocation(locations)
            newFile.addRun(Run(1, idx))
            newFile["block"] = 'UserFilesFakeBlock'
            newFile["first_event"] = 1
            newFile["last_event"] = 2
            userFileset.addFile(newFile)
            file_counter += 1
            if total_units > 0 and file_counter >= total_units:
                break

        return Result(task=kwargs['task'], result=userFileset)
Example #16
    def getOutputFile(self, fileName, outputModule, step):
        """
        _getOutputFile_

        Takes a fileRef object and returns a DataStructs/File object as output
        """

        outputMod = self.getOutputModule(step=step, outputModule=outputModule)

        if not outputMod:
            return None

        fileRef = getattr(outputMod.files, fileName, None)
        newFile = File(locations=set())

        # Locations
        newFile.setLocation(getattr(fileRef, "location", None))

        # Runs
        runList = fileRef.runs.listSections_()
        for run in runList:
            lumis = getattr(fileRef.runs, run)
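            # lumis may come back either as a dict or as a plain list, depending on how the report was filled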
            if isinstance(lumis, dict):
                newRun = Run(int(run), *lumis.items())
            else:
                newRun = Run(int(run), *lumis)
            newFile.addRun(newRun)

        newFile["lfn"] = getattr(fileRef, "lfn", None)
        newFile["pfn"] = getattr(fileRef, "pfn", None)
        newFile["events"] = int(getattr(fileRef, "events", 0))
        newFile["size"] = int(getattr(fileRef, "size", 0))
        newFile["branches"] = getattr(fileRef, "branches", [])
        newFile["input"] = getattr(fileRef, "input", [])
        newFile["inputpfns"] = getattr(fileRef, "inputpfns", [])
        newFile["branch_hash"] = getattr(fileRef, "branch_hash", None)
        newFile["catalog"] = getattr(fileRef, "catalog", "")
        newFile["guid"] = getattr(fileRef, "guid", "")
        newFile["module_label"] = getattr(fileRef, "module_label", "")
        newFile["checksums"] = getattr(fileRef, "checksums", {})
        newFile["merged"] = bool(getattr(fileRef, "merged", False))
        newFile["dataset"] = getattr(fileRef, "dataset", {})
        newFile["acquisitionEra"] = getattr(fileRef, 'acquisitionEra', None)
        newFile["processingVer"] = getattr(fileRef, 'processingVer', None)
        newFile["validStatus"] = getattr(fileRef, 'validStatus', None)
        newFile["globalTag"] = getattr(fileRef, 'globalTag', None)
        newFile["prep_id"] = getattr(fileRef, 'prep_id', None)
        newFile['configURL'] = getattr(fileRef, 'configURL', None)
        newFile['inputPath'] = getattr(fileRef, 'inputPath', None)
        newFile["outputModule"] = outputModule
        newFile["fileRef"] = fileRef

        return newFile
Example #17
    def getOutputFile(self, fileName, outputModule, step):
        """
        _getOutputFile_

        Takes a fileRef object and returns a DataStructs/File object as output
        """

        outputMod = self.getOutputModule(step=step, outputModule=outputModule)

        if not outputMod:
            return None

        fileRef = getattr(outputMod.files, fileName, None)
        newFile = File(locations=set())

        # Locations
        newFile.setLocation(getattr(fileRef, "location", None))

        # Runs
        runList = fileRef.runs.listSections_()
        for run in runList:
            lumis = getattr(fileRef.runs, run)
            if isinstance(lumis, dict):
                newRun = Run(int(run), *listitems(lumis))
            else:
                newRun = Run(int(run), *lumis)
            newFile.addRun(newRun)

        newFile["lfn"] = getattr(fileRef, "lfn", None)
        newFile["pfn"] = getattr(fileRef, "pfn", None)
        newFile["events"] = int(getattr(fileRef, "events", 0))
        newFile["size"] = int(getattr(fileRef, "size", 0))
        newFile["branches"] = getattr(fileRef, "branches", [])
        newFile["input"] = getattr(fileRef, "input", [])
        newFile["inputpfns"] = getattr(fileRef, "inputpfns", [])
        newFile["branch_hash"] = getattr(fileRef, "branch_hash", None)
        newFile["catalog"] = getattr(fileRef, "catalog", "")
        newFile["guid"] = getattr(fileRef, "guid", "")
        newFile["module_label"] = getattr(fileRef, "module_label", "")
        newFile["checksums"] = getattr(fileRef, "checksums", {})
        newFile["merged"] = bool(getattr(fileRef, "merged", False))
        newFile["dataset"] = getattr(fileRef, "dataset", {})
        newFile["acquisitionEra"] = getattr(fileRef, 'acquisitionEra', None)
        newFile["processingVer"] = getattr(fileRef, 'processingVer', None)
        newFile["validStatus"] = getattr(fileRef, 'validStatus', None)
        newFile["globalTag"] = getattr(fileRef, 'globalTag', None)
        newFile["prep_id"] = getattr(fileRef, 'prep_id', None)
        newFile['configURL'] = getattr(fileRef, 'configURL', None)
        newFile['inputPath'] = getattr(fileRef, 'inputPath', None)
        newFile["outputModule"] = outputModule
        newFile["fileRef"] = fileRef

        return newFile
Example #18
    def createResubmitSpec(self, serverUrl, couchDB):
        """
        _createResubmitSpec_
        Create a bogus resubmit workload.
        """
        self.site = "cmssrm.fnal.gov"
        workload = WMWorkloadHelper(WMWorkload("TestWorkload"))
        reco = workload.newTask("reco")
        workload.setOwnerDetails(name = "evansde77", group = "DMWM")

        # first task uses the input dataset
        reco.addInputDataset(primary = "PRIMARY", processed = "processed-v1", tier = "TIER1")
        reco.data.input.splitting.algorithm = "File"
        reco.setTaskType("Processing")
        cmsRunReco = reco.makeStep("cmsRun1")
        cmsRunReco.setStepType("CMSSW")
        reco.applyTemplates()
        cmsRunRecoHelper = cmsRunReco.getTypeHelper()
        cmsRunRecoHelper.addOutputModule("outputRECO",
                                        primaryDataset = "PRIMARY",
                                        processedDataset = "processed-v2",
                                        dataTier = "TIER2",
                                        lfnBase = "/store/dunkindonuts",
                                        mergedLFNBase = "/store/kfc")
        
        dcs = DataCollectionService(url = serverUrl, database = couchDB)

        def getJob(workload):
            job = Job()
            job["task"] = workload.getTask("reco").getPathName()
            job["workflow"] = workload.name()
            job["location"] = self.site
            job["owner"] = "evansde77"
            job["group"] = "DMWM"
            return job

        testFileA = WMFile(lfn = makeUUID(), size = 1024, events = 1024)
        testFileA.setLocation([self.site])
        testFileA.addRun(Run(1, 1, 2))
        testFileB = WMFile(lfn = makeUUID(), size = 1024, events = 1024)
        testFileB.setLocation([self.site])
        testFileB.addRun(Run(1, 3, 4))
        testJobA = getJob(workload)
        testJobA.addFile(testFileA)
        testJobA.addFile(testFileB)
        
        dcs.failedJobs([testJobA])
        topLevelTask = workload.getTopLevelTask()[0]
        workload.truncate("Resubmit_TestWorkload", topLevelTask.getPathName(), 
                          serverUrl, couchDB)
                                  
        return workload
Example #19
    def setUp(self):
        """
        _setUp_

        Create two subscriptions: One that contains a single file and one that
        contains multiple files.
        """
        self.multipleFileFileset = Fileset(name="TestFileset1")
        for i in range(10):
            newFile = File(makeUUID(),
                           size=1000,
                           events=100,
                           locations=set(["somese.cern.ch"]))
            self.multipleFileFileset.addFile(newFile)

        self.singleFileFileset = Fileset(name="TestFileset2")
        newFile = File("/some/file/name",
                       size=1000,
                       events=100,
                       locations=set(["somese.cern.ch"]))
        self.singleFileFileset.addFile(newFile)

        self.multipleSiteFileset = Fileset(name="TestFileset3")
        for i in range(5):
            newFile = File(makeUUID(),
                           size=1000,
                           events=100,
                           locations=set(["somese.cern.ch"]))
            newFile.setLocation("somese.cern.ch")
            self.multipleSiteFileset.addFile(newFile)
        for i in range(5):
            newFile = File(makeUUID(), size=1000, events=100)
            newFile.setLocation(["somese.cern.ch", "otherse.cern.ch"])
            self.multipleSiteFileset.addFile(newFile)

        testWorkflow = Workflow()
        self.multipleFileSubscription = Subscription(
            fileset=self.multipleFileFileset,
            workflow=testWorkflow,
            split_algo="SizeBased",
            type="Processing")
        self.singleFileSubscription = Subscription(
            fileset=self.singleFileFileset,
            workflow=testWorkflow,
            split_algo="SizeBased",
            type="Processing")
        self.multipleSiteSubscription = Subscription(
            fileset=self.multipleSiteFileset,
            workflow=testWorkflow,
            split_algo="EventBased",
            type="Processing")
        return
Example #20
    def createFile(self, lfn, events, run, lumis, location):
        """
        _createFile_

        Create a file for testing
        """
        newFile = File(lfn=lfn, size=1000, events=events)
        lumiList = []
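        # Give each run a contiguous, non-overlapping block of lumi numbers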
        for lumi in range(lumis):
            lumiList.append((run * lumis) + lumi)
        newFile.addRun(Run(run, *lumiList))
        newFile.setLocation(location)
        return newFile
Example #21
    def createFile(lfn, events, run, lumis, location):
        """
        _createFile_

        Create a file for testing
        """
        newFile = File(lfn=lfn, size=1000, events=events)
        lumiList = []
        for lumi in range(lumis):
            lumiList.append((run * lumis) + lumi)
        newFile.addRun(Run(run, *lumiList))
        newFile.setLocation(location)
        return newFile
Example #22
 def generateFakeMCFile(self, numEvents=100, firstEvent=1, lastEvent=100, firstLumi=1, lastLumi=10):
     # MC comes with only one MCFakeFile
     singleMCFileset = Fileset(name="MCTestFileset")
     newFile = File("MCFakeFileTest", size=1000, events=numEvents)
     newFile.setLocation("se01")
     newFile.addRun(Run(1, *range(firstLumi, lastLumi + 1)))
     newFile["first_event"] = firstEvent
     newFile["last_event"] = lastEvent
     testWorkflow = Workflow()
     singleMCFileset.addFile(newFile)
     singleMCFileSubscription = Subscription(
         fileset=singleMCFileset, workflow=testWorkflow, split_algo="EventBased", type="Production"
     )
     return singleMCFileSubscription
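A minimal usage sketch for the helper above, modelled on the splitter calls shown in Example #35; the parameter values and the performanceParams dict (set up as in Examples #24 and #33) are assumptions, not part of this example:

    # Sketch only: hand the fake MC subscription to WMCore's SplitterFactory
    splitter = SplitterFactory()
    sub = self.generateFakeMCFile(numEvents=1000)
    jobFactory = splitter(package="WMCore.DataStructs", subscription=sub)
    jobGroups = jobFactory(events_per_job=100, performance=self.performanceParams)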
Example #23
 def generateFakeMCFile(self, numEvents = 100, firstEvent = 1,
                        lastEvent = 100, firstLumi = 1, lastLumi = 10):
     #MC comes with only one MCFakeFile
     singleMCFileset = Fileset(name = "MCTestFileset")
     newFile = File("MCFakeFileTest", size = 1000, events = numEvents)
     newFile.setLocation('se01')
     newFile.addRun(Run(1, *range(firstLumi, lastLumi + 1)))
     newFile["first_event"] = firstEvent
     newFile["last_event"] = lastEvent
     testWorkflow = Workflow()
     singleMCFileset.addFile(newFile)
     singleMCFileSubscription = Subscription(fileset = singleMCFileset,
                                             workflow = testWorkflow,
                                             split_algo = "EventBased",
                                             type = "Production")
     return singleMCFileSubscription
Example #24
    def setUp(self):
        """
        _setUp_

        Create two subscriptions: One that contains a single file and one that
        contains multiple files.
        """
        self.multipleFileFileset = Fileset(name="TestFileset1")
        for i in range(10):
            newFile = File(makeUUID(), size=1000, events=100)
            newFile.setLocation('se01')
            self.multipleFileFileset.addFile(newFile)

        self.singleFileFileset = Fileset(name="TestFileset2")
        newFile = File("/some/file/name", size=1000, events=100)
        newFile.setLocation('se02')
        self.singleFileFileset.addFile(newFile)

        self.emptyFileFileset = Fileset(name="TestFileset3")
        newFile = File("/some/file/name", size=1000, events=0)
        newFile.setdefault('se03')
        self.emptyFileFileset.addFile(newFile)

        testWorkflow = Workflow()
        self.multipleFileSubscription = Subscription(
            fileset=self.multipleFileFileset,
            workflow=testWorkflow,
            split_algo="EventBased",
            type="Processing")
        self.singleFileSubscription = Subscription(
            fileset=self.singleFileFileset,
            workflow=testWorkflow,
            split_algo="EventBased",
            type="Processing")
        self.emptyFileSubscription = Subscription(
            fileset=self.emptyFileFileset,
            workflow=testWorkflow,
            split_algo="EventBased",
            type="Processing")

        self.performanceParams = {
            'timePerEvent': None,
            'memoryRequirement': 2300,
            'sizePerEvent': 400
        }

        return
Example #25
    def testDataStructsFile(self):
        """
        _testDataStructsFile_

        Tests our ability to create a WMBS file from a DataStructs File and vice versa
        """

        myThread = threading.currentThread()

        testLFN = "lfn1"
        testSize = 1024
        testEvents = 100
        testCksum = {"cksum": '1'}
        testParents = set(["lfn2"])
        testRun = Run(1, *[45])
        testSE = "se1.cern.ch"

        parentFile = File(lfn="lfn2")
        parentFile.create()

        testFile = File()

        inputFile = WMFile(lfn=testLFN,
                           size=testSize,
                           events=testEvents,
                           checksums=testCksum,
                           parents=testParents)
        inputFile.addRun(testRun)
        inputFile.setLocation(se=testSE)

        testFile.loadFromDataStructsFile(file=inputFile)
        testFile.create()
        testFile.save()

        loadFile = File(lfn="lfn1")
        loadFile.loadData(parentage=1)

        self.assertEqual(loadFile['size'], testSize)
        self.assertEqual(loadFile['events'], testEvents)
        self.assertEqual(loadFile['checksums'], testCksum)
        self.assertEqual(loadFile['locations'], set([testSE]))
        #self.assertEqual(loadFile['parents'].pop()['lfn'], 'lfn2')

        wmFile = loadFile.returnDataStructsFile()
        self.assertEqual(wmFile == inputFile, True)

        return
Example #26
    def execute(self, *args, **kwargs):
        self.logger.info("Data discovery and splitting for %s using user-provided files" % kwargs['task']['tm_taskname'])

        if 'tm_user_files' in kwargs['task'] and kwargs['task']['tm_user_files']:
            userfiles = kwargs['task']['tm_user_files']
        else: ## For backward compatibility only.
            userfiles = kwargs['task']['tm_arguments'].get('userfiles')
        splitting = kwargs['task']['tm_split_algo']
        total_units = kwargs['task']['tm_totalunits']
        if not userfiles or splitting != 'FileBased':
            if not userfiles:
                msg = "No files specified to process for task %s." % kwargs['task']['tm_taskname']
            if splitting != 'FileBased':
                msg = "Data.splitting must be set to 'FileBased' when using a custom set of files."
            self.logger.error("Setting %s as failed: %s" % (kwargs['task']['tm_taskname'], msg))
            configreq = {'workflow': kwargs['task']['tm_taskname'],
                         'status': "FAILED",
                         'subresource': 'failure',
                         'failure': b64encode(msg)}
            self.server.post(self.resturi, data = urllib.urlencode(configreq))
            raise StopHandler(msg)

        if hasattr(self.config.Sites, 'available'):
            locations = self.config.Sites.available
        else:
            sbj = SiteDBJSON({"key":self.config.TaskWorker.cmskey,
                              "cert":self.config.TaskWorker.cmscert})
            locations = sbj.getAllCMSNames()

        userFileset = Fileset(name = kwargs['task']['tm_taskname'])
        self.logger.info("There are %d files specified by the user." % len(userfiles))
        if total_units > 0:
            self.logger.info("Will run over the first %d files." % total_units)
        file_counter = 0
        for userfile, idx in zip(userfiles, range(len(userfiles))):
            newFile = File(userfile, size = 1000, events = 1)
            newFile.setLocation(locations)
            newFile.addRun(Run(1, idx))
            newFile["block"] = 'UserFilesFakeBlock'
            newFile["first_event"] = 1
            newFile["last_event"] = 2
            userFileset.addFile(newFile)
            file_counter += 1
            if total_units > 0 and file_counter >= total_units:
                break

        return Result(task = kwargs['task'], result = userFileset)
Example #27
    def execute(self, *args, **kwargs):
        self.logger.info(
            "Data discovery and splitting for %s using user-provided files" %
            kwargs['task']['tm_taskname'])

        userfiles = kwargs['task']['tm_user_files']
        splitting = kwargs['task']['tm_split_algo']
        total_units = kwargs['task']['tm_totalunits']
        if not userfiles or splitting != 'FileBased':
            if not userfiles:
                msg = "No files specified to process for task %s." % kwargs[
                    'task']['tm_taskname']
            if splitting != 'FileBased':
                msg = "Data.splitting must be set to 'FileBased' when using a custom set of files."
            raise TaskWorkerException(msg)

        if hasattr(self.config.Sites, 'available'):
            locations = self.config.Sites.available
        else:
            with self.config.TaskWorker.envForCMSWEB:
                configDict = {
                    "cacheduration": 1,
                    "pycurl": True
                }  # cache duration is in hours
                resourceCatalog = CRIC(logger=self.logger,
                                       configDict=configDict)
                locations = resourceCatalog.getAllPSNs()

        userFileset = Fileset(name=kwargs['task']['tm_taskname'])
        self.logger.info("There are %d files specified by the user." %
                         len(userfiles))
        if total_units > 0:
            self.logger.info("Will run over the first %d files." % total_units)
        file_counter = 0
        for userfile, idx in zip(userfiles, range(len(userfiles))):
            newFile = File(userfile, size=1000, events=1)
            newFile.setLocation(locations)
            newFile.addRun(Run(1, idx))
            newFile["block"] = 'UserFilesFakeBlock'
            newFile["first_event"] = 1
            newFile["last_event"] = 2
            userFileset.addFile(newFile)
            file_counter += 1
            if total_units > 0 and file_counter >= total_units:
                break

        return Result(task=kwargs['task'], result=userFileset)
Example #28
    def getFileset(self):
        """
        Get a fileset based on the task

        """

        fileset = Fileset(name='Merge%s' % (type))

        for i in range(0, random.randint(15, 25)):
            # Use the testDir to generate a random lfn
            inpFile = File(lfn="%s/%s.root" % (self.testDir, makeUUID()),
                           size=random.randint(200000, 1000000),
                           events=random.randint(1000, 2000))
            inpFile.setLocation('Megiddo')
            fileset.addFile(inpFile)

        return fileset
Example #29
    def getFileset(self):
        """
        Get a fileset based on the task

        """

        fileset = Fileset(name='Merge%s' % (type))

        for i in range(0, random.randint(15, 25)):
            # Use the testDir to generate a random lfn
            inpFile = File(lfn="%s/%s.root" % (self.testDir, makeUUID()),
                           size=random.randint(200000, 1000000),
                           events=random.randint(1000, 2000))
            inpFile.setLocation('Megiddo')
            fileset.addFile(inpFile)

        return fileset
Example #30
    def testDataStructsFile(self):
        """
        _testDataStructsFile_

        Tests our ability to create a WMBS file from a DataStructs File and vice versa
        """

        myThread = threading.currentThread()
        
        testLFN     = "lfn1"
        testSize    = 1024
        testEvents  = 100
        testCksum   = {"cksum": '1'}
        testParents = set(["lfn2"])
        testRun     = Run( 1, *[45])
        testSE      = "se1.cern.ch"

        parentFile = File(lfn= "lfn2")
        parentFile.create()

        testFile = File()

        inputFile = WMFile(lfn = testLFN, size = testSize, events = testEvents, checksums = testCksum, parents = testParents)
        inputFile.addRun(testRun)
        inputFile.setLocation(se = testSE)

        testFile.loadFromDataStructsFile(file = inputFile)
        testFile.create()
        testFile.save()

        
        loadFile = File(lfn = "lfn1")
        loadFile.loadData(parentage = 1)

        self.assertEqual(loadFile['size'],   testSize)
        self.assertEqual(loadFile['events'], testEvents)
        self.assertEqual(loadFile['checksums'], testCksum)
        self.assertEqual(loadFile['locations'], set([testSE]))
        #self.assertEqual(loadFile['parents'].pop()['lfn'], 'lfn2')

        wmFile = loadFile.returnDataStructsFile()
        self.assertEqual(wmFile == inputFile, True)

        return
Example #31
    def addOutputFilesToReport(self, report):
        """
        _addOutputFilesToReport_

        Add output files to every output module in the step.  Scale the size
        and number of events in the output files appropriately.
        """
        (outputSize, outputEvents) = self.determineOutputSize()

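        # Create a small placeholder file on disk so the emulated output points at a real PFN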
        if not os.path.exists('ReportEmuTestFile.txt'):
            f = open('ReportEmuTestFile.txt', 'w')
            f.write('A Shubbery')
            f.close()

        for outputModuleName in self.step.listOutputModules():
            outputModuleSection = self.step.getOutputModule(outputModuleName)
            outputModuleSection.fixedLFN = False
            outputModuleSection.disableGUID = False

            outputLFN = "%s/%s.root" % (outputModuleSection.lfnBase,
                                        str(makeUUID()))
            outputFile = File(lfn=outputLFN,
                              size=outputSize,
                              events=outputEvents,
                              merged=False)
            outputFile.setLocation(self.job["location"])
            outputFile['pfn'] = "ReportEmuTestFile.txt"
            outputFile['guid'] = "ThisIsGUID"
            outputFile["checksums"] = {"adler32": "1234", "cksum": "5678"}
            outputFile["dataset"] = {
                "primaryDataset": outputModuleSection.primaryDataset,
                "processedDataset": outputModuleSection.processedDataset,
                "dataTier": outputModuleSection.dataTier,
                "applicationName": "cmsRun",
                "applicationVersion": self.step.getCMSSWVersion()
            }
            outputFile["module_label"] = outputModuleName

            outputFileSection = report.addOutputFile(outputModuleName,
                                                     outputFile)
            for inputFile in self.job["input_files"]:
                Report.addRunInfoToFile(outputFileSection, inputFile["runs"])

        return
Example #32
    def execute(self, *args, **kwargs):

        totalevents = kwargs['task']['tm_totalunits']
        firstEvent = 1
        lastEvent = totalevents
        firstLumi = 1
        lastLumi = 10

        #MC comes with only one MCFakeFile
        singleMCFileset = Fileset(name = "MCFakeFileSet")
        newFile = File("MCFakeFile", size = 1000, events = totalevents)
        newFile.setLocation(self.config.Sites.available)
        newFile.addRun(Run(1, *range(firstLumi, lastLumi + 1)))
        newFile["block"] = 'MCFackBlock'
        newFile["first_event"] = firstEvent
        newFile["last_event"] = lastEvent
        singleMCFileset.addFile(newFile)

        return Result(task=kwargs['task'], result=singleMCFileset)
Example #33
    def setUp(self):
        """
        _setUp_

        Create two subscriptions: One that contains a single file and one that
        contains multiple files.
        """
        self.multipleFileFileset = Fileset(name = "TestFileset1")
        for i in range(10):
            newFile = File(makeUUID(), size = 1000, events = 100)
            newFile.setLocation('se01')
            self.multipleFileFileset.addFile(newFile)

        self.singleFileFileset = Fileset(name = "TestFileset2")
        newFile = File("/some/file/name", size = 1000, events = 100)
        newFile.setLocation('se02')
        self.singleFileFileset.addFile(newFile)

        self.emptyFileFileset = Fileset(name = "TestFileset3")
        newFile = File("/some/file/name", size = 1000, events = 0)
        newFile.setdefault('se03')
        self.emptyFileFileset.addFile(newFile)

        testWorkflow = Workflow()
        self.multipleFileSubscription = Subscription(fileset = self.multipleFileFileset,
                                                     workflow = testWorkflow,
                                                     split_algo = "EventBased",
                                                     type = "Processing")
        self.singleFileSubscription = Subscription(fileset = self.singleFileFileset,
                                                   workflow = testWorkflow,
                                                   split_algo = "EventBased",
                                                   type = "Processing")
        self.emptyFileSubscription = Subscription(fileset = self.emptyFileFileset,
                                                  workflow = testWorkflow,
                                                  split_algo = "EventBased",
                                                  type = "Processing")

        self.performanceParams = {'timePerEvent' : None,
                                  'memoryRequirement' : 2300,
                                  'sizePerEvent' : 400}

        return
Example #34
    def setUp(self):
        """
        _setUp_

        Create two subscriptions: One that contains a single file and one that
        contains multiple files.
        """
        self.multipleFileFileset = Fileset(name = "TestFileset1")
        for i in range(10):
            newFile = File(makeUUID(), size = 1000, events = 100)
            newFile.setLocation('blenheim')
            newFile.setLocation('malpaquet')
            self.multipleFileFileset.addFile(newFile)

        self.singleFileFileset = Fileset(name = "TestFileset2")
        newFile = File("/some/file/name", size = 1000, events = 100)
        newFile.setLocation('blenheim')
        self.singleFileFileset.addFile(newFile)

        testWorkflow = Workflow()
        self.multipleFileSubscription = Subscription(fileset = self.multipleFileFileset,
                                                     workflow = testWorkflow,
                                                     split_algo = "FileBased",
                                                     type = "Processing")
        self.singleFileSubscription = Subscription(fileset = self.singleFileFileset,
                                                   workflow = testWorkflow,
                                                   split_algo = "FileBased",
                                                   type = "Processing")

        #self.multipleFileSubscription.create()
        #self.singleFileSubscription.create()

        return
Example #35
    def testG_LumiMask(self):
        """
        _testG_LumiMask_

        Test that we can use a lumi-mask to filter good runs/lumis.
        """
        splitter = SplitterFactory()

        # Create 3 files with 100 events per lumi:
        # - file1 with 1 run  of 8 lumis
        # - file2 with 2 runs of 2 lumis each
        # - file3 with 1 run  of 5 lumis
        fileA = File(lfn="/this/is/file1", size=1000, events=800)
        fileB = File(lfn="/this/is/file2", size=1000, events=400)
        fileC = File(lfn="/this/is/file3", size=1000, events=500)

        lumiListA = []
        for lumi in range(8):
            lumiListA.append(10 + lumi)
        fileA.addRun(Run(1, *lumiListA))
        fileA.setLocation("somese.cern.ch")
        lumiListB1 = []
        lumiListB2 = []
        for lumi in range(2):
            lumiListB1.append(20 + lumi)
            lumiListB2.append(30 + lumi)
        fileB.addRun(Run(2, *lumiListB1))
        fileB.addRun(Run(3, *lumiListB2))
        fileB.setLocation("somese.cern.ch")
        lumiListC = []
        for lumi in range(5):
            lumiListC.append(40 + lumi)
        fileC.addRun(Run(4, *lumiListC))
        fileC.setLocation("somese.cern.ch")

        testFileset = Fileset(name='Fileset')
        testFileset.addFile(fileA)
        testFileset.addFile(fileB)
        testFileset.addFile(fileC)

        testSubscription = Subscription(fileset=testFileset,
                                        workflow=self.testWorkflow,
                                        split_algo="EventAwareLumiBased",
                                        type="Processing")
        jobFactory = splitter(package="WMCore.DataStructs",
                              subscription=testSubscription)

        # Use a lumi-mask = {1: [[10,14]], 2: [[20,21]], 4: [[40,41]]}
        jobGroups = jobFactory(halt_job_on_file_boundaries=False,
                               splitOnRun=False,
                               events_per_job=850,
                               runs=['1', '2', '4'],
                               lumis=['10,14', '20,21', '40,41'],
                               performance=self.performanceParams)

        self.assertEqual(len(jobGroups), 1, "There should be only one job group")
        jobs = jobGroups[0].jobs
        self.assertEqual(len(jobs), 2, "Two jobs must be in the jobgroup")
        self.assertEqual(jobs[0]['mask'].getRunAndLumis(), {1: [[10, 14]], 2: [[20, 21]], 4: [[40, 40]]})
        self.assertEqual(jobs[1]['mask'].getRunAndLumis(), {4: [[41, 41]]})
Example #36
    def testG_LumiMask(self):
        """
        _testG_LumiMask_

        Test that we can use a lumi-mask to filter good runs/lumis.
        """
        splitter = SplitterFactory()

        # Create 3 files with 100 events per lumi:
        # - file1 with 1 run  of 8 lumis
        # - file2 with 2 runs of 2 lumis each
        # - file3 with 1 run  of 5 lumis
        fileA = File(lfn = "/this/is/file1", size = 1000, events = 800)
        fileB = File(lfn = "/this/is/file2", size = 1000, events = 400)
        fileC = File(lfn = "/this/is/file3", size = 1000, events = 500)

        lumiListA = []
        for lumi in range(8):
            lumiListA.append(10 + lumi)
        fileA.addRun(Run(1, *lumiListA))
        fileA.setLocation("somese.cern.ch")
        lumiListB1 = []
        lumiListB2 = []
        for lumi in range(2):
            lumiListB1.append(20 + lumi)
            lumiListB2.append(30 + lumi)
        fileB.addRun(Run(2, *lumiListB1))
        fileB.addRun(Run(3, *lumiListB2))
        fileB.setLocation("somese.cern.ch")
        lumiListC = []
        for lumi in range(5):
            lumiListC.append(40 + lumi)
        fileC.addRun(Run(4, *lumiListC))
        fileC.setLocation("somese.cern.ch")

        testFileset = Fileset(name = 'Fileset')
        testFileset.addFile(fileA)
        testFileset.addFile(fileB)
        testFileset.addFile(fileC)

        testSubscription = Subscription(fileset = testFileset,
                                        workflow = self.testWorkflow,
                                        split_algo = "EventAwareLumiBased",
                                        type = "Processing")
        jobFactory = splitter(package = "WMCore.DataStructs",
                              subscription = testSubscription)

        # Use a lumi-mask = {1: [[10,14]], 2: [[20,21]], 4: [[40,41]]}
        jobGroups = jobFactory(halt_job_on_file_boundaries = False,
                               splitOnRun = False,
                               events_per_job = 850,
                               runs = ['1', '2', '4'],
                               lumis = ['10,14', '20,21', '40,41'],
                               performance = self.performanceParams)

        self.assertEqual(len(jobGroups), 1, "There should be only one job group")
        jobs = jobGroups[0].jobs
        self.assertEqual(len(jobs), 2, "Two jobs must be in the jobgroup")
        self.assertEqual(jobs[0]['mask'].getRunAndLumis(), {1: [[10, 14]], 2: [[20, 21]], 4: [[40, 40]]})
        self.assertEqual(jobs[1]['mask'].getRunAndLumis(), {4: [[41, 41]]})
Example #37
    def setUp(self):
        """
        _setUp_

        Create two subscriptions: One that contains a single file and one that
        contains multiple files.
        """
        self.multipleFileFileset = Fileset(name="TestFileset1")
        for i in range(10):
            newFile = File(makeUUID(), size=1000, events=100)
            newFile.setLocation('blenheim')
            newFile.setLocation('malpaquet')
            self.multipleFileFileset.addFile(newFile)

        self.singleFileFileset = Fileset(name="TestFileset2")
        newFile = File("/some/file/name", size=1000, events=100)
        newFile.setLocation('blenheim')
        self.singleFileFileset.addFile(newFile)

        testWorkflow = Workflow()
        self.multipleFileSubscription = Subscription(
            fileset=self.multipleFileFileset,
            workflow=testWorkflow,
            split_algo="FileBased",
            type="Processing")
        self.singleFileSubscription = Subscription(
            fileset=self.singleFileFileset,
            workflow=testWorkflow,
            split_algo="FileBased",
            type="Processing")

        #self.multipleFileSubscription.create()
        #self.singleFileSubscription.create()

        return
Example #38
    def addOutputFilesToReport(self, report):
        """
        _addOutputFilesToReport_

        Add output files to every output module in the step.  Scale the size
        and number of events in the output files appropriately.
        """
        (outputSize, outputEvents) = self.determineOutputSize()

        if not os.path.exists('ReportEmuTestFile.txt'):
            f = open('ReportEmuTestFile.txt', 'w')
            f.write('A Shubbery')
            f.close()

        for outputModuleName in self.step.listOutputModules():
            outputModuleSection = self.step.getOutputModule(outputModuleName)
            outputModuleSection.fixedLFN    = False
            outputModuleSection.disableGUID = False

            outputLFN = "%s/%s.root" % (outputModuleSection.lfnBase,
                                        str(makeUUID()))
            outputFile = File(lfn = outputLFN, size = outputSize, events = outputEvents,
                              merged = False)
            outputFile.setLocation(self.job["location"])
            outputFile['pfn'] = "ReportEmuTestFile.txt"
            outputFile['guid'] = "ThisIsGUID"
            outputFile["checksums"] = {"adler32": "1234", "cksum": "5678"}
            outputFile["dataset"] = {"primaryDataset": outputModuleSection.primaryDataset,
                                     "processedDataset": outputModuleSection.processedDataset,
                                     "dataTier": outputModuleSection.dataTier,
                                     "applicationName": "cmsRun",
                                     "applicationVersion": self.step.getCMSSWVersion()}
            outputFile["module_label"] = outputModuleName

            outputFileSection = report.addOutputFile(outputModuleName, outputFile)
            for inputFile in self.job["input_files"]:
                Report.addRunInfoToFile(outputFileSection, inputFile["runs"])

        return
Example #39
    def execute(self, *args, **kwargs):
        self.logger.info("Data discovery and splitting for %s using user-provided files" % kwargs['task']['tm_taskname'])

        userfiles = kwargs['task']['tm_user_files']
        splitting = kwargs['task']['tm_split_algo']
        total_units = kwargs['task']['tm_totalunits']
        if not userfiles or splitting != 'FileBased':
            if not userfiles:
                msg = "No files specified to process for task %s." % kwargs['task']['tm_taskname']
            if splitting != 'FileBased':
                msg = "Data.splitting must be set to 'FileBased' when using a custom set of files."
            raise TaskWorkerException(msg)

        if hasattr(self.config.Sites, 'available'):
            locations = self.config.Sites.available
        else:
            with self.config.TaskWorker.envForCMSWEB :
                configDict = {"cacheduration": 1, "pycurl": True} # cache duration is in hours
                resourceCatalog = CRIC(logger=self.logger, configDict=configDict)
                locations = resourceCatalog.getAllPSNs()

        userFileset = Fileset(name = kwargs['task']['tm_taskname'])
        self.logger.info("There are %d files specified by the user." % len(userfiles))
        if total_units > 0:
            self.logger.info("Will run over the first %d files." % total_units)
        file_counter = 0
        for userfile, idx in zip(userfiles, range(len(userfiles))):
            newFile = File(userfile, size = 1000, events = 1)
            newFile.setLocation(locations)
            newFile.addRun(Run(1, idx))
            newFile["block"] = 'UserFilesFakeBlock'
            newFile["first_event"] = 1
            newFile["last_event"] = 2
            userFileset.addFile(newFile)
            file_counter += 1
            if total_units > 0 and file_counter >= total_units:
                break

        return Result(task = kwargs['task'], result = userFileset)
Example #40
    def returnDataStructsFile(self):
        """
        _returnDataStructsFile_

        Creates a dataStruct file out of this file
        """
        parents = set()
        for parent in self["parents"]:
            parents.add(WMFile(lfn = parent['lfn'], size = parent['size'],
                               events = parent['events'], checksums = parent['checksums'],
                               parents = parent['parents'], merged = parent['merged']))

        file = WMFile(lfn = self['lfn'], size = self['size'],
                      events = self['events'], checksums = self['checksums'],
                      parents = parents, merged = self['merged'])

        for run in self['runs']:
            file.addRun(run)

        for location in self['locations']:
            file.setLocation(se = location)

        return file
Example #41
    def createSubscription(self, nFiles, lumisPerFile, twoSites = False):
        """
        _createSubscription_

        Create a subscription for testing
        """

        baseName = makeUUID()

        testFileset = Fileset(name = baseName)
        for i in range(nFiles):
            newFile = File(lfn = '%s_%i' % (baseName, i), size = 1000,
                           events = 100)
            lumis = []
            for lumi in range(lumisPerFile):
                lumis.append((i * 100) + lumi)
            newFile.addRun(Run(i, *lumis))
            newFile.setLocation('blenheim')
            testFileset.addFile(newFile)
        if twoSites:
            for i in range(nFiles):
                newFile = File(lfn = '%s_%i_2' % (baseName, i), size = 1000,
                               events = 100)
                lumis = []
                for lumi in range(lumisPerFile):
                    lumis.append(5 + 10 * (i * 100) + lumi) #lumis should be different
                newFile.addRun(Run(i, *lumis))
                newFile.setLocation('malpaquet')
                testFileset.addFile(newFile)


        testSubscription = Subscription(fileset = testFileset,
                                         workflow = self.testWorkflow,
                                         split_algo = "LumiBased",
                                         type = "Processing")

        return testSubscription
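A hypothetical companion test (not part of the original snippet) showing how this helper is typically driven. The SplitterFactory usage mirrors the later examples in this collection; the lumis_per_job value and the mask check are illustrative assumptions.

    def testTwoLumisPerJob(self):
        """
        Hypothetical sketch: split 5 single-site files of 4 lumis each,
        asking the LumiBased algorithm for 2 lumis per job.
        """
        testSubscription = self.createSubscription(nFiles = 5, lumisPerFile = 4)
        jobFactory = SplitterFactory()(package = "WMCore.DataStructs",
                                       subscription = testSubscription)
        jobGroups = jobFactory(lumis_per_job = 2)
        jobs = jobGroups[0].jobs
        # Every file carries its own run, so each job should stay within one file
        # and cover at most two lumi sections.
        for job in jobs:
            runAndLumis = job['mask'].getRunAndLumis()
            nLumis = sum(end - start + 1
                         for ranges in runAndLumis.values()
                         for start, end in ranges)
            self.assertLessEqual(nLumis, 2)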
Exemplo n.º 42
0
    def createSubscription(self, nFiles, lumisPerFile, twoSites=False):
        """
        _createSubscription_

        Create a subscription for testing
        """

        baseName = makeUUID()

        testFileset = Fileset(name=baseName)
        for i in range(nFiles):
            newFile = File(lfn='%s_%i' % (baseName, i), size=1000, events=100)
            lumis = []
            for lumi in range(lumisPerFile):
                lumis.append((i * 100) + lumi)
            newFile.addRun(Run(i, *lumis))
            newFile.setLocation('blenheim')
            testFileset.addFile(newFile)
        if twoSites:
            for i in range(nFiles):
                newFile = File(lfn='%s_%i_2' % (baseName, i),
                               size=1000,
                               events=100)
                lumis = []
                for lumi in range(lumisPerFile):
                    lumis.append(5 + 10 * (i * 100) +
                                 lumi)  #lumis should be different
                newFile.addRun(Run(i, *lumis))
                newFile.setLocation('malpaquet')
                testFileset.addFile(newFile)

        testSubscription = Subscription(fileset=testFileset,
                                        workflow=self.testWorkflow,
                                        split_algo="LumiBased",
                                        type="Processing")

        return testSubscription
Exemplo n.º 43
0
    def generateFakeMCFile(self, numEvents=100, firstEvent=1, lastEvent=100,
                           firstLumi=1, lastLumi=10, existingSub=None):
        # MC comes with only one MCFakeFile
        newFile = File("MCFakeFileTest", size=1000, events=numEvents)
        newFile.setLocation('se01')
        if firstLumi == lastLumi:
            newFile.addRun(Run(1, *range(firstLumi, lastLumi + 1)))
        else:
            newFile.addRun(Run(1, *range(firstLumi, lastLumi)))
        newFile["first_event"] = firstEvent
        newFile["last_event"] = lastEvent

        if existingSub is None:
            singleMCFileset = Fileset(name="MCTestFileset")
            singleMCFileset.addFile(newFile)
            testWorkflow = Workflow()
            existingSub = Subscription(fileset=singleMCFileset,
                                       workflow=testWorkflow,
                                       split_algo="EventBased",
                                       type="Production")
        else:
            existingSub['fileset'].addFile(newFile)

        return existingSub
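A hypothetical usage of the helper above, again inside the same test class. The events_per_job value and the expected job count are assumptions for illustration; they are consistent with event-based splitting of a 700-event fake MC file, but nothing in the original snippet asserts them.

    def testSplitFakeMCFile(self):
        """Hypothetical sketch: 700 fake MC events split into 250-event production jobs."""
        singleMCSubscription = self.generateFakeMCFile(numEvents=700, firstEvent=1,
                                                       lastEvent=700)
        jobFactory = SplitterFactory()(package="WMCore.DataStructs",
                                       subscription=singleMCSubscription)
        jobGroups = jobFactory(events_per_job=250)
        # 700 events at 250 per job should give three jobs: 250 + 250 + 200.
        self.assertEqual(len(jobGroups[0].jobs), 3)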
Exemplo n.º 44
0
    def __call__(self, fileset):
        """
        return a randomly sized list of files (DataStructs.File) at locations
        files will always be new
        """
        num_files = random.randint(0, self.max)
        for f in self.makelist(fileset):
            for i in range(0, num_files):
                # Decide where the file is: pick a random subset of the configured locations
                locs = []
                for location in self.locations:
                    if random.randint(0, 1):
                        locs.append(location)
                lfn = '/store/data/fake-feeder-files/notreal/%s.root' % uuid(i)
                size = 2000 + ((i - 5) * 50)
                events = 1000 + ((i - 3) * 150)
                run = random.randint(0, int(3.14159265 * i * self.max))
                lumi = random.randint(0, 10)
                newFile = File(lfn, size, events, run, lumi)
                newFile.setLocation(locs)
                f.addFile(newFile)

        return fileset
Exemplo n.º 45
0
    def testSetLocation(self):
        """
        Test the `setLocation` method functionality
        """
        testFile = File(lfn="test_file")
        self.assertItemsEqual(testFile['locations'], {})

        testFile.setLocation(None)
        self.assertItemsEqual(testFile['locations'], {})

        testFile.setLocation("")
        self.assertItemsEqual(testFile['locations'], {})

        testFile.setLocation([])
        self.assertItemsEqual(testFile['locations'], {})

        testFile.setLocation("valid_PNN")
        self.assertItemsEqual(testFile['locations'], {"valid_PNN"})
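The behaviour exercised above boils down to a few lines; a minimal standalone sketch, assuming the WMCore.DataStructs.File import path and using made-up PNN names:

from WMCore.DataStructs.File import File

f = File(lfn="test_file")
f.setLocation(None)                               # ignored, as asserted above
f.setLocation("T1_US_FNAL_Disk")                  # hypothetical PNN
f.setLocation(["T2_CH_CERN", "T1_US_FNAL_Disk"])  # lists are merged in, duplicates collapse
print(sorted(f['locations']))                     # -> ['T1_US_FNAL_Disk', 'T2_CH_CERN']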
Exemplo n.º 46
0
    def setUp(self):
        """
        _setUp_

        Create two subscriptions: One that contains a single file and one that
        contains multiple files.
        """
        self.multipleFileFileset = Fileset(name="TestFileset1")
        for i in range(10):
            newFile = File(makeUUID(), size=1000, events=100)
            newFile.setLocation('blenheim')
            newFile.setLocation('malpaquet')
            lumis = []
            for lumi in range(20):
                lumis.append((i * 100) + lumi)
                newFile.addRun(Run(i, *lumis))
            self.multipleFileFileset.addFile(newFile)

        self.singleFileFileset = Fileset(name="TestFileset2")
        newFile = File("/some/file/name", size=1000, events=100)
        newFile.setLocation('blenheim')
        lumis = list(range(50, 60)) + list(range(70, 80))
        newFile.addRun(Run(13, *lumis))
        self.singleFileFileset.addFile(newFile)

        testWorkflow = Workflow()
        self.multipleFileSubscription = Subscription(
            fileset=self.multipleFileFileset,
            workflow=testWorkflow,
            split_algo="FileBased",
            type="Processing")
        self.singleFileSubscription = Subscription(
            fileset=self.singleFileFileset,
            workflow=testWorkflow,
            split_algo="FileBased",
            type="Processing")

        #self.multipleFileSubscription.create()
        #self.singleFileSubscription.create()

        self.performanceParams = {
            'timePerEvent': 12,
            'memoryRequirement': 2300,
            'sizePerEvent': 400
        }

        return
Exemplo n.º 47
0
    def setUp(self):
        """
        _setUp_

        Create two subscriptions: One that contains a single file and one that
        contains multiple files.
        """
        self.multipleFileFileset = Fileset(name = "TestFileset1")
        for i in range(10):
            newFile = File(makeUUID(), size = 1000, events = 100)
            newFile.setLocation('blenheim')
            newFile.setLocation('malpaquet')
            lumis = []
            for lumi in range(20):
                lumis.append((i * 100) + lumi)
                newFile.addRun(Run(i, *lumis))
            self.multipleFileFileset.addFile(newFile)

        self.singleFileFileset = Fileset(name = "TestFileset2")
        newFile = File("/some/file/name", size = 1000, events = 100)
        newFile.setLocation('blenheim')
        lumis = range(50,60) + range(70,80)
        newFile.addRun(Run(13, *lumis))
        self.singleFileFileset.addFile(newFile)

        testWorkflow = Workflow()
        self.multipleFileSubscription = Subscription(fileset = self.multipleFileFileset,
                                                     workflow = testWorkflow,
                                                     split_algo = "FileBased",
                                                     type = "Processing")
        self.singleFileSubscription = Subscription(fileset = self.singleFileFileset,
                                                   workflow = testWorkflow,
                                                   split_algo = "FileBased",
                                                   type = "Processing")

        #self.multipleFileSubscription.create()
        #self.singleFileSubscription.create()

        self.performanceParams = {'timePerEvent' : 12,
                                  'memoryRequirement' : 2300,
                                  'sizePerEvent' : 400}

        return
Exemplo n.º 48
0
    def testD_NoFileSplitNoHardLimit(self):
        """
        _testD_NoFileSplitNoHardLimit_

        In this case we don't split on file boundaries; check different combinations of files
        to make sure we make the most of the splitting, e.g. include many zero-event files in
        a single job.
        """
        splitter = SplitterFactory()

        #Create 100 files with 7 lumi per file and 0 events per lumi on average.
        testSubscription = self.createSubscription(nFiles = 100, lumisPerFile = 7, twoSites = False,
                                                   nEventsPerFile = 0)
        jobFactory = splitter(package = "WMCore.DataStructs",
                              subscription = testSubscription)

        #First test, the optimal settings are 360 events per job
        #As we have files with 0 events per lumi, this will configure the splitting to
        #a single job containing all files
        jobGroups = jobFactory(halt_job_on_file_boundaries = False,
                               splitOnRun = False,
                               events_per_job = 360)
        self.assertEqual(len(jobGroups), 1, "There should be only one job group")
        jobs = jobGroups[0].jobs
        self.assertEqual(len(jobs), 1, "There should be 1 job")
        self.assertEqual(len(jobs[0]['input_files']), 100, "All 100 files must be in the job")

        #Create 7 files, each one with different lumi/event distributions
        testFileset = Fileset(name = "FilesetA")
        testFileA = self.createFile("/this/is/file1", 250, 0, 5, "blenheim")
        testFileB = self.createFile("/this/is/file2", 600, 1, 1, "blenheim")
        testFileC = self.createFile("/this/is/file3", 1200, 2, 2, "blenheim")
        testFileD = self.createFile("/this/is/file4", 100, 3, 1, "blenheim")
        testFileE = self.createFile("/this/is/file5", 30, 4, 1, "blenheim")
        testFileF = self.createFile("/this/is/file6", 10, 5, 1, "blenheim")
        testFileG = self.createFile("/this/is/file7", 151, 6, 3, "blenheim")
        testFileset.addFile(testFileA)
        testFileset.addFile(testFileB)
        testFileset.addFile(testFileC)
        testFileset.addFile(testFileD)
        testFileset.addFile(testFileE)
        testFileset.addFile(testFileF)
        testFileset.addFile(testFileG)

        testSubscription = Subscription(fileset = testFileset,
                                        workflow = self.testWorkflow,
                                        split_algo = "EventAwareLumiBased",
                                        type = "Processing")

        jobFactory = splitter(package = "WMCore.DataStructs",
                              subscription = testSubscription)
        #Optimal settings are: jobs with 150 events per job
        #This means the first file must be split into 3 lumis per job, which would leave room
        #for another lumi in the second job, but the second file has a lumi too big for that
        #The 3rd job only contains the second file, the fourth and fifth job split the third file
        jobGroups = jobFactory(halt_job_on_file_boundaries = False,
                               splitOnRun = False,
                               events_per_job = 150)

        self.assertEqual(len(jobGroups), 1, "There should be only one job group")
        jobs = jobGroups[0].jobs
        self.assertEqual(len(jobs), 8, "Eight jobs must be in the jobgroup")
        self.assertEqual(jobs[0]["mask"].getRunAndLumis(), {0L : [[0L, 2L]]}, "Wrong mask for the first job")
        self.assertEqual(jobs[1]["mask"].getRunAndLumis(), {0L : [[3L, 4L]]}, "Wrong mask for the second job")
        self.assertEqual(jobs[2]["mask"].getRunAndLumis(), {1L : [[1L, 1L]]}, "Wrong mask for the third job")
        self.assertEqual(jobs[3]["mask"].getRunAndLumis(), {2L : [[4L, 4L]]}, "Wrong mask for the fourth job")
        self.assertEqual(jobs[4]["mask"].getRunAndLumis(), {2L : [[5L, 5L]]}, "Wrong mask for the fifth job")
        self.assertEqual(jobs[5]["mask"].getRunAndLumis(),
                         {3L : [[3L, 3L]], 4L : [[4L, 4L]], 5L : [[5L, 5L]]}, "Wrong mask for the sixth job")
        self.assertEqual(jobs[6]["mask"].getRunAndLumis(), {6L : [[18L, 19L]]}, "Wrong mask for the seventh job")
        self.assertEqual(jobs[7]["mask"].getRunAndLumis(), {6L : [[20L, 20L]]}, "Wrong mask for the eighth job")
        #Test interactions of this algorithm with splitOnRun = True
        #Make 2 files, one with 3 runs and a second one with the last run of the first
        fileA = File(lfn = "/this/is/file1", size = 1000,
                       events = 2400)
        lumiListA = []
        lumiListB = []
        lumiListC = []
        for lumi in range(8):
            lumiListA.append(1 + lumi)
            lumiListB.append(1 + lumi)
            lumiListC.append(1 + lumi)
        fileA.addRun(Run(1, *lumiListA))
        fileA.addRun(Run(2, *lumiListA))
        fileA.addRun(Run(3, *lumiListA))
        fileA.setLocation("malpaquet")

        fileB = self.createFile('/this/is/file2', 200, 3, 5, "malpaquet")

        testFileset = Fileset(name = 'FilesetB')
        testFileset.addFile(fileA)
        testFileset.addFile(fileB)
        testSubscription = Subscription(fileset = testFileset,
                                        workflow = self.testWorkflow,
                                        split_algo = "EventAwareLumiBased",
                                        type = "Processing")

        jobFactory = splitter(package = "WMCore.DataStructs",
                              subscription = testSubscription)
        #The settings for this splitting are 700 events per job
        jobGroups = jobFactory(splitOnRun = True,
                               halt_job_on_file_boundaries = False,
                               events_per_job = 700)
        self.assertEqual(len(jobGroups), 1, "There should be only one job group")
        jobs = jobGroups[0].jobs
        self.assertEqual(len(jobs), 6, "Six jobs must be in the jobgroup")
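The tests in this and the following examples rely on a createFile helper that is not included in the snippets. A plausible reconstruction, consistent with the lumi masks asserted above (for instance run 6 with 3 lumis yielding lumi sections 18-20), would be:

    def createFile(self, lfn, events, run, lumis, location):
        """
        _createFile_

        Hypothetical reconstruction: build a File holding `lumis` consecutive
        lumi sections of run `run`, starting at lumi number run * lumis.
        """
        newFile = File(lfn = lfn, size = 1000, events = events)
        lumiList = []
        for lumi in range(lumis):
            lumiList.append((run * lumis) + lumi)
        newFile.addRun(Run(run, *lumiList))
        newFile.setLocation(location)
        return newFile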
Exemplo n.º 49
0
    def testChunking(self):
        """
        _testChunking_

        Insert a workload and files that have several distinct sets of
        locations.  Verify that the chunks are created correctly and that they
        only group files that have the same set of locations.  Also verify that
        the chunks are pulled out of ACDC correctly.
        """
        dcs = DataCollectionService(url=self.testInit.couchUrl,
                                    database="wmcore-acdc-datacollectionsvc")

        testFileA = File(lfn=makeUUID(), size=1024, events=1024)
        testFileA.setLocation(["cmssrm.fnal.gov", "castor.cern.ch"])
        testFileA.addRun(Run(1, 1, 2))
        testFileB = File(lfn=makeUUID(), size=1024, events=1024)
        testFileB.setLocation(["cmssrm.fnal.gov", "castor.cern.ch"])
        testFileB.addRun(Run(1, 3, 4))
        testFileC = File(lfn=makeUUID(), size=1024, events=1024)
        testFileC.setLocation(["cmssrm.fnal.gov", "castor.cern.ch"])
        testFileC.addRun(Run(1, 5, 6))
        testJobA = self.getMinimalJob()
        testJobA.addFile(testFileA)
        testJobA.addFile(testFileB)
        testJobA.addFile(testFileC)

        testFileD = File(lfn=makeUUID(), size=1024, events=1024)
        testFileD.setLocation(["cmssrm.fnal.gov"])
        testFileD.addRun(Run(2, 1, 2))
        testFileE = File(lfn=makeUUID(), size=1024, events=1024)
        testFileE.setLocation(["cmssrm.fnal.gov"])
        testFileE.addRun(Run(2, 3, 4))
        testJobB = self.getMinimalJob()
        testJobB.addFile(testFileD)
        testJobB.addFile(testFileE)

        testFileF = File(lfn=makeUUID(),
                         size=1024,
                         events=1024,
                         parents={"/some/parent/F"})
        testFileF.setLocation(
            ["cmssrm.fnal.gov", "castor.cern.ch", "srm.ral.uk"])
        testFileF.addRun(Run(3, 1, 2))
        testFileG = File(lfn=makeUUID(),
                         size=1024,
                         events=1024,
                         parents={"/some/parent/G"})
        testFileG.setLocation(
            ["cmssrm.fnal.gov", "castor.cern.ch", "srm.ral.uk"])
        testFileG.addRun(Run(3, 3, 4))
        testFileH = File(lfn=makeUUID(),
                         size=1024,
                         events=1024,
                         parents={"/some/parent/H"})
        testFileH.setLocation(
            ["cmssrm.fnal.gov", "castor.cern.ch", "srm.ral.uk"])
        testFileH.addRun(Run(3, 5, 6))
        testJobC = self.getMinimalJob()
        testJobC.addFile(testFileF)
        testJobC.addFile(testFileG)
        testJobC.addFile(testFileH)

        testFileI = File(lfn=makeUUID(), size=1024, events=1024, merged=True)
        testFileI.setLocation(["cmssrm.fnal.gov", "castor.cern.ch"])
        testFileI.addRun(Run(4, 1, 2))
        testFileJ = File(lfn=makeUUID(), size=1024, events=1024, merged=True)
        testFileJ.setLocation(["cmssrm.fnal.gov", "castor.cern.ch"])
        testFileJ.addRun(Run(4, 3, 4))
        testFileK = File(lfn=makeUUID(), size=1024, events=1024, merged=True)
        testFileK.setLocation(["cmssrm.fnal.gov", "castor.cern.ch"])
        testFileK.addRun(Run(4, 5, 6))
        testJobD = self.getMinimalJob()
        testJobD.addFile(testFileI)
        testJobD.addFile(testFileJ)
        testJobD.addFile(testFileK)

        dcs.failedJobs([testJobA, testJobB, testJobC, testJobD])
        chunks = dcs.chunkFileset("ACDCTest", "/ACDCTest/reco", chunkSize=5)

        self.assertEqual(
            len(chunks), 4,
            "Error: There should be four chunks: %s" % len(chunks))

        goldenMetaData = {
            1: {
                "lumis": 2,
                "locations": ["castor.cern.ch", "cmssrm.fnal.gov"],
                "events": 1024
            },
            2: {
                "lumis": 4,
                "locations": ["cmssrm.fnal.gov"],
                "events": 2048
            },
            3: {
                "lumis": 6,
                "locations":
                ["castor.cern.ch", "cmssrm.fnal.gov", "srm.ral.uk"],
                "events": 3072
            },
            5: {
                "lumis": 10,
                "locations": ["castor.cern.ch", "cmssrm.fnal.gov"],
                "events": 5120
            }
        }

        testFiles = [
            testFileA, testFileB, testFileC, testFileI, testFileJ, testFileK
        ]
        lastFile = testFileA
        for testFile in testFiles:
            if lastFile["lfn"] < testFile["lfn"]:
                lastFile = testFile

        testFiles.remove(lastFile)

        goldenFiles = {
            1: [lastFile],
            2: [testFileD, testFileE],
            3: [testFileF, testFileG, testFileH],
            5: testFiles
        }

        for chunk in chunks:
            chunkMetaData = dcs.getChunkInfo("ACDCTest", "/ACDCTest/reco",
                                             chunk["offset"], chunk["files"])

            self.assertEqual(chunkMetaData["files"], chunk["files"])
            self.assertEqual(chunkMetaData["lumis"], chunk["lumis"])
            self.assertEqual(chunkMetaData["events"], chunk["events"])
            self.assertEqual(chunkMetaData["locations"], chunk["locations"])

            self.assertTrue(chunk["files"] in goldenMetaData.keys(),
                            "Error: Extra chunk found.")
            self.assertEqual(chunk["lumis"],
                             goldenMetaData[chunk["files"]]["lumis"],
                             "Error: Lumis in chunk is wrong.")
            self.assertEqual(chunk["locations"],
                             goldenMetaData[chunk["files"]]["locations"],
                             "Error: Locations in chunk is wrong.")
            self.assertEqual(chunk["events"],
                             goldenMetaData[chunk["files"]]["events"],
                             "Error: Events in chunk is wrong.")
            del goldenMetaData[chunk["files"]]

            chunkFiles = dcs.getChunkFiles("ACDCTest", "/ACDCTest/reco",
                                           chunk["offset"], chunk["files"])

            self.assertTrue(chunk["files"] in goldenFiles.keys(),
                            "Error: Extra chunk found.")
            goldenChunkFiles = goldenFiles[chunk["files"]]
            self.assertEqual(len(chunkFiles), len(goldenChunkFiles))

            for chunkFile in chunkFiles:
                foundFile = None
                for goldenChunkFile in goldenChunkFiles:
                    if chunkFile["lfn"] == goldenChunkFile["lfn"]:
                        foundFile = goldenChunkFile
                        break

                self.assertIsNotNone(
                    foundFile, "Error: Missing chunk file: %s, %s" %
                    (chunkFiles, goldenChunkFiles))
                self.assertEqual(set(foundFile["parents"]),
                                 chunkFile["parents"],
                                 "Error: File parents should match.")
                self.assertEqual(foundFile["merged"], chunkFile["merged"],
                                 "Error: File merged status should match.")
                self.assertEqual(foundFile["locations"],
                                 chunkFile["locations"],
                                 "Error: File locations should match.")
                self.assertEqual(foundFile["events"], chunkFile["events"])
                self.assertEqual(foundFile["size"], chunkFile["size"])
                self.assertEqual(len(foundFile["runs"]),
                                 len(chunkFile["runs"]),
                                 "Error: Wrong number of runs.")
                for run in foundFile["runs"]:
                    runMatch = False
                    for chunkRun in chunkFile["runs"]:
                        if chunkRun.run == run.run and chunkRun.lumis == run.lumis:
                            runMatch = True
                            break

                    self.assertTrue(runMatch,
                                    "Error: Run information is wrong.")

            del goldenFiles[chunk["files"]]

        singleChunk = dcs.singleChunkFileset("ACDCTest", "/ACDCTest/reco")
        self.assertEqual(
            singleChunk, {
                "offset": 0,
                "files": 11,
                "events": 11264,
                "lumis": 22,
                "locations":
                {"castor.cern.ch", "cmssrm.fnal.gov", "srm.ral.uk"}
            }, "Error: Single chunk metadata is wrong")

        return
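getMinimalJob is not shown in this snippet; judging from the inline getJob helper in the second variant of this test further down (Exemplo n.º 51), it most likely builds a bare Job bound to the ACDC test workflow, along these lines:

    def getMinimalJob(self):
        """Hypothetical reconstruction, mirroring the inline getJob helper in the later variant."""
        job = Job()
        job["task"] = "/ACDCTest/reco"
        job["workflow"] = "ACDCTest"
        job["location"] = "cmssrm.fnal.gov"
        job["owner"] = "cmsdataops"
        job["group"] = "cmsdataops"
        return job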
Exemplo n.º 50
0
    def testD_NoFileSplitNoHardLimit(self):
        """
        _testD_NoFileSplitNoHardLimit_

        In this case we don't split on file boundaries; check different combinations of files
        to make sure we make the most of the splitting, e.g. include many zero-event files in
        a single job.
        """
        splitter = SplitterFactory()

        #Create 100 files with 7 lumi per file and 0 events per lumi on average.
        testSubscription = self.createSubscription(nFiles=100,
                                                   lumisPerFile=7,
                                                   twoSites=False,
                                                   nEventsPerFile=0)
        jobFactory = splitter(package="WMCore.DataStructs",
                              subscription=testSubscription)

        #First test, the optimal settings are 360 events per job
        #As we have files with 0 events per lumi, this will configure the splitting to
        #a single job containing all files
        jobGroups = jobFactory(halt_job_on_file_boundaries=False,
                               splitOnRun=False,
                               events_per_job=360)
        self.assertEqual(len(jobGroups), 1,
                         "There should be only one job group")
        jobs = jobGroups[0].jobs
        self.assertEqual(len(jobs), 1, "There should be 1 job")
        self.assertEqual(len(jobs[0]['input_files']), 100,
                         "All 100 files must be in the job")

        #Create 7 files, each one with different lumi/event distributions
        testFileset = Fileset(name="FilesetA")
        testFileA = self.createFile("/this/is/file1", 250, 0, 5, "blenheim")
        testFileB = self.createFile("/this/is/file2", 600, 1, 1, "blenheim")
        testFileC = self.createFile("/this/is/file3", 1200, 2, 2, "blenheim")
        testFileD = self.createFile("/this/is/file4", 100, 3, 1, "blenheim")
        testFileE = self.createFile("/this/is/file5", 30, 4, 1, "blenheim")
        testFileF = self.createFile("/this/is/file6", 10, 5, 1, "blenheim")
        testFileG = self.createFile("/this/is/file7", 151, 6, 3, "blenheim")
        testFileset.addFile(testFileA)
        testFileset.addFile(testFileB)
        testFileset.addFile(testFileC)
        testFileset.addFile(testFileD)
        testFileset.addFile(testFileE)
        testFileset.addFile(testFileF)
        testFileset.addFile(testFileG)

        testSubscription = Subscription(fileset=testFileset,
                                        workflow=self.testWorkflow,
                                        split_algo="EventAwareLumiBased",
                                        type="Processing")

        jobFactory = splitter(package="WMCore.DataStructs",
                              subscription=testSubscription)
        #Optimal settings are: jobs with 150 events per job
        #This means the first file must be split into 3 lumis per job, which would leave room
        #for another lumi in the second job, but the second file has a lumi too big for that
        #The 3rd job only contains the second file, the fourth and fifth job split the third file
        jobGroups = jobFactory(halt_job_on_file_boundaries=False,
                               splitOnRun=False,
                               events_per_job=150)

        self.assertEqual(len(jobGroups), 1,
                         "There should be only one job group")
        jobs = jobGroups[0].jobs
        self.assertEqual(len(jobs), 8, "Eight jobs must be in the jobgroup")
        self.assertEqual(jobs[0]["mask"].getRunAndLumis(), {0L: [[0L, 2L]]},
                         "Wrong mask for the first job")
        self.assertEqual(jobs[1]["mask"].getRunAndLumis(), {0L: [[3L, 4L]]},
                         "Wrong mask for the second job")
        self.assertEqual(jobs[2]["mask"].getRunAndLumis(), {1L: [[1L, 1L]]},
                         "Wrong mask for the third job")
        self.assertEqual(jobs[3]["mask"].getRunAndLumis(), {2L: [[4L, 4L]]},
                         "Wrong mask for the fourth job")
        self.assertEqual(jobs[4]["mask"].getRunAndLumis(), {2L: [[5L, 5L]]},
                         "Wrong mask for the fifth job")
        self.assertEqual(jobs[5]["mask"].getRunAndLumis(), {
            3L: [[3L, 3L]],
            4L: [[4L, 4L]],
            5L: [[5L, 5L]]
        }, "Wrong mask for the sixth job")
        self.assertEqual(jobs[6]["mask"].getRunAndLumis(), {6L: [[18L, 19L]]},
                         "Wrong mask for the seventh job")
        self.assertEqual(jobs[7]["mask"].getRunAndLumis(), {6L: [[20L, 20L]]},
                         "Wrong mask for the seventh job")
        #Test interactions of this algorithm with splitOnRun = True
        #Make 2 files, one with 3 runs and a second one with the last run of the first
        fileA = File(lfn="/this/is/file1", size=1000, events=2400)
        lumiListA = []
        lumiListB = []
        lumiListC = []
        for lumi in range(8):
            lumiListA.append(1 + lumi)
            lumiListB.append(1 + lumi)
            lumiListC.append(1 + lumi)
        fileA.addRun(Run(1, *lumiListA))
        fileA.addRun(Run(2, *lumiListA))
        fileA.addRun(Run(3, *lumiListA))
        fileA.setLocation("malpaquet")

        fileB = self.createFile('/this/is/file2', 200, 3, 5, "malpaquet")

        testFileset = Fileset(name='FilesetB')
        testFileset.addFile(fileA)
        testFileset.addFile(fileB)
        testSubscription = Subscription(fileset=testFileset,
                                        workflow=self.testWorkflow,
                                        split_algo="EventAwareLumiBased",
                                        type="Processing")

        jobFactory = splitter(package="WMCore.DataStructs",
                              subscription=testSubscription)
        #The settings for this splitting are 700 events per job
        jobGroups = jobFactory(splitOnRun=True,
                               halt_job_on_file_boundaries=False,
                               events_per_job=700)
        self.assertEqual(len(jobGroups), 1,
                         "There should be only one job group")
        jobs = jobGroups[0].jobs
        self.assertEqual(len(jobs), 6, "Six jobs must be in the jobgroup")
Exemplo n.º 51
0
    def testChunking(self):
        """
        _testChunking_

        Insert a workload and files that have several distinct sets of
        locations.  Verify that the chunks are created correctly and that they
        only group files that have the same set of locations.  Also verify that
        the chunks are pulled out of ACDC correctly.
        """
        dcs = DataCollectionService(url = self.testInit.couchUrl,
                                    database = "wmcore-acdc-datacollectionsvc")

        def getJob():
            job = Job()
            job["task"] = "/ACDCTest/reco"
            job["workflow"] = "ACDCTest"
            job["location"] = "cmssrm.fnal.gov"
            job["owner"] = "cmsdataops"
            job["group"] = "cmsdataops"
            return job

        testFileA = File(lfn = makeUUID(), size = 1024, events = 1024)
        testFileA.setLocation(["cmssrm.fnal.gov", "castor.cern.ch"])
        testFileA.addRun(Run(1, 1, 2))
        testFileB = File(lfn = makeUUID(), size = 1024, events = 1024)
        testFileB.setLocation(["cmssrm.fnal.gov", "castor.cern.ch"])
        testFileB.addRun(Run(1, 3, 4))
        testFileC = File(lfn = makeUUID(), size = 1024, events = 1024)
        testFileC.setLocation(["cmssrm.fnal.gov", "castor.cern.ch"])
        testFileC.addRun(Run(1, 5, 6))
        testJobA = getJob()
        testJobA.addFile(testFileA)
        testJobA.addFile(testFileB)
        testJobA.addFile(testFileC)

        testFileD = File(lfn = makeUUID(), size = 1024, events = 1024)
        testFileD.setLocation(["cmssrm.fnal.gov"])
        testFileD.addRun(Run(2, 1, 2))
        testFileE = File(lfn = makeUUID(), size = 1024, events = 1024)
        testFileE.setLocation(["cmssrm.fnal.gov"])
        testFileE.addRun(Run(2, 3, 4))
        testJobB = getJob()
        testJobB.addFile(testFileD)
        testJobB.addFile(testFileE)

        testFileF = File(lfn = makeUUID(), size = 1024, events = 1024,
                         parents = set(["/some/parent/F"]))
        testFileF.setLocation(["cmssrm.fnal.gov", "castor.cern.ch", "srm.ral.uk"])
        testFileF.addRun(Run(3, 1, 2))
        testFileG = File(lfn = makeUUID(), size = 1024, events = 1024,
                         parents = set(["/some/parent/G"]))
        testFileG.setLocation(["cmssrm.fnal.gov", "castor.cern.ch", "srm.ral.uk"] )
        testFileG.addRun(Run(3, 3, 4))
        testFileH = File(lfn = makeUUID(), size = 1024, events = 1024,
                         parents = set(["/some/parent/H"]))
        testFileH.setLocation(["cmssrm.fnal.gov", "castor.cern.ch", "srm.ral.uk"])
        testFileH.addRun(Run(3, 5, 6))
        testJobC = getJob()
        testJobC.addFile(testFileF)
        testJobC.addFile(testFileG)
        testJobC.addFile(testFileH)

        testFileI = File(lfn = makeUUID(), size = 1024, events = 1024, merged = True)
        testFileI.setLocation(["cmssrm.fnal.gov", "castor.cern.ch"])
        testFileI.addRun(Run(4, 1, 2))
        testFileJ = File(lfn = makeUUID(), size = 1024, events = 1024, merged = True)
        testFileJ.setLocation(["cmssrm.fnal.gov", "castor.cern.ch"] )
        testFileJ.addRun(Run(4, 3, 4))
        testFileK = File(lfn = makeUUID(), size = 1024, events = 1024, merged = True)
        testFileK.setLocation(["cmssrm.fnal.gov", "castor.cern.ch"])
        testFileK.addRun(Run(4, 5, 6))
        testJobD = getJob()
        testJobD.addFile(testFileI)
        testJobD.addFile(testFileJ)
        testJobD.addFile(testFileK)

        dcs.failedJobs([testJobA, testJobB, testJobC, testJobD])
        chunks = dcs.chunkFileset("ACDCTest", "/ACDCTest/reco",
                                  chunkSize = 5)

        self.assertEqual(len(chunks), 4, "Error: There should be four chunks: %s" % len(chunks))

        goldenMetaData = {1: {"lumis": 2, "locations": ["castor.cern.ch", "cmssrm.fnal.gov"], "events": 1024},
                          2: {"lumis": 4, "locations": ["cmssrm.fnal.gov"], "events": 2048},
                          3: {"lumis": 6, "locations": ["castor.cern.ch", "cmssrm.fnal.gov", "srm.ral.uk"], "events": 3072},
                          5: {"lumis": 10, "locations": ["castor.cern.ch", "cmssrm.fnal.gov"], "events": 5120}}

        testFiles =[testFileA, testFileB, testFileC, testFileI, testFileJ, testFileK]
        lastFile = testFileA
        for testFile in testFiles:
            if lastFile["lfn"] < testFile["lfn"]:
                lastFile = testFile

        testFiles.remove(lastFile)

        goldenFiles = {1: [lastFile],
                       2: [testFileD, testFileE],
                       3: [testFileF, testFileG, testFileH],
                       5: testFiles}

        for chunk in chunks:
            chunkMetaData = dcs.getChunkInfo("ACDCTest", "/ACDCTest/reco",
                                             chunk["offset"], chunk["files"])

            self.assertEqual(chunkMetaData["files"], chunk["files"],
                             "Error: Metadata doesn't match.")
            self.assertEqual(chunkMetaData["lumis"], chunk["lumis"],
                             "Error: Metadata doesn't match.")
            self.assertEqual(chunkMetaData["events"], chunk["events"],
                             "Error: Metadata doesn't match.")
            self.assertEqual(chunkMetaData["locations"], chunk["locations"],
                             "Error: Metadata doesn't match.")

            self.assertTrue(chunk["files"] in goldenMetaData.keys(),
                            "Error: Extra chunk found.")
            self.assertEqual(chunk["lumis"], goldenMetaData[chunk["files"]]["lumis"],
                             "Error: Lumis in chunk is wrong.")
            self.assertEqual(chunk["locations"], goldenMetaData[chunk["files"]]["locations"],
                             "Error: Locations in chunk is wrong.")
            self.assertEqual(chunk["events"], goldenMetaData[chunk["files"]]["events"],
                             "Error: Events in chunk is wrong.")
            del goldenMetaData[chunk["files"]]

            chunkFiles = dcs.getChunkFiles("ACDCTest", "/ACDCTest/reco",
                                           chunk["offset"], chunk["files"])

            self.assertTrue(chunk["files"] in goldenFiles.keys(),
                            "Error: Extra chunk found.")
            goldenChunkFiles = goldenFiles[chunk["files"]]
            self.assertEqual(len(chunkFiles), len(goldenChunkFiles))

            for chunkFile in chunkFiles:
                foundFile = None
                for goldenChunkFile in goldenChunkFiles:
                    if chunkFile["lfn"] == goldenChunkFile["lfn"]:
                        foundFile = goldenChunkFile
                        break

                self.assertTrue(foundFile is not None,
                                "Error: Missing chunk file: %s, %s" % (chunkFiles, goldenChunkFiles))
                self.assertEqual(foundFile["parents"], chunkFile["parents"],
                                 "Error: File parents should match.")
                self.assertEqual(foundFile["merged"], chunkFile["merged"],
                                 "Error: File merged status should match.")
                self.assertEqual(foundFile["locations"], chunkFile["locations"],
                                 "Error: File locations should match.")
                self.assertEqual(foundFile["events"], chunkFile["events"],
                                 "Error: File locations should match: %s" % chunk["files"])
                self.assertEqual(foundFile["size"], chunkFile["size"],
                                 "Error: File locations should match.")
                self.assertEqual(len(foundFile["runs"]), len(chunkFile["runs"]),
                                 "Error: Wrong number of runs.")
                for run in foundFile["runs"]:
                    runMatch = False
                    for chunkRun in chunkFile["runs"]:
                        if chunkRun.run == run.run and chunkRun.lumis == run.lumis:
                            runMatch = True
                            break

                    self.assertTrue(runMatch, "Error: Run information is wrong.")

            del goldenFiles[chunk["files"]]

        singleChunk = dcs.singleChunkFileset("ACDCTest", "/ACDCTest/reco")
        self.assertEqual(singleChunk, {"offset" : 0,
                                       "files" : 11,
                                       "events" : 11264,
                                       "lumis" : 22,
                                       "locations" : set(["castor.cern.ch", "cmssrm.fnal.gov", "srm.ral.uk"])},
                         "Error: Single chunk metadata is wrong")

        return
Exemplo n.º 52
0
    def testNoFileSplitNoHardLimit(self):
        """
        _testNoFileSplitNoHardLimit_

        In this case we don't split on file boundaries; check different combinations of files
        to make sure we make the most of the splitting, e.g. include many zero-event files in
        a single job.
        """
        splitter = SplitterFactory()

        # Create 100 files with 7 lumi per file and 0 events per lumi on average.
        testSubscription = self.createSubscription(nFiles=100,
                                                   lumisPerFile=7,
                                                   twoSites=False,
                                                   nEventsPerFile=0)
        jobFactory = splitter(package="WMCore.DataStructs",
                              subscription=testSubscription)

        # First test, the optimal settings are 360 events per job. As we have files with 0 events per lumi, this will
        # configure the splitting to a single job containing all files
        jobGroups = jobFactory(halt_job_on_file_boundaries=False,
                               splitOnRun=False,
                               events_per_job=360,
                               performance=self.performanceParams)

        # One job in one job group with 100 files
        self.assertEqual(len(jobGroups), 1)
        jobs = jobGroups[0].jobs
        self.assertEqual(len(jobs), 1)
        self.assertEqual(len(jobs[0]['input_files']), 100)

        # Create 7 files, each one with different lumi/event distributions
        testFileset = Fileset(name="FilesetA")
        testFileA = self.createFile("/this/is/file1", 250, 0, 5, "blenheim")
        testFileB = self.createFile("/this/is/file2", 600, 1, 1, "blenheim")
        testFileC = self.createFile("/this/is/file3", 1200, 2, 2, "blenheim")
        testFileD = self.createFile("/this/is/file4", 100, 3, 1, "blenheim")
        testFileE = self.createFile("/this/is/file5", 30, 4, 1, "blenheim")
        testFileF = self.createFile("/this/is/file6", 10, 5, 1, "blenheim")
        testFileG = self.createFile("/this/is/file7", 153, 6, 3, "blenheim")
        testFileset.addFile(testFileA)
        testFileset.addFile(testFileB)
        testFileset.addFile(testFileC)
        testFileset.addFile(testFileD)
        testFileset.addFile(testFileE)
        testFileset.addFile(testFileF)
        testFileset.addFile(testFileG)

        testSubscription = Subscription(fileset=testFileset,
                                        workflow=self.testWorkflow,
                                        split_algo="EventAwareLumiByWork",
                                        type="Processing")
        jobFactory = splitter(package="WMCore.DataStructs",
                              subscription=testSubscription)
        # Split the work targeting 150 events per job
        jobGroups = jobFactory(halt_job_on_file_boundaries=False,
                               splitOnRun=False,
                               events_per_job=150,
                               performance=self.performanceParams)

        self.assertEqual(len(jobGroups), 1)
        jobs = jobGroups[0].jobs
        self.assertEqual(len(jobs), 7)

        # Test interactions of this algorithm with splitOnRun = True
        # Make 2 files, one with 3 runs and a second one with the last run of the first
        fileA = File(lfn="/this/is/file1", size=1000, events=2400)
        lumiListA = []
        lumiListB = []
        lumiListC = []
        for lumi in range(8):
            lumiListA.append(1 + lumi)
            lumiListB.append(1 + lumi)
            lumiListC.append(1 + lumi)
        fileA.addRun(Run(1, *lumiListA))
        fileA.addRun(Run(2, *lumiListA))
        fileA.addRun(Run(3, *lumiListA))
        fileA.setLocation("malpaquet")

        fileB = self.createFile('/this/is/file2', 200, 3, 5, "malpaquet")

        testFileset = Fileset(name='FilesetB')
        testFileset.addFile(fileA)
        testFileset.addFile(fileB)
        testSubscription = Subscription(fileset=testFileset,
                                        workflow=self.testWorkflow,
                                        split_algo="EventAwareLumiByWork",
                                        type="Processing")
        jobFactory = splitter(package="WMCore.DataStructs",
                              subscription=testSubscription)
        # The settings for this splitting are 700 events per job
        jobGroups = jobFactory(splitOnRun=True,
                               halt_job_on_file_boundaries=False,
                               events_per_job=700,
                               performance=self.performanceParams)

        self.assertEqual(len(jobGroups), 1)
        jobs = jobGroups[0].jobs
        self.assertEqual(len(jobs), 7)
        # Make sure each job has one run
        for job in jobs:
            self.assertEqual(len(job['mask'].getRunAndLumis()), 1)
Exemplo n.º 53
0
    def testRunWhiteList(self):
        """
        _testRunWhiteList_

        Test that we can use a run white list to filter good runs/lumis.
        """
        splitter = SplitterFactory()

        # Create 3 files with 100 events per lumi:
        # - file1 with 1 run  of 8 lumis
        # - file2 with 2 runs of 2 lumis each
        # - file3 with 1 run  of 5 lumis
        fileA = File(lfn="/this/is/file1", size=1000, events=800)
        fileB = File(lfn="/this/is/file2", size=1000, events=400)
        fileC = File(lfn="/this/is/file3", size=1000, events=500)

        lumiListA = []
        for lumi in range(8):
            lumiListA.append(10 + lumi)
        fileA.addRun(Run(1, *lumiListA))
        fileA.setLocation("somese.cern.ch")
        lumiListB1 = []
        lumiListB2 = []
        for lumi in range(2):
            lumiListB1.append(20 + lumi)
            lumiListB2.append(30 + lumi)
        fileB.addRun(Run(2, *lumiListB1))
        fileB.addRun(Run(3, *lumiListB2))
        fileB.setLocation("somese.cern.ch")
        lumiListC = []
        for lumi in range(5):
            lumiListC.append(40 + lumi)
        fileC.addRun(Run(4, *lumiListC))
        fileC.setLocation("somese.cern.ch")

        testFileset = Fileset(name='Fileset')
        testFileset.addFile(fileA)
        testFileset.addFile(fileB)
        testFileset.addFile(fileC)

        testSubscription = Subscription(fileset=testFileset,
                                        workflow=self.testWorkflow,
                                        split_algo="EventAwareLumiByWork",
                                        type="Processing")
        jobFactory = splitter(package="WMCore.DataStructs",
                              subscription=testSubscription)

        # Split with no breaks
        jobGroups = jobFactory(halt_job_on_file_boundaries=False,
                               splitOnRun=False,
                               events_per_job=725,
                               runWhitelist=[1, 4],
                               performance=self.performanceParams)

        self.assertEqual(len(jobGroups), 1)
        jobs = jobGroups[0].jobs
        self.assertEqual(len(jobs), 2)
        for job in jobs:
            for run in job['mask'].getRunAndLumis().keys():
                self.assertIn(run, [1, 4])

        # Re-split with a break on runs
        jobGroups = jobFactory(halt_job_on_file_boundaries=False,
                               splitOnRun=True,
                               events_per_job=595,
                               runWhitelist=[1, 3, 4],
                               performance=self.performanceParams)

        self.assertEqual(len(jobGroups), 1)
        jobs = jobGroups[0].jobs
        self.assertEqual(len(jobs), 4)
        self.enforceLimits(jobs=jobs, runsPerJob=1)
        for job in jobs:
            for run in job['mask'].getRunAndLumis().keys():
                self.assertIn(run, [1, 3, 4])

        # Re-split with a break on files
        jobGroups = jobFactory(halt_job_on_file_boundaries=True,
                               splitOnRun=False,
                               events_per_job=595,
                               runWhitelist=[1, 2, 3],
                               performance=self.performanceParams)

        self.assertEqual(len(jobGroups), 1)
        jobs = jobGroups[0].jobs
        self.assertEqual(len(jobs), 3)
        self.enforceLimits(jobs=jobs, filesPerJob=1)
        for job in jobs:
            for run in job['mask'].getRunAndLumis().keys():
                self.assertIn(run, [1, 2, 3])
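enforceLimits is another helper the snippet does not include. Based only on the keyword arguments it is called with above (jobs, runsPerJob, filesPerJob), a hedged reconstruction could check per-job limits like this:

    def enforceLimits(self, jobs=None, runsPerJob=None, filesPerJob=None):
        """Hypothetical reconstruction: assert per-job limits on runs and input files."""
        for job in jobs or []:
            if runsPerJob is not None:
                self.assertLessEqual(len(job['mask'].getRunAndLumis()), runsPerJob)
            if filesPerJob is not None:
                self.assertLessEqual(len(job['input_files']), filesPerJob)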