def __make_subjob__(self, mj, guids, names, sjob_evnts=-1, sites=None):
    """private method to create subjob object"""
    logger.debug('generating subjob to run %d events in-total on files: %s' % (sjob_evnts, repr(guids)))
    j = Job()
    j.name = mj.name
    j.inputdata = mj.inputdata
    if j.inputdata.type in ['', 'DQ2']:
        j.inputdata.guids = guids
        j.inputdata.names = names
    j.outputdata = mj.outputdata
    j.application = mj.application
    if sjob_evnts != -1:
        j.application.max_events = sjob_evnts
    j.backend = mj.backend
    if j.backend._name in ['LCG'] and j.backend.requirements._name == 'AtlasLCGRequirements':
        if sites:
            j.backend.requirements.sites = sites
    j.inputsandbox = mj.inputsandbox
    j.outputsandbox = mj.outputsandbox
    return j
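# A minimal sketch (an assumption, not the actual Ganga splitter code) of how a
# split() method might drive __make_subjob__ above: chunk the master job's GUID
# and file-name lists into fixed-size groups and emit one subjob per chunk. The
# files-per-subjob setting 'numfiles' is hypothetical.
def split_sketch(splitter, mj):
    subjobs = []
    guids = mj.inputdata.guids
    names = mj.inputdata.names
    step = max(1, splitter.numfiles)  # hypothetical files-per-subjob attribute
    for start in range(0, len(guids), step):
        subjobs.append(splitter.__make_subjob__(mj,
                                                guids[start:start + step],
                                                names[start:start + step]))
    return subjobs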
def split(self, job):
    from Ganga.GPIDev.Lib.Job import Job
    logger.debug("AnaTaskSplitterJob split called")
    sjl = []
    transform = stripProxy(job.application.getTransform())
    transform.setAppStatus(job.application, "removed")
    # Do the splitting
    for sj in self.subjobs:
        j = Job()
        j.inputdata = transform.partitions_data[sj - 1]
        j.outputdata = job.outputdata
        j.application = job.application
        j.application.atlas_environment.append("OUTPUT_FILE_NUMBER=%i" % sj)
        j.backend = job.backend
        if transform.partitions_sites:
            if hasattr(j.backend.requirements, 'sites'):
                j.backend.requirements.sites = transform.partitions_sites[sj - 1]
            else:
                j.backend.site = transform.partitions_sites[sj - 1]
        j.inputsandbox = job.inputsandbox
        j.outputsandbox = job.outputsandbox
        sjl.append(j)
        # Task handling
        j.application.tasks_id = job.application.tasks_id
        j.application.id = transform.getNewAppID(sj)
        #transform.setAppStatus(j.application, "submitting")
    if not job.application.tasks_id.startswith("00"):
        job.application.tasks_id = "00:%s" % job.application.tasks_id
    return sjl
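# A hedged usage sketch: the splitter above carries the 1-based partition
# numbers to materialise in its 'subjobs' field, and split() builds one Job per
# partition from the transform's cached data. The surrounding task/transform
# setup and 'master_job' are assumed to exist already.
splitter = AnaTaskSplitterJob()
splitter.subjobs = [1, 2, 3]              # partition numbers to create
subjob_list = splitter.split(master_job)  # master_job: an existing task Job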
def create_gaudi_subjob(job, inputdata):
    j = Job()
    j.name = job.name
    j.application = copy_app(job.application)
    j.backend = job.backend  # no need to deepcopy
    if inputdata:
        j.inputdata = inputdata
        if hasattr(j.application, 'extra'):
            j.application.extra.inputdata = j.inputdata
    else:
        j.inputdata = None
        if hasattr(j.application, 'extra'):
            j.application.extra.inputdata = BesDataset()
    j.outputsandbox = job.outputsandbox[:]
    j.outputdata = job.outputdata
    return j
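# copy_app() is referenced above but not defined here. A plausible minimal
# implementation (an assumption, not the actual helper) deep-copies the
# application so per-subjob tweaks such as extra.inputdata cannot leak back
# into the master job's application object:
import copy

def copy_app(app):
    # deep copy: each subjob gets an independent application to mutate
    return copy.deepcopy(app)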
def test__setup_bulk_subjobs(tmpdir, db):
    from Ganga.Core.exceptions import BackendError
    from Ganga.GPIDev.Lib.Dataset.Dataset import Dataset
    from GangaDirac.Lib.Backends import Dirac

    name = str(tmpdir.join('submit_script'))
    with open(name, 'w') as fd:
        fd.write(script_template.replace('###PARAMETRIC_INPUTDATA###', str([['a'], ['b']])))

    with pytest.raises(BackendError):
        db._setup_bulk_subjobs([], name)

    d = Dirac()
    j = Job()
    j.id = 0  # This would normally be set by the registry if this was a proxy job
    j.application = Executable()
    j.splitter = ArgSplitter()
    j.splitter.args = [['a'], ['b'], ['c'], ['d'], ['e']]
    j.inputdata = Dataset()
    j.backend = d
    d._parent = j

    dirac_ids = [123, 456]

    def fake_setup_subjob_dataset(dataset):
        assert dataset in [['a'], ['b']], 'dataset not passed properly'

    with patch.object(d, '_setup_subjob_dataset', fake_setup_subjob_dataset):
        assert d._setup_bulk_subjobs(dirac_ids, name), 'didnt run'

    assert len(j.subjobs) == len(dirac_ids), 'didnt work'
    for id_, backend_id, subjob in zip(range(len(dirac_ids)), dirac_ids, j.subjobs):
        assert id_ == subjob.id, 'ids dont match'
        assert backend_id == subjob.backend.id, 'backend.ids dont match'
        assert isinstance(subjob.application, j.application.__class__), 'apps dont match'
        assert subjob.splitter is None, 'splitter not done'
        assert isinstance(subjob.backend, j.backend.__class__), 'backend dont match'
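# For context, patch.object (from the standard unittest.mock) temporarily swaps
# an attribute on an object for the duration of the 'with' block and restores
# it afterwards. A self-contained sketch of the same pattern the test uses,
# with a made-up Backend class standing in for Dirac:
from unittest.mock import patch

class Backend:
    def _setup_subjob_dataset(self, dataset):
        raise RuntimeError("would talk to the real backend")

b = Backend()
calls = []
with patch.object(b, '_setup_subjob_dataset', lambda ds: calls.append(ds)):
    b._setup_subjob_dataset(['a'])  # hits the fake, which just records the call
assert calls == [['a']]

restored = False
try:
    b._setup_subjob_dataset(['a'])
except RuntimeError:
    restored = True
assert restored  # the original method is back once the with block exits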
def split(self, job):
    from Ganga.GPIDev.Lib.Job import Job
    subjobs = []
    primeTables = job.inputdata.get_dataset()

    ## avoid creating jobs with nothing to do
    if self.numsubjobs > len(primeTables):
        self.numsubjobs = len(primeTables)

    ## create subjobs
    for i in range(self.numsubjobs):
        j = Job()
        j.application = job.application
        j.inputdata = job.inputdata
        j.inputdata.table_id_lower = 1
        j.inputdata.table_id_upper = 1
        j.outputdata = job.outputdata
        j.inputsandbox = job.inputsandbox
        j.outputsandbox = job.outputsandbox
        j.backend = job.backend
        subjobs.append(j)

    ## chunksize of each subjob (integer division; '/' would yield a float on Python 3)
    chunksize = len(primeTables) // self.numsubjobs
    offset = 0
    for i in range(len(subjobs)):
        my_chunksize = chunksize
        if len(primeTables) % self.numsubjobs >= i + 1:
            my_chunksize += 1
        ## set lower bound id (inclusive)
        subjobs[i].inputdata.table_id_lower = offset + 1
        ## fill subjob with prime tables
        #for j in range(my_chunksize):
        #    subjobs[i].application.addPrimeTable(primeTables[offset+j])
        offset += my_chunksize
        ## set upper bound id (inclusive)
        subjobs[i].inputdata.table_id_upper = offset

    return subjobs
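# The id-range arithmetic above spreads len(primeTables) tables over numsubjobs
# subjobs as evenly as possible: every subjob gets the base chunk, and the
# first (len % numsubjobs) subjobs get one extra table. A standalone sketch of
# the same bookkeeping (illustrative helper, not part of the splitter):
def table_ranges(n_tables, n_subjobs):
    base, extra = divmod(n_tables, n_subjobs)
    ranges, offset = [], 0
    for i in range(n_subjobs):
        size = base + (1 if i < extra else 0)
        ranges.append((offset + 1, offset + size))  # inclusive, 1-based bounds
        offset += size
    return ranges

# e.g. table_ranges(10, 3) -> [(1, 4), (5, 7), (8, 10)]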