def test_update(self):
    from Ganga import GPI
    t = GPI.LHCbTask()
    tr = GPI.LHCbTransform(application=DaVinci(), backend=Dirac())
    t.appendTransform(tr)
    try:
        bkQueryList = [GPI.BKTestQuery(stripping20up)]
        tr.updateQuery()
        assert False, 'Should have thrown exception if updated with no query'
    except:
        tr.addQuery(GPI.BKTestQuery(stripping20down))

        # Check some new data added
        assert len(tr.inputdata), 'No data added after call to update'

        try:
            # Shouldn't allow a second update before processing the data in
            # toProcess_dataset
            tr.updateQuery()
            assert False, 'Should have thrown an error if updated with files already to process'
        except:
            # run so we can update again with a removed dataset; recall that jobs with
            # the old dataset are only created when run is called.
            t.run()
            assert len(tr.getJobs()), "No Jobs created upon run()"
            job = GPI.jobs(int(tr.getJobs()[0].fqid.split('.')[0]))
            sleep_until_state(job, 300, 'submitted')
            del tr._impl.query.dataset.files[0]
            tr.update(True)

            # Check the dead dataset is picked up
            assert len(tr._impl.removed_data.files), "Didn't pick up loss of a dataset"
            job.remove()

def getJobs(self):
    """ Get the job slice of all jobs that process this task """
    jobslice = JobRegistrySlice("tasks(%i).getJobs()" % (self.id))
    for trf in self.transforms:
        for jid in trf.getJobs():
            jobslice.objects[GPI.jobs(jid).fqid] = stripProxy(GPI.jobs(jid))
    return JobRegistrySliceProxy(jobslice)

def getTransform(self):
    tid = self.tasks_id.split(":")
    if len(tid) == 2 and tid[0].isdigit() and tid[1].isdigit():
        try:
            task = GPI.tasks(int(tid[0]))
        except KeyError:
            return None
        if task:
            return task.transforms[int(tid[1])]
    if len(tid) == 3 and tid[1].isdigit() and tid[2].isdigit():
        task = GPI.tasks(int(tid[1]))
        if task:
            return task.transforms[int(tid[2])]
    return None

def test_appendTransform(self):
    from Ganga import GPI
    tr1 = GPI.LHCbTransform(application=DaVinci(), backend=Local())
    t = GPI.LHCbTask()

    # Try appending
    t.appendTransform(tr1)
    assert len(t.transforms), 'Didn\'t append a transform properly'

    # Try appending a transform with a query and check for update
    tr2 = GPI.LHCbTransform(application=DaVinci(), backend=Local())
    tr2.addQuery(GPI.BKTestQuery(stripping15up))
    t.appendTransform(tr2)
    assert len(t.transforms[-1]._impl.toProcess_dataset.files), \
        'Transform not updated properly after appending'

def createNewJob(self):
    """Create any jobs required for this unit"""
    j = GPI.Job()
    j._impl.backend = self._getParent().backend.clone()
    j._impl.application = self._getParent().application.clone()
    if self.inputdata is not None:
        j.inputdata = self.inputdata.clone()

    trf = self._getParent()
    task = trf._getParent()

    # copy across the outputfiles
    for f in trf.outputfiles:
        j.outputfiles += [f.clone()]

    j.inputsandbox = trf.inputsandbox

    # Sort out the splitter
    if trf.splitter:
        j.splitter = trf.splitter.clone()

    # Postprocessors
    for pp in trf.postprocessors:
        j.postprocessors.append(deepcopy(pp))

    return j

def copyOutput(self):
    """Copy the output data to local storage"""
    job = GPI.jobs(self.active_job_ids[0])

    if self.copy_output._name != "TaskLocalCopy" or job.outputdata._impl._name != "DQ2OutputDataset":
        logger.error("Cannot transfer from DS type '%s' to '%s'. Please contact plugin developer."
                     % (job.outputdata._name, self.copy_output._name))
        return False

    # check which files still need downloading
    to_download = []
    for f in job.outputfiles:

        # check for REs
        if self.copy_output.isValid(os.path.join(f.localDir, f.namePattern)) and \
                not self.copy_output.isDownloaded(os.path.join(f.localDir, f.namePattern)):
            to_download.append(f)

    # is everything downloaded?
    if len(to_download) == 0:
        return True

    # nope, so pick the requested number and off we go
    for f in to_download:
        f.get()

    return False

def n_all(self):
    total = 0
    for jid in self.active_job_ids:
        try:
            job = GPI.jobs(jid)
        except Exception as err:
            logger.debug("n_all Err: %s" % str(err))
            task = self._getParent()._getParent()
            trf = self._getParent()
            logger.warning("Cannot find job with id %d. Maybe reset this unit with: tasks(%d).transforms[%d].resetUnit(%d)"
                           % (jid, task.id, trf.getID(), self.getID()))
            continue

        j = stripProxy(job)

        # try to preserve lazy loading
        if hasattr(j, 'getNodeIndexCache') and j.getNodeIndexCache() and 'subjobs:status' in j.getNodeIndexCache():
            if len(j.getNodeIndexCache()['subjobs:status']) != 0:
                total += len(j.getNodeIndexCache()['subjobs:status'])
            else:
                total += 1
        else:
            #logger.warning("WARNING: (status check) No index cache for job object %d" % jid)
            # accumulate (as in the cached branch above) so every active job is counted
            if j.subjobs:
                total += len(j.subjobs)
            else:
                total += 1

    return total

def createNewJob(self):
    """Create any jobs required for this unit"""
    j = GPI.Job()

    j.backend = self._getParent().backend.clone()

    # copy from ourselves or the parent transform depending on what's
    # specified
    fields = ['application', 'splitter', 'inputfiles', 'inputdata',
              'inputsandbox', 'outputfiles', 'postprocessors']

    for f in fields:
        if (f == "postprocessors" and len(getattr(self, f).process_objects) > 0):
            j.postprocessors = copy.deepcopy(addProxy(self).postprocessors)
        elif (f != "postprocessors" and getattr(self, f)):
            setattr(j, f, copy.deepcopy(getattr(self, f)))
        elif (f == "postprocessors" and len(getattr(self._getParent(), f).process_objects) > 0):
            j.postprocessors = copy.deepcopy(addProxy(self._getParent()).postprocessors)
        elif (f != "postprocessors" and getattr(self._getParent(), f)):
            setattr(j, f, copy.deepcopy(getattr(self._getParent(), f)))

    return j

def checkOutputContainers(self):
    """Go through all completed units and make sure datasets are registered as required"""
    logger.info("Cleaning out transform %d container..." % self.getID())

    try:
        dslist = []
        dq2_lock.acquire()
        try:
            dslist = dq2.listDatasetsInContainer(self.getContainerName())
        except:
            dslist = []

        try:
            dq2.deleteDatasetsFromContainer(self.getContainerName(), dslist)
        except DQContainerDoesNotHaveDataset:
            pass
        except DQException as x:
            # catch the DQ2-specific error first so the generic handler does not shadow it
            logger.error('DQ2 Problem cleaning out Transform container: %s %s' % (x.__class__, x))
        except Exception as x:
            logger.error("Problem cleaning out Transform container: %s %s", x.__class__, x)
    finally:
        dq2_lock.release()

    logger.info("Checking output data has been registered for Transform %d..." % self.getID())
    for unit in self.units:
        if len(unit.active_job_ids) == 0:
            continue

        if unit.status == "completed" and GPI.jobs(unit.active_job_ids[0]).outputdata and \
                GPI.jobs(unit.active_job_ids[0]).outputdata._impl._name == "DQ2OutputDataset":
            logger.info("Checking containers in Unit %d..." % unit.getID())
            unit.registerDataset()

def test_OptionsFileSplitter_split(self):
    splitter = GPI.OptionsFileSplitter()
    splitter.optsArray = ['dummy1.opt', 'dummy2.opt', 'dummy3.opt']
    job = Job(application=DaVinci())
    job.prepare()
    #job.application.extra = GaudiExtras()
    subjobs = stripProxy(splitter).split(job)
    assert len(subjobs) == 3, 'incorrect number of subjobs'

def updateStatus(self, status):
    """Update status hook"""

    # register the dataset if applicable
    if status == "completed":
        job = GPI.jobs(self.active_job_ids[0])
        if job.outputdata and job.outputdata._impl._name == "DQ2OutputDataset" and not self.registerDataset():
            return

    super(AtlasUnit, self).updateStatus(status)

def test_update(self):
    from Ganga import GPI
    t = GPI.LHCbTask()
    tr1 = GPI.LHCbTransform(application=DaVinci(), backend=Local())
    tr2 = GPI.LHCbTransform(application=DaVinci(), backend=Local())
    t.appendTransform(tr1)
    t.appendTransform(tr2)
    tr1.addQuery(GPI.BKTestQuery(stripping15up))
    tr2.addQuery(GPI.BKTestQuery(stripping15down))

    # Check that update produces some files to process over multiple
    # transforms
    t.update()
    assert len(t.transforms[0]._impl.toProcess_dataset.files), \
        'Update did not produce any datafiles to process in transform 0'
    assert len(t.transforms[1]._impl.toProcess_dataset.files), \
        'Update did not produce any datafiles to process in transform 1'

def removeUnusedJobs(self):
    """Remove all jobs that aren't being used, e.g. failed jobs"""
    for unit in self.units:
        for jid in unit.prev_job_ids:
            try:
                logger.warning("Removing job '%d'..." % jid)
                job = GPI.jobs(jid)
                job.remove()
            except:
                logger.error("Problem removing job '%d'" % jid)

def _getPartitionMasterJob(self, partition):
    """Get the master job from any number of partition jobs."""
    partition_jobs = self.getPartitionJobs(partition)  # only call method once
    if not len(partition_jobs):
        raise GangaException(
            None, "Cant get partition master job when NO jobs assigned to partition")
    elif len(partition_jobs) == 1:
        return partition_jobs[0]

    # Need registry access here; might be better to get the registry directly
    # as in prepared stuff, see Executable for example or even
    # tasksregistry.py!
    return GPI.jobs(partition_jobs[0].fqid.split(".")[0])

def createChainUnit(self, parent_units, use_copy_output=True):
    """Create an output unit given this output data"""

    # we need a parent job that has completed to get the output files
    incl_pat_list = []
    excl_pat_list = []
    for parent in parent_units:
        if len(parent.active_job_ids) == 0 or parent.status != "completed":
            return None

        for inds in self.inputdata:
            from Ganga.GPI import TaskChainInput
            if isType(inds, TaskChainInput) and inds.input_trf_id == parent._getParent().getID():
                incl_pat_list += inds.include_file_mask
                excl_pat_list += inds.exclude_file_mask

    # go over the output files and copy the appropriate ones over as input
    # files
    flist = []
    import re
    for parent in parent_units:
        job = GPI.jobs(parent.active_job_ids[0])
        if job.subjobs:
            job_list = job.subjobs
        else:
            job_list = [job]

        for sj in job_list:
            for f in sj.outputfiles:

                # match any dirac files that are allowed in the file mask
                if isType(f, DiracFile):
                    if len(incl_pat_list) > 0:
                        for pat in incl_pat_list:
                            if re.search(pat, f.lfn):
                                flist.append("LFN:" + f.lfn)
                    else:
                        flist.append("LFN:" + f.lfn)

                    if len(excl_pat_list) > 0:
                        for pat in excl_pat_list:
                            if re.search(pat, f.lfn) and "LFN:" + f.lfn in flist:
                                flist.remove("LFN:" + f.lfn)

    # just do one unit that uses all data
    unit = LHCbUnit()
    unit.name = "Unit %d" % len(self.units)
    unit.inputdata = LHCbDataset(files=[DiracFile(lfn=f) for f in flist])

    return unit

def getParentUnitJobs(self, parent_units, include_subjobs=True):
    """Return the list of parent jobs"""
    job_list = []
    for parent in parent_units:
        job = GPI.jobs(parent.active_job_ids[0])
        if job.subjobs:
            job_list += job.subjobs
        else:
            job_list += [job]

    return job_list

def removeUnusedJobs(self):
    """Remove all jobs that aren't being used, e.g. failed jobs"""
    for unit in self.units:
        for jid in unit.prev_job_ids:
            try:
                logger.warning("Removing job '%d'..." % jid)
                job = GPI.jobs(jid)
                job.remove()
            except Exception as err:
                logger.debug("removeUnused: %s" % str(err))
                logger.error("Problem removing job '%d'" % jid)

def remove(self, remove_jobs="do_nothing"):
    """Delete the task"""

    # make sure the task isn't running
    if self.status.find("running") != -1:
        logger.error("Task is still running. Please pause before removing!")
        return

    if remove_jobs not in [True, False]:
        logger.info("You want to remove the task %i named '%s'." % (self.id, self.name))
        logger.info("Since this operation cannot be easily undone, please call this command again:")
        logger.info(" * as tasks(%i).remove(remove_jobs=True) if you want to remove all associated jobs," % (self.id))
        logger.info(" * as tasks(%i).remove(remove_jobs=False) if you want to keep the jobs." % (self.id))
        return

    if remove_jobs:
        for trf in self.transforms:
            for unit in trf.units:
                for jid in unit.active_job_ids:
                    try:
                        j = GPI.jobs(jid)
                        j.remove()
                    except Exception as err:
                        logger.debug("Remove Err: %s" % str(err))

                for jid in unit.prev_job_ids:
                    try:
                        j = GPI.jobs(jid)
                        j.remove()
                    except Exception as err2:
                        logger.debug("Remove Err2: %s" % str(err2))

    self._getRegistry()._remove(self)
    logger.info("Task #%s deleted" % self.id)

def updateStatus(self, status):
    """Update status hook"""

    # check for input data deletion of chain data
    if status == "completed" and self._getParent().delete_chain_input and len(self.req_units) > 0:

        # the inputdata field *must* be filled from the parent task
        # NOTE: When changing to inputfiles, will probably need to check
        # for any specified in trf.inputfiles

        # check that the parent replicas have been copied by checking
        # backend status == Done
        job_list = []
        for req_unit in self.req_units:
            trf = self._getParent()._getParent().transforms[int(req_unit.split(":")[0])]
            req_unit_id = req_unit.split(":")[1]

            if req_unit_id != "ALL":
                unit = trf.units[int(req_unit_id)]
                job_list.append(GPI.jobs(unit.active_job_ids[0]))
            else:
                for unit in trf.units:
                    job_list.append(GPI.jobs(unit.active_job_ids[0]))

        for j in job_list:
            if j.subjobs:
                for sj in j.subjobs:
                    if sj.backend.status != "Done":
                        return
            else:
                if j.backend.status != "Done":
                    return

        job = GPI.jobs(self.active_job_ids[0])
        for f in job.inputdata.files:
            logger.warning("Removing chain inputdata file '%s'..." % f.name)
            f.remove()

    super(LHCbUnit, self).updateStatus(status)

def checkForResubmission(self):
    """check if this unit should be resubmitted"""

    # check if we already have a job
    if len(self.active_job_ids) == 0:
        return False
    else:
        job = GPI.jobs(self.active_job_ids[0])
        if job.status in ["failed", "killed"]:
            return True

    return False

def test_addQuery(self):
    from Ganga import GPI
    tr = GPI.LHCbTransform(application=DaVinci(), backend=Local())
    t = GPI.LHCbTask()

    # Check non-lists and adding query to transform and non-associated
    t.addQuery(tr, GPI.BKTestQuery(stripping15up))
    assert len(t.transforms), 'Transform not associated correctly'
    assert t.transforms[0].queries[0].path == stripping15up, \
        'Query path not correctly assigned'

    # Check duplicating
    t.addQuery(tr, bkQueryList)
    assert len(t.transforms) == 4, 'Problem duplicating and appending transforms'
    tmpList = [stripping15up, stripping15down, stripping16up, stripping16down]
    for tran in t.transforms:
        assert tran.queries[0].path in tmpList, \
            'Query attribute not setup properly for all transforms'

def test_GaussSplitter_split(self):
    job = Job(application=Gauss())
    job.application.platform = 'x86_64-slc6-gcc48-opt'
    f = open('this-is-not-a-file.opts', 'w')
    f.write('')
    f.close()
    job.application.optsfile = 'this-is-not-a-file.opts'  # hack for Gauss
    stripProxy(job.application).master_configure()
    job.prepare()
    gsplit = GPI.GaussSplitter(eventsPerJob=1, numberOfJobs=3)
    subjobs = stripProxy(gsplit).split(job)
    assert len(subjobs) == 3, 'incorrect # of jobs'

def createNewJob(self):
    """Create any jobs required for this unit"""
    j = GPI.Job()
    j._impl.backend = self._getParent().backend.clone()
    j._impl.application = self._getParent().application.clone()
    j.inputdata = self.inputdata.clone()

    trf = self._getParent()
    task = trf._getParent()

    # copy across the outputfiles
    for f in trf.outputfiles:
        j.outputfiles += [f.clone()]

    j.inputsandbox = trf.inputsandbox

    if type(self.eventswanted) == type(''):
        subLines = self.eventswanted
    else:
        subLines = '\n'.join(self.eventswanted)

    # Base for the naming of each subjob's CSV file
    incsvfile = j._impl.application.csvfile
    tmpname = os.path.basename(incsvfile)
    if len(tmpname.split('.')) > 1:
        patterncsv = '.'.join(tmpname.split('.')[0:-1]) + "_sub%d." + tmpname.split('.')[-1]
    else:
        patterncsv = tmpname + "_sub%d"

    from Ganga.GPIDev.Lib.File import FileBuffer
    thiscsv = patterncsv % self.subpartid

    # Create the CSV file for this Unit
    j._impl.getInputWorkspace().writefile(FileBuffer(thiscsv, subLines), executable=0)
    j._impl.application.csvfile = j._impl.getInputWorkspace().getPath() + thiscsv
    j.inputsandbox.append(j._impl.getInputWorkspace().getPath() + thiscsv)

    # Base for the naming of each subjob's output file
    tmpname = os.path.basename(j._impl.application.outputfile)
    if len(tmpname.split('.')) > 1:
        patternout = '.'.join(tmpname.split('.')[0:-1]) + "_sub%d." + tmpname.split('.')[-1]
    else:
        patternout = tmpname + "_sub%d"
    j._impl.application.outputfile = patternout % self.subpartid

    # Sort out the splitter
    if trf.splitter:
        j.splitter = trf.splitter.clone()

    return j

def createNewJob(self, partition):
    """ Returns a new job initialized with the transform's application, backend and name """
    task = self._getParent()  # this works because createNewJob is only called by a task
    id = task.transforms.index(self)
    j = GPI.Job()
    stripProxy(j).backend = self.backend.clone()
    stripProxy(j).application = self.application.clone()
    stripProxy(j).application.tasks_id = "%i:%i" % (task.id, id)
    stripProxy(j).application.id = self.getNewAppID(partition)
    j.inputdata = self.inputdata
    j.outputdata = self.outputdata
    j.inputsandbox = self.inputsandbox
    j.outputsandbox = self.outputsandbox
    j.name = "T%i:%i C%i" % (task.id, id, partition)
    return j

def n_active(self):

    if self.status == 'completed':
        return 0

    tot_active = 0
    active_states = ['submitted', 'running']

    for jid in self.active_job_ids:

        try:
            job = GPI.jobs(jid)
        except Exception as err:
            logger.debug("n_active Err: %s" % str(err))
            task = self._getParent()._getParent()
            trf = self._getParent()
            logger.warning("Cannot find job with id %d. Maybe reset this unit with: tasks(%d).transforms[%d].resetUnit(%d)"
                           % (jid, task.id, trf.getID(), self.getID()))
            continue

        j = stripProxy(job)

        # try to preserve lazy loading
        if hasattr(j, 'getNodeIndexCache') and j.getNodeIndexCache() and 'subjobs:status' in j.getNodeIndexCache():
            if len(j.getNodeIndexCache()['subjobs:status']) > 0:
                for sj_stat in j.getNodeIndexCache()['subjobs:status']:
                    if sj_stat in active_states:
                        tot_active += 1
            else:
                if j.getNodeIndexCache()['status'] in active_states:
                    tot_active += 1
        else:
            #logger.warning("WARNING: (active check) No index cache for job object %d" % jid)
            if j.status in active_states:
                if j.subjobs:
                    for sj in j.subjobs:
                        if sj.status in active_states:
                            tot_active += 1
                else:
                    tot_active += 1

    return tot_active

def createChainUnit(self, parent_units, use_copy_output=True):
    """Create a chained unit using the output data from the given units"""

    # check all parent units for copy_output
    copy_output_ok = True
    for parent in parent_units:
        if not parent.copy_output:
            copy_output_ok = False

    # all parent units must be completed so the outputfiles are filled correctly
    for parent in parent_units:
        if parent.status != "completed":
            return None

    if not use_copy_output or not copy_output_ok:
        unit = ND280Unit_CSVEvtList()
        unit.inputdata = ND280LocalDataset()
        for parent in parent_units:
            # loop over the output files and add them to the ND280LocalDataset - THIS MIGHT NEED SOME WORK!
            job = GPI.jobs(parent.active_job_ids[0])
            for f in job.outputfiles:
                # should check for different file types and add them as appropriate to the dataset
                # self.inputdata (== TaskChainInput).include/exclude_file_mask could help with this
                # This will be A LOT easier with Ganga 6.1 as you can easily map outputfiles -> inputfiles!
                unit.inputdata.names.append(os.path.join(job.outputdir, f.namePattern))
    else:
        unit = ND280Unit_CSVEvtList()
        unit.inputdata = ND280LocalDataset()

        for parent in parent_units:
            # unit needs to have completed and downloaded before we can get file list
            if parent.status != "completed":
                return None

            # we should be OK so copy all output to the dataset
            for f in parent.copy_output.files:
                unit.inputdata.names.append(os.path.join(parent.copy_output.local_location, f))

    return unit

def createNewJob(self, partition):
    """ Returns a new job initialized with the transform's application, backend and name """
    j = GPI.Job()
    stripProxy(j).backend = self.backend.clone()
    stripProxy(j).application = self.application.clone()
    stripProxy(j).application.tasks_id = "%i:%i" % (self.task_id, self.transform_id)
    stripProxy(j).application.id = self.getNewAppID(partition)
    if self.splitter is not None:
        stripProxy(j).splitter = LHCbTaskDummySplitter(self.splitter)
    # if self.merger is not None:
    #     stripProxy(j).merger = self.merger
    j.inputdata = self.toProcess_dataset
    j.outputdata = self.outputdata
    j.inputsandbox = self.inputsandbox
    j.outputsandbox = self.outputsandbox
    j.name = "T%i Tr%i P%i" % (self.task_id, self.transform_id, partition)
    j.do_auto_resubmit = True
    self.toProcess_dataset.files = []
    return j

def removeUnusedData(self):
    """Remove any output data from orphaned jobs"""
    for unit in self.units:
        for jid in unit.prev_job_ids:
            try:
                logger.warning("Removing data from job '%d'..." % jid)
                job = GPI.jobs(jid)

                jlist = []
                if len(job.subjobs) > 0:
                    jlist = job.subjobs
                else:
                    jlist = [job]

                for sj in jlist:
                    for f in sj.outputfiles:
                        # isType returns a bool, so don't compare it with the string "DiracFile"
                        if isType(f, DiracFile) and f.lfn:
                            f.remove()
            except:
                logger.error("Problem deleting data for job '%d'" % jid)

def n_status(self, status):
    tot_active = 0
    for jid in self.active_job_ids:

        try:
            job = GPI.jobs(jid)
        except Exception as err:
            logger.debug("n_status Err: %s" % str(err))
            task = self._getParent()._getParent()
            trf = self._getParent()
            logger.warning("Cannot find job with id %d. Maybe reset this unit with: tasks(%d).transforms[%d].resetUnit(%d)"
                           % (jid, task.id, trf.getID(), self.getID()))
            continue

        j = stripProxy(job)

        # try to preserve lazy loading
        if hasattr(j, '_index_cache') and j._index_cache and 'subjobs:status' in j._index_cache:
            if len(j._index_cache['subjobs:status']) > 0:
                for sj_stat in j._index_cache['subjobs:status']:
                    if sj_stat == status:
                        tot_active += 1
            else:
                if j._index_cache['status'] == status:
                    tot_active += 1
        else:
            #logger.warning("WARNING: (status check) No index cache for job object %d" % jid)
            if j.subjobs:
                for sj in j.subjobs:
                    if sj.status == status:
                        tot_active += 1
            else:
                if j.status == status:
                    tot_active += 1

    return tot_active

def getContainerList(self):
    """Return a list of the output containers associated with this unit"""
    job = GPI.jobs(self.active_job_ids[0])
    cont_list = []
    if job.backend._impl._name == "Jedi":
        # Jedi jobs have their datasets stored in datasetList
        for ds in job.outputdata.datasetList:
            cont_list.append(ds)
    elif job.backend.individualOutDS:
        # find all the individual out ds's
        for ds in job.subjobs(0).outputdata.output:
            # find all containers listed
            for cont_name in ds.split(","):
                if not cont_name.endswith("/"):
                    continue
                if cont_name not in cont_list:
                    cont_list.append(cont_name)
    else:
        cont_list.append(job.outputdata.datasetname)

    return cont_list

def createChainUnit(self, parent_units, use_copy_output=True):
    """Create an output unit given this output data"""

    # we need valid parent jobs
    for parent in parent_units:
        # need datasetname filled for Panda jobs
        if len(parent.active_job_ids) == 0 or \
                (GPI.jobs(parent.active_job_ids[0]).application._impl._name != "TagPrepare" and
                 GPI.jobs(parent.active_job_ids[0]).outputdata and
                 GPI.jobs(parent.active_job_ids[0]).backend._impl._name == "Panda" and
                 GPI.jobs(parent.active_job_ids[0]).outputdata.datasetname == ""):
            return None

        # need datasetList filled for Jedi jobs
        if len(parent.active_job_ids) == 0 or \
                (GPI.jobs(parent.active_job_ids[0]).application._impl._name != "TagPrepare" and
                 GPI.jobs(parent.active_job_ids[0]).outputdata and
                 GPI.jobs(parent.active_job_ids[0]).backend._impl._name == "Jedi" and
                 len(GPI.jobs(parent.active_job_ids[0]).outputdata.datasetList) == 0):
            return None

        # for local jobs, make sure units are complete
        if GPI.jobs(parent_units[0].active_job_ids[0]).outputdata._impl._name == "ATLASOutputDataset" and \
                parent.status != "completed":
            return None

    # Are we doing Local -> Local? i.e. are we going from ATLASOutputDataset?
    # Problem: Doesn't take into account merger locations...
    if GPI.jobs(parent_units[0].active_job_ids[0]).outputdata._impl._name == "ATLASOutputDataset":
        unit = AtlasUnit()
        unit.inputdata = ATLASLocalDataset()

        for parent in parent_units:
            for l in GPI.jobs(parent.active_job_ids[0]).outputdata.output:
                unit.inputdata.names += l

    # should we use the copy_output (ie. local output). Special case for TagPrepare
    elif GPI.jobs(parent_units[0].active_job_ids[0]).application._impl._name == "TagPrepare":

        # make sure all have completed before taking the tag-info
        if parent_units[0].status != "completed":
            return None

        unit = AtlasUnit()
        unit.inputdata = DQ2Dataset()
        unit.inputdata.tag_info = GPI.jobs(parent_units[0].active_job_ids[0]).application.tag_info

    elif not use_copy_output or not parent.copy_output:
        unit = AtlasUnit()
        unit.inputdata = DQ2Dataset()
        ds_list = []
        for parent in parent_units:

            # Don't just use the main datasetname as Jedi introduces separate containers for logs and output files
            if GPI.jobs(parent.active_job_ids[0]).backend._impl._name == "Jedi":
                for ds in GPI.jobs(parent.active_job_ids[0]).outputdata.datasetList:
                    if not ds.endswith(".log/"):
                        unit.inputdata.dataset.append(ds)
            else:
                unit.inputdata.dataset.append(GPI.jobs(parent.active_job_ids[0]).outputdata.datasetname)

    else:

        unit = AtlasUnit()
        unit.inputdata = ATLASLocalDataset()

        for parent in parent_units:
            # unit needs to have completed and downloaded
            if parent.status != "completed":
                return None

            # we should be OK so copy all output to an ATLASLocalDataset
            for f in parent.copy_output.files:
                unit.inputdata.names.append(os.path.join(parent.copy_output.local_location, f))

    return unit

def copyOutput(self):
    """Copy the output data to local storage"""

    job = GPI.jobs(self.active_job_ids[0])

    if self.copy_output._name != "TaskLocalCopy" or job.outputdata._impl._name != "DQ2OutputDataset":
        logger.error("Cannot transfer from DS type '%s' to '%s'. Please contact plugin developer."
                     % (job.outputdata._name, self.copy_output._name))
        return False

    # get list of output files
    self._acquireDownloadLock()
    dq2_list = []
    if len(self.output_file_list) == 0:
        for ds in self.getOutputDatasetList():
            dq2_list = dq2.listFilesInDataset(ds)

            # merge job DSs leave empty non-merged DSs around
            if job.backend.__class__.__name__ == "Panda" and job.backend.requirements.enableMerge and \
                    not ds.endswith("merge") and len(dq2_list) == 0:
                continue

            for guid in dq2_list[0].keys():
                self.output_file_list[dq2_list[0][guid]['lfn']] = ds

    # check which ones still need downloading
    to_download = {}
    for f in self.output_file_list.keys():

        # check for REs
        if self.copy_output.isValid(f) and not self.copy_output.isDownloaded(f):
            to_download[f] = self.output_file_list[f]

    # store download location in case it's changed while downloading
    download_loc = self.copy_output.local_location
    self._releaseDownloadLock()

    # is everything downloaded?
    if len(to_download.keys()) == 0:
        return True

    # nope, so pick the requested number and off we go
    thread_array = []
    for fname in to_download.keys()[:self._getParent().num_dq2_threads]:
        dsname = to_download[fname]
        exe = 'dq2-get -L ROAMING -a -d -H %s -f %s %s' % (download_loc, fname, dsname)

        logger.info("Downloading '%s' to %s..." % (fname, download_loc))

        thread = Download.download_dq2(exe)
        thread.start()
        thread_array.append(thread)

    for t in thread_array:
        t.join()

    self._acquireDownloadLock()

    # check for valid download - SHOULD REALLY BE A HASH CHECK
    for fname in to_download.keys()[:self._getParent().num_dq2_threads]:
        full_path = os.path.join(self.copy_output.local_location, fname)
        if not os.path.exists(full_path):
            logger.error("Error downloading '%s'. File doesn't exist after download." % full_path)
        elif os.path.getsize(full_path) < 4:
            logger.error("Error downloading '%s'. File size smaller than 4 bytes (%d)"
                         % (full_path, os.path.getsize(full_path)))
        else:
            self.copy_output.files.append(fname)
            logger.info("File '%s' downloaded successfully" % full_path)

    self._releaseDownloadLock()

    return False

def registerDataset(self):
    """Register in the transform container"""
    trf = self._getParent()
    trf_container = trf.getContainerName()

    fail = False
    try:
        containerinfo = {}
        dq2_lock.acquire()
        try:
            containerinfo = dq2.listDatasets(trf_container)
        except:
            containerinfo = {}

        if containerinfo == {}:
            try:
                dq2.registerContainer(trf_container)
                logger.info('Registered container for Unit %i of Transform %i: %s' %
                            (self.getID(), trf.getID(), trf_container))
            except DQException as x:
                # catch the DQ2-specific error first so the generic handler does not shadow it
                logger.error('DQ2 Problem registering container for Unit %i of Transform %i, %s : %s %s' %
                             (self.getID(), trf.getID(), trf_container, x.__class__, x))
                fail = True
            except Exception as x:
                logger.error('Problem registering container for Unit %i of Transform %i, %s : %s %s' %
                             (self.getID(), trf.getID(), trf_container, x.__class__, x))
                fail = True

        job = GPI.jobs(self.active_job_ids[0])
        ds_list = self.getOutputDatasetList()

        for ds in ds_list:
            try:
                dq2.registerDatasetsInContainer(trf_container, [ds])
            except DQContainerAlreadyHasDataset:
                pass
            except DQException as x:
                logger.error('DQ2 Problem registering dataset %s in container %s: %s %s' %
                             (job.outputdata.datasetname, trf_container, x.__class__, x))
                fail = True
            except Exception as x:
                logger.error('Problem registering dataset %s in container %s: %s %s' %
                             (job.outputdata.datasetname, trf_container, x.__class__, x))
                fail = True
    finally:
        dq2_lock.release()

    if fail:
        return not fail

    # add dataset to the task container
    task = trf._getParent()
    task_container = task.getContainerName()

    try:
        containerinfo = {}
        dq2_lock.acquire()
        try:
            containerinfo = dq2.listDatasets(task_container)
        except:
            containerinfo = {}

        if containerinfo == {}:
            try:
                dq2.registerContainer(task_container)
                logger.info('Registered container for Unit %i of Transform %i: %s' %
                            (self.getID(), trf.getID(), task_container))
            except DQException as x:
                logger.error('DQ2 Problem registering container for Unit %i of Transform %i in Task %i, %s : %s %s' %
                             (self.getID(), trf.getID(), task.getID(), task_container, x.__class__, x))
                fail = True
            except Exception as x:
                logger.error('Problem registering container for Unit %i of Transform %i in Task %i, %s : %s %s' %
                             (self.getID(), trf.getID(), task.getID(), task_container, x.__class__, x))
                fail = True

        ds_list = self.getOutputDatasetList()

        for ds in ds_list:
            try:
                dq2.registerDatasetsInContainer(task_container, [ds])
            except DQContainerAlreadyHasDataset:
                pass
            except DQException as x:
                logger.error('DQ2 Problem registering dataset %s in container %s: %s %s' %
                             (job.outputdata.datasetname, task_container, x.__class__, x))
                fail = True
            except Exception as x:
                logger.error('Problem registering dataset %s in container %s: %s %s' %
                             (job.outputdata.datasetname, task_container, x.__class__, x))
                fail = True
    finally:
        dq2_lock.release()

    return not fail

def update(self):
    """Update the unit and (re)submit jobs as required"""
    #logger.warning("Entered Unit %d update function..." % self.getID())

    # if we're complete, then just return
    if self.status in ["completed", "recreating"] or not self.active:
        return 0

    # check if submission is needed
    task = self._getParent()._getParent()
    trf = self._getParent()
    maxsub = task.n_tosub()

    # check parent unit(s)
    req_ok = self.checkParentUnitsAreComplete()

    # set the start time if not already set
    if len(self.req_units) > 0 and req_ok and self.start_time == 0:
        self.start_time = time.time() + trf.chain_delay * 60 - 1

    if req_ok and self.checkForSubmission() and maxsub > 0:

        # create job and submit
        addInfoString(self, "Creating Job...")
        j = self.createNewJob()
        if j.name == '':
            j.name = "T%i:%i U%i" % (task.id, trf.getID(), self.getID())

        try:
            if trf.submit_with_threads:
                addInfoString(self, "Attempting job submission with queues...")
                GPI.queues.add(j.submit)
            else:
                addInfoString(self, "Attempting job submission...")
                j.submit()
        except Exception as err:
            logger.debug("update Err: %s" % str(err))
            addInfoString(self, "Failed Job Submission")
            addInfoString(self, "Reason: %s" % (formatTraceback()))
            logger.error("Couldn't submit the job. Deactivating unit.")
            self.prev_job_ids.append(j.id)
            self.active = False
            trf._setDirty()  # ensure everything's saved
            return 1

        self.active_job_ids.append(j.id)
        self.updateStatus("running")
        trf._setDirty()  # ensure everything's saved

        if trf.submit_with_threads:
            return 0

        return 1

    # update any active jobs
    for jid in self.active_job_ids:

        # we have an active job so see if this job is OK and resubmit if
        # not
        try:
            job = GPI.jobs(jid)
        except Exception as err:
            logger.debug("Update2 Err: %s" % str(err))
            logger.warning("Cannot find job with id %d. Maybe reset this unit with: tasks(%d).transforms[%d].resetUnit(%d)"
                           % (jid, task.id, trf.getID(), self.getID()))
            continue

        if job.status == "completed":

            # check if actually completed
            if not self.checkCompleted(job):
                return 0

            # check for DS copy
            if trf.unit_copy_output:
                if not self.copy_output:
                    trf.createUnitCopyOutputDS(self.getID())

                if not self.copyOutput():
                    return 0

            # check for merger
            if trf.unit_merger:
                if not self.merger:
                    self.merger = trf.createUnitMerger(self.getID())

                if not self.merge():
                    return 0

            # all good so mark unit as completed
            self.updateStatus("completed")

        elif job.status == "failed" or job.status == "killed":

            # check for too many resubs
            if self.minor_resub_count + self.major_resub_count > trf.run_limit - 1:
                logger.error("Too many resubmits (%i). Deactivating unit." %
                             (self.minor_resub_count + self.major_resub_count))
                addInfoString(self, "Deactivating unit. Too many resubmits (%i)" %
                              (self.minor_resub_count + self.major_resub_count))
                self.active = False
                return 0

            rebroker = False

            if self.minor_resub_count > trf.minor_run_limit - 1:
                if self._getParent().rebroker_on_job_fail:
                    rebroker = True
                else:
                    logger.error("Too many minor resubmits (%i). Deactivating unit." %
                                 self.minor_resub_count)
                    addInfoString(self, "Deactivating unit. Too many resubmits (%i)" %
                                  (self.minor_resub_count + self.major_resub_count))
                    self.active = False
                    return 0

            if self.major_resub_count > trf.major_run_limit - 1:
                logger.error("Too many major resubmits (%i). Deactivating unit." %
                             self.major_resub_count)
                addInfoString(self, "Deactivating unit. Too many resubmits (%i)" %
                              (self.minor_resub_count + self.major_resub_count))
                self.active = False
                return 0

            # check the type of resubmit
            if rebroker or self.checkMajorResubmit(job):
                self.major_resub_count += 1
                self.minor_resub_count = 0

                try:
                    addInfoString(self, "Attempting major resubmit...")
                    self.majorResubmit(job)
                except Exception as err:
                    logger.debug("Update Err3: %s" % str(err))
                    logger.error("Couldn't resubmit the job. Deactivating unit.")
                    addInfoString(self, "Failed Job resubmission")
                    addInfoString(self, "Reason: %s" % (formatTraceback()))
                    self.active = False

                # break the loop now because we've probably changed the
                # active jobs list
                return 1
            else:
                self.minor_resub_count += 1
                try:
                    addInfoString(self, "Attempting minor resubmit...")
                    self.minorResubmit(job)
                except Exception as err:
                    logger.debug("Update Err4: %s" % str(err))
                    logger.error("Couldn't resubmit the job. Deactivating unit.")
                    addInfoString(self, "Failed Job resubmission")
                    addInfoString(self, "Reason: %s" % (formatTraceback()))
                    self.active = False
                    return 1

def createChainUnit(self, parent_units, use_copy_output=True):
    """Create a chained unit using the output data from the given units"""

    # check all parent units for copy_output
    copy_output_ok = True
    for parent in parent_units:
        if not parent.copy_output:
            copy_output_ok = False

    # all parent units must be completed so the outputfiles are filled correctly
    for parent in parent_units:
        if parent.status != "completed":
            return None

    if len(parent_units) == 0:
        return None

    if not use_copy_output or not copy_output_ok:
        unit = ND280Unit()
        unit.inputdata = ND280LocalDataset()
        for parent in parent_units:
            # loop over the output files and add them to the ND280LocalDataset - THIS MIGHT NEED SOME WORK!
            job = GPI.jobs(parent.active_job_ids[0])

            # if TaskChainInput.include_file_mask is not used, go the old way (see below);
            # otherwise add all files matching the include_file_mask(s) to unit.inputdata. DV.
            inc_file_mask = False
            for p in self.inputdata[0].include_file_mask:
                unit.inputdata.get_dataset(job.outputdir, p)
                inc_file_mask = True

            if not inc_file_mask:
                for f in job.outputfiles:
                    # should check for different file types and add them as appropriate to the dataset
                    # self.inputdata (== TaskChainInput).include/exclude_file_mask could help with this
                    # This will be A LOT easier with Ganga 6.1 as you can easily map outputfiles -> inputfiles!
                    # TODO: implement use of include/exclude_file_mask
                    try:
                        outputfilenameformat = f.outputfilenameformat
                    except:
                        inputdir = job.outputdir
                    else:
                        # WARNING: The following will work only if the MassStorageFile puts the files in local directories!
                        inputdir = '/'.join([getConfig('Output')['MassStorageFile']['uploadOptions']['path'],
                                             f.outputfilenameformat.replace('{fname}', '')])
                    unit.inputdata.get_dataset(inputdir, f.namePattern)
    else:
        unit = ND280Unit()
        unit.inputdata = ND280LocalDataset()

        for parent in parent_units:
            # unit needs to have completed and downloaded before we can get file list
            if parent.status != "completed":
                return None

            # we should be OK so copy all output to the dataset
            for f in parent.copy_output.files:
                unit.inputdata.names.append(os.path.join(parent.copy_output.local_location, f))

    return unit

def updateQuery(self, resubmit=False):
    """Update the dataset information of the transforms. This will include any new data in the processing or re-run jobs that have data which has been removed."""
    if len(self.queries) == 0:
        raise GangaException(None, 'Cannot call updateQuery() on an LHCbTransform without any queries')

    if self._getParent() is not None:
        logger.info('Retrieving latest bookkeeping information for transform %i:%i, please wait...' %
                    (self._getParent().id, self.getID()))
    else:
        logger.info('Retrieving latest bookkeeping information for transform, please wait...')

    # check we have an input DS per BK Query
    while len(self.queries) > len(self.inputdata):
        self.inputdata.append(LHCbDataset())

    # loop over the queries and fill the file lists
    for id, query in enumerate(self.queries):

        # Get the latest dataset
        latest_dataset = query.getDataset()

        # Compare to previous inputdata, get new and removed
        logger.info('Checking for new and removed data for query %d, please wait...' % self.queries.index(query))
        dead_data = LHCbDataset()
        new_data = LHCbDataset()

        # loop over the old data and compare
        new_data.files += latest_dataset.difference(self.inputdata[id]).files
        dead_data.files += self.inputdata[id].difference(latest_dataset).files

        # for dead data, find then kill/remove any associated jobs
        # loop over units and check any associated with this DS
        # TODO: Follow through chained tasks
        for unit in self.units:

            # associated unit
            if unit.input_datset_index != id:
                continue

            # find the job
            if len(unit.active_job_ids) == 0:
                continue

            # check the data
            for f in dead_data.files:
                if f in unit.inputdata.files:

                    # kill the job
                    job = GPI.jobs(unit.active_job_ids[0])
                    if job.status in ['submitted', 'running']:
                        job.kill()

                    # forget the job
                    unit.prev_job_ids.append(unit.active_job_ids[0])
                    unit.active_job_ids = []
                    break

        # in any case, now just set the DS files to the new set
        self.inputdata[id].files = []
        self.inputdata[id].files = latest_dataset.files

def initialize(self):
    from Ganga import GPI
    self.backend = stripProxy(GPI.Local())

def unregisterDataset(self):
    """Unregister the output datasets from the transform and task containers"""
    trf = self._getParent()
    trf_container = trf.getContainerName()
    fail = False
    try:
        containerinfo = {}
        dq2_lock.acquire()
        try:
            containerinfo = dq2.listDatasets(trf_container)
        except:
            containerinfo = {}

        if containerinfo != {}:
            job = GPI.jobs(self.active_job_ids[0])
            ds_list = self.getOutputDatasetList()
            for ds in ds_list:
                try:
                    dq2.deleteDatasetsFromContainer(trf_container, [ds])
                except DQContainerDoesNotHaveDataset:
                    pass
                except DQException as x:
                    # catch the DQ2-specific error first so the generic handler does not shadow it
                    logger.error('DQ2 Problem removing dataset %s from container %s: %s %s' %
                                 (job.outputdata.datasetname, trf_container, x.__class__, x))
                    fail = True
                except Exception as x:
                    logger.error('Problem removing dataset %s from container %s: %s %s' %
                                 (job.outputdata.datasetname, trf_container, x.__class__, x))
                    fail = True
    finally:
        dq2_lock.release()

    if fail:
        return not fail

    # remove the dataset from the task container
    task = trf._getParent()
    task_container = task.getContainerName()

    try:
        containerinfo = {}
        dq2_lock.acquire()
        try:
            containerinfo = dq2.listDatasets(task_container)
        except:
            containerinfo = {}

        if containerinfo != {}:
            job = GPI.jobs(self.active_job_ids[0])
            ds_list = self.getOutputDatasetList()
            for ds in ds_list:
                try:
                    dq2.deleteDatasetsFromContainer(task_container, [ds])
                except DQContainerDoesNotHaveDataset:
                    pass
                except DQException as x:
                    logger.error('DQ2 Problem removing dataset %s from container %s: %s %s' %
                                 (job.outputdata.datasetname, task_container, x.__class__, x))
                    fail = True
                except Exception as x:
                    logger.error('Problem removing dataset %s from container %s: %s %s' %
                                 (job.outputdata.datasetname, task_container, x.__class__, x))
                    fail = True
    finally:
        dq2_lock.release()

    return not fail
