def setUp(self):
    """Build the schema item dictionary and a v1.0 Schema over it."""
    # dict(...) keyword form yields exactly the same string keys as a
    # literal; every key here is a valid identifier.
    self.dd = dict(
        application=ComponentItem(category='applications'),
        backend=ComponentItem(category='backends'),
        name=SimpleItem('', comparable=0),
        workdir=SimpleItem(defvalue=None, type='string', transient=1,
                           protected=1, comparable=0),
        status=SimpleItem(defvalue='new', protected=1, comparable=0),
        id=SimpleItem(defvalue=None, typelist=[str], protected=1,
                      comparable=0),
        inputbox=FileItem(defvalue=[], sequence=1),
        outputbox=FileItem(defvalue=[], sequence=1),
        overriden_copyable=SimpleItem(defvalue=None, protected=1, copyable=1),
        plain_copyable=SimpleItem(defvalue=None, copyable=0),
    )
    # Schema under test, versioned 1.0 over the items above.
    self.s = Schema(Version(1, 0), self.dd)
class SampleGangaObject(GangaObject):
    """Minimal GangaObject used to exercise proxy stripping and wrapping."""

    _schema = Schema(Version(1, 0), {
        'a': SimpleItem(42, typelist=[int]),
        # 'b' is skipped on purpose
        'c': ComponentItem('gangafiles'),
    })
    _category = 'TestGangaObject'
    _name = 'TestGangaObject'
    _exportmethods = ['example', 'check_not_proxy']

    def example(self):
        """Exported method that returns a fixed marker string."""
        return 'example_string'

    def check_not_proxy(self, obj):
        """Verify *obj* arrived proxy-stripped; hand back a raw instance.

        The returned object must not be proxy-wrapped either — the GPI
        layer is expected to wrap it on the way out.
        """
        is_proxy = Ganga.GPIDev.Base.Proxy.isProxy
        assert not is_proxy(obj), 'incoming argument should be proxy-stripped'
        fresh = SampleGangaObject()
        assert not is_proxy(fresh), 'new object should not be proxy-wrapped'
        return fresh

    def not_proxied(self):
        """Non-exported counterpart of example() (not in _exportmethods)."""
        return 'example_string'
class TestGangaObject(GangaObject):
    """Test Ganga Object. Is used to construct test jobs."""

    _schema = Schema(
        Version(1, 0),
        {
            'id': SimpleItem('0', doc='ID Needed for tests'),
            'name': SimpleItem(
                '',
                doc='optional label which may be any combination of ASCII characters',
                typelist=['str']),
            'subjobs': ComponentItem(
                'internal', defvalue=[], sequence=1, protected=1,
                load_default=0, copyable=0, optional=1, doc='test subjobs'),
        })
    _name = "TestGangaObject"
    _category = "internal"

    def __init__(self, name='TestObjectName', sj=0):
        """Create the object and optionally populate *sj* subjobs.

        Each subjob is itself a TestGangaObject named '<name>.<index>'.
        """
        super(TestGangaObject, self).__init__()
        self.name = name
        for idx in range(sj):
            self.subjobs.append(TestGangaObject(name + "." + str(idx)))
class ThreadedTestGangaObject(GangaObject):
    """Hidden test GangaObject with a nested component, for threading tests."""

    # One plain integer item plus a component reference into the
    # 'TestGangaObject' category (defaulting to 'SimpleGangaObject').
    _schema = Schema(Version(1, 0), {
        'a': SimpleItem(42, typelist=[int]),
        'b': ComponentItem('TestGangaObject', defvalue='SimpleGangaObject'),
    })
    _category = 'TestGangaObject'
    _hidden = True          # not exposed through the GPI
    _enable_plugin = True   # but still registered with the plugin system
class SandboxFile(LocalFile):
    """Deprecated alias for LocalFile.

    Kept only for backward compatibility of stored jobs/configs; it behaves
    exactly like LocalFile apart from emitting a deprecation warning at
    construction time.
    """

    # Schema mirrors LocalFile's items (the 'doc' strings below are runtime
    # values shown to users and are deliberately left untouched, including
    # the historical 'wheather' typo).
    _schema = Schema(
        Version(1, 1), {
            'namePattern': SimpleItem(defvalue="",
                                      doc='pattern of the file name'),
            'localDir': SimpleItem(
                defvalue="",
                doc=
                'local dir where the file is stored, used from get and put methods'
            ),
            'subfiles': ComponentItem(category='gangafiles',
                                      defvalue=[],
                                      hidden=1,
                                      typelist=['Ganga.GPIDev.Lib.File.SandboxFile'],
                                      sequence=1,
                                      copyable=0,
                                      doc="collected files from the wildcard namePattern"),
            'compressed': SimpleItem(
                defvalue=False,
                typelist=['bool'],
                protected=0,
                doc=
                'wheather the output file should be compressed before sending somewhere'
            )
        })
    _category = 'gangafiles'
    _name = "SandboxFile"

    def __init__(self, namePattern='', localDir='', **kwds):
        """Construct a (deprecated) SandboxFile.

        namePattern is the name of the output file that is going to be
        processed in some way defined by the derived class; all arguments
        are forwarded to LocalFile.
        """
        # Warn on every construction so users migrate to LocalFile.
        logger.warning(
            "SandboxFile is now deprecated please change your configuration to use LocalFile instead!"
        )
        super(SandboxFile, self).__init__(namePattern, localDir, **kwds)
class ExeSplitter(ISplitter):

    """ Split executable applications (OBSOLETE).

    This splitter allows the creation of subjobs where each subjob has a
    different Executable application. This splitter is OBSOLETED use
    GenericSplitter or ArgSplitter instead.
    """
    _name = "ExeSplitter"
    _schema = Schema(Version(1, 0), {
        'apps': ComponentItem('applications', defvalue=[], sequence=1,
                              doc='a list of Executable app objects')
    })

    def split(self, job):
        """Return one subjob per Executable application in ``apps``."""
        result = []
        for app in self.apps:
            # Make a full copy of the master job for each application.
            sub = self.createSubjob(job)
            sub.application = app
            # Inherit the master job's executable when this app has none.
            if not app.exe:
                sub.application.exe = job.application.exe
            result.append(sub)
        return result
class LCGSEFile(IGangaFile):

    """LCGSEFile represents a class marking an output file to be written into LCG SE
    """

    _schema = Schema(Version(1, 1), {
        'namePattern': SimpleItem(defvalue="", doc='pattern of the file name'),
        'localDir': SimpleItem(
            defvalue="", copyable=1,
            doc='local dir where the file is stored, used from get and put methods'),
        'joboutputdir': SimpleItem(
            defvalue="",
            doc='outputdir of the job with which the outputsandbox file object is associated'),
        'se': SimpleItem(defvalue=getLCGConfig()['dest_SRM'], copyable=1,
                         doc='the LCG SE hostname'),
        'se_type': SimpleItem(defvalue='', copyable=1, doc='the LCG SE type'),
        'se_rpath': SimpleItem(
            defvalue='', copyable=1,
            doc='the relative path to the file from the VO directory on the SE'),
        'lfc_host': SimpleItem(defvalue=getLCGConfig()['LFC_HOST'], copyable=1,
                               doc='the LCG LFC hostname'),
        'srm_token': SimpleItem(
            defvalue='', copyable=1,
            doc='the SRM space token, meaningful only when se_type is set to srmv2'),
        'SURL': SimpleItem(defvalue='', copyable=1, doc='the LCG SE SURL'),
        'port': SimpleItem(defvalue='', copyable=1, doc='the LCG SE port'),
        'locations': SimpleItem(
            defvalue=[], copyable=1, typelist=[str], sequence=1,
            doc="list of locations where the outputfiles were uploaded"),
        'subfiles': ComponentItem(category='gangafiles', defvalue=[], hidden=1,
                                  sequence=1, copyable=0,
                                  doc="collected files from the wildcard namePattern"),
        'failureReason': SimpleItem(defvalue="", protected=1, copyable=0,
                                    doc='reason for the upload failure'),
        'compressed': SimpleItem(
            defvalue=False, typelist=[bool], protected=0,
            doc='wheather the output file should be compressed before sending somewhere')
    })
    _category = 'gangafiles'
    _name = "LCGSEFile"
    _exportmethods = ["location", "setLocation", "get", "put", "getUploadCmd"]

    def __init__(self, namePattern='', localDir='', **kwds):
        """ namePattern is the pattern of the output file that has to be written into LCG SE
        """
        super(LCGSEFile, self).__init__()
        self.namePattern = namePattern
        self.localDir = localDir
        self.locations = []
        # Grid shell used for all lcg-* command invocations.
        self.shell = GridShell.getShell()

    def __setattr__(self, attr, value):
        """Validate se_type before delegating to the base implementation.

        Raises AttributeError for any se_type outside the supported set.
        """
        if attr == 'se_type' and value not in ['', 'srmv1', 'srmv2', 'se']:
            raise AttributeError('invalid se_type: %s' % value)
        super(LCGSEFile, self).__setattr__(attr, value)

    def _on_attribute__set__(self, obj_type, attrib_name):
        # When this file object is copied into a Job's outputfiles, the
        # per-upload state must not be carried over.
        r = copy.deepcopy(self)
        if getName(obj_type) == 'Job' and attrib_name == 'outputfiles':
            r.locations = []
            r.localDir = ''
            r.failureReason = ''
        return r

    def __repr__(self):
        """Get the representation of the file."""
        return "LCGSEFile(namePattern='%s')" % self.namePattern

    def __get_unique_fname__(self):
        '''gets an unique filename'''
        import random
        import time
        # Random number + timestamp; dots replaced so the name is SE-safe.
        uuid = (str(random.uniform(0, 100000000)) +
                '-' + str(time.time())).replace('.', '-')
        user = getConfig('Configuration')['user']
        fname = 'user.%s.%s' % (user, uuid)
        return fname

    def setLocation(self):
        """
        Sets the location of output files that were uploaded to lcg storage element from the WN
        """
        job = self.getJobObject()

        postprocessLocationsPath = os.path.join(
            job.outputdir, getConfig('Output')['PostProcessLocationsFileName'])
        if not os.path.exists(postprocessLocationsPath):
            return

        def lcgse_line_processor(line, lcgse_file):
            # Line format (written by the WN script): 'lcgse <pattern> <name>[.gz] -> <guid>'
            guid = line[line.find('->') + 2:]
            pattern = line.split(' ')[1]
            # BUGFIX: was `line.split(' ')[2].strip('.gz')`, but str.strip
            # removes the *characters* '.', 'g', 'z' from both ends (e.g.
            # 'zoo.gz' -> 'oo').  Remove the '.gz' suffix explicitly instead.
            name = line.split(' ')[2]
            if name.endswith('.gz'):
                name = name[:-len('.gz')]
            if regex.search(lcgse_file.namePattern) is not None:
                # Wildcard pattern: record the concrete match as a subfile
                # and process the line again against it.
                d = LCGSEFile(namePattern=name)
                d.compressed = lcgse_file.compressed
                d.lfc_host = lcgse_file.lfc_host
                d.se = lcgse_file.se
                # todo copy also the other attributes
                lcgse_file.subfiles.append(GPIProxyObjectFactory(d))
                lcgse_line_processor(line, d)
            elif pattern == lcgse_file.namePattern:
                if guid.startswith('ERROR'):
                    logger.error("Failed to upload file to LCG SE")
                    logger.error(guid[6:])
                    lcgse_file.failureReason = guid[6:]
                    return
                lcgse_file.locations = guid

        # Use a context manager so the locations file is always closed
        # (the original left the handle open).
        with open(postprocessLocationsPath, 'r') as locations_file:
            for line in locations_file:
                if line.strip() == '':
                    continue
                if line.startswith('lcgse'):
                    lcgse_line_processor(line.strip(), self)

    def location(self):
        """
        Return list with the locations of the post processed files (if they were configured to upload the output somewhere)
        """
        return self.locations

    def getUploadCmd(self):
        """Build the lcg-cr command line used to upload this file.

        The literal token 'filename' in the -P option is substituted with
        the real file name by put().
        """
        vo = getConfig('LCG')['VirtualOrganisation']
        cmd = 'lcg-cr --vo %s ' % vo
        if self.se != '':
            cmd = cmd + ' -d %s' % self.se
        if self.se_type == 'srmv2' and self.srm_token != '':
            cmd = cmd + ' -D srmv2 -s %s' % self.srm_token
        # specify the physical location
        if self.se_rpath != '':
            cmd = cmd + \
                ' -P %s/ganga.%s/filename' % (self.se_rpath,
                                              self.__get_unique_fname__())
        return cmd

    def put(self):
        """
        Executes the internally created command for file upload to LCG SE, this method will be called on the client
        """
        import glob

        sourceDir = ''

        # if used as a stand alone object
        if self._getParent() is None:
            if self.localDir == '':
                logger.warning(
                    'localDir attribute is empty, don\'t know from which dir to take the file'
                )
                return
            else:
                sourceDir = self.localDir
        else:
            job = self.getJobObject()
            sourceDir = job.outputdir

        import os
        os.environ['LFC_HOST'] = self.lfc_host

        fileName = self.namePattern

        if self.compressed:
            fileName = '%s.gz' % self.namePattern

        if regex.search(fileName) is not None:
            # Wildcard pattern: upload every matching file, tracking each
            # one as a subfile.
            for currentFile in glob.glob(os.path.join(sourceDir, fileName)):
                cmd = self.getUploadCmd()
                cmd = cmd.replace('filename', currentFile)
                cmd = cmd + ' file:%s' % currentFile

                (exitcode, output, m) = self.shell.cmd1(cmd, capture_stderr=True)

                d = LCGSEFile(namePattern=os.path.basename(currentFile))
                d.compressed = self.compressed
                d.lfc_host = self.lfc_host
                d.se = self.se
                # todo copy also the other attributes

                if exitcode == 0:
                    match = re.search(r'(guid:\S+)', output)
                    if match:
                        d.locations = output.strip()
                    # Alex removed this as more general approach in job.py after put() is called
                    # remove file from output dir if this object is attached to a job
                    # if self._getParent() is not None:
                    #    os.system('rm %s' % os.path.join(sourceDir, currentFile))
                else:
                    d.failureReason = output
                    if self._getParent() is not None:
                        logger.error(
                            "Job %s failed. One of the job.outputfiles couldn't be uploaded because of %s"
                            % (str(self._getParent().fqid), self.failureReason))
                    else:
                        logger.error("The file can't be uploaded because of %s"
                                     % (self.failureReason))

                self.subfiles.append(GPIProxyObjectFactory(d))
        else:
            # Single file: upload it and record the location on self.
            logger.debug("sourceDir: %s" % sourceDir)
            logger.debug("fileName: %s" % fileName)
            currentFile = os.path.join(sourceDir, fileName)
            import os.path
            if os.path.isfile(currentFile):
                logger.debug("currentFile: %s exists!" % currentFile)
            else:
                logger.debug("currentFile: %s DOES NOT exist!" % currentFile)

            cmd = self.getUploadCmd()
            cmd = cmd.replace('filename', currentFile)
            cmd = cmd + ' file:%s' % currentFile
            logger.debug("cmd is: %s" % cmd)

            (exitcode, output, m) = self.shell.cmd1(cmd, capture_stderr=True)

            if exitcode == 0:
                match = re.search(r'(guid:\S+)', output)
                if match:
                    self.locations = output.strip()
                # Alex removed this as more general approach in job.py after put() is called
                # remove file from output dir if this object is attached to a job
                # if self._getParent() is not None:
                #    os.system('rm %s' % os.path.join(sourceDir, currentFile))
            else:
                self.failureReason = output
                if self._getParent() is not None:
                    logger.error(
                        "Job %s failed. One of the job.outputfiles couldn't be uploaded because of %s"
                        % (str(self._getParent().fqid), self.failureReason))
                else:
                    logger.error("The file can't be uploaded because of %s"
                                 % (self.failureReason))

    def getWNInjectedScript(self, outputFiles, indent, patternsToZip,
                            postProcessLocationsFP):
        """
        Returns script that have to be injected in the jobscript for postprocessing on the WN
        """
        lcgCommands = []
        for outputFile in outputFiles:
            lcgCommands.append('lcgse %s %s %s' % (outputFile.namePattern,
                                                   outputFile.lfc_host,
                                                   outputFile.getUploadCmd()))
            logger.debug("OutputFile (%s) cmd for WN script is: %s" %
                         (outputFile.namePattern, outputFile.getUploadCmd()))

        import inspect
        # Template shipped next to this module.
        script_location = os.path.join(
            os.path.dirname(
                os.path.abspath(inspect.getfile(inspect.currentframe()))),
            'scripts/LCGSEFileWNScript.py')

        from Ganga.GPIDev.Lib.File import FileUtils
        script = FileUtils.loadScript(script_location, '###INDENT###')

        script = script.replace('###LCGCOMMANDS###', str(lcgCommands))
        script = script.replace('###PATTERNSTOZIP###', str(patternsToZip))
        script = script.replace('###INDENT###', indent)
        script = script.replace('###POSTPROCESSLOCATIONSFP###',
                                postProcessLocationsFP)

        return script

    def get(self):
        """
        Retrieves locally all files matching this LCGSEFile object pattern
        """
        to_location = self.localDir

        if not os.path.isdir(self.localDir):
            if self._getParent() is not None:
                to_location = self.getJobObject().outputdir
            else:
                logger.info(
                    "%s is not a valid directory.... Please set the localDir attribute"
                    % self.localDir)
                return

        # set lfc host
        os.environ['LFC_HOST'] = self.lfc_host

        vo = getConfig('LCG')['VirtualOrganisation']

        for location in self.locations:
            destFileName = os.path.join(to_location, self.namePattern)
            cmd = 'lcg-cp --vo %s %s file:%s' % (vo, location, destFileName)
            (exitcode, output, m) = self.shell.cmd1(cmd, capture_stderr=True)

            if exitcode != 0:
                logger.error(
                    'command %s failed to execute , reason for failure is %s'
                    % (cmd, output))

    def getWNScriptDownloadCommand(self, indent):
        """Return the WN-side shell snippet that downloads this file."""
        script = """\n

###INDENT###os.environ['LFC_HOST'] = '###LFC_HOST###'
###INDENT###cwDir = os.getcwd()
###INDENT###dwnCmd = 'lcg-cp --vo ###VO### lfn:/grid/###VO###/###LOCATION###/###NAMEPATTERN### file:%s' % os.path.join(cwDir, '###NAMEPATTERN###')
###INDENT###os.system(dwnCmd)
"""
        script = script.replace('###INDENT###', indent)
        script = script.replace('###LFC_HOST###', self.lfc_host)
        script = script.replace('###VO###',
                                getConfig('LCG')['VirtualOrganisation'])
        script = script.replace('###LOCATION###', self.se_rpath)
        script = script.replace('###NAMEPATTERN###', self.namePattern)

        return script

    def processWildcardMatches(self):
        """Expand a wildcard namePattern into concrete subfiles via lcg-ls."""
        if self.subfiles:
            return self.subfiles

        from fnmatch import fnmatch

        if regex.search(self.namePattern):
            # TODO namePattern shouldn't contain slashes and se_rpath should
            # not contain wildcards
            exitcode, output, m = self.shell.cmd1(
                'lcg-ls lfn:/grid/' +
                getConfig('LCG')['VirtualOrganisation'] + '/' + self.se_rpath,
                capture_stderr=True)

            for filename in output.split('\n'):
                if fnmatch(filename, self.namePattern):
                    subfile = LCGSEFile(namePattern=filename)
                    subfile.se_rpath = self.se_rpath
                    subfile.lfc_host = self.lfc_host

                    self.subfiles.append(GPIProxyObjectFactory(subfile))
class BKQueryDict(GangaObject):

    """Class for handling LHCb bookkeeping queries using dictionaries.

    Use BKQuery if you do not know how to use BK dictionaries!

    Example Usage:

    bkqd = BKQueryDict()
    bkqd.dict['ConfigVersion'] = 'Collision09'
    bkqd.dict['ConfigName'] = 'LHCb'
    bkqd.dict['ProcessingPass'] = '******'
    bkqd.dict['EventType'] = '90000000'
    bkqd.dict['FileType'] = 'DST'
    bkqd.dict['DataTakingConditions'] = 'Beam450GeV-VeloOpen-MagDown'
    data = bkqd.getDataset()
    """

    # Template with every BK field set to its catch-all value; used as the
    # default for the 'dict' schema item.
    _bkQueryTemplate = {'SimulationConditions': 'All',
                        'DataTakingConditions': 'All',
                        'ProcessingPass': '******',
                        'FileType': 'All',
                        'EventType': 'All',
                        'ConfigName': 'All',
                        'ConfigVersion': 'All',
                        'ProductionID': 0,
                        'StartRun': 0,
                        'EndRun': 0,
                        'DataQuality': 'All'}

    schema = {}
    docstr = 'Dirac BK query dictionary.'
    schema['dict'] = SimpleItem(defvalue=_bkQueryTemplate,
                                # typelist=['dict'],
                                doc=docstr)
    schema['credential_requirements'] = ComponentItem('CredentialRequirement',
                                                      defvalue='DiracProxy')
    _schema = Schema(Version(1, 0), schema)
    _category = ''
    _name = "BKQueryDict"
    _exportmethods = ['getDataset', 'getDatasetMetadata']

    def __init__(self):
        super(BKQueryDict, self).__init__()

    @require_credential
    def getDatasetMetadata(self):
        '''Gets the dataset from the bookkeeping for current dict.

        Returns None when the dict is empty, otherwise a result dict of the
        form {'OK': bool, 'Value': metadata-dict}.
        '''
        if not self.dict:
            return None
        cmd = 'bkQueryDict(%s)' % self.dict
        try:
            value = get_result(
                cmd,
                'BK query error.',
                credential_requirements=self.credential_requirements)
        except GangaDiracError as err:
            # NOTE(review): err is caught but discarded — all Dirac failures
            # collapse to {'OK': False} with no reason reported.
            return {'OK': False, 'Value': {}}

        files = []
        if 'LFNs' in value:
            files = value['LFNs']
        metadata = {}
        # NOTE(review): type(...) is list rejects list subclasses; presumably
        # a non-list here is a dict keyed by 'LFNs' — confirm against
        # get_result's contract.
        if not type(files) is list:
            if 'LFNs' in files:  # i.e. a dict of LFN:Metadata
                metadata = files['LFNs'].copy()
        if metadata:
            return {'OK': True, 'Value': metadata}
        return {'OK': False, 'Value': metadata}

    @require_credential
    def getDataset(self):
        '''Gets the dataset from the bookkeeping for current dict.

        Returns None when the dict is empty, otherwise a proxied LHCbDataset
        of DiracFile objects (one per LFN returned by the query).
        '''
        if not self.dict:
            return None
        cmd = 'bkQueryDict(%s)' % self.dict
        value = get_result(
            cmd,
            'BK query error.',
            credential_requirements=self.credential_requirements)

        files = []
        if 'LFNs' in value:
            files = value['LFNs']
        if not type(files) is list:
            if 'LFNs' in files:  # i.e. a dict of LFN:Metadata
                files = files['LFNs'].keys()

        from GangaDirac.Lib.Files.DiracFile import DiracFile
        this_list = [DiracFile(lfn=f) for f in files]

        from GangaLHCb.Lib.LHCbDataset import LHCbDataset
        ds = LHCbDataset(files=this_list, fromRef=True)

        return addProxy(ds)
class BKQuery(GangaObject):

    '''Class for handling LHCb bookkeeping queries.

    Currently 4 types of queries are supported: Path, RunsByDate, Run and
    Production.  These correspond to the Dirac API methods
    DiracLHCb.bkQuery<type> (see Dirac docs for details).

    Path formats are as follows:

    type = "Path":
    /<ConfigurationName>/<Configuration Version>/\
<Sim or Data Taking Condition>/<Processing Pass>/<Event Type>/<File Type>

    type = "RunsByDate":
    /<ConfigurationName>/<Configuration Version>/<Processing Pass>/\
<Event Type>/<File Type>

    type = "Run":
    /<Run Number>/<Processing Pass>/<Event Type>/<File Type>
    - OR -
    /<Run Number 1>-<Run Number 2>/<Processing Pass>/<Event Type>/<File Type>

    type = "Production":
    /<ProductionID>/<Processing Pass>/<Event Type>/<File Type>

    Example Usage:

    bkq = BKQuery (
    dqflag = "All" ,
    path = "/LHCb/Collision09/Beam450GeV-VeloOpen-MagDown/Real Data/\
RecoToDST-07/90000000/DST" ,
    type = "Path"
    )

    bkq = BKQuery (
    startDate = "2010-05-18" ,
    selection = "Runs" ,
    endDate = "2010-05-20" ,
    dqflag = "All" ,
    path = "/LHCb/Collision10/Real Data/90000000/RAW" ,
    type = "RunsByDate"
    )

    bkq = BKQuery (
    dqflag = "All" ,
    path = "111183-126823/Real Data/Reco14/Stripping20/90000000/DIMUON.DST" ,
    type = "Run"
    )

    bkq = BKQuery (
    dqflag = "All" ,
    path = "/5842/Real Data/RecoToDST-07/90000000/DST" ,
    type = "Production"
    )

    then (for any type) one can get the data set by doing the following:

    data = bkq.getDataset()

    This will query the bookkeeping for the up-to-date version of the data.
    N.B. BKQuery objects can be stored in your Ganga box.
    '''

    schema = {}
    docstr = 'Bookkeeping query path (type dependent)'
    schema['path'] = SimpleItem(defvalue='', doc=docstr)
    docstr = 'Start date string yyyy-mm-dd (only works for type="RunsByDate")'
    schema['startDate'] = SimpleItem(defvalue='', doc=docstr)
    docstr = 'End date string yyyy-mm-dd (only works for type="RunsByDate")'
    schema['endDate'] = SimpleItem(defvalue='', doc=docstr)
    docstr = 'Data quality flag (string or list of strings).'
    schema['dqflag'] = SimpleItem(defvalue='OK', typelist=['str', 'list'],
                                  doc=docstr)
    docstr = 'Type of query (Path, RunsByDate, Run, Production)'
    schema['type'] = SimpleItem(defvalue='Path', doc=docstr)
    docstr = 'Selection criteria: Runs, ProcessedRuns, NotProcessed (only works for type="RunsByDate")'
    schema['selection'] = SimpleItem(defvalue='', doc=docstr)
    schema['credential_requirements'] = ComponentItem('CredentialRequirement',
                                                      defvalue='DiracProxy')
    _schema = Schema(Version(1, 2), schema)
    _category = 'query'
    _name = "BKQuery"
    _exportmethods = ['getDataset', 'getDatasetMetadata']

    def __init__(self, path=''):
        super(BKQuery, self).__init__()
        self.path = path

    def _check_query_args(self):
        """Validate type and the RunsByDate-only arguments; raise on misuse."""
        if self.type not in ['Path', 'RunsByDate', 'Run', 'Production']:
            raise GangaException('Type="%s" is not valid.' % self.type)
        # BUGFIX: this previously read `if not self.type is 'RunsByDate'`,
        # comparing a string by *identity*; it only worked thanks to CPython
        # string interning.  Use equality instead.
        if self.type != 'RunsByDate':
            if self.startDate:
                msg = 'startDate not supported for type="%s".' % self.type
                raise GangaException(msg)
            if self.endDate:
                msg = 'endDate not supported for type="%s".' % self.type
                raise GangaException(msg)
            if self.selection:
                msg = 'selection not supported for type="%s".' % self.type
                raise GangaException(msg)

    def _build_query_cmd(self):
        """Build the Dirac getDataset command string for the current state.

        The dqflag is embedded unquoted when it is a (Ganga)list so it is
        rendered as a Python list literal on the Dirac side.
        """
        cmd = "getDataset('%s','%s','%s','%s','%s','%s')" % (
            self.path, self.dqflag, self.type, self.startDate, self.endDate,
            self.selection)
        from Ganga.GPIDev.Lib.GangaList.GangaList import GangaList
        knownLists = [tuple, list, GangaList]
        if isType(self.dqflag, knownLists):
            cmd = "getDataset('%s',%s,'%s','%s','%s','%s')" % (
                self.path, self.dqflag, self.type, self.startDate,
                self.endDate, self.selection)
        return cmd

    @require_credential
    def getDatasetMetadata(self):
        '''Gets the dataset from the bookkeeping for current path, etc.

        Returns None when path is empty, otherwise a dict of the form
        {'OK': bool, 'Value': metadata}; on a Dirac error 'Value' carries
        the error string.
        '''
        if not self.path:
            return None
        self._check_query_args()
        cmd = self._build_query_cmd()
        try:
            value = get_result(
                cmd,
                'BK query error.',
                credential_requirements=self.credential_requirements)
        except GangaDiracError as err:
            return {'OK': False, 'Value': str(err)}

        files = []
        metadata = {}
        if 'LFNs' in value:
            files = value['LFNs']
        if not isinstance(files, list):  # i.e. a dict of LFN:Metadata
            metadata = files.copy()

        if metadata:
            return {'OK': True, 'Value': metadata}

        return {'OK': False, 'Value': metadata}

    @require_credential
    def getDataset(self):
        '''Gets the dataset from the bookkeeping for current path, etc.

        Returns None when path is empty, otherwise a proxied LHCbDataset of
        DiracFile objects.  Dirac errors propagate as exceptions.
        '''
        if not self.path:
            return None
        self._check_query_args()
        cmd = self._build_query_cmd()
        result = get_result(
            cmd,
            'BK query error.',
            credential_requirements=self.credential_requirements)
        logger.debug("Finished Running Command")

        files = []
        value = result
        if 'LFNs' in value:
            files = value['LFNs']
        if not isinstance(files, list):  # i.e. a dict of LFN:Metadata
            files = files.keys()

        logger.debug("Creating DiracFile objects")
        from GangaDirac.Lib.Files.DiracFile import DiracFile

        logger.debug("Creating new list")
        new_files = [DiracFile(lfn=f) for f in files]

        logger.info("Constructing LHCbDataset")
        from GangaLHCb.Lib.LHCbDataset import LHCbDataset
        logger.debug("Imported LHCbDataset")
        ds = LHCbDataset(files=new_files, fromRef=True)

        logger.debug("Returning Dataset")
        return addProxy(ds)
class ICredential(GangaObject):

    """
    Interface class for working with credentials
    """

    _schema = Schema(Version(1, 0), {
        "maxTry": SimpleItem(
            defvalue=1, typelist=[int],
            doc="Number of password attempts allowed when creating credential"),
        "minValidity": SimpleItem(defvalue="00:15", typelist=[str],
                                  doc="Default minimum validity"),
        "validityAtCreation": SimpleItem(
            defvalue="24:00", typelist=[str],
            doc="Default credential validity at creation"),
        "command": ComponentItem(
            category="credential_commands", defvalue="ICommandSet",
            doc="Set of commands to be used for credential-related operations")
    })
    _category = "credentials"
    _name = "ICredential"
    _hidden = 1
    _exportmethods = ["create", "destroy", "isAvailable", "isValid",
                      "location", "renew", "timeleft"]

    def __init__(self):
        super(ICredential, self).__init__()
        # Shell used to run the credential commands; inputPW_Widget is set
        # by the GUI layer when interactive password entry is available.
        self.shell = Shell()
        self.inputPW_Widget = None
        return

    def create(self, validity="", maxTry=0, minValidity="", check=False):
        """
        Create credential.

        Arguments other than self:
           validity    - Validity with which credential should be created,
                         specified as string of format "hh:mm"
                         [ Defaults to value of self.validityAtCreation ]
           maxTry      - Number of password attempts allowed
                         [ Defaults to value of self.maxTry ]
           minValidity - Minimum validity in case checking of pre-existing
                         credential is performed, specified as strong of
                         format "hh:mm"
                         [ Defaults to value of self.minValidity ]
           check       - Flag to request checking of pre-existing credential;
                         if flag is set to true, then new credential is
                         created only if the validity of any pre-existing
                         credential is less than the value of minValidity
                         [ Default: False ]

        Note: create is the same as renew, except for the default value of check

        Return value: True if credential is created successfully, and False
        otherwise.
        """
        global logTimeStamp

        # A dummy command set (no init command, or an init command without a
        # usable 'valid' parameter) cannot create anything.
        dummy = False
        if not self.command.init:
            dummy = True
        if "valid" in self.command.init_parameters:
            if not self.command.init_parameters["valid"]:
                dummy = True

        if dummy:
            logger.warning("Dummy CommandSet used - no credential created")
            return False

        # Fall back to the schema defaults for unspecified arguments.
        if not maxTry:
            maxTry = self.maxTry

        if not minValidity:
            minValidity = self.minValidity

        if not validity:
            validity = self.validityAtCreation

        validityInSeconds = self.timeInSeconds(validity)

        if not validityInSeconds:
            logger.warning("Problems with requested validity: %s" %
                           str(validity))
            return False
        if check and self.isValid(minValidity):
            return True

        ntry = 0

        while ntry < maxTry:

            ntry = ntry + 1

            # Test if GUI widget is to be used.
            if self.inputPW_Widget:
                # Since self.inputPW_Widget is called, current arguments are
                # ignored since renew() and create() in GUI mode will not be
                # called with any arguments.
                #proxy_obj = self._proxyObject
                ## This is removed to get rid of ref to _proxyObject
                proxy_obj = self
                if self.inputPW_Widget.ask(proxy_obj):
                    logger.dg(
                        "Proceeding to retrieve password from inputPW_Widget.")
                    __pw = self.inputPW_Widget.getPassword(proxy_obj)
                    if not __pw:
                        logger.warning("Password/passphrase expected!")
                        return False
                    try:
                        # Password is piped to the init command via a
                        # temporary file so it never appears in the command
                        # line.
                        tFile = tempfile.NamedTemporaryFile()
                        tFile.write(__pw)
                        tFile.flush()
                    except:
                        # NOTE(review): bare except silently covers any
                        # failure here, not just I/O errors.
                        del __pw
                        logger.warning(
                            "Could not create secure temporary file for password!"
                        )
                        return False
                    del __pw
                else:
                    # Current credential modification denied for various reasons.
                    # see GangaGUI.customDialogs.ask() method for more details.
                    return False

                # self.inputPW_Widget.ask() may have modified parameters.
                # Calling buildOpts() to take them into account.
                self.buildOpts(self.command.init, False)

                # Create initialisation list with the 'pipe' parameter
                initList = [
                    self.command.init, self.command.init_parameters["pipe"]
                ]
                # Append option value pairs
                for optName, optVal in self.command.currentOpts.iteritems():
                    initList.append("%s %s" % (optName, optVal))
                status = self.shell.system("cat %s|%s" %
                                           (tFile.name, " ".join(initList)))
                tFile.close()
                # self.inputPW_Widget dialog postprocessing.
                # E.g. disable autorenew mechanism if status != 0.
                self.inputPW_Widget.renewalStatus(proxy_obj, status)
                if status == 0:
                    logger.info("%s creation/renewal successful." % self._name)
                    return True
                else:
                    logger.warning("%s creation/renewal failed [%s]." %
                                   (self._name, status))
                    return False
            else:
                # Non-GUI credential renewal/creation
                # Check if renewal is from main process (i.e. by bootstrap or
                # user)
                if threading.currentThread().getName() == 'MainThread' or\
                   threading.currentThread().getName().startswith('GANGA_Update_Thread_Ganga_Worker_'):
                    if "valid" in self.command.init_parameters:
                        self.command.currentOpts[
                            self.command.init_parameters['valid']] = validity
                    initList = [self.command.init]
                    # Append option value pairs
                    for optName, optVal in self.command.currentOpts.iteritems(
                    ):
                        initList.append("%s %s" % (optName, optVal))
                    status = self.shell.system(" ".join(initList))
                    if status == 0:
                        logger.info("%s creation/renewal successful." %
                                    self._name)
                        return True
                    else:
                        logger.warning("%s creation/renewal failed [%s]." %
                                       (self._name, status))
                # create initiated from worker thread from monitoring
                # component.
                else:
                    currTime = time.time()
                    # Rate-limit the warning using the module-level
                    # logTimeStamp / logRepeatDuration pair.
                    if currTime - logTimeStamp >= logRepeatDuration:
                        logTimeStamp = currTime

                        # Check validity but print logging messages this time
                        self.isValid("", True)
                        _credentialObject = self._name[0].lower(
                        ) + self._name[1:]
                        logger.warning(
                            "Renew by typing '%s.renew()' at the prompt."
                            % (_credentialObject))

                        # notify the Core that the credential is not valid
                        _validity = self.timeInSeconds(self.timeleft())
                        _minValidity = self.timeInSeconds(minValidity) / 2.
                        if _validity <= max(120, _minValidity):
                            Coordinator.notifyInvalidCredential(self)

                    return True

        logger.warning("%s creation/renewal attempts exceeded %s tries!" %
                       (self._name, maxTry))
        return False

    def destroy(self, allowed_exit=[0]):
        """
        Destroy credential

        Argument other than self:
           allowed_exit - List of exit codes accepted without error
                          when issuing system command for destroying
                          credential

        Return value: False if command for destroying credential is undefined,
                      or True otherwise
        """
        # NOTE(review): mutable default argument; harmless here since the
        # list is never mutated, but a tuple/None default would be safer.

        if not self.command.destroy:
            logger.warning("Dummy CommandSet used - no credential created")
            return False

        destroyList = [self.command.destroy]
        for optName, optVal in self.command.destroyOpts.iteritems():
            destroyList.append("%s %s" % (optName, optVal))

        Coordinator.notifyInvalidCredential(self)

        status, output, message = \
            self.shell.cmd1(" ".join(destroyList), allowed_exit)
        # Remove any leftover credential file on disk as well.
        proxyPath = self.location()
        if proxyPath:
            os.remove(proxyPath)
        return True

    def isAvailable(self):
        """
        Check whether credential is available with system/configuration used

        No arguments other than self

        Return value: True if credential is available, false otherwise
        """
        # Base-class stub: subclasses are expected to override.
        logger.warning("Dummy method used - this always returns True")
        return True

    def isValid(self, validity="", log=False, force_check=False):
        """
        Check validity

        Arguments other than self:
           validity    - Minimum time for which credential should be valid,
                         specified as string of format "hh:mm"
                         [ Defaults to valud of self.minValidity ]
           log         - Print logger messages if credential not valid
           force_check - Force credential check, rather than relying on cache

        Return value: True if credential is valid for required time, False
        otherwise.
        """

        valid = True

        if not validity or validity is None:
            validity = self.minValidity
        validityInSeconds = self.timeInSeconds(validity)

        timeleft = self.timeleft(force_check=force_check)

        if not timeleft:
            valid = False
        else:
            timeleftInSeconds = self.timeInSeconds(timeleft)
            if timeleftInSeconds <= validityInSeconds:
                valid = False

        if not valid and log:
            _tl = self.timeleft(force_check=force_check)
            if _tl == "-1" or _tl == "0:00:00":
                _expiryStatement = "has expired!"
            else:
                _expiryStatement = "will expire in %s!" % _tl

            # Split the CamelCase class name into lower-case words for the
            # warning message (e.g. 'GridProxy' -> 'grid proxy').
            itemList = []
            text = self._name[0]
            for i in range(len(self._name) - 1):
                character = self._name[i + 1]
                if character.isupper():
                    itemList.append(text)
                    text = character.lower()
                else:
                    text = "".join([text, character])
            itemList.append(text)
            _credentialName = " ".join(itemList)

            logger.warning("%s %s" % (_credentialName, _expiryStatement))

        return valid

    def location(self):
        """
        Determine credential location

        No arguments other than self

        Return value: Path to credential if found, or empty string otherwise
        """
        # Base-class stub: subclasses are expected to override.
        return ""

    def renew(self, validity="", maxTry=0, minValidity="", check=True):
        """
        Renew credential.

        Arguments other than self:
           validity    - Validity with which credential should be created,
                         specified as string of format "hh:mm"
                         [ Defaults to value of self.validityAtCreation ]
           maxTry      - Number of password attempts allowed
                         [ Defaults to value of self.maxTry ]
           minValidity - Minimum validity in case checking of pre-existing
                         credential is performed, specified as strong of
                         format "hh:mm"
                         [ Defaults to value of self.minValidity ]
           check       - Flag to request checking of pre-existing credential;
                         if flag is set to true, then new credential is
                         created only if the validity of any pre-existing
                         credential is less than the value of minValidity
                         [ Default: True ]

        Note: renew is the same as create, except for the default value of check

        Return value: True if new credential is created successfully, and False
        otherwise.
        """
        # Delegates entirely to create(); only the 'check' default differs.
        status = self.create(validity, maxTry, minValidity, check)

        return status

    def timeInSeconds(self, timeString=""):
        """
        Convert time string to time in seconds

        Arguments other than self:
           timeString - Time specified as string of format "hh:mm:ss"

        Return value: Time in seconds (integer)
        """

        totalTime = 0
        timeList = timeString.split(":")
        if len(timeList) >= 1:
            totalTime = totalTime + int(timeList[0]) * 60 * 60
        if len(timeList) >= 2:
            totalTime = totalTime + int(timeList[1]) * 60
        if len(timeList) >= 3:
            totalTime = totalTime + int(timeList[2])

        return totalTime

    def timeleft(self, units="hh:mm:ss", force_check=False):
        """
        Check time for which credential is valid.

        Arguments other than self:
           units       - String specifying units in which time is returned
           force_check - Force credential check, rather than relying on cache

        Allowed values for units are:
           "hours"              - time returned as in hours
           "minutes"            - time returned in minutes
           "seconds"            - time returned in seconds
           "hh:mm:ss" [default] - time returned as hours, minutes seconds

        Return value: Credential validity as string giving time in requested
           units, or empty string if command for querying
           credential validity is unavailable
        """

        timeRemaining = self.timeleftInHMS(force_check=force_check)
        # "" means no query command; "-1" means the credential has expired —
        # both are returned unchanged.
        if timeRemaining not in ["", "-1"]:
            if units in ["hours", "minutes", "seconds"]:
                timeleftInSeconds = self.timeInSeconds(timeRemaining)
                if "seconds" == units:
                    timeRemaining = "%.2f" % (timeleftInSeconds)
                elif "minutes" == units:
                    timeRemaining = "%.2f" % (timeleftInSeconds / 60.)
                elif "hours" == units:
                    timeRemaining = "%.2f" % (timeleftInSeconds / (60. * 60.))

        return timeRemaining

    def timeleftInHMS(self, force_check=False):
        """
        Determine remaining validity of credential in hours, minutes and seconds

        Argument other than self:
           force_check - Force credential check, rather than relying on cache

        Return value: String giving credential validity, or empty string
           if command for querying credential validity is unavailable
        """
        # Base-class stub: subclasses are expected to override.
        logger.warning("Dummy method used - no information returned")
        return ""
class Condor(IBackend):
    """Condor backend - submit jobs to a Condor pool.

    For more options see help on CondorRequirements.
    """

    _schema = Schema(Version(1, 0), {
        "requirements": ComponentItem(category="condor_requirements",
                                      defvalue="CondorRequirements",
                                      doc="Requirements for selecting execution host"),
        "env": SimpleItem(defvalue={},
                          doc='Environment settings for execution host'),
        "getenv": SimpleItem(defvalue="False",
                             doc='Flag to pass current envrionment to execution host'),
        "rank": SimpleItem(defvalue="Memory",
                           doc="Ranking scheme to be used when selecting execution host"),
        "submit_options": SimpleItem(defvalue=[], typelist=["str"], sequence=1,
                                     doc="Options passed to Condor at submission time"),
        "id": SimpleItem(defvalue="", protected=1, copyable=0,
                         doc="Condor jobid"),
        "status": SimpleItem(defvalue="", protected=1, copyable=0,
                             doc="Condor status"),
        "cputime": SimpleItem(defvalue="", protected=1, copyable=0,
                              doc="CPU time used by job"),
        "actualCE": SimpleItem(defvalue="", protected=1, copyable=0,
                               doc="Machine where job has been submitted"),
        "shared_filesystem": SimpleItem(defvalue=True,
                                        doc="Flag indicating if Condor nodes have shared filesystem"),
        "universe": SimpleItem(defvalue="vanilla",
                               doc="Type of execution environment to be used by Condor"),
        "globusscheduler": SimpleItem(defvalue="",
                                      doc="Globus scheduler to be used (required for Condor-G submission)"),
        "globus_rsl": SimpleItem(defvalue="",
                                 doc="Globus RSL settings (for Condor-G submission)"),
    })

    _category = "backends"
    _name = "Condor"

    # Mapping of numeric Condor JobStatus values to human-readable names.
    statusDict = \
        {
            "0": "Unexpanded",
            "1": "Idle",
            "2": "Running",
            "3": "Removed",
            "4": "Completed",
            "5": "Held"
        }

    def __init__(self):
        super(Condor, self).__init__()

    def submit(self, jobconfig, master_input_sandbox):
        """Submit job to backend.

        Return value: True if job is submitted successfully,
                      or False otherwise"""
        cdfpath = self.preparejob(jobconfig, master_input_sandbox)
        status = self.submit_cdf(cdfpath)
        return status

    def submit_cdf(self, cdfpath=""):
        """Submit Condor Description File.

        Argument other than self:
           cdfpath - path to Condor Description File to be submitted

        Return value: True if job is submitted successfully,
                      or False otherwise"""
        commandList = ["condor_submit -v"]
        commandList.extend(self.submit_options)
        commandList.append(cdfpath)
        commandString = " ".join(commandList)

        status, output = commands.getstatusoutput(commandString)

        self.id = ""
        if 0 != status:
            logger.error(
                "Tried submitting job with command: '%s'" % commandString)
            logger.error("Return code: %s" % str(status))
            logger.error("Condor output:")
            logger.error(output)
        else:
            # Parse the verbose condor_submit output for the local job id,
            # then try to resolve it to a GlobalJobId via condor_q.
            tmpList = output.split("\n")
            for item in tmpList:
                # str.find returns -1 when absent, so `1 + find(...)` is
                # falsy exactly when "** Proc" does not occur in the line.
                if 1 + item.find("** Proc"):
                    localId = item.strip(":").split()[2]
                    queryCommand = " ".join\
                        (["condor_q -format \"%s\" GlobalJobId", localId])
                    qstatus, qoutput = commands.getstatusoutput(queryCommand)
                    # BUGFIX: previously tested `status` (the condor_submit
                    # return code, always 0 in this branch) instead of
                    # `qstatus`, so condor_q failures were never detected
                    # and garbage output could be stored as the job id.
                    if 0 != qstatus:
                        logger.warning(
                            "Problem determining global id for Condor job '%s'" % localId)
                        self.id = localId
                    else:
                        self.id = qoutput
                    break

        # BUGFIX: was `not self.id is ""` — an identity comparison against
        # a string literal that only happened to work via CPython interning.
        return self.id != ""

    def resubmit(self):
        """Resubmit job that has already been configured.

        Return value: True if job is resubmitted successfully,
                      or False otherwise"""
        job = self.getJobObject()
        inpDir = job.getInputWorkspace().getPath()
        outDir = job.getOutputWorkspace().getPath()

        # Delete any existing output files, and recreate output directory
        if os.path.isdir(outDir):
            shutil.rmtree(outDir)
        if os.path.exists(outDir):
            os.remove(outDir)
        os.mkdir(outDir)

        # Determine path to job's Condor Description File
        cdfpath = os.path.join(inpDir, "__cdf__")

        # Resubmit job
        if os.path.exists(cdfpath):
            status = self.submit_cdf(cdfpath)
        else:
            logger.warning(
                "No Condor Description File for job '%s' found in '%s'"
                % (str(job.id), inpDir))
            logger.warning("Resubmission failed")
            status = False

        return status

    def kill(self):
        """Kill running job.

        No arguments other than self

        Return value: True if job killed successfully,
                      or False otherwise"""
        job = self.getJobObject()

        if not self.id:
            logger.warning("Job %s not running" % job.id)
            return False

        # A GlobalJobId has the form "schedd#cluster.proc#timestamp"; pick
        # the element that contains the "cluster.proc" local id.
        idElementList = job.backend.id.split("#")
        if 3 == len(idElementList):
            if idElementList[1].find(".") != -1:
                killCommand = "condor_rm -name %s %s" % \
                    (idElementList[0], idElementList[1])
            else:
                killCommand = "condor_rm -name %s %s" % \
                    (idElementList[0], idElementList[2])
        else:
            killCommand = "condor_rm %s" % (idElementList[0])

        status, output = commands.getstatusoutput(killCommand)

        if (status != 0):
            logger.warning(
                "Return code '%s' killing job '%s' - Condor id '%s'"
                % (str(status), job.id, job.backend.id))
            logger.warning("Tried command: '%s'" % killCommand)
            logger.warning("Command output: '%s'" % output)
            logger.warning("Anyway continuing with job removal")

        # Deliberately report success even if condor_rm failed, so that
        # Ganga-side removal of the job can proceed.
        job.backend.status = "Removed"
        killStatus = True

        return killStatus

    def preparejob(self, jobconfig, master_input_sandbox):
        """Prepare Condor description file"""
        job = self.getJobObject()
        inbox = job.createPackedInputSandbox(jobconfig.getSandboxFiles())
        inpDir = job.getInputWorkspace().getPath()
        outDir = job.getOutputWorkspace().getPath()

        infileList = []

        exeString = jobconfig.getExeString().strip()
        quotedArgList = []
        for arg in jobconfig.getArgStrings():
            quotedArgList.append("\\'%s\\'" % arg)
        exeCmdString = " ".join([exeString] + quotedArgList)

        for filePath in inbox:
            if not filePath in infileList:
                infileList.append(filePath)

        for filePath in master_input_sandbox:
            if not filePath in infileList:
                infileList.append(filePath)

        fileList = []
        for filePath in infileList:
            fileList.append(os.path.basename(filePath))

        # Build a filesystem-safe wrapper name from the job name (or the
        # application name when the job is unnamed).
        if job.name:
            name = job.name
        else:
            name = job.application._name
        name = "_".join(name.split())
        wrapperName = "_".join(["Ganga", str(job.id), name])

        # The wrapper is a generated Python 2 script (note the octal 0755
        # literal inside the string) that unpacks the sandbox, writes a bash
        # wrapper around the executable, runs it and reports timings.
        commandList = [
            "#!/usr/bin/env python",
            "from __future__ import print_function",
            "# Condor job wrapper created by Ganga",
            "# %s" % (time.strftime("%c")),
            "",
            inspect.getsource(Sandbox.WNSandbox),
            "",
            "import os",
            "import time",
            "",
            "startTime = time.strftime"
            + "( '%a %d %b %H:%M:%S %Y', time.gmtime( time.time() ) )",
            "",
            "for inFile in %s:" % str(fileList),
            "   getPackedInputSandbox( inFile )",
            "",
            "exePath = '%s'" % exeString,
            "if os.path.isfile( '%s' ):" % os.path.basename(exeString),
            "   os.chmod( '%s', 0755 )" % os.path.basename(exeString),
            "wrapperName = '%s_bash_wrapper.sh'" % wrapperName,
            "wrapperFile = open( wrapperName, 'w' )",
            "wrapperFile.write( '#!/bin/bash\\n' )",
            "wrapperFile.write( 'echo \"\"\\n' )",
            "wrapperFile.write( 'echo \"Hostname: $(hostname -f)\"\\n' )",
            "wrapperFile.write( 'echo \"\\${BASH_ENV}: ${BASH_ENV}\"\\n' )",
            "wrapperFile.write( 'if ! [ -z \"${BASH_ENV}\" ]; then\\n' )",
            "wrapperFile.write( '  if ! [ -f \"${BASH_ENV}\" ]; then\\n' )",
            "wrapperFile.write( '    echo \"*** Warning: "
            + "\\${BASH_ENV} file not found ***\"\\n' )",
            "wrapperFile.write( '  fi\\n' )",
            "wrapperFile.write( 'fi\\n' )",
            "wrapperFile.write( 'echo \"\"\\n' )",
            "wrapperFile.write( '%s\\n' )" % exeCmdString,
            "wrapperFile.write( 'exit ${?}\\n' )",
            "wrapperFile.close()",
            "os.chmod( wrapperName, 0755 )",
            "result = os.system( './%s' % wrapperName )",
            "os.remove( wrapperName )",
            "",
            "endTime = time.strftime"
            + "( '%a %d %b %H:%M:%S %Y', time.gmtime( time.time() ) )",
            "print('\\nJob start: ' + startTime)",
            "print('Job end:   ' + endTime)",
            "print('Exit code: %s' % str( result ))"
        ]

        commandString = "\n".join(commandList)
        wrapper = job.getInputWorkspace().writefile\
            (FileBuffer(wrapperName, commandString), executable=1)

        infileString = ",".join(infileList)
        outfileString = ",".join(jobconfig.outputbox)

        cdfDict = \
            {
                'universe': self.universe,
                'on_exit_remove': 'True',
                'should_transfer_files': 'YES',
                'when_to_transfer_output': 'ON_EXIT_OR_EVICT',
                'executable': wrapper,
                'transfer_executable': 'True',
                'notification': 'Never',
                'rank': self.rank,
                'initialdir': outDir,
                'error': 'stderr',
                'output': 'stdout',
                'log': 'condorLog',
                'stream_output': 'false',
                'stream_error': 'false',
                'getenv': self.getenv
            }

        # Merge backend-level and jobconfig-level environment settings into
        # a single ';'-separated string; non-string values are stringified
        # and string values have environment variables expanded.
        envList = []
        if self.env:
            for key in self.env.keys():
                value = self.env[key]
                if (isinstance(value, str)):
                    value = os.path.expandvars(value)
                else:
                    value = str(value)
                envList.append("=".join([key, value]))
        if jobconfig.env:
            for key in jobconfig.env.keys():
                value = jobconfig.env[key]
                if (isinstance(value, str)):
                    value = os.path.expandvars(value)
                else:
                    value = str(value)
                envList.append("=".join([key, value]))
        envString = ";".join(envList)

        if envString:
            cdfDict['environment'] = envString
        if infileString:
            cdfDict['transfer_input_files'] = infileString
        if self.globusscheduler:
            cdfDict['globusscheduler'] = self.globusscheduler
        if self.globus_rsl:
            cdfDict['globus_rsl'] = self.globus_rsl
        if outfileString:
            cdfDict['transfer_output_files'] = outfileString

        cdfList = [
            "# Condor Description File created by Ganga",
            "# %s" % (time.strftime("%c")),
            ""]
        for key, value in cdfDict.iteritems():
            cdfList.append("%s = %s" % (key, value))
        cdfList.append(self.requirements.convert())
        cdfList.append("queue")
        cdfString = "\n".join(cdfList)

        return job.getInputWorkspace().writefile\
            (FileBuffer("__cdf__", cdfString))

    def updateMonitoringInformation(jobs):
        """Poll condor_q for all monitored jobs and update their status.

        Static method (bound via the staticmethod() assignment below).
        """
        jobDict = {}
        for job in jobs:
            if job.backend.id:
                jobDict[job.backend.id] = job

        idList = jobDict.keys()
        if not idList:
            return

        queryCommand = " ".join\
            ([
                "condor_q -global" if getConfig(
                    "Condor")["query_global_queues"] else "condor_q",
                "-format \"%s \" GlobalJobId",
                "-format \"%s \" RemoteHost",
                "-format \"%d \" JobStatus",
                "-format \"%f\\n\" RemoteUserCpu"
            ])
        status, output = commands.getstatusoutput(queryCommand)
        if 0 != status:
            logger.error("Problem retrieving status for Condor jobs")
            return

        if ("All queues are empty" == output):
            infoList = []
        else:
            infoList = output.split("\n")

        # Index the condor_q output by GlobalJobId; a 3-field line has no
        # RemoteHost (e.g. idle jobs), a 4-field line includes it.
        allDict = {}
        for infoString in infoList:
            tmpList = infoString.split()
            id, host, status, cputime = ("", "", "", "")
            if 3 == len(tmpList):
                id, status, cputime = tmpList
            if 4 == len(tmpList):
                id, host, status, cputime = tmpList
            if id:
                allDict[id] = {}
                allDict[id]["status"] = Condor.statusDict[status]
                allDict[id]["cputime"] = cputime
                allDict[id]["host"] = host

        fg = Foreground()
        fx = Effects()
        status_colours = {'submitted': fg.orange,
                          'running': fg.green,
                          'completed': fg.blue}

        for id in idList:

            printStatus = False
            if jobDict[id].status == "killed":
                continue

            localId = id.split("#")[-1]
            globalId = id

            # Stored id was a bare local id: resolve it to a GlobalJobId.
            if globalId == localId:
                queryCommand = " ".join\
                    ([
                        "condor_q -global" if getConfig(
                            "Condor")["query_global_queues"] else "condor_q",
                        "-format \"%s\" GlobalJobId",
                        id
                    ])
                status, output = commands.getstatusoutput(queryCommand)
                if 0 == status:
                    globalId = output

            if globalId in allDict.keys():
                status = allDict[globalId]["status"]
                host = allDict[globalId]["host"]
                cputime = allDict[globalId]["cputime"]
                if status != jobDict[id].backend.status:
                    printStatus = True
                    stripProxy(jobDict[id])._getWriteAccess()
                    jobDict[id].backend.status = status
                    if jobDict[id].backend.status == "Running":
                        jobDict[id].updateStatus("running")

                if host:
                    if jobDict[id].backend.actualCE != host:
                        jobDict[id].backend.actualCE = host
                jobDict[id].backend.cputime = cputime
            else:
                # Job no longer in the queue: decide completed/failed from
                # the Condor log and the exit code on the last stdout line.
                jobDict[id].backend.status = ""
                outDir = jobDict[id].getOutputWorkspace().getPath()
                # NOTE(review): assumes getPath() returns a path with a
                # trailing separator, otherwise this concatenation is wrong
                # — confirm against the workspace implementation.
                condorLogPath = "".join([outDir, "condorLog"])
                checkExit = True
                if os.path.isfile(condorLogPath):
                    checkExit = False
                    for line in open(condorLogPath):
                        if -1 != line.find("terminated"):
                            checkExit = True
                            break
                        if -1 != line.find("aborted"):
                            checkExit = True
                            break

                if checkExit:
                    printStatus = True
                    stdoutPath = "".join([outDir, "stdout"])
                    jobStatus = "failed"
                    if os.path.isfile(stdoutPath):
                        with open(stdoutPath) as stdout:
                            lineList = stdout.readlines()
                        try:
                            exitLine = lineList[-1]
                            exitCode = exitLine.strip().split()[-1]
                        except IndexError:
                            # BUGFIX: exitCode was previously set to the
                            # integer -1, so exitCode.isdigit() raised
                            # AttributeError, and exitLine was left unbound
                            # for the error message below.
                            exitLine = ""
                            exitCode = "-1"
                        if exitCode.isdigit():
                            jobStatus = "completed"
                        else:
                            logger.error(
                                "Problem extracting exit code from job %s. Line found was '%s'." % (
                                    jobDict[id].fqid, exitLine))
                    jobDict[id].updateStatus(jobStatus)

            if printStatus:
                if jobDict[id].backend.actualCE:
                    hostInfo = jobDict[id].backend.actualCE
                else:
                    hostInfo = "Condor"

                status = jobDict[id].status
                if status in status_colours:
                    colour = status_colours[status]
                else:
                    colour = fg.magenta

                if "submitted" == status:
                    preposition = "to"
                else:
                    preposition = "on"

                if jobDict[id].backend.status:
                    backendStatus = "".join\
                        ([" (", jobDict[id].backend.status, ") "])
                else:
                    backendStatus = ""

                logger.info(colour + 'Job %s %s%s %s %s - %s' + fx.normal,
                            jobDict[id].fqid, status, backendStatus,
                            preposition, hostInfo, time.strftime('%c'))

        return None

    updateMonitoringInformation = \
        staticmethod(updateMonitoringInformation)
class LocalFile(IGangaFile):
    """LocalFile represents base class for output files, such as MassStorageFile, LCGSEFile, etc """

    _schema = Schema(
        Version(1, 1),
        {
            'namePattern': SimpleItem(defvalue="",
                                      doc='pattern of the file name'),
            'localDir': SimpleItem(
                defvalue="",
                doc=
                'local dir where the file is stored, used from get and put methods'
            ),
            'subfiles': ComponentItem(category='gangafiles',
                                      defvalue=[],
                                      hidden=1,
                                      sequence=1,
                                      copyable=0,
                                      doc="collected files from the wildcard namePattern"),
            'compressed': SimpleItem(
                defvalue=False,
                typelist=[bool],
                protected=0,
                doc=
                'wheather the output file should be compressed before sending somewhere'
            ),
        })
    _category = 'gangafiles'
    _name = "LocalFile"
    _exportmethods = ["location", "remove", "accessURL"]

    def __init__(self, namePattern='', localDir='', **kwds):
        """
        Construct a LocalFile from a name pattern, a File or a FileBuffer.

        namePattern may be:
          - str: used directly as the pattern (localDir honoured if given)
          - File: split into basename (pattern) and dirname (localDir)
          - FileBuffer: materialised via create(), then split as for File
        Any other type is rejected with an error message.
        """
        super(LocalFile, self).__init__()
        self.tmp_pwd = None

        if isinstance(namePattern, str):
            self.namePattern = namePattern
            if localDir:
                self.localDir = localDir
        elif isinstance(namePattern, File):
            self.namePattern = path.basename(namePattern.name)
            self.localDir = path.dirname(namePattern.name)
        elif isinstance(namePattern, FileBuffer):
            # A FileBuffer is written to disk first so it behaves like a File.
            namePattern.create()
            self.namePattern = path.basename(namePattern.name)
            self.localDir = path.dirname(namePattern.name)
        else:
            logger.error(
                "Unkown type: %s . Cannot Create LocalFile from this!" %
                type(namePattern))

    def __setattr__(self, attr, value):
        """
        This is an overloaded setter method to make sure that we're auto-expanding the filenames of files which exist.
        In the case we're assigning any other attributes the value is simply passed through
        Args:
            attr (str): This is the name of the attribute which we're assigning
            value (unknown): This is the value being assigned.
        """
        actual_value = value
        if attr == 'namePattern':
            # A pattern containing a path separator is split: directory part
            # goes to localDir, the stored pattern is just the basename.
            if len(value.split(os.sep)) > 1:
                this_dir = path.dirname(value)
                if this_dir:
                    self.localDir = this_dir
                elif path.isfile(path.join(os.getcwd(), path.basename(value))):
                    # Bare filename that exists in the CWD: remember the CWD.
                    self.localDir = os.getcwd()
            actual_value = path.basename(value)
        elif attr == 'localDir':
            if value:
                # Expand ~ and $VARS; only adopt the absolute form when the
                # resulting directory actually exists.
                new_value = path.abspath(expandfilename(value))
                if path.exists(new_value):
                    actual_value = new_value

        super(LocalFile, self).__setattr__(attr, actual_value)

    def __repr__(self):
        """Get the representation of the file."""
        return "LocalFile(namePattern='%s', localDir='%s')" % (
            self.namePattern, self.localDir)

    def location(self):
        # Exported alias for getFilenameList().
        return self.getFilenameList()

    def accessURL(self):
        # Return file:// URLs for every matched file.
        URLs = []
        for file in self.location():
            URLs.append('file://' + path.join(os.sep, file))
        return URLs

    def setLocation(self):
        """This collects the subfiles for wildcarded output LocalFile"""
        import glob

        fileName = self.namePattern
        if self.compressed:
            fileName = '%s.gz' % self.namePattern

        sourceDir = self.getJobObject().outputdir
        if self.localDir:
            # NOTE(review): if localDir is absolute, path.join below ignores
            # sourceDir entirely — confirm this is the intended behaviour.
            fileName = path.join(self.localDir, fileName)

        for currentFile in glob.glob(path.join(sourceDir, fileName)):
            base_name = path.basename(currentFile)
            d = LocalFile(base_name)
            d.compressed = self.compressed
            self.subfiles.append(d)

    def processWildcardMatches(self):
        # Expand a wildcard namePattern into concrete subfiles (once; an
        # already-populated subfiles list is returned untouched).
        if self.subfiles:
            return self.subfiles

        import glob

        fileName = self.namePattern
        if self.compressed:
            fileName = '%s.gz' % self.namePattern

        sourceDir = self.localDir
        # `regex` is a module-level pattern that detects wildcard characters.
        if regex.search(fileName) is not None:
            for currentFile in glob.glob(path.join(sourceDir, fileName)):
                d = LocalFile(namePattern=path.basename(currentFile),
                              localDir=path.dirname(currentFile))
                d.compressed = self.compressed
                self.subfiles.append(d)

    def getFilenameList(self):
        """Return the files referenced by this LocalFile"""
        filelist = []
        self.processWildcardMatches()
        if self.subfiles:
            for f in self.subfiles:
                filelist.append(path.join(f.localDir, f.namePattern))
        else:
            # No wildcard expansion: report the single configured path
            # (whether or not it exists; existence only affects logging).
            if path.exists(path.join(self.localDir, self.namePattern)):
                logger.debug("File: %s found, Setting localDir: %s" %
                             (self.namePattern, self.localDir))
            filelist.append(path.join(self.localDir, self.namePattern))

        return filelist

    def hasMatchedFiles(self):
        """
        OK for checking subfiles but of no wildcards, need to actually check file exists
        """
        # check for subfiles
        if len(self.subfiles) > 0:
            # we have subfiles so we must have actual files associated
            return True
        else:
            if self.containsWildcards():
                return False

        # check if single file exists (no locations field to try)
        job = self.getJobObject()
        fname = self.namePattern
        if self.compressed:
            fname += ".gz"
        if path.isfile(path.join(job.getOutputWorkspace().getPath(), fname)):
            return True
        return False

    def remove(self):
        # Interactively delete every matched file; each deletion is
        # confirmed on the console (default answer is "y").
        for this_file in self.getFilenameList():
            _actual_delete = False
            keyin = None
            while keyin is None:
                keyin = raw_input(
                    "Do you want to remove the LocalFile: %s ? ([y]/n) " %
                    this_file)
                if keyin.lower() in ['y', '']:
                    _actual_delete = True
                elif keyin.lower() == 'n':
                    _actual_delete = False
                else:
                    logger.warning("y/n please!")
                    keyin = None

            if _actual_delete:
                if not path.exists(this_file):
                    logger.warning("File %s did not exist, can't delete" %
                                   this_file)
                else:
                    logger.info("Deleting: %s" % this_file)

                    import time
                    # Two-stage delete: rename to a timestamped name first,
                    # then remove, so a failed rename falls back to deleting
                    # the original path.
                    remove_filename = this_file + "_" + str(
                        time.time()) + '__to_be_deleted_'

                    try:
                        os.rename(this_file, remove_filename)
                    except Exception as err:
                        logger.warning(
                            "Error in first stage of removing file: %s" %
                            this_file)
                        remove_filename = this_file

                    try:
                        os.remove(remove_filename)
                    except OSError as err:
                        # A file already gone (ENOENT) is not an error.
                        if err.errno != errno.ENOENT:
                            logger.error("Error in removing file: %s" %
                                         remove_filename)
                            raise
                        pass
        return

    def internalCopyTo(self, targetPath):
        """
        Copy a the file to the local storage using the get mechanism
        Args:
            targetPath (str): Target path where the file is to copied to
        """
        for currentFile in glob.glob(
                os.path.join(self.localDir, self.namePattern)):
            shutil.copy(currentFile,
                        path.join(targetPath, path.basename(currentFile)))

    def get(self):
        """
        Method to get the Local file and/or to check that a file exists locally
        """
        # Deliberately do nothing.

    def put(self):
        """
        Copy the file to the detination (in the case of LocalFile the localDir)
        """
        # This is useful for placing the LocalFile in a subdir at the end of a job
        #FIXME this method should be written to work with some other parameter than localDir for job outputs but for now this 'works'
        if self.localDir:
            try:
                job = self.getJobObject()
            except AssertionError as err:
                # Not attached to a job: nothing to copy.
                return

            # Copy to 'desitnation'
            if path.isfile(path.join(job.outputdir, self.namePattern)):
                if not path.exists(path.join(job.outputdir, self.localDir)):
                    os.makedirs(path.join(job.outputdir, self.localDir))
                shutil.copy(
                    path.join(job.outputdir, self.namePattern),
                    path.join(job.outputdir, self.localDir, self.namePattern))

    def cleanUpClient(self):
        """
        This performs the cleanup method on the client output workspace to remove temporary files
        """
        # For LocalFile this is where the file is stored so don't remove it
        pass

    def getWNScriptDownloadCommand(self, indent):
        # create symlink
        shortScript = """
# create symbolic links for LocalFiles
for f in ###FILELIST###:
    if not os.path.exists(os.path.basename(f)):
        os.symlink(f, os.path.basename(f))
"""
        from Ganga.GPIDev.Lib.File import FileUtils
        shortScript = FileUtils.indentScript(shortScript, '###INDENT###')

        shortScript = shortScript.replace('###FILELIST###',
                                          "%s" % self.getFilenameList())

        return shortScript

    def getWNInjectedScript(self, outputFiles, indent, patternsToZip,
                            postProcessLocationsFP):
        # Generate the worker-node snippet that copies the first output file
        # back into the job's output workspace (note the `break`: only the
        # first entry of outputFiles is handled).
        cp_template = """
###INDENT###os.system("###CP_COMMAND###")
"""
        script = ""

        j = self.getJobObject()
        output_dir = j.getOutputWorkspace(create=True).getPath()

        for this_file in outputFiles:
            filename = this_file.namePattern
            cp_cmd = 'cp %s %s' % (filename, quote(output_dir))

            this_cp = cp_template

            replace_dict = {'###INDENT###': indent, '###CP_COMMAND###': cp_cmd}

            for k, v in replace_dict.iteritems():
                this_cp = this_cp.replace(k, v)

            script = this_cp
            break

        return script
class MultiPostProcessor(IPostProcessor):
    """
    Contains and executes many postprocessors. This is the object which
    is attached to a job. Should behave like a list to the user.
    """

    _category = 'postprocessor'
    _exportmethods = ['__add__', '__get__', '__getitem__', '__len__',
                      'append', 'remove']
    _name = 'MultiPostProcessor'
    _schema = Schema(Version(1, 0), {
        'process_objects': ComponentItem('postprocessor',
                                         defvalue=[],
                                         hidden=1,
                                         doc='A list of Processors to run',
                                         sequence=1)
    })

    def __init__(self, *args):
        """Accept postprocessors singly, in lists/tuples/GangaLists, or
        from another MultiPostProcessor, flattening them into one list."""
        super(MultiPostProcessor, self).__init__()

        for process in args:
            if isinstance(process, MultiPostProcessor):
                for process_ in process.process_objects:
                    self.addProcess(process_)
            elif isinstance(process, (list, tuple, GangaList)):
                for process_ in process:
                    self.addProcess(process_)
            else:
                self.addProcess(process)

        # Keep processors ordered by their declared 'order' attribute.
        if hasattr(self.process_objects, 'order'):
            self.process_objects = sorted(self.process_objects,
                                          key=lambda process: process.order)

    def __str__(self):
        if not isType(self.process_objects, GangaObject):
            return str(self.process_objects)
        else:
            return str(GPIProxyObjectFactory(self.process_objects))

    def append(self, value):
        # Add and re-sort so execution order follows 'order'.
        self.addProcess(value)
        self.process_objects = sorted(self.process_objects,
                                      key=lambda process: process.order)

    def remove(self, value):
        """Remove the first stored processor of the same type as `value`."""
        for process in self.process_objects:
            # IDIOM FIX: was `isType(...) == True` — redundant comparison
            # to the True singleton.
            if isType(value, type(process)):
                self.process_objects.remove(process)
                break

    def __get__(self):
        return GPIProxyObjectFactory(self.process_objects)

    def __getitem__(self, i):
        return GPIProxyObjectFactory(self.process_objects[i])

    def execute(self, job, newstatus, **options):
        """Run each postprocessor in turn; the combined result is False
        as soon as any single postprocessor returned False."""
        # run the merger objects one at a time
        process_results = []
        for p in self.process_objects:
            # stop infinite recursion
            if p is self:
                continue
            # execute all postprocessors
            process_result = p.execute(job, newstatus, **options)
            # `== False` kept deliberately: it also matches 0 but not
            # other falsy values such as None or [].
            if process_result == False:
                newstatus = 'failed'
            process_results.append(process_result)
        # if one fails then we all fail
        # IDIOM FIX: was `not False in process_results`.
        return False not in process_results

    def addProcess(self, process_object):
        """Adds a process object to the list of processes to be done."""
        self.process_objects.append(process_object)

    def __len__(self):
        return len(self.process_objects)

    def printSummaryTree(self, level=0, verbosity_level=0,
                         whitespace_marker='', out=None, selection='',
                         interactive=False):
        """If this method is overridden, the following should be noted:

        level: the hierachy level we are currently at in the object tree.
        verbosity_level: How verbose the print should be. Currently this is always 0.
        whitespace_marker: If printing on multiple lines, this allows the default indentation to be replicated.
                           The first line should never use this, as the substitution is 'name = %s' % printSummaryTree()
        out: An output stream to print to. The last line of output should be printed without a newline.'
        selection: See VPrinter for an explaintion of this.
        """
        out.write(str(self.process_objects))
class LogicalFile(DiracFile):
    # Logical File schema
    # Observing the 'old' 1.0 schema whilst preserving backwards compatability
    # with the fact that we're translating the object into a DiracFile in this
    # case
    _schema = Schema(
        Version(1, 0),
        {
            'name': SimpleItem(
                defvalue="",
                doc='the LFN filename a LogicalFile is constructed with'),
            'namePattern': SimpleItem(
                defvalue="", doc='pattern of the file name', transient=1),
            'localDir': SimpleItem(
                defvalue=None,
                copyable=1,
                typelist=['str', 'type(None)'],
                doc=
                'local dir where the file is stored, used from get and put methods',
                transient=1),
            'remoteDir': SimpleItem(
                defvalue="",
                doc=
                'remote directory where the LFN is to be placed in the dirac base directory by the put method.',
                transient=1),
            'locations': SimpleItem(
                defvalue=[],
                copyable=1,
                typelist=['str'],
                sequence=1,
                doc="list of SE locations where the outputfiles are uploaded",
                transient=1),
            'compressed': SimpleItem(
                defvalue=False,
                typelist=['bool'],
                protected=0,
                doc=
                'wheather the output file should be compressed before sending somewhere',
                transient=1),
            'lfn': SimpleItem(
                defvalue='',
                copyable=1,
                typelist=['str'],
                doc=
                'return the logical file name/set the logical file name to use if not '
                'using wildcards in namePattern',
                transient=1),
            'guid': SimpleItem(
                defvalue='',
                copyable=1,
                typelist=['str'],
                doc=
                'return the GUID/set the GUID to use if not using wildcards in the namePattern.',
                transient=1),
            'subfiles': ComponentItem(
                category='gangafiles',
                defvalue=[],
                hidden=1,
                sequence=1,
                copyable=0,
                typelist=['GangaDirac.Lib.Files.DiracFile'],
                doc="collected files from the wildcard namePattern",
                transient=1),
            'failureReason': SimpleItem(
                defvalue="",
                protected=1,
                copyable=0,
                doc='reason for the upload failure',
                transient=1)
        })

    _name = "LogicalFile"
    # TODO: Add warning to User NOT to create these objects themselves and that they should
    # only be used for backwards compatability to load old jobs

    def __init__(self, name=""):
        """Deprecated wrapper: constructs a DiracFile with `name` as the LFN
        and warns the user to migrate their scripts."""
        super(LogicalFile, self).__init__(lfn=name)

        self.name = name

        logger.warning(
            "!!! LogicalFile has been deprecated, this is now just a wrapper to the DiracFile object"
        )
        logger.warning(
            "!!! Please update your scripts before LogicalFile is removed")

        self._setLFNnamePattern(_lfn=self.name, _namePattern='')

    def __setstate__(self, dict):
        # On unpickling/loading from the repository, re-derive the LFN /
        # namePattern from the stored name.
        super(LogicalFile, self).__setstate__(dict)
        self._setLFNnamePattern(_lfn=self.name, _namePattern='')

    def __construct__(self, args):
        # Legacy GPI construction hook: a single string argument is the LFN;
        # anything else is delegated to the DiracFile constructor logic.
        if len(args) >= 1:
            self.name = args[0]
            self._setLFNnamePattern(_lfn=self.name, _namePattern='')
        if (len(args) != 1) or (type(args[0]) is not type('')):
            super(LogicalFile, self).__construct__(args)
        else:
            self.name = strip_filename(args[0])

    def __setattr__(self, name, value):
        # Assigning `name` keeps the DiracFile view (lfn, namePattern,
        # remoteDir) in sync with the legacy attribute.
        if name == "name":
            self.lfn = value
            import os.path
            self.namePattern = os.path.basename(value)
            self.remoteDir = os.path.dirname(value)
        super(LogicalFile, self).__setattr__(name, value)

    def _attribute_filter__set__(self, attrib_name, value):
        # NOTE(review): this call uses `lfn=`/`namePattern=` whereas every
        # other call site uses `_lfn=`/`_namePattern=` — confirm which kwarg
        # spelling DiracFile._setLFNnamePattern actually accepts.
        if attrib_name == "name":
            self._setLFNnamePattern(lfn=value, namePattern='')
        return super(LogicalFile, self)._attribute_filter__set__(
            attrib_name, value)
class IUnit(GangaObject): _schema = Schema( Version(1, 0), { 'status': SimpleItem(defvalue='new', protected=1, copyable=0, doc='Status - running, pause or completed', typelist=["str"]), 'name': SimpleItem(defvalue='Simple Unit', doc='Name of the unit (cosmetic)', typelist=["str"]), 'application': ComponentItem('applications', defvalue=None, optional=1, load_default=False, doc='Application of the Transform.'), 'inputdata': ComponentItem('datasets', defvalue=None, optional=1, load_default=False, doc='Input dataset'), 'outputdata': ComponentItem('datasets', defvalue=None, optional=1, load_default=False, doc='Output dataset'), 'active': SimpleItem(defvalue=False, hidden=1, doc='Is this unit active'), 'active_job_ids': SimpleItem(defvalue=[], typelist=['int'], sequence=1, hidden=1, doc='Active job ids associated with this unit'), 'prev_job_ids': SimpleItem(defvalue=[], typelist=['int'], sequence=1, hidden=1, doc='Previous job ids associated with this unit'), 'minor_resub_count': SimpleItem(defvalue=0, hidden=1, doc='Number of minor resubmits'), 'major_resub_count': SimpleItem(defvalue=0, hidden=1, doc='Number of major resubmits'), 'req_units': SimpleItem( defvalue=[], typelist=['str'], sequence=1, hidden=1, doc= 'List of units that must complete for this to start (format TRF_ID:UNIT_ID)' ), 'start_time': SimpleItem( defvalue=0, hidden=1, doc='Start time for this unit. Allows a delay to be put in'), 'copy_output': ComponentItem( 'datasets', defvalue=None, load_default=0, optional=1, doc= 'The dataset to copy the output of this unit to, e.g. 
Grid dataset -> Local Dataset' ), 'merger': ComponentItem('mergers', defvalue=None, load_default=0, optional=1, doc='Merger to be run after this unit completes.'), 'splitter': ComponentItem('splitters', defvalue=None, optional=1, load_default=False, doc='Splitter used on each unit of the Transform.'), 'postprocessors': ComponentItem( 'postprocessor', defvalue=None, doc='list of postprocessors to run after job has finished'), 'inputsandbox': FileItem(defvalue=[], typelist=['str', 'Ganga.GPIDev.Lib.File.File.File'], sequence=1, doc="list of File objects shipped to the worker node "), 'inputfiles': GangaFileItem( defvalue=[], typelist=[ 'str', 'Ganga.GPIDev.Adapters.IGangaFile.IGangaFile' ], sequence=1, doc= "list of file objects that will act as input files for a job"), 'outputfiles': GangaFileItem( defvalue=[], typelist=[ 'str', 'Ganga.GPIDev.Adapters.IGangaFile.IGangaFile' ], sequence=1, doc="list of OutputFile objects to be copied to all jobs"), 'info': SimpleItem(defvalue=[], typelist=['str'], protected=1, sequence=1, doc="Info showing status transitions and unit info"), 'id': SimpleItem(defvalue=-1, protected=1, doc='ID of the Unit', typelist=["int"]), }) _category = 'units' _name = 'IUnit' _exportmethods = [] _hidden = 0 # Special methods: def __init__(self): super(IUnit, self).__init__() self.updateStatus("new") def _readonly(self): """A unit is read-only if the status is not new.""" if self.status == "new": return 0 return 1 def validate(self): """Validate that this unit is OK and set it to active""" self.active = True return True def getID(self): """Get the ID of this unit within the transform""" # if the id isn't already set, use the index from the parent Task if self.id < 0: trf = self._getParent() if not trf: raise ApplicationConfigurationError( None, "This unit has not been associated with a transform and so there is no ID available" ) self.id = trf.units.index(self) return self.id def updateStatus(self, status): """Update status hook""" addInfoString( 
self, "Status change from '%s' to '%s'" % (self.status, status)) self.status = status def createNewJob(self): """Create any jobs required for this unit""" pass def checkCompleted(self, job): """Check if this unit is complete""" if job.status == "completed": return True else: return False def checkForSubmission(self): """Check if this unit should submit a job""" # check the delay if time.time() < self.start_time: return False # check if we already have a job if len(self.active_job_ids) != 0: return False # if we're using threads, check the max number if self._getParent( ).submit_with_threads and GPI.queues.totalNumUserThreads( ) > self._getParent().max_active_threads: return False return True def checkForResubmission(self): """check if this unit should be resubmitted""" # check if we already have a job if len(self.active_job_ids) == 0: return False else: job = GPI.jobs(self.active_job_ids[0]) if job.status in ["failed", "killed"]: return True return False def checkParentUnitsAreComplete(self): """Check to see if the parent units are complete""" req_ok = True task = self._getParent()._getParent() for req in self.req_units: req_trf_id = int(req.split(":")[0]) if req.find("ALL") == -1: req_unit_id = int(req.split(":")[1]) if task.transforms[req_trf_id].units[ req_unit_id].status != "completed": req_ok = False else: # need all units from this trf for u in task.transforms[req_trf_id].units: if u.status != "completed": req_ok = False return req_ok def checkMajorResubmit(self, job): """check if this job needs to be fully rebrokered or not""" pass def majorResubmit(self, job): """perform a mjor resubmit/rebroker""" self.prev_job_ids.append(job.id) self.active_job_ids.remove(job.id) def minorResubmit(self, job): """perform just a minor resubmit""" try: trf = self._getParent() except Exception as err: logger.debug("GetParent exception!\n%s" % str(err)) trf = None if trf is not None and trf.submit_with_threads: addInfoString(self, "Attempting job re-submission with queues...") 
GPI.queues.add(job.resubmit) else: addInfoString(self, "Attempting job re-submission...") job.resubmit() def update(self): """Update the unit and (re)submit jobs as required""" #logger.warning("Entered Unit %d update function..." % self.getID()) # if we're complete, then just return if self.status in ["completed", "recreating"] or not self.active: return 0 # check if submission is needed task = self._getParent()._getParent() trf = self._getParent() maxsub = task.n_tosub() # check parent unit(s) req_ok = self.checkParentUnitsAreComplete() # set the start time if not already set if len(self.req_units) > 0 and req_ok and self.start_time == 0: self.start_time = time.time() + trf.chain_delay * 60 - 1 if req_ok and self.checkForSubmission() and maxsub > 0: # create job and submit addInfoString(self, "Creating Job...") j = self.createNewJob() if j.name == '': j.name = "T%i:%i U%i" % (task.id, trf.getID(), self.getID()) try: if trf.submit_with_threads: addInfoString(self, "Attempting job submission with queues...") GPI.queues.add(j.submit) else: addInfoString(self, "Attempting job submission...") j.submit() except Exception as err: logger.debug("update Err: %s" % str(err)) addInfoString(self, "Failed Job Submission") addInfoString(self, "Reason: %s" % (formatTraceback())) logger.error("Couldn't submit the job. Deactivating unit.") self.prev_job_ids.append(j.id) self.active = False trf._setDirty() # ensure everything's saved return 1 self.active_job_ids.append(j.id) self.updateStatus("running") trf._setDirty() # ensure everything's saved if trf.submit_with_threads: return 0 return 1 # update any active jobs for jid in self.active_job_ids: # we have an active job so see if this job is OK and resubmit if # not try: job = GPI.jobs(jid) except Exception as err: logger.debug("Update2 Err: %s" % str(err)) logger.warning( "Cannot find job with id %d. 
Maybe reset this unit with: tasks(%d).transforms[%d].resetUnit(%d)" % (jid, task.id, trf.getID(), self.getID())) continue if job.status == "completed": # check if actually completed if not self.checkCompleted(job): return 0 # check for DS copy if trf.unit_copy_output: if not self.copy_output: trf.createUnitCopyOutputDS(self.getID()) if not self.copyOutput(): return 0 # check for merger if trf.unit_merger: if not self.merger: self.merger = trf.createUnitMerger(self.getID()) if not self.merge(): return 0 # all good so mark unit as completed self.updateStatus("completed") elif job.status == "failed" or job.status == "killed": # check for too many resubs if self.minor_resub_count + self.major_resub_count > trf.run_limit - 1: logger.error( "Too many resubmits (%i). Deactivating unit." % (self.minor_resub_count + self.major_resub_count)) addInfoString( self, "Deactivating unit. Too many resubmits (%i)" % (self.minor_resub_count + self.major_resub_count)) self.active = False return 0 rebroker = False if self.minor_resub_count > trf.minor_run_limit - 1: if self._getParent().rebroker_on_job_fail: rebroker = True else: logger.error( "Too many minor resubmits (%i). Deactivating unit." % self.minor_resub_count) addInfoString( self, "Deactivating unit. Too many resubmits (%i)" % (self.minor_resub_count + self.minor_resub_count)) self.active = False return 0 if self.major_resub_count > trf.major_run_limit - 1: logger.error( "Too many major resubmits (%i). Deactivating unit." % self.major_resub_count) addInfoString( self, "Deactivating unit. Too many resubmits (%i)" % (self.minor_resub_count + self.major_resub_count)) self.active = False return 0 # check the type of resubmit if rebroker or self.checkMajorResubmit(job): self.major_resub_count += 1 self.minor_resub_count = 0 try: addInfoString(self, "Attempting major resubmit...") self.majorResubmit(job) except Exception as err: logger.debug("Update Err3: %s" % str(err)) logger.error( "Couldn't resubmit the job. 
Deactivating unit.") addInfoString(self, "Failed Job resubmission") addInfoString(self, "Reason: %s" % (formatTraceback())) self.active = False # break the loop now because we've probably changed the # active jobs list return 1 else: self.minor_resub_count += 1 try: addInfoString(self, "Attempting minor resubmit...") self.minorResubmit(job) except Exception as err: logger.debug("Update Err4: %s" % str(err)) logger.error( "Couldn't resubmit the job. Deactivating unit.") addInfoString(self, "Failed Job resubmission") addInfoString(self, "Reason: %s" % (formatTraceback())) self.active = False return 1 def reset(self): """Reset the unit completely""" addInfoString(self, "Reseting Unit...") self.minor_resub_count = 0 self.major_resub_count = 0 if len(self.active_job_ids) > 0: self.prev_job_ids += self.active_job_ids self.active_job_ids = [] self.active = True # if has parents, set to recreate if len(self.req_units) > 0: self.updateStatus("recreating") else: self.updateStatus("running") # Info routines def n_active(self): if self.status == 'completed': return 0 tot_active = 0 active_states = ['submitted', 'running'] for jid in self.active_job_ids: try: job = GPI.jobs(jid) except Exception as err: logger.debug("n_active Err: %s" % str(err)) task = self._getParent()._getParent() trf = self._getParent() logger.warning( "Cannot find job with id %d. 
Maybe reset this unit with: tasks(%d).transforms[%d].resetUnit(%d)" % (jid, task.id, trf.getID(), self.getID())) continue j = stripProxy(job) # try to preserve lazy loading if hasattr(j, 'getNodeIndexCache') and j.getNodeIndexCache( ) and 'subjobs:status' in j.getNodeIndexCache(): if len(j.getNodeIndexCache()['subjobs:status']) > 0: for sj_stat in j.getNodeIndexCache()['subjobs:status']: if sj_stat in active_states: tot_active += 1 else: if j.getNodeIndexCache()['status'] in active_states: tot_active += 1 else: #logger.warning("WARNING: (active check) No index cache for job object %d" % jid) if j.status in active_states: if j.subjobs: for sj in j.subjobs: if sj.status in active_states: tot_active += 1 else: tot_active += 1 return tot_active def n_status(self, status): tot_active = 0 for jid in self.active_job_ids: try: job = GPI.jobs(jid) except Exception as err: logger.debug("n_status Err: %s" % str(err)) task = self._getParent()._getParent() trf = self._getParent() logger.warning( "Cannot find job with id %d. 
Maybe reset this unit with: tasks(%d).transforms[%d].resetUnit(%d)" % (jid, task.id, trf.getID(), self.getID())) continue j = stripProxy(job) # try to preserve lazy loading if hasattr(j, 'getNodeIndexCache') and j.getNodeIndexCache( ) and 'subjobs:status' in j.getNodeIndexCache(): if len(j.getNodeIndexCache()['subjobs:status']) > 0: for sj_stat in j.getNodeIndexCache()['subjobs:status']: if sj_stat == status: tot_active += 1 else: if j.getNodeIndexCache()['status'] == status: tot_active += 1 else: #logger.warning("WARNING: (status check) No index cache for job object %d" % jid) if j.subjobs: for sj in j.subjobs: if sj.status == status: tot_active += 1 else: if j.status == status: tot_active += 1 return tot_active def n_all(self): total = 0 for jid in self.active_job_ids: try: job = GPI.jobs(jid) except Exception as err: logger.debug("n_all Err: %s" % str(err)) task = self._getParent()._getParent() trf = self._getParent() logger.warning( "Cannot find job with id %d. Maybe reset this unit with: tasks(%d).transforms[%d].resetUnit(%d)" % (jid, task.id, trf.getID(), self.getID())) continue j = stripProxy(job) # try to preserve lazy loading if hasattr(j, 'getNodeIndexCache') and j.getNodeIndexCache( ) and 'subjobs:status' in j.getNodeIndexCache(): if len(j.getNodeIndexCache()['subjobs:status']) != 0: total += len(j.getNodeIndexCache()['subjobs:status']) else: total += 1 else: #logger.warning("WARNING: (status check) No index cache for job object %d" % jid) if j.subjobs: total = len(j.subjobs) else: total = 1 return total def overview(self): """Print an overview of this unit""" o = " Unit %d: %s " % (self.getID(), self.name) for s in ["submitted", "running", "completed", "failed", "unknown"]: o += markup("%i " % self.n_status(s), overview_colours[s]) print(o) def copyOutput(self): """Copy any output to the given dataset""" logger.error( "No default implementation for Copy Output - contact plugin developers" ) return False
class Remote(IBackend):
    """Remote backend - submit jobs to a Remote pool.

    The remote backend works as an SSH tunnel to a remote site
    where a ganga session is opened and the job submitted there
    using the specified remote_backend. It is (in theory!) transparent
    to the user and should allow submission of any jobs to any
    backends that are already possible in Ganga

    NOTE: Due to the file transfers required, there can be some slow
    down during submission and monitoring

    E.g. 1 - Hello World example submitted to local backend:

    j = Job(application=Executable(exe='/bin/echo',args=['Hello World']), backend="Remote")
    j.backend.host = "bluebear.bham.ac.uk"                 # Host name
    j.backend.username = "******"                          # User name
    j.backend.ganga_cmd = "/bb/projects/Ganga/runGanga"    # Ganga Command line on remote site
    j.backend.ganga_dir = "/bb/phy/slatermw/gangadir/remote_jobs"  # Where to store the jobs
    j.backend.remote_backend = Local()
    j.submit()

    E.g. 2 - Root example submitted to PBS backend:

    r = Root()
    r.version = '5.14.00'
    r.script = 'gengaus.C'

    j = Job(application=r,backend="Remote")
    j.backend.host = "bluebear.bham.ac.uk"
    j.backend.username = "******"
    j.backend.ganga_cmd = "/bb/projects/Ganga/runGanga"
    j.backend.ganga_dir = "/bb/phy/slatermw/gangadir/remote_jobs"
    j.outputsandbox = ['gaus.txt']
    j.backend.remote_backend = PBS()
    j.submit()

    E.g. 3 - Athena example submitted to LCG backend
    NOTE: you don't need a grid certificate (or UI) available on the local machine,
    just the remote machine

    j = Job()
    j.name='Ex3_2_1'
    j.application=Athena()
    j.application.prepare(athena_compile=False)
    j.application.option_file='/disk/f8b/home/mws/athena/testarea/13.0.40/PhysicsAnalysis/AnalysisCommon/UserAnalysis/run/AthExHelloWorld_jobOptions.py'

    j.backend = Remote()
    j.backend.host = "bluebear.bham.ac.uk"
    j.backend.username = "******"
    j.backend.ganga_cmd = "/bb/projects/Ganga/runGanga"
    j.backend.ganga_dir = "/bb/phy/slatermw/gangadir/remote_jobs"
    j.backend.environment = {'ATLAS_VERSION' : '13.0.40'}   # Additional environment variables
    j.backend.remote_backend = LCG()
    j.backend.remote_backend.CE = 'epgce2.ph.bham.ac.uk:2119/jobmanager-lcgpbs-short'

    j.submit()

    E.g. 4 - Hello World submitted at CERN on LSF using atlas startup

    j = Job()
    j.backend = Remote()
    j.backend.host = "lxplus.cern.ch"
    j.backend.username = "******"
    j.backend.ganga_cmd = "ganga"
    j.backend.ganga_dir = "/afs/cern.ch/user/m/mslater/gangadir/remote_jobs"
    j.backend.pre_script = ['source /afs/cern.ch/sw/ganga/install/etc/setup-atlas.csh'] # source the atlas setup script before running ganga
    j.backend.remote_backend = LSF()

    j.submit()
    """

    _schema = Schema(Version(1, 0), {
        "remote_backend": ComponentItem('backends', doc='specification of the resources to be used (e.g. batch system)'),
        "host": SimpleItem(defvalue="", doc="The remote host and port number ('host:port') to use. Default port is 22."),
        "ssh_key": SimpleItem(defvalue="", doc="Set to true to the location of the the ssh key to use for authentication, e.g. /home/mws/.ssh/id_rsa. Note, you should make sure 'key_type' is also set correctly."),
        "key_type": SimpleItem(defvalue="RSA", doc="Set to the type of ssh key to use (if required). Possible values are 'RSA' and 'DSS'."),
        "username": SimpleItem(defvalue="", doc="The username at the remote host"),
        "ganga_dir": SimpleItem(defvalue="", doc="The directory to use for the remote workspace, repository, etc."),
        "ganga_cmd": SimpleItem(defvalue="", doc="Command line to start ganga on the remote host"),
        "environment": SimpleItem(defvalue={}, doc="Overides any environment variables set in the job"),
        "pre_script": SimpleItem(defvalue=[''], doc="Sequence of commands to execute before running Ganga on the remote site"),
        'remote_job_id': SimpleItem(defvalue=0, protected=1, copyable=0, doc='Remote job id.'),
        'exitcode': SimpleItem(defvalue=0, protected=1, copyable=0, doc='Application exit code'),
        'actualCE': SimpleItem(defvalue=0, protected=1, copyable=0, doc='Computing Element where the job actually runs.')
    })

    _category = "backends"
    _name = "Remote"
    #_hidden = False # KUBA: temporarily disabled from the public

    # NOTE(review): these are class-level attributes, so the open transport,
    # sftp client and key cache are shared across all Remote instances in the
    # session — presumably intentional (one tunnel per host), but verify.
    _port = 22
    _transport = None
    _sftp = None
    _code = randomString()
    _transportarray = None
    _key = {}

    _exportmethods = ['setup']

    def __init__(self):
        super(Remote, self).__init__()

    def __del__(self):
        # Close the shared transport on destruction, if one was opened.
        if (self._transport != None):
            self._transport.close()
            self._transport = None

    def setup(self):  # KUBA: generic setup hook
        """Re-establish the SSH connection for a job that is still in an
        active state; the remote-update script below is currently disabled
        (the send/run/remove steps are commented out)."""
        job = self.getJobObject()
        if job.status in ['submitted', 'running', 'completing']:

            # Send a script over to the remote site that updates this jobs
            # info with the info of the remote job
            import os

            # Create a ganga script that updates the job info from the remote
            # site
            script = """#!/usr/bin/env python
from __future__ import print_function
#-----------------------------------------------------
# This is a setup script for a remote job. It
# does very litte
#-----------------------------------------------------

# print a finished token
print("***_FINISHED_***")
"""

            # check for the connection
            if (self.opentransport() == False):
                return False

            # send the script
            #script_name = '/__setupscript__%s.py' % self._code
            #self._sftp.open(self.ganga_dir + script_name, 'w').write(script)

            # run the script
            #stdout, stderr = self.run_remote_script( script_name, self.pre_script )

            # remove the script
            #self._sftp.remove(self.ganga_dir + script_name)

        return True

    def opentransport(self):
        """Open (or reuse) the paramiko SSH transport and SFTP client for
        this username/host, prompting for a key passphrase or password.

        Returns True on success, False on failure, and None when an already
        open transport is reused.
        """
        import paramiko
        import getpass
        import atexit

        if (self._transport != None):
            # transport is open
            return

        # check for a useable transport for this username and host
        # NOTE(review): _transportarray is built as nested pairs
        # [old_array, [user, host, transport, sftp]], so iterating it mixes
        # sub-arrays and entry lists — the (t[0]==...) guards rely on that.
        if Remote._transportarray != None:
            for t in Remote._transportarray:
                if (t != None) and (t[0] == self.username) and (t[1] == self.host):

                    # check for too many retries on the same host
                    if t[2] == None or t[3] == None:
                        logger.warning("Too many retries for remote host " + self.username + "@" + self.host + ". Restart Ganga to have another go.")
                        return False

                    self._transport = t[2]
                    self._sftp = t[3]

                    # ensure that the remote dir is still there - it will crash if the dir structure
                    # changes with the sftp sill open
                    channel = self._transport.open_session()
                    channel.exec_command('mkdir -p ' + self.ganga_dir)
                    bufout = ""
                    while not channel.exit_status_ready():
                        if channel.recv_ready():
                            bufout = channel.recv(1024)

                    return

        # Ask user for password - give three tries
        num_try = 0
        password = ""
        while num_try < 3:

            try:
                temp_host = self.host
                temp_port = self._port
                if self.host.find(":") != -1:
                    # user specified port
                    # NOTE(review): eval() of user-supplied text; int() would
                    # be the safe equivalent — confirm before changing.
                    temp_port = eval(self.host[self.host.find(":") + 1:])
                    temp_host = self.host[: self.host.find(":")]

                self._transport = paramiko.Transport((temp_host, temp_port))

                # avoid hang on exit my daemonising the thread
                # NOTE(review): Thread.setDaemon() is deprecated in modern
                # Python in favour of the .daemon attribute.
                self._transport.setDaemon(True)

                # register for proper shutdown
                atexit.register(shutdown_transport, self._transport)

                if self.ssh_key != "" and os.path.exists(os.path.expanduser(os.path.expandvars(self.ssh_key))):
                    # key-based authentication; the decrypted key is cached
                    # per key path in the class-level Remote._key dict
                    privatekeyfile = os.path.expanduser(os.path.expandvars(self.ssh_key))

                    if self.ssh_key not in Remote._key:
                        if self.key_type == "RSA":
                            password = getpass.getpass('Enter passphrase for key \'%s\': ' % (self.ssh_key))
                            Remote._key[self.ssh_key] = paramiko.RSAKey.from_private_key_file(privatekeyfile, password=password)
                        elif self.key_type == "DSS":
                            password = getpass.getpass('Enter passphrase for key \'%s\': ' % (self.ssh_key))
                            Remote._key[self.ssh_key] = paramiko.DSSKey.from_private_key_file(privatekeyfile, password=password)
                        else:
                            logger.error("Unknown ssh key_type '%s'. Unable to connect." % self.key_type)
                            return False

                    self._transport.connect(username=self.username, pkey=Remote._key[self.ssh_key])
                else:
                    # password-based authentication
                    logger.debug("SSH key: %s" % self.ssh_key)
                    if os.path.exists(os.path.expanduser(os.path.expandvars(self.ssh_key))):
                        logger.debug("PATH: %s Exists" % os.path.expanduser(os.path.expandvars(self.ssh_key)))
                    else:
                        logger.debug("PATH: %s Does NOT Exist" % os.path.expanduser(os.path.expandvars(self.ssh_key)))

                    if self.username != "" and self.host != "":
                        password = getpass.getpass('Password for %s@%s: ' % (self.username, self.host))
                        self._transport.connect(username=self.username, password=password)
                    elif self.username == "":
                        logger.error("ERROR: USERNAME NOT DEFINED!!!")
                        return False
                    elif self.host == "":
                        logger.error("ERROR: HOSTNAME NOT DEFINED!!!")
                        return False
                    else:
                        pass

                # blank the password just in case
                password = "******"

                # make sure the remote workspace exists and open sftp
                channel = self._transport.open_session()
                channel.exec_command('mkdir -p ' + self.ganga_dir)
                self._sftp = paramiko.SFTPClient.from_transport(self._transport)

                # Add to the transport array
                Remote._transportarray = [Remote._transportarray,
                                          [self.username, self.host, self._transport, self._sftp]]
                num_try = 1000

            except Exception as err:
                logger.debug("Err: %s" % str(err))
                logger.warning("Error when comunicating with remote host. Retrying...")
                self._transport = None
                self._sftp = None
                if self.ssh_key in Remote._key:
                    del Remote._key[self.ssh_key]

                num_try = num_try + 1

        if num_try == 3:
            logger.error("Could not logon to remote host " + self.username + "@" + self.host + " after three attempts. Restart Ganga to have another go.")
            # record the failure so later calls short-circuit (see above)
            Remote._transportarray = [Remote._transportarray,
                                      [self.username, self.host, None, None]]
            return False

        return True

    def run_remote_script(self, script_name, pre_script):
        """Run a ganga script on the remote site"""

        import getpass

        # Set up a command file to source. This gets around a silly alias
        # problem
        cmd_str = ""
        for c in pre_script:
            cmd_str += c + '\n'

        cmd_str += self.ganga_cmd + \
            " -o\'[Configuration]gangadir=" + self.ganga_dir + "\' "
        cmd_str += self.ganga_dir + script_name + '\n'
        cmd_file = os.path.join(self.ganga_dir, "__gangacmd__" + randomString())
        self._sftp.open(cmd_file, 'w').write(cmd_str)

        # run ganga command
        channel = self._transport.open_session()
        channel.exec_command("source " + cmd_file)

        # Read the output after command
        stdout = bufout = ""
        stderr = buferr = ""
        grid_ok = False

        while not channel.exit_status_ready():

            if channel.recv_ready():
                bufout = channel.recv(1024)
                stdout += bufout

            if channel.recv_stderr_ready():
                buferr = channel.recv_stderr(1024)
                stderr += buferr

            # stop as soon as the sentinel token appears in stdout
            if stdout.find("***_FINISHED_***") != -1:
                break

            # interactive GRID pass-phrase prompt from the remote session
            if (bufout.find("GRID pass") != -1 or buferr.find("GRID pass") != -1):
                grid_ok = True
                password = getpass.getpass('Enter GRID pass phrase: ')
                channel.send(password + "\n")
                password = ""

            bufout = buferr = ""

        self._sftp.remove(cmd_file)

        return stdout, stderr

    def submit(self, jobconfig, master_input_sandbox):
        """Submit the job to the remote backend.

            Return value: True if job is submitted successfully,
                          or False otherwise"""

        import os
        import getpass

        # First some sanity checks...
        fail = 0
        if self.remote_backend == None:
            logger.error("No backend specified for remote host.")
            fail = 1
        if self.host == "":
            logger.error("No remote host specified.")
            fail = 1
        if self.username == "":
            logger.error("No username specified.")
            fail = 1
        if self.ganga_dir == "":
            logger.error("No remote ganga directory specified.")
            fail = 1
        if self.ganga_cmd == "":
            logger.error("No ganga command specified.")
            fail = 1

        if fail:
            return 0

        # initiate the connection
        if self.opentransport() == False:
            return 0

        # Tar up the input sandbox and copy to the remote cluster
        job = self.getJobObject()
        subjob_input_sandbox = job.createPackedInputSandbox(jobconfig.getSandboxFiles())
        input_sandbox = subjob_input_sandbox + master_input_sandbox

        # send the sandbox
        sbx_name = '/__subjob_input_sbx__%s' % self._code
        self._sftp.put(subjob_input_sandbox[0], self.ganga_dir + sbx_name)
        sbx_name = '/__master_input_sbx__%s' % self._code
        self._sftp.put(master_input_sandbox[0], self.ganga_dir + sbx_name)

        # run the submit script on the remote cluster
        scriptpath = self.preparejob(jobconfig, master_input_sandbox)

        # send the script
        data = open(scriptpath, 'r').read()
        script_name = '/__jobscript_run__%s.py' % self._code
        self._sftp.open(self.ganga_dir + script_name, 'w').write(data)

        # run the script
        stdout, stderr = self.run_remote_script(script_name, self.pre_script)

        # delete the jobscript
        self._sftp.remove(self.ganga_dir + script_name)

        # Copy the job object
        if stdout.find("***_FINISHED_***") != -1:
            status, outputdir, id, be = self.grabremoteinfo(stdout)

            self.remote_job_id = id
            if hasattr(self.remote_backend, 'exitcode'):
                self.exitcode = be.exitcode
            if hasattr(self.remote_backend, 'actualCE'):
                self.actualCE = be.actualCE

            # copy each variable in the schema
            # Please can someone tell me why I can't just do
            # self.remote_backend = be?
            for o in be._schema.allItems():
                exec("self.remote_backend." + o[0] + " = be." + o[0])

            return 1
        else:
            logger.error("Problem submitting the job on the remote site.")
            logger.error("<last 1536 bytes of stderr>")
            cut = stderr[len(stderr) - 1536:]
            for ln in cut.splitlines():
                logger.error(ln)
            logger.error("<end of last 1536 bytes of stderr>")

        return 0

    def kill(self):
        """Kill running job.

           No arguments other than self

           Return value: True if job killed successfully,
                         or False otherwise"""

        script = """#!/usr/bin/env python
from __future__ import print_function
#-----------------------------------------------------
# This is a kill script for a remote job. It
# attempts to kill the given job and returns
#-----------------------------------------------------
import os,os.path,shutil,tempfile
import sys,popen2,time,traceback

############################################################################################
###INLINEMODULES###
############################################################################################

code = ###CODE###
jid = ###JOBID###

j = jobs( jid )
j.kill()

# Start pickle token
print("***_START_PICKLE_***")

# pickle the job
import pickle
print(j.outputdir)
print(pickle.dumps(j._impl))
print(j)

# print a finished token
print("***_END_PICKLE_***")
print("***_FINISHED_***")
"""
        script = script.replace('###CODE###', repr(self._code))
        script = script.replace('###JOBID###', str(self.remote_job_id))

        # check for the connection
        if (self.opentransport() == False):
            return 0

        # send the script
        script_name = '/__jobscript_kill__%s.py' % self._code
        self._sftp.open(self.ganga_dir + script_name, 'w').write(script)

        # run the script
        stdout, stderr = self.run_remote_script(script_name, self.pre_script)

        # Copy the job object
        if stdout.find("***_FINISHED_***") != -1:
            status, outputdir, id, be = self.grabremoteinfo(stdout)

            if status == 'killed':
                return True

        return False

    def remove(self):
        """Remove the selected job from the remote site

           No arguments other than self

           Return value: True if job removed successfully,
                         or False otherwise"""

        script = """#!/usr/bin/env python
from __future__ import print_function
#-----------------------------------------------------
# This is a remove script for a remote job. It
# attempts to kill the given job and returns
#-----------------------------------------------------
import os,os.path,shutil,tempfile
import sys,popen2,time,traceback

############################################################################################
###INLINEMODULES###
############################################################################################

code = ###CODE###
jid = ###JOBID###

j = jobs( jid )
j.remove()
jobs( jid )

# print a finished token
print("***_FINISHED_***")
"""
        script = script.replace('###CODE###', repr(self._code))
        script = script.replace('###JOBID###', str(self.remote_job_id))

        # check for the connection
        if (self.opentransport() == False):
            return 0

        # send the script
        script_name = '/__jobscript_remove__%s.py' % self._code
        self._sftp.open(self.ganga_dir + script_name, 'w').write(script)

        # run the script
        stdout, stderr = self.run_remote_script(script_name, self.pre_script)

        # Copy the job object
        if stdout.find("***_FINISHED_***") != -1:
            return True

        return False

    def resubmit(self):
        """Resubmit the job.

           No arguments other than self

           Return value: 1 if job was resubmitted,
                         or 0 otherwise"""

        script = """#!/usr/bin/env python
from __future__ import print_function
#-----------------------------------------------------
# This is a resubmit script for a remote job. It
# attempts to kill the given job and returns
#-----------------------------------------------------
import os,os.path,shutil,tempfile
import sys,popen2,time,traceback

############################################################################################
###INLINEMODULES###
############################################################################################

code = ###CODE###
jid = ###JOBID###

j = jobs( jid )
j.resubmit()

# Start pickle token
print("***_START_PICKLE_***")

# pickle the job
import pickle
print(j.outputdir)
print(pickle.dumps(j._impl))
print(j)

# print a finished token
print("***_END_PICKLE_***")
print("***_FINISHED_***")
"""
        script = script.replace('###CODE###', repr(self._code))
        script = script.replace('###JOBID###', str(self.remote_job_id))

        # check for the connection
        if (self.opentransport() == False):
            return 0

        # send the script
        script_name = '/__jobscript_resubmit__%s.py' % self._code
        self._sftp.open(self.ganga_dir + script_name, 'w').write(script)

        # run the script
        stdout, stderr = self.run_remote_script(script_name, self.pre_script)

        # Copy the job object
        if stdout.find("***_FINISHED_***") != -1:
            status, outputdir, id, be = self.grabremoteinfo(stdout)

            if status == 'submitted' or status == 'running':
                return 1

        return 0

    def grabremoteinfo(self, out):
        """Extract (status, outputdir, id, backend) from the pickled job
        snapshot embedded between the START/END pickle tokens of the remote
        session's stdout.

        NOTE(review): pickle.loads() on data received from the remote host —
        only safe while the remote endpoint is trusted.
        """
        import pickle

        # Find the start and end of the pickle
        start = out.find("***_START_PICKLE_***") + len("***_START_PICKLE_***")
        stop = out.find("***_END_PICKLE_***")
        # first line after the token is the output directory, the rest is the pickle
        outputdir = out[start + 1:out.find("\n", start + 1) - 1]
        pickle_str = out[out.find("\n", start + 1) + 1:stop]

        # Now unpickle and recreate the job
        j = pickle.loads(pickle_str)

        return j.status, outputdir, j.id, j.backend

    def preparejob(self, jobconfig, master_input_sandbox):
        """Prepare the script to create the job on the remote host"""

        import tempfile
        workdir = tempfile.mkdtemp()
        job = self.getJobObject()

        script = """#!/usr/bin/env python
from __future__ import print_function
#-----------------------------------------------------
# This job wrapper script is automatically created by
# GANGA Remote backend handler.
#
# It controls:
# 1. unpack input sandbox
# 2. create the new job
# 3. submit it
#-----------------------------------------------------
import os,os.path,shutil,tempfile
import sys,popen2,time,traceback
import tarfile

############################################################################################
###INLINEMODULES###
############################################################################################

j = Job()

output_sandbox = ###OUTPUTSANDBOX###
input_sandbox = ###INPUTSANDBOX###
appexec = ###APPLICATIONEXEC###
appargs = ###APPLICATIONARGS###
back_end = ###BACKEND###
ganga_dir = ###GANGADIR###
code = ###CODE###
environment = ###ENVIRONMENT###
user_env = ###USERENV###

if user_env != None:
    for env_var in user_env:
        environment[env_var] = user_env[env_var]

j.outputsandbox = output_sandbox
j.backend = back_end

# Unpack the input sandboxes
shutil.move(os.path.expanduser(ganga_dir + "/__subjob_input_sbx__" + code), j.inputdir+"/__subjob_input_sbx__")
shutil.move(os.path.expanduser(ganga_dir + "/__master_input_sbx__" + code), j.inputdir+"/__master_input_sbx__")

# Add the files in the sandbox to the job
inputsbx = []
fullsbxlist = []
try:
    tar = tarfile.open(j.inputdir+"/__master_input_sbx__")
    filelist = tar.getnames()
    print(filelist)

    for f in filelist:
        fullsbxlist.append( f )
        inputsbx.append( j.inputdir + "/" + f )

except:
    print("Unable to open master input sandbox")

try:
    tar = tarfile.open(j.inputdir+"/__subjob_input_sbx__")
    filelist = tar.getnames()

    for f in filelist:
        fullsbxlist.append( f )
        inputsbx.append( j.inputdir + "/" + f )

except:
    print("Unable to open subjob input sandbox")

# sort out the path of the exe
if appexec in fullsbxlist:
    j.application = Executable ( exe = File(os.path.join(j.inputdir, appexec)), args = appargs, env = environment )
    print("Script found: %s" % appexec)
else:
    j.application = Executable ( exe = appexec, args = appargs, env = environment )

j.inputsandbox = inputsbx

getPackedInputSandbox(j.inputdir+"/__subjob_input_sbx__", j.inputdir + "/.")
getPackedInputSandbox(j.inputdir+"/__master_input_sbx__", j.inputdir + "/.")

# submit the job
j.submit()

# Start pickle token
print("***_START_PICKLE_***")

# pickle the job
import pickle
print(j.outputdir)
print(pickle.dumps(j._impl))

# print a finished token
print("***_END_PICKLE_***")
print("***_FINISHED_***")
"""
        import inspect
        import Ganga.Core.Sandbox as Sandbox
        script = script.replace('###ENVIRONMENT###', repr(jobconfig.env))
        script = script.replace('###USERENV###', repr(self.environment))
        script = script.replace('###INLINEMODULES###', inspect.getsource(Sandbox.WNSandbox))
        script = script.replace('###OUTPUTSANDBOX###', repr(jobconfig.outputbox))
        script = script.replace('###APPLICATIONEXEC###', repr(os.path.basename(jobconfig.getExeString())))
        script = script.replace('###APPLICATIONARGS###', repr(jobconfig.getArgStrings()))

        # get a string describing the required backend
        # NOTE(review): cStringIO is Python 2 only (io.StringIO in Python 3);
        # this method cannot run on Python 3 as written — confirm target.
        import cStringIO
        be_out = cStringIO.StringIO()
        job.backend.remote_backend.printTree(be_out, "copyable")
        be_str = be_out.getvalue()
        script = script.replace('###BACKEND###', be_str)

        script = script.replace('###GANGADIR###', repr(self.ganga_dir))
        script = script.replace('###CODE###', repr(self._code))

        # build a python-source list expression for the input sandbox paths
        sandbox_list = jobconfig.getSandboxFiles()
        str_list = "[ "
        for fname in sandbox_list:
            str_list += "j.inputdir + '/' + " + repr(os.path.basename(fname.name))
            str_list += ", "

        str_list += "j.inputdir + '/__master_input_sbx__' ]"
        script = script.replace('###INPUTSANDBOX###', str_list)

        return job.getInputWorkspace().writefile(FileBuffer('__jobscript__.py', script), executable=0)

    @staticmethod
    def updateMonitoringInformation(jobs):
        """Poll the remote sites for all given jobs: group jobs per
        host/user/gangadir, run one monitoring script per group, then unpickle
        each job snapshot from stdout, update the local job objects and fetch
        output files for completed/failed jobs."""

        # Send a script over to the remote site that updates this jobs
        # info with the info of the remote job
        import os
        import getpass

        # first, loop over the jobs and sort by host, username, gangadir and
        # pre_script
        jobs_sort = {}
        for j in jobs:
            host_str = j.backend.username + "@" + j.backend.host + ":" + \
                j.backend.ganga_dir + "+" + ';'.join(j.backend.pre_script)
            if host_str not in jobs_sort:
                jobs_sort[host_str] = []

            jobs_sort[host_str].append(j)

        for host_str in jobs_sort:
            # Create a ganga script that updates the job info for all jobs at
            # this remote site
            script = """#!/usr/bin/env python
from __future__ import print_function
#-----------------------------------------------------
# This is a monitoring script for a remote job. It
# outputs some useful job info and exits
#-----------------------------------------------------
import os,os.path,shutil,tempfile
import sys,popen2,time,traceback

############################################################################################
###INLINEMODULES###
############################################################################################

code = ###CODE###
jids = ###JOBID###

runMonitoring()

import pickle

for jid in jids:

    j = jobs( jid )

    # Start pickle token
    print("***_START_PICKLE_***")

    # pickle the job
    print(j.outputdir)
    print(pickle.dumps(j._impl))
    print(j)

    # print a finished token
    print("***_END_PICKLE_***")

print("***_FINISHED_***")
"""
            mj = jobs_sort[host_str][0]
            script = script.replace('###CODE###', repr(mj.backend._code))

            rem_ids = []
            for j in jobs_sort[host_str]:
                rem_ids.append(j.backend.remote_job_id)
            script = script.replace('###JOBID###', str(rem_ids))

            # check for the connection
            if (mj.backend.opentransport() == False):
                return 0

            # send the script
            script_name = '/__jobscript__%s.py' % mj.backend._code
            mj.backend._sftp.open(mj.backend.ganga_dir + script_name, 'w').write(script)

            # run the script
            stdout, stderr = mj.backend.run_remote_script(script_name, mj.backend.pre_script)

            # Copy the job object
            if stdout.find("***_FINISHED_***") != -1:
                start_pos = stdout.find("***_START_PICKLE_***")
                end_pos = stdout.find("***_END_PICKLE_***") + len("***_END_PICKLE_***")

                # walk over every pickle block in the combined stdout
                while start_pos != -1 and end_pos != -1:

                    pickle_str = stdout[start_pos:end_pos + 1]

                    status, outputdir, id, be = mj.backend.grabremoteinfo(pickle_str)

                    # find the job and update it
                    found = False
                    for j in jobs_sort[host_str]:

                        if (id == j.backend.remote_job_id):
                            found = True
                            if status != j.status:
                                j.updateStatus(status)

                            if hasattr(j.backend.remote_backend, 'exitcode'):
                                j.backend.exitcode = be.exitcode
                            if hasattr(j.backend.remote_backend, 'actualCE'):
                                j.backend.actualCE = be.actualCE

                            # mirror the remote backend schema onto the local copy
                            for o in be._schema.allItems():
                                exec("j.backend.remote_backend." + o[0] + " = be." + o[0])

                            # check for completed or failed and pull the output
                            # if required
                            if j.status == 'completed' or j.status == 'failed':

                                # we should have output, so get the file list
                                # first
                                filelist = j.backend._sftp.listdir(outputdir)

                                # go through and sftp them back
                                for fname in filelist:
                                    data = j.backend._sftp.open(outputdir + '/' + fname, 'r').read()
                                    open(j.outputdir + '/' + os.path.basename(fname), 'w').write(data)

                    if not found:
                        logger.warning("Couldn't match remote id %d with monitored job. Serious problems in Remote monitoring." % id)

                    start_pos = stdout.find("***_START_PICKLE_***", end_pos)
                    end_pos = stdout.find("***_END_PICKLE_***", end_pos) + len("***_END_PICKLE_***")

            # remove the script
            # NOTE(review): uses 'j' left over from the loops above rather
            # than 'mj' — works only because they share the same backend
            # host; confirm this is intentional.
            j.backend._sftp.remove(j.backend.ganga_dir + script_name)

        return None
class GridSandboxCache(GangaObject):

    '''
    Helper class for uploading/downloading/deleting sandbox files on a grid cache.

    Concrete transfer logic lives in the impl_* methods, which child classes
    must override; this base class provides the user-facing upload/download/
    delete/cleanup API plus bookkeeping of uploaded files in the job repository.

    @author: Hurng-Chun Lee
    @contact: [email protected]
    '''

    _schema = Schema(
        Version(1, 1), {
            'protocol': SimpleItem(defvalue='', copyable=1, doc='file transfer protocol'),
            'max_try': SimpleItem(defvalue=1, doc='max. number of tries in case of failures'),
            'timeout': SimpleItem(defvalue=180, copyable=0, hidden=1, doc='transfer timeout in seconds'),
            'uploaded_files': ComponentItem('GridFileIndex', defvalue=[], sequence=1, protected=1,
                                            copyable=0, hidden=1,
                                            doc='a repository record for the uploaded files')
        })

    _category = 'GridSandboxCache'
    _name = 'GridSandboxCache'
    _exportmethods = [
        'upload', 'download', 'delete', 'get_cached_files',
        'list_cached_files', 'cleanup'
    ]

    logger = getLogger()

    def __init__(self):
        super(GridSandboxCache, self).__init__()

    def upload(self, cred_req, files=[], opts=''):
        """
        Uploads multiple files to a remote grid storage.

        @param cred_req: credential requirement forwarded to impl_upload()
        @param files: a list of local files to be uploaded to the grid.
                      Each element can be a File object or a plain path string.
        @param opts: implementation-specific option string

        @return True if all files are successfully uploaded and bookkept;
                otherwise it returns False
        """
        # Normalise each entry to a file:// URL; unknown types are skipped
        # with a warning (and will make the length check below fail).
        paths = []
        for f in files:
            if getName(f) == 'File':
                paths.append('file://%s' % f.name)
            elif getName(f) == 'str':
                paths.append('file://%s' % f)
            else:
                self.logger.warning('unknown file expression: %s' % repr(f))

        uploaded_files = self.impl_upload(cred_req=cred_req, files=paths, opts=opts)

        # BUGFIX: this bookkeeping block appeared twice in the original code,
        # so impl_bookkeepUploadedFiles(..., append=True, ...) was invoked a
        # second time, registering the same files twice. Run it exactly once.
        if len(uploaded_files) == len(files):
            status = self.impl_bookkeepUploadedFiles(uploaded_files, append=True, opts=opts)
        else:
            status = False

        return status

    def download(self, cred_req, files=[], dest_dir=None, opts=''):
        """
        Downloads multiple files from remote grid storages to a local directory.

        If a file is successfully downloaded, the local file path would be:
          - os.path.join(dest_dir, os.path.basename(local_fname_n)

        @param cred_req: credential requirement forwarded to impl_download()
        @param files: a list of file indexes (e.g. GUIDs) of files to be
                      downloaded from the grid:
                      [index_grid_file_1, index_grid_file_2, ...]
        @param dest_dir: local destination directory for the downloaded files
        @param opts: implementation-specific option string

        @return True if all files are successfully downloaded; otherwise False
        """
        myFiles = self.__get_file_index_objects__(files)
        downloadedFiles = self.impl_download(cred_req=cred_req, files=myFiles,
                                             dest_dir=dest_dir, opts=opts)

        if len(downloadedFiles) == len(myFiles):
            return True

        self.logger.warning('some files not successfully downloaded')
        return False

    def delete(self, cred_req, files=[], opts=''):
        """
        Deletes multiple files from remote grid storages.

        @param cred_req: credential requirement forwarded to impl_delete()
        @param files: a list of file indexes (e.g. GUIDs) of files to be
                      deleted from the grid:
                      [index_grid_file_1, index_grid_file_2, ...]
        @param opts: implementation-specific option string

        @return True if all files are successfully deleted; otherwise False
        """
        myFiles = self.__get_file_index_objects__(files)
        deletedFiles = self.impl_delete(cred_req=cred_req, files=myFiles, opts=opts)

        if len(deletedFiles) == len(myFiles):
            return True

        self.logger.warning('some files not successfully deleted')
        return False

    def cleanup(self, cred_req, opts=''):
        """
        Cleans up the uploaded files.

        @return True if all grid files listed in the index file are
                successfully deleted.
        """
        # Delete everything currently recorded as cached.
        f_ids = [f.id for f in self.get_cached_files()]
        return self.delete(cred_req=cred_req, files=f_ids)

    def get_cached_files(self, opts=''):
        """
        Gets the indexes of the uploaded files on the grid.

        @return the list of GridFileIndex records of previously uploaded
                files, keyed by their main index (e.g. GUID).
        """
        return self.impl_getUploadedFiles(opts=opts)

    def list_cached_files(self, loop=True, opts=''):
        """
        Lists the uploaded files as a formatted (optionally colored) string.

        if loop = True, it prints also the uploaded files associated with
        subjobs.
        """
        fc = 0
        ds = ''
        doColoring = True

        fg = Foreground()
        fx = Effects()

        # Coloring by cache-entry status derived from the owning job's status.
        status_colors = {'inuse': fg.orange, 'free': fg.blue, 'gone': fg.red}
        status_mapping = {
            'new': 'inuse',
            'submitted': 'inuse',
            'submitting': 'inuse',
            'running': 'inuse',
            'completed': 'free',
            'completing': 'free',
            'failed': 'free',
            'killed': 'free'
        }

        if doColoring:
            markup = ANSIMarkup()
        else:
            markup = NoMarkup()

        def __markup_by_status__(fileIndex, counter, status):
            # one line per cached file: counter, name, status, index
            fmtStr = '\n%4d\t%-30s\t%-12s\t%s' % (counter, fileIndex.name, status, fileIndex.id)
            try:
                return markup(fmtStr, status_colors[status])
            except KeyError:
                return markup(fmtStr, fx.normal)

        j = self.getJobObject()

        for f in self.get_cached_files(opts=opts):
            my_status = 'unknown'
            if j:
                try:
                    my_status = status_mapping[j.status]
                except KeyError:
                    pass
            ds += __markup_by_status__(f, fc, my_status)
            fc += 1

        if j and loop:
            for sj in j.subjobs:
                for f in sj.backend.sandboxcache.get_cached_files(opts=opts):
                    my_status = 'unknown'
                    try:
                        my_status = status_mapping[sj.status]
                    except KeyError:
                        pass
                    ds += __markup_by_status__(f, fc, my_status)
                    fc += 1

        return ds

    # methods to be implemented in the child classes

    def impl_upload(self, cred_req, files=[], opts=''):
        """
        Uploads multiple files to a remote grid storage.

        @param files is a list of files in URL format (i.e. file://...)
        @return a list of successfully uploaded files represented by
                GridFileIndex objects
        """
        raise NotImplementedError

    def impl_download(self, cred_req, files=[], dest_dir=None, opts=''):
        """
        Downloads multiple files from remote grid storages to a local directory.

        @param files is a list of files represented by GridFileIndex objects
        @param dest_dir is a local destination directory to store the
               downloaded files.
        @return a list of successfully downloaded files represented by
                GridFileIndex objects
        """
        raise NotImplementedError

    def impl_delete(self, cred_req, files=[], opts=''):
        """
        Deletes multiple files from remote grid storages.

        @param files is a list of files represented by GridFileIndex objects
        @return a list of successfully deleted files represented by
                GridFileIndex objects
        """
        raise NotImplementedError

    def impl_bookkeepUploadedFiles(self, files=[], append=True, opts=''):
        """
        Basic implementation for bookkeeping the uploaded files.
        It simply keeps the GridFileIndex objects in the job repository.

        @param files is a list of files represented by GridFileIndex objects
        @return True if files are successfully logged in the local index file
        """
        # NOTE(review): the base implementation ignores 'append' and simply
        # replaces the record list; subclasses may honour it differently.
        self.uploaded_files = files
        return True

    def impl_getUploadedFiles(self, opts=''):
        """
        Basic implementation for getting the previously uploaded files from
        the job repository.

        @return a list of files represented by GridFileIndex objects
        """
        return self.uploaded_files

    # private methods

    def __get_file_index_objects__(self, files=[]):
        '''Gets file index objects according to the given file index list -
        resolves each index against the locally cached GridFileIndex records.

        @param files is a list of file indexes
        @return a list of files represented by GridFileIndex objects
        '''
        cachedFiles = self.get_cached_files()
        return [f for f in cachedFiles if f.id in files]

    def __get_unique_fname__(self):
        '''gets an unique filename'''
        return 'user.%s' % (get_uuid())

    def __cmd_retry_loop__(self, shell, cmd, maxRetry=3):
        '''Executes a system command with up to maxRetry attempts.

        Exit codes 0 and 255 are treated as "done" (matching the shell's
        allowed_exit list); anything else triggers a retry with a warning.
        '''
        i = 0
        rc = 0
        output = None
        m = None

        try_again = True
        while try_again:
            i = i + 1
            self.logger.debug('run cmd: %s' % cmd)
            rc, output, m = shell.cmd1(cmd, allowed_exit=[0, 255])
            if rc in [0, 255]:
                try_again = False
            elif i == maxRetry:
                try_again = False
            else:
                self.logger.warning("trial %d: error: %s" % (i, output))

        return (rc, output, m)
class BKTestQuery(BKQuery):

    """Bookkeeping query that releases its dataset in small batches.

    Extends BKQuery with a 'dataset' that is grown incrementally: each call
    to getDataset() releases up to 'filesToRelease' more files from the full
    bookkeeping result ('fulldataset'), tracked by 'fulldatasetptr'.
    """

    # Historical schema sketch kept for reference (superseded by the
    # inherit_copy() construction below):
    ## schema = {}
    ## docstr = 'Bookkeeping query path (type dependent)'
    ## schema['path'] = SimpleItem(defvalue='' ,doc=docstr)
    ## docstr = 'Start date string yyyy-mm-dd (only works for type="RunsByDate")'
    ## schema['startDate'] = SimpleItem(defvalue='' ,doc=docstr)
    ## docstr = 'End date string yyyy-mm-dd (only works for type="RunsByDate")'
    ## schema['endDate'] = SimpleItem(defvalue='' ,doc=docstr)
    ## docstr = 'Data quality flag (string or list of strings).'
    # schema['dqflag'] = SimpleItem(defvalue='All',typelist=['str','list'],
    # doc=docstr)
    ## docstr = 'Type of query (Path, RunsByDate, Run, Production)'
    ## schema['type'] = SimpleItem(defvalue='Path',doc=docstr)
    # docstr = 'Selection criteria: Runs, ProcessedRuns, NotProcessed (only \
    # works for type="RunsByDate")'
    ## schema['selection'] = SimpleItem(defvalue='',doc=docstr)

    # Inherit the parent schema and add the incremental-release bookkeeping.
    _schema = BKQuery._schema.inherit_copy()
    _schema.datadict['dataset'] = ComponentItem('datasets',
                                                defvalue=None,
                                                optional=1,
                                                load_default=False,
                                                doc='dataset',
                                                hidden=0)
    # Full (hidden) query result from which files are gradually released.
    _schema.datadict['fulldataset'] = ComponentItem('datasets',
                                                    defvalue=None,
                                                    optional=1,
                                                    load_default=False,
                                                    doc='dataset',
                                                    hidden=1)
    # NOTE(review): 'typeList' below differs from the 'typelist' spelling used
    # elsewhere in this file — confirm SimpleItem actually honours this kwarg.
    _schema.datadict['fulldatasetptr'] = SimpleItem(
        defvalue=0,
        optional=0,
        load_default=True,
        doc='dataset position pointer',
        hidden=1,
        typeList=['int'])
    _schema.datadict['filesToRelease'] = SimpleItem(
        defvalue=3,
        optional=0,
        load_default=True,
        doc='number of files to release at a time',
        hidden=0,
        typeList=['int'])

    _category = 'query'
    _name = "BKTestQuery"
    _exportmethods = BKQuery._exportmethods
    _exportmethods += ['removeData']

    def getDataset(self):
        """Return the partially-released dataset, growing it by up to
        'filesToRelease' files on each call.

        The full bookkeeping result is fetched once (lazily) into
        'fulldataset'; 'fulldatasetptr' records how many files have been
        released into 'dataset' so far.
        """
        # Lazily fetch the complete query result exactly once.
        if self.fulldataset is None:
            self.fulldataset = LHCbDataset(
                super(BKTestQuery, self).getDataset().files)

        if self.dataset is None:
            # First release: take the initial batch.
            self.dataset = LHCbDataset(
                self.fulldataset.files[:self.filesToRelease])
            self.fulldatasetptr = self.filesToRelease
        else:
            # Subsequent releases: append the next batch and advance the
            # pointer (slicing past the end simply yields fewer/no files).
            self.dataset.files += self.fulldataset.files[
                self.fulldatasetptr:self.fulldatasetptr + self.filesToRelease]
            self.fulldatasetptr += self.filesToRelease

        return self.dataset

    def removeData(self):
        """Drop the first file from the released dataset (no-op if empty)."""
        if len(self.dataset):
            del self.dataset.files[0]
class ITask(GangaObject):
    """This is the framework of a task without special properties"""
    _schema = Schema(
        Version(1, 0), {
            'transforms': ComponentItem('transforms', defvalue=[], sequence=1, copyable=0,
                                        doc='list of transforms'),
            'id': SimpleItem(defvalue=-1, protected=1, doc='ID of the Task', typelist=[int]),
            'name': SimpleItem(defvalue='NewTask', copyable=1, doc='Name of the Task',
                               typelist=[str]),
            'comment': SimpleItem('', protected=0, doc='comment of the task', typelist=[str]),
            'status': SimpleItem(defvalue='new', protected=1,
                                 doc='Status - new, running, pause or completed',
                                 typelist=[str]),
            'float': SimpleItem(defvalue=0, copyable=1,
                                doc='Number of Jobs run concurrently', typelist=[int]),
            'metadata': ComponentItem('metadata', defvalue=MetadataDict(),
                                      doc='the metadata', protected=1),
            'creation_date': SimpleItem(defvalue="19700101", copyable=0, protected=1,
                                        doc='Creation date of the task', typelist=[str]),
            'check_all_trfs': SimpleItem(
                defvalue=True,
                doc='Check all Transforms during each monitoring loop cycle'),
        })

    _category = 'tasks'
    _name = 'ITask'
    # BUGFIX: 'n_all' was listed twice in the original export list.
    _exportmethods = [
        'run', 'appendTransform', 'overview', 'getJobs', 'remove', 'clone',
        'pause', 'check', 'setBackend', 'setParameter', 'insertTransform',
        'removeTransform', 'table', 'resetUnitsByStatus', 'removeUnusedJobs',
        'n_all', 'n_status'
    ]
    _tasktype = "ITask"

    default_registry = "tasks"

    # Special methods:
    def _auto__init__(self, registry=None):
        """Register this task in the registry and initialise its state."""
        if registry is None:
            from Ganga.Core.GangaRepository import getRegistry
            registry = getRegistry(self.default_registry)
        # register the job (it will also commit it)
        # job gets its id now
        registry._add(self)
        self.creation_date = time.strftime('%Y%m%d%H%M%S')
        self.startup()
        self.status = 'new'

    def startup(self):
        """Startup function on Ganga startup"""
        for t in self.transforms:
            t.startup()

    def getTransform(self, trf):
        """Get transform using either index or name.

        @param trf: transform name (str) or index (int)
        @return the matching transform, or None (with a warning) if not found
        """
        if isinstance(trf, str):
            for trfid in range(0, len(self.transforms)):
                if trf == self.transforms[trfid].name:
                    return self.transforms[trfid]
            logger.warning("Couldn't find transform with name '%s'." % trf)
        elif isinstance(trf, int):
            # BUGFIX: the original condition used 'and' with '>', which is
            # never true, so out-of-range indices raised IndexError instead
            # of producing the intended warning.
            if trf < 0 or trf >= len(self.transforms):
                logger.warning("Transform number '%d' out of range" % trf)
            else:
                return self.transforms[trf]
        else:
            logger.warning(
                'Incorrect type for transform referral. Allowed types are int or string.'
            )
        return None

    def update(self):
        """Called by the monitoring thread. Base class just calls update on each Transform"""
        # if we're new, then do nothing
        if self.status == "new":
            return

        # loop over all transforms and call update
        for trf in self.transforms:
            if trf.status != "running":
                continue

            if trf.update() and not self.check_all_trfs:
                break

        # update status and check
        self.updateStatus()

    # Public methods:
    #
    # - remove() a task
    # - clone() a task
    # - check() a task (if updated)
    # - run() a task to start processing
    # - pause() to interrupt processing
    # - setBackend(be) for all transforms
    # - setParameter(myParam=True) for all transforms
    # - insertTransform(id, tf) insert a new processing step
    # - removeTransform(id) remove a processing step

    def remove(self, remove_jobs="do_nothing"):
        """Delete the task"""

        # make sure the task isn't running
        if self.status.find("running") != -1:
            logger.error(
                "Task is still running. Please pause before removing!")
            return

        # require an explicit True/False before actually deleting anything
        if remove_jobs not in [True, False]:
            logger.info("You want to remove the task %i named '%s'." %
                        (self.id, self.name))
            logger.info(
                "Since this operation cannot be easily undone, please call this command again:"
            )
            logger.info(
                " * as tasks(%i).remove(remove_jobs=True) if you want to remove all associated jobs,"
                % (self.id))
            logger.info(
                " * as tasks(%i).remove(remove_jobs=False) if you want to keep the jobs."
                % (self.id))
            return

        if remove_jobs:
            for trf in self.transforms:
                for unit in trf.units:
                    # remove both the active and the previous jobs of the unit;
                    # failures are logged and ignored (best-effort cleanup)
                    for jid in unit.active_job_ids:
                        try:
                            j = getJobByID(jid)
                            j.remove()
                        except Exception as err:
                            logger.debug("Remove Err: %s" % str(err))
                            pass

                    for jid in unit.prev_job_ids:
                        try:
                            j = getJobByID(jid)
                            j.remove()
                        except Exception as err2:
                            logger.debug("Remove Err2: %s" % str(err2))
                            pass

        self._getRegistry()._remove(self, auto_removed=1)
        logger.info("Task #%s deleted" % self.id)

    def clone(self):
        """Clone this task; all cloned transforms are reset to 'new'."""
        c = super(ITask, self).clone()
        for tf in c.transforms:
            tf.status = "new"
        c.check()
        return c

    def check(self):
        """This function is called by run() or manually by the user"""
        if self.status != "new":
            logger.error(
                "The check() function may modify a task and can therefore only be called on new tasks!"
            )
            return
        try:
            for t in self.transforms:
                t.check()
        finally:
            self.updateStatus()
        return True

    def run(self):
        """Confirms that this task is fully configured and ready to be run."""
        if self.status == "new":
            self.check()

        if self.status != "completed":
            if self.float == 0:
                logger.warning(
                    "The 'float', the number of jobs this task may run, is still zero. Type 'tasks(%i).float = 5' to allow this task to submit 5 jobs at a time"
                    % self.id)
            try:
                for tf in self.transforms:
                    if tf.status != "completed":
                        tf.run(check=False)
            finally:
                self.updateStatus()
        else:
            logger.info("Task is already completed!")

    def pause(self):
        """Pause the task - the background thread will not submit new jobs from this task"""
        float_cache = self.float
        self.float = 0
        if self.status != "completed":
            for tf in self.transforms:
                tf.pause()
            self.status = "pause"
        else:
            logger.info("Transform is already completed!")
            self.float = float_cache

    def insertTransform(self, id, tf):
        """Insert transfrm tf before index id (counting from 0)"""
        if self.status != "new" and id < len(self.transforms):
            logger.error(
                "You can only insert transforms at the end of the list. Only if a task is new it can be freely modified!"
            )
            return
        # self.transforms.insert(id,tf.copy()) # this would be safer, but
        # breaks user exspectations
        # this means that t.insertTransform(0,t2.transforms[0]) will cause
        # Great Breakage
        self.transforms.insert(id, tf)
        stripProxy(tf).id = id

    def appendTransform(self, tf):
        """Append transform"""
        return self.insertTransform(len(self.transforms), tf)

    def removeTransform(self, id):
        """Remove the transform with the index id (counting from 0)"""
        if self.status != "new":
            logger.error("You can only remove transforms if the task is new!")
            return
        del self.transforms[id]

    def getJobs(self):
        """ Get the job slice of all jobs that process this task """
        jobslice = JobRegistrySlice("tasks(%i).getJobs()" % (self.id))
        for trf in self.transforms:
            for jid in trf.getJobs():
                jobslice.objects[getJobByID(jid).fqid] = stripProxy(
                    getJobByID(jid))
        return JobRegistrySliceProxy(jobslice)

    # Internal methods
    def updateStatus(self):
        """Updates status based on transform status.
           Called from check() or if status of a transform changes"""
        # Calculate status from transform status:
        states = [tf.status for tf in self.transforms]
        if "running" in states and "pause" in states:
            new_status = "running/pause"
        elif "running" in states:
            new_status = "running"
        elif "pause" in states:
            new_status = "pause"
        elif "new" in states:
            new_status = "new"
        elif "completed" in states:
            new_status = "completed"
        else:
            new_status = "new"  # no tranforms

        # Handle status changes here:
        if self.status != new_status:
            if new_status == "running/pause":
                logger.info(
                    "Some Transforms of Task %i '%s' have been paused. Check tasks.table() for details!"
                    % (self.id, self.name))
            elif new_status == "completed":
                logger.info("Task %i '%s' has completed!" %
                            (self.id, self.name))
            elif self.status == "completed":
                logger.warning("Task %i '%s' has been reopened!"
                               % (self.id, self.name))
        self.status = new_status
        return self.status

    # Information methods
    def n_tosub(self):
        """Number of job submissions still allowed by the current float."""
        return self.float - sum([t.n_active() for t in self.transforms])

    def n_all(self):
        """Total number of jobs over all transforms."""
        return sum([t.n_all() for t in self.transforms])

    def n_status(self, status):
        """Number of jobs in the given status over all transforms."""
        return sum([t.n_status(status) for t in self.transforms])

    def table(self):
        """Return the registry table row for this task."""
        from Ganga.Core.GangaRepository import getRegistryProxy
        # BUGFIX: the original computed the table and discarded it
        # (implicitly returning None); return it to the caller.
        t = getRegistryProxy('tasks').table(id=self.id)
        return t

    def overview(self, status=''):
        """ Show an overview of the Task """
        if status and not status in [
                'bad', 'hold', 'running', 'completed', 'new'
        ]:
            logger.error(
                "Not a valid status for unitOverview. Possible options are: 'bad', 'hold', 'running', 'completed', 'new'."
            )
            return

        print(
            "Lists the units in each transform and give the state of the subjobs"
        )
        print('')
        print(" " * 41 + "Active\tSub\tRun\tComp\tFail\tMinor\tMajor")
        for trfid in range(0, len(self.transforms)):
            print(
                "----------------------------------------------------------------------------------------------------------------------"
            )
            print("---- Transform %d: %s" %
                  (trfid, self.transforms[trfid].name))
            print('')
            self.transforms[trfid].overview(status)
            print('')

    def info(self):
        """Print info for each transform."""
        for t in self.transforms:
            t.info()

    def help(self):
        print("This is a Task without special properties")

    def resetUnitsByStatus(self, status='bad'):
        """Reset all units of the given status"""
        for trf in self.transforms:
            trf.resetUnitsByStatus(status)

    def removeUnusedJobs(self):
        """Remove any unused jobs"""
        for trf in self.transforms:
            trf.removeUnusedJobs()
class ARC(IBackend): '''ARC backend - direct job submission to an ARC CE''' _schema = Schema( Version(1, 0), { 'CE': SimpleItem(defvalue='', doc='ARC CE endpoint'), 'jobtype': SimpleItem(defvalue='Normal', doc='Job type: Normal, MPICH'), 'requirements': ComponentItem('LCGRequirements', doc='Requirements for the resource selection'), 'sandboxcache': ComponentItem( 'GridSandboxCache', copyable=1, doc='Interface for handling oversized input sandbox'), 'id': SimpleItem(defvalue='', typelist=[str, list], protected=1, copyable=0, doc='Middleware job identifier'), 'status': SimpleItem(defvalue='', typelist=[str, dict], protected=1, copyable=0, doc='Middleware job status'), 'exitcode': SimpleItem(defvalue='', protected=1, copyable=0, doc='Application exit code'), 'exitcode_arc': SimpleItem(defvalue='', protected=1, copyable=0, doc='Middleware exit code'), 'actualCE': SimpleItem(defvalue='', protected=1, copyable=0, doc='The ARC CE where the job actually runs.'), 'reason': SimpleItem(defvalue='', protected=1, copyable=0, doc='Reason of causing the job status'), 'workernode': SimpleItem(defvalue='', protected=1, copyable=0, doc='The worker node on which the job actually runs.'), 'isbURI': SimpleItem(defvalue='', protected=1, copyable=0, doc='The input sandbox URI on ARC CE'), 'osbURI': SimpleItem(defvalue='', protected=1, copyable=0, doc='The output sandbox URI on ARC CE'), 'verbose': SimpleItem(defvalue=False, doc='Use verbose options for ARC commands') }) _category = 'backends' _name = 'ARC' def __init__(self): super(ARC, self).__init__() # dynamic requirement object loading try: reqName1 = config['Requirements'] reqName = config['Requirements'].split('.').pop() reqModule = __import__(reqName1, globals(), locals(), [reqName1]) reqClass = vars(reqModule)[reqName] self.requirements = reqClass() logger.debug('load %s as LCGRequirements' % reqName) except: logger.debug('load default LCGRequirements') pass # dynamic sandbox cache object loading # force to use 
GridftpSandboxCache self.sandboxcache = GridftpSandboxCache() try: scName1 = config['SandboxCache'] scName = config['SandboxCache'].split('.').pop() scModule = __import__(scName1, globals(), locals(), [scName1]) scClass = vars(scModule)[scName] self.sandboxcache = scClass() logger.debug('load %s as SandboxCache' % scName) except: logger.debug('load default SandboxCache') pass def __refresh_jobinfo__(self, job): '''Refresh the lcg jobinfo. It will be called after resubmission.''' job.backend.status = '' job.backend.reason = '' job.backend.actualCE = '' job.backend.exitcode = '' job.backend.exitcode_arc = '' job.backend.workernode = '' job.backend.isbURI = '' job.backend.osbURI = '' def __setup_sandboxcache__(self, job): '''Sets up the sandbox cache object to adopt the runtime configuration of the LCG backend''' re_token = re.compile('^token:(.*):(.*)$') self.sandboxcache.vo = config['VirtualOrganisation'] self.sandboxcache.timeout = config['SandboxTransferTimeout'] if self.sandboxcache._name == 'LCGSandboxCache': if not self.sandboxcache.lfc_host: self.sandboxcache.lfc_host = Grid.__get_lfc_host__() if not self.sandboxcache.se: token = '' se_host = config['DefaultSE'] m = re_token.match(se_host) if m: token = m.group(1) se_host = m.group(2) self.sandboxcache.se = se_host if token: self.sandboxcache.srm_token = token if (self.sandboxcache.se_type in ['srmv2']) and (not self.sandboxcache.srm_token): self.sandboxcache.srm_token = config['DefaultSRMToken'] return True def __check_and_prestage_inputfile__(self, file): '''Checks the given input file size and if it's size is over "BoundSandboxLimit", prestage it to a grid SE. The argument is a path of the local file. It returns a dictionary containing information to refer to the file: idx = {'lfc_host': lfc_host, 'local': [the local file pathes], 'remote': {'fname1': 'remote index1', 'fname2': 'remote index2', ... } } If prestaging failed, None object is returned. 
If the file has been previously uploaded (according to md5sum), the prestaging is ignored and index to the previously uploaded file is returned. ''' idx = {'lfc_host': '', 'local': [], 'remote': {}} job = self.getJobObject() # read-in the previously uploaded files uploadedFiles = [] # getting the uploaded file list from the master job if job.master: uploadedFiles += job.master.backend.sandboxcache.get_cached_files() # set and get the $LFC_HOST for uploading oversized sandbox self.__setup_sandboxcache__(job) uploadedFiles += self.sandboxcache.get_cached_files() lfc_host = None # for LCGSandboxCache, take the one specified in the sansboxcache object. # the value is exactly the same as the one from the local grid shell env. if # it is not specified exclusively. if self.sandboxcache._name == 'LCGSandboxCache': lfc_host = self.sandboxcache.lfc_host # or in general, query it from the Grid object if not lfc_host: lfc_host = Grid.__get_lfc_host__() idx['lfc_host'] = lfc_host abspath = os.path.abspath(file) fsize = os.path.getsize(abspath) if fsize > config['BoundSandboxLimit']: md5sum = get_md5sum(abspath, ignoreGzipTimestamp=True) doUpload = True for uf in uploadedFiles: if uf.md5sum == md5sum: # the same file has been uploaded to the iocache idx['remote'][os.path.basename(file)] = uf.id doUpload = False break if doUpload: logger.warning( 'The size of %s is larger than the sandbox limit (%d byte). Please wait while pre-staging ...' % (file, config['BoundSandboxLimit'])) if self.sandboxcache.upload([abspath]): remote_sandbox = self.sandboxcache.get_cached_files()[-1] idx['remote'][remote_sandbox.name] = remote_sandbox.id else: logger.error( 'Oversized sandbox not successfully pre-staged') return None else: idx['local'].append(abspath) return idx def __mt_job_prepare__(self, rjobs, subjobconfigs, masterjobconfig): '''preparing jobs in multiple threads''' logger.warning('preparing %d subjobs ... it may take a while' % len(rjobs)) # prepare the master job (i.e. 
create shared inputsandbox, etc.) master_input_sandbox = IBackend.master_prepare(self, masterjobconfig) # uploading the master job if it's over the WMS sandbox limitation for f in master_input_sandbox: master_input_idx = self.__check_and_prestage_inputfile__(f) if not master_input_idx: logger.error('master input sandbox perparation failed: %s' % f) return None # the algorithm for preparing a single bulk job class MyAlgorithm(Algorithm): def __init__(self): Algorithm.__init__(self) def process(self, sj_info): my_sc = sj_info[0] my_sj = sj_info[1] try: logger.debug("preparing job %s" % my_sj.getFQID('.')) jdlpath = my_sj.backend.preparejob(my_sc, master_input_sandbox) if (not jdlpath) or (not os.path.exists(jdlpath)): raise GangaException('job %s not properly prepared' % my_sj.getFQID('.')) self.__appendResult__(my_sj.id, jdlpath) return True except Exception as x: log_user_exception() return False mt_data = [] for sc, sj in zip(subjobconfigs, rjobs): mt_data.append([sc, sj]) myAlg = MyAlgorithm() myData = Data(collection=mt_data) runner = MTRunner(name='lcg_jprepare', algorithm=myAlg, data=myData, numThread=10) runner.start() runner.join(-1) if len(runner.getDoneList()) < len(mt_data): return None else: # return a JDL file dictionary with subjob ids as keys, JDL file # paths as values return runner.getResults() def __mt_bulk_submit__(self, node_jdls): '''submitting jobs in multiple threads''' job = self.getJobObject() logger.warning('submitting %d subjobs ... 
it may take a while' % len(node_jdls)) # the algorithm for submitting a single bulk job class MyAlgorithm(Algorithm): def __init__(self, masterInputWorkspace, ce, arcverbose): Algorithm.__init__(self) self.inpw = masterInputWorkspace self.ce = ce self.arcverbose = arcverbose def process(self, jdl_info): my_sj_id = jdl_info[0] my_sj_jdl = jdl_info[1] #my_sj_jid = self.gridObj.arc_submit(my_sj_jdl, self.ce, self.verbose) my_sj_jid = Grid.arc_submit(my_sj_jdl, self.ce, self.arcverbose) if not my_sj_jid: return False else: self.__appendResult__(my_sj_id, my_sj_jid) return True mt_data = [] for id, jdl in node_jdls.items(): mt_data.append((id, jdl)) myAlg = MyAlgorithm(masterInputWorkspace=job.getInputWorkspace(), ce=self.CE, arcverbose=self.verbose) myData = Data(collection=mt_data) runner = MTRunner(name='arc_jsubmit', algorithm=myAlg, data=myData, numThread=config['SubmissionThread']) runner.start() runner.join(timeout=-1) if len(runner.getDoneList()) < len(mt_data): # not all bulk jobs are successfully submitted. canceling the # submitted jobs on WMS immediately logger.error( 'some bulk jobs not successfully (re)submitted, canceling submitted jobs on WMS' ) Grid.arc_cancelMultiple(runner.getResults().values()) return None else: return runner.getResults() def __jobWrapperTemplate__(self): '''Create job wrapper''' script = """#!/usr/bin/env python #----------------------------------------------------- # This job wrapper script is automatically created by # GANGA LCG backend handler. # # It controls: # 1. unpack input sandbox # 2. invoke application executable # 3. 
invoke monitoring client #----------------------------------------------------- import os,os.path,shutil,tempfile import sys,popen2,time,traceback #bugfix #36178: subprocess.py crashes if python 2.5 is used #try to import subprocess from local python installation before an #import from PYTHON_DIR is attempted some time later try: import subprocess except ImportError: pass ## Utility functions ## def timeString(): return time.strftime('%a %b %d %H:%M:%S %Y',time.gmtime(time.time())) def printInfo(s): out.write(timeString() + ' [Info]' + ' ' + str(s) + os.linesep) out.flush() def printError(s): out.write(timeString() + ' [Error]' + ' ' + str(s) + os.linesep) out.flush() def lcg_file_download(vo,guid,localFilePath,timeout=60,maxRetry=3): cmd = 'lcg-cp -t %d --vo %s %s file://%s' % (timeout,vo,guid,localFilePath) printInfo('LFC_HOST set to %s' % os.environ['LFC_HOST']) printInfo('lcg-cp timeout: %d' % timeout) i = 0 rc = 0 isDone = False try_again = True while try_again: i = i + 1 try: ps = os.popen(cmd) status = ps.close() if not status: isDone = True printInfo('File %s download from iocache' % os.path.basename(localFilePath)) else: raise IOError("Download file %s from iocache failed with error code: %d, trial %d." 
% (os.path.basename(localFilePath), status, i)) except IOError as e: isDone = False printError(str(e)) if isDone: try_again = False elif i == maxRetry: try_again = False else: try_again = True return isDone ## system command executor with subprocess def execSyscmdSubprocess(cmd, wdir=os.getcwd()): import os, subprocess global exitcode outfile = file('stdout','w') errorfile = file('stderr','w') try: child = subprocess.Popen(cmd, cwd=wdir, shell=True, stdout=outfile, stderr=errorfile) while 1: exitcode = child.poll() if exitcode is not None: break else: outfile.flush() errorfile.flush() time.sleep(0.3) finally: pass outfile.flush() errorfile.flush() outfile.close() errorfile.close() return True ## system command executor with multi-thread ## stderr/stdout handler def execSyscmdEnhanced(cmd, wdir=os.getcwd()): import os, threading cwd = os.getcwd() isDone = False try: ## change to the working directory os.chdir(wdir) child = popen2.Popen3(cmd,1) child.tochild.close() # don't need stdin class PipeThread(threading.Thread): def __init__(self,infile,outfile,stopcb): self.outfile = outfile self.infile = infile self.stopcb = stopcb self.finished = 0 threading.Thread.__init__(self) def run(self): stop = False while not stop: buf = self.infile.read(10000) self.outfile.write(buf) self.outfile.flush() time.sleep(0.01) stop = self.stopcb() #FIXME: should we do here?: self.infile.read() #FIXME: this is to make sure that all the output is read (if more than buffer size of output was produced) self.finished = 1 def stopcb(poll=False): global exitcode if poll: exitcode = child.poll() return exitcode != -1 out_thread = PipeThread(child.fromchild, sys.stdout, stopcb) err_thread = PipeThread(child.childerr, sys.stderr, stopcb) out_thread.start() err_thread.start() while not out_thread.finished and not err_thread.finished: stopcb(True) time.sleep(0.3) sys.stdout.flush() sys.stderr.flush() isDone = True except(Exception,e): isDone = False ## return to the original directory os.chdir(cwd) 
return isDone ############################################################################################ ###INLINEMODULES### ############################################################################################ ## Main program ## outputsandbox = ###OUTPUTSANDBOX### input_sandbox = ###INPUTSANDBOX### wrapperlog = ###WRAPPERLOG### appexec = ###APPLICATIONEXEC### appargs = ###APPLICATIONARGS### appenvs = ###APPLICATIONENVS### timeout = ###TRANSFERTIMEOUT### exitcode=-1 import sys, stat, os, os.path, commands # Change to scratch directory if provided scratchdir = '' tmpdir = '' orig_wdir = os.getcwd() # prepare log file for job wrapper out = open(os.path.join(orig_wdir, wrapperlog),'w') if os.getenv('EDG_WL_SCRATCH'): scratchdir = os.getenv('EDG_WL_SCRATCH') elif os.getenv('TMPDIR'): scratchdir = os.getenv('TMPDIR') if scratchdir: (status, tmpdir) = commands.getstatusoutput('mktemp -d %s/gangajob_XXXXXXXX' % (scratchdir)) if status == 0: os.chdir(tmpdir) else: ## if status != 0, tmpdir should contains error message so print it to stderr printError('Error making ganga job scratch dir: %s' % tmpdir) printInfo('Unable to create ganga job scratch dir in %s. Run directly in: %s' % ( scratchdir, os.getcwd() ) ) ## reset scratchdir and tmpdir to disable the usage of Ganga scratch dir scratchdir = '' tmpdir = '' wdir = os.getcwd() if scratchdir: printInfo('Changed working directory to scratch directory %s' % tmpdir) try: os.system("ln -s %s %s" % (os.path.join(orig_wdir, 'stdout'), os.path.join(wdir, 'stdout'))) os.system("ln -s %s %s" % (os.path.join(orig_wdir, 'stderr'), os.path.join(wdir, 'stderr'))) except Exception as e: printError(sys.exc_info()[0]) printError(sys.exc_info()[1]) str_traceback = traceback.format_tb(sys.exc_info()[2]) for str_tb in str_traceback: printError(str_tb) printInfo('Linking stdout & stderr to original directory failed. 
Looking at stdout during job run may not be possible') os.environ['PATH'] = '.:'+os.environ['PATH'] vo = os.environ['GANGA_LCG_VO'] try: printInfo('Job Wrapper start.') # download inputsandbox from remote cache for f,guid in input_sandbox['remote'].iteritems(): if not lcg_file_download(vo, guid, os.path.join(wdir,f), timeout=int(timeout)): raise IOError('Download remote input %s:%s failed.' % (guid,f) ) else: if mimetypes.guess_type(f)[1] in ['gzip', 'bzip2']: getPackedInputSandbox(f) else: shutil.copy(f, os.path.join(os.getcwd(), os.path.basename(f))) printInfo('Download inputsandbox from iocache passed.') # unpack inputsandbox from wdir for f in input_sandbox['local']: if mimetypes.guess_type(f)[1] in ['gzip', 'bzip2']: getPackedInputSandbox(os.path.join(orig_wdir,f)) printInfo('Unpack inputsandbox passed.') #get input files ###DOWNLOADINPUTFILES### printInfo('Loading Python modules ...') sys.path.insert(0,os.path.join(wdir,PYTHON_DIR)) # check the python library path try: printInfo(' ** PYTHON_DIR: %s' % os.environ['PYTHON_DIR']) except KeyError: pass try: printInfo(' ** PYTHONPATH: %s' % os.environ['PYTHONPATH']) except KeyError: pass for lib_path in sys.path: printInfo(' ** sys.path: %s' % lib_path) # execute application ## convern appenvs into environment setup script to be 'sourced' before executing the user executable printInfo('Prepare environment variables for application executable') env_setup_script = os.path.join(os.getcwd(), '__ganga_lcg_env__.sh') f = open( env_setup_script, 'w') f.write('#!/bin/sh' + os.linesep ) f.write('##user application environmet setup script generated by Ganga job wrapper' + os.linesep) for k,v in appenvs.items(): str_env = 'export %s="%s"' % (k, v) printInfo(' ** ' + str_env) f.write(str_env + os.linesep) f.close() try: #try to make shipped executable executable os.chmod('%s/%s'% (wdir,appexec),stat.S_IXUSR|stat.S_IRUSR|stat.S_IWUSR) except: pass status = False try: # use subprocess to run the user's application if the module 
is available on the worker node import subprocess printInfo('Load application executable with subprocess module') status = execSyscmdSubprocess('source %s; %s %s' % (env_setup_script, appexec, appargs), wdir) except ImportError as err: # otherwise, use separate threads to control process IO pipes printInfo('Load application executable with separate threads') status = execSyscmdEnhanced('source %s; %s %s' % (env_setup_script, appexec, appargs), wdir) os.system("cp %s/stdout stdout.1" % orig_wdir) os.system("cp %s/stderr stderr.1" % orig_wdir) printInfo('GZipping stdout and stderr...') os.system("gzip stdout.1 stderr.1") # move them to the original wdir so they can be picked up os.system("mv stdout.1.gz %s/stdout.gz" % orig_wdir) os.system("mv stderr.1.gz %s/stderr.gz" % orig_wdir) if not status: raise OSError('Application execution failed.') printInfo('Application execution passed with exit code %d.' % exitcode) ###OUTPUTUPLOADSPOSTPROCESSING### for f in os.listdir(os.getcwd()): command = "cp %s %s" % (os.path.join(os.getcwd(),f), os.path.join(orig_wdir,f)) os.system(command) createPackedOutputSandbox(outputsandbox,None,orig_wdir) # pack outputsandbox # printInfo('== check output ==') # for line in os.popen('pwd; ls -l').readlines(): # printInfo(line) printInfo('Pack outputsandbox passed.') # Clean up after us - All log files and packed outputsandbox should be in "wdir" if scratchdir: os.chdir(orig_wdir) os.system("rm %s -rf" % wdir) except Exception as e: printError(sys.exc_info()[0]) printError(sys.exc_info()[1]) str_traceback = traceback.format_tb(sys.exc_info()[2]) for str_tb in str_traceback: printError(str_tb) printInfo('Job Wrapper stop.') out.close() # always return exit code 0 so the in the case of application failure # one can always get stdout and stderr back to the UI for debug. 
sys.exit(0)
"""
        # NOTE: everything above is the tail of the worker-node job-wrapper
        # template string (the method that builds it starts earlier in the
        # file); placeholders of the form ###NAME### are substituted below.
        return script

    def preparejob(self, jobconfig, master_job_sandbox):
        '''Prepare the JDL

        Fills the job-wrapper template with this job's configuration,
        packs the input sandbox (pre-staging large inputs), and writes
        the XRSL description into the job's input workspace.

        Returns the path of the written XRSL file, or None on failure.
        '''

        script = self.__jobWrapperTemplate__()

        job = self.getJobObject()
        inpw = job.getInputWorkspace()

        wrapperlog = '__jobscript__.log'

        import Ganga.Core.Sandbox as Sandbox

        # FIXME: check what happens if 'stdout','stderr' are specified here
        script = script.replace('###OUTPUTSANDBOX###',
                                repr(jobconfig.outputbox))

        script = script.replace('###APPLICATION_NAME###',
                                getName(job.application))
        script = script.replace('###APPLICATIONEXEC###',
                                repr(jobconfig.getExeString()))
        script = script.replace('###APPLICATIONARGS###',
                                repr(jobconfig.getArguments()))

        from Ganga.GPIDev.Lib.File.OutputFileManager import getWNCodeForOutputPostprocessing, getWNCodeForDownloadingInputFiles

        script = script.replace('###OUTPUTUPLOADSPOSTPROCESSING###',
                                getWNCodeForOutputPostprocessing(job, ' '))
        script = script.replace('###DOWNLOADINPUTFILES###',
                                getWNCodeForDownloadingInputFiles(job, ' '))

        if jobconfig.env:
            script = script.replace('###APPLICATIONENVS###',
                                    repr(jobconfig.env))
        else:
            script = script.replace('###APPLICATIONENVS###', repr({}))

        script = script.replace('###WRAPPERLOG###', repr(wrapperlog))

        import inspect
        script = script.replace('###INLINEMODULES###',
                                inspect.getsource(Sandbox.WNSandbox))

        mon = job.getMonitoringService()

        self.monInfo = None

        # set the monitoring file by default to the stdout
        if isinstance(self.monInfo, dict):
            self.monInfo['remotefile'] = 'stdout'

        # try to print out the monitoring service information in debug mode
        try:
            logger.debug('job info of monitoring service: %s' %
                         str(self.monInfo))
        except:
            pass

        # prepare input/output sandboxes
        import Ganga.Utility.files
        from Ganga.GPIDev.Lib.File import File
        from Ganga.Core.Sandbox.WNSandbox import PYTHON_DIR
        import inspect

        fileutils = File(inspect.getsourcefile(Ganga.Utility.files),
                         subdir=PYTHON_DIR)
        packed_files = jobconfig.getSandboxFiles() + [fileutils]
        sandbox_files = job.createPackedInputSandbox(packed_files)

        # sandbox of child jobs should include master's sandbox
        sandbox_files.extend(master_job_sandbox)

        # check the input file size and pre-upload larger inputs to the iocache
        lfc_host = ''

        input_sandbox_uris = []
        input_sandbox_names = []

        ick = True

        max_prestaged_fsize = 0
        for f in sandbox_files:
            idx = self.__check_and_prestage_inputfile__(f)

            if not idx:
                logger.error('input sandbox preparation failed: %s' % f)
                ick = False
                break
            else:
                if idx['lfc_host']:
                    lfc_host = idx['lfc_host']

                if idx['remote']:
                    abspath = os.path.abspath(f)
                    fsize = os.path.getsize(abspath)

                    if fsize > max_prestaged_fsize:
                        max_prestaged_fsize = fsize

                    input_sandbox_uris.append(
                        idx['remote'][os.path.basename(f)])

                    input_sandbox_names.append(os.path.basename(
                        urlparse(f)[2]))

                if idx['local']:
                    input_sandbox_uris += idx['local']
                    input_sandbox_names.append(os.path.basename(f))

        if not ick:
            logger.error('stop job submission')
            return None

        # determin the lcg-cp timeout according to the max_prestaged_fsize
        # - using the assumption of 1 MB/sec.
        # NOTE(review): the two assignments below discard the values computed
        # by the pre-stage loop above: max_prestaged_fsize is zeroed (so
        # predict_timeout is always 1s and the 60s floor applies) and
        # lfc_host is blanked before being exported as LFC_HOST in the xrsl
        # environment further down. This looks like an accidental
        # re-initialisation — confirm against the sibling LCG backend code.
        max_prestaged_fsize = 0
        lfc_host = ''
        transfer_timeout = config['SandboxTransferTimeout']
        predict_timeout = int(math.ceil(max_prestaged_fsize / 1000000.0))

        if predict_timeout > transfer_timeout:
            transfer_timeout = predict_timeout

        if transfer_timeout < 60:
            transfer_timeout = 60

        script = script.replace('###TRANSFERTIMEOUT###',
                                '%d' % transfer_timeout)

        # update the job wrapper with the inputsandbox list
        script = script.replace(
            '###INPUTSANDBOX###',
            repr({
                'remote': {},
                'local': input_sandbox_names
            }))

        # write out the job wrapper and put job wrapper into job's inputsandbox
        scriptPath = inpw.writefile(FileBuffer(
            '__jobscript_%s__' % job.getFQID('.'), script),
            executable=1)
        input_sandbox = input_sandbox_uris + [scriptPath]

        for isb in input_sandbox:
            logger.debug('ISB URI: %s' % isb)

        # compose output sandbox to include by default the following files:
        # - gzipped stdout (transferred only when the JobLogHandler is WMS)
        # - gzipped stderr (transferred only when the JobLogHandler is WMS)
        # - __jobscript__.log (job wrapper's log)
        output_sandbox = [wrapperlog]

        from Ganga.GPIDev.Lib.File.OutputFileManager import getOutputSandboxPatterns
        for outputSandboxPattern in getOutputSandboxPatterns(job):
            output_sandbox.append(outputSandboxPattern)

        if config['JobLogHandler'] in ['WMS']:
            output_sandbox += ['stdout.gz', 'stderr.gz']

        if len(jobconfig.outputbox):
            output_sandbox += [Sandbox.OUTPUT_TARBALL_NAME]

        # compose ARC XRSL
        xrsl = {
            #'VirtualOrganisation' : config['VirtualOrganisation'],
            'executable': os.path.basename(scriptPath),
            'environment': {
                'GANGA_LCG_VO': config['VirtualOrganisation'],
                'GANGA_LOG_HANDLER': config['JobLogHandler'],
                'LFC_HOST': lfc_host
            },
            #'stdout' : 'stdout',
            #'stderr' : 'stderr',
            'inputFiles': input_sandbox,
            'outputFiles': output_sandbox,
            #'OutputSandboxBaseDestURI': 'gsiftp://localhost'
        }

        xrsl['environment'].update({'GANGA_LCG_CE': self.CE})
        #xrsl['Requirements'] = self.requirements.merge(jobconfig.requirements).convert()

        # if self.jobtype.upper() in ['NORMAL','MPICH']:
        #xrsl['JobType'] = self.jobtype.upper()
        # if self.jobtype.upper() == 'MPICH':
        #xrsl['Requirements'].append('(other.GlueCEInfoTotalCPUs >= NodeNumber)')
        # xrsl['Requirements'].append('Member("MPICH",other.GlueHostApplicationSoftwareRunTimeEnvironment)')
        #xrsl['NodeNumber'] = self.requirements.nodenumber
        # else:
        # logger.warning('JobType "%s" not supported' % self.jobtype)
        # return

        # additional settings from the job
        if jobconfig.env:
            xrsl['environment'].update(jobconfig.env)

        xrslText = Grid.expandxrsl(xrsl)

        # append any additional requirements from the requirements object
        xrslText += '\n'.join(self.requirements.other)

        logger.debug('subjob XRSL: %s' % xrslText)
        return inpw.writefile(FileBuffer('__xrslfile__', xrslText))

    def kill(self):
        '''Kill the job via the ARC client; returns True on success.'''

        job = self.getJobObject()

        logger.info('Killing job %s' % job.getFQID('.'))

        if not self.id:
            logger.warning('Job %s is not running.' % job.getFQID('.'))
            return False

        return Grid.arc_cancel([self.id])

    def master_kill(self):
        '''Kill the master job: delegate to IBackend for plain jobs and
        individual subjobs, bulk-kill otherwise.'''

        job = self.getJobObject()

        if not job.master and len(job.subjobs) == 0:
            return IBackend.master_kill(self)
        elif job.master:
            return IBackend.master_kill(self)
        else:
            return self.master_bulk_kill()

    def master_bulk_kill(self):
        '''Bulk cancellation of all submitted/running subjobs (ARC).'''

        job = self.getJobObject()

        # killing the individually re-submitted subjobs
        logger.debug('cancelling running/submitted subjobs.')

        # 1. collect job ids
        ids = []
        for sj in job.subjobs:
            if sj.status in ['submitted', 'running'] and sj.backend.id:
                ids.append(sj.backend.id)

        # 2. cancel the collected jobs
        ck = Grid.arc_cancelMultiple(ids)
        if not ck:
            logger.warning('Job cancellation failed')
            return False
        else:
            for sj in job.subjobs:
                if sj.backend.id in ids:
                    sj.updateStatus('killed')
            return True

    def master_bulk_submit(self, rjobs, subjobconfigs, masterjobconfig):
        '''submit multiple subjobs in parallel, by default using 10 concurrent threads'''

        from Ganga.Utility.logic import implies
        assert (implies(rjobs, len(subjobconfigs) == len(rjobs)))

        # prepare the subjobs, jdl repository before bulk submission
        node_jdls = self.__mt_job_prepare__(rjobs, subjobconfigs,
                                            masterjobconfig)

        if not node_jdls:
            logger.error('Some jobs not successfully prepared')
            return False

        # set all subjobs to submitting status
        for sj in rjobs:
            sj.updateStatus('submitting')

        node_jids = self.__mt_bulk_submit__(node_jdls)

        status = False

        if node_jids:
            for sj in rjobs:
                if sj.id in node_jids.keys():
                    sj.backend.id = node_jids[sj.id]
                    sj.backend.CE = self.CE
                    sj.backend.actualCE = sj.backend.CE
                    sj.updateStatus('submitted')
                    sj.info.submit_counter += 1
                else:
                    logger.warning('subjob %s not successfully submitted' %
                                   sj.getFQID('.'))
            status = True

        return status

    def master_bulk_resubmit(self, rjobs):
        '''ARC bulk resubmission'''

        from Ganga.Utility.logging import log_user_exception

        # job = self.getJobObject()

        # compose master JDL for collection job
        node_jdls = {}
        for sj in rjobs:
            jdlpath = os.path.join(sj.inputdir, '__jdlfile__')
            node_jdls[sj.id] = jdlpath

        # set all subjobs to submitting status
        for sj in rjobs:
            sj.updateStatus('submitting')

        node_jids = self.__mt_bulk_submit__(node_jdls)

        status = False

        if node_jids:
            for sj in rjobs:
                if sj.id in node_jids.keys():
                    self.__refresh_jobinfo__(sj)
                    sj.backend.id = node_jids[sj.id]
                    sj.backend.CE = self.CE
                    sj.backend.actualCE = sj.backend.CE
                    sj.updateStatus('submitted')
                    sj.info.submit_counter += 1
                else:
                    logger.warning('subjob %s not successfully submitted' %
                                   sj.getFQID('.'))
            status = True

        # # set all subjobs to submitted status
        # # NOTE: this is just a workaround to avoid the unexpected transition
        # # that turns the master job's status from 'submitted' to 'submitting'.
        # # As this transition should be allowed to simulate a lock mechanism in Ganga 4, the workaround
        # # is to set all subjobs' status to 'submitted' so that the transition can be avoided.
        # # A more clear solution should be implemented with the lock mechanism introduced in Ganga 5.
        # for sj in rjobs:
        # sj.updateStatus('submitted')
        # sj.info.submit_counter += 1

        return status

    def master_submit(self, rjobs, subjobconfigs, masterjobconfig):
        '''Submit the master job to the grid'''

        profiler = ElapsedTimeProfiler(getLogger(name='Profile.LCG'))
        profiler.start()

        job = self.getJobObject()

        # finding ARC CE endpoint for job submission
        #allowed_celist = []
        # try:
        # allowed_celist = self.requirements.getce()
        # if not self.CE and allowed_celist:
        # self.CE = allowed_celist[0]
        # except:
        # logger.warning('ARC CE assigment from ARCRequirements failed.')

        # if self.CE and allowed_celist:
        # if self.CE not in allowed_celist:
        # logger.warning('submission to CE not allowed: %s, use %s instead' % ( self.CE, allowed_celist[0] ) )
        # self.CE = allowed_celist[0]

        # use arc info to check for any endpoints recorded in the config file
        rc, output = Grid.arc_info()

        if not self.CE and rc != 0:
            raise GangaException(
                "ARC CE endpoint not set and no default settings in '%s'. " %
                config['ArcConfigFile'])
        elif self.CE:
            logger.info('ARC CE endpoint set to: ' + str(self.CE))
        else:
            logger.info("Using ARC CE endpoints defined in '%s'" %
                        config['ArcConfigFile'])

        # doing massive job preparation
        if len(job.subjobs) == 0:
            ick = IBackend.master_submit(self, rjobs, subjobconfigs,
                                         masterjobconfig)
        else:
            ick = self.master_bulk_submit(rjobs, subjobconfigs,
                                          masterjobconfig)

        profiler.check('==> master_submit() elapsed time')

        return ick

    def submit(self, subjobconfig, master_job_sandbox):
        '''Submit the job to the grid'''

        ick = False

        xrslpath = self.preparejob(subjobconfig, master_job_sandbox)

        if xrslpath:
            self.id = Grid.arc_submit(xrslpath, self.CE, self.verbose)

            if self.id:
                self.actualCE = self.CE
                ick = True

        return ick

    def master_auto_resubmit(self, rjobs):
        """
        Resubmit each subjob individually as bulk resubmission will overwrite
        previous master job statuses
        """

        # check for master failure - in which case bulk resubmit
        mj = self._getParent()
        if mj.status == 'failed':
            return self.master_resubmit(rjobs)

        for j in rjobs:
            if not j.backend.master_resubmit([j]):
                return False

        return True

    def master_resubmit(self, rjobs):
        '''Resubmit the master job to the grid'''

        profiler = ElapsedTimeProfiler(getLogger(name='Profile.LCG'))
        profiler.start()

        job = self.getJobObject()

        ick = False

        if not job.master and len(job.subjobs) == 0:
            # case 1: master job normal resubmission
            logger.debug('rjobs: %s' % str(rjobs))
            logger.debug('mode: master job normal resubmission')
            ick = IBackend.master_resubmit(self, rjobs)
        elif job.master:
            # case 2: individual subjob resubmission
            logger.debug('mode: individual subjob resubmission')
            ick = IBackend.master_resubmit(self, rjobs)
        else:
            # case 3: master job bulk resubmission
            logger.debug('mode: master job resubmission')
            ick = self.master_bulk_resubmit(rjobs)
            if not ick:
                raise GangaException('ARC bulk submission failure')

        profiler.check('job re-submission elapsed time')

        return ick

    def resubmit(self):
        '''Resubmit the job'''

        ick = False

        job = self.getJobObject()

        jdlpath = job.getInputWorkspace().getPath("__jdlfile__")

        if jdlpath:
            self.id = Grid.arc_submit(jdlpath, self.CE, self.verbose)

            if self.id:
                # refresh the lcg job information
                self.__refresh_jobinfo__(job)
                self.actualCE = self.CE
                ick = True

        return ick

    @staticmethod
    def updateMonitoringInformation(jobs):
        '''Monitoring loop for normal jobs'''

        import datetime

        # collect the jobs that have been submitted long enough to be polled
        backenddict = {}
        jobdict = {}
        for j in jobs:
            if j.backend.id and (
                    (datetime.datetime.utcnow() -
                     j.time.timestamps["submitted"]).seconds >
                    config["ArcWaitTimeBeforeStartingMonitoring"]):
                jobdict[j.backend.id] = j
                backenddict[j.backend.actualCE] = j

        if len(jobdict.keys()) == 0:
            return

        jobInfoDict = Grid.arc_status(jobdict.keys(), backenddict.keys())
        jidListForPurge = []

        # update job information for those available in jobInfoDict
        for id, info in jobInfoDict.items():

            if info:

                job = jobdict[id]

                # keep actualCE in sync with the host part of the job URI
                if job.backend.actualCE != urlparse(id)[1].split(":")[0]:
                    job.backend.actualCE = urlparse(id)[1].split(":")[0]

                if job.backend.status != info['State']:

                    doStatusUpdate = True

                    # no need to update Ganga job status if backend status is
                    # not changed
                    # NOTE(review): this inner equality test can never be true
                    # inside the outer inequality branch — dead code kept
                    # as-is from the original.
                    if info['State'] == job.backend.status:
                        doStatusUpdate = False

                    # download output sandboxes if final status is reached
                    elif info['State'] in [
                            'Finished', '(FINISHED)', 'Finished (FINISHED)'
                    ]:

                        # grab output sandbox
                        if Grid.arc_get_output(
                                job.backend.id,
                                job.getOutputWorkspace(
                                    create=True).getPath()):
                            (ick, app_exitcode) = Grid.__get_app_exitcode__(
                                job.getOutputWorkspace(
                                    create=True).getPath())
                            job.backend.exitcode = app_exitcode

                            jidListForPurge.append(job.backend.id)

                        else:
                            logger.error('fail to download job output: %s' %
                                         jobdict[id].getFQID('.'))

                    if doStatusUpdate:
                        job.backend.status = info['State']
                        if 'Exit Code' in info:
                            try:
                                job.backend.exitcode_arc = int(
                                    info['Exit Code'])
                            except:
                                job.backend.exitcode_arc = 1

                        if 'Job Error' in info:
                            try:
                                job.backend.reason = info['Job Error']
                            except:
                                pass

                        job.backend.updateGangaJobStatus()
            else:
                logger.warning('fail to retrieve job informaton: %s' %
                               jobdict[id].getFQID('.'))

        # purging the jobs the output has been fetched locally
        if jidListForPurge:
            if not Grid.arc_purgeMultiple(jidListForPurge):
                logger.warning("Failed to purge all ARC jobs.")

    def updateGangaJobStatus(self):
        '''map backend job status to Ganga job status'''

        job = self.getJobObject()

        if self.status.startswith('Running') or self.status.startswith(
                'Finishing'):
            job.updateStatus('running')

        elif self.status.startswith('Finished'):
            if job.backend.exitcode and job.backend.exitcode != 0:
                job.backend.reason = 'non-zero app. exit code: %s' % repr(
                    job.backend.exitcode)
                job.updateStatus('failed')
            elif job.backend.exitcode_arc and job.backend.exitcode_arc != 0:
                job.backend.reason = 'non-zero ARC job exit code: %s' % repr(
                    job.backend.exitcode_arc)
                job.updateStatus('failed')
            else:
                job.updateStatus('completed')

        elif self.status in ['DONE-FAILED', 'ABORTED', 'UNKNOWN', 'Failed']:
            job.updateStatus('failed')

        elif self.status in ['CANCELLED']:
            job.updateStatus('killed')

        elif self.status.startswith('Queuing'):
            # still waiting in the CE queue — nothing to do
            pass

        else:
            logger.warning('Unexpected job status "%s"', self.status)
class MassStorageFile(IGangaFile):
    """MassStorageFile represents a class marking a file to be written into mass storage (like Castor at CERN)
    """
    _schema = Schema(Version(1, 1), {
        'namePattern': SimpleItem(defvalue="", doc='pattern of the file name'),
        'localDir': SimpleItem(defvalue="", copyable=1, doc='local dir where the file is stored, used from get and put methods'),
        'joboutputdir': SimpleItem(defvalue="", doc='outputdir of the job with which the outputsandbox file object is associated'),
        'locations': SimpleItem(defvalue=[], copyable=1, typelist=['str'], sequence=1, doc="list of locations where the outputfiles are uploaded"),
        'outputfilenameformat': SimpleItem(defvalue=None, typelist=['str', 'type(None)'], protected=0, doc="keyword path to where the output should be uploaded, i.e. /some/path/here/{jid}/{sjid}/{fname}, if this field is not set, the output will go in {jid}/{sjid}/{fname} or in {jid}/{fname} depending on whether the job is split or not"),
        'inputremotedirectory': SimpleItem(defvalue=None, typelist=['str', 'type(None)'], protected=0, doc="Directory on mass storage where the file is stored"),
        'subfiles': ComponentItem(category='gangafiles', defvalue=[], hidden=1, typelist=['Ganga.GPIDev.Lib.File.MassStorageFile'], sequence=1, copyable=0, doc="collected files from the wildcard namePattern"),
        'failureReason': SimpleItem(defvalue="", protected=1, copyable=0, doc='reason for the upload failure'),
        'compressed': SimpleItem(defvalue=False, typelist=['bool'], protected=0, doc='wheather the output file should be compressed before sending somewhere')
    })

    _category = 'gangafiles'
    _name = "MassStorageFile"
    _exportmethods = [
        "location", "get", "put", "setLocation", "remove", "accessURL"
    ]

    def __init__(self, namePattern='', localDir='', **kwds):
        """ namePattern is the pattern of the output file that has to be written into mass storage
        """
        super(MassStorageFile, self).__init__()
        self._setNamePath(_namePattern=namePattern, _localDir=localDir)
        self.locations = []

        self.shell = Shell.Shell()

    def __construct__(self, args):
        # positional-argument constructor used by the GPI layer:
        # (namePattern) or (namePattern, localDir)
        if len(args) == 1 and isinstance(args[0], str):
            self._setNamePath(args[0], '')
        elif len(args) == 2 and isinstance(args[0], str) and isinstance(
                args[1], str):
            self._setNamePath(args[0], args[1])

        self.locations = []

        self.shell = Shell.Shell()

    def _setNamePath(self, _namePattern='', _localDir=''):
        # split a combined path into (localDir, namePattern) when only a
        # single path-like pattern is given
        if _namePattern != '' and _localDir == '':
            import os.path
            self.namePattern = os.path.basename(_namePattern)
            self.localDir = os.path.dirname(_namePattern)
        elif _namePattern != '' and _localDir != '':
            self.namePattern = _namePattern
            self.localDir = _localDir

    def _on_attribute__set__(self, obj_type, attrib_name):
        # when this file object is copied onto a Job's outputfiles list,
        # reset per-run state on the copy
        r = copy.deepcopy(self)
        if getName(obj_type) == 'Job' and attrib_name == 'outputfiles':
            r.locations = []
            r.localDir = ''
            r.failureReason = ''
        return r

    def __repr__(self):
        """Get the representation of the file."""
        return "MassStorageFile(namePattern='%s')" % self.namePattern

    def setLocation(self):
        """
        Sets the location of output files that were uploaded to mass storage from the WN
        """
        job = self.getJobObject()

        postprocessLocationsPath = os.path.join(
            job.outputdir,
            getConfig('Output')['PostProcessLocationsFileName'])
        if not os.path.exists(postprocessLocationsPath):
            return

        def mass_line_processor(line, mass_file):
            # each 'massstorage' line looks like:
            #   massstorage <pattern> <outputPath-or-ERROR...>
            lineParts = line.split(' ')
            pattern = lineParts[1]
            outputPath = lineParts[2]
            # NOTE(review): str.strip('.gz') strips any of the characters
            # '.', 'g', 'z' from both ends, not the '.gz' suffix — names
            # ending in 'g' or 'z' get over-stripped. Confirm intent.
            name = os.path.basename(outputPath).strip('.gz')
            if regex.search(mass_file.namePattern) is not None:
                # wildcard pattern: record each matched file as a subfile
                if outputPath == 'ERROR':
                    logger.error("Failed to upload file to mass storage")
                    logger.error(line[line.find('ERROR') + 5:])
                    d = MassStorageFile(namePattern=pattern)
                    d.compressed = mass_file.compressed
                    d.failureReason = line[line.find('ERROR') + 5:]
                    mass_file.subfiles.append(GPIProxyObjectFactory(d))
                else:
                    d = MassStorageFile(namePattern=name)
                    d.compressed = mass_file.compressed
                    d.outputfilenameformat = mass_file.outputfilenameformat
                    mass_file.subfiles.append(GPIProxyObjectFactory(d))
                    # recurse so the concrete subfile records its location
                    mass_line_processor(line, d)
            elif name == mass_file.namePattern:
                if outputPath == 'ERROR':
                    logger.error("Failed to upload file to mass storage")
                    logger.error(line[line.find('ERROR') + 5:])
                    mass_file.failureReason = line[line.find('ERROR') + 5:]
                    return
                mass_file.locations = [outputPath.strip('\n')]

        for line in open(postprocessLocationsPath, 'r'):
            if line.strip() == '':
                continue

            if line.startswith('massstorage'):
                mass_line_processor(line.strip(), self)

    def location(self):
        """
        Return list with the locations of the post processed files (if they were configured to upload the output somewhere)
        """
        tmpLocations = []
        if self.locations == []:
            if self.subfiles != []:
                for i in self.subfiles:
                    # NOTE(review): this iterates over a MassStorageFile
                    # subfile object itself; presumably it was meant to
                    # iterate i.locations (or i.location()) — confirm.
                    for j in i:
                        tmpLocations.append(j)
        else:
            tmpLocations = self.locations
        return tmpLocations

    def get(self):
        """
        Retrieves locally all files matching this MassStorageFile object pattern
        """
        to_location = self.localDir

        if not os.path.isdir(self.localDir):
            if self._getParent() is not None:
                to_location = self.getJobObject().outputdir
            else:
                logger.error(
                    "%s is not a valid directory.... Please set the localDir attribute"
                    % self.localDir)
                return

        cp_cmd = getConfig(
            'Output')['MassStorageFile']['uploadOptions']['cp_cmd']

        for location in self.locations:
            targetLocation = os.path.join(to_location,
                                          os.path.basename(location))
            os.system('%s %s %s' % (cp_cmd, location, targetLocation))

    def getWNScriptDownloadCommand(self, indent):
        ## FIXME fix me for the situation of multiple files?

        script = """\n
###INDENT###os.system(\'###CP_COMMAND###\')
"""

        cp_cmd = '%s %s .' % (getConfig('Output')['MassStorageFile']
                              ['uploadOptions']['cp_cmd'], self.locations[0])

        replace_dict = {'###INDENT###': indent, '###CP_COMMAND###': cp_cmd}

        for k, v in replace_dict.iteritems():
            script = script.replace(str(k), str(v))

        return script

    def put(self):
        """
        Creates and executes commands for file upload to mass storage (Castor), this method will be called on the client
        """
        import glob
        import re

        sourceDir = ''

        # if used as a stand alone object
        if self._getParent() is None:
            if self.localDir == '':
                import os
                _CWD = os.getcwd()
                if os.path.isfile(os.path.join(_CWD, self.namePattern)):
                    sourceDir = _CWD
                else:
                    logger.warning(
                        'localDir attribute is empty, don\'t know from which dir to take the file'
                    )
                    return
            else:
                sourceDir = self.localDir

                (result, message) = self.validate()

                if result == False:
                    logger.warning(message)
                    return
        else:
            job = self.getJobObject()
            sourceDir = job.outputdir

            # if there are subjobs, the put method will be called on every subjob
            # and will upload the resulted output file
            if len(job.subjobs) > 0:
                return

        massStorageConfig = getConfig(
            'Output')['MassStorageFile']['uploadOptions']

        mkdir_cmd = massStorageConfig['mkdir_cmd']
        cp_cmd = massStorageConfig['cp_cmd']
        ls_cmd = massStorageConfig['ls_cmd']
        massStoragePath = massStorageConfig['path']

        # create the last directory (if not exist) from the config path
        import os.path
        pathToDirName = os.path.dirname(massStoragePath)
        dirName = os.path.basename(massStoragePath)

        (exitcode, mystdout, mystderr) = self.execSyscmdSubprocess(
            '%s %s' % (ls_cmd, pathToDirName))
        if exitcode != 0:
            self.handleUploadFailure(mystderr)
            return

        directoryExists = False
        for directory in mystdout.split('\n'):
            if directory.strip() == dirName:
                directoryExists = True
                break

        if not directoryExists:
            (exitcode, mystdout, mystderr) = self.execSyscmdSubprocess(
                '%s %s' % (mkdir_cmd, massStoragePath))
            if exitcode != 0:
                self.handleUploadFailure(mystderr)
                return

        # the folder part of self.outputfilenameformat
        folderStructure = ''
        # the file name part of self.outputfilenameformat
        filenameStructure = ''

        if self._getParent() != None:
            jobfqid = self.getJobObject().fqid

            jobid = jobfqid
            subjobid = ''

            if (jobfqid.find('.') > -1):
                jobid = jobfqid.split('.')[0]
                subjobid = jobfqid.split('.')[1]

            if self.outputfilenameformat is None:
                filenameStructure = '{fname}'
                # create jid/sjid directories
                folderStructure = jobid
                if subjobid != '':
                    folderStructure = os.path.join(jobid, subjobid)
            else:
                filenameStructure = os.path.basename(
                    self.outputfilenameformat)
                filenameStructure = filenameStructure.replace('{jid}', jobid)

                folderStructure = os.path.dirname(self.outputfilenameformat)
                folderStructure = folderStructure.replace('{jid}', jobid)

                if subjobid != '':
                    filenameStructure = filenameStructure.replace(
                        '{sjid}', subjobid)
                    folderStructure = folderStructure.replace(
                        '{sjid}', subjobid)
        else:
            if self.outputfilenameformat != None:
                folderStructure = os.path.dirname(self.outputfilenameformat)
                filenameStructure = os.path.basename(
                    self.outputfilenameformat)
            else:
                filenameStructure = '{fname}'

        # create the folder structure
        if folderStructure != '':
            folderStructure = folderStructure.strip('/')
            massStoragePath = os.path.join(massStoragePath, folderStructure)
            command = '%s -p %s' % (mkdir_cmd, massStoragePath)
            (exitcode, mystdout, mystderr) = self.execSyscmdSubprocess(command)
            if exitcode != 0:
                self.handleUploadFailure(mystderr)
                return

        # here filenameStructure has replaced jid and sjid if any, and only not
        # replaced keyword is fname
        fileName = self.namePattern
        if self.compressed:
            fileName = '%s.gz' % self.namePattern

        if regex.search(fileName) is not None:
            # wildcard pattern: upload every match and record it as a subfile
            for currentFile in glob.glob(os.path.join(sourceDir, fileName)):
                finalFilename = filenameStructure.replace(
                    '{fname}', os.path.basename(currentFile))
                (exitcode, mystdout, mystderr) = self.execSyscmdSubprocess(
                    '%s %s %s' %
                    (cp_cmd, currentFile,
                     os.path.join(massStoragePath, finalFilename)))

                d = MassStorageFile(namePattern=os.path.basename(currentFile))
                d.compressed = self.compressed

                if exitcode != 0:
                    self.handleUploadFailure(mystderr)
                else:
                    logger.info(
                        '%s successfully uploaded to mass storage as %s' %
                        (currentFile,
                         os.path.join(massStoragePath, finalFilename)))
                    # NOTE(review): 'locations' is declared as a sequence in
                    # the schema but a single string is assigned here —
                    # confirm whether this should be a one-element list.
                    d.locations = os.path.join(
                        massStoragePath, os.path.basename(finalFilename))

                # Alex removed this as more general approach in job.py after put() is called
                # remove file from output dir if this object is attached to a job
                # if self._getParent() != None:
                # os.system('rm %s' % os.path.join(sourceDir, currentFile))

                self.subfiles.append(GPIProxyObjectFactory(d))
        else:
            currentFile = os.path.join(sourceDir, fileName)
            finalFilename = filenameStructure.replace(
                '{fname}', os.path.basename(currentFile))
            (exitcode, mystdout, mystderr) = self.execSyscmdSubprocess(
                '%s %s %s' %
                (cp_cmd, currentFile,
                 os.path.join(massStoragePath, finalFilename)))
            if exitcode != 0:
                self.handleUploadFailure(mystderr)
            else:
                logger.info(
                    '%s successfully uploaded to mass storage as %s' %
                    (currentFile,
                     os.path.join(massStoragePath, finalFilename)))
                location = os.path.join(massStoragePath,
                                        os.path.basename(finalFilename))
                if location not in self.locations:
                    self.locations.append(location)

            # Alex removed this as more general approach in job.py after put() is called
            # remove file from output dir if this object is attached to a job
            # if self._getParent() != None:
            # os.system('rm %s' % os.path.join(sourceDir, currentFile))

    def validate(self):
        """Check outputfilenameformat for required/forbidden keywords.

        Returns a (bool, message) tuple; (True, '') when valid.
        """
        # if the user has set outputfilenameformat, validate for presence of
        # jid, sjid and fname keywords depending on job type - split or
        # non-split
        if self.outputfilenameformat != None:
            searchFor = ['{fname}']
            isJob = False
            isSplitJob = False

            if self._getParent() != None:
                isJob = True

                if stripProxy(self.getJobObject()).master is not None:
                    isSplitJob = True
                    searchFor.append('{sjid}')

            missingKeywords = []

            for item in searchFor:
                if self.outputfilenameformat.find(item) == -1:
                    missingKeywords.append(item)

            if len(missingKeywords):
                return (
                    False,
                    'Error in MassStorageFile.outputfilenameformat field : missing keywords %s '
                    % ','.join(missingKeywords))

            if isSplitJob == False and self.outputfilenameformat.find(
                    '{sjid}') > -1:
                return (
                    False,
                    'Error in MassStorageFile.outputfilenameformat field : job is non-split, but {\'sjid\'} keyword found'
                )

            if isJob == False and self.outputfilenameformat.find(
                    '{sjid}') > -1:
                return (
                    False,
                    'Error in MassStorageFile.outputfilenameformat field : no parent job, but {\'sjid\'} keyword found'
                )

            if isJob == False and self.outputfilenameformat.find(
                    '{jid}') > -1:
                return (
                    False,
                    'Error in MassStorageFile.outputfilenameformat field : no parent job, but {\'jid\'} keyword found'
                )

            invalidUnixChars = ['"', ' ']
            test = self.outputfilenameformat.replace('{jid}', 'a').replace(
                '{sjid}', 'b').replace('{fname}', 'c')

            for invalidUnixChar in invalidUnixChars:
                if test.find(invalidUnixChar) > -1:
                    return (
                        False,
                        'Error in MassStorageFile.outputfilenameformat field : invalid char %s found'
                        % invalidUnixChar)

        return (True, '')

    def handleUploadFailure(self, error):
        # record the failure reason and log it with job context when attached
        self.failureReason = error
        if self._getParent() != None:
            logger.error(
                "Job %s failed. One of the job.outputfiles couldn't be uploaded because of %s"
                % (str(self._getParent().fqid), self.failureReason))
        else:
            logger.error("The file can't be uploaded because of %s" %
                         (self.failureReason))

    def getWNInjectedScript(self, outputFiles, indent, patternsToZip,
                            postProcessLocationsFP):
        """
        Returns script that have to be injected in the jobscript for postprocessing on the WN
        """
        massStorageCommands = []

        massStorageConfig = getConfig(
            'Output')['MassStorageFile']['uploadOptions']

        for outputFile in outputFiles:

            outputfilenameformat = 'None'
            if outputFile.outputfilenameformat != None and outputFile.outputfilenameformat != '':
                outputfilenameformat = outputFile.outputfilenameformat

            massStorageCommands.append([
                'massstorage', outputFile.namePattern, outputfilenameformat,
                massStorageConfig['mkdir_cmd'], massStorageConfig['cp_cmd'],
                massStorageConfig['ls_cmd'], massStorageConfig['path']
            ])

        import inspect
        # the WN script template lives next to this module on disk
        script_location = os.path.join(
            os.path.dirname(
                os.path.abspath(inspect.getfile(inspect.currentframe()))),
            'scripts/MassStorageFileWNScript.py')

        from Ganga.GPIDev.Lib.File import FileUtils
        script = FileUtils.loadScript(script_location, '###INDENT###')

        jobfqid = self.getJobObject().fqid

        jobid = jobfqid
        subjobid = ''

        if (jobfqid.find('.') > -1):
            jobid = jobfqid.split('.')[0]
            subjobid = jobfqid.split('.')[1]

        replace_dict = {
            '###MASSSTORAGECOMMANDS###': repr(massStorageCommands),
            '###PATTERNSTOZIP###': str(patternsToZip),
            '###INDENT###': indent,
            '###POSTPROCESSLOCATIONSFP###': postProcessLocationsFP,
            '###FULLJOBDIR###': str(jobfqid.replace('.', os.path.sep)),
            '###JOBDIR###': str(jobid),
            '###SUBJOBDIR###': str(subjobid)
        }

        for k, v in replace_dict.iteritems():
            script = script.replace(str(k), str(v))

        return script

    def processWildcardMatches(self):
        # expand a wildcard namePattern against the remote directory listing,
        # populating self.subfiles (cached after the first call)
        if self.subfiles:
            return self.subfiles

        from fnmatch import fnmatch

        if regex.search(self.namePattern):
            ls_cmd = getConfig(
                'Output')['MassStorageFile']['uploadOptions']['ls_cmd']
            exitcode, output, m = self.shell.cmd1(
                ls_cmd + ' ' + self.inputremotedirectory, capture_stderr=True)

            for filename in output.split('\n'):
                if fnmatch(filename, self.namePattern):
                    subfile = MassStorageFile(namePattern=filename)
                    subfile.inputremotedirectory = self.inputremotedirectory

                    self.subfiles.append(GPIProxyObjectFactory(subfile))

    def remove(self, force=False, removeLocal=False):
        """
        Removes file from remote storage ONLY by default
        """
        massStorageConfig = getConfig(
            'Output')['MassStorageFile']['uploadOptions']
        rm_cmd = massStorageConfig['rm_cmd']

        if force == True:
            _auto_delete = True
        else:
            _auto_delete = False

        for i in self.locations:

            if not _auto_delete:

                keyin = None

                while keyin is None:
                    keyin = raw_input(
                        "Do you want to delete file %s at Location: %s ? [y/n] "
                        % (str(self.namePattern), str(i)))
                    if keyin == 'y':
                        _delete_this = True
                    elif keyin == 'n':
                        _delete_this = False
                    else:
                        logger.warning("y/n please!")
                        keyin = None
            else:
                _delete_this = True

            if _delete_this:
                # NOTE(review): the '%s' placeholder is never filled in —
                # the location argument is missing from this log call.
                logger.info("Deleting File at Location: %s")
                self.execSyscmdSubprocess('%s %s' % (rm_cmd, i))
                # NOTE(review): list.pop() expects an integer index, but 'i'
                # is the location string — this raises TypeError at runtime.
                # Probably intended self.locations.remove(i); also mutating
                # the list while iterating it skips entries. Confirm and fix.
                self.locations.pop(i)

        if removeLocal:

            sourceDir = ''
            if self.localDir == '':
                import os
                _CWD = os.getcwd()
                if os.path.isfile(os.path.join(_CWD, self.namePattern)):
                    sourceDir = _CWD
            else:
                sourceDir = self.localDir

            _localFile = os.path.join(sourceDir, self.namePattern)

            if os.path.isfile(_localFile):

                if force:
                    _actual_delete = True
                else:

                    keyin = None
                    while keyin is None:
                        keyin = raw_input(
                            "Do you want to remove the local File: %s ? ([y]/n) "
                            % str(_localFile))
                        if keyin in ['y', '']:
                            _actual_delete = True
                        elif keyin == 'n':
                            _actual_delete = False
                        else:
                            logger.warning("y/n please!")
                            keyin = None

                if _actual_delete:
                    import time
                    # two-stage delete: rename first so a failed unlink
                    # leaves an obviously-stale file rather than the original
                    remove_filename = _localFile + "_" + str(
                        time.time()) + '__to_be_deleted_'

                    try:
                        os.rename(_localFile, remove_filename)
                    except OSError as err:
                        logger.warning(
                            "Error in first stage of removing file: %s" %
                            remove_filename)
                        remove_filename = _localFile

                    try:
                        os.remove(remove_filename)
                    except OSError as err:
                        if err.errno != errno.ENOENT:
                            logger.error("Error in removing file: %s" %
                                         str(remove_filename))
                            raise
                        # already gone — treat as removed
                        pass

        return

    def accessURL(self):

        # Need to come up with a prescription based upon the server address and
        # file on EOS or elsewhere to return a full URL which we can pass to
        # ROOT...

        protoPath = getConfig('Output')['MassStorageFile']['defaultProtocol']

        myLocations = self.location()

        accessURLs = []

        for file in myLocations:
            import os
            accessURLs.append(protoPath + os.path.join(os.sep, file))

        return accessURLs
class Transform(GangaObject):
    """A unit of work inside a Task: maps numbered partitions to the
    applications/jobs that process them and tracks per-partition status.

    Status bookkeeping lives in three hidden schema dicts:
      _partition_status : partition -> status string
      _app_partition    : application id -> partition
      _app_status       : application id -> status string
    """
    _schema = Schema(
        Version(1, 0), {
            'status':
            SimpleItem(defvalue='new',
                       protected=1,
                       copyable=0,
                       doc='Status - running, pause or completed',
                       typelist=["str"]),
            'name':
            SimpleItem(defvalue='Simple Transform',
                       doc='Name of the transform (cosmetic)',
                       typelist=["str"]),
            'application':
            ComponentItem(
                'applications',
                defvalue=None,
                optional=1,
                load_default=False,
                filter="checkTaskApplication",
                doc=
                'Application of the Transform. Must be a Task-Supporting application.'
            ),
            'inputsandbox':
            FileItem(defvalue=[],
                     typelist=['str', 'Ganga.GPIDev.Lib.File.File.File'],
                     sequence=1,
                     doc="list of File objects shipped to the worker node "),
            'outputsandbox':
            SimpleItem(
                defvalue=[],
                typelist=['str'],
                sequence=1,
                doc="list of filenames or patterns shipped from the worker node"
            ),
            'inputdata':
            ComponentItem('datasets',
                          defvalue=None,
                          optional=1,
                          load_default=False,
                          doc='Input dataset'),
            'outputdata':
            ComponentItem('datasets',
                          defvalue=None,
                          optional=1,
                          load_default=False,
                          doc='Output dataset'),
            'backend':
            ComponentItem('backends',
                          defvalue=None,
                          optional=1,
                          load_default=False,
                          doc='Backend of the Transform.'),
            'run_limit':
            SimpleItem(
                defvalue=4,
                doc='Number of times a partition is tried to be processed.',
                protected=1,
                typelist=["int"]),
            '_partition_status':
            SimpleItem(defvalue={},
                       hidden=1,
                       copyable=0,
                       doc='Map (only necessary) partitions to their status'),
            '_app_partition':
            SimpleItem(defvalue={},
                       hidden=1,
                       copyable=0,
                       doc='Map of applications to partitions'),
            '_app_status':
            SimpleItem(defvalue={},
                       hidden=1,
                       copyable=0,
                       doc='Map of applications to status'),
            '_next_app_id':
            SimpleItem(defvalue=0,
                       hidden=1,
                       copyable=0,
                       doc='Next ID used for the application',
                       typelist=["int"]),
        })
    _category = 'transforms'
    _name = 'Transform'
    _exportmethods = [
        'run',
        'pause',  # Operations
        'setPartitionStatus',
        'setRunlimit',
        'setFailed',  # Control Partitions
        'getPartitionStatus',
        'getJobs',
        'getPartitionJobs',  # Info
        'overview',
        'info',
        'n_all',
        'n_status',
        'retryFailed'
    ]
    #
    # Class-level defaults (NOTE(review): class-level mutable dict is shared
    # until shadowed by the schema machinery — presumably intended; confirm).
    _app_status = {}
    _partition_apps = None

    # possible partition status values:
    # ignored, hold, ready, running, completed, attempted, failed, bad

    # Special methods:
    def __init__(self):
        super(Transform, self).__init__()
        self.initialize()

    def _readonly(self):
        """A transform is read-only if the status is not new."""
        if self.status == "new":
            return 0
        return 1

    def initialize(self):
        # Default backend is a local one.
        from Ganga import GPI
        self.backend = stripProxy(GPI.Local())

    def check(self):
        # Hook for derived classes; no validation by default.
        pass

    def startup(self):
        """This function is used to set the status after restarting Ganga"""
        # Make sure that no partitions are kept "running" from previous
        # sessions
        clist = self._partition_status.keys()
        for c in clist:
            self.updatePartitionStatus(c)
        # At this point the applications still need to notify the Transformation of their status
        # Search jobs for task-supporting applications
        # NOTE: 'id' shadows the builtin; kept for compatibility.
        id = "%i:%i" % (self._getParent().id,
                        self._getParent().transforms.index(self))
        for j in GPI.jobs:
            if "tasks_id" in stripProxy(j.application).getNodeData():
                # print "tasks_id of jobid ", j.fqid,
                # stripProxy(j.application).getNodeAttribute("tasks_id"), id
                if stripProxy(j.application).getNodeAttribute(
                        "tasks_id").endswith(id):
                    try:
                        if j.subjobs:
                            for sj in j.subjobs:
                                app = stripProxy(sj.application)
                                stripProxy(app.getTransform()).setAppStatus(
                                    app, app._getParent().status)
                        else:
                            app = stripProxy(j.application)
                            stripProxy(app.getTransform()).setAppStatus(
                                app, app._getParent().status)
                    except AttributeError as e:
                        logger.error("%s", e)

    def getPartitionApps(self):
        # Lazily build and cache the reverse map partition -> [app ids].
        if self._partition_apps is None:
            # Create the reverse map _partition_apps from _app_partition
            self._partition_apps = {}
            # NOTE: iteritems() is Python 2 only.
            for (app, partition) in self._app_partition.iteritems():
                if partition in self._partition_apps:
                    if not app in self._partition_apps[partition]:
                        self._partition_apps[partition].append(app)
                else:
                    self._partition_apps[partition] = [app]
        return self._partition_apps

    def fix(self):
        """This function fixes inconsistencies in application status"""
        # Create the reverse map _partition_apps from _app_partition
        self._app_status = {}
        # Make sure that no partitions are kept "running" from previous
        # sessions
        clist = self._partition_status.keys()
        for c in clist:
            self.updatePartitionStatus(c)
        # At this point the applications still need to notify the Transformation of their status
        # Search jobs for task-supporting applications
        id = "%i:%i" % (self._getParent().id,
                        self._getParent().transforms.index(self))
        for j in GPI.jobs:
            if "tasks_id" in stripProxy(j.application).getNodeData():
                # Exact match here (startup() uses endswith instead).
                if stripProxy(
                        j.application).getNodeAttribute("tasks_id") == id:
                    try:
                        if j.subjobs:
                            for sj in j.subjobs:
                                app = stripProxy(sj.application)
                                stripProxy(app.getTransform()).setAppStatus(
                                    app, app._getParent().status)
                        else:
                            app = stripProxy(j.application)
                            stripProxy(app.getTransform()).setAppStatus(
                                app, app._getParent().status)
                    except AttributeError as e:
                        logger.error("%s", e)

    # Public methods
    def run(self, check=True):
        """Sets this transform to running status"""
        if self.status == "new" and check:
            self.check()
        if self.status != "completed":
            self.updateStatus("running")
            #self.status = "running"
            # Check if this transform has completed in the meantime
            is_complete = True
            for s in self._partition_status.values():
                if s != "completed" and s != "bad":
                    is_complete = False
                    break
            if is_complete:
                self.updateStatus("completed")
                #self.status = "completed"
            task = self._getParent()
            if task:
                task.updateStatus()
        else:
            logger.warning("Transform is already completed!")

    def pause(self):
        """Pause the task - the background thread will not submit new jobs from this task"""
        if self.status != "completed":
            self.updateStatus("pause")
            #self.status = "pause"
            task = self._getParent()
            if task:
                task.updateStatus()
        else:
            logger.debug("Transform is already completed!")

    def setRunlimit(self, newRL):
        """Set the number of times a job should be resubmitted before the transform is paused"""
        self.run_limit = newRL
        # Re-evaluate partitions that already hit failures under the old limit.
        cs = self._partition_status.items()
        for (c, s) in cs:
            if s in ["attempted", "failed"]:
                failures = self.getPartitionFailures(c)
                if failures >= newRL:
                    self._partition_status[c] = "failed"
                else:
                    self._partition_status[c] = "attempted"
        logger.debug("Runlimit set to %i", newRL)

    def setPartitionStatus(self, partition, status):
        """ Set the Status of the given partition to "ready", "hold", "bad" or "completed".
            The status is then updated to the status indicated by the applications"""
        self.setPartitionsStatus([partition], status)

    def getJobs(self):
        """ Get the job slice of all jobs for this transform """
        return self.getPartitionJobs(None)

    def getPartitionJobs(self, partition):
        """ Get the job slice that processed the given partition. Iterates over the job list. """
        task = self._getParent()
        id = task.transforms.index(self)
        if partition is None:
            sname = "tasks(%i).transforms[%i].getJobs()" % (task.id, id)
        else:
            sname = "tasks(%i).transforms[%i].getPartitionJobs(%s)" % (
                task.id, id, partition)
        jobslice = JobRegistrySlice(sname)

        def addjob(j):
            # Keep the job when no partition filter is given, or when its
            # application processed the requested partition.
            if partition is None or self._app_partition[
                    j.application.id] == partition:
                jobslice.objects[j.fqid] = stripProxy(j)

        for j in GPI.jobs:
            try:
                # tasks_id has the form "<task id>:<transform index>".
                stid = j.application.tasks_id.split(":")
                if int(stid[-2]) == task.id and int(stid[-1]) == id:
                    if j.subjobs:
                        for sj in j.subjobs:
                            addjob(sj)
                    else:
                        addjob(j)
            except Exception as err:
                # Jobs without a parseable tasks_id are simply skipped.
                logger.debug("getPartitionJobs Exception:\n%s" % str(err))
                pass
        return JobRegistrySliceProxy(jobslice)

    def setFailed(self, partition):
        """ Tells Tasks that all Applications that have executed this partition have actually failed."""
        for aid in self._app_partition:
            if aid in self._app_status and self._app_status[aid] == "removed":
                continue
            # Save the status
            self._app_status[aid] = "failed"
        # Update the corresponding partition status
        self.setPartitionStatus(partition, "ready")

    def retryFailed(self):
        """Retry all failed partitions (forget about failed jobs)"""
        for aid in self._app_partition:
            if aid in self._app_status and self._app_status[aid] == "failed":
                self._app_status[aid] = "removed"
        clist = self._partition_status.keys()
        for c in clist:
            self.updatePartitionStatus(c)

    # Internal methods
    def finalise(self):
        """Finalise the transform - no-op by default"""
        return

    def submitJobs(self, n):
        """Create Ganga Jobs for the next N partitions that are ready and submit them."""
        next = self.getNextPartitions(n)
        if len(next) == 0:
            return 0
        numjobs = 0
        for j in self.getJobsForPartitions(next):
            stripProxy(j.application).transition_update("submitting")
            try:
                j.submit()
            except JobError:
                logger.error(
                    "Error on job submission! The current transform will be paused until this problem is fixed."
                )
                logger.error(
                    "type tasks(%i).run() to continue after the problem has been fixed.",
                    self._getParent().id)
                self.pause()
            numjobs += 1
        return numjobs

    def checkTaskApplication(self, app):
        """warns the user if the application is not compatible """
        if app is None:
            return None
        # Wrap plain applications so they gain task support.
        if not "tasks_id" in stripProxy(app).getNodeData():
            return taskApp(app)
        return app

    def setAppStatus(self, app, new_status):
        """Reports status changes in application jobs
           possible status values:
           normal   : (new, submitting,) submitted, running, completing, completed
           failures : killed, failed
           transient: incomplete (->new), unknown, removed"""
        # Check if we know the occurring application...
        if app.id == -1:
            return
        if not app.id in self._app_partition:
            logger.warning("%s was contacted by an unknown application %i.",
                           self.fqn(), app.id)
            return
        # Silently ignore message if the application is already removed or
        # completed
        if app.id in self._app_status and self._app_status[app.id] in [
                "removed", "completed", "failed"
        ]:
            return
        # Check the status
        if new_status == "completed" and not self.checkCompletedApp(app):
            logger.error("%s app %i failed despite listed as completed!",
                         self.fqn(), app.id)
            new_status = "failed"
        # Save the status
        self._app_status[app.id] = new_status
        # Update the corresponding partition status
        self.updatePartitionStatus(self._app_partition[app.id])

    def setMasterJobStatus(self, job, new_status):
        """hook for a master job status update"""
        return

    def updatePartitionStatus(self, partition):
        """ Calculate the correct status of the given partition.
            "completed" and "bad" is never changed here
            "hold" is only changed to "completed" here. """
        # print "updatePartitionStatus ", partition, " transform ", self.id
        # If the partition has status, and is not in a fixed state, check it!
        if partition in self._partition_status and (
                not self._partition_status[partition] in ["bad", "completed"]):
            # if we have no applications, we are in "ready" state
            if not partition in self.getPartitionApps():
                if self._partition_status[partition] != "hold":
                    self._partition_status[partition] = "ready"
            else:
                # Collect the live app statuses for this partition.
                status = [
                    self._app_status[app]
                    for app in self.getPartitionApps()[partition]
                    if app in self._app_status
                    and not self._app_status[app] in ["removed", "killed"]
                ]
                # Check if we have completed this partition
                if "completed" in status:
                    self._partition_status[partition] = "completed"
                # Check if we are not on hold
                elif self._partition_status[partition] != "hold":
                    # Check if we are running
                    running = False
                    for stat in [
                            "completing", "running", "submitted", "submitting"
                    ]:
                        if stat in status:
                            self._partition_status[partition] = "running"
                            running = True
                            break
                    if not running:
                        # Check if we failed
                        #failures = len([stat for stat in status if stat in ["failed","new"]])
                        failures = self.getPartitionFailures(partition)
                        if failures >= self.run_limit:
                            self._partition_status[partition] = "failed"
                        elif failures > 0:
                            self._partition_status[partition] = "attempted"
                        else:
                            # Here we only have some "unknown" applications
                            # This could prove difficult when launching new applications. Care has to be taken
                            # to get the applications out of "unknown" stats as quickly as possible, to avoid double submissions.
                            #logger.warning("Partition with only unknown applications encountered. This is probably not a problem.")
                            self._partition_status[partition] = "ready"
        # Notify the next transform (if any) of the change in input status
        self.notifyNextTransform(partition)
        # Update the Tasks status if necessary
        task = self._getParent()
        if partition in self._partition_status and self._partition_status[
                partition] in ["completed", "bad"
                               ] and self.status == "running":
            # Only mark the whole transform completed when no partition is
            # still outstanding.
            for s in self._partition_status.values():
                if s != "completed" and s != "bad":
                    return
            #self.status = "completed"
            self.updateStatus("completed")
            if task:
                task.updateStatus()
        elif self.status == "completed":
            # A partition re-opened after completion: go back to running.
            for s in self._partition_status.values():
                if s != "completed" and s != "bad":
                    self.updateStatus("running")
                    #self.status = "running"
                    if task:
                        task.updateStatus()
                    return

    def notifyNextTransform(self, partition):
        """ Notify any dependant transforms of the input update """
        task = self._getParent()
        if task and (task.transforms.index(self) + 1 < len(task.transforms)):
            task.transforms[task.transforms.index(self) +
                            1].updateInputStatus(self, partition)

    def setPartitionsStatus(self, partitions, status):
        """ Set the Status of the partitions to "ready", "hold", "bad" or "completed".
            The status is then updated to the status indicated by the applications
            "bad" and "completed" is never changed except to "ignored", "hold" is only changed to "completed".
        """
        if status == "ignored":
            # "ignored" means forgetting the partition entirely.
            [
                self._partition_status.pop(c) for c in partitions
                if c in self._partition_status
            ]
        elif status in ["ready", "hold", "bad", "completed"]:
            for c in partitions:
                self._partition_status[c] = status
        else:
            logger.error(
                "setPartitionsStatus called with invalid status string %s",
                status)
        for c in partitions:
            self.updatePartitionStatus(c)

    def setPartitionsLimit(self, limitpartition):
        """ Set all partitions from and including limitpartition to ignored """
        partitions = [c for c in self._partition_status if c >= limitpartition]
        self.setPartitionsStatus(partitions, "ignored")

    def getPartitionStatus(self, partition):
        # Partitions without an entry are implicitly "ignored".
        if partition in self._partition_status:
            return self._partition_status[partition]
        else:
            return "ignored"

    def getNextPartitions(self, n):
        """Returns the N next partitions to process"""
        partitionlist = sorted(c for c, v in self._partition_status.items()
                               if v in ["ready", "attempted"])
        return partitionlist[:n]

    def getNewAppID(self, partition):
        """ Returns a new application ID and associates this ID with the partition given.
        """
        id = self._next_app_id
        self._app_partition[id] = partition
        if partition in self.getPartitionApps():
            self.getPartitionApps()[partition].append(id)
        else:
            self.getPartitionApps()[partition] = [id]
        self._next_app_id += 1
        return id

    def createNewJob(self, partition):
        """ Returns a new job initialized with the transforms application, backend and name """
        task = self._getParent(
        )  # this works because createNewJob is only called by a task
        id = task.transforms.index(self)
        j = GPI.Job()
        stripProxy(j).backend = self.backend.clone()
        stripProxy(j).application = self.application.clone()
        stripProxy(j).application.tasks_id = "%i:%i" % (task.id, id)
        stripProxy(j).application.id = self.getNewAppID(partition)
        j.inputdata = self.inputdata
        j.outputdata = self.outputdata
        j.inputsandbox = self.inputsandbox
        j.outputsandbox = self.outputsandbox
        j.name = "T%i:%i C%i" % (task.id, id, partition)
        return j

    # Methods that can/should be overridden by derived classes
    def checkCompletedApp(self, app):
        """Can be overriden to improve application completeness checking"""
        return True

    def updateInputStatus(self, ltf, partition):
        """Is called my the last transform (ltf) if the partition 'partition' changes status"""
        # per default no dependencies exist
        pass

    def getJobsForPartitions(self, partitions):
        """This is only an example, this class should be overridden by derived classes"""
        return [self.createNewJob(p) for p in partitions]

    # Information methods
    def fqn(self):
        # Fully-qualified (human-readable) name for log messages.
        task = self._getParent()
        if task:
            return "Task %i Transform %i" % (task.id,
                                             task.transforms.index(self))
        else:
            return "Unassigned Transform '%s'" % (self.name)

    def n_all(self):
        return len(self._partition_status)

    def n_status(self, status):
        return len(
            [cs for cs in self._partition_status.values() if cs == status])

    def overview(self):
        """ Get an ascii art overview over task status. Can be overridden """
        task = self._getParent()
        if not task is None:
            id = str(task.transforms.index(self))
        else:
            id = "?"
        o = markup("#%s: %s '%s'\n" % (id, getName(self), self.name),
                   status_colours[self.status])
        i = 0
        partitions = sorted(self._partition_status.keys())
        for c in partitions:
            s = self._partition_status[c]
            if c in self.getPartitionApps():
                failures = self.getPartitionFailures(c)
                o += markup("%i:%i " % (c, failures), overview_colours[s])
            else:
                o += markup("%i " % c, overview_colours[s])
            i += 1
            # Wrap the display every 20 partitions.
            if i % 20 == 0:
                o += "\n"
        logger.info(o)

    def info(self):
        logger.info(
            markup("%s '%s'" % (getName(self), self.name),
                   status_colours[self.status]))
        logger.info("* backend: %s" % getName(self.backend))
        logger.info("Application:")
        self.application.printTree()

    def getPartitionFailures(self, partition):
        """Return the number of failures for this partition"""
        # "new" counts as a failure here: failed apps are reset to "new".
        return len([
            1 for app in self.getPartitionApps()[partition]
            if app in self._app_status
            and self._app_status[app] in ["new", "failed"]
        ])

    def updateStatus(self, status):
        """Update the transform status"""
        self.status = status
class PhysicalFile(LocalFile):
    '''Class for handling physical files (i.e. PFNs)

    Deprecated: now just a thin wrapper around LocalFile (see __init__
    warnings below).

    Example Usage:
    pfn = PhysicalFile("/some/pfn.file")
    pfn.upload("/some/lfn.file","CERN-USER") # upload the PFN to LFC
    [...etc...]
    '''
    _schema = Schema(
        Version(1, 0), {
            'name':
            SimpleItem(defvalue='', doc='PFN'),
            'namePattern':
            SimpleItem(
                defvalue="", doc='pattern of the file name', transient=1),
            'localDir':
            SimpleItem(
                defvalue="",
                doc=
                'local dir where the file is stored, used from get and put methods',
                transient=1),
            'subfiles':
            ComponentItem(category='gangafiles',
                          defvalue=[],
                          hidden=1,
                          typelist=['Ganga.GPIDev.Lib.File.LocalFile'],
                          sequence=1,
                          copyable=0,
                          doc="collected files from the wildcard namePattern",
                          transient=1),
            'compressed':
            SimpleItem(
                defvalue=False,
                typelist=['bool'],
                protected=0,
                doc=
                'wheather the output file should be compressed before sending somewhere',
                transient=1)
        })
    _category = 'gangafiles'
    _name = 'PhysicalFile'
    _exportmethods = ['location', 'upload']

    def __init__(self, name=''):
        # Expand ~ / env vars etc. before splitting into dir + basename.
        # NOTE: namePattern uses the *unexpanded* name's basename while
        # localDir uses the expanded path's dirname.
        val = full_expand_filename(name)
        super(PhysicalFile, self).__init__(namePattern=val)
        self.namePattern = os.path.basename(name)
        self.localDir = os.path.dirname(val)
        self.name = val
        logger.warning(
            "!!! PhysicalFile has been deprecated, this is now just a wrapper to the LocalFile object"
        )
        logger.warning(
            "!!! Please update your scripts before PhysicalFile is removed")

    def __construct__(self, args):
        # Single-string construction: treat the string as a PFN; anything
        # else is delegated to the LocalFile constructor machinery.
        if (len(args) != 1) or (type(args[0]) is not type('')):
            super(PhysicalFile, self).__construct__(args)
        else:
            self.name = full_expand_filename(args[0])
            val = full_expand_filename(args[0])
            self.localDir = os.path.dirname(val)
            self.namePattern = os.path.basename(val)

    def _attribute_filter__set__(self, n, v):
        # Keep namePattern/localDir in sync whenever 'name' is assigned.
        if n == 'name':
            import os.path
            val = full_expand_filename(v)
            self.name = val
            self.namePattern = os.path.basename(val)
            self.localDir = os.path.dirname(val)
            return val
        return v

    def upload(self, lfn, diracSE, guid=None):
        # NOTE(review): 'diracSE' and 'guid' are accepted but never used —
        # the upload goes to DiracFile.put(force=True) defaults; confirm
        # whether they should be forwarded.
        from GangaDirac.Lib.Files.DiracFile import DiracFile
        diracFile = DiracFile(namePattern=self.name, lfn=lfn)
        diracFile.put(force=True)
        return diracFile
class DiracFile(IGangaFile):
    """
    File stored on a DIRAC storage element

    Usage:

    Some common use cases:
    1) Uploading a file and sending jobs to run over it
    2) Uploading a file to be sent to where your jobs are running
    3) Uploading and removing a file
    4) Removing an existing file from Dirac storage
    5) Change the path of LFN produced by a ganga job.
    6) Accessing a (potentially remote) file known to Dirac through an LFN

    1) To upload a file and submit a job to use it as inputdata:
        df = DiracFile('/path/to/some/local/file')
        df.put()
        j=Job( ... )
        j.inputdata=[df.lfn]
        (The file is now accessible via data.py at the site)

    2) To upload a file and make it available on a workernode:
        df = DiracFile('/path/to/some/local/file')
        df.put(uploadSE = 'CERN-USER')
        j=Job( ... )
        j.inputfiles = [df]
        j.submit()

    3) To upload and then remove a file:
        df = DiracFile('/path/to/some/local/file')
        df.put()
        df.remove()

    4) To remove an existing file already in Dirac storage
        df = DiracFile('LFN:/some/lfn/path')
        df.remove()
       or:
        df = DiracFile(lfn='/some/lfn/path')
        df.remove()

    5) To change an LFN path structure which is produced by Ganga:
        j=Job( ... )
        j.outputfiles=[DiracFile('myAwesomeLFN.ext', remoteDir='myPath_{jid}_{sjid}')]
        j.submit()
       This will produce LFN similar to:
        /lhcb/user/<u>/<user>/myPath_1_2/2017_01/123456/123456789/myAwesomeLFN.ext
       Other possibilities may look like:
        j.outputfiles=[DiracFile('myData.ext', remoteDir='myProject/job{jid}_sj{sjid}')]
        => /lhcb/user/<u>/<user>/myProject/job1_sj2/2017_01/123456/123456789/myData.ext
        j.outputfiles=[DiracFile('myData.ext', remoteDir='myProject')]
        => /lhcb/user/<u>/<user>/myProject/2017_01/123456/123456789/myData.ext
       Alternatively you may change in your .gangarc:
        [DIRAC]
        useGangaPath=True
       This will give you LFN like:
        /lhcb/user/<u>/<user>/GangaJob_13/OutputFiles/2017_01/123456/123456789/myFile.ext
       for all future jobs while this is in your .gangarc config.

    6) Accessing a (potentially remote) file locally known to DIRAC:
        df = DiracFile(lfn='/some/lfn/path')
        ganga_path = df.accessURL()
        **exit ganga**
        root ganga_path # to stream a file over xrootd://
    """
    _schema = Schema(
        Version(1, 1), {
            'namePattern':
            SimpleItem(defvalue="", doc='pattern of the file name'),
            'localDir':
            SimpleItem(
                defvalue=None,
                copyable=1,
                typelist=['str', 'type(None)'],
                doc=
                'local dir where the file is stored, used from get and put methods'
            ),
            'locations':
            SimpleItem(
                defvalue=[],
                copyable=1,
                typelist=['str'],
                sequence=1,
                doc="list of SE locations where the outputfiles are uploaded"),
            'compressed':
            SimpleItem(
                defvalue=False,
                typelist=['bool'],
                protected=0,
                doc=
                'wheather the output file should be compressed before sending somewhere'
            ),
            'lfn':
            SimpleItem(
                defvalue='',
                copyable=1,
                typelist=['str'],
                doc=
                'return the logical file name/set the logical file name to use if not '
                'using wildcards in namePattern'),
            'remoteDir':
            SimpleItem(
                defvalue="",
                doc='remote directory where the LFN is to be placed within '
                'this is the relative path of the LFN which is put between the user LFN base and the filename.'
            ),
            'guid':
            SimpleItem(
                defvalue='',
                copyable=1,
                typelist=['str'],
                doc=
                'return the GUID/set the GUID to use if not using wildcards in the namePattern.'
            ),
            'subfiles':
            ComponentItem(
                category='gangafiles',
                defvalue=[],
                sequence=1,
                copyable=0,
                # hidden=1,
                typelist=['GangaDirac.Lib.Files.DiracFile'],
                doc="collected files from the wildcard namePattern"),
            'defaultSE':
            SimpleItem(
                defvalue='',
                copyable=1,
                doc=
                "defaultSE where the file is to be accessed from or uploaded to"
            ),
            'failureReason':
            SimpleItem(defvalue="",
                       protected=1,
                       copyable=0,
                       doc='reason for the upload failure'),
            'credential_requirements':
            ComponentItem('CredentialRequirement', defvalue='DiracProxy'),
        })
    _env = None
    _category = 'gangafiles'
    _name = "DiracFile"
    _exportmethods = [
        "get", "getMetadata", "getReplicas", 'getSubFiles', 'remove',
        'removeReplica', "replicate", 'put', 'locations', 'location',
        'accessURL', '_updateRemoteURLs', 'hasMatchedFiles'
    ]
    _additional_slots = ['_have_copied', '_remoteURLs', '_storedReplicas']

    def __init__(self,
                 namePattern='',
                 localDir=None,
                 lfn='',
                 remoteDir=None,
                 **kwds):
        """ name is the name of the output file that has to be written ... """
        super(DiracFile, self).__init__()
        self.locations = []
        # lfn/namePattern may be interchangeable ("LFN:..." prefix); this
        # helper sorts out which is which.
        self._setLFNnamePattern(lfn, namePattern)
        if localDir is not None:
            self.localDir = localDir
        if remoteDir is not None:
            self.remoteDir = remoteDir
        self._have_copied = False
        self._remoteURLs = {}
        self._storedReplicas = {}

    def __setattr__(self, attr, value):
        """
        This is an overloaded setter method to make sure that we're auto-expanding the filenames of files which exist.
        In the case we're assigning any other attributes the value is simply passed through
        Args:
            attr (str): This is the name of the attribute which we're assigning
            value (unknown): This is the value being assigned.
        """
        actual_value = value
        if attr == "namePattern":
            # Split an embedded directory part off into localDir.
            this_dir, actual_value = os.path.split(value)
            if this_dir:
                self.localDir = this_dir
        elif attr == 'localDir':
            if value:
                # Only absolutize when the expanded path actually exists.
                new_value = os.path.abspath(expandfilename(value))
                if os.path.exists(new_value):
                    actual_value = new_value
        super(DiracFile, self).__setattr__(attr, actual_value)

    def _attribute_filter__set__(self, name, value):
        if value != "" and value is not None:
            # Do some checking of the filenames in a subprocess
            if name == 'lfn':
                # Assigning an lfn also derives namePattern and remoteDir.
                this_dir, self.namePattern = os.path.split(value)
                if this_dir:
                    self.remoteDir = this_dir
                return value
            elif name == 'namePattern':
                self.localDir, this_name = os.path.split(value)
                return this_name
            elif name == 'localDir':
                if value:
                    return expandfilename(value)
                else:
                    return value
        return value

    def locations(self):
        # NOTE(review): this method shares its name with the 'locations'
        # schema attribute — presumably the GangaObject attribute machinery
        # resolves self.locations to the stored value; as plain Python this
        # would recurse. Confirm intended behaviour.
        return self.locations

    def _setLFNnamePattern(self, lfn="", namePattern=""):
        # Disambiguate lfn vs namePattern: "LFN:"-prefixed strings and
        # paths starting with the defaultSE are treated as LFNs.
        if hasattr(self, 'defaultSE') and self.defaultSE != "":
            ## TODO REPLACE THIS WITH IN LIST OF VONAMES KNOWN
            # Check for /lhcb/some/path or /gridpp/some/path
            if namePattern.split(os.pathsep)[0] == self.defaultSE \
                    or (len(namePattern) > 3 and namePattern[0:4].upper() == "LFN:"\
                    or len(namePattern.split(os.pathsep)) > 1 and namePattern.split(os.pathsep)[1] == self.defaultSE):
                # Check for LFN:/gridpp/some/path or others...
                lfn = namePattern
                namePattern = ""
        if lfn:
            # Strip a leading "LFN:" (case-insensitive).
            if len(lfn) > 3 and lfn[0:4].upper() == "LFN:":
                lfn = lfn[4:]
        elif namePattern:
            if len(namePattern) > 3 and namePattern[0:4].upper() == 'LFN:':
                lfn = namePattern[4:]
        if lfn != "" and namePattern != "":
            self.lfn = lfn
            self.namePattern = namePattern
        elif lfn != "" and namePattern == "":
            self.lfn = lfn
        elif namePattern != "" and lfn == "":
            self.namePattern = namePattern

    def _attribute_filter__get__(self, name):
        # Attempt to spend too long loading un-needed objects into memory in
        # order to read job status
        # NOTE(review): string comparison with 'is' relies on CPython
        # interning — should be '=='.
        if name is 'lfn':
            if not self.lfn:
                logger.warning("Do NOT have an LFN, for file: %s" %
                               self.namePattern)
                logger.warning(
                    "If file exists locally try first using the method put()")
            return object.__getattribute__(self, 'lfn')
        elif name in ['guid', 'locations']:
            if configDirac['DiracFileAutoGet']:
                # NOTE(review): 'if self.guid:' looks inverted — metadata is
                # fetched only when guid is already set; confirm intent.
                if name is 'guid':
                    if self.guid:
                        if self.lfn:
                            self.getMetadata()
                            return object.__getattribute__(self, 'guid')
                elif name is 'locations':
                    if self.locations == []:
                        if self.lfn:
                            self.getMetadata()
                            return object.__getattribute__(self, 'locations')
        return object.__getattribute__(self, name)

    def __repr__(self):
        """Get the representation of the file."""
        return "DiracFile(namePattern='%s', lfn='%s', localDir='%s')" % (
            self.namePattern, self.lfn, self.localDir)

    def getSubFiles(self):
        """
        Returns the subfiles for this instance
        """
        if self.lfn:
            self.setLocation()
        return self.subfiles

    def dirac_line_processor(self, line, dirac_file, localPath):
        """
            Function to interperate the post processor lines.
            This returns False when everything went OK and True on an ERROR
        """
        logger.debug("Calling dirac_line_processor")
        # Line format: ...:::pattern&&name->lfn:::locations:::guid
        tokens = line.strip().split(':::')
        logger.debug("dirac_line_processor: %s" % tokens)
        pattern = tokens[1].split('->')[0].split('&&')[0]
        name = tokens[1].split('->')[0].split('&&')[1]
        lfn = tokens[1].split('->')[1]
        guid = tokens[3]
        try:
            # locations is a python-literal list in the post-process file.
            locations = eval(tokens[2])
        except Exception as err:
            logger.debug("line_process err: %s" % err)
            locations = tokens[2]
        if pattern == name:
            logger.debug("pattern == name")
            logger.error("Failed to parse outputfile data for file '%s'" %
                         name)
            return True
        # This is the case that multiple files were requested
        if pattern == dirac_file.namePattern:
            logger.debug("pattern == dirac_file.namePattern")
            d = DiracFile(namePattern=name, lfn=lfn)
            d.compressed = dirac_file.compressed
            d.guid = guid
            d.locations = locations
            d.localDir = localPath
            dirac_file.subfiles.append(d)
            #dirac_line_processor(line, d)
            return False
        # This is the case that an individual file was requested
        elif name == dirac_file.namePattern:
            logger.debug("name == dirac_file.namePattern")
            if lfn == '###FAILED###':
                dirac_file.failureReason = tokens[2]
                logger.error("Failed to upload file '%s' to Dirac: %s" %
                             (name, dirac_file.failureReason))
                return True
            dirac_file.lfn = lfn
            dirac_file.locations = locations
            dirac_file.guid = guid
            dirac_file.localDir = localPath
            return False
        else:
            logger.debug("False")
            return False

    def setLocation(self):
        """
        Parse the job's post-process locations file and populate this
        file's lfn/locations/subfiles from the matching 'DiracFile' lines.
        """
        logger.debug("DiracFile: setLocation")
        if not stripProxy(self).getJobObject():
            logger.error("No job assocaited with DiracFile: %s" % str(self))
            return
        job = self.getJobObject()
        postprocessLocationsPath = os.path.join(
            job.outputdir,
            getConfig('Output')['PostProcessLocationsFileName'])
        postprocesslocations = None
        try:
            postprocesslocations = open(postprocessLocationsPath, 'r')
            self.subfiles = []
            ## NB remember only do this once at it leaves the 'cursor' at the end of the file - rcurrie
            all_lines = postprocesslocations.readlines()
            logger.debug("lines:\n%s" % all_lines)
            for line in all_lines:
                logger.debug("This line: %s" % line)
                if line.startswith('DiracFile'):
                    # Only an error when the line fails to parse AND the
                    # namePattern has no wildcard.
                    if self.dirac_line_processor(
                            line, self,
                            os.path.dirname(postprocessLocationsPath)
                    ) and regex.search(self.namePattern) is None:
                        logger.error(
                            "Error processing line:\n%s\nAND: namePattern: %s is NOT matched"
                            % (str(line), str(self.namePattern)))
                    else:
                        logger.debug("Parsed the Line")
                else:
                    logger.debug("Skipping the Line")
        except Exception as err:
            logger.warning("unexpected Error: %s" % str(err))
        finally:
            if postprocesslocations is not None:
                postprocesslocations.close()

    def _auto_remove(self):
        """
        Remove called when job is removed as long as config option allows
        """
        if self.lfn != '':
            self.remove()

    @require_credential
    def remove(self):
        """
        Remove this lfn and all replicas from DIRAC LFC/SEs
        """
        if self.lfn == "":
            raise GangaFileError(
                'Can\'t remove a file from DIRAC SE without an LFN.')
        logger.info('Removing file %s' % self.lfn)
        stdout = execute('removeFile("%s")' % self.lfn,
                         cred_req=self.credential_requirements)
        # Clear local bookkeeping after the remote removal.
        self.lfn = ""
        self.locations = []
        self.guid = ''
        return True

    @require_credential
    def removeReplica(self, SE):
        """
        Remove the replica from the given SE
        """
        self.getReplicas()
        if SE not in self.locations:
            raise GangaFileError("No replica at supplied SE: %s" % SE)
        try:
            logger.info("Removing replica at %s for LFN %s" % (SE, self.lfn))
            stdout = execute('removeReplica("%s", "%s")' % (self.lfn, SE),
                             cred_req=self.credential_requirements)
            self.locations.remove(SE)
        except GangaDiracError as err:
            raise err
        return True

    @require_credential
    def getMetadata(self):
        """
        Get Metadata associated with this files lfn. This method will also
        try to automatically set the files guid attribute.
        """
        if self.lfn == "":
            self._optionallyUploadLocalFile()
        # check that it has a replica
        if not self.getReplicas():
            raise GangaFileError("No replica found for this file!")
        # eval again here as datatime not included in dirac_ganga_server
        ret = execute('getMetadata("%s")' % self.lfn,
                      cred_req=self.credential_requirements)
        if self.guid != ret.get('Successful', {}).get(self.lfn, {}).get(
                'GUID', False):
            self.guid = ret['Successful'][self.lfn]['GUID']
        reps = self.getReplicas()
        ret['Successful'][self.lfn].update({'replicas': self.locations})
        return ret

    def _optionallyUploadLocalFile(self):
        """
        Interactively offer to upload a local-only file so it gains an LFN.
        Raises GangaFileError when no lfn can be established.
        """
        if self.lfn != "":
            return
        if self.namePattern != "" and self.lfn == "":
            logger.info(
                "I have a local DiracFile, however you're requesting it's location on the grid"
            )
            logger.info("Shall I upload it to the grid before I continue?")
            decision = raw_input('[y] / n:')
            while not (decision.lower() in ['y', 'n']
                       or decision.lower() == ''):
                decision = raw_input('[y] / n:')
            if decision.lower() in ['y', '']:
                # upload namePattern to grid
                logger.debug("Uploading the file first")
                self.put()
            elif decision == 'n':
                logger.debug("Not uploading now")
                return
            else:
                # do Nothing
                logger.debug("Continuing without uploading file")
            if self.lfn == "":
                raise GangaFileError('Uploading of namePattern: %s failed' %
                                     self.namePattern)
        if self.namePattern == "" and self.lfn == "":
            raise GangaFileError(
                'Cannot do anything if I don\'t have an lfn or a namePattern!')
        return

    @require_credential
    def getReplicas(self, forceRefresh=False):
        """
        Get the list of all SE where this file has a replica
        This relies on an internally stored list of replicas, (SE and  unless forceRefresh = True
        """
        if self.lfn == '':
            self._optionallyUploadLocalFile()
        if self.lfn == '':
            raise GangaFileError(
                "Can't find replicas for file which has no LFN!")
        these_replicas = None
        # Wildcard case: aggregate replicas from all subfiles.
        if len(self.subfiles) != 0:
            allReplicas = []
            for i in self.subfiles:
                allReplicas.append(i.getReplicas())
            these_replicas = allReplicas
        else:
            # deep copy just before wer change it incase we're pointing to the
            # data stored in original from a copy
            if self._have_copied:
                self._storedReplicas = copy.deepcopy(self._storedReplicas)
            if (self._storedReplicas == {}
                    and len(self.subfiles) == 0) or forceRefresh:
                try:
                    self._storedReplicas = execute(
                        'getReplicas("%s")' % self.lfn,
                        cred_req=self.credential_requirements)
                except GangaDiracError as err:
                    logger.error("Couldn't find replicas for: %s" %
                                 str(self.lfn))
                    self._storedReplicas = {}
                    raise
                try:
                    self._storedReplicas = self._storedReplicas['Successful']
                except Exception as err:
                    logger.error("Unknown Error: %s from %s" %
                                 (str(err), self._storedReplicas))
                    raise
                logger.debug("getReplicas: %s" % str(self._storedReplicas))
                if self.lfn in self._storedReplicas:
                    self._updateRemoteURLs(self._storedReplicas)
                    these_replicas = [self._storedReplicas[self.lfn]]
                else:
                    these_replicas = {}
            elif self._storedReplicas != {}:
                these_replicas = [self._storedReplicas[self.lfn]]
        return these_replicas

    def _updateRemoteURLs(self, reps):
        """
        Internal function used for storing all replica information about this LFN at different sites
        """
        if len(self.subfiles) != 0:
            for i in self.subfiles:
                i._updateRemoteURLs(reps)
        else:
            if self.lfn not in reps:
                return
            if self.locations != reps[self.lfn].keys():
                self.locations = reps[self.lfn].keys()
                #logger.debug( "locations: %s" % str( self.locations ) )
                # deep copy just before wer change it incase we're pointing to the
                # data stored in original from a copy
                if self._have_copied:
                    self._remoteURLs = copy.deepcopy(self._remoteURLs)
                for site in self.locations:
                    #logger.debug( "site: %s" % str( site ) )
                    self._remoteURLs[site] = reps[self.lfn][site]
                    #logger.debug("Adding _remoteURLs[site]: %s" % str(self._remoteURLs[site]))

    def location(self):
        """
        Return a list of LFN locations for this DiracFile
        """
        if len(self.subfiles) == 0:
            if self.lfn == "":
                self._optionallyUploadLocalFile()
            else:
                return [self.lfn]
        else:
            # 1 LFN per DiracFile
            # NOTE(review): BUG — the accumulator is created as 'LFNS' but
            # appended to and returned as 'LFNs' (NameError on this path).
            LFNS = []
            for this_file in self.subfiles:
                these_LFNs = this_file.location()
                for this_url in these_LFNs:
                    LFNs.append(this_url)
            return LFNs

    @require_credential
    def accessURL(self, thisSE='', protocol=''):
        """
        Attempt to find an accessURL which corresponds to the specified SE. If no SE is specified then
        return a random one from all the replicas.  Also use the specified protocol - if none then use
        the default.
        """
        lfns = []
        if len(self.subfiles) == 0:
            lfns.append(self.lfn)
        else:
            for i in self.subfiles:
                lfns.append(i.lfn)
        return getAccessURLs(lfns, thisSE, protocol,
                             self.credential_requirements)

    @require_credential
    def internalCopyTo(self, targetPath):
        """
        Retrieves locally the file matching this DiracFile object pattern.
        If localPath is specified
        Args:
            targetPath(str): The path the file should be placed at locally
        """
        to_location = targetPath
        if self.lfn == "":
            raise GangaFileError('Can\'t download a file without an LFN.')
        logger.info("Getting file %s" % self.lfn)
        stdout = execute('getFile("%s", destDir="%s")' %
                         (self.lfn, to_location),
                         cred_req=self.credential_requirements)
        if self.namePattern == "":
            name = os.path.basename(self.lfn)
            # Strip a compression suffix (assumes a 3-char extension like
            # '.gz' — TODO confirm).
            if self.compressed:
                name = name[:-3]
            self.namePattern = name
        if self.guid == "" or not self.locations:
            self.getMetadata()
        return True

    @require_credential
    def replicate(self, destSE, sourceSE=''):
        """
        Replicate an LFN to another SE

        Args:
            destSE (str): the SE to replicate the file to
            sourceSE (str): the se to use as a cource for the file
        """
        if not self.lfn:
            raise GangaFileError('Must supply an lfn to replicate')
        logger.info("Replicating file %s to %s" % (self.lfn, destSE))
        stdout = execute('replicateFile("%s", "%s", "%s")' %
                         (self.lfn, destSE, sourceSE),
                         cred_req=self.credential_requirements)
        if destSE not in self.locations:
            self.locations.append(destSE)

    def processWildcardMatches(self):
        # Wildcards in DiracFile inputs are not supported yet.
        if regex.search(self.namePattern) is not None:
            raise GangaFileError(
                "No wildcards in inputfiles for DiracFile just yet. Dirac are exposing this in API soon."
) @require_credential def put(self, lfn='', force=False, uploadSE="", replicate=False): """ Try to upload file sequentially to storage elements defined in configDirac['allDiracSE']. File will be uploaded to the first SE that the upload command succeeds for. The file is uploaded to the SE described by the DiracFile.defaultSE attribute Alternatively, the user can specify an uploadSE which contains an SE which the file is to be uploaded to. If the user wants to replicate this file(s) across all SE then they should state replicate = True. Return value will be either the stdout from the dirac upload command if not using the wildcard characters '*?[]' in the namePattern. If the wildcard characters are used then the return value will be a list containing newly created DiracFile objects which were the result of glob-ing the wildcards. The objects in this list will have been uploaded or had their failureReason attribute populated if the upload failed. """ if self.lfn != "" and force == False and lfn == '': logger.warning( "Warning you're about to 'put' this DiracFile: %s on the grid as it already has an lfn: %s" % (self.namePattern, self.lfn)) decision = raw_input('y / [n]:') while not (decision.lower() in ['y', 'n'] or decision.lower() == ''): decision = raw_input('y / [n]:') if decision.lower() == 'y': pass else: return if (lfn != '' and self.lfn != '') and force == False: logger.warning( "Warning you're attempting to put this DiracFile: %s" % self.namePattern) logger.warning("It currently has an LFN associated with it: %s" % self.lfn) logger.warning( "Do you want to continue and attempt to upload to: %s" % lfn) decision = raw_input('y / [n]:') while not (decision.lower() in ['y', 'n', '']): decision = raw_input('y / [n]:') if decision.lower() == 'y': pass else: return if lfn and os.path.basename(lfn) != self.namePattern: logger.warning( "Changing namePattern from: '%s' to '%s' during put operation" % (self.namePattern, os.path.basename(lfn))) if lfn: self.lfn = lfn # 
looks like will only need this for the interactive uploading of jobs. # Also if any backend need dirac upload on client then when downloaded # this will upload then delete the file. if self.namePattern == "": if self.lfn != '': logger.warning( "'Put'-ing a file with ONLY an existing LFN makes no sense!" ) raise GangaFileError( 'Can\'t upload a file without a local file name.') sourceDir = self.localDir if self.localDir is None: sourceDir = os.getcwd() # attached to a job, use the joboutputdir if self._parent != None and os.path.isdir( self.getJobObject().outputdir): sourceDir = self.getJobObject().outputdir if not os.path.isdir(sourceDir): raise GangaFileError( 'localDir attribute is not a valid dir, don\'t know from which dir to take the file' ) if regex.search(self.namePattern) is not None: if self.lfn != "": logger.warning( "Cannot specify a single lfn for a wildcard namePattern") logger.warning("LFN will be generated automatically") self.lfn = "" if not self.remoteDir: try: job = self.getJobObject() lfn_folder = os.path.join("GangaJob_%s" % job.getFQID('/'), "OutputFiles") except AssertionError: t = datetime.datetime.now() this_date = t.strftime("%H.%M_%A_%d_%B_%Y") lfn_folder = os.path.join('GangaFiles_%s' % this_date) lfn_base = os.path.join( DiracFile.diracLFNBase(self.credential_requirements), lfn_folder) else: lfn_base = os.path.join( DiracFile.diracLFNBase(self.credential_requirements), self.remoteDir) if uploadSE == "": if self.defaultSE != "": storage_elements = [self.defaultSE] else: if configDirac['allDiracSE']: storage_elements = [ random.choice(configDirac['allDiracSE']) ] else: raise GangaFileError( "Can't upload a file without a valid defaultSE or storageSE, please provide one" ) elif isinstance(uploadSE, list): storage_elements = uploadSE else: storage_elements = [uploadSE] outputFiles = GangaList() for this_file in glob.glob(os.path.join(sourceDir, self.namePattern)): name = this_file if not os.path.exists(name): if not self.compressed: raise 
GangaFileError( 'Cannot upload file. File "%s" must exist!' % name) name += '.gz' if not os.path.exists(name): raise GangaFileError('File "%s" must exist!' % name) else: if self.compressed: os.system('gzip -c %s > %s.gz' % (name, name)) name += '.gz' if not os.path.exists(name): raise GangaFileError('File "%s" must exist!' % name) lfn = os.path.join(lfn_base, os.path.basename(this_file)) d = DiracFile() d.namePattern = os.path.basename(name) d.compressed = self.compressed d.localDir = sourceDir stderr = '' stdout = '' logger.info('Uploading file \'%s\' to \'%s\' as \'%s\'' % (name, storage_elements[0], lfn)) logger.debug('execute: uploadFile("%s", "%s", %s)' % (lfn, os.path.join(sourceDir, name), str([storage_elements[0]]))) try: stdout = execute('uploadFile("%s", "%s", %s)' % (lfn, os.path.join(sourceDir, name), str([storage_elements[0]])), cred_req=self.credential_requirements) except GangaDiracError as err: logger.warning("Couldn't upload file '%s': \'%s\'" % (os.path.basename(name), err)) failureReason = "Error in uploading file '%s' : '%s'" % ( os.path.basename(name), err) if regex.search(self.namePattern) is not None: d.failureReason = failureReason outputFiles.append(d) continue self.failureReason += '\n' + failureReason continue stdout_temp = stdout.get('Successful') if not stdout_temp: msg = "Couldn't upload file '%s': \'%s\'" % ( os.path.basename(name), stdout) logger.warning(msg) if regex.search(self.namePattern) is not None: d.failureReason = msg outputFiles.append(d) continue self.failureReason = msg continue else: lfn_out = stdout_temp[lfn] # when doing the two step upload delete the temp file if self.compressed or self._parent != None: os.remove(name) # need another eval as datetime needs to be included. 
guid = lfn_out.get('GUID', '') if regex.search(self.namePattern) is not None: d.lfn = lfn d.remoteDir = os.path.dirname(lfn) d.locations = lfn_out.get('allDiracSE', '') d.guid = guid outputFiles.append(d) continue else: self.lfn = lfn self.remoteDir = os.path.dirname(lfn) self.locations = lfn_out.get('allDiracSE', '') self.guid = guid if replicate == True: if len(outputFiles) == 1 or len(outputFiles) == 0: storage_elements.pop(0) for se in storage_elements: self.replicate(se) else: storage_elements.pop(0) for this_file in outputFiles: for se in storage_elements: this_file.replicate(se) if len(outputFiles) > 0: return outputFiles else: outputFiles.append(self) return outputFiles def getWNScriptDownloadCommand(self, indent): script_location = os.path.join( os.path.dirname( os.path.abspath(inspect.getfile(inspect.currentframe()))), 'downloadScript.py.template') download_script = FileUtils.loadScript(script_location, '') script = """\n download_script='''\n###DOWNLOAD_SCRIPT###''' import subprocess dirac_env=###DIRAC_ENV### subprocess.Popen('''python -c "import sys\nexec(sys.stdin.read())"''', shell=True, env=dirac_env, stdin=subprocess.PIPE).communicate(download_script) """ script = '\n'.join( [str(indent + str(line)) for line in script.split('\n')]) replace_dict = { '###DOWNLOAD_SCRIPT###': download_script, '###DIRAC_ENV###': self._getDiracEnvStr(), '###LFN###': self.lfn } for k, v in replace_dict.iteritems(): script = script.replace(str(k), str(v)) return script def _getDiracEnvStr(self): diracEnv = str(getDiracEnv(self.credential_requirements.dirac_env)) return diracEnv def _WN_wildcard_script(self, namePattern, lfnBase, compressed): wildcard_str = """ for f in glob.glob('###NAME_PATTERN###'): processes.append(uploadFile(os.path.basename(f), '###LFN_BASE###', ###COMPRESSED###, '###NAME_PATTERN###')) """ wildcard_str = FileUtils.indentScript(wildcard_str, '###INDENT###') replace_dict = { '###NAME_PATTERN###': namePattern, '###LFN_BASE###': lfnBase, 
'###COMPRESSED###': compressed } for k, v in replace_dict.iteritems(): wildcard_str = wildcard_str.replace(str(k), str(v)) return wildcard_str def getWNInjectedScript(self, outputFiles, indent, patternsToZip, postProcessLocationsFP): """ Returns script that have to be injected in the jobscript for postprocessing on the WN """ script_path = os.path.dirname( os.path.abspath(inspect.getfile(inspect.currentframe()))) script_location = os.path.join(script_path, 'uploadScript.py.template') upload_script = FileUtils.loadScript(script_location, '') WNscript_location = os.path.join(script_path, 'WNInjectTemplate.py.template') script = FileUtils.loadScript(WNscript_location, '') if not self.remoteDir: try: job = self.getJobObject() lfn_folder = os.path.join("GangaJob_%s" % job.getFQID('.'), "OutputFiles") except AssertionError: t = datetime.datetime.now() this_date = t.strftime("%H.%M_%A_%d_%B_%Y") lfn_folder = os.path.join('GangaFiles_%s' % this_date) lfn_base = os.path.join( DiracFile.diracLFNBase(self.credential_requirements), lfn_folder) else: lfn_base = oa.path.join( DiracFile.diracLFNBase(self.credential_requirements), self.remoteDir) for this_file in outputFiles: isCompressed = this_file.namePattern in patternsToZip if not regex.search(this_file.namePattern) is None: script += self._WN_wildcard_script(this_file.namePattern, lfn_base, str(isCompressed)) else: script += '###INDENT###print("Uploading: %s as: %s")\n' % ( this_file.namePattern, str(os.path.join(lfn_base, this_file.namePattern))) script += '###INDENT###processes.append(uploadFile("%s", "%s", %s))\n' % ( this_file.namePattern, lfn_base, str(isCompressed)) if stripProxy(self)._parent is not None and stripProxy( self).getJobObject() and getName( stripProxy(self).getJobObject().backend) != 'Dirac': script_env = self._getDiracEnvStr() else: script_env = str(None) script = '\n'.join( [str('###INDENT###' + str(line)) for line in script.split('\n')]) replace_dict = { '###UPLOAD_SCRIPT###': upload_script, 
'###STORAGE_ELEMENTS###': str(configDirac['allDiracSE']), '###INDENT###': indent, '###LOCATIONSFILE###': postProcessLocationsFP, '###DIRAC_ENV###': script_env } for k, v in replace_dict.iteritems(): script = script.replace(str(k), str(v)) return script def hasMatchedFiles(self): if self.lfn != "" and self.namePattern != "": if self.namePattern == os.path.basename(self.lfn): return True else: logger.error("LFN doesn't match namePattern for file: %s" % str(self.namePattern)) return False elif len(self.subfiles) > 0 and regex.search( self.namePattern) is not None: return True else: logger.error("Failed to Match file:\n%s" % str(self)) return False @staticmethod def diracLFNBase(credential_requirements): """ Compute a sensible default LFN base name If ``DiracLFNBase`` has been defined, use that. Otherwise, construct one from the user name and the user VO Args: credential_requirements (DiracProxy): This is the credential which governs how we should format the path """ if configDirac['DiracLFNBase']: return configDirac['DiracLFNBase'] user = DiracProxyInfo(credential_requirements).username return '/{0}/user/{1}/{2}'.format(configDirac['userVO'], user[0], user)
class ITransform(GangaObject):
    """Base class for all Task transforms: owns a list of units, creates new
    units from its input data, and drives their (re)submission lifecycle."""

    _schema = Schema(Version(1, 0), {
        'status': SimpleItem(defvalue='new', protected=1, copyable=1, doc='Status - running, pause or completed', typelist=[str]),
        'name': SimpleItem(defvalue='Simple Transform', doc='Name of the transform (cosmetic)', typelist=[str]),
        'application': ComponentItem('applications', defvalue=None, optional=1, load_default=False, doc='Application of the Transform.'),
        'inputsandbox': FileItem(defvalue=[], sequence=1, doc="list of File objects shipped to the worker node "),
        'outputsandbox': SimpleItem(defvalue=[], typelist=[str], sequence=1, doc="list of filenames or patterns shipped from the worker node"),
        'backend': ComponentItem('backends', defvalue=None, optional=1, load_default=False, doc='Backend of the Transform.'),
        'splitter': ComponentItem('splitters', defvalue=None, optional=1, load_default=False, doc='Splitter used on each unit of the Transform.'),
        'postprocessors': ComponentItem('postprocessor', defvalue=None, doc='list of postprocessors to run after job has finished'),
        'merger': ComponentItem('mergers', defvalue=None, hidden=1, copyable=0, load_default=0, optional=1, doc='Merger to be done over all units when complete.'),
        'unit_merger': ComponentItem('mergers', defvalue=None, load_default=0, optional=1, doc='Merger to be copied and run on each unit separately.'),
        'copy_output': ComponentItem('datasets', defvalue=None, load_default=0, optional=1, doc='The dataset to copy all units output to, e.g. Grid dataset -> Local Dataset'),
        'unit_copy_output': ComponentItem('datasets', defvalue=None, load_default=0, optional=1, doc='The dataset to copy each individual unit output to, e.g. Grid dataset -> Local Dataset'),
        'run_limit': SimpleItem(defvalue=8, doc='Number of times a partition is tried to be processed.', protected=1, typelist=[int]),
        'minor_run_limit': SimpleItem(defvalue=3, doc='Number of times a unit can be resubmitted', protected=1, typelist=[int]),
        'major_run_limit': SimpleItem(defvalue=3, doc='Number of times a junit can be rebrokered', protected=1, typelist=[int]),
        'units': ComponentItem('units', defvalue=[], sequence=1, copyable=1, doc='list of units'),
        'inputdata': ComponentItem('datasets', defvalue=[], sequence=1, protected=1, optional=1, load_default=False, doc='Input datasets to run over'),
        'outputdata': ComponentItem('datasets', defvalue=None, optional=1, load_default=False, doc='Output dataset template'),
        'inputfiles': GangaFileItem(defvalue=[], sequence=1, doc="list of file objects that will act as input files for a job"),
        'outputfiles': GangaFileItem(defvalue=[], sequence=1, doc="list of OutputFile objects to be copied to all jobs"),
        'metadata': ComponentItem('metadata', defvalue=MetadataDict(), doc='the metadata', protected=1),
        'rebroker_on_job_fail': SimpleItem(defvalue=True, doc='Rebroker if too many minor resubs'),
        'abort_loop_on_submit': SimpleItem(defvalue=True, doc='Break out of the Task Loop after submissions'),
        'required_trfs': SimpleItem(defvalue=[], typelist=[int], sequence=1, doc="IDs of transforms that must complete before this unit will start. NOTE DOESN'T COPY OUTPUT DATA TO INPUT DATA. Use TaskChainInput Dataset for that."),
        'chain_delay': SimpleItem(defvalue=0, doc='Minutes delay between a required/chained unit completing and starting this one', protected=0, typelist=[int]),
        'submit_with_threads': SimpleItem(defvalue=False, doc='Use Ganga Threads for submission'),
        'max_active_threads': SimpleItem(defvalue=10, doc='Maximum number of Ganga Threads to use. Note that the number of simultaneous threads is controlled by the queue system (default is 5)'),
        'info': SimpleItem(defvalue=[], typelist=[str], protected=1, sequence=1, doc="Info showing status transitions and unit info"),
        'id': SimpleItem(defvalue=-1, protected=1, doc='ID of the Transform', typelist=[int]),
        #'force_single_unit' : SimpleItem(defvalue=False, doc='Force all input data into one Unit'),
    })

    _category = 'transforms'
    _name = 'ITransform'
    _exportmethods = ['addInputData', 'resetUnit', 'setRunLimit', 'getJobs', 'setMinorRunLimit',
                      'setMajorRunLimit', 'getID', 'overview', 'resetUnitsByStatus', 'removeUnusedJobs',
                      'showInfo', 'showUnitInfo', 'pause', 'n_all', 'n_status']
    _hidden = 0

    def showInfo(self):
        """Print out the info in a nice way"""
        # NOTE(review): the schema item 'info' shares its name with the
        # info() method below — relies on GangaObject schema access; confirm.
        print("\n".join(self.info))

    def showUnitInfo(self, uid):
        """Print out the given unit info in a nice way"""
        self.units[uid].showInfo()

    def getJobs(self):
        """Return a list of the currently active job ids"""
        joblist = []
        for u in self.units:
            joblist += u.active_job_ids
        return joblist

    def setMinorRunLimit(self, newRL):
        """Set the number of times a job will be resubmitted before a major resubmit is attempted"""
        self.minor_run_limit = newRL

    def setMajorRunLimit(self, newRL):
        """Set the number of times a job will be rebrokered before the transform is paused"""
        self.major_run_limit = newRL

    def setRunLimit(self, newRL):
        """Set the total (minor+major) number of times a job should be resubmitted before the transform is paused"""
        self.run_limit = newRL

    def overview(self, status=''):
        """Show the status of the units in this transform"""
        for unit in self.units:
            # display colour given state
            o = ""
            o += ("%d: " % self.units.index(unit)) + unit.name

            # is unit active?
            if unit.active:
                o += " " * (40 - len(o) + 3) + "*"
            else:
                o += " " * (40 - len(o) + 3) + "-"

            # sub job status
            o += "\t %i" % unit.n_status("submitted")
            o += "\t %i" % unit.n_status("running")
            o += "\t %i" % unit.n_status("completed")
            o += "\t %i" % unit.n_status("failed")
            o += "\t %i" % unit.minor_resub_count
            o += "\t %i" % unit.major_resub_count

            # change colour on state
            if unit.status == 'completed':
                o = markup(o, overview_colours["completed"])
            elif not unit.active:
                o = markup(o, overview_colours["bad"])
            elif unit.status == "recreating":
                o = markup(o, overview_colours["attempted"])
            elif len(unit.active_job_ids) == 0:
                o = markup(o, overview_colours["hold"])
            else:
                o = markup(o, overview_colours["running"])

            print(o)

    # Special methods:
    def __init__(self):
        super(ITransform, self).__init__()
        self.initialize()

    def _auto__init__(self):
        self.status = 'new'

    def _readonly(self):
        """A transform is read-only if the status is not new."""
        if self.status == "new":
            return 0
        return 1

    def initialize(self):
        # default to a local backend; overridden by derived transforms
        from Ganga.Lib.Localhost.Localhost import Localhost
        self.backend = Localhost()

    def check(self):
        """Check this transform has valid data, etc. and has the correct units"""

        # ignore anything but new transforms
        if self.status != "new":
            return

        # first, validate the transform
        if not self.validate():
            raise ApplicationConfigurationError(None, "Validate failed for Transform %s" % self.name)

        self.updateStatus("running")

    def startup(self):
        """This function is used to set the status after restarting Ganga"""
        pass

    # Public methods
    def resetUnit(self, uid):
        """Reset the given unit"""
        addInfoString(self, "Reseting Unit %i" % (uid))

        # FIX: the original referenced the loop variable `u` after the loop,
        # which raised NameError for an empty unit list and ran the chained
        # reset against the *last* unit when `uid` matched nothing.
        reset_unit = None
        for u in self.units:
            if u.getID() == uid:
                u.reset()
                reset_unit = u
                break

        if reset_unit is not None:
            # find any chained units and mark for recreation
            for trf in self._getParent().transforms:
                for u2 in trf.units:
                    for req in u2.req_units:
                        if req == "%d:%d" % (self.getID(), reset_unit.getID()) or req == "%d:ALL" % (self.getID()):
                            trf.resetUnit(u2.getID())

        self.updateStatus("running")

    def getID(self):
        """Return the index of this trf in the parent task"""

        # if the id isn't already set, use the index from the parent Task
        if self.id < 0:
            task = self._getParent()
            if not task:
                raise ApplicationConfigurationError(None, "This transform has not been associated with a task and so there is no ID available")
            self.id = task.transforms.index(self)

        return self.id

    def run(self, check=True):
        """Sets this transform to running status"""
        if self.status == "new" and check:
            self.check()
        if self.status != "completed":
            self.updateStatus("running")
            task = self._getParent()
            if task:
                task.updateStatus()
        else:
            logger.warning("Transform is already completed!")

    def update(self):
        """Called by the parent task to check for status updates, submit jobs, etc."""
        if self.status == "pause" or self.status == "new":
            return 0

        # check for complete required units
        task = self._getParent()
        for trf_id in self.required_trfs:
            if task.transforms[trf_id].status != "completed":
                return 0

        # set the start time if not already set
        if len(self.required_trfs) > 0 and self.units[0].start_time == 0:
            for unit in self.units:
                unit.start_time = time.time() + self.chain_delay * 60 - 1

        # report the info for this transform
        unit_status = {"new": 0, "hold": 0, "running": 0, "completed": 0, "bad": 0, "recreating": 0}
        for unit in self.units:
            unit_status[unit.status] += 1

        info_str = "Unit overview: %i units, %i new, %i hold, %i running, %i completed, %i bad. to_sub %i" % (len(self.units), unit_status["new"], unit_status["hold"],
                                                                                                              unit_status["running"], unit_status["completed"],
                                                                                                              unit_status["bad"], self._getParent().n_tosub())

        addInfoString(self, info_str)

        # ask the unit splitter if we should create any more units given the
        # current data
        self.createUnits()

        # loop over units and update them ((re)submits will be called here)
        old_status = self.status
        unit_status_list = []

        # find submissions first
        unit_update_list = []
        for unit in self.units:
            if not unit.checkForSubmission() and not unit.checkForResubmission():
                unit_update_list.append(unit)
                continue

            if unit.update() and self.abort_loop_on_submit:
                logger.info("Unit %d of transform %d, Task %d has aborted the loop" % (unit.getID(), self.getID(), task.id))
                return 1

            unit_status_list.append(unit.status)

        # now check for download
        for unit in unit_update_list:
            if unit.update() and self.abort_loop_on_submit:
                logger.info("Unit %d of transform %d, Task %d has aborted the loop" % (unit.getID(), self.getID(), task.id))
                return 1

            unit_status_list.append(unit.status)

        from Ganga.GPIDev.Lib.Tasks.TaskChainInput import TaskChainInput
        # check for any TaskChainInput completions
        for ds in self.inputdata:
            if isType(ds, TaskChainInput) and ds.input_trf_id != -1:
                if task.transforms[ds.input_trf_id].status != "completed":
                    return 0

        # update status and check
        for state in ['running', 'hold', 'bad', 'completed']:
            if state in unit_status_list:
                if state == 'hold':
                    state = "running"
                if state != self.status:
                    self.updateStatus(state)
                break

    def createUnits(self):
        """Create new units if required given the inputdata"""

        from Ganga.GPIDev.Lib.Tasks.TaskChainInput import TaskChainInput
        # check for chaining
        for ds in self.inputdata:
            if isType(ds, TaskChainInput) and ds.input_trf_id != -1:

                # check for single unit
                if ds.single_unit:

                    # is there a unit already linked?
                    done = False
                    rec_unit = None
                    for out_unit in self.units:
                        if '%d:ALL' % (ds.input_trf_id) in out_unit.req_units:
                            done = True
                            # check if the unit is being recreated
                            if out_unit.status == "recreating":
                                rec_unit = out_unit
                            break

                    if not done or rec_unit:
                        new_unit = self.createChainUnit(self._getParent().transforms[ds.input_trf_id].units, ds.use_copy_output)
                        if new_unit:
                            self.addChainUnitToTRF(new_unit, ds, -1, prev_unit=rec_unit)

                else:

                    # loop over units in parent trf and create units as
                    # required
                    for in_unit in self._getParent().transforms[ds.input_trf_id].units:

                        # is there a unit already linked?
                        done = False
                        rec_unit = None
                        for out_unit in self.units:
                            if '%d:%d' % (ds.input_trf_id, in_unit.getID()) in out_unit.req_units:
                                done = True
                                # check if the unit is being recreated
                                if out_unit.status == "recreating":
                                    rec_unit = out_unit
                                break

                        if not done or rec_unit:
                            new_unit = self.createChainUnit([in_unit], ds.use_copy_output)
                            if new_unit:
                                self.addChainUnitToTRF(new_unit, ds, in_unit.getID(), prev_unit=rec_unit)

    def createChainUnit(self, parent_units, use_copy_output=True):
        """Create a chained unit given the parent outputdata"""
        return IUnit()

    def addChainUnitToTRF(self, unit, inDS, unit_id=-1, prev_unit=None):
        """Add a chained unit to this TRF. Override for more control"""
        if unit_id == -1:
            unit.req_units.append('%d:ALL' % (inDS.input_trf_id))
            unit.name = "Parent: TRF %d, All Units" % (inDS.input_trf_id)
        else:
            unit.req_units.append('%d:%d' % (inDS.input_trf_id, unit_id))
            unit.name = "Parent: TRF %d, Unit %d" % (inDS.input_trf_id, unit_id)

        self.addUnitToTRF(unit, prev_unit)

    def addInputData(self, inDS):
        """Add the given input dataset to the list"""
        self.inputdata.append(inDS)

    def pause(self):
        """Pause the task - the background thread will not submit new jobs from this task"""
        if self.status != "completed":
            self.updateStatus("pause")
            #self.status = "pause"
            task = self._getParent()
            if task:
                task.updateStatus()
        else:
            logger.debug("Transform is already completed!")

    def setRunlimit(self, newRL):
        """Set the number of times a job should be resubmitted before the transform is paused"""
        self.run_limit = newRL
        logger.debug("Runlimit set to %i", newRL)

    # Methods that can/should be overridden by derived classes
    def validate(self):
        """Override this to validate that the transform is OK"""

        from Ganga.GPIDev.Lib.Tasks.TaskLocalCopy import TaskLocalCopy
        # make sure a path has been selected for any local downloads
        if self.unit_copy_output is not None and isType(self.unit_copy_output, TaskLocalCopy):
            if self.unit_copy_output.local_location == '':
                logger.error("No path selected for Local Output Copy")
                return False

        if self.copy_output is not None and isType(self.copy_output, TaskLocalCopy):
            if self.copy_output.local_location == '':
                logger.error("No path selected for Local Output Copy")
                return False

        # this is a generic trf so assume the application and splitter will do
        # all the work
        return True

    def addUnitToTRF(self, unit, prev_unit=None):
        """Add a unit to this Transform given the input and output data"""
        if not unit:
            raise ApplicationConfigurationError(None, "addUnitTOTRF failed for Transform %d (%s): No unit specified" % (self.getID(), self.name))

        addInfoString(self, "Adding Unit to TRF...")
        unit.updateStatus("hold")
        unit.active = True
        if prev_unit:
            # recreating: replace the old unit in place, keeping its job history
            unit.prev_job_ids += prev_unit.prev_job_ids
            self.units[prev_unit.getID()] = unit
        else:
            self.units.append(unit)
            stripProxy(unit).id = len(self.units) - 1

    # Information methods
    def fqn(self):
        task = self._getParent()
        if task:
            return "Task %i Transform %i" % (task.id, task.transforms.index(self))
        else:
            return "Unassigned Transform '%s'" % (self.name)

    def n_active(self):
        return sum([u.n_active() for u in self.units])

    def n_all(self):
        return sum([u.n_all() for u in self.units])

    def n_status(self, status):
        return sum([u.n_status(status) for u in self.units])

    def info(self):
        logger.info(markup("%s '%s'" % (getName(self), self.name), status_colours[self.status]))
        logger.info("* backend: %s" % getName(self.backend))
        logger.info("Application:")
        self.application.printTree()

    def updateStatus(self, status):
        """Update the transform status"""
        self.status = status

    def createUnitCopyOutputDS(self, unit_id):
        """Create a the Copy Output dataset to use with this unit. Overload to handle more than the basics"""

        from Ganga.GPIDev.Lib.Tasks.TaskLocalCopy import TaskLocalCopy
        # FIX: check was inverted — it warned-and-returned for TaskLocalCopy,
        # the one dataset type this default implementation *does* handle, and
        # then tried to clone unhandled types below.
        if not isType(self.unit_copy_output, TaskLocalCopy):
            logger.warning("Default implementation of createUnitCopyOutputDS can't handle datasets of type '%s'" % getName(self.unit_copy_output))
            return

        # create copies of the Copy Output DS and add Unit name to path
        self.units[unit_id].copy_output = self.unit_copy_output.clone()
        self.units[unit_id].copy_output.local_location = os.path.join(
            self.unit_copy_output.local_location, self.units[unit_id].name.replace(":", "_").replace(" ", "").replace(",", "_"))

    def __setattr__(self, attr, value):

        if attr == 'outputfiles':

            if value != []:
                if self.outputdata is not None:
                    logger.error('ITransform.outputdata is set, you can\'t set ITransform.outputfiles')
                    return
                elif self.outputsandbox != []:
                    logger.error('ITransform.outputsandbox is set, you can\'t set ITransform.outputfiles')
                    return

            # reduce duplicate values here, leave only duplicates for LCG,
            # where we can have replicas
            uniqueValuesDict = []
            uniqueValues = []

            for val in value:
                key = '%s%s' % (getName(val), val.namePattern)
                if key not in uniqueValuesDict:
                    uniqueValuesDict.append(key)
                    uniqueValues.append(val)
                elif getName(val) == 'LCGSEFile':
                    uniqueValues.append(val)

            super(ITransform, self).__setattr__(attr, uniqueValues)

        elif attr == 'inputfiles':

            if value != []:
                if self.inputsandbox != []:
                    logger.error('ITransform.inputsandbox is set, you can\'t set ITransform.inputfiles')
                    return

            super(ITransform, self).__setattr__(attr, value)

        elif attr == 'outputsandbox':

            if value != []:

                if getConfig('Output')['ForbidLegacyOutput']:
                    logger.error('Use of ITransform.outputsandbox is forbidden, please use ITransform.outputfiles')
                    return

                if self.outputfiles != []:
                    logger.error('ITransform.outputfiles is set, you can\'t set ITransform.outputsandbox')
                    return

            super(ITransform, self).__setattr__(attr, value)

        elif attr == 'inputsandbox':

            if value != []:

                if getConfig('Output')['ForbidLegacyInput']:
                    logger.error('Use of ITransform.inputsandbox is forbidden, please use ITransform.inputfiles')
                    return

                if self.inputfiles != []:
                    logger.error('ITransform.inputfiles is set, you can\'t set ITransform.inputsandbox')
                    return

            super(ITransform, self).__setattr__(attr, value)

        elif attr == 'outputdata':

            if value is not None:

                if getConfig('Output')['ForbidLegacyOutput']:
                    logger.error('Use of ITransform.outputdata is forbidden, please use ITransform.outputfiles')
                    return

                if self.outputfiles != []:
                    logger.error('ITransform.outputfiles is set, you can\'t set ITransform.outputdata')
                    return
            super(ITransform, self).__setattr__(attr, value)

        else:
            super(ITransform, self).__setattr__(attr, value)

    def resetUnitsByStatus(self, status='bad'):
        """Reset all units of a given status"""
        for unit in self.units:
            if unit.status == status:
                logger.info("Resetting Unit %d, Transform %d..." % (unit.getID(), self.getID()))
                self.resetUnit(unit.getID())

    def checkUnitsAreCompleted(self, parent_units):
        """Check the given parent units are complete"""
        for parent in parent_units:
            if len(parent.active_job_ids) == 0 or parent.status != "completed":
                return False
        return True

    def getChainInclExclMasks(self, parent_units):
        """return the include/exclude masks from the TaskChainInput"""
        incl_pat_list = []
        excl_pat_list = []
        from Ganga.GPIDev.Lib.Tasks.TaskChainInput import TaskChainInput
        for parent in parent_units:
            for inds in self.inputdata:
                if isType(inds, TaskChainInput) and inds.input_trf_id == parent._getParent().getID():
                    incl_pat_list += inds.include_file_mask
                    excl_pat_list += inds.exclude_file_mask
        return incl_pat_list, excl_pat_list

    def getParentUnitJobs(self, parent_units, include_subjobs=True):
        """Return the list of parent jobs"""
        job_list = []
        for parent in parent_units:
            job = getJobByID(parent.active_job_ids[0])
            # FIX: the include_subjobs flag was previously ignored — subjobs
            # were always expanded regardless of the caller's request.
            if include_subjobs and job.subjobs:
                job_list += job.subjobs
            else:
                job_list += [job]

        return job_list

    def removeUnusedJobs(self):
        """Remove all jobs that aren't being used, e.g. failed jobs"""
        for unit in self.units:
            for jid in unit.prev_job_ids:
                try:
                    logger.warning("Removing job '%d'..." % jid)
                    job = getJobByID(jid)
                    job.remove()
                except Exception as err:
                    logger.debug("removeUnused: %s" % str(err))
                    logger.error("Problem removing job '%d'" % jid)
class Task(GangaObject):
    """This is a Task without special properties.

    A Task is an ordered list of transforms managed as one unit: it is
    registered in the 'tasks' registry, tracks an aggregate status derived
    from its transforms, and throttles job submission via 'float'.
    """
    _schema = Schema(
        Version(1, 0), {
            'transforms': ComponentItem('transforms', defvalue=[], sequence=1, copyable=1,
                                        doc='list of transforms'),
            'id': SimpleItem(defvalue=-1, protected=1, doc='ID of the Task', typelist=["int"]),
            'name': SimpleItem(defvalue='NewTask', copyable=1, doc='Name of the Task', typelist=["str"]),
            'comment': SimpleItem('', protected=0, doc='comment of the task', typelist=["str"]),
            'status': SimpleItem(defvalue='new', protected=1,
                                 doc='Status - new, running, pause or completed', typelist=["str"]),
            'float': SimpleItem(defvalue=0, copyable=1, doc='Number of Jobs run concurrently',
                                typelist=["int"]),
            'resub_limit': SimpleItem(
                defvalue=0.9, copyable=1,
                doc='Resubmit only if the number of running jobs is less than "resub_limit" times the float. This makes the job table clearer, since more jobs can be submitted as subjobs.',
                typelist=["float"]),
            'creation_date': SimpleItem(defvalue="19700101", copyable=0, hidden=1,
                                        doc='Creation date of the task (used in dq2 datasets)',
                                        typelist=["str"]),
        })
    _category = 'tasks'
    _name = 'Task'
    _exportmethods = [
        # Settings
        'setBackend', 'setParameter', 'insertTransform', 'appendTransform', 'removeTransform',
        # Operations
        'check', 'run', 'pause', 'remove',
        # Info
        'overview', 'info', 'n_all', 'n_status', 'help', 'getJobs', 'table',
        # Helper
        'float_all', 'run_all'
    ]

    default_registry = "tasks"

    # Special methods:
    def _auto__init__(self, registry=None):
        """Register the new task in its registry and run startup hooks."""
        if registry is None:
            from Ganga.Core.GangaRepository import getRegistry
            registry = getRegistry(self.default_registry)
        # register the job (it will also commit it)
        # job gets its id now
        registry._add(self)
        self.creation_date = time.strftime('%Y%m%d%H%M%S')
        self.initialize()
        self.startup()
        self._setDirty()

    def initialize(self):
        """Hook for subclasses; called once when the task is first created."""
        pass

    def startup(self):
        """Startup function on Ganga startup"""
        for t in self.transforms:
            t.startup()

    # def _readonly(self):
    #    """A task is read-only if the status is not new."""
    #    if self.status == "new":
    #        return 0
    #    return 1

    # Public methods:
    #
    # - remove() a task
    # - clone() a task
    # - check() a task (if updated)
    # - run() a task to start processing
    # - pause() to interrupt processing
    # - setBackend(be) for all transforms
    # - setParameter(myParam=True) for all transforms
    # - insertTransform(id, tf) insert a new processing step
    # - removeTransform(id) remove a processing step

    def remove(self, remove_jobs="do_nothing"):
        """Delete the task"""
        # The string sentinel default forces the user to explicitly choose
        # whether the associated jobs are removed too; any value other than
        # True/False just prints instructions and aborts.
        if remove_jobs not in (True, False):
            logger.info("You want to remove the task %i named '%s'." %
                        (self.id, self.name))
            logger.info(
                "Since this operation cannot be easily undone, please call this command again:"
            )
            logger.info(
                " * as tasks(%i).remove(remove_jobs=True) if you want to remove all associated jobs,"
                % (self.id))
            logger.info(
                " * as tasks(%i).remove(remove_jobs=False) if you want to keep the jobs."
                % (self.id))
            return
        if remove_jobs:
            for j in GPI.jobs:
                try:
                    # jobs remember their task as "...:<task_id>:<transform_id>"
                    stid = j.application.tasks_id.split(":")
                    if int(stid[-2]) == self.id:
                        j.remove()
                except Exception as err:
                    # jobs without a parseable tasks_id are not part of a task
                    logger.debug("Task remove_jobs task split Error!")
                    logger.debug("Error:\n%s" % str(err))
                    pass
        self._getRegistry()._remove(self)
        logger.info("Task #%s deleted" % self.id)

    def clone(self):
        """Return a copy of this task with all transforms reset to 'new'."""
        c = super(Task, self).clone()
        for tf in c.transforms:
            tf.status = "new"
            # This is cleared separately since it is not in the schema
            tf._partition_apps = {}
        # self._getParent().register(c)
        c.check()
        return c

    def check(self):
        """This function is called by run() or manually by the user"""
        if self.status != "new":
            logger.error(
                "The check() function may modify a task and can therefore only be called on new tasks!"
            )
            return
        try:
            for t in self.transforms:
                t.check()
        finally:
            # always recompute the task status, even if a transform check raised
            self.updateStatus()
        return True

    def run(self):
        """Confirms that this task is fully configured and ready to be run."""
        if self.status == "new":
            self.check()
        if self.status != "completed":
            if self.float == 0:
                logger.warning(
                    "The 'float', the number of jobs this task may run, is still zero. Type 'tasks(%i).float = 5' to allow this task to submit 5 jobs at a time"
                    % self.id)
            try:
                for tf in self.transforms:
                    if tf.status != "completed":
                        tf.run(check=False)
            finally:
                self.updateStatus()
        else:
            logger.info("Task is already completed!")

    def pause(self):
        """Pause the task - the background thread will not submit new jobs from this task"""
        float_cache = self.float
        self.float = 0
        if self.status != "completed":
            for tf in self.transforms:
                tf.pause()
            self.status = "pause"
        else:
            logger.info("Transform is already completed!")
        # restore the float so a later run() resumes with the previous limit
        self.float = float_cache

    def setBackend(self, backend):
        """Sets the backend on all transforms"""
        for tf in self.transforms:
            if backend is None:
                tf.backend = None
            else:
                tf.backend = stripProxy(backend).clone()

    def setParameter(self, **args):
        """Use: setParameter(processName="HWW") to set the processName in all applications to "HWW"
           Warns if applications are not affected because they lack the parameter"""
        for name, parm in args.iteritems():
            for tf in [t for t in self.transforms if t.application]:
                if name in tf.application.getNodeData():
                    addProxy(tf.application).__setattr__(name, parm)
                else:
                    # BUGFIX: tf.name is a string, so the lazy format specifier
                    # must be %s; the original %i raised "TypeError: %i format:
                    # a number is required" when the record was rendered.
                    logger.warning("Transform %s was not affected!", tf.name)

    def insertTransform(self, id, tf):
        """Insert transfrm tf before index id (counting from 0)"""
        if self.status != "new" and id < len(self.transforms):
            logger.error(
                "You can only insert transforms at the end of the list. Only if a task is new it can be freely modified!"
            )
            return
        # self.transforms.insert(id,tf.copy()) # this would be safer, but
        # breaks user exspectations
        # this means that t.insertTransform(0,t2.transforms[0]) will cause
        # Great Breakage
        self.transforms.insert(id, tf)

    def appendTransform(self, tf):
        """Append transform"""
        return self.insertTransform(len(self.transforms), tf)

    def removeTransform(self, id):
        """Remove the transform with the index id (counting from 0)"""
        if self.status != "new":
            logger.error("You can only remove transforms if the task is new!")
            return
        del self.transforms[id]

    def getJobs(self, transform=None, partition=None, only_master_jobs=True):
        """ Get the job slice of all jobs that process this task """
        # selecting a specific partition implies looking at subjobs
        if partition is not None:
            only_master_jobs = False
        jobslice = JobRegistrySlice(
            "tasks(%i).getJobs(transform=%s, partition=%s, only_master_jobs=%s)"
            % (self.id, transform, partition, only_master_jobs))

        def addjob(j):
            # keep the job unless both filters are set and the partition differs
            if transform is None or partition is None or self.transforms[int(
                    transform)]._app_partition[j.application.id] == partition:
                jobslice.objects[j.fqid] = stripProxy(j)

        for j in GPI.jobs:
            try:
                stid = j.application.tasks_id.split(":")
                if int(stid[-2]) == self.id and (transform is None
                                                 or stid[-1] == str(transform)):
                    if j.subjobs and not only_master_jobs:
                        for sj in j.subjobs:
                            addjob(sj)
                    else:
                        addjob(j)
            except Exception as err:
                # jobs without a parseable tasks_id simply don't belong here
                logger.debug("getJobs Error!!")
                logger.debug("Error:\n%s" % str(err))
                # print x
                pass
        return JobRegistrySliceProxy(jobslice)

    # Internal methods
    def finaliseTransforms(self):
        """Check for any things needing doing after a transform has completed"""
        for t in self.transforms:
            t.finalise()

    def updateStatus(self):
        """Updates status based on transform status.
           Called from check() or if status of a transform changes"""
        # Calculate status from transform status:
        states = [tf.status for tf in self.transforms]
        if "running" in states and "pause" in states:
            new_status = "running/pause"
        elif "running" in states:
            new_status = "running"
        elif "pause" in states:
            new_status = "pause"
        elif "new" in states:
            new_status = "new"
        elif "completed" in states:
            new_status = "completed"
        else:
            new_status = "new"  # no tranforms
        # Handle status changes here:
        if self.status != new_status:
            if new_status == "running/pause":
                logger.info(
                    "Some Transforms of Task %i '%s' have been paused. Check tasks.table() for details!"
                    % (self.id, self.name))
            elif new_status == "completed":
                logger.warning("Task %i '%s' has completed!" %
                               (self.id, self.name))
            elif self.status == "completed":
                logger.warning("Task %i '%s' has been reopened!" %
                               (self.id, self.name))
        self.status = new_status
        return self.status

    def submitJobs(self):
        """Submits as many jobs as necessary to maintain the float. Internal"""
        numjobs = 0
        if self.status not in ["running", "running/pause"]:
            return 0
        # iterate transforms in reverse so later processing steps get
        # first claim on the available float
        for i in range(len(self.transforms) - 1, -1, -1):
            tf = self.transforms[i]
            to_run = self.float - self.n_status("running")
            run = (self.resub_limit * self.float >= self.n_status("running"))
            if tf.status == "running" and to_run > 0 and run:
                numjobs += tf.submitJobs(to_run)
        return numjobs

    # Information methods
    def n_all(self):
        """Total number of partitions over all transforms."""
        return sum([t.n_all() for t in self.transforms])

    def n_status(self, status):
        """Number of partitions in the given status, summed over transforms."""
        return sum([t.n_status(status) for t in self.transforms])

    def table(self):
        """Print the single-row task table for this task."""
        from Ganga.GPI import tasks
        tasks[self.id:self.id].table()

    def overview(self):
        """ Get an ascii art overview over task status. Can be overridden """
        logger.info("Colours: " + ", ".join([
            markup(key, overview_colours[key]) for key in [
                "hold", "ready", "running", "completed", "attempted",
                "failed", "bad", "unknown"
            ]
        ]))
        logger.info(
            "Lists the partitions of events that are processed in one job, and the number of failures to process it."
        )
        logger.info("Format: (partition number)[:(number of failed attempts)]")
        logger.info('')
        for t in self.transforms:
            t.overview()

    def info(self):
        """Print per-transform information."""
        for t in self.transforms:
            t.info()

    def help(self):
        logger.info("This is a Task without special properties")

    # Helper methods
    def float_all(self):
        """Raise the float so that every partition may run at once."""
        self.float = self.n_all()

    def run_all(self):
        """Convenience wrapper: float_all() followed by run()."""
        self.float_all()
        self.run()
class DiracBase(IBackend): """The backend that submits jobs to the Grid via DIRAC. The backend for jobs to be submitted to the Grid. Jobs are submitted through the DIRAC WMS system and then in turn submitted to the Grid. A few examples of usage are given below # Create Dirac backend object b = Dirac() # Create and submit job. j = Job(application=app,backend=b) j.submit() # Run a Root job on the Grid if in LHCb VO # Create a Root application object. See Root help text for instructions # on how to configure this. app = Root() # Create and submit job to Dirac using default options j = Job(application=app,backend=Dirac()) j.submit() # Using the 'settings' attribute j.backend.settings['BannedSites'] = ['LCG.CERN.ch'] j.resubmit() # settings can be set at any time but are only 'respected' during # submit and resubmit. """ dirac_monitoring_is_active = True _schema = Schema( Version(3, 2), { 'id': SimpleItem( defvalue=None, protected=1, copyable=0, typelist=['int', 'type(None)'], doc= 'The id number assigned to the job by the DIRAC WMS. If seeking help' ' on jobs with the Dirac backend, please always report this id ' 'number in addition to a full description of your problem. 
The id ' 'can also be used to further inspect the job at ' 'https://lhcbweb.pic.es/DIRAC/info/general/diracOverview'), 'status': SimpleItem(defvalue=None, protected=1, copyable=0, typelist=['str', 'type(None)'], doc='The detailed status as reported by the DIRAC WMS'), 'actualCE': SimpleItem(defvalue=None, protected=1, copyable=0, typelist=['str', 'type(None)'], doc='The location where the job ran'), 'normCPUTime': SimpleItem( defvalue=None, protected=1, copyable=0, typelist=['str', 'type(None)'], doc='The normalized CPU time reported by the DIRAC WMS'), 'statusInfo': SimpleItem(defvalue='', protected=1, copyable=0, typelist=['str', 'type(None)'], doc='Minor status information from Dirac'), 'extraInfo': SimpleItem(defvalue='', protected=1, copyable=0, typelist=['str', 'type(None)'], doc='Application status information from Dirac'), 'diracOpts': SimpleItem( defvalue='', doc= 'DIRAC API commands to add the job definition script. Only edit ' 'if you *really* know what you are doing'), 'settings': SimpleItem( defvalue={'CPUTime': 2 * 86400}, doc='Settings for DIRAC job (e.g. 
CPUTime, BannedSites, etc.)' ), 'credential_requirements': ComponentItem('CredentialRequirement', defvalue=DiracProxy), }) _exportmethods = [ 'getOutputData', 'getOutputSandbox', 'removeOutputData', 'getOutputDataLFNs', 'getOutputDataAccessURLs', 'peek', 'reset', 'debug' ] _packed_input_sandbox = True _category = "backends" _name = 'DiracBase' _hidden = True def _setup_subjob_dataset(self, dataset): """ This method is used for constructing datasets on a per subjob basis when submitting parametric jobs Args: Dataset (Dataset): This is a GangaDataset object, todo check this isn't a list """ return None def _setup_bulk_subjobs(self, dirac_ids, dirac_script): """ This is the old bulk submit method which is used to construct the subjobs for a parametric job Args: dirac_ids (list): This is a list of the Dirac ids which have been created dirac_script (str): Name of the dirac script which contains the job jdl """ f = open(dirac_script, 'r') parametric_datasets = get_parametric_datasets(f.read().split('\n')) f.close() if len(parametric_datasets) != len(dirac_ids): raise BackendError( 'Dirac', 'Missmatch between number of datasets defines in dirac API script and those returned by DIRAC' ) master_job = self.getJobObject() master_job.subjobs = [] for i in range(len(dirac_ids)): j = Job() j.copyFrom(master_job) j.splitter = None j.backend.id = dirac_ids[i] j.id = i j.inputdata = self._setup_subjob_dataset(parametric_datasets[i]) j.status = 'submitted' j.time.timenow('submitted') master_job.subjobs.append(j) return True @require_credential def _common_submit(self, dirac_script): '''Submit the job via the Dirac server. 
Args: dirac_script (str): filename of the JDL which is to be submitted to DIRAC ''' j = self.getJobObject() self.id = None self.actualCE = None self.status = None self.extraInfo = None self.statusInfo = '' j.been_queued = False dirac_cmd = """execfile(\'%s\')""" % dirac_script try: result = execute(dirac_cmd, cred_req=self.credential_requirements) except GangaDiracError as err: err_msg = 'Error submitting job to Dirac: %s' % str(err) logger.error(err_msg) logger.error("\n\n===\n%s\n===\n" % dirac_script) logger.error("\n\n====\n") with open(dirac_script, 'r') as file_in: logger.error("%s" % file_in.read()) logger.error("\n====\n") raise BackendError('Dirac', err_msg) idlist = result if type(idlist) is list: return self._setup_bulk_subjobs(idlist, dirac_script) self.id = idlist return type(self.id) == int def _addition_sandbox_content(self, subjobconfig): '''any additional files that should be sent to dirac Args: subjobcofig (unknown): This is the config for this subjob (I think)''' return [] def submit(self, subjobconfig, master_input_sandbox): """Submit a DIRAC job Args: subjobconfig (unknown): master_input_sandbox (list): file names which are in the master sandbox of the master sandbox (if any) """ j = self.getJobObject() sboxname = j.createPackedInputSandbox(subjobconfig.getSandboxFiles()) input_sandbox = master_input_sandbox[:] input_sandbox += sboxname input_sandbox += self._addition_sandbox_content(subjobconfig) ## Add LFN to the inputfiles section of the file input_sandbox_userFiles = [] for this_file in j.inputfiles: if isType(this_file, DiracFile): input_sandbox_userFiles.append('LFN:' + str(this_file.lfn)) if j.master: for this_file in j.master.inputfiles: if isType(this_file, DiracFile): input_sandbox_userFiles.append('LFN:' + str(this_file.lfn)) for this_file in input_sandbox_userFiles: input_sandbox.append(this_file) logger.debug("dirac_script: %s" % str(subjobconfig.getExeString())) logger.debug("sandbox_cont:\n%s" % str(input_sandbox)) # This is a 
workaroud for the fact DIRAC doesn't like whitespace in sandbox filenames ### START_WORKAROUND tmp_dir = tempfile.mkdtemp() # Loop through all files and if the filename contains a ' ' copy it to a location which doesn't contain one. # This does have the limitation that all file basenames must not contain a ' ' character. # However we don't make any in Ganga as of 20/09/16 sandbox_str = '[' for file_ in input_sandbox: if ' ' in str(file_): new_name = os.path.join(tmp_dir, os.path.basename(file_)) shutil.copy(file_, new_name) file_ = new_name sandbox_str += '\'' + str(file_) + '\', ' sandbox_str += ']' logger.debug("sandbox_str: %s" % sandbox_str) ### FINISH_WORKAROUND dirac_script = subjobconfig.getExeString().replace( '##INPUT_SANDBOX##', sandbox_str) dirac_script_filename = os.path.join(j.getInputWorkspace().getPath(), 'dirac-script.py') with open(dirac_script_filename, 'w') as f: f.write(dirac_script) try: return self._common_submit(dirac_script_filename) finally: # CLEANUP after workaround shutil.rmtree(tmp_dir, ignore_errors=True) def master_auto_resubmit(self, rjobs): '''Duplicate of the IBackend.master_resubmit but hooked into auto resubmission such that the monitoring server is used rather than the user server Args: rjobs (list): This is a list of jobs which are to be auto-resubmitted''' incomplete = 0 def handleError(x): if incomplete: raise x else: return 0 try: for sj in rjobs: fqid = sj.getFQID('.') logger.info("resubmitting job %s to %s backend", fqid, getName(sj.backend)) try: b = sj.backend sj.updateStatus('submitting') result = b._resubmit() if result: sj.updateStatus('submitted') # sj._commit() # PENDING: TEMPORARY DISABLED incomplete = 1 else: return handleError( IncompleteJobSubmissionError( fqid, 'resubmission failed')) except Exception as x: log_user_exception(logger, debug=isType(x, GangaDiracError)) return handleError( IncompleteJobSubmissionError(fqid, str(x))) finally: master = self.getJobObject().master if master: 
master.updateMasterJobStatus() return 1 def resubmit(self): """Resubmit a DIRAC job""" return self._resubmit() def _resubmit(self): """Resubmit a DIRAC job""" j = self.getJobObject() parametric = False script_path = os.path.join(j.getInputWorkspace().getPath(), 'dirac-script.py') # Check old script if j.master is None and not os.path.exists(script_path): raise BackendError('Dirac', 'No "dirac-script.py" found in j.inputdir') if j.master is not None and not os.path.exists(script_path): script_path = os.path.join(j.master.getInputWorkspace().getPath(), 'dirac-script.py') if not os.path.exists(script_path): raise BackendError( 'Dirac', 'No "dirac-script.py" found in j.inputdir or j.master.inputdir' ) parametric = True # Read old script f = open(script_path, 'r') script = f.read() f.close() # Create new script - ##note instead of using get_parametric_dataset # could just use j.inputdata. if parametric is True: parametric_datasets = get_parametric_datasets(script.split('\n')) if j.master: if len(parametric_datasets) != len(j.master.subjobs): raise BackendError( 'Dirac', 'number of parametric datasets defined in API script doesn\'t match number of master.subjobs' ) if j.inputdata and len(j.inputdata) > 0: _input_files = [ f for f in j.inputdata if not isType(f, DiracFile) ] else: _input_files = [] if set(parametric_datasets[j.id]).symmetric_difference( set([f.namePattern for f in _input_files])): raise BackendError( 'Dirac', 'Mismatch between dirac-script and job attributes.') script = script.replace( '.setParametricInputData(%s)' % str(parametric_datasets), '.setInputData(%s)' % str(parametric_datasets[j.id])) script = script.replace('%n', str(j.id)) # name start_user_settings = '# <-- user settings\n' new_script = script[:script.find(start_user_settings) + len(start_user_settings)] job_ident = get_job_ident(script.split('\n')) for key, value in self.settings.iteritems(): if str(key).startswith('set'): _key = key[3:] else: _key = key if type(value) is str: template = 
'%s.set%s("%s")\n' else: template = '%s.set%s(%s)\n' new_script += template % (job_ident, str(_key), str(value)) new_script += script[script.find('# user settings -->'):] # Save new script new_script_filename = os.path.join(j.getInputWorkspace().getPath(), 'dirac-script.py') f = open(new_script_filename, 'w') f.write(new_script) f.flush() f.close() return self._common_submit(new_script_filename) def reset(self, doSubjobs=False): """Resets the state of a job back to 'submitted' so that the monitoring will run on it again. Args: doSubjobs (bool): Should we rest the subjobs associated with this job or not""" j = self.getJobObject() disallowed = ['submitting', 'killed'] if j.status in disallowed: logger.warning("Can not reset a job in status '%s'." % j.status) else: j.getOutputWorkspace().remove(preserve_top=True) self.extraInfo = None self.statusInfo = '' self.status = None self.actualCE = None j.been_queued = False j.updateStatus('submitted') if j.subjobs and not doSubjobs: logger.info( 'This job has subjobs, if you would like the backends ' 'of all the subjobs that are in status=\'completing\' or ' 'status=\'failed\' also reset then recall reset with the ' 'arg \'True\' i.e. job(3).backend.reset(True)') elif j.subjobs and doSubjobs: logger.info( 'resetting the backends of \'completing\' and \'failed\' subjobs.' ) for sj in j.subjobs: if sj.status == 'completing' or sj.status == 'failed': sj.backend.reset() if j.master: j.master.updateMasterJobStatus() @require_credential def kill(self): """ Kill a Dirac jobs""" if not self.id: return None dirac_cmd = 'kill(%d)' % self.id try: result = execute(dirac_cmd, cred_req=self.credential_requirements) except GangaDiracError as err: raise BackendError('Dirac', 'Could not kill job: %s' % err) return True @require_credential def peek(self, filename=None, command=None): """Peek at the output of a job (Note: filename/command are ignored). 
Args: filename (str): Ignored but is filename of a file in the sandbox command (str): Ignored but is a command which could be executed""" dirac_cmd = 'peek(%d)' % self.id try: result = execute(dirac_cmd, cred_req=self.credential_requirements) logger.info(result) except GangaDiracError: logger.error("No peeking available for Dirac job '%i'.", self.id) @require_credential def getOutputSandbox(self, outputDir=None): """Get the outputsandbox for the job object controlling this backend Args: outputDir (str): This string represents the output dir where the sandbox is to be placed """ j = self.getJobObject() if outputDir is None: outputDir = j.getOutputWorkspace().getPath() dirac_cmd = "getOutputSandbox(%d,'%s')" % (self.id, outputDir) try: result = execute(dirac_cmd, cred_req=self.credential_requirements) except GangaDiracError as err: msg = 'Problem retrieving output: %s' % str(err) logger.warning(msg) return False return True def removeOutputData(self): """ Remove all the LFNs associated with this job. """ # Note when the API can accept a list for removeFile I will change # this. j = self.getJobObject() if j.subjobs: for sj in j.subjobs: outputfiles_foreach(sj, DiracFile, lambda x: x.remove()) else: outputfiles_foreach(j, DiracFile, lambda x: x.remove()) def getOutputData(self, outputDir=None, names=None, force=False): """Retrieve data stored on SE to dir (default=job output workspace). If names=None, then all outputdata is downloaded otherwise names should be a list of files to download. If force=True then data will be redownloaded even if the file already exists. Note that if called on a master job then all subjobs outputwill be downloaded. If dir is None then the subjobs output goes into their individual outputworkspaces as expected. If however one specifies a dir then this is treated as a top dir and a subdir for each job will be created below it. This will avoid overwriting files with the same name from each subjob. 
Args: outputDir (str): This string represents the output dir where the sandbox is to be placed names (list): list of names which match namePatterns in the outputfiles force (bool): Force the download out data potentially overwriting existing objects """ j = self.getJobObject() if outputDir is not None and not os.path.isdir(outputDir): raise GangaDiracError( "Designated outupt path '%s' must exist and be a directory" % outputDir) def download(dirac_file, job, is_subjob=False): dirac_file.localDir = job.getOutputWorkspace().getPath() if outputDir is not None: output_dir = outputDir if is_subjob: output_dir = os.path.join(outputDir, job.fqid) if not os.path.isdir(output_dir): os.mkdir(output_dir) dirac_file.localDir = output_dir if os.path.exists( os.path.join(dirac_file.localDir, os.path.basename( dirac_file.lfn))) and not force: return try: dirac_file.get() return dirac_file.lfn # should really make the get method throw if doesn't suceed. todo except (GangaDiracError, GangaFileError) as e: logger.warning(e) suceeded = [] if j.subjobs: for sj in j.subjobs: suceeded.extend([ download(f, sj, True) for f in outputfiles_iterator(sj, DiracFile) if f.lfn != '' and ( names is None or f.namePattern in names) ]) else: suceeded.extend([ download(f, j, False) for f in outputfiles_iterator(j, DiracFile) if f.lfn != '' and (names is None or f.namePattern in names) ]) return filter(lambda x: x is not None, suceeded) def getOutputDataLFNs(self): """Retrieve the list of LFNs assigned to outputdata""" j = self.getJobObject() lfns = [] if j.subjobs: for sj in j.subjobs: lfns.extend([ f.lfn for f in outputfiles_iterator(sj, DiracFile) if f.lfn != '' ]) else: lfns.extend([ f.lfn for f in outputfiles_iterator(j, DiracFile) if f.lfn != '' ]) return lfns def getOutputDataAccessURLs(self): """Retrieve the list of accessURLs assigned to outputdata for a job""" return getAccessURLs(self.getOutputDataLFNs()) @require_credential def debug(self): '''Obtains some (possibly) useful DIRAC debug 
info. ''' # check services cmd = 'getServicePorts()' try: result = execute(cmd, cred_req=self.credential_requirements) except GangaDiracError as err: logger.warning('Could not obtain services: %s' % str(err)) return services = result for category in services: system, service = category.split('/') cmd = "ping('%s','%s')" % (system, service) try: result = execute(cmd, cred_req=self.credential_requirements) msg = 'OK.' except GangaDiracError as err: msg = '%s' % err logger.info('%s: %s' % (category, msg)) # get pilot info for this job if not isinstance(self.id, int): return j = self.getJobObject() cwd = os.getcwd() debug_dir = j.getDebugWorkspace().getPath() cmd = "getJobPilotOutput(%d,'%s')" % (self.id, debug_dir) try: result = execute(cmd, cred_req=self.credential_requirements) logger.info('Pilot Info: %s/pilot_%d/std.out.' % (debug_dir, self.id)) except GangaDiracError as err: logger.error("%s" % err) @staticmethod def _bulk_updateStateTime(jobStateDict, bulk_time_lookup={}): """ This performs the same as the _getStateTime method but loops over a list of job ids within the DIRAC namespace (much faster) Args: jobStateDict (dict): This is a dict of {job.backend.id : job_status, } elements bulk_time_lookup (dict): Dict of result of multiple calls to getBulkStateTime, performed in advance """ for this_state, these_jobs in jobStateDict.iteritems(): if bulk_time_lookup == {} or this_state not in bulk_time_lookup: bulk_result = execute( "getBulkStateTime(%s,\'%s\')" % (repr([j.backend.id for j in these_jobs]), this_state), cred_req=these_jobs[0].backend.credential_requirements ) # TODO split jobs by cred_req else: bulk_result = bulk_time_lookup[this_state] for this_job in jobStateDict[this_state]: backend_id = this_job.backend.id if backend_id in bulk_result and bulk_result[backend_id]: DiracBase._getStateTime( this_job, this_state, {this_state: bulk_result[backend_id]}) else: DiracBase._getStateTime(this_job, this_state) @staticmethod def _getStateTime(job, status, 
getStateTimeResult={}): """Returns the timestamps for 'running' or 'completed' by extracting their equivalent timestamps from the loggingInfo. Args: job (Job): This is the job object we want to update status (str): This is the Ganga status we're updating (running, completed... etc) getStateTimeResult (dict): This is the optional result of executing the approriate getStateTime against this job.backend.id, if not provided the command is called internally """ # Now private to stop server cross-talk from user thread. Since updateStatus calles # this method whether called itself by the user thread or monitoring thread. # Now don't use hook but define our own private version # used in monitoring loop... messy but works. if job.status != status: b_list = ['running', 'completing', 'completed', 'failed'] backend_final = ['failed', 'completed'] # backend stamps if not job.subjobs and status in b_list: for childstatus in b_list: if job.backend.id: logger.debug("Accessing getStateTime() in diracAPI") if childstatus in backend_final: if childstatus in getStateTimeResult: be_statetime = getStateTimeResult[childstatus] else: be_statetime = execute( "getStateTime(%d,\'%s\')" % (job.backend.id, childstatus), cred_req=job.backend. credential_requirements) job.time.timestamps["backend_final"] = be_statetime logger.debug( "Wrote 'backend_final' to timestamps.") break else: time_str = "backend_" + childstatus if time_str not in job.time.timestamps: if childstatus in getStateTimeResult: be_statetime = getStateTimeResult[ childstatus] else: be_statetime = execute( "getStateTime(%d,\'%s\')" % (job.backend.id, childstatus), cred_req=job.backend. credential_requirements) job.time.timestamps["backend_" + childstatus] = be_statetime logger.debug("Wrote 'backend_%s' to timestamps.", childstatus) if childstatus == status: break logger.debug("_getStateTime(job with id: %d, '%s') called.", job.id, job.status) else: logger.debug( "Status changed from '%s' to '%s'. 
                     No new timestamp was written", job.status, status)

    def timedetails(self):
        """Return the contents of the loggingInfo from the Dirac API (None if the job has no Dirac id)."""
        if not self.id:
            return None
        logger.debug("Accessing timedetails() in diracAPI")
        dirac_cmd = 'timedetails(%d)' % self.id
        return execute(dirac_cmd, cred_req=self.credential_requirements)

    @staticmethod
    def job_finalisation_cleanup(job, updated_dirac_status):
        """
        Method for reverting a job back to a clean state upon a failure in the job progression
        Args:
            job (Job): This is the job to change the status
            updated_dirac_status (str): Ganga status which is to be used somewhere
        """
        # Revert job back to running state if we exit uncleanly
        if job.status == "completing":
            job.updateStatus('running')
            if job.master:
                job.master.updateMasterJobStatus()
        # FIXME should I add something here to cleanup on sandboxes pulled from
        # malformed job output?

    @staticmethod
    def _internal_job_finalisation(job, updated_dirac_status):
        """
        This method performs the main job finalisation
        Args:
            job (Job): This is the job we want to finalise
            updated_dirac_status (str): String representing the Ganga finalisation state of the job failed/completed
        """

        if updated_dirac_status == 'completed':
            start = time.time()
            # firstly update job to completing
            DiracBase._getStateTime(job, 'completing')
            if job.status in ['removed', 'killed']:
                return
            elif (job.master and job.master.status in ['removed', 'killed']):
                return  # user changed it under us

            job.updateStatus('completing')
            if job.master:
                job.master.updateMasterJobStatus()

            output_path = job.getOutputWorkspace().getPath()

            logger.info('Contacting DIRAC for job: %s' % job.fqid)

            # Contact dirac which knows about the job
            job.backend.normCPUTime, getSandboxResult, file_info_dict, completeTimeResult = execute(
                "finished_job(%d, '%s')" % (job.backend.id, output_path),
                cred_req=job.backend.credential_requirements)

            now = time.time()
            logger.info('%0.2fs taken to download output from DIRAC for Job %s' % ((now - start), job.fqid))

            #logger.info('Job ' + job.fqid + ' OutputDataInfo: ' + str(file_info_dict))
            #logger.info('Job ' + job.fqid + ' OutputSandbox: ' + str(getSandboxResult))
            #logger.info('Job ' + job.fqid + ' normCPUTime: ' + str(job.backend.normCPUTime))

            # Set DiracFile metadata.  Only wildcard name patterns are kept
            # (regex is a module-level pattern matching wildcard characters).
            wildcards = [f.namePattern for f in job.outputfiles.get(DiracFile) if regex.search(f.namePattern) is not None]

            lfn_store = os.path.join(output_path,
                                     getConfig('Output')['PostProcessLocationsFileName'])

            # Make the file on disk with a nullop...
            if not os.path.isfile(lfn_store):
                with open(lfn_store, 'w'):
                    pass

            if job.outputfiles.get(DiracFile):

                # Now we can iterate over the contents of the file without touching it
                # NOTE(review): opened in append-binary mode, so repeated finalisation attempts append to the same store
                with open(lfn_store, 'ab') as postprocesslocationsfile:
                    if not hasattr(file_info_dict, 'keys'):
                        logger.error("Error understanding OutputDataInfo: %s" % str(file_info_dict))
                        raise GangaDiracError("Error understanding OutputDataInfo: %s" % str(file_info_dict))

                    ## Caution is not clear atm whether this 'Value' is an LHCbism or bug
                    list_of_files = file_info_dict.get('Value', file_info_dict.keys())

                    for file_name in list_of_files:
                        file_name = os.path.basename(file_name)
                        info = file_info_dict.get(file_name)
                        #logger.debug("file_name: %s,\tinfo: %s" % (str(file_name), str(info)))

                        if not hasattr(info, 'get'):
                            logger.error("Error getting OutputDataInfo for: %s" % str(job.getFQID('.')))
                            logger.error("Please check the Dirac Job still exists or attempt a job.backend.reset() to try again!")
                            logger.error("Err: %s" % str(info))
                            logger.error("file_info_dict: %s" % str(file_info_dict))
                            raise GangaDiracError("Error getting OutputDataInfo")

                        # Which wildcard pattern(s) produced this file; '' when it was named explicitly
                        valid_wildcards = [wc for wc in wildcards if fnmatch.fnmatch(file_name, wc)]
                        if not valid_wildcards:
                            valid_wildcards.append('')

                        for wc in valid_wildcards:
                            #logger.debug("wildcard: %s" % str(wc))

                            DiracFileData = 'DiracFile:::%s&&%s->%s:::%s:::%s\n' % (
                                wc, file_name,
                                info.get('LFN', 'Error Getting LFN!'),
                                str(info.get('LOCATIONS', ['NotAvailable'])),
                                info.get('GUID', 'NotAvailable'))
                            #logger.debug("DiracFileData: %s" % str(DiracFileData))
                            postprocesslocationsfile.write(DiracFileData)
                            postprocesslocationsfile.flush()

                logger.debug("Written: %s" % open(lfn_store, 'r').readlines())

            # check outputsandbox downloaded correctly
            if not result_ok(getSandboxResult):
                logger.warning('Problem retrieving outputsandbox: %s' % str(getSandboxResult))
                DiracBase._getStateTime(job, 'failed')
                if job.status in ['removed', 'killed']:
                    return
                elif (job.master and job.master.status in ['removed', 'killed']):
                    return  # user changed it under us
                job.updateStatus('failed')
                if job.master:
                    job.master.updateMasterJobStatus()
                raise BackendError('Dirac', 'Problem retrieving outputsandbox: %s' % str(getSandboxResult))

            # finally update job to completed
            DiracBase._getStateTime(job, 'completed', completeTimeResult)
            if job.status in ['removed', 'killed']:
                return
            elif (job.master and job.master.status in ['removed', 'killed']):
                return  # user changed it under us
            job.updateStatus('completed')
            if job.master:
                job.master.updateMasterJobStatus()
            now = time.time()
            logger.debug('Job ' + job.fqid + ' Time for complete update : ' + str(now - start))

        elif updated_dirac_status == 'failed':
            # firstly update status to failed
            DiracBase._getStateTime(job, 'failed')
            if job.status in ['removed', 'killed']:
                return
            if (job.master and job.master.status in ['removed', 'killed']):
                return  # user changed it under us
            job.updateStatus('failed')
            if job.master:
                job.master.updateMasterJobStatus()

            # if requested try downloading outputsandbox anyway
            if configDirac['failed_sandbox_download']:
                execute("getOutputSandbox(%d,'%s')" % (job.backend.id, job.getOutputWorkspace().getPath()),
                        cred_req=job.backend.credential_requirements)
        else:
            logger.error("Job #%s Unexpected dirac status '%s' encountered" % (job.getFQID('.'), updated_dirac_status))

    @staticmethod
    def job_finalisation(job, updated_dirac_status):
        """
        Attempt to finalise the job given and auto-retry on error.
        NOTE(review): with count starting at 1, incremented before each try, and
        the loop condition `count != limit`, at most 4 attempts are made, not 5.
        Args:
            job (Job): Job object to finalise
            updated_dirac_status (str): The Ganga status to update the job to, i.e. failed/completed
        """
        count = 1
        limit = 5
        sleep_length = 2.5

        while count != limit:

            try:
                count += 1
                # Check status is sane before we start
                if job.status != "running" and (not job.status in ['completed', 'killed', 'removed']):
                    job.updateStatus('submitted')
                    job.updateStatus('running')
                if job.status in ['completed', 'killed', 'removed']:
                    break
                DiracBase._internal_job_finalisation(job, updated_dirac_status)
                break

            except Exception as err:

                logger.warning("An error occured finalising job: %s" % job.getFQID('.'))
                logger.warning("Attemting again (%s of %s) after %s-sec delay" % (str(count), str(limit), str(sleep_length)))
                if count == limit:
                    logger.error("Unable to finalise job after %s retries due to error:\n%s" % (job.getFQID('.'), str(err)))
                    job.force_status('failed')
                    raise

            time.sleep(sleep_length)

        # allow the job to be re-queued by a later monitoring pass
        job.been_queued = False

    @staticmethod
    def requeue_dirac_finished_jobs(requeue_jobs, finalised_statuses):
        """
        Method used to requeue jobs which are in the finalized state of some form, finished/failed/etc
        Args:
            requeue_jobs (list): This is a list of the jobs which are to be requeued to be finalised
            finalised_statuses (dict): Dict of the Dirac statuses vs the Ganga statuses after running
        """
        # requeue existing completed job
        for j in requeue_jobs:

            if j.been_queued:
                continue

            if monitoring_component:
                if monitoring_component.should_stop():
                    break

            if not configDirac['serializeBackend']:
                # hand the finalisation off to the monitoring thread pool
                getQueues()._monitoring_threadpool.add_function(DiracBase.job_finalisation,
                                                                args=(j, finalised_statuses[j.backend.status]),
                                                                priority=5, name="Job %s Finalizing" % j.fqid)
                j.been_queued = True
            else:
                # serialised mode: finalise inline on this thread
                DiracBase.job_finalisation(j, finalised_statuses[j.backend.status])

    @staticmethod
    def monitor_dirac_running_jobs(monitor_jobs, finalised_statuses):
        """
        Method to update the configuration of jobs which are in a submitted/running state in Ganga&Dirac
        Args:
            monitor_jobs (list): Jobs which are to be monitored for their status change
            finalised_statuses (dict): Dict of the Dirac statuses vs the Ganga statuses after running
        """

        # now that can submit in non_blocking mode, can see jobs in submitting
        # that have yet to be assigned an id so ignore them
        # NOT SURE THIS IS VALID NOW BULK SUBMISSION IS GONE
        # EVEN THOUGH COULD ADD queues.add(j.submit) WILL KEEP AN EYE ON IT
        # dirac_job_ids    = [ j.backend.id for j in monitor_jobs if j.backend.id is not None ]
        # Correction this did become a problem for a crashed session during
        # submit, see #104454
        dead_jobs = (j for j in monitor_jobs if j.backend.id is None)
        for d in dead_jobs:
            d.updateStatus('failed')
            if d.master is not None:
                d.master.updateMasterJobStatus()

        ganga_job_status = [j.status for j in monitor_jobs if j.backend.id is not None]
        dirac_job_ids = [j.backend.id for j in monitor_jobs if j.backend.id is not None]

        logger.debug("GangaStatus: %s" % str(ganga_job_status))
        logger.debug("diracJobIDs: %s" % str(dirac_job_ids))

        if not dirac_job_ids:
            ## Nothing to do here stop bugging DIRAC about it!
            ## Everything else beyond here in the function depends on some ids present here, no ids means we can stop.
            return

        statusmapping = configDirac['statusmapping']

        result, bulk_state_result = execute('monitorJobs(%s, %s)' % (repr(dirac_job_ids), repr(statusmapping)),
                                            cred_req=monitor_jobs[0].backend.credential_requirements)

        #result = results[0]
        #bulk_state_result = results[1]

        if len(result) != len(ganga_job_status):
            logger.warning('Dirac monitoring failed for %s, result = %s' % (str(dirac_job_ids), str(result)))
            logger.warning("Results: %s" % str(result))
            return

        requeue_job_list = []
        jobStateDict = {}

        jobs_to_update = {}
        master_jobs_to_update = []

        # NOTE(review): thread_handled_states appears unused below
        thread_handled_states = ['completed', 'failed']
        for job, state, old_state in zip(monitor_jobs, result, ganga_job_status):
            if monitoring_component:
                if monitoring_component.should_stop():
                    break

            if job.been_queued:
                continue

            # state is the per-job status vector returned by monitorJobs
            job.backend.statusInfo = state[0]
            job.backend.status = state[1]
            job.backend.actualCE = state[2]
            updated_dirac_status = state[3]
            try:
                job.backend.extraInfo = state[4]
            except Exception as err:
                logger.debug("gexception: %s" % str(err))
                pass
            logger.debug('Job status vector : ' + job.fqid + ' : ' + repr(state))

            if updated_dirac_status not in jobStateDict:
                jobStateDict[updated_dirac_status] = []
            jobStateDict[updated_dirac_status].append(job)

            if job.backend.status in finalised_statuses:
                # Dirac says the job is finished; requeue it for finalisation
                if job.status != 'running':
                    if job.status in ['removed', 'killed']:
                        requeue_job_list.append(job)
                    elif (job.master and job.master.status in ['removed', 'killed']):
                        continue  # user changed it under us
                    else:
                        if 'running' not in jobs_to_update:
                            jobs_to_update['running'] = []
                        jobs_to_update['running'].append(job)
                        if job.master:
                            if job.master not in master_jobs_to_update:
                                master_jobs_to_update.append(job.master)
                        requeue_job_list.append(job)
            else:
                # still in flight; mirror the Dirac status into Ganga if it changed
                if job.status in ['removed', 'killed']:
                    continue
                if (job.master and job.master.status in ['removed', 'killed']):
                    continue  # user changed it under us
                if job.status != updated_dirac_status:
                    if updated_dirac_status not in jobs_to_update:
                        jobs_to_update[updated_dirac_status] = []
                    jobs_to_update[updated_dirac_status].append(job)
                    if job.master:
                        if job.master not in master_jobs_to_update:
                            master_jobs_to_update.append(job.master)

        DiracBase._bulk_updateStateTime(jobStateDict, bulk_state_result)

        for status in jobs_to_update:
            for job in jobs_to_update[status]:
                job.updateStatus(status, update_master=False)

        for j in master_jobs_to_update:
            j.updateMasterJobStatus()

        DiracBase.requeue_dirac_finished_jobs(requeue_job_list, finalised_statuses)

    @staticmethod
    def updateMonitoringInformation(jobs_):
        """Check the status of jobs and retrieve output sandboxes
        Args:
            jobs_ (list): List of the appropriate jobs to be monitored
        """
        # Only those jobs in 'submitted','running' are passed in here for checking
        # if however they have already completed in Dirac they may have been put on queue
        # for processing from last time. These should be put back on queue without
        # querying dirac again. Their signature is status = running and job.backend.status
        # already set to Done or Failed etc.
        jobs = [stripProxy(j) for j in jobs_]

        # remove from consideration any jobs already in the queue. Checking this non persisted attribute
        # is better than querying the queue as cant tell if a job has just been taken off queue and is being processed
        # also by not being persistent, this attribute automatically allows queued jobs from last session to be considered
        # for requeing
        interesting_jobs = [j for j in jobs if not j.been_queued]
        # status that correspond to a ganga 'completed' or 'failed' (see DiracCommands.status(id))
        # if backend status is these then the job should be on the queue
        finalised_statuses = configDirac['finalised_statuses']

        monitor_jobs = [j for j in interesting_jobs if j.backend.status not in finalised_statuses]
        requeue_jobs = [j for j in interesting_jobs if j.backend.status in finalised_statuses]

        #logger.debug('Interesting jobs: ' + repr([j.fqid for j in interesting_jobs]))
        #logger.debug('Monitor jobs : ' + repr([j.fqid for j in monitor_jobs]))
        #logger.debug('Requeue jobs : ' + repr([j.fqid for j in requeue_jobs]))

        try:
            # Split all the monitorable jobs into groups based on the
            # credential used to communicate with DIRAC
            for requeue_jobs_group in group_jobs_by_backend_credential(requeue_jobs):
                DiracBase.requeue_dirac_finished_jobs(requeue_jobs_group, finalised_statuses)
            for monitor_jobs_group in group_jobs_by_backend_credential(monitor_jobs):
                DiracBase.monitor_dirac_running_jobs(monitor_jobs_group, finalised_statuses)
        except GangaDiracError as err:
            logger.warning("Error in Monitoring Loop, jobs on the DIRAC backend may not update")
            logger.debug(err)
class LHCbTransform(ITransform):
    """Transform processing LHCb data.

    Units are created either from input LHCb datasets / BK queries (one unit
    per batch of ``files_per_unit`` newly seen files) or, for MC production,
    ``mc_num_units`` empty units.
    """

    _schema = Schema(
        Version(1, 0),
        dict(
            ITransform._schema.datadict.items() + {
                'files_per_unit':
                SimpleItem(
                    defvalue=-1,
                    doc=
                    'Maximum number of files to assign to each unit from a given input dataset. If < 1, use all files.',
                    typelist=["int"]),
                'splitter':
                ComponentItem('splitters',
                              defvalue=None,
                              optional=1,
                              load_default=False,
                              doc='Splitter to be used for units'),
                'queries':
                ComponentItem('query',
                              defvalue=[],
                              sequence=1,
                              protected=1,
                              optional=1,
                              load_default=False,
                              doc='Queries managed by this Transform'),
                'delete_chain_input':
                SimpleItem(
                    defvalue=False,
                    doc=
                    'Delete the Dirac input files/data after completion of each unit',
                    typelist=["bool"]),
                'mc_num_units':
                SimpleItem(defvalue=0,
                           doc="No. of units to create for MC generation"),
            }.items()))

    _category = 'transforms'
    _name = 'LHCbTransform'
    _exportmethods = ITransform._exportmethods + [
        'updateQuery', 'addQuery', 'removeUnusedData', 'cleanTransform'
    ]

    def __init__(self):
        super(LHCbTransform, self).__init__()

        # generally no delay needed between chained units
        self.chain_delay = 0

    def addQuery(self, bk):
        """Add a BK query to this transform and populate the input data from it.

        Args:
            bk (BKQuery): the bookkeeping query to manage

        Raises:
            GangaAttributeError: if *bk* is not a BKQuery
        """
        # Check if the BKQuery input is correct and append/update
        if not isType(bk, BKQuery):
            raise GangaAttributeError(
                None,
                'LHCbTransform expects a BKQuery object passed to the addQuery method'
            )

        # BK queries and directly-attached input data are mutually exclusive
        if len(self.queries) == 0 and len(self.inputdata) > 0:
            logger.error(
                "Cannot add both input data and BK queries. Input Data already present."
            )
            return

        # add the query and update the input data
        self.queries.append(bk)
        self.updateQuery()

    def addInputQuery(self, inDS):
        """Add the given input dataset to the list but only if BK queries aren't given"""
        if len(self.queries) > 0:
            logger.error(
                "Cannot add both input data and BK queries. Query already given"
            )
            return

        super(LHCbTransform, self).addInputQuery(inDS)

    def cleanTransform(self):
        """Remove unused data and then unused jobs"""
        self.removeUnusedData()
        self.removeUnusedJobs()

    def removeUnusedData(self):
        """Remove any output data from orphaned jobs"""
        for unit in self.units:
            for jid in unit.prev_job_ids:
                try:
                    logger.warning("Removing data from job '%d'..." % jid)
                    job = getJobByID(jid)

                    jlist = []
                    if len(job.subjobs) > 0:
                        jlist = job.subjobs
                    else:
                        jlist = [job]

                    for sj in jlist:
                        for f in sj.outputfiles:
                            # BUGFIX: isType returns a bool; the previous code
                            # compared it to the string "DiracFile", which was
                            # always False, so no file was ever removed.
                            if isType(f, DiracFile) and f.lfn:
                                f.remove()
                except Exception:
                    logger.error("Problem deleting data for job '%d'" % jid)

    def createUnits(self):
        """Create new units if required given the inputdata"""

        # call parent for chaining
        super(LHCbTransform, self).createUnits()

        if len(self.inputdata) > 0:
            # check for conflicting input
            if self.mc_num_units > 0:
                logger.warning("Inputdata specified - MC Event info ignored")

            # loop over input data and see if we need to create any more units
            import copy
            for id, inds in enumerate(self.inputdata):

                if not isType(inds, LHCbDataset):
                    continue

                # go over the units and see what files have been assigned
                assigned_data = LHCbDataset()
                for unit in self.units:

                    if unit.input_datset_index != id:
                        continue

                    assigned_data.files += unit.inputdata.files

                # any new files
                new_data = LHCbDataset(
                    files=self.inputdata[id].difference(assigned_data).files)

                if len(new_data.files) == 0:
                    continue

                # Create units for these files, in batches of files_per_unit
                # (or one unit with everything when files_per_unit < 1)
                step = self.files_per_unit
                if step <= 0:
                    step = len(new_data.files)

                for num in range(0, len(new_data.files), step):
                    unit = LHCbUnit()
                    unit.name = "Unit %d" % len(self.units)
                    unit.input_datset_index = id
                    self.addUnitToTRF(unit)
                    unit.inputdata = copy.deepcopy(self.inputdata[id])
                    unit.inputdata.files = []
                    unit.inputdata.files += new_data.files[num:num + step]

        elif self.mc_num_units > 0:
            if len(self.units) == 0:
                # check for appropriate splitter
                from GangaLHCb.Lib.Splitters.GaussSplitter import GaussSplitter
                # BUGFIX: warn when the splitter is *not* a GaussSplitter; the
                # previous condition was inverted and warned when one was given.
                if not self.splitter or not isType(self.splitter, GaussSplitter):
                    logger.warning(
                        "No GaussSplitter specified - first event info ignored"
                    )

                # create units for MC generation
                for i in range(0, self.mc_num_units):
                    unit = LHCbUnit()
                    unit.name = "Unit %d" % len(self.units)
                    self.addUnitToTRF(unit)
        else:
            import traceback
            traceback.print_stack()
            logger.error(
                "Please specify either inputdata or MC info for unit generation"
            )

    def createChainUnit(self, parent_units, use_copy_output=True):
        """Create an output unit given this output data"""

        # we need a parent job that has completed to get the output files
        incl_pat_list = []
        excl_pat_list = []
        for parent in parent_units:
            if len(parent.active_job_ids) == 0 or parent.status != "completed":
                return None

            # collect file masks from any TaskChainInput bound to this parent
            for inds in self.inputdata:
                from Ganga.GPIDev.Lib.Tasks.TaskChainInput import TaskChainInput
                if isType(
                        inds, TaskChainInput
                ) and inds.input_trf_id == parent._getParent().getID():
                    incl_pat_list += inds.include_file_mask
                    excl_pat_list += inds.exclude_file_mask

        # go over the output files and copy the appropriates over as input
        # files
        flist = []
        import re
        for parent in parent_units:
            job = getJobByID(parent.active_job_ids[0])
            if job.subjobs:
                job_list = job.subjobs
            else:
                job_list = [job]

            for sj in job_list:
                for f in sj.outputfiles:

                    # match any dirac files that are allowed in the file mask
                    if isType(f, DiracFile):
                        if len(incl_pat_list) > 0:
                            for pat in incl_pat_list:
                                if re.search(pat, f.lfn):
                                    flist.append("LFN:" + f.lfn)
                        else:
                            flist.append("LFN:" + f.lfn)

                        if len(excl_pat_list) > 0:
                            for pat in excl_pat_list:
                                if re.search(
                                        pat,
                                        f.lfn) and "LFN:" + f.lfn in flist:
                                    flist.remove("LFN:" + f.lfn)

        # just do one unit that uses all data
        unit = LHCbUnit()
        unit.name = "Unit %d" % len(self.units)
        unit.inputdata = LHCbDataset(
            files=[DiracFile(lfn=f) for f in flist])

        return unit

    def updateQuery(self, resubmit=False):
        """Update the dataset information of the transforms.

        This will include any new data in the processing or re-run jobs that
        have data which has been removed.

        Raises:
            GangaException: if the transform has no BK queries
        """
        if len(self.queries) == 0:
            raise GangaException(
                None,
                'Cannot call updateQuery() on an LHCbTransform without any queries'
            )

        if self._getParent() is not None:
            logger.info(
                'Retrieving latest bookkeeping information for transform %i:%i, please wait...'
                % (self._getParent().id, self.getID()))
        else:
            logger.info(
                'Retrieving latest bookkeeping information for transform, please wait...'
            )

        # check we have an input DS per BK Query
        while len(self.queries) > len(self.inputdata):
            self.inputdata.append(LHCbDataset())

        # loop over the queries and add fill file lists
        for id, query in enumerate(self.queries):

            # Get the latest dataset
            latest_dataset = query.getDataset()

            # Compare to previous inputdata, get new and removed
            # (use the enumerate index: queries.index() is O(n) and wrong
            # when the same query appears more than once)
            logger.info(
                'Checking for new and removed data for query %d, please wait...'
                % id)
            dead_data = LHCbDataset()
            new_data = LHCbDataset()

            # loop over the old data and compare
            new_data.files += latest_dataset.difference(
                self.inputdata[id]).files
            dead_data.files += self.inputdata[id].difference(
                latest_dataset).files

            # for dead data, find then kill/remove any associated jobs
            # loop over units and check any associated with this DS
            # TODO: Follow through chained tasks
            for unit in self.units:
                # associated unit
                if unit.input_datset_index != id:
                    continue

                # find the job
                if len(unit.active_job_ids) == 0:
                    continue

                # check the data
                for f in dead_data.files:
                    if f in unit.inputdata.files:

                        # kill the job
                        job = getJobByID(unit.active_job_ids[0])
                        if job.status in ['submitted', 'running']:
                            job.kill()

                        # forget the job
                        unit.prev_job_ids.append(unit.active_job_ids[0])
                        unit.active_job_ids = []
                        break

            # in any case, now just set the DS files to the new set
            self.inputdata[id].files = []
            self.inputdata[id].files = latest_dataset.files
class GoogleFile(IGangaFile):
    """
    The GoogleFile outputfile type allows for files to be directly uploaded, downloaded, removed and restored from the GoogleDrive service.
    It can be used as part of a job to output data directly to GoogleDrive, or standalone through the Ganga interface.

    example job: j=Job(application=Executable(exe=File('/home/hep/hs4011/Tests/testjob.sh'), args=[]),outputfiles=[GoogleFile('TestJob.txt')])

                 j.submit()

                 ### This job will automatically upload the outputfile 'TestJob.txt' to GoogleDrive.

    example of standalone submission:

                 g=GoogleFile('TestFile.txt')

                 g.localDir = '~/TestDirectory'    ### The file's location must be specified for standalone submission

                 g.put()                           ### The put() method uploads the file to GoogleDrive directly

    The GoogleFile outputfile is also compatible with the Dirac backend, making outputfiles from Dirac-run jobs upload directly to GoogleDrive.
    """

    _schema = Schema(
        Version(1, 1),
        {
            'namePattern':
            SimpleItem(defvalue="", doc='pattern of the file name'),
            'localDir':
            SimpleItem(
                defvalue="",
                copyable=1,
                doc=
                'local dir where the file is stored, used from get and put methods'
            ),
            'subfiles':
            ComponentItem(category='gangafiles',
                          defvalue=[],
                          hidden=1,
                          sequence=1,
                          copyable=0,
                          doc="collected files from the wildcard namePattern"),
            'failureReason':
            SimpleItem(
                defvalue="", copyable=1, doc='reason for the upload failure'),
            'compressed':
            SimpleItem(
                defvalue=False,
                typelist=[bool],
                protected=0,
                doc=
                'wheather the output file should be compressed before sending somewhere'
            ),
            'downloadURL':
            SimpleItem(
                defvalue="",
                copyable=1,
                protected=1,
                doc=
                'download URL assigned to the file upon upload to GoogleDrive'
            ),
            'id':
            SimpleItem(
                defvalue="",
                copyable=1,
                hidden=1,
                protected=1,
                doc='GoogleFile ID assigned to file on upload to GoogleDrive'
            ),
            'title':
            SimpleItem(defvalue="",
                       copyable=1,
                       hidden=1,
                       protected=1,
                       doc='GoogleFile title of the uploaded file'),
            'GangaFolderId':
            SimpleItem(defvalue="",
                       copyable=1,
                       hidden=1,
                       protected=1,
                       doc='GoogleDrive Ganga folder ID')
        })
    _category = 'gangafiles'
    _name = 'GoogleFile'
    _exportmethods = ["get", "put", "remove", "restore", "deleteCredentials"]

    def __init__(self, namePattern=''):
        super(GoogleFile, self).__init__()
        self.namePattern = namePattern
        self.__initialized = False

        # location of the pickled OAuth credentials inside the gangadir
        self.cred_path = os.path.join(
            getConfig('Configuration')['gangadir'], 'googlecreddata.pkl')

    def __initializeCred(self):
        """Run the OAuth2 installed-app flow until valid GoogleDrive
        credentials have been pickled to ``self.cred_path``.

        The loop retries on an incorrect verification code; it exits early
        (returning None) if the user states they denied Ganga access.
        """
        while not os.path.isfile(self.cred_path):
            from oauth2client.client import OAuth2WebServerFlow

            # Credentials identifying the Ganga application (from the APIs Console)
            CLIENT_ID = '776655306197-dirtoquqsm7cpqgepvamofg5t2b5f637.apps.googleusercontent.com'
            CLIENT_SECRET = 'GpdEP-OBZZQLB3k-xxOpzFQG'

            # Check https://developers.google.com/drive/scopes for all
            # available scopes
            OAUTH_SCOPE = 'https://www.googleapis.com/auth/drive.file'

            # Redirect URI for installed apps
            REDIRECT_URI = 'urn:ietf:wg:oauth:2.0:oob'

            # Run through the OAuth flow and retrieve credentials
            credentials = ''
            flow = OAuth2WebServerFlow(CLIENT_ID, CLIENT_SECRET, OAUTH_SCOPE,
                                       REDIRECT_URI)
            authorize_url = flow.step1_get_authorize_url()

            # Best effort: try the known browser controllers in turn and stop
            # at the first one that works; log only if all of them fail.
            import webbrowser
            browser_error = None
            for browser_name in ('macosx', 'windows-default', 'firefox'):
                try:
                    webbrowser.get(browser_name).open(authorize_url, 0, True)
                    browser_error = None
                    break
                except Exception as err:
                    browser_error = err
            if browser_error is not None:
                logger.error("Error: %s" % str(browser_error))

            logger.info('Go to the following link in your browser: ' +
                        authorize_url)
            code = raw_input('Enter verification code: ').strip()
            try:
                credentials = flow.step2_exchange(code)
            except Exception:
                deny = raw_input(
                    'An incorrect code was entered. Have you denied Ganga access to your GoogleDrive (y/[n])?'
                )
                if deny.lower() in ['', 'n']:
                    pass
                elif deny[0:1].upper() == 'Y':
                    return None

            # Pickle credential data. NOTE: the original used "is not ''", an
            # identity test on a string literal that only works by interning
            # accident; a plain equality test is the correct check.
            if credentials != '':
                with open(self.cred_path, "wb") as output:
                    pickle.dump(credentials, output)
                # credentials file is readable/writable by the owner only
                os.chmod(self.cred_path, stat.S_IWUSR | stat.S_IRUSR)
                logger.info(
                    'Your GoogleDrive credentials have been stored in the file %s and are only readable by you. '
                    'The file will give permission to modify files in your GoogleDrive. '
                    'Permission can be revoked by going to "Manage Apps" in your GoogleDrive '
                    'or by deleting the credentials through the deleteCredentials GoogleFile method.'
                    % self.cred_path)

        self.__initialized = True
        self._check_Ganga_folder()
class MassStorageFile(IGangaFile):
    """MassStorageFile represents a class marking a file to be written into mass storage (like Castor at CERN)
    """
    _schema = Schema(Version(1, 1),
                     {'namePattern': SimpleItem(defvalue="", doc='pattern of the file name'),
                      'localDir': SimpleItem(defvalue="", copyable=1, doc='local dir where the file is stored, used from get and put methods'),
                      'joboutputdir': SimpleItem(defvalue="", doc='outputdir of the job with which the outputsandbox file object is associated'),
                      'locations': SimpleItem(defvalue=[], copyable=1, typelist=[str], sequence=1, doc="list of locations where the outputfiles are uploaded"),
                      'outputfilenameformat': SimpleItem(defvalue=None, typelist=[str, None], protected=0,\
                                                         doc="keyword path to where the output should be uploaded, i.e. /some/path/here/{jid}/{sjid}/{fname},\
 if this field is not set, the output will go in {jid}/{sjid}/{fname} or in {jid}/{fname}\
 depending on whether the job is split or not"),
                      'inputremotedirectory': SimpleItem(defvalue=None, typelist=[str, None], protected=0, doc="Directory on mass storage where the file is stored"),
                      'subfiles': ComponentItem(category='gangafiles', defvalue=[], hidden=1, sequence=1, copyable=0,\
                                                doc="collected files from the wildcard namePattern"),
                      'failureReason': SimpleItem(defvalue="", protected=1, copyable=0, doc='reason for the upload failure'),
                      'compressed': SimpleItem(defvalue=False, typelist=[bool], protected=0, doc='wheather the output file should be compressed before sending somewhere')
                      })
    _category = 'gangafiles'
    _name = "MassStorageFile"
    _exportmethods = [
        "location", "get", "put", "setLocation", "remove", "accessURL"
    ]

    # runtime-only helper state (a Shell instance), kept out of the schema
    _additional_slots = ['shell']

    def __init__(self, namePattern='', localDir='', **kwds):
        """ MassStorageFile construction
        Args:
            namePattern (str): is the pattern of the output file that has to be written into mass storage
            localDir (str): This is the optional local directory of a file to be uploaded to mass storage
        """
        self._checkConfig()
        super(MassStorageFile, self).__init__()
        self._setNamePath(_namePattern=namePattern, _localDir=localDir)
        self.locations = []

        self.shell = Shell.Shell()

    def __setattr__(self, attr, value):
        """
        This is an overloaded setter method to make sure that we're auto-expanding the filenames of files which exist.
        In the case we're assigning any other attributes the value is simply passed through
        Args:
            attr (str): This is the name of the attribute which we're assigning
            value (unknown): This is the value being assigned.
        """
        actual_value = value
        if attr == "namePattern":
            # a namePattern containing a directory part is split into localDir + bare pattern
            this_localDir, actual_value = os.path.split(value)
            if this_localDir:
                self.localDir = this_localDir
        if attr == "localDir":
            # values containing ':' (remote specs) are left unexpanded
            if value and (value.find(':') == -1):
                actual_value = os.path.abspath(expandfilename(value))

        super(MassStorageFile, self).__setattr__(attr, actual_value)

    def _setNamePath(self, _namePattern='', _localDir=''):
        # Derive localDir from the pattern when only a pattern was given;
        # fall back to the CWD if a matching plain file exists there.
        if _namePattern != '' and _localDir == '':
            self.namePattern = os.path.basename(_namePattern)
            if not os.path.dirname(_namePattern):
                if os.path.isfile(os.path.join(os.getcwd(), os.path.basename(_namePattern))):
                    self.localDir = os.getcwd()
            else:
                self.localDir = os.path.dirname(_namePattern)
        elif _namePattern != '' and _localDir != '':
            self.namePattern = _namePattern
            self.localDir = _localDir

    def _checkConfig(self):
        """
        Check that the MassStorageFile configuration is correct
        """
        if not getConfig('Output')[_getName(self)]['uploadOptions']['path']:
            raise GangaException('Unable to create MassStorageFile. Check your configuration!')

    def __repr__(self):
        """Get the representation of the file."""
        return "%s(namePattern='%s')" % (_getName(self), self.namePattern)

    def mass_line_processor(self, line):
        """ This function splits the input line from the post-processsing system to define where this file is:
        Args:
            line (str): This is expected to be in the format of the postprocessor file from jobs transfering files on the WN
        """
        lineParts = line.split()
        pattern = lineParts[1]
        outputPath = lineParts[2]

        # strip a trailing .gz so compressed uploads match the bare pattern
        split_name = os.path.splitext(outputPath)
        if split_name[1] == '.gz':
            name = split_name[0]
        else:
            name = outputPath

        if regex.search(self.namePattern) is not None:
            # wildcard pattern: record each matched upload as a subfile
            if outputPath == 'ERROR':
                logger.error("Failed to upload file to mass storage")
                logger.error(line[line.find('ERROR') + 5:])
                d = copy.deepcopy(self)
                d.namePattern = pattern
                d.compressed = self.compressed
                d.failureReason = line[line.find('ERROR') + 5:]
                self.subfiles.append(d)
            else:
                if pattern == self.namePattern:
                    d = copy.deepcopy(self)
                    d.namePattern = name
                    self.subfiles.append(d)
                    # re-process the line on the concrete subfile to set its location
                    d.mass_line_processor(line)
        elif name == self.namePattern:
            if outputPath == 'ERROR':
                logger.error("Failed to upload file to mass storage")
                logger.error(line[line.find('ERROR') + 5:])
                self.failureReason = line[line.find('ERROR') + 5:]
                return
            self.locations = [outputPath.strip('\n')]

    def setLocation(self):
        """
        Sets the location of output files that were uploaded to mass storage from the WN
        """
        job = self.getJobObject()

        postprocessLocationsPath = os.path.join(
            job.outputdir,
            getConfig('Output')['PostProcessLocationsFileName'])
        if not os.path.exists(postprocessLocationsPath):
            return

        for line in open(postprocessLocationsPath, 'r'):

            if line.strip() == '':
                continue

            if line.startswith('massstorage'):
                self.mass_line_processor(line.strip())

    def location(self):
        """
        Return list with the locations of the post processed files (if they were configured to upload the output somewhere)
        """
        tmpLocations = []
        if self.subfiles:
            for i in self.subfiles:
                tmpLocations.append(i.locations)
        else:
            tmpLocations = self.locations
        return tmpLocations

    def internalCopyTo(self, targetPath):
        """
        Copy a the file to the local storage using the get mechanism
        Args:
            targetPath (str): Target path where the file is to copied to
        """
        to_location = targetPath

        cp_cmd = getConfig('Output')[_getName(self)]['uploadOptions']['cp_cmd']

        for location in self.locations:
            targetLocation = os.path.join(to_location, os.path.basename(location))
            self.execSyscmdSubprocess('%s %s %s' % (cp_cmd, quote(location), quote(targetLocation)))

    def getWNScriptDownloadCommand(self, indent):
        """Return the shell snippet the worker node runs to fetch this file locally."""
        ## FIXME fix me for the situation of multiple files?

        script = """\n

###INDENT###os.system(\'###CP_COMMAND###\')

"""
        cp_cmd = '%s %s .' % (getConfig('Output')[_getName(self)]['uploadOptions']['cp_cmd'], quote(self.locations[0]))

        replace_dict = {'###INDENT###': indent, '###CP_COMMAND###': cp_cmd}

        for k, v in replace_dict.iteritems():
            script = script.replace(str(k), str(v))

        return script

    def _mkdir(self, massStoragePath, exitIfNotExist=False):
        """
        Creates a folder on the mass Storage corresponding to the given path
        Args:
            massStoragePath (str): This is the path we want to make if it doesn't exist.
        """

        massStorageConfig = getConfig('Output')[_getName(self)]['uploadOptions']
        mkdir_cmd = massStorageConfig['mkdir_cmd']
        ls_cmd = massStorageConfig['ls_cmd']

        # create the last directory (if not exist) from the config path
        pathToDirName = os.path.dirname(massStoragePath)
        dirName = os.path.basename(massStoragePath)

        directoryExists = False

        (exitcode, mystdout, mystderr) = self.execSyscmdSubprocess('%s %s' % (ls_cmd, quote(pathToDirName)))
        if exitcode != 0 and exitIfNotExist:
            self.handleUploadFailure(mystderr, '1) %s %s' % (ls_cmd, pathToDirName))
            raise GangaException(mystderr)

        for directory in mystdout.split('\n'):
            if directory.strip() == dirName:
                directoryExists = True
                break

        if not directoryExists:
            (exitcode, mystdout, mystderr) = self.execSyscmdSubprocess('%s -p %s' % (mkdir_cmd, quote(massStoragePath)))
            if exitcode != 0:
                self.handleUploadFailure(mystderr, '2) %s %s' % (mkdir_cmd, massStoragePath))
                raise GangaException(mystderr)

    def put(self):
        """
        Creates and executes commands for file upload to mass storage (Castor), this method will
        be called on the client
        """
        sourceDir = ''

        # if used as a stand alone object
        if self._getParent() is None:
            if self.localDir == '':
                _CWD = os.getcwd()
                if os.path.isfile(os.path.join(_CWD, self.namePattern)):
                    sourceDir = _CWD
                else:
                    logger.warning('localDir attribute is empty, don\'t know from which dir to take the file')
                    return
            else:
                sourceDir = self.localDir

                (result, message) = self.validate()

                if result == False:
                    logger.warning(message)
                    return
        else:
            job = self.getJobObject()
            sourceDir = job.outputdir

            # if there are subjobs, the put method will be called on every subjob
            # and will upload the resulted output file
            if len(job.subjobs) > 0:
                return

        massStorageConfig = getConfig('Output')[_getName(self)]['uploadOptions']

        cp_cmd = massStorageConfig['cp_cmd']
        ls_cmd = massStorageConfig['ls_cmd']
        massStoragePath = massStorageConfig['path']

        try:
            self._mkdir(massStoragePath, exitIfNotExist=True)
        except GangaException:
            return

        # the folder part of self.outputfilenameformat
        folderStructure = ''
        # the file name part of self.outputfilenameformat
        filenameStructure = ''

        if not self.outputfilenameformat:
            filenameStructure = '{fname}'

            parent = self._getParent()
            if parent is not None:
                folderStructure = '{jid}'
                if parent._getParent() is not None:
                    folderStructure = os.path.join(folderStructure, '{sjid}')
        else:
            folderStructure = os.path.dirname(self.outputfilenameformat)
            filenameStructure = os.path.basename(self.outputfilenameformat)

        folderStructure = self.expandString(folderStructure)

        # create the folder structure
        if folderStructure:
            massStoragePath = os.path.join(massStoragePath, folderStructure)
            try:
                self._mkdir(massStoragePath)
            except GangaException:
                return

        # here filenameStructure has replaced jid and sjid if any, and only not
        # replaced keyword is fname
        fileName = self.namePattern
        if self.compressed:
            fileName = '%s.gz' % self.namePattern

        if regex.search(fileName) is not None:
            # wildcard pattern: upload every matching file, tracking each as a subfile
            for currentFile in glob.glob(os.path.join(sourceDir, fileName)):
                finalFilename = self.expandString(filenameStructure, os.path.basename(currentFile))

                (exitcode, mystdout, mystderr) = self.execSyscmdSubprocess('%s %s %s' %\
                    (cp_cmd, quote(currentFile), quote(os.path.join(massStoragePath, finalFilename))))

                d = copy.deepcopy(self)
                d.namePattern = os.path.basename(currentFile)
                d.localDir = os.path.dirname(currentFile)
                d.compressed = self.compressed

                if exitcode != 0:
                    self.handleUploadFailure(mystderr, '4) %s %s %s' % (cp_cmd, currentFile, os.path.join(massStoragePath, finalFilename)))
                else:
                    logger.info('%s successfully uploaded to mass storage as %s' % (currentFile, os.path.join(massStoragePath, finalFilename)))
                    d.locations = os.path.join(massStoragePath, os.path.basename(finalFilename))

                self.subfiles.append(d)
        else:
            currentFile = os.path.join(sourceDir, fileName)
            finalFilename = self.expandString(filenameStructure, fileName)
            (exitcode, mystdout, mystderr) = self.execSyscmdSubprocess('%s %s %s' %\
                (cp_cmd, quote(currentFile), quote(os.path.join(massStoragePath, finalFilename))))
            if exitcode != 0:
                self.handleUploadFailure(mystderr, '5) %s %s %s' % (cp_cmd, currentFile, os.path.join(massStoragePath, finalFilename)))
            else:
                logger.info('%s successfully uploaded to mass storage as %s' % (currentFile, os.path.join(massStoragePath, finalFilename)))
                location = os.path.join(massStoragePath, os.path.basename(finalFilename))
                if location not in self.locations:
                    self.locations.append(location)

    def validate(self):
        """Validate the outputfilenameformat field.

        Returns:
            tuple(bool, str): (True, '') on success, otherwise (False, reason)
        """
        # if the user has set outputfilenameformat, validate for presence of
        # jid, sjid and fname keywords depending on job type - split or
        # non-split
        if self.outputfilenameformat != None:

            searchFor = ['{fname}']
            isJob = False
            isSplitJob = False

            if self._getParent() != None:

                isJob = True

                if stripProxy(self.getJobObject()).master is not None:
                    isSplitJob = True
                    searchFor.append('{sjid}')

            missingKeywords = []

            for item in searchFor:
                if self.outputfilenameformat.find(item) == -1:
                    missingKeywords.append(item)

            if len(missingKeywords):
                return (False, 'Error in %s.outputfilenameformat field : missing keywords %s ' % (_getName(self), ','.join(missingKeywords)))

            if isSplitJob == False and self.outputfilenameformat.find('{sjid}') > -1:
                return (False, 'Error in %s.outputfilenameformat field : job is non-split, but {\'sjid\'} keyword found' % _getName(self))

            if isJob == False and self.outputfilenameformat.find('{sjid}') > -1:
                return (False, 'Error in %s.outputfilenameformat field : no parent job, but {\'sjid\'} keyword found' % _getName(self))

            if isJob == False and self.outputfilenameformat.find('{jid}') > -1:
                return (False, 'Error in %s.outputfilenameformat field : no parent job, but {\'jid\'} keyword found' % _getName(self))

            invalidUnixChars = ['"', ' ']
            test = self.outputfilenameformat.replace('{jid}', 'a').replace('{sjid}', 'b').replace('{fname}', 'c')

            for invalidUnixChar in invalidUnixChars:
                if test.find(invalidUnixChar) > -1:
                    return (False, 'Error in %s.outputfilenameformat field : invalid char %s found' % (_getName(self), invalidUnixChar))

        return (True, '')

    def handleUploadFailure(self, error, cmd_run_str=''):
        """
        Function to display what went wrong with an associated Job id if there is one and to assign failureReason for future.
        Args:
            error (str): This is the error which was given from the shell command
            cmd_run_str (str): This is a string related to but not always exactly the command run.
        """
        self.failureReason = error
        if self._getParent() != None:
            logger.error("Job %s failed. One of the job.outputfiles couldn't be uploaded because of %s" % (str(self._getParent().fqid), self.failureReason))
        else:
            logger.error("The file can't be uploaded because of %s" % (self.failureReason))
        if cmd_run_str:
            logger.error("Attempted to run: '%s'" % (cmd_run_str))

    def getWNInjectedScript(self, outputFiles, indent, patternsToZip, postProcessLocationsFP):
        """
        Returns script that have to be injected in the jobscript for postprocessing on the WN
        """
        massStorageCommands = []

        massStorageConfig = getConfig('Output')[_getName(self)]['uploadOptions']

        for outputFile in outputFiles:

            outputfilenameformat = 'None'
            if outputFile.outputfilenameformat != None and outputFile.outputfilenameformat != '':
                outputfilenameformat = outputFile.outputfilenameformat

            massStorageCommands.append(['massstorage', outputFile.namePattern, outputfilenameformat,
                                        massStorageConfig['mkdir_cmd'], massStorageConfig['cp_cmd'],
                                        massStorageConfig['ls_cmd'], massStorageConfig['path']])

        script_location = os.path.join(os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe()))),
                                       'scripts/MassStorageFileWNScript.py.template')

        from Ganga.GPIDev.Lib.File import FileUtils
        script = FileUtils.loadScript(script_location, '###INDENT###')

        jobfqid = self.getJobObject().fqid

        jobid = jobfqid
        subjobid = ''

        # fqid of the form "<jid>.<sjid>" means we are a subjob
        if (jobfqid.find('.') > -1):
            jobid = jobfqid.split('.')[0]
            subjobid = jobfqid.split('.')[1]

        replace_dict = {
            '###MASSSTORAGECOMMANDS###': repr(massStorageCommands),
            '###PATTERNSTOZIP###': str(patternsToZip),
'###INDENT###': indent, '###POSTPROCESSLOCATIONSFP###': postProcessLocationsFP, '###FULLJOBDIR###': str(jobfqid.replace('.', os.path.sep)), '###JOBDIR###': str(jobid), '###SUBJOBDIR###': str(subjobid) } for k, v in replace_dict.iteritems(): script = script.replace(str(k), str(v)) return script def processWildcardMatches(self): if self.subfiles: return self.subfiles if regex.search(self.namePattern): ls_cmd = getConfig('Output')[_getName( self)]['uploadOptions']['ls_cmd'] exitcode, output, m = self.shell.cmd1(ls_cmd + ' ' + self.inputremotedirectory, capture_stderr=True) for filename in output.split('\n'): if fnmatch(filename, self.namePattern): subfile = copy.deepcopy(self) subfile.namepattern = filename subfile.inputremotedirectory = self.inputremotedirectory self.subfiles.append(subfile) def remove(self, force=False, removeLocal=False): """ Removes file from remote storage ONLY by default """ massStorageConfig = getConfig('Output')[_getName( self)]['uploadOptions'] rm_cmd = massStorageConfig['rm_cmd'] if force == True: _auto_delete = True else: _auto_delete = False for i in self.locations: if not _auto_delete: keyin = None while keyin is None: keyin = raw_input( "Do you want to delete file %s at Location: %s ? [y/n] " % (str(self.namePattern), str(i))) if keyin.lower() == 'y': _delete_this = True elif keyin.lower() == 'n': _delete_this = False else: logger.warning("y/n please!") keyin = None else: _delete_this = True if _delete_this: logger.info("Deleting File at Location: %s") self.execSyscmdSubprocess('%s %s' % (rm_cmd, quote(i))) self.locations.pop(i) if removeLocal: sourceDir = '' if self.localDir == '': _CWD = os.getcwd() if os.path.isfile(os.path.join(_CWD, self.namePattern)): sourceDir = _CWD else: sourceDir = self.localDir _localFile = os.path.join(sourceDir, self.namePattern) if os.path.isfile(_localFile): if force: _actual_delete = True else: keyin = None while keyin is None: keyin = raw_input( "Do you want to remove the local File: %s ? 
([y]/n) " % str(_localFile)) if keyin.lower() in ['y', '']: _actual_delete = True elif keyin.lower() == 'n': _actual_delete = False else: logger.warning("y/n please!") keyin = None if _actual_delete: remove_filename = _localFile + "_" + str( time.time()) + '__to_be_deleted_' try: os.rename(_localFile, remove_filename) except OSError as err: logger.warning( "Error in first stage of removing file: %s" % remove_filename) remove_filename = _localFile try: os.remove(remove_filename) except OSError as err: if err.errno != errno.ENOENT: logger.error("Error in removing file: %s" % str(remove_filename)) raise pass return def accessURL(self): # Need to come up with a prescription based upon the server address and # file on EOS or elsewhere to return a full URL which we can pass to # ROOT... protoPath = getConfig('Output')[_getName(self)]['defaultProtocol'] myLocations = self.location() accessURLs = [] for _file in myLocations: accessURLs.append(protoPath + os.path.join(os.sep, _file)) return accessURLs