Example No. 1
0
 def parseXML(self):
     """Parse the job report produced by the pilot (XML, with JSON fallback).

     Extracts LFN, GUID, fsize, checksum (adler32 preferred over md5),
     SURL and optional full LFN for each reported file, collects event
     counts and GUIDs from self.job.metadata, updates the status of each
     file in self.job.Files accordingly, and fills self.extraInfo
     ('endpoint', 'surl', 'nevents', 'lbnr', 'guid').

     Returns:
         0 : parsed successfully (or the job has no outputs)
         1 : report file was deleted (currently unreachable branch)
         2 : parse error, failed file copy for variable outputs, or
             inconsistency between the report and the files table
     """
     # no outputs: nothing to parse
     log_out = [f for f in self.job.Files if f.type in ['log', 'output']]
     if not log_out:
         self.logger.debug("has no outputs")
         self.logger.debug("parseXML end")
         return 0
     # get input files
     inputLFNs = []
     for file in self.job.Files:
         if file.type == 'input':
             inputLFNs.append(file.lfn)
     # parse XML
     lfns = []
     guids = []
     fsizes = []
     md5sums = []
     chksums = []
     surls = []
     fullLfnMap = {}
     nEventsMap = {}
     guidMap = dict()
     try:
         root = xml.dom.minidom.parseString(self.data)
         files = root.getElementsByTagName('File')
         for file in files:
             # get GUID
             guid = str(file.getAttribute('ID'))
             # get PFN and LFN nodes
             logical = file.getElementsByTagName('logical')[0]
             lfnNode = logical.getElementsByTagName('lfn')[0]
             # convert UTF8 to Raw
             lfn = str(lfnNode.getAttribute('name'))
             # get metadata
             fsize = None
             md5sum = None
             adler32 = None
             surl = None
             fullLFN = None
             for meta in file.getElementsByTagName('metadata'):
                 name = str(meta.getAttribute('att_name'))
                 if name == 'fsize':
                     # int() here: long() is Python-2-only
                     fsize = int(meta.getAttribute('att_value'))
                 elif name == 'md5sum':
                     md5sum = str(meta.getAttribute('att_value'))
                     # discard malformed md5 checksums
                     if re.search("^[a-fA-F0-9]{32}$", md5sum) is None:
                         md5sum = None
                 elif name == 'adler32':
                     adler32 = str(meta.getAttribute('att_value'))
                 elif name == 'surl':
                     surl = str(meta.getAttribute('att_value'))
                 elif name == 'full_lfn':
                     fullLFN = str(meta.getAttribute('att_value'))
             # endpoints
             self.extraInfo['endpoint'][lfn] = []
             for epNode in file.getElementsByTagName('endpoint'):
                 self.extraInfo['endpoint'][lfn].append(
                     str(epNode.firstChild.data))
             # error check: outputs must carry fsize and a checksum,
             # except for event-service merge jobs where missing files
             # are simply skipped
             if (lfn not in inputLFNs) and (fsize is None or
                                            (md5sum is None
                                             and adler32 is None)):
                 if EventServiceUtils.isEventServiceMerge(self.job):
                     continue
                 else:
                     raise RuntimeError('fsize/md5sum/adler32/surl=None')
             # append
             lfns.append(lfn)
             guids.append(guid)
             fsizes.append(fsize)
             md5sums.append(md5sum)
             surls.append(surl)
             if adler32 is not None:
                 # use adler32 if available
                 chksums.append("ad:%s" % adler32)
             else:
                 chksums.append("md5:%s" % md5sum)
             if fullLFN is not None:
                 fullLfnMap[lfn] = fullLFN
     except Exception:
         # not XML: fall back to a JSON job report with the same fields
         try:
             import json
             jsonDict = json.loads(self.data)
             for lfn in jsonDict:
                 fileData = jsonDict[lfn]
                 lfn = str(lfn)
                 fsize = None
                 md5sum = None
                 adler32 = None
                 surl = None
                 fullLFN = None
                 guid = str(fileData['guid'])
                 if 'fsize' in fileData:
                     # int() here: long() is Python-2-only
                     fsize = int(fileData['fsize'])
                 if 'md5sum' in fileData:
                     md5sum = str(fileData['md5sum'])
                     # discard malformed md5 checksums
                     if re.search("^[a-fA-F0-9]{32}$", md5sum) is None:
                         md5sum = None
                 if 'adler32' in fileData:
                     adler32 = str(fileData['adler32'])
                 if 'surl' in fileData:
                     surl = str(fileData['surl'])
                 if 'full_lfn' in fileData:
                     fullLFN = str(fileData['full_lfn'])
                 # endpoints
                 self.extraInfo['endpoint'][lfn] = []
                 if 'endpoint' in fileData:
                     self.extraInfo['endpoint'][lfn] = fileData['endpoint']
                 # error check (same rule as the XML branch)
                 if (lfn not in inputLFNs) and (fsize is None or
                                                (md5sum is None
                                                 and adler32 is None)):
                     if EventServiceUtils.isEventServiceMerge(self.job):
                         continue
                     else:
                         raise RuntimeError(
                             'fsize/md5sum/adler32/surl=None')
                 # append
                 lfns.append(lfn)
                 guids.append(guid)
                 fsizes.append(fsize)
                 md5sums.append(md5sum)
                 surls.append(surl)
                 if adler32 is not None:
                     # use adler32 if available
                     chksums.append("ad:%s" % adler32)
                 else:
                     chksums.append("md5:%s" % md5sum)
                 if fullLFN is not None:
                     fullLfnMap[lfn] = fullLFN
         except Exception:
             # neither XML nor JSON could be parsed
             # (the original file-existence check was replaced by an
             # always-true branch when parsing moved to in-memory data)
             if True:
                 # avoid shadowing the builtin 'type'
                 errType, errValue, traceBack = sys.exc_info()
                 self.logger.error(": %s %s" % (errType, errValue))
                 # set failed anyway
                 self.job.jobStatus = 'failed'
                 # XML error happens when pilot got killed due to wall-time limit or failures in wrapper
                 if (self.job.pilotErrorCode in [0,'0','NULL']) and \
                    (self.job.taskBufferErrorCode not in [pandaserver.taskbuffer.ErrorCode.EC_WorkerDone]) and \
                    (self.job.transExitCode  in [0,'0','NULL']):
                     self.job.ddmErrorCode = pandaserver.dataservice.ErrorCode.EC_Adder
                     self.job.ddmErrorDiag = "Could not get GUID/LFN/MD5/FSIZE/SURL from pilot XML"
                 return 2
             else:
                 # XML was deleted
                 return 1
     # parse metadata to get nEvents
     nEventsFrom = None
     try:
         root = xml.dom.minidom.parseString(self.job.metadata)
         files = root.getElementsByTagName('File')
         for file in files:
             # get GUID
             guid = str(file.getAttribute('ID'))
             # get PFN and LFN nodes
             logical = file.getElementsByTagName('logical')[0]
             lfnNode = logical.getElementsByTagName('lfn')[0]
             # convert UTF8 to Raw
             lfn = str(lfnNode.getAttribute('name'))
             guidMap[lfn] = guid
             # get event count
             nevents = None
             for meta in file.getElementsByTagName('metadata'):
                 name = str(meta.getAttribute('att_name'))
                 if name == 'events':
                     # int() here: long() is Python-2-only
                     nevents = int(meta.getAttribute('att_value'))
                     nEventsMap[lfn] = nevents
                     break
         nEventsFrom = "xml"
     except Exception:
         pass
     # parse metadata as JSON (newer pilot job reports)
     try:
         import json
         jsonDict = json.loads(self.job.metadata)
         for jsonFileItem in jsonDict['files']['output']:
             for jsonSubFileItem in jsonFileItem['subFiles']:
                 lfn = str(jsonSubFileItem['name'])
                 try:
                     # int() here: long() is Python-2-only
                     nevents = int(jsonSubFileItem['nentries'])
                     nEventsMap[lfn] = nevents
                 except Exception:
                     pass
                 try:
                     guid = str(jsonSubFileItem['file_guid'])
                     guidMap[lfn] = guid
                 except Exception:
                     pass
         nEventsFrom = "json"
     except Exception:
         pass
     # use nEvents and GUIDs reported by the pilot if no job report
     if self.job.metadata == 'NULL' and self.jobStatus == 'finished' and self.job.nEvents > 0 \
             and self.job.prodSourceLabel in ['managed']:
         for file in self.job.Files:
             if file.type == 'output':
                 nEventsMap[file.lfn] = self.job.nEvents
         for lfn, guid in zip(lfns, guids):
             guidMap[lfn] = guid
         nEventsFrom = "pilot"
     self.logger.debug('nEventsMap=%s' % str(nEventsMap))
     self.logger.debug('nEventsFrom=%s' % str(nEventsFrom))
     self.logger.debug('guidMap=%s' % str(guidMap))
     self.logger.debug('self.job.jobStatus=%s in parseXML' %
                       self.job.jobStatus)
     self.logger.debug(
         'isES=%s isJumbo=%s' % (EventServiceUtils.isEventServiceJob(
             self.job), EventServiceUtils.isJumboJob(self.job)))
     # get lumi block number
     lumiBlockNr = self.job.getLumiBlockNr()
     # copy files for variable number of outputs
     tmpStat = self.copyFilesForVariableNumOutputs(lfns)
     if not tmpStat:
         self.logger.error(
             "failed to copy files for variable number of outputs")
         return 2
     # check files
     fileList = []
     for file in self.job.Files:
         fileList.append(file.lfn)
         if file.type == 'input':
             if file.lfn in lfns:
                 if self.job.prodSourceLabel in ['user', 'panda']:
                     # skipped file
                     file.status = 'skipped'
                 elif self.job.prodSourceLabel in [
                         'managed', 'test'
                 ] + JobUtils.list_ptest_prod_sources:
                     # failed by pilot
                     file.status = 'failed'
         elif file.type == 'output' or file.type == 'log':
             # add only log file for failed jobs
             if self.jobStatus == 'failed' and file.type != 'log':
                 file.status = 'failed'
                 continue
             # set failed if it is missing in XML
             if file.lfn not in lfns:
                 if (self.job.jobStatus == 'finished' and EventServiceUtils.isEventServiceJob(self.job)) \
                         or EventServiceUtils.isJumboJob(self.job):
                     # unset file status for ES jobs
                     pass
                 elif file.isAllowedNoOutput():
                     # allowed not to be produced
                     file.status = 'nooutput'
                     self.logger.debug('set {0} to status={1}'.format(
                         file.lfn, file.status))
                 else:
                     file.status = 'failed'
                     self.job.jobStatus = 'failed'
                     self.job.ddmErrorCode = pandaserver.dataservice.ErrorCode.EC_Adder
                     self.job.ddmErrorDiag = "expected output {0} is missing in pilot XML".format(
                         file.lfn)
                     self.logger.error(self.job.ddmErrorDiag)
                 continue
             # look for GUID with LFN
             try:
                 i = lfns.index(file.lfn)
                 file.GUID = guids[i]
                 file.fsize = fsizes[i]
                 file.md5sum = md5sums[i]
                 file.checksum = chksums[i]
                 surl = surls[i]
                 # status
                 file.status = 'ready'
                 # change to full LFN
                 if file.lfn in fullLfnMap:
                     file.lfn = fullLfnMap[file.lfn]
                 # add SURL to extraInfo
                 self.extraInfo['surl'][file.lfn] = surl
                 # add nevents
                 if file.lfn in nEventsMap:
                     self.extraInfo['nevents'][file.lfn] = nEventsMap[
                         file.lfn]
             except Exception:
                 # status
                 file.status = 'failed'
                 # avoid shadowing the builtin 'type'
                 errType, errValue, traceBack = sys.exc_info()
                 self.logger.error(": %s %s" % (errType, errValue))
             # set lumi block number
             if lumiBlockNr is not None and file.status != 'failed':
                 self.extraInfo['lbnr'][file.lfn] = lumiBlockNr
     self.extraInfo['guid'] = guidMap
     # check consistency between XML and filesTable
     for lfn in lfns:
         if lfn not in fileList:
             self.logger.error("%s is not found in filesTable" % lfn)
             self.job.jobStatus = 'failed'
             for tmpFile in self.job.Files:
                 tmpFile.status = 'failed'
             self.job.ddmErrorCode = pandaserver.dataservice.ErrorCode.EC_Adder
             self.job.ddmErrorDiag = "pilot produced {0} inconsistently with jobdef".format(
                 lfn)
             return 2
     # return
     self.logger.debug("parseXML end")
     return 0
Example No. 2
0
 def run(self):
     """Freeze datasets whose destination files have all reached a final state.

     For each (vuid, name, modDate) in self.datasets: query filesTable4
     for all files destined for the dataset; if every file is in a final
     status ('ready', 'failed', 'skipped', 'merging', 'finished'), close
     the dataset in Rucio and mark it 'completed' in the Datasets table,
     otherwise just bump its modification date. Event-service merge jobs
     and 'panda.um.*' merging datasets get special handling (a JEDI
     Closer is run instead of / before freezing).

     NOTE(review): the proxyLock.acquire()/release() pairs are not
     wrapped in try/finally, so an exception raised in between would
     leave the proxy lock held; the outer except only logs — confirm
     this is acceptable.
     """
     # serialize with sibling threads via the shared lock
     self.lock.acquire()
     try:
         for vuid,name,modDate in self.datasets:
             _logger.debug("Freezer start %s %s" % (modDate,name))
             # get the status of every file destined for this dataset
             self.proxyLock.acquire()
             retF,resF = taskBuffer.querySQLS("SELECT /*+ index(tab FILESTABLE4_DESTDBLOCK_IDX) */ PandaID,status FROM ATLAS_PANDA.filesTable4 tab WHERE destinationDBlock=:destinationDBlock ",
                                          {':destinationDBlock':name})
             self.proxyLock.release()
             if retF < 0:
                 _logger.error("SQL error")
             else:
                 # all files must be in a final status for the dataset to be frozen
                 allFinished = True
                 onePandaID = None
                 for tmpPandaID,tmpFileStatus in resF:
                     onePandaID = tmpPandaID
                     if not tmpFileStatus in ['ready', 'failed', 'skipped', 'merging', 'finished']:
                         allFinished = False
                         break
                 # check sub datasets in the jobset for event service job
                 if allFinished:
                     self.proxyLock.acquire()
                     tmpJobs = taskBuffer.getFullJobStatus([onePandaID])
                     self.proxyLock.release()
                     if len(tmpJobs) > 0 and tmpJobs[0] is not None:
                         if EventServiceUtils.isEventServiceMerge(tmpJobs[0]):
                             self.proxyLock.acquire()
                             cThr = Closer(taskBuffer, [], tmpJobs[0])
                             allFinished = cThr.checkSubDatasetsInJobset()
                             self.proxyLock.release()
                             _logger.debug("closer checked sub datasets in the jobset for %s : %s" % (name, allFinished))
                 # no files in filesTable
                 if allFinished:
                     _logger.debug("freeze %s " % name)
                     # these name patterns are not registered in Rucio,
                     # so skip the Rucio close/metadata calls for them
                     dsExists = True
                     if name.startswith('pandaddm_') or name.startswith('user.') or name.startswith('group.') \
                             or name.startswith('hc_test.') or name.startswith('panda.um.'):
                         dsExists = False
                     if name.startswith('panda.um.'):
                         # merging dataset: find a merging/failed file to locate the merge job
                         self.proxyLock.acquire()
                         retMer,resMer = taskBuffer.querySQLS("SELECT /*+ index(tab FILESTABLE4_DESTDBLOCK_IDX) */ PandaID FROM ATLAS_PANDA.filesTable4 tab WHERE destinationDBlock=:destinationDBlock AND status IN (:statusM,:statusF) ",
                                                              {':destinationDBlock':name,
                                                               ':statusM':'merging',
                                                               ':statusF':'failed'})
                         self.proxyLock.release()
                         if resMer is not None and len(resMer)>0:
                             mergeID = resMer[0][0]
                             # get merging jobs
                             self.proxyLock.acquire()
                             mergingJobs = taskBuffer.peekJobs([mergeID],fromDefined=False,fromArchived=False,fromWaiting=False)
                             self.proxyLock.release()    
                             mergeJob = mergingJobs[0]
                             if mergeJob is not None:
                                 tmpDestDBlocks = []
                                 # get destDBlock
                                 for tmpFile in mergeJob.Files:
                                     if tmpFile.type in ['output','log']:
                                         if not tmpFile.destinationDBlock in tmpDestDBlocks:
                                             tmpDestDBlocks.append(tmpFile.destinationDBlock)
                                 # run the JEDI closer on the merge job's destination blocks
                                 _logger.debug("start JEDI closer for %s " % name)
                                 self.proxyLock.acquire()
                                 cThr = Closer(taskBuffer,tmpDestDBlocks,mergeJob)
                                 cThr.start()
                                 cThr.join()
                                 self.proxyLock.release()
                                 _logger.debug("end JEDI closer for %s " % name)
                                 # the closer handled this dataset; move on
                                 continue
                             else:
                                 _logger.debug("failed to get merging job for %s " % name)
                         else:
                             _logger.debug("failed to get merging file for %s " % name)
                         status,out = True,''
                     elif dsExists:
                         # check if dataset exists
                         status,out = rucioAPI.getMetaData(name)
                         if status == True:
                             if out is not None:
                                 try:
                                     rucioAPI.closeDataset(name)
                                     status = True
                                 except Exception:
                                     errtype,errvalue = sys.exc_info()[:2]
                                     out = 'failed to freeze : {0} {1}'.format(errtype,errvalue)
                                     status = False
                             else:
                                 # dataset not exist
                                 status,out = True,''
                                 dsExists = False
                     else:
                         status,out = True,''
                     if not status:
                         _logger.error('{0} failed to freeze with {1}'.format(name,out))
                     else:
                         # mark the dataset completed in the Datasets table
                         self.proxyLock.acquire()
                         varMap = {}
                         varMap[':vuid'] = vuid
                         varMap[':status'] = 'completed' 
                         taskBuffer.querySQLS("UPDATE ATLAS_PANDA.Datasets SET status=:status,modificationdate=CURRENT_DATE WHERE vuid=:vuid",
                                          varMap)
                         self.proxyLock.release()                            
                         if name.startswith('pandaddm_') or name.startswith('panda.um.') or not dsExists:
                             continue
                         # set tobedeleted to dis
                         setTobeDeletedToDis(name)
                         # count # of files
                         status,out = rucioAPI.getNumberOfFiles(name)
                         if status is not True:
                             if status is False:
                                 _logger.error(out)
                         else:
                             _logger.debug(out)                                            
                             try:
                                 nFile = int(out)
                                 _logger.debug(nFile)
                                 if nFile == 0:
                                     # erase empty dataset
                                     _logger.debug('erase %s' % name)                                
                                     status,out = rucioAPI.eraseDataset(name)
                                     _logger.debug('OK with %s' % name)
                             except Exception:
                                 pass
                 else:
                     # not all files finished yet: keep the dataset alive
                     # by refreshing its modification date
                     _logger.debug("wait %s " % name)
                     self.proxyLock.acquire()                        
                     taskBuffer.querySQLS("UPDATE ATLAS_PANDA.Datasets SET modificationdate=CURRENT_DATE WHERE vuid=:vuid", {':vuid':vuid})
                     self.proxyLock.release()                                                    
             _logger.debug("end %s " % name)
     except Exception:
         errStr = traceback.format_exc()
         _logger.error(errStr)
     # deregister this worker and release the shared lock
     self.pool.remove(self)
     self.lock.release()
Example No. 3
0
 def run(self):
     try:
         _logger.debug('%s Start %s' % (self.pandaID,self.job.jobStatus))
         flagComplete    = True
         topUserDsList   = []
         usingMerger     = False        
         disableNotifier = False
         firstIndvDS     = True
         finalStatusDS   = []
         for destinationDBlock in self.destinationDBlocks:
             dsList = []
             _logger.debug('%s start %s' % (self.pandaID,destinationDBlock))
             # ignore tid datasets
             if re.search('_tid[\d_]+$',destinationDBlock):
                 _logger.debug('%s skip %s' % (self.pandaID,destinationDBlock))                
                 continue
             # ignore HC datasets
             if re.search('^hc_test\.',destinationDBlock) is not None or re.search('^user\.gangarbt\.',destinationDBlock) is not None:
                 if re.search('_sub\d+$',destinationDBlock) is None and re.search('\.lib$',destinationDBlock) is None:
                     _logger.debug('%s skip HC %s' % (self.pandaID,destinationDBlock))                
                     continue
             # query dataset
             if destinationDBlock in self.datasetMap:
                 dataset = self.datasetMap[destinationDBlock]
             else:
                 dataset = self.taskBuffer.queryDatasetWithMap({'name':destinationDBlock})
             if dataset is None:
                 _logger.error('%s Not found : %s' % (self.pandaID,destinationDBlock))
                 flagComplete = False
                 continue
             # skip tobedeleted/tobeclosed 
             if dataset.status in ['cleanup','tobeclosed','completed','deleted']:
                 _logger.debug('%s skip %s due to %s' % (self.pandaID,destinationDBlock,dataset.status))
                 continue
             dsList.append(dataset)
             # sort
             dsList.sort()
             # count number of completed files
             notFinish = self.taskBuffer.countFilesWithMap({'destinationDBlock':destinationDBlock,
                                                            'status':'unknown'})
             if notFinish < 0:
                 _logger.error('%s Invalid DB return : %s' % (self.pandaID,notFinish))
                 flagComplete = False                
                 continue
             # check if completed
             _logger.debug('%s notFinish:%s' % (self.pandaID,notFinish))
             if self.job.destinationSE == 'local' and self.job.prodSourceLabel in ['user','panda']:
                 # close non-DQ2 destinationDBlock immediately
                 finalStatus = 'closed'
             elif self.job.lockedby == 'jedi' and self.isTopLevelDS(destinationDBlock):
                 # set it closed in order not to trigger DDM cleanup. It will be closed by JEDI
                 finalStatus = 'closed'
             elif self.job.prodSourceLabel in ['user'] and "--mergeOutput" in self.job.jobParameters \
                      and self.job.processingType != 'usermerge':
                 # merge output files
                 if firstIndvDS:
                     # set 'tobemerged' to only the first dataset to avoid triggering many Mergers for --individualOutDS
                     finalStatus = 'tobemerged'
                     firstIndvDS = False
                 else:
                     finalStatus = 'tobeclosed'
                 # set merging to top dataset
                 usingMerger = True
                 # disable Notifier
                 disableNotifier = True
             elif self.job.produceUnMerge():
                 finalStatus = 'doing'
             else:
                 # set status to 'tobeclosed' to trigger DQ2 closing
                 finalStatus = 'tobeclosed'
             if notFinish == 0 and EventServiceUtils.isEventServiceMerge(self.job):
                 allInJobsetFinished = self.checkSubDatasetsInJobset()
             else:
                 allInJobsetFinished = True
             if notFinish == 0 and allInJobsetFinished: 
                 _logger.debug('%s set %s to dataset : %s' % (self.pandaID,finalStatus,destinationDBlock))
                 # set status
                 dataset.status = finalStatus
                 # update dataset in DB
                 retT = self.taskBuffer.updateDatasets(dsList,withLock=True,withCriteria="status<>:crStatus AND status<>:lockStatus ",
                                                       criteriaMap={':crStatus':finalStatus,':lockStatus':'locked'})
                 if len(retT) > 0 and retT[0]==1:
                     finalStatusDS += dsList
                     # close user datasets
                     if self.job.prodSourceLabel in ['user'] and self.job.destinationDBlock.endswith('/') \
                            and (dataset.name.startswith('user') or dataset.name.startswith('group')):
                         # get top-level user dataset 
                         topUserDsName = re.sub('_sub\d+$','',dataset.name)
                         # update if it is the first attempt
                         if topUserDsName != dataset.name and not topUserDsName in topUserDsList and self.job.lockedby != 'jedi':
                             topUserDs = self.taskBuffer.queryDatasetWithMap({'name':topUserDsName})
                             if topUserDs is not None:
                                 # check status
                                 if topUserDs.status in ['completed','cleanup','tobeclosed','deleted',
                                                         'tobemerged','merging']:
                                     _logger.debug('%s skip %s due to status=%s' % (self.pandaID,topUserDsName,topUserDs.status))
                                 else:
                                     # set status
                                     if self.job.processingType.startswith('gangarobot') or \
                                            self.job.processingType.startswith('hammercloud'):
                                         # not trigger freezing for HC datasets so that files can be appended
                                         topUserDs.status = 'completed'
                                     elif not usingMerger:
                                         topUserDs.status = finalStatus
                                     else:
                                         topUserDs.status = 'merging'
                                     # append to avoid repetition
                                     topUserDsList.append(topUserDsName)
                                     # update DB
                                     retTopT = self.taskBuffer.updateDatasets([topUserDs],withLock=True,withCriteria="status<>:crStatus",
                                                                              criteriaMap={':crStatus':topUserDs.status})
                                     if len(retTopT) > 0 and retTopT[0]==1:
                                         _logger.debug('%s set %s to top dataset : %s' % (self.pandaID,topUserDs.status,topUserDsName))
                                     else:
                                         _logger.debug('%s failed to update top dataset : %s' % (self.pandaID,topUserDsName))
                         # get parent dataset for merge job
                         if self.job.processingType == 'usermerge':
                             tmpMatch = re.search('--parentDS ([^ \'\"]+)',self.job.jobParameters)
                             if tmpMatch is None:
                                 _logger.error('%s failed to extract parentDS' % self.pandaID)
                             else:
                                 unmergedDsName = tmpMatch.group(1)
                                 # update if it is the first attempt
                                 if not unmergedDsName in topUserDsList:
                                     unmergedDs = self.taskBuffer.queryDatasetWithMap({'name':unmergedDsName})
                                     if unmergedDs is None:
                                         _logger.error('%s failed to get parentDS=%s from DB' % (self.pandaID,unmergedDsName))
                                     else:
                                         # check status
                                         if unmergedDs.status in ['completed','cleanup','tobeclosed']:
                                             _logger.debug('%s skip %s due to status=%s' % (self.pandaID,unmergedDsName,unmergedDs.status))
                                         else:
                                             # set status
                                             unmergedDs.status = finalStatus
                                             # append to avoid repetition
                                             topUserDsList.append(unmergedDsName)
                                             # update DB
                                             retTopT = self.taskBuffer.updateDatasets([unmergedDs],withLock=True,withCriteria="status<>:crStatus",
                                                                                      criteriaMap={':crStatus':unmergedDs.status})
                                             if len(retTopT) > 0 and retTopT[0]==1:
                                                 _logger.debug('%s set %s to parent dataset : %s' % (self.pandaID,unmergedDs.status,unmergedDsName))
                                             else:
                                                 _logger.debug('%s failed to update parent dataset : %s' % (self.pandaID,unmergedDsName))
                     # start Activator
                     if re.search('_sub\d+$',dataset.name) is None:
                         if self.job.prodSourceLabel=='panda' and self.job.processingType in ['merge','unmerge']:
                             # don't trigger Activator for merge jobs
                             pass
                         else:
                             if self.job.jobStatus == 'finished':
                                 aThr = Activator(self.taskBuffer,dataset)
                                 aThr.start()
                                 aThr.join()
                 else:
                     # unset flag since another thread already updated 
                     #flagComplete = False
                     pass
             else:
                 # update dataset in DB
                 self.taskBuffer.updateDatasets(dsList,withLock=True,withCriteria="status<>:crStatus AND status<>:lockStatus ",
                                                criteriaMap={':crStatus':finalStatus,':lockStatus':'locked'})
                 # unset flag
                 flagComplete = False
             # end
             _logger.debug('%s end %s' % (self.pandaID,destinationDBlock))
         # special actions for vo
         if flagComplete:
             closerPluginClass = panda_config.getPlugin('closer_plugins',self.job.VO)
             if closerPluginClass is None and self.job.VO == 'atlas':
                 # use ATLAS plugin for ATLAS
                 from pandaserver.dataservice.CloserAtlasPlugin import CloserAtlasPlugin
                 closerPluginClass = CloserAtlasPlugin
             if closerPluginClass is not None:
                 closerPlugin = closerPluginClass(self.job,finalStatusDS,_logger)
                 closerPlugin.execute()
         # change pending jobs to failed
         finalizedFlag = True
         if flagComplete and self.job.prodSourceLabel=='user':
             _logger.debug('%s finalize %s %s' % (self.pandaID,self.job.prodUserName,self.job.jobDefinitionID))
             finalizedFlag = self.taskBuffer.finalizePendingJobs(self.job.prodUserName,self.job.jobDefinitionID)
             _logger.debug('%s finalized with %s' % (self.pandaID,finalizedFlag))
         # update unmerged datasets in JEDI to trigger merging
         if flagComplete and self.job.produceUnMerge() and finalStatusDS != []:
             if finalizedFlag:
                 tmpStat = self.taskBuffer.updateUnmergedDatasets(self.job,finalStatusDS)
                 _logger.debug('%s updated unmerged datasets with %s' % (self.pandaID,tmpStat))
         # start notifier
         _logger.debug('%s source:%s complete:%s' % (self.pandaID,self.job.prodSourceLabel,flagComplete))
         if (self.job.jobStatus != 'transferring') and ((flagComplete and self.job.prodSourceLabel=='user') or \
            (self.job.jobStatus=='failed' and self.job.prodSourceLabel=='panda')) and \
            self.job.lockedby != 'jedi':
             # don't send email for merge jobs
             if (not disableNotifier) and not self.job.processingType in ['merge','unmerge']:
                 useNotifier = True
                 summaryInfo = {}
                 # check all jobDefIDs in jobsetID
                 if not self.job.jobsetID in [0,None,'NULL']:
                     useNotifier,summaryInfo = self.taskBuffer.checkDatasetStatusForNotifier(self.job.jobsetID,self.job.jobDefinitionID,
                                                                                             self.job.prodUserName)
                     _logger.debug('%s useNotifier:%s' % (self.pandaID,useNotifier))
                 if useNotifier:
                     _logger.debug('%s start Notifier' % self.pandaID)
                     nThr = Notifier.Notifier(self.taskBuffer,self.job,self.destinationDBlocks,summaryInfo)
                     nThr.run()
                     _logger.debug('%s end Notifier' % self.pandaID)                    
         _logger.debug('%s End' % self.pandaID)
     except Exception:
         errType,errValue = sys.exc_info()[:2]
         _logger.error("%s %s" % (errType,errValue))
Exemplo n.º 4
0
 def updateJobs(self, jobList, tmpLog):
     """Sort jobs by status and propagate the outcome to the task buffer.

     Jobs are classified into four groups: failed/cancelled, waiting,
     activated (no input transfer required), and assigned (normal jobs
     waiting for input transfer).  Jobs whose events are already all done
     are finished immediately instead of being activated/assigned, which
     triggers merge generation downstream.

     :param jobList: job specifications to process
     :param tmpLog: logger used for progress messages
     """
     assignedJobs = []
     failedJobs = []
     activateJobs = []
     waitingJobs = []
     # sort out jobs
     for job in jobList:
         # failed jobs
         if job.jobStatus in ['failed', 'cancelled']:
             failedJobs.append(job)
         # waiting
         elif job.jobStatus == 'waiting':
             waitingJobs.append(job)
         # no input jobs
         elif job.dispatchDBlock == 'NULL':
             activateJobs.append(job)
         # normal jobs
         else:
             # change status
             job.jobStatus = "assigned"
             assignedJobs.append(job)
     # trigger merge generation if all events are done
     newActivateJobs = []
     nFinished = 0
     for job in activateJobs:
         if job.notDiscardEvents() and job.allOkEvents(
         ) and not EventServiceUtils.isEventServiceMerge(job):
             # register the job first, then close it out as finished
             self.taskBuffer.activateJobs([job])
             # change status
             job.jobStatus = "finished"
             self.taskBuffer.updateJobs([job], False)
             nFinished += 1
         else:
             newActivateJobs.append(job)
     activateJobs = newActivateJobs
     tmpLog.debug('# of finished jobs in activated : {0}'.format(nFinished))
     newAssignedJobs = []
     nFinished = 0
     for job in assignedJobs:
         if job.notDiscardEvents() and job.allOkEvents(
         ) and not EventServiceUtils.isEventServiceMerge(job):
             # register the job first, then close it out as finished
             self.taskBuffer.updateJobs([job], True)
             # change status
             job.jobStatus = "finished"
             self.taskBuffer.updateJobs([job], True)
             nFinished += 1
         else:
             newAssignedJobs.append(job)
     assignedJobs = newAssignedJobs
     tmpLog.debug('# of finished jobs in defined : {0}'.format(nFinished))
     # update DB
     tmpLog.debug('# of activated jobs : {0}'.format(len(activateJobs)))
     self.taskBuffer.activateJobs(activateJobs)
     tmpLog.debug('# of updated jobs : {0}'.format(len(assignedJobs)))
     self.taskBuffer.updateJobs(assignedJobs, True)
     tmpLog.debug('# of failed jobs : {0}'.format(len(failedJobs)))
     self.taskBuffer.updateJobs(failedJobs, True)
     tmpLog.debug('# of waiting jobs : {0}'.format(len(waitingJobs)))
     self.taskBuffer.keepJobs(waitingJobs)
Exemplo n.º 5
0
 def appendJob(self, job, siteMapperCache=None):
     # event service merge
     if EventServiceUtils.isEventServiceMerge(job):
         isEventServiceMerge = True
     else:
         isEventServiceMerge = False
     # PandaID
     self.data['PandaID'] = job.PandaID
     # prodSourceLabel
     self.data['prodSourceLabel'] = job.prodSourceLabel
     # swRelease
     self.data['swRelease'] = job.AtlasRelease
     # homepackage
     self.data['homepackage'] = job.homepackage
     # transformation
     self.data['transformation'] = job.transformation
     # job name
     self.data['jobName'] = job.jobName
     # job definition ID
     self.data['jobDefinitionID'] = job.jobDefinitionID
     # cloud
     self.data['cloud'] = job.cloud
     # files
     strIFiles = ''
     strOFiles = ''
     strDispatch = ''
     strDisToken = ''
     strDisTokenForOutput = ''
     strDestination = ''
     strRealDataset = ''
     strRealDatasetIn = ''
     strProdDBlock = ''
     strDestToken = ''
     strProdToken = ''
     strProdTokenForOutput = ''
     strGUID = ''
     strFSize = ''
     strCheckSum = ''
     strFileDestinationSE = ''
     strScopeIn = ''
     strScopeOut = ''
     strScopeLog = ''
     logFile = ''
     logGUID = ''
     ddmEndPointIn = []
     ddmEndPointOut = []
     noOutput = []
     siteSpec = None
     inDsLfnMap = {}
     inLFNset = set()
     if siteMapperCache is not None:
         siteMapper = siteMapperCache.getObj()
         siteSpec = siteMapper.getSite(job.computingSite)
         # resolve destSE
         try:
             job.destinationSE = siteMapper.resolveNucleus(
                 job.destinationSE)
             for tmpFile in job.Files:
                 tmpFile.destinationSE = siteMapper.resolveNucleus(
                     tmpFile.destinationSE)
         except Exception:
             pass
         siteMapperCache.releaseObj()
     for file in job.Files:
         if file.type == 'input':
             if EventServiceUtils.isJumboJob(job) and file.lfn in inLFNset:
                 pass
             else:
                 inLFNset.add(file.lfn)
                 if strIFiles != '':
                     strIFiles += ','
                 strIFiles += file.lfn
                 if strDispatch != '':
                     strDispatch += ','
                 strDispatch += file.dispatchDBlock
                 if strDisToken != '':
                     strDisToken += ','
                 strDisToken += file.dispatchDBlockToken
                 strProdDBlock += '%s,' % file.prodDBlock
                 if not isEventServiceMerge:
                     strProdToken += '%s,' % file.prodDBlockToken
                 else:
                     strProdToken += '%s,' % job.metadata[1][file.lfn]
                 if strGUID != '':
                     strGUID += ','
                 strGUID += file.GUID
                 strRealDatasetIn += '%s,' % file.dataset
                 strFSize += '%s,' % file.fsize
                 if file.checksum not in ['', 'NULL', None]:
                     strCheckSum += '%s,' % file.checksum
                 else:
                     strCheckSum += '%s,' % file.md5sum
                 strScopeIn += '%s,' % file.scope
                 ddmEndPointIn.append(
                     self.getDdmEndpoint(siteSpec, file.dispatchDBlockToken,
                                         'input', job.prodSourceLabel,
                                         job.job_label))
                 if file.dataset not in inDsLfnMap:
                     inDsLfnMap[file.dataset] = []
                 inDsLfnMap[file.dataset].append(file.lfn)
         if file.type == 'output' or file.type == 'log':
             if strOFiles != '':
                 strOFiles += ','
             strOFiles += file.lfn
             if strDestination != '':
                 strDestination += ','
             strDestination += file.destinationDBlock
             if strRealDataset != '':
                 strRealDataset += ','
             strRealDataset += file.dataset
             strFileDestinationSE += '%s,' % file.destinationSE
             if file.type == 'log':
                 logFile = file.lfn
                 logGUID = file.GUID
                 strScopeLog = file.scope
             else:
                 strScopeOut += '%s,' % file.scope
             if strDestToken != '':
                 strDestToken += ','
             strDestToken += re.sub(
                 '^ddd:', 'dst:',
                 file.destinationDBlockToken.split(',')[0])
             strDisTokenForOutput += '%s,' % file.dispatchDBlockToken
             strProdTokenForOutput += '%s,' % file.prodDBlockToken
             ddmEndPointOut.append(
                 self.getDdmEndpoint(
                     siteSpec,
                     file.destinationDBlockToken.split(',')[0], 'output',
                     job.prodSourceLabel, job.job_label))
             if file.isAllowedNoOutput():
                 noOutput.append(file.lfn)
     # inFiles
     self.data['inFiles'] = strIFiles
     # dispatch DBlock
     self.data['dispatchDblock'] = strDispatch
     # dispatch DBlock space token
     self.data['dispatchDBlockToken'] = strDisToken
     # dispatch DBlock space token for output
     self.data['dispatchDBlockTokenForOut'] = strDisTokenForOutput[:-1]
     # outFiles
     self.data['outFiles'] = strOFiles
     # destination DBlock
     self.data['destinationDblock'] = strDestination
     # destination DBlock space token
     self.data['destinationDBlockToken'] = strDestToken
     # prod DBlocks
     self.data['prodDBlocks'] = strProdDBlock[:-1]
     # prod DBlock space token
     self.data['prodDBlockToken'] = strProdToken[:-1]
     # real output datasets
     self.data['realDatasets'] = strRealDataset
     # real output datasets
     self.data['realDatasetsIn'] = strRealDatasetIn[:-1]
     # file's destinationSE
     self.data['fileDestinationSE'] = strFileDestinationSE[:-1]
     # log filename
     self.data['logFile'] = logFile
     # log GUID
     self.data['logGUID'] = logGUID
     # jobPars
     self.data['jobPars'], ppSteps = job.extractMultiStepExec()
     if ppSteps is not None:
         self.data.update(ppSteps)
     if job.to_encode_job_params():
         self.data['jobPars'] = base64.b64encode(
             self.data['jobPars'].encode()).decode()
     # attempt number
     self.data['attemptNr'] = job.attemptNr
     # GUIDs
     self.data['GUID'] = strGUID
     # checksum
     self.data['checksum'] = strCheckSum[:-1]
     # fsize
     self.data['fsize'] = strFSize[:-1]
     # scope
     self.data['scopeIn'] = strScopeIn[:-1]
     self.data['scopeOut'] = strScopeOut[:-1]
     self.data['scopeLog'] = strScopeLog
     # DDM endpoints
     try:
         self.data['ddmEndPointIn'] = ','.join(ddmEndPointIn)
     except TypeError:
         self.data['ddmEndPointIn'] = ''
     try:
         self.data['ddmEndPointOut'] = ','.join(ddmEndPointOut)
     except TypeError:
         self.data['ddmEndPointOut'] = ''
     # destinationSE
     self.data['destinationSE'] = job.destinationSE
     # user ID
     self.data['prodUserID'] = job.prodUserID
     # CPU count
     self.data['maxCpuCount'] = job.maxCpuCount
     # RAM count
     self.data['minRamCount'] = job.minRamCount
     # disk count
     self.data['maxDiskCount'] = job.maxDiskCount
     # cmtconfig
     if ppSteps is None:
         self.data['cmtConfig'] = job.cmtConfig
     else:
         self.data['cmtConfig'] = ''
     # processingType
     self.data['processingType'] = job.processingType
     # transferType
     self.data['transferType'] = job.transferType
     # sourceSite
     self.data['sourceSite'] = job.sourceSite
     # current priority
     self.data['currentPriority'] = job.currentPriority
     # taskID
     if job.lockedby == 'jedi':
         self.data['taskID'] = job.jediTaskID
     else:
         self.data['taskID'] = job.taskID
     # core count
     if job.coreCount in ['NULL', None]:
         self.data['coreCount'] = 1
     else:
         self.data['coreCount'] = job.coreCount
     # jobsetID
     self.data['jobsetID'] = job.jobsetID
     # nucleus
     self.data['nucleus'] = job.nucleus
     # walltime
     self.data['maxWalltime'] = job.maxWalltime
     # looping check
     if job.is_no_looping_check():
         self.data['loopingCheck'] = False
     # debug mode
     if job.specialHandling is not None and 'debug' in job.specialHandling:
         self.data['debug'] = 'True'
     # event service or job cloning
     if EventServiceUtils.isJobCloningJob(job):
         self.data['cloneJob'] = EventServiceUtils.getJobCloningType(job)
     elif EventServiceUtils.isEventServiceJob(
             job) or EventServiceUtils.isJumboJob(job):
         self.data['eventService'] = 'True'
         # prod DBlock space token for pre-merging output
         self.data['prodDBlockTokenForOutput'] = strProdTokenForOutput[:-1]
     # event service merge
     if isEventServiceMerge:
         self.data['eventServiceMerge'] = 'True'
         # write to file for ES merge
         writeToFileStr = ''
         try:
             for outputName in job.metadata[0]:
                 inputList = job.metadata[0][outputName]
                 writeToFileStr += 'inputFor_{0}:'.format(outputName)
                 for tmpInput in inputList:
                     writeToFileStr += '{0},'.format(tmpInput)
                 writeToFileStr = writeToFileStr[:-1]
                 writeToFileStr += '^'
             writeToFileStr = writeToFileStr[:-1]
         except Exception:
             pass
         self.data['writeToFile'] = writeToFileStr
     elif job.writeInputToFile():
         try:
             # write input to file
             writeToFileStr = ''
             for inDS in inDsLfnMap:
                 inputList = inDsLfnMap[inDS]
                 inDS = re.sub('/$', '', inDS)
                 inDS = inDS.split(':')[-1]
                 writeToFileStr += 'tmpin_{0}:'.format(inDS)
                 writeToFileStr += ','.join(inputList)
                 writeToFileStr += '^'
             writeToFileStr = writeToFileStr[:-1]
             self.data['writeToFile'] = writeToFileStr
         except Exception:
             pass
     # replace placeholder
     if EventServiceUtils.isJumboJob(job) or EventServiceUtils.isCoJumboJob(
             job):
         try:
             for inDS in inDsLfnMap:
                 inputList = inDsLfnMap[inDS]
                 inDS = re.sub('/$', '', inDS)
                 inDS = inDS.split(':')[-1]
                 srcStr = 'tmpin__cnt_{0}'.format(inDS)
                 dstStr = ','.join(inputList)
                 self.data['jobPars'] = self.data['jobPars'].replace(
                     srcStr, dstStr)
         except Exception:
             pass
     # no output
     if noOutput != []:
         self.data['allowNoOutput'] = ','.join(noOutput)
     # alternative stage-out
     if job.getAltStgOut() is not None:
         self.data['altStageOut'] = job.getAltStgOut()
     # log to OS
     if job.putLogToOS():
         self.data['putLogToOS'] = 'True'
     # suppress execute string conversion
     if job.noExecStrCnv():
         self.data['noExecStrCnv'] = 'True'
     # in-file positional event number
     if job.inFilePosEvtNum():
         self.data['inFilePosEvtNum'] = 'True'
     # use prefetcher
     if job.usePrefetcher():
         self.data['usePrefetcher'] = 'True'
     # image name
     if job.container_name not in ['NULL', None]:
         self.data['container_name'] = job.container_name
     # IO
     self.data['ioIntensity'] = job.get_task_attribute('ioIntensity')
     self.data['ioIntensityUnit'] = job.get_task_attribute(
         'ioIntensityUnit')
     # HPO
     if job.is_hpo_workflow():
         self.data['isHPO'] = 'True'
     # VP
     if siteSpec is not None:
         scope_input, scope_output = DataServiceUtils.select_scope(
             siteSpec, job.prodSourceLabel, job.job_label)
         if siteSpec.use_vp(scope_input):
             self.data['useVP'] = 'True'