def getDdmEndpoint(self, siteID, storageToken, prodSourceLabel):
    if not self.checkSite(siteID):
        return None
    siteSpec = self.getSite(siteID)
    scope_input, scope_output = select_scope(siteSpec, prodSourceLabel)
    if storageToken in siteSpec.setokens_output[scope_output]:
        return siteSpec.setokens_output[scope_output][storageToken]
    return siteSpec.ddm_output[scope_output]
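
# A minimal standalone sketch (not part of the class above) of the
# token-to-endpoint fallback in getDdmEndpoint: look the storage token up in
# the site's setokens_output map for the selected scope, else fall back to
# the scope's default ddm_output endpoint. The endpoint and token names
# below are illustrative placeholders, not real ATLAS endpoints.
setokens_output = {'default': {'ATLASDATADISK': 'SITE_DATADISK'}}
ddm_output = {'default': 'SITE_PRODDISK'}

def resolve_endpoint(storageToken, scope='default'):
    # token found -> mapped endpoint; otherwise the scope default
    return setokens_output[scope].get(storageToken, ddm_output[scope])

assert resolve_endpoint('ATLASDATADISK') == 'SITE_DATADISK'
assert resolve_endpoint('UNKNOWN_TOKEN') == 'SITE_PRODDISK'
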
def run(self):
    self.lock.acquire()
    try:
        # get jobs from DB
        ids = self.ids
        self.proxyLock.acquire()
        jobs = taskBuffer.peekJobs(ids, fromActive=False, fromArchived=False, fromWaiting=False)
        self.proxyLock.release()
        actJobs = []
        replicaMap = dict()
        for tmpJob in jobs:
            if tmpJob is None or tmpJob.jobStatus == 'unknown':
                continue
            # check if locally available
            siteSpec = siteMapper.getSite(tmpJob.computingSite)
            scope_input, scope_output = select_scope(siteSpec, tmpJob.prodSourceLabel)
            allOK = True
            for tmpFile in tmpJob.Files:
                # only input files are checked
                if tmpFile.type == 'input' and tmpFile.status != 'ready':
                    # get replicas
                    if tmpFile.dispatchDBlock not in replicaMap:
                        tmpStat, repMap = rucioAPI.listDatasetReplicas(tmpFile.dispatchDBlock)
                        if tmpStat != 0:
                            repMap = {}
                        replicaMap[tmpFile.dispatchDBlock] = repMap
                    # check RSEs: the replica must be on an associated non-tape
                    # endpoint and be complete (all files found)
                    for rse in replicaMap[tmpFile.dispatchDBlock]:
                        repInfo = replicaMap[tmpFile.dispatchDBlock][rse]
                        if siteSpec.ddm_endpoints_input[scope_input].isAssociated(rse) and \
                                siteSpec.ddm_endpoints_input[scope_input].getEndPoint(rse)['is_tape'] == 'N' and \
                                repInfo[0]['total'] is not None and \
                                repInfo[0]['total'] == repInfo[0]['found']:
                            tmpFile.status = 'ready'
                            break
                    # missing
                    if tmpFile.status != 'ready':
                        allOK = False
                        _logger.debug("%s skip since %s:%s is missing with rule" % (tmpJob.PandaID, tmpFile.scope, tmpFile.lfn))
                        break
            if not allOK:
                continue
            # append to run activator
            _logger.debug("%s to activate with rule" % tmpJob.PandaID)
            actJobs.append(tmpJob)
        # update
        _logger.debug("activating ...")
        self.proxyLock.acquire()
        taskBuffer.activateJobs(actJobs)
        self.proxyLock.release()
        _logger.debug("done")
        time.sleep(1)
    except Exception:
        errtype, errvalue = sys.exc_info()[:2]
        _logger.error("ActivatorThr failed with %s %s" % (errtype, errvalue))
    self.pool.remove(self)
    self.lock.release()
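
# Illustrative sketch of the per-RSE replica record the rule-based activator
# above consumes: listDatasetReplicas is assumed to return, per RSE, a list
# whose first element carries 'total' and 'found' file counts, and a dataset
# replica counts as complete only when both are defined and equal. The RSE
# names and counts here are made up for the example.
replicaMapExample = {'SITE_DATADISK': [{'total': 10, 'found': 10}],
                     'SITE_MCTAPE': [{'total': 10, 'found': 7}]}

def isCompleteReplica(repInfo):
    counts = repInfo[0]
    return counts['total'] is not None and counts['total'] == counts['found']

assert isCompleteReplica(replicaMapExample['SITE_DATADISK'])
assert not isCompleteReplica(replicaMapExample['SITE_MCTAPE'])
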
def run(self):
    try:
        self.putLog('start %s' % self.evpFileName)
        # lock evp file
        self.evpFile = open(self.evpFileName)
        try:
            fcntl.flock(self.evpFile.fileno(), fcntl.LOCK_EX | fcntl.LOCK_NB)
        except Exception:
            # release
            self.putLog("cannot lock %s" % self.evpFileName)
            self.evpFile.close()
            return True
        # options
        runEvtList = []
        eventPickDataType = ''
        eventPickStreamName = ''
        eventPickDS = []
        eventPickAmiTag = ''
        eventPickNumSites = 1
        inputFileList = []
        tagDsList = []
        tagQuery = ''
        tagStreamRef = ''
        skipDaTRI = False
        runEvtGuidMap = {}
        ei_api = ''
        # read evp file
        for tmpLine in self.evpFile:
            tmpMatch = re.search('^([^=]+)=(.+)$', tmpLine)
            # check format
            if tmpMatch is None:
                continue
            tmpItems = tmpMatch.groups()
            if tmpItems[0] == 'runEvent':
                # get run and event number
                tmpRunEvt = tmpItems[1].split(',')
                if len(tmpRunEvt) == 2:
                    runEvtList.append(tmpRunEvt)
            elif tmpItems[0] == 'eventPickDataType':
                # data type
                eventPickDataType = tmpItems[1]
            elif tmpItems[0] == 'eventPickStreamName':
                # stream name
                eventPickStreamName = tmpItems[1]
            elif tmpItems[0] == 'eventPickDS':
                # dataset pattern
                eventPickDS = tmpItems[1].split(',')
            elif tmpItems[0] == 'eventPickAmiTag':
                # AMI tag
                eventPickAmiTag = tmpItems[1]
            elif tmpItems[0] == 'eventPickNumSites':
                # the number of sites where datasets are distributed
                try:
                    eventPickNumSites = int(tmpItems[1])
                except Exception:
                    pass
            elif tmpItems[0] == 'userName':
                # user name
                self.userDN = tmpItems[1]
                self.putLog("user=%s" % self.userDN)
            elif tmpItems[0] == 'userTaskName':
                # user task name
                self.userTaskName = tmpItems[1]
            elif tmpItems[0] == 'userDatasetName':
                # user dataset name
                self.userDatasetName = tmpItems[1]
            elif tmpItems[0] == 'lockedBy':
                # client name
                self.lockedBy = tmpItems[1]
            elif tmpItems[0] == 'creationTime':
                # creation time
                self.creationTime = tmpItems[1]
            elif tmpItems[0] == 'params':
                # parameters
                self.params = tmpItems[1]
            elif tmpItems[0] == 'ei_api':
                # ei api parameter for MC
                ei_api = tmpItems[1]
            elif tmpItems[0] == 'inputFileList':
                # input file list
                inputFileList = tmpItems[1].split(',')
                try:
                    inputFileList.remove('')
                except Exception:
                    pass
            elif tmpItems[0] == 'tagDS':
                # TAG dataset
                tagDsList = tmpItems[1].split(',')
            elif tmpItems[0] == 'tagQuery':
                # query for TAG
                tagQuery = tmpItems[1]
            elif tmpItems[0] == 'tagStreamRef':
                # StreamRef for TAG
                tagStreamRef = tmpItems[1]
                if not tagStreamRef.endswith('_ref'):
                    tagStreamRef += '_ref'
            elif tmpItems[0] == 'runEvtGuidMap':
                # GUIDs
                try:
                    runEvtGuidMap = eval(tmpItems[1])
                except Exception:
                    pass
        # extract task name
        if self.userTaskName == '' and self.params != '':
            try:
                tmpMatch = re.search('--outDS(=| ) *([^ ]+)', self.params)
                if tmpMatch is not None:
                    self.userTaskName = tmpMatch.group(2)
                    if not self.userTaskName.endswith('/'):
                        self.userTaskName += '/'
            except Exception:
                pass
        # suppress DaTRI
        if self.params != '':
            if '--eventPickSkipDaTRI' in self.params:
                skipDaTRI = True
        # get compact user name
        compactDN = self.taskBuffer.cleanUserID(self.userDN)
        # get jediTaskID
        self.jediTaskID = self.taskBuffer.getTaskIDwithTaskNameJEDI(compactDN, self.userTaskName)
        # get prodSourceLabel
        self.prodSourceLabel, self.job_label = self.taskBuffer.getProdSourceLabelwithTaskID(self.jediTaskID)
        # convert run/event list to dataset/file list
        tmpRet, locationMap, allFiles = self.pd2p.convertEvtRunToDatasets(runEvtList,
                                                                          eventPickDataType,
                                                                          eventPickStreamName,
                                                                          eventPickDS,
                                                                          eventPickAmiTag,
                                                                          self.userDN,
                                                                          runEvtGuidMap,
                                                                          ei_api)
        if not tmpRet:
            if 'isFatal' in locationMap and locationMap['isFatal'] is True:
                self.ignoreError = False
            self.endWithError('Failed to convert the run/event list to a dataset/file list')
            return False
        # use only files in the list
        if inputFileList != []:
            tmpAllFiles = []
            for tmpFile in allFiles:
                if tmpFile['lfn'] in inputFileList:
                    tmpAllFiles.append(tmpFile)
            allFiles = tmpAllFiles
        # remove redundant CN from DN
        tmpDN = self.userDN
        tmpDN = re.sub('/CN=limited proxy', '', tmpDN)
        tmpDN = re.sub('(/CN=proxy)+$', '', tmpDN)
        # make dataset container
        tmpRet = self.pd2p.registerDatasetContainerWithDatasets(self.userDatasetName, allFiles,
                                                                locationMap,
                                                                nSites=eventPickNumSites,
                                                                owner=tmpDN)
        if not tmpRet:
            self.endWithError('Failed to make a dataset container %s' % self.userDatasetName)
            return False
        # skip DaTRI
        if skipDaTRI:
            # successfully terminated
            self.putLog("skip DaTRI")
            # update task
            self.taskBuffer.updateTaskModTimeJEDI(self.jediTaskID)
        else:
            # get candidates
            tmpRet, candidateMaps = self.pd2p.getCandidates(self.userDatasetName,
                                                            self.prodSourceLabel,
                                                            self.job_label,
                                                            checkUsedFile=False,
                                                            useHidden=True)
            if not tmpRet:
                self.endWithError('Failed to find candidate for destination')
                return False
            # collect all candidates
            allCandidates = []
            for tmpDS in candidateMaps:
                tmpDsVal = candidateMaps[tmpDS]
                for tmpCloud in tmpDsVal:
                    tmpCloudVal = tmpDsVal[tmpCloud]
                    for tmpSiteName in tmpCloudVal[0]:
                        if tmpSiteName not in allCandidates:
                            allCandidates.append(tmpSiteName)
            if allCandidates == []:
                self.endWithError('No candidate for destination')
                return False
            # get list of dataset (container) names
            if eventPickNumSites > 1:
                # decompose container to transfer datasets separately
                tmpRet, tmpOut = self.pd2p.getListDatasetReplicasInContainer(self.userDatasetName)
                if not tmpRet:
                    self.endWithError('Failed to get replicas in %s' % self.userDatasetName)
                    return False
                userDatasetNameList = list(tmpOut)
            else:
                # transfer container at once
                userDatasetNameList = [self.userDatasetName]
            # loop over all datasets
            sitesUsed = []
            for tmpUserDatasetName in userDatasetNameList:
                # get size of dataset container
                tmpRet, totalInputSize = rucioAPI.getDatasetSize(tmpUserDatasetName)
                if not tmpRet:
                    self.endWithError('Failed to get the size of {0} with {1}'.format(tmpUserDatasetName, totalInputSize))
                    return False
                # run brokerage
                tmpJob = JobSpec()
                tmpJob.AtlasRelease = ''
                self.putLog("run brokerage for %s" % tmpUserDatasetName)
                pandaserver.brokerage.broker.schedule([tmpJob], self.taskBuffer, self.siteMapper,
                                                      True, allCandidates, True,
                                                      datasetSize=totalInputSize)
                if tmpJob.computingSite.startswith('ERROR'):
                    self.endWithError('brokerage failed with %s' % tmpJob.computingSite)
                    return False
                self.putLog("site -> %s" % tmpJob.computingSite)
                # send transfer request
                try:
                    tmpDN = rucioAPI.parse_dn(tmpDN)
                    tmpStatus, userInfo = rucioAPI.finger(tmpDN)
                    if not tmpStatus:
                        raise RuntimeError('user info not found for {0} with {1}'.format(tmpDN, userInfo))
                    tmpDN = userInfo['nickname']
                    tmpSiteSpec = self.siteMapper.getSite(tmpJob.computingSite)
                    scope_input, scope_output = select_scope(tmpSiteSpec, JobUtils.ANALY_PS, JobUtils.ANALY_PS)
                    tmpDQ2ID = tmpSiteSpec.ddm_input[scope_input]
                    tmpMsg = "%s ds=%s site=%s id=%s" % ('registerDatasetLocation for DaTRI ',
                                                         tmpUserDatasetName, tmpDQ2ID, tmpDN)
                    self.putLog(tmpMsg)
                    rucioAPI.registerDatasetLocation(tmpUserDatasetName, [tmpDQ2ID], lifetime=14,
                                                     owner=tmpDN,
                                                     activity="User Subscriptions")
                    self.putLog('OK')
                except Exception:
                    errType, errValue = sys.exc_info()[:2]
                    tmpStr = 'Failed to send transfer request : %s %s' % (errType, errValue)
                    tmpStr = tmpStr.strip()
                    tmpStr += traceback.format_exc()
                    self.endWithError(tmpStr)
                    return False
                # list of sites already used
                sitesUsed.append(tmpJob.computingSite)
                self.putLog("used %s sites" % len(sitesUsed))
                # set candidates
                if len(sitesUsed) >= eventPickNumSites:
                    # reset candidates to limit the number of sites
                    allCandidates = sitesUsed
                    sitesUsed = []
                else:
                    # remove site
                    allCandidates.remove(tmpJob.computingSite)
            # send email notification for success
            tmpMsg = 'A transfer request was successfully sent to Rucio.\n'
            tmpMsg += 'Your task will get started once transfer is completed.'
            self.sendEmail(True, tmpMsg)
        try:
            # unlock and delete evp file
            fcntl.flock(self.evpFile.fileno(), fcntl.LOCK_UN)
            self.evpFile.close()
            os.remove(self.evpFileName)
        except Exception:
            pass
        # successfully terminated
        self.putLog("end %s" % self.evpFileName)
        return True
    except Exception:
        errType, errValue = sys.exc_info()[:2]
        self.endWithError('Got exception %s:%s %s' % (errType, errValue, traceback.format_exc()))
        return False
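
# Standalone sketch of the evp key=value parsing loop used in run() above,
# with io.StringIO standing in for the locked evp file. The sample keys and
# values are illustrative; real evp files carry the fields handled by the
# big if/elif chain (runEvent, eventPickDS, userName, ...).
import io
import re

sampleEvp = io.StringIO("runEvent=358031,1234\neventPickNumSites=2\nnot a key-value line\n")
parsedOptions = {}
for tmpLine in sampleEvp:
    tmpMatch = re.search('^([^=]+)=(.+)$', tmpLine)
    if tmpMatch is None:
        # lines that are not key=value are silently skipped, as in run()
        continue
    key, value = tmpMatch.groups()
    parsedOptions[key] = value

assert parsedOptions == {'runEvent': '358031,1234', 'eventPickNumSites': '2'}
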
def doCheck(self, taskSpecList):
    # make logger
    tmpLog = MsgWrapper(logger)
    tmpLog.debug('start doCheck')
    # return for failure
    retFatal = self.SC_FATAL, {}
    retTmpError = self.SC_FAILED, {}
    # get list of jediTaskIDs
    taskIdList = []
    taskSpecMap = {}
    for taskSpec in taskSpecList:
        taskIdList.append(taskSpec.jediTaskID)
        taskSpecMap[taskSpec.jediTaskID] = taskSpec
    # check with panda
    tmpLog.debug('check with panda')
    tmpPandaStatus, cloudsInPanda = PandaClient.seeCloudTask(taskIdList)
    if tmpPandaStatus != 0:
        tmpLog.error('failed to see clouds')
        return retTmpError
    # make return map
    retMap = {}
    for tmpTaskID, tmpCoreName in iteritems(cloudsInPanda):
        tmpLog.debug('jediTaskID={0} -> {1}'.format(tmpTaskID, tmpCoreName))
        if tmpCoreName not in ['NULL', '', None]:
            taskSpec = taskSpecMap[tmpTaskID]
            if taskSpec.useWorldCloud():
                # get destinations for WORLD cloud
                ddmIF = self.ddmIF.getInterface(taskSpec.vo)
                # get site
                siteSpec = self.siteMapper.getSite(tmpCoreName)
                scopeSiteSpec_input, scopeSiteSpec_output = select_scope(
                    siteSpec, taskSpec.prodSourceLabel,
                    JobUtils.translate_tasktype_to_jobtype(taskSpec.taskType))
                # get nucleus
                nucleus = siteSpec.pandasite
                # get output/log datasets
                tmpStat, tmpDatasetSpecs = self.taskBufferIF.getDatasetsWithJediTaskID_JEDI(tmpTaskID, ['output', 'log'])
                # get destinations
                retMap[tmpTaskID] = {'datasets': [], 'nucleus': nucleus}
                for datasetSpec in tmpDatasetSpecs:
                    # skip distributed datasets
                    if DataServiceUtils.getDistributedDestination(datasetSpec.storageToken) is not None:
                        continue
                    # get token
                    token = ddmIF.convertTokenToEndpoint(siteSpec.ddm_output[scopeSiteSpec_output],
                                                         datasetSpec.storageToken)
                    # use default endpoint
                    if token is None:
                        token = siteSpec.ddm_output[scopeSiteSpec_output]
                    # add original token
                    if datasetSpec.storageToken not in ['', None]:
                        token += '/{0}'.format(datasetSpec.storageToken)
                    retMap[tmpTaskID]['datasets'].append({'datasetID': datasetSpec.datasetID,
                                                          'token': 'dst:{0}'.format(token),
                                                          'destination': tmpCoreName})
            else:
                retMap[tmpTaskID] = tmpCoreName
    tmpLog.debug('ret {0}'.format(str(retMap)))
    # return
    tmpLog.debug('done')
    return self.SC_SUCCEEDED, retMap
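
# Hypothetical helper mirroring the token construction in doCheck above: the
# destination token is the (converted or default) output endpoint, suffixed
# with the original storage token when one is set, and prefixed with 'dst:'.
# The endpoint and token names are illustrative only.
def makeDestinationToken(endpoint, storageToken):
    token = endpoint
    if storageToken not in ['', None]:
        token += '/{0}'.format(storageToken)
    return 'dst:{0}'.format(token)

assert makeDestinationToken('SITE_DATADISK', 'ATLASDATADISK') == 'dst:SITE_DATADISK/ATLASDATADISK'
assert makeDestinationToken('SITE_DATADISK', None) == 'dst:SITE_DATADISK'
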
def run(self):
    self.lock.acquire()
    try:
        # get jobs from DB
        ids = self.ids
        self.proxyLock.acquire()
        jobs = taskBuffer.peekJobs(ids, fromActive=False, fromArchived=False, fromWaiting=False)
        self.proxyLock.release()
        actJobs = []
        for tmpJob in jobs:
            if tmpJob is None or tmpJob.jobStatus == 'unknown':
                continue
            # get LFN list
            lfns = []
            guids = []
            scopes = []
            for tmpFile in tmpJob.Files:
                # only input files are checked
                if tmpFile.type == 'input' and tmpFile.status != 'ready':
                    lfns.append(tmpFile.lfn)
                    scopes.append(tmpFile.scope)
            # get file replicas
            _logger.debug("%s check input files at %s" % (tmpJob.PandaID, tmpJob.computingSite))
            tmpStat, okFiles = rucioAPI.listFileReplicas(scopes, lfns)
            if not tmpStat:
                # replica lookup failed; leave the job for the next cycle
                continue
            # check if locally available
            siteSpec = siteMapper.getSite(tmpJob.computingSite)
            scope_input, scope_output = select_scope(siteSpec, tmpJob.prodSourceLabel)
            allOK = True
            for tmpFile in tmpJob.Files:
                # only input
                if tmpFile.type == 'input' and tmpFile.status != 'ready':
                    # check RSEs
                    if tmpFile.lfn in okFiles:
                        for rse in okFiles[tmpFile.lfn]:
                            if siteSpec.ddm_endpoints_input[scope_input].isAssociated(rse) and \
                                    siteSpec.ddm_endpoints_input[scope_input].getEndPoint(rse)['is_tape'] == 'N':
                                tmpFile.status = 'ready'
                                break
                    # missing
                    if tmpFile.status != 'ready':
                        allOK = False
                        _logger.debug("%s skip since %s:%s is missing" % (tmpJob.PandaID, tmpFile.scope, tmpFile.lfn))
                        break
            if not allOK:
                continue
            # append to run activator
            _logger.debug("%s to activate" % tmpJob.PandaID)
            actJobs.append(tmpJob)
        # update
        _logger.debug("activating ...")
        self.proxyLock.acquire()
        taskBuffer.activateJobs(actJobs)
        self.proxyLock.release()
        _logger.debug("done")
        time.sleep(1)
    except Exception:
        errtype, errvalue = sys.exc_info()[:2]
        _logger.error("ActivatorThr failed with %s %s" % (errtype, errvalue))
    self.pool.remove(self)
    self.lock.release()
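
# Standalone sketch of the disk-availability test applied per file above:
# a file counts as 'ready' when at least one RSE holding it is associated
# with the site's input endpoints and is not tape ('is_tape' == 'N'). The
# endpoint table here is an illustrative stand-in for ddm_endpoints_input.
inputEndpoints = {'SITE_DATADISK': {'is_tape': 'N'},
                  'SITE_MCTAPE': {'is_tape': 'Y'}}

def locallyAvailable(rsesWithFile):
    return any(rse in inputEndpoints and inputEndpoints[rse]['is_tape'] == 'N'
               for rse in rsesWithFile)

assert locallyAvailable(['SITE_MCTAPE', 'SITE_DATADISK'])
assert not locallyAvailable(['SITE_MCTAPE'])
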
def run(self):
    self.lock.acquire()
    try:
        # get jobs from DB
        ids = self.ids
        self.proxyLock.acquire()
        jobs = taskBuffer.peekJobs(ids, fromDefined=False, fromArchived=False, fromWaiting=False)
        self.proxyLock.release()
        upJobs = []
        finJobs = []
        for job in jobs:
            if job is None or job.jobStatus == 'unknown':
                continue
            seList = ['dummy']
            tmpNucleus = siteMapper.getNucleus(job.nucleus)
            # get SEs
            if job.prodSourceLabel == 'user' and job.destinationSE not in siteMapper.siteSpecList:
                # using --destSE for analysis job to transfer output
                seList = [job.destinationSE]
            elif tmpNucleus is not None:
                seList = list(tmpNucleus.allDdmEndPoints)
            elif siteMapper.checkCloud(job.cloud):
                # normal production jobs
                if DataServiceUtils.checkJobDestinationSE(job) is None:
                    tmpDstID = siteMapper.getCloud(job.cloud)['dest']
                else:
                    tmpDstID = job.destinationSE
                tmpDstSite = siteMapper.getSite(tmpDstID)
                scope_input, scope_output = select_scope(tmpDstSite, job.prodSourceLabel)
                seList = tmpDstSite.ddm_endpoints_output[scope_output].getLocalEndPoints()
            # get LFN list
            lfns = []
            guids = []
            scopes = []
            nTokens = 0
            for file in job.Files:
                # only output files are checked
                if file.type == 'output' or file.type == 'log':
                    if file.status == 'nooutput':
                        continue
                    if DataServiceUtils.getDistributedDestination(file.destinationDBlockToken) is not None:
                        continue
                    lfns.append(file.lfn)
                    guids.append(file.GUID)
                    scopes.append(file.scope)
                    nTokens += len(file.destinationDBlockToken.split(','))
            # get files in LRC
            _logger.debug("%s Cloud:%s" % (job.PandaID, job.cloud))
            tmpStat, okFiles = rucioAPI.listFileReplicas(scopes, lfns, seList)
            if not tmpStat:
                _logger.error("%s failed to get file replicas" % job.PandaID)
                okFiles = {}
            # count files
            nOkTokens = 0
            for okLFN in okFiles:
                okSEs = okFiles[okLFN]
                nOkTokens += len(okSEs)
            # check all files are ready
            _logger.debug("%s nToken:%s nOkToken:%s" % (job.PandaID, nTokens, nOkTokens))
            if nTokens <= nOkTokens:
                _logger.debug("%s Finisher : Finish" % job.PandaID)
                for file in job.Files:
                    if file.type == 'output' or file.type == 'log':
                        if file.status != 'nooutput':
                            file.status = 'ready'
                # append to run Finisher
                finJobs.append(job)
            else:
                endTime = job.endTime
                if endTime == 'NULL':
                    endTime = job.startTime
                # priority-dependent timeout
                tmpCloudSpec = siteMapper.getCloud(job.cloud)
                if job.currentPriority >= 800 and job.prodSourceLabel not in ['user']:
                    if 'transtimehi' in tmpCloudSpec:
                        timeOutValue = tmpCloudSpec['transtimehi']
                    else:
                        timeOutValue = 1
                else:
                    if 'transtimelo' in tmpCloudSpec:
                        timeOutValue = tmpCloudSpec['transtimelo']
                    else:
                        timeOutValue = 2
                # protection
                if timeOutValue < 1:
                    timeOutValue = 1
                timeOut = self.timeNow - datetime.timedelta(days=timeOutValue)
                _logger.debug("%s Priority:%s Limit:%s End:%s" % (job.PandaID, job.currentPriority, str(timeOut), str(endTime)))
                if endTime < timeOut:
                    # timeout
                    _logger.debug("%s Finisher : Kill" % job.PandaID)
                    strMiss = ''
                    for lfn in lfns:
                        if lfn not in okFiles:
                            strMiss += ' %s' % lfn
                    job.jobStatus = 'failed'
                    job.taskBufferErrorCode = pandaserver.taskbuffer.ErrorCode.EC_Transfer
                    job.taskBufferErrorDiag = 'transfer timeout for ' + strMiss
                    guidMap = {}
                    for file in job.Files:
                        # set file status
                        if file.status == 'transferring' or file.type in ['log', 'output']:
                            file.status = 'failed'
                        # collect GUIDs to delete files from _tid datasets
                        if file.type == 'output' or file.type == 'log':
                            if file.destinationDBlock not in guidMap:
                                guidMap[file.destinationDBlock] = []
                            guidMap[file.destinationDBlock].append(file.GUID)
                else:
                    # wait
                    _logger.debug("%s Finisher : Wait" % job.PandaID)
                    for lfn in lfns:
                        if lfn not in okFiles:
                            _logger.debug("%s -> %s" % (job.PandaID, lfn))
            upJobs.append(job)
        # update
        _logger.debug("updating ...")
        self.proxyLock.acquire()
        taskBuffer.updateJobs(upJobs, False)
        self.proxyLock.release()
        # run Finisher
        for job in finJobs:
            fThr = Finisher(taskBuffer, None, job)
            fThr.start()
            fThr.join()
        _logger.debug("done")
        time.sleep(1)
    except Exception:
        errtype, errvalue = sys.exc_info()[:2]
        errStr = "FinisherThr failed with %s %s" % (errtype, errvalue)
        errStr += traceback.format_exc()
        _logger.error(errStr)
    self.pool.remove(self)
    self.lock.release()
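
# Sketch of the priority-dependent transfer deadline computed above:
# high-priority non-user jobs use the cloud's 'transtimehi' (default 1 day),
# everything else 'transtimelo' (default 2 days), floored at 1 day; a job
# whose endTime predates the deadline is failed with a transfer timeout.
# The helper name and the sample values are illustrative.
import datetime

def transferDeadline(timeNow, currentPriority, prodSourceLabel, cloudSpec):
    if currentPriority >= 800 and prodSourceLabel not in ['user']:
        timeOutValue = cloudSpec.get('transtimehi', 1)
    else:
        timeOutValue = cloudSpec.get('transtimelo', 2)
    # protection against non-positive timeouts
    timeOutValue = max(timeOutValue, 1)
    return timeNow - datetime.timedelta(days=timeOutValue)

now = datetime.datetime(2024, 1, 10)
assert transferDeadline(now, 900, 'managed', {'transtimehi': 1}) == now - datetime.timedelta(days=1)
assert transferDeadline(now, 100, 'user', {}) == now - datetime.timedelta(days=2)
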
xml = """<?xml version="1.0" encoding="UTF-8" standalone="no" ?>
<!-- ATLAS file meta-data catalog -->
<!DOCTYPE POOLFILECATALOG SYSTEM "InMemory">
<POOLFILECATALOG>
"""
try:
    att = sys.argv[2]
except Exception:
    att = job.attemptNr

if job.computingSite in ['', None, 'NULL']:
    print('computingSite is not yet defined')
    sys.exit(0)

siteSpec = siteMapper.getSite(job.computingSite)
scope_input, scope_output = select_scope(siteSpec, job.prodSourceLabel)

with open('/cvmfs/atlas.cern.ch/repo/sw/local/etc/agis_ddmendpoints.json') as f:
    rseDict = json.load(f)

hash = hashlib.md5()
iOut = 0
outFileName = []
fileDict = {}
for tmpFile in job.Files:
    if tmpFile.type in ['output']:
        if False:  # disabled in the original: outFileName is None
            outFileName.append(tmpFile.lfn)
    if tmpFile.type in ['output', 'log']: