Example #1
 def run(self):
     # get logger
     tmpLog = LogWrapper(_logger,'<vuid={0} site={1} name={2}>'.format(self.vuid,
                                                                       self.site,
                                                                       self.dataset))
     # query dataset
     tmpLog.debug("start")
     if self.vuid != None:
         dataset = self.taskBuffer.queryDatasetWithMap({'vuid':self.vuid})
     else:
         dataset = self.taskBuffer.queryDatasetWithMap({'name':self.dataset})
     if dataset == None:
         tmpLog.error("Not found")
         tmpLog.debug("end")
         return
     tmpLog.debug("type:%s name:%s" % (dataset.type,dataset.name))
     if dataset.type == 'dispatch':
         # activate jobs in jobsDefined
         Activator(self.taskBuffer,dataset).start()
     if dataset.type == 'output':
         if dataset.name != None and re.search('^panda\..*_zip$',dataset.name) != None:
             # start unmerge jobs
             Activator(self.taskBuffer,dataset,enforce=True).start()
         else:
             # finish transferring jobs
             Finisher(self.taskBuffer,dataset,site=self.site).start()
     tmpLog.debug("end")
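
All of these examples share the same logging idiom: a module-level logger is wrapped in a LogWrapper together with a context prefix, so every debug/error line carries the entity being processed (dataset, request, job). A minimal sketch of that pattern, assuming the pandalogger package layout used by the PanDA server of this era; the logger name and prefix values here are illustrative:

from pandalogger.PandaLogger import PandaLogger
from pandalogger.LogWrapper import LogWrapper

_logger = PandaLogger().getLogger('Finisher')          # module-level logger (assumed name)
tmpLog = LogWrapper(_logger, '<vuid=%s site=%s name=%s>' % (123, 'SOME_SITE', 'user.dataset'))
tmpLog.debug('start')                                  # every line is emitted with the prefix
tmpLog.error('Not found')
tmpLog.debug('end')
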
Example #2
def uploadLog(req, file):
    if not Protocol.isSecure(req):
        return False
    if '/CN=limited proxy' in req.subprocess_env['SSL_CLIENT_S_DN']:
        return False
    tmpLog = LogWrapper(_logger, 'uploadLog <{0}>'.format(file.filename))
    tmpLog.debug("start {0}".format(req.subprocess_env['SSL_CLIENT_S_DN']))
    # size check
    sizeLimit = 100 * 1024 * 1024
    # get file size
    contentLength = 0
    try:
        contentLength = long(req.headers_in["content-length"])
    except:
        if req.headers_in.has_key("content-length"):
            tmpLog.error("cannot get CL : %s" %
                         req.headers_in["content-length"])
        else:
            tmpLog.error("no CL")
    tmpLog.debug("size %s" % contentLength)
    if contentLength > sizeLimit:
        errStr = "failed to upload log due to size limit"
        tmpLog.error(errStr)
        tmpLog.debug("end")
        return errStr
    jediLogDir = '/jedilog'
    retStr = ''
    try:
        fileBaseName = file.filename.split('/')[-1]
        fileFullPath = '{0}{1}/{2}'.format(panda_config.cache_dir, jediLogDir,
                                           fileBaseName)
        # delete old file
        if os.path.exists(fileFullPath):
            os.remove(fileFullPath)
        # write
        fo = open(fileFullPath, 'wb')
        fileContent = file.file.read()
        fo.write(fileContent)
        fo.close()
        tmpLog.debug("written to {0}".format(fileFullPath))
        retStr = 'http://{0}/cache{1}/{2}'.format(getServerHTTP(None),
                                                  jediLogDir, fileBaseName)
    except:
        errtype, errvalue = sys.exc_info()[:2]
        errStr = "failed to write log with {0}:{1}".format(
            errtype.__name__, errvalue)
        tmpLog.error(errStr)
        tmpLog.debug("end")
        return errStr
    tmpLog.debug("end")
    return retStr
Example #3
def uploadLog(req,file):
    if not Protocol.isSecure(req):
        return False
    if '/CN=limited proxy' in req.subprocess_env['SSL_CLIENT_S_DN']:
        return False
    tmpLog = LogWrapper(_logger,'uploadLog <{0}>'.format(file.filename))
    tmpLog.debug("start {0}".format(req.subprocess_env['SSL_CLIENT_S_DN']))
    # size check
    sizeLimit = 100*1024*1024
    # get file size
    contentLength = 0
    try:
        contentLength = long(req.headers_in["content-length"])
    except:
        if req.headers_in.has_key("content-length"):
            tmpLog.error("cannot get CL : %s" % req.headers_in["content-length"])
        else:
            tmpLog.error("no CL")
    tmpLog.debug("size %s" % contentLength)
    if contentLength > sizeLimit:
        errStr = "failed to upload log due to size limit"
        tmpLog.error(errStr)
        tmpLog.debug("end")            
        return errStr
    jediLogDir = '/jedilog'
    retStr = ''
    try:
        fileBaseName = file.filename.split('/')[-1]
        fileFullPath = '{0}{1}/{2}'.format(panda_config.cache_dir,jediLogDir,fileBaseName)
        # delete old file 
        if os.path.exists(fileFullPath):
            os.remove(fileFullPath)
        # write
        fo = open(fileFullPath,'wb')
        fileContent = file.file.read()
        fo.write(fileContent)
        fo.close()
        tmpLog.debug("written to {0}".format(fileFullPath))
        retStr = 'http://{0}/cache{1}/{2}'.format(getServerHTTP(None),jediLogDir,fileBaseName) 
    except:
        errtype,errvalue = sys.exc_info()[:2]
        errStr = "failed to write log with {0}:{1}".format(errtype.__name__,errvalue)
        tmpLog.error(errStr)
        tmpLog.debug("end")
        return errStr
    tmpLog.debug("end")
    return retStr
Example #4
 def run(self):
     try:
         # make a message instance
         tmpLog = LogWrapper(_logger, None)
         # run main procedure in the same process
         if not self.forkRun:
             tmpLog.debug('main start')
             tmpLog.debug('firstSubmission={0}'.format(
                 self.firstSubmission))
             # group jobs per VO
             voJobsMap = {}
             ddmFreeJobs = []
             tmpLog.debug('{0} jobs in total'.format(len(self.jobs)))
             for tmpJob in self.jobs:
                 # set VO=local for DDM free
                 if tmpJob.destinationSE == 'local':
                     tmpVO = 'local'
                 else:
                     tmpVO = tmpJob.VO
                 # make map
                 if not voJobsMap.has_key(tmpVO):
                     voJobsMap[tmpVO] = []
                 voJobsMap[tmpVO].append(tmpJob)
             # loop over all VOs
             for tmpVO, tmpJobList in voJobsMap.iteritems():
                 tmpLog.debug('vo={0} has {1} jobs'.format(
                     tmpVO, len(tmpJobList)))
                 # get plugin
                 setupperPluginClass = panda_config.getPlugin(
                     'setupper_plugins', tmpVO)
                 if setupperPluginClass == None:
                     # use ATLAS plug-in by default
                     from SetupperAtlasPlugin import SetupperAtlasPlugin
                     setupperPluginClass = SetupperAtlasPlugin
                 tmpLog.debug('plugin name -> {0}'.format(
                     setupperPluginClass.__name__))
                 try:
                     # make plugin
                     setupperPlugin = setupperPluginClass(
                         self.taskBuffer,
                         self.jobs,
                         tmpLog,
                         resubmit=self.resubmit,
                         pandaDDM=self.pandaDDM,
                         ddmAttempt=self.ddmAttempt,
                         onlyTA=self.onlyTA,
                         firstSubmission=self.firstSubmission)
                     # run plugin
                     tmpLog.debug('run plugin')
                     setupperPlugin.run()
                     # go forward if not TA
                     if not self.onlyTA:
                         # update jobs
                         tmpLog.debug('update jobs')
                         self.updateJobs(
                             setupperPlugin.jobs + setupperPlugin.jumboJobs,
                             tmpLog)
                         # execute post process
                         tmpLog.debug('post execute plugin')
                         setupperPlugin.postRun()
                     tmpLog.debug('done plugin')
                 except:
                     errtype, errvalue = sys.exc_info()[:2]
                     tmpLog.error('plugin failed with {0}:{1}'.format(
                         errtype, errvalue))
             tmpLog.debug('main end')
         else:
             tmpLog.debug('fork start')
             # write jobs to file
             import os
             import cPickle as pickle
             outFileName = '%s/set.%s_%s' % (panda_config.logdir,
                                             self.jobs[0].PandaID,
                                             commands.getoutput('uuidgen'))
             outFile = open(outFileName, 'w')
             pickle.dump(self.jobs, outFile)
             outFile.close()
             # run main procedure in another process because python doesn't release memory
             com = 'cd %s > /dev/null 2>&1; export HOME=%s; ' % (
                 panda_config.home_dir_cwd, panda_config.home_dir_cwd)
             com += 'source %s; ' % panda_config.glite_source
             com += 'env PYTHONPATH=%s:%s %s/python -Wignore %s/dataservice/forkSetupper.py -i %s' % \
                    (panda_config.pandaCommon_dir,panda_config.pandaPython_dir,panda_config.native_python,
                     panda_config.pandaPython_dir,outFileName)
             if self.onlyTA:
                 com += " -t"
             if not self.firstSubmission:
                 com += " -f"
             tmpLog.debug(com)
             # execute
             status, output = self.taskBuffer.processLimiter.getstatusoutput(
                 com)
             tmpLog.debug("return from main process: %s %s" %
                          (status, output))
             tmpLog.debug('fork end')
     except:
         errtype, errvalue = sys.exc_info()[:2]
         tmpLog.error('master failed with {0}:{1}'.format(
             errtype, errvalue))
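
The forkRun branch above pickles the job specs to a file and processes them in a separate interpreter via forkSetupper.py, so the memory used during setup is returned to the OS when the child exits. The same idea as a stand-alone sketch; the payload and the child command are illustrative, not the actual forkSetupper invocation:

import pickle
import subprocess
import sys
import tempfile

jobs = ['job-1', 'job-2']                          # stand-in for the job specs
with tempfile.NamedTemporaryFile(suffix='.pkl', delete=False) as outFile:
    pickle.dump(jobs, outFile)                     # write jobs to file
    outFileName = outFile.name
# run the heavy part in another process so its memory is freed when it exits
status = subprocess.call([sys.executable, '-c',
                          'import pickle, sys; print(pickle.load(open(sys.argv[1], "rb")))',
                          outFileName])
print('return from child process: %s' % status)
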
Example #5
class EventPicker:
    # constructor
    def __init__(self,taskBuffer,siteMapper,evpFileName,ignoreError):
        self.taskBuffer      = taskBuffer
        self.siteMapper      = siteMapper
        self.ignoreError     = ignoreError
        self.evpFileName     = evpFileName
        self.token           = datetime.datetime.utcnow().isoformat(' ')        
        # logger
        self.logger = LogWrapper(_logger,self.token)
        self.pd2p            = DynDataDistributer.DynDataDistributer([],self.taskBuffer,self.siteMapper,
                                                                     token=' ',logger=self.logger)
        self.userDatasetName = ''
        self.creationTime    = ''
        self.params          = ''
        self.lockedBy        = ''
        self.evpFile         = None
        self.userTaskName    = ''
        # message buffer
        self.msgBuffer       = []
        self.lineLimit       = 100
        # JEDI
        self.jediTaskID      = None


    # main
    def run(self):
        try:
            self.putLog('start %s' % self.evpFileName)            
            # lock evp file
            self.evpFile = open(self.evpFileName)
            try:
                fcntl.flock(self.evpFile.fileno(),fcntl.LOCK_EX|fcntl.LOCK_NB)
            except:
                # release
                self.putLog("cannot lock %s" % self.evpFileName)
                self.evpFile.close()
                return True
            # options
            runEvtList          = []
            eventPickDataType   = ''
            eventPickStreamName = ''
            eventPickDS         = []
            eventPickAmiTag     = ''
            eventPickNumSites   = 1
            inputFileList       = []
            tagDsList           = []
            tagQuery            = ''
            tagStreamRef        = ''
            skipDaTRI           = False
            runEvtGuidMap       = {}
            ei_api              = ''
            # read evp file
            for tmpLine in self.evpFile:
                tmpMatch = re.search('^([^=]+)=(.+)$',tmpLine)
                # check format
                if tmpMatch == None:
                    continue
                tmpItems = tmpMatch.groups()
                if tmpItems[0] == 'runEvent':
                    # get run and event number
                    tmpRunEvt = tmpItems[1].split(',')
                    if len(tmpRunEvt) == 2:
                        runEvtList.append(tmpRunEvt)
                elif tmpItems[0] == 'eventPickDataType':
                    # data type
                    eventPickDataType = tmpItems[1]
                elif tmpItems[0] == 'eventPickStreamName':
                    # stream name
                    eventPickStreamName = tmpItems[1]
                elif tmpItems[0] == 'eventPickDS':
                    # dataset pattern
                    eventPickDS = tmpItems[1].split(',')
                elif tmpItems[0] == 'eventPickAmiTag':
                    # AMI tag
                    eventPickAmiTag = tmpItems[1]
                elif tmpItems[0] == 'eventPickNumSites':
                    # the number of sites where datasets are distributed
                    try:
                        eventPickNumSites = int(tmpItems[1])
                    except:
                        pass
                elif tmpItems[0] == 'userName':
                    # user name
                    self.userDN = tmpItems[1]
                    self.putLog("user=%s" % self.userDN)
                elif tmpItems[0] == 'userTaskName':
                    # user task name
                    self.userTaskName = tmpItems[1]
                elif tmpItems[0] == 'userDatasetName':
                    # user dataset name
                    self.userDatasetName = tmpItems[1]
                elif tmpItems[0] == 'lockedBy':
                    # client name
                    self.lockedBy = tmpItems[1]
                elif tmpItems[0] == 'creationTime':
                    # creation time
                    self.creationTime = tmpItems[1]
                elif tmpItems[0] == 'params':
                    # parameters
                    self.params = tmpItems[1]
                elif tmpItems[0] == 'ei_api':
                    # ei api parameter for MC
                    ei_api = tmpItems[1]
                elif tmpItems[0] == 'inputFileList':
                    # input file list
                    inputFileList = tmpItems[1].split(',')
                    try:
                        inputFileList.remove('')
                    except:
                        pass
                elif tmpItems[0] == 'tagDS':
                    # TAG dataset
                    tagDsList = tmpItems[1].split(',')
                elif tmpItems[0] == 'tagQuery':
                    # query for TAG
                    tagQuery = tmpItems[1]
                elif tmpItems[0] == 'tagStreamRef':
                    # StreamRef for TAG
                    tagStreamRef = tmpItems[1]
                    if not tagStreamRef.endswith('_ref'):
                        tagStreamRef += '_ref'
                elif tmpItems[0] == 'runEvtGuidMap':
                    # GUIDs
                    try:
                        exec "runEvtGuidMap="+tmpItems[1]
                    except:
                        pass
            # extract task name
            if self.userTaskName == '' and self.params != '':
                try:
                    tmpMatch = re.search('--outDS(=| ) *([^ ]+)',self.params)
                    if tmpMatch != None:
                        self.userTaskName = tmpMatch.group(2)
                        if not self.userTaskName.endswith('/'):
                            self.userTaskName += '/'
                except:
                    pass
            # suppress DaTRI
            if self.params != '':
                if '--eventPickSkipDaTRI' in self.params:
                    skipDaTRI = True
            # get compact user name
            compactDN = self.taskBuffer.cleanUserID(self.userDN)
            # get jediTaskID
            self.jediTaskID = self.taskBuffer.getTaskIDwithTaskNameJEDI(compactDN,self.userTaskName)
            # convert 
            if tagDsList == [] or tagQuery == '':
                # convert run/event list to dataset/file list
                tmpRet,locationMap,allFiles = self.pd2p.convertEvtRunToDatasets(runEvtList,
                                                                                eventPickDataType,
                                                                                eventPickStreamName,
                                                                                eventPickDS,
                                                                                eventPickAmiTag,
                                                                                self.userDN,
                                                                                runEvtGuidMap,
                                                                                ei_api
                                                                                )
                if not tmpRet:
                    if 'isFatal' in locationMap and locationMap['isFatal'] == True:
                        self.ignoreError = False
                    self.endWithError('Failed to convert the run/event list to a dataset/file list')
                    return False
            else:
                # get parent dataset/files with TAG
                tmpRet,locationMap,allFiles = self.pd2p.getTagParentInfoUsingTagQuery(tagDsList,tagQuery,tagStreamRef) 
                if not tmpRet:
                    self.endWithError('Failed to get parent dataset/file list with TAG')
                    return False
            # use only files in the list
            if inputFileList != []:
                tmpAllFiles = []
                for tmpFile in allFiles:
                    if tmpFile['lfn'] in inputFileList:
                        tmpAllFiles.append(tmpFile)
                allFiles = tmpAllFiles        
            # remove redundant CN from DN
            tmpDN = self.userDN
            tmpDN = re.sub('/CN=limited proxy','',tmpDN)
            tmpDN = re.sub('(/CN=proxy)+$','',tmpDN)
            # make dataset container
            tmpRet = self.pd2p.registerDatasetContainerWithDatasets(self.userDatasetName,allFiles,
                                                                    locationMap,
                                                                    nSites=eventPickNumSites,
                                                                    owner=tmpDN)
            if not tmpRet:
                self.endWithError('Failed to make a dataset container %s' % self.userDatasetName)
                return False
            # skip DaTRI
            if skipDaTRI:
                # successfully terminated
                self.putLog("skip DaTRI")
                # update task
                self.taskBuffer.updateTaskModTimeJEDI(self.jediTaskID)
            else:
                # get candidates
                tmpRet,candidateMaps = self.pd2p.getCandidates(self.userDatasetName,checkUsedFile=False,
                                                               useHidden=True)
                if not tmpRet:
                    self.endWithError('Failed to find candidate for destination')
                    return False
                # collect all candidates
                allCandidates = [] 
                for tmpDS,tmpDsVal in candidateMaps.iteritems():
                    for tmpCloud,tmpCloudVal in tmpDsVal.iteritems():
                        for tmpSiteName in tmpCloudVal[0]:
                            if not tmpSiteName in allCandidates:
                                allCandidates.append(tmpSiteName)
                if allCandidates == []:
                    self.endWithError('No candidate for destination')
                    return False
                # get list of dataset (container) names
                if eventPickNumSites > 1:
                    # decompose container to transfer datasets separately
                    tmpRet,tmpOut = self.pd2p.getListDatasetReplicasInContainer(self.userDatasetName) 
                    if not tmpRet:
                        self.endWithError('Failed to get the size of %s' % self.userDatasetName)
                        return False
                    userDatasetNameList = tmpOut.keys()
                else:
                    # transfer container at once
                    userDatasetNameList = [self.userDatasetName]
                # loop over all datasets
                sitesUsed = []
                for tmpUserDatasetName in userDatasetNameList:
                    # get size of dataset container
                    tmpRet,totalInputSize = rucioAPI.getDatasetSize(tmpUserDatasetName)
                    if not tmpRet:
                        self.endWithError('Failed to get the size of %s' % tmpUserDatasetName)
                        return False
                    # run brokerage
                    tmpJob = JobSpec()
                    tmpJob.AtlasRelease = ''
                    self.putLog("run brokerage for %s" % tmpUserDatasetName)
                    brokerage.broker.schedule([tmpJob],self.taskBuffer,self.siteMapper,True,allCandidates,
                                              True,datasetSize=totalInputSize)
                    if tmpJob.computingSite.startswith('ERROR'):
                        self.endWithError('brokerage failed with %s' % tmpJob.computingSite)
                        return False
                    self.putLog("site -> %s" % tmpJob.computingSite)
                    # send transfer request
                    try:
                        tmpDN = rucioAPI.parse_dn(tmpDN)
                        tmpStatus,userInfo = rucioAPI.finger(tmpDN)
                        if not tmpStatus:
                            raise RuntimeError,'user info not found for {0} with {1}'.format(tmpDN,userInfo)
                        tmpDN = userInfo['nickname']
                        tmpDQ2ID = self.siteMapper.getSite(tmpJob.computingSite).ddm_input
                        tmpMsg = "%s ds=%s site=%s id=%s" % ('registerDatasetLocation for DaTRI ',
                                                             tmpUserDatasetName,
                                                             tmpDQ2ID,
                                                             tmpDN)
                        self.putLog(tmpMsg)
                        rucioAPI.registerDatasetLocation(tmpUserDatasetName,[tmpDQ2ID],lifetime=14,owner=tmpDN,
                                                         activity="User Subscriptions")
                        self.putLog('OK')
                    except:
                        errType,errValue = sys.exc_info()[:2]
                        tmpStr = 'Failed to send transfer request : %s %s' % (errType,errValue)
                        tmpStr = tmpStr.strip()
                        tmpStr += traceback.format_exc()
                        self.endWithError(tmpStr)
                        return False
                    # list of sites already used
                    sitesUsed.append(tmpJob.computingSite)
                    self.putLog("used %s sites" % len(sitesUsed))
                    # set candidates
                    if len(sitesUsed) >= eventPickNumSites:
                        # reset candidates to limit the number of sites
                        allCandidates = sitesUsed
                        sitesUsed = []
                    else:
                        # remove site
                        allCandidates.remove(tmpJob.computingSite)
                # send email notification for success
                tmpMsg =  'A transfer request was successfully sent to Rucio.\n'
                tmpMsg += 'Your task will get started once transfer is completed.'
                self.sendEmail(True,tmpMsg)
            try:
                # unlock and delete evp file
                fcntl.flock(self.evpFile.fileno(),fcntl.LOCK_UN)
                self.evpFile.close()
                os.remove(self.evpFileName)
            except:
                pass
            # successfully terminated
            self.putLog("end %s" % self.evpFileName)
            return True
        except:
            errType,errValue = sys.exc_info()[:2]
            self.endWithError('Got exception %s:%s %s' % (errType,errValue,traceback.format_exc()))
            return False


    # end with error
    def endWithError(self,message):
        self.putLog(message,'error')
        # unlock evp file
        try:
            fcntl.flock(self.evpFile.fileno(),fcntl.LOCK_UN)
            self.evpFile.close()
            if not self.ignoreError:
                # remove evp file
                os.remove(self.evpFileName)
                # send email notification
                self.sendEmail(False,message)
        except:
            pass
        # upload log
        if self.jediTaskID != None:
            outLog = self.uploadLog()
            self.taskBuffer.updateTaskErrorDialogJEDI(self.jediTaskID,'event picking failed. '+outLog)
            # update task
            if not self.ignoreError:
                self.taskBuffer.updateTaskModTimeJEDI(self.jediTaskID,'tobroken')
            self.putLog(outLog)
        self.putLog('end %s' % self.evpFileName)
        

    # put log
    def putLog(self,msg,type='debug'):
        tmpMsg = msg
        if type == 'error':
            self.logger.error(tmpMsg)
        else:
            self.logger.debug(tmpMsg)


    # send email notification
    def sendEmail(self,isSucceeded,message):
        # mail address
        toAdder = Notifier(self.taskBuffer,None,[]).getEmail(self.userDN)
        if toAdder == '':
            self.putLog('cannot find email address for %s' % self.userDN,'error')
            return
        # subject
        mailSubject = "PANDA notification for Event-Picking Request"
        # message
        mailBody = "Hello,\n\nHere is your request status for event picking\n\n"
        if isSucceeded:
            mailBody += "Status  : Passed to Rucio\n"
        else:
            mailBody += "Status  : Failed\n"
        mailBody +=     "Created : %s\n" % self.creationTime
        mailBody +=     "Ended   : %s\n" % datetime.datetime.utcnow().strftime('%Y-%m-%d %H:%M:%S')
        mailBody +=     "Dataset : %s\n" % self.userDatasetName
        mailBody +=     "\n"        
        mailBody +=     "Parameters : %s %s\n" % (self.lockedBy,self.params)
        mailBody +=     "\n"
        mailBody +=     "%s\n" % message
        # send
        retVal = MailUtils().send(toAdder,mailSubject,mailBody)
        # return
        return

    
    # upload log
    def uploadLog(self):
        if self.jediTaskID == None:
            return 'cannot find jediTaskID'
        strMsg = self.logger.dumpToString()
        s,o = Client.uploadLog(strMsg,self.jediTaskID)
        if s != 0:
            return "failed to upload log with {0}.".format(s)
        if o.startswith('http'):
            return '<a href="{0}">log</a>'.format(o)
        return o
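
EventPicker.run guards each evp file with a non-blocking exclusive flock, so concurrent pickers simply skip files that another process is already handling, and endWithError releases the lock on failure. The locking idiom in isolation; the temporary file is just a stand-in for the evp file:

import fcntl
import tempfile

evpFile = tempfile.NamedTemporaryFile()               # stand-in for the evp file
try:
    # non-blocking exclusive lock; raises if another process already holds it
    fcntl.flock(evpFile.fileno(), fcntl.LOCK_EX | fcntl.LOCK_NB)
except IOError:
    print('cannot lock %s' % evpFile.name)            # someone else owns it; skip
else:
    print('locked %s' % evpFile.name)                 # ... read and process the file here ...
    fcntl.flock(evpFile.fileno(), fcntl.LOCK_UN)      # release
evpFile.close()
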
Example #6
                tmpLog.sendMsg(msg, panda_config.loggername, 'userCap',
                               'warning')
        elif tmpNumTotal < maxNumRunPerUser * 0.9 and prodUserName in throttledUsers:
            # throttle user
            tmpNumJobs = taskBuffer.unThrottleUserJobs(prodUserName)
            if tmpNumJobs != None and tmpNumJobs > 0:
                msg = 'released jobs for user="{0}" since number of running jobs is less than {1}'.format(
                    prodUserName, maxNumRunPerUser)
                tmpLog.debug(msg)
                tmpLog.sendMsg(msg, panda_config.loggername, 'userCap')
except:
    errtype, errvalue = sys.exc_info()[:2]
    errStr = "cap failed: %s %s %s" % (job.PandaID, errtype, errvalue)
    errStr = errStr.strip()
    errStr += traceback.format_exc()
    tmpLog.error(errStr)

# global average
globalAverageRunDone = float(totalRunDone) / float(totalUsers)

tmpLog.debug("global average : %s" % globalAverageRunDone)

# count the number of users and run/done jobs for each site
siteRunDone = {}
siteUsers = {}
for computingSite, userValMap in usageBreakDownPerSite.iteritems():
    for prodUserName, wgValMap in userValMap.iteritems():
        for workingGroup, statValMap in wgValMap.iteritems():
            # ignore group production
            if workingGroup != None:
                continue
Example #7
 def application(environ, start_response):
     # get method name
     methodName = ''
     if environ.has_key('SCRIPT_NAME'):
         methodName = environ['SCRIPT_NAME'].split('/')[-1]
     tmpLog = LogWrapper(_logger,
                         "PID={0} {1}".format(os.getpid(), methodName))
     tmpLog.debug("start")
     regStart = datetime.datetime.utcnow()
     retType = None
     # check method name
     if not methodName in allowedMethods:
         tmpLog.error("is forbidden")
         exeRes = "False : %s is forbidden" % methodName
     else:
         # get method object
         tmpMethod = None
         try:
             exec "tmpMethod = %s" % methodName
         except:
             pass
         # object not found
         if tmpMethod == None:
             tmpLog.error("is undefined")
             exeRes = "False"
         else:
             try:
                 # get params
                 tmpPars = cgi.FieldStorage(environ['wsgi.input'],
                                            environ=environ,
                                            keep_blank_values=1)
                 # convert to map
                 params = {}
                 for tmpKey in tmpPars.keys():
                     if tmpPars[tmpKey].file != None and tmpPars[
                             tmpKey].filename != None:
                         # file
                         params[tmpKey] = tmpPars[tmpKey]
                     else:
                         # string
                         params[tmpKey] = tmpPars.getfirst(tmpKey)
                 if panda_config.entryVerbose:
                     tmpLog.debug("with %s" % str(params.keys()))
                 # dummy request object
                 dummyReq = DummyReq(environ, tmpLog)
                 # exec
                 exeRes = apply(tmpMethod, [dummyReq], params)
                 # extract return type
                 if type(exeRes) == types.DictType:
                     retType = exeRes['type']
                     exeRes = exeRes['content']
                 # convert bool to string
                 if exeRes in [True, False]:
                     exeRes = str(exeRes)
             except Exception as e:
                 tmpLog.error("execution failure : {0}".format(str(e)))
                 errStr = ""
                 for tmpKey, tmpVal in environ.iteritems():
                     errStr += "%s : %s\n" % (tmpKey, str(tmpVal))
                 tmpLog.error(errStr)
                 # return internal server error
                 start_response('500 INTERNAL SERVER ERROR',
                                [('Content-Type', 'text/plain')])
                 return [str(e)]
     if panda_config.entryVerbose:
         tmpLog.debug("done")
     regTime = datetime.datetime.utcnow() - regStart
     tmpLog.info(
         "exec_time=%s.%03d sec, return len=%s B" %
         (regTime.seconds, regTime.microseconds / 1000, len(str(exeRes))))
     # return
     if exeRes == taskbuffer.ErrorCode.EC_NotFound:
         start_response('404 Not Found', [('Content-Type', 'text/plain')])
         return ['not found']
     elif isinstance(exeRes, taskbuffer.ErrorCode.EC_Redirect):
         start_response('302 Redirect', [('Location', exeRes.url)])
         return ['redirect']
     else:
         if retType == 'json':
             start_response('200 OK',
                            [('Content-Type', 'application/json')])
         else:
             start_response('200 OK', [('Content-Type', 'text/plain')])
         return [exeRes]
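
The application callable above is a plain WSGI entry point: it receives the request environ plus a start_response callback and returns the response body as a list. A minimal callable of the same shape, served with wsgiref for local testing; everything here is illustrative rather than the PanDA entry point:

from wsgiref.simple_server import make_server

def application(environ, start_response):
    # same method-name extraction as in the example above
    methodName = environ.get('SCRIPT_NAME', '').split('/')[-1]
    body = 'hello from %s' % (methodName or 'root')
    start_response('200 OK', [('Content-Type', 'text/plain')])
    return [body.encode()]          # WSGI response bodies are byte strings

if __name__ == '__main__':
    make_server('localhost', 8080, application).serve_forever()
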
Example #8
 def run(self):
     try:
         # make a message instance
         tmpLog = LogWrapper(_logger,None)
         # run main procedure in the same process
         if not self.forkRun:
             tmpLog.debug('main start')
             tmpLog.debug('firstSubmission={0}'.format(self.firstSubmission))
             # group jobs per VO
             voJobsMap = {}
             ddmFreeJobs = []
             tmpLog.debug('{0} jobs in total'.format(len(self.jobs)))
             for tmpJob in self.jobs:
                 # set VO=local for DDM free 
                 if tmpJob.destinationSE == 'local':
                     tmpVO = 'local'
                 else:
                     tmpVO = tmpJob.VO
                 # make map
                 if not voJobsMap.has_key(tmpVO):
                     voJobsMap[tmpVO] = []
                 voJobsMap[tmpVO].append(tmpJob)
             # loop over all VOs
             for tmpVO,tmpJobList in voJobsMap.iteritems():
                 tmpLog.debug('vo={0} has {1} jobs'.format(tmpVO,len(tmpJobList)))
                 # get plugin
                 setupperPluginClass = panda_config.getPlugin('setupper_plugins',tmpVO)
                 if setupperPluginClass == None:
                     # use ATLAS plug-in by default
                     from SetupperAtlasPlugin import SetupperAtlasPlugin
                     setupperPluginClass = SetupperAtlasPlugin
                 tmpLog.debug('plugin name -> {0}'.format(setupperPluginClass.__name__))
                 try:
                     # make plugin
                     setupperPlugin = setupperPluginClass(self.taskBuffer,self.jobs,tmpLog,
                                                          resubmit=self.resubmit,
                                                          pandaDDM=self.pandaDDM,
                                                          ddmAttempt=self.ddmAttempt,
                                                          onlyTA=self.onlyTA,
                                                          firstSubmission=self.firstSubmission)
                     # run plugin
                     tmpLog.debug('run plugin')
                     setupperPlugin.run()
                     # go forward if not TA
                     if not self.onlyTA:
                         # update jobs
                         tmpLog.debug('update jobs')
                         self.updateJobs(setupperPlugin.jobs+setupperPlugin.jumboJobs,tmpLog)
                         # execute post process
                         tmpLog.debug('post execute plugin')
                         setupperPlugin.postRun()
                     tmpLog.debug('done plugin')
                 except:
                     errtype,errvalue = sys.exc_info()[:2]
                     tmpLog.error('plugin failed with {0}:{1}'.format(errtype, errvalue))
             tmpLog.debug('main end')
         else:
             tmpLog.debug('fork start')
             # write jobs to file
             import os
             import cPickle as pickle
             outFileName = '%s/set.%s_%s' % (panda_config.logdir,self.jobs[0].PandaID,commands.getoutput('uuidgen'))
             outFile = open(outFileName,'w')
             pickle.dump(self.jobs,outFile)
             outFile.close()
             # run main procedure in another process because python doesn't release memory
             com =  'cd %s > /dev/null 2>&1; export HOME=%s; ' % (panda_config.home_dir_cwd,panda_config.home_dir_cwd)
             com += 'env PYTHONPATH=%s:%s %s/python -Wignore %s/dataservice/forkSetupper.py -i %s' % \
                    (panda_config.pandaCommon_dir,panda_config.pandaPython_dir,panda_config.native_python,
                     panda_config.pandaPython_dir,outFileName)
             if self.onlyTA:
                 com += " -t"
             if not self.firstSubmission:
                 com += " -f"
             tmpLog.debug(com)
             # execute
             status,output = self.taskBuffer.processLimiter.getstatusoutput(com)
             tmpLog.debug("return from main process: %s %s" % (status,output))                
             tmpLog.debug('fork end')
     except:
         errtype,errvalue = sys.exc_info()[:2]
         tmpLog.error('master failed with {0}:{1}'.format(errtype,errvalue))
Example #9
def getFilesFromLRC(files,url,guids=[],storageName=[],terminateWhenFailed=False,getPFN=False,
                    scopeList=[]):
    tmpLog = LogWrapper(_log,None)
    tmpLog.debug('getFilesFromLRC "%s" %s' % (url,str(storageName)))    
    # get PFC
    outSTR = ''
    if url.startswith('mysql://'):
        # from MySQL
        outSTR = _getPFNFromMySQL(files,url)
        # get PFN
        if getPFN:
            outPFN = {}
            # FIXME
            tmpLog.debug('RetPFN:%s ' % str(outPFN))            
            return outPFN
    elif url.startswith('http://'):
        # from HTTP I/F
        outSTR = _getPoolFileCatalog(files,url)
        # get PFN
        if getPFN:
            outPFN = {}
            try:
                if not outSTR in ['',None]:
                    root  = xml.dom.minidom.parseString(outSTR)
                    fileNodes = root.getElementsByTagName('File')
                    for file in fileNodes:
                        # get PFN and LFN nodes
                        physical = file.getElementsByTagName('physical')[0]
                        pfnNode  = physical.getElementsByTagName('pfn')[0]
                        logical  = file.getElementsByTagName('logical')[0]
                        lfnNode  = logical.getElementsByTagName('lfn')[0]
                        # convert UTF8 to Raw
                        pfn = str(pfnNode.getAttribute('name'))
                        lfn = str(lfnNode.getAttribute('name'))
                        # assign
                        if not outPFN.has_key(lfn):
                            outPFN[lfn] = []
                        outPFN[lfn].append(pfn)
            except:
                type, value, traceBack = sys.exc_info()
                tmpLog.error(outSTR)
                tmpLog.error("could not parse XML - %s %s" % (type, value))
            tmpLog.debug('RetPFN:%s ' % str(outPFN))                
            return outPFN
    elif url.startswith('lfc://') or url.startswith('rucio://'):
        # from LFC
        timeStart = datetime.datetime.utcnow()
        outSTR = _getPFNFromLFC(files,url,guids,storageName,scopeList=scopeList,tmpLog=tmpLog)
        regTime = datetime.datetime.utcnow() - timeStart
        tmpLog.debug('file lookup for %s LFNs from %s took %s.%03d sec' % (len(files),url,regTime.seconds,
                                                                           regTime.microseconds/1000))
        # get PFN
        if getPFN:
            outPFN = {}
            try:
                if not outSTR in ['',None]:
                    tmpItems = outSTR.split('LFCRet :')
                    tmpItems.remove('')
                    # loop over all returns
                    for tmpItem in tmpItems:
                        exec "tmpLFNmap = %s" % tmpItem
                        for tmpLFN,tmpPFN in tmpLFNmap.iteritems():
                            outPFN[tmpLFN] = tmpPFN
            except:
                type, value, traceBack = sys.exc_info()
                tmpLog.error(outSTR)
                tmpLog.error("could not parse LFC ret - %s %s" % (type, value))
            tmpLog.debug('RetPFN:%s files' % len(outPFN))
            return outPFN
    # check return
    if not isinstance(outSTR,types.StringType):
        if terminateWhenFailed:
            return None
        # set empty string
        outSTR = ''
    # collect OK Files
    okFiles = []
    for file in files:
        if re.search(file,outSTR) != None:
            okFiles.append(file)
    tmpLog.debug('Ret:%s / %s files' % (str(okFiles[:3]),len(okFiles)))
    return okFiles
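
The http:// branch above parses a PoolFileCatalog-style XML document with xml.dom.minidom and builds an LFN-to-PFN map. The same parsing step in isolation, applied to a small illustrative catalog:

import xml.dom.minidom

outSTR = """<POOLFILECATALOG>
  <File ID="guid-1">
    <physical><pfn filetype="ROOT_All" name="root://example.org//path/file1.root"/></physical>
    <logical><lfn name="file1.root"/></logical>
  </File>
</POOLFILECATALOG>"""

outPFN = {}
root = xml.dom.minidom.parseString(outSTR)
for fileNode in root.getElementsByTagName('File'):
    pfn = str(fileNode.getElementsByTagName('pfn')[0].getAttribute('name'))
    lfn = str(fileNode.getElementsByTagName('lfn')[0].getAttribute('name'))
    outPFN.setdefault(lfn, []).append(pfn)    # map LFN -> list of replicas
print(outPFN)   # {'file1.root': ['root://example.org//path/file1.root']}
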
Example #10
class AdderGen:
    # constructor
    def __init__(self,
                 taskBuffer,
                 jobID,
                 jobStatus,
                 xmlFile,
                 ignoreTmpError=True,
                 siteMapper=None):
        self.job = None
        self.jobID = jobID
        self.jobStatus = jobStatus
        self.taskBuffer = taskBuffer
        self.ignoreTmpError = ignoreTmpError
        self.lockXML = None
        self.siteMapper = siteMapper
        self.attemptNr = None
        self.xmlFile = xmlFile
        self.datasetMap = {}
        self.extraInfo = {
            'surl': {},
            'nevents': {},
            'lbnr': {},
            'endpoint': {}
        }
        # extract attemptNr
        try:
            tmpAttemptNr = self.xmlFile.split('/')[-1].split('_')[-1]
            if re.search('^\d+$', tmpAttemptNr) != None:
                self.attemptNr = int(tmpAttemptNr)
        except:
            pass
        # logger
        self.logger = LogWrapper(_logger, str(self.jobID))

    # dump file report
    def dumpFileReport(self, fileCatalog, attemptNr):
        self.logger.debug("dump file report")
        # dump Catalog into file
        if attemptNr == None:
            xmlFile = '%s/%s_%s_%s' % (panda_config.logdir, self.jobID,
                                       self.jobStatus, str(uuid.uuid4()))
        else:
            xmlFile = '%s/%s_%s_%s_%s' % (panda_config.logdir, self.jobID,
                                          self.jobStatus, str(
                                              uuid.uuid4()), attemptNr)
        file = open(xmlFile, 'w')
        file.write(fileCatalog)
        file.close()

    # get plugin class
    def getPluginClass(self, tmpVO):
        # instantiate concrete plugin
        adderPluginClass = panda_config.getPlugin('adder_plugins', tmpVO)
        if adderPluginClass == None:
            # use ATLAS plugin by default
            from AdderAtlasPlugin import AdderAtlasPlugin
            adderPluginClass = AdderAtlasPlugin
        self.logger.debug('plugin name {0}'.format(adderPluginClass.__name__))
        return adderPluginClass

    # main
    def run(self):
        try:
            self.logger.debug("new start: %s attemptNr=%s" %
                              (self.jobStatus, self.attemptNr))
            # lock XML
            self.lockXML = open(self.xmlFile)
            try:
                fcntl.flock(self.lockXML.fileno(),
                            fcntl.LOCK_EX | fcntl.LOCK_NB)
            except:
                self.logger.debug("cannot get lock : %s" % self.xmlFile)
                self.lockXML.close()
                # remove XML just in case for the final attempt
                if not self.ignoreTmpError:
                    try:
                        # remove Catalog
                        os.remove(self.xmlFile)
                    except:
                        pass
                return
            # check if file exists
            if not os.path.exists(self.xmlFile):
                self.logger.debug("not exist : %s" % self.xmlFile)
                try:
                    fcntl.flock(self.lockXML.fileno(), fcntl.LOCK_UN)
                    self.lockXML.close()
                except:
                    pass
                return
            # query job
            self.job = self.taskBuffer.peekJobs([self.jobID],
                                                fromDefined=False,
                                                fromWaiting=False,
                                                forAnal=True)[0]
            # check if job has finished
            if self.job == None:
                self.logger.debug(': job not found in DB')
            elif self.job.jobStatus in [
                    'finished', 'failed', 'unknown', 'merging'
            ]:
                self.logger.error(': invalid state -> %s' % self.job.jobStatus)
            elif self.attemptNr != None and self.job.attemptNr != self.attemptNr:
                self.logger.error('wrong attemptNr -> job=%s <> %s' %
                                  (self.job.attemptNr, self.attemptNr))
            elif self.attemptNr is not None and self.job.jobStatus == 'transferring':
                errMsg = 'XML with attemptNr for {0}'.format(
                    self.job.jobStatus)
                self.logger.error(errMsg)
                # FIXME
                raise RuntimeError, errMsg
            elif self.jobStatus == EventServiceUtils.esRegStatus:
                # instantiate concrete plugin
                adderPluginClass = self.getPluginClass(self.job.VO)
                adderPlugin = adderPluginClass(self.job,
                                               taskBuffer=self.taskBuffer,
                                               siteMapper=self.siteMapper,
                                               logger=self.logger)
                # execute
                self.logger.debug('plugin is ready for ES file registration')
                adderPlugin.registerEventServiceFiles()
            else:
                # check file status in JEDI
                if not self.job.isCancelled(
                ) and not self.job.taskBufferErrorCode in [
                        taskbuffer.ErrorCode.EC_PilotRetried
                ]:
                    fileCheckInJEDI = self.taskBuffer.checkInputFileStatusInJEDI(
                        self.job)
                    self.logger.debug("check file status in JEDI : {0}".format(
                        fileCheckInJEDI))
                    if fileCheckInJEDI == None:
                        raise RuntimeError, 'failed to check file status in JEDI'
                    if fileCheckInJEDI == False:
                        # set job status to failed since some file status is wrong in JEDI
                        self.jobStatus = 'failed'
                        self.job.ddmErrorCode = ErrorCode.EC_Adder
                        errStr = "inconsistent file status between Panda and JEDI. "
                        errStr += "failed to avoid duplicated processing caused by synchronization failure"
                        self.job.ddmErrorDiag = errStr
                        self.logger.debug(
                            "set jobStatus={0} since input is inconsistent between Panda and JEDI"
                            .format(self.jobStatus))
                    elif self.job.jobSubStatus in ['pilot_closed']:
                        # terminated by the pilot
                        self.logger.debug(
                            "going to closed since terminated by the pilot")
                        retClosed = self.taskBuffer.killJobs([self.jobID],
                                                             'pilot', '60',
                                                             True)
                        if retClosed[0] == True:
                            self.logger.debug("end")
                            try:
                                # remove Catalog
                                os.remove(self.xmlFile)
                            except:
                                pass
                            # unlock XML
                            if self.lockXML != None:
                                fcntl.flock(self.lockXML.fileno(),
                                            fcntl.LOCK_UN)
                                self.lockXML.close()
                            return
                    # check for cloned jobs
                    if EventServiceUtils.isJobCloningJob(self.job):
                        checkJC = self.taskBuffer.checkClonedJob(self.job)
                        if checkJC == None:
                            raise RuntimeError, 'failed to check the cloned job'
                        # failed to lock semaphore
                        if checkJC['lock'] == False:
                            self.jobStatus = 'failed'
                            self.job.ddmErrorCode = ErrorCode.EC_Adder
                            self.job.ddmErrorDiag = "failed to lock semaphore for job cloning"
                            self.logger.debug(
                                "set jobStatus={0} since did not get semaphore for job cloning"
                                .format(self.jobStatus))
                # use failed for cancelled/closed jobs
                if self.job.isCancelled():
                    self.jobStatus = 'failed'
                    # reset error codes to skip retrial module
                    self.job.pilotErrorCode = 0
                    self.job.exeErrorCode = 0
                    self.job.ddmErrorCode = 0
                # keep old status
                oldJobStatus = self.job.jobStatus
                # set job status
                if not self.job.jobStatus in ['transferring']:
                    self.job.jobStatus = self.jobStatus
                addResult = None
                adderPlugin = None
                # parse XML
                parseResult = self.parseXML()
                if parseResult < 2:
                    # interaction with DDM
                    try:
                        # instantiate concrete plugin
                        adderPluginClass = self.getPluginClass(self.job.VO)
                        adderPlugin = adderPluginClass(
                            self.job,
                            taskBuffer=self.taskBuffer,
                            siteMapper=self.siteMapper,
                            extraInfo=self.extraInfo,
                            logger=self.logger)
                        # execute
                        self.logger.debug('plugin is ready')
                        adderPlugin.execute()
                        addResult = adderPlugin.result
                        self.logger.debug('plugin done with %s' %
                                          (addResult.statusCode))
                    except:
                        errtype, errvalue = sys.exc_info()[:2]
                        self.logger.error(
                            "failed to execute AdderPlugin for VO={0} with {1}:{2}"
                            .format(self.job.VO, errtype, errvalue))
                        addResult = None
                        self.job.ddmErrorCode = ErrorCode.EC_Adder
                        self.job.ddmErrorDiag = "AdderPlugin failure"

                    # ignore temporary errors
                    if self.ignoreTmpError and addResult != None and addResult.isTemporary(
                    ):
                        self.logger.debug(': ignore %s ' %
                                          self.job.ddmErrorDiag)
                        self.logger.debug('escape')
                        # unlock XML
                        try:
                            fcntl.flock(self.lockXML.fileno(), fcntl.LOCK_UN)
                            self.lockXML.close()
                        except:
                            type, value, traceBack = sys.exc_info()
                            self.logger.debug(": %s %s" % (type, value))
                            self.logger.debug("cannot unlock XML")
                        return
                    # failed
                    if addResult == None or not addResult.isSucceeded():
                        self.job.jobStatus = 'failed'
                # set file status for failed jobs or failed transferring jobs
                self.logger.debug(
                    "status after plugin call :job.jobStatus=%s jobStatus=%s" %
                    (self.job.jobStatus, self.jobStatus))
                if self.job.jobStatus == 'failed' or self.jobStatus == 'failed':
                    # First of all: check if job failed and in this case take first actions according to error table
                    source, error_code, error_diag = None, None, None
                    if self.job.pilotErrorCode:
                        source = 'pilotErrorCode'
                        error_code = self.job.pilotErrorCode
                        error_diag = self.job.pilotErrorDiag
                    elif self.job.exeErrorCode:
                        source = 'exeErrorCode'
                        error_code = self.job.exeErrorCode
                        error_diag = self.job.exeErrorDiag
                    elif self.job.ddmErrorCode:
                        source = 'ddmErrorCode'
                        error_code = self.job.ddmErrorCode
                        error_diag = self.job.ddmErrorDiag
                    elif self.job.transExitCode:
                        source = 'transExitCode'
                        error_code = self.job.transExitCode
                        error_diag = ''

                    # _logger.info("updatejob has source %s, error_code %s and error_diag %s"%(source, error_code, error_diag))

                    if source and error_code:
                        try:
                            self.logger.debug(
                                "AdderGen.run will call apply_retrial_rules")
                            retryModule.apply_retrial_rules(
                                self.taskBuffer, self.job.PandaID, source,
                                error_code, error_diag, self.job.attemptNr)
                            self.logger.debug("apply_retrial_rules is back")
                        except Exception as e:
                            self.logger.error(
                                "apply_retrial_rules excepted and needs to be investigated (%s): %s"
                                % (e, traceback.format_exc()))

                    self.job.jobStatus = 'failed'
                    for file in self.job.Files:
                        if file.type in ['output', 'log']:
                            if addResult != None and file.lfn in addResult.mergingFiles:
                                file.status = 'merging'
                            else:
                                file.status = 'failed'
                else:
                    # reset errors
                    self.job.jobDispatcherErrorCode = 0
                    self.job.jobDispatcherErrorDiag = 'NULL'
                    # set status
                    if addResult != None and addResult.mergingFiles != []:
                        # set status for merging:
                        for file in self.job.Files:
                            if file.lfn in addResult.mergingFiles:
                                file.status = 'merging'
                        self.job.jobStatus = 'merging'
                        # propagate transition to prodDB
                        self.job.stateChangeTime = time.strftime(
                            '%Y-%m-%d %H:%M:%S', time.gmtime())
                    elif addResult != None and addResult.transferringFiles != []:
                        # set status for transferring
                        for file in self.job.Files:
                            if file.lfn in addResult.transferringFiles:
                                file.status = 'transferring'
                        self.job.jobStatus = 'transferring'
                        # propagate transition to prodDB
                        self.job.stateChangeTime = time.strftime(
                            '%Y-%m-%d %H:%M:%S', time.gmtime())
                    else:
                        self.job.jobStatus = 'finished'
                # endtime
                if self.job.endTime == 'NULL':
                    self.job.endTime = time.strftime('%Y-%m-%d %H:%M:%S',
                                                     time.gmtime())
                # output size and # of outputs
                self.job.nOutputDataFiles = 0
                self.job.outputFileBytes = 0
                for tmpFile in self.job.Files:
                    if tmpFile.type == 'output':
                        self.job.nOutputDataFiles += 1
                        try:
                            self.job.outputFileBytes += tmpFile.fsize
                        except:
                            pass
                # protection
                maxOutputFileBytes = 99999999999
                if self.job.outputFileBytes > maxOutputFileBytes:
                    self.job.outputFileBytes = maxOutputFileBytes
                # set cancelled state
                if self.job.commandToPilot == 'tobekilled' and self.job.jobStatus == 'failed':
                    self.job.jobStatus = 'cancelled'
                # update job
                if oldJobStatus in ['cancelled', 'closed']:
                    pass
                else:
                    self.logger.debug("updating DB")
                    retU = self.taskBuffer.updateJobs(
                        [self.job],
                        False,
                        oldJobStatusList=[oldJobStatus],
                        extraInfo=self.extraInfo)
                    self.logger.debug("retU: %s" % retU)
                    # failed
                    if not retU[0]:
                        self.logger.error(
                            'failed to update DB for pandaid={0}'.format(
                                self.job.PandaID))
                        # unlock XML
                        try:
                            fcntl.flock(self.lockXML.fileno(), fcntl.LOCK_UN)
                            self.lockXML.close()
                        except:
                            type, value, traceBack = sys.exc_info()
                            self.logger.debug(": %s %s" % (type, value))
                            self.logger.debug("cannot unlock XML")
                        return

                    try:
                        # updateJobs succeeded but may have failed the job with a taskBufferErrorCode; apply retrial rules in that case
                        self.logger.debug("AdderGen.run will peek the job")
                        job_tmp = self.taskBuffer.peekJobs(
                            [self.job.PandaID],
                            fromDefined=False,
                            fromArchived=True,
                            fromWaiting=False)[0]
                        self.logger.debug(
                            "status {0}, taskBufferErrorCode {1}, taskBufferErrorDiag {2}"
                            .format(job_tmp.jobStatus,
                                    job_tmp.taskBufferErrorCode,
                                    job_tmp.taskBufferErrorDiag))
                        if job_tmp.jobStatus == 'failed' and job_tmp.taskBufferErrorCode:
                            source = 'taskBufferErrorCode'
                            error_code = job_tmp.taskBufferErrorCode
                            error_diag = job_tmp.taskBufferErrorDiag
                            self.logger.debug(
                                "AdderGen.run 2 will call apply_retrial_rules")
                            retryModule.apply_retrial_rules(
                                self.taskBuffer, job_tmp.PandaID, source,
                                error_code, error_diag, job_tmp.attemptNr)
                            self.logger.debug("apply_retrial_rules 2 is back")
                    except IndexError:
                        pass
                    except Exception as e:
                        self.logger.error(
                            "apply_retrial_rules 2 excepted and needs to be investigated (%s): %s"
                            % (e, traceback.format_exc()))

                    # setup for closer
                    if not (EventServiceUtils.isEventServiceJob(self.job)
                            and self.job.isCancelled()):
                        destDBList = []
                        guidList = []
                        for file in self.job.Files:
                            # ignore inputs
                            if file.type == 'input':
                                continue
                            # skip pseudo datasets
                            if file.destinationDBlock in ['', None, 'NULL']:
                                continue
                            # start closer for output/log datasets
                            if not file.destinationDBlock in destDBList:
                                destDBList.append(file.destinationDBlock)
                            # collect GUIDs
                            if (self.job.prodSourceLabel=='panda' or (self.job.prodSourceLabel in ['ptest','rc_test','rucio_test'] and \
                                                                      self.job.processingType in ['pathena','prun','gangarobot-rctest','hammercloud'])) \
                                                                      and file.type == 'output':
                                # extract base LFN since LFN was changed to full LFN for CMS
                                baseLFN = file.lfn.split('/')[-1]
                                guidList.append({
                                    'lfn': baseLFN,
                                    'guid': file.GUID,
                                    'type': file.type,
                                    'checksum': file.checksum,
                                    'md5sum': file.md5sum,
                                    'fsize': file.fsize,
                                    'scope': file.scope
                                })
                        if guidList != []:
                            retG = self.taskBuffer.setGUIDs(guidList)
                        if destDBList != []:
                            # start Closer
                            if adderPlugin != None and hasattr(
                                    adderPlugin, 'datasetMap'
                            ) and adderPlugin.datasetMap != {}:
                                cThr = Closer.Closer(
                                    self.taskBuffer,
                                    destDBList,
                                    self.job,
                                    datasetMap=adderPlugin.datasetMap)
                            else:
                                cThr = Closer.Closer(self.taskBuffer,
                                                     destDBList, self.job)
                            self.logger.debug("start Closer")
                            cThr.start()
                            cThr.join()
                            self.logger.debug("end Closer")
                        # run closer for associated parallel jobs
                        if EventServiceUtils.isJobCloningJob(self.job):
                            assDBlockMap = self.taskBuffer.getDestDBlocksWithSingleConsumer(
                                self.job.jediTaskID, self.job.PandaID,
                                destDBList)
                            for assJobID, assDBlocks in assDBlockMap.iteritems(
                            ):
                                assJob = self.taskBuffer.peekJobs(
                                    [assJobID],
                                    fromDefined=False,
                                    fromArchived=False,
                                    fromWaiting=False,
                                    forAnal=True)[0]
                                if assJob == None:
                                    self.logger.debug(
                                        ': associated job PandaID={0} not found in DB'
                                        .format(assJobID))
                                else:
                                    cThr = Closer.Closer(
                                        self.taskBuffer, assDBlocks, assJob)
                                    self.logger.debug(
                                        "start Closer for PandaID={0}".format(
                                            assJobID))
                                    cThr.start()
                                    cThr.join()
                                    self.logger.debug(
                                        "end Closer for PandaID={0}".format(
                                            assJobID))
            self.logger.debug("end")
            try:
                # remove Catalog
                os.remove(self.xmlFile)
            except:
                pass
            # unlock XML
            if self.lockXML != None:
                fcntl.flock(self.lockXML.fileno(), fcntl.LOCK_UN)
                self.lockXML.close()
        except:
            type, value, traceBack = sys.exc_info()
            errStr = ": %s %s " % (type, value)
            errStr += traceback.format_exc()
            self.logger.error(errStr)
            self.logger.error("except")
            # unlock XML just in case
            try:
                if self.lockXML != None:
                    fcntl.flock(self.lockXML.fileno(), fcntl.LOCK_UN)
            except:
                type, value, traceBack = sys.exc_info()
                self.logger.error(": %s %s" % (type, value))
                self.logger.error("cannot unlock XML")

    # parse XML
    # 0: succeeded, 1: harmless error to exit, 2: fatal error, 3: event service
    def parseXML(self):
        # get LFN and GUID
        self.logger.debug('XML filename : %s' % self.xmlFile)
        # no outputs
        if self.job.Files == []:
            self.logger.debug("has no outputs")
            self.logger.debug("parseXML end")
            return 0
        # get input files
        inputLFNs = []
        for file in self.job.Files:
            if file.type == 'input':
                inputLFNs.append(file.lfn)
        # parse XML
        lfns = []
        guids = []
        fsizes = []
        md5sums = []
        chksums = []
        surls = []
        fullLfnMap = {}
        nEventsMap = {}
        try:
            root = xml.dom.minidom.parse(self.xmlFile)
            files = root.getElementsByTagName('File')
            for file in files:
                # get GUID
                guid = str(file.getAttribute('ID'))
                # get PFN and LFN nodes
                logical = file.getElementsByTagName('logical')[0]
                lfnNode = logical.getElementsByTagName('lfn')[0]
                # convert UTF8 to Raw
                lfn = str(lfnNode.getAttribute('name'))
                # get metadata
                fsize = None
                md5sum = None
                adler32 = None
                surl = None
                fullLFN = None
                for meta in file.getElementsByTagName('metadata'):
                    # get fsize
                    name = str(meta.getAttribute('att_name'))
                    if name == 'fsize':
                        fsize = long(meta.getAttribute('att_value'))
                    elif name == 'md5sum':
                        md5sum = str(meta.getAttribute('att_value'))
                        # check
                        if re.search("^[a-fA-F0-9]{32}$", md5sum) == None:
                            md5sum = None
                    elif name == 'adler32':
                        adler32 = str(meta.getAttribute('att_value'))
                    elif name == 'surl':
                        surl = str(meta.getAttribute('att_value'))
                    elif name == 'full_lfn':
                        fullLFN = str(meta.getAttribute('att_value'))
                # endpoints
                self.extraInfo['endpoint'][lfn] = []
                for epNode in file.getElementsByTagName('endpoint'):
                    self.extraInfo['endpoint'][lfn].append(
                        str(epNode.firstChild.data))
                # error check
                if (not lfn in inputLFNs) and (fsize == None or
                                               (md5sum == None
                                                and adler32 == None)):
                    if EventServiceUtils.isEventServiceMerge(self.job):
                        continue
                    else:
                        raise RuntimeError, 'fsize/md5sum/adler32/surl=None'
                # append
                lfns.append(lfn)
                guids.append(guid)
                fsizes.append(fsize)
                md5sums.append(md5sum)
                surls.append(surl)
                if adler32 != None:
                    # use adler32 if available
                    chksums.append("ad:%s" % adler32)
                else:
                    chksums.append("md5:%s" % md5sum)
                if fullLFN != None:
                    fullLfnMap[lfn] = fullLFN
        except:
            # parse json
            try:
                import json
                with open(self.xmlFile) as tmpF:
                    jsonDict = json.load(tmpF)
                    for lfn, fileData in jsonDict.iteritems():
                        lfn = str(lfn)
                        fsize = None
                        md5sum = None
                        adler32 = None
                        surl = None
                        fullLFN = None
                        guid = str(fileData['guid'])
                        if 'fsize' in fileData:
                            fsize = long(fileData['fsize'])
                        if 'md5sum' in fileData:
                            md5sum = str(fileData['md5sum'])
                            # check
                            if re.search("^[a-fA-F0-9]{32}$", md5sum) == None:
                                md5sum = None
                        if 'adler32' in fileData:
                            adler32 = str(fileData['adler32'])
                        if 'surl' in fileData:
                            surl = str(fileData['surl'])
                        if 'full_lfn' in fileData:
                            fullLFN = str(fileData['full_lfn'])
                        # endpoints
                        self.extraInfo['endpoint'][lfn] = []
                        if 'endpoint' in fileData:
                            self.extraInfo['endpoint'][lfn] = fileData[
                                'endpoint']
                        # error check
                        if (not lfn in inputLFNs) and (fsize == None or
                                                       (md5sum == None
                                                        and adler32 == None)):
                            if EventServiceUtils.isEventServiceMerge(self.job):
                                continue
                            else:
                                raise RuntimeError, 'fsize/md5sum/adler32/surl=None'
                        # append
                        lfns.append(lfn)
                        guids.append(guid)
                        fsizes.append(fsize)
                        md5sums.append(md5sum)
                        surls.append(surl)
                        if adler32 != None:
                            # use adler32 if available
                            chksums.append("ad:%s" % adler32)
                        else:
                            chksums.append("md5:%s" % md5sum)
                        if fullLFN != None:
                            fullLfnMap[lfn] = fullLFN
            except:
                # check if file exists
                if os.path.exists(self.xmlFile):
                    type, value, traceBack = sys.exc_info()
                    self.logger.error(": %s %s" % (type, value))
                    # set failed anyway
                    self.job.jobStatus = 'failed'
                    # XML error happens when pilot got killed due to wall-time limit or failures in wrapper
                    if (self.job.pilotErrorCode in [0,'0','NULL']) and \
                       (self.job.transExitCode  in [0,'0','NULL']):
                        self.job.ddmErrorCode = ErrorCode.EC_Adder
                        self.job.ddmErrorDiag = "Could not get GUID/LFN/MD5/FSIZE/SURL from pilot XML"
                    return 2
                else:
                    # XML was deleted
                    return 1
        # parse metadata to get nEvents
        try:
            root = xml.dom.minidom.parseString(self.job.metadata)
            files = root.getElementsByTagName('File')
            for file in files:
                # get GUID
                guid = str(file.getAttribute('ID'))
                # get PFN and LFN nodes
                logical = file.getElementsByTagName('logical')[0]
                lfnNode = logical.getElementsByTagName('lfn')[0]
                # convert UTF8 to Raw
                lfn = str(lfnNode.getAttribute('name'))
                # get metadata
                nevents = None
                for meta in file.getElementsByTagName('metadata'):
                    # get number of events
                    name = str(meta.getAttribute('att_name'))
                    if name == 'events':
                        nevents = long(meta.getAttribute('att_value'))
                        nEventsMap[lfn] = nevents
                        break
        except:
            pass
        self.logger.debug('nEventsMap=%s' % str(nEventsMap))
        # parse json
        try:
            import json
            jsonDict = json.loads(self.job.metadata)
            for jsonFileItem in jsonDict['files']['output']:
                for jsonSubFileItem in jsonFileItem['subFiles']:
                    lfn = str(jsonSubFileItem['name'])
                    try:
                        nevents = long(jsonSubFileItem['nentries'])
                        nEventsMap[lfn] = nevents
                    except:
                        pass
        except:
            pass
        self.logger.debug('nEventsMapJson=%s' % str(nEventsMap))
        # get lumi block number
        lumiBlockNr = self.job.getLumiBlockNr()
        # copy files for variable number of outputs
        tmpStat = self.copyFilesForVariableNumOutputs(lfns)
        if not tmpStat:
            self.logger.error(
                "failed to copy files for variable number of outputs")
            return 2
        # check files
        fileList = []
        for file in self.job.Files:
            fileList.append(file.lfn)
            if file.type == 'input':
                if file.lfn in lfns:
                    if self.job.prodSourceLabel in ['user', 'panda']:
                        # skipped file
                        file.status = 'skipped'
                    elif self.job.prodSourceLabel in [
                            'managed', 'test', 'rc_test', 'ptest'
                    ]:
                        # failed by pilot
                        file.status = 'failed'
            elif file.type == 'output' or file.type == 'log':
                # add only log file for failed jobs
                if self.jobStatus == 'failed' and file.type != 'log':
                    file.status = 'failed'
                    continue
                # set failed if it is missing in XML
                if not file.lfn in lfns:
                    if self.job.jobStatus == 'finished' and \
                            (EventServiceUtils.isEventServiceJob(self.job) or EventServiceUtils.isJumboJob(self.job)):
                        # unset file status for ES jobs
                        pass
                    elif file.isAllowedNoOutput():
                        # allowed not to be produced
                        file.status = 'nooutput'
                        self.logger.debug('set {0} to status={1}'.format(
                            file.lfn, file.status))
                    else:
                        file.status = 'failed'
                        self.job.jobStatus = 'failed'
                        self.job.ddmErrorCode = ErrorCode.EC_Adder
                        self.job.ddmErrorDiag = "expected output {0} is missing in pilot XML".format(
                            file.lfn)
                        self.logger.error(self.job.ddmErrorDiag)
                    continue
                # look for GUID with LFN
                try:
                    i = lfns.index(file.lfn)
                    file.GUID = guids[i]
                    file.fsize = fsizes[i]
                    file.md5sum = md5sums[i]
                    file.checksum = chksums[i]
                    surl = surls[i]
                    # status
                    file.status = 'ready'
                    # change to full LFN
                    if fullLfnMap.has_key(file.lfn):
                        file.lfn = fullLfnMap[file.lfn]
                    # add SURL to extraInfo
                    self.extraInfo['surl'][file.lfn] = surl
                    # add nevents
                    if nEventsMap.has_key(file.lfn):
                        self.extraInfo['nevents'][file.lfn] = nEventsMap[
                            file.lfn]
                except:
                    # status
                    file.status = 'failed'
                    type, value, traceBack = sys.exc_info()
                    self.logger.error(": %s %s" % (type, value))
                # set lumi block number
                if lumiBlockNr != None and file.status != 'failed':
                    self.extraInfo['lbnr'][file.lfn] = lumiBlockNr
        # check consistency between XML and filesTable
        for lfn in lfns:
            if not lfn in fileList:
                self.logger.error("%s is not found in filesTable" % lfn)
                self.job.jobStatus = 'failed'
                for tmpFile in self.job.Files:
                    tmpFile.status = 'failed'
                self.job.ddmErrorCode = ErrorCode.EC_Adder
                self.job.ddmErrorDiag = "pilot produced {0} inconsistently with jobdef".format(
                    lfn)
                return 2
        # return
        self.logger.debug("parseXML end")
        return 0

    # copy files for variable number of outputs
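    # new LFNs that extend an original output LFN with a numeric suffix get their file
    # records cloned via taskBuffer.copyFileRecord, then the job is re-read from the DB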
    def copyFilesForVariableNumOutputs(self, lfns):
        # get original output files
        origOutputs = {}
        updateOrig = {}
        for tmpFile in self.job.Files:
            if tmpFile.type in ['output', 'log']:
                origOutputs[tmpFile.lfn] = tmpFile
                if tmpFile.lfn in lfns:
                    # keep original
                    updateOrig[tmpFile.lfn] = False
                else:
                    # overwrite original
                    updateOrig[tmpFile.lfn] = True
        # look for unknown files
        addedNewFiles = False
        for newLFN in lfns:
            if not newLFN in origOutputs:
                # look for corresponding original output
                for origLFN in origOutputs.keys():
                    tmpPatt = '^{0}\.*_\d+$'.format(origLFN)
                    if re.search(tmpPatt, newLFN) != None:
                        # copy file record
                        tmpStat = self.taskBuffer.copyFileRecord(
                            newLFN, origOutputs[origLFN], updateOrig[origLFN])
                        if not tmpStat:
                            return False
                        addedNewFiles = True
                        # disable further overwriting
                        updateOrig[origLFN] = False
                        break
        # refresh job info
        if addedNewFiles:
            self.job = self.taskBuffer.peekJobs([self.jobID],
                                                fromDefined=False,
                                                fromWaiting=False,
                                                forAnal=True)[0]
        # return
        return True
예제 #11
0
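# resolve PFNs for the given LFNs/GUIDs by invoking the external LFCclient.py helper
# through a shell command, in bulk chunks of up to 1000 LFNs; LFC access errors are
# reported to the PanDA logger, otherwise the concatenated client output is returned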
def _getPFNFromLFC(lfns,
                   dq2url,
                   guids,
                   storageName,
                   scopeList=[],
                   tmpLog=None):
    if tmpLog == None:
        tmpLog = LogWrapper(_log, logPrefix)
    tmpLog.debug('_getPFNFromLFC %s %s / %s LFNs:%s %s' %
                 (dq2url, str(storageName), len(lfns), str(
                     lfns[:3]), str(scopeList[:3])))
    outStr = ''
    # check parameters
    if guids == [] or storageName == [] or (len(lfns) != len(guids)):
        tmpLog.debug('_getPFNFromLFC done with empty list')
        return outStr
    # check scopeList
    if not scopeList in [None, []] and len(lfns) != len(scopeList):
        tmpLog.warning('_getPFNFromLFC wrong scopeList %s %s %s %s' %
                       (dq2url, str(storageName), str(lfns), str(scopeList)))
        tmpLog.error('_getPFNFromLFC failed')
        return outStr
    # loop over all LFNs
    iLFN = 0
    nLFN = 1000
    strFiles = ''
    outStr = ''
    for iLFN in range(len(lfns)):
        if scopeList != []:
            strFiles += '%s %s %s\n' % (lfns[iLFN], guids[iLFN],
                                        scopeList[iLFN])
        else:
            strFiles += '%s %s\n' % (lfns[iLFN], guids[iLFN])
        # bulk operation
        if (iLFN + 1) % nLFN == 0 or (iLFN + 1) >= len(lfns):
            # write to file
            inFileName = '%s/lfcin.%s' % (panda_config.logdir,
                                          commands.getoutput('uuidgen'))
            ifile = open(inFileName, 'w')
            ifile.write(strFiles)
            ifile.close()
            # construct commands
            strStorage = ''
            for storage in storageName:
                strStorage += '%s,' % storage
            strStorage = strStorage[:-1]
            com = 'cd %s > /dev/null 2>&1; export HOME=%s; ' % (
                panda_config.home_dir_cwd, panda_config.home_dir_cwd)
            com += 'unset LD_LIBRARY_PATH; unset PYTHONPATH; export PATH=/usr/local/bin:/bin:/usr/bin; '
            com+= 'source %s; %s/python -Wignore %s/LFCclient.py -f %s -l %s -s %s' % \
                  (panda_config.glite_source,panda_config.native_python32,panda_config.lfcClient_dir,
                   inFileName,dq2url,strStorage)
            tmpLog.debug(com)
            # execute
            status, output = commands.getstatusoutput(com)
            tmpLog.debug(status)
            if status == 0:
                outStr += output
            else:
                tmpLog.error("_getPFNFromLFC : %s %s %s" %
                             (dq2url, status, output))
                # send message to logger
                try:
                    # make message
                    message = 'LFC access : %s %s %s' % (dq2url, status,
                                                         output)
                    # get logger
                    _pandaLogger = PandaLogger()
                    _pandaLogger.lock()
                    _pandaLogger.setParams({'Type': 'broker_util'})
                    logger = _pandaLogger.getHttpLogger(
                        panda_config.loggername)
                    # add message
                    logger.error(message)
                    # release HTTP handler
                    _pandaLogger.release()
                except:
                    pass
                tmpLog.error('_getPFNFromLFC failed')
                return status
            # reset
            strFiles = ''
    tmpLog.debug('_getPFNFromLFC done')
    # return
    return outStr
예제 #12
0
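# EventPicker processes an event-picking request (evp) file: it converts the requested
# run/event pairs to a dataset/file list (directly or via TAG), registers a dataset
# container for the user, brokers a destination site, requests the transfer with Rucio,
# and notifies the user by email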
class EventPicker:
    # constructor
    def __init__(self, taskBuffer, siteMapper, evpFileName, ignoreError):
        self.taskBuffer = taskBuffer
        self.siteMapper = siteMapper
        self.ignoreError = ignoreError
        self.evpFileName = evpFileName
        self.token = datetime.datetime.utcnow().isoformat(' ')
        # logger
        self.logger = LogWrapper(_logger, self.token)
        self.pd2p = DynDataDistributer.DynDataDistributer([],
                                                          self.taskBuffer,
                                                          self.siteMapper,
                                                          token=' ',
                                                          logger=self.logger)
        self.userDatasetName = ''
        self.creationTime = ''
        self.params = ''
        self.lockedBy = ''
        self.evpFile = None
        self.userTaskName = ''
        # message buffer
        self.msgBuffer = []
        self.lineLimit = 100
        # JEDI
        self.jediTaskID = None

    # main
    def run(self):
        try:
            self.putLog('start %s' % self.evpFileName)
            # lock evp file
            self.evpFile = open(self.evpFileName)
            try:
                fcntl.flock(self.evpFile.fileno(),
                            fcntl.LOCK_EX | fcntl.LOCK_NB)
            except:
                # release
                self.putLog("cannot lock %s" % self.evpFileName)
                self.evpFile.close()
                return True
            # options
            runEvtList = []
            eventPickDataType = ''
            eventPickStreamName = ''
            eventPickDS = []
            eventPickAmiTag = ''
            eventPickNumSites = 1
            inputFileList = []
            tagDsList = []
            tagQuery = ''
            tagStreamRef = ''
            skipDaTRI = False
            runEvtGuidMap = {}
            # read evp file
            for tmpLine in self.evpFile:
                tmpMatch = re.search('^([^=]+)=(.+)$', tmpLine)
                # check format
                if tmpMatch == None:
                    continue
                tmpItems = tmpMatch.groups()
                if tmpItems[0] == 'runEvent':
                    # get run and event number
                    tmpRunEvt = tmpItems[1].split(',')
                    if len(tmpRunEvt) == 2:
                        runEvtList.append(tmpRunEvt)
                elif tmpItems[0] == 'eventPickDataType':
                    # data type
                    eventPickDataType = tmpItems[1]
                elif tmpItems[0] == 'eventPickStreamName':
                    # stream name
                    eventPickStreamName = tmpItems[1]
                elif tmpItems[0] == 'eventPickDS':
                    # dataset pattern
                    eventPickDS = tmpItems[1].split(',')
                elif tmpItems[0] == 'eventPickAmiTag':
                    # AMI tag
                    eventPickAmiTag = tmpItems[1]
                elif tmpItems[0] == 'eventPickNumSites':
                    # the number of sites where datasets are distributed
                    try:
                        eventPickNumSites = int(tmpItems[1])
                    except:
                        pass
                elif tmpItems[0] == 'userName':
                    # user name
                    self.userDN = tmpItems[1]
                    self.putLog("user=%s" % self.userDN)
                elif tmpItems[0] == 'userTaskName':
                    # user task name
                    self.userTaskName = tmpItems[1]
                elif tmpItems[0] == 'userDatasetName':
                    # user dataset name
                    self.userDatasetName = tmpItems[1]
                elif tmpItems[0] == 'lockedBy':
                    # client name
                    self.lockedBy = tmpItems[1]
                elif tmpItems[0] == 'creationTime':
                    # creation time
                    self.creationTime = tmpItems[1]
                elif tmpItems[0] == 'params':
                    # parameters
                    self.params = tmpItems[1]
                elif tmpItems[0] == 'inputFileList':
                    # input file list
                    inputFileList = tmpItems[1].split(',')
                    try:
                        inputFileList.remove('')
                    except:
                        pass
                elif tmpItems[0] == 'tagDS':
                    # TAG dataset
                    tagDsList = tmpItems[1].split(',')
                elif tmpItems[0] == 'tagQuery':
                    # query for TAG
                    tagQuery = tmpItems[1]
                elif tmpItems[0] == 'tagStreamRef':
                    # StreamRef for TAG
                    tagStreamRef = tmpItems[1]
                    if not tagStreamRef.endswith('_ref'):
                        tagStreamRef += '_ref'
                elif tmpItems[0] == 'runEvtGuidMap':
                    # GUIDs
                    try:
                        exec "runEvtGuidMap=" + tmpItems[1]
                    except:
                        pass
            # extract task name
            if self.userTaskName == '' and self.params != '':
                try:
                    tmpMatch = re.search('--outDS(=| ) *([^ ]+)', self.params)
                    if tmpMatch != None:
                        self.userTaskName = tmpMatch.group(2)
                        if not self.userTaskName.endswith('/'):
                            self.userTaskName += '/'
                except:
                    pass
            # suppress DaTRI
            if self.params != '':
                if '--eventPickSkipDaTRI' in self.params:
                    skipDaTRI = True
            # get compact user name
            compactDN = self.taskBuffer.cleanUserID(self.userDN)
            # get jediTaskID
            self.jediTaskID = self.taskBuffer.getTaskIDwithTaskNameJEDI(
                compactDN, self.userTaskName)
            # convert
            if tagDsList == [] or tagQuery == '':
                # convert run/event list to dataset/file list
                tmpRet, locationMap, allFiles = self.pd2p.convertEvtRunToDatasets(
                    runEvtList, eventPickDataType, eventPickStreamName,
                    eventPickDS, eventPickAmiTag, self.userDN, runEvtGuidMap)
                if not tmpRet:
                    if 'isFatal' in locationMap and locationMap[
                            'isFatal'] == True:
                        self.ignoreError = False
                    self.endWithError(
                        'Failed to convert the run/event list to a dataset/file list'
                    )
                    return False
            else:
                # get parent dataset/files with TAG
                tmpRet, locationMap, allFiles = self.pd2p.getTagParentInfoUsingTagQuery(
                    tagDsList, tagQuery, tagStreamRef)
                if not tmpRet:
                    self.endWithError(
                        'Failed to get parent dataset/file list with TAG')
                    return False
            # use only files in the list
            if inputFileList != []:
                tmpAllFiles = []
                for tmpFile in allFiles:
                    if tmpFile['lfn'] in inputFileList:
                        tmpAllFiles.append(tmpFile)
                allFiles = tmpAllFiles
            # remove redundant CN from DN
            tmpDN = self.userDN
            tmpDN = re.sub('/CN=limited proxy', '', tmpDN)
            tmpDN = re.sub('(/CN=proxy)+$', '', tmpDN)
            # make dataset container
            tmpRet = self.pd2p.registerDatasetContainerWithDatasets(
                self.userDatasetName,
                allFiles,
                locationMap,
                nSites=eventPickNumSites,
                owner=tmpDN)
            if not tmpRet:
                self.endWithError('Failed to make a dataset container %s' %
                                  self.userDatasetName)
                return False
            # skip DaTRI
            if skipDaTRI:
                # successfully terminated
                self.putLog("skip DaTRI")
                # update task
                self.taskBuffer.updateTaskModTimeJEDI(self.jediTaskID)
            else:
                # get candidates
                tmpRet, candidateMaps = self.pd2p.getCandidates(
                    self.userDatasetName, checkUsedFile=False, useHidden=True)
                if not tmpRet:
                    self.endWithError(
                        'Failed to find candidate for destination')
                    return False
                # collect all candidates
                allCandidates = []
                for tmpDS, tmpDsVal in candidateMaps.iteritems():
                    for tmpCloud, tmpCloudVal in tmpDsVal.iteritems():
                        for tmpSiteName in tmpCloudVal[0]:
                            if not tmpSiteName in allCandidates:
                                allCandidates.append(tmpSiteName)
                if allCandidates == []:
                    self.endWithError('No candidate for destination')
                    return False
                # get list of dataset (container) names
                if eventPickNumSites > 1:
                    # decompose container to transfer datasets separately
                    tmpRet, tmpOut = self.pd2p.getListDatasetReplicasInContainer(
                        self.userDatasetName)
                    if not tmpRet:
                        self.endWithError('Failed to get the size of %s' %
                                          self.userDatasetName)
                        return False
                    userDatasetNameList = tmpOut.keys()
                else:
                    # transfer container at once
                    userDatasetNameList = [self.userDatasetName]
                # loop over all datasets
                sitesUsed = []
                for tmpUserDatasetName in userDatasetNameList:
                    # get size of dataset container
                    tmpRet, totalInputSize = rucioAPI.getDatasetSize(
                        tmpUserDatasetName)
                    if not tmpRet:
                        self.endWithError('Failed to get the size of %s' %
                                          tmpUserDatasetName)
                        return False
                    # run brokerage
                    tmpJob = JobSpec()
                    tmpJob.AtlasRelease = ''
                    self.putLog("run brokerage for %s" % tmpDS)
                    brokerage.broker.schedule([tmpJob],
                                              self.taskBuffer,
                                              self.siteMapper,
                                              True,
                                              allCandidates,
                                              True,
                                              datasetSize=totalInputSize)
                    if tmpJob.computingSite.startswith('ERROR'):
                        self.endWithError('brokerage failed with %s' %
                                          tmpJob.computingSite)
                        return False
                    self.putLog("site -> %s" % tmpJob.computingSite)
                    # send transfer request
                    try:
                        tmpDN = rucioAPI.parse_dn(tmpDN)
                        tmpStatus, userInfo = rucioAPI.finger(tmpDN)
                        if not tmpStatus:
                            raise RuntimeError, 'user info not found for {0} with {1}'.format(
                                tmpDN, userInfo)
                        tmpDN = userInfo['nickname']
                        tmpDQ2ID = self.siteMapper.getSite(
                            tmpJob.computingSite).ddm
                        tmpMsg = "%s ds=%s site=%s id=%s" % (
                            'registerDatasetLocation for DaTRI ',
                            tmpUserDatasetName, tmpDQ2ID, tmpDN)
                        self.putLog(tmpMsg)
                        rucioAPI.registerDatasetLocation(
                            tmpUserDatasetName, [tmpDQ2ID],
                            lifetime=14,
                            owner=tmpDN,
                            activity="User Subscriptions")
                        self.putLog('OK')
                    except:
                        errType, errValue = sys.exc_info()[:2]
                        tmpStr = 'Failed to send transfer request : %s %s' % (
                            errType, errValue)
                        tmpStr = tmpStr.strip()
                        tmpStr += traceback.format_exc()
                        self.endWithError(tmpStr)
                        return False
                    # list of sites already used
                    sitesUsed.append(tmpJob.computingSite)
                    self.putLog("used %s sites" % len(sitesUsed))
                    # set candidates
                    if len(sitesUsed) >= eventPickNumSites:
                        # reset candidates to limit the number of sites
                        allCandidates = sitesUsed
                        sitesUsed = []
                    else:
                        # remove site
                        allCandidates.remove(tmpJob.computingSite)
                # send email notification for success
                tmpMsg = 'A transfer request was successfully sent to Rucio.\n'
                tmpMsg += 'Your task will get started once transfer is completed.'
                self.sendEmail(True, tmpMsg)
            try:
                # unlock and delete evp file
                fcntl.flock(self.evpFile.fileno(), fcntl.LOCK_UN)
                self.evpFile.close()
                os.remove(self.evpFileName)
            except:
                pass
            # successfully terminated
            self.putLog("end %s" % self.evpFileName)
            return True
        except:
            errType, errValue = sys.exc_info()[:2]
            self.endWithError('Got exception %s:%s %s' %
                              (errType, errValue, traceback.format_exc()))
            return False

    # end with error
    def endWithError(self, message):
        self.putLog(message, 'error')
        # unlock evp file
        try:
            fcntl.flock(self.evpFile.fileno(), fcntl.LOCK_UN)
            self.evpFile.close()
            if not self.ignoreError:
                # remove evp file
                os.remove(self.evpFileName)
                # send email notification
                self.sendEmail(False, message)
        except:
            pass
        # upload log
        if self.jediTaskID != None:
            outLog = self.uploadLog()
            self.taskBuffer.updateTaskErrorDialogJEDI(
                self.jediTaskID, 'event picking failed. ' + outLog)
            # update task
            if not self.ignoreError:
                self.taskBuffer.updateTaskModTimeJEDI(self.jediTaskID,
                                                      'tobroken')
            self.putLog(outLog)
        self.putLog('end %s' % self.evpFileName)

    # put log
    def putLog(self, msg, type='debug'):
        tmpMsg = msg
        if type == 'error':
            self.logger.error(tmpMsg)
        else:
            self.logger.debug(tmpMsg)

    # send email notification
    def sendEmail(self, isSucceeded, message):
        # mail address
        toAdder = Notifier(self.taskBuffer, None, []).getEmail(self.userDN)
        if toAdder == '':
            self.putLog('cannot find email address for %s' % self.userDN,
                        'error')
            return
        # subject
        mailSubject = "PANDA notification for Event-Picking Request"
        # message
        mailBody = "Hello,\n\nHere is your request status for event picking\n\n"
        if isSucceeded:
            mailBody += "Status  : Passed to Rucio\n"
        else:
            mailBody += "Status  : Failed\n"
        mailBody += "Created : %s\n" % self.creationTime
        mailBody += "Ended   : %s\n" % datetime.datetime.utcnow().strftime(
            '%Y-%m-%d %H:%M:%S')
        mailBody += "Dataset : %s\n" % self.userDatasetName
        mailBody += "\n"
        mailBody += "Parameters : %s %s\n" % (self.lockedBy, self.params)
        mailBody += "\n"
        mailBody += "%s\n" % message
        # send
        retVal = MailUtils().send(toAdder, mailSubject, mailBody)
        # return
        return

    # upload log
    def uploadLog(self):
        if self.jediTaskID == None:
            return 'cannot find jediTaskID'
        strMsg = self.logger.dumpToString()
        s, o = Client.uploadLog(strMsg, self.jediTaskID)
        if s != 0:
            return "failed to upload log with {0}.".format(s)
        if o.startswith('http'):
            return '<a href="{0}">log</a>'.format(o)
        return o
예제 #13
0
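# AdderGen post-processes a job using the file catalog (XML/JSON) written by the pilot:
# it parses the output file metadata, runs the VO-specific Adder plugin for DDM
# registration, updates the job record in the DB, and starts Closer for the output datasets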
class AdderGen:
    # constructor
    def __init__(self,taskBuffer,jobID,jobStatus,xmlFile,ignoreTmpError=True,siteMapper=None):
        self.job = None
        self.jobID = jobID
        self.jobStatus = jobStatus
        self.taskBuffer = taskBuffer
        self.ignoreTmpError = ignoreTmpError
        self.lockXML = None
        self.siteMapper = siteMapper
        self.attemptNr = None
        self.xmlFile = xmlFile
        self.datasetMap = {}
        self.extraInfo = {'surl':{},'nevents':{},'lbnr':{}}
        # extract attemptNr
        try:
            tmpAttemptNr = self.xmlFile.split('/')[-1].split('_')[-1]
            if re.search('^\d+$',tmpAttemptNr) != None:
                self.attemptNr = int(tmpAttemptNr)
        except:
            pass
        # logger
        self.logger = LogWrapper(_logger,self.jobID)
        
    # main
    def run(self):
        try:
            self.logger.debug("new start: %s attemptNr=%s" % (self.jobStatus,self.attemptNr))
            # lock XML
            self.lockXML = open(self.xmlFile)
            try:
                fcntl.flock(self.lockXML.fileno(), fcntl.LOCK_EX|fcntl.LOCK_NB)
            except:
                self.logger.debug("cannot get lock : %s" % self.xmlFile)
                self.lockXML.close()
                # remove XML just in case for the final attempt
                if not self.ignoreTmpError:
                    try:
                        # remove Catalog
                        os.remove(self.xmlFile)
                    except:
                        pass
                return
            # check if file exists
            if not os.path.exists(self.xmlFile):
                self.logger.debug("not exist : %s" % self.xmlFile)
                try:
                    fcntl.flock(self.lockXML.fileno(), fcntl.LOCK_UN)
                    self.lockXML.close()
                except:
                    pass
                return
            # query job
            self.job = self.taskBuffer.peekJobs([self.jobID],fromDefined=False,
                                                fromArchived=False,
                                                fromWaiting=False,
                                                forAnal=True)[0]
            # check if job has finished
            if self.job == None:
                self.logger.debug(': job not found in DB')
            elif self.job.jobStatus in ['finished','failed','unknown','cancelled','merging']:
                self.logger.error(': invalid state -> %s' % self.job.jobStatus)
            elif self.attemptNr != None and self.job.attemptNr != self.attemptNr:
                self.logger.error('wrong attemptNr -> job=%s <> %s' % (self.job.attemptNr,self.attemptNr))
            else:
                # check file status in JEDI
                fileCheckInJEDI = self.taskBuffer.checkInputFileStatusInJEDI(self.job)
                self.logger.debug("check file status in JEDI : {0}".format(fileCheckInJEDI))                
                if fileCheckInJEDI == None:
                    raise RuntimeError,'failed to check file status in JEDI'
                if fileCheckInJEDI == False:
                    # set job status to failed since some file status is wrong in JEDI 
                    self.jobStatus = 'failed'
                    self.job.ddmErrorCode = ErrorCode.EC_Adder
                    self.job.ddmErrorDiag = "wrong file status in JEDI"
                    self.logger.debug("set jobStatus={0} since input are already cancelled in JEDI".format(self.jobStatus))
                # keep old status
                oldJobStatus = self.job.jobStatus
                # set job status
                if not self.job.jobStatus in ['transferring']:
                    self.job.jobStatus = self.jobStatus
                addResult = None
                adderPlugin = None
                # parse XML
                parseResult = self.parseXML()
                if parseResult < 2:
                    # interaction with DDM
                    try:
                        # set VO=local for DDM free
                        if self.job.destinationSE == 'local':
                            tmpVO = 'local'
                        else:
                            tmpVO = self.job.VO
                        # instantiate concrete plugin
                        adderPluginClass = panda_config.getPlugin('adder_plugins',tmpVO)
                        if adderPluginClass == None:
                            # use ATLAS plugin by default
                            from AdderAtlasPlugin import AdderAtlasPlugin
                            adderPluginClass = AdderAtlasPlugin
                        self.logger.debug('plugin name {0}'.format(adderPluginClass.__name__))
                        adderPlugin = adderPluginClass(self.job,
                                                       taskBuffer=self.taskBuffer,
                                                       siteMapper=self.siteMapper,
                                                       extraInfo=self.extraInfo,
                                                       logger=self.logger)
                        # execute
                        self.logger.debug('plugin is ready')
                        adderPlugin.execute()
                        addResult = adderPlugin.result
                        self.logger.debug('plugin done with %s' % (addResult.statusCode))
                    except:
                        errtype,errvalue = sys.exc_info()[:2]
                        self.logger.error("failed to execute AdderPlugin for VO={0} with {1}:{2}".format(tmpVO,
                                                                                                         errtype,
                                                                                                         errvalue)) 
                        addResult = None
                        self.job.ddmErrorCode = ErrorCode.EC_Adder
                        self.job.ddmErrorDiag = "AdderPlugin failure"
                    # ignore temporary errors
                    if self.ignoreTmpError and addResult != None and addResult.isTemporary():
                        self.logger.debug(': ignore %s ' % self.job.ddmErrorDiag)
                        self.logger.debug('escape')
                        # unlock XML
                        try:
                            fcntl.flock(self.lockXML.fileno(), fcntl.LOCK_UN)
                            self.lockXML.close()
                        except:
                            type, value, traceBack = sys.exc_info()
                            self.logger.debug(": %s %s" % (type,value))
                            self.logger.debug("cannot unlock XML")
                        return
                    # failed
                    if addResult == None or not addResult.isSucceeded():
                        self.job.jobStatus = 'failed'
                # set file status for failed jobs or failed transferring jobs
                if self.job.jobStatus == 'failed' or self.jobStatus == 'failed':
                    self.job.jobStatus = 'failed'
                    for file in self.job.Files:
                        if file.type in ['output','log']:
                            if addResult != None and file.lfn in addResult.mergingFiles:
                                file.status = 'merging'
                            else:
                                file.status = 'failed'
                else:
                    # reset errors
                    self.job.jobDispatcherErrorCode = 0
                    self.job.jobDispatcherErrorDiag = 'NULL'
                    # set status
                    if addResult != None and addResult.mergingFiles != []:
                        # set status for merging:                        
                        for file in self.job.Files:
                            if file.lfn in addResult.mergingFiles:
                                file.status = 'merging'
                        self.job.jobStatus = 'merging'
                        # propagate transition to prodDB
                        self.job.stateChangeTime = time.strftime('%Y-%m-%d %H:%M:%S',time.gmtime())
                    elif addResult != None and addResult.transferringFiles != []:
                        # set status for transferring
                        for file in self.job.Files:
                            if file.lfn in addResult.transferringFiles:
                                file.status = 'transferring'
                        self.job.jobStatus = 'transferring'
                        # propagate transition to prodDB
                        self.job.stateChangeTime = time.strftime('%Y-%m-%d %H:%M:%S',time.gmtime())
                    else:
                        self.job.jobStatus = 'finished'
                # endtime
                if self.job.endTime=='NULL':
                    self.job.endTime = time.strftime('%Y-%m-%d %H:%M:%S',time.gmtime())
                # output size and # of outputs
                self.job.nOutputDataFiles = 0
                self.job.outputFileBytes = 0
                for tmpFile in self.job.Files:
                    if tmpFile.type == 'output':
                        self.job.nOutputDataFiles += 1
                        try:
                            self.job.outputFileBytes += tmpFile.fsize
                        except:
                            pass
                # protection
                maxOutputFileBytes = 99999999999
                if self.job.outputFileBytes > maxOutputFileBytes:
                    self.job.outputFileBytes = maxOutputFileBytes
                # set cancelled state
                if self.job.commandToPilot == 'tobekilled' and self.job.jobStatus == 'failed':
                    self.job.jobStatus = 'cancelled'
                # update job
                self.logger.debug("updating DB")
                retU = self.taskBuffer.updateJobs([self.job],False,oldJobStatusList=[oldJobStatus],
                                                  extraInfo=self.extraInfo)
                self.logger.debug("retU: %s" % retU)
                # failed
                if not retU[0]:
                    self.logger.error('failed to update DB')
                    # unlock XML
                    try:
                        fcntl.flock(self.lockXML.fileno(), fcntl.LOCK_UN)
                        self.lockXML.close()                            
                    except:
                        type, value, traceBack = sys.exc_info()
                        self.logger.debug(": %s %s" % (type,value))
                        self.logger.debug("cannot unlock XML")
                    return
                # setup for closer
                if not (EventServiceUtils.isEventServiceJob(self.job) and self.job.jobStatus == 'cancelled'):
                    destDBList = []
                    guidList = []
                    for file in self.job.Files:
                        # ignore inputs
                        if file.type == 'input':
                            continue
                        # skip pseudo datasets
                        if file.destinationDBlock in ['',None,'NULL']:
                            continue
                        # start closer for output/log datasets
                        if not file.destinationDBlock in destDBList:
                            destDBList.append(file.destinationDBlock)
                        # collect GUIDs
                        if (self.job.prodSourceLabel=='panda' or (self.job.prodSourceLabel in ['ptest','rc_test','rucio_test'] and \
                                                                  self.job.processingType in ['pathena','prun','gangarobot-rctest','hammercloud'])) \
                                                                  and file.type == 'output':
                            # extract base LFN since LFN was changed to full LFN for CMS
                            baseLFN = file.lfn.split('/')[-1]
                            guidList.append({'lfn':baseLFN,'guid':file.GUID,'type':file.type,
                                             'checksum':file.checksum,'md5sum':file.md5sum,
                                             'fsize':file.fsize,'scope':file.scope})
                    if guidList != []:
                        retG = self.taskBuffer.setGUIDs(guidList)
                    if destDBList != []:
                        # start Closer
                        if adderPlugin != None and hasattr(adderPlugin,'datasetMap') and adderPlugin.datasetMap != {}:
                            cThr = Closer.Closer(self.taskBuffer,destDBList,self.job,datasetMap=adderPlugin.datasetMap)
                        else:
                            cThr = Closer.Closer(self.taskBuffer,destDBList,self.job)
                        self.logger.debug("start Closer")
                        cThr.start()
                        cThr.join()
                        self.logger.debug("end Closer")
            self.logger.debug("end")
            try:
                # remove Catalog
                os.remove(self.xmlFile)
            except:
                pass
            # unlock XML
            if self.lockXML != None:
                fcntl.flock(self.lockXML.fileno(), fcntl.LOCK_UN)
                self.lockXML.close()            
        except:
            type, value, traceBack = sys.exc_info()
            self.logger.debug(": %s %s" % (type,value))
            self.logger.debug("except")
            # unlock XML just in case
            try:
                if self.lockXML != None:
                    fcntl.flock(self.lockXML.fileno(), fcntl.LOCK_UN)
            except:
                type, value, traceBack = sys.exc_info()
                self.logger.debug(": %s %s" % (type,value))
                self.logger.debug("cannot unlock XML")


    # parse XML
    # 0: succeeded, 1: harmless error to exit, 2: fatal error, 3: event service
    def parseXML(self):
        # get LFN and GUID
        self.logger.debug('XML filename : %s' % self.xmlFile)
        # no outputs
        if self.job.Files == []:
            self.logger.debug("has no outputs")
            self.logger.debug("parseXML end")
            return 0
        # get input files
        inputLFNs = []
        for file in self.job.Files:
            if file.type == 'input':
                inputLFNs.append(file.lfn)
        # parse XML
        lfns    = []
        guids   = []
        fsizes  = []
        md5sums = []
        chksums = []
        surls   = []
        fullLfnMap = {}
        nEventsMap = {}
        try:
            root  = xml.dom.minidom.parse(self.xmlFile)
            files = root.getElementsByTagName('File')
            for file in files:
                # get GUID
                guid = str(file.getAttribute('ID'))
                # get PFN and LFN nodes
                logical  = file.getElementsByTagName('logical')[0]
                lfnNode  = logical.getElementsByTagName('lfn')[0]
                # convert UTF8 to Raw
                lfn = str(lfnNode.getAttribute('name'))
                # get metadata
                fsize   = None
                md5sum  = None
                adler32 = None
                surl    = None
                fullLFN = None
                for meta in file.getElementsByTagName('metadata'):
                    # get fsize
                    name = str(meta.getAttribute('att_name'))
                    if name == 'fsize':
                        fsize = long(meta.getAttribute('att_value'))
                    elif name == 'md5sum':
                        md5sum = str(meta.getAttribute('att_value'))
                        # check
                        if re.search("^[a-fA-F0-9]{32}$",md5sum) == None:
                            md5sum = None
                    elif name == 'adler32':
                        adler32 = str(meta.getAttribute('att_value'))
                    elif name == 'surl':
                        surl = str(meta.getAttribute('att_value'))
                    elif name == 'full_lfn':
                        fullLFN = str(meta.getAttribute('att_value'))
                # error check
                if (not lfn in inputLFNs) and (fsize == None or (md5sum == None and adler32 == None)):
                    if EventServiceUtils.isEventServiceMerge(self.job):
                        continue
                    else:
                        raise RuntimeError, 'fsize/md5sum/adler32/surl=None'
                # append
                lfns.append(lfn)
                guids.append(guid)
                fsizes.append(fsize)
                md5sums.append(md5sum)
                surls.append(surl)
                if adler32 != None:
                    # use adler32 if available
                    chksums.append("ad:%s" % adler32)
                else:
                    chksums.append("md5:%s" % md5sum)
                if fullLFN != None:
                    fullLfnMap[lfn] = fullLFN
        except:
            # check if file exists
            if os.path.exists(self.xmlFile):
                type, value, traceBack = sys.exc_info()
                self.logger.error(": %s %s" % (type,value))
                # set failed anyway
                self.job.jobStatus = 'failed'
                # XML error happens when pilot got killed due to wall-time limit or failures in wrapper
                if (self.job.pilotErrorCode in [0,'0','NULL']) and \
                   (self.job.transExitCode  in [0,'0','NULL']):
                    self.job.ddmErrorCode = ErrorCode.EC_Adder
                    self.job.ddmErrorDiag = "Could not get GUID/LFN/MD5/FSIZE/SURL from pilot XML"
                return 2
            else:
                # XML was deleted
                return 1
        # parse metadata to get nEvents
        try:
            root  = xml.dom.minidom.parseString(self.job.metadata)
            files = root.getElementsByTagName('File')
            for file in files:
                # get GUID
                guid = str(file.getAttribute('ID'))
                # get PFN and LFN nodes
                logical  = file.getElementsByTagName('logical')[0]
                lfnNode  = logical.getElementsByTagName('lfn')[0]
                # convert UTF8 to Raw
                lfn = str(lfnNode.getAttribute('name'))
                # get metadata
                nevents = None
                for meta in file.getElementsByTagName('metadata'):
                    # get nEvents
                    name = str(meta.getAttribute('att_name'))
                    if name == 'events':
                        nevents = long(meta.getAttribute('att_value'))
                        nEventsMap[lfn] = nevents
                        break
        except:
            pass
        self.logger.debug('nEventsMap=%s' % str(nEventsMap))
        # get lumi block number
        lumiBlockNr = self.job.getLumiBlockNr()
        # check files
        fileList = []
        for file in self.job.Files:
            fileList.append(file.lfn)
            if file.type == 'input':
                if file.lfn in lfns:
                    if self.job.prodSourceLabel in ['user','panda']:
                        # skipped file
                        file.status = 'skipped'
                    elif self.job.prodSourceLabel in ['managed','test','rc_test','ptest']:
                        # failed by pilot
                        file.status = 'failed'
            elif file.type == 'output' or file.type == 'log':
                # add only log file for failed jobs
                if self.jobStatus == 'failed' and file.type != 'log':
                    file.status = 'failed'
                    continue
                # set failed if it is missing in XML
                if not file.lfn in lfns:
                    if self.job.jobStatus == 'finished' and EventServiceUtils.isEventServiceJob(self.job):
                        # unset file status for ES jobs
                        pass
                    else:
                        file.status = 'failed'
                    continue
                # look for GUID with LFN
                try:
                    i = lfns.index(file.lfn)
                    file.GUID   = guids[i]
                    file.fsize  = fsizes[i]
                    file.md5sum = md5sums[i]
                    file.checksum = chksums[i]
                    surl = surls[i]
                    # status
                    file.status = 'ready'
                    # change to full LFN
                    if fullLfnMap.has_key(file.lfn):
                        file.lfn = fullLfnMap[file.lfn]
                    # add SURL to extraInfo
                    self.extraInfo['surl'][file.lfn] = surl
                    # add nevents 
                    if nEventsMap.has_key(file.lfn):
                        self.extraInfo['nevents'][file.lfn] = nEventsMap[file.lfn]
                except:
                    # status
                    file.status = 'failed'
                    type, value, traceBack = sys.exc_info()
                    self.logger.error(": %s %s" % (type,value))
                # set lumi block number
                if lumiBlockNr != None and file.status != 'failed':
                    self.extraInfo['lbnr'][file.lfn] = lumiBlockNr 
        # check consistency between XML and filesTable
        for lfn in lfns:
            if not lfn in fileList:
                self.logger.error("%s is not found in filesTable" % lfn)
                self.job.jobStatus = 'failed'
                for tmpFile in self.job.Files:
                    tmpFile.status = 'failed'
                self.job.ddmErrorCode = ErrorCode.EC_Adder
                self.job.ddmErrorDiag = "pilot XML is inconsistent with filesTable"
                return 2
        # return
        self.logger.debug("parseXML end")
        return 0
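
# A minimal, illustrative sketch of the pilot file report that parseXML above
# consumes; the values are made up. The parser only looks at File/@ID for the
# GUID, logical/lfn/@name for the LFN, and metadata att_name/att_value pairs
# for fsize, md5sum, adler32, surl and full_lfn.
SAMPLE_FILE_REPORT = """<?xml version="1.0" encoding="UTF-8"?>
<POOLFILECATALOG>
  <File ID="4fd58ef5-8b4c-4b34-87a5-1d3f5c9e0a11">
    <logical>
      <lfn name="user.example.1234567._000001.log.tgz"/>
    </logical>
    <metadata att_name="fsize" att_value="123456789"/>
    <metadata att_name="adler32" att_value="0a1b2c3d"/>
    <metadata att_name="surl" att_value="srm://se.example.org/atlas/user.example.1234567._000001.log.tgz"/>
  </File>
</POOLFILECATALOG>
"""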
Example #14
0
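# A hedged reconstruction (assumed, not shown in this fragment) of the loop head
# that the code below presupposes: timeLimit, line and items come from scanning
# a process listing, and tmpLog is assumed to be a LogWrapper created beforehand.
# The exact ps options and the two-hour cut-off are illustrative; the listing
# only needs the PID in the second column and a start time that matches the
# regular expression used below.
import datetime
import commands  # Python 2 module, as used elsewhere in these examples

timeLimit = datetime.datetime.now() - datetime.timedelta(hours=2)
try:
    for line in commands.getoutput('ps axo user,pid,lstart,args').split('\n'):
        items = line.split()
        if len(items) < 5:
            continue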
        # look for python
        if re.search('python',line) == None:
            continue
        # PID
        pid = items[1]
        # start time
        timeM = re.search('(\S+\s+\d+ \d+:\d+:\d+ \d+)',line)
        startTime = datetime.datetime(*time.strptime(timeM.group(1),'%b %d %H:%M:%S %Y')[:6])
        # kill old process
        if startTime < timeLimit:
            tmpLog.debug("old process : %s %s" % (pid,startTime))
            tmpLog.debug(line)            
            commands.getoutput('kill -9 %s' % pid)
except:
    type, value, traceBack = sys.exc_info()
    tmpLog.error("kill process : %s %s" % (type,value))

    
# instantiate TB
taskBuffer.init(panda_config.dbhost,panda_config.dbpasswd,nDBConnection=1)

refreshInterval = 3
lockInterval = 3
activeInterval = 60
setInterval = 3

pid = '{0}-{1}_{2}'.format(socket.getfqdn().split('.')[0],os.getpid(),os.getpgrp())

# get minimal resource
minResourceName = taskBuffer.getMinimalResource()
Example #15
0
        if re.search('python', line) == None:
            continue
        # PID
        pid = items[1]
        # start time
        timeM = re.search('(\S+\s+\d+ \d+:\d+:\d+ \d+)', line)
        startTime = datetime.datetime(
            *time.strptime(timeM.group(1), '%b %d %H:%M:%S %Y')[:6])
        # kill old process
        if startTime < timeLimit:
            tmpLog.debug("old process : %s %s" % (pid, startTime))
            tmpLog.debug(line)
            commands.getoutput('kill -9 %s' % pid)
except:
    type, value, traceBack = sys.exc_info()
    tmpLog.error("kill process : %s %s" % (type, value))

# instantiate TB
taskBuffer.init(panda_config.dbhost, panda_config.dbpasswd, nDBConnection=1)

refreshInterval = 3
lockInterval = 3
activeInterval = 60
setInterval = 3

pid = '{0}-{1}_{2}'.format(socket.getfqdn().split('.')[0], os.getpid(),
                           os.getpgrp())

# get minimal resource
minResourceName = taskBuffer.getMinimalResource()
Example #16
0
        # look for python
        if re.search('python',line) == None:
            continue
        # PID
        pid = items[1]
        # start time
        timeM = re.search('(\S+\s+\d+ \d+:\d+:\d+ \d+)',line)
        startTime = datetime.datetime(*time.strptime(timeM.group(1),'%b %d %H:%M:%S %Y')[:6])
        # kill old process
        if startTime < timeLimit:
            tmpLog.debug("old process : %s %s" % (pid,startTime))
            tmpLog.debug(line)            
            commands.getoutput('kill -9 %s' % pid)
except:
    type, value, traceBack = sys.exc_info()
    tmpLog.error("kill process : %s %s" % (type,value))

    
# instantiate TB
taskBuffer.init(panda_config.dbhost,panda_config.dbpasswd,nDBConnection=1)

# instantiate sitemapper
aSiteMapper = SiteMapper(taskBuffer)

# delete
tmpLog.debug("Del session")
status,retSel = taskBuffer.querySQLS("SELECT MAX(PandaID) FROM ATLAS_PANDA.jobsDefined4",{})
if retSel != None:
    try:
        maxID = retSel[0][0]
        tmpLog.debug("maxID : %s" % maxID)
Example #17
0
def getFilesFromLRC(files,
                    url,
                    guids=[],
                    storageName=[],
                    terminateWhenFailed=False,
                    getPFN=False,
                    scopeList=[]):
    tmpLog = LogWrapper(_log, None)
    tmpLog.debug('getFilesFromLRC "%s" %s' % (url, str(storageName)))
    # get PFC
    outSTR = ''
    if url.startswith('mysql://'):
        # from MySQL
        outSTR = _getPFNFromMySQL(files, url)
        # get PFN
        if getPFN:
            outPFN = {}
            # FIXME
            tmpLog.debug('RetPFN:%s ' % str(outPFN))
            return outPFN
    elif url.startswith('http://'):
        # from HTTP I/F
        outSTR = _getPoolFileCatalog(files, url)
        # get PFN
        if getPFN:
            outPFN = {}
            try:
                if not outSTR in ['', None]:
                    root = xml.dom.minidom.parseString(outSTR)
                    fileNodes = root.getElementsByTagName('File')
                    for file in fileNodes:
                        # get PFN and LFN nodes
                        physical = file.getElementsByTagName('physical')[0]
                        pfnNode = physical.getElementsByTagName('pfn')[0]
                        logical = file.getElementsByTagName('logical')[0]
                        lfnNode = logical.getElementsByTagName('lfn')[0]
                        # convert UTF8 to Raw
                        pfn = str(pfnNode.getAttribute('name'))
                        lfn = str(lfnNode.getAttribute('name'))
                        # assign
                        if not outPFN.has_key(lfn):
                            outPFN[lfn] = []
                        outPFN[lfn].append(pfn)
            except:
                type, value, traceBack = sys.exc_info()
                tmpLog.error(outSTR)
                tmpLog.error("could not parse XML - %s %s" % (type, value))
            tmpLog.debug('RetPFN:%s ' % str(outPFN))
            return outPFN
    elif url.startswith('lfc://') or url.startswith('rucio://'):
        # from LFC
        timeStart = datetime.datetime.utcnow()
        outSTR = _getPFNFromLFC(files,
                                url,
                                guids,
                                storageName,
                                scopeList=scopeList,
                                tmpLog=tmpLog)
        regTime = datetime.datetime.utcnow() - timeStart
        tmpLog.debug(
            'file lookup for %s LFNs from %s took %s.%03d sec' %
            (len(files), url, regTime.seconds, regTime.microseconds / 1000))
        # get PFN
        if getPFN:
            outPFN = {}
            try:
                if not outSTR in ['', None]:
                    tmpItems = outSTR.split('LFCRet :')
                    tmpItems.remove('')
                    # loop over all returns
                    for tmpItem in tmpItems:
                        exec "tmpLFNmap = %s" % tmpItem
                        for tmpLFN, tmpPFN in tmpLFNmap.iteritems():
                            outPFN[tmpLFN] = tmpPFN
            except:
                type, value, traceBack = sys.exc_info()
                tmpLog.error(outSTR)
                tmpLog.error("could not parse LFC ret - %s %s" % (type, value))
            tmpLog.debug('RetPFN:%s files' % len(outPFN))
            return outPFN
    # check return
    if not isinstance(outSTR, types.StringType):
        if terminateWhenFailed:
            return None
        # set empty string
        outSTR = ''
    # collect OK Files
    okFiles = []
    for file in files:
        if re.search(file, outSTR) != None:
            okFiles.append(file)
    tmpLog.debug('Ret:%s / %s files' % (str(okFiles[:3]), len(okFiles)))
    return okFiles
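
# Hedged usage sketch for getFilesFromLRC; the catalog URL, LFN, GUID and
# storage names below are placeholders. With getPFN=True a PFN map keyed by
# LFN is returned instead of the list of resolvable files.
okFiles = getFilesFromLRC(
    ['user.example.1234567._000001.pool.root.1'],
    'lfc://lfc.example.org:/grid/atlas',
    guids=['4fd58ef5-8b4c-4b34-87a5-1d3f5c9e0a11'],
    storageName=['EXAMPLE_DATADISK'],
    terminateWhenFailed=True)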
Example #18
0
class AdderGen:
    # constructor
    def __init__(self,taskBuffer,jobID,jobStatus,xmlFile,ignoreTmpError=True,siteMapper=None):
        self.job = None
        self.jobID = jobID
        self.jobStatus = jobStatus
        self.taskBuffer = taskBuffer
        self.ignoreTmpError = ignoreTmpError
        self.lockXML = None
        self.siteMapper = siteMapper
        self.attemptNr = None
        self.xmlFile = xmlFile
        self.datasetMap = {}
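        # per-LFN extra info filled by parseXML and passed to the adder plugin and updateJobs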
        self.extraInfo = {'surl':{},'nevents':{},'lbnr':{},'endpoint':{}, 'guid':{}}
        # extract attemptNr
        try:
            tmpAttemptNr = self.xmlFile.split('/')[-1].split('_')[-1]
            if re.search('^\d+$',tmpAttemptNr) != None:
                self.attemptNr = int(tmpAttemptNr)
        except:
            pass
        # logger
        self.logger = LogWrapper(_logger,str(self.jobID))


    # dump file report
    def dumpFileReport(self,fileCatalog,attemptNr):
        self.logger.debug("dump file report")
        # dump Catalog into file
        if attemptNr == None:
            xmlFile = '%s/%s_%s_%s' % (panda_config.logdir,self.jobID,self.jobStatus,
                                       str(uuid.uuid4()))
        else:
            xmlFile = '%s/%s_%s_%s_%s' % (panda_config.logdir,self.jobID,self.jobStatus,
                                          str(uuid.uuid4()),attemptNr)
        file = open(xmlFile,'w')
        file.write(fileCatalog)
        file.close()


    # get plugin class
    def getPluginClass(self, tmpVO):
        # instantiate concrete plugin
        adderPluginClass = panda_config.getPlugin('adder_plugins',tmpVO)
        if adderPluginClass == None:
            # use ATLAS plugin by default
            from AdderAtlasPlugin import AdderAtlasPlugin
            adderPluginClass = AdderAtlasPlugin
        self.logger.debug('plugin name {0}'.format(adderPluginClass.__name__))
        return adderPluginClass


    # main
    def run(self):
        try:
            self.logger.debug("new start: %s attemptNr=%s" % (self.jobStatus,self.attemptNr))
            # lock XML
            self.lockXML = open(self.xmlFile)
            try:
                fcntl.flock(self.lockXML.fileno(), fcntl.LOCK_EX|fcntl.LOCK_NB)
            except:
                self.logger.debug("cannot get lock : %s" % self.xmlFile)
                self.lockXML.close()
                # remove XML just in case for the final attempt
                if not self.ignoreTmpError:
                    try:
                        # remove Catalog
                        os.remove(self.xmlFile)
                    except:
                        pass
                return
            # check if file exists
            if not os.path.exists(self.xmlFile):
                self.logger.debug("not exist : %s" % self.xmlFile)
                try:
                    fcntl.flock(self.lockXML.fileno(), fcntl.LOCK_UN)
                    self.lockXML.close()
                except:
                    pass
                return
            # query job
            self.job = self.taskBuffer.peekJobs([self.jobID],fromDefined=False,
                                                fromWaiting=False,
                                                forAnal=True)[0]
            # check if job has finished
            if self.job == None:
                self.logger.debug(': job not found in DB')
            elif self.job.jobStatus in ['finished','failed','unknown','merging']:
                self.logger.error(': invalid state -> %s' % self.job.jobStatus)
            elif self.attemptNr != None and self.job.attemptNr != self.attemptNr:
                self.logger.error('wrong attemptNr -> job=%s <> %s' % (self.job.attemptNr,self.attemptNr))
            elif self.attemptNr is not None and self.job.jobStatus == 'transferring':
                errMsg = 'XML with attemptNr for {0}'.format(self.job.jobStatus)
                self.logger.error(errMsg)
                # FIXME
                raise RuntimeError, errMsg
            elif self.jobStatus == EventServiceUtils.esRegStatus:
                # instantiate concrete plugin
                adderPluginClass = self.getPluginClass(self.job.VO)
                adderPlugin = adderPluginClass(self.job,
                                               taskBuffer=self.taskBuffer,
                                               siteMapper=self.siteMapper,
                                               logger=self.logger)
                # execute
                self.logger.debug('plugin is ready for ES file registration')
                adderPlugin.registerEventServiceFiles()
            else:
                # check file status in JEDI
                if not self.job.isCancelled() and not self.job.taskBufferErrorCode in [taskbuffer.ErrorCode.EC_PilotRetried]:
                    fileCheckInJEDI = self.taskBuffer.checkInputFileStatusInJEDI(self.job)
                    self.logger.debug("check file status in JEDI : {0}".format(fileCheckInJEDI))                
                    if fileCheckInJEDI == None:
                        raise RuntimeError,'failed to check file status in JEDI'
                    if fileCheckInJEDI == False:
                        # set job status to failed since some file status is wrong in JEDI 
                        self.jobStatus = 'failed'
                        self.job.ddmErrorCode = ErrorCode.EC_Adder
                        errStr = "inconsistent file status between Panda and JEDI. "
                        errStr += "failed to avoid duplicated processing caused by synchronization failure"
                        self.job.ddmErrorDiag = errStr
                        self.logger.debug("set jobStatus={0} since input is inconsistent between Panda and JEDI".format(self.jobStatus))
                    elif self.job.jobSubStatus in ['pilot_closed']:
                        # terminated by the pilot
                        self.logger.debug("going to closed since terminated by the pilot")
                        retClosed = self.taskBuffer.killJobs([self.jobID],'pilot','60',True)
                        if retClosed[0] == True:
                            self.logger.debug("end")
                            try:
                                # remove Catalog
                                os.remove(self.xmlFile)
                            except:
                                pass
                            # unlock XML
                            if self.lockXML != None:
                                fcntl.flock(self.lockXML.fileno(), fcntl.LOCK_UN)
                                self.lockXML.close()
                            return
                    # check for cloned jobs
                    if EventServiceUtils.isJobCloningJob(self.job):
                        checkJC = self.taskBuffer.checkClonedJob(self.job)
                        if checkJC == None:
                            raise RuntimeError,'failed to check the cloned job'
                        # failed to lock semaphore
                        if checkJC['lock'] == False:
                            self.jobStatus = 'failed'
                            self.job.ddmErrorCode = ErrorCode.EC_Adder
                            self.job.ddmErrorDiag = "failed to lock semaphore for job cloning"
                            self.logger.debug("set jobStatus={0} since did not get semaphore for job cloning".format(self.jobStatus))
                # use failed for cancelled/closed jobs
                if self.job.isCancelled():
                    self.jobStatus = 'failed'
                    # reset error codes to skip retrial module
                    self.job.pilotErrorCode = 0
                    self.job.exeErrorCode = 0
                    self.job.ddmErrorCode = 0
                # keep old status
                oldJobStatus = self.job.jobStatus
                # set job status
                if not self.job.jobStatus in ['transferring']:
                    self.job.jobStatus = self.jobStatus
                addResult = None
                adderPlugin = None
                # parse XML
                parseResult = self.parseXML()
                if parseResult < 2:
                    # interaction with DDM
                    try:
                        # instantiate concrete plugin
                        adderPluginClass = self.getPluginClass(self.job.VO)
                        adderPlugin = adderPluginClass(self.job,
                                                       taskBuffer=self.taskBuffer,
                                                       siteMapper=self.siteMapper,
                                                       extraInfo=self.extraInfo,
                                                       logger=self.logger)
                        # execute
                        self.logger.debug('plugin is ready')
                        adderPlugin.execute()
                        addResult = adderPlugin.result
                        self.logger.debug('plugin done with %s' % (addResult.statusCode))
                    except:
                        errtype,errvalue = sys.exc_info()[:2]
                        self.logger.error("failed to execute AdderPlugin for VO={0} with {1}:{2}".format(self.job.VO,
                                                                                                         errtype,
                                                                                                         errvalue)) 
                        addResult = None
                        self.job.ddmErrorCode = ErrorCode.EC_Adder
                        self.job.ddmErrorDiag = "AdderPlugin failure"
                        
                    # ignore temporary errors
                    if self.ignoreTmpError and addResult != None and addResult.isTemporary():
                        self.logger.debug(': ignore %s ' % self.job.ddmErrorDiag)
                        self.logger.debug('escape')
                        # unlock XML
                        try:
                            fcntl.flock(self.lockXML.fileno(), fcntl.LOCK_UN)
                            self.lockXML.close()
                        except:
                            type, value, traceBack = sys.exc_info()
                            self.logger.debug(": %s %s" % (type,value))
                            self.logger.debug("cannot unlock XML")
                        return
                    # failed
                    if addResult == None or not addResult.isSucceeded():
                        self.job.jobStatus = 'failed'
                # set file status for failed jobs or failed transferring jobs
                self.logger.debug("status after plugin call :job.jobStatus=%s jobStatus=%s" % (self.job.jobStatus, self.jobStatus))
                if self.job.jobStatus == 'failed' or self.jobStatus == 'failed':
                    # First of all: if the job failed, take initial actions according to the retrial error table
                    source, error_code, error_diag = None, None, None
                    if self.job.pilotErrorCode:
                        source = 'pilotErrorCode'
                        error_code = self.job.pilotErrorCode
                        error_diag = self.job.pilotErrorDiag
                    elif self.job.exeErrorCode:
                        source = 'exeErrorCode'
                        error_code = self.job.exeErrorCode
                        error_diag = self.job.exeErrorDiag
                    elif self.job.ddmErrorCode:
                        source = 'ddmErrorCode'
                        error_code = self.job.ddmErrorCode
                        error_diag = self.job.ddmErrorDiag
                    elif self.job.transExitCode:
                        source = 'transExitCode'
                        error_code = self.job.transExitCode
                        error_diag = ''
            
                    # _logger.info("updatejob has source %s, error_code %s and error_diag %s"%(source, error_code, error_diag))
                    
                    if source and error_code:
                        try:
                            self.logger.debug("AdderGen.run will call apply_retrial_rules")
                            retryModule.apply_retrial_rules(self.taskBuffer, self.job.PandaID, source, error_code, error_diag, self.job.attemptNr)
                            self.logger.debug("apply_retrial_rules is back")
                        except Exception as e:
                            self.logger.error("apply_retrial_rules excepted and needs to be investigated (%s): %s"%(e, traceback.format_exc()))
                    
                    self.job.jobStatus = 'failed'
                    for file in self.job.Files:
                        if file.type in ['output','log']:
                            if addResult != None and file.lfn in addResult.mergingFiles:
                                file.status = 'merging'
                            else:
                                file.status = 'failed'
                else:
                    # reset errors
                    self.job.jobDispatcherErrorCode = 0
                    self.job.jobDispatcherErrorDiag = 'NULL'
                    # set status
                    if addResult != None and addResult.mergingFiles != []:
                        # set status for merging:                        
                        for file in self.job.Files:
                            if file.lfn in addResult.mergingFiles:
                                file.status = 'merging'
                        self.job.jobStatus = 'merging'
                        # propagate transition to prodDB
                        self.job.stateChangeTime = time.strftime('%Y-%m-%d %H:%M:%S',time.gmtime())
                    elif addResult != None and addResult.transferringFiles != []:
                        # set status for transferring
                        for file in self.job.Files:
                            if file.lfn in addResult.transferringFiles:
                                file.status = 'transferring'
                        self.job.jobStatus = 'transferring'
                        self.job.jobSubStatus = None
                        # propagate transition to prodDB
                        self.job.stateChangeTime = time.strftime('%Y-%m-%d %H:%M:%S',time.gmtime())
                    else:
                        self.job.jobStatus = 'finished'
                # endtime
                if self.job.endTime=='NULL':
                    self.job.endTime = time.strftime('%Y-%m-%d %H:%M:%S',time.gmtime())
                # output size and # of outputs
                self.job.nOutputDataFiles = 0
                self.job.outputFileBytes = 0
                for tmpFile in self.job.Files:
                    if tmpFile.type == 'output':
                        self.job.nOutputDataFiles += 1
                        try:
                            self.job.outputFileBytes += tmpFile.fsize
                        except:
                            pass
                # protection
                maxOutputFileBytes = 99999999999
                if self.job.outputFileBytes > maxOutputFileBytes:
                    self.job.outputFileBytes = maxOutputFileBytes
                # set cancelled state
                if self.job.commandToPilot == 'tobekilled' and self.job.jobStatus == 'failed':
                    self.job.jobStatus = 'cancelled'
                # update job
                if oldJobStatus in ['cancelled','closed']:
                    pass
                else:
                    self.logger.debug("updating DB")
                    retU = self.taskBuffer.updateJobs([self.job],False,oldJobStatusList=[oldJobStatus],
                                                      extraInfo=self.extraInfo)
                    self.logger.debug("retU: %s" % retU)
                    # failed
                    if not retU[0]:
                        self.logger.error('failed to update DB for pandaid={0}'.format(self.job.PandaID))
                        # unlock XML
                        try:
                            fcntl.flock(self.lockXML.fileno(), fcntl.LOCK_UN)
                            self.lockXML.close()                            
                        except:
                            type, value, traceBack = sys.exc_info()
                            self.logger.debug(": %s %s" % (type,value))
                            self.logger.debug("cannot unlock XML")
                        return

                    try:
                        # updateJobs was successful and it failed a job with taskBufferErrorCode
                        self.logger.debug("AdderGen.run will peek the job")
                        job_tmp = self.taskBuffer.peekJobs([self.job.PandaID], fromDefined=False, fromArchived=True,
                                                           fromWaiting=False)[0]
                        self.logger.debug("status {0}, taskBufferErrorCode {1}, taskBufferErrorDiag {2}".format(job_tmp.jobStatus,
                                                                                                                job_tmp.taskBufferErrorCode,
                                                                                                                job_tmp.taskBufferErrorDiag))
                        if job_tmp.jobStatus == 'failed' and job_tmp.taskBufferErrorCode:
                            source = 'taskBufferErrorCode'
                            error_code = job_tmp.taskBufferErrorCode
                            error_diag = job_tmp.taskBufferErrorDiag
                            self.logger.debug("AdderGen.run 2 will call apply_retrial_rules")
                            retryModule.apply_retrial_rules(self.taskBuffer, job_tmp.PandaID, source, error_code,
                                                            error_diag, job_tmp.attemptNr)
                            self.logger.debug("apply_retrial_rules 2 is back")
                    except IndexError:
                        pass
                    except Exception as e:
                        self.logger.error("apply_retrial_rules 2 excepted and needs to be investigated (%s): %s" % (e, traceback.format_exc()))

                    # setup for closer
                    if not (EventServiceUtils.isEventServiceJob(self.job) and self.job.isCancelled()):
                        destDBList = []
                        guidList = []
                        for file in self.job.Files:
                            # ignore inputs
                            if file.type == 'input':
                                continue
                            # skip pseudo datasets
                            if file.destinationDBlock in ['',None,'NULL']:
                                continue
                            # start closer for output/log datasets
                            if not file.destinationDBlock in destDBList:
                                destDBList.append(file.destinationDBlock)
                            # collect GUIDs
                            if (self.job.prodSourceLabel=='panda' or (self.job.prodSourceLabel in ['rucio_test'] + JobUtils.list_ptest_prod_sources and \
                                                                      self.job.processingType in ['pathena','prun','gangarobot-rctest','hammercloud'])) \
                                                                      and file.type == 'output':
                                # extract base LFN since LFN was changed to full LFN for CMS
                                baseLFN = file.lfn.split('/')[-1]
                                guidList.append({'lfn':baseLFN,'guid':file.GUID,'type':file.type,
                                                 'checksum':file.checksum,'md5sum':file.md5sum,
                                                 'fsize':file.fsize,'scope':file.scope})
                        if guidList != []:
                            retG = self.taskBuffer.setGUIDs(guidList)
                        if destDBList != []:
                            # start Closer
                            if adderPlugin != None and hasattr(adderPlugin,'datasetMap') and adderPlugin.datasetMap != {}:
                                cThr = Closer.Closer(self.taskBuffer,destDBList,self.job,datasetMap=adderPlugin.datasetMap)
                            else:
                                cThr = Closer.Closer(self.taskBuffer,destDBList,self.job)
                            self.logger.debug("start Closer")
                            cThr.start()
                            cThr.join()
                            self.logger.debug("end Closer")
                        # run closer for associated parallel jobs
                        if EventServiceUtils.isJobCloningJob(self.job):
                            assDBlockMap = self.taskBuffer.getDestDBlocksWithSingleConsumer(self.job.jediTaskID,self.job.PandaID,
                                                                                            destDBList)
                            for assJobID,assDBlocks in assDBlockMap.iteritems():
                                assJob = self.taskBuffer.peekJobs([assJobID],fromDefined=False,
                                                                  fromArchived=False,
                                                                  fromWaiting=False,
                                                                  forAnal=True)[0]
                                if assJob == None:
                                    self.logger.debug(': associated job PandaID={0} not found in DB'.format(assJobID))
                                else:
                                    cThr = Closer.Closer(self.taskBuffer,assDBlocks,assJob)
                                    self.logger.debug("start Closer for PandaID={0}".format(assJobID))
                                    cThr.start()
                                    cThr.join()
                                    self.logger.debug("end Closer for PandaID={0}".format(assJobID))
            self.logger.debug("end")
            try:
                # remove Catalog
                os.remove(self.xmlFile)
            except:
                pass
            # unlock XML
            if self.lockXML != None:
                fcntl.flock(self.lockXML.fileno(), fcntl.LOCK_UN)
                self.lockXML.close()            
        except:
            type, value, traceBack = sys.exc_info()
            errStr = ": %s %s " % (type,value)
            errStr += traceback.format_exc()
            self.logger.error(errStr)
            self.logger.error("except")
            # unlock XML just in case
            try:
                if self.lockXML != None:
                    fcntl.flock(self.lockXML.fileno(), fcntl.LOCK_UN)
            except:
                type, value, traceBack = sys.exc_info()
                self.logger.error(": %s %s" % (type,value))
                self.logger.error("cannot unlock XML")


    # parse XML
    # 0: succeeded, 1: harmless error to exit, 2: fatal error, 3: event service
    def parseXML(self):
        # get LFN and GUID
        self.logger.debug('XML filename : %s' % self.xmlFile)
        # no outputs
        if self.job.Files == []:
            self.logger.debug("has no outputs")
            self.logger.debug("parseXML end")
            return 0
        # get input files
        inputLFNs = []
        for file in self.job.Files:
            if file.type == 'input':
                inputLFNs.append(file.lfn)
        # parse XML
        lfns    = []
        guids   = []
        fsizes  = []
        md5sums = []
        chksums = []
        surls   = []
        fullLfnMap = {}
        nEventsMap = {}
        guidMap = dict()
        try:
            root  = xml.dom.minidom.parse(self.xmlFile)
            files = root.getElementsByTagName('File')
            for file in files:
                # get GUID
                guid = str(file.getAttribute('ID'))
                # get PFN and LFN nodes
                logical  = file.getElementsByTagName('logical')[0]
                lfnNode  = logical.getElementsByTagName('lfn')[0]
                # convert UTF8 to Raw
                lfn = str(lfnNode.getAttribute('name'))
                # get metadata
                fsize   = None
                md5sum  = None
                adler32 = None
                surl    = None
                fullLFN = None
                for meta in file.getElementsByTagName('metadata'):
                    # get fsize
                    name = str(meta.getAttribute('att_name'))
                    if name == 'fsize':
                        fsize = long(meta.getAttribute('att_value'))
                    elif name == 'md5sum':
                        md5sum = str(meta.getAttribute('att_value'))
                        # check
                        if re.search("^[a-fA-F0-9]{32}$",md5sum) == None:
                            md5sum = None
                    elif name == 'adler32':
                        adler32 = str(meta.getAttribute('att_value'))
                    elif name == 'surl':
                        surl = str(meta.getAttribute('att_value'))
                    elif name == 'full_lfn':
                        fullLFN = str(meta.getAttribute('att_value'))
                # endpoints
                self.extraInfo['endpoint'][lfn] = []
                for epNode in file.getElementsByTagName('endpoint'):
                    self.extraInfo['endpoint'][lfn].append(str(epNode.firstChild.data))
                # error check
                if (not lfn in inputLFNs) and (fsize == None or (md5sum == None and adler32 == None)):
                    if EventServiceUtils.isEventServiceMerge(self.job):
                        continue
                    else:
                        raise RuntimeError, 'fsize/md5sum/adler32/surl=None'
                # append
                lfns.append(lfn)
                guids.append(guid)
                fsizes.append(fsize)
                md5sums.append(md5sum)
                surls.append(surl)
                if adler32 != None:
                    # use adler32 if available
                    chksums.append("ad:%s" % adler32)
                else:
                    chksums.append("md5:%s" % md5sum)
                if fullLFN != None:
                    fullLfnMap[lfn] = fullLFN
        except:
            # parse json
            try:
                import json
                with open(self.xmlFile) as tmpF:
                    jsonDict = json.load(tmpF)
                    for lfn, fileData in jsonDict.iteritems():
                        lfn = str(lfn)
                        fsize   = None
                        md5sum  = None
                        adler32 = None
                        surl    = None
                        fullLFN = None
                        guid = str(fileData['guid'])
                        if 'fsize' in fileData:
                            fsize = long(fileData['fsize'])
                        if 'md5sum' in fileData:
                            md5sum = str(fileData['md5sum'])
                            # check
                            if re.search("^[a-fA-F0-9]{32}$",md5sum) == None:
                                md5sum = None
                        if 'adler32' in fileData:
                            adler32 = str(fileData['adler32'])
                        if 'surl' in fileData:
                            surl = str(fileData['surl'])
                        if 'full_lfn' in fileData:
                            fullLFN = str(fileData['full_lfn'])
                        # endpoints
                        self.extraInfo['endpoint'][lfn] = []
                        if 'endpoint' in fileData:
                            self.extraInfo['endpoint'][lfn] = fileData['endpoint']
                        # error check
                        if (not lfn in inputLFNs) and (fsize == None or (md5sum == None and adler32 == None)):
                            if EventServiceUtils.isEventServiceMerge(self.job):
                                continue
                            else:
                                raise RuntimeError, 'fsize/md5sum/adler32/surl=None'
                        # append
                        lfns.append(lfn)
                        guids.append(guid)
                        fsizes.append(fsize)
                        md5sums.append(md5sum)
                        surls.append(surl)
                        if adler32 != None:
                            # use adler32 if available
                            chksums.append("ad:%s" % adler32)
                        else:
                            chksums.append("md5:%s" % md5sum)
                        if fullLFN != None:
                            fullLfnMap[lfn] = fullLFN
            except:
                # check if file exists
                if os.path.exists(self.xmlFile):
                    type, value, traceBack = sys.exc_info()
                    self.logger.error(": %s %s" % (type,value))
                    # set failed anyway
                    self.job.jobStatus = 'failed'
                    # XML error happens when pilot got killed due to wall-time limit or failures in wrapper
                    if (self.job.pilotErrorCode in [0,'0','NULL']) and \
                       (self.job.taskBufferErrorCode not in [taskbuffer.ErrorCode.EC_WorkerDone]) and \
                       (self.job.transExitCode  in [0,'0','NULL']):
                        self.job.ddmErrorCode = ErrorCode.EC_Adder
                        self.job.ddmErrorDiag = "Could not get GUID/LFN/MD5/FSIZE/SURL from pilot XML"
                    return 2
                else:
                    # XML was deleted
                    return 1
        # parse metadata to get nEvents
        try:
            root  = xml.dom.minidom.parseString(self.job.metadata)
            files = root.getElementsByTagName('File')
            for file in files:
                # get GUID
                guid = str(file.getAttribute('ID'))
                # get PFN and LFN nodes
                logical  = file.getElementsByTagName('logical')[0]
                lfnNode  = logical.getElementsByTagName('lfn')[0]
                # convert UTF8 to Raw
                lfn = str(lfnNode.getAttribute('name'))
                guidMap[lfn] = guid
                # get metadata
                nevents = None
                for meta in file.getElementsByTagName('metadata'):
                    # get nEvents
                    name = str(meta.getAttribute('att_name'))
                    if name == 'events':
                        nevents = long(meta.getAttribute('att_value'))
                        nEventsMap[lfn] = nevents
                        break
        except:
            pass
        # parse json
        try:
            import json
            jsonDict = json.loads(self.job.metadata)
            for jsonFileItem in jsonDict['files']['output']:
                for jsonSubFileItem in jsonFileItem['subFiles']:
                    lfn = str(jsonSubFileItem['name'])
                    try:
                        nevents = long(jsonSubFileItem['nentries'])
                        nEventsMap[lfn] = nevents
                    except:
                        pass
                    try:
                        guid = str(jsonSubFileItem['file_guid'])
                        guidMap[lfn] = guid
                    except:
                        pass
        except:
            pass
        self.logger.debug('nEventsMap=%s' % str(nEventsMap))
        self.logger.debug('guidMap=%s' % str(guidMap))
        # get lumi block number
        lumiBlockNr = self.job.getLumiBlockNr()
        # copy files for variable number of outputs
        tmpStat = self.copyFilesForVariableNumOutputs(lfns)
        if not tmpStat:
            self.logger.error("failed to copy files for variable number of outputs")
            return 2
        # check files
        fileList = []
        for file in self.job.Files:
            fileList.append(file.lfn)
            if file.type == 'input':
                if file.lfn in lfns:
                    if self.job.prodSourceLabel in ['user','panda']:
                        # skipped file
                        file.status = 'skipped'
                    elif self.job.prodSourceLabel in ['managed','test'] + JobUtils.list_ptest_prod_sources:
                        # failed by pilot
                        file.status = 'failed'
            elif file.type == 'output' or file.type == 'log':
                # add only log file for failed jobs
                if self.jobStatus == 'failed' and file.type != 'log':
                    file.status = 'failed'
                    continue
                # set failed if it is missing in XML
                if not file.lfn in lfns:
                    if self.job.jobStatus == 'finished' and \
                            (EventServiceUtils.isEventServiceJob(self.job) or EventServiceUtils.isJumboJob(self.job)):
                        # unset file status for ES jobs
                        pass
                    elif file.isAllowedNoOutput():
                        # allowed not to be produced
                        file.status = 'nooutput'
                        self.logger.debug('set {0} to status={1}'.format(file.lfn,file.status))
                    else:
                        file.status = 'failed'
                        self.job.jobStatus = 'failed'
                        self.job.ddmErrorCode = ErrorCode.EC_Adder
                        self.job.ddmErrorDiag = "expected output {0} is missing in pilot XML".format(file.lfn)
                        self.logger.error(self.job.ddmErrorDiag)
                    continue
                # look for GUID with LFN
                try:
                    i = lfns.index(file.lfn)
                    file.GUID   = guids[i]
                    file.fsize  = fsizes[i]
                    file.md5sum = md5sums[i]
                    file.checksum = chksums[i]
                    surl = surls[i]
                    # status
                    file.status = 'ready'
                    # change to full LFN
                    if fullLfnMap.has_key(file.lfn):
                        file.lfn = fullLfnMap[file.lfn]
                    # add SURL to extraInfo
                    self.extraInfo['surl'][file.lfn] = surl
                    # add nevents 
                    if nEventsMap.has_key(file.lfn):
                        self.extraInfo['nevents'][file.lfn] = nEventsMap[file.lfn]
                except:
                    # status
                    file.status = 'failed'
                    type, value, traceBack = sys.exc_info()
                    self.logger.error(": %s %s" % (type,value))
                # set lumi block number
                if lumiBlockNr != None and file.status != 'failed':
                    self.extraInfo['lbnr'][file.lfn] = lumiBlockNr 
        self.extraInfo['guid'] = guidMap
        # check consistency between XML and filesTable
        for lfn in lfns:
            if not lfn in fileList:
                self.logger.error("%s is not found in filesTable" % lfn)
                self.job.jobStatus = 'failed'
                for tmpFile in self.job.Files:
                    tmpFile.status = 'failed'
                self.job.ddmErrorCode = ErrorCode.EC_Adder
                self.job.ddmErrorDiag = "pilot produced {0} inconsistently with jobdef".format(lfn)
                return 2
        # return
        self.logger.debug("parseXML end")
        return 0



    # copy files for variable number of outputs
    def copyFilesForVariableNumOutputs(self,lfns):
        # get original output files
        origOutputs = {}
        updateOrig  = {}
        for tmpFile in self.job.Files:
            if tmpFile.type in ['output','log']:
                origOutputs[tmpFile.lfn] = tmpFile
                if tmpFile.lfn in lfns:
                    # keep original
                    updateOrig[tmpFile.lfn] = False
                else:
                    # overwrite original
                    updateOrig[tmpFile.lfn] = True
        # look for unknown files
        addedNewFiles = False
        for newLFN in lfns:
            if not newLFN in origOutputs:
                # look for corresponding original output
                for origLFN in origOutputs.keys():
                    tmpPatt = '^{0}\.*_\d+$'.format(origLFN)
                    if re.search(tmpPatt,newLFN) != None:
                        # copy file record
                        tmpStat = self.taskBuffer.copyFileRecord(newLFN,origOutputs[origLFN],updateOrig[origLFN])
                        if not tmpStat:
                            return False
                        addedNewFiles = True
                        # disable further overwriting
                        updateOrig[origLFN] = False
                        break
        # refresh job info
        if addedNewFiles:
            self.job = self.taskBuffer.peekJobs([self.jobID],fromDefined=False,
                                                fromWaiting=False,
                                                forAnal=True)[0]
        # return
        return True
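
# A hedged sketch of the minimal surface that AdderGen.run() expects from a
# concrete adder plugin, inferred from the calls above; the real ATLAS plugin
# is AdderAtlasPlugin, and the class/attribute names below are illustrative.
class DummyAdderResult:
    def __init__(self):
        self.statusCode = 0
        self.mergingFiles = []
        self.transferringFiles = []
    def isSucceeded(self):
        return self.statusCode == 0
    def isTemporary(self):
        return False


class DummyAdderPlugin:
    def __init__(self, job, taskBuffer=None, siteMapper=None, extraInfo=None, logger=None):
        self.job = job
        self.logger = logger
        self.datasetMap = {}
        self.result = DummyAdderResult()
    def execute(self):
        # interaction with DDM would go here; fill self.result and self.datasetMap
        self.result.statusCode = 0
    def registerEventServiceFiles(self):
        pass


# Typical driver usage, with placeholder arguments:
#   AdderGen(taskBuffer, 1234567890, 'finished',
#            '%s/1234567890_finished_xyz' % panda_config.logdir,
#            siteMapper=aSiteMapper).run()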
Example #19
0
                                        tmpLog.sendMsg(msg,panda_config.loggername,'userCap','warning')
                        elif tmpNumTotal < maxNumRun*0.9 and (prodUserName, workingGroup) in throttledUsers:
                                # release throttled user
                                tmpNumJobs = taskBuffer.unThrottleUserJobs(prodUserName, workingGroup)
                                if tmpNumJobs != None and tmpNumJobs > 0:
                                        msg = 'released jobs for user="{0}" group={1} since number of running jobs is less than {2}'.format(prodUserName,
                                                                                                                                            workingGroup,
                                                                                                                                            maxNumRun)
                                        tmpLog.debug(msg)
                                        tmpLog.sendMsg(msg,panda_config.loggername,'userCap')
except:
	errtype,errvalue = sys.exc_info()[:2]
	errStr = "cap failed: %s %s" % (job.PandaID,errtype,errvalue)
	errStr.strip()
	errStr += traceback.format_exc()
	tmpLog.error(errStr)
	

# global average 
tmpLog.debug("=== boost jobs")
globalAverageRunDone = float(totalRunDone)/float(totalUsers)

tmpLog.debug("global average : %s" % globalAverageRunDone)

# count the number of users and run/done jobs for each site
siteRunDone = {}
siteUsers = {}
for computingSite,userValMap in usageBreakDownPerSite.iteritems():
	for prodUserName,wgValMap in userValMap.iteritems():
		for workingGroup,statValMap in wgValMap.iteritems():
			# count the number of users and running/done jobs
Example #20
0
        if re.search('python', line) == None:
            continue
        # PID
        pid = items[1]
        # start time
        timeM = re.search('(\S+\s+\d+ \d+:\d+:\d+ \d+)', line)
        startTime = datetime.datetime(
            *time.strptime(timeM.group(1), '%b %d %H:%M:%S %Y')[:6])
        # kill old process
        if startTime < timeLimit:
            tmpLog.debug("old process : %s %s" % (pid, startTime))
            tmpLog.debug(line)
            commands.getoutput('kill -9 %s' % pid)
except:
    type, value, traceBack = sys.exc_info()
    tmpLog.error("kill process : %s %s" % (type, value))

# instantiate TB
taskBuffer.init(panda_config.dbhost, panda_config.dbpasswd, nDBConnection=1)

# instantiate sitemapper
aSiteMapper = SiteMapper(taskBuffer)

# delete
tmpLog.debug("Del session")
status, retSel = taskBuffer.querySQLS(
    "SELECT MAX(PandaID) FROM ATLAS_PANDA.jobsDefined4", {})
if retSel != None:
    try:
        maxID = retSel[0][0]
        tmpLog.debug("maxID : %s" % maxID)
Example #21
0
def _getPFNFromLFC(lfns,dq2url,guids,storageName,scopeList=[],tmpLog=None):
    if tmpLog == None:
        tmpLog = LogWrapper(_log,logPrefix)
    tmpLog.debug('_getPFNFromLFC %s %s / %s LFNs:%s %s' % (dq2url,str(storageName),
                                                         len(lfns),str(lfns[:3]),str(scopeList[:3])))
    outStr = ''
    # check parameter
    if guids == [] or storageName == [] or (len(lfns) != len(guids)):
        tmpLog.debug('_getPFNFromLFC done with empty list')
        return outStr
    # check scopeList
    if not scopeList in [None,[]] and len(lfns) != len(scopeList):
        tmpLog.warning('_getPFNFromLFC wrong scopeList %s %s %s %s' % (dq2url,str(storageName),
                                                                       str(lfns),str(scopeList)))
        tmpLog.error('_getPFNFromLFC failed')
        return outStr
    # loop over all LFNs
    iLFN = 0
    nLFN = 1000
    strFiles = ''    
    outStr = ''
    for iLFN in range(len(lfns)):
        if scopeList != []:
            strFiles  += '%s %s %s\n' % (lfns[iLFN],guids[iLFN],scopeList[iLFN]) 
        else:
            strFiles  += '%s %s\n' % (lfns[iLFN],guids[iLFN]) 
        # bulk operation
        if (iLFN+1) % nLFN == 0 or (iLFN+1) >= len(lfns):
            # write to file
            inFileName = '%s/lfcin.%s'  % (panda_config.logdir,commands.getoutput('uuidgen'))
            ifile = open(inFileName,'w')
            ifile.write(strFiles)
            ifile.close()
            # construct commands
            strStorage = ''
            for storage in storageName:
                strStorage += '%s,' % storage
            strStorage = strStorage[:-1]
            com = 'cd %s > /dev/null 2>&1; export HOME=%s; ' % (panda_config.home_dir_cwd,panda_config.home_dir_cwd)            
            com+= 'unset LD_LIBRARY_PATH; unset PYTHONPATH; export PATH=/usr/local/bin:/bin:/usr/bin; '
            com+= 'source %s; %s/python -Wignore %s/LFCclient.py -f %s -l %s -s %s' % \
                  (panda_config.glite_source,panda_config.native_python32,panda_config.lfcClient_dir,
                   inFileName,dq2url,strStorage)
            tmpLog.debug(com)
            # execute
            status,output = commands.getstatusoutput(com)
            tmpLog.debug(status)
            if status == 0:
                outStr += output
            else:
                tmpLog.error("_getPFNFromLFC : %s %s %s" % (dq2url,status,output))
                # send message to logger
                try:
                    # make message
                    message = 'LFC access : %s %s %s' % (dq2url,status,output)
                    # get logger
                    _pandaLogger = PandaLogger()
                    _pandaLogger.lock()
                    _pandaLogger.setParams({'Type':'broker_util'})
                    logger = _pandaLogger.getHttpLogger(panda_config.loggername)
                    # add message
                    logger.error(message)
                    # release HTTP handler
                    _pandaLogger.release()
                except:
                    pass
                tmpLog.error('_getPFNFromLFC failed')
                return status
            # reset
            strFiles = ''
    tmpLog.debug('_getPFNFromLFC done')
    # return
    return outStr
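
# Hedged usage sketch: _getPFNFromLFC writes "LFN GUID [scope]" lines to a
# temporary file and shells out to LFCclient.py, so it is normally reached via
# getFilesFromLRC. A direct call would look like the following, with
# placeholder values; it returns a string of "LFCRet :" chunks on success or
# the non-zero shell status when the LFC lookup command fails.
out = _getPFNFromLFC(['user.example.1234567._000001.pool.root.1'],
                     'lfc://lfc.example.org:/grid/atlas',
                     ['4fd58ef5-8b4c-4b34-87a5-1d3f5c9e0a11'],
                     ['EXAMPLE_DATADISK'],
                     scopeList=['user.example'])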